diff mbox series

support "THPeligible" semantics for mTHP with anonymous shmem

Message ID 20240628104926.34209-1-libang.li@antgroup.com (mailing list archive)
State New
Headers show
Series support "THPeligible" semantics for mTHP with anonymous shmem | expand

Commit Message

Bang Li June 28, 2024, 10:49 a.m. UTC
After the commit 7fb1b252afb5 ("mm: shmem: add mTHP support for
anonymous shmem"), we can configure different policies through
the multi-size THP sysfs interface for anonymous shmem. But
currently, for anonymous shmem, "THPeligible" only indicates
whether the mapping is eligible for allocating PMD-mappable THP
pages. We need to support semantics for mTHP with anonymous shmem
similar to those for mTHP with anonymous memory.

Signed-off-by: Bang Li <libang.li@antgroup.com>
---
 fs/proc/task_mmu.c      | 10 +++++++---
 include/linux/huge_mm.h | 11 +++++++++++
 mm/shmem.c              |  9 +--------
 3 files changed, 19 insertions(+), 11 deletions(-)

Comments

Baolin Wang July 1, 2024, 6:47 a.m. UTC | #1
CC Barry.

On 2024/6/28 18:49, Bang Li wrote:
> After the commit 7fb1b252afb5 ("mm: shmem: add mTHP support for
> anonymous shmem"), we can configure different policies through
> the multi-size THP sysfs interface for anonymous shmem. But
> currently "THPeligible" indicates only whether the mapping is
> eligible for allocating THP-pages as well as the THP is PMD
> mappable or not for anonymous shmem, we need to support semantics
> for mTHP with anonymous shmem similar to those for mTHP with
> anonymous memory.

I did not see a consensus that "THP*" related statistics should contain 
mTHP in previous discussion [1].

In addition, if we all agree that "THPeligible" should include mTHP 
statistics, you should update the corresponding documentation to keep 
consistency.

[1] 
https://lore.kernel.org/linux-mm/202406262300.iAURISyJ-lkp@intel.com/T/#md7a77056110cebcc2a9b3cd7e4a8d682667f6ba5

> Signed-off-by: Bang Li <libang.li@antgroup.com>
> ---
>   fs/proc/task_mmu.c      | 10 +++++++---
>   include/linux/huge_mm.h | 11 +++++++++++
>   mm/shmem.c              |  9 +--------
>   3 files changed, 19 insertions(+), 11 deletions(-)
> 
> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> index 93fb2c61b154..09b5db356886 100644
> --- a/fs/proc/task_mmu.c
> +++ b/fs/proc/task_mmu.c
> @@ -870,6 +870,7 @@ static int show_smap(struct seq_file *m, void *v)
>   {
>   	struct vm_area_struct *vma = v;
>   	struct mem_size_stats mss = {};
> +	bool thp_eligible;
>   
>   	smap_gather_stats(vma, &mss, 0);
>   
> @@ -882,9 +883,12 @@ static int show_smap(struct seq_file *m, void *v)
>   
>   	__show_smap(m, &mss, false);
>   
> -	seq_printf(m, "THPeligible:    %8u\n",
> -		   !!thp_vma_allowable_orders(vma, vma->vm_flags,
> -			   TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL));
> +	thp_eligible = !!thp_vma_allowable_orders(vma, vma->vm_flags,
> +						TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL);
> +	if (vma_is_anon_shmem(vma))
> +		thp_eligible = !!shmem_allowable_huge_orders(file_inode(vma->vm_file),
> +							vma, vma->vm_pgoff, thp_eligible);
> +	seq_printf(m, "THPeligible:    %8u\n", thp_eligible);
>   
>   	if (arch_pkeys_enabled())
>   		seq_printf(m, "ProtectionKey:  %8u\n", vma_pkey(vma));
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index 212cca384d7e..f87136f38aa1 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -267,6 +267,10 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
>   	return __thp_vma_allowable_orders(vma, vm_flags, tva_flags, orders);
>   }
>   
> +unsigned long shmem_allowable_huge_orders(struct inode *inode,
> +				struct vm_area_struct *vma, pgoff_t index,
> +				bool global_huge);
> +
>   struct thpsize {
>   	struct kobject kobj;
>   	struct list_head node;
> @@ -460,6 +464,13 @@ static inline unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
>   	return 0;
>   }
>   
> +static inline unsigned long shmem_allowable_huge_orders(struct inode *inode,
> +				struct vm_area_struct *vma, pgoff_t index,
> +				bool global_huge)
> +{
> +	return 0;
> +}
> +
>   #define transparent_hugepage_flags 0UL
>   
>   #define thp_get_unmapped_area	NULL
> diff --git a/mm/shmem.c b/mm/shmem.c
> index d495c0701a83..aa85df9c662a 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -1622,7 +1622,7 @@ static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
>   }
>   
>   #ifdef CONFIG_TRANSPARENT_HUGEPAGE
> -static unsigned long shmem_allowable_huge_orders(struct inode *inode,
> +unsigned long shmem_allowable_huge_orders(struct inode *inode,
>   				struct vm_area_struct *vma, pgoff_t index,
>   				bool global_huge)
>   {
> @@ -1707,13 +1707,6 @@ static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault
>   	return orders;
>   }
>   #else
> -static unsigned long shmem_allowable_huge_orders(struct inode *inode,
> -				struct vm_area_struct *vma, pgoff_t index,
> -				bool global_huge)
> -{
> -	return 0;
> -}
> -
>   static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault *vmf,
>   					   struct address_space *mapping, pgoff_t index,
>   					   unsigned long orders)
David Hildenbrand July 1, 2024, 6:55 a.m. UTC | #2
On 01.07.24 08:47, Baolin Wang wrote:
> CC Barry.
> 
> On 2024/6/28 18:49, Bang Li wrote:
>> After the commit 7fb1b252afb5 ("mm: shmem: add mTHP support for
>> anonymous shmem"), we can configure different policies through
>> the multi-size THP sysfs interface for anonymous shmem. But
>> currently "THPeligible" indicates only whether the mapping is
>> eligible for allocating THP-pages as well as the THP is PMD
>> mappable or not for anonymous shmem, we need to support semantics
>> for mTHP with anonymous shmem similar to those for mTHP with
>> anonymous memory.
> 
> I did not see a consensus that "THP*" related statistics should contain
> mTHP in previous discussion [1].
> 
> In addition, if we all agree that "THPeligible" should include mTHP
> statistics, you should update the corresponding documentation to keep
> consistency.
> 
> [1]
> https://lore.kernel.org/linux-mm/202406262300.iAURISyJ-lkp@intel.com/T/#md7a77056110cebcc2a9b3cd7e4a8d682667f6ba5
> 

Fortunately, documentation (Documentation/filesystems/proc.rst) says:

"THPeligible" indicates whether the mapping is eligible for allocating 
naturally aligned THP pages of any currently enabled size. 1 if true, 0 
otherwise."

So that documentation is already pretty clear (we just have to make sure 
the other ones are properly documented, for example as raised in reply 
to [1]).
David Hildenbrand July 1, 2024, 6:57 a.m. UTC | #3
On 28.06.24 12:49, Bang Li wrote:
> After the commit 7fb1b252afb5 ("mm: shmem: add mTHP support for
> anonymous shmem"), we can configure different policies through
> the multi-size THP sysfs interface for anonymous shmem. But
> currently "THPeligible" indicates only whether the mapping is
> eligible for allocating THP-pages as well as the THP is PMD
> mappable or not for anonymous shmem, we need to support semantics
> for mTHP with anonymous shmem similar to those for mTHP with
> anonymous memory.
> 
> Signed-off-by: Bang Li <libang.li@antgroup.com>
> ---
>   fs/proc/task_mmu.c      | 10 +++++++---
>   include/linux/huge_mm.h | 11 +++++++++++
>   mm/shmem.c              |  9 +--------
>   3 files changed, 19 insertions(+), 11 deletions(-)
> 
> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> index 93fb2c61b154..09b5db356886 100644
> --- a/fs/proc/task_mmu.c
> +++ b/fs/proc/task_mmu.c
> @@ -870,6 +870,7 @@ static int show_smap(struct seq_file *m, void *v)
>   {
>   	struct vm_area_struct *vma = v;
>   	struct mem_size_stats mss = {};
> +	bool thp_eligible;
>   
>   	smap_gather_stats(vma, &mss, 0);
>   
> @@ -882,9 +883,12 @@ static int show_smap(struct seq_file *m, void *v)
>   
>   	__show_smap(m, &mss, false);
>   
> -	seq_printf(m, "THPeligible:    %8u\n",
> -		   !!thp_vma_allowable_orders(vma, vma->vm_flags,
> -			   TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL));
> +	thp_eligible = !!thp_vma_allowable_orders(vma, vma->vm_flags,
> +						TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL);
> +	if (vma_is_anon_shmem(vma))
> +		thp_eligible = !!shmem_allowable_huge_orders(file_inode(vma->vm_file),
> +							vma, vma->vm_pgoff, thp_eligible);

I would have thought the correct fix is to return the correct result 
from thp_vma_allowable_orders().
Baolin Wang July 1, 2024, 7:18 a.m. UTC | #4
On 2024/7/1 14:55, David Hildenbrand wrote:
> On 01.07.24 08:47, Baolin Wang wrote:
>> CC Barry.
>>
>> On 2024/6/28 18:49, Bang Li wrote:
>>> After the commit 7fb1b252afb5 ("mm: shmem: add mTHP support for
>>> anonymous shmem"), we can configure different policies through
>>> the multi-size THP sysfs interface for anonymous shmem. But
>>> currently "THPeligible" indicates only whether the mapping is
>>> eligible for allocating THP-pages as well as the THP is PMD
>>> mappable or not for anonymous shmem, we need to support semantics
>>> for mTHP with anonymous shmem similar to those for mTHP with
>>> anonymous memory.
>>
>> I did not see a consensus that "THP*" related statistics should contain
>> mTHP in previous discussion [1].
>>
>> In addition, if we all agree that "THPeligible" should include mTHP
>> statistics, you should update the corresponding documentation to keep
>> consistency.
>>
>> [1]
>> https://lore.kernel.org/linux-mm/202406262300.iAURISyJ-lkp@intel.com/T/#md7a77056110cebcc2a9b3cd7e4a8d682667f6ba5
>>
> 
> Fortunately, documentation (Documentation/filesystems/proc.rst) says:
> 
> "THPeligible" indicates whether the mapping is eligible for allocating 
> naturally aligned THP pages of any currently enabled size. 1 if true, 0 
> otherwise."
> 
> So that documentation is already pretty clear (we just have to make sure 
> the other ones are properly documented, for example as raised in reply 
> to [1]).

OK, great. Thanks.
Ryan Roberts July 1, 2024, 7:55 a.m. UTC | #5
On 28/06/2024 11:49, Bang Li wrote:
> After the commit 7fb1b252afb5 ("mm: shmem: add mTHP support for
> anonymous shmem"), we can configure different policies through
> the multi-size THP sysfs interface for anonymous shmem. But
> currently "THPeligible" indicates only whether the mapping is
> eligible for allocating THP-pages as well as the THP is PMD
> mappable or not for anonymous shmem, we need to support semantics
> for mTHP with anonymous shmem similar to those for mTHP with
> anonymous memory.
> 
> Signed-off-by: Bang Li <libang.li@antgroup.com>
> ---
>  fs/proc/task_mmu.c      | 10 +++++++---
>  include/linux/huge_mm.h | 11 +++++++++++
>  mm/shmem.c              |  9 +--------
>  3 files changed, 19 insertions(+), 11 deletions(-)
> 
> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> index 93fb2c61b154..09b5db356886 100644
> --- a/fs/proc/task_mmu.c
> +++ b/fs/proc/task_mmu.c
> @@ -870,6 +870,7 @@ static int show_smap(struct seq_file *m, void *v)
>  {
>  	struct vm_area_struct *vma = v;
>  	struct mem_size_stats mss = {};
> +	bool thp_eligible;
>  
>  	smap_gather_stats(vma, &mss, 0);
>  
> @@ -882,9 +883,12 @@ static int show_smap(struct seq_file *m, void *v)
>  
>  	__show_smap(m, &mss, false);
>  
> -	seq_printf(m, "THPeligible:    %8u\n",
> -		   !!thp_vma_allowable_orders(vma, vma->vm_flags,
> -			   TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL));
> +	thp_eligible = !!thp_vma_allowable_orders(vma, vma->vm_flags,
> +						TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL);
> +	if (vma_is_anon_shmem(vma))
> +		thp_eligible = !!shmem_allowable_huge_orders(file_inode(vma->vm_file),
> +							vma, vma->vm_pgoff, thp_eligible);

Afraid I haven't been following the shmem mTHP support work as much as I would
have liked, but is there a reason why we need a separate function for shmem?
Couldn't (shouldn't) thp_vma_allowable_orders() be taught to handle shmem too?

> +	seq_printf(m, "THPeligible:    %8u\n", thp_eligible);
>  
>  	if (arch_pkeys_enabled())
>  		seq_printf(m, "ProtectionKey:  %8u\n", vma_pkey(vma));
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index 212cca384d7e..f87136f38aa1 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -267,6 +267,10 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
>  	return __thp_vma_allowable_orders(vma, vm_flags, tva_flags, orders);
>  }
>  
> +unsigned long shmem_allowable_huge_orders(struct inode *inode,
> +				struct vm_area_struct *vma, pgoff_t index,
> +				bool global_huge);
> +
>  struct thpsize {
>  	struct kobject kobj;
>  	struct list_head node;
> @@ -460,6 +464,13 @@ static inline unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
>  	return 0;
>  }
>  
> +static inline unsigned long shmem_allowable_huge_orders(struct inode *inode,
> +				struct vm_area_struct *vma, pgoff_t index,
> +				bool global_huge)
> +{
> +	return 0;
> +}
> +
>  #define transparent_hugepage_flags 0UL
>  
>  #define thp_get_unmapped_area	NULL
> diff --git a/mm/shmem.c b/mm/shmem.c
> index d495c0701a83..aa85df9c662a 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -1622,7 +1622,7 @@ static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
>  }
>  
>  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
> -static unsigned long shmem_allowable_huge_orders(struct inode *inode,
> +unsigned long shmem_allowable_huge_orders(struct inode *inode,
>  				struct vm_area_struct *vma, pgoff_t index,
>  				bool global_huge)
>  {
> @@ -1707,13 +1707,6 @@ static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault
>  	return orders;
>  }
>  #else
> -static unsigned long shmem_allowable_huge_orders(struct inode *inode,
> -				struct vm_area_struct *vma, pgoff_t index,
> -				bool global_huge)
> -{
> -	return 0;
> -}
> -
>  static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault *vmf,
>  					   struct address_space *mapping, pgoff_t index,
>  					   unsigned long orders)
Bang Li July 1, 2024, 8:22 a.m. UTC | #6
Hi David,

On 2024/7/1 14:55, David Hildenbrand wrote:
> On 01.07.24 08:47, Baolin Wang wrote:
>> CC Barry.
>>
>> On 2024/6/28 18:49, Bang Li wrote:
>>> After the commit 7fb1b252afb5 ("mm: shmem: add mTHP support for
>>> anonymous shmem"), we can configure different policies through
>>> the multi-size THP sysfs interface for anonymous shmem. But
>>> currently "THPeligible" indicates only whether the mapping is
>>> eligible for allocating THP-pages as well as the THP is PMD
>>> mappable or not for anonymous shmem, we need to support semantics
>>> for mTHP with anonymous shmem similar to those for mTHP with
>>> anonymous memory.
>>
>> I did not see a consensus that "THP*" related statistics should contain
>> mTHP in previous discussion [1].
>>
>> In addition, if we all agree that "THPeligible" should include mTHP
>> statistics, you should update the corresponding documentation to keep
>> consistency.
>>
>> [1]
>> https://lore.kernel.org/linux-mm/202406262300.iAURISyJ-lkp@intel.com/T/#md7a77056110cebcc2a9b3cd7e4a8d682667f6ba5
>>
> 
> Fortunately, documentation (Documentation/filesystems/proc.rst) says:
> 
> "THPeligible" indicates whether the mapping is eligible for allocating 
> naturally aligned THP pages of any currently enabled size. 1 if true, 0 
> otherwise."
> 
> So that documentation is already pretty clear (we just have to make sure 
> the other ones are properly documented, for example as raised in reply 
> to [1]).
> 

Thanks for the clarification.

Thanks,
Bang
Bang Li July 1, 2024, 8:24 a.m. UTC | #7
Hi David,

Thanks for your review!

On 2024/7/1 14:57, David Hildenbrand wrote:
> On 28.06.24 12:49, Bang Li wrote:
>> After the commit 7fb1b252afb5 ("mm: shmem: add mTHP support for
>> anonymous shmem"), we can configure different policies through
>> the multi-size THP sysfs interface for anonymous shmem. But
>> currently "THPeligible" indicates only whether the mapping is
>> eligible for allocating THP-pages as well as the THP is PMD
>> mappable or not for anonymous shmem, we need to support semantics
>> for mTHP with anonymous shmem similar to those for mTHP with
>> anonymous memory.
>>
>> Signed-off-by: Bang Li <libang.li@antgroup.com>
>> ---
>>   fs/proc/task_mmu.c      | 10 +++++++---
>>   include/linux/huge_mm.h | 11 +++++++++++
>>   mm/shmem.c              |  9 +--------
>>   3 files changed, 19 insertions(+), 11 deletions(-)
>>
>> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
>> index 93fb2c61b154..09b5db356886 100644
>> --- a/fs/proc/task_mmu.c
>> +++ b/fs/proc/task_mmu.c
>> @@ -870,6 +870,7 @@ static int show_smap(struct seq_file *m, void *v)
>>   {
>>       struct vm_area_struct *vma = v;
>>       struct mem_size_stats mss = {};
>> +    bool thp_eligible;
>>       smap_gather_stats(vma, &mss, 0);
>> @@ -882,9 +883,12 @@ static int show_smap(struct seq_file *m, void *v)
>>       __show_smap(m, &mss, false);
>> -    seq_printf(m, "THPeligible:    %8u\n",
>> -           !!thp_vma_allowable_orders(vma, vma->vm_flags,
>> -               TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL));
>> +    thp_eligible = !!thp_vma_allowable_orders(vma, vma->vm_flags,
>> +                        TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL);
>> +    if (vma_is_anon_shmem(vma))
>> +        thp_eligible = 
>> !!shmem_allowable_huge_orders(file_inode(vma->vm_file),
>> +                            vma, vma->vm_pgoff, thp_eligible);
> 
> I would have thought the correct fix is to return the correct result 
> from thp_vma_allowable_orders().
> 

Agreed. I'll try to reimplement this in thp_vma_allowable_orders().

Thanks again for the review!
Bang
Baolin Wang July 1, 2024, 8:33 a.m. UTC | #8
On 2024/7/1 15:55, Ryan Roberts wrote:
> On 28/06/2024 11:49, Bang Li wrote:
>> After the commit 7fb1b252afb5 ("mm: shmem: add mTHP support for
>> anonymous shmem"), we can configure different policies through
>> the multi-size THP sysfs interface for anonymous shmem. But
>> currently "THPeligible" indicates only whether the mapping is
>> eligible for allocating THP-pages as well as the THP is PMD
>> mappable or not for anonymous shmem, we need to support semantics
>> for mTHP with anonymous shmem similar to those for mTHP with
>> anonymous memory.
>>
>> Signed-off-by: Bang Li <libang.li@antgroup.com>
>> ---
>>   fs/proc/task_mmu.c      | 10 +++++++---
>>   include/linux/huge_mm.h | 11 +++++++++++
>>   mm/shmem.c              |  9 +--------
>>   3 files changed, 19 insertions(+), 11 deletions(-)
>>
>> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
>> index 93fb2c61b154..09b5db356886 100644
>> --- a/fs/proc/task_mmu.c
>> +++ b/fs/proc/task_mmu.c
>> @@ -870,6 +870,7 @@ static int show_smap(struct seq_file *m, void *v)
>>   {
>>   	struct vm_area_struct *vma = v;
>>   	struct mem_size_stats mss = {};
>> +	bool thp_eligible;
>>   
>>   	smap_gather_stats(vma, &mss, 0);
>>   
>> @@ -882,9 +883,12 @@ static int show_smap(struct seq_file *m, void *v)
>>   
>>   	__show_smap(m, &mss, false);
>>   
>> -	seq_printf(m, "THPeligible:    %8u\n",
>> -		   !!thp_vma_allowable_orders(vma, vma->vm_flags,
>> -			   TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL));
>> +	thp_eligible = !!thp_vma_allowable_orders(vma, vma->vm_flags,
>> +						TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL);
>> +	if (vma_is_anon_shmem(vma))
>> +		thp_eligible = !!shmem_allowable_huge_orders(file_inode(vma->vm_file),
>> +							vma, vma->vm_pgoff, thp_eligible);
> 
> Afraid I haven't been following the shmem mTHP support work as much as I would
> have liked, but is there a reason why we need a separate function for shmem?

Since shmem_allowable_huge_orders() only uses shmem specific logic to 
determine if huge orders are allowable, there is no need to complicate 
the thp_vma_allowable_orders() function by adding more shmem related 
logic, making it more bloated. In my view, providing a dedicated helper 
shmem_allowable_huge_orders(), specifically for shmem, simplifies the logic.

IIUC, I agree with David's suggestion that the 
shmem_allowable_huge_orders() helper function could be used in 
thp_vma_allowable_orders() to support shmem mTHP. Something like:

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index c7ce28f6b7f3..9677fe6cf478 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -151,10 +151,13 @@ unsigned long __thp_vma_allowable_orders(struct 
vm_area_struct *vma,
          * Must be done before hugepage flags check since shmem has its
          * own flags.
          */
-       if (!in_pf && shmem_file(vma->vm_file))
-               return shmem_is_huge(file_inode(vma->vm_file), 
vma->vm_pgoff,
-                                    !enforce_sysfs, vma->vm_mm, vm_flags)
-                       ? orders : 0;
+       if (!in_pf && shmem_file(vma->vm_file)) {
+               bool global_huge = 
shmem_is_huge(file_inode(vma->vm_file), vma->vm_pgoff,
+                                    !enforce_sysfs, vma->vm_mm, vm_flags);
+
+               return shmem_allowable_huge_orders(file_inode(vma->vm_file),
+                                       vma, vma->vm_pgoff, global_huge);
+       }

         if (!vma_is_anonymous(vma)) {
                 /*

> Couldn't (shouldn't) thp_vma_allowable_orders() be taught to handle shmem too?
> 
>> +	seq_printf(m, "THPeligible:    %8u\n", thp_eligible);
>>   
>>   	if (arch_pkeys_enabled())
>>   		seq_printf(m, "ProtectionKey:  %8u\n", vma_pkey(vma));
>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>> index 212cca384d7e..f87136f38aa1 100644
>> --- a/include/linux/huge_mm.h
>> +++ b/include/linux/huge_mm.h
>> @@ -267,6 +267,10 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
>>   	return __thp_vma_allowable_orders(vma, vm_flags, tva_flags, orders);
>>   }
>>   
>> +unsigned long shmem_allowable_huge_orders(struct inode *inode,
>> +				struct vm_area_struct *vma, pgoff_t index,
>> +				bool global_huge);
>> +
>>   struct thpsize {
>>   	struct kobject kobj;
>>   	struct list_head node;
>> @@ -460,6 +464,13 @@ static inline unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
>>   	return 0;
>>   }
>>   
>> +static inline unsigned long shmem_allowable_huge_orders(struct inode *inode,
>> +				struct vm_area_struct *vma, pgoff_t index,
>> +				bool global_huge)
>> +{
>> +	return 0;
>> +}
>> +
>>   #define transparent_hugepage_flags 0UL
>>   
>>   #define thp_get_unmapped_area	NULL
>> diff --git a/mm/shmem.c b/mm/shmem.c
>> index d495c0701a83..aa85df9c662a 100644
>> --- a/mm/shmem.c
>> +++ b/mm/shmem.c
>> @@ -1622,7 +1622,7 @@ static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
>>   }
>>   
>>   #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>> -static unsigned long shmem_allowable_huge_orders(struct inode *inode,
>> +unsigned long shmem_allowable_huge_orders(struct inode *inode,
>>   				struct vm_area_struct *vma, pgoff_t index,
>>   				bool global_huge)
>>   {
>> @@ -1707,13 +1707,6 @@ static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault
>>   	return orders;
>>   }
>>   #else
>> -static unsigned long shmem_allowable_huge_orders(struct inode *inode,
>> -				struct vm_area_struct *vma, pgoff_t index,
>> -				bool global_huge)
>> -{
>> -	return 0;
>> -}
>> -
>>   static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault *vmf,
>>   					   struct address_space *mapping, pgoff_t index,
>>   					   unsigned long orders)
Ryan Roberts July 1, 2024, 8:40 a.m. UTC | #9
On 01/07/2024 09:33, Baolin Wang wrote:
> 
> 
> On 2024/7/1 15:55, Ryan Roberts wrote:
>> On 28/06/2024 11:49, Bang Li wrote:
>>> After the commit 7fb1b252afb5 ("mm: shmem: add mTHP support for
>>> anonymous shmem"), we can configure different policies through
>>> the multi-size THP sysfs interface for anonymous shmem. But
>>> currently "THPeligible" indicates only whether the mapping is
>>> eligible for allocating THP-pages as well as the THP is PMD
>>> mappable or not for anonymous shmem, we need to support semantics
>>> for mTHP with anonymous shmem similar to those for mTHP with
>>> anonymous memory.
>>>
>>> Signed-off-by: Bang Li <libang.li@antgroup.com>
>>> ---
>>>   fs/proc/task_mmu.c      | 10 +++++++---
>>>   include/linux/huge_mm.h | 11 +++++++++++
>>>   mm/shmem.c              |  9 +--------
>>>   3 files changed, 19 insertions(+), 11 deletions(-)
>>>
>>> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
>>> index 93fb2c61b154..09b5db356886 100644
>>> --- a/fs/proc/task_mmu.c
>>> +++ b/fs/proc/task_mmu.c
>>> @@ -870,6 +870,7 @@ static int show_smap(struct seq_file *m, void *v)
>>>   {
>>>       struct vm_area_struct *vma = v;
>>>       struct mem_size_stats mss = {};
>>> +    bool thp_eligible;
>>>         smap_gather_stats(vma, &mss, 0);
>>>   @@ -882,9 +883,12 @@ static int show_smap(struct seq_file *m, void *v)
>>>         __show_smap(m, &mss, false);
>>>   -    seq_printf(m, "THPeligible:    %8u\n",
>>> -           !!thp_vma_allowable_orders(vma, vma->vm_flags,
>>> -               TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL));
>>> +    thp_eligible = !!thp_vma_allowable_orders(vma, vma->vm_flags,
>>> +                        TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL);
>>> +    if (vma_is_anon_shmem(vma))
>>> +        thp_eligible = !!shmem_allowable_huge_orders(file_inode(vma->vm_file),
>>> +                            vma, vma->vm_pgoff, thp_eligible);
>>
>> Afraid I haven't been following the shmem mTHP support work as much as I would
>> have liked, but is there a reason why we need a separate function for shmem?
> 
> Since shmem_allowable_huge_orders() only uses shmem specific logic to determine
> if huge orders are allowable, there is no need to complicate the
> thp_vma_allowable_orders() function by adding more shmem related logic, making
> it more bloated. In my view, providing a dedicated helper
> shmem_allowable_huge_orders(), specifically for shmem, simplifies the logic.

My point was really that a single interface (thp_vma_allowable_orders) should be
used to get this information. I have no strong opinion on how the implementation
of that interface looks. What you suggest below seems perfectly reasonable to me.

> 
> IIUC, I agree with David's suggestion that the shmem_allowable_huge_orders()
> helper function could be used in thp_vma_allowable_orders() to support shmem
> mTHP. Something like:

I hadn't seen David's suggestion until after I sent my mail. But I think we are
both suggesting the same thing.

> 
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index c7ce28f6b7f3..9677fe6cf478 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -151,10 +151,13 @@ unsigned long __thp_vma_allowable_orders(struct
> vm_area_struct *vma,
>          * Must be done before hugepage flags check since shmem has its
>          * own flags.
>          */
> -       if (!in_pf && shmem_file(vma->vm_file))
> -               return shmem_is_huge(file_inode(vma->vm_file), vma->vm_pgoff,
> -                                    !enforce_sysfs, vma->vm_mm, vm_flags)
> -                       ? orders : 0;
> +       if (!in_pf && shmem_file(vma->vm_file)) {
> +               bool global_huge = shmem_is_huge(file_inode(vma->vm_file),
> vma->vm_pgoff,
> +                                    !enforce_sysfs, vma->vm_mm, vm_flags);
> +
> +               return shmem_allowable_huge_orders(file_inode(vma->vm_file),
> +                                       vma, vma->vm_pgoff, global_huge);
> +       }
> 
>         if (!vma_is_anonymous(vma)) {
>                 /*
> 
>> Couldn't (shouldn't) thp_vma_allowable_orders() be taught to handle shmem too?
>>
>>> +    seq_printf(m, "THPeligible:    %8u\n", thp_eligible);
>>>         if (arch_pkeys_enabled())
>>>           seq_printf(m, "ProtectionKey:  %8u\n", vma_pkey(vma));
>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>> index 212cca384d7e..f87136f38aa1 100644
>>> --- a/include/linux/huge_mm.h
>>> +++ b/include/linux/huge_mm.h
>>> @@ -267,6 +267,10 @@ unsigned long thp_vma_allowable_orders(struct
>>> vm_area_struct *vma,
>>>       return __thp_vma_allowable_orders(vma, vm_flags, tva_flags, orders);
>>>   }
>>>   +unsigned long shmem_allowable_huge_orders(struct inode *inode,
>>> +                struct vm_area_struct *vma, pgoff_t index,
>>> +                bool global_huge);
>>> +
>>>   struct thpsize {
>>>       struct kobject kobj;
>>>       struct list_head node;
>>> @@ -460,6 +464,13 @@ static inline unsigned long
>>> thp_vma_allowable_orders(struct vm_area_struct *vma,
>>>       return 0;
>>>   }
>>>   +static inline unsigned long shmem_allowable_huge_orders(struct inode *inode,
>>> +                struct vm_area_struct *vma, pgoff_t index,
>>> +                bool global_huge)
>>> +{
>>> +    return 0;
>>> +}
>>> +
>>>   #define transparent_hugepage_flags 0UL
>>>     #define thp_get_unmapped_area    NULL
>>> diff --git a/mm/shmem.c b/mm/shmem.c
>>> index d495c0701a83..aa85df9c662a 100644
>>> --- a/mm/shmem.c
>>> +++ b/mm/shmem.c
>>> @@ -1622,7 +1622,7 @@ static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t
>>> limit_gfp)
>>>   }
>>>     #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>>> -static unsigned long shmem_allowable_huge_orders(struct inode *inode,
>>> +unsigned long shmem_allowable_huge_orders(struct inode *inode,
>>>                   struct vm_area_struct *vma, pgoff_t index,
>>>                   bool global_huge)
>>>   {
>>> @@ -1707,13 +1707,6 @@ static unsigned long shmem_suitable_orders(struct
>>> inode *inode, struct vm_fault
>>>       return orders;
>>>   }
>>>   #else
>>> -static unsigned long shmem_allowable_huge_orders(struct inode *inode,
>>> -                struct vm_area_struct *vma, pgoff_t index,
>>> -                bool global_huge)
>>> -{
>>> -    return 0;
>>> -}
>>> -
>>>   static unsigned long shmem_suitable_orders(struct inode *inode, struct
>>> vm_fault *vmf,
>>>                          struct address_space *mapping, pgoff_t index,
>>>                          unsigned long orders)
Baolin Wang July 1, 2024, 8:46 a.m. UTC | #10
On 2024/7/1 16:40, Ryan Roberts wrote:
> On 01/07/2024 09:33, Baolin Wang wrote:
>>
>>
>> On 2024/7/1 15:55, Ryan Roberts wrote:
>>> On 28/06/2024 11:49, Bang Li wrote:
>>>> After the commit 7fb1b252afb5 ("mm: shmem: add mTHP support for
>>>> anonymous shmem"), we can configure different policies through
>>>> the multi-size THP sysfs interface for anonymous shmem. But
>>>> currently "THPeligible" indicates only whether the mapping is
>>>> eligible for allocating THP-pages as well as the THP is PMD
>>>> mappable or not for anonymous shmem, we need to support semantics
>>>> for mTHP with anonymous shmem similar to those for mTHP with
>>>> anonymous memory.
>>>>
>>>> Signed-off-by: Bang Li <libang.li@antgroup.com>
>>>> ---
>>>>    fs/proc/task_mmu.c      | 10 +++++++---
>>>>    include/linux/huge_mm.h | 11 +++++++++++
>>>>    mm/shmem.c              |  9 +--------
>>>>    3 files changed, 19 insertions(+), 11 deletions(-)
>>>>
>>>> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
>>>> index 93fb2c61b154..09b5db356886 100644
>>>> --- a/fs/proc/task_mmu.c
>>>> +++ b/fs/proc/task_mmu.c
>>>> @@ -870,6 +870,7 @@ static int show_smap(struct seq_file *m, void *v)
>>>>    {
>>>>        struct vm_area_struct *vma = v;
>>>>        struct mem_size_stats mss = {};
>>>> +    bool thp_eligible;
>>>>          smap_gather_stats(vma, &mss, 0);
>>>>    @@ -882,9 +883,12 @@ static int show_smap(struct seq_file *m, void *v)
>>>>          __show_smap(m, &mss, false);
>>>>    -    seq_printf(m, "THPeligible:    %8u\n",
>>>> -           !!thp_vma_allowable_orders(vma, vma->vm_flags,
>>>> -               TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL));
>>>> +    thp_eligible = !!thp_vma_allowable_orders(vma, vma->vm_flags,
>>>> +                        TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL);
>>>> +    if (vma_is_anon_shmem(vma))
>>>> +        thp_eligible = !!shmem_allowable_huge_orders(file_inode(vma->vm_file),
>>>> +                            vma, vma->vm_pgoff, thp_eligible);
>>>
>>> Afraid I haven't been following the shmem mTHP support work as much as I would
>>> have liked, but is there a reason why we need a separate function for shmem?
>>
>> Since shmem_allowable_huge_orders() only uses shmem specific logic to determine
>> if huge orders are allowable, there is no need to complicate the
>> thp_vma_allowable_orders() function by adding more shmem related logic, making
>> it more bloated. In my view, providing a dedicated helper
>> shmem_allowable_huge_orders(), specifically for shmem, simplifies the logic.
> 
> My point was really that a single interface (thp_vma_allowable_orders) should be
> used to get this information. I have no strong opinion on how the implementation
> of that interface looks. What you suggest below seems perfectly reasonable to me.
> 
>>
>> IIUC, I agree with David's suggestion that the shmem_allowable_huge_orders()
>> helper function could be used in thp_vma_allowable_orders() to support shmem
>> mTHP. Something like:
> 
> I hadn't seen David's suggestion until after I sent my mail. But I think we are
> both suggesting the same thing.

OK. Great :)
David Hildenbrand July 1, 2024, 8:48 a.m. UTC | #11
On 01.07.24 10:40, Ryan Roberts wrote:
> On 01/07/2024 09:33, Baolin Wang wrote:
>>
>>
>> On 2024/7/1 15:55, Ryan Roberts wrote:
>>> On 28/06/2024 11:49, Bang Li wrote:
>>>> After the commit 7fb1b252afb5 ("mm: shmem: add mTHP support for
>>>> anonymous shmem"), we can configure different policies through
>>>> the multi-size THP sysfs interface for anonymous shmem. But
>>>> currently "THPeligible" indicates only whether the mapping is
>>>> eligible for allocating THP-pages as well as the THP is PMD
>>>> mappable or not for anonymous shmem, we need to support semantics
>>>> for mTHP with anonymous shmem similar to those for mTHP with
>>>> anonymous memory.
>>>>
>>>> Signed-off-by: Bang Li <libang.li@antgroup.com>
>>>> ---
>>>>    fs/proc/task_mmu.c      | 10 +++++++---
>>>>    include/linux/huge_mm.h | 11 +++++++++++
>>>>    mm/shmem.c              |  9 +--------
>>>>    3 files changed, 19 insertions(+), 11 deletions(-)
>>>>
>>>> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
>>>> index 93fb2c61b154..09b5db356886 100644
>>>> --- a/fs/proc/task_mmu.c
>>>> +++ b/fs/proc/task_mmu.c
>>>> @@ -870,6 +870,7 @@ static int show_smap(struct seq_file *m, void *v)
>>>>    {
>>>>        struct vm_area_struct *vma = v;
>>>>        struct mem_size_stats mss = {};
>>>> +    bool thp_eligible;
>>>>          smap_gather_stats(vma, &mss, 0);
>>>>    @@ -882,9 +883,12 @@ static int show_smap(struct seq_file *m, void *v)
>>>>          __show_smap(m, &mss, false);
>>>>    -    seq_printf(m, "THPeligible:    %8u\n",
>>>> -           !!thp_vma_allowable_orders(vma, vma->vm_flags,
>>>> -               TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL));
>>>> +    thp_eligible = !!thp_vma_allowable_orders(vma, vma->vm_flags,
>>>> +                        TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL);
>>>> +    if (vma_is_anon_shmem(vma))
>>>> +        thp_eligible = !!shmem_allowable_huge_orders(file_inode(vma->vm_file),
>>>> +                            vma, vma->vm_pgoff, thp_eligible);
>>>
>>> Afraid I haven't been following the shmem mTHP support work as much as I would
>>> have liked, but is there a reason why we need a separate function for shmem?
>>
>> Since shmem_allowable_huge_orders() only uses shmem specific logic to determine
>> if huge orders are allowable, there is no need to complicate the
>> thp_vma_allowable_orders() function by adding more shmem related logic, making
>> it more bloated. In my view, providing a dedicated helper
>> shmem_allowable_huge_orders(), specifically for shmem, simplifies the logic.
> 
> My point was really that a single interface (thp_vma_allowable_orders) should be
> used to get this information. I have no strong opinion on how the implementation
> of that interface looks. What you suggest below seems perfectly reasonable to me.

Right. thp_vma_allowable_orders() might require some care as discussed 
in other context (cleanly separate dax and shmem handling/orders). But 
that would be follow-up cleanups.
Ryan Roberts July 1, 2024, 8:50 a.m. UTC | #12
On 01/07/2024 09:48, David Hildenbrand wrote:
> On 01.07.24 10:40, Ryan Roberts wrote:
>> On 01/07/2024 09:33, Baolin Wang wrote:
>>>
>>>
>>> On 2024/7/1 15:55, Ryan Roberts wrote:
>>>> On 28/06/2024 11:49, Bang Li wrote:
>>>>> After the commit 7fb1b252afb5 ("mm: shmem: add mTHP support for
>>>>> anonymous shmem"), we can configure different policies through
>>>>> the multi-size THP sysfs interface for anonymous shmem. But
>>>>> currently "THPeligible" indicates only whether the mapping is
>>>>> eligible for allocating THP-pages as well as the THP is PMD
>>>>> mappable or not for anonymous shmem, we need to support semantics
>>>>> for mTHP with anonymous shmem similar to those for mTHP with
>>>>> anonymous memory.
>>>>>
>>>>> Signed-off-by: Bang Li <libang.li@antgroup.com>
>>>>> ---
>>>>>    fs/proc/task_mmu.c      | 10 +++++++---
>>>>>    include/linux/huge_mm.h | 11 +++++++++++
>>>>>    mm/shmem.c              |  9 +--------
>>>>>    3 files changed, 19 insertions(+), 11 deletions(-)
>>>>>
>>>>> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
>>>>> index 93fb2c61b154..09b5db356886 100644
>>>>> --- a/fs/proc/task_mmu.c
>>>>> +++ b/fs/proc/task_mmu.c
>>>>> @@ -870,6 +870,7 @@ static int show_smap(struct seq_file *m, void *v)
>>>>>    {
>>>>>        struct vm_area_struct *vma = v;
>>>>>        struct mem_size_stats mss = {};
>>>>> +    bool thp_eligible;
>>>>>          smap_gather_stats(vma, &mss, 0);
>>>>>    @@ -882,9 +883,12 @@ static int show_smap(struct seq_file *m, void *v)
>>>>>          __show_smap(m, &mss, false);
>>>>>    -    seq_printf(m, "THPeligible:    %8u\n",
>>>>> -           !!thp_vma_allowable_orders(vma, vma->vm_flags,
>>>>> -               TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL));
>>>>> +    thp_eligible = !!thp_vma_allowable_orders(vma, vma->vm_flags,
>>>>> +                        TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL);
>>>>> +    if (vma_is_anon_shmem(vma))
>>>>> +        thp_eligible =
>>>>> !!shmem_allowable_huge_orders(file_inode(vma->vm_file),
>>>>> +                            vma, vma->vm_pgoff, thp_eligible);
>>>>
>>>> Afraid I haven't been following the shmem mTHP support work as much as I would
>>>> have liked, but is there a reason why we need a separate function for shmem?
>>>
>>> Since shmem_allowable_huge_orders() only uses shmem specific logic to determine
>>> if huge orders are allowable, there is no need to complicate the
>>> thp_vma_allowable_orders() function by adding more shmem related logic, making
>>> it more bloated. In my view, providing a dedicated helper
>>> shmem_allowable_huge_orders(), specifically for shmem, simplifies the logic.
>>
>> My point was really that a single interface (thp_vma_allowable_orders) should be
>> used to get this information. I have no strong opinion on how the implementation
>> of that interface looks. What you suggest below seems perfectly reasonable to me.
> 
> Right. thp_vma_allowable_orders() might require some care as discussed in other
> context (cleanly separate dax and shmem handling/orders). But that would be
> follow-up cleanups.

Are you planning to do that, or do you want me to send a patch?
David Hildenbrand July 1, 2024, 8:57 a.m. UTC | #13
On 01.07.24 10:50, Ryan Roberts wrote:
> On 01/07/2024 09:48, David Hildenbrand wrote:
>> On 01.07.24 10:40, Ryan Roberts wrote:
>>> On 01/07/2024 09:33, Baolin Wang wrote:
>>>>
>>>>
>>>> On 2024/7/1 15:55, Ryan Roberts wrote:
>>>>> On 28/06/2024 11:49, Bang Li wrote:
>>>>>> After the commit 7fb1b252afb5 ("mm: shmem: add mTHP support for
>>>>>> anonymous shmem"), we can configure different policies through
>>>>>> the multi-size THP sysfs interface for anonymous shmem. But
>>>>>> currently "THPeligible" indicates only whether the mapping is
>>>>>> eligible for allocating THP-pages as well as the THP is PMD
>>>>>> mappable or not for anonymous shmem, we need to support semantics
>>>>>> for mTHP with anonymous shmem similar to those for mTHP with
>>>>>> anonymous memory.
>>>>>>
>>>>>> Signed-off-by: Bang Li <libang.li@antgroup.com>
>>>>>> ---
>>>>>>     fs/proc/task_mmu.c      | 10 +++++++---
>>>>>>     include/linux/huge_mm.h | 11 +++++++++++
>>>>>>     mm/shmem.c              |  9 +--------
>>>>>>     3 files changed, 19 insertions(+), 11 deletions(-)
>>>>>>
>>>>>> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
>>>>>> index 93fb2c61b154..09b5db356886 100644
>>>>>> --- a/fs/proc/task_mmu.c
>>>>>> +++ b/fs/proc/task_mmu.c
>>>>>> @@ -870,6 +870,7 @@ static int show_smap(struct seq_file *m, void *v)
>>>>>>     {
>>>>>>         struct vm_area_struct *vma = v;
>>>>>>         struct mem_size_stats mss = {};
>>>>>> +    bool thp_eligible;
>>>>>>           smap_gather_stats(vma, &mss, 0);
>>>>>>     @@ -882,9 +883,12 @@ static int show_smap(struct seq_file *m, void *v)
>>>>>>           __show_smap(m, &mss, false);
>>>>>>     -    seq_printf(m, "THPeligible:    %8u\n",
>>>>>> -           !!thp_vma_allowable_orders(vma, vma->vm_flags,
>>>>>> -               TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL));
>>>>>> +    thp_eligible = !!thp_vma_allowable_orders(vma, vma->vm_flags,
>>>>>> +                        TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL);
>>>>>> +    if (vma_is_anon_shmem(vma))
>>>>>> +        thp_eligible =
>>>>>> !!shmem_allowable_huge_orders(file_inode(vma->vm_file),
>>>>>> +                            vma, vma->vm_pgoff, thp_eligible);
>>>>>
>>>>> Afraid I haven't been following the shmem mTHP support work as much as I would
>>>>> have liked, but is there a reason why we need a separate function for shmem?
>>>>
>>>> Since shmem_allowable_huge_orders() only uses shmem specific logic to determine
>>>> if huge orders are allowable, there is no need to complicate the
>>>> thp_vma_allowable_orders() function by adding more shmem related logic, making
>>>> it more bloated. In my view, providing a dedicated helper
>>>> shmem_allowable_huge_orders(), specifically for shmem, simplifies the logic.
>>>
>>> My point was really that a single interface (thp_vma_allowable_orders) should be
>>> used to get this information. I have no strong opinion on how the implementation
>>> of that interface looks. What you suggest below seems perfectly reasonable to me.
>>
>> Right. thp_vma_allowable_orders() might require some care as discussed in other
>> context (cleanly separate dax and shmem handling/orders). But that would be
>> follow-up cleanups.
> 
> Are you planning to do that, or do you want me to send a patch?

I'm planning on looking into some details, especially the interaction 
with large folios in the pagecache. I'll let you know once I have a 
better idea what actually should be done :)
Ryan Roberts July 1, 2024, 9:14 a.m. UTC | #14
On 01/07/2024 09:57, David Hildenbrand wrote:
> On 01.07.24 10:50, Ryan Roberts wrote:
>> On 01/07/2024 09:48, David Hildenbrand wrote:
>>> On 01.07.24 10:40, Ryan Roberts wrote:
>>>> On 01/07/2024 09:33, Baolin Wang wrote:
>>>>>
>>>>>
>>>>> On 2024/7/1 15:55, Ryan Roberts wrote:
>>>>>> On 28/06/2024 11:49, Bang Li wrote:
>>>>>>> After the commit 7fb1b252afb5 ("mm: shmem: add mTHP support for
>>>>>>> anonymous shmem"), we can configure different policies through
>>>>>>> the multi-size THP sysfs interface for anonymous shmem. But
>>>>>>> currently "THPeligible" indicates only whether the mapping is
>>>>>>> eligible for allocating THP-pages as well as the THP is PMD
>>>>>>> mappable or not for anonymous shmem, we need to support semantics
>>>>>>> for mTHP with anonymous shmem similar to those for mTHP with
>>>>>>> anonymous memory.
>>>>>>>
>>>>>>> Signed-off-by: Bang Li <libang.li@antgroup.com>
>>>>>>> ---
>>>>>>>     fs/proc/task_mmu.c      | 10 +++++++---
>>>>>>>     include/linux/huge_mm.h | 11 +++++++++++
>>>>>>>     mm/shmem.c              |  9 +--------
>>>>>>>     3 files changed, 19 insertions(+), 11 deletions(-)
>>>>>>>
>>>>>>> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
>>>>>>> index 93fb2c61b154..09b5db356886 100644
>>>>>>> --- a/fs/proc/task_mmu.c
>>>>>>> +++ b/fs/proc/task_mmu.c
>>>>>>> @@ -870,6 +870,7 @@ static int show_smap(struct seq_file *m, void *v)
>>>>>>>     {
>>>>>>>         struct vm_area_struct *vma = v;
>>>>>>>         struct mem_size_stats mss = {};
>>>>>>> +    bool thp_eligible;
>>>>>>>           smap_gather_stats(vma, &mss, 0);
>>>>>>>     @@ -882,9 +883,12 @@ static int show_smap(struct seq_file *m, void *v)
>>>>>>>           __show_smap(m, &mss, false);
>>>>>>>     -    seq_printf(m, "THPeligible:    %8u\n",
>>>>>>> -           !!thp_vma_allowable_orders(vma, vma->vm_flags,
>>>>>>> -               TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL));
>>>>>>> +    thp_eligible = !!thp_vma_allowable_orders(vma, vma->vm_flags,
>>>>>>> +                        TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL);
>>>>>>> +    if (vma_is_anon_shmem(vma))
>>>>>>> +        thp_eligible =
>>>>>>> !!shmem_allowable_huge_orders(file_inode(vma->vm_file),
>>>>>>> +                            vma, vma->vm_pgoff, thp_eligible);
>>>>>>
>>>>>> Afraid I haven't been following the shmem mTHP support work as much as I
>>>>>> would
>>>>>> have liked, but is there a reason why we need a separate function for shmem?
>>>>>
>>>>> Since shmem_allowable_huge_orders() only uses shmem specific logic to
>>>>> determine
>>>>> if huge orders are allowable, there is no need to complicate the
>>>>> thp_vma_allowable_orders() function by adding more shmem related logic, making
>>>>> it more bloated. In my view, providing a dedicated helper
>>>>> shmem_allowable_huge_orders(), specifically for shmem, simplifies the logic.
>>>>
>>>> My point was really that a single interface (thp_vma_allowable_orders)
>>>> should be
>>>> used to get this information. I have no strong opinion on how the implementation
>>>> of that interface looks. What you suggest below seems perfectly reasonable
>>>> to me.
>>>
>>> Right. thp_vma_allowable_orders() might require some care as discussed in other
>>> context (cleanly separate dax and shmem handling/orders). But that would be
>>> follow-up cleanups.
>>
>> Are you planning to do that, or do you want me to send a patch?
> 
> I'm planning on looking into some details, especially the interaction with large
> folios in the pagecache. I'll let you know once I have a better idea what
> actually should be done :)

OK great - I'll scrub it from my todo list... really getting things done today :)
David Hildenbrand July 1, 2024, 9:17 a.m. UTC | #15
On 01.07.24 11:14, Ryan Roberts wrote:
> On 01/07/2024 09:57, David Hildenbrand wrote:
>> On 01.07.24 10:50, Ryan Roberts wrote:
>>> On 01/07/2024 09:48, David Hildenbrand wrote:
>>>> On 01.07.24 10:40, Ryan Roberts wrote:
>>>>> On 01/07/2024 09:33, Baolin Wang wrote:
>>>>>>
>>>>>>
>>>>>> On 2024/7/1 15:55, Ryan Roberts wrote:
>>>>>>> On 28/06/2024 11:49, Bang Li wrote:
>>>>>>>> After the commit 7fb1b252afb5 ("mm: shmem: add mTHP support for
>>>>>>>> anonymous shmem"), we can configure different policies through
>>>>>>>> the multi-size THP sysfs interface for anonymous shmem. But
>>>>>>>> currently "THPeligible" indicates only whether the mapping is
>>>>>>>> eligible for allocating THP-pages as well as the THP is PMD
>>>>>>>> mappable or not for anonymous shmem, we need to support semantics
>>>>>>>> for mTHP with anonymous shmem similar to those for mTHP with
>>>>>>>> anonymous memory.
>>>>>>>>
>>>>>>>> Signed-off-by: Bang Li <libang.li@antgroup.com>
>>>>>>>> ---
>>>>>>>>      fs/proc/task_mmu.c      | 10 +++++++---
>>>>>>>>      include/linux/huge_mm.h | 11 +++++++++++
>>>>>>>>      mm/shmem.c              |  9 +--------
>>>>>>>>      3 files changed, 19 insertions(+), 11 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
>>>>>>>> index 93fb2c61b154..09b5db356886 100644
>>>>>>>> --- a/fs/proc/task_mmu.c
>>>>>>>> +++ b/fs/proc/task_mmu.c
>>>>>>>> @@ -870,6 +870,7 @@ static int show_smap(struct seq_file *m, void *v)
>>>>>>>>      {
>>>>>>>>          struct vm_area_struct *vma = v;
>>>>>>>>          struct mem_size_stats mss = {};
>>>>>>>> +    bool thp_eligible;
>>>>>>>>            smap_gather_stats(vma, &mss, 0);
>>>>>>>>      @@ -882,9 +883,12 @@ static int show_smap(struct seq_file *m, void *v)
>>>>>>>>            __show_smap(m, &mss, false);
>>>>>>>>      -    seq_printf(m, "THPeligible:    %8u\n",
>>>>>>>> -           !!thp_vma_allowable_orders(vma, vma->vm_flags,
>>>>>>>> -               TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL));
>>>>>>>> +    thp_eligible = !!thp_vma_allowable_orders(vma, vma->vm_flags,
>>>>>>>> +                        TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL);
>>>>>>>> +    if (vma_is_anon_shmem(vma))
>>>>>>>> +        thp_eligible =
>>>>>>>> !!shmem_allowable_huge_orders(file_inode(vma->vm_file),
>>>>>>>> +                            vma, vma->vm_pgoff, thp_eligible);
>>>>>>>
>>>>>>> Afraid I haven't been following the shmem mTHP support work as much as I
>>>>>>> would
>>>>>>> have liked, but is there a reason why we need a separate function for shmem?
>>>>>>
>>>>>> Since shmem_allowable_huge_orders() only uses shmem specific logic to
>>>>>> determine
>>>>>> if huge orders are allowable, there is no need to complicate the
>>>>>> thp_vma_allowable_orders() function by adding more shmem related logic, making
>>>>>> it more bloated. In my view, providing a dedicated helper
>>>>>> shmem_allowable_huge_orders(), specifically for shmem, simplifies the logic.
>>>>>
>>>>> My point was really that a single interface (thp_vma_allowable_orders)
>>>>> should be
>>>>> used to get this information. I have no strong opinion on how the implementation
>>>>> of that interface looks. What you suggest below seems perfectly reasonable
>>>>> to me.
>>>>
>>>> Right. thp_vma_allowable_orders() might require some care as discussed in other
>>>> context (cleanly separate dax and shmem handling/orders). But that would be
>>>> follow-up cleanups.
>>>
>>> Are you planning to do that, or do you want me to send a patch?
>>
>> I'm planning on looking into some details, especially the interaction with large
>> folios in the pagecache. I'll let you know once I have a better idea what
>> actually should be done :)
> 
> OK great - I'll scrub it from my todo list... really getting things done today :)

Resolved the khugepaged thingy already? :P

[khugepaged not active when only enabling the sub-size via the 2M folder 
IIRC]
Bang Li July 1, 2024, 9:43 a.m. UTC | #16
Hi, Baolin

On 2024/7/1 16:33, Baolin Wang wrote:
> 
> 
> On 2024/7/1 15:55, Ryan Roberts wrote:
>> On 28/06/2024 11:49, Bang Li wrote:
>>> After the commit 7fb1b252afb5 ("mm: shmem: add mTHP support for
>>> anonymous shmem"), we can configure different policies through
>>> the multi-size THP sysfs interface for anonymous shmem. But
>>> currently "THPeligible" indicates only whether the mapping is
>>> eligible for allocating THP-pages as well as the THP is PMD
>>> mappable or not for anonymous shmem, we need to support semantics
>>> for mTHP with anonymous shmem similar to those for mTHP with
>>> anonymous memory.
>>>
>>> Signed-off-by: Bang Li <libang.li@antgroup.com>
>>> ---
>>>   fs/proc/task_mmu.c      | 10 +++++++---
>>>   include/linux/huge_mm.h | 11 +++++++++++
>>>   mm/shmem.c              |  9 +--------
>>>   3 files changed, 19 insertions(+), 11 deletions(-)
>>>
>>> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
>>> index 93fb2c61b154..09b5db356886 100644
>>> --- a/fs/proc/task_mmu.c
>>> +++ b/fs/proc/task_mmu.c
>>> @@ -870,6 +870,7 @@ static int show_smap(struct seq_file *m, void *v)
>>>   {
>>>       struct vm_area_struct *vma = v;
>>>       struct mem_size_stats mss = {};
>>> +    bool thp_eligible;
>>>       smap_gather_stats(vma, &mss, 0);
>>> @@ -882,9 +883,12 @@ static int show_smap(struct seq_file *m, void *v)
>>>       __show_smap(m, &mss, false);
>>> -    seq_printf(m, "THPeligible:    %8u\n",
>>> -           !!thp_vma_allowable_orders(vma, vma->vm_flags,
>>> -               TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL));
>>> +    thp_eligible = !!thp_vma_allowable_orders(vma, vma->vm_flags,
>>> +                        TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL);
>>> +    if (vma_is_anon_shmem(vma))
>>> +        thp_eligible = 
>>> !!shmem_allowable_huge_orders(file_inode(vma->vm_file),
>>> +                            vma, vma->vm_pgoff, thp_eligible);
>>
>> Afraid I haven't been following the shmem mTHP support work as much as 
>> I would
>> have liked, but is there a reason why we need a separate function for 
>> shmem?
> 
> Since shmem_allowable_huge_orders() only uses shmem specific logic to 
> determine if huge orders are allowable, there is no need to complicate 
> the thp_vma_allowable_orders() function by adding more shmem related 
> logic, making it more bloated. In my view, providing a dedicated helper 
> shmem_allowable_huge_orders(), specifically for shmem, simplifies the 
> logic.
> 
> IIUC, I agree with David's suggestion that the 
> shmem_allowable_huge_orders() helper function could be used in 
> thp_vma_allowable_orders() to support shmem mTHP. Something like:
> 
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index c7ce28f6b7f3..9677fe6cf478 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -151,10 +151,13 @@ unsigned long __thp_vma_allowable_orders(struct 
> vm_area_struct *vma,
>           * Must be done before hugepage flags check since shmem has its
>           * own flags.
>           */
> -       if (!in_pf && shmem_file(vma->vm_file))
> -               return shmem_is_huge(file_inode(vma->vm_file), 
> vma->vm_pgoff,
> -                                    !enforce_sysfs, vma->vm_mm, vm_flags)
> -                       ? orders : 0;
> +       if (!in_pf && shmem_file(vma->vm_file)) {
> +               bool global_huge = 
> shmem_is_huge(file_inode(vma->vm_file), vma->vm_pgoff,
> +                                    !enforce_sysfs, vma->vm_mm, vm_flags);
> +
> +               return 
> shmem_allowable_huge_orders(file_inode(vma->vm_file),
> +                                       vma, vma->vm_pgoff, global_huge);
> +       }
> 
>          if (!vma_is_anonymous(vma)) {
>                  /*
> 
>> Couldn't (shouldn't) thp_vma_allowable_orders() be taught to handle 
>> shmem too?
>>
>>> +    seq_printf(m, "THPeligible:    %8u\n", thp_eligible);
>>>       if (arch_pkeys_enabled())
>>>           seq_printf(m, "ProtectionKey:  %8u\n", vma_pkey(vma));
>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>> index 212cca384d7e..f87136f38aa1 100644
>>> --- a/include/linux/huge_mm.h
>>> +++ b/include/linux/huge_mm.h
>>> @@ -267,6 +267,10 @@ unsigned long thp_vma_allowable_orders(struct 
>>> vm_area_struct *vma,
>>>       return __thp_vma_allowable_orders(vma, vm_flags, tva_flags, 
>>> orders);
>>>   }
>>> +unsigned long shmem_allowable_huge_orders(struct inode *inode,
>>> +                struct vm_area_struct *vma, pgoff_t index,
>>> +                bool global_huge);
>>> +
>>>   struct thpsize {
>>>       struct kobject kobj;
>>>       struct list_head node;
>>> @@ -460,6 +464,13 @@ static inline unsigned long 
>>> thp_vma_allowable_orders(struct vm_area_struct *vma,
>>>       return 0;
>>>   }
>>> +static inline unsigned long shmem_allowable_huge_orders(struct inode 
>>> *inode,
>>> +                struct vm_area_struct *vma, pgoff_t index,
>>> +                bool global_huge)
>>> +{
>>> +    return 0;
>>> +}
>>> +
>>>   #define transparent_hugepage_flags 0UL
>>>   #define thp_get_unmapped_area    NULL
>>> diff --git a/mm/shmem.c b/mm/shmem.c
>>> index d495c0701a83..aa85df9c662a 100644
>>> --- a/mm/shmem.c
>>> +++ b/mm/shmem.c
>>> @@ -1622,7 +1622,7 @@ static gfp_t limit_gfp_mask(gfp_t huge_gfp, 
>>> gfp_t limit_gfp)
>>>   }
>>>   #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>>> -static unsigned long shmem_allowable_huge_orders(struct inode *inode,
>>> +unsigned long shmem_allowable_huge_orders(struct inode *inode,
>>>                   struct vm_area_struct *vma, pgoff_t index,
>>>                   bool global_huge)
>>>   {
>>> @@ -1707,13 +1707,6 @@ static unsigned long 
>>> shmem_suitable_orders(struct inode *inode, struct vm_fault
>>>       return orders;
>>>   }
>>>   #else
>>> -static unsigned long shmem_allowable_huge_orders(struct inode *inode,
>>> -                struct vm_area_struct *vma, pgoff_t index,
>>> -                bool global_huge)
>>> -{
>>> -    return 0;
>>> -}
>>> -
>>>   static unsigned long shmem_suitable_orders(struct inode *inode, 
>>> struct vm_fault *vmf,
>>>                          struct address_space *mapping, pgoff_t index,
>>>                          unsigned long orders)

Thanks for the reference code. Currently, we only implement the mTHP of
anonymous shmem, so we only need to handle anonymous shmem specially. As
shown in the following code:

--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -151,10 +151,14 @@ unsigned long __thp_vma_allowable_orders(struct 
vm_area_struct *vma,
          * Must be done before hugepage flags check since shmem has its
          * own flags.
          */
-       if (!in_pf && shmem_file(vma->vm_file))
-               return shmem_is_huge(file_inode(vma->vm_file), 
vma->vm_pgoff,
-                                    !enforce_sysfs, vma->vm_mm, vm_flags)
-                       ? orders : 0;
+       if (!in_pf && shmem_file(vma->vm_file)) {
+               bool global_huge = 
shmem_is_huge(file_inode(vma->vm_file), vma->vm_pgoff,
+                                    !enforce_sysfs, vma->vm_mm, vm_flags);
+               if (!vma_is_anon_shmem(vma))
+                       return global_huge? orders : 0;
+               return shmem_allowable_huge_orders(file_inode(vma->vm_file),
+                                               vma, vma->vm_pgoff, 
global_huge);
+       }

         if (!vma_is_anonymous(vma)) {

Thanks,
Bang
Ryan Roberts July 1, 2024, 10:16 a.m. UTC | #17
On 01/07/2024 10:17, David Hildenbrand wrote:
> On 01.07.24 11:14, Ryan Roberts wrote:
>> On 01/07/2024 09:57, David Hildenbrand wrote:
>>> On 01.07.24 10:50, Ryan Roberts wrote:
>>>> On 01/07/2024 09:48, David Hildenbrand wrote:
>>>>> On 01.07.24 10:40, Ryan Roberts wrote:
>>>>>> On 01/07/2024 09:33, Baolin Wang wrote:
>>>>>>>
>>>>>>>
>>>>>>> On 2024/7/1 15:55, Ryan Roberts wrote:
>>>>>>>> On 28/06/2024 11:49, Bang Li wrote:
>>>>>>>>> After the commit 7fb1b252afb5 ("mm: shmem: add mTHP support for
>>>>>>>>> anonymous shmem"), we can configure different policies through
>>>>>>>>> the multi-size THP sysfs interface for anonymous shmem. But
>>>>>>>>> currently "THPeligible" indicates only whether the mapping is
>>>>>>>>> eligible for allocating THP-pages as well as the THP is PMD
>>>>>>>>> mappable or not for anonymous shmem, we need to support semantics
>>>>>>>>> for mTHP with anonymous shmem similar to those for mTHP with
>>>>>>>>> anonymous memory.
>>>>>>>>>
>>>>>>>>> Signed-off-by: Bang Li <libang.li@antgroup.com>
>>>>>>>>> ---
>>>>>>>>>      fs/proc/task_mmu.c      | 10 +++++++---
>>>>>>>>>      include/linux/huge_mm.h | 11 +++++++++++
>>>>>>>>>      mm/shmem.c              |  9 +--------
>>>>>>>>>      3 files changed, 19 insertions(+), 11 deletions(-)
>>>>>>>>>
>>>>>>>>> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
>>>>>>>>> index 93fb2c61b154..09b5db356886 100644
>>>>>>>>> --- a/fs/proc/task_mmu.c
>>>>>>>>> +++ b/fs/proc/task_mmu.c
>>>>>>>>> @@ -870,6 +870,7 @@ static int show_smap(struct seq_file *m, void *v)
>>>>>>>>>      {
>>>>>>>>>          struct vm_area_struct *vma = v;
>>>>>>>>>          struct mem_size_stats mss = {};
>>>>>>>>> +    bool thp_eligible;
>>>>>>>>>            smap_gather_stats(vma, &mss, 0);
>>>>>>>>>      @@ -882,9 +883,12 @@ static int show_smap(struct seq_file *m, void
>>>>>>>>> *v)
>>>>>>>>>            __show_smap(m, &mss, false);
>>>>>>>>>      -    seq_printf(m, "THPeligible:    %8u\n",
>>>>>>>>> -           !!thp_vma_allowable_orders(vma, vma->vm_flags,
>>>>>>>>> -               TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL));
>>>>>>>>> +    thp_eligible = !!thp_vma_allowable_orders(vma, vma->vm_flags,
>>>>>>>>> +                        TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL);
>>>>>>>>> +    if (vma_is_anon_shmem(vma))
>>>>>>>>> +        thp_eligible =
>>>>>>>>> !!shmem_allowable_huge_orders(file_inode(vma->vm_file),
>>>>>>>>> +                            vma, vma->vm_pgoff, thp_eligible);
>>>>>>>>
>>>>>>>> Afraid I haven't been following the shmem mTHP support work as much as I
>>>>>>>> would
>>>>>>>> have liked, but is there a reason why we need a separate function for
>>>>>>>> shmem?
>>>>>>>
>>>>>>> Since shmem_allowable_huge_orders() only uses shmem specific logic to
>>>>>>> determine
>>>>>>> if huge orders are allowable, there is no need to complicate the
>>>>>>> thp_vma_allowable_orders() function by adding more shmem related logic,
>>>>>>> making
>>>>>>> it more bloated. In my view, providing a dedicated helper
>>>>>>> shmem_allowable_huge_orders(), specifically for shmem, simplifies the logic.
>>>>>>
>>>>>> My point was really that a single interface (thp_vma_allowable_orders)
>>>>>> should be
>>>>>> used to get this information. I have no strong opinion on how the
>>>>>> implementation
>>>>>> of that interface looks. What you suggest below seems perfectly reasonable
>>>>>> to me.
>>>>>
>>>>> Right. thp_vma_allowable_orders() might require some care as discussed in
>>>>> other
>>>>> context (cleanly separate dax and shmem handling/orders). But that would be
>>>>> follow-up cleanups.
>>>>
>>>> Are you planning to do that, or do you want me to send a patch?
>>>
>>> I'm planning on looking into some details, especially the interaction with large
>>> folios in the pagecache. I'll let you know once I have a better idea what
>>> actually should be done :)
>>
>> OK great - I'll scrub it from my todo list... really getting things done today :)
> 
> Resolved the khugepaged thingy already? :P
> 
> [khugepaged not active when only enabling the sub-size via the 2M folder IIRC]

Hmm... baby brain?

Sorry about that. I've been a bit useless lately. For some reason it wasn't on
my list, but it's there now. Will prioritise it, because I agree it's not good.
David Hildenbrand July 1, 2024, 10:22 a.m. UTC | #18
On 01.07.24 12:16, Ryan Roberts wrote:
> On 01/07/2024 10:17, David Hildenbrand wrote:
>> On 01.07.24 11:14, Ryan Roberts wrote:
>>> On 01/07/2024 09:57, David Hildenbrand wrote:
>>>> On 01.07.24 10:50, Ryan Roberts wrote:
>>>>> On 01/07/2024 09:48, David Hildenbrand wrote:
>>>>>> On 01.07.24 10:40, Ryan Roberts wrote:
>>>>>>> On 01/07/2024 09:33, Baolin Wang wrote:
>>>>>>>>
>>>>>>>>
>>>>>>>> On 2024/7/1 15:55, Ryan Roberts wrote:
>>>>>>>>> On 28/06/2024 11:49, Bang Li wrote:
>>>>>>>>>> After the commit 7fb1b252afb5 ("mm: shmem: add mTHP support for
>>>>>>>>>> anonymous shmem"), we can configure different policies through
>>>>>>>>>> the multi-size THP sysfs interface for anonymous shmem. But
>>>>>>>>>> currently "THPeligible" indicates only whether the mapping is
>>>>>>>>>> eligible for allocating THP-pages as well as the THP is PMD
>>>>>>>>>> mappable or not for anonymous shmem, we need to support semantics
>>>>>>>>>> for mTHP with anonymous shmem similar to those for mTHP with
>>>>>>>>>> anonymous memory.
>>>>>>>>>>
>>>>>>>>>> Signed-off-by: Bang Li <libang.li@antgroup.com>
>>>>>>>>>> ---
>>>>>>>>>>       fs/proc/task_mmu.c      | 10 +++++++---
>>>>>>>>>>       include/linux/huge_mm.h | 11 +++++++++++
>>>>>>>>>>       mm/shmem.c              |  9 +--------
>>>>>>>>>>       3 files changed, 19 insertions(+), 11 deletions(-)
>>>>>>>>>>
>>>>>>>>>> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
>>>>>>>>>> index 93fb2c61b154..09b5db356886 100644
>>>>>>>>>> --- a/fs/proc/task_mmu.c
>>>>>>>>>> +++ b/fs/proc/task_mmu.c
>>>>>>>>>> @@ -870,6 +870,7 @@ static int show_smap(struct seq_file *m, void *v)
>>>>>>>>>>       {
>>>>>>>>>>           struct vm_area_struct *vma = v;
>>>>>>>>>>           struct mem_size_stats mss = {};
>>>>>>>>>> +    bool thp_eligible;
>>>>>>>>>>             smap_gather_stats(vma, &mss, 0);
>>>>>>>>>>       @@ -882,9 +883,12 @@ static int show_smap(struct seq_file *m, void
>>>>>>>>>> *v)
>>>>>>>>>>             __show_smap(m, &mss, false);
>>>>>>>>>>       -    seq_printf(m, "THPeligible:    %8u\n",
>>>>>>>>>> -           !!thp_vma_allowable_orders(vma, vma->vm_flags,
>>>>>>>>>> -               TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL));
>>>>>>>>>> +    thp_eligible = !!thp_vma_allowable_orders(vma, vma->vm_flags,
>>>>>>>>>> +                        TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL);
>>>>>>>>>> +    if (vma_is_anon_shmem(vma))
>>>>>>>>>> +        thp_eligible =
>>>>>>>>>> !!shmem_allowable_huge_orders(file_inode(vma->vm_file),
>>>>>>>>>> +                            vma, vma->vm_pgoff, thp_eligible);
>>>>>>>>>
>>>>>>>>> Afraid I haven't been following the shmem mTHP support work as much as I
>>>>>>>>> would
>>>>>>>>> have liked, but is there a reason why we need a separate function for
>>>>>>>>> shmem?
>>>>>>>>
>>>>>>>> Since shmem_allowable_huge_orders() only uses shmem specific logic to
>>>>>>>> determine
>>>>>>>> if huge orders are allowable, there is no need to complicate the
>>>>>>>> thp_vma_allowable_orders() function by adding more shmem related logic,
>>>>>>>> making
>>>>>>>> it more bloated. In my view, providing a dedicated helper
>>>>>>>> shmem_allowable_huge_orders(), specifically for shmem, simplifies the logic.
>>>>>>>
>>>>>>> My point was really that a single interface (thp_vma_allowable_orders)
>>>>>>> should be
>>>>>>> used to get this information. I have no strong opinion on how the
>>>>>>> implementation
>>>>>>> of that interface looks. What you suggest below seems perfectly reasonable
>>>>>>> to me.
>>>>>>
>>>>>> Right. thp_vma_allowable_orders() might require some care as discussed in
>>>>>> other
>>>>>> context (cleanly separate dax and shmem handling/orders). But that would be
>>>>>> follow-up cleanups.
>>>>>
>>>>> Are you planning to do that, or do you want me to send a patch?
>>>>
>>>> I'm planning on looking into some details, especially the interaction with large
>>>> folios in the pagecache. I'll let you know once I have a better idea what
>>>> actually should be done :)
>>>
>>> OK great - I'll scrub it from my todo list... really getting things done today :)
>>
>> Resolved the khugepaged thingy already? :P
>>
>> [khugepaged not active when only enabling the sub-size via the 2M folder IIRC]
> 
> Hmm... baby brain?

:)

I think I only mentioned it in a private mail at some point.

> 
> Sorry about that. I've been a bit useless lately. For some reason it wasn't on
> my list, but it's there now. Will prioritise it, because I agree it's not good.


IIRC, if you do

echo never > /sys/kernel/mm/transparent_hugepage/enabled
echo always > /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled

khugepaged will not get activated.
Baolin Wang July 1, 2024, 11:12 a.m. UTC | #19
On 2024/7/1 17:43, Bang Li wrote:
> Hi, Baolin
> 
> On 2024/7/1 16:33, Baolin Wang wrote:
>>
>>
>> On 2024/7/1 15:55, Ryan Roberts wrote:
>>> On 28/06/2024 11:49, Bang Li wrote:
>>>> After the commit 7fb1b252afb5 ("mm: shmem: add mTHP support for
>>>> anonymous shmem"), we can configure different policies through
>>>> the multi-size THP sysfs interface for anonymous shmem. But
>>>> currently "THPeligible" indicates only whether the mapping is
>>>> eligible for allocating THP-pages as well as the THP is PMD
>>>> mappable or not for anonymous shmem, we need to support semantics
>>>> for mTHP with anonymous shmem similar to those for mTHP with
>>>> anonymous memory.
>>>>
>>>> Signed-off-by: Bang Li <libang.li@antgroup.com>
>>>> ---
>>>>   fs/proc/task_mmu.c      | 10 +++++++---
>>>>   include/linux/huge_mm.h | 11 +++++++++++
>>>>   mm/shmem.c              |  9 +--------
>>>>   3 files changed, 19 insertions(+), 11 deletions(-)
>>>>
>>>> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
>>>> index 93fb2c61b154..09b5db356886 100644
>>>> --- a/fs/proc/task_mmu.c
>>>> +++ b/fs/proc/task_mmu.c
>>>> @@ -870,6 +870,7 @@ static int show_smap(struct seq_file *m, void *v)
>>>>   {
>>>>       struct vm_area_struct *vma = v;
>>>>       struct mem_size_stats mss = {};
>>>> +    bool thp_eligible;
>>>>       smap_gather_stats(vma, &mss, 0);
>>>> @@ -882,9 +883,12 @@ static int show_smap(struct seq_file *m, void *v)
>>>>       __show_smap(m, &mss, false);
>>>> -    seq_printf(m, "THPeligible:    %8u\n",
>>>> -           !!thp_vma_allowable_orders(vma, vma->vm_flags,
>>>> -               TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL));
>>>> +    thp_eligible = !!thp_vma_allowable_orders(vma, vma->vm_flags,
>>>> +                        TVA_SMAPS | TVA_ENFORCE_SYSFS, 
>>>> THP_ORDERS_ALL);
>>>> +    if (vma_is_anon_shmem(vma))
>>>> +        thp_eligible = 
>>>> !!shmem_allowable_huge_orders(file_inode(vma->vm_file),
>>>> +                            vma, vma->vm_pgoff, thp_eligible);
>>>
>>> Afraid I haven't been following the shmem mTHP support work as much 
>>> as I would
>>> have liked, but is there a reason why we need a separate function for 
>>> shmem?
>>
>> Since shmem_allowable_huge_orders() only uses shmem specific logic to 
>> determine if huge orders are allowable, there is no need to complicate 
>> the thp_vma_allowable_orders() function by adding more shmem related 
>> logic, making it more bloated. In my view, providing a dedicated 
>> helper shmem_allowable_huge_orders(), specifically for shmem, 
>> simplifies the logic.
>>
>> IIUC, I agree with David's suggestion that the 
>> shmem_allowable_huge_orders() helper function could be used in 
>> thp_vma_allowable_orders() to support shmem mTHP. Something like:
>>
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index c7ce28f6b7f3..9677fe6cf478 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -151,10 +151,13 @@ unsigned long __thp_vma_allowable_orders(struct 
>> vm_area_struct *vma,
>>           * Must be done before hugepage flags check since shmem has its
>>           * own flags.
>>           */
>> -       if (!in_pf && shmem_file(vma->vm_file))
>> -               return shmem_is_huge(file_inode(vma->vm_file), 
>> vma->vm_pgoff,
>> -                                    !enforce_sysfs, vma->vm_mm, 
>> vm_flags)
>> -                       ? orders : 0;
>> +       if (!in_pf && shmem_file(vma->vm_file)) {
>> +               bool global_huge = 
>> shmem_is_huge(file_inode(vma->vm_file), vma->vm_pgoff,
>> +                                    !enforce_sysfs, vma->vm_mm, 
>> vm_flags);
>> +
>> +               return 
>> shmem_allowable_huge_orders(file_inode(vma->vm_file),
>> +                                       vma, vma->vm_pgoff, global_huge);
>> +       }
>>
>>          if (!vma_is_anonymous(vma)) {
>>                  /*
>>
>>> Couldn't (shouldn't) thp_vma_allowable_orders() be taught to handle 
>>> shmem too?
>>>
>>>> +    seq_printf(m, "THPeligible:    %8u\n", thp_eligible);
>>>>       if (arch_pkeys_enabled())
>>>>           seq_printf(m, "ProtectionKey:  %8u\n", vma_pkey(vma));
>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>>> index 212cca384d7e..f87136f38aa1 100644
>>>> --- a/include/linux/huge_mm.h
>>>> +++ b/include/linux/huge_mm.h
>>>> @@ -267,6 +267,10 @@ unsigned long thp_vma_allowable_orders(struct 
>>>> vm_area_struct *vma,
>>>>       return __thp_vma_allowable_orders(vma, vm_flags, tva_flags, 
>>>> orders);
>>>>   }
>>>> +unsigned long shmem_allowable_huge_orders(struct inode *inode,
>>>> +                struct vm_area_struct *vma, pgoff_t index,
>>>> +                bool global_huge);
>>>> +
>>>>   struct thpsize {
>>>>       struct kobject kobj;
>>>>       struct list_head node;
>>>> @@ -460,6 +464,13 @@ static inline unsigned long 
>>>> thp_vma_allowable_orders(struct vm_area_struct *vma,
>>>>       return 0;
>>>>   }
>>>> +static inline unsigned long shmem_allowable_huge_orders(struct 
>>>> inode *inode,
>>>> +                struct vm_area_struct *vma, pgoff_t index,
>>>> +                bool global_huge)
>>>> +{
>>>> +    return 0;
>>>> +}
>>>> +
>>>>   #define transparent_hugepage_flags 0UL
>>>>   #define thp_get_unmapped_area    NULL
>>>> diff --git a/mm/shmem.c b/mm/shmem.c
>>>> index d495c0701a83..aa85df9c662a 100644
>>>> --- a/mm/shmem.c
>>>> +++ b/mm/shmem.c
>>>> @@ -1622,7 +1622,7 @@ static gfp_t limit_gfp_mask(gfp_t huge_gfp, 
>>>> gfp_t limit_gfp)
>>>>   }
>>>>   #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>>>> -static unsigned long shmem_allowable_huge_orders(struct inode *inode,
>>>> +unsigned long shmem_allowable_huge_orders(struct inode *inode,
>>>>                   struct vm_area_struct *vma, pgoff_t index,
>>>>                   bool global_huge)
>>>>   {
>>>> @@ -1707,13 +1707,6 @@ static unsigned long 
>>>> shmem_suitable_orders(struct inode *inode, struct vm_fault
>>>>       return orders;
>>>>   }
>>>>   #else
>>>> -static unsigned long shmem_allowable_huge_orders(struct inode *inode,
>>>> -                struct vm_area_struct *vma, pgoff_t index,
>>>> -                bool global_huge)
>>>> -{
>>>> -    return 0;
>>>> -}
>>>> -
>>>>   static unsigned long shmem_suitable_orders(struct inode *inode, 
>>>> struct vm_fault *vmf,
>>>>                          struct address_space *mapping, pgoff_t index,
>>>>                          unsigned long orders)
> 
> Thanks for the reference code. Currently, we only implement the mTHP of
> anonymous shmem, so we only need to handle anonymous shmem specially. As
> shown in the following code:
> 
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -151,10 +151,14 @@ unsigned long __thp_vma_allowable_orders(struct 
> vm_area_struct *vma,
>           * Must be done before hugepage flags check since shmem has its
>           * own flags.
>           */
> -       if (!in_pf && shmem_file(vma->vm_file))
> -               return shmem_is_huge(file_inode(vma->vm_file), 
> vma->vm_pgoff,
> -                                    !enforce_sysfs, vma->vm_mm, vm_flags)
> -                       ? orders : 0;
> +       if (!in_pf && shmem_file(vma->vm_file)) {
> +               bool global_huge = 
> shmem_is_huge(file_inode(vma->vm_file), vma->vm_pgoff,
> +                                    !enforce_sysfs, vma->vm_mm, vm_flags);

Nit: add a blank line after the declaration. Otherwise looks good to me.

> +               if (!vma_is_anon_shmem(vma))
> +                       return global_huge? orders : 0;
> +               return 
> shmem_allowable_huge_orders(file_inode(vma->vm_file),
> +                                               vma, vma->vm_pgoff, 
> global_huge);
> +       }
> 
>          if (!vma_is_anonymous(vma)) {
> 
> Thanks,
> Bang
Bang Li July 1, 2024, 2:51 p.m. UTC | #20
On 2024/7/1 19:12, Baolin Wang wrote:
> 
> 
> On 2024/7/1 17:43, Bang Li wrote:
>> Hi, Baolin
>>
>> On 2024/7/1 16:33, Baolin Wang wrote:
>>>
>>>
>>> On 2024/7/1 15:55, Ryan Roberts wrote:
>>>> On 28/06/2024 11:49, Bang Li wrote:
>>>>> After the commit 7fb1b252afb5 ("mm: shmem: add mTHP support for
>>>>> anonymous shmem"), we can configure different policies through
>>>>> the multi-size THP sysfs interface for anonymous shmem. But
>>>>> currently "THPeligible" indicates only whether the mapping is
>>>>> eligible for allocating THP-pages as well as the THP is PMD
>>>>> mappable or not for anonymous shmem, we need to support semantics
>>>>> for mTHP with anonymous shmem similar to those for mTHP with
>>>>> anonymous memory.
>>>>>
>>>>> Signed-off-by: Bang Li <libang.li@antgroup.com>
>>>>> ---
>>>>>   fs/proc/task_mmu.c      | 10 +++++++---
>>>>>   include/linux/huge_mm.h | 11 +++++++++++
>>>>>   mm/shmem.c              |  9 +--------
>>>>>   3 files changed, 19 insertions(+), 11 deletions(-)
>>>>>
>>>>> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
>>>>> index 93fb2c61b154..09b5db356886 100644
>>>>> --- a/fs/proc/task_mmu.c
>>>>> +++ b/fs/proc/task_mmu.c
>>>>> @@ -870,6 +870,7 @@ static int show_smap(struct seq_file *m, void *v)
>>>>>   {
>>>>>       struct vm_area_struct *vma = v;
>>>>>       struct mem_size_stats mss = {};
>>>>> +    bool thp_eligible;
>>>>>       smap_gather_stats(vma, &mss, 0);
>>>>> @@ -882,9 +883,12 @@ static int show_smap(struct seq_file *m, void *v)
>>>>>       __show_smap(m, &mss, false);
>>>>> -    seq_printf(m, "THPeligible:    %8u\n",
>>>>> -           !!thp_vma_allowable_orders(vma, vma->vm_flags,
>>>>> -               TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL));
>>>>> +    thp_eligible = !!thp_vma_allowable_orders(vma, vma->vm_flags,
>>>>> +                        TVA_SMAPS | TVA_ENFORCE_SYSFS, 
>>>>> THP_ORDERS_ALL);
>>>>> +    if (vma_is_anon_shmem(vma))
>>>>> +        thp_eligible = 
>>>>> !!shmem_allowable_huge_orders(file_inode(vma->vm_file),
>>>>> +                            vma, vma->vm_pgoff, thp_eligible);
>>>>
>>>> Afraid I haven't been following the shmem mTHP support work as much 
>>>> as I would
>>>> have liked, but is there a reason why we need a separate function 
>>>> for shmem?
>>>
>>> Since shmem_allowable_huge_orders() only uses shmem specific logic to 
>>> determine if huge orders are allowable, there is no need to 
>>> complicate the thp_vma_allowable_orders() function by adding more 
>>> shmem related logic, making it more bloated. In my view, providing a 
>>> dedicated helper shmem_allowable_huge_orders(), specifically for 
>>> shmem, simplifies the logic.
>>>
>>> IIUC, I agree with David's suggestion that the 
>>> shmem_allowable_huge_orders() helper function could be used in 
>>> thp_vma_allowable_orders() to support shmem mTHP. Something like:
>>>
>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>> index c7ce28f6b7f3..9677fe6cf478 100644
>>> --- a/mm/huge_memory.c
>>> +++ b/mm/huge_memory.c
>>> @@ -151,10 +151,13 @@ unsigned long __thp_vma_allowable_orders(struct 
>>> vm_area_struct *vma,
>>>           * Must be done before hugepage flags check since shmem has its
>>>           * own flags.
>>>           */
>>> -       if (!in_pf && shmem_file(vma->vm_file))
>>> -               return shmem_is_huge(file_inode(vma->vm_file), 
>>> vma->vm_pgoff,
>>> -                                    !enforce_sysfs, vma->vm_mm, 
>>> vm_flags)
>>> -                       ? orders : 0;
>>> +       if (!in_pf && shmem_file(vma->vm_file)) {
>>> +               bool global_huge = 
>>> shmem_is_huge(file_inode(vma->vm_file), vma->vm_pgoff,
>>> +                                    !enforce_sysfs, vma->vm_mm, 
>>> vm_flags);
>>> +
>>> +               return 
>>> shmem_allowable_huge_orders(file_inode(vma->vm_file),
>>> +                                       vma, vma->vm_pgoff, 
>>> global_huge);
>>> +       }
>>>
>>>          if (!vma_is_anonymous(vma)) {
>>>                  /*
>>>
>>>> Couldn't (shouldn't) thp_vma_allowable_orders() be taught to handle 
>>>> shmem too?
>>>>
>>>>> +    seq_printf(m, "THPeligible:    %8u\n", thp_eligible);
>>>>>       if (arch_pkeys_enabled())
>>>>>           seq_printf(m, "ProtectionKey:  %8u\n", vma_pkey(vma));
>>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>>>> index 212cca384d7e..f87136f38aa1 100644
>>>>> --- a/include/linux/huge_mm.h
>>>>> +++ b/include/linux/huge_mm.h
>>>>> @@ -267,6 +267,10 @@ unsigned long thp_vma_allowable_orders(struct 
>>>>> vm_area_struct *vma,
>>>>>       return __thp_vma_allowable_orders(vma, vm_flags, tva_flags, 
>>>>> orders);
>>>>>   }
>>>>> +unsigned long shmem_allowable_huge_orders(struct inode *inode,
>>>>> +                struct vm_area_struct *vma, pgoff_t index,
>>>>> +                bool global_huge);
>>>>> +
>>>>>   struct thpsize {
>>>>>       struct kobject kobj;
>>>>>       struct list_head node;
>>>>> @@ -460,6 +464,13 @@ static inline unsigned long 
>>>>> thp_vma_allowable_orders(struct vm_area_struct *vma,
>>>>>       return 0;
>>>>>   }
>>>>> +static inline unsigned long shmem_allowable_huge_orders(struct 
>>>>> inode *inode,
>>>>> +                struct vm_area_struct *vma, pgoff_t index,
>>>>> +                bool global_huge)
>>>>> +{
>>>>> +    return 0;
>>>>> +}
>>>>> +
>>>>>   #define transparent_hugepage_flags 0UL
>>>>>   #define thp_get_unmapped_area    NULL
>>>>> diff --git a/mm/shmem.c b/mm/shmem.c
>>>>> index d495c0701a83..aa85df9c662a 100644
>>>>> --- a/mm/shmem.c
>>>>> +++ b/mm/shmem.c
>>>>> @@ -1622,7 +1622,7 @@ static gfp_t limit_gfp_mask(gfp_t huge_gfp, 
>>>>> gfp_t limit_gfp)
>>>>>   }
>>>>>   #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>>>>> -static unsigned long shmem_allowable_huge_orders(struct inode *inode,
>>>>> +unsigned long shmem_allowable_huge_orders(struct inode *inode,
>>>>>                   struct vm_area_struct *vma, pgoff_t index,
>>>>>                   bool global_huge)
>>>>>   {
>>>>> @@ -1707,13 +1707,6 @@ static unsigned long 
>>>>> shmem_suitable_orders(struct inode *inode, struct vm_fault
>>>>>       return orders;
>>>>>   }
>>>>>   #else
>>>>> -static unsigned long shmem_allowable_huge_orders(struct inode *inode,
>>>>> -                struct vm_area_struct *vma, pgoff_t index,
>>>>> -                bool global_huge)
>>>>> -{
>>>>> -    return 0;
>>>>> -}
>>>>> -
>>>>>   static unsigned long shmem_suitable_orders(struct inode *inode, 
>>>>> struct vm_fault *vmf,
>>>>>                          struct address_space *mapping, pgoff_t index,
>>>>>                          unsigned long orders)
>>
>> Thanks for the reference code. Currently, we only implement the mTHP of
>> anonymous shmem, so we only need to handle anonymous shmem specially. As
>> shown in the following code:
>>
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -151,10 +151,14 @@ unsigned long __thp_vma_allowable_orders(struct 
>> vm_area_struct *vma,
>>           * Must be done before hugepage flags check since shmem has its
>>           * own flags.
>>           */
>> -       if (!in_pf && shmem_file(vma->vm_file))
>> -               return shmem_is_huge(file_inode(vma->vm_file), 
>> vma->vm_pgoff,
>> -                                    !enforce_sysfs, vma->vm_mm, 
>> vm_flags)
>> -                       ? orders : 0;
>> +       if (!in_pf && shmem_file(vma->vm_file)) {
>> +               bool global_huge = 
>> shmem_is_huge(file_inode(vma->vm_file), vma->vm_pgoff,
>> +                                    !enforce_sysfs, vma->vm_mm, 
>> vm_flags);
> 
> Nit: add a blank line after the declaration. Otherwise looks good to me.

I don't mind either way about adding the blank line after the declaration;
thanks for the suggestion anyway.

Thanks,
Bang

> 
>> +               if (!vma_is_anon_shmem(vma))
>> +                       return global_huge? orders : 0;
>> +               return 
>> shmem_allowable_huge_orders(file_inode(vma->vm_file),
>> +                                               vma, vma->vm_pgoff, 
>> global_huge);
>> +       }
>>
>>          if (!vma_is_anonymous(vma)) {
>>
>> Thanks,
>> Bang
Yang Shi July 1, 2024, 6:20 p.m. UTC | #21
On Mon, Jul 1, 2024 at 3:23 AM David Hildenbrand <david@redhat.com> wrote:
>
> On 01.07.24 12:16, Ryan Roberts wrote:
> > On 01/07/2024 10:17, David Hildenbrand wrote:
> >> On 01.07.24 11:14, Ryan Roberts wrote:
> >>> On 01/07/2024 09:57, David Hildenbrand wrote:
> >>>> On 01.07.24 10:50, Ryan Roberts wrote:
> >>>>> On 01/07/2024 09:48, David Hildenbrand wrote:
> >>>>>> On 01.07.24 10:40, Ryan Roberts wrote:
> >>>>>>> On 01/07/2024 09:33, Baolin Wang wrote:
> >>>>>>>>
> >>>>>>>>
> >>>>>>>> On 2024/7/1 15:55, Ryan Roberts wrote:
> >>>>>>>>> On 28/06/2024 11:49, Bang Li wrote:
> >>>>>>>>>> After the commit 7fb1b252afb5 ("mm: shmem: add mTHP support for
> >>>>>>>>>> anonymous shmem"), we can configure different policies through
> >>>>>>>>>> the multi-size THP sysfs interface for anonymous shmem. But
> >>>>>>>>>> currently "THPeligible" indicates only whether the mapping is
> >>>>>>>>>> eligible for allocating THP-pages as well as the THP is PMD
> >>>>>>>>>> mappable or not for anonymous shmem, we need to support semantics
> >>>>>>>>>> for mTHP with anonymous shmem similar to those for mTHP with
> >>>>>>>>>> anonymous memory.
> >>>>>>>>>>
> >>>>>>>>>> Signed-off-by: Bang Li <libang.li@antgroup.com>
> >>>>>>>>>> ---
> >>>>>>>>>>       fs/proc/task_mmu.c      | 10 +++++++---
> >>>>>>>>>>       include/linux/huge_mm.h | 11 +++++++++++
> >>>>>>>>>>       mm/shmem.c              |  9 +--------
> >>>>>>>>>>       3 files changed, 19 insertions(+), 11 deletions(-)
> >>>>>>>>>>
> >>>>>>>>>> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> >>>>>>>>>> index 93fb2c61b154..09b5db356886 100644
> >>>>>>>>>> --- a/fs/proc/task_mmu.c
> >>>>>>>>>> +++ b/fs/proc/task_mmu.c
> >>>>>>>>>> @@ -870,6 +870,7 @@ static int show_smap(struct seq_file *m, void *v)
> >>>>>>>>>>       {
> >>>>>>>>>>           struct vm_area_struct *vma = v;
> >>>>>>>>>>           struct mem_size_stats mss = {};
> >>>>>>>>>> +    bool thp_eligible;
> >>>>>>>>>>             smap_gather_stats(vma, &mss, 0);
> >>>>>>>>>>       @@ -882,9 +883,12 @@ static int show_smap(struct seq_file *m, void
> >>>>>>>>>> *v)
> >>>>>>>>>>             __show_smap(m, &mss, false);
> >>>>>>>>>>       -    seq_printf(m, "THPeligible:    %8u\n",
> >>>>>>>>>> -           !!thp_vma_allowable_orders(vma, vma->vm_flags,
> >>>>>>>>>> -               TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL));
> >>>>>>>>>> +    thp_eligible = !!thp_vma_allowable_orders(vma, vma->vm_flags,
> >>>>>>>>>> +                        TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL);
> >>>>>>>>>> +    if (vma_is_anon_shmem(vma))
> >>>>>>>>>> +        thp_eligible =
> >>>>>>>>>> !!shmem_allowable_huge_orders(file_inode(vma->vm_file),
> >>>>>>>>>> +                            vma, vma->vm_pgoff, thp_eligible);
> >>>>>>>>>
> >>>>>>>>> Afraid I haven't been following the shmem mTHP support work as much as I
> >>>>>>>>> would
> >>>>>>>>> have liked, but is there a reason why we need a separate function for
> >>>>>>>>> shmem?
> >>>>>>>>
> >>>>>>>> Since shmem_allowable_huge_orders() only uses shmem specific logic to
> >>>>>>>> determine
> >>>>>>>> if huge orders are allowable, there is no need to complicate the
> >>>>>>>> thp_vma_allowable_orders() function by adding more shmem related logic,
> >>>>>>>> making
> >>>>>>>> it more bloated. In my view, providing a dedicated helper
> >>>>>>>> shmem_allowable_huge_orders(), specifically for shmem, simplifies the logic.
> >>>>>>>
> >>>>>>> My point was really that a single interface (thp_vma_allowable_orders)
> >>>>>>> should be
> >>>>>>> used to get this information. I have no strong opinion on how the
> >>>>>>> implementation
> >>>>>>> of that interface looks. What you suggest below seems perfectly reasonable
> >>>>>>> to me.
> >>>>>>
> >>>>>> Right. thp_vma_allowable_orders() might require some care as discussed in
> >>>>>> other
> >>>>>> context (cleanly separate dax and shmem handling/orders). But that would be
> >>>>>> follow-up cleanups.
> >>>>>
> >>>>> Are you planning to do that, or do you want me to send a patch?
> >>>>
> >>>> I'm planning on looking into some details, especially the interaction with large
> >>>> folios in the pagecache. I'll let you know once I have a better idea what
> >>>> actually should be done :)
> >>>
> >>> OK great - I'll scrub it from my todo list... really getting things done today :)
> >>
> >> Resolved the khugepaged thingy already? :P
> >>
> >> [khugepaged not active when only enabling the sub-size via the 2M folder IIRC]
> >
> > Hmm... baby brain?
>
> :)
>
> I think I only mentioned it in a private mail at some point.
>
> >
> > Sorry about that. I've been a bit useless lately. For some reason it wasn't on
> > my list, but it's there now. Will prioritise it, because I agree it's not good.
>
>
> IIRC, if you do
>
> echo never > /sys/kernel/mm/transparent_hugepage/enabled
> echo always > /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled
>
> khugepaged will not get activated.

khugepaged is controlled by the top level knob. But the above setting
sounds confusing, can we disable the top level knob, but enable it on
a per-order basis? TBH, it sounds weird and doesn't make too much
sense to me.

>
> --
> Cheers,
>
> David / dhildenb
>
>
Ryan Roberts July 2, 2024, 8:24 a.m. UTC | #22
On 01/07/2024 19:20, Yang Shi wrote:
> On Mon, Jul 1, 2024 at 3:23 AM David Hildenbrand <david@redhat.com> wrote:
>>
>> On 01.07.24 12:16, Ryan Roberts wrote:
>>> On 01/07/2024 10:17, David Hildenbrand wrote:
>>>> On 01.07.24 11:14, Ryan Roberts wrote:
>>>>> On 01/07/2024 09:57, David Hildenbrand wrote:
>>>>>> On 01.07.24 10:50, Ryan Roberts wrote:
>>>>>>> On 01/07/2024 09:48, David Hildenbrand wrote:
>>>>>>>> On 01.07.24 10:40, Ryan Roberts wrote:
>>>>>>>>> On 01/07/2024 09:33, Baolin Wang wrote:
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> On 2024/7/1 15:55, Ryan Roberts wrote:
>>>>>>>>>>> On 28/06/2024 11:49, Bang Li wrote:
>>>>>>>>>>>> After the commit 7fb1b252afb5 ("mm: shmem: add mTHP support for
>>>>>>>>>>>> anonymous shmem"), we can configure different policies through
>>>>>>>>>>>> the multi-size THP sysfs interface for anonymous shmem. But
>>>>>>>>>>>> currently "THPeligible" indicates only whether the mapping is
>>>>>>>>>>>> eligible for allocating THP-pages as well as the THP is PMD
>>>>>>>>>>>> mappable or not for anonymous shmem, we need to support semantics
>>>>>>>>>>>> for mTHP with anonymous shmem similar to those for mTHP with
>>>>>>>>>>>> anonymous memory.
>>>>>>>>>>>>
>>>>>>>>>>>> Signed-off-by: Bang Li <libang.li@antgroup.com>
>>>>>>>>>>>> ---
>>>>>>>>>>>>       fs/proc/task_mmu.c      | 10 +++++++---
>>>>>>>>>>>>       include/linux/huge_mm.h | 11 +++++++++++
>>>>>>>>>>>>       mm/shmem.c              |  9 +--------
>>>>>>>>>>>>       3 files changed, 19 insertions(+), 11 deletions(-)
>>>>>>>>>>>>
>>>>>>>>>>>> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
>>>>>>>>>>>> index 93fb2c61b154..09b5db356886 100644
>>>>>>>>>>>> --- a/fs/proc/task_mmu.c
>>>>>>>>>>>> +++ b/fs/proc/task_mmu.c
>>>>>>>>>>>> @@ -870,6 +870,7 @@ static int show_smap(struct seq_file *m, void *v)
>>>>>>>>>>>>       {
>>>>>>>>>>>>           struct vm_area_struct *vma = v;
>>>>>>>>>>>>           struct mem_size_stats mss = {};
>>>>>>>>>>>> +    bool thp_eligible;
>>>>>>>>>>>>             smap_gather_stats(vma, &mss, 0);
>>>>>>>>>>>>       @@ -882,9 +883,12 @@ static int show_smap(struct seq_file *m, void
>>>>>>>>>>>> *v)
>>>>>>>>>>>>             __show_smap(m, &mss, false);
>>>>>>>>>>>>       -    seq_printf(m, "THPeligible:    %8u\n",
>>>>>>>>>>>> -           !!thp_vma_allowable_orders(vma, vma->vm_flags,
>>>>>>>>>>>> -               TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL));
>>>>>>>>>>>> +    thp_eligible = !!thp_vma_allowable_orders(vma, vma->vm_flags,
>>>>>>>>>>>> +                        TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL);
>>>>>>>>>>>> +    if (vma_is_anon_shmem(vma))
>>>>>>>>>>>> +        thp_eligible =
>>>>>>>>>>>> !!shmem_allowable_huge_orders(file_inode(vma->vm_file),
>>>>>>>>>>>> +                            vma, vma->vm_pgoff, thp_eligible);
>>>>>>>>>>>
>>>>>>>>>>> Afraid I haven't been following the shmem mTHP support work as much as I
>>>>>>>>>>> would
>>>>>>>>>>> have liked, but is there a reason why we need a separate function for
>>>>>>>>>>> shmem?
>>>>>>>>>>
>>>>>>>>>> Since shmem_allowable_huge_orders() only uses shmem specific logic to
>>>>>>>>>> determine
>>>>>>>>>> if huge orders are allowable, there is no need to complicate the
>>>>>>>>>> thp_vma_allowable_orders() function by adding more shmem related logic,
>>>>>>>>>> making
>>>>>>>>>> it more bloated. In my view, providing a dedicated helper
>>>>>>>>>> shmem_allowable_huge_orders(), specifically for shmem, simplifies the logic.
>>>>>>>>>
>>>>>>>>> My point was really that a single interface (thp_vma_allowable_orders)
>>>>>>>>> should be
>>>>>>>>> used to get this information. I have no strong opinon on how the
>>>>>>>>> implementation
>>>>>>>>> of that interface looks. What you suggest below seems perfectly reasonable
>>>>>>>>> to me.
>>>>>>>>
>>>>>>>> Right. thp_vma_allowable_orders() might require some care as discussed in
>>>>>>>> other
>>>>>>>> context (cleanly separate dax and shmem handling/orders). But that would be
>>>>>>>> follow-up cleanups.
>>>>>>>
>>>>>>> Are you planning to do that, or do you want me to send a patch?
>>>>>>
>>>>>> I'm planning on looking into some details, especially the interaction with large
>>>>>> folios in the pagecache. I'll let you know once I have a better idea what
>>>>>> actually should be done :)
>>>>>
>>>>> OK great - I'll scrub it from my todo list... really getting things done today :)
>>>>
>>>> Resolved the khugepaged thiny already? :P
>>>>
>>>> [khugepaged not active when only enabling the sub-size via the 2M folder IIRC]
>>>
>>> Hmm... baby brain?
>>
>> :)
>>
>> I think I only mentioned it in a private mail at some point.
>>
>>>
>>> Sorry about that. I've been a bit useless lately. For some reason it wasn't on
>>> my list, but its there now. Will prioritise it, because I agree it's not good.
>>
>>
>> IIRC, if you do
>>
>> echo never > /sys/kernel/mm/transparent_hugepage/enabled
>> echo always > /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled
>>
>> khugepaged will not get activated.
> 
> khugepaged is controlled by the top level knob. 

What do you mean by "top level knob"? I assume
/sys/kernel/mm/transparent_hugepage/enabled ?

If so, that's not really a thing in its own right; it's just the legacy PMD-size
THP control, and we only take any notice of it if a per-size control is set to
"inherit". So if we have:

# echo always > /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled

Then by design, /sys/kernel/mm/transparent_hugepage/enabled should be ignored.

> But the above setting
> sounds confusing, can we disable the top level knob, but enable it on
> a per-order basis? TBH, it sounds weird and doesn't make too much
> sense to me.

Well, that's the design and that's how it's documented. It's done this way for
back-compat. All controls are now per-size. But at boot, we default all per-size
controls to "never" except for the PMD-sized control, which is defaulted to
"inherit". That way, an unenlightened user-space can still control PMD-sized THP
via the legacy (top-level) control. But enlightened apps can directly control
per-size.

I'm not sure how your way would work, because you would have 2 controls
competing to do the same thing?

> 
>>
>> --
>> Cheers,
>>
>> David / dhildenb
>>
>>
David Hildenbrand July 2, 2024, 8:28 a.m. UTC | #23
On 02.07.24 10:24, Ryan Roberts wrote:
> On 01/07/2024 19:20, Yang Shi wrote:
>> On Mon, Jul 1, 2024 at 3:23 AM David Hildenbrand <david@redhat.com> wrote:
>>>
>>> On 01.07.24 12:16, Ryan Roberts wrote:
>>>> On 01/07/2024 10:17, David Hildenbrand wrote:
>>>>> On 01.07.24 11:14, Ryan Roberts wrote:
>>>>>> On 01/07/2024 09:57, David Hildenbrand wrote:
>>>>>>> On 01.07.24 10:50, Ryan Roberts wrote:
>>>>>>>> On 01/07/2024 09:48, David Hildenbrand wrote:
>>>>>>>>> On 01.07.24 10:40, Ryan Roberts wrote:
>>>>>>>>>> On 01/07/2024 09:33, Baolin Wang wrote:
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> On 2024/7/1 15:55, Ryan Roberts wrote:
>>>>>>>>>>>> On 28/06/2024 11:49, Bang Li wrote:
>>>>>>>>>>>>> After the commit 7fb1b252afb5 ("mm: shmem: add mTHP support for
>>>>>>>>>>>>> anonymous shmem"), we can configure different policies through
>>>>>>>>>>>>> the multi-size THP sysfs interface for anonymous shmem. But
>>>>>>>>>>>>> currently "THPeligible" indicates only whether the mapping is
>>>>>>>>>>>>> eligible for allocating THP-pages as well as the THP is PMD
>>>>>>>>>>>>> mappable or not for anonymous shmem, we need to support semantics
>>>>>>>>>>>>> for mTHP with anonymous shmem similar to those for mTHP with
>>>>>>>>>>>>> anonymous memory.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Signed-off-by: Bang Li <libang.li@antgroup.com>
>>>>>>>>>>>>> ---
>>>>>>>>>>>>>        fs/proc/task_mmu.c      | 10 +++++++---
>>>>>>>>>>>>>        include/linux/huge_mm.h | 11 +++++++++++
>>>>>>>>>>>>>        mm/shmem.c              |  9 +--------
>>>>>>>>>>>>>        3 files changed, 19 insertions(+), 11 deletions(-)
>>>>>>>>>>>>>
>>>>>>>>>>>>> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
>>>>>>>>>>>>> index 93fb2c61b154..09b5db356886 100644
>>>>>>>>>>>>> --- a/fs/proc/task_mmu.c
>>>>>>>>>>>>> +++ b/fs/proc/task_mmu.c
>>>>>>>>>>>>> @@ -870,6 +870,7 @@ static int show_smap(struct seq_file *m, void *v)
>>>>>>>>>>>>>        {
>>>>>>>>>>>>>            struct vm_area_struct *vma = v;
>>>>>>>>>>>>>            struct mem_size_stats mss = {};
>>>>>>>>>>>>> +    bool thp_eligible;
>>>>>>>>>>>>>              smap_gather_stats(vma, &mss, 0);
>>>>>>>>>>>>>        @@ -882,9 +883,12 @@ static int show_smap(struct seq_file *m, void
>>>>>>>>>>>>> *v)
>>>>>>>>>>>>>              __show_smap(m, &mss, false);
>>>>>>>>>>>>>        -    seq_printf(m, "THPeligible:    %8u\n",
>>>>>>>>>>>>> -           !!thp_vma_allowable_orders(vma, vma->vm_flags,
>>>>>>>>>>>>> -               TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL));
>>>>>>>>>>>>> +    thp_eligible = !!thp_vma_allowable_orders(vma, vma->vm_flags,
>>>>>>>>>>>>> +                        TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL);
>>>>>>>>>>>>> +    if (vma_is_anon_shmem(vma))
>>>>>>>>>>>>> +        thp_eligible =
>>>>>>>>>>>>> !!shmem_allowable_huge_orders(file_inode(vma->vm_file),
>>>>>>>>>>>>> +                            vma, vma->vm_pgoff, thp_eligible);
>>>>>>>>>>>>
>>>>>>>>>>>> Afraid I haven't been following the shmem mTHP support work as much as I
>>>>>>>>>>>> would
>>>>>>>>>>>> have liked, but is there a reason why we need a separate function for
>>>>>>>>>>>> shmem?
>>>>>>>>>>>
>>>>>>>>>>> Since shmem_allowable_huge_orders() only uses shmem specific logic to
>>>>>>>>>>> determine
>>>>>>>>>>> if huge orders are allowable, there is no need to complicate the
>>>>>>>>>>> thp_vma_allowable_orders() function by adding more shmem related logic,
>>>>>>>>>>> making
>>>>>>>>>>> it more bloated. In my view, providing a dedicated helper
>>>>>>>>>>> shmem_allowable_huge_orders(), specifically for shmem, simplifies the logic.
>>>>>>>>>>
>>>>>>>>>> My point was really that a single interface (thp_vma_allowable_orders)
>>>>>>>>>> should be
>>>>>>>>>> used to get this information. I have no strong opinon on how the
>>>>>>>>>> implementation
>>>>>>>>>> of that interface looks. What you suggest below seems perfectly reasonable
>>>>>>>>>> to me.
>>>>>>>>>
>>>>>>>>> Right. thp_vma_allowable_orders() might require some care as discussed in
>>>>>>>>> other
>>>>>>>>> context (cleanly separate dax and shmem handling/orders). But that would be
>>>>>>>>> follow-up cleanups.
>>>>>>>>
>>>>>>>> Are you planning to do that, or do you want me to send a patch?
>>>>>>>
>>>>>>> I'm planning on looking into some details, especially the interaction with large
>>>>>>> folios in the pagecache. I'll let you know once I have a better idea what
>>>>>>> actually should be done :)
>>>>>>
>>>>>> OK great - I'll scrub it from my todo list... really getting things done today :)
>>>>>
>>>>> Resolved the khugepaged thiny already? :P
>>>>>
>>>>> [khugepaged not active when only enabling the sub-size via the 2M folder IIRC]
>>>>
>>>> Hmm... baby brain?
>>>
>>> :)
>>>
>>> I think I only mentioned it in a private mail at some point.
>>>
>>>>
>>>> Sorry about that. I've been a bit useless lately. For some reason it wasn't on
>>>> my list, but its there now. Will prioritise it, because I agree it's not good.
>>>
>>>
>>> IIRC, if you do
>>>
>>> echo never > /sys/kernel/mm/transparent_hugepage/enabled
>>> echo always > /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled
>>>
>>> khugepaged will not get activated.
>>
>> khugepaged is controlled by the top level knob.
> 
> What do you mean by "top level knob"? I assume
> /sys/kernel/mm/transparent_hugepage/enabled ?
> 
> If so, that's not really a thing in its own right; its just the legacy PMD-size
> THP control, and we only take any notice of it if a per-size control is set to
> "inherit". So if we have:

In a simpler world, where "enabled" would have been a boolean (true / 
false), we could have made it a universal killswitch that is AND'ed with 
the other ones.

Unfortunately, we don't live in such a simple world.
Yang Shi July 3, 2024, 4:08 p.m. UTC | #24
On Tue, Jul 2, 2024 at 1:24 AM Ryan Roberts <ryan.roberts@arm.com> wrote:
>
> On 01/07/2024 19:20, Yang Shi wrote:
> > On Mon, Jul 1, 2024 at 3:23 AM David Hildenbrand <david@redhat.com> wrote:
> >>
> >> On 01.07.24 12:16, Ryan Roberts wrote:
> >>> On 01/07/2024 10:17, David Hildenbrand wrote:
> >>>> On 01.07.24 11:14, Ryan Roberts wrote:
> >>>>> On 01/07/2024 09:57, David Hildenbrand wrote:
> >>>>>> On 01.07.24 10:50, Ryan Roberts wrote:
> >>>>>>> On 01/07/2024 09:48, David Hildenbrand wrote:
> >>>>>>>> On 01.07.24 10:40, Ryan Roberts wrote:
> >>>>>>>>> On 01/07/2024 09:33, Baolin Wang wrote:
> >>>>>>>>>>
> >>>>>>>>>>
> >>>>>>>>>> On 2024/7/1 15:55, Ryan Roberts wrote:
> >>>>>>>>>>> On 28/06/2024 11:49, Bang Li wrote:
> >>>>>>>>>>>> After the commit 7fb1b252afb5 ("mm: shmem: add mTHP support for
> >>>>>>>>>>>> anonymous shmem"), we can configure different policies through
> >>>>>>>>>>>> the multi-size THP sysfs interface for anonymous shmem. But
> >>>>>>>>>>>> currently "THPeligible" indicates only whether the mapping is
> >>>>>>>>>>>> eligible for allocating THP-pages as well as the THP is PMD
> >>>>>>>>>>>> mappable or not for anonymous shmem, we need to support semantics
> >>>>>>>>>>>> for mTHP with anonymous shmem similar to those for mTHP with
> >>>>>>>>>>>> anonymous memory.
> >>>>>>>>>>>>
> >>>>>>>>>>>> Signed-off-by: Bang Li <libang.li@antgroup.com>
> >>>>>>>>>>>> ---
> >>>>>>>>>>>>       fs/proc/task_mmu.c      | 10 +++++++---
> >>>>>>>>>>>>       include/linux/huge_mm.h | 11 +++++++++++
> >>>>>>>>>>>>       mm/shmem.c              |  9 +--------
> >>>>>>>>>>>>       3 files changed, 19 insertions(+), 11 deletions(-)
> >>>>>>>>>>>>
> >>>>>>>>>>>> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> >>>>>>>>>>>> index 93fb2c61b154..09b5db356886 100644
> >>>>>>>>>>>> --- a/fs/proc/task_mmu.c
> >>>>>>>>>>>> +++ b/fs/proc/task_mmu.c
> >>>>>>>>>>>> @@ -870,6 +870,7 @@ static int show_smap(struct seq_file *m, void *v)
> >>>>>>>>>>>>       {
> >>>>>>>>>>>>           struct vm_area_struct *vma = v;
> >>>>>>>>>>>>           struct mem_size_stats mss = {};
> >>>>>>>>>>>> +    bool thp_eligible;
> >>>>>>>>>>>>             smap_gather_stats(vma, &mss, 0);
> >>>>>>>>>>>>       @@ -882,9 +883,12 @@ static int show_smap(struct seq_file *m, void
> >>>>>>>>>>>> *v)
> >>>>>>>>>>>>             __show_smap(m, &mss, false);
> >>>>>>>>>>>>       -    seq_printf(m, "THPeligible:    %8u\n",
> >>>>>>>>>>>> -           !!thp_vma_allowable_orders(vma, vma->vm_flags,
> >>>>>>>>>>>> -               TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL));
> >>>>>>>>>>>> +    thp_eligible = !!thp_vma_allowable_orders(vma, vma->vm_flags,
> >>>>>>>>>>>> +                        TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL);
> >>>>>>>>>>>> +    if (vma_is_anon_shmem(vma))
> >>>>>>>>>>>> +        thp_eligible =
> >>>>>>>>>>>> !!shmem_allowable_huge_orders(file_inode(vma->vm_file),
> >>>>>>>>>>>> +                            vma, vma->vm_pgoff, thp_eligible);
> >>>>>>>>>>>
> >>>>>>>>>>> Afraid I haven't been following the shmem mTHP support work as much as I
> >>>>>>>>>>> would
> >>>>>>>>>>> have liked, but is there a reason why we need a separate function for
> >>>>>>>>>>> shmem?
> >>>>>>>>>>
> >>>>>>>>>> Since shmem_allowable_huge_orders() only uses shmem specific logic to
> >>>>>>>>>> determine
> >>>>>>>>>> if huge orders are allowable, there is no need to complicate the
> >>>>>>>>>> thp_vma_allowable_orders() function by adding more shmem related logic,
> >>>>>>>>>> making
> >>>>>>>>>> it more bloated. In my view, providing a dedicated helper
> >>>>>>>>>> shmem_allowable_huge_orders(), specifically for shmem, simplifies the logic.
> >>>>>>>>>
> >>>>>>>>> My point was really that a single interface (thp_vma_allowable_orders)
> >>>>>>>>> should be
> >>>>>>>>> used to get this information. I have no strong opinon on how the
> >>>>>>>>> implementation
> >>>>>>>>> of that interface looks. What you suggest below seems perfectly reasonable
> >>>>>>>>> to me.
> >>>>>>>>
> >>>>>>>> Right. thp_vma_allowable_orders() might require some care as discussed in
> >>>>>>>> other
> >>>>>>>> context (cleanly separate dax and shmem handling/orders). But that would be
> >>>>>>>> follow-up cleanups.
> >>>>>>>
> >>>>>>> Are you planning to do that, or do you want me to send a patch?
> >>>>>>
> >>>>>> I'm planning on looking into some details, especially the interaction with large
> >>>>>> folios in the pagecache. I'll let you know once I have a better idea what
> >>>>>> actually should be done :)
> >>>>>
> >>>>> OK great - I'll scrub it from my todo list... really getting things done today :)
> >>>>
> >>>> Resolved the khugepaged thiny already? :P
> >>>>
> >>>> [khugepaged not active when only enabling the sub-size via the 2M folder IIRC]
> >>>
> >>> Hmm... baby brain?
> >>
> >> :)
> >>
> >> I think I only mentioned it in a private mail at some point.
> >>
> >>>
> >>> Sorry about that. I've been a bit useless lately. For some reason it wasn't on
> >>> my list, but its there now. Will prioritise it, because I agree it's not good.
> >>
> >>
> >> IIRC, if you do
> >>
> >> echo never > /sys/kernel/mm/transparent_hugepage/enabled
> >> echo always > /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled
> >>
> >> khugepaged will not get activated.
> >
> > khugepaged is controlled by the top level knob.
>
> What do you mean by "top level knob"? I assume
> /sys/kernel/mm/transparent_hugepage/enabled ?

Yes.

>
> If so, that's not really a thing in its own right; its just the legacy PMD-size
> THP control, and we only take any notice of it if a per-size control is set to
> "inherit". So if we have:
>
> # echo always > /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled
>
> Then by design, /sys/kernel/mm/transparent_hugepage/enabled should be ignored.
>
> > But the above setting
> > sounds confusing, can we disable the top level knob, but enable it on
> > a per-order basis? TBH, it sounds weird and doesn't make too much
> > sense to me.
>
> Well that's the design and that's how its documented. It's done this way for
> back-compat. All controls are now per-size. But at boot, we default all per-size
> controls to "never" except for the PMD-sized control, which is defaulted to
> "inherit". That way, an unenlightened user-space can still control PMD-sized THP
> via the legacy (top-level) control. But enlightened apps can directly control
> per-size.

OK, good to know.

>
> I'm not sure how your way would work, because you would have 2 controls
> competing to do the same thing?

I don't see how they compete if they are 2-level knobs. And I fail
to see how it achieves back-compat. For example, memcached reads
/sys/kernel/mm/transparent_hugepage/enabled to determine whether it
should manage memory in huge page (2M) granularity. If the settings
are:

# echo never > /sys/kernel/mm/transparent_hugepage/enabled
# echo always > /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled

memcached will manage memory in 4K granularity, but 2M THP is actually
enabled unless memcached checks the per-order knobs.

If we use 2-level mode, memcached doesn't need to check the per-order settings
at all in order to know whether THP is enabled or not. And it actually
doesn't care about what orders are enabled; it assumes the THP size is 2M
(or PMD size). Even if 2M is not enabled but lower orders are
enabled, memcached can still fully utilize mTHP since the memory
chunks managed by memcached are still 2M aligned in this setting. So
unenlightened applications can still work well. Jemalloc should do a
similar thing if I remember correctly.

>
> >
> >>
> >> --
> >> Cheers,
> >>
> >> David / dhildenb
> >>
> >>
>
David Hildenbrand July 3, 2024, 4:19 p.m. UTC | #25
On 03.07.24 18:08, Yang Shi wrote:
> On Tue, Jul 2, 2024 at 1:24 AM Ryan Roberts <ryan.roberts@arm.com> wrote:
>>
>> On 01/07/2024 19:20, Yang Shi wrote:
>>> On Mon, Jul 1, 2024 at 3:23 AM David Hildenbrand <david@redhat.com> wrote:
>>>>
>>>> On 01.07.24 12:16, Ryan Roberts wrote:
>>>>> On 01/07/2024 10:17, David Hildenbrand wrote:
>>>>>> On 01.07.24 11:14, Ryan Roberts wrote:
>>>>>>> On 01/07/2024 09:57, David Hildenbrand wrote:
>>>>>>>> On 01.07.24 10:50, Ryan Roberts wrote:
>>>>>>>>> On 01/07/2024 09:48, David Hildenbrand wrote:
>>>>>>>>>> On 01.07.24 10:40, Ryan Roberts wrote:
>>>>>>>>>>> On 01/07/2024 09:33, Baolin Wang wrote:
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> On 2024/7/1 15:55, Ryan Roberts wrote:
>>>>>>>>>>>>> On 28/06/2024 11:49, Bang Li wrote:
>>>>>>>>>>>>>> After the commit 7fb1b252afb5 ("mm: shmem: add mTHP support for
>>>>>>>>>>>>>> anonymous shmem"), we can configure different policies through
>>>>>>>>>>>>>> the multi-size THP sysfs interface for anonymous shmem. But
>>>>>>>>>>>>>> currently "THPeligible" indicates only whether the mapping is
>>>>>>>>>>>>>> eligible for allocating THP-pages as well as the THP is PMD
>>>>>>>>>>>>>> mappable or not for anonymous shmem, we need to support semantics
>>>>>>>>>>>>>> for mTHP with anonymous shmem similar to those for mTHP with
>>>>>>>>>>>>>> anonymous memory.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Signed-off-by: Bang Li <libang.li@antgroup.com>
>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>        fs/proc/task_mmu.c      | 10 +++++++---
>>>>>>>>>>>>>>        include/linux/huge_mm.h | 11 +++++++++++
>>>>>>>>>>>>>>        mm/shmem.c              |  9 +--------
>>>>>>>>>>>>>>        3 files changed, 19 insertions(+), 11 deletions(-)
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
>>>>>>>>>>>>>> index 93fb2c61b154..09b5db356886 100644
>>>>>>>>>>>>>> --- a/fs/proc/task_mmu.c
>>>>>>>>>>>>>> +++ b/fs/proc/task_mmu.c
>>>>>>>>>>>>>> @@ -870,6 +870,7 @@ static int show_smap(struct seq_file *m, void *v)
>>>>>>>>>>>>>>        {
>>>>>>>>>>>>>>            struct vm_area_struct *vma = v;
>>>>>>>>>>>>>>            struct mem_size_stats mss = {};
>>>>>>>>>>>>>> +    bool thp_eligible;
>>>>>>>>>>>>>>              smap_gather_stats(vma, &mss, 0);
>>>>>>>>>>>>>>        @@ -882,9 +883,12 @@ static int show_smap(struct seq_file *m, void
>>>>>>>>>>>>>> *v)
>>>>>>>>>>>>>>              __show_smap(m, &mss, false);
>>>>>>>>>>>>>>        -    seq_printf(m, "THPeligible:    %8u\n",
>>>>>>>>>>>>>> -           !!thp_vma_allowable_orders(vma, vma->vm_flags,
>>>>>>>>>>>>>> -               TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL));
>>>>>>>>>>>>>> +    thp_eligible = !!thp_vma_allowable_orders(vma, vma->vm_flags,
>>>>>>>>>>>>>> +                        TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL);
>>>>>>>>>>>>>> +    if (vma_is_anon_shmem(vma))
>>>>>>>>>>>>>> +        thp_eligible =
>>>>>>>>>>>>>> !!shmem_allowable_huge_orders(file_inode(vma->vm_file),
>>>>>>>>>>>>>> +                            vma, vma->vm_pgoff, thp_eligible);
>>>>>>>>>>>>>
>>>>>>>>>>>>> Afraid I haven't been following the shmem mTHP support work as much as I
>>>>>>>>>>>>> would
>>>>>>>>>>>>> have liked, but is there a reason why we need a separate function for
>>>>>>>>>>>>> shmem?
>>>>>>>>>>>>
>>>>>>>>>>>> Since shmem_allowable_huge_orders() only uses shmem specific logic to
>>>>>>>>>>>> determine
>>>>>>>>>>>> if huge orders are allowable, there is no need to complicate the
>>>>>>>>>>>> thp_vma_allowable_orders() function by adding more shmem related logic,
>>>>>>>>>>>> making
>>>>>>>>>>>> it more bloated. In my view, providing a dedicated helper
>>>>>>>>>>>> shmem_allowable_huge_orders(), specifically for shmem, simplifies the logic.
>>>>>>>>>>>
>>>>>>>>>>> My point was really that a single interface (thp_vma_allowable_orders)
>>>>>>>>>>> should be
>>>>>>>>>>> used to get this information. I have no strong opinon on how the
>>>>>>>>>>> implementation
>>>>>>>>>>> of that interface looks. What you suggest below seems perfectly reasonable
>>>>>>>>>>> to me.
>>>>>>>>>>
>>>>>>>>>> Right. thp_vma_allowable_orders() might require some care as discussed in
>>>>>>>>>> other
>>>>>>>>>> context (cleanly separate dax and shmem handling/orders). But that would be
>>>>>>>>>> follow-up cleanups.
>>>>>>>>>
>>>>>>>>> Are you planning to do that, or do you want me to send a patch?
>>>>>>>>
>>>>>>>> I'm planning on looking into some details, especially the interaction with large
>>>>>>>> folios in the pagecache. I'll let you know once I have a better idea what
>>>>>>>> actually should be done :)
>>>>>>>
>>>>>>> OK great - I'll scrub it from my todo list... really getting things done today :)
>>>>>>
>>>>>> Resolved the khugepaged thiny already? :P
>>>>>>
>>>>>> [khugepaged not active when only enabling the sub-size via the 2M folder IIRC]
>>>>>
>>>>> Hmm... baby brain?
>>>>
>>>> :)
>>>>
>>>> I think I only mentioned it in a private mail at some point.
>>>>
>>>>>
>>>>> Sorry about that. I've been a bit useless lately. For some reason it wasn't on
>>>>> my list, but its there now. Will prioritise it, because I agree it's not good.
>>>>
>>>>
>>>> IIRC, if you do
>>>>
>>>> echo never > /sys/kernel/mm/transparent_hugepage/enabled
>>>> echo always > /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled
>>>>
>>>> khugepaged will not get activated.
>>>
>>> khugepaged is controlled by the top level knob.
>>
>> What do you mean by "top level knob"? I assume
>> /sys/kernel/mm/transparent_hugepage/enabled ?
> 
> Yes.
> 
>>
>> If so, that's not really a thing in its own right; its just the legacy PMD-size
>> THP control, and we only take any notice of it if a per-size control is set to
>> "inherit". So if we have:
>>
>> # echo always > /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled
>>
>> Then by design, /sys/kernel/mm/transparent_hugepage/enabled should be ignored.
>>
>>> But the above setting
>>> sounds confusing, can we disable the top level knob, but enable it on
>>> a per-order basis? TBH, it sounds weird and doesn't make too much
>>> sense to me.
>>
>> Well that's the design and that's how its documented. It's done this way for
>> back-compat. All controls are now per-size. But at boot, we default all per-size
>> controls to "never" except for the PMD-sized control, which is defaulted to
>> "inherit". That way, an unenlightened user-space can still control PMD-sized THP
>> via the legacy (top-level) control. But enlightened apps can directly control
>> per-size.
> 
> OK, good to know.
> 
>>
>> I'm not sure how your way would work, because you would have 2 controls
>> competing to do the same thing?
> 
> I don't see how they compete if they are 2-level knobs. And I failed
> to see how it achieved back-compat. For example, memcached reads
> /sys/kernel/mm/transparent_hugepage/enabled to determine whether it
> should manage memory in huge page (2M) granularity. If the setting is
> set to :
> 
> # echo never > /sys/kernel/mm/transparent_hugepage/enabled
> # echo always > /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled
> 
> memcached will manage memory in 4K granularity, but 2M THP is actually
> enabled unless memcached checks the per-order knobs.

And you can still do it the old way and keep it all working with 
existing software (compat mode as default).

It's just another option, and some software might need updates to benefit
from it (just as it would if you enabled other folio sizes).

You can happily do

echo always > /sys/kernel/mm/transparent_hugepage/enabled
echo inherit > /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled

It's an admin choice.
Ryan Roberts July 4, 2024, 9:43 a.m. UTC | #26
On 03/07/2024 17:08, Yang Shi wrote:
> On Tue, Jul 2, 2024 at 1:24 AM Ryan Roberts <ryan.roberts@arm.com> wrote:
>>
>> On 01/07/2024 19:20, Yang Shi wrote:
>>> On Mon, Jul 1, 2024 at 3:23 AM David Hildenbrand <david@redhat.com> wrote:
>>>>
>>>> On 01.07.24 12:16, Ryan Roberts wrote:
>>>>> On 01/07/2024 10:17, David Hildenbrand wrote:
>>>>>> On 01.07.24 11:14, Ryan Roberts wrote:
>>>>>>> On 01/07/2024 09:57, David Hildenbrand wrote:
>>>>>>>> On 01.07.24 10:50, Ryan Roberts wrote:
>>>>>>>>> On 01/07/2024 09:48, David Hildenbrand wrote:
>>>>>>>>>> On 01.07.24 10:40, Ryan Roberts wrote:
>>>>>>>>>>> On 01/07/2024 09:33, Baolin Wang wrote:
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> On 2024/7/1 15:55, Ryan Roberts wrote:
>>>>>>>>>>>>> On 28/06/2024 11:49, Bang Li wrote:
>>>>>>>>>>>>>> After the commit 7fb1b252afb5 ("mm: shmem: add mTHP support for
>>>>>>>>>>>>>> anonymous shmem"), we can configure different policies through
>>>>>>>>>>>>>> the multi-size THP sysfs interface for anonymous shmem. But
>>>>>>>>>>>>>> currently "THPeligible" indicates only whether the mapping is
>>>>>>>>>>>>>> eligible for allocating THP-pages as well as the THP is PMD
>>>>>>>>>>>>>> mappable or not for anonymous shmem, we need to support semantics
>>>>>>>>>>>>>> for mTHP with anonymous shmem similar to those for mTHP with
>>>>>>>>>>>>>> anonymous memory.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Signed-off-by: Bang Li <libang.li@antgroup.com>
>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>       fs/proc/task_mmu.c      | 10 +++++++---
>>>>>>>>>>>>>>       include/linux/huge_mm.h | 11 +++++++++++
>>>>>>>>>>>>>>       mm/shmem.c              |  9 +--------
>>>>>>>>>>>>>>       3 files changed, 19 insertions(+), 11 deletions(-)
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
>>>>>>>>>>>>>> index 93fb2c61b154..09b5db356886 100644
>>>>>>>>>>>>>> --- a/fs/proc/task_mmu.c
>>>>>>>>>>>>>> +++ b/fs/proc/task_mmu.c
>>>>>>>>>>>>>> @@ -870,6 +870,7 @@ static int show_smap(struct seq_file *m, void *v)
>>>>>>>>>>>>>>       {
>>>>>>>>>>>>>>           struct vm_area_struct *vma = v;
>>>>>>>>>>>>>>           struct mem_size_stats mss = {};
>>>>>>>>>>>>>> +    bool thp_eligible;
>>>>>>>>>>>>>>             smap_gather_stats(vma, &mss, 0);
>>>>>>>>>>>>>>       @@ -882,9 +883,12 @@ static int show_smap(struct seq_file *m, void
>>>>>>>>>>>>>> *v)
>>>>>>>>>>>>>>             __show_smap(m, &mss, false);
>>>>>>>>>>>>>>       -    seq_printf(m, "THPeligible:    %8u\n",
>>>>>>>>>>>>>> -           !!thp_vma_allowable_orders(vma, vma->vm_flags,
>>>>>>>>>>>>>> -               TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL));
>>>>>>>>>>>>>> +    thp_eligible = !!thp_vma_allowable_orders(vma, vma->vm_flags,
>>>>>>>>>>>>>> +                        TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL);
>>>>>>>>>>>>>> +    if (vma_is_anon_shmem(vma))
>>>>>>>>>>>>>> +        thp_eligible =
>>>>>>>>>>>>>> !!shmem_allowable_huge_orders(file_inode(vma->vm_file),
>>>>>>>>>>>>>> +                            vma, vma->vm_pgoff, thp_eligible);
>>>>>>>>>>>>>
>>>>>>>>>>>>> Afraid I haven't been following the shmem mTHP support work as much as I
>>>>>>>>>>>>> would
>>>>>>>>>>>>> have liked, but is there a reason why we need a separate function for
>>>>>>>>>>>>> shmem?
>>>>>>>>>>>>
>>>>>>>>>>>> Since shmem_allowable_huge_orders() only uses shmem specific logic to
>>>>>>>>>>>> determine
>>>>>>>>>>>> if huge orders are allowable, there is no need to complicate the
>>>>>>>>>>>> thp_vma_allowable_orders() function by adding more shmem related logic,
>>>>>>>>>>>> making
>>>>>>>>>>>> it more bloated. In my view, providing a dedicated helper
>>>>>>>>>>>> shmem_allowable_huge_orders(), specifically for shmem, simplifies the logic.
>>>>>>>>>>>
>>>>>>>>>>> My point was really that a single interface (thp_vma_allowable_orders)
>>>>>>>>>>> should be
>>>>>>>>>>> used to get this information. I have no strong opinon on how the
>>>>>>>>>>> implementation
>>>>>>>>>>> of that interface looks. What you suggest below seems perfectly reasonable
>>>>>>>>>>> to me.
>>>>>>>>>>
>>>>>>>>>> Right. thp_vma_allowable_orders() might require some care as discussed in
>>>>>>>>>> other
>>>>>>>>>> context (cleanly separate dax and shmem handling/orders). But that would be
>>>>>>>>>> follow-up cleanups.
>>>>>>>>>
>>>>>>>>> Are you planning to do that, or do you want me to send a patch?
>>>>>>>>
>>>>>>>> I'm planning on looking into some details, especially the interaction with large
>>>>>>>> folios in the pagecache. I'll let you know once I have a better idea what
>>>>>>>> actually should be done :)
>>>>>>>
>>>>>>> OK great - I'll scrub it from my todo list... really getting things done today :)
>>>>>>
>>>>>> Resolved the khugepaged thiny already? :P
>>>>>>
>>>>>> [khugepaged not active when only enabling the sub-size via the 2M folder IIRC]
>>>>>
>>>>> Hmm... baby brain?
>>>>
>>>> :)
>>>>
>>>> I think I only mentioned it in a private mail at some point.
>>>>
>>>>>
>>>>> Sorry about that. I've been a bit useless lately. For some reason it wasn't on
>>>>> my list, but its there now. Will prioritise it, because I agree it's not good.
>>>>
>>>>
>>>> IIRC, if you do
>>>>
>>>> echo never > /sys/kernel/mm/transparent_hugepage/enabled
>>>> echo always > /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled
>>>>
>>>> khugepaged will not get activated.
>>>
>>> khugepaged is controlled by the top level knob.
>>
>> What do you mean by "top level knob"? I assume
>> /sys/kernel/mm/transparent_hugepage/enabled ?
> 
> Yes.
> 
>>
>> If so, that's not really a thing in its own right; its just the legacy PMD-size
>> THP control, and we only take any notice of it if a per-size control is set to
>> "inherit". So if we have:
>>
>> # echo always > /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled
>>
>> Then by design, /sys/kernel/mm/transparent_hugepage/enabled should be ignored.
>>
>>> But the above setting
>>> sounds confusing, can we disable the top level knob, but enable it on
>>> a per-order basis? TBH, it sounds weird and doesn't make too much
>>> sense to me.
>>
>> Well that's the design and that's how its documented. It's done this way for
>> back-compat. All controls are now per-size. But at boot, we default all per-size
>> controls to "never" except for the PMD-sized control, which is defaulted to
>> "inherit". That way, an unenlightened user-space can still control PMD-sized THP
>> via the legacy (top-level) control. But enlightened apps can directly control
>> per-size.
> 
> OK, good to know.
> 
>>
>> I'm not sure how your way would work, because you would have 2 controls
>> competing to do the same thing?
> 
> I don't see how they compete if they are 2-level knobs. 

I'm not sure I understand exactly how your 2-level proposal works. Could you
explain in more detail?

The problem, as I see it, is that the control can take multiple values: "never",
"always" or "madvise". In a two-level scheme, what do we do when the top level
says "always" but the per-size control says "madvise", or vice versa? The scheme
we adopted has clear and obvious (to me at least) semantics in this case.

The other problem is that the top-level control is still used to control file
memory collapse (when CONFIG_READ_ONLY_THP_FOR_FS is configured). If you're
advocating for a scheme where the top-level is set to the most permissive value
you want to allow, and the per-size controls are only able to further restrict
it, that would make it impossible to, for instance, set all 2M THP (including
file-backed) to madvise, but set all 64K THP to always.

> And I failed
> to see how it achieved back-compat. For example, memcached reads
> /sys/kernel/mm/transparent_hugepage/enabled to determine whether it
> should manage memory in huge page (2M) granularity. If the setting is
> set to :
> 
> # echo never > /sys/kernel/mm/transparent_hugepage/enabled
> # echo always > /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled
> 
> memcached will manage memory in 4K granularity, but 2M THP is actually
> enabled unless memcached checks the per-order knobs.
> 
> If we use 2-level mode, memcached doesn't need check per-order setting
> at all in order to know whether THP is enabled or not. And it actually
> doesn't care about what orders are enabled, it assumes THP size is 2M
> (or PMD size). Even though 2M is not enabled but lower orders are
> enabled, memcached still can fully utilize the mTHP since the memory
> chunk managed by memcached is still 2M aligned in this setting. So
> unenlightened applications still can work well. Jemalloc should do the
> similar thing if I remember correctly.

I wonder why we didn't decide to just make
/sys/kernel/mm/transparent_hugepage/enabled an alias for
/sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled ? That may have
solved this problem more cleanly? But that would have made it difficult to
introduce "auto" in future (the idea was to set all per-size to 'inherit' and
then set top-level to 'auto').


> 
>>
>>>
>>>>
>>>> --
>>>> Cheers,
>>>>
>>>> David / dhildenb
>>>>
>>>>
>>
Yang Shi July 9, 2024, 7:01 p.m. UTC | #27
On Thu, Jul 4, 2024 at 2:43 AM Ryan Roberts <ryan.roberts@arm.com> wrote:
>
> On 03/07/2024 17:08, Yang Shi wrote:
> > On Tue, Jul 2, 2024 at 1:24 AM Ryan Roberts <ryan.roberts@arm.com> wrote:
> >>
> >> On 01/07/2024 19:20, Yang Shi wrote:
> >>> On Mon, Jul 1, 2024 at 3:23 AM David Hildenbrand <david@redhat.com> wrote:
> >>>>
> >>>> On 01.07.24 12:16, Ryan Roberts wrote:
> >>>>> On 01/07/2024 10:17, David Hildenbrand wrote:
> >>>>>> On 01.07.24 11:14, Ryan Roberts wrote:
> >>>>>>> On 01/07/2024 09:57, David Hildenbrand wrote:
> >>>>>>>> On 01.07.24 10:50, Ryan Roberts wrote:
> >>>>>>>>> On 01/07/2024 09:48, David Hildenbrand wrote:
> >>>>>>>>>> On 01.07.24 10:40, Ryan Roberts wrote:
> >>>>>>>>>>> On 01/07/2024 09:33, Baolin Wang wrote:
> >>>>>>>>>>>>
> >>>>>>>>>>>>
> >>>>>>>>>>>> On 2024/7/1 15:55, Ryan Roberts wrote:
> >>>>>>>>>>>>> On 28/06/2024 11:49, Bang Li wrote:
> >>>>>>>>>>>>>> After the commit 7fb1b252afb5 ("mm: shmem: add mTHP support for
> >>>>>>>>>>>>>> anonymous shmem"), we can configure different policies through
> >>>>>>>>>>>>>> the multi-size THP sysfs interface for anonymous shmem. But
> >>>>>>>>>>>>>> currently "THPeligible" indicates only whether the mapping is
> >>>>>>>>>>>>>> eligible for allocating THP-pages as well as the THP is PMD
> >>>>>>>>>>>>>> mappable or not for anonymous shmem, we need to support semantics
> >>>>>>>>>>>>>> for mTHP with anonymous shmem similar to those for mTHP with
> >>>>>>>>>>>>>> anonymous memory.
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> Signed-off-by: Bang Li <libang.li@antgroup.com>
> >>>>>>>>>>>>>> ---
> >>>>>>>>>>>>>>       fs/proc/task_mmu.c      | 10 +++++++---
> >>>>>>>>>>>>>>       include/linux/huge_mm.h | 11 +++++++++++
> >>>>>>>>>>>>>>       mm/shmem.c              |  9 +--------
> >>>>>>>>>>>>>>       3 files changed, 19 insertions(+), 11 deletions(-)
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> >>>>>>>>>>>>>> index 93fb2c61b154..09b5db356886 100644
> >>>>>>>>>>>>>> --- a/fs/proc/task_mmu.c
> >>>>>>>>>>>>>> +++ b/fs/proc/task_mmu.c
> >>>>>>>>>>>>>> @@ -870,6 +870,7 @@ static int show_smap(struct seq_file *m, void *v)
> >>>>>>>>>>>>>>       {
> >>>>>>>>>>>>>>           struct vm_area_struct *vma = v;
> >>>>>>>>>>>>>>           struct mem_size_stats mss = {};
> >>>>>>>>>>>>>> +    bool thp_eligible;
> >>>>>>>>>>>>>>             smap_gather_stats(vma, &mss, 0);
> >>>>>>>>>>>>>>       @@ -882,9 +883,12 @@ static int show_smap(struct seq_file *m, void
> >>>>>>>>>>>>>> *v)
> >>>>>>>>>>>>>>             __show_smap(m, &mss, false);
> >>>>>>>>>>>>>>       -    seq_printf(m, "THPeligible:    %8u\n",
> >>>>>>>>>>>>>> -           !!thp_vma_allowable_orders(vma, vma->vm_flags,
> >>>>>>>>>>>>>> -               TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL));
> >>>>>>>>>>>>>> +    thp_eligible = !!thp_vma_allowable_orders(vma, vma->vm_flags,
> >>>>>>>>>>>>>> +                        TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL);
> >>>>>>>>>>>>>> +    if (vma_is_anon_shmem(vma))
> >>>>>>>>>>>>>> +        thp_eligible =
> >>>>>>>>>>>>>> !!shmem_allowable_huge_orders(file_inode(vma->vm_file),
> >>>>>>>>>>>>>> +                            vma, vma->vm_pgoff, thp_eligible);
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> Afraid I haven't been following the shmem mTHP support work as much as I
> >>>>>>>>>>>>> would
> >>>>>>>>>>>>> have liked, but is there a reason why we need a separate function for
> >>>>>>>>>>>>> shmem?
> >>>>>>>>>>>>
> >>>>>>>>>>>> Since shmem_allowable_huge_orders() only uses shmem specific logic to
> >>>>>>>>>>>> determine
> >>>>>>>>>>>> if huge orders are allowable, there is no need to complicate the
> >>>>>>>>>>>> thp_vma_allowable_orders() function by adding more shmem related logic,
> >>>>>>>>>>>> making
> >>>>>>>>>>>> it more bloated. In my view, providing a dedicated helper
> >>>>>>>>>>>> shmem_allowable_huge_orders(), specifically for shmem, simplifies the logic.
> >>>>>>>>>>>
> >>>>>>>>>>> My point was really that a single interface (thp_vma_allowable_orders)
> >>>>>>>>>>> should be
> >>>>>>>>>>> used to get this information. I have no strong opinon on how the
> >>>>>>>>>>> implementation
> >>>>>>>>>>> of that interface looks. What you suggest below seems perfectly reasonable
> >>>>>>>>>>> to me.
> >>>>>>>>>>
> >>>>>>>>>> Right. thp_vma_allowable_orders() might require some care as discussed in
> >>>>>>>>>> other
> >>>>>>>>>> context (cleanly separate dax and shmem handling/orders). But that would be
> >>>>>>>>>> follow-up cleanups.
> >>>>>>>>>
> >>>>>>>>> Are you planning to do that, or do you want me to send a patch?
> >>>>>>>>
> >>>>>>>> I'm planning on looking into some details, especially the interaction with large
> >>>>>>>> folios in the pagecache. I'll let you know once I have a better idea what
> >>>>>>>> actually should be done :)
> >>>>>>>
> >>>>>>> OK great - I'll scrub it from my todo list... really getting things done today :)
> >>>>>>
> >>>>>> Resolved the khugepaged thiny already? :P
> >>>>>>
> >>>>>> [khugepaged not active when only enabling the sub-size via the 2M folder IIRC]
> >>>>>
> >>>>> Hmm... baby brain?
> >>>>
> >>>> :)
> >>>>
> >>>> I think I only mentioned it in a private mail at some point.
> >>>>
> >>>>>
> >>>>> Sorry about that. I've been a bit useless lately. For some reason it wasn't on
> >>>>> my list, but its there now. Will prioritise it, because I agree it's not good.
> >>>>
> >>>>
> >>>> IIRC, if you do
> >>>>
> >>>> echo never > /sys/kernel/mm/transparent_hugepage/enabled
> >>>> echo always > /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled
> >>>>
> >>>> khugepaged will not get activated.
> >>>
> >>> khugepaged is controlled by the top level knob.
> >>
> >> What do you mean by "top level knob"? I assume
> >> /sys/kernel/mm/transparent_hugepage/enabled ?
> >
> > Yes.
> >
> >>
> >> If so, that's not really a thing in its own right; its just the legacy PMD-size
> >> THP control, and we only take any notice of it if a per-size control is set to
> >> "inherit". So if we have:
> >>
> >> # echo always > /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled
> >>
> >> Then by design, /sys/kernel/mm/transparent_hugepage/enabled should be ignored.
> >>
> >>> But the above setting
> >>> sounds confusing, can we disable the top level knob, but enable it on
> >>> a per-order basis? TBH, it sounds weird and doesn't make too much
> >>> sense to me.
> >>
> >> Well that's the design and that's how its documented. It's done this way for
> >> back-compat. All controls are now per-size. But at boot, we default all per-size
> >> controls to "never" except for the PMD-sized control, which is defaulted to
> >> "inherit". That way, an unenlightened user-space can still control PMD-sized THP
> >> via the legacy (top-level) control. But enlightened apps can directly control
> >> per-size.
> >
> > OK, good to know.
> >
> >>
> >> I'm not sure how your way would work, because you would have 2 controls
> >> competing to do the same thing?
> >
> > I don't see how they compete if they are 2-level knobs.
>
> I'm not sure I understand exactly how your 2-level proposal works. Could you
> explain in more detail?
>
> The problem as I see it, is that the control can take multiple values; "never",
> "always" or "madvise". In a two-level scheme, what do we do when top level says
> "always" but per-size control says "madvise", or vice-versa? The scheme we
> adopted has clear and obvious (to me at least) semantics in this case.
>
> The other problem is that the top-level control is still used to control file
> memory collapse (when CONFIG_READ_ONLY_THP_FOR_FS is configured). If you're
> advocating for a scheme where the top-level is set to the most permissive you
> want to allow, then the per-size controls are only able to further restrict,
> that would make it impossible to, for instance set all 2M THP (inc file-backed)
> to madvise, but set all 64K THP to always.

If we really have use cases that need different orders to have
different modes (always or madvise), I think the top level knob can
actually be simplified to an on/off mode. For example, always/madvise map
to on and never maps to off, since the allocation decision (whether to
allocate THP or not, and how hard to try) is actually made on a per-order
basis. The Memcached use case doesn't actually care about always versus
madvise; as long as the setting is not never, Memcached will manage memory
in huge page granularity.

This might be useful for, for example, setting a high order (e.g. 2M) to
madvise and a low order (e.g. 64K) to always. This may be able to
achieve some balance between performance and memory usage.

>
> > And I failed
> > to see how it achieved back-compat. For example, memcached reads
> > /sys/kernel/mm/transparent_hugepage/enabled to determine whether it
> > should manage memory in huge page (2M) granularity. If the setting is
> > set to :
> >
> > # echo never > /sys/kernel/mm/transparent_hugepage/enabled
> > # echo always > /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled
> >
> > memcached will manage memory in 4K granularity, but 2M THP is actually
> > enabled unless memcached checks the per-order knobs.
> >
> > If we use 2-level mode, memcached doesn't need check per-order setting
> > at all in order to know whether THP is enabled or not. And it actually
> > doesn't care about what orders are enabled, it assumes THP size is 2M
> > (or PMD size). Even though 2M is not enabled but lower orders are
> > enabled, memcached still can fully utilize the mTHP since the memory
> > chunk managed by memcached is still 2M aligned in this setting. So
> > unenlightened applications still can work well. Jemalloc should do the
> > similar thing if I remember correctly.
>
> I wonder why we didn't decide to just make
> /sys/kernel/mm/transparent_hugepage/enabled an alias for
> /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled ? That may have
> solved this problem more cleanly? But that would have made it difficult to
> introduce "auto" in future (the idea was to set all per-size to 'inherit' and
> then set top-level to 'auto').
>
>
> >
> >>
> >>>
> >>>>
> >>>> --
> >>>> Cheers,
> >>>>
> >>>> David / dhildenb
> >>>>
> >>>>
> >>
>
diff mbox series

Patch

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 93fb2c61b154..09b5db356886 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -870,6 +870,7 @@  static int show_smap(struct seq_file *m, void *v)
 {
 	struct vm_area_struct *vma = v;
 	struct mem_size_stats mss = {};
+	bool thp_eligible;
 
 	smap_gather_stats(vma, &mss, 0);
 
@@ -882,9 +883,12 @@  static int show_smap(struct seq_file *m, void *v)
 
 	__show_smap(m, &mss, false);
 
-	seq_printf(m, "THPeligible:    %8u\n",
-		   !!thp_vma_allowable_orders(vma, vma->vm_flags,
-			   TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL));
+	thp_eligible = !!thp_vma_allowable_orders(vma, vma->vm_flags,
+						TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL);
+	if (vma_is_anon_shmem(vma))
+		thp_eligible = !!shmem_allowable_huge_orders(file_inode(vma->vm_file),
+							vma, vma->vm_pgoff, thp_eligible);
+	seq_printf(m, "THPeligible:    %8u\n", thp_eligible);
 
 	if (arch_pkeys_enabled())
 		seq_printf(m, "ProtectionKey:  %8u\n", vma_pkey(vma));
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 212cca384d7e..f87136f38aa1 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -267,6 +267,10 @@  unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
 	return __thp_vma_allowable_orders(vma, vm_flags, tva_flags, orders);
 }
 
+unsigned long shmem_allowable_huge_orders(struct inode *inode,
+				struct vm_area_struct *vma, pgoff_t index,
+				bool global_huge);
+
 struct thpsize {
 	struct kobject kobj;
 	struct list_head node;
@@ -460,6 +464,13 @@  static inline unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
 	return 0;
 }
 
+static inline unsigned long shmem_allowable_huge_orders(struct inode *inode,
+				struct vm_area_struct *vma, pgoff_t index,
+				bool global_huge)
+{
+	return 0;
+}
+
 #define transparent_hugepage_flags 0UL
 
 #define thp_get_unmapped_area	NULL
diff --git a/mm/shmem.c b/mm/shmem.c
index d495c0701a83..aa85df9c662a 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1622,7 +1622,7 @@  static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static unsigned long shmem_allowable_huge_orders(struct inode *inode,
+unsigned long shmem_allowable_huge_orders(struct inode *inode,
 				struct vm_area_struct *vma, pgoff_t index,
 				bool global_huge)
 {
@@ -1707,13 +1707,6 @@  static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault
 	return orders;
 }
 #else
-static unsigned long shmem_allowable_huge_orders(struct inode *inode,
-				struct vm_area_struct *vma, pgoff_t index,
-				bool global_huge)
-{
-	return 0;
-}
-
 static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault *vmf,
 					   struct address_space *mapping, pgoff_t index,
 					   unsigned long orders)