
[v5,2/6] mm: remap unused subpages to shared zeropage when splitting isolated thp

Message ID 20240830100438.3623486-3-usamaarif642@gmail.com (mailing list archive)
State New
Series mm: split underused THPs

Commit Message

Usama Arif Aug. 30, 2024, 10:03 a.m. UTC
From: Yu Zhao <yuzhao@google.com>

Here, "unused" means containing only zeros and being inaccessible to
userspace. When splitting an isolated THP under reclaim or migration,
the unused subpages can be mapped to the shared zeropage, hence saving
memory. This is particularly helpful when the internal fragmentation
of a THP is high, i.e. it has many untouched subpages.

This is also a prerequisite for the THP low utilization shrinker
introduced in later patches, which splits underutilized THPs and frees
their zero-filled subpages, saving memory.

Signed-off-by: Yu Zhao <yuzhao@google.com>
Tested-by: Shuang Zhai <zhais@google.com>
Signed-off-by: Usama Arif <usamaarif642@gmail.com>
---
 include/linux/rmap.h |  7 ++++-
 mm/huge_memory.c     |  8 ++---
 mm/migrate.c         | 72 ++++++++++++++++++++++++++++++++++++++------
 mm/migrate_device.c  |  4 +--
 4 files changed, 75 insertions(+), 16 deletions(-)
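
An illustration of the scenario the series targets (not part of the
patch): a minimal userspace sketch that leaves a PMD-sized THP almost
entirely zero-filled. It assumes x86-64 with 2 MiB THPs and glosses
over mapping alignment and THP sysfs policy.

	#include <stdio.h>
	#include <sys/mman.h>

	#define THP_SIZE (2UL << 20)	/* assumed PMD size on x86-64 */

	int main(void)
	{
		char *buf = mmap(NULL, THP_SIZE, PROT_READ | PROT_WRITE,
				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		if (buf == MAP_FAILED)
			return 1;
		/* Request THP backing so the first write can fault in a 2 MiB page. */
		madvise(buf, THP_SIZE, MADV_HUGEPAGE);
		buf[0] = 1;	/* only the first subpage ever holds data */
		/*
		 * The THP now has 511 untouched, zero-filled subpages. When
		 * it is split under reclaim, this patch maps those subpages
		 * to the shared zeropage instead of leaving 511 private
		 * zero-filled pages in place.
		 */
		getchar();	/* park the process so /proc/self/smaps can be inspected */
		return 0;
	}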

Comments

Zi Yan Oct. 23, 2024, 4:21 p.m. UTC | #1
On 30 Aug 2024, at 6:03, Usama Arif wrote:

> From: Yu Zhao <yuzhao@google.com>
>
> [...]
>
> +static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw,
> +					  struct folio *folio,
> +					  unsigned long idx)
> +{
> +	struct page *page = folio_page(folio, idx);
> +	bool contains_data;
> +	pte_t newpte;
> +	void *addr;
> +
> +	VM_BUG_ON_PAGE(PageCompound(page), page);

This should be:

diff --git a/mm/migrate.c b/mm/migrate.c
index e950fd62607f..7ffdbe078aa7 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -206,7 +206,8 @@ static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw,
        pte_t newpte;
        void *addr;

-       VM_BUG_ON_PAGE(PageCompound(page), page);
+       if (PageCompound(page))
+               return false;
        VM_BUG_ON_PAGE(!PageAnon(page), page);
        VM_BUG_ON_PAGE(!PageLocked(page), page);
        VM_BUG_ON_PAGE(pte_present(*pvmw->pte), page);

Otherwise, splitting anonymous large folios to non order-0 ones just
triggers this BUG_ON.
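
For reference, the non order-0 split that reaches this path can be
driven from userspace through the debugfs split interface. A sketch,
assuming CONFIG_DEBUG_FS and a kernel whose split_huge_pages file
accepts the optional new_order field; split_to_order() is a
hypothetical helper, not an existing API:

	#include <stdio.h>

	/* Writes "<pid>,<vaddr_start>,<vaddr_end>,<new_order>" to debugfs. */
	static int split_to_order(int pid, unsigned long start,
				  unsigned long end, unsigned int new_order)
	{
		FILE *f = fopen("/sys/kernel/debug/split_huge_pages", "w");

		if (!f)
			return -1;
		fprintf(f, "%d,0x%lx,0x%lx,%u", pid, start, end, new_order);
		return fclose(f);
	}

With new_order > 0 every resulting folio is still compound, so
try_to_map_unused_to_zeropage() sees PageCompound() subpages; returning
false there simply skips the zeropage optimization for such folios
instead of crashing.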

> [...]

Best Regards,
Yan, Zi
Usama Arif Oct. 23, 2024, 4:50 p.m. UTC | #2
On 23/10/2024 17:21, Zi Yan wrote:
> On 30 Aug 2024, at 6:03, Usama Arif wrote:
> 
>> [...]
>> +	VM_BUG_ON_PAGE(PageCompound(page), page);
> 
> This should be:
> 
> diff --git a/mm/migrate.c b/mm/migrate.c
> index e950fd62607f..7ffdbe078aa7 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -206,7 +206,8 @@ static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw,
>         pte_t newpte;
>         void *addr;
> 
> -       VM_BUG_ON_PAGE(PageCompound(page), page);
> +       if (PageCompound(page))
> +               return false;
>         VM_BUG_ON_PAGE(!PageAnon(page), page);
>         VM_BUG_ON_PAGE(!PageLocked(page), page);
>         VM_BUG_ON_PAGE(pte_present(*pvmw->pte), page);
> 
> Otherwise, splitting anonymous large folios to non order-0 ones just
> triggers this BUG_ON.
> 

That makes sense; would you like to send the fix?

Adding Yu Zhao to "To" in case he has any objections.

Thanks,
Usama

Zi Yan Oct. 23, 2024, 4:55 p.m. UTC | #3
On 23 Oct 2024, at 12:50, Usama Arif wrote:

> On 23/10/2024 17:21, Zi Yan wrote:
>> On 30 Aug 2024, at 6:03, Usama Arif wrote:
>>
>>> [...]
>>> +	VM_BUG_ON_PAGE(PageCompound(page), page);
>>
>> This should be:
>>
>> diff --git a/mm/migrate.c b/mm/migrate.c
>> index e950fd62607f..7ffdbe078aa7 100644
>> --- a/mm/migrate.c
>> +++ b/mm/migrate.c
>> @@ -206,7 +206,8 @@ static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw,
>>         pte_t newpte;
>>         void *addr;
>>
>> -       VM_BUG_ON_PAGE(PageCompound(page), page);
>> +       if (PageCompound(page))
>> +               return false;
>>         VM_BUG_ON_PAGE(!PageAnon(page), page);
>>         VM_BUG_ON_PAGE(!PageLocked(page), page);
>>         VM_BUG_ON_PAGE(pte_present(*pvmw->pte), page);
>>
>> Otherwise, splitting anonymous large folios to non order-0 ones just
>> triggers this BUG_ON.
>>
>
> That makes sense; would you like to send the fix?
>
> Adding Yu Zhao to "To" in case he has any objections.
>
Sure, will do.

Best Regards,
Yan, Zi
Yu Zhao Oct. 23, 2024, 4:56 p.m. UTC | #4
On Wed, Oct 23, 2024 at 10:51 AM Usama Arif <usamaarif642@gmail.com> wrote:
>
> On 23/10/2024 17:21, Zi Yan wrote:
> > On 30 Aug 2024, at 6:03, Usama Arif wrote:
> >
> >> [...]
> >> +    VM_BUG_ON_PAGE(PageCompound(page), page);
> >
> > This should be:
> >
> > diff --git a/mm/migrate.c b/mm/migrate.c
> > index e950fd62607f..7ffdbe078aa7 100644
> > --- a/mm/migrate.c
> > +++ b/mm/migrate.c
> > @@ -206,7 +206,8 @@ static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw,
> >         pte_t newpte;
> >         void *addr;
> >
> > -       VM_BUG_ON_PAGE(PageCompound(page), page);
> > +       if (PageCompound(page))
> > +               return false;
> >         VM_BUG_ON_PAGE(!PageAnon(page), page);
> >         VM_BUG_ON_PAGE(!PageLocked(page), page);
> >         VM_BUG_ON_PAGE(pte_present(*pvmw->pte), page);
> >
> > Otherwise, splitting anonymous large folios to non order-0 ones just
> > triggers this BUG_ON.
> >
>
> That makes sense; would you like to send the fix?
>
> Adding Yu Zhao to "To" in case he has any objections.

LGTM. Thanks!

Patch

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 91b5935e8485..d5e93e44322e 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -745,7 +745,12 @@  int folio_mkclean(struct folio *);
 int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
 		      struct vm_area_struct *vma);
 
-void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked);
+enum rmp_flags {
+	RMP_LOCKED		= 1 << 0,
+	RMP_USE_SHARED_ZEROPAGE	= 1 << 1,
+};
+
+void remove_migration_ptes(struct folio *src, struct folio *dst, int flags);
 
 /*
  * rmap_walk_control: To control rmap traversing for specific needs
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 0c48806ccb9a..af60684e7c70 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3020,7 +3020,7 @@  bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr,
 	return false;
 }
 
-static void remap_page(struct folio *folio, unsigned long nr)
+static void remap_page(struct folio *folio, unsigned long nr, int flags)
 {
 	int i = 0;
 
@@ -3028,7 +3028,7 @@  static void remap_page(struct folio *folio, unsigned long nr)
 	if (!folio_test_anon(folio))
 		return;
 	for (;;) {
-		remove_migration_ptes(folio, folio, true);
+		remove_migration_ptes(folio, folio, RMP_LOCKED | flags);
 		i += folio_nr_pages(folio);
 		if (i >= nr)
 			break;
@@ -3240,7 +3240,7 @@  static void __split_huge_page(struct page *page, struct list_head *list,
 
 	if (nr_dropped)
 		shmem_uncharge(folio->mapping->host, nr_dropped);
-	remap_page(folio, nr);
+	remap_page(folio, nr, PageAnon(head) ? RMP_USE_SHARED_ZEROPAGE : 0);
 
 	/*
 	 * set page to its compound_head when split to non order-0 pages, so
@@ -3542,7 +3542,7 @@  int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
 		if (mapping)
 			xas_unlock(&xas);
 		local_irq_enable();
-		remap_page(folio, folio_nr_pages(folio));
+		remap_page(folio, folio_nr_pages(folio), 0);
 		ret = -EAGAIN;
 	}
 
diff --git a/mm/migrate.c b/mm/migrate.c
index 6f9c62c746be..d039863e014b 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -204,13 +204,57 @@  bool isolate_folio_to_list(struct folio *folio, struct list_head *list)
 	return true;
 }
 
+static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw,
+					  struct folio *folio,
+					  unsigned long idx)
+{
+	struct page *page = folio_page(folio, idx);
+	bool contains_data;
+	pte_t newpte;
+	void *addr;
+
+	VM_BUG_ON_PAGE(PageCompound(page), page);
+	VM_BUG_ON_PAGE(!PageAnon(page), page);
+	VM_BUG_ON_PAGE(!PageLocked(page), page);
+	VM_BUG_ON_PAGE(pte_present(*pvmw->pte), page);
+
+	if (folio_test_mlocked(folio) || (pvmw->vma->vm_flags & VM_LOCKED) ||
+	    mm_forbids_zeropage(pvmw->vma->vm_mm))
+		return false;
+
+	/*
+	 * The pmd entry mapping the old thp was flushed and the pte mapping
+	 * this subpage has been non present. If the subpage is only zero-filled
+	 * then map it to the shared zeropage.
+	 */
+	addr = kmap_local_page(page);
+	contains_data = memchr_inv(addr, 0, PAGE_SIZE);
+	kunmap_local(addr);
+
+	if (contains_data)
+		return false;
+
+	newpte = pte_mkspecial(pfn_pte(my_zero_pfn(pvmw->address),
+					pvmw->vma->vm_page_prot));
+	set_pte_at(pvmw->vma->vm_mm, pvmw->address, pvmw->pte, newpte);
+
+	dec_mm_counter(pvmw->vma->vm_mm, mm_counter(folio));
+	return true;
+}
+
+struct rmap_walk_arg {
+	struct folio *folio;
+	bool map_unused_to_zeropage;
+};
+
 /*
  * Restore a potential migration pte to a working pte entry
  */
 static bool remove_migration_pte(struct folio *folio,
-		struct vm_area_struct *vma, unsigned long addr, void *old)
+		struct vm_area_struct *vma, unsigned long addr, void *arg)
 {
-	DEFINE_FOLIO_VMA_WALK(pvmw, old, vma, addr, PVMW_SYNC | PVMW_MIGRATION);
+	struct rmap_walk_arg *rmap_walk_arg = arg;
+	DEFINE_FOLIO_VMA_WALK(pvmw, rmap_walk_arg->folio, vma, addr, PVMW_SYNC | PVMW_MIGRATION);
 
 	while (page_vma_mapped_walk(&pvmw)) {
 		rmap_t rmap_flags = RMAP_NONE;
@@ -234,6 +278,9 @@  static bool remove_migration_pte(struct folio *folio,
 			continue;
 		}
 #endif
+		if (rmap_walk_arg->map_unused_to_zeropage &&
+		    try_to_map_unused_to_zeropage(&pvmw, folio, idx))
+			continue;
 
 		folio_get(folio);
 		pte = mk_pte(new, READ_ONCE(vma->vm_page_prot));
@@ -312,14 +359,21 @@  static bool remove_migration_pte(struct folio *folio,
  * Get rid of all migration entries and replace them by
  * references to the indicated page.
  */
-void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked)
+void remove_migration_ptes(struct folio *src, struct folio *dst, int flags)
 {
+	struct rmap_walk_arg rmap_walk_arg = {
+		.folio = src,
+		.map_unused_to_zeropage = flags & RMP_USE_SHARED_ZEROPAGE,
+	};
+
 	struct rmap_walk_control rwc = {
 		.rmap_one = remove_migration_pte,
-		.arg = src,
+		.arg = &rmap_walk_arg,
 	};
 
-	if (locked)
+	VM_BUG_ON_FOLIO((flags & RMP_USE_SHARED_ZEROPAGE) && (src != dst), src);
+
+	if (flags & RMP_LOCKED)
 		rmap_walk_locked(dst, &rwc);
 	else
 		rmap_walk(dst, &rwc);
@@ -934,7 +988,7 @@  static int writeout(struct address_space *mapping, struct folio *folio)
 	 * At this point we know that the migration attempt cannot
 	 * be successful.
 	 */
-	remove_migration_ptes(folio, folio, false);
+	remove_migration_ptes(folio, folio, 0);
 
 	rc = mapping->a_ops->writepage(&folio->page, &wbc);
 
@@ -1098,7 +1152,7 @@  static void migrate_folio_undo_src(struct folio *src,
 				   struct list_head *ret)
 {
 	if (page_was_mapped)
-		remove_migration_ptes(src, src, false);
+		remove_migration_ptes(src, src, 0);
 	/* Drop an anon_vma reference if we took one */
 	if (anon_vma)
 		put_anon_vma(anon_vma);
@@ -1336,7 +1390,7 @@  static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private,
 		lru_add_drain();
 
 	if (old_page_state & PAGE_WAS_MAPPED)
-		remove_migration_ptes(src, dst, false);
+		remove_migration_ptes(src, dst, 0);
 
 out_unlock_both:
 	folio_unlock(dst);
@@ -1474,7 +1528,7 @@  static int unmap_and_move_huge_page(new_folio_t get_new_folio,
 
 	if (page_was_mapped)
 		remove_migration_ptes(src,
-			rc == MIGRATEPAGE_SUCCESS ? dst : src, false);
+			rc == MIGRATEPAGE_SUCCESS ? dst : src, 0);
 
 unlock_put_anon:
 	folio_unlock(dst);
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 8d687de88a03..9cf26592ac93 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -424,7 +424,7 @@  static unsigned long migrate_device_unmap(unsigned long *src_pfns,
 			continue;
 
 		folio = page_folio(page);
-		remove_migration_ptes(folio, folio, false);
+		remove_migration_ptes(folio, folio, 0);
 
 		src_pfns[i] = 0;
 		folio_unlock(folio);
@@ -840,7 +840,7 @@  void migrate_device_finalize(unsigned long *src_pfns,
 			dst = src;
 		}
 
-		remove_migration_ptes(src, dst, false);
+		remove_migration_ptes(src, dst, 0);
 		folio_unlock(src);
 
 		if (folio_is_zone_device(src))
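
The zero detection above hinges on memchr_inv(): it returns NULL only
when every byte of the subpage is zero, so contains_data is set as soon
as a single non-zero byte exists and the subpage then keeps its private
mapping. A userspace analogue of the check, for illustration only
(memchr_inv() itself is a kernel-internal helper):

	#include <string.h>

	/*
	 * Returns 1 when all len bytes at p are zero, mirroring
	 * memchr_inv(p, 0, len) == NULL in try_to_map_unused_to_zeropage().
	 */
	static int range_is_zero(const unsigned char *p, size_t len)
	{
		return len == 0 || (p[0] == 0 && !memcmp(p, p + 1, len - 1));
	}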