
[2/2] mm/damon: Add a new scheme to support demotion on tiered memory system

Message ID 1c014ce5c6f6c62a30d07096c5e28aa1310c1bbd.1640077468.git.baolin.wang@linux.alibaba.com (mailing list archive)
State New
Series Add a new scheme to support demotion on tiered memory system

Commit Message

Baolin Wang Dec. 21, 2021, 9:18 a.m. UTC
On a tiered memory system, the reclaim path in shrink_page_list() already
supports demoting pages to a slow memory node instead of discarding them.
However, by the time that happens the fast memory node is already under
watermark pressure, which increases the memory allocation latency during
page demotion.

We can rely on DAMON, driven from user space, to monitor the cold memory
on the fast memory node and proactively demote the cold pages to the slow
memory node, keeping the fast memory node in a healthy state. Thus this
patch introduces a new scheme named DAMOS_DEMOTE to support this feature.

Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
---
 include/linux/damon.h |   3 +
 mm/damon/dbgfs.c      |   1 +
 mm/damon/vaddr.c      | 156 ++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 160 insertions(+)

Comments

SeongJae Park Dec. 21, 2021, 1:24 p.m. UTC | #1
Hi Baolin,


Thank you for this great patch!  I left some comments below.

On Tue, 21 Dec 2021 17:18:04 +0800 Baolin Wang <baolin.wang@linux.alibaba.com> wrote:

> On tiered memory system, the reclaim path in shrink_page_list() already
> support demoting pages to slow memory node instead of discarding the
> pages. However, at that time the fast memory node memory wartermark is
> already tense, which will increase the memory allocation latency during
> page demotion.
> 
> We can rely on the DAMON in user space to help to monitor the cold
> memory on fast memory node, and demote the cold pages to slow memory
> node proactively to keep the fast memory node in a healthy state.
> Thus this patch introduces a new scheme named DAMOS_DEMOTE to support
> this feature.
> 
> Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> ---
>  include/linux/damon.h |   3 +
>  mm/damon/dbgfs.c      |   1 +
>  mm/damon/vaddr.c      | 156 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 160 insertions(+)
> 
> diff --git a/include/linux/damon.h b/include/linux/damon.h
> index af64838..da9957c 100644
> --- a/include/linux/damon.h
> +++ b/include/linux/damon.h
> @@ -87,6 +87,8 @@ struct damon_target {
>   * @DAMOS_PAGEOUT:	Call ``madvise()`` for the region with MADV_PAGEOUT.
>   * @DAMOS_HUGEPAGE:	Call ``madvise()`` for the region with MADV_HUGEPAGE.
>   * @DAMOS_NOHUGEPAGE:	Call ``madvise()`` for the region with MADV_NOHUGEPAGE.
> + * @DAMOS_DEMOTE:	Migate cold pages from fast memory type (DRAM) to slow
> + * memory type (persistent memory).

s/Migate/Migrate/

Also, could you make the second line of the description aligned with the first
line? e.g.,

    + * @DAMOS_DEMOTE:	Migate cold pages from fast memory type (DRAM) to slow
    + *                 memory type (persistent memory).

>   * @DAMOS_STAT:		Do nothing but count the stat.
>   */
>  enum damos_action {
> @@ -95,6 +97,7 @@ enum damos_action {
>  	DAMOS_PAGEOUT,
>  	DAMOS_HUGEPAGE,
>  	DAMOS_NOHUGEPAGE,
> +	DAMOS_DEMOTE,
>  	DAMOS_STAT,		/* Do nothing but only record the stat */

This would confuse user space programs that were already using 'DAMOS_STAT',
since its numeric value would change.  Could you put 'DAMOS_DEMOTE' after
'DAMOS_STAT'?
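e.g., something like the following (just a sketch, with the earlier entries
elided), so that the existing action values stay stable:

    enum damos_action {
            ...
            DAMOS_NOHUGEPAGE,
            DAMOS_STAT,             /* Do nothing but only record the stat */
            DAMOS_DEMOTE,
    };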

>  };
>  
> diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c
> index 58dbb96..43355ab 100644
> --- a/mm/damon/dbgfs.c
> +++ b/mm/damon/dbgfs.c
> @@ -168,6 +168,7 @@ static bool damos_action_valid(int action)
>  	case DAMOS_PAGEOUT:
>  	case DAMOS_HUGEPAGE:
>  	case DAMOS_NOHUGEPAGE:
> +	case DAMOS_DEMOTE:
>  	case DAMOS_STAT:
>  		return true;
>  	default:
> diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
> index 9e213a1..b354d3e 100644
> --- a/mm/damon/vaddr.c
> +++ b/mm/damon/vaddr.c
> @@ -14,6 +14,10 @@
>  #include <linux/page_idle.h>
>  #include <linux/pagewalk.h>
>  #include <linux/sched/mm.h>
> +#include <linux/migrate.h>
> +#include <linux/mm_inline.h>
> +#include <linux/swapops.h>
> +#include "../internal.h"
>  
>  #include "prmtv-common.h"
>  
> @@ -693,6 +697,156 @@ static unsigned long damos_madvise(struct damon_target *target,
>  }
>  #endif	/* CONFIG_ADVISE_SYSCALLS */
>  
> +static bool damos_isolate_page(struct page *page, struct list_head *demote_list)
> +{
> +	struct page *head = compound_head(page);
> +
> +	/* Do not interfere with other mappings of this page */
> +	if (page_mapcount(head) != 1)
> +		return false;
> +
> +	/* No need migration if the target demotion node is empty. */
> +	if (next_demotion_node(page_to_nid(head)) == NUMA_NO_NODE)
> +		return false;
> +
> +	if (isolate_lru_page(head))
> +		return false;
> +
> +	list_add_tail(&head->lru, demote_list);
> +	mod_node_page_state(page_pgdat(head),
> +			    NR_ISOLATED_ANON + page_is_file_lru(head),
> +			    thp_nr_pages(head));
> +	return true;

The return value is not used by callers.  If not needed, let's remove it.

> +}
> +
> +static int damos_migrate_pmd_entry(pmd_t *pmd, unsigned long addr,
> +				   unsigned long end, struct mm_walk *walk)
> +{
> +	struct vm_area_struct *vma = walk->vma;
> +	struct list_head *demote_list = walk->private;
> +	spinlock_t *ptl;
> +	struct page *page;
> +	pte_t *pte, *mapped_pte;
> +
> +	if (!vma_migratable(vma))
> +		return -EFAULT;
> +
> +	ptl = pmd_trans_huge_lock(pmd, vma);
> +	if (ptl) {
> +		/* Bail out if THP migration is not supported. */
> +		if (!thp_migration_supported())
> +			goto thp_out;
> +
> +		/* If the THP pte is under migration, do not bother it. */
> +		if (unlikely(is_pmd_migration_entry(*pmd)))
> +			goto thp_out;
> +
> +		page = damon_get_page(pmd_pfn(*pmd));
> +		if (!page)
> +			goto thp_out;
> +
> +		damos_isolate_page(page, demote_list);
> +
> +		put_page(page);
> +thp_out:
> +		spin_unlock(ptl);
> +		return 0;
> +	}

Could we wrap above THP handling code with '#ifdef CONFIG_TRANSPARENT_HUGEPAGE'?

> +
> +	/* regular page handling */
> +	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
> +		return -EINVAL;
> +
> +	mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
> +	for (; addr != end; pte++, addr += PAGE_SIZE) {
> +		if (pte_none(*pte) || !pte_present(*pte))
> +			continue;
> +
> +		page = damon_get_page(pte_pfn(*pte));
> +		if (!page)
> +			continue;
> +
> +		damos_isolate_page(page, demote_list);
> +		put_page(page);
> +	}
> +	pte_unmap_unlock(mapped_pte, ptl);
> +	cond_resched();
> +
> +	return 0;
> +}
> +
> +static const struct mm_walk_ops damos_migrate_pages_walk_ops = {
> +	.pmd_entry              = damos_migrate_pmd_entry,

The names are a little bit confusing to me.  How about 's/migrate/isolate/'?

> +};
> +
> +/*
> + * damos_demote() - demote cold pages from fast memory to slow memory
> + * @target:    the given target
> + * @r:         region of the target
> + *
> + * On tiered memory system, if DAMON monitored cold pages on fast memory
> + * node (DRAM), we can demote them to slow memory node proactively in case
> + * accumulating much more cold memory on fast memory node (DRAM) to reclaim.
> + *
> + * Return:
> + * = 0 means no pages were demoted.
> + * > 0 means how many cold pages were demoted successfully.
> + * < 0 means errors happened.

damon_va_apply_scheme(), which returns what this function returns when the
DAMOS_DEMOTE action is given, should return the number of bytes of the region
that the action was successfully applied to.
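A rough sketch of what that could look like at the end of this function
(assuming 'demoted_pages' counts base pages; just an illustration):

            /* report how much of the region the action was applied to */
            return demoted_pages * PAGE_SIZE;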

> + */
> +static int damos_demote(struct damon_target *target, struct damon_region *r)
> +{
> +	struct mm_struct *mm;
> +	LIST_HEAD(demote_pages);
> +	LIST_HEAD(pagelist);
> +	int target_nid, nr_pages, ret = 0;
> +	unsigned int nr_succeeded, demoted_pages = 0;
> +	struct page *page, *page2;
> +
> +	/* Validate if allowing to do page demotion */
> +	if (!numa_demotion_enabled)
> +		return -EINVAL;
> +
> +	mm = damon_get_mm(target);
> +	if (!mm)
> +		return -ENOMEM;
> +
> +	mmap_read_lock(mm);
> +	walk_page_range(mm, PAGE_ALIGN(r->ar.start), PAGE_ALIGN(r->ar.end),

I guess PAGE_ALIGN()s are not necessary here?

> +			&damos_migrate_pages_walk_ops, &demote_pages);
> +	mmap_read_unlock(mm);
> +
> +	mmput(mm);
> +	if (list_empty(&demote_pages))
> +		return 0;
> +
> +	list_for_each_entry_safe(page, page2, &demote_pages, lru) {

I'd prefer 'next' or 'next_page' instead of 'page2'.

> +		list_add(&page->lru, &pagelist);
> +		target_nid = next_demotion_node(page_to_nid(page));
> +		nr_pages = thp_nr_pages(page);
> +
> +		ret = migrate_pages(&pagelist, alloc_demote_page, NULL,
> +				    target_nid, MIGRATE_SYNC, MR_DEMOTION,
> +				    &nr_succeeded);
> +		if (ret) {
> +			if (!list_empty(&pagelist)) {
> +				list_del(&page->lru);
> +				mod_node_page_state(page_pgdat(page),
> +						    NR_ISOLATED_ANON + page_is_file_lru(page),
> +						    -nr_pages);
> +				putback_lru_page(page);
> +			}
> +		} else {
> +			__count_vm_events(PGDEMOTE_DIRECT, nr_succeeded);
> +			demoted_pages += nr_succeeded;
> +		}

Why should we do above work for each page on our own instead of exporting and
calling 'demote_page_list()'?
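For reference, that helper in mm/vmscan.c looks roughly like below
(paraphrased sketch; details such as the migration mode may differ):

    static unsigned int demote_page_list(struct list_head *demote_pages,
                                         struct pglist_data *pgdat)
    {
            int target_nid = next_demotion_node(pgdat->node_id);
            unsigned int nr_succeeded;

            if (list_empty(demote_pages) || target_nid == NUMA_NO_NODE)
                    return 0;

            /* Demotion ignores all cpuset and mempolicy settings */
            migrate_pages(demote_pages, alloc_demote_page, NULL, target_nid,
                          MIGRATE_ASYNC, MR_DEMOTION, &nr_succeeded);

            if (current_is_kswapd())
                    __count_vm_events(PGDEMOTE_KSWAPD, nr_succeeded);
            else
                    __count_vm_events(PGDEMOTE_DIRECT, nr_succeeded);

            return nr_succeeded;
    }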


Thanks,
SJ

> +
> +		INIT_LIST_HEAD(&pagelist);
> +		cond_resched();
> +	}
> +
> +	return demoted_pages;
> +}
> +
>  static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx,
>  		struct damon_target *t, struct damon_region *r,
>  		struct damos *scheme)
> @@ -715,6 +869,8 @@ static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx,
>  	case DAMOS_NOHUGEPAGE:
>  		madv_action = MADV_NOHUGEPAGE;
>  		break;
> +	case DAMOS_DEMOTE:
> +		return damos_demote(t, r);
>  	case DAMOS_STAT:
>  		return 0;
>  	default:
> -- 
> 1.8.3.1
Baolin Wang Dec. 21, 2021, 2:18 p.m. UTC | #2
Hi SeongJae,

On 12/21/2021 9:24 PM, SeongJae Park Wrote:
> Hi Baolin,
> 
> 
> Thank you for this great patch!  I left some comments below.
> 
> On Tue, 21 Dec 2021 17:18:04 +0800 Baolin Wang <baolin.wang@linux.alibaba.com> wrote:
> 
>> On tiered memory system, the reclaim path in shrink_page_list() already
>> support demoting pages to slow memory node instead of discarding the
>> pages. However, at that time the fast memory node memory wartermark is
>> already tense, which will increase the memory allocation latency during
>> page demotion.
>>
>> We can rely on the DAMON in user space to help to monitor the cold
>> memory on fast memory node, and demote the cold pages to slow memory
>> node proactively to keep the fast memory node in a healthy state.
>> Thus this patch introduces a new scheme named DAMOS_DEMOTE to support
>> this feature.
>>
>> Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
>> ---
>>   include/linux/damon.h |   3 +
>>   mm/damon/dbgfs.c      |   1 +
>>   mm/damon/vaddr.c      | 156 ++++++++++++++++++++++++++++++++++++++++++++++++++
>>   3 files changed, 160 insertions(+)
>>
>> diff --git a/include/linux/damon.h b/include/linux/damon.h
>> index af64838..da9957c 100644
>> --- a/include/linux/damon.h
>> +++ b/include/linux/damon.h
>> @@ -87,6 +87,8 @@ struct damon_target {
>>    * @DAMOS_PAGEOUT:	Call ``madvise()`` for the region with MADV_PAGEOUT.
>>    * @DAMOS_HUGEPAGE:	Call ``madvise()`` for the region with MADV_HUGEPAGE.
>>    * @DAMOS_NOHUGEPAGE:	Call ``madvise()`` for the region with MADV_NOHUGEPAGE.
>> + * @DAMOS_DEMOTE:	Migate cold pages from fast memory type (DRAM) to slow
>> + * memory type (persistent memory).
> 
> s/Migate/Migrate/
> 
> Also, could you make the second line of the description aligned with the first
> line? e.g.,
> 
>      + * @DAMOS_DEMOTE:	Migate cold pages from fast memory type (DRAM) to slow
>      + *                 memory type (persistent memory).

Sure.

> 
>>    * @DAMOS_STAT:		Do nothing but count the stat.
>>    */
>>   enum damos_action {
>> @@ -95,6 +97,7 @@ enum damos_action {
>>   	DAMOS_PAGEOUT,
>>   	DAMOS_HUGEPAGE,
>>   	DAMOS_NOHUGEPAGE,
>> +	DAMOS_DEMOTE,
>>   	DAMOS_STAT,		/* Do nothing but only record the stat */
> 
> This would make user space people who were using 'DAMOS_STAT' be confused.
> Could you put 'DAMOS_DEMOTE' after 'DAMOS_STAT'?

Ah, right, will do.

> 
>>   };
>>   
>> diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c
>> index 58dbb96..43355ab 100644
>> --- a/mm/damon/dbgfs.c
>> +++ b/mm/damon/dbgfs.c
>> @@ -168,6 +168,7 @@ static bool damos_action_valid(int action)
>>   	case DAMOS_PAGEOUT:
>>   	case DAMOS_HUGEPAGE:
>>   	case DAMOS_NOHUGEPAGE:
>> +	case DAMOS_DEMOTE:
>>   	case DAMOS_STAT:
>>   		return true;
>>   	default:
>> diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
>> index 9e213a1..b354d3e 100644
>> --- a/mm/damon/vaddr.c
>> +++ b/mm/damon/vaddr.c
>> @@ -14,6 +14,10 @@
>>   #include <linux/page_idle.h>
>>   #include <linux/pagewalk.h>
>>   #include <linux/sched/mm.h>
>> +#include <linux/migrate.h>
>> +#include <linux/mm_inline.h>
>> +#include <linux/swapops.h>
>> +#include "../internal.h"
>>   
>>   #include "prmtv-common.h"
>>   
>> @@ -693,6 +697,156 @@ static unsigned long damos_madvise(struct damon_target *target,
>>   }
>>   #endif	/* CONFIG_ADVISE_SYSCALLS */
>>   
>> +static bool damos_isolate_page(struct page *page, struct list_head *demote_list)
>> +{
>> +	struct page *head = compound_head(page);
>> +
>> +	/* Do not interfere with other mappings of this page */
>> +	if (page_mapcount(head) != 1)
>> +		return false;
>> +
>> +	/* No need migration if the target demotion node is empty. */
>> +	if (next_demotion_node(page_to_nid(head)) == NUMA_NO_NODE)
>> +		return false;
>> +
>> +	if (isolate_lru_page(head))
>> +		return false;
>> +
>> +	list_add_tail(&head->lru, demote_list);
>> +	mod_node_page_state(page_pgdat(head),
>> +			    NR_ISOLATED_ANON + page_is_file_lru(head),
>> +			    thp_nr_pages(head));
>> +	return true;
> 
> The return value is not used by callers.  If not needed, let's remove it.

Yes.

> 
>> +}
>> +
>> +static int damos_migrate_pmd_entry(pmd_t *pmd, unsigned long addr,
>> +				   unsigned long end, struct mm_walk *walk)
>> +{
>> +	struct vm_area_struct *vma = walk->vma;
>> +	struct list_head *demote_list = walk->private;
>> +	spinlock_t *ptl;
>> +	struct page *page;
>> +	pte_t *pte, *mapped_pte;
>> +
>> +	if (!vma_migratable(vma))
>> +		return -EFAULT;
>> +
>> +	ptl = pmd_trans_huge_lock(pmd, vma);
>> +	if (ptl) {
>> +		/* Bail out if THP migration is not supported. */
>> +		if (!thp_migration_supported())
>> +			goto thp_out;
>> +
>> +		/* If the THP pte is under migration, do not bother it. */
>> +		if (unlikely(is_pmd_migration_entry(*pmd)))
>> +			goto thp_out;
>> +
>> +		page = damon_get_page(pmd_pfn(*pmd));
>> +		if (!page)
>> +			goto thp_out;
>> +
>> +		damos_isolate_page(page, demote_list);
>> +
>> +		put_page(page);
>> +thp_out:
>> +		spin_unlock(ptl);
>> +		return 0;
>> +	}
> 
> Could we wrap above THP handling code with '#ifdef CONFIG_TRANSPARENT_HUGEPAGE'?

pmd_trans_huge_lock() will return NULL if CONFIG_TRANSPARENT_HUGEPAGE is
not defined, and I think the compiler will optimize the code away when
CONFIG_TRANSPARENT_HUGEPAGE is not selected.  So we can just use it as is,
instead of adding annoying "#ifdef CONFIG_XXX" blocks in many places to
handle THP, which looks simpler and more readable to me.
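For reference, the !CONFIG_TRANSPARENT_HUGEPAGE stub in
include/linux/huge_mm.h is roughly the following (paraphrased sketch), so
the whole 'if (ptl)' branch becomes dead code without THP:

    static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd,
                    struct vm_area_struct *vma)
    {
            return NULL;
    }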

> 
>> +
>> +	/* regular page handling */
>> +	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
>> +		return -EINVAL;
>> +
>> +	mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
>> +	for (; addr != end; pte++, addr += PAGE_SIZE) {
>> +		if (pte_none(*pte) || !pte_present(*pte))
>> +			continue;
>> +
>> +		page = damon_get_page(pte_pfn(*pte));
>> +		if (!page)
>> +			continue;
>> +
>> +		damos_isolate_page(page, demote_list);
>> +		put_page(page);
>> +	}
>> +	pte_unmap_unlock(mapped_pte, ptl);
>> +	cond_resched();
>> +
>> +	return 0;
>> +}
>> +
>> +static const struct mm_walk_ops damos_migrate_pages_walk_ops = {
>> +	.pmd_entry              = damos_migrate_pmd_entry,
> 
> The names are little bit confusing to me.  How about 's/migrate/isolate/'?

Sure.

> 
>> +};
>> +
>> +/*
>> + * damos_demote() - demote cold pages from fast memory to slow memory
>> + * @target:    the given target
>> + * @r:         region of the target
>> + *
>> + * On tiered memory system, if DAMON monitored cold pages on fast memory
>> + * node (DRAM), we can demote them to slow memory node proactively in case
>> + * accumulating much more cold memory on fast memory node (DRAM) to reclaim.
>> + *
>> + * Return:
>> + * = 0 means no pages were demoted.
>> + * > 0 means how many cold pages were demoted successfully.
>> + * < 0 means errors happened.
> 
> damon_va_apply_scheme(), which returns what this function returns when
> DAMOS_DEMOTE action is given, should return bytes of the region that the action
> is successfully applied.

OK, I will change the return values as you suggested in the next version.

>> + */
>> +static int damos_demote(struct damon_target *target, struct damon_region *r)
>> +{
>> +	struct mm_struct *mm;
>> +	LIST_HEAD(demote_pages);
>> +	LIST_HEAD(pagelist);
>> +	int target_nid, nr_pages, ret = 0;
>> +	unsigned int nr_succeeded, demoted_pages = 0;
>> +	struct page *page, *page2;
>> +
>> +	/* Validate if allowing to do page demotion */
>> +	if (!numa_demotion_enabled)
>> +		return -EINVAL;
>> +
>> +	mm = damon_get_mm(target);
>> +	if (!mm)
>> +		return -ENOMEM;
>> +
>> +	mmap_read_lock(mm);
>> +	walk_page_range(mm, PAGE_ALIGN(r->ar.start), PAGE_ALIGN(r->ar.end),
> 
> I guess PAGE_ALIGN()s are not necessary here?

Yes, I think so too, will remove it.

> 
>> +			&damos_migrate_pages_walk_ops, &demote_pages);
>> +	mmap_read_unlock(mm);
>> +
>> +	mmput(mm);
>> +	if (list_empty(&demote_pages))
>> +		return 0;
>> +
>> +	list_for_each_entry_safe(page, page2, &demote_pages, lru) {
> 
> I'd prefer 'next' or 'next_page' instead of 'page2'.

Sure.

> 
>> +		list_add(&page->lru, &pagelist);
>> +		target_nid = next_demotion_node(page_to_nid(page));
>> +		nr_pages = thp_nr_pages(page);
>> +
>> +		ret = migrate_pages(&pagelist, alloc_demote_page, NULL,
>> +				    target_nid, MIGRATE_SYNC, MR_DEMOTION,
>> +				    &nr_succeeded);
>> +		if (ret) {
>> +			if (!list_empty(&pagelist)) {
>> +				list_del(&page->lru);
>> +				mod_node_page_state(page_pgdat(page),
>> +						    NR_ISOLATED_ANON + page_is_file_lru(page),
>> +						    -nr_pages);
>> +				putback_lru_page(page);
>> +			}
>> +		} else {
>> +			__count_vm_events(PGDEMOTE_DIRECT, nr_succeeded);
>> +			demoted_pages += nr_succeeded;
>> +		}
> 
> Why should we do above work for each page on our own instead of exporting and
> calling 'demote_page_list()'?

Because demote_page_list() is used to demote cold pages that all come from
the same fast memory node, they can share the same target demotion node.

But for the regions of a process, we can get cold pages from different
fast memory nodes (suppose we have multiple DRAM nodes on the system), so
their target demotion nodes may be different.  Thus we should migrate each
cold page after looking up its correct target demotion node.
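For example (a hypothetical topology, only to illustrate the point):

            /*
             * node 0 (DRAM) -> demotes to node 2 (PMEM)
             * node 1 (DRAM) -> demotes to node 3 (PMEM)
             * A region's pages may sit on node 0 or node 1, so the
             * target node must be resolved per page rather than once
             * per pgdat:
             */
            target_nid = next_demotion_node(page_to_nid(page));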

Thanks for your comments.
SeongJae Park Dec. 22, 2021, 8:43 a.m. UTC | #3
On Tue, 21 Dec 2021 22:18:06 +0800 Baolin Wang <baolin.wang@linux.alibaba.com> wrote:

> Hi SeongJae,
> 
> On 12/21/2021 9:24 PM, SeongJae Park Wrote:
> > Hi Baolin,
> > 
> > 
> > Thank you for this great patch!  I left some comments below.
> > 
> > On Tue, 21 Dec 2021 17:18:04 +0800 Baolin Wang <baolin.wang@linux.alibaba.com> wrote:
> > 
> >> On tiered memory system, the reclaim path in shrink_page_list() already
> >> support demoting pages to slow memory node instead of discarding the
> >> pages. However, at that time the fast memory node memory wartermark is
> >> already tense, which will increase the memory allocation latency during
> >> page demotion.
> >>
> >> We can rely on the DAMON in user space to help to monitor the cold
> >> memory on fast memory node, and demote the cold pages to slow memory
> >> node proactively to keep the fast memory node in a healthy state.
> >> Thus this patch introduces a new scheme named DAMOS_DEMOTE to support
> >> this feature.
> >>
> >> Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> >> ---
[...]
> >> + */
> >> +static int damos_demote(struct damon_target *target, struct damon_region *r)
> >> +{
> >> +	struct mm_struct *mm;
> >> +	LIST_HEAD(demote_pages);
> >> +	LIST_HEAD(pagelist);
> >> +	int target_nid, nr_pages, ret = 0;
> >> +	unsigned int nr_succeeded, demoted_pages = 0;
> >> +	struct page *page, *page2;
> >> +
> >> +	/* Validate if allowing to do page demotion */
> >> +	if (!numa_demotion_enabled)
> >> +		return -EINVAL;
> >> +
> >> +	mm = damon_get_mm(target);
> >> +	if (!mm)
> >> +		return -ENOMEM;
> >> +
> >> +	mmap_read_lock(mm);
> >> +	walk_page_range(mm, PAGE_ALIGN(r->ar.start), PAGE_ALIGN(r->ar.end),
[...]
> >> +			&damos_migrate_pages_walk_ops, &demote_pages);
> >> +	mmap_read_unlock(mm);
> >> +
> >> +	mmput(mm);
> >> +	if (list_empty(&demote_pages))
> >> +		return 0;
> >> +
> >> +	list_for_each_entry_safe(page, page2, &demote_pages, lru) {
> > 
> > I'd prefer 'next' or 'next_page' instead of 'page2'.
> 
> Sure.
> 
> > 
> >> +		list_add(&page->lru, &pagelist);
> >> +		target_nid = next_demotion_node(page_to_nid(page));
> >> +		nr_pages = thp_nr_pages(page);
> >> +
> >> +		ret = migrate_pages(&pagelist, alloc_demote_page, NULL,
> >> +				    target_nid, MIGRATE_SYNC, MR_DEMOTION,
> >> +				    &nr_succeeded);
> >> +		if (ret) {
> >> +			if (!list_empty(&pagelist)) {
> >> +				list_del(&page->lru);
> >> +				mod_node_page_state(page_pgdat(page),
> >> +						    NR_ISOLATED_ANON + page_is_file_lru(page),
> >> +						    -nr_pages);
> >> +				putback_lru_page(page);
> >> +			}
> >> +		} else {
> >> +			__count_vm_events(PGDEMOTE_DIRECT, nr_succeeded);
> >> +			demoted_pages += nr_succeeded;
> >> +		}
> > 
> > Why should we do above work for each page on our own instead of exporting and
> > calling 'demote_page_list()'?
> 
> Cuase demote_page_list() is used for demote cold pages which are from 
> one same fast memory node, they can have one same target demotion node.
> 
> But for the regions for the process, we can get some cold pages from 
> different fast memory nodes (suppose we have mulptiple dram node on the 
> system), so its target demotion node may be different. Thus we should 
> migrate each cold pages with getting the correct target demotion node.

Thank you for the kind explanation.  But, I still want to reuse the code rather
than copying if possible.  How about below dumb ideas off the top of my head?

1. Call demote_page_list() for each page from here
2. Call demote_page_list() for each page from damos_migrate_pmd_entry()
3. Make damos_migrate_pages_walk_ops to configure multiple demote_pages lists,
   each per fast memory node, and calling demote_page_list() here for each
   per-fast-memory-node demote_pages list.
4. Make demote_page_list() handles this case and use it.  e.g., NULL pgdat
   parameter means the pages are not from same node.

Please let me know if I'm missing something.
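To make idea 3 a bit more concrete, a very rough sketch (hypothetical
helper name, and 'demote_page_list()' would have to be exported for it;
a per-node array of lists may also be too heavy in practice):

    /* demote_pages[nid] holds the pages isolated from fast node 'nid' */
    static unsigned long damos_demote_per_node(struct list_head *demote_pages)
    {
            unsigned long demoted = 0;
            int nid;

            for_each_node_state(nid, N_MEMORY) {
                    if (list_empty(&demote_pages[nid]))
                            continue;
                    demoted += demote_page_list(&demote_pages[nid],
                                                NODE_DATA(nid));
            }
            return demoted;
    }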


Thanks,
SJ

> 
> Thanks for your comments.
Baolin Wang Dec. 22, 2021, 9:15 a.m. UTC | #4
On 12/22/2021 4:43 PM, SeongJae Park wrote:
> On Tue, 21 Dec 2021 22:18:06 +0800 Baolin Wang <baolin.wang@linux.alibaba.com> wrote:
> 
>> Hi SeongJae,
>>
>> On 12/21/2021 9:24 PM, SeongJae Park Wrote:
>>> Hi Baolin,
>>>
>>>
>>> Thank you for this great patch!  I left some comments below.
>>>
>>> On Tue, 21 Dec 2021 17:18:04 +0800 Baolin Wang <baolin.wang@linux.alibaba.com> wrote:
>>>
>>>> On tiered memory system, the reclaim path in shrink_page_list() already
>>>> support demoting pages to slow memory node instead of discarding the
>>>> pages. However, at that time the fast memory node memory wartermark is
>>>> already tense, which will increase the memory allocation latency during
>>>> page demotion.
>>>>
>>>> We can rely on the DAMON in user space to help to monitor the cold
>>>> memory on fast memory node, and demote the cold pages to slow memory
>>>> node proactively to keep the fast memory node in a healthy state.
>>>> Thus this patch introduces a new scheme named DAMOS_DEMOTE to support
>>>> this feature.
>>>>
>>>> Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
>>>> ---

[snip]

>>
>>>
>>>> +		list_add(&page->lru, &pagelist);
>>>> +		target_nid = next_demotion_node(page_to_nid(page));
>>>> +		nr_pages = thp_nr_pages(page);
>>>> +
>>>> +		ret = migrate_pages(&pagelist, alloc_demote_page, NULL,
>>>> +				    target_nid, MIGRATE_SYNC, MR_DEMOTION,
>>>> +				    &nr_succeeded);
>>>> +		if (ret) {
>>>> +			if (!list_empty(&pagelist)) {
>>>> +				list_del(&page->lru);
>>>> +				mod_node_page_state(page_pgdat(page),
>>>> +						    NR_ISOLATED_ANON + page_is_file_lru(page),
>>>> +						    -nr_pages);
>>>> +				putback_lru_page(page);
>>>> +			}
>>>> +		} else {
>>>> +			__count_vm_events(PGDEMOTE_DIRECT, nr_succeeded);
>>>> +			demoted_pages += nr_succeeded;
>>>> +		}
>>>
>>> Why should we do above work for each page on our own instead of exporting and
>>> calling 'demote_page_list()'?
>>
>> Cuase demote_page_list() is used for demote cold pages which are from
>> one same fast memory node, they can have one same target demotion node.
>>
>> But for the regions for the process, we can get some cold pages from
>> different fast memory nodes (suppose we have mulptiple dram node on the
>> system), so its target demotion node may be different. Thus we should
>> migrate each cold pages with getting the correct target demotion node.
> 
> Thank you for the kind explanation.  But, I still want to reuse the code rather
> than copying if possible.  How about below dumb ideas off the top of my head?
> 
> 1. Call demote_page_list() for each page from here

Sounds reasonable.

> 2. Call demote_page_list() for each page from damos_migrate_pmd_entry()

We are under the mmap lock in damos_migrate_pmd_entry(), so it is not
suitable to do page migration there.

> 3. Make damos_migrate_pages_walk_ops to configure multiple demote_pages lists,
>     each per fast memory node, and calling demote_page_list() here for each
>     per-fast-memory-node demote_pages list.

That will bring more complexity, I think, and we should avoid doing
migration under the mmap lock.

> 4. Make demote_page_list() handles this case and use it.  e.g., NULL pgdat
>     parameter means the pages are not from same node.

Thanks for your suggestion.  Actually, after more thinking, I think we can
reuse demote_page_list() and the change can be easy.  Something like below,
on top of my current patch; what do you think?  Thanks.

--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -796,7 +796,7 @@ static unsigned long damos_demote(struct damon_target *target,
         struct mm_struct *mm;
         LIST_HEAD(demote_pages);
         LIST_HEAD(pagelist);
-       int target_nid, nr_pages, ret = 0;
+       int nr_pages;
         unsigned int nr_succeeded, demoted_pages = 0;
         struct page *page, *next;

@@ -818,14 +818,11 @@ static unsigned long damos_demote(struct damon_target *target,
                 return 0;

         list_for_each_entry_safe(page, next, &demote_pages, lru) {
-               list_add(&page->lru, &pagelist);
-               target_nid = next_demotion_node(page_to_nid(page));
                 nr_pages = thp_nr_pages(page);
+               list_add(&page->lru, &pagelist);

-               ret = migrate_pages(&pagelist, alloc_demote_page, NULL,
-                                   target_nid, MIGRATE_SYNC, MR_DEMOTION,
-                                   &nr_succeeded);
-               if (ret) {
+               nr_succeeded = demote_page_list(&pagelist, page_pgdat(page));
+               if (!nr_succeeded) {
                         if (!list_empty(&pagelist)) {
                                 list_del(&page->lru);
                                 mod_node_page_state(page_pgdat(page),
@@ -834,11 +831,9 @@ static unsigned long damos_demote(struct damon_target *target,
                                 putback_lru_page(page);
                         }
                 } else {
-                       __count_vm_events(PGDEMOTE_DIRECT, nr_succeeded);
                         demoted_pages += nr_succeeded;
                 }

-               INIT_LIST_HEAD(&pagelist);
                 cond_resched();
         }
kernel test robot Dec. 22, 2021, 11:17 a.m. UTC | #5
Hi Baolin,

I love your patch! Yet something to improve:

[auto build test ERROR on next-20211220]
[cannot apply to hnaz-mm/master linux/master linus/master v5.16-rc6 v5.16-rc5 v5.16-rc4 v5.16-rc6]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Baolin-Wang/Add-a-new-scheme-to-support-demotion-on-tiered-memory-system/20211221-172017
base:    07f8c60fe60f84977dc815ec8a6b1100827c34dd
config: arc-randconfig-r043-20211222 (https://download.01.org/0day-ci/archive/20211222/202112221923.4m9tlPoy-lkp@intel.com/config)
compiler: arc-elf-gcc (GCC) 11.2.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/ed1c1ea9c5b5ea81916c10e50c4a1613bce4b5a9
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Baolin-Wang/Add-a-new-scheme-to-support-demotion-on-tiered-memory-system/20211221-172017
        git checkout ed1c1ea9c5b5ea81916c10e50c4a1613bce4b5a9
        # save the config file to linux build tree
        mkdir build_dir
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.2.0 make.cross O=build_dir ARCH=arc SHELL=/bin/bash

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   mm/damon/vaddr.c: In function 'damos_migrate_pmd_entry':
>> mm/damon/vaddr.c:635:14: error: implicit declaration of function 'vma_migratable'; did you mean 'HPageMigratable'? [-Werror=implicit-function-declaration]
     635 |         if (!vma_migratable(vma))
         |              ^~~~~~~~~~~~~~
         |              HPageMigratable
>> mm/damon/vaddr.c:648:39: error: implicit declaration of function 'pmd_pfn'; did you mean 'pmd_off'? [-Werror=implicit-function-declaration]
     648 |                 page = damon_get_page(pmd_pfn(*pmd));
         |                                       ^~~~~~~
         |                                       pmd_off
   cc1: some warnings being treated as errors


vim +635 mm/damon/vaddr.c

   625	
   626	static int damos_migrate_pmd_entry(pmd_t *pmd, unsigned long addr,
   627					   unsigned long end, struct mm_walk *walk)
   628	{
   629		struct vm_area_struct *vma = walk->vma;
   630		struct list_head *demote_list = walk->private;
   631		spinlock_t *ptl;
   632		struct page *page;
   633		pte_t *pte, *mapped_pte;
   634	
 > 635		if (!vma_migratable(vma))
   636			return -EFAULT;
   637	
   638		ptl = pmd_trans_huge_lock(pmd, vma);
   639		if (ptl) {
   640			/* Bail out if THP migration is not supported. */
   641			if (!thp_migration_supported())
   642				goto thp_out;
   643	
   644			/* If the THP pte is under migration, do not bother it. */
   645			if (unlikely(is_pmd_migration_entry(*pmd)))
   646				goto thp_out;
   647	
 > 648			page = damon_get_page(pmd_pfn(*pmd));
   649			if (!page)
   650				goto thp_out;
   651	
   652			damos_isolate_page(page, demote_list);
   653	
   654			put_page(page);
   655	thp_out:
   656			spin_unlock(ptl);
   657			return 0;
   658		}
   659	
   660		/* regular page handling */
   661		if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
   662			return -EINVAL;
   663	
   664		mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
   665		for (; addr != end; pte++, addr += PAGE_SIZE) {
   666			if (pte_none(*pte) || !pte_present(*pte))
   667				continue;
   668	
   669			page = damon_get_page(pte_pfn(*pte));
   670			if (!page)
   671				continue;
   672	
   673			damos_isolate_page(page, demote_list);
   674			put_page(page);
   675		}
   676		pte_unmap_unlock(mapped_pte, ptl);
   677		cond_resched();
   678	
   679		return 0;
   680	}
   681	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
SeongJae Park Dec. 23, 2021, 8:53 a.m. UTC | #6
On Wed, 22 Dec 2021 17:15:18 +0800 Baolin Wang <baolin.wang@linux.alibaba.com> wrote:

> 
> 
> On 12/22/2021 4:43 PM, SeongJae Park wrote:
> > On Tue, 21 Dec 2021 22:18:06 +0800 Baolin Wang <baolin.wang@linux.alibaba.com> wrote:
> > 
> >> Hi SeongJae,
> >>
> >> On 12/21/2021 9:24 PM, SeongJae Park Wrote:
> >>> Hi Baolin,
> >>>
> >>>
> >>> Thank you for this great patch!  I left some comments below.
> >>>
> >>> On Tue, 21 Dec 2021 17:18:04 +0800 Baolin Wang <baolin.wang@linux.alibaba.com> wrote:
> >>>
> >>>> On tiered memory system, the reclaim path in shrink_page_list() already
> >>>> support demoting pages to slow memory node instead of discarding the
> >>>> pages. However, at that time the fast memory node memory wartermark is
> >>>> already tense, which will increase the memory allocation latency during
> >>>> page demotion.
> >>>>
> >>>> We can rely on the DAMON in user space to help to monitor the cold
> >>>> memory on fast memory node, and demote the cold pages to slow memory
> >>>> node proactively to keep the fast memory node in a healthy state.
> >>>> Thus this patch introduces a new scheme named DAMOS_DEMOTE to support
> >>>> this feature.
> >>>>
> >>>> Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> >>>> ---
> 
> [snip]
> 
> >>
> >>>
> >>>> +		list_add(&page->lru, &pagelist);
> >>>> +		target_nid = next_demotion_node(page_to_nid(page));
> >>>> +		nr_pages = thp_nr_pages(page);
> >>>> +
> >>>> +		ret = migrate_pages(&pagelist, alloc_demote_page, NULL,
> >>>> +				    target_nid, MIGRATE_SYNC, MR_DEMOTION,
> >>>> +				    &nr_succeeded);
> >>>> +		if (ret) {
> >>>> +			if (!list_empty(&pagelist)) {
> >>>> +				list_del(&page->lru);
> >>>> +				mod_node_page_state(page_pgdat(page),
> >>>> +						    NR_ISOLATED_ANON + page_is_file_lru(page),
> >>>> +						    -nr_pages);
> >>>> +				putback_lru_page(page);
> >>>> +			}
> >>>> +		} else {
> >>>> +			__count_vm_events(PGDEMOTE_DIRECT, nr_succeeded);
> >>>> +			demoted_pages += nr_succeeded;
> >>>> +		}
> >>>
> >>> Why should we do above work for each page on our own instead of exporting and
> >>> calling 'demote_page_list()'?
> >>
> >> Cuase demote_page_list() is used for demote cold pages which are from
> >> one same fast memory node, they can have one same target demotion node.
> >>
> >> But for the regions for the process, we can get some cold pages from
> >> different fast memory nodes (suppose we have mulptiple dram node on the
> >> system), so its target demotion node may be different. Thus we should
> >> migrate each cold pages with getting the correct target demotion node.
> > 
> > Thank you for the kind explanation.  But, I still want to reuse the code rather
> > than copying if possible.  How about below dumb ideas off the top of my head?
> > 
> > 1. Call demote_page_list() for each page from here
> 
> Sounds reasonable.
> 
> > 2. Call demote_page_list() for each page from damos_migrate_pmd_entry()
> 
> We are under mmap lock in damos_migrate_pmd_entry(), it is not suitable 
> to do page migration.
> 
> > 3. Make damos_migrate_pages_walk_ops to configure multiple demote_pages lists,
> >     each per fast memory node, and calling demote_page_list() here for each
> >     per-fast-memory-node demote_pages list.
> 
> Which will bring more complexity I think, and we should avoid doing 
> migration under mmap lock.
> 
> > 4. Make demote_page_list() handles this case and use it.  e.g., NULL pgdat
> >     parameter means the pages are not from same node.
> 
> Thanks for your suggestion, actually after more thinking, I think we can 
> reuse the demote_page_list() and it can be easy to change. Somethink 
> like below on top of my current patch, how do you think? Thanks.

So, you selected option 1.  I personally think option 3 or 4 would be more
optimal, but I also agree that they could increase the complexity.  As we
already have the time/space quota feature for schemes overhead control, I
think starting with this simple approach makes sense to me.


Thanks,
SJ

[...]

Patch

diff --git a/include/linux/damon.h b/include/linux/damon.h
index af64838..da9957c 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -87,6 +87,8 @@  struct damon_target {
  * @DAMOS_PAGEOUT:	Call ``madvise()`` for the region with MADV_PAGEOUT.
  * @DAMOS_HUGEPAGE:	Call ``madvise()`` for the region with MADV_HUGEPAGE.
  * @DAMOS_NOHUGEPAGE:	Call ``madvise()`` for the region with MADV_NOHUGEPAGE.
+ * @DAMOS_DEMOTE:	Migate cold pages from fast memory type (DRAM) to slow
+ * memory type (persistent memory).
  * @DAMOS_STAT:		Do nothing but count the stat.
  */
 enum damos_action {
@@ -95,6 +97,7 @@  enum damos_action {
 	DAMOS_PAGEOUT,
 	DAMOS_HUGEPAGE,
 	DAMOS_NOHUGEPAGE,
+	DAMOS_DEMOTE,
 	DAMOS_STAT,		/* Do nothing but only record the stat */
 };
 
diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c
index 58dbb96..43355ab 100644
--- a/mm/damon/dbgfs.c
+++ b/mm/damon/dbgfs.c
@@ -168,6 +168,7 @@  static bool damos_action_valid(int action)
 	case DAMOS_PAGEOUT:
 	case DAMOS_HUGEPAGE:
 	case DAMOS_NOHUGEPAGE:
+	case DAMOS_DEMOTE:
 	case DAMOS_STAT:
 		return true;
 	default:
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index 9e213a1..b354d3e 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -14,6 +14,10 @@ 
 #include <linux/page_idle.h>
 #include <linux/pagewalk.h>
 #include <linux/sched/mm.h>
+#include <linux/migrate.h>
+#include <linux/mm_inline.h>
+#include <linux/swapops.h>
+#include "../internal.h"
 
 #include "prmtv-common.h"
 
@@ -693,6 +697,156 @@  static unsigned long damos_madvise(struct damon_target *target,
 }
 #endif	/* CONFIG_ADVISE_SYSCALLS */
 
+static bool damos_isolate_page(struct page *page, struct list_head *demote_list)
+{
+	struct page *head = compound_head(page);
+
+	/* Do not interfere with other mappings of this page */
+	if (page_mapcount(head) != 1)
+		return false;
+
+	/* No need migration if the target demotion node is empty. */
+	if (next_demotion_node(page_to_nid(head)) == NUMA_NO_NODE)
+		return false;
+
+	if (isolate_lru_page(head))
+		return false;
+
+	list_add_tail(&head->lru, demote_list);
+	mod_node_page_state(page_pgdat(head),
+			    NR_ISOLATED_ANON + page_is_file_lru(head),
+			    thp_nr_pages(head));
+	return true;
+}
+
+static int damos_migrate_pmd_entry(pmd_t *pmd, unsigned long addr,
+				   unsigned long end, struct mm_walk *walk)
+{
+	struct vm_area_struct *vma = walk->vma;
+	struct list_head *demote_list = walk->private;
+	spinlock_t *ptl;
+	struct page *page;
+	pte_t *pte, *mapped_pte;
+
+	if (!vma_migratable(vma))
+		return -EFAULT;
+
+	ptl = pmd_trans_huge_lock(pmd, vma);
+	if (ptl) {
+		/* Bail out if THP migration is not supported. */
+		if (!thp_migration_supported())
+			goto thp_out;
+
+		/* If the THP pte is under migration, do not bother it. */
+		if (unlikely(is_pmd_migration_entry(*pmd)))
+			goto thp_out;
+
+		page = damon_get_page(pmd_pfn(*pmd));
+		if (!page)
+			goto thp_out;
+
+		damos_isolate_page(page, demote_list);
+
+		put_page(page);
+thp_out:
+		spin_unlock(ptl);
+		return 0;
+	}
+
+	/* regular page handling */
+	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
+		return -EINVAL;
+
+	mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+	for (; addr != end; pte++, addr += PAGE_SIZE) {
+		if (pte_none(*pte) || !pte_present(*pte))
+			continue;
+
+		page = damon_get_page(pte_pfn(*pte));
+		if (!page)
+			continue;
+
+		damos_isolate_page(page, demote_list);
+		put_page(page);
+	}
+	pte_unmap_unlock(mapped_pte, ptl);
+	cond_resched();
+
+	return 0;
+}
+
+static const struct mm_walk_ops damos_migrate_pages_walk_ops = {
+	.pmd_entry              = damos_migrate_pmd_entry,
+};
+
+/*
+ * damos_demote() - demote cold pages from fast memory to slow memory
+ * @target:    the given target
+ * @r:         region of the target
+ *
+ * On tiered memory system, if DAMON monitored cold pages on fast memory
+ * node (DRAM), we can demote them to slow memory node proactively in case
+ * accumulating much more cold memory on fast memory node (DRAM) to reclaim.
+ *
+ * Return:
+ * = 0 means no pages were demoted.
+ * > 0 means how many cold pages were demoted successfully.
+ * < 0 means errors happened.
+ */
+static int damos_demote(struct damon_target *target, struct damon_region *r)
+{
+	struct mm_struct *mm;
+	LIST_HEAD(demote_pages);
+	LIST_HEAD(pagelist);
+	int target_nid, nr_pages, ret = 0;
+	unsigned int nr_succeeded, demoted_pages = 0;
+	struct page *page, *page2;
+
+	/* Validate if allowing to do page demotion */
+	if (!numa_demotion_enabled)
+		return -EINVAL;
+
+	mm = damon_get_mm(target);
+	if (!mm)
+		return -ENOMEM;
+
+	mmap_read_lock(mm);
+	walk_page_range(mm, PAGE_ALIGN(r->ar.start), PAGE_ALIGN(r->ar.end),
+			&damos_migrate_pages_walk_ops, &demote_pages);
+	mmap_read_unlock(mm);
+
+	mmput(mm);
+	if (list_empty(&demote_pages))
+		return 0;
+
+	list_for_each_entry_safe(page, page2, &demote_pages, lru) {
+		list_add(&page->lru, &pagelist);
+		target_nid = next_demotion_node(page_to_nid(page));
+		nr_pages = thp_nr_pages(page);
+
+		ret = migrate_pages(&pagelist, alloc_demote_page, NULL,
+				    target_nid, MIGRATE_SYNC, MR_DEMOTION,
+				    &nr_succeeded);
+		if (ret) {
+			if (!list_empty(&pagelist)) {
+				list_del(&page->lru);
+				mod_node_page_state(page_pgdat(page),
+						    NR_ISOLATED_ANON + page_is_file_lru(page),
+						    -nr_pages);
+				putback_lru_page(page);
+			}
+		} else {
+			__count_vm_events(PGDEMOTE_DIRECT, nr_succeeded);
+			demoted_pages += nr_succeeded;
+		}
+
+		INIT_LIST_HEAD(&pagelist);
+		cond_resched();
+	}
+
+	return demoted_pages;
+}
+
 static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx,
 		struct damon_target *t, struct damon_region *r,
 		struct damos *scheme)
@@ -715,6 +869,8 @@  static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx,
 	case DAMOS_NOHUGEPAGE:
 		madv_action = MADV_NOHUGEPAGE;
 		break;
+	case DAMOS_DEMOTE:
+		return damos_demote(t, r);
 	case DAMOS_STAT:
 		return 0;
 	default: