Message ID | 1c014ce5c6f6c62a30d07096c5e28aa1310c1bbd.1640077468.git.baolin.wang@linux.alibaba.com (mailing list archive) |
---|---|
State | New |
Series | Add a new scheme to support demotion on tiered memory system |
Hi Baolin, Thank you for this great patch! I left some comments below. On Tue, 21 Dec 2021 17:18:04 +0800 Baolin Wang <baolin.wang@linux.alibaba.com> wrote: > On tiered memory system, the reclaim path in shrink_page_list() already > support demoting pages to slow memory node instead of discarding the > pages. However, at that time the fast memory node memory wartermark is > already tense, which will increase the memory allocation latency during > page demotion. > > We can rely on the DAMON in user space to help to monitor the cold > memory on fast memory node, and demote the cold pages to slow memory > node proactively to keep the fast memory node in a healthy state. > Thus this patch introduces a new scheme named DAMOS_DEMOTE to support > this feature. > > Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com> > --- > include/linux/damon.h | 3 + > mm/damon/dbgfs.c | 1 + > mm/damon/vaddr.c | 156 ++++++++++++++++++++++++++++++++++++++++++++++++++ > 3 files changed, 160 insertions(+) > > diff --git a/include/linux/damon.h b/include/linux/damon.h > index af64838..da9957c 100644 > --- a/include/linux/damon.h > +++ b/include/linux/damon.h > @@ -87,6 +87,8 @@ struct damon_target { > * @DAMOS_PAGEOUT: Call ``madvise()`` for the region with MADV_PAGEOUT. > * @DAMOS_HUGEPAGE: Call ``madvise()`` for the region with MADV_HUGEPAGE. > * @DAMOS_NOHUGEPAGE: Call ``madvise()`` for the region with MADV_NOHUGEPAGE. > + * @DAMOS_DEMOTE: Migate cold pages from fast memory type (DRAM) to slow > + * memory type (persistent memory). s/Migate/Migrate/ Also, could you make the second line of the description aligned with the first line? e.g., + * @DAMOS_DEMOTE: Migate cold pages from fast memory type (DRAM) to slow + * memory type (persistent memory). > * @DAMOS_STAT: Do nothing but count the stat. > */ > enum damos_action { > @@ -95,6 +97,7 @@ enum damos_action { > DAMOS_PAGEOUT, > DAMOS_HUGEPAGE, > DAMOS_NOHUGEPAGE, > + DAMOS_DEMOTE, > DAMOS_STAT, /* Do nothing but only record the stat */ This would make user space people who were using 'DAMOS_STAT' be confused. Could you put 'DAMOS_DEMOTE' after 'DAMOS_STAT'? > }; > > diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c > index 58dbb96..43355ab 100644 > --- a/mm/damon/dbgfs.c > +++ b/mm/damon/dbgfs.c > @@ -168,6 +168,7 @@ static bool damos_action_valid(int action) > case DAMOS_PAGEOUT: > case DAMOS_HUGEPAGE: > case DAMOS_NOHUGEPAGE: > + case DAMOS_DEMOTE: > case DAMOS_STAT: > return true; > default: > diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c > index 9e213a1..b354d3e 100644 > --- a/mm/damon/vaddr.c > +++ b/mm/damon/vaddr.c > @@ -14,6 +14,10 @@ > #include <linux/page_idle.h> > #include <linux/pagewalk.h> > #include <linux/sched/mm.h> > +#include <linux/migrate.h> > +#include <linux/mm_inline.h> > +#include <linux/swapops.h> > +#include "../internal.h" > > #include "prmtv-common.h" > > @@ -693,6 +697,156 @@ static unsigned long damos_madvise(struct damon_target *target, > } > #endif /* CONFIG_ADVISE_SYSCALLS */ > > +static bool damos_isolate_page(struct page *page, struct list_head *demote_list) > +{ > + struct page *head = compound_head(page); > + > + /* Do not interfere with other mappings of this page */ > + if (page_mapcount(head) != 1) > + return false; > + > + /* No need migration if the target demotion node is empty. 
*/ > + if (next_demotion_node(page_to_nid(head)) == NUMA_NO_NODE) > + return false; > + > + if (isolate_lru_page(head)) > + return false; > + > + list_add_tail(&head->lru, demote_list); > + mod_node_page_state(page_pgdat(head), > + NR_ISOLATED_ANON + page_is_file_lru(head), > + thp_nr_pages(head)); > + return true; The return value is not used by callers. If not needed, let's remove it. > +} > + > +static int damos_migrate_pmd_entry(pmd_t *pmd, unsigned long addr, > + unsigned long end, struct mm_walk *walk) > +{ > + struct vm_area_struct *vma = walk->vma; > + struct list_head *demote_list = walk->private; > + spinlock_t *ptl; > + struct page *page; > + pte_t *pte, *mapped_pte; > + > + if (!vma_migratable(vma)) > + return -EFAULT; > + > + ptl = pmd_trans_huge_lock(pmd, vma); > + if (ptl) { > + /* Bail out if THP migration is not supported. */ > + if (!thp_migration_supported()) > + goto thp_out; > + > + /* If the THP pte is under migration, do not bother it. */ > + if (unlikely(is_pmd_migration_entry(*pmd))) > + goto thp_out; > + > + page = damon_get_page(pmd_pfn(*pmd)); > + if (!page) > + goto thp_out; > + > + damos_isolate_page(page, demote_list); > + > + put_page(page); > +thp_out: > + spin_unlock(ptl); > + return 0; > + } Could we wrap above THP handling code with '#ifdef CONFIG_TRANSPARENT_HUGEPAGE'? > + > + /* regular page handling */ > + if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) > + return -EINVAL; > + > + mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); > + for (; addr != end; pte++, addr += PAGE_SIZE) { > + if (pte_none(*pte) || !pte_present(*pte)) > + continue; > + > + page = damon_get_page(pte_pfn(*pte)); > + if (!page) > + continue; > + > + damos_isolate_page(page, demote_list); > + put_page(page); > + } > + pte_unmap_unlock(mapped_pte, ptl); > + cond_resched(); > + > + return 0; > +} > + > +static const struct mm_walk_ops damos_migrate_pages_walk_ops = { > + .pmd_entry = damos_migrate_pmd_entry, The names are little bit confusing to me. How about 's/migrate/isolate/'? > +}; > + > +/* > + * damos_demote() - demote cold pages from fast memory to slow memory > + * @target: the given target > + * @r: region of the target > + * > + * On tiered memory system, if DAMON monitored cold pages on fast memory > + * node (DRAM), we can demote them to slow memory node proactively in case > + * accumulating much more cold memory on fast memory node (DRAM) to reclaim. > + * > + * Return: > + * = 0 means no pages were demoted. > + * > 0 means how many cold pages were demoted successfully. > + * < 0 means errors happened. damon_va_apply_scheme(), which returns what this function returns when DAMOS_DEMOTE action is given, should return bytes of the region that the action is successfully applied. > + */ > +static int damos_demote(struct damon_target *target, struct damon_region *r) > +{ > + struct mm_struct *mm; > + LIST_HEAD(demote_pages); > + LIST_HEAD(pagelist); > + int target_nid, nr_pages, ret = 0; > + unsigned int nr_succeeded, demoted_pages = 0; > + struct page *page, *page2; > + > + /* Validate if allowing to do page demotion */ > + if (!numa_demotion_enabled) > + return -EINVAL; > + > + mm = damon_get_mm(target); > + if (!mm) > + return -ENOMEM; > + > + mmap_read_lock(mm); > + walk_page_range(mm, PAGE_ALIGN(r->ar.start), PAGE_ALIGN(r->ar.end), I guess PAGE_ALIGN()s are not necessary here? 
> + &damos_migrate_pages_walk_ops, &demote_pages); > + mmap_read_unlock(mm); > + > + mmput(mm); > + if (list_empty(&demote_pages)) > + return 0; > + > + list_for_each_entry_safe(page, page2, &demote_pages, lru) { I'd prefer 'next' or 'next_page' instead of 'page2'. > + list_add(&page->lru, &pagelist); > + target_nid = next_demotion_node(page_to_nid(page)); > + nr_pages = thp_nr_pages(page); > + > + ret = migrate_pages(&pagelist, alloc_demote_page, NULL, > + target_nid, MIGRATE_SYNC, MR_DEMOTION, > + &nr_succeeded); > + if (ret) { > + if (!list_empty(&pagelist)) { > + list_del(&page->lru); > + mod_node_page_state(page_pgdat(page), > + NR_ISOLATED_ANON + page_is_file_lru(page), > + -nr_pages); > + putback_lru_page(page); > + } > + } else { > + __count_vm_events(PGDEMOTE_DIRECT, nr_succeeded); > + demoted_pages += nr_succeeded; > + } Why should we do above work for each page on our own instead of exporting and calling 'demote_page_list()'? Thanks, SJ > + > + INIT_LIST_HEAD(&pagelist); > + cond_resched(); > + } > + > + return demoted_pages; > +} > + > static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx, > struct damon_target *t, struct damon_region *r, > struct damos *scheme) > @@ -715,6 +869,8 @@ static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx, > case DAMOS_NOHUGEPAGE: > madv_action = MADV_NOHUGEPAGE; > break; > + case DAMOS_DEMOTE: > + return damos_demote(t, r); > case DAMOS_STAT: > return 0; > default: > -- > 1.8.3.1
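To make the return-value comment above concrete: DAMOS action handlers are expected to report how many bytes of the region the action was applied to. Below is a minimal, illustrative-only sketch of that convention for the demote handler; the demoted_pages counter is from the posted patch, while the byte conversion only illustrates the reviewer's suggestion and is not posted code.

```c
/*
 * Illustration of the suggested return convention only, not code from
 * this thread.  Callers of damon_va_apply_scheme() interpret the return
 * value as "bytes of the region the scheme was successfully applied to".
 */
static unsigned long damos_demote(struct damon_target *target,
				  struct damon_region *r)
{
	unsigned int demoted_pages = 0;

	/* ... isolate and migrate cold pages as in the posted patch ... */

	return (unsigned long)demoted_pages * PAGE_SIZE;
}
```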
Hi SeongJae, On 12/21/2021 9:24 PM, SeongJae Park Wrote: > Hi Baolin, > > > Thank you for this great patch! I left some comments below. > > On Tue, 21 Dec 2021 17:18:04 +0800 Baolin Wang <baolin.wang@linux.alibaba.com> wrote: > >> On tiered memory system, the reclaim path in shrink_page_list() already >> support demoting pages to slow memory node instead of discarding the >> pages. However, at that time the fast memory node memory wartermark is >> already tense, which will increase the memory allocation latency during >> page demotion. >> >> We can rely on the DAMON in user space to help to monitor the cold >> memory on fast memory node, and demote the cold pages to slow memory >> node proactively to keep the fast memory node in a healthy state. >> Thus this patch introduces a new scheme named DAMOS_DEMOTE to support >> this feature. >> >> Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com> >> --- >> include/linux/damon.h | 3 + >> mm/damon/dbgfs.c | 1 + >> mm/damon/vaddr.c | 156 ++++++++++++++++++++++++++++++++++++++++++++++++++ >> 3 files changed, 160 insertions(+) >> >> diff --git a/include/linux/damon.h b/include/linux/damon.h >> index af64838..da9957c 100644 >> --- a/include/linux/damon.h >> +++ b/include/linux/damon.h >> @@ -87,6 +87,8 @@ struct damon_target { >> * @DAMOS_PAGEOUT: Call ``madvise()`` for the region with MADV_PAGEOUT. >> * @DAMOS_HUGEPAGE: Call ``madvise()`` for the region with MADV_HUGEPAGE. >> * @DAMOS_NOHUGEPAGE: Call ``madvise()`` for the region with MADV_NOHUGEPAGE. >> + * @DAMOS_DEMOTE: Migate cold pages from fast memory type (DRAM) to slow >> + * memory type (persistent memory). > > s/Migate/Migrate/ > > Also, could you make the second line of the description aligned with the first > line? e.g., > > + * @DAMOS_DEMOTE: Migate cold pages from fast memory type (DRAM) to slow > + * memory type (persistent memory). Sure. > >> * @DAMOS_STAT: Do nothing but count the stat. >> */ >> enum damos_action { >> @@ -95,6 +97,7 @@ enum damos_action { >> DAMOS_PAGEOUT, >> DAMOS_HUGEPAGE, >> DAMOS_NOHUGEPAGE, >> + DAMOS_DEMOTE, >> DAMOS_STAT, /* Do nothing but only record the stat */ > > This would make user space people who were using 'DAMOS_STAT' be confused. > Could you put 'DAMOS_DEMOTE' after 'DAMOS_STAT'? Ah, right, will do. > >> }; >> >> diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c >> index 58dbb96..43355ab 100644 >> --- a/mm/damon/dbgfs.c >> +++ b/mm/damon/dbgfs.c >> @@ -168,6 +168,7 @@ static bool damos_action_valid(int action) >> case DAMOS_PAGEOUT: >> case DAMOS_HUGEPAGE: >> case DAMOS_NOHUGEPAGE: >> + case DAMOS_DEMOTE: >> case DAMOS_STAT: >> return true; >> default: >> diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c >> index 9e213a1..b354d3e 100644 >> --- a/mm/damon/vaddr.c >> +++ b/mm/damon/vaddr.c >> @@ -14,6 +14,10 @@ >> #include <linux/page_idle.h> >> #include <linux/pagewalk.h> >> #include <linux/sched/mm.h> >> +#include <linux/migrate.h> >> +#include <linux/mm_inline.h> >> +#include <linux/swapops.h> >> +#include "../internal.h" >> >> #include "prmtv-common.h" >> >> @@ -693,6 +697,156 @@ static unsigned long damos_madvise(struct damon_target *target, >> } >> #endif /* CONFIG_ADVISE_SYSCALLS */ >> >> +static bool damos_isolate_page(struct page *page, struct list_head *demote_list) >> +{ >> + struct page *head = compound_head(page); >> + >> + /* Do not interfere with other mappings of this page */ >> + if (page_mapcount(head) != 1) >> + return false; >> + >> + /* No need migration if the target demotion node is empty. 
*/ >> + if (next_demotion_node(page_to_nid(head)) == NUMA_NO_NODE) >> + return false; >> + >> + if (isolate_lru_page(head)) >> + return false; >> + >> + list_add_tail(&head->lru, demote_list); >> + mod_node_page_state(page_pgdat(head), >> + NR_ISOLATED_ANON + page_is_file_lru(head), >> + thp_nr_pages(head)); >> + return true; > > The return value is not used by callers. If not needed, let's remove it. Yes. > >> +} >> + >> +static int damos_migrate_pmd_entry(pmd_t *pmd, unsigned long addr, >> + unsigned long end, struct mm_walk *walk) >> +{ >> + struct vm_area_struct *vma = walk->vma; >> + struct list_head *demote_list = walk->private; >> + spinlock_t *ptl; >> + struct page *page; >> + pte_t *pte, *mapped_pte; >> + >> + if (!vma_migratable(vma)) >> + return -EFAULT; >> + >> + ptl = pmd_trans_huge_lock(pmd, vma); >> + if (ptl) { >> + /* Bail out if THP migration is not supported. */ >> + if (!thp_migration_supported()) >> + goto thp_out; >> + >> + /* If the THP pte is under migration, do not bother it. */ >> + if (unlikely(is_pmd_migration_entry(*pmd))) >> + goto thp_out; >> + >> + page = damon_get_page(pmd_pfn(*pmd)); >> + if (!page) >> + goto thp_out; >> + >> + damos_isolate_page(page, demote_list); >> + >> + put_page(page); >> +thp_out: >> + spin_unlock(ptl); >> + return 0; >> + } > > Could we wrap above THP handling code with '#ifdef CONFIG_TRANSPARENT_HUGEPAGE'? The pmd_trans_huge_lock() will return NULL if the CONFIG_TRANSPARENT_HUGEPAGE is not defined, meanwhile I think the compiler will optimize the code if the CONFIG_TRANSPARENT_HUGEPAGE is not select. Se we can use it usually instead of adding annoying "#ifdef CONFIG_XXX" in many places to handle THP, which looks simpler and more readable for me. > >> + >> + /* regular page handling */ >> + if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) >> + return -EINVAL; >> + >> + mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); >> + for (; addr != end; pte++, addr += PAGE_SIZE) { >> + if (pte_none(*pte) || !pte_present(*pte)) >> + continue; >> + >> + page = damon_get_page(pte_pfn(*pte)); >> + if (!page) >> + continue; >> + >> + damos_isolate_page(page, demote_list); >> + put_page(page); >> + } >> + pte_unmap_unlock(mapped_pte, ptl); >> + cond_resched(); >> + >> + return 0; >> +} >> + >> +static const struct mm_walk_ops damos_migrate_pages_walk_ops = { >> + .pmd_entry = damos_migrate_pmd_entry, > > The names are little bit confusing to me. How about 's/migrate/isolate/'? Sure. > >> +}; >> + >> +/* >> + * damos_demote() - demote cold pages from fast memory to slow memory >> + * @target: the given target >> + * @r: region of the target >> + * >> + * On tiered memory system, if DAMON monitored cold pages on fast memory >> + * node (DRAM), we can demote them to slow memory node proactively in case >> + * accumulating much more cold memory on fast memory node (DRAM) to reclaim. >> + * >> + * Return: >> + * = 0 means no pages were demoted. >> + * > 0 means how many cold pages were demoted successfully. >> + * < 0 means errors happened. > > damon_va_apply_scheme(), which returns what this function returns when > DAMOS_DEMOTE action is given, should return bytes of the region that the action > is successfully applied. OK, I will change the return values as you suggested in next version. 
>> + */ >> +static int damos_demote(struct damon_target *target, struct damon_region *r) >> +{ >> + struct mm_struct *mm; >> + LIST_HEAD(demote_pages); >> + LIST_HEAD(pagelist); >> + int target_nid, nr_pages, ret = 0; >> + unsigned int nr_succeeded, demoted_pages = 0; >> + struct page *page, *page2; >> + >> + /* Validate if allowing to do page demotion */ >> + if (!numa_demotion_enabled) >> + return -EINVAL; >> + >> + mm = damon_get_mm(target); >> + if (!mm) >> + return -ENOMEM; >> + >> + mmap_read_lock(mm); >> + walk_page_range(mm, PAGE_ALIGN(r->ar.start), PAGE_ALIGN(r->ar.end), > > I guess PAGE_ALIGN()s are not necessary here? Yes, I think so too, will remove it. > >> + &damos_migrate_pages_walk_ops, &demote_pages); >> + mmap_read_unlock(mm); >> + >> + mmput(mm); >> + if (list_empty(&demote_pages)) >> + return 0; >> + >> + list_for_each_entry_safe(page, page2, &demote_pages, lru) { > > I'd prefer 'next' or 'next_page' instead of 'page2'. Sure. > >> + list_add(&page->lru, &pagelist); >> + target_nid = next_demotion_node(page_to_nid(page)); >> + nr_pages = thp_nr_pages(page); >> + >> + ret = migrate_pages(&pagelist, alloc_demote_page, NULL, >> + target_nid, MIGRATE_SYNC, MR_DEMOTION, >> + &nr_succeeded); >> + if (ret) { >> + if (!list_empty(&pagelist)) { >> + list_del(&page->lru); >> + mod_node_page_state(page_pgdat(page), >> + NR_ISOLATED_ANON + page_is_file_lru(page), >> + -nr_pages); >> + putback_lru_page(page); >> + } >> + } else { >> + __count_vm_events(PGDEMOTE_DIRECT, nr_succeeded); >> + demoted_pages += nr_succeeded; >> + } > > Why should we do above work for each page on our own instead of exporting and > calling 'demote_page_list()'? Cuase demote_page_list() is used for demote cold pages which are from one same fast memory node, they can have one same target demotion node. But for the regions for the process, we can get some cold pages from different fast memory nodes (suppose we have mulptiple dram node on the system), so its target demotion node may be different. Thus we should migrate each cold pages with getting the correct target demotion node. Thanks for your comments.
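The !CONFIG_TRANSPARENT_HUGEPAGE behaviour relied on above comes from the stub helper in include/linux/huge_mm.h, paraphrased below (check your tree for the exact wording): the lock can never be taken, so the compiler drops the THP branch.

```c
/*
 * Paraphrased !CONFIG_TRANSPARENT_HUGEPAGE stub from include/linux/huge_mm.h:
 * pmd_trans_huge_lock() can never succeed, so the "if (ptl) { ... }" THP
 * block in damos_migrate_pmd_entry() is dead code and gets optimized away.
 */
static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd,
					      struct vm_area_struct *vma)
{
	return NULL;
}
```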
On Tue, 21 Dec 2021 22:18:06 +0800 Baolin Wang <baolin.wang@linux.alibaba.com> wrote: > Hi SeongJae, > > On 12/21/2021 9:24 PM, SeongJae Park Wrote: > > Hi Baolin, > > > > > > Thank you for this great patch! I left some comments below. > > > > On Tue, 21 Dec 2021 17:18:04 +0800 Baolin Wang <baolin.wang@linux.alibaba.com> wrote: > > > >> On tiered memory system, the reclaim path in shrink_page_list() already > >> support demoting pages to slow memory node instead of discarding the > >> pages. However, at that time the fast memory node memory wartermark is > >> already tense, which will increase the memory allocation latency during > >> page demotion. > >> > >> We can rely on the DAMON in user space to help to monitor the cold > >> memory on fast memory node, and demote the cold pages to slow memory > >> node proactively to keep the fast memory node in a healthy state. > >> Thus this patch introduces a new scheme named DAMOS_DEMOTE to support > >> this feature. > >> > >> Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com> > >> --- [...] > >> + */ > >> +static int damos_demote(struct damon_target *target, struct damon_region *r) > >> +{ > >> + struct mm_struct *mm; > >> + LIST_HEAD(demote_pages); > >> + LIST_HEAD(pagelist); > >> + int target_nid, nr_pages, ret = 0; > >> + unsigned int nr_succeeded, demoted_pages = 0; > >> + struct page *page, *page2; > >> + > >> + /* Validate if allowing to do page demotion */ > >> + if (!numa_demotion_enabled) > >> + return -EINVAL; > >> + > >> + mm = damon_get_mm(target); > >> + if (!mm) > >> + return -ENOMEM; > >> + > >> + mmap_read_lock(mm); > >> + walk_page_range(mm, PAGE_ALIGN(r->ar.start), PAGE_ALIGN(r->ar.end), [...] > >> + &damos_migrate_pages_walk_ops, &demote_pages); > >> + mmap_read_unlock(mm); > >> + > >> + mmput(mm); > >> + if (list_empty(&demote_pages)) > >> + return 0; > >> + > >> + list_for_each_entry_safe(page, page2, &demote_pages, lru) { > > > > I'd prefer 'next' or 'next_page' instead of 'page2'. > > Sure. > > > > >> + list_add(&page->lru, &pagelist); > >> + target_nid = next_demotion_node(page_to_nid(page)); > >> + nr_pages = thp_nr_pages(page); > >> + > >> + ret = migrate_pages(&pagelist, alloc_demote_page, NULL, > >> + target_nid, MIGRATE_SYNC, MR_DEMOTION, > >> + &nr_succeeded); > >> + if (ret) { > >> + if (!list_empty(&pagelist)) { > >> + list_del(&page->lru); > >> + mod_node_page_state(page_pgdat(page), > >> + NR_ISOLATED_ANON + page_is_file_lru(page), > >> + -nr_pages); > >> + putback_lru_page(page); > >> + } > >> + } else { > >> + __count_vm_events(PGDEMOTE_DIRECT, nr_succeeded); > >> + demoted_pages += nr_succeeded; > >> + } > > > > Why should we do above work for each page on our own instead of exporting and > > calling 'demote_page_list()'? > > Cuase demote_page_list() is used for demote cold pages which are from > one same fast memory node, they can have one same target demotion node. > > But for the regions for the process, we can get some cold pages from > different fast memory nodes (suppose we have mulptiple dram node on the > system), so its target demotion node may be different. Thus we should > migrate each cold pages with getting the correct target demotion node. Thank you for the kind explanation. But, I still want to reuse the code rather than copying if possible. How about below dumb ideas off the top of my head? 1. Call demote_page_list() for each page from here 2. Call demote_page_list() for each page from damos_migrate_pmd_entry() 3. 
Make damos_migrate_pages_walk_ops to configure multiple demote_pages lists, each per fast memory node, and calling demote_page_list() here for each per-fast-memory-node demote_pages list. 4. Make demote_page_list() handles this case and use it. e.g., NULL pgdat parameter means the pages are not from same node. Please let me know if I'm missing something. Thanks, SJ > > Thanks for your comments.
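A minimal sketch of option 3 above, i.e. one isolation list per possible source node so that each list can later be handed to demote_page_list() (or migrate_pages()) with a single demotion target. The structure and helpers below are hypothetical and only illustrate the shape of the idea.

```c
/* Hypothetical illustration of option 3, not code from this thread. */
struct damos_demote_lists {
	struct list_head pages[MAX_NUMNODES];	/* indexed by source nid */
};

static void damos_demote_lists_init(struct damos_demote_lists *dl)
{
	int nid;

	for (nid = 0; nid < MAX_NUMNODES; nid++)
		INIT_LIST_HEAD(&dl->pages[nid]);
}

static void damos_demote_lists_add(struct damos_demote_lists *dl,
				   struct page *page)
{
	/* every page in dl->pages[nid] shares next_demotion_node(nid) */
	list_add_tail(&page->lru, &dl->pages[page_to_nid(page)]);
}
```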
On 12/22/2021 4:43 PM, SeongJae Park wrote: > On Tue, 21 Dec 2021 22:18:06 +0800 Baolin Wang <baolin.wang@linux.alibaba.com> wrote: > >> Hi SeongJae, >> >> On 12/21/2021 9:24 PM, SeongJae Park Wrote: >>> Hi Baolin, >>> >>> >>> Thank you for this great patch! I left some comments below. >>> >>> On Tue, 21 Dec 2021 17:18:04 +0800 Baolin Wang <baolin.wang@linux.alibaba.com> wrote: >>> >>>> On tiered memory system, the reclaim path in shrink_page_list() already >>>> support demoting pages to slow memory node instead of discarding the >>>> pages. However, at that time the fast memory node memory wartermark is >>>> already tense, which will increase the memory allocation latency during >>>> page demotion. >>>> >>>> We can rely on the DAMON in user space to help to monitor the cold >>>> memory on fast memory node, and demote the cold pages to slow memory >>>> node proactively to keep the fast memory node in a healthy state. >>>> Thus this patch introduces a new scheme named DAMOS_DEMOTE to support >>>> this feature. >>>> >>>> Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com> >>>> --- [snip] >> >>> >>>> + list_add(&page->lru, &pagelist); >>>> + target_nid = next_demotion_node(page_to_nid(page)); >>>> + nr_pages = thp_nr_pages(page); >>>> + >>>> + ret = migrate_pages(&pagelist, alloc_demote_page, NULL, >>>> + target_nid, MIGRATE_SYNC, MR_DEMOTION, >>>> + &nr_succeeded); >>>> + if (ret) { >>>> + if (!list_empty(&pagelist)) { >>>> + list_del(&page->lru); >>>> + mod_node_page_state(page_pgdat(page), >>>> + NR_ISOLATED_ANON + page_is_file_lru(page), >>>> + -nr_pages); >>>> + putback_lru_page(page); >>>> + } >>>> + } else { >>>> + __count_vm_events(PGDEMOTE_DIRECT, nr_succeeded); >>>> + demoted_pages += nr_succeeded; >>>> + } >>> >>> Why should we do above work for each page on our own instead of exporting and >>> calling 'demote_page_list()'? >> >> Cuase demote_page_list() is used for demote cold pages which are from >> one same fast memory node, they can have one same target demotion node. >> >> But for the regions for the process, we can get some cold pages from >> different fast memory nodes (suppose we have mulptiple dram node on the >> system), so its target demotion node may be different. Thus we should >> migrate each cold pages with getting the correct target demotion node. > > Thank you for the kind explanation. But, I still want to reuse the code rather > than copying if possible. How about below dumb ideas off the top of my head? > > 1. Call demote_page_list() for each page from here Sounds reasonable. > 2. Call demote_page_list() for each page from damos_migrate_pmd_entry() We are under mmap lock in damos_migrate_pmd_entry(), it is not suitable to do page migration. > 3. Make damos_migrate_pages_walk_ops to configure multiple demote_pages lists, > each per fast memory node, and calling demote_page_list() here for each > per-fast-memory-node demote_pages list. Which will bring more complexity I think, and we should avoid doing migration under mmap lock. > 4. Make demote_page_list() handles this case and use it. e.g., NULL pgdat > parameter means the pages are not from same node. Thanks for your suggestion, actually after more thinking, I think we can reuse the demote_page_list() and it can be easy to change. Somethink like below on top of my current patch, how do you think? Thanks. 
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -796,7 +796,7 @@ static unsigned long damos_demote(struct damon_target *target,
 	struct mm_struct *mm;
 	LIST_HEAD(demote_pages);
 	LIST_HEAD(pagelist);
-	int target_nid, nr_pages, ret = 0;
+	int nr_pages;
 	unsigned int nr_succeeded, demoted_pages = 0;
 	struct page *page, *next;
@@ -818,14 +818,11 @@ static unsigned long damos_demote(struct damon_target *target,
 		return 0;
 
 	list_for_each_entry_safe(page, next, &demote_pages, lru) {
-		list_add(&page->lru, &pagelist);
-		target_nid = next_demotion_node(page_to_nid(page));
 		nr_pages = thp_nr_pages(page);
+		list_add(&page->lru, &pagelist);
 
-		ret = migrate_pages(&pagelist, alloc_demote_page, NULL,
-				target_nid, MIGRATE_SYNC, MR_DEMOTION,
-				&nr_succeeded);
-		if (ret) {
+		nr_succeeded = demote_page_list(&pagelist, page_pgdat(page));
+		if (!nr_succeeded) {
 			if (!list_empty(&pagelist)) {
 				list_del(&page->lru);
 				mod_node_page_state(page_pgdat(page),
@@ -834,11 +831,9 @@ static unsigned long damos_demote(struct damon_target *target,
 				putback_lru_page(page);
 			}
 		} else {
-			__count_vm_events(PGDEMOTE_DIRECT, nr_succeeded);
 			demoted_pages += nr_succeeded;
 		}
 
-		INIT_LIST_HEAD(&pagelist);
 		cond_resched();
 	}
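One assumed prerequisite for the sketch above: demote_page_list() is a static helper in mm/vmscan.c in this series' base, so reusing it from mm/damon/vaddr.c would also require exporting it to the rest of mm/, roughly as below. The signature is taken from the v5.16-era demotion code and should be double-checked against the tree.

```c
/* e.g. in mm/internal.h; assumed declaration, verify against mm/vmscan.c */
unsigned int demote_page_list(struct list_head *demote_pages,
			      struct pglist_data *pgdat);
```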
Hi Baolin, I love your patch! Yet something to improve: [auto build test ERROR on next-20211220] [cannot apply to hnaz-mm/master linux/master linus/master v5.16-rc6 v5.16-rc5 v5.16-rc4 v5.16-rc6] [If your patch is applied to the wrong git tree, kindly drop us a note. And when submitting patch, we suggest to use '--base' as documented in https://git-scm.com/docs/git-format-patch] url: https://github.com/0day-ci/linux/commits/Baolin-Wang/Add-a-new-scheme-to-support-demotion-on-tiered-memory-system/20211221-172017 base: 07f8c60fe60f84977dc815ec8a6b1100827c34dd config: arc-randconfig-r043-20211222 (https://download.01.org/0day-ci/archive/20211222/202112221923.4m9tlPoy-lkp@intel.com/config) compiler: arc-elf-gcc (GCC) 11.2.0 reproduce (this is a W=1 build): wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross chmod +x ~/bin/make.cross # https://github.com/0day-ci/linux/commit/ed1c1ea9c5b5ea81916c10e50c4a1613bce4b5a9 git remote add linux-review https://github.com/0day-ci/linux git fetch --no-tags linux-review Baolin-Wang/Add-a-new-scheme-to-support-demotion-on-tiered-memory-system/20211221-172017 git checkout ed1c1ea9c5b5ea81916c10e50c4a1613bce4b5a9 # save the config file to linux build tree mkdir build_dir COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.2.0 make.cross O=build_dir ARCH=arc SHELL=/bin/bash If you fix the issue, kindly add following tag as appropriate Reported-by: kernel test robot <lkp@intel.com> All errors (new ones prefixed by >>): mm/damon/vaddr.c: In function 'damos_migrate_pmd_entry': >> mm/damon/vaddr.c:635:14: error: implicit declaration of function 'vma_migratable'; did you mean 'HPageMigratable'? [-Werror=implicit-function-declaration] 635 | if (!vma_migratable(vma)) | ^~~~~~~~~~~~~~ | HPageMigratable >> mm/damon/vaddr.c:648:39: error: implicit declaration of function 'pmd_pfn'; did you mean 'pmd_off'? [-Werror=implicit-function-declaration] 648 | page = damon_get_page(pmd_pfn(*pmd)); | ^~~~~~~ | pmd_off cc1: some warnings being treated as errors vim +635 mm/damon/vaddr.c 625 626 static int damos_migrate_pmd_entry(pmd_t *pmd, unsigned long addr, 627 unsigned long end, struct mm_walk *walk) 628 { 629 struct vm_area_struct *vma = walk->vma; 630 struct list_head *demote_list = walk->private; 631 spinlock_t *ptl; 632 struct page *page; 633 pte_t *pte, *mapped_pte; 634 > 635 if (!vma_migratable(vma)) 636 return -EFAULT; 637 638 ptl = pmd_trans_huge_lock(pmd, vma); 639 if (ptl) { 640 /* Bail out if THP migration is not supported. */ 641 if (!thp_migration_supported()) 642 goto thp_out; 643 644 /* If the THP pte is under migration, do not bother it. 
*/ 645 if (unlikely(is_pmd_migration_entry(*pmd))) 646 goto thp_out; 647 > 648 page = damon_get_page(pmd_pfn(*pmd)); 649 if (!page) 650 goto thp_out; 651 652 damos_isolate_page(page, demote_list); 653 654 put_page(page); 655 thp_out: 656 spin_unlock(ptl); 657 return 0; 658 } 659 660 /* regular page handling */ 661 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) 662 return -EINVAL; 663 664 mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); 665 for (; addr != end; pte++, addr += PAGE_SIZE) { 666 if (pte_none(*pte) || !pte_present(*pte)) 667 continue; 668 669 page = damon_get_page(pte_pfn(*pte)); 670 if (!page) 671 continue; 672 673 damos_isolate_page(page, demote_list); 674 put_page(page); 675 } 676 pte_unmap_unlock(mapped_pte, ptl); 677 cond_resched(); 678 679 return 0; 680 } 681 --- 0-DAY CI Kernel Test Service, Intel Corporation https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
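For the robot report above: vma_migratable() is declared in linux/mempolicy.h, so the first error is most likely just a missing include in mm/damon/vaddr.c, while the pmd_pfn() error on arc suggests the THP path also depends on an arch helper this randconfig does not provide. The snippet below is therefore only an assumed partial fix, not a verified one.

```c
/* mm/damon/vaddr.c: assumed fix for the implicit-declaration report above */
#include <linux/mempolicy.h>	/* for vma_migratable() */
```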
On Wed, 22 Dec 2021 17:15:18 +0800 Baolin Wang <baolin.wang@linux.alibaba.com> wrote: > > > On 12/22/2021 4:43 PM, SeongJae Park wrote: > > On Tue, 21 Dec 2021 22:18:06 +0800 Baolin Wang <baolin.wang@linux.alibaba.com> wrote: > > > >> Hi SeongJae, > >> > >> On 12/21/2021 9:24 PM, SeongJae Park Wrote: > >>> Hi Baolin, > >>> > >>> > >>> Thank you for this great patch! I left some comments below. > >>> > >>> On Tue, 21 Dec 2021 17:18:04 +0800 Baolin Wang <baolin.wang@linux.alibaba.com> wrote: > >>> > >>>> On tiered memory system, the reclaim path in shrink_page_list() already > >>>> support demoting pages to slow memory node instead of discarding the > >>>> pages. However, at that time the fast memory node memory wartermark is > >>>> already tense, which will increase the memory allocation latency during > >>>> page demotion. > >>>> > >>>> We can rely on the DAMON in user space to help to monitor the cold > >>>> memory on fast memory node, and demote the cold pages to slow memory > >>>> node proactively to keep the fast memory node in a healthy state. > >>>> Thus this patch introduces a new scheme named DAMOS_DEMOTE to support > >>>> this feature. > >>>> > >>>> Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com> > >>>> --- > > [snip] > > >> > >>> > >>>> + list_add(&page->lru, &pagelist); > >>>> + target_nid = next_demotion_node(page_to_nid(page)); > >>>> + nr_pages = thp_nr_pages(page); > >>>> + > >>>> + ret = migrate_pages(&pagelist, alloc_demote_page, NULL, > >>>> + target_nid, MIGRATE_SYNC, MR_DEMOTION, > >>>> + &nr_succeeded); > >>>> + if (ret) { > >>>> + if (!list_empty(&pagelist)) { > >>>> + list_del(&page->lru); > >>>> + mod_node_page_state(page_pgdat(page), > >>>> + NR_ISOLATED_ANON + page_is_file_lru(page), > >>>> + -nr_pages); > >>>> + putback_lru_page(page); > >>>> + } > >>>> + } else { > >>>> + __count_vm_events(PGDEMOTE_DIRECT, nr_succeeded); > >>>> + demoted_pages += nr_succeeded; > >>>> + } > >>> > >>> Why should we do above work for each page on our own instead of exporting and > >>> calling 'demote_page_list()'? > >> > >> Cuase demote_page_list() is used for demote cold pages which are from > >> one same fast memory node, they can have one same target demotion node. > >> > >> But for the regions for the process, we can get some cold pages from > >> different fast memory nodes (suppose we have mulptiple dram node on the > >> system), so its target demotion node may be different. Thus we should > >> migrate each cold pages with getting the correct target demotion node. > > > > Thank you for the kind explanation. But, I still want to reuse the code rather > > than copying if possible. How about below dumb ideas off the top of my head? > > > > 1. Call demote_page_list() for each page from here > > Sounds reasonable. > > > 2. Call demote_page_list() for each page from damos_migrate_pmd_entry() > > We are under mmap lock in damos_migrate_pmd_entry(), it is not suitable > to do page migration. > > > 3. Make damos_migrate_pages_walk_ops to configure multiple demote_pages lists, > > each per fast memory node, and calling demote_page_list() here for each > > per-fast-memory-node demote_pages list. > > Which will bring more complexity I think, and we should avoid doing > migration under mmap lock. > > > 4. Make demote_page_list() handles this case and use it. e.g., NULL pgdat > > parameter means the pages are not from same node. > > Thanks for your suggestion, actually after more thinking, I think we can > reuse the demote_page_list() and it can be easy to change. 
Somethink > like below on top of my current patch, how do you think? Thanks. So, you selected the option 1. I personally think option 3 or 4 would be more optimal, but also agree that it could increase the complexity. As we already have the time/space quota feature for schemes overhead control, I think starting with this simple approach makes sense to me. Thanks, SJ [...]
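For readers who have not used the quota feature referenced above: a DAMOS scheme can be bounded in how much time or memory it is allowed to consume per interval, which is how the overhead of a simple per-page demotion loop could be kept in check. A rough sketch follows, with field names assumed from the v5.16 struct damos_quota.

```c
/*
 * Rough sketch only; field names assumed from the v5.16 DAMOS quota
 * support.  It would cap how much a DEMOTE scheme may process per interval.
 */
struct damos_quota quota = {
	.ms = 10,			/* at most 10ms of work ... */
	.sz = 256 * 1024 * 1024,	/* ... or 256 MiB acted on ... */
	.reset_interval = 1000,		/* ... per 1000ms interval */
};
```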
diff --git a/include/linux/damon.h b/include/linux/damon.h index af64838..da9957c 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -87,6 +87,8 @@ struct damon_target { * @DAMOS_PAGEOUT: Call ``madvise()`` for the region with MADV_PAGEOUT. * @DAMOS_HUGEPAGE: Call ``madvise()`` for the region with MADV_HUGEPAGE. * @DAMOS_NOHUGEPAGE: Call ``madvise()`` for the region with MADV_NOHUGEPAGE. + * @DAMOS_DEMOTE: Migate cold pages from fast memory type (DRAM) to slow + * memory type (persistent memory). * @DAMOS_STAT: Do nothing but count the stat. */ enum damos_action { @@ -95,6 +97,7 @@ enum damos_action { DAMOS_PAGEOUT, DAMOS_HUGEPAGE, DAMOS_NOHUGEPAGE, + DAMOS_DEMOTE, DAMOS_STAT, /* Do nothing but only record the stat */ }; diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 58dbb96..43355ab 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -168,6 +168,7 @@ static bool damos_action_valid(int action) case DAMOS_PAGEOUT: case DAMOS_HUGEPAGE: case DAMOS_NOHUGEPAGE: + case DAMOS_DEMOTE: case DAMOS_STAT: return true; default: diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 9e213a1..b354d3e 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -14,6 +14,10 @@ #include <linux/page_idle.h> #include <linux/pagewalk.h> #include <linux/sched/mm.h> +#include <linux/migrate.h> +#include <linux/mm_inline.h> +#include <linux/swapops.h> +#include "../internal.h" #include "prmtv-common.h" @@ -693,6 +697,156 @@ static unsigned long damos_madvise(struct damon_target *target, } #endif /* CONFIG_ADVISE_SYSCALLS */ +static bool damos_isolate_page(struct page *page, struct list_head *demote_list) +{ + struct page *head = compound_head(page); + + /* Do not interfere with other mappings of this page */ + if (page_mapcount(head) != 1) + return false; + + /* No need migration if the target demotion node is empty. */ + if (next_demotion_node(page_to_nid(head)) == NUMA_NO_NODE) + return false; + + if (isolate_lru_page(head)) + return false; + + list_add_tail(&head->lru, demote_list); + mod_node_page_state(page_pgdat(head), + NR_ISOLATED_ANON + page_is_file_lru(head), + thp_nr_pages(head)); + return true; +} + +static int damos_migrate_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + struct vm_area_struct *vma = walk->vma; + struct list_head *demote_list = walk->private; + spinlock_t *ptl; + struct page *page; + pte_t *pte, *mapped_pte; + + if (!vma_migratable(vma)) + return -EFAULT; + + ptl = pmd_trans_huge_lock(pmd, vma); + if (ptl) { + /* Bail out if THP migration is not supported. */ + if (!thp_migration_supported()) + goto thp_out; + + /* If the THP pte is under migration, do not bother it. 
*/ + if (unlikely(is_pmd_migration_entry(*pmd))) + goto thp_out; + + page = damon_get_page(pmd_pfn(*pmd)); + if (!page) + goto thp_out; + + damos_isolate_page(page, demote_list); + + put_page(page); +thp_out: + spin_unlock(ptl); + return 0; + } + + /* regular page handling */ + if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) + return -EINVAL; + + mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + for (; addr != end; pte++, addr += PAGE_SIZE) { + if (pte_none(*pte) || !pte_present(*pte)) + continue; + + page = damon_get_page(pte_pfn(*pte)); + if (!page) + continue; + + damos_isolate_page(page, demote_list); + put_page(page); + } + pte_unmap_unlock(mapped_pte, ptl); + cond_resched(); + + return 0; +} + +static const struct mm_walk_ops damos_migrate_pages_walk_ops = { + .pmd_entry = damos_migrate_pmd_entry, +}; + +/* + * damos_demote() - demote cold pages from fast memory to slow memory + * @target: the given target + * @r: region of the target + * + * On tiered memory system, if DAMON monitored cold pages on fast memory + * node (DRAM), we can demote them to slow memory node proactively in case + * accumulating much more cold memory on fast memory node (DRAM) to reclaim. + * + * Return: + * = 0 means no pages were demoted. + * > 0 means how many cold pages were demoted successfully. + * < 0 means errors happened. + */ +static int damos_demote(struct damon_target *target, struct damon_region *r) +{ + struct mm_struct *mm; + LIST_HEAD(demote_pages); + LIST_HEAD(pagelist); + int target_nid, nr_pages, ret = 0; + unsigned int nr_succeeded, demoted_pages = 0; + struct page *page, *page2; + + /* Validate if allowing to do page demotion */ + if (!numa_demotion_enabled) + return -EINVAL; + + mm = damon_get_mm(target); + if (!mm) + return -ENOMEM; + + mmap_read_lock(mm); + walk_page_range(mm, PAGE_ALIGN(r->ar.start), PAGE_ALIGN(r->ar.end), + &damos_migrate_pages_walk_ops, &demote_pages); + mmap_read_unlock(mm); + + mmput(mm); + if (list_empty(&demote_pages)) + return 0; + + list_for_each_entry_safe(page, page2, &demote_pages, lru) { + list_add(&page->lru, &pagelist); + target_nid = next_demotion_node(page_to_nid(page)); + nr_pages = thp_nr_pages(page); + + ret = migrate_pages(&pagelist, alloc_demote_page, NULL, + target_nid, MIGRATE_SYNC, MR_DEMOTION, + &nr_succeeded); + if (ret) { + if (!list_empty(&pagelist)) { + list_del(&page->lru); + mod_node_page_state(page_pgdat(page), + NR_ISOLATED_ANON + page_is_file_lru(page), + -nr_pages); + putback_lru_page(page); + } + } else { + __count_vm_events(PGDEMOTE_DIRECT, nr_succeeded); + demoted_pages += nr_succeeded; + } + + INIT_LIST_HEAD(&pagelist); + cond_resched(); + } + + return demoted_pages; +} + static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, struct damon_region *r, struct damos *scheme) @@ -715,6 +869,8 @@ static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx, case DAMOS_NOHUGEPAGE: madv_action = MADV_NOHUGEPAGE; break; + case DAMOS_DEMOTE: + return damos_demote(t, r); case DAMOS_STAT: return 0; default:
On tiered memory systems, the reclaim path in shrink_page_list() already supports demoting pages to a slow memory node instead of discarding them. However, by that point the fast memory node's watermark is already under pressure, which increases memory allocation latency while pages are being demoted.

We can instead rely on DAMON in user space to monitor cold memory on the fast memory node and proactively demote the cold pages to the slow memory node, keeping the fast memory node in a healthy state. Thus this patch introduces a new scheme named DAMOS_DEMOTE to support this feature.

Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
---
 include/linux/damon.h |   3 +
 mm/damon/dbgfs.c      |   1 +
 mm/damon/vaddr.c      | 156 ++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 160 insertions(+)
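For reference, folding in the review feedback above (DEMOTE placed after DAMOS_STAT, comment typo and alignment fixed), the enum addition in a next revision would presumably look like this; it is an assumption about v2, not a posted patch.

```c
enum damos_action {
	DAMOS_WILLNEED,
	DAMOS_COLD,
	DAMOS_PAGEOUT,
	DAMOS_HUGEPAGE,
	DAMOS_NOHUGEPAGE,
	DAMOS_STAT,	/* Do nothing but only record the stat */
	DAMOS_DEMOTE,	/* Migrate cold pages from fast (DRAM) to slow memory */
};
```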