Message ID: 20200619215649.32297-14-rcampbell@nvidia.com (mailing list archive)
State: New
Series: mm/hmm/nouveau: THP mapping and migration
On 19 Jun 2020, at 17:56, Ralph Campbell wrote: > Support transparent huge page migration to ZONE_DEVICE private memory. > A new flag (MIGRATE_PFN_COMPOUND) is added to the input PFN array to > indicate the huge page was fully mapped by the CPU. > Export prep_compound_page() so that device drivers can create huge > device private pages after calling memremap_pages(). > > Signed-off-by: Ralph Campbell <rcampbell@nvidia.com> > --- > include/linux/migrate.h | 1 + > include/linux/mm.h | 1 + > mm/huge_memory.c | 30 ++++-- > mm/internal.h | 1 - > mm/memory.c | 10 +- > mm/memremap.c | 9 +- > mm/migrate.c | 226 ++++++++++++++++++++++++++++++++-------- > mm/page_alloc.c | 1 + > 8 files changed, 226 insertions(+), 53 deletions(-) > > diff --git a/include/linux/migrate.h b/include/linux/migrate.h > index 3e546cbf03dd..f6a64965c8bd 100644 > --- a/include/linux/migrate.h > +++ b/include/linux/migrate.h > @@ -166,6 +166,7 @@ static inline int migrate_misplaced_transhuge_page(struct mm_struct *mm, > #define MIGRATE_PFN_MIGRATE (1UL << 1) > #define MIGRATE_PFN_LOCKED (1UL << 2) > #define MIGRATE_PFN_WRITE (1UL << 3) > +#define MIGRATE_PFN_COMPOUND (1UL << 4) > #define MIGRATE_PFN_SHIFT 6 > > static inline struct page *migrate_pfn_to_page(unsigned long mpfn) > diff --git a/include/linux/mm.h b/include/linux/mm.h > index dc7b87310c10..020b9dd3cddb 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -932,6 +932,7 @@ static inline unsigned int page_shift(struct page *page) > } > > void free_compound_page(struct page *page); > +void prep_compound_page(struct page *page, unsigned int order); > > #ifdef CONFIG_MMU > /* > diff --git a/mm/huge_memory.c b/mm/huge_memory.c > index 78c84bee7e29..25d95f7b1e98 100644 > --- a/mm/huge_memory.c > +++ b/mm/huge_memory.c > @@ -1663,23 +1663,35 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, > } else { > struct page *page = NULL; > int flush_needed = 1; > + bool is_anon = false; > > if (pmd_present(orig_pmd)) { > page = pmd_page(orig_pmd); > + is_anon = PageAnon(page); > page_remove_rmap(page, true); > VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); > VM_BUG_ON_PAGE(!PageHead(page), page); > } else if (thp_migration_supported()) { > swp_entry_t entry; > > - VM_BUG_ON(!is_pmd_migration_entry(orig_pmd)); > entry = pmd_to_swp_entry(orig_pmd); > - page = pfn_to_page(swp_offset(entry)); > + if (is_device_private_entry(entry)) { > + page = device_private_entry_to_page(entry); > + is_anon = PageAnon(page); > + page_remove_rmap(page, true); > + VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); > + VM_BUG_ON_PAGE(!PageHead(page), page); > + put_page(page); Why do you hide this code behind thp_migration_supported()? It seems that you just need pmd swap entry not pmd migration entry. Also the condition is not consistent with the code in __handle_mm_fault(), in which you handle is_device_private_entry() directly without checking thp_migration_support(). Do we need to support split_huge_pmd() if a page is migrated to device? Any new code needed in split_huge_pmd()? 
> + } else { > + VM_BUG_ON(!is_pmd_migration_entry(orig_pmd)); > + page = pfn_to_page(swp_offset(entry)); > + is_anon = PageAnon(page); > + } > flush_needed = 0; > } else > WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); > > - if (PageAnon(page)) { > + if (is_anon) { > zap_deposited_table(tlb->mm, pmd); > add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); > } else { > @@ -1778,8 +1790,8 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, > /* > * Returns > * - 0 if PMD could not be locked > - * - 1 if PMD was locked but protections unchange and TLB flush unnecessary > - * - HPAGE_PMD_NR is protections changed and TLB flush necessary > + * - 1 if PMD was locked but protections unchanged and TLB flush unnecessary > + * - HPAGE_PMD_NR if protections changed and TLB flush necessary > */ > int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, > unsigned long addr, pgprot_t newprot, unsigned long cp_flags) > @@ -2689,7 +2701,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) > if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) { > if (!list_empty(page_deferred_list(head))) { > ds_queue->split_queue_len--; > - list_del(page_deferred_list(head)); > + list_del_init(page_deferred_list(head)); > } > spin_unlock(&ds_queue->split_queue_lock); > if (mapping) { > @@ -2743,7 +2755,7 @@ void free_transhuge_page(struct page *page) > spin_lock_irqsave(&ds_queue->split_queue_lock, flags); > if (!list_empty(page_deferred_list(page))) { > ds_queue->split_queue_len--; > - list_del(page_deferred_list(page)); > + list_del_init(page_deferred_list(page)); > } > spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); > free_compound_page(page); > @@ -2963,6 +2975,10 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) > pmde = pmd_mksoft_dirty(pmde); > if (is_write_migration_entry(entry)) > pmde = maybe_pmd_mkwrite(pmde, vma); > + if (unlikely(is_device_private_page(new))) { > + entry = make_device_private_entry(new, pmd_write(pmde)); > + pmde = swp_entry_to_pmd(entry); > + } > > flush_cache_range(vma, mmun_start, mmun_start + HPAGE_PMD_SIZE); > if (PageAnon(new)) > diff --git a/mm/internal.h b/mm/internal.h > index 9886db20d94f..58f051a14dae 100644 > --- a/mm/internal.h > +++ b/mm/internal.h > @@ -196,7 +196,6 @@ extern void __putback_isolated_page(struct page *page, unsigned int order, > extern void memblock_free_pages(struct page *page, unsigned long pfn, > unsigned int order); > extern void __free_pages_core(struct page *page, unsigned int order); > -extern void prep_compound_page(struct page *page, unsigned int order); > extern void post_alloc_hook(struct page *page, unsigned int order, > gfp_t gfp_flags); > extern int user_min_free_kbytes; > diff --git a/mm/memory.c b/mm/memory.c > index dc7f3543b1fd..4e03d1a2ead5 100644 > --- a/mm/memory.c > +++ b/mm/memory.c > @@ -4322,9 +4322,15 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, > > barrier(); > if (unlikely(is_swap_pmd(orig_pmd))) { > + swp_entry_t entry = pmd_to_swp_entry(orig_pmd); > + > + if (is_device_private_entry(entry)) { > + vmf.page = device_private_entry_to_page(entry); > + return vmf.page->pgmap->ops->migrate_to_ram(&vmf); > + } > VM_BUG_ON(thp_migration_supported() && > - !is_pmd_migration_entry(orig_pmd)); > - if (is_pmd_migration_entry(orig_pmd)) > + !is_migration_entry(entry)); > + if (is_migration_entry(entry)) > pmd_migration_entry_wait(mm, vmf.pmd); > return 0; > } > diff --git a/mm/memremap.c b/mm/memremap.c > 
index 03e38b7a38f1..4231054188b4 100644 > --- a/mm/memremap.c > +++ b/mm/memremap.c > @@ -132,8 +132,13 @@ void memunmap_pages(struct dev_pagemap *pgmap) > int nid; > > dev_pagemap_kill(pgmap); > - for_each_device_pfn(pfn, pgmap) > - put_page(pfn_to_page(pfn)); > + for_each_device_pfn(pfn, pgmap) { > + struct page *page = pfn_to_page(pfn); > + unsigned int order = compound_order(page); > + > + put_page(page); > + pfn += (1U << order) - 1; > + } > dev_pagemap_cleanup(pgmap); > > /* make sure to access a memmap that was actually initialized */ > diff --git a/mm/migrate.c b/mm/migrate.c > index 87c52e0ee580..1536677cefc9 100644 > --- a/mm/migrate.c > +++ b/mm/migrate.c > @@ -51,6 +51,7 @@ > #include <linux/oom.h> > > #include <asm/tlbflush.h> > +#include <asm/pgalloc.h> > > #define CREATE_TRACE_POINTS > #include <trace/events/migrate.h> > @@ -2185,6 +2186,8 @@ static int migrate_vma_collect_hole(unsigned long start, > > for (addr = start; addr < end; addr += PAGE_SIZE) { > migrate->src[migrate->npages] = flags; > + if ((addr & ~PMD_MASK) == 0 && (end & ~PMD_MASK) == 0) > + migrate->src[migrate->npages] |= MIGRATE_PFN_COMPOUND; > migrate->dst[migrate->npages] = 0; > migrate->npages++; > migrate->cpages++; > @@ -2219,48 +2222,87 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, > unsigned long addr = start, unmapped = 0; > spinlock_t *ptl; > pte_t *ptep; > + pmd_t pmd; > > again: > - if (pmd_none(*pmdp)) > + pmd = READ_ONCE(*pmdp); > + if (pmd_none(pmd)) > return migrate_vma_collect_hole(start, end, -1, walk); > > - if (pmd_trans_huge(*pmdp)) { > + if (pmd_trans_huge(pmd) || !pmd_present(pmd)) { > struct page *page; > + unsigned long write = 0; > + int ret; > > ptl = pmd_lock(mm, pmdp); > - if (unlikely(!pmd_trans_huge(*pmdp))) { > - spin_unlock(ptl); > - goto again; > - } > + if (pmd_trans_huge(*pmdp)) { > + page = pmd_page(*pmdp); > + if (is_huge_zero_page(page)) { > + spin_unlock(ptl); > + return migrate_vma_collect_hole(start, end, -1, > + walk); > + } > + if (pmd_write(*pmdp)) > + write = MIGRATE_PFN_WRITE; > + } else if (!pmd_present(*pmdp)) { > + swp_entry_t entry = pmd_to_swp_entry(*pmdp); > > - page = pmd_page(*pmdp); > - if (is_huge_zero_page(page)) { > - spin_unlock(ptl); > - split_huge_pmd(vma, pmdp, addr); > - if (pmd_trans_unstable(pmdp)) > + if (!is_device_private_entry(entry)) { > + spin_unlock(ptl); > return migrate_vma_collect_skip(start, end, > walk); > + } > + page = device_private_entry_to_page(entry); > + if (is_write_device_private_entry(entry)) > + write = MIGRATE_PFN_WRITE; > } else { > - int ret; > + spin_unlock(ptl); > + goto again; > + } > > - get_page(page); > + get_page(page); > + if (unlikely(!trylock_page(page))) { > spin_unlock(ptl); > - if (unlikely(!trylock_page(page))) > - return migrate_vma_collect_skip(start, end, > - walk); > - ret = split_huge_page(page); > - unlock_page(page); > put_page(page); > - if (ret) > - return migrate_vma_collect_skip(start, end, > - walk); > - if (pmd_none(*pmdp)) > - return migrate_vma_collect_hole(start, end, -1, > - walk); > + return migrate_vma_collect_skip(start, end, walk); > + } > +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION > + if ((start & HPAGE_PMD_MASK) == start && > + start + HPAGE_PMD_SIZE == end) { > + struct page_vma_mapped_walk vmw = { > + .vma = vma, > + .address = start, > + .pmd = pmdp, > + .ptl = ptl, > + }; > + > + migrate->src[migrate->npages] = write | > + migrate_pfn(page_to_pfn(page)) | > + MIGRATE_PFN_MIGRATE | MIGRATE_PFN_LOCKED | > + MIGRATE_PFN_COMPOUND; > + migrate->dst[migrate->npages] = 0; > + 
migrate->npages++; > + migrate->cpages++; > + migrate_vma_collect_skip(start + PAGE_SIZE, end, walk); > + > + /* Note that this also removes the page from the rmap. */ > + set_pmd_migration_entry(&vmw, page); > + spin_unlock(ptl); > + > + return 0; > } > +#endif > + spin_unlock(ptl); > + ret = split_huge_page(page); > + unlock_page(page); > + put_page(page); > + if (ret) > + return migrate_vma_collect_skip(start, end, walk); > + if (pmd_none(*pmdp)) > + return migrate_vma_collect_hole(start, end, -1, walk); > } > > - if (unlikely(pmd_bad(*pmdp))) > + if (unlikely(pmd_bad(pmd) || pmd_devmap(pmd))) > return migrate_vma_collect_skip(start, end, walk); > > ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); > @@ -2310,8 +2352,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, > mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0; > } > > - /* FIXME support THP */ > - if (!page || !page->mapping || PageTransCompound(page)) { > + if (!page || !page->mapping) { > mpfn = 0; > goto next; > } > @@ -2420,14 +2461,6 @@ static bool migrate_vma_check_page(struct page *page) > */ > int extra = 1; > > - /* > - * FIXME support THP (transparent huge page), it is bit more complex to > - * check them than regular pages, because they can be mapped with a pmd > - * or with a pte (split pte mapping). > - */ > - if (PageCompound(page)) > - return false; > - > /* Page from ZONE_DEVICE have one extra reference */ > if (is_zone_device_page(page)) { > /* > @@ -2726,13 +2759,115 @@ int migrate_vma_setup(struct migrate_vma *args) > } > EXPORT_SYMBOL(migrate_vma_setup); > > +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION > +/* > + * This code closely follows: > + * do_huge_pmd_anonymous_page() > + * __do_huge_pmd_anonymous_page() > + * except that the page being inserted is likely to be a device private page > + * instead of an allocated or zero page. > + */ > +static int insert_huge_pmd_anonymous_page(struct vm_area_struct *vma, > + unsigned long haddr, > + struct page *page, > + unsigned long *src, > + unsigned long *dst, > + pmd_t *pmdp) > +{ > + struct mm_struct *mm = vma->vm_mm; > + unsigned int i; > + spinlock_t *ptl; > + bool flush = false; > + pgtable_t pgtable; > + gfp_t gfp; > + pmd_t entry; > + > + if (WARN_ON_ONCE(compound_order(page) != HPAGE_PMD_ORDER)) > + goto abort; > + > + if (unlikely(anon_vma_prepare(vma))) > + goto abort; > + > + prep_transhuge_page(page); > + > + gfp = GFP_TRANSHUGE_LIGHT; > + if (mem_cgroup_charge(page, mm, gfp)) > + goto abort; > + > + pgtable = pte_alloc_one(mm); > + if (unlikely(!pgtable)) > + goto abort; > + > + __SetPageUptodate(page); > + > + if (is_zone_device_page(page)) { > + if (!is_device_private_page(page)) > + goto pgtable_abort; > + entry = swp_entry_to_pmd(make_device_private_entry(page, > + vma->vm_flags & VM_WRITE)); > + } else { > + entry = mk_huge_pmd(page, vma->vm_page_prot); > + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); > + } > + > + ptl = pmd_lock(mm, pmdp); > + > + if (check_stable_address_space(mm)) > + goto unlock_abort; > + > + /* > + * Check for userfaultfd but do not deliver the fault. Instead, > + * just back off. 
> + */ > + if (userfaultfd_missing(vma)) > + goto unlock_abort; > + > + if (pmd_present(*pmdp)) { > + if (!is_huge_zero_pmd(*pmdp)) > + goto unlock_abort; > + flush = true; > + } else if (!pmd_none(*pmdp)) > + goto unlock_abort; > + > + get_page(page); > + page_add_new_anon_rmap(page, vma, haddr, true); > + if (!is_device_private_page(page)) > + lru_cache_add_active_or_unevictable(page, vma); > + if (flush) { > + pte_free(mm, pgtable); > + flush_cache_range(vma, haddr, haddr + HPAGE_PMD_SIZE); > + pmdp_invalidate(vma, haddr, pmdp); > + } else { > + pgtable_trans_huge_deposit(mm, pmdp, pgtable); > + mm_inc_nr_ptes(mm); > + } > + set_pmd_at(mm, haddr, pmdp, entry); > + update_mmu_cache_pmd(vma, haddr, pmdp); > + add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); > + spin_unlock(ptl); > + count_vm_event(THP_FAULT_ALLOC); > + count_memcg_event_mm(mm, THP_FAULT_ALLOC); > + > + return 0; > + > +unlock_abort: > + spin_unlock(ptl); > +pgtable_abort: > + pte_free(mm, pgtable); > +abort: > + for (i = 0; i < HPAGE_PMD_NR; i++) > + src[i] &= ~MIGRATE_PFN_MIGRATE; > + return -EINVAL; > +} > +#endif > + > /* > * This code closely matches the code in: > * __handle_mm_fault() > * handle_pte_fault() > * do_anonymous_page() > - * to map in an anonymous zero page but the struct page will be a ZONE_DEVICE > - * private page. > + * to map in an anonymous zero page except the struct page is already allocated > + * and will likely be a ZONE_DEVICE private page. > */ > static void migrate_vma_insert_page(struct migrate_vma *migrate, > unsigned long addr, > @@ -2766,7 +2901,16 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, > if (!pmdp) > goto abort; > > - if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp)) > +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION > + if (*dst & MIGRATE_PFN_COMPOUND) { > + int ret = insert_huge_pmd_anonymous_page(vma, addr, page, src, > + dst, pmdp); > + if (ret) > + goto abort; > + return; > + } > +#endif > + if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp) || pmd_bad(*pmdp)) > goto abort; > > /* > @@ -2804,7 +2948,8 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, > > swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE); > entry = swp_entry_to_pte(swp_entry); > - } > + } else > + goto abort; > } else { > entry = mk_pte(page, vma->vm_page_prot); > if (vma->vm_flags & VM_WRITE) > @@ -2833,10 +2978,10 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, > goto unlock_abort; > > inc_mm_counter(mm, MM_ANONPAGES); > + get_page(page); > page_add_new_anon_rmap(page, vma, addr, false); > if (!is_zone_device_page(page)) > lru_cache_add_active_or_unevictable(page, vma); > - get_page(page); > > if (flush) { > flush_cache_page(vma, addr, pte_pfn(*ptep)); > @@ -2850,7 +2995,6 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, > } > > pte_unmap_unlock(ptep, ptl); > - *src = MIGRATE_PFN_MIGRATE; > return; > > unlock_abort: > diff --git a/mm/page_alloc.c b/mm/page_alloc.c > index 48eb0f1410d4..a852ed2f204c 100644 > --- a/mm/page_alloc.c > +++ b/mm/page_alloc.c > @@ -686,6 +686,7 @@ void prep_compound_page(struct page *page, unsigned int order) > if (hpage_pincount_available(page)) > atomic_set(compound_pincount_ptr(page), 0); > } > +EXPORT_SYMBOL_GPL(prep_compound_page); > > #ifdef CONFIG_DEBUG_PAGEALLOC > unsigned int _debug_guardpage_minorder; > -- > 2.20.1 -- Best Regards, Yan Zi
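For illustration, the driver-side use of the two interfaces named in the commit message above might look roughly like the sketch below: after memremap_pages(), group each PMD-sized run of device private pages into a compound page with the newly exported prep_compound_page(), so a single head page can later be passed through the migrate_vma arrays with MIGRATE_PFN_COMPOUND set. This is only a sketch assuming CONFIG_TRANSPARENT_HUGEPAGE; the helper name and pfn-range parameters are hypothetical and not taken from this series or from nouveau.

/*
 * Sketch only: group device private pages into PMD-sized compound pages
 * after memremap_pages().  prep_compound_page() is the symbol exported by
 * this patch; everything else here is illustrative.
 */
static void example_prep_device_chunks(unsigned long first_pfn,
                                       unsigned long nr_pages)
{
        unsigned long pfn;

        for (pfn = first_pfn; pfn + HPAGE_PMD_NR <= first_pfn + nr_pages;
             pfn += HPAGE_PMD_NR)
                prep_compound_page(pfn_to_page(pfn), HPAGE_PMD_ORDER);
}

On the collect side, the patch records only the head pfn with MIGRATE_PFN_COMPOUND in migrate_vma_collect_pmd() and skips the remaining subpage entries, so a driver filling the dst array for a huge destination page would presumably mark only the first entry the same way.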
On 6/21/20 4:20 PM, Zi Yan wrote: > On 19 Jun 2020, at 17:56, Ralph Campbell wrote: > >> Support transparent huge page migration to ZONE_DEVICE private memory. >> A new flag (MIGRATE_PFN_COMPOUND) is added to the input PFN array to >> indicate the huge page was fully mapped by the CPU. >> Export prep_compound_page() so that device drivers can create huge >> device private pages after calling memremap_pages(). >> >> Signed-off-by: Ralph Campbell <rcampbell@nvidia.com> >> --- >> include/linux/migrate.h | 1 + >> include/linux/mm.h | 1 + >> mm/huge_memory.c | 30 ++++-- >> mm/internal.h | 1 - >> mm/memory.c | 10 +- >> mm/memremap.c | 9 +- >> mm/migrate.c | 226 ++++++++++++++++++++++++++++++++-------- >> mm/page_alloc.c | 1 + >> 8 files changed, 226 insertions(+), 53 deletions(-) >> >> diff --git a/include/linux/migrate.h b/include/linux/migrate.h >> index 3e546cbf03dd..f6a64965c8bd 100644 >> --- a/include/linux/migrate.h >> +++ b/include/linux/migrate.h >> @@ -166,6 +166,7 @@ static inline int migrate_misplaced_transhuge_page(struct mm_struct *mm, >> #define MIGRATE_PFN_MIGRATE (1UL << 1) >> #define MIGRATE_PFN_LOCKED (1UL << 2) >> #define MIGRATE_PFN_WRITE (1UL << 3) >> +#define MIGRATE_PFN_COMPOUND (1UL << 4) >> #define MIGRATE_PFN_SHIFT 6 >> >> static inline struct page *migrate_pfn_to_page(unsigned long mpfn) >> diff --git a/include/linux/mm.h b/include/linux/mm.h >> index dc7b87310c10..020b9dd3cddb 100644 >> --- a/include/linux/mm.h >> +++ b/include/linux/mm.h >> @@ -932,6 +932,7 @@ static inline unsigned int page_shift(struct page *page) >> } >> >> void free_compound_page(struct page *page); >> +void prep_compound_page(struct page *page, unsigned int order); >> >> #ifdef CONFIG_MMU >> /* >> diff --git a/mm/huge_memory.c b/mm/huge_memory.c >> index 78c84bee7e29..25d95f7b1e98 100644 >> --- a/mm/huge_memory.c >> +++ b/mm/huge_memory.c >> @@ -1663,23 +1663,35 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, >> } else { >> struct page *page = NULL; >> int flush_needed = 1; >> + bool is_anon = false; >> >> if (pmd_present(orig_pmd)) { >> page = pmd_page(orig_pmd); >> + is_anon = PageAnon(page); >> page_remove_rmap(page, true); >> VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); >> VM_BUG_ON_PAGE(!PageHead(page), page); >> } else if (thp_migration_supported()) { >> swp_entry_t entry; >> >> - VM_BUG_ON(!is_pmd_migration_entry(orig_pmd)); >> entry = pmd_to_swp_entry(orig_pmd); >> - page = pfn_to_page(swp_offset(entry)); >> + if (is_device_private_entry(entry)) { >> + page = device_private_entry_to_page(entry); >> + is_anon = PageAnon(page); >> + page_remove_rmap(page, true); >> + VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); >> + VM_BUG_ON_PAGE(!PageHead(page), page); >> + put_page(page); > > Why do you hide this code behind thp_migration_supported()? It seems that you just need > pmd swap entry not pmd migration entry. Also the condition is not consistent with the code > in __handle_mm_fault(), in which you handle is_device_private_entry() directly without > checking thp_migration_support(). Good point, I think "else if (thp_migration_supported())" should be "else if (is_pmd_migration_entry(orig_pmd))" since if the PMD *is* a device private or migration entry, then it should be handled and the VM_BUG_ON() should be that thp_migration_supported() is true (or maybe remove the VM_BUG_ON?). > Do we need to support split_huge_pmd() if a page is migrated to device? Any new code > needed in split_huge_pmd()? 
I was thinking that any CPU usage of the device private page would cause it to be migrated back to system memory as a whole PMD/PUD page, but I'll double check. At least there should be a check that the page isn't a device private page.
On 22 Jun 2020, at 15:36, Ralph Campbell wrote: > On 6/21/20 4:20 PM, Zi Yan wrote: >> On 19 Jun 2020, at 17:56, Ralph Campbell wrote: >> >>> Support transparent huge page migration to ZONE_DEVICE private memory. >>> A new flag (MIGRATE_PFN_COMPOUND) is added to the input PFN array to >>> indicate the huge page was fully mapped by the CPU. >>> Export prep_compound_page() so that device drivers can create huge >>> device private pages after calling memremap_pages(). >>> >>> Signed-off-by: Ralph Campbell <rcampbell@nvidia.com> >>> --- >>> include/linux/migrate.h | 1 + >>> include/linux/mm.h | 1 + >>> mm/huge_memory.c | 30 ++++-- >>> mm/internal.h | 1 - >>> mm/memory.c | 10 +- >>> mm/memremap.c | 9 +- >>> mm/migrate.c | 226 ++++++++++++++++++++++++++++++++-------- >>> mm/page_alloc.c | 1 + >>> 8 files changed, 226 insertions(+), 53 deletions(-) >>> >>> diff --git a/include/linux/migrate.h b/include/linux/migrate.h >>> index 3e546cbf03dd..f6a64965c8bd 100644 >>> --- a/include/linux/migrate.h >>> +++ b/include/linux/migrate.h >>> @@ -166,6 +166,7 @@ static inline int migrate_misplaced_transhuge_page(struct mm_struct *mm, >>> #define MIGRATE_PFN_MIGRATE (1UL << 1) >>> #define MIGRATE_PFN_LOCKED (1UL << 2) >>> #define MIGRATE_PFN_WRITE (1UL << 3) >>> +#define MIGRATE_PFN_COMPOUND (1UL << 4) >>> #define MIGRATE_PFN_SHIFT 6 >>> >>> static inline struct page *migrate_pfn_to_page(unsigned long mpfn) >>> diff --git a/include/linux/mm.h b/include/linux/mm.h >>> index dc7b87310c10..020b9dd3cddb 100644 >>> --- a/include/linux/mm.h >>> +++ b/include/linux/mm.h >>> @@ -932,6 +932,7 @@ static inline unsigned int page_shift(struct page *page) >>> } >>> >>> void free_compound_page(struct page *page); >>> +void prep_compound_page(struct page *page, unsigned int order); >>> >>> #ifdef CONFIG_MMU >>> /* >>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c >>> index 78c84bee7e29..25d95f7b1e98 100644 >>> --- a/mm/huge_memory.c >>> +++ b/mm/huge_memory.c >>> @@ -1663,23 +1663,35 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, >>> } else { >>> struct page *page = NULL; >>> int flush_needed = 1; >>> + bool is_anon = false; >>> >>> if (pmd_present(orig_pmd)) { >>> page = pmd_page(orig_pmd); >>> + is_anon = PageAnon(page); >>> page_remove_rmap(page, true); >>> VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); >>> VM_BUG_ON_PAGE(!PageHead(page), page); >>> } else if (thp_migration_supported()) { >>> swp_entry_t entry; >>> >>> - VM_BUG_ON(!is_pmd_migration_entry(orig_pmd)); >>> entry = pmd_to_swp_entry(orig_pmd); >>> - page = pfn_to_page(swp_offset(entry)); >>> + if (is_device_private_entry(entry)) { >>> + page = device_private_entry_to_page(entry); >>> + is_anon = PageAnon(page); >>> + page_remove_rmap(page, true); >>> + VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); >>> + VM_BUG_ON_PAGE(!PageHead(page), page); >>> + put_page(page); >> >> Why do you hide this code behind thp_migration_supported()? It seems that you just need >> pmd swap entry not pmd migration entry. Also the condition is not consistent with the code >> in __handle_mm_fault(), in which you handle is_device_private_entry() directly without >> checking thp_migration_support(). > > Good point, I think "else if (thp_migration_supported())" should be > "else if (is_pmd_migration_entry(orig_pmd))" since if the PMD *is* > a device private or migration entry, then it should be handled and the > VM_BUG_ON() should be that thp_migration_supported() is true > (or maybe remove the VM_BUG_ON?). I disagree. 
A device private entry is independent of a PMD migration entry, since a device private entry is just a swap entry, which is available whenever CONFIG_TRANSPARENT_HUGEPAGE is enabled. So for architectures that support THP but not THP migration (like ARM64), your code should still work. I would suggest checking all uses of is_swap_pmd() and making sure the code can handle is_device_private_entry(). For new device private code, you might need to guard it either statically or dynamically in case CONFIG_DEVICE_PRIVATE is disabled. In particular, you probably want to make sure a system without CONFIG_DEVICE_PRIVATE never sees is_device_private_entry() == true, and reports an error if it somehow does. > >> Do we need to support split_huge_pmd() if a page is migrated to device? Any new code >> needed in split_huge_pmd()? > > I was thinking that any CPU usage of the device private page would cause it to be > migrated back to system memory as a whole PMD/PUD page but I'll double check. > At least there should be a check that the page isn't a device private page. Well, that depends. If we can allocate a THP in CPU memory, we can migrate the whole page back. But if no THP can be allocated due to low free memory or memory fragmentation, I think you need a fallback plan: either split the device private page and migrate smaller pages instead, or reclaim CPU memory until you get a THP. IMHO, the former might be preferred, since the latter might cost a lot of CPU cycles and still give no THP in the end. — Best Regards, Yan Zi
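As a concrete illustration of the restructuring suggested here, the non-present branch of zap_huge_pmd() could test for a device private entry before falling back to the migration-entry path, roughly as sketched below. This is only a sketch of the idea, not the actual v2 hunk; it reuses the calls already present in the patch and relies on is_device_private_entry() being constant false when CONFIG_DEVICE_PRIVATE is disabled.

        /* In zap_huge_pmd(), after the pmd_present(orig_pmd) case: */
        } else {
                swp_entry_t entry = pmd_to_swp_entry(orig_pmd);

                if (is_device_private_entry(entry)) {
                        /*
                         * A device private entry is just a swap entry; it
                         * does not depend on CONFIG_ARCH_ENABLE_THP_MIGRATION,
                         * and with CONFIG_DEVICE_PRIVATE disabled this branch
                         * compiles away.
                         */
                        page = device_private_entry_to_page(entry);
                        is_anon = PageAnon(page);
                        page_remove_rmap(page, true);
                        VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
                        VM_BUG_ON_PAGE(!PageHead(page), page);
                        put_page(page);
                        flush_needed = 0;
                } else if (thp_migration_supported()) {
                        VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
                        page = pfn_to_page(swp_offset(entry));
                        is_anon = PageAnon(page);
                        flush_needed = 0;
                } else {
                        WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
                }
        }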
On 6/22/20 1:10 PM, Zi Yan wrote: > On 22 Jun 2020, at 15:36, Ralph Campbell wrote: > >> On 6/21/20 4:20 PM, Zi Yan wrote: >>> On 19 Jun 2020, at 17:56, Ralph Campbell wrote: >>> >>>> Support transparent huge page migration to ZONE_DEVICE private memory. >>>> A new flag (MIGRATE_PFN_COMPOUND) is added to the input PFN array to >>>> indicate the huge page was fully mapped by the CPU. >>>> Export prep_compound_page() so that device drivers can create huge >>>> device private pages after calling memremap_pages(). >>>> >>>> Signed-off-by: Ralph Campbell <rcampbell@nvidia.com> >>>> --- >>>> include/linux/migrate.h | 1 + >>>> include/linux/mm.h | 1 + >>>> mm/huge_memory.c | 30 ++++-- >>>> mm/internal.h | 1 - >>>> mm/memory.c | 10 +- >>>> mm/memremap.c | 9 +- >>>> mm/migrate.c | 226 ++++++++++++++++++++++++++++++++-------- >>>> mm/page_alloc.c | 1 + >>>> 8 files changed, 226 insertions(+), 53 deletions(-) >>>> >>>> diff --git a/include/linux/migrate.h b/include/linux/migrate.h >>>> index 3e546cbf03dd..f6a64965c8bd 100644 >>>> --- a/include/linux/migrate.h >>>> +++ b/include/linux/migrate.h >>>> @@ -166,6 +166,7 @@ static inline int migrate_misplaced_transhuge_page(struct mm_struct *mm, >>>> #define MIGRATE_PFN_MIGRATE (1UL << 1) >>>> #define MIGRATE_PFN_LOCKED (1UL << 2) >>>> #define MIGRATE_PFN_WRITE (1UL << 3) >>>> +#define MIGRATE_PFN_COMPOUND (1UL << 4) >>>> #define MIGRATE_PFN_SHIFT 6 >>>> >>>> static inline struct page *migrate_pfn_to_page(unsigned long mpfn) >>>> diff --git a/include/linux/mm.h b/include/linux/mm.h >>>> index dc7b87310c10..020b9dd3cddb 100644 >>>> --- a/include/linux/mm.h >>>> +++ b/include/linux/mm.h >>>> @@ -932,6 +932,7 @@ static inline unsigned int page_shift(struct page *page) >>>> } >>>> >>>> void free_compound_page(struct page *page); >>>> +void prep_compound_page(struct page *page, unsigned int order); >>>> >>>> #ifdef CONFIG_MMU >>>> /* >>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c >>>> index 78c84bee7e29..25d95f7b1e98 100644 >>>> --- a/mm/huge_memory.c >>>> +++ b/mm/huge_memory.c >>>> @@ -1663,23 +1663,35 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, >>>> } else { >>>> struct page *page = NULL; >>>> int flush_needed = 1; >>>> + bool is_anon = false; >>>> >>>> if (pmd_present(orig_pmd)) { >>>> page = pmd_page(orig_pmd); >>>> + is_anon = PageAnon(page); >>>> page_remove_rmap(page, true); >>>> VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); >>>> VM_BUG_ON_PAGE(!PageHead(page), page); >>>> } else if (thp_migration_supported()) { >>>> swp_entry_t entry; >>>> >>>> - VM_BUG_ON(!is_pmd_migration_entry(orig_pmd)); >>>> entry = pmd_to_swp_entry(orig_pmd); >>>> - page = pfn_to_page(swp_offset(entry)); >>>> + if (is_device_private_entry(entry)) { >>>> + page = device_private_entry_to_page(entry); >>>> + is_anon = PageAnon(page); >>>> + page_remove_rmap(page, true); >>>> + VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); >>>> + VM_BUG_ON_PAGE(!PageHead(page), page); >>>> + put_page(page); >>> >>> Why do you hide this code behind thp_migration_supported()? It seems that you just need >>> pmd swap entry not pmd migration entry. Also the condition is not consistent with the code >>> in __handle_mm_fault(), in which you handle is_device_private_entry() directly without >>> checking thp_migration_support(). 
>> >> Good point, I think "else if (thp_migration_supported())" should be >> "else if (is_pmd_migration_entry(orig_pmd))" since if the PMD *is* >> a device private or migration entry, then it should be handled and the >> VM_BUG_ON() should be that thp_migration_supported() is true >> (or maybe remove the VM_BUG_ON?). > > I disagree. A device private entry is independent of a PMD migration entry, since a device private > entry is just a swap entry, which is available when CONFIG_TRANSPARENT_HUGEPAGE. So for architectures > support THP but not THP migration (like ARM64), your code should still work. I'll fix this up for v2 and you can double check me. > I would suggest you to check all the use of is_swap_pmd() and make sure the code > can handle is_device_private_entry(). OK. > For new device private code, you might need to guard it either statically or dynamically in case > CONFIG_DEVICE_PRIVATE is disabled. Potentially, you would like to make sure a system without > CONFIG_DEVICE_PRIVATE will not see is_device_private_entry() == true and give errors when it does. I have compiled and run with CONFIG_DEVICE_PRIVATE off but I can test more combinations of config settings. >> >>> Do we need to support split_huge_pmd() if a page is migrated to device? Any new code >>> needed in split_huge_pmd()? >> >> I was thinking that any CPU usage of the device private page would cause it to be >> migrated back to system memory as a whole PMD/PUD page but I'll double check. >> At least there should be a check that the page isn't a device private page. > > Well, that depends. If we can allocate a THP on CPU memory, we can migrate the whole page back. > But if no THP is allocated due to low on free memory or memory fragmentation, I think you > might need a fallback plan, either splitting the device private page and migrating smaller > pages instead or reclaiming CPU memory until you get a THP. IMHO, the former might be preferred, > since the latter might cost a lot of CPU cycles but still gives no THP after all. Sounds reasonable. I'll work on adding the fallback path for v2.
On 22 Jun 2020, at 17:31, Ralph Campbell wrote: > On 6/22/20 1:10 PM, Zi Yan wrote: >> On 22 Jun 2020, at 15:36, Ralph Campbell wrote: >> >>> On 6/21/20 4:20 PM, Zi Yan wrote: >>>> On 19 Jun 2020, at 17:56, Ralph Campbell wrote: >>>> >>>>> Support transparent huge page migration to ZONE_DEVICE private memory. >>>>> A new flag (MIGRATE_PFN_COMPOUND) is added to the input PFN array to >>>>> indicate the huge page was fully mapped by the CPU. >>>>> Export prep_compound_page() so that device drivers can create huge >>>>> device private pages after calling memremap_pages(). >>>>> >>>>> Signed-off-by: Ralph Campbell <rcampbell@nvidia.com> >>>>> --- >>>>> include/linux/migrate.h | 1 + >>>>> include/linux/mm.h | 1 + >>>>> mm/huge_memory.c | 30 ++++-- >>>>> mm/internal.h | 1 - >>>>> mm/memory.c | 10 +- >>>>> mm/memremap.c | 9 +- >>>>> mm/migrate.c | 226 ++++++++++++++++++++++++++++++++-------- >>>>> mm/page_alloc.c | 1 + >>>>> 8 files changed, 226 insertions(+), 53 deletions(-) >>>>> >>>>> diff --git a/include/linux/migrate.h b/include/linux/migrate.h >>>>> index 3e546cbf03dd..f6a64965c8bd 100644 >>>>> --- a/include/linux/migrate.h >>>>> +++ b/include/linux/migrate.h >>>>> @@ -166,6 +166,7 @@ static inline int migrate_misplaced_transhuge_page(struct mm_struct *mm, >>>>> #define MIGRATE_PFN_MIGRATE (1UL << 1) >>>>> #define MIGRATE_PFN_LOCKED (1UL << 2) >>>>> #define MIGRATE_PFN_WRITE (1UL << 3) >>>>> +#define MIGRATE_PFN_COMPOUND (1UL << 4) >>>>> #define MIGRATE_PFN_SHIFT 6 >>>>> >>>>> static inline struct page *migrate_pfn_to_page(unsigned long mpfn) >>>>> diff --git a/include/linux/mm.h b/include/linux/mm.h >>>>> index dc7b87310c10..020b9dd3cddb 100644 >>>>> --- a/include/linux/mm.h >>>>> +++ b/include/linux/mm.h >>>>> @@ -932,6 +932,7 @@ static inline unsigned int page_shift(struct page *page) >>>>> } >>>>> >>>>> void free_compound_page(struct page *page); >>>>> +void prep_compound_page(struct page *page, unsigned int order); >>>>> >>>>> #ifdef CONFIG_MMU >>>>> /* >>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c >>>>> index 78c84bee7e29..25d95f7b1e98 100644 >>>>> --- a/mm/huge_memory.c >>>>> +++ b/mm/huge_memory.c >>>>> @@ -1663,23 +1663,35 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, >>>>> } else { >>>>> struct page *page = NULL; >>>>> int flush_needed = 1; >>>>> + bool is_anon = false; >>>>> >>>>> if (pmd_present(orig_pmd)) { >>>>> page = pmd_page(orig_pmd); >>>>> + is_anon = PageAnon(page); >>>>> page_remove_rmap(page, true); >>>>> VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); >>>>> VM_BUG_ON_PAGE(!PageHead(page), page); >>>>> } else if (thp_migration_supported()) { >>>>> swp_entry_t entry; >>>>> >>>>> - VM_BUG_ON(!is_pmd_migration_entry(orig_pmd)); >>>>> entry = pmd_to_swp_entry(orig_pmd); >>>>> - page = pfn_to_page(swp_offset(entry)); >>>>> + if (is_device_private_entry(entry)) { >>>>> + page = device_private_entry_to_page(entry); >>>>> + is_anon = PageAnon(page); >>>>> + page_remove_rmap(page, true); >>>>> + VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); >>>>> + VM_BUG_ON_PAGE(!PageHead(page), page); >>>>> + put_page(page); >>>> >>>> Why do you hide this code behind thp_migration_supported()? It seems that you just need >>>> pmd swap entry not pmd migration entry. Also the condition is not consistent with the code >>>> in __handle_mm_fault(), in which you handle is_device_private_entry() directly without >>>> checking thp_migration_support(). 
>>> >>> Good point, I think "else if (thp_migration_supported())" should be >>> "else if (is_pmd_migration_entry(orig_pmd))" since if the PMD *is* >>> a device private or migration entry, then it should be handled and the >>> VM_BUG_ON() should be that thp_migration_supported() is true >>> (or maybe remove the VM_BUG_ON?). >> >> I disagree. A device private entry is independent of a PMD migration entry, since a device private >> entry is just a swap entry, which is available when CONFIG_TRANSPARENT_HUGEPAGE. So for architectures >> support THP but not THP migration (like ARM64), your code should still work. > > I'll fix this up for v2 and you can double check me. Sure. > >> I would suggest you to check all the use of is_swap_pmd() and make sure the code >> can handle is_device_private_entry(). > > OK. > >> For new device private code, you might need to guard it either statically or dynamically in case >> CONFIG_DEVICE_PRIVATE is disabled. Potentially, you would like to make sure a system without >> CONFIG_DEVICE_PRIVATE will not see is_device_private_entry() == true and give errors when it does. > > I have compiled and run with CONFIG_DEVICE_PRIVATE off but I can test more combinations of > config settings. Thanks. > >>> >>>> Do we need to support split_huge_pmd() if a page is migrated to device? Any new code >>>> needed in split_huge_pmd()? >>> >>> I was thinking that any CPU usage of the device private page would cause it to be >>> migrated back to system memory as a whole PMD/PUD page but I'll double check. >>> At least there should be a check that the page isn't a device private page. >> >> Well, that depends. If we can allocate a THP on CPU memory, we can migrate the whole page back. >> But if no THP is allocated due to low on free memory or memory fragmentation, I think you >> might need a fallback plan, either splitting the device private page and migrating smaller >> pages instead or reclaiming CPU memory until you get a THP. IMHO, the former might be preferred, >> since the latter might cost a lot of CPU cycles but still gives no THP after all. > > Sounds reasonable. I'll work on adding the fallback path for v2. Ying(cc’d) developed the code to swapout and swapin THP in one piece: https://lore.kernel.org/linux-mm/20181207054122.27822-1-ying.huang@intel.com/. I am not sure whether the patchset makes into mainstream or not. It could be a good technical reference for swapping in device private pages, although swapping in pages from disk and from device private memory are two different scenarios. Since the device private memory swapin impacts core mm performance, we might want to discuss your patches with more people, like the ones from Ying’s patchset, in the next version. — Best Regards, Yan Zi
On Mon, Jun 22, 2020 at 2:53 PM Zi Yan <ziy@nvidia.com> wrote: > > On 22 Jun 2020, at 17:31, Ralph Campbell wrote: > > > On 6/22/20 1:10 PM, Zi Yan wrote: > >> On 22 Jun 2020, at 15:36, Ralph Campbell wrote: > >> > >>> On 6/21/20 4:20 PM, Zi Yan wrote: > >>>> On 19 Jun 2020, at 17:56, Ralph Campbell wrote: > >>>> > >>>>> Support transparent huge page migration to ZONE_DEVICE private memory. > >>>>> A new flag (MIGRATE_PFN_COMPOUND) is added to the input PFN array to > >>>>> indicate the huge page was fully mapped by the CPU. > >>>>> Export prep_compound_page() so that device drivers can create huge > >>>>> device private pages after calling memremap_pages(). > >>>>> > >>>>> Signed-off-by: Ralph Campbell <rcampbell@nvidia.com> > >>>>> --- > >>>>> include/linux/migrate.h | 1 + > >>>>> include/linux/mm.h | 1 + > >>>>> mm/huge_memory.c | 30 ++++-- > >>>>> mm/internal.h | 1 - > >>>>> mm/memory.c | 10 +- > >>>>> mm/memremap.c | 9 +- > >>>>> mm/migrate.c | 226 ++++++++++++++++++++++++++++++++-------- > >>>>> mm/page_alloc.c | 1 + > >>>>> 8 files changed, 226 insertions(+), 53 deletions(-) > >>>>> > >>>>> diff --git a/include/linux/migrate.h b/include/linux/migrate.h > >>>>> index 3e546cbf03dd..f6a64965c8bd 100644 > >>>>> --- a/include/linux/migrate.h > >>>>> +++ b/include/linux/migrate.h > >>>>> @@ -166,6 +166,7 @@ static inline int migrate_misplaced_transhuge_page(struct mm_struct *mm, > >>>>> #define MIGRATE_PFN_MIGRATE (1UL << 1) > >>>>> #define MIGRATE_PFN_LOCKED (1UL << 2) > >>>>> #define MIGRATE_PFN_WRITE (1UL << 3) > >>>>> +#define MIGRATE_PFN_COMPOUND (1UL << 4) > >>>>> #define MIGRATE_PFN_SHIFT 6 > >>>>> > >>>>> static inline struct page *migrate_pfn_to_page(unsigned long mpfn) > >>>>> diff --git a/include/linux/mm.h b/include/linux/mm.h > >>>>> index dc7b87310c10..020b9dd3cddb 100644 > >>>>> --- a/include/linux/mm.h > >>>>> +++ b/include/linux/mm.h > >>>>> @@ -932,6 +932,7 @@ static inline unsigned int page_shift(struct page *page) > >>>>> } > >>>>> > >>>>> void free_compound_page(struct page *page); > >>>>> +void prep_compound_page(struct page *page, unsigned int order); > >>>>> > >>>>> #ifdef CONFIG_MMU > >>>>> /* > >>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c > >>>>> index 78c84bee7e29..25d95f7b1e98 100644 > >>>>> --- a/mm/huge_memory.c > >>>>> +++ b/mm/huge_memory.c > >>>>> @@ -1663,23 +1663,35 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, > >>>>> } else { > >>>>> struct page *page = NULL; > >>>>> int flush_needed = 1; > >>>>> + bool is_anon = false; > >>>>> > >>>>> if (pmd_present(orig_pmd)) { > >>>>> page = pmd_page(orig_pmd); > >>>>> + is_anon = PageAnon(page); > >>>>> page_remove_rmap(page, true); > >>>>> VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); > >>>>> VM_BUG_ON_PAGE(!PageHead(page), page); > >>>>> } else if (thp_migration_supported()) { > >>>>> swp_entry_t entry; > >>>>> > >>>>> - VM_BUG_ON(!is_pmd_migration_entry(orig_pmd)); > >>>>> entry = pmd_to_swp_entry(orig_pmd); > >>>>> - page = pfn_to_page(swp_offset(entry)); > >>>>> + if (is_device_private_entry(entry)) { > >>>>> + page = device_private_entry_to_page(entry); > >>>>> + is_anon = PageAnon(page); > >>>>> + page_remove_rmap(page, true); > >>>>> + VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); > >>>>> + VM_BUG_ON_PAGE(!PageHead(page), page); > >>>>> + put_page(page); > >>>> > >>>> Why do you hide this code behind thp_migration_supported()? It seems that you just need > >>>> pmd swap entry not pmd migration entry. 
Also the condition is not consistent with the code > >>>> in __handle_mm_fault(), in which you handle is_device_private_entry() directly without > >>>> checking thp_migration_support(). > >>> > >>> Good point, I think "else if (thp_migration_supported())" should be > >>> "else if (is_pmd_migration_entry(orig_pmd))" since if the PMD *is* > >>> a device private or migration entry, then it should be handled and the > >>> VM_BUG_ON() should be that thp_migration_supported() is true > >>> (or maybe remove the VM_BUG_ON?). > >> > >> I disagree. A device private entry is independent of a PMD migration entry, since a device private > >> entry is just a swap entry, which is available when CONFIG_TRANSPARENT_HUGEPAGE. So for architectures > >> support THP but not THP migration (like ARM64), your code should still work. > > > > I'll fix this up for v2 and you can double check me. > > Sure. > > > > >> I would suggest you to check all the use of is_swap_pmd() and make sure the code > >> can handle is_device_private_entry(). > > > > OK. > > > >> For new device private code, you might need to guard it either statically or dynamically in case > >> CONFIG_DEVICE_PRIVATE is disabled. Potentially, you would like to make sure a system without > >> CONFIG_DEVICE_PRIVATE will not see is_device_private_entry() == true and give errors when it does. > > > > I have compiled and run with CONFIG_DEVICE_PRIVATE off but I can test more combinations of > > config settings. > > Thanks. > > > > >>> > >>>> Do we need to support split_huge_pmd() if a page is migrated to device? Any new code > >>>> needed in split_huge_pmd()? > >>> > >>> I was thinking that any CPU usage of the device private page would cause it to be > >>> migrated back to system memory as a whole PMD/PUD page but I'll double check. > >>> At least there should be a check that the page isn't a device private page. > >> > >> Well, that depends. If we can allocate a THP on CPU memory, we can migrate the whole page back. > >> But if no THP is allocated due to low on free memory or memory fragmentation, I think you > >> might need a fallback plan, either splitting the device private page and migrating smaller > >> pages instead or reclaiming CPU memory until you get a THP. IMHO, the former might be preferred, > >> since the latter might cost a lot of CPU cycles but still gives no THP after all. > > > > Sounds reasonable. I'll work on adding the fallback path for v2. > > Ying(cc’d) developed the code to swapout and swapin THP in one piece: https://lore.kernel.org/linux-mm/20181207054122.27822-1-ying.huang@intel.com/. > I am not sure whether the patchset makes into mainstream or not. It could be a good technical reference > for swapping in device private pages, although swapping in pages from disk and from device private > memory are two different scenarios. > > Since the device private memory swapin impacts core mm performance, we might want to discuss your patches > with more people, like the ones from Ying’s patchset, in the next version. I believe Ying will give you more insights about how THP swap works. But, IMHO device memory migration (migrate to system memory) seems like THP CoW more than swap. When migrating in: > > > > — > Best Regards, > Yan Zi
On Mon, Jun 22, 2020 at 3:30 PM Yang Shi <shy828301@gmail.com> wrote: > > On Mon, Jun 22, 2020 at 2:53 PM Zi Yan <ziy@nvidia.com> wrote: > > > > On 22 Jun 2020, at 17:31, Ralph Campbell wrote: > > > > > On 6/22/20 1:10 PM, Zi Yan wrote: > > >> On 22 Jun 2020, at 15:36, Ralph Campbell wrote: > > >> > > >>> On 6/21/20 4:20 PM, Zi Yan wrote: > > >>>> On 19 Jun 2020, at 17:56, Ralph Campbell wrote: > > >>>> > > >>>>> Support transparent huge page migration to ZONE_DEVICE private memory. > > >>>>> A new flag (MIGRATE_PFN_COMPOUND) is added to the input PFN array to > > >>>>> indicate the huge page was fully mapped by the CPU. > > >>>>> Export prep_compound_page() so that device drivers can create huge > > >>>>> device private pages after calling memremap_pages(). > > >>>>> > > >>>>> Signed-off-by: Ralph Campbell <rcampbell@nvidia.com> > > >>>>> --- > > >>>>> include/linux/migrate.h | 1 + > > >>>>> include/linux/mm.h | 1 + > > >>>>> mm/huge_memory.c | 30 ++++-- > > >>>>> mm/internal.h | 1 - > > >>>>> mm/memory.c | 10 +- > > >>>>> mm/memremap.c | 9 +- > > >>>>> mm/migrate.c | 226 ++++++++++++++++++++++++++++++++-------- > > >>>>> mm/page_alloc.c | 1 + > > >>>>> 8 files changed, 226 insertions(+), 53 deletions(-) > > >>>>> > > >>>>> diff --git a/include/linux/migrate.h b/include/linux/migrate.h > > >>>>> index 3e546cbf03dd..f6a64965c8bd 100644 > > >>>>> --- a/include/linux/migrate.h > > >>>>> +++ b/include/linux/migrate.h > > >>>>> @@ -166,6 +166,7 @@ static inline int migrate_misplaced_transhuge_page(struct mm_struct *mm, > > >>>>> #define MIGRATE_PFN_MIGRATE (1UL << 1) > > >>>>> #define MIGRATE_PFN_LOCKED (1UL << 2) > > >>>>> #define MIGRATE_PFN_WRITE (1UL << 3) > > >>>>> +#define MIGRATE_PFN_COMPOUND (1UL << 4) > > >>>>> #define MIGRATE_PFN_SHIFT 6 > > >>>>> > > >>>>> static inline struct page *migrate_pfn_to_page(unsigned long mpfn) > > >>>>> diff --git a/include/linux/mm.h b/include/linux/mm.h > > >>>>> index dc7b87310c10..020b9dd3cddb 100644 > > >>>>> --- a/include/linux/mm.h > > >>>>> +++ b/include/linux/mm.h > > >>>>> @@ -932,6 +932,7 @@ static inline unsigned int page_shift(struct page *page) > > >>>>> } > > >>>>> > > >>>>> void free_compound_page(struct page *page); > > >>>>> +void prep_compound_page(struct page *page, unsigned int order); > > >>>>> > > >>>>> #ifdef CONFIG_MMU > > >>>>> /* > > >>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c > > >>>>> index 78c84bee7e29..25d95f7b1e98 100644 > > >>>>> --- a/mm/huge_memory.c > > >>>>> +++ b/mm/huge_memory.c > > >>>>> @@ -1663,23 +1663,35 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, > > >>>>> } else { > > >>>>> struct page *page = NULL; > > >>>>> int flush_needed = 1; > > >>>>> + bool is_anon = false; > > >>>>> > > >>>>> if (pmd_present(orig_pmd)) { > > >>>>> page = pmd_page(orig_pmd); > > >>>>> + is_anon = PageAnon(page); > > >>>>> page_remove_rmap(page, true); > > >>>>> VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); > > >>>>> VM_BUG_ON_PAGE(!PageHead(page), page); > > >>>>> } else if (thp_migration_supported()) { > > >>>>> swp_entry_t entry; > > >>>>> > > >>>>> - VM_BUG_ON(!is_pmd_migration_entry(orig_pmd)); > > >>>>> entry = pmd_to_swp_entry(orig_pmd); > > >>>>> - page = pfn_to_page(swp_offset(entry)); > > >>>>> + if (is_device_private_entry(entry)) { > > >>>>> + page = device_private_entry_to_page(entry); > > >>>>> + is_anon = PageAnon(page); > > >>>>> + page_remove_rmap(page, true); > > >>>>> + VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); > > >>>>> + VM_BUG_ON_PAGE(!PageHead(page), page); > > >>>>> + 
put_page(page); > > >>>> > > >>>> Why do you hide this code behind thp_migration_supported()? It seems that you just need > > >>>> pmd swap entry not pmd migration entry. Also the condition is not consistent with the code > > >>>> in __handle_mm_fault(), in which you handle is_device_private_entry() directly without > > >>>> checking thp_migration_support(). > > >>> > > >>> Good point, I think "else if (thp_migration_supported())" should be > > >>> "else if (is_pmd_migration_entry(orig_pmd))" since if the PMD *is* > > >>> a device private or migration entry, then it should be handled and the > > >>> VM_BUG_ON() should be that thp_migration_supported() is true > > >>> (or maybe remove the VM_BUG_ON?). > > >> > > >> I disagree. A device private entry is independent of a PMD migration entry, since a device private > > >> entry is just a swap entry, which is available when CONFIG_TRANSPARENT_HUGEPAGE. So for architectures > > >> support THP but not THP migration (like ARM64), your code should still work. > > > > > > I'll fix this up for v2 and you can double check me. > > > > Sure. > > > > > > > >> I would suggest you to check all the use of is_swap_pmd() and make sure the code > > >> can handle is_device_private_entry(). > > > > > > OK. > > > > > >> For new device private code, you might need to guard it either statically or dynamically in case > > >> CONFIG_DEVICE_PRIVATE is disabled. Potentially, you would like to make sure a system without > > >> CONFIG_DEVICE_PRIVATE will not see is_device_private_entry() == true and give errors when it does. > > > > > > I have compiled and run with CONFIG_DEVICE_PRIVATE off but I can test more combinations of > > > config settings. > > > > Thanks. > > > > > > > >>> > > >>>> Do we need to support split_huge_pmd() if a page is migrated to device? Any new code > > >>>> needed in split_huge_pmd()? > > >>> > > >>> I was thinking that any CPU usage of the device private page would cause it to be > > >>> migrated back to system memory as a whole PMD/PUD page but I'll double check. > > >>> At least there should be a check that the page isn't a device private page. > > >> > > >> Well, that depends. If we can allocate a THP on CPU memory, we can migrate the whole page back. > > >> But if no THP is allocated due to low on free memory or memory fragmentation, I think you > > >> might need a fallback plan, either splitting the device private page and migrating smaller > > >> pages instead or reclaiming CPU memory until you get a THP. IMHO, the former might be preferred, > > >> since the latter might cost a lot of CPU cycles but still gives no THP after all. > > > > > > Sounds reasonable. I'll work on adding the fallback path for v2. > > > > Ying(cc’d) developed the code to swapout and swapin THP in one piece: https://lore.kernel.org/linux-mm/20181207054122.27822-1-ying.huang@intel.com/. > > I am not sure whether the patchset makes into mainstream or not. It could be a good technical reference > > for swapping in device private pages, although swapping in pages from disk and from device private > > memory are two different scenarios. > > > > Since the device private memory swapin impacts core mm performance, we might want to discuss your patches > > with more people, like the ones from Ying’s patchset, in the next version. > > I believe Ying will give you more insights about how THP swap works. > > But, IMHO device memory migration (migrate to system memory) seems > like THP CoW more than swap. 
> > When migrating in: Sorry for my fat finger, I hit the send button inadvertently; let me finish here. When migrating in:
- if THP is enabled: allocate a THP, but handle allocation failure by falling back to base pages
- if THP is disabled: fall back to base pages
> > > > > > > > — > Best Regards, > Yan Zi
On 2020-06-22 15:33, Yang Shi wrote: > On Mon, Jun 22, 2020 at 3:30 PM Yang Shi <shy828301@gmail.com> wrote: >> On Mon, Jun 22, 2020 at 2:53 PM Zi Yan <ziy@nvidia.com> wrote: >>> On 22 Jun 2020, at 17:31, Ralph Campbell wrote: >>>> On 6/22/20 1:10 PM, Zi Yan wrote: >>>>> On 22 Jun 2020, at 15:36, Ralph Campbell wrote: >>>>>> On 6/21/20 4:20 PM, Zi Yan wrote: >>>>>>> On 19 Jun 2020, at 17:56, Ralph Campbell wrote: ... >>> Ying(cc’d) developed the code to swapout and swapin THP in one piece: https://lore.kernel.org/linux-mm/20181207054122.27822-1-ying.huang@intel.com/. >>> I am not sure whether the patchset makes into mainstream or not. It could be a good technical reference >>> for swapping in device private pages, although swapping in pages from disk and from device private >>> memory are two different scenarios. >>> >>> Since the device private memory swapin impacts core mm performance, we might want to discuss your patches >>> with more people, like the ones from Ying’s patchset, in the next version. >> >> I believe Ying will give you more insights about how THP swap works. >> >> But, IMHO device memory migration (migrate to system memory) seems >> like THP CoW more than swap. A fine point: overall, the desired behavior is "migrate", not CoW. That's important. Migrate means that you don't leave a page behind, even a read-only one. And that's exactly how device private migration is specified. We should try to avoid any erosion of clarity here. Even if somehow (really?) the underlying implementation calls this THP CoW, the actual goal is to migrate pages over to the device (and back). >> >> When migrating in: > > Sorry for my fat finger, hit sent button inadvertently, let me finish here. > > When migrating in: > > - if THP is enabled: allocate THP, but need handle allocation > failure by falling back to base page > - if THP is disabled: fallback to base page > OK, but *all* page entries (base and huge/large pages) need to be cleared, when migrating to device memory, unless I'm really confused here. So: not CoW. thanks,
On Mon, Jun 22, 2020 at 4:02 PM John Hubbard <jhubbard@nvidia.com> wrote: > > On 2020-06-22 15:33, Yang Shi wrote: > > On Mon, Jun 22, 2020 at 3:30 PM Yang Shi <shy828301@gmail.com> wrote: > >> On Mon, Jun 22, 2020 at 2:53 PM Zi Yan <ziy@nvidia.com> wrote: > >>> On 22 Jun 2020, at 17:31, Ralph Campbell wrote: > >>>> On 6/22/20 1:10 PM, Zi Yan wrote: > >>>>> On 22 Jun 2020, at 15:36, Ralph Campbell wrote: > >>>>>> On 6/21/20 4:20 PM, Zi Yan wrote: > >>>>>>> On 19 Jun 2020, at 17:56, Ralph Campbell wrote: > ... > >>> Ying(cc’d) developed the code to swapout and swapin THP in one piece: https://lore.kernel.org/linux-mm/20181207054122.27822-1-ying.huang@intel.com/. > >>> I am not sure whether the patchset makes into mainstream or not. It could be a good technical reference > >>> for swapping in device private pages, although swapping in pages from disk and from device private > >>> memory are two different scenarios. > >>> > >>> Since the device private memory swapin impacts core mm performance, we might want to discuss your patches > >>> with more people, like the ones from Ying’s patchset, in the next version. > >> > >> I believe Ying will give you more insights about how THP swap works. > >> > >> But, IMHO device memory migration (migrate to system memory) seems > >> like THP CoW more than swap. > > > A fine point: overall, the desired behavior is "migrate", not CoW. > That's important. Migrate means that you don't leave a page behind, even > a read-only one. And that's exactly how device private migration is > specified. > > We should try to avoid any erosion of clarity here. Even if somehow > (really?) the underlying implementation calls this THP CoW, the actual > goal is to migrate pages over to the device (and back). > > > >> > >> When migrating in: > > > > Sorry for my fat finger, hit sent button inadvertently, let me finish here. > > > > When migrating in: > > > > - if THP is enabled: allocate THP, but need handle allocation > > failure by falling back to base page > > - if THP is disabled: fallback to base page > > > > OK, but *all* page entries (base and huge/large pages) need to be cleared, > when migrating to device memory, unless I'm really confused here. > So: not CoW. I realized the comment caused more confusion. I apologize for the confusion. Yes, the trigger condition for swap/migration and CoW are definitely different. Here I mean the fault handling part of migrating into system memory. Swap-in just needs to handle the base page case since THP swapin is not supported in upstream yet and the PMD is split in swap-out phase (see shrink_page_list). The patch adds THP migration support to device memory, but you need to handle migrate in (back to system memory) case correctly. The fault handling should look like THP CoW fault handling behavior (before 5.8): - if THP is enabled: allocate THP, fallback if allocation is failed - if THP is disabled: fallback to base page Swap fault handling doesn't look like the above. So, I said it seems like more THP CoW (fault handling part only before 5.8). I hope I articulate my mind. However, I didn't see such fallback is handled. It looks if THP allocation is failed, it just returns SIGBUS; and no check about THP status if I read the patches correctly. The THP might be disabled for the specific vma or system wide before migrating from device memory back to system memory. > > thanks, > -- > John Hubbard > NVIDIA
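A rough sketch of the fallback being asked for here, on the path that migrates a huge device private page back to system memory: check whether THP is still usable for the vma, try a PMD-sized allocation, and otherwise drop MIGRATE_PFN_COMPOUND and migrate base pages instead of failing. All of the helper names are hypothetical; the eligibility check shown (transparent_hugepage_enabled()) is just one plausible choice and might need exporting or an equivalent test if this lives in a driver rather than in core mm.

/*
 * Hypothetical helper for a migrate_to_ram()-style path: keep the
 * migration huge only when THP is enabled for the vma and a PMD-sized
 * page can be allocated; otherwise clear MIGRATE_PFN_COMPOUND and fall
 * back to base pages instead of failing the fault.
 */
static struct page *example_alloc_dst_page(struct vm_area_struct *vma,
                                           unsigned long addr,
                                           unsigned long src,
                                           unsigned long *dst)
{
        struct page *dpage = NULL;

        if ((src & MIGRATE_PFN_COMPOUND) &&
            transparent_hugepage_enabled(vma)) {
                dpage = alloc_pages_vma(GFP_TRANSHUGE_LIGHT, HPAGE_PMD_ORDER,
                                        vma, addr & HPAGE_PMD_MASK,
                                        numa_node_id(), true);
                if (dpage) {
                        prep_transhuge_page(dpage);
                        lock_page(dpage);
                        *dst = migrate_pfn(page_to_pfn(dpage)) |
                               MIGRATE_PFN_LOCKED | MIGRATE_PFN_COMPOUND;
                        return dpage;
                }
        }

        /* Fall back to one base page per subpage; the caller would loop
         * over the HPAGE_PMD_NR entries rather than giving up. */
        dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, addr);
        if (dpage) {
                lock_page(dpage);
                *dst = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
        }
        return dpage;
}

For comparison, in the patch above insert_huge_pmd_anonymous_page() aborts the migration (clearing MIGRATE_PFN_MIGRATE for all the entries) rather than retrying with base pages, which is the gap being pointed out in this exchange.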
On 6/22/20 4:54 PM, Yang Shi wrote: > On Mon, Jun 22, 2020 at 4:02 PM John Hubbard <jhubbard@nvidia.com> wrote: >> >> On 2020-06-22 15:33, Yang Shi wrote: >>> On Mon, Jun 22, 2020 at 3:30 PM Yang Shi <shy828301@gmail.com> wrote: >>>> On Mon, Jun 22, 2020 at 2:53 PM Zi Yan <ziy@nvidia.com> wrote: >>>>> On 22 Jun 2020, at 17:31, Ralph Campbell wrote: >>>>>> On 6/22/20 1:10 PM, Zi Yan wrote: >>>>>>> On 22 Jun 2020, at 15:36, Ralph Campbell wrote: >>>>>>>> On 6/21/20 4:20 PM, Zi Yan wrote: >>>>>>>>> On 19 Jun 2020, at 17:56, Ralph Campbell wrote: >> ... >>>>> Ying(cc’d) developed the code to swapout and swapin THP in one piece: https://lore.kernel.org/linux-mm/20181207054122.27822-1-ying.huang@intel.com/. >>>>> I am not sure whether the patchset makes into mainstream or not. It could be a good technical reference >>>>> for swapping in device private pages, although swapping in pages from disk and from device private >>>>> memory are two different scenarios. >>>>> >>>>> Since the device private memory swapin impacts core mm performance, we might want to discuss your patches >>>>> with more people, like the ones from Ying’s patchset, in the next version. >>>> >>>> I believe Ying will give you more insights about how THP swap works. >>>> >>>> But, IMHO device memory migration (migrate to system memory) seems >>>> like THP CoW more than swap. >> >> >> A fine point: overall, the desired behavior is "migrate", not CoW. >> That's important. Migrate means that you don't leave a page behind, even >> a read-only one. And that's exactly how device private migration is >> specified. >> >> We should try to avoid any erosion of clarity here. Even if somehow >> (really?) the underlying implementation calls this THP CoW, the actual >> goal is to migrate pages over to the device (and back). >> >> >>>> >>>> When migrating in: >>> >>> Sorry for my fat finger, hit sent button inadvertently, let me finish here. >>> >>> When migrating in: >>> >>> - if THP is enabled: allocate THP, but need handle allocation >>> failure by falling back to base page >>> - if THP is disabled: fallback to base page >>> >> >> OK, but *all* page entries (base and huge/large pages) need to be cleared, >> when migrating to device memory, unless I'm really confused here. >> So: not CoW. > > I realized the comment caused more confusion. I apologize for the > confusion. Yes, the trigger condition for swap/migration and CoW are > definitely different. Here I mean the fault handling part of migrating > into system memory. > > Swap-in just needs to handle the base page case since THP swapin is > not supported in upstream yet and the PMD is split in swap-out phase > (see shrink_page_list). > > The patch adds THP migration support to device memory, but you need to > handle migrate in (back to system memory) case correctly. The fault > handling should look like THP CoW fault handling behavior (before > 5.8): > - if THP is enabled: allocate THP, fallback if allocation is failed > - if THP is disabled: fallback to base page > > Swap fault handling doesn't look like the above. So, I said it seems > like more THP CoW (fault handling part only before 5.8). I hope I > articulate my mind. > > However, I didn't see such fallback is handled. It looks if THP > allocation is failed, it just returns SIGBUS; and no check about THP > status if I read the patches correctly. The THP might be disabled for > the specific vma or system wide before migrating from device memory > back to system memory. You are correct, the patch wasn't handling the fallback case. 
I'll add that in the next version. >> >> thanks, >> -- >> John Hubbard >> NVIDIA
Ralph Campbell <rcampbell@nvidia.com> writes: > On 6/22/20 4:54 PM, Yang Shi wrote: >> On Mon, Jun 22, 2020 at 4:02 PM John Hubbard <jhubbard@nvidia.com> wrote: >>> >>> On 2020-06-22 15:33, Yang Shi wrote: >>>> On Mon, Jun 22, 2020 at 3:30 PM Yang Shi <shy828301@gmail.com> wrote: >>>>> On Mon, Jun 22, 2020 at 2:53 PM Zi Yan <ziy@nvidia.com> wrote: >>>>>> On 22 Jun 2020, at 17:31, Ralph Campbell wrote: >>>>>>> On 6/22/20 1:10 PM, Zi Yan wrote: >>>>>>>> On 22 Jun 2020, at 15:36, Ralph Campbell wrote: >>>>>>>>> On 6/21/20 4:20 PM, Zi Yan wrote: >>>>>>>>>> On 19 Jun 2020, at 17:56, Ralph Campbell wrote: >>> ... >>>>>> Ying(cc’d) developed the code to swapout and swapin THP in one piece: https://lore.kernel.org/linux-mm/20181207054122.27822-1-ying.huang@intel.com/. >>>>>> I am not sure whether the patchset makes into mainstream or not. It could be a good technical reference >>>>>> for swapping in device private pages, although swapping in pages from disk and from device private >>>>>> memory are two different scenarios. >>>>>> >>>>>> Since the device private memory swapin impacts core mm performance, we might want to discuss your patches >>>>>> with more people, like the ones from Ying’s patchset, in the next version. >>>>> >>>>> I believe Ying will give you more insights about how THP swap works. >>>>> >>>>> But, IMHO device memory migration (migrate to system memory) seems >>>>> like THP CoW more than swap. >>> >>> >>> A fine point: overall, the desired behavior is "migrate", not CoW. >>> That's important. Migrate means that you don't leave a page behind, even >>> a read-only one. And that's exactly how device private migration is >>> specified. >>> >>> We should try to avoid any erosion of clarity here. Even if somehow >>> (really?) the underlying implementation calls this THP CoW, the actual >>> goal is to migrate pages over to the device (and back). >>> >>> >>>>> >>>>> When migrating in: >>>> >>>> Sorry for my fat finger, hit sent button inadvertently, let me finish here. >>>> >>>> When migrating in: >>>> >>>> - if THP is enabled: allocate THP, but need handle allocation >>>> failure by falling back to base page >>>> - if THP is disabled: fallback to base page >>>> >>> >>> OK, but *all* page entries (base and huge/large pages) need to be cleared, >>> when migrating to device memory, unless I'm really confused here. >>> So: not CoW. >> >> I realized the comment caused more confusion. I apologize for the >> confusion. Yes, the trigger condition for swap/migration and CoW are >> definitely different. Here I mean the fault handling part of migrating >> into system memory. >> >> Swap-in just needs to handle the base page case since THP swapin is >> not supported in upstream yet and the PMD is split in swap-out phase >> (see shrink_page_list). >> >> The patch adds THP migration support to device memory, but you need to >> handle migrate in (back to system memory) case correctly. The fault >> handling should look like THP CoW fault handling behavior (before >> 5.8): >> - if THP is enabled: allocate THP, fallback if allocation is failed >> - if THP is disabled: fallback to base page >> >> Swap fault handling doesn't look like the above. So, I said it seems >> like more THP CoW (fault handling part only before 5.8). I hope I >> articulate my mind. >> >> However, I didn't see such fallback is handled. It looks if THP >> allocation is failed, it just returns SIGBUS; and no check about THP >> status if I read the patches correctly. 
The THP might be disabled for >> the specific vma or system wide before migrating from device memory >> back to system memory. > > You are correct, the patch wasn't handling the fallback case. > I'll add that in the next version. For the fallback, you need to split the THP in the device first, because you will now migrate base pages instead of a whole THP. Best Regards, Huang, Ying >>> >>> thanks, >>> -- >>> John Hubbard >>> NVIDIA
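[Editor's note] A rough sketch of the ordering Huang Ying describes, split first and only then migrate base pages, is shown below. devmem_migrate_base_pages() is a hypothetical driver helper, and whether the generic split_huge_page() can simply be reused on a locked device private THP is exactly the open question in this thread; the lock/split/unlock pattern itself mirrors what migrate_vma_collect_pmd() in the patch already does for ordinary THPs.

#include <linux/huge_mm.h>
#include <linux/mm.h>

/* Hypothetical helper: migrate HPAGE_PMD_NR base pages back to system RAM. */
static vm_fault_t devmem_migrate_base_pages(struct vm_fault *vmf);

/*
 * Sketch only: when a huge destination page cannot be used, split the
 * device private source THP *first*, then migrate base pages one by one.
 */
static vm_fault_t devmem_migrate_to_ram_fallback(struct vm_fault *vmf,
						 struct page *src_thp)
{
	int ret;

	/* Assumption: the generic THP split works on a device private page. */
	lock_page(src_thp);
	ret = split_huge_page(src_thp);
	unlock_page(src_thp);
	if (ret)
		return VM_FAULT_SIGBUS;

	/*
	 * The PMD mapping is now split, so a second collection pass sees
	 * base-page entries without MIGRATE_PFN_COMPOUND, and each one can
	 * be given an order-0 destination page.
	 */
	return devmem_migrate_base_pages(vmf);
}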
diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 3e546cbf03dd..f6a64965c8bd 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -166,6 +166,7 @@ static inline int migrate_misplaced_transhuge_page(struct mm_struct *mm, #define MIGRATE_PFN_MIGRATE (1UL << 1) #define MIGRATE_PFN_LOCKED (1UL << 2) #define MIGRATE_PFN_WRITE (1UL << 3) +#define MIGRATE_PFN_COMPOUND (1UL << 4) #define MIGRATE_PFN_SHIFT 6 static inline struct page *migrate_pfn_to_page(unsigned long mpfn) diff --git a/include/linux/mm.h b/include/linux/mm.h index dc7b87310c10..020b9dd3cddb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -932,6 +932,7 @@ static inline unsigned int page_shift(struct page *page) } void free_compound_page(struct page *page); +void prep_compound_page(struct page *page, unsigned int order); #ifdef CONFIG_MMU /* diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 78c84bee7e29..25d95f7b1e98 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1663,23 +1663,35 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, } else { struct page *page = NULL; int flush_needed = 1; + bool is_anon = false; if (pmd_present(orig_pmd)) { page = pmd_page(orig_pmd); + is_anon = PageAnon(page); page_remove_rmap(page, true); VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); VM_BUG_ON_PAGE(!PageHead(page), page); } else if (thp_migration_supported()) { swp_entry_t entry; - VM_BUG_ON(!is_pmd_migration_entry(orig_pmd)); entry = pmd_to_swp_entry(orig_pmd); - page = pfn_to_page(swp_offset(entry)); + if (is_device_private_entry(entry)) { + page = device_private_entry_to_page(entry); + is_anon = PageAnon(page); + page_remove_rmap(page, true); + VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); + VM_BUG_ON_PAGE(!PageHead(page), page); + put_page(page); + } else { + VM_BUG_ON(!is_pmd_migration_entry(orig_pmd)); + page = pfn_to_page(swp_offset(entry)); + is_anon = PageAnon(page); + } flush_needed = 0; } else WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); - if (PageAnon(page)) { + if (is_anon) { zap_deposited_table(tlb->mm, pmd); add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); } else { @@ -1778,8 +1790,8 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, /* * Returns * - 0 if PMD could not be locked - * - 1 if PMD was locked but protections unchange and TLB flush unnecessary - * - HPAGE_PMD_NR is protections changed and TLB flush necessary + * - 1 if PMD was locked but protections unchanged and TLB flush unnecessary + * - HPAGE_PMD_NR if protections changed and TLB flush necessary */ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, pgprot_t newprot, unsigned long cp_flags) @@ -2689,7 +2701,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) { if (!list_empty(page_deferred_list(head))) { ds_queue->split_queue_len--; - list_del(page_deferred_list(head)); + list_del_init(page_deferred_list(head)); } spin_unlock(&ds_queue->split_queue_lock); if (mapping) { @@ -2743,7 +2755,7 @@ void free_transhuge_page(struct page *page) spin_lock_irqsave(&ds_queue->split_queue_lock, flags); if (!list_empty(page_deferred_list(page))) { ds_queue->split_queue_len--; - list_del(page_deferred_list(page)); + list_del_init(page_deferred_list(page)); } spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); free_compound_page(page); @@ -2963,6 +2975,10 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) 
pmde = pmd_mksoft_dirty(pmde); if (is_write_migration_entry(entry)) pmde = maybe_pmd_mkwrite(pmde, vma); + if (unlikely(is_device_private_page(new))) { + entry = make_device_private_entry(new, pmd_write(pmde)); + pmde = swp_entry_to_pmd(entry); + } flush_cache_range(vma, mmun_start, mmun_start + HPAGE_PMD_SIZE); if (PageAnon(new)) diff --git a/mm/internal.h b/mm/internal.h index 9886db20d94f..58f051a14dae 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -196,7 +196,6 @@ extern void __putback_isolated_page(struct page *page, unsigned int order, extern void memblock_free_pages(struct page *page, unsigned long pfn, unsigned int order); extern void __free_pages_core(struct page *page, unsigned int order); -extern void prep_compound_page(struct page *page, unsigned int order); extern void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags); extern int user_min_free_kbytes; diff --git a/mm/memory.c b/mm/memory.c index dc7f3543b1fd..4e03d1a2ead5 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4322,9 +4322,15 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, barrier(); if (unlikely(is_swap_pmd(orig_pmd))) { + swp_entry_t entry = pmd_to_swp_entry(orig_pmd); + + if (is_device_private_entry(entry)) { + vmf.page = device_private_entry_to_page(entry); + return vmf.page->pgmap->ops->migrate_to_ram(&vmf); + } VM_BUG_ON(thp_migration_supported() && - !is_pmd_migration_entry(orig_pmd)); - if (is_pmd_migration_entry(orig_pmd)) + !is_migration_entry(entry)); + if (is_migration_entry(entry)) pmd_migration_entry_wait(mm, vmf.pmd); return 0; } diff --git a/mm/memremap.c b/mm/memremap.c index 03e38b7a38f1..4231054188b4 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -132,8 +132,13 @@ void memunmap_pages(struct dev_pagemap *pgmap) int nid; dev_pagemap_kill(pgmap); - for_each_device_pfn(pfn, pgmap) - put_page(pfn_to_page(pfn)); + for_each_device_pfn(pfn, pgmap) { + struct page *page = pfn_to_page(pfn); + unsigned int order = compound_order(page); + + put_page(page); + pfn += (1U << order) - 1; + } dev_pagemap_cleanup(pgmap); /* make sure to access a memmap that was actually initialized */ diff --git a/mm/migrate.c b/mm/migrate.c index 87c52e0ee580..1536677cefc9 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -51,6 +51,7 @@ #include <linux/oom.h> #include <asm/tlbflush.h> +#include <asm/pgalloc.h> #define CREATE_TRACE_POINTS #include <trace/events/migrate.h> @@ -2185,6 +2186,8 @@ static int migrate_vma_collect_hole(unsigned long start, for (addr = start; addr < end; addr += PAGE_SIZE) { migrate->src[migrate->npages] = flags; + if ((addr & ~PMD_MASK) == 0 && (end & ~PMD_MASK) == 0) + migrate->src[migrate->npages] |= MIGRATE_PFN_COMPOUND; migrate->dst[migrate->npages] = 0; migrate->npages++; migrate->cpages++; @@ -2219,48 +2222,87 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, unsigned long addr = start, unmapped = 0; spinlock_t *ptl; pte_t *ptep; + pmd_t pmd; again: - if (pmd_none(*pmdp)) + pmd = READ_ONCE(*pmdp); + if (pmd_none(pmd)) return migrate_vma_collect_hole(start, end, -1, walk); - if (pmd_trans_huge(*pmdp)) { + if (pmd_trans_huge(pmd) || !pmd_present(pmd)) { struct page *page; + unsigned long write = 0; + int ret; ptl = pmd_lock(mm, pmdp); - if (unlikely(!pmd_trans_huge(*pmdp))) { - spin_unlock(ptl); - goto again; - } + if (pmd_trans_huge(*pmdp)) { + page = pmd_page(*pmdp); + if (is_huge_zero_page(page)) { + spin_unlock(ptl); + return migrate_vma_collect_hole(start, end, -1, + walk); + } + if (pmd_write(*pmdp)) + write = MIGRATE_PFN_WRITE; + } else if 
(!pmd_present(*pmdp)) { + swp_entry_t entry = pmd_to_swp_entry(*pmdp); - page = pmd_page(*pmdp); - if (is_huge_zero_page(page)) { - spin_unlock(ptl); - split_huge_pmd(vma, pmdp, addr); - if (pmd_trans_unstable(pmdp)) + if (!is_device_private_entry(entry)) { + spin_unlock(ptl); return migrate_vma_collect_skip(start, end, walk); + } + page = device_private_entry_to_page(entry); + if (is_write_device_private_entry(entry)) + write = MIGRATE_PFN_WRITE; } else { - int ret; + spin_unlock(ptl); + goto again; + } - get_page(page); + get_page(page); + if (unlikely(!trylock_page(page))) { spin_unlock(ptl); - if (unlikely(!trylock_page(page))) - return migrate_vma_collect_skip(start, end, - walk); - ret = split_huge_page(page); - unlock_page(page); put_page(page); - if (ret) - return migrate_vma_collect_skip(start, end, - walk); - if (pmd_none(*pmdp)) - return migrate_vma_collect_hole(start, end, -1, - walk); + return migrate_vma_collect_skip(start, end, walk); + } +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION + if ((start & HPAGE_PMD_MASK) == start && + start + HPAGE_PMD_SIZE == end) { + struct page_vma_mapped_walk vmw = { + .vma = vma, + .address = start, + .pmd = pmdp, + .ptl = ptl, + }; + + migrate->src[migrate->npages] = write | + migrate_pfn(page_to_pfn(page)) | + MIGRATE_PFN_MIGRATE | MIGRATE_PFN_LOCKED | + MIGRATE_PFN_COMPOUND; + migrate->dst[migrate->npages] = 0; + migrate->npages++; + migrate->cpages++; + migrate_vma_collect_skip(start + PAGE_SIZE, end, walk); + + /* Note that this also removes the page from the rmap. */ + set_pmd_migration_entry(&vmw, page); + spin_unlock(ptl); + + return 0; } +#endif + spin_unlock(ptl); + ret = split_huge_page(page); + unlock_page(page); + put_page(page); + if (ret) + return migrate_vma_collect_skip(start, end, walk); + if (pmd_none(*pmdp)) + return migrate_vma_collect_hole(start, end, -1, walk); } - if (unlikely(pmd_bad(*pmdp))) + if (unlikely(pmd_bad(pmd) || pmd_devmap(pmd))) return migrate_vma_collect_skip(start, end, walk); ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); @@ -2310,8 +2352,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0; } - /* FIXME support THP */ - if (!page || !page->mapping || PageTransCompound(page)) { + if (!page || !page->mapping) { mpfn = 0; goto next; } @@ -2420,14 +2461,6 @@ static bool migrate_vma_check_page(struct page *page) */ int extra = 1; - /* - * FIXME support THP (transparent huge page), it is bit more complex to - * check them than regular pages, because they can be mapped with a pmd - * or with a pte (split pte mapping). - */ - if (PageCompound(page)) - return false; - /* Page from ZONE_DEVICE have one extra reference */ if (is_zone_device_page(page)) { /* @@ -2726,13 +2759,115 @@ int migrate_vma_setup(struct migrate_vma *args) } EXPORT_SYMBOL(migrate_vma_setup); +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION +/* + * This code closely follows: + * do_huge_pmd_anonymous_page() + * __do_huge_pmd_anonymous_page() + * except that the page being inserted is likely to be a device private page + * instead of an allocated or zero page. 
+ */ +static int insert_huge_pmd_anonymous_page(struct vm_area_struct *vma, + unsigned long haddr, + struct page *page, + unsigned long *src, + unsigned long *dst, + pmd_t *pmdp) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned int i; + spinlock_t *ptl; + bool flush = false; + pgtable_t pgtable; + gfp_t gfp; + pmd_t entry; + + if (WARN_ON_ONCE(compound_order(page) != HPAGE_PMD_ORDER)) + goto abort; + + if (unlikely(anon_vma_prepare(vma))) + goto abort; + + prep_transhuge_page(page); + + gfp = GFP_TRANSHUGE_LIGHT; + if (mem_cgroup_charge(page, mm, gfp)) + goto abort; + + pgtable = pte_alloc_one(mm); + if (unlikely(!pgtable)) + goto abort; + + __SetPageUptodate(page); + + if (is_zone_device_page(page)) { + if (!is_device_private_page(page)) + goto pgtable_abort; + entry = swp_entry_to_pmd(make_device_private_entry(page, + vma->vm_flags & VM_WRITE)); + } else { + entry = mk_huge_pmd(page, vma->vm_page_prot); + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + } + + ptl = pmd_lock(mm, pmdp); + + if (check_stable_address_space(mm)) + goto unlock_abort; + + /* + * Check for userfaultfd but do not deliver the fault. Instead, + * just back off. + */ + if (userfaultfd_missing(vma)) + goto unlock_abort; + + if (pmd_present(*pmdp)) { + if (!is_huge_zero_pmd(*pmdp)) + goto unlock_abort; + flush = true; + } else if (!pmd_none(*pmdp)) + goto unlock_abort; + + get_page(page); + page_add_new_anon_rmap(page, vma, haddr, true); + if (!is_device_private_page(page)) + lru_cache_add_active_or_unevictable(page, vma); + if (flush) { + pte_free(mm, pgtable); + flush_cache_range(vma, haddr, haddr + HPAGE_PMD_SIZE); + pmdp_invalidate(vma, haddr, pmdp); + } else { + pgtable_trans_huge_deposit(mm, pmdp, pgtable); + mm_inc_nr_ptes(mm); + } + set_pmd_at(mm, haddr, pmdp, entry); + update_mmu_cache_pmd(vma, haddr, pmdp); + add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); + spin_unlock(ptl); + count_vm_event(THP_FAULT_ALLOC); + count_memcg_event_mm(mm, THP_FAULT_ALLOC); + + return 0; + +unlock_abort: + spin_unlock(ptl); +pgtable_abort: + pte_free(mm, pgtable); +abort: + for (i = 0; i < HPAGE_PMD_NR; i++) + src[i] &= ~MIGRATE_PFN_MIGRATE; + return -EINVAL; +} +#endif + /* * This code closely matches the code in: * __handle_mm_fault() * handle_pte_fault() * do_anonymous_page() - * to map in an anonymous zero page but the struct page will be a ZONE_DEVICE - * private page. + * to map in an anonymous zero page except the struct page is already allocated + * and will likely be a ZONE_DEVICE private page. 
*/ static void migrate_vma_insert_page(struct migrate_vma *migrate, unsigned long addr, @@ -2766,7 +2901,16 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, if (!pmdp) goto abort; - if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp)) +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION + if (*dst & MIGRATE_PFN_COMPOUND) { + int ret = insert_huge_pmd_anonymous_page(vma, addr, page, src, + dst, pmdp); + if (ret) + goto abort; + return; + } +#endif + if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp) || pmd_bad(*pmdp)) goto abort; /* @@ -2804,7 +2948,8 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE); entry = swp_entry_to_pte(swp_entry); - } + } else + goto abort; } else { entry = mk_pte(page, vma->vm_page_prot); if (vma->vm_flags & VM_WRITE) @@ -2833,10 +2978,10 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, goto unlock_abort; inc_mm_counter(mm, MM_ANONPAGES); + get_page(page); page_add_new_anon_rmap(page, vma, addr, false); if (!is_zone_device_page(page)) lru_cache_add_active_or_unevictable(page, vma); - get_page(page); if (flush) { flush_cache_page(vma, addr, pte_pfn(*ptep)); @@ -2850,7 +2995,6 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, } pte_unmap_unlock(ptep, ptl); - *src = MIGRATE_PFN_MIGRATE; return; unlock_abort: diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 48eb0f1410d4..a852ed2f204c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -686,6 +686,7 @@ void prep_compound_page(struct page *page, unsigned int order) if (hpage_pincount_available(page)) atomic_set(compound_pincount_ptr(page), 0); } +EXPORT_SYMBOL_GPL(prep_compound_page); #ifdef CONFIG_DEBUG_PAGEALLOC unsigned int _debug_guardpage_minorder;
Support transparent huge page migration to ZONE_DEVICE private memory.
A new flag (MIGRATE_PFN_COMPOUND) is added to the input PFN array to
indicate the huge page was fully mapped by the CPU.
Export prep_compound_page() so that device drivers can create huge
device private pages after calling memremap_pages().

Signed-off-by: Ralph Campbell <rcampbell@nvidia.com>
---
 include/linux/migrate.h |   1 +
 include/linux/mm.h      |   1 +
 mm/huge_memory.c        |  30 ++++--
 mm/internal.h           |   1 -
 mm/memory.c             |  10 +-
 mm/memremap.c           |   9 +-
 mm/migrate.c            | 226 ++++++++++++++++++++++++++++++++--------
 mm/page_alloc.c         |   1 +
 8 files changed, 226 insertions(+), 53 deletions(-)
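[Editor's note] To illustrate what the newly exported prep_compound_page() and the MIGRATE_PFN_COMPOUND flag are for, here is a minimal driver-side sketch of filling one destination entry with a PMD-sized device private page when migrating to the device. How the driver finds a free, properly aligned chunk of its memremap_pages() region is an assumption loosely modelled on nouveau's dmem allocator; only the prep_compound_page() call and the flag usage come from this patch.

#include <linux/huge_mm.h>
#include <linux/migrate.h>
#include <linux/mm.h>
#include <linux/pagemap.h>

/*
 * Sketch only: @dev_pfn is the first PFN of a free, HPAGE_PMD_NR aligned
 * chunk of device private memory created earlier by memremap_pages().
 */
static void devmem_fill_dst_huge(struct migrate_vma *args, unsigned long idx,
				 unsigned long dev_pfn)
{
	struct page *dpage = pfn_to_page(dev_pfn);

	/* Newly exported by this patch: turn the chunk into one compound page. */
	prep_compound_page(dpage, HPAGE_PMD_ORDER);

	/* Same per-page protocol drivers already use for base pages. */
	get_page(dpage);
	lock_page(dpage);

	/* Tell the migrate core this destination covers a whole PMD. */
	args->dst[idx] = migrate_pfn(dev_pfn) | MIGRATE_PFN_LOCKED |
			 MIGRATE_PFN_COMPOUND;
}

On the source side, a src[] entry with MIGRATE_PFN_COMPOUND set is what migrate_vma_collect_pmd() in this patch produces when the CPU mapping is a whole PMD, so a driver can use that bit to decide whether to set up a huge destination like the one above.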