Message ID | 20220629035426.20013-4-alex.sierra@amd.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | Add MEMORY_DEVICE_COHERENT for coherent device memory mapping | expand |
On 29.06.22 05:54, Alex Sierra wrote: > With DEVICE_COHERENT, we'll soon have vm_normal_pages() return > device-managed anonymous pages that are not LRU pages. Although they > behave like normal pages for purposes of mapping in CPU page, and for > COW. They do not support LRU lists, NUMA migration or THP. > > Callers to follow_page that expect LRU pages, are also checked for > device zone pages due to DEVICE_COHERENT type. Can we rephrase that to (because zeropage) "Callers to follow_page() currently don't expect ZONE_DEVICE pages, however, with DEVICE_COHERENT we might now return ZONE_DEVICE. Check for ZONE_DEVICE pages in applicable users of follow_page() as well." [...] > /* > diff --git a/mm/memory.c b/mm/memory.c > index 7a089145cad4..e18555af9024 100644 > --- a/mm/memory.c > +++ b/mm/memory.c > @@ -624,6 +624,13 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, > if (is_zero_pfn(pfn)) > return NULL; > if (pte_devmap(pte)) > +/* > + * NOTE: New uers of ZONE_DEVICE will not set pte_devmap() and will have s/uers/users/ > + * refcounts incremented on their struct pages when they are inserted into > + * PTEs, thus they are safe to return here. Legacy ZONE_DEVICE pages that set > + * pte_devmap() do not have refcounts. Example of legacy ZONE_DEVICE is > + * MEMORY_DEVICE_FS_DAX type in pmem or virtio_fs drivers. > + */ [...] > diff --git a/mm/mprotect.c b/mm/mprotect.c > index ba5592655ee3..e034aae2a98b 100644 > --- a/mm/mprotect.c > +++ b/mm/mprotect.c > @@ -95,7 +95,7 @@ static unsigned long change_pte_range(struct mmu_gather *tlb, > continue; > > page = vm_normal_page(vma, addr, oldpte); > - if (!page || PageKsm(page)) > + if (!page || is_zone_device_page(page) || PageKsm(page)) > continue; > > /* Also skip shared copy-on-write pages */ In -next/-mm there is now an additional can_change_pte_writable() that calls vm_normal_page() -- added by me. 
I assume that this is indeed fine, because we can simply map device coherent pages writable. Besides the nits, LGTM. Acked-by: David Hildenbrand <david@redhat.com>
On Wed, 29 Jun 2022 11:59:26 +0200 David Hildenbrand <david@redhat.com> wrote: > On 29.06.22 05:54, Alex Sierra wrote: > > With DEVICE_COHERENT, we'll soon have vm_normal_pages() return > > device-managed anonymous pages that are not LRU pages. Although they > > behave like normal pages for purposes of mapping in CPU page, and for > > COW. They do not support LRU lists, NUMA migration or THP. > > > > Callers to follow_page that expect LRU pages, are also checked for > > device zone pages due to DEVICE_COHERENT type. > > Can we rephrase that to (because zeropage) > > "Callers to follow_page() currently don't expect ZONE_DEVICE pages, > however, with DEVICE_COHERENT we might now return ZONE_DEVICE. Check for > ZONE_DEVICE pages in applicable users of follow_page() as well." I made that change to my copy. > > --- a/mm/memory.c > > +++ b/mm/memory.c > > @@ -624,6 +624,13 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, > > if (is_zero_pfn(pfn)) > > return NULL; > > if (pte_devmap(pte)) > > +/* > > + * NOTE: New uers of ZONE_DEVICE will not set pte_devmap() and will have > > s/uers/users/ > > > + * refcounts incremented on their struct pages when they are inserted into > > + * PTEs, thus they are safe to return here. Legacy ZONE_DEVICE pages that set > > + * pte_devmap() do not have refcounts. Example of legacy ZONE_DEVICE is > > + * MEMORY_DEVICE_FS_DAX type in pmem or virtio_fs drivers. > > + */ And let's regularize that comment placement? --- a/mm/memory.c~mm-handling-non-lru-pages-returned-by-vm_normal_pages-fix +++ a/mm/memory.c @@ -632,16 +632,16 @@ struct page *vm_normal_page(struct vm_ar return NULL; if (is_zero_pfn(pfn)) return NULL; + /* + * NOTE: New users of ZONE_DEVICE will not set pte_devmap() + * and will have refcounts incremented on their struct pages + * when they are inserted into PTEs, thus they are safe to + * return here. Legacy ZONE_DEVICE pages that set pte_devmap() + * do not have refcounts. 
Example of legacy ZONE_DEVICE is + * MEMORY_DEVICE_FS_DAX type in pmem or virtio_fs drivers. + */ if (pte_devmap(pte)) -/* - * NOTE: New uers of ZONE_DEVICE will not set pte_devmap() and will have - * refcounts incremented on their struct pages when they are inserted into - * PTEs, thus they are safe to return here. Legacy ZONE_DEVICE pages that set - * pte_devmap() do not have refcounts. Example of legacy ZONE_DEVICE is - * MEMORY_DEVICE_FS_DAX type in pmem or virtio_fs drivers. - */ return NULL; - print_bad_pte(vma, addr, pte, NULL); return NULL; }
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 2d04e3470d4c..2dd8c8a66924 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1792,7 +1792,7 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma, return NULL; page = vm_normal_page(vma, addr, pte); - if (!page) + if (!page || is_zone_device_page(page)) return NULL; if (PageReserved(page)) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 834f288b3769..c47e95b02244 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2910,7 +2910,7 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start, if (IS_ERR(page)) continue; - if (!page) + if (!page || is_zone_device_page(page)) continue; if (!is_transparent_hugepage(page)) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 16be62d493cd..671ac7800e53 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -618,7 +618,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, goto out; } page = vm_normal_page(vma, address, pteval); - if (unlikely(!page)) { + if (unlikely(!page) || unlikely(is_zone_device_page(page))) { result = SCAN_PAGE_NULL; goto out; } @@ -1267,7 +1267,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, writable = true; page = vm_normal_page(vma, _address, pteval); - if (unlikely(!page)) { + if (unlikely(!page) || unlikely(is_zone_device_page(page))) { result = SCAN_PAGE_NULL; goto out_unmap; } @@ -1479,7 +1479,8 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) goto abort; page = vm_normal_page(vma, addr, *pte); - + if (WARN_ON_ONCE(page && is_zone_device_page(page))) + page = NULL; /* * Note that uprobe, debugger, or MAP_PRIVATE may change the * page table, but the new page will not be a subpage of hpage. 
@@ -1497,6 +1498,8 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) if (pte_none(*pte)) continue; page = vm_normal_page(vma, addr, *pte); + if (WARN_ON_ONCE(page && is_zone_device_page(page))) + goto abort; page_remove_rmap(page, vma, false); } diff --git a/mm/ksm.c b/mm/ksm.c index 54f78c9eecae..831b18a7a50b 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -475,7 +475,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr) cond_resched(); page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE); - if (IS_ERR_OR_NULL(page)) + if (IS_ERR_OR_NULL(page) || is_zone_device_page(page)) break; if (PageKsm(page)) ret = handle_mm_fault(vma, addr, @@ -560,7 +560,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item) goto out; page = follow_page(vma, addr, FOLL_GET); - if (IS_ERR_OR_NULL(page)) + if (IS_ERR_OR_NULL(page) || is_zone_device_page(page)) goto out; if (PageAnon(page)) { flush_anon_page(vma, page, addr); @@ -2308,7 +2308,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page) if (ksm_test_exit(mm)) break; *page = follow_page(vma, ksm_scan.address, FOLL_GET); - if (IS_ERR_OR_NULL(*page)) { + if (IS_ERR_OR_NULL(*page) || is_zone_device_page(*page)) { ksm_scan.address += PAGE_SIZE; cond_resched(); continue; diff --git a/mm/madvise.c b/mm/madvise.c index 0316bbc6441b..e252635fe935 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -421,7 +421,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, continue; page = vm_normal_page(vma, addr, ptent); - if (!page) + if (!page || is_zone_device_page(page)) continue; /* @@ -639,7 +639,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, } page = vm_normal_page(vma, addr, ptent); - if (!page) + if (!page || is_zone_device_page(page)) continue; /* diff --git a/mm/memory.c b/mm/memory.c index 7a089145cad4..e18555af9024 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -624,6 +624,13 @@ struct page *vm_normal_page(struct 
vm_area_struct *vma, unsigned long addr, if (is_zero_pfn(pfn)) return NULL; if (pte_devmap(pte)) +/* + * NOTE: New uers of ZONE_DEVICE will not set pte_devmap() and will have + * refcounts incremented on their struct pages when they are inserted into + * PTEs, thus they are safe to return here. Legacy ZONE_DEVICE pages that set + * pte_devmap() do not have refcounts. Example of legacy ZONE_DEVICE is + * MEMORY_DEVICE_FS_DAX type in pmem or virtio_fs drivers. + */ return NULL; print_bad_pte(vma, addr, pte, NULL); @@ -4693,7 +4700,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) pte = pte_modify(old_pte, vma->vm_page_prot); page = vm_normal_page(vma, vmf->address, pte); - if (!page) + if (!page || is_zone_device_page(page)) goto out_map; /* TODO: handle PTE-mapped THP */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index d39b01fd52fe..abc26890fc95 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -523,7 +523,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, if (!pte_present(*pte)) continue; page = vm_normal_page(vma, addr, *pte); - if (!page) + if (!page || is_zone_device_page(page)) continue; /* * vm_normal_page() filters out zero pages, but there might diff --git a/mm/migrate.c b/mm/migrate.c index 6c1ea61f39d8..a98a219d12ab 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1620,7 +1620,7 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr, goto out; err = -ENOENT; - if (!page) + if (!page || is_zone_device_page(page)) goto out; err = 0; @@ -1810,7 +1810,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, if (IS_ERR(page)) goto set_status; - if (page) { + if (page && !is_zone_device_page(page)) { err = page_to_nid(page); put_page(page); } else { diff --git a/mm/mlock.c b/mm/mlock.c index 716caf851043..b14e929084cc 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -333,7 +333,7 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr, if (!pte_present(*pte)) continue; page = 
vm_normal_page(vma, addr, *pte); - if (!page) + if (!page || is_zone_device_page(page)) continue; if (PageTransCompound(page)) continue; diff --git a/mm/mprotect.c b/mm/mprotect.c index ba5592655ee3..e034aae2a98b 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -95,7 +95,7 @@ static unsigned long change_pte_range(struct mmu_gather *tlb, continue; page = vm_normal_page(vma, addr, oldpte); - if (!page || PageKsm(page)) + if (!page || is_zone_device_page(page) || PageKsm(page)) continue; /* Also skip shared copy-on-write pages */