@@ -293,6 +293,10 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmd, int flags, struct dev_pagemap **pgmap);
struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
pud_t *pud, int flags, struct dev_pagemap **pgmap);
+long follow_devmap_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ struct page **pages, struct vm_area_struct **vmas,
+ unsigned long *position, unsigned long *nr_pages,
+ long i, unsigned int flags, int *locked);
extern vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd);
@@ -1164,6 +1164,8 @@ static inline void get_page(struct page *page)
page_ref_inc(page);
}
+struct page *try_grab_compound_head(struct page *page, int refs,
+				    unsigned int flags);
bool __must_check try_grab_page(struct page *page, unsigned int flags);
static inline __must_check bool try_get_page(struct page *page)
@@ -78,7 +78,7 @@ static inline struct page *try_get_compound_head(struct page *page, int refs)
* considered failure, and furthermore, a likely bug in the caller, so a warning
* is also emitted.
*/
-static __maybe_unused struct page *try_grab_compound_head(struct page *page,
+struct page *try_grab_compound_head(struct page *page,
int refs,
unsigned int flags)
{
@@ -880,8 +880,8 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
* does not include FOLL_NOWAIT, the mmap_lock may be released. If it
* is, *@locked will be set to 0 and -EBUSY returned.
*/
-static int faultin_page(struct vm_area_struct *vma,
- unsigned long address, unsigned int *flags, int *locked)
+int faultin_page(struct vm_area_struct *vma,
+ unsigned long address, unsigned int *flags, int *locked)
{
unsigned int fault_flags = 0;
vm_fault_t ret;
@@ -1103,6 +1103,22 @@ static long __get_user_pages(struct mm_struct *mm,
}
continue;
}
+			if (vma_is_dax(vma) &&
+			    vma_kernel_pagesize(vma) != PAGE_SIZE) {
+ i = follow_devmap_page(mm, vma, pages, vmas,
+ &start, &nr_pages, i,
+ gup_flags, locked);
+ if (locked && *locked == 0) {
+ /*
+ * We've got a VM_FAULT_RETRY
+ * and we've lost mmap_lock.
+ * We must stop here.
+ */
+ BUG_ON(gup_flags & FOLL_NOWAIT);
+ BUG_ON(ret != 0);
+ goto out;
+ }
+ continue;
+ }
}
retry:
/*
@@ -1168,6 +1168,208 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
return page;
}
+long follow_devmap_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ struct page **pages, struct vm_area_struct **vmas,
+ unsigned long *position, unsigned long *nr_pages,
+ long i, unsigned int flags, int *locked)
+{
+ unsigned long pfn_offset;
+ unsigned long vaddr = *position;
+ unsigned long remainder = *nr_pages;
+ unsigned long align = vma_kernel_pagesize(vma);
+ unsigned long align_nr_pages = align >> PAGE_SHIFT;
+	unsigned long mask = ~(align - 1);
+ unsigned long nr_pages_hpage = 0;
+ struct dev_pagemap *pgmap = NULL;
+ int err = -EFAULT;
+
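+	/*
+	 * For a PMD-mapped device-dax vma, align is 2M, align_nr_pages is
+	 * 512 base pages, and mask strips the in-hugepage offset from a
+	 * virtual address. There is nothing to batch for a 4K-mapped vma;
+	 * callers are expected to filter those out, so the check below is
+	 * only a safety net.
+	 */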
+ if (align == PAGE_SIZE)
+ return i;
+
+ while (vaddr < vma->vm_end && remainder) {
+ pte_t *pte;
+ spinlock_t *ptl = NULL;
+ int absent;
+ struct page *page;
+
+ /*
+ * If we have a pending SIGKILL, don't keep faulting pages and
+ * potentially allocating memory.
+ */
+ if (fatal_signal_pending(current)) {
+ remainder = 0;
+ break;
+ }
+
+ /*
+		 * Some archs (sparc64, sh*) have multiple pte_t entries per
+		 * hugepage. We have to make sure we get the first, for the
+		 * page indexing below to work.
+		 *
+		 * Note that the page table lock is not held when pte is null.
+ */
+ pte = huge_pte_offset(mm, vaddr & mask, align);
+ if (pte) {
+ if (align == PMD_SIZE)
+ ptl = pmd_lockptr(mm, (pmd_t *) pte);
+ else if (align == PUD_SIZE)
+ ptl = pud_lockptr(mm, (pud_t *) pte);
+ spin_lock(ptl);
+ }
+ absent = !pte || pte_none(ptep_get(pte));
+
+ if (absent && (flags & FOLL_DUMP)) {
+ if (pte)
+ spin_unlock(ptl);
+ remainder = 0;
+ break;
+ }
+
+ if (absent ||
+ ((flags & FOLL_WRITE) &&
+ !pte_write(ptep_get(pte)))) {
+ vm_fault_t ret;
+ unsigned int fault_flags = 0;
+
+ if (pte)
+ spin_unlock(ptl);
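+			/* Translate FOLL_* gup flags into FAULT_FLAG_* bits. */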
+ if (flags & FOLL_WRITE)
+ fault_flags |= FAULT_FLAG_WRITE;
+ if (locked)
+ fault_flags |= FAULT_FLAG_ALLOW_RETRY |
+ FAULT_FLAG_KILLABLE;
+ if (flags & FOLL_NOWAIT)
+ fault_flags |= FAULT_FLAG_ALLOW_RETRY |
+ FAULT_FLAG_RETRY_NOWAIT;
+ if (flags & FOLL_TRIED) {
+ /*
+ * Note: FAULT_FLAG_ALLOW_RETRY and
+ * FAULT_FLAG_TRIED can co-exist
+ */
+ fault_flags |= FAULT_FLAG_TRIED;
+ }
+			ret = handle_mm_fault(vma, vaddr, fault_flags, NULL);
+ if (ret & VM_FAULT_ERROR) {
+ err = vm_fault_to_errno(ret, flags);
+ remainder = 0;
+ break;
+ }
+ if (ret & VM_FAULT_RETRY) {
+ if (locked &&
+ !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
+ *locked = 0;
+ *nr_pages = 0;
+ /*
+ * VM_FAULT_RETRY must not return an
+ * error, it will return zero
+ * instead.
+ *
+ * No need to update "position" as the
+ * caller will not check it after
+ * *nr_pages is set to 0.
+ */
+ return i;
+ }
+ continue;
+ }
+
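+		/*
+		 * pfn_offset indexes the base page within the huge mapping;
+		 * mem_map_offset() below uses it to reach each subpage.
+		 */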
+ pfn_offset = (vaddr & ~mask) >> PAGE_SHIFT;
+ page = pte_page(ptep_get(pte));
+
+ pgmap = get_dev_pagemap(page_to_pfn(page), pgmap);
+ if (!pgmap) {
+ spin_unlock(ptl);
+ remainder = 0;
+ err = -EFAULT;
+ break;
+ }
+
+		/*
+		 * If subpage information is not requested, update counters
+		 * and skip the same_page loop below.
+		 */
+		if (!pages && !vmas && !pfn_offset &&
+		    (vaddr + align < vma->vm_end) &&
+		    (remainder >= align_nr_pages)) {
+			vaddr += align;
+			remainder -= align_nr_pages;
+			i += align_nr_pages;
+			spin_unlock(ptl);
+			/* Done with this huge page; drop its pgmap reference. */
+			put_dev_pagemap(pgmap);
+			pgmap = NULL;
+			continue;
+		}
+
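+		/*
+		 * Count the subpages returned from this huge page so that
+		 * the PGMAP_COMPOUND case below can take all references on
+		 * the head page in one go.
+		 */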
+ nr_pages_hpage = 0;
+
+same_page:
+ if (pages) {
+ pages[i] = mem_map_offset(page, pfn_offset);
+
+ /*
+ * try_grab_page() should always succeed here, because:
+ * a) we hold the ptl lock, and b) we've just checked
+ * that the huge page is present in the page tables.
+ */
+			if (!(pgmap->flags & PGMAP_COMPOUND) &&
+			    WARN_ON_ONCE(!try_grab_page(pages[i], flags))) {
+				/* Drop the pgmap reference taken above. */
+				put_dev_pagemap(pgmap);
+				spin_unlock(ptl);
+				remainder = 0;
+				err = -ENOMEM;
+				break;
+			}
+
+ }
+
+ if (vmas)
+ vmas[i] = vma;
+
+ vaddr += PAGE_SIZE;
+ ++pfn_offset;
+ --remainder;
+ ++i;
+ nr_pages_hpage++;
+ if (vaddr < vma->vm_end && remainder &&
+ pfn_offset < align_nr_pages) {
+ /*
+ * We use pfn_offset to avoid touching the pageframes
+ * of this compound page.
+ */
+ goto same_page;
+ } else {
+ /*
+ * try_grab_compound_head() should always succeed here,
+ * because: a) we hold the ptl lock, and b) we've just
+ * checked that the huge page is present in the page
+ * tables. If the huge page is present, then the tail
+ * pages must also be present. The ptl prevents the
+ * head page and tail pages from being rearranged in
+ * any way. So this page must be available at this
+ * point, unless the page refcount overflowed:
+ */
+			if (pages && (pgmap->flags & PGMAP_COMPOUND) &&
+			    WARN_ON_ONCE(!try_grab_compound_head(pages[i-1],
+								 nr_pages_hpage,
+								 flags))) {
+				put_dev_pagemap(pgmap);
+				spin_unlock(ptl);
+				remainder = 0;
+				err = -ENOMEM;
+				break;
+			}
+			put_dev_pagemap(pgmap);
+			/*
+			 * Drop the cached pgmap so that the next call to
+			 * get_dev_pagemap() takes a fresh reference.
+			 */
+			pgmap = NULL;
+ }
+ spin_unlock(ptl);
+ }
+ *nr_pages = remainder;
+ /*
+	 * Setting position is actually required only if remainder is
+	 * not zero, but it's faster not to add an "if (remainder)"
+	 * branch.
+ */
+ *position = vaddr;
+
+ return i ? i : err;
+}
+
int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
struct vm_area_struct *vma)
Similar to follow_hugetlb_page(), add a follow_devmap_page() which,
rather than calling follow_page() once per 4K page in a PMD/PUD,
handles the entire PMD/PUD in one go: lock the pmd/pud, get all the
pages, unlock. While doing so, the refcount is changed only once per
huge page when PGMAP_COMPOUND is set. This lets us improve
{pin,get}_user_pages{,_longterm}() considerably:

$ gup_benchmark -f /dev/dax0.2 -m 16384 -r 10 -S [-U,-b,-L] -n 512 -w

(<test>)                             [before]  ->  [after]
(get_user_pages 2M pages)            ~150k us  ->  ~8.9k us
(pin_user_pages 2M pages)            ~192k us  ->  ~9k us
(pin_user_pages_longterm 2M pages)   ~200k us  ->  ~19k us

Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
---
I've special-cased this to device-dax VMAs, given their hugetlbfs-like
page size guarantees, but this feels a bit wrong. I am replicating
follow_hugetlb_page(); this RFC seeks feedback on whether the approach
should be generalized, assuming no fundamental issues exist. In that
case, should follow_page_mask() be changed to take either an array of
pages, or a function pointer and opaque arguments that would let the
caller pick its own structure?
---
 include/linux/huge_mm.h |   4 +
 include/linux/mm.h      |   2 +
 mm/gup.c                |  22 ++-
 mm/huge_memory.c        | 202 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 227 insertions(+), 3 deletions(-)
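---
For reviewers, a minimal sketch of a kernel-side caller that would
exercise this path. example_pin_devdax() is hypothetical and not part
of this series; it assumes the current GUP calling convention
(mmap_lock held by the caller, pin_user_pages() still taking a vmas
argument):

#include <linux/mm.h>
#include <linux/sched.h>

/*
 * Pin one PMD's worth of base pages from a user mapping of a
 * device-dax instance. If uaddr falls in a PMD-mapped device-dax vma,
 * __get_user_pages() now routes the whole range through
 * follow_devmap_page(): the pmd lock is taken once and, with
 * PGMAP_COMPOUND, a single head-page reference covers all subpages.
 */
static int example_pin_devdax(unsigned long uaddr)
{
	const long nr = PMD_SIZE >> PAGE_SHIFT;	/* 512 on x86-64 */
	struct page **pages;
	long pinned;

	pages = kvmalloc_array(nr, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	mmap_read_lock(current->mm);
	pinned = pin_user_pages(uaddr, nr, FOLL_WRITE | FOLL_LONGTERM,
				pages, NULL);
	mmap_read_unlock(current->mm);

	if (pinned > 0)
		unpin_user_pages(pages, pinned);

	kvfree(pages);
	return pinned < 0 ? pinned : 0;
}

This is roughly what each iteration of the gup_benchmark runs quoted
above measures, minus the unpin.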