Message ID | aef43be2-f877-b0f8-b41c-37f847d3a7b4@google.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | mm: allow pte_offset_map[_lock]() to fail | expand |
On Sun, May 21, 2023 at 10:24 PM Hugh Dickins <hughd@google.com> wrote: > > __collapse_huge_page_swapin(): don't drop the map after every pte, it > only has to be dropped by do_swap_page(); give up if pte_offset_map() > fails; trace_mm_collapse_huge_page_swapin() at the end, with result; > fix comment on returned result; fix vmf.pgoff, though it's not used. > > collapse_huge_page(): use pte_offset_map_lock() on the _pmd returned > from clearing; allow failure, but it should be impossible there. > hpage_collapse_scan_pmd() and collapse_pte_mapped_thp() allow for > pte_offset_map_lock() failure. > > Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Yang Shi <shy828301@gmail.com> A nit below: > --- > mm/khugepaged.c | 72 +++++++++++++++++++++++++++++++++---------------- > 1 file changed, 49 insertions(+), 23 deletions(-) > > diff --git a/mm/khugepaged.c b/mm/khugepaged.c > index 732f9ac393fc..49cfa7cdfe93 100644 > --- a/mm/khugepaged.c > +++ b/mm/khugepaged.c > @@ -993,9 +993,8 @@ static int check_pmd_still_valid(struct mm_struct *mm, > * Only done if hpage_collapse_scan_pmd believes it is worthwhile. > * > * Called and returns without pte mapped or spinlocks held. > - * Note that if false is returned, mmap_lock will be released. > + * Returns result: if not SCAN_SUCCEED, mmap_lock has been released. 
> */ > - > static int __collapse_huge_page_swapin(struct mm_struct *mm, > struct vm_area_struct *vma, > unsigned long haddr, pmd_t *pmd, > @@ -1004,23 +1003,35 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm, > int swapped_in = 0; > vm_fault_t ret = 0; > unsigned long address, end = haddr + (HPAGE_PMD_NR * PAGE_SIZE); > + int result; > + pte_t *pte = NULL; > > for (address = haddr; address < end; address += PAGE_SIZE) { > struct vm_fault vmf = { > .vma = vma, > .address = address, > - .pgoff = linear_page_index(vma, haddr), > + .pgoff = linear_page_index(vma, address), > .flags = FAULT_FLAG_ALLOW_RETRY, > .pmd = pmd, > }; > > - vmf.pte = pte_offset_map(pmd, address); > - vmf.orig_pte = *vmf.pte; > - if (!is_swap_pte(vmf.orig_pte)) { > - pte_unmap(vmf.pte); > - continue; > + if (!pte++) { > + pte = pte_offset_map(pmd, address); > + if (!pte) { > + mmap_read_unlock(mm); > + result = SCAN_PMD_NULL; > + goto out; > + } > } > + > + vmf.orig_pte = *pte; > + if (!is_swap_pte(vmf.orig_pte)) > + continue; > + > + vmf.pte = pte; > ret = do_swap_page(&vmf); > + /* Which unmaps pte (after perhaps re-checking the entry) */ > + pte = NULL; > > /* > * do_swap_page returns VM_FAULT_RETRY with released mmap_lock. > @@ -1029,24 +1040,29 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm, > * resulting in later failure. > */ > if (ret & VM_FAULT_RETRY) { > - trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); > /* Likely, but not guaranteed, that page lock failed */ > - return SCAN_PAGE_LOCK; > + result = SCAN_PAGE_LOCK; With per-VMA lock, this may not be true anymore, at least not true until per-VMA lock supports swap fault. It may be better to have a more general failure code, for example, SCAN_FAIL. But anyway you don't have to change it in your patch, I can send a follow-up patch once this series is landed on mm-unstable. 
> + goto out; > } > if (ret & VM_FAULT_ERROR) { > mmap_read_unlock(mm); > - trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); > - return SCAN_FAIL; > + result = SCAN_FAIL; > + goto out; > } > swapped_in++; > } > > + if (pte) > + pte_unmap(pte); > + > /* Drain LRU add pagevec to remove extra pin on the swapped in pages */ > if (swapped_in) > lru_add_drain(); > > - trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1); > - return SCAN_SUCCEED; > + result = SCAN_SUCCEED; > +out: > + trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, result); > + return result; > } > > static int alloc_charge_hpage(struct page **hpage, struct mm_struct *mm, > @@ -1146,9 +1162,6 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, > address + HPAGE_PMD_SIZE); > mmu_notifier_invalidate_range_start(&range); > > - pte = pte_offset_map(pmd, address); > - pte_ptl = pte_lockptr(mm, pmd); > - > pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */ > /* > * This removes any huge TLB entry from the CPU so we won't allow > @@ -1163,13 +1176,18 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, > mmu_notifier_invalidate_range_end(&range); > tlb_remove_table_sync_one(); > > - spin_lock(pte_ptl); > - result = __collapse_huge_page_isolate(vma, address, pte, cc, > - &compound_pagelist); > - spin_unlock(pte_ptl); > + pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl); > + if (pte) { > + result = __collapse_huge_page_isolate(vma, address, pte, cc, > + &compound_pagelist); > + spin_unlock(pte_ptl); > + } else { > + result = SCAN_PMD_NULL; > + } > > if (unlikely(result != SCAN_SUCCEED)) { > - pte_unmap(pte); > + if (pte) > + pte_unmap(pte); > spin_lock(pmd_ptl); > BUG_ON(!pmd_none(*pmd)); > /* > @@ -1253,6 +1271,11 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, > memset(cc->node_load, 0, sizeof(cc->node_load)); > nodes_clear(cc->alloc_nmask); > pte = pte_offset_map_lock(mm, pmd, address, &ptl); 
> + if (!pte) { > + result = SCAN_PMD_NULL; > + goto out; > + } > + > for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR; > _pte++, _address += PAGE_SIZE) { > pte_t pteval = *_pte; > @@ -1622,8 +1645,10 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, > * lockless_pages_from_mm() and the hardware page walker can access page > * tables while all the high-level locks are held in write mode. > */ > - start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl); > result = SCAN_FAIL; > + start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl); > + if (!start_pte) > + goto drop_immap; > > /* step 1: check all mapped PTEs are to the right huge page */ > for (i = 0, addr = haddr, pte = start_pte; > @@ -1697,6 +1722,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, > > abort: > pte_unmap_unlock(start_pte, ptl); > +drop_immap: > i_mmap_unlock_write(vma->vm_file->f_mapping); > goto drop_hpage; > } > -- > 2.35.3 >
On Mon, 22 May 2023, Yang Shi wrote: > On Sun, May 21, 2023 at 10:24 PM Hugh Dickins <hughd@google.com> wrote: > > > > __collapse_huge_page_swapin(): don't drop the map after every pte, it > > only has to be dropped by do_swap_page(); give up if pte_offset_map() > > fails; trace_mm_collapse_huge_page_swapin() at the end, with result; > > fix comment on returned result; fix vmf.pgoff, though it's not used. > > > > collapse_huge_page(): use pte_offset_map_lock() on the _pmd returned > > from clearing; allow failure, but it should be impossible there. > > hpage_collapse_scan_pmd() and collapse_pte_mapped_thp() allow for > > pte_offset_map_lock() failure. > > > > Signed-off-by: Hugh Dickins <hughd@google.com> > > Reviewed-by: Yang Shi <shy828301@gmail.com> Thanks. > > A nit below: > > > --- > > mm/khugepaged.c | 72 +++++++++++++++++++++++++++++++++---------------- > > 1 file changed, 49 insertions(+), 23 deletions(-) > > > > diff --git a/mm/khugepaged.c b/mm/khugepaged.c > > index 732f9ac393fc..49cfa7cdfe93 100644 > > --- a/mm/khugepaged.c > > +++ b/mm/khugepaged.c ... > > @@ -1029,24 +1040,29 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm, > > * resulting in later failure. > > */ > > if (ret & VM_FAULT_RETRY) { > > - trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); > > /* Likely, but not guaranteed, that page lock failed */ > > - return SCAN_PAGE_LOCK; > > + result = SCAN_PAGE_LOCK; > > With per-VMA lock, this may not be true anymore, at least not true > until per-VMA lock supports swap fault. It may be better to have a > more general failure code, for example, SCAN_FAIL. But anyway you > don't have to change it in your patch, I can send a follow-up patch > once this series is landed on mm-unstable. Interesting point (I've not tried to wrap my head around what differences per-VMA locking would make to old likelihoods here), and thank you for deferring a change on it - appreciated. 
Something to beware of, if you do choose to change it: mostly those SCAN codes (I'm not a fan of them!) are only for a tracepoint somewhere, but madvise_collapse() and madvise_collapse_errno() take some of them more seriously than others - I think SCAN_PAGE_LOCK ends up as an EAGAIN (rightly), but SCAN_FAIL as an EINVAL (depends).

But maybe there are layers in between which do not propagate the result code, I didn't check. All in all, not something I'd spend time on myself.

Hugh
On Tue, May 23, 2023 at 9:44 PM Hugh Dickins <hughd@google.com> wrote: > > On Mon, 22 May 2023, Yang Shi wrote: > > On Sun, May 21, 2023 at 10:24 PM Hugh Dickins <hughd@google.com> wrote: > > > > > > __collapse_huge_page_swapin(): don't drop the map after every pte, it > > > only has to be dropped by do_swap_page(); give up if pte_offset_map() > > > fails; trace_mm_collapse_huge_page_swapin() at the end, with result; > > > fix comment on returned result; fix vmf.pgoff, though it's not used. > > > > > > collapse_huge_page(): use pte_offset_map_lock() on the _pmd returned > > > from clearing; allow failure, but it should be impossible there. > > > hpage_collapse_scan_pmd() and collapse_pte_mapped_thp() allow for > > > pte_offset_map_lock() failure. > > > > > > Signed-off-by: Hugh Dickins <hughd@google.com> > > > > Reviewed-by: Yang Shi <shy828301@gmail.com> > > Thanks. > > > > > A nit below: > > > > > --- > > > mm/khugepaged.c | 72 +++++++++++++++++++++++++++++++++---------------- > > > 1 file changed, 49 insertions(+), 23 deletions(-) > > > > > > diff --git a/mm/khugepaged.c b/mm/khugepaged.c > > > index 732f9ac393fc..49cfa7cdfe93 100644 > > > --- a/mm/khugepaged.c > > > +++ b/mm/khugepaged.c > ... > > > @@ -1029,24 +1040,29 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm, > > > * resulting in later failure. > > > */ > > > if (ret & VM_FAULT_RETRY) { > > > - trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); > > > /* Likely, but not guaranteed, that page lock failed */ > > > - return SCAN_PAGE_LOCK; > > > + result = SCAN_PAGE_LOCK; > > > > With per-VMA lock, this may not be true anymore, at least not true > > until per-VMA lock supports swap fault. It may be better to have a > > more general failure code, for example, SCAN_FAIL. But anyway you > > don't have to change it in your patch, I can send a follow-up patch > > once this series is landed on mm-unstable. 
> > Interesting point (I've not tried to wrap my head around what differences > per-VMA locking would make to old likelihoods here), and thank you for > deferring a change on it - appreciated. > > Something to beware of, if you do choose to change it: mostly those > SCAN codes (I'm not a fan of them!) are only for a tracepoint somewhere, > but madvise_collapse() and madvise_collapse_errno() take some of them > more seriously than others - I think SCAN_PAGE_LOCK ends up as an > EAGAIN (rightly), but SCAN_FAIL as an EINVAL (depends). > > But maybe there are layers in between which do not propagate the result > code, I didn't check. All in all, not something I'd spend time on myself. Thanks, Hugh. A second look shows do_swap_page() should not return VM_FAULT_RETRY due to per-VMA lock since it depends on FAULT_FLAG_VMA_LOCK flag, but it is actually not set in khugepaged path. Khugepaged just has FAULT_FLAG_ALLOW_RETRY flag set. So we don't have to change anything. > > Hugh
diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 732f9ac393fc..49cfa7cdfe93 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -993,9 +993,8 @@ static int check_pmd_still_valid(struct mm_struct *mm, * Only done if hpage_collapse_scan_pmd believes it is worthwhile. * * Called and returns without pte mapped or spinlocks held. - * Note that if false is returned, mmap_lock will be released. + * Returns result: if not SCAN_SUCCEED, mmap_lock has been released. */ - static int __collapse_huge_page_swapin(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, @@ -1004,23 +1003,35 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm, int swapped_in = 0; vm_fault_t ret = 0; unsigned long address, end = haddr + (HPAGE_PMD_NR * PAGE_SIZE); + int result; + pte_t *pte = NULL; for (address = haddr; address < end; address += PAGE_SIZE) { struct vm_fault vmf = { .vma = vma, .address = address, - .pgoff = linear_page_index(vma, haddr), + .pgoff = linear_page_index(vma, address), .flags = FAULT_FLAG_ALLOW_RETRY, .pmd = pmd, }; - vmf.pte = pte_offset_map(pmd, address); - vmf.orig_pte = *vmf.pte; - if (!is_swap_pte(vmf.orig_pte)) { - pte_unmap(vmf.pte); - continue; + if (!pte++) { + pte = pte_offset_map(pmd, address); + if (!pte) { + mmap_read_unlock(mm); + result = SCAN_PMD_NULL; + goto out; + } } + + vmf.orig_pte = *pte; + if (!is_swap_pte(vmf.orig_pte)) + continue; + + vmf.pte = pte; ret = do_swap_page(&vmf); + /* Which unmaps pte (after perhaps re-checking the entry) */ + pte = NULL; /* * do_swap_page returns VM_FAULT_RETRY with released mmap_lock. @@ -1029,24 +1040,29 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm, * resulting in later failure. 
*/ if (ret & VM_FAULT_RETRY) { - trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); /* Likely, but not guaranteed, that page lock failed */ - return SCAN_PAGE_LOCK; + result = SCAN_PAGE_LOCK; + goto out; } if (ret & VM_FAULT_ERROR) { mmap_read_unlock(mm); - trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); - return SCAN_FAIL; + result = SCAN_FAIL; + goto out; } swapped_in++; } + if (pte) + pte_unmap(pte); + /* Drain LRU add pagevec to remove extra pin on the swapped in pages */ if (swapped_in) lru_add_drain(); - trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1); - return SCAN_SUCCEED; + result = SCAN_SUCCEED; +out: + trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, result); + return result; } static int alloc_charge_hpage(struct page **hpage, struct mm_struct *mm, @@ -1146,9 +1162,6 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, address + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); - pte = pte_offset_map(pmd, address); - pte_ptl = pte_lockptr(mm, pmd); - pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */ /* * This removes any huge TLB entry from the CPU so we won't allow @@ -1163,13 +1176,18 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, mmu_notifier_invalidate_range_end(&range); tlb_remove_table_sync_one(); - spin_lock(pte_ptl); - result = __collapse_huge_page_isolate(vma, address, pte, cc, - &compound_pagelist); - spin_unlock(pte_ptl); + pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl); + if (pte) { + result = __collapse_huge_page_isolate(vma, address, pte, cc, + &compound_pagelist); + spin_unlock(pte_ptl); + } else { + result = SCAN_PMD_NULL; + } if (unlikely(result != SCAN_SUCCEED)) { - pte_unmap(pte); + if (pte) + pte_unmap(pte); spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); /* @@ -1253,6 +1271,11 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, memset(cc->node_load, 0, sizeof(cc->node_load)); 
nodes_clear(cc->alloc_nmask); pte = pte_offset_map_lock(mm, pmd, address, &ptl); + if (!pte) { + result = SCAN_PMD_NULL; + goto out; + } + for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, _address += PAGE_SIZE) { pte_t pteval = *_pte; @@ -1622,8 +1645,10 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, * lockless_pages_from_mm() and the hardware page walker can access page * tables while all the high-level locks are held in write mode. */ - start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl); result = SCAN_FAIL; + start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl); + if (!start_pte) + goto drop_immap; /* step 1: check all mapped PTEs are to the right huge page */ for (i = 0, addr = haddr, pte = start_pte; @@ -1697,6 +1722,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, abort: pte_unmap_unlock(start_pte, ptl); +drop_immap: i_mmap_unlock_write(vma->vm_file->f_mapping); goto drop_hpage; }
__collapse_huge_page_swapin(): don't drop the map after every pte, it only has to be dropped by do_swap_page(); give up if pte_offset_map() fails; trace_mm_collapse_huge_page_swapin() at the end, with result; fix comment on returned result; fix vmf.pgoff, though it's not used.

collapse_huge_page(): use pte_offset_map_lock() on the _pmd returned from clearing; allow failure, but it should be impossible there. hpage_collapse_scan_pmd() and collapse_pte_mapped_thp() allow for pte_offset_map_lock() failure.

Signed-off-by: Hugh Dickins <hughd@google.com>
---
 mm/khugepaged.c | 72 +++++++++++++++++++++++++++++++++----------------
 1 file changed, 49 insertions(+), 23 deletions(-)