Message ID | 20240409082631.187483-5-21cnbao@gmail.com |
---|---|
State | New |
Series | large folios swap-in: handle refault cases first |
On 09/04/2024 09:26, Barry Song wrote: > From: Chuanhua Han <hanchuanhua@oppo.com> > > When a large folio is found in the swapcache, the current implementation > requires calling do_swap_page() nr_pages times, resulting in nr_pages > page faults. This patch opts to map the entire large folio at once to > minimize page faults. Additionally, redundant checks and early exits > for ARM64 MTE restoring are removed. > > Signed-off-by: Chuanhua Han <hanchuanhua@oppo.com> > Co-developed-by: Barry Song <v-songbaohua@oppo.com> > Signed-off-by: Barry Song <v-songbaohua@oppo.com> > --- > mm/memory.c | 64 +++++++++++++++++++++++++++++++++++++++++++---------- > 1 file changed, 52 insertions(+), 12 deletions(-) > > diff --git a/mm/memory.c b/mm/memory.c > index c4a52e8d740a..9818dc1893c8 100644 > --- a/mm/memory.c > +++ b/mm/memory.c > @@ -3947,6 +3947,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) > pte_t pte; > vm_fault_t ret = 0; > void *shadow = NULL; > + int nr_pages = 1; > + unsigned long start_address = vmf->address; > + pte_t *start_pte = vmf->pte; possible bug?: there are code paths that assign to vmf-pte below in this function, so couldn't start_pte be stale in some cases? I'd just do the assignment (all 4 of these variables in fact) in an else clause below, after any messing about with them is complete. nit: rename start_pte -> start_ptep ? > + bool any_swap_shared = false; Suggest you defer initialization of this to your "We hit large folios in swapcache" block below, and init it to: any_swap_shared = !pte_swp_exclusive(vmf->pte); Then the any_shared semantic in swap_pte_batch() can be the same as for folio_pte_batch(). > > if (!pte_unmap_same(vmf)) > goto out; > @@ -4137,6 +4141,35 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) > */ > vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, > &vmf->ptl); bug: vmf->pte may be NULL and you are not checking it until check_pte:. Byt you are using it in this block. It also seems odd to do all the work in the below block under the PTL but before checking if the pte has changed. Suggest moving both checks here. > + > + /* We hit large folios in swapcache */ > + if (start_pte && folio_test_large(folio) && folio_test_swapcache(folio)) { What's the start_pte check protecting? 
> + int nr = folio_nr_pages(folio); > + int idx = folio_page_idx(folio, page); > + unsigned long folio_start = vmf->address - idx * PAGE_SIZE; > + unsigned long folio_end = folio_start + nr * PAGE_SIZE; > + pte_t *folio_ptep; > + pte_t folio_pte; > + > + if (unlikely(folio_start < max(vmf->address & PMD_MASK, vma->vm_start))) > + goto check_pte; > + if (unlikely(folio_end > pmd_addr_end(vmf->address, vma->vm_end))) > + goto check_pte; > + > + folio_ptep = vmf->pte - idx; > + folio_pte = ptep_get(folio_ptep); > + if (!is_swap_pte(folio_pte) || non_swap_entry(pte_to_swp_entry(folio_pte)) || > + swap_pte_batch(folio_ptep, nr, folio_pte, &any_swap_shared) != nr) > + goto check_pte; > + > + start_address = folio_start; > + start_pte = folio_ptep; > + nr_pages = nr; > + entry = folio->swap; > + page = &folio->page; > + } > + > +check_pte: > if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) > goto out_nomap; > > @@ -4190,6 +4223,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) > */ > exclusive = false; > } > + > + /* Reuse the whole large folio iff all entries are exclusive */ > + if (nr_pages > 1 && any_swap_shared) > + exclusive = false; If you init any_shared with the firt pte as I suggested then you could just set exclusive = !any_shared at the top of this if block without needing this separate fixup. > } > > /* > @@ -4204,12 +4241,14 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) > * We're already holding a reference on the page but haven't mapped it > * yet. > */ > - swap_free(entry); > + swap_free_nr(entry, nr_pages); > if (should_try_to_free_swap(folio, vma, vmf->flags)) > folio_free_swap(folio); > > - inc_mm_counter(vma->vm_mm, MM_ANONPAGES); > - dec_mm_counter(vma->vm_mm, MM_SWAPENTS); > + folio_ref_add(folio, nr_pages - 1); > + add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages); > + add_mm_counter(vma->vm_mm, MM_SWAPENTS, -nr_pages); > + > pte = mk_pte(page, vma->vm_page_prot); > > /* > @@ -4219,33 +4258,34 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) > * exclusivity. > */ > if (!folio_test_ksm(folio) && > - (exclusive || folio_ref_count(folio) == 1)) { > + (exclusive || (folio_ref_count(folio) == nr_pages && > + folio_nr_pages(folio) == nr_pages))) { > if (vmf->flags & FAULT_FLAG_WRITE) { > pte = maybe_mkwrite(pte_mkdirty(pte), vma); > vmf->flags &= ~FAULT_FLAG_WRITE; > } > rmap_flags |= RMAP_EXCLUSIVE; > } > - flush_icache_page(vma, page); > + flush_icache_pages(vma, page, nr_pages); > if (pte_swp_soft_dirty(vmf->orig_pte)) > pte = pte_mksoft_dirty(pte); > if (pte_swp_uffd_wp(vmf->orig_pte)) > pte = pte_mkuffd_wp(pte); I'm not sure about all this... you are smearing these SW bits from the faulting PTE across all the ptes you are mapping. Although I guess actually that's ok because swap_pte_batch() only returns a batch with all these bits the same? 
> - vmf->orig_pte = pte; Instead of doing a readback below, perhaps: vmf->orig_pte = pte_advance_pfn(pte, nr_pages); > > /* ksm created a completely new copy */ > if (unlikely(folio != swapcache && swapcache)) { > - folio_add_new_anon_rmap(folio, vma, vmf->address); > + folio_add_new_anon_rmap(folio, vma, start_address); > folio_add_lru_vma(folio, vma); > } else { > - folio_add_anon_rmap_pte(folio, page, vma, vmf->address, > - rmap_flags); > + folio_add_anon_rmap_ptes(folio, page, nr_pages, vma, start_address, > + rmap_flags); > } > > VM_BUG_ON(!folio_test_anon(folio) || > (pte_write(pte) && !PageAnonExclusive(page))); > - set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); > - arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte); > + set_ptes(vma->vm_mm, start_address, start_pte, pte, nr_pages); > + vmf->orig_pte = ptep_get(vmf->pte); > + arch_do_swap_page(vma->vm_mm, vma, start_address, pte, pte); > > folio_unlock(folio); > if (folio != swapcache && swapcache) { > @@ -4269,7 +4309,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) > } > > /* No need to invalidate - it was non-present before */ > - update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1); > + update_mmu_cache_range(vmf, vma, start_address, start_pte, nr_pages); > unlock: > if (vmf->pte) > pte_unmap_unlock(vmf->pte, vmf->ptl);
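For reference, the containment test being reviewed above reduces to simple address arithmetic: the folio's span must lie entirely inside the PMD table covering the faulting address and inside the VMA, otherwise the code falls back to the single-page path. Below is a minimal userspace sketch of that arithmetic only (not kernel code; PAGE_SIZE, PMD_SIZE and the example VMA bounds are made-up constants for illustration):

```c
#include <stdbool.h>
#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PMD_SIZE  (512 * PAGE_SIZE)   /* reach of one PTE table, assumed */
#define PMD_MASK  (~(PMD_SIZE - 1))

/*
 * Model of the check: the folio spans [folio_start, folio_end) and must sit
 * entirely within the PMD that covers the faulting address and within the
 * VMA; otherwise batch-mapping is not attempted.
 */
static bool folio_fits(unsigned long fault_addr, unsigned long idx,
		       unsigned long nr, unsigned long vm_start,
		       unsigned long vm_end)
{
	unsigned long folio_start = fault_addr - idx * PAGE_SIZE;
	unsigned long folio_end = folio_start + nr * PAGE_SIZE;
	unsigned long pmd_start = fault_addr & PMD_MASK;
	unsigned long pmd_end = pmd_start + PMD_SIZE;

	if (folio_start < (pmd_start > vm_start ? pmd_start : vm_start))
		return false;
	if (folio_end > (pmd_end < vm_end ? pmd_end : vm_end))
		return false;
	return true;
}

int main(void)
{
	/* hypothetical VMA [0x100000, 0x900000), fault on page 3 of a 16-page folio */
	unsigned long vm_start = 0x100000, vm_end = 0x900000;
	unsigned long fault = 0x203000;

	printf("fits: %d\n", folio_fits(fault, 3, 16, vm_start, vm_end));
	return 0;
}
```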
On Fri, Apr 12, 2024 at 3:33 AM Ryan Roberts <ryan.roberts@arm.com> wrote: > > On 09/04/2024 09:26, Barry Song wrote: > > From: Chuanhua Han <hanchuanhua@oppo.com> > > > > When a large folio is found in the swapcache, the current implementation > > requires calling do_swap_page() nr_pages times, resulting in nr_pages > > page faults. This patch opts to map the entire large folio at once to > > minimize page faults. Additionally, redundant checks and early exits > > for ARM64 MTE restoring are removed. > > > > Signed-off-by: Chuanhua Han <hanchuanhua@oppo.com> > > Co-developed-by: Barry Song <v-songbaohua@oppo.com> > > Signed-off-by: Barry Song <v-songbaohua@oppo.com> > > --- > > mm/memory.c | 64 +++++++++++++++++++++++++++++++++++++++++++---------- > > 1 file changed, 52 insertions(+), 12 deletions(-) > > > > diff --git a/mm/memory.c b/mm/memory.c > > index c4a52e8d740a..9818dc1893c8 100644 > > --- a/mm/memory.c > > +++ b/mm/memory.c > > @@ -3947,6 +3947,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) > > pte_t pte; > > vm_fault_t ret = 0; > > void *shadow = NULL; > > + int nr_pages = 1; > > + unsigned long start_address = vmf->address; > > + pte_t *start_pte = vmf->pte; > > possible bug?: there are code paths that assign to vmf-pte below in this > function, so couldn't start_pte be stale in some cases? I'd just do the > assignment (all 4 of these variables in fact) in an else clause below, after any > messing about with them is complete. > > nit: rename start_pte -> start_ptep ? Agreed. > > > + bool any_swap_shared = false; > > Suggest you defer initialization of this to your "We hit large folios in > swapcache" block below, and init it to: > > any_swap_shared = !pte_swp_exclusive(vmf->pte); > > Then the any_shared semantic in swap_pte_batch() can be the same as for > folio_pte_batch(). > > > > > if (!pte_unmap_same(vmf)) > > goto out; > > @@ -4137,6 +4141,35 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) > > */ > > vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, > > &vmf->ptl); > > bug: vmf->pte may be NULL and you are not checking it until check_pte:. Byt you > are using it in this block. It also seems odd to do all the work in the below > block under the PTL but before checking if the pte has changed. Suggest moving > both checks here. agreed. > > > + > > + /* We hit large folios in swapcache */ > > + if (start_pte && folio_test_large(folio) && folio_test_swapcache(folio)) { > > What's the start_pte check protecting? This is exactly protecting the case vmf->pte==NULL but for some reason it was assigned in the beginning of the function incorrectly. The intention of the code was actually doing start_pte = vmf->pte after "vmf->pte = pte_offset_map_lock". 
> > > + int nr = folio_nr_pages(folio); > > + int idx = folio_page_idx(folio, page); > > + unsigned long folio_start = vmf->address - idx * PAGE_SIZE; > > + unsigned long folio_end = folio_start + nr * PAGE_SIZE; > > + pte_t *folio_ptep; > > + pte_t folio_pte; > > + > > + if (unlikely(folio_start < max(vmf->address & PMD_MASK, vma->vm_start))) > > + goto check_pte; > > + if (unlikely(folio_end > pmd_addr_end(vmf->address, vma->vm_end))) > > + goto check_pte; > > + > > + folio_ptep = vmf->pte - idx; > > + folio_pte = ptep_get(folio_ptep); > > + if (!is_swap_pte(folio_pte) || non_swap_entry(pte_to_swp_entry(folio_pte)) || > > + swap_pte_batch(folio_ptep, nr, folio_pte, &any_swap_shared) != nr) > > + goto check_pte; > > + > > + start_address = folio_start; > > + start_pte = folio_ptep; > > + nr_pages = nr; > > + entry = folio->swap; > > + page = &folio->page; > > + } > > + > > +check_pte: > > if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) > > goto out_nomap; > > > > @@ -4190,6 +4223,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) > > */ > > exclusive = false; > > } > > + > > + /* Reuse the whole large folio iff all entries are exclusive */ > > + if (nr_pages > 1 && any_swap_shared) > > + exclusive = false; > > If you init any_shared with the firt pte as I suggested then you could just set > exclusive = !any_shared at the top of this if block without needing this > separate fixup. Since your swap_pte_batch() function checks that all PTEs have the same exclusive bits, I'll be removing any_shared first in version 3 per David's suggestions. We could potentially develop "any_shared" as an incremental patchset later on . > > } > > > > /* > > @@ -4204,12 +4241,14 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) > > * We're already holding a reference on the page but haven't mapped it > > * yet. > > */ > > - swap_free(entry); > > + swap_free_nr(entry, nr_pages); > > if (should_try_to_free_swap(folio, vma, vmf->flags)) > > folio_free_swap(folio); > > > > - inc_mm_counter(vma->vm_mm, MM_ANONPAGES); > > - dec_mm_counter(vma->vm_mm, MM_SWAPENTS); > > + folio_ref_add(folio, nr_pages - 1); > > + add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages); > > + add_mm_counter(vma->vm_mm, MM_SWAPENTS, -nr_pages); > > + > > pte = mk_pte(page, vma->vm_page_prot); > > > > /* > > @@ -4219,33 +4258,34 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) > > * exclusivity. > > */ > > if (!folio_test_ksm(folio) && > > - (exclusive || folio_ref_count(folio) == 1)) { > > + (exclusive || (folio_ref_count(folio) == nr_pages && > > + folio_nr_pages(folio) == nr_pages))) { > > if (vmf->flags & FAULT_FLAG_WRITE) { > > pte = maybe_mkwrite(pte_mkdirty(pte), vma); > > vmf->flags &= ~FAULT_FLAG_WRITE; > > } > > rmap_flags |= RMAP_EXCLUSIVE; > > } > > - flush_icache_page(vma, page); > > + flush_icache_pages(vma, page, nr_pages); > > if (pte_swp_soft_dirty(vmf->orig_pte)) > > pte = pte_mksoft_dirty(pte); > > if (pte_swp_uffd_wp(vmf->orig_pte)) > > pte = pte_mkuffd_wp(pte); > > I'm not sure about all this... you are smearing these SW bits from the faulting > PTE across all the ptes you are mapping. Although I guess actually that's ok > because swap_pte_batch() only returns a batch with all these bits the same? Initially, I didn't recognize the issue at all because the tested architecture arm64 didn't include these bits. However, after reviewing your latest swpout series, which verifies the consistent bits for soft_dirty and uffd_wp, I now feel its safety even for platforms with these bits. 
> > > - vmf->orig_pte = pte; > > Instead of doing a readback below, perhaps: > > vmf->orig_pte = pte_advance_pfn(pte, nr_pages); Nice ! > > > > > /* ksm created a completely new copy */ > > if (unlikely(folio != swapcache && swapcache)) { > > - folio_add_new_anon_rmap(folio, vma, vmf->address); > > + folio_add_new_anon_rmap(folio, vma, start_address); > > folio_add_lru_vma(folio, vma); > > } else { > > - folio_add_anon_rmap_pte(folio, page, vma, vmf->address, > > - rmap_flags); > > + folio_add_anon_rmap_ptes(folio, page, nr_pages, vma, start_address, > > + rmap_flags); > > } > > > > VM_BUG_ON(!folio_test_anon(folio) || > > (pte_write(pte) && !PageAnonExclusive(page))); > > - set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); > > - arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte); > > + set_ptes(vma->vm_mm, start_address, start_pte, pte, nr_pages); > > + vmf->orig_pte = ptep_get(vmf->pte); > > + arch_do_swap_page(vma->vm_mm, vma, start_address, pte, pte); > > > > folio_unlock(folio); > > if (folio != swapcache && swapcache) { > > @@ -4269,7 +4309,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) > > } > > > > /* No need to invalidate - it was non-present before */ > > - update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1); > > + update_mmu_cache_range(vmf, vma, start_address, start_pte, nr_pages); > > unlock: > > if (vmf->pte) > > pte_unmap_unlock(vmf->pte, vmf->ptl); > Thanks Barry
On 12/04/2024 00:30, Barry Song wrote: > On Fri, Apr 12, 2024 at 3:33 AM Ryan Roberts <ryan.roberts@arm.com> wrote: >> >> On 09/04/2024 09:26, Barry Song wrote: >>> From: Chuanhua Han <hanchuanhua@oppo.com> >>> >>> When a large folio is found in the swapcache, the current implementation >>> requires calling do_swap_page() nr_pages times, resulting in nr_pages >>> page faults. This patch opts to map the entire large folio at once to >>> minimize page faults. Additionally, redundant checks and early exits >>> for ARM64 MTE restoring are removed. >>> >>> Signed-off-by: Chuanhua Han <hanchuanhua@oppo.com> >>> Co-developed-by: Barry Song <v-songbaohua@oppo.com> >>> Signed-off-by: Barry Song <v-songbaohua@oppo.com> >>> --- >>> mm/memory.c | 64 +++++++++++++++++++++++++++++++++++++++++++---------- >>> 1 file changed, 52 insertions(+), 12 deletions(-) >>> >>> diff --git a/mm/memory.c b/mm/memory.c >>> index c4a52e8d740a..9818dc1893c8 100644 >>> --- a/mm/memory.c >>> +++ b/mm/memory.c >>> @@ -3947,6 +3947,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) >>> pte_t pte; >>> vm_fault_t ret = 0; >>> void *shadow = NULL; >>> + int nr_pages = 1; >>> + unsigned long start_address = vmf->address; >>> + pte_t *start_pte = vmf->pte; >> >> possible bug?: there are code paths that assign to vmf-pte below in this >> function, so couldn't start_pte be stale in some cases? I'd just do the >> assignment (all 4 of these variables in fact) in an else clause below, after any >> messing about with them is complete. >> >> nit: rename start_pte -> start_ptep ? > > Agreed. > >> >>> + bool any_swap_shared = false; >> >> Suggest you defer initialization of this to your "We hit large folios in >> swapcache" block below, and init it to: >> >> any_swap_shared = !pte_swp_exclusive(vmf->pte); >> >> Then the any_shared semantic in swap_pte_batch() can be the same as for >> folio_pte_batch(). >> >>> >>> if (!pte_unmap_same(vmf)) >>> goto out; >>> @@ -4137,6 +4141,35 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) >>> */ >>> vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, >>> &vmf->ptl); >> >> bug: vmf->pte may be NULL and you are not checking it until check_pte:. Byt you >> are using it in this block. It also seems odd to do all the work in the below >> block under the PTL but before checking if the pte has changed. Suggest moving >> both checks here. > > agreed. > >> >>> + >>> + /* We hit large folios in swapcache */ >>> + if (start_pte && folio_test_large(folio) && folio_test_swapcache(folio)) { >> >> What's the start_pte check protecting? > > This is exactly protecting the case vmf->pte==NULL but for some reason it was > assigned in the beginning of the function incorrectly. The intention of the code > was actually doing start_pte = vmf->pte after "vmf->pte = pte_offset_map_lock". 
> >> >>> + int nr = folio_nr_pages(folio); >>> + int idx = folio_page_idx(folio, page); >>> + unsigned long folio_start = vmf->address - idx * PAGE_SIZE; >>> + unsigned long folio_end = folio_start + nr * PAGE_SIZE; >>> + pte_t *folio_ptep; >>> + pte_t folio_pte; >>> + >>> + if (unlikely(folio_start < max(vmf->address & PMD_MASK, vma->vm_start))) >>> + goto check_pte; >>> + if (unlikely(folio_end > pmd_addr_end(vmf->address, vma->vm_end))) >>> + goto check_pte; >>> + >>> + folio_ptep = vmf->pte - idx; >>> + folio_pte = ptep_get(folio_ptep); >>> + if (!is_swap_pte(folio_pte) || non_swap_entry(pte_to_swp_entry(folio_pte)) || >>> + swap_pte_batch(folio_ptep, nr, folio_pte, &any_swap_shared) != nr) >>> + goto check_pte; >>> + >>> + start_address = folio_start; >>> + start_pte = folio_ptep; >>> + nr_pages = nr; >>> + entry = folio->swap; >>> + page = &folio->page; >>> + } >>> + >>> +check_pte: >>> if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) >>> goto out_nomap; >>> >>> @@ -4190,6 +4223,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) >>> */ >>> exclusive = false; >>> } >>> + >>> + /* Reuse the whole large folio iff all entries are exclusive */ >>> + if (nr_pages > 1 && any_swap_shared) >>> + exclusive = false; >> >> If you init any_shared with the firt pte as I suggested then you could just set >> exclusive = !any_shared at the top of this if block without needing this >> separate fixup. > > Since your swap_pte_batch() function checks that all PTEs have the same > exclusive bits, I'll be removing any_shared first in version 3 per David's > suggestions. We could potentially develop "any_shared" as an incremental > patchset later on . Ahh yes, good point. I'll admit that your conversation about this went over my head at the time since I hadn't yet looked at this. > >>> } >>> >>> /* >>> @@ -4204,12 +4241,14 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) >>> * We're already holding a reference on the page but haven't mapped it >>> * yet. >>> */ >>> - swap_free(entry); >>> + swap_free_nr(entry, nr_pages); >>> if (should_try_to_free_swap(folio, vma, vmf->flags)) >>> folio_free_swap(folio); >>> >>> - inc_mm_counter(vma->vm_mm, MM_ANONPAGES); >>> - dec_mm_counter(vma->vm_mm, MM_SWAPENTS); >>> + folio_ref_add(folio, nr_pages - 1); >>> + add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages); >>> + add_mm_counter(vma->vm_mm, MM_SWAPENTS, -nr_pages); >>> + >>> pte = mk_pte(page, vma->vm_page_prot); >>> >>> /* >>> @@ -4219,33 +4258,34 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) >>> * exclusivity. >>> */ >>> if (!folio_test_ksm(folio) && >>> - (exclusive || folio_ref_count(folio) == 1)) { >>> + (exclusive || (folio_ref_count(folio) == nr_pages && >>> + folio_nr_pages(folio) == nr_pages))) { >>> if (vmf->flags & FAULT_FLAG_WRITE) { >>> pte = maybe_mkwrite(pte_mkdirty(pte), vma); >>> vmf->flags &= ~FAULT_FLAG_WRITE; >>> } >>> rmap_flags |= RMAP_EXCLUSIVE; >>> } >>> - flush_icache_page(vma, page); >>> + flush_icache_pages(vma, page, nr_pages); >>> if (pte_swp_soft_dirty(vmf->orig_pte)) >>> pte = pte_mksoft_dirty(pte); >>> if (pte_swp_uffd_wp(vmf->orig_pte)) >>> pte = pte_mkuffd_wp(pte); >> >> I'm not sure about all this... you are smearing these SW bits from the faulting >> PTE across all the ptes you are mapping. Although I guess actually that's ok >> because swap_pte_batch() only returns a batch with all these bits the same? > > Initially, I didn't recognize the issue at all because the tested > architecture arm64 > didn't include these bits. 
However, after reviewing your latest swpout series, > which verifies the consistent bits for soft_dirty and uffd_wp, I now > feel its safety > even for platforms with these bits. Yep, agreed. > >> >>> - vmf->orig_pte = pte; >> >> Instead of doing a readback below, perhaps: >> >> vmf->orig_pte = pte_advance_pfn(pte, nr_pages); > > Nice ! > >> >>> >>> /* ksm created a completely new copy */ >>> if (unlikely(folio != swapcache && swapcache)) { >>> - folio_add_new_anon_rmap(folio, vma, vmf->address); >>> + folio_add_new_anon_rmap(folio, vma, start_address); >>> folio_add_lru_vma(folio, vma); >>> } else { >>> - folio_add_anon_rmap_pte(folio, page, vma, vmf->address, >>> - rmap_flags); >>> + folio_add_anon_rmap_ptes(folio, page, nr_pages, vma, start_address, >>> + rmap_flags); >>> } >>> >>> VM_BUG_ON(!folio_test_anon(folio) || >>> (pte_write(pte) && !PageAnonExclusive(page))); >>> - set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); >>> - arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte); >>> + set_ptes(vma->vm_mm, start_address, start_pte, pte, nr_pages); >>> + vmf->orig_pte = ptep_get(vmf->pte); >>> + arch_do_swap_page(vma->vm_mm, vma, start_address, pte, pte); >>> >>> folio_unlock(folio); >>> if (folio != swapcache && swapcache) { >>> @@ -4269,7 +4309,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) >>> } >>> >>> /* No need to invalidate - it was non-present before */ >>> - update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1); >>> + update_mmu_cache_range(vmf, vma, start_address, start_pte, nr_pages); >>> unlock: >>> if (vmf->pte) >>> pte_unmap_unlock(vmf->pte, vmf->ptl); >> > > Thanks > Barry
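To illustrate the "smearing" point settled above: a single template PTE (carrying the soft-dirty/uffd-wp bits taken from the faulting entry) is installed for all nr_pages entries, with only the PFN advancing per entry, which is why those bits must already be identical across the whole batch. A tiny userspace model of that fan-out (struct pte_model, set_ptes_model and the flag fields are invented names, not the kernel API):

```c
#include <stdio.h>
#include <stdbool.h>

/* toy PTE: a pfn plus the two software bits under discussion */
struct pte_model {
	unsigned long pfn;
	bool soft_dirty;
	bool uffd_wp;
};

/* model of set_ptes(): install nr entries, bumping the pfn for each one */
static void set_ptes_model(struct pte_model *ptep, struct pte_model pte,
			   unsigned int nr)
{
	for (unsigned int i = 0; i < nr; i++) {
		ptep[i] = pte;
		ptep[i].pfn = pte.pfn + i;
	}
}

int main(void)
{
	struct pte_model table[4];
	/* template built from the faulting entry; sw bits copied as-is */
	struct pte_model pte = { .pfn = 0x1000, .soft_dirty = true, .uffd_wp = false };

	set_ptes_model(table, pte, 4);
	for (int i = 0; i < 4; i++)
		printf("pte[%d]: pfn=%#lx soft_dirty=%d uffd_wp=%d\n",
		       i, table[i].pfn, table[i].soft_dirty, table[i].uffd_wp);
	return 0;
}
```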
Barry Song <21cnbao@gmail.com> writes: > From: Chuanhua Han <hanchuanhua@oppo.com> > > When a large folio is found in the swapcache, the current implementation > requires calling do_swap_page() nr_pages times, resulting in nr_pages > page faults. This patch opts to map the entire large folio at once to > minimize page faults. Additionally, redundant checks and early exits > for ARM64 MTE restoring are removed. > > Signed-off-by: Chuanhua Han <hanchuanhua@oppo.com> > Co-developed-by: Barry Song <v-songbaohua@oppo.com> > Signed-off-by: Barry Song <v-songbaohua@oppo.com> > --- > mm/memory.c | 64 +++++++++++++++++++++++++++++++++++++++++++---------- > 1 file changed, 52 insertions(+), 12 deletions(-) > > diff --git a/mm/memory.c b/mm/memory.c > index c4a52e8d740a..9818dc1893c8 100644 > --- a/mm/memory.c > +++ b/mm/memory.c > @@ -3947,6 +3947,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) > pte_t pte; > vm_fault_t ret = 0; > void *shadow = NULL; > + int nr_pages = 1; > + unsigned long start_address = vmf->address; > + pte_t *start_pte = vmf->pte; IMHO, it's better to rename the above 2 local variables to "address" and "ptep". Just my personal opinion. Feel free to ignore the comments. > + bool any_swap_shared = false; > > if (!pte_unmap_same(vmf)) > goto out; > @@ -4137,6 +4141,35 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) > */ > vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, > &vmf->ptl); We should move pte check here. That is, if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) goto out_nomap; This will simplify the situation for large folio. > + > + /* We hit large folios in swapcache */ The comments seems unnecessary because the code tells that already. > + if (start_pte && folio_test_large(folio) && folio_test_swapcache(folio)) { > + int nr = folio_nr_pages(folio); > + int idx = folio_page_idx(folio, page); > + unsigned long folio_start = vmf->address - idx * PAGE_SIZE; > + unsigned long folio_end = folio_start + nr * PAGE_SIZE; > + pte_t *folio_ptep; > + pte_t folio_pte; > + > + if (unlikely(folio_start < max(vmf->address & PMD_MASK, vma->vm_start))) > + goto check_pte; > + if (unlikely(folio_end > pmd_addr_end(vmf->address, vma->vm_end))) > + goto check_pte; > + > + folio_ptep = vmf->pte - idx; > + folio_pte = ptep_get(folio_ptep); It's better to construct pte based on fault PTE via generalizing pte_next_swp_offset() (may be pte_move_swp_offset()). Then we can find inconsistent PTEs quicker. > + if (!is_swap_pte(folio_pte) || non_swap_entry(pte_to_swp_entry(folio_pte)) || > + swap_pte_batch(folio_ptep, nr, folio_pte, &any_swap_shared) != nr) > + goto check_pte; > + > + start_address = folio_start; > + start_pte = folio_ptep; > + nr_pages = nr; > + entry = folio->swap; > + page = &folio->page; > + } > + > +check_pte: > if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) > goto out_nomap; > > @@ -4190,6 +4223,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) > */ > exclusive = false; > } > + > + /* Reuse the whole large folio iff all entries are exclusive */ > + if (nr_pages > 1 && any_swap_shared) > + exclusive = false; > } > > /* > @@ -4204,12 +4241,14 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) > * We're already holding a reference on the page but haven't mapped it > * yet. 
> */ > - swap_free(entry); > + swap_free_nr(entry, nr_pages); > if (should_try_to_free_swap(folio, vma, vmf->flags)) > folio_free_swap(folio); > > - inc_mm_counter(vma->vm_mm, MM_ANONPAGES); > - dec_mm_counter(vma->vm_mm, MM_SWAPENTS); > + folio_ref_add(folio, nr_pages - 1); > + add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages); > + add_mm_counter(vma->vm_mm, MM_SWAPENTS, -nr_pages); > + > pte = mk_pte(page, vma->vm_page_prot); > > /* > @@ -4219,33 +4258,34 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) > * exclusivity. > */ > if (!folio_test_ksm(folio) && > - (exclusive || folio_ref_count(folio) == 1)) { > + (exclusive || (folio_ref_count(folio) == nr_pages && > + folio_nr_pages(folio) == nr_pages))) { > if (vmf->flags & FAULT_FLAG_WRITE) { > pte = maybe_mkwrite(pte_mkdirty(pte), vma); > vmf->flags &= ~FAULT_FLAG_WRITE; > } > rmap_flags |= RMAP_EXCLUSIVE; > } > - flush_icache_page(vma, page); > + flush_icache_pages(vma, page, nr_pages); > if (pte_swp_soft_dirty(vmf->orig_pte)) > pte = pte_mksoft_dirty(pte); > if (pte_swp_uffd_wp(vmf->orig_pte)) > pte = pte_mkuffd_wp(pte); > - vmf->orig_pte = pte; > > /* ksm created a completely new copy */ > if (unlikely(folio != swapcache && swapcache)) { > - folio_add_new_anon_rmap(folio, vma, vmf->address); > + folio_add_new_anon_rmap(folio, vma, start_address); > folio_add_lru_vma(folio, vma); > } else { > - folio_add_anon_rmap_pte(folio, page, vma, vmf->address, > - rmap_flags); > + folio_add_anon_rmap_ptes(folio, page, nr_pages, vma, start_address, > + rmap_flags); > } > > VM_BUG_ON(!folio_test_anon(folio) || > (pte_write(pte) && !PageAnonExclusive(page))); > - set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); > - arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte); > + set_ptes(vma->vm_mm, start_address, start_pte, pte, nr_pages); > + vmf->orig_pte = ptep_get(vmf->pte); > + arch_do_swap_page(vma->vm_mm, vma, start_address, pte, pte); Do we need to call arch_do_swap_page() for each subpage? IIUC, the corresponding arch_unmap_one() will be called for each subpage. > folio_unlock(folio); > if (folio != swapcache && swapcache) { > @@ -4269,7 +4309,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) > } > > /* No need to invalidate - it was non-present before */ > - update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1); > + update_mmu_cache_range(vmf, vma, start_address, start_pte, nr_pages); > unlock: > if (vmf->pte) > pte_unmap_unlock(vmf->pte, vmf->ptl); -- Best Regards, Huang, Ying
On Mon, Apr 15, 2024 at 8:39 PM Huang, Ying <ying.huang@intel.com> wrote: > > Barry Song <21cnbao@gmail.com> writes: > > > From: Chuanhua Han <hanchuanhua@oppo.com> > > > > When a large folio is found in the swapcache, the current implementation > > requires calling do_swap_page() nr_pages times, resulting in nr_pages > > page faults. This patch opts to map the entire large folio at once to > > minimize page faults. Additionally, redundant checks and early exits > > for ARM64 MTE restoring are removed. > > > > Signed-off-by: Chuanhua Han <hanchuanhua@oppo.com> > > Co-developed-by: Barry Song <v-songbaohua@oppo.com> > > Signed-off-by: Barry Song <v-songbaohua@oppo.com> > > --- > > mm/memory.c | 64 +++++++++++++++++++++++++++++++++++++++++++---------- > > 1 file changed, 52 insertions(+), 12 deletions(-) > > > > diff --git a/mm/memory.c b/mm/memory.c > > index c4a52e8d740a..9818dc1893c8 100644 > > --- a/mm/memory.c > > +++ b/mm/memory.c > > @@ -3947,6 +3947,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) > > pte_t pte; > > vm_fault_t ret = 0; > > void *shadow = NULL; > > + int nr_pages = 1; > > + unsigned long start_address = vmf->address; > > + pte_t *start_pte = vmf->pte; > > IMHO, it's better to rename the above 2 local variables to "address" and > "ptep". Just my personal opinion. Feel free to ignore the comments. fine. > > > + bool any_swap_shared = false; > > > > if (!pte_unmap_same(vmf)) > > goto out; > > @@ -4137,6 +4141,35 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) > > */ > > vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, > > &vmf->ptl); > > We should move pte check here. That is, > > if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) > goto out_nomap; > > This will simplify the situation for large folio. the plan is moving the whole code block if (start_pte && folio_test_large(folio) && folio_test_swapcache(folio)) after if (unlikely(!folio_test_uptodate(folio))) { ret = VM_FAULT_SIGBUS; goto out_nomap; } though we couldn't be !folio_test_uptodate(folio)) for hitting swapcache but it seems logically better for future use. > > > + > > + /* We hit large folios in swapcache */ > > The comments seems unnecessary because the code tells that already. > > > + if (start_pte && folio_test_large(folio) && folio_test_swapcache(folio)) { > > + int nr = folio_nr_pages(folio); > > + int idx = folio_page_idx(folio, page); > > + unsigned long folio_start = vmf->address - idx * PAGE_SIZE; > > + unsigned long folio_end = folio_start + nr * PAGE_SIZE; > > + pte_t *folio_ptep; > > + pte_t folio_pte; > > + > > + if (unlikely(folio_start < max(vmf->address & PMD_MASK, vma->vm_start))) > > + goto check_pte; > > + if (unlikely(folio_end > pmd_addr_end(vmf->address, vma->vm_end))) > > + goto check_pte; > > + > > + folio_ptep = vmf->pte - idx; > > + folio_pte = ptep_get(folio_ptep); > > It's better to construct pte based on fault PTE via generalizing > pte_next_swp_offset() (may be pte_move_swp_offset()). Then we can find > inconsistent PTEs quicker. it seems your point is getting the pte of page0 by pte_next_swp_offset() unfortunately pte_next_swp_offset can't go back. on the other hand, we have to check the real pte value of the 0nd entry right now because swap_pte_batch() only really reads pte from the 1st entry. it assumes pte argument is the real value for the 0nd pte entry. 
static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte) { pte_t expected_pte = pte_next_swp_offset(pte); const pte_t *end_ptep = start_ptep + max_nr; pte_t *ptep = start_ptep + 1; VM_WARN_ON(max_nr < 1); VM_WARN_ON(!is_swap_pte(pte)); VM_WARN_ON(non_swap_entry(pte_to_swp_entry(pte))); while (ptep < end_ptep) { pte = ptep_get(ptep); if (!pte_same(pte, expected_pte)) break; expected_pte = pte_next_swp_offset(expected_pte); ptep++; } return ptep - start_ptep; } > > > + if (!is_swap_pte(folio_pte) || non_swap_entry(pte_to_swp_entry(folio_pte)) || > > + swap_pte_batch(folio_ptep, nr, folio_pte, &any_swap_shared) != nr) > > + goto check_pte; > > + > > + start_address = folio_start; > > + start_pte = folio_ptep; > > + nr_pages = nr; > > + entry = folio->swap; > > + page = &folio->page; > > + } > > + > > +check_pte: > > if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) > > goto out_nomap; > > > > @@ -4190,6 +4223,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) > > */ > > exclusive = false; > > } > > + > > + /* Reuse the whole large folio iff all entries are exclusive */ > > + if (nr_pages > 1 && any_swap_shared) > > + exclusive = false; > > } > > > > /* > > @@ -4204,12 +4241,14 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) > > * We're already holding a reference on the page but haven't mapped it > > * yet. > > */ > > - swap_free(entry); > > + swap_free_nr(entry, nr_pages); > > if (should_try_to_free_swap(folio, vma, vmf->flags)) > > folio_free_swap(folio); > > > > - inc_mm_counter(vma->vm_mm, MM_ANONPAGES); > > - dec_mm_counter(vma->vm_mm, MM_SWAPENTS); > > + folio_ref_add(folio, nr_pages - 1); > > + add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages); > > + add_mm_counter(vma->vm_mm, MM_SWAPENTS, -nr_pages); > > + > > pte = mk_pte(page, vma->vm_page_prot); > > > > /* > > @@ -4219,33 +4258,34 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) > > * exclusivity. > > */ > > if (!folio_test_ksm(folio) && > > - (exclusive || folio_ref_count(folio) == 1)) { > > + (exclusive || (folio_ref_count(folio) == nr_pages && > > + folio_nr_pages(folio) == nr_pages))) { > > if (vmf->flags & FAULT_FLAG_WRITE) { > > pte = maybe_mkwrite(pte_mkdirty(pte), vma); > > vmf->flags &= ~FAULT_FLAG_WRITE; > > } > > rmap_flags |= RMAP_EXCLUSIVE; > > } > > - flush_icache_page(vma, page); > > + flush_icache_pages(vma, page, nr_pages); > > if (pte_swp_soft_dirty(vmf->orig_pte)) > > pte = pte_mksoft_dirty(pte); > > if (pte_swp_uffd_wp(vmf->orig_pte)) > > pte = pte_mkuffd_wp(pte); > > - vmf->orig_pte = pte; > > > > /* ksm created a completely new copy */ > > if (unlikely(folio != swapcache && swapcache)) { > > - folio_add_new_anon_rmap(folio, vma, vmf->address); > > + folio_add_new_anon_rmap(folio, vma, start_address); > > folio_add_lru_vma(folio, vma); > > } else { > > - folio_add_anon_rmap_pte(folio, page, vma, vmf->address, > > - rmap_flags); > > + folio_add_anon_rmap_ptes(folio, page, nr_pages, vma, start_address, > > + rmap_flags); > > } > > > > VM_BUG_ON(!folio_test_anon(folio) || > > (pte_write(pte) && !PageAnonExclusive(page))); > > - set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); > > - arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte); > > + set_ptes(vma->vm_mm, start_address, start_pte, pte, nr_pages); > > + vmf->orig_pte = ptep_get(vmf->pte); > > + arch_do_swap_page(vma->vm_mm, vma, start_address, pte, pte); > > Do we need to call arch_do_swap_page() for each subpage? IIUC, the > corresponding arch_unmap_one() will be called for each subpage. 
i actually thought about this very carefully, right now, the only one who needs this is sparc and it doesn't support THP_SWAPOUT at all. and there is no proof doing restoration one by one won't really break sparc. so i'd like to defer this to when sparc really needs THP_SWAPOUT. on the other hand, it seems really bad we have both arch_swap_restore - for this, arm64 has moved to using folio and arch_do_swap_page we should somehow unify them later if sparc wants THP_SWPOUT. > > > folio_unlock(folio); > > if (folio != swapcache && swapcache) { > > @@ -4269,7 +4309,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) > > } > > > > /* No need to invalidate - it was non-present before */ > > - update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1); > > + update_mmu_cache_range(vmf, vma, start_address, start_pte, nr_pages); > > unlock: > > if (vmf->pte) > > pte_unmap_unlock(vmf->pte, vmf->ptl); > > -- > Best Regards, > Huang, Ying
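As a reference for the batching logic quoted in the exchange above, here is a runnable userspace approximation (the struct swp_pte type and its flag fields are invented for illustration; the real swap_pte_batch() operates on architecture-specific pte_t bits): starting from the first entry, it keeps walking while each subsequent entry has the next consecutive swap offset and identical software bits, and returns the length of that run.

```c
#include <stdio.h>
#include <stdbool.h>

/* toy swap PTE: swap offset plus sw bits that must match across a batch */
struct swp_pte {
	unsigned long offset;
	bool soft_dirty;
	bool uffd_wp;
	bool exclusive;
};

static bool swp_pte_same(struct swp_pte a, struct swp_pte b)
{
	return a.offset == b.offset && a.soft_dirty == b.soft_dirty &&
	       a.uffd_wp == b.uffd_wp && a.exclusive == b.exclusive;
}

/* expected next entry: same bits, swap offset advanced by one */
static struct swp_pte next_swp(struct swp_pte pte)
{
	pte.offset++;
	return pte;
}

/* model of swap_pte_batch(): count consecutive, bit-identical swap entries */
static int swap_pte_batch_model(const struct swp_pte *ptep, int max_nr,
				struct swp_pte pte)
{
	struct swp_pte expected = next_swp(pte);
	int i;

	for (i = 1; i < max_nr; i++) {
		if (!swp_pte_same(ptep[i], expected))
			break;
		expected = next_swp(expected);
	}
	return i;
}

int main(void)
{
	struct swp_pte ptes[4] = {
		{ .offset = 100 }, { .offset = 101 },
		{ .offset = 102, .soft_dirty = true }, /* bit mismatch ends the batch */
		{ .offset = 103 },
	};

	printf("batch = %d\n", swap_pte_batch_model(ptes, 4, ptes[0]));
	return 0;
}
```

Note how the model mirrors the kernel snippet: only the first PTE's value is taken as given, which is why the caller must read and validate PTE0 itself before asking for a batch.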
Added Khalid for arch_do_swap_page(). Barry Song <21cnbao@gmail.com> writes: > On Mon, Apr 15, 2024 at 8:39 PM Huang, Ying <ying.huang@intel.com> wrote: >> >> Barry Song <21cnbao@gmail.com> writes: [snip] >> >> > + bool any_swap_shared = false; >> > >> > if (!pte_unmap_same(vmf)) >> > goto out; >> > @@ -4137,6 +4141,35 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) >> > */ >> > vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, >> > &vmf->ptl); >> >> We should move pte check here. That is, >> >> if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) >> goto out_nomap; >> >> This will simplify the situation for large folio. > > the plan is moving the whole code block > > if (start_pte && folio_test_large(folio) && folio_test_swapcache(folio)) > > after > if (unlikely(!folio_test_uptodate(folio))) { > ret = VM_FAULT_SIGBUS; > goto out_nomap; > } > > though we couldn't be !folio_test_uptodate(folio)) for hitting > swapcache but it seems > logically better for future use. LGTM, Thanks! >> >> > + >> > + /* We hit large folios in swapcache */ >> >> The comments seems unnecessary because the code tells that already. >> >> > + if (start_pte && folio_test_large(folio) && folio_test_swapcache(folio)) { >> > + int nr = folio_nr_pages(folio); >> > + int idx = folio_page_idx(folio, page); >> > + unsigned long folio_start = vmf->address - idx * PAGE_SIZE; >> > + unsigned long folio_end = folio_start + nr * PAGE_SIZE; >> > + pte_t *folio_ptep; >> > + pte_t folio_pte; >> > + >> > + if (unlikely(folio_start < max(vmf->address & PMD_MASK, vma->vm_start))) >> > + goto check_pte; >> > + if (unlikely(folio_end > pmd_addr_end(vmf->address, vma->vm_end))) >> > + goto check_pte; >> > + >> > + folio_ptep = vmf->pte - idx; >> > + folio_pte = ptep_get(folio_ptep); >> >> It's better to construct pte based on fault PTE via generalizing >> pte_next_swp_offset() (may be pte_move_swp_offset()). Then we can find >> inconsistent PTEs quicker. > > it seems your point is getting the pte of page0 by pte_next_swp_offset() > unfortunately pte_next_swp_offset can't go back. on the other hand, > we have to check the real pte value of the 0nd entry right now because > swap_pte_batch() only really reads pte from the 1st entry. it assumes > pte argument is the real value for the 0nd pte entry. > > static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte) > { > pte_t expected_pte = pte_next_swp_offset(pte); > const pte_t *end_ptep = start_ptep + max_nr; > pte_t *ptep = start_ptep + 1; > > VM_WARN_ON(max_nr < 1); > VM_WARN_ON(!is_swap_pte(pte)); > VM_WARN_ON(non_swap_entry(pte_to_swp_entry(pte))); > > while (ptep < end_ptep) { > pte = ptep_get(ptep); > > if (!pte_same(pte, expected_pte)) > break; > > expected_pte = pte_next_swp_offset(expected_pte); > ptep++; > } > > return ptep - start_ptep; > } Yes. You are right. But we may check whether the pte of page0 is same as "vmf->orig_pte - folio_page_idx()" (fake code). You need to check the pte of page 0 anyway. 
>> >> > + if (!is_swap_pte(folio_pte) || non_swap_entry(pte_to_swp_entry(folio_pte)) || >> > + swap_pte_batch(folio_ptep, nr, folio_pte, &any_swap_shared) != nr) >> > + goto check_pte; >> > + >> > + start_address = folio_start; >> > + start_pte = folio_ptep; >> > + nr_pages = nr; >> > + entry = folio->swap; >> > + page = &folio->page; >> > + } >> > + >> > +check_pte: >> > if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) >> > goto out_nomap; >> > >> > @@ -4190,6 +4223,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) >> > */ >> > exclusive = false; >> > } >> > + >> > + /* Reuse the whole large folio iff all entries are exclusive */ >> > + if (nr_pages > 1 && any_swap_shared) >> > + exclusive = false; >> > } >> > >> > /* >> > @@ -4204,12 +4241,14 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) >> > * We're already holding a reference on the page but haven't mapped it >> > * yet. >> > */ >> > - swap_free(entry); >> > + swap_free_nr(entry, nr_pages); >> > if (should_try_to_free_swap(folio, vma, vmf->flags)) >> > folio_free_swap(folio); >> > >> > - inc_mm_counter(vma->vm_mm, MM_ANONPAGES); >> > - dec_mm_counter(vma->vm_mm, MM_SWAPENTS); >> > + folio_ref_add(folio, nr_pages - 1); >> > + add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages); >> > + add_mm_counter(vma->vm_mm, MM_SWAPENTS, -nr_pages); >> > + >> > pte = mk_pte(page, vma->vm_page_prot); >> > >> > /* >> > @@ -4219,33 +4258,34 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) >> > * exclusivity. >> > */ >> > if (!folio_test_ksm(folio) && >> > - (exclusive || folio_ref_count(folio) == 1)) { >> > + (exclusive || (folio_ref_count(folio) == nr_pages && >> > + folio_nr_pages(folio) == nr_pages))) { >> > if (vmf->flags & FAULT_FLAG_WRITE) { >> > pte = maybe_mkwrite(pte_mkdirty(pte), vma); >> > vmf->flags &= ~FAULT_FLAG_WRITE; >> > } >> > rmap_flags |= RMAP_EXCLUSIVE; >> > } >> > - flush_icache_page(vma, page); >> > + flush_icache_pages(vma, page, nr_pages); >> > if (pte_swp_soft_dirty(vmf->orig_pte)) >> > pte = pte_mksoft_dirty(pte); >> > if (pte_swp_uffd_wp(vmf->orig_pte)) >> > pte = pte_mkuffd_wp(pte); >> > - vmf->orig_pte = pte; >> > >> > /* ksm created a completely new copy */ >> > if (unlikely(folio != swapcache && swapcache)) { >> > - folio_add_new_anon_rmap(folio, vma, vmf->address); >> > + folio_add_new_anon_rmap(folio, vma, start_address); >> > folio_add_lru_vma(folio, vma); >> > } else { >> > - folio_add_anon_rmap_pte(folio, page, vma, vmf->address, >> > - rmap_flags); >> > + folio_add_anon_rmap_ptes(folio, page, nr_pages, vma, start_address, >> > + rmap_flags); >> > } >> > >> > VM_BUG_ON(!folio_test_anon(folio) || >> > (pte_write(pte) && !PageAnonExclusive(page))); >> > - set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); >> > - arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte); >> > + set_ptes(vma->vm_mm, start_address, start_pte, pte, nr_pages); >> > + vmf->orig_pte = ptep_get(vmf->pte); >> > + arch_do_swap_page(vma->vm_mm, vma, start_address, pte, pte); >> >> Do we need to call arch_do_swap_page() for each subpage? IIUC, the >> corresponding arch_unmap_one() will be called for each subpage. > > i actually thought about this very carefully, right now, the only one who > needs this is sparc and it doesn't support THP_SWAPOUT at all. and > there is no proof doing restoration one by one won't really break sparc. > so i'd like to defer this to when sparc really needs THP_SWAPOUT. Let's ask SPARC developer (Cced) for this. 
IMHO, even if we cannot get help, we need to change code with our understanding instead of deferring it. > on the other hand, it seems really bad we have both > arch_swap_restore - for this, arm64 has moved to using folio > and > arch_do_swap_page > > we should somehow unify them later if sparc wants THP_SWPOUT. > >> >> > folio_unlock(folio); >> > if (folio != swapcache && swapcache) { >> > @@ -4269,7 +4309,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) >> > } >> > >> > /* No need to invalidate - it was non-present before */ >> > - update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1); >> > + update_mmu_cache_range(vmf, vma, start_address, start_pte, nr_pages); >> > unlock: >> > if (vmf->pte) >> > pte_unmap_unlock(vmf->pte, vmf->ptl); -- Best Regards, Huang, Ying
On Tue, Apr 16, 2024 at 2:27 PM Huang, Ying <ying.huang@intel.com> wrote: > > > Added Khalid for arch_do_swap_page(). > > Barry Song <21cnbao@gmail.com> writes: > > > On Mon, Apr 15, 2024 at 8:39 PM Huang, Ying <ying.huang@intel.com> wrote: > >> > >> Barry Song <21cnbao@gmail.com> writes: > > [snip] > > >> > >> > + bool any_swap_shared = false; > >> > > >> > if (!pte_unmap_same(vmf)) > >> > goto out; > >> > @@ -4137,6 +4141,35 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) > >> > */ > >> > vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, > >> > &vmf->ptl); > >> > >> We should move pte check here. That is, > >> > >> if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) > >> goto out_nomap; > >> > >> This will simplify the situation for large folio. > > > > the plan is moving the whole code block > > > > if (start_pte && folio_test_large(folio) && folio_test_swapcache(folio)) > > > > after > > if (unlikely(!folio_test_uptodate(folio))) { > > ret = VM_FAULT_SIGBUS; > > goto out_nomap; > > } > > > > though we couldn't be !folio_test_uptodate(folio)) for hitting > > swapcache but it seems > > logically better for future use. > > LGTM, Thanks! > > >> > >> > + > >> > + /* We hit large folios in swapcache */ > >> > >> The comments seems unnecessary because the code tells that already. > >> > >> > + if (start_pte && folio_test_large(folio) && folio_test_swapcache(folio)) { > >> > + int nr = folio_nr_pages(folio); > >> > + int idx = folio_page_idx(folio, page); > >> > + unsigned long folio_start = vmf->address - idx * PAGE_SIZE; > >> > + unsigned long folio_end = folio_start + nr * PAGE_SIZE; > >> > + pte_t *folio_ptep; > >> > + pte_t folio_pte; > >> > + > >> > + if (unlikely(folio_start < max(vmf->address & PMD_MASK, vma->vm_start))) > >> > + goto check_pte; > >> > + if (unlikely(folio_end > pmd_addr_end(vmf->address, vma->vm_end))) > >> > + goto check_pte; > >> > + > >> > + folio_ptep = vmf->pte - idx; > >> > + folio_pte = ptep_get(folio_ptep); > >> > >> It's better to construct pte based on fault PTE via generalizing > >> pte_next_swp_offset() (may be pte_move_swp_offset()). Then we can find > >> inconsistent PTEs quicker. > > > > it seems your point is getting the pte of page0 by pte_next_swp_offset() > > unfortunately pte_next_swp_offset can't go back. on the other hand, > > we have to check the real pte value of the 0nd entry right now because > > swap_pte_batch() only really reads pte from the 1st entry. it assumes > > pte argument is the real value for the 0nd pte entry. > > > > static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte) > > { > > pte_t expected_pte = pte_next_swp_offset(pte); > > const pte_t *end_ptep = start_ptep + max_nr; > > pte_t *ptep = start_ptep + 1; > > > > VM_WARN_ON(max_nr < 1); > > VM_WARN_ON(!is_swap_pte(pte)); > > VM_WARN_ON(non_swap_entry(pte_to_swp_entry(pte))); > > > > while (ptep < end_ptep) { > > pte = ptep_get(ptep); > > > > if (!pte_same(pte, expected_pte)) > > break; > > > > expected_pte = pte_next_swp_offset(expected_pte); > > ptep++; > > } > > > > return ptep - start_ptep; > > } > > Yes. You are right. > > But we may check whether the pte of page0 is same as "vmf->orig_pte - > folio_page_idx()" (fake code). right, that is why we are reading and checking PTE0 before calling swap_pte_batch() right now. 
folio_ptep = vmf->pte - idx; folio_pte = ptep_get(folio_ptep); if (!is_swap_pte(folio_pte) || non_swap_entry(pte_to_swp_entry(folio_pte)) || swap_pte_batch(folio_ptep, nr, folio_pte, &any_swap_shared) != nr) goto check_pte; So, if I understand correctly, you're proposing that we should directly check PTE0 in swap_pte_batch(). Personally, I don't have any objections to this idea. However, I'd also like to hear the feedback from Ryan and David :-) > > You need to check the pte of page 0 anyway. > > >> > >> > + if (!is_swap_pte(folio_pte) || non_swap_entry(pte_to_swp_entry(folio_pte)) || > >> > + swap_pte_batch(folio_ptep, nr, folio_pte, &any_swap_shared) != nr) > >> > + goto check_pte; > >> > + > >> > + start_address = folio_start; > >> > + start_pte = folio_ptep; > >> > + nr_pages = nr; > >> > + entry = folio->swap; > >> > + page = &folio->page; > >> > + } > >> > + > >> > +check_pte: > >> > if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) > >> > goto out_nomap; > >> > > >> > @@ -4190,6 +4223,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) > >> > */ > >> > exclusive = false; > >> > } > >> > + > >> > + /* Reuse the whole large folio iff all entries are exclusive */ > >> > + if (nr_pages > 1 && any_swap_shared) > >> > + exclusive = false; > >> > } > >> > > >> > /* > >> > @@ -4204,12 +4241,14 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) > >> > * We're already holding a reference on the page but haven't mapped it > >> > * yet. > >> > */ > >> > - swap_free(entry); > >> > + swap_free_nr(entry, nr_pages); > >> > if (should_try_to_free_swap(folio, vma, vmf->flags)) > >> > folio_free_swap(folio); > >> > > >> > - inc_mm_counter(vma->vm_mm, MM_ANONPAGES); > >> > - dec_mm_counter(vma->vm_mm, MM_SWAPENTS); > >> > + folio_ref_add(folio, nr_pages - 1); > >> > + add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages); > >> > + add_mm_counter(vma->vm_mm, MM_SWAPENTS, -nr_pages); > >> > + > >> > pte = mk_pte(page, vma->vm_page_prot); > >> > > >> > /* > >> > @@ -4219,33 +4258,34 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) > >> > * exclusivity. 
> >> > */ > >> > if (!folio_test_ksm(folio) && > >> > - (exclusive || folio_ref_count(folio) == 1)) { > >> > + (exclusive || (folio_ref_count(folio) == nr_pages && > >> > + folio_nr_pages(folio) == nr_pages))) { > >> > if (vmf->flags & FAULT_FLAG_WRITE) { > >> > pte = maybe_mkwrite(pte_mkdirty(pte), vma); > >> > vmf->flags &= ~FAULT_FLAG_WRITE; > >> > } > >> > rmap_flags |= RMAP_EXCLUSIVE; > >> > } > >> > - flush_icache_page(vma, page); > >> > + flush_icache_pages(vma, page, nr_pages); > >> > if (pte_swp_soft_dirty(vmf->orig_pte)) > >> > pte = pte_mksoft_dirty(pte); > >> > if (pte_swp_uffd_wp(vmf->orig_pte)) > >> > pte = pte_mkuffd_wp(pte); > >> > - vmf->orig_pte = pte; > >> > > >> > /* ksm created a completely new copy */ > >> > if (unlikely(folio != swapcache && swapcache)) { > >> > - folio_add_new_anon_rmap(folio, vma, vmf->address); > >> > + folio_add_new_anon_rmap(folio, vma, start_address); > >> > folio_add_lru_vma(folio, vma); > >> > } else { > >> > - folio_add_anon_rmap_pte(folio, page, vma, vmf->address, > >> > - rmap_flags); > >> > + folio_add_anon_rmap_ptes(folio, page, nr_pages, vma, start_address, > >> > + rmap_flags); > >> > } > >> > > >> > VM_BUG_ON(!folio_test_anon(folio) || > >> > (pte_write(pte) && !PageAnonExclusive(page))); > >> > - set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); > >> > - arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte); > >> > + set_ptes(vma->vm_mm, start_address, start_pte, pte, nr_pages); > >> > + vmf->orig_pte = ptep_get(vmf->pte); > >> > + arch_do_swap_page(vma->vm_mm, vma, start_address, pte, pte); > >> > >> Do we need to call arch_do_swap_page() for each subpage? IIUC, the > >> corresponding arch_unmap_one() will be called for each subpage. > > > > i actually thought about this very carefully, right now, the only one who > > needs this is sparc and it doesn't support THP_SWAPOUT at all. and > > there is no proof doing restoration one by one won't really break sparc. > > so i'd like to defer this to when sparc really needs THP_SWAPOUT. > > Let's ask SPARC developer (Cced) for this. > > IMHO, even if we cannot get help, we need to change code with our > understanding instead of deferring it. ok. Thanks for Ccing sparc developers. > > > on the other hand, it seems really bad we have both > > arch_swap_restore - for this, arm64 has moved to using folio > > and > > arch_do_swap_page > > > > we should somehow unify them later if sparc wants THP_SWPOUT. > > > >> > >> > folio_unlock(folio); > >> > if (folio != swapcache && swapcache) { > >> > @@ -4269,7 +4309,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) > >> > } > >> > > >> > /* No need to invalidate - it was non-present before */ > >> > - update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1); > >> > + update_mmu_cache_range(vmf, vma, start_address, start_pte, nr_pages); > >> > unlock: > >> > if (vmf->pte) > >> > pte_unmap_unlock(vmf->pte, vmf->ptl); > > -- > Best Regards, > Huang, Ying Thanks Barry
Barry Song <21cnbao@gmail.com> writes: > On Tue, Apr 16, 2024 at 2:27 PM Huang, Ying <ying.huang@intel.com> wrote: >> >> >> Added Khalid for arch_do_swap_page(). >> >> Barry Song <21cnbao@gmail.com> writes: >> >> > On Mon, Apr 15, 2024 at 8:39 PM Huang, Ying <ying.huang@intel.com> wrote: >> >> >> >> Barry Song <21cnbao@gmail.com> writes: >> >> [snip] >> >> >> >> >> > + bool any_swap_shared = false; >> >> > >> >> > if (!pte_unmap_same(vmf)) >> >> > goto out; >> >> > @@ -4137,6 +4141,35 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) >> >> > */ >> >> > vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, >> >> > &vmf->ptl); >> >> >> >> We should move pte check here. That is, >> >> >> >> if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) >> >> goto out_nomap; >> >> >> >> This will simplify the situation for large folio. >> > >> > the plan is moving the whole code block >> > >> > if (start_pte && folio_test_large(folio) && folio_test_swapcache(folio)) >> > >> > after >> > if (unlikely(!folio_test_uptodate(folio))) { >> > ret = VM_FAULT_SIGBUS; >> > goto out_nomap; >> > } >> > >> > though we couldn't be !folio_test_uptodate(folio)) for hitting >> > swapcache but it seems >> > logically better for future use. >> >> LGTM, Thanks! >> >> >> >> >> > + >> >> > + /* We hit large folios in swapcache */ >> >> >> >> The comments seems unnecessary because the code tells that already. >> >> >> >> > + if (start_pte && folio_test_large(folio) && folio_test_swapcache(folio)) { >> >> > + int nr = folio_nr_pages(folio); >> >> > + int idx = folio_page_idx(folio, page); >> >> > + unsigned long folio_start = vmf->address - idx * PAGE_SIZE; >> >> > + unsigned long folio_end = folio_start + nr * PAGE_SIZE; >> >> > + pte_t *folio_ptep; >> >> > + pte_t folio_pte; >> >> > + >> >> > + if (unlikely(folio_start < max(vmf->address & PMD_MASK, vma->vm_start))) >> >> > + goto check_pte; >> >> > + if (unlikely(folio_end > pmd_addr_end(vmf->address, vma->vm_end))) >> >> > + goto check_pte; >> >> > + >> >> > + folio_ptep = vmf->pte - idx; >> >> > + folio_pte = ptep_get(folio_ptep); >> >> >> >> It's better to construct pte based on fault PTE via generalizing >> >> pte_next_swp_offset() (may be pte_move_swp_offset()). Then we can find >> >> inconsistent PTEs quicker. >> > >> > it seems your point is getting the pte of page0 by pte_next_swp_offset() >> > unfortunately pte_next_swp_offset can't go back. on the other hand, >> > we have to check the real pte value of the 0nd entry right now because >> > swap_pte_batch() only really reads pte from the 1st entry. it assumes >> > pte argument is the real value for the 0nd pte entry. >> > >> > static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte) >> > { >> > pte_t expected_pte = pte_next_swp_offset(pte); >> > const pte_t *end_ptep = start_ptep + max_nr; >> > pte_t *ptep = start_ptep + 1; >> > >> > VM_WARN_ON(max_nr < 1); >> > VM_WARN_ON(!is_swap_pte(pte)); >> > VM_WARN_ON(non_swap_entry(pte_to_swp_entry(pte))); >> > >> > while (ptep < end_ptep) { >> > pte = ptep_get(ptep); >> > >> > if (!pte_same(pte, expected_pte)) >> > break; >> > >> > expected_pte = pte_next_swp_offset(expected_pte); >> > ptep++; >> > } >> > >> > return ptep - start_ptep; >> > } >> >> Yes. You are right. >> >> But we may check whether the pte of page0 is same as "vmf->orig_pte - >> folio_page_idx()" (fake code). > > right, that is why we are reading and checking PTE0 before calling > swap_pte_batch() > right now. 
> > folio_ptep = vmf->pte - idx; > folio_pte = ptep_get(folio_ptep); > if (!is_swap_pte(folio_pte) || non_swap_entry(pte_to_swp_entry(folio_pte)) || > swap_pte_batch(folio_ptep, nr, folio_pte, &any_swap_shared) != nr) > goto check_pte; > > So, if I understand correctly, you're proposing that we should directly check > PTE0 in swap_pte_batch(). Personally, I don't have any objections to this idea. > However, I'd also like to hear the feedback from Ryan and David :-) I mean that we can replace !is_swap_pte(folio_pte) || non_swap_entry(pte_to_swp_entry(folio_pte)) in above code with pte_same() with constructed expected first pte. >> >> You need to check the pte of page 0 anyway. >> >> >> >> >> > + if (!is_swap_pte(folio_pte) || non_swap_entry(pte_to_swp_entry(folio_pte)) || >> >> > + swap_pte_batch(folio_ptep, nr, folio_pte, &any_swap_shared) != nr) >> >> > + goto check_pte; >> >> > + >> >> > + start_address = folio_start; >> >> > + start_pte = folio_ptep; >> >> > + nr_pages = nr; >> >> > + entry = folio->swap; >> >> > + page = &folio->page; >> >> > + } >> >> > + >> >> > +check_pte: >> >> > if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) >> >> > goto out_nomap; >> >> > >> >> > @@ -4190,6 +4223,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) >> >> > */ >> >> > exclusive = false; >> >> > } >> >> > + >> >> > + /* Reuse the whole large folio iff all entries are exclusive */ >> >> > + if (nr_pages > 1 && any_swap_shared) >> >> > + exclusive = false; >> >> > } >> >> > [snip] -- Best Regards, Huang, Ying
On Tue, Apr 16, 2024 at 2:41 PM Huang, Ying <ying.huang@intel.com> wrote:
>
> Barry Song <21cnbao@gmail.com> writes:
>
[snip]
>
> > > > > It's better to construct pte based on fault PTE via generalizing
> > > > > pte_next_swp_offset() (may be pte_move_swp_offset()). Then we can find
> > > > > inconsistent PTEs quicker.
> > > >
> > > > it seems your point is getting the pte of page0 by pte_next_swp_offset();
> > > > unfortunately pte_next_swp_offset() can't go back. on the other hand,
> > > > we have to check the real pte value of the 0th entry right now because
> > > > swap_pte_batch() only really reads ptes from the 1st entry onwards. it
> > > > assumes the pte argument is the real value of the 0th pte entry.
> > > >
> > > > static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte)
> > > > {
> > > >         pte_t expected_pte = pte_next_swp_offset(pte);
> > > >         const pte_t *end_ptep = start_ptep + max_nr;
> > > >         pte_t *ptep = start_ptep + 1;
> > > >
> > > >         VM_WARN_ON(max_nr < 1);
> > > >         VM_WARN_ON(!is_swap_pte(pte));
> > > >         VM_WARN_ON(non_swap_entry(pte_to_swp_entry(pte)));
> > > >
> > > >         while (ptep < end_ptep) {
> > > >                 pte = ptep_get(ptep);
> > > >
> > > >                 if (!pte_same(pte, expected_pte))
> > > >                         break;
> > > >
> > > >                 expected_pte = pte_next_swp_offset(expected_pte);
> > > >                 ptep++;
> > > >         }
> > > >
> > > >         return ptep - start_ptep;
> > > > }
> > >
> > > Yes. You are right.
> > >
> > > But we may check whether the pte of page0 is same as "vmf->orig_pte -
> > > folio_page_idx()" (fake code).
> >
> > right, that is why we are reading and checking PTE0 before calling
> > swap_pte_batch() right now.
> >
> > folio_ptep = vmf->pte - idx;
> > folio_pte = ptep_get(folio_ptep);
> > if (!is_swap_pte(folio_pte) || non_swap_entry(pte_to_swp_entry(folio_pte)) ||
> >     swap_pte_batch(folio_ptep, nr, folio_pte, &any_swap_shared) != nr)
> >         goto check_pte;
> >
> > So, if I understand correctly, you're proposing that we should directly check
> > PTE0 in swap_pte_batch(). Personally, I don't have any objections to this idea.
> > However, I'd also like to hear the feedback from Ryan and David :-)
>
> I mean that we can replace
>
>   !is_swap_pte(folio_pte) || non_swap_entry(pte_to_swp_entry(folio_pte))
>
> in the above code with a pte_same() check against a constructed expected first PTE.

Got it. It could be quite tricky, especially with considerations like
pte_swp_soft_dirty, pte_swp_exclusive, and pte_swp_uffd_wp. We might
require a helper function similar to pte_next_swp_offset() but capable of
moving both forward and backward. For instance:

  pte_move_swp_offset(pte_t pte, long delta)

pte_next_swp_offset() could then instead call it as:

  pte_move_swp_offset(pte, 1);

Is this what you are proposing?

[snip]
Barry Song <21cnbao@gmail.com> writes:

> On Tue, Apr 16, 2024 at 2:41 PM Huang, Ying <ying.huang@intel.com> wrote:
>>
[snip]
>>
>> I mean that we can replace
>>
>>   !is_swap_pte(folio_pte) || non_swap_entry(pte_to_swp_entry(folio_pte))
>>
>> in the above code with a pte_same() check against a constructed expected first PTE.
>
> Got it. It could be quite tricky, especially with considerations like
> pte_swp_soft_dirty, pte_swp_exclusive, and pte_swp_uffd_wp. We might
> require a helper function similar to pte_next_swp_offset() but capable of
> moving both forward and backward. For instance:
>
>   pte_move_swp_offset(pte_t pte, long delta)
>
> pte_next_swp_offset() could then instead call it as:
>
>   pte_move_swp_offset(pte, 1);
>
> Is this what you are proposing?

Yes. Exactly.

--
Best Regards,
Huang, Ying

[snip]
On Tue, Apr 16, 2024 at 3:19 PM Huang, Ying <ying.huang@intel.com> wrote:
>
> Barry Song <21cnbao@gmail.com> writes:
>
[snip]
>
> > Got it. It could be quite tricky, especially with considerations like
> > pte_swp_soft_dirty, pte_swp_exclusive, and pte_swp_uffd_wp. We might
> > require a helper function similar to pte_next_swp_offset() but capable of
> > moving both forward and backward. For instance:
> >
> >   pte_move_swp_offset(pte_t pte, long delta)
> >
> > pte_next_swp_offset() could then instead call it as:
> >
> >   pte_move_swp_offset(pte, 1);
> >
> > Is this what you are proposing?
>
> Yes. Exactly.

Great. I agree that this appears to be much cleaner than the current code.

[snip]
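For illustration, a minimal sketch of what such a pte_move_swp_offset() helper could look like, mirroring the existing pte_next_swp_offset() quoted above and assuming the delta never crosses into a different swap type; this is only a sketch of the idea under discussion, not the eventual patch:

static inline pte_t pte_move_swp_offset(pte_t pte, long delta)
{
        swp_entry_t entry = pte_to_swp_entry(pte);
        pte_t new = swp_entry_to_pte(swp_entry(swp_type(entry),
                                               swp_offset(entry) + delta));

        /* Preserve the swap-PTE software bits mentioned in the discussion */
        if (pte_swp_soft_dirty(pte))
                new = pte_swp_mksoft_dirty(new);
        if (pte_swp_exclusive(pte))
                new = pte_swp_mkexclusive(new);
        if (pte_swp_uffd_wp(pte))
                new = pte_swp_mkuffd_wp(new);

        return new;
}

With such a helper, pte_next_swp_offset(pte) could simply become pte_move_swp_offset(pte, 1), and the PTE0 check in do_swap_page() could reduce to a single pte_same(ptep_get(folio_ptep), pte_move_swp_offset(vmf->orig_pte, -idx)) before calling swap_pte_batch().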
[snip]

> > > > >         VM_BUG_ON(!folio_test_anon(folio) ||
> > > > >                   (pte_write(pte) && !PageAnonExclusive(page)));
> > > > > -       set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
> > > > > -       arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
> > > > > +       set_ptes(vma->vm_mm, start_address, start_pte, pte, nr_pages);
> > > > > +       vmf->orig_pte = ptep_get(vmf->pte);
> > > > > +       arch_do_swap_page(vma->vm_mm, vma, start_address, pte, pte);
> > > >
> > > > Do we need to call arch_do_swap_page() for each subpage? IIUC, the
> > > > corresponding arch_unmap_one() will be called for each subpage.
> > >
> > > i actually thought about this very carefully, right now, the only one who
> > > needs this is sparc and it doesn't support THP_SWAPOUT at all. and
> > > there is no proof doing restoration one by one won't really break sparc.
> > > so i'd like to defer this to when sparc really needs THP_SWAPOUT.
> >
> > Let's ask SPARC developer (Cced) for this.
> >
> > IMHO, even if we cannot get help, we need to change code with our
> > understanding instead of deferring it.
>
> ok. Thanks for Ccing sparc developers.

Hi Khalid & Ying (also Cc'ed the sparc mailing list),

SPARC is the only platform which needs arch_do_swap_page(), and right now its
THP_SWAPOUT is not enabled, so we will not really hit a large folio in the
swapcache there. Just in case you might need THP_SWAPOUT later, I am changing
the code as below:

@@ -4286,7 +4285,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)

         VM_BUG_ON(!folio_test_anon(folio) ||
                   (pte_write(pte) && !PageAnonExclusive(page)));
         set_ptes(vma->vm_mm, start_address, start_ptep, pte, nr_pages);
-        arch_do_swap_page(vma->vm_mm, vma, start_address, pte, pte);
+        for (int i = 0; i < nr_pages; i++) {
+                arch_do_swap_page(vma->vm_mm, vma, start_address + i * PAGE_SIZE,
+                                  pte, pte);
+                pte = pte_advance_pfn(pte, 1);
+        }

         folio_unlock(folio);
         if (folio != swapcache && swapcache) {

For sparc, nr_pages will always be 1 (THP_SWAPOUT is not enabled). For
arm64/x86/riscv, it seems redundant to do a "for (int i = 0; i < nr_pages; i++)"
loop, so another option is adding a helper as below to avoid the idle loop for
arm64/x86/riscv etc.

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index e2f45e22a6d1..ea314a5f9b5e 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1085,6 +1085,28 @@ static inline void arch_do_swap_page(struct mm_struct *mm,
 {

 }
+
+static inline void arch_do_swap_page_nr(struct mm_struct *mm,
+                                        struct vm_area_struct *vma,
+                                        unsigned long addr,
+                                        pte_t pte, pte_t oldpte,
+                                        int nr)
+{
+
+}
+#else
+static inline void arch_do_swap_page_nr(struct mm_struct *mm,
+                                        struct vm_area_struct *vma,
+                                        unsigned long addr,
+                                        pte_t pte, pte_t oldpte,
+                                        int nr)
+{
+        for (int i = 0; i < nr; i++) {
+                arch_do_swap_page(vma->vm_mm, vma, addr + i * PAGE_SIZE,
+                                  pte_advance_pfn(pte, i),
+                                  pte_advance_pfn(oldpte, i));
+        }
+}
 #endif

Please tell me your preference.

BTW, I found oldpte and pte are always the same in do_swap_page(); is something
wrong there? Does arch_do_swap_page() really need two identical arguments?

        vmf->orig_pte = pte;
        ...
        arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);

> > > on the other hand, it seems really bad we have both
> > > arch_swap_restore - for this, arm64 has moved to using folio
> > > and
> > > arch_do_swap_page
> > >
> > > we should somehow unify them later if sparc wants THP_SWAPOUT.

Thanks
Barry
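For reference, with the second option the call site in do_swap_page() would collapse to a single call; a sketch assuming the (mm, vma, addr, pte, oldpte, nr) signature proposed in the mail above (arch_do_swap_page_nr() is only a proposal here, not an existing kernel API):

        /* Map the whole batch, then let the arch restore its metadata for all nr_pages */
        set_ptes(vma->vm_mm, start_address, start_ptep, pte, nr_pages);
        arch_do_swap_page_nr(vma->vm_mm, vma, start_address, pte, pte, nr_pages);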
diff --git a/mm/memory.c b/mm/memory.c
index c4a52e8d740a..9818dc1893c8 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3947,6 +3947,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
         pte_t pte;
         vm_fault_t ret = 0;
         void *shadow = NULL;
+        int nr_pages = 1;
+        unsigned long start_address = vmf->address;
+        pte_t *start_pte = vmf->pte;
+        bool any_swap_shared = false;

         if (!pte_unmap_same(vmf))
                 goto out;
@@ -4137,6 +4141,35 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
          */
         vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
                         &vmf->ptl);
+
+        /* We hit large folios in swapcache */
+        if (start_pte && folio_test_large(folio) && folio_test_swapcache(folio)) {
+                int nr = folio_nr_pages(folio);
+                int idx = folio_page_idx(folio, page);
+                unsigned long folio_start = vmf->address - idx * PAGE_SIZE;
+                unsigned long folio_end = folio_start + nr * PAGE_SIZE;
+                pte_t *folio_ptep;
+                pte_t folio_pte;
+
+                if (unlikely(folio_start < max(vmf->address & PMD_MASK, vma->vm_start)))
+                        goto check_pte;
+                if (unlikely(folio_end > pmd_addr_end(vmf->address, vma->vm_end)))
+                        goto check_pte;
+
+                folio_ptep = vmf->pte - idx;
+                folio_pte = ptep_get(folio_ptep);
+                if (!is_swap_pte(folio_pte) || non_swap_entry(pte_to_swp_entry(folio_pte)) ||
+                    swap_pte_batch(folio_ptep, nr, folio_pte, &any_swap_shared) != nr)
+                        goto check_pte;
+
+                start_address = folio_start;
+                start_pte = folio_ptep;
+                nr_pages = nr;
+                entry = folio->swap;
+                page = &folio->page;
+        }
+
+check_pte:
         if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
                 goto out_nomap;

@@ -4190,6 +4223,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
                          */
                         exclusive = false;
                 }
+
+                /* Reuse the whole large folio iff all entries are exclusive */
+                if (nr_pages > 1 && any_swap_shared)
+                        exclusive = false;
         }

         /*
@@ -4204,12 +4241,14 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
          * We're already holding a reference on the page but haven't mapped it
          * yet.
          */
-        swap_free(entry);
+        swap_free_nr(entry, nr_pages);
         if (should_try_to_free_swap(folio, vma, vmf->flags))
                 folio_free_swap(folio);

-        inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
-        dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
+        folio_ref_add(folio, nr_pages - 1);
+        add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
+        add_mm_counter(vma->vm_mm, MM_SWAPENTS, -nr_pages);
+
         pte = mk_pte(page, vma->vm_page_prot);

         /*
@@ -4219,33 +4258,34 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
          * exclusivity.
          */
         if (!folio_test_ksm(folio) &&
-            (exclusive || folio_ref_count(folio) == 1)) {
+            (exclusive || (folio_ref_count(folio) == nr_pages &&
+                           folio_nr_pages(folio) == nr_pages))) {
                 if (vmf->flags & FAULT_FLAG_WRITE) {
                         pte = maybe_mkwrite(pte_mkdirty(pte), vma);
                         vmf->flags &= ~FAULT_FLAG_WRITE;
                 }
                 rmap_flags |= RMAP_EXCLUSIVE;
         }
-        flush_icache_page(vma, page);
+        flush_icache_pages(vma, page, nr_pages);
         if (pte_swp_soft_dirty(vmf->orig_pte))
                 pte = pte_mksoft_dirty(pte);
         if (pte_swp_uffd_wp(vmf->orig_pte))
                 pte = pte_mkuffd_wp(pte);
-        vmf->orig_pte = pte;

         /* ksm created a completely new copy */
         if (unlikely(folio != swapcache && swapcache)) {
-                folio_add_new_anon_rmap(folio, vma, vmf->address);
+                folio_add_new_anon_rmap(folio, vma, start_address);
                 folio_add_lru_vma(folio, vma);
         } else {
-                folio_add_anon_rmap_pte(folio, page, vma, vmf->address,
-                                        rmap_flags);
+                folio_add_anon_rmap_ptes(folio, page, nr_pages, vma, start_address,
+                                         rmap_flags);
         }

         VM_BUG_ON(!folio_test_anon(folio) ||
                         (pte_write(pte) && !PageAnonExclusive(page)));
-        set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
-        arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
+        set_ptes(vma->vm_mm, start_address, start_pte, pte, nr_pages);
+        vmf->orig_pte = ptep_get(vmf->pte);
+        arch_do_swap_page(vma->vm_mm, vma, start_address, pte, pte);

         folio_unlock(folio);
         if (folio != swapcache && swapcache) {
@@ -4269,7 +4309,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
         }

         /* No need to invalidate - it was non-present before */
-        update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
+        update_mmu_cache_range(vmf, vma, start_address, start_pte, nr_pages);
 unlock:
         if (vmf->pte)
                 pte_unmap_unlock(vmf->pte, vmf->ptl);
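For readers following the series: swap_free_nr() used above is introduced by an earlier patch in this series and is not shown here. Semantically it frees nr_pages contiguous swap entries starting at entry; a naive sketch of the intended behaviour (the real helper can batch the work instead of freeing one entry at a time) might be:

static void swap_free_nr_sketch(swp_entry_t entry, int nr_pages)
{
        int i;

        /* Free nr_pages contiguous swap entries, starting at entry */
        for (i = 0; i < nr_pages; i++)
                swap_free(swp_entry(swp_type(entry), swp_offset(entry) + i));
}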