Message ID | 163969850296.20885.16043920355602134308.stgit@noble.brown (mailing list archive)
State      | New
Series     | Repair SWAP-over-NFS
Hi NeilBrown,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on cifs/for-next]
[also build test ERROR on axboe-block/for-next rostedt-trace/for-next linus/master v5.16-rc5]
[cannot apply to trondmy-nfs/linux-next hnaz-mm/master mszeredi-vfs/overlayfs-next next-20211216]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/NeilBrown/Repair-SWAP-over-NFS/20211217-075659
base:   git://git.samba.org/sfrench/cifs-2.6.git for-next
config: nds32-allnoconfig (https://download.01.org/0day-ci/archive/20211217/202112171515.XWCl9bpF-lkp@intel.com/config)
compiler: nds32le-linux-gcc (GCC) 11.2.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/d34716a962c31e9e0a6e40a702e581a02b7e29f7
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review NeilBrown/Repair-SWAP-over-NFS/20211217-075659
        git checkout d34716a962c31e9e0a6e40a702e581a02b7e29f7
        # save the config file to linux build tree
        mkdir build_dir
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.2.0 make.cross O=build_dir ARCH=nds32 SHELL=/bin/bash

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   mm/memory.c: In function 'do_swap_page':
>> mm/memory.c:3541:33: error: too many arguments to function 'swap_readpage'
    3541 |                 swap_readpage(page, true, NULL);
         |                 ^~~~~~~~~~~~~
   In file included from mm/memory.c:88:
   mm/swap.h:61:19: note: declared here
      61 | static inline int swap_readpage(struct page *page, bool do_poll)
         |                   ^~~~~~~~~~~~~


vim +/swap_readpage +3541 mm/memory.c

  3462
  3463  /*
  3464   * We enter with non-exclusive mmap_lock (to exclude vma changes,
  3465   * but allow concurrent faults), and pte mapped but not yet locked.
  3466   * We return with pte unmapped and unlocked.
  3467   *
  3468   * We return with the mmap_lock locked or unlocked in the same cases
  3469   * as does filemap_fault().
  3470   */
  3471  vm_fault_t do_swap_page(struct vm_fault *vmf)
  3472  {
  3473          struct vm_area_struct *vma = vmf->vma;
  3474          struct page *page = NULL, *swapcache;
  3475          struct swap_info_struct *si = NULL;
  3476          swp_entry_t entry;
  3477          pte_t pte;
  3478          int locked;
  3479          int exclusive = 0;
  3480          vm_fault_t ret = 0;
  3481          void *shadow = NULL;
  3482
  3483          if (!pte_unmap_same(vmf))
  3484                  goto out;
  3485
  3486          entry = pte_to_swp_entry(vmf->orig_pte);
  3487          if (unlikely(non_swap_entry(entry))) {
  3488                  if (is_migration_entry(entry)) {
  3489                          migration_entry_wait(vma->vm_mm, vmf->pmd,
  3490                                               vmf->address);
  3491                  } else if (is_device_exclusive_entry(entry)) {
  3492                          vmf->page = pfn_swap_entry_to_page(entry);
  3493                          ret = remove_device_exclusive_entry(vmf);
  3494                  } else if (is_device_private_entry(entry)) {
  3495                          vmf->page = pfn_swap_entry_to_page(entry);
  3496                          ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
  3497                  } else if (is_hwpoison_entry(entry)) {
  3498                          ret = VM_FAULT_HWPOISON;
  3499                  } else {
  3500                          print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
  3501                          ret = VM_FAULT_SIGBUS;
  3502                  }
  3503                  goto out;
  3504          }
  3505
  3506          /* Prevent swapoff from happening to us. */
  3507          si = get_swap_device(entry);
  3508          if (unlikely(!si))
  3509                  goto out;
  3510
  3511          delayacct_set_flag(current, DELAYACCT_PF_SWAPIN);
  3512          page = lookup_swap_cache(entry, vma, vmf->address);
  3513          swapcache = page;
  3514
  3515          if (!page) {
  3516                  if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
  3517                      __swap_count(entry) == 1) {
  3518                          /* skip swapcache */
  3519                          page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
  3520                                                vmf->address);
  3521                          if (page) {
  3522                                  __SetPageLocked(page);
  3523                                  __SetPageSwapBacked(page);
  3524
  3525                                  if (mem_cgroup_swapin_charge_page(page,
  3526                                          vma->vm_mm, GFP_KERNEL, entry)) {
  3527                                          ret = VM_FAULT_OOM;
  3528                                          goto out_page;
  3529                                  }
  3530                                  mem_cgroup_swapin_uncharge_swap(entry);
  3531
  3532                                  shadow = get_shadow_from_swap_cache(entry);
  3533                                  if (shadow)
  3534                                          workingset_refault(page_folio(page),
  3535                                                                  shadow);
  3536
  3537                                  lru_cache_add(page);
  3538
  3539                                  /* To provide entry to swap_readpage() */
  3540                                  set_page_private(page, entry.val);
> 3541                                  swap_readpage(page, true, NULL);
  3542                                  set_page_private(page, 0);
  3543                          }
  3544                  } else {
  3545                          page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
  3546                                                  vmf);
  3547                          swapcache = page;
  3548                  }
  3549
  3550                  if (!page) {
  3551                          /*
  3552                           * Back out if somebody else faulted in this pte
  3553                           * while we released the pte lock.
  3554                           */
  3555                          vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
  3556                                          vmf->address, &vmf->ptl);
  3557                          if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
  3558                                  ret = VM_FAULT_OOM;
  3559                          delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN);
  3560                          goto unlock;
  3561                  }
  3562
  3563                  /* Had to read the page from swap area: Major fault */
  3564                  ret = VM_FAULT_MAJOR;
  3565                  count_vm_event(PGMAJFAULT);
  3566                  count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
  3567          } else if (PageHWPoison(page)) {
  3568                  /*
  3569                   * hwpoisoned dirty swapcache pages are kept for killing
  3570                   * owner processes (which may be unknown at hwpoison time)
  3571                   */
  3572                  ret = VM_FAULT_HWPOISON;
  3573                  delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN);
  3574                  goto out_release;
  3575          }
  3576
  3577          locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);
  3578
  3579          delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN);
  3580          if (!locked) {
  3581                  ret |= VM_FAULT_RETRY;
  3582                  goto out_release;
  3583          }
  3584
  3585          /*
  3586           * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
  3587           * release the swapcache from under us. The page pin, and pte_same
  3588           * test below, are not enough to exclude that. Even if it is still
  3589           * swapcache, we need to check that the page's swap has not changed.
  3590           */
  3591          if (unlikely((!PageSwapCache(page) ||
  3592                          page_private(page) != entry.val)) && swapcache)
  3593                  goto out_page;
  3594
  3595          page = ksm_might_need_to_copy(page, vma, vmf->address);
  3596          if (unlikely(!page)) {
  3597                  ret = VM_FAULT_OOM;
  3598                  page = swapcache;
  3599                  goto out_page;
  3600          }
  3601
  3602          cgroup_throttle_swaprate(page, GFP_KERNEL);
  3603
  3604          /*
  3605           * Back out if somebody else already faulted in this pte.
  3606           */
  3607          vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
  3608                          &vmf->ptl);
  3609          if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
  3610                  goto out_nomap;
  3611
  3612          if (unlikely(!PageUptodate(page))) {
  3613                  ret = VM_FAULT_SIGBUS;
  3614                  goto out_nomap;
  3615          }
  3616
  3617          /*
  3618           * The page isn't present yet, go ahead with the fault.
  3619           *
  3620           * Be careful about the sequence of operations here.
  3621           * To get its accounting right, reuse_swap_page() must be called
  3622           * while the page is counted on swap but not yet in mapcount i.e.
  3623           * before page_add_anon_rmap() and swap_free(); try_to_free_swap()
  3624           * must be called after the swap_free(), or it will never succeed.
  3625           */
  3626
  3627          inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
  3628          dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
  3629          pte = mk_pte(page, vma->vm_page_prot);
  3630          if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
  3631                  pte = maybe_mkwrite(pte_mkdirty(pte), vma);
  3632                  vmf->flags &= ~FAULT_FLAG_WRITE;
  3633                  ret |= VM_FAULT_WRITE;
  3634                  exclusive = RMAP_EXCLUSIVE;
  3635          }
  3636          flush_icache_page(vma, page);
  3637          if (pte_swp_soft_dirty(vmf->orig_pte))
  3638                  pte = pte_mksoft_dirty(pte);
  3639          if (pte_swp_uffd_wp(vmf->orig_pte)) {
  3640                  pte = pte_mkuffd_wp(pte);
  3641                  pte = pte_wrprotect(pte);
  3642          }
  3643          set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
  3644          arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
  3645          vmf->orig_pte = pte;
  3646
  3647          /* ksm created a completely new copy */
  3648          if (unlikely(page != swapcache && swapcache)) {
  3649                  page_add_new_anon_rmap(page, vma, vmf->address, false);
  3650                  lru_cache_add_inactive_or_unevictable(page, vma);
  3651          } else {
  3652                  do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
  3653          }
  3654
  3655          swap_free(entry);
  3656          if (mem_cgroup_swap_full(page) ||
  3657              (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
  3658                  try_to_free_swap(page);
  3659          unlock_page(page);
  3660          if (page != swapcache && swapcache) {
  3661                  /*
  3662                   * Hold the lock to avoid the swap entry to be reused
  3663                   * until we take the PT lock for the pte_same() check
  3664                   * (to avoid false positives from pte_same). For
  3665                   * further safety release the lock after the swap_free
  3666                   * so that the swap count won't change under a
  3667                   * parallel locked swapcache.
  3668                   */
  3669                  unlock_page(swapcache);
  3670                  put_page(swapcache);
  3671          }
  3672
  3673          if (vmf->flags & FAULT_FLAG_WRITE) {
  3674                  ret |= do_wp_page(vmf);
  3675                  if (ret & VM_FAULT_ERROR)
  3676                          ret &= VM_FAULT_ERROR;
  3677                  goto out;
  3678          }
  3679
  3680          /* No need to invalidate - it was non-present before */
  3681          update_mmu_cache(vma, vmf->address, vmf->pte);
  3682  unlock:
  3683          pte_unmap_unlock(vmf->pte, vmf->ptl);
  3684  out:
  3685          if (si)
  3686                  put_swap_device(si);
  3687          return ret;
  3688  out_nomap:
  3689          pte_unmap_unlock(vmf->pte, vmf->ptl);
  3690  out_page:
  3691          unlock_page(page);
  3692  out_release:
  3693          put_page(page);
  3694          if (page != swapcache && swapcache) {
  3695                  unlock_page(swapcache);
  3696                  put_page(swapcache);
  3697          }
  3698          if (si)
  3699                  put_swap_device(si);
  3700          return ret;
  3701  }

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
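The failing build is nds32-allnoconfig, where CONFIG_SWAP is disabled: the patch updates one declaration of swap_readpage() in mm/swap.h, but the "declared here" note at mm/swap.h:61 points at a second, stub definition (presumably the !CONFIG_SWAP fallback) that still has the old two-argument signature. A minimal sketch of the kind of fixup needed - the exact location and surrounding #ifdef structure are assumptions, not taken from the posted patch:

        #else /* CONFIG_SWAP */
        /* Stub must match the new three-argument prototype. */
        static inline int swap_readpage(struct page *page, bool do_poll,
                                        struct swap_iocb **plug)
        {
                return 0;
        }
        static inline void swap_read_unplug(struct swap_iocb *plug)
        {
        }
        #endif /* CONFIG_SWAP */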
On Fri, Dec 17, 2021 at 10:48:22AM +1100, NeilBrown wrote:
> Some callers currently call blk_finish_plug() *before* the final call to
> swap_readpage(), so the last page cannot be included. This patch moves
> blk_finish_plug() to after the last call, and calls swap_read_unplug()
> there too.

Can you move this fix into a separate prep patch, preferably with a
Fixes tag, so that it gets picked up for backports?

Otherwise this looks sensible to me.
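A Fixes tag names the commit that introduced the problem, which is what lets the stable maintainers pick the fix up automatically. Purely as an illustration of the format - the hash and subject below are placeholders, since the thread does not identify the offending commit:

        Fixes: 0123456789ab ("subject of the commit that introduced the bug")
        Cc: stable@vger.kernel.org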
diff --git a/mm/madvise.c b/mm/madvise.c
index 724470773582..a90870c7a2df 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -191,6 +191,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
 	pte_t *orig_pte;
 	struct vm_area_struct *vma = walk->private;
 	unsigned long index;
+	struct swap_iocb *splug = NULL;
 
 	if (pmd_none_or_trans_huge_or_clear_bad(pmd))
 		return 0;
@@ -212,10 +213,11 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
 			continue;
 
 		page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
-					     vma, index, false);
+					     vma, index, false, &splug);
 		if (page)
 			put_page(page);
 	}
+	swap_read_unplug(splug);
 
 	return 0;
 }
@@ -231,6 +233,7 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma,
 	XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
 	pgoff_t end_index = linear_page_index(vma, end + PAGE_SIZE - 1);
 	struct page *page;
+	struct swap_iocb *splug = NULL;
 
 	rcu_read_lock();
 	xas_for_each(&xas, page, end_index) {
@@ -243,13 +246,14 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma,
 
 		swap = radix_to_swp_entry(page);
 		page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
-					     NULL, 0, false);
+					     NULL, 0, false, &splug);
 		if (page)
 			put_page(page);
 
 		rcu_read_lock();
 	}
 	rcu_read_unlock();
+	swap_read_unplug(splug);
 
 	lru_add_drain();	/* Push any new pages onto the LRU now */
 }
diff --git a/mm/memory.c b/mm/memory.c
index 80bbfd449b40..0ca00f2a6890 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3538,7 +3538,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 
 				/* To provide entry to swap_readpage() */
 				set_page_private(page, entry.val);
-				swap_readpage(page, true);
+				swap_readpage(page, true, NULL);
 				set_page_private(page, 0);
 			}
 		} else {
diff --git a/mm/page_io.c b/mm/page_io.c
index 84859132c9c6..03fbf9463081 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -285,7 +285,8 @@ static void bio_associate_blkg_from_page(struct bio *bio, struct page *page)
 
 struct swap_iocb {
 	struct kiocb		iocb;
-	struct bio_vec		bvec;
+	struct bio_vec		bvec[SWAP_CLUSTER_MAX];
+	int			pages;
 };
 static mempool_t *sio_pool;
 
@@ -303,7 +304,7 @@ int sio_pool_init(void)
 static void sio_write_complete(struct kiocb *iocb, long ret)
 {
 	struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb);
-	struct page *page = sio->bvec.bv_page;
+	struct page *page = sio->bvec[0].bv_page;
 
 	if (ret != 0 && ret != PAGE_SIZE) {
 		/*
@@ -346,10 +347,10 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
 		init_sync_kiocb(&sio->iocb, swap_file);
 		sio->iocb.ki_complete = sio_write_complete;
 		sio->iocb.ki_pos = page_file_offset(page);
-		sio->bvec.bv_page = page;
-		sio->bvec.bv_len = PAGE_SIZE;
-		sio->bvec.bv_offset = 0;
-		iov_iter_bvec(&from, WRITE, &sio->bvec, 1, PAGE_SIZE);
+		sio->bvec[0].bv_page = page;
+		sio->bvec[0].bv_len = PAGE_SIZE;
+		sio->bvec[0].bv_offset = 0;
+		iov_iter_bvec(&from, WRITE, &sio->bvec[0], 1, PAGE_SIZE);
 		ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
 		if (ret != -EIOCBQUEUED)
 			sio_write_complete(&sio->iocb, ret);
@@ -382,21 +383,25 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
 static void sio_read_complete(struct kiocb *iocb, long ret)
 {
 	struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb);
-	struct page *page = sio->bvec.bv_page;
-
-	if (ret != 0 && ret != PAGE_SIZE) {
-		SetPageError(page);
-		ClearPageUptodate(page);
-		pr_alert_ratelimited("Read-error on swap-device\n");
-	} else {
-		SetPageUptodate(page);
-		count_vm_event(PSWPIN);
+	int p;
+
+	for (p = 0; p < sio->pages; p++) {
+		struct page *page = sio->bvec[p].bv_page;
+		if (ret != 0 && ret != PAGE_SIZE * sio->pages) {
+			SetPageError(page);
+			ClearPageUptodate(page);
+			pr_alert_ratelimited("Read-error on swap-device\n");
+		} else {
+			SetPageUptodate(page);
+			count_vm_event(PSWPIN);
+		}
+		unlock_page(page);
 	}
-	unlock_page(page);
 	mempool_free(sio, sio_pool);
 }
 
-int swap_readpage(struct page *page, bool synchronous)
+int swap_readpage(struct page *page, bool synchronous,
+		  struct swap_iocb **plug)
 {
 	struct bio *bio;
 	int ret = 0;
@@ -421,24 +426,35 @@ int swap_readpage(struct page *page, bool synchronous)
 	}
 
 	if (data_race(sis->flags & SWP_FS_OPS)) {
-		struct file *swap_file = sis->swap_file;
-		struct address_space *mapping = swap_file->f_mapping;
-		struct iov_iter from;
-		struct swap_iocb *sio;
+		struct swap_iocb *sio = NULL;
 		loff_t pos = page_file_offset(page);
 
-		sio = mempool_alloc(sio_pool, GFP_KERNEL);
-		init_sync_kiocb(&sio->iocb, swap_file);
-		sio->iocb.ki_pos = pos;
-		sio->iocb.ki_complete = sio_read_complete;
-		sio->bvec.bv_page = page;
-		sio->bvec.bv_len = PAGE_SIZE;
-		sio->bvec.bv_offset = 0;
-
-		iov_iter_bvec(&from, READ, &sio->bvec, 1, PAGE_SIZE);
-		ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
-		if (ret != -EIOCBQUEUED)
-			sio_read_complete(&sio->iocb, ret);
+		if (plug)
+			sio = *plug;
+		if (sio) {
+			if (sio->iocb.ki_filp != sis->swap_file ||
+			    sio->iocb.ki_pos + sio->pages * PAGE_SIZE != pos) {
+				swap_read_unplug(sio);
+				sio = NULL;
+			}
+		}
+		if (!sio) {
+			sio = mempool_alloc(sio_pool, GFP_KERNEL);
+			init_sync_kiocb(&sio->iocb, sis->swap_file);
+			sio->iocb.ki_pos = pos;
+			sio->iocb.ki_complete = sio_read_complete;
+			sio->pages = 0;
+		}
+		sio->bvec[sio->pages].bv_page = page;
+		sio->bvec[sio->pages].bv_len = PAGE_SIZE;
+		sio->bvec[sio->pages].bv_offset = 0;
+		sio->pages += 1;
+		if (sio->pages == ARRAY_SIZE(sio->bvec) || !plug) {
+			swap_read_unplug(sio);
+			sio = NULL;
+		}
+		if (plug)
+			*plug = sio;
 		goto out;
 	}
@@ -490,3 +506,16 @@ int swap_readpage(struct page *page, bool synchronous)
 	psi_memstall_leave(&pflags);
 	return ret;
 }
+
+void __swap_read_unplug(struct swap_iocb *sio)
+{
+	struct iov_iter from;
+	struct address_space *mapping = sio->iocb.ki_filp->f_mapping;
+	int ret;
+
+	iov_iter_bvec(&from, READ, sio->bvec, sio->pages,
+		      PAGE_SIZE * sio->pages);
+	ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
+	if (ret != -EIOCBQUEUED)
+		sio_read_complete(&sio->iocb, ret);
+}
diff --git a/mm/swap.h b/mm/swap.h
index 128a1d3e5558..ce967abc5f46 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -4,7 +4,15 @@
 
 /* linux/mm/page_io.c */
 int sio_pool_init(void);
-int swap_readpage(struct page *page, bool do_poll);
+struct swap_iocb;
+int swap_readpage(struct page *page, bool do_poll,
+		  struct swap_iocb **plug);
+void __swap_read_unplug(struct swap_iocb *plug);
+static inline void swap_read_unplug(struct swap_iocb *plug)
+{
+	if (unlikely(plug))
+		__swap_read_unplug(plug);
+}
 int swap_writepage(struct page *page, struct writeback_control *wbc);
 void end_swap_bio_write(struct bio *bio);
 int __swap_writepage(struct page *page, struct writeback_control *wbc,
@@ -38,7 +46,8 @@ struct page *find_get_incore_page(struct address_space *mapping,
 				  pgoff_t index);
 struct page *read_swap_cache_async(swp_entry_t, gfp_t,
 				   struct vm_area_struct *vma, unsigned long addr,
-				   bool do_poll);
+				   bool do_poll,
+				   struct swap_iocb **plug);
 struct page *__read_swap_cache_async(swp_entry_t, gfp_t,
 				     struct vm_area_struct *vma, unsigned long addr,
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 514b86b05488..5cb2c75fa247 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -520,14 +520,16 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
  * the swap entry is no longer in use.
  */
 struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
-		struct vm_area_struct *vma, unsigned long addr, bool do_poll)
+				   struct vm_area_struct *vma,
+				   unsigned long addr, bool do_poll,
+				   struct swap_iocb **plug)
 {
 	bool page_was_allocated;
 	struct page *retpage = __read_swap_cache_async(entry, gfp_mask,
 			vma, addr, &page_was_allocated);
 
 	if (page_was_allocated)
-		swap_readpage(retpage, do_poll);
+		swap_readpage(retpage, do_poll, plug);
 
 	return retpage;
 }
@@ -621,10 +623,12 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
 	unsigned long mask;
 	struct swap_info_struct *si = swp_swap_info(entry);
 	struct blk_plug plug;
+	struct swap_iocb *splug = NULL;
 	bool do_poll = true, page_allocated;
 	struct vm_area_struct *vma = vmf->vma;
 	unsigned long addr = vmf->address;
 
+	blk_start_plug(&plug);
 	mask = swapin_nr_pages(offset) - 1;
 	if (!mask)
 		goto skip;
@@ -638,7 +642,6 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
 	if (end_offset >= si->max)
 		end_offset = si->max - 1;
 
-	blk_start_plug(&plug);
 	for (offset = start_offset; offset <= end_offset ; offset++) {
 		/* Ok, do the async read-ahead now */
 		page = __read_swap_cache_async(
@@ -647,7 +650,7 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
 		if (!page)
 			continue;
 		if (page_allocated) {
-			swap_readpage(page, false);
+			swap_readpage(page, false, &splug);
 			if (offset != entry_offset) {
 				SetPageReadahead(page);
 				count_vm_event(SWAP_RA);
@@ -655,11 +658,14 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
 		}
 		put_page(page);
 	}
-	blk_finish_plug(&plug);
 
 	lru_add_drain();	/* Push any new pages onto the LRU now */
 skip:
-	return read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll);
+	page = read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll,
+				     &splug);
+	blk_finish_plug(&plug);
+	swap_read_unplug(splug);
+	return page;
 }
 
 int init_swap_address_space(unsigned int type, unsigned long nr_pages)
@@ -790,6 +796,7 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
 				       struct vm_fault *vmf)
 {
 	struct blk_plug plug;
+	struct swap_iocb *splug = NULL;
 	struct vm_area_struct *vma = vmf->vma;
 	struct page *page;
 	pte_t *pte, pentry;
@@ -800,11 +807,11 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
 		.win = 1,
 	};
 
+	blk_start_plug(&plug);
 	swap_ra_info(vmf, &ra_info);
 	if (ra_info.win == 1)
 		goto skip;
 
-	blk_start_plug(&plug);
 	for (i = 0, pte = ra_info.ptes; i < ra_info.nr_pte;
 	     i++, pte++) {
 		pentry = *pte;
@@ -820,7 +827,7 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
 		if (!page)
 			continue;
 		if (page_allocated) {
-			swap_readpage(page, false);
+			swap_readpage(page, false, &splug);
 			if (i != ra_info.offset) {
 				SetPageReadahead(page);
 				count_vm_event(SWAP_RA);
@@ -828,11 +835,13 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
 		}
 		put_page(page);
 	}
-	blk_finish_plug(&plug);
 	lru_add_drain();
 skip:
-	return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address,
-				     ra_info.win == 1);
+	page = read_swap_cache_async(fentry, gfp_mask, vma, vmf->address,
+				     ra_info.win == 1, &splug);
+	blk_finish_plug(&plug);
+	swap_read_unplug(splug);
+	return page;
 }
 
 /**
swap_readpage() is given one page at a time, but may be called repeatedly in
succession.  For block-device swap-space, the blk_plug functionality allows
multiple pages to be combined together at lower layers.  That cannot be used
for SWP_FS_OPS as blk_plug may not exist - it is only active when
CONFIG_BLOCK=y.  Consequently all swap reads over NFS are single page reads.

With this patch we pass in a pointer-to-pointer in which swap_readpage() can
store state between calls - much like the effect of blk_plug.  After calling
swap_readpage() some number of times, the state will be passed to
swap_read_unplug(), which can submit the combined request.

Some callers currently call blk_finish_plug() *before* the final call to
swap_readpage(), so the last page cannot be included.  This patch moves
blk_finish_plug() to after the last call, and calls swap_read_unplug() there
too.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 mm/madvise.c    |    8 +++--
 mm/memory.c     |    2 +
 mm/page_io.c    |   95 ++++++++++++++++++++++++++++++++++++-------------------
 mm/swap.h       |   13 ++++++--
 mm/swap_state.c |   31 ++++++++++++------
 5 files changed, 100 insertions(+), 49 deletions(-)
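For reference, the calling convention this introduces, sketched from the
mm/madvise.c hunk above (not a complete function; deriving 'entry' for each
address and the surrounding locking are elided):

        struct swap_iocb *splug = NULL;
        struct page *page;

        /* Issue many reads; contiguous ones are batched in 'splug'. */
        for (addr = start; addr < end; addr += PAGE_SIZE) {
                page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
                                             vma, addr, false, &splug);
                if (page)
                        put_page(page);
        }
        /* Submit whatever was batched but not yet issued. */
        swap_read_unplug(splug);

Passing NULL instead of &splug (as do_swap_page() does) makes each read
submit immediately, preserving the old single-page behaviour.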