
[06/18] MM: submit multipage reads for SWP_FS_OPS swap-space

Message ID 163969850296.20885.16043920355602134308.stgit@noble.brown (mailing list archive)
State New
Series: Repair SWAP-over-NFS

Commit Message

NeilBrown Dec. 16, 2021, 11:48 p.m. UTC
swap_readpage() is given one page at a time, but may be called repeatedly
in succession.
For block-device swapspace, the blk_plug functionality allows multiple
pages to be combined together at lower layers.
That cannot be used for SWP_FS_OPS as blk_plug may not exist - it is
only active when CONFIG_BLOCK=y.  Consequently all swap reads over NFS
are single-page reads.

With this patch we pass in a pointer-to-pointer where swap_readpage()
can store state between calls - much like the effect of blk_plug.  After
calling swap_readpage() some number of times, the state will be passed
to swap_read_unplug(), which submits the combined request.

Some callers currently call blk_finish_plug() *before* the final call to
swap_readpage(), so the last page cannot be included.  This patch moves
blk_finish_plug() to after the last call, and calls swap_read_unplug()
there too.
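
For illustration, the intended caller pattern is roughly the sketch below
(a fragment modelled on the mm/madvise.c hunk in this patch; 'start',
'end', 'entry', 'index' and 'vma' are placeholders, so it is not
compilable as-is):

	struct swap_iocb *splug = NULL;	/* per-caller read "plug", starts empty */

	for (index = start; index < end; index++) {
		/* each call may append one page to the pending swap_iocb */
		page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
					     vma, index, false, &splug);
		if (page)
			put_page(page);
	}
	swap_read_unplug(splug);	/* submit whatever was batched */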

Signed-off-by: NeilBrown <neilb@suse.de>
---
 mm/madvise.c    |    8 +++--
 mm/memory.c     |    2 +
 mm/page_io.c    |   95 ++++++++++++++++++++++++++++++++++++-------------------
 mm/swap.h       |   13 ++++++--
 mm/swap_state.c |   31 ++++++++++++------
 5 files changed, 100 insertions(+), 49 deletions(-)

Comments

kernel test robot Dec. 17, 2021, 7:09 a.m. UTC | #1
Hi NeilBrown,

Thank you for the patch! Here is something to improve:

[auto build test ERROR on cifs/for-next]
[also build test ERROR on axboe-block/for-next rostedt-trace/for-next linus/master v5.16-rc5]
[cannot apply to trondmy-nfs/linux-next hnaz-mm/master mszeredi-vfs/overlayfs-next next-20211216]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting a patch, we suggest using '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/NeilBrown/Repair-SWAP-over-NFS/20211217-075659
base:   git://git.samba.org/sfrench/cifs-2.6.git for-next
config: nds32-allnoconfig (https://download.01.org/0day-ci/archive/20211217/202112171515.XWCl9bpF-lkp@intel.com/config)
compiler: nds32le-linux-gcc (GCC) 11.2.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/d34716a962c31e9e0a6e40a702e581a02b7e29f7
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review NeilBrown/Repair-SWAP-over-NFS/20211217-075659
        git checkout d34716a962c31e9e0a6e40a702e581a02b7e29f7
        # save the config file to linux build tree
        mkdir build_dir
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.2.0 make.cross O=build_dir ARCH=nds32 SHELL=/bin/bash

If you fix the issue, kindly add the following tag as appropriate:
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   mm/memory.c: In function 'do_swap_page':
>> mm/memory.c:3541:33: error: too many arguments to function 'swap_readpage'
    3541 |                                 swap_readpage(page, true, NULL);
         |                                 ^~~~~~~~~~~~~
   In file included from mm/memory.c:88:
   mm/swap.h:61:19: note: declared here
      61 | static inline int swap_readpage(struct page *page, bool do_poll)
         |                   ^~~~~~~~~~~~~
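
With nds32-allnoconfig, CONFIG_SWAP is not set, so the declaration at
mm/swap.h:61 is presumably the !CONFIG_SWAP stub, which was not updated
to the new three-argument signature.  A minimal sketch of the kind of
fix needed (hypothetical, not part of the posted patch):

	/* mm/swap.h, !CONFIG_SWAP case: hypothetical stub update */
	struct swap_iocb;
	static inline int swap_readpage(struct page *page, bool do_poll,
					struct swap_iocb **plug)
	{
		return 0;
	}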


vim +/swap_readpage +3541 mm/memory.c

  3462	
  3463	/*
  3464	 * We enter with non-exclusive mmap_lock (to exclude vma changes,
  3465	 * but allow concurrent faults), and pte mapped but not yet locked.
  3466	 * We return with pte unmapped and unlocked.
  3467	 *
  3468	 * We return with the mmap_lock locked or unlocked in the same cases
  3469	 * as does filemap_fault().
  3470	 */
  3471	vm_fault_t do_swap_page(struct vm_fault *vmf)
  3472	{
  3473		struct vm_area_struct *vma = vmf->vma;
  3474		struct page *page = NULL, *swapcache;
  3475		struct swap_info_struct *si = NULL;
  3476		swp_entry_t entry;
  3477		pte_t pte;
  3478		int locked;
  3479		int exclusive = 0;
  3480		vm_fault_t ret = 0;
  3481		void *shadow = NULL;
  3482	
  3483		if (!pte_unmap_same(vmf))
  3484			goto out;
  3485	
  3486		entry = pte_to_swp_entry(vmf->orig_pte);
  3487		if (unlikely(non_swap_entry(entry))) {
  3488			if (is_migration_entry(entry)) {
  3489				migration_entry_wait(vma->vm_mm, vmf->pmd,
  3490						     vmf->address);
  3491			} else if (is_device_exclusive_entry(entry)) {
  3492				vmf->page = pfn_swap_entry_to_page(entry);
  3493				ret = remove_device_exclusive_entry(vmf);
  3494			} else if (is_device_private_entry(entry)) {
  3495				vmf->page = pfn_swap_entry_to_page(entry);
  3496				ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
  3497			} else if (is_hwpoison_entry(entry)) {
  3498				ret = VM_FAULT_HWPOISON;
  3499			} else {
  3500				print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
  3501				ret = VM_FAULT_SIGBUS;
  3502			}
  3503			goto out;
  3504		}
  3505	
  3506		/* Prevent swapoff from happening to us. */
  3507		si = get_swap_device(entry);
  3508		if (unlikely(!si))
  3509			goto out;
  3510	
  3511		delayacct_set_flag(current, DELAYACCT_PF_SWAPIN);
  3512		page = lookup_swap_cache(entry, vma, vmf->address);
  3513		swapcache = page;
  3514	
  3515		if (!page) {
  3516			if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
  3517			    __swap_count(entry) == 1) {
  3518				/* skip swapcache */
  3519				page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
  3520								vmf->address);
  3521				if (page) {
  3522					__SetPageLocked(page);
  3523					__SetPageSwapBacked(page);
  3524	
  3525					if (mem_cgroup_swapin_charge_page(page,
  3526						vma->vm_mm, GFP_KERNEL, entry)) {
  3527						ret = VM_FAULT_OOM;
  3528						goto out_page;
  3529					}
  3530					mem_cgroup_swapin_uncharge_swap(entry);
  3531	
  3532					shadow = get_shadow_from_swap_cache(entry);
  3533					if (shadow)
  3534						workingset_refault(page_folio(page),
  3535									shadow);
  3536	
  3537					lru_cache_add(page);
  3538	
  3539					/* To provide entry to swap_readpage() */
  3540					set_page_private(page, entry.val);
> 3541					swap_readpage(page, true, NULL);
  3542					set_page_private(page, 0);
  3543				}
  3544			} else {
  3545				page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
  3546							vmf);
  3547				swapcache = page;
  3548			}
  3549	
  3550			if (!page) {
  3551				/*
  3552				 * Back out if somebody else faulted in this pte
  3553				 * while we released the pte lock.
  3554				 */
  3555				vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
  3556						vmf->address, &vmf->ptl);
  3557				if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
  3558					ret = VM_FAULT_OOM;
  3559				delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN);
  3560				goto unlock;
  3561			}
  3562	
  3563			/* Had to read the page from swap area: Major fault */
  3564			ret = VM_FAULT_MAJOR;
  3565			count_vm_event(PGMAJFAULT);
  3566			count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
  3567		} else if (PageHWPoison(page)) {
  3568			/*
  3569			 * hwpoisoned dirty swapcache pages are kept for killing
  3570			 * owner processes (which may be unknown at hwpoison time)
  3571			 */
  3572			ret = VM_FAULT_HWPOISON;
  3573			delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN);
  3574			goto out_release;
  3575		}
  3576	
  3577		locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);
  3578	
  3579		delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN);
  3580		if (!locked) {
  3581			ret |= VM_FAULT_RETRY;
  3582			goto out_release;
  3583		}
  3584	
  3585		/*
  3586		 * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
  3587		 * release the swapcache from under us.  The page pin, and pte_same
  3588		 * test below, are not enough to exclude that.  Even if it is still
  3589		 * swapcache, we need to check that the page's swap has not changed.
  3590		 */
  3591		if (unlikely((!PageSwapCache(page) ||
  3592				page_private(page) != entry.val)) && swapcache)
  3593			goto out_page;
  3594	
  3595		page = ksm_might_need_to_copy(page, vma, vmf->address);
  3596		if (unlikely(!page)) {
  3597			ret = VM_FAULT_OOM;
  3598			page = swapcache;
  3599			goto out_page;
  3600		}
  3601	
  3602		cgroup_throttle_swaprate(page, GFP_KERNEL);
  3603	
  3604		/*
  3605		 * Back out if somebody else already faulted in this pte.
  3606		 */
  3607		vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
  3608				&vmf->ptl);
  3609		if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
  3610			goto out_nomap;
  3611	
  3612		if (unlikely(!PageUptodate(page))) {
  3613			ret = VM_FAULT_SIGBUS;
  3614			goto out_nomap;
  3615		}
  3616	
  3617		/*
  3618		 * The page isn't present yet, go ahead with the fault.
  3619		 *
  3620		 * Be careful about the sequence of operations here.
  3621		 * To get its accounting right, reuse_swap_page() must be called
  3622		 * while the page is counted on swap but not yet in mapcount i.e.
  3623		 * before page_add_anon_rmap() and swap_free(); try_to_free_swap()
  3624		 * must be called after the swap_free(), or it will never succeed.
  3625		 */
  3626	
  3627		inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
  3628		dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
  3629		pte = mk_pte(page, vma->vm_page_prot);
  3630		if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
  3631			pte = maybe_mkwrite(pte_mkdirty(pte), vma);
  3632			vmf->flags &= ~FAULT_FLAG_WRITE;
  3633			ret |= VM_FAULT_WRITE;
  3634			exclusive = RMAP_EXCLUSIVE;
  3635		}
  3636		flush_icache_page(vma, page);
  3637		if (pte_swp_soft_dirty(vmf->orig_pte))
  3638			pte = pte_mksoft_dirty(pte);
  3639		if (pte_swp_uffd_wp(vmf->orig_pte)) {
  3640			pte = pte_mkuffd_wp(pte);
  3641			pte = pte_wrprotect(pte);
  3642		}
  3643		set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
  3644		arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
  3645		vmf->orig_pte = pte;
  3646	
  3647		/* ksm created a completely new copy */
  3648		if (unlikely(page != swapcache && swapcache)) {
  3649			page_add_new_anon_rmap(page, vma, vmf->address, false);
  3650			lru_cache_add_inactive_or_unevictable(page, vma);
  3651		} else {
  3652			do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
  3653		}
  3654	
  3655		swap_free(entry);
  3656		if (mem_cgroup_swap_full(page) ||
  3657		    (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
  3658			try_to_free_swap(page);
  3659		unlock_page(page);
  3660		if (page != swapcache && swapcache) {
  3661			/*
  3662			 * Hold the lock to avoid the swap entry to be reused
  3663			 * until we take the PT lock for the pte_same() check
  3664			 * (to avoid false positives from pte_same). For
  3665			 * further safety release the lock after the swap_free
  3666			 * so that the swap count won't change under a
  3667			 * parallel locked swapcache.
  3668			 */
  3669			unlock_page(swapcache);
  3670			put_page(swapcache);
  3671		}
  3672	
  3673		if (vmf->flags & FAULT_FLAG_WRITE) {
  3674			ret |= do_wp_page(vmf);
  3675			if (ret & VM_FAULT_ERROR)
  3676				ret &= VM_FAULT_ERROR;
  3677			goto out;
  3678		}
  3679	
  3680		/* No need to invalidate - it was non-present before */
  3681		update_mmu_cache(vma, vmf->address, vmf->pte);
  3682	unlock:
  3683		pte_unmap_unlock(vmf->pte, vmf->ptl);
  3684	out:
  3685		if (si)
  3686			put_swap_device(si);
  3687		return ret;
  3688	out_nomap:
  3689		pte_unmap_unlock(vmf->pte, vmf->ptl);
  3690	out_page:
  3691		unlock_page(page);
  3692	out_release:
  3693		put_page(page);
  3694		if (page != swapcache && swapcache) {
  3695			unlock_page(swapcache);
  3696			put_page(swapcache);
  3697		}
  3698		if (si)
  3699			put_swap_device(si);
  3700		return ret;
  3701	}
  3702	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
Christoph Hellwig Dec. 21, 2021, 8:44 a.m. UTC | #2
On Fri, Dec 17, 2021 at 10:48:22AM +1100, NeilBrown wrote:
> Some callers currently call blk_finish_plug() *before* the final call to
> swap_readpage(), so the last page cannot be included.  This patch moves
> blk_finish_plug() to after the last call, and calls swap_read_unplug()
> there too.

Can you move this fix into a separate prep patch, preferably with a
Fixes tag so that it gets picked up for backports?

Otherwise this looks sensible to me.
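
For reference, a Fixes tag in such a prep patch takes the standard form
below; the commit reference here is a placeholder, since the offending
commit is not identified in this thread:

	Fixes: 123456789abc ("subject line of the offending commit")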

Patch

diff --git a/mm/madvise.c b/mm/madvise.c
index 724470773582..a90870c7a2df 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -191,6 +191,7 @@  static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
 	pte_t *orig_pte;
 	struct vm_area_struct *vma = walk->private;
 	unsigned long index;
+	struct swap_iocb *splug = NULL;
 
 	if (pmd_none_or_trans_huge_or_clear_bad(pmd))
 		return 0;
@@ -212,10 +213,11 @@  static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
 			continue;
 
 		page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
-							vma, index, false);
+					     vma, index, false, &splug);
 		if (page)
 			put_page(page);
 	}
+	swap_read_unplug(splug);
 
 	return 0;
 }
@@ -231,6 +233,7 @@  static void force_shm_swapin_readahead(struct vm_area_struct *vma,
 	XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
 	pgoff_t end_index = linear_page_index(vma, end + PAGE_SIZE - 1);
 	struct page *page;
+	struct swap_iocb *splug = NULL;
 
 	rcu_read_lock();
 	xas_for_each(&xas, page, end_index) {
@@ -243,13 +246,14 @@  static void force_shm_swapin_readahead(struct vm_area_struct *vma,
 
 		swap = radix_to_swp_entry(page);
 		page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
-							NULL, 0, false);
+					     NULL, 0, false, &splug);
 		if (page)
 			put_page(page);
 
 		rcu_read_lock();
 	}
 	rcu_read_unlock();
+	swap_read_unplug(splug);
 
 	lru_add_drain();	/* Push any new pages onto the LRU now */
 }
diff --git a/mm/memory.c b/mm/memory.c
index 80bbfd449b40..0ca00f2a6890 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3538,7 +3538,7 @@  vm_fault_t do_swap_page(struct vm_fault *vmf)
 
 				/* To provide entry to swap_readpage() */
 				set_page_private(page, entry.val);
-				swap_readpage(page, true);
+				swap_readpage(page, true, NULL);
 				set_page_private(page, 0);
 			}
 		} else {
diff --git a/mm/page_io.c b/mm/page_io.c
index 84859132c9c6..03fbf9463081 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -285,7 +285,8 @@  static void bio_associate_blkg_from_page(struct bio *bio, struct page *page)
 
 struct swap_iocb {
 	struct kiocb		iocb;
-	struct bio_vec		bvec;
+	struct bio_vec		bvec[SWAP_CLUSTER_MAX];
+	int			pages;
 };
 static mempool_t *sio_pool;
 
@@ -303,7 +304,7 @@  int sio_pool_init(void)
 static void sio_write_complete(struct kiocb *iocb, long ret)
 {
 	struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb);
-	struct page *page = sio->bvec.bv_page;
+	struct page *page = sio->bvec[0].bv_page;
 
 	if (ret != 0 && ret != PAGE_SIZE) {
 		/*
@@ -346,10 +347,10 @@  int __swap_writepage(struct page *page, struct writeback_control *wbc,
 		init_sync_kiocb(&sio->iocb, swap_file);
 		sio->iocb.ki_complete = sio_write_complete;
 		sio->iocb.ki_pos = page_file_offset(page);
-		sio->bvec.bv_page = page;
-		sio->bvec.bv_len = PAGE_SIZE;
-		sio->bvec.bv_offset = 0;
-		iov_iter_bvec(&from, WRITE, &sio->bvec, 1, PAGE_SIZE);
+		sio->bvec[0].bv_page = page;
+		sio->bvec[0].bv_len = PAGE_SIZE;
+		sio->bvec[0].bv_offset = 0;
+		iov_iter_bvec(&from, WRITE, &sio->bvec[0], 1, PAGE_SIZE);
 		ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
 		if (ret != -EIOCBQUEUED)
 			sio_write_complete(&sio->iocb, ret);
@@ -382,21 +383,25 @@  int __swap_writepage(struct page *page, struct writeback_control *wbc,
 static void sio_read_complete(struct kiocb *iocb, long ret)
 {
 	struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb);
-	struct page *page = sio->bvec.bv_page;
-
-	if (ret != 0 && ret != PAGE_SIZE) {
-		SetPageError(page);
-		ClearPageUptodate(page);
-		pr_alert_ratelimited("Read-error on swap-device\n");
-	} else {
-		SetPageUptodate(page);
-		count_vm_event(PSWPIN);
+	int p;
+
+	for (p = 0; p < sio->pages; p++) {
+		struct page *page = sio->bvec[p].bv_page;
+		if (ret != 0 && ret != PAGE_SIZE * sio->pages) {
+			SetPageError(page);
+			ClearPageUptodate(page);
+			pr_alert_ratelimited("Read-error on swap-device\n");
+		} else {
+			SetPageUptodate(page);
+			count_vm_event(PSWPIN);
+		}
+		unlock_page(page);
 	}
-	unlock_page(page);
 	mempool_free(sio, sio_pool);
 }
 
-int swap_readpage(struct page *page, bool synchronous)
+int swap_readpage(struct page *page, bool synchronous,
+		  struct swap_iocb **plug)
 {
 	struct bio *bio;
 	int ret = 0;
@@ -421,24 +426,35 @@  int swap_readpage(struct page *page, bool synchronous)
 	}
 
 	if (data_race(sis->flags & SWP_FS_OPS)) {
-		struct file *swap_file = sis->swap_file;
-		struct address_space *mapping = swap_file->f_mapping;
-		struct iov_iter from;
-		struct swap_iocb *sio;
+		struct swap_iocb *sio = NULL;
 		loff_t pos = page_file_offset(page);
 
-		sio = mempool_alloc(sio_pool, GFP_KERNEL);
-		init_sync_kiocb(&sio->iocb, swap_file);
-		sio->iocb.ki_pos = pos;
-		sio->iocb.ki_complete = sio_read_complete;
-		sio->bvec.bv_page = page;
-		sio->bvec.bv_len = PAGE_SIZE;
-		sio->bvec.bv_offset = 0;
-
-		iov_iter_bvec(&from, READ, &sio->bvec, 1, PAGE_SIZE);
-		ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
-		if (ret != -EIOCBQUEUED)
-			sio_read_complete(&sio->iocb, ret);
+		if (*plug)
+			sio = *plug;
+		if (sio) {
+			if (sio->iocb.ki_filp != sis->swap_file ||
+			    sio->iocb.ki_pos + sio->pages * PAGE_SIZE != pos) {
+				swap_read_unplug(sio);
+				sio = NULL;
+			}
+		}
+		if (!sio) {
+			sio = mempool_alloc(sio_pool, GFP_KERNEL);
+			init_sync_kiocb(&sio->iocb, sis->swap_file);
+			sio->iocb.ki_pos = pos;
+			sio->iocb.ki_complete = sio_read_complete;
+			sio->pages = 0;
+		}
+		sio->bvec[sio->pages].bv_page = page;
+		sio->bvec[sio->pages].bv_len = PAGE_SIZE;
+		sio->bvec[sio->pages].bv_offset = 0;
+		sio->pages += 1;
+		if (sio->pages == ARRAY_SIZE(sio->bvec) || !plug) {
+			swap_read_unplug(sio);
+			sio = NULL;
+		}
+		if (plug)
+			*plug = sio;
 
 		goto out;
 	}
@@ -490,3 +506,16 @@  int swap_readpage(struct page *page, bool synchronous)
 	psi_memstall_leave(&pflags);
 	return ret;
 }
+
+void __swap_read_unplug(struct swap_iocb *sio)
+{
+	struct iov_iter from;
+	struct address_space *mapping = sio->iocb.ki_filp->f_mapping;
+	int ret;
+
+	iov_iter_bvec(&from, READ, sio->bvec, sio->pages,
+		      PAGE_SIZE * sio->pages);
+	ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
+	if (ret != -EIOCBQUEUED)
+		sio_read_complete(&sio->iocb, ret);
+}
diff --git a/mm/swap.h b/mm/swap.h
index 128a1d3e5558..ce967abc5f46 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -4,7 +4,15 @@ 
 
 /* linux/mm/page_io.c */
 int sio_pool_init(void);
-int swap_readpage(struct page *page, bool do_poll);
+struct swap_iocb;
+int swap_readpage(struct page *page, bool do_poll,
+		  struct swap_iocb **plug);
+void __swap_read_unplug(struct swap_iocb *plug);
+static inline void swap_read_unplug(struct swap_iocb *plug)
+{
+	if (unlikely(plug))
+		__swap_read_unplug(plug);
+}
 int swap_writepage(struct page *page, struct writeback_control *wbc);
 void end_swap_bio_write(struct bio *bio);
 int __swap_writepage(struct page *page, struct writeback_control *wbc,
@@ -38,7 +46,8 @@  struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index);
 struct page *read_swap_cache_async(swp_entry_t, gfp_t,
 				   struct vm_area_struct *vma,
 				   unsigned long addr,
-				   bool do_poll);
+				   bool do_poll,
+				   struct swap_iocb **plug);
 struct page *__read_swap_cache_async(swp_entry_t, gfp_t,
 				     struct vm_area_struct *vma,
 				     unsigned long addr,
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 514b86b05488..5cb2c75fa247 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -520,14 +520,16 @@  struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
  * the swap entry is no longer in use.
  */
 struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
-		struct vm_area_struct *vma, unsigned long addr, bool do_poll)
+				   struct vm_area_struct *vma,
+				   unsigned long addr, bool do_poll,
+				   struct swap_iocb **plug)
 {
 	bool page_was_allocated;
 	struct page *retpage = __read_swap_cache_async(entry, gfp_mask,
 			vma, addr, &page_was_allocated);
 
 	if (page_was_allocated)
-		swap_readpage(retpage, do_poll);
+		swap_readpage(retpage, do_poll, plug);
 
 	return retpage;
 }
@@ -621,10 +623,12 @@  struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
 	unsigned long mask;
 	struct swap_info_struct *si = swp_swap_info(entry);
 	struct blk_plug plug;
+	struct swap_iocb *splug = NULL;
 	bool do_poll = true, page_allocated;
 	struct vm_area_struct *vma = vmf->vma;
 	unsigned long addr = vmf->address;
 
+	blk_start_plug(&plug);
 	mask = swapin_nr_pages(offset) - 1;
 	if (!mask)
 		goto skip;
@@ -638,7 +642,6 @@  struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
 	if (end_offset >= si->max)
 		end_offset = si->max - 1;
 
-	blk_start_plug(&plug);
 	for (offset = start_offset; offset <= end_offset ; offset++) {
 		/* Ok, do the async read-ahead now */
 		page = __read_swap_cache_async(
@@ -647,7 +650,7 @@  struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
 		if (!page)
 			continue;
 		if (page_allocated) {
-			swap_readpage(page, false);
+			swap_readpage(page, false, &splug);
 			if (offset != entry_offset) {
 				SetPageReadahead(page);
 				count_vm_event(SWAP_RA);
@@ -655,11 +658,14 @@  struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
 		}
 		put_page(page);
 	}
-	blk_finish_plug(&plug);
 
 	lru_add_drain();	/* Push any new pages onto the LRU now */
 skip:
-	return read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll);
+	page = read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll,
+				     &splug);
+	blk_finish_plug(&plug);
+	swap_read_unplug(splug);
+	return page;
 }
 
 int init_swap_address_space(unsigned int type, unsigned long nr_pages)
@@ -790,6 +796,7 @@  static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
 				       struct vm_fault *vmf)
 {
 	struct blk_plug plug;
+	struct swap_iocb *splug = NULL;
 	struct vm_area_struct *vma = vmf->vma;
 	struct page *page;
 	pte_t *pte, pentry;
@@ -800,11 +807,11 @@  static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
 		.win = 1,
 	};
 
+	blk_start_plug(&plug);
 	swap_ra_info(vmf, &ra_info);
 	if (ra_info.win == 1)
 		goto skip;
 
-	blk_start_plug(&plug);
 	for (i = 0, pte = ra_info.ptes; i < ra_info.nr_pte;
 	     i++, pte++) {
 		pentry = *pte;
@@ -820,7 +827,7 @@  static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
 		if (!page)
 			continue;
 		if (page_allocated) {
-			swap_readpage(page, false);
+			swap_readpage(page, false, &splug);
 			if (i != ra_info.offset) {
 				SetPageReadahead(page);
 				count_vm_event(SWAP_RA);
@@ -828,11 +835,13 @@  static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
 		}
 		put_page(page);
 	}
-	blk_finish_plug(&plug);
 	lru_add_drain();
 skip:
-	return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address,
-				     ra_info.win == 1);
+	page = read_swap_cache_async(fentry, gfp_mask, vma, vmf->address,
+				     ra_info.win == 1, &splug);
+	blk_finish_plug(&plug);
+	swap_read_unplug(splug);
+	return page;
 }
 
 /**