diff mbox series

[01/18] Structural cleanup for filesystem-based swap

Message ID 163969850251.20885.10819272484905153807.stgit@noble.brown (mailing list archive)
State New
Headers show
Series Repair SWAP-over-NFS | expand

Commit Message

NeilBrown Dec. 16, 2021, 11:48 p.m. UTC
Linux primarily uses IO to block devices for swap, but can send the IO
requests to a filesystem.  This has only ever worked for NFS, and that
hasn't worked for a while due to a lack of testing.  This seems like a
good time for some tidy-up before restoring swap-over-NFS functionality.

This patch:
 - updates the documentation (both copies!) for swap_activate which
   is woefully out-of-date
 - introduces a new address_space operation "swap_rw" for swap IO.
   The code currently used ->readpage for reads and ->direct_IO for
   writes.  The former imposes a limit of one-page-at-a-time, the
   later means that direct writes and swap writes are encouraged to
   use the same path.  While similar, swap can often be simpler as
   it can assume that no allocation is needed, and coherence with the
   page cache is irrelevant.
 - move the responsibility for setting SWP_FS_OPS to ->swap_activate()
   and also requires it to always call add_swap_extent().  This makes
   it much easier to find filesystems that require SWP_FS_OPS.
 - drops the call to the filesystem for ->set_page_dirty().  These
   pages do not belong to the filesystem, and it has no interest
   in the dirty status.

writeout is switched to ->swap_rw, but read-in is not as that requires
too much change for this patch.

Both cifs and nfs set SWP_FS_OPS but neither provide a swap_rw, so both
will now fail to activate swap.  cifs never really tried to provide swap
support as ->direct_IO always returns an error.  NFS will be fixed up
with following patches.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 Documentation/filesystems/locking.rst |   18 ++++++++++++------
 Documentation/filesystems/vfs.rst     |   17 ++++++++++++-----
 fs/cifs/file.c                        |    7 ++++++-
 fs/nfs/file.c                         |   17 +++++++++++++++--
 include/linux/fs.h                    |    1 +
 include/linux/swap.h                  |    1 -
 mm/page_io.c                          |   26 ++++++--------------------
 mm/swap_state.c                       |    2 +-
 mm/swapfile.c                         |   10 +++-------
 9 files changed, 56 insertions(+), 43 deletions(-)

Comments

kernel test robot Dec. 17, 2021, 10:33 a.m. UTC | #1
Hi NeilBrown,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on cifs/for-next]
[also build test ERROR on axboe-block/for-next mszeredi-vfs/overlayfs-next rostedt-trace/for-next linus/master v5.16-rc5 next-20211216]
[cannot apply to trondmy-nfs/linux-next hnaz-mm/master]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/NeilBrown/Repair-SWAP-over-NFS/20211217-075659
base:   git://git.samba.org/sfrench/cifs-2.6.git for-next
config: arm-randconfig-r005-20211216 (https://download.01.org/0day-ci/archive/20211217/202112171822.DW1WPE1G-lkp@intel.com/config)
compiler: clang version 14.0.0 (https://github.com/llvm/llvm-project 9043c3d65b11b442226015acfbf8167684586cfa)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # install arm cross compiling tool for clang build
        # apt-get install binutils-arm-linux-gnueabi
        # https://github.com/0day-ci/linux/commit/6443c9d01129c1a1c19f3df4a594b01e3772e6bd
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review NeilBrown/Repair-SWAP-over-NFS/20211217-075659
        git checkout 6443c9d01129c1a1c19f3df4a594b01e3772e6bd
        # save the config file to linux build tree
        mkdir build_dir
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=arm SHELL=/bin/bash fs/nfs/

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

>> fs/nfs/file.c:512:8: error: implicit declaration of function 'add_swap_extent' [-Werror,-Wimplicit-function-declaration]
           ret = add_swap_extent(sis, 0, sis->max, 0);
                 ^
   1 error generated.


vim +/add_swap_extent +512 fs/nfs/file.c

   486	
   487	static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
   488							sector_t *span)
   489	{
   490		unsigned long blocks;
   491		long long isize;
   492		int ret;
   493		struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host);
   494		struct inode *inode = file->f_mapping->host;
   495	
   496		if (!file->f_mapping->a_ops->swap_rw)
   497			/* Cannot support swap */
   498			return -EINVAL;
   499	
   500		spin_lock(&inode->i_lock);
   501		blocks = inode->i_blocks;
   502		isize = inode->i_size;
   503		spin_unlock(&inode->i_lock);
   504		if (blocks*512 < isize) {
   505			pr_warn("swap activate: swapfile has holes\n");
   506			return -EINVAL;
   507		}
   508	
   509		ret = rpc_clnt_swap_activate(clnt);
   510		if (ret)
   511			return ret;
 > 512		ret = add_swap_extent(sis, 0, sis->max, 0);
   513		if (ret < 0) {
   514			rpc_clnt_swap_deactivate(clnt);
   515			return ret;
   516		}
   517		*span = sis->pages;
   518		sis->flags |= SWP_FS_OPS;
   519		return ret;
   520	}
   521	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
Christoph Hellwig Dec. 21, 2021, 8:34 a.m. UTC | #2
On Fri, Dec 17, 2021 at 10:48:22AM +1100, NeilBrown wrote:
> Linux primarily uses IO to block devices for swap, but can send the IO
> requests to a filesystem.  This has only ever worked for NFS, and that
> hasn't worked for a while due to a lack of testing.  This seems like a
> good time for some tidy-up before restoring swap-over-NFS functionality.

The changes look good to me, but I think this needs to be split into
separate, self-contained patches.

> 
> This patch:

Patch 1:

>  - updates the documentation (both copies!) for swap_activate which
>    is woefully out-of-date

Patch 2:

>  - drops the call to the filesystem for ->set_page_dirty().  These
>    pages do not belong to the filesystem, and it has no interest
>    in the dirty status.

Patch 3:


>  - move the responsibility for setting SWP_FS_OPS to ->swap_activate()
>    and also requires it to always call add_swap_extent().  This makes
>    it much easier to find filesystems that require SWP_FS_OPS.

Patch 4:

>  - introduces a new address_space operation "swap_rw" for swap IO.
>    The code currently used ->readpage for reads and ->direct_IO for
>    writes.  The former imposes a limit of one-page-at-a-time, the
>    later means that direct writes and swap writes are encouraged to
>    use the same path.  While similar, swap can often be simpler as
>    it can assume that no allocation is needed, and coherence with the
>    page cache is irrelevant.
diff mbox series

Patch

diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst
index d36fe79167b3..c2bb753bf688 100644
--- a/Documentation/filesystems/locking.rst
+++ b/Documentation/filesystems/locking.rst
@@ -265,8 +265,9 @@  prototypes::
 	int (*launder_page)(struct page *);
 	int (*is_partially_uptodate)(struct page *, unsigned long, unsigned long);
 	int (*error_remove_page)(struct address_space *, struct page *);
-	int (*swap_activate)(struct file *);
+	int (*swap_activate)(struct swap_info_struct *sis, struct file *f, sector_t *span)
 	int (*swap_deactivate)(struct file *);
+	int (*swap_rw)(struct kiocb *iocb, struct iov_iter *iter);
 
 locking rules:
 	All except set_page_dirty and freepage may block
@@ -295,6 +296,7 @@  is_partially_uptodate:	yes
 error_remove_page:	yes
 swap_activate:		no
 swap_deactivate:	no
+swap_rw:		yes, unlocks
 ======================	======================== =========	===============
 
 ->write_begin(), ->write_end() and ->readpage() may be called from
@@ -397,15 +399,19 @@  cleaned, or an error value if not. Note that in order to prevent the page
 getting mapped back in and redirtied, it needs to be kept locked
 across the entire operation.
 
-->swap_activate will be called with a non-zero argument on
-files backing (non block device backed) swapfiles. A return value
-of zero indicates success, in which case this file can be used for
-backing swapspace. The swapspace operations will be proxied to the
-address space operations.
+->swap_activate() will be called to prepare the given file for swap.  It
+should perform any validation and preparation necessary to ensure that
+writes can be performed with minimal memory allocation.  It should call
+add_swap_extent(), or the helper iomap_swapfile_activate(), and return
+the number of extents added.  If IO should be submitted through
+->swap_rw(), it should set SWP_FS_OPS, otherwise IO will be submitted
+directly to the block device ``sis->bdev``.
 
 ->swap_deactivate() will be called in the sys_swapoff()
 path after ->swap_activate() returned success.
 
+->swap_rw will be called for swap IO if ->swap_activate() set SWP_FS_OPS.
+
 file_lock_operations
 ====================
 
diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst
index bf5c48066fac..70d7ce335565 100644
--- a/Documentation/filesystems/vfs.rst
+++ b/Documentation/filesystems/vfs.rst
@@ -751,8 +751,9 @@  cache in your filesystem.  The following members are defined:
 					      unsigned long);
 		void (*is_dirty_writeback) (struct page *, bool *, bool *);
 		int (*error_remove_page) (struct mapping *mapping, struct page *page);
-		int (*swap_activate)(struct file *);
+		int (*swap_activate)(struct swap_info_struct *sis, struct file *f, sector_t *span)
 		int (*swap_deactivate)(struct file *);
+		int (*swap_rw)(struct kiocb *iocb, struct iov_iter *iter);
 	};
 
 ``writepage``
@@ -959,15 +960,21 @@  cache in your filesystem.  The following members are defined:
 	unless you have them locked or reference counts increased.
 
 ``swap_activate``
-	Called when swapon is used on a file to allocate space if
-	necessary and pin the block lookup information in memory.  A
-	return value of zero indicates success, in which case this file
-	can be used to back swapspace.
+
+	Called to prepare the given file for swap.  It should perform
+	any validation and preparation necessary to ensure that writes
+	can be performed with minimal memory allocation.  It should call
+	add_swap_extent(), or the helper iomap_swapfile_activate(), and
+	return the number of extents added.  If IO should be submitted
+	through ->swap_rw(), it should set SWP_FS_OPS, otherwise IO will
+	be submitted directly to the block device ``sis->bdev``.
 
 ``swap_deactivate``
 	Called during swapoff on files where swap_activate was
 	successful.
 
+``swap_rw``
+	Called to read or write swap pages when swap_activate() set SWP_FS_OPS.
 
 The File Object
 ===============
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 9fee3af83a73..50bebf5f15cc 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -4943,6 +4943,10 @@  static int cifs_swap_activate(struct swap_info_struct *sis,
 
 	cifs_dbg(FYI, "swap activate\n");
 
+	if (!swap_file->f_mapping->a_ops->swap_rw)
+		/* Cannot support swap */
+		return -EINVAL;
+
 	spin_lock(&inode->i_lock);
 	blocks = inode->i_blocks;
 	isize = inode->i_size;
@@ -4971,7 +4975,8 @@  static int cifs_swap_activate(struct swap_info_struct *sis,
 	 * from reading or writing the file
 	 */
 
-	return 0;
+	sis->flags |= SWP_FS_OPS;
+	return add_swap_extent(sis, 0, sis->max, 0);
 }
 
 static void cifs_swap_deactivate(struct file *file)
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 24e7dccce355..0d33c95eefb6 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -489,9 +489,14 @@  static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
 {
 	unsigned long blocks;
 	long long isize;
+	int ret;
 	struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host);
 	struct inode *inode = file->f_mapping->host;
 
+	if (!file->f_mapping->a_ops->swap_rw)
+		/* Cannot support swap */
+		return -EINVAL;
+
 	spin_lock(&inode->i_lock);
 	blocks = inode->i_blocks;
 	isize = inode->i_size;
@@ -501,9 +506,17 @@  static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
 		return -EINVAL;
 	}
 
+	ret = rpc_clnt_swap_activate(clnt);
+	if (ret)
+		return ret;
+	ret = add_swap_extent(sis, 0, sis->max, 0);
+	if (ret < 0) {
+		rpc_clnt_swap_deactivate(clnt);
+		return ret;
+	}
 	*span = sis->pages;
-
-	return rpc_clnt_swap_activate(clnt);
+	sis->flags |= SWP_FS_OPS;
+	return ret;
 }
 
 static void nfs_swap_deactivate(struct file *file)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index bbf812ce89a8..deaaf359cc49 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -415,6 +415,7 @@  struct address_space_operations {
 	int (*swap_activate)(struct swap_info_struct *sis, struct file *file,
 				sector_t *span);
 	void (*swap_deactivate)(struct file *file);
+	int (*swap_rw)(struct kiocb *iocb, struct iov_iter *iter);
 };
 
 extern const struct address_space_operations empty_aops;
diff --git a/include/linux/swap.h b/include/linux/swap.h
index d1ea44b31f19..10b2a92c1aa1 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -427,7 +427,6 @@  extern int swap_writepage(struct page *page, struct writeback_control *wbc);
 extern void end_swap_bio_write(struct bio *bio);
 extern int __swap_writepage(struct page *page, struct writeback_control *wbc,
 	bio_end_io_t end_write_func);
-extern int swap_set_page_dirty(struct page *page);
 
 int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
 		unsigned long nr_pages, sector_t start_block);
diff --git a/mm/page_io.c b/mm/page_io.c
index 9725c7e1eeea..cb617a4f59df 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -307,10 +307,9 @@  int __swap_writepage(struct page *page, struct writeback_control *wbc,
 
 		set_page_writeback(page);
 		unlock_page(page);
-		ret = mapping->a_ops->direct_IO(&kiocb, &from);
-		if (ret == PAGE_SIZE) {
+		ret = mapping->a_ops->swap_rw(&kiocb, &from);
+		if (ret == 0) {
 			count_vm_event(PSWPOUT);
-			ret = 0;
 		} else {
 			/*
 			 * In the case of swap-over-nfs, this can be a
@@ -378,10 +377,11 @@  int swap_readpage(struct page *page, bool synchronous)
 	}
 
 	if (data_race(sis->flags & SWP_FS_OPS)) {
-		struct file *swap_file = sis->swap_file;
-		struct address_space *mapping = swap_file->f_mapping;
+		//struct file *swap_file = sis->swap_file;
+		//struct address_space *mapping = swap_file->f_mapping;
 
-		ret = mapping->a_ops->readpage(swap_file, page);
+		/* This needs to use ->swap_rw() */
+		ret = -EINVAL;
 		if (!ret)
 			count_vm_event(PSWPIN);
 		goto out;
@@ -434,17 +434,3 @@  int swap_readpage(struct page *page, bool synchronous)
 	psi_memstall_leave(&pflags);
 	return ret;
 }
-
-int swap_set_page_dirty(struct page *page)
-{
-	struct swap_info_struct *sis = page_swap_info(page);
-
-	if (data_race(sis->flags & SWP_FS_OPS)) {
-		struct address_space *mapping = sis->swap_file->f_mapping;
-
-		VM_BUG_ON_PAGE(!PageSwapCache(page), page);
-		return mapping->a_ops->set_page_dirty(page);
-	} else {
-		return __set_page_dirty_no_writeback(page);
-	}
-}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 8d4104242100..616eb1d75b35 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -30,7 +30,7 @@ 
  */
 static const struct address_space_operations swap_aops = {
 	.writepage	= swap_writepage,
-	.set_page_dirty	= swap_set_page_dirty,
+	.set_page_dirty	= __set_page_dirty_no_writeback,
 #ifdef CONFIG_MIGRATION
 	.migratepage	= migrate_page,
 #endif
diff --git a/mm/swapfile.c b/mm/swapfile.c
index e59e08ef46e1..419eacf474c5 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2397,13 +2397,9 @@  static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
 
 	if (mapping->a_ops->swap_activate) {
 		ret = mapping->a_ops->swap_activate(sis, swap_file, span);
-		if (ret >= 0)
-			sis->flags |= SWP_ACTIVATED;
-		if (!ret) {
-			sis->flags |= SWP_FS_OPS;
-			ret = add_swap_extent(sis, 0, sis->max, 0);
-			*span = sis->pages;
-		}
+		if (ret < 0)
+			return ret;
+		sis->flags |= SWP_ACTIVATED;
 		return ret;
 	}