
[v2,9/9] mm/swap, shmem: use new swapin helper to skip readahead conditionally

Message ID 20240102175338.62012-10-ryncsn@gmail.com (mailing list archive)
State New
Series swapin refactor for optimization and unified readahead

Commit Message

Kairui Song Jan. 2, 2024, 5:53 p.m. UTC
From: Kairui Song <kasong@tencent.com>

Currently, shmem uses cluster readahead for all swap backends. Cluster
readahead is not a good solution for ramdisk-based devices (e.g. ZRAM) at all.

After switching to the new helper, most benchmarks showed a good result:

- Single file sequential read:
  perf stat --repeat 20 dd if=/tmpfs/test of=/dev/null bs=1M count=8192
  (/tmpfs/test is a zero filled file, using brd as swap, 4G memcg limit)
  Before: 22.248 +- 0.549
  After:  22.021 +- 0.684 (-1.1%)

- Random read stress test:
  fio -name=tmpfs --numjobs=16 --directory=/tmpfs \
  --size=256m --ioengine=mmap --rw=randread --random_distribution=random \
  --time_based --ramp_time=1m --runtime=5m --group_reporting
  (using brd as swap, 2G memcg limit)

  Before: 1818MiB/s
  After:  1888MiB/s (+3.85%)

- Zipf biased random read stress test:
  fio -name=tmpfs --numjobs=16 --directory=/tmpfs \
  --size=256m --ioengine=mmap --rw=randread --random_distribution=zipf:1.2 \
  --time_based --ramp_time=1m --runtime=5m --group_reporting
  (using brd as swap, 2G memcg limit)

  Before: 31.1GiB/s
  After:  32.3GiB/s (+3.86%)

So cluster readahead doesn't help much even for single-file sequential
read, and for the random stress tests, performance is better without it.

Considering that both memory and the swap device will slowly become
more fragmented, and that the commonly used ZRAM consumes much more
CPU than a plain ramdisk, false readahead could occur more frequently
and waste more CPU. Direct swapin is cheaper, so use the new helper
and skip readahead for SWP_SYNCHRONOUS_IO devices.

Signed-off-by: Kairui Song <kasong@tencent.com>
---
 mm/shmem.c      | 67 +++++++++++++++++++++++++------------------------
 mm/swap.h       |  9 -------
 mm/swap_state.c | 11 ++++++--
 3 files changed, 43 insertions(+), 44 deletions(-)
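
For reference, the benchmark setup described in the commit message can be
reproduced roughly as follows; a sketch only, where the brd size, module
parameters and cgroup paths are illustrative assumptions, not taken from
this series:

    # Create a ramdisk and use it as swap (rd_size is in KiB; 16 GiB here)
    modprobe brd rd_nr=1 rd_size=16777216
    mkswap /dev/ram0
    swapon /dev/ram0
    # Mount a tmpfs and run the test inside a memcg with a hard limit
    mkdir -p /tmpfs
    mount -t tmpfs tmpfs /tmpfs
    mkdir /sys/fs/cgroup/test
    echo 4G > /sys/fs/cgroup/test/memory.max   # 2G for the fio runs
    echo $$ > /sys/fs/cgroup/test/cgroup.procs
    # Zero-filled test file used by the dd benchmark
    dd if=/dev/zero of=/tmpfs/test bs=1M count=8192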

Comments

kernel test robot Jan. 3, 2024, 11:56 a.m. UTC | #1
Hi Kairui,

kernel test robot noticed the following build errors:

[auto build test ERROR on akpm-mm/mm-everything]
[also build test ERROR on next-20240103]
[cannot apply to linus/master v6.7-rc8]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Kairui-Song/mm-swapfile-c-add-back-some-comment/20240103-015650
base:   https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link:    https://lore.kernel.org/r/20240102175338.62012-10-ryncsn%40gmail.com
patch subject: [PATCH v2 9/9] mm/swap, shmem: use new swapin helper to skip readahead conditionally
config: arm-randconfig-004-20240103 (https://download.01.org/0day-ci/archive/20240103/202401031953.WBMZtIe7-lkp@intel.com/config)
compiler: clang version 18.0.0git (https://github.com/llvm/llvm-project baf8a39aaf8b61a38b5b2b5591deb765e42eb00b)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240103/202401031953.WBMZtIe7-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202401031953.WBMZtIe7-lkp@intel.com/

All errors (new ones prefixed by >>):

>> mm/shmem.c:1864:8: error: incompatible pointer types assigning to 'struct folio *' from 'struct page *' [-Werror,-Wincompatible-pointer-types]
    1864 |         folio = swapin_entry_mpol(swap, gfp, mpol, ilx, &cache_result);
         |               ^ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   1 error generated.
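
The error indicates that, in this configuration, swapin_entry_mpol() is
declared to return struct page * while the caller expects a struct folio *.
As a purely hypothetical illustration of reconciling the types at the call
site, assuming the helper keeps returning struct page * (the series itself
would more likely fix the helper's declaration instead):

	struct page *page;

	mpol = shmem_get_pgoff_policy(info, index, 0, &ilx);
	page = swapin_entry_mpol(swap, gfp, mpol, ilx, &cache_result);
	mpol_cond_put(mpol);
	/* page_folio() maps a struct page to its containing folio */
	folio = page ? page_folio(page) : NULL;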


vim +1864 mm/shmem.c

  1826	
  1827	/*
  1828	 * Swap in the folio pointed to by *foliop.
  1829	 * Caller has to make sure that *foliop contains a valid swapped folio.
  1830	 * Returns 0 and the folio in foliop if success. On failure, returns the
  1831	 * error code and NULL in *foliop.
  1832	 */
  1833	static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
  1834				     struct folio **foliop, enum sgp_type sgp,
  1835				     gfp_t gfp, struct mm_struct *fault_mm,
  1836				     vm_fault_t *fault_type)
  1837	{
  1838		struct address_space *mapping = inode->i_mapping;
  1839		struct shmem_inode_info *info = SHMEM_I(inode);
  1840		enum swap_cache_result cache_result;
  1841		struct swap_info_struct *si;
  1842		struct folio *folio = NULL;
  1843		struct mempolicy *mpol;
  1844		swp_entry_t swap;
  1845		pgoff_t ilx;
  1846		int error;
  1847	
  1848		VM_BUG_ON(!*foliop || !xa_is_value(*foliop));
  1849		swap = radix_to_swp_entry(*foliop);
  1850		*foliop = NULL;
  1851	
  1852		if (is_poisoned_swp_entry(swap))
  1853			return -EIO;
  1854	
  1855		si = get_swap_device(swap);
  1856		if (!si) {
  1857			if (!shmem_confirm_swap(mapping, index, swap))
  1858				return -EEXIST;
  1859			else
  1860				return -EINVAL;
  1861		}
  1862	
  1863		mpol = shmem_get_pgoff_policy(info, index, 0, &ilx);
> 1864		folio = swapin_entry_mpol(swap, gfp, mpol, ilx, &cache_result);
  1865		mpol_cond_put(mpol);
  1866	
  1867		if (!folio) {
  1868			error = -ENOMEM;
  1869			goto failed;
  1870		}
  1871		if (cache_result != SWAP_CACHE_HIT) {
  1872			if (fault_type) {
  1873				*fault_type |= VM_FAULT_MAJOR;
  1874				count_vm_event(PGMAJFAULT);
  1875				count_memcg_event_mm(fault_mm, PGMAJFAULT);
  1876			}
  1877		}
  1878	
  1879		/* We have to do this with folio locked to prevent races */
  1880		folio_lock(folio);
  1881		if (cache_result != SWAP_CACHE_BYPASS) {
  1882			/* With cache bypass, folio is new allocated, sync, and not in cache */
  1883			if (!folio_test_swapcache(folio) || folio->swap.val != swap.val) {
  1884				error = -EEXIST;
  1885				goto unlock;
  1886			}
  1887			if (!folio_test_uptodate(folio)) {
  1888				error = -EIO;
  1889				goto failed;
  1890			}
  1891			folio_wait_writeback(folio);
  1892		}
  1893		if (!shmem_confirm_swap(mapping, index, swap)) {
  1894			error = -EEXIST;
  1895			goto unlock;
  1896		}
  1897	
  1898		/*
  1899		 * Some architectures may have to restore extra metadata to the
  1900		 * folio after reading from swap.
  1901		 */
  1902		arch_swap_restore(swap, folio);
  1903	
  1904		/* With cache bypass, folio is new allocated and always respect gfp flags */
  1905		if (cache_result != SWAP_CACHE_BYPASS && shmem_should_replace_folio(folio, gfp)) {
  1906			error = shmem_replace_folio(&folio, gfp, info, index);
  1907			if (error)
  1908				goto failed;
  1909		}
  1910	
  1911		/*
  1912		 * The expected value checking below should be enough to ensure
  1913		 * only one up-to-date swapin success. swap_free() is called after
  1914		 * this, so the entry can't be reused. As long as the mapping still
  1915		 * has the old entry value, it's never swapped in or modified.
  1916		 */
  1917		error = shmem_add_to_page_cache(folio, mapping, index,
  1918						swp_to_radix_entry(swap), gfp);
  1919		if (error)
  1920			goto failed;
  1921	
  1922		shmem_recalc_inode(inode, 0, -1);
  1923	
  1924		if (sgp == SGP_WRITE)
  1925			folio_mark_accessed(folio);
  1926	
  1927		if (cache_result != SWAP_CACHE_BYPASS)
  1928			delete_from_swap_cache(folio);
  1929		folio_mark_dirty(folio);
  1930		swap_free(swap);
  1931		put_swap_device(si);
  1932	
  1933		*foliop = folio;
  1934		return 0;
  1935	failed:
  1936		if (!shmem_confirm_swap(mapping, index, swap))
  1937			error = -EEXIST;
  1938		if (error == -EIO)
  1939			shmem_set_folio_swapin_error(inode, index, folio, swap);
  1940	unlock:
  1941		if (folio) {
  1942			folio_unlock(folio);
  1943			folio_put(folio);
  1944		}
  1945		put_swap_device(si);
  1946	
  1947		return error;
  1948	}
  1949
kernel test robot Jan. 3, 2024, 1:45 p.m. UTC | #2
Hi Kairui,

kernel test robot noticed the following build errors:

[auto build test ERROR on akpm-mm/mm-everything]
[also build test ERROR on next-20240103]
[cannot apply to linus/master v6.7-rc8]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Kairui-Song/mm-swapfile-c-add-back-some-comment/20240103-015650
base:   https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link:    https://lore.kernel.org/r/20240102175338.62012-10-ryncsn%40gmail.com
patch subject: [PATCH v2 9/9] mm/swap, shmem: use new swapin helper to skip readahead conditionally
config: i386-randconfig-014-20240103 (https://download.01.org/0day-ci/archive/20240103/202401032120.uWKybc56-lkp@intel.com/config)
compiler: gcc-7 (Ubuntu 7.5.0-6ubuntu2) 7.5.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240103/202401032120.uWKybc56-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202401032120.uWKybc56-lkp@intel.com/

All errors (new ones prefixed by >>):

   mm/shmem.c: In function 'shmem_swapin_folio':
>> mm/shmem.c:1864:8: error: assignment from incompatible pointer type [-Werror=incompatible-pointer-types]
     folio = swapin_entry_mpol(swap, gfp, mpol, ilx, &cache_result);
           ^
   cc1: some warnings being treated as errors


vim +1864 mm/shmem.c

  1826	
  1827	/*
  1828	 * Swap in the folio pointed to by *foliop.
  1829	 * Caller has to make sure that *foliop contains a valid swapped folio.
  1830	 * Returns 0 and the folio in foliop if success. On failure, returns the
  1831	 * error code and NULL in *foliop.
  1832	 */
  1833	static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
  1834				     struct folio **foliop, enum sgp_type sgp,
  1835				     gfp_t gfp, struct mm_struct *fault_mm,
  1836				     vm_fault_t *fault_type)
  1837	{
  1838		struct address_space *mapping = inode->i_mapping;
  1839		struct shmem_inode_info *info = SHMEM_I(inode);
  1840		enum swap_cache_result cache_result;
  1841		struct swap_info_struct *si;
  1842		struct folio *folio = NULL;
  1843		struct mempolicy *mpol;
  1844		swp_entry_t swap;
  1845		pgoff_t ilx;
  1846		int error;
  1847	
  1848		VM_BUG_ON(!*foliop || !xa_is_value(*foliop));
  1849		swap = radix_to_swp_entry(*foliop);
  1850		*foliop = NULL;
  1851	
  1852		if (is_poisoned_swp_entry(swap))
  1853			return -EIO;
  1854	
  1855		si = get_swap_device(swap);
  1856		if (!si) {
  1857			if (!shmem_confirm_swap(mapping, index, swap))
  1858				return -EEXIST;
  1859			else
  1860				return -EINVAL;
  1861		}
  1862	
  1863		mpol = shmem_get_pgoff_policy(info, index, 0, &ilx);
> 1864		folio = swapin_entry_mpol(swap, gfp, mpol, ilx, &cache_result);
  1865		mpol_cond_put(mpol);
  1866	
  1867		if (!folio) {
  1868			error = -ENOMEM;
  1869			goto failed;
  1870		}
  1871		if (cache_result != SWAP_CACHE_HIT) {
  1872			if (fault_type) {
  1873				*fault_type |= VM_FAULT_MAJOR;
  1874				count_vm_event(PGMAJFAULT);
  1875				count_memcg_event_mm(fault_mm, PGMAJFAULT);
  1876			}
  1877		}
  1878	
  1879		/* We have to do this with folio locked to prevent races */
  1880		folio_lock(folio);
  1881		if (cache_result != SWAP_CACHE_BYPASS) {
  1882			/* With cache bypass, folio is new allocated, sync, and not in cache */
  1883			if (!folio_test_swapcache(folio) || folio->swap.val != swap.val) {
  1884				error = -EEXIST;
  1885				goto unlock;
  1886			}
  1887			if (!folio_test_uptodate(folio)) {
  1888				error = -EIO;
  1889				goto failed;
  1890			}
  1891			folio_wait_writeback(folio);
  1892		}
  1893		if (!shmem_confirm_swap(mapping, index, swap)) {
  1894			error = -EEXIST;
  1895			goto unlock;
  1896		}
  1897	
  1898		/*
  1899		 * Some architectures may have to restore extra metadata to the
  1900		 * folio after reading from swap.
  1901		 */
  1902		arch_swap_restore(swap, folio);
  1903	
  1904		/* With cache bypass, folio is new allocated and always respect gfp flags */
  1905		if (cache_result != SWAP_CACHE_BYPASS && shmem_should_replace_folio(folio, gfp)) {
  1906			error = shmem_replace_folio(&folio, gfp, info, index);
  1907			if (error)
  1908				goto failed;
  1909		}
  1910	
  1911		/*
  1912		 * The expected value checking below should be enough to ensure
  1913		 * only one up-to-date swapin success. swap_free() is called after
  1914		 * this, so the entry can't be reused. As long as the mapping still
  1915		 * has the old entry value, it's never swapped in or modified.
  1916		 */
  1917		error = shmem_add_to_page_cache(folio, mapping, index,
  1918						swp_to_radix_entry(swap), gfp);
  1919		if (error)
  1920			goto failed;
  1921	
  1922		shmem_recalc_inode(inode, 0, -1);
  1923	
  1924		if (sgp == SGP_WRITE)
  1925			folio_mark_accessed(folio);
  1926	
  1927		if (cache_result != SWAP_CACHE_BYPASS)
  1928			delete_from_swap_cache(folio);
  1929		folio_mark_dirty(folio);
  1930		swap_free(swap);
  1931		put_swap_device(si);
  1932	
  1933		*foliop = folio;
  1934		return 0;
  1935	failed:
  1936		if (!shmem_confirm_swap(mapping, index, swap))
  1937			error = -EEXIST;
  1938		if (error == -EIO)
  1939			shmem_set_folio_swapin_error(inode, index, folio, swap);
  1940	unlock:
  1941		if (folio) {
  1942			folio_unlock(folio);
  1943			folio_put(folio);
  1944		}
  1945		put_swap_device(si);
  1946	
  1947		return error;
  1948	}
  1949
Huang, Ying Jan. 9, 2024, 2:03 a.m. UTC | #3
Kairui Song <ryncsn@gmail.com> writes:

> From: Kairui Song <kasong@tencent.com>
>
> Currently, shmem uses cluster readahead for all swap backends. Cluster
> readahead is not a good solution for ramdisk-based devices (e.g. ZRAM) at all.
>
> After switching to the new helper, most benchmarks showed a good result:
>
> - Single file sequential read:
>   perf stat --repeat 20 dd if=/tmpfs/test of=/dev/null bs=1M count=8192
>   (/tmpfs/test is a zero filled file, using brd as swap, 4G memcg limit)
>   Before: 22.248 +- 0.549
>   After:  22.021 +- 0.684 (-1.1%)
>
> - Random read stress test:
>   fio -name=tmpfs --numjobs=16 --directory=/tmpfs \
>   --size=256m --ioengine=mmap --rw=randread --random_distribution=random \
>   --time_based --ramp_time=1m --runtime=5m --group_reporting
>   (using brd as swap, 2G memcg limit)
>
>   Before: 1818MiB/s
>   After:  1888MiB/s (+3.85%)
>
> - Zipf biased random read stress test:
>   fio -name=tmpfs --numjobs=16 --directory=/tmpfs \
>   --size=256m --ioengine=mmap --rw=randread --random_distribution=zipf:1.2 \
>   --time_based --ramp_time=1m --runtime=5m --group_reporting
>   (using brd as swap, 2G memcg limit)
>
>   Before: 31.1GiB/s
>   After:  32.3GiB/s (+3.86%)
>
> So cluster readahead doesn't help much even for single-file sequential
> read, and for the random stress tests, performance is better without it.
>
> Considering that both memory and the swap device will slowly become
> more fragmented, and that the commonly used ZRAM consumes much more
> CPU than a plain ramdisk, false readahead could occur more frequently
> and waste more CPU. Direct swapin is cheaper, so use the new helper
> and skip readahead for SWP_SYNCHRONOUS_IO devices.

It's good to take advantage of swap_direct (no readahead).  I also hope
we can take advantage of VMA based swapin if shmem is accessed via mmap.
That appears possible.

--
Best Regards,
Huang, Ying

> Signed-off-by: Kairui Song <kasong@tencent.com>
> ---
>  mm/shmem.c      | 67 +++++++++++++++++++++++++------------------------
>  mm/swap.h       |  9 -------
>  mm/swap_state.c | 11 ++++++--
>  3 files changed, 43 insertions(+), 44 deletions(-)
>
> diff --git a/mm/shmem.c b/mm/shmem.c
> index 9da9f7a0e620..3c0729fe934d 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -1564,20 +1564,6 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
>  static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
>  			pgoff_t index, unsigned int order, pgoff_t *ilx);
>  
> -static struct folio *shmem_swapin_cluster(swp_entry_t swap, gfp_t gfp,
> -			struct shmem_inode_info *info, pgoff_t index)
> -{
> -	struct mempolicy *mpol;
> -	pgoff_t ilx;
> -	struct folio *folio;
> -
> -	mpol = shmem_get_pgoff_policy(info, index, 0, &ilx);
> -	folio = swap_cluster_readahead(swap, gfp, mpol, ilx);
> -	mpol_cond_put(mpol);
> -
> -	return folio;
> -}
> -
>  /*
>   * Make sure huge_gfp is always more limited than limit_gfp.
>   * Some of the flags set permissions, while others set limitations.
> @@ -1851,9 +1837,12 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
>  {
>  	struct address_space *mapping = inode->i_mapping;
>  	struct shmem_inode_info *info = SHMEM_I(inode);
> +	enum swap_cache_result cache_result;
>  	struct swap_info_struct *si;
>  	struct folio *folio = NULL;
> +	struct mempolicy *mpol;
>  	swp_entry_t swap;
> +	pgoff_t ilx;
>  	int error;
>  
>  	VM_BUG_ON(!*foliop || !xa_is_value(*foliop));
> @@ -1871,36 +1860,40 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
>  			return -EINVAL;
>  	}
>  
> -	/* Look it up and read it in.. */
> -	folio = swap_cache_get_folio(swap, NULL, 0, NULL);
> +	mpol = shmem_get_pgoff_policy(info, index, 0, &ilx);
> +	folio = swapin_entry_mpol(swap, gfp, mpol, ilx, &cache_result);
> +	mpol_cond_put(mpol);
> +
>  	if (!folio) {
> -		/* Or update major stats only when swapin succeeds?? */
> +		error = -ENOMEM;
> +		goto failed;
> +	}
> +	if (cache_result != SWAP_CACHE_HIT) {
>  		if (fault_type) {
>  			*fault_type |= VM_FAULT_MAJOR;
>  			count_vm_event(PGMAJFAULT);
>  			count_memcg_event_mm(fault_mm, PGMAJFAULT);
>  		}
> -		/* Here we actually start the io */
> -		folio = shmem_swapin_cluster(swap, gfp, info, index);
> -		if (!folio) {
> -			error = -ENOMEM;
> -			goto failed;
> -		}
>  	}
>  
>  	/* We have to do this with folio locked to prevent races */
>  	folio_lock(folio);
> -	if (!folio_test_swapcache(folio) ||
> -	    folio->swap.val != swap.val ||
> -	    !shmem_confirm_swap(mapping, index, swap)) {
> +	if (cache_result != SWAP_CACHE_BYPASS) {
> +		/* With cache bypass, folio is new allocated, sync, and not in cache */
> +		if (!folio_test_swapcache(folio) || folio->swap.val != swap.val) {
> +			error = -EEXIST;
> +			goto unlock;
> +		}
> +		if (!folio_test_uptodate(folio)) {
> +			error = -EIO;
> +			goto failed;
> +		}
> +		folio_wait_writeback(folio);
> +	}
> +	if (!shmem_confirm_swap(mapping, index, swap)) {
>  		error = -EEXIST;
>  		goto unlock;
>  	}
> -	if (!folio_test_uptodate(folio)) {
> -		error = -EIO;
> -		goto failed;
> -	}
> -	folio_wait_writeback(folio);
>  
>  	/*
>  	 * Some architectures may have to restore extra metadata to the
> @@ -1908,12 +1901,19 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
>  	 */
>  	arch_swap_restore(swap, folio);
>  
> -	if (shmem_should_replace_folio(folio, gfp)) {
> +	/* With cache bypass, folio is new allocated and always respect gfp flags */
> +	if (cache_result != SWAP_CACHE_BYPASS && shmem_should_replace_folio(folio, gfp)) {
>  		error = shmem_replace_folio(&folio, gfp, info, index);
>  		if (error)
>  			goto failed;
>  	}
>  
> +	/*
> +	 * The expected value checking below should be enough to ensure
> +	 * only one up-to-date swapin success. swap_free() is called after
> +	 * this, so the entry can't be reused. As long as the mapping still
> +	 * has the old entry value, it's never swapped in or modified.
> +	 */
>  	error = shmem_add_to_page_cache(folio, mapping, index,
>  					swp_to_radix_entry(swap), gfp);
>  	if (error)
> @@ -1924,7 +1924,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
>  	if (sgp == SGP_WRITE)
>  		folio_mark_accessed(folio);
>  
> -	delete_from_swap_cache(folio);
> +	if (cache_result != SWAP_CACHE_BYPASS)
> +		delete_from_swap_cache(folio);
>  	folio_mark_dirty(folio);
>  	swap_free(swap);
>  	put_swap_device(si);
> diff --git a/mm/swap.h b/mm/swap.h
> index 8f790a67b948..20f4048c971c 100644
> --- a/mm/swap.h
> +++ b/mm/swap.h
> @@ -57,9 +57,6 @@ void __delete_from_swap_cache(struct folio *folio,
>  void delete_from_swap_cache(struct folio *folio);
>  void clear_shadow_from_swap_cache(int type, unsigned long begin,
>  				  unsigned long end);
> -struct folio *swap_cache_get_folio(swp_entry_t entry,
> -		struct vm_area_struct *vma, unsigned long addr,
> -		void **shadowp);
>  struct folio *filemap_get_incore_folio(struct address_space *mapping,
>  		pgoff_t index);
>  
> @@ -123,12 +120,6 @@ static inline int swap_writepage(struct page *p, struct writeback_control *wbc)
>  	return 0;
>  }
>  
> -static inline struct folio *swap_cache_get_folio(swp_entry_t entry,
> -		struct vm_area_struct *vma, unsigned long addr)
> -{
> -	return NULL;
> -}
> -
>  static inline
>  struct folio *filemap_get_incore_folio(struct address_space *mapping,
>  		pgoff_t index)
> diff --git a/mm/swap_state.c b/mm/swap_state.c
> index 3edf4b63158d..10eec68475dd 100644
> --- a/mm/swap_state.c
> +++ b/mm/swap_state.c
> @@ -318,7 +318,14 @@ void free_pages_and_swap_cache(struct encoded_page **pages, int nr)
>  
>  static inline bool swap_use_no_readahead(struct swap_info_struct *si, swp_entry_t entry)
>  {
> -	return data_race(si->flags & SWP_SYNCHRONOUS_IO) && __swap_count(entry) == 1;
> +	int count;
> +
> +	if (!data_race(si->flags & SWP_SYNCHRONOUS_IO))
> +		return false;
> +
> +	count = __swap_count(entry);
> +
> +	return (count == 1 || count == SWAP_MAP_SHMEM);
>  }
>  
>  static inline bool swap_use_vma_readahead(void)
> @@ -334,7 +341,7 @@ static inline bool swap_use_vma_readahead(void)
>   *
>   * Caller must lock the swap device or hold a reference to keep it valid.
>   */
> -struct folio *swap_cache_get_folio(swp_entry_t entry,
> +static struct folio *swap_cache_get_folio(swp_entry_t entry,
>  		struct vm_area_struct *vma, unsigned long addr, void **shadowp)
>  {
>  	struct folio *folio;
Kairui Song Jan. 10, 2024, 3:35 a.m. UTC | #4
On Tue, Jan 9, 2024 at 10:05 AM Huang, Ying <ying.huang@intel.com> wrote:
>
> Kairui Song <ryncsn@gmail.com> writes:
>
> > From: Kairui Song <kasong@tencent.com>
> >
> > Currently, shmem uses cluster readahead for all swap backends. Cluster
> > readahead is not a good solution for ramdisk-based devices (e.g. ZRAM) at all.
> >
> > After switching to the new helper, most benchmarks showed a good result:
> >
> > - Single file sequential read:
> >   perf stat --repeat 20 dd if=/tmpfs/test of=/dev/null bs=1M count=8192
> >   (/tmpfs/test is a zero filled file, using brd as swap, 4G memcg limit)
> >   Before: 22.248 +- 0.549
> >   After:  22.021 +- 0.684 (-1.1%)
> >
> > - Random read stress test:
> >   fio -name=tmpfs --numjobs=16 --directory=/tmpfs \
> >   --size=256m --ioengine=mmap --rw=randread --random_distribution=random \
> >   --time_based --ramp_time=1m --runtime=5m --group_reporting
> >   (using brd as swap, 2G memcg limit)
> >
> >   Before: 1818MiB/s
> >   After:  1888MiB/s (+3.85%)
> >
> > - Zipf biased random read stress test:
> >   fio -name=tmpfs --numjobs=16 --directory=/tmpfs \
> >   --size=256m --ioengine=mmap --rw=randread --random_distribution=zipf:1.2 \
> >   --time_based --ramp_time=1m --runtime=5m --group_reporting
> >   (using brd as swap, 2G memcg limit)
> >
> >   Before: 31.1GiB/s
> >   After:  32.3GiB/s (+3.86%)
> >
> > So cluster readahead doesn't help much even for single-file sequential
> > read, and for the random stress tests, performance is better without it.
> >
> > Considering that both memory and the swap device will slowly become
> > more fragmented, and that the commonly used ZRAM consumes much more
> > CPU than a plain ramdisk, false readahead could occur more frequently
> > and waste more CPU. Direct swapin is cheaper, so use the new helper
> > and skip readahead for SWP_SYNCHRONOUS_IO devices.
>
> It's good to take advantage of swap_direct (no readahead).  I also hope
> we can take advantage of VMA based swapin if shmem is accessed via mmap.
> That appears possible.

Good idea, that should be doable, will update the series.
Kairui Song Jan. 30, 2024, 12:39 a.m. UTC | #5
On Wed, Jan 10, 2024 at 11:35 AM Kairui Song <ryncsn@gmail.com> wrote:
>
> On Tue, Jan 9, 2024 at 10:05 AM Huang, Ying <ying.huang@intel.com> wrote:
> >
> > Kairui Song <ryncsn@gmail.com> writes:
> >
> > > From: Kairui Song <kasong@tencent.com>
> > >
> > > Currently, shmem uses cluster readahead for all swap backends. Cluster
> > > readahead is not a good solution for ramdisk-based devices (e.g. ZRAM) at all.
> > >
> > > After switching to the new helper, most benchmarks showed a good result:
> > >
> > > - Single file sequential read:
> > >   perf stat --repeat 20 dd if=/tmpfs/test of=/dev/null bs=1M count=8192
> > >   (/tmpfs/test is a zero filled file, using brd as swap, 4G memcg limit)
> > >   Before: 22.248 +- 0.549
> > >   After:  22.021 +- 0.684 (-1.1%)
> > >
> > > - Random read stress test:
> > >   fio -name=tmpfs --numjobs=16 --directory=/tmpfs \
> > >   --size=256m --ioengine=mmap --rw=randread --random_distribution=random \
> > >   --time_based --ramp_time=1m --runtime=5m --group_reporting
> > >   (using brd as swap, 2G memcg limit)
> > >
> > >   Before: 1818MiB/s
> > >   After:  1888MiB/s (+3.85%)
> > >
> > > - Zipf biased random read stress test:
> > >   fio -name=tmpfs --numjobs=16 --directory=/tmpfs \
> > >   --size=256m --ioengine=mmap --rw=randread --random_distribution=zipf:1.2 \
> > >   --time_based --ramp_time=1m --runtime=5m --group_reporting
> > >   (using brd as swap, 2G memcg limit)
> > >
> > >   Before: 31.1GiB/s
> > >   After:  32.3GiB/s (+3.86%)
> > >
> > > So cluster readahead doesn't help much even for single-file sequential
> > > read, and for the random stress tests, performance is better without it.
> > >
> > > Considering that both memory and the swap device will slowly become
> > > more fragmented, and that the commonly used ZRAM consumes much more
> > > CPU than a plain ramdisk, false readahead could occur more frequently
> > > and waste more CPU. Direct swapin is cheaper, so use the new helper
> > > and skip readahead for SWP_SYNCHRONOUS_IO devices.
> >
> > It's good to take advantage of swap_direct (no readahead).  I also hope
> > we can take advantage of VMA based swapin if shmem is accessed via mmap.
> > That appears possible.
>
> Good idea, that should be doable, will update the series.

Hi Ying,

Turns out it's quite complex to do VMA-based swapin readahead for shmem:
the VMA address space / page tables don't contain swap entries for shmem.
For anon pages, simply reading the nearby page table entries is easy and
good enough, but for shmem the entries are stored in the inode mapping, so
the readahead needs to walk the inode mapping instead. That's doable but
requires more work to make it actually usable. I've sent V3 without this
feature; this readahead extension is worth another series.
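
To illustrate what walking the inode mapping would involve, here is a
rough, purely hypothetical sketch (function name, window handling and
locking details are illustrative assumptions, not code from this series)
of scanning a shmem mapping for nearby swap entries with the XArray API:

	/*
	 * Hypothetical sketch: collect swapped-out entries around @index of
	 * a shmem mapping as readahead candidates. Real code would also need
	 * to handle races with shmem_add_to_page_cache(), large folios, and
	 * readahead window heuristics.
	 */
	static unsigned int shmem_scan_swap_entries(struct address_space *mapping,
						    pgoff_t index, unsigned int nr,
						    swp_entry_t *entries,
						    unsigned int max)
	{
		XA_STATE(xas, &mapping->i_pages, index);
		unsigned int found = 0;
		void *entry;

		rcu_read_lock();
		xas_for_each(&xas, entry, index + nr - 1) {
			if (xas_retry(&xas, entry))
				continue;
			/* Swapped-out shmem pages are stored as value entries */
			if (!xa_is_value(entry))
				continue;
			entries[found++] = radix_to_swp_entry(entry);
			if (found == max)
				break;
		}
		rcu_read_unlock();

		return found;
	}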
Huang, Ying Jan. 30, 2024, 2:01 a.m. UTC | #6
Kairui Song <ryncsn@gmail.com> writes:

> On Wed, Jan 10, 2024 at 11:35 AM Kairui Song <ryncsn@gmail.com> wrote:
>>
>> On Tue, Jan 9, 2024 at 10:05 AM Huang, Ying <ying.huang@intel.com> wrote:
>> >
>> > Kairui Song <ryncsn@gmail.com> writes:
>> >
>> > > From: Kairui Song <kasong@tencent.com>
>> > >
>> > > Currently, shmem uses cluster readahead for all swap backends. Cluster
>> > > readahead is not a good solution for ramdisk-based devices (e.g. ZRAM) at all.
>> > >
>> > > After switching to the new helper, most benchmarks showed a good result:
>> > >
>> > > - Single file sequential read:
>> > >   perf stat --repeat 20 dd if=/tmpfs/test of=/dev/null bs=1M count=8192
>> > >   (/tmpfs/test is a zero filled file, using brd as swap, 4G memcg limit)
>> > >   Before: 22.248 +- 0.549
>> > >   After:  22.021 +- 0.684 (-1.1%)
>> > >
>> > > - Random read stress test:
>> > >   fio -name=tmpfs --numjobs=16 --directory=/tmpfs \
>> > >   --size=256m --ioengine=mmap --rw=randread --random_distribution=random \
>> > >   --time_based --ramp_time=1m --runtime=5m --group_reporting
>> > >   (using brd as swap, 2G memcg limit)
>> > >
>> > >   Before: 1818MiB/s
>> > >   After:  1888MiB/s (+3.85%)
>> > >
>> > > - Zipf biased random read stress test:
>> > >   fio -name=tmpfs --numjobs=16 --directory=/tmpfs \
>> > >   --size=256m --ioengine=mmap --rw=randread --random_distribution=zipf:1.2 \
>> > >   --time_based --ramp_time=1m --runtime=5m --group_reporting
>> > >   (using brd as swap, 2G memcg limit)
>> > >
>> > >   Before: 31.1GiB/s
>> > >   After:  32.3GiB/s (+3.86%)
>> > >
>> > > So cluster readahead doesn't help much even for single-file sequential
>> > > read, and for the random stress tests, performance is better without it.
>> > >
>> > > Considering that both memory and the swap device will slowly become
>> > > more fragmented, and that the commonly used ZRAM consumes much more
>> > > CPU than a plain ramdisk, false readahead could occur more frequently
>> > > and waste more CPU. Direct swapin is cheaper, so use the new helper
>> > > and skip readahead for SWP_SYNCHRONOUS_IO devices.
>> >
>> > It's good to take advantage of swap_direct (no readahead).  I also hope
>> > we can take advantage of VMA based swapin if shmem is accessed via mmap.
>> > That appears possible.
>>
>> Good idea, that should be doable, will update the series.
>
> Hi Ying,
>
> Turns out it's quite complex to do VMA-based swapin readahead for shmem:
> the VMA address space / page tables don't contain swap entries for shmem.
> For anon pages, simply reading the nearby page table entries is easy and
> good enough, but for shmem the entries are stored in the inode mapping, so
> the readahead needs to walk the inode mapping instead. That's doable but
> requires more work to make it actually usable. I've sent V3 without this
> feature; this readahead extension is worth another series.

Got it.  Thanks for looking at this.

--
Best Regards,
Huang, Ying

Patch

diff --git a/mm/shmem.c b/mm/shmem.c
index 9da9f7a0e620..3c0729fe934d 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1564,20 +1564,6 @@  static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
 static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
 			pgoff_t index, unsigned int order, pgoff_t *ilx);
 
-static struct folio *shmem_swapin_cluster(swp_entry_t swap, gfp_t gfp,
-			struct shmem_inode_info *info, pgoff_t index)
-{
-	struct mempolicy *mpol;
-	pgoff_t ilx;
-	struct folio *folio;
-
-	mpol = shmem_get_pgoff_policy(info, index, 0, &ilx);
-	folio = swap_cluster_readahead(swap, gfp, mpol, ilx);
-	mpol_cond_put(mpol);
-
-	return folio;
-}
-
 /*
  * Make sure huge_gfp is always more limited than limit_gfp.
  * Some of the flags set permissions, while others set limitations.
@@ -1851,9 +1837,12 @@  static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 {
 	struct address_space *mapping = inode->i_mapping;
 	struct shmem_inode_info *info = SHMEM_I(inode);
+	enum swap_cache_result cache_result;
 	struct swap_info_struct *si;
 	struct folio *folio = NULL;
+	struct mempolicy *mpol;
 	swp_entry_t swap;
+	pgoff_t ilx;
 	int error;
 
 	VM_BUG_ON(!*foliop || !xa_is_value(*foliop));
@@ -1871,36 +1860,40 @@  static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 			return -EINVAL;
 	}
 
-	/* Look it up and read it in.. */
-	folio = swap_cache_get_folio(swap, NULL, 0, NULL);
+	mpol = shmem_get_pgoff_policy(info, index, 0, &ilx);
+	folio = swapin_entry_mpol(swap, gfp, mpol, ilx, &cache_result);
+	mpol_cond_put(mpol);
+
 	if (!folio) {
-		/* Or update major stats only when swapin succeeds?? */
+		error = -ENOMEM;
+		goto failed;
+	}
+	if (cache_result != SWAP_CACHE_HIT) {
 		if (fault_type) {
 			*fault_type |= VM_FAULT_MAJOR;
 			count_vm_event(PGMAJFAULT);
 			count_memcg_event_mm(fault_mm, PGMAJFAULT);
 		}
-		/* Here we actually start the io */
-		folio = shmem_swapin_cluster(swap, gfp, info, index);
-		if (!folio) {
-			error = -ENOMEM;
-			goto failed;
-		}
 	}
 
 	/* We have to do this with folio locked to prevent races */
 	folio_lock(folio);
-	if (!folio_test_swapcache(folio) ||
-	    folio->swap.val != swap.val ||
-	    !shmem_confirm_swap(mapping, index, swap)) {
+	if (cache_result != SWAP_CACHE_BYPASS) {
+		/* With cache bypass, folio is new allocated, sync, and not in cache */
+		if (!folio_test_swapcache(folio) || folio->swap.val != swap.val) {
+			error = -EEXIST;
+			goto unlock;
+		}
+		if (!folio_test_uptodate(folio)) {
+			error = -EIO;
+			goto failed;
+		}
+		folio_wait_writeback(folio);
+	}
+	if (!shmem_confirm_swap(mapping, index, swap)) {
 		error = -EEXIST;
 		goto unlock;
 	}
-	if (!folio_test_uptodate(folio)) {
-		error = -EIO;
-		goto failed;
-	}
-	folio_wait_writeback(folio);
 
 	/*
 	 * Some architectures may have to restore extra metadata to the
@@ -1908,12 +1901,19 @@  static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 	 */
 	arch_swap_restore(swap, folio);
 
-	if (shmem_should_replace_folio(folio, gfp)) {
+	/* With cache bypass, folio is new allocated and always respect gfp flags */
+	if (cache_result != SWAP_CACHE_BYPASS && shmem_should_replace_folio(folio, gfp)) {
 		error = shmem_replace_folio(&folio, gfp, info, index);
 		if (error)
 			goto failed;
 	}
 
+	/*
+	 * The expected value checking below should be enough to ensure
+	 * only one up-to-date swapin success. swap_free() is called after
+	 * this, so the entry can't be reused. As long as the mapping still
+	 * has the old entry value, it's never swapped in or modified.
+	 */
 	error = shmem_add_to_page_cache(folio, mapping, index,
 					swp_to_radix_entry(swap), gfp);
 	if (error)
@@ -1924,7 +1924,8 @@  static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 	if (sgp == SGP_WRITE)
 		folio_mark_accessed(folio);
 
-	delete_from_swap_cache(folio);
+	if (cache_result != SWAP_CACHE_BYPASS)
+		delete_from_swap_cache(folio);
 	folio_mark_dirty(folio);
 	swap_free(swap);
 	put_swap_device(si);
diff --git a/mm/swap.h b/mm/swap.h
index 8f790a67b948..20f4048c971c 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -57,9 +57,6 @@  void __delete_from_swap_cache(struct folio *folio,
 void delete_from_swap_cache(struct folio *folio);
 void clear_shadow_from_swap_cache(int type, unsigned long begin,
 				  unsigned long end);
-struct folio *swap_cache_get_folio(swp_entry_t entry,
-		struct vm_area_struct *vma, unsigned long addr,
-		void **shadowp);
 struct folio *filemap_get_incore_folio(struct address_space *mapping,
 		pgoff_t index);
 
@@ -123,12 +120,6 @@  static inline int swap_writepage(struct page *p, struct writeback_control *wbc)
 	return 0;
 }
 
-static inline struct folio *swap_cache_get_folio(swp_entry_t entry,
-		struct vm_area_struct *vma, unsigned long addr)
-{
-	return NULL;
-}
-
 static inline
 struct folio *filemap_get_incore_folio(struct address_space *mapping,
 		pgoff_t index)
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 3edf4b63158d..10eec68475dd 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -318,7 +318,14 @@  void free_pages_and_swap_cache(struct encoded_page **pages, int nr)
 
 static inline bool swap_use_no_readahead(struct swap_info_struct *si, swp_entry_t entry)
 {
-	return data_race(si->flags & SWP_SYNCHRONOUS_IO) && __swap_count(entry) == 1;
+	int count;
+
+	if (!data_race(si->flags & SWP_SYNCHRONOUS_IO))
+		return false;
+
+	count = __swap_count(entry);
+
+	return (count == 1 || count == SWAP_MAP_SHMEM);
 }
 
 static inline bool swap_use_vma_readahead(void)
@@ -334,7 +341,7 @@  static inline bool swap_use_vma_readahead(void)
  *
  * Caller must lock the swap device or hold a reference to keep it valid.
  */
-struct folio *swap_cache_get_folio(swp_entry_t entry,
+static struct folio *swap_cache_get_folio(swp_entry_t entry,
 		struct vm_area_struct *vma, unsigned long addr, void **shadowp)
 {
 	struct folio *folio;