diff mbox series

mm/vmscan: add sysctl knobs for protecting the working set

Message ID 20211130201652.2218636d@mail.inbox.lv (mailing list archive)
State New, archived
Headers show
Series mm/vmscan: add sysctl knobs for protecting the working set | expand

Commit Message

Alexey Avramov Nov. 30, 2021, 11:16 a.m. UTC
The kernel does not provide a way to protect the working set under memory
pressure. A certain amount of anonymous and clean file pages is required by
the userspace for normal operation. First of all, the userspace needs a
cache of shared libraries and executable binaries. If the amount of the
clean file pages falls below a certain level, then thrashing and even
livelock can take place.

The patch provides sysctl knobs for protecting the working set (anonymous
and clean file pages) under memory pressure.

The vm.anon_min_kbytes sysctl knob provides *hard* protection of anonymous
pages. The anonymous pages on the current node won't be reclaimed under any
conditions when their amount is below vm.anon_min_kbytes. This knob may be
used to prevent excessive swap thrashing when anonymous memory is low (for
example, when memory is going to be overfilled by compressed data of zram
module). The default value is defined by CONFIG_ANON_MIN_KBYTES (suggested
0 in Kconfig).

The vm.clean_low_kbytes sysctl knob provides *best-effort* protection of
clean file pages. The file pages on the current node won't be reclaimed
under memory pressure when the amount of clean file pages is below
vm.clean_low_kbytes *unless* we threaten to OOM. Protection of clean file
pages using this knob may be used when swapping is still possible to
  - prevent disk I/O thrashing under memory pressure;
  - improve performance in disk cache-bound tasks under memory pressure.
The default value is defined by CONFIG_CLEAN_LOW_KBYTES (suggested 0 in
Kconfig).

The vm.clean_min_kbytes sysctl knob provides *hard* protection of clean
file pages. The file pages on the current node won't be reclaimed under
memory pressure when the amount of clean file pages is below
vm.clean_min_kbytes. Hard protection of clean file pages using this knob
may be used to
  - prevent disk I/O thrashing under memory pressure even with no free swap
    space;
  - improve performance in disk cache-bound tasks under memory pressure;
  - avoid high latency and prevent livelock in near-OOM conditions.
The default value is defined by CONFIG_CLEAN_MIN_KBYTES (suggested 0 in
Kconfig).

Signed-off-by: Alexey Avramov <hakavlad@inbox.lv>
Reported-by: Artem S. Tashkinov <aros@gmx.com>
---
 Repo:
 https://github.com/hakavlad/le9-patch

 Documentation/admin-guide/sysctl/vm.rst | 66 ++++++++++++++++++++++++
 include/linux/mm.h                      |  4 ++
 kernel/sysctl.c                         | 21 ++++++++
 mm/Kconfig                              | 63 +++++++++++++++++++++++
 mm/vmscan.c                             | 91 +++++++++++++++++++++++++++++++++
 5 files changed, 245 insertions(+)


base-commit: d58071a8a76d779eedab38033ae4c821c30295a5
--
2.11.0

Comments

Luis Chamberlain Nov. 30, 2021, 3:28 p.m. UTC | #1
On Tue, Nov 30, 2021 at 08:16:52PM +0900, Alexey Avramov wrote:
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index 083be6af2..65fc38756 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -3132,6 +3132,27 @@ static struct ctl_table vm_table[] = {
>  	},
>  #endif
>  	{
> +		.procname	= "anon_min_kbytes",
> +		.data		= &sysctl_anon_min_kbytes,
> +		.maxlen		= sizeof(unsigned long),
> +		.mode		= 0644,
> +		.proc_handler	= proc_doulongvec_minmax,
> +	},
> +	{
> +		.procname	= "clean_low_kbytes",
> +		.data		= &sysctl_clean_low_kbytes,
> +		.maxlen		= sizeof(unsigned long),
> +		.mode		= 0644,
> +		.proc_handler	= proc_doulongvec_minmax,
> +	},
> +	{
> +		.procname	= "clean_min_kbytes",
> +		.data		= &sysctl_clean_min_kbytes,
> +		.maxlen		= sizeof(unsigned long),
> +		.mode		= 0644,
> +		.proc_handler	= proc_doulongvec_minmax,
> +	},
> +	{
>  		.procname	= "user_reserve_kbytes",
>  		.data		= &sysctl_user_reserve_kbytes,
>  		.maxlen		= sizeof(sysctl_user_reserve_kbytes),

Please don't clutter this file any more than what we have with random
sysctls, as otherwise it becomes a pain to deal with merge conflicts.
You can use register_sysctl_init("vm", whatever_your_local_table_name)
within the file you are adding your sysctl.

 Luis
Oleksandr Natalenko Nov. 30, 2021, 6:56 p.m. UTC | #2
Hello.

On úterý 30. listopadu 2021 12:16:52 CET Alexey Avramov wrote:
> The kernel does not provide a way to protect the working set under memory
> pressure. A certain amount of anonymous and clean file pages is required by
> the userspace for normal operation. First of all, the userspace needs a
> cache of shared libraries and executable binaries. If the amount of the
> clean file pages falls below a certain level, then thrashing and even
> livelock can take place.
> 
> The patch provides sysctl knobs for protecting the working set (anonymous
> and clean file pages) under memory pressure.
> 
> The vm.anon_min_kbytes sysctl knob provides *hard* protection of anonymous
> pages. The anonymous pages on the current node won't be reclaimed under any
> conditions when their amount is below vm.anon_min_kbytes. This knob may be
> used to prevent excessive swap thrashing when anonymous memory is low (for
> example, when memory is going to be overfilled by compressed data of zram
> module). The default value is defined by CONFIG_ANON_MIN_KBYTES (suggested
> 0 in Kconfig).
> 
> The vm.clean_low_kbytes sysctl knob provides *best-effort* protection of
> clean file pages. The file pages on the current node won't be reclaimed
> under memory pressure when the amount of clean file pages is below
> vm.clean_low_kbytes *unless* we threaten to OOM. Protection of clean file
> pages using this knob may be used when swapping is still possible to
>   - prevent disk I/O thrashing under memory pressure;
>   - improve performance in disk cache-bound tasks under memory pressure.
> The default value is defined by CONFIG_CLEAN_LOW_KBYTES (suggested 0 in
> Kconfig).
> 
> The vm.clean_min_kbytes sysctl knob provides *hard* protection of clean
> file pages. The file pages on the current node won't be reclaimed under
> memory pressure when the amount of clean file pages is below
> vm.clean_min_kbytes. Hard protection of clean file pages using this knob
> may be used to
>   - prevent disk I/O thrashing under memory pressure even with no free swap
>     space;
>   - improve performance in disk cache-bound tasks under memory pressure;
>   - avoid high latency and prevent livelock in near-OOM conditions.
> The default value is defined by CONFIG_CLEAN_MIN_KBYTES (suggested 0 in
> Kconfig).

Although this is a definitely system-wide knob, wouldn't it make sense to 
implement this also on a per-cgroup basis?

Thanks.

> 
> Signed-off-by: Alexey Avramov <hakavlad@inbox.lv>
> Reported-by: Artem S. Tashkinov <aros@gmx.com>
> ---
>  Repo:
>  https://github.com/hakavlad/le9-patch
> 
>  Documentation/admin-guide/sysctl/vm.rst | 66 ++++++++++++++++++++++++
>  include/linux/mm.h                      |  4 ++
>  kernel/sysctl.c                         | 21 ++++++++
>  mm/Kconfig                              | 63 +++++++++++++++++++++++
>  mm/vmscan.c                             | 91
> +++++++++++++++++++++++++++++++++ 5 files changed, 245 insertions(+)
> 
> diff --git a/Documentation/admin-guide/sysctl/vm.rst
> b/Documentation/admin-guide/sysctl/vm.rst index 5e7952021..2f606e23b 100644
> --- a/Documentation/admin-guide/sysctl/vm.rst
> +++ b/Documentation/admin-guide/sysctl/vm.rst
> @@ -25,6 +25,9 @@ files can be found in mm/swap.c.
>  Currently, these files are in /proc/sys/vm:
> 
>  - admin_reserve_kbytes
> +- anon_min_kbytes
> +- clean_low_kbytes
> +- clean_min_kbytes
>  - compact_memory
>  - compaction_proactiveness
>  - compact_unevictable_allowed
> @@ -105,6 +108,61 @@ On x86_64 this is about 128MB.
>  Changing this takes effect whenever an application requests memory.
> 
> 
> +anon_min_kbytes
> +===============
> +
> +This knob provides *hard* protection of anonymous pages. The anonymous
> pages +on the current node won't be reclaimed under any conditions when
> their amount +is below vm.anon_min_kbytes.
> +
> +This knob may be used to prevent excessive swap thrashing when anonymous
> +memory is low (for example, when memory is going to be overfilled by
> +compressed data of zram module).
> +
> +Setting this value too high (close to MemTotal) can result in inability to
> +swap and can lead to early OOM under memory pressure.
> +
> +The default value is defined by CONFIG_ANON_MIN_KBYTES.
> +
> +
> +clean_low_kbytes
> +================
> +
> +This knob provides *best-effort* protection of clean file pages. The file
> pages +on the current node won't be reclaimed under memory pressure when
> the amount of +clean file pages is below vm.clean_low_kbytes *unless* we
> threaten to OOM. +
> +Protection of clean file pages using this knob may be used when swapping is
> +still possible to
> +  - prevent disk I/O thrashing under memory pressure;
> +  - improve performance in disk cache-bound tasks under memory pressure.
> +
> +Setting it to a high value may result in a early eviction of anonymous
> pages +into the swap space by attempting to hold the protected amount of
> clean file +pages in memory.
> +
> +The default value is defined by CONFIG_CLEAN_LOW_KBYTES.
> +
> +
> +clean_min_kbytes
> +================
> +
> +This knob provides *hard* protection of clean file pages. The file pages on
> the +current node won't be reclaimed under memory pressure when the amount
> of clean +file pages is below vm.clean_min_kbytes.
> +
> +Hard protection of clean file pages using this knob may be used to
> +  - prevent disk I/O thrashing under memory pressure even with no free swap
> space; +  - improve performance in disk cache-bound tasks under memory
> pressure; +  - avoid high latency and prevent livelock in near-OOM
> conditions. +
> +Setting it to a high value may result in a early out-of-memory condition
> due to +the inability to reclaim the protected amount of clean file pages
> when other +types of pages cannot be reclaimed.
> +
> +The default value is defined by CONFIG_CLEAN_MIN_KBYTES.
> +
> +
>  compact_memory
>  ==============
> 
> @@ -864,6 +922,14 @@ be 133 (x + 2x = 200, 2x = 133.33).
>  At 0, the kernel will not initiate swap until the amount of free and
>  file-backed pages is less than the high watermark in a zone.
> 
> +This knob has no effect if the amount of clean file pages on the current
> +node is below vm.clean_low_kbytes or vm.clean_min_kbytes. In this case,
> +only anonymous pages can be reclaimed.
> +
> +If the number of anonymous pages on the current node is below
> +vm.anon_min_kbytes, then only file pages can be reclaimed with
> +any vm.swappiness value.
> +
> 
>  unprivileged_userfaultfd
>  ========================
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index a7e4a9e7d..bee9807d5 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -200,6 +200,10 @@ static inline void __mm_zero_struct_page(struct page
> *page)
> 
>  extern int sysctl_max_map_count;
> 
> +extern unsigned long sysctl_anon_min_kbytes;
> +extern unsigned long sysctl_clean_low_kbytes;
> +extern unsigned long sysctl_clean_min_kbytes;
> +
>  extern unsigned long sysctl_user_reserve_kbytes;
>  extern unsigned long sysctl_admin_reserve_kbytes;
> 
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index 083be6af2..65fc38756 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -3132,6 +3132,27 @@ static struct ctl_table vm_table[] = {
>  	},
>  #endif
>  	{
> +		.procname	= "anon_min_kbytes",
> +		.data		= &sysctl_anon_min_kbytes,
> +		.maxlen		= sizeof(unsigned long),
> +		.mode		= 0644,
> +		.proc_handler	= proc_doulongvec_minmax,
> +	},
> +	{
> +		.procname	= "clean_low_kbytes",
> +		.data		= &sysctl_clean_low_kbytes,
> +		.maxlen		= sizeof(unsigned long),
> +		.mode		= 0644,
> +		.proc_handler	= proc_doulongvec_minmax,
> +	},
> +	{
> +		.procname	= "clean_min_kbytes",
> +		.data		= &sysctl_clean_min_kbytes,
> +		.maxlen		= sizeof(unsigned long),
> +		.mode		= 0644,
> +		.proc_handler	= proc_doulongvec_minmax,
> +	},
> +	{
>  		.procname	= "user_reserve_kbytes",
>  		.data		= &sysctl_user_reserve_kbytes,
>  		.maxlen		= sizeof(sysctl_user_reserve_kbytes),
> diff --git a/mm/Kconfig b/mm/Kconfig
> index 28edafc82..dea0806d7 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -89,6 +89,69 @@ config SPARSEMEM_VMEMMAP
>  	  pfn_to_page and page_to_pfn operations.  This is the most
>  	  efficient option when sufficient kernel resources are available.
> 
> +config ANON_MIN_KBYTES
> +	int "Default value for vm.anon_min_kbytes"
> +	depends on SYSCTL
> +	range 0 4294967295
> +	default 0
> +	help
> +	  This option sets the default value for vm.anon_min_kbytes sysctl 
knob.
> +
> +	  The vm.anon_min_kbytes sysctl knob provides *hard* protection of
> +	  anonymous pages. The anonymous pages on the current node won't be
> +	  reclaimed under any conditions when their amount is below
> +	  vm.anon_min_kbytes. This knob may be used to prevent excessive swap
> +	  thrashing when anonymous memory is low (for example, when memory is
> +	  going to be overfilled by compressed data of zram module).
> +
> +	  Setting this value too high (close to MemTotal) can result in
> +	  inability to swap and can lead to early OOM under memory pressure.
> +
> +config CLEAN_LOW_KBYTES
> +	int "Default value for vm.clean_low_kbytes"
> +	depends on SYSCTL
> +	range 0 4294967295
> +	default 0
> +	help
> +	  This option sets the default value for vm.clean_low_kbytes sysctl 
knob.
> +
> +	  The vm.clean_low_kbytes sysctl knob provides *best-effort*
> +	  protection of clean file pages. The file pages on the current node
> +	  won't be reclaimed under memory pressure when the amount of clean file
> +	  pages is below vm.clean_low_kbytes *unless* we threaten to OOM.
> +	  Protection of clean file pages using this knob may be used when
> +	  swapping is still possible to
> +	    - prevent disk I/O thrashing under memory pressure;
> +	    - improve performance in disk cache-bound tasks under memory
> +	      pressure.
> +
> +	  Setting it to a high value may result in a early eviction of 
anonymous
> +	  pages into the swap space by attempting to hold the protected amount
> +	  of clean file pages in memory.
> +
> +config CLEAN_MIN_KBYTES
> +	int "Default value for vm.clean_min_kbytes"
> +	depends on SYSCTL
> +	range 0 4294967295
> +	default 0
> +	help
> +	  This option sets the default value for vm.clean_min_kbytes sysctl 
knob.
> +
> +	  The vm.clean_min_kbytes sysctl knob provides *hard* protection of
> +	  clean file pages. The file pages on the current node won't be
> +	  reclaimed under memory pressure when the amount of clean file pages is
> +	  below vm.clean_min_kbytes. Hard protection of clean file pages using
> +	  this knob may be used to
> +	    - prevent disk I/O thrashing under memory pressure even with no 
free
> +	      swap space;
> +	    - improve performance in disk cache-bound tasks under memory
> +	      pressure;
> +	    - avoid high latency and prevent livelock in near-OOM conditions.
> +
> +	  Setting it to a high value may result in a early out-of-memory 
condition
> +	  due to the inability to reclaim the protected amount of clean file
> pages +	  when other types of pages cannot be reclaimed.
> +
>  config HAVE_MEMBLOCK_PHYS_MAP
>  	bool
> 
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index fb9584641..928f3371d 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -122,6 +122,15 @@ struct scan_control {
>  	/* The file pages on the current node are dangerously low */
>  	unsigned int file_is_tiny:1;
> 
> +	/* The anonymous pages on the current node are below vm.anon_min_kbytes 
*/
> +	unsigned int anon_below_min:1;
> +
> +	/* The clean file pages on the current node are below 
vm.clean_low_kbytes
> */ +	unsigned int clean_below_low:1;
> +
> +	/* The clean file pages on the current node are below 
vm.clean_min_kbytes
> */ +	unsigned int clean_below_min:1;
> +
>  	/* Always discard instead of demoting to lower tier memory */
>  	unsigned int no_demotion:1;
> 
> @@ -171,6 +180,10 @@ struct scan_control {
>  #define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
>  #endif
> 
> +unsigned long sysctl_anon_min_kbytes __read_mostly =
> CONFIG_ANON_MIN_KBYTES; +unsigned long sysctl_clean_low_kbytes
> __read_mostly = CONFIG_CLEAN_LOW_KBYTES; +unsigned long
> sysctl_clean_min_kbytes __read_mostly = CONFIG_CLEAN_MIN_KBYTES; +
>  /*
>   * From 0 .. 200.  Higher means more swappy.
>   */
> @@ -2734,6 +2747,15 @@ static void get_scan_count(struct lruvec *lruvec,
> struct scan_control *sc, }
> 
>  	/*
> +	 * Force-scan anon if clean file pages is under vm.clean_low_kbytes
> +	 * or vm.clean_min_kbytes.
> +	 */
> +	if (sc->clean_below_low || sc->clean_below_min) {
> +		scan_balance = SCAN_ANON;
> +		goto out;
> +	}
> +
> +	/*
>  	 * If there is enough inactive page cache, we do not reclaim
>  	 * anything from the anonymous working right now.
>  	 */
> @@ -2877,6 +2899,25 @@ static void get_scan_count(struct lruvec *lruvec,
> struct scan_control *sc, BUG();
>  		}
> 
> +		/*
> +		 * Hard protection of the working set.
> +		 */
> +		if (file) {
> +			/*
> +			 * Don't reclaim file pages when the amount of
> +			 * clean file pages is below vm.clean_min_kbytes.
> +			 */
> +			if (sc->clean_below_min)
> +				scan = 0;
> +		} else {
> +			/*
> +			 * Don't reclaim anonymous pages when their
> +			 * amount is below vm.anon_min_kbytes.
> +			 */
> +			if (sc->anon_below_min)
> +				scan = 0;
> +		}
> +
>  		nr[lru] = scan;
>  	}
>  }
> @@ -3082,6 +3123,54 @@ static inline bool should_continue_reclaim(struct
> pglist_data *pgdat, return inactive_lru_pages > pages_for_compaction;
>  }
> 
> +static void prepare_workingset_protection(pg_data_t *pgdat, struct
> scan_control *sc) +{
> +	/*
> +	 * Check the number of anonymous pages to protect them from
> +	 * reclaiming if their amount is below the specified.
> +	 */
> +	if (sysctl_anon_min_kbytes) {
> +		unsigned long reclaimable_anon;
> +
> +		reclaimable_anon =
> +			node_page_state(pgdat, NR_ACTIVE_ANON) +
> +			node_page_state(pgdat, NR_INACTIVE_ANON) +
> +			node_page_state(pgdat, NR_ISOLATED_ANON);
> +		reclaimable_anon <<= (PAGE_SHIFT - 10);
> +
> +		sc->anon_below_min = reclaimable_anon < sysctl_anon_min_kbytes;
> +	} else
> +		sc->anon_below_min = 0;
> +
> +	/*
> +	 * Check the number of clean file pages to protect them from
> +	 * reclaiming if their amount is below the specified.
> +	 */
> +	if (sysctl_clean_low_kbytes || sysctl_clean_min_kbytes) {
> +		unsigned long reclaimable_file, dirty, clean;
> +
> +		reclaimable_file =
> +			node_page_state(pgdat, NR_ACTIVE_FILE) +
> +			node_page_state(pgdat, NR_INACTIVE_FILE) +
> +			node_page_state(pgdat, NR_ISOLATED_FILE);
> +		dirty = node_page_state(pgdat, NR_FILE_DIRTY);
> +		/*
> +		 * node_page_state() sum can go out of sync since
> +		 * all the values are not read at once.
> +		 */
> +		if (likely(reclaimable_file > dirty))
> +			clean = (reclaimable_file - dirty) << (PAGE_SHIFT - 10);
> +		else
> +			clean = 0;
> +
> +		sc->clean_below_low = clean < sysctl_clean_low_kbytes;
> +		sc->clean_below_min = clean < sysctl_clean_min_kbytes;
> +	} else {
> +		sc->clean_below_low = 0;
> +		sc->clean_below_min = 0;
> +	}
> +}
> +
>  static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
>  {
>  	struct mem_cgroup *target_memcg = sc->target_mem_cgroup;
> @@ -3249,6 +3338,8 @@ static void shrink_node(pg_data_t *pgdat, struct
> scan_control *sc) anon >> sc->priority;
>  	}
> 
> +	prepare_workingset_protection(pgdat, sc);
> +
>  	shrink_node_memcgs(pgdat, sc);
> 
>  	if (reclaim_state) {
> 
> base-commit: d58071a8a76d779eedab38033ae4c821c30295a5
> --
> 2.11.0
Alexey Avramov Dec. 1, 2021, 3:51 p.m. UTC | #3
>Although this is a definitely system-wide knob, wouldn't it make sense to 
>implement this also on a per-cgroup basis?

memory.min and memory.low already exist.

Regarding the protection of file pages, we are primarily interested in
shared libraries. I don't see the point of creating such tunables
for cgroups.
ValdikSS Dec. 2, 2021, 6:05 p.m. UTC | #4
This patchset is surprisingly effective and very useful for low-end PCs 
with slow HDDs, single-board ARM boards with slow storage, cheap Android 
smartphones with a limited amount of memory. It almost completely prevents 
the thrashing condition and aids in fast OOM killer invocation.

A similar file-locking patch has been used in ChromeOS for nearly 10 years 
but not in stock Linux or Android. It would be very beneficial for 
lower-performance Android phones, SBCs, old PCs and other devices.

With this patch, combined with zram, I'm able to run the following 
software on an old office PC from 2007 with __only 2GB of RAM__ 
simultaneously:

  * Firefox with 37 active tabs (all data in RAM, no tab unloading)
  * Discord
  * Skype
  * LibreOffice with the document opened
  * Two PDF files (14 and 47 megabytes in size)

And the PC doesn't crawl like a snail, even with 2+ GB in zram!
Without the patch, this PC is barely usable.
Please watch the video:
https://notes.valdikss.org.ru/linux-for-old-pc-from-2007/en/



On 30.11.2021 14:16, Alexey Avramov wrote:
> The kernel does not provide a way to protect the working set under memory
> pressure. A certain amount of anonymous and clean file pages is required by
> the userspace for normal operation. First of all, the userspace needs a
> cache of shared libraries and executable binaries. If the amount of the
> clean file pages falls below a certain level, then thrashing and even
> livelock can take place.
> 
> The patch provides sysctl knobs for protecting the working set (anonymous
> and clean file pages) under memory pressure.
> 
> The vm.anon_min_kbytes sysctl knob provides *hard* protection of anonymous
> pages. The anonymous pages on the current node won't be reclaimed under any
> conditions when their amount is below vm.anon_min_kbytes. This knob may be
> used to prevent excessive swap thrashing when anonymous memory is low (for
> example, when memory is going to be overfilled by compressed data of zram
> module). The default value is defined by CONFIG_ANON_MIN_KBYTES (suggested
> 0 in Kconfig).
> 
> The vm.clean_low_kbytes sysctl knob provides *best-effort* protection of
> clean file pages. The file pages on the current node won't be reclaimed
> under memory pressure when the amount of clean file pages is below
> vm.clean_low_kbytes *unless* we threaten to OOM. Protection of clean file
> pages using this knob may be used when swapping is still possible to
>    - prevent disk I/O thrashing under memory pressure;
>    - improve performance in disk cache-bound tasks under memory pressure.
> The default value is defined by CONFIG_CLEAN_LOW_KBYTES (suggested 0 in
> Kconfig).
> 
> The vm.clean_min_kbytes sysctl knob provides *hard* protection of clean
> file pages. The file pages on the current node won't be reclaimed under
> memory pressure when the amount of clean file pages is below
> vm.clean_min_kbytes. Hard protection of clean file pages using this knob
> may be used to
>    - prevent disk I/O thrashing under memory pressure even with no free swap
>      space;
>    - improve performance in disk cache-bound tasks under memory pressure;
>    - avoid high latency and prevent livelock in near-OOM conditions.
> The default value is defined by CONFIG_CLEAN_MIN_KBYTES (suggested 0 in
> Kconfig).
> 
> Signed-off-by: Alexey Avramov <hakavlad@inbox.lv>
> Reported-by: Artem S. Tashkinov <aros@gmx.com>
> ---
>   Repo:
>   https://github.com/hakavlad/le9-patch
> 
>   Documentation/admin-guide/sysctl/vm.rst | 66 ++++++++++++++++++++++++
>   include/linux/mm.h                      |  4 ++
>   kernel/sysctl.c                         | 21 ++++++++
>   mm/Kconfig                              | 63 +++++++++++++++++++++++
>   mm/vmscan.c                             | 91 +++++++++++++++++++++++++++++++++
>   5 files changed, 245 insertions(+)
> 
> diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
> index 5e7952021..2f606e23b 100644
> --- a/Documentation/admin-guide/sysctl/vm.rst
> +++ b/Documentation/admin-guide/sysctl/vm.rst
> @@ -25,6 +25,9 @@ files can be found in mm/swap.c.
>   Currently, these files are in /proc/sys/vm:
> 
>   - admin_reserve_kbytes
> +- anon_min_kbytes
> +- clean_low_kbytes
> +- clean_min_kbytes
>   - compact_memory
>   - compaction_proactiveness
>   - compact_unevictable_allowed
> @@ -105,6 +108,61 @@ On x86_64 this is about 128MB.
>   Changing this takes effect whenever an application requests memory.
> 
> 
> +anon_min_kbytes
> +===============
> +
> +This knob provides *hard* protection of anonymous pages. The anonymous pages
> +on the current node won't be reclaimed under any conditions when their amount
> +is below vm.anon_min_kbytes.
> +
> +This knob may be used to prevent excessive swap thrashing when anonymous
> +memory is low (for example, when memory is going to be overfilled by
> +compressed data of zram module).
> +
> +Setting this value too high (close to MemTotal) can result in inability to
> +swap and can lead to early OOM under memory pressure.
> +
> +The default value is defined by CONFIG_ANON_MIN_KBYTES.
> +
> +
> +clean_low_kbytes
> +================
> +
> +This knob provides *best-effort* protection of clean file pages. The file pages
> +on the current node won't be reclaimed under memory pressure when the amount of
> +clean file pages is below vm.clean_low_kbytes *unless* we threaten to OOM.
> +
> +Protection of clean file pages using this knob may be used when swapping is
> +still possible to
> +  - prevent disk I/O thrashing under memory pressure;
> +  - improve performance in disk cache-bound tasks under memory pressure.
> +
> +Setting it to a high value may result in a early eviction of anonymous pages
> +into the swap space by attempting to hold the protected amount of clean file
> +pages in memory.
> +
> +The default value is defined by CONFIG_CLEAN_LOW_KBYTES.
> +
> +
> +clean_min_kbytes
> +================
> +
> +This knob provides *hard* protection of clean file pages. The file pages on the
> +current node won't be reclaimed under memory pressure when the amount of clean
> +file pages is below vm.clean_min_kbytes.
> +
> +Hard protection of clean file pages using this knob may be used to
> +  - prevent disk I/O thrashing under memory pressure even with no free swap space;
> +  - improve performance in disk cache-bound tasks under memory pressure;
> +  - avoid high latency and prevent livelock in near-OOM conditions.
> +
> +Setting it to a high value may result in a early out-of-memory condition due to
> +the inability to reclaim the protected amount of clean file pages when other
> +types of pages cannot be reclaimed.
> +
> +The default value is defined by CONFIG_CLEAN_MIN_KBYTES.
> +
> +
>   compact_memory
>   ==============
> 
> @@ -864,6 +922,14 @@ be 133 (x + 2x = 200, 2x = 133.33).
>   At 0, the kernel will not initiate swap until the amount of free and
>   file-backed pages is less than the high watermark in a zone.
> 
> +This knob has no effect if the amount of clean file pages on the current
> +node is below vm.clean_low_kbytes or vm.clean_min_kbytes. In this case,
> +only anonymous pages can be reclaimed.
> +
> +If the number of anonymous pages on the current node is below
> +vm.anon_min_kbytes, then only file pages can be reclaimed with
> +any vm.swappiness value.
> +
> 
>   unprivileged_userfaultfd
>   ========================
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index a7e4a9e7d..bee9807d5 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -200,6 +200,10 @@ static inline void __mm_zero_struct_page(struct page *page)
> 
>   extern int sysctl_max_map_count;
> 
> +extern unsigned long sysctl_anon_min_kbytes;
> +extern unsigned long sysctl_clean_low_kbytes;
> +extern unsigned long sysctl_clean_min_kbytes;
> +
>   extern unsigned long sysctl_user_reserve_kbytes;
>   extern unsigned long sysctl_admin_reserve_kbytes;
> 
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index 083be6af2..65fc38756 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -3132,6 +3132,27 @@ static struct ctl_table vm_table[] = {
>   	},
>   #endif
>   	{
> +		.procname	= "anon_min_kbytes",
> +		.data		= &sysctl_anon_min_kbytes,
> +		.maxlen		= sizeof(unsigned long),
> +		.mode		= 0644,
> +		.proc_handler	= proc_doulongvec_minmax,
> +	},
> +	{
> +		.procname	= "clean_low_kbytes",
> +		.data		= &sysctl_clean_low_kbytes,
> +		.maxlen		= sizeof(unsigned long),
> +		.mode		= 0644,
> +		.proc_handler	= proc_doulongvec_minmax,
> +	},
> +	{
> +		.procname	= "clean_min_kbytes",
> +		.data		= &sysctl_clean_min_kbytes,
> +		.maxlen		= sizeof(unsigned long),
> +		.mode		= 0644,
> +		.proc_handler	= proc_doulongvec_minmax,
> +	},
> +	{
>   		.procname	= "user_reserve_kbytes",
>   		.data		= &sysctl_user_reserve_kbytes,
>   		.maxlen		= sizeof(sysctl_user_reserve_kbytes),
> diff --git a/mm/Kconfig b/mm/Kconfig
> index 28edafc82..dea0806d7 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -89,6 +89,69 @@ config SPARSEMEM_VMEMMAP
>   	  pfn_to_page and page_to_pfn operations.  This is the most
>   	  efficient option when sufficient kernel resources are available.
> 
> +config ANON_MIN_KBYTES
> +	int "Default value for vm.anon_min_kbytes"
> +	depends on SYSCTL
> +	range 0 4294967295
> +	default 0
> +	help
> +	  This option sets the default value for vm.anon_min_kbytes sysctl knob.
> +
> +	  The vm.anon_min_kbytes sysctl knob provides *hard* protection of
> +	  anonymous pages. The anonymous pages on the current node won't be
> +	  reclaimed under any conditions when their amount is below
> +	  vm.anon_min_kbytes. This knob may be used to prevent excessive swap
> +	  thrashing when anonymous memory is low (for example, when memory is
> +	  going to be overfilled by compressed data of zram module).
> +
> +	  Setting this value too high (close to MemTotal) can result in
> +	  inability to swap and can lead to early OOM under memory pressure.
> +
> +config CLEAN_LOW_KBYTES
> +	int "Default value for vm.clean_low_kbytes"
> +	depends on SYSCTL
> +	range 0 4294967295
> +	default 0
> +	help
> +	  This option sets the default value for vm.clean_low_kbytes sysctl knob.
> +
> +	  The vm.clean_low_kbytes sysctl knob provides *best-effort*
> +	  protection of clean file pages. The file pages on the current node
> +	  won't be reclaimed under memory pressure when the amount of clean file
> +	  pages is below vm.clean_low_kbytes *unless* we threaten to OOM.
> +	  Protection of clean file pages using this knob may be used when
> +	  swapping is still possible to
> +	    - prevent disk I/O thrashing under memory pressure;
> +	    - improve performance in disk cache-bound tasks under memory
> +	      pressure.
> +
> +	  Setting it to a high value may result in a early eviction of anonymous
> +	  pages into the swap space by attempting to hold the protected amount
> +	  of clean file pages in memory.
> +
> +config CLEAN_MIN_KBYTES
> +	int "Default value for vm.clean_min_kbytes"
> +	depends on SYSCTL
> +	range 0 4294967295
> +	default 0
> +	help
> +	  This option sets the default value for vm.clean_min_kbytes sysctl knob.
> +
> +	  The vm.clean_min_kbytes sysctl knob provides *hard* protection of
> +	  clean file pages. The file pages on the current node won't be
> +	  reclaimed under memory pressure when the amount of clean file pages is
> +	  below vm.clean_min_kbytes. Hard protection of clean file pages using
> +	  this knob may be used to
> +	    - prevent disk I/O thrashing under memory pressure even with no free
> +	      swap space;
> +	    - improve performance in disk cache-bound tasks under memory
> +	      pressure;
> +	    - avoid high latency and prevent livelock in near-OOM conditions.
> +
> +	  Setting it to a high value may result in an early out-of-memory condition
> +	  due to the inability to reclaim the protected amount of clean file pages
> +	  when other types of pages cannot be reclaimed.
> +
>   config HAVE_MEMBLOCK_PHYS_MAP
>   	bool
> 
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index fb9584641..928f3371d 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -122,6 +122,15 @@ struct scan_control {
>   	/* The file pages on the current node are dangerously low */
>   	unsigned int file_is_tiny:1;
> 
> +	/* The anonymous pages on the current node are below vm.anon_min_kbytes */
> +	unsigned int anon_below_min:1;
> +
> +	/* The clean file pages on the current node are below vm.clean_low_kbytes */
> +	unsigned int clean_below_low:1;
> +
> +	/* The clean file pages on the current node are below vm.clean_min_kbytes */
> +	unsigned int clean_below_min:1;
> +
>   	/* Always discard instead of demoting to lower tier memory */
>   	unsigned int no_demotion:1;
> 
> @@ -171,6 +180,10 @@ struct scan_control {
>   #define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
>   #endif
> 
> +unsigned long sysctl_anon_min_kbytes __read_mostly = CONFIG_ANON_MIN_KBYTES;
> +unsigned long sysctl_clean_low_kbytes __read_mostly = CONFIG_CLEAN_LOW_KBYTES;
> +unsigned long sysctl_clean_min_kbytes __read_mostly = CONFIG_CLEAN_MIN_KBYTES;
> +
>   /*
>    * From 0 .. 200.  Higher means more swappy.
>    */
> @@ -2734,6 +2747,15 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
>   	}
> 
>   	/*
> +	 * Force-scan anon if clean file pages is under vm.clean_low_kbytes
> +	 * or vm.clean_min_kbytes.
> +	 */
> +	if (sc->clean_below_low || sc->clean_below_min) {
> +		scan_balance = SCAN_ANON;
> +		goto out;
> +	}
> +
> +	/*
>   	 * If there is enough inactive page cache, we do not reclaim
>   	 * anything from the anonymous working right now.
>   	 */
> @@ -2877,6 +2899,25 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
>   			BUG();
>   		}
> 
> +		/*
> +		 * Hard protection of the working set.
> +		 */
> +		if (file) {
> +			/*
> +			 * Don't reclaim file pages when the amount of
> +			 * clean file pages is below vm.clean_min_kbytes.
> +			 */
> +			if (sc->clean_below_min)
> +				scan = 0;
> +		} else {
> +			/*
> +			 * Don't reclaim anonymous pages when their
> +			 * amount is below vm.anon_min_kbytes.
> +			 */
> +			if (sc->anon_below_min)
> +				scan = 0;
> +		}
> +
>   		nr[lru] = scan;
>   	}
>   }
> @@ -3082,6 +3123,54 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
>   	return inactive_lru_pages > pages_for_compaction;
>   }
> 
> +static void prepare_workingset_protection(pg_data_t *pgdat, struct scan_control *sc)
> +{
> +	/*
> +	 * Check the number of anonymous pages to protect them from
> +	 * reclaiming if their amount is below the specified.
> +	 */
> +	if (sysctl_anon_min_kbytes) {
> +		unsigned long reclaimable_anon;
> +
> +		reclaimable_anon =
> +			node_page_state(pgdat, NR_ACTIVE_ANON) +
> +			node_page_state(pgdat, NR_INACTIVE_ANON) +
> +			node_page_state(pgdat, NR_ISOLATED_ANON);
> +		reclaimable_anon <<= (PAGE_SHIFT - 10);
> +
> +		sc->anon_below_min = reclaimable_anon < sysctl_anon_min_kbytes;
> +	} else
> +		sc->anon_below_min = 0;
> +
> +	/*
> +	 * Check the number of clean file pages to protect them from
> +	 * reclaiming if their amount is below the specified.
> +	 */
> +	if (sysctl_clean_low_kbytes || sysctl_clean_min_kbytes) {
> +		unsigned long reclaimable_file, dirty, clean;
> +
> +		reclaimable_file =
> +			node_page_state(pgdat, NR_ACTIVE_FILE) +
> +			node_page_state(pgdat, NR_INACTIVE_FILE) +
> +			node_page_state(pgdat, NR_ISOLATED_FILE);
> +		dirty = node_page_state(pgdat, NR_FILE_DIRTY);
> +		/*
> +		 * node_page_state() sum can go out of sync since
> +		 * all the values are not read at once.
> +		 */
> +		if (likely(reclaimable_file > dirty))
> +			clean = (reclaimable_file - dirty) << (PAGE_SHIFT - 10);
> +		else
> +			clean = 0;
> +
> +		sc->clean_below_low = clean < sysctl_clean_low_kbytes;
> +		sc->clean_below_min = clean < sysctl_clean_min_kbytes;
> +	} else {
> +		sc->clean_below_low = 0;
> +		sc->clean_below_min = 0;
> +	}
> +}
> +
>   static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
>   {
>   	struct mem_cgroup *target_memcg = sc->target_mem_cgroup;
> @@ -3249,6 +3338,8 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
>   			anon >> sc->priority;
>   	}
> 
> +	prepare_workingset_protection(pgdat, sc);
> +
>   	shrink_node_memcgs(pgdat, sc);
> 
>   	if (reclaim_state) {
> 
> base-commit: d58071a8a76d779eedab38033ae4c821c30295a5
> --
> 2.11.0
>
Andrew Morton Dec. 2, 2021, 9:58 p.m. UTC | #5
On Thu, 2 Dec 2021 21:05:01 +0300 ValdikSS <iam@valdikss.org.ru> wrote:

> This patchset is surprisingly effective and very useful for low-end PC 
> with slow HDD, single-board ARM boards with slow storage, cheap Android 
> smartphones with limited amount of memory. It almost completely prevents 
> thrashing condition and aids in fast OOM killer invocation.
> 
> The similar file-locking patch is used in ChromeOS for nearly 10 years 
> but not on stock Linux or Android. It would be very beneficial for 
> lower-performance Android phones, SBCs, old PCs and other devices.
> 
> With this patch, combined with zram, I'm able to run the following 
> software on an old office PC from 2007 with __only 2GB of RAM__ 
> simultaneously:
> 
>   * Firefox with 37 active tabs (all data in RAM, no tab unloading)
>   * Discord
>   * Skype
>   * LibreOffice with the document opened
>   * Two PDF files (14 and 47 megabytes in size)
> 
> And the PC doesn't crawl like a snail, even with 2+ GB in zram!
> Without the patch, this PC is barely usable.
> Please watch the video:
> https://notes.valdikss.org.ru/linux-for-old-pc-from-2007/en/
> 

This is quite a condemnation of the current VM.  It shouldn't crawl
like a snail.

The patch simply sets hard limits on page reclaim's malfunctioning. 
I'd prefer that reclaim not malfunction :(

That being said, I can see that a blunt instrument like this would be
useful.

I don't think that the limits should be "N bytes on the current node". 
Nodes can have different amounts of memory so I expect it should scale
the hard limits on a per-node basis.  And of course, the various zones
have different size as well.

We do already have a lot of sysctls for controlling these sort of
things.  Was much work put into attempting to utilize the existing
sysctls to overcome these issues?
Vlastimil Babka Dec. 3, 2021, 11:59 a.m. UTC | #6
On 12/2/21 22:58, Andrew Morton wrote:
> On Thu, 2 Dec 2021 21:05:01 +0300 ValdikSS <iam@valdikss.org.ru> wrote:
> 
>> This patchset is surprisingly effective and very useful for low-end PC 
>> with slow HDD, single-board ARM boards with slow storage, cheap Android 
>> smartphones with limited amount of memory. It almost completely prevents 
>> thrashing condition and aids in fast OOM killer invocation.
>> 
>> The similar file-locking patch is used in ChromeOS for nearly 10 years 
>> but not on stock Linux or Android. It would be very beneficial for 
>> lower-performance Android phones, SBCs, old PCs and other devices.
>> 
>> With this patch, combined with zram, I'm able to run the following 
>> software on an old office PC from 2007 with __only 2GB of RAM__ 
>> simultaneously:
>> 
>>   * Firefox with 37 active tabs (all data in RAM, no tab unloading)
>>   * Discord
>>   * Skype
>>   * LibreOffice with the document opened
>>   * Two PDF files (14 and 47 megabytes in size)
>> 
>> And the PC doesn't crawl like a snail, even with 2+ GB in zram!
>> Without the patch, this PC is barely usable.
>> Please watch the video:
>> https://notes.valdikss.org.ru/linux-for-old-pc-from-2007/en/
>> 
> 
> This is quite a condemnation of the current VM.  It shouldn't crawl
> like a snail.
> 
> The patch simply sets hard limits on page reclaim's malfunctioning. 
> I'd prefer that reclaim not malfunction :(

+CC Johannes

I'd also like to know where that malfunction happens in this case. The
relatively well known scenario is that memory overloaded systems thrash
instead of going OOM quickly - something PSI should be able to help with.

But in your case, if there is no OOM due to the added protections, it would
mean that the system is in fact not overloaded, just that the normal reclaim
decisions lead to reclaming something that should be left in memory, while
there is other memory that can be reclaimed without causing thrashing?
That's perhaps worse and worth investigating.

> That being said, I can see that a blunt instrument like this would be
> useful.
> 
> I don't think that the limits should be "N bytes on the current node". 
> Nodes can have different amounts of memory so I expect it should scale
> the hard limits on a per-node basis.  And of course, the various zones
> have different size as well.
> 
> We do already have a lot of sysctls for controlling these sort of
> things.  Was much work put into attempting to utilize the existing
> sysctls to overcome these issues?
> 
>
Alexey Avramov Dec. 3, 2021, 1:27 p.m. UTC | #7
>I'd also like to know where that malfunction happens in this case.

User-space processes need to always access shared libraries to work.
It can be tens or hundreds of megabytes, depending on the type of workload. 
This is a hot cache, which is pushed out and then read leads to thrashing. 
There is no way in the kernel to forbid evicting the minimum file cache. 
This is the problem that the patch solves. And the malfunction is exactly
that - the inability of the kernel to hold the minimum amount of the
hottest cache in memory.

Another explanation:

> in normal operation you will have nearly all of your executables nad 
> libraries sitting in good ol' physical RAM. But when RAM runs low, but 
> not low enough for the out-of-memory killer to be run, these pages are 
> evicted from RAM. So you end up with a situation where pages are 
> evicted -- at first, no problem, because they are evicted 
> least-recently-used first and it kicks out pages you aren't using 
> anyway. But then, it kicks out the ones you are using, just to have 
> to page them right back in moments later. Thrash city.
-- [0]

Just look at prelockd [1]. This is the process that mlocks mmapped
libraries/binaries of existing processes. The result of it's work:
it's impossible to invoke thrashing under memory pressure, at least 
with noswap. And OOM killer comes *instantly* when it runs.
Please see the demo [2]. We can get the same effect when setting
vm.clean_min_kbytes=250000, for example.

>something PSI should be able to help with

PSI acts post-factum: on the basis of PSI we react when memory 
pressure is already high. PSI cannot help *prevent* thrashing.

Using vm.clean_min_kbytes knob allows you to get OOM *before*
memory/io pressure gets high and keep the system manageable instead
of getting livelock indefinitely.

Demo [3]: playing supertux under stress, fs on HDD,
vm.clean_low_kbytes=250000, no thrashing, no freeze,
io pressure close to 0.

Yet another demo [4]: no stalls with the case that was reported [5] by
Artem S. Tashkinov in 2019. Interesting that in that thread ndrw
suggested [6] the right solution:

> Would it be possible to reserve a fixed (configurable) amount of RAM 
> for caches, and trigger OOM killer earlier, before most UI code is 
> evicted from memory? In my use case, I am happy sacrificing e.g. 0.5GB 
> and kill runaway tasks _before_ the system freezes. Potentially OOM 
> killer would also work better in such conditions. I almost never work 
> at close to full memory capacity, it's always a single task that goes 
> wrong and brings the system down.

> The problem with PSI sensing is that it works after the fact (after 
> the freeze has already occurred). It is not very different from issuing 
> SysRq-f manually on a frozen system, although it would still be a 
> handy feature for batched tasks and remote access. 

but Michal Hocko immediately criticized [7] the proposal unfairly. 
This patch just implements ndrw's suggestion.

[0] https://serverfault.com/a/319818
[1] https://github.com/hakavlad/prelockd

[2] https://www.youtube.com/watch?v=vykUrP1UvcI
    On this video: running fast memory hog in a loop on Debian 10 GNOME, 
    4 GiB MemTotal without swap space. FS is ext4 on *HDD*.
    - 1. prelockd enabled: about 500 MiB mlocked. Starting 
        `while true; do tail /dev/zero; done`: no freezes. 
        The OOM killer comes quickly, the system recovers quickly.
    - 2. prelockd disabled: system hangs.

[3] https://www.youtube.com/watch?v=g9GCmp-7WXw
[4] https://www.youtube.com/watch?v=iU3ikgNgp3M
[5] Let's talk about the elephant in the room - the Linux kernel's 
    inability to gracefully handle low memory pressure
    https://lore.kernel.org/all/d9802b6a-949b-b327-c4a6-3dbca485ec20@gmx.com/
[6] https://lore.kernel.org/all/806F5696-A8D6-481D-A82F-49DEC1F2B035@redhazel.co.uk/
[7] https://lore.kernel.org/all/20190808163228.GE18351@dhcp22.suse.cz/
Oleksandr Natalenko Dec. 3, 2021, 2:01 p.m. UTC | #8
Hello.

On čtvrtek 2. prosince 2021 22:58:24 CET Andrew Morton wrote:
> On Thu, 2 Dec 2021 21:05:01 +0300 ValdikSS <iam@valdikss.org.ru> wrote:
> > This patchset is surprisingly effective and very useful for low-end PC
> > with slow HDD, single-board ARM boards with slow storage, cheap Android
> > smartphones with limited amount of memory. It almost completely prevents
> > thrashing condition and aids in fast OOM killer invocation.
> > 
> > The similar file-locking patch is used in ChromeOS for nearly 10 years
> > but not on stock Linux or Android. It would be very beneficial for
> > lower-performance Android phones, SBCs, old PCs and other devices.
> > 
> > With this patch, combined with zram, I'm able to run the following
> > software on an old office PC from 2007 with __only 2GB of RAM__
> > 
> > simultaneously:
> >   * Firefox with 37 active tabs (all data in RAM, no tab unloading)
> >   * Discord
> >   * Skype
> >   * LibreOffice with the document opened
> >   * Two PDF files (14 and 47 megabytes in size)
> > 
> > And the PC doesn't crawl like a snail, even with 2+ GB in zram!
> > Without the patch, this PC is barely usable.
> > Please watch the video:
> > https://notes.valdikss.org.ru/linux-for-old-pc-from-2007/en/
> 
> This is quite a condemnation of the current VM.  It shouldn't crawl
> like a snail.
> 
> The patch simply sets hard limits on page reclaim's malfunctioning.
> I'd prefer that reclaim not malfunction :(
> 
> That being said, I can see that a blunt instrument like this would be
> useful.
> 
> I don't think that the limits should be "N bytes on the current node".
> Nodes can have different amounts of memory so I expect it should scale
> the hard limits on a per-node basis.  And of course, the various zones
> have different size as well.

Probably not. To my understanding, the limits should roughly correspond to 
what you see after executing this:

```
$ echo 1 | sudo tee /proc/sys/vm/drop_caches; grep -F 'Active(file)' /proc/meminfo
```

IMO, this has nothing to do with the size of the node.

> We do already have a lot of sysctls for controlling these sort of
> things.  Was much work put into attempting to utilize the existing
> sysctls to overcome these issues?
Michal Hocko Dec. 6, 2021, 9:59 a.m. UTC | #9
On Fri 03-12-21 22:27:10, Alexey Avramov wrote:
> >I'd also like to know where that malfunction happens in this case.
> 
> User-space processes need to always access shared libraries to work.
> It can be tens or hundreds of megabytes, depending on the type of workload. 
> This is a hot cache, which is pushed out and then read leads to thrashing. 
> There is no way in the kernel to forbid evicting the minimum file cache. 
> This is the problem that the patch solves. And the malfunction is exactly
> that - the inability of the kernel to hold the minimum amount of the
> hottest cache in memory.

Executable pages are a protected resource already page_check_references.
Shared libraries have more page tables pointing to them so they are more
likely to be referenced and thus kept around. What is the other memory
demand to push those away and cause thrashing?

I do agree with Vlastimil that we should be addressing these problems
rather than papering them over by limits nobody will know how to set
up properly and so we will have to deal all sorts of misconfigured
systems. I have a first hand experience with that in a form of page
cache limit that we used to have in older SLES kernels.

[...]
> > The problem with PSI sensing is that it works after the fact (after 
> > the freeze has already occurred). It is not very different from issuing 
> > SysRq-f manually on a frozen system, although it would still be a 
> > handy feature for batched tasks and remote access. 
> 
> but Michal Hocko immediately criticized [7] the proposal unfairly. 
> This patch just implements ndrw's suggestion.

It would be more productive if you were more specific what you consider
an unfair criticism. Thrashing is a real problem and we all recognize
that. We have much better tools in our tool box these days (refault data
for both page cache and swapped back memory). The kernel itself is
rather conservative when using that data for OOM situations because
historically users were more concerned about pre-mature oom killer
invocations because that is a disruptive action.
For those who prefer very agile oom policy there are userspace tools
which can implement more advanced policies.
I am open to any idea to improve the kernel side of things as well.

As mentioned above I am against global knobs to special case the global
memory reclaim because that leads to inconsistencies with the memcg
reclaim, add future maintenance burden and most importantly it
outsources responsibility to admins who will have a hard time knowing the
proper value for those knobs, effectively pushing them towards all sorts
of cargo cult.

> [0] https://serverfault.com/a/319818
> [1] https://github.com/hakavlad/prelockd
> 
> [2] https://www.youtube.com/watch?v=vykUrP1UvcI
>     On this video: running fast memory hog in a loop on Debian 10 GNOME, 
>     4 GiB MemTotal without swap space. FS is ext4 on *HDD*.
>     - 1. prelockd enabled: about 500 MiB mlocked. Starting 
>         `while true; do tail /dev/zero; done`: no freezes. 
>         The OOM killer comes quickly, the system recovers quickly.
>     - 2. prelockd disabled: system hangs.
> 
> [3] https://www.youtube.com/watch?v=g9GCmp-7WXw
> [4] https://www.youtube.com/watch?v=iU3ikgNgp3M
> [5] Let's talk about the elephant in the room - the Linux kernel's 
>     inability to gracefully handle low memory pressure
>     https://lore.kernel.org/all/d9802b6a-949b-b327-c4a6-3dbca485ec20@gmx.com/
> [6] https://lore.kernel.org/all/806F5696-A8D6-481D-A82F-49DEC1F2B035@redhazel.co.uk/
> [7] https://lore.kernel.org/all/20190808163228.GE18351@dhcp22.suse.cz/
Alexey Avramov Dec. 12, 2021, 8:15 p.m. UTC | #10
> I don't think that the limits should be "N bytes on the current node". 

It's not a problem to add a _ratio knobs. How the tunables should look and 
what their default values should be can still be discussed. Now my task is 
to prove that the problem exists and the solution I have proposed is 
effective and correct.

> the various zones have different size as well.

I'll just point out the precedent: sc->file_is_tiny works the same way 
(per node) as suggested sc->clean_below_min etc.

> We do already have a lot of sysctls for controlling these sort of
> things.  

There are many of them, but there are no most important ones for solving 
the problem - those that are proposed in the patch. 

> Was much work put into attempting to utilize the existing
> sysctls to overcome these issues?

Oh yes! This is all I have been doing for the last 4 years. At the end of 
2017, I was forced to write my own userspace OOM killer [1] to resist 
freezes (I didn't know then that earlyoom already existed).

In 2018, Facebook came on the scene with its oomd [2]:

> The traditional Linux OOM killer works fine in some cases, but in others 
> it kicks in too late, resulting in the system entering a livelock for an 
> indeterminate period.

Here we can assume that Facebook's engineers haven't found the kernel 
sysctl tunables that would satisfy them.

In 2019 LKML people could not offer Artem S. Tashkinov a simple solution to 
the problem he described [3]. In addition to discussing user-space 
solutions, 2 kernel-side solutions are proposed:

- PSI-based solution was proposed by Johannes Weiner [4].
- Reserve a fixed (configurable) amount of RAM for caches, and trigger OOM 
  killer earlier, before most UI code is evicted from memory was suggested 
  by ndrw [5]. This is what I propose to accept in the mainline. It is the 
  right way to go.

None of the suggestions posted in that thread were accepted in the 
mainline.

In 2019, at the same time, Fedora Workstation group discussed [6]
Issue #98 Better interactivity in low-memory situations.
As a result, it was decided to enable earlyoom by default for Fedora 
Workstation 32. No existing sysctl was found to be of much help.
It was also suggested to use a swap on zram and to enable the cgroup-based 
uresourced daemon to protect the user session.

So, the problem described by Artem S. Tashkinov in 2019 is still easily 
reproduced in 2021. The assurances of the maintainers that they consider 
the thrashing and near-OOM stalls to be a serious problems are difficult to 
take seriously while they ignore the obvious solution: if reclaiming file 
caches leads to thrashing, then you just need to prohibit deleting the file 
cache. And allow the user to control its minimum amount.
By the way, the implementation of such an idea has been known [7] since 
2010 and was even used in Chrome OS.

Bonus: demo: https://youtu.be/ZrLqUWRodh4
Debian 11 on VM, Linux 5.14 with the patch, no swap space, 
playing SuperTux while 1000 `tail /dev/zero` started simultaneously:
1. No freezes with vm.clean_min_kbytes=300000, I/O pressure was close to
   zero, memory pressure was moderate (70-80 some, 12-17 full), all tail 
   processes has been killed in 2 minutes (0:06 - 2:14), it's about 
   8 processes reaped by oom_reaper per second;
2. Complete UI freeze without the working set protection (since 3:40).

[1] https://github.com/hakavlad/nohang
[2] https://engineering.fb.com/2018/07/19/production-engineering/oomd/
[3] https://lore.kernel.org/lkml/d9802b6a-949b-b327-c4a6-3dbca485ec20@gmx.com/
[4] https://lore.kernel.org/lkml/20190807205138.GA24222@cmpxchg.org/
[5] https://lore.kernel.org/lkml/806F5696-A8D6-481D-A82F-49DEC1F2B035@redhazel.co.uk/
[6] https://pagure.io/fedora-workstation/issue/98
[7] https://lore.kernel.org/lkml/20101028191523.GA14972@google.com/
Barry Song Dec. 13, 2021, 8:38 a.m. UTC | #11
On Tue, Dec 7, 2021 at 5:47 AM ValdikSS <iam@valdikss.org.ru> wrote:
>
> This patchset is surprisingly effective and very useful for low-end PC
> with slow HDD, single-board ARM boards with slow storage, cheap Android
> smartphones with limited amount of memory. It almost completely prevents
> thrashing condition and aids in fast OOM killer invocation.
>

Can you please post your hardware information like what is the cpu, how much
memory you have and also post your sysctl knobs, like how do you set
vm.anon_min_kbytes,  vm.clean_low_kbytes and vm.clean_min_kbytes?

> The similar file-locking patch is used in ChromeOS for nearly 10 years
> but not on stock Linux or Android. It would be very beneficial for
> lower-performance Android phones, SBCs, old PCs and other devices.
>

Can you post the link of the similar file-locking patch?

> With this patch, combined with zram, I'm able to run the following
> software on an old office PC from 2007 with __only 2GB of RAM__
> simultaneously:
>
>   * Firefox with 37 active tabs (all data in RAM, no tab unloading)
>   * Discord
>   * Skype
>   * LibreOffice with the document opened
>   * Two PDF files (14 and 47 megabytes in size)
>
> And the PC doesn't crawl like a snail, even with 2+ GB in zram!
> Without the patch, this PC is barely usable.
> Please watch the video:
> https://notes.valdikss.org.ru/linux-for-old-pc-from-2007/en/
>

The video was captured before using this patch? what video says
"the result of the test computer after the configuration", what does
"the configuration" mean?

Thanks
Barry
Barry Song Dec. 13, 2021, 9:06 a.m. UTC | #12
On Mon, Dec 13, 2021 at 10:23 AM Alexey Avramov <hakavlad@inbox.lv> wrote:
>
> > I don't think that the limits should be "N bytes on the current node".
>
> It's not a problem to add a _ratio knobs. How the tunables should look and
> what their default values should be can still be discussed. Now my task is
> to prove that the problem exists and the solution I have proposed is
> effective and correct.
>
> > the various zones have different size as well.
>
> I'll just point out the precedent: sc->file_is_tiny works the same way
> (per node) as suggested sc->clean_below_min etc.
>
> > We do already have a lot of sysctls for controlling these sort of
> > things.
>
> There are many of them, but there are no most important ones for solving
> the problem - those that are proposed in the patch.
>
> > Was much work put into attempting to utilize the existing
> > sysctls to overcome these issues?
>
> Oh yes! This is all I have been doing for the last 4 years. At the end of
> 2017, I was forced to write my own userspace OOM killer [1] to resist
> freezes (I didn't know then that earlyoom already existed).

I'd like to understand the problem of the existing sysctls.  For example,
if we want to keep more free memory, the min free kbytes can help. On
the other hand, if we want to keep more file-backed memory,  a big
swappiness will help.
I believe you have tried all of the above and they have all failed to satisfy
your use cases, but I really expect a more detailed explanation why they
don't work.

>
> In 2018, Facebook came on the scene with its oomd [2]:
>
> > The traditional Linux OOM killer works fine in some cases, but in others
> > it kicks in too late, resulting in the system entering a livelock for an
> > indeterminate period.
>
> Here we can assume that Facebook's engineers haven't found the kernel
> sysctl tunables that would satisfy them.
>
> In 2019 LKML people could not offer Artem S. Tashkinov a simple solution to
> the problem he described [3]. In addition to discussing user-space
> solutions, 2 kernel-side solutions are proposed:
>
> - PSI-based solution was proposed by Johannes Weiner [4].
> - Reserve a fixed (configurable) amount of RAM for caches, and trigger OOM
>   killer earlier, before most UI code is evicted from memory was suggested
>   by ndrw [5]. This is what I propose to accept in the mainline. It is the
>   right way to go.

isn't this something like setting a bigger min_free_kbytes?

>
> None of the suggestions posted in that thread were accepted in the
> mainline.
>
> In 2019, at the same time, Fedora Workstation group discussed [6]
> Issue #98 Better interactivity in low-memory situations.
> As a result, it was decided to enable earlyoom by default for Fedora
> Workstation 32. No existing sysctl was found to be of much help.
> It was also suggested to use a swap on zram and to enable the cgroup-based
> uresourced daemon to protect the user session.
>
> So, the problem described by Artem S. Tashkinov in 2019 is still easily
> reproduced in 2021. The assurances of the maintainers that they consider
> the thrashing and near-OOM stalls to be a serious problems are difficult to
> take seriously while they ignore the obvious solution: if reclaiming file
> caches leads to thrashing, then you just need to prohibit deleting the file
> cache. And allow the user to control its minimum amount.
> By the way, the implementation of such an idea has been known [7] since
> 2010 and was even used in Chrome OS.
>
> Bonus: demo: https://youtu.be/ZrLqUWRodh4
> Debian 11 on VM, Linux 5.14 with the patch, no swap space,
> playing SuperTux while 1000 `tail /dev/zero` started simultaneously:
> 1. No freezes with vm.clean_min_kbytes=300000, I/O pressure was close to
>    zero, memory pressure was moderate (70-80 some, 12-17 full), all tail
>    processes has been killed in 2 minutes (0:06 - 2:14), it's about
>    8 processes reaped by oom_reaper per second;
> 2. Complete UI freeze without the working set protection (since 3:40).

I do agree we need some way to stop the thrashing of memory especially when
free memory is low and we are very close to OOM.
Mainly you are mentioning the benefit of keeping shared libraries, so
what is the
purpose of vm.anon_min_kbytes?
And will switching multiple applications under the low memory
situation still trigger
thrashing of memory, for example, a library kicks another library out?
anon pages
of one application kick  anon pages of another application out?

>
> [1] https://github.com/hakavlad/nohang
> [2] https://engineering.fb.com/2018/07/19/production-engineering/oomd/
> [3] https://lore.kernel.org/lkml/d9802b6a-949b-b327-c4a6-3dbca485ec20@gmx.com/
> [4] https://lore.kernel.org/lkml/20190807205138.GA24222@cmpxchg.org/
> [5] https://lore.kernel.org/lkml/806F5696-A8D6-481D-A82F-49DEC1F2B035@redhazel.co.uk/
> [6] https://pagure.io/fedora-workstation/issue/98
> [7] https://lore.kernel.org/lkml/20101028191523.GA14972@google.com/
>

Thanks
Barry
Michal Hocko Dec. 13, 2021, 9:07 a.m. UTC | #13
On Mon 13-12-21 05:15:21, Alexey Avramov wrote:
> So, the problem described by Artem S. Tashkinov in 2019 is still easily 
> reproduced in 2021. The assurances of the maintainers that they consider 
> the thrashing and near-OOM stalls to be a serious problems are difficult to 
> take seriously while they ignore the obvious solution: if reclaiming file 
> caches leads to thrashing, then you just need to prohibit deleting the file 
> cache. And allow the user to control its minimum amount.

These are rather strong claims. While this might sound like a very easy
solution/workaround I have already tried to express my concerns [1].

Really, you should realize that such a knob would become carved
into stone as soon as wee merge this and we will need to support it
for ever! It is really painful (if possible at all) to deprecate any
tunable knobs that cannot be supported anymore because the underlying
implementation doesn't allow for that.  So we would absolutely need to
be sure this is the right approach to the problem.  I am not convinced
about that though.

How does the admin know the limit should be set to a certain
workload? What if the workload characteristics change and the existing
setting is just too restrictive? What if the workload is thrashing over
something different than anon/file memory (e.g. any other cache that we
have or might have in the future)?

As you have pointed out there were general recommendations to use user
space based oom killer solutions which can be tuned for the specific
workload or used in an environment where the disruptive OOM killer
action is less of a problem because workload can be restarted easily
without too much harm caused by the oom killer.
Please keep in mind that there are many more different workloads that
have different requirements and an oom killer invocation can be really
much worse than a slow progress due to ephemeral, peak or even longer
term thrashing or heavy refaults.

The kernel OOM killer acts as the last resort solution and therefore
stays really conservative. I do believe that integrating PSI metrics
into that decision is the right direction. It is not a trivial one
though.

Why is this better approach than a simple limit? Well, for one, it is a
feedback based solution. System knows it is thrashing and can estimate
how hard. It is not about a specific type of memory because we can
detect refaults on both file and anonymous memory (it can be extended
should there be a need for future types of caches or reclaimable
memory). Memory reclaim can work with that information and balance
different resources dynamically based on the available feedback. MM code
will not need to expose implementation details about how the reclaim
works and so we do not bind ourselves into longterm specifics.

See the difference?

If you can live with pre-mature and over-eager OOM killer policy then
all fine. Use existing userspace solutions. If you want to work on an in
kernel solution please try to understand complexity and historical
experience with similar solution first. It also helps to understand that
there are no simple solutions on the table. MM reclaim code has evolved
over many years. I am strongly suspecting we ran out of simple solutions
already. We also got burnt many times. Let's not repeat some errors
again.

[1] http://lkml.kernel.org/r/Ya3fG2rp+860Yb+t@dhcp22.suse.cz
Barry Song Jan. 9, 2022, 10:59 p.m. UTC | #14
On Tue, Dec 7, 2021 at 4:51 PM Michal Hocko <mhocko@suse.com> wrote:
>
> On Fri 03-12-21 22:27:10, Alexey Avramov wrote:
> > >I'd also like to know where that malfunction happens in this case.
> >
> > User-space processes need to always access shared libraries to work.
> > It can be tens or hundreds of megabytes, depending on the type of workload.
> > This is a hot cache, which is pushed out and then read leads to thrashing.
> > There is no way in the kernel to forbid evicting the minimum file cache.
> > This is the problem that the patch solves. And the malfunction is exactly
> > that - the inability of the kernel to hold the minimum amount of the
> > hottest cache in memory.
>
> Executable pages are a protected resource already page_check_references.
> Shared libraries have more page tables pointing to them so they are more
> likely to be referenced and thus kept around. What is the other memory
> demand to push those away and cause thrashing?

I've heard a lot of complaints that shared libraries can be swapped
out, causing thrashing.
It seems page_check_references won't be able to relieve the thrashing for them.
on the other hand, exec pages could have a very big mapcount, that means reverse
mapping of them will take a lot of time while they are reclaimed, so
this makes the user
experience even much worse while memory is under high pressure.

Are we actually able to make mapcount a factor for memory reclaim in
some way? The
difficulty might be that a big mapcount doesn't necessarily mean the
page is active.
For example, all processes mapping the page might be inactive. But
reclaiming pages
with a big mapcount has been a big pain as far as i know.

Thanks
Barry
ValdikSS Jan. 25, 2022, 8:19 a.m. UTC | #15
On 13.12.2021 11:38, Barry Song wrote:
> On Tue, Dec 7, 2021 at 5:47 AM ValdikSS <iam@valdikss.org.ru> wrote:
>>
>> This patchset is surprisingly effective and very useful for low-end PC
>> with slow HDD, single-board ARM boards with slow storage, cheap Android
>> smartphones with limited amount of memory. It almost completely prevents
>> thrashing condition and aids in fast OOM killer invocation.
>>
> 
> Can you please post your hardware information like what is the cpu, how much
> memory you have and also post your sysctl knobs, like how do you set
> vm.anon_min_kbytes,  vm.clean_low_kbytes and vm.clean_min_kbytes?

I have a typical office computer of year 2007:

* Motherboard: Gigabyte GA-945GCM-S2L (early LGA775 socket, GMA950 
integrated graphics, September 2007)
* 2 core 64 bit CPU: Intel® Core™2 Duo E4600 (2 cores, 2.4 GHz, late 2007)
* 2 GB of RAM (DDR2 667 MHz, single module)
* Very old and slow 160 GB Hard Disk: Samsung HD161HJ (SATA II, June 2007):
* No discrete graphics card

I used vm.clean_low_kbytes=384000 (384 MB) to keep most of file cache in 
memory, because the HDD is slow and every data re-read leads to 
uncomfortable freezes and slow work.

More information, including the video, is here: 
https://notes.valdikss.org.ru/linux-for-old-pc-from-2007/en/

> 
>> The similar file-locking patch is used in ChromeOS for nearly 10 years
>> but not on stock Linux or Android. It would be very beneficial for
>> lower-performance Android phones, SBCs, old PCs and other devices.
>>
> 
> Can you post the link of the similar file-locking patch?

Here's a patch: https://lkml.org/lkml/2010/10/28/289
Here's more in-depth description: https://lkml.org/lkml/2010/11/1/20

Please also note that another Google developer, Yu Zhao, has also made a 
modern version of this (ChromiumOS) patch called MGLRU, the goal of 
which is quite similar to le9 (the patch we're discussing here), but 
with "more brains":
https://lore.kernel.org/lkml/20220104202247.2903702-1-yuzhao@google.com/T/#m8fd2a29bc557d27d1000f837f65b6c930eef9dff

Please take a moment and read the information in the link above. Yu Zhao 
develops this patch for almost two years and knows the issue better than 
me, a casual user.


> 
>> With this patch, combined with zram, I'm able to run the following
>> software on an old office PC from 2007 with __only 2GB of RAM__
>> simultaneously:
>>
>>    * Firefox with 37 active tabs (all data in RAM, no tab unloading)
>>    * Discord
>>    * Skype
>>    * LibreOffice with the document opened
>>    * Two PDF files (14 and 47 megabytes in size)
>>
>> And the PC doesn't crawl like a snail, even with 2+ GB in zram!
>> Without the patch, this PC is barely usable.
>> Please watch the video:
>> https://notes.valdikss.org.ru/linux-for-old-pc-from-2007/en/
>>
> 
> The video was captured before using this patch? what video says
> "the result of the test computer after the configuration", what does
> "the configuration" mean?

The video was captured after the patch. Before the patch, it's basically 
not possible to use Firefox only with 20+ tabs because the PC enters 
thrashing condition and reacts so slow that even mouse cursor freezes 
frequently. The PC is absolutely unusable for any decent work without 
the patch, regardless of swappiness, vm.min_free_kbytes or any other 
tunables.

The configuration is this patch with vm.clean_low_kbytes=384000 and 150% 
zram. More information is provided on the website.

> 
> Thanks
> Barry
Barry Song Feb. 12, 2022, 12:01 a.m. UTC | #16
On Tue, Jan 25, 2022 at 9:19 PM ValdikSS <iam@valdikss.org.ru> wrote:
>
> On 13.12.2021 11:38, Barry Song wrote:
> > On Tue, Dec 7, 2021 at 5:47 AM ValdikSS <iam@valdikss.org.ru> wrote:
> >>
> >> This patchset is surprisingly effective and very useful for low-end PC
> >> with slow HDD, single-board ARM boards with slow storage, cheap Android
> >> smartphones with limited amount of memory. It almost completely prevents
> >> thrashing condition and aids in fast OOM killer invocation.
> >>
> >
> > Can you please post your hardware information like what is the cpu, how much
> > memory you have and also post your sysctl knobs, like how do you set
> > vm.anon_min_kbytes,  vm.clean_low_kbytes and vm.clean_min_kbytes?
>
> I have a typical office computer of year 2007:
>
> * Motherboard: Gigabyte GA-945GCM-S2L (early LGA775 socket, GMA950
> integrated graphics, September 2007)
> * 2 core 64 bit CPU: Intel® Core™2 Duo E4600 (2 cores, 2.4 GHz, late 2007)
> * 2 GB of RAM (DDR2 667 MHz, single module)
> * Very old and slow 160 GB Hard Disk: Samsung HD161HJ (SATA II, June 2007):
> * No discrete graphics card
>
> I used vm.clean_low_kbytes=384000 (384 MB) to keep most of file cache in
> memory, because the HDD is slow and every data re-read leads to
> uncomfortable freezes and slow work.
>
> More information, including the video, is here:
> https://notes.valdikss.org.ru/linux-for-old-pc-from-2007/en/

thanks!

>
> >
> >> The similar file-locking patch is used in ChromeOS for nearly 10 years
> >> but not on stock Linux or Android. It would be very beneficial for
> >> lower-performance Android phones, SBCs, old PCs and other devices.
> >>
> >
> > Can you post the link of the similar file-locking patch?
>
> Here's a patch: https://lkml.org/lkml/2010/10/28/289
> Here's more in-depth description: https://lkml.org/lkml/2010/11/1/20

thanks, seems to be quite similar with this patch.

>
> Please also note that another Google developer, Yu Zhao, has also made a
> modern version of this (ChromiumOS) patch called MGLRU, the goal of
> which is quite similar to le9 (the patch we're discussing here), but
> with "more brains":
> https://lore.kernel.org/lkml/20220104202247.2903702-1-yuzhao@google.com/T/#m8fd2a29bc557d27d1000f837f65b6c930eef9dff
>
> Please take a moment and read the information in the link above. Yu Zhao
> develops this patch for almost two years and knows the issue better than
> me, a casual user.
>

Thanks for all the information you provided. I think I have noticed MGLRU
for a while. Curiously, does MGLRU also resolve your problem of using
"a typical office computer of year 2007" ?

>
> >
> >> With this patch, combined with zram, I'm able to run the following
> >> software on an old office PC from 2007 with __only 2GB of RAM__
> >> simultaneously:
> >>
> >>    * Firefox with 37 active tabs (all data in RAM, no tab unloading)
> >>    * Discord
> >>    * Skype
> >>    * LibreOffice with the document opened
> >>    * Two PDF files (14 and 47 megabytes in size)
> >>
> >> And the PC doesn't crawl like a snail, even with 2+ GB in zram!
> >> Without the patch, this PC is barely usable.
> >> Please watch the video:
> >> https://notes.valdikss.org.ru/linux-for-old-pc-from-2007/en/
> >>
> >
> > The video was captured before using this patch? what video says
> > "the result of the test computer after the configuration", what does
> > "the configuration" mean?
>
> The video was captured after the patch. Before the patch, it's basically
> not possible to use Firefox only with 20+ tabs because the PC enters
> thrashing condition and reacts so slow that even mouse cursor freezes
> frequently. The PC is absolutely unusable for any decent work without
> the patch, regardless of swappiness, vm.min_free_kbytes or any other
> tunables.
>
> The configuration is this patch with vm.clean_low_kbytes=384000 and 150%
> zram. More information is provided on the website.

thanks!

>
> >
> > Thanks
> > Barry

Thanks
Barry
diff mbox series

Patch

diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
index 5e7952021..2f606e23b 100644
--- a/Documentation/admin-guide/sysctl/vm.rst
+++ b/Documentation/admin-guide/sysctl/vm.rst
@@ -25,6 +25,9 @@  files can be found in mm/swap.c.
 Currently, these files are in /proc/sys/vm:

 - admin_reserve_kbytes
+- anon_min_kbytes
+- clean_low_kbytes
+- clean_min_kbytes
 - compact_memory
 - compaction_proactiveness
 - compact_unevictable_allowed
@@ -105,6 +108,61 @@  On x86_64 this is about 128MB.
 Changing this takes effect whenever an application requests memory.


+anon_min_kbytes
+===============
+
+This knob provides *hard* protection of anonymous pages. The anonymous pages
+on the current node won't be reclaimed under any conditions when their amount
+is below vm.anon_min_kbytes.
+
+This knob may be used to prevent excessive swap thrashing when anonymous
+memory is low (for example, when memory is going to be overfilled by
+compressed data of zram module).
+
+Setting this value too high (close to MemTotal) can result in inability to
+swap and can lead to early OOM under memory pressure.
+
+The default value is defined by CONFIG_ANON_MIN_KBYTES.
+
+
+clean_low_kbytes
+================
+
+This knob provides *best-effort* protection of clean file pages. The file pages
+on the current node won't be reclaimed under memory pressure when the amount of
+clean file pages is below vm.clean_low_kbytes *unless* we threaten to OOM.
+
+Protection of clean file pages using this knob may be used when swapping is
+still possible to
+  - prevent disk I/O thrashing under memory pressure;
+  - improve performance in disk cache-bound tasks under memory pressure.
+
+Setting it to a high value may result in an early eviction of anonymous
+into the swap space by attempting to hold the protected amount of clean file
+pages in memory.
+
+The default value is defined by CONFIG_CLEAN_LOW_KBYTES.
+
+
+clean_min_kbytes
+================
+
+This knob provides *hard* protection of clean file pages. The file pages on the
+current node won't be reclaimed under memory pressure when the amount of clean
+file pages is below vm.clean_min_kbytes.
+
+Hard protection of clean file pages using this knob may be used to
+  - prevent disk I/O thrashing under memory pressure even with no free swap space;
+  - improve performance in disk cache-bound tasks under memory pressure;
+  - avoid high latency and prevent livelock in near-OOM conditions.
+
+Setting it to a high value may result in an early out-of-memory condition due to
+the inability to reclaim the protected amount of clean file pages when other
+types of pages cannot be reclaimed.
+
+The default value is defined by CONFIG_CLEAN_MIN_KBYTES.
+
+
 compact_memory
 ==============

@@ -864,6 +922,14 @@  be 133 (x + 2x = 200, 2x = 133.33).
 At 0, the kernel will not initiate swap until the amount of free and
 file-backed pages is less than the high watermark in a zone.

+This knob has no effect if the amount of clean file pages on the current
+node is below vm.clean_low_kbytes or vm.clean_min_kbytes. In this case,
+only anonymous pages can be reclaimed.
+
+If the number of anonymous pages on the current node is below
+vm.anon_min_kbytes, then only file pages can be reclaimed with
+any vm.swappiness value.
+

 unprivileged_userfaultfd
 ========================
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a7e4a9e7d..bee9807d5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -200,6 +200,10 @@  static inline void __mm_zero_struct_page(struct page *page)

 extern int sysctl_max_map_count;

+extern unsigned long sysctl_anon_min_kbytes;
+extern unsigned long sysctl_clean_low_kbytes;
+extern unsigned long sysctl_clean_min_kbytes;
+
 extern unsigned long sysctl_user_reserve_kbytes;
 extern unsigned long sysctl_admin_reserve_kbytes;

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 083be6af2..65fc38756 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -3132,6 +3132,27 @@  static struct ctl_table vm_table[] = {
 	},
 #endif
 	{
+		.procname	= "anon_min_kbytes",
+		.data		= &sysctl_anon_min_kbytes,
+		.maxlen		= sizeof(unsigned long),
+		.mode		= 0644,
+		.proc_handler	= proc_doulongvec_minmax,
+	},
+	{
+		.procname	= "clean_low_kbytes",
+		.data		= &sysctl_clean_low_kbytes,
+		.maxlen		= sizeof(unsigned long),
+		.mode		= 0644,
+		.proc_handler	= proc_doulongvec_minmax,
+	},
+	{
+		.procname	= "clean_min_kbytes",
+		.data		= &sysctl_clean_min_kbytes,
+		.maxlen		= sizeof(unsigned long),
+		.mode		= 0644,
+		.proc_handler	= proc_doulongvec_minmax,
+	},
+	{
 		.procname	= "user_reserve_kbytes",
 		.data		= &sysctl_user_reserve_kbytes,
 		.maxlen		= sizeof(sysctl_user_reserve_kbytes),
diff --git a/mm/Kconfig b/mm/Kconfig
index 28edafc82..dea0806d7 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -89,6 +89,69 @@  config SPARSEMEM_VMEMMAP
 	  pfn_to_page and page_to_pfn operations.  This is the most
 	  efficient option when sufficient kernel resources are available.

+config ANON_MIN_KBYTES
+	int "Default value for vm.anon_min_kbytes"
+	depends on SYSCTL
+	range 0 4294967295
+	default 0
+	help
+	  This option sets the default value for vm.anon_min_kbytes sysctl knob.
+
+	  The vm.anon_min_kbytes sysctl knob provides *hard* protection of
+	  anonymous pages. The anonymous pages on the current node won't be
+	  reclaimed under any conditions when their amount is below
+	  vm.anon_min_kbytes. This knob may be used to prevent excessive swap
+	  thrashing when anonymous memory is low (for example, when memory is
+	  going to be overfilled by compressed data of zram module).
+
+	  Setting this value too high (close to MemTotal) can result in
+	  inability to swap and can lead to early OOM under memory pressure.
+
+config CLEAN_LOW_KBYTES
+	int "Default value for vm.clean_low_kbytes"
+	depends on SYSCTL
+	range 0 4294967295
+	default 0
+	help
+	  This option sets the default value for vm.clean_low_kbytes sysctl knob.
+
+	  The vm.clean_low_kbytes sysctl knob provides *best-effort*
+	  protection of clean file pages. The file pages on the current node
+	  won't be reclaimed under memory pressure when the amount of clean file
+	  pages is below vm.clean_low_kbytes *unless* we threaten to OOM.
+	  Protection of clean file pages using this knob may be used when
+	  swapping is still possible to
+	    - prevent disk I/O thrashing under memory pressure;
+	    - improve performance in disk cache-bound tasks under memory
+	      pressure.
+
+	  Setting it to a high value may result in an early eviction of anonymous
+	  pages into the swap space by attempting to hold the protected amount
+	  of clean file pages in memory.
+
+config CLEAN_MIN_KBYTES
+	int "Default value for vm.clean_min_kbytes"
+	depends on SYSCTL
+	range 0 4294967295
+	default 0
+	help
+	  This option sets the default value for vm.clean_min_kbytes sysctl knob.
+
+	  The vm.clean_min_kbytes sysctl knob provides *hard* protection of
+	  clean file pages. The file pages on the current node won't be
+	  reclaimed under memory pressure when the amount of clean file pages is
+	  below vm.clean_min_kbytes. Hard protection of clean file pages using
+	  this knob may be used to
+	    - prevent disk I/O thrashing under memory pressure even with no free
+	      swap space;
+	    - improve performance in disk cache-bound tasks under memory
+	      pressure;
+	    - avoid high latency and prevent livelock in near-OOM conditions.
+
+	  Setting it to a high value may result in an early out-of-memory condition
+	  due to the inability to reclaim the protected amount of clean file pages
+	  when other types of pages cannot be reclaimed.
+
 config HAVE_MEMBLOCK_PHYS_MAP
 	bool

diff --git a/mm/vmscan.c b/mm/vmscan.c
index fb9584641..928f3371d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -122,6 +122,15 @@  struct scan_control {
 	/* The file pages on the current node are dangerously low */
 	unsigned int file_is_tiny:1;

+	/* The anonymous pages on the current node are below vm.anon_min_kbytes */
+	unsigned int anon_below_min:1;
+
+	/* The clean file pages on the current node are below vm.clean_low_kbytes */
+	unsigned int clean_below_low:1;
+
+	/* The clean file pages on the current node are below vm.clean_min_kbytes */
+	unsigned int clean_below_min:1;
+
 	/* Always discard instead of demoting to lower tier memory */
 	unsigned int no_demotion:1;

@@ -171,6 +180,10 @@  struct scan_control {
 #define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
 #endif

+unsigned long sysctl_anon_min_kbytes __read_mostly = CONFIG_ANON_MIN_KBYTES;
+unsigned long sysctl_clean_low_kbytes __read_mostly = CONFIG_CLEAN_LOW_KBYTES;
+unsigned long sysctl_clean_min_kbytes __read_mostly = CONFIG_CLEAN_MIN_KBYTES;
+
 /*
  * From 0 .. 200.  Higher means more swappy.
  */
@@ -2734,6 +2747,15 @@  static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 	}

 	/*
+	 * Force-scan anon if clean file pages is under vm.clean_low_kbytes
+	 * or vm.clean_min_kbytes.
+	 */
+	if (sc->clean_below_low || sc->clean_below_min) {
+		scan_balance = SCAN_ANON;
+		goto out;
+	}
+
+	/*
 	 * If there is enough inactive page cache, we do not reclaim
 	 * anything from the anonymous working right now.
 	 */
@@ -2877,6 +2899,25 @@  static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 			BUG();
 		}

+		/*
+		 * Hard protection of the working set.
+		 */
+		if (file) {
+			/*
+			 * Don't reclaim file pages when the amount of
+			 * clean file pages is below vm.clean_min_kbytes.
+			 */
+			if (sc->clean_below_min)
+				scan = 0;
+		} else {
+			/*
+			 * Don't reclaim anonymous pages when their
+			 * amount is below vm.anon_min_kbytes.
+			 */
+			if (sc->anon_below_min)
+				scan = 0;
+		}
+
 		nr[lru] = scan;
 	}
 }
@@ -3082,6 +3123,54 @@  static inline bool should_continue_reclaim(struct pglist_data *pgdat,
 	return inactive_lru_pages > pages_for_compaction;
 }

+static void prepare_workingset_protection(pg_data_t *pgdat, struct scan_control *sc)
+{
+	/*
+	 * Check the number of anonymous pages to protect them from
+	 * reclaiming if their amount is below the specified.
+	 */
+	if (sysctl_anon_min_kbytes) {
+		unsigned long reclaimable_anon;
+
+		reclaimable_anon =
+			node_page_state(pgdat, NR_ACTIVE_ANON) +
+			node_page_state(pgdat, NR_INACTIVE_ANON) +
+			node_page_state(pgdat, NR_ISOLATED_ANON);
+		reclaimable_anon <<= (PAGE_SHIFT - 10);
+
+		sc->anon_below_min = reclaimable_anon < sysctl_anon_min_kbytes;
+	} else
+		sc->anon_below_min = 0;
+
+	/*
+	 * Check the number of clean file pages to protect them from
+	 * reclaiming if their amount is below the specified.
+	 */
+	if (sysctl_clean_low_kbytes || sysctl_clean_min_kbytes) {
+		unsigned long reclaimable_file, dirty, clean;
+
+		reclaimable_file =
+			node_page_state(pgdat, NR_ACTIVE_FILE) +
+			node_page_state(pgdat, NR_INACTIVE_FILE) +
+			node_page_state(pgdat, NR_ISOLATED_FILE);
+		dirty = node_page_state(pgdat, NR_FILE_DIRTY);
+		/*
+		 * node_page_state() sum can go out of sync since
+		 * all the values are not read at once.
+		 */
+		if (likely(reclaimable_file > dirty))
+			clean = (reclaimable_file - dirty) << (PAGE_SHIFT - 10);
+		else
+			clean = 0;
+
+		sc->clean_below_low = clean < sysctl_clean_low_kbytes;
+		sc->clean_below_min = clean < sysctl_clean_min_kbytes;
+	} else {
+		sc->clean_below_low = 0;
+		sc->clean_below_min = 0;
+	}
+}
+
 static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
 {
 	struct mem_cgroup *target_memcg = sc->target_mem_cgroup;
@@ -3249,6 +3338,8 @@  static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 			anon >> sc->priority;
 	}

+	prepare_workingset_protection(pgdat, sc);
+
 	shrink_node_memcgs(pgdat, sc);

 	if (reclaim_state) {