diff mbox series

memcg: Add a new sysctl parameter for automatically setting memory.high

Message ID 20240623204514.1032662-1-longman@redhat.com (mailing list archive)
State New
Headers show
Series memcg: Add a new sysctl parameter for automatically setting memory.high | expand

Commit Message

Waiman Long June 23, 2024, 8:45 p.m. UTC
With memory cgroup v1, there is only a single "memory.limit_in_bytes"
to be set to specify the maximum amount of memory that is allowed to
be used. So a lot of tools and applications that use memory cgroups
allow users to specify only a single memory limit. When they migrate to
cgroup v2, they use the given memory limit to set memory.max and
disregard memory.high for the time being.

Without properly setting memory.high, these user space applications
cannot make use of the memory cgroup v2 ability to further reduce the
chance of OOM kills by throttling and early memory reclaim.

This patch adds a new sysctl parameter "vm/memory_high_autoset_ratio"
to enable setting "memory.high" automatically whenever "memory.max" is
set as long as "memory.high" hasn't been explicitly set before. This
will allow a system administrator or a middleware layer to greatly
reduce the chance of memory cgroup OOM kills without worrying about
how to properly set memory.high.

The new sysctl parameter accepts a value in the range 0-100. The default
value of 0 disables memory.high auto setting. For any non-zero value "n",
the actual ratio used is "n/(n+1)", so the smallest fraction a user can
set is 1/2 (n = 1) and the largest is 100/101 (n = 100).

Signed-off-by: Waiman Long <longman@redhat.com>
---
 Documentation/admin-guide/sysctl/vm.rst | 10 ++++++
 include/linux/memcontrol.h              |  3 ++
 mm/memcontrol.c                         | 41 +++++++++++++++++++++++++
 3 files changed, 54 insertions(+)

Comments

Waiman Long June 23, 2024, 8:52 p.m. UTC | #1
Correct some email addresses.

On 6/23/24 16:45, Waiman Long wrote:
> With memory cgroup v1, there is only a single "memory.limit_in_bytes"
> to be set to specify the maximum amount of memory that is allowed to
> be used. So a lot of memory cgroup using tools and applications allow
> users to specify a single memory limit. When they migrate to cgroup
> v2, they use the given memory limit to set memory.max and disregard
> memory.high for the time being.
>
> Without properly setting memory.high, these user space applications
> cannot make use of the memory cgroup v2 ability to further reduce the
> chance of OOM kills by throttling and early memory reclaim.
>
> This patch adds a new sysctl parameter "vm/memory_high_autoset_ratio"
> to enable setting "memory.high" automatically whenever "memory.max" is
> set as long as "memory.high" hasn't been explicitly set before. This
> will allow a system administrator or a middleware layer to greatly
> reduce the chance of memory cgroup OOM kills without worrying about
> how to properly set memory.high.
>
> The new sysctl parameter will allow a range of 0-100. The default value
> of 0 will disable memory.high auto setting. For any non-zero value "n",
> the actual ratio used will be "n/(n+1)". A user cannot set a fraction
> less than 1/2.
>
> Signed-off-by: Waiman Long <longman@redhat.com>
> ---
>   Documentation/admin-guide/sysctl/vm.rst | 10 ++++++
>   include/linux/memcontrol.h              |  3 ++
>   mm/memcontrol.c                         | 41 +++++++++++++++++++++++++
>   3 files changed, 54 insertions(+)
>
> diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
> index e86c968a7a0e..250ec39dd5af 100644
> --- a/Documentation/admin-guide/sysctl/vm.rst
> +++ b/Documentation/admin-guide/sysctl/vm.rst
> @@ -46,6 +46,7 @@ Currently, these files are in /proc/sys/vm:
>   - mem_profiling         (only if CONFIG_MEM_ALLOC_PROFILING=y)
>   - memory_failure_early_kill
>   - memory_failure_recovery
> +- memory_high_autoset_ratio
>   - min_free_kbytes
>   - min_slab_ratio
>   - min_unmapped_ratio
> @@ -479,6 +480,15 @@ Enable memory failure recovery (when supported by the platform)
>   0: Always panic on a memory failure.
>   
>   
> +memory_high_autoset_ratio
> +=========================
> +
> +Specify a ratio by which memory.high should be set as a fraction of
> +memory.max if it hasn't been explicitly set before.  It allows a range
> +of 0-100.  The default value of 0 means auto setting will be disabled.
> +For any non-zero value "n", the actual ratio used will be "n/(n+1)".
> +
> +
>   min_free_kbytes
>   ===============
>   
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 030d34e9d117..6be161a6b922 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -221,6 +221,9 @@ struct mem_cgroup {
>   	 */
>   	bool oom_group;
>   
> +	/* %true if memory.high has been explicitly set */
> +	bool memory_high_set;
> +
>   	/* protected by memcg_oom_lock */
>   	bool		oom_lock;
>   	int		under_oom;
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 71fe2a95b8bd..2cfb000bf543 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -48,6 +48,7 @@
>   #include <linux/swap.h>
>   #include <linux/swapops.h>
>   #include <linux/spinlock.h>
> +#include <linux/sysctl.h>
>   #include <linux/eventfd.h>
>   #include <linux/poll.h>
>   #include <linux/sort.h>
> @@ -6889,6 +6890,35 @@ static void mem_cgroup_attach(struct cgroup_taskset *tset)
>   }
>   #endif
>   
> +/*
> + * The memory.high autoset ratio specifies a ratio by which memory.high
> + * should be set as a fraction of memory.max if it hasn't been explicitly
> + * set before. The default value of 0 means auto setting will be disabled.
> + * For any non-zero value "n", the actual ratio is "n/(n+1)".
> + */
> +static int sysctl_memory_high_autoset_ratio;
> +
> +#ifdef CONFIG_SYSCTL
> +static struct ctl_table memcg_table[] = {
> +	{
> +		.procname	= "memory_high_autoset_ratio",
> +		.data		= &sysctl_memory_high_autoset_ratio,
> +		.maxlen		= sizeof(int),
> +		.mode		= 0644,
> +		.proc_handler	= proc_dointvec_minmax,
> +		.extra1		= SYSCTL_ZERO,
> +		.extra2		= SYSCTL_ONE_HUNDRED,
> +	},
> +};
> +
> +static inline void memcg_sysctl_init(void)
> +{
> +	register_sysctl_init("vm", memcg_table);
> +}
> +#else
> +static void memcg_sysctl_init(void)	{ }
> +#endif /* CONFIG_SYSCTL */
> +
>   static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
>   {
>   	if (value == PAGE_COUNTER_MAX)
> @@ -6982,6 +7012,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
>   		return err;
>   
>   	page_counter_set_high(&memcg->memory, high);
> +	memcg->memory_high_set = true;
>   
>   	for (;;) {
>   		unsigned long nr_pages = page_counter_read(&memcg->memory);
> @@ -7023,6 +7054,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
>   	unsigned int nr_reclaims = MAX_RECLAIM_RETRIES;
>   	bool drained = false;
>   	unsigned long max;
> +	unsigned int high_ratio = sysctl_memory_high_autoset_ratio;
>   	int err;
>   
>   	buf = strstrip(buf);
> @@ -7032,6 +7064,13 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
>   
>   	xchg(&memcg->memory.max, max);
>   
> +	if (high_ratio && !memcg->memory_high_set) {
> +		/* Set memory.high as a fraction of memory.max */
> +		unsigned long high = max * high_ratio / (high_ratio + 1);
> +
> +		page_counter_set_high(&memcg->memory, high);
> +	}
> +
>   	for (;;) {
>   		unsigned long nr_pages = page_counter_read(&memcg->memory);
>   
> @@ -7977,6 +8016,8 @@ static int __init mem_cgroup_init(void)
>   		soft_limit_tree.rb_tree_per_node[node] = rtpn;
>   	}
>   
> +	memcg_sysctl_init();
> +
>   	return 0;
>   }
>   subsys_initcall(mem_cgroup_init);
Michal Hocko June 24, 2024, 8:37 a.m. UTC | #2
On Sun 23-06-24 16:52:00, Waiman Long wrote:
> Correct some email addresses.
> 
> On 6/23/24 16:45, Waiman Long wrote:
> > With memory cgroup v1, there is only a single "memory.limit_in_bytes"
> > to be set to specify the maximum amount of memory that is allowed to
> > be used. So a lot of memory cgroup using tools and applications allow
> > users to specify a single memory limit. When they migrate to cgroup
> > v2, they use the given memory limit to set memory.max and disregard
> > memory.high for the time being.
> > 
> > Without properly setting memory.high, these user space applications
> > cannot make use of the memory cgroup v2 ability to further reduce the
> > chance of OOM kills by throttling and early memory reclaim.
> > 
> > This patch adds a new sysctl parameter "vm/memory_high_autoset_ratio"
> > to enable setting "memory.high" automatically whenever "memory.max" is
> > set as long as "memory.high" hasn't been explicitly set before. This
> > will allow a system administrator or a middleware layer to greatly
> > reduce the chance of memory cgroup OOM kills without worrying about
> > how to properly set memory.high.
> > 
> > The new sysctl parameter will allow a range of 0-100. The default value
> > of 0 will disable memory.high auto setting. For any non-zero value "n",
> > the actual ratio used will be "n/(n+1)". A user cannot set a fraction
> > less than 1/2.

I am sorry but this is a bad idea. It is also completely unnecessary. If
somebody goes all the way to set the hard limit there is no reason to
not set the high limit along the way. I see zero reason to make a
global hard coded policy for something like that.  Not to mention that
a percentage is a really bad interface as it gets hugely impractical
with large limits.

> > 
> > Signed-off-by: Waiman Long <longman@redhat.com>

Nacked-by: Michal Hocko <mhocko@suse.com>

> > ---
> >   Documentation/admin-guide/sysctl/vm.rst | 10 ++++++
> >   include/linux/memcontrol.h              |  3 ++
> >   mm/memcontrol.c                         | 41 +++++++++++++++++++++++++
> >   3 files changed, 54 insertions(+)
> > 
> > diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
> > index e86c968a7a0e..250ec39dd5af 100644
> > --- a/Documentation/admin-guide/sysctl/vm.rst
> > +++ b/Documentation/admin-guide/sysctl/vm.rst
> > @@ -46,6 +46,7 @@ Currently, these files are in /proc/sys/vm:
> >   - mem_profiling         (only if CONFIG_MEM_ALLOC_PROFILING=y)
> >   - memory_failure_early_kill
> >   - memory_failure_recovery
> > +- memory_high_autoset_ratio
> >   - min_free_kbytes
> >   - min_slab_ratio
> >   - min_unmapped_ratio
> > @@ -479,6 +480,15 @@ Enable memory failure recovery (when supported by the platform)
> >   0: Always panic on a memory failure.
> > +memory_high_autoset_ratio
> > +=========================
> > +
> > +Specify a ratio by which memory.high should be set as a fraction of
> > +memory.max if it hasn't been explicitly set before.  It allows a range
> > +of 0-100.  The default value of 0 means auto setting will be disabled.
> > +For any non-zero value "n", the actual ratio used will be "n/(n+1)".
> > +
> > +
> >   min_free_kbytes
> >   ===============
> > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> > index 030d34e9d117..6be161a6b922 100644
> > --- a/include/linux/memcontrol.h
> > +++ b/include/linux/memcontrol.h
> > @@ -221,6 +221,9 @@ struct mem_cgroup {
> >   	 */
> >   	bool oom_group;
> > +	/* %true if memory.high has been explicitly set */
> > +	bool memory_high_set;
> > +
> >   	/* protected by memcg_oom_lock */
> >   	bool		oom_lock;
> >   	int		under_oom;
> > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > index 71fe2a95b8bd..2cfb000bf543 100644
> > --- a/mm/memcontrol.c
> > +++ b/mm/memcontrol.c
> > @@ -48,6 +48,7 @@
> >   #include <linux/swap.h>
> >   #include <linux/swapops.h>
> >   #include <linux/spinlock.h>
> > +#include <linux/sysctl.h>
> >   #include <linux/eventfd.h>
> >   #include <linux/poll.h>
> >   #include <linux/sort.h>
> > @@ -6889,6 +6890,35 @@ static void mem_cgroup_attach(struct cgroup_taskset *tset)
> >   }
> >   #endif
> > +/*
> > + * The memory.high autoset ratio specifies a ratio by which memory.high
> > + * should be set as a fraction of memory.max if it hasn't been explicitly
> > + * set before. The default value of 0 means auto setting will be disabled.
> > + * For any non-zero value "n", the actual ratio is "n/(n+1)".
> > + */
> > +static int sysctl_memory_high_autoset_ratio;
> > +
> > +#ifdef CONFIG_SYSCTL
> > +static struct ctl_table memcg_table[] = {
> > +	{
> > +		.procname	= "memory_high_autoset_ratio",
> > +		.data		= &sysctl_memory_high_autoset_ratio,
> > +		.maxlen		= sizeof(int),
> > +		.mode		= 0644,
> > +		.proc_handler	= proc_dointvec_minmax,
> > +		.extra1		= SYSCTL_ZERO,
> > +		.extra2		= SYSCTL_ONE_HUNDRED,
> > +	},
> > +};
> > +
> > +static inline void memcg_sysctl_init(void)
> > +{
> > +	register_sysctl_init("vm", memcg_table);
> > +}
> > +#else
> > +static void memcg_sysctl_init(void)	{ }
> > +#endif /* CONFIG_SYSCTL */
> > +
> >   static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
> >   {
> >   	if (value == PAGE_COUNTER_MAX)
> > @@ -6982,6 +7012,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
> >   		return err;
> >   	page_counter_set_high(&memcg->memory, high);
> > +	memcg->memory_high_set = true;
> >   	for (;;) {
> >   		unsigned long nr_pages = page_counter_read(&memcg->memory);
> > @@ -7023,6 +7054,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
> >   	unsigned int nr_reclaims = MAX_RECLAIM_RETRIES;
> >   	bool drained = false;
> >   	unsigned long max;
> > +	unsigned int high_ratio = sysctl_memory_high_autoset_ratio;
> >   	int err;
> >   	buf = strstrip(buf);
> > @@ -7032,6 +7064,13 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
> >   	xchg(&memcg->memory.max, max);
> > +	if (high_ratio && !memcg->memory_high_set) {
> > +		/* Set memory.high as a fraction of memory.max */
> > +		unsigned long high = max * high_ratio / (high_ratio + 1);
> > +
> > +		page_counter_set_high(&memcg->memory, high);
> > +	}
> > +
> >   	for (;;) {
> >   		unsigned long nr_pages = page_counter_read(&memcg->memory);
> > @@ -7977,6 +8016,8 @@ static int __init mem_cgroup_init(void)
> >   		soft_limit_tree.rb_tree_per_node[node] = rtpn;
> >   	}
> > +	memcg_sysctl_init();
> > +
> >   	return 0;
> >   }
> >   subsys_initcall(mem_cgroup_init);
Roman Gushchin June 24, 2024, 3:21 p.m. UTC | #3
On Sun, Jun 23, 2024 at 04:52:00PM -0400, Waiman Long wrote:
> Correct some email addresses.
> 
> On 6/23/24 16:45, Waiman Long wrote:
> > With memory cgroup v1, there is only a single "memory.limit_in_bytes"
> > to be set to specify the maximum amount of memory that is allowed to
> > be used. So a lot of memory cgroup using tools and applications allow
> > users to specify a single memory limit. When they migrate to cgroup
> > v2, they use the given memory limit to set memory.max and disregard
> > memory.high for the time being.
> > 
> > Without properly setting memory.high, these user space applications
> > cannot make use of the memory cgroup v2 ability to further reduce the
> > chance of OOM kills by throttling and early memory reclaim.
> > 
> > This patch adds a new sysctl parameter "vm/memory_high_autoset_ratio"
> > to enable setting "memory.high" automatically whenever "memory.max" is
> > set as long as "memory.high" hasn't been explicitly set before. This
> > will allow a system administrator or a middleware layer to greatly
> > reduce the chance of memory cgroup OOM kills without worrying about
> > how to properly set memory.high.
> > 
> > The new sysctl parameter will allow a range of 0-100. The default value
> > of 0 will disable memory.high auto setting. For any non-zero value "n",
> > the actual ratio used will be "n/(n+1)". A user cannot set a fraction
> > less than 1/2.

Hi Waiman,

I'm not sure that setting memory.high is always a good idea (it comes
with a certain cost, e.g. it can increase latency), but even if it is,
why can't systemd or similar userspace tools do this?

I wonder what's special about your case if you do see a lot of OOMs
which can be avoided by setting memory.high? Do you have a bursty workload?

Thanks!
Waiman Long June 24, 2024, 4:33 p.m. UTC | #4
On 6/24/24 11:21, Roman Gushchin wrote:
> On Sun, Jun 23, 2024 at 04:52:00PM -0400, Waiman Long wrote:
>> Correct some email addresses.
>>
>> On 6/23/24 16:45, Waiman Long wrote:
>>> With memory cgroup v1, there is only a single "memory.limit_in_bytes"
>>> to be set to specify the maximum amount of memory that is allowed to
>>> be used. So a lot of memory cgroup using tools and applications allow
>>> users to specify a single memory limit. When they migrate to cgroup
>>> v2, they use the given memory limit to set memory.max and disregard
>>> memory.high for the time being.
>>>
>>> Without properly setting memory.high, these user space applications
>>> cannot make use of the memory cgroup v2 ability to further reduce the
>>> chance of OOM kills by throttling and early memory reclaim.
>>>
>>> This patch adds a new sysctl parameter "vm/memory_high_autoset_ratio"
>>> to enable setting "memory.high" automatically whenever "memory.max" is
>>> set as long as "memory.high" hasn't been explicitly set before. This
>>> will allow a system administrator or a middleware layer to greatly
>>> reduce the chance of memory cgroup OOM kills without worrying about
>>> how to properly set memory.high.
>>>
>>> The new sysctl parameter will allow a range of 0-100. The default value
>>> of 0 will disable memory.high auto setting. For any non-zero value "n",
>>> the actual ratio used will be "n/(n+1)". A user cannot set a fraction
>>> less than 1/2.
> Hi Waiman,
>
> I'm not sure that setting memory.high is always a good idea (it comes
> with a certain cost, e.g. can increase latency), but even if it is,
> why systemd or similar userspace tools can't do this?

We actually have an OOM problem with OpenShift, which is based on
Kubernetes. AFAIK, the setting of memory.high is still in alpha for
Kubernetes. So a memory cgroup is set up just by setting memory.max at
the moment.

I also traced the OOM problem back to commit 14aa8b2d5c2e ("mm/mglru:
don't sync disk for each aging cycle") in the MGLRU code. So setting
memory.high automatically is one way to avoid premature OOM. That is the
motivation behind this patch.

>
> I wonder what's special about your case if you do see a lot of OOMs
> which can be avoided by setting memory.high? Do you have a bursty workload?

In our case, the OOM kill can be triggered by writing a large data file
that exceeds memory.max to an NFS mounted filesystem as long as there are
enough free pages that the dirty_bytes/dirty_background_bytes mechanism
isn't triggered.

Regards,
Longman
Michal Hocko June 24, 2024, 4:46 p.m. UTC | #5
On Mon 24-06-24 12:33:27, Waiman Long wrote:
> I also trace back the OOM problem to commit 14aa8b2d5c2e ("mm/mglru: don't
> sync disk for each aging cycle") in the MGLRU code. So setting memory.high
> automatically is one way to avoid premature OOM. That is the motivation
> behind this patch.

Please report this.
Waiman Long June 24, 2024, 5:05 p.m. UTC | #6
On 6/24/24 12:46, Michal Hocko wrote:
> On Mon 24-06-24 12:33:27, Waiman Long wrote:
>> I also trace back the OOM problem to commit 14aa8b2d5c2e ("mm/mglru: don't
>> sync disk for each aging cycle") in the MGLRU code. So setting memory.high
>> automatically is one way to avoid premature OOM. That is the motivation
>> behind this patch.
> Please report this.

OK, will do.

Cheers,
Longman
diff mbox series

Patch

diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
index e86c968a7a0e..250ec39dd5af 100644
--- a/Documentation/admin-guide/sysctl/vm.rst
+++ b/Documentation/admin-guide/sysctl/vm.rst
@@ -46,6 +46,7 @@  Currently, these files are in /proc/sys/vm:
 - mem_profiling         (only if CONFIG_MEM_ALLOC_PROFILING=y)
 - memory_failure_early_kill
 - memory_failure_recovery
+- memory_high_autoset_ratio
 - min_free_kbytes
 - min_slab_ratio
 - min_unmapped_ratio
@@ -479,6 +480,15 @@  Enable memory failure recovery (when supported by the platform)
 0: Always panic on a memory failure.
 
 
+memory_high_autoset_ratio
+=========================
+
+Specify a ratio by which memory.high should be set as a fraction of
+memory.max if it hasn't been explicitly set before.  It allows a range
+of 0-100.  The default value of 0 means auto setting will be disabled.
+For any non-zero value "n", the actual ratio used will be "n/(n+1)".
+
+
 min_free_kbytes
 ===============
 
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 030d34e9d117..6be161a6b922 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -221,6 +221,9 @@  struct mem_cgroup {
 	 */
 	bool oom_group;
 
+	/* %true if memory.high has been explicitly set */
+	bool memory_high_set;
+
 	/* protected by memcg_oom_lock */
 	bool		oom_lock;
 	int		under_oom;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 71fe2a95b8bd..2cfb000bf543 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -48,6 +48,7 @@ 
 #include <linux/swap.h>
 #include <linux/swapops.h>
 #include <linux/spinlock.h>
+#include <linux/sysctl.h>
 #include <linux/eventfd.h>
 #include <linux/poll.h>
 #include <linux/sort.h>
@@ -6889,6 +6890,35 @@  static void mem_cgroup_attach(struct cgroup_taskset *tset)
 }
 #endif
 
+/*
+ * The memory.high autoset ratio specifies a ratio by which memory.high
+ * should be set as a fraction of memory.max if it hasn't been explicitly
+ * set before. The default value of 0 means auto setting will be disabled.
+ * For any non-zero value "n", the actual ratio is "n/(n+1)".
+ */
+static int sysctl_memory_high_autoset_ratio;
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table memcg_table[] = {
+	{
+		.procname	= "memory_high_autoset_ratio",
+		.data		= &sysctl_memory_high_autoset_ratio,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE_HUNDRED,
+	},
+};
+
+static inline void memcg_sysctl_init(void)
+{
+	register_sysctl_init("vm", memcg_table);
+}
+#else
+static void memcg_sysctl_init(void)	{ }
+#endif /* CONFIG_SYSCTL */
+
 static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
 {
 	if (value == PAGE_COUNTER_MAX)
@@ -6982,6 +7012,7 @@  static ssize_t memory_high_write(struct kernfs_open_file *of,
 		return err;
 
 	page_counter_set_high(&memcg->memory, high);
+	memcg->memory_high_set = true;
 
 	for (;;) {
 		unsigned long nr_pages = page_counter_read(&memcg->memory);
@@ -7023,6 +7054,7 @@  static ssize_t memory_max_write(struct kernfs_open_file *of,
 	unsigned int nr_reclaims = MAX_RECLAIM_RETRIES;
 	bool drained = false;
 	unsigned long max;
+	unsigned int high_ratio = sysctl_memory_high_autoset_ratio;
 	int err;
 
 	buf = strstrip(buf);
@@ -7032,6 +7064,13 @@  static ssize_t memory_max_write(struct kernfs_open_file *of,
 
 	xchg(&memcg->memory.max, max);
 
+	if (high_ratio && !memcg->memory_high_set) {
+		/* Set memory.high as a fraction of memory.max */
+		unsigned long high = max * high_ratio / (high_ratio + 1);
+
+		page_counter_set_high(&memcg->memory, high);
+	}
+
 	for (;;) {
 		unsigned long nr_pages = page_counter_read(&memcg->memory);
 
@@ -7977,6 +8016,8 @@  static int __init mem_cgroup_init(void)
 		soft_limit_tree.rb_tree_per_node[node] = rtpn;
 	}
 
+	memcg_sysctl_init();
+
 	return 0;
 }
 subsys_initcall(mem_cgroup_init);