@@ -230,6 +230,21 @@ struct mem_cgroup {
int under_oom;
int swappiness;
+ /*
+ * slrp, soft limit reclaiming priority
+ *
+ * 0 by default: slrp is not considered in soft limit reclaiming.
+ *
+ * 1-32, user configurable in ascending order: no page will be
+ * reclaimed from a memcg of higher slrp in favor of a memcg of
+ * lower slrp.
+ *
+ * Honored only in the direct reclaiming context for now.
+ */
+ int slrp;
+#define MEMCG_SLRP_MIN 1
+#define MEMCG_SLRP_MAX 32
+
/* OOM-Killer disable */
int oom_kill_disable;
@@ -647,7 +647,8 @@ static void mem_cgroup_remove_from_trees
}
static struct mem_cgroup_per_node *
-__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
+__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz,
+ int slrp)
{
struct mem_cgroup_per_node *mz;
@@ -664,7 +665,7 @@ retry:
* position in the tree.
*/
__mem_cgroup_remove_exceeded(mz, mctz);
- if (!soft_limit_excess(mz->memcg) ||
+ if (!soft_limit_excess(mz->memcg) || mz->memcg->slrp > slrp ||
!css_tryget_online(&mz->memcg->css))
goto retry;
done:
@@ -672,12 +673,13 @@ done:
}
static struct mem_cgroup_per_node *
-mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
+mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz,
+ int slrp)
{
struct mem_cgroup_per_node *mz;
spin_lock_irq(&mctz->lock);
- mz = __mem_cgroup_largest_soft_limit_node(mctz);
+ mz = __mem_cgroup_largest_soft_limit_node(mctz, slrp);
spin_unlock_irq(&mctz->lock);
return mz;
}
@@ -2972,6 +2974,31 @@ static int mem_cgroup_resize_max(struct
return ret;
}
+static int mem_cgroup_get_slrp(void)
+{
+ int slrp;
+
+ if (current->flags & PF_KTHREAD) {
+ /*
+ * for simplicity, slrp is not applied in the background
+ * reclaiming context for now
+ */
+ slrp = 0;
+ } else {
+ struct mem_cgroup *memcg;
+
+ rcu_read_lock();
+ memcg = mem_cgroup_from_task(current);
+ if (!memcg || memcg == root_mem_cgroup)
+ slrp = 0;
+ else
+ slrp = memcg->slrp;
+ rcu_read_unlock();
+ }
+
+ return slrp;
+}
+
unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
gfp_t gfp_mask,
unsigned long *total_scanned)
@@ -2980,6 +3007,7 @@ unsigned long mem_cgroup_soft_limit_recl
struct mem_cgroup_per_node *mz, *next_mz = NULL;
unsigned long reclaimed;
int loop = 0;
+ int slrp;
struct mem_cgroup_tree_per_node *mctz;
unsigned long excess;
unsigned long nr_scanned;
@@ -2997,6 +3025,7 @@ unsigned long mem_cgroup_soft_limit_recl
if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root))
return 0;
+ slrp = mem_cgroup_get_slrp();
/*
* This loop can run a while, specially if mem_cgroup's continuously
* keep exceeding their soft limit and putting the system under
@@ -3006,7 +3035,7 @@ unsigned long mem_cgroup_soft_limit_recl
if (next_mz)
mz = next_mz;
else
- mz = mem_cgroup_largest_soft_limit_node(mctz);
+ mz = mem_cgroup_largest_soft_limit_node(mctz, slrp);
if (!mz)
break;
@@ -3024,8 +3053,8 @@ unsigned long mem_cgroup_soft_limit_recl
*/
next_mz = NULL;
if (!reclaimed)
- next_mz = __mem_cgroup_largest_soft_limit_node(mctz);
-
+ next_mz = __mem_cgroup_largest_soft_limit_node(mctz,
+ slrp);
excess = soft_limit_excess(mz->memcg);
/*
* One school of thought says that we should not add
@@ -5817,6 +5846,37 @@ static ssize_t memory_oom_group_write(st
return nbytes;
}
+static int memory_slrp_show(struct seq_file *m, void *v)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+
+ seq_printf(m, "%d\n", memcg->slrp);
+
+ return 0;
+}
+
+static ssize_t memory_slrp_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ int ret, slrp;
+
+ buf = strstrip(buf);
+ if (!*buf)
+ return -EINVAL;
+
+ ret = kstrtoint(buf, 0, &slrp);
+ if (ret)
+ return ret;
+
+ if (slrp < MEMCG_SLRP_MIN || MEMCG_SLRP_MAX < slrp)
+ return -EINVAL;
+
+ memcg->slrp = slrp;
+
+ return nbytes;
+}
+
static struct cftype memory_files[] = {
{
.name = "current",
@@ -5870,6 +5930,12 @@ static struct cftype memory_files[] = {
.seq_show = memory_oom_group_show,
.write = memory_oom_group_write,
},
+ {
+ .name = "slrp",
+ .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
+ .seq_show = memory_slrp_show,
+ .write = memory_slrp_write,
+ },
{ } /* terminate */
};
Currently the memory controller plays an increasingly important role in how memory is used and how pages are reclaimed under memory pressure. In daily operations, memcgs are often created for critical tasks, and their preconfigured memory usage is expected to be met even under memory pressure. Administrators want it to be configurable, for example, that pages consumed by memcg-B can be reclaimed by page allocations invoked by memcg-C but not by those invoked by memcg-A.

That configurability is addressed by adding a priority for soft limit reclaiming, which makes sure that no page is reclaimed from a memcg of higher priority in favor of a memcg of lower priority.

By default, pages are reclaimed with no priority taken into account. Once users turn it on, they are responsible for their smart activities, in much the same way as when they play realtime FIFO/RR games.

The priority is honored only in the direct reclaiming context in order to avoid churning the complex kswapd behavior.

Cc: Shakeel Butt <shakeelb@google.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Hillf Danton <hdanton@sina.com>
---
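For illustration only (not part of the patch): a minimal userspace sketch of configuring the knob, assuming a cgroup2 hierarchy mounted at /sys/fs/cgroup and a delegated child group named "critical"; both the path and the group name are hypothetical.

/*
 * Write the maximum soft limit reclaiming priority to the new
 * memory.slrp file, so that pages of this memcg are not reclaimed
 * on behalf of allocations from memcgs of lower slrp.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/fs/cgroup/critical/memory.slrp"; /* hypothetical */
	const char *prio = "32\n"; /* MEMCG_SLRP_MAX */
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, prio, strlen(prio)) != (ssize_t)strlen(prio))
		perror("write");
	close(fd);
	return 0;
}

Per the range check in memory_slrp_write(), the write fails with -EINVAL unless the value falls within [MEMCG_SLRP_MIN, MEMCG_SLRP_MAX], i.e. 1-32.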