@@ -16,6 +16,7 @@
#include <linux/overflow.h>
#include <linux/types.h>
#include <linux/workqueue.h>
+#include <linux/percpu-refcount.h>
/*
@@ -152,7 +153,6 @@ int kmem_cache_shrink(struct kmem_cache *);
void memcg_create_kmem_cache(struct mem_cgroup *, struct kmem_cache *);
void memcg_deactivate_kmem_caches(struct mem_cgroup *);
-void memcg_destroy_kmem_caches(struct mem_cgroup *);
/*
* Please use this macro to create slab caches. Simply specify the
@@ -641,6 +641,7 @@ struct memcg_cache_params {
struct mem_cgroup *memcg;
struct list_head children_node;
struct list_head kmem_caches_node;
+ struct percpu_ref refcnt;
void (*work_fn)(struct kmem_cache *);
union {
@@ -2663,8 +2663,11 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
goto out;
memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id);
- if (likely(memcg_cachep))
+ if (likely(memcg_cachep)) {
+ percpu_ref_get(&memcg_cachep->memcg_params.refcnt);
+ css_put(&memcg->css);
return memcg_cachep;
+ }
/*
* If we are in a safe context (can wait, and not in interrupt
@@ -2691,7 +2694,7 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
void memcg_kmem_put_cache(struct kmem_cache *cachep)
{
if (!is_root_cache(cachep))
- css_put(&cachep->memcg_params.memcg->css);
+ percpu_ref_put(&cachep->memcg_params.refcnt);
}
/**
@@ -2719,9 +2722,6 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
cancel_charge(memcg, nr_pages);
return -ENOMEM;
}
-
- page->mem_cgroup = memcg;
-
return 0;
}
@@ -2744,8 +2744,10 @@ int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
memcg = get_mem_cgroup_from_current();
if (!mem_cgroup_is_root(memcg)) {
ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg);
- if (!ret)
+ if (!ret) {
+ page->mem_cgroup = memcg;
__SetPageKmemcg(page);
+ }
}
css_put(&memcg->css);
return ret;
@@ -3238,7 +3240,7 @@ static void memcg_free_kmem(struct mem_cgroup *memcg)
memcg_offline_kmem(memcg);
if (memcg->kmem_state == KMEM_ALLOCATED) {
- memcg_destroy_kmem_caches(memcg);
+ WARN_ON(!list_empty(&memcg->kmem_caches));
static_branch_dec(&memcg_kmem_enabled_key);
WARN_ON(page_counter_read(&memcg->kmem));
}
@@ -173,6 +173,7 @@ void __kmem_cache_release(struct kmem_cache *);
int __kmem_cache_shrink(struct kmem_cache *);
void __kmemcg_cache_deactivate(struct kmem_cache *s);
void __kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s);
+void kmemcg_cache_shutdown(struct kmem_cache *s);
void slab_kmem_cache_release(struct kmem_cache *);
struct seq_file;
@@ -284,15 +285,37 @@ static __always_inline int memcg_charge_slab(struct page *page,
gfp_t gfp, int order,
struct kmem_cache *s)
{
- if (is_root_cache(s))
- return 0;
- return memcg_kmem_charge_memcg(page, gfp, order, s->memcg_params.memcg);
+ struct mem_cgroup *memcg;
+ struct lruvec *lruvec;
+ int ret;
+
+ memcg = s->memcg_params.memcg;
+ ret = memcg_kmem_charge_memcg(page, gfp, order, memcg);
+ if (ret)
+ return ret;
+
+ lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg);
+ mod_lruvec_state(lruvec, cache_vmstat_idx(s), 1 << order);
+
+ /* transer try_charge() page references to kmem_cache */
+ percpu_ref_get_many(&s->memcg_params.refcnt, 1 << order);
+ css_put_many(&memcg->css, 1 << order);
+
+ return 0;
}
static __always_inline void memcg_uncharge_slab(struct page *page, int order,
struct kmem_cache *s)
{
- memcg_kmem_uncharge(page, order);
+ struct mem_cgroup *memcg;
+ struct lruvec *lruvec;
+
+ memcg = s->memcg_params.memcg;
+ lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg);
+ mod_lruvec_state(lruvec, cache_vmstat_idx(s), -(1 << order));
+ memcg_kmem_uncharge_memcg(page, order, memcg);
+
+ percpu_ref_put_many(&s->memcg_params.refcnt, 1 << order);
}
extern void slab_init_memcg_params(struct kmem_cache *);
@@ -362,15 +385,24 @@ static __always_inline int charge_slab_page(struct page *page,
gfp_t gfp, int order,
struct kmem_cache *s)
{
- memcg_charge_slab(page, gfp, order, s);
- mod_lruvec_page_state(page, cache_vmstat_idx(s), 1 << order);
- return 0;
+ if (is_root_cache(s)) {
+ mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
+ 1 << order);
+ return 0;
+ }
+
+ return memcg_charge_slab(page, gfp, order, s);
}
static __always_inline void uncharge_slab_page(struct page *page, int order,
struct kmem_cache *s)
{
- mod_lruvec_page_state(page, cache_vmstat_idx(s), -(1 << order));
+ if (is_root_cache(s)) {
+ mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
+ -(1 << order));
+ return;
+ }
+
memcg_uncharge_slab(page, order, s);
}
@@ -45,6 +45,8 @@ static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work);
static DECLARE_WORK(slab_caches_to_rcu_destroy_work,
slab_caches_to_rcu_destroy_workfn);
+static void kmemcg_queue_cache_shutdown(struct percpu_ref *percpu_ref);
+
/*
* Set of flags that will prevent slab merging
*/
@@ -145,6 +147,12 @@ static int init_memcg_params(struct kmem_cache *s,
struct memcg_cache_array *arr;
if (root_cache) {
+ int ret = percpu_ref_init(&s->memcg_params.refcnt,
+ kmemcg_queue_cache_shutdown,
+ 0, GFP_KERNEL);
+ if (ret)
+ return ret;
+
s->memcg_params.root_cache = root_cache;
INIT_LIST_HEAD(&s->memcg_params.children_node);
INIT_LIST_HEAD(&s->memcg_params.kmem_caches_node);
@@ -170,6 +178,8 @@ static void destroy_memcg_params(struct kmem_cache *s)
{
if (is_root_cache(s))
kvfree(rcu_access_pointer(s->memcg_params.memcg_caches));
+ else
+ percpu_ref_exit(&s->memcg_params.refcnt);
}
static void free_memcg_params(struct rcu_head *rcu)
@@ -225,6 +235,7 @@ void memcg_link_cache(struct kmem_cache *s, struct mem_cgroup *memcg)
if (is_root_cache(s)) {
list_add(&s->root_caches_node, &slab_root_caches);
} else {
+ css_get(&memcg->css);
s->memcg_params.memcg = memcg;
list_add(&s->memcg_params.children_node,
&s->memcg_params.root_cache->memcg_params.children);
@@ -240,6 +251,7 @@ static void memcg_unlink_cache(struct kmem_cache *s)
} else {
list_del(&s->memcg_params.children_node);
list_del(&s->memcg_params.kmem_caches_node);
+ css_put(&s->memcg_params.memcg->css);
}
}
#else
@@ -708,16 +720,13 @@ static void kmemcg_after_rcu_workfn(struct work_struct *work)
put_online_mems();
put_online_cpus();
-
- /* done, put the ref from slab_deactivate_memcg_cache_rcu_sched() */
- css_put(&s->memcg_params.memcg->css);
}
/*
* We need to grab blocking locks. Bounce to ->work. The
* work item shares the space with the RCU head and can't be
- * initialized eariler.
-*/
+ * initialized earlier.
+ */
static void kmemcg_schedule_work_after_rcu(struct rcu_head *head)
{
struct kmem_cache *s = container_of(head, struct kmem_cache,
@@ -727,9 +736,28 @@ static void kmemcg_schedule_work_after_rcu(struct rcu_head *head)
queue_work(memcg_kmem_cache_wq, &s->memcg_params.work);
}
+static void kmemcg_cache_shutdown_after_rcu(struct kmem_cache *s)
+{
+ WARN_ON(shutdown_cache(s));
+}
+
+static void kmemcg_queue_cache_shutdown(struct percpu_ref *percpu_ref)
+{
+ struct kmem_cache *s = container_of(percpu_ref, struct kmem_cache,
+ memcg_params.refcnt);
+
+ if (s->memcg_params.root_cache->memcg_params.dying)
+ return;
+
+ WARN_ON(s->memcg_params.work_fn);
+ s->memcg_params.work_fn = kmemcg_cache_shutdown_after_rcu;
+ call_rcu(&s->memcg_params.rcu_head, kmemcg_schedule_work_after_rcu);
+}
+
static void kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s)
{
__kmemcg_cache_deactivate_after_rcu(s);
+ percpu_ref_kill(&s->memcg_params.refcnt);
}
static void kmemcg_cache_deactivate(struct kmem_cache *s)
@@ -739,9 +767,6 @@ static void kmemcg_cache_deactivate(struct kmem_cache *s)
if (s->memcg_params.root_cache->memcg_params.dying)
return;
- /* pin memcg so that @s doesn't get destroyed in the middle */
- css_get(&s->memcg_params.memcg->css);
-
WARN_ON_ONCE(s->memcg_params.work_fn);
s->memcg_params.work_fn = kmemcg_cache_deactivate_after_rcu;
call_rcu(&s->memcg_params.rcu_head, kmemcg_schedule_work_after_rcu);
@@ -775,28 +800,6 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
put_online_cpus();
}
-void memcg_destroy_kmem_caches(struct mem_cgroup *memcg)
-{
- struct kmem_cache *s, *s2;
-
- get_online_cpus();
- get_online_mems();
-
- mutex_lock(&slab_mutex);
- list_for_each_entry_safe(s, s2, &memcg->kmem_caches,
- memcg_params.kmem_caches_node) {
- /*
- * The cgroup is about to be freed and therefore has no charges
- * left. Hence, all its caches must be empty by now.
- */
- BUG_ON(shutdown_cache(s));
- }
- mutex_unlock(&slab_mutex);
-
- put_online_mems();
- put_online_cpus();
-}
-
static int shutdown_memcg_caches(struct kmem_cache *s)
{
struct memcg_cache_array *arr;
@@ -4027,18 +4027,8 @@ void __kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s)
{
/*
* Called with all the locks held after a sched RCU grace period.
- * Even if @s becomes empty after shrinking, we can't know that @s
- * doesn't have allocations already in-flight and thus can't
- * destroy @s until the associated memcg is released.
- *
- * However, let's remove the sysfs files for empty caches here.
- * Each cache has a lot of interface files which aren't
- * particularly useful for empty draining caches; otherwise, we can
- * easily end up with millions of unnecessary sysfs files on
- * systems which have a lot of memory and transient cgroups.
*/
- if (!__kmem_cache_shrink(s))
- sysfs_slab_remove(s);
+ __kmem_cache_shrink(s);
}
void __kmemcg_cache_deactivate(struct kmem_cache *s)
This commit makes several important changes in the lifecycle of a non-root kmem_cache, which also affect the lifecycle of a memory cgroup. Currently each charged slab page has a page->mem_cgroup pointer to the memory cgroup and holds a reference to it. Kmem_caches are held by the memcg and are released with it. It means that none of kmem_caches are released unless at least one reference to the memcg exists, which is not optimal. So the current scheme can be illustrated as: page->mem_cgroup->kmem_cache. To implement the slab memory reparenting we need to invert the scheme into: page->kmem_cache->mem_cgroup. Let's make every page to hold a reference to the kmem_cache (we already have a stable pointer), and make kmem_caches to hold a single reference to the memory cgroup. To make this possible we need to introduce a new percpu refcounter for non-root kmem_caches. The counter is initialized to the percpu mode, and is switched to atomic mode after deactivation, so we never shutdown an active cache. The counter is bumped for every charged page and also for every running allocation. So the kmem_cache can't be released unless all allocations complete. To shutdown non-active empty kmem_caches, let's reuse the infrastructure of the RCU-delayed work queue, used previously for the deactivation. After the generalization, it's perfectly suited for our needs. Since now we can release a kmem_cache at any moment after the deactivation, let's call sysfs_slab_remove() only from the shutdown path. It makes deactivation path simpler. Because we don't set the page->mem_cgroup pointer, we need to change the way how memcg-level stats is working for slab pages. We can't use mod_lruvec_page_state() helpers anymore, so switch over to mod_lruvec_state(). * I used the following simple approach to test the performance (stolen from another patchset by T. Harding): time find / -name fname-no-exist echo 2 > /proc/sys/vm/drop_caches repeat several times Results (I've chosen best results in several runs): orig patched real 0m0.700s 0m0.722s user 0m0.114s 0m0.120s sys 0m0.317s 0m0.324s real 0m0.729s 0m0.746s user 0m0.110s 0m0.139s sys 0m0.320s 0m0.317s real 0m0.745s 0m0.719s user 0m0.108s 0m0.124s sys 0m0.320s 0m0.323s So it looks like the difference is not noticeable in this test. Signed-off-by: Roman Gushchin <guro@fb.com> --- include/linux/slab.h | 3 ++- mm/memcontrol.c | 16 ++++++----- mm/slab.h | 48 +++++++++++++++++++++++++++------ mm/slab_common.c | 63 +++++++++++++++++++++++--------------------- mm/slub.c | 12 +-------- 5 files changed, 85 insertions(+), 57 deletions(-)