Message ID | 20231024203302.1920362-3-nphamcs@gmail.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | workload-specific and memory pressure-driven zswap writeback | expand |
On Tue, Oct 24, 2023 at 1:33 PM Nhat Pham <nphamcs@gmail.com> wrote: > > From: Domenico Cerasuolo <cerasuolodomenico@gmail.com> > > Currently, we only have a single global LRU for zswap. This makes it > impossible to perform workload-specific shrinking - a memcg cannot > determine which pages in the pool it owns, and often ends up writing > pages from other memcgs. This issue has been previously observed in > practice and mitigated by simply disabling memcg-initiated shrinking: > > https://lore.kernel.org/all/20230530232435.3097106-1-nphamcs@gmail.com/T/#u > > This patch fully resolves the issue by replacing the global zswap LRU > with memcg- and NUMA-specific LRUs, and modifying the reclaim logic: > > a) When a store attempt hits a memcg limit, it now triggers a > synchronous reclaim attempt that, if successful, allows the new > hotter page to be accepted by zswap. > b) If the store attempt instead hits the global zswap limit, it will > trigger an asynchronous reclaim attempt, in which a memcg is > selected for reclaim in a round-robin-like fashion. 
> > Signed-off-by: Domenico Cerasuolo <cerasuolodomenico@gmail.com> > Co-developed-by: Nhat Pham <nphamcs@gmail.com> > Signed-off-by: Nhat Pham <nphamcs@gmail.com> > --- > include/linux/memcontrol.h | 5 + > mm/swap.h | 3 +- > mm/swap_state.c | 23 +++-- > mm/zswap.c | 188 ++++++++++++++++++++++++++----------- > 4 files changed, 156 insertions(+), 63 deletions(-) > > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h > index 6edd3ec4d8d5..c1846e57011b 100644 > --- a/include/linux/memcontrol.h > +++ b/include/linux/memcontrol.h > @@ -1187,6 +1187,11 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page) > return NULL; > } > > +static inline struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg) > +{ > + return NULL; > +} > + > static inline bool folio_memcg_kmem(struct folio *folio) > { > return false; > diff --git a/mm/swap.h b/mm/swap.h > index 73c332ee4d91..c0dc73e10e91 100644 > --- a/mm/swap.h > +++ b/mm/swap.h > @@ -51,7 +51,8 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, > struct swap_iocb **plug); > struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, > struct mempolicy *mpol, pgoff_t ilx, > - bool *new_page_allocated); > + bool *new_page_allocated, > + bool skip_if_exists); > struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag, > struct mempolicy *mpol, pgoff_t ilx); > struct page *swapin_readahead(swp_entry_t entry, gfp_t flag, > diff --git a/mm/swap_state.c b/mm/swap_state.c > index 85d9e5806a6a..040639e1c77e 100644 > --- a/mm/swap_state.c > +++ b/mm/swap_state.c > @@ -412,7 +412,8 @@ struct folio *filemap_get_incore_folio(struct address_space *mapping, > > struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, > struct mempolicy *mpol, pgoff_t ilx, > - bool *new_page_allocated) > + bool *new_page_allocated, > + bool skip_if_exists) > { > struct swap_info_struct *si; > struct folio *folio; > @@ -470,6 +471,16 @@ struct page 
*__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, > if (err != -EEXIST) > goto fail_put_swap; > > + /* Protect against a recursive call to __read_swap_cache_async() nit: insert new line before "Protect", see surrounding comments. > + * on the same entry waiting forever here because SWAP_HAS_CACHE > + * is set but the folio is not the swap cache yet. This can > + * happen today if mem_cgroup_swapin_charge_folio() below > + * triggers reclaim through zswap, which may call > + * __read_swap_cache_async() in the writeback path. > + */ > + if (skip_if_exists) > + goto fail_put_swap; > + > /* > * We might race against __delete_from_swap_cache(), and > * stumble across a swap_map entry whose SWAP_HAS_CACHE [..] > +/********************************* > +* lru functions > +**********************************/ > +static bool zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry) > +{ > + struct mem_cgroup *memcg = get_mem_cgroup_from_entry(entry); > + int nid = entry_to_nid(entry); > + bool added = list_lru_add(list_lru, &entry->lru, nid, memcg); > + > + mem_cgroup_put(memcg); Still not fond of the get/put pattern but okay.. > + return added; > +} > + > +static bool zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry) > +{ > + struct mem_cgroup *memcg = get_mem_cgroup_from_entry(entry); > + int nid = entry_to_nid(entry); > + bool removed = list_lru_del(list_lru, &entry->lru, nid, memcg); > + > + mem_cgroup_put(memcg); > + return removed; > +} > + > /********************************* > * rbtree functions > **********************************/ [..] > @@ -652,28 +679,37 @@ static int zswap_reclaim_entry(struct zswap_pool *pool) > */ > swpoffset = swp_offset(entry->swpentry); > tree = zswap_trees[swp_type(entry->swpentry)]; > - spin_unlock(&pool->lru_lock); > + list_lru_isolate(l, item); > + /* > + * It's safe to drop the lock here because we return either > + * LRU_REMOVED_RETRY or LRU_RETRY. 
> + */ > + spin_unlock(lock); > > /* Check for invalidate() race */ > spin_lock(&tree->lock); > - if (entry != zswap_rb_search(&tree->rbroot, swpoffset)) { > - ret = -EAGAIN; > + if (entry != zswap_rb_search(&tree->rbroot, swpoffset)) > goto unlock; > - } > + > /* Hold a reference to prevent a free during writeback */ > zswap_entry_get(entry); > spin_unlock(&tree->lock); > > - ret = zswap_writeback_entry(entry, tree); > + writeback_result = zswap_writeback_entry(entry, tree); > > spin_lock(&tree->lock); > - if (ret) { > - /* Writeback failed, put entry back on LRU */ > - spin_lock(&pool->lru_lock); > - list_move(&entry->lru, &pool->lru); > - spin_unlock(&pool->lru_lock); > + if (writeback_result) { > + zswap_reject_reclaim_fail++; > + memcg = get_mem_cgroup_from_entry(entry); Can this return NULL? Seems like we don't check the return in most/all places. > + spin_lock(lock); > + /* we cannot use zswap_lru_add here, because it increments node's lru count */ > + list_lru_putback(&entry->pool->list_lru, item, entry_to_nid(entry), memcg); Perhaps we can move this call with the memcg get/put to a helper like add/del? (e.g. zswap_lru_putback) We would need to move get_mem_cgroup_from_entry() into the lock but I think that's okay. > + spin_unlock(lock); > + mem_cgroup_put(memcg); > + ret = LRU_RETRY; > goto put_unlock; > } > + zswap_written_back_pages++; > > /* > * Writeback started successfully, the page now belongs to the > @@ -687,7 +723,34 @@ static int zswap_reclaim_entry(struct zswap_pool *pool) > zswap_entry_put(tree, entry); > unlock: > spin_unlock(&tree->lock); > - return ret ? -EAGAIN : 0; > + spin_lock(lock); > + return ret; > +} > + > +static int shrink_memcg(struct mem_cgroup *memcg) > +{ > + struct zswap_pool *pool; > + int nid, shrunk = 0; > + > + /* > + * Skip zombies because their LRUs are reparented and we would be > + * reclaiming from the parent instead of the dead memcg. 
> + */ > + if (memcg && !mem_cgroup_online(memcg)) > + return -ENOENT; > + > + pool = zswap_pool_current_get(); > + if (!pool) > + return -EINVAL; > + > + for_each_node_state(nid, N_NORMAL_MEMORY) { > + unsigned long nr_to_walk = 1; > + > + shrunk += list_lru_walk_one(&pool->list_lru, nid, memcg, > + &shrink_memcg_cb, NULL, &nr_to_walk); > + } > + zswap_pool_put(pool); > + return shrunk ? 0 : -EAGAIN; > } > > static void shrink_worker(struct work_struct *w) > @@ -696,15 +759,17 @@ static void shrink_worker(struct work_struct *w) > shrink_work); > int ret, failures = 0; > > + /* global reclaim will select cgroup in a round-robin fashion. */ > do { > - ret = zswap_reclaim_entry(pool); > - if (ret) { > - zswap_reject_reclaim_fail++; > - if (ret != -EAGAIN) > - break; > - if (++failures == MAX_RECLAIM_RETRIES) > - break; > - } > + pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL); I think this can be a problem. We hold a ref to a memcg here until the next time we shrink, which can be a long time IIUC. This can cause the memcg to linger as a zombie. I understand it is one memcg per-zswap pool, but I am still unsure about it. MGLRU maintains a memcg LRU for global reclaim that gets properly cleaned up when a memcg is going away, so that's one option, although complicated. A second option would be to hold a pointer to the objcg instead, which should be less problematic (although we are still holding that objcg hostage indefinitely). The problem here is that if the objcg gets reparented, next time we will start at the parent of the memcg we stopped at last time, which tbh doesn't sound bad at all to me. A third option would be to flag the memcg such that when it is getting offlined we can call into zswap to reset pool->next_shrink (or move it to the parent) and drop the ref. Although synchronization can get hairy when racing with offlining. Not sure what's the right solution, but I prefer we don't hold any memcgs hostages indefinitely. 
I also think if we end up using mem_cgroup_iter() then there should be a mem_cgroup_iter_break() somewhere if/when breaking the iteration. > + > + ret = shrink_memcg(pool->next_shrink); > + > + if (ret == -EINVAL) > + break; > + if (ret && ++failures == MAX_RECLAIM_RETRIES) > + break; > + > cond_resched(); > } while (!zswap_can_accept()); > zswap_pool_put(pool); [..] > @@ -1233,15 +1301,15 @@ bool zswap_store(struct folio *folio) > zswap_invalidate_entry(tree, dupentry); > } > spin_unlock(&tree->lock); > - > - /* > - * XXX: zswap reclaim does not work with cgroups yet. Without a > - * cgroup-aware entry LRU, we will push out entries system-wide based on > - * local cgroup limits. > - */ > objcg = get_obj_cgroup_from_folio(folio); > - if (objcg && !obj_cgroup_may_zswap(objcg)) > - goto reject; > + if (objcg && !obj_cgroup_may_zswap(objcg)) { > + memcg = get_mem_cgroup_from_objcg(objcg); > + if (shrink_memcg(memcg)) { > + mem_cgroup_put(memcg); > + goto reject; > + } > + mem_cgroup_put(memcg); Here we choose to replicate mem_cgroup_put(). > + } > > /* reclaim space if needed */ > if (zswap_is_full()) { > @@ -1258,7 +1326,7 @@ bool zswap_store(struct folio *folio) > } > > /* allocate entry */ > - entry = zswap_entry_cache_alloc(GFP_KERNEL); > + entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page)); > if (!entry) { > zswap_reject_kmemcache_fail++; > goto reject; > @@ -1285,6 +1353,15 @@ bool zswap_store(struct folio *folio) > if (!entry->pool) > goto freepage; > > + if (objcg) { > + memcg = get_mem_cgroup_from_objcg(objcg); > + lru_alloc_ret = memcg_list_lru_alloc(memcg, &entry->pool->list_lru, GFP_KERNEL); > + mem_cgroup_put(memcg); > + > + if (lru_alloc_ret) > + goto put_pool; > + } Yet here we choose to have a single mem_cgroup_put() and stash the output in a variable. Consistency would be nice. 
> + > /* compress */ > acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); > > @@ -1361,9 +1438,8 @@ bool zswap_store(struct folio *folio) > zswap_invalidate_entry(tree, dupentry); > } > if (entry->length) { > - spin_lock(&entry->pool->lru_lock); > - list_add(&entry->lru, &entry->pool->lru); > - spin_unlock(&entry->pool->lru_lock); > + INIT_LIST_HEAD(&entry->lru); > + zswap_lru_add(&entry->pool->list_lru, entry); > } > spin_unlock(&tree->lock); > > @@ -1376,6 +1452,7 @@ bool zswap_store(struct folio *folio) > > put_dstmem: > mutex_unlock(acomp_ctx->mutex); > +put_pool: > zswap_pool_put(entry->pool); > freepage: > zswap_entry_cache_free(entry); > @@ -1470,9 +1547,8 @@ bool zswap_load(struct folio *folio) > zswap_invalidate_entry(tree, entry); > folio_mark_dirty(folio); > } else if (entry->length) { > - spin_lock(&entry->pool->lru_lock); > - list_move(&entry->lru, &entry->pool->lru); > - spin_unlock(&entry->pool->lru_lock); > + zswap_lru_del(&entry->pool->list_lru, entry); > + zswap_lru_add(&entry->pool->list_lru, entry); Can we use list_move_tail() here? (perhaps wrapped in a helper if needed). > } > zswap_entry_put(tree, entry); > spin_unlock(&tree->lock); > -- > 2.34.1
On Tue, Oct 24, 2023 at 8:17 PM Yosry Ahmed <yosryahmed@google.com> wrote: > > On Tue, Oct 24, 2023 at 1:33 PM Nhat Pham <nphamcs@gmail.com> wrote: > > > > From: Domenico Cerasuolo <cerasuolodomenico@gmail.com> > > > > Currently, we only have a single global LRU for zswap. This makes it > > impossible to perform worload-specific shrinking - an memcg cannot > > determine which pages in the pool it owns, and often ends up writing > > pages from other memcgs. This issue has been previously observed in > > practice and mitigated by simply disabling memcg-initiated shrinking: > > > > https://lore.kernel.org/all/20230530232435.3097106-1-nphamcs@gmail.com/T/#u > > > > This patch fully resolves the issue by replacing the global zswap LRU > > with memcg- and NUMA-specific LRUs, and modify the reclaim logic: > > > > a) When a store attempt hits an memcg limit, it now triggers a > > synchronous reclaim attempt that, if successful, allows the new > > hotter page to be accepted by zswap. > > b) If the store attempt instead hits the global zswap limit, it will > > trigger an asynchronous reclaim attempt, in which an memcg is > > selected for reclaim in a round-robin-like fashion. 
> > > > Signed-off-by: Domenico Cerasuolo <cerasuolodomenico@gmail.com> > > Co-developed-by: Nhat Pham <nphamcs@gmail.com> > > Signed-off-by: Nhat Pham <nphamcs@gmail.com> > > --- > > include/linux/memcontrol.h | 5 + > > mm/swap.h | 3 +- > > mm/swap_state.c | 23 +++-- > > mm/zswap.c | 188 ++++++++++++++++++++++++++----------- > > 4 files changed, 156 insertions(+), 63 deletions(-) > > > > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h > > index 6edd3ec4d8d5..c1846e57011b 100644 > > --- a/include/linux/memcontrol.h > > +++ b/include/linux/memcontrol.h > > @@ -1187,6 +1187,11 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page) > > return NULL; > > } > > > > +static inline struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg) > > +{ > > + return NULL; > > +} > > + > > static inline bool folio_memcg_kmem(struct folio *folio) > > { > > return false; > > diff --git a/mm/swap.h b/mm/swap.h > > index 73c332ee4d91..c0dc73e10e91 100644 > > --- a/mm/swap.h > > +++ b/mm/swap.h > > @@ -51,7 +51,8 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, > > struct swap_iocb **plug); > > struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, > > struct mempolicy *mpol, pgoff_t ilx, > > - bool *new_page_allocated); > > + bool *new_page_allocated, > > + bool skip_if_exists); > > struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag, > > struct mempolicy *mpol, pgoff_t ilx); > > struct page *swapin_readahead(swp_entry_t entry, gfp_t flag, > > diff --git a/mm/swap_state.c b/mm/swap_state.c > > index 85d9e5806a6a..040639e1c77e 100644 > > --- a/mm/swap_state.c > > +++ b/mm/swap_state.c > > @@ -412,7 +412,8 @@ struct folio *filemap_get_incore_folio(struct address_space *mapping, > > > > struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, > > struct mempolicy *mpol, pgoff_t ilx, > > - bool *new_page_allocated) > > + bool *new_page_allocated, > > + bool 
skip_if_exists) > > { > > struct swap_info_struct *si; > > struct folio *folio; > > @@ -470,6 +471,16 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, > > if (err != -EEXIST) > > goto fail_put_swap; > > > > + /* Protect against a recursive call to __read_swap_cache_async() > > nit: insert new line before "Protect", see surrounding comments. > > > + * on the same entry waiting forever here because SWAP_HAS_CACHE > > + * is set but the folio is not the swap cache yet. This can > > + * happen today if mem_cgroup_swapin_charge_folio() below > > + * triggers reclaim through zswap, which may call > > + * __read_swap_cache_async() in the writeback path. > > + */ > > + if (skip_if_exists) > > + goto fail_put_swap; > > + > > /* > > * We might race against __delete_from_swap_cache(), and > > * stumble across a swap_map entry whose SWAP_HAS_CACHE > [..] > > +/********************************* > > +* lru functions > > +**********************************/ > > +static bool zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry) > > +{ > > + struct mem_cgroup *memcg = get_mem_cgroup_from_entry(entry); > > + int nid = entry_to_nid(entry); > > + bool added = list_lru_add(list_lru, &entry->lru, nid, memcg); > > + > > + mem_cgroup_put(memcg); > > Still not fond of the get/put pattern but okay.. Actually, Johannes and I took another look to see if we can replace the memcg reference getting with just rcu_read_lock(). It seems there might be a race between zswap LRU manipulation and memcg offlining - not just with the rcu_read_lock() idea, but also with our current implementation! I'll shoot another email with more details later when I'm sure of it one way or another... 
> > > + return added; > > +} > > + > > +static bool zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry) > > +{ > > + struct mem_cgroup *memcg = get_mem_cgroup_from_entry(entry); > > + int nid = entry_to_nid(entry); > > + bool removed = list_lru_del(list_lru, &entry->lru, nid, memcg); > > + > > + mem_cgroup_put(memcg); > > + return removed; > > +} > > + > > /********************************* > > * rbtree functions > > **********************************/ > [..] > > @@ -652,28 +679,37 @@ static int zswap_reclaim_entry(struct zswap_pool *pool) > > */ > > swpoffset = swp_offset(entry->swpentry); > > tree = zswap_trees[swp_type(entry->swpentry)]; > > - spin_unlock(&pool->lru_lock); > > + list_lru_isolate(l, item); > > + /* > > + * It's safe to drop the lock here because we return either > > + * LRU_REMOVED_RETRY or LRU_RETRY. > > + */ > > + spin_unlock(lock); > > > > /* Check for invalidate() race */ > > spin_lock(&tree->lock); > > - if (entry != zswap_rb_search(&tree->rbroot, swpoffset)) { > > - ret = -EAGAIN; > > + if (entry != zswap_rb_search(&tree->rbroot, swpoffset)) > > goto unlock; > > - } > > + > > /* Hold a reference to prevent a free during writeback */ > > zswap_entry_get(entry); > > spin_unlock(&tree->lock); > > > > - ret = zswap_writeback_entry(entry, tree); > > + writeback_result = zswap_writeback_entry(entry, tree); > > > > spin_lock(&tree->lock); > > - if (ret) { > > - /* Writeback failed, put entry back on LRU */ > > - spin_lock(&pool->lru_lock); > > - list_move(&entry->lru, &pool->lru); > > - spin_unlock(&pool->lru_lock); > > + if (writeback_result) { > > + zswap_reject_reclaim_fail++; > > + memcg = get_mem_cgroup_from_entry(entry); > > Can this return NULL? Seems like we don't check the return in most/all places. I believe so, but memcg experts should fact check me on this. 
It's roughly the same pattern as zswap charging/uncharging: obj_cgroup_uncharge_zswap(entry->objcg, entry->length) -> getting memcg (under rcu_read_lock()) > > > + spin_lock(lock); > > + /* we cannot use zswap_lru_add here, because it increments node's lru count */ > > + list_lru_putback(&entry->pool->list_lru, item, entry_to_nid(entry), memcg); > > Perhaps we can move this call with the memcg get/put to a helper like > add/del? (e.g. zswap_lru_putback) > > We would need to move get_mem_cgroup_from_entry() into the lock but I > think that's okay. We probably could, but that sounds like extra code for not a lot of gains, no? > > > + spin_unlock(lock); > > + mem_cgroup_put(memcg); > > + ret = LRU_RETRY; > > goto put_unlock; > > } > > + zswap_written_back_pages++; > > > > /* > > * Writeback started successfully, the page now belongs to the > > @@ -687,7 +723,34 @@ static int zswap_reclaim_entry(struct zswap_pool *pool) > > zswap_entry_put(tree, entry); > > unlock: > > spin_unlock(&tree->lock); > > - return ret ? -EAGAIN : 0; > > + spin_lock(lock); > > + return ret; > > +} > > + > > +static int shrink_memcg(struct mem_cgroup *memcg) > > +{ > > + struct zswap_pool *pool; > > + int nid, shrunk = 0; > > + > > + /* > > + * Skip zombies because their LRUs are reparented and we would be > > + * reclaiming from the parent instead of the dead memcg. > > + */ > > + if (memcg && !mem_cgroup_online(memcg)) > > + return -ENOENT; > > + > > + pool = zswap_pool_current_get(); > > + if (!pool) > > + return -EINVAL; > > + > > + for_each_node_state(nid, N_NORMAL_MEMORY) { > > + unsigned long nr_to_walk = 1; > > + > > + shrunk += list_lru_walk_one(&pool->list_lru, nid, memcg, > > + &shrink_memcg_cb, NULL, &nr_to_walk); > > + } > > + zswap_pool_put(pool); > > + return shrunk ? 
0 : -EAGAIN; > > } > > > > static void shrink_worker(struct work_struct *w) > > @@ -696,15 +759,17 @@ static void shrink_worker(struct work_struct *w) > > shrink_work); > > int ret, failures = 0; > > > > + /* global reclaim will select cgroup in a round-robin fashion. */ > > do { > > - ret = zswap_reclaim_entry(pool); > > - if (ret) { > > - zswap_reject_reclaim_fail++; > > - if (ret != -EAGAIN) > > - break; > > - if (++failures == MAX_RECLAIM_RETRIES) > > - break; > > - } > > + pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL); > > I think this can be a problem. We hold a ref to a memcg here until the > next time we shrink, which can be a long time IIUC. This can cause the > memcg to linger as a zombie. I understand it is one memcg per-zswap > pool, but I am still unsure about it. > > MGLRU maintains a memcg LRU for global reclaim that gets properly > cleaned up when a memcg is going away, so that's one option, although > complicated. > > A second option would be to hold a pointer to the objcg instead, which > should be less problematic (although we are still holding that objcg > hostage indefinitely). The problem here is that if the objcg gets > reparented, next time we will start at the parent of the memcg we > stopped at last time, which tbh doesn't sound bad at all to me. > > A third option would be to flag the memcg such that when it is getting > offlined we can call into zswap to reset pool->next_shrink (or move it > to the parent) and drop the ref. Although synchronization can get > hairy when racing with offlining. > > Not sure what's the right solution, but I prefer we don't hold any > memcgs hostages indefinitely. I also think if we end up using > mem_cgroup_iter() then there should be a mem_cgroup_iter_break() > somewhere if/when breaking the iteration. 
> I'm not sure if this is that big of a problem in the first place, but if it is, doing something similar to MGLRU is probably the cleanest: when the memcg is freed, trigger the zswap_exit_memcg() callback, which will loop through all the zswap pools and update pool->next_shrink where appropriate. Note that we only have one pool per (compression algorithm x allocator) combinations, so there cannot be that many pools, correct? Johannes suggests this idea to me (my apologies if I butcher it) during one of our conversations. That sounds relatively easy IIUC. > > + > > + ret = shrink_memcg(pool->next_shrink); > > + > > + if (ret == -EINVAL) > > + break; > > + if (ret && ++failures == MAX_RECLAIM_RETRIES) > > + break; > > + > > cond_resched(); > > } while (!zswap_can_accept()); > > zswap_pool_put(pool); > [..] > > @@ -1233,15 +1301,15 @@ bool zswap_store(struct folio *folio) > > zswap_invalidate_entry(tree, dupentry); > > } > > spin_unlock(&tree->lock); > > - > > - /* > > - * XXX: zswap reclaim does not work with cgroups yet. Without a > > - * cgroup-aware entry LRU, we will push out entries system-wide based on > > - * local cgroup limits. > > - */ > > objcg = get_obj_cgroup_from_folio(folio); > > - if (objcg && !obj_cgroup_may_zswap(objcg)) > > - goto reject; > > + if (objcg && !obj_cgroup_may_zswap(objcg)) { > > + memcg = get_mem_cgroup_from_objcg(objcg); > > + if (shrink_memcg(memcg)) { > > + mem_cgroup_put(memcg); > > + goto reject; > > + } > > + mem_cgroup_put(memcg); > > Here we choose to replicate mem_cgroup_put(). 
> > > + } > > > > /* reclaim space if needed */ > > if (zswap_is_full()) { > > @@ -1258,7 +1326,7 @@ bool zswap_store(struct folio *folio) > > } > > > > /* allocate entry */ > > - entry = zswap_entry_cache_alloc(GFP_KERNEL); > > + entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page)); > > if (!entry) { > > zswap_reject_kmemcache_fail++; > > goto reject; > > @@ -1285,6 +1353,15 @@ bool zswap_store(struct folio *folio) > > if (!entry->pool) > > goto freepage; > > > > + if (objcg) { > > + memcg = get_mem_cgroup_from_objcg(objcg); > > + lru_alloc_ret = memcg_list_lru_alloc(memcg, &entry->pool->list_lru, GFP_KERNEL); > > + mem_cgroup_put(memcg); > > + > > + if (lru_alloc_ret) > > + goto put_pool; > > + } > > Yet here we choose to have a single mem_cgroup_put() and stash the > output in a variable. > > Consistency would be nice. No strong opinions here, but yeah we should fix it to make it consistent. > > > + > > /* compress */ > > acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); > > > > @@ -1361,9 +1438,8 @@ bool zswap_store(struct folio *folio) > > zswap_invalidate_entry(tree, dupentry); > > } > > if (entry->length) { > > - spin_lock(&entry->pool->lru_lock); > > - list_add(&entry->lru, &entry->pool->lru); > > - spin_unlock(&entry->pool->lru_lock); > > + INIT_LIST_HEAD(&entry->lru); > > + zswap_lru_add(&entry->pool->list_lru, entry); > > } > > spin_unlock(&tree->lock); > > > > @@ -1376,6 +1452,7 @@ bool zswap_store(struct folio *folio) > > > > put_dstmem: > > mutex_unlock(acomp_ctx->mutex); > > +put_pool: > > zswap_pool_put(entry->pool); > > freepage: > > zswap_entry_cache_free(entry); > > @@ -1470,9 +1547,8 @@ bool zswap_load(struct folio *folio) > > zswap_invalidate_entry(tree, entry); > > folio_mark_dirty(folio); > > } else if (entry->length) { > > - spin_lock(&entry->pool->lru_lock); > > - list_move(&entry->lru, &entry->pool->lru); > > - spin_unlock(&entry->pool->lru_lock); > > + zswap_lru_del(&entry->pool->list_lru, entry); > > + 
zswap_lru_add(&entry->pool->list_lru, entry); > > Can we use list_move_tail() here? (perhaps wrapped in a helper if needed). Maybe zswap_lru_move_tail()? :) FWIW, list_lru() interface does not have a move_tail function, (weird, I know). So this seems to be the common pattern for LRU rotation with list_lru. > > > } > > zswap_entry_put(tree, entry); > > spin_unlock(&tree->lock); > > -- > > 2.34.1
On Fri, Oct 27, 2023 at 2:10 PM Nhat Pham <nphamcs@gmail.com> wrote: > > On Tue, Oct 24, 2023 at 8:17 PM Yosry Ahmed <yosryahmed@google.com> wrote: > > > > On Tue, Oct 24, 2023 at 1:33 PM Nhat Pham <nphamcs@gmail.com> wrote: > > > > > > From: Domenico Cerasuolo <cerasuolodomenico@gmail.com> > > > > > > Currently, we only have a single global LRU for zswap. This makes it > > > impossible to perform worload-specific shrinking - an memcg cannot > > > determine which pages in the pool it owns, and often ends up writing > > > pages from other memcgs. This issue has been previously observed in > > > practice and mitigated by simply disabling memcg-initiated shrinking: > > > > > > https://lore.kernel.org/all/20230530232435.3097106-1-nphamcs@gmail.com/T/#u > > > > > > This patch fully resolves the issue by replacing the global zswap LRU > > > with memcg- and NUMA-specific LRUs, and modify the reclaim logic: > > > > > > a) When a store attempt hits an memcg limit, it now triggers a > > > synchronous reclaim attempt that, if successful, allows the new > > > hotter page to be accepted by zswap. > > > b) If the store attempt instead hits the global zswap limit, it will > > > trigger an asynchronous reclaim attempt, in which an memcg is > > > selected for reclaim in a round-robin-like fashion. 
> > > > > > Signed-off-by: Domenico Cerasuolo <cerasuolodomenico@gmail.com> > > > Co-developed-by: Nhat Pham <nphamcs@gmail.com> > > > Signed-off-by: Nhat Pham <nphamcs@gmail.com> > > > --- > > > include/linux/memcontrol.h | 5 + > > > mm/swap.h | 3 +- > > > mm/swap_state.c | 23 +++-- > > > mm/zswap.c | 188 ++++++++++++++++++++++++++----------- > > > 4 files changed, 156 insertions(+), 63 deletions(-) > > > > > > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h > > > index 6edd3ec4d8d5..c1846e57011b 100644 > > > --- a/include/linux/memcontrol.h > > > +++ b/include/linux/memcontrol.h > > > @@ -1187,6 +1187,11 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page) > > > return NULL; > > > } > > > > > > +static inline struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg) > > > +{ > > > + return NULL; > > > +} > > > + > > > static inline bool folio_memcg_kmem(struct folio *folio) > > > { > > > return false; > > > diff --git a/mm/swap.h b/mm/swap.h > > > index 73c332ee4d91..c0dc73e10e91 100644 > > > --- a/mm/swap.h > > > +++ b/mm/swap.h > > > @@ -51,7 +51,8 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, > > > struct swap_iocb **plug); > > > struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, > > > struct mempolicy *mpol, pgoff_t ilx, > > > - bool *new_page_allocated); > > > + bool *new_page_allocated, > > > + bool skip_if_exists); > > > struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag, > > > struct mempolicy *mpol, pgoff_t ilx); > > > struct page *swapin_readahead(swp_entry_t entry, gfp_t flag, > > > diff --git a/mm/swap_state.c b/mm/swap_state.c > > > index 85d9e5806a6a..040639e1c77e 100644 > > > --- a/mm/swap_state.c > > > +++ b/mm/swap_state.c > > > @@ -412,7 +412,8 @@ struct folio *filemap_get_incore_folio(struct address_space *mapping, > > > > > > struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, > > > struct mempolicy *mpol, 
pgoff_t ilx, > > > - bool *new_page_allocated) > > > + bool *new_page_allocated, > > > + bool skip_if_exists) > > > { > > > struct swap_info_struct *si; > > > struct folio *folio; > > > @@ -470,6 +471,16 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, > > > if (err != -EEXIST) > > > goto fail_put_swap; > > > > > > + /* Protect against a recursive call to __read_swap_cache_async() > > > > nit: insert new line before "Protect", see surrounding comments. > > > > > + * on the same entry waiting forever here because SWAP_HAS_CACHE > > > + * is set but the folio is not the swap cache yet. This can > > > + * happen today if mem_cgroup_swapin_charge_folio() below > > > + * triggers reclaim through zswap, which may call > > > + * __read_swap_cache_async() in the writeback path. > > > + */ > > > + if (skip_if_exists) > > > + goto fail_put_swap; > > > + > > > /* > > > * We might race against __delete_from_swap_cache(), and > > > * stumble across a swap_map entry whose SWAP_HAS_CACHE > > [..] > > > +/********************************* > > > +* lru functions > > > +**********************************/ > > > +static bool zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry) > > > +{ > > > + struct mem_cgroup *memcg = get_mem_cgroup_from_entry(entry); > > > + int nid = entry_to_nid(entry); > > > + bool added = list_lru_add(list_lru, &entry->lru, nid, memcg); > > > + > > > + mem_cgroup_put(memcg); > > > > Still not fond of the get/put pattern but okay.. > > Actually, Johannes and I took another look to see if we can replace > the memcg reference getting with just rcu_read_lock(). > > It seems there might be a race between zswap LRU manipulation > and memcg offlining - not just with the rcu_read_lock() idea, but also > with our current implementation! > > I'll shoot another email with more details later when I'm sure of it > one way or another... 
> > > > > > + return added; > > > +} > > > + > > > +static bool zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry) > > > +{ > > > + struct mem_cgroup *memcg = get_mem_cgroup_from_entry(entry); > > > + int nid = entry_to_nid(entry); > > > + bool removed = list_lru_del(list_lru, &entry->lru, nid, memcg); > > > + > > > + mem_cgroup_put(memcg); > > > + return removed; > > > +} > > > + > > > /********************************* > > > * rbtree functions > > > **********************************/ > > [..] > > > @@ -652,28 +679,37 @@ static int zswap_reclaim_entry(struct zswap_pool *pool) > > > */ > > > swpoffset = swp_offset(entry->swpentry); > > > tree = zswap_trees[swp_type(entry->swpentry)]; > > > - spin_unlock(&pool->lru_lock); > > > + list_lru_isolate(l, item); > > > + /* > > > + * It's safe to drop the lock here because we return either > > > + * LRU_REMOVED_RETRY or LRU_RETRY. > > > + */ > > > + spin_unlock(lock); > > > > > > /* Check for invalidate() race */ > > > spin_lock(&tree->lock); > > > - if (entry != zswap_rb_search(&tree->rbroot, swpoffset)) { > > > - ret = -EAGAIN; > > > + if (entry != zswap_rb_search(&tree->rbroot, swpoffset)) > > > goto unlock; > > > - } > > > + > > > /* Hold a reference to prevent a free during writeback */ > > > zswap_entry_get(entry); > > > spin_unlock(&tree->lock); > > > > > > - ret = zswap_writeback_entry(entry, tree); > > > + writeback_result = zswap_writeback_entry(entry, tree); > > > > > > spin_lock(&tree->lock); > > > - if (ret) { > > > - /* Writeback failed, put entry back on LRU */ > > > - spin_lock(&pool->lru_lock); > > > - list_move(&entry->lru, &pool->lru); > > > - spin_unlock(&pool->lru_lock); > > > + if (writeback_result) { > > > + zswap_reject_reclaim_fail++; > > > + memcg = get_mem_cgroup_from_entry(entry); > > > > Can this return NULL? Seems like we don't check the return in most/all places. > > I believe so, but memcg experts should fact check me on this. 
> It's roughly the same pattern as zswap charging/uncharging: > > obj_cgroup_uncharge_zswap(entry->objcg, entry->length) > -> getting memcg (under rcu_read_lock()) > > > > > > + spin_lock(lock); > > > + /* we cannot use zswap_lru_add here, because it increments node's lru count */ > > > + list_lru_putback(&entry->pool->list_lru, item, entry_to_nid(entry), memcg); > > > > Perhaps we can move this call with the memcg get/put to a helper like > > add/del? (e.g. zswap_lru_putback) > > > > We would need to move get_mem_cgroup_from_entry() into the lock but I > > think that's okay. > > We probably could, but that sounds like extra code for not a lot of gains, no? > > > > > > + spin_unlock(lock); > > > + mem_cgroup_put(memcg); > > > + ret = LRU_RETRY; > > > goto put_unlock; > > > } > > > + zswap_written_back_pages++; > > > > > > /* > > > * Writeback started successfully, the page now belongs to the > > > @@ -687,7 +723,34 @@ static int zswap_reclaim_entry(struct zswap_pool *pool) > > > zswap_entry_put(tree, entry); > > > unlock: > > > spin_unlock(&tree->lock); > > > - return ret ? -EAGAIN : 0; > > > + spin_lock(lock); > > > + return ret; > > > +} > > > + > > > +static int shrink_memcg(struct mem_cgroup *memcg) > > > +{ > > > + struct zswap_pool *pool; > > > + int nid, shrunk = 0; > > > + > > > + /* > > > + * Skip zombies because their LRUs are reparented and we would be > > > + * reclaiming from the parent instead of the dead memcg. > > > + */ > > > + if (memcg && !mem_cgroup_online(memcg)) > > > + return -ENOENT; > > > + > > > + pool = zswap_pool_current_get(); > > > + if (!pool) > > > + return -EINVAL; > > > + > > > + for_each_node_state(nid, N_NORMAL_MEMORY) { > > > + unsigned long nr_to_walk = 1; > > > + > > > + shrunk += list_lru_walk_one(&pool->list_lru, nid, memcg, > > > + &shrink_memcg_cb, NULL, &nr_to_walk); > > > + } > > > + zswap_pool_put(pool); > > > + return shrunk ? 
0 : -EAGAIN; > > > } > > > > > > static void shrink_worker(struct work_struct *w) > > > @@ -696,15 +759,17 @@ static void shrink_worker(struct work_struct *w) > > > shrink_work); > > > int ret, failures = 0; > > > > > > + /* global reclaim will select cgroup in a round-robin fashion. */ > > > do { > > > - ret = zswap_reclaim_entry(pool); > > > - if (ret) { > > > - zswap_reject_reclaim_fail++; > > > - if (ret != -EAGAIN) > > > - break; > > > - if (++failures == MAX_RECLAIM_RETRIES) > > > - break; > > > - } > > > + pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL); > > > > I think this can be a problem. We hold a ref to a memcg here until the > > next time we shrink, which can be a long time IIUC. This can cause the > > memcg to linger as a zombie. I understand it is one memcg per-zswap > > pool, but I am still unsure about it. > > > > MGLRU maintains a memcg LRU for global reclaim that gets properly > > cleaned up when a memcg is going away, so that's one option, although > > complicated. > > > > A second option would be to hold a pointer to the objcg instead, which > > should be less problematic (although we are still holding that objcg > > hostage indefinitely). The problem here is that if the objcg gets > > reparented, next time we will start at the parent of the memcg we > > stopped at last time, which tbh doesn't sound bad at all to me. > > > > A third option would be to flag the memcg such that when it is getting > > offlined we can call into zswap to reset pool->next_shrink (or move it > > to the parent) and drop the ref. Although synchronization can get > > hairy when racing with offlining. > : > > Not sure what's the right solution, but I prefer we don't hold any > > memcgs hostages indefinitely. I also think if we end up using > > mem_cgroup_iter() then there should be a mem_cgroup_iter_break() > > somewhere if/when breaking the iteration. 
> > > > I'm not sure if this is that big of a problem in the first place, but > if it is, doing something similar to MGLRU is probably the cleanest: > when the memcg is freed, trigger the zswap_exit_memcg() callback, > which will loop through all the zswap pools and update pool->next_shrink > where appropriate. Correction: I think I should have said "when the memcg is offlined" (which I assume is when we want to drop the memcg's reference so that zswap will not hold the cgroup hostage in the zombie stage). The remainder of the idea still holds of course. > > Note that we only have one pool per (compression algorithm x allocator) > combinations, so there cannot be that many pools, correct? > > Johannes suggests this idea to me (my apologies if I butcher it) > during one of our conversations. That sounds relatively easy IIUC. > > > > + > > > + ret = shrink_memcg(pool->next_shrink); > > > + > > > + if (ret == -EINVAL) > > > + break; > > > + if (ret && ++failures == MAX_RECLAIM_RETRIES) > > > + break; > > > + > > > cond_resched(); > > > } while (!zswap_can_accept()); > > > zswap_pool_put(pool); > > [..] > > > @@ -1233,15 +1301,15 @@ bool zswap_store(struct folio *folio) > > > zswap_invalidate_entry(tree, dupentry); > > > } > > > spin_unlock(&tree->lock); > > > - > > > - /* > > > - * XXX: zswap reclaim does not work with cgroups yet. Without a > > > - * cgroup-aware entry LRU, we will push out entries system-wide based on > > > - * local cgroup limits. > > > - */ > > > objcg = get_obj_cgroup_from_folio(folio); > > > - if (objcg && !obj_cgroup_may_zswap(objcg)) > > > - goto reject; > > > + if (objcg && !obj_cgroup_may_zswap(objcg)) { > > > + memcg = get_mem_cgroup_from_objcg(objcg); > > > + if (shrink_memcg(memcg)) { > > > + mem_cgroup_put(memcg); > > > + goto reject; > > > + } > > > + mem_cgroup_put(memcg); > > > > Here we choose to replicate mem_cgroup_put(). 
> > > > > + } > > > > > > /* reclaim space if needed */ > > > if (zswap_is_full()) { > > > @@ -1258,7 +1326,7 @@ bool zswap_store(struct folio *folio) > > > } > > > > > > /* allocate entry */ > > > - entry = zswap_entry_cache_alloc(GFP_KERNEL); > > > + entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page)); > > > if (!entry) { > > > zswap_reject_kmemcache_fail++; > > > goto reject; > > > @@ -1285,6 +1353,15 @@ bool zswap_store(struct folio *folio) > > > if (!entry->pool) > > > goto freepage; > > > > > > + if (objcg) { > > > + memcg = get_mem_cgroup_from_objcg(objcg); > > > + lru_alloc_ret = memcg_list_lru_alloc(memcg, &entry->pool->list_lru, GFP_KERNEL); > > > + mem_cgroup_put(memcg); > > > + > > > + if (lru_alloc_ret) > > > + goto put_pool; > > > + } > > > > Yet here we choose to have a single mem_cgroup_put() and stash the > > output in a variable. > > > > Consistency would be nice. > > No strong opinions here, but yeah we should fix it to make it > consistent. > > > > > > + > > > /* compress */ > > > acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); > > > > > > @@ -1361,9 +1438,8 @@ bool zswap_store(struct folio *folio) > > > zswap_invalidate_entry(tree, dupentry); > > > } > > > if (entry->length) { > > > - spin_lock(&entry->pool->lru_lock); > > > - list_add(&entry->lru, &entry->pool->lru); > > > - spin_unlock(&entry->pool->lru_lock); > > > + INIT_LIST_HEAD(&entry->lru); > > > + zswap_lru_add(&entry->pool->list_lru, entry); > > > } > > > spin_unlock(&tree->lock); > > > > > > @@ -1376,6 +1452,7 @@ bool zswap_store(struct folio *folio) > > > > > > put_dstmem: > > > mutex_unlock(acomp_ctx->mutex); > > > +put_pool: > > > zswap_pool_put(entry->pool); > > > freepage: > > > zswap_entry_cache_free(entry); > > > @@ -1470,9 +1547,8 @@ bool zswap_load(struct folio *folio) > > > zswap_invalidate_entry(tree, entry); > > > folio_mark_dirty(folio); > > > } else if (entry->length) { > > > - spin_lock(&entry->pool->lru_lock); > > > - list_move(&entry->lru, 
&entry->pool->lru); > > > - spin_unlock(&entry->pool->lru_lock); > > > + zswap_lru_del(&entry->pool->list_lru, entry); > > > + zswap_lru_add(&entry->pool->list_lru, entry); > > > > Can we use list_move_tail() here? (perhaps wrapped in a helper if needed). > > Maybe zswap_lru_move_tail()? :) > > FWIW, list_lru() interface does not have a move_tail function, (weird, I know). > So this seems to be the common pattern for LRU rotation with list_lru. > > > > > > } > > > zswap_entry_put(tree, entry); > > > spin_unlock(&tree->lock); > > > -- > > > 2.34.1
> > [..] > > > +/********************************* > > > +* lru functions > > > +**********************************/ > > > +static bool zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry) > > > +{ > > > + struct mem_cgroup *memcg = get_mem_cgroup_from_entry(entry); > > > + int nid = entry_to_nid(entry); > > > + bool added = list_lru_add(list_lru, &entry->lru, nid, memcg); > > > + > > > + mem_cgroup_put(memcg); > > > > Still not fond of the get/put pattern but okay.. > > Actually, Johannes and I took another look to see if we can replace > the memcg reference getting with just rcu_read_lock(). > > It seems there might be a race between zswap LRU manipulation > and memcg offlining - not just with the rcu_read_lock() idea, but also > with our current implementation! > > I'll shoot another email with more details later when I'm sure of it > one way or another... > Interesting, well at least something came out of my complaining :) > > [..] > > > @@ -652,28 +679,37 @@ static int zswap_reclaim_entry(struct zswap_pool *pool) > > > */ > > > swpoffset = swp_offset(entry->swpentry); > > > tree = zswap_trees[swp_type(entry->swpentry)]; > > > - spin_unlock(&pool->lru_lock); > > > + list_lru_isolate(l, item); > > > + /* > > > + * It's safe to drop the lock here because we return either > > > + * LRU_REMOVED_RETRY or LRU_RETRY. 
> > > + */ > > > + spin_unlock(lock); > > > > > > /* Check for invalidate() race */ > > > spin_lock(&tree->lock); > > > - if (entry != zswap_rb_search(&tree->rbroot, swpoffset)) { > > > - ret = -EAGAIN; > > > + if (entry != zswap_rb_search(&tree->rbroot, swpoffset)) > > > goto unlock; > > > - } > > > + > > > /* Hold a reference to prevent a free during writeback */ > > > zswap_entry_get(entry); > > > spin_unlock(&tree->lock); > > > > > > - ret = zswap_writeback_entry(entry, tree); > > > + writeback_result = zswap_writeback_entry(entry, tree); > > > > > > spin_lock(&tree->lock); > > > - if (ret) { > > > - /* Writeback failed, put entry back on LRU */ > > > - spin_lock(&pool->lru_lock); > > > - list_move(&entry->lru, &pool->lru); > > > - spin_unlock(&pool->lru_lock); > > > + if (writeback_result) { > > > + zswap_reject_reclaim_fail++; > > > + memcg = get_mem_cgroup_from_entry(entry); > > > > Can this return NULL? Seems like we don't check the return in most/all places. > > I believe so, but memcg experts should fact check me on this. If that's the case, there should be NULL checks, no? > It's roughly the same pattern as zswap charging/uncharging: > > obj_cgroup_uncharge_zswap(entry->objcg, entry->length) > -> getting memcg (under rcu_read_lock()) > > > > > > + spin_lock(lock); > > > + /* we cannot use zswap_lru_add here, because it increments node's lru count */ > > > + list_lru_putback(&entry->pool->list_lru, item, entry_to_nid(entry), memcg); > > > > Perhaps we can move this call with the memcg get/put to a helper like > > add/del? (e.g. zswap_lru_putback) > > > > We would need to move get_mem_cgroup_from_entry() into the lock but I > > think that's okay. > > We probably could, but that sounds like extra code for not a lot of gains, no? I don't feel strongly, just a fan of consistency. 
> > > > > > + spin_unlock(lock); > > > + mem_cgroup_put(memcg); > > > + ret = LRU_RETRY; > > > goto put_unlock; > > > } > > > + zswap_written_back_pages++; > > > > > > /* > > > * Writeback started successfully, the page now belongs to the [..] > > > @@ -696,15 +759,17 @@ static void shrink_worker(struct work_struct *w) > > > shrink_work); > > > int ret, failures = 0; > > > > > > + /* global reclaim will select cgroup in a round-robin fashion. */ > > > do { > > > - ret = zswap_reclaim_entry(pool); > > > - if (ret) { > > > - zswap_reject_reclaim_fail++; > > > - if (ret != -EAGAIN) > > > - break; > > > - if (++failures == MAX_RECLAIM_RETRIES) > > > - break; > > > - } > > > + pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL); > > > > I think this can be a problem. We hold a ref to a memcg here until the > > next time we shrink, which can be a long time IIUC. This can cause the > > memcg to linger as a zombie. I understand it is one memcg per-zswap > > pool, but I am still unsure about it. > > > > MGLRU maintains a memcg LRU for global reclaim that gets properly > > cleaned up when a memcg is going away, so that's one option, although > > complicated. > > > > A second option would be to hold a pointer to the objcg instead, which > > should be less problematic (although we are still holding that objcg > > hostage indefinitely). The problem here is that if the objcg gets > > reparented, next time we will start at the parent of the memcg we > > stopped at last time, which tbh doesn't sound bad at all to me. > > > > A third option would be to flag the memcg such that when it is getting > > offlined we can call into zswap to reset pool->next_shrink (or move it > > to the parent) and drop the ref. Although synchronization can get > > hairy when racing with offlining. > > > > Not sure what's the right solution, but I prefer we don't hold any > > memcgs hostages indefinitely. 
I also think if we end up using > > mem_cgroup_iter() then there should be a mem_cgroup_iter_break() > > somewhere if/when breaking the iteration. > > > > I'm not sure if this is that big of a problem in the first place, but > if it is, doing something similar to MGLRU is probably the cleanest: > when the memcg is freed, trigger the zswap_exit_memcg() callback, > which will loop through all the zswap pools and update pool->next_shrink > where appropriate. > > Note that we only have one pool per (compression algorithm x allocator) > combinations, so there cannot be that many pools, correct? > > Johannes suggests this idea to me (my apologies if I butcher it) > during one of our conversations. That sounds relatively easy IIUC. Be careful that there will be a race between memcg offlining and zswap's usage of pool->next_shrink. AFAICT there is no lock to prevent offlining so there will have to be some sort of dance here to make sure everything works correctly.
cc-ing Johannes, Roman, Shakeel, Muchun since you all know much more about memory controller + list_lru reparenting logic than me. There seems to be a race between memcg offlining and zswap’s cgroup-aware LRU implementation: CPU0 CPU1 zswap_lru_add() mem_cgroup_css_offline() get_mem_cgroup_from_objcg() memcg_offline_kmem() memcg_reparent_objcgs() memcg_reparent_list_lrus() memcg_reparent_list_lru() memcg_reparent_list_lru_node() list_lru_add() memcg_list_lru_free() Essentially: on CPU0, zswap gets the memcg from the entry's objcg (before the objcgs are reparented). Then it performs list_lru_add() after the list_lru entries reparenting (memcg_reparent_list_lru_node()) step. If the list_lru of the memcg being offlined has not been freed (i.e. before the memcg_list_lru_free() call), then the list_lru_add() call would succeed - but the list will be freed soon after. The new zswap entry as a result will not be subjected to future reclaim attempts. IOW, this list_lru_add() call is effectively swallowed. And worse, there might be a crash when we invalidate the zswap_entry in the future (which will perform a list_lru removal). Within get_mem_cgroup_from_objcg(), none of the following seem sufficient to prevent this race: 1. Perform the objcg-to-memcg lookup inside an rcu_read_lock() section. 2. Checking if the memcg is freed yet (with css_tryget()) (what we're currently doing in this patch series). 3. Checking if the memcg is still online (with css_tryget_online()) The memcg can still be offlined down the line. I've discussed this privately with Johannes, and it seems like the cleanest solution here is to move the reparenting logic down to the release stage. That way, when get_mem_cgroup_from_objcg() returns, zswap_lru_add() is given a memcg that is reparenting-safe (until we drop the obtained reference).
On Tue, Oct 31, 2023 at 6:26 PM Nhat Pham <nphamcs@gmail.com> wrote: > > cc-ing Johannes, Roman, Shakeel, Muchun since you all know much more > about memory controller + list_lru reparenting logic than me. > > There seems to be a race between memcg offlining and zswap’s > cgroup-aware LRU implementation: > > CPU0 CPU1 > zswap_lru_add() mem_cgroup_css_offline() > get_mem_cgroup_from_objcg() > memcg_offline_kmem() > memcg_reparent_objcgs() > memcg_reparent_list_lrus() > memcg_reparent_list_lru() > memcg_reparent_list_lru_node() > list_lru_add() > memcg_list_lru_free() > > > Essentially: on CPU0, zswap gets the memcg from the entry's objcg > (before the objcgs are reparented). Then it performs list_lru_add() > after the list_lru entries reparenting (memcg_reparent_list_lru_node()) > step. If the list_lru of the memcg being offlined has not been freed > (i.e before the memcg_list_lru_free() call), then the list_lru_add() > call would succeed - but the list will be freed soon after. The new > zswap entry as a result will not be subjected to future reclaim > attempt. IOW, this list_lru_add() call is effectively swallowed. And > worse, there might be a crash when we invalidate the zswap_entry in the > future (which will perform a list_lru removal). > > Within get_mem_cgroup_from_objcg(), none of the following seem > sufficient to prevent this race: > > 1. Perform the objcg-to-memcg lookup inside a rcu_read_lock() > section. > 2. Checking if the memcg is freed yet (with css_tryget()) (what > we're currently doing in this patch series). > 3. Checking if the memcg is still online (with css_tryget_online()) > The memcg can still be offlined down the line. > > > I've discussed this privately with Johannes, and it seems like the > cleanest solution here is to move the reparenting logic down to release > stage. That way, when get_mem_cgroup_from_objcg() returns, > zswap_lru_add() is given an memcg that is reparenting-safe (until we > drop the obtained reference). 
The objcgs hold refs to the memcg, which are dropped during reparenting. How can we do reparenting in the release stage, which IIUC happens after all refs are dropped?
On Tue, Oct 31, 2023 at 6:33 PM Yosry Ahmed <yosryahmed@google.com> wrote: > > On Tue, Oct 31, 2023 at 6:26 PM Nhat Pham <nphamcs@gmail.com> wrote: > > > > cc-ing Johannes, Roman, Shakeel, Muchun since you all know much more > > about memory controller + list_lru reparenting logic than me. > > > > There seems to be a race between memcg offlining and zswap’s > > cgroup-aware LRU implementation: > > > > CPU0 CPU1 > > zswap_lru_add() mem_cgroup_css_offline() > > get_mem_cgroup_from_objcg() > > memcg_offline_kmem() > > memcg_reparent_objcgs() > > memcg_reparent_list_lrus() > > memcg_reparent_list_lru() > > memcg_reparent_list_lru_node() > > list_lru_add() > > memcg_list_lru_free() > > > > > > Essentially: on CPU0, zswap gets the memcg from the entry's objcg > > (before the objcgs are reparented). Then it performs list_lru_add() > > after the list_lru entries reparenting (memcg_reparent_list_lru_node()) > > step. If the list_lru of the memcg being offlined has not been freed > > (i.e before the memcg_list_lru_free() call), then the list_lru_add() > > call would succeed - but the list will be freed soon after. The new > > zswap entry as a result will not be subjected to future reclaim > > attempt. IOW, this list_lru_add() call is effectively swallowed. And > > worse, there might be a crash when we invalidate the zswap_entry in the > > future (which will perform a list_lru removal). > > > > Within get_mem_cgroup_from_objcg(), none of the following seem > > sufficient to prevent this race: > > > > 1. Perform the objcg-to-memcg lookup inside a rcu_read_lock() > > section. > > 2. Checking if the memcg is freed yet (with css_tryget()) (what > > we're currently doing in this patch series). > > 3. Checking if the memcg is still online (with css_tryget_online()) > > The memcg can still be offlined down the line. > > > > > > I've discussed this privately with Johannes, and it seems like the > > cleanest solution here is to move the reparenting logic down to release > > stage. 
That way, when get_mem_cgroup_from_objcg() returns, > > zswap_lru_add() is given an memcg that is reparenting-safe (until we > > drop the obtained reference). > > The objcgs hold refs to the memcg, which are dropped during > reparenting. How can we do reparenting in the release stage, which > IIUC happens after all refs are dropped? Oh I meant just the list_lru reparenting. The list_lrus themselves don't hold any refs, I believe, right? Then it's safe to perform this at the release stage. (also, I think I might have messed up the encoding for the email above. Let me know if people cannot view it, and I'll resend it :( )
> On Nov 1, 2023, at 09:26, Nhat Pham <nphamcs@gmail.com> wrote: > > cc-ing Johannes, Roman, Shakeel, Muchun since you all know much more > about memory controller + list_lru reparenting logic than me. > > There seems to be a race between memcg offlining and zswap’s > cgroup-aware LRU implementation: > > CPU0 CPU1 > zswap_lru_add() mem_cgroup_css_offline() > get_mem_cgroup_from_objcg() > memcg_offline_kmem() > memcg_reparent_objcgs() > memcg_reparent_list_lrus() > memcg_reparent_list_lru() > memcg_reparent_list_lru_node() > list_lru_add() > memcg_list_lru_free() > > > Essentially: on CPU0, zswap gets the memcg from the entry's objcg > (before the objcgs are reparented). Then it performs list_lru_add() > after the list_lru entries reparenting (memcg_reparent_list_lru_node()) > step. If the list_lru of the memcg being offlined has not been freed > (i.e before the memcg_list_lru_free() call), then the list_lru_add() > call would succeed - but the list will be freed soon after. The new No worries. list_lru_add() will add the object to the lru list of the parent of the memcg being offlined, because the ->kmemcg_id of the memcg being offlined will be changed to its parent's ->kmemcg_id before memcg_reparent_list_lru(). Muchun, Thanks > zswap entry as a result will not be subjected to future reclaim > attempt. IOW, this list_lru_add() call is effectively swallowed. And > worse, there might be a crash when we invalidate the zswap_entry in the > future (which will perform a list_lru removal). > > Within get_mem_cgroup_from_objcg(), none of the following seem > sufficient to prevent this race: > > 1. Perform the objcg-to-memcg lookup inside a rcu_read_lock() > section. > 2. Checking if the memcg is freed yet (with css_tryget()) (what > we're currently doing in this patch series). > 3. Checking if the memcg is still online (with css_tryget_online()) > The memcg can still be offlined down the line. 
> > > I've discussed this privately with Johannes, and it seems like the > cleanest solution here is to move the reparenting logic down to release > stage. That way, when get_mem_cgroup_from_objcg() returns, > zswap_lru_add() is given an memcg that is reparenting-safe (until we > drop the obtained reference).
On Tue, Oct 31, 2023 at 8:07 PM Muchun Song <muchun.song@linux.dev> wrote: > > > > > On Nov 1, 2023, at 09:26, Nhat Pham <nphamcs@gmail.com> wrote: > > > > cc-ing Johannes, Roman, Shakeel, Muchun since you all know much more > > about memory controller + list_lru reparenting logic than me. > > > > There seems to be a race between memcg offlining and zswap’s > > cgroup-aware LRU implementation: > > > > CPU0 CPU1 > > zswap_lru_add() mem_cgroup_css_offline() > > get_mem_cgroup_from_objcg() > > memcg_offline_kmem() > > memcg_reparent_objcgs() > > memcg_reparent_list_lrus() > > memcg_reparent_list_lru() > > memcg_reparent_list_lru_node() > > list_lru_add() > > memcg_list_lru_free() > > > > > > Essentially: on CPU0, zswap gets the memcg from the entry's objcg > > (before the objcgs are reparented). Then it performs list_lru_add() > > after the list_lru entries reparenting (memcg_reparent_list_lru_node()) > > step. If the list_lru of the memcg being offlined has not been freed > > (i.e before the memcg_list_lru_free() call), then the list_lru_add() > > call would succeed - but the list will be freed soon after. The new > > No worries. list_lru_add() will add the object to the lru list of > the parent of the memcg being offlined, because the ->kmemcg_id of the > memcg being offlined will be changed to its parent's ->kmemcg_id before memcg_reparent_list_lru(). > Ohhh that is subtle. Thanks for pointing this out, Muchun! In that case, I think Yosry is right after all! We don't even need to get a reference to the memcg: rcu_read_lock(); memcg = obj_cgroup_memcg(objcg); list_lru_add(); rcu_read_unlock(); As long as we're inside this rcu section, we're guaranteed to get an un-freed memcg. Now it could be offlined etc., but as Muchun has pointed out, the list_lru_add() call will still do the right thing - it will either add the new entry to the parent list if this happens after the kmemcg_id update, or the child list before the list_lru reparenting action. 
Both of these scenarios are fine. > Muchun, > Thanks > > > zswap entry as a result will not be subjected to future reclaim > > attempt. IOW, this list_lru_add() call is effectively swallowed. And > > worse, there might be a crash when we invalidate the zswap_entry in the > > future (which will perform a list_lru removal). > > > > Within get_mem_cgroup_from_objcg(), none of the following seem > > sufficient to prevent this race: > > > > 1. Perform the objcg-to-memcg lookup inside a rcu_read_lock() > > section. > > 2. Checking if the memcg is freed yet (with css_tryget()) (what > > we're currently doing in this patch series). > > 3. Checking if the memcg is still online (with css_tryget_online()) > > The memcg can still be offlined down the line. > > > > > > I've discussed this privately with Johannes, and it seems like the > > cleanest solution here is to move the reparenting logic down to release > > stage. That way, when get_mem_cgroup_from_objcg() returns, > > zswap_lru_add() is given an memcg that is reparenting-safe (until we > > drop the obtained reference). >
> On Nov 2, 2023, at 01:44, Nhat Pham <nphamcs@gmail.com> wrote: > > On Tue, Oct 31, 2023 at 8:07 PM Muchun Song <muchun.song@linux.dev> wrote: >> >> >> >>> On Nov 1, 2023, at 09:26, Nhat Pham <nphamcs@gmail.com> wrote: >>> >>> cc-ing Johannes, Roman, Shakeel, Muchun since you all know much more >>> about memory controller + list_lru reparenting logic than me. >>> >>> There seems to be a race between memcg offlining and zswap’s >>> cgroup-aware LRU implementation: >>> >>> CPU0 CPU1 >>> zswap_lru_add() mem_cgroup_css_offline() >>> get_mem_cgroup_from_objcg() >>> memcg_offline_kmem() >>> memcg_reparent_objcgs() >>> memcg_reparent_list_lrus() >>> memcg_reparent_list_lru() >>> memcg_reparent_list_lru_node() >>> list_lru_add() >>> memcg_list_lru_free() >>> >>> >>> Essentially: on CPU0, zswap gets the memcg from the entry's objcg >>> (before the objcgs are reparented). Then it performs list_lru_add() >>> after the list_lru entries reparenting (memcg_reparent_list_lru_node()) >>> step. If the list_lru of the memcg being offlined has not been freed >>> (i.e before the memcg_list_lru_free() call), then the list_lru_add() >>> call would succeed - but the list will be freed soon after. The new >> >> No worries. list_lru_add() will add the object to the lru list of >> the parent of the memcg being offlined, because the ->kmemcg_id of the >> memcg being offlined will be changed to its parent's ->kmemcg_id before memcg_reparent_list_lru(). >> > > Ohhh that is subtle. Thanks for pointing this out, Muchun! > > In that case, I think Yosry is right after all! We don't even need to get > a reference to the memcg: > > rcu_read_lock(); > memcg = obj_cgroup_memcg(objcg); > list_lru_add(); > rcu_read_unlock(); > > As long as we're inside this rcu section, we're guaranteed to get > an un-freed memcg. Now it could be offlined etc., but as Muchun has Right. Thanks. 
> pointed out, the list_lru_add() call will still does the right thing - it will > either add the new entry to the parent list if this happens after the > kmemcg_id update, or the child list before the list_lru reparenting > action. Both of these scenarios are fine. > >> Muchun, >> Thanks >> >>> zswap entry as a result will not be subjected to future reclaim >>> attempt. IOW, this list_lru_add() call is effectively swallowed. And >>> worse, there might be a crash when we invalidate the zswap_entry in the >>> future (which will perform a list_lru removal). >>> >>> Within get_mem_cgroup_from_objcg(), none of the following seem >>> sufficient to prevent this race: >>> >>> 1. Perform the objcg-to-memcg lookup inside a rcu_read_lock() >>> section. >>> 2. Checking if the memcg is freed yet (with css_tryget()) (what >>> we're currently doing in this patch series). >>> 3. Checking if the memcg is still online (with css_tryget_online()) >>> The memcg can still be offlined down the line. >>> >>> >>> I've discussed this privately with Johannes, and it seems like the >>> cleanest solution here is to move the reparenting logic down to release >>> stage. That way, when get_mem_cgroup_from_objcg() returns, >>> zswap_lru_add() is given an memcg that is reparenting-safe (until we >>> drop the obtained reference).
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 6edd3ec4d8d5..c1846e57011b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1187,6 +1187,11 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page) return NULL; } +static inline struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg) +{ + return NULL; +} + static inline bool folio_memcg_kmem(struct folio *folio) { return false; diff --git a/mm/swap.h b/mm/swap.h index 73c332ee4d91..c0dc73e10e91 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -51,7 +51,8 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct swap_iocb **plug); struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct mempolicy *mpol, pgoff_t ilx, - bool *new_page_allocated); + bool *new_page_allocated, + bool skip_if_exists); struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag, struct mempolicy *mpol, pgoff_t ilx); struct page *swapin_readahead(swp_entry_t entry, gfp_t flag, diff --git a/mm/swap_state.c b/mm/swap_state.c index 85d9e5806a6a..040639e1c77e 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -412,7 +412,8 @@ struct folio *filemap_get_incore_folio(struct address_space *mapping, struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct mempolicy *mpol, pgoff_t ilx, - bool *new_page_allocated) + bool *new_page_allocated, + bool skip_if_exists) { struct swap_info_struct *si; struct folio *folio; @@ -470,6 +471,16 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, if (err != -EEXIST) goto fail_put_swap; + /* Protect against a recursive call to __read_swap_cache_async() + * on the same entry waiting forever here because SWAP_HAS_CACHE + * is set but the folio is not the swap cache yet. This can + * happen today if mem_cgroup_swapin_charge_folio() below + * triggers reclaim through zswap, which may call + * __read_swap_cache_async() in the writeback path. 
+ */ + if (skip_if_exists) + goto fail_put_swap; + /* * We might race against __delete_from_swap_cache(), and * stumble across a swap_map entry whose SWAP_HAS_CACHE @@ -537,7 +548,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, mpol = get_vma_policy(vma, addr, 0, &ilx); page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, - &page_allocated); + &page_allocated, false); mpol_cond_put(mpol); if (page_allocated) @@ -654,7 +665,7 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, /* Ok, do the async read-ahead now */ page = __read_swap_cache_async( swp_entry(swp_type(entry), offset), - gfp_mask, mpol, ilx, &page_allocated); + gfp_mask, mpol, ilx, &page_allocated, false); if (!page) continue; if (page_allocated) { @@ -672,7 +683,7 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, skip: /* The page was likely read above, so no need for plugging here */ page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, - &page_allocated); + &page_allocated, false); if (unlikely(page_allocated)) swap_readpage(page, false, NULL); return page; @@ -827,7 +838,7 @@ static struct page *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, pte_unmap(pte); pte = NULL; page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, - &page_allocated); + &page_allocated, false); if (!page) continue; if (page_allocated) { @@ -847,7 +858,7 @@ static struct page *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, skip: /* The page was likely read above, so no need for plugging here */ page = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx, - &page_allocated); + &page_allocated, false); if (unlikely(page_allocated)) swap_readpage(page, false, NULL); return page; diff --git a/mm/zswap.c b/mm/zswap.c index 2e691cd1a466..ee8e227e7b0b 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -35,6 +35,7 @@ #include <linux/writeback.h> #include <linux/pagemap.h> #include <linux/workqueue.h> +#include 
<linux/list_lru.h> #include "swap.h" #include "internal.h" @@ -172,8 +173,8 @@ struct zswap_pool { struct work_struct shrink_work; struct hlist_node node; char tfm_name[CRYPTO_MAX_ALG_NAME]; - struct list_head lru; - spinlock_t lru_lock; + struct list_lru list_lru; + struct mem_cgroup *next_shrink; }; /* @@ -289,15 +290,25 @@ static void zswap_update_total_size(void) zswap_pool_total_size = total; } +static inline struct mem_cgroup *get_mem_cgroup_from_entry(struct zswap_entry *entry) +{ + return entry->objcg ? get_mem_cgroup_from_objcg(entry->objcg) : NULL; +} + +static inline int entry_to_nid(struct zswap_entry *entry) +{ + return page_to_nid(virt_to_page(entry)); +} + /********************************* * zswap entry functions **********************************/ static struct kmem_cache *zswap_entry_cache; -static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp) +static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid) { struct zswap_entry *entry; - entry = kmem_cache_alloc(zswap_entry_cache, gfp); + entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid); if (!entry) return NULL; entry->refcount = 1; @@ -310,6 +321,29 @@ static void zswap_entry_cache_free(struct zswap_entry *entry) kmem_cache_free(zswap_entry_cache, entry); } +/********************************* +* lru functions +**********************************/ +static bool zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry) +{ + struct mem_cgroup *memcg = get_mem_cgroup_from_entry(entry); + int nid = entry_to_nid(entry); + bool added = list_lru_add(list_lru, &entry->lru, nid, memcg); + + mem_cgroup_put(memcg); + return added; +} + +static bool zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry) +{ + struct mem_cgroup *memcg = get_mem_cgroup_from_entry(entry); + int nid = entry_to_nid(entry); + bool removed = list_lru_del(list_lru, &entry->lru, nid, memcg); + + mem_cgroup_put(memcg); + return removed; +} + /********************************* * rbtree 
functions **********************************/ @@ -394,9 +428,7 @@ static void zswap_free_entry(struct zswap_entry *entry) if (!entry->length) atomic_dec(&zswap_same_filled_pages); else { - spin_lock(&entry->pool->lru_lock); - list_del(&entry->lru); - spin_unlock(&entry->pool->lru_lock); + zswap_lru_del(&entry->pool->list_lru, entry); zpool_free(zswap_find_zpool(entry), entry->handle); zswap_pool_put(entry->pool); } @@ -630,21 +662,16 @@ static void zswap_invalidate_entry(struct zswap_tree *tree, zswap_entry_put(tree, entry); } -static int zswap_reclaim_entry(struct zswap_pool *pool) +static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l, + spinlock_t *lock, void *arg) { - struct zswap_entry *entry; + struct zswap_entry *entry = container_of(item, struct zswap_entry, lru); + struct mem_cgroup *memcg; struct zswap_tree *tree; pgoff_t swpoffset; - int ret; + enum lru_status ret = LRU_REMOVED_RETRY; + int writeback_result; - /* Get an entry off the LRU */ - spin_lock(&pool->lru_lock); - if (list_empty(&pool->lru)) { - spin_unlock(&pool->lru_lock); - return -EINVAL; - } - entry = list_last_entry(&pool->lru, struct zswap_entry, lru); - list_del_init(&entry->lru); /* * Once the lru lock is dropped, the entry might get freed. The * swpoffset is copied to the stack, and entry isn't deref'd again @@ -652,28 +679,37 @@ static int zswap_reclaim_entry(struct zswap_pool *pool) */ swpoffset = swp_offset(entry->swpentry); tree = zswap_trees[swp_type(entry->swpentry)]; - spin_unlock(&pool->lru_lock); + list_lru_isolate(l, item); + /* + * It's safe to drop the lock here because we return either + * LRU_REMOVED_RETRY or LRU_RETRY. 
+ */ + spin_unlock(lock); /* Check for invalidate() race */ spin_lock(&tree->lock); - if (entry != zswap_rb_search(&tree->rbroot, swpoffset)) { - ret = -EAGAIN; + if (entry != zswap_rb_search(&tree->rbroot, swpoffset)) goto unlock; - } + /* Hold a reference to prevent a free during writeback */ zswap_entry_get(entry); spin_unlock(&tree->lock); - ret = zswap_writeback_entry(entry, tree); + writeback_result = zswap_writeback_entry(entry, tree); spin_lock(&tree->lock); - if (ret) { - /* Writeback failed, put entry back on LRU */ - spin_lock(&pool->lru_lock); - list_move(&entry->lru, &pool->lru); - spin_unlock(&pool->lru_lock); + if (writeback_result) { + zswap_reject_reclaim_fail++; + memcg = get_mem_cgroup_from_entry(entry); + spin_lock(lock); + /* we cannot use zswap_lru_add here, because it increments node's lru count */ + list_lru_putback(&entry->pool->list_lru, item, entry_to_nid(entry), memcg); + spin_unlock(lock); + mem_cgroup_put(memcg); + ret = LRU_RETRY; goto put_unlock; } + zswap_written_back_pages++; /* * Writeback started successfully, the page now belongs to the @@ -687,7 +723,34 @@ static int zswap_reclaim_entry(struct zswap_pool *pool) zswap_entry_put(tree, entry); unlock: spin_unlock(&tree->lock); - return ret ? -EAGAIN : 0; + spin_lock(lock); + return ret; +} + +static int shrink_memcg(struct mem_cgroup *memcg) +{ + struct zswap_pool *pool; + int nid, shrunk = 0; + + /* + * Skip zombies because their LRUs are reparented and we would be + * reclaiming from the parent instead of the dead memcg. + */ + if (memcg && !mem_cgroup_online(memcg)) + return -ENOENT; + + pool = zswap_pool_current_get(); + if (!pool) + return -EINVAL; + + for_each_node_state(nid, N_NORMAL_MEMORY) { + unsigned long nr_to_walk = 1; + + shrunk += list_lru_walk_one(&pool->list_lru, nid, memcg, + &shrink_memcg_cb, NULL, &nr_to_walk); + } + zswap_pool_put(pool); + return shrunk ? 
0 : -EAGAIN; } static void shrink_worker(struct work_struct *w) @@ -696,15 +759,17 @@ static void shrink_worker(struct work_struct *w) shrink_work); int ret, failures = 0; + /* global reclaim will select cgroup in a round-robin fashion. */ do { - ret = zswap_reclaim_entry(pool); - if (ret) { - zswap_reject_reclaim_fail++; - if (ret != -EAGAIN) - break; - if (++failures == MAX_RECLAIM_RETRIES) - break; - } + pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL); + + ret = shrink_memcg(pool->next_shrink); + + if (ret == -EINVAL) + break; + if (ret && ++failures == MAX_RECLAIM_RETRIES) + break; + cond_resched(); } while (!zswap_can_accept()); zswap_pool_put(pool); @@ -765,8 +830,7 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) */ kref_init(&pool->kref); INIT_LIST_HEAD(&pool->list); - INIT_LIST_HEAD(&pool->lru); - spin_lock_init(&pool->lru_lock); + list_lru_init_memcg(&pool->list_lru, NULL); INIT_WORK(&pool->shrink_work, shrink_worker); zswap_pool_debug("created", pool); @@ -832,6 +896,9 @@ static void zswap_pool_destroy(struct zswap_pool *pool) cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); free_percpu(pool->acomp_ctx); + list_lru_destroy(&pool->list_lru); + if (pool->next_shrink) + mem_cgroup_put(pool->next_shrink); for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) zpool_destroy_pool(pool->zpools[i]); kfree(pool); @@ -1079,7 +1146,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry, /* try to allocate swap cache page */ mpol = get_task_policy(current); page = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol, - NO_INTERLEAVE_INDEX, &page_was_allocated); + NO_INTERLEAVE_INDEX, &page_was_allocated, true); if (!page) { ret = -ENOMEM; goto fail; @@ -1145,7 +1212,6 @@ static int zswap_writeback_entry(struct zswap_entry *entry, /* start writeback */ __swap_writepage(page, &wbc); put_page(page); - zswap_written_back_pages++; return ret; @@ -1202,8 +1268,10 @@ bool zswap_store(struct folio *folio) struct 
scatterlist input, output; struct crypto_acomp_ctx *acomp_ctx; struct obj_cgroup *objcg = NULL; + struct mem_cgroup *memcg = NULL; struct zswap_pool *pool; struct zpool *zpool; + int lru_alloc_ret; unsigned int dlen = PAGE_SIZE; unsigned long handle, value; char *buf; @@ -1233,15 +1301,15 @@ bool zswap_store(struct folio *folio) zswap_invalidate_entry(tree, dupentry); } spin_unlock(&tree->lock); - - /* - * XXX: zswap reclaim does not work with cgroups yet. Without a - * cgroup-aware entry LRU, we will push out entries system-wide based on - * local cgroup limits. - */ objcg = get_obj_cgroup_from_folio(folio); - if (objcg && !obj_cgroup_may_zswap(objcg)) - goto reject; + if (objcg && !obj_cgroup_may_zswap(objcg)) { + memcg = get_mem_cgroup_from_objcg(objcg); + if (shrink_memcg(memcg)) { + mem_cgroup_put(memcg); + goto reject; + } + mem_cgroup_put(memcg); + } /* reclaim space if needed */ if (zswap_is_full()) { @@ -1258,7 +1326,7 @@ bool zswap_store(struct folio *folio) } /* allocate entry */ - entry = zswap_entry_cache_alloc(GFP_KERNEL); + entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page)); if (!entry) { zswap_reject_kmemcache_fail++; goto reject; @@ -1285,6 +1353,15 @@ bool zswap_store(struct folio *folio) if (!entry->pool) goto freepage; + if (objcg) { + memcg = get_mem_cgroup_from_objcg(objcg); + lru_alloc_ret = memcg_list_lru_alloc(memcg, &entry->pool->list_lru, GFP_KERNEL); + mem_cgroup_put(memcg); + + if (lru_alloc_ret) + goto put_pool; + } + /* compress */ acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); @@ -1361,9 +1438,8 @@ bool zswap_store(struct folio *folio) zswap_invalidate_entry(tree, dupentry); } if (entry->length) { - spin_lock(&entry->pool->lru_lock); - list_add(&entry->lru, &entry->pool->lru); - spin_unlock(&entry->pool->lru_lock); + INIT_LIST_HEAD(&entry->lru); + zswap_lru_add(&entry->pool->list_lru, entry); } spin_unlock(&tree->lock); @@ -1376,6 +1452,7 @@ bool zswap_store(struct folio *folio) put_dstmem: 
mutex_unlock(acomp_ctx->mutex); +put_pool: zswap_pool_put(entry->pool); freepage: zswap_entry_cache_free(entry); @@ -1470,9 +1547,8 @@ bool zswap_load(struct folio *folio) zswap_invalidate_entry(tree, entry); folio_mark_dirty(folio); } else if (entry->length) { - spin_lock(&entry->pool->lru_lock); - list_move(&entry->lru, &entry->pool->lru); - spin_unlock(&entry->pool->lru_lock); + zswap_lru_del(&entry->pool->list_lru, entry); + zswap_lru_add(&entry->pool->list_lru, entry); } zswap_entry_put(tree, entry); spin_unlock(&tree->lock);