Message ID | 1604566549-62481-18-git-send-email-alex.shi@linux.alibaba.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | per memcg lru lock | expand |
在 2020/11/5 下午4:55, Alex Shi 写道: > /** > + * lock_page_lruvec - lock and return lruvec for a given page. > + * @page: the page > + * > + * This series functions should be used in either conditions: > + * PageLRU is cleared or unset > + * or page is locked. or page->_refcount is zero. Ops, here is a typo, we need to add back above line. so the patch updated here: From 9f187d04c7ba62bb7e07c07733b2848f155961f6 Mon Sep 17 00:00:00 2001 From: Alex Shi <alex.shi@linux.alibaba.com> Date: Tue, 18 Aug 2020 16:44:21 +0800 Subject: [PATCH v21 17/19] mm/lru: replace pgdat lru_lock with lruvec lock This patch moves per node lru_lock into lruvec, thus bring a lru_lock for each of memcg per node. So on a large machine, each of memcg don't have to suffer from per node pgdat->lru_lock competition. They could go fast with their self lru_lock. After move memcg charge before lru inserting, page isolation could serialize page's memcg, then per memcg lruvec lock is stable and could replace per node lru lock. In func isolate_migratepages_block, compact_unlock_should_abort and lock_page_lruvec_irqsave are open coded to work with compact_control. Also add a debug func in locking which may give some clues if there are sth out of hands. Daniel Jordan's testing show 62% improvement on modified readtwice case on his 2P * 10 core * 2 HT broadwell box. https://lore.kernel.org/lkml/20200915165807.kpp7uhiw7l3loofu@ca-dmjordan1.us.oracle.com/ On a large machine with memcg enabled but not used, the page's lruvec seeking pass a few pointers, that may lead to lru_lock holding time increase and a bit regression. Hugh Dickins helped on the patch polish, thanks! Signed-off-by: Alex Shi <alex.shi@linux.alibaba.com> Acked-by: Hugh Dickins <hughd@google.com> Cc: Rong Chen <rong.a.chen@intel.com> Cc: Hugh Dickins <hughd@google.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Michal Hocko <mhocko@kernel.org> Cc: Vladimir Davydov <vdavydov.dev@gmail.com> Cc: Yang Shi <yang.shi@linux.alibaba.com> Cc: Matthew Wilcox <willy@infradead.org> Cc: Konstantin Khlebnikov <khlebnikov@yandex-team.ru> Cc: Tejun Heo <tj@kernel.org> Cc: linux-kernel@vger.kernel.org Cc: linux-mm@kvack.org Cc: cgroups@vger.kernel.org --- include/linux/memcontrol.h | 58 +++++++++++++++++++++++ include/linux/mmzone.h | 3 +- mm/compaction.c | 56 ++++++++++++++-------- mm/huge_memory.c | 11 ++--- mm/memcontrol.c | 74 +++++++++++++++++++++++++++-- mm/mlock.c | 22 ++++++--- mm/mmzone.c | 1 + mm/page_alloc.c | 1 - mm/swap.c | 116 ++++++++++++++++++++++----------------------- mm/vmscan.c | 55 ++++++++++----------- 10 files changed, 271 insertions(+), 126 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 0f4dd7829fb2..6ecb08ff4ad1 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -666,6 +666,19 @@ static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg, struct mem_cgroup *get_mem_cgroup_from_page(struct page *page); +struct lruvec *lock_page_lruvec(struct page *page); +struct lruvec *lock_page_lruvec_irq(struct page *page); +struct lruvec *lock_page_lruvec_irqsave(struct page *page, + unsigned long *flags); + +#ifdef CONFIG_DEBUG_VM +void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page); +#else +static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page) +{ +} +#endif + static inline struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){ return css ? container_of(css, struct mem_cgroup, css) : NULL; @@ -1233,6 +1246,31 @@ static inline void mem_cgroup_put(struct mem_cgroup *memcg) { } +static inline struct lruvec *lock_page_lruvec(struct page *page) +{ + struct pglist_data *pgdat = page_pgdat(page); + + spin_lock(&pgdat->__lruvec.lru_lock); + return &pgdat->__lruvec; +} + +static inline struct lruvec *lock_page_lruvec_irq(struct page *page) +{ + struct pglist_data *pgdat = page_pgdat(page); + + spin_lock_irq(&pgdat->__lruvec.lru_lock); + return &pgdat->__lruvec; +} + +static inline struct lruvec *lock_page_lruvec_irqsave(struct page *page, + unsigned long *flagsp) +{ + struct pglist_data *pgdat = page_pgdat(page); + + spin_lock_irqsave(&pgdat->__lruvec.lru_lock, *flagsp); + return &pgdat->__lruvec; +} + static inline struct mem_cgroup * mem_cgroup_iter(struct mem_cgroup *root, struct mem_cgroup *prev, @@ -1476,6 +1514,10 @@ static inline void count_memcg_page_event(struct page *page, void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) { } + +static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page) +{ +} #endif /* CONFIG_MEMCG */ /* idx can be of type enum memcg_stat_item or node_stat_item */ @@ -1605,6 +1647,22 @@ static inline struct lruvec *parent_lruvec(struct lruvec *lruvec) return mem_cgroup_lruvec(memcg, lruvec_pgdat(lruvec)); } +static inline void unlock_page_lruvec(struct lruvec *lruvec) +{ + spin_unlock(&lruvec->lru_lock); +} + +static inline void unlock_page_lruvec_irq(struct lruvec *lruvec) +{ + spin_unlock_irq(&lruvec->lru_lock); +} + +static inline void unlock_page_lruvec_irqrestore(struct lruvec *lruvec, + unsigned long flags) +{ + spin_unlock_irqrestore(&lruvec->lru_lock, flags); +} + #ifdef CONFIG_CGROUP_WRITEBACK struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index fb3bf696c05e..0afba4ea2a21 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -276,6 +276,8 @@ enum lruvec_flags { struct lruvec { struct list_head lists[NR_LRU_LISTS]; + /* per lruvec lru_lock for memcg */ + spinlock_t lru_lock; /* * These track the cost of reclaiming one LRU - file or anon - * over the other. As the observed cost of reclaiming one LRU @@ -796,7 +798,6 @@ struct deferred_split { /* Write-intensive fields used by page reclaim */ ZONE_PADDING(_pad1_) - spinlock_t lru_lock; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* diff --git a/mm/compaction.c b/mm/compaction.c index 7b1cf48884dd..9cfe90961493 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -804,7 +804,7 @@ static bool too_many_isolated(pg_data_t *pgdat) unsigned long nr_scanned = 0, nr_isolated = 0; struct lruvec *lruvec; unsigned long flags = 0; - bool locked = false; + struct lruvec *locked = NULL; struct page *page = NULL, *valid_page = NULL; unsigned long start_pfn = low_pfn; bool skip_on_failure = false; @@ -864,11 +864,20 @@ static bool too_many_isolated(pg_data_t *pgdat) * contention, to give chance to IRQs. Abort completely if * a fatal signal is pending. */ - if (!(low_pfn % SWAP_CLUSTER_MAX) - && compact_unlock_should_abort(&pgdat->lru_lock, - flags, &locked, cc)) { - low_pfn = 0; - goto fatal_pending; + if (!(low_pfn % SWAP_CLUSTER_MAX)) { + if (locked) { + unlock_page_lruvec_irqrestore(locked, flags); + locked = NULL; + } + + if (fatal_signal_pending(current)) { + cc->contended = true; + + low_pfn = 0; + goto fatal_pending; + } + + cond_resched(); } if (!pfn_valid_within(low_pfn)) @@ -940,9 +949,8 @@ static bool too_many_isolated(pg_data_t *pgdat) if (unlikely(__PageMovable(page)) && !PageIsolated(page)) { if (locked) { - spin_unlock_irqrestore(&pgdat->lru_lock, - flags); - locked = false; + unlock_page_lruvec_irqrestore(locked, flags); + locked = NULL; } if (!isolate_movable_page(page, isolate_mode)) @@ -983,10 +991,19 @@ static bool too_many_isolated(pg_data_t *pgdat) if (!TestClearPageLRU(page)) goto isolate_fail_put; + rcu_read_lock(); + lruvec = mem_cgroup_page_lruvec(page, pgdat); + /* If we already hold the lock, we can skip some rechecking */ - if (!locked) { - locked = compact_lock_irqsave(&pgdat->lru_lock, - &flags, cc); + if (lruvec != locked) { + if (locked) + unlock_page_lruvec_irqrestore(locked, flags); + + compact_lock_irqsave(&lruvec->lru_lock, &flags, cc); + locked = lruvec; + rcu_read_unlock(); + + lruvec_memcg_debug(lruvec, page); /* Try get exclusive access under lock */ if (!skip_updated) { @@ -1005,9 +1022,8 @@ static bool too_many_isolated(pg_data_t *pgdat) SetPageLRU(page); goto isolate_fail_put; } - } - - lruvec = mem_cgroup_page_lruvec(page, pgdat); + } else + rcu_read_unlock(); /* The whole page is taken off the LRU; skip the tail pages. */ if (PageCompound(page)) @@ -1041,8 +1057,8 @@ static bool too_many_isolated(pg_data_t *pgdat) isolate_fail_put: /* Avoid potential deadlock in freeing page under lru_lock */ if (locked) { - spin_unlock_irqrestore(&pgdat->lru_lock, flags); - locked = false; + unlock_page_lruvec_irqrestore(locked, flags); + locked = NULL; } put_page(page); @@ -1057,8 +1073,8 @@ static bool too_many_isolated(pg_data_t *pgdat) */ if (nr_isolated) { if (locked) { - spin_unlock_irqrestore(&pgdat->lru_lock, flags); - locked = false; + unlock_page_lruvec_irqrestore(locked, flags); + locked = NULL; } putback_movable_pages(&cc->migratepages); cc->nr_migratepages = 0; @@ -1086,7 +1102,7 @@ static bool too_many_isolated(pg_data_t *pgdat) isolate_abort: if (locked) - spin_unlock_irqrestore(&pgdat->lru_lock, flags); + unlock_page_lruvec_irqrestore(locked, flags); if (page) { SetPageLRU(page); put_page(page); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b70ec0c6076b..94e42dba052a 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2354,7 +2354,7 @@ static void lru_add_page_tail(struct page *head, struct page *tail, VM_BUG_ON_PAGE(!PageHead(head), head); VM_BUG_ON_PAGE(PageCompound(tail), head); VM_BUG_ON_PAGE(PageLRU(tail), head); - lockdep_assert_held(&lruvec_pgdat(lruvec)->lru_lock); + lockdep_assert_held(&lruvec->lru_lock); if (list) { /* page reclaim is reclaiming a huge page */ @@ -2438,7 +2438,6 @@ static void __split_huge_page(struct page *page, struct list_head *list, pgoff_t end) { struct page *head = compound_head(page); - pg_data_t *pgdat = page_pgdat(head); struct lruvec *lruvec; struct address_space *swap_cache = NULL; unsigned long offset = 0; @@ -2456,10 +2455,8 @@ static void __split_huge_page(struct page *page, struct list_head *list, xa_lock(&swap_cache->i_pages); } - /* prevent PageLRU to go away from under us, and freeze lru stats */ - spin_lock(&pgdat->lru_lock); - - lruvec = mem_cgroup_page_lruvec(head, pgdat); + /* lock lru list/PageCompound, ref freezed by page_ref_freeze */ + lruvec = lock_page_lruvec(head); for (i = nr - 1; i >= 1; i--) { __split_huge_page_tail(head, i, lruvec, list); @@ -2480,7 +2477,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, } ClearPageCompound(head); - spin_unlock(&pgdat->lru_lock); + unlock_page_lruvec(lruvec); /* Caller disabled irqs, so they are still disabled here */ split_page_owner(head, nr); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 157b745031a4..591f9f9ca8b2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -20,6 +20,9 @@ * Lockless page tracking & accounting * Unified hierarchy configuration model * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner + * + * Per memcg lru locking + * Copyright (C) 2020 Alibaba, Inc, Alex Shi */ #include <linux/page_counter.h> @@ -1305,6 +1308,19 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, return ret; } +#ifdef CONFIG_DEBUG_VM +void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page) +{ + if (mem_cgroup_disabled()) + return; + + if (!page->mem_cgroup) + VM_BUG_ON_PAGE(lruvec_memcg(lruvec) != root_mem_cgroup, page); + else + VM_BUG_ON_PAGE(lruvec_memcg(lruvec) != page->mem_cgroup, page); +} +#endif + /** * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page * @page: the page @@ -1343,6 +1359,60 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgd } /** + * lock_page_lruvec - lock and return lruvec for a given page. + * @page: the page + * + * This series functions should be used in either conditions: + * PageLRU is cleared or unset + * or page->_refcount is zero + * or page is locked. + */ +struct lruvec *lock_page_lruvec(struct page *page) +{ + struct lruvec *lruvec; + struct pglist_data *pgdat = page_pgdat(page); + + rcu_read_lock(); + lruvec = mem_cgroup_page_lruvec(page, pgdat); + spin_lock(&lruvec->lru_lock); + rcu_read_unlock(); + + lruvec_memcg_debug(lruvec, page); + + return lruvec; +} + +struct lruvec *lock_page_lruvec_irq(struct page *page) +{ + struct lruvec *lruvec; + struct pglist_data *pgdat = page_pgdat(page); + + rcu_read_lock(); + lruvec = mem_cgroup_page_lruvec(page, pgdat); + spin_lock_irq(&lruvec->lru_lock); + rcu_read_unlock(); + + lruvec_memcg_debug(lruvec, page); + + return lruvec; +} + +struct lruvec *lock_page_lruvec_irqsave(struct page *page, unsigned long *flags) +{ + struct lruvec *lruvec; + struct pglist_data *pgdat = page_pgdat(page); + + rcu_read_lock(); + lruvec = mem_cgroup_page_lruvec(page, pgdat); + spin_lock_irqsave(&lruvec->lru_lock, *flags); + rcu_read_unlock(); + + lruvec_memcg_debug(lruvec, page); + + return lruvec; +} + +/** * mem_cgroup_update_lru_size - account for adding or removing an lru page * @lruvec: mem_cgroup per zone lru vector * @lru: index of lru list the page is sitting on @@ -3245,10 +3315,8 @@ void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size) #endif /* CONFIG_MEMCG_KMEM */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE - /* - * Because tail pages are not marked as "used", set it. We're under - * pgdat->lru_lock and migration entries setup in all page mappings. + * Because page->mem_cgroup is not set on compound tails, set it now. */ void mem_cgroup_split_huge_fixup(struct page *head) { diff --git a/mm/mlock.c b/mm/mlock.c index 7b0e6334be6f..ab164a675c25 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -262,12 +262,12 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) int nr = pagevec_count(pvec); int delta_munlocked = -nr; struct pagevec pvec_putback; + struct lruvec *lruvec = NULL; int pgrescued = 0; pagevec_init(&pvec_putback); /* Phase 1: page isolation */ - spin_lock_irq(&zone->zone_pgdat->lru_lock); for (i = 0; i < nr; i++) { struct page *page = pvec->pages[i]; @@ -277,10 +277,16 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) * so we can spare the get_page() here. */ if (TestClearPageLRU(page)) { - struct lruvec *lruvec; + struct lruvec *new_lruvec; + + new_lruvec = mem_cgroup_page_lruvec(page, + page_pgdat(page)); + if (new_lruvec != lruvec) { + if (lruvec) + unlock_page_lruvec_irq(lruvec); + lruvec = lock_page_lruvec_irq(page); + } - lruvec = mem_cgroup_page_lruvec(page, - page_pgdat(page)); del_page_from_lru_list(page, lruvec, page_lru(page)); continue; @@ -299,8 +305,12 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) pagevec_add(&pvec_putback, pvec->pages[i]); pvec->pages[i] = NULL; } - __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); - spin_unlock_irq(&zone->zone_pgdat->lru_lock); + if (lruvec) { + __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); + unlock_page_lruvec_irq(lruvec); + } else if (delta_munlocked) { + mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); + } /* Now we can release pins of pages that we are not munlocking */ pagevec_release(&pvec_putback); diff --git a/mm/mmzone.c b/mm/mmzone.c index 4686fdc23bb9..3750a90ed4a0 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -91,6 +91,7 @@ void lruvec_init(struct lruvec *lruvec) enum lru_list lru; memset(lruvec, 0, sizeof(struct lruvec)); + spin_lock_init(&lruvec->lru_lock); for_each_lru(lru) INIT_LIST_HEAD(&lruvec->lists[lru]); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d77220615fd5..74bf7f4c6317 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6808,7 +6808,6 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat) init_waitqueue_head(&pgdat->pfmemalloc_wait); pgdat_page_ext_init(pgdat); - spin_lock_init(&pgdat->lru_lock); lruvec_init(&pgdat->__lruvec); } diff --git a/mm/swap.c b/mm/swap.c index 1838a9535703..ed033f7c4f2d 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -79,16 +79,14 @@ static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = { static void __page_cache_release(struct page *page) { if (PageLRU(page)) { - pg_data_t *pgdat = page_pgdat(page); struct lruvec *lruvec; unsigned long flags; - spin_lock_irqsave(&pgdat->lru_lock, flags); - lruvec = mem_cgroup_page_lruvec(page, pgdat); + lruvec = lock_page_lruvec_irqsave(page, &flags); VM_BUG_ON_PAGE(!PageLRU(page), page); __ClearPageLRU(page); del_page_from_lru_list(page, lruvec, page_off_lru(page)); - spin_unlock_irqrestore(&pgdat->lru_lock, flags); + unlock_page_lruvec_irqrestore(lruvec, flags); } __ClearPageWaiters(page); } @@ -207,32 +205,30 @@ static void pagevec_lru_move_fn(struct pagevec *pvec, void (*move_fn)(struct page *page, struct lruvec *lruvec)) { int i; - struct pglist_data *pgdat = NULL; - struct lruvec *lruvec; + struct lruvec *lruvec = NULL; unsigned long flags = 0; for (i = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; - struct pglist_data *pagepgdat = page_pgdat(page); - - if (pagepgdat != pgdat) { - if (pgdat) - spin_unlock_irqrestore(&pgdat->lru_lock, flags); - pgdat = pagepgdat; - spin_lock_irqsave(&pgdat->lru_lock, flags); - } + struct lruvec *new_lruvec; /* block memcg migration during page moving between lru */ if (!TestClearPageLRU(page)) continue; - lruvec = mem_cgroup_page_lruvec(page, pgdat); + new_lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page)); + if (lruvec != new_lruvec) { + if (lruvec) + unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec = lock_page_lruvec_irqsave(page, &flags); + } + (*move_fn)(page, lruvec); SetPageLRU(page); } - if (pgdat) - spin_unlock_irqrestore(&pgdat->lru_lock, flags); + if (lruvec) + unlock_page_lruvec_irqrestore(lruvec, flags); release_pages(pvec->pages, pvec->nr); pagevec_reinit(pvec); } @@ -274,9 +270,15 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages) { do { unsigned long lrusize; - struct pglist_data *pgdat = lruvec_pgdat(lruvec); - spin_lock_irq(&pgdat->lru_lock); + /* + * Hold lruvec->lru_lock is safe here, since + * 1) The pinned lruvec in reclaim, or + * 2) From a pre-LRU page during refault (which also holds the + * rcu lock, so would be safe even if the page was on the LRU + * and could move simultaneously to a new lruvec). + */ + spin_lock_irq(&lruvec->lru_lock); /* Record cost event */ if (file) lruvec->file_cost += nr_pages; @@ -300,7 +302,7 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages) lruvec->file_cost /= 2; lruvec->anon_cost /= 2; } - spin_unlock_irq(&pgdat->lru_lock); + spin_unlock_irq(&lruvec->lru_lock); } while ((lruvec = parent_lruvec(lruvec))); } @@ -364,13 +366,15 @@ static inline void activate_page_drain(int cpu) static void activate_page(struct page *page) { - pg_data_t *pgdat = page_pgdat(page); + struct lruvec *lruvec; page = compound_head(page); - spin_lock_irq(&pgdat->lru_lock); - if (PageLRU(page)) - __activate_page(page, mem_cgroup_page_lruvec(page, pgdat)); - spin_unlock_irq(&pgdat->lru_lock); + if (TestClearPageLRU(page)) { + lruvec = lock_page_lruvec_irq(page); + __activate_page(page, lruvec); + unlock_page_lruvec_irq(lruvec); + SetPageLRU(page); + } } #endif @@ -860,8 +864,7 @@ void release_pages(struct page **pages, int nr) { int i; LIST_HEAD(pages_to_free); - struct pglist_data *locked_pgdat = NULL; - struct lruvec *lruvec; + struct lruvec *lruvec = NULL; unsigned long flags; unsigned int lock_batch; @@ -871,11 +874,11 @@ void release_pages(struct page **pages, int nr) /* * Make sure the IRQ-safe lock-holding time does not get * excessive with a continuous string of pages from the - * same pgdat. The lock is held only if pgdat != NULL. + * same lruvec. The lock is held only if lruvec != NULL. */ - if (locked_pgdat && ++lock_batch == SWAP_CLUSTER_MAX) { - spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags); - locked_pgdat = NULL; + if (lruvec && ++lock_batch == SWAP_CLUSTER_MAX) { + unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec = NULL; } page = compound_head(page); @@ -883,10 +886,9 @@ void release_pages(struct page **pages, int nr) continue; if (is_zone_device_page(page)) { - if (locked_pgdat) { - spin_unlock_irqrestore(&locked_pgdat->lru_lock, - flags); - locked_pgdat = NULL; + if (lruvec) { + unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec = NULL; } /* * ZONE_DEVICE pages that return 'false' from @@ -907,27 +909,27 @@ void release_pages(struct page **pages, int nr) continue; if (PageCompound(page)) { - if (locked_pgdat) { - spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags); - locked_pgdat = NULL; + if (lruvec) { + unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec = NULL; } __put_compound_page(page); continue; } if (PageLRU(page)) { - struct pglist_data *pgdat = page_pgdat(page); + struct lruvec *new_lruvec; - if (pgdat != locked_pgdat) { - if (locked_pgdat) - spin_unlock_irqrestore(&locked_pgdat->lru_lock, + new_lruvec = mem_cgroup_page_lruvec(page, + page_pgdat(page)); + if (new_lruvec != lruvec) { + if (lruvec) + unlock_page_lruvec_irqrestore(lruvec, flags); lock_batch = 0; - locked_pgdat = pgdat; - spin_lock_irqsave(&locked_pgdat->lru_lock, flags); + lruvec = lock_page_lruvec_irqsave(page, &flags); } - lruvec = mem_cgroup_page_lruvec(page, locked_pgdat); VM_BUG_ON_PAGE(!PageLRU(page), page); __ClearPageLRU(page); del_page_from_lru_list(page, lruvec, page_off_lru(page)); @@ -937,8 +939,8 @@ void release_pages(struct page **pages, int nr) list_add(&page->lru, &pages_to_free); } - if (locked_pgdat) - spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags); + if (lruvec) + unlock_page_lruvec_irqrestore(lruvec, flags); mem_cgroup_uncharge_list(&pages_to_free); free_unref_page_list(&pages_to_free); @@ -1026,26 +1028,24 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec) void __pagevec_lru_add(struct pagevec *pvec) { int i; - struct pglist_data *pgdat = NULL; - struct lruvec *lruvec; + struct lruvec *lruvec = NULL; unsigned long flags = 0; for (i = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; - struct pglist_data *pagepgdat = page_pgdat(page); + struct lruvec *new_lruvec; - if (pagepgdat != pgdat) { - if (pgdat) - spin_unlock_irqrestore(&pgdat->lru_lock, flags); - pgdat = pagepgdat; - spin_lock_irqsave(&pgdat->lru_lock, flags); + new_lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page)); + if (lruvec != new_lruvec) { + if (lruvec) + unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec = lock_page_lruvec_irqsave(page, &flags); } - lruvec = mem_cgroup_page_lruvec(page, pgdat); __pagevec_lru_add_fn(page, lruvec); } - if (pgdat) - spin_unlock_irqrestore(&pgdat->lru_lock, flags); + if (lruvec) + unlock_page_lruvec_irqrestore(lruvec, flags); release_pages(pvec->pages, pvec->nr); pagevec_reinit(pvec); } diff --git a/mm/vmscan.c b/mm/vmscan.c index 0be55d875fde..2953ddec88a0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1765,14 +1765,12 @@ int isolate_lru_page(struct page *page) WARN_RATELIMIT(PageTail(page), "trying to isolate tail page"); if (TestClearPageLRU(page)) { - pg_data_t *pgdat = page_pgdat(page); struct lruvec *lruvec; get_page(page); - lruvec = mem_cgroup_page_lruvec(page, pgdat); - spin_lock_irq(&pgdat->lru_lock); + lruvec = lock_page_lruvec_irq(page); del_page_from_lru_list(page, lruvec, page_lru(page)); - spin_unlock_irq(&pgdat->lru_lock); + unlock_page_lruvec_irq(lruvec); ret = 0; } @@ -1839,7 +1837,6 @@ static int too_many_isolated(struct pglist_data *pgdat, int file, static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, struct list_head *list) { - struct pglist_data *pgdat = lruvec_pgdat(lruvec); int nr_pages, nr_moved = 0; LIST_HEAD(pages_to_free); struct page *page; @@ -1850,9 +1847,9 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, VM_BUG_ON_PAGE(PageLRU(page), page); list_del(&page->lru); if (unlikely(!page_evictable(page))) { - spin_unlock_irq(&pgdat->lru_lock); + spin_unlock_irq(&lruvec->lru_lock); putback_lru_page(page); - spin_lock_irq(&pgdat->lru_lock); + spin_lock_irq(&lruvec->lru_lock); continue; } @@ -1874,9 +1871,9 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, __ClearPageActive(page); if (unlikely(PageCompound(page))) { - spin_unlock_irq(&pgdat->lru_lock); + spin_unlock_irq(&lruvec->lru_lock); destroy_compound_page(page); - spin_lock_irq(&pgdat->lru_lock); + spin_lock_irq(&lruvec->lru_lock); } else list_add(&page->lru, &pages_to_free); @@ -1953,7 +1950,7 @@ static int current_may_throttle(void) lru_add_drain(); - spin_lock_irq(&pgdat->lru_lock); + spin_lock_irq(&lruvec->lru_lock); nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list, &nr_scanned, sc, lru); @@ -1965,7 +1962,7 @@ static int current_may_throttle(void) __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned); __count_vm_events(PGSCAN_ANON + file, nr_scanned); - spin_unlock_irq(&pgdat->lru_lock); + spin_unlock_irq(&lruvec->lru_lock); if (nr_taken == 0) return 0; @@ -1973,7 +1970,7 @@ static int current_may_throttle(void) nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, 0, &stat, false); - spin_lock_irq(&pgdat->lru_lock); + spin_lock_irq(&lruvec->lru_lock); move_pages_to_lru(lruvec, &page_list); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); @@ -1982,7 +1979,7 @@ static int current_may_throttle(void) __count_vm_events(item, nr_reclaimed); __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed); __count_vm_events(PGSTEAL_ANON + file, nr_reclaimed); - spin_unlock_irq(&pgdat->lru_lock); + spin_unlock_irq(&lruvec->lru_lock); lru_note_cost(lruvec, file, stat.nr_pageout); mem_cgroup_uncharge_list(&page_list); @@ -2035,7 +2032,7 @@ static void shrink_active_list(unsigned long nr_to_scan, lru_add_drain(); - spin_lock_irq(&pgdat->lru_lock); + spin_lock_irq(&lruvec->lru_lock); nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, &nr_scanned, sc, lru); @@ -2046,7 +2043,7 @@ static void shrink_active_list(unsigned long nr_to_scan, __count_vm_events(PGREFILL, nr_scanned); __count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned); - spin_unlock_irq(&pgdat->lru_lock); + spin_unlock_irq(&lruvec->lru_lock); while (!list_empty(&l_hold)) { cond_resched(); @@ -2092,7 +2089,7 @@ static void shrink_active_list(unsigned long nr_to_scan, /* * Move pages back to the lru list. */ - spin_lock_irq(&pgdat->lru_lock); + spin_lock_irq(&lruvec->lru_lock); nr_activate = move_pages_to_lru(lruvec, &l_active); nr_deactivate = move_pages_to_lru(lruvec, &l_inactive); @@ -2103,7 +2100,7 @@ static void shrink_active_list(unsigned long nr_to_scan, __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); - spin_unlock_irq(&pgdat->lru_lock); + spin_unlock_irq(&lruvec->lru_lock); mem_cgroup_uncharge_list(&l_active); free_unref_page_list(&l_active); @@ -2693,10 +2690,10 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) /* * Determine the scan balance between anon and file LRUs. */ - spin_lock_irq(&pgdat->lru_lock); + spin_lock_irq(&target_lruvec->lru_lock); sc->anon_cost = target_lruvec->anon_cost; sc->file_cost = target_lruvec->file_cost; - spin_unlock_irq(&pgdat->lru_lock); + spin_unlock_irq(&target_lruvec->lru_lock); /* * Target desirable inactive:active list ratios for the anon @@ -4272,16 +4269,15 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) */ void check_move_unevictable_pages(struct pagevec *pvec) { - struct lruvec *lruvec; - struct pglist_data *pgdat = NULL; + struct lruvec *lruvec = NULL; int pgscanned = 0; int pgrescued = 0; int i; for (i = 0; i < pvec->nr; i++) { struct page *page = pvec->pages[i]; - struct pglist_data *pagepgdat = page_pgdat(page); int nr_pages; + struct lruvec *new_lruvec; if (PageTransTail(page)) continue; @@ -4293,13 +4289,12 @@ void check_move_unevictable_pages(struct pagevec *pvec) if (!TestClearPageLRU(page)) continue; - if (pagepgdat != pgdat) { - if (pgdat) - spin_unlock_irq(&pgdat->lru_lock); - pgdat = pagepgdat; - spin_lock_irq(&pgdat->lru_lock); + new_lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page)); + if (lruvec != new_lruvec) { + if (lruvec) + unlock_page_lruvec_irq(lruvec); + lruvec = lock_page_lruvec_irq(page); } - lruvec = mem_cgroup_page_lruvec(page, pgdat); if (page_evictable(page) && PageUnevictable(page)) { enum lru_list lru = page_lru_base_type(page); @@ -4313,10 +4308,10 @@ void check_move_unevictable_pages(struct pagevec *pvec) SetPageLRU(page); } - if (pgdat) { + if (lruvec) { __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued); __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned); - spin_unlock_irq(&pgdat->lru_lock); + unlock_page_lruvec_irq(lruvec); } else if (pgscanned) { count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned); }
在 2020/11/5 下午9:43, Alex Shi 写道: > > 在 2020/11/5 下午4:55, Alex Shi 写道: >> /** >> + * lock_page_lruvec - lock and return lruvec for a given page. >> + * @page: the page >> + * >> + * This series functions should be used in either conditions: >> + * PageLRU is cleared or unset >> + * or page is locked. > or page->_refcount is zero. > Ops, here is a typo, we need to add back above line. > so the patch updated here: Sorry for aother miss, linux-next removed the page.mem_cgroup, replace it to memcg_data and get by page_memcg(). Hence this and next patch needs update for this: From 84e69f892119d99612e9668e3fe47a3922bafff1 Mon Sep 17 00:00:00 2001 From: Alex Shi <alex.shi@linux.alibaba.com> Date: Tue, 18 Aug 2020 16:44:21 +0800 Subject: [PATCH v21 17/19] mm/lru: replace pgdat lru_lock with lruvec lock This patch moves per node lru_lock into lruvec, thus bring a lru_lock for each of memcg per node. So on a large machine, each of memcg don't have to suffer from per node pgdat->lru_lock competition. They could go fast with their self lru_lock. After move memcg charge before lru inserting, page isolation could serialize page's memcg, then per memcg lruvec lock is stable and could replace per node lru lock. In func isolate_migratepages_block, compact_unlock_should_abort and lock_page_lruvec_irqsave are open coded to work with compact_control. Also add a debug func in locking which may give some clues if there are sth out of hands. Daniel Jordan's testing show 62% improvement on modified readtwice case on his 2P * 10 core * 2 HT broadwell box. https://lore.kernel.org/lkml/20200915165807.kpp7uhiw7l3loofu@ca-dmjordan1.us.oracle.com/ On a large machine with memcg enabled but not used, the page's lruvec seeking pass a few pointers, that may lead to lru_lock holding time increase and a bit regression. Hugh Dickins helped on the patch polish, thanks! Signed-off-by: Alex Shi <alex.shi@linux.alibaba.com> Acked-by: Hugh Dickins <hughd@google.com> Cc: Rong Chen <rong.a.chen@intel.com> Cc: Hugh Dickins <hughd@google.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Michal Hocko <mhocko@kernel.org> Cc: Vladimir Davydov <vdavydov.dev@gmail.com> Cc: Yang Shi <yang.shi@linux.alibaba.com> Cc: Matthew Wilcox <willy@infradead.org> Cc: Konstantin Khlebnikov <khlebnikov@yandex-team.ru> Cc: Tejun Heo <tj@kernel.org> Cc: linux-kernel@vger.kernel.org Cc: linux-mm@kvack.org Cc: cgroups@vger.kernel.org --- include/linux/memcontrol.h | 58 +++++++++++++++++++++++ include/linux/mmzone.h | 3 +- mm/compaction.c | 56 ++++++++++++++-------- mm/huge_memory.c | 11 ++--- mm/memcontrol.c | 78 ++++++++++++++++++++++++++++-- mm/mlock.c | 22 ++++++--- mm/mmzone.c | 1 + mm/page_alloc.c | 1 - mm/swap.c | 116 ++++++++++++++++++++++----------------------- mm/vmscan.c | 55 ++++++++++----------- 10 files changed, 275 insertions(+), 126 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 0f4dd7829fb2..6ecb08ff4ad1 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -666,6 +666,19 @@ static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg, struct mem_cgroup *get_mem_cgroup_from_page(struct page *page); +struct lruvec *lock_page_lruvec(struct page *page); +struct lruvec *lock_page_lruvec_irq(struct page *page); +struct lruvec *lock_page_lruvec_irqsave(struct page *page, + unsigned long *flags); + +#ifdef CONFIG_DEBUG_VM +void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page); +#else +static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page) +{ +} +#endif + static inline struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){ return css ? container_of(css, struct mem_cgroup, css) : NULL; @@ -1233,6 +1246,31 @@ static inline void mem_cgroup_put(struct mem_cgroup *memcg) { } +static inline struct lruvec *lock_page_lruvec(struct page *page) +{ + struct pglist_data *pgdat = page_pgdat(page); + + spin_lock(&pgdat->__lruvec.lru_lock); + return &pgdat->__lruvec; +} + +static inline struct lruvec *lock_page_lruvec_irq(struct page *page) +{ + struct pglist_data *pgdat = page_pgdat(page); + + spin_lock_irq(&pgdat->__lruvec.lru_lock); + return &pgdat->__lruvec; +} + +static inline struct lruvec *lock_page_lruvec_irqsave(struct page *page, + unsigned long *flagsp) +{ + struct pglist_data *pgdat = page_pgdat(page); + + spin_lock_irqsave(&pgdat->__lruvec.lru_lock, *flagsp); + return &pgdat->__lruvec; +} + static inline struct mem_cgroup * mem_cgroup_iter(struct mem_cgroup *root, struct mem_cgroup *prev, @@ -1476,6 +1514,10 @@ static inline void count_memcg_page_event(struct page *page, void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) { } + +static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page) +{ +} #endif /* CONFIG_MEMCG */ /* idx can be of type enum memcg_stat_item or node_stat_item */ @@ -1605,6 +1647,22 @@ static inline struct lruvec *parent_lruvec(struct lruvec *lruvec) return mem_cgroup_lruvec(memcg, lruvec_pgdat(lruvec)); } +static inline void unlock_page_lruvec(struct lruvec *lruvec) +{ + spin_unlock(&lruvec->lru_lock); +} + +static inline void unlock_page_lruvec_irq(struct lruvec *lruvec) +{ + spin_unlock_irq(&lruvec->lru_lock); +} + +static inline void unlock_page_lruvec_irqrestore(struct lruvec *lruvec, + unsigned long flags) +{ + spin_unlock_irqrestore(&lruvec->lru_lock, flags); +} + #ifdef CONFIG_CGROUP_WRITEBACK struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index fb3bf696c05e..0afba4ea2a21 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -276,6 +276,8 @@ enum lruvec_flags { struct lruvec { struct list_head lists[NR_LRU_LISTS]; + /* per lruvec lru_lock for memcg */ + spinlock_t lru_lock; /* * These track the cost of reclaiming one LRU - file or anon - * over the other. As the observed cost of reclaiming one LRU @@ -796,7 +798,6 @@ struct deferred_split { /* Write-intensive fields used by page reclaim */ ZONE_PADDING(_pad1_) - spinlock_t lru_lock; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* diff --git a/mm/compaction.c b/mm/compaction.c index 7b1cf48884dd..9cfe90961493 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -804,7 +804,7 @@ static bool too_many_isolated(pg_data_t *pgdat) unsigned long nr_scanned = 0, nr_isolated = 0; struct lruvec *lruvec; unsigned long flags = 0; - bool locked = false; + struct lruvec *locked = NULL; struct page *page = NULL, *valid_page = NULL; unsigned long start_pfn = low_pfn; bool skip_on_failure = false; @@ -864,11 +864,20 @@ static bool too_many_isolated(pg_data_t *pgdat) * contention, to give chance to IRQs. Abort completely if * a fatal signal is pending. */ - if (!(low_pfn % SWAP_CLUSTER_MAX) - && compact_unlock_should_abort(&pgdat->lru_lock, - flags, &locked, cc)) { - low_pfn = 0; - goto fatal_pending; + if (!(low_pfn % SWAP_CLUSTER_MAX)) { + if (locked) { + unlock_page_lruvec_irqrestore(locked, flags); + locked = NULL; + } + + if (fatal_signal_pending(current)) { + cc->contended = true; + + low_pfn = 0; + goto fatal_pending; + } + + cond_resched(); } if (!pfn_valid_within(low_pfn)) @@ -940,9 +949,8 @@ static bool too_many_isolated(pg_data_t *pgdat) if (unlikely(__PageMovable(page)) && !PageIsolated(page)) { if (locked) { - spin_unlock_irqrestore(&pgdat->lru_lock, - flags); - locked = false; + unlock_page_lruvec_irqrestore(locked, flags); + locked = NULL; } if (!isolate_movable_page(page, isolate_mode)) @@ -983,10 +991,19 @@ static bool too_many_isolated(pg_data_t *pgdat) if (!TestClearPageLRU(page)) goto isolate_fail_put; + rcu_read_lock(); + lruvec = mem_cgroup_page_lruvec(page, pgdat); + /* If we already hold the lock, we can skip some rechecking */ - if (!locked) { - locked = compact_lock_irqsave(&pgdat->lru_lock, - &flags, cc); + if (lruvec != locked) { + if (locked) + unlock_page_lruvec_irqrestore(locked, flags); + + compact_lock_irqsave(&lruvec->lru_lock, &flags, cc); + locked = lruvec; + rcu_read_unlock(); + + lruvec_memcg_debug(lruvec, page); /* Try get exclusive access under lock */ if (!skip_updated) { @@ -1005,9 +1022,8 @@ static bool too_many_isolated(pg_data_t *pgdat) SetPageLRU(page); goto isolate_fail_put; } - } - - lruvec = mem_cgroup_page_lruvec(page, pgdat); + } else + rcu_read_unlock(); /* The whole page is taken off the LRU; skip the tail pages. */ if (PageCompound(page)) @@ -1041,8 +1057,8 @@ static bool too_many_isolated(pg_data_t *pgdat) isolate_fail_put: /* Avoid potential deadlock in freeing page under lru_lock */ if (locked) { - spin_unlock_irqrestore(&pgdat->lru_lock, flags); - locked = false; + unlock_page_lruvec_irqrestore(locked, flags); + locked = NULL; } put_page(page); @@ -1057,8 +1073,8 @@ static bool too_many_isolated(pg_data_t *pgdat) */ if (nr_isolated) { if (locked) { - spin_unlock_irqrestore(&pgdat->lru_lock, flags); - locked = false; + unlock_page_lruvec_irqrestore(locked, flags); + locked = NULL; } putback_movable_pages(&cc->migratepages); cc->nr_migratepages = 0; @@ -1086,7 +1102,7 @@ static bool too_many_isolated(pg_data_t *pgdat) isolate_abort: if (locked) - spin_unlock_irqrestore(&pgdat->lru_lock, flags); + unlock_page_lruvec_irqrestore(locked, flags); if (page) { SetPageLRU(page); put_page(page); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b70ec0c6076b..94e42dba052a 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2354,7 +2354,7 @@ static void lru_add_page_tail(struct page *head, struct page *tail, VM_BUG_ON_PAGE(!PageHead(head), head); VM_BUG_ON_PAGE(PageCompound(tail), head); VM_BUG_ON_PAGE(PageLRU(tail), head); - lockdep_assert_held(&lruvec_pgdat(lruvec)->lru_lock); + lockdep_assert_held(&lruvec->lru_lock); if (list) { /* page reclaim is reclaiming a huge page */ @@ -2438,7 +2438,6 @@ static void __split_huge_page(struct page *page, struct list_head *list, pgoff_t end) { struct page *head = compound_head(page); - pg_data_t *pgdat = page_pgdat(head); struct lruvec *lruvec; struct address_space *swap_cache = NULL; unsigned long offset = 0; @@ -2456,10 +2455,8 @@ static void __split_huge_page(struct page *page, struct list_head *list, xa_lock(&swap_cache->i_pages); } - /* prevent PageLRU to go away from under us, and freeze lru stats */ - spin_lock(&pgdat->lru_lock); - - lruvec = mem_cgroup_page_lruvec(head, pgdat); + /* lock lru list/PageCompound, ref freezed by page_ref_freeze */ + lruvec = lock_page_lruvec(head); for (i = nr - 1; i >= 1; i--) { __split_huge_page_tail(head, i, lruvec, list); @@ -2480,7 +2477,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, } ClearPageCompound(head); - spin_unlock(&pgdat->lru_lock); + unlock_page_lruvec(lruvec); /* Caller disabled irqs, so they are still disabled here */ split_page_owner(head, nr); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 157b745031a4..7657f16cf992 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -20,6 +20,9 @@ * Lockless page tracking & accounting * Unified hierarchy configuration model * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner + * + * Per memcg lru locking + * Copyright (C) 2020 Alibaba, Inc, Alex Shi */ #include <linux/page_counter.h> @@ -1305,6 +1308,23 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, return ret; } +#ifdef CONFIG_DEBUG_VM +void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page) +{ + struct mem_cgroup *memcg; + + if (mem_cgroup_disabled()) + return; + + memcg = page_memcg(page); + + if (!memcg) + VM_BUG_ON_PAGE(lruvec_memcg(lruvec) != root_mem_cgroup, page); + else + VM_BUG_ON_PAGE(lruvec_memcg(lruvec) != memcg, page); +} +#endif + /** * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page * @page: the page @@ -1343,6 +1363,60 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgd } /** + * lock_page_lruvec - lock and return lruvec for a given page. + * @page: the page + * + * This series functions should be used in either conditions: + * PageLRU is cleared or unset + * or page->_refcount is zero + * or page is locked. + */ +struct lruvec *lock_page_lruvec(struct page *page) +{ + struct lruvec *lruvec; + struct pglist_data *pgdat = page_pgdat(page); + + rcu_read_lock(); + lruvec = mem_cgroup_page_lruvec(page, pgdat); + spin_lock(&lruvec->lru_lock); + rcu_read_unlock(); + + lruvec_memcg_debug(lruvec, page); + + return lruvec; +} + +struct lruvec *lock_page_lruvec_irq(struct page *page) +{ + struct lruvec *lruvec; + struct pglist_data *pgdat = page_pgdat(page); + + rcu_read_lock(); + lruvec = mem_cgroup_page_lruvec(page, pgdat); + spin_lock_irq(&lruvec->lru_lock); + rcu_read_unlock(); + + lruvec_memcg_debug(lruvec, page); + + return lruvec; +} + +struct lruvec *lock_page_lruvec_irqsave(struct page *page, unsigned long *flags) +{ + struct lruvec *lruvec; + struct pglist_data *pgdat = page_pgdat(page); + + rcu_read_lock(); + lruvec = mem_cgroup_page_lruvec(page, pgdat); + spin_lock_irqsave(&lruvec->lru_lock, *flags); + rcu_read_unlock(); + + lruvec_memcg_debug(lruvec, page); + + return lruvec; +} + +/** * mem_cgroup_update_lru_size - account for adding or removing an lru page * @lruvec: mem_cgroup per zone lru vector * @lru: index of lru list the page is sitting on @@ -3245,10 +3319,8 @@ void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size) #endif /* CONFIG_MEMCG_KMEM */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE - /* - * Because tail pages are not marked as "used", set it. We're under - * pgdat->lru_lock and migration entries setup in all page mappings. + * Because page_memcg(head) is not set on compound tails, set it now. */ void mem_cgroup_split_huge_fixup(struct page *head) { diff --git a/mm/mlock.c b/mm/mlock.c index 7b0e6334be6f..ab164a675c25 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -262,12 +262,12 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) int nr = pagevec_count(pvec); int delta_munlocked = -nr; struct pagevec pvec_putback; + struct lruvec *lruvec = NULL; int pgrescued = 0; pagevec_init(&pvec_putback); /* Phase 1: page isolation */ - spin_lock_irq(&zone->zone_pgdat->lru_lock); for (i = 0; i < nr; i++) { struct page *page = pvec->pages[i]; @@ -277,10 +277,16 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) * so we can spare the get_page() here. */ if (TestClearPageLRU(page)) { - struct lruvec *lruvec; + struct lruvec *new_lruvec; + + new_lruvec = mem_cgroup_page_lruvec(page, + page_pgdat(page)); + if (new_lruvec != lruvec) { + if (lruvec) + unlock_page_lruvec_irq(lruvec); + lruvec = lock_page_lruvec_irq(page); + } - lruvec = mem_cgroup_page_lruvec(page, - page_pgdat(page)); del_page_from_lru_list(page, lruvec, page_lru(page)); continue; @@ -299,8 +305,12 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) pagevec_add(&pvec_putback, pvec->pages[i]); pvec->pages[i] = NULL; } - __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); - spin_unlock_irq(&zone->zone_pgdat->lru_lock); + if (lruvec) { + __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); + unlock_page_lruvec_irq(lruvec); + } else if (delta_munlocked) { + mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); + } /* Now we can release pins of pages that we are not munlocking */ pagevec_release(&pvec_putback); diff --git a/mm/mmzone.c b/mm/mmzone.c index 4686fdc23bb9..3750a90ed4a0 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -91,6 +91,7 @@ void lruvec_init(struct lruvec *lruvec) enum lru_list lru; memset(lruvec, 0, sizeof(struct lruvec)); + spin_lock_init(&lruvec->lru_lock); for_each_lru(lru) INIT_LIST_HEAD(&lruvec->lists[lru]); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d77220615fd5..74bf7f4c6317 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6808,7 +6808,6 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat) init_waitqueue_head(&pgdat->pfmemalloc_wait); pgdat_page_ext_init(pgdat); - spin_lock_init(&pgdat->lru_lock); lruvec_init(&pgdat->__lruvec); } diff --git a/mm/swap.c b/mm/swap.c index 1838a9535703..ed033f7c4f2d 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -79,16 +79,14 @@ static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = { static void __page_cache_release(struct page *page) { if (PageLRU(page)) { - pg_data_t *pgdat = page_pgdat(page); struct lruvec *lruvec; unsigned long flags; - spin_lock_irqsave(&pgdat->lru_lock, flags); - lruvec = mem_cgroup_page_lruvec(page, pgdat); + lruvec = lock_page_lruvec_irqsave(page, &flags); VM_BUG_ON_PAGE(!PageLRU(page), page); __ClearPageLRU(page); del_page_from_lru_list(page, lruvec, page_off_lru(page)); - spin_unlock_irqrestore(&pgdat->lru_lock, flags); + unlock_page_lruvec_irqrestore(lruvec, flags); } __ClearPageWaiters(page); } @@ -207,32 +205,30 @@ static void pagevec_lru_move_fn(struct pagevec *pvec, void (*move_fn)(struct page *page, struct lruvec *lruvec)) { int i; - struct pglist_data *pgdat = NULL; - struct lruvec *lruvec; + struct lruvec *lruvec = NULL; unsigned long flags = 0; for (i = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; - struct pglist_data *pagepgdat = page_pgdat(page); - - if (pagepgdat != pgdat) { - if (pgdat) - spin_unlock_irqrestore(&pgdat->lru_lock, flags); - pgdat = pagepgdat; - spin_lock_irqsave(&pgdat->lru_lock, flags); - } + struct lruvec *new_lruvec; /* block memcg migration during page moving between lru */ if (!TestClearPageLRU(page)) continue; - lruvec = mem_cgroup_page_lruvec(page, pgdat); + new_lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page)); + if (lruvec != new_lruvec) { + if (lruvec) + unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec = lock_page_lruvec_irqsave(page, &flags); + } + (*move_fn)(page, lruvec); SetPageLRU(page); } - if (pgdat) - spin_unlock_irqrestore(&pgdat->lru_lock, flags); + if (lruvec) + unlock_page_lruvec_irqrestore(lruvec, flags); release_pages(pvec->pages, pvec->nr); pagevec_reinit(pvec); } @@ -274,9 +270,15 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages) { do { unsigned long lrusize; - struct pglist_data *pgdat = lruvec_pgdat(lruvec); - spin_lock_irq(&pgdat->lru_lock); + /* + * Hold lruvec->lru_lock is safe here, since + * 1) The pinned lruvec in reclaim, or + * 2) From a pre-LRU page during refault (which also holds the + * rcu lock, so would be safe even if the page was on the LRU + * and could move simultaneously to a new lruvec). + */ + spin_lock_irq(&lruvec->lru_lock); /* Record cost event */ if (file) lruvec->file_cost += nr_pages; @@ -300,7 +302,7 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages) lruvec->file_cost /= 2; lruvec->anon_cost /= 2; } - spin_unlock_irq(&pgdat->lru_lock); + spin_unlock_irq(&lruvec->lru_lock); } while ((lruvec = parent_lruvec(lruvec))); } @@ -364,13 +366,15 @@ static inline void activate_page_drain(int cpu) static void activate_page(struct page *page) { - pg_data_t *pgdat = page_pgdat(page); + struct lruvec *lruvec; page = compound_head(page); - spin_lock_irq(&pgdat->lru_lock); - if (PageLRU(page)) - __activate_page(page, mem_cgroup_page_lruvec(page, pgdat)); - spin_unlock_irq(&pgdat->lru_lock); + if (TestClearPageLRU(page)) { + lruvec = lock_page_lruvec_irq(page); + __activate_page(page, lruvec); + unlock_page_lruvec_irq(lruvec); + SetPageLRU(page); + } } #endif @@ -860,8 +864,7 @@ void release_pages(struct page **pages, int nr) { int i; LIST_HEAD(pages_to_free); - struct pglist_data *locked_pgdat = NULL; - struct lruvec *lruvec; + struct lruvec *lruvec = NULL; unsigned long flags; unsigned int lock_batch; @@ -871,11 +874,11 @@ void release_pages(struct page **pages, int nr) /* * Make sure the IRQ-safe lock-holding time does not get * excessive with a continuous string of pages from the - * same pgdat. The lock is held only if pgdat != NULL. + * same lruvec. The lock is held only if lruvec != NULL. */ - if (locked_pgdat && ++lock_batch == SWAP_CLUSTER_MAX) { - spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags); - locked_pgdat = NULL; + if (lruvec && ++lock_batch == SWAP_CLUSTER_MAX) { + unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec = NULL; } page = compound_head(page); @@ -883,10 +886,9 @@ void release_pages(struct page **pages, int nr) continue; if (is_zone_device_page(page)) { - if (locked_pgdat) { - spin_unlock_irqrestore(&locked_pgdat->lru_lock, - flags); - locked_pgdat = NULL; + if (lruvec) { + unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec = NULL; } /* * ZONE_DEVICE pages that return 'false' from @@ -907,27 +909,27 @@ void release_pages(struct page **pages, int nr) continue; if (PageCompound(page)) { - if (locked_pgdat) { - spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags); - locked_pgdat = NULL; + if (lruvec) { + unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec = NULL; } __put_compound_page(page); continue; } if (PageLRU(page)) { - struct pglist_data *pgdat = page_pgdat(page); + struct lruvec *new_lruvec; - if (pgdat != locked_pgdat) { - if (locked_pgdat) - spin_unlock_irqrestore(&locked_pgdat->lru_lock, + new_lruvec = mem_cgroup_page_lruvec(page, + page_pgdat(page)); + if (new_lruvec != lruvec) { + if (lruvec) + unlock_page_lruvec_irqrestore(lruvec, flags); lock_batch = 0; - locked_pgdat = pgdat; - spin_lock_irqsave(&locked_pgdat->lru_lock, flags); + lruvec = lock_page_lruvec_irqsave(page, &flags); } - lruvec = mem_cgroup_page_lruvec(page, locked_pgdat); VM_BUG_ON_PAGE(!PageLRU(page), page); __ClearPageLRU(page); del_page_from_lru_list(page, lruvec, page_off_lru(page)); @@ -937,8 +939,8 @@ void release_pages(struct page **pages, int nr) list_add(&page->lru, &pages_to_free); } - if (locked_pgdat) - spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags); + if (lruvec) + unlock_page_lruvec_irqrestore(lruvec, flags); mem_cgroup_uncharge_list(&pages_to_free); free_unref_page_list(&pages_to_free); @@ -1026,26 +1028,24 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec) void __pagevec_lru_add(struct pagevec *pvec) { int i; - struct pglist_data *pgdat = NULL; - struct lruvec *lruvec; + struct lruvec *lruvec = NULL; unsigned long flags = 0; for (i = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; - struct pglist_data *pagepgdat = page_pgdat(page); + struct lruvec *new_lruvec; - if (pagepgdat != pgdat) { - if (pgdat) - spin_unlock_irqrestore(&pgdat->lru_lock, flags); - pgdat = pagepgdat; - spin_lock_irqsave(&pgdat->lru_lock, flags); + new_lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page)); + if (lruvec != new_lruvec) { + if (lruvec) + unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec = lock_page_lruvec_irqsave(page, &flags); } - lruvec = mem_cgroup_page_lruvec(page, pgdat); __pagevec_lru_add_fn(page, lruvec); } - if (pgdat) - spin_unlock_irqrestore(&pgdat->lru_lock, flags); + if (lruvec) + unlock_page_lruvec_irqrestore(lruvec, flags); release_pages(pvec->pages, pvec->nr); pagevec_reinit(pvec); } diff --git a/mm/vmscan.c b/mm/vmscan.c index 0be55d875fde..2953ddec88a0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1765,14 +1765,12 @@ int isolate_lru_page(struct page *page) WARN_RATELIMIT(PageTail(page), "trying to isolate tail page"); if (TestClearPageLRU(page)) { - pg_data_t *pgdat = page_pgdat(page); struct lruvec *lruvec; get_page(page); - lruvec = mem_cgroup_page_lruvec(page, pgdat); - spin_lock_irq(&pgdat->lru_lock); + lruvec = lock_page_lruvec_irq(page); del_page_from_lru_list(page, lruvec, page_lru(page)); - spin_unlock_irq(&pgdat->lru_lock); + unlock_page_lruvec_irq(lruvec); ret = 0; } @@ -1839,7 +1837,6 @@ static int too_many_isolated(struct pglist_data *pgdat, int file, static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, struct list_head *list) { - struct pglist_data *pgdat = lruvec_pgdat(lruvec); int nr_pages, nr_moved = 0; LIST_HEAD(pages_to_free); struct page *page; @@ -1850,9 +1847,9 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, VM_BUG_ON_PAGE(PageLRU(page), page); list_del(&page->lru); if (unlikely(!page_evictable(page))) { - spin_unlock_irq(&pgdat->lru_lock); + spin_unlock_irq(&lruvec->lru_lock); putback_lru_page(page); - spin_lock_irq(&pgdat->lru_lock); + spin_lock_irq(&lruvec->lru_lock); continue; } @@ -1874,9 +1871,9 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, __ClearPageActive(page); if (unlikely(PageCompound(page))) { - spin_unlock_irq(&pgdat->lru_lock); + spin_unlock_irq(&lruvec->lru_lock); destroy_compound_page(page); - spin_lock_irq(&pgdat->lru_lock); + spin_lock_irq(&lruvec->lru_lock); } else list_add(&page->lru, &pages_to_free); @@ -1953,7 +1950,7 @@ static int current_may_throttle(void) lru_add_drain(); - spin_lock_irq(&pgdat->lru_lock); + spin_lock_irq(&lruvec->lru_lock); nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list, &nr_scanned, sc, lru); @@ -1965,7 +1962,7 @@ static int current_may_throttle(void) __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned); __count_vm_events(PGSCAN_ANON + file, nr_scanned); - spin_unlock_irq(&pgdat->lru_lock); + spin_unlock_irq(&lruvec->lru_lock); if (nr_taken == 0) return 0; @@ -1973,7 +1970,7 @@ static int current_may_throttle(void) nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, 0, &stat, false); - spin_lock_irq(&pgdat->lru_lock); + spin_lock_irq(&lruvec->lru_lock); move_pages_to_lru(lruvec, &page_list); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); @@ -1982,7 +1979,7 @@ static int current_may_throttle(void) __count_vm_events(item, nr_reclaimed); __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed); __count_vm_events(PGSTEAL_ANON + file, nr_reclaimed); - spin_unlock_irq(&pgdat->lru_lock); + spin_unlock_irq(&lruvec->lru_lock); lru_note_cost(lruvec, file, stat.nr_pageout); mem_cgroup_uncharge_list(&page_list); @@ -2035,7 +2032,7 @@ static void shrink_active_list(unsigned long nr_to_scan, lru_add_drain(); - spin_lock_irq(&pgdat->lru_lock); + spin_lock_irq(&lruvec->lru_lock); nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, &nr_scanned, sc, lru); @@ -2046,7 +2043,7 @@ static void shrink_active_list(unsigned long nr_to_scan, __count_vm_events(PGREFILL, nr_scanned); __count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned); - spin_unlock_irq(&pgdat->lru_lock); + spin_unlock_irq(&lruvec->lru_lock); while (!list_empty(&l_hold)) { cond_resched(); @@ -2092,7 +2089,7 @@ static void shrink_active_list(unsigned long nr_to_scan, /* * Move pages back to the lru list. */ - spin_lock_irq(&pgdat->lru_lock); + spin_lock_irq(&lruvec->lru_lock); nr_activate = move_pages_to_lru(lruvec, &l_active); nr_deactivate = move_pages_to_lru(lruvec, &l_inactive); @@ -2103,7 +2100,7 @@ static void shrink_active_list(unsigned long nr_to_scan, __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); - spin_unlock_irq(&pgdat->lru_lock); + spin_unlock_irq(&lruvec->lru_lock); mem_cgroup_uncharge_list(&l_active); free_unref_page_list(&l_active); @@ -2693,10 +2690,10 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) /* * Determine the scan balance between anon and file LRUs. */ - spin_lock_irq(&pgdat->lru_lock); + spin_lock_irq(&target_lruvec->lru_lock); sc->anon_cost = target_lruvec->anon_cost; sc->file_cost = target_lruvec->file_cost; - spin_unlock_irq(&pgdat->lru_lock); + spin_unlock_irq(&target_lruvec->lru_lock); /* * Target desirable inactive:active list ratios for the anon @@ -4272,16 +4269,15 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) */ void check_move_unevictable_pages(struct pagevec *pvec) { - struct lruvec *lruvec; - struct pglist_data *pgdat = NULL; + struct lruvec *lruvec = NULL; int pgscanned = 0; int pgrescued = 0; int i; for (i = 0; i < pvec->nr; i++) { struct page *page = pvec->pages[i]; - struct pglist_data *pagepgdat = page_pgdat(page); int nr_pages; + struct lruvec *new_lruvec; if (PageTransTail(page)) continue; @@ -4293,13 +4289,12 @@ void check_move_unevictable_pages(struct pagevec *pvec) if (!TestClearPageLRU(page)) continue; - if (pagepgdat != pgdat) { - if (pgdat) - spin_unlock_irq(&pgdat->lru_lock); - pgdat = pagepgdat; - spin_lock_irq(&pgdat->lru_lock); + new_lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page)); + if (lruvec != new_lruvec) { + if (lruvec) + unlock_page_lruvec_irq(lruvec); + lruvec = lock_page_lruvec_irq(page); } - lruvec = mem_cgroup_page_lruvec(page, pgdat); if (page_evictable(page) && PageUnevictable(page)) { enum lru_list lru = page_lru_base_type(page); @@ -4313,10 +4308,10 @@ void check_move_unevictable_pages(struct pagevec *pvec) SetPageLRU(page); } - if (pgdat) { + if (lruvec) { __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued); __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned); - spin_unlock_irq(&pgdat->lru_lock); + unlock_page_lruvec_irq(lruvec); } else if (pgscanned) { count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned); }
On Fri, Nov 06, 2020 at 03:48:16PM +0800, Alex Shi wrote: > From 84e69f892119d99612e9668e3fe47a3922bafff1 Mon Sep 17 00:00:00 2001 > From: Alex Shi <alex.shi@linux.alibaba.com> > Date: Tue, 18 Aug 2020 16:44:21 +0800 > Subject: [PATCH v21 17/19] mm/lru: replace pgdat lru_lock with lruvec lock > > This patch moves per node lru_lock into lruvec, thus bring a lru_lock for > each of memcg per node. So on a large machine, each of memcg don't > have to suffer from per node pgdat->lru_lock competition. They could go > fast with their self lru_lock. > > After move memcg charge before lru inserting, page isolation could > serialize page's memcg, then per memcg lruvec lock is stable and could > replace per node lru lock. > > In func isolate_migratepages_block, compact_unlock_should_abort and > lock_page_lruvec_irqsave are open coded to work with compact_control. > Also add a debug func in locking which may give some clues if there are > sth out of hands. > > Daniel Jordan's testing show 62% improvement on modified readtwice case > on his 2P * 10 core * 2 HT broadwell box. > https://lore.kernel.org/lkml/20200915165807.kpp7uhiw7l3loofu@ca-dmjordan1.us.oracle.com/ > > On a large machine with memcg enabled but not used, the page's lruvec > seeking pass a few pointers, that may lead to lru_lock holding time > increase and a bit regression. > > Hugh Dickins helped on the patch polish, thanks! > > Signed-off-by: Alex Shi <alex.shi@linux.alibaba.com> > Acked-by: Hugh Dickins <hughd@google.com> > Cc: Rong Chen <rong.a.chen@intel.com> > Cc: Hugh Dickins <hughd@google.com> > Cc: Andrew Morton <akpm@linux-foundation.org> > Cc: Johannes Weiner <hannes@cmpxchg.org> > Cc: Michal Hocko <mhocko@kernel.org> > Cc: Vladimir Davydov <vdavydov.dev@gmail.com> > Cc: Yang Shi <yang.shi@linux.alibaba.com> > Cc: Matthew Wilcox <willy@infradead.org> > Cc: Konstantin Khlebnikov <khlebnikov@yandex-team.ru> > Cc: Tejun Heo <tj@kernel.org> > Cc: linux-kernel@vger.kernel.org > Cc: linux-mm@kvack.org > Cc: cgroups@vger.kernel.org Acked-by: Johannes Weiner <hannes@cmpxchg.org>
On 11/5/20 9:55 AM, Alex Shi wrote: > This patch moves per node lru_lock into lruvec, thus bring a lru_lock for > each of memcg per node. So on a large machine, each of memcg don't > have to suffer from per node pgdat->lru_lock competition. They could go > fast with their self lru_lock. > > After move memcg charge before lru inserting, page isolation could > serialize page's memcg, then per memcg lruvec lock is stable and could > replace per node lru lock. > > In func isolate_migratepages_block, compact_unlock_should_abort and > lock_page_lruvec_irqsave are open coded to work with compact_control. > Also add a debug func in locking which may give some clues if there are > sth out of hands. > > Daniel Jordan's testing show 62% improvement on modified readtwice case > on his 2P * 10 core * 2 HT broadwell box. > https://lore.kernel.org/lkml/20200915165807.kpp7uhiw7l3loofu@ca-dmjordan1.us.oracle.com/ > > On a large machine with memcg enabled but not used, the page's lruvec > seeking pass a few pointers, that may lead to lru_lock holding time > increase and a bit regression. > > Hugh Dickins helped on the patch polish, thanks! > > Signed-off-by: Alex Shi <alex.shi@linux.alibaba.com> > Acked-by: Hugh Dickins <hughd@google.com> > Cc: Rong Chen <rong.a.chen@intel.com> > Cc: Hugh Dickins <hughd@google.com> > Cc: Andrew Morton <akpm@linux-foundation.org> > Cc: Johannes Weiner <hannes@cmpxchg.org> > Cc: Michal Hocko <mhocko@kernel.org> > Cc: Vladimir Davydov <vdavydov.dev@gmail.com> > Cc: Yang Shi <yang.shi@linux.alibaba.com> > Cc: Matthew Wilcox <willy@infradead.org> > Cc: Konstantin Khlebnikov <khlebnikov@yandex-team.ru> > Cc: Tejun Heo <tj@kernel.org> > Cc: linux-kernel@vger.kernel.org > Cc: linux-mm@kvack.org > Cc: cgroups@vger.kernel.org Acked-by: Vlastimil Babka <vbabka@suse.cz>
On 11/11/20 6:46 PM, Vlastimil Babka wrote:
> Acked-by: Vlastimil Babka<vbabka@suse.cz>
Err, not yet, that was supposed for patch 16/17
On 11/5/20 9:55 AM, Alex Shi wrote: > This patch moves per node lru_lock into lruvec, thus bring a lru_lock for > each of memcg per node. So on a large machine, each of memcg don't > have to suffer from per node pgdat->lru_lock competition. They could go > fast with their self lru_lock. > > After move memcg charge before lru inserting, page isolation could > serialize page's memcg, then per memcg lruvec lock is stable and could > replace per node lru lock. > > In func isolate_migratepages_block, compact_unlock_should_abort and > lock_page_lruvec_irqsave are open coded to work with compact_control. > Also add a debug func in locking which may give some clues if there are > sth out of hands. > > Daniel Jordan's testing show 62% improvement on modified readtwice case > on his 2P * 10 core * 2 HT broadwell box. > https://lore.kernel.org/lkml/20200915165807.kpp7uhiw7l3loofu@ca-dmjordan1.us.oracle.com/ > > On a large machine with memcg enabled but not used, the page's lruvec > seeking pass a few pointers, that may lead to lru_lock holding time > increase and a bit regression. > > Hugh Dickins helped on the patch polish, thanks! > > Signed-off-by: Alex Shi <alex.shi@linux.alibaba.com> > Acked-by: Hugh Dickins <hughd@google.com> > Cc: Rong Chen <rong.a.chen@intel.com> > Cc: Hugh Dickins <hughd@google.com> > Cc: Andrew Morton <akpm@linux-foundation.org> > Cc: Johannes Weiner <hannes@cmpxchg.org> > Cc: Michal Hocko <mhocko@kernel.org> > Cc: Vladimir Davydov <vdavydov.dev@gmail.com> > Cc: Yang Shi <yang.shi@linux.alibaba.com> > Cc: Matthew Wilcox <willy@infradead.org> > Cc: Konstantin Khlebnikov <khlebnikov@yandex-team.ru> > Cc: Tejun Heo <tj@kernel.org> > Cc: linux-kernel@vger.kernel.org > Cc: linux-mm@kvack.org > Cc: cgroups@vger.kernel.org I think I need some explanation about the rcu_read_lock() usage in lock_page_lruvec*() (and places effectively opencoding it). Preferably in form of some code comment, but that can be also added as a additional patch later, I don't want to block the series. mem_cgroup_page_lruvec() comment says * This function relies on page->mem_cgroup being stable - see the * access rules in commit_charge(). commit_charge() comment: * Any of the following ensures page->mem_cgroup stability: * * - the page lock * - LRU isolation * - lock_page_memcg() * - exclusive reference "LRU isolation" used to be quite clear, but now is it after TestClearPageLRU(page) or after deleting from the lru list as well? Also it doesn't mention rcu_read_lock(), should it? So what exactly are we protecting by rcu_read_lock() in e.g. lock_page_lruvec()? rcu_read_lock(); lruvec = mem_cgroup_page_lruvec(page, pgdat); spin_lock(&lruvec->lru_lock); rcu_read_unlock(); Looks like we are protecting the lruvec from going away and it can't go away anymore after we take the lru_lock? But then e.g. in __munlock_pagevec() we are doing this without an rcu_read_lock(): new_lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page)); where new_lruvec is potentionally not the one that we have locked And the last thing mem_cgroup_page_lruvec() is doing is: if (unlikely(lruvec->pgdat != pgdat)) lruvec->pgdat = pgdat; return lruvec; So without the rcu_read_lock() is this potentionally accessing the pgdat field of lruvec that might have just gone away? Thanks, Vlastimil
在 2020/11/12 下午8:19, Vlastimil Babka 写道: > On 11/5/20 9:55 AM, Alex Shi wrote: >> This patch moves per node lru_lock into lruvec, thus bring a lru_lock for >> each of memcg per node. So on a large machine, each of memcg don't >> have to suffer from per node pgdat->lru_lock competition. They could go >> fast with their self lru_lock. >> >> After move memcg charge before lru inserting, page isolation could >> serialize page's memcg, then per memcg lruvec lock is stable and could >> replace per node lru lock. >> >> In func isolate_migratepages_block, compact_unlock_should_abort and >> lock_page_lruvec_irqsave are open coded to work with compact_control. >> Also add a debug func in locking which may give some clues if there are >> sth out of hands. >> >> Daniel Jordan's testing show 62% improvement on modified readtwice case >> on his 2P * 10 core * 2 HT broadwell box. >> https://lore.kernel.org/lkml/20200915165807.kpp7uhiw7l3loofu@ca-dmjordan1.us.oracle.com/ >> >> On a large machine with memcg enabled but not used, the page's lruvec >> seeking pass a few pointers, that may lead to lru_lock holding time >> increase and a bit regression. >> >> Hugh Dickins helped on the patch polish, thanks! >> >> Signed-off-by: Alex Shi <alex.shi@linux.alibaba.com> >> Acked-by: Hugh Dickins <hughd@google.com> >> Cc: Rong Chen <rong.a.chen@intel.com> >> Cc: Hugh Dickins <hughd@google.com> >> Cc: Andrew Morton <akpm@linux-foundation.org> >> Cc: Johannes Weiner <hannes@cmpxchg.org> >> Cc: Michal Hocko <mhocko@kernel.org> >> Cc: Vladimir Davydov <vdavydov.dev@gmail.com> >> Cc: Yang Shi <yang.shi@linux.alibaba.com> >> Cc: Matthew Wilcox <willy@infradead.org> >> Cc: Konstantin Khlebnikov <khlebnikov@yandex-team.ru> >> Cc: Tejun Heo <tj@kernel.org> >> Cc: linux-kernel@vger.kernel.org >> Cc: linux-mm@kvack.org >> Cc: cgroups@vger.kernel.org > > I think I need some explanation about the rcu_read_lock() usage in lock_page_lruvec*() (and places effectively opencoding it). > Preferably in form of some code comment, but that can be also added as a additional patch later, I don't want to block the series. > Hi Vlastimil, Thanks for comments! Oh, we did talk about the rcu_read_lock which is used to block memcg destroy during locking. and the spin_lock actually includes a rcu_read_lock(). Yes, we could add this comments later. > mem_cgroup_page_lruvec() comment says > > * This function relies on page->mem_cgroup being stable - see the > * access rules in commit_charge(). > > commit_charge() comment: > > * Any of the following ensures page->mem_cgroup stability: > * > * - the page lock > * - LRU isolation > * - lock_page_memcg() > * - exclusive reference > > "LRU isolation" used to be quite clear, but now is it after TestClearPageLRU(page) or after deleting from the lru list as well? > Also it doesn't mention rcu_read_lock(), should it? The lru isolation still is same as old conception, a set actions that take a page from a lru list, and commit_charge do need a isoltion for the page. but the condition of page_memcg could be change since we don't rely on lru isolation for it. The comments could be changed later. > > So what exactly are we protecting by rcu_read_lock() in e.g. lock_page_lruvec()? > > rcu_read_lock(); > lruvec = mem_cgroup_page_lruvec(page, pgdat); > spin_lock(&lruvec->lru_lock); > rcu_read_unlock(); > > Looks like we are protecting the lruvec from going away and it can't go away anymore after we take the lru_lock? > > But then e.g. in __munlock_pagevec() we are doing this without an rcu_read_lock(): > > new_lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page)); TestClearPageLRU could block the page from memcg migration/destory. Thanks Alex > > where new_lruvec is potentionally not the one that we have locked > > And the last thing mem_cgroup_page_lruvec() is doing is: > > if (unlikely(lruvec->pgdat != pgdat)) > lruvec->pgdat = pgdat; > return lruvec; > > So without the rcu_read_lock() is this potentionally accessing the pgdat field of lruvec that might have just gone away? > > Thanks, > Vlastimil
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 0f4dd7829fb2..6ecb08ff4ad1 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -666,6 +666,19 @@ static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg, struct mem_cgroup *get_mem_cgroup_from_page(struct page *page); +struct lruvec *lock_page_lruvec(struct page *page); +struct lruvec *lock_page_lruvec_irq(struct page *page); +struct lruvec *lock_page_lruvec_irqsave(struct page *page, + unsigned long *flags); + +#ifdef CONFIG_DEBUG_VM +void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page); +#else +static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page) +{ +} +#endif + static inline struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){ return css ? container_of(css, struct mem_cgroup, css) : NULL; @@ -1233,6 +1246,31 @@ static inline void mem_cgroup_put(struct mem_cgroup *memcg) { } +static inline struct lruvec *lock_page_lruvec(struct page *page) +{ + struct pglist_data *pgdat = page_pgdat(page); + + spin_lock(&pgdat->__lruvec.lru_lock); + return &pgdat->__lruvec; +} + +static inline struct lruvec *lock_page_lruvec_irq(struct page *page) +{ + struct pglist_data *pgdat = page_pgdat(page); + + spin_lock_irq(&pgdat->__lruvec.lru_lock); + return &pgdat->__lruvec; +} + +static inline struct lruvec *lock_page_lruvec_irqsave(struct page *page, + unsigned long *flagsp) +{ + struct pglist_data *pgdat = page_pgdat(page); + + spin_lock_irqsave(&pgdat->__lruvec.lru_lock, *flagsp); + return &pgdat->__lruvec; +} + static inline struct mem_cgroup * mem_cgroup_iter(struct mem_cgroup *root, struct mem_cgroup *prev, @@ -1476,6 +1514,10 @@ static inline void count_memcg_page_event(struct page *page, void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) { } + +static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page) +{ +} #endif /* CONFIG_MEMCG */ /* idx can be of type enum memcg_stat_item or node_stat_item */ @@ -1605,6 +1647,22 @@ static inline struct lruvec *parent_lruvec(struct lruvec *lruvec) return mem_cgroup_lruvec(memcg, lruvec_pgdat(lruvec)); } +static inline void unlock_page_lruvec(struct lruvec *lruvec) +{ + spin_unlock(&lruvec->lru_lock); +} + +static inline void unlock_page_lruvec_irq(struct lruvec *lruvec) +{ + spin_unlock_irq(&lruvec->lru_lock); +} + +static inline void unlock_page_lruvec_irqrestore(struct lruvec *lruvec, + unsigned long flags) +{ + spin_unlock_irqrestore(&lruvec->lru_lock, flags); +} + #ifdef CONFIG_CGROUP_WRITEBACK struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index fb3bf696c05e..0afba4ea2a21 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -276,6 +276,8 @@ enum lruvec_flags { struct lruvec { struct list_head lists[NR_LRU_LISTS]; + /* per lruvec lru_lock for memcg */ + spinlock_t lru_lock; /* * These track the cost of reclaiming one LRU - file or anon - * over the other. As the observed cost of reclaiming one LRU @@ -796,7 +798,6 @@ struct deferred_split { /* Write-intensive fields used by page reclaim */ ZONE_PADDING(_pad1_) - spinlock_t lru_lock; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* diff --git a/mm/compaction.c b/mm/compaction.c index 7b1cf48884dd..9cfe90961493 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -804,7 +804,7 @@ static bool too_many_isolated(pg_data_t *pgdat) unsigned long nr_scanned = 0, nr_isolated = 0; struct lruvec *lruvec; unsigned long flags = 0; - bool locked = false; + struct lruvec *locked = NULL; struct page *page = NULL, *valid_page = NULL; unsigned long start_pfn = low_pfn; bool skip_on_failure = false; @@ -864,11 +864,20 @@ static bool too_many_isolated(pg_data_t *pgdat) * contention, to give chance to IRQs. Abort completely if * a fatal signal is pending. */ - if (!(low_pfn % SWAP_CLUSTER_MAX) - && compact_unlock_should_abort(&pgdat->lru_lock, - flags, &locked, cc)) { - low_pfn = 0; - goto fatal_pending; + if (!(low_pfn % SWAP_CLUSTER_MAX)) { + if (locked) { + unlock_page_lruvec_irqrestore(locked, flags); + locked = NULL; + } + + if (fatal_signal_pending(current)) { + cc->contended = true; + + low_pfn = 0; + goto fatal_pending; + } + + cond_resched(); } if (!pfn_valid_within(low_pfn)) @@ -940,9 +949,8 @@ static bool too_many_isolated(pg_data_t *pgdat) if (unlikely(__PageMovable(page)) && !PageIsolated(page)) { if (locked) { - spin_unlock_irqrestore(&pgdat->lru_lock, - flags); - locked = false; + unlock_page_lruvec_irqrestore(locked, flags); + locked = NULL; } if (!isolate_movable_page(page, isolate_mode)) @@ -983,10 +991,19 @@ static bool too_many_isolated(pg_data_t *pgdat) if (!TestClearPageLRU(page)) goto isolate_fail_put; + rcu_read_lock(); + lruvec = mem_cgroup_page_lruvec(page, pgdat); + /* If we already hold the lock, we can skip some rechecking */ - if (!locked) { - locked = compact_lock_irqsave(&pgdat->lru_lock, - &flags, cc); + if (lruvec != locked) { + if (locked) + unlock_page_lruvec_irqrestore(locked, flags); + + compact_lock_irqsave(&lruvec->lru_lock, &flags, cc); + locked = lruvec; + rcu_read_unlock(); + + lruvec_memcg_debug(lruvec, page); /* Try get exclusive access under lock */ if (!skip_updated) { @@ -1005,9 +1022,8 @@ static bool too_many_isolated(pg_data_t *pgdat) SetPageLRU(page); goto isolate_fail_put; } - } - - lruvec = mem_cgroup_page_lruvec(page, pgdat); + } else + rcu_read_unlock(); /* The whole page is taken off the LRU; skip the tail pages. */ if (PageCompound(page)) @@ -1041,8 +1057,8 @@ static bool too_many_isolated(pg_data_t *pgdat) isolate_fail_put: /* Avoid potential deadlock in freeing page under lru_lock */ if (locked) { - spin_unlock_irqrestore(&pgdat->lru_lock, flags); - locked = false; + unlock_page_lruvec_irqrestore(locked, flags); + locked = NULL; } put_page(page); @@ -1057,8 +1073,8 @@ static bool too_many_isolated(pg_data_t *pgdat) */ if (nr_isolated) { if (locked) { - spin_unlock_irqrestore(&pgdat->lru_lock, flags); - locked = false; + unlock_page_lruvec_irqrestore(locked, flags); + locked = NULL; } putback_movable_pages(&cc->migratepages); cc->nr_migratepages = 0; @@ -1086,7 +1102,7 @@ static bool too_many_isolated(pg_data_t *pgdat) isolate_abort: if (locked) - spin_unlock_irqrestore(&pgdat->lru_lock, flags); + unlock_page_lruvec_irqrestore(locked, flags); if (page) { SetPageLRU(page); put_page(page); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b70ec0c6076b..94e42dba052a 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2354,7 +2354,7 @@ static void lru_add_page_tail(struct page *head, struct page *tail, VM_BUG_ON_PAGE(!PageHead(head), head); VM_BUG_ON_PAGE(PageCompound(tail), head); VM_BUG_ON_PAGE(PageLRU(tail), head); - lockdep_assert_held(&lruvec_pgdat(lruvec)->lru_lock); + lockdep_assert_held(&lruvec->lru_lock); if (list) { /* page reclaim is reclaiming a huge page */ @@ -2438,7 +2438,6 @@ static void __split_huge_page(struct page *page, struct list_head *list, pgoff_t end) { struct page *head = compound_head(page); - pg_data_t *pgdat = page_pgdat(head); struct lruvec *lruvec; struct address_space *swap_cache = NULL; unsigned long offset = 0; @@ -2456,10 +2455,8 @@ static void __split_huge_page(struct page *page, struct list_head *list, xa_lock(&swap_cache->i_pages); } - /* prevent PageLRU to go away from under us, and freeze lru stats */ - spin_lock(&pgdat->lru_lock); - - lruvec = mem_cgroup_page_lruvec(head, pgdat); + /* lock lru list/PageCompound, ref freezed by page_ref_freeze */ + lruvec = lock_page_lruvec(head); for (i = nr - 1; i >= 1; i--) { __split_huge_page_tail(head, i, lruvec, list); @@ -2480,7 +2477,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, } ClearPageCompound(head); - spin_unlock(&pgdat->lru_lock); + unlock_page_lruvec(lruvec); /* Caller disabled irqs, so they are still disabled here */ split_page_owner(head, nr); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 157b745031a4..91226af58ce8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -20,6 +20,9 @@ * Lockless page tracking & accounting * Unified hierarchy configuration model * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner + * + * Per memcg lru locking + * Copyright (C) 2020 Alibaba, Inc, Alex Shi */ #include <linux/page_counter.h> @@ -1305,6 +1308,19 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, return ret; } +#ifdef CONFIG_DEBUG_VM +void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page) +{ + if (mem_cgroup_disabled()) + return; + + if (!page->mem_cgroup) + VM_BUG_ON_PAGE(lruvec_memcg(lruvec) != root_mem_cgroup, page); + else + VM_BUG_ON_PAGE(lruvec_memcg(lruvec) != page->mem_cgroup, page); +} +#endif + /** * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page * @page: the page @@ -1343,6 +1359,59 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgd } /** + * lock_page_lruvec - lock and return lruvec for a given page. + * @page: the page + * + * This series functions should be used in either conditions: + * PageLRU is cleared or unset + * or page is locked. + */ +struct lruvec *lock_page_lruvec(struct page *page) +{ + struct lruvec *lruvec; + struct pglist_data *pgdat = page_pgdat(page); + + rcu_read_lock(); + lruvec = mem_cgroup_page_lruvec(page, pgdat); + spin_lock(&lruvec->lru_lock); + rcu_read_unlock(); + + lruvec_memcg_debug(lruvec, page); + + return lruvec; +} + +struct lruvec *lock_page_lruvec_irq(struct page *page) +{ + struct lruvec *lruvec; + struct pglist_data *pgdat = page_pgdat(page); + + rcu_read_lock(); + lruvec = mem_cgroup_page_lruvec(page, pgdat); + spin_lock_irq(&lruvec->lru_lock); + rcu_read_unlock(); + + lruvec_memcg_debug(lruvec, page); + + return lruvec; +} + +struct lruvec *lock_page_lruvec_irqsave(struct page *page, unsigned long *flags) +{ + struct lruvec *lruvec; + struct pglist_data *pgdat = page_pgdat(page); + + rcu_read_lock(); + lruvec = mem_cgroup_page_lruvec(page, pgdat); + spin_lock_irqsave(&lruvec->lru_lock, *flags); + rcu_read_unlock(); + + lruvec_memcg_debug(lruvec, page); + + return lruvec; +} + +/** * mem_cgroup_update_lru_size - account for adding or removing an lru page * @lruvec: mem_cgroup per zone lru vector * @lru: index of lru list the page is sitting on @@ -3245,10 +3314,8 @@ void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size) #endif /* CONFIG_MEMCG_KMEM */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE - /* - * Because tail pages are not marked as "used", set it. We're under - * pgdat->lru_lock and migration entries setup in all page mappings. + * Because page->mem_cgroup is not set on compound tails, set it now. */ void mem_cgroup_split_huge_fixup(struct page *head) { diff --git a/mm/mlock.c b/mm/mlock.c index 7b0e6334be6f..ab164a675c25 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -262,12 +262,12 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) int nr = pagevec_count(pvec); int delta_munlocked = -nr; struct pagevec pvec_putback; + struct lruvec *lruvec = NULL; int pgrescued = 0; pagevec_init(&pvec_putback); /* Phase 1: page isolation */ - spin_lock_irq(&zone->zone_pgdat->lru_lock); for (i = 0; i < nr; i++) { struct page *page = pvec->pages[i]; @@ -277,10 +277,16 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) * so we can spare the get_page() here. */ if (TestClearPageLRU(page)) { - struct lruvec *lruvec; + struct lruvec *new_lruvec; + + new_lruvec = mem_cgroup_page_lruvec(page, + page_pgdat(page)); + if (new_lruvec != lruvec) { + if (lruvec) + unlock_page_lruvec_irq(lruvec); + lruvec = lock_page_lruvec_irq(page); + } - lruvec = mem_cgroup_page_lruvec(page, - page_pgdat(page)); del_page_from_lru_list(page, lruvec, page_lru(page)); continue; @@ -299,8 +305,12 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) pagevec_add(&pvec_putback, pvec->pages[i]); pvec->pages[i] = NULL; } - __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); - spin_unlock_irq(&zone->zone_pgdat->lru_lock); + if (lruvec) { + __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); + unlock_page_lruvec_irq(lruvec); + } else if (delta_munlocked) { + mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); + } /* Now we can release pins of pages that we are not munlocking */ pagevec_release(&pvec_putback); diff --git a/mm/mmzone.c b/mm/mmzone.c index 4686fdc23bb9..3750a90ed4a0 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -91,6 +91,7 @@ void lruvec_init(struct lruvec *lruvec) enum lru_list lru; memset(lruvec, 0, sizeof(struct lruvec)); + spin_lock_init(&lruvec->lru_lock); for_each_lru(lru) INIT_LIST_HEAD(&lruvec->lists[lru]); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d77220615fd5..74bf7f4c6317 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6808,7 +6808,6 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat) init_waitqueue_head(&pgdat->pfmemalloc_wait); pgdat_page_ext_init(pgdat); - spin_lock_init(&pgdat->lru_lock); lruvec_init(&pgdat->__lruvec); } diff --git a/mm/swap.c b/mm/swap.c index 1838a9535703..ed033f7c4f2d 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -79,16 +79,14 @@ static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = { static void __page_cache_release(struct page *page) { if (PageLRU(page)) { - pg_data_t *pgdat = page_pgdat(page); struct lruvec *lruvec; unsigned long flags; - spin_lock_irqsave(&pgdat->lru_lock, flags); - lruvec = mem_cgroup_page_lruvec(page, pgdat); + lruvec = lock_page_lruvec_irqsave(page, &flags); VM_BUG_ON_PAGE(!PageLRU(page), page); __ClearPageLRU(page); del_page_from_lru_list(page, lruvec, page_off_lru(page)); - spin_unlock_irqrestore(&pgdat->lru_lock, flags); + unlock_page_lruvec_irqrestore(lruvec, flags); } __ClearPageWaiters(page); } @@ -207,32 +205,30 @@ static void pagevec_lru_move_fn(struct pagevec *pvec, void (*move_fn)(struct page *page, struct lruvec *lruvec)) { int i; - struct pglist_data *pgdat = NULL; - struct lruvec *lruvec; + struct lruvec *lruvec = NULL; unsigned long flags = 0; for (i = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; - struct pglist_data *pagepgdat = page_pgdat(page); - - if (pagepgdat != pgdat) { - if (pgdat) - spin_unlock_irqrestore(&pgdat->lru_lock, flags); - pgdat = pagepgdat; - spin_lock_irqsave(&pgdat->lru_lock, flags); - } + struct lruvec *new_lruvec; /* block memcg migration during page moving between lru */ if (!TestClearPageLRU(page)) continue; - lruvec = mem_cgroup_page_lruvec(page, pgdat); + new_lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page)); + if (lruvec != new_lruvec) { + if (lruvec) + unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec = lock_page_lruvec_irqsave(page, &flags); + } + (*move_fn)(page, lruvec); SetPageLRU(page); } - if (pgdat) - spin_unlock_irqrestore(&pgdat->lru_lock, flags); + if (lruvec) + unlock_page_lruvec_irqrestore(lruvec, flags); release_pages(pvec->pages, pvec->nr); pagevec_reinit(pvec); } @@ -274,9 +270,15 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages) { do { unsigned long lrusize; - struct pglist_data *pgdat = lruvec_pgdat(lruvec); - spin_lock_irq(&pgdat->lru_lock); + /* + * Hold lruvec->lru_lock is safe here, since + * 1) The pinned lruvec in reclaim, or + * 2) From a pre-LRU page during refault (which also holds the + * rcu lock, so would be safe even if the page was on the LRU + * and could move simultaneously to a new lruvec). + */ + spin_lock_irq(&lruvec->lru_lock); /* Record cost event */ if (file) lruvec->file_cost += nr_pages; @@ -300,7 +302,7 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages) lruvec->file_cost /= 2; lruvec->anon_cost /= 2; } - spin_unlock_irq(&pgdat->lru_lock); + spin_unlock_irq(&lruvec->lru_lock); } while ((lruvec = parent_lruvec(lruvec))); } @@ -364,13 +366,15 @@ static inline void activate_page_drain(int cpu) static void activate_page(struct page *page) { - pg_data_t *pgdat = page_pgdat(page); + struct lruvec *lruvec; page = compound_head(page); - spin_lock_irq(&pgdat->lru_lock); - if (PageLRU(page)) - __activate_page(page, mem_cgroup_page_lruvec(page, pgdat)); - spin_unlock_irq(&pgdat->lru_lock); + if (TestClearPageLRU(page)) { + lruvec = lock_page_lruvec_irq(page); + __activate_page(page, lruvec); + unlock_page_lruvec_irq(lruvec); + SetPageLRU(page); + } } #endif @@ -860,8 +864,7 @@ void release_pages(struct page **pages, int nr) { int i; LIST_HEAD(pages_to_free); - struct pglist_data *locked_pgdat = NULL; - struct lruvec *lruvec; + struct lruvec *lruvec = NULL; unsigned long flags; unsigned int lock_batch; @@ -871,11 +874,11 @@ void release_pages(struct page **pages, int nr) /* * Make sure the IRQ-safe lock-holding time does not get * excessive with a continuous string of pages from the - * same pgdat. The lock is held only if pgdat != NULL. + * same lruvec. The lock is held only if lruvec != NULL. */ - if (locked_pgdat && ++lock_batch == SWAP_CLUSTER_MAX) { - spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags); - locked_pgdat = NULL; + if (lruvec && ++lock_batch == SWAP_CLUSTER_MAX) { + unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec = NULL; } page = compound_head(page); @@ -883,10 +886,9 @@ void release_pages(struct page **pages, int nr) continue; if (is_zone_device_page(page)) { - if (locked_pgdat) { - spin_unlock_irqrestore(&locked_pgdat->lru_lock, - flags); - locked_pgdat = NULL; + if (lruvec) { + unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec = NULL; } /* * ZONE_DEVICE pages that return 'false' from @@ -907,27 +909,27 @@ void release_pages(struct page **pages, int nr) continue; if (PageCompound(page)) { - if (locked_pgdat) { - spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags); - locked_pgdat = NULL; + if (lruvec) { + unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec = NULL; } __put_compound_page(page); continue; } if (PageLRU(page)) { - struct pglist_data *pgdat = page_pgdat(page); + struct lruvec *new_lruvec; - if (pgdat != locked_pgdat) { - if (locked_pgdat) - spin_unlock_irqrestore(&locked_pgdat->lru_lock, + new_lruvec = mem_cgroup_page_lruvec(page, + page_pgdat(page)); + if (new_lruvec != lruvec) { + if (lruvec) + unlock_page_lruvec_irqrestore(lruvec, flags); lock_batch = 0; - locked_pgdat = pgdat; - spin_lock_irqsave(&locked_pgdat->lru_lock, flags); + lruvec = lock_page_lruvec_irqsave(page, &flags); } - lruvec = mem_cgroup_page_lruvec(page, locked_pgdat); VM_BUG_ON_PAGE(!PageLRU(page), page); __ClearPageLRU(page); del_page_from_lru_list(page, lruvec, page_off_lru(page)); @@ -937,8 +939,8 @@ void release_pages(struct page **pages, int nr) list_add(&page->lru, &pages_to_free); } - if (locked_pgdat) - spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags); + if (lruvec) + unlock_page_lruvec_irqrestore(lruvec, flags); mem_cgroup_uncharge_list(&pages_to_free); free_unref_page_list(&pages_to_free); @@ -1026,26 +1028,24 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec) void __pagevec_lru_add(struct pagevec *pvec) { int i; - struct pglist_data *pgdat = NULL; - struct lruvec *lruvec; + struct lruvec *lruvec = NULL; unsigned long flags = 0; for (i = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; - struct pglist_data *pagepgdat = page_pgdat(page); + struct lruvec *new_lruvec; - if (pagepgdat != pgdat) { - if (pgdat) - spin_unlock_irqrestore(&pgdat->lru_lock, flags); - pgdat = pagepgdat; - spin_lock_irqsave(&pgdat->lru_lock, flags); + new_lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page)); + if (lruvec != new_lruvec) { + if (lruvec) + unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec = lock_page_lruvec_irqsave(page, &flags); } - lruvec = mem_cgroup_page_lruvec(page, pgdat); __pagevec_lru_add_fn(page, lruvec); } - if (pgdat) - spin_unlock_irqrestore(&pgdat->lru_lock, flags); + if (lruvec) + unlock_page_lruvec_irqrestore(lruvec, flags); release_pages(pvec->pages, pvec->nr); pagevec_reinit(pvec); } diff --git a/mm/vmscan.c b/mm/vmscan.c index 0be55d875fde..2953ddec88a0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1765,14 +1765,12 @@ int isolate_lru_page(struct page *page) WARN_RATELIMIT(PageTail(page), "trying to isolate tail page"); if (TestClearPageLRU(page)) { - pg_data_t *pgdat = page_pgdat(page); struct lruvec *lruvec; get_page(page); - lruvec = mem_cgroup_page_lruvec(page, pgdat); - spin_lock_irq(&pgdat->lru_lock); + lruvec = lock_page_lruvec_irq(page); del_page_from_lru_list(page, lruvec, page_lru(page)); - spin_unlock_irq(&pgdat->lru_lock); + unlock_page_lruvec_irq(lruvec); ret = 0; } @@ -1839,7 +1837,6 @@ static int too_many_isolated(struct pglist_data *pgdat, int file, static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, struct list_head *list) { - struct pglist_data *pgdat = lruvec_pgdat(lruvec); int nr_pages, nr_moved = 0; LIST_HEAD(pages_to_free); struct page *page; @@ -1850,9 +1847,9 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, VM_BUG_ON_PAGE(PageLRU(page), page); list_del(&page->lru); if (unlikely(!page_evictable(page))) { - spin_unlock_irq(&pgdat->lru_lock); + spin_unlock_irq(&lruvec->lru_lock); putback_lru_page(page); - spin_lock_irq(&pgdat->lru_lock); + spin_lock_irq(&lruvec->lru_lock); continue; } @@ -1874,9 +1871,9 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, __ClearPageActive(page); if (unlikely(PageCompound(page))) { - spin_unlock_irq(&pgdat->lru_lock); + spin_unlock_irq(&lruvec->lru_lock); destroy_compound_page(page); - spin_lock_irq(&pgdat->lru_lock); + spin_lock_irq(&lruvec->lru_lock); } else list_add(&page->lru, &pages_to_free); @@ -1953,7 +1950,7 @@ static int current_may_throttle(void) lru_add_drain(); - spin_lock_irq(&pgdat->lru_lock); + spin_lock_irq(&lruvec->lru_lock); nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list, &nr_scanned, sc, lru); @@ -1965,7 +1962,7 @@ static int current_may_throttle(void) __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned); __count_vm_events(PGSCAN_ANON + file, nr_scanned); - spin_unlock_irq(&pgdat->lru_lock); + spin_unlock_irq(&lruvec->lru_lock); if (nr_taken == 0) return 0; @@ -1973,7 +1970,7 @@ static int current_may_throttle(void) nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, 0, &stat, false); - spin_lock_irq(&pgdat->lru_lock); + spin_lock_irq(&lruvec->lru_lock); move_pages_to_lru(lruvec, &page_list); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); @@ -1982,7 +1979,7 @@ static int current_may_throttle(void) __count_vm_events(item, nr_reclaimed); __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed); __count_vm_events(PGSTEAL_ANON + file, nr_reclaimed); - spin_unlock_irq(&pgdat->lru_lock); + spin_unlock_irq(&lruvec->lru_lock); lru_note_cost(lruvec, file, stat.nr_pageout); mem_cgroup_uncharge_list(&page_list); @@ -2035,7 +2032,7 @@ static void shrink_active_list(unsigned long nr_to_scan, lru_add_drain(); - spin_lock_irq(&pgdat->lru_lock); + spin_lock_irq(&lruvec->lru_lock); nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, &nr_scanned, sc, lru); @@ -2046,7 +2043,7 @@ static void shrink_active_list(unsigned long nr_to_scan, __count_vm_events(PGREFILL, nr_scanned); __count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned); - spin_unlock_irq(&pgdat->lru_lock); + spin_unlock_irq(&lruvec->lru_lock); while (!list_empty(&l_hold)) { cond_resched(); @@ -2092,7 +2089,7 @@ static void shrink_active_list(unsigned long nr_to_scan, /* * Move pages back to the lru list. */ - spin_lock_irq(&pgdat->lru_lock); + spin_lock_irq(&lruvec->lru_lock); nr_activate = move_pages_to_lru(lruvec, &l_active); nr_deactivate = move_pages_to_lru(lruvec, &l_inactive); @@ -2103,7 +2100,7 @@ static void shrink_active_list(unsigned long nr_to_scan, __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); - spin_unlock_irq(&pgdat->lru_lock); + spin_unlock_irq(&lruvec->lru_lock); mem_cgroup_uncharge_list(&l_active); free_unref_page_list(&l_active); @@ -2693,10 +2690,10 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) /* * Determine the scan balance between anon and file LRUs. */ - spin_lock_irq(&pgdat->lru_lock); + spin_lock_irq(&target_lruvec->lru_lock); sc->anon_cost = target_lruvec->anon_cost; sc->file_cost = target_lruvec->file_cost; - spin_unlock_irq(&pgdat->lru_lock); + spin_unlock_irq(&target_lruvec->lru_lock); /* * Target desirable inactive:active list ratios for the anon @@ -4272,16 +4269,15 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) */ void check_move_unevictable_pages(struct pagevec *pvec) { - struct lruvec *lruvec; - struct pglist_data *pgdat = NULL; + struct lruvec *lruvec = NULL; int pgscanned = 0; int pgrescued = 0; int i; for (i = 0; i < pvec->nr; i++) { struct page *page = pvec->pages[i]; - struct pglist_data *pagepgdat = page_pgdat(page); int nr_pages; + struct lruvec *new_lruvec; if (PageTransTail(page)) continue; @@ -4293,13 +4289,12 @@ void check_move_unevictable_pages(struct pagevec *pvec) if (!TestClearPageLRU(page)) continue; - if (pagepgdat != pgdat) { - if (pgdat) - spin_unlock_irq(&pgdat->lru_lock); - pgdat = pagepgdat; - spin_lock_irq(&pgdat->lru_lock); + new_lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page)); + if (lruvec != new_lruvec) { + if (lruvec) + unlock_page_lruvec_irq(lruvec); + lruvec = lock_page_lruvec_irq(page); } - lruvec = mem_cgroup_page_lruvec(page, pgdat); if (page_evictable(page) && PageUnevictable(page)) { enum lru_list lru = page_lru_base_type(page); @@ -4313,10 +4308,10 @@ void check_move_unevictable_pages(struct pagevec *pvec) SetPageLRU(page); } - if (pgdat) { + if (lruvec) { __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued); __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned); - spin_unlock_irq(&pgdat->lru_lock); + unlock_page_lruvec_irq(lruvec); } else if (pgscanned) { count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned); }