
[v17,17/21] mm/lru: replace pgdat lru_lock with lruvec lock

Message ID 1595681998-19193-18-git-send-email-alex.shi@linux.alibaba.com (mailing list archive)
State New, archived
Series: per memcg lru lock

Commit Message

Alex Shi July 25, 2020, 12:59 p.m. UTC
This patch moves the per-node lru_lock into the lruvec, providing a
lru_lock for each memcg on each node. On a large machine each memcg no
longer has to contend on the per-node pgdat->lru_lock; it can proceed
quickly under its own lru_lock.

Since memcg charging now happens before pages are inserted on the LRU,
page isolation can serialize a page's memcg, so the per-memcg lruvec
lock is stable and can replace the per-node lru lock.

Following Daniel Jordan's suggestion, I ran 208 'dd' tasks in 104
containers on a 2-socket * 26-core * HT box with a modified case:
https://git.kernel.org/pub/scm/linux/kernel/git/wfg/vm-scalability.git/tree/case-lru-file-readtwice

With this and the later patches, readtwice performance improves by
about 80% with concurrent containers.

Also add a debug function to the locking code which may give some clues
if something goes wrong.

Hugh Dickins helped polish the patch, thanks!

Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Alex Shi <alex.shi@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Yang Shi <yang.shi@linux.alibaba.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Cc: Tejun Heo <tj@kernel.org>
Cc: linux-kernel@vger.kernel.org
Cc: linux-mm@kvack.org
Cc: cgroups@vger.kernel.org
---
 include/linux/memcontrol.h |  58 +++++++++++++++++++++++++
 include/linux/mmzone.h     |   2 +
 mm/compaction.c            |  67 ++++++++++++++++++-----------
 mm/huge_memory.c           |  11 ++---
 mm/memcontrol.c            |  63 ++++++++++++++++++++++++++-
 mm/mlock.c                 |  47 +++++++++++++-------
 mm/mmzone.c                |   1 +
 mm/swap.c                  | 104 +++++++++++++++++++++------------------------
 mm/vmscan.c                |  70 ++++++++++++++++--------------
 9 files changed, 288 insertions(+), 135 deletions(-)

Comments

Alexander Duyck July 27, 2020, 11:34 p.m. UTC | #1
On Sat, Jul 25, 2020 at 6:01 AM Alex Shi <alex.shi@linux.alibaba.com> wrote:
>
> This patch moves the per-node lru_lock into the lruvec, providing a
> lru_lock for each memcg on each node. On a large machine each memcg no
> longer has to contend on the per-node pgdat->lru_lock; it can proceed
> quickly under its own lru_lock.
>
> Since memcg charging now happens before pages are inserted on the LRU,
> page isolation can serialize a page's memcg, so the per-memcg lruvec
> lock is stable and can replace the per-node lru lock.
>
> Following Daniel Jordan's suggestion, I ran 208 'dd' tasks in 104
> containers on a 2-socket * 26-core * HT box with a modified case:
> https://git.kernel.org/pub/scm/linux/kernel/git/wfg/vm-scalability.git/tree/case-lru-file-readtwice
>
> With this and the later patches, readtwice performance improves by
> about 80% with concurrent containers.
>
> Also add a debug function to the locking code which may give some clues
> if something goes wrong.
>
> Hugh Dickins helped polish the patch, thanks!
>
> Reported-by: kernel test robot <lkp@intel.com>
> Signed-off-by: Alex Shi <alex.shi@linux.alibaba.com>
> Cc: Hugh Dickins <hughd@google.com>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Johannes Weiner <hannes@cmpxchg.org>
> Cc: Michal Hocko <mhocko@kernel.org>
> Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
> Cc: Yang Shi <yang.shi@linux.alibaba.com>
> Cc: Matthew Wilcox <willy@infradead.org>
> Cc: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
> Cc: Tejun Heo <tj@kernel.org>
> Cc: linux-kernel@vger.kernel.org
> Cc: linux-mm@kvack.org
> Cc: cgroups@vger.kernel.org
> ---
>  include/linux/memcontrol.h |  58 +++++++++++++++++++++++++
>  include/linux/mmzone.h     |   2 +
>  mm/compaction.c            |  67 ++++++++++++++++++-----------
>  mm/huge_memory.c           |  11 ++---
>  mm/memcontrol.c            |  63 ++++++++++++++++++++++++++-
>  mm/mlock.c                 |  47 +++++++++++++-------
>  mm/mmzone.c                |   1 +
>  mm/swap.c                  | 104 +++++++++++++++++++++------------------------
>  mm/vmscan.c                |  70 ++++++++++++++++--------------
>  9 files changed, 288 insertions(+), 135 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index e77197a62809..258901021c6c 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -411,6 +411,19 @@ static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg,
>
>  struct mem_cgroup *get_mem_cgroup_from_page(struct page *page);
>
> +struct lruvec *lock_page_lruvec(struct page *page);
> +struct lruvec *lock_page_lruvec_irq(struct page *page);
> +struct lruvec *lock_page_lruvec_irqsave(struct page *page,
> +                                               unsigned long *flags);
> +
> +#ifdef CONFIG_DEBUG_VM
> +void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page);
> +#else
> +static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
> +{
> +}
> +#endif
> +
>  static inline
>  struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){
>         return css ? container_of(css, struct mem_cgroup, css) : NULL;
> @@ -892,6 +905,31 @@ static inline void mem_cgroup_put(struct mem_cgroup *memcg)
>  {
>  }
>
> +static inline struct lruvec *lock_page_lruvec(struct page *page)
> +{
> +       struct pglist_data *pgdat = page_pgdat(page);
> +
> +       spin_lock(&pgdat->__lruvec.lru_lock);
> +       return &pgdat->__lruvec;
> +}
> +
> +static inline struct lruvec *lock_page_lruvec_irq(struct page *page)
> +{
> +       struct pglist_data *pgdat = page_pgdat(page);
> +
> +       spin_lock_irq(&pgdat->__lruvec.lru_lock);
> +       return &pgdat->__lruvec;
> +}
> +
> +static inline struct lruvec *lock_page_lruvec_irqsave(struct page *page,
> +               unsigned long *flagsp)
> +{
> +       struct pglist_data *pgdat = page_pgdat(page);
> +
> +       spin_lock_irqsave(&pgdat->__lruvec.lru_lock, *flagsp);
> +       return &pgdat->__lruvec;
> +}
> +
>  static inline struct mem_cgroup *
>  mem_cgroup_iter(struct mem_cgroup *root,
>                 struct mem_cgroup *prev,
> @@ -1126,6 +1164,10 @@ static inline void count_memcg_page_event(struct page *page,
>  void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx)
>  {
>  }
> +
> +static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
> +{
> +}
>  #endif /* CONFIG_MEMCG */
>
>  /* idx can be of type enum memcg_stat_item or node_stat_item */
> @@ -1255,6 +1297,22 @@ static inline struct lruvec *parent_lruvec(struct lruvec *lruvec)
>         return mem_cgroup_lruvec(memcg, lruvec_pgdat(lruvec));
>  }
>
> +static inline void unlock_page_lruvec(struct lruvec *lruvec)
> +{
> +       spin_unlock(&lruvec->lru_lock);
> +}
> +
> +static inline void unlock_page_lruvec_irq(struct lruvec *lruvec)
> +{
> +       spin_unlock_irq(&lruvec->lru_lock);
> +}
> +
> +static inline void unlock_page_lruvec_irqrestore(struct lruvec *lruvec,
> +               unsigned long flags)
> +{
> +       spin_unlock_irqrestore(&lruvec->lru_lock, flags);
> +}
> +
>  #ifdef CONFIG_CGROUP_WRITEBACK
>
>  struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb);
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 14c668b7e793..30b961a9a749 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -249,6 +249,8 @@ enum lruvec_flags {
>  };
>
>  struct lruvec {
> +       /* per lruvec lru_lock for memcg */
> +       spinlock_t                      lru_lock;
>         struct list_head                lists[NR_LRU_LISTS];
>         /*
>          * These track the cost of reclaiming one LRU - file or anon -
> diff --git a/mm/compaction.c b/mm/compaction.c
> index 2da2933fe56b..88bbd2e93895 100644
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -787,7 +787,7 @@ static bool too_many_isolated(pg_data_t *pgdat)
>         unsigned long nr_scanned = 0, nr_isolated = 0;
>         struct lruvec *lruvec;
>         unsigned long flags = 0;
> -       bool locked = false;
> +       struct lruvec *locked_lruvec = NULL;
>         struct page *page = NULL, *valid_page = NULL;
>         unsigned long start_pfn = low_pfn;
>         bool skip_on_failure = false;
> @@ -847,11 +847,21 @@ static bool too_many_isolated(pg_data_t *pgdat)
>                  * contention, to give chance to IRQs. Abort completely if
>                  * a fatal signal is pending.
>                  */
> -               if (!(low_pfn % SWAP_CLUSTER_MAX)
> -                   && compact_unlock_should_abort(&pgdat->lru_lock,
> -                                           flags, &locked, cc)) {
> -                       low_pfn = 0;
> -                       goto fatal_pending;
> +               if (!(low_pfn % SWAP_CLUSTER_MAX)) {
> +                       if (locked_lruvec) {
> +                               unlock_page_lruvec_irqrestore(locked_lruvec,
> +                                                                       flags);
> +                               locked_lruvec = NULL;
> +                       }
> +
> +                       if (fatal_signal_pending(current)) {
> +                               cc->contended = true;
> +
> +                               low_pfn = 0;
> +                               goto fatal_pending;
> +                       }
> +
> +                       cond_resched();
>                 }
>
>                 if (!pfn_valid_within(low_pfn))

I'm noticing this patch introduces a bunch of noise. What is the
reason for getting rid of compact_unlock_should_abort? It seems like
you just open coded it here. If there is some sort of issue with it
then it might be better to replace it as part of a preparatory patch
before you introduce this one as changes like this make it harder to
review.

It might make more sense to look at modifying
compact_unlock_should_abort and compact_lock_irqsave (which always
returns true so should probably be a void) to address the deficiencies
they have that make them unusable for you.

> @@ -922,10 +932,9 @@ static bool too_many_isolated(pg_data_t *pgdat)
>                          */
>                         if (unlikely(__PageMovable(page)) &&
>                                         !PageIsolated(page)) {
> -                               if (locked) {
> -                                       spin_unlock_irqrestore(&pgdat->lru_lock,
> -                                                                       flags);
> -                                       locked = false;
> +                               if (locked_lruvec) {
> +                                       unlock_page_lruvec_irqrestore(locked_lruvec, flags);
> +                                       locked_lruvec = NULL;
>                                 }
>
>                                 if (!isolate_movable_page(page, isolate_mode))
> @@ -966,10 +975,20 @@ static bool too_many_isolated(pg_data_t *pgdat)
>                 if (!TestClearPageLRU(page))
>                         goto isolate_fail_put;
>
> +               rcu_read_lock();
> +               lruvec = mem_cgroup_page_lruvec(page, pgdat);
> +
>                 /* If we already hold the lock, we can skip some rechecking */
> -               if (!locked) {
> -                       locked = compact_lock_irqsave(&pgdat->lru_lock,
> -                                                               &flags, cc);
> +               if (lruvec != locked_lruvec) {
> +                       if (locked_lruvec)
> +                               unlock_page_lruvec_irqrestore(locked_lruvec,
> +                                                                       flags);
> +
> +                       compact_lock_irqsave(&lruvec->lru_lock, &flags, cc);
> +                       locked_lruvec = lruvec;
> +                       rcu_read_unlock();
> +
> +                       lruvec_memcg_debug(lruvec, page);
>
>                         /* Try get exclusive access under lock */
>                         if (!skip_updated) {

So this bit makes things a bit complicated. From what I can tell
the comment about exclusive access under the lock is supposed to apply
to the pageblock via the lru_lock. However you are having to retest
the lock for each page because it is possible the page was moved to
another memory cgroup while the lru_lock was released correct? So in
this case is the lru vector lock really providing any protection for
the skip_updated portion of this code block if the lock isn't
exclusive to the pageblock? In theory this would probably make more
sense to have protected the skip bits under the zone lock, but I
imagine that was avoided due to the additional overhead.

> @@ -988,9 +1007,8 @@ static bool too_many_isolated(pg_data_t *pgdat)
>                                 SetPageLRU(page);
>                                 goto isolate_fail_put;
>                         }
> -               }
> -
> -               lruvec = mem_cgroup_page_lruvec(page, pgdat);
> +               } else
> +                       rcu_read_unlock();
>
>                 /* The whole page is taken off the LRU; skip the tail pages. */
>                 if (PageCompound(page))
> @@ -1023,9 +1041,9 @@ static bool too_many_isolated(pg_data_t *pgdat)
>
>  isolate_fail_put:
>                 /* Avoid potential deadlock in freeing page under lru_lock */
> -               if (locked) {
> -                       spin_unlock_irqrestore(&pgdat->lru_lock, flags);
> -                       locked = false;
> +               if (locked_lruvec) {
> +                       unlock_page_lruvec_irqrestore(locked_lruvec, flags);
> +                       locked_lruvec = NULL;
>                 }
>                 put_page(page);
>
> @@ -1039,9 +1057,10 @@ static bool too_many_isolated(pg_data_t *pgdat)
>                  * page anyway.
>                  */
>                 if (nr_isolated) {
> -                       if (locked) {
> -                               spin_unlock_irqrestore(&pgdat->lru_lock, flags);
> -                               locked = false;
> +                       if (locked_lruvec) {
> +                               unlock_page_lruvec_irqrestore(locked_lruvec,
> +                                                                       flags);
> +                               locked_lruvec = NULL;
>                         }
>                         putback_movable_pages(&cc->migratepages);
>                         cc->nr_migratepages = 0;
> @@ -1068,8 +1087,8 @@ static bool too_many_isolated(pg_data_t *pgdat)
>         page = NULL;
>
>  isolate_abort:
> -       if (locked)
> -               spin_unlock_irqrestore(&pgdat->lru_lock, flags);
> +       if (locked_lruvec)
> +               unlock_page_lruvec_irqrestore(locked_lruvec, flags);
>         if (page) {
>                 SetPageLRU(page);
>                 put_page(page);

<snip>

> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index f77748adc340..168c1659e430 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1774,15 +1774,13 @@ int isolate_lru_page(struct page *page)
>         WARN_RATELIMIT(PageTail(page), "trying to isolate tail page");
>
>         if (TestClearPageLRU(page)) {
> -               pg_data_t *pgdat = page_pgdat(page);
>                 struct lruvec *lruvec;
>                 int lru = page_lru(page);
>
>                 get_page(page);
> -               lruvec = mem_cgroup_page_lruvec(page, pgdat);
> -               spin_lock_irq(&pgdat->lru_lock);
> +               lruvec = lock_page_lruvec_irq(page);
>                 del_page_from_lru_list(page, lruvec, lru);
> -               spin_unlock_irq(&pgdat->lru_lock);
> +               unlock_page_lruvec_irq(lruvec);
>                 ret = 0;
>         }
>
> @@ -1849,20 +1847,22 @@ static int too_many_isolated(struct pglist_data *pgdat, int file,
>  static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
>                                                      struct list_head *list)
>  {
> -       struct pglist_data *pgdat = lruvec_pgdat(lruvec);
>         int nr_pages, nr_moved = 0;
>         LIST_HEAD(pages_to_free);
>         struct page *page;
> +       struct lruvec *orig_lruvec = lruvec;
>         enum lru_list lru;
>
>         while (!list_empty(list)) {
> +               struct lruvec *new_lruvec = NULL;
> +
>                 page = lru_to_page(list);
>                 VM_BUG_ON_PAGE(PageLRU(page), page);
>                 list_del(&page->lru);
>                 if (unlikely(!page_evictable(page))) {
> -                       spin_unlock_irq(&pgdat->lru_lock);
> +                       spin_unlock_irq(&lruvec->lru_lock);
>                         putback_lru_page(page);
> -                       spin_lock_irq(&pgdat->lru_lock);
> +                       spin_lock_irq(&lruvec->lru_lock);
>                         continue;
>                 }
>
> @@ -1876,6 +1876,12 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
>                  *                                        list_add(&page->lru,)
>                  *     list_add(&page->lru,) //corrupt
>                  */
> +               new_lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
> +               if (new_lruvec != lruvec) {
> +                       if (lruvec)
> +                               spin_unlock_irq(&lruvec->lru_lock);
> +                       lruvec = lock_page_lruvec_irq(page);
> +               }
>                 SetPageLRU(page);
>
>                 if (unlikely(put_page_testzero(page))) {

I was going through the code of the entire patch set and I noticed
these changes in move_pages_to_lru. What is the reason for adding the
new_lruvec logic? My understanding is that we are moving the pages to
the lruvec provided are we not? If so, why do we need to add code to get
a new lruvec? The code itself seems to stand out from the rest of the
patch as it is introducing new code instead of replacing existing
locking code, and it doesn't match up with the description of what
this function is supposed to do since it changes the lruvec.

> @@ -1883,16 +1889,15 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
>                         __ClearPageActive(page);
>
>                         if (unlikely(PageCompound(page))) {
> -                               spin_unlock_irq(&pgdat->lru_lock);
> +                               spin_unlock_irq(&lruvec->lru_lock);
>                                 destroy_compound_page(page);
> -                               spin_lock_irq(&pgdat->lru_lock);
> +                               spin_lock_irq(&lruvec->lru_lock);
>                         } else
>                                 list_add(&page->lru, &pages_to_free);
>
>                         continue;
>                 }
>
> -               lruvec = mem_cgroup_page_lruvec(page, pgdat);
>                 lru = page_lru(page);
>                 nr_pages = hpage_nr_pages(page);
>
> @@ -1902,6 +1907,11 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
>                 if (PageActive(page))
>                         workingset_age_nonresident(lruvec, nr_pages);
>         }
> +       if (orig_lruvec != lruvec) {
> +               if (lruvec)
> +                       spin_unlock_irq(&lruvec->lru_lock);
> +               spin_lock_irq(&orig_lruvec->lru_lock);
> +       }
>
>         /*
>          * To save our caller's stack, now use input list for pages to free.
Alex Shi July 28, 2020, 7:15 a.m. UTC | #2
On 2020/7/28 7:34 AM, Alexander Duyck wrote:

>> @@ -847,11 +847,21 @@ static bool too_many_isolated(pg_data_t *pgdat)
>>                  * contention, to give chance to IRQs. Abort completely if
>>                  * a fatal signal is pending.
>>                  */
>> -               if (!(low_pfn % SWAP_CLUSTER_MAX)
>> -                   && compact_unlock_should_abort(&pgdat->lru_lock,
>> -                                           flags, &locked, cc)) {
>> -                       low_pfn = 0;
>> -                       goto fatal_pending;
>> +               if (!(low_pfn % SWAP_CLUSTER_MAX)) {
>> +                       if (locked_lruvec) {
>> +                               unlock_page_lruvec_irqrestore(locked_lruvec,
>> +                                                                       flags);
>> +                               locked_lruvec = NULL;
>> +                       }
>> +
>> +                       if (fatal_signal_pending(current)) {
>> +                               cc->contended = true;
>> +
>> +                               low_pfn = 0;
>> +                               goto fatal_pending;
>> +                       }
>> +
>> +                       cond_resched();
>>                 }
>>
>>                 if (!pfn_valid_within(low_pfn))
> 
> I'm noticing this patch introduces a bunch of noise. What is the
> reason for getting rid of compact_unlock_should_abort? It seems like
> you just open coded it here. If there is some sort of issue with it
> then it might be better to replace it as part of a preparatory patch
> before you introduce this one as changes like this make it harder to
> review.

Thanks for the comments, Alex.

The func compact_unlock_should_abort should be removed since one of its
parameters changed from 'bool *locked' to 'struct lruvec *lruvec', so it no
longer applies. I had to open-code it here instead of keeping a function with
only one user.

> 
> It might make more sense to look at modifying
> compact_unlock_should_abort and compact_lock_irqsave (which always
> returns true so should probably be a void) to address the deficiencies
> they have that make them unusable for you.

I am wondering whether people would like a preparation patch that just
open-codes compact_unlock_should_abort and changes compact_lock_irqsave from
bool to void. Would that work for you?


>> @@ -966,10 +975,20 @@ static bool too_many_isolated(pg_data_t *pgdat)
>>                 if (!TestClearPageLRU(page))
>>                         goto isolate_fail_put;
>>
>> +               rcu_read_lock();
>> +               lruvec = mem_cgroup_page_lruvec(page, pgdat);
>> +
>>                 /* If we already hold the lock, we can skip some rechecking */
>> -               if (!locked) {
>> -                       locked = compact_lock_irqsave(&pgdat->lru_lock,
>> -                                                               &flags, cc);
>> +               if (lruvec != locked_lruvec) {
>> +                       if (locked_lruvec)
>> +                               unlock_page_lruvec_irqrestore(locked_lruvec,
>> +                                                                       flags);
>> +
>> +                       compact_lock_irqsave(&lruvec->lru_lock, &flags, cc);
>> +                       locked_lruvec = lruvec;
>> +                       rcu_read_unlock();
>> +
>> +                       lruvec_memcg_debug(lruvec, page);
>>
>>                         /* Try get exclusive access under lock */
>>                         if (!skip_updated) {
> 
> So this bit makes things a bit complicated. From what I can tell
> the comment about exclusive access under the lock is supposed to apply
> to the pageblock via the lru_lock. However you are having to retest
> the lock for each page because it is possible the page was moved to
> another memory cgroup while the lru_lock was released correct? So in

The pageblock is pfn-aligned, so the pages in it may not be on the same memcg
to begin with. And yes, a page may also change its memcg.

> this case is the lru vector lock really providing any protection for
> the skip_updated portion of this code block if the lock isn't
> exclusive to the pageblock? In theory this would probably make more
> sense to have protected the skip bits under the zone lock, but I
> imagine that was avoided due to the additional overhead.

When we change to lruvec->lru_lock, it does the same thing as pgdat->lru_lock;
we just may get here a bit more often, find out the pageblock is skippable,
and quit.
Yes, logically the pgdat lru_lock seems better, but since we are already
holding the lru_lock, it's fine not to bother with more locks.

> 
>> @@ -1876,6 +1876,12 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
>>                  *                                        list_add(&page->lru,)
>>                  *     list_add(&page->lru,) //corrupt
>>                  */
>> +               new_lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
>> +               if (new_lruvec != lruvec) {
>> +                       if (lruvec)
>> +                               spin_unlock_irq(&lruvec->lru_lock);
>> +                       lruvec = lock_page_lruvec_irq(page);
>> +               }
>>                 SetPageLRU(page);
>>
>>                 if (unlikely(put_page_testzero(page))) {
> 
> I was going through the code of the entire patch set and I noticed
> these changes in move_pages_to_lru. What is the reason for adding the
> new_lruvec logic? My understanding is that we are moving the pages to
> the lruvec provided are we not? If so, why do we need to add code to get
> a new lruvec? The code itself seems to stand out from the rest of the
> patch as it is introducing new code instead of replacing existing
> locking code, and it doesn't match up with the description of what
> this function is supposed to do since it changes the lruvec.

The code is here because some bugs happened without it. I will check it again anyway.

Thanks!
Alex Shi July 28, 2020, 11:19 a.m. UTC | #3
On 2020/7/28 7:34 AM, Alexander Duyck wrote:
>> @@ -1876,6 +1876,12 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
>>                  *                                        list_add(&page->lru,)
>>                  *     list_add(&page->lru,) //corrupt
>>                  */
>> +               new_lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
>> +               if (new_lruvec != lruvec) {
>> +                       if (lruvec)
>> +                               spin_unlock_irq(&lruvec->lru_lock);
>> +                       lruvec = lock_page_lruvec_irq(page);
>> +               }
>>                 SetPageLRU(page);
>>
>>                 if (unlikely(put_page_testzero(page))) {
> I was going through the code of the entire patch set and I noticed
> these changes in move_pages_to_lru. What is the reason for adding the
> new_lruvec logic? My understanding is that we are moving the pages to
> the lruvec provided are we not? If so, why do we need to add code to get
> a new lruvec? The code itself seems to stand out from the rest of the
> patch as it is introducing new code instead of replacing existing
> locking code, and it doesn't match up with the description of what
> this function is supposed to do since it changes the lruvec.

This new_lruvec is the replacement for the removed line, shown below:
>> -               lruvec = mem_cgroup_page_lruvec(page, pgdat);
The recheck handles pages that have moved to the root memcg; otherwise it causes this bug:

[ 2081.240795] BUG: kernel NULL pointer dereference, address: 0000000000000000
[ 2081.248125] #PF: supervisor read access in kernel mode
[ 2081.253627] #PF: error_code(0x0000) - not-present page
[ 2081.259124] PGD 8000000044cb0067 P4D 8000000044cb0067 PUD 95c9067 PMD 0
[ 2081.266193] Oops: 0000 [#1] PREEMPT SMP PTI
[ 2081.270740] CPU: 5 PID: 131 Comm: kswapd0 Kdump: loaded Tainted: G        W         5.8.0-rc6-00025-gc708f8a0db47 #45
[ 2081.281960] Hardware name: Alibaba X-Dragon CN 01/20G4B, BIOS 1ALSP016 05/21/2018
[ 2081.290054] RIP: 0010:do_raw_spin_trylock+0x5/0x40
[ 2081.295209] Code: 76 82 48 89 df e8 bb fe ff ff eb 8c 89 c6 48 89 df e8 4f dd ff ff 66 90 eb 8b 90 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 <8b> 07 85 c0 75 28 ba 01 00 00 00 f0 0f b1 17 75 1d 65 8b 05 03 6a
[ 2081.314832] RSP: 0018:ffffc900002ebac8 EFLAGS: 00010082
[ 2081.320410] RAX: 0000000000000000 RBX: 0000000000000018 RCX: 0000000000000000
[ 2081.327907] RDX: ffff888035833480 RSI: 0000000000000000 RDI: 0000000000000000
[ 2081.335407] RBP: 0000000000000000 R08: 0000000000000001 R09: 0000000000000001
[ 2081.342907] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000001
[ 2081.350405] R13: dead000000000100 R14: 0000000000000000 R15: ffffc900002ebbb0
[ 2081.357908] FS:  0000000000000000(0000) GS:ffff88807a200000(0000) knlGS:0000000000000000
[ 2081.366619] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 2081.372717] CR2: 0000000000000000 CR3: 0000000031228005 CR4: 00000000003606e0
[ 2081.380215] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 2081.387713] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 2081.395198] Call Trace:
[ 2081.398008]  _raw_spin_lock_irq+0x47/0x80
[ 2081.402387]  ? move_pages_to_lru+0x566/0xb80
[ 2081.407028]  move_pages_to_lru+0x566/0xb80
[ 2081.411495]  shrink_active_list+0x355/0xa70
[ 2081.416054]  shrink_lruvec+0x4f7/0x810
[ 2081.420176]  ? mem_cgroup_iter+0xb6/0x410
[ 2081.424558]  shrink_node+0x1cc/0x8d0
[ 2081.428510]  balance_pgdat+0x3cf/0x760
[ 2081.432634]  kswapd+0x232/0x660
[ 2081.436147]  ? finish_wait+0x80/0x80
[ 2081.440093]  ? balance_pgdat+0x760/0x760
[ 2081.444382]  kthread+0x17e/0x1b0
[ 2081.447975]  ? kthread_park+0xc0/0xc0
[ 2081.452005]  ret_from_fork+0x22/0x30

Thanks!
Alex
> 
>> @@ -1883,16 +1889,15 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
>>                         __ClearPageActive(page);
>>
>>                         if (unlikely(PageCompound(page))) {
>> -                               spin_unlock_irq(&pgdat->lru_lock);
>> +                               spin_unlock_irq(&lruvec->lru_lock);
>>                                 destroy_compound_page(page);
>> -                               spin_lock_irq(&pgdat->lru_lock);
>> +                               spin_lock_irq(&lruvec->lru_lock);
>>                         } else
>>                                 list_add(&page->lru, &pages_to_free);
>>
>>                         continue;
>>                 }
>>
>> -               lruvec = mem_cgroup_page_lruvec(page, pgdat);
>>                 lru = page_lru(page);
>>                 nr_pages = hpage_nr_pages(page);
Alexander Duyck July 28, 2020, 2:54 p.m. UTC | #4
On Tue, Jul 28, 2020 at 4:20 AM Alex Shi <alex.shi@linux.alibaba.com> wrote:
>
>
>
> On 2020/7/28 7:34 AM, Alexander Duyck wrote:
> >> @@ -1876,6 +1876,12 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
> >>                  *                                        list_add(&page->lru,)
> >>                  *     list_add(&page->lru,) //corrupt
> >>                  */
> >> +               new_lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
> >> +               if (new_lruvec != lruvec) {
> >> +                       if (lruvec)
> >> +                               spin_unlock_irq(&lruvec->lru_lock);
> >> +                       lruvec = lock_page_lruvec_irq(page);
> >> +               }
> >>                 SetPageLRU(page);
> >>
> >>                 if (unlikely(put_page_testzero(page))) {
> > I was going through the code of the entire patch set and I noticed
> > these changes in move_pages_to_lru. What is the reason for adding the
> > new_lruvec logic? My understanding is that we are moving the pages to
> > the lruvec provided are we not? If so, why do we need to add code to get
> > a new lruvec? The code itself seems to stand out from the rest of the
> > patch as it is introducing new code instead of replacing existing
> > locking code, and it doesn't match up with the description of what
> > this function is supposed to do since it changes the lruvec.
>
> This new_lruvec is the replacement for the removed line, shown below:
> >> -               lruvec = mem_cgroup_page_lruvec(page, pgdat);
> The recheck handles pages that have moved to the root memcg; otherwise it causes this bug:

Okay, now I see where the issue is. You moved this code so now it has
a different effect than it did before. You are relocking things before
you needed to. Don't forget that when you came into this function you
already had the lock. In addition the patch is broken as it currently
stands as you aren't using similar logic in the code just above this
addition if you encounter an evictable page. As a result this is
really difficult to review as there are subtle bugs here.

I suppose the correct fix is to get rid of this line, but  it should
be placed everywhere the original function was calling
spin_lock_irq().

In addition I would consider changing the arguments/documentation for
move_pages_to_lru. You aren't moving the pages to lruvec, so there is
probably no need to pass that as an argument. Instead I would pass
pgdat since that isn't going to be moving and is the only thing you
actually derive based on the original lruvec.
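
Roughly what I have in mind, as a sketch only (the prototype below is just
an illustration, not something I have tested):

        /*
         * Sketch: the caller passes the node; the function takes and drops
         * the per-memcg lruvec lock itself while walking the list, so no
         * lruvec argument is needed.
         */
        static unsigned noinline_for_stack move_pages_to_lru(struct pglist_data *pgdat,
                                                             struct list_head *list);

That keeps the argument list limited to things that cannot change while the
list is being processed.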
Alex Shi July 28, 2020, 3:39 p.m. UTC | #5
On 2020/7/28 7:34 AM, Alexander Duyck wrote:
> It might make more sense to look at modifying
> compact_unlock_should_abort and compact_lock_irqsave (which always
> returns true so should probably be a void) to address the deficiencies
> they have that make them unusable for you.

One possible way to reuse the func compact_unlock_should_abort could be like
the following, with the locked parameter reused differently in the two places.
But there seems to be no usage of this style in the kernel, is there?

Thanks
Alex

From 41d5ce6562f20f74bc6ac2db83e226ac28d56e90 Mon Sep 17 00:00:00 2001
From: Alex Shi <alex.shi@linux.alibaba.com>
Date: Tue, 28 Jul 2020 21:19:32 +0800
Subject: [PATCH] compaction polishing

Signed-off-by: Alex Shi <alex.shi@linux.alibaba.com>
---
 mm/compaction.c | 71 ++++++++++++++++++++++++---------------------------------
 1 file changed, 30 insertions(+), 41 deletions(-)

diff --git a/mm/compaction.c b/mm/compaction.c
index c28a43481f01..36fce988de3e 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -479,20 +479,20 @@ static bool test_and_set_skip(struct compact_control *cc, struct page *page,
  *
  * Always returns true which makes it easier to track lock state in callers.
  */
-static bool compact_lock_irqsave(spinlock_t *lock, unsigned long *flags,
+static void compact_lock_irqsave(spinlock_t *lock, unsigned long *flags,
 						struct compact_control *cc)
 	__acquires(lock)
 {
 	/* Track if the lock is contended in async mode */
 	if (cc->mode == MIGRATE_ASYNC && !cc->contended) {
 		if (spin_trylock_irqsave(lock, *flags))
-			return true;
+			return;
 
 		cc->contended = true;
 	}
 
 	spin_lock_irqsave(lock, *flags);
-	return true;
+	return;
 }
 
 /*
@@ -511,11 +511,11 @@ static bool compact_lock_irqsave(spinlock_t *lock, unsigned long *flags,
  *		scheduled)
  */
 static bool compact_unlock_should_abort(spinlock_t *lock,
-		unsigned long flags, bool *locked, struct compact_control *cc)
+		unsigned long flags, void **locked, struct compact_control *cc)
 {
 	if (*locked) {
 		spin_unlock_irqrestore(lock, flags);
-		*locked = false;
+		*locked = NULL;
 	}
 
 	if (fatal_signal_pending(current)) {
@@ -543,7 +543,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
 	int nr_scanned = 0, total_isolated = 0;
 	struct page *cursor;
 	unsigned long flags = 0;
-	bool locked = false;
+	struct compact_control *locked = NULL;
 	unsigned long blockpfn = *start_pfn;
 	unsigned int order;
 
@@ -565,7 +565,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
 		 */
 		if (!(blockpfn % SWAP_CLUSTER_MAX)
 		    && compact_unlock_should_abort(&cc->zone->lock, flags,
-								&locked, cc))
+							(void**)&locked, cc))
 			break;
 
 		nr_scanned++;
@@ -599,8 +599,8 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
 		 * recheck as well.
 		 */
 		if (!locked) {
-			locked = compact_lock_irqsave(&cc->zone->lock,
-								&flags, cc);
+			compact_lock_irqsave(&cc->zone->lock, &flags, cc);
+			locked = cc;
 
 			/* Recheck this is a buddy page under lock */
 			if (!PageBuddy(page))
@@ -787,7 +787,7 @@ static bool too_many_isolated(pg_data_t *pgdat)
 	unsigned long nr_scanned = 0, nr_isolated = 0;
 	struct lruvec *lruvec;
 	unsigned long flags = 0;
-	struct lruvec *locked_lruvec = NULL;
+	struct lruvec *locked = NULL;
 	struct page *page = NULL, *valid_page = NULL;
 	unsigned long start_pfn = low_pfn;
 	bool skip_on_failure = false;
@@ -847,21 +847,11 @@ static bool too_many_isolated(pg_data_t *pgdat)
 		 * contention, to give chance to IRQs. Abort completely if
 		 * a fatal signal is pending.
 		 */
-		if (!(low_pfn % SWAP_CLUSTER_MAX)) {
-			if (locked_lruvec) {
-				unlock_page_lruvec_irqrestore(locked_lruvec,
-									flags);
-				locked_lruvec = NULL;
-			}
-
-			if (fatal_signal_pending(current)) {
-				cc->contended = true;
-
-				low_pfn = 0;
-				goto fatal_pending;
-			}
-
-			cond_resched();
+		if (!(low_pfn % SWAP_CLUSTER_MAX)
+		    && compact_unlock_should_abort(&locked->lru_lock, flags,
+						(void**)&locked, cc)) {
+			low_pfn = 0;
+			goto fatal_pending;
 		}
 
 		if (!pfn_valid_within(low_pfn))
@@ -932,9 +922,9 @@ static bool too_many_isolated(pg_data_t *pgdat)
 			 */
 			if (unlikely(__PageMovable(page)) &&
 					!PageIsolated(page)) {
-				if (locked_lruvec) {
-					unlock_page_lruvec_irqrestore(locked_lruvec, flags);
-					locked_lruvec = NULL;
+				if (locked) {
+					unlock_page_lruvec_irqrestore(locked, flags);
+					locked = NULL;
 				}
 
 				if (!isolate_movable_page(page, isolate_mode))
@@ -979,13 +969,13 @@ static bool too_many_isolated(pg_data_t *pgdat)
 		lruvec = mem_cgroup_page_lruvec(page, pgdat);
 
 		/* If we already hold the lock, we can skip some rechecking */
-		if (lruvec != locked_lruvec) {
-			if (locked_lruvec)
-				unlock_page_lruvec_irqrestore(locked_lruvec,
+		if (lruvec != locked) {
+			if (locked)
+				unlock_page_lruvec_irqrestore(locked,
 									flags);
 
 			compact_lock_irqsave(&lruvec->lru_lock, &flags, cc);
-			locked_lruvec = lruvec;
+			locked = lruvec;
 			rcu_read_unlock();
 
 			lruvec_memcg_debug(lruvec, page);
@@ -1041,9 +1031,9 @@ static bool too_many_isolated(pg_data_t *pgdat)
 
 isolate_fail_put:
 		/* Avoid potential deadlock in freeing page under lru_lock */
-		if (locked_lruvec) {
-			unlock_page_lruvec_irqrestore(locked_lruvec, flags);
-			locked_lruvec = NULL;
+		if (locked) {
+			unlock_page_lruvec_irqrestore(locked, flags);
+			locked = NULL;
 		}
 		put_page(page);
 
@@ -1057,10 +1047,9 @@ static bool too_many_isolated(pg_data_t *pgdat)
 		 * page anyway.
 		 */
 		if (nr_isolated) {
-			if (locked_lruvec) {
-				unlock_page_lruvec_irqrestore(locked_lruvec,
-									flags);
-				locked_lruvec = NULL;
+			if (locked) {
+				unlock_page_lruvec_irqrestore(locked, flags);
+				locked = NULL;
 			}
 			putback_movable_pages(&cc->migratepages);
 			cc->nr_migratepages = 0;
@@ -1087,8 +1076,8 @@ static bool too_many_isolated(pg_data_t *pgdat)
 	page = NULL;
 
 isolate_abort:
-	if (locked_lruvec)
-		unlock_page_lruvec_irqrestore(locked_lruvec, flags);
+	if (locked)
+		unlock_page_lruvec_irqrestore(locked, flags);
 	if (page) {
 		SetPageLRU(page);
 		put_page(page);
Alexander Duyck July 28, 2020, 3:55 p.m. UTC | #6
On Tue, Jul 28, 2020 at 8:40 AM Alex Shi <alex.shi@linux.alibaba.com> wrote:
>
>
>
> On 2020/7/28 7:34 AM, Alexander Duyck wrote:
> > It might make more sense to look at modifying
> > compact_unlock_should_abort and compact_lock_irqsave (which always
> > returns true so should probably be a void) to address the deficiencies
> > they have that make them unusable for you.
>
> One possible way to reuse the func compact_unlock_should_abort could be like
> the following, with the locked parameter reused differently in the two places.
> But there seems to be no usage of this style in the kernel, is there?
>
> Thanks
> Alex
>
> From 41d5ce6562f20f74bc6ac2db83e226ac28d56e90 Mon Sep 17 00:00:00 2001
> From: Alex Shi <alex.shi@linux.alibaba.com>
> Date: Tue, 28 Jul 2020 21:19:32 +0800
> Subject: [PATCH] compaction polishing
>
> Signed-off-by: Alex Shi <alex.shi@linux.alibaba.com>
> ---
>  mm/compaction.c | 71 ++++++++++++++++++++++++---------------------------------
>  1 file changed, 30 insertions(+), 41 deletions(-)
>
> diff --git a/mm/compaction.c b/mm/compaction.c
> index c28a43481f01..36fce988de3e 100644
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -479,20 +479,20 @@ static bool test_and_set_skip(struct compact_control *cc, struct page *page,
>   *
>   * Always returns true which makes it easier to track lock state in callers.
>   */
> -static bool compact_lock_irqsave(spinlock_t *lock, unsigned long *flags,
> +static void compact_lock_irqsave(spinlock_t *lock, unsigned long *flags,
>                                                 struct compact_control *cc)
>         __acquires(lock)
>  {
>         /* Track if the lock is contended in async mode */
>         if (cc->mode == MIGRATE_ASYNC && !cc->contended) {
>                 if (spin_trylock_irqsave(lock, *flags))
> -                       return true;
> +                       return;
>
>                 cc->contended = true;
>         }
>
>         spin_lock_irqsave(lock, *flags);
> -       return true;
> +       return;
>  }
>
>  /*
> @@ -511,11 +511,11 @@ static bool compact_lock_irqsave(spinlock_t *lock, unsigned long *flags,
>   *             scheduled)
>   */
>  static bool compact_unlock_should_abort(spinlock_t *lock,
> -               unsigned long flags, bool *locked, struct compact_control *cc)
> +               unsigned long flags, void **locked, struct compact_control *cc)

Instead of passing both a void pointer and the lock why not just pass
the pointer to the lock pointer? You could combine lock and locked
into a single argument and save yourself some extra effort.
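
For example, something along these lines (just an untested sketch that
reuses the existing body, with lock and locked folded into one argument):

static bool compact_unlock_should_abort(spinlock_t **locked,
                unsigned long flags, struct compact_control *cc)
{
        /* *locked is either NULL or the lock we are currently holding */
        if (*locked) {
                spin_unlock_irqrestore(*locked, flags);
                *locked = NULL;
        }

        if (fatal_signal_pending(current)) {
                cc->contended = true;
                return true;
        }

        cond_resched();
        return false;
}

Callers then only track a single "spinlock_t *locked" and pass &locked.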

>  {
>         if (*locked) {
>                 spin_unlock_irqrestore(lock, flags);
> -               *locked = false;
> +               *locked = NULL;
>         }
>
>         if (fatal_signal_pending(current)) {
> @@ -543,7 +543,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
>         int nr_scanned = 0, total_isolated = 0;
>         struct page *cursor;
>         unsigned long flags = 0;
> -       bool locked = false;
> +       struct compact_control *locked = NULL;
>         unsigned long blockpfn = *start_pfn;
>         unsigned int order;
>
> @@ -565,7 +565,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
>                  */
>                 if (!(blockpfn % SWAP_CLUSTER_MAX)
>                     && compact_unlock_should_abort(&cc->zone->lock, flags,
> -                                                               &locked, cc))
> +                                                       (void**)&locked, cc))
>                         break;
>
>                 nr_scanned++;
> @@ -599,8 +599,8 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
>                  * recheck as well.
>                  */
>                 if (!locked) {
> -                       locked = compact_lock_irqsave(&cc->zone->lock,
> -                                                               &flags, cc);
> +                       compact_lock_irqsave(&cc->zone->lock, &flags, cc);
> +                       locked = cc;
>
>                         /* Recheck this is a buddy page under lock */
>                         if (!PageBuddy(page))

If you have to provide a pointer you might as well just provide a
pointer to the zone lock since that is the thing that is actually
holding the lock at this point and would be consistent with your other
uses of the locked value. One possibility would be to change the
return type so that you return a pointer to the lock you are using.
Then the code would look closer to the lruvec code you are already
using.
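
As a rough, untested sketch of that idea:

static spinlock_t *compact_lock_irqsave(spinlock_t *lock, unsigned long *flags,
                                        struct compact_control *cc)
        __acquires(lock)
{
        /* Track if the lock is contended in async mode */
        if (cc->mode == MIGRATE_ASYNC && !cc->contended) {
                if (spin_trylock_irqsave(lock, *flags))
                        return lock;

                cc->contended = true;
        }

        spin_lock_irqsave(lock, *flags);
        return lock;
}

so the call site in isolate_freepages_block() becomes:

        if (!locked)
                locked = compact_lock_irqsave(&cc->zone->lock, &flags, cc);

with "locked" declared as "spinlock_t *locked = NULL;".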

> @@ -787,7 +787,7 @@ static bool too_many_isolated(pg_data_t *pgdat)
>         unsigned long nr_scanned = 0, nr_isolated = 0;
>         struct lruvec *lruvec;
>         unsigned long flags = 0;
> -       struct lruvec *locked_lruvec = NULL;
> +       struct lruvec *locked = NULL;
>         struct page *page = NULL, *valid_page = NULL;
>         unsigned long start_pfn = low_pfn;
>         bool skip_on_failure = false;
> @@ -847,21 +847,11 @@ static bool too_many_isolated(pg_data_t *pgdat)
>                  * contention, to give chance to IRQs. Abort completely if
>                  * a fatal signal is pending.
>                  */
> -               if (!(low_pfn % SWAP_CLUSTER_MAX)) {
> -                       if (locked_lruvec) {
> -                               unlock_page_lruvec_irqrestore(locked_lruvec,
> -                                                                       flags);
> -                               locked_lruvec = NULL;
> -                       }
> -
> -                       if (fatal_signal_pending(current)) {
> -                               cc->contended = true;
> -
> -                               low_pfn = 0;
> -                               goto fatal_pending;
> -                       }
> -
> -                       cond_resched();
> +               if (!(low_pfn % SWAP_CLUSTER_MAX)
> +                   && compact_unlock_should_abort(&locked->lru_lock, flags,
> +                                               (void**)&locked, cc)) {

An added advantage to making locked a pointer to a spinlock is that
you could reduce the number of pointers you have to pass. Instead of
messing with &locked->lru_lock you would just pass the pointer to
locked resulting in fewer arguments being passed and if it is NULL you
skip the whole unlock pass.
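
So the periodic unlock check would just look something like this (sketch,
not tested, assuming the combined-argument helper sketched above):

                if (!(low_pfn % SWAP_CLUSTER_MAX)
                    && compact_unlock_should_abort(&locked, flags, cc)) {
                        low_pfn = 0;
                        goto fatal_pending;
                }

with "spinlock_t *locked = NULL;" at the top of the function and
"locked = compact_lock_irqsave(&lruvec->lru_lock, &flags, cc);" where the
lruvec lock is taken.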

> +                       low_pfn = 0;
> +                       goto fatal_pending;
>                 }
>
>                 if (!pfn_valid_within(low_pfn))
> @@ -932,9 +922,9 @@ static bool too_many_isolated(pg_data_t *pgdat)
>                          */
>                         if (unlikely(__PageMovable(page)) &&
>                                         !PageIsolated(page)) {
> -                               if (locked_lruvec) {
> -                                       unlock_page_lruvec_irqrestore(locked_lruvec, flags);
> -                                       locked_lruvec = NULL;
> +                               if (locked) {
> +                                       unlock_page_lruvec_irqrestore(locked, flags);
> +                                       locked = NULL;
>                                 }
>
>                                 if (!isolate_movable_page(page, isolate_mode))
> @@ -979,13 +969,13 @@ static bool too_many_isolated(pg_data_t *pgdat)
>                 lruvec = mem_cgroup_page_lruvec(page, pgdat);
>
>                 /* If we already hold the lock, we can skip some rechecking */
> -               if (lruvec != locked_lruvec) {
> -                       if (locked_lruvec)
> -                               unlock_page_lruvec_irqrestore(locked_lruvec,
> +               if (lruvec != locked) {
> +                       if (locked)
> +                               unlock_page_lruvec_irqrestore(locked,
>                                                                         flags);
>
>                         compact_lock_irqsave(&lruvec->lru_lock, &flags, cc);
> -                       locked_lruvec = lruvec;
> +                       locked = lruvec;
>                         rcu_read_unlock();
>
>                         lruvec_memcg_debug(lruvec, page);
> @@ -1041,9 +1031,9 @@ static bool too_many_isolated(pg_data_t *pgdat)
>
>  isolate_fail_put:
>                 /* Avoid potential deadlock in freeing page under lru_lock */
> -               if (locked_lruvec) {
> -                       unlock_page_lruvec_irqrestore(locked_lruvec, flags);
> -                       locked_lruvec = NULL;
> +               if (locked) {
> +                       unlock_page_lruvec_irqrestore(locked, flags);
> +                       locked = NULL;
>                 }
>                 put_page(page);
>
> @@ -1057,10 +1047,9 @@ static bool too_many_isolated(pg_data_t *pgdat)
>                  * page anyway.
>                  */
>                 if (nr_isolated) {
> -                       if (locked_lruvec) {
> -                               unlock_page_lruvec_irqrestore(locked_lruvec,
> -                                                                       flags);
> -                               locked_lruvec = NULL;
> +                       if (locked) {
> +                               unlock_page_lruvec_irqrestore(locked, flags);
> +                               locked = NULL;
>                         }
>                         putback_movable_pages(&cc->migratepages);
>                         cc->nr_migratepages = 0;
> @@ -1087,8 +1076,8 @@ static bool too_many_isolated(pg_data_t *pgdat)
>         page = NULL;
>
>  isolate_abort:
> -       if (locked_lruvec)
> -               unlock_page_lruvec_irqrestore(locked_lruvec, flags);
> +       if (locked)
> +               unlock_page_lruvec_irqrestore(locked, flags);
>         if (page) {
>                 SetPageLRU(page);
>                 put_page(page);
> --
> 1.8.3.1
>
Alex Shi July 29, 2020, 12:48 a.m. UTC | #7
On 2020/7/28 11:55 PM, Alexander Duyck wrote:
>>  /*
>> @@ -511,11 +511,11 @@ static bool compact_lock_irqsave(spinlock_t *lock, unsigned long *flags,
>>   *             scheduled)
>>   */
>>  static bool compact_unlock_should_abort(spinlock_t *lock,
>> -               unsigned long flags, bool *locked, struct compact_control *cc)
>> +               unsigned long flags, void **locked, struct compact_control *cc)
> Instead of passing both a void pointer and the lock why not just pass
> the pointer to the lock pointer? You could combine lock and locked
> into a single argument and save yourself some extra effort.
> 

The passed lock pointer could be rewritten in the func, which is unacceptable
if it is a lock that could be used somewhere else.

And it is already dangerous to NULL a local pointer. In fact, I prefer the
original version: not as smart, but robust enough for future changes, right?

Thanks
Alex


>>  {
>>         if (*locked) {
>>                 spin_unlock_irqrestore(lock, flags);
>> -               *locked = false;
>> +               *locked = NULL;
>>         }
>>
>>         if (fatal_signal_pending(current)) {
Alex Shi July 29, 2020, 1 a.m. UTC | #8
On 2020/7/28 10:54 PM, Alexander Duyck wrote:
> On Tue, Jul 28, 2020 at 4:20 AM Alex Shi <alex.shi@linux.alibaba.com> wrote:
>>
>>
>>
>> On 2020/7/28 7:34 AM, Alexander Duyck wrote:
>>>> @@ -1876,6 +1876,12 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
>>>>                  *                                        list_add(&page->lru,)
>>>>                  *     list_add(&page->lru,) //corrupt
>>>>                  */
>>>> +               new_lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
>>>> +               if (new_lruvec != lruvec) {
>>>> +                       if (lruvec)
>>>> +                               spin_unlock_irq(&lruvec->lru_lock);
>>>> +                       lruvec = lock_page_lruvec_irq(page);
>>>> +               }
>>>>                 SetPageLRU(page);
>>>>
>>>>                 if (unlikely(put_page_testzero(page))) {
>>> I was going through the code of the entire patch set and I noticed
>>> these changes in move_pages_to_lru. What is the reason for adding the
>>> new_lruvec logic? My understanding is that we are moving the pages to
>>> the lruvec provided are we not? If so, why do we need to add code to get
>>> a new lruvec? The code itself seems to stand out from the rest of the
>>> patch as it is introducing new code instead of replacing existing
>>> locking code, and it doesn't match up with the description of what
>>> this function is supposed to do since it changes the lruvec.
>>
>> This new_lruvec is the replacement for the removed line, shown below:
>>>> -               lruvec = mem_cgroup_page_lruvec(page, pgdat);
>> The recheck handles pages that have moved to the root memcg; otherwise it causes this bug:
> 
> Okay, now I see where the issue is. You moved this code so now it has
> a different effect than it did before. You are relocking things before
> you needed to. Don't forget that when you came into this function you
> already had the lock. In addition the patch is broken as it currently
> stands as you aren't using similar logic in the code just above this
> addition if you encounter an evictable page. As a result this is
> really difficult to review as there are subtle bugs here.

Why do you think it's a bug? The relock only happens if the locked lruvec is
different, and we unlock the old one.

> 
> I suppose the correct fix is to get rid of this line, but  it should
> be placed everywhere the original function was calling
> spin_lock_irq().
> 
> In addition I would consider changing the arguments/documentation for
> move_pages_to_lru. You aren't moving the pages to lruvec, so there is
> probably no need to pass that as an argument. Instead I would pass
> pgdat since that isn't going to be moving and is the only thing you
> actually derive based on the original lruvec.

Yes, the comments should be changed; that line was introduced long ago. :)
Anyway, I am wondering whether it is worth resending as a v18 version?

>
Alexander Duyck July 29, 2020, 1:27 a.m. UTC | #9
On Tue, Jul 28, 2020 at 6:00 PM Alex Shi <alex.shi@linux.alibaba.com> wrote:
>
>
>
> On 2020/7/28 10:54 PM, Alexander Duyck wrote:
> > On Tue, Jul 28, 2020 at 4:20 AM Alex Shi <alex.shi@linux.alibaba.com> wrote:
> >>
> >>
> >>
> >> On 2020/7/28 7:34 AM, Alexander Duyck wrote:
> >>>> @@ -1876,6 +1876,12 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
> >>>>                  *                                        list_add(&page->lru,)
> >>>>                  *     list_add(&page->lru,) //corrupt
> >>>>                  */
> >>>> +               new_lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
> >>>> +               if (new_lruvec != lruvec) {
> >>>> +                       if (lruvec)
> >>>> +                               spin_unlock_irq(&lruvec->lru_lock);
> >>>> +                       lruvec = lock_page_lruvec_irq(page);
> >>>> +               }
> >>>>                 SetPageLRU(page);
> >>>>
> >>>>                 if (unlikely(put_page_testzero(page))) {
> >>> I was going through the code of the entire patch set and I noticed
> >>> these changes in move_pages_to_lru. What is the reason for adding the
> >>> new_lruvec logic? My understanding is that we are moving the pages to
> >>> the lruvec provided are we not? If so, why do we need to add code to get
> >>> a new lruvec? The code itself seems to stand out from the rest of the
> >>> patch as it is introducing new code instead of replacing existing
> >>> locking code, and it doesn't match up with the description of what
> >>> this function is supposed to do since it changes the lruvec.
> >>
> >> This new_lruvec is the replacement for the removed line, shown below:
> >>>> -               lruvec = mem_cgroup_page_lruvec(page, pgdat);
> >> The recheck handles pages that have moved to the root memcg; otherwise it causes this bug:
> >
> > Okay, now I see where the issue is. You moved this code so now it has
> > a different effect than it did before. You are relocking things before
> > you needed to. Don't forget that when you came into this function you
> > already had the lock. In addition the patch is broken as it currently
> > stands as you aren't using similar logic in the code just above this
> > addition if you encounter an evictable page. As a result this is
> > really difficult to review as there are subtle bugs here.
>
> Why do you think it's a bug? The relock only happens if the locked lruvec is
> different, and we unlock the old one.

The section I am talking about with the bug is this section here:
       while (!list_empty(list)) {
+               struct lruvec *new_lruvec = NULL;
+
                page = lru_to_page(list);
                VM_BUG_ON_PAGE(PageLRU(page), page);
                list_del(&page->lru);
                if (unlikely(!page_evictable(page))) {
-                       spin_unlock_irq(&pgdat->lru_lock);
+                       spin_unlock_irq(&lruvec->lru_lock);
                        putback_lru_page(page);
-                       spin_lock_irq(&pgdat->lru_lock);
+                       spin_lock_irq(&lruvec->lru_lock);
                        continue;
                }

Basically it probably is not advisable to be retaking the
lruvec->lru_lock directly as the lruvec may have changed so it
wouldn't be correct for the next page. It would make more sense to be
using your API and calling unlock_page_lruvec_irq and
lock_page_lruvec_irq instead of using the lock directly.
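
Something along these lines, for example (untested sketch; it leans on the
new_lruvec recheck later in the loop to take the correct lock for the next
page):

                if (unlikely(!page_evictable(page))) {
                        if (lruvec) {
                                /* drop whichever lruvec lock we hold */
                                unlock_page_lruvec_irq(lruvec);
                                lruvec = NULL;
                        }
                        putback_lru_page(page);
                        continue;
                }

That avoids reacquiring a lock that may belong to a different lruvec than
the next page on the list.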

> >
> > I suppose the correct fix is to get rid of this line, but  it should
> > be placed everywhere the original function was calling
> > spin_lock_irq().
> >
> > In addition I would consider changing the arguments/documentation for
> > move_pages_to_lru. You aren't moving the pages to lruvec, so there is
> > probably no need to pass that as an argument. Instead I would pass
> > pgdat since that isn't going to be moving and is the only thing you
> > actually derive based on the original lruvec.
>
> yes, The comments should be changed with the line was introduced from long ago. :)
> Anyway, I am wondering if it worth a v18 version resend?

So I have been looking over the function itself and I wonder if it
isn't worth looking at rewriting this to optimize the locking behavior
to minimize the number of times we have to take the LRU lock. I have
some code I am working on that I plan to submit as an RFC in the next
day or so after I can get it smoke tested. The basic idea would be to
defer putting back the unevictable pages or freeing the compound pages
until after we have processed the pages that can be moved while still
holding the lock. I would think it should reduce the lock contention
significantly while improving the throughput.
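
As a rough, untested sketch of the idea (the list name is made up and the
real RFC will likely look different), the loop would park the problematic
pages on local lists and only handle them once the lru_lock has been
dropped:

        struct page *page;
        LIST_HEAD(unevictable_list);    /* deferred putback_lru_page() */

        spin_lock_irq(&lruvec->lru_lock);
        while (!list_empty(list)) {
                page = lru_to_page(list);
                list_del(&page->lru);

                if (unlikely(!page_evictable(page))) {
                        /* defer instead of dropping/retaking the lock */
                        list_add(&page->lru, &unevictable_list);
                        continue;
                }

                /*
                 * ... existing "move the page back onto its lru" path;
                 * a zero-ref compound page would likewise be parked on
                 * its own list for a deferred destroy ...
                 */
        }
        spin_unlock_irq(&lruvec->lru_lock);

        /* deferred work, done without the lru_lock held */
        while (!list_empty(&unevictable_list)) {
                page = lru_to_page(&unevictable_list);
                list_del(&page->lru);
                putback_lru_page(page);
        }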
Alex Shi July 29, 2020, 2:27 a.m. UTC | #10
On 2020/7/29 9:27 AM, Alexander Duyck wrote:
> On Tue, Jul 28, 2020 at 6:00 PM Alex Shi <alex.shi@linux.alibaba.com> wrote:
>>
>>
>>
>> On 2020/7/28 10:54 PM, Alexander Duyck wrote:
>>> On Tue, Jul 28, 2020 at 4:20 AM Alex Shi <alex.shi@linux.alibaba.com> wrote:
>>>>
>>>>
>>>>
>>>> On 2020/7/28 7:34 AM, Alexander Duyck wrote:
>>>>>> @@ -1876,6 +1876,12 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
>>>>>>                  *                                        list_add(&page->lru,)
>>>>>>                  *     list_add(&page->lru,) //corrupt
>>>>>>                  */
>>>>>> +               new_lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
>>>>>> +               if (new_lruvec != lruvec) {
>>>>>> +                       if (lruvec)
>>>>>> +                               spin_unlock_irq(&lruvec->lru_lock);
>>>>>> +                       lruvec = lock_page_lruvec_irq(page);
>>>>>> +               }
>>>>>>                 SetPageLRU(page);
>>>>>>
>>>>>>                 if (unlikely(put_page_testzero(page))) {
>>>>> I was going through the code of the entire patch set and I noticed
>>>>> these changes in move_pages_to_lru. What is the reason for adding the
>>>>> new_lruvec logic? My understanding is that we are moving the pages to
>>>>> the lruvec provided are we not?If so why do we need to add code to get
>>>>> a new lruvec? The code itself seems to stand out from the rest of the
>>>>> patch as it is introducing new code instead of replacing existing
>>>>> locking code, and it doesn't match up with the description of what
>>>>> this function is supposed to do since it changes the lruvec.
>>>>
>>>> this new_lruvec is the replacement of removed line, as following code:
>>>>>> -               lruvec = mem_cgroup_page_lruvec(page, pgdat);
>>>> This recheck is for the page move the root memcg, otherwise it cause the bug:
>>>
>>> Okay, now I see where the issue is. You moved this code so now it has
>>> a different effect than it did before. You are relocking things before
>>> you needed to. Don't forget that when you came into this function you
>>> already had the lock. In addition the patch is broken as it currently
>>> stands as you aren't using similar logic in the code just above this
>>> addition if you encounter an evictable page. As a result this is
>>> really difficult to review as there are subtle bugs here.
>>
>> Why you think its a bug? the relock only happens if locked lruvec is different.
>> and unlock the old one.
> 
> The section I am talking about with the bug is this section here:
>        while (!list_empty(list)) {
> +               struct lruvec *new_lruvec = NULL;
> +
>                 page = lru_to_page(list);
>                 VM_BUG_ON_PAGE(PageLRU(page), page);
>                 list_del(&page->lru);
>                 if (unlikely(!page_evictable(page))) {
> -                       spin_unlock_irq(&pgdat->lru_lock);
> +                       spin_unlock_irq(&lruvec->lru_lock);
>                         putback_lru_page(page);
> -                       spin_lock_irq(&pgdat->lru_lock);
> +                       spin_lock_irq(&lruvec->lru_lock);

It would still be fine. The lruvec->lru_lock will be checked again before
we take and use it.
And this locking is optimized further in patch 19, which was done by Hugh
Dickins.

>                         continue;
>                 }
> 
> Basically it probably is not advisable to be retaking the
> lruvec->lru_lock directly as the lruvec may have changed so it
> wouldn't be correct for the next page. It would make more sense to be
> using your API and calling unlock_page_lruvec_irq and
> lock_page_lruvec_irq instead of using the lock directly.
> 
>>>
>>> I suppose the correct fix is to get rid of this line, but  it should
>>> be placed everywhere the original function was calling
>>> spin_lock_irq().
>>>
>>> In addition I would consider changing the arguments/documentation for
>>> move_pages_to_lru. You aren't moving the pages to lruvec, so there is
>>> probably no need to pass that as an argument. Instead I would pass
>>> pgdat since that isn't going to be moving and is the only thing you
>>> actually derive based on the original lruvec.
>>
>> yes, The comments should be changed with the line was introduced from long ago. :)
>> Anyway, I am wondering if it worth a v18 version resend?
> 
> So I have been looking over the function itself and I wonder if it
> isn't worth looking at rewriting this to optimize the locking behavior
> to minimize the number of times we have to take the LRU lock. I have
> some code I am working on that I plan to submit as an RFC in the next
> day or so after I can get it smoke tested. The basic idea would be to
> defer returning the evictiable pages or freeing the compound pages
> until after we have processed the pages that can be moved while still
> holding the lock. I would think it should reduce the lock contention
> significantly while improving the throughput.
> 

I had tried that once, but the page freeing crosses over into release_pages(),
which is hard to deal with.
I am very glad to wait for your patch, and hope it can be resolved. :)

Thanks
Alex
Alex Shi July 29, 2020, 3:54 a.m. UTC | #11
Rewrote the commit log:

From 5e9340444632d69cf10c8db521577d0637819c5f Mon Sep 17 00:00:00 2001
From: Alex Shi <alex.shi@linux.alibaba.com>
Date: Tue, 26 May 2020 17:27:52 +0800
Subject: [PATCH v17 17/23] mm/lru: replace pgdat lru_lock with lruvec lock

This patch moves per node lru_lock into lruvec, thus bring a lru_lock for
each of memcg per node. So on a large machine, each of memcg don't
have to suffer from per node pgdat->lru_lock competition. They could go
fast with their self lru_lock.

After move memcg charge before lru inserting, page isolation could
serialize page's memcg, then per memcg lruvec lock is stable and could
replace per node lru lock.

In isolate_migratepages_block(), compact_unlock_should_abort() is
open-coded, and the lock_page_lruvec logic is embedded there to keep the
locked section tight.
Also add a debug func in locking which may give some clues if there are
sth out of hands.

According to Daniel Jordan's suggestion, I run 208 'dd' with on 104
containers on a 2s * 26cores * HT box with a modefied case:
https://git.kernel.org/pub/scm/linux/kernel/git/wfg/vm-scalability.git/tree/case-lru-file-readtwice

With this and later patches, the readtwice performance increases about
80% within concurrent containers.

On a large machine without memcg, the extra recheck of whether the page's
lruvec has changed, in a few places, slightly increases the lock holding
time and causes a small regression.

Hugh Dickins helped on patch polish, thanks!

Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Alex Shi <alex.shi@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Yang Shi <yang.shi@linux.alibaba.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Cc: Tejun Heo <tj@kernel.org>
Cc: linux-kernel@vger.kernel.org
Cc: linux-mm@kvack.org
Cc: cgroups@vger.kernel.org
---
 include/linux/memcontrol.h |  58 +++++++++++++++++++++++++
 include/linux/mmzone.h     |   2 +
 mm/compaction.c            |  67 ++++++++++++++++++-----------
 mm/huge_memory.c           |  11 ++---
 mm/memcontrol.c            |  63 ++++++++++++++++++++++++++-
 mm/mlock.c                 |  47 +++++++++++++-------
 mm/mmzone.c                |   1 +
 mm/swap.c                  | 104 +++++++++++++++++++++------------------------
 mm/vmscan.c                |  70 ++++++++++++++++--------------
 9 files changed, 288 insertions(+), 135 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index e77197a62809..258901021c6c 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -411,6 +411,19 @@ static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg,
 
 struct mem_cgroup *get_mem_cgroup_from_page(struct page *page);
 
+struct lruvec *lock_page_lruvec(struct page *page);
+struct lruvec *lock_page_lruvec_irq(struct page *page);
+struct lruvec *lock_page_lruvec_irqsave(struct page *page,
+						unsigned long *flags);
+
+#ifdef CONFIG_DEBUG_VM
+void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page);
+#else
+static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
+{
+}
+#endif
+
 static inline
 struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){
 	return css ? container_of(css, struct mem_cgroup, css) : NULL;
@@ -892,6 +905,31 @@ static inline void mem_cgroup_put(struct mem_cgroup *memcg)
 {
 }
 
+static inline struct lruvec *lock_page_lruvec(struct page *page)
+{
+	struct pglist_data *pgdat = page_pgdat(page);
+
+	spin_lock(&pgdat->__lruvec.lru_lock);
+	return &pgdat->__lruvec;
+}
+
+static inline struct lruvec *lock_page_lruvec_irq(struct page *page)
+{
+	struct pglist_data *pgdat = page_pgdat(page);
+
+	spin_lock_irq(&pgdat->__lruvec.lru_lock);
+	return &pgdat->__lruvec;
+}
+
+static inline struct lruvec *lock_page_lruvec_irqsave(struct page *page,
+		unsigned long *flagsp)
+{
+	struct pglist_data *pgdat = page_pgdat(page);
+
+	spin_lock_irqsave(&pgdat->__lruvec.lru_lock, *flagsp);
+	return &pgdat->__lruvec;
+}
+
 static inline struct mem_cgroup *
 mem_cgroup_iter(struct mem_cgroup *root,
 		struct mem_cgroup *prev,
@@ -1126,6 +1164,10 @@ static inline void count_memcg_page_event(struct page *page,
 void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx)
 {
 }
+
+static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
+{
+}
 #endif /* CONFIG_MEMCG */
 
 /* idx can be of type enum memcg_stat_item or node_stat_item */
@@ -1255,6 +1297,22 @@ static inline struct lruvec *parent_lruvec(struct lruvec *lruvec)
 	return mem_cgroup_lruvec(memcg, lruvec_pgdat(lruvec));
 }
 
+static inline void unlock_page_lruvec(struct lruvec *lruvec)
+{
+	spin_unlock(&lruvec->lru_lock);
+}
+
+static inline void unlock_page_lruvec_irq(struct lruvec *lruvec)
+{
+	spin_unlock_irq(&lruvec->lru_lock);
+}
+
+static inline void unlock_page_lruvec_irqrestore(struct lruvec *lruvec,
+		unsigned long flags)
+{
+	spin_unlock_irqrestore(&lruvec->lru_lock, flags);
+}
+
 #ifdef CONFIG_CGROUP_WRITEBACK
 
 struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 14c668b7e793..30b961a9a749 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -249,6 +249,8 @@ enum lruvec_flags {
 };
 
 struct lruvec {
+	/* per lruvec lru_lock for memcg */
+	spinlock_t			lru_lock;
 	struct list_head		lists[NR_LRU_LISTS];
 	/*
 	 * These track the cost of reclaiming one LRU - file or anon -
diff --git a/mm/compaction.c b/mm/compaction.c
index 72f135330f81..c28a43481f01 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -787,7 +787,7 @@ static bool too_many_isolated(pg_data_t *pgdat)
 	unsigned long nr_scanned = 0, nr_isolated = 0;
 	struct lruvec *lruvec;
 	unsigned long flags = 0;
-	bool locked = false;
+	struct lruvec *locked_lruvec = NULL;
 	struct page *page = NULL, *valid_page = NULL;
 	unsigned long start_pfn = low_pfn;
 	bool skip_on_failure = false;
@@ -847,11 +847,21 @@ static bool too_many_isolated(pg_data_t *pgdat)
 		 * contention, to give chance to IRQs. Abort completely if
 		 * a fatal signal is pending.
 		 */
-		if (!(low_pfn % SWAP_CLUSTER_MAX)
-		    && compact_unlock_should_abort(&pgdat->lru_lock,
-					    flags, &locked, cc)) {
-			low_pfn = 0;
-			goto fatal_pending;
+		if (!(low_pfn % SWAP_CLUSTER_MAX)) {
+			if (locked_lruvec) {
+				unlock_page_lruvec_irqrestore(locked_lruvec,
+									flags);
+				locked_lruvec = NULL;
+			}
+
+			if (fatal_signal_pending(current)) {
+				cc->contended = true;
+
+				low_pfn = 0;
+				goto fatal_pending;
+			}
+
+			cond_resched();
 		}
 
 		if (!pfn_valid_within(low_pfn))
@@ -922,10 +932,9 @@ static bool too_many_isolated(pg_data_t *pgdat)
 			 */
 			if (unlikely(__PageMovable(page)) &&
 					!PageIsolated(page)) {
-				if (locked) {
-					spin_unlock_irqrestore(&pgdat->lru_lock,
-									flags);
-					locked = false;
+				if (locked_lruvec) {
+					unlock_page_lruvec_irqrestore(locked_lruvec, flags);
+					locked_lruvec = NULL;
 				}
 
 				if (!isolate_movable_page(page, isolate_mode))
@@ -966,10 +975,20 @@ static bool too_many_isolated(pg_data_t *pgdat)
 		if (!TestClearPageLRU(page))
 			goto isolate_fail_put;
 
+		rcu_read_lock();
+		lruvec = mem_cgroup_page_lruvec(page, pgdat);
+
 		/* If we already hold the lock, we can skip some rechecking */
-		if (!locked) {
-			locked = compact_lock_irqsave(&pgdat->lru_lock,
-								&flags, cc);
+		if (lruvec != locked_lruvec) {
+			if (locked_lruvec)
+				unlock_page_lruvec_irqrestore(locked_lruvec,
+									flags);
+
+			compact_lock_irqsave(&lruvec->lru_lock, &flags, cc);
+			locked_lruvec = lruvec;
+			rcu_read_unlock();
+
+			lruvec_memcg_debug(lruvec, page);
 
 			/* Try get exclusive access under lock */
 			if (!skip_updated) {
@@ -988,9 +1007,8 @@ static bool too_many_isolated(pg_data_t *pgdat)
 				SetPageLRU(page);
 				goto isolate_fail_put;
 			}
-		}
-
-		lruvec = mem_cgroup_page_lruvec(page, pgdat);
+		} else
+			rcu_read_unlock();
 
 		/* The whole page is taken off the LRU; skip the tail pages. */
 		if (PageCompound(page))
@@ -1023,9 +1041,9 @@ static bool too_many_isolated(pg_data_t *pgdat)
 
 isolate_fail_put:
 		/* Avoid potential deadlock in freeing page under lru_lock */
-		if (locked) {
-			spin_unlock_irqrestore(&pgdat->lru_lock, flags);
-			locked = false;
+		if (locked_lruvec) {
+			unlock_page_lruvec_irqrestore(locked_lruvec, flags);
+			locked_lruvec = NULL;
 		}
 		put_page(page);
 
@@ -1039,9 +1057,10 @@ static bool too_many_isolated(pg_data_t *pgdat)
 		 * page anyway.
 		 */
 		if (nr_isolated) {
-			if (locked) {
-				spin_unlock_irqrestore(&pgdat->lru_lock, flags);
-				locked = false;
+			if (locked_lruvec) {
+				unlock_page_lruvec_irqrestore(locked_lruvec,
+									flags);
+				locked_lruvec = NULL;
 			}
 			putback_movable_pages(&cc->migratepages);
 			cc->nr_migratepages = 0;
@@ -1068,8 +1087,8 @@ static bool too_many_isolated(pg_data_t *pgdat)
 	page = NULL;
 
 isolate_abort:
-	if (locked)
-		spin_unlock_irqrestore(&pgdat->lru_lock, flags);
+	if (locked_lruvec)
+		unlock_page_lruvec_irqrestore(locked_lruvec, flags);
 	if (page) {
 		SetPageLRU(page);
 		put_page(page);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 28538444197b..a0cb95891ae5 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2346,7 +2346,7 @@ static void lru_add_page_tail(struct page *head, struct page *page_tail,
 	VM_BUG_ON_PAGE(!PageHead(head), head);
 	VM_BUG_ON_PAGE(PageCompound(page_tail), head);
 	VM_BUG_ON_PAGE(PageLRU(page_tail), head);
-	lockdep_assert_held(&lruvec_pgdat(lruvec)->lru_lock);
+	lockdep_assert_held(&lruvec->lru_lock);
 
 	if (list) {
 		/* page reclaim is reclaiming a huge page */
@@ -2430,7 +2430,6 @@ static void __split_huge_page(struct page *page, struct list_head *list,
 			      pgoff_t end)
 {
 	struct page *head = compound_head(page);
-	pg_data_t *pgdat = page_pgdat(head);
 	struct lruvec *lruvec;
 	struct address_space *swap_cache = NULL;
 	unsigned long offset = 0;
@@ -2447,10 +2446,8 @@ static void __split_huge_page(struct page *page, struct list_head *list,
 		xa_lock(&swap_cache->i_pages);
 	}
 
-	/* prevent PageLRU to go away from under us, and freeze lru stats */
-	spin_lock(&pgdat->lru_lock);
-
-	lruvec = mem_cgroup_page_lruvec(head, pgdat);
+	/* lock lru list/PageCompound, ref frozen by page_ref_freeze */
+	lruvec = lock_page_lruvec(head);
 
 	for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
 		__split_huge_page_tail(head, i, lruvec, list);
@@ -2471,7 +2468,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
 	}
 
 	ClearPageCompound(head);
-	spin_unlock(&pgdat->lru_lock);
+	unlock_page_lruvec(lruvec);
 	/* Caller disabled irqs, so they are still disabled here */
 
 	split_page_owner(head, HPAGE_PMD_ORDER);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 20c8ed69a930..d6746656cc39 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1196,6 +1196,19 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
 	return ret;
 }
 
+#ifdef CONFIG_DEBUG_VM
+void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
+{
+	if (mem_cgroup_disabled())
+		return;
+
+	if (!page->mem_cgroup)
+		VM_BUG_ON_PAGE(lruvec_memcg(lruvec) != root_mem_cgroup, page);
+	else
+		VM_BUG_ON_PAGE(lruvec_memcg(lruvec) != page->mem_cgroup, page);
+}
+#endif
+
 /**
  * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
  * @page: the page
@@ -1215,7 +1228,8 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgd
 		goto out;
 	}
 
-	memcg = page->mem_cgroup;
+	VM_BUG_ON_PAGE(PageTail(page), page);
+	memcg = READ_ONCE(page->mem_cgroup);
 	/*
 	 * Swapcache readahead pages are added to the LRU - and
 	 * possibly migrated - before they are charged.
@@ -1236,6 +1250,51 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgd
 	return lruvec;
 }
 
+struct lruvec *lock_page_lruvec(struct page *page)
+{
+	struct lruvec *lruvec;
+	struct pglist_data *pgdat = page_pgdat(page);
+
+	rcu_read_lock();
+	lruvec = mem_cgroup_page_lruvec(page, pgdat);
+	spin_lock(&lruvec->lru_lock);
+	rcu_read_unlock();
+
+	lruvec_memcg_debug(lruvec, page);
+
+	return lruvec;
+}
+
+struct lruvec *lock_page_lruvec_irq(struct page *page)
+{
+	struct lruvec *lruvec;
+	struct pglist_data *pgdat = page_pgdat(page);
+
+	rcu_read_lock();
+	lruvec = mem_cgroup_page_lruvec(page, pgdat);
+	spin_lock_irq(&lruvec->lru_lock);
+	rcu_read_unlock();
+
+	lruvec_memcg_debug(lruvec, page);
+
+	return lruvec;
+}
+
+struct lruvec *lock_page_lruvec_irqsave(struct page *page, unsigned long *flags)
+{
+	struct lruvec *lruvec;
+	struct pglist_data *pgdat = page_pgdat(page);
+
+	rcu_read_lock();
+	lruvec = mem_cgroup_page_lruvec(page, pgdat);
+	spin_lock_irqsave(&lruvec->lru_lock, *flags);
+	rcu_read_unlock();
+
+	lruvec_memcg_debug(lruvec, page);
+
+	return lruvec;
+}
+
 /**
  * mem_cgroup_update_lru_size - account for adding or removing an lru page
  * @lruvec: mem_cgroup per zone lru vector
@@ -2999,7 +3058,7 @@ void __memcg_kmem_uncharge_page(struct page *page, int order)
 
 /*
  * Because tail pages are not marked as "used", set it. We're under
- * pgdat->lru_lock and migration entries setup in all page mappings.
+ * lruvec->lru_lock and migration entries setup in all page mappings.
  */
 void mem_cgroup_split_huge_fixup(struct page *head)
 {
diff --git a/mm/mlock.c b/mm/mlock.c
index 228ba5a8e0a5..5d40d259a931 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -106,12 +106,10 @@ void mlock_vma_page(struct page *page)
  * Isolate a page from LRU with optional get_page() pin.
  * Assumes lru_lock already held and page already pinned.
  */
-static bool __munlock_isolate_lru_page(struct page *page, bool getpage)
+static bool __munlock_isolate_lru_page(struct page *page,
+				struct lruvec *lruvec, bool getpage)
 {
 	if (TestClearPageLRU(page)) {
-		struct lruvec *lruvec;
-
-		lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
 		if (getpage)
 			get_page(page);
 		del_page_from_lru_list(page, lruvec, page_lru(page));
@@ -181,7 +179,7 @@ static void __munlock_isolation_failed(struct page *page)
 unsigned int munlock_vma_page(struct page *page)
 {
 	int nr_pages;
-	pg_data_t *pgdat = page_pgdat(page);
+	struct lruvec *lruvec;
 
 	/* For try_to_munlock() and to serialize with page migration */
 	BUG_ON(!PageLocked(page));
@@ -189,11 +187,16 @@ unsigned int munlock_vma_page(struct page *page)
 	VM_BUG_ON_PAGE(PageTail(page), page);
 
 	/*
-	 * Serialize with any parallel __split_huge_page_refcount() which
+	 * Serialize split tail pages in __split_huge_page_tail() which
 	 * might otherwise copy PageMlocked to part of the tail pages before
 	 * we clear it in the head page. It also stabilizes hpage_nr_pages().
+	 * TestClearPageLRU can't be used here to block page isolation, since
+	 * a clear_page_mlock done without the lock may interfere with the
+	 * PageLRU/PageMlocked sequence, same as __pagevec_lru_add_fn, and
+	 * place the page on the wrong lru list here. So rely on PageLocked
+	 * to stop lruvec changes in mem_cgroup_move_account().
 	 */
-	spin_lock_irq(&pgdat->lru_lock);
+	lruvec = lock_page_lruvec_irq(page);
 
 	if (!TestClearPageMlocked(page)) {
 		/* Potentially, PTE-mapped THP: do not skip the rest PTEs */
@@ -204,15 +207,15 @@ unsigned int munlock_vma_page(struct page *page)
 	nr_pages = hpage_nr_pages(page);
 	__mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
 
-	if (__munlock_isolate_lru_page(page, true)) {
-		spin_unlock_irq(&pgdat->lru_lock);
+	if (__munlock_isolate_lru_page(page, lruvec, true)) {
+		unlock_page_lruvec_irq(lruvec);
 		__munlock_isolated_page(page);
 		goto out;
 	}
 	__munlock_isolation_failed(page);
 
 unlock_out:
-	spin_unlock_irq(&pgdat->lru_lock);
+	unlock_page_lruvec_irq(lruvec);
 
 out:
 	return nr_pages - 1;
@@ -292,23 +295,34 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
 	int nr = pagevec_count(pvec);
 	int delta_munlocked = -nr;
 	struct pagevec pvec_putback;
+	struct lruvec *lruvec = NULL;
 	int pgrescued = 0;
 
 	pagevec_init(&pvec_putback);
 
 	/* Phase 1: page isolation */
-	spin_lock_irq(&zone->zone_pgdat->lru_lock);
 	for (i = 0; i < nr; i++) {
 		struct page *page = pvec->pages[i];
+		struct lruvec *new_lruvec;
+
+		/* block memcg change in mem_cgroup_move_account */
+		lock_page_memcg(page);
+		new_lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
+		if (new_lruvec != lruvec) {
+			if (lruvec)
+				unlock_page_lruvec_irq(lruvec);
+			lruvec = lock_page_lruvec_irq(page);
+		}
 
 		if (TestClearPageMlocked(page)) {
 			/*
 			 * We already have pin from follow_page_mask()
 			 * so we can spare the get_page() here.
 			 */
-			if (__munlock_isolate_lru_page(page, false))
+			if (__munlock_isolate_lru_page(page, lruvec, false)) {
+				unlock_page_memcg(page);
 				continue;
-			else
+			} else
 				__munlock_isolation_failed(page);
 		} else {
 			delta_munlocked++;
@@ -320,11 +334,14 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
 		 * pin. We cannot do it under lru_lock however. If it's
 		 * the last pin, __page_cache_release() would deadlock.
 		 */
+		unlock_page_memcg(page);
 		pagevec_add(&pvec_putback, pvec->pages[i]);
 		pvec->pages[i] = NULL;
 	}
-	__mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
-	spin_unlock_irq(&zone->zone_pgdat->lru_lock);
+	if (lruvec) {
+		__mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
+		unlock_page_lruvec_irq(lruvec);
+	}
 
 	/* Now we can release pins of pages that we are not munlocking */
 	pagevec_release(&pvec_putback);
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 4686fdc23bb9..3750a90ed4a0 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -91,6 +91,7 @@ void lruvec_init(struct lruvec *lruvec)
 	enum lru_list lru;
 
 	memset(lruvec, 0, sizeof(struct lruvec));
+	spin_lock_init(&lruvec->lru_lock);
 
 	for_each_lru(lru)
 		INIT_LIST_HEAD(&lruvec->lists[lru]);
diff --git a/mm/swap.c b/mm/swap.c
index 3029b3f74811..09edac441eb6 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -79,15 +79,13 @@ static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = {
 static void __page_cache_release(struct page *page)
 {
 	if (PageLRU(page)) {
-		pg_data_t *pgdat = page_pgdat(page);
 		struct lruvec *lruvec;
 		unsigned long flags;
 
 		__ClearPageLRU(page);
-		spin_lock_irqsave(&pgdat->lru_lock, flags);
-		lruvec = mem_cgroup_page_lruvec(page, pgdat);
+		lruvec = lock_page_lruvec_irqsave(page, &flags);
 		del_page_from_lru_list(page, lruvec, page_off_lru(page));
-		spin_unlock_irqrestore(&pgdat->lru_lock, flags);
+		unlock_page_lruvec_irqrestore(lruvec, flags);
 	}
 	__ClearPageWaiters(page);
 }
@@ -206,32 +204,30 @@ static void pagevec_lru_move_fn(struct pagevec *pvec,
 	void (*move_fn)(struct page *page, struct lruvec *lruvec))
 {
 	int i;
-	struct pglist_data *pgdat = NULL;
-	struct lruvec *lruvec;
+	struct lruvec *lruvec = NULL;
 	unsigned long flags = 0;
 
 	for (i = 0; i < pagevec_count(pvec); i++) {
 		struct page *page = pvec->pages[i];
-		struct pglist_data *pagepgdat = page_pgdat(page);
-
-		if (pagepgdat != pgdat) {
-			if (pgdat)
-				spin_unlock_irqrestore(&pgdat->lru_lock, flags);
-			pgdat = pagepgdat;
-			spin_lock_irqsave(&pgdat->lru_lock, flags);
-		}
+		struct lruvec *new_lruvec;
 
 		/* block memcg migration during page moving between lru */
 		if (!TestClearPageLRU(page))
 			continue;
 
-		lruvec = mem_cgroup_page_lruvec(page, pgdat);
+		new_lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
+		if (lruvec != new_lruvec) {
+			if (lruvec)
+				unlock_page_lruvec_irqrestore(lruvec, flags);
+			lruvec = lock_page_lruvec_irqsave(page, &flags);
+		}
+
 		(*move_fn)(page, lruvec);
 
 		SetPageLRU(page);
 	}
-	if (pgdat)
-		spin_unlock_irqrestore(&pgdat->lru_lock, flags);
+	if (lruvec)
+		unlock_page_lruvec_irqrestore(lruvec, flags);
 	release_pages(pvec->pages, pvec->nr);
 	pagevec_reinit(pvec);
 }
@@ -274,9 +270,8 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
 {
 	do {
 		unsigned long lrusize;
-		struct pglist_data *pgdat = lruvec_pgdat(lruvec);
 
-		spin_lock_irq(&pgdat->lru_lock);
+		spin_lock_irq(&lruvec->lru_lock);
 		/* Record cost event */
 		if (file)
 			lruvec->file_cost += nr_pages;
@@ -300,7 +295,7 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
 			lruvec->file_cost /= 2;
 			lruvec->anon_cost /= 2;
 		}
-		spin_unlock_irq(&pgdat->lru_lock);
+		spin_unlock_irq(&lruvec->lru_lock);
 	} while ((lruvec = parent_lruvec(lruvec)));
 }
 
@@ -365,12 +360,13 @@ static inline void activate_page_drain(int cpu)
 void activate_page(struct page *page)
 {
 	pg_data_t *pgdat = page_pgdat(page);
+	struct lruvec *lruvec;
 
 	page = compound_head(page);
-	spin_lock_irq(&pgdat->lru_lock);
+	lruvec = lock_page_lruvec_irq(page);
 	if (PageLRU(page))
-		__activate_page(page, mem_cgroup_page_lruvec(page, pgdat));
-	spin_unlock_irq(&pgdat->lru_lock);
+		__activate_page(page, lruvec);
+	unlock_page_lruvec_irq(lruvec);
 }
 #endif
 
@@ -817,8 +813,7 @@ void release_pages(struct page **pages, int nr)
 {
 	int i;
 	LIST_HEAD(pages_to_free);
-	struct pglist_data *locked_pgdat = NULL;
-	struct lruvec *lruvec;
+	struct lruvec *lruvec = NULL;
 	unsigned long uninitialized_var(flags);
 	unsigned int uninitialized_var(lock_batch);
 
@@ -828,21 +823,20 @@ void release_pages(struct page **pages, int nr)
 		/*
 		 * Make sure the IRQ-safe lock-holding time does not get
 		 * excessive with a continuous string of pages from the
-		 * same pgdat. The lock is held only if pgdat != NULL.
+		 * same lruvec. The lock is held only if lruvec != NULL.
 		 */
-		if (locked_pgdat && ++lock_batch == SWAP_CLUSTER_MAX) {
-			spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
-			locked_pgdat = NULL;
+		if (lruvec && ++lock_batch == SWAP_CLUSTER_MAX) {
+			unlock_page_lruvec_irqrestore(lruvec, flags);
+			lruvec = NULL;
 		}
 
 		if (is_huge_zero_page(page))
 			continue;
 
 		if (is_zone_device_page(page)) {
-			if (locked_pgdat) {
-				spin_unlock_irqrestore(&locked_pgdat->lru_lock,
-						       flags);
-				locked_pgdat = NULL;
+			if (lruvec) {
+				unlock_page_lruvec_irqrestore(lruvec, flags);
+				lruvec = NULL;
 			}
 			/*
 			 * ZONE_DEVICE pages that return 'false' from
@@ -861,28 +855,28 @@ void release_pages(struct page **pages, int nr)
 			continue;
 
 		if (PageCompound(page)) {
-			if (locked_pgdat) {
-				spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
-				locked_pgdat = NULL;
+			if (lruvec) {
+				unlock_page_lruvec_irqrestore(lruvec, flags);
+				lruvec = NULL;
 			}
 			__put_compound_page(page);
 			continue;
 		}
 
 		if (PageLRU(page)) {
-			struct pglist_data *pgdat = page_pgdat(page);
+			struct lruvec *new_lruvec;
 
-			if (pgdat != locked_pgdat) {
-				if (locked_pgdat)
-					spin_unlock_irqrestore(&locked_pgdat->lru_lock,
+			new_lruvec = mem_cgroup_page_lruvec(page,
+							page_pgdat(page));
+			if (new_lruvec != lruvec) {
+				if (lruvec)
+					unlock_page_lruvec_irqrestore(lruvec,
 									flags);
 				lock_batch = 0;
-				locked_pgdat = pgdat;
-				spin_lock_irqsave(&locked_pgdat->lru_lock, flags);
+				lruvec = lock_page_lruvec_irqsave(page, &flags);
 			}
 
 			__ClearPageLRU(page);
-			lruvec = mem_cgroup_page_lruvec(page, locked_pgdat);
 			del_page_from_lru_list(page, lruvec, page_off_lru(page));
 		}
 
@@ -892,8 +886,8 @@ void release_pages(struct page **pages, int nr)
 
 		list_add(&page->lru, &pages_to_free);
 	}
-	if (locked_pgdat)
-		spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
+	if (lruvec)
+		unlock_page_lruvec_irqrestore(lruvec, flags);
 
 	mem_cgroup_uncharge_list(&pages_to_free);
 	free_unref_page_list(&pages_to_free);
@@ -981,26 +975,24 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec)
 void __pagevec_lru_add(struct pagevec *pvec)
 {
 	int i;
-	struct pglist_data *pgdat = NULL;
-	struct lruvec *lruvec;
+	struct lruvec *lruvec = NULL;
 	unsigned long flags = 0;
 
 	for (i = 0; i < pagevec_count(pvec); i++) {
 		struct page *page = pvec->pages[i];
-		struct pglist_data *pagepgdat = page_pgdat(page);
+		struct lruvec *new_lruvec;
 
-		if (pagepgdat != pgdat) {
-			if (pgdat)
-				spin_unlock_irqrestore(&pgdat->lru_lock, flags);
-			pgdat = pagepgdat;
-			spin_lock_irqsave(&pgdat->lru_lock, flags);
+		new_lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
+		if (lruvec != new_lruvec) {
+			if (lruvec)
+				unlock_page_lruvec_irqrestore(lruvec, flags);
+			lruvec = lock_page_lruvec_irqsave(page, &flags);
 		}
 
-		lruvec = mem_cgroup_page_lruvec(page, pgdat);
 		__pagevec_lru_add_fn(page, lruvec);
 	}
-	if (pgdat)
-		spin_unlock_irqrestore(&pgdat->lru_lock, flags);
+	if (lruvec)
+		unlock_page_lruvec_irqrestore(lruvec, flags);
 	release_pages(pvec->pages, pvec->nr);
 	pagevec_reinit(pvec);
 }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f77748adc340..168c1659e430 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1774,15 +1774,13 @@ int isolate_lru_page(struct page *page)
 	WARN_RATELIMIT(PageTail(page), "trying to isolate tail page");
 
 	if (TestClearPageLRU(page)) {
-		pg_data_t *pgdat = page_pgdat(page);
 		struct lruvec *lruvec;
 		int lru = page_lru(page);
 
 		get_page(page);
-		lruvec = mem_cgroup_page_lruvec(page, pgdat);
-		spin_lock_irq(&pgdat->lru_lock);
+		lruvec = lock_page_lruvec_irq(page);
 		del_page_from_lru_list(page, lruvec, lru);
-		spin_unlock_irq(&pgdat->lru_lock);
+		unlock_page_lruvec_irq(lruvec);
 		ret = 0;
 	}
 
@@ -1849,20 +1847,22 @@ static int too_many_isolated(struct pglist_data *pgdat, int file,
 static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
 						     struct list_head *list)
 {
-	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
 	int nr_pages, nr_moved = 0;
 	LIST_HEAD(pages_to_free);
 	struct page *page;
+	struct lruvec *orig_lruvec = lruvec;
 	enum lru_list lru;
 
 	while (!list_empty(list)) {
+		struct lruvec *new_lruvec = NULL;
+
 		page = lru_to_page(list);
 		VM_BUG_ON_PAGE(PageLRU(page), page);
 		list_del(&page->lru);
 		if (unlikely(!page_evictable(page))) {
-			spin_unlock_irq(&pgdat->lru_lock);
+			spin_unlock_irq(&lruvec->lru_lock);
 			putback_lru_page(page);
-			spin_lock_irq(&pgdat->lru_lock);
+			spin_lock_irq(&lruvec->lru_lock);
 			continue;
 		}
 
@@ -1876,6 +1876,12 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
 		 *                                        list_add(&page->lru,)
 		 *     list_add(&page->lru,) //corrupt
 		 */
+		new_lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
+		if (new_lruvec != lruvec) {
+			if (lruvec)
+				spin_unlock_irq(&lruvec->lru_lock);
+			lruvec = lock_page_lruvec_irq(page);
+		}
 		SetPageLRU(page);
 
 		if (unlikely(put_page_testzero(page))) {
@@ -1883,16 +1889,15 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
 			__ClearPageActive(page);
 
 			if (unlikely(PageCompound(page))) {
-				spin_unlock_irq(&pgdat->lru_lock);
+				spin_unlock_irq(&lruvec->lru_lock);
 				destroy_compound_page(page);
-				spin_lock_irq(&pgdat->lru_lock);
+				spin_lock_irq(&lruvec->lru_lock);
 			} else
 				list_add(&page->lru, &pages_to_free);
 
 			continue;
 		}
 
-		lruvec = mem_cgroup_page_lruvec(page, pgdat);
 		lru = page_lru(page);
 		nr_pages = hpage_nr_pages(page);
 
@@ -1902,6 +1907,11 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
 		if (PageActive(page))
 			workingset_age_nonresident(lruvec, nr_pages);
 	}
+	if (orig_lruvec != lruvec) {
+		if (lruvec)
+			spin_unlock_irq(&lruvec->lru_lock);
+		spin_lock_irq(&orig_lruvec->lru_lock);
+	}
 
 	/*
 	 * To save our caller's stack, now use input list for pages to free.
@@ -1957,7 +1967,7 @@ static int current_may_throttle(void)
 
 	lru_add_drain();
 
-	spin_lock_irq(&pgdat->lru_lock);
+	spin_lock_irq(&lruvec->lru_lock);
 
 	nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
 				     &nr_scanned, sc, lru);
@@ -1969,7 +1979,7 @@ static int current_may_throttle(void)
 	__count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
 	__count_vm_events(PGSCAN_ANON + file, nr_scanned);
 
-	spin_unlock_irq(&pgdat->lru_lock);
+	spin_unlock_irq(&lruvec->lru_lock);
 
 	if (nr_taken == 0)
 		return 0;
@@ -1977,7 +1987,7 @@ static int current_may_throttle(void)
 	nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, 0,
 				&stat, false);
 
-	spin_lock_irq(&pgdat->lru_lock);
+	spin_lock_irq(&lruvec->lru_lock);
 	move_pages_to_lru(lruvec, &page_list);
 
 	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
@@ -1986,7 +1996,7 @@ static int current_may_throttle(void)
 		__count_vm_events(item, nr_reclaimed);
 	__count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
 	__count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);
-	spin_unlock_irq(&pgdat->lru_lock);
+	spin_unlock_irq(&lruvec->lru_lock);
 
 	lru_note_cost(lruvec, file, stat.nr_pageout);
 	mem_cgroup_uncharge_list(&page_list);
@@ -2039,7 +2049,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
 
 	lru_add_drain();
 
-	spin_lock_irq(&pgdat->lru_lock);
+	spin_lock_irq(&lruvec->lru_lock);
 
 	nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
 				     &nr_scanned, sc, lru);
@@ -2049,7 +2059,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
 	__count_vm_events(PGREFILL, nr_scanned);
 	__count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
 
-	spin_unlock_irq(&pgdat->lru_lock);
+	spin_unlock_irq(&lruvec->lru_lock);
 
 	while (!list_empty(&l_hold)) {
 		cond_resched();
@@ -2095,7 +2105,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
 	/*
 	 * Move pages back to the lru list.
 	 */
-	spin_lock_irq(&pgdat->lru_lock);
+	spin_lock_irq(&lruvec->lru_lock);
 
 	nr_activate = move_pages_to_lru(lruvec, &l_active);
 	nr_deactivate = move_pages_to_lru(lruvec, &l_inactive);
@@ -2106,7 +2116,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
 	__count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
 
 	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
-	spin_unlock_irq(&pgdat->lru_lock);
+	spin_unlock_irq(&lruvec->lru_lock);
 
 	mem_cgroup_uncharge_list(&l_active);
 	free_unref_page_list(&l_active);
@@ -2696,10 +2706,10 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 	/*
 	 * Determine the scan balance between anon and file LRUs.
 	 */
-	spin_lock_irq(&pgdat->lru_lock);
+	spin_lock_irq(&target_lruvec->lru_lock);
 	sc->anon_cost = target_lruvec->anon_cost;
 	sc->file_cost = target_lruvec->file_cost;
-	spin_unlock_irq(&pgdat->lru_lock);
+	spin_unlock_irq(&target_lruvec->lru_lock);
 
 	/*
 	 * Target desirable inactive:active list ratios for the anon
@@ -4275,24 +4285,22 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
  */
 void check_move_unevictable_pages(struct pagevec *pvec)
 {
-	struct lruvec *lruvec;
-	struct pglist_data *pgdat = NULL;
+	struct lruvec *lruvec = NULL;
 	int pgscanned = 0;
 	int pgrescued = 0;
 	int i;
 
 	for (i = 0; i < pvec->nr; i++) {
 		struct page *page = pvec->pages[i];
-		struct pglist_data *pagepgdat = page_pgdat(page);
+		struct lruvec *new_lruvec;
 
 		pgscanned++;
-		if (pagepgdat != pgdat) {
-			if (pgdat)
-				spin_unlock_irq(&pgdat->lru_lock);
-			pgdat = pagepgdat;
-			spin_lock_irq(&pgdat->lru_lock);
+		new_lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
+		if (lruvec != new_lruvec) {
+			if (lruvec)
+				unlock_page_lruvec_irq(lruvec);
+			lruvec = lock_page_lruvec_irq(page);
 		}
-		lruvec = mem_cgroup_page_lruvec(page, pgdat);
 
 		if (!PageLRU(page) || !PageUnevictable(page))
 			continue;
@@ -4308,10 +4316,10 @@ void check_move_unevictable_pages(struct pagevec *pvec)
 		}
 	}
 
-	if (pgdat) {
+	if (lruvec) {
 		__count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
 		__count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
-		spin_unlock_irq(&pgdat->lru_lock);
+		unlock_page_lruvec_irq(lruvec);
 	}
 }
 EXPORT_SYMBOL_GPL(check_move_unevictable_pages);
Alex Shi Aug. 6, 2020, 7:41 a.m. UTC | #12
Hi Johannes, Michal,

Going from a page to its lruvec takes a few memory accesses under the
lock, which adds some extra cost.
Would you like to save the per-memcg lruvec pointer in page->private?
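
To make the question concrete, a very rough sketch of what I mean. It is
purely illustrative: it ignores the existing users of page->private
(buffer heads, swap entries, ...), ignores the invalidation needed when a
page is moved to another memcg, and the helper name is made up:

static inline struct lruvec *page_cached_lruvec(struct page *page)
{
        struct lruvec *lruvec = (struct lruvec *)page_private(page);

        if (!lruvec) {
                /* first lookup: resolve via page->mem_cgroup, then cache it */
                lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
                set_page_private(page, (unsigned long)lruvec);
        }
        return lruvec;
}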

Thanks
Alex



On 2020/7/25 8:59 PM, Alex Shi wrote:
>  /**
>   * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
>   * @page: the page
> @@ -1215,7 +1228,8 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgd
>  		goto out;
>  	}
>  
> -	memcg = page->mem_cgroup;
> +	VM_BUG_ON_PAGE(PageTail(page), page);
> +	memcg = READ_ONCE(page->mem_cgroup);
>  	/*
>  	 * Swapcache readahead pages are added to the LRU - and
>  	 * possibly migrated - before they are charged.
> @@ -1236,6 +1250,51 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgd
>  	return lruvec;
>  }
>  
> +struct lruvec *lock_page_lruvec(struct page *page)
> +{
> +	struct lruvec *lruvec;
> +	struct pglist_data *pgdat = page_pgdat(page);
> +
> +	rcu_read_lock();
> +	lruvec = mem_cgroup_page_lruvec(page, pgdat);
> +	spin_lock(&lruvec->lru_lock);
> +	rcu_read_unlock();
> +
> +	lruvec_memcg_debug(lruvec, page);
> +
> +	return lruvec;
> +}
> +
diff mbox series

Patch

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index e77197a62809..258901021c6c 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -411,6 +411,19 @@  static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg,
 
 struct mem_cgroup *get_mem_cgroup_from_page(struct page *page);
 
+struct lruvec *lock_page_lruvec(struct page *page);
+struct lruvec *lock_page_lruvec_irq(struct page *page);
+struct lruvec *lock_page_lruvec_irqsave(struct page *page,
+						unsigned long *flags);
+
+#ifdef CONFIG_DEBUG_VM
+void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page);
+#else
+static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
+{
+}
+#endif
+
 static inline
 struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){
 	return css ? container_of(css, struct mem_cgroup, css) : NULL;
@@ -892,6 +905,31 @@  static inline void mem_cgroup_put(struct mem_cgroup *memcg)
 {
 }
 
+static inline struct lruvec *lock_page_lruvec(struct page *page)
+{
+	struct pglist_data *pgdat = page_pgdat(page);
+
+	spin_lock(&pgdat->__lruvec.lru_lock);
+	return &pgdat->__lruvec;
+}
+
+static inline struct lruvec *lock_page_lruvec_irq(struct page *page)
+{
+	struct pglist_data *pgdat = page_pgdat(page);
+
+	spin_lock_irq(&pgdat->__lruvec.lru_lock);
+	return &pgdat->__lruvec;
+}
+
+static inline struct lruvec *lock_page_lruvec_irqsave(struct page *page,
+		unsigned long *flagsp)
+{
+	struct pglist_data *pgdat = page_pgdat(page);
+
+	spin_lock_irqsave(&pgdat->__lruvec.lru_lock, *flagsp);
+	return &pgdat->__lruvec;
+}
+
 static inline struct mem_cgroup *
 mem_cgroup_iter(struct mem_cgroup *root,
 		struct mem_cgroup *prev,
@@ -1126,6 +1164,10 @@  static inline void count_memcg_page_event(struct page *page,
 void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx)
 {
 }
+
+static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
+{
+}
 #endif /* CONFIG_MEMCG */
 
 /* idx can be of type enum memcg_stat_item or node_stat_item */
@@ -1255,6 +1297,22 @@  static inline struct lruvec *parent_lruvec(struct lruvec *lruvec)
 	return mem_cgroup_lruvec(memcg, lruvec_pgdat(lruvec));
 }
 
+static inline void unlock_page_lruvec(struct lruvec *lruvec)
+{
+	spin_unlock(&lruvec->lru_lock);
+}
+
+static inline void unlock_page_lruvec_irq(struct lruvec *lruvec)
+{
+	spin_unlock_irq(&lruvec->lru_lock);
+}
+
+static inline void unlock_page_lruvec_irqrestore(struct lruvec *lruvec,
+		unsigned long flags)
+{
+	spin_unlock_irqrestore(&lruvec->lru_lock, flags);
+}
+
 #ifdef CONFIG_CGROUP_WRITEBACK
 
 struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 14c668b7e793..30b961a9a749 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -249,6 +249,8 @@  enum lruvec_flags {
 };
 
 struct lruvec {
+	/* per lruvec lru_lock for memcg */
+	spinlock_t			lru_lock;
 	struct list_head		lists[NR_LRU_LISTS];
 	/*
 	 * These track the cost of reclaiming one LRU - file or anon -
diff --git a/mm/compaction.c b/mm/compaction.c
index 2da2933fe56b..88bbd2e93895 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -787,7 +787,7 @@  static bool too_many_isolated(pg_data_t *pgdat)
 	unsigned long nr_scanned = 0, nr_isolated = 0;
 	struct lruvec *lruvec;
 	unsigned long flags = 0;
-	bool locked = false;
+	struct lruvec *locked_lruvec = NULL;
 	struct page *page = NULL, *valid_page = NULL;
 	unsigned long start_pfn = low_pfn;
 	bool skip_on_failure = false;
@@ -847,11 +847,21 @@  static bool too_many_isolated(pg_data_t *pgdat)
 		 * contention, to give chance to IRQs. Abort completely if
 		 * a fatal signal is pending.
 		 */
-		if (!(low_pfn % SWAP_CLUSTER_MAX)
-		    && compact_unlock_should_abort(&pgdat->lru_lock,
-					    flags, &locked, cc)) {
-			low_pfn = 0;
-			goto fatal_pending;
+		if (!(low_pfn % SWAP_CLUSTER_MAX)) {
+			if (locked_lruvec) {
+				unlock_page_lruvec_irqrestore(locked_lruvec,
+									flags);
+				locked_lruvec = NULL;
+			}
+
+			if (fatal_signal_pending(current)) {
+				cc->contended = true;
+
+				low_pfn = 0;
+				goto fatal_pending;
+			}
+
+			cond_resched();
 		}
 
 		if (!pfn_valid_within(low_pfn))
@@ -922,10 +932,9 @@  static bool too_many_isolated(pg_data_t *pgdat)
 			 */
 			if (unlikely(__PageMovable(page)) &&
 					!PageIsolated(page)) {
-				if (locked) {
-					spin_unlock_irqrestore(&pgdat->lru_lock,
-									flags);
-					locked = false;
+				if (locked_lruvec) {
+					unlock_page_lruvec_irqrestore(locked_lruvec, flags);
+					locked_lruvec = NULL;
 				}
 
 				if (!isolate_movable_page(page, isolate_mode))
@@ -966,10 +975,20 @@  static bool too_many_isolated(pg_data_t *pgdat)
 		if (!TestClearPageLRU(page))
 			goto isolate_fail_put;
 
+		rcu_read_lock();
+		lruvec = mem_cgroup_page_lruvec(page, pgdat);
+
 		/* If we already hold the lock, we can skip some rechecking */
-		if (!locked) {
-			locked = compact_lock_irqsave(&pgdat->lru_lock,
-								&flags, cc);
+		if (lruvec != locked_lruvec) {
+			if (locked_lruvec)
+				unlock_page_lruvec_irqrestore(locked_lruvec,
+									flags);
+
+			compact_lock_irqsave(&lruvec->lru_lock, &flags, cc);
+			locked_lruvec = lruvec;
+			rcu_read_unlock();
+
+			lruvec_memcg_debug(lruvec, page);
 
 			/* Try get exclusive access under lock */
 			if (!skip_updated) {
@@ -988,9 +1007,8 @@  static bool too_many_isolated(pg_data_t *pgdat)
 				SetPageLRU(page);
 				goto isolate_fail_put;
 			}
-		}
-
-		lruvec = mem_cgroup_page_lruvec(page, pgdat);
+		} else
+			rcu_read_unlock();
 
 		/* The whole page is taken off the LRU; skip the tail pages. */
 		if (PageCompound(page))
@@ -1023,9 +1041,9 @@  static bool too_many_isolated(pg_data_t *pgdat)
 
 isolate_fail_put:
 		/* Avoid potential deadlock in freeing page under lru_lock */
-		if (locked) {
-			spin_unlock_irqrestore(&pgdat->lru_lock, flags);
-			locked = false;
+		if (locked_lruvec) {
+			unlock_page_lruvec_irqrestore(locked_lruvec, flags);
+			locked_lruvec = NULL;
 		}
 		put_page(page);
 
@@ -1039,9 +1057,10 @@  static bool too_many_isolated(pg_data_t *pgdat)
 		 * page anyway.
 		 */
 		if (nr_isolated) {
-			if (locked) {
-				spin_unlock_irqrestore(&pgdat->lru_lock, flags);
-				locked = false;
+			if (locked_lruvec) {
+				unlock_page_lruvec_irqrestore(locked_lruvec,
+									flags);
+				locked_lruvec = NULL;
 			}
 			putback_movable_pages(&cc->migratepages);
 			cc->nr_migratepages = 0;
@@ -1068,8 +1087,8 @@  static bool too_many_isolated(pg_data_t *pgdat)
 	page = NULL;
 
 isolate_abort:
-	if (locked)
-		spin_unlock_irqrestore(&pgdat->lru_lock, flags);
+	if (locked_lruvec)
+		unlock_page_lruvec_irqrestore(locked_lruvec, flags);
 	if (page) {
 		SetPageLRU(page);
 		put_page(page);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 28538444197b..a0cb95891ae5 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2346,7 +2346,7 @@  static void lru_add_page_tail(struct page *head, struct page *page_tail,
 	VM_BUG_ON_PAGE(!PageHead(head), head);
 	VM_BUG_ON_PAGE(PageCompound(page_tail), head);
 	VM_BUG_ON_PAGE(PageLRU(page_tail), head);
-	lockdep_assert_held(&lruvec_pgdat(lruvec)->lru_lock);
+	lockdep_assert_held(&lruvec->lru_lock);
 
 	if (list) {
 		/* page reclaim is reclaiming a huge page */
@@ -2430,7 +2430,6 @@  static void __split_huge_page(struct page *page, struct list_head *list,
 			      pgoff_t end)
 {
 	struct page *head = compound_head(page);
-	pg_data_t *pgdat = page_pgdat(head);
 	struct lruvec *lruvec;
 	struct address_space *swap_cache = NULL;
 	unsigned long offset = 0;
@@ -2447,10 +2446,8 @@  static void __split_huge_page(struct page *page, struct list_head *list,
 		xa_lock(&swap_cache->i_pages);
 	}
 
-	/* prevent PageLRU to go away from under us, and freeze lru stats */
-	spin_lock(&pgdat->lru_lock);
-
-	lruvec = mem_cgroup_page_lruvec(head, pgdat);
+	/* lock lru list/PageCompound, ref frozen by page_ref_freeze */
+	lruvec = lock_page_lruvec(head);
 
 	for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
 		__split_huge_page_tail(head, i, lruvec, list);
@@ -2471,7 +2468,7 @@  static void __split_huge_page(struct page *page, struct list_head *list,
 	}
 
 	ClearPageCompound(head);
-	spin_unlock(&pgdat->lru_lock);
+	unlock_page_lruvec(lruvec);
 	/* Caller disabled irqs, so they are still disabled here */
 
 	split_page_owner(head, HPAGE_PMD_ORDER);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 20c8ed69a930..d6746656cc39 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1196,6 +1196,19 @@  int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
 	return ret;
 }
 
+#ifdef CONFIG_DEBUG_VM
+void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
+{
+	if (mem_cgroup_disabled())
+		return;
+
+	if (!page->mem_cgroup)
+		VM_BUG_ON_PAGE(lruvec_memcg(lruvec) != root_mem_cgroup, page);
+	else
+		VM_BUG_ON_PAGE(lruvec_memcg(lruvec) != page->mem_cgroup, page);
+}
+#endif
+
 /**
  * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
  * @page: the page
@@ -1215,7 +1228,8 @@  struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgd
 		goto out;
 	}
 
-	memcg = page->mem_cgroup;
+	VM_BUG_ON_PAGE(PageTail(page), page);
+	memcg = READ_ONCE(page->mem_cgroup);
 	/*
 	 * Swapcache readahead pages are added to the LRU - and
 	 * possibly migrated - before they are charged.
@@ -1236,6 +1250,51 @@  struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgd
 	return lruvec;
 }
 
+struct lruvec *lock_page_lruvec(struct page *page)
+{
+	struct lruvec *lruvec;
+	struct pglist_data *pgdat = page_pgdat(page);
+
+	rcu_read_lock();
+	lruvec = mem_cgroup_page_lruvec(page, pgdat);
+	spin_lock(&lruvec->lru_lock);
+	rcu_read_unlock();
+
+	lruvec_memcg_debug(lruvec, page);
+
+	return lruvec;
+}
+
+struct lruvec *lock_page_lruvec_irq(struct page *page)
+{
+	struct lruvec *lruvec;
+	struct pglist_data *pgdat = page_pgdat(page);
+
+	rcu_read_lock();
+	lruvec = mem_cgroup_page_lruvec(page, pgdat);
+	spin_lock_irq(&lruvec->lru_lock);
+	rcu_read_unlock();
+
+	lruvec_memcg_debug(lruvec, page);
+
+	return lruvec;
+}
+
+struct lruvec *lock_page_lruvec_irqsave(struct page *page, unsigned long *flags)
+{
+	struct lruvec *lruvec;
+	struct pglist_data *pgdat = page_pgdat(page);
+
+	rcu_read_lock();
+	lruvec = mem_cgroup_page_lruvec(page, pgdat);
+	spin_lock_irqsave(&lruvec->lru_lock, *flags);
+	rcu_read_unlock();
+
+	lruvec_memcg_debug(lruvec, page);
+
+	return lruvec;
+}
+
 /**
  * mem_cgroup_update_lru_size - account for adding or removing an lru page
  * @lruvec: mem_cgroup per zone lru vector
@@ -2999,7 +3058,7 @@  void __memcg_kmem_uncharge_page(struct page *page, int order)
 
 /*
  * Because tail pages are not marked as "used", set it. We're under
- * pgdat->lru_lock and migration entries setup in all page mappings.
+ * lruvec->lru_lock and migration entries setup in all page mappings.
  */
 void mem_cgroup_split_huge_fixup(struct page *head)
 {
diff --git a/mm/mlock.c b/mm/mlock.c
index 228ba5a8e0a5..5d40d259a931 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -106,12 +106,10 @@  void mlock_vma_page(struct page *page)
  * Isolate a page from LRU with optional get_page() pin.
  * Assumes lru_lock already held and page already pinned.
  */
-static bool __munlock_isolate_lru_page(struct page *page, bool getpage)
+static bool __munlock_isolate_lru_page(struct page *page,
+				struct lruvec *lruvec, bool getpage)
 {
 	if (TestClearPageLRU(page)) {
-		struct lruvec *lruvec;
-
-		lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
 		if (getpage)
 			get_page(page);
 		del_page_from_lru_list(page, lruvec, page_lru(page));
@@ -181,7 +179,7 @@  static void __munlock_isolation_failed(struct page *page)
 unsigned int munlock_vma_page(struct page *page)
 {
 	int nr_pages;
-	pg_data_t *pgdat = page_pgdat(page);
+	struct lruvec *lruvec;
 
 	/* For try_to_munlock() and to serialize with page migration */
 	BUG_ON(!PageLocked(page));
@@ -189,11 +187,16 @@  unsigned int munlock_vma_page(struct page *page)
 	VM_BUG_ON_PAGE(PageTail(page), page);
 
 	/*
-	 * Serialize with any parallel __split_huge_page_refcount() which
+	 * Serialize split tail pages in __split_huge_page_tail() which
 	 * might otherwise copy PageMlocked to part of the tail pages before
 	 * we clear it in the head page. It also stabilizes hpage_nr_pages().
+	 * TestClearPageLRU can't be used here to block page isolation, since
+	 * a clear_page_mlock done without the lock may interfere with the
+	 * PageLRU/PageMlocked sequence, same as __pagevec_lru_add_fn, and
+	 * place the page on the wrong lru list here. So rely on PageLocked
+	 * to stop lruvec changes in mem_cgroup_move_account().
 	 */
-	spin_lock_irq(&pgdat->lru_lock);
+	lruvec = lock_page_lruvec_irq(page);
 
 	if (!TestClearPageMlocked(page)) {
 		/* Potentially, PTE-mapped THP: do not skip the rest PTEs */
@@ -204,15 +207,15 @@  unsigned int munlock_vma_page(struct page *page)
 	nr_pages = hpage_nr_pages(page);
 	__mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
 
-	if (__munlock_isolate_lru_page(page, true)) {
-		spin_unlock_irq(&pgdat->lru_lock);
+	if (__munlock_isolate_lru_page(page, lruvec, true)) {
+		unlock_page_lruvec_irq(lruvec);
 		__munlock_isolated_page(page);
 		goto out;
 	}
 	__munlock_isolation_failed(page);
 
 unlock_out:
-	spin_unlock_irq(&pgdat->lru_lock);
+	unlock_page_lruvec_irq(lruvec);
 
 out:
 	return nr_pages - 1;
@@ -292,23 +295,34 @@  static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
 	int nr = pagevec_count(pvec);
 	int delta_munlocked = -nr;
 	struct pagevec pvec_putback;
+	struct lruvec *lruvec = NULL;
 	int pgrescued = 0;
 
 	pagevec_init(&pvec_putback);
 
 	/* Phase 1: page isolation */
-	spin_lock_irq(&zone->zone_pgdat->lru_lock);
 	for (i = 0; i < nr; i++) {
 		struct page *page = pvec->pages[i];
+		struct lruvec *new_lruvec;
+
+		/* block memcg change in mem_cgroup_move_account */
+		lock_page_memcg(page);
+		new_lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
+		if (new_lruvec != lruvec) {
+			if (lruvec)
+				unlock_page_lruvec_irq(lruvec);
+			lruvec = lock_page_lruvec_irq(page);
+		}
 
 		if (TestClearPageMlocked(page)) {
 			/*
 			 * We already have pin from follow_page_mask()
 			 * so we can spare the get_page() here.
 			 */
-			if (__munlock_isolate_lru_page(page, false))
+			if (__munlock_isolate_lru_page(page, lruvec, false)) {
+				unlock_page_memcg(page);
 				continue;
-			else
+			} else
 				__munlock_isolation_failed(page);
 		} else {
 			delta_munlocked++;
@@ -320,11 +334,14 @@  static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
 		 * pin. We cannot do it under lru_lock however. If it's
 		 * the last pin, __page_cache_release() would deadlock.
 		 */
+		unlock_page_memcg(page);
 		pagevec_add(&pvec_putback, pvec->pages[i]);
 		pvec->pages[i] = NULL;
 	}
-	__mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
-	spin_unlock_irq(&zone->zone_pgdat->lru_lock);
+	if (lruvec) {
+		__mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
+		unlock_page_lruvec_irq(lruvec);
+	}
 
 	/* Now we can release pins of pages that we are not munlocking */
 	pagevec_release(&pvec_putback);
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 4686fdc23bb9..3750a90ed4a0 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -91,6 +91,7 @@  void lruvec_init(struct lruvec *lruvec)
 	enum lru_list lru;
 
 	memset(lruvec, 0, sizeof(struct lruvec));
+	spin_lock_init(&lruvec->lru_lock);
 
 	for_each_lru(lru)
 		INIT_LIST_HEAD(&lruvec->lists[lru]);
diff --git a/mm/swap.c b/mm/swap.c
index 3029b3f74811..09edac441eb6 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -79,15 +79,13 @@  static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = {
 static void __page_cache_release(struct page *page)
 {
 	if (PageLRU(page)) {
-		pg_data_t *pgdat = page_pgdat(page);
 		struct lruvec *lruvec;
 		unsigned long flags;
 
 		__ClearPageLRU(page);
-		spin_lock_irqsave(&pgdat->lru_lock, flags);
-		lruvec = mem_cgroup_page_lruvec(page, pgdat);
+		lruvec = lock_page_lruvec_irqsave(page, &flags);
 		del_page_from_lru_list(page, lruvec, page_off_lru(page));
-		spin_unlock_irqrestore(&pgdat->lru_lock, flags);
+		unlock_page_lruvec_irqrestore(lruvec, flags);
 	}
 	__ClearPageWaiters(page);
 }
@@ -206,32 +204,30 @@  static void pagevec_lru_move_fn(struct pagevec *pvec,
 	void (*move_fn)(struct page *page, struct lruvec *lruvec))
 {
 	int i;
-	struct pglist_data *pgdat = NULL;
-	struct lruvec *lruvec;
+	struct lruvec *lruvec = NULL;
 	unsigned long flags = 0;
 
 	for (i = 0; i < pagevec_count(pvec); i++) {
 		struct page *page = pvec->pages[i];
-		struct pglist_data *pagepgdat = page_pgdat(page);
-
-		if (pagepgdat != pgdat) {
-			if (pgdat)
-				spin_unlock_irqrestore(&pgdat->lru_lock, flags);
-			pgdat = pagepgdat;
-			spin_lock_irqsave(&pgdat->lru_lock, flags);
-		}
+		struct lruvec *new_lruvec;
 
 		/* block memcg migration during page moving between lru */
 		if (!TestClearPageLRU(page))
 			continue;
 
-		lruvec = mem_cgroup_page_lruvec(page, pgdat);
+		new_lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
+		if (lruvec != new_lruvec) {
+			if (lruvec)
+				unlock_page_lruvec_irqrestore(lruvec, flags);
+			lruvec = lock_page_lruvec_irqsave(page, &flags);
+		}
+
 		(*move_fn)(page, lruvec);
 
 		SetPageLRU(page);
 	}
-	if (pgdat)
-		spin_unlock_irqrestore(&pgdat->lru_lock, flags);
+	if (lruvec)
+		unlock_page_lruvec_irqrestore(lruvec, flags);
 	release_pages(pvec->pages, pvec->nr);
 	pagevec_reinit(pvec);
 }
@@ -274,9 +270,8 @@  void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
 {
 	do {
 		unsigned long lrusize;
-		struct pglist_data *pgdat = lruvec_pgdat(lruvec);
 
-		spin_lock_irq(&pgdat->lru_lock);
+		spin_lock_irq(&lruvec->lru_lock);
 		/* Record cost event */
 		if (file)
 			lruvec->file_cost += nr_pages;
@@ -300,7 +295,7 @@  void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
 			lruvec->file_cost /= 2;
 			lruvec->anon_cost /= 2;
 		}
-		spin_unlock_irq(&pgdat->lru_lock);
+		spin_unlock_irq(&lruvec->lru_lock);
 	} while ((lruvec = parent_lruvec(lruvec)));
 }
 
@@ -365,12 +360,13 @@  static inline void activate_page_drain(int cpu)
 void activate_page(struct page *page)
 {
 	pg_data_t *pgdat = page_pgdat(page);
+	struct lruvec *lruvec;
 
 	page = compound_head(page);
-	spin_lock_irq(&pgdat->lru_lock);
+	lruvec = lock_page_lruvec_irq(page);
 	if (PageLRU(page))
-		__activate_page(page, mem_cgroup_page_lruvec(page, pgdat));
-	spin_unlock_irq(&pgdat->lru_lock);
+		__activate_page(page, lruvec);
+	unlock_page_lruvec_irq(lruvec);
 }
 #endif
 
@@ -817,8 +813,7 @@  void release_pages(struct page **pages, int nr)
 {
 	int i;
 	LIST_HEAD(pages_to_free);
-	struct pglist_data *locked_pgdat = NULL;
-	struct lruvec *lruvec;
+	struct lruvec *lruvec = NULL;
 	unsigned long uninitialized_var(flags);
 	unsigned int uninitialized_var(lock_batch);
 
@@ -828,21 +823,20 @@  void release_pages(struct page **pages, int nr)
 		/*
 		 * Make sure the IRQ-safe lock-holding time does not get
 		 * excessive with a continuous string of pages from the
-		 * same pgdat. The lock is held only if pgdat != NULL.
+		 * same lruvec. The lock is held only if lruvec != NULL.
 		 */
-		if (locked_pgdat && ++lock_batch == SWAP_CLUSTER_MAX) {
-			spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
-			locked_pgdat = NULL;
+		if (lruvec && ++lock_batch == SWAP_CLUSTER_MAX) {
+			unlock_page_lruvec_irqrestore(lruvec, flags);
+			lruvec = NULL;
 		}
 
 		if (is_huge_zero_page(page))
 			continue;
 
 		if (is_zone_device_page(page)) {
-			if (locked_pgdat) {
-				spin_unlock_irqrestore(&locked_pgdat->lru_lock,
-						       flags);
-				locked_pgdat = NULL;
+			if (lruvec) {
+				unlock_page_lruvec_irqrestore(lruvec, flags);
+				lruvec = NULL;
 			}
 			/*
 			 * ZONE_DEVICE pages that return 'false' from
@@ -861,28 +855,28 @@  void release_pages(struct page **pages, int nr)
 			continue;
 
 		if (PageCompound(page)) {
-			if (locked_pgdat) {
-				spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
-				locked_pgdat = NULL;
+			if (lruvec) {
+				unlock_page_lruvec_irqrestore(lruvec, flags);
+				lruvec = NULL;
 			}
 			__put_compound_page(page);
 			continue;
 		}
 
 		if (PageLRU(page)) {
-			struct pglist_data *pgdat = page_pgdat(page);
+			struct lruvec *new_lruvec;
 
-			if (pgdat != locked_pgdat) {
-				if (locked_pgdat)
-					spin_unlock_irqrestore(&locked_pgdat->lru_lock,
+			new_lruvec = mem_cgroup_page_lruvec(page,
+							page_pgdat(page));
+			if (new_lruvec != lruvec) {
+				if (lruvec)
+					unlock_page_lruvec_irqrestore(lruvec,
 									flags);
 				lock_batch = 0;
-				locked_pgdat = pgdat;
-				spin_lock_irqsave(&locked_pgdat->lru_lock, flags);
+				lruvec = lock_page_lruvec_irqsave(page, &flags);
 			}
 
 			__ClearPageLRU(page);
-			lruvec = mem_cgroup_page_lruvec(page, locked_pgdat);
 			del_page_from_lru_list(page, lruvec, page_off_lru(page));
 		}
 
@@ -892,8 +886,8 @@  void release_pages(struct page **pages, int nr)
 
 		list_add(&page->lru, &pages_to_free);
 	}
-	if (locked_pgdat)
-		spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
+	if (lruvec)
+		unlock_page_lruvec_irqrestore(lruvec, flags);
 
 	mem_cgroup_uncharge_list(&pages_to_free);
 	free_unref_page_list(&pages_to_free);
@@ -981,26 +975,24 @@  static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec)
 void __pagevec_lru_add(struct pagevec *pvec)
 {
 	int i;
-	struct pglist_data *pgdat = NULL;
-	struct lruvec *lruvec;
+	struct lruvec *lruvec = NULL;
 	unsigned long flags = 0;
 
 	for (i = 0; i < pagevec_count(pvec); i++) {
 		struct page *page = pvec->pages[i];
-		struct pglist_data *pagepgdat = page_pgdat(page);
+		struct lruvec *new_lruvec;
 
-		if (pagepgdat != pgdat) {
-			if (pgdat)
-				spin_unlock_irqrestore(&pgdat->lru_lock, flags);
-			pgdat = pagepgdat;
-			spin_lock_irqsave(&pgdat->lru_lock, flags);
+		new_lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
+		if (lruvec != new_lruvec) {
+			if (lruvec)
+				unlock_page_lruvec_irqrestore(lruvec, flags);
+			lruvec = lock_page_lruvec_irqsave(page, &flags);
 		}
 
-		lruvec = mem_cgroup_page_lruvec(page, pgdat);
 		__pagevec_lru_add_fn(page, lruvec);
 	}
-	if (pgdat)
-		spin_unlock_irqrestore(&pgdat->lru_lock, flags);
+	if (lruvec)
+		unlock_page_lruvec_irqrestore(lruvec, flags);
 	release_pages(pvec->pages, pvec->nr);
 	pagevec_reinit(pvec);
 }
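
[Annotation, not part of the patch] The same batching pattern recurs in pagevec_lru_move_fn(), release_pages() and __pagevec_lru_add() above, and again in check_move_unevictable_pages() below: hold one lruvec->lru_lock across consecutive pages and only cycle the lock when the owning lruvec changes (different memcg or different node). Distilled into a standalone sketch, with the loop body standing in for whatever each caller does per page:

	static void per_lruvec_batch_sketch(struct pagevec *pvec)
	{
		struct lruvec *lruvec = NULL;
		unsigned long flags = 0;
		int i;

		for (i = 0; i < pagevec_count(pvec); i++) {
			struct page *page = pvec->pages[i];
			struct lruvec *new_lruvec;

			/* which lruvec owns this page? */
			new_lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
			if (new_lruvec != lruvec) {
				/* crossed a memcg/node boundary: switch locks */
				if (lruvec)
					unlock_page_lruvec_irqrestore(lruvec, flags);
				lruvec = lock_page_lruvec_irqsave(page, &flags);
			}

			/* ... operate on @page under lruvec->lru_lock ... */
		}
		if (lruvec)
			unlock_page_lruvec_irqrestore(lruvec, flags);
	}
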
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f77748adc340..168c1659e430 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1774,15 +1774,13 @@  int isolate_lru_page(struct page *page)
 	WARN_RATELIMIT(PageTail(page), "trying to isolate tail page");
 
 	if (TestClearPageLRU(page)) {
-		pg_data_t *pgdat = page_pgdat(page);
 		struct lruvec *lruvec;
 		int lru = page_lru(page);
 
 		get_page(page);
-		lruvec = mem_cgroup_page_lruvec(page, pgdat);
-		spin_lock_irq(&pgdat->lru_lock);
+		lruvec = lock_page_lruvec_irq(page);
 		del_page_from_lru_list(page, lruvec, lru);
-		spin_unlock_irq(&pgdat->lru_lock);
+		unlock_page_lruvec_irq(lruvec);
 		ret = 0;
 	}
 
@@ -1849,20 +1847,22 @@  static int too_many_isolated(struct pglist_data *pgdat, int file,
 static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
 						     struct list_head *list)
 {
-	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
 	int nr_pages, nr_moved = 0;
 	LIST_HEAD(pages_to_free);
 	struct page *page;
+	struct lruvec *orig_lruvec = lruvec;
 	enum lru_list lru;
 
 	while (!list_empty(list)) {
+		struct lruvec *new_lruvec = NULL;
+
 		page = lru_to_page(list);
 		VM_BUG_ON_PAGE(PageLRU(page), page);
 		list_del(&page->lru);
 		if (unlikely(!page_evictable(page))) {
-			spin_unlock_irq(&pgdat->lru_lock);
+			spin_unlock_irq(&lruvec->lru_lock);
 			putback_lru_page(page);
-			spin_lock_irq(&pgdat->lru_lock);
+			spin_lock_irq(&lruvec->lru_lock);
 			continue;
 		}
 
@@ -1876,6 +1876,12 @@  static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
 		 *                                        list_add(&page->lru,)
 		 *     list_add(&page->lru,) //corrupt
 		 */
+		new_lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
+		if (new_lruvec != lruvec) {
+			if (lruvec)
+				spin_unlock_irq(&lruvec->lru_lock);
+			lruvec = lock_page_lruvec_irq(page);
+		}
 		SetPageLRU(page);
 
 		if (unlikely(put_page_testzero(page))) {
@@ -1883,16 +1889,15 @@  static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
 			__ClearPageActive(page);
 
 			if (unlikely(PageCompound(page))) {
-				spin_unlock_irq(&pgdat->lru_lock);
+				spin_unlock_irq(&lruvec->lru_lock);
 				destroy_compound_page(page);
-				spin_lock_irq(&pgdat->lru_lock);
+				spin_lock_irq(&lruvec->lru_lock);
 			} else
 				list_add(&page->lru, &pages_to_free);
 
 			continue;
 		}
 
-		lruvec = mem_cgroup_page_lruvec(page, pgdat);
 		lru = page_lru(page);
 		nr_pages = hpage_nr_pages(page);
 
@@ -1902,6 +1907,11 @@  static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
 		if (PageActive(page))
 			workingset_age_nonresident(lruvec, nr_pages);
 	}
+	if (orig_lruvec != lruvec) {
+		if (lruvec)
+			spin_unlock_irq(&lruvec->lru_lock);
+		spin_lock_irq(&orig_lruvec->lru_lock);
+	}
 
 	/*
 	 * To save our caller's stack, now use input list for pages to free.
@@ -1957,7 +1967,7 @@  static int current_may_throttle(void)
 
 	lru_add_drain();
 
-	spin_lock_irq(&pgdat->lru_lock);
+	spin_lock_irq(&lruvec->lru_lock);
 
 	nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
 				     &nr_scanned, sc, lru);
@@ -1969,7 +1979,7 @@  static int current_may_throttle(void)
 	__count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
 	__count_vm_events(PGSCAN_ANON + file, nr_scanned);
 
-	spin_unlock_irq(&pgdat->lru_lock);
+	spin_unlock_irq(&lruvec->lru_lock);
 
 	if (nr_taken == 0)
 		return 0;
@@ -1977,7 +1987,7 @@  static int current_may_throttle(void)
 	nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, 0,
 				&stat, false);
 
-	spin_lock_irq(&pgdat->lru_lock);
+	spin_lock_irq(&lruvec->lru_lock);
 	move_pages_to_lru(lruvec, &page_list);
 
 	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
@@ -1986,7 +1996,7 @@  static int current_may_throttle(void)
 		__count_vm_events(item, nr_reclaimed);
 	__count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
 	__count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);
-	spin_unlock_irq(&pgdat->lru_lock);
+	spin_unlock_irq(&lruvec->lru_lock);
 
 	lru_note_cost(lruvec, file, stat.nr_pageout);
 	mem_cgroup_uncharge_list(&page_list);
@@ -2039,7 +2049,7 @@  static void shrink_active_list(unsigned long nr_to_scan,
 
 	lru_add_drain();
 
-	spin_lock_irq(&pgdat->lru_lock);
+	spin_lock_irq(&lruvec->lru_lock);
 
 	nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
 				     &nr_scanned, sc, lru);
@@ -2049,7 +2059,7 @@  static void shrink_active_list(unsigned long nr_to_scan,
 	__count_vm_events(PGREFILL, nr_scanned);
 	__count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
 
-	spin_unlock_irq(&pgdat->lru_lock);
+	spin_unlock_irq(&lruvec->lru_lock);
 
 	while (!list_empty(&l_hold)) {
 		cond_resched();
@@ -2095,7 +2105,7 @@  static void shrink_active_list(unsigned long nr_to_scan,
 	/*
 	 * Move pages back to the lru list.
 	 */
-	spin_lock_irq(&pgdat->lru_lock);
+	spin_lock_irq(&lruvec->lru_lock);
 
 	nr_activate = move_pages_to_lru(lruvec, &l_active);
 	nr_deactivate = move_pages_to_lru(lruvec, &l_inactive);
@@ -2106,7 +2116,7 @@  static void shrink_active_list(unsigned long nr_to_scan,
 	__count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
 
 	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
-	spin_unlock_irq(&pgdat->lru_lock);
+	spin_unlock_irq(&lruvec->lru_lock);
 
 	mem_cgroup_uncharge_list(&l_active);
 	free_unref_page_list(&l_active);
@@ -2696,10 +2706,10 @@  static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 	/*
 	 * Determine the scan balance between anon and file LRUs.
 	 */
-	spin_lock_irq(&pgdat->lru_lock);
+	spin_lock_irq(&target_lruvec->lru_lock);
 	sc->anon_cost = target_lruvec->anon_cost;
 	sc->file_cost = target_lruvec->file_cost;
-	spin_unlock_irq(&pgdat->lru_lock);
+	spin_unlock_irq(&target_lruvec->lru_lock);
 
 	/*
 	 * Target desirable inactive:active list ratios for the anon
@@ -4275,24 +4285,22 @@  int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
  */
 void check_move_unevictable_pages(struct pagevec *pvec)
 {
-	struct lruvec *lruvec;
-	struct pglist_data *pgdat = NULL;
+	struct lruvec *lruvec = NULL;
 	int pgscanned = 0;
 	int pgrescued = 0;
 	int i;
 
 	for (i = 0; i < pvec->nr; i++) {
 		struct page *page = pvec->pages[i];
-		struct pglist_data *pagepgdat = page_pgdat(page);
+		struct lruvec *new_lruvec;
 
 		pgscanned++;
-		if (pagepgdat != pgdat) {
-			if (pgdat)
-				spin_unlock_irq(&pgdat->lru_lock);
-			pgdat = pagepgdat;
-			spin_lock_irq(&pgdat->lru_lock);
+		new_lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
+		if (lruvec != new_lruvec) {
+			if (lruvec)
+				unlock_page_lruvec_irq(lruvec);
+			lruvec = lock_page_lruvec_irq(page);
 		}
-		lruvec = mem_cgroup_page_lruvec(page, pgdat);
 
 		if (!PageLRU(page) || !PageUnevictable(page))
 			continue;
@@ -4308,10 +4316,10 @@  void check_move_unevictable_pages(struct pagevec *pvec)
 		}
 	}
 
-	if (pgdat) {
+	if (lruvec) {
 		__count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
 		__count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
-		spin_unlock_irq(&pgdat->lru_lock);
+		unlock_page_lruvec_irq(lruvec);
 	}
 }
 EXPORT_SYMBOL_GPL(check_move_unevictable_pages);
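
[Annotation, not part of the patch] All of the hunks above lean on the new lock_page_lruvec_irq()/lock_page_lruvec_irqsave() helpers and their unlock counterparts, which this patch defines in include/linux/memcontrol.h, outside the part of the diff quoted here. Judging only from the call sites, the non-irqsave pair presumably reduces to something like the following; take it as a sketch of the contract, not the actual implementation:

	static inline struct lruvec *lock_page_lruvec_irq(struct page *page)
	{
		struct lruvec *lruvec;

		/* look up the lruvec the page currently belongs to */
		lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
		spin_lock_irq(&lruvec->lru_lock);
		return lruvec;
	}

	static inline void unlock_page_lruvec_irq(struct lruvec *lruvec)
	{
		spin_unlock_irq(&lruvec->lru_lock);
	}

	/* the _irqsave/_irqrestore pair follows the same shape, threading flags */
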