diff mbox series

[v14,07/20] mm/thp: narrow lru locking

Message ID 1593752873-4493-8-git-send-email-alex.shi@linux.alibaba.com (mailing list archive)
State New, archived
Headers show
Series per memcg lru lock | expand

Commit Message

Alex Shi July 3, 2020, 5:07 a.m. UTC
There is no reason for lru_lock and the page cache xa_lock to be taken in
the current sequence, and holding them together isn't necessary. Let's narrow
the lru locking, but leave local_irq_disable() in place to block interrupt
re-entry and to protect the statistics updates.

Hugh Dickins' point: split_huge_page_to_list() was already silly to be
using the _irqsave variant: it has just been taking sleeping locks, so it
would already be broken if entered with interrupts enabled.
So we can avoid passing the flags argument down to __split_huge_page().

Signed-off-by: Alex Shi <alex.shi@linux.alibaba.com>
Signed-off-by: Wei Yang <richard.weiyang@gmail.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: linux-mm@kvack.org
Cc: linux-kernel@vger.kernel.org
---
 mm/huge_memory.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

Comments

Alex Shi July 6, 2020, 9:15 a.m. UTC | #1
Hi Kirill & Johannes & Matthew,

Would you like to give some comments or share your concerns about this patchset,
especially the THP part?

Many Thanks
Alex

在 2020/7/3 下午1:07, Alex Shi 写道:
> lru_lock and page cache xa_lock have no reason with current sequence,
> put them together isn't necessary. let's narrow the lru locking, but
> left the local_irq_disable to block interrupt re-entry and statistic update.
> 
> Hugh Dickins point: split_huge_page_to_list() was already silly,to be
> using the _irqsave variant: it's just been taking sleeping locks, so
> would already be broken if entered with interrupts enabled.
> so we can save passing flags argument down to __split_huge_page().
> 
> Signed-off-by: Alex Shi <alex.shi@linux.alibaba.com>
> Signed-off-by: Wei Yang <richard.weiyang@gmail.com>
> Cc: Hugh Dickins <hughd@google.com>
> Cc: Kirill A. Shutemov <kirill@shutemov.name>
> Cc: Andrea Arcangeli <aarcange@redhat.com>
> Cc: Johannes Weiner <hannes@cmpxchg.org>
> Cc: Matthew Wilcox <willy@infradead.org>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: linux-mm@kvack.org
> Cc: linux-kernel@vger.kernel.org
> ---
>  mm/huge_memory.c | 24 ++++++++++++------------
>  1 file changed, 12 insertions(+), 12 deletions(-)
> 
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index b18f21da4dac..607869330329 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -2433,7 +2433,7 @@ static void __split_huge_page_tail(struct page *head, int tail,
>  }
>  
>  static void __split_huge_page(struct page *page, struct list_head *list,
> -		pgoff_t end, unsigned long flags)
> +			      pgoff_t end)
>  {
>  	struct page *head = compound_head(page);
>  	pg_data_t *pgdat = page_pgdat(head);
> @@ -2442,8 +2442,6 @@ static void __split_huge_page(struct page *page, struct list_head *list,
>  	unsigned long offset = 0;
>  	int i;
>  
> -	lruvec = mem_cgroup_page_lruvec(head, pgdat);
> -
>  	/* complete memcg works before add pages to LRU */
>  	mem_cgroup_split_huge_fixup(head);
>  
> @@ -2455,6 +2453,11 @@ static void __split_huge_page(struct page *page, struct list_head *list,
>  		xa_lock(&swap_cache->i_pages);
>  	}
>  
> +	/* lock lru list/PageCompound, ref freezed by page_ref_freeze */
> +	spin_lock(&pgdat->lru_lock);
> +
> +	lruvec = mem_cgroup_page_lruvec(head, pgdat);
> +
>  	for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
>  		__split_huge_page_tail(head, i, lruvec, list);
>  		/* Some pages can be beyond i_size: drop them from page cache */
> @@ -2474,6 +2477,8 @@ static void __split_huge_page(struct page *page, struct list_head *list,
>  	}
>  
>  	ClearPageCompound(head);
> +	spin_unlock(&pgdat->lru_lock);
> +	/* Caller disabled irqs, so they are still disabled here */
>  
>  	split_page_owner(head, HPAGE_PMD_ORDER);
>  
> @@ -2491,8 +2496,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
>  		page_ref_add(head, 2);
>  		xa_unlock(&head->mapping->i_pages);
>  	}
> -
> -	spin_unlock_irqrestore(&pgdat->lru_lock, flags);
> +	local_irq_enable();
>  
>  	remap_page(head);
>  
> @@ -2631,12 +2635,10 @@ bool can_split_huge_page(struct page *page, int *pextra_pins)
>  int split_huge_page_to_list(struct page *page, struct list_head *list)
>  {
>  	struct page *head = compound_head(page);
> -	struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
>  	struct deferred_split *ds_queue = get_deferred_split_queue(head);
>  	struct anon_vma *anon_vma = NULL;
>  	struct address_space *mapping = NULL;
>  	int count, mapcount, extra_pins, ret;
> -	unsigned long flags;
>  	pgoff_t end;
>  
>  	VM_BUG_ON_PAGE(is_huge_zero_page(head), head);
> @@ -2697,9 +2699,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
>  	unmap_page(head);
>  	VM_BUG_ON_PAGE(compound_mapcount(head), head);
>  
> -	/* prevent PageLRU to go away from under us, and freeze lru stats */
> -	spin_lock_irqsave(&pgdata->lru_lock, flags);
> -
> +	local_irq_disable();
>  	if (mapping) {
>  		XA_STATE(xas, &mapping->i_pages, page_index(head));
>  
> @@ -2729,7 +2729,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
>  				__dec_node_page_state(head, NR_FILE_THPS);
>  		}
>  
> -		__split_huge_page(page, list, end, flags);
> +		__split_huge_page(page, list, end);
>  		if (PageSwapCache(head)) {
>  			swp_entry_t entry = { .val = page_private(head) };
>  
> @@ -2748,7 +2748,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
>  		spin_unlock(&ds_queue->split_queue_lock);
>  fail:		if (mapping)
>  			xa_unlock(&mapping->i_pages);
> -		spin_unlock_irqrestore(&pgdata->lru_lock, flags);
> +		local_irq_enable();
>  		remap_page(head);
>  		ret = -EBUSY;
>  	}
>
Matthew Wilcox July 6, 2020, 11:35 a.m. UTC | #2
On Mon, Jul 06, 2020 at 05:15:09PM +0800, Alex Shi wrote:
> Hi Kirill & Johannes & Matthew,
> 
> Would you like to give some comments or share your concern of this patchset,
> specialy for THP part? 

I don't have the brain space to understand this patch set fully at
the moment.  I'll note that the realtime folks are doing their best to
stamp out users of local_irq_disable(), so they won't be pleased to see
you adding a new one.  Also, you removed the comment explaining why the
lock needed to be taken.

> Many Thanks
> Alex
> 
> 在 2020/7/3 下午1:07, Alex Shi 写道:
> > lru_lock and page cache xa_lock have no reason with current sequence,
> > put them together isn't necessary. let's narrow the lru locking, but
> > left the local_irq_disable to block interrupt re-entry and statistic update.
> > 
> > Hugh Dickins point: split_huge_page_to_list() was already silly,to be
> > using the _irqsave variant: it's just been taking sleeping locks, so
> > would already be broken if entered with interrupts enabled.
> > so we can save passing flags argument down to __split_huge_page().
> > 
> > Signed-off-by: Alex Shi <alex.shi@linux.alibaba.com>
> > Signed-off-by: Wei Yang <richard.weiyang@gmail.com>
> > Cc: Hugh Dickins <hughd@google.com>
> > Cc: Kirill A. Shutemov <kirill@shutemov.name>
> > Cc: Andrea Arcangeli <aarcange@redhat.com>
> > Cc: Johannes Weiner <hannes@cmpxchg.org>
> > Cc: Matthew Wilcox <willy@infradead.org>
> > Cc: Andrew Morton <akpm@linux-foundation.org>
> > Cc: linux-mm@kvack.org
> > Cc: linux-kernel@vger.kernel.org
> > ---
> >  mm/huge_memory.c | 24 ++++++++++++------------
> >  1 file changed, 12 insertions(+), 12 deletions(-)
> > 
> > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > index b18f21da4dac..607869330329 100644
> > --- a/mm/huge_memory.c
> > +++ b/mm/huge_memory.c
> > @@ -2433,7 +2433,7 @@ static void __split_huge_page_tail(struct page *head, int tail,
> >  }
> >  
> >  static void __split_huge_page(struct page *page, struct list_head *list,
> > -		pgoff_t end, unsigned long flags)
> > +			      pgoff_t end)
> >  {
> >  	struct page *head = compound_head(page);
> >  	pg_data_t *pgdat = page_pgdat(head);
> > @@ -2442,8 +2442,6 @@ static void __split_huge_page(struct page *page, struct list_head *list,
> >  	unsigned long offset = 0;
> >  	int i;
> >  
> > -	lruvec = mem_cgroup_page_lruvec(head, pgdat);
> > -
> >  	/* complete memcg works before add pages to LRU */
> >  	mem_cgroup_split_huge_fixup(head);
> >  
> > @@ -2455,6 +2453,11 @@ static void __split_huge_page(struct page *page, struct list_head *list,
> >  		xa_lock(&swap_cache->i_pages);
> >  	}
> >  
> > +	/* lock lru list/PageCompound, ref freezed by page_ref_freeze */
> > +	spin_lock(&pgdat->lru_lock);
> > +
> > +	lruvec = mem_cgroup_page_lruvec(head, pgdat);
> > +
> >  	for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
> >  		__split_huge_page_tail(head, i, lruvec, list);
> >  		/* Some pages can be beyond i_size: drop them from page cache */
> > @@ -2474,6 +2477,8 @@ static void __split_huge_page(struct page *page, struct list_head *list,
> >  	}
> >  
> >  	ClearPageCompound(head);
> > +	spin_unlock(&pgdat->lru_lock);
> > +	/* Caller disabled irqs, so they are still disabled here */
> >  
> >  	split_page_owner(head, HPAGE_PMD_ORDER);
> >  
> > @@ -2491,8 +2496,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
> >  		page_ref_add(head, 2);
> >  		xa_unlock(&head->mapping->i_pages);
> >  	}
> > -
> > -	spin_unlock_irqrestore(&pgdat->lru_lock, flags);
> > +	local_irq_enable();
> >  
> >  	remap_page(head);
> >  
> > @@ -2631,12 +2635,10 @@ bool can_split_huge_page(struct page *page, int *pextra_pins)
> >  int split_huge_page_to_list(struct page *page, struct list_head *list)
> >  {
> >  	struct page *head = compound_head(page);
> > -	struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
> >  	struct deferred_split *ds_queue = get_deferred_split_queue(head);
> >  	struct anon_vma *anon_vma = NULL;
> >  	struct address_space *mapping = NULL;
> >  	int count, mapcount, extra_pins, ret;
> > -	unsigned long flags;
> >  	pgoff_t end;
> >  
> >  	VM_BUG_ON_PAGE(is_huge_zero_page(head), head);
> > @@ -2697,9 +2699,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
> >  	unmap_page(head);
> >  	VM_BUG_ON_PAGE(compound_mapcount(head), head);
> >  
> > -	/* prevent PageLRU to go away from under us, and freeze lru stats */
> > -	spin_lock_irqsave(&pgdata->lru_lock, flags);
> > -
> > +	local_irq_disable();
> >  	if (mapping) {
> >  		XA_STATE(xas, &mapping->i_pages, page_index(head));
> >  
> > @@ -2729,7 +2729,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
> >  				__dec_node_page_state(head, NR_FILE_THPS);
> >  		}
> >  
> > -		__split_huge_page(page, list, end, flags);
> > +		__split_huge_page(page, list, end);
> >  		if (PageSwapCache(head)) {
> >  			swp_entry_t entry = { .val = page_private(head) };
> >  
> > @@ -2748,7 +2748,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
> >  		spin_unlock(&ds_queue->split_queue_lock);
> >  fail:		if (mapping)
> >  			xa_unlock(&mapping->i_pages);
> > -		spin_unlock_irqrestore(&pgdata->lru_lock, flags);
> > +		local_irq_enable();
> >  		remap_page(head);
> >  		ret = -EBUSY;
> >  	}
> >
Hugh Dickins July 7, 2020, 4:52 a.m. UTC | #3
On Mon, 6 Jul 2020, Matthew Wilcox wrote:
> On Mon, Jul 06, 2020 at 05:15:09PM +0800, Alex Shi wrote:
> > Hi Kirill & Johannes & Matthew,

Adding Kirill, who was in patch's Cc list but not mail's Cc list.

I asked Alex to direct this one particularly to Kirill and Johannes
and Matthew because (and I regret that the commit message still does
not make this at all clear) this patch changes the lock ordering:
which for years has been lru_lock outside memcg move_lock outside
i_pages lock, but here inverted to lru_lock inside i_pages lock.

I don't see a strong reason to have them one way round or the other,
and think Alex is right that they can safely be reversed here: but
he doesn't actually give any reason for doing so (if cleanup, then
I think the cleanup should have been taken further), and no reason
for doing so as part of this series.

I had more need to know which way round they should go, when adding
lru_lock into mem_cgroup_move_account (inside or outside move_lock?):
but Alex's use of TestClearPageLRU appears to have successfully
eliminated the need for that; so I only need to know for the final
Doc patch in the series (credited to my name), where mm/rmap.c
documents the lock ordering.

I'm okay with leaving this patch in the series (and the final patch
currently documents this new order); but wondered if someone else
(especially Kirill or Johannes or Matthew) sees a reason against it?

And I have to admit that, in researching this, I discovered that
actually we unconsciously departed from the supposed lock ordering
years ago: back in 3.18's 8186eb6a799e, Johannes did a cleanup which
moved a clear_page_mlock() call to inside memcg move_lock, and in
principle clear_page_mlock() can take lru_lock. But we have never
seen a lockdep complaint about this, so I suspect that the page is
(almost?) always already isolated from lru when that is called,
and the issue therefore hypothetical.

My vote, for dispatch of the series, is to leave this patch in;
but cannot object if consensus were that it should be taken out.

Hugh

> > 
> > Would you like to give some comments or share your concern of this patchset,
> > specialy for THP part? 
> 
> I don't have the brain space to understand this patch set fully at
> the moment.  I'll note that the realtime folks are doing their best to
> stamp out users of local_irq_disable(), so they won't be pleased to see
> you adding a new one.  Also, you removed the comment explaining why the
> lock needed to be taken.
> 
> > Many Thanks
> > Alex
> > 
> > 在 2020/7/3 下午1:07, Alex Shi 写道:
> > > lru_lock and page cache xa_lock have no reason with current sequence,
> > > put them together isn't necessary. let's narrow the lru locking, but
> > > left the local_irq_disable to block interrupt re-entry and statistic update.
> > > 
> > > Hugh Dickins point: split_huge_page_to_list() was already silly,to be
> > > using the _irqsave variant: it's just been taking sleeping locks, so
> > > would already be broken if entered with interrupts enabled.
> > > so we can save passing flags argument down to __split_huge_page().
> > > 
> > > Signed-off-by: Alex Shi <alex.shi@linux.alibaba.com>
> > > Signed-off-by: Wei Yang <richard.weiyang@gmail.com>
> > > Cc: Hugh Dickins <hughd@google.com>
> > > Cc: Kirill A. Shutemov <kirill@shutemov.name>
> > > Cc: Andrea Arcangeli <aarcange@redhat.com>
> > > Cc: Johannes Weiner <hannes@cmpxchg.org>
> > > Cc: Matthew Wilcox <willy@infradead.org>
> > > Cc: Andrew Morton <akpm@linux-foundation.org>
> > > Cc: linux-mm@kvack.org
> > > Cc: linux-kernel@vger.kernel.org
> > > ---
> > >  mm/huge_memory.c | 24 ++++++++++++------------
> > >  1 file changed, 12 insertions(+), 12 deletions(-)
> > > 
> > > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > > index b18f21da4dac..607869330329 100644
> > > --- a/mm/huge_memory.c
> > > +++ b/mm/huge_memory.c
> > > @@ -2433,7 +2433,7 @@ static void __split_huge_page_tail(struct page *head, int tail,
> > >  }
> > >  
> > >  static void __split_huge_page(struct page *page, struct list_head *list,
> > > -		pgoff_t end, unsigned long flags)
> > > +			      pgoff_t end)
> > >  {
> > >  	struct page *head = compound_head(page);
> > >  	pg_data_t *pgdat = page_pgdat(head);
> > > @@ -2442,8 +2442,6 @@ static void __split_huge_page(struct page *page, struct list_head *list,
> > >  	unsigned long offset = 0;
> > >  	int i;
> > >  
> > > -	lruvec = mem_cgroup_page_lruvec(head, pgdat);
> > > -
> > >  	/* complete memcg works before add pages to LRU */
> > >  	mem_cgroup_split_huge_fixup(head);
> > >  
> > > @@ -2455,6 +2453,11 @@ static void __split_huge_page(struct page *page, struct list_head *list,
> > >  		xa_lock(&swap_cache->i_pages);
> > >  	}
> > >  
> > > +	/* lock lru list/PageCompound, ref freezed by page_ref_freeze */
> > > +	spin_lock(&pgdat->lru_lock);
> > > +
> > > +	lruvec = mem_cgroup_page_lruvec(head, pgdat);
> > > +
> > >  	for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
> > >  		__split_huge_page_tail(head, i, lruvec, list);
> > >  		/* Some pages can be beyond i_size: drop them from page cache */
> > > @@ -2474,6 +2477,8 @@ static void __split_huge_page(struct page *page, struct list_head *list,
> > >  	}
> > >  
> > >  	ClearPageCompound(head);
> > > +	spin_unlock(&pgdat->lru_lock);
> > > +	/* Caller disabled irqs, so they are still disabled here */
> > >  
> > >  	split_page_owner(head, HPAGE_PMD_ORDER);
> > >  
> > > @@ -2491,8 +2496,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
> > >  		page_ref_add(head, 2);
> > >  		xa_unlock(&head->mapping->i_pages);
> > >  	}
> > > -
> > > -	spin_unlock_irqrestore(&pgdat->lru_lock, flags);
> > > +	local_irq_enable();
> > >  
> > >  	remap_page(head);
> > >  
> > > @@ -2631,12 +2635,10 @@ bool can_split_huge_page(struct page *page, int *pextra_pins)
> > >  int split_huge_page_to_list(struct page *page, struct list_head *list)
> > >  {
> > >  	struct page *head = compound_head(page);
> > > -	struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
> > >  	struct deferred_split *ds_queue = get_deferred_split_queue(head);
> > >  	struct anon_vma *anon_vma = NULL;
> > >  	struct address_space *mapping = NULL;
> > >  	int count, mapcount, extra_pins, ret;
> > > -	unsigned long flags;
> > >  	pgoff_t end;
> > >  
> > >  	VM_BUG_ON_PAGE(is_huge_zero_page(head), head);
> > > @@ -2697,9 +2699,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
> > >  	unmap_page(head);
> > >  	VM_BUG_ON_PAGE(compound_mapcount(head), head);
> > >  
> > > -	/* prevent PageLRU to go away from under us, and freeze lru stats */
> > > -	spin_lock_irqsave(&pgdata->lru_lock, flags);
> > > -
> > > +	local_irq_disable();
> > >  	if (mapping) {
> > >  		XA_STATE(xas, &mapping->i_pages, page_index(head));
> > >  
> > > @@ -2729,7 +2729,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
> > >  				__dec_node_page_state(head, NR_FILE_THPS);
> > >  		}
> > >  
> > > -		__split_huge_page(page, list, end, flags);
> > > +		__split_huge_page(page, list, end);
> > >  		if (PageSwapCache(head)) {
> > >  			swp_entry_t entry = { .val = page_private(head) };
> > >  
> > > @@ -2748,7 +2748,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
> > >  		spin_unlock(&ds_queue->split_queue_lock);
> > >  fail:		if (mapping)
> > >  			xa_unlock(&mapping->i_pages);
> > > -		spin_unlock_irqrestore(&pgdata->lru_lock, flags);
> > > +		local_irq_enable();
> > >  		remap_page(head);
> > >  		ret = -EBUSY;
> > >  	}
Alex Shi July 7, 2020, 10:51 a.m. UTC | #4
在 2020/7/6 下午7:35, Matthew Wilcox 写道:
>> Would you like to give some comments or share your concern of this patchset,
>> specialy for THP part? 
> I don't have the brain space to understand this patch set fully at
> the moment.  I'll note that the realtime folks are doing their best to
> stamp out users of local_irq_disable(), so they won't be pleased to see
> you adding a new one.  Also, you removed the comment explaining why the
> lock needed to be taken.
> 

Hi Matthew,

Thanks for response!

As to the local_irq_disable(), we could use local_irq_save(), but Hugh Dickins
suggested it's not necessary here. Also, there are still many local_irq_disable()
calls in the code. I hope one extra would not be a big problem.

Yes, it is a bit early to remove the lru_lock comment; that should be done in the
later TestClearPageLRU part. But since it will be changed soon, it isn't a critical
thing. Anyway, I can change it back in the next version.

Thanks
Alex
Alex Shi July 9, 2020, 2:02 p.m. UTC | #5
在 2020/7/7 下午12:52, Hugh Dickins 写道:
> My vote, for dispatch of the series, is to leave this patch in;
> but cannot object if consensus were that it should be taken out.

hi Johannes & Kirill,

What are your comments on this patch or the others? I am OK to withdraw
this patch if no one would like to give more comments on it.

Thanks
Alex
Kirill A. Shutemov July 9, 2020, 3:48 p.m. UTC | #6
On Mon, Jul 06, 2020 at 09:52:34PM -0700, Hugh Dickins wrote:
> On Mon, 6 Jul 2020, Matthew Wilcox wrote:
> > On Mon, Jul 06, 2020 at 05:15:09PM +0800, Alex Shi wrote:
> > > Hi Kirill & Johannes & Matthew,
> 
> Adding Kirill, who was in patch's Cc list but not mail's Cc list.
> 
> I asked Alex to direct this one particularly to Kirill and Johannes
> and Matthew because (and I regret that the commit message still does
> not make this at all clear) this patch changes the lock ordering:
> which for years has been lru_lock outside memcg move_lock outside
> i_pages lock, but here inverted to lru_lock inside i_pages lock.
> 
> I don't see a strong reason to have them one way round or the other,
> and think Alex is right that they can safely be reversed here: but
> he doesn't actually give any reason for doing so (if cleanup, then
> I think the cleanup should have been taken further), and no reason
> for doing so as part of this series.

I've looked around and changing order of lru_lock wrt. i_pages lock seems
safe. I don't have much experience with memcg move_lock.

Alex, if you are going ahead with the patch, please document the locking
order. We have some locking orders listed at the beginning of filemap.c
and rmap.c.

local_irq_disable() also deserves a comment.
Alex Shi July 10, 2020, 8:23 a.m. UTC | #7
在 2020/7/9 下午11:48, Kirill A. Shutemov 写道:
> On Mon, Jul 06, 2020 at 09:52:34PM -0700, Hugh Dickins wrote:
>> On Mon, 6 Jul 2020, Matthew Wilcox wrote:
>>> On Mon, Jul 06, 2020 at 05:15:09PM +0800, Alex Shi wrote:
>>>> Hi Kirill & Johannes & Matthew,
>>
>> Adding Kirill, who was in patch's Cc list but not mail's Cc list.
>>
>> I asked Alex to direct this one particularly to Kirill and Johannes
>> and Matthew because (and I regret that the commit message still does
>> not make this at all clear) this patch changes the lock ordering:
>> which for years has been lru_lock outside memcg move_lock outside
>> i_pages lock, but here inverted to lru_lock inside i_pages lock.
>>
>> I don't see a strong reason to have them one way round or the other,
>> and think Alex is right that they can safely be reversed here: but
>> he doesn't actually give any reason for doing so (if cleanup, then
>> I think the cleanup should have been taken further), and no reason
>> for doing so as part of this series.
> 
> I've looked around and changing order of lru_lock wrt. i_pages lock seems
> safe. I don't have much experience with memcg move_lock.

Hi Kirill,

Thanks for response!
mem_cgroup_move_account(page) cannot reach here, because two things block it:
1, isolate_lru_page() is called before it and takes the page off the lru; this
   competes with the page reclaim path (list non-null).

2, the try_lock_page in it guards against split_huge_page() (!list).

> 
> Alex, if you are going ahead with the patch, please document the locking
> order. We have some locking orders listed at the beginning of filemap.c
> and rmap.c.

Thanks for the reminder!
Hugh Dickins did this in the above 2 files at the end of the patchset. Any comments?

diff --git a/mm/filemap.c b/mm/filemap.c
index f0ae9a6308cb..1b42aaae4d3e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -101,8 +101,8 @@
  *    ->swap_lock		(try_to_unmap_one)
  *    ->private_lock		(try_to_unmap_one)
  *    ->i_pages lock		(try_to_unmap_one)
- *    ->pgdat->lru_lock		(follow_page->mark_page_accessed)
- *    ->pgdat->lru_lock		(check_pte_range->isolate_lru_page)
+ *    ->lruvec->lru_lock	(follow_page->mark_page_accessed)
+ *    ->lruvec->lru_lock	(check_pte_range->isolate_lru_page)
  *    ->private_lock		(page_remove_rmap->set_page_dirty)
  *    ->i_pages lock		(page_remove_rmap->set_page_dirty)
  *    bdi.wb->list_lock		(page_remove_rmap->set_page_dirty)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d5e56be42f21..926d7d95dc1d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3057,7 +3057,7 @@ void __memcg_kmem_uncharge_page(struct page *page, int order)
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 
 /*
- * Because tail pages are not marked as "used", set it. We're under
+ * Because tail pages are not marked as "used", set it. Don't need
  * lruvec->lru_lock and migration entries setup in all page mappings.
  */
 void mem_cgroup_split_huge_fixup(struct page *head)
diff --git a/mm/rmap.c b/mm/rmap.c
index 5fe2dedce1fc..7fbc382e6f9e 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -28,12 +28,12 @@
  *           hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
  *           anon_vma->rwsem
  *             mm->page_table_lock or pte_lock
- *               pgdat->lru_lock (in mark_page_accessed, isolate_lru_page)
  *               swap_lock (in swap_duplicate, swap_info_get)
  *                 mmlist_lock (in mmput, drain_mmlist and others)
  *                 mapping->private_lock (in __set_page_dirty_buffers)
- *                   mem_cgroup_{begin,end}_page_stat (memcg->move_lock)
+ *                   lock_page_memcg move_lock (in __set_page_dirty_buffers)
  *                     i_pages lock (widely used)
+ *                       lock_page_lruvec_irq lruvec->lru_lock
  *                 inode->i_lock (in set_page_dirty's __mark_inode_dirty)
  *                 bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
  *                   sb_lock (within inode_lock in fs/fs-writeback.c)

> 
> local_irq_disable() also deserves a comment.
> 

Yes, I will add a comment for this. Would you mind giving a Reviewed-by for this patch?

Thanks
Alex
Kirill A. Shutemov July 10, 2020, 11:28 a.m. UTC | #8
On Fri, Jul 10, 2020 at 04:23:35PM +0800, Alex Shi wrote:
> 
> 
> 在 2020/7/9 下午11:48, Kirill A. Shutemov 写道:
> > On Mon, Jul 06, 2020 at 09:52:34PM -0700, Hugh Dickins wrote:
> >> On Mon, 6 Jul 2020, Matthew Wilcox wrote:
> >>> On Mon, Jul 06, 2020 at 05:15:09PM +0800, Alex Shi wrote:
> >>>> Hi Kirill & Johannes & Matthew,
> >>
> >> Adding Kirill, who was in patch's Cc list but not mail's Cc list.
> >>
> >> I asked Alex to direct this one particularly to Kirill and Johannes
> >> and Matthew because (and I regret that the commit message still does
> >> not make this at all clear) this patch changes the lock ordering:
> >> which for years has been lru_lock outside memcg move_lock outside
> >> i_pages lock, but here inverted to lru_lock inside i_pages lock.
> >>
> >> I don't see a strong reason to have them one way round or the other,
> >> and think Alex is right that they can safely be reversed here: but
> >> he doesn't actually give any reason for doing so (if cleanup, then
> >> I think the cleanup should have been taken further), and no reason
> >> for doing so as part of this series.
> > 
> > I've looked around and changing order of lru_lock wrt. i_pages lock seems
> > safe. I don't have much experience with memcg move_lock.
> 
> Hi Kirill,
> 
> Thanks for response!
> mem_cgroup_move_account(page) could not reach here since 2 blocks,
> 1, isolate_lru_page() before it will take page from lru, this compete for
>    page reclaim path, list non-null.
> 
> 2, try_lock_page in it, will guard split_huge_page(), !list.
> 
> > 
> > Alex, if you are going ahead with the patch, please document the locking
> > order. We have some locking orders listed at the beginning of filemap.c
> > and rmap.c.
> 
> Thanks for reminder!
> Hugh Dickins did this in above 2 files at the end of patchset, any comments?
> 
> diff --git a/mm/filemap.c b/mm/filemap.c
> index f0ae9a6308cb..1b42aaae4d3e 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -101,8 +101,8 @@
>   *    ->swap_lock		(try_to_unmap_one)
>   *    ->private_lock		(try_to_unmap_one)
>   *    ->i_pages lock		(try_to_unmap_one)
> - *    ->pgdat->lru_lock		(follow_page->mark_page_accessed)
> - *    ->pgdat->lru_lock		(check_pte_range->isolate_lru_page)
> + *    ->lruvec->lru_lock	(follow_page->mark_page_accessed)
> + *    ->lruvec->lru_lock	(check_pte_range->isolate_lru_page)
>   *    ->private_lock		(page_remove_rmap->set_page_dirty)
>   *    ->i_pages lock		(page_remove_rmap->set_page_dirty)
>   *    bdi.wb->list_lock		(page_remove_rmap->set_page_dirty)
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index d5e56be42f21..926d7d95dc1d 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -3057,7 +3057,7 @@ void __memcg_kmem_uncharge_page(struct page *page, int order)
>  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>  
>  /*
> - * Because tail pages are not marked as "used", set it. We're under
> + * Because tail pages are not marked as "used", set it. Don't need
>   * lruvec->lru_lock and migration entries setup in all page mappings.
>   */
>  void mem_cgroup_split_huge_fixup(struct page *head)
> diff --git a/mm/rmap.c b/mm/rmap.c
> index 5fe2dedce1fc..7fbc382e6f9e 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -28,12 +28,12 @@
>   *           hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
>   *           anon_vma->rwsem
>   *             mm->page_table_lock or pte_lock
> - *               pgdat->lru_lock (in mark_page_accessed, isolate_lru_page)
>   *               swap_lock (in swap_duplicate, swap_info_get)
>   *                 mmlist_lock (in mmput, drain_mmlist and others)
>   *                 mapping->private_lock (in __set_page_dirty_buffers)
> - *                   mem_cgroup_{begin,end}_page_stat (memcg->move_lock)
> + *                   lock_page_memcg move_lock (in __set_page_dirty_buffers)
>   *                     i_pages lock (widely used)
> + *                       lock_page_lruvec_irq lruvec->lru_lock

I think it has to be
			    lruvec->lru_lock (in lock_page_lruvec_irq)

No?

>   *                 inode->i_lock (in set_page_dirty's __mark_inode_dirty)
>   *                 bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
>   *                   sb_lock (within inode_lock in fs/fs-writeback.c)
> 
> > 
> > local_irq_disable() also deserves a comment.
> > 
> 
> yes, I will add a comment for this. Do you mind give reviewed-by for this patch?

Reviewed-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Alex Shi July 10, 2020, 2:09 p.m. UTC | #9
在 2020/7/10 下午7:28, Kirill A. Shutemov 写道:
>>   *           hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
>>   *           anon_vma->rwsem
>>   *             mm->page_table_lock or pte_lock
>> - *               pgdat->lru_lock (in mark_page_accessed, isolate_lru_page)
>>   *               swap_lock (in swap_duplicate, swap_info_get)
>>   *                 mmlist_lock (in mmput, drain_mmlist and others)
>>   *                 mapping->private_lock (in __set_page_dirty_buffers)
>> - *                   mem_cgroup_{begin,end}_page_stat (memcg->move_lock)
>> + *                   lock_page_memcg move_lock (in __set_page_dirty_buffers)
>>   *                     i_pages lock (widely used)
>> + *                       lock_page_lruvec_irq lruvec->lru_lock
> I think it has to be
> 			    lruvec->lru_lock (in lock_page_lruvec_irq)

Good catch! I will update it in next version.
Thanks!

> 
> No?

> 
>>   *                 inode->i_lock (in set_page_dirty's __mark_inode_dirty)
>>   *                 bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
>>   *                   sb_lock (within inode_lock in fs/fs-writeback.c)
>>
>>> local_irq_disable() also deserves a comment.
>>>
>> yes, I will add a comment for this. Do you mind give reviewed-by for this patch?
> Reviewed-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>

Thanks a lot! :)
diff mbox series

Patch

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index b18f21da4dac..607869330329 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2433,7 +2433,7 @@  static void __split_huge_page_tail(struct page *head, int tail,
 }
 
 static void __split_huge_page(struct page *page, struct list_head *list,
-		pgoff_t end, unsigned long flags)
+			      pgoff_t end)
 {
 	struct page *head = compound_head(page);
 	pg_data_t *pgdat = page_pgdat(head);
@@ -2442,8 +2442,6 @@  static void __split_huge_page(struct page *page, struct list_head *list,
 	unsigned long offset = 0;
 	int i;
 
-	lruvec = mem_cgroup_page_lruvec(head, pgdat);
-
 	/* complete memcg works before add pages to LRU */
 	mem_cgroup_split_huge_fixup(head);
 
@@ -2455,6 +2453,11 @@  static void __split_huge_page(struct page *page, struct list_head *list,
 		xa_lock(&swap_cache->i_pages);
 	}
 
+	/* lock lru list/PageCompound, ref freezed by page_ref_freeze */
+	spin_lock(&pgdat->lru_lock);
+
+	lruvec = mem_cgroup_page_lruvec(head, pgdat);
+
 	for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
 		__split_huge_page_tail(head, i, lruvec, list);
 		/* Some pages can be beyond i_size: drop them from page cache */
@@ -2474,6 +2477,8 @@  static void __split_huge_page(struct page *page, struct list_head *list,
 	}
 
 	ClearPageCompound(head);
+	spin_unlock(&pgdat->lru_lock);
+	/* Caller disabled irqs, so they are still disabled here */
 
 	split_page_owner(head, HPAGE_PMD_ORDER);
 
@@ -2491,8 +2496,7 @@  static void __split_huge_page(struct page *page, struct list_head *list,
 		page_ref_add(head, 2);
 		xa_unlock(&head->mapping->i_pages);
 	}
-
-	spin_unlock_irqrestore(&pgdat->lru_lock, flags);
+	local_irq_enable();
 
 	remap_page(head);
 
@@ -2631,12 +2635,10 @@  bool can_split_huge_page(struct page *page, int *pextra_pins)
 int split_huge_page_to_list(struct page *page, struct list_head *list)
 {
 	struct page *head = compound_head(page);
-	struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
 	struct deferred_split *ds_queue = get_deferred_split_queue(head);
 	struct anon_vma *anon_vma = NULL;
 	struct address_space *mapping = NULL;
 	int count, mapcount, extra_pins, ret;
-	unsigned long flags;
 	pgoff_t end;
 
 	VM_BUG_ON_PAGE(is_huge_zero_page(head), head);
@@ -2697,9 +2699,7 @@  int split_huge_page_to_list(struct page *page, struct list_head *list)
 	unmap_page(head);
 	VM_BUG_ON_PAGE(compound_mapcount(head), head);
 
-	/* prevent PageLRU to go away from under us, and freeze lru stats */
-	spin_lock_irqsave(&pgdata->lru_lock, flags);
-
+	local_irq_disable();
 	if (mapping) {
 		XA_STATE(xas, &mapping->i_pages, page_index(head));
 
@@ -2729,7 +2729,7 @@  int split_huge_page_to_list(struct page *page, struct list_head *list)
 				__dec_node_page_state(head, NR_FILE_THPS);
 		}
 
-		__split_huge_page(page, list, end, flags);
+		__split_huge_page(page, list, end);
 		if (PageSwapCache(head)) {
 			swp_entry_t entry = { .val = page_private(head) };
 
@@ -2748,7 +2748,7 @@  int split_huge_page_to_list(struct page *page, struct list_head *list)
 		spin_unlock(&ds_queue->split_queue_lock);
 fail:		if (mapping)
 			xa_unlock(&mapping->i_pages);
-		spin_unlock_irqrestore(&pgdata->lru_lock, flags);
+		local_irq_enable();
 		remap_page(head);
 		ret = -EBUSY;
 	}