diff mbox series

[v2,07/11] mm/memory-failure: Convert memory_failure() to use a folio

Message ID 20240408194232.118537-8-willy@infradead.org (mailing list archive)
State New
Headers show
Series Some cleanups for memory-failure | expand

Commit Message

Matthew Wilcox April 8, 2024, 7:42 p.m. UTC
Saves dozens of calls to compound_head().

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 mm/memory-failure.c | 40 +++++++++++++++++++++-------------------
 1 file changed, 21 insertions(+), 19 deletions(-)

Comments

Jane Chu April 9, 2024, 12:34 a.m. UTC | #1
On 4/8/2024 12:42 PM, Matthew Wilcox (Oracle) wrote:

> Saves dozens of calls to compound_head().
>
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> ---
>   mm/memory-failure.c | 40 +++++++++++++++++++++-------------------
>   1 file changed, 21 insertions(+), 19 deletions(-)
>
> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> index 0a45fb7fb055..1c7c73776604 100644
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -2173,7 +2173,7 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
>   int memory_failure(unsigned long pfn, int flags)
>   {
>   	struct page *p;
> -	struct page *hpage;
> +	struct folio *folio;
>   	struct dev_pagemap *pgmap;
>   	int res = 0;
>   	unsigned long page_flags;
> @@ -2261,8 +2261,8 @@ int memory_failure(unsigned long pfn, int flags)
>   		}
>   	}
>   
> -	hpage = compound_head(p);
> -	if (PageTransHuge(hpage)) {
> +	folio = page_folio(p);
> +	if (folio_test_large(folio)) {
>   		/*
>   		 * The flag must be set after the refcount is bumped
>   		 * otherwise it may race with THP split.
> @@ -2276,12 +2276,13 @@ int memory_failure(unsigned long pfn, int flags)
>   		 * or unhandlable page.  The refcount is bumped iff the
>   		 * page is a valid handlable page.
>   		 */
> -		SetPageHasHWPoisoned(hpage);
> +		folio_set_has_hwpoisoned(folio);
>   		if (try_to_split_thp_page(p) < 0) {
>   			res = action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED);
>   			goto unlock_mutex;
>   		}
>   		VM_BUG_ON_PAGE(!page_count(p), p);
> +		folio = page_folio(p);
>   	}
>   
>   	/*
> @@ -2292,9 +2293,9 @@ int memory_failure(unsigned long pfn, int flags)
>   	 * The check (unnecessarily) ignores LRU pages being isolated and
>   	 * walked by the page reclaim code, however that's not a big loss.
>   	 */
> -	shake_page(p);
> +	shake_folio(folio);
>   
> -	lock_page(p);
> +	folio_lock(folio);
>   
>   	/*
>   	 * We're only intended to deal with the non-Compound page here.
> @@ -2302,11 +2303,11 @@ int memory_failure(unsigned long pfn, int flags)
>   	 * race window. If this happens, we could try again to hopefully
>   	 * handle the page next round.
>   	 */
> -	if (PageCompound(p)) {
> +	if (folio_test_large(folio)) {

At this stage, 'p' is not expected to be a large page since a _refcount 
has been taken, right?  So is it reasonable to replace the "if (retry)" 
block with a VM_BUG_ON warning?  Otherwise, if 'p' became part of a 
different large folio, how would we recover from here?

>   		if (retry) {
>   			ClearPageHWPoison(p);
> -			unlock_page(p);
> -			put_page(p);
> +			folio_unlock(folio);
> +			folio_put(folio);
>   			flags &= ~MF_COUNT_INCREASED;
>   			retry = false;
>   			goto try_again;
> @@ -2322,29 +2323,29 @@ int memory_failure(unsigned long pfn, int flags)
>   	 * folio_remove_rmap_*() in try_to_unmap_one(). So to determine page
>   	 * status correctly, we save a copy of the page flags at this time.
>   	 */
> -	page_flags = p->flags;
> +	page_flags = folio->flags;
>   
>   	if (hwpoison_filter(p)) {
>   		ClearPageHWPoison(p);
> -		unlock_page(p);
> -		put_page(p);
> +		folio_unlock(folio);
> +		folio_put(folio);
>   		res = -EOPNOTSUPP;
>   		goto unlock_mutex;
>   	}
>   
>   	/*
> -	 * __munlock_folio() may clear a writeback page's LRU flag without
> -	 * page_lock. We need wait writeback completion for this page or it
> -	 * may trigger vfs BUG while evict inode.
> +	 * __munlock_folio() may clear a writeback folio's LRU flag without
> +	 * the folio lock. We need to wait for writeback completion for this
> +	 * folio or it may trigger a vfs BUG while evicting inode.
>   	 */
> -	if (!PageLRU(p) && !PageWriteback(p))
> +	if (!folio_test_lru(folio) && !folio_test_writeback(folio))
>   		goto identify_page_state;
>   
>   	/*
>   	 * It's very difficult to mess with pages currently under IO
>   	 * and in many cases impossible, so we just avoid it here.
>   	 */
> -	wait_on_page_writeback(p);
> +	folio_wait_writeback(folio);
>   
>   	/*
>   	 * Now take care of user space mappings.
> @@ -2358,7 +2359,8 @@ int memory_failure(unsigned long pfn, int flags)
>   	/*
>   	 * Torn down by someone else?
>   	 */
> -	if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
> +	if (folio_test_lru(folio) && !folio_test_swapcache(folio) &&
> +	    folio->mapping == NULL) {
>   		res = action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED);
>   		goto unlock_page;
>   	}
> @@ -2368,7 +2370,7 @@ int memory_failure(unsigned long pfn, int flags)
>   	mutex_unlock(&mf_mutex);
>   	return res;
>   unlock_page:
> -	unlock_page(p);
> +	folio_unlock(folio);
>   unlock_mutex:
>   	mutex_unlock(&mf_mutex);
>   	return res;

thanks,

-jane
Oscar Salvador April 10, 2024, 10:21 a.m. UTC | #2
On Mon, Apr 08, 2024 at 05:34:29PM -0700, Jane Chu wrote:
> At this stage,  'p' is not expected to be a large page since a _refcount has
> been taken, right?  So is it reasonable to replace the "if (retry)" block
> with a VM_BUG_ON warning?  otherwise, if 'p' became part of a different
> large folio,  how to recover from here ?

We might have split the THP (if it was part of one), but AFAICS
nothing stops the kernel from coalescing this page again into a new THP via e.g.
madvise(MADV_HUGEPAGE)?
Matthew Wilcox April 10, 2024, 2:23 p.m. UTC | #3
On Wed, Apr 10, 2024 at 12:21:07PM +0200, Oscar Salvador wrote:
> On Mon, Apr 08, 2024 at 05:34:29PM -0700, Jane Chu wrote:
> > At this stage,  'p' is not expected to be a large page since a _refcount has
> > been taken, right?  So is it reasonable to replace the "if (retry)" block
> > with a VM_BUG_ON warning?  otherwise, if 'p' became part of a different
> > large folio,  how to recover from here ?
> 
> We might have split the THP (if it was part of), but AFAICS
> nothing stops the kernel to coallesce this page again into a new THP via e.g:
> madvise(MADV_HUGEPAGE) ?

Won't that allocate a _new_ THP rather than coalescing in place?
Oscar Salvador April 10, 2024, 3:30 p.m. UTC | #4
On Wed, Apr 10, 2024 at 03:23:10PM +0100, Matthew Wilcox wrote:
> On Wed, Apr 10, 2024 at 12:21:07PM +0200, Oscar Salvador wrote:
> > On Mon, Apr 08, 2024 at 05:34:29PM -0700, Jane Chu wrote:
> > > At this stage,  'p' is not expected to be a large page since a _refcount has
> > > been taken, right?  So is it reasonable to replace the "if (retry)" block
> > > with a VM_BUG_ON warning?  otherwise, if 'p' became part of a different
> > > large folio,  how to recover from here ?
> > 
> > We might have split the THP (if it was part of), but AFAICS
> > nothing stops the kernel to coallesce this page again into a new THP via e.g:
> > madvise(MADV_HUGEPAGE) ?
> 
> Won't that allocate a _new_ THP rather than coalescing in place?

Sorry, yes, I think it will. I guess I meant MADV_COLLAPSE.
My point is that somehow this page can be used to form a new THP, or is
there anything stopping the page from becoming part of a compound page again?
Jane Chu April 10, 2024, 11:15 p.m. UTC | #5
On 4/10/2024 3:21 AM, Oscar Salvador wrote:

> On Mon, Apr 08, 2024 at 05:34:29PM -0700, Jane Chu wrote:
>> At this stage,  'p' is not expected to be a large page since a _refcount has
>> been taken, right?  So is it reasonable to replace the "if (retry)" block
>> with a VM_BUG_ON warning?  otherwise, if 'p' became part of a different
>> large folio,  how to recover from here ?
> We might have split the THP (if it was part of), but AFAICS
> nothing stops the kernel to coallesce this page again into a new THP via e.g:
> madvise(MADV_HUGEPAGE) ?

Good point, thanks!

-jane

>
Jane Chu April 11, 2024, 1:27 a.m. UTC | #6
On 4/10/2024 4:15 PM, Jane Chu wrote:

> On 4/10/2024 3:21 AM, Oscar Salvador wrote:
>
>> On Mon, Apr 08, 2024 at 05:34:29PM -0700, Jane Chu wrote:
>>> At this stage,  'p' is not expected to be a large page since a 
>>> _refcount has
>>> been taken, right?  So is it reasonable to replace the "if (retry)" 
>>> block
>>> with a VM_BUG_ON warning?  otherwise, if 'p' became part of a different
>>> large folio,  how to recover from here ?
>> We might have split the THP (if it was part of), but AFAICS
>> nothing stops the kernel to coallesce this page again into a new THP 
>> via e.g:
>> madvise(MADV_HUGEPAGE) ?
>
> Good point, thanks!

Come to think of it a bit more, maybe the check could be

     if (folio_test_large(folio) || (folio != page_folio(p))) {

? Because, 'p' could have become part of a THP after shake_folio() and 
before folio_lock(), right?

thanks,

-jane


>
> -jane
>
>>
Matthew Wilcox April 11, 2024, 1:51 a.m. UTC | #7
On Wed, Apr 10, 2024 at 06:27:38PM -0700, Jane Chu wrote:
> On 4/10/2024 4:15 PM, Jane Chu wrote:
> 
> > On 4/10/2024 3:21 AM, Oscar Salvador wrote:
> > 
> > > On Mon, Apr 08, 2024 at 05:34:29PM -0700, Jane Chu wrote:
> > > > At this stage,  'p' is not expected to be a large page since a
> > > > _refcount has
> > > > been taken, right?  So is it reasonable to replace the "if
> > > > (retry)" block
> > > > with a VM_BUG_ON warning?  otherwise, if 'p' became part of a different
> > > > large folio,  how to recover from here ?
> > > We might have split the THP (if it was part of), but AFAICS
> > > nothing stops the kernel to coallesce this page again into a new THP
> > > via e.g:
> > > madvise(MADV_HUGEPAGE) ?
> > 
> > Good point, thanks!
> 
> Come to think of it a bit more, maybe the check could be
> 
>     if (folio_test_large(folio) || (folio != page_folio(p))) {
> 
> ? Because, 'p' could have become part of a THP after shake_folio() and
> before folio_lock(), right?

What is the mechanism for reassembling a large folio?  I don't see that
code anywhere.
Miaohe Lin April 11, 2024, 9 a.m. UTC | #8
On 2024/4/11 9:51, Matthew Wilcox wrote:
> On Wed, Apr 10, 2024 at 06:27:38PM -0700, Jane Chu wrote:
>> On 4/10/2024 4:15 PM, Jane Chu wrote:
>>
>>> On 4/10/2024 3:21 AM, Oscar Salvador wrote:
>>>
>>>> On Mon, Apr 08, 2024 at 05:34:29PM -0700, Jane Chu wrote:
>>>>> At this stage,  'p' is not expected to be a large page since a
>>>>> _refcount has
>>>>> been taken, right?  So is it reasonable to replace the "if
>>>>> (retry)" block
>>>>> with a VM_BUG_ON warning?  otherwise, if 'p' became part of a different
>>>>> large folio,  how to recover from here ?
>>>> We might have split the THP (if it was part of), but AFAICS
>>>> nothing stops the kernel to coallesce this page again into a new THP
>>>> via e.g:
>>>> madvise(MADV_HUGEPAGE) ?
>>>
>>> Good point, thanks!
>>
>> Come to think of it a bit more, maybe the check could be
>>
>>     if (folio_test_large(folio) || (folio != page_folio(p))) {
>>
>> ? Because, 'p' could have become part of a THP after shake_folio() and
>> before folio_lock(), right?

This check is originally introduced via commit:

"""
commit f37d4298aa7f8b74395aa13c728677e2ed86fdaf
Author: Andi Kleen <ak@xxxxxxxxxxxxxxx>
Date:   Wed Aug 6 16:06:49 2014 -0700

    hwpoison: fix race with changing page during offlining

    When a hwpoison page is locked it could change state due to parallel
    modifications.  The original compound page can be torn down and then
    this 4k page becomes part of a differently-sized compound page or is a
    standalone regular page.

    Check after the lock if the page is still the same compound page.

    We could go back, grab the new head page and try again but it should be
    quite rare, so I thought this was safest.  A retry loop would be more
    difficult to test and may have more side effects.

    The hwpoison code by design only tries to handle cases that are
    reasonably common in workloads, as visible in page-flags.

    I'm not really that concerned about handling this (likely rare case),
    just not crashing on it.

    Signed-off-by: Andi Kleen <ak@xxxxxxxxxxxxxxx>
    Acked-by: Naoya Horiguchi <n-horiguchi@xxxxxxxxxxxxx>
    Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
    Signed-off-by: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>

diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index a013bc94ebbe..44c6bd201d3a 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1172,6 +1172,16 @@ int memory_failure(unsigned long pfn, int trapno, int flags)

 	lock_page(hpage);

+	/*
+	 * The page could have changed compound pages during the locking.
+	 * If this happens just bail out.
+	 */
+	if (compound_head(p) != hpage) {
+		action_result(pfn, "different compound page after locking", IGNORED);
+		res = -EBUSY;
+		goto out;
+	}
+
 	/*
 	 * We use page flags to determine what action should be taken, but
 	 * the flags can be modified by the error containment action.  One

"""

And the discussion can be found at [1]. The root cause for the issue is indeterminate, but most probably:

[1] https://lore.kernel.org/linux-mm/20140626195036.GA5311@nhori.redhat.com/

"""
And I think that the problem you report is caused by another part of hwpoison
code, because we have PageSlab check at the beginning of hwpoison_user_mappings(),
so if an LRU page turned into a slab page just before locking the page, we never
reach try_to_unmap().
I think this was caused by the code around lock migration after thp split
in hwpoison_user_mappings(), which was introduced in commit 54b9dd14d09f
("mm/memory-failure.c: shift page lock from head page to tail page after thp split").
I guess the tail page (raw error page) was freed and turned into Slab page
just after thp split and before locking the error page.
So possible solution is to do page status check again after thp split.
"""

IIUC, it's because the page lock is shifted from @hpage to @p. So there's a window
that p is freed and turned into Slab page.

@@ -942,11 +942,14 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
                         * We pinned the head page for hwpoison handling,
                         * now we split the thp and we are interested in
                         * the hwpoisoned raw page, so move the refcount
-                        * to it.
+                        * to it. Similarly, page lock is shifted.
                         */
                        if (hpage != p) {
                                put_page(hpage);
                                get_page(p);
+                               lock_page(p);
+                               unlock_page(hpage);
+                               *hpagep = p;
                        }
                        /* THP is split, so ppage should be the real poisoned page. */
                        ppage = p;

> 
> What is the mechanism for reassembling a large folio?  I don't see that
> code anywhere.

But as the code changed, the above page lock shift is gone. And I think the below logic can't
trigger now, as we hold an extra page refcount so the page can't be coalesced into a new THP or Slab page.

	/*
	 * We're only intended to deal with the non-Compound page here.
	 * However, the page could have changed compound pages due to
	 * race window. If this happens, we could try again to hopefully
	 * handle the page next round.
	 */
	if (PageCompound(p)) {
		if (retry) {
			ClearPageHWPoison(p);
			unlock_page(p);
			put_page(p);
			flags &= ~MF_COUNT_INCREASED;
			retry = false;
			goto try_again;
		}
		res = action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED);
		goto unlock_page;
	}

So it might be better to replace above code block as WARN_ON(PageCompound(p)) and remove MF_MSG_DIFFERENT_COMPOUND case.
Any thoughts?

Thanks.
.

> 
> .
>
Oscar Salvador April 11, 2024, 11:23 a.m. UTC | #9
On Thu, Apr 11, 2024 at 05:00:33PM +0800, Miaohe Lin wrote:
> But as code changes, the above page lock shift is gone. And I think below logic can't
> trigger now. As we hold extra page refcnt so page can't be coallesced into a new THP or Slab page.

So I was wrong.
I did not check the slab part, but I did check the code that tries to
coalesce pages.
We have the following check in hpage_collapse_scan_pmd():

 /*
  *
  * Here the check may be racy:
  * it may see total_mapcount > refcount in some cases?
  * But such case is ephemeral we could always retry collapse
  * later.  However it may report false positive if the page
  * has excessive GUP pins (i.e. 512).  Anyway the same check
  * will be done again later the risk seems low.
  */
  if (!is_refcount_suitable(folio)) {
          result = SCAN_PAGE_COUNT;
          goto out_unmap;
  }

So I guess that since we hold an extra refcount from memory-failure
code, this page cannot be part of a THP anymore.
If we confirm that the same goes for slab, I would replace that with
your warn check.
In this way, if we ever trigger that warn path, at least we will know
that the possibility exists.

So it is a good way to get rid of false assumptions and code burden.
Matthew Wilcox April 11, 2024, 12:17 p.m. UTC | #10
On Thu, Apr 11, 2024 at 01:23:43PM +0200, Oscar Salvador wrote:
> On Thu, Apr 11, 2024 at 05:00:33PM +0800, Miaohe Lin wrote:
> > But as code changes, the above page lock shift is gone. And I think below logic can't
> > trigger now. As we hold extra page refcnt so page can't be coallesced into a new THP or Slab page.
> 
> So I was wrong.
> I did not check the slap part, but I did check the code that tries to
> coallesce pages.
> We have the following check in hpage_collapse_scan_pmd():

Even if that check succeeds, collapse_huge_page() works by allocating a
new THP, copying the data into it and freeing the existing pages.  It
doesn't reassemble a THP in-place.  I cannot find anywhere that will
reassemble smaller folios into larger ones (other than the page
allocator, and if you have a refcount on a folio, it can't possibly go
into the page allocator).
Matthew Wilcox April 12, 2024, 7:48 p.m. UTC | #11
On Thu, Apr 11, 2024 at 05:00:33PM +0800, Miaohe Lin wrote:
> But as code changes, the above page lock shift is gone. And I think below logic can't
> trigger now. As we hold extra page refcnt so page can't be coallesced into a new THP or Slab page.
> 
> 	/*
> 	 * We're only intended to deal with the non-Compound page here.
> 	 * However, the page could have changed compound pages due to
> 	 * race window. If this happens, we could try again to hopefully
> 	 * handle the page next round.
> 	 */
> 	if (PageCompound(p)) {
> 		if (retry) {
> 			ClearPageHWPoison(p);
> 			unlock_page(p);
> 			put_page(p);
> 			flags &= ~MF_COUNT_INCREASED;
> 			retry = false;
> 			goto try_again;
> 		}
> 		res = action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED);
> 		goto unlock_page;
> 	}
> 
> So it might be better to replace above code block as WARN_ON(PageCompound(p)) and remove MF_MSG_DIFFERENT_COMPOUND case.
> Any thoughts?

Yes, I think you're right.  As the MM handling of pages has evolved,
people haven't kept memory-failure uptodate.  That's both understandable
and regrettable.

I don't have the time to focus on memory-failure myself; I have a couple
of hundred uses of page->mapping to eliminate.  And I'd want to get a
lot more serious about testing before starting on that journey.

I do have ideas for handling hwpoison without splitting a folio.
I'd also really like to improve memory-failure to handle sub-page-size
blast radius (Intel CXL used to have a blast radius of 256 bytes).
But realistically, these things are never going to rise high enough on
my todo list to actually get done.
Oscar Salvador April 12, 2024, 10:09 p.m. UTC | #12
On Fri, Apr 12, 2024 at 08:48:11PM +0100, Matthew Wilcox wrote:
> Yes, I think you're right.  As the MM handling of pages has evolved,
> people haven't kept memory-failure uptodate.  That's both understandable
> and regrettable.

We kind of had the same problem with memory-hotplug, but we managed to
get it up to date.

> I don't have the time to focus on memory-failure myself; I have a couple
> of hundred uses of page->mapping to eliminate.  And I'd want to get a
> lot more serious about testing before starting on that journey.
> 
> I do have ideas for handling hwpoison without splitting a folio.

But if you want to recover those pages within a large folio that are
unaffected, you will have to split it eventually?
And so, if we can make sure that the large folio has been split, and
subpage cannot be part of another compound page afterwards, we should be
safe?

But yes, I can see why.
Right now, if we fail to split the folio we do not handle the situation
at all, we just mark the head as HWPoisoned which is pretty unfortunate
because we just lost the chance to contain the error in the subpage,
so we flushed a large folio down the toilet.

I would be interested to hear those ideas, as having the chance to
handle that would be very beneficial, and more deterministic.


Thanks
Jane Chu April 15, 2024, 6:47 p.m. UTC | #13
On 4/12/2024 12:48 PM, Matthew Wilcox wrote:

> On Thu, Apr 11, 2024 at 05:00:33PM +0800, Miaohe Lin wrote:
>> But as code changes, the above page lock shift is gone. And I think below logic can't
>> trigger now. As we hold extra page refcnt so page can't be coallesced into a new THP or Slab page.
>>
>> 	/*
>> 	 * We're only intended to deal with the non-Compound page here.
>> 	 * However, the page could have changed compound pages due to
>> 	 * race window. If this happens, we could try again to hopefully
>> 	 * handle the page next round.
>> 	 */
>> 	if (PageCompound(p)) {
>> 		if (retry) {
>> 			ClearPageHWPoison(p);
>> 			unlock_page(p);
>> 			put_page(p);
>> 			flags &= ~MF_COUNT_INCREASED;
>> 			retry = false;
>> 			goto try_again;
>> 		}
>> 		res = action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED);
>> 		goto unlock_page;
>> 	}
>>
>> So it might be better to replace above code block as WARN_ON(PageCompound(p)) and remove MF_MSG_DIFFERENT_COMPOUND case.
>> Any thoughts?
> Yes, I think you're right.  As the MM handling of pages has evolved,
> people haven't kept memory-failure uptodate.  That's both understandable
> and regrettable.
>
> I don't have the time to focus on memory-failure myself; I have a couple
> of hundred uses of page->mapping to eliminate.  And I'd want to get a
> lot more serious about testing before starting on that journey.
>
> I do have ideas for handling hwpoison without splitting a folio.
> I'd also really like to improve memory-failure to handle sub-page-size
> blast radius (Intel CXL used to have a blast radius of 256 bytes).
> But realistically, these things are never going to rise high enough on
> my todo list to actually get done.

Yeah, glad to have the issue identified, that's progress.

I'm working on a series dealing with handling hwpoison without splitting 
a THP, will post it for comments.

thanks,

-jane
Miaohe Lin April 16, 2024, 9:13 a.m. UTC | #14
On 2024/4/13 3:48, Matthew Wilcox wrote:
> On Thu, Apr 11, 2024 at 05:00:33PM +0800, Miaohe Lin wrote:
>> But as code changes, the above page lock shift is gone. And I think below logic can't
>> trigger now. As we hold extra page refcnt so page can't be coallesced into a new THP or Slab page.
>>
>> 	/*
>> 	 * We're only intended to deal with the non-Compound page here.
>> 	 * However, the page could have changed compound pages due to
>> 	 * race window. If this happens, we could try again to hopefully
>> 	 * handle the page next round.
>> 	 */
>> 	if (PageCompound(p)) {
>> 		if (retry) {
>> 			ClearPageHWPoison(p);
>> 			unlock_page(p);
>> 			put_page(p);
>> 			flags &= ~MF_COUNT_INCREASED;
>> 			retry = false;
>> 			goto try_again;
>> 		}
>> 		res = action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED);
>> 		goto unlock_page;
>> 	}
>>
>> So it might be better to replace above code block as WARN_ON(PageCompound(p)) and remove MF_MSG_DIFFERENT_COMPOUND case.
>> Any thoughts?
> 
> Yes, I think you're right.  As the MM handling of pages has evolved,
> people haven't kept memory-failure uptodate.  That's both understandable
> and regrettable.
> 
> I don't have the time to focus on memory-failure myself; I have a couple
> of hundred uses of page->mapping to eliminate.  And I'd want to get a
> lot more serious about testing before starting on that journey.

I would do those cleanups when I am free. I want to make sure there
wasn't something I missed before starting. :)

> 
> I do have ideas for handling hwpoison without splitting a folio.
> I'd also really like to improve memory-failure to handle sub-page-size
> blast radius (Intel CXL used to have a blast radius of 256 bytes).
> But realistically, these things are never going to rise high enough on
> my todo list to actually get done.

Thanks for your hard work.
.

> .
>
diff mbox series

Patch

diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 0a45fb7fb055..1c7c73776604 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -2173,7 +2173,7 @@  static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
 int memory_failure(unsigned long pfn, int flags)
 {
 	struct page *p;
-	struct page *hpage;
+	struct folio *folio;
 	struct dev_pagemap *pgmap;
 	int res = 0;
 	unsigned long page_flags;
@@ -2261,8 +2261,8 @@  int memory_failure(unsigned long pfn, int flags)
 		}
 	}
 
-	hpage = compound_head(p);
-	if (PageTransHuge(hpage)) {
+	folio = page_folio(p);
+	if (folio_test_large(folio)) {
 		/*
 		 * The flag must be set after the refcount is bumped
 		 * otherwise it may race with THP split.
@@ -2276,12 +2276,13 @@  int memory_failure(unsigned long pfn, int flags)
 		 * or unhandlable page.  The refcount is bumped iff the
 		 * page is a valid handlable page.
 		 */
-		SetPageHasHWPoisoned(hpage);
+		folio_set_has_hwpoisoned(folio);
 		if (try_to_split_thp_page(p) < 0) {
 			res = action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED);
 			goto unlock_mutex;
 		}
 		VM_BUG_ON_PAGE(!page_count(p), p);
+		folio = page_folio(p);
 	}
 
 	/*
@@ -2292,9 +2293,9 @@  int memory_failure(unsigned long pfn, int flags)
 	 * The check (unnecessarily) ignores LRU pages being isolated and
 	 * walked by the page reclaim code, however that's not a big loss.
 	 */
-	shake_page(p);
+	shake_folio(folio);
 
-	lock_page(p);
+	folio_lock(folio);
 
 	/*
 	 * We're only intended to deal with the non-Compound page here.
@@ -2302,11 +2303,11 @@  int memory_failure(unsigned long pfn, int flags)
 	 * race window. If this happens, we could try again to hopefully
 	 * handle the page next round.
 	 */
-	if (PageCompound(p)) {
+	if (folio_test_large(folio)) {
 		if (retry) {
 			ClearPageHWPoison(p);
-			unlock_page(p);
-			put_page(p);
+			folio_unlock(folio);
+			folio_put(folio);
 			flags &= ~MF_COUNT_INCREASED;
 			retry = false;
 			goto try_again;
@@ -2322,29 +2323,29 @@  int memory_failure(unsigned long pfn, int flags)
 	 * folio_remove_rmap_*() in try_to_unmap_one(). So to determine page
 	 * status correctly, we save a copy of the page flags at this time.
 	 */
-	page_flags = p->flags;
+	page_flags = folio->flags;
 
 	if (hwpoison_filter(p)) {
 		ClearPageHWPoison(p);
-		unlock_page(p);
-		put_page(p);
+		folio_unlock(folio);
+		folio_put(folio);
 		res = -EOPNOTSUPP;
 		goto unlock_mutex;
 	}
 
 	/*
-	 * __munlock_folio() may clear a writeback page's LRU flag without
-	 * page_lock. We need wait writeback completion for this page or it
-	 * may trigger vfs BUG while evict inode.
+	 * __munlock_folio() may clear a writeback folio's LRU flag without
+	 * the folio lock. We need to wait for writeback completion for this
+	 * folio or it may trigger a vfs BUG while evicting inode.
 	 */
-	if (!PageLRU(p) && !PageWriteback(p))
+	if (!folio_test_lru(folio) && !folio_test_writeback(folio))
 		goto identify_page_state;
 
 	/*
 	 * It's very difficult to mess with pages currently under IO
 	 * and in many cases impossible, so we just avoid it here.
 	 */
-	wait_on_page_writeback(p);
+	folio_wait_writeback(folio);
 
 	/*
 	 * Now take care of user space mappings.
@@ -2358,7 +2359,8 @@  int memory_failure(unsigned long pfn, int flags)
 	/*
 	 * Torn down by someone else?
 	 */
-	if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
+	if (folio_test_lru(folio) && !folio_test_swapcache(folio) &&
+	    folio->mapping == NULL) {
 		res = action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED);
 		goto unlock_page;
 	}
@@ -2368,7 +2370,7 @@  int memory_failure(unsigned long pfn, int flags)
 	mutex_unlock(&mf_mutex);
 	return res;
 unlock_page:
-	unlock_page(p);
+	folio_unlock(folio);
 unlock_mutex:
 	mutex_unlock(&mf_mutex);
 	return res;