diff mbox series

[RFC] mm: khugepaged: don't carry huge page to the next loop for !CONFIG_NUMA

Message ID 20210817202146.3218-1-shy828301@gmail.com (mailing list archive)
State New
Headers show
Series [RFC] mm: khugepaged: don't carry huge page to the next loop for !CONFIG_NUMA | expand

Commit Message

Yang Shi Aug. 17, 2021, 8:21 p.m. UTC
khugepaged has an optimization to reduce huge page allocation calls for
!CONFIG_NUMA: a huge page that was allocated but failed to collapse is
carried over to the next loop.  CONFIG_NUMA doesn't do so since the next
loop may try to collapse a huge page from a different node, so it doesn't
make much sense to carry it.

But when NUMA=n, the huge page is allocated by khugepaged_prealloc_page()
before scanning the address space, which means a huge page may be allocated
even though there is no suitable range for collapsing.  The page is then
just freed if khugepaged has already made enough progress.  This can make
a NUMA=n run show 5 times as many thp_collapse_alloc events as a NUMA=y run.
The far larger number of pointless THP allocations actually makes things
worse and defeats the purpose of the optimization.

This could be fixed by carrying the huge page across scans, but that would
complicate the code further and the huge page might be carried
indefinitely.  But if we take one step back, the optimization itself does
not seem worth keeping nowadays since:
  * Not many users build NUMA=n kernels nowadays, even when the kernel is
    actually running on a non-NUMA machine. Some small devices may run a
    NUMA=n kernel, but I don't think they actually use THP.
  * Since commit 44042b449872 ("mm/page_alloc: allow high-order pages to be
    stored on the per-cpu lists"), THP can be cached by pcp.  This to some
    extent already does the job done by the optimization.

Cc: Hugh Dickins <hughd@google.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Signed-off-by: Yang Shi <shy828301@gmail.com>
---
 mm/khugepaged.c | 74 ++++---------------------------------------------
 1 file changed, 6 insertions(+), 68 deletions(-)

Comments

Yang Shi Aug. 30, 2021, 6:49 p.m. UTC | #1
Gently ping...

Does this patch make sense? BTW, I have a couple of other khugepaged
related patches in my queue. I plan to send them with this patch
together. It would be great to hear some feedback before resending
this one.

Thanks,
Yang

On Tue, Aug 17, 2021 at 1:21 PM Yang Shi <shy828301@gmail.com> wrote:
>
> The khugepaged has optimization to reduce huge page allocation calls for
> !CONFIG_NUMA by carrying the allocated but failed to collapse huge page to
> the next loop.  CONFIG_NUMA doesn't do so since the next loop may try to
> collapse huge page from a different node, so it doesn't make too much sense
> to carry it.
>
> But when NUMA=n, the huge page is allocated by khugepaged_prealloc_page()
> before scanning the address space, so it means huge page may be allocated
> even though there is no suitable range for collapsing.  Then the page would
> be just freed if khugepaged already made enough progress.  This could make
> NUMA=n run have 5 times as much thp_collapse_alloc as NUMA=y run.  This
> problem actually makes things worse due to the way more pointless THP
> allocations and makes the optimization pointless.
>
> This could be fixed by carrying the huge page across scans, but it will
> complicate the code further and the huge page may be carried
> indefinitely.  But if we take one step back,  the optimization itself seems
> not worth keeping nowadays since:
>   * Not too many users build NUMA=n kernel nowadays even though the kernel is
>     actually running on a non-NUMA machine. Some small devices may run NUMA=n
>     kernel, but I don't think they actually use THP.
>   * Since commit 44042b449872 ("mm/page_alloc: allow high-order pages to be
>     stored on the per-cpu lists"), THP could be cached by pcp.  This actually
>     somehow does the job done by the optimization.
>
> Cc: Hugh Dickins <hughd@google.com>
> Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
> Signed-off-by: Yang Shi <shy828301@gmail.com>
> ---
>  mm/khugepaged.c | 74 ++++---------------------------------------------
>  1 file changed, 6 insertions(+), 68 deletions(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 6b9c98ddcd09..d6beb10e29e2 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -855,6 +855,12 @@ static int khugepaged_find_target_node(void)
>         last_khugepaged_target_node = target_node;
>         return target_node;
>  }
> +#else
> +static inline int khugepaged_find_target_node(void)
> +{
> +       return 0;
> +}
> +#endif
>
>  static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
>  {
> @@ -889,74 +895,6 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
>         count_vm_event(THP_COLLAPSE_ALLOC);
>         return *hpage;
>  }
> -#else
> -static int khugepaged_find_target_node(void)
> -{
> -       return 0;
> -}
> -
> -static inline struct page *alloc_khugepaged_hugepage(void)
> -{
> -       struct page *page;
> -
> -       page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(),
> -                          HPAGE_PMD_ORDER);
> -       if (page)
> -               prep_transhuge_page(page);
> -       return page;
> -}
> -
> -static struct page *khugepaged_alloc_hugepage(bool *wait)
> -{
> -       struct page *hpage;
> -
> -       do {
> -               hpage = alloc_khugepaged_hugepage();
> -               if (!hpage) {
> -                       count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
> -                       if (!*wait)
> -                               return NULL;
> -
> -                       *wait = false;
> -                       khugepaged_alloc_sleep();
> -               } else
> -                       count_vm_event(THP_COLLAPSE_ALLOC);
> -       } while (unlikely(!hpage) && likely(khugepaged_enabled()));
> -
> -       return hpage;
> -}
> -
> -static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
> -{
> -       /*
> -        * If the hpage allocated earlier was briefly exposed in page cache
> -        * before collapse_file() failed, it is possible that racing lookups
> -        * have not yet completed, and would then be unpleasantly surprised by
> -        * finding the hpage reused for the same mapping at a different offset.
> -        * Just release the previous allocation if there is any danger of that.
> -        */
> -       if (*hpage && page_count(*hpage) > 1) {
> -               put_page(*hpage);
> -               *hpage = NULL;
> -       }
> -
> -       if (!*hpage)
> -               *hpage = khugepaged_alloc_hugepage(wait);
> -
> -       if (unlikely(!*hpage))
> -               return false;
> -
> -       return true;
> -}
> -
> -static struct page *
> -khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
> -{
> -       VM_BUG_ON(!*hpage);
> -
> -       return  *hpage;
> -}
> -#endif
>
>  /*
>   * If mmap_lock temporarily dropped, revalidate vma
> --
> 2.26.2
>
Kirill A. Shutemov Aug. 31, 2021, 11:38 p.m. UTC | #2
On Mon, Aug 30, 2021 at 11:49:43AM -0700, Yang Shi wrote:
> Gently ping...
> 
> Does this patch make sense? BTW, I have a couple of other khugepaged
> related patches in my queue. I plan to send them with this patch
> together. It would be great to hear some feedback before resending
> this one.

I don't really care for the !NUMA optimization. I believe that most setups
that benefit from THP have NUMA enabled at compile time.

But if you want to go down this path, make an effort to clean up the other
artifacts of the !NUMA optimization: the ifdef has to be gone and all
callers of these helpers have to be revisited. There are more opportunities
to clean up. For example, it is very odd that khugepaged_prealloc_page()
frees the page.
Yang Shi Sept. 1, 2021, 3:46 a.m. UTC | #3
On Tue, Aug 31, 2021 at 4:38 PM Kirill A. Shutemov <kirill@shutemov.name> wrote:
>
> On Mon, Aug 30, 2021 at 11:49:43AM -0700, Yang Shi wrote:
> > Gently ping...
> >
> > Does this patch make sense? BTW, I have a couple of other khugepaged
> > related patches in my queue. I plan to send them with this patch
> > together. It would be great to hear some feedback before resending
> > this one.
>
> I don't really care for !NUMA optimization. I believe that most of setups
> that benefit from THP has NUMA enabled compile time.

Agreed.

>
> But if you wanna to go this path, make an effort to cleanup other
> artifacts for the !NUMA optimization: the ifdef has to be gone and all
> callers of these helpers has to be revisited. There's more opportunities to
> cleanup. Like it is very odd that khugepaged_prealloc_page() frees the
> page.

Yes, they are gone in this patch. The only remaining piece for !NUMA is
khugepaged_find_target_node(), which just returns 0.

>
>
> --
>  Kirill A. Shutemov
Vlastimil Babka Sept. 1, 2021, 10:26 a.m. UTC | #4
On 9/1/21 05:46, Yang Shi wrote:
> On Tue, Aug 31, 2021 at 4:38 PM Kirill A. Shutemov <kirill@shutemov.name> wrote:
>>
>> On Mon, Aug 30, 2021 at 11:49:43AM -0700, Yang Shi wrote:
>> > Gently ping...
>> >
>> > Does this patch make sense? BTW, I have a couple of other khugepaged
>> > related patches in my queue. I plan to send them with this patch
>> > together. It would be great to hear some feedback before resending
>> > this one.
>>
>> I don't really care for !NUMA optimization. I believe that most of setups
>> that benefit from THP has NUMA enabled compile time.
> 
> Agreed.
> 
>>
>> But if you wanna to go this path, make an effort to cleanup other
>> artifacts for the !NUMA optimization: the ifdef has to be gone and all
>> callers of these helpers has to be revisited. There's more opportunities to
>> cleanup. Like it is very odd that khugepaged_prealloc_page() frees the
>> page.
> 
> Yes, they are gone in this patch. The only remaining for !NUMA is
> khugepaged_find_target_node() which just returns 0.

As Kirill pointed out, there's also khugepaged_prealloc_page(), where the
only remaining variant actually does no preallocation, just freeing of an
unused page and some kind of "sleep after first alloc fail, break after
second alloc fail" logic.
This could now be moved into the khugepaged_do_scan() loop itself, and maybe
it will be easier to follow.

>>
>>
>> --
>>  Kirill A. Shutemov
>
Yang Shi Sept. 1, 2021, 5:25 p.m. UTC | #5
On Wed, Sep 1, 2021 at 3:26 AM Vlastimil Babka <vbabka@suse.cz> wrote:
>
> On 9/1/21 05:46, Yang Shi wrote:
> > On Tue, Aug 31, 2021 at 4:38 PM Kirill A. Shutemov <kirill@shutemov.name> wrote:
> >>
> >> On Mon, Aug 30, 2021 at 11:49:43AM -0700, Yang Shi wrote:
> >> > Gently ping...
> >> >
> >> > Does this patch make sense? BTW, I have a couple of other khugepaged
> >> > related patches in my queue. I plan to send them with this patch
> >> > together. It would be great to hear some feedback before resending
> >> > this one.
> >>
> >> I don't really care for !NUMA optimization. I believe that most of setups
> >> that benefit from THP has NUMA enabled compile time.
> >
> > Agreed.
> >
> >>
> >> But if you wanna to go this path, make an effort to cleanup other
> >> artifacts for the !NUMA optimization: the ifdef has to be gone and all
> >> callers of these helpers has to be revisited. There's more opportunities to
> >> cleanup. Like it is very odd that khugepaged_prealloc_page() frees the
> >> page.
> >
> > Yes, they are gone in this patch. The only remaining for !NUMA is
> > khugepaged_find_target_node() which just returns 0.
>
> As Kirill pointed out, there's also khugepaged_prealloc_page() where the
> only remaining variant does actually no preallocation, just freeing of an
> unused page and some kind of "sleep after first alloc fail, break after
> second alloc fail" logic.
> This could now be moved to khugepaged_do_scan() loop itself and maybe it
> will be easier to follow.

Aha, I see. I misunderstood him. I suppose you mean moving it into
khugepaged_scan_mm_slot().

>
> >>
> >>
> >> --
> >>  Kirill A. Shutemov
> >
>
Hugh Dickins Sept. 22, 2021, 11:49 p.m. UTC | #6
On Wed, 1 Sep 2021, Yang Shi wrote:
> On Wed, Sep 1, 2021 at 3:26 AM Vlastimil Babka <vbabka@suse.cz> wrote:
> > On 9/1/21 05:46, Yang Shi wrote:
> > > On Tue, Aug 31, 2021 at 4:38 PM Kirill A. Shutemov <kirill@shutemov.name> wrote:
> > >> On Mon, Aug 30, 2021 at 11:49:43AM -0700, Yang Shi wrote:
> > >> > Gently ping...
> > >> >
> > >> > Does this patch make sense? BTW, I have a couple of other khugepaged
> > >> > related patches in my queue. I plan to send them with this patch
> > >> > together. It would be great to hear some feedback before resending
> > >> > this one.
> > >>
> > >> I don't really care for !NUMA optimization. I believe that most of setups
> > >> that benefit from THP has NUMA enabled compile time.
> > >
> > > Agreed.
> > >
> > >>
> > >> But if you wanna to go this path, make an effort to cleanup other
> > >> artifacts for the !NUMA optimization: the ifdef has to be gone and all
> > >> callers of these helpers has to be revisited. There's more opportunities to
> > >> cleanup. Like it is very odd that khugepaged_prealloc_page() frees the
> > >> page.
> > >
> > > Yes, they are gone in this patch. The only remaining for !NUMA is
> > > khugepaged_find_target_node() which just returns 0.
> >
> > As Kirill pointed out, there's also khugepaged_prealloc_page() where the
> > only remaining variant does actually no preallocation, just freeing of an
> > unused page and some kind of "sleep after first alloc fail, break after
> > second alloc fail" logic.
> > This could now be moved to khugepaged_do_scan() loop itself and maybe it
> > will be easier to follow.
> 
> Aha, I see. Misunderstood him. I'm supposed that you mean move into
> khugepaged_scan_mm_slot().

It may not be possible, but I'd always imagined that a cleanup of this
kind would get rid of all those "struct page **hpage" artifacts.

Hugh
Yang Shi Sept. 23, 2021, 3:07 a.m. UTC | #7
On Wed, Sep 22, 2021 at 4:49 PM Hugh Dickins <hughd@google.com> wrote:
>
> On Wed, 1 Sep 2021, Yang Shi wrote:
> > On Wed, Sep 1, 2021 at 3:26 AM Vlastimil Babka <vbabka@suse.cz> wrote:
> > > On 9/1/21 05:46, Yang Shi wrote:
> > > > On Tue, Aug 31, 2021 at 4:38 PM Kirill A. Shutemov <kirill@shutemov.name> wrote:
> > > >> On Mon, Aug 30, 2021 at 11:49:43AM -0700, Yang Shi wrote:
> > > >> > Gently ping...
> > > >> >
> > > >> > Does this patch make sense? BTW, I have a couple of other khugepaged
> > > >> > related patches in my queue. I plan to send them with this patch
> > > >> > together. It would be great to hear some feedback before resending
> > > >> > this one.
> > > >>
> > > >> I don't really care for !NUMA optimization. I believe that most of setups
> > > >> that benefit from THP has NUMA enabled compile time.
> > > >
> > > > Agreed.
> > > >
> > > >>
> > > >> But if you wanna to go this path, make an effort to cleanup other
> > > >> artifacts for the !NUMA optimization: the ifdef has to be gone and all
> > > >> callers of these helpers has to be revisited. There's more opportunities to
> > > >> cleanup. Like it is very odd that khugepaged_prealloc_page() frees the
> > > >> page.
> > > >
> > > > Yes, they are gone in this patch. The only remaining for !NUMA is
> > > > khugepaged_find_target_node() which just returns 0.
> > >
> > > As Kirill pointed out, there's also khugepaged_prealloc_page() where the
> > > only remaining variant does actually no preallocation, just freeing of an
> > > unused page and some kind of "sleep after first alloc fail, break after
> > > second alloc fail" logic.
> > > This could now be moved to khugepaged_do_scan() loop itself and maybe it
> > > will be easier to follow.
> >
> > Aha, I see. Misunderstood him. I'm supposed that you mean move into
> > khugepaged_scan_mm_slot().
>
> It may not be possible, but I'd always imagined that a cleanup of this
> kind would get rid of all those "struct page **hpage" artifacts.

It seems we need to find another way to do "sleep for the first alloc
failure, break loop for the second alloc failure" or just remove the
heuristic.

I will take a closer look once I find some time.

>
> Hugh
diff mbox series

Patch

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 6b9c98ddcd09..d6beb10e29e2 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -855,6 +855,12 @@  static int khugepaged_find_target_node(void)
 	last_khugepaged_target_node = target_node;
 	return target_node;
 }
+#else
+static inline int khugepaged_find_target_node(void)
+{
+	return 0;
+}
+#endif
 
 static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
 {
@@ -889,74 +895,6 @@  khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
 	count_vm_event(THP_COLLAPSE_ALLOC);
 	return *hpage;
 }
-#else
-static int khugepaged_find_target_node(void)
-{
-	return 0;
-}
-
-static inline struct page *alloc_khugepaged_hugepage(void)
-{
-	struct page *page;
-
-	page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(),
-			   HPAGE_PMD_ORDER);
-	if (page)
-		prep_transhuge_page(page);
-	return page;
-}
-
-static struct page *khugepaged_alloc_hugepage(bool *wait)
-{
-	struct page *hpage;
-
-	do {
-		hpage = alloc_khugepaged_hugepage();
-		if (!hpage) {
-			count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
-			if (!*wait)
-				return NULL;
-
-			*wait = false;
-			khugepaged_alloc_sleep();
-		} else
-			count_vm_event(THP_COLLAPSE_ALLOC);
-	} while (unlikely(!hpage) && likely(khugepaged_enabled()));
-
-	return hpage;
-}
-
-static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
-{
-	/*
-	 * If the hpage allocated earlier was briefly exposed in page cache
-	 * before collapse_file() failed, it is possible that racing lookups
-	 * have not yet completed, and would then be unpleasantly surprised by
-	 * finding the hpage reused for the same mapping at a different offset.
-	 * Just release the previous allocation if there is any danger of that.
-	 */
-	if (*hpage && page_count(*hpage) > 1) {
-		put_page(*hpage);
-		*hpage = NULL;
-	}
-
-	if (!*hpage)
-		*hpage = khugepaged_alloc_hugepage(wait);
-
-	if (unlikely(!*hpage))
-		return false;
-
-	return true;
-}
-
-static struct page *
-khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
-{
-	VM_BUG_ON(!*hpage);
-
-	return  *hpage;
-}
-#endif
 
 /*
  * If mmap_lock temporarily dropped, revalidate vma