
[RFC,02/26] hugetlb: sort hstates in hugetlb_init_hstates

Message ID 20220624173656.2033256-3-jthoughton@google.com (mailing list archive)
State New
Series hugetlb: Introduce HugeTLB high-granularity mapping

Commit Message

James Houghton June 24, 2022, 5:36 p.m. UTC
When using HugeTLB high-granularity mapping, we need to go through the
supported hugepage sizes in decreasing order so that we pick the largest
size that works. Consider the case where we're faulting in a 1G hugepage
for the first time: we want hugetlb_fault/hugetlb_no_page to map it with
a PUD. By going through the sizes in decreasing order, we will find that
PUD_SIZE works before finding out that PMD_SIZE or PAGE_SIZE work too.
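For illustration only, a rough sketch (not part of this patch) of how a fault
path could walk the sorted hstates and stop at the first size that fits;
fits_in_vma() is a made-up predicate standing in for the real alignment and
length checks:

/* Sketch: assumes hstates[] is sorted from largest to smallest. */
static struct hstate *pick_largest_fitting_hstate(struct vm_area_struct *vma,
						  unsigned long addr)
{
	struct hstate *h;

	for_each_hstate(h) {		/* largest first after sorting */
		if (fits_in_vma(vma, addr, huge_page_size(h)))
			return h;	/* e.g. 1G before 2M before 4K */
	}
	return NULL;			/* no supported size fits */
}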

Signed-off-by: James Houghton <jthoughton@google.com>
---
 mm/hugetlb.c | 40 +++++++++++++++++++++++++++++++++++++---
 1 file changed, 37 insertions(+), 3 deletions(-)

Comments

Mina Almasry June 24, 2022, 6:51 p.m. UTC | #1
On Fri, Jun 24, 2022 at 10:37 AM James Houghton <jthoughton@google.com> wrote:
>
> When using HugeTLB high-granularity mapping, we need to go through the
> supported hugepage sizes in decreasing order so that we pick the largest
> size that works. Consider the case where we're faulting in a 1G hugepage
> for the first time: we want hugetlb_fault/hugetlb_no_page to map it with
> a PUD. By going through the sizes in decreasing order, we will find that
> PUD_SIZE works before finding out that PMD_SIZE or PAGE_SIZE work too.
>

Mostly nits:

Reviewed-by: Mina Almasry <almasrymina@google.com>

> Signed-off-by: James Houghton <jthoughton@google.com>
> ---
>  mm/hugetlb.c | 40 +++++++++++++++++++++++++++++++++++++---
>  1 file changed, 37 insertions(+), 3 deletions(-)
>
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index a57e1be41401..5df838d86f32 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -33,6 +33,7 @@
>  #include <linux/migrate.h>
>  #include <linux/nospec.h>
>  #include <linux/delayacct.h>
> +#include <linux/sort.h>
>
>  #include <asm/page.h>
>  #include <asm/pgalloc.h>
> @@ -48,6 +49,10 @@
>
>  int hugetlb_max_hstate __read_mostly;
>  unsigned int default_hstate_idx;
> +/*
> + * After hugetlb_init_hstates is called, hstates will be sorted from largest
> + * to smallest.
> + */
>  struct hstate hstates[HUGE_MAX_HSTATE];
>
>  #ifdef CONFIG_CMA
> @@ -3144,14 +3149,43 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
>         kfree(node_alloc_noretry);
>  }
>
> +static int compare_hstates_decreasing(const void *a, const void *b)
> +{
> +       const int shift_a = huge_page_shift((const struct hstate *)a);
> +       const int shift_b = huge_page_shift((const struct hstate *)b);
> +
> +       if (shift_a < shift_b)
> +               return 1;
> +       if (shift_a > shift_b)
> +               return -1;
> +       return 0;
> +}
> +
> +static void sort_hstates(void)

Maybe sort_hstates_descending(void) for extra clarity.

> +{
> +       unsigned long default_hstate_sz = huge_page_size(&default_hstate);
> +
> +       /* Sort from largest to smallest. */

I'd remove this redundant comment; it's somewhat obvious what the next
line does.

> +       sort(hstates, hugetlb_max_hstate, sizeof(*hstates),
> +            compare_hstates_decreasing, NULL);
> +
> +       /*
> +        * We may have changed the location of the default hstate, so we need to
> +        * update it.
> +        */
> +       default_hstate_idx = hstate_index(size_to_hstate(default_hstate_sz));
> +}
> +
>  static void __init hugetlb_init_hstates(void)
>  {
>         struct hstate *h, *h2;
>
> -       for_each_hstate(h) {
> -               if (minimum_order > huge_page_order(h))
> -                       minimum_order = huge_page_order(h);
> +       sort_hstates();
>
> +       /* The last hstate is now the smallest. */

Same, given that above is sort_hstates().

> +       minimum_order = huge_page_order(&hstates[hugetlb_max_hstate - 1]);
> +
> +       for_each_hstate(h) {
>                 /* oversize hugepages were init'ed in early boot */
>                 if (!hstate_is_gigantic(h))
>                         hugetlb_hstate_alloc_pages(h);
> --
> 2.37.0.rc0.161.g10f37bed90-goog
>
Manish June 27, 2022, 12:08 p.m. UTC | #2
On 24/06/22 11:06 pm, James Houghton wrote:
> When using HugeTLB high-granularity mapping, we need to go through the
> supported hugepage sizes in decreasing order so that we pick the largest
> size that works. Consider the case where we're faulting in a 1G hugepage
> for the first time: we want hugetlb_fault/hugetlb_no_page to map it with
> a PUD. By going through the sizes in decreasing order, we will find that
> PUD_SIZE works before finding out that PMD_SIZE or PAGE_SIZE work too.
>
> Signed-off-by: James Houghton <jthoughton@google.com>
> ---
>   mm/hugetlb.c | 40 +++++++++++++++++++++++++++++++++++++---
>   1 file changed, 37 insertions(+), 3 deletions(-)
>
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index a57e1be41401..5df838d86f32 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -33,6 +33,7 @@
>   #include <linux/migrate.h>
>   #include <linux/nospec.h>
>   #include <linux/delayacct.h>
> +#include <linux/sort.h>
>   
>   #include <asm/page.h>
>   #include <asm/pgalloc.h>
> @@ -48,6 +49,10 @@
>   
>   int hugetlb_max_hstate __read_mostly;
>   unsigned int default_hstate_idx;
> +/*
> + * After hugetlb_init_hstates is called, hstates will be sorted from largest
> + * to smallest.
> + */
>   struct hstate hstates[HUGE_MAX_HSTATE];
>   
>   #ifdef CONFIG_CMA
> @@ -3144,14 +3149,43 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
>   	kfree(node_alloc_noretry);
>   }
>   
> +static int compare_hstates_decreasing(const void *a, const void *b)
> +{
> +	const int shift_a = huge_page_shift((const struct hstate *)a);
> +	const int shift_b = huge_page_shift((const struct hstate *)b);
> +
> +	if (shift_a < shift_b)
> +		return 1;
> +	if (shift_a > shift_b)
> +		return -1;
> +	return 0;
> +}
> +
> +static void sort_hstates(void)
> +{
> +	unsigned long default_hstate_sz = huge_page_size(&default_hstate);
> +
> +	/* Sort from largest to smallest. */
> +	sort(hstates, hugetlb_max_hstate, sizeof(*hstates),
> +	     compare_hstates_decreasing, NULL);
> +
> +	/*
> +	 * We may have changed the location of the default hstate, so we need to
> +	 * update it.
> +	 */
> +	default_hstate_idx = hstate_index(size_to_hstate(default_hstate_sz));
> +}
> +
>   static void __init hugetlb_init_hstates(void)
>   {
>   	struct hstate *h, *h2;
>   
> -	for_each_hstate(h) {
> -		if (minimum_order > huge_page_order(h))
> -			minimum_order = huge_page_order(h);
> +	sort_hstates();
>   
> +	/* The last hstate is now the smallest. */
> +	minimum_order = huge_page_order(&hstates[hugetlb_max_hstate - 1]);
> +
> +	for_each_hstate(h) {
>   		/* oversize hugepages were init'ed in early boot */
>   		if (!hstate_is_gigantic(h))
>   			hugetlb_hstate_alloc_pages(h);

Now that the hstates are ordered, can the code which calculates demote_order
be optimised too? I mean, can it just use the order of the hstate at the next
index, as in the sketch below?
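
A rough sketch of that idea, ignoring the gigantic-page and CMA restrictions
the existing demote_order code also has to respect:

	int i;

	/* With hstates[] sorted largest to smallest, demote to the next entry. */
	for (i = 0; i < hugetlb_max_hstate - 1; i++)
		hstates[i].demote_order = huge_page_order(&hstates[i + 1]);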
Mike Kravetz June 27, 2022, 6:42 p.m. UTC | #3
On 06/24/22 17:36, James Houghton wrote:
> When using HugeTLB high-granularity mapping, we need to go through the
> supported hugepage sizes in decreasing order so that we pick the largest
> size that works. Consider the case where we're faulting in a 1G hugepage
> for the first time: we want hugetlb_fault/hugetlb_no_page to map it with
> a PUD. By going through the sizes in decreasing order, we will find that
> PUD_SIZE works before finding out that PMD_SIZE or PAGE_SIZE work too.
> 
> Signed-off-by: James Houghton <jthoughton@google.com>
> ---
>  mm/hugetlb.c | 40 +++++++++++++++++++++++++++++++++++++---
>  1 file changed, 37 insertions(+), 3 deletions(-)
> 
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index a57e1be41401..5df838d86f32 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -33,6 +33,7 @@
>  #include <linux/migrate.h>
>  #include <linux/nospec.h>
>  #include <linux/delayacct.h>
> +#include <linux/sort.h>
>  
>  #include <asm/page.h>
>  #include <asm/pgalloc.h>
> @@ -48,6 +49,10 @@
>  
>  int hugetlb_max_hstate __read_mostly;
>  unsigned int default_hstate_idx;
> +/*
> + * After hugetlb_init_hstates is called, hstates will be sorted from largest
> + * to smallest.
> + */
>  struct hstate hstates[HUGE_MAX_HSTATE];
>  
>  #ifdef CONFIG_CMA
> @@ -3144,14 +3149,43 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
>  	kfree(node_alloc_noretry);
>  }
>  
> +static int compare_hstates_decreasing(const void *a, const void *b)
> +{
> +	const int shift_a = huge_page_shift((const struct hstate *)a);
> +	const int shift_b = huge_page_shift((const struct hstate *)b);
> +
> +	if (shift_a < shift_b)
> +		return 1;
> +	if (shift_a > shift_b)
> +		return -1;
> +	return 0;
> +}
> +
> +static void sort_hstates(void)
> +{
> +	unsigned long default_hstate_sz = huge_page_size(&default_hstate);
> +
> +	/* Sort from largest to smallest. */
> +	sort(hstates, hugetlb_max_hstate, sizeof(*hstates),
> +	     compare_hstates_decreasing, NULL);
> +
> +	/*
> +	 * We may have changed the location of the default hstate, so we need to
> +	 * update it.
> +	 */
> +	default_hstate_idx = hstate_index(size_to_hstate(default_hstate_sz));
> +}
> +
>  static void __init hugetlb_init_hstates(void)
>  {
>  	struct hstate *h, *h2;
>  
> -	for_each_hstate(h) {
> -		if (minimum_order > huge_page_order(h))
> -			minimum_order = huge_page_order(h);
> +	sort_hstates();
>  
> +	/* The last hstate is now the smallest. */
> +	minimum_order = huge_page_order(&hstates[hugetlb_max_hstate - 1]);
> +
> +	for_each_hstate(h) {
>  		/* oversize hugepages were init'ed in early boot */
>  		if (!hstate_is_gigantic(h))
>  			hugetlb_hstate_alloc_pages(h);

This may/will cause problems for gigantic hugetlb pages allocated at boot
time.  See alloc_bootmem_huge_page() where a pointer to the associated hstate
is encoded within the allocated hugetlb page.  These pages are added to
hugetlb pools by the routine gather_bootmem_prealloc() which uses the saved
hstate to prep the gigantic page and add it to the correct pool.  Currently,
gather_bootmem_prealloc is called after hugetlb_init_hstates.  So, changing
hstate order will cause errors.

I do not see any reason why we could not call gather_bootmem_prealloc before
hugetlb_init_hstates to avoid this issue.
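
Roughly, a sketch of the suggested reordering (not code from this series;
hugetlb_init() is shown with its other steps elided):

static int __init hugetlb_init(void)
{
	/* ... existing setup ... */
	gather_bootmem_prealloc();	/* while the saved hstate pointers are still valid */
	hugetlb_init_hstates();		/* now safe to sort hstates[] */
	report_hugepages();
	/* ... remaining setup ... */
	return 0;
}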
James Houghton June 28, 2022, 3:35 p.m. UTC | #4
On Mon, Jun 27, 2022 at 5:09 AM manish.mishra <manish.mishra@nutanix.com> wrote:
>
>
> On 24/06/22 11:06 pm, James Houghton wrote:
> > When using HugeTLB high-granularity mapping, we need to go through the
> > supported hugepage sizes in decreasing order so that we pick the largest
> > size that works. Consider the case where we're faulting in a 1G hugepage
> > for the first time: we want hugetlb_fault/hugetlb_no_page to map it with
> > a PUD. By going through the sizes in decreasing order, we will find that
> > PUD_SIZE works before finding out that PMD_SIZE or PAGE_SIZE work too.
> >
> > Signed-off-by: James Houghton <jthoughton@google.com>
> > ---
> >   mm/hugetlb.c | 40 +++++++++++++++++++++++++++++++++++++---
> >   1 file changed, 37 insertions(+), 3 deletions(-)
> >
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index a57e1be41401..5df838d86f32 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -33,6 +33,7 @@
> >   #include <linux/migrate.h>
> >   #include <linux/nospec.h>
> >   #include <linux/delayacct.h>
> > +#include <linux/sort.h>
> >
> >   #include <asm/page.h>
> >   #include <asm/pgalloc.h>
> > @@ -48,6 +49,10 @@
> >
> >   int hugetlb_max_hstate __read_mostly;
> >   unsigned int default_hstate_idx;
> > +/*
> > + * After hugetlb_init_hstates is called, hstates will be sorted from largest
> > + * to smallest.
> > + */
> >   struct hstate hstates[HUGE_MAX_HSTATE];
> >
> >   #ifdef CONFIG_CMA
> > @@ -3144,14 +3149,43 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
> >       kfree(node_alloc_noretry);
> >   }
> >
> > +static int compare_hstates_decreasing(const void *a, const void *b)
> > +{
> > +     const int shift_a = huge_page_shift((const struct hstate *)a);
> > +     const int shift_b = huge_page_shift((const struct hstate *)b);
> > +
> > +     if (shift_a < shift_b)
> > +             return 1;
> > +     if (shift_a > shift_b)
> > +             return -1;
> > +     return 0;
> > +}
> > +
> > +static void sort_hstates(void)
> > +{
> > +     unsigned long default_hstate_sz = huge_page_size(&default_hstate);
> > +
> > +     /* Sort from largest to smallest. */
> > +     sort(hstates, hugetlb_max_hstate, sizeof(*hstates),
> > +          compare_hstates_decreasing, NULL);
> > +
> > +     /*
> > +      * We may have changed the location of the default hstate, so we need to
> > +      * update it.
> > +      */
> > +     default_hstate_idx = hstate_index(size_to_hstate(default_hstate_sz));
> > +}
> > +
> >   static void __init hugetlb_init_hstates(void)
> >   {
> >       struct hstate *h, *h2;
> >
> > -     for_each_hstate(h) {
> > -             if (minimum_order > huge_page_order(h))
> > -                     minimum_order = huge_page_order(h);
> > +     sort_hstates();
> >
> > +     /* The last hstate is now the smallest. */
> > +     minimum_order = huge_page_order(&hstates[hugetlb_max_hstate - 1]);
> > +
> > +     for_each_hstate(h) {
> >               /* oversize hugepages were init'ed in early boot */
> >               if (!hstate_is_gigantic(h))
> >                       hugetlb_hstate_alloc_pages(h);
>
> As now hstates are ordered can code which does calculation of demot_order
>
> can too be optimised, i mean it can be value of order of hstate at next index?
>

Indeed -- thanks for catching that. I'll make this optimization for
the next version of this series.

>
James Houghton June 28, 2022, 3:40 p.m. UTC | #5
On Mon, Jun 27, 2022 at 11:42 AM Mike Kravetz <mike.kravetz@oracle.com> wrote:
>
> On 06/24/22 17:36, James Houghton wrote:
> > When using HugeTLB high-granularity mapping, we need to go through the
> > supported hugepage sizes in decreasing order so that we pick the largest
> > size that works. Consider the case where we're faulting in a 1G hugepage
> > for the first time: we want hugetlb_fault/hugetlb_no_page to map it with
> > a PUD. By going through the sizes in decreasing order, we will find that
> > PUD_SIZE works before finding out that PMD_SIZE or PAGE_SIZE work too.
> >
> > Signed-off-by: James Houghton <jthoughton@google.com>
> > ---
> >  mm/hugetlb.c | 40 +++++++++++++++++++++++++++++++++++++---
> >  1 file changed, 37 insertions(+), 3 deletions(-)
> >
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index a57e1be41401..5df838d86f32 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -33,6 +33,7 @@
> >  #include <linux/migrate.h>
> >  #include <linux/nospec.h>
> >  #include <linux/delayacct.h>
> > +#include <linux/sort.h>
> >
> >  #include <asm/page.h>
> >  #include <asm/pgalloc.h>
> > @@ -48,6 +49,10 @@
> >
> >  int hugetlb_max_hstate __read_mostly;
> >  unsigned int default_hstate_idx;
> > +/*
> > + * After hugetlb_init_hstates is called, hstates will be sorted from largest
> > + * to smallest.
> > + */
> >  struct hstate hstates[HUGE_MAX_HSTATE];
> >
> >  #ifdef CONFIG_CMA
> > @@ -3144,14 +3149,43 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
> >       kfree(node_alloc_noretry);
> >  }
> >
> > +static int compare_hstates_decreasing(const void *a, const void *b)
> > +{
> > +     const int shift_a = huge_page_shift((const struct hstate *)a);
> > +     const int shift_b = huge_page_shift((const struct hstate *)b);
> > +
> > +     if (shift_a < shift_b)
> > +             return 1;
> > +     if (shift_a > shift_b)
> > +             return -1;
> > +     return 0;
> > +}
> > +
> > +static void sort_hstates(void)
> > +{
> > +     unsigned long default_hstate_sz = huge_page_size(&default_hstate);
> > +
> > +     /* Sort from largest to smallest. */
> > +     sort(hstates, hugetlb_max_hstate, sizeof(*hstates),
> > +          compare_hstates_decreasing, NULL);
> > +
> > +     /*
> > +      * We may have changed the location of the default hstate, so we need to
> > +      * update it.
> > +      */
> > +     default_hstate_idx = hstate_index(size_to_hstate(default_hstate_sz));
> > +}
> > +
> >  static void __init hugetlb_init_hstates(void)
> >  {
> >       struct hstate *h, *h2;
> >
> > -     for_each_hstate(h) {
> > -             if (minimum_order > huge_page_order(h))
> > -                     minimum_order = huge_page_order(h);
> > +     sort_hstates();
> >
> > +     /* The last hstate is now the smallest. */
> > +     minimum_order = huge_page_order(&hstates[hugetlb_max_hstate - 1]);
> > +
> > +     for_each_hstate(h) {
> >               /* oversize hugepages were init'ed in early boot */
> >               if (!hstate_is_gigantic(h))
> >                       hugetlb_hstate_alloc_pages(h);
>
> This may/will cause problems for gigantic hugetlb pages allocated at boot
> time.  See alloc_bootmem_huge_page() where a pointer to the associated hstate
> is encoded within the allocated hugetlb page.  These pages are added to
> hugetlb pools by the routine gather_bootmem_prealloc() which uses the saved
> hstate to add prep the gigantic page and add to the correct pool.  Currently,
> gather_bootmem_prealloc is called after hugetlb_init_hstates.  So, changing
> hstate order will cause errors.
>
> I do not see any reason why we could not call gather_bootmem_prealloc before
> hugetlb_init_hstates to avoid this issue.

Thanks for catching this, Mike. Your suggestion certainly seems to
work, but it also seems kind of error prone. I'll have to look at the
code more closely, but maybe it would be better if I just maintained a
separate `struct hstate *sorted_hstate_ptrs[]`, where the original
locations of the hstates remain unchanged, so as not to break
gather_bootmem_prealloc or other things (sketched below).
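
A hedged sketch of that alternative (not implemented in this series):

static struct hstate *sorted_hstate_ptrs[HUGE_MAX_HSTATE];

static int compare_hstate_ptrs_decreasing(const void *a, const void *b)
{
	const struct hstate *ha = *(const struct hstate * const *)a;
	const struct hstate *hb = *(const struct hstate * const *)b;

	if (huge_page_shift(ha) < huge_page_shift(hb))
		return 1;
	if (huge_page_shift(ha) > huge_page_shift(hb))
		return -1;
	return 0;
}

static void __init build_sorted_hstate_ptrs(void)
{
	int i;

	for (i = 0; i < hugetlb_max_hstate; i++)
		sorted_hstate_ptrs[i] = &hstates[i];

	/* hstates[] itself is left untouched; only the pointers are sorted. */
	sort(sorted_hstate_ptrs, hugetlb_max_hstate, sizeof(*sorted_hstate_ptrs),
	     compare_hstate_ptrs_decreasing, NULL);
}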

> --
> Mike Kravetz
Muchun Song June 29, 2022, 6:39 a.m. UTC | #6
On Tue, Jun 28, 2022 at 08:40:27AM -0700, James Houghton wrote:
> On Mon, Jun 27, 2022 at 11:42 AM Mike Kravetz <mike.kravetz@oracle.com> wrote:
> >
> > On 06/24/22 17:36, James Houghton wrote:
> > > When using HugeTLB high-granularity mapping, we need to go through the
> > > supported hugepage sizes in decreasing order so that we pick the largest
> > > size that works. Consider the case where we're faulting in a 1G hugepage
> > > for the first time: we want hugetlb_fault/hugetlb_no_page to map it with
> > > a PUD. By going through the sizes in decreasing order, we will find that
> > > PUD_SIZE works before finding out that PMD_SIZE or PAGE_SIZE work too.
> > >
> > > Signed-off-by: James Houghton <jthoughton@google.com>
> > > ---
> > >  mm/hugetlb.c | 40 +++++++++++++++++++++++++++++++++++++---
> > >  1 file changed, 37 insertions(+), 3 deletions(-)
> > >
> > > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > > index a57e1be41401..5df838d86f32 100644
> > > --- a/mm/hugetlb.c
> > > +++ b/mm/hugetlb.c
> > > @@ -33,6 +33,7 @@
> > >  #include <linux/migrate.h>
> > >  #include <linux/nospec.h>
> > >  #include <linux/delayacct.h>
> > > +#include <linux/sort.h>
> > >
> > >  #include <asm/page.h>
> > >  #include <asm/pgalloc.h>
> > > @@ -48,6 +49,10 @@
> > >
> > >  int hugetlb_max_hstate __read_mostly;
> > >  unsigned int default_hstate_idx;
> > > +/*
> > > + * After hugetlb_init_hstates is called, hstates will be sorted from largest
> > > + * to smallest.
> > > + */
> > >  struct hstate hstates[HUGE_MAX_HSTATE];
> > >
> > >  #ifdef CONFIG_CMA
> > > @@ -3144,14 +3149,43 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
> > >       kfree(node_alloc_noretry);
> > >  }
> > >
> > > +static int compare_hstates_decreasing(const void *a, const void *b)
> > > +{
> > > +     const int shift_a = huge_page_shift((const struct hstate *)a);
> > > +     const int shift_b = huge_page_shift((const struct hstate *)b);
> > > +
> > > +     if (shift_a < shift_b)
> > > +             return 1;
> > > +     if (shift_a > shift_b)
> > > +             return -1;
> > > +     return 0;
> > > +}
> > > +
> > > +static void sort_hstates(void)
> > > +{
> > > +     unsigned long default_hstate_sz = huge_page_size(&default_hstate);
> > > +
> > > +     /* Sort from largest to smallest. */
> > > +     sort(hstates, hugetlb_max_hstate, sizeof(*hstates),
> > > +          compare_hstates_decreasing, NULL);
> > > +
> > > +     /*
> > > +      * We may have changed the location of the default hstate, so we need to
> > > +      * update it.
> > > +      */
> > > +     default_hstate_idx = hstate_index(size_to_hstate(default_hstate_sz));
> > > +}
> > > +
> > >  static void __init hugetlb_init_hstates(void)
> > >  {
> > >       struct hstate *h, *h2;
> > >
> > > -     for_each_hstate(h) {
> > > -             if (minimum_order > huge_page_order(h))
> > > -                     minimum_order = huge_page_order(h);
> > > +     sort_hstates();
> > >
> > > +     /* The last hstate is now the smallest. */
> > > +     minimum_order = huge_page_order(&hstates[hugetlb_max_hstate - 1]);
> > > +
> > > +     for_each_hstate(h) {
> > >               /* oversize hugepages were init'ed in early boot */
> > >               if (!hstate_is_gigantic(h))
> > >                       hugetlb_hstate_alloc_pages(h);
> >
> > This may/will cause problems for gigantic hugetlb pages allocated at boot
> > time.  See alloc_bootmem_huge_page() where a pointer to the associated hstate
> > is encoded within the allocated hugetlb page.  These pages are added to
> > hugetlb pools by the routine gather_bootmem_prealloc() which uses the saved
> > hstate to add prep the gigantic page and add to the correct pool.  Currently,
> > gather_bootmem_prealloc is called after hugetlb_init_hstates.  So, changing
> > hstate order will cause errors.
> >
> > I do not see any reason why we could not call gather_bootmem_prealloc before
> > hugetlb_init_hstates to avoid this issue.
> 
> Thanks for catching this, Mike. Your suggestion certainly seems to
> work, but it also seems kind of error prone. I'll have to look at the
> code more closely, but maybe it would be better if I just maintained a
> separate `struct hstate *sorted_hstate_ptrs[]`, where the original

I don't think this is a good idea.  If you really rely on the order of
the initialization in this patch, the easier solution is changing
huge_bootmem_page->hstate to huge_bootmem_page->hugepagesz. Then we
can use size_to_hstate(huge_bootmem_page->hugepagesz) in
gather_bootmem_prealloc(), roughly as sketched below.
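
A hedged sketch of that change (field name per the suggestion; the prep and
pool-accounting logic is elided):

struct huge_bootmem_page {
	struct list_head list;
	unsigned long hugepagesz;	/* was: struct hstate *hstate */
};

static void __init gather_bootmem_prealloc(void)
{
	struct huge_bootmem_page *m;

	list_for_each_entry(m, &huge_boot_pages, list) {
		struct hstate *h = size_to_hstate(m->hugepagesz);

		/* ... prep the gigantic page and add it to h's pool ... */
	}
}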

Thanks.

> locations of the hstates remain unchanged, as to not break
> gather_bootmem_prealloc/other things.
> 
> > --
> > Mike Kravetz
>
Mike Kravetz June 29, 2022, 9:06 p.m. UTC | #7
On 06/29/22 14:39, Muchun Song wrote:
> On Tue, Jun 28, 2022 at 08:40:27AM -0700, James Houghton wrote:
> > On Mon, Jun 27, 2022 at 11:42 AM Mike Kravetz <mike.kravetz@oracle.com> wrote:
> > >
> > > On 06/24/22 17:36, James Houghton wrote:
> > > > When using HugeTLB high-granularity mapping, we need to go through the
> > > > supported hugepage sizes in decreasing order so that we pick the largest
> > > > size that works. Consider the case where we're faulting in a 1G hugepage
> > > > for the first time: we want hugetlb_fault/hugetlb_no_page to map it with
> > > > a PUD. By going through the sizes in decreasing order, we will find that
> > > > PUD_SIZE works before finding out that PMD_SIZE or PAGE_SIZE work too.
> > > >
> > >
> > > This may/will cause problems for gigantic hugetlb pages allocated at boot
> > > time.  See alloc_bootmem_huge_page() where a pointer to the associated hstate
> > > is encoded within the allocated hugetlb page.  These pages are added to
> > > hugetlb pools by the routine gather_bootmem_prealloc() which uses the saved
> > > hstate to add prep the gigantic page and add to the correct pool.  Currently,
> > > gather_bootmem_prealloc is called after hugetlb_init_hstates.  So, changing
> > > hstate order will cause errors.
> > >
> > > I do not see any reason why we could not call gather_bootmem_prealloc before
> > > hugetlb_init_hstates to avoid this issue.
> > 
> > Thanks for catching this, Mike. Your suggestion certainly seems to
> > work, but it also seems kind of error prone. I'll have to look at the
> > code more closely, but maybe it would be better if I just maintained a
> > separate `struct hstate *sorted_hstate_ptrs[]`, where the original
> 
> I don't think this is a good idea.  If you really rely on the order of
> the initialization in this patch.  The easier solution is changing
> huge_bootmem_page->hstate to huge_bootmem_page->hugepagesz. Then we
> can use size_to_hstate(huge_bootmem_page->hugepagesz) in
> gather_bootmem_prealloc().
> 

That is a much better solution.  Thanks Muchun!
James Houghton June 29, 2022, 9:13 p.m. UTC | #8
On Wed, Jun 29, 2022 at 2:06 PM Mike Kravetz <mike.kravetz@oracle.com> wrote:
>
> On 06/29/22 14:39, Muchun Song wrote:
> > On Tue, Jun 28, 2022 at 08:40:27AM -0700, James Houghton wrote:
> > > On Mon, Jun 27, 2022 at 11:42 AM Mike Kravetz <mike.kravetz@oracle.com> wrote:
> > > >
> > > > On 06/24/22 17:36, James Houghton wrote:
> > > > > When using HugeTLB high-granularity mapping, we need to go through the
> > > > > supported hugepage sizes in decreasing order so that we pick the largest
> > > > > size that works. Consider the case where we're faulting in a 1G hugepage
> > > > > for the first time: we want hugetlb_fault/hugetlb_no_page to map it with
> > > > > a PUD. By going through the sizes in decreasing order, we will find that
> > > > > PUD_SIZE works before finding out that PMD_SIZE or PAGE_SIZE work too.
> > > > >
> > > >
> > > > This may/will cause problems for gigantic hugetlb pages allocated at boot
> > > > time.  See alloc_bootmem_huge_page() where a pointer to the associated hstate
> > > > is encoded within the allocated hugetlb page.  These pages are added to
> > > > hugetlb pools by the routine gather_bootmem_prealloc() which uses the saved
> > > > hstate to add prep the gigantic page and add to the correct pool.  Currently,
> > > > gather_bootmem_prealloc is called after hugetlb_init_hstates.  So, changing
> > > > hstate order will cause errors.
> > > >
> > > > I do not see any reason why we could not call gather_bootmem_prealloc before
> > > > hugetlb_init_hstates to avoid this issue.
> > >
> > > Thanks for catching this, Mike. Your suggestion certainly seems to
> > > work, but it also seems kind of error prone. I'll have to look at the
> > > code more closely, but maybe it would be better if I just maintained a
> > > separate `struct hstate *sorted_hstate_ptrs[]`, where the original
> >
> > I don't think this is a good idea.  If you really rely on the order of
> > the initialization in this patch.  The easier solution is changing
> > huge_bootmem_page->hstate to huge_bootmem_page->hugepagesz. Then we
> > can use size_to_hstate(huge_bootmem_page->hugepagesz) in
> > gather_bootmem_prealloc().
> >
>
> That is a much better solution.  Thanks Muchun!

Indeed. Thank you, Muchun. :)

>
> --
> Mike Kravetz

Patch

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a57e1be41401..5df838d86f32 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -33,6 +33,7 @@ 
 #include <linux/migrate.h>
 #include <linux/nospec.h>
 #include <linux/delayacct.h>
+#include <linux/sort.h>
 
 #include <asm/page.h>
 #include <asm/pgalloc.h>
@@ -48,6 +49,10 @@ 
 
 int hugetlb_max_hstate __read_mostly;
 unsigned int default_hstate_idx;
+/*
+ * After hugetlb_init_hstates is called, hstates will be sorted from largest
+ * to smallest.
+ */
 struct hstate hstates[HUGE_MAX_HSTATE];
 
 #ifdef CONFIG_CMA
@@ -3144,14 +3149,43 @@  static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
 	kfree(node_alloc_noretry);
 }
 
+static int compare_hstates_decreasing(const void *a, const void *b)
+{
+	const int shift_a = huge_page_shift((const struct hstate *)a);
+	const int shift_b = huge_page_shift((const struct hstate *)b);
+
+	if (shift_a < shift_b)
+		return 1;
+	if (shift_a > shift_b)
+		return -1;
+	return 0;
+}
+
+static void sort_hstates(void)
+{
+	unsigned long default_hstate_sz = huge_page_size(&default_hstate);
+
+	/* Sort from largest to smallest. */
+	sort(hstates, hugetlb_max_hstate, sizeof(*hstates),
+	     compare_hstates_decreasing, NULL);
+
+	/*
+	 * We may have changed the location of the default hstate, so we need to
+	 * update it.
+	 */
+	default_hstate_idx = hstate_index(size_to_hstate(default_hstate_sz));
+}
+
 static void __init hugetlb_init_hstates(void)
 {
 	struct hstate *h, *h2;
 
-	for_each_hstate(h) {
-		if (minimum_order > huge_page_order(h))
-			minimum_order = huge_page_order(h);
+	sort_hstates();
 
+	/* The last hstate is now the smallest. */
+	minimum_order = huge_page_order(&hstates[hugetlb_max_hstate - 1]);
+
+	for_each_hstate(h) {
 		/* oversize hugepages were init'ed in early boot */
 		if (!hstate_is_gigantic(h))
 			hugetlb_hstate_alloc_pages(h);