| Message ID | 20220624173656.2033256-3-jthoughton@google.com (mailing list archive) |
|---|---|
| State | New |
| Series | hugetlb: Introduce HugeTLB high-granularity mapping |
On Fri, Jun 24, 2022 at 10:37 AM James Houghton <jthoughton@google.com> wrote:
>
> When using HugeTLB high-granularity mapping, we need to go through the
> supported hugepage sizes in decreasing order so that we pick the largest
> size that works. Consider the case where we're faulting in a 1G hugepage
> for the first time: we want hugetlb_fault/hugetlb_no_page to map it with
> a PUD. By going through the sizes in decreasing order, we will find that
> PUD_SIZE works before finding out that PMD_SIZE or PAGE_SIZE work too.
>

Mostly nits:

Reviewed-by: Mina Almasry <almasrymina@google.com>

> Signed-off-by: James Houghton <jthoughton@google.com>
> ---
>  mm/hugetlb.c | 40 +++++++++++++++++++++++++++++++++++++---
>  1 file changed, 37 insertions(+), 3 deletions(-)
>
[...]
>
> +static void sort_hstates(void)

Maybe sort_hstates_descending(void) for extra clarity.

> +{
> +	unsigned long default_hstate_sz = huge_page_size(&default_hstate);
> +
> +	/* Sort from largest to smallest. */

I'd remove this redundant comment; it's somewhat obvious what the next
line does.

> +	sort(hstates, hugetlb_max_hstate, sizeof(*hstates),
> +	     compare_hstates_decreasing, NULL);
> +
> +	/*
> +	 * We may have changed the location of the default hstate, so we need to
> +	 * update it.
> +	 */
> +	default_hstate_idx = hstate_index(size_to_hstate(default_hstate_sz));
> +}
> +
>  static void __init hugetlb_init_hstates(void)
>  {
>  	struct hstate *h, *h2;
>
> -	for_each_hstate(h) {
> -		if (minimum_order > huge_page_order(h))
> -			minimum_order = huge_page_order(h);
> +	sort_hstates();
>
> +	/* The last hstate is now the smallest. */

Same, given that above is sort_hstates().

> +	minimum_order = huge_page_order(&hstates[hugetlb_max_hstate - 1]);
> +
> +	for_each_hstate(h) {
>  		/* oversize hugepages were init'ed in early boot */
>  		if (!hstate_is_gigantic(h))
>  			hugetlb_hstate_alloc_pages(h);
> --
> 2.37.0.rc0.161.g10f37bed90-goog
>
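For reference, a sketch of the helper with those nits applied (the rename and the dropped comments are Mina's suggestions above, not code from the series):

```c
/* Sketch only: sort_hstates() from the patch, with the nits applied. */
static void sort_hstates_descending(void)
{
	unsigned long default_hstate_sz = huge_page_size(&default_hstate);

	sort(hstates, hugetlb_max_hstate, sizeof(*hstates),
	     compare_hstates_decreasing, NULL);

	/*
	 * We may have changed the location of the default hstate, so we
	 * need to update it.
	 */
	default_hstate_idx = hstate_index(size_to_hstate(default_hstate_sz));
}
```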
On 24/06/22 11:06 pm, James Houghton wrote:
> When using HugeTLB high-granularity mapping, we need to go through the
> supported hugepage sizes in decreasing order so that we pick the largest
> size that works. Consider the case where we're faulting in a 1G hugepage
> for the first time: we want hugetlb_fault/hugetlb_no_page to map it with
> a PUD. By going through the sizes in decreasing order, we will find that
> PUD_SIZE works before finding out that PMD_SIZE or PAGE_SIZE work too.
>
> Signed-off-by: James Houghton <jthoughton@google.com>
> ---
[...]
>  static void __init hugetlb_init_hstates(void)
>  {
>  	struct hstate *h, *h2;
>
> -	for_each_hstate(h) {
> -		if (minimum_order > huge_page_order(h))
> -			minimum_order = huge_page_order(h);
> +	sort_hstates();
>
> +	/* The last hstate is now the smallest. */
> +	minimum_order = huge_page_order(&hstates[hugetlb_max_hstate - 1]);
> +
> +	for_each_hstate(h) {
>  		/* oversize hugepages were init'ed in early boot */
>  		if (!hstate_is_gigantic(h))
>  			hugetlb_hstate_alloc_pages(h);

Now that the hstates are ordered, can the code that calculates
demote_order be optimized too? I mean, it could simply be the order of
the hstate at the next index.
On 06/24/22 17:36, James Houghton wrote:
> When using HugeTLB high-granularity mapping, we need to go through the
> supported hugepage sizes in decreasing order so that we pick the largest
> size that works. Consider the case where we're faulting in a 1G hugepage
> for the first time: we want hugetlb_fault/hugetlb_no_page to map it with
> a PUD. By going through the sizes in decreasing order, we will find that
> PUD_SIZE works before finding out that PMD_SIZE or PAGE_SIZE work too.
>
> Signed-off-by: James Houghton <jthoughton@google.com>
> ---
[...]
>  static void __init hugetlb_init_hstates(void)
>  {
>  	struct hstate *h, *h2;
>
> -	for_each_hstate(h) {
> -		if (minimum_order > huge_page_order(h))
> -			minimum_order = huge_page_order(h);
> +	sort_hstates();
>
> +	/* The last hstate is now the smallest. */
> +	minimum_order = huge_page_order(&hstates[hugetlb_max_hstate - 1]);
> +
> +	for_each_hstate(h) {
>  		/* oversize hugepages were init'ed in early boot */
>  		if (!hstate_is_gigantic(h))
>  			hugetlb_hstate_alloc_pages(h);

This may/will cause problems for gigantic hugetlb pages allocated at
boot time. See alloc_bootmem_huge_page(), where a pointer to the
associated hstate is encoded within the allocated hugetlb page. These
pages are added to hugetlb pools by the routine
gather_bootmem_prealloc(), which uses the saved hstate to prep the
gigantic page and add it to the correct pool. Currently,
gather_bootmem_prealloc is called after hugetlb_init_hstates, so
changing hstate order will cause errors.

I do not see any reason why we could not call gather_bootmem_prealloc
before hugetlb_init_hstates to avoid this issue.
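To make the concern concrete, a sketch of the pieces involved (struct layout as in include/linux/hugetlb.h around this kernel; the reordered hugetlb_init() is only Mike's suggestion, not committed code):

```c
/*
 * Boot-allocated gigantic pages are tracked with a pointer into
 * hstates[]; sorting hstates[] would move the hstate out from under
 * this saved pointer.
 */
struct huge_bootmem_page {
	struct list_head list;
	struct hstate *hstate;
};

/* Mike's suggested ordering, sketched against hugetlb_init(): */
static int __init hugetlb_init(void)
{
	/* ... earlier checks and default-hstate setup unchanged ... */

	/* Consume the saved hstate pointers before sorting moves them. */
	gather_bootmem_prealloc();
	hugetlb_init_hstates();
	report_hugepages();

	/* ... remaining init unchanged ... */
	return 0;
}
```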
On Mon, Jun 27, 2022 at 5:09 AM manish.mishra <manish.mishra@nutanix.com> wrote:
>
> On 24/06/22 11:06 pm, James Houghton wrote:
> > When using HugeTLB high-granularity mapping, we need to go through the
> > supported hugepage sizes in decreasing order so that we pick the largest
> > size that works. Consider the case where we're faulting in a 1G hugepage
> > for the first time: we want hugetlb_fault/hugetlb_no_page to map it with
> > a PUD. By going through the sizes in decreasing order, we will find that
> > PUD_SIZE works before finding out that PMD_SIZE or PAGE_SIZE work too.
[...]
> > +	for_each_hstate(h) {
> >  		/* oversize hugepages were init'ed in early boot */
> >  		if (!hstate_is_gigantic(h))
> >  			hugetlb_hstate_alloc_pages(h);
>
> Now that the hstates are ordered, can the code that calculates
> demote_order be optimized too? I mean, it could simply be the order of
> the hstate at the next index.

Indeed -- thanks for catching that. I'll make this optimization for the
next version of this series.
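A minimal sketch of that optimization, assuming hstates[] is already sorted in decreasing order (set_demote_orders is a hypothetical name, and real code would still have to skip the gigantic-page and CMA cases that the existing demote_order setup checks for):

```c
/*
 * Hypothetical helper: with hstates[] sorted from largest to smallest,
 * the best demote target for hstates[i] is simply hstates[i + 1], so
 * no inner search over all hstates is needed.
 */
static void __init set_demote_orders(void)
{
	int i;

	for (i = 0; i < hugetlb_max_hstate - 1; i++)
		hstates[i].demote_order = huge_page_order(&hstates[i + 1]);
}
```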
On Mon, Jun 27, 2022 at 11:42 AM Mike Kravetz <mike.kravetz@oracle.com> wrote:
>
> On 06/24/22 17:36, James Houghton wrote:
> > When using HugeTLB high-granularity mapping, we need to go through the
> > supported hugepage sizes in decreasing order so that we pick the largest
> > size that works. Consider the case where we're faulting in a 1G hugepage
> > for the first time: we want hugetlb_fault/hugetlb_no_page to map it with
> > a PUD. By going through the sizes in decreasing order, we will find that
> > PUD_SIZE works before finding out that PMD_SIZE or PAGE_SIZE work too.
[...]
>
> This may/will cause problems for gigantic hugetlb pages allocated at
> boot time. See alloc_bootmem_huge_page(), where a pointer to the
> associated hstate is encoded within the allocated hugetlb page. These
> pages are added to hugetlb pools by the routine
> gather_bootmem_prealloc(), which uses the saved hstate to prep the
> gigantic page and add it to the correct pool. Currently,
> gather_bootmem_prealloc is called after hugetlb_init_hstates, so
> changing hstate order will cause errors.
>
> I do not see any reason why we could not call gather_bootmem_prealloc
> before hugetlb_init_hstates to avoid this issue.

Thanks for catching this, Mike. Your suggestion certainly seems to
work, but it also seems kind of error-prone. I'll have to look at the
code more closely, but maybe it would be better if I just maintained a
separate `struct hstate *sorted_hstate_ptrs[]`, where the original
locations of the hstates remain unchanged, so as not to break
gather_bootmem_prealloc/other things.

> --
> Mike Kravetz
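A sketch of the alternative being floated here (the array name comes from the message above; the helper names are hypothetical):

```c
/*
 * Leave hstates[] in place and sort an array of pointers instead, so
 * any saved struct hstate * (e.g. in struct huge_bootmem_page) stays
 * valid and default_hstate_idx needs no fixup.
 */
static struct hstate *sorted_hstate_ptrs[HUGE_MAX_HSTATE];

static int compare_hstate_ptrs_decreasing(const void *a, const void *b)
{
	const int shift_a = huge_page_shift(*(struct hstate * const *)a);
	const int shift_b = huge_page_shift(*(struct hstate * const *)b);

	/* A larger page shift sorts earlier. */
	return shift_b - shift_a;
}

static void sort_hstate_ptrs(void)
{
	int i;

	for (i = 0; i < hugetlb_max_hstate; i++)
		sorted_hstate_ptrs[i] = &hstates[i];

	sort(sorted_hstate_ptrs, hugetlb_max_hstate,
	     sizeof(*sorted_hstate_ptrs),
	     compare_hstate_ptrs_decreasing, NULL);
}
```

Call sites that want decreasing-size order would then walk sorted_hstate_ptrs[] instead of using for_each_hstate.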
On Tue, Jun 28, 2022 at 08:40:27AM -0700, James Houghton wrote:
> On Mon, Jun 27, 2022 at 11:42 AM Mike Kravetz <mike.kravetz@oracle.com> wrote:
> >
> > On 06/24/22 17:36, James Houghton wrote:
[...]
> > This may/will cause problems for gigantic hugetlb pages allocated at
> > boot time. See alloc_bootmem_huge_page(), where a pointer to the
> > associated hstate is encoded within the allocated hugetlb page. These
> > pages are added to hugetlb pools by the routine
> > gather_bootmem_prealloc(), which uses the saved hstate to prep the
> > gigantic page and add it to the correct pool. Currently,
> > gather_bootmem_prealloc is called after hugetlb_init_hstates, so
> > changing hstate order will cause errors.
> >
> > I do not see any reason why we could not call gather_bootmem_prealloc
> > before hugetlb_init_hstates to avoid this issue.
>
> Thanks for catching this, Mike. Your suggestion certainly seems to
> work, but it also seems kind of error-prone. I'll have to look at the
> code more closely, but maybe it would be better if I just maintained a
> separate `struct hstate *sorted_hstate_ptrs[]`, where the original

I don't think this is a good idea. If you really rely on the order of
the initialization in this patch, the easier solution is changing
huge_bootmem_page->hstate to huge_bootmem_page->hugepagesz. Then we can
use size_to_hstate(huge_bootmem_page->hugepagesz) in
gather_bootmem_prealloc().

Thanks.

> locations of the hstates remain unchanged, so as not to break
> gather_bootmem_prealloc/other things.
>
> > --
> > Mike Kravetz
>
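Concretely, Muchun's suggestion might look something like this (a sketch; the real gather_bootmem_prealloc() also preps the compound page, which is elided here):

```c
/* Record the page size instead of a pointer into hstates[]. */
struct huge_bootmem_page {
	struct list_head list;
	unsigned long hugepagesz;	/* was: struct hstate *hstate */
};

static void __init gather_bootmem_prealloc(void)
{
	struct huge_bootmem_page *m;

	list_for_each_entry(m, &huge_boot_pages, list) {
		/* Resolve the hstate by size, after any sorting of hstates[]. */
		struct hstate *h = size_to_hstate(m->hugepagesz);

		/* ... prep the gigantic page and add it to h's pool ... */
	}
}
```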
On 06/29/22 14:39, Muchun Song wrote:
> On Tue, Jun 28, 2022 at 08:40:27AM -0700, James Houghton wrote:
> > On Mon, Jun 27, 2022 at 11:42 AM Mike Kravetz <mike.kravetz@oracle.com> wrote:
> > >
> > > On 06/24/22 17:36, James Houghton wrote:
[...]
> > > This may/will cause problems for gigantic hugetlb pages allocated at
> > > boot time. See alloc_bootmem_huge_page(), where a pointer to the
> > > associated hstate is encoded within the allocated hugetlb page. These
> > > pages are added to hugetlb pools by the routine
> > > gather_bootmem_prealloc(), which uses the saved hstate to prep the
> > > gigantic page and add it to the correct pool. Currently,
> > > gather_bootmem_prealloc is called after hugetlb_init_hstates, so
> > > changing hstate order will cause errors.
> > >
> > > I do not see any reason why we could not call gather_bootmem_prealloc
> > > before hugetlb_init_hstates to avoid this issue.
> >
> > Thanks for catching this, Mike. Your suggestion certainly seems to
> > work, but it also seems kind of error-prone. I'll have to look at the
> > code more closely, but maybe it would be better if I just maintained a
> > separate `struct hstate *sorted_hstate_ptrs[]`, where the original
>
> I don't think this is a good idea. If you really rely on the order of
> the initialization in this patch, the easier solution is changing
> huge_bootmem_page->hstate to huge_bootmem_page->hugepagesz. Then we
> can use size_to_hstate(huge_bootmem_page->hugepagesz) in
> gather_bootmem_prealloc().

That is a much better solution. Thanks Muchun!
On Wed, Jun 29, 2022 at 2:06 PM Mike Kravetz <mike.kravetz@oracle.com> wrote:
>
> On 06/29/22 14:39, Muchun Song wrote:
> > On Tue, Jun 28, 2022 at 08:40:27AM -0700, James Houghton wrote:
[...]
> > I don't think this is a good idea. If you really rely on the order of
> > the initialization in this patch, the easier solution is changing
> > huge_bootmem_page->hstate to huge_bootmem_page->hugepagesz. Then we
> > can use size_to_hstate(huge_bootmem_page->hugepagesz) in
> > gather_bootmem_prealloc().
>
> That is a much better solution. Thanks Muchun!

Indeed. Thank you, Muchun. :)

> --
> Mike Kravetz
```diff
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a57e1be41401..5df838d86f32 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -33,6 +33,7 @@
 #include <linux/migrate.h>
 #include <linux/nospec.h>
 #include <linux/delayacct.h>
+#include <linux/sort.h>
 
 #include <asm/page.h>
 #include <asm/pgalloc.h>
@@ -48,6 +49,10 @@
 
 int hugetlb_max_hstate __read_mostly;
 unsigned int default_hstate_idx;
+/*
+ * After hugetlb_init_hstates is called, hstates will be sorted from largest
+ * to smallest.
+ */
 struct hstate hstates[HUGE_MAX_HSTATE];
 
 #ifdef CONFIG_CMA
@@ -3144,14 +3149,43 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
 	kfree(node_alloc_noretry);
 }
 
+static int compare_hstates_decreasing(const void *a, const void *b)
+{
+	const int shift_a = huge_page_shift((const struct hstate *)a);
+	const int shift_b = huge_page_shift((const struct hstate *)b);
+
+	if (shift_a < shift_b)
+		return 1;
+	if (shift_a > shift_b)
+		return -1;
+	return 0;
+}
+
+static void sort_hstates(void)
+{
+	unsigned long default_hstate_sz = huge_page_size(&default_hstate);
+
+	/* Sort from largest to smallest. */
+	sort(hstates, hugetlb_max_hstate, sizeof(*hstates),
+	     compare_hstates_decreasing, NULL);
+
+	/*
+	 * We may have changed the location of the default hstate, so we need to
+	 * update it.
+	 */
+	default_hstate_idx = hstate_index(size_to_hstate(default_hstate_sz));
+}
+
 static void __init hugetlb_init_hstates(void)
 {
 	struct hstate *h, *h2;
 
-	for_each_hstate(h) {
-		if (minimum_order > huge_page_order(h))
-			minimum_order = huge_page_order(h);
+	sort_hstates();
 
+	/* The last hstate is now the smallest. */
+	minimum_order = huge_page_order(&hstates[hugetlb_max_hstate - 1]);
+
+	for_each_hstate(h) {
 		/* oversize hugepages were init'ed in early boot */
 		if (!hstate_is_gigantic(h))
 			hugetlb_hstate_alloc_pages(h);
```
When using HugeTLB high-granularity mapping, we need to go through the
supported hugepage sizes in decreasing order so that we pick the largest
size that works. Consider the case where we're faulting in a 1G hugepage
for the first time: we want hugetlb_fault/hugetlb_no_page to map it with
a PUD. By going through the sizes in decreasing order, we will find that
PUD_SIZE works before finding out that PMD_SIZE or PAGE_SIZE work too.

Signed-off-by: James Houghton <jthoughton@google.com>
---
 mm/hugetlb.c | 40 +++++++++++++++++++++++++++++++++++++---
 1 file changed, 37 insertions(+), 3 deletions(-)