diff mbox series

[v1,04/11] mm/memremap: add ZONE_DEVICE support for compound pages

Message ID 20210325230938.30752-5-joao.m.martins@oracle.com (mailing list archive)
State New, archived
Headers show
Series mm, sparse-vmemmap: Introduce compound pagemaps | expand

Commit Message

Joao Martins March 25, 2021, 11:09 p.m. UTC
Add a new align property for struct dev_pagemap which specifies that a
pagemap is composed of a set of compound pages of size @align, instead
of base pages. When these pages are initialised, most are initialised as
tail pages instead of order-0 pages.

For certain ZONE_DEVICE users like device-dax which have a fixed page
size, this creates an opportunity to optimize GUP and GUP-fast walkers,
treating it the same way as THP or hugetlb pages.

Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
---
 include/linux/memremap.h | 13 +++++++++++++
 mm/memremap.c            |  8 ++++++--
 mm/page_alloc.c          | 24 +++++++++++++++++++++++-
 3 files changed, 42 insertions(+), 3 deletions(-)

Comments

Dan Williams May 5, 2021, 6:44 p.m. UTC | #1
On Thu, Mar 25, 2021 at 4:10 PM Joao Martins <joao.m.martins@oracle.com> wrote:
>
> Add a new align property for struct dev_pagemap which specifies that a
> pagemap is composed of a set of compound pages of size @align, instead
> of base pages. When these pages are initialised, most are initialised as
> tail pages instead of order-0 pages.
>
> For certain ZONE_DEVICE users like device-dax which have a fixed page
> size, this creates an opportunity to optimize GUP and GUP-fast walkers,
> treating it the same way as THP or hugetlb pages.
>
> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
> ---
>  include/linux/memremap.h | 13 +++++++++++++
>  mm/memremap.c            |  8 ++++++--
>  mm/page_alloc.c          | 24 +++++++++++++++++++++++-
>  3 files changed, 42 insertions(+), 3 deletions(-)
>
> diff --git a/include/linux/memremap.h b/include/linux/memremap.h
> index b46f63dcaed3..bb28d82dda5e 100644
> --- a/include/linux/memremap.h
> +++ b/include/linux/memremap.h
> @@ -114,6 +114,7 @@ struct dev_pagemap {
>         struct completion done;
>         enum memory_type type;
>         unsigned int flags;
> +       unsigned long align;

I think this wants some kernel-doc above to indicate that non-zero
means "use compound pages with tail-page dedup" and zero / PAGE_SIZE
means "use non-compound base pages". The non-zero value must be
PAGE_SIZE, PMD_PAGE_SIZE or PUD_PAGE_SIZE. Hmm, maybe it should be an
enum:

enum devmap_geometry {
    DEVMAP_PTE,
    DEVMAP_PMD,
    DEVMAP_PUD,
}

...because it's more than just an alignment it's a structural
definition of how the memmap is laid out.

>         const struct dev_pagemap_ops *ops;
>         void *owner;
>         int nr_range;
> @@ -130,6 +131,18 @@ static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap)
>         return NULL;
>  }
>
> +static inline unsigned long pgmap_align(struct dev_pagemap *pgmap)
> +{
> +       if (!pgmap || !pgmap->align)
> +               return PAGE_SIZE;
> +       return pgmap->align;
> +}
> +
> +static inline unsigned long pgmap_pfn_align(struct dev_pagemap *pgmap)
> +{
> +       return PHYS_PFN(pgmap_align(pgmap));
> +}
> +
>  #ifdef CONFIG_ZONE_DEVICE
>  bool pfn_zone_device_reserved(unsigned long pfn);
>  void *memremap_pages(struct dev_pagemap *pgmap, int nid);
> diff --git a/mm/memremap.c b/mm/memremap.c
> index 805d761740c4..d160853670c4 100644
> --- a/mm/memremap.c
> +++ b/mm/memremap.c
> @@ -318,8 +318,12 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
>         memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
>                                 PHYS_PFN(range->start),
>                                 PHYS_PFN(range_len(range)), pgmap);
> -       percpu_ref_get_many(pgmap->ref, pfn_end(pgmap, range_id)
> -                       - pfn_first(pgmap, range_id));
> +       if (pgmap_align(pgmap) > PAGE_SIZE)
> +               percpu_ref_get_many(pgmap->ref, (pfn_end(pgmap, range_id)
> +                       - pfn_first(pgmap, range_id)) / pgmap_pfn_align(pgmap));
> +       else
> +               percpu_ref_get_many(pgmap->ref, pfn_end(pgmap, range_id)
> +                               - pfn_first(pgmap, range_id));
>         return 0;
>
>  err_add_memory:
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 58974067bbd4..3a77f9e43f3a 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -6285,6 +6285,8 @@ void __ref memmap_init_zone_device(struct zone *zone,
>         unsigned long pfn, end_pfn = start_pfn + nr_pages;
>         struct pglist_data *pgdat = zone->zone_pgdat;
>         struct vmem_altmap *altmap = pgmap_altmap(pgmap);
> +       unsigned int pfn_align = pgmap_pfn_align(pgmap);
> +       unsigned int order_align = order_base_2(pfn_align);
>         unsigned long zone_idx = zone_idx(zone);
>         unsigned long start = jiffies;
>         int nid = pgdat->node_id;
> @@ -6302,10 +6304,30 @@ void __ref memmap_init_zone_device(struct zone *zone,
>                 nr_pages = end_pfn - start_pfn;
>         }
>
> -       for (pfn = start_pfn; pfn < end_pfn; pfn++) {
> +       for (pfn = start_pfn; pfn < end_pfn; pfn += pfn_align) {

pfn_align is in bytes and pfn is in pages... is there a "pfn_align >>=
PAGE_SHIFT" I missed somewhere?
Matthew Wilcox (Oracle) May 5, 2021, 6:58 p.m. UTC | #2
On Wed, May 05, 2021 at 11:44:29AM -0700, Dan Williams wrote:
> > @@ -6285,6 +6285,8 @@ void __ref memmap_init_zone_device(struct zone *zone,
> >         unsigned long pfn, end_pfn = start_pfn + nr_pages;
> >         struct pglist_data *pgdat = zone->zone_pgdat;
> >         struct vmem_altmap *altmap = pgmap_altmap(pgmap);
> > +       unsigned int pfn_align = pgmap_pfn_align(pgmap);
> > +       unsigned int order_align = order_base_2(pfn_align);
> >         unsigned long zone_idx = zone_idx(zone);
> >         unsigned long start = jiffies;
> >         int nid = pgdat->node_id;
> > @@ -6302,10 +6304,30 @@ void __ref memmap_init_zone_device(struct zone *zone,
> >                 nr_pages = end_pfn - start_pfn;
> >         }
> >
> > -       for (pfn = start_pfn; pfn < end_pfn; pfn++) {
> > +       for (pfn = start_pfn; pfn < end_pfn; pfn += pfn_align) {
> 
> pfn_align is in bytes and pfn is in pages... is there a "pfn_align >>=
> PAGE_SHIFT" I missed somewhere?

If something is measured in bytes, I like to use size_t (if it's
in memory) and loff_t (if it's on storage).  The compiler doesn't do
anything useful to warn you, but it's a nice indication to humans about
what's going on.  And it removes the temptation to do 'pfn_align >>=
PAGE_SHIFT' and suddenly take pfn_align from being measured in bytes to
being measured in pages.
Joao Martins May 5, 2021, 7:49 p.m. UTC | #3
On 5/5/21 7:44 PM, Dan Williams wrote:
> On Thu, Mar 25, 2021 at 4:10 PM Joao Martins <joao.m.martins@oracle.com> wrote:
>>
>> Add a new align property for struct dev_pagemap which specifies that a
>> pagemap is composed of a set of compound pages of size @align, instead
>> of base pages. When these pages are initialised, most are initialised as
>> tail pages instead of order-0 pages.
>>
>> For certain ZONE_DEVICE users like device-dax which have a fixed page
>> size, this creates an opportunity to optimize GUP and GUP-fast walkers,
>> treating it the same way as THP or hugetlb pages.
>>
>> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
>> ---
>>  include/linux/memremap.h | 13 +++++++++++++
>>  mm/memremap.c            |  8 ++++++--
>>  mm/page_alloc.c          | 24 +++++++++++++++++++++++-
>>  3 files changed, 42 insertions(+), 3 deletions(-)
>>
>> diff --git a/include/linux/memremap.h b/include/linux/memremap.h
>> index b46f63dcaed3..bb28d82dda5e 100644
>> --- a/include/linux/memremap.h
>> +++ b/include/linux/memremap.h
>> @@ -114,6 +114,7 @@ struct dev_pagemap {
>>         struct completion done;
>>         enum memory_type type;
>>         unsigned int flags;
>> +       unsigned long align;
> 
> I think this wants some kernel-doc above to indicate that non-zero
> means "use compound pages with tail-page dedup" and zero / PAGE_SIZE
> means "use non-compound base pages". 

Got it. Are you thinking a kernel_doc on top of the variable above, or preferably on top
of pgmap_align()?

> The non-zero value must be
> PAGE_SIZE, PMD_PAGE_SIZE or PUD_PAGE_SIZE. 
> Hmm, maybe it should be an
> enum:
> 
> enum devmap_geometry {
>     DEVMAP_PTE,
>     DEVMAP_PMD,
>     DEVMAP_PUD,
> }
> 
I suppose a converter between devmap_geometry and page_size would be needed too? And maybe
the whole dax/nvdimm align values change meanwhile (as a followup improvement)?

Although to be fair we only ever care about compound page size in this series (and
similarly dax/nvdimm @align properties).

> ...because it's more than just an alignment it's a structural
> definition of how the memmap is laid out.
> 
>>         const struct dev_pagemap_ops *ops;
>>         void *owner;
>>         int nr_range;
>> @@ -130,6 +131,18 @@ static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap)
>>         return NULL;
>>  }
>>
>> +static inline unsigned long pgmap_align(struct dev_pagemap *pgmap)
>> +{
>> +       if (!pgmap || !pgmap->align)
>> +               return PAGE_SIZE;
>> +       return pgmap->align;
>> +}
>> +
>> +static inline unsigned long pgmap_pfn_align(struct dev_pagemap *pgmap)
>> +{
>> +       return PHYS_PFN(pgmap_align(pgmap));
>> +}
>> +
>>  #ifdef CONFIG_ZONE_DEVICE
>>  bool pfn_zone_device_reserved(unsigned long pfn);
>>  void *memremap_pages(struct dev_pagemap *pgmap, int nid);
>> diff --git a/mm/memremap.c b/mm/memremap.c
>> index 805d761740c4..d160853670c4 100644
>> --- a/mm/memremap.c
>> +++ b/mm/memremap.c
>> @@ -318,8 +318,12 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
>>         memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
>>                                 PHYS_PFN(range->start),
>>                                 PHYS_PFN(range_len(range)), pgmap);
>> -       percpu_ref_get_many(pgmap->ref, pfn_end(pgmap, range_id)
>> -                       - pfn_first(pgmap, range_id));
>> +       if (pgmap_align(pgmap) > PAGE_SIZE)
>> +               percpu_ref_get_many(pgmap->ref, (pfn_end(pgmap, range_id)
>> +                       - pfn_first(pgmap, range_id)) / pgmap_pfn_align(pgmap));
>> +       else
>> +               percpu_ref_get_many(pgmap->ref, pfn_end(pgmap, range_id)
>> +                               - pfn_first(pgmap, range_id));
>>         return 0;
>>
>>  err_add_memory:
>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>> index 58974067bbd4..3a77f9e43f3a 100644
>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -6285,6 +6285,8 @@ void __ref memmap_init_zone_device(struct zone *zone,
>>         unsigned long pfn, end_pfn = start_pfn + nr_pages;
>>         struct pglist_data *pgdat = zone->zone_pgdat;
>>         struct vmem_altmap *altmap = pgmap_altmap(pgmap);
>> +       unsigned int pfn_align = pgmap_pfn_align(pgmap);
>> +       unsigned int order_align = order_base_2(pfn_align);
>>         unsigned long zone_idx = zone_idx(zone);
>>         unsigned long start = jiffies;
>>         int nid = pgdat->node_id;
>> @@ -6302,10 +6304,30 @@ void __ref memmap_init_zone_device(struct zone *zone,
>>                 nr_pages = end_pfn - start_pfn;
>>         }
>>
>> -       for (pfn = start_pfn; pfn < end_pfn; pfn++) {
>> +       for (pfn = start_pfn; pfn < end_pfn; pfn += pfn_align) {
> 
> pfn_align is in bytes and pfn is in pages... is there a "pfn_align >>=
> PAGE_SHIFT" I missed somewhere?
> 
@pfn_align is in pages too. It's pgmap_align() which is in bytes:

+static inline unsigned long pgmap_pfn_align(struct dev_pagemap *pgmap)
+{
+       return PHYS_PFN(pgmap_align(pgmap));
+}
Dan Williams May 5, 2021, 10:20 p.m. UTC | #4
On Wed, May 5, 2021 at 12:50 PM Joao Martins <joao.m.martins@oracle.com> wrote:
>
>
>
> On 5/5/21 7:44 PM, Dan Williams wrote:
> > On Thu, Mar 25, 2021 at 4:10 PM Joao Martins <joao.m.martins@oracle.com> wrote:
> >>
> >> Add a new align property for struct dev_pagemap which specifies that a
> >> pagemap is composed of a set of compound pages of size @align, instead
> >> of base pages. When these pages are initialised, most are initialised as
> >> tail pages instead of order-0 pages.
> >>
> >> For certain ZONE_DEVICE users like device-dax which have a fixed page
> >> size, this creates an opportunity to optimize GUP and GUP-fast walkers,
> >> treating it the same way as THP or hugetlb pages.
> >>
> >> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
> >> ---
> >>  include/linux/memremap.h | 13 +++++++++++++
> >>  mm/memremap.c            |  8 ++++++--
> >>  mm/page_alloc.c          | 24 +++++++++++++++++++++++-
> >>  3 files changed, 42 insertions(+), 3 deletions(-)
> >>
> >> diff --git a/include/linux/memremap.h b/include/linux/memremap.h
> >> index b46f63dcaed3..bb28d82dda5e 100644
> >> --- a/include/linux/memremap.h
> >> +++ b/include/linux/memremap.h
> >> @@ -114,6 +114,7 @@ struct dev_pagemap {
> >>         struct completion done;
> >>         enum memory_type type;
> >>         unsigned int flags;
> >> +       unsigned long align;
> >
> > I think this wants some kernel-doc above to indicate that non-zero
> > means "use compound pages with tail-page dedup" and zero / PAGE_SIZE
> > means "use non-compound base pages".
>
> Got it. Are you thinking a kernel_doc on top of the variable above, or preferably on top
> of pgmap_align()?

I was thinking in dev_pagemap because this value is more than just a
plain alignment its restructuring the layout and construction of the
memmap(), but when I say it that way it seems much less like a vanilla
align value compared to a construction / geometry mode setting.

>
> > The non-zero value must be
> > PAGE_SIZE, PMD_PAGE_SIZE or PUD_PAGE_SIZE.
> > Hmm, maybe it should be an
> > enum:
> >
> > enum devmap_geometry {
> >     DEVMAP_PTE,
> >     DEVMAP_PMD,
> >     DEVMAP_PUD,
> > }
> >
> I suppose a converter between devmap_geometry and page_size would be needed too? And maybe
> the whole dax/nvdimm align values change meanwhile (as a followup improvement)?

I think it is ok for dax/nvdimm to continue to maintain their align
value because it should be ok to have 4MB align if the device really
wanted. However, when it goes to map that alignment with
memremap_pages() it can pick a mode. For example, it's already the
case that dax->align == 1GB is mapped with DEVMAP_PTE today, so
they're already separate concepts that can stay separate.

>
> Although to be fair we only ever care about compound page size in this series (and
> similarly dax/nvdimm @align properties).
>
> > ...because it's more than just an alignment it's a structural
> > definition of how the memmap is laid out.
> >
> >>         const struct dev_pagemap_ops *ops;
> >>         void *owner;
> >>         int nr_range;
> >> @@ -130,6 +131,18 @@ static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap)
> >>         return NULL;
> >>  }
> >>
> >> +static inline unsigned long pgmap_align(struct dev_pagemap *pgmap)
> >> +{
> >> +       if (!pgmap || !pgmap->align)
> >> +               return PAGE_SIZE;
> >> +       return pgmap->align;
> >> +}
> >> +
> >> +static inline unsigned long pgmap_pfn_align(struct dev_pagemap *pgmap)
> >> +{
> >> +       return PHYS_PFN(pgmap_align(pgmap));
> >> +}
> >> +
> >>  #ifdef CONFIG_ZONE_DEVICE
> >>  bool pfn_zone_device_reserved(unsigned long pfn);
> >>  void *memremap_pages(struct dev_pagemap *pgmap, int nid);
> >> diff --git a/mm/memremap.c b/mm/memremap.c
> >> index 805d761740c4..d160853670c4 100644
> >> --- a/mm/memremap.c
> >> +++ b/mm/memremap.c
> >> @@ -318,8 +318,12 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
> >>         memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
> >>                                 PHYS_PFN(range->start),
> >>                                 PHYS_PFN(range_len(range)), pgmap);
> >> -       percpu_ref_get_many(pgmap->ref, pfn_end(pgmap, range_id)
> >> -                       - pfn_first(pgmap, range_id));
> >> +       if (pgmap_align(pgmap) > PAGE_SIZE)
> >> +               percpu_ref_get_many(pgmap->ref, (pfn_end(pgmap, range_id)
> >> +                       - pfn_first(pgmap, range_id)) / pgmap_pfn_align(pgmap));
> >> +       else
> >> +               percpu_ref_get_many(pgmap->ref, pfn_end(pgmap, range_id)
> >> +                               - pfn_first(pgmap, range_id));
> >>         return 0;
> >>
> >>  err_add_memory:
> >> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> >> index 58974067bbd4..3a77f9e43f3a 100644
> >> --- a/mm/page_alloc.c
> >> +++ b/mm/page_alloc.c
> >> @@ -6285,6 +6285,8 @@ void __ref memmap_init_zone_device(struct zone *zone,
> >>         unsigned long pfn, end_pfn = start_pfn + nr_pages;
> >>         struct pglist_data *pgdat = zone->zone_pgdat;
> >>         struct vmem_altmap *altmap = pgmap_altmap(pgmap);
> >> +       unsigned int pfn_align = pgmap_pfn_align(pgmap);
> >> +       unsigned int order_align = order_base_2(pfn_align);
> >>         unsigned long zone_idx = zone_idx(zone);
> >>         unsigned long start = jiffies;
> >>         int nid = pgdat->node_id;
> >> @@ -6302,10 +6304,30 @@ void __ref memmap_init_zone_device(struct zone *zone,
> >>                 nr_pages = end_pfn - start_pfn;
> >>         }
> >>
> >> -       for (pfn = start_pfn; pfn < end_pfn; pfn++) {
> >> +       for (pfn = start_pfn; pfn < end_pfn; pfn += pfn_align) {
> >
> > pfn_align is in bytes and pfn is in pages... is there a "pfn_align >>=
> > PAGE_SHIFT" I missed somewhere?
> >
> @pfn_align is in pages too. It's pgmap_align() which is in bytes:
>
> +static inline unsigned long pgmap_pfn_align(struct dev_pagemap *pgmap)
> +{
> +       return PHYS_PFN(pgmap_align(pgmap));
> +}

Ah yup, my eyes glazed over that. I think this is another place that
benefits from a more specific name than "align". "pfns_per_compound"
"compound_pfns"?
Joao Martins May 5, 2021, 10:36 p.m. UTC | #5
On 5/5/21 11:20 PM, Dan Williams wrote:
> On Wed, May 5, 2021 at 12:50 PM Joao Martins <joao.m.martins@oracle.com> wrote:
>> On 5/5/21 7:44 PM, Dan Williams wrote:
>>> On Thu, Mar 25, 2021 at 4:10 PM Joao Martins <joao.m.martins@oracle.com> wrote:
>>>>
>>>> Add a new align property for struct dev_pagemap which specifies that a
>>>> pagemap is composed of a set of compound pages of size @align, instead
>>>> of base pages. When these pages are initialised, most are initialised as
>>>> tail pages instead of order-0 pages.
>>>>
>>>> For certain ZONE_DEVICE users like device-dax which have a fixed page
>>>> size, this creates an opportunity to optimize GUP and GUP-fast walkers,
>>>> treating it the same way as THP or hugetlb pages.
>>>>
>>>> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
>>>> ---
>>>>  include/linux/memremap.h | 13 +++++++++++++
>>>>  mm/memremap.c            |  8 ++++++--
>>>>  mm/page_alloc.c          | 24 +++++++++++++++++++++++-
>>>>  3 files changed, 42 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/include/linux/memremap.h b/include/linux/memremap.h
>>>> index b46f63dcaed3..bb28d82dda5e 100644
>>>> --- a/include/linux/memremap.h
>>>> +++ b/include/linux/memremap.h
>>>> @@ -114,6 +114,7 @@ struct dev_pagemap {
>>>>         struct completion done;
>>>>         enum memory_type type;
>>>>         unsigned int flags;
>>>> +       unsigned long align;
>>>
>>> I think this wants some kernel-doc above to indicate that non-zero
>>> means "use compound pages with tail-page dedup" and zero / PAGE_SIZE
>>> means "use non-compound base pages".
>>
>> Got it. Are you thinking a kernel_doc on top of the variable above, or preferably on top
>> of pgmap_align()?
> 
> I was thinking in dev_pagemap because this value is more than just a
> plain alignment its restructuring the layout and construction of the
> memmap(), but when I say it that way it seems much less like a vanilla
> align value compared to a construction / geometry mode setting.
> 
/me nods

>>
>>> The non-zero value must be
>>> PAGE_SIZE, PMD_PAGE_SIZE or PUD_PAGE_SIZE.
>>> Hmm, maybe it should be an
>>> enum:
>>>
>>> enum devmap_geometry {
>>>     DEVMAP_PTE,
>>>     DEVMAP_PMD,
>>>     DEVMAP_PUD,
>>> }
>>>
>> I suppose a converter between devmap_geometry and page_size would be needed too? And maybe
>> the whole dax/nvdimm align values change meanwhile (as a followup improvement)?
> 
> I think it is ok for dax/nvdimm to continue to maintain their align
> value because it should be ok to have 4MB align if the device really
> wanted. However, when it goes to map that alignment with
> memremap_pages() it can pick a mode. For example, it's already the
> case that dax->align == 1GB is mapped with DEVMAP_PTE today, so
> they're already separate concepts that can stay separate.
> 
Gotcha.

>>
>> Although to be fair we only ever care about compound page size in this series (and
>> similarly dax/nvdimm @align properties).
>>
>>> ...because it's more than just an alignment it's a structural
>>> definition of how the memmap is laid out.
>>>

Somehow I missed this other part of your response.

>>>>         const struct dev_pagemap_ops *ops;
>>>>         void *owner;
>>>>         int nr_range;
>>>> @@ -130,6 +131,18 @@ static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap)
>>>>         return NULL;
>>>>  }
>>>>
>>>> +static inline unsigned long pgmap_align(struct dev_pagemap *pgmap)
>>>> +{
>>>> +       if (!pgmap || !pgmap->align)
>>>> +               return PAGE_SIZE;
>>>> +       return pgmap->align;
>>>> +}
>>>> +
>>>> +static inline unsigned long pgmap_pfn_align(struct dev_pagemap *pgmap)
>>>> +{
>>>> +       return PHYS_PFN(pgmap_align(pgmap));
>>>> +}
>>>> +
>>>>  #ifdef CONFIG_ZONE_DEVICE
>>>>  bool pfn_zone_device_reserved(unsigned long pfn);
>>>>  void *memremap_pages(struct dev_pagemap *pgmap, int nid);
>>>> diff --git a/mm/memremap.c b/mm/memremap.c
>>>> index 805d761740c4..d160853670c4 100644
>>>> --- a/mm/memremap.c
>>>> +++ b/mm/memremap.c
>>>> @@ -318,8 +318,12 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
>>>>         memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
>>>>                                 PHYS_PFN(range->start),
>>>>                                 PHYS_PFN(range_len(range)), pgmap);
>>>> -       percpu_ref_get_many(pgmap->ref, pfn_end(pgmap, range_id)
>>>> -                       - pfn_first(pgmap, range_id));
>>>> +       if (pgmap_align(pgmap) > PAGE_SIZE)
>>>> +               percpu_ref_get_many(pgmap->ref, (pfn_end(pgmap, range_id)
>>>> +                       - pfn_first(pgmap, range_id)) / pgmap_pfn_align(pgmap));
>>>> +       else
>>>> +               percpu_ref_get_many(pgmap->ref, pfn_end(pgmap, range_id)
>>>> +                               - pfn_first(pgmap, range_id));
>>>>         return 0;
>>>>
>>>>  err_add_memory:
>>>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>>>> index 58974067bbd4..3a77f9e43f3a 100644
>>>> --- a/mm/page_alloc.c
>>>> +++ b/mm/page_alloc.c
>>>> @@ -6285,6 +6285,8 @@ void __ref memmap_init_zone_device(struct zone *zone,
>>>>         unsigned long pfn, end_pfn = start_pfn + nr_pages;
>>>>         struct pglist_data *pgdat = zone->zone_pgdat;
>>>>         struct vmem_altmap *altmap = pgmap_altmap(pgmap);
>>>> +       unsigned int pfn_align = pgmap_pfn_align(pgmap);
>>>> +       unsigned int order_align = order_base_2(pfn_align);
>>>>         unsigned long zone_idx = zone_idx(zone);
>>>>         unsigned long start = jiffies;
>>>>         int nid = pgdat->node_id;
>>>> @@ -6302,10 +6304,30 @@ void __ref memmap_init_zone_device(struct zone *zone,
>>>>                 nr_pages = end_pfn - start_pfn;
>>>>         }
>>>>
>>>> -       for (pfn = start_pfn; pfn < end_pfn; pfn++) {
>>>> +       for (pfn = start_pfn; pfn < end_pfn; pfn += pfn_align) {
>>>
>>> pfn_align is in bytes and pfn is in pages... is there a "pfn_align >>=
>>> PAGE_SHIFT" I missed somewhere?
>>>
>> @pfn_align is in pages too. It's pgmap_align() which is in bytes:
>>
>> +static inline unsigned long pgmap_pfn_align(struct dev_pagemap *pgmap)
>> +{
>> +       return PHYS_PFN(pgmap_align(pgmap));
>> +}
> 
> Ah yup, my eyes glazed over that. I think this is another place that
> benefits from a more specific name than "align". "pfns_per_compound"
> "compound_pfns"?
> 
We are still describing a page, just not a base page. So perhaps @pfns_per_hpage ?

I am fine with @pfns_per_compound or @compound_pfns as well.
Dan Williams May 5, 2021, 11:03 p.m. UTC | #6
On Wed, May 5, 2021 at 3:36 PM Joao Martins <joao.m.martins@oracle.com> wrote:
[..]
> > Ah yup, my eyes glazed over that. I think this is another place that
> > benefits from a more specific name than "align". "pfns_per_compound"
> > "compound_pfns"?
> >
> We are still describing a page, just not a base page. So perhaps @pfns_per_hpage ?
>
> I am fine with @pfns_per_compound or @compound_pfns as well.

My only concern about hpage is that hpage implies PMD, where compound
is generic across PMD and PUD.
Aneesh Kumar K.V May 6, 2021, 8:05 a.m. UTC | #7
IIUC this series is about devdax namespace with aligh of 1G or 2M where we can
save the vmmemap space by not allocating memory for tail struct pages? 

Dan Williams <dan.j.williams@intel.com> writes:

> > enum:
>> >
>> > enum devmap_geometry {
>> >     DEVMAP_PTE,
>> >     DEVMAP_PMD,
>> >     DEVMAP_PUD,
>> > }
>> >
>> I suppose a converter between devmap_geometry and page_size would be needed too? And maybe
>> the whole dax/nvdimm align values change meanwhile (as a followup improvement)?
>
> I think it is ok for dax/nvdimm to continue to maintain their align
> value because it should be ok to have 4MB align if the device really
> wanted. However, when it goes to map that alignment with
> memremap_pages() it can pick a mode. For example, it's already the
> case that dax->align == 1GB is mapped with DEVMAP_PTE today, so
> they're already separate concepts that can stay separate.

devdax namespace with align of 1G implies we expect to map them with 1G
pte entries? I didn't follow when you say we map them today with
DEVMAP_PTE entries.


-aneesh
Joao Martins May 6, 2021, 10:12 a.m. UTC | #8
On 5/6/21 12:03 AM, Dan Williams wrote:
> On Wed, May 5, 2021 at 3:36 PM Joao Martins <joao.m.martins@oracle.com> wrote:
> [..]
>>> Ah yup, my eyes glazed over that. I think this is another place that
>>> benefits from a more specific name than "align". "pfns_per_compound"
>>> "compound_pfns"?
>>>
>> We are still describing a page, just not a base page. So perhaps @pfns_per_hpage ?
>>
>> I am fine with @pfns_per_compound or @compound_pfns as well.
> 
> My only concern about hpage is that hpage implies PMD, where compound
> is generic across PMD and PUD.
> 
True.

I will stick with your suggestions.
Joao Martins May 6, 2021, 10:23 a.m. UTC | #9
On 5/6/21 9:05 AM, Aneesh Kumar K.V wrote:
> 
> 
> IIUC this series is about devdax namespace with aligh of 1G or 2M where we can
> save the vmmemap space by not allocating memory for tail struct pages? 
> 
Right.

It reuses base pages across the vmemmap, but for the base pages containing
only the tail struct pages.

> Dan Williams <dan.j.williams@intel.com> writes:
> 
>>> enum:
>>>>
>>>> enum devmap_geometry {
>>>>     DEVMAP_PTE,
>>>>     DEVMAP_PMD,
>>>>     DEVMAP_PUD,
>>>> }
>>>>
>>> I suppose a converter between devmap_geometry and page_size would be needed too? And maybe
>>> the whole dax/nvdimm align values change meanwhile (as a followup improvement)?
>>
>> I think it is ok for dax/nvdimm to continue to maintain their align
>> value because it should be ok to have 4MB align if the device really
>> wanted. However, when it goes to map that alignment with
>> memremap_pages() it can pick a mode. For example, it's already the
>> case that dax->align == 1GB is mapped with DEVMAP_PTE today, so
>> they're already separate concepts that can stay separate.
> 
> devdax namespace with align of 1G implies we expect to map them with 1G
> pte entries? I didn't follow when you say we map them today with
> DEVMAP_PTE entries.
> 
This sort of confusion is largelly why Dan is suggesting a @geometry for naming rather
than @align (which traditionally refers to page tables entry sizes in pagemap-related stuff).

DEVMAP_{PTE,PMD,PUD} refers to the representation of metadata in base pages (DEVMAP_PTE),
compound pages of PMD order (DEVMAP_PMD) or compound pages of PUD order (DEVMAP_PUD).

So, today:

* namespaces with align of 1G would use *struct pages of order-0* (DEVMAP_PTE) backed with
PMD entries in the direct map.
* namespaces with align of 2M would use *struct pages of order-0* (DEVMAP_PTE) backed with
PMD entries in the direct map.

After this series:

* namespaces with align of 1G would use *compound struct pages of order-30* (DEVMAP_PUD)
backed with PMD entries in the direct map.
* namespaces with align of 1G would use *compound struct pages of order-21* (DEVMAP_PMD)
backed with PTE entries in the direct map.
Matthew Wilcox (Oracle) May 6, 2021, 11:43 a.m. UTC | #10
On Thu, May 06, 2021 at 11:23:25AM +0100, Joao Martins wrote:
> >> I think it is ok for dax/nvdimm to continue to maintain their align
> >> value because it should be ok to have 4MB align if the device really
> >> wanted. However, when it goes to map that alignment with
> >> memremap_pages() it can pick a mode. For example, it's already the
> >> case that dax->align == 1GB is mapped with DEVMAP_PTE today, so
> >> they're already separate concepts that can stay separate.
> > 
> > devdax namespace with align of 1G implies we expect to map them with 1G
> > pte entries? I didn't follow when you say we map them today with
> > DEVMAP_PTE entries.
> 
> This sort of confusion is largelly why Dan is suggesting a @geometry for naming rather
> than @align (which traditionally refers to page tables entry sizes in pagemap-related stuff).
> 
> DEVMAP_{PTE,PMD,PUD} refers to the representation of metadata in base pages (DEVMAP_PTE),
> compound pages of PMD order (DEVMAP_PMD) or compound pages of PUD order (DEVMAP_PUD).
> 
> So, today:
> 
> * namespaces with align of 1G would use *struct pages of order-0* (DEVMAP_PTE) backed with
> PMD entries in the direct map.
> * namespaces with align of 2M would use *struct pages of order-0* (DEVMAP_PTE) backed with
> PMD entries in the direct map.
> 
> After this series:
> 
> * namespaces with align of 1G would use *compound struct pages of order-30* (DEVMAP_PUD)
> backed with PMD entries in the direct map.

order-18

> * namespaces with align of 1G would use *compound struct pages of order-21* (DEVMAP_PMD)
> backed with PTE entries in the direct map.

i think you mean "align of 2M", and order-9.

(note that these numbers are/can be different for powerpc since it
supports different TLB page sizes.  please don't get locked into x86
sizes, and don't ignore the benefits *to software* of managing memory
in sizes other than just those supported by the hardware).
Joao Martins May 6, 2021, 12:15 p.m. UTC | #11
On 5/6/21 12:43 PM, Matthew Wilcox wrote:
> On Thu, May 06, 2021 at 11:23:25AM +0100, Joao Martins wrote:
>>>> I think it is ok for dax/nvdimm to continue to maintain their align
>>>> value because it should be ok to have 4MB align if the device really
>>>> wanted. However, when it goes to map that alignment with
>>>> memremap_pages() it can pick a mode. For example, it's already the
>>>> case that dax->align == 1GB is mapped with DEVMAP_PTE today, so
>>>> they're already separate concepts that can stay separate.
>>>
>>> devdax namespace with align of 1G implies we expect to map them with 1G
>>> pte entries? I didn't follow when you say we map them today with
>>> DEVMAP_PTE entries.
>>
>> This sort of confusion is largelly why Dan is suggesting a @geometry for naming rather
>> than @align (which traditionally refers to page tables entry sizes in pagemap-related stuff).
>>
>> DEVMAP_{PTE,PMD,PUD} refers to the representation of metadata in base pages (DEVMAP_PTE),
>> compound pages of PMD order (DEVMAP_PMD) or compound pages of PUD order (DEVMAP_PUD).
>>
>> So, today:
>>
>> * namespaces with align of 1G would use *struct pages of order-0* (DEVMAP_PTE) backed with
>> PMD entries in the direct map.
>> * namespaces with align of 2M would use *struct pages of order-0* (DEVMAP_PTE) backed with
>> PMD entries in the direct map.
>>
>> After this series:
>>
>> * namespaces with align of 1G would use *compound struct pages of order-30* (DEVMAP_PUD)
>> backed with PMD entries in the direct map.
> 
> order-18
> 
Right, thanks for the correction.

Forgot to subtract PAGE_SIZE order (12).

>> * namespaces with align of 1G would use *compound struct pages of order-21* (DEVMAP_PMD)
>> backed with PTE entries in the direct map.
> 
> i think you mean "align of 2M", and order-9.
> 
True.

> (note that these numbers are/can be different for powerpc since it
> supports different TLB page sizes.  please don't get locked into x86
> sizes, and don't ignore the benefits *to software* of managing memory
> in sizes other than just those supported by the hardware).
> 
I am not ignoring that either that I think.

The page size looking values above is also what the consumer of this (device-dax) allows
you to use as @align, hence why you see DEVMAP_PTE, PMD and PUD as the sizes.
Joao Martins May 18, 2021, 5:27 p.m. UTC | #12
On 5/5/21 11:36 PM, Joao Martins wrote:
> On 5/5/21 11:20 PM, Dan Williams wrote:
>> On Wed, May 5, 2021 at 12:50 PM Joao Martins <joao.m.martins@oracle.com> wrote:
>>> On 5/5/21 7:44 PM, Dan Williams wrote:
>>>> On Thu, Mar 25, 2021 at 4:10 PM Joao Martins <joao.m.martins@oracle.com> wrote:
>>>>> diff --git a/include/linux/memremap.h b/include/linux/memremap.h
>>>>> index b46f63dcaed3..bb28d82dda5e 100644
>>>>> --- a/include/linux/memremap.h
>>>>> +++ b/include/linux/memremap.h
>>>>> @@ -114,6 +114,7 @@ struct dev_pagemap {
>>>>>         struct completion done;
>>>>>         enum memory_type type;
>>>>>         unsigned int flags;
>>>>> +       unsigned long align;
>>>>
>>>> I think this wants some kernel-doc above to indicate that non-zero
>>>> means "use compound pages with tail-page dedup" and zero / PAGE_SIZE
>>>> means "use non-compound base pages".

[...]

>>>> The non-zero value must be
>>>> PAGE_SIZE, PMD_PAGE_SIZE or PUD_PAGE_SIZE.
>>>> Hmm, maybe it should be an
>>>> enum:
>>>>
>>>> enum devmap_geometry {
>>>>     DEVMAP_PTE,
>>>>     DEVMAP_PMD,
>>>>     DEVMAP_PUD,
>>>> }
>>>>
>>> I suppose a converter between devmap_geometry and page_size would be needed too? And maybe
>>> the whole dax/nvdimm align values change meanwhile (as a followup improvement)?
>>
>> I think it is ok for dax/nvdimm to continue to maintain their align
>> value because it should be ok to have 4MB align if the device really
>> wanted. However, when it goes to map that alignment with
>> memremap_pages() it can pick a mode. For example, it's already the
>> case that dax->align == 1GB is mapped with DEVMAP_PTE today, so
>> they're already separate concepts that can stay separate.
>>
> Gotcha.

I am reconsidering part of the above. In general, yes, the meaning of devmap @align
represents a slightly different variation of the device @align i.e. how the metadata is
laid out **but** regardless of what kind of page table entries we use vmemmap.

By using DEVMAP_PTE/PMD/PUD we might end up 1) duplicating what nvdimm/dax already
validates in terms of allowed device @align values (i.e. PAGE_SIZE, PMD_SIZE and PUD_SIZE)
2) the geometry of metadata is very much tied to the value we pick to @align at namespace
provisioning -- not the "align" we might use at mmap() perhaps that's what you referred
above? -- and 3) the value of geometry actually derives from dax device @align because we
will need to create compound pages representing a page size of @align value.

Using your example above: you're saying that dax->align == 1G is mapped with DEVMAP_PTEs,
in reality the vmemmap is populated with PMDs/PUDs page tables (depending on what archs
decide to do at vmemmap_populate()) and uses base pages as its metadata regardless of what
device @align. In reality what we want to convey in @geometry is not page table sizes, but
just the page size used for the vmemmap of the dax device. Additionally, limiting its
value might not be desirable... if tomorrow Linux for some arch supports dax/nvdimm
devices with 4M align or 64K align, the value of @geometry will have to reflect the 4M to
create compound pages of order 10 for the said vmemmap.

I am going to wait until you finish reviewing the remaining four patches of this series,
but maybe this is a simple misnomer (s/align/geometry/) with a comment but without
DEVMAP_{PTE,PMD,PUD} enum part? Or perhaps its own struct with a value and enum a
setter/getter to audit its value? Thoughts?

		Joao
Jane Chu May 18, 2021, 7:56 p.m. UTC | #13
On 5/18/2021 10:27 AM, Joao Martins wrote:

> On 5/5/21 11:36 PM, Joao Martins wrote:
>> On 5/5/21 11:20 PM, Dan Williams wrote:
>>> On Wed, May 5, 2021 at 12:50 PM Joao Martins <joao.m.martins@oracle.com> wrote:
>>>> On 5/5/21 7:44 PM, Dan Williams wrote:
>>>>> On Thu, Mar 25, 2021 at 4:10 PM Joao Martins <joao.m.martins@oracle.com> wrote:
>>>>>> diff --git a/include/linux/memremap.h b/include/linux/memremap.h
>>>>>> index b46f63dcaed3..bb28d82dda5e 100644
>>>>>> --- a/include/linux/memremap.h
>>>>>> +++ b/include/linux/memremap.h
>>>>>> @@ -114,6 +114,7 @@ struct dev_pagemap {
>>>>>>          struct completion done;
>>>>>>          enum memory_type type;
>>>>>>          unsigned int flags;
>>>>>> +       unsigned long align;
>>>>> I think this wants some kernel-doc above to indicate that non-zero
>>>>> means "use compound pages with tail-page dedup" and zero / PAGE_SIZE
>>>>> means "use non-compound base pages".
> [...]
>
>>>>> The non-zero value must be
>>>>> PAGE_SIZE, PMD_PAGE_SIZE or PUD_PAGE_SIZE.
>>>>> Hmm, maybe it should be an
>>>>> enum:
>>>>>
>>>>> enum devmap_geometry {
>>>>>      DEVMAP_PTE,
>>>>>      DEVMAP_PMD,
>>>>>      DEVMAP_PUD,
>>>>> }
>>>>>
>>>> I suppose a converter between devmap_geometry and page_size would be needed too? And maybe
>>>> the whole dax/nvdimm align values change meanwhile (as a followup improvement)?
>>> I think it is ok for dax/nvdimm to continue to maintain their align
>>> value because it should be ok to have 4MB align if the device really
>>> wanted. However, when it goes to map that alignment with
>>> memremap_pages() it can pick a mode. For example, it's already the
>>> case that dax->align == 1GB is mapped with DEVMAP_PTE today, so
>>> they're already separate concepts that can stay separate.
>>>
>> Gotcha.
> I am reconsidering part of the above. In general, yes, the meaning of devmap @align
> represents a slightly different variation of the device @align i.e. how the metadata is
> laid out **but** regardless of what kind of page table entries we use vmemmap.
>
> By using DEVMAP_PTE/PMD/PUD we might end up 1) duplicating what nvdimm/dax already
> validates in terms of allowed device @align values (i.e. PAGE_SIZE, PMD_SIZE and PUD_SIZE)
> 2) the geometry of metadata is very much tied to the value we pick to @align at namespace
> provisioning -- not the "align" we might use at mmap() perhaps that's what you referred
> above? -- and 3) the value of geometry actually derives from dax device @align because we
> will need to create compound pages representing a page size of @align value.
>
> Using your example above: you're saying that dax->align == 1G is mapped with DEVMAP_PTEs,
> in reality the vmemmap is populated with PMDs/PUDs page tables (depending on what archs
> decide to do at vmemmap_populate()) and uses base pages as its metadata regardless of what
> device @align. In reality what we want to convey in @geometry is not page table sizes, but
> just the page size used for the vmemmap of the dax device. Additionally, limiting its
> value might not be desirable... if tomorrow Linux for some arch supports dax/nvdimm
> devices with 4M align or 64K align, the value of @geometry will have to reflect the 4M to
> create compound pages of order 10 for the said vmemmap.
>
> I am going to wait until you finish reviewing the remaining four patches of this series,
> but maybe this is a simple misnomer (s/align/geometry/) with a comment but without
> DEVMAP_{PTE,PMD,PUD} enum part? Or perhaps its own struct with a value and enum a
> setter/getter to audit its value? Thoughts?
>
> 		Joao

Good points there.

My understanding is that  dax->align  conveys granularity of size while 
carving out a namespace,

it's a geometry attribute loosely akin to sector size of a spindle 
disk.  I tend to think that

device pagesize  has almost no relation to "align" in that, it's 
possible to have 1G "align" and

4K pagesize, or verse versa.  That is, with the advent of compound page 
support, it is possible

to totally separate the two concepts.

How about adding a new option to "ndctl create-namespace" that describes 
device creator's desired

pagesize, and another parameter to describe whether the pagesize shall 
be fixed or allowed to be split up,

such that, if the intention is to never split up 2M pagesize, then it 
would be possible to save a lot metadata

space on the device?

thanks,

-jane
Joao Martins May 19, 2021, 11:29 a.m. UTC | #14
On 5/18/21 8:56 PM, Jane Chu wrote:
> On 5/18/2021 10:27 AM, Joao Martins wrote:
> 
>> On 5/5/21 11:36 PM, Joao Martins wrote:
>>> On 5/5/21 11:20 PM, Dan Williams wrote:
>>>> On Wed, May 5, 2021 at 12:50 PM Joao Martins <joao.m.martins@oracle.com> wrote:
>>>>> On 5/5/21 7:44 PM, Dan Williams wrote:
>>>>>> On Thu, Mar 25, 2021 at 4:10 PM Joao Martins <joao.m.martins@oracle.com> wrote:
>>>>>>> diff --git a/include/linux/memremap.h b/include/linux/memremap.h
>>>>>>> index b46f63dcaed3..bb28d82dda5e 100644
>>>>>>> --- a/include/linux/memremap.h
>>>>>>> +++ b/include/linux/memremap.h
>>>>>>> @@ -114,6 +114,7 @@ struct dev_pagemap {
>>>>>>>          struct completion done;
>>>>>>>          enum memory_type type;
>>>>>>>          unsigned int flags;
>>>>>>> +       unsigned long align;
>>>>>> I think this wants some kernel-doc above to indicate that non-zero
>>>>>> means "use compound pages with tail-page dedup" and zero / PAGE_SIZE
>>>>>> means "use non-compound base pages".
>> [...]
>>
>>>>>> The non-zero value must be
>>>>>> PAGE_SIZE, PMD_PAGE_SIZE or PUD_PAGE_SIZE.
>>>>>> Hmm, maybe it should be an
>>>>>> enum:
>>>>>>
>>>>>> enum devmap_geometry {
>>>>>>      DEVMAP_PTE,
>>>>>>      DEVMAP_PMD,
>>>>>>      DEVMAP_PUD,
>>>>>> }
>>>>>>
>>>>> I suppose a converter between devmap_geometry and page_size would be needed too? And maybe
>>>>> the whole dax/nvdimm align values change meanwhile (as a followup improvement)?
>>>> I think it is ok for dax/nvdimm to continue to maintain their align
>>>> value because it should be ok to have 4MB align if the device really
>>>> wanted. However, when it goes to map that alignment with
>>>> memremap_pages() it can pick a mode. For example, it's already the
>>>> case that dax->align == 1GB is mapped with DEVMAP_PTE today, so
>>>> they're already separate concepts that can stay separate.
>>>>
>>> Gotcha.
>> I am reconsidering part of the above. In general, yes, the meaning of devmap @align
>> represents a slightly different variation of the device @align i.e. how the metadata is
>> laid out **but** regardless of what kind of page table entries we use vmemmap.
>>
>> By using DEVMAP_PTE/PMD/PUD we might end up 1) duplicating what nvdimm/dax already
>> validates in terms of allowed device @align values (i.e. PAGE_SIZE, PMD_SIZE and PUD_SIZE)
>> 2) the geometry of metadata is very much tied to the value we pick to @align at namespace
>> provisioning -- not the "align" we might use at mmap() perhaps that's what you referred
>> above? -- and 3) the value of geometry actually derives from dax device @align because we
>> will need to create compound pages representing a page size of @align value.
>>
>> Using your example above: you're saying that dax->align == 1G is mapped with DEVMAP_PTEs,
>> in reality the vmemmap is populated with PMDs/PUDs page tables (depending on what archs
>> decide to do at vmemmap_populate()) and uses base pages as its metadata regardless of what
>> device @align. In reality what we want to convey in @geometry is not page table sizes, but
>> just the page size used for the vmemmap of the dax device. Additionally, limiting its
>> value might not be desirable... if tomorrow Linux for some arch supports dax/nvdimm
>> devices with 4M align or 64K align, the value of @geometry will have to reflect the 4M to
>> create compound pages of order 10 for the said vmemmap.
>>
>> I am going to wait until you finish reviewing the remaining four patches of this series,
>> but maybe this is a simple misnomer (s/align/geometry/) with a comment but without
>> DEVMAP_{PTE,PMD,PUD} enum part? Or perhaps its own struct with a value and enum a
>> setter/getter to audit its value? Thoughts?
> 
> Good points there.
> 
> My understanding is that  dax->align  conveys granularity of size while 
> carving out a namespace it's a geometry attribute loosely akin to sector size of a spindle 
> disk.  I tend to think that device pagesize  has almost no relation to "align" in that, it's 
> possible to have 1G "align" and 4K pagesize, or verse versa.  That is, with the advent of compound page 
> support, it is possible to totally separate the two concepts.
> 
> How about adding a new option to "ndctl create-namespace" that describes 
> device creator's desired pagesize, and another parameter to describe whether the pagesize shall 
> be fixed or allowed to be split up, such that, if the intention is to never split up 2M pagesize, then it 
> would be possible to save a lot metadata space on the device?

Maybe that can be selected by the driver too, but it's an interesting point you raise
should we settle with the geometry (e.g. like a geometry sysfs entry IIUC your
suggestion?). device-dax for example would use geometry == align and therefore save space
(like what I propose in patch 10). But fsdax would retain the default that is geometry =
PAGE_SIZE and align = PMD_SIZE should it want to split pages.

Interestingly, devmap poisoning always occur at @align level regardless of @geometry.

What I am not sure is what value (vs added complexity) it brings to allow geometry *value*
to be selecteable by user given that so far we seem to only ever initialize metadata as
either sets of base pages [*] or sets of compound pages (of a size). And the difference
between both can possibly be summarized to split-ability like you say.

[*] that optionally can are morphed into compound pages by driver
Jane Chu May 19, 2021, 6:36 p.m. UTC | #15
On 5/19/2021 4:29 AM, Joao Martins wrote:
>
> On 5/18/21 8:56 PM, Jane Chu wrote:
>> On 5/18/2021 10:27 AM, Joao Martins wrote:
>>
>>> On 5/5/21 11:36 PM, Joao Martins wrote:
>>>> On 5/5/21 11:20 PM, Dan Williams wrote:
>>>>> On Wed, May 5, 2021 at 12:50 PM Joao Martins <joao.m.martins@oracle.com> wrote:
>>>>>> On 5/5/21 7:44 PM, Dan Williams wrote:
>>>>>>> On Thu, Mar 25, 2021 at 4:10 PM Joao Martins <joao.m.martins@oracle.com> wrote:
>>>>>>>> diff --git a/include/linux/memremap.h b/include/linux/memremap.h
>>>>>>>> index b46f63dcaed3..bb28d82dda5e 100644
>>>>>>>> --- a/include/linux/memremap.h
>>>>>>>> +++ b/include/linux/memremap.h
>>>>>>>> @@ -114,6 +114,7 @@ struct dev_pagemap {
>>>>>>>>           struct completion done;
>>>>>>>>           enum memory_type type;
>>>>>>>>           unsigned int flags;
>>>>>>>> +       unsigned long align;
>>>>>>> I think this wants some kernel-doc above to indicate that non-zero
>>>>>>> means "use compound pages with tail-page dedup" and zero / PAGE_SIZE
>>>>>>> means "use non-compound base pages".
>>> [...]
>>>
>>>>>>> The non-zero value must be
>>>>>>> PAGE_SIZE, PMD_PAGE_SIZE or PUD_PAGE_SIZE.
>>>>>>> Hmm, maybe it should be an
>>>>>>> enum:
>>>>>>>
>>>>>>> enum devmap_geometry {
>>>>>>>       DEVMAP_PTE,
>>>>>>>       DEVMAP_PMD,
>>>>>>>       DEVMAP_PUD,
>>>>>>> }
>>>>>>>
>>>>>> I suppose a converter between devmap_geometry and page_size would be needed too? And maybe
>>>>>> the whole dax/nvdimm align values change meanwhile (as a followup improvement)?
>>>>> I think it is ok for dax/nvdimm to continue to maintain their align
>>>>> value because it should be ok to have 4MB align if the device really
>>>>> wanted. However, when it goes to map that alignment with
>>>>> memremap_pages() it can pick a mode. For example, it's already the
>>>>> case that dax->align == 1GB is mapped with DEVMAP_PTE today, so
>>>>> they're already separate concepts that can stay separate.
>>>>>
>>>> Gotcha.
>>> I am reconsidering part of the above. In general, yes, the meaning of devmap @align
>>> represents a slightly different variation of the device @align i.e. how the metadata is
>>> laid out **but** regardless of what kind of page table entries we use vmemmap.
>>>
>>> By using DEVMAP_PTE/PMD/PUD we might end up 1) duplicating what nvdimm/dax already
>>> validates in terms of allowed device @align values (i.e. PAGE_SIZE, PMD_SIZE and PUD_SIZE)
>>> 2) the geometry of metadata is very much tied to the value we pick to @align at namespace
>>> provisioning -- not the "align" we might use at mmap() perhaps that's what you referred
>>> above? -- and 3) the value of geometry actually derives from dax device @align because we
>>> will need to create compound pages representing a page size of @align value.
>>>
>>> Using your example above: you're saying that dax->align == 1G is mapped with DEVMAP_PTEs,
>>> in reality the vmemmap is populated with PMDs/PUDs page tables (depending on what archs
>>> decide to do at vmemmap_populate()) and uses base pages as its metadata regardless of what
>>> device @align. In reality what we want to convey in @geometry is not page table sizes, but
>>> just the page size used for the vmemmap of the dax device. Additionally, limiting its
>>> value might not be desirable... if tomorrow Linux for some arch supports dax/nvdimm
>>> devices with 4M align or 64K align, the value of @geometry will have to reflect the 4M to
>>> create compound pages of order 10 for the said vmemmap.
>>>
>>> I am going to wait until you finish reviewing the remaining four patches of this series,
>>> but maybe this is a simple misnomer (s/align/geometry/) with a comment but without
>>> DEVMAP_{PTE,PMD,PUD} enum part? Or perhaps its own struct with a value and enum a
>>> setter/getter to audit its value? Thoughts?
>> Good points there.
>>
>> My understanding is that  dax->align  conveys granularity of size while
>> carving out a namespace it's a geometry attribute loosely akin to sector size of a spindle
>> disk.  I tend to think that device pagesize  has almost no relation to "align" in that, it's
>> possible to have 1G "align" and 4K pagesize, or verse versa.  That is, with the advent of compound page
>> support, it is possible to totally separate the two concepts.
>>
>> How about adding a new option to "ndctl create-namespace" that describes
>> device creator's desired pagesize, and another parameter to describe whether the pagesize shall
>> be fixed or allowed to be split up, such that, if the intention is to never split up 2M pagesize, then it
>> would be possible to save a lot metadata space on the device?
> Maybe that can be selected by the driver too, but it's an interesting point you raise
> should we settle with the geometry (e.g. like a geometry sysfs entry IIUC your
> suggestion?). device-dax for example would use geometry == align and therefore save space
> (like what I propose in patch 10). But fsdax would retain the default that is geometry =
> PAGE_SIZE and align = PMD_SIZE should it want to split pages.

Let's see, I think this is what we have today

        | align   hpagesize  geometry  hpage-splittable
=======================================================
devdax | 4K..1G  2M,1G         4K     artificially no
fsdax  | 4K..1G  2M            4K     yes

So a hard no-split means  (hpagesize == geometry), and that does not apply

to fsdax for now. But is it not possible in future?  Some customer prefers

an optional  guarantee that their DAX hpage never been splitted up for 
the sake of

rdma efficiency.

>
> Interestingly, devmap poisoning always occur at @align level regardless of @geometry.
Yeah, it's a simplification that's not ideal, because after all, 
error-blast-radius != UserMapping-pagesize.
>
> What I am not sure is what value (vs added complexity) it brings to allow geometry *value*
> to be selecteable by user given that so far we seem to only ever initialize metadata as
> either sets of base pages [*] or sets of compound pages (of a size). And the difference
> between both can possibly be summarized to split-ability like you say.
>
> [*] that optionally can are morphed into compound pages by driver

Agreed.  For this series, it's simpler not to make the 
compound-page-size selectable.

thanks,

-jane
Dan Williams June 7, 2021, 8:17 p.m. UTC | #16
On Tue, May 18, 2021 at 10:28 AM Joao Martins <joao.m.martins@oracle.com> wrote:
>
> On 5/5/21 11:36 PM, Joao Martins wrote:
> > On 5/5/21 11:20 PM, Dan Williams wrote:
> >> On Wed, May 5, 2021 at 12:50 PM Joao Martins <joao.m.martins@oracle.com> wrote:
> >>> On 5/5/21 7:44 PM, Dan Williams wrote:
> >>>> On Thu, Mar 25, 2021 at 4:10 PM Joao Martins <joao.m.martins@oracle.com> wrote:
> >>>>> diff --git a/include/linux/memremap.h b/include/linux/memremap.h
> >>>>> index b46f63dcaed3..bb28d82dda5e 100644
> >>>>> --- a/include/linux/memremap.h
> >>>>> +++ b/include/linux/memremap.h
> >>>>> @@ -114,6 +114,7 @@ struct dev_pagemap {
> >>>>>         struct completion done;
> >>>>>         enum memory_type type;
> >>>>>         unsigned int flags;
> >>>>> +       unsigned long align;
> >>>>
> >>>> I think this wants some kernel-doc above to indicate that non-zero
> >>>> means "use compound pages with tail-page dedup" and zero / PAGE_SIZE
> >>>> means "use non-compound base pages".
>
> [...]
>
> >>>> The non-zero value must be
> >>>> PAGE_SIZE, PMD_PAGE_SIZE or PUD_PAGE_SIZE.
> >>>> Hmm, maybe it should be an
> >>>> enum:
> >>>>
> >>>> enum devmap_geometry {
> >>>>     DEVMAP_PTE,
> >>>>     DEVMAP_PMD,
> >>>>     DEVMAP_PUD,
> >>>> }
> >>>>
> >>> I suppose a converter between devmap_geometry and page_size would be needed too? And maybe
> >>> the whole dax/nvdimm align values change meanwhile (as a followup improvement)?
> >>
> >> I think it is ok for dax/nvdimm to continue to maintain their align
> >> value because it should be ok to have 4MB align if the device really
> >> wanted. However, when it goes to map that alignment with
> >> memremap_pages() it can pick a mode. For example, it's already the
> >> case that dax->align == 1GB is mapped with DEVMAP_PTE today, so
> >> they're already separate concepts that can stay separate.
> >>
> > Gotcha.
>
> I am reconsidering part of the above. In general, yes, the meaning of devmap @align
> represents a slightly different variation of the device @align i.e. how the metadata is
> laid out **but** regardless of what kind of page table entries we use vmemmap.
>
> By using DEVMAP_PTE/PMD/PUD we might end up 1) duplicating what nvdimm/dax already
> validates in terms of allowed device @align values (i.e. PAGE_SIZE, PMD_SIZE and PUD_SIZE)
> 2) the geometry of metadata is very much tied to the value we pick to @align at namespace
> provisioning -- not the "align" we might use at mmap() perhaps that's what you referred
> above? -- and 3) the value of geometry actually derives from dax device @align because we
> will need to create compound pages representing a page size of @align value.
>
> Using your example above: you're saying that dax->align == 1G is mapped with DEVMAP_PTEs,
> in reality the vmemmap is populated with PMDs/PUDs page tables (depending on what archs
> decide to do at vmemmap_populate()) and uses base pages as its metadata regardless of what
> device @align. In reality what we want to convey in @geometry is not page table sizes, but
> just the page size used for the vmemmap of the dax device.

Good point, the names "PTE, PMD, PUD" imply the hardware mapping size,
not the software compound page size.

> Additionally, limiting its
> value might not be desirable... if tomorrow Linux for some arch supports dax/nvdimm
> devices with 4M align or 64K align, the value of @geometry will have to reflect the 4M to
> create compound pages of order 10 for the said vmemmap.
>
> I am going to wait until you finish reviewing the remaining four patches of this series,
> but maybe this is a simple misnomer (s/align/geometry/) with a comment but without
> DEVMAP_{PTE,PMD,PUD} enum part? Or perhaps its own struct with a value and enum a
> setter/getter to audit its value? Thoughts?

I do see what you mean about the confusion DEVMAP_{PTE,PMD,PUD}
introduces, but I still think the device-dax align and the
organization of the 'struct page' metadata are distinct concepts. So
I'm happy with any color of the bikeshed as long as the 2 concepts are
distinct. How about calling it  "compound_page_order"? Open to other
ideas...
Joao Martins June 7, 2021, 8:47 p.m. UTC | #17
On 6/7/21 9:17 PM, Dan Williams wrote:
> On Tue, May 18, 2021 at 10:28 AM Joao Martins <joao.m.martins@oracle.com> wrote:
>>
>> On 5/5/21 11:36 PM, Joao Martins wrote:
>>> On 5/5/21 11:20 PM, Dan Williams wrote:
>>>> On Wed, May 5, 2021 at 12:50 PM Joao Martins <joao.m.martins@oracle.com> wrote:
>>>>> On 5/5/21 7:44 PM, Dan Williams wrote:
>>>>>> On Thu, Mar 25, 2021 at 4:10 PM Joao Martins <joao.m.martins@oracle.com> wrote:
>>>>>>> diff --git a/include/linux/memremap.h b/include/linux/memremap.h
>>>>>>> index b46f63dcaed3..bb28d82dda5e 100644
>>>>>>> --- a/include/linux/memremap.h
>>>>>>> +++ b/include/linux/memremap.h
>>>>>>> @@ -114,6 +114,7 @@ struct dev_pagemap {
>>>>>>>         struct completion done;
>>>>>>>         enum memory_type type;
>>>>>>>         unsigned int flags;
>>>>>>> +       unsigned long align;
>>>>>>
>>>>>> I think this wants some kernel-doc above to indicate that non-zero
>>>>>> means "use compound pages with tail-page dedup" and zero / PAGE_SIZE
>>>>>> means "use non-compound base pages".
>>
>> [...]
>>
>>>>>> The non-zero value must be
>>>>>> PAGE_SIZE, PMD_PAGE_SIZE or PUD_PAGE_SIZE.
>>>>>> Hmm, maybe it should be an
>>>>>> enum:
>>>>>>
>>>>>> enum devmap_geometry {
>>>>>>     DEVMAP_PTE,
>>>>>>     DEVMAP_PMD,
>>>>>>     DEVMAP_PUD,
>>>>>> }
>>>>>>
>>>>> I suppose a converter between devmap_geometry and page_size would be needed too? And maybe
>>>>> the whole dax/nvdimm align values change meanwhile (as a followup improvement)?
>>>>
>>>> I think it is ok for dax/nvdimm to continue to maintain their align
>>>> value because it should be ok to have 4MB align if the device really
>>>> wanted. However, when it goes to map that alignment with
>>>> memremap_pages() it can pick a mode. For example, it's already the
>>>> case that dax->align == 1GB is mapped with DEVMAP_PTE today, so
>>>> they're already separate concepts that can stay separate.
>>>>
>>> Gotcha.
>>
>> I am reconsidering part of the above. In general, yes, the meaning of devmap @align
>> represents a slightly different variation of the device @align i.e. how the metadata is
>> laid out **but** regardless of what kind of page table entries we use vmemmap.
>>
>> By using DEVMAP_PTE/PMD/PUD we might end up 1) duplicating what nvdimm/dax already
>> validates in terms of allowed device @align values (i.e. PAGE_SIZE, PMD_SIZE and PUD_SIZE)
>> 2) the geometry of metadata is very much tied to the value we pick to @align at namespace
>> provisioning -- not the "align" we might use at mmap() perhaps that's what you referred
>> above? -- and 3) the value of geometry actually derives from dax device @align because we
>> will need to create compound pages representing a page size of @align value.
>>
>> Using your example above: you're saying that dax->align == 1G is mapped with DEVMAP_PTEs,
>> in reality the vmemmap is populated with PMDs/PUDs page tables (depending on what archs
>> decide to do at vmemmap_populate()) and uses base pages as its metadata regardless of what
>> device @align. In reality what we want to convey in @geometry is not page table sizes, but
>> just the page size used for the vmemmap of the dax device.
> 
> Good point, the names "PTE, PMD, PUD" imply the hardware mapping size,
> not the software compound page size.
> 
>> Additionally, limiting its
>> value might not be desirable... if tomorrow Linux for some arch supports dax/nvdimm
>> devices with 4M align or 64K align, the value of @geometry will have to reflect the 4M to
>> create compound pages of order 10 for the said vmemmap.
>>
>> I am going to wait until you finish reviewing the remaining four patches of this series,
>> but maybe this is a simple misnomer (s/align/geometry/) with a comment but without
>> DEVMAP_{PTE,PMD,PUD} enum part? Or perhaps its own struct with a value and enum a
>> setter/getter to audit its value? Thoughts?
> 
> I do see what you mean about the confusion DEVMAP_{PTE,PMD,PUD}
> introduces, but I still think the device-dax align and the
> organization of the 'struct page' metadata are distinct concepts. So
> I'm happy with any color of the bikeshed as long as the 2 concepts are
> distinct. How about calling it  "compound_page_order"? Open to other
> ideas...
> 
I actually like the name of @geometry. The only thing better would be @vmemmap_geometry
solely because it makes it clear that its the vmemmap that we are talking about -- but
might be unnecssarily verbose. And I still agree that is separate concept that should be
named differently *at least*.

But naming aside, I was trying to get at was to avoid a second geometry value validation
i.e. to be validated the value and set with a value such as DEVMAP_PTE, DEVMAP_PMD and
DEVMAP_PUD. That to me sounds a little redundant, when the geometry value depends on what
align is going to be used from. Here my metnion of @align refers to what's used to create
the dax device, not the mmap() align [which can be lower than the device one]. The dax
device align is the one used to decide whether to use PTEs, PMDs or PUDs at dax fault handler.

So separate concepts, but still its value dependent on one another. At least unless we
want to allow geometry values different than those set by --align as Jane suggested.
Joao Martins June 7, 2021, 9 p.m. UTC | #18
On 6/7/21 9:47 PM, Joao Martins wrote:
> On 6/7/21 9:17 PM, Dan Williams wrote:
>> On Tue, May 18, 2021 at 10:28 AM Joao Martins <joao.m.martins@oracle.com> wrote:
>>>
>>> On 5/5/21 11:36 PM, Joao Martins wrote:
>>>> On 5/5/21 11:20 PM, Dan Williams wrote:
>>>>> On Wed, May 5, 2021 at 12:50 PM Joao Martins <joao.m.martins@oracle.com> wrote:
>>>>>> On 5/5/21 7:44 PM, Dan Williams wrote:
>>>>>>> On Thu, Mar 25, 2021 at 4:10 PM Joao Martins <joao.m.martins@oracle.com> wrote:
>>>>>>>> diff --git a/include/linux/memremap.h b/include/linux/memremap.h
>>>>>>>> index b46f63dcaed3..bb28d82dda5e 100644
>>>>>>>> --- a/include/linux/memremap.h
>>>>>>>> +++ b/include/linux/memremap.h
>>>>>>>> @@ -114,6 +114,7 @@ struct dev_pagemap {
>>>>>>>>         struct completion done;
>>>>>>>>         enum memory_type type;
>>>>>>>>         unsigned int flags;
>>>>>>>> +       unsigned long align;
>>>>>>>
>>>>>>> I think this wants some kernel-doc above to indicate that non-zero
>>>>>>> means "use compound pages with tail-page dedup" and zero / PAGE_SIZE
>>>>>>> means "use non-compound base pages".
>>>
>>> [...]
>>>
>>>>>>> The non-zero value must be
>>>>>>> PAGE_SIZE, PMD_PAGE_SIZE or PUD_PAGE_SIZE.
>>>>>>> Hmm, maybe it should be an
>>>>>>> enum:
>>>>>>>
>>>>>>> enum devmap_geometry {
>>>>>>>     DEVMAP_PTE,
>>>>>>>     DEVMAP_PMD,
>>>>>>>     DEVMAP_PUD,
>>>>>>> }
>>>>>>>
>>>>>> I suppose a converter between devmap_geometry and page_size would be needed too? And maybe
>>>>>> the whole dax/nvdimm align values change meanwhile (as a followup improvement)?
>>>>>
>>>>> I think it is ok for dax/nvdimm to continue to maintain their align
>>>>> value because it should be ok to have 4MB align if the device really
>>>>> wanted. However, when it goes to map that alignment with
>>>>> memremap_pages() it can pick a mode. For example, it's already the
>>>>> case that dax->align == 1GB is mapped with DEVMAP_PTE today, so
>>>>> they're already separate concepts that can stay separate.
>>>>>
>>>> Gotcha.
>>>
>>> I am reconsidering part of the above. In general, yes, the meaning of devmap @align
>>> represents a slightly different variation of the device @align i.e. how the metadata is
>>> laid out **but** regardless of what kind of page table entries we use vmemmap.
>>>
>>> By using DEVMAP_PTE/PMD/PUD we might end up 1) duplicating what nvdimm/dax already
>>> validates in terms of allowed device @align values (i.e. PAGE_SIZE, PMD_SIZE and PUD_SIZE)
>>> 2) the geometry of metadata is very much tied to the value we pick to @align at namespace
>>> provisioning -- not the "align" we might use at mmap() perhaps that's what you referred
>>> above? -- and 3) the value of geometry actually derives from dax device @align because we
>>> will need to create compound pages representing a page size of @align value.
>>>
>>> Using your example above: you're saying that dax->align == 1G is mapped with DEVMAP_PTEs,
>>> in reality the vmemmap is populated with PMDs/PUDs page tables (depending on what archs
>>> decide to do at vmemmap_populate()) and uses base pages as its metadata regardless of what
>>> device @align. In reality what we want to convey in @geometry is not page table sizes, but
>>> just the page size used for the vmemmap of the dax device.
>>
>> Good point, the names "PTE, PMD, PUD" imply the hardware mapping size,
>> not the software compound page size.
>>
>>> Additionally, limiting its
>>> value might not be desirable... if tomorrow Linux for some arch supports dax/nvdimm
>>> devices with 4M align or 64K align, the value of @geometry will have to reflect the 4M to
>>> create compound pages of order 10 for the said vmemmap.
>>>
>>> I am going to wait until you finish reviewing the remaining four patches of this series,
>>> but maybe this is a simple misnomer (s/align/geometry/) with a comment but without
>>> DEVMAP_{PTE,PMD,PUD} enum part? Or perhaps its own struct with a value and enum a
>>> setter/getter to audit its value? Thoughts?
>>
>> I do see what you mean about the confusion DEVMAP_{PTE,PMD,PUD}
>> introduces, but I still think the device-dax align and the
>> organization of the 'struct page' metadata are distinct concepts. So
>> I'm happy with any color of the bikeshed as long as the 2 concepts are
>> distinct. How about calling it  "compound_page_order"? Open to other
>> ideas...
>>
> I actually like the name of @geometry. The only thing better would be @vmemmap_geometry
> solely because it makes it clear that its the vmemmap that we are talking about -- but
> might be unnecssarily verbose. And I still agree that is separate concept that should be
> named differently *at least*.
> 
> But naming aside, I was trying to get at was to avoid a second geometry value validation
> i.e. to be validated the value and set with a value such as DEVMAP_PTE, DEVMAP_PMD and
> DEVMAP_PUD. 

Sorry my english keeps getting broken, I meant this instead:

But naming aside, what I am trying to get at is to remove the second geometry value
validation i.e. for @geometry to not be validated a second time to be set to DEVMAP_PTE,
DEVMAP_PMD or DEVMAP_PUD.

> That to me sounds a little redundant, when the geometry value depends on what
> align is going to be used from. Here my metnion of @align refers to what's used to create
> the dax device, not the mmap() align [which can be lower than the device one]. The dax
> device align is the one used to decide whether to use PTEs, PMDs or PUDs at dax fault handler.
> 
> So separate concepts, but still its value dependent on one another. At least unless we
> want to allow geometry values different than those set by --align as Jane suggested.
> 

And I should add:

I can maintain the DEVMAP_* enum values, but then these will need to be changed in tandem
anytime a new @align value is supported. Or instead we use the name @geometry albeit with
still as an unsigned long type . Or rather than an unsigned long perhaps making another
type and its value obtained/changed with getter/setter.
Dan Williams June 7, 2021, 9:57 p.m. UTC | #19
On Mon, Jun 7, 2021 at 2:02 PM Joao Martins <joao.m.martins@oracle.com> wrote:
[..]
> > But naming aside, I was trying to get at was to avoid a second geometry value validation
> > i.e. to be validated the value and set with a value such as DEVMAP_PTE, DEVMAP_PMD and
> > DEVMAP_PUD.
>
> Sorry my english keeps getting broken, I meant this instead:
>
> But naming aside, what I am trying to get at is to remove the second geometry value
> validation i.e. for @geometry to not be validated a second time to be set to DEVMAP_PTE,
> DEVMAP_PMD or DEVMAP_PUD.
>
> > That to me sounds a little redundant, when the geometry value depends on what
> > align is going to be used from. Here my metnion of @align refers to what's used to create
> > the dax device, not the mmap() align [which can be lower than the device one]. The dax
> > device align is the one used to decide whether to use PTEs, PMDs or PUDs at dax fault handler.
> >
> > So separate concepts, but still its value dependent on one another. At least unless we
> > want to allow geometry values different than those set by --align as Jane suggested.
> >
>
> And I should add:
>
> I can maintain the DEVMAP_* enum values, but then these will need to be changed in tandem
> anytime a new @align value is supported. Or instead we use the name @geometry albeit with
> still as an unsigned long type . Or rather than an unsigned long perhaps making another
> type and its value obtained/changed with getter/setter.

No, feel free to drop the enum and use an explicit "geometry" value
for the vmemmap.
diff mbox series

Patch

diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index b46f63dcaed3..bb28d82dda5e 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -114,6 +114,7 @@  struct dev_pagemap {
 	struct completion done;
 	enum memory_type type;
 	unsigned int flags;
+	unsigned long align;
 	const struct dev_pagemap_ops *ops;
 	void *owner;
 	int nr_range;
@@ -130,6 +131,18 @@  static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap)
 	return NULL;
 }
 
+static inline unsigned long pgmap_align(struct dev_pagemap *pgmap)
+{
+	if (!pgmap || !pgmap->align)
+		return PAGE_SIZE;
+	return pgmap->align;
+}
+
+static inline unsigned long pgmap_pfn_align(struct dev_pagemap *pgmap)
+{
+	return PHYS_PFN(pgmap_align(pgmap));
+}
+
 #ifdef CONFIG_ZONE_DEVICE
 bool pfn_zone_device_reserved(unsigned long pfn);
 void *memremap_pages(struct dev_pagemap *pgmap, int nid);
diff --git a/mm/memremap.c b/mm/memremap.c
index 805d761740c4..d160853670c4 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -318,8 +318,12 @@  static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
 	memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
 				PHYS_PFN(range->start),
 				PHYS_PFN(range_len(range)), pgmap);
-	percpu_ref_get_many(pgmap->ref, pfn_end(pgmap, range_id)
-			- pfn_first(pgmap, range_id));
+	if (pgmap_align(pgmap) > PAGE_SIZE)
+		percpu_ref_get_many(pgmap->ref, (pfn_end(pgmap, range_id)
+			- pfn_first(pgmap, range_id)) / pgmap_pfn_align(pgmap));
+	else
+		percpu_ref_get_many(pgmap->ref, pfn_end(pgmap, range_id)
+				- pfn_first(pgmap, range_id));
 	return 0;
 
 err_add_memory:
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 58974067bbd4..3a77f9e43f3a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6285,6 +6285,8 @@  void __ref memmap_init_zone_device(struct zone *zone,
 	unsigned long pfn, end_pfn = start_pfn + nr_pages;
 	struct pglist_data *pgdat = zone->zone_pgdat;
 	struct vmem_altmap *altmap = pgmap_altmap(pgmap);
+	unsigned int pfn_align = pgmap_pfn_align(pgmap);
+	unsigned int order_align = order_base_2(pfn_align);
 	unsigned long zone_idx = zone_idx(zone);
 	unsigned long start = jiffies;
 	int nid = pgdat->node_id;
@@ -6302,10 +6304,30 @@  void __ref memmap_init_zone_device(struct zone *zone,
 		nr_pages = end_pfn - start_pfn;
 	}
 
-	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+	for (pfn = start_pfn; pfn < end_pfn; pfn += pfn_align) {
 		struct page *page = pfn_to_page(pfn);
+		unsigned long i;
 
 		__init_zone_device_page(page, pfn, zone_idx, nid, pgmap);
+
+		if (pfn_align == 1)
+			continue;
+
+		__SetPageHead(page);
+
+		for (i = 1; i < pfn_align; i++) {
+			__init_zone_device_page(page + i, pfn + i, zone_idx,
+						nid, pgmap);
+			prep_compound_tail(page, i);
+
+			/*
+			 * The first and second tail pages need to
+			 * initialized first, hence the head page is
+			 * prepared last.
+			 */
+			if (i == 2)
+				prep_compound_head(page, order_align);
+		}
 	}
 
 	pr_info("%s initialised %lu pages in %ums\n", __func__,