[Chapter,Three] THP HVO: bring the hugeTLB feature to THP

Message ID 20240229183436.4110845-4-yuzhao@google.com (mailing list archive)
State New
Series [Chapter,Three] THP HVO: bring the hugeTLB feature to THP

Commit Message

Yu Zhao Feb. 29, 2024, 6:34 p.m. UTC
HVO can be one of the perks for heavy THP users like it is for hugeTLB
users. For example, if such a user uses 60% of physical memory for 2MB
THPs, THP HVO can reduce the struct page overhead by half (60% * 7/8
~= 50%).
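
(For reference, assuming 4KB base pages and a 64-byte struct page, a 2MB
THP has 512 struct pages, i.e. 32KB or eight vmemmap pages; HVO keeps one
and frees the other seven, hence the 7/8 factor.)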

ZONE_NOMERGE considerably simplifies the implementation of HVO for
THPs, since THPs from it cannot be split or merged and thus do not
require any correctness-related operations on tail pages beyond the
second one.

If a THP is mapped by PTEs, two optimization-related operations on its
tail pages, i.e., _mapcount and PG_anon_exclusive, can be binned to
track a group of pages, e.g., eight pages per group for 2MB THPs. The
resulting estimation, like the copying cost incurred during shattering,
is by design, since mapping THPs by PTEs is another discouraged behavior.
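
As an illustration of the binning arithmetic, here is a small userspace
sketch (illustration only: the 4KB page size, 64-byte struct page and
order-9 values are assumptions, and none of the names are kernel API).
After HVO only the head vmemmap page stays writable, so its 64 struct
pages provide one shared _mapcount counter per bin of eight consecutive
subpages:

#include <stdio.h>

#define MODEL_PAGE_SIZE        4096    /* assumed base page size */
#define MODEL_STRUCT_PAGE      64      /* assumed sizeof(struct page) */
#define MODEL_ORDER            9       /* 2MB THP */

int main(void)
{
        int nr_subpages = 1 << MODEL_ORDER;                  /* 512 */
        int vmemmap_bytes = nr_subpages * MODEL_STRUCT_PAGE; /* 32KB, i.e. 8 vmemmap pages */
        int stride = vmemmap_bytes / MODEL_PAGE_SIZE;        /* 8, cf. hvo_order_size() / PAGE_SIZE */
        int nr_bins = nr_subpages / stride;                  /* 64, all within the kept vmemmap page */
        int i;

        printf("stride %d, bins %d\n", stride, nr_bins);

        /* a PTE mapping of subpage i is accounted in bin i / stride */
        for (i = 0; i < nr_subpages; i += 111)
                printf("subpage %3d -> bin %2d\n", i, i / stride);

        return 0;
}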

Signed-off-by: Yu Zhao <yuzhao@google.com>
---
 include/linux/mm.h     | 140 ++++++++++++++++++++++++++++++++++++++
 include/linux/mmzone.h |   1 +
 include/linux/rmap.h   |   4 ++
 init/main.c            |   1 +
 mm/gup.c               |   3 +-
 mm/huge_memory.c       |   2 +
 mm/hugetlb_vmemmap.c   |   2 +-
 mm/internal.h          |   9 ---
 mm/memory.c            |  11 +--
 mm/page_alloc.c        | 151 ++++++++++++++++++++++++++++++++++++++++-
 mm/rmap.c              |  17 ++++-
 mm/vmstat.c            |   2 +
 12 files changed, 323 insertions(+), 20 deletions(-)

Comments

Yang Shi Feb. 29, 2024, 10:54 p.m. UTC | #1
On Thu, Feb 29, 2024 at 10:34 AM Yu Zhao <yuzhao@google.com> wrote:
>
> HVO can be one of the perks for heavy THP users like it is for hugeTLB
> users. For example, if such a user uses 60% of physical memory for 2MB
> THPs, THP HVO can reduce the struct page overhead by half (60% * 7/8
> ~= 50%).
>
> ZONE_NOMERGE considerably simplifies the implementation of HVO for
> THPs, since THPs from it cannot be split or merged and thus do not
> require any correctness-related operations on tail pages beyond the
> second one.
>
> If a THP is mapped by PTEs, two optimization-related operations on its
> tail pages, i.e., _mapcount and PG_anon_exclusive, can be binned to
> track a group of pages, e.g., eight pages per group for 2MB THPs. The
> resulting estimation, like the copying cost incurred during shattering,
> is by design, since mapping THPs by PTEs is another discouraged behavior.

I'm confused by this. Can you please elaborate a little bit about
binning mapcount and PG_anon_exclusive?

For mapcount, IIUC, for example, when inc'ing a subpage's mapcount,
you actually inc the (i % 64) page's mapcount (assuming THP size is 2M
and base page size is 4K, so 8 strides and 64 pages in each stride),
right? But how you can tell each page of the 8 pages has mapcount 1 or
one page is mapped 8 times? Or this actually doesn't matter, we don't
even care to distinguish the two cases?

For PG_anon_exclusive, if one page has it set, it means other 7 pages
in other strides have it set too?

>
> Signed-off-by: Yu Zhao <yuzhao@google.com>
> ---
>  include/linux/mm.h     | 140 ++++++++++++++++++++++++++++++++++++++
>  include/linux/mmzone.h |   1 +
>  include/linux/rmap.h   |   4 ++
>  init/main.c            |   1 +
>  mm/gup.c               |   3 +-
>  mm/huge_memory.c       |   2 +
>  mm/hugetlb_vmemmap.c   |   2 +-
>  mm/internal.h          |   9 ---
>  mm/memory.c            |  11 +--
>  mm/page_alloc.c        | 151 ++++++++++++++++++++++++++++++++++++++++-
>  mm/rmap.c              |  17 ++++-
>  mm/vmstat.c            |   2 +
>  12 files changed, 323 insertions(+), 20 deletions(-)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index f5a97dec5169..d7014fc35cca 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1196,6 +1196,138 @@ static inline void page_mapcount_reset(struct page *page)
>         atomic_set(&(page)->_mapcount, -1);
>  }
>
> +#define HVO_MOD (PAGE_SIZE / sizeof(struct page))
> +
> +static inline int hvo_order_size(int order)
> +{
> +       if (PAGE_SIZE % sizeof(struct page) || !is_power_of_2(HVO_MOD))
> +               return 0;
> +
> +       return (1 << order) * sizeof(struct page);
> +}
> +
> +static inline bool page_hvo_suitable(struct page *head, int order)
> +{
> +       VM_WARN_ON_ONCE_PAGE(!test_bit(PG_head, &head->flags), head);
> +
> +       if (!static_branch_unlikely(&hugetlb_optimize_vmemmap_key))
> +               return false;
> +
> +       return page_zonenum(head) == ZONE_NOMERGE &&
> +              IS_ALIGNED((unsigned long)head, PAGE_SIZE) &&
> +              hvo_order_size(order) > PAGE_SIZE;
> +}
> +
> +static inline bool folio_hvo_suitable(struct folio *folio)
> +{
> +       return folio_test_large(folio) && page_hvo_suitable(&folio->page, folio_order(folio));
> +}
> +
> +static inline bool page_is_hvo(struct page *head, int order)
> +{
> +       return page_hvo_suitable(head, order) && test_bit(PG_head, &head[HVO_MOD].flags);
> +}
> +
> +static inline bool folio_is_hvo(struct folio *folio)
> +{
> +       return folio_test_large(folio) && page_is_hvo(&folio->page, folio_order(folio));
> +}
> +
> +/*
> + * If a 16GB hugetlb folio were mapped by PTEs of all of its 4kB pages,
> + * its nr_pages_mapped would be 0x400000: choose the ENTIRELY_MAPPED bit
> + * above that range, instead of 2*(PMD_SIZE/PAGE_SIZE).  Hugetlb currently
> + * leaves nr_pages_mapped at 0, but avoid surprise if it participates later.
> + */
> +#define ENTIRELY_MAPPED                0x800000
> +#define FOLIO_PAGES_MAPPED     (ENTIRELY_MAPPED - 1)
> +
> +static inline bool hvo_range_mapcount(struct folio *folio, struct page *page, int nr_pages, int *ret)
> +{
> +       int i, next, end;
> +       int stride = hvo_order_size(folio_order(folio)) / PAGE_SIZE;
> +
> +       if (!folio_is_hvo(folio))
> +               return false;
> +
> +       *ret = folio_entire_mapcount(folio);
> +
> +       for (i = folio_page_idx(folio, page), end = i + nr_pages; i != end; i = next) {
> +               next = min(end, round_down(i + stride, stride));
> +
> +               page = folio_page(folio, i / stride);
> +               *ret += atomic_read(&page->_mapcount) + 1;
> +       }
> +
> +       return true;
> +}
> +
> +static inline bool hvo_map_range(struct folio *folio, struct page *page, int nr_pages, int *ret)
> +{
> +       int i, next, end;
> +       int stride = hvo_order_size(folio_order(folio)) / PAGE_SIZE;
> +
> +       if (!folio_is_hvo(folio))
> +               return false;
> +
> +       *ret = 0;
> +
> +       for (i = folio_page_idx(folio, page), end = i + nr_pages; i != end; i = next) {
> +               next = min(end, round_down(i + stride, stride));
> +
> +               page = folio_page(folio, i / stride);
> +               if (atomic_add_return(next - i, &page->_mapcount) == next - i - 1)
> +                       *ret += stride;
> +       }
> +
> +       if (atomic_add_return(*ret, &folio->_nr_pages_mapped) >= ENTIRELY_MAPPED)
> +               *ret = 0;
> +
> +       return true;
> +}
> +
> +static inline bool hvo_unmap_range(struct folio *folio, struct page *page, int nr_pages, int *ret)
> +{
> +       int i, next, end;
> +       int stride = hvo_order_size(folio_order(folio)) / PAGE_SIZE;
> +
> +       if (!folio_is_hvo(folio))
> +               return false;
> +
> +       *ret = 0;
> +
> +       for (i = folio_page_idx(folio, page), end = i + nr_pages; i != end; i = next) {
> +               next = min(end, round_down(i + stride, stride));
> +
> +               page = folio_page(folio, i / stride);
> +               if (atomic_sub_return(next - i, &page->_mapcount) == -1)
> +                       *ret += stride;
> +       }
> +
> +       if (atomic_sub_return(*ret, &folio->_nr_pages_mapped) >= ENTIRELY_MAPPED)
> +               *ret = 0;
> +
> +       return true;
> +}
> +
> +static inline bool hvo_dup_range(struct folio *folio, struct page *page, int nr_pages)
> +{
> +       int i, next, end;
> +       int stride = hvo_order_size(folio_order(folio)) / PAGE_SIZE;
> +
> +       if (!folio_is_hvo(folio))
> +               return false;
> +
> +       for (i = folio_page_idx(folio, page), end = i + nr_pages; i != end; i = next) {
> +               next = min(end, round_down(i + stride, stride));
> +
> +               page = folio_page(folio, i / stride);
> +               atomic_add(next - i, &page->_mapcount);
> +       }
> +
> +       return true;
> +}
> +
>  /**
>   * page_mapcount() - Number of times this precise page is mapped.
>   * @page: The page.
> @@ -1212,6 +1344,9 @@ static inline int page_mapcount(struct page *page)
>  {
>         int mapcount = atomic_read(&page->_mapcount) + 1;
>
> +       if (hvo_range_mapcount(page_folio(page), page, 1, &mapcount))
> +               return mapcount;
> +
>         if (unlikely(PageCompound(page)))
>                 mapcount += folio_entire_mapcount(page_folio(page));
>
> @@ -3094,6 +3229,11 @@ static inline void pagetable_pud_dtor(struct ptdesc *ptdesc)
>
>  extern void __init pagecache_init(void);
>  extern void free_initmem(void);
> +extern void free_vmemmap(void);
> +extern int vmemmap_remap_free(unsigned long start, unsigned long end,
> +                             unsigned long reuse,
> +                             struct list_head *vmemmap_pages,
> +                             unsigned long flags);
>
>  /*
>   * Free reserved pages within range [PAGE_ALIGN(start), end & PAGE_MASK)
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 532218167bba..00e4bb6c8533 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -916,6 +916,7 @@ struct zone {
>  #ifdef CONFIG_CMA
>         unsigned long           cma_pages;
>  #endif
> +       atomic_long_t           hvo_freed;
>
>         const char              *name;
>
> diff --git a/include/linux/rmap.h b/include/linux/rmap.h
> index b7944a833668..d058c4cb3c96 100644
> --- a/include/linux/rmap.h
> +++ b/include/linux/rmap.h
> @@ -322,6 +322,8 @@ static __always_inline void __folio_dup_file_rmap(struct folio *folio,
>
>         switch (level) {
>         case RMAP_LEVEL_PTE:
> +               if (hvo_dup_range(folio, page, nr_pages))
> +                       break;
>                 do {
>                         atomic_inc(&page->_mapcount);
>                 } while (page++, --nr_pages > 0);
> @@ -401,6 +403,8 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio,
>                                 if (PageAnonExclusive(page + i))
>                                         return -EBUSY;
>                 }
> +               if (hvo_dup_range(folio, page, nr_pages))
> +                       break;
>                 do {
>                         if (PageAnonExclusive(page))
>                                 ClearPageAnonExclusive(page);
> diff --git a/init/main.c b/init/main.c
> index e24b0780fdff..74003495db32 100644
> --- a/init/main.c
> +++ b/init/main.c
> @@ -1448,6 +1448,7 @@ static int __ref kernel_init(void *unused)
>         kgdb_free_init_mem();
>         exit_boot_config();
>         free_initmem();
> +       free_vmemmap();
>         mark_readonly();
>
>         /*
> diff --git a/mm/gup.c b/mm/gup.c
> index df83182ec72d..f3df0078505b 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -57,7 +57,7 @@ static inline void sanity_check_pinned_pages(struct page **pages,
>                         continue;
>                 if (!folio_test_large(folio) || folio_test_hugetlb(folio))
>                         VM_BUG_ON_PAGE(!PageAnonExclusive(&folio->page), page);
> -               else
> +               else if (!folio_is_hvo(folio) || !folio_nr_pages_mapped(folio))
>                         /* Either a PTE-mapped or a PMD-mapped THP. */
>                         VM_BUG_ON_PAGE(!PageAnonExclusive(&folio->page) &&
>                                        !PageAnonExclusive(page), page);
> @@ -645,6 +645,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
>         }
>
>         VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
> +                      !folio_is_hvo(page_folio(page)) &&
>                        !PageAnonExclusive(page), page);
>
>         /* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 62d2254bc51c..9e7e5d587a5c 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -2535,6 +2535,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>                  *
>                  * See folio_try_share_anon_rmap_pmd(): invalidate PMD first.
>                  */
> +               if (folio_is_hvo(folio))
> +                       ClearPageAnonExclusive(page);
>                 anon_exclusive = PageAnonExclusive(page);
>                 if (freeze && anon_exclusive &&
>                     folio_try_share_anon_rmap_pmd(folio, page))
> diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
> index da177e49d956..9f43d900e83c 100644
> --- a/mm/hugetlb_vmemmap.c
> +++ b/mm/hugetlb_vmemmap.c
> @@ -310,7 +310,7 @@ static int vmemmap_remap_split(unsigned long start, unsigned long end,
>   *
>   * Return: %0 on success, negative error code otherwise.
>   */
> -static int vmemmap_remap_free(unsigned long start, unsigned long end,
> +int vmemmap_remap_free(unsigned long start, unsigned long end,
>                               unsigned long reuse,
>                               struct list_head *vmemmap_pages,
>                               unsigned long flags)
> diff --git a/mm/internal.h b/mm/internal.h
> index ac1d27468899..871c6eeb78b8 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -52,15 +52,6 @@ struct folio_batch;
>
>  void page_writeback_init(void);
>
> -/*
> - * If a 16GB hugetlb folio were mapped by PTEs of all of its 4kB pages,
> - * its nr_pages_mapped would be 0x400000: choose the ENTIRELY_MAPPED bit
> - * above that range, instead of 2*(PMD_SIZE/PAGE_SIZE).  Hugetlb currently
> - * leaves nr_pages_mapped at 0, but avoid surprise if it participates later.
> - */
> -#define ENTIRELY_MAPPED                0x800000
> -#define FOLIO_PAGES_MAPPED     (ENTIRELY_MAPPED - 1)
> -
>  /*
>   * Flags passed to __show_mem() and show_free_areas() to suppress output in
>   * various contexts.
> diff --git a/mm/memory.c b/mm/memory.c
> index 0bfc8b007c01..db389f1d776d 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -3047,8 +3047,8 @@ static inline void wp_page_reuse(struct vm_fault *vmf, struct folio *folio)
>         VM_BUG_ON(!(vmf->flags & FAULT_FLAG_WRITE));
>
>         if (folio) {
> -               VM_BUG_ON(folio_test_anon(folio) &&
> -                         !PageAnonExclusive(vmf->page));
> +               VM_BUG_ON_PAGE(folio_test_anon(folio) && !folio_is_hvo(folio) &&
> +                              !PageAnonExclusive(vmf->page), vmf->page);
>                 /*
>                  * Clear the folio's cpupid information as the existing
>                  * information potentially belongs to a now completely
> @@ -3502,7 +3502,7 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
>          */
>         if (folio && folio_test_anon(folio) &&
>             (PageAnonExclusive(vmf->page) || wp_can_reuse_anon_folio(folio, vma))) {
> -               if (!PageAnonExclusive(vmf->page))
> +               if (!folio_is_hvo(folio) && !PageAnonExclusive(vmf->page))
>                         SetPageAnonExclusive(vmf->page);
>                 if (unlikely(unshare)) {
>                         pte_unmap_unlock(vmf->pte, vmf->ptl);
> @@ -4100,8 +4100,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>                                         rmap_flags);
>         }
>
> -       VM_BUG_ON(!folio_test_anon(folio) ||
> -                       (pte_write(pte) && !PageAnonExclusive(page)));
> +       VM_BUG_ON_PAGE(!folio_test_anon(folio) ||
> +                      (pte_write(pte) && !folio_is_hvo(folio) && !PageAnonExclusive(page)),
> +                      page);
>         set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
>         arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
>
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index dd843fb04f78..5f8c6583a191 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -53,6 +53,7 @@
>  #include <linux/khugepaged.h>
>  #include <linux/delayacct.h>
>  #include <linux/cacheinfo.h>
> +#include <linux/bootmem_info.h>
>  #include <asm/div64.h>
>  #include "internal.h"
>  #include "shuffle.h"
> @@ -585,6 +586,10 @@ void prep_compound_page(struct page *page, unsigned int order)
>         int nr_pages = 1 << order;
>
>         __SetPageHead(page);
> +
> +       if (page_is_hvo(page, order))
> +               nr_pages = HVO_MOD;
> +
>         for (i = 1; i < nr_pages; i++)
>                 prep_compound_tail(page, i);
>
> @@ -1124,10 +1129,15 @@ static __always_inline bool free_pages_prepare(struct page *page,
>          */
>         if (unlikely(order)) {
>                 int i;
> +               int nr_pages = 1 << order;
>
> -               if (compound)
> +               if (compound) {
> +                       if (page_is_hvo(page, order))
> +                               nr_pages = HVO_MOD;
>                         page[1].flags &= ~PAGE_FLAGS_SECOND;
> -               for (i = 1; i < (1 << order); i++) {
> +               }
> +
> +               for (i = 1; i < nr_pages; i++) {
>                         if (compound)
>                                 bad += free_tail_page_prepare(page, page + i);
>                         if (is_check_pages_enabled()) {
> @@ -1547,6 +1557,141 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
>         page_table_check_alloc(page, order);
>  }
>
> +static void prep_hvo_page(struct page *head, int order)
> +{
> +       LIST_HEAD(list);
> +       struct page *page, *next;
> +       int freed = 0;
> +       unsigned long start = (unsigned long)head;
> +       unsigned long end = start + hvo_order_size(order);
> +
> +       if (page_zonenum(head) != ZONE_NOMERGE)
> +               return;
> +
> +       if (WARN_ON_ONCE(order != page_zone(head)->order)) {
> +               bad_page(head, "invalid page order");
> +               return;
> +       }
> +
> +       if (!page_hvo_suitable(head, order) || page_is_hvo(head, order))
> +               return;
> +
> +       vmemmap_remap_free(start + PAGE_SIZE, end, start, &list, 0);
> +
> +       list_for_each_entry_safe(page, next, &list, lru) {
> +               if (PageReserved(page))
> +                       free_bootmem_page(page);
> +               else
> +                       __free_page(page);
> +               freed++;
> +       }
> +
> +       atomic_long_add(freed, &page_zone(head)->hvo_freed);
> +}
> +
> +static void prep_nomerge_zone(struct zone *zone, enum migratetype type)
> +{
> +       int order;
> +       unsigned long flags;
> +
> +       spin_lock_irqsave(&zone->lock, flags);
> +
> +       for (order = MAX_PAGE_ORDER; order > zone->order; order--) {
> +               struct page *page;
> +               int split = 0;
> +               struct free_area *area = zone->free_area + order;
> +
> +               while ((page = get_page_from_free_area(area, type))) {
> +                       del_page_from_free_list(page, zone, order);
> +                       expand(zone, page, zone->order, order, type);
> +                       set_buddy_order(page, zone->order);
> +                       add_to_free_list(page, zone, zone->order, type);
> +                       split++;
> +               }
> +
> +               pr_info("  HVO: order %d split %d\n", order, split);
> +       }
> +
> +       spin_unlock_irqrestore(&zone->lock, flags);
> +}
> +
> +static void hvo_nomerge_zone(struct zone *zone, enum migratetype type)
> +{
> +       LIST_HEAD(old);
> +       LIST_HEAD(new);
> +       int nomem, freed;
> +       unsigned long flags;
> +       struct list_head list;
> +       struct page *page, *next;
> +       struct free_area *area = zone->free_area + zone->order;
> +again:
> +       nomem = freed = 0;
> +       INIT_LIST_HEAD(&list);
> +
> +       spin_lock_irqsave(&zone->lock, flags);
> +       list_splice_init(area->free_list + type, &old);
> +       spin_unlock_irqrestore(&zone->lock, flags);
> +
> +       list_for_each_entry_safe(page, next, &old, buddy_list) {
> +               unsigned long start = (unsigned long)page;
> +               unsigned long end = start + hvo_order_size(zone->order);
> +
> +               if (WARN_ON_ONCE(!IS_ALIGNED(start, PAGE_SIZE)))
> +                       continue;
> +
> +               if (vmemmap_remap_free(start + PAGE_SIZE, end, start, &list, 0))
> +                       nomem++;
> +       }
> +
> +       list_for_each_entry_safe(page, next, &list, lru) {
> +               if (PageReserved(page))
> +                       free_bootmem_page(page);
> +               else
> +                       __free_page(page);
> +               freed++;
> +       }
> +
> +       list_splice_init(&old, &new);
> +       atomic_long_add(freed, &zone->hvo_freed);
> +
> +       pr_info("  HVO: nomem %d freed %d\n", nomem, freed);
> +
> +       if (!list_empty(area->free_list + type))
> +               goto again;
> +
> +       spin_lock_irqsave(&zone->lock, flags);
> +       list_splice(&new, area->free_list + type);
> +       spin_unlock_irqrestore(&zone->lock, flags);
> +}
> +
> +static bool zone_hvo_suitable(struct zone *zone)
> +{
> +       if (!static_branch_unlikely(&hugetlb_optimize_vmemmap_key))
> +               return false;
> +
> +       return zone_idx(zone) == ZONE_NOMERGE && hvo_order_size(zone->order) > PAGE_SIZE;
> +}
> +
> +void free_vmemmap(void)
> +{
> +       struct zone *zone;
> +
> +       static_branch_inc(&hugetlb_optimize_vmemmap_key);
> +
> +       for_each_populated_zone(zone) {
> +               if (!zone_hvo_suitable(zone))
> +                       continue;
> +
> +               pr_info("Freeing vmemmap of node %d zone %s\n",
> +                        zone_to_nid(zone), zone->name);
> +
> +               prep_nomerge_zone(zone, MIGRATE_MOVABLE);
> +               hvo_nomerge_zone(zone, MIGRATE_MOVABLE);
> +
> +               cond_resched();
> +       }
> +}
> +
>  static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
>                                                         unsigned int alloc_flags)
>  {
> @@ -1565,6 +1710,8 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags
>                 set_page_pfmemalloc(page);
>         else
>                 clear_page_pfmemalloc(page);
> +
> +       prep_hvo_page(page, order);
>  }
>
>  /*
> diff --git a/mm/rmap.c b/mm/rmap.c
> index 0ddb28c52961..d339bf489230 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -1143,6 +1143,10 @@ int folio_total_mapcount(struct folio *folio)
>         /* In the common case, avoid the loop when no pages mapped by PTE */
>         if (folio_nr_pages_mapped(folio) == 0)
>                 return mapcount;
> +
> +       if (hvo_range_mapcount(folio, &folio->page, folio_nr_pages(folio), &mapcount))
> +               return mapcount;
> +
>         /*
>          * Add all the PTE mappings of those pages mapped by PTE.
>          * Limit the loop to folio_nr_pages_mapped()?
> @@ -1168,6 +1172,8 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio,
>
>         switch (level) {
>         case RMAP_LEVEL_PTE:
> +               if (hvo_map_range(folio, page, nr_pages, &nr))
> +                       break;
>                 do {
>                         first = atomic_inc_and_test(&page->_mapcount);
>                         if (first && folio_test_large(folio)) {
> @@ -1314,6 +1320,8 @@ static __always_inline void __folio_add_anon_rmap(struct folio *folio,
>         if (flags & RMAP_EXCLUSIVE) {
>                 switch (level) {
>                 case RMAP_LEVEL_PTE:
> +                       if (folio_is_hvo(folio))
> +                               break;
>                         for (i = 0; i < nr_pages; i++)
>                                 SetPageAnonExclusive(page + i);
>                         break;
> @@ -1421,6 +1429,9 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
>         } else if (!folio_test_pmd_mappable(folio)) {
>                 int i;
>
> +               if (hvo_map_range(folio, &folio->page, nr, &nr))
> +                       goto done;
> +
>                 for (i = 0; i < nr; i++) {
>                         struct page *page = folio_page(folio, i);
>
> @@ -1437,7 +1448,7 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
>                 SetPageAnonExclusive(&folio->page);
>                 __lruvec_stat_mod_folio(folio, NR_ANON_THPS, nr);
>         }
> -
> +done:
>         __lruvec_stat_mod_folio(folio, NR_ANON_MAPPED, nr);
>  }
>
> @@ -1510,6 +1521,8 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
>
>         switch (level) {
>         case RMAP_LEVEL_PTE:
> +               if (hvo_unmap_range(folio, page, nr_pages, &nr))
> +                       break;
>                 do {
>                         last = atomic_add_negative(-1, &page->_mapcount);
>                         if (last && folio_test_large(folio)) {
> @@ -2212,7 +2225,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
>                                 break;
>                         }
>                         VM_BUG_ON_PAGE(pte_write(pteval) && folio_test_anon(folio) &&
> -                                      !anon_exclusive, subpage);
> +                                      !folio_is_hvo(folio) && !anon_exclusive, subpage);
>
>                         /* See folio_try_share_anon_rmap_pte(): clear PTE first. */
>                         if (folio_test_hugetlb(folio)) {
> diff --git a/mm/vmstat.c b/mm/vmstat.c
> index ff2114452334..f51f3b872270 100644
> --- a/mm/vmstat.c
> +++ b/mm/vmstat.c
> @@ -1704,6 +1704,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
>                    "\n        present  %lu"
>                    "\n        managed  %lu"
>                    "\n        cma      %lu"
> +                  "\n  hvo   freed    %lu"
>                    "\n        order    %u",
>                    zone_page_state(zone, NR_FREE_PAGES),
>                    zone->watermark_boost,
> @@ -1714,6 +1715,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
>                    zone->present_pages,
>                    zone_managed_pages(zone),
>                    zone_cma_pages(zone),
> +                  atomic_long_read(&zone->hvo_freed),
>                    zone->order);
>
>         seq_printf(m,
> --
> 2.44.0.rc1.240.g4c46232300-goog
>
>
David Hildenbrand March 1, 2024, 3:42 p.m. UTC | #2
On 29.02.24 23:54, Yang Shi wrote:
> On Thu, Feb 29, 2024 at 10:34 AM Yu Zhao <yuzhao@google.com> wrote:
>>
>> HVO can be one of the perks for heavy THP users like it is for hugeTLB
>> users. For example, if such a user uses 60% of physical memory for 2MB
>> THPs, THP HVO can reduce the struct page overhead by half (60% * 7/8
>> ~= 50%).
>>
>> ZONE_NOMERGE considerably simplifies the implementation of HVO for
>> THPs, since THPs from it cannot be split or merged and thus do not
>> require any correctness-related operations on tail pages beyond the
>> second one.
>>
>> If a THP is mapped by PTEs, two optimization-related operations on its
>> tail pages, i.e., _mapcount and PG_anon_exclusive, can be binned to
>> track a group of pages, e.g., eight pages per group for 2MB THPs. The
>> resulting estimation, like the copying cost incurred during shattering,
>> is by design, since mapping THPs by PTEs is another discouraged behavior.
> 
> I'm confused by this. Can you please elaborate a little bit about
> binning mapcount and PG_anon_exclusive?
> 
> For mapcount, IIUC, for example, when inc'ing a subpage's mapcount,
> you actually inc the (i % 64) page's mapcount (assuming THP size is 2M
> and base page size is 4K, so 8 strides and 64 pages in each stride),
> right? But how can you tell whether each of the 8 pages has mapcount 1 or
> one page is mapped 8 times? Or this actually doesn't matter, we don't
> even care to distinguish the two cases?

I'm hoping we won't need such elaborate approaches that make the 
mapcounts even more complicated in the future.

Just like for hugetlb HGM (if it ever becomes real), I'm hoping that we 
can just avoid subpage mapcounts completely, at least in some kernel 
configs initially.

I was looking into having only a single PAE bit this week, but 
migration+swapout are (again) giving me a really hard time. In theory 
it's simple; the corner cases are killing me.

What I really dislike about PAE right now is not necessarily the space, 
but that they reside in multiple cachelines and that we have to use 
atomic operations to set/clear them simply because other page flags 
might be set concurrently. PAE can only be set/cleared while holding the 
page table lock already, so I really want to avoid atomics.

I have not given up on a single PAE bit per folio, but the alternative I 
was thinking about this week was simply allocating the space required 
for maintaining them and storing a pointer to that in the (anon) folio. 
Not perfect.
Yu Zhao March 3, 2024, 1:46 a.m. UTC | #3
On Thu, Feb 29, 2024 at 5:54 PM Yang Shi <shy828301@gmail.com> wrote:
>
> On Thu, Feb 29, 2024 at 10:34 AM Yu Zhao <yuzhao@google.com> wrote:
> >
> > HVO can be one of the perks for heavy THP users like it is for hugeTLB
> > users. For example, if such a user uses 60% of physical memory for 2MB
> > THPs, THP HVO can reduce the struct page overhead by half (60% * 7/8
> > ~= 50%).
> >
> > ZONE_NOMERGE considerably simplifies the implementation of HVO for
> > THPs, since THPs from it cannot be split or merged and thus do not
> > require any correctness-related operations on tail pages beyond the
> > second one.
> >
> > If a THP is mapped by PTEs, two optimization-related operations on its
> > tail pages, i.e., _mapcount and PG_anon_exclusive, can be binned to
> > track a group of pages, e.g., eight pages per group for 2MB THPs. The
> > resulting estimation, like the copying cost incurred during shattering,
> > is by design, since mapping THPs by PTEs is another discouraged behavior.
>
> I'm confused by this. Can you please elaborate a little bit about
> binning mapcount and PG_anon_exclusive?
>
> For mapcount, IIUC, for example, when inc'ing a subpage's mapcount,
> you actually inc the (i % 64) page's mapcount (assuming THP size is 2M
> and base page size is 4K, so 8 strides and 64 pages in each stride),
> right?

Correct.

> But how can you tell whether each of the 8 pages has mapcount 1 or
> one page is mapped 8 times?

We can't :)

> Or this actually doesn't matter, we don't
> even care to distinguish the two cases?

Exactly.

> For PG_anon_exclusive, if one page has it set, it means other 7 pages
> in other strides have it set too?

Correct. We leverage the fact that they (_mapcount and
PG_anon_exclusive) are optimizations: overestimating _mapcount and
underestimating PG_anon_exclusive (both erring toward the worst case)
can only affect the performance of PTE-mapped THPs (as a punishment for
splitting).
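
To make that trade-off concrete, a toy model (assuming a 2MB THP with
4KB base pages; the names below are made up for illustration, not kernel
API): each bin of eight subpages shares one counter, so eight subpages
mapped once each and one subpage mapped eight times are indistinguishable,
and the reported value can only err upward:

#include <stdio.h>

#define STRIDE 8                /* subpages per bin, assuming a 2MB THP with 4KB pages */

static int bin[64];             /* stands in for the 64 shared _mapcount counters */

static void map_subpage(int idx)        /* one PTE mapping of subpage idx */
{
        bin[idx / STRIDE]++;
}

static int reported_mapcount(int idx)   /* what the estimate would return */
{
        return bin[idx / STRIDE];
}

int main(void)
{
        int i;

        for (i = 0; i < 8; i++)         /* case A: subpages 0..7 mapped once each */
                map_subpage(i);
        for (i = 0; i < 8; i++)         /* case B: subpage 8 mapped eight times */
                map_subpage(8);

        /* both print 8: an overestimate for case A, exact for case B */
        printf("case A: subpage 0 reports %d\n", reported_mapcount(0));
        printf("case B: subpage 8 reports %d\n", reported_mapcount(8));

        return 0;
}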

Patch

diff --git a/include/linux/mm.h b/include/linux/mm.h
index f5a97dec5169..d7014fc35cca 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1196,6 +1196,138 @@  static inline void page_mapcount_reset(struct page *page)
 	atomic_set(&(page)->_mapcount, -1);
 }
 
+#define HVO_MOD (PAGE_SIZE / sizeof(struct page))
+
+static inline int hvo_order_size(int order)
+{
+	if (PAGE_SIZE % sizeof(struct page) || !is_power_of_2(HVO_MOD))
+		return 0;
+
+	return (1 << order) * sizeof(struct page);
+}
+
+static inline bool page_hvo_suitable(struct page *head, int order)
+{
+	VM_WARN_ON_ONCE_PAGE(!test_bit(PG_head, &head->flags), head);
+
+	if (!static_branch_unlikely(&hugetlb_optimize_vmemmap_key))
+		return false;
+
+	return page_zonenum(head) == ZONE_NOMERGE &&
+	       IS_ALIGNED((unsigned long)head, PAGE_SIZE) &&
+	       hvo_order_size(order) > PAGE_SIZE;
+}
+
+static inline bool folio_hvo_suitable(struct folio *folio)
+{
+	return folio_test_large(folio) && page_hvo_suitable(&folio->page, folio_order(folio));
+}
+
+static inline bool page_is_hvo(struct page *head, int order)
+{
+	return page_hvo_suitable(head, order) && test_bit(PG_head, &head[HVO_MOD].flags);
+}
+
+static inline bool folio_is_hvo(struct folio *folio)
+{
+	return folio_test_large(folio) && page_is_hvo(&folio->page, folio_order(folio));
+}
+
+/*
+ * If a 16GB hugetlb folio were mapped by PTEs of all of its 4kB pages,
+ * its nr_pages_mapped would be 0x400000: choose the ENTIRELY_MAPPED bit
+ * above that range, instead of 2*(PMD_SIZE/PAGE_SIZE).  Hugetlb currently
+ * leaves nr_pages_mapped at 0, but avoid surprise if it participates later.
+ */
+#define ENTIRELY_MAPPED		0x800000
+#define FOLIO_PAGES_MAPPED	(ENTIRELY_MAPPED - 1)
+
+static inline bool hvo_range_mapcount(struct folio *folio, struct page *page, int nr_pages, int *ret)
+{
+	int i, next, end;
+	int stride = hvo_order_size(folio_order(folio)) / PAGE_SIZE;
+
+	if (!folio_is_hvo(folio))
+		return false;
+
+	*ret = folio_entire_mapcount(folio);
+
+	for (i = folio_page_idx(folio, page), end = i + nr_pages; i != end; i = next) {
+		next = min(end, round_down(i + stride, stride));
+
+		page = folio_page(folio, i / stride);
+		*ret += atomic_read(&page->_mapcount) + 1;
+	}
+
+	return true;
+}
+
+static inline bool hvo_map_range(struct folio *folio, struct page *page, int nr_pages, int *ret)
+{
+	int i, next, end;
+	int stride = hvo_order_size(folio_order(folio)) / PAGE_SIZE;
+
+	if (!folio_is_hvo(folio))
+		return false;
+
+	*ret = 0;
+
+	for (i = folio_page_idx(folio, page), end = i + nr_pages; i != end; i = next) {
+		next = min(end, round_down(i + stride, stride));
+
+		page = folio_page(folio, i / stride);
+		if (atomic_add_return(next - i, &page->_mapcount) == next - i - 1)
+			*ret += stride;
+	}
+
+	if (atomic_add_return(*ret, &folio->_nr_pages_mapped) >= ENTIRELY_MAPPED)
+		*ret = 0;
+
+	return true;
+}
+
+static inline bool hvo_unmap_range(struct folio *folio, struct page *page, int nr_pages, int *ret)
+{
+	int i, next, end;
+	int stride = hvo_order_size(folio_order(folio)) / PAGE_SIZE;
+
+	if (!folio_is_hvo(folio))
+		return false;
+
+	*ret = 0;
+
+	for (i = folio_page_idx(folio, page), end = i + nr_pages; i != end; i = next) {
+		next = min(end, round_down(i + stride, stride));
+
+		page = folio_page(folio, i / stride);
+		if (atomic_sub_return(next - i, &page->_mapcount) == -1)
+			*ret += stride;
+	}
+
+	if (atomic_sub_return(*ret, &folio->_nr_pages_mapped) >= ENTIRELY_MAPPED)
+		*ret = 0;
+
+	return true;
+}
+
+static inline bool hvo_dup_range(struct folio *folio, struct page *page, int nr_pages)
+{
+	int i, next, end;
+	int stride = hvo_order_size(folio_order(folio)) / PAGE_SIZE;
+
+	if (!folio_is_hvo(folio))
+		return false;
+
+	for (i = folio_page_idx(folio, page), end = i + nr_pages; i != end; i = next) {
+		next = min(end, round_down(i + stride, stride));
+
+		page = folio_page(folio, i / stride);
+		atomic_add(next - i, &page->_mapcount);
+	}
+
+	return true;
+}
+
 /**
  * page_mapcount() - Number of times this precise page is mapped.
  * @page: The page.
@@ -1212,6 +1344,9 @@  static inline int page_mapcount(struct page *page)
 {
 	int mapcount = atomic_read(&page->_mapcount) + 1;
 
+	if (hvo_range_mapcount(page_folio(page), page, 1, &mapcount))
+		return mapcount;
+
 	if (unlikely(PageCompound(page)))
 		mapcount += folio_entire_mapcount(page_folio(page));
 
@@ -3094,6 +3229,11 @@  static inline void pagetable_pud_dtor(struct ptdesc *ptdesc)
 
 extern void __init pagecache_init(void);
 extern void free_initmem(void);
+extern void free_vmemmap(void);
+extern int vmemmap_remap_free(unsigned long start, unsigned long end,
+			      unsigned long reuse,
+			      struct list_head *vmemmap_pages,
+			      unsigned long flags);
 
 /*
  * Free reserved pages within range [PAGE_ALIGN(start), end & PAGE_MASK)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 532218167bba..00e4bb6c8533 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -916,6 +916,7 @@  struct zone {
 #ifdef CONFIG_CMA
 	unsigned long		cma_pages;
 #endif
+	atomic_long_t		hvo_freed;
 
 	const char		*name;
 
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index b7944a833668..d058c4cb3c96 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -322,6 +322,8 @@  static __always_inline void __folio_dup_file_rmap(struct folio *folio,
 
 	switch (level) {
 	case RMAP_LEVEL_PTE:
+		if (hvo_dup_range(folio, page, nr_pages))
+			break;
 		do {
 			atomic_inc(&page->_mapcount);
 		} while (page++, --nr_pages > 0);
@@ -401,6 +403,8 @@  static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio,
 				if (PageAnonExclusive(page + i))
 					return -EBUSY;
 		}
+		if (hvo_dup_range(folio, page, nr_pages))
+			break;
 		do {
 			if (PageAnonExclusive(page))
 				ClearPageAnonExclusive(page);
diff --git a/init/main.c b/init/main.c
index e24b0780fdff..74003495db32 100644
--- a/init/main.c
+++ b/init/main.c
@@ -1448,6 +1448,7 @@  static int __ref kernel_init(void *unused)
 	kgdb_free_init_mem();
 	exit_boot_config();
 	free_initmem();
+	free_vmemmap();
 	mark_readonly();
 
 	/*
diff --git a/mm/gup.c b/mm/gup.c
index df83182ec72d..f3df0078505b 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -57,7 +57,7 @@  static inline void sanity_check_pinned_pages(struct page **pages,
 			continue;
 		if (!folio_test_large(folio) || folio_test_hugetlb(folio))
 			VM_BUG_ON_PAGE(!PageAnonExclusive(&folio->page), page);
-		else
+		else if (!folio_is_hvo(folio) || !folio_nr_pages_mapped(folio))
 			/* Either a PTE-mapped or a PMD-mapped THP. */
 			VM_BUG_ON_PAGE(!PageAnonExclusive(&folio->page) &&
 				       !PageAnonExclusive(page), page);
@@ -645,6 +645,7 @@  static struct page *follow_page_pte(struct vm_area_struct *vma,
 	}
 
 	VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
+		       !folio_is_hvo(page_folio(page)) &&
 		       !PageAnonExclusive(page), page);
 
 	/* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 62d2254bc51c..9e7e5d587a5c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2535,6 +2535,8 @@  static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 		 *
 		 * See folio_try_share_anon_rmap_pmd(): invalidate PMD first.
 		 */
+		if (folio_is_hvo(folio))
+			ClearPageAnonExclusive(page);
 		anon_exclusive = PageAnonExclusive(page);
 		if (freeze && anon_exclusive &&
 		    folio_try_share_anon_rmap_pmd(folio, page))
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index da177e49d956..9f43d900e83c 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -310,7 +310,7 @@  static int vmemmap_remap_split(unsigned long start, unsigned long end,
  *
  * Return: %0 on success, negative error code otherwise.
  */
-static int vmemmap_remap_free(unsigned long start, unsigned long end,
+int vmemmap_remap_free(unsigned long start, unsigned long end,
 			      unsigned long reuse,
 			      struct list_head *vmemmap_pages,
 			      unsigned long flags)
diff --git a/mm/internal.h b/mm/internal.h
index ac1d27468899..871c6eeb78b8 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -52,15 +52,6 @@  struct folio_batch;
 
 void page_writeback_init(void);
 
-/*
- * If a 16GB hugetlb folio were mapped by PTEs of all of its 4kB pages,
- * its nr_pages_mapped would be 0x400000: choose the ENTIRELY_MAPPED bit
- * above that range, instead of 2*(PMD_SIZE/PAGE_SIZE).  Hugetlb currently
- * leaves nr_pages_mapped at 0, but avoid surprise if it participates later.
- */
-#define ENTIRELY_MAPPED		0x800000
-#define FOLIO_PAGES_MAPPED	(ENTIRELY_MAPPED - 1)
-
 /*
  * Flags passed to __show_mem() and show_free_areas() to suppress output in
  * various contexts.
diff --git a/mm/memory.c b/mm/memory.c
index 0bfc8b007c01..db389f1d776d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3047,8 +3047,8 @@  static inline void wp_page_reuse(struct vm_fault *vmf, struct folio *folio)
 	VM_BUG_ON(!(vmf->flags & FAULT_FLAG_WRITE));
 
 	if (folio) {
-		VM_BUG_ON(folio_test_anon(folio) &&
-			  !PageAnonExclusive(vmf->page));
+		VM_BUG_ON_PAGE(folio_test_anon(folio) && !folio_is_hvo(folio) &&
+			       !PageAnonExclusive(vmf->page), vmf->page);
 		/*
 		 * Clear the folio's cpupid information as the existing
 		 * information potentially belongs to a now completely
@@ -3502,7 +3502,7 @@  static vm_fault_t do_wp_page(struct vm_fault *vmf)
 	 */
 	if (folio && folio_test_anon(folio) &&
 	    (PageAnonExclusive(vmf->page) || wp_can_reuse_anon_folio(folio, vma))) {
-		if (!PageAnonExclusive(vmf->page))
+		if (!folio_is_hvo(folio) && !PageAnonExclusive(vmf->page))
 			SetPageAnonExclusive(vmf->page);
 		if (unlikely(unshare)) {
 			pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -4100,8 +4100,9 @@  vm_fault_t do_swap_page(struct vm_fault *vmf)
 					rmap_flags);
 	}
 
-	VM_BUG_ON(!folio_test_anon(folio) ||
-			(pte_write(pte) && !PageAnonExclusive(page)));
+	VM_BUG_ON_PAGE(!folio_test_anon(folio) ||
+		       (pte_write(pte) && !folio_is_hvo(folio) && !PageAnonExclusive(page)),
+		       page);
 	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
 	arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index dd843fb04f78..5f8c6583a191 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -53,6 +53,7 @@ 
 #include <linux/khugepaged.h>
 #include <linux/delayacct.h>
 #include <linux/cacheinfo.h>
+#include <linux/bootmem_info.h>
 #include <asm/div64.h>
 #include "internal.h"
 #include "shuffle.h"
@@ -585,6 +586,10 @@  void prep_compound_page(struct page *page, unsigned int order)
 	int nr_pages = 1 << order;
 
 	__SetPageHead(page);
+
+	if (page_is_hvo(page, order))
+		nr_pages = HVO_MOD;
+
 	for (i = 1; i < nr_pages; i++)
 		prep_compound_tail(page, i);
 
@@ -1124,10 +1129,15 @@  static __always_inline bool free_pages_prepare(struct page *page,
 	 */
 	if (unlikely(order)) {
 		int i;
+		int nr_pages = 1 << order;
 
-		if (compound)
+		if (compound) {
+			if (page_is_hvo(page, order))
+				nr_pages = HVO_MOD;
 			page[1].flags &= ~PAGE_FLAGS_SECOND;
-		for (i = 1; i < (1 << order); i++) {
+		}
+
+		for (i = 1; i < nr_pages; i++) {
 			if (compound)
 				bad += free_tail_page_prepare(page, page + i);
 			if (is_check_pages_enabled()) {
@@ -1547,6 +1557,141 @@  inline void post_alloc_hook(struct page *page, unsigned int order,
 	page_table_check_alloc(page, order);
 }
 
+static void prep_hvo_page(struct page *head, int order)
+{
+	LIST_HEAD(list);
+	struct page *page, *next;
+	int freed = 0;
+	unsigned long start = (unsigned long)head;
+	unsigned long end = start + hvo_order_size(order);
+
+	if (page_zonenum(head) != ZONE_NOMERGE)
+		return;
+
+	if (WARN_ON_ONCE(order != page_zone(head)->order)) {
+		bad_page(head, "invalid page order");
+		return;
+	}
+
+	if (!page_hvo_suitable(head, order) || page_is_hvo(head, order))
+		return;
+
+	vmemmap_remap_free(start + PAGE_SIZE, end, start, &list, 0);
+
+	list_for_each_entry_safe(page, next, &list, lru) {
+		if (PageReserved(page))
+			free_bootmem_page(page);
+		else
+			__free_page(page);
+		freed++;
+	}
+
+	atomic_long_add(freed, &page_zone(head)->hvo_freed);
+}
+
+static void prep_nomerge_zone(struct zone *zone, enum migratetype type)
+{
+	int order;
+	unsigned long flags;
+
+	spin_lock_irqsave(&zone->lock, flags);
+
+	for (order = MAX_PAGE_ORDER; order > zone->order; order--) {
+		struct page *page;
+		int split = 0;
+		struct free_area *area = zone->free_area + order;
+
+		while ((page = get_page_from_free_area(area, type))) {
+			del_page_from_free_list(page, zone, order);
+			expand(zone, page, zone->order, order, type);
+			set_buddy_order(page, zone->order);
+			add_to_free_list(page, zone, zone->order, type);
+			split++;
+		}
+
+		pr_info("  HVO: order %d split %d\n", order, split);
+	}
+
+	spin_unlock_irqrestore(&zone->lock, flags);
+}
+
+static void hvo_nomerge_zone(struct zone *zone, enum migratetype type)
+{
+	LIST_HEAD(old);
+	LIST_HEAD(new);
+	int nomem, freed;
+	unsigned long flags;
+	struct list_head list;
+	struct page *page, *next;
+	struct free_area *area = zone->free_area + zone->order;
+again:
+	nomem = freed = 0;
+	INIT_LIST_HEAD(&list);
+
+	spin_lock_irqsave(&zone->lock, flags);
+	list_splice_init(area->free_list + type, &old);
+	spin_unlock_irqrestore(&zone->lock, flags);
+
+	list_for_each_entry_safe(page, next, &old, buddy_list) {
+		unsigned long start = (unsigned long)page;
+		unsigned long end = start + hvo_order_size(zone->order);
+
+		if (WARN_ON_ONCE(!IS_ALIGNED(start, PAGE_SIZE)))
+			continue;
+
+		if (vmemmap_remap_free(start + PAGE_SIZE, end, start, &list, 0))
+			nomem++;
+	}
+
+	list_for_each_entry_safe(page, next, &list, lru) {
+		if (PageReserved(page))
+			free_bootmem_page(page);
+		else
+			__free_page(page);
+		freed++;
+	}
+
+	list_splice_init(&old, &new);
+	atomic_long_add(freed, &zone->hvo_freed);
+
+	pr_info("  HVO: nomem %d freed %d\n", nomem, freed);
+
+	if (!list_empty(area->free_list + type))
+		goto again;
+
+	spin_lock_irqsave(&zone->lock, flags);
+	list_splice(&new, area->free_list + type);
+	spin_unlock_irqrestore(&zone->lock, flags);
+}
+
+static bool zone_hvo_suitable(struct zone *zone)
+{
+	if (!static_branch_unlikely(&hugetlb_optimize_vmemmap_key))
+		return false;
+
+	return zone_idx(zone) == ZONE_NOMERGE && hvo_order_size(zone->order) > PAGE_SIZE;
+}
+
+void free_vmemmap(void)
+{
+	struct zone *zone;
+
+	static_branch_inc(&hugetlb_optimize_vmemmap_key);
+
+	for_each_populated_zone(zone) {
+		if (!zone_hvo_suitable(zone))
+			continue;
+
+		pr_info("Freeing vmemmap of node %d zone %s\n",
+			 zone_to_nid(zone), zone->name);
+
+		prep_nomerge_zone(zone, MIGRATE_MOVABLE);
+		hvo_nomerge_zone(zone, MIGRATE_MOVABLE);
+
+		cond_resched();
+	}
+}
+
 static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
 							unsigned int alloc_flags)
 {
@@ -1565,6 +1710,8 @@  static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags
 		set_page_pfmemalloc(page);
 	else
 		clear_page_pfmemalloc(page);
+
+	prep_hvo_page(page, order);
 }
 
 /*
diff --git a/mm/rmap.c b/mm/rmap.c
index 0ddb28c52961..d339bf489230 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1143,6 +1143,10 @@  int folio_total_mapcount(struct folio *folio)
 	/* In the common case, avoid the loop when no pages mapped by PTE */
 	if (folio_nr_pages_mapped(folio) == 0)
 		return mapcount;
+
+	if (hvo_range_mapcount(folio, &folio->page, folio_nr_pages(folio), &mapcount))
+		return mapcount;
+
 	/*
 	 * Add all the PTE mappings of those pages mapped by PTE.
 	 * Limit the loop to folio_nr_pages_mapped()?
@@ -1168,6 +1172,8 @@  static __always_inline unsigned int __folio_add_rmap(struct folio *folio,
 
 	switch (level) {
 	case RMAP_LEVEL_PTE:
+		if (hvo_map_range(folio, page, nr_pages, &nr))
+			break;
 		do {
 			first = atomic_inc_and_test(&page->_mapcount);
 			if (first && folio_test_large(folio)) {
@@ -1314,6 +1320,8 @@  static __always_inline void __folio_add_anon_rmap(struct folio *folio,
 	if (flags & RMAP_EXCLUSIVE) {
 		switch (level) {
 		case RMAP_LEVEL_PTE:
+			if (folio_is_hvo(folio))
+				break;
 			for (i = 0; i < nr_pages; i++)
 				SetPageAnonExclusive(page + i);
 			break;
@@ -1421,6 +1429,9 @@  void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
 	} else if (!folio_test_pmd_mappable(folio)) {
 		int i;
 
+		if (hvo_map_range(folio, &folio->page, nr, &nr))
+			goto done;
+
 		for (i = 0; i < nr; i++) {
 			struct page *page = folio_page(folio, i);
 
@@ -1437,7 +1448,7 @@  void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
 		SetPageAnonExclusive(&folio->page);
 		__lruvec_stat_mod_folio(folio, NR_ANON_THPS, nr);
 	}
-
+done:
 	__lruvec_stat_mod_folio(folio, NR_ANON_MAPPED, nr);
 }
 
@@ -1510,6 +1521,8 @@  static __always_inline void __folio_remove_rmap(struct folio *folio,
 
 	switch (level) {
 	case RMAP_LEVEL_PTE:
+		if (hvo_unmap_range(folio, page, nr_pages, &nr))
+			break;
 		do {
 			last = atomic_add_negative(-1, &page->_mapcount);
 			if (last && folio_test_large(folio)) {
@@ -2212,7 +2225,7 @@  static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
 				break;
 			}
 			VM_BUG_ON_PAGE(pte_write(pteval) && folio_test_anon(folio) &&
-				       !anon_exclusive, subpage);
+				       !folio_is_hvo(folio) && !anon_exclusive, subpage);
 
 			/* See folio_try_share_anon_rmap_pte(): clear PTE first. */
 			if (folio_test_hugetlb(folio)) {
diff --git a/mm/vmstat.c b/mm/vmstat.c
index ff2114452334..f51f3b872270 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1704,6 +1704,7 @@  static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 		   "\n        present  %lu"
 		   "\n        managed  %lu"
 		   "\n        cma      %lu"
+		   "\n  hvo   freed    %lu"
 		   "\n        order    %u",
 		   zone_page_state(zone, NR_FREE_PAGES),
 		   zone->watermark_boost,
@@ -1714,6 +1715,7 @@  static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 		   zone->present_pages,
 		   zone_managed_pages(zone),
 		   zone_cma_pages(zone),
+		   atomic_long_read(&zone->hvo_freed),
 		   zone->order);
 
 	seq_printf(m,