Message ID | 20230218002819.1486479-23-jthoughton@google.com (mailing list archive) |
---|---|
State | New |
Series | hugetlb: introduce HugeTLB high-granularity mapping |
On Fri, Feb 17, 2023 at 4:29 PM James Houghton <jthoughton@google.com> wrote:
>
> This allows fork() to work with high-granularity mappings. The page
> table structure is copied such that partially mapped regions will remain
> partially mapped in the same way for the new process.
>
> A page's reference count is incremented for *each* portion of it that
> is mapped in the page table. For example, if you have a PMD-mapped 1G
> page, the reference count will be incremented by 512.
>
> mapcount is handled similar to THPs: if you're completely mapping a
> hugepage, then the compound_mapcount is incremented. If you're mapping a
> part of it, the subpages that are getting mapped will have their
> mapcounts incremented.
>
> Signed-off-by: James Houghton <jthoughton@google.com>
>
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index 1a1a71868dfd..2fe1eb6897d4 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -162,6 +162,8 @@ void hugepage_put_subpool(struct hugepage_subpool *spool);
>
>  void hugetlb_remove_rmap(struct page *subpage, unsigned long shift,
>                          struct hstate *h, struct vm_area_struct *vma);
> +void hugetlb_add_file_rmap(struct page *subpage, unsigned long shift,
> +                          struct hstate *h, struct vm_area_struct *vma);
>
>  void hugetlb_dup_vma_private(struct vm_area_struct *vma);
>  void clear_vma_resv_huge_pages(struct vm_area_struct *vma);
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 693332b7e186..210c6f2b16a5 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -141,6 +141,37 @@ void hugetlb_remove_rmap(struct page *subpage, unsigned long shift,
>                 page_remove_rmap(subpage, vma, false);
>         }
>  }
> +/*
> + * hugetlb_add_file_rmap() - increment the mapcounts for file-backed hugetlb
> + * pages appropriately.
> + *
> + * For pages that are being mapped with their hstate-level PTE (e.g., a 1G page
> + * being mapped with a 1G PUD), then we increment the compound_mapcount for the
> + * head page.
> + *
> + * For pages that are being mapped with high-granularity, we increment the
> + * mapcounts for the individual subpages that are getting mapped.
> + */
> +void hugetlb_add_file_rmap(struct page *subpage, unsigned long shift,
> +                          struct hstate *h, struct vm_area_struct *vma)
> +{
> +       struct page *hpage = compound_head(subpage);
> +
> +       if (shift == huge_page_shift(h)) {
> +               VM_BUG_ON_PAGE(subpage != hpage, subpage);
> +               page_add_file_rmap(hpage, vma, true);
> +       } else {
> +               unsigned long nr_subpages = 1UL << (shift - PAGE_SHIFT);
> +               struct page *final_page = &subpage[nr_subpages];
> +
> +               VM_BUG_ON_PAGE(HPageVmemmapOptimized(hpage), hpage);
> +               /*
> +                * Increment the mapcount on each page that is getting mapped.
> +                */
> +               for (; subpage < final_page; ++subpage)
> +                       page_add_file_rmap(subpage, vma, false);
> +       }
> +}
>
>  static inline bool subpool_is_free(struct hugepage_subpool *spool)
>  {
> @@ -5210,7 +5241,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
>                             struct vm_area_struct *src_vma)
>  {
>         pte_t *src_pte, *dst_pte, entry;
> -       struct page *ptepage;
> +       struct hugetlb_pte src_hpte, dst_hpte;
> +       struct page *ptepage, *hpage;
>         unsigned long addr;
>         bool cow = is_cow_mapping(src_vma->vm_flags);
>         struct hstate *h = hstate_vma(src_vma);
> @@ -5238,18 +5270,24 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
>         }
>
>         last_addr_mask = hugetlb_mask_last_page(h);
> -       for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) {
> +       addr = src_vma->vm_start;
> +       while (addr < src_vma->vm_end) {
>                 spinlock_t *src_ptl, *dst_ptl;
> -               src_pte = hugetlb_walk(src_vma, addr, sz);
> -               if (!src_pte) {
> -                       addr |= last_addr_mask;
> +               unsigned long hpte_sz;
> +
> +               if (hugetlb_full_walk(&src_hpte, src_vma, addr)) {
> +                       addr = (addr | last_addr_mask) + sz;
>                         continue;
>                 }
> -               dst_pte = huge_pte_alloc(dst, dst_vma, addr, sz);
> -               if (!dst_pte) {
> -                       ret = -ENOMEM;
> +               ret = hugetlb_full_walk_alloc(&dst_hpte, dst_vma, addr,
> +                                             hugetlb_pte_size(&src_hpte));
> +               if (ret)
>                         break;
> -               }
> +
> +               src_pte = src_hpte.ptep;
> +               dst_pte = dst_hpte.ptep;
> +
> +               hpte_sz = hugetlb_pte_size(&src_hpte);
>
>                 /*
>                  * If the pagetables are shared don't copy or take references.
> @@ -5259,13 +5297,14 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
>                  * another vma. So page_count of ptep page is checked instead
>                  * to reliably determine whether pte is shared.
>                  */
> -               if (page_count(virt_to_page(dst_pte)) > 1) {
> -                       addr |= last_addr_mask;
> +               if (hugetlb_pte_size(&dst_hpte) == sz &&
> +                   page_count(virt_to_page(dst_pte)) > 1) {
> +                       addr = (addr | last_addr_mask) + sz;
>                         continue;
>                 }
>
> -               dst_ptl = huge_pte_lock(h, dst, dst_pte);
> -               src_ptl = huge_pte_lockptr(huge_page_shift(h), src, src_pte);
> +               dst_ptl = hugetlb_pte_lock(&dst_hpte);
> +               src_ptl = hugetlb_pte_lockptr(&src_hpte);
>                 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
>                 entry = huge_ptep_get(src_pte);
>  again:
> @@ -5309,10 +5348,15 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
>                          */
>                         if (userfaultfd_wp(dst_vma))
>                                 set_huge_pte_at(dst, addr, dst_pte, entry);
> +               } else if (!hugetlb_pte_present_leaf(&src_hpte, entry)) {
> +                       /* Retry the walk. */
> +                       spin_unlock(src_ptl);
> +                       spin_unlock(dst_ptl);
> +                       continue;
>                 } else {
> -                       entry = huge_ptep_get(src_pte);
>                         ptepage = pte_page(entry);
> -                       get_page(ptepage);
> +                       hpage = compound_head(ptepage);
> +                       get_page(hpage);
>
>                         /*
>                          * Failing to duplicate the anon rmap is a rare case
> @@ -5324,13 +5368,34 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
>                          * need to be without the pgtable locks since we could
>                          * sleep during the process.
>                          */
> -                       if (!PageAnon(ptepage)) {
> -                               page_add_file_rmap(ptepage, src_vma, true);
> -                       } else if (page_try_dup_anon_rmap(ptepage, true,
> +                       if (!PageAnon(hpage)) {
> +                               hugetlb_add_file_rmap(ptepage,
> +                                               src_hpte.shift, h, src_vma);
> +                       }
> +                       /*
> +                        * It is currently impossible to get anonymous HugeTLB
> +                        * high-granularity mappings, so we use 'hpage' here.
> +                        *
> +                        * This will need to be changed when HGM support for
> +                        * anon mappings is added.
> +                        */
> +                       else if (page_try_dup_anon_rmap(hpage, true,
>                                                         src_vma)) {
>                                 pte_t src_pte_old = entry;
>                                 struct folio *new_folio;
>
> +                               /*
> +                                * If we are mapped at high granularity, we
> +                                * may end up allocating lots and lots of
> +                                * hugepages when we only need one. Bail out
> +                                * now.
> +                                */
> +                               if (hugetlb_pte_size(&src_hpte) != sz) {
> +                                       put_page(hpage);
> +                                       ret = -EINVAL;
> +                                       break;
> +                               }
> +

Although this block never executes, it should come after the following
spin_unlocks().

>                                 spin_unlock(src_ptl);
>                                 spin_unlock(dst_ptl);
>                                 /* Do not use reserve as it's private owned */
> @@ -5342,7 +5407,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
>                                 }
>                                 copy_user_huge_page(&new_folio->page, ptepage, addr, dst_vma,
>                                                     npages);
> -                               put_page(ptepage);
> +                               put_page(hpage);
>
>                                 /* Install the new hugetlb folio if src pte stable */
>                                 dst_ptl = huge_pte_lock(h, dst, dst_pte);
> @@ -5360,6 +5425,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
>                                 hugetlb_install_folio(dst_vma, dst_pte, addr, new_folio);
>                                 spin_unlock(src_ptl);
>                                 spin_unlock(dst_ptl);
> +                               addr += hugetlb_pte_size(&src_hpte);
>                                 continue;
>                         }
>
> @@ -5376,10 +5442,13 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
>                         }
>
>                         set_huge_pte_at(dst, addr, dst_pte, entry);
> -                       hugetlb_count_add(npages, dst);
> +                       hugetlb_count_add(
> +                                       hugetlb_pte_size(&dst_hpte) / PAGE_SIZE,
> +                                       dst);
>                 }
>                 spin_unlock(src_ptl);
>                 spin_unlock(dst_ptl);
> +               addr += hugetlb_pte_size(&src_hpte);
>         }
>
>         if (cow) {
> --
> 2.39.2.637.g21b0678d19-goog
>
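To make the first paragraph of the quoted commit message concrete: fork()
copies the parent's page tables, so a hugetlb mapping is visible in the
child, and with this patch a partially (high-granularity) mapped region
stays partially mapped the same way in the child. A minimal userspace
sketch of the fork()-inheritance behavior (not from the patch; it assumes
2M hugepages are reserved on the system):

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	size_t len = 2UL << 20; /* one 2M hugepage; adjust to your hstate */
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_SHARED | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap"); /* likely no hugepages reserved */
		return 1;
	}
	memset(p, 0x5a, len); /* fault the hugepage in the parent */

	pid_t pid = fork();
	if (pid == 0) {
		/* The child's copied page tables map the same page. */
		printf("child sees 0x%x\n", (unsigned char)p[0]);
		_exit(0);
	}
	waitpid(pid, NULL, 0);
	munmap(p, len);
	return 0;
}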
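The refcount and mapcount accounting the commit message describes reduces
to arithmetic over page-table shifts. A standalone sketch (the helper name
is hypothetical, not from the patch):

#include <stdio.h>

#define PAGE_SHIFT 12UL /* 4K base pages */

/*
 * Number of PTEs -- and therefore refcount/mapcount increments -- needed
 * to map a huge page of size (1UL << hpage_shift) at granularity
 * (1UL << map_shift).
 */
static unsigned long ptes_for_mapping(unsigned long hpage_shift,
				      unsigned long map_shift)
{
	return 1UL << (hpage_shift - map_shift);
}

int main(void)
{
	/* A 1G page (shift 30) mapped with 2M PMDs (shift 21): 512
	 * refcount increments, matching the commit message's example. */
	printf("%lu\n", ptes_for_mapping(30, 21)); /* 512 */

	/* Mapped with its hstate-level 1G PUD: one reference, and one
	 * compound_mapcount bump. */
	printf("%lu\n", ptes_for_mapping(30, 30)); /* 1 */

	/* 2M of the page mapped with 4K PTEs: 512 subpage mapcounts. */
	printf("%lu\n", ptes_for_mapping(21, PAGE_SHIFT)); /* 512 */
	return 0;
}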
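The review comment above is about lock ordering: the early bail-out block
ends in a break, which would exit the copy loop with src_ptl and dst_ptl
still held. The placement the comment suggests would look roughly like
this (a sketch against the quoted hunk, not a committed fix):

/* Sketch only: release both page-table locks before the early bail-out,
 * per the review comment, so the break cannot leave the loop with
 * src_ptl/dst_ptl held. */
spin_unlock(src_ptl);
spin_unlock(dst_ptl);

/*
 * If we are mapped at high granularity, we may end up allocating
 * lots and lots of hugepages when we only need one. Bail out now.
 */
if (hugetlb_pte_size(&src_hpte) != sz) {
	put_page(hpage);
	ret = -EINVAL;
	break;
}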