Message ID | 7f739c9e9f0a25cafb76a482e31e632c8f72102e.1734407924.git-series.apopple@nvidia.com |
---|---|
State | New |
Headers | show |
Series | fs/dax: Fix ZONE_DEVICE page reference counts | expand |
On 17.12.24 06:12, Alistair Popple wrote: > The rmap doesn't currently support adding a PUD mapping of a > folio. This patch adds support for entire PUD mappings of folios, > primarily to allow for more standard refcounting of device DAX > folios. Currently DAX is the only user of this and it doesn't require > support for partially mapped PUD-sized folios so we don't support for > that for now. > > Signed-off-by: Alistair Popple <apopple@nvidia.com> > > --- > > David - Thanks for your previous comments, I'm less familiar with the > rmap code so I would appreciate you taking another look. In particular > I haven't added a stat for PUD mapped folios as it seemed like > overkill for just the device DAX case but let me know if you think > otherwise. > > Changes for v4: > > - New for v4, split out rmap changes as suggested by David. > --- > include/linux/rmap.h | 15 ++++++++++++- > mm/rmap.c | 56 +++++++++++++++++++++++++++++++++++++++++++++- > 2 files changed, 71 insertions(+) > > diff --git a/include/linux/rmap.h b/include/linux/rmap.h > index 683a040..7043914 100644 > --- a/include/linux/rmap.h > +++ b/include/linux/rmap.h > @@ -192,6 +192,7 @@ typedef int __bitwise rmap_t; > enum rmap_level { > RMAP_LEVEL_PTE = 0, > RMAP_LEVEL_PMD, > + RMAP_LEVEL_PUD, > }; > > static inline void __folio_rmap_sanity_checks(const struct folio *folio, > @@ -228,6 +229,14 @@ static inline void __folio_rmap_sanity_checks(const struct folio *folio, > VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PMD_NR, folio); > VM_WARN_ON_FOLIO(nr_pages != HPAGE_PMD_NR, folio); > break; > + case RMAP_LEVEL_PUD: > + /* > + * Assume that we are creating * a single "entire" mapping of the > + * folio. > + */ > + VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PUD_NR, folio); > + VM_WARN_ON_FOLIO(nr_pages != HPAGE_PUD_NR, folio); > + break; > default: > VM_WARN_ON_ONCE(true); > } > @@ -251,12 +260,16 @@ void folio_add_file_rmap_ptes(struct folio *, struct page *, int nr_pages, > folio_add_file_rmap_ptes(folio, page, 1, vma) > void folio_add_file_rmap_pmd(struct folio *, struct page *, > struct vm_area_struct *); > +void folio_add_file_rmap_pud(struct folio *, struct page *, > + struct vm_area_struct *); > void folio_remove_rmap_ptes(struct folio *, struct page *, int nr_pages, > struct vm_area_struct *); > #define folio_remove_rmap_pte(folio, page, vma) \ > folio_remove_rmap_ptes(folio, page, 1, vma) > void folio_remove_rmap_pmd(struct folio *, struct page *, > struct vm_area_struct *); > +void folio_remove_rmap_pud(struct folio *, struct page *, > + struct vm_area_struct *); > > void hugetlb_add_anon_rmap(struct folio *, struct vm_area_struct *, > unsigned long address, rmap_t flags); > @@ -341,6 +354,7 @@ static __always_inline void __folio_dup_file_rmap(struct folio *folio, > atomic_add(orig_nr_pages, &folio->_large_mapcount); > break; > case RMAP_LEVEL_PMD: > + case RMAP_LEVEL_PUD: > atomic_inc(&folio->_entire_mapcount); > atomic_inc(&folio->_large_mapcount); > break; > @@ -437,6 +451,7 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio, > atomic_add(orig_nr_pages, &folio->_large_mapcount); > break; > case RMAP_LEVEL_PMD: > + case RMAP_LEVEL_PUD: > if (PageAnonExclusive(page)) { > if (unlikely(maybe_pinned)) > return -EBUSY; > diff --git a/mm/rmap.c b/mm/rmap.c > index c6c4d4e..39d0439 100644 > --- a/mm/rmap.c > +++ b/mm/rmap.c > @@ -1203,6 +1203,11 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio, > } > atomic_inc(&folio->_large_mapcount); > break; > + case RMAP_LEVEL_PUD: > + /* We only support entire mappings of PUD sized folios in rmap */ > + atomic_inc(&folio->_entire_mapcount); > + atomic_inc(&folio->_large_mapcount); > + break; This way you don't account the pages at all as mapped, whereby PTE-mapping it would? And IIRC, these PUD-sized pages can be either mapped using PTEs or using a single PUD. I suspect what you want is to diff --git a/mm/rmap.c b/mm/rmap.c index c6c4d4ea29a7e..1477028d3a176 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1187,12 +1187,19 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio, atomic_add(orig_nr_pages, &folio->_large_mapcount); break; case RMAP_LEVEL_PMD: + case RMAP_LEVEL_PUD: first = atomic_inc_and_test(&folio->_entire_mapcount); if (first) { nr = atomic_add_return_relaxed(ENTIRELY_MAPPED, mapped); if (likely(nr < ENTIRELY_MAPPED + ENTIRELY_MAPPED)) { - *nr_pmdmapped = folio_nr_pages(folio); - nr = *nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED); + nr_pages = folio_nr_pages(folio); + /* + * We only track PMD mappings of PMD-sized + * folios separately. + */ + if (level == RMAP_LEVEL_PMD) + *nr_pmdmapped = nr_pages; + nr = nr_pages - (nr & FOLIO_PAGES_MAPPED); /* Raced ahead of a remove and another add? */ if (unlikely(nr < 0)) nr = 0; Similar on the removal path.
On Tue, Dec 17, 2024 at 11:27:13PM +0100, David Hildenbrand wrote: > On 17.12.24 06:12, Alistair Popple wrote: > > The rmap doesn't currently support adding a PUD mapping of a > > folio. This patch adds support for entire PUD mappings of folios, > > primarily to allow for more standard refcounting of device DAX > > folios. Currently DAX is the only user of this and it doesn't require > > support for partially mapped PUD-sized folios so we don't support for > > that for now. > > > > Signed-off-by: Alistair Popple <apopple@nvidia.com> > > > > --- > > > > David - Thanks for your previous comments, I'm less familiar with the > > rmap code so I would appreciate you taking another look. In particular > > I haven't added a stat for PUD mapped folios as it seemed like > > overkill for just the device DAX case but let me know if you think > > otherwise. > > > > Changes for v4: > > > > - New for v4, split out rmap changes as suggested by David. > > --- > > include/linux/rmap.h | 15 ++++++++++++- > > mm/rmap.c | 56 +++++++++++++++++++++++++++++++++++++++++++++- > > 2 files changed, 71 insertions(+) > > > > diff --git a/include/linux/rmap.h b/include/linux/rmap.h > > index 683a040..7043914 100644 > > --- a/include/linux/rmap.h > > +++ b/include/linux/rmap.h > > @@ -192,6 +192,7 @@ typedef int __bitwise rmap_t; > > enum rmap_level { > > RMAP_LEVEL_PTE = 0, > > RMAP_LEVEL_PMD, > > + RMAP_LEVEL_PUD, > > }; > > static inline void __folio_rmap_sanity_checks(const struct folio *folio, > > @@ -228,6 +229,14 @@ static inline void __folio_rmap_sanity_checks(const struct folio *folio, > > VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PMD_NR, folio); > > VM_WARN_ON_FOLIO(nr_pages != HPAGE_PMD_NR, folio); > > break; > > + case RMAP_LEVEL_PUD: > > + /* > > + * Assume that we are creating * a single "entire" mapping of the > > + * folio. > > + */ > > + VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PUD_NR, folio); > > + VM_WARN_ON_FOLIO(nr_pages != HPAGE_PUD_NR, folio); > > + break; > > default: > > VM_WARN_ON_ONCE(true); > > } > > @@ -251,12 +260,16 @@ void folio_add_file_rmap_ptes(struct folio *, struct page *, int nr_pages, > > folio_add_file_rmap_ptes(folio, page, 1, vma) > > void folio_add_file_rmap_pmd(struct folio *, struct page *, > > struct vm_area_struct *); > > +void folio_add_file_rmap_pud(struct folio *, struct page *, > > + struct vm_area_struct *); > > void folio_remove_rmap_ptes(struct folio *, struct page *, int nr_pages, > > struct vm_area_struct *); > > #define folio_remove_rmap_pte(folio, page, vma) \ > > folio_remove_rmap_ptes(folio, page, 1, vma) > > void folio_remove_rmap_pmd(struct folio *, struct page *, > > struct vm_area_struct *); > > +void folio_remove_rmap_pud(struct folio *, struct page *, > > + struct vm_area_struct *); > > void hugetlb_add_anon_rmap(struct folio *, struct vm_area_struct *, > > unsigned long address, rmap_t flags); > > @@ -341,6 +354,7 @@ static __always_inline void __folio_dup_file_rmap(struct folio *folio, > > atomic_add(orig_nr_pages, &folio->_large_mapcount); > > break; > > case RMAP_LEVEL_PMD: > > + case RMAP_LEVEL_PUD: > > atomic_inc(&folio->_entire_mapcount); > > atomic_inc(&folio->_large_mapcount); > > break; > > @@ -437,6 +451,7 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio, > > atomic_add(orig_nr_pages, &folio->_large_mapcount); > > break; > > case RMAP_LEVEL_PMD: > > + case RMAP_LEVEL_PUD: > > if (PageAnonExclusive(page)) { > > if (unlikely(maybe_pinned)) > > return -EBUSY; > > diff --git a/mm/rmap.c b/mm/rmap.c > > index c6c4d4e..39d0439 100644 > > --- a/mm/rmap.c > > +++ b/mm/rmap.c > > @@ -1203,6 +1203,11 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio, > > } > > atomic_inc(&folio->_large_mapcount); > > break; > > + case RMAP_LEVEL_PUD: > > + /* We only support entire mappings of PUD sized folios in rmap */ > > + atomic_inc(&folio->_entire_mapcount); > > + atomic_inc(&folio->_large_mapcount); > > + break; > > > This way you don't account the pages at all as mapped, whereby PTE-mapping it > would? And IIRC, these PUD-sized pages can be either mapped using PTEs or > using a single PUD. Oh good point. I was thinking that because we don't account PUD mappings today that it would be fine to ignore them. But of course this series means we start accounting them if mapped with PTEs so agree we should be consistent. > I suspect what you want is to Yes, I think so. Thanks for the hint. I will be out over the Christmas break but will do a respin to incorporate this before then. > diff --git a/mm/rmap.c b/mm/rmap.c > index c6c4d4ea29a7e..1477028d3a176 100644 > --- a/mm/rmap.c > +++ b/mm/rmap.c > @@ -1187,12 +1187,19 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio, > atomic_add(orig_nr_pages, &folio->_large_mapcount); > break; > case RMAP_LEVEL_PMD: > + case RMAP_LEVEL_PUD: > first = atomic_inc_and_test(&folio->_entire_mapcount); > if (first) { > nr = atomic_add_return_relaxed(ENTIRELY_MAPPED, mapped); > if (likely(nr < ENTIRELY_MAPPED + ENTIRELY_MAPPED)) { > - *nr_pmdmapped = folio_nr_pages(folio); > - nr = *nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED); > + nr_pages = folio_nr_pages(folio); > + /* > + * We only track PMD mappings of PMD-sized > + * folios separately. > + */ > + if (level == RMAP_LEVEL_PMD) > + *nr_pmdmapped = nr_pages; > + nr = nr_pages - (nr & FOLIO_PAGES_MAPPED); > /* Raced ahead of a remove and another add? */ > if (unlikely(nr < 0)) > nr = 0; > > Similar on the removal path. > > > -- > Cheers, > > David / dhildenb >
>>> return -EBUSY; >>> diff --git a/mm/rmap.c b/mm/rmap.c >>> index c6c4d4e..39d0439 100644 >>> --- a/mm/rmap.c >>> +++ b/mm/rmap.c >>> @@ -1203,6 +1203,11 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio, >>> } >>> atomic_inc(&folio->_large_mapcount); >>> break; >>> + case RMAP_LEVEL_PUD: >>> + /* We only support entire mappings of PUD sized folios in rmap */ >>> + atomic_inc(&folio->_entire_mapcount); >>> + atomic_inc(&folio->_large_mapcount); >>> + break; >> >> >> This way you don't account the pages at all as mapped, whereby PTE-mapping it >> would? And IIRC, these PUD-sized pages can be either mapped using PTEs or >> using a single PUD. > > Oh good point. I was thinking that because we don't account PUD mappings today > that it would be fine to ignore them. But of course this series means we start > accounting them if mapped with PTEs so agree we should be consistent. > >> I suspect what you want is to > > Yes, I think so. Thanks for the hint. I will be out over the Christmas break but > will do a respin to incorporate this before then. I'll be on PTO starting ... well now. But I'll try to give the other parts a quick peek if anything urgent jumps at me. (bad habit of reading mails while on PTO ...) In any case, happy holidays to you!
diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 683a040..7043914 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -192,6 +192,7 @@ typedef int __bitwise rmap_t; enum rmap_level { RMAP_LEVEL_PTE = 0, RMAP_LEVEL_PMD, + RMAP_LEVEL_PUD, }; static inline void __folio_rmap_sanity_checks(const struct folio *folio, @@ -228,6 +229,14 @@ static inline void __folio_rmap_sanity_checks(const struct folio *folio, VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PMD_NR, folio); VM_WARN_ON_FOLIO(nr_pages != HPAGE_PMD_NR, folio); break; + case RMAP_LEVEL_PUD: + /* + * Assume that we are creating * a single "entire" mapping of the + * folio. + */ + VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PUD_NR, folio); + VM_WARN_ON_FOLIO(nr_pages != HPAGE_PUD_NR, folio); + break; default: VM_WARN_ON_ONCE(true); } @@ -251,12 +260,16 @@ void folio_add_file_rmap_ptes(struct folio *, struct page *, int nr_pages, folio_add_file_rmap_ptes(folio, page, 1, vma) void folio_add_file_rmap_pmd(struct folio *, struct page *, struct vm_area_struct *); +void folio_add_file_rmap_pud(struct folio *, struct page *, + struct vm_area_struct *); void folio_remove_rmap_ptes(struct folio *, struct page *, int nr_pages, struct vm_area_struct *); #define folio_remove_rmap_pte(folio, page, vma) \ folio_remove_rmap_ptes(folio, page, 1, vma) void folio_remove_rmap_pmd(struct folio *, struct page *, struct vm_area_struct *); +void folio_remove_rmap_pud(struct folio *, struct page *, + struct vm_area_struct *); void hugetlb_add_anon_rmap(struct folio *, struct vm_area_struct *, unsigned long address, rmap_t flags); @@ -341,6 +354,7 @@ static __always_inline void __folio_dup_file_rmap(struct folio *folio, atomic_add(orig_nr_pages, &folio->_large_mapcount); break; case RMAP_LEVEL_PMD: + case RMAP_LEVEL_PUD: atomic_inc(&folio->_entire_mapcount); atomic_inc(&folio->_large_mapcount); break; @@ -437,6 +451,7 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio, atomic_add(orig_nr_pages, &folio->_large_mapcount); break; case RMAP_LEVEL_PMD: + case RMAP_LEVEL_PUD: if (PageAnonExclusive(page)) { if (unlikely(maybe_pinned)) return -EBUSY; diff --git a/mm/rmap.c b/mm/rmap.c index c6c4d4e..39d0439 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1203,6 +1203,11 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio, } atomic_inc(&folio->_large_mapcount); break; + case RMAP_LEVEL_PUD: + /* We only support entire mappings of PUD sized folios in rmap */ + atomic_inc(&folio->_entire_mapcount); + atomic_inc(&folio->_large_mapcount); + break; } return nr; } @@ -1338,6 +1343,13 @@ static __always_inline void __folio_add_anon_rmap(struct folio *folio, case RMAP_LEVEL_PMD: SetPageAnonExclusive(page); break; + case RMAP_LEVEL_PUD: + /* + * Keep the compiler happy, we don't support anonymous + * PUD mappings. + */ + WARN_ON_ONCE(1); + break; } } for (i = 0; i < nr_pages; i++) { @@ -1531,6 +1543,26 @@ void folio_add_file_rmap_pmd(struct folio *folio, struct page *page, #endif } +/** + * folio_add_file_rmap_pud - add a PUD mapping to a page range of a folio + * @folio: The folio to add the mapping to + * @page: The first page to add + * @vma: The vm area in which the mapping is added + * + * The page range of the folio is defined by [page, page + HPAGE_PUD_NR) + * + * The caller needs to hold the page table lock. + */ +void folio_add_file_rmap_pud(struct folio *folio, struct page *page, + struct vm_area_struct *vma) +{ +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD + __folio_add_file_rmap(folio, page, HPAGE_PUD_NR, vma, RMAP_LEVEL_PUD); +#else + WARN_ON_ONCE(true); +#endif +} + static __always_inline void __folio_remove_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma, enum rmap_level level) @@ -1578,6 +1610,10 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, partially_mapped = nr && nr < nr_pmdmapped; break; + case RMAP_LEVEL_PUD: + atomic_dec(&folio->_large_mapcount); + atomic_dec(&folio->_entire_mapcount); + break; } /* @@ -1640,6 +1676,26 @@ void folio_remove_rmap_pmd(struct folio *folio, struct page *page, #endif } +/** + * folio_remove_rmap_pud - remove a PUD mapping from a page range of a folio + * @folio: The folio to remove the mapping from + * @page: The first page to remove + * @vma: The vm area from which the mapping is removed + * + * The page range of the folio is defined by [page, page + HPAGE_PUD_NR) + * + * The caller needs to hold the page table lock. + */ +void folio_remove_rmap_pud(struct folio *folio, struct page *page, + struct vm_area_struct *vma) +{ +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD + __folio_remove_rmap(folio, page, HPAGE_PUD_NR, vma, RMAP_LEVEL_PUD); +#else + WARN_ON_ONCE(true); +#endif +} + /* * @arg: enum ttu_flags will be passed to this argument */
The rmap doesn't currently support adding a PUD mapping of a folio. This patch adds support for entire PUD mappings of folios, primarily to allow for more standard refcounting of device DAX folios. Currently DAX is the only user of this and it doesn't require support for partially mapped PUD-sized folios so we don't support for that for now. Signed-off-by: Alistair Popple <apopple@nvidia.com> --- David - Thanks for your previous comments, I'm less familiar with the rmap code so I would appreciate you taking another look. In particular I haven't added a stat for PUD mapped folios as it seemed like overkill for just the device DAX case but let me know if you think otherwise. Changes for v4: - New for v4, split out rmap changes as suggested by David. --- include/linux/rmap.h | 15 ++++++++++++- mm/rmap.c | 56 +++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 71 insertions(+)