@@ -2114,21 +2114,69 @@ static inline size_t folio_size(struct folio *folio)
}
/**
- * folio_estimated_sharers - Estimate the number of sharers of a folio.
+ * folio_mapped_shared - Report if a folio is certainly mapped by
+ * multiple entities in their page tables
* @folio: The folio.
*
- * folio_estimated_sharers() aims to serve as a function to efficiently
- * estimate the number of processes sharing a folio. This is done by
- * looking at the precise mapcount of the first subpage in the folio, and
- * assuming the other subpages are the same. This may not be true for large
- * folios. If you want exact mapcounts for exact calculations, look at
- * page_mapcount() or folio_mapcount().
+ * This function checks if a folio is certainly *currently* mapped by
+ * multiple entities in their page table ("mapped shared") or if the folio
+ * may be mapped exclusively by a single entity ("mapped exclusively").
*
- * Return: The estimated number of processes sharing a folio.
+ * Usually, we consider a single entity to be a single MM. However, some
+ * folios (KSM, pagecache) can be mapped multiple times into the same MM.
+ *
+ * For KSM folios, each individual page table mapping is considered a
+ * separate entity. So if a KSM folio is mapped multiple times into the
+ * same process, it is considered "mapped shared".
+ *
+ * For pagecache folios that are entirely mapped multiple times into the
+ * same MM (i.e., multiple VMAs in the same MM cover the same
+ * file range), we traditionally (and for simplicity) consider them
+ * "mapped shared". For partially-mapped folios (e.g., PTE-mapped THP), we
+ * might detect them either as "mapped shared" or "mapped exclusively" --
+ * whatever is simpler.
+ *
+ * For small folios and entirely mapped large folios (e.g., hugetlb,
+ * PMD-mapped PMD-sized THP), the result will be exactly correct.
+ *
+ * For all other (partially-mappable) folios, such as PTE-mapped THP, the
+ * return value is partially fuzzy: true is not fuzzy, because it means
+ * "certainly mapped shared", but false means "maybe mapped exclusively".
+ *
+ * Note that this function only considers *current* page table mappings
+ * tracked via rmap -- that properly adjusts the folio mapcount(s) -- and
+ * does not consider:
+ * (1) any way the folio might get mapped in the (near) future (e.g.,
+ * swapcache, pagecache, temporary unmapping for migration).
+ * (2) any way a folio might be mapped besides using the rmap (PFN mappings).
+ * (3) any form of page table sharing.
+ *
+ * Return: Whether the folio is certainly mapped by multiple entities.
*/
-static inline int folio_estimated_sharers(struct folio *folio)
+static inline bool folio_mapped_shared(struct folio *folio)
{
- return page_mapcount(folio_page(folio, 0));
+ unsigned int total_mapcount;
+
+ /* Small folio: _mapcount is -1 based; > 0 means mapped more than once. */
+ if (likely(!folio_test_large(folio)))
+ return atomic_read(&folio->page._mapcount) > 0;
+
+ total_mapcount = folio_total_mapcount(folio);
+ /* No mapping or a single mapping is never "mapped shared". */
+ if (total_mapcount <= 1)
+ return false;
+
+ /* Entirely mapped (or hugetlb) with more than one mapping: shared. */
+ if (folio_entire_mapcount(folio) || unlikely(folio_test_hugetlb(folio)))
+ return true;
+ /*
+ * Partially-mappable folios are tricky, but with more (PTE) mappings
+ * than pages in the folio, some other entity is certainly involved.
+ */
+ if (total_mapcount > folio_nr_pages(folio))
+ return true;
+ /* ... guess based on the mapcount of the first page of the folio. */
+ return atomic_read(&folio->page._mapcount) > 0;
}
#ifndef HAVE_ARCH_MAKE_PAGE_ACCESSIBLE
@@ -1638,7 +1638,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
* If other processes are mapping this folio, we couldn't discard
* the folio unless they all do MADV_FREE so let's skip the folio.
*/
- if (folio_estimated_sharers(folio) != 1)
+ if (folio_mapped_shared(folio))
goto out;
if (!folio_trylock(folio))
@@ -365,7 +365,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
folio = pfn_folio(pmd_pfn(orig_pmd));
/* Do not interfere with other mappings of this folio */
- if (folio_estimated_sharers(folio) != 1)
+ if (folio_mapped_shared(folio))
goto huge_unlock;
if (pageout_anon_only_filter && !folio_test_anon(folio))
@@ -441,7 +441,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
if (folio_test_large(folio)) {
int err;
- if (folio_estimated_sharers(folio) != 1)
+ if (folio_mapped_shared(folio))
break;
if (pageout_anon_only_filter && !folio_test_anon(folio))
break;
@@ -665,7 +665,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
if (folio_test_large(folio)) {
int err;
- if (folio_estimated_sharers(folio) != 1)
+ if (folio_mapped_shared(folio))
break;
if (!folio_trylock(folio))
break;
@@ -4848,7 +4848,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
* Flag if the folio is shared between multiple address spaces. This
* is later used when determining whether to group tasks together
*/
- if (folio_estimated_sharers(folio) > 1 && (vma->vm_flags & VM_SHARED))
+ if (folio_mapped_shared(folio) && (vma->vm_flags & VM_SHARED))
flags |= TNF_SHARED;
nid = folio_nid(folio);
@@ -605,12 +605,11 @@ static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask,
* Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
* Choosing not to migrate a shared folio is not counted as a failure.
*
- * To check if the folio is shared, ideally we want to make sure
- * every page is mapped to the same process. Doing that is very
- * expensive, so check the estimated sharers of the folio instead.
+ * See folio_mapped_shared() on possible imprecision when we cannot
+ * easily detect if a folio is shared.
*/
if ((flags & MPOL_MF_MOVE_ALL) ||
- (folio_estimated_sharers(folio) == 1 && !hugetlb_pmd_shared(pte)))
+ (!folio_mapped_shared(folio) && !hugetlb_pmd_shared(pte)))
if (!isolate_hugetlb(folio, qp->pagelist))
qp->nr_failed++;
unlock:
@@ -988,11 +987,10 @@ static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
* Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
* Choosing not to migrate a shared folio is not counted as a failure.
*
- * To check if the folio is shared, ideally we want to make sure
- * every page is mapped to the same process. Doing that is very
- * expensive, so check the estimated sharers of the folio instead.
+ * See folio_mapped_shared() on possible imprecision when we cannot
+ * easily detect if a folio is shared.
*/
- if ((flags & MPOL_MF_MOVE_ALL) || folio_estimated_sharers(folio) == 1) {
+ if ((flags & MPOL_MF_MOVE_ALL) || !folio_mapped_shared(folio)) {
if (folio_isolate_lru(folio)) {
list_add_tail(&folio->lru, foliolist);
node_stat_mod_folio(folio,
@@ -2559,7 +2559,7 @@ int migrate_misplaced_folio(struct folio *folio, struct vm_area_struct *vma,
* every page is mapped to the same process. Doing that is very
* expensive, so check the estimated mapcount of the folio instead.
*/
- if (folio_estimated_sharers(folio) != 1 && folio_is_file_lru(folio) &&
+ if (folio_mapped_shared(folio) && folio_is_file_lru(folio) &&
(vma->vm_flags & VM_EXEC))
goto out;
Callers of folio_estimated_sharers() only care about "mapped shared vs. mapped exclusively". Let's rename the function and improve our detection for partially-mappable folios (i.e., PTE-mapped THPs). For now we can only implement, based on our guess, "certainly mapped shared vs. maybe mapped exclusively". Ideally, we'd have something like "maybe mapped shared vs. certainly mapped exclusively" -- or even better "certainly mapped shared vs. certainly mapped exclusively" instead. But these semantics are currently impossible using our guess-based heuristic we apply for partially-mappable folios. Naming the function "folio_certainly_mapped_shared" could be possible, but let's just keep it simple and call it "folio_mapped_shared" and document the fuzziness that applies for now. As we can now read the total mapcount of large folios very efficiently, use that to improve our implementation, falling back to making a guess only in case the folio is not "obviously mapped shared". Signed-off-by: David Hildenbrand <david@redhat.com> --- include/linux/mm.h | 68 +++++++++++++++++++++++++++++++++++++++------- mm/huge_memory.c | 2 +- mm/madvise.c | 6 ++-- mm/memory.c | 2 +- mm/mempolicy.c | 14 ++++------ mm/migrate.c | 2 +- 6 files changed, 70 insertions(+), 24 deletions(-)