@@ -618,11 +618,17 @@ static inline swp_entry_t get_swap_page(struct page *page)
#ifdef CONFIG_THP_SWAP
extern int split_swap_cluster(swp_entry_t entry);
+extern int split_swap_cluster_map(swp_entry_t entry);
#else
static inline int split_swap_cluster(swp_entry_t entry)
{
return 0;
}
+
+static inline int split_swap_cluster_map(swp_entry_t entry)
+{
+ return 0; /* stub built when CONFIG_THP_SWAP is not defined: always 0 */
+}
#endif
#ifdef CONFIG_MEMCG
@@ -1602,6 +1602,47 @@ int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
return 0;
}
+#ifdef CONFIG_THP_SWAP
+/* Split the PMD swap entry at @pmd into HPAGE_PMD_NR PTE swap entries. */
+static void __split_huge_swap_pmd(struct vm_area_struct *vma,
+				  unsigned long haddr, pmd_t *pmd)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	pgtable_t pgtable;
+	pmd_t _pmd;
+	swp_entry_t entry;
+	int i, soft_dirty;
+
+	entry = pmd_to_swp_entry(*pmd);
+	/* *pmd is a swap entry, so test the swap-format soft-dirty bit */
+	soft_dirty = pmd_swp_soft_dirty(*pmd);
+	split_swap_cluster_map(entry);
+
+	/* Fill the withdrawn table through on-stack _pmd before publishing */
+	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
+	pmd_populate(mm, &_pmd, pgtable);
+	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE, entry.val++) {
+		pte_t *pte, ptent;
+
+		pte = pte_offset_map(&_pmd, haddr);
+		VM_BUG_ON(!pte_none(*pte));
+		ptent = swp_entry_to_pte(entry);
+		if (soft_dirty)
+			ptent = pte_swp_mksoft_dirty(ptent);
+		set_pte_at(mm, haddr, pte, ptent);
+		pte_unmap(pte);
+	}
+	smp_wmb(); /* make pte visible before pmd */
+	pmd_populate(mm, pmd, pgtable);
+}
+#else
+static inline void __split_huge_swap_pmd(struct vm_area_struct *vma,
+ unsigned long haddr,
+ pmd_t *pmd)
+{ /* empty stub, compiled when CONFIG_THP_SWAP is not defined */
+}
+#endif
+
/*
* Return true if we do MADV_FREE successfully on entire pmd page.
* Otherwise, return false.
@@ -2068,7 +2109,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
- VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)
+ VM_BUG_ON(!is_swap_pmd(*pmd) && !pmd_trans_huge(*pmd)
&& !pmd_devmap(*pmd));
count_vm_event(THP_SPLIT_PMD);
@@ -2090,8 +2131,11 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
put_page(page);
add_mm_counter(mm, MM_FILEPAGES, -HPAGE_PMD_NR);
return;
- } else if (is_huge_zero_pmd(*pmd)) {
+ } else if (pmd_present(*pmd) && is_huge_zero_pmd(*pmd)) {
/*
+ * is_huge_zero_pmd() may return true for a PMD swap
+ * entry, so check pmd_present() first.
+ *
* FIXME: Do we want to invalidate secondary mmu by calling
* mmu_notifier_invalidate_range() see comments below inside
* __split_huge_pmd() ?
@@ -2134,6 +2178,9 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
page = pfn_to_page(swp_offset(entry));
} else
#endif
+ if (thp_swap_supported() && is_swap_pmd(old_pmd))
+ return __split_huge_swap_pmd(vma, haddr, pmd);
+ else
page = pmd_page(old_pmd);
VM_BUG_ON_PAGE(!page_count(page), page);
page_ref_add(page, HPAGE_PMD_NR - 1);
@@ -2225,14 +2272,15 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
* pmd against. Otherwise we can end up replacing wrong page.
*/
VM_BUG_ON(freeze && !page);
- if (page && page != pmd_page(*pmd))
- goto out;
+ /* pmd_page() should be called only if pmd_present() */
+ if (page && (!pmd_present(*pmd) || page != pmd_page(*pmd)))
+ goto out;
if (pmd_trans_huge(*pmd)) {
page = pmd_page(*pmd);
if (PageMlocked(page))
clear_page_mlock(page);
- } else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd)))
+ } else if (!(pmd_devmap(*pmd) || is_swap_pmd(*pmd)))
goto out;
__split_huge_pmd_locked(vma, pmd, haddr, freeze);
out:
@@ -4046,6 +4046,34 @@ static void free_swap_count_continuations(struct swap_info_struct *si)
}
}
+#ifdef CONFIG_THP_SWAP
+/* Take one count off a huge swap cluster; page table must be stable under us */
+int split_swap_cluster_map(swp_entry_t entry)
+{
+ struct swap_info_struct *si;
+ struct swap_cluster_info *ci;
+ unsigned long offset = swp_offset(entry);
+
+ VM_BUG_ON(!is_cluster_offset(offset));
+ si = _swap_info_get(entry);
+ if (!si)
+ return -EBUSY; /* _swap_info_get() gave no swap_info_struct */
+ ci = lock_cluster(si, offset);
+ /* The swap cluster has been split by someone else */
+ if (!cluster_is_huge(ci))
+ goto out; /* nothing to undo; still returns 0 */
+ cluster_set_count(ci, cluster_count(ci) - 1);
+ VM_BUG_ON(cluster_count(ci) < SWAPFILE_CLUSTER);
+ if (cluster_count(ci) == SWAPFILE_CLUSTER &&
+ !(si->swap_map[offset] & SWAP_HAS_CACHE))
+ cluster_clear_huge(ci); /* count at SWAPFILE_CLUSTER, no SWAP_HAS_CACHE */
+
+out:
+ unlock_cluster(ci);
+ return 0;
+}
+#endif
+
static int __init swapfile_init(void)
{
int nid;