--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1163,6 +1163,7 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm,
}
void split_page_memcg(struct page *head, unsigned int nr);
+void folio_copy_memcg(struct folio *folio);
unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
gfp_t gfp_mask,
@@ -1624,6 +1625,10 @@ static inline void split_page_memcg(struct page *head, unsigned int nr)
{
}
+static inline void folio_copy_memcg(struct folio *folio)
+{
+}
+
static inline
unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
gfp_t gfp_mask,
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -218,6 +218,25 @@ static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *foli
VM_WARN_ON_ONCE(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen));
}
+static inline bool lru_gen_add_dst(struct lruvec *lruvec, struct folio *dst)
+{
+ int gen = folio_lru_gen(dst);
+ int type = folio_is_file_lru(dst);
+ int zone = folio_zonenum(dst);
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
+
+ if (gen < 0)
+ return false;
+
+ lockdep_assert_held(&lruvec->lru_lock);
+ VM_WARN_ON_ONCE_FOLIO(folio_lruvec(dst) != lruvec, dst);
+
+ list_add_tail(&dst->lru, &lrugen->folios[gen][type][zone]);
+ lru_gen_update_size(lruvec, dst, -1, gen);
+
+ return true;
+}
+
static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
{
unsigned long seq;
@@ -303,6 +322,11 @@ static inline bool lru_gen_in_fault(void)
return false;
}
+static inline bool lru_gen_add_dst(struct lruvec *lruvec, struct folio *dst)
+{
+ return false;
+}
+
static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
{
return false;
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -323,14 +323,19 @@ struct folio {
struct {
unsigned long _flags_1;
unsigned long _head_1;
- unsigned long _folio_avail;
/* public: */
atomic_t _entire_mapcount;
atomic_t _nr_pages_mapped;
atomic_t _pincount;
#ifdef CONFIG_64BIT
+ unsigned int __padding;
unsigned int _folio_nr_pages;
#endif
+ union {
+ unsigned long _private_1;
+ unsigned long *_dst_ul;
+ struct page **_dst_pp;
+ };
/* private: the union with struct page is transitional */
};
struct page __page_1;
@@ -382,6 +387,7 @@ FOLIO_MATCH(_last_cpupid, _last_cpupid);
offsetof(struct page, pg) + sizeof(struct page))
FOLIO_MATCH(flags, _flags_1);
FOLIO_MATCH(compound_head, _head_1);
+FOLIO_MATCH(private, _private_1);
#undef FOLIO_MATCH
#define FOLIO_MATCH(pg, fl) \
static_assert(offsetof(struct folio, fl) == \
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -105,6 +105,9 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
THP_SPLIT_PAGE_FAILED,
THP_DEFERRED_SPLIT_PAGE,
THP_SPLIT_PMD,
+ THP_SHATTER_PAGE,
+ THP_SHATTER_PAGE_FAILED,
+ THP_SHATTER_PAGE_DISCARDED,
THP_SCAN_EXCEED_NONE_PTE,
THP_SCAN_EXCEED_SWAP_PTE,
THP_SCAN_EXCEED_SHARED_PTE,
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2586,6 +2586,10 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
entry = pte_swp_mksoft_dirty(entry);
if (uffd_wp)
entry = pte_swp_mkuffd_wp(entry);
+ if (vma->vm_flags & VM_LOCKED)
+ set_src_usage(page + i, SRC_PAGE_MLOCKED);
+ else
+ set_src_usage(page + i, SRC_PAGE_MAPPED);
} else {
entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot));
if (write)
@@ -2732,6 +2736,178 @@ static void remap_page(struct folio *folio, unsigned long nr)
}
}
+static int prep_to_unmap(struct folio *src)
+{
+ int nr_pages = folio_nr_pages(src);
+
+ if (folio_can_split(src))
+ return 0;
+
+ WARN_ON_ONCE(src->_dst_pp);
+
+ src->_dst_pp = kcalloc(nr_pages, sizeof(struct page *), GFP_ATOMIC);
+
+ return src->_dst_pp ? 0 : -ENOMEM;
+}
+
+static bool try_to_discard(struct folio *src, int i)
+{
+ int usage;
+ void *addr;
+ struct page *page = folio_page(src, i);
+
+ if (!folio_test_anon(src))
+ return false;
+
+ if (folio_test_swapcache(src))
+ return false;
+
+ usage = src_page_usage(page);
+ if (usage & SRC_PAGE_MLOCKED)
+ return false;
+
+ if (!(usage & SRC_PAGE_MAPPED))
+ return true;
+
+ addr = kmap_local_page(page);
+ if (!memchr_inv(addr, 0, PAGE_SIZE))
+ set_src_usage(page, SRC_PAGE_CLEAN);
+ kunmap_local(addr);
+
+ return can_discard_src(page);
+}
+
+static int prep_dst_pages(struct folio *src)
+{
+ int i;
+ int nr_pages = folio_nr_pages(src);
+
+ if (folio_can_split(src))
+ return 0;
+
+ if (WARN_ON_ONCE(!src->_dst_pp))
+ return -ENOMEM;
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *dst = NULL;
+
+ if (try_to_discard(src, i)) {
+ count_vm_event(THP_SHATTER_PAGE_DISCARDED);
+ continue;
+ }
+
+ do {
+ int nid = folio_nid(src);
+ gfp_t gfp = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) |
+ GFP_NOWAIT | __GFP_THISNODE;
+
+ if (dst)
+ __free_page(dst);
+
+ dst = alloc_pages_node(nid, gfp, 0);
+ if (!dst)
+ return -ENOMEM;
+ } while (!page_ref_freeze(dst, 1));
+
+ copy_highpage(dst, folio_page(src, i));
+ src->_dst_ul[i] |= (unsigned long)dst;
+
+ cond_resched();
+ }
+
+ return 0;
+}
+
+static void free_dst_pages(struct folio *src)
+{
+ int i;
+ int nr_pages = folio_nr_pages(src);
+
+ if (folio_can_split(src))
+ return;
+
+ if (!src->_dst_pp)
+ return;
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *dst = folio_dst_page(src, i);
+
+ if (!dst)
+ continue;
+
+ page_ref_unfreeze(dst, 1);
+ __free_page(dst);
+ }
+
+ kfree(src->_dst_pp);
+ src->_dst_pp = NULL;
+}
+
+static void reset_src_folio(struct folio *src)
+{
+ if (folio_can_split(src))
+ return;
+
+ if (WARN_ON_ONCE(!src->_dst_pp))
+ return;
+
+ if (!folio_mapping_flags(src))
+ src->mapping = NULL;
+
+ if (folio_test_anon(src) && folio_test_swapcache(src)) {
+ folio_clear_swapcache(src);
+ src->swap.val = 0;
+ }
+
+ kfree(src->_dst_pp);
+ src->_dst_pp = NULL;
+}
+
+static void copy_page_owner(struct folio *src)
+{
+ int i;
+ int nr_pages = folio_nr_pages(src);
+
+ if (folio_can_split(src))
+ return;
+
+ if (WARN_ON_ONCE(!src->_dst_pp))
+ return;
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *dst = folio_dst_page(src, i);
+
+ if (dst)
+ folio_copy_owner(src, page_folio(dst));
+ }
+}
+
+static bool lru_add_dst(struct lruvec *lruvec, struct folio *src, struct folio *dst)
+{
+ if (folio_can_split(src))
+ return false;
+
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_lru(src), src);
+ VM_WARN_ON_ONCE_FOLIO(folio_test_lru(dst), dst);
+ VM_WARN_ON_ONCE_FOLIO(folio_lruvec(dst) != folio_lruvec(src), dst);
+
+ if (!lru_gen_add_dst(lruvec, dst)) {
+ enum lru_list lru = folio_lru_list(dst);
+ int zone = folio_zonenum(dst);
+ int delta = folio_nr_pages(dst);
+
+ if (folio_test_unevictable(dst))
+ dst->mlock_count = 0;
+ else
+ list_add_tail(&dst->lru, &src->lru);
+ update_lru_size(lruvec, lru, zone, delta);
+ }
+
+ folio_set_lru(dst);
+
+ return true;
+}
+
static void lru_add_page_tail(struct page *head, struct page *tail,
struct lruvec *lruvec, struct list_head *list)
{
@@ -2745,7 +2921,7 @@ static void lru_add_page_tail(struct page *head, struct page *tail,
VM_WARN_ON(PageLRU(head));
get_page(tail);
list_add_tail(&tail->lru, list);
- } else {
+ } else if (!lru_add_dst(lruvec, page_folio(head), page_folio(tail))) {
/* head is still on lru (and we have it frozen) */
VM_WARN_ON(!PageLRU(head));
if (PageUnevictable(tail))
@@ -2760,7 +2936,7 @@ static void __split_huge_page_tail(struct folio *folio, int tail,
struct lruvec *lruvec, struct list_head *list)
{
struct page *head = &folio->page;
- struct page *page_tail = head + tail;
+ struct page *page_tail = folio_dst_page(folio, tail);
/*
* Careful: new_folio is not a "real" folio before we cleared PageTail.
* Don't pass it around before clear_compound_head().
@@ -2801,8 +2977,8 @@ static void __split_huge_page_tail(struct folio *folio, int tail,
LRU_GEN_MASK | LRU_REFS_MASK));
/* ->mapping in first and second tail page is replaced by other uses */
- VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
- page_tail);
+ VM_BUG_ON_PAGE(folio_can_split(folio) && tail > 2 &&
+ page_tail->mapping != TAIL_MAPPING, page_tail);
page_tail->mapping = head->mapping;
page_tail->index = head->index + tail;
@@ -2857,9 +3033,13 @@ static void __split_huge_page(struct page *page, struct list_head *list,
unsigned long offset = 0;
unsigned int nr = thp_nr_pages(head);
int i, nr_dropped = 0;
+ bool can_split = folio_can_split(folio);
/* complete memcg works before add pages to LRU */
- split_page_memcg(head, nr);
+ if (can_split)
+ split_page_memcg(head, nr);
+ else
+ folio_copy_memcg(folio);
if (folio_test_anon(folio) && folio_test_swapcache(folio)) {
offset = swp_offset(folio->swap);
@@ -2872,46 +3052,53 @@ static void __split_huge_page(struct page *page, struct list_head *list,
ClearPageHasHWPoisoned(head);
- for (i = nr - 1; i >= 1; i--) {
+ for (i = nr - 1; i >= can_split; i--) {
+ struct page *dst = folio_dst_page(folio, i);
+
+ if (!dst)
+ continue;
+
__split_huge_page_tail(folio, i, lruvec, list);
/* Some pages can be beyond EOF: drop them from page cache */
- if (head[i].index >= end) {
- struct folio *tail = page_folio(head + i);
+ if (dst->index >= end) {
+ struct folio *tail = page_folio(dst);
- if (shmem_mapping(head->mapping))
+ if (shmem_mapping(tail->mapping))
nr_dropped++;
else if (folio_test_clear_dirty(tail))
folio_account_cleaned(tail,
- inode_to_wb(folio->mapping->host));
+ inode_to_wb(tail->mapping->host));
__filemap_remove_folio(tail, NULL);
folio_put(tail);
- } else if (!PageAnon(page)) {
- __xa_store(&head->mapping->i_pages, head[i].index,
- head + i, 0);
+ } else if (!PageAnon(dst)) {
+ __xa_store(&dst->mapping->i_pages, dst->index, dst, 0);
} else if (swap_cache) {
- __xa_store(&swap_cache->i_pages, offset + i,
- head + i, 0);
+ __xa_store(&swap_cache->i_pages, offset + i, dst, 0);
}
}
- ClearPageCompound(head);
+ if (can_split)
+ ClearPageCompound(head);
unlock_page_lruvec(lruvec);
/* Caller disabled irqs, so they are still disabled here */
- split_page_owner(head, nr);
+ if (can_split)
+ split_page_owner(head, nr);
+ else
+ copy_page_owner(folio);
/* See comment in __split_huge_page_tail() */
if (PageAnon(head)) {
/* Additional pin to swap cache */
if (PageSwapCache(head)) {
- page_ref_add(head, 2);
+ page_ref_add(head, 2 - !can_split);
xa_unlock(&swap_cache->i_pages);
} else {
page_ref_inc(head);
}
} else {
/* Additional pin to page cache */
- page_ref_add(head, 2);
+ page_ref_add(head, 2 - !can_split);
xa_unlock(&head->mapping->i_pages);
}
local_irq_enable();
@@ -2924,8 +3111,9 @@ static void __split_huge_page(struct page *page, struct list_head *list,
split_swap_cluster(folio->swap);
for (i = 0; i < nr; i++) {
- struct page *subpage = head + i;
- if (subpage == page)
+ struct page *subpage = folio_dst_page(folio, i);
+
+ if (!subpage || subpage == page)
continue;
unlock_page(subpage);
@@ -2945,9 +3133,6 @@ static bool can_split_folio(struct folio *folio, int *pextra_pins)
{
int extra_pins;
- if (!folio_can_split(folio))
- return false;
-
/* Additional pins from page cache */
if (folio_test_anon(folio))
extra_pins = folio_test_swapcache(folio) ?
@@ -3067,8 +3252,21 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
goto out_unlock;
}
+ ret = prep_to_unmap(folio);
+ if (ret)
+ goto out_unlock;
+
unmap_folio(folio);
+ if (!folio_ref_freeze(folio, 1 + extra_pins)) {
+ ret = -EAGAIN;
+ goto fail;
+ }
+
+ ret = prep_dst_pages(folio);
+ if (ret)
+ goto fail;
+
/* block interrupt reentry in xa_lock and spinlock */
local_irq_disable();
if (mapping) {
@@ -3078,44 +3276,41 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
*/
xas_lock(&xas);
xas_reset(&xas);
- if (xas_load(&xas) != folio)
+ if (xas_load(&xas) != folio) {
+ xas_unlock(&xas);
+ local_irq_enable();
+ ret = -EAGAIN;
goto fail;
+ }
}
/* Prevent deferred_split_scan() touching ->_refcount */
spin_lock(&ds_queue->split_queue_lock);
- if (folio_ref_freeze(folio, 1 + extra_pins)) {
- if (!list_empty(&folio->_deferred_list)) {
- ds_queue->split_queue_len--;
- list_del(&folio->_deferred_list);
- }
- spin_unlock(&ds_queue->split_queue_lock);
- if (mapping) {
- int nr = folio_nr_pages(folio);
+ if (!list_empty(&folio->_deferred_list)) {
+ ds_queue->split_queue_len--;
+ list_del_init(&folio->_deferred_list);
+ }
+ spin_unlock(&ds_queue->split_queue_lock);
+ if (mapping) {
+ int nr = folio_nr_pages(folio);
- xas_split(&xas, folio, folio_order(folio));
- if (folio_test_pmd_mappable(folio)) {
- if (folio_test_swapbacked(folio)) {
- __lruvec_stat_mod_folio(folio,
- NR_SHMEM_THPS, -nr);
- } else {
- __lruvec_stat_mod_folio(folio,
- NR_FILE_THPS, -nr);
- filemap_nr_thps_dec(mapping);
- }
+ xas_split(&xas, folio, folio_order(folio));
+ if (folio_test_pmd_mappable(folio)) {
+ if (folio_test_swapbacked(folio)) {
+ __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr);
+ } else {
+ __lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr);
+ filemap_nr_thps_dec(mapping);
}
}
+ }
- __split_huge_page(page, list, end);
- ret = 0;
- } else {
- spin_unlock(&ds_queue->split_queue_lock);
+ __split_huge_page(page, list, end);
+ reset_src_folio(folio);
fail:
- if (mapping)
- xas_unlock(&xas);
- local_irq_enable();
+ if (ret) {
+ free_dst_pages(folio);
remap_page(folio, folio_nr_pages(folio));
- ret = -EAGAIN;
}
out_unlock:
@@ -3127,6 +3322,12 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
i_mmap_unlock_read(mapping);
out:
xas_destroy(&xas);
+
+ if (!folio_can_split(folio)) {
+ count_vm_event(!ret ? THP_SHATTER_PAGE : THP_SHATTER_PAGE_FAILED);
+ return ret ? : 1;
+ }
+
count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
return ret;
}
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1266,4 +1266,42 @@ static inline void shrinker_debugfs_remove(struct dentry *debugfs_entry,
}
#endif /* CONFIG_SHRINKER_DEBUG */
+#define SRC_PAGE_MAPPED BIT(0)
+#define SRC_PAGE_MLOCKED BIT(1)
+#define SRC_PAGE_CLEAN BIT(2)
+#define SRC_PAGE_USAGE_MASK (BIT(3) - 1)
+
+static inline unsigned long src_page_usage(struct page *page)
+{
+ struct folio *src = page_folio(page);
+ int i = folio_page_idx(src, page);
+
+ if (folio_can_split(src) || !src->_dst_ul)
+ return 0;
+
+ return src->_dst_ul[i] & SRC_PAGE_USAGE_MASK;
+}
+
+static inline bool can_discard_src(struct page *page)
+{
+ return src_page_usage(page) & SRC_PAGE_CLEAN;
+}
+
+static inline void set_src_usage(struct page *page, unsigned long usage)
+{
+ struct folio *src = page_folio(page);
+ int i = folio_page_idx(src, page);
+
+ if (!folio_can_split(src) && src->_dst_ul)
+ src->_dst_ul[i] |= usage;
+}
+
+static inline struct page *folio_dst_page(struct folio *src, int i)
+{
+ if (folio_can_split(src) || !src->_dst_ul)
+ return folio_page(src, i);
+
+ return (void *)(src->_dst_ul[i] & ~SRC_PAGE_USAGE_MASK);
+}
+
#endif /* __MM_INTERNAL_H */
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -381,7 +381,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
err = split_folio(folio);
folio_unlock(folio);
folio_put(folio);
- if (!err)
+ if (err >= 0)
goto regular_folio;
return 0;
}
@@ -466,8 +466,10 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
err = split_folio(folio);
folio_unlock(folio);
folio_put(folio);
- if (err)
+ if (err < 0)
break;
+ if (err)
+ goto restart;
start_pte = pte =
pte_offset_map_lock(mm, pmd, addr, &ptl);
if (!start_pte)
@@ -635,6 +637,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
return 0;
tlb_change_page_size(tlb, PAGE_SIZE);
+restart:
start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
if (!start_pte)
return 0;
@@ -688,8 +691,10 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
err = split_folio(folio);
folio_unlock(folio);
folio_put(folio);
- if (err)
+ if (err < 0)
break;
+ if (err)
+ goto restart;
start_pte = pte =
pte_offset_map_lock(mm, pmd, addr, &ptl);
if (!start_pte)
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3624,6 +3624,53 @@ void split_page_memcg(struct page *head, unsigned int nr)
css_get_many(&memcg->css, nr - 1);
}
+void folio_copy_memcg(struct folio *src)
+{
+ int i;
+ unsigned long flags;
+ int delta = 0;
+ int nr_pages = folio_nr_pages(src);
+ struct mem_cgroup *memcg = folio_memcg(src);
+
+ if (folio_can_split(src))
+ return;
+
+ if (WARN_ON_ONCE(!src->_dst_pp))
+ return;
+
+ if (mem_cgroup_disabled())
+ return;
+
+ if (WARN_ON_ONCE(!memcg))
+ return;
+
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_large(src), src);
+ VM_WARN_ON_ONCE_FOLIO(folio_ref_count(src), src);
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *dst = folio_dst_page(src, i);
+
+ if (!dst)
+ continue;
+
+ commit_charge(page_folio(dst), memcg);
+ delta++;
+ }
+
+ if (!mem_cgroup_is_root(memcg)) {
+ page_counter_charge(&memcg->memory, delta);
+ if (do_memsw_account())
+ page_counter_charge(&memcg->memsw, delta);
+ }
+
+ css_get_many(&memcg->css, delta);
+
+ local_irq_save(flags);
+ mem_cgroup_charge_statistics(memcg, delta);
+ memcg_check_events(memcg, folio_nid(src));
+ local_irq_restore(flags);
+}
+
#ifdef CONFIG_SWAP
/**
* mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -2289,7 +2289,7 @@ int memory_failure(unsigned long pfn, int flags)
* page is a valid handlable page.
*/
SetPageHasHWPoisoned(hpage);
- if (try_to_split_thp_page(p) < 0) {
+ if (try_to_split_thp_page(p)) {
res = action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED);
goto unlock_mutex;
}
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -180,36 +180,52 @@ void putback_movable_pages(struct list_head *l)
/*
* Restore a potential migration pte to a working pte entry
*/
-static bool remove_migration_pte(struct folio *folio,
- struct vm_area_struct *vma, unsigned long addr, void *old)
+static bool remove_migration_pte(struct folio *dst,
+ struct vm_area_struct *vma, unsigned long addr, void *arg)
{
- DEFINE_FOLIO_VMA_WALK(pvmw, old, vma, addr, PVMW_SYNC | PVMW_MIGRATION);
+ struct folio *src = arg;
+ DEFINE_FOLIO_VMA_WALK(pvmw, src, vma, addr, PVMW_SYNC | PVMW_MIGRATION);
while (page_vma_mapped_walk(&pvmw)) {
rmap_t rmap_flags = RMAP_NONE;
pte_t old_pte;
pte_t pte;
swp_entry_t entry;
- struct page *new;
+ struct page *page;
+ struct folio *folio;
unsigned long idx = 0;
/* pgoff is invalid for ksm pages, but they are never large */
- if (folio_test_large(folio) && !folio_test_hugetlb(folio))
+ if (folio_test_large(dst) && !folio_test_hugetlb(dst))
idx = linear_page_index(vma, pvmw.address) - pvmw.pgoff;
- new = folio_page(folio, idx);
+ page = folio_page(dst, idx);
+
+ if (src == dst) {
+ if (can_discard_src(page)) {
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_anon(src), src);
+
+ pte_clear_not_present_full(pvmw.vma->vm_mm, pvmw.address,
+ pvmw.pte, false);
+ dec_mm_counter(pvmw.vma->vm_mm, MM_ANONPAGES);
+ continue;
+ }
+ page = folio_dst_page(src, idx);
+ }
+
+ folio = page_folio(page);
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
/* PMD-mapped THP migration entry */
if (!pvmw.pte) {
VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) ||
!folio_test_pmd_mappable(folio), folio);
- remove_migration_pmd(&pvmw, new);
+ remove_migration_pmd(&pvmw, page);
continue;
}
#endif
folio_get(folio);
- pte = mk_pte(new, READ_ONCE(vma->vm_page_prot));
+ pte = mk_pte(page, READ_ONCE(vma->vm_page_prot));
old_pte = ptep_get(pvmw.pte);
if (pte_swp_soft_dirty(old_pte))
pte = pte_mksoft_dirty(pte);
@@ -227,13 +243,13 @@ static bool remove_migration_pte(struct folio *folio,
if (folio_test_anon(folio) && !is_readable_migration_entry(entry))
rmap_flags |= RMAP_EXCLUSIVE;
- if (unlikely(is_device_private_page(new))) {
+ if (unlikely(is_device_private_page(page))) {
if (pte_write(pte))
entry = make_writable_device_private_entry(
- page_to_pfn(new));
+ page_to_pfn(page));
else
entry = make_readable_device_private_entry(
- page_to_pfn(new));
+ page_to_pfn(page));
pte = swp_entry_to_pte(entry);
if (pte_swp_soft_dirty(old_pte))
pte = pte_swp_mksoft_dirty(pte);
@@ -259,17 +275,17 @@ static bool remove_migration_pte(struct folio *folio,
#endif
{
if (folio_test_anon(folio))
- folio_add_anon_rmap_pte(folio, new, vma,
+ folio_add_anon_rmap_pte(folio, page, vma,
pvmw.address, rmap_flags);
else
- folio_add_file_rmap_pte(folio, new, vma);
+ folio_add_file_rmap_pte(folio, page, vma);
set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
}
if (vma->vm_flags & VM_LOCKED)
mlock_drain_local();
trace_remove_migration_pte(pvmw.address, pte_val(pte),
- compound_order(new));
+ compound_order(page));
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, pvmw.address, pvmw.pte);
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1012,6 +1012,10 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page)
bad_page(page, "nonzero pincount");
goto out;
}
+ if (unlikely(folio->_private_1)) {
+ bad_page(page, "nonzero _private_1");
+ goto out;
+ }
break;
case 2:
/*
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -2260,6 +2260,10 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
hsz);
else
set_pte_at(mm, address, pvmw.pte, swp_pte);
+ if (vma->vm_flags & VM_LOCKED)
+ set_src_usage(subpage, SRC_PAGE_MLOCKED);
+ else
+ set_src_usage(subpage, SRC_PAGE_MAPPED);
trace_set_migration_pte(address, pte_val(swp_pte),
compound_order(&folio->page));
/*
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -696,7 +696,7 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
folio_put(folio);
/* If split failed move the inode on the list back to shrinklist */
- if (ret)
+ if (ret < 0)
goto move_back;
split++;
@@ -1450,7 +1450,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
if (folio_test_large(folio)) {
/* Ensure the subpages are still dirty */
folio_test_set_dirty(folio);
- if (split_huge_page(page) < 0)
+ if (split_huge_page(page))
goto redirty;
folio = page_folio(page);
folio_clear_dirty(folio);
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -209,6 +209,7 @@ int truncate_inode_folio(struct address_space *mapping, struct folio *folio)
*/
bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end)
{
+ int err;
loff_t pos = folio_pos(folio);
unsigned int offset, length;
@@ -239,8 +240,11 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end)
folio_invalidate(folio, offset, length);
if (!folio_test_large(folio))
return true;
- if (split_folio(folio) == 0)
+ err = split_folio(folio);
+ if (!err)
return true;
+ if (err > 0)
+ return false;
if (folio_test_dirty(folio))
return false;
truncate_inode_folio(folio->mapping, folio);
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -1094,7 +1094,7 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
pte_unmap(&orig_dst_pte);
src_pte = dst_pte = NULL;
err = split_folio(src_folio);
- if (err)
+ if (err < 0)
goto out;
/* have to reacquire the folio after it got split */
folio_unlock(src_folio);
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1223,6 +1223,15 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
goto keep_locked;
}
+ if (folio_ref_count(folio) == 1) {
+ folio_unlock(folio);
+ if (folio_put_testzero(folio))
+ goto free_it;
+
+ nr_reclaimed += nr_pages;
+ continue;
+ }
+
/*
* If the folio was split above, the tail pages will make
* their own pass through this function and be accounted
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1364,6 +1364,9 @@ const char * const vmstat_text[] = {
"thp_split_page_failed",
"thp_deferred_split_page",
"thp_split_pmd",
+ "thp_shatter_page",
+ "thp_shatter_page_failed",
+ "thp_shatter_page_discarded",
"thp_scan_exceed_none_pte",
"thp_scan_exceed_swap_pte",
"thp_scan_exceed_share_pte",
In contrast to a split, a shatter migrates the occupied pages of a
partially mapped THP to a set of base folios. In other words, unlike a
split, which is done in place, a shatter is the exact opposite of a
collapse.

The advantage of shattering is that it keeps the original THP intact.
The cost of copying during the migration is not a side effect but by
design, since splitting is considered a discouraged behavior. In retail
terms, a returned purchase incurs a restocking fee, and the original
goods can be resold.

THPs from ZONE_NOMERGE can only be shattered, since they cannot be
split or merged. THPs from ZONE_NOSPLIT can be shattered or split (the
latter requires [1]) if they are above the minimum order.

[1] https://lore.kernel.org/20240226205534.1603748-1-zi.yan@sent.com/

Signed-off-by: Yu Zhao <yuzhao@google.com>
---
 include/linux/memcontrol.h    |   5 +
 include/linux/mm_inline.h     |  24 +++
 include/linux/mm_types.h      |   8 +-
 include/linux/vm_event_item.h |   3 +
 mm/huge_memory.c              | 303 ++++++++++++++++++++++++++++------
 mm/internal.h                 |  38 +++++
 mm/madvise.c                  |  11 +-
 mm/memcontrol.c               |  47 ++++++
 mm/memory-failure.c           |   2 +-
 mm/migrate.c                  |  44 +++--
 mm/page_alloc.c               |   4 +
 mm/rmap.c                     |   4 +
 mm/shmem.c                    |   4 +-
 mm/truncate.c                 |   6 +-
 mm/userfaultfd.c              |   2 +-
 mm/vmscan.c                   |   9 +
 mm/vmstat.c                   |   3 +
 17 files changed, 443 insertions(+), 74 deletions(-)
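
The caller-visible effect of the patch is a tri-state return from
split_huge_page()/split_folio(): a negative errno on failure, 0 when the
folio was split in place, and 1 when it was shattered, in which case the
original THP remains intact and its mapped subpages now live in newly
allocated base folios, so page-table walks must be restarted (see the
"goto restart" added to mm/madvise.c and the "err > 0" check in
mm/truncate.c). The sketch below is only an illustration of that
convention, not part of the patch; the helper name break_up_folio() is
hypothetical, and the preconditions mirror existing callers such as
madvise_cold_or_pageout_pte_range(): the folio is referenced and locked,
and split_folio() comes from <linux/huge_mm.h>.

/*
 * A minimal sketch, not part of the patch above: a hypothetical helper
 * showing how a caller is expected to handle the tri-state return of
 * split_folio() after this change. The folio must be referenced and
 * locked by the caller, as with existing split_folio() users.
 */
static int break_up_folio(struct folio *folio)
{
	int err = split_folio(folio);

	if (err < 0)
		return err;	/* failure: neither split nor shattered */

	if (err) {
		/*
		 * Shattered (err == 1): the original THP stays a compound
		 * page; its mapped contents were copied into newly allocated
		 * base folios, so cached struct page pointers into the THP
		 * are stale for this mapping and the PTE walk must restart,
		 * as the "goto restart" added to mm/madvise.c does.
		 */
		return 1;
	}

	/* Split in place (err == 0): tail pages are now base folios. */
	return 0;
}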