Message ID | 20240502084939.30250-5-ryncsn@gmail.com (mailing list archive) |
---|---|
State | New |
Series | mm/swap: clean up and optimize swap cache index |
Kairui Song <ryncsn@gmail.com> writes:

> From: Kairui Song <kasong@tencent.com>
>
> Currently we use one swap_address_space for every 64M chunk to reduce lock
> contention, this is like having a set of smaller swap files inside one
> swap device. But when doing swap cache look up or insert, we are
> still using the offset of the whole large swap device. This is OK for
> correctness, as the offset (key) is unique.
>
> But Xarray is specially optimized for small indexes, it creates the
> radix tree levels lazily to be just enough to fit the largest key
> stored in one Xarray. So we are wasting tree nodes unnecessarily.
>
> For 64M chunk it should only take at most 3 levels to contain everything.
> But if we are using the offset from the whole swap device, the offset (key)
> value will be way beyond 64M, and so will the tree level.
>
> Optimize this by using a new helper swap_cache_index to get a swap
> entry's unique offset in its own 64M swap_address_space.
>
> I see a ~1% performance gain in benchmark and actual workload with
> high memory pressure.
>
> Test with `time memhog 128G` inside a 8G memcg using 128G swap (ramdisk
> with SWP_SYNCHRONOUS_IO dropped, tested 3 times, results are stable. The
> test result is similar but the improvement is smaller if SWP_SYNCHRONOUS_IO
> is enabled, as swap out path can never skip swap cache):
>
> Before:
> 6.07user 250.74system 4:17.26elapsed 99%CPU (0avgtext+0avgdata 8373376maxresident)k
> 0inputs+0outputs (55major+33555018minor)pagefaults 0swaps
>
> After (1.8% faster):
> 6.08user 246.09system 4:12.58elapsed 99%CPU (0avgtext+0avgdata 8373248maxresident)k
> 0inputs+0outputs (54major+33555027minor)pagefaults 0swaps
>
> Similar result with MySQL and sysbench using swap:
> Before:
> 94055.61 qps
>
> After (0.8% faster):
> 94834.91 qps
>
> Radix tree slab usage is also very slightly lower.
>
> Signed-off-by: Kairui Song <kasong@tencent.com>
> ---
>  mm/huge_memory.c |  2 +-
>  mm/memcontrol.c  |  2 +-
>  mm/mincore.c     |  2 +-
>  mm/shmem.c       |  2 +-
>  mm/swap.h        | 15 +++++++++++++++
>  mm/swap_state.c  | 19 ++++++++++---------
>  mm/swapfile.c    |  6 +++---
>  7 files changed, 32 insertions(+), 16 deletions(-)
>
[...]
>
> diff --git a/mm/swap.h b/mm/swap.h
> index 82023ab93205..93e3e1b58a7f 100644
> --- a/mm/swap.h
> +++ b/mm/swap.h
> @@ -27,6 +27,7 @@ void __swap_writepage(struct folio *folio, struct writeback_control *wbc);
>  /* One swap address space for each 64M swap space */
>  #define SWAP_ADDRESS_SPACE_SHIFT        14
>  #define SWAP_ADDRESS_SPACE_PAGES        (1 << SWAP_ADDRESS_SPACE_SHIFT)
> +#define SWAP_ADDRESS_SPACE_MASK         (BIT(SWAP_ADDRESS_SPACE_SHIFT) - 1)

#define SWAP_ADDRESS_SPACE_MASK         (SWAP_ADDRESS_SPACE_PAGES - 1)

?

We can use BIT() in SWAP_ADDRESS_SPACE_PAGES definition.

>  extern struct address_space *swapper_spaces[];
>  #define swap_address_space(entry)                           \
>          (&swapper_spaces[swp_type(entry)][swp_offset(entry) \
> @@ -40,6 +41,15 @@ static inline loff_t swap_dev_pos(swp_entry_t entry)
>          return ((loff_t)swp_offset(entry)) << PAGE_SHIFT;
>  }
>
> +/*
> + * Return the swap cache index of the swap entry.
> + */
> +static inline pgoff_t swap_cache_index(swp_entry_t entry)
> +{
> +        BUILD_BUG_ON((SWP_OFFSET_MASK | SWAP_ADDRESS_SPACE_MASK) != SWP_OFFSET_MASK);
> +        return swp_offset(entry) & SWAP_ADDRESS_SPACE_MASK;
> +}
> +
>  void show_swap_cache_info(void);
>  bool add_to_swap(struct folio *folio);
>  void *get_shadow_from_swap_cache(swp_entry_t entry);
>
[...]
>
> diff --git a/mm/swap_state.c b/mm/swap_state.c
> index 642c30d8376c..09415d4c7843 100644
> --- a/mm/swap_state.c
> +++ b/mm/swap_state.c
>
[...]
>
> @@ -248,18 +248,19 @@ void delete_from_swap_cache(struct folio *folio)
>  void clear_shadow_from_swap_cache(int type, unsigned long begin,
>                                  unsigned long end)
>  {
> -        unsigned long curr = begin;
> +        unsigned long curr = begin, offset;

Better to rename "offset" as "index" to avoid confusion?

>          void *old;
>
>          for (;;) {
> +                offset = curr & SWAP_ADDRESS_SPACE_MASK;
>                  swp_entry_t entry = swp_entry(type, curr);
>                  struct address_space *address_space = swap_address_space(entry);
> -                XA_STATE(xas, &address_space->i_pages, curr);
> +                XA_STATE(xas, &address_space->i_pages, offset);
>
>                  xas_set_update(&xas, workingset_update_node);
>
>                  xa_lock_irq(&address_space->i_pages);
> -                xas_for_each(&xas, old, end) {
> +                xas_for_each(&xas, old, offset + min(end - curr, SWAP_ADDRESS_SPACE_PAGES)) {

Is there a bug in the original code?  It doesn't check
SWAP_ADDRESS_SPACE_PAGES.

And should it be changed to

        xas_for_each(&xas, old, min(offset + end - curr, SWAP_ADDRESS_SPACE_PAGES))

?

>                          if (!xa_is_value(old))
>                                  continue;
>                          xas_store(&xas, NULL);
>
[...]

--
Best Regards,
Huang, Ying
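The xarray-depth argument in the commit message quoted above can be made concrete with a small calculation. Below is a minimal userspace sketch, not kernel code: it assumes the XArray's default fan-out of 64 slots per node (XA_CHUNK_SHIFT of 6, the usual configuration) and mirrors the SWAP_ADDRESS_SPACE_* constants from mm/swap.h; the 128G device size and the example offset are purely illustrative.

/*
 * Rough illustration of the commit message's xarray-depth argument.
 * Assumes the default XArray fan-out of 64 slots per node
 * (XA_CHUNK_SHIFT == 6); constants mirror mm/swap.h.
 * Userspace sketch only, not kernel code.
 */
#include <assert.h>
#include <stdio.h>

#define SWAP_ADDRESS_SPACE_SHIFT 14                 /* 2^14 pages = 64M per space */
#define SWAP_ADDRESS_SPACE_PAGES (1UL << SWAP_ADDRESS_SPACE_SHIFT)
#define SWAP_ADDRESS_SPACE_MASK  (SWAP_ADDRESS_SPACE_PAGES - 1)
#define XA_CHUNK_SHIFT 6                            /* 64 slots per xarray node */

/* Number of tree levels needed to hold keys up to max_index. */
static unsigned int xa_levels(unsigned long max_index)
{
        unsigned int levels = 1;

        while (max_index >> XA_CHUNK_SHIFT) {
                max_index >>= XA_CHUNK_SHIFT;
                levels++;
        }
        return levels;
}

int main(void)
{
        /* 128G of 4K pages = 2^25 entries if keyed by the device-wide offset. */
        unsigned long device_pages = 1UL << 25;
        unsigned long dev_offset = 0x1234567;       /* hypothetical swap offset */
        unsigned long cache_index = dev_offset & SWAP_ADDRESS_SPACE_MASK;

        printf("levels with device-wide offset: %u\n", xa_levels(device_pages - 1));
        printf("levels with per-64M index:      %u\n", xa_levels(SWAP_ADDRESS_SPACE_PAGES - 1));

        /* Matches the commit message: a 64M space needs at most 3 levels. */
        assert(xa_levels(SWAP_ADDRESS_SPACE_PAGES - 1) == 3);
        assert(cache_index < SWAP_ADDRESS_SPACE_PAGES);
        return 0;
}

With these assumptions, keys bounded by the per-64M index need 3 levels while keys taken from a 128G device-wide offset need 5, which is the waste the patch removes.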
On Wed, May 8, 2024 at 3:27 PM Huang, Ying <ying.huang@intel.com> wrote:
>
> Kairui Song <ryncsn@gmail.com> writes:
>
> > From: Kairui Song <kasong@tencent.com>
> >
> > Currently we use one swap_address_space for every 64M chunk to reduce lock
> > contention, this is like having a set of smaller swap files inside one
> > swap device. But when doing swap cache look up or insert, we are
> > still using the offset of the whole large swap device. This is OK for
> > correctness, as the offset (key) is unique.
> >
[...]
> > diff --git a/mm/swap.h b/mm/swap.h
> > index 82023ab93205..93e3e1b58a7f 100644
> > --- a/mm/swap.h
> > +++ b/mm/swap.h
> > @@ -27,6 +27,7 @@ void __swap_writepage(struct folio *folio, struct writeback_control *wbc);
> >  /* One swap address space for each 64M swap space */
> >  #define SWAP_ADDRESS_SPACE_SHIFT        14
> >  #define SWAP_ADDRESS_SPACE_PAGES        (1 << SWAP_ADDRESS_SPACE_SHIFT)
> > +#define SWAP_ADDRESS_SPACE_MASK         (BIT(SWAP_ADDRESS_SPACE_SHIFT) - 1)
>
> #define SWAP_ADDRESS_SPACE_MASK         (SWAP_ADDRESS_SPACE_PAGES - 1)
> ?
>
> We can use BIT() in SWAP_ADDRESS_SPACE_PAGES definition.

I'll just use (SWAP_ADDRESS_SPACE_PAGES - 1) then. I was trying to keep
the changes minimal, but preferred the BIT macro; a trivial change.

[...]
> > @@ -248,18 +248,19 @@ void delete_from_swap_cache(struct folio *folio)
> >  void clear_shadow_from_swap_cache(int type, unsigned long begin,
> >                                  unsigned long end)
> >  {
> > -        unsigned long curr = begin;
> > +        unsigned long curr = begin, offset;
>
> Better to rename "offset" as "index" to avoid confusion?

Good idea.

> >          void *old;
> >
> >          for (;;) {
> > +                offset = curr & SWAP_ADDRESS_SPACE_MASK;
> >                  swp_entry_t entry = swp_entry(type, curr);
> >                  struct address_space *address_space = swap_address_space(entry);
> > -                XA_STATE(xas, &address_space->i_pages, curr);
> > +                XA_STATE(xas, &address_space->i_pages, offset);
> >
> >                  xas_set_update(&xas, workingset_update_node);
> >
> >                  xa_lock_irq(&address_space->i_pages);
> > -                xas_for_each(&xas, old, end) {
> > +                xas_for_each(&xas, old, offset + min(end - curr, SWAP_ADDRESS_SPACE_PAGES)) {
>
> Is there a bug in the original code?  It doesn't check
> SWAP_ADDRESS_SPACE_PAGES.

That's OK: if (end - curr) goes above SWAP_ADDRESS_SPACE_PAGES, it means
all content in the current address_space needs to be purged, and
xas_for_each will stop after it has iterated everything in the current
address space. This is a bit hackish though.

> And should it be changed to
>
>         xas_for_each(&xas, old, min(offset + end - curr, SWAP_ADDRESS_SPACE_PAGES))
>
> ?

It should be equivalent, as described above, but yeah, this looks
cleaner. I'll use your suggested code.

[...]

Thanks for the suggestions!

> --
> Best Regards,
> Huang, Ying
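The exchange above about the xas_for_each() bound can be checked by brute force. The sketch below only models the clamping arithmetic, assuming (as both sides note) that no entry ever sits at an index of SWAP_ADDRESS_SPACE_PAGES or above within one swap_address_space; under that assumption the bound in the patch and the bound suggested in review always clamp to the same last usable index.

/*
 * Brute-force check that the two xas_for_each() bounds discussed above
 * are interchangeable. Userspace sketch, not kernel code: indices
 * within one swap_address_space never reach SWAP_ADDRESS_SPACE_PAGES,
 * so any bound at or past the last valid index
 * (SWAP_ADDRESS_SPACE_PAGES - 1) behaves the same.
 */
#include <assert.h>

#define SWAP_ADDRESS_SPACE_SHIFT 14
#define SWAP_ADDRESS_SPACE_PAGES (1UL << SWAP_ADDRESS_SPACE_SHIFT)

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

/* Clamp a bound to the last index that can actually hold an entry. */
static unsigned long effective_last(unsigned long bound)
{
        return min_ul(bound, SWAP_ADDRESS_SPACE_PAGES - 1);
}

int main(void)
{
        unsigned long offset, remaining;

        for (offset = 0; offset < SWAP_ADDRESS_SPACE_PAGES; offset++) {
                for (remaining = 0; remaining < 3 * SWAP_ADDRESS_SPACE_PAGES; remaining += 127) {
                        /* The patch: offset + min(end - curr, SWAP_ADDRESS_SPACE_PAGES) */
                        unsigned long a = offset + min_ul(remaining, SWAP_ADDRESS_SPACE_PAGES);
                        /* Review suggestion: min(offset + end - curr, SWAP_ADDRESS_SPACE_PAGES) */
                        unsigned long b = min_ul(offset + remaining, SWAP_ADDRESS_SPACE_PAGES);

                        assert(effective_last(a) == effective_last(b));
                }
        }
        return 0;
}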
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index d35d526ed48f..45829cc049d2 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2918,7 +2918,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
         split_page_memcg(head, order, new_order);
 
         if (folio_test_anon(folio) && folio_test_swapcache(folio)) {
-                offset = swp_offset(folio->swap);
+                offset = swap_cache_index(folio->swap);
                 swap_cache = swap_address_space(folio->swap);
                 xa_lock(&swap_cache->i_pages);
         }
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d11536ef59ef..81b005c459cb 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -6165,7 +6165,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
          * Because swap_cache_get_folio() updates some statistics counter,
          * we call find_get_page() with swapper_space directly.
          */
-        page = find_get_page(swap_address_space(ent), swp_offset(ent));
+        page = find_get_page(swap_address_space(ent), swap_cache_index(ent));
         entry->val = ent.val;
 
         return page;
diff --git a/mm/mincore.c b/mm/mincore.c
index dad3622cc963..e31cf1bde614 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -139,7 +139,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
                 } else {
 #ifdef CONFIG_SWAP
                         *vec = mincore_page(swap_address_space(entry),
-                                            swp_offset(entry));
+                                            swap_cache_index(entry));
 #else
                         WARN_ON(1);
                         *vec = 1;
diff --git a/mm/shmem.c b/mm/shmem.c
index fa2a0ed97507..326315c12feb 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1756,7 +1756,7 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
 
         old = *foliop;
         entry = old->swap;
-        swap_index = swp_offset(entry);
+        swap_index = swap_cache_index(entry);
         swap_mapping = swap_address_space(entry);
 
         /*
diff --git a/mm/swap.h b/mm/swap.h
index 82023ab93205..93e3e1b58a7f 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -27,6 +27,7 @@ void __swap_writepage(struct folio *folio, struct writeback_control *wbc);
 /* One swap address space for each 64M swap space */
 #define SWAP_ADDRESS_SPACE_SHIFT        14
 #define SWAP_ADDRESS_SPACE_PAGES        (1 << SWAP_ADDRESS_SPACE_SHIFT)
+#define SWAP_ADDRESS_SPACE_MASK         (BIT(SWAP_ADDRESS_SPACE_SHIFT) - 1)
 extern struct address_space *swapper_spaces[];
 #define swap_address_space(entry)                           \
         (&swapper_spaces[swp_type(entry)][swp_offset(entry) \
@@ -40,6 +41,15 @@ static inline loff_t swap_dev_pos(swp_entry_t entry)
         return ((loff_t)swp_offset(entry)) << PAGE_SHIFT;
 }
 
+/*
+ * Return the swap cache index of the swap entry.
+ */
+static inline pgoff_t swap_cache_index(swp_entry_t entry)
+{
+        BUILD_BUG_ON((SWP_OFFSET_MASK | SWAP_ADDRESS_SPACE_MASK) != SWP_OFFSET_MASK);
+        return swp_offset(entry) & SWAP_ADDRESS_SPACE_MASK;
+}
+
 void show_swap_cache_info(void);
 bool add_to_swap(struct folio *folio);
 void *get_shadow_from_swap_cache(swp_entry_t entry);
@@ -86,6 +96,11 @@ static inline struct address_space *swap_address_space(swp_entry_t entry)
         return NULL;
 }
 
+static inline pgoff_t swap_cache_index(swp_entry_t entry)
+{
+        return 0;
+}
+
 static inline void show_swap_cache_info(void)
 {
 }
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 642c30d8376c..09415d4c7843 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -72,7 +72,7 @@ void show_swap_cache_info(void)
 void *get_shadow_from_swap_cache(swp_entry_t entry)
 {
         struct address_space *address_space = swap_address_space(entry);
-        pgoff_t idx = swp_offset(entry);
+        pgoff_t idx = swap_cache_index(entry);
         void *shadow;
 
         shadow = xa_load(&address_space->i_pages, idx);
@@ -89,7 +89,7 @@ int add_to_swap_cache(struct folio *folio, swp_entry_t entry,
                         gfp_t gfp, void **shadowp)
 {
         struct address_space *address_space = swap_address_space(entry);
-        pgoff_t idx = swp_offset(entry);
+        pgoff_t idx = swap_cache_index(entry);
         XA_STATE_ORDER(xas, &address_space->i_pages, idx, folio_order(folio));
         unsigned long i, nr = folio_nr_pages(folio);
         void *old;
@@ -144,7 +144,7 @@ void __delete_from_swap_cache(struct folio *folio,
         struct address_space *address_space = swap_address_space(entry);
         int i;
         long nr = folio_nr_pages(folio);
-        pgoff_t idx = swp_offset(entry);
+        pgoff_t idx = swap_cache_index(entry);
         XA_STATE(xas, &address_space->i_pages, idx);
 
         xas_set_update(&xas, workingset_update_node);
@@ -248,18 +248,19 @@ void delete_from_swap_cache(struct folio *folio)
 void clear_shadow_from_swap_cache(int type, unsigned long begin,
                                 unsigned long end)
 {
-        unsigned long curr = begin;
+        unsigned long curr = begin, offset;
         void *old;
 
         for (;;) {
+                offset = curr & SWAP_ADDRESS_SPACE_MASK;
                 swp_entry_t entry = swp_entry(type, curr);
                 struct address_space *address_space = swap_address_space(entry);
-                XA_STATE(xas, &address_space->i_pages, curr);
+                XA_STATE(xas, &address_space->i_pages, offset);
 
                 xas_set_update(&xas, workingset_update_node);
 
                 xa_lock_irq(&address_space->i_pages);
-                xas_for_each(&xas, old, end) {
+                xas_for_each(&xas, old, offset + min(end - curr, SWAP_ADDRESS_SPACE_PAGES)) {
                         if (!xa_is_value(old))
                                 continue;
                         xas_store(&xas, NULL);
@@ -350,7 +351,7 @@ struct folio *swap_cache_get_folio(swp_entry_t entry,
 {
         struct folio *folio;
 
-        folio = filemap_get_folio(swap_address_space(entry), swp_offset(entry));
+        folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry));
         if (!IS_ERR(folio)) {
                 bool vma_ra = swap_use_vma_readahead();
                 bool readahead;
@@ -420,7 +421,7 @@ struct folio *filemap_get_incore_folio(struct address_space *mapping,
         si = get_swap_device(swp);
         if (!si)
                 return ERR_PTR(-ENOENT);
-        index = swp_offset(swp);
+        index = swap_cache_index(swp);
         folio = filemap_get_folio(swap_address_space(swp), index);
         put_swap_device(si);
         return folio;
@@ -447,7 +448,7 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
          * that would confuse statistics.
          */
         folio = filemap_get_folio(swap_address_space(entry),
-                                swp_offset(entry));
+                                swap_cache_index(entry));
         if (!IS_ERR(folio))
                 goto got_folio;
 
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 0b0ae6e8c764..4f0e8b2ac8aa 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -142,7 +142,7 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si,
         struct folio *folio;
         int ret = 0;
 
-        folio = filemap_get_folio(swap_address_space(entry), offset);
+        folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry));
         if (IS_ERR(folio))
                 return 0;
         /*
@@ -2158,7 +2158,7 @@ static int try_to_unuse(unsigned int type)
             (i = find_next_to_unuse(si, i)) != 0) {
 
                 entry = swp_entry(type, i);
-                folio = filemap_get_folio(swap_address_space(entry), i);
+                folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry));
                 if (IS_ERR(folio))
                         continue;
 
@@ -3476,7 +3476,7 @@ EXPORT_SYMBOL_GPL(swapcache_mapping);
 
 pgoff_t __folio_swap_cache_index(struct folio *folio)
 {
-        return swp_offset(folio->swap);
+        return swap_cache_index(folio->swap);
 }
 EXPORT_SYMBOL_GPL(__folio_swap_cache_index);
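As a side note on the BUILD_BUG_ON() added in swap_cache_index() above, the relation it guards can also be expressed as a standalone compile-time check. The sketch below is a hypothetical userspace illustration: SWP_OFFSET_MASK here is a stand-in width, not the kernel's configuration-dependent value from include/linux/swapops.h, and the second assertion restates the review point that (SWAP_ADDRESS_SPACE_PAGES - 1) and (BIT(SWAP_ADDRESS_SPACE_SHIFT) - 1) are the same value.

/*
 * Stand-alone illustration of the compile-time check added in
 * swap_cache_index(). SWP_OFFSET_MASK below is a stand-in value; the
 * check only cares that every bit of SWAP_ADDRESS_SPACE_MASK is also a
 * bit of SWP_OFFSET_MASK, so masking a swap offset never touches bits
 * outside the offset field.
 */
#include <assert.h>

#define SWAP_ADDRESS_SPACE_SHIFT 14
#define SWAP_ADDRESS_SPACE_PAGES (1UL << SWAP_ADDRESS_SPACE_SHIFT)
#define SWAP_ADDRESS_SPACE_MASK  (SWAP_ADDRESS_SPACE_PAGES - 1)
#define SWP_OFFSET_MASK          ((1ULL << 58) - 1)    /* stand-in width, not the kernel value */

/* Same relation the patch asserts with BUILD_BUG_ON(). */
static_assert((SWP_OFFSET_MASK | SWAP_ADDRESS_SPACE_MASK) == SWP_OFFSET_MASK,
              "space mask must be a subset of the swap offset mask");

/* The two spellings discussed in review are the same value. */
static_assert(SWAP_ADDRESS_SPACE_MASK == (1UL << SWAP_ADDRESS_SPACE_SHIFT) - 1,
              "PAGES - 1 and BIT(SHIFT) - 1 agree");

int main(void)
{
        unsigned long offset = 0x1234567;       /* hypothetical device-wide offset */

        /* Masking by a power-of-two space size is the same as a modulo. */
        assert((offset & SWAP_ADDRESS_SPACE_MASK) ==
               (offset % SWAP_ADDRESS_SPACE_PAGES));
        return 0;
}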