@@ -646,7 +646,7 @@ static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry)
dec_mm_counter(mm, mm_counter(page));
}
- free_swap_and_cache(entry);
+ free_swap_and_cache(entry, false);
}
void ptep_zap_unused(struct mm_struct *mm, unsigned long addr,
@@ -453,9 +453,9 @@ extern int add_swap_count_continuation(swp_entry_t, gfp_t);
extern void swap_shmem_alloc(swp_entry_t);
extern int swap_duplicate(swp_entry_t *entry, bool cluster);
extern int swapcache_prepare(swp_entry_t entry, bool cluster);
-extern void swap_free(swp_entry_t);
+extern void swap_free(swp_entry_t entry, bool cluster);
extern void swapcache_free_entries(swp_entry_t *entries, int n);
-extern int free_swap_and_cache(swp_entry_t);
+extern int free_swap_and_cache(swp_entry_t entry, bool cluster);
extern int swap_type_of(dev_t, sector_t, struct block_device **);
extern unsigned int count_swap_pages(int, int);
extern sector_t map_swap_page(struct page *, struct block_device **);
@@ -509,7 +509,8 @@ static inline void show_swap_cache_info(void)
{
}
-#define free_swap_and_cache(e) ({(is_migration_entry(e) || is_device_private_entry(e));})
+#define free_swap_and_cache(e, c) \
+ ({(is_migration_entry(e) || is_device_private_entry(e)); })
#define swapcache_prepare(e, c) \
({(is_migration_entry(e) || is_device_private_entry(e)); })
@@ -527,7 +528,7 @@ static inline int swap_duplicate(swp_entry_t *swp, bool cluster)
return 0;
}
-static inline void swap_free(swp_entry_t swp)
+static inline void swap_free(swp_entry_t swp, bool cluster)
{
}
@@ -182,7 +182,7 @@ sector_t alloc_swapdev_block(int swap)
offset = swp_offset(get_swap_page_of_type(swap));
if (offset) {
if (swsusp_extents_insert(offset))
- swap_free(swp_entry(swap, offset));
+ swap_free(swp_entry(swap, offset), false);
else
return swapdev_block(swap, offset);
}
@@ -206,7 +206,7 @@ void free_all_swap_pages(int swap)
ext = rb_entry(node, struct swsusp_extent, node);
rb_erase(node, &swsusp_extents);
for (offset = ext->start; offset <= ext->end; offset++)
- swap_free(swp_entry(swap, offset));
+ swap_free(swp_entry(swap, offset), false);
kfree(ext);
}
@@ -349,7 +349,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
if (non_swap_entry(entry))
continue;
nr_swap--;
- free_swap_and_cache(entry);
+ free_swap_and_cache(entry, false);
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
continue;
}
@@ -1376,7 +1376,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
page = migration_entry_to_page(entry);
rss[mm_counter(page)]--;
}
- if (unlikely(!free_swap_and_cache(entry)))
+ if (unlikely(!free_swap_and_cache(entry, false)))
print_bad_pte(vma, addr, ptent, NULL);
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
} while (pte++, addr += PAGE_SIZE, addr != end);
@@ -3047,7 +3047,7 @@ int do_swap_page(struct vm_fault *vmf)
}
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
- swap_free(entry);
+ swap_free(entry, false);
if (mem_cgroup_swap_full(page) ||
(vma->vm_flags & VM_LOCKED) || PageMlocked(page))
try_to_free_swap(page);
@@ -676,7 +676,7 @@ static int shmem_free_swap(struct address_space *mapping,
xa_unlock_irq(&mapping->i_pages);
if (old != radswap)
return -ENOENT;
- free_swap_and_cache(radix_to_swp_entry(radswap));
+ free_swap_and_cache(radix_to_swp_entry(radswap), false);
return 0;
}
@@ -1211,7 +1211,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info,
spin_lock_irq(&info->lock);
info->swapped--;
spin_unlock_irq(&info->lock);
- swap_free(swap);
+ swap_free(swap, false);
}
}
return error;
@@ -1750,7 +1750,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
delete_from_swap_cache(page);
set_page_dirty(page);
- swap_free(swap);
+ swap_free(swap, false);
} else {
if (vma && userfaultfd_missing(vma)) {
@@ -885,7 +885,7 @@ static int scan_swap_map_slots(struct swap_info_struct *si,
}
#ifdef CONFIG_THP_SWAP
-static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
+static int __swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
{
unsigned long idx;
struct swap_cluster_info *ci;
@@ -911,7 +911,7 @@ static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
return 1;
}
-static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
+static void __swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
{
unsigned long offset = idx * SWAPFILE_CLUSTER;
struct swap_cluster_info *ci;
@@ -924,11 +924,15 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
swap_range_free(si, offset, SWAPFILE_CLUSTER);
}
#else
-static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
+static int __swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
{
VM_WARN_ON_ONCE(1);
return 0;
}
+
+static void __swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
+{
+}
#endif /* CONFIG_THP_SWAP */
static unsigned long scan_swap_map(struct swap_info_struct *si,
@@ -996,7 +1000,7 @@ int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[])
}
if (cluster) {
if (!(si->flags & SWP_FILE))
- n_ret = swap_alloc_cluster(si, swp_entries);
+ n_ret = __swap_alloc_cluster(si, swp_entries);
} else
n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
n_goal, swp_entries);
@@ -1215,8 +1219,10 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
count = SWAP_MAP_MAX | COUNT_CONTINUED;
else
count = SWAP_MAP_MAX;
- } else
+ } else {
+ VM_BUG_ON(!count);
count--;
+ }
}
usage = count | has_cache;
@@ -1255,17 +1261,90 @@ static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry)
swap_range_free(p, offset, 1);
}
+#ifdef CONFIG_THP_SWAP
+/*
+ * Free, with usage 1, each of the SWAPFILE_CLUSTER swap entries of the
+ * cluster starting at @entry. @entry is expected to satisfy
+ * is_cluster_offset(); this is enforced with VM_BUG_ON().
+ *
+ * Returns 0 if all SWAPFILE_CLUSTER entries were freed, SWAP_HAS_CACHE
+ * if the cluster is still huge and __swap_entry_free_locked() returned
+ * SWAP_HAS_CACHE for every entry, and 1 otherwise.
+ */
+static unsigned char swap_free_cluster(struct swap_info_struct *si,
+ swp_entry_t entry)
+{
+ struct swap_cluster_info *ci;
+ unsigned long offset = swp_offset(entry);
+ unsigned int count, i, free_entries = 0, cache_only = 0;
+ unsigned char *map, ret = 1;
+
+ ci = lock_cluster(si, offset);
+ VM_BUG_ON(!is_cluster_offset(offset));
+ /* Cluster has been split, free each swap entry in the cluster */
+ if (!cluster_is_huge(ci)) {
+ unlock_cluster(ci);
+ for (i = 0; i < SWAPFILE_CLUSTER; i++, entry.val++) {
+ if (!__swap_entry_free(si, entry, 1)) {
+ free_entries++;
+ free_swap_slot(entry);
+ }
+ }
+ return !(free_entries == SWAPFILE_CLUSTER);
+ }
+ /*
+ * Cluster is still huge: decrement the cluster count.
+ * NOTE(review): the count presumably holds SWAPFILE_CLUSTER plus the
+ * number of huge references - confirm against cluster_swapcount().
+ */
+ count = cluster_count(ci) - 1;
+ VM_BUG_ON(count < SWAPFILE_CLUSTER);
+ cluster_set_count(ci, count);
+ map = si->swap_map + offset;
+ for (i = 0; i < SWAPFILE_CLUSTER; i++) {
+ /*
+ * An entry whose map value is exactly 1 is marked SWAP_MAP_BAD
+ * here and released below, after the counting pass.
+ */
+ if (map[i] == 1) {
+ map[i] = SWAP_MAP_BAD;
+ free_entries++;
+ } else if (__swap_entry_free_locked(si, ci, offset + i, 1) ==
+ SWAP_HAS_CACHE)
+ cache_only++;
+ }
+ VM_BUG_ON(free_entries && (count != SWAPFILE_CLUSTER ||
+ (map[0] & SWAP_HAS_CACHE)));
+ /* Whole cluster goes away: replace the SWAP_MAP_BAD marks in one go */
+ if (free_entries == SWAPFILE_CLUSTER)
+ memset(map, SWAP_HAS_CACHE, SWAPFILE_CLUSTER);
+ else if (!cluster_swapcount(ci) && !(map[0] & SWAP_HAS_CACHE))
+ cluster_clear_huge(ci);
+ unlock_cluster(ci);
+ if (free_entries == SWAPFILE_CLUSTER) {
+ spin_lock(&si->lock);
+ mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
+ __swap_free_cluster(si, offset / SWAPFILE_CLUSTER);
+ spin_unlock(&si->lock);
+ ret = 0;
+ } else if (free_entries) {
+ /*
+ * Only some entries were marked: release them one at a time,
+ * dropping the cluster lock around each free_swap_slot() call.
+ */
+ ci = lock_cluster(si, offset);
+ for (i = 0; i < SWAPFILE_CLUSTER; i++, entry.val++) {
+ if (map[i] == SWAP_MAP_BAD) {
+ map[i] = SWAP_HAS_CACHE;
+ unlock_cluster(ci);
+ free_swap_slot(entry);
+ ci = lock_cluster(si, offset);
+ }
+ }
+ unlock_cluster(ci);
+ } else if (cache_only == SWAPFILE_CLUSTER)
+ ret = SWAP_HAS_CACHE;
+
+ return ret;
+}
+#else
+/*
+ * Stub for !CONFIG_THP_SWAP: does nothing and returns 0.
+ * NOTE(review): free_swap_and_cache() uses this return value as its
+ * count when cluster is true, without a thp_swap_supported() check;
+ * confirm that no caller passes cluster == true in this configuration.
+ */
+static inline unsigned char swap_free_cluster(struct swap_info_struct *si,
+ swp_entry_t entry)
+{
+ return 0;
+}
+#endif
+
/*
* Caller has made sure that the swap device corresponding to entry
* is still around or has not been recycled.
*/
-void swap_free(swp_entry_t entry)
+void swap_free(swp_entry_t entry, bool cluster)
{
struct swap_info_struct *p;
p = _swap_info_get(entry);
if (p) {
- if (!__swap_entry_free(p, entry, 1))
+ /*
+ * With @cluster set and THP swap supported, the entry is handed
+ * to swap_free_cluster() and its return value is ignored.
+ * Otherwise only @entry itself goes through __swap_entry_free().
+ */
+ if (thp_swap_supported() && cluster)
+ swap_free_cluster(p, entry);
+ else if (!__swap_entry_free(p, entry, 1))
free_swap_slot(entry);
}
}
@@ -1326,7 +1405,7 @@ static void swapcache_free_cluster(swp_entry_t entry)
if (free_entries == SWAPFILE_CLUSTER) {
spin_lock(&si->lock);
mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
- swap_free_cluster(si, idx);
+ __swap_free_cluster(si, idx);
spin_unlock(&si->lock);
} else if (free_entries) {
for (i = 0; i < SWAPFILE_CLUSTER; i++, entry.val++) {
@@ -1730,7 +1809,7 @@ int try_to_free_swap(struct page *page)
* Free the swap entry like above, but also try to
* free the page cache entry if it is the last user.
*/
-int free_swap_and_cache(swp_entry_t entry)
+int free_swap_and_cache(swp_entry_t entry, bool cluster)
{
struct swap_info_struct *p;
struct page *page = NULL;
@@ -1741,7 +1820,8 @@ int free_swap_and_cache(swp_entry_t entry)
p = _swap_info_get(entry);
if (p) {
- count = __swap_entry_free(p, entry, 1);
+ count = cluster ? swap_free_cluster(p, entry) :
+ __swap_entry_free(p, entry, 1);
if (count == SWAP_HAS_CACHE &&
!swap_page_trans_huge_swapped(p, entry)) {
page = find_get_page(swap_address_space(entry),
@@ -1750,7 +1830,7 @@ int free_swap_and_cache(swp_entry_t entry)
put_page(page);
page = NULL;
}
- } else if (!count)
+ } else if (!count && !cluster)
free_swap_slot(entry);
}
if (page) {
@@ -1914,7 +1994,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
}
set_pte_at(vma->vm_mm, addr, pte,
pte_mkold(mk_pte(page, vma->vm_page_prot)));
- swap_free(entry);
+ swap_free(entry, false);
/*
* Move the page to the active list so it is not
* immediately swapped out again after swapon.
@@ -2353,6 +2433,16 @@ int try_to_unuse(unsigned int type, bool frontswap,
}
mmput(start_mm);
+
+ /*
+ * swap_free_cluster() may temporarily mark swap entries as
+ * SWAP_MAP_BAD before they are actually freed.
+ * find_next_to_unuse() skips such entries, which is fine, but we
+ * must still wait here until they have really been freed.
+ */
+ while (!retval && READ_ONCE(si->inuse_pages))
+ schedule_timeout_uninterruptible(1);
+
return retval;
}