@@ -79,6 +79,11 @@ extern struct kobj_attribute shmem_enabled_attr;
#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
#define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
+/*
+ * thp_swap_supported - tell whether THP swap support is configured.
+ *
+ * Returns the value of IS_ENABLED(CONFIG_THP_SWAP); no other state is
+ * consulted.
+ */
+static inline bool thp_swap_supported(void)
+{
+ return IS_ENABLED(CONFIG_THP_SWAP);
+}
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define HPAGE_PMD_SHIFT PMD_SHIFT
#define HPAGE_PMD_SIZE ((1UL) << HPAGE_PMD_SHIFT)
@@ -451,8 +451,8 @@ extern swp_entry_t get_swap_page_of_type(int);
extern int get_swap_pages(int n, bool cluster, swp_entry_t swp_entries[]);
extern int add_swap_count_continuation(swp_entry_t, gfp_t);
extern void swap_shmem_alloc(swp_entry_t);
-extern int swap_duplicate(swp_entry_t);
-extern int swapcache_prepare(swp_entry_t);
+extern int swap_duplicate(swp_entry_t *entry, bool cluster);
+extern int swapcache_prepare(swp_entry_t entry, bool cluster);
extern void swap_free(swp_entry_t);
extern void swapcache_free_entries(swp_entry_t *entries, int n);
extern int free_swap_and_cache(swp_entry_t);
@@ -510,7 +510,8 @@ static inline void show_swap_cache_info(void)
}
#define free_swap_and_cache(e) ({(is_migration_entry(e) || is_device_private_entry(e));})
-#define swapcache_prepare(e) ({(is_migration_entry(e) || is_device_private_entry(e));})
+/*
+ * Macro form of swapcache_prepare(): @c (the cluster flag) is accepted so
+ * that call sites need no change, but it is never evaluated.  @e is expanded
+ * twice.
+ */
+#define swapcache_prepare(e, c) \
+ ({(is_migration_entry(e) || is_device_private_entry(e)); })
static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask)
{
@@ -521,7 +522,7 @@ static inline void swap_shmem_alloc(swp_entry_t swp)
{
}
-static inline int swap_duplicate(swp_entry_t swp)
+/*
+ * Inline stub of swap_duplicate(): always reports success (0).  *@swp is
+ * neither read nor updated and @cluster is ignored.
+ */
+static inline int swap_duplicate(swp_entry_t *swp, bool cluster)
{
return 0;
}
@@ -951,7 +951,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
swp_entry_t entry = pte_to_swp_entry(pte);
if (likely(!non_swap_entry(entry))) {
- if (swap_duplicate(entry) < 0)
+ if (swap_duplicate(&entry, false) < 0)
return entry.val;
/* make sure dst_mm is on swapoff's mmlist. */
@@ -1556,7 +1556,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
break;
}
- if (swap_duplicate(entry) < 0) {
+ if (swap_duplicate(&entry, false) < 0) {
set_pte_at(mm, address, pvmw.pte, pteval);
ret = false;
page_vma_mapped_walk_done(&pvmw);
@@ -49,6 +49,9 @@ static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
unsigned char);
static void free_swap_count_continuations(struct swap_info_struct *);
static sector_t map_swap_entry(swp_entry_t, struct block_device**);
+static int add_swap_count_continuation_locked(struct swap_info_struct *si,
+ unsigned long offset,
+ struct page *page);
DEFINE_SPINLOCK(swap_lock);
static unsigned int nr_swapfiles;
@@ -319,6 +322,11 @@ static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si,
spin_unlock(&si->lock);
}
+/*
+ * Tell whether @offset sits on a cluster boundary, i.e. whether it is an
+ * exact multiple of SWAPFILE_CLUSTER.
+ */
+static inline bool is_cluster_offset(unsigned long offset)
+{
+ return (offset % SWAPFILE_CLUSTER) == 0;
+}
+
static inline bool cluster_list_empty(struct swap_cluster_list *list)
{
return cluster_is_null(&list->head);
@@ -1166,16 +1174,14 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry)
return NULL;
}
-static unsigned char __swap_entry_free(struct swap_info_struct *p,
- swp_entry_t entry, unsigned char usage)
+static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
+ struct swap_cluster_info *ci,
+ unsigned long offset,
+ unsigned char usage)
{
- struct swap_cluster_info *ci;
- unsigned long offset = swp_offset(entry);
unsigned char count;
unsigned char has_cache;
- ci = lock_cluster_or_swap_info(p, offset);
-
count = p->swap_map[offset];
has_cache = count & SWAP_HAS_CACHE;
@@ -1203,6 +1209,17 @@ static unsigned char __swap_entry_free(struct swap_info_struct *p,
usage = count | has_cache;
p->swap_map[offset] = usage ? : SWAP_HAS_CACHE;
+ return usage;
+}
+
+static unsigned char __swap_entry_free(struct swap_info_struct *p,
+ swp_entry_t entry, unsigned char usage)
+{
+ struct swap_cluster_info *ci;
+ unsigned long offset = swp_offset(entry);
+
+ ci = lock_cluster_or_swap_info(p, offset);
+ usage = __swap_entry_free_locked(p, ci, offset, usage);
unlock_cluster_or_swap_info(p, ci);
return usage;
@@ -3449,32 +3466,12 @@ void si_swapinfo(struct sysinfo *val)
spin_unlock(&swap_lock);
}
-/*
- * Verify that a swap entry is valid and increment its swap map count.
- *
- * Returns error code in following case.
- * - success -> 0
- * - swp_entry is invalid -> EINVAL
- * - swp_entry is migration entry -> EINVAL
- * - swap-cache reference is requested but there is already one. -> EEXIST
- * - swap-cache reference is requested but the entry is not used. -> ENOENT
- * - swap-mapped reference requested but needs continued swap count. -> ENOMEM
- */
-static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
+static int __swap_duplicate_locked(struct swap_info_struct *p,
+ unsigned long offset, unsigned char usage)
{
- struct swap_info_struct *p;
- struct swap_cluster_info *ci;
- unsigned long offset;
unsigned char count;
unsigned char has_cache;
- int err = -EINVAL;
-
- p = get_swap_device(entry);
- if (!p)
- goto out;
-
- offset = swp_offset(entry);
- ci = lock_cluster_or_swap_info(p, offset);
+ int err = 0;
count = p->swap_map[offset];
@@ -3484,12 +3481,11 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
*/
if (unlikely(swap_count(count) == SWAP_MAP_BAD)) {
err = -ENOENT;
- goto unlock_out;
+ goto out;
}
has_cache = count & SWAP_HAS_CACHE;
count &= ~SWAP_HAS_CACHE;
- err = 0;
if (usage == SWAP_HAS_CACHE) {
@@ -3516,11 +3512,39 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
p->swap_map[offset] = count | has_cache;
-unlock_out:
+out:
+ return err;
+}
+
+/*
+ * Verify that a swap entry is valid and increment its swap map count.
+ *
+ * This wrapper pins the swap device with get_swap_device(), takes the
+ * cluster (or swap_info) lock for the entry's offset and lets
+ * __swap_duplicate_locked() do the actual swap_map update.
+ *
+ * Returns error code in following case.
+ * - success -> 0
+ * - swp_entry is invalid -> EINVAL
+ * - swp_entry is migration entry -> EINVAL
+ * - swap-cache reference is requested but there is already one. -> EEXIST
+ * - swap-cache reference is requested but the entry is not used. -> ENOENT
+ * - swap-mapped reference requested but needs continued swap count. -> ENOMEM
+ */
+static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
+{
+ struct swap_info_struct *p;
+ struct swap_cluster_info *ci;
+ unsigned long offset;
+ int err = -EINVAL;
+
+ /* No device for this entry: report -EINVAL without taking any lock. */
+ p = get_swap_device(entry);
+ if (!p)
+ goto out;
+
+ offset = swp_offset(entry);
+ ci = lock_cluster_or_swap_info(p, offset);
+ err = __swap_duplicate_locked(p, offset, usage);
unlock_cluster_or_swap_info(p, ci);
+
+ /* Pairs with the successful get_swap_device() above. */
+ put_swap_device(p);
out:
- if (p)
- put_swap_device(p);
return err;
}
@@ -3533,6 +3557,81 @@ void swap_shmem_alloc(swp_entry_t entry)
__swap_duplicate(entry, SWAP_MAP_SHMEM);
}
+#ifdef CONFIG_THP_SWAP
+/*
+ * Apply @usage to every swap_map slot of the huge cluster that starts at
+ * *@entry, while holding that cluster's lock.
+ *
+ * - @usage == SWAP_HAS_CACHE: set SWAP_HAS_CACHE on all SWAPFILE_CLUSTER
+ *   slots.
+ * - otherwise: take one more map reference on each slot, then raise the
+ *   cluster count by one.
+ *
+ * Returns 0 on success, or
+ * - -EINVAL if get_swap_device() fails for *@entry,
+ * - -ENOENT if the cluster is free,
+ * - -ENOTDIR if the cluster is not a huge cluster,
+ * - -EEXIST if SWAP_HAS_CACHE is already set on the first slot,
+ * - the error from __swap_duplicate_locked() or
+ *   add_swap_count_continuation_locked(); references already taken on
+ *   earlier slots are released again first.  When adding the continuation
+ *   fails, *@entry is rewritten to the slot that needs it.
+ */
+static int __swap_duplicate_cluster(swp_entry_t *entry, unsigned char usage)
+{
+ struct swap_info_struct *si;
+ struct swap_cluster_info *ci;
+ unsigned long offset;
+ unsigned char *map;
+ int i, err = 0;
+
+ si = get_swap_device(*entry);
+ if (!si) {
+ err = -EINVAL;
+ goto out;
+ }
+ offset = swp_offset(*entry);
+ ci = lock_cluster(si, offset);
+ if (cluster_is_free(ci)) {
+ err = -ENOENT;
+ goto unlock;
+ }
+ if (!cluster_is_huge(ci)) {
+ err = -ENOTDIR;
+ goto unlock;
+ }
+ /* A huge cluster is expected to be addressed by its first, fully used slot. */
+ VM_BUG_ON(!is_cluster_offset(offset));
+ VM_BUG_ON(cluster_count(ci) < SWAPFILE_CLUSTER);
+ map = si->swap_map + offset;
+ if (usage == SWAP_HAS_CACHE) {
+ /* The first slot stands in for the whole cluster in this check. */
+ if (map[0] & SWAP_HAS_CACHE) {
+ err = -EEXIST;
+ goto unlock;
+ }
+ for (i = 0; i < SWAPFILE_CLUSTER; i++) {
+ VM_BUG_ON(map[i] & SWAP_HAS_CACHE);
+ map[i] |= SWAP_HAS_CACHE;
+ }
+ } else {
+ /* NOTE(review): this branch duplicates with a literal 1, not @usage. */
+ for (i = 0; i < SWAPFILE_CLUSTER; i++) {
+retry:
+ err = __swap_duplicate_locked(si, offset + i, 1);
+ if (err == -ENOMEM) {
+ struct page *page;
+
+ /*
+  * The slot needs a continuation page.  GFP_ATOMIC is
+  * presumably required because the cluster lock is held.
+  */
+ page = alloc_page(GFP_ATOMIC | __GFP_HIGHMEM);
+ err = add_swap_count_continuation_locked(
+ si, offset + i, page);
+ if (err) {
+ /* Report the slot that still needs a continuation. */
+ *entry = swp_entry(si->type, offset+i);
+ goto undup;
+ }
+ /* Continuation in place: try the same slot again. */
+ goto retry;
+ } else if (err)
+ goto undup;
+ }
+ cluster_set_count(ci, cluster_count(ci) + 1);
+ }
+unlock:
+ unlock_cluster(ci);
+ put_swap_device(si);
+out:
+ return err;
+undup:
+ /* Slot i failed: drop the references taken on slots [0, i). */
+ for (i--; i >= 0; i--)
+ __swap_entry_free_locked(
+ si, ci, offset + i, 1);
+ goto unlock;
+}
+#else
+/*
+ * Variant used when CONFIG_THP_SWAP is not defined: does nothing and reports
+ * success.  The callers visible in this file only take the cluster path when
+ * thp_swap_supported() is true.
+ */
+static inline int __swap_duplicate_cluster(swp_entry_t *entry,
+ unsigned char usage)
+{
+ return 0;
+}
+#endif
+
/*
* Increase reference count of swap entry by 1.
* Returns 0 for success, or -ENOMEM if a swap_count_continuation is required
@@ -3540,12 +3639,15 @@ void swap_shmem_alloc(swp_entry_t entry)
* if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which
* might occur if a page table entry has got corrupted.
*/
-int swap_duplicate(swp_entry_t entry)
+int swap_duplicate(swp_entry_t *entry, bool cluster)
{
int err = 0;
- while (!err && __swap_duplicate(entry, 1) == -ENOMEM)
- err = add_swap_count_continuation(entry, GFP_ATOMIC);
+ /*
+  * Cluster path: the result of __swap_duplicate_cluster() is returned
+  * unfiltered, and *entry may be rewritten by it to name the slot that
+  * needs a swap count continuation.
+  */
+ if (thp_swap_supported() && cluster)
+ return __swap_duplicate_cluster(entry, 1);
+
+ /*
+  * Single-entry path: only a failure of add_swap_count_continuation()
+  * is reported; any other __swap_duplicate() error ends the loop with
+  * err still 0.
+  */
+ while (!err && __swap_duplicate(*entry, 1) == -ENOMEM)
+ err = add_swap_count_continuation(*entry, GFP_ATOMIC);
return err;
}
@@ -3557,9 +3659,12 @@ int swap_duplicate(swp_entry_t entry)
- * -EBUSY means there is a swap cache.
+ * -EEXIST means there is a swap cache.
* Note: return code is different from swap_duplicate().
*/
-int swapcache_prepare(swp_entry_t entry)
+int swapcache_prepare(swp_entry_t entry, bool cluster)
{
- return __swap_duplicate(entry, SWAP_HAS_CACHE);
+ /* Anything but a THP-swap cluster request is prepared entry by entry. */
+ if (!thp_swap_supported() || !cluster)
+ return __swap_duplicate(entry, SWAP_HAS_CACHE);
+
+ return __swap_duplicate_cluster(&entry, SWAP_HAS_CACHE);
}
struct swap_info_struct *swp_swap_info(swp_entry_t entry)
@@ -3589,51 +3694,13 @@ pgoff_t __page_file_index(struct page *page)
}
EXPORT_SYMBOL_GPL(__page_file_index);
-/*
- * add_swap_count_continuation - called when a swap count is duplicated
- * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
- * page of the original vmalloc'ed swap_map, to hold the continuation count
- * (for that entry and for its neighbouring PAGE_SIZE swap entries). Called
- * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc.
- *
- * These continuation pages are seldom referenced: the common paths all work
- * on the original swap_map, only referring to a continuation page when the
- * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX.
- *
- * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding
- * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL)
- * can be called after dropping locks.
- */
-int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
+/*
+ * add_swap_count_continuation_locked - attach @page as a swap count
+ * continuation page for the swap_map slot at @offset, if one is needed.
+ *
+ * The callers in this file hold the cluster lock covering @offset.  The
+ * caller allocates @page beforehand (NULL if that allocation failed) and
+ * gives up ownership by calling here: on every path a non-NULL @page is
+ * either linked into the continuation list or freed, so the caller must
+ * not free it.
+ *
+ * Returns 0 when @page was attached or turned out not to be needed, and
+ * -ENOMEM when a continuation page is needed but @page is NULL.
+ */
+static int add_swap_count_continuation_locked(struct swap_info_struct *si,
+ unsigned long offset,
+ struct page *page)
{
- struct swap_info_struct *si;
- struct swap_cluster_info *ci;
struct page *head;
- struct page *page;
struct page *list_page;
- pgoff_t offset;
unsigned char count;
- int ret = 0;
-
- /*
- * When debugging, it's easier to use __GFP_ZERO here; but it's better
- * for latency not to zero a page while GFP_ATOMIC and holding locks.
- */
- page = alloc_page(gfp_mask | __GFP_HIGHMEM);
-
- si = get_swap_device(entry);
- if (!si) {
- /*
- * An acceptable race has occurred since the failing
- * __swap_duplicate(): the swap device may be swapoff
- */
- goto outer;
- }
- spin_lock(&si->lock);
-
- offset = swp_offset(entry);
-
- ci = lock_cluster(si, offset);
count = si->swap_map[offset] & ~SWAP_HAS_CACHE;
@@ -3643,13 +3710,11 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
* will race to add swap count continuation: we need to avoid
* over-provisioning.
*/
goto out;
}
- if (!page) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!page)
+ return -ENOMEM;
/*
* We are fortunate that although vmalloc_to_page uses pte_offset_map,
@@ -3697,7 +3762,57 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
page = NULL; /* now it's attached, don't free it */
out_unlock_cont:
spin_unlock(&si->cont_lock);
out:
+ /* Still non-NULL here means the page was not needed: give it back. */
+ if (page)
+ __free_page(page);
+ return 0;
+}
+
+/*
+ * add_swap_count_continuation - called when a swap count is duplicated
+ * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
+ * page of the original vmalloc'ed swap_map, to hold the continuation count
+ * (for that entry and for its neighbouring PAGE_SIZE swap entries). Called
+ * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc.
+ *
+ * These continuation pages are seldom referenced: the common paths all work
+ * on the original swap_map, only referring to a continuation page when the
+ * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX.
+ *
+ * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding
+ * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL)
+ * can be called after dropping locks.
+ */
+int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
+{
+ struct swap_info_struct *si;
+ struct swap_cluster_info *ci;
+ struct page *page;
+ unsigned long offset;
+ int ret = 0;
+
+ /*
+ * When debugging, it's easier to use __GFP_ZERO here; but it's better
+ * for latency not to zero a page while GFP_ATOMIC and holding locks.
+ */
+ page = alloc_page(gfp_mask | __GFP_HIGHMEM);
+
+ si = get_swap_device(entry);
+ if (!si) {
+ /*
+ * An acceptable race has occurred since the failing
+ * __swap_duplicate(): the swap device may be swapoff
+ */
+ goto outer;
+ }
+ spin_lock(&si->lock);
+
+ offset = swp_offset(entry);
+
+ ci = lock_cluster(si, offset);
+
+ ret = add_swap_count_continuation_locked(si, offset, page);
+ page = NULL;
+
unlock_cluster(ci);
spin_unlock(&si->lock);
put_swap_device(si);