@@ -40,6 +40,8 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping,
struct page *newpage, struct page *page);
extern int migrate_page_move_mapping(struct address_space *mapping,
struct page *newpage, struct page *page, int extra_count);
+void migration_entry_wait_on_locked(swp_entry_t entry, pte_t *ptep,
+ spinlock_t *ptl);
void folio_migrate_flags(struct folio *newfolio, struct folio *folio);
void folio_migrate_copy(struct folio *newfolio, struct folio *folio);
int folio_migrate_mapping(struct address_space *mapping,
@@ -21,6 +21,7 @@
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
+#include <linux/swapops.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/file.h>
@@ -41,6 +42,7 @@
#include <linux/psi.h>
#include <linux/ramfs.h>
#include <linux/page_idle.h>
+#include <linux/migrate.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "internal.h"
@@ -1426,6 +1428,95 @@ static inline int folio_wait_bit_common(struct folio *folio, int bit_nr,
return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR;
}
+#ifdef CONFIG_MIGRATION
+/**
+ * migration_entry_wait_on_locked - Wait for a migration entry to be removed
+ * @entry: migration swap entry.
+ * @ptep: mapped pte pointer. Will return with the ptep unmapped. Only required
+ * for pte entries, pass NULL for pmd entries.
+ * @ptl: already locked ptl. This function will drop the lock.
+ *
+ * Wait for a migration entry referencing the given page to be removed. This is
+ * equivalent to put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE) except
+ * this can be called without taking a reference on the page. Instead this
+ * should be called while holding the ptl for the migration entry referencing
+ * the page.
+ *
+ * Returns after unmapping and unlocking the pte/ptl with pte_unmap_unlock().
+ *
+ * This follows the same logic as folio_wait_bit_common() so see the comments
+ * there.
+ */
+void migration_entry_wait_on_locked(swp_entry_t entry, pte_t *ptep,
+ spinlock_t *ptl)
+{
+ struct wait_page_queue wait_page;
+ wait_queue_entry_t *wait = &wait_page.wait;
+ bool thrashing = false;
+ bool delayacct = false;
+ unsigned long pflags;
+ wait_queue_head_t *q;
+ struct folio *folio = page_folio(pfn_swap_entry_to_page(entry));
+
+ q = folio_waitqueue(folio);
+ if (!folio_test_uptodate(folio) && folio_test_workingset(folio)) {
+ if (!folio_test_swapbacked(folio)) {
+ delayacct_thrashing_start();
+ delayacct = true;
+ }
+ psi_memstall_enter(&pflags);
+ thrashing = true;
+ }
+
+ init_wait(wait);
+ wait->func = wake_page_function;
+ wait_page.folio = folio;
+ wait_page.bit_nr = PG_locked;
+ wait->flags = 0;
+
+ spin_lock_irq(&q->lock);
+ folio_set_waiters(folio);
+ if (!folio_trylock_flag(folio, PG_locked, wait))
+ __add_wait_queue_entry_tail(q, wait);
+ spin_unlock_irq(&q->lock);
+
+ /*
+ * If a migration entry exists for the page the migration path must hold
+ * a valid reference to the page, and it must take the ptl to remove the
+ * migration entry. So the page is valid until the ptl is dropped.
+ */
+ if (ptep)
+ pte_unmap_unlock(ptep, ptl);
+ else
+ spin_unlock(ptl);
+
+ for (;;) {
+ unsigned int flags;
+
+ set_current_state(TASK_UNINTERRUPTIBLE);
+
+ /* Loop until we've been woken or interrupted */
+ flags = smp_load_acquire(&wait->flags);
+ if (!(flags & WQ_FLAG_WOKEN)) {
+ if (signal_pending_state(TASK_UNINTERRUPTIBLE, current))
+ break;
+
+ io_schedule();
+ continue;
+ }
+ break;
+ }
+
+ finish_wait(q, wait);
+
+ if (thrashing) {
+ if (delayacct)
+ delayacct_thrashing_end();
+ psi_memstall_leave(&pflags);
+ }
+}
+#endif
+
void folio_wait_bit(struct folio *folio, int bit_nr)
{
folio_wait_bit_common(folio, bit_nr, TASK_UNINTERRUPTIBLE, SHARED);
@@ -291,7 +291,6 @@ void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
{
pte_t pte;
swp_entry_t entry;
- struct page *page;
spin_lock(ptl);
pte = *ptep;
@@ -302,18 +301,7 @@ void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
if (!is_migration_entry(entry))
goto out;
- page = pfn_swap_entry_to_page(entry);
- page = compound_head(page);
-
- /*
- * Once page cache replacement of page migration started, page_count
- * is zero; but we must not call put_and_wait_on_page_locked() without
- * a ref. Use get_page_unless_zero(), and just fault again if it fails.
- */
- if (!get_page_unless_zero(page))
- goto out;
- pte_unmap_unlock(ptep, ptl);
- put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
+ migration_entry_wait_on_locked(entry, ptep, ptl);
return;
out:
pte_unmap_unlock(ptep, ptl);
@@ -338,16 +326,11 @@ void migration_entry_wait_huge(struct vm_area_struct *vma,
void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
{
spinlock_t *ptl;
- struct page *page;
ptl = pmd_lock(mm, pmd);
if (!is_pmd_migration_entry(*pmd))
goto unlock;
- page = pfn_swap_entry_to_page(pmd_to_swp_entry(*pmd));
- if (!get_page_unless_zero(page))
- goto unlock;
- spin_unlock(ptl);
- put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
+ migration_entry_wait_on_locked(pmd_to_swp_entry(*pmd), NULL, ptl);
return;
unlock:
spin_unlock(ptl);
@@ -2484,22 +2467,8 @@ static bool migrate_vma_check_page(struct page *page)
return false;
/* Page from ZONE_DEVICE have one extra reference */
- if (is_zone_device_page(page)) {
- /*
- * Private page can never be pin as they have no valid pte and
- * GUP will fail for those. Yet if there is a pending migration
- * a thread might try to wait on the pte migration entry and
- * will bump the page reference count. Sadly there is no way to
- * differentiate a regular pin from migration wait. Hence to
- * avoid 2 racing thread trying to migrate back to CPU to enter
- * infinite loop (one stopping migration because the other is
- * waiting on pte migration entry). We always return true here.
- *
- * FIXME proper solution is to rework migration_entry_wait() so
- * it does not need to take a reference on page.
- */
- return is_device_private_page(page);
- }
+ if (is_zone_device_page(page))
+ extra++;
/* For file back page */
if (page_mapping(page))