@@ -38,7 +38,8 @@
EM( SCAN_TRUNCATED, "truncated") \
EM( SCAN_PAGE_HAS_PRIVATE, "page_has_private") \
EM( SCAN_STORE_FAILED, "store_failed") \
- EMe(SCAN_COPY_MC, "copy_poisoned_page")
+ EM( SCAN_COPY_MC, "copy_poisoned_page") \
+ EMe(SCAN_PAGE_FILLED, "page_filled") \
#undef EM
#undef EMe
@@ -57,6 +57,7 @@ enum scan_result {
SCAN_PAGE_HAS_PRIVATE,
SCAN_COPY_MC,
SCAN_STORE_FAILED,
+ SCAN_PAGE_FILLED,
};
#define CREATE_TRACE_POINTS
@@ -1856,8 +1857,8 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
* - allocate and lock a new huge page;
* - scan page cache replacing old pages with the new one
* + swap/gup in pages if necessary;
- * + fill in gaps;
* + keep old pages around in case rollback is required;
+ * - finalize updates to the page cache;
* - if replacing succeeds:
* + copy data over;
* + free old pages;
@@ -1935,22 +1936,12 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
result = SCAN_TRUNCATED;
goto xa_locked;
}
- xas_set(&xas, index);
+ xas_set(&xas, index + 1);
}
if (!shmem_charge(mapping->host, 1)) {
result = SCAN_FAIL;
goto xa_locked;
}
- xas_store(&xas, hpage);
- if (xas_error(&xas)) {
- /* revert shmem_charge performed
- * in the previous condition
- */
- mapping->nrpages--;
- shmem_uncharge(mapping->host, 1);
- result = SCAN_STORE_FAILED;
- goto xa_locked;
- }
nr_none++;
continue;
}
@@ -2161,22 +2152,66 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
index++;
}
- /*
- * Copying old pages to huge one has succeeded, now we
- * need to free the old pages.
- */
- list_for_each_entry_safe(page, tmp, &pagelist, lru) {
- list_del(&page->lru);
- page->mapping = NULL;
- page_ref_unfreeze(page, 1);
- ClearPageActive(page);
- ClearPageUnevictable(page);
- unlock_page(page);
- put_page(page);
+ if (nr_none) {
+ struct vm_area_struct *vma;
+ int nr_none_check = 0;
+
+ i_mmap_lock_read(mapping);
+ xas_lock_irq(&xas);
+
+ xas_set(&xas, start);
+ for (index = start; index < end; index++) {
+ if (!xas_next(&xas)) {
+ xas_store(&xas, XA_RETRY_ENTRY);
+ if (xas_error(&xas)) {
+ result = SCAN_STORE_FAILED;
+ goto immap_locked;
+ }
+ nr_none_check++;
+ }
+ }
+
+ if (nr_none != nr_none_check) {
+ result = SCAN_PAGE_FILLED;
+ goto immap_locked;
+ }
+
+ /*
+ * If userspace observed a missing page in a VMA with a MODE_MISSING
+ * userfaultfd, then it might expect a UFFD_EVENT_PAGEFAULT for that
+ * page. If so, we need to roll back to avoid suppressing such an
+ * event. Since wp/minor userfaultfds don't give userspace any
+ * guarantees that the kernel doesn't fill a missing page with a zero
+ * page, so they don't matter here.
+ *
+ * Any userfaultfds registered after this point will not be able to
+ * observe any missing pages due to the previously inserted retry
+ * entries.
+ */
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, start, end) {
+ if (userfaultfd_missing(vma)) {
+ result = SCAN_EXCEED_NONE_PTE;
+ goto immap_locked;
+ }
+ }
+
+immap_locked:
+ i_mmap_unlock_read(mapping);
+ if (result != SCAN_SUCCEED) {
+ xas_set(&xas, start);
+ for (index = start; index < end; index++) {
+ if (xas_next(&xas) == XA_RETRY_ENTRY)
+ xas_store(&xas, NULL);
+ }
+
+ xas_unlock_irq(&xas);
+ goto rollback;
+ }
+ } else {
+ xas_lock_irq(&xas);
}
nr = thp_nr_pages(hpage);
- xas_lock_irq(&xas);
if (is_shmem)
__mod_lruvec_page_state(hpage, NR_SHMEM_THPS, nr);
else
@@ -2206,6 +2241,20 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
result = retract_page_tables(mapping, start, mm, addr, hpage,
cc);
unlock_page(hpage);
+
+ /*
+ * The collapse has succeeded, so free the old pages.
+ */
+ list_for_each_entry_safe(page, tmp, &pagelist, lru) {
+ list_del(&page->lru);
+ page->mapping = NULL;
+ page_ref_unfreeze(page, 1);
+ ClearPageActive(page);
+ ClearPageUnevictable(page);
+ unlock_page(page);
+ put_page(page);
+ }
+
goto out;
rollback:
@@ -2217,15 +2266,13 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
}
xas_set(&xas, start);
- xas_for_each(&xas, page, end - 1) {
+ end = index;
+ for (index = start; index < end; index++) {
+ xas_next(&xas);
page = list_first_entry_or_null(&pagelist,
struct page, lru);
if (!page || xas.xa_index < page->index) {
- if (!nr_none)
- break;
nr_none--;
- /* Put holes back where they were */
- xas_store(&xas, NULL);
continue;
}
@@ -2749,12 +2796,14 @@ static int madvise_collapse_errno(enum scan_result r)
case SCAN_ALLOC_HUGE_PAGE_FAIL:
return -ENOMEM;
case SCAN_CGROUP_CHARGE_FAIL:
+ case SCAN_EXCEED_NONE_PTE:
return -EBUSY;
/* Resource temporary unavailable - trying again might succeed */
case SCAN_PAGE_COUNT:
case SCAN_PAGE_LOCK:
case SCAN_PAGE_LRU:
case SCAN_DEL_PAGE_LRU:
+ case SCAN_PAGE_FILLED:
return -EAGAIN;
/*
* Other: Trying again likely not to succeed / error intrinsic to