@@ -583,6 +583,7 @@ struct huge_bootmem_page {
struct hstate *hstate;
};
+int isolate_or_dissolve_huge_page(struct page *page);
struct page *alloc_huge_page(struct vm_area_struct *vma,
unsigned long addr, int avoid_reserve);
struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
@@ -865,6 +866,11 @@ static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
#else /* CONFIG_HUGETLB_PAGE */
struct hstate {};
+static inline int isolate_or_dissolve_huge_page(struct page *page)
+{
+ return -ENOMEM;
+}
+
static inline struct page *alloc_huge_page(struct vm_area_struct *vma,
unsigned long addr,
int avoid_reserve)
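
A side note on the !CONFIG_HUGETLB_PAGE stub above: returning -ENOMEM lets callers such as the compaction code below invoke isolate_or_dissolve_huge_page() unconditionally and bail out right away, with no #ifdef at the call site. The following stand-alone sketch shows the same pattern in user space; CONFIG_FEATURE, feature_op() and the caller are invented for illustration and are not kernel code.

/*
 * Stand-alone sketch, not kernel code: a static-inline stub keeps the
 * call site free of #ifdefs. All names here are made up.
 */
#include <errno.h>
#include <stdio.h>

#ifdef CONFIG_FEATURE
int feature_op(int arg);                        /* real implementation elsewhere */
#else
static inline int feature_op(int arg)
{
        (void)arg;
        return -ENOMEM;                         /* "cannot be done": callers bail out */
}
#endif

int main(void)
{
        int ret = feature_op(42);

        if (ret == -ENOMEM)
                printf("not supported or no memory: abort right away\n");
        else if (ret)
                printf("transient failure (%d): caller may retry\n", ret);
        else
                printf("ok\n");
        return 0;
}

If the stub returned a transient error such as -EBUSY instead, a caller that retries on transient errors could spin forever on kernels built without hugetlb support.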
@@ -788,7 +788,7 @@ static bool too_many_isolated(pg_data_t *pgdat)
* Isolate all pages that can be migrated from the range specified by
* [low_pfn, end_pfn). The range is expected to be within same pageblock.
* Returns errno, like -EAGAIN or -EINTR in case e.g signal pending or congestion,
- * or 0.
+ * -ENOMEM in case we could not allocate a page, or 0.
* cc->migrate_pfn will contain the next pfn to scan (which may be both less,
* equal to or more that end_pfn).
*
@@ -809,6 +809,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
bool skip_on_failure = false;
unsigned long next_skip_pfn = 0;
bool skip_updated = false;
+ bool fatal_error = false;
+ int ret = 0;
cc->migrate_pfn = low_pfn;
@@ -906,6 +908,32 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
valid_page = page;
}
+ if (PageHuge(page) && cc->alloc_contig) {
+ ret = isolate_or_dissolve_huge_page(page);
+
+ /*
+ * Fail isolation if isolate_or_dissolve_huge_page()
+ * reports an error. In case of -ENOMEM, abort right away.
+ */
+ if (ret < 0) {
+ /*
+ * Do not report -EBUSY down the chain.
+ */
+ if (ret == -ENOMEM)
+ fatal_error = true;
+ else
+ ret = 0;
+ goto isolate_fail;
+ }
+
+ /*
+ * Ok, the hugepage was dissolved. Now these pages are
+ * Buddy and cannot be re-allocated because they are
+ * isolated. Fall-through as the check below handles
+ * Buddy pages.
+ */
+ }
+
/*
* Skip if free. We read page order here without zone lock
* which is generally unsafe, but the race window is small and
@@ -1091,6 +1119,9 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
*/
next_skip_pfn += 1UL << cc->order;
}
+
+ if (fatal_error)
+ break;
}
/*
@@ -1134,7 +1165,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
cc->migrate_pfn = low_pfn;
- return 0;
+ return ret;
}
/**
@@ -1144,7 +1175,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
* @end_pfn: The one-past-last PFN.
*
* Returns errno, like -EAGAIN or -EINTR in case e.g signal pending or congestion,
- * or 0.
+ * -ENOMEM in case we could not allocate a page, or 0.
*/
int
isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
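
To make the new return-value contract concrete: -ENOMEM aborts the whole scan and is propagated to the caller, while any other error from isolate_or_dissolve_huge_page() only fails isolation of the current page and leaves the return value at 0. Below is a distilled user-space model of that loop; scan_one() and NR_PAGES are invented for illustration.

/*
 * User-space model of the error handling added to isolate_migratepages_block():
 * -ENOMEM stops the scan and is propagated, other errors only skip the page.
 */
#include <errno.h>
#include <stdio.h>

#define NR_PAGES 8

static int scan_one(int pfn)
{
        if (pfn == 3)
                return -EBUSY;          /* transient: skip just this page */
        if (pfn == 6)
                return -ENOMEM;         /* fatal: scanning further is pointless */
        return 0;
}

int main(void)
{
        int ret = 0;

        for (int pfn = 0; pfn < NR_PAGES; pfn++) {
                int err = scan_one(pfn);

                if (err == -ENOMEM) {
                        ret = err;      /* remember the fatal error ... */
                        break;          /* ... and stop the scan */
                }
                if (err)
                        continue;       /* -EBUSY and friends: isolate_fail */
                printf("isolated pfn %d\n", pfn);
        }
        printf("scan returned %d\n", ret);
        return 0;
}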
@@ -1035,13 +1035,18 @@ static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
return false;
}
+static void __enqueue_huge_page(struct list_head *list, struct page *page)
+{
+ list_move(&page->lru, list);
+ SetHPageFreed(page);
+}
+
static void enqueue_huge_page(struct hstate *h, struct page *page)
{
int nid = page_to_nid(page);
- list_move(&page->lru, &h->hugepage_freelists[nid]);
+ __enqueue_huge_page(&h->hugepage_freelists[nid], page);
h->free_huge_pages++;
h->free_huge_pages_node[nid]++;
- SetHPageFreed(page);
}
static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
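
The point of splitting __enqueue_huge_page() out is that alloc_and_dissolve_huge_page() further down can put the replacement page on the free list without touching the free_huge_pages{,_node} counters, since one page leaves the pool and another takes its place on the same node. A toy user-space model of that invariant follows; struct pool and both helpers are invented for illustration.

/*
 * Toy model: enqueue() adds a page and bumps the counter, while
 * replace_free_page() swaps list membership only, leaving the count intact.
 */
#include <stdio.h>

struct pool {
        int free_pages;                 /* models h->free_huge_pages{_node} */
        int on_freelist[4];             /* models free-list membership of 4 pages */
};

static void enqueue(struct pool *p, int page)
{
        p->on_freelist[page] = 1;       /* the __enqueue_huge_page() part */
        p->free_pages++;                /* counter update stays in enqueue_huge_page() */
}

static void replace_free_page(struct pool *p, int old, int new)
{
        p->on_freelist[old] = 0;        /* old page leaves the free list ... */
        p->on_freelist[new] = 1;        /* ... replacement takes its slot */
        /* no counter update: one page out, one page in */
}

int main(void)
{
        struct pool p = { 0 };

        enqueue(&p, 0);
        enqueue(&p, 1);
        replace_free_page(&p, 1, 2);
        printf("free pages: %d\n", p.free_pages);       /* prints 2 */
        return 0;
}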
@@ -2245,6 +2250,106 @@ static void restore_reserve_on_error(struct hstate *h,
}
}
+/*
+ * alloc_and_dissolve_huge_page - Allocate a new page and dissolve the old one
+ * @h: struct hstate the old page belongs to
+ * @old_page: Old page to dissolve
+ * Returns 0 on success, otherwise negated error.
+ */
+static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page)
+{
+ gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
+ int nid = page_to_nid(old_page);
+ struct page *new_page;
+ int ret = 0;
+
+ /*
+ * Before dissolving the page, we need to allocate a new one,
+ * so the pool remains stable.
+ */
+ new_page = alloc_fresh_huge_page(h, gfp_mask, nid, NULL, NULL);
+ if (!new_page)
+ return -ENOMEM;
+
+ /*
+ * Pages obtained from the buddy allocator are self-refcounted, but
+ * free hugepages need to have a refcount of 0.
+ */
+ page_ref_dec(new_page);
+retry:
+ spin_lock(&hugetlb_lock);
+ if (!PageHuge(old_page)) {
+ /*
+ * Freed from under us. Drop new_page too.
+ */
+ update_and_free_page(h, new_page);
+ goto unlock;
+ } else if (page_count(old_page)) {
+ /*
+ * Someone has grabbed the page, fail for now.
+ */
+ ret = -EBUSY;
+ update_and_free_page(h, new_page);
+ goto unlock;
+ } else if (!HPageFreed(old_page)) {
+ /*
+ * Page's refcount is 0 but it has not been enqueued in the
+ * freelist yet. Race window is small, so we can succeed here if
+ * we retry.
+ */
+ spin_unlock(&hugetlb_lock);
+ cond_resched();
+ goto retry;
+ } else {
+ /*
+ * Ok, old_page is still a genuine free hugepage. Replace it
+ * with the new one.
+ */
+ list_del(&old_page->lru);
+ update_and_free_page(h, old_page);
+ /*
+ * h->free_huge_pages{_node} counters do not need to be updated:
+ * one free hugepage (old_page) leaves the pool and another one
+ * (new_page) takes its place on the same node.
+ */
+ __enqueue_huge_page(&h->hugepage_freelists[nid], new_page);
+ }
+unlock:
+ spin_unlock(&hugetlb_lock);
+
+ return ret;
+}
+
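
The retry logic above follows a common hugetlb pattern: allocate the replacement outside the lock, then sample the old page's state under hugetlb_lock and, for the narrow not-yet-enqueued window, drop the lock, reschedule and try again. Below is a stand-alone user-space model of that pattern; the state machine is invented, a pthread mutex stands in for hugetlb_lock and sched_yield() for cond_resched().

/*
 * User-space model of "sample state under the lock, back off and retry on a
 * narrow race". Not kernel code; all names are stand-ins.
 */
#include <errno.h>
#include <pthread.h>
#include <sched.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

enum page_state { PAGE_FREED, PAGE_IN_TRANSIT, PAGE_IN_USE };
static enum page_state state = PAGE_IN_TRANSIT;         /* mid-free when we look */

static int try_replace(void)
{
retry:
        pthread_mutex_lock(&lock);
        switch (state) {
        case PAGE_IN_USE:                       /* someone grabbed it: hard failure */
                pthread_mutex_unlock(&lock);
                return -EBUSY;
        case PAGE_IN_TRANSIT:                   /* refcount 0 but not enqueued yet */
                state = PAGE_FREED;             /* simulate the freeing side finishing */
                pthread_mutex_unlock(&lock);
                sched_yield();                  /* cond_resched() stand-in */
                goto retry;                     /* window is small, retrying succeeds */
        case PAGE_FREED:                        /* genuine free page: replace it */
                pthread_mutex_unlock(&lock);
                return 0;
        }
        pthread_mutex_unlock(&lock);
        return -EINVAL;
}

int main(void)
{
        printf("try_replace() = %d\n", try_replace());
        return 0;
}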
+int isolate_or_dissolve_huge_page(struct page *page)
+{
+ struct hstate *h;
+ struct page *head;
+
+ /*
+ * The page might have been dissolved from under our feet, so make sure
+ * to carefully check the state under the lock.
+ * Return success when racing as if we dissolved the page ourselves.
+ */
+ spin_lock(&hugetlb_lock);
+ if (PageHuge(page)) {
+ head = compound_head(page);
+ h = page_hstate(head);
+ } else {
+ spin_unlock(&hugetlb_lock);
+ return 0;
+ }
+ spin_unlock(&hugetlb_lock);
+
+ /*
+ * Fence off gigantic pages as there is a cyclic dependency between
+ * alloc_contig_range and them. Return -ENOMEM as this has the effect
+ * of bailing out right away without further retrying.
+ */
+ if (hstate_is_gigantic(h))
+ return -ENOMEM;
+
+ return alloc_and_dissolve_huge_page(h, head);
+}
+
struct page *alloc_huge_page(struct vm_area_struct *vma,
unsigned long addr, int avoid_reserve)
{
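
Putting it together, a caller of isolate_or_dissolve_huge_page() sees three outcomes: 0 when the page is no longer a blocking hugepage (dissolved by us, or freed from under us), -EBUSY when someone holds a reference and a later retry may help, and -ENOMEM when the page cannot be handled at all (gigantic page, or allocation failure). The user-space sketch below models that decision tree; struct fake_page and the helper are invented, and the real code calls alloc_and_dissolve_huge_page() in the last branch instead of returning -EBUSY directly.

/*
 * User-space sketch of the decision tree in isolate_or_dissolve_huge_page().
 * Not kernel code; names and types are made up.
 */
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_page {
        bool is_huge;           /* still managed by hugetlb? */
        bool is_gigantic;       /* cannot be handled by alloc_contig_range */
};

static int isolate_or_dissolve(const struct fake_page *page)
{
        if (!page->is_huge)
                return 0;       /* dissolved from under us: treat as success */
        if (page->is_gigantic)
                return -ENOMEM; /* fence off: tell the caller to bail out */
        return -EBUSY;          /* placeholder for alloc_and_dissolve_huge_page() */
}

int main(void)
{
        struct fake_page normal = { .is_huge = true,  .is_gigantic = false };
        struct fake_page giga   = { .is_huge = true,  .is_gigantic = true  };
        struct fake_page gone   = { .is_huge = false, .is_gigantic = false };

        printf("normal=%d gigantic=%d already-freed=%d\n",
               isolate_or_dissolve(&normal),
               isolate_or_dissolve(&giga),
               isolate_or_dissolve(&gone));
        return 0;
}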