@@ -60,6 +60,10 @@ HugePages_Surp
the pool above the value in ``/proc/sys/vm/nr_hugepages``. The
maximum number of surplus huge pages is controlled by
``/proc/sys/vm/nr_overcommit_hugepages``.
+ Note: When the feature of freeing unused vmemmap pages associated
+ with each hugetlb page is enabled, the number of surplus huge pages
+ may temporarily exceed this maximum while the system is under memory
+ pressure.
Hugepagesize
is the default hugepage size (in Kb).
Hugetlb
@@ -80,6 +84,10 @@ returned to the huge page pool when freed by a task. A user with root
privileges can dynamically allocate more or free some persistent huge pages
by increasing or decreasing the value of ``nr_hugepages``.
+Note: When the feature of freeing unused vmemmap pages associated with each
+hugetlb page is enabled, freeing huge pages requested by the user may fail
+when the system is under memory pressure. In that case, please retry later.
+
Pages that are used as huge pages are reserved inside the kernel and cannot
be used for other purposes. Huge pages cannot be swapped out under
memory pressure.
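
The note above mainly affects tooling that shrinks the persistent pool. A
minimal userspace sketch of the recommended retry pattern (not part of this
patch; the helper names are illustrative, only the /proc/sys/vm/nr_hugepages
path is real):

	/*
	 * Hedged sketch: shrink the persistent huge page pool and verify the
	 * result. Under memory pressure the kernel may free fewer pages than
	 * requested, so re-check and retry later.
	 */
	#include <stdio.h>

	static long read_nr_hugepages(void)
	{
		FILE *f = fopen("/proc/sys/vm/nr_hugepages", "r");
		long val = -1;

		if (!f)
			return -1;
		if (fscanf(f, "%ld", &val) != 1)
			val = -1;
		fclose(f);
		return val;
	}

	static int shrink_pool_to(long want)
	{
		FILE *f = fopen("/proc/sys/vm/nr_hugepages", "w");

		if (!f)
			return -1;
		fprintf(f, "%ld\n", want);
		fclose(f);

		/* 0: done; 1: partially freed, caller should retry later. */
		return read_nr_hugepages() <= want ? 0 : 1;
	}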
@@ -357,6 +357,19 @@ creates ZONE_MOVABLE as following.
Unfortunately, there is no information to show which memory block belongs
to ZONE_MOVABLE. This is TBD.
+ Memory offlining can fail when dissolving a free huge page on ZONE_MOVABLE
+ and the feature of freeing unused vmemmap pages associated with each hugetlb
+ page is enabled.
+
+ This can happen when we have plenty of ZONE_MOVABLE memory, but not enough
+ kernel memory to allocate vmemmap pages. We may even be able to migrate
+ huge page contents, but will not be able to dissolve the source huge page.
+ This will prevent an offline operation and is unfortunate as memory offlining
+ is expected to succeed on movable zones. Users that depend on memory hotplug
+ to succeed for movable zones should carefully consider whether the memory
+ savings gained from this feature are worth the risk of possibly not being
+ able to offline memory in certain situations.
+
.. _memory_hotplug_how_to_offline_memory:
How to offline memory
@@ -2973,6 +2973,8 @@ static inline void print_vma_addr(char *prefix, unsigned long rip)
void vmemmap_remap_free(unsigned long start, unsigned long end,
unsigned long reuse);
+int vmemmap_remap_alloc(unsigned long start, unsigned long end,
+ unsigned long reuse, gfp_t gfp_mask);
void *sparse_buffer_alloc(unsigned long size);
struct page * __populate_section_memmap(unsigned long pfn,
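
For orientation: vmemmap_remap_alloc() is the inverse of the existing
vmemmap_remap_free(), and its hugetlb caller passes the same address triple
that alloc_huge_page_vmemmap() computes further down. The free-side
counterpart in mm/hugetlb_vmemmap.c is not part of these hunks; the sketch
below is a hedged reconstruction of it, reusing only helpers that appear
elsewhere in this patch:

	/* Sketch only: mirrors the address computation in alloc_huge_page_vmemmap(). */
	void free_huge_page_vmemmap(struct hstate *h, struct page *head)
	{
		unsigned long vmemmap_addr = (unsigned long)head;
		unsigned long vmemmap_end, vmemmap_reuse;

		if (!free_vmemmap_pages_per_hpage(h))
			return;

		vmemmap_addr += RESERVE_VMEMMAP_SIZE;
		vmemmap_end = vmemmap_addr + free_vmemmap_pages_size_per_hpage(h);
		vmemmap_reuse = vmemmap_addr - PAGE_SIZE;

		/*
		 * Discard the tail vmemmap pages; vmemmap_remap_alloc() undoes
		 * this before the HugeTLB page is returned to the buddy allocator.
		 */
		vmemmap_remap_free(vmemmap_addr, vmemmap_end, vmemmap_reuse);
	}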
@@ -1304,16 +1304,53 @@ static inline void destroy_compound_gigantic_page(struct page *page,
unsigned int order) { }
#endif
-static void update_and_free_page(struct hstate *h, struct page *page)
+static int update_and_free_page_surplus(struct hstate *h, struct page *page,
+ bool acct_surplus)
+ __releases(&hugetlb_lock) __acquires(&hugetlb_lock)
{
int i;
struct page *subpage = page;
+ int nid = page_to_nid(page);
if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
- return;
+ return 0;
h->nr_huge_pages--;
- h->nr_huge_pages_node[page_to_nid(page)]--;
+ h->nr_huge_pages_node[nid]--;
+
+ /*
+ * If the vmemmap pages associated with the HugeTLB page can be
+ * optimized, we might block in alloc_huge_page_vmemmap(), so
+ * drop the hugetlb_lock.
+ */
+ if (free_vmemmap_pages_per_hpage(h))
+ spin_unlock(&hugetlb_lock);
+
+ if (alloc_huge_page_vmemmap(h, page)) {
+ spin_lock(&hugetlb_lock);
+ INIT_LIST_HEAD(&page->lru);
+ h->nr_huge_pages++;
+ h->nr_huge_pages_node[nid]++;
+
+ /*
+ * If vmemmap pages cannot be allocated, refuse to free the page:
+ * put it back on the hugetlb free list and, when requested by the
+ * caller, account it as a surplus page.
+ */
+ if (acct_surplus) {
+ h->surplus_huge_pages++;
+ h->surplus_huge_pages_node[nid]++;
+ }
+
+ arch_clear_hugepage_flags(page);
+ enqueue_huge_page(h, page);
+
+ return -ENOMEM;
+ }
+
+ if (free_vmemmap_pages_per_hpage(h))
+ spin_lock(&hugetlb_lock);
+
for (i = 0; i < pages_per_huge_page(h);
i++, subpage = mem_map_next(subpage, page, i)) {
subpage->flags &= ~(1 << PG_locked | 1 << PG_error |
@@ -1337,6 +1374,13 @@ static void update_and_free_page(struct hstate *h, struct page *page)
} else {
__free_pages(page, huge_page_order(h));
}
+
+ return 0;
+}
+
+static inline int update_and_free_page(struct hstate *h, struct page *page)
+{
+ return update_and_free_page_surplus(h, page, true);
}
struct hstate *size_to_hstate(unsigned long size)
@@ -1404,9 +1448,9 @@ static void __free_huge_page(struct page *page)
} else if (h->surplus_huge_pages_node[nid]) {
/* remove the page from active list */
list_del(&page->lru);
- update_and_free_page(h, page);
h->surplus_huge_pages--;
h->surplus_huge_pages_node[nid]--;
+ update_and_free_page(h, page);
} else {
arch_clear_hugepage_flags(page);
enqueue_huge_page(h, page);
@@ -1447,7 +1491,7 @@ void free_huge_page(struct page *page)
/*
* Defer freeing if in non-task context to avoid hugetlb_lock deadlock.
*/
- if (!in_task()) {
+ if (in_atomic()) {
/*
* Only call schedule_work() if hpage_freelist is previously
* empty. Otherwise, schedule_work() had been called but the
@@ -1693,14 +1737,14 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
list_entry(h->hugepage_freelists[node].next,
struct page, lru);
list_del(&page->lru);
+ ClearHPageFreed(page);
h->free_huge_pages--;
h->free_huge_pages_node[node]--;
if (acct_surplus) {
h->surplus_huge_pages--;
h->surplus_huge_pages_node[node]--;
}
- update_and_free_page(h, page);
- ret = 1;
+ ret = !update_and_free_page(h, page);
break;
}
}
@@ -1713,10 +1757,14 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
* nothing for in-use hugepages and non-hugepages.
* This function returns values like below:
*
- * -EBUSY: failed to dissolved free hugepages or the hugepage is in-use
- * (allocated or reserved.)
- * 0: successfully dissolved free hugepages or the page is not a
- * hugepage (considered as already dissolved)
+ * -ENOMEM: failed to allocate the vmemmap pages needed to dissolve the
+ * free hugepage when the system is under memory pressure and the
+ * feature of freeing unused vmemmap pages associated with each
+ * hugetlb page is enabled.
+ * -EBUSY: failed to dissolve free hugepages or the hugepage is in-use
+ * (allocated or reserved.)
+ * 0: successfully dissolved free hugepages or the page is not a
+ * hugepage (considered as already dissolved)
*/
int dissolve_free_huge_page(struct page *page)
{
@@ -1768,11 +1816,13 @@ int dissolve_free_huge_page(struct page *page)
ClearPageHWPoison(head);
}
list_del(&head->lru);
+ ClearHPageFreed(head);
h->free_huge_pages--;
h->free_huge_pages_node[nid]--;
h->max_huge_pages--;
- update_and_free_page(h, head);
- rc = 0;
+ rc = update_and_free_page_surplus(h, head, false);
+ if (rc)
+ h->max_huge_pages++;
}
out:
spin_unlock(&hugetlb_lock);
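
The new -ENOMEM return value propagates through dissolve_free_huge_pages(),
which memory offlining uses to sweep a pfn range; that caller is untouched by
this patch. Roughly (a from-memory sketch, not a quote of the source), it
aborts on the first error, which is how a vmemmap allocation failure here
becomes the offline failure described in the memory-hotplug documentation
above:

	/* Sketch of the unchanged caller: any non-zero rc aborts the sweep. */
	int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
	{
		unsigned long pfn;
		struct page *page;
		int rc = 0;

		if (!hugepages_supported())
			return rc;

		for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order) {
			page = pfn_to_page(pfn);
			rc = dissolve_free_huge_page(page);
			if (rc)
				break;
		}

		return rc;
	}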
@@ -18,10 +18,9 @@
* 4096 base pages. For each base page, there is a corresponding page struct.
*
* Within the HugeTLB subsystem, only the first 4 page structs are used to
- * contain unique information about a HugeTLB page. HUGETLB_CGROUP_MIN_ORDER
- * provides this upper limit. The only 'useful' information in the remaining
- * page structs is the compound_head field, and this field is the same for all
- * tail pages.
+ * contain unique information about a HugeTLB page. __NR_USED_SUBPAGE provides
+ * this upper limit. The only 'useful' information in the remaining page structs
+ * is the compound_head field, and this field is the same for all tail pages.
*
* By removing redundant page structs for HugeTLB pages, memory can be returned
* to the buddy allocator for other uses.
@@ -181,21 +180,35 @@
#define RESERVE_VMEMMAP_NR 2U
#define RESERVE_VMEMMAP_SIZE (RESERVE_VMEMMAP_NR << PAGE_SHIFT)
-/*
- * How many vmemmap pages associated with a HugeTLB page that can be freed
- * to the buddy allocator.
- *
- * Todo: Returns zero for now, which means the feature is disabled. We will
- * enable it once all the infrastructure is there.
- */
-static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h)
+static inline unsigned long free_vmemmap_pages_size_per_hpage(struct hstate *h)
{
- return 0;
+ return (unsigned long)free_vmemmap_pages_per_hpage(h) << PAGE_SHIFT;
}
-static inline unsigned long free_vmemmap_pages_size_per_hpage(struct hstate *h)
+/*
+ * Previously discarded vmemmap pages will be allocated and remapped
+ * after this function returns.
+ */
+int alloc_huge_page_vmemmap(struct hstate *h, struct page *head)
{
- return (unsigned long)free_vmemmap_pages_per_hpage(h) << PAGE_SHIFT;
+ unsigned long vmemmap_addr = (unsigned long)head;
+ unsigned long vmemmap_end, vmemmap_reuse;
+
+ if (!free_vmemmap_pages_per_hpage(h))
+ return 0;
+
+ vmemmap_addr += RESERVE_VMEMMAP_SIZE;
+ vmemmap_end = vmemmap_addr + free_vmemmap_pages_size_per_hpage(h);
+ vmemmap_reuse = vmemmap_addr - PAGE_SIZE;
+ /*
+ * The pages which the vmemmap virtual address range [@vmemmap_addr,
+ * @vmemmap_end) are mapped to are freed to the buddy allocator, and
+ * the range is mapped to the page which @vmemmap_reuse is mapped to.
+ * When a HugeTLB page is freed to the buddy allocator, previously
+ * discarded vmemmap pages must be allocated and remapped.
+ */
+ return vmemmap_remap_alloc(vmemmap_addr, vmemmap_end, vmemmap_reuse,
+ GFP_KERNEL | __GFP_NORETRY | __GFP_THISNODE);
}
void free_huge_page_vmemmap(struct hstate *h, struct page *head)
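
As a concrete illustration of the address arithmetic in
alloc_huge_page_vmemmap() above, here is a hedged worked example; the 2 MiB
HugeTLB page size, 4 KiB base pages and 64-byte struct page are assumptions,
everything else follows from the code:

	/*
	 * Assumed: x86_64, 2 MiB HugeTLB page, 4 KiB base pages, 64 B struct page.
	 *
	 *   vmemmap per HugeTLB page = 512 * 64 B = 32 KiB = 8 base pages
	 *   RESERVE_VMEMMAP_NR = 2, so the first two vmemmap pages stay mapped
	 *   and free_vmemmap_pages_per_hpage(h) = 8 - 2 = 6 once the feature
	 *   is enabled
	 *
	 *   vmemmap_addr  = head + RESERVE_VMEMMAP_SIZE  = head + 8 KiB
	 *   vmemmap_end   = vmemmap_addr + 6 * PAGE_SIZE = head + 32 KiB
	 *   vmemmap_reuse = vmemmap_addr - PAGE_SIZE     = head + 4 KiB
	 *
	 * vmemmap_remap_alloc() therefore allocates 6 new pages, copies the
	 * contents of the page backing vmemmap_reuse into each of them (see
	 * vmemmap_restore_pte() below) and rewires the PTEs covering
	 * [vmemmap_addr, vmemmap_end) to point at the new pages.
	 */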
@@ -11,10 +11,33 @@
#include <linux/hugetlb.h>
#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
+int alloc_huge_page_vmemmap(struct hstate *h, struct page *head);
void free_huge_page_vmemmap(struct hstate *h, struct page *head);
+
+/*
+ * The number of vmemmap pages associated with a HugeTLB page that can be
+ * freed to the buddy allocator.
+ *
+ * Todo: Returns zero for now, which means the feature is disabled. We will
+ * enable it once all the infrastructure is there.
+ */
+static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h)
+{
+ return 0;
+}
#else
+static inline int alloc_huge_page_vmemmap(struct hstate *h, struct page *head)
+{
+ return 0;
+}
+
static inline void free_huge_page_vmemmap(struct hstate *h, struct page *head)
{
}
+
+static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h)
+{
+ return 0;
+}
#endif /* CONFIG_HUGETLB_PAGE_FREE_VMEMMAP */
#endif /* _LINUX_HUGETLB_VMEMMAP_H */
@@ -40,7 +40,8 @@
* @remap_pte: called for each lowest-level entry (PTE).
* @reuse_page: the page which is reused for the tail vmemmap pages.
* @reuse_addr: the virtual address of the @reuse_page page.
- * @vmemmap_pages: the list head of the vmemmap pages that can be freed.
+ * @vmemmap_pages: the list head of the vmemmap pages that can be freed,
+ * or of the pages that the range is remapped to.
*/
struct vmemmap_remap_walk {
void (*remap_pte)(pte_t *pte, unsigned long addr,
@@ -224,6 +225,78 @@ void vmemmap_remap_free(unsigned long start, unsigned long end,
free_vmemmap_page_list(&vmemmap_pages);
}
+static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
+ struct vmemmap_remap_walk *walk)
+{
+ pgprot_t pgprot = PAGE_KERNEL;
+ struct page *page;
+ void *to;
+
+ BUG_ON(pte_page(*pte) != walk->reuse_page);
+
+ page = list_first_entry(walk->vmemmap_pages, struct page, lru);
+ list_del(&page->lru);
+ to = page_to_virt(page);
+ copy_page(to, (void *)walk->reuse_addr);
+
+ set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
+}
+
+static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
+ gfp_t gfp_mask, struct list_head *list)
+{
+ unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
+ int nid = page_to_nid((struct page *)start);
+ struct page *page, *next;
+
+ while (nr_pages--) {
+ page = alloc_pages_node(nid, gfp_mask, 0);
+ if (!page)
+ goto out;
+ list_add_tail(&page->lru, list);
+ }
+
+ return 0;
+out:
+ list_for_each_entry_safe(page, next, list, lru)
+ __free_pages(page, 0);
+ return -ENOMEM;
+}
+
+/**
+ * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, end)
+ * to the pages allocated in @vmemmap_pages,
+ * one for each page of the range.
+ * @start: start address of the vmemmap virtual address range that we want
+ * to remap.
+ * @end: end address of the vmemmap virtual address range that we want to
+ * remap.
+ * @reuse: reuse address.
+ * @gfp_mask: GFP flags for allocating vmemmap pages.
+ */
+int vmemmap_remap_alloc(unsigned long start, unsigned long end,
+ unsigned long reuse, gfp_t gfp_mask)
+{
+ LIST_HEAD(vmemmap_pages);
+ struct vmemmap_remap_walk walk = {
+ .remap_pte = vmemmap_restore_pte,
+ .reuse_addr = reuse,
+ .vmemmap_pages = &vmemmap_pages,
+ };
+
+ /* See the comment in vmemmap_remap_free(). */
+ BUG_ON(start - reuse != PAGE_SIZE);
+
+ might_sleep_if(gfpflags_allow_blocking(gfp_mask));
+
+ if (alloc_vmemmap_page_list(start, end, gfp_mask, &vmemmap_pages))
+ return -ENOMEM;
+
+ vmemmap_remap_range(reuse, end, &walk);
+
+ return 0;
+}
+
/*
* Allocate a block of memory to be used to back the virtual memory map
* or to back the page tables that are used to create the mapping.