
[v2] mm/hugetlb: fix set_max_huge_pages() when there are surplus pages

Message ID 20250401082339.676723-1-tujinjiang@huawei.com (mailing list archive)
State New

Commit Message

Jinjiang Tu April 1, 2025, 8:23 a.m. UTC
In set_max_huge_pages(), min_count is supposed to be the number of
persistent huge pages that must be kept, but the computation
(resv_huge_pages + nr_huge_pages - free_huge_pages) also counts in-use
surplus pages, since nr_huge_pages includes surplus pages. The inflated
min_count makes it impossible to free the free huge pages of a node.

Steps to reproduce:
1) create 5 hugetlb folios in Node0
2) run a program to use all the hugetlb folios
3) echo 0 > nr_hugepages for Node0 to free the hugetlb folios. Since they
are still in use, the 5 hugetlb folios in Node0 are accounted as surplus.
4) create 5 hugetlb folios in Node1
5) echo 0 > nr_hugepages for Node1 to free the hugetlb folios

The result:
        Node0    Node1
Total     5         5
Free      0         5
Surp      5         5

We can't simply subtract surplus_huge_pages from min_count, since free
hugetlb folios may be surplus due to HVO. In
__update_and_free_hugetlb_folio(), hugetlb_vmemmap_restore_folio() may
fail; the folio is then added back to the pool and treated as surplus.
If we directly subtracted surplus_huge_pages from min_count, such free
surplus folios would be subtracted twice.
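
For context, that path looks roughly like this (a condensed sketch, not
verbatim kernel code; the exact code may differ between kernel versions):

	if (hugetlb_vmemmap_restore_folio(h, folio)) {
		/*
		 * Restoring the vmemmap failed: put the folio back on
		 * the free list and account it as surplus, so a folio
		 * can be free and surplus at the same time.
		 */
		spin_lock_irq(&hugetlb_lock);
		add_hugetlb_folio(h, folio, true);
		spin_unlock_irq(&hugetlb_lock);
		return;
	}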

To fix it, check whether count is less than the number of free huge pages
that can actually be destroyed (i.e., available_huge_pages(h)), and remove
hugetlb folios if so (see the sketch of available_huge_pages() below).
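
For reference, available_huge_pages() counts the free huge pages above the
reserve; a sketch of its definition, paraphrased from mm/hugetlb.c (it may
differ slightly across kernel versions):

	/*
	 * Only free huge pages above the reserve can be removed from
	 * the pool without breaking existing reservations.
	 */
	static unsigned long available_huge_pages(struct hstate *h)
	{
		return h->free_huge_pages - h->resv_huge_pages;
	}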

Since free surplus hugetlb folios may exist (as described above), remove
the surplus folios first to keep the surplus count correct.

The result with this patch:
        Node0    Node1
Total     5         0
Free      0         0
Surp      5         0

Fixes: 9a30523066cd ("hugetlb: add per node hstate attributes")
Signed-off-by: Jinjiang Tu <tujinjiang@huawei.com>
---
Changelog since v1:
  * Handle free surplus pages due to HVO

 mm/hugetlb.c | 47 ++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 40 insertions(+), 7 deletions(-)

Patch

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 318624c96584..57319ff7856b 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3577,7 +3577,7 @@  static void try_to_free_low(struct hstate *h, unsigned long count,
 		struct folio *folio, *next;
 		struct list_head *freel = &h->hugepage_freelists[i];
 		list_for_each_entry_safe(folio, next, freel, lru) {
-			if (count >= h->nr_huge_pages)
+			if (count >= available_huge_pages(h))
 				goto out;
 			if (folio_test_highmem(folio))
 				continue;
@@ -3631,11 +3631,30 @@  static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
 	return 1;
 }
 
+static struct folio *remove_surplus_pool_hugetlb_folio(struct hstate *h,
+		nodemask_t *nodes_allowed)
+{
+	int nr_nodes, node;
+	struct folio *folio = NULL;
+
+	lockdep_assert_held(&hugetlb_lock);
+	for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
+		if (h->surplus_huge_pages_node[node] &&
+		    !list_empty(&h->hugepage_freelists[node])) {
+			folio = list_entry(h->hugepage_freelists[node].next,
+					  struct folio, lru);
+			remove_hugetlb_folio(h, folio, true);
+			break;
+		}
+	}
+
+	return folio;
+}
+
 #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
 static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
 			      nodemask_t *nodes_allowed)
 {
-	unsigned long min_count;
 	unsigned long allocated;
 	struct folio *folio;
 	LIST_HEAD(page_list);
@@ -3770,15 +3789,29 @@  static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
 	 * and won't grow the pool anywhere else. Not until one of the
 	 * sysctls are changed, or the surplus pages go out of use.
 	 */
-	min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
-	min_count = max(count, min_count);
-	try_to_free_low(h, min_count, nodes_allowed);
+	try_to_free_low(h, count, nodes_allowed);
 
 	/*
 	 * Collect pages to be removed on list without dropping lock
+	 *
+	 * There may be free surplus huge pages due to HVO, see comments
+	 * in __update_and_free_hugetlb_folio() when calling
+	 * hugetlb_vmemmap_restore_folio(). Collect surplus pages first.
 	 */
-	while (min_count < persistent_huge_pages(h)) {
-		folio = remove_pool_hugetlb_folio(h, nodes_allowed, 0);
+	while (count < available_huge_pages(h)) {
+		if (h->surplus_huge_pages && h->free_huge_pages) {
+			folio = remove_surplus_pool_hugetlb_folio(h, nodes_allowed);
+			if (!folio)
+				break;
+
+			list_add(&folio->lru, &page_list);
+		} else {
+			break;
+		}
+	}
+
+	while (count < available_huge_pages(h)) {
+		folio = remove_pool_hugetlb_folio(h, nodes_allowed, false);
 		if (!folio)
 			break;