@@ -3577,7 +3577,7 @@ static void try_to_free_low(struct hstate *h, unsigned long count,
struct folio *folio, *next;
struct list_head *freel = &h->hugepage_freelists[i];
list_for_each_entry_safe(folio, next, freel, lru) {
- if (count >= h->nr_huge_pages)
+ if (count >= available_huge_pages(h))
goto out;
if (folio_test_highmem(folio))
continue;
@@ -3631,11 +3631,30 @@ static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
return 1;
}
+/*
+ * Remove one free surplus hugetlb folio from the pool.
+ *
+ * Walks the allowed nodes (in to-free order) and picks the first node that
+ * both has surplus pages accounted (surplus_huge_pages_node[node] != 0) and
+ * has a non-empty free list; the folio at the head of that node's free list
+ * is removed via remove_hugetlb_folio().
+ *
+ * NOTE(review): the third argument `true` to remove_hugetlb_folio() is
+ * presumably the adjust-surplus flag, so surplus accounting is decremented
+ * here — confirm against remove_hugetlb_folio()'s definition.
+ *
+ * Caller must hold hugetlb_lock (asserted below).
+ *
+ * Returns the removed folio, or NULL if no node had a free surplus folio.
+ */
+static struct folio *remove_surplus_pool_hugetlb_folio(struct hstate *h,
+		nodemask_t *nodes_allowed)
+{
+	int nr_nodes, node;
+	struct folio *folio = NULL;
+
+	lockdep_assert_held(&hugetlb_lock);
+	for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
+		if (h->surplus_huge_pages_node[node] &&
+		    !list_empty(&h->hugepage_freelists[node])) {
+			/* Head of the per-node free list is the one removed. */
+			folio = list_entry(h->hugepage_freelists[node].next,
+					   struct folio, lru);
+			remove_hugetlb_folio(h, folio, true);
+			break;
+		}
+	}
+
+	return folio;
+}
+
#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
nodemask_t *nodes_allowed)
{
- unsigned long min_count;
unsigned long allocated;
struct folio *folio;
LIST_HEAD(page_list);
@@ -3770,15 +3789,29 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
* and won't grow the pool anywhere else. Not until one of the
* sysctls are changed, or the surplus pages go out of use.
*/
- min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
- min_count = max(count, min_count);
- try_to_free_low(h, min_count, nodes_allowed);
+ try_to_free_low(h, count, nodes_allowed);
/*
* Collect pages to be removed on list without dropping lock
+ *
+ * There may be free surplus huge pages due to HVO, see comments
+ * in __update_and_free_hugetlb_folio() when calling
+ * hugetlb_vmemmap_restore_folio(). Collect surplus pages first.
*/
- while (min_count < persistent_huge_pages(h)) {
- folio = remove_pool_hugetlb_folio(h, nodes_allowed, 0);
+ while (count < available_huge_pages(h)) {
+ if (h->surplus_huge_pages && h->free_huge_pages) {
+ folio = remove_surplus_pool_hugetlb_folio(h, nodes_allowed);
+ if (!folio)
+ break;
+
+ list_add(&folio->lru, &page_list);
+ } else {
+ break;
+ }
+ }
+
+ while (count < available_huge_pages(h)) {
+ folio = remove_pool_hugetlb_folio(h, nodes_allowed, false);
if (!folio)
break;
In set_max_huge_pages(), min_count should mean the acquired persistent huge pages, but it contains surplus huge pages. It will leads to failing to freeing free huge pages for a Node. Steps to reproduce: 1) create 5 hugetlb folios in Node0 2) run a program to use all the hugetlb folios 3) echo 0 > nr_hugepages for Node0 to free the hugetlb folios. Thus the 5 hugetlb folios in Node0 are accounted as surplus. 4) create 5 hugetlb folios in Node1 5) echo 0 > nr_hugepages for Node1 to free the hugetlb folios The result: Node0 Node1 Total 5 5 Free 0 5 Surp 5 5 We couldn't subtract surplus_huge_pages from min_mount, since free hugetlb folios may be surplus due to HVO. In __update_and_free_hugetlb_folio(), hugetlb_vmemmap_restore_folio() may fail, add the folio back to pool and treat it as surplus. If we directly subtract surplus_huge_pages from min_mount, some free folios will be subtracted twice. To fix it, check if count is less than the num of free huge pages that could be destroyed (i.e., available_huge_pages(h)), and remove hugetlb folios if so. Since there may exist free surplus hugetlb folios, we should remove surplus folios first to make surplus count correct. The result with this patch: Node0 Node1 Total 5 0 Free 0 0 Surp 5 0 Fixes: 9a30523066cd ("hugetlb: add per node hstate attributes") Signed-off-by: Jinjiang Tu <tujinjiang@huawei.com> --- Changelog since v1: * Handle free surplus pages due to HVO mm/hugetlb.c | 47 ++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 40 insertions(+), 7 deletions(-)