@@ -596,6 +596,7 @@ struct hstate {
int next_nid_to_alloc;
int next_nid_to_free;
unsigned int order;
+ unsigned int demote_order;
unsigned long mask;
unsigned long max_huge_pages;
unsigned long nr_huge_pages;
@@ -2964,7 +2964,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
static void __init hugetlb_init_hstates(void)
{
- struct hstate *h;
+ struct hstate *h, *h2;
for_each_hstate(h) {
if (minimum_order > huge_page_order(h))
@@ -2973,6 +2973,17 @@ static void __init hugetlb_init_hstates(void)
/* oversize hugepages were init'ed in early boot */
if (!hstate_is_gigantic(h))
hugetlb_hstate_alloc_pages(h);
+
+ /*
+ * Set demote order for each hstate. Note that
+ * h->demote_order is initially 0.
+ */
+ for_each_hstate(h2) {
+ if (h2 == h)
+ continue;
+ if (h2->order < h->order && h2->order > h->demote_order)
+ h->demote_order = h2->order;
+ }
}
VM_BUG_ON(minimum_order == UINT_MAX);
}
@@ -3213,9 +3224,36 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
return 0;
}
+static int demote_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
+ __must_hold(&hugetlb_lock)
+{
+ int rc = 0;
+
+ lockdep_assert_held(&hugetlb_lock);
+ /* If no demote order, free to buddy */
+ if (!h->demote_order) {
+ struct page *page = remove_pool_huge_page(h, nodes_allowed, 0);
+
+ if (!page)
+ return rc;
+ spin_unlock_irq(&hugetlb_lock);
+ update_and_free_page(h, page, false);
+ spin_lock_irq(&hugetlb_lock);
+ return 1;
+ }
+
+ /*
+ * TODO - demote fucntionality will be added in subsequent patch
+ */
+ return rc;
+}
+
#define HSTATE_ATTR_RO(_name) \
static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
+#define HSTATE_ATTR_WO(_name) \
+ static struct kobj_attribute _name##_attr = __ATTR_WO(_name)
+
#define HSTATE_ATTR(_name) \
static struct kobj_attribute _name##_attr = \
__ATTR(_name, 0644, _name##_show, _name##_store)
@@ -3411,12 +3449,91 @@ static ssize_t surplus_hugepages_show(struct kobject *kobj,
}
HSTATE_ATTR_RO(surplus_hugepages);
+static ssize_t demote_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t len)
+{
+ unsigned long nr_demote;
+ unsigned long nr_available;
+ nodemask_t nodes_allowed, *n_mask;
+ struct hstate *h;
+ int err;
+ int nid;
+
+ err = kstrtoul(buf, 10, &nr_demote);
+ if (err)
+ return err;
+ h = kobj_to_hstate(kobj, &nid);
+
+ /* Synchronize with other sysfs operations modifying huge pages */
+ mutex_lock(&h->resize_lock);
+
+ spin_lock_irq(&hugetlb_lock);
+ if (nid != NUMA_NO_NODE) {
+ nr_available = h->free_huge_pages_node[nid];
+ init_nodemask_of_node(&nodes_allowed, nid);
+ n_mask = &nodes_allowed;
+ } else {
+ nr_available = h->free_huge_pages;
+ n_mask = &node_states[N_MEMORY];
+ }
+ nr_available -= h->resv_huge_pages;
+ if (nr_available <= 0)
+ goto out;
+ nr_demote = min(nr_available, nr_demote);
+
+ while (nr_demote) {
+ if (!demote_pool_huge_page(h, n_mask))
+ break;
+
+ /*
+ * We may have dropped the lock in the routines to
+ * demote/free a page. Recompute nr_demote as counts could
+ * have changed and we want to make sure we do not demote
+ * a reserved huge page.
+ */
+ nr_demote--;
+ if (nid != NUMA_NO_NODE)
+ nr_available = h->free_huge_pages_node[nid];
+ else
+ nr_available = h->free_huge_pages;
+ nr_available -= h->resv_huge_pages;
+ if (nr_available <= 0)
+ nr_demote = 0;
+ else
+ nr_demote = min(nr_available, nr_demote);
+ }
+
+out:
+ spin_unlock_irq(&hugetlb_lock);
+ mutex_unlock(&h->resize_lock);
+
+ return len;
+}
+HSTATE_ATTR_WO(demote);
+
+static ssize_t demote_size_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct hstate *h;
+ unsigned long demote_size;
+ int nid;
+
+ h = kobj_to_hstate(kobj, &nid);
+ demote_size = h->demote_order;
+
+ return sysfs_emit(buf, "%lukB\n",
+ (unsigned long)(PAGE_SIZE << h->demote_order) / SZ_1K);
+}
+HSTATE_ATTR_RO(demote_size);
+
static struct attribute *hstate_attrs[] = {
&nr_hugepages_attr.attr,
&nr_overcommit_hugepages_attr.attr,
&free_hugepages_attr.attr,
&resv_hugepages_attr.attr,
&surplus_hugepages_attr.attr,
+ &demote_size_attr.attr,
+ &demote_attr.attr,
#ifdef CONFIG_NUMA
&nr_hugepages_mempolicy_attr.attr,
#endif
@@ -3486,6 +3603,8 @@ static struct attribute *per_node_hstate_attrs[] = {
&nr_hugepages_attr.attr,
&free_hugepages_attr.attr,
&surplus_hugepages_attr.attr,
+ &demote_size_attr.attr,
+ &demote_attr.attr,
NULL,
};
Two new sysfs files are added to demote hugtlb pages. These files are both per-hugetlb page size and per node. Files are: demote_size - The size in Kb that pages are demoted to. (read-only) demote - The number of huge pages to demote. (write-only) Writing a value to demote will result in an attempt to demote that number of hugetlb pages to an appropriate number of demote_size pages. This patch does not provide full demote functionality. It only provides the sysfs interfaces and uses existing code to free pages to the buddy allocator if demote_size == PAGESIZE. Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com> --- include/linux/hugetlb.h | 1 + mm/hugetlb.c | 121 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 121 insertions(+), 1 deletion(-)