@@ -189,6 +189,17 @@ madvise
never
should be self-explanatory.
+
+There is also a sysfs knob that controls whether huge pages of a given
+high order (greater than PAGE_ALLOC_COSTLY_ORDER) may be stored on the
+per-CPU page (PCP) lists, which can reduce zone lock contention when
+such pages are allocated frequently. The PCP behaviour of low-order and
+PMD-order pages cannot be changed; for the other high orders, storing
+on the PCP lists can be disabled by writing 0 or enabled by writing 1::
+
+ echo 0 >/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/pcp_enabled
+ echo 1 >/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/pcp_enabled
+
By default kernel tries to use huge, PMD-mappable zero page on read
page fault to anonymous mapping. It's possible to disable huge zero
page by writing 0 or enable it back by writing 1::
@@ -365,6 +365,7 @@ extern void page_frag_free(void *addr);
void page_alloc_init_cpuhp(void);
int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp);
+void drain_all_zone_pages(void);
void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
void drain_all_pages(struct zone *zone);
void drain_local_pages(struct zone *zone);
@@ -108,6 +108,7 @@ extern unsigned long transparent_hugepage_flags;
extern unsigned long huge_anon_orders_always;
extern unsigned long huge_anon_orders_madvise;
extern unsigned long huge_anon_orders_inherit;
+extern unsigned long huge_pcp_allow_orders;
static inline bool hugepage_global_enabled(void)
{
@@ -512,8 +512,49 @@ static ssize_t thpsize_enabled_store(struct kobject *kobj,
static struct kobj_attribute thpsize_enabled_attr =
__ATTR(enabled, 0644, thpsize_enabled_show, thpsize_enabled_store);
+unsigned long huge_pcp_allow_orders __read_mostly;
+static ssize_t thpsize_pcp_enabled_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ int order = to_thpsize(kobj)->order;
+
+ return sysfs_emit(buf, "%d\n",
+ !!test_bit(order, &huge_pcp_allow_orders));
+}
+
+static ssize_t thpsize_pcp_enabled_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ int order = to_thpsize(kobj)->order;
+ unsigned long value;
+ int ret;
+
+ if (order <= PAGE_ALLOC_COSTLY_ORDER || order == PMD_ORDER)
+ return -EINVAL;
+
+ ret = kstrtoul(buf, 10, &value);
+ if (ret < 0)
+ return ret;
+ if (value > 1)
+ return -EINVAL;
+
+ if (value) {
+ set_bit(order, &huge_pcp_allow_orders);
+ } else {
+ if (test_and_clear_bit(order, &huge_pcp_allow_orders))
+ drain_all_zone_pages();
+ }
+
+ return count;
+}
+
+static struct kobj_attribute thpsize_pcp_enabled_attr = __ATTR(pcp_enabled,
+ 0644, thpsize_pcp_enabled_show, thpsize_pcp_enabled_store);
+
static struct attribute *thpsize_attrs[] = {
&thpsize_enabled_attr.attr,
+ &thpsize_pcp_enabled_attr.attr,
NULL,
};
@@ -624,6 +665,8 @@ static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
*/
huge_anon_orders_inherit = BIT(PMD_ORDER);
+ huge_pcp_allow_orders = BIT(PMD_ORDER);
+
*hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
if (unlikely(!*hugepage_kobj)) {
pr_err("failed to create transparent hugepage kobject\n");
@@ -658,6 +701,10 @@ static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
err = PTR_ERR(thpsize);
goto remove_all;
}
+
+ if (order <= PAGE_ALLOC_COSTLY_ORDER)
+ huge_pcp_allow_orders |= BIT(order);
+
list_add(&thpsize->node, &thpsize_list);
order = next_order(&orders, order);
}
@@ -537,6 +537,8 @@ static inline bool pcp_allowed_order(unsigned int order)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
if (order == PCP_MAX_ORDER)
return true;
+ if (BIT(order) & huge_pcp_allow_orders)
+ return true;
#endif
return false;
}
@@ -6705,6 +6707,20 @@ void zone_pcp_reset(struct zone *zone)
}
}
+void drain_all_zone_pages(void)
+{
+ struct zone *zone;
+
+ mutex_lock(&pcp_batch_high_lock);
+ for_each_populated_zone(zone)
+ __zone_set_pageset_high_and_batch(zone, 0, 0, 1);
+ __drain_all_pages(NULL, true);
+ for_each_populated_zone(zone)
+ __zone_set_pageset_high_and_batch(zone, zone->pageset_high_min,
+ zone->pageset_high_max, zone->pageset_batch);
+ mutex_unlock(&pcp_batch_high_lock);
+}
+
#ifdef CONFIG_MEMORY_HOTREMOVE
/*
* All pages in the range must be in a single zone, must not contain holes,
Storing high-order pages on the PCP lists does not always win, and can
even hurt some workloads, so it is disabled by default for all high
orders except PMD_ORDER. Since there are already per-supported-THP-size
interfaces to configure mTHP behaviour, add a new pcp_enabled control
under those interfaces so that users can choose whether pages of a
given high order are stored on the PCP lists. It cannot change the
existing behaviour for order == PMD_ORDER or
order <= PAGE_ALLOC_COSTLY_ORDER: those orders are always enabled and
cannot be disabled. When one of the other high orders is disabled via
pcp_enabled, the pcplists are drained.

Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
---
 Documentation/admin-guide/mm/transhuge.rst | 11 +++++
 include/linux/gfp.h                        |  1 +
 include/linux/huge_mm.h                    |  1 +
 mm/huge_memory.c                           | 47 ++++++++++++++++++++++
 mm/page_alloc.c                            | 16 ++++++++
 5 files changed, 76 insertions(+)
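
For anyone who wants to try the interface, a minimal usage sketch is
below. The hugepages-2048kB and hugepages-64kB directory names are only
examples (the available sizes depend on the architecture and base page
size); the sketch assumes a 4K base page, where 64kB pages are order 4,
i.e. above PAGE_ALLOC_COSTLY_ORDER:

  # PMD-sized THP is always PCP-backed; writes to it return -EINVAL
  cat /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/pcp_enabled

  # other high orders default to 0: enable, then disable again,
  # which drains the pcplists
  echo 1 > /sys/kernel/mm/transparent_hugepage/hugepages-64kB/pcp_enabled
  echo 0 > /sys/kernel/mm/transparent_hugepage/hugepages-64kB/pcp_enabled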