Message ID | 20240327213108.2384666-3-yuanchu@google.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | mm: workingset reporting | expand |
Yuanchu Xie <yuanchu@google.com> writes: > Hierarchically aggregate all memcgs' MGLRU generations and their > page counts into working set page age histograms. > The histograms break down the system's working set per-node, > per-anon/file. > > The sysfs interfaces are as follows: > /sys/devices/system/node/nodeX/workingset_report/page_age > A per-node page age histogram, showing an aggregate of the > node's lruvecs. The information is extracted from MGLRU's > per-generation page counters. Reading this file causes a > hierarchical aging of all lruvecs, scanning pages and creating a > new generation in each lruvec. > For example: > 1000 anon=0 file=0 > 2000 anon=0 file=0 > 100000 anon=5533696 file=5566464 > 18446744073709551615 anon=0 file=0 > > /sys/devices/system/node/nodeX/workingset_report/page_age_intervals > A comma-separated list of times in milliseconds that configures > what the page age histogram uses for aggregation. > > Signed-off-by: Yuanchu Xie <yuanchu@google.com> > --- > drivers/base/node.c | 3 + > include/linux/mmzone.h | 4 + > include/linux/workingset_report.h | 69 +++++ > mm/Kconfig | 9 + > mm/Makefile | 1 + > mm/internal.h | 9 + > mm/memcontrol.c | 2 + > mm/mmzone.c | 2 + > mm/vmscan.c | 34 ++- > mm/workingset_report.c | 413 ++++++++++++++++++++++++++++++ > 10 files changed, 545 insertions(+), 1 deletion(-) > create mode 100644 include/linux/workingset_report.h > create mode 100644 mm/workingset_report.c > > diff --git a/drivers/base/node.c b/drivers/base/node.c > index 1c05640461dd..4f589b8253f4 100644 > --- a/drivers/base/node.c > +++ b/drivers/base/node.c > @@ -20,6 +20,7 @@ > #include <linux/pm_runtime.h> > #include <linux/swap.h> > #include <linux/slab.h> > +#include <linux/workingset_report.h> > > static const struct bus_type node_subsys = { > .name = "node", > @@ -625,6 +626,7 @@ static int register_node(struct node *node, int num) > } else { > hugetlb_register_node(node); > compaction_register_node(node); > + wsr_register_node(node); > } > > return error; > @@ -641,6 +643,7 @@ 
void unregister_node(struct node *node) > { > hugetlb_unregister_node(node); > compaction_unregister_node(node); > + wsr_unregister_node(node); > node_remove_accesses(node); > node_remove_caches(node); > device_unregister(&node->dev); > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h > index a497f189d988..8839931646ee 100644 > --- a/include/linux/mmzone.h > +++ b/include/linux/mmzone.h > @@ -24,6 +24,7 @@ > #include <linux/local_lock.h> > #include <linux/zswap.h> > #include <asm/page.h> > +#include <linux/workingset_report.h> > > /* Free memory management - zoned buddy allocator. */ > #ifndef CONFIG_ARCH_FORCE_MAX_ORDER > @@ -625,6 +626,9 @@ struct lruvec { > struct lru_gen_mm_state mm_state; > #endif > #endif /* CONFIG_LRU_GEN */ > +#ifdef CONFIG_WORKINGSET_REPORT > + struct wsr_state wsr; > +#endif /* CONFIG_WORKINGSET_REPORT */ > #ifdef CONFIG_MEMCG > struct pglist_data *pgdat; > #endif > diff --git a/include/linux/workingset_report.h b/include/linux/workingset_report.h > new file mode 100644 > index 000000000000..0de640cb1ef0 > --- /dev/null > +++ b/include/linux/workingset_report.h > @@ -0,0 +1,69 @@ > +/* SPDX-License-Identifier: GPL-2.0 */ > +#ifndef _LINUX_WORKINGSET_REPORT_H > +#define _LINUX_WORKINGSET_REPORT_H > + > +#include <linux/types.h> > +#include <linux/mutex.h> > + > +struct mem_cgroup; > +struct pglist_data; > +struct node; > +struct lruvec; > + > +#ifdef CONFIG_WORKINGSET_REPORT > + > +#define WORKINGSET_REPORT_MIN_NR_BINS 2 > +#define WORKINGSET_REPORT_MAX_NR_BINS 32 > + > +#define WORKINGSET_INTERVAL_MAX ((unsigned long)-1) > +#define ANON_AND_FILE 2 > + > +struct wsr_report_bin { > + unsigned long idle_age; > + unsigned long nr_pages[ANON_AND_FILE]; > +}; > + > +struct wsr_report_bins { > + unsigned long nr_bins; > + /* last bin contains WORKINGSET_INTERVAL_MAX */ > + struct wsr_report_bin bins[WORKINGSET_REPORT_MAX_NR_BINS]; > +}; > + > +struct wsr_page_age_histo { > + unsigned long timestamp; > + struct wsr_report_bins bins; > 
+}; > + > +struct wsr_state { > + /* breakdown of workingset by page age */ > + struct mutex page_age_lock; > + struct wsr_page_age_histo *page_age; > +}; > + > +void wsr_init(struct lruvec *lruvec); > +void wsr_destroy(struct lruvec *lruvec); > + > +/* > + * Returns true if the wsr is configured to be refreshed. > + * The next refresh time is stored in refresh_time. > + */ > +bool wsr_refresh_report(struct wsr_state *wsr, struct mem_cgroup *root, > + struct pglist_data *pgdat); > +void wsr_register_node(struct node *node); > +void wsr_unregister_node(struct node *node); > +#else > +static inline void wsr_init(struct lruvec *lruvec) > +{ > +} > +static inline void wsr_destroy(struct lruvec *lruvec) > +{ > +} > +static inline void wsr_register_node(struct node *node) > +{ > +} > +static inline void wsr_unregister_node(struct node *node) > +{ > +} > +#endif /* CONFIG_WORKINGSET_REPORT */ > + > +#endif /* _LINUX_WORKINGSET_REPORT_H */ > diff --git a/mm/Kconfig b/mm/Kconfig > index ffc3a2ba3a8c..212f203b10b9 100644 > --- a/mm/Kconfig > +++ b/mm/Kconfig > @@ -1261,6 +1261,15 @@ config LOCK_MM_AND_FIND_VMA > config IOMMU_MM_DATA > bool > > +config WORKINGSET_REPORT > + bool "Working set reporting" > + depends on LRU_GEN && SYSFS > + help > + Report system and per-memcg working set to userspace. > + > + This option exports stats and events giving the user more insight > + into its memory working set. 
> + > source "mm/damon/Kconfig" > > endmenu > diff --git a/mm/Makefile b/mm/Makefile > index e4b5b75aaec9..57093657030d 100644 > --- a/mm/Makefile > +++ b/mm/Makefile > @@ -92,6 +92,7 @@ obj-$(CONFIG_DEVICE_MIGRATION) += migrate_device.o > obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o > obj-$(CONFIG_PAGE_COUNTER) += page_counter.o > obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o > +obj-$(CONFIG_WORKINGSET_REPORT) += workingset_report.o > ifdef CONFIG_SWAP > obj-$(CONFIG_MEMCG) += swap_cgroup.o > endif > diff --git a/mm/internal.h b/mm/internal.h > index f309a010d50f..5e0caba64ee4 100644 > --- a/mm/internal.h > +++ b/mm/internal.h > @@ -198,12 +198,21 @@ extern unsigned long highest_memmap_pfn; > /* > * in mm/vmscan.c: > */ > +struct scan_control; > bool isolate_lru_page(struct page *page); > bool folio_isolate_lru(struct folio *folio); > void putback_lru_page(struct page *page); > void folio_putback_lru(struct folio *folio); > extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason); > > +#ifdef CONFIG_WORKINGSET_REPORT > +/* > + * in mm/wsr.c > + */ > +/* Requires wsr->page_age_lock held */ > +void wsr_refresh_scan(struct lruvec *lruvec); > +#endif > + > /* > * in mm/rmap.c: > */ > diff --git a/mm/memcontrol.c b/mm/memcontrol.c > index 1ed40f9d3a27..2f07141de16c 100644 > --- a/mm/memcontrol.c > +++ b/mm/memcontrol.c > @@ -65,6 +65,7 @@ > #include <linux/seq_buf.h> > #include <linux/sched/isolation.h> > #include <linux/kmemleak.h> > +#include <linux/workingset_report.h> > #include "internal.h" > #include <net/sock.h> > #include <net/ip.h> > @@ -5457,6 +5458,7 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) > if (!pn) > return; > > + wsr_destroy(&pn->lruvec); > free_percpu(pn->lruvec_stats_percpu); > kfree(pn); > } > diff --git a/mm/mmzone.c b/mm/mmzone.c > index c01896eca736..efca44c1b84b 100644 > --- a/mm/mmzone.c > +++ b/mm/mmzone.c > @@ -90,6 +90,8 @@ void lruvec_init(struct lruvec 
*lruvec) > */ > list_del(&lruvec->lists[LRU_UNEVICTABLE]); > > + wsr_init(lruvec); > + > lru_gen_init_lruvec(lruvec); > } > > diff --git a/mm/vmscan.c b/mm/vmscan.c > index 1a7c7d537db6..b694d80ab2d1 100644 > --- a/mm/vmscan.c > +++ b/mm/vmscan.c > @@ -56,6 +56,7 @@ > #include <linux/khugepaged.h> > #include <linux/rculist_nulls.h> > #include <linux/random.h> > +#include <linux/workingset_report.h> > > #include <asm/tlbflush.h> > #include <asm/div64.h> > @@ -3815,7 +3816,7 @@ static bool inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, > return success; > } > > -static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, > +bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, > struct scan_control *sc, bool can_swap, bool force_scan) It appears that this change isn't necessary. > { > bool success; > @@ -5606,6 +5607,8 @@ static int __init init_lru_gen(void) > if (sysfs_create_group(mm_kobj, &lru_gen_attr_group)) > pr_err("lru_gen: failed to create sysfs group\n"); > > + wsr_register_node(NULL); > + > debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops); > debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops); > > @@ -5613,6 +5616,35 @@ static int __init init_lru_gen(void) > }; > late_initcall(init_lru_gen); > > +/****************************************************************************** > + * workingset reporting > + ******************************************************************************/ > +#ifdef CONFIG_WORKINGSET_REPORT > +void wsr_refresh_scan(struct lruvec *lruvec) > +{ > + DEFINE_MAX_SEQ(lruvec); > + struct scan_control sc = { > + .may_writepage = true, > + .may_unmap = true, > + .may_swap = true, > + .proactive = true, > + .reclaim_idx = MAX_NR_ZONES - 1, > + .gfp_mask = GFP_KERNEL, > + }; > + unsigned int flags; > + > + set_task_reclaim_state(current, &sc.reclaim_state); > + flags = memalloc_noreclaim_save(); > + /* > + * setting can_swap=true and force_scan=true ensures > 
+ * proper workingset stats when the system cannot swap. > + */ > + try_to_inc_max_seq(lruvec, max_seq, &sc, true, true); > + memalloc_noreclaim_restore(flags); > + set_task_reclaim_state(current, NULL); > +} > +#endif /* CONFIG_WORKINGSET_REPORT */ > + > #else /* !CONFIG_LRU_GEN */ > > static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) > diff --git a/mm/workingset_report.c b/mm/workingset_report.c > new file mode 100644 > index 000000000000..98cdaffcb6b4 > --- /dev/null > +++ b/mm/workingset_report.c > @@ -0,0 +1,413 @@ > +// SPDX-License-Identifier: GPL-2.0 > +// > +#include <linux/export.h> > +#include <linux/lockdep.h> > +#include <linux/jiffies.h> > +#include <linux/kernfs.h> > +#include <linux/memcontrol.h> > +#include <linux/rcupdate.h> > +#include <linux/mutex.h> > +#include <linux/err.h> > +#include <linux/atomic.h> > +#include <linux/node.h> > +#include <linux/mmzone.h> > +#include <linux/mm.h> > +#include <linux/mm_inline.h> > +#include <linux/workingset_report.h> > + > +#include "internal.h" > + > +void wsr_init(struct lruvec *lruvec) > +{ > + struct wsr_state *wsr = &lruvec->wsr; > + > + memset(wsr, 0, sizeof(*wsr)); > + mutex_init(&wsr->page_age_lock); > +} > + > +void wsr_destroy(struct lruvec *lruvec) > +{ > + struct wsr_state *wsr = &lruvec->wsr; > + > + mutex_destroy(&wsr->page_age_lock); > + kfree(wsr->page_age); > + memset(wsr, 0, sizeof(*wsr)); > +} > + > +static int workingset_report_intervals_parse(char *src, > + struct wsr_report_bins *bins) > +{ > + int err = 0, i = 0; > + char *cur, *next = strim(src); > + > + if (*next == '\0') > + return 0; > + > + while ((cur = strsep(&next, ","))) { > + unsigned int interval; > + > + err = kstrtouint(cur, 0, &interval); > + if (err) > + goto out; > + > + bins->bins[i].idle_age = msecs_to_jiffies(interval); > + if (i > 0 && bins->bins[i].idle_age <= bins->bins[i - 1].idle_age) { > + err = -EINVAL; > + goto out; > + } > + > + if (++i == WORKINGSET_REPORT_MAX_NR_BINS) { > + 
err = -ERANGE; > + goto out; > + } > + } > + > + if (i && i < WORKINGSET_REPORT_MIN_NR_BINS - 1) { > + err = -ERANGE; > + goto out; > + } > + > + bins->nr_bins = i; > + bins->bins[i].idle_age = WORKINGSET_INTERVAL_MAX; > +out: > + return err ?: i; > +} > + > +static unsigned long get_gen_start_time(const struct lru_gen_folio *lrugen, > + unsigned long seq, > + unsigned long max_seq, > + unsigned long curr_timestamp) > +{ > + int younger_gen; > + > + if (seq == max_seq) > + return curr_timestamp; > + younger_gen = lru_gen_from_seq(seq + 1); > + return READ_ONCE(lrugen->timestamps[younger_gen]); > +} > + > +static void collect_page_age_type(const struct lru_gen_folio *lrugen, > + struct wsr_report_bin *bin, > + unsigned long max_seq, unsigned long min_seq, > + unsigned long curr_timestamp, int type) > +{ > + unsigned long seq; > + > + for (seq = max_seq; seq + 1 > min_seq; seq--) { > + int gen, zone; > + unsigned long gen_end, gen_start, size = 0; > + > + gen = lru_gen_from_seq(seq); > + > + for (zone = 0; zone < MAX_NR_ZONES; zone++) > + size += max( > + READ_ONCE(lrugen->nr_pages[gen][type][zone]), > + 0L); > + > + gen_start = get_gen_start_time(lrugen, seq, max_seq, > + curr_timestamp); > + gen_end = READ_ONCE(lrugen->timestamps[gen]); > + > + while (bin->idle_age != WORKINGSET_INTERVAL_MAX && > + time_before(gen_end + bin->idle_age, curr_timestamp)) { > + unsigned long gen_in_bin = (long)gen_start - > + (long)curr_timestamp + > + (long)bin->idle_age; > + unsigned long gen_len = (long)gen_start - (long)gen_end; > + > + if (!gen_len) > + break; > + if (gen_in_bin) { > + unsigned long split_bin = > + size / gen_len * gen_in_bin; > + > + bin->nr_pages[type] += split_bin; > + size -= split_bin; > + } > + gen_start = curr_timestamp - bin->idle_age; > + bin++; > + } > + bin->nr_pages[type] += size; > + } > +} > + > +/* > + * proportionally aggregate Multi-gen LRU bins into a working set report > + * MGLRU generations: > + * current time > + * | max_seq timestamp > + * | 
| max_seq - 1 timestamp > + * | | | unbounded > + * | | | | > + * -------------------------------- > + * | max_seq | ... | ... | min_seq > + * -------------------------------- > + * > + * Bins: > + * > + * current time > + * | current - idle_age[0] > + * | | current - idle_age[1] > + * | | | unbounded > + * | | | | > + * ------------------------------ > + * | bin 0 | ... | ... | bin n-1 > + * ------------------------------ > + * > + * Assume the heuristic that pages are in the MGLRU generation > + * through uniform accesses, so we can aggregate them > + * proportionally into bins. > + */ > +static void collect_page_age(struct wsr_page_age_histo *page_age, > + const struct lruvec *lruvec) > +{ > + int type; > + const struct lru_gen_folio *lrugen = &lruvec->lrugen; > + unsigned long curr_timestamp = jiffies; > + unsigned long max_seq = READ_ONCE((lruvec)->lrugen.max_seq); > + unsigned long min_seq[ANON_AND_FILE] = { > + READ_ONCE(lruvec->lrugen.min_seq[LRU_GEN_ANON]), > + READ_ONCE(lruvec->lrugen.min_seq[LRU_GEN_FILE]), > + }; > + struct wsr_report_bins *bins = &page_age->bins; > + > + for (type = 0; type < ANON_AND_FILE; type++) { > + struct wsr_report_bin *bin = &bins->bins[0]; > + > + collect_page_age_type(lrugen, bin, max_seq, min_seq[type], > + curr_timestamp, type); > + } > +} > + > +/* First step: hierarchically scan child memcgs. */ > +static void refresh_scan(struct wsr_state *wsr, struct mem_cgroup *root, > + struct pglist_data *pgdat) > +{ > + struct mem_cgroup *memcg; > + > + memcg = mem_cgroup_iter(root, NULL, NULL); > + do { > + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); > + > + wsr_refresh_scan(lruvec); > + cond_resched(); > + } while ((memcg = mem_cgroup_iter(root, memcg, NULL))); > +} > + > +/* Second step: aggregate child memcgs into the page age histogram. 
*/ > +static void refresh_aggregate(struct wsr_page_age_histo *page_age, > + struct mem_cgroup *root, > + struct pglist_data *pgdat) > +{ > + struct mem_cgroup *memcg; > + struct wsr_report_bin *bin; > + > + /* > + * page_age_intervals should free the page_age struct > + * if no intervals are provided. > + */ > + VM_WARN_ON_ONCE(page_age->bins.bins[0].idle_age == > + WORKINGSET_INTERVAL_MAX); > + > + for (bin = page_age->bins.bins; > + bin->idle_age != WORKINGSET_INTERVAL_MAX; bin++) { > + bin->nr_pages[0] = 0; > + bin->nr_pages[1] = 0; > + } > + /* the last used bin has idle_age == WORKINGSET_INTERVAL_MAX. */ > + bin->nr_pages[0] = 0; > + bin->nr_pages[1] = 0; > + > + memcg = mem_cgroup_iter(root, NULL, NULL); > + do { > + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); > + > + collect_page_age(page_age, lruvec); > + cond_resched(); > + } while ((memcg = mem_cgroup_iter(root, memcg, NULL))); > + WRITE_ONCE(page_age->timestamp, jiffies); > +} > + > +bool wsr_refresh_report(struct wsr_state *wsr, struct mem_cgroup *root, > + struct pglist_data *pgdat) > +{ > + struct wsr_page_age_histo *page_age; > + > + if (!READ_ONCE(wsr->page_age)) > + return false; > + > + refresh_scan(wsr, root, pgdat); > + mutex_lock(&wsr->page_age_lock); > + page_age = READ_ONCE(wsr->page_age); > + if (page_age) > + refresh_aggregate(page_age, root, pgdat); > + mutex_unlock(&wsr->page_age_lock); > + return !!page_age; > +} > +EXPORT_SYMBOL_GPL(wsr_refresh_report); > + > +static struct pglist_data *kobj_to_pgdat(struct kobject *kobj) > +{ > + int nid = IS_ENABLED(CONFIG_NUMA) ? 
kobj_to_dev(kobj)->id : > + first_memory_node; > + > + return NODE_DATA(nid); > +} > + > +static struct wsr_state *kobj_to_wsr(struct kobject *kobj) > +{ > + return &mem_cgroup_lruvec(NULL, kobj_to_pgdat(kobj))->wsr; > +} > + > +static ssize_t page_age_intervals_show(struct kobject *kobj, > + struct kobj_attribute *attr, char *buf) > +{ > + int len = 0; > + struct wsr_state *wsr = kobj_to_wsr(kobj); > + > + mutex_lock(&wsr->page_age_lock); > + > + if (!!wsr->page_age) { > + int i; > + int nr_bins = wsr->page_age->bins.nr_bins; > + > + for (i = 0; i < nr_bins; ++i) { > + struct wsr_report_bin *bin = > + &wsr->page_age->bins.bins[i]; > + > + len += sysfs_emit_at(buf, len, "%u", > + jiffies_to_msecs(bin->idle_age)); > + if (i + 1 < nr_bins) > + len += sysfs_emit_at(buf, len, ","); > + } > + } > + len += sysfs_emit_at(buf, len, "\n"); > + > + mutex_unlock(&wsr->page_age_lock); > + return len; > +} > + > +static ssize_t page_age_intervals_store(struct kobject *kobj, > + struct kobj_attribute *attr, > + const char *src, size_t len) > +{ > + struct wsr_page_age_histo *page_age = NULL, *old; > + char *buf = NULL; > + int err = 0; > + struct wsr_state *wsr = kobj_to_wsr(kobj); > + > + buf = kstrdup(src, GFP_KERNEL); > + if (!buf) { > + err = -ENOMEM; > + goto failed; > + } > + > + page_age = > + kzalloc(sizeof(struct wsr_page_age_histo), GFP_KERNEL_ACCOUNT); > + > + if (!page_age) { > + err = -ENOMEM; > + goto failed; > + } > + > + err = workingset_report_intervals_parse(buf, &page_age->bins); > + if (err < 0) > + goto failed; > + > + if (err == 0) { > + kfree(page_age); > + page_age = NULL; > + } > + > + mutex_lock(&wsr->page_age_lock); > + old = xchg(&wsr->page_age, page_age); > + mutex_unlock(&wsr->page_age_lock); > + kfree(old); > + kfree(buf); > + return len; > +failed: > + kfree(page_age); > + kfree(buf); > + > + return err; > +} > + > +static struct kobj_attribute page_age_intervals_attr = > + __ATTR_RW(page_age_intervals); > + > +static ssize_t page_age_show(struct 
kobject *kobj, struct kobj_attribute *attr, > + char *buf) > +{ > + struct wsr_report_bin *bin; > + int ret = 0; > + struct wsr_state *wsr = kobj_to_wsr(kobj); > + > + if (!READ_ONCE(wsr->page_age)) > + return -EINVAL; > + > + wsr_refresh_report(wsr, NULL, kobj_to_pgdat(kobj)); > + > + mutex_lock(&wsr->page_age_lock); > + if (!wsr->page_age) { > + ret = -EINVAL; > + goto unlock; > + } > + > + for (bin = wsr->page_age->bins.bins; > + bin->idle_age != WORKINGSET_INTERVAL_MAX; bin++) > + ret += sysfs_emit_at(buf, ret, "%u anon=%lu file=%lu\n", > + jiffies_to_msecs(bin->idle_age), > + bin->nr_pages[0] * PAGE_SIZE, > + bin->nr_pages[1] * PAGE_SIZE); > + > + ret += sysfs_emit_at(buf, ret, "%lu anon=%lu file=%lu\n", > + WORKINGSET_INTERVAL_MAX, > + bin->nr_pages[0] * PAGE_SIZE, > + bin->nr_pages[1] * PAGE_SIZE); > + > +unlock: > + mutex_unlock(&wsr->page_age_lock); > + return ret; > +} > + > +static struct kobj_attribute page_age_attr = __ATTR_RO(page_age); > + > +static struct attribute *workingset_report_attrs[] = { > + &page_age_intervals_attr.attr, &page_age_attr.attr, NULL > +}; > + > +static const struct attribute_group workingset_report_attr_group = { > + .name = "workingset_report", > + .attrs = workingset_report_attrs, > +}; > + > +void wsr_register_node(struct node *node) > +{ > + struct kobject *kobj = node ? 
&node->dev.kobj : mm_kobj; > + struct wsr_state *wsr; > + > + if (IS_ENABLED(CONFIG_NUMA) && !node) > + return; > + > + wsr = kobj_to_wsr(kobj); > + > + if (sysfs_create_group(kobj, &workingset_report_attr_group)) { > + pr_warn("WSR failed to created group"); > + return; > + } > +} > +EXPORT_SYMBOL_GPL(wsr_register_node); > + > +void wsr_unregister_node(struct node *node) > +{ > + struct kobject *kobj = &node->dev.kobj; > + struct wsr_state *wsr; > + > + if (IS_ENABLED(CONFIG_NUMA) && !node) > + return; > + > + wsr = kobj_to_wsr(kobj); > + sysfs_remove_group(kobj, &workingset_report_attr_group); > + wsr_destroy(mem_cgroup_lruvec(NULL, kobj_to_pgdat(kobj))); > +} > +EXPORT_SYMBOL_GPL(wsr_unregister_node); -- Best Regards, Huang, Ying
diff --git a/drivers/base/node.c b/drivers/base/node.c index 1c05640461dd..4f589b8253f4 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -20,6 +20,7 @@ #include <linux/pm_runtime.h> #include <linux/swap.h> #include <linux/slab.h> +#include <linux/workingset_report.h> static const struct bus_type node_subsys = { .name = "node", @@ -625,6 +626,7 @@ static int register_node(struct node *node, int num) } else { hugetlb_register_node(node); compaction_register_node(node); + wsr_register_node(node); } return error; @@ -641,6 +643,7 @@ void unregister_node(struct node *node) { hugetlb_unregister_node(node); compaction_unregister_node(node); + wsr_unregister_node(node); node_remove_accesses(node); node_remove_caches(node); device_unregister(&node->dev); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index a497f189d988..8839931646ee 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -24,6 +24,7 @@ #include <linux/local_lock.h> #include <linux/zswap.h> #include <asm/page.h> +#include <linux/workingset_report.h> /* Free memory management - zoned buddy allocator. 
*/ #ifndef CONFIG_ARCH_FORCE_MAX_ORDER @@ -625,6 +626,9 @@ struct lruvec { struct lru_gen_mm_state mm_state; #endif #endif /* CONFIG_LRU_GEN */ +#ifdef CONFIG_WORKINGSET_REPORT + struct wsr_state wsr; +#endif /* CONFIG_WORKINGSET_REPORT */ #ifdef CONFIG_MEMCG struct pglist_data *pgdat; #endif diff --git a/include/linux/workingset_report.h b/include/linux/workingset_report.h new file mode 100644 index 000000000000..0de640cb1ef0 --- /dev/null +++ b/include/linux/workingset_report.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_WORKINGSET_REPORT_H +#define _LINUX_WORKINGSET_REPORT_H + +#include <linux/types.h> +#include <linux/mutex.h> + +struct mem_cgroup; +struct pglist_data; +struct node; +struct lruvec; + +#ifdef CONFIG_WORKINGSET_REPORT + +#define WORKINGSET_REPORT_MIN_NR_BINS 2 +#define WORKINGSET_REPORT_MAX_NR_BINS 32 + +#define WORKINGSET_INTERVAL_MAX ((unsigned long)-1) +#define ANON_AND_FILE 2 + +struct wsr_report_bin { + unsigned long idle_age; + unsigned long nr_pages[ANON_AND_FILE]; +}; + +struct wsr_report_bins { + unsigned long nr_bins; + /* last bin contains WORKINGSET_INTERVAL_MAX */ + struct wsr_report_bin bins[WORKINGSET_REPORT_MAX_NR_BINS]; +}; + +struct wsr_page_age_histo { + unsigned long timestamp; + struct wsr_report_bins bins; +}; + +struct wsr_state { + /* breakdown of workingset by page age */ + struct mutex page_age_lock; + struct wsr_page_age_histo *page_age; +}; + +void wsr_init(struct lruvec *lruvec); +void wsr_destroy(struct lruvec *lruvec); + +/* + * Returns true if the wsr is configured to be refreshed. + * The next refresh time is stored in refresh_time. 
+ */ +bool wsr_refresh_report(struct wsr_state *wsr, struct mem_cgroup *root, + struct pglist_data *pgdat); +void wsr_register_node(struct node *node); +void wsr_unregister_node(struct node *node); +#else +static inline void wsr_init(struct lruvec *lruvec) +{ +} +static inline void wsr_destroy(struct lruvec *lruvec) +{ +} +static inline void wsr_register_node(struct node *node) +{ +} +static inline void wsr_unregister_node(struct node *node) +{ +} +#endif /* CONFIG_WORKINGSET_REPORT */ + +#endif /* _LINUX_WORKINGSET_REPORT_H */ diff --git a/mm/Kconfig b/mm/Kconfig index ffc3a2ba3a8c..212f203b10b9 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1261,6 +1261,15 @@ config LOCK_MM_AND_FIND_VMA config IOMMU_MM_DATA bool +config WORKINGSET_REPORT + bool "Working set reporting" + depends on LRU_GEN && SYSFS + help + Report system and per-memcg working set to userspace. + + This option exports stats and events giving the user more insight + into its memory working set. + source "mm/damon/Kconfig" endmenu diff --git a/mm/Makefile b/mm/Makefile index e4b5b75aaec9..57093657030d 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -92,6 +92,7 @@ obj-$(CONFIG_DEVICE_MIGRATION) += migrate_device.o obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o obj-$(CONFIG_PAGE_COUNTER) += page_counter.o obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o +obj-$(CONFIG_WORKINGSET_REPORT) += workingset_report.o ifdef CONFIG_SWAP obj-$(CONFIG_MEMCG) += swap_cgroup.o endif diff --git a/mm/internal.h b/mm/internal.h index f309a010d50f..5e0caba64ee4 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -198,12 +198,21 @@ extern unsigned long highest_memmap_pfn; /* * in mm/vmscan.c: */ +struct scan_control; bool isolate_lru_page(struct page *page); bool folio_isolate_lru(struct folio *folio); void putback_lru_page(struct page *page); void folio_putback_lru(struct folio *folio); extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason); +#ifdef CONFIG_WORKINGSET_REPORT +/* + * 
in mm/vmscan.c + */ +/* Requires wsr->page_age_lock held */ +void wsr_refresh_scan(struct lruvec *lruvec); +#endif + /* * in mm/rmap.c: */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1ed40f9d3a27..2f07141de16c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -65,6 +65,7 @@ #include <linux/seq_buf.h> #include <linux/sched/isolation.h> #include <linux/kmemleak.h> +#include <linux/workingset_report.h> #include "internal.h" #include <net/sock.h> #include <net/ip.h> @@ -5457,6 +5458,7 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) if (!pn) return; + wsr_destroy(&pn->lruvec); free_percpu(pn->lruvec_stats_percpu); kfree(pn); } diff --git a/mm/mmzone.c b/mm/mmzone.c index c01896eca736..efca44c1b84b 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -90,6 +90,8 @@ void lruvec_init(struct lruvec *lruvec) */ list_del(&lruvec->lists[LRU_UNEVICTABLE]); + wsr_init(lruvec); + lru_gen_init_lruvec(lruvec); } diff --git a/mm/vmscan.c b/mm/vmscan.c index 1a7c7d537db6..b694d80ab2d1 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -56,6 +56,7 @@ #include <linux/khugepaged.h> #include <linux/rculist_nulls.h> #include <linux/random.h> +#include <linux/workingset_report.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -3815,7 +3816,7 @@ static bool inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, return success; } -static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, +bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, struct scan_control *sc, bool can_swap, bool force_scan) { bool success; @@ -5606,6 +5607,8 @@ static int __init init_lru_gen(void) if (sysfs_create_group(mm_kobj, &lru_gen_attr_group)) pr_err("lru_gen: failed to create sysfs group\n"); + wsr_register_node(NULL); + debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops); debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops); @@ -5613,6 +5616,35 @@ static int __init init_lru_gen(void) }; 
late_initcall(init_lru_gen); +/****************************************************************************** + * workingset reporting + ******************************************************************************/ +#ifdef CONFIG_WORKINGSET_REPORT +void wsr_refresh_scan(struct lruvec *lruvec) +{ + DEFINE_MAX_SEQ(lruvec); + struct scan_control sc = { + .may_writepage = true, + .may_unmap = true, + .may_swap = true, + .proactive = true, + .reclaim_idx = MAX_NR_ZONES - 1, + .gfp_mask = GFP_KERNEL, + }; + unsigned int flags; + + set_task_reclaim_state(current, &sc.reclaim_state); + flags = memalloc_noreclaim_save(); + /* + * setting can_swap=true and force_scan=true ensures + * proper workingset stats when the system cannot swap. + */ + try_to_inc_max_seq(lruvec, max_seq, &sc, true, true); + memalloc_noreclaim_restore(flags); + set_task_reclaim_state(current, NULL); +} +#endif /* CONFIG_WORKINGSET_REPORT */ + #else /* !CONFIG_LRU_GEN */ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) diff --git a/mm/workingset_report.c b/mm/workingset_report.c new file mode 100644 index 000000000000..98cdaffcb6b4 --- /dev/null +++ b/mm/workingset_report.c @@ -0,0 +1,413 @@ +// SPDX-License-Identifier: GPL-2.0 +// +#include <linux/export.h> +#include <linux/lockdep.h> +#include <linux/jiffies.h> +#include <linux/kernfs.h> +#include <linux/memcontrol.h> +#include <linux/rcupdate.h> +#include <linux/mutex.h> +#include <linux/err.h> +#include <linux/atomic.h> +#include <linux/node.h> +#include <linux/mmzone.h> +#include <linux/mm.h> +#include <linux/mm_inline.h> +#include <linux/workingset_report.h> + +#include "internal.h" + +void wsr_init(struct lruvec *lruvec) +{ + struct wsr_state *wsr = &lruvec->wsr; + + memset(wsr, 0, sizeof(*wsr)); + mutex_init(&wsr->page_age_lock); +} + +void wsr_destroy(struct lruvec *lruvec) +{ + struct wsr_state *wsr = &lruvec->wsr; + + mutex_destroy(&wsr->page_age_lock); + kfree(wsr->page_age); + memset(wsr, 0, sizeof(*wsr)); 
+}
+
+/*
+ * Parse a comma-separated list of intervals, given in milliseconds, into
+ * @bins. Each interval is converted to jiffies and must be strictly greater
+ * than the previous one. On success the entry following the last parsed
+ * interval is set to WORKINGSET_INTERVAL_MAX, which terminates the list.
+ *
+ * @src is modified in place by strim() and strsep().
+ *
+ * Returns the number of intervals parsed, 0 if @src is empty after trimming
+ * (@bins is then left untouched), or a negative errno: the kstrtouint() error
+ * for a malformed number, -EINVAL for a non-increasing interval, and -ERANGE
+ * when WORKINGSET_REPORT_MAX_NR_BINS or more intervals are given or fewer
+ * than WORKINGSET_REPORT_MIN_NR_BINS - 1. On error @bins may have been
+ * partially written.
+ */
+static int workingset_report_intervals_parse(char *src,
+					     struct wsr_report_bins *bins)
+{
+	int err = 0, i = 0;
+	char *cur, *next = strim(src);
+
+	if (*next == '\0')
+		return 0;
+
+	while ((cur = strsep(&next, ","))) {
+		unsigned int interval;
+
+		err = kstrtouint(cur, 0, &interval);
+		if (err)
+			goto out;
+
+		bins->bins[i].idle_age = msecs_to_jiffies(interval);
+		if (i > 0 && bins->bins[i].idle_age <= bins->bins[i - 1].idle_age) {
+			err = -EINVAL;
+			goto out;
+		}
+
+		/* One slot must stay free for the terminating bin. */
+		if (++i == WORKINGSET_REPORT_MAX_NR_BINS) {
+			err = -ERANGE;
+			goto out;
+		}
+	}
+
+	if (i && i < WORKINGSET_REPORT_MIN_NR_BINS - 1) {
+		err = -ERANGE;
+		goto out;
+	}
+
+	bins->nr_bins = i;
+	bins->bins[i].idle_age = WORKINGSET_INTERVAL_MAX;
+out:
+	return err ?: i;
+}
+
+/*
+ * Return the timestamp recorded for the generation one sequence number above
+ * @seq, or @curr_timestamp when @seq is already @max_seq.
+ */
+static unsigned long get_gen_start_time(const struct lru_gen_folio *lrugen,
+					unsigned long seq,
+					unsigned long max_seq,
+					unsigned long curr_timestamp)
+{
+	int younger_gen;
+
+	if (seq == max_seq)
+		return curr_timestamp;
+	younger_gen = lru_gen_from_seq(seq + 1);
+	return READ_ONCE(lrugen->timestamps[younger_gen]);
+}
+
+/*
+ * Walk the generations of @type from @max_seq down to @min_seq and add their
+ * page counts to the bins starting at @bin. Negative per-zone counters are
+ * counted as 0.
+ *
+ * While a generation's own timestamp is older than the current bin's cutoff
+ * (@curr_timestamp - idle_age), a share of its pages proportional to the time
+ * it overlaps that bin is added to the bin and the walk moves on to the next
+ * bin; whatever is left is added to the bin the walk stops at. @bin is only
+ * ever advanced, so its position carries over from one generation to the
+ * next.
+ */
+static void collect_page_age_type(const struct lru_gen_folio *lrugen,
+				  struct wsr_report_bin *bin,
+				  unsigned long max_seq, unsigned long min_seq,
+				  unsigned long curr_timestamp, int type)
+{
+	unsigned long seq;
+
+	/*
+	 * "seq + 1 > min_seq" rather than "seq >= min_seq" so that the loop
+	 * still terminates when min_seq is 0.
+	 */
+	for (seq = max_seq; seq + 1 > min_seq; seq--) {
+		int gen, zone;
+		unsigned long gen_end, gen_start, size = 0;
+
+		gen = lru_gen_from_seq(seq);
+
+		for (zone = 0; zone < MAX_NR_ZONES; zone++)
+			size += max(
+				READ_ONCE(lrugen->nr_pages[gen][type][zone]),
+				0L);
+
+		/*
+		 * gen_start is the newer edge of the generation (timestamp of
+		 * the next younger one), gen_end the older edge (its own
+		 * timestamp).
+		 */
+		gen_start = get_gen_start_time(lrugen, seq, max_seq,
+					       curr_timestamp);
+		gen_end = READ_ONCE(lrugen->timestamps[gen]);
+
+		while (bin->idle_age != WORKINGSET_INTERVAL_MAX &&
+		       time_before(gen_end + bin->idle_age, curr_timestamp)) {
+			unsigned long gen_in_bin = (long)gen_start -
+						   (long)curr_timestamp +
+						   (long)bin->idle_age;
+			unsigned long gen_len = (long)gen_start - (long)gen_end;
+
+			if (!gen_len)
+				break;
+			if (gen_in_bin) {
+				/*
+				 * NOTE(review): the division truncates before
+				 * the multiplication, so when size < gen_len
+				 * nothing is credited to this bin and all of
+				 * size moves on to older bins. Confirm whether
+				 * that loss of precision is acceptable.
+				 */
+				unsigned long split_bin =
+					size / gen_len * gen_in_bin;
+
+				bin->nr_pages[type] += split_bin;
+				size -= split_bin;
+			}
+			/* The remainder starts at this bin's cutoff. */
+			gen_start = curr_timestamp - bin->idle_age;
+			bin++;
+		}
+		bin->nr_pages[type] += size;
+	}
+}
+
+/*
+ * proportionally aggregate Multi-gen LRU bins into a working set report
+ * MGLRU generations:
+ * current time
+ * |         max_seq timestamp
+ * |         |     max_seq - 1 timestamp
+ * |         |     |              unbounded
+ * |         |     |              |
+ * --------------------------------
+ * | max_seq | ... | ... | min_seq
+ * --------------------------------
+ *
+ * Bins:
+ *
+ * current time
+ * |       current - idle_age[0]
+ * |       |     current - idle_age[1]
+ * |       |     |              unbounded
+ * |       |     |              |
+ * ------------------------------
+ * | bin 0 | ... | ... | bin n-1
+ * ------------------------------
+ *
+ * Assume the heuristic that pages are in the MGLRU generation
+ * through uniform accesses, so we can aggregate them
+ * proportionally into bins.
+ */
+static void collect_page_age(struct wsr_page_age_histo *page_age,
+			     const struct lruvec *lruvec)
+{
+	int type;
+	const struct lru_gen_folio *lrugen = &lruvec->lrugen;
+	/* Sample the time and sequence numbers once; both types use them. */
+	unsigned long curr_timestamp = jiffies;
+	unsigned long max_seq = READ_ONCE((lruvec)->lrugen.max_seq);
+	unsigned long min_seq[ANON_AND_FILE] = {
+		READ_ONCE(lruvec->lrugen.min_seq[LRU_GEN_ANON]),
+		READ_ONCE(lruvec->lrugen.min_seq[LRU_GEN_FILE]),
+	};
+	struct wsr_report_bins *bins = &page_age->bins;
+
+	for (type = 0; type < ANON_AND_FILE; type++) {
+		struct wsr_report_bin *bin = &bins->bins[0];
+
+		collect_page_age_type(lrugen, bin, max_seq, min_seq[type],
+				      curr_timestamp, type);
+	}
+}
+
+/* First step: hierarchically scan child memcgs. */
+static void refresh_scan(struct wsr_state *wsr, struct mem_cgroup *root,
+			 struct pglist_data *pgdat)
+{
+	struct mem_cgroup *memcg;
+
+	memcg = mem_cgroup_iter(root, NULL, NULL);
+	do {
+		struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
+
+		wsr_refresh_scan(lruvec);
+		cond_resched();
+	} while ((memcg = mem_cgroup_iter(root, memcg, NULL)));
+}
+
+/*
+ * Second step: aggregate child memcgs into the page age histogram.
+ *
+ * All bins of @page_age, including the terminating one, are reset to zero
+ * before the lruvecs are collected, and @page_age->timestamp is updated
+ * afterwards. wsr_refresh_report() calls this with wsr->page_age_lock held.
+ */
+static void refresh_aggregate(struct wsr_page_age_histo *page_age,
+			      struct mem_cgroup *root,
+			      struct pglist_data *pgdat)
+{
+	struct mem_cgroup *memcg;
+	struct wsr_report_bin *bin;
+
+	/*
+	 * page_age_intervals should free the page_age struct
+	 * if no intervals are provided.
+	 */
+	VM_WARN_ON_ONCE(page_age->bins.bins[0].idle_age ==
+			WORKINGSET_INTERVAL_MAX);
+
+	for (bin = page_age->bins.bins;
+	     bin->idle_age != WORKINGSET_INTERVAL_MAX; bin++) {
+		bin->nr_pages[0] = 0;
+		bin->nr_pages[1] = 0;
+	}
+	/* the last used bin has idle_age == WORKINGSET_INTERVAL_MAX. */
+	bin->nr_pages[0] = 0;
+	bin->nr_pages[1] = 0;
+
+	memcg = mem_cgroup_iter(root, NULL, NULL);
+	do {
+		struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
+
+		collect_page_age(page_age, lruvec);
+		cond_resched();
+	} while ((memcg = mem_cgroup_iter(root, memcg, NULL)));
+	WRITE_ONCE(page_age->timestamp, jiffies);
+}
+
+/*
+ * Refresh the page age histogram of @wsr from the memcgs under @root on
+ * @pgdat.
+ *
+ * The scan step runs without wsr->page_age_lock; wsr->page_age is therefore
+ * read again under the lock before aggregating, since
+ * page_age_intervals_store() may have replaced or cleared it in the meantime.
+ *
+ * Returns true if a histogram was configured and has been refreshed, false
+ * otherwise.
+ */
+bool wsr_refresh_report(struct wsr_state *wsr, struct mem_cgroup *root,
+			struct pglist_data *pgdat)
+{
+	struct wsr_page_age_histo *page_age;
+
+	if (!READ_ONCE(wsr->page_age))
+		return false;
+
+	refresh_scan(wsr, root, pgdat);
+	mutex_lock(&wsr->page_age_lock);
+	page_age = READ_ONCE(wsr->page_age);
+	if (page_age)
+		refresh_aggregate(page_age, root, pgdat);
+	mutex_unlock(&wsr->page_age_lock);
+	return !!page_age;
+}
+EXPORT_SYMBOL_GPL(wsr_refresh_report);
+
+/*
+ * Map a sysfs kobject to its node's pglist_data. With CONFIG_NUMA the node id
+ * is the id of the device embedding @kobj; otherwise first_memory_node is
+ * used and @kobj is not looked at.
+ */
+static struct pglist_data *kobj_to_pgdat(struct kobject *kobj)
+{
+	int nid = IS_ENABLED(CONFIG_NUMA) ? kobj_to_dev(kobj)->id :
+					    first_memory_node;
+
+	return NODE_DATA(nid);
+}
+
+/*
+ * Return the wsr_state of the lruvec that mem_cgroup_lruvec() yields for a
+ * NULL memcg on the node @kobj belongs to.
+ */
+static struct wsr_state *kobj_to_wsr(struct kobject *kobj)
+{
+	return &mem_cgroup_lruvec(NULL, kobj_to_pgdat(kobj))->wsr;
+}
+
+/*
+ * sysfs show: print the configured intervals, converted back to
+ * milliseconds, as one comma-separated line. Only a newline is printed when
+ * no histogram is configured.
+ */
+static ssize_t page_age_intervals_show(struct kobject *kobj,
+				       struct kobj_attribute *attr, char *buf)
+{
+	int len = 0;
+	struct wsr_state *wsr = kobj_to_wsr(kobj);
+
+	mutex_lock(&wsr->page_age_lock);
+
+	if (wsr->page_age) {
+		int i;
+		int nr_bins = wsr->page_age->bins.nr_bins;
+
+		for (i = 0; i < nr_bins; ++i) {
+			struct wsr_report_bin *bin =
+				&wsr->page_age->bins.bins[i];
+
+			len += sysfs_emit_at(buf, len, "%u",
+					     jiffies_to_msecs(bin->idle_age));
+			if (i + 1 < nr_bins)
+				len += sysfs_emit_at(buf, len, ",");
+		}
+	}
+	len += sysfs_emit_at(buf, len, "\n");
+
+	mutex_unlock(&wsr->page_age_lock);
+	return len;
+}
+
+/*
+ * sysfs store: replace the histogram configuration with the intervals parsed
+ * from @src. An empty input removes the histogram. The new histogram is
+ * swapped in under wsr->page_age_lock and the old one is freed afterwards.
+ *
+ * Returns @len on success, -ENOMEM on allocation failure, or the error from
+ * workingset_report_intervals_parse().
+ */
+static ssize_t page_age_intervals_store(struct kobject *kobj,
+					struct kobj_attribute *attr,
+					const char *src, size_t len)
+{
+	struct wsr_page_age_histo *page_age = NULL, *old;
+	char *buf = NULL;
+	int err = 0;
+	struct wsr_state *wsr = kobj_to_wsr(kobj);
+
+	/* The parser modifies its input, so work on a private copy. */
+	buf = kstrdup(src, GFP_KERNEL);
+	if (!buf) {
+		err = -ENOMEM;
+		goto failed;
+	}
+
+	page_age =
+		kzalloc(sizeof(struct wsr_page_age_histo), GFP_KERNEL_ACCOUNT);
+
+	if (!page_age) {
+		err = -ENOMEM;
+		goto failed;
+	}
+
+	err = workingset_report_intervals_parse(buf, &page_age->bins);
+	if (err < 0)
+		goto failed;
+
+	/* No intervals given: install NULL to disable the histogram. */
+	if (err == 0) {
+		kfree(page_age);
+		page_age = NULL;
+	}
+
+	mutex_lock(&wsr->page_age_lock);
+	old = xchg(&wsr->page_age, page_age);
+	mutex_unlock(&wsr->page_age_lock);
+	kfree(old);
+	kfree(buf);
+	return len;
+failed:
+	kfree(page_age);
+	kfree(buf);
+
+	return err;
+}
+
+static struct kobj_attribute page_age_intervals_attr =
+	__ATTR_RW(page_age_intervals);
+
+/*
+ * sysfs show: refresh the histogram and print one line per bin in the form
+ * "<idle age in ms> anon=<bytes> file=<bytes>", where the byte counts are the
+ * bin's page counts multiplied by PAGE_SIZE. The terminating bin is printed
+ * with WORKINGSET_INTERVAL_MAX as its age.
+ *
+ * Returns -EINVAL when no histogram is configured.
+ */
+static ssize_t page_age_show(struct kobject *kobj, struct kobj_attribute *attr,
+			     char *buf)
+{
+	struct wsr_report_bin *bin;
+	int ret = 0;
+	struct wsr_state *wsr = kobj_to_wsr(kobj);
+
+	if (!READ_ONCE(wsr->page_age))
+		return -EINVAL;
+
+	wsr_refresh_report(wsr, NULL, kobj_to_pgdat(kobj));
+
+	mutex_lock(&wsr->page_age_lock);
+	/* The histogram may have been removed since the check above. */
+	if (!wsr->page_age) {
+		ret = -EINVAL;
+		goto unlock;
+	}
+
+	for (bin = wsr->page_age->bins.bins;
+	     bin->idle_age != WORKINGSET_INTERVAL_MAX; bin++)
+		ret += sysfs_emit_at(buf, ret, "%u anon=%lu file=%lu\n",
+				     jiffies_to_msecs(bin->idle_age),
+				     bin->nr_pages[0] * PAGE_SIZE,
+				     bin->nr_pages[1] * PAGE_SIZE);
+
+	ret += sysfs_emit_at(buf, ret, "%lu anon=%lu file=%lu\n",
+			     WORKINGSET_INTERVAL_MAX,
+			     bin->nr_pages[0] * PAGE_SIZE,
+			     bin->nr_pages[1] * PAGE_SIZE);
+
+unlock:
+	mutex_unlock(&wsr->page_age_lock);
+	return ret;
+}
+
+static struct kobj_attribute page_age_attr = __ATTR_RO(page_age);
+
+static struct attribute *workingset_report_attrs[] = {
+	&page_age_intervals_attr.attr, &page_age_attr.attr, NULL
+};
+
+/* Named group: the attributes are created in a "workingset_report" directory. */
+static const struct attribute_group workingset_report_attr_group = {
+	.name = "workingset_report",
+	.attrs = workingset_report_attrs,
+};
+
+/*
+ * Create the workingset_report sysfs group for @node. With CONFIG_NUMA a NULL
+ * @node is ignored; without it a NULL @node places the group under mm_kobj.
+ * A failure to create the group is only logged.
+ */
+void wsr_register_node(struct node *node)
+{
+	struct kobject *kobj = node ? &node->dev.kobj : mm_kobj;
+
+	if (IS_ENABLED(CONFIG_NUMA) && !node)
+		return;
+
+	if (sysfs_create_group(kobj, &workingset_report_attr_group))
+		pr_warn("WSR failed to create group\n");
+}
+EXPORT_SYMBOL_GPL(wsr_register_node);
+
+/*
+ * Remove the workingset_report sysfs group created by wsr_register_node() and
+ * destroy the node's working set reporting state. The kobject is chosen the
+ * same way as in wsr_register_node(), so a NULL @node is never dereferenced.
+ */
+void wsr_unregister_node(struct node *node)
+{
+	struct kobject *kobj = node ? &node->dev.kobj : mm_kobj;
+
+	if (IS_ENABLED(CONFIG_NUMA) && !node)
+		return;
+
+	sysfs_remove_group(kobj, &workingset_report_attr_group);
+	wsr_destroy(mem_cgroup_lruvec(NULL, kobj_to_pgdat(kobj)));
+}
+EXPORT_SYMBOL_GPL(wsr_unregister_node);
Hierarchically aggregate all memcgs' MGLRU generations and their
page counts into working set page age histograms.
The histograms break down the system's working set per-node,
per-anon/file.

The sysfs interfaces are as follows:
/sys/devices/system/node/nodeX/workingset_report/page_age
	A per-node page age histogram, showing an aggregate of the
	node's lruvecs. The information is extracted from MGLRU's
	per-generation page counters. Reading this file causes a
	hierarchical aging of all lruvecs, scanning pages and creating
	a new generation in each lruvec.
	For example:
	1000 anon=0 file=0
	2000 anon=0 file=0
	100000 anon=5533696 file=5566464
	18446744073709551615 anon=0 file=0

/sys/devices/system/node/nodeX/workingset_report/page_age_intervals
	A comma-separated list of times in milliseconds that configures
	what the page age histogram uses for aggregation.

Signed-off-by: Yuanchu Xie <yuanchu@google.com>
---
 drivers/base/node.c               |   3 +
 include/linux/mmzone.h            |   4 +
 include/linux/workingset_report.h |  69 +++++
 mm/Kconfig                        |   9 +
 mm/Makefile                       |   1 +
 mm/internal.h                     |   9 +
 mm/memcontrol.c                   |   2 +
 mm/mmzone.c                       |   2 +
 mm/vmscan.c                       |  34 ++-
 mm/workingset_report.c            | 413 ++++++++++++++++++++++++++++++
 10 files changed, 545 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/workingset_report.h
 create mode 100644 mm/workingset_report.c