@@ -1045,6 +1045,16 @@ config PPC_SECVAR_SYSFS
read/write operations on these variables. Say Y if you have
secure boot enabled and want to expose variables to userspace.
+config PPC_HCA_HOTNESS
+ prompt "PowerPC HCA engine based page hotness"
+ def_bool y
+ depends on PPC_BOOK3S_64
+ select ARCH_HAS_PAGE_AGING
+ help
+ Use the HCA engine to track page hotness for multi-gen LRU aging.
+
+ If unsure, say Y.
+
endmenu
config ISA_DMA_API
new file mode 100644
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+/*
+ * Configuration helpers for the Hot-Cold Affinity helper
+ */
+
+#ifndef _ASM_POWERPC_HCA_H
+#define _ASM_POWERPC_HCA_H
+
+#include <linux/types.h>
+
+struct dentry;
+
+struct hca_entry {
+ unsigned long count;
+ unsigned long prev_count;
+ uint8_t age;
+};
+
+static inline unsigned long hotness_score(struct hca_entry *entry)
+{
+ unsigned long hotness;
+
+#if 0
+ /*
+ * Give more weight to prev_count because it carries the
+ * historical values. Take a smaller share of count as the
+ * entry ages, because prev_count then becomes a better
+ * approximation. We still consider count to accommodate
+ * spikes in access. The + 1 with age handles age == 0.
+ */
+ hotness = entry->prev_count + (entry->count / (entry->age + 1));
+#else
+ /*
+ * Real workloads rarely show pages with very high hotness, so
+ * a decay essentially moves the count value to prev_count and
+ * can be seen as periodic zeroing of the counter. The simple
+ * sum below gives better results on real workloads.
+ */
+ hotness = entry->prev_count + entry->count;
+#endif
+
+ return hotness;
+}
+
+extern void (*hca_backend_node_debugfs_init)(int numa_node, struct dentry *node_dentry);
+extern void (*hca_backend_debugfs_init)(struct dentry *root_dentry);
+extern int (*hca_pfn_entry)(unsigned long pfn, struct hca_entry *entry);
+extern bool (*hca_node_enabled)(int numa_node);
+extern int (*hca_clear_entry)(unsigned long pfn);
+
+#endif /* _ASM_POWERPC_HCA_H */
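
As a quick sanity check of the scoring arithmetic, here is a standalone
userspace sketch (illustrative only, with made-up counter values; not part
of the patch) showing how far the two variants diverge for the same entry:

    /*
     * Illustrative only: compare the two candidate formulas for a
     * hypothetical entry with a decayed history of 400 accesses and
     * a fresh burst of 100 accesses at age 3.
     */
    #include <stdint.h>
    #include <stdio.h>

    struct hca_entry {
            unsigned long count;
            unsigned long prev_count;
            uint8_t age;
    };

    int main(void)
    {
            struct hca_entry e = { .count = 100, .prev_count = 400, .age = 3 };

            /* age-weighted variant: history dominates as the entry ages */
            unsigned long weighted = e.prev_count + e.count / (e.age + 1);
            /* simple-sum variant the header actually uses */
            unsigned long simple = e.prev_count + e.count;

            printf("weighted=%lu simple=%lu\n", weighted, simple);
            /* prints: weighted=425 simple=500 */
            return 0;
    }
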
new file mode 100644
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef _ASM_POWERPC_PAGE_AGING_H_
+#define _ASM_POWERPC_PAGE_AGING_H_
+
+struct folio;
+struct lruvec;
+
+#ifdef CONFIG_LRU_GEN
+extern bool hca_lru_age;
+unsigned long hca_map_lru_seq(struct lruvec *lruvec, struct folio *folio);
+bool hca_try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
+ int scan_priority, bool can_swap, bool force_scan);
+
+#define arch_supports_page_access_count arch_supports_page_access_count
+static inline bool arch_supports_page_access_count(void)
+{
+ return hca_lru_age;
+}
+
+#define arch_try_to_inc_max_seq arch_try_to_inc_max_seq
+static inline bool arch_try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
+ int scan_priority, bool can_swap,
+ bool force_scan)
+{
+ return hca_try_to_inc_max_seq(lruvec, max_seq, scan_priority,
+ can_swap, force_scan);
+}
+
+#define arch_get_lru_gen_seq arch_get_lru_gen_seq
+static inline unsigned long arch_get_lru_gen_seq(struct lruvec *lruvec, struct folio *folio)
+{
+ return hca_map_lru_seq(lruvec, folio);
+}
+
+#endif /* CONFIG_LRU_GEN */
+#endif
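
The `#define arch_try_to_inc_max_seq arch_try_to_inc_max_seq` lines are what
make the `#ifndef` fallbacks in the generic linux/page_aging.h header (further
down) compile out when an architecture provides its own hooks. A minimal
userspace sketch of that idiom, with invented names:

    #include <stdbool.h>
    #include <stdio.h>

    /* "arch" header: define a macro with the function's own name */
    #define arch_has_fast_counter arch_has_fast_counter
    static inline bool arch_has_fast_counter(void)
    {
            return true;
    }

    /* "generic" header: fallback used only without an arch override */
    #ifndef arch_has_fast_counter
    static inline bool arch_has_fast_counter(void)
    {
            return false;
    }
    #endif

    int main(void)
    {
            /* prints 1: the arch version wins */
            printf("%d\n", arch_has_fast_counter());
            return 0;
    }
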
@@ -19,3 +19,4 @@ obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
obj-$(CONFIG_PPC_COPRO_BASE) += copro_fault.o
obj-$(CONFIG_PTDUMP_CORE) += ptdump/
obj-$(CONFIG_KASAN) += kasan/
+obj-$(CONFIG_PPC_HCA_HOTNESS) += hca.o
new file mode 100644
@@ -0,0 +1,284 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/debugfs.h>
+#include <linux/jiffies.h>
+#include <linux/mm.h>
+#include <linux/mm_inline.h>
+#include <linux/page_aging.h>
+
+#include <asm/hca.h>
+
+bool hca_lru_age;
+static struct dentry *hca_debugfs_root;
+/*
+ * Percentage of pfns to sample from each lruvec list when establishing the max/min hotness range
+ */
+static ulong scan_pfn_ratio __read_mostly = 20;
+/*
+ * Milliseconds to wait before the next hotness range scan
+ */
+static ulong scan_skip_msec __read_mostly = 60;
+
+/* backend callbacks */
+void (*hca_backend_node_debugfs_init)(int numa_node, struct dentry *node_dentry);
+void (*hca_backend_debugfs_init)(struct dentry *root_dentry);
+int (*hca_pfn_entry)(unsigned long pfn, struct hca_entry *entry);
+bool (*hca_node_enabled)(int numa_node);
+int (*hca_clear_entry)(unsigned long pfn);
+
+static int parse_hca_age(char *arg)
+{
+ return strtobool(arg, &hca_lru_age);
+}
+early_param("hca_age", parse_hca_age);
+
+static inline int folio_hca_entry(struct folio *folio, struct hca_entry *entry)
+{
+ return hca_pfn_entry(folio_pfn(folio), entry);
+}
+
+#ifdef CONFIG_LRU_GEN
+static inline int get_nr_gens(struct lruvec *lruvec, int type)
+{
+ return lruvec->lrugen.max_seq - lruvec->lrugen.min_seq[type] + 1;
+}
+
+/* FIXME: local copy of folio_evictable() from mm/internal.h */
+static inline bool folio_evictable(struct folio *folio)
+{
+ bool ret;
+
+ /* Prevent address_space of inode and swap cache from being freed */
+ rcu_read_lock();
+ ret = !mapping_unevictable(folio_mapping(folio)) &&
+ !folio_test_mlocked(folio);
+ rcu_read_unlock();
+ return ret;
+}
+
+static void reestablish_hotness_range(struct lruvec *lruvec)
+{
+ bool youngest = true;
+ int gen, nr_pages;
+ unsigned long seq;
+ int new_scan_pfn_count;
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ unsigned long current_hotness, max_hotness = 0, min_hotness = 0;
+
+ if (time_is_after_jiffies64(lrugen->next_span_scan))
+ return;
+
+ spin_lock_irq(&lruvec->lru_lock);
+
+retry:
+ for (int type = 0; type < ANON_AND_FILE; type++) {
+ for (int zone = 0; zone < MAX_NR_ZONES; zone++) {
+ int index = 0;
+ struct list_head *head;
+ struct folio *folio;
+ struct hca_entry entry;
+
+ if (youngest)
+ seq = lrugen->max_seq;
+ else
+ seq = lrugen->min_seq[type];
+ gen = lru_gen_from_seq(seq);
+ nr_pages = lrugen->nr_pages[gen][type][zone];
+
+ new_scan_pfn_count = nr_pages * scan_pfn_ratio / 100;
+ if (!new_scan_pfn_count)
+ new_scan_pfn_count = nr_pages;
+
+ head = &lrugen->lists[gen][type][zone];
+ list_for_each_entry(folio, head, lru) {
+ /* bound the walk; lru_lock is held with irqs off */
+ if (index++ > new_scan_pfn_count)
+ break;
+
+ if (unlikely(!folio_evictable(folio)))
+ continue;
+
+ if (folio_hca_entry(folio, &entry))
+ continue;
+
+ current_hotness = hotness_score(&entry);
+ /* If the page didn't see any access, skip it */
+ if (!current_hotness)
+ continue;
+ /*
+ * Make sure the entry has seen at least one decay
+ * update before it is used for the max/min
+ * computation.
+ */
+ if (entry.age < 1)
+ continue;
+
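+ /*
+ * Track the extremes as running averages rather
+ * than absolute max/min, so that one-off outliers
+ * decay out of the range over successive scans.
+ */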
+ if (current_hotness > max_hotness)
+ max_hotness = (current_hotness + max_hotness) / 2;
+ else if ((current_hotness < min_hotness) || !min_hotness)
+ min_hotness = (current_hotness + min_hotness) / 2;
+ else if ((current_hotness - min_hotness) < (max_hotness - min_hotness) / 2)
+ min_hotness = (current_hotness + min_hotness) / 2;
+ else
+ max_hotness = (current_hotness + max_hotness) / 2;
+ }
+ }
+ }
+ if (youngest) {
+ /* repeat the scan for the oldest generation */
+ youngest = false;
+ goto retry;
+ }
+ lrugen->next_span_scan = get_jiffies_64() + msecs_to_jiffies(scan_skip_msec);
+ if (min_hotness) {
+ lrugen->max_hotness = max_hotness;
+ lrugen->min_hotness = min_hotness;
+ }
+
+ spin_unlock_irq(&lruvec->lru_lock);
+}
+
+/* Map a folio's hotness score to a multi-gen LRU sequence number */
+unsigned long hca_map_lru_seq(struct lruvec *lruvec, struct folio *folio)
+{
+ unsigned long seq;
+ int type, nr_gens;
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct hca_entry folio_entry;
+ unsigned long hotness, seq_range;
+
+ type = folio_is_file_lru(folio);
+ if (!hca_lru_age || folio_hca_entry(folio, &folio_entry))
+ /* no hotness data; fall back to the oldest generation */
+ return lrugen->min_seq[type];
+
+ hotness = hotness_score(&folio_entry);
+ /* The page didn't see any access, return oldest generation */
+ if (!hotness)
+ return lrugen->min_seq[type];
+
+ /* Fold the current value into the running max/min estimates. */
+ if (hotness > lrugen->max_hotness) {
+ lrugen->max_hotness = (hotness + lrugen->max_hotness) / 2;
+ return lrugen->max_seq;
+ } else if (hotness < lrugen->min_hotness) {
+ lrugen->min_hotness = (hotness + lrugen->min_hotness) / 2;
+ return lrugen->min_seq[type];
+ }
+
+ /*
+ * Split the min..max hotness span (bounds inclusive) into one
+ * bucket per generation and place the current hotness into
+ * one of those buckets. The bucket index then becomes the
+ * increment over the oldest sequence number.
+ */
+ seq_range = lrugen->max_hotness - lrugen->min_hotness + 1;
+ nr_gens = get_nr_gens(lruvec, type);
+ seq_range = (seq_range + nr_gens - 1) / nr_gens;
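+ /*
+ * Worked example with hypothetical numbers: min_hotness = 100,
+ * max_hotness = 499 and nr_gens = 4 give seq_range = 100, so a
+ * folio with hotness 350 lands in bucket (350 - 100) / 100 = 2,
+ * i.e. two generations above min_seq.
+ */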
+
+ /* the higher the hotness, the younger the generation */
+ seq = lrugen->min_seq[type] + ((hotness - lrugen->min_hotness) / seq_range);
+
+ return seq;
+}
+
+bool hca_try_to_inc_max_seq(struct lruvec *lruvec,
+ unsigned long max_seq, int scan_priority,
+ bool can_swap, bool force_scan)
+{
+ bool success = false;
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+
+ VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq));
+
+ /* see the comment in iterate_mm_list() */
+ if (lruvec->seq_update_progress) {
+ success = false;
+ } else {
+ spin_lock_irq(&lruvec->lru_lock);
+
+ if (max_seq != lrugen->max_seq)
+ goto done;
+
+ if (lruvec->seq_update_progress)
+ goto done;
+
+ success = true;
+ lruvec->seq_update_progress = true;
+done:
+ spin_unlock_irq(&lruvec->lru_lock);
+ }
+ if (!success) {
+ if (scan_priority <= DEF_PRIORITY - 2)
+ wait_event_killable(lruvec->seq_update_wait,
+ max_seq < READ_ONCE(lrugen->max_seq));
+
+ return max_seq < READ_ONCE(lrugen->max_seq);
+ }
+
+ /*
+ * With hardware aging, use the access counters to refresh
+ * the lruvec's max/min hotness range.
+ */
+ reestablish_hotness_range(lruvec);
+
+ VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq));
+ inc_max_seq(lruvec, can_swap, force_scan);
+ /* either this sees any waiters or they will see updated max_seq */
+ if (wq_has_sleeper(&lruvec->seq_update_wait))
+ wake_up_all(&lruvec->seq_update_wait);
+
+ return success;
+}
+#endif /* CONFIG_LRU_GEN */
+
+static void hca_debugfs_init(void)
+{
+ int node;
+ char name[32];
+ struct dentry *node_dentry;
+
+ hca_debugfs_root = debugfs_create_dir("hca", arch_debugfs_dir);
+
+ for_each_online_node(node) {
+ snprintf(name, sizeof(name), "node%d", node);
+ node_dentry = debugfs_create_dir(name, hca_debugfs_root);
+
+ hca_backend_node_debugfs_init(node, node_dentry);
+ }
+
+ debugfs_create_ulong("scan-pfn-ratio", 0600, hca_debugfs_root,
+ &scan_pfn_ratio);
+ debugfs_create_ulong("scan-skip-msec", 0600, hca_debugfs_root,
+ &scan_skip_msec);
+ debugfs_create_bool("hca_lru_age", 0600, hca_debugfs_root,
+ &hca_lru_age);
+
+ /* Now create the backend's own debugfs entries */
+ hca_backend_debugfs_init(hca_debugfs_root);
+}
+
+static int __init hca_init(void)
+{
+ if (!hca_backend_debugfs_init) {
+ pr_info("No HCA backend registered, disabling HCA-based LRU aging\n");
+ hca_lru_age = false;
+ return 0;
+ }
+
+ hca_debugfs_init();
+ return 0;
+}
+
+late_initcall(hca_init);
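
The patch exports the hook pointers but does not include the HCA device
driver itself, so registration is presumably done by the backend at probe
time, before hca_init() runs at late_initcall. A hypothetical sketch of such
a backend (all my_* names are invented for illustration):

    #include <linux/debugfs.h>
    #include <linux/module.h>

    #include <asm/hca.h>

    static bool my_node_enabled(int numa_node)
    {
            /* pretend only node 0 has an HCA engine */
            return numa_node == 0;
    }

    static int my_pfn_entry(unsigned long pfn, struct hca_entry *entry)
    {
            /* a real driver would read the device's counter table */
            entry->count = 0;
            entry->prev_count = 0;
            entry->age = 0;
            return 0;
    }

    static int my_clear_entry(unsigned long pfn)
    {
            return 0;
    }

    static void my_node_debugfs_init(int numa_node, struct dentry *node_dentry)
    {
            /* per-node counter dump files would be created here */
    }

    static void my_debugfs_init(struct dentry *root_dentry)
    {
            /* device-wide knobs would be created here */
    }

    /* device_initcall (level 6) runs before late_initcall (level 7) */
    static int __init my_hca_backend_init(void)
    {
            hca_node_enabled = my_node_enabled;
            hca_pfn_entry = my_pfn_entry;
            hca_clear_entry = my_clear_entry;
            hca_backend_node_debugfs_init = my_node_debugfs_init;
            hca_backend_debugfs_init = my_debugfs_init;
            return 0;
    }
    device_initcall(my_hca_backend_init);
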
@@ -425,6 +425,13 @@ struct lru_gen_struct {
atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
+#ifndef CONFIG_LRU_TASK_PAGE_AGING
+ /* running estimate of the hotness range seen on this lruvec */
+ unsigned long max_hotness;
+ unsigned long min_hotness;
+ /* jiffies64 time gating the next hotness range scan */
+ u64 next_span_scan;
+#endif
/* whether the multi-gen LRU is enabled */
bool enabled;
};
@@ -3,6 +3,10 @@
#ifndef _LINUX_PAGE_AGING_H
#define _LINUX_PAGE_AGING_H
+#ifdef CONFIG_ARCH_HAS_PAGE_AGING
+#include <asm/page_aging.h>
+#endif
+
#ifndef arch_supports_page_access_count
static inline bool arch_supports_page_access_count(void)
{
@@ -14,6 +18,7 @@ static inline bool arch_supports_page_access_count(void)
bool __try_to_inc_max_seq(struct lruvec *lruvec,
unsigned long max_seq, int scan_priority,
bool can_swap, bool force_scan);
+void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan);
#ifndef arch_get_lru_gen_seq
static inline unsigned long arch_get_lru_gen_seq(struct lruvec *lruvec, struct folio *folio)
@@ -1181,6 +1181,10 @@ config LRU_GEN_STATS
from evicted generations for debugging purpose.
This option has a per-memcg and per-node memory overhead.
+
+config ARCH_HAS_PAGE_AGING
+ bool
+
# }
source "mm/damon/Kconfig"
@@ -4362,7 +4362,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap)
return success;
}
-static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan)
+void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan)
{
int prev, next;
int type, zone;
@@ -4420,6 +4420,7 @@ static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan)
#endif
spin_unlock_irq(&lruvec->lru_lock);
}
+
#ifdef CONFIG_LRU_TASK_PAGE_AGING
static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
int scan_priority, bool can_swap, bool force_scan)
@@ -5861,7 +5862,12 @@ static int lru_gen_seq_show(struct seq_file *m, void *v)
seq_printf(m, "memcg %5hu %s\n", mem_cgroup_id(memcg), path);
}
- seq_printf(m, " node %5d\n", nid);
+#ifndef CONFIG_LRU_TASK_PAGE_AGING
+ seq_printf(m, " node %5d max_hotness %lu min_hotness %lu\n",
+ nid, lrugen->max_hotness, lrugen->min_hotness);
+#else
+ seq_printf(m, " node %5d\n", nid);
+#endif
if (!full)
seq = min_seq[LRU_GEN_ANON];
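
With the hunk above (and CONFIG_LRU_TASK_PAGE_AGING unset), the per-node line
of /sys/kernel/debug/lru_gen gains the tracked range, e.g. (values
illustrative):

     node     0 max_hotness 250000 min_hotness 1200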