@@ -21,6 +21,7 @@
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
+#include <linux/sched/cputime.h>
#include <linux/rwsem.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
@@ -248,6 +249,9 @@ static struct kmem_cache *rmap_item_cache;
static struct kmem_cache *stable_node_cache;
static struct kmem_cache *mm_slot_cache;
+/* Default number of pages to scan per batch */
+#define DEFAULT_PAGES_TO_SCAN 100
+
/* The number of pages scanned */
static unsigned long ksm_pages_scanned;
@@ -276,7 +280,7 @@ static unsigned int ksm_stable_node_chains_prune_millisecs = 2000;
static int ksm_max_page_sharing = 256;
/* Number of pages ksmd should scan in one batch */
-static unsigned int ksm_thread_pages_to_scan = 100;
+static unsigned int ksm_thread_pages_to_scan = DEFAULT_PAGES_TO_SCAN;
/* Milliseconds ksmd should sleep between batches */
static unsigned int ksm_thread_sleep_millisecs = 20;
@@ -297,6 +301,152 @@ unsigned long ksm_zero_pages;
/* The number of pages that have been skipped due to "smart scanning" */
static unsigned long ksm_pages_skipped;
+/* Don't scan more than max pages per batch. */
+static unsigned long ksm_advisor_max_pages_to_scan = 30000;
+
+/* Min CPU for scanning pages per scan */
+#define KSM_ADVISOR_MIN_CPU 10
+
+/* Max CPU for scanning pages per scan */
+static unsigned int ksm_advisor_max_cpu = 70;
+
+/* Target scan time in seconds to analyze all KSM candidate pages. */
+static unsigned long ksm_advisor_target_scan_time = 200;
+
+/* Exponentially weighted moving average. */
+#define EWMA_WEIGHT 30
+
+/**
+ * struct advisor_ctx - metadata for KSM advisor
+ * @start_scan: start time of the current scan
+ * @scan_time: scan time of previous scan
+ * @change: change in percent to pages_to_scan parameter
+ * @cpu_time: cpu time consumed by the ksmd thread in the previous scan
+ */
+struct advisor_ctx {
+ ktime_t start_scan;
+ unsigned long scan_time;
+ unsigned long change;
+ unsigned long long cpu_time;
+};
+static struct advisor_ctx advisor_ctx;
+
+/* Define different advisor's */
+enum ksm_advisor_type {
+ KSM_ADVISOR_NONE,
+ KSM_ADVISOR_SCAN_TIME,
+};
+static enum ksm_advisor_type ksm_advisor;
+
+static inline void advisor_start_scan(void)
+{
+ if (ksm_advisor == KSM_ADVISOR_SCAN_TIME)
+ advisor_ctx.start_scan = ktime_get();
+}
+
+/*
+ * Use previous scan time if available, otherwise use current scan time as an
+ * approximation for the previous scan time.
+ */
+static inline unsigned long prev_scan_time(struct advisor_ctx *ctx,
+ unsigned long scan_time)
+{
+ return ctx->scan_time ? ctx->scan_time : scan_time;
+}
+
+/* Calculate exponential weighted moving average */
+static unsigned long ewma(unsigned long prev, unsigned long curr)
+{
+ return ((100 - EWMA_WEIGHT) * prev + EWMA_WEIGHT * curr) / 100;
+}
+
+/*
+ * The scan time advisor is based on the current scan rate and the target
+ * scan rate.
+ *
+ * new_pages_to_scan = pages_to_scan * (scan_time / target_scan_time)
+ *
+ * To avoid perturbations it calculates a change factor of previous changes.
+ * A new change factor is calculated for each iteration and it uses an
+ * exponentially weighted moving average. The new pages_to_scan value is
+ * multiplied with that change factor:
+ *
+ * new_pages_to_scan *= change facor
+ *
+ * The new_pages_to_scan value is limited by the cpu min and max values. It
+ * calculates the cpu percent for the last scan and calculates the new
+ * estimated cpu percent cost for the next scan. That value is capped by the
+ * cpu min and max setting.
+ *
+ * In addition the new pages_to_scan value is capped by the max and min
+ * limits.
+ */
+static void scan_time_advisor(void)
+{
+ unsigned int cpu_percent;
+ unsigned long cpu_time;
+ unsigned long cpu_time_diff;
+ unsigned long cpu_time_diff_ms;
+ unsigned long pages;
+ unsigned long per_page_cost;
+ unsigned long factor;
+ unsigned long change;
+ unsigned long last_scan_time;
+ unsigned long scan_time;
+
+ /* Convert scan time to seconds */
+ scan_time = div_s64(ktime_ms_delta(ktime_get(), advisor_ctx.start_scan),
+ MSEC_PER_SEC);
+ scan_time = scan_time ? scan_time : 1;
+
+ /* Calculate CPU consumption of ksmd background thread */
+ cpu_time = task_sched_runtime(current);
+ cpu_time_diff = cpu_time - advisor_ctx.cpu_time;
+ cpu_time_diff_ms = cpu_time_diff / 1000 / 1000;
+
+ cpu_percent = (cpu_time_diff_ms * 100) / (scan_time * 1000);
+ cpu_percent = cpu_percent ? cpu_percent : 1;
+ last_scan_time = prev_scan_time(&advisor_ctx, scan_time);
+
+ /* Calculate scan time as percentage of target scan time */
+ factor = ksm_advisor_target_scan_time * 100 / scan_time;
+ factor = factor ? factor : 1;
+
+ /*
+ * Calculate scan time as percentage of last scan time and use
+ * exponentially weighted average to smooth it
+ */
+ change = scan_time * 100 / last_scan_time;
+ change = change ? change : 1;
+ change = ewma(advisor_ctx.change, change);
+
+ /* Calculate new scan rate based on target scan rate. */
+ pages = ksm_thread_pages_to_scan * 100 / factor;
+ /* Update pages_to_scan by weighted change percentage. */
+ pages = pages * change / 100;
+
+ /* Cap new pages_to_scan value */
+ per_page_cost = ksm_thread_pages_to_scan / cpu_percent;
+ per_page_cost = per_page_cost ? per_page_cost : 1;
+
+ pages = min(pages, per_page_cost * ksm_advisor_max_cpu);
+ pages = max(pages, per_page_cost * KSM_ADVISOR_MIN_CPU);
+ pages = min(pages, ksm_advisor_max_pages_to_scan);
+
+ /* Update advisor context */
+ advisor_ctx.change = change;
+ advisor_ctx.scan_time = scan_time;
+ advisor_ctx.cpu_time = cpu_time;
+
+ ksm_thread_pages_to_scan = pages;
+}
+
+static void advisor_stop_scan(void)
+{
+ if (ksm_advisor == KSM_ADVISOR_SCAN_TIME)
+ scan_time_advisor();
+}
+
#ifdef CONFIG_NUMA
/* Zeroed when merging across nodes is not allowed */
static unsigned int ksm_merge_across_nodes = 1;
@@ -2401,6 +2551,7 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
mm_slot = ksm_scan.mm_slot;
if (mm_slot == &ksm_mm_head) {
+ advisor_start_scan();
trace_ksm_start_scan(ksm_scan.seqnr, ksm_rmap_items);
/*
@@ -2558,6 +2709,8 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
if (mm_slot != &ksm_mm_head)
goto next_mm;
+ advisor_stop_scan();
+
trace_ksm_stop_scan(ksm_scan.seqnr, ksm_rmap_items);
ksm_scan.seqnr++;
return NULL;
@@ -3248,6 +3401,9 @@ static ssize_t pages_to_scan_store(struct kobject *kobj,
unsigned int nr_pages;
int err;
+ if (ksm_advisor != KSM_ADVISOR_NONE)
+ return -EINVAL;
+
err = kstrtouint(buf, 10, &nr_pages);
if (err)
return -EINVAL;