new file mode 100644
@@ -0,0 +1,167 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * DAMON api
+ *
+ * Author: SeongJae Park <sjpark@amazon.de>
+ */
+
+#ifndef _DAMON_H_
+#define _DAMON_H_
+
+#include <linux/mutex.h>
+#include <linux/time64.h>
+#include <linux/types.h>
+
+struct damon_ctx;
+
+/**
+ * struct damon_primitive Monitoring primitives for given use cases.
+ *
+ * @init: Initialize primitive-internal data structures.
+ * @update: Update primitive-internal data structures.
+ * @prepare_access_checks: Prepare next access check of target regions.
+ * @check_accesses: Check the accesses to target regions.
+ * @reset_aggregated: Reset aggregated accesses monitoring results.
+ * @target_valid: Determine if the target is valid.
+ * @cleanup: Clean up the context.
+ *
+ * DAMON can be extended for various address spaces and usages. For this,
+ * users should register the low level primitives for their target address
+ * space and usecase via the &damon_ctx.primitive. Then, the monitoring thread
+ * (&damon_ctx.kdamond) calls @init and @prepare_access_checks before starting
+ * the monitoring, @update after each &damon_ctx.primitive_update_interval, and
+ * @check_accesses, @target_valid and @prepare_access_checks after each
+ * &damon_ctx.sample_interval. Finally, @reset_aggregated is called after each
+ * &damon_ctx.aggr_interval.
+ *
+ * @init should initialize primitive-internal data structures. For example,
+ * this could be used to construct proper monitoring target regions and link
+ * those to @damon_ctx.target.
+ * @update should update the primitive-internal data structures. For example,
+ * this could be used to update monitoring target regions for current status.
+ * @prepare_access_checks should manipulate the monitoring regions to be
+ * prepared for the next access check.
+ * @check_accesses should check the accesses to each region that made after the
+ * last preparation and update the number of observed accesses of each region.
+ * @reset_aggregated should reset the access monitoring results that aggregated
+ * by @check_accesses.
+ * @target_valid should check whether the target is still valid for the
+ * monitoring.
+ * @cleanup is called from @kdamond just before its termination.
+ */
+struct damon_primitive {
+ void (*init)(struct damon_ctx *context);
+ void (*update)(struct damon_ctx *context);
+ void (*prepare_access_checks)(struct damon_ctx *context);
+ void (*check_accesses)(struct damon_ctx *context);
+ void (*reset_aggregated)(struct damon_ctx *context);
+ bool (*target_valid)(void *target);
+ void (*cleanup)(struct damon_ctx *context);
+};
+
+/*
+ * struct damon_callback Monitoring events notification callbacks.
+ *
+ * @before_start: Called before starting the monitoring.
+ * @after_sampling: Called after each sampling.
+ * @after_aggregation: Called after each aggregation.
+ * @before_terminate: Called before terminating the monitoring.
+ * @private: User private data.
+ *
+ * The monitoring thread (&damon_ctx.kdamond) calls @before_start and
+ * @before_terminate just before starting and finishing the monitoring,
+ * respectively. Therefore, those are good places for installing and cleaning
+ * @private.
+ *
+ * The monitoring thread calls @after_sampling and @after_aggregation for each
+ * of the sampling intervals and aggregation intervals, respectively.
+ * Therefore, users can safely access the monitoring results without additional
+ * protection. For the reason, users are recommended to use these callback for
+ * the accesses to the results.
+ *
+ * If any callback returns non-zero, monitoring stops.
+ */
+struct damon_callback {
+ void *private;
+
+ int (*before_start)(struct damon_ctx *context);
+ int (*after_sampling)(struct damon_ctx *context);
+ int (*after_aggregation)(struct damon_ctx *context);
+ int (*before_terminate)(struct damon_ctx *context);
+};
+
+/**
+ * struct damon_ctx - Represents a context for each monitoring. This is the
+ * main interface that allows users to set the attributes and get the results
+ * of the monitoring.
+ *
+ * @sample_interval: The time between access samplings.
+ * @aggr_interval: The time between monitor results aggregations.
+ * @primitive_update_interval: The time between monitoring primitive updates.
+ *
+ * For each @sample_interval, DAMON checks whether each region is accessed or
+ * not. It aggregates and keeps the access information (number of accesses to
+ * each region) for @aggr_interval time. DAMON also checks whether the target
+ * memory regions need update (e.g., by ``mmap()`` calls from the application,
+ * in case of virtual memory monitoring) and applies the changes for each
+ * @primitive_update_interval. All time intervals are in micro-seconds.
+ * Please refer to &struct damon_primitive and &struct damon_callback for more
+ * detail.
+ *
+ * @kdamond: Kernel thread who does the monitoring.
+ * @kdamond_stop: Notifies whether kdamond should stop.
+ * @kdamond_lock: Mutex for the synchronizations with @kdamond.
+ *
+ * For each monitoring context, one kernel thread for the monitoring is
+ * created. The pointer to the thread is stored in @kdamond.
+ *
+ * Once started, the monitoring thread runs until explicitly required to be
+ * terminated or every monitoring target is invalid. The validity of the
+ * targets is checked via the &damon_primitive.target_valid of @primitive. The
+ * termination can also be explicitly requested by writing non-zero to
+ * @kdamond_stop. The thread sets @kdamond to NULL when it terminates.
+ * Therefore, users can know whether the monitoring is ongoing or terminated by
+ * reading @kdamond. Reads and writes to @kdamond and @kdamond_stop from
+ * outside of the monitoring thread must be protected by @kdamond_lock.
+ *
+ * Note that the monitoring thread protects only @kdamond and @kdamond_stop via
+ * @kdamond_lock. Accesses to other fields must be protected by themselves.
+ *
+ * @primitive: Set of monitoring primitives for given use cases.
+ * @callback: Set of callbacks for monitoring events notifications.
+ *
+ * @target: Pointer to the user-defined monitoring target.
+ */
+struct damon_ctx {
+ unsigned long sample_interval;
+ unsigned long aggr_interval;
+ unsigned long primitive_update_interval;
+
+/* private: internal use only */
+ struct timespec64 last_aggregation;
+ struct timespec64 last_primitive_update;
+
+/* public: */
+ struct task_struct *kdamond;
+ bool kdamond_stop;
+ struct mutex kdamond_lock;
+
+ struct damon_primitive primitive;
+ struct damon_callback callback;
+
+ void *target;
+};
+
+#ifdef CONFIG_DAMON
+
+struct damon_ctx *damon_new_ctx(void);
+void damon_destroy_ctx(struct damon_ctx *ctx);
+int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int,
+ unsigned long aggr_int, unsigned long primitive_upd_int);
+
+int damon_start(struct damon_ctx **ctxs, int nr_ctxs);
+int damon_stop(struct damon_ctx **ctxs, int nr_ctxs);
+
+#endif /* CONFIG_DAMON */
+
+#endif /* _DAMON_H */
@@ -859,4 +859,6 @@ config ARCH_HAS_HUGEPD
config MAPPING_DIRTY_HELPERS
bool
+source "mm/damon/Kconfig"
+
endmenu
@@ -120,3 +120,4 @@ obj-$(CONFIG_MEMFD_CREATE) += memfd.o
obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o
obj-$(CONFIG_PTDUMP_CORE) += ptdump.o
obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o
+obj-$(CONFIG_DAMON) += damon/
new file mode 100644
@@ -0,0 +1,15 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+menu "Data Access Monitoring"
+
+config DAMON
+ bool "DAMON: Data Access Monitoring Framework"
+ help
+ This builds a framework that allows kernel subsystems to monitor
+ access frequency of each memory region. The information can be useful
+ for performance-centric DRAM level memory management.
+
+ See https://damonitor.github.io/doc/html/latest-damon/index.html for
+ more information.
+
+endmenu
new file mode 100644
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-$(CONFIG_DAMON) := core.o
new file mode 100644
@@ -0,0 +1,318 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Data Access Monitor
+ *
+ * Author: SeongJae Park <sjpark@amazon.de>
+ */
+
+#define pr_fmt(fmt) "damon: " fmt
+
+#include <linux/damon.h>
+#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/slab.h>
+
+static DEFINE_MUTEX(damon_lock);
+static int nr_running_ctxs;
+
+struct damon_ctx *damon_new_ctx(void)
+{
+ struct damon_ctx *ctx;
+
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return NULL;
+
+ ctx->sample_interval = 5 * 1000;
+ ctx->aggr_interval = 100 * 1000;
+ ctx->primitive_update_interval = 1000 * 1000;
+
+ ktime_get_coarse_ts64(&ctx->last_aggregation);
+ ctx->last_primitive_update = ctx->last_aggregation;
+
+ mutex_init(&ctx->kdamond_lock);
+
+ ctx->target = NULL;
+
+ return ctx;
+}
+
+void damon_destroy_ctx(struct damon_ctx *ctx)
+{
+ if (ctx->primitive.cleanup)
+ ctx->primitive.cleanup(ctx);
+ kfree(ctx);
+}
+
+/**
+ * damon_set_attrs() - Set attributes for the monitoring.
+ * @ctx: monitoring context
+ * @sample_int: time interval between samplings
+ * @aggr_int: time interval between aggregations
+ * @primitive_upd_int: time interval between monitoring primitive updates
+ *
+ * This function should not be called while the kdamond is running.
+ * Every time interval is in micro-seconds.
+ *
+ * Return: 0 on success, negative error code otherwise.
+ */
+int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int,
+ unsigned long aggr_int, unsigned long primitive_upd_int)
+{
+ ctx->sample_interval = sample_int;
+ ctx->aggr_interval = aggr_int;
+ ctx->primitive_update_interval = primitive_upd_int;
+
+ return 0;
+}
+
+static bool damon_kdamond_running(struct damon_ctx *ctx)
+{
+ bool running;
+
+ mutex_lock(&ctx->kdamond_lock);
+ running = ctx->kdamond != NULL;
+ mutex_unlock(&ctx->kdamond_lock);
+
+ return running;
+}
+
+static int kdamond_fn(void *data);
+
+/*
+ * __damon_start() - Starts monitoring with given context.
+ * @ctx: monitoring context
+ *
+ * This function should be called while damon_lock is hold.
+ *
+ * Return: 0 on success, negative error code otherwise.
+ */
+static int __damon_start(struct damon_ctx *ctx)
+{
+ int err = -EBUSY;
+
+ mutex_lock(&ctx->kdamond_lock);
+ if (!ctx->kdamond) {
+ err = 0;
+ ctx->kdamond_stop = false;
+ ctx->kdamond = kthread_create(kdamond_fn, ctx, "kdamond.%d",
+ nr_running_ctxs);
+ if (IS_ERR(ctx->kdamond))
+ err = PTR_ERR(ctx->kdamond);
+ else
+ wake_up_process(ctx->kdamond);
+ }
+ mutex_unlock(&ctx->kdamond_lock);
+
+ return err;
+}
+
+/**
+ * damon_start() - Starts the monitorings for a given group of contexts.
+ * @ctxs: an array of the pointers for contexts to start monitoring
+ * @nr_ctxs: size of @ctxs
+ *
+ * This function starts a group of monitoring threads for a group of monitoring
+ * contexts. One thread per each context is created and run in parallel. The
+ * caller should handle synchronization between the threads by itself. If a
+ * group of threads that created by other 'damon_start()' call is currently
+ * running, this function does nothing but returns -EBUSY.
+ *
+ * Return: 0 on success, negative error code otherwise.
+ */
+int damon_start(struct damon_ctx **ctxs, int nr_ctxs)
+{
+ int i;
+ int err = 0;
+
+ mutex_lock(&damon_lock);
+ if (nr_running_ctxs) {
+ mutex_unlock(&damon_lock);
+ return -EBUSY;
+ }
+
+ for (i = 0; i < nr_ctxs; i++) {
+ err = __damon_start(ctxs[i]);
+ if (err)
+ break;
+ nr_running_ctxs++;
+ }
+ mutex_unlock(&damon_lock);
+
+ return err;
+}
+
+/*
+ * __damon_stop() - Stops monitoring of given context.
+ * @ctx: monitoring context
+ *
+ * Return: 0 on success, negative error code otherwise.
+ */
+static int __damon_stop(struct damon_ctx *ctx)
+{
+ mutex_lock(&ctx->kdamond_lock);
+ if (ctx->kdamond) {
+ ctx->kdamond_stop = true;
+ mutex_unlock(&ctx->kdamond_lock);
+ while (damon_kdamond_running(ctx))
+ usleep_range(ctx->sample_interval,
+ ctx->sample_interval * 2);
+ return 0;
+ }
+ mutex_unlock(&ctx->kdamond_lock);
+
+ return -EPERM;
+}
+
+/**
+ * damon_stop() - Stops the monitorings for a given group of contexts.
+ * @ctxs: an array of the pointers for contexts to stop monitoring
+ * @nr_ctxs: size of @ctxs
+ *
+ * Return: 0 on success, negative error code otherwise.
+ */
+int damon_stop(struct damon_ctx **ctxs, int nr_ctxs)
+{
+ int i, err = 0;
+
+ for (i = 0; i < nr_ctxs; i++) {
+ /* nr_running_ctxs is decremented in kdamond_fn */
+ err = __damon_stop(ctxs[i]);
+ if (err)
+ return err;
+ }
+
+ return err;
+}
+
+/*
+ * damon_check_reset_time_interval() - Check if a time interval is elapsed.
+ * @baseline: the time to check whether the interval has elapsed since
+ * @interval: the time interval (microseconds)
+ *
+ * See whether the given time interval has passed since the given baseline
+ * time. If so, it also updates the baseline to current time for next check.
+ *
+ * Return: true if the time interval has passed, or false otherwise.
+ */
+static bool damon_check_reset_time_interval(struct timespec64 *baseline,
+ unsigned long interval)
+{
+ struct timespec64 now;
+
+ ktime_get_coarse_ts64(&now);
+ if ((timespec64_to_ns(&now) - timespec64_to_ns(baseline)) <
+ interval * 1000)
+ return false;
+ *baseline = now;
+ return true;
+}
+
+/*
+ * Check whether it is time to flush the aggregated information
+ */
+static bool kdamond_aggregate_interval_passed(struct damon_ctx *ctx)
+{
+ return damon_check_reset_time_interval(&ctx->last_aggregation,
+ ctx->aggr_interval);
+}
+
+/*
+ * Check whether it is time to check and apply the target monitoring regions
+ *
+ * Returns true if it is.
+ */
+static bool kdamond_need_update_primitive(struct damon_ctx *ctx)
+{
+ return damon_check_reset_time_interval(&ctx->last_primitive_update,
+ ctx->primitive_update_interval);
+}
+
+/*
+ * Check whether current monitoring should be stopped
+ *
+ * The monitoring is stopped when either the user requested to stop, or all
+ * monitoring targets are invalid.
+ *
+ * Returns true if need to stop current monitoring.
+ */
+static bool kdamond_need_stop(struct damon_ctx *ctx)
+{
+ bool stop;
+
+ mutex_lock(&ctx->kdamond_lock);
+ stop = ctx->kdamond_stop;
+ mutex_unlock(&ctx->kdamond_lock);
+ if (stop)
+ return true;
+
+ if (!ctx->primitive.target_valid)
+ return false;
+
+ return !ctx->primitive.target_valid(ctx->target);
+}
+
+static void set_kdamond_stop(struct damon_ctx *ctx)
+{
+ mutex_lock(&ctx->kdamond_lock);
+ ctx->kdamond_stop = true;
+ mutex_unlock(&ctx->kdamond_lock);
+}
+
+/*
+ * The monitoring daemon that runs as a kernel thread
+ */
+static int kdamond_fn(void *data)
+{
+ struct damon_ctx *ctx = (struct damon_ctx *)data;
+
+ pr_info("kdamond (%d) starts\n", ctx->kdamond->pid);
+
+ if (ctx->primitive.init)
+ ctx->primitive.init(ctx);
+ if (ctx->callback.before_start && ctx->callback.before_start(ctx))
+ set_kdamond_stop(ctx);
+
+ while (!kdamond_need_stop(ctx)) {
+ if (ctx->primitive.prepare_access_checks)
+ ctx->primitive.prepare_access_checks(ctx);
+ if (ctx->callback.after_sampling &&
+ ctx->callback.after_sampling(ctx))
+ set_kdamond_stop(ctx);
+
+ usleep_range(ctx->sample_interval, ctx->sample_interval + 1);
+
+ if (ctx->primitive.check_accesses)
+ ctx->primitive.check_accesses(ctx);
+
+ if (kdamond_aggregate_interval_passed(ctx)) {
+ if (ctx->callback.after_aggregation &&
+ ctx->callback.after_aggregation(ctx))
+ set_kdamond_stop(ctx);
+ if (ctx->primitive.reset_aggregated)
+ ctx->primitive.reset_aggregated(ctx);
+ }
+
+ if (kdamond_need_update_primitive(ctx)) {
+ if (ctx->primitive.update)
+ ctx->primitive.update(ctx);
+ }
+ }
+
+ if (ctx->callback.before_terminate &&
+ ctx->callback.before_terminate(ctx))
+ set_kdamond_stop(ctx);
+ if (ctx->primitive.cleanup)
+ ctx->primitive.cleanup(ctx);
+
+ pr_debug("kdamond (%d) finishes\n", ctx->kdamond->pid);
+ mutex_lock(&ctx->kdamond_lock);
+ ctx->kdamond = NULL;
+ mutex_unlock(&ctx->kdamond_lock);
+
+ mutex_lock(&damon_lock);
+ nr_running_ctxs--;
+ mutex_unlock(&damon_lock);
+
+ do_exit(0);
+}