diff mbox series

[RFC,1/7] core/metricfs: Create metricfs, standardized files under debugfs.

Message ID 20200806001431.2072150-2-jwadams@google.com (mailing list archive)
State New, archived
Headers show
Series metricfs metric file system and examples | expand

Commit Message

Jonathan Adams Aug. 6, 2020, 12:14 a.m. UTC
From: Justin TerAvest <teravest@google.com>

Metricfs is a standardized set of files and directories under debugfs,
with a kernel API designed to be simpler than exporting new files under
sysfs. Type and field information is reported so that a userspace daemon
can easily process the information.

The statistics live under debugfs, in a tree rooted at:

	/sys/kernel/debug/metricfs

Each metric is a directory, with four files in it.  This patch includes
a single "metricfs_presence" metric, whose files look like:
/sys/kernel/debug/metricfs:
 metricfs_presence/annotations
  DESCRIPTION A\ basic\ presence\ metric.
 metricfs_presence/fields
  value
  int
 metricfs_presence/values
  1
 metricfs_presence/version
  1

Statistics can have zero, one, or two 'fields', which are keys for the
table of metric values.  With no fields, you have a simple statistic as
above, with one field you have a 1-dimensional table of string -> value,
and with two fields you have a 2-dimensional table of
{string, string} -> value.

When a statistic's 'values' file is opened, we pre-allocate a 64k buffer
and call the statistic's callback to fill it with data, truncating if
the buffer overflows.

Statistic creators can create a hierarchy for their statistics using
metricfs_create_subsys().

Signed-off-by: Justin TerAvest <teravest@google.com>
[jwadams@google.com: Forward ported to v5.8, cleaned up and modernized
	code significantly]
Signed-off-by: Jonathan Adams <jwadams@google.com>

---

notes:
* To go upstream, this will need documentation and a MAINTAINERS update.
* It's not clear what the "version" file is for; it's vestigial and
should probably be removed.

jwadams@google.com: Forward ported to v5.8, removed some google-isms and
    cleaned up some anachronisms (atomic->refcount, moving to
    kvmalloc(), using POISON_POINTER_DELTA, made more functions static,
    made 'emitter_fn' into an explicit union instead of a void *),
    renamed 'struct emitter -> metric_emitter' and renamed
    some funcs for consistency.
---
 include/linux/metricfs.h   | 103 ++++++
 kernel/Makefile            |   2 +
 kernel/metricfs.c          | 727 +++++++++++++++++++++++++++++++++++++
 kernel/metricfs_examples.c | 151 ++++++++
 lib/Kconfig.debug          |  18 +
 5 files changed, 1001 insertions(+)
 create mode 100644 include/linux/metricfs.h
 create mode 100644 kernel/metricfs.c
 create mode 100644 kernel/metricfs_examples.c
diff mbox series

Patch

diff --git a/include/linux/metricfs.h b/include/linux/metricfs.h
new file mode 100644
index 000000000000..65a1baa8e8c1
--- /dev/null
+++ b/include/linux/metricfs.h
@@ -0,0 +1,103 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _METRICFS_H_
+#define _METRICFS_H_
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/stringify.h>
+
+struct metric;
+struct metricfs_subsys;
+
+#define METRIC_EXPORT_GENERIC(name, desc, fname0, fname1, fn, is_str, cumulative) \
+static struct metric *metric_##name; \
+void metric_init_##name(struct metricfs_subsys *parent) \
+{ \
+	metric_##name = metric_register(__stringify(name), (parent), (desc), \
+					(fname0), (fname1), (fn), (is_str), \
+					(cumulative), THIS_MODULE); \
+} \
+void metric_exit_##name(void) \
+{ \
+	metric_unregister(metric_##name); \
+}
+
+/*
+ * Metricfs only deals with two types: int64_t and const char*.
+ *
+ * If a metric has fewer than two fields, pass NULL for the field name
+ * arguments.
+ *
+ * The metric does not take ownership of any of the strings passed in.
+ *
+ * See kernel/metricfs_examples.c for a set of example metrics, with
+ * corresponding output.
+ *
+ * METRIC_EXPORT_INT - An integer-valued metric.
+ * METRIC_EXPORT_COUNTER - An integer-valued cumulative metric.
+ * METRIC_EXPORT_STR - A string-valued metric.
+ */
+#define METRIC_EXPORT_INT(name, desc, fname0, fname1, fn) \
+	METRIC_EXPORT_GENERIC(name, (desc), (fname0), (fname1), (fn), \
+				false, false)
+#define METRIC_EXPORT_COUNTER(name, desc, fname0, fname1, fn) \
+	METRIC_EXPORT_GENERIC(name, (desc), (fname0), (fname1), (fn), \
+				false, true)
+#define METRIC_EXPORT_STR(name, desc, fname0, fname1, fn) \
+	METRIC_EXPORT_GENERIC(name, (desc), (fname0), (fname1), (fn), \
+				true, false)
+
+/* Subsystem support. */
+/* Pass NULL as 'parent' to create a new top-level subsystem. */
+struct metricfs_subsys *metricfs_create_subsys(const char *name,
+						struct metricfs_subsys *parent);
+void metricfs_destroy_subsys(struct metricfs_subsys *d);
+
+/*
+ * An opaque struct that metric emit functions use to keep our internal
+ * state.
+ */
+struct metric_emitter;
+
+/* The number of non-NULL arguments passed to EMIT macros must match the number
+ * of arguments passed to the EXPORT macro for a given metric.
+ *
+ * Failure to do so will cause data to be mangled (or dropped) by userspace or
+ * Monarch.
+ */
+#define METRIC_EMIT_INT(e, v, f0, f1) \
+	metric_emit_int_value((e), (v), (f0), (f1))
+#define METRIC_EMIT_STR(e, v, f0, f1) \
+	metric_emit_str_value((e), (v), (f0), (f1))
+
+/* Users don't have to call any functions below;
+ * use the macro definitions above instead.
+ */
+void metric_emit_int_value(struct metric_emitter *e,
+			   int64_t v, const char *f0, const char *f1);
+void metric_emit_str_value(struct metric_emitter *e,
+			   const char *v, const char *f0, const char *f1);
+
+struct metric *metric_register(const char *name,
+			       struct metricfs_subsys *parent,
+			       const char *description,
+			       const char *fname0, const char *fname1,
+			       void (*fn)(struct metric_emitter *e),
+			       bool is_string,
+			       bool is_cumulative,
+			       struct module *owner);
+
+struct metric *metric_register_parm(const char *name,
+				    struct metricfs_subsys *parent,
+			  const char *description,
+				    const char *fname0, const char *fname1,
+				    void (*fn)(struct metric_emitter *e,
+					       void *parm),
+				    void *parm,
+				    bool is_string,
+				    bool is_cumulative,
+				    struct module *owner);
+
+void metric_unregister(struct metric *m);
+
+#endif /* _METRICFS_H_ */
diff --git a/kernel/Makefile b/kernel/Makefile
index f3218bc5ec69..0edf790935b0 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -109,6 +109,8 @@  obj-$(CONFIG_CPU_PM) += cpu_pm.o
 obj-$(CONFIG_BPF) += bpf/
 obj-$(CONFIG_KCSAN) += kcsan/
 obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o
+obj-$(CONFIG_METRICFS) += metricfs.o
+obj-$(CONFIG_METRICFS_EXAMPLES) += metricfs_examples.o
 
 obj-$(CONFIG_PERF_EVENTS) += events/
 
diff --git a/kernel/metricfs.c b/kernel/metricfs.c
new file mode 100644
index 000000000000..676b7b04aa2b
--- /dev/null
+++ b/kernel/metricfs.c
@@ -0,0 +1,727 @@ 
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/refcount.h>
+#include <linux/dcache.h>
+#include <linux/debugfs.h>
+#include <linux/init.h>
+#include <linux/kref.h>
+#include <linux/metricfs.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+
+/*
+ * Metricfs: A mechanism for exporting metrics from the kernel.
+ *
+ * Kernel code must provide:
+ *   - A description of the metric
+ *   - The subsystem for the metric (NULL is ok)
+ *   - Type information about the metric, and
+ *   - A callback function which supplies metric values.
+ *
+ * In return, metricfs provides files in debugfs at:
+ *   /sys/kernel/debug/metricfs/<subsys>/<metric_name>/
+ * The files are:
+ *   - annotations, which provides streamz "annotations"-- the description, and
+ *                  other metadata (e.g. if it's constant, deprecated, etc.)
+ *   - fields, which provides type information about the metric and its fields.
+ *   - values, which contains the actual metric value data.
+ *   - version, which is kept around for future-proofing.
+ *
+ * Metrics only support a limited subset of types-- for fields, they only
+ * support strings, integers, and boolean types. For simplicity, we only support
+ * strings and integers and strictly control how the data is formatted when
+ * displayed from debugfs.
+ *
+ * See kernel/metricfs_examples.c for example code.
+ *
+ * Limitations:
+ *   - "values" files are at MOST 64K. We truncate the file at that point.
+ *   - The list of fields and types is at most 1K.
+ *   - Metrics may have at most 2 fields.
+ *
+ * Best Practices:
+ *   - Emit the most important data first! Once the 64K per-metric buffer
+ *     is full, the emit* functions won't do anything.
+ *   - In userspace, open(), read(), and close() the file quickly! The kernel
+ *     allocation for the metric is alive as long as the file is open. This
+ *     permits users to seek around the contents of the file, while permitting
+ *     an atomic view of the data.
+ *
+ * FAQ:
+ *   - Why is memory allocated for file data at open()?
+ *     Snapshots of data provided by the kernel should be as "atomic" as
+ *     possible. If userspace code performs read()s smaller than the total
+ *     amount of data, we'd like for that tool to still work, while providing a
+ *     consistent view of the file.
+ *
+ * Questions:
+ *   - Would it be simpler if we escaped spaces instead of wrapping strings in
+ *     quotes?
+ */
+struct metric {
+	const char *name;
+	const char *description;
+
+	/* Metric field names (optional, NULL if unused) */
+	const char *fname0;
+	const char *fname1;
+
+	union {
+		void (*emit_noparm)(struct metric_emitter *e); /* !has_parm */
+		void (*emit_parm)(struct metric_emitter *e,
+				  void *parm); /* has_parm */
+	} emit_fn;
+	void *eparm;
+	bool is_string;
+	bool is_cumulative;
+	bool has_parm;
+
+	/* dentry for the directory that contains the metric */
+	struct dentry *dentry;
+
+	struct module *owner;
+
+	refcount_t refcnt;
+
+	/* Inodes that have references to our metric, protected under
+	 * big_mutex.
+	 */
+	struct inode *inodes[4];
+};
+
+/* Returns true if the refcount was successfully incremented for the metric */
+static int metric_module_get(struct metric *m)
+{
+	if (!try_module_get(m->owner))
+		return 0;
+
+	if (!refcount_inc_not_zero(&m->refcnt)) {
+		module_put(m->owner);
+		return 0;
+	}
+
+	return 1;
+}
+
+/* Returns true if the last reference was put. */
+static bool metric_put(struct metric *m)
+{
+	bool rc = refcount_dec_and_test(&m->refcnt);
+
+	if (rc)
+		kfree(m);
+	return rc;
+}
+
+static void metric_module_put(struct metric *m)
+{
+	struct module *owner = m->owner;
+
+	metric_put(m);
+	module_put(owner);
+}
+
+struct metric_emitter {
+	char *buf;
+	char *orig_buf;  /* To calculate total written. */
+	int size;  /* Size of underlying buffer. */
+	struct metric *metric;  /* For type checking. */
+};
+
+#define METRICFS_ANNOTATIONS_BUF_SIZE (1 * 1024)
+#define METRICFS_FIELDS_BUF_SIZE (1 * 1024)
+#define METRICFS_VALUES_BUF_SIZE (64 * 1024)
+#define METRICFS_VERSION_BUF_SIZE (8)
+
+/* Maximum length for fields. They're truncated at this point. */
+#define METRICFS_MAX_FIELD_LEN (100)
+
+static int emit_bytes_left(const struct metric_emitter *e)
+{
+	WARN_ON(e->orig_buf > e->buf);
+	return e->size - (e->buf - e->orig_buf);
+}
+
+struct char_tracker {
+	char *dest;
+	int size;
+	int pos;
+};
+
+static void add_char(struct char_tracker *t, char c)
+{
+	if (t->pos < t->size)
+		t->dest[t->pos] = c;
+	/* Increment pos even if we don't print, so we know how many
+	 * characters we'd print if we had room.
+	 */
+	t->pos++;
+}
+
+/* Escape backslashes, spaces, and newlines in string "s",
+ * copying to "dest", to a maximum of "size" characters.
+ *
+ * examples:
+ *  [Hi\ , "there"] -> [Hi\\\ ,\ "there"]
+ *  [foo
+ *   bar] - > [foo\nbar]
+ *
+ * Returns the number of characters that would be copied, if enough space
+ * was available. Doesn't emit a trailing zero.
+ */
+static int escape_string(char *dest, const char *s, int size)
+{
+	struct char_tracker tracker = {
+		.dest = dest,
+		.size = size,
+		.pos = 0,
+	};
+
+	/* We have to process the entire source string to ensure that
+	 * we return a useful value for the total possible emitted length.
+	 */
+	while (*s != 0) {
+		/* escape newlines */
+		if (*s == '\n') {
+			add_char(&tracker, '\\');
+			add_char(&tracker, 'n');
+			s++;
+			continue;
+		}
+
+		/* escape spaces and backslashes. */
+		if (*s == ' ' || *s == '\\')
+			add_char(&tracker, '\\');
+		add_char(&tracker, *s);
+		s++;
+	}
+
+	return tracker.pos;
+}
+
+/* Emits a string into the emitter buffer, no escaping */
+static bool emit_string(struct metric_emitter *e, const char *s)
+{
+	int bytes_left = emit_bytes_left(e);
+	int rc = snprintf(e->buf, bytes_left, "%s", s);
+
+	e->buf += min(rc, bytes_left);
+	return rc < bytes_left;
+}
+
+/* Emits a string into the emitter buffer, escaping quotes and newlines. */
+static bool emit_quoted_string(struct metric_emitter *e, const char *s)
+{
+	int bytes_left = emit_bytes_left(e);
+	int rc = escape_string(e->buf, s, bytes_left);
+
+	e->buf += min(rc, bytes_left);
+	return rc < bytes_left;
+}
+
+/* Emits an int into the emitter buffer */
+static bool emit_int(struct metric_emitter *e, int64_t i)
+{
+	int bytes_left = emit_bytes_left(e);
+	int rc = snprintf(e->buf, bytes_left, "%lld", i);
+
+	e->buf += min(rc, bytes_left);
+	return rc < bytes_left;
+}
+
+static void check_field_mismatch(struct metric *m, const char *f0,
+				 const char *f1)
+{
+	WARN_ON(m->fname0 && !f0);
+	WARN_ON(!m->fname0 && f0);
+	WARN_ON(m->fname1 && !f1);
+	WARN_ON(!m->fname1 && f1);
+}
+
+void metric_emit_int_value(struct metric_emitter *e, int64_t v,
+			   const char *f0, const char *f1)
+{
+	char *ckpt = e->buf;
+	bool ok = true;
+
+	WARN_ON_ONCE(e->metric->is_string);
+	check_field_mismatch(e->metric, f0, f1);
+	if (f0) {
+		ok &= emit_quoted_string(e, f0);
+		ok &= emit_string(e, " ");
+		if (f1) {
+			ok &= emit_quoted_string(e, f1);
+			ok &= emit_string(e, " ");
+		}
+	}
+	ok &= emit_int(e, v);
+	ok &= emit_string(e, "\n");
+	if (!ok)
+		e->buf = ckpt;
+}
+EXPORT_SYMBOL(metric_emit_int_value);
+
+void metric_emit_str_value(struct metric_emitter *e, const char *v,
+			   const char *f0, const char *f1)
+{
+	char *ckpt = e->buf;
+	bool ok = true;
+
+	WARN_ON_ONCE(!e->metric->is_string);
+	check_field_mismatch(e->metric, f0, f1);
+	if (f0) {
+		ok &= emit_quoted_string(e, f0);
+		ok &= emit_string(e, " ");
+		if (f1) {
+			ok &= emit_quoted_string(e, f1);
+			ok &= emit_string(e, " ");
+		}
+	}
+	ok &= emit_quoted_string(e, v);
+	ok &= emit_string(e, "\n");
+	if (!ok)
+		e->buf = ckpt;
+}
+EXPORT_SYMBOL(metric_emit_str_value);
+
+/* Contains file data generated at open() */
+struct metricfs_file_private {
+	size_t bytes_written;
+	char buf[0];
+};
+
+/* A mutex to prevent races involving the pointer to the inode stored in
+ * inode->i_private. We'll remove this if we can get a callback at inode
+ * deletion in debugfs.
+ */
+static DEFINE_MUTEX(big_mutex);
+
+/* Returns 1 on success, <0 otherwise. */
+static int metric_open_helper(struct inode *inode, struct file *filp,
+			      int buf_size,
+			      struct metric **m,
+			      struct metricfs_file_private **p)
+{
+	int size;
+
+	mutex_lock(&big_mutex);
+	/* Debugfs stores the "data" parameter from debugfs_create_file in
+	 * inode->i_private.
+	 */
+	*m = (struct metric *)inode->i_private;
+	if (!(*m) || !metric_module_get(*m)) {
+		mutex_unlock(&big_mutex);
+		return -ENXIO;
+	}
+	mutex_unlock(&big_mutex);
+
+	size = sizeof(struct metricfs_file_private) + buf_size;
+	*p = kvmalloc(size, GFP_KERNEL);
+	if (!*p) {
+		metric_module_put(*m);
+		return -ENOMEM;
+	}
+	filp->private_data = *p;
+	return 1;
+}
+
+static int metricfs_generic_release(struct inode *inode, struct file *filp)
+{
+	struct metricfs_file_private *p =
+			(struct metricfs_file_private *)filp->private_data;
+	kvfree(p);
+
+	filp->private_data = (void *)(0xDEADBEEFul + POISON_POINTER_DELTA);
+	/* FIXME here too? */
+	metric_module_put((struct metric *)inode->i_private);
+	return 0;
+}
+
+static int metricfs_annotations_open(struct inode *inode, struct file *filp)
+{
+	struct metric_emitter e;
+	struct metric *m;
+	struct metricfs_file_private *p;
+	bool ok = true;
+
+	int rc = metric_open_helper(inode, filp, METRICFS_ANNOTATIONS_BUF_SIZE,
+				    &m, &p);
+	if (rc < 0)
+		return rc;
+
+	e.buf = p->buf;
+	e.orig_buf = p->buf;
+	e.size = METRICFS_ANNOTATIONS_BUF_SIZE;
+	ok &= emit_string(&e, "DESCRIPTION ");
+	ok &= emit_quoted_string(&e, m->description);
+	ok &= emit_string(&e, "\n");
+	if (m->is_cumulative)
+		ok &= emit_string(&e, "CUMULATIVE\n");
+
+	/* Emit all or nothing. */
+	if (ok) {
+		p->bytes_written = e.buf - e.orig_buf;
+	} else {
+		metricfs_generic_release(inode, filp);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static int metricfs_fields_open(struct inode *inode, struct file *filp)
+{
+	struct metric_emitter e;
+	struct metric *m;
+	struct metricfs_file_private *p;
+	bool ok = true;
+
+	int rc = metric_open_helper(inode, filp, METRICFS_FIELDS_BUF_SIZE,
+				    &m, &p);
+	if (rc < 0)
+		return rc;
+
+	e.buf = p->buf;
+	e.orig_buf = p->buf;
+	e.size = METRICFS_FIELDS_BUF_SIZE;
+	e.metric = m;
+
+	/* We don't have to do string escaping on fields, as quotes aren't
+	 * permitted in field names.
+	 */
+	if (m->fname0) {
+		ok &= emit_string(&e, m->fname0);
+		ok &= emit_string(&e, " ");
+	}
+	if (m->fname1) {
+		ok &= emit_string(&e, m->fname1);
+		ok &= emit_string(&e, " ");
+	}
+	ok &= emit_string(&e, "value\n");
+
+	if (m->fname0)
+		ok &= emit_string(&e, "str ");
+	if (m->fname1)
+		ok &= emit_string(&e, "str ");
+	ok &= emit_string(&e, (m->is_string) ? "str\n" : "int\n");
+
+	/* Emit all or nothing. */
+	if (ok) {
+		p->bytes_written = e.buf - e.orig_buf;
+	} else {
+		metricfs_generic_release(inode, filp);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static int metricfs_version_open(struct inode *inode, struct file *filp)
+{
+	struct metric *m;
+	struct metricfs_file_private *p;
+	int rc = metric_open_helper(inode, filp, METRICFS_VERSION_BUF_SIZE,
+				    &m, &p);
+	if (rc < 0)
+		return rc;
+
+	p->bytes_written = snprintf(p->buf, METRICFS_VERSION_BUF_SIZE,
+				    "1\n");
+
+	if (p->bytes_written >= METRICFS_VERSION_BUF_SIZE) {
+		metricfs_generic_release(inode, filp);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static int metricfs_values_open(struct inode *inode, struct file *filp)
+{
+	struct metric_emitter e;
+
+	struct metric *m;
+	struct metricfs_file_private *p;
+	int rc = metric_open_helper(inode, filp, METRICFS_VALUES_BUF_SIZE,
+				    &m, &p);
+	if (rc < 0)
+		return rc;
+
+	e.buf = p->buf;
+	e.orig_buf = p->buf;
+	e.size = METRICFS_VALUES_BUF_SIZE;
+	e.metric = m;
+
+	if (m->has_parm) {
+		if (m->emit_fn.emit_parm)
+			(m->emit_fn.emit_parm)(&e, m->eparm);
+	} else {
+		if (m->emit_fn.emit_noparm)
+			(m->emit_fn.emit_noparm)(&e);
+	}
+	p->bytes_written = e.buf - e.orig_buf;
+	return 0;
+}
+
+static ssize_t metricfs_generic_read(struct file *filp, char __user *ubuf,
+				     size_t cnt, loff_t *ppos)
+{
+	struct metricfs_file_private *p =
+			(struct metricfs_file_private *)filp->private_data;
+	return simple_read_from_buffer(ubuf, cnt, ppos, p->buf,
+					p->bytes_written);
+}
+
+static const struct file_operations metricfs_annotations_ops = {
+	.open = metricfs_annotations_open,
+	.read = metricfs_generic_read,
+	.release = metricfs_generic_release,
+};
+
+static const struct file_operations metricfs_fields_ops = {
+	.open = metricfs_fields_open,
+	.read = metricfs_generic_read,
+	.release = metricfs_generic_release,
+};
+
+static const struct file_operations metricfs_values_ops = {
+	.open = metricfs_values_open,
+	.read = metricfs_generic_read,
+	.release = metricfs_generic_release,
+};
+
+static const struct file_operations metricfs_version_ops = {
+	.open = metricfs_version_open,
+	.read = metricfs_generic_read,
+	.release = metricfs_generic_release,
+};
+
+static struct dentry *d_metricfs;
+
+static struct dentry *metricfs_init_dentry(void)
+{
+	static int once;
+
+	if (d_metricfs)
+		return d_metricfs;
+
+	if (!debugfs_initialized())
+		return NULL;
+
+	d_metricfs = debugfs_create_dir("metricfs", NULL);
+
+	if (!d_metricfs && !once) {
+		once = 1;
+		pr_warn("Could not create debugfs directory 'metricfs'\n");
+		return NULL;
+	}
+
+	return d_metricfs;
+}
+
+/* We always cast in and out to struct dentry. */
+struct metricfs_subsys {
+	struct dentry dentry;
+};
+
+static struct dentry *metricfs_create_file(const char *name,
+					   mode_t mode,
+					   struct dentry *parent,
+					   void *data,
+					   const struct file_operations *fops)
+{
+	struct dentry *ret;
+
+	ret = debugfs_create_file(name, mode, parent, data, fops);
+	if (!ret)
+		pr_warn("Could not create debugfs '%s' entry\n", name);
+
+	return ret;
+}
+
+static struct dentry *metricfs_create_dir(const char *name,
+					  struct metricfs_subsys *s)
+{
+	struct dentry *d;
+
+	if (!s)
+		d = d_metricfs;
+	else
+		d = &s->dentry;
+
+	if (!d) {
+		pr_warn("Couldn't create %s, subsys doesn't exist.", name);
+		return NULL;
+	}
+	return debugfs_create_dir(name, d);
+}
+
+static int metricfs_initialized;
+
+struct metric *metric_register(const char *name,
+				struct metricfs_subsys *parent,
+				const char *description,
+				const char *fname0,
+				const char *fname1,
+				void (*fn)(struct metric_emitter *e),
+				bool is_string,
+				bool is_cumulative,
+				struct module *owner)
+{
+	struct metric *m;
+	struct dentry *d, *t;
+
+	if (!metricfs_initialized) {
+		pr_warn("Could not create metric before initing metricfs\n");
+		return NULL;
+	}
+
+	m = kzalloc(sizeof(*m), GFP_KERNEL);
+	if (!m)
+		return NULL;
+
+	d = metricfs_create_dir(name, parent);
+	if (!d) {
+		pr_warn("Could not create dir '%s' in metricfs.\n", name);
+		kfree(m);
+		return NULL;
+	}
+
+	m->description = description;
+	m->fname0 = fname0;
+	m->fname1 = fname1;
+	m->has_parm = false;
+	m->emit_fn.emit_noparm = fn;
+	m->eparm = NULL;
+	m->is_string = is_string;
+	m->is_cumulative = is_cumulative;
+	refcount_set(&m->refcnt, 1);
+	m->owner = owner;
+	m->dentry = d;
+
+
+	mutex_lock(&big_mutex);
+	t = metricfs_create_file("annotations", 0444, d, m,
+					&metricfs_annotations_ops);
+	if (!t)
+		goto done;
+	m->inodes[0] = t->d_inode;
+
+	t = metricfs_create_file("fields", 0444, d, m,
+					&metricfs_fields_ops);
+	if (!t)
+		goto done;
+	m->inodes[1] = t->d_inode;
+
+	t = metricfs_create_file("values", 0444, d, m,
+					&metricfs_values_ops);
+	if (!t)
+		goto done;
+	m->inodes[2] = t->d_inode;
+
+	t = metricfs_create_file("version", 0444, d, m,
+					&metricfs_version_ops);
+	if (!t)
+		goto done;
+	m->inodes[3] = t->d_inode;
+
+done:
+	/* Unregister the metric before anyone calls open() if we had any
+	 * errors on file creation.
+	 */
+	if (!t) {
+		metric_unregister(m);
+		m = NULL;
+	}
+	mutex_unlock(&big_mutex);
+
+	return m;
+}
+EXPORT_SYMBOL(metric_register);
+
+struct metric *metric_register_parm(const char *name,
+				    struct metricfs_subsys *parent,
+				    const char *description,
+				    const char *fname0,
+				    const char *fname1,
+				    void (*fn)(struct metric_emitter *e,
+					       void *parm),
+				    void *eparm,
+				    bool is_string,
+				    bool is_cumulative,
+				    struct module *owner)
+{
+	struct metric *metric =
+		metric_register(name, parent, description,
+				fname0, fname1,
+				(void (*)(struct metric_emitter *))NULL,
+				is_string,
+				is_cumulative, owner);
+	if (metric) {
+		metric->has_parm = true;
+		metric->emit_fn.emit_parm = fn;
+		metric->eparm = eparm;
+	}
+	return metric;
+}
+EXPORT_SYMBOL(metric_register_parm);
+
+void metric_unregister(struct metric *m)
+{
+	/* We have to NULL out the i_private pointers here so that no other
+	 * callers come into open, getting a pointer to the metric that we
+	 * freed.
+	 */
+	mutex_lock(&big_mutex);
+	m->inodes[0]->i_private = NULL;
+	m->inodes[1]->i_private = NULL;
+	m->inodes[2]->i_private = NULL;
+	m->inodes[3]->i_private = NULL;
+	mutex_unlock(&big_mutex);
+
+	debugfs_remove_recursive(m->dentry);
+	metric_put(m);
+}
+EXPORT_SYMBOL(metric_unregister);
+
+struct metricfs_subsys *metricfs_create_subsys(const char *name,
+					       struct metricfs_subsys *parent)
+{
+	struct dentry *d = metricfs_create_dir(name, parent);
+
+	return container_of(d, struct metricfs_subsys, dentry);
+}
+EXPORT_SYMBOL(metricfs_create_subsys);
+
+void metricfs_destroy_subsys(struct metricfs_subsys *s)
+{
+	if (s)
+		debugfs_remove(&s->dentry);
+}
+EXPORT_SYMBOL(metricfs_destroy_subsys);
+
+static void metricfs_presence_fn(struct metric_emitter *e)
+{
+	METRIC_EMIT_INT(e, 1, NULL, NULL);
+}
+METRIC_EXPORT_INT(metricfs_presence, "A basic presence metric.",
+			NULL, NULL, metricfs_presence_fn);
+
+static int __init metricfs_init(void)
+{
+	if (!metricfs_init_dentry())
+		return -ENOMEM;
+	metricfs_initialized = 1;
+
+	/* Create a basic "presence" metric. */
+	metric_init_metricfs_presence(NULL);
+
+	mutex_init(&big_mutex);
+	return 0;
+}
+
+/*
+ * Debugfs should be fine by the time we're at fs_initcall.
+ */
+fs_initcall(metricfs_init);
diff --git a/kernel/metricfs_examples.c b/kernel/metricfs_examples.c
new file mode 100644
index 000000000000..50d891176728
--- /dev/null
+++ b/kernel/metricfs_examples.c
@@ -0,0 +1,151 @@ 
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/init.h>
+#include <linux/metricfs.h>
+#include <linux/module.h>
+
+/* A metric to force truncation of the values file. "values" files in
+ * metricfs can be at most 64K in size. It truncates to the last record
+ * that fits entirely in the output file.
+ *
+ * Creates a metric with a values file that looks like:
+ * val"0" 0
+ * val"1" 1
+ * val"2" 2
+ * ...
+ * "val"3565" 3565
+ */
+static void more_than_64k_fn(struct metric_emitter *e)
+{
+	char buf[80];
+	int i;
+
+	for (i = 0; i < 10000; i++) {
+		sprintf(buf, "val\"%d\"", i);
+		/* Argument order is (emitter, value, field0, field1...) */
+		METRIC_EMIT_INT(e, i, buf, NULL);
+	}
+}
+METRIC_EXPORT_INT(more_than_64k, "Stress test metric.",
+			"v", NULL, more_than_64k_fn);
+
+
+/* A metric with two string fields and int64 values.
+ *
+ * # cat /sys/kernel/debug/metricfs/two_string_fields/annotations
+ * DESCRIPTION "Two fields example."
+ * # cat /sys/kernel/debug/metricfs/two_string_fields/fields
+ * disk cgroup value
+ * str str int
+ * # cat /sys/kernel/debug/metricfs/two_string_fields/values
+ * sda /map_reduce1 0
+ * sda /sys 50
+ * sdb /map_reduce2 12
+ */
+static void two_string_fields_fn(struct metric_emitter *e)
+{
+#define NR_ENTRIES 3
+	const char *disk[NR_ENTRIES] = {"sda", "sda", "sdb"};
+	const char *cgroups[NR_ENTRIES] = {
+				"/map_reduce1", "/sys", "/map_reduce2"};
+	const int64_t counters[NR_ENTRIES] = {0, 50, 12};
+	int i;
+
+	for (i = 0; i < NR_ENTRIES; i++) {
+		METRIC_EMIT_INT(e,
+				counters[i], disk[i], cgroups[i]);
+	}
+}
+#undef NR_ENTRIES
+METRIC_EXPORT_INT(two_string_fields, "Two fields example.",
+			"disk", "cgroup", two_string_fields_fn);
+
+
+/* A metric with zero fields and a string value.
+ *
+ * # cat /sys/kernel/debug/metricfs/string_valued_metric/annotations
+ * DESCRIPTION "String metric."
+ * # cat /sys/kernel/debug/metricfs/string_valued_metric/fields
+ * value
+ * str
+ * # cat /sys/kernel/debug/metricfs/string_valued_metric/values
+ * Test\ninfo.
+ */
+static void string_valued_metric_fn(struct metric_emitter *e)
+{
+	METRIC_EMIT_STR(e, "Test\ninfo.", NULL, NULL);
+}
+METRIC_EXPORT_STR(string_valued_metric, "String metric.",
+			NULL, NULL, string_valued_metric_fn);
+
+/* Test metric to ensure we behave properly with a large annotation string. */
+static void huge_annotation_fn(struct metric_emitter *e)
+{
+	METRIC_EMIT_STR(e, "test\n", NULL, NULL);
+}
+static const char *huge_annotation_s =
+	"1231231231231231231231231231231241241212895781930750981347503485"
+	"7029348750923847502384750923847590234857902348759023475028934751"
+	"1111111111111112312312312312312312312312312312412412128957819307"
+	"5098134750348570293487509238475023847509238475902348579023487590"
+	"2347502893475 23123123123123123123123123123124124121289578193075"
+	"0981347503485702934875092384750238475092384759023485790234875902"
+	"347502893475 231231231231231231231231231231241241212895781930750"
+	"9813475034857029348750923847502384750923847590234857902348759023"
+	"47502893475 2312312312312312312312312312312412412128957819307509"
+	"8134750348570293487509238475023847509238475902348579023487590234"
+	"7502893475 23123123123123123123123123123124124121289578193075098"
+	"1347503485702934875092384750238475092384759023485790234875902347"
+	"502893475 231231231231231231231231231231241241212895781930750981"
+	"3475034857029348750923847502384750923847590234857902348759023475"
+	"02893475 2312312312312312312312312312312412412128957819307509813"
+	"4750348570293487509238475023847509238475902348579023487590234750"
+	"2893475 23123123123123123123123123123124124121289578193075098134"
+	"7503485702934875092384750238475092384759023485790234875902347502"
+	"893475 231231231231231231231231231231241241212895781930750981347"
+	"5034857029348750923847502384750923847590234857902348759023475028"
+	"93475 2312312312312312312312312312312412412128957819307509813475"
+	"0348570293487509238475023847509238475902348579023487590234750289"
+	"3475 23123123123123123123123123123124124121289578193075098134750"
+	"3485702934875092384750238475092384759023485790234875902347502893"
+	"475 231231231231231231231231231231241241212895781930750981347503"
+	"4857029348750923847502384750923847590234857902348759023475028934"
+	"75 2312312312312312312312312312312412412128957819307509813475034"
+	"8570293487509238475023847509238475902348579023487590234750289347"
+	"5 23123123123123123123123123123124124121289578193075098134750348"
+	"5702934875092384750238475092384759023485790234875902347502893475"
+	" 231231231231231231231231231231241241212895781930750981347503485"
+	"702934875092384750238475092384759023485790234875902347502893475 "
+	"2312312312312312312312312312312412412128957819307509813475034857"
+	"02934875092384750238475092384759023485790234875902347502893475";
+
+METRIC_EXPORT_STR(huge_annotation, huge_annotation_s, NULL, NULL,
+			huge_annotation_fn);
+
+
+struct metricfs_subsys *examples_subsys;
+
+static int __init metricfs_examples_init(void)
+{
+	examples_subsys = metricfs_create_subsys("examples", NULL);
+	metric_init_more_than_64k(examples_subsys);
+	metric_init_two_string_fields(examples_subsys);
+	metric_init_string_valued_metric(examples_subsys);
+	metric_init_huge_annotation(examples_subsys);
+
+	return 0;
+}
+
+static void __exit metricfs_examples_exit(void)
+{
+	metric_exit_more_than_64k();
+	metric_exit_two_string_fields();
+	metric_exit_string_valued_metric();
+	metric_exit_huge_annotation();
+
+	metricfs_destroy_subsys(examples_subsys);
+}
+
+module_init(metricfs_examples_init);
+module_exit(metricfs_examples_exit);
+
+MODULE_LICENSE("GPL");
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 9ad9210d70a1..8de0244e7804 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -325,6 +325,24 @@  config READABLE_ASM
 	  to keep kernel developers who have to stare a lot at assembler listings
 	  sane.
 
+config METRICFS
+	bool "Metricfs for sysmon"
+	depends on DEBUG_FS
+	help
+	  metricfs is a library for creating rigidly-formatted files in debugfs
+	  which can be automatically monitored by user-space telemetry.  The
+	  hierarchy is rooted at /sys/kernel/debug/metricfs, and each metric
+	  contains metadata about the metric and types involved, as well as a
+	  tabular values file with the metrics themselves.
+
+config METRICFS_EXAMPLES
+	tristate "Metricfs examples"
+	depends on METRICFS
+	help
+	  example tests and metrics for metricfs.  With this, a set of metrics
+	  appear under "examples", covering various corner cases of the metricfs
+	  interface.  These can be used to test the metricfs functionality.
+
 config HEADERS_INSTALL
 	bool "Install uapi headers to usr/include"
 	depends on !UML