diff mbox

[v3,1/6] cgroup: Allow registration and lookup of cgroup private data

Message ID 20180306234700.6562-2-matthew.d.roper@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Matt Roper March 6, 2018, 11:46 p.m. UTC
There are cases where other parts of the kernel may wish to store data
associated with individual cgroups without building a full cgroup
controller.  Let's add interfaces to allow them to register and look up
this private data for individual cgroups.

A kernel system (e.g., a driver) that wishes to register private data
for a cgroup will do so by subclassing the 'struct cgroup_priv'
structure to describe the necessary data to store.  Before registering a
private data structure to a cgroup, the caller should fill in the 'key'
and 'free' fields of the base cgroup_priv structure.

 * 'key' should be a unique void* that will act as a key for future
   privdata lookups/removals.  Note that this allows drivers to store
   per-device private data for a cgroup by using a device pointer as a key.

 * 'free' should be a function pointer to a function that may be used
   to destroy the private data.  This function will be called
   automatically if the underlying cgroup is destroyed.

Cc: Tejun Heo <tj@kernel.org>
Cc: cgroups@vger.kernel.org
Signed-off-by: Matt Roper <matthew.d.roper@intel.com>
---
 include/linux/cgroup-defs.h | 38 ++++++++++++++++++++++
 include/linux/cgroup.h      | 78 +++++++++++++++++++++++++++++++++++++++++++++
 kernel/cgroup/cgroup.c      | 14 ++++++++
 3 files changed, 130 insertions(+)

Comments

Tejun Heo March 13, 2018, 8:50 p.m. UTC | #1
Hello, Matt.

cc'ing Roman and Alexei.

On Tue, Mar 06, 2018 at 03:46:55PM -0800, Matt Roper wrote:
> There are cases where other parts of the kernel may wish to store data
> associated with individual cgroups without building a full cgroup
> controller.  Let's add interfaces to allow them to register and lookup
> this private data for individual cgroups.
> 
> A kernel system (e.g., a driver) that wishes to register private data
> for a cgroup will do so by subclassing the 'struct cgroup_priv'
> structure to describe the necessary data to store.  Before registering a
> private data structure to a cgroup, the caller should fill in the 'key'
> and 'free' fields of the base cgroup_priv structure.
> 
>  * 'key' should be a unique void* that will act as a key for future
>    privdata lookups/removals.  Note that this allows drivers to store
>    per-device private data for a cgroup by using a device pointer as a key.
> 
>  * 'free' should be a function pointer to a function that may be used
>    to destroy the private data.  This function will be called
>    automatically if the underlying cgroup is destroyed.

This feature turned out to have more users than I originally
anticipated and bpf also wants something like this to track network
states.  The requirements are pretty similar but not quite the same.
The extra requirements are...

* Lookup must be really cheap.  Using a pointer hash or walking a list
  probably isn't great, so maybe an idr-based lookup plus an RCU-protected
  index table per cgroup?

* It should support both regular memory and percpu pointers.  Given
  that what cgroup does is pretty much cgroup:key -> pointer lookup,
  it's mostly about getting the interface right so that it's not too
  error-prone.

Sorry about moving the goalpost.

Thanks.
diff mbox

Patch

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 9f242b876fde..17c679a7b5de 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -8,6 +8,7 @@ 
 #ifndef _LINUX_CGROUP_DEFS_H
 #define _LINUX_CGROUP_DEFS_H
 
+#include <linux/hashtable.h>
 #include <linux/limits.h>
 #include <linux/list.h>
 #include <linux/idr.h>
@@ -307,6 +308,36 @@  struct cgroup_stat {
 	struct prev_cputime prev_cputime;
 };
 
+/*
+ * Private data associated with a cgroup by an independent (non-controller) part
+ * of the kernel.  This is useful for things like drivers that may wish to track
+ * their own cgroup-specific data.
+ *
+ * If an individual cgroup is destroyed, the cgroups framework will
+ * automatically free all associated private data.  If cgroup private data is
+ * registered by a kernel module, then it is the module's responsibility to
+ * manually free its own private data upon unload.
+ */
+struct cgroup_priv {
+	/* cgroup this data is attached to; set by cgroup_priv_install() */
+	struct cgroup *cgroup;
+
+	/*
+	 * Lookup key that defines the in-kernel consumer of this private
+	 * data.
+	 */
+	const void *key;
+
+	/*
+	 * Function to release private data.  Called automatically, with the
+	 * cgroup's privdata_mutex held, if/when the cgroup is destroyed.
+	 */
+	void (*free)(struct cgroup_priv *priv);
+
+	/* Hashlist node in cgroup's privdata hashtable */
+	struct hlist_node hnode;
+};
+
 struct cgroup {
 	/* self css with NULL ->ss, points back to this cgroup */
 	struct cgroup_subsys_state self;
@@ -427,6 +458,13 @@  struct cgroup {
 	/* used to store eBPF programs */
 	struct cgroup_bpf bpf;
 
+	/*
+	 * cgroup private data registered by other non-controller parts of the
+	 * kernel
+	 */
+	DECLARE_HASHTABLE(privdata, 4);
+	struct mutex privdata_mutex;
+
 	/* ids of the ancestors at each level including self */
 	int ancestor_ids[];
 };
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 473e0c0abb86..a3604b005417 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -833,4 +833,82 @@  static inline void put_cgroup_ns(struct cgroup_namespace *ns)
 		free_cgroup_ns(ns);
 }
 
+/**
+ * cgroup_priv_install - install new cgroup private data
+ * @cgrp: cgroup to attach the private data to
+ * @priv: private data to install; ->key and ->free set, ->cgroup still NULL
+ *
+ * Allows non-controller kernel subsystems (e.g., drivers) to attach their own
+ * per-cgroup data without building a full cgroup controller.  The caller is
+ * expected to hold @cgrp->privdata_mutex (a warning is issued otherwise).
+ *
+ * Callers should ensure no private data is already registered under the same
+ * key.  If two sets of private data share a key, it is undefined which one
+ * future calls to cgroup_priv_lookup will return.
+ *
+ * Kernel modules that register private data with this function should take
+ * care to free their private data when unloaded to prevent leaks.
+ */
+static inline void
+cgroup_priv_install(struct cgroup *cgrp,
+		    struct cgroup_priv *priv)
+{
+	WARN_ON(!mutex_is_locked(&cgrp->privdata_mutex));
+	WARN_ON(!priv->key);
+	WARN_ON(!priv->free);
+	WARN_ON(priv->cgroup);
+
+	priv->cgroup = cgrp;
+	hash_add(cgrp->privdata, &priv->hnode,
+		 (unsigned long)priv->key);
+}
+
+/**
+ * cgroup_priv_lookup - looks up cgroup private data
+ * @cgrp: cgroup whose private data should be searched
+ * @key: Key uniquely identifying owner of private data to lookup
+ *
+ * Caller is expected to hold @cgrp->privdata_mutex (warns otherwise).
+ *
+ * Returns: previously registered private data associated with the given key,
+ * or NULL if no private data has been registered.
+ */
+static inline struct cgroup_priv *
+cgroup_priv_lookup(struct cgroup *cgrp,
+		   const void *key)
+{
+	struct cgroup_priv *priv;
+
+	WARN_ON(!mutex_is_locked(&cgrp->privdata_mutex));
+
+	hash_for_each_possible(cgrp->privdata, priv, hnode,
+			       (unsigned long)key)
+		if (priv->key == key)
+			return priv;
+
+	return NULL;
+}
+
+/**
+ * cgroup_priv_free - free cgroup private data
+ * @cgrp: cgroup the private data is registered with
+ * @key: Key uniquely identifying owner of private data to free
+ *
+ * Unhashes and frees every entry registered under @key, leaving other owners'
+ * entries installed.  Takes @cgrp->privdata_mutex; callers must not hold it.
+ */
+static inline void
+cgroup_priv_free(struct cgroup *cgrp, const void *key)
+{
+	struct cgroup_priv *priv;
+
+	mutex_lock(&cgrp->privdata_mutex);
+	while ((priv = cgroup_priv_lookup(cgrp, key)) != NULL) {
+		hash_del(&priv->hnode);
+		if (!WARN_ON(priv->free == NULL))
+			priv->free(priv);
+	}
+	mutex_unlock(&cgrp->privdata_mutex);
+}
+
 #endif /* _LINUX_CGROUP_H */
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 8cda3bc3ae22..9e576dc8b566 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1839,6 +1839,8 @@  static void init_cgroup_housekeeping(struct cgroup *cgrp)
 	INIT_LIST_HEAD(&cgrp->cset_links);
 	INIT_LIST_HEAD(&cgrp->pidlists);
 	mutex_init(&cgrp->pidlist_mutex);
+	hash_init(cgrp->privdata);
+	mutex_init(&cgrp->privdata_mutex);
 	cgrp->self.cgroup = cgrp;
 	cgrp->self.flags |= CSS_ONLINE;
 	cgrp->dom_cgrp = cgrp;
@@ -4578,6 +4580,9 @@  static void css_release_work_fn(struct work_struct *work)
 		container_of(work, struct cgroup_subsys_state, destroy_work);
 	struct cgroup_subsys *ss = css->ss;
 	struct cgroup *cgrp = css->cgroup;
+	struct cgroup_priv *priv;
+	struct hlist_node *tmp;
+	int i;
 
 	mutex_lock(&cgroup_mutex);
 
@@ -4617,6 +4622,15 @@  static void css_release_work_fn(struct work_struct *work)
 					 NULL);
 
 		cgroup_bpf_put(cgrp);
+
+		/* Any private data must be released automatically */
+		mutex_lock(&cgrp->privdata_mutex);
+		hash_for_each_safe(cgrp->privdata, i, tmp, priv, hnode) {
+			hash_del(&priv->hnode);
+			if (!WARN_ON(!priv->free))
+				priv->free(priv);
+		}
+		mutex_unlock(&cgrp->privdata_mutex);
 	}
 
 	mutex_unlock(&cgroup_mutex);