
[RFC,v2,4/5] drm, cgroup: Add total GEM buffer allocation limit

Message ID 20190509210410.5471-5-Kenny.Ho@amd.com (mailing list archive)
State New, archived
Series new cgroup controller for gpu/drm subsystem

Commit Message

Ho, Kenny May 9, 2019, 9:04 p.m. UTC
The drm resource being measured and limited here is GEM buffer
objects.  User applications allocate and free these buffers.  In
addition, a process can allocate a buffer and share it with another
process.  The consumer of a shared buffer can also outlive the
allocator of the buffer.

For the purpose of cgroup accounting and limiting, ownership of the
buffer is deemed to be the cgroup to which the allocating process
belongs.  There is one limit per drm device.

In order to prevent a buffer from outliving the cgroup that owns it, a
process is prevented from importing buffers that are not owned by the
process' cgroup or the ancestors of the process' cgroup.

For this resource, the control files are prefixed with drm.buffer.total.

There are four control file types,
stats (ro) - display current measured values for a resource
max (rw) - limits for a resource
default (ro, root cgroup only) - default values for a resource
help (ro, root cgroup only) - help string for a resource

Each file is multi-lined with one entry/line per drm device.

Usage examples:
// set limit for card1 to 1GB
sed -i '2s/.*/1073741824/' /sys/fs/cgroup/<cgroup>/drm.buffer.total.max

// set limit for card0 to 512MB
sed -i '1s/.*/536870912/' /sys/fs/cgroup/<cgroup>/drm.buffer.total.max
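
// For illustration only: reading the limits back on a hypothetical
// two-card system would show one value per line, in minor order
cat /sys/fs/cgroup/<cgroup>/drm.buffer.total.max
536870912
1073741824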

Change-Id: I4c249d06d45ec709d6481d4cbe87c5168545c5d0
Signed-off-by: Kenny Ho <Kenny.Ho@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c |   4 +
 drivers/gpu/drm/drm_gem.c                  |   7 +
 drivers/gpu/drm/drm_prime.c                |   9 +
 include/drm/drm_cgroup.h                   |  34 ++-
 include/drm/drm_gem.h                      |  11 +
 include/linux/cgroup_drm.h                 |   3 +
 kernel/cgroup/drm.c                        | 280 +++++++++++++++++++++
 7 files changed, 346 insertions(+), 2 deletions(-)

Comments

Christian König May 10, 2019, 12:28 p.m. UTC | #1
Am 09.05.19 um 23:04 schrieb Kenny Ho:
> The drm resource being measured and limited here is the GEM buffer
> objects.  User applications allocate and free these buffers.  In
> addition, a process can allocate a buffer and share it with another
> process.  The consumer of a shared buffer can also outlive the
> allocator of the buffer.
>
> For the purpose of cgroup accounting and limiting, ownership of the
> buffer is deemed to be the cgroup for which the allocating process
> belongs to.  There is one limit per drm device.
>
> In order to prevent the buffer outliving the cgroup that owns it, a
> process is prevented from importing buffers that are not own by the
> process' cgroup or the ancestors of the process' cgroup.
>
> For this resource, the control files are prefixed with drm.buffer.total.
>
> There are four control file types,
> stats (ro) - display current measured values for a resource
> max (rw) - limits for a resource
> default (ro, root cgroup only) - default values for a resource
> help (ro, root cgroup only) - help string for a resource
>
> Each file is multi-lined with one entry/line per drm device.
>
> Usage examples:
> // set limit for card1 to 1GB
> sed -i '2s/.*/1073741824/' /sys/fs/cgroup/<cgroup>/drm.buffer.total.max
>
> // set limit for card0 to 512MB
> sed -i '1s/.*/536870912/' /sys/fs/cgroup/<cgroup>/drm.buffer.total.max
>
> Change-Id: I4c249d06d45ec709d6481d4cbe87c5168545c5d0
> Signed-off-by: Kenny Ho <Kenny.Ho@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_object.c |   4 +
>   drivers/gpu/drm/drm_gem.c                  |   7 +
>   drivers/gpu/drm/drm_prime.c                |   9 +
>   include/drm/drm_cgroup.h                   |  34 ++-
>   include/drm/drm_gem.h                      |  11 +
>   include/linux/cgroup_drm.h                 |   3 +
>   kernel/cgroup/drm.c                        | 280 +++++++++++++++++++++
>   7 files changed, 346 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> index 93b2c5a48a71..b4c078b7ad63 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> @@ -34,6 +34,7 @@
>   #include <drm/drmP.h>
>   #include <drm/amdgpu_drm.h>
>   #include <drm/drm_cache.h>
> +#include <drm/drm_cgroup.h>
>   #include "amdgpu.h"
>   #include "amdgpu_trace.h"
>   #include "amdgpu_amdkfd.h"
> @@ -446,6 +447,9 @@ static int amdgpu_bo_do_create(struct amdgpu_device *adev,
>   	if (!amdgpu_bo_validate_size(adev, size, bp->domain))
>   		return -ENOMEM;
>   
> +	if (!drmcgrp_bo_can_allocate(current, adev->ddev, size))
> +		return -ENOMEM;
> +
>   	*bo_ptr = NULL;
>   
>   	acc_size = ttm_bo_dma_acc_size(&adev->mman.bdev, size,
> diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c
> index 6a80db077dc6..cbd49bf34dcf 100644
> --- a/drivers/gpu/drm/drm_gem.c
> +++ b/drivers/gpu/drm/drm_gem.c
> @@ -37,10 +37,12 @@
>   #include <linux/shmem_fs.h>
>   #include <linux/dma-buf.h>
>   #include <linux/mem_encrypt.h>
> +#include <linux/cgroup_drm.h>
>   #include <drm/drmP.h>
>   #include <drm/drm_vma_manager.h>
>   #include <drm/drm_gem.h>
>   #include <drm/drm_print.h>
> +#include <drm/drm_cgroup.h>
>   #include "drm_internal.h"
>   
>   /** @file drm_gem.c
> @@ -154,6 +156,9 @@ void drm_gem_private_object_init(struct drm_device *dev,
>   	obj->handle_count = 0;
>   	obj->size = size;
>   	drm_vma_node_reset(&obj->vma_node);
> +
> +	obj->drmcgrp = get_drmcgrp(current);
> +	drmcgrp_chg_bo_alloc(obj->drmcgrp, dev, size);
>   }
>   EXPORT_SYMBOL(drm_gem_private_object_init);
>   
> @@ -804,6 +809,8 @@ drm_gem_object_release(struct drm_gem_object *obj)
>   	if (obj->filp)
>   		fput(obj->filp);
>   
> +	drmcgrp_unchg_bo_alloc(obj->drmcgrp, obj->dev, obj->size);
> +
>   	drm_gem_free_mmap_offset(obj);
>   }
>   EXPORT_SYMBOL(drm_gem_object_release);
> diff --git a/drivers/gpu/drm/drm_prime.c b/drivers/gpu/drm/drm_prime.c
> index 231e3f6d5f41..faed5611a1c6 100644
> --- a/drivers/gpu/drm/drm_prime.c
> +++ b/drivers/gpu/drm/drm_prime.c
> @@ -32,6 +32,7 @@
>   #include <drm/drm_prime.h>
>   #include <drm/drm_gem.h>
>   #include <drm/drmP.h>
> +#include <drm/drm_cgroup.h>
>   
>   #include "drm_internal.h"
>   
> @@ -794,6 +795,7 @@ int drm_gem_prime_fd_to_handle(struct drm_device *dev,
>   {
>   	struct dma_buf *dma_buf;
>   	struct drm_gem_object *obj;
> +	struct drmcgrp *drmcgrp = get_drmcgrp(current);
>   	int ret;
>   
>   	dma_buf = dma_buf_get(prime_fd);
> @@ -818,6 +820,13 @@ int drm_gem_prime_fd_to_handle(struct drm_device *dev,
>   		goto out_unlock;
>   	}
>   
> +	/* only allow bo from the same cgroup or its ancestor to be imported */
> +	if (drmcgrp != NULL &&
> +			!drmcgrp_is_self_or_ancestor(drmcgrp, obj->drmcgrp)) {
> +		ret = -EACCES;
> +		goto out_unlock;
> +	}
> +

This will most likely go up in flames.

If I'm not completely mistaken we already use 
drm_gem_prime_fd_to_handle() to exchange handles between different 
cgroups in current container usages.

Christian.

>   	if (obj->dma_buf) {
>   		WARN_ON(obj->dma_buf != dma_buf);
>   	} else {
> diff --git a/include/drm/drm_cgroup.h b/include/drm/drm_cgroup.h
> index ddb9eab64360..8711b7c5f7bf 100644
> --- a/include/drm/drm_cgroup.h
> +++ b/include/drm/drm_cgroup.h
> @@ -4,12 +4,20 @@
>   #ifndef __DRM_CGROUP_H__
>   #define __DRM_CGROUP_H__
>   
> +#include <linux/cgroup_drm.h>
> +
>   #ifdef CONFIG_CGROUP_DRM
>   
>   int drmcgrp_register_device(struct drm_device *device);
> -
>   int drmcgrp_unregister_device(struct drm_device *device);
> -
> +bool drmcgrp_is_self_or_ancestor(struct drmcgrp *self,
> +		struct drmcgrp *relative);
> +void drmcgrp_chg_bo_alloc(struct drmcgrp *drmcgrp, struct drm_device *dev,
> +		size_t size);
> +void drmcgrp_unchg_bo_alloc(struct drmcgrp *drmcgrp, struct drm_device *dev,
> +		size_t size);
> +bool drmcgrp_bo_can_allocate(struct task_struct *task, struct drm_device *dev,
> +		size_t size);
>   #else
>   static inline int drmcgrp_register_device(struct drm_device *device)
>   {
> @@ -20,5 +28,27 @@ static inline int drmcgrp_unregister_device(struct drm_device *device)
>   {
>   	return 0;
>   }
> +
> +static inline bool drmcgrp_is_self_or_ancestor(struct drmcgrp *self,
> +		struct drmcgrp *relative)
> +{
> +	return false;
> +}
> +
> +static inline void drmcgrp_chg_bo_alloc(struct drmcgrp *drmcgrp,
> +		struct drm_device *dev,	size_t size)
> +{
> +}
> +
> +static inline void drmcgrp_unchg_bo_alloc(struct drmcgrp *drmcgrp,
> +		struct drm_device *dev,	size_t size)
> +{
> +}
> +
> +static inline bool drmcgrp_bo_can_allocate(struct task_struct *task,
> +		struct drm_device *dev,	size_t size)
> +{
> +	return true;
> +}
>   #endif /* CONFIG_CGROUP_DRM */
>   #endif /* __DRM_CGROUP_H__ */
> diff --git a/include/drm/drm_gem.h b/include/drm/drm_gem.h
> index c95727425284..02854c674b5c 100644
> --- a/include/drm/drm_gem.h
> +++ b/include/drm/drm_gem.h
> @@ -272,6 +272,17 @@ struct drm_gem_object {
>   	 *
>   	 */
>   	const struct drm_gem_object_funcs *funcs;
> +
> +	/**
> +	 * @drmcgrp:
> +	 *
> +	 * DRM cgroup this GEM object belongs to.
> +         *
> +         * This is used to track and limit the amount of GEM objects a user
> +         * can allocate.  Since GEM objects can be shared, this is also used
> +         * to ensure GEM objects are only shared within the same cgroup.
> +	 */
> +	struct drmcgrp *drmcgrp;
>   };
>   
>   /**
> diff --git a/include/linux/cgroup_drm.h b/include/linux/cgroup_drm.h
> index d7ccf434ca6b..fe14ba7bb1cf 100644
> --- a/include/linux/cgroup_drm.h
> +++ b/include/linux/cgroup_drm.h
> @@ -15,6 +15,9 @@
>   
>   struct drmcgrp_device_resource {
>   	/* for per device stats */
> +	s64			bo_stats_total_allocated;
> +
> +	s64			bo_limits_total_allocated;
>   };
>   
>   struct drmcgrp {
> diff --git a/kernel/cgroup/drm.c b/kernel/cgroup/drm.c
> index f9ef4bf042d8..bc3abff09113 100644
> --- a/kernel/cgroup/drm.c
> +++ b/kernel/cgroup/drm.c
> @@ -15,6 +15,22 @@ static DEFINE_MUTEX(drmcgrp_mutex);
>   struct drmcgrp_device {
>   	struct drm_device	*dev;
>   	struct mutex		mutex;
> +
> +	s64			bo_limits_total_allocated_default;
> +};
> +
> +#define DRMCG_CTF_PRIV_SIZE 3
> +#define DRMCG_CTF_PRIV_MASK GENMASK((DRMCG_CTF_PRIV_SIZE - 1), 0)
> +
> +enum drmcgrp_res_type {
> +	DRMCGRP_TYPE_BO_TOTAL,
> +};
> +
> +enum drmcgrp_file_type {
> +	DRMCGRP_FTYPE_STATS,
> +	DRMCGRP_FTYPE_MAX,
> +	DRMCGRP_FTYPE_DEFAULT,
> +	DRMCGRP_FTYPE_HELP,
>   };
>   
>   /* indexed by drm_minor for access speed */
> @@ -53,6 +69,10 @@ static inline int init_drmcgrp_single(struct drmcgrp *drmcgrp, int i)
>   	}
>   
>   	/* set defaults here */
> +	if (known_drmcgrp_devs[i] != NULL) {
> +		ddr->bo_limits_total_allocated =
> +		  known_drmcgrp_devs[i]->bo_limits_total_allocated_default;
> +	}
>   
>   	return 0;
>   }
> @@ -99,7 +119,187 @@ drmcgrp_css_alloc(struct cgroup_subsys_state *parent_css)
>   	return &drmcgrp->css;
>   }
>   
> +static inline void drmcgrp_print_stats(struct drmcgrp_device_resource *ddr,
> +		struct seq_file *sf, enum drmcgrp_res_type type)
> +{
> +	if (ddr == NULL) {
> +		seq_puts(sf, "\n");
> +		return;
> +	}
> +
> +	switch (type) {
> +	case DRMCGRP_TYPE_BO_TOTAL:
> +		seq_printf(sf, "%lld\n", ddr->bo_stats_total_allocated);
> +		break;
> +	default:
> +		seq_puts(sf, "\n");
> +		break;
> +	}
> +}
> +
> +static inline void drmcgrp_print_limits(struct drmcgrp_device_resource *ddr,
> +		struct seq_file *sf, enum drmcgrp_res_type type)
> +{
> +	if (ddr == NULL) {
> +		seq_puts(sf, "\n");
> +		return;
> +	}
> +
> +	switch (type) {
> +	case DRMCGRP_TYPE_BO_TOTAL:
> +		seq_printf(sf, "%lld\n", ddr->bo_limits_total_allocated);
> +		break;
> +	default:
> +		seq_puts(sf, "\n");
> +		break;
> +	}
> +}
> +
> +static inline void drmcgrp_print_default(struct drmcgrp_device *ddev,
> +		struct seq_file *sf, enum drmcgrp_res_type type)
> +{
> +	if (ddev == NULL) {
> +		seq_puts(sf, "\n");
> +		return;
> +	}
> +
> +	switch (type) {
> +	case DRMCGRP_TYPE_BO_TOTAL:
> +		seq_printf(sf, "%lld\n", ddev->bo_limits_total_allocated_default);
> +		break;
> +	default:
> +		seq_puts(sf, "\n");
> +		break;
> +	}
> +}
> +
> +static inline void drmcgrp_print_help(int cardNum, struct seq_file *sf,
> +		enum drmcgrp_res_type type)
> +{
> +	switch (type) {
> +	case DRMCGRP_TYPE_BO_TOTAL:
> +		seq_printf(sf,
> +		"Total amount of buffer allocation in bytes for card%d\n",
> +		cardNum);
> +		break;
> +	default:
> +		seq_puts(sf, "\n");
> +		break;
> +	}
> +}
> +
> +int drmcgrp_bo_show(struct seq_file *sf, void *v)
> +{
> +	struct drmcgrp *drmcgrp = css_drmcgrp(seq_css(sf));
> +	struct drmcgrp_device_resource *ddr = NULL;
> +	enum drmcgrp_file_type f_type = seq_cft(sf)->
> +		private & DRMCG_CTF_PRIV_MASK;
> +	enum drmcgrp_res_type type = seq_cft(sf)->
> +		private >> DRMCG_CTF_PRIV_SIZE;
> +	struct drmcgrp_device *ddev;
> +	int i;
> +
> +	for (i = 0; i <= max_minor; i++) {
> +		ddr = drmcgrp->dev_resources[i];
> +		ddev = known_drmcgrp_devs[i];
> +
> +		switch (f_type) {
> +		case DRMCGRP_FTYPE_STATS:
> +			drmcgrp_print_stats(ddr, sf, type);
> +			break;
> +		case DRMCGRP_FTYPE_MAX:
> +			drmcgrp_print_limits(ddr, sf, type);
> +			break;
> +		case DRMCGRP_FTYPE_DEFAULT:
> +			drmcgrp_print_default(ddev, sf, type);
> +			break;
> +		case DRMCGRP_FTYPE_HELP:
> +			drmcgrp_print_help(i, sf, type);
> +			break;
> +		default:
> +			seq_puts(sf, "\n");
> +			break;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
> +ssize_t drmcgrp_bo_limit_write(struct kernfs_open_file *of, char *buf,
> +		size_t nbytes, loff_t off)
> +{
> +	struct drmcgrp *drmcgrp = css_drmcgrp(of_css(of));
> +	enum drmcgrp_res_type type = of_cft(of)->private >> DRMCG_CTF_PRIV_SIZE;
> +	char *cft_name = of_cft(of)->name;
> +	char *limits = strstrip(buf);
> +	struct drmcgrp_device_resource *ddr;
> +	char *sval;
> +	s64 val;
> +	int i = 0;
> +	int rc;
> +
> +	while (i <= max_minor && limits != NULL) {
> +		sval =  strsep(&limits, "\n");
> +		rc = kstrtoll(sval, 0, &val);
> +
> +		if (rc) {
> +			pr_err("drmcgrp: %s: minor %d, err %d. ",
> +				cft_name, i, rc);
> +			pr_cont_cgroup_name(drmcgrp->css.cgroup);
> +			pr_cont("\n");
> +		} else {
> +			ddr = drmcgrp->dev_resources[i];
> +			switch (type) {
> +			case DRMCGRP_TYPE_BO_TOTAL:
> +                                if (val < 0) continue;
> +				ddr->bo_limits_total_allocated = val;
> +				break;
> +			default:
> +				break;
> +			}
> +		}
> +
> +		i++;
> +	}
> +
> +	if (i <= max_minor) {
> +		pr_err("drmcgrp: %s: less entries than # of drm devices. ",
> +				cft_name);
> +		pr_cont_cgroup_name(drmcgrp->css.cgroup);
> +		pr_cont("\n");
> +	}
> +
> +	return nbytes;
> +}
> +
>   struct cftype files[] = {
> +	{
> +		.name = "buffer.total.stats",
> +		.seq_show = drmcgrp_bo_show,
> +		.private = (DRMCGRP_TYPE_BO_TOTAL << DRMCG_CTF_PRIV_SIZE) |
> +			DRMCGRP_FTYPE_STATS,
> +	},
> +	{
> +		.name = "buffer.total.default",
> +		.seq_show = drmcgrp_bo_show,
> +		.flags = CFTYPE_ONLY_ON_ROOT,
> +		.private = (DRMCGRP_TYPE_BO_TOTAL << DRMCG_CTF_PRIV_SIZE) |
> +			DRMCGRP_FTYPE_DEFAULT,
> +	},
> +	{
> +		.name = "buffer.total.help",
> +		.seq_show = drmcgrp_bo_show,
> +		.flags = CFTYPE_ONLY_ON_ROOT,
> +		.private = (DRMCGRP_TYPE_BO_TOTAL << DRMCG_CTF_PRIV_SIZE) |
> +			DRMCGRP_FTYPE_HELP,
> +	},
> +	{
> +		.name = "buffer.total.max",
> +		.write = drmcgrp_bo_limit_write,
> +		.seq_show = drmcgrp_bo_show,
> +		.private = (DRMCGRP_TYPE_BO_TOTAL << DRMCG_CTF_PRIV_SIZE) |
> +			DRMCGRP_FTYPE_MAX,
> +	},
>   	{ }	/* terminate */
>   };
>   
> @@ -122,6 +322,8 @@ int drmcgrp_register_device(struct drm_device *dev)
>   		return -ENOMEM;
>   
>   	ddev->dev = dev;
> +	ddev->bo_limits_total_allocated_default = S64_MAX;
> +
>   	mutex_init(&ddev->mutex);
>   
>   	mutex_lock(&drmcgrp_mutex);
> @@ -156,3 +358,81 @@ int drmcgrp_unregister_device(struct drm_device *dev)
>   	return 0;
>   }
>   EXPORT_SYMBOL(drmcgrp_unregister_device);
> +
> +bool drmcgrp_is_self_or_ancestor(struct drmcgrp *self, struct drmcgrp *relative)
> +{
> +	for (; self != NULL; self = parent_drmcgrp(self))
> +		if (self == relative)
> +			return true;
> +
> +	return false;
> +}
> +EXPORT_SYMBOL(drmcgrp_is_self_or_ancestor);
> +
> +bool drmcgrp_bo_can_allocate(struct task_struct *task, struct drm_device *dev,
> +		size_t size)
> +{
> +	struct drmcgrp *drmcgrp = get_drmcgrp(task);
> +	struct drmcgrp_device_resource *ddr;
> +	struct drmcgrp_device_resource *d;
> +	int devIdx = dev->primary->index;
> +	bool result = true;
> +	s64 delta = 0;
> +
> +	if (drmcgrp == NULL || drmcgrp == root_drmcgrp)
> +		return true;
> +
> +	ddr = drmcgrp->dev_resources[devIdx];
> +	mutex_lock(&known_drmcgrp_devs[devIdx]->mutex);
> +	for ( ; drmcgrp != root_drmcgrp; drmcgrp = parent_drmcgrp(drmcgrp)) {
> +		d = drmcgrp->dev_resources[devIdx];
> +		delta = d->bo_limits_total_allocated -
> +				d->bo_stats_total_allocated;
> +
> +		if (delta <= 0 || size > delta) {
> +			result = false;
> +			break;
> +		}
> +	}
> +	mutex_unlock(&known_drmcgrp_devs[devIdx]->mutex);
> +
> +	return result;
> +}
> +EXPORT_SYMBOL(drmcgrp_bo_can_allocate);
> +
> +void drmcgrp_chg_bo_alloc(struct drmcgrp *drmcgrp, struct drm_device *dev,
> +		size_t size)
> +{
> +	struct drmcgrp_device_resource *ddr;
> +	int devIdx = dev->primary->index;
> +
> +	if (drmcgrp == NULL || known_drmcgrp_devs[devIdx] == NULL)
> +		return;
> +
> +	mutex_lock(&known_drmcgrp_devs[devIdx]->mutex);
> +	for ( ; drmcgrp != NULL; drmcgrp = parent_drmcgrp(drmcgrp)) {
> +		ddr = drmcgrp->dev_resources[devIdx];
> +
> +		ddr->bo_stats_total_allocated += (s64)size;
> +	}
> +	mutex_unlock(&known_drmcgrp_devs[devIdx]->mutex);
> +}
> +EXPORT_SYMBOL(drmcgrp_chg_bo_alloc);
> +
> +void drmcgrp_unchg_bo_alloc(struct drmcgrp *drmcgrp, struct drm_device *dev,
> +		size_t size)
> +{
> +	struct drmcgrp_device_resource *ddr;
> +	int devIdx = dev->primary->index;
> +
> +	if (drmcgrp == NULL || known_drmcgrp_devs[devIdx] == NULL)
> +		return;
> +
> +	ddr = drmcgrp->dev_resources[devIdx];
> +	mutex_lock(&known_drmcgrp_devs[devIdx]->mutex);
> +	for ( ; drmcgrp != NULL; drmcgrp = parent_drmcgrp(drmcgrp))
> +		drmcgrp->dev_resources[devIdx]->bo_stats_total_allocated
> +			-= (s64)size;
> +	mutex_unlock(&known_drmcgrp_devs[devIdx]->mutex);
> +}
> +EXPORT_SYMBOL(drmcgrp_unchg_bo_alloc);
Kenny Ho May 10, 2019, 2:57 p.m. UTC | #2
On Fri, May 10, 2019 at 8:28 AM Christian König
<ckoenig.leichtzumerken@gmail.com> wrote:
>
> Am 09.05.19 um 23:04 schrieb Kenny Ho:
> > +     /* only allow bo from the same cgroup or its ancestor to be imported */
> > +     if (drmcgrp != NULL &&
> > +                     !drmcgrp_is_self_or_ancestor(drmcgrp, obj->drmcgrp)) {
> > +             ret = -EACCES;
> > +             goto out_unlock;
> > +     }
> > +
>
> This will most likely go up in flames.
>
> If I'm not completely mistaken we already use
> drm_gem_prime_fd_to_handle() to exchange handles between different
> cgroups in current container usages.
This is something that I am interested in getting more details from
the broader community because the details affect how likely this will
go up in flames ;).  Note that this check does not block sharing of
handles from a cgroup parent to its children in the hierarchy, nor does it
block sharing of handles within a cgroup.

I am interested to find out, when existing apps share handles between
containers, if there are any expectations on resource management.
Since there is no drm cgroup in current container usage, I expect
the answer to be no.  In this case, the drm cgroup controller can be
disabled on its own (in the context of cgroup-v2's unified hierarchy),
or the process can remain at the root for the drm cgroup hierarchy (in
the context of cgroup-v1.)  If I understand the cgroup api correctly,
that means all processes would be part of the root cgroup as far as the
drm controller is concerned and this block will not come into effect.
I have verified that this is indeed the current default behaviour of a
container runtime (runc, which is used by docker, podman and others.)
The new drm cgroup controller is simply ignored and all processes
remain at the root of the hierarchy (since there are no other
cgroups.)  I plan to make contributions to runc (so folks can actually
use this feature with docker/podman/k8s, etc.) once things stabilize
on the kernel side.

On the other hand, if there are expectations for resource management
between containers, I would like to know who the expected manager is
and how it fits into the concept of a container (which enforces some
level of isolation).  One possible manager may be the display server.
But as long as the display server is in a parent cgroup of the apps'
cgroup, the apps can still import handles from the display server
under the current implementation.  My understanding is that this is
most likely the case, with the display server simply sitting at the
default/root cgroup.  But I certainly want to hear more about other
use cases (for example, is running multiple display servers on a
single host a realistic possibility?  Are there people running
multiple display servers inside peer containers?  If so, how do they
coordinate resources?)

I should probably summarize some of these into the commit message.

Regards,
Kenny



> Christian.
>
> >       if (obj->dma_buf) {
> >               WARN_ON(obj->dma_buf != dma_buf);
> >       } else {
> > diff --git a/include/drm/drm_cgroup.h b/include/drm/drm_cgroup.h
> > index ddb9eab64360..8711b7c5f7bf 100644
> > --- a/include/drm/drm_cgroup.h
> > +++ b/include/drm/drm_cgroup.h
> > @@ -4,12 +4,20 @@
> >   #ifndef __DRM_CGROUP_H__
> >   #define __DRM_CGROUP_H__
> >
> > +#include <linux/cgroup_drm.h>
> > +
> >   #ifdef CONFIG_CGROUP_DRM
> >
> >   int drmcgrp_register_device(struct drm_device *device);
> > -
> >   int drmcgrp_unregister_device(struct drm_device *device);
> > -
> > +bool drmcgrp_is_self_or_ancestor(struct drmcgrp *self,
> > +             struct drmcgrp *relative);
> > +void drmcgrp_chg_bo_alloc(struct drmcgrp *drmcgrp, struct drm_device *dev,
> > +             size_t size);
> > +void drmcgrp_unchg_bo_alloc(struct drmcgrp *drmcgrp, struct drm_device *dev,
> > +             size_t size);
> > +bool drmcgrp_bo_can_allocate(struct task_struct *task, struct drm_device *dev,
> > +             size_t size);
> >   #else
> >   static inline int drmcgrp_register_device(struct drm_device *device)
> >   {
> > @@ -20,5 +28,27 @@ static inline int drmcgrp_unregister_device(struct drm_device *device)
> >   {
> >       return 0;
> >   }
> > +
> > +static inline bool drmcgrp_is_self_or_ancestor(struct drmcgrp *self,
> > +             struct drmcgrp *relative)
> > +{
> > +     return false;
> > +}
> > +
> > +static inline void drmcgrp_chg_bo_alloc(struct drmcgrp *drmcgrp,
> > +             struct drm_device *dev, size_t size)
> > +{
> > +}
> > +
> > +static inline void drmcgrp_unchg_bo_alloc(struct drmcgrp *drmcgrp,
> > +             struct drm_device *dev, size_t size)
> > +{
> > +}
> > +
> > +static inline bool drmcgrp_bo_can_allocate(struct task_struct *task,
> > +             struct drm_device *dev, size_t size)
> > +{
> > +     return true;
> > +}
> >   #endif /* CONFIG_CGROUP_DRM */
> >   #endif /* __DRM_CGROUP_H__ */
> > diff --git a/include/drm/drm_gem.h b/include/drm/drm_gem.h
> > index c95727425284..02854c674b5c 100644
> > --- a/include/drm/drm_gem.h
> > +++ b/include/drm/drm_gem.h
> > @@ -272,6 +272,17 @@ struct drm_gem_object {
> >        *
> >        */
> >       const struct drm_gem_object_funcs *funcs;
> > +
> > +     /**
> > +      * @drmcgrp:
> > +      *
> > +      * DRM cgroup this GEM object belongs to.
> > +         *
> > +         * This is used to track and limit the amount of GEM objects a user
> > +         * can allocate.  Since GEM objects can be shared, this is also used
> > +         * to ensure GEM objects are only shared within the same cgroup.
> > +      */
> > +     struct drmcgrp *drmcgrp;
> >   };
> >
> >   /**
> > diff --git a/include/linux/cgroup_drm.h b/include/linux/cgroup_drm.h
> > index d7ccf434ca6b..fe14ba7bb1cf 100644
> > --- a/include/linux/cgroup_drm.h
> > +++ b/include/linux/cgroup_drm.h
> > @@ -15,6 +15,9 @@
> >
> >   struct drmcgrp_device_resource {
> >       /* for per device stats */
> > +     s64                     bo_stats_total_allocated;
> > +
> > +     s64                     bo_limits_total_allocated;
> >   };
> >
> >   struct drmcgrp {
> > diff --git a/kernel/cgroup/drm.c b/kernel/cgroup/drm.c
> > index f9ef4bf042d8..bc3abff09113 100644
> > --- a/kernel/cgroup/drm.c
> > +++ b/kernel/cgroup/drm.c
> > @@ -15,6 +15,22 @@ static DEFINE_MUTEX(drmcgrp_mutex);
> >   struct drmcgrp_device {
> >       struct drm_device       *dev;
> >       struct mutex            mutex;
> > +
> > +     s64                     bo_limits_total_allocated_default;
> > +};
> > +
> > +#define DRMCG_CTF_PRIV_SIZE 3
> > +#define DRMCG_CTF_PRIV_MASK GENMASK((DRMCG_CTF_PRIV_SIZE - 1), 0)
> > +
> > +enum drmcgrp_res_type {
> > +     DRMCGRP_TYPE_BO_TOTAL,
> > +};
> > +
> > +enum drmcgrp_file_type {
> > +     DRMCGRP_FTYPE_STATS,
> > +     DRMCGRP_FTYPE_MAX,
> > +     DRMCGRP_FTYPE_DEFAULT,
> > +     DRMCGRP_FTYPE_HELP,
> >   };
> >
> >   /* indexed by drm_minor for access speed */
> > @@ -53,6 +69,10 @@ static inline int init_drmcgrp_single(struct drmcgrp *drmcgrp, int i)
> >       }
> >
> >       /* set defaults here */
> > +     if (known_drmcgrp_devs[i] != NULL) {
> > +             ddr->bo_limits_total_allocated =
> > +               known_drmcgrp_devs[i]->bo_limits_total_allocated_default;
> > +     }
> >
> >       return 0;
> >   }
> > @@ -99,7 +119,187 @@ drmcgrp_css_alloc(struct cgroup_subsys_state *parent_css)
> >       return &drmcgrp->css;
> >   }
> >
> > +static inline void drmcgrp_print_stats(struct drmcgrp_device_resource *ddr,
> > +             struct seq_file *sf, enum drmcgrp_res_type type)
> > +{
> > +     if (ddr == NULL) {
> > +             seq_puts(sf, "\n");
> > +             return;
> > +     }
> > +
> > +     switch (type) {
> > +     case DRMCGRP_TYPE_BO_TOTAL:
> > +             seq_printf(sf, "%lld\n", ddr->bo_stats_total_allocated);
> > +             break;
> > +     default:
> > +             seq_puts(sf, "\n");
> > +             break;
> > +     }
> > +}
> > +
> > +static inline void drmcgrp_print_limits(struct drmcgrp_device_resource *ddr,
> > +             struct seq_file *sf, enum drmcgrp_res_type type)
> > +{
> > +     if (ddr == NULL) {
> > +             seq_puts(sf, "\n");
> > +             return;
> > +     }
> > +
> > +     switch (type) {
> > +     case DRMCGRP_TYPE_BO_TOTAL:
> > +             seq_printf(sf, "%lld\n", ddr->bo_limits_total_allocated);
> > +             break;
> > +     default:
> > +             seq_puts(sf, "\n");
> > +             break;
> > +     }
> > +}
> > +
> > +static inline void drmcgrp_print_default(struct drmcgrp_device *ddev,
> > +             struct seq_file *sf, enum drmcgrp_res_type type)
> > +{
> > +     if (ddev == NULL) {
> > +             seq_puts(sf, "\n");
> > +             return;
> > +     }
> > +
> > +     switch (type) {
> > +     case DRMCGRP_TYPE_BO_TOTAL:
> > +             seq_printf(sf, "%lld\n", ddev->bo_limits_total_allocated_default);
> > +             break;
> > +     default:
> > +             seq_puts(sf, "\n");
> > +             break;
> > +     }
> > +}
> > +
> > +static inline void drmcgrp_print_help(int cardNum, struct seq_file *sf,
> > +             enum drmcgrp_res_type type)
> > +{
> > +     switch (type) {
> > +     case DRMCGRP_TYPE_BO_TOTAL:
> > +             seq_printf(sf,
> > +             "Total amount of buffer allocation in bytes for card%d\n",
> > +             cardNum);
> > +             break;
> > +     default:
> > +             seq_puts(sf, "\n");
> > +             break;
> > +     }
> > +}
> > +
> > +int drmcgrp_bo_show(struct seq_file *sf, void *v)
> > +{
> > +     struct drmcgrp *drmcgrp = css_drmcgrp(seq_css(sf));
> > +     struct drmcgrp_device_resource *ddr = NULL;
> > +     enum drmcgrp_file_type f_type = seq_cft(sf)->
> > +             private & DRMCG_CTF_PRIV_MASK;
> > +     enum drmcgrp_res_type type = seq_cft(sf)->
> > +             private >> DRMCG_CTF_PRIV_SIZE;
> > +     struct drmcgrp_device *ddev;
> > +     int i;
> > +
> > +     for (i = 0; i <= max_minor; i++) {
> > +             ddr = drmcgrp->dev_resources[i];
> > +             ddev = known_drmcgrp_devs[i];
> > +
> > +             switch (f_type) {
> > +             case DRMCGRP_FTYPE_STATS:
> > +                     drmcgrp_print_stats(ddr, sf, type);
> > +                     break;
> > +             case DRMCGRP_FTYPE_MAX:
> > +                     drmcgrp_print_limits(ddr, sf, type);
> > +                     break;
> > +             case DRMCGRP_FTYPE_DEFAULT:
> > +                     drmcgrp_print_default(ddev, sf, type);
> > +                     break;
> > +             case DRMCGRP_FTYPE_HELP:
> > +                     drmcgrp_print_help(i, sf, type);
> > +                     break;
> > +             default:
> > +                     seq_puts(sf, "\n");
> > +                     break;
> > +             }
> > +     }
> > +
> > +     return 0;
> > +}
> > +
> > +ssize_t drmcgrp_bo_limit_write(struct kernfs_open_file *of, char *buf,
> > +             size_t nbytes, loff_t off)
> > +{
> > +     struct drmcgrp *drmcgrp = css_drmcgrp(of_css(of));
> > +     enum drmcgrp_res_type type = of_cft(of)->private >> DRMCG_CTF_PRIV_SIZE;
> > +     char *cft_name = of_cft(of)->name;
> > +     char *limits = strstrip(buf);
> > +     struct drmcgrp_device_resource *ddr;
> > +     char *sval;
> > +     s64 val;
> > +     int i = 0;
> > +     int rc;
> > +
> > +     while (i <= max_minor && limits != NULL) {
> > +             sval =  strsep(&limits, "\n");
> > +             rc = kstrtoll(sval, 0, &val);
> > +
> > +             if (rc) {
> > +                     pr_err("drmcgrp: %s: minor %d, err %d. ",
> > +                             cft_name, i, rc);
> > +                     pr_cont_cgroup_name(drmcgrp->css.cgroup);
> > +                     pr_cont("\n");
> > +             } else {
> > +                     ddr = drmcgrp->dev_resources[i];
> > +                     switch (type) {
> > +                     case DRMCGRP_TYPE_BO_TOTAL:
> > +                                if (val < 0) continue;
> > +                             ddr->bo_limits_total_allocated = val;
> > +                             break;
> > +                     default:
> > +                             break;
> > +                     }
> > +             }
> > +
> > +             i++;
> > +     }
> > +
> > +     if (i <= max_minor) {
> > +             pr_err("drmcgrp: %s: less entries than # of drm devices. ",
> > +                             cft_name);
> > +             pr_cont_cgroup_name(drmcgrp->css.cgroup);
> > +             pr_cont("\n");
> > +     }
> > +
> > +     return nbytes;
> > +}
> > +
> >   struct cftype files[] = {
> > +     {
> > +             .name = "buffer.total.stats",
> > +             .seq_show = drmcgrp_bo_show,
> > +             .private = (DRMCGRP_TYPE_BO_TOTAL << DRMCG_CTF_PRIV_SIZE) |
> > +                     DRMCGRP_FTYPE_STATS,
> > +     },
> > +     {
> > +             .name = "buffer.total.default",
> > +             .seq_show = drmcgrp_bo_show,
> > +             .flags = CFTYPE_ONLY_ON_ROOT,
> > +             .private = (DRMCGRP_TYPE_BO_TOTAL << DRMCG_CTF_PRIV_SIZE) |
> > +                     DRMCGRP_FTYPE_DEFAULT,
> > +     },
> > +     {
> > +             .name = "buffer.total.help",
> > +             .seq_show = drmcgrp_bo_show,
> > +             .flags = CFTYPE_ONLY_ON_ROOT,
> > +             .private = (DRMCGRP_TYPE_BO_TOTAL << DRMCG_CTF_PRIV_SIZE) |
> > +                     DRMCGRP_FTYPE_HELP,
> > +     },
> > +     {
> > +             .name = "buffer.total.max",
> > +             .write = drmcgrp_bo_limit_write,
> > +             .seq_show = drmcgrp_bo_show,
> > +             .private = (DRMCGRP_TYPE_BO_TOTAL << DRMCG_CTF_PRIV_SIZE) |
> > +                     DRMCGRP_FTYPE_MAX,
> > +     },
> >       { }     /* terminate */
> >   };
> >
> > @@ -122,6 +322,8 @@ int drmcgrp_register_device(struct drm_device *dev)
> >               return -ENOMEM;
> >
> >       ddev->dev = dev;
> > +     ddev->bo_limits_total_allocated_default = S64_MAX;
> > +
> >       mutex_init(&ddev->mutex);
> >
> >       mutex_lock(&drmcgrp_mutex);
> > @@ -156,3 +358,81 @@ int drmcgrp_unregister_device(struct drm_device *dev)
> >       return 0;
> >   }
> >   EXPORT_SYMBOL(drmcgrp_unregister_device);
> > +
> > +bool drmcgrp_is_self_or_ancestor(struct drmcgrp *self, struct drmcgrp *relative)
> > +{
> > +     for (; self != NULL; self = parent_drmcgrp(self))
> > +             if (self == relative)
> > +                     return true;
> > +
> > +     return false;
> > +}
> > +EXPORT_SYMBOL(drmcgrp_is_self_or_ancestor);
> > +
> > +bool drmcgrp_bo_can_allocate(struct task_struct *task, struct drm_device *dev,
> > +             size_t size)
> > +{
> > +     struct drmcgrp *drmcgrp = get_drmcgrp(task);
> > +     struct drmcgrp_device_resource *ddr;
> > +     struct drmcgrp_device_resource *d;
> > +     int devIdx = dev->primary->index;
> > +     bool result = true;
> > +     s64 delta = 0;
> > +
> > +     if (drmcgrp == NULL || drmcgrp == root_drmcgrp)
> > +             return true;
> > +
> > +     ddr = drmcgrp->dev_resources[devIdx];
> > +     mutex_lock(&known_drmcgrp_devs[devIdx]->mutex);
> > +     for ( ; drmcgrp != root_drmcgrp; drmcgrp = parent_drmcgrp(drmcgrp)) {
> > +             d = drmcgrp->dev_resources[devIdx];
> > +             delta = d->bo_limits_total_allocated -
> > +                             d->bo_stats_total_allocated;
> > +
> > +             if (delta <= 0 || size > delta) {
> > +                     result = false;
> > +                     break;
> > +             }
> > +     }
> > +     mutex_unlock(&known_drmcgrp_devs[devIdx]->mutex);
> > +
> > +     return result;
> > +}
> > +EXPORT_SYMBOL(drmcgrp_bo_can_allocate);
> > +
> > +void drmcgrp_chg_bo_alloc(struct drmcgrp *drmcgrp, struct drm_device *dev,
> > +             size_t size)
> > +{
> > +     struct drmcgrp_device_resource *ddr;
> > +     int devIdx = dev->primary->index;
> > +
> > +     if (drmcgrp == NULL || known_drmcgrp_devs[devIdx] == NULL)
> > +             return;
> > +
> > +     mutex_lock(&known_drmcgrp_devs[devIdx]->mutex);
> > +     for ( ; drmcgrp != NULL; drmcgrp = parent_drmcgrp(drmcgrp)) {
> > +             ddr = drmcgrp->dev_resources[devIdx];
> > +
> > +             ddr->bo_stats_total_allocated += (s64)size;
> > +     }
> > +     mutex_unlock(&known_drmcgrp_devs[devIdx]->mutex);
> > +}
> > +EXPORT_SYMBOL(drmcgrp_chg_bo_alloc);
> > +
> > +void drmcgrp_unchg_bo_alloc(struct drmcgrp *drmcgrp, struct drm_device *dev,
> > +             size_t size)
> > +{
> > +     struct drmcgrp_device_resource *ddr;
> > +     int devIdx = dev->primary->index;
> > +
> > +     if (drmcgrp == NULL || known_drmcgrp_devs[devIdx] == NULL)
> > +             return;
> > +
> > +     ddr = drmcgrp->dev_resources[devIdx];
> > +     mutex_lock(&known_drmcgrp_devs[devIdx]->mutex);
> > +     for ( ; drmcgrp != NULL; drmcgrp = parent_drmcgrp(drmcgrp))
> > +             drmcgrp->dev_resources[devIdx]->bo_stats_total_allocated
> > +                     -= (s64)size;
> > +     mutex_unlock(&known_drmcgrp_devs[devIdx]->mutex);
> > +}
> > +EXPORT_SYMBOL(drmcgrp_unchg_bo_alloc);
>
Christian König May 10, 2019, 3:08 p.m. UTC | #3
Am 10.05.19 um 16:57 schrieb Kenny Ho:
> On Fri, May 10, 2019 at 8:28 AM Christian König
> <ckoenig.leichtzumerken@gmail.com> wrote:
>> Am 09.05.19 um 23:04 schrieb Kenny Ho:
>>> +     /* only allow bo from the same cgroup or its ancestor to be imported */
>>> +     if (drmcgrp != NULL &&
>>> +                     !drmcgrp_is_self_or_ancestor(drmcgrp, obj->drmcgrp)) {
>>> +             ret = -EACCES;
>>> +             goto out_unlock;
>>> +     }
>>> +
>> This will most likely go up in flames.
>>
>> If I'm not completely mistaken we already use
>> drm_gem_prime_fd_to_handle() to exchange handles between different
>> cgroups in current container usages.
> This is something that I am interested in getting more details from
> the broader community because the details affect how likely this will
> go up in flames ;).  Note that this check does not block sharing of
> handles from cgroup parent to children in the hierarchy, nor does it
> blocks sharing of handles within a cgroup.
>
> I am interested to find out, when existing apps share handles between
> containers, if there are any expectations on resource management.
> Since there are no drm cgroup for current container usage, I expect
> the answer to be no.  In this case, the drm cgroup controller can be
> disabled on its own (in the context of cgroup-v2's unified hierarchy),
> or the process can remain at the root for the drm cgroup hierarchy (in
> the context of cgroup-v1.)  If I understand the cgroup api correctly,
> that means all process would be part of the root cgroup as far as the
> drm controller is concerned and this block will not come into effect.
> I have verified that this is indeed the current default behaviour of a
> container runtime (runc, which is used by docker, podman and others.)
> The new drm cgroup controller is simply ignored and all processes
> remain at the root of the hierarchy (since there are no other
> cgroups.)  I plan to make contributions to runc (so folks can actually
> use this features with docker/podman/k8s, etc.) once things stabilized
> on the kernel side.

So the drm cgroup container is separate to other cgroup containers?

In other words as long as userspace doesn't change, this wouldn't have 
any effect?

Well that is unexpected because then a process would be in different
groups for different controllers, but if that's really the case that 
would certainly work.

> On the other hand, if there are expectations for resource management
> between containers, I would like to know who is the expected manager
> and how does it fit into the concept of container (which enforce some
> level of isolation.)  One possible manager may be the display server.
> But as long as the display server is in a parent cgroup of the apps'
> cgroup, the apps can still import handles from the display server
> under the current implementation.  My understanding is that this is
> most likely the case, with the display server simply sitting at the
> default/root cgroup.  But I certainly want to hear more about other
> use cases (for example, is running multiple display servers on a
> single host a realistic possibility?  Are there people running
> multiple display servers inside peer containers?  If so, how do they
> coordinate resources?)

We definitely have situations with multiple display servers running 
(just think of VR).

I just can't say if they currently use cgroups in any way.

Thanks,
Christian.

>
> I should probably summarize some of these into the commit message.
>
> Regards,
> Kenny
>
>
>
>> Christian.
>>
Kenny Ho May 10, 2019, 3:25 p.m. UTC | #4
On Fri, May 10, 2019 at 11:08 AM Koenig, Christian
<Christian.Koenig@amd.com> wrote:
> Am 10.05.19 um 16:57 schrieb Kenny Ho:
> > On Fri, May 10, 2019 at 8:28 AM Christian König
> > <ckoenig.leichtzumerken@gmail.com> wrote:
> >> Am 09.05.19 um 23:04 schrieb Kenny Ho:
> So the drm cgroup container is separate to other cgroup containers?
In cgroup-v1, which is most widely deployed currently, all controllers
have their own hierarchy (see /sys/fs/cgroup/).  In cgroup-v2, the
hierarchy is unified but individual controllers can be disabled (I
believe; I am not super familiar with v2.)
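
For example (just a sketch, assuming the controller ends up being
exposed under the name "drm" as proposed in this series), cgroup-v2
lets individual controllers be turned on or off per subtree:

# controllers compiled in and available on the system
cat /sys/fs/cgroup/cgroup.controllers
# enable (or disable) the drm controller for child cgroups
echo "+drm" > /sys/fs/cgroup/cgroup.subtree_control
echo "-drm" > /sys/fs/cgroup/cgroup.subtree_control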

> In other words as long as userspace doesn't change, this wouldn't have
> any effect?
As far as things like docker and podman are concerned, yes.  I am not
sure about the behaviour of others like lxc, lxd, etc. because I
haven't used those myself.

> Well that is unexpected cause then a processes would be in different
> groups for different controllers, but if that's really the case that
> would certainly work.
I believe this is a possibility for v1 and is why folks came up with
the unified hierarchy in v2 to solve some of the issues.
https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html#issues-with-v1-and-rationales-for-v2

Regards,
Kenny

> > On the other hand, if there are expectations for resource management
> > between containers, I would like to know who is the expected manager
> > and how does it fit into the concept of container (which enforce some
> > level of isolation.)  One possible manager may be the display server.
> > But as long as the display server is in a parent cgroup of the apps'
> > cgroup, the apps can still import handles from the display server
> > under the current implementation.  My understanding is that this is
> > most likely the case, with the display server simply sitting at the
> > default/root cgroup.  But I certainly want to hear more about other
> > use cases (for example, is running multiple display servers on a
> > single host a realistic possibility?  Are there people running
> > multiple display servers inside peer containers?  If so, how do they
> > coordinate resources?)
>
> We definitely have situations with multiple display servers running
> (just think of VR).
>
> I just can't say if they currently use cgroups in any way.
>
> Thanks,
> Christian.
>
> >
> > I should probably summarize some of these into the commit message.
> >
> > Regards,
> > Kenny
> >
> >
> >
> >> Christian.
> >>
>
Christian König May 10, 2019, 5:48 p.m. UTC | #5
Am 10.05.19 um 17:25 schrieb Kenny Ho:
> On Fri, May 10, 2019 at 11:08 AM Koenig, Christian
> <Christian.Koenig@amd.com> wrote:
>> Am 10.05.19 um 16:57 schrieb Kenny Ho:
>>> On Fri, May 10, 2019 at 8:28 AM Christian König
>>> <ckoenig.leichtzumerken@gmail.com> wrote:
>>>> Am 09.05.19 um 23:04 schrieb Kenny Ho:
>> So the drm cgroup container is separate to other cgroup containers?
> In cgroup-v1, which is most widely deployed currently, all controllers
> have their own hierarchy (see /sys/fs/cgroup/).  In cgroup-v2, the
> hierarchy is unified by individual controllers can be disabled (I
> believe, I am not super familiar with v2.)
>
>> In other words as long as userspace doesn't change, this wouldn't have
>> any effect?
> As far as things like docker and podman is concern, yes.  I am not
> sure about the behaviour of others like lxc, lxd, etc. because I
> haven't used those myself.
>
>> Well that is unexpected cause then a processes would be in different
>> groups for different controllers, but if that's really the case that
>> would certainly work.
> I believe this is a possibility for v1 and is why folks came up with
> the unified hierarchy in v2 to solve some of the issues.
> https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html#issues-with-v1-and-rationales-for-v2

Well another question is why do we want to prevent that in the first place?

I mean the worst thing that can happen is that we account a BO multiple 
times.

And going into the same direction where is the code to handle an open 
device file descriptor which is sent from one cgroup to another?

Regards,
Christian.

>
> Regards,
> Kenny
>
>>> On the other hand, if there are expectations for resource management
>>> between containers, I would like to know who is the expected manager
>>> and how does it fit into the concept of container (which enforce some
>>> level of isolation.)  One possible manager may be the display server.
>>> But as long as the display server is in a parent cgroup of the apps'
>>> cgroup, the apps can still import handles from the display server
>>> under the current implementation.  My understanding is that this is
>>> most likely the case, with the display server simply sitting at the
>>> default/root cgroup.  But I certainly want to hear more about other
>>> use cases (for example, is running multiple display servers on a
>>> single host a realistic possibility?  Are there people running
>>> multiple display servers inside peer containers?  If so, how do they
>>> coordinate resources?)
>> We definitely have situations with multiple display servers running
>> (just think of VR).
>>
>> I just can't say if they currently use cgroups in any way.
>>
>> Thanks,
>> Christian.
>>
>>> I should probably summarize some of these into the commit message.
>>>
>>> Regards,
>>> Kenny
>>>
>>>
>>>
>>>> Christian.
>>>>
Kenny Ho May 10, 2019, 6:50 p.m. UTC | #6
On Fri, May 10, 2019 at 1:48 PM Koenig, Christian
<Christian.Koenig@amd.com> wrote:
> Well another question is why do we want to prevent that in the first place?
>
> I mean the worst thing that can happen is that we account a BO multiple
> times.
That's one of the problems.  The other one is the BO outliving the
lifetime of a cgroup and there's no good way to un-charge the usage
when the BO is freed, so the count won't be accurate.

I have looked into two possible solutions.  One is to prevent a cgroup
from being removed while there are BOs owned by the cgroup still alive
(similar to how cgroup removal will fail if it still has processes
attached to it.)  My concern here is the possibility of not being able
to remove a cgroup forever due to the lifetime of a BO (continuously
being shared and reused and never dying.)  Perhaps you can shed some
light on this possibility.

The other one is to keep track of all the buffers and migrate them to
the parent if a cgroup is closed.  My concern here is the performance
overhead of tracking all the buffers.

> And going into the same direction where is the code to handle an open
> device file descriptor which is send from one cgroup to another?
I looked into this before but I forgot what I found.  Perhaps folks
familiar with device cgroup can chime in.

Actually, just did another quick search right now.  Looks like the
access is enforced at the inode level (__devcgroup_check_permission)
so the fd sent to another cgroup that does not have access to the
device should still not have access.

Regards,
Kenny


> Regards,
> Christian.
>
> >
> > Regards,
> > Kenny
> >
> >>> On the other hand, if there are expectations for resource management
> >>> between containers, I would like to know who is the expected manager
> >>> and how does it fit into the concept of container (which enforce some
> >>> level of isolation.)  One possible manager may be the display server.
> >>> But as long as the display server is in a parent cgroup of the apps'
> >>> cgroup, the apps can still import handles from the display server
> >>> under the current implementation.  My understanding is that this is
> >>> most likely the case, with the display server simply sitting at the
> >>> default/root cgroup.  But I certainly want to hear more about other
> >>> use cases (for example, is running multiple display servers on a
> >>> single host a realistic possibility?  Are there people running
> >>> multiple display servers inside peer containers?  If so, how do they
> >>> coordinate resources?)
> >> We definitely have situations with multiple display servers running
> >> (just think of VR).
> >>
> >> I just can't say if they currently use cgroups in any way.
> >>
> >> Thanks,
> >> Christian.
> >>
> >>> I should probably summarize some of these into the commit message.
> >>>
> >>> Regards,
> >>> Kenny
> >>>
> >>>
> >>>
> >>>> Christian.
> >>>>
>
Daniel Vetter May 13, 2019, 3:10 p.m. UTC | #7
On Fri, May 10, 2019 at 02:50:39PM -0400, Kenny Ho wrote:
> On Fri, May 10, 2019 at 1:48 PM Koenig, Christian
> <Christian.Koenig@amd.com> wrote:
> > Well another question is why do we want to prevent that in the first place?
> >
> > I mean the worst thing that can happen is that we account a BO multiple
> > times.
> That's one of the problems.  The other one is the BO outliving the
> lifetime of a cgroup and there's no good way to un-charge the usage
> when the BO is free so the count won't be accurate.
> 
> I have looked into two possible solutions.  One is to prevent cgroup
> from being removed when there are BOs owned by the cgroup still alive
> (similar to how cgroup removal will fail if it still has processes
> attached to it.)  My concern here is the possibility of not able to
> remove a cgroup forever due to the lifetime of a BO (continuously
> being shared and reuse and never die.)  Perhaps you can shed some
> light on this possibility.
> 
> The other one is to keep track of all the buffers and migrate them to
> the parent if a cgroup is closed.  My concern here is the performance
> overhead on tracking all the buffers.

My understanding is that other cgroups already use reference counting to
make sure the data structure in the kernel doesn't disappear too early. So
you can delete the cgroup, but it might not get freed completely until all
the BO allocated from that cgroup are released. There's a recent lwn
article on how that's not all that awesome for the memory cgroup
controller, and what to do about it:

https://lwn.net/Articles/787614/

We probably want to align with whatever the mem cgroup folks come up with
(so _not_ prevent deletion of the cgroup, since that's different
behaviour).

> > And going into the same direction where is the code to handle an open
> > device file descriptor which is send from one cgroup to another?
> I looked into this before but I forgot what I found.  Perhaps folks
> familiar with device cgroup can chime in.
> 
> Actually, just did another quick search right now.  Looks like the
> access is enforced at the inode level (__devcgroup_check_permission)
> so the fd sent to another cgroup that does not have access to the
> device should still not have access.

That's the device cgroup, not the memory accounting stuff.

Imo for memory allocations we should look at what happens when you pass a
tmpfs file around to another cgroup and then extend it there. I think
those allocations are charged against the cgroup which actually allocates
stuff.

So for drm, if you pass around a device fd, then we always charge ioctl
calls to create a BO against the process doing the ioctl call, not against
the process which originally opened the device fd. For e.g. DRI3 that's
actually the only reasonable thing to do, since otherwise we'd charge
everything against the Xserver.
-Daniel

> 
> Regards,
> Kenny
> 
> 
> > Regards,
> > Christian.
> >
> > >
> > > Regards,
> > > Kenny
> > >
> > >>> On the other hand, if there are expectations for resource management
> > >>> between containers, I would like to know who is the expected manager
> > >>> and how does it fit into the concept of container (which enforce some
> > >>> level of isolation.)  One possible manager may be the display server.
> > >>> But as long as the display server is in a parent cgroup of the apps'
> > >>> cgroup, the apps can still import handles from the display server
> > >>> under the current implementation.  My understanding is that this is
> > >>> most likely the case, with the display server simply sitting at the
> > >>> default/root cgroup.  But I certainly want to hear more about other
> > >>> use cases (for example, is running multiple display servers on a
> > >>> single host a realistic possibility?  Are there people running
> > >>> multiple display servers inside peer containers?  If so, how do they
> > >>> coordinate resources?)
> > >> We definitely have situations with multiple display servers running
> > >> (just think of VR).
> > >>
> > >> I just can't say if they currently use cgroups in any way.
> > >>
> > >> Thanks,
> > >> Christian.
> > >>
> > >>> I should probably summarize some of these into the commit message.
> > >>>
> > >>> Regards,
> > >>> Kenny
> > >>>
> > >>>
> > >>>
> > >>>> Christian.
> > >>>>
> >
Welty, Brian May 15, 2019, 9:26 p.m. UTC | #8
On 5/9/2019 2:04 PM, Kenny Ho wrote:
> The drm resource being measured and limited here is the GEM buffer
> objects.  User applications allocate and free these buffers.  In
> addition, a process can allocate a buffer and share it with another
> process.  The consumer of a shared buffer can also outlive the
> allocator of the buffer.
> 
> For the purpose of cgroup accounting and limiting, ownership of the
> buffer is deemed to be the cgroup for which the allocating process
> belongs to.  There is one limit per drm device.
> 
> In order to prevent the buffer outliving the cgroup that owns it, a
> process is prevented from importing buffers that are not own by the
> process' cgroup or the ancestors of the process' cgroup.
> 
> For this resource, the control files are prefixed with drm.buffer.total.

Overall, this framework looks very good.

But is this a useful resource to track?  See my question further below
in your drm_gem_private_object_init.


> 
> There are four control file types,
> stats (ro) - display current measured values for a resource
> max (rw) - limits for a resource
> default (ro, root cgroup only) - default values for a resource
> help (ro, root cgroup only) - help string for a resource
> 
> Each file is multi-lined with one entry/line per drm device.

Multi-line is correct for multiple devices, but I believe you need
to use a KEY to denote device for both your set and get routines.
I didn't see your set functions reading a key, or the get functions
printing the key in output.
cgroups-v2 conventions mention using KEY of major:minor, but I think
you can use drm_minor as key?
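
For illustration (not what the patch emits today), a keyed layout
following those conventions might look like the following, with 226
being the DRM char device major:

226:0 536870912
226:1 1073741824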

> 
> Usage examples:
> // set limit for card1 to 1GB
> sed -i '2s/.*/1073741824/' /sys/fs/cgroup/<cgroup>/drm.buffer.total.max
> 
> // set limit for card0 to 512MB
> sed -i '1s/.*/536870912/' /sys/fs/cgroup/<cgroup>/drm.buffer.total.max
> 
> Change-Id: I4c249d06d45ec709d6481d4cbe87c5168545c5d0
> Signed-off-by: Kenny Ho <Kenny.Ho@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_object.c |   4 +
>  drivers/gpu/drm/drm_gem.c                  |   7 +
>  drivers/gpu/drm/drm_prime.c                |   9 +
>  include/drm/drm_cgroup.h                   |  34 ++-
>  include/drm/drm_gem.h                      |  11 +
>  include/linux/cgroup_drm.h                 |   3 +
>  kernel/cgroup/drm.c                        | 280 +++++++++++++++++++++
>  7 files changed, 346 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> index 93b2c5a48a71..b4c078b7ad63 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> @@ -34,6 +34,7 @@
>  #include <drm/drmP.h>
>  #include <drm/amdgpu_drm.h>
>  #include <drm/drm_cache.h>
> +#include <drm/drm_cgroup.h>
>  #include "amdgpu.h"
>  #include "amdgpu_trace.h"
>  #include "amdgpu_amdkfd.h"
> @@ -446,6 +447,9 @@ static int amdgpu_bo_do_create(struct amdgpu_device *adev,
>  	if (!amdgpu_bo_validate_size(adev, size, bp->domain))
>  		return -ENOMEM;
>  
> +	if (!drmcgrp_bo_can_allocate(current, adev->ddev, size))
> +		return -ENOMEM;
> +
>  	*bo_ptr = NULL;
>  
>  	acc_size = ttm_bo_dma_acc_size(&adev->mman.bdev, size,
> diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c
> index 6a80db077dc6..cbd49bf34dcf 100644
> --- a/drivers/gpu/drm/drm_gem.c
> +++ b/drivers/gpu/drm/drm_gem.c
> @@ -37,10 +37,12 @@
>  #include <linux/shmem_fs.h>
>  #include <linux/dma-buf.h>
>  #include <linux/mem_encrypt.h>
> +#include <linux/cgroup_drm.h>
>  #include <drm/drmP.h>
>  #include <drm/drm_vma_manager.h>
>  #include <drm/drm_gem.h>
>  #include <drm/drm_print.h>
> +#include <drm/drm_cgroup.h>
>  #include "drm_internal.h"
>  
>  /** @file drm_gem.c
> @@ -154,6 +156,9 @@ void drm_gem_private_object_init(struct drm_device *dev,
>  	obj->handle_count = 0;
>  	obj->size = size;
>  	drm_vma_node_reset(&obj->vma_node);
> +
> +	obj->drmcgrp = get_drmcgrp(current);
> +	drmcgrp_chg_bo_alloc(obj->drmcgrp, dev, size);


Why do the charging here?
There is no backing store yet for the buffer, so this is really tracking something akin to allowed virtual memory for GEM objects?
Is this really useful for an administrator to control?
Isn't the resource we want to control actually the physical backing store? 


>  }
>  EXPORT_SYMBOL(drm_gem_private_object_init);
>  
> @@ -804,6 +809,8 @@ drm_gem_object_release(struct drm_gem_object *obj)
>  	if (obj->filp)
>  		fput(obj->filp);
>  
> +	drmcgrp_unchg_bo_alloc(obj->drmcgrp, obj->dev, obj->size);
> +
>  	drm_gem_free_mmap_offset(obj);
>  }
>  EXPORT_SYMBOL(drm_gem_object_release);
> diff --git a/drivers/gpu/drm/drm_prime.c b/drivers/gpu/drm/drm_prime.c
> index 231e3f6d5f41..faed5611a1c6 100644
> --- a/drivers/gpu/drm/drm_prime.c
> +++ b/drivers/gpu/drm/drm_prime.c
> @@ -32,6 +32,7 @@
>  #include <drm/drm_prime.h>
>  #include <drm/drm_gem.h>
>  #include <drm/drmP.h>
> +#include <drm/drm_cgroup.h>
>  
>  #include "drm_internal.h"
>  
> @@ -794,6 +795,7 @@ int drm_gem_prime_fd_to_handle(struct drm_device *dev,
>  {
>  	struct dma_buf *dma_buf;
>  	struct drm_gem_object *obj;
> +	struct drmcgrp *drmcgrp = get_drmcgrp(current);
>  	int ret;
>  
>  	dma_buf = dma_buf_get(prime_fd);
> @@ -818,6 +820,13 @@ int drm_gem_prime_fd_to_handle(struct drm_device *dev,
>  		goto out_unlock;
>  	}
>  
> +	/* only allow bo from the same cgroup or its ancestor to be imported */
> +	if (drmcgrp != NULL &&
> +			!drmcgrp_is_self_or_ancestor(drmcgrp, obj->drmcgrp)) {
> +		ret = -EACCES;
> +		goto out_unlock;
> +	}
> +
>  	if (obj->dma_buf) {
>  		WARN_ON(obj->dma_buf != dma_buf);
>  	} else {
> diff --git a/include/drm/drm_cgroup.h b/include/drm/drm_cgroup.h
> index ddb9eab64360..8711b7c5f7bf 100644
> --- a/include/drm/drm_cgroup.h
> +++ b/include/drm/drm_cgroup.h
> @@ -4,12 +4,20 @@
>  #ifndef __DRM_CGROUP_H__
>  #define __DRM_CGROUP_H__
>  
> +#include <linux/cgroup_drm.h>
> +
>  #ifdef CONFIG_CGROUP_DRM
>  
>  int drmcgrp_register_device(struct drm_device *device);
> -
>  int drmcgrp_unregister_device(struct drm_device *device);
> -
> +bool drmcgrp_is_self_or_ancestor(struct drmcgrp *self,
> +		struct drmcgrp *relative);
> +void drmcgrp_chg_bo_alloc(struct drmcgrp *drmcgrp, struct drm_device *dev,
> +		size_t size);
> +void drmcgrp_unchg_bo_alloc(struct drmcgrp *drmcgrp, struct drm_device *dev,
> +		size_t size);
> +bool drmcgrp_bo_can_allocate(struct task_struct *task, struct drm_device *dev,
> +		size_t size);
>  #else
>  static inline int drmcgrp_register_device(struct drm_device *device)
>  {
> @@ -20,5 +28,27 @@ static inline int drmcgrp_unregister_device(struct drm_device *device)
>  {
>  	return 0;
>  }
> +
> +static inline bool drmcgrp_is_self_or_ancestor(struct drmcgrp *self,
> +		struct drmcgrp *relative)
> +{
> +	return false;
> +}
> +
> +static inline void drmcgrp_chg_bo_alloc(struct drmcgrp *drmcgrp,
> +		struct drm_device *dev,	size_t size)
> +{
> +}
> +
> +static inline void drmcgrp_unchg_bo_alloc(struct drmcgrp *drmcgrp,
> +		struct drm_device *dev,	size_t size)
> +{
> +}
> +
> +static inline bool drmcgrp_bo_can_allocate(struct task_struct *task,
> +		struct drm_device *dev,	size_t size)
> +{
> +	return true;
> +}
>  #endif /* CONFIG_CGROUP_DRM */
>  #endif /* __DRM_CGROUP_H__ */
> diff --git a/include/drm/drm_gem.h b/include/drm/drm_gem.h
> index c95727425284..02854c674b5c 100644
> --- a/include/drm/drm_gem.h
> +++ b/include/drm/drm_gem.h
> @@ -272,6 +272,17 @@ struct drm_gem_object {
>  	 *
>  	 */
>  	const struct drm_gem_object_funcs *funcs;
> +
> +	/**
> +	 * @drmcgrp:
> +	 *
> +	 * DRM cgroup this GEM object belongs to.
> +         *
> +         * This is used to track and limit the amount of GEM objects a user
> +         * can allocate.  Since GEM objects can be shared, this is also used
> +         * to ensure GEM objects are only shared within the same cgroup.
> +	 */
> +	struct drmcgrp *drmcgrp;
>  };
>  
>  /**
> diff --git a/include/linux/cgroup_drm.h b/include/linux/cgroup_drm.h
> index d7ccf434ca6b..fe14ba7bb1cf 100644
> --- a/include/linux/cgroup_drm.h
> +++ b/include/linux/cgroup_drm.h
> @@ -15,6 +15,9 @@
>  
>  struct drmcgrp_device_resource {
>  	/* for per device stats */
> +	s64			bo_stats_total_allocated;
> +
> +	s64			bo_limits_total_allocated;
>  };
>  
>  struct drmcgrp {
> diff --git a/kernel/cgroup/drm.c b/kernel/cgroup/drm.c
> index f9ef4bf042d8..bc3abff09113 100644
> --- a/kernel/cgroup/drm.c
> +++ b/kernel/cgroup/drm.c
> @@ -15,6 +15,22 @@ static DEFINE_MUTEX(drmcgrp_mutex);
>  struct drmcgrp_device {
>  	struct drm_device	*dev;
>  	struct mutex		mutex;
> +
> +	s64			bo_limits_total_allocated_default;
> +};
> +
> +#define DRMCG_CTF_PRIV_SIZE 3
> +#define DRMCG_CTF_PRIV_MASK GENMASK((DRMCG_CTF_PRIV_SIZE - 1), 0)
> +
> +enum drmcgrp_res_type {
> +	DRMCGRP_TYPE_BO_TOTAL,
> +};
> +
> +enum drmcgrp_file_type {
> +	DRMCGRP_FTYPE_STATS,
> +	DRMCGRP_FTYPE_MAX,
> +	DRMCGRP_FTYPE_DEFAULT,
> +	DRMCGRP_FTYPE_HELP,
>  };
>  
>  /* indexed by drm_minor for access speed */
> @@ -53,6 +69,10 @@ static inline int init_drmcgrp_single(struct drmcgrp *drmcgrp, int i)
>  	}
>  
>  	/* set defaults here */
> +	if (known_drmcgrp_devs[i] != NULL) {
> +		ddr->bo_limits_total_allocated =
> +		  known_drmcgrp_devs[i]->bo_limits_total_allocated_default;
> +	}
>  
>  	return 0;
>  }
> @@ -99,7 +119,187 @@ drmcgrp_css_alloc(struct cgroup_subsys_state *parent_css)
>  	return &drmcgrp->css;
>  }
>  
> +static inline void drmcgrp_print_stats(struct drmcgrp_device_resource *ddr,
> +		struct seq_file *sf, enum drmcgrp_res_type type)
> +{
> +	if (ddr == NULL) {
> +		seq_puts(sf, "\n");
> +		return;
> +	}
> +
> +	switch (type) {
> +	case DRMCGRP_TYPE_BO_TOTAL:
> +		seq_printf(sf, "%lld\n", ddr->bo_stats_total_allocated);
> +		break;
> +	default:
> +		seq_puts(sf, "\n");
> +		break;
> +	}
> +}
> +
> +static inline void drmcgrp_print_limits(struct drmcgrp_device_resource *ddr,
> +		struct seq_file *sf, enum drmcgrp_res_type type)
> +{
> +	if (ddr == NULL) {
> +		seq_puts(sf, "\n");
> +		return;
> +	}
> +
> +	switch (type) {
> +	case DRMCGRP_TYPE_BO_TOTAL:
> +		seq_printf(sf, "%lld\n", ddr->bo_limits_total_allocated);
> +		break;
> +	default:
> +		seq_puts(sf, "\n");
> +		break;
> +	}
> +}
> +
> +static inline void drmcgrp_print_default(struct drmcgrp_device *ddev,
> +		struct seq_file *sf, enum drmcgrp_res_type type)
> +{
> +	if (ddev == NULL) {
> +		seq_puts(sf, "\n");
> +		return;
> +	}
> +
> +	switch (type) {
> +	case DRMCGRP_TYPE_BO_TOTAL:
> +		seq_printf(sf, "%lld\n", ddev->bo_limits_total_allocated_default);
> +		break;
> +	default:
> +		seq_puts(sf, "\n");
> +		break;
> +	}
> +}
> +
> +static inline void drmcgrp_print_help(int cardNum, struct seq_file *sf,
> +		enum drmcgrp_res_type type)
> +{
> +	switch (type) {
> +	case DRMCGRP_TYPE_BO_TOTAL:
> +		seq_printf(sf,
> +		"Total amount of buffer allocation in bytes for card%d\n",
> +		cardNum);
> +		break;
> +	default:
> +		seq_puts(sf, "\n");
> +		break;
> +	}
> +}
> +
> +int drmcgrp_bo_show(struct seq_file *sf, void *v)
> +{
> +	struct drmcgrp *drmcgrp = css_drmcgrp(seq_css(sf));
> +	struct drmcgrp_device_resource *ddr = NULL;
> +	enum drmcgrp_file_type f_type = seq_cft(sf)->
> +		private & DRMCG_CTF_PRIV_MASK;
> +	enum drmcgrp_res_type type = seq_cft(sf)->
> +		private >> DRMCG_CTF_PRIV_SIZE;
> +	struct drmcgrp_device *ddev;
> +	int i;
> +
> +	for (i = 0; i <= max_minor; i++) {
> +		ddr = drmcgrp->dev_resources[i];
> +		ddev = known_drmcgrp_devs[i];
> +
> +		switch (f_type) {
> +		case DRMCGRP_FTYPE_STATS:
> +			drmcgrp_print_stats(ddr, sf, type);
> +			break;
> +		case DRMCGRP_FTYPE_MAX:
> +			drmcgrp_print_limits(ddr, sf, type);
> +			break;
> +		case DRMCGRP_FTYPE_DEFAULT:
> +			drmcgrp_print_default(ddev, sf, type);
> +			break;
> +		case DRMCGRP_FTYPE_HELP:
> +			drmcgrp_print_help(i, sf, type);
> +			break;
> +		default:
> +			seq_puts(sf, "\n");
> +			break;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
> +ssize_t drmcgrp_bo_limit_write(struct kernfs_open_file *of, char *buf,
> +		size_t nbytes, loff_t off)
> +{
> +	struct drmcgrp *drmcgrp = css_drmcgrp(of_css(of));
> +	enum drmcgrp_res_type type = of_cft(of)->private >> DRMCG_CTF_PRIV_SIZE;
> +	char *cft_name = of_cft(of)->name;
> +	char *limits = strstrip(buf);
> +	struct drmcgrp_device_resource *ddr;
> +	char *sval;
> +	s64 val;
> +	int i = 0;
> +	int rc;
> +
> +	while (i <= max_minor && limits != NULL) {
> +		sval =  strsep(&limits, "\n");
> +		rc = kstrtoll(sval, 0, &val);

Input should be "KEY VALUE", so KEY will determine the device to apply this to.
Also, per the cgroups-v2 documentation of limits, I believe you need to parse and handle the special "max" input value.

parse_resources() in rdma controller is example for both of above.
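
A rough sketch of such parsing, reusing the names from this patch and
treating "max" as S64_MAX (an illustrative fragment of the write handler,
not the actual rdma code; the KEY here is assumed to be the drm minor):

	while ((sval = strsep(&limits, "\n")) != NULL) {
		int minor;
		char valstr[21];
		s64 val;

		/* each line is "KEY VALUE" */
		if (sscanf(sval, "%d %20s", &minor, valstr) != 2 ||
				minor < 0 || minor > max_minor)
			return -EINVAL;

		if (!strcmp(valstr, "max"))
			val = S64_MAX;
		else if (kstrtoll(valstr, 0, &val) || val < 0)
			return -EINVAL;

		drmcgrp->dev_resources[minor]->bo_limits_total_allocated = val;
	}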

> +		if (rc) {
> +			pr_err("drmcgrp: %s: minor %d, err %d. ",
> +				cft_name, i, rc);
> +			pr_cont_cgroup_name(drmcgrp->css.cgroup);
> +			pr_cont("\n");
> +		} else {
> +			ddr = drmcgrp->dev_resources[i];
> +			switch (type) {
> +			case DRMCGRP_TYPE_BO_TOTAL:
> +                                if (val < 0) continue;
> +				ddr->bo_limits_total_allocated = val;
> +				break;
> +			default:
> +				break;
> +			}
> +		}
> +
> +		i++;
> +	}
> +
> +	if (i <= max_minor) {
> +		pr_err("drmcgrp: %s: less entries than # of drm devices. ",
> +				cft_name);
> +		pr_cont_cgroup_name(drmcgrp->css.cgroup);
> +		pr_cont("\n");
> +	}
> +
> +	return nbytes;
> +}
> +
>  struct cftype files[] = {
> +	{
> +		.name = "buffer.total.stats",
> +		.seq_show = drmcgrp_bo_show,
> +		.private = (DRMCGRP_TYPE_BO_TOTAL << DRMCG_CTF_PRIV_SIZE) |
> +			DRMCGRP_FTYPE_STATS,
> +	},
> +	{
> +		.name = "buffer.total.default",
> +		.seq_show = drmcgrp_bo_show,
> +		.flags = CFTYPE_ONLY_ON_ROOT,
> +		.private = (DRMCGRP_TYPE_BO_TOTAL << DRMCG_CTF_PRIV_SIZE) |
> +			DRMCGRP_FTYPE_DEFAULT,
> +	},
> +	{
> +		.name = "buffer.total.help",
> +		.seq_show = drmcgrp_bo_show,
> +		.flags = CFTYPE_ONLY_ON_ROOT,
> +		.private = (DRMCGRP_TYPE_BO_TOTAL << DRMCG_CTF_PRIV_SIZE) |
> +			DRMCGRP_FTYPE_HELP,
> +	},
> +	{
> +		.name = "buffer.total.max",
> +		.write = drmcgrp_bo_limit_write,
> +		.seq_show = drmcgrp_bo_show,
> +		.private = (DRMCGRP_TYPE_BO_TOTAL << DRMCG_CTF_PRIV_SIZE) |
> +			DRMCGRP_FTYPE_MAX,
> +	},
>  	{ }	/* terminate */
>  };
>  
> @@ -122,6 +322,8 @@ int drmcgrp_register_device(struct drm_device *dev)
>  		return -ENOMEM;
>  
>  	ddev->dev = dev;
> +	ddev->bo_limits_total_allocated_default = S64_MAX;
> +
>  	mutex_init(&ddev->mutex);
>  
>  	mutex_lock(&drmcgrp_mutex);
> @@ -156,3 +358,81 @@ int drmcgrp_unregister_device(struct drm_device *dev)
>  	return 0;
>  }
>  EXPORT_SYMBOL(drmcgrp_unregister_device);
> +
> +bool drmcgrp_is_self_or_ancestor(struct drmcgrp *self, struct drmcgrp *relative)
> +{
> +	for (; self != NULL; self = parent_drmcgrp(self))
> +		if (self == relative)
> +			return true;
> +
> +	return false;
> +}
> +EXPORT_SYMBOL(drmcgrp_is_self_or_ancestor);
> +
> +bool drmcgrp_bo_can_allocate(struct task_struct *task, struct drm_device *dev,
> +		size_t size)
> +{
> +	struct drmcgrp *drmcgrp = get_drmcgrp(task);
> +	struct drmcgrp_device_resource *ddr;
> +	struct drmcgrp_device_resource *d;
> +	int devIdx = dev->primary->index;
> +	bool result = true;
> +	s64 delta = 0;
> +
> +	if (drmcgrp == NULL || drmcgrp == root_drmcgrp)
> +		return true;
> +
> +	ddr = drmcgrp->dev_resources[devIdx];
> +	mutex_lock(&known_drmcgrp_devs[devIdx]->mutex);
> +	for ( ; drmcgrp != root_drmcgrp; drmcgrp = parent_drmcgrp(drmcgrp)) {
> +		d = drmcgrp->dev_resources[devIdx];
> +		delta = d->bo_limits_total_allocated -
> +				d->bo_stats_total_allocated;
> +
> +		if (delta <= 0 || size > delta) {
> +			result = false;
> +			break;
> +		}
> +	}
> +	mutex_unlock(&known_drmcgrp_devs[devIdx]->mutex);
> +
> +	return result;
> +}
> +EXPORT_SYMBOL(drmcgrp_bo_can_allocate);
> +
> +void drmcgrp_chg_bo_alloc(struct drmcgrp *drmcgrp, struct drm_device *dev,
> +		size_t size)

Shouldn't this return an error and be implemented with same semantics as the
try_charge() functions of other controllers?
Below will allow stats_total_allocated to overrun limits_total_allocated.
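
A rough sketch of a combined try_charge variant, reusing the names and
locking from this patch (illustrative only, untested):

	static int drmcgrp_try_chg_bo_alloc(struct drmcgrp *drmcgrp,
			struct drm_device *dev, size_t size)
	{
		struct drmcgrp_device_resource *ddr;
		struct drmcgrp *p;
		int devIdx = dev->primary->index;
		int ret = 0;

		if (drmcgrp == NULL || known_drmcgrp_devs[devIdx] == NULL)
			return 0;

		mutex_lock(&known_drmcgrp_devs[devIdx]->mutex);

		/* first pass: check the limit at every level of the hierarchy */
		for (p = drmcgrp; p != NULL; p = parent_drmcgrp(p)) {
			ddr = p->dev_resources[devIdx];
			if (ddr->bo_stats_total_allocated + (s64)size >
					ddr->bo_limits_total_allocated) {
				ret = -ENOMEM;
				goto out;
			}
		}

		/* second pass: charge every level once the whole chain fits */
		for (p = drmcgrp; p != NULL; p = parent_drmcgrp(p))
			p->dev_resources[devIdx]->bo_stats_total_allocated +=
				(s64)size;
	out:
		mutex_unlock(&known_drmcgrp_devs[devIdx]->mutex);
		return ret;
	}

The callers would then fail the GEM allocation on a non-zero return instead
of calling the check and the charge separately.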


> +{
> +	struct drmcgrp_device_resource *ddr;
> +	int devIdx = dev->primary->index;
> +
> +	if (drmcgrp == NULL || known_drmcgrp_devs[devIdx] == NULL)
> +		return;
> +
> +	mutex_lock(&known_drmcgrp_devs[devIdx]->mutex);
> +	for ( ; drmcgrp != NULL; drmcgrp = parent_drmcgrp(drmcgrp)) {
> +		ddr = drmcgrp->dev_resources[devIdx];
> +
> +		ddr->bo_stats_total_allocated += (s64)size;
> +	}
> +	mutex_unlock(&known_drmcgrp_devs[devIdx]->mutex);
> +}
> +EXPORT_SYMBOL(drmcgrp_chg_bo_alloc);
> +
> +void drmcgrp_unchg_bo_alloc(struct drmcgrp *drmcgrp, struct drm_device *dev,
> +		size_t size)
> +{
> +	struct drmcgrp_device_resource *ddr;
> +	int devIdx = dev->primary->index;
> +
> +	if (drmcgrp == NULL || known_drmcgrp_devs[devIdx] == NULL)
> +		return;
> +
> +	ddr = drmcgrp->dev_resources[devIdx];
> +	mutex_lock(&known_drmcgrp_devs[devIdx]->mutex);
> +	for ( ; drmcgrp != NULL; drmcgrp = parent_drmcgrp(drmcgrp))
> +		drmcgrp->dev_resources[devIdx]->bo_stats_total_allocated
> +			-= (s64)size;
> +	mutex_unlock(&known_drmcgrp_devs[devIdx]->mutex);
> +}
> +EXPORT_SYMBOL(drmcgrp_unchg_bo_alloc);
>
Kenny Ho May 16, 2019, 2:29 a.m. UTC | #9
On Wed, May 15, 2019 at 5:26 PM Welty, Brian <brian.welty@intel.com> wrote:
> On 5/9/2019 2:04 PM, Kenny Ho wrote:
> > There are four control file types,
> > stats (ro) - display current measured values for a resource
> > max (rw) - limits for a resource
> > default (ro, root cgroup only) - default values for a resource
> > help (ro, root cgroup only) - help string for a resource
> >
> > Each file is multi-lined with one entry/line per drm device.
>
> Multi-line is correct for multiple devices, but I believe you need
> to use a KEY to denote device for both your set and get routines.
> I didn't see your set functions reading a key, or the get functions
> printing the key in output.
> cgroups-v2 conventions mention using KEY of major:minor, but I think
> you can use drm_minor as key?
Given this controller is specific to the drm kernel subsystem, which
uses the minor to identify a drm device, I don't see a need to complicate
the interfaces more by having a major and a key.  As you can see in the
examples below, the drm device minor corresponds to the line number.
I am not sure how strict cgroup upstream is about the convention, but I
am hoping there is flexibility here to allow for what I have
implemented.  There are a couple of other things I have done that are
not described in the convention: 1) inclusion of a read-only *.help file
at the root cgroup, 2) use of a read-only (which I can potentially make rw)
*.default file instead of having default entries (since the default
can be different for different devices) inside the control files (this
way, the resetting of cgroup values for all the drm devices can be
done by a simple 'cp').

> > Usage examples:
> > // set limit for card1 to 1GB
> > sed -i '2s/.*/1073741824/' /sys/fs/cgroup/<cgroup>/drm.buffer.total.max
> >
> > // set limit for card0 to 512MB
> > sed -i '1s/.*/536870912/' /sys/fs/cgroup/<cgroup>/drm.buffer.total.max


> >  /** @file drm_gem.c
> > @@ -154,6 +156,9 @@ void drm_gem_private_object_init(struct drm_device *dev,
> >       obj->handle_count = 0;
> >       obj->size = size;
> >       drm_vma_node_reset(&obj->vma_node);
> > +
> > +     obj->drmcgrp = get_drmcgrp(current);
> > +     drmcgrp_chg_bo_alloc(obj->drmcgrp, dev, size);
>
> Why do the charging here?
> There is no backing store yet for the buffer, so this is really tracking something akin to allowed virtual memory for GEM objects?
> Is this really useful for an administrator to control?
> Isn't the resource we want to control actually the physical backing store?
That's correct.  This is just the first level of control since the
backing store can be backed by different types of memory.  I am in the
process of adding at least two more resources.  Stay tuned.  I am
doing the charge here to enforce the idea of "creator is deemed owner"
at a place where the code is shared by all (the init function).

> > +     while (i <= max_minor && limits != NULL) {
> > +             sval =  strsep(&limits, "\n");
> > +             rc = kstrtoll(sval, 0, &val);
>
> Input should be "KEY VALUE", so KEY will determine device to apply this to.
> Also, per cgroups-v2 documentation of limits, I believe need to parse and handle the special "max" input value.
>
> parse_resources() in rdma controller is example for both of above.
Please see my previous reply for the rationale of my hope to not need
a key.  I can certainly add handling of "max" and "default".


> > +void drmcgrp_chg_bo_alloc(struct drmcgrp *drmcgrp, struct drm_device *dev,
> > +             size_t size)
>
> Shouldn't this return an error and be implemented with same semantics as the
> try_charge() functions of other controllers?
> Below will allow stats_total_allocated to overrun limits_total_allocated.
This is because I am charging the buffer at the init of the buffer,
which does not fail, so the "try" (drmcgrp_bo_can_allocate) is separate
and placed earlier, nearer the other conditions where gem object
allocation may fail.  In other words, there are multiple possibilities
for which gem allocation may fail (the cgroup limit being one of them), and
satisfying the cgroup limit does not mean a charge is needed.  I can
certainly combine the two functions to have an additional try_charge
semantic as well if that is really needed.

Regards,
Kenny
Christian König May 16, 2019, 7:16 a.m. UTC | #10
Am 16.05.19 um 04:29 schrieb Kenny Ho:
> On Wed, May 15, 2019 at 5:26 PM Welty, Brian <brian.welty@intel.com> wrote:
>> On 5/9/2019 2:04 PM, Kenny Ho wrote:
>>> There are four control file types,
>>> stats (ro) - display current measured values for a resource
>>> max (rw) - limits for a resource
>>> default (ro, root cgroup only) - default values for a resource
>>> help (ro, root cgroup only) - help string for a resource
>>>
>>> Each file is multi-lined with one entry/line per drm device.
>> Multi-line is correct for multiple devices, but I believe you need
>> to use a KEY to denote device for both your set and get routines.
>> I didn't see your set functions reading a key, or the get functions
>> printing the key in output.
>> cgroups-v2 conventions mention using KEY of major:minor, but I think
>> you can use drm_minor as key?
> Given this controller is specific to the drm kernel subsystem which
> uses minor to identify drm device,

Wait a second, using the DRM minor is a good idea in the first place.

I have a test system with a Vega10 and a Vega20. Which device gets which 
minor is not stable, but rather defined by the scan order of the PCIe bus.

Normally the scan order is always the same, but adding or removing 
devices or delaying things just a little bit during init is enough to 
change this.

We need something like the Linux sysfs location or similar to have a 
stable implementation.

Regards,
Christian.

>   I don't see a need to complicate
> the interfaces more by having major and a key.  As you can see in the
> examples below, the drm device minor corresponds to the line number.
> I am not sure how strict cgroup upstream is about the convention but I
> am hoping there are flexibility here to allow for what I have
> implemented.  There are a couple of other things I have done that is
> not described in the convention: 1) inclusion of read-only *.help file
> at the root cgroup, 2) use read-only (which I can potentially make rw)
> *.default file instead of having a default entries (since the default
> can be different for different devices) inside the control files (this
> way, the resetting of cgroup values for all the drm devices, can be
> done by a simple 'cp'.)
>
>>> Usage examples:
>>> // set limit for card1 to 1GB
>>> sed -i '2s/.*/1073741824/' /sys/fs/cgroup/<cgroup>/drm.buffer.total.max
>>>
>>> // set limit for card0 to 512MB
>>> sed -i '1s/.*/536870912/' /sys/fs/cgroup/<cgroup>/drm.buffer.total.max
>
>>>   /** @file drm_gem.c
>>> @@ -154,6 +156,9 @@ void drm_gem_private_object_init(struct drm_device *dev,
>>>        obj->handle_count = 0;
>>>        obj->size = size;
>>>        drm_vma_node_reset(&obj->vma_node);
>>> +
>>> +     obj->drmcgrp = get_drmcgrp(current);
>>> +     drmcgrp_chg_bo_alloc(obj->drmcgrp, dev, size);
>> Why do the charging here?
>> There is no backing store yet for the buffer, so this is really tracking something akin to allowed virtual memory for GEM objects?
>> Is this really useful for an administrator to control?
>> Isn't the resource we want to control actually the physical backing store?
> That's correct.  This is just the first level of control since the
> backing store can be backed by different type of memory.  I am in the
> process of adding at least two more resources.  Stay tuned.  I am
> doing the charge here to enforce the idea of "creator is deemed owner"
> at a place where the code is shared by all (the init function.)
>
>>> +     while (i <= max_minor && limits != NULL) {
>>> +             sval =  strsep(&limits, "\n");
>>> +             rc = kstrtoll(sval, 0, &val);
>> Input should be "KEY VALUE", so KEY will determine device to apply this to.
>> Also, per cgroups-v2 documentation of limits, I believe need to parse and handle the special "max" input value.
>>
>> parse_resources() in rdma controller is example for both of above.
> Please see my previous reply for the rationale of my hope to not need
> a key.  I can certainly add handling of "max" and "default".
>
>
>>> +void drmcgrp_chg_bo_alloc(struct drmcgrp *drmcgrp, struct drm_device *dev,
>>> +             size_t size)
>> Shouldn't this return an error and be implemented with same semantics as the
>> try_charge() functions of other controllers?
>> Below will allow stats_total_allocated to overrun limits_total_allocated.
> This is because I am charging the buffer at the init of the buffer
> which does not fail so the "try" (drmcgrp_bo_can_allocate) is separate
> and placed earlier and nearer other condition where gem object
> allocation may fail.  In other words, there are multiple possibilities
> for which gem allocation may fail (cgroup limit being one of them) and
> satisfying cgroup limit does not mean a charge is needed.  I can
> certainly combine the two functions to have an additional try_charge
> semantic as well if that is really needed.
>
> Regards,
> Kenny
Christian König May 16, 2019, 7:25 a.m. UTC | #11
Am 16.05.19 um 09:16 schrieb Koenig, Christian:
> Am 16.05.19 um 04:29 schrieb Kenny Ho:
>> On Wed, May 15, 2019 at 5:26 PM Welty, Brian <brian.welty@intel.com> wrote:
>>> On 5/9/2019 2:04 PM, Kenny Ho wrote:
>>>> There are four control file types,
>>>> stats (ro) - display current measured values for a resource
>>>> max (rw) - limits for a resource
>>>> default (ro, root cgroup only) - default values for a resource
>>>> help (ro, root cgroup only) - help string for a resource
>>>>
>>>> Each file is multi-lined with one entry/line per drm device.
>>> Multi-line is correct for multiple devices, but I believe you need
>>> to use a KEY to denote device for both your set and get routines.
>>> I didn't see your set functions reading a key, or the get functions
>>> printing the key in output.
>>> cgroups-v2 conventions mention using KEY of major:minor, but I think
>>> you can use drm_minor as key?
>> Given this controller is specific to the drm kernel subsystem which
>> uses minor to identify drm device,
> Wait a second, using the DRM minor is a good idea in the first place.

Well that should have read "is not a good idea"..

Christian.

>
> I have a test system with a Vega10 and a Vega20. Which device gets which
> minor is not stable, but rather defined by the scan order of the PCIe bus.
>
> Normally the scan order is always the same, but adding or removing
> devices or delaying things just a little bit during init is enough to
> change this.
>
> We need something like the Linux sysfs location or similar to have a
> stable implementation.
>
> Regards,
> Christian.
>
>>    I don't see a need to complicate
>> the interfaces more by having major and a key.  As you can see in the
>> examples below, the drm device minor corresponds to the line number.
>> I am not sure how strict cgroup upstream is about the convention but I
>> am hoping there are flexibility here to allow for what I have
>> implemented.  There are a couple of other things I have done that is
>> not described in the convention: 1) inclusion of read-only *.help file
>> at the root cgroup, 2) use read-only (which I can potentially make rw)
>> *.default file instead of having a default entries (since the default
>> can be different for different devices) inside the control files (this
>> way, the resetting of cgroup values for all the drm devices, can be
>> done by a simple 'cp'.)
>>
>>>> Usage examples:
>>>> // set limit for card1 to 1GB
>>>> sed -i '2s/.*/1073741824/' /sys/fs/cgroup/<cgroup>/drm.buffer.total.max
>>>>
>>>> // set limit for card0 to 512MB
>>>> sed -i '1s/.*/536870912/' /sys/fs/cgroup/<cgroup>/drm.buffer.total.max
>>>>    /** @file drm_gem.c
>>>> @@ -154,6 +156,9 @@ void drm_gem_private_object_init(struct drm_device *dev,
>>>>         obj->handle_count = 0;
>>>>         obj->size = size;
>>>>         drm_vma_node_reset(&obj->vma_node);
>>>> +
>>>> +     obj->drmcgrp = get_drmcgrp(current);
>>>> +     drmcgrp_chg_bo_alloc(obj->drmcgrp, dev, size);
>>> Why do the charging here?
>>> There is no backing store yet for the buffer, so this is really tracking something akin to allowed virtual memory for GEM objects?
>>> Is this really useful for an administrator to control?
>>> Isn't the resource we want to control actually the physical backing store?
>> That's correct.  This is just the first level of control since the
>> backing store can be backed by different type of memory.  I am in the
>> process of adding at least two more resources.  Stay tuned.  I am
>> doing the charge here to enforce the idea of "creator is deemed owner"
>> at a place where the code is shared by all (the init function.)
>>
>>>> +     while (i <= max_minor && limits != NULL) {
>>>> +             sval =  strsep(&limits, "\n");
>>>> +             rc = kstrtoll(sval, 0, &val);
>>> Input should be "KEY VALUE", so KEY will determine device to apply this to.
>>> Also, per cgroups-v2 documentation of limits, I believe need to parse and handle the special "max" input value.
>>>
>>> parse_resources() in rdma controller is example for both of above.
>> Please see my previous reply for the rationale of my hope to not need
>> a key.  I can certainly add handling of "max" and "default".
>>
>>
>>>> +void drmcgrp_chg_bo_alloc(struct drmcgrp *drmcgrp, struct drm_device *dev,
>>>> +             size_t size)
>>> Shouldn't this return an error and be implemented with same semantics as the
>>> try_charge() functions of other controllers?
>>> Below will allow stats_total_allocated to overrun limits_total_allocated.
>> This is because I am charging the buffer at the init of the buffer
>> which does not fail so the "try" (drmcgrp_bo_can_allocate) is separate
>> and placed earlier and nearer other condition where gem object
>> allocation may fail.  In other words, there are multiple possibilities
>> for which gem allocation may fail (cgroup limit being one of them) and
>> satisfying cgroup limit does not mean a charge is needed.  I can
>> certainly combine the two functions to have an additional try_charge
>> semantic as well if that is really needed.
>>
>> Regards,
>> Kenny
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Daniel Vetter May 16, 2019, 12:28 p.m. UTC | #12
On Thu, May 16, 2019 at 09:25:31AM +0200, Christian König wrote:
> Am 16.05.19 um 09:16 schrieb Koenig, Christian:
> > Am 16.05.19 um 04:29 schrieb Kenny Ho:
> > > On Wed, May 15, 2019 at 5:26 PM Welty, Brian <brian.welty@intel.com> wrote:
> > > > On 5/9/2019 2:04 PM, Kenny Ho wrote:
> > > > > There are four control file types,
> > > > > stats (ro) - display current measured values for a resource
> > > > > max (rw) - limits for a resource
> > > > > default (ro, root cgroup only) - default values for a resource
> > > > > help (ro, root cgroup only) - help string for a resource
> > > > > 
> > > > > Each file is multi-lined with one entry/line per drm device.
> > > > Multi-line is correct for multiple devices, but I believe you need
> > > > to use a KEY to denote device for both your set and get routines.
> > > > I didn't see your set functions reading a key, or the get functions
> > > > printing the key in output.
> > > > cgroups-v2 conventions mention using KEY of major:minor, but I think
> > > > you can use drm_minor as key?
> > > Given this controller is specific to the drm kernel subsystem which
> > > uses minor to identify drm device,
> > Wait a second, using the DRM minor is a good idea in the first place.
> 
> Well that should have read "is not a good idea"..

What else should we use?
> 
> Christian.
> 
> > 
> > I have a test system with a Vega10 and a Vega20. Which device gets which
> > minor is not stable, but rather defined by the scan order of the PCIe bus.
> > 
> > Normally the scan order is always the same, but adding or removing
> > devices or delaying things just a little bit during init is enough to
> > change this.
> > 
> > We need something like the Linux sysfs location or similar to have a
> > stable implementation.

You can go from sysfs location to drm class directory (in sysfs) and back.
That means if you care you need to walk sysfs yourself a bit, but using
the drm minor isn't a blocker itself.
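
For instance, a small userspace sketch of that walk, mapping a card's
primary minor to the stable bus location of its parent device (the helper
name is made up; it assumes the usual /sys/class/drm layout):

	#include <limits.h>
	#include <stdio.h>
	#include <unistd.h>

	/* resolve a DRM primary minor to its parent device's bus id */
	static int drm_minor_to_bus_id(int minor, char *buf, size_t len)
	{
		char path[PATH_MAX];
		ssize_t n;

		/* /sys/class/drm/cardN/device is a symlink to the parent device */
		snprintf(path, sizeof(path), "/sys/class/drm/card%d/device", minor);
		n = readlink(path, buf, len - 1);
		if (n < 0)
			return -1;
		buf[n] = '\0';	/* e.g. "../../../0000:03:00.0" for a PCI GPU */
		return 0;
	}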

One downside with the drm minor is that it's pretty much nonsense once you
have more than 64 gpus though, due to how we space render and legacy nodes
in the minor ids :-)
-Daniel
> > 
> > Regards,
> > Christian.
> > 
> > >    I don't see a need to complicate
> > > the interfaces more by having major and a key.  As you can see in the
> > > examples below, the drm device minor corresponds to the line number.
> > > I am not sure how strict cgroup upstream is about the convention but I
> > > am hoping there are flexibility here to allow for what I have
> > > implemented.  There are a couple of other things I have done that is
> > > not described in the convention: 1) inclusion of read-only *.help file
> > > at the root cgroup, 2) use read-only (which I can potentially make rw)
> > > *.default file instead of having a default entries (since the default
> > > can be different for different devices) inside the control files (this
> > > way, the resetting of cgroup values for all the drm devices, can be
> > > done by a simple 'cp'.)
> > > 
> > > > > Usage examples:
> > > > > // set limit for card1 to 1GB
> > > > > sed -i '2s/.*/1073741824/' /sys/fs/cgroup/<cgroup>/drm.buffer.total.max
> > > > > 
> > > > > // set limit for card0 to 512MB
> > > > > sed -i '1s/.*/536870912/' /sys/fs/cgroup/<cgroup>/drm.buffer.total.max
> > > > >    /** @file drm_gem.c
> > > > > @@ -154,6 +156,9 @@ void drm_gem_private_object_init(struct drm_device *dev,
> > > > >         obj->handle_count = 0;
> > > > >         obj->size = size;
> > > > >         drm_vma_node_reset(&obj->vma_node);
> > > > > +
> > > > > +     obj->drmcgrp = get_drmcgrp(current);
> > > > > +     drmcgrp_chg_bo_alloc(obj->drmcgrp, dev, size);
> > > > Why do the charging here?
> > > > There is no backing store yet for the buffer, so this is really tracking something akin to allowed virtual memory for GEM objects?
> > > > Is this really useful for an administrator to control?
> > > > Isn't the resource we want to control actually the physical backing store?
> > > That's correct.  This is just the first level of control since the
> > > backing store can be backed by different type of memory.  I am in the
> > > process of adding at least two more resources.  Stay tuned.  I am
> > > doing the charge here to enforce the idea of "creator is deemed owner"
> > > at a place where the code is shared by all (the init function.)
> > > 
> > > > > +     while (i <= max_minor && limits != NULL) {
> > > > > +             sval =  strsep(&limits, "\n");
> > > > > +             rc = kstrtoll(sval, 0, &val);
> > > > Input should be "KEY VALUE", so KEY will determine device to apply this to.
> > > > Also, per cgroups-v2 documentation of limits, I believe need to parse and handle the special "max" input value.
> > > > 
> > > > parse_resources() in rdma controller is example for both of above.
> > > Please see my previous reply for the rationale of my hope to not need
> > > a key.  I can certainly add handling of "max" and "default".
> > > 
> > > 
> > > > > +void drmcgrp_chg_bo_alloc(struct drmcgrp *drmcgrp, struct drm_device *dev,
> > > > > +             size_t size)
> > > > Shouldn't this return an error and be implemented with same semantics as the
> > > > try_charge() functions of other controllers?
> > > > Below will allow stats_total_allocated to overrun limits_total_allocated.
> > > This is because I am charging the buffer at the init of the buffer
> > > which does not fail so the "try" (drmcgrp_bo_can_allocate) is separate
> > > and placed earlier and nearer other condition where gem object
> > > allocation may fail.  In other words, there are multiple possibilities
> > > for which gem allocation may fail (cgroup limit being one of them) and
> > > satisfying cgroup limit does not mean a charge is needed.  I can
> > > certainly combine the two functions to have an additional try_charge
> > > semantic as well if that is really needed.
> > > 
> > > Regards,
> > > Kenny
> > _______________________________________________
> > amd-gfx mailing list
> > amd-gfx@lists.freedesktop.org
> > https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> 
> _______________________________________________
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/dri-devel
Kenny Ho May 16, 2019, 2:03 p.m. UTC | #13
On Thu, May 16, 2019 at 3:25 AM Christian König
<ckoenig.leichtzumerken@gmail.com> wrote:
> Am 16.05.19 um 09:16 schrieb Koenig, Christian:
> > Am 16.05.19 um 04:29 schrieb Kenny Ho:
> >> On Wed, May 15, 2019 at 5:26 PM Welty, Brian <brian.welty@intel.com> wrote:
> >>> On 5/9/2019 2:04 PM, Kenny Ho wrote:
> >>>> Each file is multi-lined with one entry/line per drm device.
> >>> Multi-line is correct for multiple devices, but I believe you need
> >>> to use a KEY to denote device for both your set and get routines.
> >>> I didn't see your set functions reading a key, or the get functions
> >>> printing the key in output.
> >>> cgroups-v2 conventions mention using KEY of major:minor, but I think
> >>> you can use drm_minor as key?
> >> Given this controller is specific to the drm kernel subsystem which
> >> uses minor to identify drm device,
> > Wait a second, using the DRM minor is a good idea in the first place.
> Well that should have read "is not a good idea"..
>
> I have a test system with a Vega10 and a Vega20. Which device gets which
> minor is not stable, but rather defined by the scan order of the PCIe bus.
>
> Normally the scan order is always the same, but adding or removing
> devices or delaying things just a little bit during init is enough to
> change this.
>
> We need something like the Linux sysfs location or similar to have a
> stable implementation.

I get that, which is why I don't use minor to identify cards in user
space apps I wrote:
https://github.com/RadeonOpenCompute/k8s-device-plugin/blob/c2659c9d1d0713cad36fb5256681125121e6e32f/internal/pkg/amdgpu/amdgpu.go#L85

But within the kernel, I think my use of minor is consistent with the
rest of the drm subsystem.  I hope I don't need to reform the way the
drm subsystem uses minor in order to introduce a cgroup controller.

Regards,
Kenny
Christian König May 16, 2019, 2:08 p.m. UTC | #14
Am 16.05.19 um 14:28 schrieb Daniel Vetter:
> On Thu, May 16, 2019 at 09:25:31AM +0200, Christian König wrote:
>> Am 16.05.19 um 09:16 schrieb Koenig, Christian:
>>> Am 16.05.19 um 04:29 schrieb Kenny Ho:
>>>> On Wed, May 15, 2019 at 5:26 PM Welty, Brian <brian.welty@intel.com> wrote:
>>>>> On 5/9/2019 2:04 PM, Kenny Ho wrote:
>>>>>> There are four control file types,
>>>>>> stats (ro) - display current measured values for a resource
>>>>>> max (rw) - limits for a resource
>>>>>> default (ro, root cgroup only) - default values for a resource
>>>>>> help (ro, root cgroup only) - help string for a resource
>>>>>>
>>>>>> Each file is multi-lined with one entry/line per drm device.
>>>>> Multi-line is correct for multiple devices, but I believe you need
>>>>> to use a KEY to denote device for both your set and get routines.
>>>>> I didn't see your set functions reading a key, or the get functions
>>>>> printing the key in output.
>>>>> cgroups-v2 conventions mention using KEY of major:minor, but I think
>>>>> you can use drm_minor as key?
>>>> Given this controller is specific to the drm kernel subsystem which
>>>> uses minor to identify drm device,
>>> Wait a second, using the DRM minor is a good idea in the first place.
>> Well that should have read "is not a good idea"..
> What else should we use?

Well, what does udev use to identify a device, for example?

>> Christian.
>>
>>> I have a test system with a Vega10 and a Vega20. Which device gets which
>>> minor is not stable, but rather defined by the scan order of the PCIe bus.
>>>
>>> Normally the scan order is always the same, but adding or removing
>>> devices or delaying things just a little bit during init is enough to
>>> change this.
>>>
>>> We need something like the Linux sysfs location or similar to have a
>>> stable implementation.
> You can go from sysfs location to drm class directory (in sysfs) and back.
> That means if you care you need to walk sysfs yourself a bit, but using
> the drm minor isn't a blocker itself.

Yeah, agreed that userspace could do this. But I think if there is an
off-hand alternative we should use that instead.

> One downside with the drm minor is that it's pretty good nonsense once you
> have more than 64 gpus though, due to how we space render and legacy nodes
> in the minor ids :-)

Ok, another good reason to at least not use the minor=linenum approach.

Christian.

> -Daniel
>>> Regards,
>>> Christian.
>>>
>>>>     I don't see a need to complicate
>>>> the interfaces more by having major and a key.  As you can see in the
>>>> examples below, the drm device minor corresponds to the line number.
>>>> I am not sure how strict cgroup upstream is about the convention but I
>>>> am hoping there are flexibility here to allow for what I have
>>>> implemented.  There are a couple of other things I have done that is
>>>> not described in the convention: 1) inclusion of read-only *.help file
>>>> at the root cgroup, 2) use read-only (which I can potentially make rw)
>>>> *.default file instead of having a default entries (since the default
>>>> can be different for different devices) inside the control files (this
>>>> way, the resetting of cgroup values for all the drm devices, can be
>>>> done by a simple 'cp'.)
>>>>
>>>>>> Usage examples:
>>>>>> // set limit for card1 to 1GB
>>>>>> sed -i '2s/.*/1073741824/' /sys/fs/cgroup/<cgroup>/drm.buffer.total.max
>>>>>>
>>>>>> // set limit for card0 to 512MB
>>>>>> sed -i '1s/.*/536870912/' /sys/fs/cgroup/<cgroup>/drm.buffer.total.max
>>>>>>     /** @file drm_gem.c
>>>>>> @@ -154,6 +156,9 @@ void drm_gem_private_object_init(struct drm_device *dev,
>>>>>>          obj->handle_count = 0;
>>>>>>          obj->size = size;
>>>>>>          drm_vma_node_reset(&obj->vma_node);
>>>>>> +
>>>>>> +     obj->drmcgrp = get_drmcgrp(current);
>>>>>> +     drmcgrp_chg_bo_alloc(obj->drmcgrp, dev, size);
>>>>> Why do the charging here?
>>>>> There is no backing store yet for the buffer, so this is really tracking something akin to allowed virtual memory for GEM objects?
>>>>> Is this really useful for an administrator to control?
>>>>> Isn't the resource we want to control actually the physical backing store?
>>>> That's correct.  This is just the first level of control since the
>>>> backing store can be backed by different type of memory.  I am in the
>>>> process of adding at least two more resources.  Stay tuned.  I am
>>>> doing the charge here to enforce the idea of "creator is deemed owner"
>>>> at a place where the code is shared by all (the init function.)
>>>>
>>>>>> +     while (i <= max_minor && limits != NULL) {
>>>>>> +             sval =  strsep(&limits, "\n");
>>>>>> +             rc = kstrtoll(sval, 0, &val);
>>>>> Input should be "KEY VALUE", so KEY will determine device to apply this to.
>>>>> Also, per cgroups-v2 documentation of limits, I believe need to parse and handle the special "max" input value.
>>>>>
>>>>> parse_resources() in rdma controller is example for both of above.
>>>> Please see my previous reply for the rationale of my hope to not need
>>>> a key.  I can certainly add handling of "max" and "default".
>>>>
>>>>
>>>>>> +void drmcgrp_chg_bo_alloc(struct drmcgrp *drmcgrp, struct drm_device *dev,
>>>>>> +             size_t size)
>>>>> Shouldn't this return an error and be implemented with same semantics as the
>>>>> try_charge() functions of other controllers?
>>>>> Below will allow stats_total_allocated to overrun limits_total_allocated.
>>>> This is because I am charging the buffer at the init of the buffer
>>>> which does not fail so the "try" (drmcgrp_bo_can_allocate) is separate
>>>> and placed earlier and nearer other condition where gem object
>>>> allocation may fail.  In other words, there are multiple possibilities
>>>> for which gem allocation may fail (cgroup limit being one of them) and
>>>> satisfying cgroup limit does not mean a charge is needed.  I can
>>>> certainly combine the two functions to have an additional try_charge
>>>> semantic as well if that is really needed.
>>>>
>>>> Regards,
>>>> Kenny
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>> _______________________________________________
>> dri-devel mailing list
>> dri-devel@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/dri-devel
> --
> Daniel Vetter
> Software Engineer, Intel Corporation
> http://blog.ffwll.ch
Tejun Heo May 16, 2019, 2:10 p.m. UTC | #15
Hello,

I haven't gone through the patchset yet but some quick comments.

On Wed, May 15, 2019 at 10:29:21PM -0400, Kenny Ho wrote:
> Given this controller is specific to the drm kernel subsystem which
> uses minor to identify drm device, I don't see a need to complicate
> the interfaces more by having major and a key.  As you can see in the
> examples below, the drm device minor corresponds to the line number.
> I am not sure how strict cgroup upstream is about the convention but I

We're pretty strict.

> am hoping there are flexibility here to allow for what I have
> implemented.  There are a couple of other things I have done that is

So, please follow the interface conventions.  We can definitely add
new ones but that would need functional reasons.

> not described in the convention: 1) inclusion of read-only *.help file
> at the root cgroup, 2) use read-only (which I can potentially make rw)
> *.default file instead of having a default entries (since the default
> can be different for different devices) inside the control files (this
> way, the resetting of cgroup values for all the drm devices, can be
> done by a simple 'cp'.)

Again, please follow the existing conventions.  There's a lot more
harm than good in every controller being creative in their own way.
It's trivial to build convenience features in userspace.  Please do it
there.

> > Is this really useful for an administrator to control?
> > Isn't the resource we want to control actually the physical backing store?
> That's correct.  This is just the first level of control since the
> backing store can be backed by different type of memory.  I am in the
> process of adding at least two more resources.  Stay tuned.  I am
> doing the charge here to enforce the idea of "creator is deemed owner"
> at a place where the code is shared by all (the init function.)

Ideally, a controller should only control hard resources which impact
behaviors and performance that are immediately visible to users.

Thanks.
Christian König May 16, 2019, 2:12 p.m. UTC | #16
Am 16.05.19 um 16:03 schrieb Kenny Ho:
> On Thu, May 16, 2019 at 3:25 AM Christian König
> <ckoenig.leichtzumerken@gmail.com> wrote:
>> Am 16.05.19 um 09:16 schrieb Koenig, Christian:
>>> Am 16.05.19 um 04:29 schrieb Kenny Ho:
>>>> On Wed, May 15, 2019 at 5:26 PM Welty, Brian <brian.welty@intel.com> wrote:
>>>>> On 5/9/2019 2:04 PM, Kenny Ho wrote:
>>>>>> Each file is multi-lined with one entry/line per drm device.
>>>>> Multi-line is correct for multiple devices, but I believe you need
>>>>> to use a KEY to denote device for both your set and get routines.
>>>>> I didn't see your set functions reading a key, or the get functions
>>>>> printing the key in output.
>>>>> cgroups-v2 conventions mention using KEY of major:minor, but I think
>>>>> you can use drm_minor as key?
>>>> Given this controller is specific to the drm kernel subsystem which
>>>> uses minor to identify drm device,
>>> Wait a second, using the DRM minor is a good idea in the first place.
>> Well that should have read "is not a good idea"..
>>
>> I have a test system with a Vega10 and a Vega20. Which device gets which
>> minor is not stable, but rather defined by the scan order of the PCIe bus.
>>
>> Normally the scan order is always the same, but adding or removing
>> devices or delaying things just a little bit during init is enough to
>> change this.
>>
>> We need something like the Linux sysfs location or similar to have a
>> stable implementation.
> I get that, which is why I don't use minor to identify cards in user
> space apps I wrote:
> https://github.com/RadeonOpenCompute/k8s-device-plugin/blob/c2659c9d1d0713cad36fb5256681125121e6e32f/internal/pkg/amdgpu/amdgpu.go#L85

Yeah, that is certainly a possibility.

> But within the kernel, I think my use of minor is consistent with the
> rest of the drm subsystem.  I hope I don't need to reform the way the
> drm subsystem use minor in order to introduce a cgroup controller.

Well I would try to avoid using the minor and at least look for 
alternatives. E.g. what does udev use to identify the devices, for
example? And IIRC we have something like a "device-name" in the kernel 
as well (what's printed in the logs).

The minimum we need to do is get away from the minor=linenum approach, 
because, as Daniel pointed out, the minor allocation is quite a mess and not
necessarily contiguous.

Regards,
Christian.

>
> Regards,
> Kenny
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Kenny Ho May 16, 2019, 2:28 p.m. UTC | #17
On Thu, May 16, 2019 at 10:12 AM Christian König
<ckoenig.leichtzumerken@gmail.com> wrote:
> Am 16.05.19 um 16:03 schrieb Kenny Ho:
> > On Thu, May 16, 2019 at 3:25 AM Christian König
> > <ckoenig.leichtzumerken@gmail.com> wrote:
> >> Am 16.05.19 um 09:16 schrieb Koenig, Christian:
> >> We need something like the Linux sysfs location or similar to have a
> >> stable implementation.
> > I get that, which is why I don't use minor to identify cards in user
> > space apps I wrote:
> > https://github.com/RadeonOpenCompute/k8s-device-plugin/blob/c2659c9d1d0713cad36fb5256681125121e6e32f/internal/pkg/amdgpu/amdgpu.go#L85
>
> Yeah, that is certainly a possibility.
>
> > But within the kernel, I think my use of minor is consistent with the
> > rest of the drm subsystem.  I hope I don't need to reform the way the
> > drm subsystem use minor in order to introduce a cgroup controller.
>
> Well I would try to avoid using the minor and at least look for
> alternatives. E.g. what does udev uses to identify the devices for
> example? And IIRC we have something like a "device-name" in the kernel
> as well (what's printed in the logs).
>
> The minimum we need to do is get away from the minor=linenum approach,
> cause as Daniel pointed out the minor allocation is quite a mess and not
> necessary contiguous.

I noticed :) but it looks like there isn't much of a choice from what
Tejun/cgroup replied about the convention.

Regards,
Kenny
Kenny Ho May 16, 2019, 2:58 p.m. UTC | #18
On Thu, May 16, 2019 at 10:10 AM Tejun Heo <tj@kernel.org> wrote:
> I haven't gone through the patchset yet but some quick comments.
>
> On Wed, May 15, 2019 at 10:29:21PM -0400, Kenny Ho wrote:
> > Given this controller is specific to the drm kernel subsystem which
> > uses minor to identify drm device, I don't see a need to complicate
> > the interfaces more by having major and a key.  As you can see in the
> > examples below, the drm device minor corresponds to the line number.
> > I am not sure how strict cgroup upstream is about the convention but I
>
> We're pretty strict.
>
> > am hoping there are flexibility here to allow for what I have
> > implemented.  There are a couple of other things I have done that is
>
> So, please follow the interface conventions.  We can definitely add
> new ones but that would need functional reasons.
>
> > not described in the convention: 1) inclusion of read-only *.help file
> > at the root cgroup, 2) use read-only (which I can potentially make rw)
> > *.default file instead of having a default entries (since the default
> > can be different for different devices) inside the control files (this
> > way, the resetting of cgroup values for all the drm devices, can be
> > done by a simple 'cp'.)
>
> Again, please follow the existing conventions.  There's a lot more
> harm than good in every controller being creative in their own way.
> It's trivial to build convenience features in userspace.  Please do it
> there.
I can certainly remove the ro *.help file and leave the documentation
to Documentation/, but for the *.default I do have a functional reason
for it.  As far as I can tell from the convention, the default is per
cgroup and there is no way to describe a per-device default.  Although,
perhaps we are talking about two different kinds of defaults.  Anyway,
I can leave the discussion to a more detailed review.

Regards,
Kenny
diff mbox series

Patch

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index 93b2c5a48a71..b4c078b7ad63 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -34,6 +34,7 @@ 
 #include <drm/drmP.h>
 #include <drm/amdgpu_drm.h>
 #include <drm/drm_cache.h>
+#include <drm/drm_cgroup.h>
 #include "amdgpu.h"
 #include "amdgpu_trace.h"
 #include "amdgpu_amdkfd.h"
@@ -446,6 +447,9 @@  static int amdgpu_bo_do_create(struct amdgpu_device *adev,
 	if (!amdgpu_bo_validate_size(adev, size, bp->domain))
 		return -ENOMEM;
 
+	if (!drmcgrp_bo_can_allocate(current, adev->ddev, size))
+		return -ENOMEM;
+
 	*bo_ptr = NULL;
 
 	acc_size = ttm_bo_dma_acc_size(&adev->mman.bdev, size,
diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c
index 6a80db077dc6..cbd49bf34dcf 100644
--- a/drivers/gpu/drm/drm_gem.c
+++ b/drivers/gpu/drm/drm_gem.c
@@ -37,10 +37,12 @@ 
 #include <linux/shmem_fs.h>
 #include <linux/dma-buf.h>
 #include <linux/mem_encrypt.h>
+#include <linux/cgroup_drm.h>
 #include <drm/drmP.h>
 #include <drm/drm_vma_manager.h>
 #include <drm/drm_gem.h>
 #include <drm/drm_print.h>
+#include <drm/drm_cgroup.h>
 #include "drm_internal.h"
 
 /** @file drm_gem.c
@@ -154,6 +156,9 @@  void drm_gem_private_object_init(struct drm_device *dev,
 	obj->handle_count = 0;
 	obj->size = size;
 	drm_vma_node_reset(&obj->vma_node);
+
+	obj->drmcgrp = get_drmcgrp(current);
+	drmcgrp_chg_bo_alloc(obj->drmcgrp, dev, size);
 }
 EXPORT_SYMBOL(drm_gem_private_object_init);
 
@@ -804,6 +809,8 @@  drm_gem_object_release(struct drm_gem_object *obj)
 	if (obj->filp)
 		fput(obj->filp);
 
+	drmcgrp_unchg_bo_alloc(obj->drmcgrp, obj->dev, obj->size);
+
 	drm_gem_free_mmap_offset(obj);
 }
 EXPORT_SYMBOL(drm_gem_object_release);
diff --git a/drivers/gpu/drm/drm_prime.c b/drivers/gpu/drm/drm_prime.c
index 231e3f6d5f41..faed5611a1c6 100644
--- a/drivers/gpu/drm/drm_prime.c
+++ b/drivers/gpu/drm/drm_prime.c
@@ -32,6 +32,7 @@ 
 #include <drm/drm_prime.h>
 #include <drm/drm_gem.h>
 #include <drm/drmP.h>
+#include <drm/drm_cgroup.h>
 
 #include "drm_internal.h"
 
@@ -794,6 +795,7 @@  int drm_gem_prime_fd_to_handle(struct drm_device *dev,
 {
 	struct dma_buf *dma_buf;
 	struct drm_gem_object *obj;
+	struct drmcgrp *drmcgrp = get_drmcgrp(current);
 	int ret;
 
 	dma_buf = dma_buf_get(prime_fd);
@@ -818,6 +820,13 @@  int drm_gem_prime_fd_to_handle(struct drm_device *dev,
 		goto out_unlock;
 	}
 
+	/* only allow bo from the same cgroup or its ancestor to be imported */
+	if (drmcgrp != NULL &&
+			!drmcgrp_is_self_or_ancestor(drmcgrp, obj->drmcgrp)) {
+		ret = -EACCES;
+		goto out_unlock;
+	}
+
 	if (obj->dma_buf) {
 		WARN_ON(obj->dma_buf != dma_buf);
 	} else {
diff --git a/include/drm/drm_cgroup.h b/include/drm/drm_cgroup.h
index ddb9eab64360..8711b7c5f7bf 100644
--- a/include/drm/drm_cgroup.h
+++ b/include/drm/drm_cgroup.h
@@ -4,12 +4,20 @@ 
 #ifndef __DRM_CGROUP_H__
 #define __DRM_CGROUP_H__
 
+#include <linux/cgroup_drm.h>
+
 #ifdef CONFIG_CGROUP_DRM
 
 int drmcgrp_register_device(struct drm_device *device);
-
 int drmcgrp_unregister_device(struct drm_device *device);
-
+bool drmcgrp_is_self_or_ancestor(struct drmcgrp *self,
+		struct drmcgrp *relative);
+void drmcgrp_chg_bo_alloc(struct drmcgrp *drmcgrp, struct drm_device *dev,
+		size_t size);
+void drmcgrp_unchg_bo_alloc(struct drmcgrp *drmcgrp, struct drm_device *dev,
+		size_t size);
+bool drmcgrp_bo_can_allocate(struct task_struct *task, struct drm_device *dev,
+		size_t size);
 #else
 static inline int drmcgrp_register_device(struct drm_device *device)
 {
@@ -20,5 +28,27 @@  static inline int drmcgrp_unregister_device(struct drm_device *device)
 {
 	return 0;
 }
+
+static inline bool drmcgrp_is_self_or_ancestor(struct drmcgrp *self,
+		struct drmcgrp *relative)
+{
+	return false;
+}
+
+static inline void drmcgrp_chg_bo_alloc(struct drmcgrp *drmcgrp,
+		struct drm_device *dev,	size_t size)
+{
+}
+
+static inline void drmcgrp_unchg_bo_alloc(struct drmcgrp *drmcgrp,
+		struct drm_device *dev,	size_t size)
+{
+}
+
+static inline bool drmcgrp_bo_can_allocate(struct task_struct *task,
+		struct drm_device *dev,	size_t size)
+{
+	return true;
+}
 #endif /* CONFIG_CGROUP_DRM */
 #endif /* __DRM_CGROUP_H__ */
diff --git a/include/drm/drm_gem.h b/include/drm/drm_gem.h
index c95727425284..02854c674b5c 100644
--- a/include/drm/drm_gem.h
+++ b/include/drm/drm_gem.h
@@ -272,6 +272,17 @@  struct drm_gem_object {
 	 *
 	 */
 	const struct drm_gem_object_funcs *funcs;
+
+	/**
+	 * @drmcgrp:
+	 *
+	 * DRM cgroup this GEM object belongs to.
+         *
+         * This is used to track and limit the amount of GEM objects a user
+         * can allocate.  Since GEM objects can be shared, this is also used
+         * to ensure GEM objects are only shared within the same cgroup.
+	 */
+	struct drmcgrp *drmcgrp;
 };
 
 /**
diff --git a/include/linux/cgroup_drm.h b/include/linux/cgroup_drm.h
index d7ccf434ca6b..fe14ba7bb1cf 100644
--- a/include/linux/cgroup_drm.h
+++ b/include/linux/cgroup_drm.h
@@ -15,6 +15,9 @@ 
 
 struct drmcgrp_device_resource {
 	/* for per device stats */
+	s64			bo_stats_total_allocated;
+
+	s64			bo_limits_total_allocated;
 };
 
 struct drmcgrp {
diff --git a/kernel/cgroup/drm.c b/kernel/cgroup/drm.c
index f9ef4bf042d8..bc3abff09113 100644
--- a/kernel/cgroup/drm.c
+++ b/kernel/cgroup/drm.c
@@ -15,6 +15,22 @@  static DEFINE_MUTEX(drmcgrp_mutex);
 struct drmcgrp_device {
 	struct drm_device	*dev;
 	struct mutex		mutex;
+
+	s64			bo_limits_total_allocated_default;
+};
+
+#define DRMCG_CTF_PRIV_SIZE 3
+#define DRMCG_CTF_PRIV_MASK GENMASK((DRMCG_CTF_PRIV_SIZE - 1), 0)
+
+enum drmcgrp_res_type {
+	DRMCGRP_TYPE_BO_TOTAL,
+};
+
+enum drmcgrp_file_type {
+	DRMCGRP_FTYPE_STATS,
+	DRMCGRP_FTYPE_MAX,
+	DRMCGRP_FTYPE_DEFAULT,
+	DRMCGRP_FTYPE_HELP,
 };
 
 /* indexed by drm_minor for access speed */
@@ -53,6 +69,10 @@  static inline int init_drmcgrp_single(struct drmcgrp *drmcgrp, int i)
 	}
 
 	/* set defaults here */
+	if (known_drmcgrp_devs[i] != NULL) {
+		ddr->bo_limits_total_allocated =
+		  known_drmcgrp_devs[i]->bo_limits_total_allocated_default;
+	}
 
 	return 0;
 }
@@ -99,7 +119,187 @@  drmcgrp_css_alloc(struct cgroup_subsys_state *parent_css)
 	return &drmcgrp->css;
 }
 
+static inline void drmcgrp_print_stats(struct drmcgrp_device_resource *ddr,
+		struct seq_file *sf, enum drmcgrp_res_type type)
+{
+	if (ddr == NULL) {
+		seq_puts(sf, "\n");
+		return;
+	}
+
+	switch (type) {
+	case DRMCGRP_TYPE_BO_TOTAL:
+		seq_printf(sf, "%lld\n", ddr->bo_stats_total_allocated);
+		break;
+	default:
+		seq_puts(sf, "\n");
+		break;
+	}
+}
+
+static inline void drmcgrp_print_limits(struct drmcgrp_device_resource *ddr,
+		struct seq_file *sf, enum drmcgrp_res_type type)
+{
+	if (ddr == NULL) {
+		seq_puts(sf, "\n");
+		return;
+	}
+
+	switch (type) {
+	case DRMCGRP_TYPE_BO_TOTAL:
+		seq_printf(sf, "%lld\n", ddr->bo_limits_total_allocated);
+		break;
+	default:
+		seq_puts(sf, "\n");
+		break;
+	}
+}
+
+static inline void drmcgrp_print_default(struct drmcgrp_device *ddev,
+		struct seq_file *sf, enum drmcgrp_res_type type)
+{
+	if (ddev == NULL) {
+		seq_puts(sf, "\n");
+		return;
+	}
+
+	switch (type) {
+	case DRMCGRP_TYPE_BO_TOTAL:
+		seq_printf(sf, "%lld\n", ddev->bo_limits_total_allocated_default);
+		break;
+	default:
+		seq_puts(sf, "\n");
+		break;
+	}
+}
+
+static inline void drmcgrp_print_help(int cardNum, struct seq_file *sf,
+		enum drmcgrp_res_type type)
+{
+	switch (type) {
+	case DRMCGRP_TYPE_BO_TOTAL:
+		seq_printf(sf,
+		"Total GEM buffer allocation in bytes for card%d\n",
+		cardNum);
+		break;
+	default:
+		seq_puts(sf, "\n");
+		break;
+	}
+}
+
+static int drmcgrp_bo_show(struct seq_file *sf, void *v)
+{
+	struct drmcgrp *drmcgrp = css_drmcgrp(seq_css(sf));
+	struct drmcgrp_device_resource *ddr = NULL;
+	enum drmcgrp_file_type f_type =
+		seq_cft(sf)->private & DRMCG_CTF_PRIV_MASK;
+	enum drmcgrp_res_type type =
+		seq_cft(sf)->private >> DRMCG_CTF_PRIV_SIZE;
+	struct drmcgrp_device *ddev;
+	int i;
+
+	for (i = 0; i <= max_minor; i++) {
+		ddr = drmcgrp->dev_resources[i];
+		ddev = known_drmcgrp_devs[i];
+
+		switch (f_type) {
+		case DRMCGRP_FTYPE_STATS:
+			drmcgrp_print_stats(ddr, sf, type);
+			break;
+		case DRMCGRP_FTYPE_MAX:
+			drmcgrp_print_limits(ddr, sf, type);
+			break;
+		case DRMCGRP_FTYPE_DEFAULT:
+			drmcgrp_print_default(ddev, sf, type);
+			break;
+		case DRMCGRP_FTYPE_HELP:
+			drmcgrp_print_help(i, sf, type);
+			break;
+		default:
+			seq_puts(sf, "\n");
+			break;
+		}
+	}
+
+	return 0;
+}
+
+static ssize_t drmcgrp_bo_limit_write(struct kernfs_open_file *of, char *buf,
+		size_t nbytes, loff_t off)
+{
+	struct drmcgrp *drmcgrp = css_drmcgrp(of_css(of));
+	enum drmcgrp_res_type type = of_cft(of)->private >> DRMCG_CTF_PRIV_SIZE;
+	char *cft_name = of_cft(of)->name;
+	char *limits = strstrip(buf);
+	struct drmcgrp_device_resource *ddr;
+	char *sval;
+	s64 val;
+	int i = 0;
+	int rc;
+
+	while (i <= max_minor && limits != NULL) {
+		sval = strsep(&limits, "\n");
+		rc = kstrtoll(sval, 0, &val);
+
+		if (rc) {
+			pr_err("drmcgrp: %s: minor %d, err %d. ",
+				cft_name, i, rc);
+			pr_cont_cgroup_name(drmcgrp->css.cgroup);
+			pr_cont("\n");
+		} else {
+			ddr = drmcgrp->dev_resources[i];
+			switch (type) {
+			case DRMCGRP_TYPE_BO_TOTAL:
+				/* ignore negative values, keep current limit */
+				if (val >= 0)
+					ddr->bo_limits_total_allocated = val;
+				break;
+			default:
+				break;
+			}
+		}
+
+		i++;
+	}
+
+	if (i <= max_minor) {
+		pr_err("drmcgrp: %s: fewer entries than # of drm devices. ",
+				cft_name);
+		pr_cont_cgroup_name(drmcgrp->css.cgroup);
+		pr_cont("\n");
+	}
+
+	return nbytes;
+}
+
 struct cftype files[] = {
+	{
+		.name = "buffer.total.stats",
+		.seq_show = drmcgrp_bo_show,
+		.private = (DRMCGRP_TYPE_BO_TOTAL << DRMCG_CTF_PRIV_SIZE) |
+			DRMCGRP_FTYPE_STATS,
+	},
+	{
+		.name = "buffer.total.default",
+		.seq_show = drmcgrp_bo_show,
+		.flags = CFTYPE_ONLY_ON_ROOT,
+		.private = (DRMCGRP_TYPE_BO_TOTAL << DRMCG_CTF_PRIV_SIZE) |
+			DRMCGRP_FTYPE_DEFAULT,
+	},
+	{
+		.name = "buffer.total.help",
+		.seq_show = drmcgrp_bo_show,
+		.flags = CFTYPE_ONLY_ON_ROOT,
+		.private = (DRMCGRP_TYPE_BO_TOTAL << DRMCG_CTF_PRIV_SIZE) |
+			DRMCGRP_FTYPE_HELP,
+	},
+	{
+		.name = "buffer.total.max",
+		.write = drmcgrp_bo_limit_write,
+		.seq_show = drmcgrp_bo_show,
+		.private = (DRMCGRP_TYPE_BO_TOTAL << DRMCG_CTF_PRIV_SIZE) |
+			DRMCGRP_FTYPE_MAX,
+	},
 	{ }	/* terminate */
 };
 
@@ -122,6 +322,8 @@  int drmcgrp_register_device(struct drm_device *dev)
 		return -ENOMEM;
 
 	ddev->dev = dev;
+	ddev->bo_limits_total_allocated_default = S64_MAX;
+
 	mutex_init(&ddev->mutex);
 
 	mutex_lock(&drmcgrp_mutex);
@@ -156,3 +358,81 @@  int drmcgrp_unregister_device(struct drm_device *dev)
 	return 0;
 }
 EXPORT_SYMBOL(drmcgrp_unregister_device);
+
+bool drmcgrp_is_self_or_ancestor(struct drmcgrp *self, struct drmcgrp *relative)
+{
+	for (; self != NULL; self = parent_drmcgrp(self))
+		if (self == relative)
+			return true;
+
+	return false;
+}
+EXPORT_SYMBOL(drmcgrp_is_self_or_ancestor);
+
+bool drmcgrp_bo_can_allocate(struct task_struct *task, struct drm_device *dev,
+		size_t size)
+{
+	struct drmcgrp *drmcgrp = get_drmcgrp(task);
+	struct drmcgrp_device_resource *d;
+	int devIdx = dev->primary->index;
+	bool result = true;
+	s64 delta = 0;
+
+	if (drmcgrp == NULL || drmcgrp == root_drmcgrp ||
+			known_drmcgrp_devs[devIdx] == NULL)
+		return true;
+
+	mutex_lock(&known_drmcgrp_devs[devIdx]->mutex);
+	for ( ; drmcgrp != root_drmcgrp; drmcgrp = parent_drmcgrp(drmcgrp)) {
+		d = drmcgrp->dev_resources[devIdx];
+		delta = d->bo_limits_total_allocated -
+				d->bo_stats_total_allocated;
+
+		if (delta <= 0 || size > delta) {
+			result = false;
+			break;
+		}
+	}
+	mutex_unlock(&known_drmcgrp_devs[devIdx]->mutex);
+
+	return result;
+}
+EXPORT_SYMBOL(drmcgrp_bo_can_allocate);
+
+void drmcgrp_chg_bo_alloc(struct drmcgrp *drmcgrp, struct drm_device *dev,
+		size_t size)
+{
+	struct drmcgrp_device_resource *ddr;
+	int devIdx = dev->primary->index;
+
+	if (drmcgrp == NULL || known_drmcgrp_devs[devIdx] == NULL)
+		return;
+
+	mutex_lock(&known_drmcgrp_devs[devIdx]->mutex);
+	for ( ; drmcgrp != NULL; drmcgrp = parent_drmcgrp(drmcgrp)) {
+		ddr = drmcgrp->dev_resources[devIdx];
+
+		ddr->bo_stats_total_allocated += (s64)size;
+	}
+	mutex_unlock(&known_drmcgrp_devs[devIdx]->mutex);
+}
+EXPORT_SYMBOL(drmcgrp_chg_bo_alloc);
+
+void drmcgrp_unchg_bo_alloc(struct drmcgrp *drmcgrp, struct drm_device *dev,
+		size_t size)
+{
+	struct drmcgrp_device_resource *ddr;
+	int devIdx = dev->primary->index;
+
+	if (drmcgrp == NULL || known_drmcgrp_devs[devIdx] == NULL)
+		return;
+
+	mutex_lock(&known_drmcgrp_devs[devIdx]->mutex);
+	for ( ; drmcgrp != NULL; drmcgrp = parent_drmcgrp(drmcgrp)) {
+		ddr = drmcgrp->dev_resources[devIdx];
+
+		ddr->bo_stats_total_allocated -= (s64)size;
+	}
+	mutex_unlock(&known_drmcgrp_devs[devIdx]->mutex);
+}
+EXPORT_SYMBOL(drmcgrp_unchg_bo_alloc);
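
For reviewers who want to see the intended call pattern, here is a rough,
illustrative sketch (not part of the patch) of how a driver-side allocation
path would pair these helpers.  The my_driver_* functions are hypothetical
stand-ins; the real call sites live in the drm_gem/drm_prime/amdgpu hunks
earlier in this patch.

/*
 * Illustrative only -- not part of this patch.  Shows the intended
 * check/charge/uncharge pairing around a GEM allocation.
 */
static int my_driver_gem_create(struct drm_device *dev, size_t size,
				struct drm_gem_object **out)
{
	struct drm_gem_object *obj;

	/* Fail early if this allocation would exceed drm.buffer.total.max
	 * for the calling task's cgroup or any of its ancestors. */
	if (!drmcgrp_bo_can_allocate(current, dev, size))
		return -ENOMEM;

	obj = my_driver_gem_object_alloc(dev, size);	/* hypothetical */
	if (obj == NULL)
		return -ENOMEM;

	/* Charge the allocating task's cgroup and remember the owner so
	 * that a later free uncharges the same cgroup. */
	obj->drmcgrp = get_drmcgrp(current);
	drmcgrp_chg_bo_alloc(obj->drmcgrp, dev, size);

	*out = obj;
	return 0;
}

static void my_driver_gem_free(struct drm_gem_object *obj)
{
	/* Uncharge the cgroup recorded at allocation time. */
	drmcgrp_unchg_bo_alloc(obj->drmcgrp, obj->dev, obj->size);
	my_driver_gem_object_release(obj);	/* hypothetical */
}

The charge/uncharge symmetry matters because both drmcgrp_chg_bo_alloc() and
drmcgrp_unchg_bo_alloc() walk up the cgroup hierarchy, so every level's
bo_stats_total_allocated stays balanced.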