Message ID | 20190509210410.5471-5-Kenny.Ho@amd.com (mailing list archive)
---|---
State | New, archived
Series | new cgroup controller for gpu/drm subsystem
Am 09.05.19 um 23:04 schrieb Kenny Ho:
> The drm resource being measured and limited here is the GEM buffer
> objects. User applications allocate and free these buffers. In
> addition, a process can allocate a buffer and share it with another
> process. The consumer of a shared buffer can also outlive the
> allocator of the buffer.
>
> For the purpose of cgroup accounting and limiting, ownership of the
> buffer is deemed to be the cgroup to which the allocating process
> belongs. There is one limit per drm device.
>
> In order to prevent the buffer outliving the cgroup that owns it, a
> process is prevented from importing buffers that are not owned by the
> process' cgroup or the ancestors of the process' cgroup.
>
> For this resource, the control files are prefixed with drm.buffer.total.
>
> There are four control file types,
> stats (ro) - display current measured values for a resource
> max (rw) - limits for a resource
> default (ro, root cgroup only) - default values for a resource
> help (ro, root cgroup only) - help string for a resource
>
> Each file is multi-lined with one entry/line per drm device.
>
> Usage examples:
> // set limit for card1 to 1GB
> sed -i '2s/.*/1073741824/' /sys/fs/cgroup/<cgroup>/drm.buffer.total.max
>
> // set limit for card0 to 512MB
> sed -i '1s/.*/536870912/' /sys/fs/cgroup/<cgroup>/drm.buffer.total.max
>
> Change-Id: I4c249d06d45ec709d6481d4cbe87c5168545c5d0
> Signed-off-by: Kenny Ho <Kenny.Ho@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_object.c |   4 +
>  drivers/gpu/drm/drm_gem.c                  |   7 +
>  drivers/gpu/drm/drm_prime.c                |   9 +
>  include/drm/drm_cgroup.h                   |  34 ++-
>  include/drm/drm_gem.h                      |  11 +
>  include/linux/cgroup_drm.h                 |   3 +
>  kernel/cgroup/drm.c                        | 280 +++++++++++++++++++++
>  7 files changed, 346 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> index 93b2c5a48a71..b4c078b7ad63 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> @@ -34,6 +34,7 @@
>  #include <drm/drmP.h>
>  #include <drm/amdgpu_drm.h>
>  #include <drm/drm_cache.h>
> +#include <drm/drm_cgroup.h>
>  #include "amdgpu.h"
>  #include "amdgpu_trace.h"
>  #include "amdgpu_amdkfd.h"
> @@ -446,6 +447,9 @@ static int amdgpu_bo_do_create(struct amdgpu_device *adev,
>         if (!amdgpu_bo_validate_size(adev, size, bp->domain))
>                 return -ENOMEM;
>
> +       if (!drmcgrp_bo_can_allocate(current, adev->ddev, size))
> +               return -ENOMEM;
> +
>         *bo_ptr = NULL;
>
>         acc_size = ttm_bo_dma_acc_size(&adev->mman.bdev, size,
> diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c
> index 6a80db077dc6..cbd49bf34dcf 100644
> --- a/drivers/gpu/drm/drm_gem.c
> +++ b/drivers/gpu/drm/drm_gem.c
> @@ -37,10 +37,12 @@
>  #include <linux/shmem_fs.h>
>  #include <linux/dma-buf.h>
>  #include <linux/mem_encrypt.h>
> +#include <linux/cgroup_drm.h>
>  #include <drm/drmP.h>
>  #include <drm/drm_vma_manager.h>
>  #include <drm/drm_gem.h>
>  #include <drm/drm_print.h>
> +#include <drm/drm_cgroup.h>
>  #include "drm_internal.h"
>
>  /** @file drm_gem.c
> @@ -154,6 +156,9 @@ void drm_gem_private_object_init(struct drm_device *dev,
>         obj->handle_count = 0;
>         obj->size = size;
>         drm_vma_node_reset(&obj->vma_node);
> +
> +       obj->drmcgrp = get_drmcgrp(current);
> +       drmcgrp_chg_bo_alloc(obj->drmcgrp, dev, size);
>  }
>  EXPORT_SYMBOL(drm_gem_private_object_init);
>
> @@ -804,6 +809,8 @@ drm_gem_object_release(struct drm_gem_object *obj)
>         if (obj->filp)
>                 fput(obj->filp);
>
> +       drmcgrp_unchg_bo_alloc(obj->drmcgrp, obj->dev, obj->size);
> +
>         drm_gem_free_mmap_offset(obj);
>  }
>  EXPORT_SYMBOL(drm_gem_object_release);
> diff --git a/drivers/gpu/drm/drm_prime.c b/drivers/gpu/drm/drm_prime.c
> index 231e3f6d5f41..faed5611a1c6 100644
> --- a/drivers/gpu/drm/drm_prime.c
> +++ b/drivers/gpu/drm/drm_prime.c
> @@ -32,6 +32,7 @@
>  #include <drm/drm_prime.h>
>  #include <drm/drm_gem.h>
>  #include <drm/drmP.h>
> +#include <drm/drm_cgroup.h>
>
>  #include "drm_internal.h"
>
> @@ -794,6 +795,7 @@ int drm_gem_prime_fd_to_handle(struct drm_device *dev,
>  {
>         struct dma_buf *dma_buf;
>         struct drm_gem_object *obj;
> +       struct drmcgrp *drmcgrp = get_drmcgrp(current);
>         int ret;
>
>         dma_buf = dma_buf_get(prime_fd);
> @@ -818,6 +820,13 @@ int drm_gem_prime_fd_to_handle(struct drm_device *dev,
>                 goto out_unlock;
>         }
>
> +       /* only allow bo from the same cgroup or its ancestor to be imported */
> +       if (drmcgrp != NULL &&
> +                       !drmcgrp_is_self_or_ancestor(drmcgrp, obj->drmcgrp)) {
> +               ret = -EACCES;
> +               goto out_unlock;
> +       }
> +

This will most likely go up in flames.

If I'm not completely mistaken we already use
drm_gem_prime_fd_to_handle() to exchange handles between different
cgroups in current container usages.

Christian.

>         if (obj->dma_buf) {
>                 WARN_ON(obj->dma_buf != dma_buf);
>         } else {
> diff --git a/include/drm/drm_cgroup.h b/include/drm/drm_cgroup.h
> index ddb9eab64360..8711b7c5f7bf 100644
> --- a/include/drm/drm_cgroup.h
> +++ b/include/drm/drm_cgroup.h
> @@ -4,12 +4,20 @@
>  #ifndef __DRM_CGROUP_H__
>  #define __DRM_CGROUP_H__
>
> +#include <linux/cgroup_drm.h>
> +
>  #ifdef CONFIG_CGROUP_DRM
>
>  int drmcgrp_register_device(struct drm_device *device);
> -
>  int drmcgrp_unregister_device(struct drm_device *device);
> -
> +bool drmcgrp_is_self_or_ancestor(struct drmcgrp *self,
> +               struct drmcgrp *relative);
> +void drmcgrp_chg_bo_alloc(struct drmcgrp *drmcgrp, struct drm_device *dev,
> +               size_t size);
> +void drmcgrp_unchg_bo_alloc(struct drmcgrp *drmcgrp, struct drm_device *dev,
> +               size_t size);
> +bool drmcgrp_bo_can_allocate(struct task_struct *task, struct drm_device *dev,
> +               size_t size);
>  #else
>  static inline int drmcgrp_register_device(struct drm_device *device)
>  {
> @@ -20,5 +28,27 @@ static inline int drmcgrp_unregister_device(struct drm_device *device)
>  {
>         return 0;
>  }
> +
> +static inline bool drmcgrp_is_self_or_ancestor(struct drmcgrp *self,
> +               struct drmcgrp *relative)
> +{
> +       return false;
> +}
> +
> +static inline void drmcgrp_chg_bo_alloc(struct drmcgrp *drmcgrp,
> +               struct drm_device *dev, size_t size)
> +{
> +}
> +
> +static inline void drmcgrp_unchg_bo_alloc(struct drmcgrp *drmcgrp,
> +               struct drm_device *dev, size_t size)
> +{
> +}
> +
> +static inline bool drmcgrp_bo_can_allocate(struct task_struct *task,
> +               struct drm_device *dev, size_t size)
> +{
> +       return true;
> +}
>  #endif /* CONFIG_CGROUP_DRM */
>  #endif /* __DRM_CGROUP_H__ */
> diff --git a/include/drm/drm_gem.h b/include/drm/drm_gem.h
> index c95727425284..02854c674b5c 100644
> --- a/include/drm/drm_gem.h
> +++ b/include/drm/drm_gem.h
> @@ -272,6 +272,17 @@ struct drm_gem_object {
>          *
>          */
>         const struct drm_gem_object_funcs *funcs;
> +
> +       /**
> +        * @drmcgrp:
> +        *
> +        * DRM cgroup this GEM object belongs to.
> +        *
> +        * This is used to track and limit the amount of GEM objects a user
> +        * can allocate. Since GEM objects can be shared, this is also used
> +        * to ensure GEM objects are only shared within the same cgroup.
> +        */
> +       struct drmcgrp *drmcgrp;
>  };
>
>  /**
> diff --git a/include/linux/cgroup_drm.h b/include/linux/cgroup_drm.h
> index d7ccf434ca6b..fe14ba7bb1cf 100644
> --- a/include/linux/cgroup_drm.h
> +++ b/include/linux/cgroup_drm.h
> @@ -15,6 +15,9 @@
>
>  struct drmcgrp_device_resource {
>         /* for per device stats */
> +       s64 bo_stats_total_allocated;
> +
> +       s64 bo_limits_total_allocated;
>  };
>
>  struct drmcgrp {
> diff --git a/kernel/cgroup/drm.c b/kernel/cgroup/drm.c
> index f9ef4bf042d8..bc3abff09113 100644
> --- a/kernel/cgroup/drm.c
> +++ b/kernel/cgroup/drm.c
> @@ -15,6 +15,22 @@ static DEFINE_MUTEX(drmcgrp_mutex);
>  struct drmcgrp_device {
>         struct drm_device *dev;
>         struct mutex mutex;
> +
> +       s64 bo_limits_total_allocated_default;
> +};
> +
> +#define DRMCG_CTF_PRIV_SIZE 3
> +#define DRMCG_CTF_PRIV_MASK GENMASK((DRMCG_CTF_PRIV_SIZE - 1), 0)
> +
> +enum drmcgrp_res_type {
> +       DRMCGRP_TYPE_BO_TOTAL,
> +};
> +
> +enum drmcgrp_file_type {
> +       DRMCGRP_FTYPE_STATS,
> +       DRMCGRP_FTYPE_MAX,
> +       DRMCGRP_FTYPE_DEFAULT,
> +       DRMCGRP_FTYPE_HELP,
>  };
>
>  /* indexed by drm_minor for access speed */
> @@ -53,6 +69,10 @@ static inline int init_drmcgrp_single(struct drmcgrp *drmcgrp, int i)
>         }
>
>         /* set defaults here */
> +       if (known_drmcgrp_devs[i] != NULL) {
> +               ddr->bo_limits_total_allocated =
> +                       known_drmcgrp_devs[i]->bo_limits_total_allocated_default;
> +       }
>
>         return 0;
>  }
> @@ -99,7 +119,187 @@ drmcgrp_css_alloc(struct cgroup_subsys_state *parent_css)
>         return &drmcgrp->css;
>  }
>
> +static inline void drmcgrp_print_stats(struct drmcgrp_device_resource *ddr,
> +               struct seq_file *sf, enum drmcgrp_res_type type)
> +{
> +       if (ddr == NULL) {
> +               seq_puts(sf, "\n");
> +               return;
> +       }
> +
> +       switch (type) {
> +       case DRMCGRP_TYPE_BO_TOTAL:
> +               seq_printf(sf, "%lld\n", ddr->bo_stats_total_allocated);
> +               break;
> +       default:
> +               seq_puts(sf, "\n");
> +               break;
> +       }
> +}
> +
> +static inline void drmcgrp_print_limits(struct drmcgrp_device_resource *ddr,
> +               struct seq_file *sf, enum drmcgrp_res_type type)
> +{
> +       if (ddr == NULL) {
> +               seq_puts(sf, "\n");
> +               return;
> +       }
> +
> +       switch (type) {
> +       case DRMCGRP_TYPE_BO_TOTAL:
> +               seq_printf(sf, "%lld\n", ddr->bo_limits_total_allocated);
> +               break;
> +       default:
> +               seq_puts(sf, "\n");
> +               break;
> +       }
> +}
> +
> +static inline void drmcgrp_print_default(struct drmcgrp_device *ddev,
> +               struct seq_file *sf, enum drmcgrp_res_type type)
> +{
> +       if (ddev == NULL) {
> +               seq_puts(sf, "\n");
> +               return;
> +       }
> +
> +       switch (type) {
> +       case DRMCGRP_TYPE_BO_TOTAL:
> +               seq_printf(sf, "%lld\n", ddev->bo_limits_total_allocated_default);
> +               break;
> +       default:
> +               seq_puts(sf, "\n");
> +               break;
> +       }
> +}
> +
> +static inline void drmcgrp_print_help(int cardNum, struct seq_file *sf,
> +               enum drmcgrp_res_type type)
> +{
> +       switch (type) {
> +       case DRMCGRP_TYPE_BO_TOTAL:
> +               seq_printf(sf,
> +                       "Total amount of buffer allocation in bytes for card%d\n",
> +                       cardNum);
> +               break;
> +       default:
> +               seq_puts(sf, "\n");
> +               break;
> +       }
> +}
> +
> +int drmcgrp_bo_show(struct seq_file *sf, void *v)
> +{
> +       struct drmcgrp *drmcgrp = css_drmcgrp(seq_css(sf));
> +       struct drmcgrp_device_resource *ddr = NULL;
> +       enum drmcgrp_file_type f_type = seq_cft(sf)->
> +               private & DRMCG_CTF_PRIV_MASK;
> +       enum drmcgrp_res_type type = seq_cft(sf)->
> +               private >> DRMCG_CTF_PRIV_SIZE;
> +       struct drmcgrp_device *ddev;
> +       int i;
> +
> +       for (i = 0; i <= max_minor; i++) {
> +               ddr = drmcgrp->dev_resources[i];
> +               ddev = known_drmcgrp_devs[i];
> +
> +               switch (f_type) {
> +               case DRMCGRP_FTYPE_STATS:
> +                       drmcgrp_print_stats(ddr, sf, type);
> +                       break;
> +               case DRMCGRP_FTYPE_MAX:
> +                       drmcgrp_print_limits(ddr, sf, type);
> +                       break;
> +               case DRMCGRP_FTYPE_DEFAULT:
> +                       drmcgrp_print_default(ddev, sf, type);
> +                       break;
> +               case DRMCGRP_FTYPE_HELP:
> +                       drmcgrp_print_help(i, sf, type);
> +                       break;
> +               default:
> +                       seq_puts(sf, "\n");
> +                       break;
> +               }
> +       }
> +
> +       return 0;
> +}
> +
> +ssize_t drmcgrp_bo_limit_write(struct kernfs_open_file *of, char *buf,
> +               size_t nbytes, loff_t off)
> +{
> +       struct drmcgrp *drmcgrp = css_drmcgrp(of_css(of));
> +       enum drmcgrp_res_type type = of_cft(of)->private >> DRMCG_CTF_PRIV_SIZE;
> +       char *cft_name = of_cft(of)->name;
> +       char *limits = strstrip(buf);
> +       struct drmcgrp_device_resource *ddr;
> +       char *sval;
> +       s64 val;
> +       int i = 0;
> +       int rc;
> +
> +       while (i <= max_minor && limits != NULL) {
> +               sval = strsep(&limits, "\n");
> +               rc = kstrtoll(sval, 0, &val);
> +
> +               if (rc) {
> +                       pr_err("drmcgrp: %s: minor %d, err %d. ",
> +                               cft_name, i, rc);
> +                       pr_cont_cgroup_name(drmcgrp->css.cgroup);
> +                       pr_cont("\n");
> +               } else {
> +                       ddr = drmcgrp->dev_resources[i];
> +                       switch (type) {
> +                       case DRMCGRP_TYPE_BO_TOTAL:
> +                               if (val < 0) continue;
> +                               ddr->bo_limits_total_allocated = val;
> +                               break;
> +                       default:
> +                               break;
> +                       }
> +               }
> +
> +               i++;
> +       }
> +
> +       if (i <= max_minor) {
> +               pr_err("drmcgrp: %s: less entries than # of drm devices. ",
> +                       cft_name);
> +               pr_cont_cgroup_name(drmcgrp->css.cgroup);
> +               pr_cont("\n");
> +       }
> +
> +       return nbytes;
> +}
> +
>  struct cftype files[] = {
> +       {
> +               .name = "buffer.total.stats",
> +               .seq_show = drmcgrp_bo_show,
> +               .private = (DRMCGRP_TYPE_BO_TOTAL << DRMCG_CTF_PRIV_SIZE) |
> +                       DRMCGRP_FTYPE_STATS,
> +       },
> +       {
> +               .name = "buffer.total.default",
> +               .seq_show = drmcgrp_bo_show,
> +               .flags = CFTYPE_ONLY_ON_ROOT,
> +               .private = (DRMCGRP_TYPE_BO_TOTAL << DRMCG_CTF_PRIV_SIZE) |
> +                       DRMCGRP_FTYPE_DEFAULT,
> +       },
> +       {
> +               .name = "buffer.total.help",
> +               .seq_show = drmcgrp_bo_show,
> +               .flags = CFTYPE_ONLY_ON_ROOT,
> +               .private = (DRMCGRP_TYPE_BO_TOTAL << DRMCG_CTF_PRIV_SIZE) |
> +                       DRMCGRP_FTYPE_HELP,
> +       },
> +       {
> +               .name = "buffer.total.max",
> +               .write = drmcgrp_bo_limit_write,
> +               .seq_show = drmcgrp_bo_show,
> +               .private = (DRMCGRP_TYPE_BO_TOTAL << DRMCG_CTF_PRIV_SIZE) |
> +                       DRMCGRP_FTYPE_MAX,
> +       },
>         { }     /* terminate */
>  };
>
> @@ -122,6 +322,8 @@ int drmcgrp_register_device(struct drm_device *dev)
>                 return -ENOMEM;
>
>         ddev->dev = dev;
> +       ddev->bo_limits_total_allocated_default = S64_MAX;
> +
>         mutex_init(&ddev->mutex);
>
>         mutex_lock(&drmcgrp_mutex);
> @@ -156,3 +358,81 @@ int drmcgrp_unregister_device(struct drm_device *dev)
>         return 0;
>  }
>  EXPORT_SYMBOL(drmcgrp_unregister_device);
> +
> +bool drmcgrp_is_self_or_ancestor(struct drmcgrp *self, struct drmcgrp *relative)
> +{
> +       for (; self != NULL; self = parent_drmcgrp(self))
> +               if (self == relative)
> +                       return true;
> +
> +       return false;
> +}
> +EXPORT_SYMBOL(drmcgrp_is_self_or_ancestor);
> +
> +bool drmcgrp_bo_can_allocate(struct task_struct *task, struct drm_device *dev,
> +               size_t size)
> +{
> +       struct drmcgrp *drmcgrp = get_drmcgrp(task);
> +       struct drmcgrp_device_resource *ddr;
> +       struct drmcgrp_device_resource *d;
> +       int devIdx = dev->primary->index;
> +       bool result = true;
> +       s64 delta = 0;
> +
> +       if (drmcgrp == NULL || drmcgrp == root_drmcgrp)
> +               return true;
> +
> +       ddr = drmcgrp->dev_resources[devIdx];
> +       mutex_lock(&known_drmcgrp_devs[devIdx]->mutex);
> +       for ( ; drmcgrp != root_drmcgrp; drmcgrp = parent_drmcgrp(drmcgrp)) {
> +               d = drmcgrp->dev_resources[devIdx];
> +               delta = d->bo_limits_total_allocated -
> +                       d->bo_stats_total_allocated;
> +
> +               if (delta <= 0 || size > delta) {
> +                       result = false;
> +                       break;
> +               }
> +       }
> +       mutex_unlock(&known_drmcgrp_devs[devIdx]->mutex);
> +
> +       return result;
> +}
> +EXPORT_SYMBOL(drmcgrp_bo_can_allocate);
> +
> +void drmcgrp_chg_bo_alloc(struct drmcgrp *drmcgrp, struct drm_device *dev,
> +               size_t size)
> +{
> +       struct drmcgrp_device_resource *ddr;
> +       int devIdx = dev->primary->index;
> +
> +       if (drmcgrp == NULL || known_drmcgrp_devs[devIdx] == NULL)
> +               return;
> +
> +       mutex_lock(&known_drmcgrp_devs[devIdx]->mutex);
> +       for ( ; drmcgrp != NULL; drmcgrp = parent_drmcgrp(drmcgrp)) {
> +               ddr = drmcgrp->dev_resources[devIdx];
> +
> +               ddr->bo_stats_total_allocated += (s64)size;
> +       }
> +       mutex_unlock(&known_drmcgrp_devs[devIdx]->mutex);
> +}
> +EXPORT_SYMBOL(drmcgrp_chg_bo_alloc);
> +
> +void drmcgrp_unchg_bo_alloc(struct drmcgrp *drmcgrp, struct drm_device *dev,
> +               size_t size)
> +{
> +       struct drmcgrp_device_resource *ddr;
> +       int devIdx = dev->primary->index;
> +
> +       if (drmcgrp == NULL || known_drmcgrp_devs[devIdx] == NULL)
> +               return;
> +
> +       ddr = drmcgrp->dev_resources[devIdx];
> +       mutex_lock(&known_drmcgrp_devs[devIdx]->mutex);
> +       for ( ; drmcgrp != NULL; drmcgrp = parent_drmcgrp(drmcgrp))
> +               drmcgrp->dev_resources[devIdx]->bo_stats_total_allocated
> +                       -= (s64)size;
> +       mutex_unlock(&known_drmcgrp_devs[devIdx]->mutex);
> +}
> +EXPORT_SYMBOL(drmcgrp_unchg_bo_alloc);
On Fri, May 10, 2019 at 8:28 AM Christian König
<ckoenig.leichtzumerken@gmail.com> wrote:
>
> Am 09.05.19 um 23:04 schrieb Kenny Ho:
> > +       /* only allow bo from the same cgroup or its ancestor to be imported */
> > +       if (drmcgrp != NULL &&
> > +                       !drmcgrp_is_self_or_ancestor(drmcgrp, obj->drmcgrp)) {
> > +               ret = -EACCES;
> > +               goto out_unlock;
> > +       }
> > +
>
> This will most likely go up in flames.
>
> If I'm not completely mistaken we already use
> drm_gem_prime_fd_to_handle() to exchange handles between different
> cgroups in current container usages.

This is something that I am interested in getting more details on from
the broader community, because the details affect how likely this will
go up in flames ;).  Note that this check does not block sharing of
handles from a cgroup parent to its children in the hierarchy, nor does
it block sharing of handles within a cgroup.

I am interested to find out, when existing apps share handles between
containers, whether there are any expectations on resource management.
Since there is no drm cgroup in current container usage, I expect the
answer to be no.  In this case, the drm cgroup controller can be
disabled on its own (in the context of cgroup-v2's unified hierarchy),
or the process can remain at the root of the drm cgroup hierarchy (in
the context of cgroup-v1).  If I understand the cgroup api correctly,
that means all processes would be part of the root cgroup as far as the
drm controller is concerned and this block will not come into effect.
I have verified that this is indeed the current default behaviour of a
container runtime (runc, which is used by docker, podman and others):
the new drm cgroup controller is simply ignored and all processes
remain at the root of the hierarchy (since there are no other cgroups).
I plan to make contributions to runc (so folks can actually use this
feature with docker/podman/k8s, etc.) once things have stabilized on
the kernel side.

On the other hand, if there are expectations for resource management
between containers, I would like to know who the expected manager is
and how it fits into the concept of a container (which enforces some
level of isolation).  One possible manager may be the display server.
But as long as the display server is in a parent cgroup of the apps'
cgroup, the apps can still import handles from the display server
under the current implementation.  My understanding is that this is
most likely the case, with the display server simply sitting at the
default/root cgroup.  But I certainly want to hear more about other
use cases (for example, is running multiple display servers on a
single host a realistic possibility?  Are there people running
multiple display servers inside peer containers?  If so, how do they
coordinate resources?)

I should probably summarize some of these into the commit message.

Regards,
Kenny

> Christian.
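[To make the import rule being debated here concrete: the following is a
small, self-contained userspace mock of the patch's
drmcgrp_is_self_or_ancestor() walk.  The struct is a stand-in, not the
kernel's drmcgrp, and the scenario names are illustrative only.]

/* Standalone mock: which imports does the prime fd_to_handle check allow? */
#include <stdbool.h>
#include <stdio.h>

struct mock_cgrp {
        const char *name;
        struct mock_cgrp *parent;
};

/* Mirrors the patch: walk up from the importer looking for the BO owner. */
static bool is_self_or_ancestor(struct mock_cgrp *self,
                                struct mock_cgrp *relative)
{
        for (; self != NULL; self = self->parent)
                if (self == relative)
                        return true;
        return false;
}

int main(void)
{
        struct mock_cgrp root = { "root", NULL };
        struct mock_cgrp parent = { "parent", &root };
        struct mock_cgrp child_a = { "child_a", &parent };
        struct mock_cgrp child_b = { "child_b", &parent };

        /* Importer in child_a, BO owned by parent: allowed (ancestor). */
        printf("child_a imports from parent:  %d\n",
               is_self_or_ancestor(&child_a, &parent));
        /* Importer in child_a, BO owned by sibling child_b: rejected. */
        printf("child_a imports from child_b: %d\n",
               is_self_or_ancestor(&child_a, &child_b));
        /* Everyone at root (the runc default Kenny describes): allowed. */
        printf("root imports from root:       %d\n",
               is_self_or_ancestor(&root, &root));
        return 0;
}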
Am 10.05.19 um 16:57 schrieb Kenny Ho:
> On Fri, May 10, 2019 at 8:28 AM Christian König
> <ckoenig.leichtzumerken@gmail.com> wrote:
>> Am 09.05.19 um 23:04 schrieb Kenny Ho:
>>> +       /* only allow bo from the same cgroup or its ancestor to be imported */
>>> +       if (drmcgrp != NULL &&
>>> +                       !drmcgrp_is_self_or_ancestor(drmcgrp, obj->drmcgrp)) {
>>> +               ret = -EACCES;
>>> +               goto out_unlock;
>>> +       }
>>> +
>> This will most likely go up in flames.
>>
>> If I'm not completely mistaken we already use
>> drm_gem_prime_fd_to_handle() to exchange handles between different
>> cgroups in current container usages.
> This is something that I am interested in getting more details on from
> the broader community, because the details affect how likely this will
> go up in flames ;).  Note that this check does not block sharing of
> handles from a cgroup parent to its children in the hierarchy, nor does
> it block sharing of handles within a cgroup.
>
> I am interested to find out, when existing apps share handles between
> containers, whether there are any expectations on resource management.
> Since there is no drm cgroup in current container usage, I expect the
> answer to be no.  In this case, the drm cgroup controller can be
> disabled on its own (in the context of cgroup-v2's unified hierarchy),
> or the process can remain at the root of the drm cgroup hierarchy (in
> the context of cgroup-v1).  If I understand the cgroup api correctly,
> that means all processes would be part of the root cgroup as far as the
> drm controller is concerned and this block will not come into effect.
> I have verified that this is indeed the current default behaviour of a
> container runtime (runc, which is used by docker, podman and others):
> the new drm cgroup controller is simply ignored and all processes
> remain at the root of the hierarchy (since there are no other cgroups).
> I plan to make contributions to runc (so folks can actually use this
> feature with docker/podman/k8s, etc.) once things have stabilized on
> the kernel side.

So the drm cgroup container is separate to other cgroup containers?

In other words as long as userspace doesn't change, this wouldn't have
any effect?

Well that is unexpected, because then a process would be in different
groups for different controllers.  But if that's really the case, that
would certainly work.

> On the other hand, if there are expectations for resource management
> between containers, I would like to know who the expected manager is
> and how it fits into the concept of a container (which enforces some
> level of isolation).  One possible manager may be the display server.
> But as long as the display server is in a parent cgroup of the apps'
> cgroup, the apps can still import handles from the display server
> under the current implementation.  My understanding is that this is
> most likely the case, with the display server simply sitting at the
> default/root cgroup.  But I certainly want to hear more about other
> use cases (for example, is running multiple display servers on a
> single host a realistic possibility?  Are there people running
> multiple display servers inside peer containers?  If so, how do they
> coordinate resources?)

We definitely have situations with multiple display servers running
(just think of VR).

I just can't say if they currently use cgroups in any way.

Thanks,
Christian.

> I should probably summarize some of these into the commit message.
>
> Regards,
> Kenny
On Fri, May 10, 2019 at 11:08 AM Koenig, Christian
<Christian.Koenig@amd.com> wrote:
> Am 10.05.19 um 16:57 schrieb Kenny Ho:
> > On Fri, May 10, 2019 at 8:28 AM Christian König
> > <ckoenig.leichtzumerken@gmail.com> wrote:
> >> Am 09.05.19 um 23:04 schrieb Kenny Ho:
> So the drm cgroup container is separate to other cgroup containers?

In cgroup-v1, which is most widely deployed currently, all controllers
have their own hierarchy (see /sys/fs/cgroup/).  In cgroup-v2, the
hierarchy is unified, but individual controllers can be disabled (I
believe; I am not super familiar with v2.)

> In other words as long as userspace doesn't change, this wouldn't have
> any effect?

As far as things like docker and podman are concerned, yes.  I am not
sure about the behaviour of others like lxc, lxd, etc. because I
haven't used those myself.

> Well that is unexpected, because then a process would be in different
> groups for different controllers.  But if that's really the case, that
> would certainly work.

I believe this is a possibility for v1 and is why folks came up with
the unified hierarchy in v2 to solve some of the issues:
https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html#issues-with-v1-and-rationales-for-v2

Regards,
Kenny
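[For anyone wanting to check the v1-vs-v2 split described above on a live
system: a process's per-controller membership is visible in
/proc/self/cgroup.  On cgroup-v1 there is one "ID:controller-list:path"
line per mounted hierarchy, so a new drm controller would get its own
line and potentially its own group; a pure cgroup-v2 (unified) setup
shows a single "0::/path" line.  A minimal reader, runnable as-is:]

#include <stdio.h>

int main(void)
{
        char line[512];
        FILE *f = fopen("/proc/self/cgroup", "r");

        if (!f) {
                perror("fopen");
                return 1;
        }
        /* One line per hierarchy on v1; a single "0::/..." line on v2. */
        while (fgets(line, sizeof(line), f))
                fputs(line, stdout);
        fclose(f);
        return 0;
}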
Am 10.05.19 um 17:25 schrieb Kenny Ho:
> On Fri, May 10, 2019 at 11:08 AM Koenig, Christian
> <Christian.Koenig@amd.com> wrote:
>> Am 10.05.19 um 16:57 schrieb Kenny Ho:
>>> On Fri, May 10, 2019 at 8:28 AM Christian König
>>> <ckoenig.leichtzumerken@gmail.com> wrote:
>>>> Am 09.05.19 um 23:04 schrieb Kenny Ho:
>> So the drm cgroup container is separate to other cgroup containers?
> In cgroup-v1, which is most widely deployed currently, all controllers
> have their own hierarchy (see /sys/fs/cgroup/).  In cgroup-v2, the
> hierarchy is unified, but individual controllers can be disabled (I
> believe; I am not super familiar with v2.)
>
>> In other words as long as userspace doesn't change, this wouldn't have
>> any effect?
> As far as things like docker and podman are concerned, yes.  I am not
> sure about the behaviour of others like lxc, lxd, etc. because I
> haven't used those myself.
>
>> Well that is unexpected, because then a process would be in different
>> groups for different controllers.  But if that's really the case, that
>> would certainly work.
> I believe this is a possibility for v1 and is why folks came up with
> the unified hierarchy in v2 to solve some of the issues:
> https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html#issues-with-v1-and-rationales-for-v2

Well another question is why do we want to prevent that in the first
place?

I mean the worst thing that can happen is that we account a BO multiple
times.

And going into the same direction: where is the code to handle an open
device file descriptor which is sent from one cgroup to another?

Regards,
Christian.
On Fri, May 10, 2019 at 1:48 PM Koenig, Christian
<Christian.Koenig@amd.com> wrote:
> Well another question is why do we want to prevent that in the first
> place?
>
> I mean the worst thing that can happen is that we account a BO multiple
> times.

That's one of the problems.  The other one is the BO outliving the
lifetime of a cgroup, and there's no good way to un-charge the usage
when the BO is freed, so the count won't be accurate.

I have looked into two possible solutions.  One is to prevent a cgroup
from being removed when there are BOs owned by the cgroup still alive
(similar to how cgroup removal will fail if it still has processes
attached to it.)  My concern here is the possibility of never being
able to remove a cgroup due to the lifetime of a BO (continuously being
shared, reused and never dying.)  Perhaps you can shed some light on
this possibility.

The other one is to keep track of all the buffers and migrate them to
the parent if a cgroup is closed.  My concern here is the performance
overhead of tracking all the buffers.

> And going into the same direction: where is the code to handle an open
> device file descriptor which is sent from one cgroup to another?

I looked into this before but I forgot what I found.  Perhaps folks
familiar with the device cgroup can chime in.

Actually, I just did another quick search right now.  It looks like
access is enforced at the inode level (__devcgroup_check_permission),
so an fd sent to another cgroup that does not have access to the device
should still not have access.

Regards,
Kenny
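[Kenny's second option above -- migrating buffers to the parent when a
cgroup is removed -- could look roughly like the sketch below.  This is
purely hypothetical: the per-cgroup BO list (bo_list/cg_node) does not
exist in this patch, and maintaining it is exactly the tracking overhead
he mentions.  Locking is elided.  Note that because charging in this
patch is hierarchical, the parent already accounts for the child's
usage, so retiring a cgroup mainly needs to re-point each BO at the
parent rather than transfer counter values.]

/* Hypothetical sketch; css_offline is the standard cgroup callback. */
static void drmcgrp_css_offline(struct cgroup_subsys_state *css)
{
        struct drmcgrp *drmcgrp = css_drmcgrp(css);
        struct drmcgrp *parent = parent_drmcgrp(drmcgrp);
        struct drm_gem_object *obj, *tmp;

        if (parent == NULL)     /* the root cgroup never goes offline */
                return;

        /* bo_list and cg_node are assumed additions, not in this patch. */
        list_for_each_entry_safe(obj, tmp, &drmcgrp->bo_list, cg_node) {
                /* Future uncharges now start the ancestor walk at parent. */
                obj->drmcgrp = parent;
                list_move(&obj->cg_node, &parent->bo_list);
        }
}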
On Fri, May 10, 2019 at 02:50:39PM -0400, Kenny Ho wrote:
> On Fri, May 10, 2019 at 1:48 PM Koenig, Christian
> <Christian.Koenig@amd.com> wrote:
> > Well another question is why do we want to prevent that in the first
> > place?
> >
> > I mean the worst thing that can happen is that we account a BO multiple
> > times.
> That's one of the problems.  The other one is the BO outliving the
> lifetime of a cgroup, and there's no good way to un-charge the usage
> when the BO is freed, so the count won't be accurate.
>
> I have looked into two possible solutions.  One is to prevent a cgroup
> from being removed when there are BOs owned by the cgroup still alive
> (similar to how cgroup removal will fail if it still has processes
> attached to it.)  My concern here is the possibility of never being
> able to remove a cgroup due to the lifetime of a BO (continuously being
> shared, reused and never dying.)  Perhaps you can shed some light on
> this possibility.
>
> The other one is to keep track of all the buffers and migrate them to
> the parent if a cgroup is closed.  My concern here is the performance
> overhead of tracking all the buffers.

My understanding is that other cgroups already use reference counting
to make sure the data structure in the kernel doesn't disappear too
early.  So you can delete the cgroup, but it might not get freed
completely until all the BOs allocated from that cgroup are released.

There's a recent lwn article on how that's not all that awesome for the
memory cgroup controller, and what to do about it:

https://lwn.net/Articles/787614/

We probably want to align with whatever the mem cgroup folks come up
with (so _not_ prevent deletion of the cgroup, since that's different
behaviour).

> > And going into the same direction: where is the code to handle an open
> > device file descriptor which is sent from one cgroup to another?
> I looked into this before but I forgot what I found.  Perhaps folks
> familiar with the device cgroup can chime in.
>
> Actually, I just did another quick search right now.  It looks like
> access is enforced at the inode level (__devcgroup_check_permission),
> so an fd sent to another cgroup that does not have access to the device
> should still not have access.

That's the device cgroup, not the memory accounting stuff.

Imo for memory allocations we should look at what happens when you pass
a tmpfs file around to another cgroup and then extend it there.  I
think those allocations are charged against the cgroup which actually
allocates stuff.

So for drm, if you pass around a device fd, then we always charge ioctl
calls to create a BO against the process doing the ioctl call, not
against the process which originally opened the device fd.  For e.g.
DRI3 that's actually the only reasonable thing to do, since otherwise
we'd charge everything against the Xserver.

-Daniel
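[For reference, the reference-counting approach Daniel describes would
pin the owning cgroup's css from each BO, roughly as sketched below.
css_get()/css_put() are the standard cgroup APIs; the helper names and
call sites are hypothetical stand-ins for drm_gem_private_object_init()
and drm_gem_object_release() in this patch.]

#include <linux/cgroup.h>

/* Sketch: BO takes a reference on the owning cgroup at creation... */
static void drmcgrp_bo_track(struct drm_gem_object *obj,
                             struct drmcgrp *drmcgrp)
{
        obj->drmcgrp = drmcgrp;
        if (drmcgrp)
                css_get(&drmcgrp->css); /* BO now pins the cgroup struct */
}

/* ...and drops it at release, so a deleted cgroup is only freed once
 * the last BO charged to it goes away. */
static void drmcgrp_bo_untrack(struct drm_gem_object *obj)
{
        struct drmcgrp *drmcgrp = obj->drmcgrp;

        if (!drmcgrp)
                return;
        obj->drmcgrp = NULL;
        css_put(&drmcgrp->css); /* may free a removed cgroup's struct */
}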
On 5/9/2019 2:04 PM, Kenny Ho wrote:
> The drm resource being measured and limited here is the GEM buffer
> objects. User applications allocate and free these buffers. In
> addition, a process can allocate a buffer and share it with another
> process. The consumer of a shared buffer can also outlive the
> allocator of the buffer.
>
> For the purpose of cgroup accounting and limiting, ownership of the
> buffer is deemed to be the cgroup to which the allocating process
> belongs. There is one limit per drm device.
>
> In order to prevent the buffer outliving the cgroup that owns it, a
> process is prevented from importing buffers that are not owned by the
> process' cgroup or the ancestors of the process' cgroup.
>
> For this resource, the control files are prefixed with drm.buffer.total.

Overall, this framework looks very good.  But is this a useful resource
to track?  See my question further below at your
drm_gem_private_object_init.

> There are four control file types,
> stats (ro) - display current measured values for a resource
> max (rw) - limits for a resource
> default (ro, root cgroup only) - default values for a resource
> help (ro, root cgroup only) - help string for a resource
>
> Each file is multi-lined with one entry/line per drm device.

Multi-line is correct for multiple devices, but I believe you need to
use a KEY to denote the device for both your set and get routines.  I
didn't see your set functions reading a key, or the get functions
printing the key in the output.
cgroups-v2 conventions mention using a KEY of major:minor, but I think
you can use drm_minor as the key?

> Usage examples:
> // set limit for card1 to 1GB
> sed -i '2s/.*/1073741824/' /sys/fs/cgroup/<cgroup>/drm.buffer.total.max
>
> // set limit for card0 to 512MB
> sed -i '1s/.*/536870912/' /sys/fs/cgroup/<cgroup>/drm.buffer.total.max
>
[...]
> @@ -154,6 +156,9 @@ void drm_gem_private_object_init(struct drm_device *dev,
>         obj->handle_count = 0;
>         obj->size = size;
>         drm_vma_node_reset(&obj->vma_node);
> +
> +       obj->drmcgrp = get_drmcgrp(current);
> +       drmcgrp_chg_bo_alloc(obj->drmcgrp, dev, size);

Why do the charging here?  There is no backing store yet for the
buffer, so this is really tracking something akin to allowed virtual
memory for GEM objects?  Is this really useful for an administrator to
control?  Isn't the resource we want to control actually the physical
backing store?

[...]
> +       while (i <= max_minor && limits != NULL) {
> +               sval = strsep(&limits, "\n");
> +               rc = kstrtoll(sval, 0, &val);

Input should be "KEY VALUE", so KEY will determine the device to apply
this to.
Also, per the cgroups-v2 documentation of limits, I believe you need to
parse and handle the special "max" input value.
parse_resources() in the rdma controller is an example for both of the
above.

[...]
> +void drmcgrp_chg_bo_alloc(struct drmcgrp *drmcgrp, struct drm_device *dev,
> +               size_t size)

Shouldn't this return an error and be implemented with the same
semantics as the try_charge() functions of other controllers?
Below will allow stats_total_allocated to overrun
limits_total_allocated.
> +{ > + struct drmcgrp_device_resource *ddr; > + int devIdx = dev->primary->index; > + > + if (drmcgrp == NULL || known_drmcgrp_devs[devIdx] == NULL) > + return; > + > + mutex_lock(&known_drmcgrp_devs[devIdx]->mutex); > + for ( ; drmcgrp != NULL; drmcgrp = parent_drmcgrp(drmcgrp)) { > + ddr = drmcgrp->dev_resources[devIdx]; > + > + ddr->bo_stats_total_allocated += (s64)size; > + } > + mutex_unlock(&known_drmcgrp_devs[devIdx]->mutex); > +} > +EXPORT_SYMBOL(drmcgrp_chg_bo_alloc); > + > +void drmcgrp_unchg_bo_alloc(struct drmcgrp *drmcgrp, struct drm_device *dev, > + size_t size) > +{ > + struct drmcgrp_device_resource *ddr; > + int devIdx = dev->primary->index; > + > + if (drmcgrp == NULL || known_drmcgrp_devs[devIdx] == NULL) > + return; > + > + ddr = drmcgrp->dev_resources[devIdx]; > + mutex_lock(&known_drmcgrp_devs[devIdx]->mutex); > + for ( ; drmcgrp != NULL; drmcgrp = parent_drmcgrp(drmcgrp)) > + drmcgrp->dev_resources[devIdx]->bo_stats_total_allocated > + -= (s64)size; > + mutex_unlock(&known_drmcgrp_devs[devIdx]->mutex); > +} > +EXPORT_SYMBOL(drmcgrp_unchg_bo_alloc); >
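A minimal sketch of the interface Brian suggests above: one "KEY VALUE" entry per line, with the drm minor as KEY and the special value "max" accepted, in the style of parse_resources() in kernel/cgroup/rdma.c. This is not code from the posted series; the helper name is hypothetical, and keeping a bare minor (rather than a major:minor pair) as the key is an assumption carried over from the patch.

#include <linux/kernel.h>
#include <linux/string.h>

/* Hypothetical sketch, not from the posted patch: parse one
 * "MINOR VALUE" line, where VALUE may be the special string "max". */
static int drmcgrp_parse_one_limit(char *line, int *minor, s64 *val)
{
	char *key = strsep(&line, " ");
	char *sval = strsep(&line, " ");

	if (key == NULL || sval == NULL)
		return -EINVAL;

	/* KEY selects the drm device the limit applies to */
	if (kstrtoint(key, 10, minor))
		return -EINVAL;

	/* per cgroup-v2 convention, "max" means no limit */
	if (!strcmp(sval, "max")) {
		*val = S64_MAX;
		return 0;
	}

	return kstrtoll(sval, 0, val);
}

The write handler would then look the device up by the parsed key instead of assuming line position equals minor, which also sidesteps the minor-ordering concern raised later in the thread.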
On Wed, May 15, 2019 at 5:26 PM Welty, Brian <brian.welty@intel.com> wrote: > On 5/9/2019 2:04 PM, Kenny Ho wrote: > > There are four control file types, > > stats (ro) - display current measured values for a resource > > max (rw) - limits for a resource > > default (ro, root cgroup only) - default values for a resource > > help (ro, root cgroup only) - help string for a resource > > > > Each file is multi-lined with one entry/line per drm device. > > Multi-line is correct for multiple devices, but I believe you need > to use a KEY to denote device for both your set and get routines. > I didn't see your set functions reading a key, or the get functions > printing the key in output. > cgroups-v2 conventions mention using KEY of major:minor, but I think > you can use drm_minor as key? Given this controller is specific to the drm kernel subsystem, which uses the minor to identify drm devices, I don't see a need to complicate the interfaces more by having a major number and a key. As you can see in the examples below, the drm device minor corresponds to the line number. I am not sure how strict cgroup upstream is about the convention, but I am hoping there is flexibility here to allow for what I have implemented. There are a couple of other things I have done that are not described in the convention: 1) inclusion of a read-only *.help file at the root cgroup, 2) use of a read-only (which I can potentially make rw) *.default file instead of having default entries (since the default can be different for different devices) inside the control files (this way, the resetting of cgroup values for all the drm devices can be done by a simple 'cp'.) > > Usage examples: > > // set limit for card1 to 1GB > > sed -i '2s/.*/1073741824/' /sys/fs/cgroup/<cgroup>/drm.buffer.total.max > > > > // set limit for card0 to 512MB > > sed -i '1s/.*/536870912/' /sys/fs/cgroup/<cgroup>/drm.buffer.total.max > > /** @file drm_gem.c > > @@ -154,6 +156,9 @@ void drm_gem_private_object_init(struct drm_device *dev, > > obj->handle_count = 0; > > obj->size = size; > > drm_vma_node_reset(&obj->vma_node); > > + > > + obj->drmcgrp = get_drmcgrp(current); > > + drmcgrp_chg_bo_alloc(obj->drmcgrp, dev, size); > > Why do the charging here? > There is no backing store yet for the buffer, so this is really tracking something akin to allowed virtual memory for GEM objects? > Is this really useful for an administrator to control? > Isn't the resource we want to control actually the physical backing store? That's correct. This is just the first level of control, since the backing store can be backed by different types of memory. I am in the process of adding at least two more resources. Stay tuned. I am doing the charge here to enforce the idea of "creator is deemed owner" at a place where the code is shared by all (the init function). > > + while (i <= max_minor && limits != NULL) { > > + sval = strsep(&limits, "\n"); > > + rc = kstrtoll(sval, 0, &val); > > Input should be "KEY VALUE", so KEY will determine device to apply this to. > Also, per cgroups-v2 documentation of limits, I believe need to parse and handle the special "max" input value. > > parse_resources() in rdma controller is example for both of above. Please see my previous reply for the rationale behind my hope of not needing a key. I can certainly add handling of "max" and "default". 
> > +void drmcgrp_chg_bo_alloc(struct drmcgrp *drmcgrp, struct drm_device *dev, > > + size_t size) > > Shouldn't this return an error and be implemented with same semantics as the > try_charge() functions of other controllers? > Below will allow stats_total_allocated to overrun limits_total_allocated. This is because I am charging the buffer at its init, which does not fail, so the "try" (drmcgrp_bo_can_allocate) is separate and placed earlier, nearer the other conditions where gem object allocation may fail. In other words, there are multiple reasons why gem allocation may fail (the cgroup limit being one of them), and satisfying the cgroup limit does not mean a charge is needed. I can certainly combine the two functions to have an additional try_charge semantic as well if that is really needed. Regards, Kenny
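Kenny's closing offer (folding the check and the charge into one call) could look roughly like the sketch below. This is only an illustration built from the symbols in the posted patch, with a hypothetical function name, and it walks all the way to the hierarchy root for brevity. Doing the limit check and the charge under the same per-device mutex is what closes the window in which bo_stats_total_allocated can overrun bo_limits_total_allocated.

/* Illustrative sketch only, not from the posted series: a combined
 * try_charge in the style of other cgroup controllers, reusing the
 * patch's symbols (known_drmcgrp_devs, parent_drmcgrp, ...). */
static int drmcgrp_try_chg_bo_alloc(struct drmcgrp *drmcgrp,
				    struct drm_device *dev, size_t size)
{
	struct drmcgrp_device_resource *ddr;
	int devIdx = dev->primary->index;
	struct drmcgrp *cg;
	int ret = 0;

	if (drmcgrp == NULL || known_drmcgrp_devs[devIdx] == NULL)
		return 0;

	mutex_lock(&known_drmcgrp_devs[devIdx]->mutex);

	/* first pass: would any level of the hierarchy exceed its limit? */
	for (cg = drmcgrp; cg != NULL; cg = parent_drmcgrp(cg)) {
		ddr = cg->dev_resources[devIdx];
		if (ddr->bo_stats_total_allocated + (s64)size >
				ddr->bo_limits_total_allocated) {
			ret = -ENOMEM;
			goto out;
		}
	}

	/* second pass: charge every level in the same critical section,
	 * so stats can never overrun limits */
	for (cg = drmcgrp; cg != NULL; cg = parent_drmcgrp(cg))
		cg->dev_resources[devIdx]->bo_stats_total_allocated += (s64)size;
out:
	mutex_unlock(&known_drmcgrp_devs[devIdx]->mutex);
	return ret;
}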
Am 16.05.19 um 04:29 schrieb Kenny Ho: > [CAUTION: External Email] > > On Wed, May 15, 2019 at 5:26 PM Welty, Brian <brian.welty@intel.com> wrote: >> On 5/9/2019 2:04 PM, Kenny Ho wrote: >>> There are four control file types, >>> stats (ro) - display current measured values for a resource >>> max (rw) - limits for a resource >>> default (ro, root cgroup only) - default values for a resource >>> help (ro, root cgroup only) - help string for a resource >>> >>> Each file is multi-lined with one entry/line per drm device. >> Multi-line is correct for multiple devices, but I believe you need >> to use a KEY to denote device for both your set and get routines. >> I didn't see your set functions reading a key, or the get functions >> printing the key in output. >> cgroups-v2 conventions mention using KEY of major:minor, but I think >> you can use drm_minor as key? > Given this controller is specific to the drm kernel subsystem which > uses minor to identify drm device, Wait a second, using the DRM minor is a good idea in the first place. I have a test system with a Vega10 and a Vega20. Which device gets which minor is not stable, but rather defined by the scan order of the PCIe bus. Normally the scan order is always the same, but adding or removing devices or delaying things just a little bit during init is enough to change this. We need something like the Linux sysfs location or similar to have a stable implementation. Regards, Christian. > I don't see a need to complicate > the interfaces more by having major and a key. As you can see in the > examples below, the drm device minor corresponds to the line number. > I am not sure how strict cgroup upstream is about the convention but I > am hoping there are flexibility here to allow for what I have > implemented. There are a couple of other things I have done that is > not described in the convention: 1) inclusion of read-only *.help file > at the root cgroup, 2) use read-only (which I can potentially make rw) > *.default file instead of having a default entries (since the default > can be different for different devices) inside the control files (this > way, the resetting of cgroup values for all the drm devices, can be > done by a simple 'cp'.) > >>> Usage examples: >>> // set limit for card1 to 1GB >>> sed -i '2s/.*/1073741824/' /sys/fs/cgroup/<cgroup>/drm.buffer.total.max >>> >>> // set limit for card0 to 512MB >>> sed -i '1s/.*/536870912/' /sys/fs/cgroup/<cgroup>/drm.buffer.total.max > >>> /** @file drm_gem.c >>> @@ -154,6 +156,9 @@ void drm_gem_private_object_init(struct drm_device *dev, >>> obj->handle_count = 0; >>> obj->size = size; >>> drm_vma_node_reset(&obj->vma_node); >>> + >>> + obj->drmcgrp = get_drmcgrp(current); >>> + drmcgrp_chg_bo_alloc(obj->drmcgrp, dev, size); >> Why do the charging here? >> There is no backing store yet for the buffer, so this is really tracking something akin to allowed virtual memory for GEM objects? >> Is this really useful for an administrator to control? >> Isn't the resource we want to control actually the physical backing store? > That's correct. This is just the first level of control since the > backing store can be backed by different type of memory. I am in the > process of adding at least two more resources. Stay tuned. I am > doing the charge here to enforce the idea of "creator is deemed owner" > at a place where the code is shared by all (the init function.) 
> >>> + while (i <= max_minor && limits != NULL) { >>> + sval = strsep(&limits, "\n"); >>> + rc = kstrtoll(sval, 0, &val); >> Input should be "KEY VALUE", so KEY will determine device to apply this to. >> Also, per cgroups-v2 documentation of limits, I believe need to parse and handle the special "max" input value. >> >> parse_resources() in rdma controller is example for both of above. > Please see my previous reply for the rationale of my hope to not need > a key. I can certainly add handling of "max" and "default". > > >>> +void drmcgrp_chg_bo_alloc(struct drmcgrp *drmcgrp, struct drm_device *dev, >>> + size_t size) >> Shouldn't this return an error and be implemented with same semantics as the >> try_charge() functions of other controllers? >> Below will allow stats_total_allocated to overrun limits_total_allocated. > This is because I am charging the buffer at the init of the buffer > which does not fail so the "try" (drmcgrp_bo_can_allocate) is separate > and placed earlier and nearer other condition where gem object > allocation may fail. In other words, there are multiple possibilities > for which gem allocation may fail (cgroup limit being one of them) and > satisfying cgroup limit does not mean a charge is needed. I can > certainly combine the two functions to have an additional try_charge > semantic as well if that is really needed. > > Regards, > Kenny
Am 16.05.19 um 09:16 schrieb Koenig, Christian: > Am 16.05.19 um 04:29 schrieb Kenny Ho: >> [CAUTION: External Email] >> >> On Wed, May 15, 2019 at 5:26 PM Welty, Brian <brian.welty@intel.com> wrote: >>> On 5/9/2019 2:04 PM, Kenny Ho wrote: >>>> There are four control file types, >>>> stats (ro) - display current measured values for a resource >>>> max (rw) - limits for a resource >>>> default (ro, root cgroup only) - default values for a resource >>>> help (ro, root cgroup only) - help string for a resource >>>> >>>> Each file is multi-lined with one entry/line per drm device. >>> Multi-line is correct for multiple devices, but I believe you need >>> to use a KEY to denote device for both your set and get routines. >>> I didn't see your set functions reading a key, or the get functions >>> printing the key in output. >>> cgroups-v2 conventions mention using KEY of major:minor, but I think >>> you can use drm_minor as key? >> Given this controller is specific to the drm kernel subsystem which >> uses minor to identify drm device, > Wait a second, using the DRM minor is a good idea in the first place. Well that should have read "is not a good idea".. Christian. > > I have a test system with a Vega10 and a Vega20. Which device gets which > minor is not stable, but rather defined by the scan order of the PCIe bus. > > Normally the scan order is always the same, but adding or removing > devices or delaying things just a little bit during init is enough to > change this. > > We need something like the Linux sysfs location or similar to have a > stable implementation. > > Regards, > Christian. > >> I don't see a need to complicate >> the interfaces more by having major and a key. As you can see in the >> examples below, the drm device minor corresponds to the line number. >> I am not sure how strict cgroup upstream is about the convention but I >> am hoping there are flexibility here to allow for what I have >> implemented. There are a couple of other things I have done that is >> not described in the convention: 1) inclusion of read-only *.help file >> at the root cgroup, 2) use read-only (which I can potentially make rw) >> *.default file instead of having a default entries (since the default >> can be different for different devices) inside the control files (this >> way, the resetting of cgroup values for all the drm devices, can be >> done by a simple 'cp'.) >> >>>> Usage examples: >>>> // set limit for card1 to 1GB >>>> sed -i '2s/.*/1073741824/' /sys/fs/cgroup/<cgroup>/drm.buffer.total.max >>>> >>>> // set limit for card0 to 512MB >>>> sed -i '1s/.*/536870912/' /sys/fs/cgroup/<cgroup>/drm.buffer.total.max >>>> /** @file drm_gem.c >>>> @@ -154,6 +156,9 @@ void drm_gem_private_object_init(struct drm_device *dev, >>>> obj->handle_count = 0; >>>> obj->size = size; >>>> drm_vma_node_reset(&obj->vma_node); >>>> + >>>> + obj->drmcgrp = get_drmcgrp(current); >>>> + drmcgrp_chg_bo_alloc(obj->drmcgrp, dev, size); >>> Why do the charging here? >>> There is no backing store yet for the buffer, so this is really tracking something akin to allowed virtual memory for GEM objects? >>> Is this really useful for an administrator to control? >>> Isn't the resource we want to control actually the physical backing store? >> That's correct. This is just the first level of control since the >> backing store can be backed by different type of memory. I am in the >> process of adding at least two more resources. Stay tuned. 
I am >> doing the charge here to enforce the idea of "creator is deemed owner" >> at a place where the code is shared by all (the init function.) >> >>>> + while (i <= max_minor && limits != NULL) { >>>> + sval = strsep(&limits, "\n"); >>>> + rc = kstrtoll(sval, 0, &val); >>> Input should be "KEY VALUE", so KEY will determine device to apply this to. >>> Also, per cgroups-v2 documentation of limits, I believe need to parse and handle the special "max" input value. >>> >>> parse_resources() in rdma controller is example for both of above. >> Please see my previous reply for the rationale of my hope to not need >> a key. I can certainly add handling of "max" and "default". >> >> >>>> +void drmcgrp_chg_bo_alloc(struct drmcgrp *drmcgrp, struct drm_device *dev, >>>> + size_t size) >>> Shouldn't this return an error and be implemented with same semantics as the >>> try_charge() functions of other controllers? >>> Below will allow stats_total_allocated to overrun limits_total_allocated. >> This is because I am charging the buffer at the init of the buffer >> which does not fail so the "try" (drmcgrp_bo_can_allocate) is separate >> and placed earlier and nearer other condition where gem object >> allocation may fail. In other words, there are multiple possibilities >> for which gem allocation may fail (cgroup limit being one of them) and >> satisfying cgroup limit does not mean a charge is needed. I can >> certainly combine the two functions to have an additional try_charge >> semantic as well if that is really needed. >> >> Regards, >> Kenny > _______________________________________________ > amd-gfx mailing list > amd-gfx@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/amd-gfx
On Thu, May 16, 2019 at 09:25:31AM +0200, Christian König wrote: > Am 16.05.19 um 09:16 schrieb Koenig, Christian: > > Am 16.05.19 um 04:29 schrieb Kenny Ho: > > > [CAUTION: External Email] > > > > > > On Wed, May 15, 2019 at 5:26 PM Welty, Brian <brian.welty@intel.com> wrote: > > > > On 5/9/2019 2:04 PM, Kenny Ho wrote: > > > > > There are four control file types, > > > > > stats (ro) - display current measured values for a resource > > > > > max (rw) - limits for a resource > > > > > default (ro, root cgroup only) - default values for a resource > > > > > help (ro, root cgroup only) - help string for a resource > > > > > > > > > > Each file is multi-lined with one entry/line per drm device. > > > > Multi-line is correct for multiple devices, but I believe you need > > > > to use a KEY to denote device for both your set and get routines. > > > > I didn't see your set functions reading a key, or the get functions > > > > printing the key in output. > > > > cgroups-v2 conventions mention using KEY of major:minor, but I think > > > > you can use drm_minor as key? > > > Given this controller is specific to the drm kernel subsystem which > > > uses minor to identify drm device, > > Wait a second, using the DRM minor is a good idea in the first place. > > Well that should have read "is not a good idea".. What else should we use? > > Christian. > > > > > I have a test system with a Vega10 and a Vega20. Which device gets which > > minor is not stable, but rather defined by the scan order of the PCIe bus. > > > > Normally the scan order is always the same, but adding or removing > > devices or delaying things just a little bit during init is enough to > > change this. > > > > We need something like the Linux sysfs location or similar to have a > > stable implementation. You can go from sysfs location to drm class directory (in sysfs) and back. That means if you care you need to walk sysfs yourself a bit, but using the drm minor isn't a blocker itself. One downside with the drm minor is that it's pretty good nonsense once you have more than 64 gpus though, due to how we space render and legacy nodes in the minor ids :-) -Daniel > > > > Regards, > > Christian. > > > > > I don't see a need to complicate > > > the interfaces more by having major and a key. As you can see in the > > > examples below, the drm device minor corresponds to the line number. > > > I am not sure how strict cgroup upstream is about the convention but I > > > am hoping there are flexibility here to allow for what I have > > > implemented. There are a couple of other things I have done that is > > > not described in the convention: 1) inclusion of read-only *.help file > > > at the root cgroup, 2) use read-only (which I can potentially make rw) > > > *.default file instead of having a default entries (since the default > > > can be different for different devices) inside the control files (this > > > way, the resetting of cgroup values for all the drm devices, can be > > > done by a simple 'cp'.) 
> > > > > > > > Usage examples: > > > > > // set limit for card1 to 1GB > > > > > sed -i '2s/.*/1073741824/' /sys/fs/cgroup/<cgroup>/drm.buffer.total.max > > > > > > > > > > // set limit for card0 to 512MB > > > > > sed -i '1s/.*/536870912/' /sys/fs/cgroup/<cgroup>/drm.buffer.total.max > > > > > /** @file drm_gem.c > > > > > @@ -154,6 +156,9 @@ void drm_gem_private_object_init(struct drm_device *dev, > > > > > obj->handle_count = 0; > > > > > obj->size = size; > > > > > drm_vma_node_reset(&obj->vma_node); > > > > > + > > > > > + obj->drmcgrp = get_drmcgrp(current); > > > > > + drmcgrp_chg_bo_alloc(obj->drmcgrp, dev, size); > > > > Why do the charging here? > > > > There is no backing store yet for the buffer, so this is really tracking something akin to allowed virtual memory for GEM objects? > > > > Is this really useful for an administrator to control? > > > > Isn't the resource we want to control actually the physical backing store? > > > That's correct. This is just the first level of control since the > > > backing store can be backed by different type of memory. I am in the > > > process of adding at least two more resources. Stay tuned. I am > > > doing the charge here to enforce the idea of "creator is deemed owner" > > > at a place where the code is shared by all (the init function.) > > > > > > > > + while (i <= max_minor && limits != NULL) { > > > > > + sval = strsep(&limits, "\n"); > > > > > + rc = kstrtoll(sval, 0, &val); > > > > Input should be "KEY VALUE", so KEY will determine device to apply this to. > > > > Also, per cgroups-v2 documentation of limits, I believe need to parse and handle the special "max" input value. > > > > > > > > parse_resources() in rdma controller is example for both of above. > > > Please see my previous reply for the rationale of my hope to not need > > > a key. I can certainly add handling of "max" and "default". > > > > > > > > > > > +void drmcgrp_chg_bo_alloc(struct drmcgrp *drmcgrp, struct drm_device *dev, > > > > > + size_t size) > > > > Shouldn't this return an error and be implemented with same semantics as the > > > > try_charge() functions of other controllers? > > > > Below will allow stats_total_allocated to overrun limits_total_allocated. > > > This is because I am charging the buffer at the init of the buffer > > > which does not fail so the "try" (drmcgrp_bo_can_allocate) is separate > > > and placed earlier and nearer other condition where gem object > > > allocation may fail. In other words, there are multiple possibilities > > > for which gem allocation may fail (cgroup limit being one of them) and > > > satisfying cgroup limit does not mean a charge is needed. I can > > > certainly combine the two functions to have an additional try_charge > > > semantic as well if that is really needed. > > > > > > Regards, > > > Kenny > > _______________________________________________ > > amd-gfx mailing list > > amd-gfx@lists.freedesktop.org > > https://lists.freedesktop.org/mailman/listinfo/amd-gfx > > _______________________________________________ > dri-devel mailing list > dri-devel@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/dri-devel
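Daniel's sysfs point can be made concrete with a small userspace sketch (illustrative only, not part of the series): /sys/class/drm/cardN/device is a symlink to the underlying bus device, so a drm minor resolves back to a stable location such as a PCI address without any kernel changes.

/* Illustrative userspace sketch: resolve a drm card index to the
 * stable sysfs path of its bus device (e.g. .../0000:0b:00.0).
 * card0 is an assumption for the example. */
#include <stdio.h>
#include <stdlib.h>
#include <limits.h>

int main(void)
{
	char link[PATH_MAX];
	char *resolved;

	snprintf(link, sizeof(link), "/sys/class/drm/card%d/device", 0);

	resolved = realpath(link, NULL);  /* follows the symlink */
	if (!resolved) {
		perror("realpath");
		return 1;
	}
	printf("card0 -> %s\n", resolved);
	free(resolved);
	return 0;
}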
On Thu, May 16, 2019 at 3:25 AM Christian König <ckoenig.leichtzumerken@gmail.com> wrote: > Am 16.05.19 um 09:16 schrieb Koenig, Christian: > > Am 16.05.19 um 04:29 schrieb Kenny Ho: > >> On Wed, May 15, 2019 at 5:26 PM Welty, Brian <brian.welty@intel.com> wrote: > >>> On 5/9/2019 2:04 PM, Kenny Ho wrote: > >>>> Each file is multi-lined with one entry/line per drm device. > >>> Multi-line is correct for multiple devices, but I believe you need > >>> to use a KEY to denote device for both your set and get routines. > >>> I didn't see your set functions reading a key, or the get functions > >>> printing the key in output. > >>> cgroups-v2 conventions mention using KEY of major:minor, but I think > >>> you can use drm_minor as key? > >> Given this controller is specific to the drm kernel subsystem which > >> uses minor to identify drm device, > > Wait a second, using the DRM minor is a good idea in the first place. > Well that should have read "is not a good idea".. > > I have a test system with a Vega10 and a Vega20. Which device gets which > minor is not stable, but rather defined by the scan order of the PCIe bus. > > Normally the scan order is always the same, but adding or removing > devices or delaying things just a little bit during init is enough to > change this. > > We need something like the Linux sysfs location or similar to have a > stable implementation. I get that, which is why I don't use the minor to identify cards in the user-space apps I wrote: https://github.com/RadeonOpenCompute/k8s-device-plugin/blob/c2659c9d1d0713cad36fb5256681125121e6e32f/internal/pkg/amdgpu/amdgpu.go#L85 But within the kernel, I think my use of minor is consistent with the rest of the drm subsystem. I hope I don't need to reform the way the drm subsystem uses minors in order to introduce a cgroup controller. Regards, Kenny
Am 16.05.19 um 14:28 schrieb Daniel Vetter: > [CAUTION: External Email] > > On Thu, May 16, 2019 at 09:25:31AM +0200, Christian König wrote: >> Am 16.05.19 um 09:16 schrieb Koenig, Christian: >>> Am 16.05.19 um 04:29 schrieb Kenny Ho: >>>> [CAUTION: External Email] >>>> >>>> On Wed, May 15, 2019 at 5:26 PM Welty, Brian <brian.welty@intel.com> wrote: >>>>> On 5/9/2019 2:04 PM, Kenny Ho wrote: >>>>>> There are four control file types, >>>>>> stats (ro) - display current measured values for a resource >>>>>> max (rw) - limits for a resource >>>>>> default (ro, root cgroup only) - default values for a resource >>>>>> help (ro, root cgroup only) - help string for a resource >>>>>> >>>>>> Each file is multi-lined with one entry/line per drm device. >>>>> Multi-line is correct for multiple devices, but I believe you need >>>>> to use a KEY to denote device for both your set and get routines. >>>>> I didn't see your set functions reading a key, or the get functions >>>>> printing the key in output. >>>>> cgroups-v2 conventions mention using KEY of major:minor, but I think >>>>> you can use drm_minor as key? >>>> Given this controller is specific to the drm kernel subsystem which >>>> uses minor to identify drm device, >>> Wait a second, using the DRM minor is a good idea in the first place. >> Well that should have read "is not a good idea".. > What else should we use? Well, what does udev use to identify a device, for example? >> Christian. >> >>> I have a test system with a Vega10 and a Vega20. Which device gets which >>> minor is not stable, but rather defined by the scan order of the PCIe bus. >>> >>> Normally the scan order is always the same, but adding or removing >>> devices or delaying things just a little bit during init is enough to >>> change this. >>> >>> We need something like the Linux sysfs location or similar to have a >>> stable implementation. > You can go from sysfs location to drm class directory (in sysfs) and back. > That means if you care you need to walk sysfs yourself a bit, but using > the drm minor isn't a blocker itself. Yeah, agreed that userspace could do this. But I think if there is an off-hand alternative we should use this instead. > One downside with the drm minor is that it's pretty good nonsense once you > have more than 64 gpus though, due to how we space render and legacy nodes > in the minor ids :-) Ok, another good reason to at least not use the minor=linenum approach. Christian. > -Daniel >>> Regards, >>> Christian. >>> >>>> I don't see a need to complicate >>>> the interfaces more by having major and a key. As you can see in the >>>> examples below, the drm device minor corresponds to the line number. >>>> I am not sure how strict cgroup upstream is about the convention but I >>>> am hoping there are flexibility here to allow for what I have >>>> implemented. There are a couple of other things I have done that is >>>> not described in the convention: 1) inclusion of read-only *.help file >>>> at the root cgroup, 2) use read-only (which I can potentially make rw) >>>> *.default file instead of having a default entries (since the default >>>> can be different for different devices) inside the control files (this >>>> way, the resetting of cgroup values for all the drm devices, can be >>>> done by a simple 'cp'.) 
>>>> >>>>>> Usage examples: >>>>>> // set limit for card1 to 1GB >>>>>> sed -i '2s/.*/1073741824/' /sys/fs/cgroup/<cgroup>/drm.buffer.total.max >>>>>> >>>>>> // set limit for card0 to 512MB >>>>>> sed -i '1s/.*/536870912/' /sys/fs/cgroup/<cgroup>/drm.buffer.total.max >>>>>> /** @file drm_gem.c >>>>>> @@ -154,6 +156,9 @@ void drm_gem_private_object_init(struct drm_device *dev, >>>>>> obj->handle_count = 0; >>>>>> obj->size = size; >>>>>> drm_vma_node_reset(&obj->vma_node); >>>>>> + >>>>>> + obj->drmcgrp = get_drmcgrp(current); >>>>>> + drmcgrp_chg_bo_alloc(obj->drmcgrp, dev, size); >>>>> Why do the charging here? >>>>> There is no backing store yet for the buffer, so this is really tracking something akin to allowed virtual memory for GEM objects? >>>>> Is this really useful for an administrator to control? >>>>> Isn't the resource we want to control actually the physical backing store? >>>> That's correct. This is just the first level of control since the >>>> backing store can be backed by different type of memory. I am in the >>>> process of adding at least two more resources. Stay tuned. I am >>>> doing the charge here to enforce the idea of "creator is deemed owner" >>>> at a place where the code is shared by all (the init function.) >>>> >>>>>> + while (i <= max_minor && limits != NULL) { >>>>>> + sval = strsep(&limits, "\n"); >>>>>> + rc = kstrtoll(sval, 0, &val); >>>>> Input should be "KEY VALUE", so KEY will determine device to apply this to. >>>>> Also, per cgroups-v2 documentation of limits, I believe need to parse and handle the special "max" input value. >>>>> >>>>> parse_resources() in rdma controller is example for both of above. >>>> Please see my previous reply for the rationale of my hope to not need >>>> a key. I can certainly add handling of "max" and "default". >>>> >>>> >>>>>> +void drmcgrp_chg_bo_alloc(struct drmcgrp *drmcgrp, struct drm_device *dev, >>>>>> + size_t size) >>>>> Shouldn't this return an error and be implemented with same semantics as the >>>>> try_charge() functions of other controllers? >>>>> Below will allow stats_total_allocated to overrun limits_total_allocated. >>>> This is because I am charging the buffer at the init of the buffer >>>> which does not fail so the "try" (drmcgrp_bo_can_allocate) is separate >>>> and placed earlier and nearer other condition where gem object >>>> allocation may fail. In other words, there are multiple possibilities >>>> for which gem allocation may fail (cgroup limit being one of them) and >>>> satisfying cgroup limit does not mean a charge is needed. I can >>>> certainly combine the two functions to have an additional try_charge >>>> semantic as well if that is really needed. >>>> >>>> Regards, >>>> Kenny >>> _______________________________________________ >>> amd-gfx mailing list >>> amd-gfx@lists.freedesktop.org >>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx >> _______________________________________________ >> dri-devel mailing list >> dri-devel@lists.freedesktop.org >> https://lists.freedesktop.org/mailman/listinfo/dri-devel > -- > Daniel Vetter > Software Engineer, Intel Corporation > http://blog.ffwll.ch
Hello, I haven't gone through the patchset yet but some quick comments. On Wed, May 15, 2019 at 10:29:21PM -0400, Kenny Ho wrote: > Given this controller is specific to the drm kernel subsystem which > uses minor to identify drm device, I don't see a need to complicate > the interfaces more by having major and a key. As you can see in the > examples below, the drm device minor corresponds to the line number. > I am not sure how strict cgroup upstream is about the convention but I We're pretty strict. > am hoping there are flexibility here to allow for what I have > implemented. There are a couple of other things I have done that is So, please follow the interface conventions. We can definitely add new ones but that would need functional reasons. > not described in the convention: 1) inclusion of read-only *.help file > at the root cgroup, 2) use read-only (which I can potentially make rw) > *.default file instead of having a default entries (since the default > can be different for different devices) inside the control files (this > way, the resetting of cgroup values for all the drm devices, can be > done by a simple 'cp'.) Again, please follow the existing conventions. There's a lot more harm than good in every controller being creative in their own way. It's trivial to build convenience features in userspace. Please do it there. > > Is this really useful for an administrator to control? > > Isn't the resource we want to control actually the physical backing store? > That's correct. This is just the first level of control since the > backing store can be backed by different type of memory. I am in the > process of adding at least two more resources. Stay tuned. I am > doing the charge here to enforce the idea of "creator is deemed owner" > at a place where the code is shared by all (the init function.) Ideally, a controller should only control hard resources whose impact on behavior and performance is immediately visible to users. Thanks.
Am 16.05.19 um 16:03 schrieb Kenny Ho: > On Thu, May 16, 2019 at 3:25 AM Christian König > <ckoenig.leichtzumerken@gmail.com> wrote: >> Am 16.05.19 um 09:16 schrieb Koenig, Christian: >>> Am 16.05.19 um 04:29 schrieb Kenny Ho: >>>> On Wed, May 15, 2019 at 5:26 PM Welty, Brian <brian.welty@intel.com> wrote: >>>>> On 5/9/2019 2:04 PM, Kenny Ho wrote: >>>>>> Each file is multi-lined with one entry/line per drm device. >>>>> Multi-line is correct for multiple devices, but I believe you need >>>>> to use a KEY to denote device for both your set and get routines. >>>>> I didn't see your set functions reading a key, or the get functions >>>>> printing the key in output. >>>>> cgroups-v2 conventions mention using KEY of major:minor, but I think >>>>> you can use drm_minor as key? >>>> Given this controller is specific to the drm kernel subsystem which >>>> uses minor to identify drm device, >>> Wait a second, using the DRM minor is a good idea in the first place. >> Well that should have read "is not a good idea".. >> >> I have a test system with a Vega10 and a Vega20. Which device gets which >> minor is not stable, but rather defined by the scan order of the PCIe bus. >> >> Normally the scan order is always the same, but adding or removing >> devices or delaying things just a little bit during init is enough to >> change this. >> >> We need something like the Linux sysfs location or similar to have a >> stable implementation. > I get that, which is why I don't use minor to identify cards in user > space apps I wrote: > https://github.com/RadeonOpenCompute/k8s-device-plugin/blob/c2659c9d1d0713cad36fb5256681125121e6e32f/internal/pkg/amdgpu/amdgpu.go#L85 Yeah, that is certainly a possibility. > But within the kernel, I think my use of minor is consistent with the > rest of the drm subsystem. I hope I don't need to reform the way the > drm subsystem use minor in order to introduce a cgroup controller. Well, I would try to avoid using the minor and at least look for alternatives. E.g. what does udev use to identify the devices? And IIRC we have something like a "device-name" in the kernel as well (what's printed in the logs). The minimum we need to do is get away from the minor=linenum approach, because, as Daniel pointed out, the minor allocation is quite a mess and not necessarily contiguous. Regards, Christian. > > Regards, > Kenny > _______________________________________________ > amd-gfx mailing list > amd-gfx@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/amd-gfx
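The "device-name" Christian recalls is presumably dev_name() on the underlying struct device, which is what dev_printk() prefixes to log lines. A hedged sketch of keying each output line by that name instead of by line position, built on the patch's structures with a hypothetical helper name (and assuming drm_device's dev member points at the bus device):

#include <linux/device.h>
#include <linux/seq_file.h>

/* Illustrative only: emit "KEY VALUE" lines keyed by the bus name of
 * the device (e.g. "0000:0b:00.0" for a PCI GPU), which is stable
 * across probe ordering, unlike the drm minor. */
static void drmcgrp_print_stats_keyed(struct drmcgrp_device *ddev,
				      struct drmcgrp_device_resource *ddr,
				      struct seq_file *sf)
{
	if (ddev == NULL || ddr == NULL)
		return;

	seq_printf(sf, "%s %lld\n", dev_name(ddev->dev->dev),
		   ddr->bo_stats_total_allocated);
}

For a PCI GPU this would print something like "0000:0b:00.0 4096", which survives the PCIe scan-order changes described above.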
On Thu, May 16, 2019 at 10:12 AM Christian König <ckoenig.leichtzumerken@gmail.com> wrote: > Am 16.05.19 um 16:03 schrieb Kenny Ho: > > On Thu, May 16, 2019 at 3:25 AM Christian König > > <ckoenig.leichtzumerken@gmail.com> wrote: > >> Am 16.05.19 um 09:16 schrieb Koenig, Christian: > >> We need something like the Linux sysfs location or similar to have a > >> stable implementation. > > I get that, which is why I don't use minor to identify cards in user > > space apps I wrote: > > https://github.com/RadeonOpenCompute/k8s-device-plugin/blob/c2659c9d1d0713cad36fb5256681125121e6e32f/internal/pkg/amdgpu/amdgpu.go#L85 > > Yeah, that is certainly a possibility. > > > But within the kernel, I think my use of minor is consistent with the > > rest of the drm subsystem. I hope I don't need to reform the way the > > drm subsystem use minor in order to introduce a cgroup controller. > > Well I would try to avoid using the minor and at least look for > alternatives. E.g. what does udev uses to identify the devices for > example? And IIRC we have something like a "device-name" in the kernel > as well (what's printed in the logs). > > The minimum we need to do is get away from the minor=linenum approach, > cause as Daniel pointed out the minor allocation is quite a mess and not > necessary contiguous. I noticed :) but it looks like there isn't much of a choice, given what Tejun replied about the cgroup conventions. Regards, Kenny
On Thu, May 16, 2019 at 10:10 AM Tejun Heo <tj@kernel.org> wrote: > I haven't gone through the patchset yet but some quick comments. > > On Wed, May 15, 2019 at 10:29:21PM -0400, Kenny Ho wrote: > > Given this controller is specific to the drm kernel subsystem which > > uses minor to identify drm device, I don't see a need to complicate > > the interfaces more by having major and a key. As you can see in the > > examples below, the drm device minor corresponds to the line number. > > I am not sure how strict cgroup upstream is about the convention but I > > We're pretty strict. > > > am hoping there are flexibility here to allow for what I have > > implemented. There are a couple of other things I have done that is > > So, please follow the interface conventions. We can definitely add > new ones but that would need functional reasons. > > > not described in the convention: 1) inclusion of read-only *.help file > > at the root cgroup, 2) use read-only (which I can potentially make rw) > > *.default file instead of having a default entries (since the default > > can be different for different devices) inside the control files (this > > way, the resetting of cgroup values for all the drm devices, can be > > done by a simple 'cp'.) > > Again, please follow the existing conventions. There's a lot more > harm than good in every controller being creative in their own way. > It's trivial to build convenience features in userspace. Please do it > there. I can certainly remove the ro *.help file and leave the documentation to Documentation/, but for the *.default I do have a functional reason for it. As far as I can tell from the convention, the default is per cgroup and there is no way to describe a per-device default. Although, perhaps we are talking about two different kinds of defaults. Anyway, I can leave the discussion to a more detailed review. Regards, Kenny