
[RFC] Add BPF_PROG_TYPE_CGROUP_IOCTL

Message ID 20201007152355.2446741-1-Kenny.Ho@amd.com (mailing list archive)
State Not Applicable
Series [RFC] Add BPF_PROG_TYPE_CGROUP_IOCTL

Commit Message

Ho, Kenny Oct. 7, 2020, 3:23 p.m. UTC
This is a skeleton implementation to invite comments and generate
discussion around the idea of introducing a bpf-cgroup program type to
control ioctl access.  This is modelled after
BPF_PROG_TYPE_CGROUP_DEVICE.  The premise is to allow system admins to
write bpf programs to block some ioctl access, potentially in conjunction
with data collected by other bpf programs and stored in bpf maps
protected by bpf_spin_lock.

For example, one bpf program could accumulate resource usage statistics
while a second bpf program of type BPF_PROG_TYPE_CGROUP_IOCTL blocks ioctl
access to that resource once the stats stored in a bpf map reach a certain
threshold.

Like BPF_PROG_TYPE_CGROUP_DEVICE, the default is permissive (i.e.,
ioctls are not blocked if no bpf program is present for the cgroup) to
maintain the current interface behaviour when this functionality is unused.

The performance impact on ioctl calls is minimal, as bpf's in-kernel
verifier ensures that attached bpf programs cannot crash and always
terminate quickly.
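
A minimal sketch of what such a pair could look like, assuming the
bpf_cgroup_ioctl_ctx and the allow/deny return convention proposed in this
patch; the gpu_usage map, the usage threshold, the DRM ioctl magic check
and the "cgroup/ioctl" section name are illustrative assumptions rather
than part of this RFC:

  // SPDX-License-Identifier: GPL-2.0
  /* Requires the uapi additions from this patch in <linux/bpf.h>. */
  #include <linux/bpf.h>
  #include <bpf/bpf_helpers.h>

  /* Hypothetical counter filled by a separate accounting bpf program. */
  struct {
          __uint(type, BPF_MAP_TYPE_ARRAY);
          __uint(max_entries, 1);
          __type(key, __u32);
          __type(value, __u64);
  } gpu_usage SEC(".maps");

  #define USAGE_LIMIT     (1ULL << 30)    /* illustrative threshold */
  #define DRM_IOC_MAGIC   'd'             /* ioctl type used by DRM */

  SEC("cgroup/ioctl")                     /* hypothetical section name */
  int block_gpu_ioctl(struct bpf_cgroup_ioctl_ctx *ctx)
  {
          __u32 key = 0;
          __u64 *used;

          /* Only police DRM ioctls; allow everything else. */
          if (((ctx->cmd >> 8) & 0xff) != DRM_IOC_MAGIC)
                  return 1;               /* 1 = allow, 0 = block */

          used = bpf_map_lookup_elem(&gpu_usage, &key);
          if (used && *used > USAGE_LIMIT)
                  return 0;
          return 1;
  }

  char _license[] SEC("license") = "GPL";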

TODOs:
- correct usage of the verifier
- tooling
- samples
- device drivers may provide helper functions that take
bpf_cgroup_ioctl_ctx and return something more useful for a specific
device

Signed-off-by: Kenny Ho <Kenny.Ho@amd.com>
---
 fs/ioctl.c                 |  5 +++
 include/linux/bpf-cgroup.h | 14 ++++++++
 include/linux/bpf_types.h  |  2 ++
 include/uapi/linux/bpf.h   |  8 +++++
 kernel/bpf/cgroup.c        | 66 ++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c       |  7 ++++
 kernel/bpf/verifier.c      |  1 +
 7 files changed, 103 insertions(+)

Comments

Kenny Ho Nov. 2, 2020, 7:23 p.m. UTC | #1
Adding a few more emails from get_maintainer.pl and bumping this
thread since there haven't been any comments so far.  Is this too
crazy?  Am I missing something fundamental?

Regards,
Kenny


On Wed, Oct 7, 2020 at 11:24 AM Kenny Ho <Kenny.Ho@amd.com> wrote:
>
> This is a skeleton implementation to invite comments and generate
> discussion around the idea of introducing a bpf-cgroup program type to
> control ioctl access.  This is modelled after
> BPF_PROG_TYPE_CGROUP_DEVICE.  The premise is to allow system admins to
> write bpf programs to block some ioctl access, potentially in conjunction
> with data collected by other bpf programs and stored in bpf maps
> protected by bpf_spin_lock.
>
> For example, one bpf program could accumulate resource usage statistics
> while a second bpf program of type BPF_PROG_TYPE_CGROUP_IOCTL blocks ioctl
> access to that resource once the stats stored in a bpf map reach a certain
> threshold.
>
> Like BPF_PROG_TYPE_CGROUP_DEVICE, the default is permissive (i.e.,
> ioctls are not blocked if no bpf program is present for the cgroup) to
> maintain the current interface behaviour when this functionality is unused.
>
> The performance impact on ioctl calls is minimal, as bpf's in-kernel
> verifier ensures that attached bpf programs cannot crash and always
> terminate quickly.
>
> TODOs:
> - correct usage of the verifier
> - tooling
> - samples
> - device drivers may provide helper functions that take
> bpf_cgroup_ioctl_ctx and return something more useful for a specific
> device
>
> Signed-off-by: Kenny Ho <Kenny.Ho@amd.com>
> ---
>  fs/ioctl.c                 |  5 +++
>  include/linux/bpf-cgroup.h | 14 ++++++++
>  include/linux/bpf_types.h  |  2 ++
>  include/uapi/linux/bpf.h   |  8 +++++
>  kernel/bpf/cgroup.c        | 66 ++++++++++++++++++++++++++++++++++++++
>  kernel/bpf/syscall.c       |  7 ++++
>  kernel/bpf/verifier.c      |  1 +
>  7 files changed, 103 insertions(+)
>
> diff --git a/fs/ioctl.c b/fs/ioctl.c
> index 4e6cc0a7d69c..a3925486d417 100644
> --- a/fs/ioctl.c
> +++ b/fs/ioctl.c
> @@ -19,6 +19,7 @@
>  #include <linux/falloc.h>
>  #include <linux/sched/signal.h>
>  #include <linux/fiemap.h>
> +#include <linux/cgroup.h>
>
>  #include "internal.h"
>
> @@ -45,6 +46,10 @@ long vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
>         if (!filp->f_op->unlocked_ioctl)
>                 goto out;
>
> +       error = BPF_CGROUP_RUN_PROG_IOCTL(filp, cmd, arg);
> +       if (error)
> +               goto out;
> +
>         error = filp->f_op->unlocked_ioctl(filp, cmd, arg);
>         if (error == -ENOIOCTLCMD)
>                 error = -ENOTTY;
> diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
> index 64f367044e25..a5f0b0a8f82b 100644
> --- a/include/linux/bpf-cgroup.h
> +++ b/include/linux/bpf-cgroup.h
> @@ -134,6 +134,9 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
>  int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
>                                       short access, enum bpf_attach_type type);
>
> +int __cgroup_bpf_check_ioctl_permission(struct file *filp, unsigned int cmd, unsigned long arg,
> +                                       enum bpf_attach_type type);
> +
>  int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
>                                    struct ctl_table *table, int write,
>                                    void **buf, size_t *pcount, loff_t *ppos,
> @@ -346,6 +349,16 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
>         __ret;                                                                 \
>  })
>
> +#define BPF_CGROUP_RUN_PROG_IOCTL(filp, cmd, arg)                            \
> +({                                                                           \
> +       int __ret = 0;                                                        \
> +       if (cgroup_bpf_enabled)                                               \
> +               __ret = __cgroup_bpf_check_ioctl_permission(filp, cmd, arg,   \
> +                                                           BPF_CGROUP_IOCTL);\
> +                                                                             \
> +       __ret;                                                                \
> +})
> +
>  int cgroup_bpf_prog_attach(const union bpf_attr *attr,
>                            enum bpf_prog_type ptype, struct bpf_prog *prog);
>  int cgroup_bpf_prog_detach(const union bpf_attr *attr,
> @@ -429,6 +442,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
>                                        optlen, max_optlen, retval) ({ retval; })
>  #define BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock, level, optname, optval, optlen, \
>                                        kernel_optval) ({ 0; })
> +#define BPF_CGROUP_RUN_PROG_IOCTL(filp, cmd, arg) ({ 0; })
>
>  #define for_each_cgroup_storage_type(stype) for (; false; )
>
> diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
> index a52a5688418e..3055e7e4918c 100644
> --- a/include/linux/bpf_types.h
> +++ b/include/linux/bpf_types.h
> @@ -56,6 +56,8 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SYSCTL, cg_sysctl,
>               struct bpf_sysctl, struct bpf_sysctl_kern)
>  BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCKOPT, cg_sockopt,
>               struct bpf_sockopt, struct bpf_sockopt_kern)
> +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_IOCTL, cg_ioctl,
> +             struct bpf_cgroup_ioctl_ctx, struct bpf_cgroup_ioctl_ctx)
>  #endif
>  #ifdef CONFIG_BPF_LIRC_MODE2
>  BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2,
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index b6238b2209b7..6a908e13d3a3 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -197,6 +197,7 @@ enum bpf_prog_type {
>         BPF_PROG_TYPE_EXT,
>         BPF_PROG_TYPE_LSM,
>         BPF_PROG_TYPE_SK_LOOKUP,
> +       BPF_PROG_TYPE_CGROUP_IOCTL,
>  };
>
>  enum bpf_attach_type {
> @@ -238,6 +239,7 @@ enum bpf_attach_type {
>         BPF_XDP_CPUMAP,
>         BPF_SK_LOOKUP,
>         BPF_XDP,
> +       BPF_CGROUP_IOCTL,
>         __MAX_BPF_ATTACH_TYPE
>  };
>
> @@ -4276,6 +4278,12 @@ struct bpf_cgroup_dev_ctx {
>         __u32 minor;
>  };
>
> +struct bpf_cgroup_ioctl_ctx {
> +       __u64 filp;
> +       __u32 cmd;
> +       __u32 arg;
> +};
> +
>  struct bpf_raw_tracepoint_args {
>         __u64 args[0];
>  };
> diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
> index 83ff127ef7ae..0958bae3b0b7 100644
> --- a/kernel/bpf/cgroup.c
> +++ b/kernel/bpf/cgroup.c
> @@ -1203,6 +1203,72 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = {
>         .is_valid_access        = cgroup_dev_is_valid_access,
>  };
>
> +int __cgroup_bpf_check_ioctl_permission(struct file *filp, unsigned int cmd, unsigned long arg,
> +                                     enum bpf_attach_type type)
> +{
> +       struct cgroup *cgrp;
> +       struct bpf_cgroup_ioctl_ctx ctx = {
> +               .filp = filp,
> +               .cmd = cmd,
> +               .arg = arg,
> +       };
> +       int allow = 1;
> +
> +       rcu_read_lock();
> +       cgrp = task_dfl_cgroup(current);
> +       allow = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx,
> +                                  BPF_PROG_RUN);
> +       rcu_read_unlock();
> +
> +       return !allow;
> +}
> +
> +static const struct bpf_func_proto *
> +cgroup_ioctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
> +{
> +       return cgroup_base_func_proto(func_id, prog);
> +}
> +
> +static bool cgroup_ioctl_is_valid_access(int off, int size,
> +                                      enum bpf_access_type type,
> +                                      const struct bpf_prog *prog,
> +                                      struct bpf_insn_access_aux *info)
> +{
> +       const int size_default = sizeof(__u32);
> +
> +       if (type == BPF_WRITE)
> +               return false;
> +
> +       if (off < 0 || off + size > sizeof(struct bpf_cgroup_ioctl_ctx))
> +               return false;
> +       /* The verifier guarantees that size > 0. */
> +       if (off % size != 0)
> +               return false;
> +
> +       switch (off) {
> +       case bpf_ctx_range(struct bpf_cgroup_ioctl_ctx, filp):
> +               bpf_ctx_record_field_size(info, size_default);
> +               if (!bpf_ctx_narrow_access_ok(off, size, size_default))
> +                       return false;
> +               break;
> +       case bpf_ctx_range(struct bpf_cgroup_ioctl_ctx, cmd):
> +       case bpf_ctx_range(struct bpf_cgroup_ioctl_ctx, arg):
> +       default:
> +               if (size != size_default)
> +                       return false;
> +       }
> +
> +       return true;
> +}
> +
> +const struct bpf_prog_ops cg_ioctl_prog_ops = {
> +};
> +
> +const struct bpf_verifier_ops cg_ioctl_verifier_ops = {
> +       .get_func_proto         = cgroup_ioctl_func_proto,
> +       .is_valid_access        = cgroup_ioctl_is_valid_access,
> +};
> +
>  /**
>   * __cgroup_bpf_run_filter_sysctl - Run a program on sysctl
>   *
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 86299a292214..6984a62c96f4 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -2054,6 +2054,7 @@ static bool is_net_admin_prog_type(enum bpf_prog_type prog_type)
>         case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
>         case BPF_PROG_TYPE_CGROUP_SOCKOPT:
>         case BPF_PROG_TYPE_CGROUP_SYSCTL:
> +       case BPF_PROG_TYPE_CGROUP_IOCTL:
>         case BPF_PROG_TYPE_SOCK_OPS:
>         case BPF_PROG_TYPE_EXT: /* extends any prog */
>                 return true;
> @@ -2806,6 +2807,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
>                 return BPF_PROG_TYPE_SOCK_OPS;
>         case BPF_CGROUP_DEVICE:
>                 return BPF_PROG_TYPE_CGROUP_DEVICE;
> +       case BPF_CGROUP_IOCTL:
> +               return BPF_PROG_TYPE_CGROUP_IOCTL;
>         case BPF_SK_MSG_VERDICT:
>                 return BPF_PROG_TYPE_SK_MSG;
>         case BPF_SK_SKB_STREAM_PARSER:
> @@ -2878,6 +2881,7 @@ static int bpf_prog_attach(const union bpf_attr *attr)
>         case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
>         case BPF_PROG_TYPE_CGROUP_SOCKOPT:
>         case BPF_PROG_TYPE_CGROUP_SYSCTL:
> +       case BPF_PROG_TYPE_CGROUP_IOCTL:
>         case BPF_PROG_TYPE_SOCK_OPS:
>                 ret = cgroup_bpf_prog_attach(attr, ptype, prog);
>                 break;
> @@ -2915,6 +2919,7 @@ static int bpf_prog_detach(const union bpf_attr *attr)
>         case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
>         case BPF_PROG_TYPE_CGROUP_SOCKOPT:
>         case BPF_PROG_TYPE_CGROUP_SYSCTL:
> +       case BPF_PROG_TYPE_CGROUP_IOCTL:
>         case BPF_PROG_TYPE_SOCK_OPS:
>                 return cgroup_bpf_prog_detach(attr, ptype);
>         default:
> @@ -2958,6 +2963,7 @@ static int bpf_prog_query(const union bpf_attr *attr,
>         case BPF_CGROUP_SYSCTL:
>         case BPF_CGROUP_GETSOCKOPT:
>         case BPF_CGROUP_SETSOCKOPT:
> +       case BPF_CGROUP_IOCTL:
>                 return cgroup_bpf_prog_query(attr, uattr);
>         case BPF_LIRC_MODE2:
>                 return lirc_prog_query(attr, uattr);
> @@ -3914,6 +3920,7 @@ static int link_create(union bpf_attr *attr)
>         case BPF_PROG_TYPE_CGROUP_DEVICE:
>         case BPF_PROG_TYPE_CGROUP_SYSCTL:
>         case BPF_PROG_TYPE_CGROUP_SOCKOPT:
> +       case BPF_PROG_TYPE_CGROUP_IOCTL:
>                 ret = cgroup_bpf_link_attach(attr, prog);
>                 break;
>         case BPF_PROG_TYPE_TRACING:
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index ef938f17b944..af68f463e828 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -7419,6 +7419,7 @@ static int check_return_code(struct bpf_verifier_env *env)
>         case BPF_PROG_TYPE_CGROUP_DEVICE:
>         case BPF_PROG_TYPE_CGROUP_SYSCTL:
>         case BPF_PROG_TYPE_CGROUP_SOCKOPT:
> +       case BPF_PROG_TYPE_CGROUP_IOCTL:
>                 break;
>         case BPF_PROG_TYPE_RAW_TRACEPOINT:
>                 if (!env->prog->aux->attach_btf_id)
> --
> 2.25.1
>
Alexei Starovoitov Nov. 3, 2020, 5:32 a.m. UTC | #2
On Mon, Nov 02, 2020 at 02:23:02PM -0500, Kenny Ho wrote:
> Adding a few more emails from get_maintainer.pl and bumping this
> thread since there haven't been any comments so far.  Is this too
> crazy?  Am I missing something fundamental?

sorry for the delay. Missed it earlier. Feel free to ping the mailing list
sooner next time.

> On Wed, Oct 7, 2020 at 11:24 AM Kenny Ho <Kenny.Ho@amd.com> wrote:
> >
> > This is a skeleton implementation to invite comments and generate
> > discussion around the idea of introducing a bpf-cgroup program type to
> > control ioctl access.  This is modelled after
> > BPF_PROG_TYPE_CGROUP_DEVICE.  The premise is to allow system admins to
> > write bpf programs to block some ioctl access, potentially in conjunction
> > with data collected by other bpf programs and stored in bpf maps
> > protected by bpf_spin_lock.
> >
> > For example, one bpf program could accumulate resource usage statistics
> > while a second bpf program of type BPF_PROG_TYPE_CGROUP_IOCTL blocks ioctl
> > access to that resource once the stats stored in a bpf map reach a certain
> > threshold.
> >
> > Like BPF_PROG_TYPE_CGROUP_DEVICE, the default is permissive (i.e.,
> > ioctls are not blocked if no bpf program is present for the cgroup) to
> > maintain the current interface behaviour when this functionality is unused.
> >
> > The performance impact on ioctl calls is minimal, as bpf's in-kernel
> > verifier ensures that attached bpf programs cannot crash and always
> > terminate quickly.
> >
> > TODOs:
> > - correct usage of the verifier
> > - tooling
> > - samples
> > - device drivers may provide helper functions that take
> > bpf_cgroup_ioctl_ctx and return something more useful for a specific
> > device
> >
> > Signed-off-by: Kenny Ho <Kenny.Ho@amd.com>
...
> > @@ -45,6 +46,10 @@ long vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
> >         if (!filp->f_op->unlocked_ioctl)
> >                 goto out;
> >
> > +       error = BPF_CGROUP_RUN_PROG_IOCTL(filp, cmd, arg);
> > +       if (error)
> > +               goto out;
> > +

That's a bit problematic, since we have bpf_lsm now.
Could you use security_file_ioctl hook and do the same filtering there?
It's not cgroup based though. Is it a concern?
If cgroup scoping is really necessary then it's probably better
to add it to bpf_lsm. Then all hooks will become cgroup aware.
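
For reference, a minimal sketch of that suggestion using the existing
bpf_lsm attach point for security_file_ioctl (this needs CONFIG_BPF_LSM and
the "bpf" LSM enabled); the check against the DRM ioctl magic is an
illustrative assumption, and there is no cgroup scoping here:

  // SPDX-License-Identifier: GPL-2.0
  #include "vmlinux.h"
  #include <errno.h>
  #include <bpf/bpf_helpers.h>
  #include <bpf/bpf_tracing.h>

  SEC("lsm/file_ioctl")
  int BPF_PROG(restrict_drm_ioctl, struct file *file, unsigned int cmd,
               unsigned long arg)
  {
          /* Deny DRM ioctls ('d' is the DRM ioctl type) system-wide. */
          if (((cmd >> 8) & 0xff) == 'd')
                  return -EPERM;
          return 0;
  }

  char _license[] SEC("license") = "GPL";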
Kenny Ho Nov. 3, 2020, 5:39 a.m. UTC | #3
Thanks for the reply.  Cgroup awareness is desired because the intent
is to use this for resource management as well (potentially along with
other cgroup controlled resources.)  I will dig into bpf_lsm and learn
more about it.

Regards,
Kenny


On Tue, Nov 3, 2020 at 12:32 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Mon, Nov 02, 2020 at 02:23:02PM -0500, Kenny Ho wrote:
> > Adding a few more emails from get_maintainer.pl and bumping this
> > thread since there haven't been any comments so far.  Is this too
> > crazy?  Am I missing something fundamental?
>
> sorry for delay. Missed it earlier. Feel free to ping the mailing list
> sooner next time.
>
> > On Wed, Oct 7, 2020 at 11:24 AM Kenny Ho <Kenny.Ho@amd.com> wrote:
> > >
> > > This is a skeleton implementation to invite comments and generate
> > > discussion around the idea of introducing a bpf-cgroup program type to
> > > control ioctl access.  This is modelled after
> > > BPF_PROG_TYPE_CGROUP_DEVICE.  The premise is to allow system admins to
> > > write bpf programs to block some ioctl access, potentially in conjunction
> > > with data collected by other bpf programs and stored in bpf maps
> > > protected by bpf_spin_lock.
> > >
> > > For example, one bpf program could accumulate resource usage statistics
> > > while a second bpf program of type BPF_PROG_TYPE_CGROUP_IOCTL blocks ioctl
> > > access to that resource once the stats stored in a bpf map reach a certain
> > > threshold.
> > >
> > > Like BPF_PROG_TYPE_CGROUP_DEVICE, the default is permissive (i.e.,
> > > ioctls are not blocked if no bpf program is present for the cgroup) to
> > > maintain the current interface behaviour when this functionality is unused.
> > >
> > > The performance impact on ioctl calls is minimal, as bpf's in-kernel
> > > verifier ensures that attached bpf programs cannot crash and always
> > > terminate quickly.
> > >
> > > TODOs:
> > > - correct usage of the verifier
> > > - tooling
> > > - samples
> > > - device drivers may provide helper functions that take
> > > bpf_cgroup_ioctl_ctx and return something more useful for a specific
> > > device
> > >
> > > Signed-off-by: Kenny Ho <Kenny.Ho@amd.com>
> ...
> > > @@ -45,6 +46,10 @@ long vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
> > >         if (!filp->f_op->unlocked_ioctl)
> > >                 goto out;
> > >
> > > +       error = BPF_CGROUP_RUN_PROG_IOCTL(filp, cmd, arg);
> > > +       if (error)
> > > +               goto out;
> > > +
>
> That's a bit problematic, since we have bpf_lsm now.
> Could you use security_file_ioctl hook and do the same filtering there?
> It's not cgroup based though. Is it a concern?
> If cgroup scoping is really necessary then it's probably better
> to add it to bpf_lsm. Then all hooks will become cgroup aware.
Alexei Starovoitov Nov. 3, 2020, 5:42 a.m. UTC | #4
On Mon, Nov 2, 2020 at 9:39 PM Kenny Ho <y2kenny@gmail.com> wrote:
>
> Thanks for the reply.

pls don't top post.

> Cgroup awareness is desired because the intent
> is to use this for resource management as well (potentially along with
> other cgroup controlled resources.)  I will dig into bpf_lsm and learn
> more about it.

Also consider that bpf_lsm hooks have a way to get cgroup-id without
being explicitly scoped. So the bpf program can be made cgroup aware.
It's just not as convenient as attaching a prog to cgroup+hook at once.
For prototyping the existing bpf_lsm facility should be enough.
So please try to follow this route and please share more details about
the use case.
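
As an illustration of that pattern, a sketch where the program is attached
system-wide but consults a map keyed by cgroup id (bpf_get_current_cgroup_id
is an existing helper), so it is cgroup aware without being attached to a
cgroup; the map, its contents and the DRM magic check are assumptions, and
any hierarchy handling would have to be done by whatever populates the map:

  // SPDX-License-Identifier: GPL-2.0
  #include "vmlinux.h"
  #include <errno.h>
  #include <bpf/bpf_helpers.h>
  #include <bpf/bpf_tracing.h>

  /* cgroup ids (as reported by bpf_get_current_cgroup_id) to restrict. */
  struct {
          __uint(type, BPF_MAP_TYPE_HASH);
          __uint(max_entries, 128);
          __type(key, __u64);
          __type(value, __u8);
  } restricted_cgroups SEC(".maps");

  SEC("lsm/file_ioctl")
  int BPF_PROG(cg_scoped_ioctl, struct file *file, unsigned int cmd,
               unsigned long arg)
  {
          __u64 cgid = bpf_get_current_cgroup_id();

          /* Deny DRM ioctls ('d') only for the listed cgroups. */
          if (((cmd >> 8) & 0xff) == 'd' &&
              bpf_map_lookup_elem(&restricted_cgroups, &cgid))
                  return -EPERM;
          return 0;
  }

  char _license[] SEC("license") = "GPL";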
Kenny Ho Nov. 3, 2020, 7:19 p.m. UTC | #5
On Tue, Nov 3, 2020 at 12:43 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
> On Mon, Nov 2, 2020 at 9:39 PM Kenny Ho <y2kenny@gmail.com> wrote:
> pls don't top post.
My apology.

> > Cgroup awareness is desired because the intent
> > is to use this for resource management as well (potentially along with
> > other cgroup controlled resources.)  I will dig into bpf_lsm and learn
> > more about it.
>
> Also consider that bpf_lsm hooks have a way to get cgroup-id without
> being explicitly scoped. So the bpf program can be made cgroup aware.
> It's just not as convenient as attaching a prog to cgroup+hook at once.
> For prototyping the existing bpf_lsm facility should be enough.
> So please try to follow this route and please share more details about
> the use case.

Ok.  I will take a look and see if that is sufficient.  My
understanding of bpf-cgroup is that it not only makes attaching a prog
to a cgroup easier but it also facilitates hierarchical calling of
attached progs, which might be useful if users want to manage gpu
resources with bpf cgroup along with other cgroup resources (like
cpu/mem/io, etc.)
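
For what it's worth, the hierarchical part comes from the per-cgroup
arrays of effective programs; a loader would attach with something along
these lines, where BPF_CGROUP_IOCTL is the attach type proposed in this
RFC and the rest is existing libbpf API:

  // SPDX-License-Identifier: GPL-2.0
  #include <errno.h>
  #include <fcntl.h>
  #include <unistd.h>
  #include <bpf/bpf.h>

  /* Attach an already-loaded program to one cgroup directory. */
  int attach_ioctl_prog(int prog_fd, const char *cgroup_path)
  {
          int ret, cg_fd = open(cgroup_path, O_RDONLY);

          if (cg_fd < 0)
                  return -errno;
          /* BPF_F_ALLOW_MULTI lets programs attached at several levels
           * of the cgroup hierarchy all run for a given task. */
          ret = bpf_prog_attach(prog_fd, cg_fd, BPF_CGROUP_IOCTL,
                                BPF_F_ALLOW_MULTI);
          close(cg_fd);
          return ret;
  }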

About the use case.  The high level motivation here is to provide the
ability to subdivide/share a GPU via cgroups/containers in a way that
is similar to other resources like CPU and memory.  Users have been
requesting this type of functionality because GPU compute can get
expensive and they want to maximize the utilization to get the most
bang for their bucks.  A traditional way to do this is via
SRIOV/virtualization but that often means time sharing the GPU as a
whole unit.  That is useful for some applications but not others due
to the flushing and added latency.  We also have a study that
identified various GPU compute application types.  These types can
benefit from more asymmetrical/granular sharing of the GPU (for
example, some applications are compute bound while others are memory
bound and can benefit from having more VRAM.)

I have been trying to add a cgroup subsystem for the drm subsystem for
this purpose but I ran into two challenges.  First, the composition of
a GPU and how some of the subcomponents (like VRAM or shader
engines/compute units) can be shared are very much vendor specific so
we are unable to arrive at a common interface across all vendors.
Second, because of this and the variety of places a GPU can go into
(smartphone, PC, server, HPC), there is also no agreement on how
exactly a GPU should be shared.  The best way forward appears to
simply provide hooks for users to define how and what they want to
share via a bpf program.

From what I can tell so far (I am still learning), there are multiple
pieces that need to fall into place for bpf-cgroup to work for this use
case.  First, there is resource limit enforcement, which is the
motivation for this RFC (I will look into bpf_lsm as the path
forward.)  I have also been thinking about instrumenting the drm
subsystem with a new BPF program type and various attach types
across the drm subsystem, but I am not sure if this is allowed (this
one is more for resource usage monitoring.)  Another thing I have been
considering is to have the gpu driver provide bpf helper functions for
bpf programs to modify drm driver internals.  That was the reason I
asked about the potential of BTF support for kernel modules a couple
of months ago (and Andrii Nakryiko mentioned that it is being worked
on.)

Please feel free to ask more questions if any of the above is unclear.
Feedback is always welcome.

Regards,
Kenny
Alexei Starovoitov Nov. 3, 2020, 9:04 p.m. UTC | #6
On Tue, Nov 03, 2020 at 02:19:22PM -0500, Kenny Ho wrote:
> On Tue, Nov 3, 2020 at 12:43 AM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> > On Mon, Nov 2, 2020 at 9:39 PM Kenny Ho <y2kenny@gmail.com> wrote:
> > pls don't top post.
> My apology.
> 
> > > Cgroup awareness is desired because the intent
> > > is to use this for resource management as well (potentially along with
> > > other cgroup controlled resources.)  I will dig into bpf_lsm and learn
> > > more about it.
> >
> > Also consider that bpf_lsm hooks have a way to get cgroup-id without
> > being explicitly scoped. So the bpf program can be made cgroup aware.
> > It's just not as convenient as attaching a prog to cgroup+hook at once.
> > For prototyping the existing bpf_lsm facility should be enough.
> > So please try to follow this route and please share more details about
> > the use case.
> 
> Ok.  I will take a look and see if that is sufficient.  My
> understanding of bpf-cgroup is that it not only makes attaching prog
> to cgroup easier but it also facilitates hierarchical calling of
> attached progs which might be useful if users want to manage gpu
> resources with bpf cgroup along with other cgroup resources (like
> cpu/mem/io, etc.)

Right. Hierarchical cgroup-bpf logic cannot be replicated inside
the program. If you're relying on cgv2 hierarchy to containerize
applications then what I suggested earlier won't work indeed.

> About the use case.  The high level motivation here is to provide the
> ability to subdivide/share a GPU via cgroups/containers in a way that
> is similar to other resources like CPU and memory.  Users have been
> requesting this type of functionality because GPU compute can get
> expensive and they want to maximize the utilization to get the most
> bang for their bucks.  A traditional way to do this is via
> SRIOV/virtualization but that often means time sharing the GPU as a
> whole unit.  That is useful for some applications but not others due
> to the flushing and added latency.  We also have a study that
> identified various GPU compute application types.  These types can
> benefit from more asymmetrical/granular sharing of the GPU (for
> example some applications are compute bound while others can be memory
> bound that can benefit from having more VRAM.)
> 
> I have been trying to add a cgroup subsystem for the drm subsystem for
> this purpose but I ran into two challenges.  First, the composition of
> a GPU and how some of the subcomponents (like VRAM or shader
> engines/compute units) can be shared are very much vendor specific so
> we are unable to arrive at a common interface across all vendors.
> Because of this and the variety of places a GPU can go into
> (smartphone, PC, server, HPC), there is also no agreement on how
> exactly a GPU should be shared.  The best way forward appears to
> simply provide hooks for users to define how and what they want to
> share via a bpf program.

Thank you for sharing the details. It certainly helps.

> From what I can tell so far (I am still learning), there are multiple
> pieces that need to fall in place for bpf-cgroup to work for this use
> case.  First there is resource limit enforcement, which is the
> motivation for this RFC (I will look into bpf_lsm as the path
> forward.)  I have also been thinking about instrumenting the drm
> subsystem with a new BPF program type and have various attach types
> across the drm subsystem but I am not sure if this is allowed (this
> one is more for resource usage monitoring.)  Another thing I have been
> considering is to have the gpu driver provide bpf helper functions for
> bpf programs to modify drm driver internals.  That was the reason I
> asked about the potential of BTF support for kernel modules a couple
> of months ago (and Andrii Nakryiko mentioned that it is being worked
> on.)

Sounds like either bpf_lsm needs to be made aware of cgv2 (which would
be a great thing to have regardless) or cgroup-bpf needs a drm/gpu specific hook.
I think generic ioctl hook is too broad for this use case.
I suspect drm/gpu internal state would be easier to access inside
bpf program if the hook is next to gpu/drm. At ioctl level there is 'file'.
It's probably too abstract for the things you want to do.
Like how VRAM/shader/etc can be accessed through file?
Probably possible through a bunch of lookups and dereferences, but
if the hook is custom to GPU that info is likely readily available.
Then such cgroup-bpf check would be suitable in execution paths where
ioctl-based hook would be too slow.
Kenny Ho Nov. 3, 2020, 10:57 p.m. UTC | #7
On Tue, Nov 3, 2020 at 4:04 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Tue, Nov 03, 2020 at 02:19:22PM -0500, Kenny Ho wrote:
> > On Tue, Nov 3, 2020 at 12:43 AM Alexei Starovoitov
> > <alexei.starovoitov@gmail.com> wrote:
> > > On Mon, Nov 2, 2020 at 9:39 PM Kenny Ho <y2kenny@gmail.com> wrote:
>
> Sounds like either bpf_lsm needs to be made aware of cgv2 (which would
> be a great thing to have regardless) or cgroup-bpf needs a drm/gpu specific hook.
> I think generic ioctl hook is too broad for this use case.
> I suspect drm/gpu internal state would be easier to access inside
> bpf program if the hook is next to gpu/drm. At ioctl level there is 'file'.
> It's probably too abstract for the things you want to do.
> Like how VRAM/shader/etc can be accessed through file?
> Probably possible through a bunch of lookups and dereferences, but
> if the hook is custom to GPU that info is likely readily available.
> Then such cgroup-bpf check would be suitable in execution paths where
> ioctl-based hook would be too slow.
Just to clarify, when you say drm specific hook, did you mean just a
unique attach_type or a unique prog_type+attach_type combination?  (I
am still a bit fuzzy on when a new prog type is needed vs a new attach
type.  I think prog type is associated with a unique type of context
that the bpf prog will get but I could be missing some nuances.)

When I was thinking of doing an ioctl wide hook, the file would be the
device file and the thinking was to have a helper function provided by
device drivers to further disambiguate.  For our (AMD's) driver, we
have a bunch of ioctls for set/get/create/destroy
(https://elixir.bootlin.com/linux/latest/source/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c#L1763)
so the bpf prog can make the decision after the disambiguation.  For
example, we have an ioctl called "kfd_ioctl_set_cu_mask."  You can
think of cu_mask like cpumask but for the cores/compute-units inside a
GPU.  The ioctl hook will get the file, the bpf prog will call a
helper function from the amdgpu driver to return some data structure
specific to the driver, and then the bpf prog can make a decision on
gating the ioctl or not.  From what you are saying, it sounds like this
kind of back-and-forth lookup and dereferencing should be avoided for
performance considerations?
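
To make that concrete, a sketch of the bpf side of such a scheme; struct
kfd_ioctl_info, the bpf_kfd_ioctl_info() helper (and its id), the
KFD_OP_SET_CU_MASK op code and the "cgroup/ioctl" section name are all
hypothetical stand-ins for what a driver could expose:

  // SPDX-License-Identifier: GPL-2.0
  /* Requires the uapi additions from the RFC patch in <linux/bpf.h>. */
  #include <linux/bpf.h>
  #include <bpf/bpf_helpers.h>

  /* Hypothetical driver-defined view of a kfd ioctl. */
  struct kfd_ioctl_info {
          __u32 op;               /* e.g. KFD_OP_SET_CU_MASK */
          __u32 cu_count;         /* number of CUs the caller asks for */
  };

  #define KFD_OP_SET_CU_MASK 1    /* hypothetical op code */

  /* Hypothetical helper: the driver decodes cmd/arg for the program. */
  static long (*bpf_kfd_ioctl_info)(struct bpf_cgroup_ioctl_ctx *ctx,
                                    struct kfd_ioctl_info *info,
                                    __u32 size) = (void *) 200;

  SEC("cgroup/ioctl")             /* hypothetical section name */
  int gate_cu_mask(struct bpf_cgroup_ioctl_ctx *ctx)
  {
          struct kfd_ioctl_info info = {};

          if (bpf_kfd_ioctl_info(ctx, &info, sizeof(info)))
                  return 1;       /* not a kfd ioctl: allow */

          /* Cap how many CUs a task in this cgroup may claim. */
          if (info.op == KFD_OP_SET_CU_MASK && info.cu_count > 16)
                  return 0;       /* block the ioctl */
          return 1;
  }

  char _license[] SEC("license") = "GPL";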

Having a DRM specific hook is certainly an alternative.  I just wasn't
sure what level of trade-off between abstraction and being generic is
acceptable.  I am guessing a new BPF_PROG_TYPE_CGROUP_AMDGPU is probably
too specific?  But it sounds like BPF_PROG_TYPE_CGROUP_DRM may be ok?

Regards,
Kenny
Alexei Starovoitov Nov. 3, 2020, 11:28 p.m. UTC | #8
On Tue, Nov 03, 2020 at 05:57:47PM -0500, Kenny Ho wrote:
> On Tue, Nov 3, 2020 at 4:04 PM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Tue, Nov 03, 2020 at 02:19:22PM -0500, Kenny Ho wrote:
> > > On Tue, Nov 3, 2020 at 12:43 AM Alexei Starovoitov
> > > <alexei.starovoitov@gmail.com> wrote:
> > > > On Mon, Nov 2, 2020 at 9:39 PM Kenny Ho <y2kenny@gmail.com> wrote:
> >
> > Sounds like either bpf_lsm needs to be made aware of cgv2 (which would
> > be a great thing to have regardless) or cgroup-bpf needs a drm/gpu specific hook.
> > I think generic ioctl hook is too broad for this use case.
> > I suspect drm/gpu internal state would be easier to access inside
> > bpf program if the hook is next to gpu/drm. At ioctl level there is 'file'.
> > It's probably too abstract for the things you want to do.
> > Like how VRAM/shader/etc can be accessed through file?
> > Probably possible through a bunch of lookups and dereferences, but
> > if the hook is custom to GPU that info is likely readily available.
> > Then such cgroup-bpf check would be suitable in execution paths where
> > ioctl-based hook would be too slow.
> Just to clarify, when you say drm specific hook, did you mean just a
> unique attach_type or a unique prog_type+attach_type combination?  (I
> am still a bit fuzzy on when a new prog type is needed vs a new attach
> type.  I think prog type is associated with a unique type of context
> that the bpf prog will get but I could be missing some nuances.)
> 
> When I was thinking of doing an ioctl wide hook, the file would be the
> device file and the thinking was to have a helper function provided by
> device drivers to further disambiguate.  For our (AMD's) driver, we
> have a bunch of ioctls for set/get/create/destroy
> (https://elixir.bootlin.com/linux/latest/source/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c#L1763)
> so the bpf prog can make the decision after the disambiguation.  For
> example, we have an ioctl called "kfd_ioctl_set_cu_mask."  You can

Thanks for the pointer.
That's one monster ioctl. So much copy_from_user.
BPF prog would need to be sleepable to be able to examine the args in such depth.
After quick glance at the code I would put a new hook into
kfd_ioctl() right before
retcode = func(filep, process, kdata);
At this point kdata is already copied from user space 
and usize, that is cmd specific, is known.
So bpf prog wouldn't need to copy that data again.
That will save one copy.
To drill into details of kfd_ioctl_set_cu_mask() the prog would
need to be sleepable to do second copy_from_user of cu_mask.
At least it's not that big.
Yes, the attachment point will be amd driver specific,
but the program doesn't need to be.
It can be a generic tracing prog that is augmented to use BTF.
Something like writeable tracepoint with BTF support would do.
So on the bpf side there will be minimal amount of changes.
And in the driver you'll add one or a few writeable tracepoints
and the result of the tracepoint will gate
retcode = func(filep, process, kdata);
call in kfd_ioctl().
The writeable tracepoint would need to be cgroup-bpf based.
So that's the only tricky part. BPF infra doesn't have
cgroup+tracepoint scheme. It's probably going to be useful
in other cases like this. See trace_nbd_send_request.
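
A sketch of how that gating could look inside kfd_ioctl();
trace_kfd_ioctl_check() is a hypothetical writable tracepoint (in the
spirit of trace_nbd_send_request) whose attached cgroup-aware bpf program
may clear 'allow':

          /* inside kfd_ioctl(), after kdata was copied from user space */
          int allow = 1;

          /* Hypothetical writable tracepoint: the attached bpf program
           * can inspect cmd/kdata here and clear 'allow' to gate the
           * ioctl for the current task's cgroup. */
          trace_kfd_ioctl_check(filep, process, cmd, kdata, &allow);

          if (allow)
                  retcode = func(filep, process, kdata);
          else
                  retcode = -EPERM;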
Daniel Vetter Feb. 1, 2021, 2:49 p.m. UTC | #9
Adding gpu folks.

On Tue, Nov 03, 2020 at 03:28:05PM -0800, Alexei Starovoitov wrote:
> On Tue, Nov 03, 2020 at 05:57:47PM -0500, Kenny Ho wrote:
> > On Tue, Nov 3, 2020 at 4:04 PM Alexei Starovoitov
> > <alexei.starovoitov@gmail.com> wrote:
> > >
> > > On Tue, Nov 03, 2020 at 02:19:22PM -0500, Kenny Ho wrote:
> > > > On Tue, Nov 3, 2020 at 12:43 AM Alexei Starovoitov
> > > > <alexei.starovoitov@gmail.com> wrote:
> > > > > On Mon, Nov 2, 2020 at 9:39 PM Kenny Ho <y2kenny@gmail.com> wrote:
> > >
> > > Sounds like either bpf_lsm needs to be made aware of cgv2 (which would
> > > be a great thing to have regardless) or cgroup-bpf needs a drm/gpu specific hook.
> > > I think generic ioctl hook is too broad for this use case.
> > > I suspect drm/gpu internal state would be easier to access inside
> > > bpf program if the hook is next to gpu/drm. At ioctl level there is 'file'.
> > > It's probably too abstract for the things you want to do.
> > > Like how VRAM/shader/etc can be accessed through file?
> > > Probably possible through a bunch of lookups and dereferences, but
> > > if the hook is custom to GPU that info is likely readily available.
> > > Then such cgroup-bpf check would be suitable in execution paths where
> > > ioctl-based hook would be too slow.
> > Just to clarify, when you say drm specific hook, did you mean just a
> > unique attach_type or a unique prog_type+attach_type combination?  (I
> > am still a bit fuzzy on when a new prog type is needed vs a new attach
> > type.  I think prog type is associated with a unique type of context
> > that the bpf prog will get but I could be missing some nuances.)
> > 
> > When I was thinking of doing an ioctl wide hook, the file would be the
> > device file and the thinking was to have a helper function provided by
> > device drivers to further disambiguate.  For our (AMD's) driver, we
> > have a bunch of ioctls for set/get/create/destroy
> > (https://elixir.bootlin.com/linux/latest/source/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c#L1763)
> > so the bpf prog can make the decision after the disambiguation.  For
> > example, we have an ioctl called "kfd_ioctl_set_cu_mask."  You can
> 
> Thanks for the pointer.
> That's one monster ioctl. So much copy_from_user.
> BPF prog would need to be sleepable to be able to examine the args in such depth.
> After quick glance at the code I would put a new hook into
> kfd_ioctl() right before
> retcode = func(filep, process, kdata);
> At this point kdata is already copied from user space 
> and usize, that is cmd specific, is known.
> So bpf prog wouldn't need to copy that data again.
> That will save one copy.
> To drill into details of kfd_ioctl_set_cu_mask() the prog would
> need to be sleepable to do second copy_from_user of cu_mask.
> At least it's not that big.
> Yes, the attachment point will be amd driver specific,
> but the program doesn't need to be.
> It can be a generic tracing prog that is augmented to use BTF.
> Something like writeable tracepoint with BTF support would do.
> So on the bpf side there will be minimal amount of changes.
> And in the driver you'll add one or few writeable tracepoints
> and the result of the tracepoint will gate
> retcode = func(filep, process, kdata);
> call in kfd_ioctl().
> The writeable tracepoint would need to be cgroup-bpf based.
> So that's the only tricky part. BPF infra doesn't have
> cgroup+tracepoint scheme. It's probably going to be useful
> in other cases like this. See trace_nbd_send_request.


Yeah I think this proposal doesn't work:

- inspecting ioctl arguments that need copying outside of the
  driver/subsystem doing that copying is fundamentally racy

- there's been a pile of cgroups proposals to manage gpus at the drm
  subsystem level, some by Kenny, and frankly this at least looks a bit
  like a quick hack to sidestep the consensus process for that.

So once we push this into drivers it's not going to be a bpf hook anymore
I think.

Cheers, Daniel
Kenny Ho Feb. 1, 2021, 4:51 p.m. UTC | #10
[Resent in plain text.]

On Mon, Feb 1, 2021 at 9:49 AM Daniel Vetter <daniel@ffwll.ch> wrote:
> - there's been a pile of cgroups proposal to manage gpus at the drm
>   subsystem level, some by Kenny, and frankly this at least looks a bit
>   like a quick hack to sidestep the consensus process for that.
No Daniel, this is a quick *draft* to get a conversation going.  Bpf was
actually a path suggested by Tejun back in 2018 so I think you are
mischaracterizing this quite a bit.

"2018-11-20 Kenny Ho:
To put the questions in more concrete terms, let say a user wants to
 expose certain part of a gpu to a particular cgroup similar to the
 way selective cpu cores are exposed to a cgroup via cpuset, how
 should we go about enabling such functionality?

2018-11-20 Tejun Heo:
Do what the intel driver or bpf is doing?  It's not difficult to hook
into cgroup for identification purposes."

Kenny
Daniel Vetter Feb. 3, 2021, 11:09 a.m. UTC | #11
On Mon, Feb 01, 2021 at 11:51:07AM -0500, Kenny Ho wrote:
> [Resent in plain text.]
> 
> On Mon, Feb 1, 2021 at 9:49 AM Daniel Vetter <daniel@ffwll.ch> wrote:
> > - there's been a pile of cgroups proposal to manage gpus at the drm
> >   subsystem level, some by Kenny, and frankly this at least looks a bit
> >   like a quick hack to sidestep the consensus process for that.
> No Daniel, this is quick *draft* to get a conversation going.  Bpf was
> actually a path suggested by Tejun back in 2018 so I think you are
> mischaracterizing this quite a bit.
> 
> "2018-11-20 Kenny Ho:
> To put the questions in more concrete terms, let say a user wants to
>  expose certain part of a gpu to a particular cgroup similar to the
>  way selective cpu cores are exposed to a cgroup via cpuset, how
>  should we go about enabling such functionality?
> 
> 2018-11-20 Tejun Heo:
> Do what the intel driver or bpf is doing?  It's not difficult to hook
> into cgroup for identification purposes."

Yeah, but if you go full amd specific for this, you might as well have a
specific BPF hook which is called in amdgpu/kfd and returns you the CU
mask for a given cgroups (and figures that out however it pleases).

Not a generic framework which lets you build pretty much any possible
cgroups controller for anything else using BPF. Trying to filter anything
at the generic ioctl just doesn't feel like a great idea that's long term
maintainable. E.g. what happens if there's new uapi for command
submission/context creation and now your bpf filter isn't catching all
access anymore? If it's an explicit hook that explicitly computes the CU
mask, then we can add more checks as needed. With ioctl that's impossible.

Plus I'm also not sure whether that's really a good idea still, since if
cloud companies have to build their own bespoke container stuff for every
gpu vendor, that's quite a bad platform we're building. And "I'd like to
make sure my gpu is used fairly among multiple tenents" really isn't a
use-case that's specific to amd.

If this would be something very hw specific like cache assignment and
quality of service stuff or things like that, then vendor specific imo
makes sense. But for CU masks essentially we're cutting the compute
resources up in some way, and I kinda expect everyone with a gpu who cares
about isolating workloads with cgroups wants to do that.
-Daniel
Kenny Ho Feb. 3, 2021, 7:01 p.m. UTC | #12
Daniel,

I will have to get back to you later on the details of this because my
head is currently context switched to some infrastructure and
Kubernetes/golang work, so I am having a hard time digesting what you
are saying.  I am new to the bpf stuff so this is about my own
learning as well as a conversation starter.  The high level goal here
is to have a path for flexibility via a bpf program.  Not just GPU or
DRM or CU mask, but devices making decisions via an operator-written
bpf-prog attached to a cgroup.  More inline.

On Wed, Feb 3, 2021 at 6:09 AM Daniel Vetter <daniel@ffwll.ch> wrote:
>
> On Mon, Feb 01, 2021 at 11:51:07AM -0500, Kenny Ho wrote:
> > On Mon, Feb 1, 2021 at 9:49 AM Daniel Vetter <daniel@ffwll.ch> wrote:
> > > - there's been a pile of cgroups proposal to manage gpus at the drm
> > >   subsystem level, some by Kenny, and frankly this at least looks a bit
> > >   like a quick hack to sidestep the consensus process for that.
> > No Daniel, this is quick *draft* to get a conversation going.  Bpf was
> > actually a path suggested by Tejun back in 2018 so I think you are
> > mischaracterizing this quite a bit.
> >
> > "2018-11-20 Kenny Ho:
> > To put the questions in more concrete terms, let say a user wants to
> >  expose certain part of a gpu to a particular cgroup similar to the
> >  way selective cpu cores are exposed to a cgroup via cpuset, how
> >  should we go about enabling such functionality?
> >
> > 2018-11-20 Tejun Heo:
> > Do what the intel driver or bpf is doing?  It's not difficult to hook
> > into cgroup for identification purposes."
>
> Yeah, but if you go full amd specific for this, you might as well have a
> specific BPF hook which is called in amdgpu/kfd and returns you the CU
> mask for a given cgroups (and figures that out however it pleases).
>
> Not a generic framework which lets you build pretty much any possible
> cgroups controller for anything else using BPF. Trying to filter anything
> at the generic ioctl just doesn't feel like a great idea that's long term
> maintainable. E.g. what happens if there's new uapi for command
> submission/context creation and now your bpf filter isn't catching all
> access anymore? If it's an explicit hook that explicitly computes the CU
> mask, then we can add more checks as needed. With ioctl that's impossible.
>
> Plus I'm also not sure whether that's really a good idea still, since if
> cloud companies have to built their own bespoke container stuff for every
> gpu vendor, that's quite a bad platform we're building. And "I'd like to
> make sure my gpu is used fairly among multiple tenents" really isn't a
> use-case that's specific to amd.

I don't understand what you are saying about containers here since
bpf-progs are not the same as container nor are they deployed from
inside a container (as far as I know, I am actually not sure how
bpf-cgroup works with higher level cloud orchestration since folks
like Docker just migrated to cgroup v2 very recently... I don't think
you can specify a bpf-prog to load as part of a k8s pod definition.)
That said, the bit I understand ("not sure whether that's really a
good idea....cloud companies have to built their own bespoke container
stuff for every gpu vendor...") is in fact the current status quo.  If
you look into some of the popular ML/AI-oriented containers/apps, you
will likely see things are mostly hardcoded to CUDA.  Since I work for
AMD, I wouldn't say that's a good thing but this is just the reality.
For Kubernetes at least (where my head is currently), the official
mechanisms are Device Plugins (I am the author for the one for AMD but
there are a few ones from Intel too, you can confirm with your
colleagues) and Node Feature/Labels.  Kubernetes schedules
pods/containers launched by users onto nodes/servers by matching the
node resources/labels against the resources/labels in the pod
specification created by the users.

> If this would be something very hw specific like cache assignment and
> quality of service stuff or things like that, then vendor specific imo
> makes sense. But for CU masks essentially we're cutting the compute
> resources up in some way, and I kinda expect everyone with a gpu who cares
> about isolating workloads with cgroups wants to do that.

Right, but isolating workloads is quality of service stuff and *how*
compute resources are cut up is vendor specific.

Anyway, as I said at the beginning of this reply, this is about
flexibility in support of the diversity of devices and architectures.
CU mask is simply a concrete example of hw diversity that a
bpf-program can encapsulate.  I can see this framework (a custom
program making decisions in a specific cgroup and device context) use
for other things as well.  It may even be useful within a vendor to
handle the diversity between SKUs.

Kenny
Daniel Vetter Feb. 5, 2021, 1:49 p.m. UTC | #13
Hi Kenny

On Wed, Feb 3, 2021 at 8:01 PM Kenny Ho <y2kenny@gmail.com> wrote:
>
> Daniel,
>
> I will have to get back to you later on the details of this because my
> head is currently context switched to some infrastructure and
> Kubernetes/golang work, so I am having a hard time digesting what you
> are saying.  I am new to the bpf stuff so this is about my own
> learning as well as a conversation starter.  The high level goal here
> is to have a path for flexibility via a bpf program.  Not just GPU or
> DRM or CU mask, but devices making decisions via an operator-written
> bpf-prog attached to a cgroup.  More inline.

If you have some pointers on this, I'm happy to do some reading and
learning too.

> On Wed, Feb 3, 2021 at 6:09 AM Daniel Vetter <daniel@ffwll.ch> wrote:
> >
> > On Mon, Feb 01, 2021 at 11:51:07AM -0500, Kenny Ho wrote:
> > > On Mon, Feb 1, 2021 at 9:49 AM Daniel Vetter <daniel@ffwll.ch> wrote:
> > > > - there's been a pile of cgroups proposal to manage gpus at the drm
> > > >   subsystem level, some by Kenny, and frankly this at least looks a bit
> > > >   like a quick hack to sidestep the consensus process for that.
> > > No Daniel, this is quick *draft* to get a conversation going.  Bpf was
> > > actually a path suggested by Tejun back in 2018 so I think you are
> > > mischaracterizing this quite a bit.
> > >
> > > "2018-11-20 Kenny Ho:
> > > To put the questions in more concrete terms, let say a user wants to
> > >  expose certain part of a gpu to a particular cgroup similar to the
> > >  way selective cpu cores are exposed to a cgroup via cpuset, how
> > >  should we go about enabling such functionality?
> > >
> > > 2018-11-20 Tejun Heo:
> > > Do what the intel driver or bpf is doing?  It's not difficult to hook
> > > into cgroup for identification purposes."
> >
> > Yeah, but if you go full amd specific for this, you might as well have a
> > specific BPF hook which is called in amdgpu/kfd and returns you the CU
> > mask for a given cgroups (and figures that out however it pleases).
> >
> > Not a generic framework which lets you build pretty much any possible
> > cgroups controller for anything else using BPF. Trying to filter anything
> > at the generic ioctl just doesn't feel like a great idea that's long term
> > maintainable. E.g. what happens if there's new uapi for command
> > submission/context creation and now your bpf filter isn't catching all
> > access anymore? If it's an explicit hook that explicitly computes the CU
> > mask, then we can add more checks as needed. With ioctl that's impossible.
> >
> > Plus I'm also not sure whether that's really a good idea still, since if
> > cloud companies have to built their own bespoke container stuff for every
> > gpu vendor, that's quite a bad platform we're building. And "I'd like to
> > make sure my gpu is used fairly among multiple tenents" really isn't a
> > use-case that's specific to amd.
>
> I don't understand what you are saying about containers here since
> bpf-progs are not the same as container nor are they deployed from
> inside a container (as far as I know, I am actually not sure how
> bpf-cgroup works with higher level cloud orchestration since folks
> like Docker just migrated to cgroup v2 very recently... I don't think
> you can specify a bpf-prog to load as part of a k8s pod definition.)
> That said, the bit I understand ("not sure whether that's really a
> good idea....cloud companies have to built their own bespoke container
> stuff for every gpu vendor...") is in fact the current status quo.  If
> you look into some of the popular ML/AI-oriented containers/apps, you
> will likely see things are mostly hardcoded to CUDA.  Since I work for
> AMD, I wouldn't say that's a good thing but this is just the reality.
> For Kubernetes at least (where my head is currently), the official
> mechanisms are Device Plugins (I am the author for the one for AMD but
> there are a few ones from Intel too, you can confirm with your
> colleagues)  and Node Feature/Labels.  Kubernetes schedules
> pod/container launched by users to the node/servers by the affinity of
> the node resources/labels, and the resources/labels in the pod
> specification created by the users.

Sure the current gpu compute ecosystem is pretty badly fragmented,
forcing higher levels (like containers, but also hpc runtimes, or
anything else) to paper over that with more plugins and abstraction
layers.

That's not really a good excuse for continuing with the fragmentation
when we upstream these features.

> > If this would be something very hw specific like cache assignment and
> > quality of service stuff or things like that, then vendor specific imo
> > makes sense. But for CU masks essentially we're cutting the compute
> > resources up in some way, and I kinda expect everyone with a gpu who cares
> > about isolating workloads with cgroups wants to do that.
>
> Right, but isolating workloads is quality of service stuff and *how*
> compute resources are cut up are vendor specific.
>
> Anyway, as I said at the beginning of this reply, this is about
> flexibility in support of the diversity of devices and architectures.
> CU mask is simply a concrete example of hw diversity that a
> bpf-program can encapsulate.  I can see this framework (a custom
> program making decisions in a specific cgroup and device context) use
> for other things as well.  It may even be useful within a vendor to
> handle the diversity between SKUs.

So I agree that on one side CU mask can be used for low-level quality
of service guarantees (like the CLOS cache stuff on intel cpus as an
example), and that's going to be rather hw specific no matter what.

But my understanding of AMD's plans here is that CU mask is the only
thing you'll have to partition gpu usage in a multi-tenant environment
- whether that's cloud or also whether that's containing apps to make
sure the compositor can still draw the desktop (except for fullscreen
ofc) doesn't really matter I think. And since there's clearly a need
for more general (but necessarily less well-defined) gpu usage
controlling and accounting I don't think exposing just the CU mask is
a good idea. That just perpetuates the current fragmented landscape,
and I really don't see why it's not possible to have a generic "I want
50% of my gpu available for these 2 containers each" solution

Of course on top of that having a bpf hook in amd to do the fine
grained QOS assignment for e.g. embedded applications which are very
carefully tuned, should still be possible. But that's on top, not as
the exclusive thing available.

Cheers, Daniel
Kenny Ho May 7, 2021, 2:06 a.m. UTC | #14
Sorry for the late reply (I have been working on other stuff.)

On Fri, Feb 5, 2021 at 8:49 AM Daniel Vetter <daniel@ffwll.ch> wrote:
>
> So I agree that on one side CU mask can be used for low-level quality
> of service guarantees (like the CLOS cache stuff on intel cpus as an
> example), and that's going to be rather hw specific no matter what.
>
> But my understanding of AMD's plans here is that CU mask is the only
> thing you'll have to partition gpu usage in a multi-tenant environment
> - whether that's cloud or also whether that's containing apps to make
> sure the compositor can still draw the desktop (except for fullscreen
> ofc) doesn't really matter I think.
This is not correct.  Even in the original cgroup proposal, it
supports both mask and count as a way to define unit(s) of sub-device.
For AMD, we already have SRIOV that supports GPU partitioning in a
time-sliced-of-a-whole-GPU fashion.

Kenny
Daniel Vetter May 7, 2021, 8:59 a.m. UTC | #15
On Thu, May 06, 2021 at 10:06:32PM -0400, Kenny Ho wrote:
> Sorry for the late reply (I have been working on other stuff.)
> 
> On Fri, Feb 5, 2021 at 8:49 AM Daniel Vetter <daniel@ffwll.ch> wrote:
> >
> > So I agree that on one side CU mask can be used for low-level quality
> > of service guarantees (like the CLOS cache stuff on intel cpus as an
> > example), and that's going to be rather hw specific no matter what.
> >
> > But my understanding of AMD's plans here is that CU mask is the only
> > thing you'll have to partition gpu usage in a multi-tenant environment
> > - whether that's cloud or also whether that's containing apps to make
> > sure the compositor can still draw the desktop (except for fullscreen
> > ofc) doesn't really matter I think.
> This is not correct.  Even in the original cgroup proposal, it
> supports both mask and count as a way to define unit(s) of sub-device.
> For AMD, we already have SRIOV that supports GPU partitioning in a
> time-sliced-of-a-whole-GPU fashion.

Hm I missed that. I feel like time-sliced-of-a-whole gpu is the easier gpu
cgroups controller to get started, since it's much closer to other cgroups
that control bandwidth of some kind. Whether it's i/o bandwidth or compute
bandwidth is kinda a wash.

CU mask feels a lot more like an isolation/guaranteed forward progress
kind of thing, and I suspect that's always going to be a lot more gpu hw
specific than anything we can reasonably put into a general cgroups
controller.

Also for the time slice cgroups thing, can you pls give me pointers to
these old patches that had it, and how it's done? I very obviously missed
that part.

Thanks, Daniel
Kenny Ho May 7, 2021, 3:33 p.m. UTC | #16
On Fri, May 7, 2021 at 4:59 AM Daniel Vetter <daniel@ffwll.ch> wrote:
>
> Hm I missed that. I feel like time-sliced-of-a-whole gpu is the easier gpu
> cgroups controller to get started, since it's much closer to other cgroups
> that control bandwidth of some kind. Whether it's i/o bandwidth or compute
> bandwidth is kinda a wash.
sriov/time-sliced-of-a-whole gpu does not really need a cgroup
interface since each slice appears as a stand-alone device.  This is
already in production (not using cgroup) with users.  The cgroup
proposal has always been parallel to that in many senses: 1) spatial
partitioning as an independent but equally valid use case as time
sharing, 2) sub-device resource control as opposed to full device
control motivated by the workload characterization paper.  It was
never about time vs space in terms of use cases but about having a new
API for users to be able to do spatial subdevice partitioning.

> CU mask feels a lot more like an isolation/guaranteed forward progress
> kind of thing, and I suspect that's always going to be a lot more gpu hw
> specific than anything we can reasonably put into a general cgroups
> controller.
The first half is correct but I disagree with the conclusion.  The
analogy I would use is the multi-core CPU.  The capability of individual
CPU cores, the core count and the core arrangement may be hw specific,
but there are general interfaces to support selection of these cores.  A
CU mask may be hw specific, but spatial partitioning as an idea is not.
Most gpu vendors have the concept of sub-device compute units (EU, SE,
etc.); OpenCL has the concept of a subdevice in the language.  I don't
see any obstacle for vendors to implement spatial partitioning, just
like many CPU vendors support the idea of multi-core.
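
[For illustration only: the kind of vendor-neutral core-selection
interface that already exists on the CPU side.  Nothing here is GPU- or
cgroup-specific, and the core numbers are arbitrary.]

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
        cpu_set_t set;

        /* Pin the calling task to cores 0 and 1, regardless of CPU vendor. */
        CPU_ZERO(&set);
        CPU_SET(0, &set);
        CPU_SET(1, &set);

        if (sched_setaffinity(0, sizeof(set), &set))
                perror("sched_setaffinity");
        return 0;
}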

> Also for the time slice cgroups thing, can you pls give me pointers to
> these old patches that had it, and how it's done? I very obviously missed
> that part.
I think you misunderstood what I wrote earlier.  The original proposal
was about spatial partitioning of subdevice resources, not time sharing
via cgroup (since time sharing is already supported elsewhere).

Kenny
Daniel Vetter May 7, 2021, 4:13 p.m. UTC | #17
On Fri, May 07, 2021 at 11:33:46AM -0400, Kenny Ho wrote:
> On Fri, May 7, 2021 at 4:59 AM Daniel Vetter <daniel@ffwll.ch> wrote:
> >
> > Hm I missed that. I feel like time-sliced-of-a-whole gpu is the easier gpu
> > cgroups controler to get started, since it's much closer to other cgroups
> > that control bandwidth of some kind. Whether it's i/o bandwidth or compute
> > bandwidht is kinda a wash.
> sriov/time-sliced-of-a-whole gpu does not really need a cgroup
> interface since each slice appears as a stand alone device.  This is
> already in production (not using cgroup) with users.  The cgroup
> proposal has always been parallel to that in many sense: 1) spatial
> partitioning as an independent but equally valid use case as time
> sharing, 2) sub-device resource control as opposed to full device
> control motivated by the workload characterization paper.  It was
> never about time vs space in terms of use cases but having new API for
> users to be able to do spatial subdevice partitioning.
> 
> > CU mask feels a lot more like an isolation/guaranteed forward progress
> > kind of thing, and I suspect that's always going to be a lot more gpu hw
> > specific than anything we can reasonably put into a general cgroups
> > controller.
> The first half is correct but I disagree with the conclusion.  The
> analogy I would use is multi-core CPU.  The capability of individual
> CPU cores, core count and core arrangement may be hw specific but
> there are general interfaces to support selection of these cores.  CU
> mask may be hw specific but spatial partitioning as an idea is not.
> Most gpu vendors have the concept of sub-device compute units (EU, SE,
> etc.); OpenCL has the concept of subdevice in the language.  I don't
> see any obstacle for vendors to implement spatial partitioning just
> like many CPU vendors support the idea of multi-core.
> 
> > Also for the time slice cgroups thing, can you pls give me pointers to
> > these old patches that had it, and how it's done? I very obviously missed
> > that part.
> I think you misunderstood what I wrote earlier.  The original proposal
> was about spatial partitioning of subdevice resources not time sharing
> using cgroup (since time sharing is already supported elsewhere.)

Well SRIOV time-sharing is for virtualization. cgroups is for
containerization, which is just virtualization but with less overhead and
more security bugs.

More or less.

So either I still have things wrong, or we'll get time-sharing for
virtualization and partitioning of CUs for containerization. That doesn't
make much sense to me.

Since time-sharing is the first thing that's done for virtualization, I
think it's probably also the most reasonable place to start for containers.
-Daniel
Alex Deucher May 7, 2021, 4:19 p.m. UTC | #18
On Fri, May 7, 2021 at 12:13 PM Daniel Vetter <daniel@ffwll.ch> wrote:
>
> On Fri, May 07, 2021 at 11:33:46AM -0400, Kenny Ho wrote:
> > On Fri, May 7, 2021 at 4:59 AM Daniel Vetter <daniel@ffwll.ch> wrote:
> > >
> > > Hm I missed that. I feel like time-sliced-of-a-whole gpu is the easier gpu
> > > cgroups controler to get started, since it's much closer to other cgroups
> > > that control bandwidth of some kind. Whether it's i/o bandwidth or compute
> > > bandwidht is kinda a wash.
> > sriov/time-sliced-of-a-whole gpu does not really need a cgroup
> > interface since each slice appears as a stand alone device.  This is
> > already in production (not using cgroup) with users.  The cgroup
> > proposal has always been parallel to that in many sense: 1) spatial
> > partitioning as an independent but equally valid use case as time
> > sharing, 2) sub-device resource control as opposed to full device
> > control motivated by the workload characterization paper.  It was
> > never about time vs space in terms of use cases but having new API for
> > users to be able to do spatial subdevice partitioning.
> >
> > > CU mask feels a lot more like an isolation/guaranteed forward progress
> > > kind of thing, and I suspect that's always going to be a lot more gpu hw
> > > specific than anything we can reasonably put into a general cgroups
> > > controller.
> > The first half is correct but I disagree with the conclusion.  The
> > analogy I would use is multi-core CPU.  The capability of individual
> > CPU cores, core count and core arrangement may be hw specific but
> > there are general interfaces to support selection of these cores.  CU
> > mask may be hw specific but spatial partitioning as an idea is not.
> > Most gpu vendors have the concept of sub-device compute units (EU, SE,
> > etc.); OpenCL has the concept of subdevice in the language.  I don't
> > see any obstacle for vendors to implement spatial partitioning just
> > like many CPU vendors support the idea of multi-core.
> >
> > > Also for the time slice cgroups thing, can you pls give me pointers to
> > > these old patches that had it, and how it's done? I very obviously missed
> > > that part.
> > I think you misunderstood what I wrote earlier.  The original proposal
> > was about spatial partitioning of subdevice resources not time sharing
> > using cgroup (since time sharing is already supported elsewhere.)
>
> Well SRIOV time-sharing is for virtualization. cgroups is for
> containerization, which is just virtualization but with less overhead and
> more security bugs.
>
> More or less.
>
> So either I get things still wrong, or we'll get time-sharing for
> virtualization, and partitioning of CU for containerization. That doesn't
> make that much sense to me.

You could still potentially do SR-IOV for containerization.  You'd
just pass one of the PCI VFs (virtual functions) to the container and
you'd automatically get the time slice.  I don't see why cgroups would
be a factor there.

Alex

>
> Since time-sharing is the first thing that's done for virtualization I
> think it's probably also the most reasonable to start with for containers.
> -Daniel
> --
> Daniel Vetter
> Software Engineer, Intel Corporation
> http://blog.ffwll.ch
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Daniel Vetter May 7, 2021, 4:26 p.m. UTC | #19
On Fri, May 07, 2021 at 12:19:13PM -0400, Alex Deucher wrote:
> On Fri, May 7, 2021 at 12:13 PM Daniel Vetter <daniel@ffwll.ch> wrote:
> >
> > On Fri, May 07, 2021 at 11:33:46AM -0400, Kenny Ho wrote:
> > > On Fri, May 7, 2021 at 4:59 AM Daniel Vetter <daniel@ffwll.ch> wrote:
> > > >
> > > > Hm I missed that. I feel like time-sliced-of-a-whole gpu is the easier gpu
> > > > cgroups controler to get started, since it's much closer to other cgroups
> > > > that control bandwidth of some kind. Whether it's i/o bandwidth or compute
> > > > bandwidht is kinda a wash.
> > > sriov/time-sliced-of-a-whole gpu does not really need a cgroup
> > > interface since each slice appears as a stand alone device.  This is
> > > already in production (not using cgroup) with users.  The cgroup
> > > proposal has always been parallel to that in many sense: 1) spatial
> > > partitioning as an independent but equally valid use case as time
> > > sharing, 2) sub-device resource control as opposed to full device
> > > control motivated by the workload characterization paper.  It was
> > > never about time vs space in terms of use cases but having new API for
> > > users to be able to do spatial subdevice partitioning.
> > >
> > > > CU mask feels a lot more like an isolation/guaranteed forward progress
> > > > kind of thing, and I suspect that's always going to be a lot more gpu hw
> > > > specific than anything we can reasonably put into a general cgroups
> > > > controller.
> > > The first half is correct but I disagree with the conclusion.  The
> > > analogy I would use is multi-core CPU.  The capability of individual
> > > CPU cores, core count and core arrangement may be hw specific but
> > > there are general interfaces to support selection of these cores.  CU
> > > mask may be hw specific but spatial partitioning as an idea is not.
> > > Most gpu vendors have the concept of sub-device compute units (EU, SE,
> > > etc.); OpenCL has the concept of subdevice in the language.  I don't
> > > see any obstacle for vendors to implement spatial partitioning just
> > > like many CPU vendors support the idea of multi-core.
> > >
> > > > Also for the time slice cgroups thing, can you pls give me pointers to
> > > > these old patches that had it, and how it's done? I very obviously missed
> > > > that part.
> > > I think you misunderstood what I wrote earlier.  The original proposal
> > > was about spatial partitioning of subdevice resources not time sharing
> > > using cgroup (since time sharing is already supported elsewhere.)
> >
> > Well SRIOV time-sharing is for virtualization. cgroups is for
> > containerization, which is just virtualization but with less overhead and
> > more security bugs.
> >
> > More or less.
> >
> > So either I get things still wrong, or we'll get time-sharing for
> > virtualization, and partitioning of CU for containerization. That doesn't
> > make that much sense to me.
> 
> You could still potentially do SR-IOV for containerization.  You'd
> just pass one of the PCI VFs (virtual functions) to the container and
> you'd automatically get the time slice.  I don't see why cgroups would
> be a factor there.

Standard interface to manage that time-slicing. I guess for SRIOV it's all
vendor sauce (intel as guilty as anyone else from what I can see), but for
cgroups that feels like it's falling a bit short of what we should aim
for.

But dunno, maybe I'm just dreaming too much :-)
-Daniel

> Alex
> 
> >
> > Since time-sharing is the first thing that's done for virtualization I
> > think it's probably also the most reasonable to start with for containers.
> > -Daniel
> > --
> > Daniel Vetter
> > Software Engineer, Intel Corporation
> > http://blog.ffwll.ch
> > _______________________________________________
> > amd-gfx mailing list
> > amd-gfx@lists.freedesktop.org
> > https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Alex Deucher May 7, 2021, 4:31 p.m. UTC | #20
On Fri, May 7, 2021 at 12:26 PM Daniel Vetter <daniel@ffwll.ch> wrote:
>
> On Fri, May 07, 2021 at 12:19:13PM -0400, Alex Deucher wrote:
> > On Fri, May 7, 2021 at 12:13 PM Daniel Vetter <daniel@ffwll.ch> wrote:
> > >
> > > On Fri, May 07, 2021 at 11:33:46AM -0400, Kenny Ho wrote:
> > > > On Fri, May 7, 2021 at 4:59 AM Daniel Vetter <daniel@ffwll.ch> wrote:
> > > > >
> > > > > Hm I missed that. I feel like time-sliced-of-a-whole gpu is the easier gpu
> > > > > cgroups controler to get started, since it's much closer to other cgroups
> > > > > that control bandwidth of some kind. Whether it's i/o bandwidth or compute
> > > > > bandwidht is kinda a wash.
> > > > sriov/time-sliced-of-a-whole gpu does not really need a cgroup
> > > > interface since each slice appears as a stand alone device.  This is
> > > > already in production (not using cgroup) with users.  The cgroup
> > > > proposal has always been parallel to that in many sense: 1) spatial
> > > > partitioning as an independent but equally valid use case as time
> > > > sharing, 2) sub-device resource control as opposed to full device
> > > > control motivated by the workload characterization paper.  It was
> > > > never about time vs space in terms of use cases but having new API for
> > > > users to be able to do spatial subdevice partitioning.
> > > >
> > > > > CU mask feels a lot more like an isolation/guaranteed forward progress
> > > > > kind of thing, and I suspect that's always going to be a lot more gpu hw
> > > > > specific than anything we can reasonably put into a general cgroups
> > > > > controller.
> > > > The first half is correct but I disagree with the conclusion.  The
> > > > analogy I would use is multi-core CPU.  The capability of individual
> > > > CPU cores, core count and core arrangement may be hw specific but
> > > > there are general interfaces to support selection of these cores.  CU
> > > > mask may be hw specific but spatial partitioning as an idea is not.
> > > > Most gpu vendors have the concept of sub-device compute units (EU, SE,
> > > > etc.); OpenCL has the concept of subdevice in the language.  I don't
> > > > see any obstacle for vendors to implement spatial partitioning just
> > > > like many CPU vendors support the idea of multi-core.
> > > >
> > > > > Also for the time slice cgroups thing, can you pls give me pointers to
> > > > > these old patches that had it, and how it's done? I very obviously missed
> > > > > that part.
> > > > I think you misunderstood what I wrote earlier.  The original proposal
> > > > was about spatial partitioning of subdevice resources not time sharing
> > > > using cgroup (since time sharing is already supported elsewhere.)
> > >
> > > Well SRIOV time-sharing is for virtualization. cgroups is for
> > > containerization, which is just virtualization but with less overhead and
> > > more security bugs.
> > >
> > > More or less.
> > >
> > > So either I get things still wrong, or we'll get time-sharing for
> > > virtualization, and partitioning of CU for containerization. That doesn't
> > > make that much sense to me.
> >
> > You could still potentially do SR-IOV for containerization.  You'd
> > just pass one of the PCI VFs (virtual functions) to the container and
> > you'd automatically get the time slice.  I don't see why cgroups would
> > be a factor there.
>
> Standard interface to manage that time-slicing. I guess for SRIOV it's all
> vendor sauce (intel as guilty as anyone else from what I can see), but for
> cgroups that feels like it's falling a bit short of what we should aim
> for.
>
> But dunno, maybe I'm just dreaming too much :-)

I don't disagree, I'm just not sure how it would apply to SR-IOV.
Once you've created the virtual functions, you've already created the
partitioning (regardless of whether it's spatial or temporal), so where
would cgroups come into play?

Alex

> -Daniel
>
> > Alex
> >
> > >
> > > Since time-sharing is the first thing that's done for virtualization I
> > > think it's probably also the most reasonable to start with for containers.
> > > -Daniel
> > > --
> > > Daniel Vetter
> > > Software Engineer, Intel Corporation
> > > http://blog.ffwll.ch
> > > _______________________________________________
> > > amd-gfx mailing list
> > > amd-gfx@lists.freedesktop.org
> > > https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>
> --
> Daniel Vetter
> Software Engineer, Intel Corporation
> http://blog.ffwll.ch
Alex Deucher May 7, 2021, 4:50 p.m. UTC | #21
On Fri, May 7, 2021 at 12:31 PM Alex Deucher <alexdeucher@gmail.com> wrote:
>
> On Fri, May 7, 2021 at 12:26 PM Daniel Vetter <daniel@ffwll.ch> wrote:
> >
> > On Fri, May 07, 2021 at 12:19:13PM -0400, Alex Deucher wrote:
> > > On Fri, May 7, 2021 at 12:13 PM Daniel Vetter <daniel@ffwll.ch> wrote:
> > > >
> > > > On Fri, May 07, 2021 at 11:33:46AM -0400, Kenny Ho wrote:
> > > > > On Fri, May 7, 2021 at 4:59 AM Daniel Vetter <daniel@ffwll.ch> wrote:
> > > > > >
> > > > > > Hm I missed that. I feel like time-sliced-of-a-whole gpu is the easier gpu
> > > > > > cgroups controler to get started, since it's much closer to other cgroups
> > > > > > that control bandwidth of some kind. Whether it's i/o bandwidth or compute
> > > > > > bandwidht is kinda a wash.
> > > > > sriov/time-sliced-of-a-whole gpu does not really need a cgroup
> > > > > interface since each slice appears as a stand alone device.  This is
> > > > > already in production (not using cgroup) with users.  The cgroup
> > > > > proposal has always been parallel to that in many sense: 1) spatial
> > > > > partitioning as an independent but equally valid use case as time
> > > > > sharing, 2) sub-device resource control as opposed to full device
> > > > > control motivated by the workload characterization paper.  It was
> > > > > never about time vs space in terms of use cases but having new API for
> > > > > users to be able to do spatial subdevice partitioning.
> > > > >
> > > > > > CU mask feels a lot more like an isolation/guaranteed forward progress
> > > > > > kind of thing, and I suspect that's always going to be a lot more gpu hw
> > > > > > specific than anything we can reasonably put into a general cgroups
> > > > > > controller.
> > > > > The first half is correct but I disagree with the conclusion.  The
> > > > > analogy I would use is multi-core CPU.  The capability of individual
> > > > > CPU cores, core count and core arrangement may be hw specific but
> > > > > there are general interfaces to support selection of these cores.  CU
> > > > > mask may be hw specific but spatial partitioning as an idea is not.
> > > > > Most gpu vendors have the concept of sub-device compute units (EU, SE,
> > > > > etc.); OpenCL has the concept of subdevice in the language.  I don't
> > > > > see any obstacle for vendors to implement spatial partitioning just
> > > > > like many CPU vendors support the idea of multi-core.
> > > > >
> > > > > > Also for the time slice cgroups thing, can you pls give me pointers to
> > > > > > these old patches that had it, and how it's done? I very obviously missed
> > > > > > that part.
> > > > > I think you misunderstood what I wrote earlier.  The original proposal
> > > > > was about spatial partitioning of subdevice resources not time sharing
> > > > > using cgroup (since time sharing is already supported elsewhere.)
> > > >
> > > > Well SRIOV time-sharing is for virtualization. cgroups is for
> > > > containerization, which is just virtualization but with less overhead and
> > > > more security bugs.
> > > >
> > > > More or less.
> > > >
> > > > So either I get things still wrong, or we'll get time-sharing for
> > > > virtualization, and partitioning of CU for containerization. That doesn't
> > > > make that much sense to me.
> > >
> > > You could still potentially do SR-IOV for containerization.  You'd
> > > just pass one of the PCI VFs (virtual functions) to the container and
> > > you'd automatically get the time slice.  I don't see why cgroups would
> > > be a factor there.
> >
> > Standard interface to manage that time-slicing. I guess for SRIOV it's all
> > vendor sauce (intel as guilty as anyone else from what I can see), but for
> > cgroups that feels like it's falling a bit short of what we should aim
> > for.
> >
> > But dunno, maybe I'm just dreaming too much :-)
>
> I don't disagree, I'm just not sure how it would apply to SR-IOV.
> Once you've created the virtual functions, you've already created the
> partitioning (regardless of whether it's spatial or temporal) so where
> would cgroups come into play?

For some background, the SR-IOV virtual functions show up like actual
PCI endpoints on the bus, so SR-IOV is sort of like cgroups
implemented in hardware.  When you enable SR-IOV, the endpoints that
are created are the partitions.
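
[Sketch of what "enabling SR-IOV" looks like from the host side,
assuming the GPU's PF driver exposes the standard PCI sriov_numvfs
sysfs attribute; the device address below is a placeholder.]

#include <stdio.h>

int main(void)
{
        /* 0000:03:00.0 is a placeholder for an SR-IOV capable GPU. */
        FILE *f = fopen("/sys/bus/pci/devices/0000:03:00.0/sriov_numvfs", "w");

        if (!f) {
                perror("sriov_numvfs");
                return 1;
        }
        /* Ask the PF driver for two VFs; each shows up as its own PCI endpoint. */
        fprintf(f, "2\n");
        return fclose(f) ? 1 : 0;
}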

Alex

>
> Alex
>
> > -Daniel
> >
> > > Alex
> > >
> > > >
> > > > Since time-sharing is the first thing that's done for virtualization I
> > > > think it's probably also the most reasonable to start with for containers.
> > > > -Daniel
> > > > --
> > > > Daniel Vetter
> > > > Software Engineer, Intel Corporation
> > > > http://blog.ffwll.ch
> > > > _______________________________________________
> > > > amd-gfx mailing list
> > > > amd-gfx@lists.freedesktop.org
> > > > https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> >
> > --
> > Daniel Vetter
> > Software Engineer, Intel Corporation
> > http://blog.ffwll.ch
Daniel Vetter May 7, 2021, 4:54 p.m. UTC | #22
On Fri, May 07, 2021 at 12:50:07PM -0400, Alex Deucher wrote:
> On Fri, May 7, 2021 at 12:31 PM Alex Deucher <alexdeucher@gmail.com> wrote:
> >
> > On Fri, May 7, 2021 at 12:26 PM Daniel Vetter <daniel@ffwll.ch> wrote:
> > >
> > > On Fri, May 07, 2021 at 12:19:13PM -0400, Alex Deucher wrote:
> > > > On Fri, May 7, 2021 at 12:13 PM Daniel Vetter <daniel@ffwll.ch> wrote:
> > > > >
> > > > > On Fri, May 07, 2021 at 11:33:46AM -0400, Kenny Ho wrote:
> > > > > > On Fri, May 7, 2021 at 4:59 AM Daniel Vetter <daniel@ffwll.ch> wrote:
> > > > > > >
> > > > > > > Hm I missed that. I feel like time-sliced-of-a-whole gpu is the easier gpu
> > > > > > > cgroups controler to get started, since it's much closer to other cgroups
> > > > > > > that control bandwidth of some kind. Whether it's i/o bandwidth or compute
> > > > > > > bandwidht is kinda a wash.
> > > > > > sriov/time-sliced-of-a-whole gpu does not really need a cgroup
> > > > > > interface since each slice appears as a stand alone device.  This is
> > > > > > already in production (not using cgroup) with users.  The cgroup
> > > > > > proposal has always been parallel to that in many sense: 1) spatial
> > > > > > partitioning as an independent but equally valid use case as time
> > > > > > sharing, 2) sub-device resource control as opposed to full device
> > > > > > control motivated by the workload characterization paper.  It was
> > > > > > never about time vs space in terms of use cases but having new API for
> > > > > > users to be able to do spatial subdevice partitioning.
> > > > > >
> > > > > > > CU mask feels a lot more like an isolation/guaranteed forward progress
> > > > > > > kind of thing, and I suspect that's always going to be a lot more gpu hw
> > > > > > > specific than anything we can reasonably put into a general cgroups
> > > > > > > controller.
> > > > > > The first half is correct but I disagree with the conclusion.  The
> > > > > > analogy I would use is multi-core CPU.  The capability of individual
> > > > > > CPU cores, core count and core arrangement may be hw specific but
> > > > > > there are general interfaces to support selection of these cores.  CU
> > > > > > mask may be hw specific but spatial partitioning as an idea is not.
> > > > > > Most gpu vendors have the concept of sub-device compute units (EU, SE,
> > > > > > etc.); OpenCL has the concept of subdevice in the language.  I don't
> > > > > > see any obstacle for vendors to implement spatial partitioning just
> > > > > > like many CPU vendors support the idea of multi-core.
> > > > > >
> > > > > > > Also for the time slice cgroups thing, can you pls give me pointers to
> > > > > > > these old patches that had it, and how it's done? I very obviously missed
> > > > > > > that part.
> > > > > > I think you misunderstood what I wrote earlier.  The original proposal
> > > > > > was about spatial partitioning of subdevice resources not time sharing
> > > > > > using cgroup (since time sharing is already supported elsewhere.)
> > > > >
> > > > > Well SRIOV time-sharing is for virtualization. cgroups is for
> > > > > containerization, which is just virtualization but with less overhead and
> > > > > more security bugs.
> > > > >
> > > > > More or less.
> > > > >
> > > > > So either I get things still wrong, or we'll get time-sharing for
> > > > > virtualization, and partitioning of CU for containerization. That doesn't
> > > > > make that much sense to me.
> > > >
> > > > You could still potentially do SR-IOV for containerization.  You'd
> > > > just pass one of the PCI VFs (virtual functions) to the container and
> > > > you'd automatically get the time slice.  I don't see why cgroups would
> > > > be a factor there.
> > >
> > > Standard interface to manage that time-slicing. I guess for SRIOV it's all
> > > vendor sauce (intel as guilty as anyone else from what I can see), but for
> > > cgroups that feels like it's falling a bit short of what we should aim
> > > for.
> > >
> > > But dunno, maybe I'm just dreaming too much :-)
> >
> > I don't disagree, I'm just not sure how it would apply to SR-IOV.
> > Once you've created the virtual functions, you've already created the
> > partitioning (regardless of whether it's spatial or temporal) so where
> > would cgroups come into play?
> 
> For some background, the SR-IOV virtual functions show up like actual
> PCI endpoints on the bus, so SR-IOV is sort of like cgroups
> implemented in hardware.  When you enable SR-IOV, the endpoints that
> are created are the partitions.

Yeah I think we're massively agreeing right now :-)

SRIOV is kinda by design vendor specific. You set up the VF endpoint, it
shows up, it's all hw+fw magic. Nothing for cgroups to manage here at all.

All I meant is that for the container/cgroups world starting out with
time-sharing feels like the best fit, not least because your SRIOV designers
also seem to think that's the best first cut for cloud-y computing.
Whether it's virtualized or containerized is a distinction that's getting
ever more blurry, with virtualization becoming a lot more dynamic and
container runtimes also possibly using hw virtualization underneath.
-Daniel

> 
> Alex
> 
> >
> > Alex
> >
> > > -Daniel
> > >
> > > > Alex
> > > >
> > > > >
> > > > > Since time-sharing is the first thing that's done for virtualization I
> > > > > think it's probably also the most reasonable to start with for containers.
> > > > > -Daniel
> > > > > --
> > > > > Daniel Vetter
> > > > > Software Engineer, Intel Corporation
> > > > > http://blog.ffwll.ch
> > > > > _______________________________________________
> > > > > amd-gfx mailing list
> > > > > amd-gfx@lists.freedesktop.org
> > > > > https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> > >
> > > --
> > > Daniel Vetter
> > > Software Engineer, Intel Corporation
> > > http://blog.ffwll.ch
Kenny Ho May 7, 2021, 5:04 p.m. UTC | #23
On Fri, May 7, 2021 at 12:54 PM Daniel Vetter <daniel@ffwll.ch> wrote:
>
> SRIOV is kinda by design vendor specific. You set up the VF endpoint, it
> shows up, it's all hw+fw magic. Nothing for cgroups to manage here at all.
Right, so in theory you just use the device cgroup with the VF endpoints.
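
[A minimal sketch of that idea using the existing
BPF_PROG_TYPE_CGROUP_DEVICE machinery: allow the container to open only
the render node of its assigned VF.  The DRM char major is 226; the
minor below is a placeholder, since render minors depend on probe
order.]

// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

#define DRM_MAJOR       226
#define VF_RENDER_MINOR 129     /* placeholder: renderD129 belongs to the VF */

SEC("cgroup/dev")
int vf_only(struct bpf_cgroup_dev_ctx *ctx)
{
        /* Low 16 bits of access_type carry the device type. */
        __u32 dev_type = ctx->access_type & 0xffff;

        if (dev_type != BPF_DEVCG_DEV_CHAR)
                return 1;                       /* not a char device: allow */
        if (ctx->major == DRM_MAJOR)
                return ctx->minor == VF_RENDER_MINOR;
        return 1;                               /* allow everything else */
}

char _license[] SEC("license") = "GPL";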

> All I meant is that for the container/cgroups world starting out with
> time-sharing feels like the best fit, least because your SRIOV designers
> also seem to think that's the best first cut for cloud-y computing.
> Whether it's virtualized or containerized is a distinction that's getting
> ever more blurry, with virtualization become a lot more dynamic and
> container runtimes als possibly using hw virtualization underneath.
I disagree.  By the same logic, the existence of the CU mask would imply
that it is the preferred way to do per-process sub-device control.

Kenny
Tejun Heo May 7, 2021, 7:33 p.m. UTC | #24
Hello,

On Fri, May 07, 2021 at 06:54:13PM +0200, Daniel Vetter wrote:
> All I meant is that for the container/cgroups world starting out with
> time-sharing feels like the best fit, least because your SRIOV designers
> also seem to think that's the best first cut for cloud-y computing.
> Whether it's virtualized or containerized is a distinction that's getting
> ever more blurry, with virtualization become a lot more dynamic and
> container runtimes als possibly using hw virtualization underneath.

FWIW, I'm completely in the same boat. There are two fundamental issues with
hardware-mask-based control - control granularity and work conservation.
Combined, they make it a significantly more difficult interface to use, one
that requires hardware-specific tuning rather than simply being able to say "I
wanna prioritize this job twice over that one".
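
[For concreteness, the interface model being contrasted here is the
cgroup v2 weight knob; cpu.weight and io.weight already work this way
(1..10000, default 100).  A "gpu.weight" file is purely hypothetical at
this point, and the cgroup paths below are placeholders.]

#include <stdio.h>

static int set_weight(const char *cgrp, const char *file, int weight)
{
        char path[256];
        FILE *f;

        snprintf(path, sizeof(path), "/sys/fs/cgroup/%s/%s", cgrp, file);
        f = fopen(path, "w");
        if (!f)
                return -1;
        fprintf(f, "%d\n", weight);
        return fclose(f);
}

int main(void)
{
        /* "prioritize this job twice over that one" */
        set_weight("jobs/high", "cpu.weight", 200);
        set_weight("jobs/low", "cpu.weight", 100);
        return 0;
}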

My knowledge of gpus is really limited but my understanding is also that the
gpu cores and threads aren't as homogeneous as their CPU counterparts across
vendors, product generations and possibly even within a single chip,
which makes the problem even worse.

Given that GPUs are time-shareable to begin with, the most universal
solution seems pretty clear.

Thanks.
Alex Deucher May 7, 2021, 7:55 p.m. UTC | #25
On Fri, May 7, 2021 at 3:33 PM Tejun Heo <tj@kernel.org> wrote:
>
> Hello,
>
> On Fri, May 07, 2021 at 06:54:13PM +0200, Daniel Vetter wrote:
> > All I meant is that for the container/cgroups world starting out with
> > time-sharing feels like the best fit, least because your SRIOV designers
> > also seem to think that's the best first cut for cloud-y computing.
> > Whether it's virtualized or containerized is a distinction that's getting
> > ever more blurry, with virtualization become a lot more dynamic and
> > container runtimes als possibly using hw virtualization underneath.
>
> FWIW, I'm completely on the same boat. There are two fundamental issues with
> hardware-mask based control - control granularity and work conservation.
> Combined, they make it a significantly more difficult interface to use which
> requires hardware-specific tuning rather than simply being able to say "I
> wanna prioritize this job twice over that one".
>
> My knoweldge of gpus is really limited but my understanding is also that the
> gpu cores and threads aren't as homogeneous as the CPU counterparts across
> the vendors, product generations and possibly even within a single chip,
> which makes the problem even worse.
>
> Given that GPUs are time-shareable to begin with, the most universal
> solution seems pretty clear.

The problem is that temporal partitioning on GPUs is much harder to enforce
unless you have a special case like SR-IOV.  Spatial partitioning, on
AMD GPUs at least, is widely available and easily enforced.  What is
the point of implementing temporal-style cgroups if no one can enforce
them effectively?

Alex

>
> Thanks.
>
> --
> tejun
Tejun Heo May 7, 2021, 8:59 p.m. UTC | #26
Hello,

On Fri, May 07, 2021 at 03:55:39PM -0400, Alex Deucher wrote:
> The problem is temporal partitioning on GPUs is much harder to enforce
> unless you have a special case like SR-IOV.  Spatial partitioning, on
> AMD GPUs at least, is widely available and easily enforced.  What is
> the point of implementing temporal style cgroups if no one can enforce
> it effectively?

So, if generic fine-grained partitioning can't be implemented, the right
thing to do is to stop pushing for a full-blown cgroup interface for it. The
hardware simply isn't capable of being managed in a way which allows generic
fine-grained hierarchical scheduling, and there's no point in bloating the
interface with half-baked, hardware-dependent features.

This isn't to say that there's no way to support them, but what has been
proposed is way too generic and ambitious in terms of interface while
being poorly developed on the internal abstraction and mechanism front. If
the hardware can't do generic, either implement the barest minimum interface
(e.g. be a part of the misc controller) or go driver-specific - the feature is
hardware specific anyway. I've repeated this multiple times in these
discussions now, but it'd be really helpful to try to minimize the interface
while concentrating more on internal abstractions and actual control
mechanisms.

Thanks.
Alex Deucher May 7, 2021, 10:30 p.m. UTC | #27
On Fri, May 7, 2021 at 4:59 PM Tejun Heo <tj@kernel.org> wrote:
>
> Hello,
>
> On Fri, May 07, 2021 at 03:55:39PM -0400, Alex Deucher wrote:
> > The problem is temporal partitioning on GPUs is much harder to enforce
> > unless you have a special case like SR-IOV.  Spatial partitioning, on
> > AMD GPUs at least, is widely available and easily enforced.  What is
> > the point of implementing temporal style cgroups if no one can enforce
> > it effectively?
>
> So, if generic fine-grained partitioning can't be implemented, the right
> thing to do is stopping pushing for full-blown cgroup interface for it. The
> hardware simply isn't capable of being managed in a way which allows generic
> fine-grained hierarchical scheduling and there's no point in bloating the
> interface with half baked hardware dependent features.
>
> This isn't to say that there's no way to support them, but what have been
> being proposed is way too generic and ambitious in terms of interface while
> being poorly developed on the internal abstraction and mechanism front. If
> the hardware can't do generic, either implement the barest minimum interface
> (e.g. be a part of misc controller) or go driver-specific - the feature is
> hardware specific anyway. I've repeated this multiple times in these
> discussions now but it'd be really helpful to try to minimize the interace
> while concentrating more on internal abstractions and actual control
> mechanisms.

Maybe we are speaking past each other.  I'm not following.  We got
here because a device-specific cgroup didn't make sense.  With my
Linux user hat on, that makes sense.  I don't want to write code to a
bunch of device-specific interfaces if I can avoid it.  But as for
temporal vs spatial partitioning of the GPU, the argument seems to be
a sort of hand-wavy one that both spatial and temporal partitioning
make sense on CPUs, but only temporal partitioning makes sense on
GPUs.  I'm trying to understand that assertion.  There are some GPUs
that can more easily be temporally partitioned and some that can be
more easily spatially partitioned.  It doesn't seem any different from
CPUs.

Alex
Tejun Heo May 7, 2021, 11:45 p.m. UTC | #28
Hello,

On Fri, May 07, 2021 at 06:30:56PM -0400, Alex Deucher wrote:
> Maybe we are speaking past each other.  I'm not following.  We got
> here because a device specific cgroup didn't make sense.  With my
> Linux user hat on, that makes sense.  I don't want to write code to a
> bunch of device specific interfaces if I can avoid it.  But as for
> temporal vs spatial partitioning of the GPU, the argument seems to be
> a sort of hand-wavy one that both spatial and temporal partitioning
> make sense on CPUs, but only temporal partitioning makes sense on
> GPUs.  I'm trying to understand that assertion.  There are some GPUs

Spatial partitioning as implemented in cpuset isn't a desirable model. It's
there partly because it has historically been there. It doesn't really
require dynamic hierarchical distribution of anything and is more of a way
to batch-update per-task configuration, which is how it's actually
implemented. It's broken too in that it interferes with per-task affinity
settings. So, not exactly a good example to follow. In addition, this sort
of partitioning requires more hardware knowledge, and GPUs are worse than
CPUs in that the hardware differs more.

Features like this are trivial to implement from the userland side by making
per-process settings inheritable and restricting who can update the
settings.

> that can more easily be temporally partitioned and some that can be
> more easily spatially partitioned.  It doesn't seem any different than
> CPUs.

Right, it doesn't really matter how the resource is distributed. What
matters is how granular and generic the distribution can be. If gpus can
implement work-conserving proportional distribution, that's something which
is widely useful and inherently requires dynamic scheduling from the kernel
side. If it's about setting per-vendor affinities, this is way too much
cgroup interface for a feature which can be easily implemented outside
cgroup. Just do it per-process (or per whatever handle gpus use) and confine
those configurations from the cgroup side in whatever way works.

While the specific theme changes a bit, we're basically having the same
discussion with the same conclusion over the past however many months.
Hopefully, the point is clear by now.

Thanks.
Alex Deucher May 11, 2021, 3:48 p.m. UTC | #29
On Fri, May 7, 2021 at 7:45 PM Tejun Heo <tj@kernel.org> wrote:
>
> Hello,
>
> On Fri, May 07, 2021 at 06:30:56PM -0400, Alex Deucher wrote:
> > Maybe we are speaking past each other.  I'm not following.  We got
> > here because a device specific cgroup didn't make sense.  With my
> > Linux user hat on, that makes sense.  I don't want to write code to a
> > bunch of device specific interfaces if I can avoid it.  But as for
> > temporal vs spatial partitioning of the GPU, the argument seems to be
> > a sort of hand-wavy one that both spatial and temporal partitioning
> > make sense on CPUs, but only temporal partitioning makes sense on
> > GPUs.  I'm trying to understand that assertion.  There are some GPUs
>
> Spatial partitioning as implemented in cpuset isn't a desirable model. It's
> there partly because it has historically been there. It doesn't really
> require dynamic hierarchical distribution of anything and is more of a way
> to batch-update per-task configuration, which is how it's actually
> implemented. It's broken too in that it interferes with per-task affinity
> settings. So, not exactly a good example to follow. In addition, this sort
> of partitioning requires more hardware knowledge and GPUs are worse than
> CPUs in that hardwares differ more.
>
> Features like this are trivial to implement from userland side by making
> per-process settings inheritable and restricting who can update the
> settings.
>
> > that can more easily be temporally partitioned and some that can be
> > more easily spatially partitioned.  It doesn't seem any different than
> > CPUs.
>
> Right, it doesn't really matter how the resource is distributed. What
> matters is how granular and generic the distribution can be. If gpus can
> implement work-conserving proportional distribution, that's something which
> is widely useful and inherently requires dynamic scheduling from kernel
> side. If it's about setting per-vendor affinities, this is way too much
> cgroup interface for a feature which can be easily implemented outside
> cgroup. Just do per-process (or whatever handles gpus use) and confine their
> configurations from cgroup side however way.
>
> While the specific theme changes a bit, we're basically having the same
> discussion with the same conclusion over the past however many months.
> Hopefully, the point is clear by now.

Thanks, that helps a lot.

Alex
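
[A minimal, hypothetical sketch of a filter program for the new type,
written against the bpf_cgroup_ioctl_ctx layout added in the patch
below.  The "cgroup/ioctl" section name and loader support for it are
assumptions, and the policy shown (blocking driver-private DRM ioctls)
is only an illustration.  As with the device programs, returning 1
allows the ioctl and 0 blocks it.]

// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>          /* needs the uapi additions from this patch */
#include <bpf/bpf_helpers.h>

#define DRM_IOCTL_TYPE   'd'    /* DRM_IOCTL_BASE */
#define DRM_COMMAND_BASE 0x40   /* driver-private ioctls start here */

SEC("cgroup/ioctl")             /* assumed section name */
int filter_ioctl(struct bpf_cgroup_ioctl_ctx *ctx)
{
        __u32 nr = ctx->cmd & 0xff;             /* _IOC_NR() */
        __u32 type = (ctx->cmd >> 8) & 0xff;    /* _IOC_TYPE() */

        /* Example policy: block driver-private DRM ioctls, allow the rest. */
        if (type == DRM_IOCTL_TYPE && nr >= DRM_COMMAND_BASE)
                return 0;       /* block */
        return 1;               /* allow */
}

char _license[] SEC("license") = "GPL";
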
diff mbox series

Patch

diff --git a/fs/ioctl.c b/fs/ioctl.c
index 4e6cc0a7d69c..a3925486d417 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -19,6 +19,7 @@ 
 #include <linux/falloc.h>
 #include <linux/sched/signal.h>
 #include <linux/fiemap.h>
+#include <linux/cgroup.h>
 
 #include "internal.h"
 
@@ -45,6 +46,10 @@  long vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 	if (!filp->f_op->unlocked_ioctl)
 		goto out;
 
+	error = BPF_CGROUP_RUN_PROG_IOCTL(filp, cmd, arg);
+	if (error)
+		goto out;
+
 	error = filp->f_op->unlocked_ioctl(filp, cmd, arg);
 	if (error == -ENOIOCTLCMD)
 		error = -ENOTTY;
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 64f367044e25..a5f0b0a8f82b 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -134,6 +134,9 @@  int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
 int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
 				      short access, enum bpf_attach_type type);
 
+int __cgroup_bpf_check_ioctl_permission(struct file *filp, unsigned int cmd, unsigned long arg,
+				        enum bpf_attach_type type);
+
 int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
 				   struct ctl_table *table, int write,
 				   void **buf, size_t *pcount, loff_t *ppos,
@@ -346,6 +349,16 @@  int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
 	__ret;								       \
 })
 
+#define BPF_CGROUP_RUN_PROG_IOCTL(filp, cmd, arg)       	              \
+({									      \
+	int __ret = 0;							      \
+	if (cgroup_bpf_enabled)						      \
+		__ret = __cgroup_bpf_check_ioctl_permission(filp, cmd, arg,   \
+							    BPF_CGROUP_IOCTL);\
+									      \
+	__ret;								      \
+})
+
 int cgroup_bpf_prog_attach(const union bpf_attr *attr,
 			   enum bpf_prog_type ptype, struct bpf_prog *prog);
 int cgroup_bpf_prog_detach(const union bpf_attr *attr,
@@ -429,6 +442,7 @@  static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
 				       optlen, max_optlen, retval) ({ retval; })
 #define BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock, level, optname, optval, optlen, \
 				       kernel_optval) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_IOCTL(filp, cmd, arg) ({ 0; })
 
 #define for_each_cgroup_storage_type(stype) for (; false; )
 
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index a52a5688418e..3055e7e4918c 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -56,6 +56,8 @@  BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SYSCTL, cg_sysctl,
 	      struct bpf_sysctl, struct bpf_sysctl_kern)
 BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCKOPT, cg_sockopt,
 	      struct bpf_sockopt, struct bpf_sockopt_kern)
+BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_IOCTL, cg_ioctl,
+	      struct bpf_cgroup_ioctl_ctx, struct bpf_cgroup_ioctl_ctx)
 #endif
 #ifdef CONFIG_BPF_LIRC_MODE2
 BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2,
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index b6238b2209b7..6a908e13d3a3 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -197,6 +197,7 @@  enum bpf_prog_type {
 	BPF_PROG_TYPE_EXT,
 	BPF_PROG_TYPE_LSM,
 	BPF_PROG_TYPE_SK_LOOKUP,
+	BPF_PROG_TYPE_CGROUP_IOCTL,
 };
 
 enum bpf_attach_type {
@@ -238,6 +239,7 @@  enum bpf_attach_type {
 	BPF_XDP_CPUMAP,
 	BPF_SK_LOOKUP,
 	BPF_XDP,
+	BPF_CGROUP_IOCTL,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -4276,6 +4278,12 @@  struct bpf_cgroup_dev_ctx {
 	__u32 minor;
 };
 
+struct bpf_cgroup_ioctl_ctx {
+	__u64 filp;
+	__u32 cmd;
+	__u32 arg;
+};
+
 struct bpf_raw_tracepoint_args {
 	__u64 args[0];
 };
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 83ff127ef7ae..0958bae3b0b7 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1203,6 +1203,72 @@  const struct bpf_verifier_ops cg_dev_verifier_ops = {
 	.is_valid_access	= cgroup_dev_is_valid_access,
 };
 
+int __cgroup_bpf_check_ioctl_permission(struct file *filp, unsigned int cmd, unsigned long arg,
+				      enum bpf_attach_type type)
+{
+	struct cgroup *cgrp;
+	struct bpf_cgroup_ioctl_ctx ctx = {
+		.filp = (u64)(unsigned long)filp,
+		.cmd = cmd,
+		.arg = arg,
+	};
+	int allow = 1;
+
+	rcu_read_lock();
+	cgrp = task_dfl_cgroup(current);
+	allow = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx,
+				   BPF_PROG_RUN);
+	rcu_read_unlock();
+
+	return !allow;
+}
+
+static const struct bpf_func_proto *
+cgroup_ioctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	return cgroup_base_func_proto(func_id, prog);
+}
+
+static bool cgroup_ioctl_is_valid_access(int off, int size,
+				       enum bpf_access_type type,
+				       const struct bpf_prog *prog,
+				       struct bpf_insn_access_aux *info)
+{
+	const int size_default = sizeof(__u32);
+
+	if (type == BPF_WRITE)
+		return false;
+
+	if (off < 0 || off + size > sizeof(struct bpf_cgroup_ioctl_ctx))
+		return false;
+	/* The verifier guarantees that size > 0. */
+	if (off % size != 0)
+		return false;
+
+	switch (off) {
+	case bpf_ctx_range(struct bpf_cgroup_ioctl_ctx, filp):
+		bpf_ctx_record_field_size(info, size_default);
+		if (!bpf_ctx_narrow_access_ok(off, size, size_default))
+			return false;
+		break;
+	case bpf_ctx_range(struct bpf_cgroup_ioctl_ctx, cmd):
+	case bpf_ctx_range(struct bpf_cgroup_ioctl_ctx, arg):
+	default:
+		if (size != size_default)
+			return false;
+	}
+
+	return true;
+}
+
+const struct bpf_prog_ops cg_ioctl_prog_ops = {
+};
+
+const struct bpf_verifier_ops cg_ioctl_verifier_ops = {
+	.get_func_proto		= cgroup_ioctl_func_proto,
+	.is_valid_access	= cgroup_ioctl_is_valid_access,
+};
+
 /**
  * __cgroup_bpf_run_filter_sysctl - Run a program on sysctl
  *
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 86299a292214..6984a62c96f4 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2054,6 +2054,7 @@  static bool is_net_admin_prog_type(enum bpf_prog_type prog_type)
 	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
 	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
 	case BPF_PROG_TYPE_CGROUP_SYSCTL:
+	case BPF_PROG_TYPE_CGROUP_IOCTL:
 	case BPF_PROG_TYPE_SOCK_OPS:
 	case BPF_PROG_TYPE_EXT: /* extends any prog */
 		return true;
@@ -2806,6 +2807,8 @@  attach_type_to_prog_type(enum bpf_attach_type attach_type)
 		return BPF_PROG_TYPE_SOCK_OPS;
 	case BPF_CGROUP_DEVICE:
 		return BPF_PROG_TYPE_CGROUP_DEVICE;
+	case BPF_CGROUP_IOCTL:
+		return BPF_PROG_TYPE_CGROUP_IOCTL;
 	case BPF_SK_MSG_VERDICT:
 		return BPF_PROG_TYPE_SK_MSG;
 	case BPF_SK_SKB_STREAM_PARSER:
@@ -2878,6 +2881,7 @@  static int bpf_prog_attach(const union bpf_attr *attr)
 	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
 	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
 	case BPF_PROG_TYPE_CGROUP_SYSCTL:
+	case BPF_PROG_TYPE_CGROUP_IOCTL:
 	case BPF_PROG_TYPE_SOCK_OPS:
 		ret = cgroup_bpf_prog_attach(attr, ptype, prog);
 		break;
@@ -2915,6 +2919,7 @@  static int bpf_prog_detach(const union bpf_attr *attr)
 	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
 	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
 	case BPF_PROG_TYPE_CGROUP_SYSCTL:
+	case BPF_PROG_TYPE_CGROUP_IOCTL:
 	case BPF_PROG_TYPE_SOCK_OPS:
 		return cgroup_bpf_prog_detach(attr, ptype);
 	default:
@@ -2958,6 +2963,7 @@  static int bpf_prog_query(const union bpf_attr *attr,
 	case BPF_CGROUP_SYSCTL:
 	case BPF_CGROUP_GETSOCKOPT:
 	case BPF_CGROUP_SETSOCKOPT:
+	case BPF_CGROUP_IOCTL:
 		return cgroup_bpf_prog_query(attr, uattr);
 	case BPF_LIRC_MODE2:
 		return lirc_prog_query(attr, uattr);
@@ -3914,6 +3920,7 @@  static int link_create(union bpf_attr *attr)
 	case BPF_PROG_TYPE_CGROUP_DEVICE:
 	case BPF_PROG_TYPE_CGROUP_SYSCTL:
 	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+	case BPF_PROG_TYPE_CGROUP_IOCTL:
 		ret = cgroup_bpf_link_attach(attr, prog);
 		break;
 	case BPF_PROG_TYPE_TRACING:
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index ef938f17b944..af68f463e828 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -7419,6 +7419,7 @@  static int check_return_code(struct bpf_verifier_env *env)
 	case BPF_PROG_TYPE_CGROUP_DEVICE:
 	case BPF_PROG_TYPE_CGROUP_SYSCTL:
 	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+	case BPF_PROG_TYPE_CGROUP_IOCTL:
 		break;
 	case BPF_PROG_TYPE_RAW_TRACEPOINT:
 		if (!env->prog->aux->attach_btf_id)