diff mbox series

[bpf-next,v2,4/8] bpf: Introduce cgroup iter

Message ID 20220610194435.2268290-5-yosryahmed@google.com (mailing list archive)
State Changes Requested
Delegated to: BPF
Headers show
Series bpf: rstat: cgroup hierarchical stats | expand

Checks

Context Check Description
bpf/vmtest-bpf-next-PR fail PR summary
bpf/vmtest-bpf-next-VM_Test-1 fail Logs for Kernel LATEST on ubuntu-latest with gcc
bpf/vmtest-bpf-next-VM_Test-2 fail Logs for Kernel LATEST on ubuntu-latest with llvm-15
bpf/vmtest-bpf-next-VM_Test-3 fail Logs for Kernel LATEST on z15 with gcc
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 1783 this patch: 1783
netdev/cc_maintainers success CCed 10 of 10 maintainers
netdev/build_clang success Errors and warnings before: 196 this patch: 196
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 1793 this patch: 1793
netdev/checkpatch warning WARNING: Possible unnecessary 'out of memory' message WARNING: added, moved or deleted file(s), does MAINTAINERS need updating? WARNING: line length of 83 exceeds 80 columns WARNING: line length of 89 exceeds 80 columns WARNING: line length of 94 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Yosry Ahmed June 10, 2022, 7:44 p.m. UTC
From: Hao Luo <haoluo@google.com>

Cgroup_iter is a type of bpf_iter. It walks over cgroups in two modes:

 - walking a cgroup's descendants.
 - walking a cgroup's ancestors.

When attaching cgroup_iter, one can set a cgroup to the iter_link
created from attaching. This cgroup is passed as a file descriptor and
serves as the starting point of the walk. If no cgroup is specified,
the starting point will be the root cgroup.

For walking descendants, one can specify the order: either pre-order or
post-order. For walking ancestors, the walk starts at the specified
cgroup and ends at the root.

One can also terminate the walk early by returning 1 from the iter
program.

Note that because walking cgroup hierarchy holds cgroup_mutex, the iter
program is called with cgroup_mutex held.

Signed-off-by: Hao Luo <haoluo@google.com>
Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
---
 include/linux/bpf.h            |   8 ++
 include/uapi/linux/bpf.h       |  21 +++
 kernel/bpf/Makefile            |   2 +-
 kernel/bpf/cgroup_iter.c       | 235 +++++++++++++++++++++++++++++++++
 tools/include/uapi/linux/bpf.h |  21 +++
 5 files changed, 286 insertions(+), 1 deletion(-)
 create mode 100644 kernel/bpf/cgroup_iter.c

Comments

kernel test robot June 11, 2022, 6:23 a.m. UTC | #1
Hi Yosry,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on bpf-next/master]

url:    https://github.com/intel-lab-lkp/linux/commits/Yosry-Ahmed/bpf-rstat-cgroup-hierarchical-stats/20220611-034720
base:   https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
config: m68k-defconfig (https://download.01.org/0day-ci/archive/20220611/202206111453.xWWh2wMK-lkp@intel.com/config)
compiler: m68k-linux-gcc (GCC) 11.3.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/619857fd1ec4f351376ffcaaec20acc9aae9486f
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Yosry-Ahmed/bpf-rstat-cgroup-hierarchical-stats/20220611-034720
        git checkout 619857fd1ec4f351376ffcaaec20acc9aae9486f
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.3.0 make.cross W=1 O=build_dir ARCH=m68k SHELL=/bin/bash kernel/bpf/ kernel/cgroup/

If you fix the issue, kindly add following tag where applicable
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

   In file included from kernel/bpf/cgroup_iter.c:9:
   kernel/bpf/../cgroup/cgroup-internal.h:78:41: error: field 'iter' has incomplete type
      78 |                 struct css_task_iter    iter;
         |                                         ^~~~
   kernel/bpf/../cgroup/cgroup-internal.h: In function 'cgroup_is_dead':
   kernel/bpf/../cgroup/cgroup-internal.h:188:22: error: invalid use of undefined type 'const struct cgroup'
     188 |         return !(cgrp->self.flags & CSS_ONLINE);
         |                      ^~
   kernel/bpf/../cgroup/cgroup-internal.h:188:37: error: 'CSS_ONLINE' undeclared (first use in this function); did you mean 'N_ONLINE'?
     188 |         return !(cgrp->self.flags & CSS_ONLINE);
         |                                     ^~~~~~~~~~
         |                                     N_ONLINE
   kernel/bpf/../cgroup/cgroup-internal.h:188:37: note: each undeclared identifier is reported only once for each function it appears in
   kernel/bpf/../cgroup/cgroup-internal.h: In function 'notify_on_release':
   kernel/bpf/../cgroup/cgroup-internal.h:193:25: error: 'CGRP_NOTIFY_ON_RELEASE' undeclared (first use in this function)
     193 |         return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
         |                         ^~~~~~~~~~~~~~~~~~~~~~
   kernel/bpf/../cgroup/cgroup-internal.h:193:54: error: invalid use of undefined type 'const struct cgroup'
     193 |         return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
         |                                                      ^~
   kernel/bpf/../cgroup/cgroup-internal.h: In function 'put_css_set':
   kernel/bpf/../cgroup/cgroup-internal.h:207:39: error: invalid use of undefined type 'struct css_set'
     207 |         if (refcount_dec_not_one(&cset->refcount))
         |                                       ^~
   kernel/bpf/../cgroup/cgroup-internal.h: In function 'get_css_set':
   kernel/bpf/../cgroup/cgroup-internal.h:220:27: error: invalid use of undefined type 'struct css_set'
     220 |         refcount_inc(&cset->refcount);
         |                           ^~
   kernel/bpf/../cgroup/cgroup-internal.h: At top level:
   kernel/bpf/../cgroup/cgroup-internal.h:284:22: error: array type has incomplete element type 'struct cftype'
     284 | extern struct cftype cgroup1_base_files[];
         |                      ^~~~~~~~~~~~~~~~~~
   kernel/bpf/cgroup_iter.c: In function 'cgroup_iter_seq_start':
   kernel/bpf/cgroup_iter.c:55:24: error: implicit declaration of function 'css_next_descendant_pre' [-Werror=implicit-function-declaration]
      55 |                 return css_next_descendant_pre(NULL, p->start_css);
         |                        ^~~~~~~~~~~~~~~~~~~~~~~
>> kernel/bpf/cgroup_iter.c:55:24: warning: returning 'int' from a function with return type 'void *' makes pointer from integer without a cast [-Wint-conversion]
      55 |                 return css_next_descendant_pre(NULL, p->start_css);
         |                        ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   kernel/bpf/cgroup_iter.c:57:24: error: implicit declaration of function 'css_next_descendant_post' [-Werror=implicit-function-declaration]
      57 |                 return css_next_descendant_post(NULL, p->start_css);
         |                        ^~~~~~~~~~~~~~~~~~~~~~~~
   kernel/bpf/cgroup_iter.c:57:24: warning: returning 'int' from a function with return type 'void *' makes pointer from integer without a cast [-Wint-conversion]
      57 |                 return css_next_descendant_post(NULL, p->start_css);
         |                        ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   kernel/bpf/cgroup_iter.c: In function 'cgroup_iter_seq_next':
   kernel/bpf/cgroup_iter.c:83:24: warning: returning 'int' from a function with return type 'void *' makes pointer from integer without a cast [-Wint-conversion]
      83 |                 return css_next_descendant_pre(curr, p->start_css);
         |                        ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   kernel/bpf/cgroup_iter.c:85:24: warning: returning 'int' from a function with return type 'void *' makes pointer from integer without a cast [-Wint-conversion]
      85 |                 return css_next_descendant_post(curr, p->start_css);
         |                        ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   kernel/bpf/cgroup_iter.c:87:28: error: invalid use of undefined type 'struct cgroup_subsys_state'
      87 |                 return curr->parent;
         |                            ^~
   kernel/bpf/cgroup_iter.c: In function '__cgroup_iter_seq_show':
   kernel/bpf/cgroup_iter.c:100:38: error: invalid use of undefined type 'struct cgroup_subsys_state'
     100 |         if (css && cgroup_is_dead(css->cgroup))
         |                                      ^~
   kernel/bpf/cgroup_iter.c:104:31: error: invalid use of undefined type 'struct cgroup_subsys_state'
     104 |         ctx.cgroup = css ? css->cgroup : NULL;
         |                               ^~
   kernel/bpf/cgroup_iter.c: In function 'cgroup_iter_seq_init':
   kernel/bpf/cgroup_iter.c:137:29: error: invalid use of undefined type 'struct cgroup'
     137 |         p->start_css = &cgrp->self;
         |                             ^~
   kernel/bpf/cgroup_iter.c: In function 'bpf_iter_attach_cgroup':
   kernel/bpf/cgroup_iter.c:157:24: error: implicit declaration of function 'cgroup_get_from_fd'; did you mean 'cgroup_get_from_id'? [-Werror=implicit-function-declaration]
     157 |                 cgrp = cgroup_get_from_fd(fd);
         |                        ^~~~~~~~~~~~~~~~~~
         |                        cgroup_get_from_id
>> kernel/bpf/cgroup_iter.c:157:22: warning: assignment to 'struct cgroup *' from 'int' makes pointer from integer without a cast [-Wint-conversion]
     157 |                 cgrp = cgroup_get_from_fd(fd);
         |                      ^
   kernel/bpf/cgroup_iter.c:159:24: error: implicit declaration of function 'cgroup_get_from_path'; did you mean 'cgroup_get_from_id'? [-Werror=implicit-function-declaration]
     159 |                 cgrp = cgroup_get_from_path("/");
         |                        ^~~~~~~~~~~~~~~~~~~~
         |                        cgroup_get_from_id
   kernel/bpf/cgroup_iter.c:159:22: warning: assignment to 'struct cgroup *' from 'int' makes pointer from integer without a cast [-Wint-conversion]
     159 |                 cgrp = cgroup_get_from_path("/");
         |                      ^
   kernel/bpf/cgroup_iter.c: In function 'bpf_iter_cgroup_show_fdinfo':
   kernel/bpf/cgroup_iter.c:190:9: error: implicit declaration of function 'cgroup_path_ns'; did you mean 'cgroup_parent'? [-Werror=implicit-function-declaration]
     190 |         cgroup_path_ns(aux->cgroup.start, buf, PATH_MAX,
         |         ^~~~~~~~~~~~~~
         |         cgroup_parent
   kernel/bpf/cgroup_iter.c: In function 'cgroup_iter_seq_next':
   kernel/bpf/cgroup_iter.c:88:1: error: control reaches end of non-void function [-Werror=return-type]
      88 | }
         | ^
   cc1: some warnings being treated as errors


vim +55 kernel/bpf/cgroup_iter.c

    41	
    42	static void *cgroup_iter_seq_start(struct seq_file *seq, loff_t *pos)
    43	{
    44		struct cgroup_iter_priv *p = seq->private;
    45	
    46		mutex_lock(&cgroup_mutex);
    47	
    48		/* support only one session */
    49		if (*pos > 0)
    50			return NULL;
    51	
    52		++*pos;
    53		p->terminate = false;
    54		if (p->order == BPF_ITER_CGROUP_PRE)
  > 55			return css_next_descendant_pre(NULL, p->start_css);
    56		else if (p->order == BPF_ITER_CGROUP_POST)
    57			return css_next_descendant_post(NULL, p->start_css);
    58		else /* BPF_ITER_CGROUP_PARENT_UP */
    59			return p->start_css;
    60	}
    61	
    62	static int __cgroup_iter_seq_show(struct seq_file *seq,
    63					  struct cgroup_subsys_state *css, int in_stop);
    64	
    65	static void cgroup_iter_seq_stop(struct seq_file *seq, void *v)
    66	{
    67		/* pass NULL to the prog for post-processing */
    68		if (!v)
    69			__cgroup_iter_seq_show(seq, NULL, true);
    70		mutex_unlock(&cgroup_mutex);
    71	}
    72	
    73	static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
    74	{
    75		struct cgroup_subsys_state *curr = (struct cgroup_subsys_state *)v;
    76		struct cgroup_iter_priv *p = seq->private;
    77	
    78		++*pos;
    79		if (p->terminate)
    80			return NULL;
    81	
    82		if (p->order == BPF_ITER_CGROUP_PRE)
    83			return css_next_descendant_pre(curr, p->start_css);
    84		else if (p->order == BPF_ITER_CGROUP_POST)
    85			return css_next_descendant_post(curr, p->start_css);
    86		else
    87			return curr->parent;
    88	}
    89	
    90	static int __cgroup_iter_seq_show(struct seq_file *seq,
    91					  struct cgroup_subsys_state *css, int in_stop)
    92	{
    93		struct cgroup_iter_priv *p = seq->private;
    94		struct bpf_iter__cgroup ctx;
    95		struct bpf_iter_meta meta;
    96		struct bpf_prog *prog;
    97		int ret = 0;
    98	
    99		/* cgroup is dead, skip this element */
   100		if (css && cgroup_is_dead(css->cgroup))
   101			return 0;
   102	
   103		ctx.meta = &meta;
   104		ctx.cgroup = css ? css->cgroup : NULL;
   105		meta.seq = seq;
   106		prog = bpf_iter_get_info(&meta, in_stop);
   107		if (prog)
   108			ret = bpf_iter_run_prog(prog, &ctx);
   109	
   110		/* if prog returns > 0, terminate after this element. */
   111		if (ret != 0)
   112			p->terminate = true;
   113	
   114		return 0;
   115	}
   116	
   117	static int cgroup_iter_seq_show(struct seq_file *seq, void *v)
   118	{
   119		return __cgroup_iter_seq_show(seq, (struct cgroup_subsys_state *)v,
   120					      false);
   121	}
   122	
   123	static const struct seq_operations cgroup_iter_seq_ops = {
   124		.start  = cgroup_iter_seq_start,
   125		.next   = cgroup_iter_seq_next,
   126		.stop   = cgroup_iter_seq_stop,
   127		.show   = cgroup_iter_seq_show,
   128	};
   129	
   130	BTF_ID_LIST_SINGLE(bpf_cgroup_btf_id, struct, cgroup)
   131	
   132	static int cgroup_iter_seq_init(void *priv, struct bpf_iter_aux_info *aux)
   133	{
   134		struct cgroup_iter_priv *p = (struct cgroup_iter_priv *)priv;
   135		struct cgroup *cgrp = aux->cgroup.start;
   136	
   137		p->start_css = &cgrp->self;
   138		p->terminate = false;
   139		p->order = aux->cgroup.order;
   140		return 0;
   141	}
   142	
   143	static const struct bpf_iter_seq_info cgroup_iter_seq_info = {
   144		.seq_ops                = &cgroup_iter_seq_ops,
   145		.init_seq_private       = cgroup_iter_seq_init,
   146		.seq_priv_size          = sizeof(struct cgroup_iter_priv),
   147	};
   148	
   149	static int bpf_iter_attach_cgroup(struct bpf_prog *prog,
   150					  union bpf_iter_link_info *linfo,
   151					  struct bpf_iter_aux_info *aux)
   152	{
   153		int fd = linfo->cgroup.cgroup_fd;
   154		struct cgroup *cgrp;
   155	
   156		if (fd)
 > 157			cgrp = cgroup_get_from_fd(fd);
   158		else /* walk the entire hierarchy by default. */
   159			cgrp = cgroup_get_from_path("/");
   160	
   161		if (IS_ERR(cgrp))
   162			return PTR_ERR(cgrp);
   163	
   164		aux->cgroup.start = cgrp;
   165		aux->cgroup.order = linfo->cgroup.traversal_order;
   166		return 0;
   167	}
   168
kernel test robot June 11, 2022, 7:34 a.m. UTC | #2
Hi Yosry,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on bpf-next/master]

url:    https://github.com/intel-lab-lkp/linux/commits/Yosry-Ahmed/bpf-rstat-cgroup-hierarchical-stats/20220611-034720
base:   https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
config: m68k-defconfig (https://download.01.org/0day-ci/archive/20220611/202206111529.2okIVRo9-lkp@intel.com/config)
compiler: m68k-linux-gcc (GCC) 11.3.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/619857fd1ec4f351376ffcaaec20acc9aae9486f
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Yosry-Ahmed/bpf-rstat-cgroup-hierarchical-stats/20220611-034720
        git checkout 619857fd1ec4f351376ffcaaec20acc9aae9486f
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.3.0 make.cross W=1 O=build_dir ARCH=m68k SHELL=/bin/bash

If you fix the issue, kindly add following tag where applicable
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   In file included from kernel/bpf/cgroup_iter.c:9:
>> kernel/bpf/../cgroup/cgroup-internal.h:78:41: error: field 'iter' has incomplete type
      78 |                 struct css_task_iter    iter;
         |                                         ^~~~
   kernel/bpf/../cgroup/cgroup-internal.h: In function 'cgroup_is_dead':
   kernel/bpf/../cgroup/cgroup-internal.h:188:22: error: invalid use of undefined type 'const struct cgroup'
     188 |         return !(cgrp->self.flags & CSS_ONLINE);
         |                      ^~
   kernel/bpf/../cgroup/cgroup-internal.h:188:37: error: 'CSS_ONLINE' undeclared (first use in this function); did you mean 'N_ONLINE'?
     188 |         return !(cgrp->self.flags & CSS_ONLINE);
         |                                     ^~~~~~~~~~
         |                                     N_ONLINE
   kernel/bpf/../cgroup/cgroup-internal.h:188:37: note: each undeclared identifier is reported only once for each function it appears in
   kernel/bpf/../cgroup/cgroup-internal.h: In function 'notify_on_release':
   kernel/bpf/../cgroup/cgroup-internal.h:193:25: error: 'CGRP_NOTIFY_ON_RELEASE' undeclared (first use in this function)
     193 |         return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
         |                         ^~~~~~~~~~~~~~~~~~~~~~
   kernel/bpf/../cgroup/cgroup-internal.h:193:54: error: invalid use of undefined type 'const struct cgroup'
     193 |         return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
         |                                                      ^~
   kernel/bpf/../cgroup/cgroup-internal.h: In function 'put_css_set':
   kernel/bpf/../cgroup/cgroup-internal.h:207:39: error: invalid use of undefined type 'struct css_set'
     207 |         if (refcount_dec_not_one(&cset->refcount))
         |                                       ^~
   kernel/bpf/../cgroup/cgroup-internal.h: In function 'get_css_set':
   kernel/bpf/../cgroup/cgroup-internal.h:220:27: error: invalid use of undefined type 'struct css_set'
     220 |         refcount_inc(&cset->refcount);
         |                           ^~
   kernel/bpf/../cgroup/cgroup-internal.h: At top level:
   kernel/bpf/../cgroup/cgroup-internal.h:284:22: error: array type has incomplete element type 'struct cftype'
     284 | extern struct cftype cgroup1_base_files[];
         |                      ^~~~~~~~~~~~~~~~~~
   kernel/bpf/cgroup_iter.c: In function 'cgroup_iter_seq_start':
>> kernel/bpf/cgroup_iter.c:55:24: error: implicit declaration of function 'css_next_descendant_pre' [-Werror=implicit-function-declaration]
      55 |                 return css_next_descendant_pre(NULL, p->start_css);
         |                        ^~~~~~~~~~~~~~~~~~~~~~~
   kernel/bpf/cgroup_iter.c:55:24: warning: returning 'int' from a function with return type 'void *' makes pointer from integer without a cast [-Wint-conversion]
      55 |                 return css_next_descendant_pre(NULL, p->start_css);
         |                        ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
>> kernel/bpf/cgroup_iter.c:57:24: error: implicit declaration of function 'css_next_descendant_post' [-Werror=implicit-function-declaration]
      57 |                 return css_next_descendant_post(NULL, p->start_css);
         |                        ^~~~~~~~~~~~~~~~~~~~~~~~
   kernel/bpf/cgroup_iter.c:57:24: warning: returning 'int' from a function with return type 'void *' makes pointer from integer without a cast [-Wint-conversion]
      57 |                 return css_next_descendant_post(NULL, p->start_css);
         |                        ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   kernel/bpf/cgroup_iter.c: In function 'cgroup_iter_seq_next':
   kernel/bpf/cgroup_iter.c:83:24: warning: returning 'int' from a function with return type 'void *' makes pointer from integer without a cast [-Wint-conversion]
      83 |                 return css_next_descendant_pre(curr, p->start_css);
         |                        ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   kernel/bpf/cgroup_iter.c:85:24: warning: returning 'int' from a function with return type 'void *' makes pointer from integer without a cast [-Wint-conversion]
      85 |                 return css_next_descendant_post(curr, p->start_css);
         |                        ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
>> kernel/bpf/cgroup_iter.c:87:28: error: invalid use of undefined type 'struct cgroup_subsys_state'
      87 |                 return curr->parent;
         |                            ^~
   kernel/bpf/cgroup_iter.c: In function '__cgroup_iter_seq_show':
   kernel/bpf/cgroup_iter.c:100:38: error: invalid use of undefined type 'struct cgroup_subsys_state'
     100 |         if (css && cgroup_is_dead(css->cgroup))
         |                                      ^~
   kernel/bpf/cgroup_iter.c:104:31: error: invalid use of undefined type 'struct cgroup_subsys_state'
     104 |         ctx.cgroup = css ? css->cgroup : NULL;
         |                               ^~
   kernel/bpf/cgroup_iter.c: In function 'cgroup_iter_seq_init':
>> kernel/bpf/cgroup_iter.c:137:29: error: invalid use of undefined type 'struct cgroup'
     137 |         p->start_css = &cgrp->self;
         |                             ^~
   kernel/bpf/cgroup_iter.c: In function 'bpf_iter_attach_cgroup':
>> kernel/bpf/cgroup_iter.c:157:24: error: implicit declaration of function 'cgroup_get_from_fd'; did you mean 'cgroup_get_from_id'? [-Werror=implicit-function-declaration]
     157 |                 cgrp = cgroup_get_from_fd(fd);
         |                        ^~~~~~~~~~~~~~~~~~
         |                        cgroup_get_from_id
   kernel/bpf/cgroup_iter.c:157:22: warning: assignment to 'struct cgroup *' from 'int' makes pointer from integer without a cast [-Wint-conversion]
     157 |                 cgrp = cgroup_get_from_fd(fd);
         |                      ^
>> kernel/bpf/cgroup_iter.c:159:24: error: implicit declaration of function 'cgroup_get_from_path'; did you mean 'cgroup_get_from_id'? [-Werror=implicit-function-declaration]
     159 |                 cgrp = cgroup_get_from_path("/");
         |                        ^~~~~~~~~~~~~~~~~~~~
         |                        cgroup_get_from_id
   kernel/bpf/cgroup_iter.c:159:22: warning: assignment to 'struct cgroup *' from 'int' makes pointer from integer without a cast [-Wint-conversion]
     159 |                 cgrp = cgroup_get_from_path("/");
         |                      ^
   kernel/bpf/cgroup_iter.c: In function 'bpf_iter_cgroup_show_fdinfo':
>> kernel/bpf/cgroup_iter.c:190:9: error: implicit declaration of function 'cgroup_path_ns'; did you mean 'cgroup_parent'? [-Werror=implicit-function-declaration]
     190 |         cgroup_path_ns(aux->cgroup.start, buf, PATH_MAX,
         |         ^~~~~~~~~~~~~~
         |         cgroup_parent
   kernel/bpf/cgroup_iter.c: In function 'cgroup_iter_seq_next':
   kernel/bpf/cgroup_iter.c:88:1: error: control reaches end of non-void function [-Werror=return-type]
      88 | }
         | ^
   cc1: some warnings being treated as errors


vim +/iter +78 kernel/bpf/../cgroup/cgroup-internal.h

0d2b5955b36250 Tejun Heo 2022-01-06  68  
0d2b5955b36250 Tejun Heo 2022-01-06  69  struct cgroup_file_ctx {
e57457641613fe Tejun Heo 2022-01-06  70  	struct cgroup_namespace	*ns;
e57457641613fe Tejun Heo 2022-01-06  71  
0d2b5955b36250 Tejun Heo 2022-01-06  72  	struct {
0d2b5955b36250 Tejun Heo 2022-01-06  73  		void			*trigger;
0d2b5955b36250 Tejun Heo 2022-01-06  74  	} psi;
0d2b5955b36250 Tejun Heo 2022-01-06  75  
0d2b5955b36250 Tejun Heo 2022-01-06  76  	struct {
0d2b5955b36250 Tejun Heo 2022-01-06  77  		bool			started;
0d2b5955b36250 Tejun Heo 2022-01-06 @78  		struct css_task_iter	iter;
0d2b5955b36250 Tejun Heo 2022-01-06  79  	} procs;
0d2b5955b36250 Tejun Heo 2022-01-06  80  
0d2b5955b36250 Tejun Heo 2022-01-06  81  	struct {
0d2b5955b36250 Tejun Heo 2022-01-06  82  		struct cgroup_pidlist	*pidlist;
0d2b5955b36250 Tejun Heo 2022-01-06  83  	} procs1;
0d2b5955b36250 Tejun Heo 2022-01-06  84  };
0d2b5955b36250 Tejun Heo 2022-01-06  85
kernel test robot June 11, 2022, 12:44 p.m. UTC | #3
Hi Yosry,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on bpf-next/master]

url:    https://github.com/intel-lab-lkp/linux/commits/Yosry-Ahmed/bpf-rstat-cgroup-hierarchical-stats/20220611-034720
base:   https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
config: s390-randconfig-r035-20220611 (https://download.01.org/0day-ci/archive/20220611/202206112009.sycCJKhv-lkp@intel.com/config)
compiler: clang version 15.0.0 (https://github.com/llvm/llvm-project ff4abe755279a3a47cc416ef80dbc900d9a98a19)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # install s390 cross compiling tool for clang build
        # apt-get install binutils-s390x-linux-gnu
        # https://github.com/intel-lab-lkp/linux/commit/619857fd1ec4f351376ffcaaec20acc9aae9486f
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Yosry-Ahmed/bpf-rstat-cgroup-hierarchical-stats/20220611-034720
        git checkout 619857fd1ec4f351376ffcaaec20acc9aae9486f
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=s390 SHELL=/bin/bash kernel/bpf/ kernel/cgroup/

If you fix the issue, kindly add following tag where applicable
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

   In file included from kernel/bpf/cgroup_iter.c:9:
   kernel/bpf/../cgroup/cgroup-internal.h:78:24: error: field has incomplete type 'struct css_task_iter'
                   struct css_task_iter    iter;
                                           ^
   kernel/bpf/../cgroup/cgroup-internal.h:78:10: note: forward declaration of 'struct css_task_iter'
                   struct css_task_iter    iter;
                          ^
   kernel/bpf/../cgroup/cgroup-internal.h:188:15: error: incomplete definition of type 'struct cgroup'
           return !(cgrp->self.flags & CSS_ONLINE);
                    ~~~~^
   include/linux/sched/task.h:35:9: note: forward declaration of 'struct cgroup'
           struct cgroup *cgrp;
                  ^
   In file included from kernel/bpf/cgroup_iter.c:9:
   kernel/bpf/../cgroup/cgroup-internal.h:188:30: error: use of undeclared identifier 'CSS_ONLINE'; did you mean 'N_ONLINE'?
           return !(cgrp->self.flags & CSS_ONLINE);
                                       ^~~~~~~~~~
                                       N_ONLINE
   include/linux/nodemask.h:392:2: note: 'N_ONLINE' declared here
           N_ONLINE,               /* The node is online */
           ^
   In file included from kernel/bpf/cgroup_iter.c:9:
   kernel/bpf/../cgroup/cgroup-internal.h:193:47: error: incomplete definition of type 'struct cgroup'
           return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
                                                    ~~~~^
   include/linux/sched/task.h:35:9: note: forward declaration of 'struct cgroup'
           struct cgroup *cgrp;
                  ^
   In file included from kernel/bpf/cgroup_iter.c:9:
   kernel/bpf/../cgroup/cgroup-internal.h:193:18: error: use of undeclared identifier 'CGRP_NOTIFY_ON_RELEASE'
           return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
                           ^
   kernel/bpf/../cgroup/cgroup-internal.h:207:32: error: incomplete definition of type 'struct css_set'
           if (refcount_dec_not_one(&cset->refcount))
                                     ~~~~^
   include/linux/sched/task.h:16:8: note: forward declaration of 'struct css_set'
   struct css_set;
          ^
   In file included from kernel/bpf/cgroup_iter.c:9:
   kernel/bpf/../cgroup/cgroup-internal.h:220:20: error: incomplete definition of type 'struct css_set'
           refcount_inc(&cset->refcount);
                         ~~~~^
   include/linux/sched/task.h:16:8: note: forward declaration of 'struct css_set'
   struct css_set;
          ^
   In file included from kernel/bpf/cgroup_iter.c:9:
   kernel/bpf/../cgroup/cgroup-internal.h:284:40: error: array has incomplete element type 'struct cftype'
   extern struct cftype cgroup1_base_files[];
                                          ^
   kernel/bpf/../cgroup/cgroup-internal.h:284:15: note: forward declaration of 'struct cftype'
   extern struct cftype cgroup1_base_files[];
                 ^
   kernel/bpf/cgroup_iter.c:55:10: error: call to undeclared function 'css_next_descendant_pre'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
                   return css_next_descendant_pre(NULL, p->start_css);
                          ^
>> kernel/bpf/cgroup_iter.c:55:10: warning: incompatible integer to pointer conversion returning 'int' from a function with result type 'void *' [-Wint-conversion]
                   return css_next_descendant_pre(NULL, p->start_css);
                          ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   kernel/bpf/cgroup_iter.c:57:10: error: call to undeclared function 'css_next_descendant_post'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
                   return css_next_descendant_post(NULL, p->start_css);
                          ^
   kernel/bpf/cgroup_iter.c:57:10: warning: incompatible integer to pointer conversion returning 'int' from a function with result type 'void *' [-Wint-conversion]
                   return css_next_descendant_post(NULL, p->start_css);
                          ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   kernel/bpf/cgroup_iter.c:83:10: error: call to undeclared function 'css_next_descendant_pre'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
                   return css_next_descendant_pre(curr, p->start_css);
                          ^
   kernel/bpf/cgroup_iter.c:83:10: warning: incompatible integer to pointer conversion returning 'int' from a function with result type 'void *' [-Wint-conversion]
                   return css_next_descendant_pre(curr, p->start_css);
                          ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   kernel/bpf/cgroup_iter.c:85:10: error: call to undeclared function 'css_next_descendant_post'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
                   return css_next_descendant_post(curr, p->start_css);
                          ^
   kernel/bpf/cgroup_iter.c:85:10: warning: incompatible integer to pointer conversion returning 'int' from a function with result type 'void *' [-Wint-conversion]
                   return css_next_descendant_post(curr, p->start_css);
                          ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   kernel/bpf/cgroup_iter.c:87:14: error: incomplete definition of type 'struct cgroup_subsys_state'
                   return curr->parent;
                          ~~~~^
   include/linux/kthread.h:218:8: note: forward declaration of 'struct cgroup_subsys_state'
   struct cgroup_subsys_state;
          ^
   kernel/bpf/cgroup_iter.c:100:31: error: incomplete definition of type 'struct cgroup_subsys_state'
           if (css && cgroup_is_dead(css->cgroup))
                                     ~~~^
   include/linux/kthread.h:218:8: note: forward declaration of 'struct cgroup_subsys_state'
   struct cgroup_subsys_state;
          ^
   kernel/bpf/cgroup_iter.c:104:24: error: incomplete definition of type 'struct cgroup_subsys_state'
           ctx.cgroup = css ? css->cgroup : NULL;
                              ~~~^
   include/linux/kthread.h:218:8: note: forward declaration of 'struct cgroup_subsys_state'
   struct cgroup_subsys_state;
          ^
   kernel/bpf/cgroup_iter.c:137:22: error: incomplete definition of type 'struct cgroup'
           p->start_css = &cgrp->self;
                           ~~~~^
   include/linux/sched/task.h:35:9: note: forward declaration of 'struct cgroup'
           struct cgroup *cgrp;
                  ^
   kernel/bpf/cgroup_iter.c:157:10: error: call to undeclared function 'cgroup_get_from_fd'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
                   cgrp = cgroup_get_from_fd(fd);
                          ^
   kernel/bpf/cgroup_iter.c:157:10: note: did you mean 'cgroup_get_from_id'?
   include/linux/cgroup.h:756:30: note: 'cgroup_get_from_id' declared here
   static inline struct cgroup *cgroup_get_from_id(u64 id)
                                ^
>> kernel/bpf/cgroup_iter.c:157:8: warning: incompatible integer to pointer conversion assigning to 'struct cgroup *' from 'int' [-Wint-conversion]
                   cgrp = cgroup_get_from_fd(fd);
                        ^ ~~~~~~~~~~~~~~~~~~~~~~
   kernel/bpf/cgroup_iter.c:159:10: error: call to undeclared function 'cgroup_get_from_path'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
                   cgrp = cgroup_get_from_path("/");
                          ^
   kernel/bpf/cgroup_iter.c:159:8: warning: incompatible integer to pointer conversion assigning to 'struct cgroup *' from 'int' [-Wint-conversion]
                   cgrp = cgroup_get_from_path("/");
                        ^ ~~~~~~~~~~~~~~~~~~~~~~~~~
   kernel/bpf/cgroup_iter.c:190:2: error: call to undeclared function 'cgroup_path_ns'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
           cgroup_path_ns(aux->cgroup.start, buf, PATH_MAX,
           ^
   kernel/bpf/cgroup_iter.c:190:2: note: did you mean 'cgroup_parent'?
   include/linux/cgroup.h:732:30: note: 'cgroup_parent' declared here
   static inline struct cgroup *cgroup_parent(struct cgroup *cgrp)
                                ^
   6 warnings and 19 errors generated.


vim +55 kernel/bpf/cgroup_iter.c

    41	
    42	static void *cgroup_iter_seq_start(struct seq_file *seq, loff_t *pos)
    43	{
    44		struct cgroup_iter_priv *p = seq->private;
    45	
    46		mutex_lock(&cgroup_mutex);
    47	
    48		/* support only one session */
    49		if (*pos > 0)
    50			return NULL;
    51	
    52		++*pos;
    53		p->terminate = false;
    54		if (p->order == BPF_ITER_CGROUP_PRE)
  > 55			return css_next_descendant_pre(NULL, p->start_css);
    56		else if (p->order == BPF_ITER_CGROUP_POST)
    57			return css_next_descendant_post(NULL, p->start_css);
    58		else /* BPF_ITER_CGROUP_PARENT_UP */
    59			return p->start_css;
    60	}
    61	
    62	static int __cgroup_iter_seq_show(struct seq_file *seq,
    63					  struct cgroup_subsys_state *css, int in_stop);
    64	
    65	static void cgroup_iter_seq_stop(struct seq_file *seq, void *v)
    66	{
    67		/* pass NULL to the prog for post-processing */
    68		if (!v)
    69			__cgroup_iter_seq_show(seq, NULL, true);
    70		mutex_unlock(&cgroup_mutex);
    71	}
    72	
    73	static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
    74	{
    75		struct cgroup_subsys_state *curr = (struct cgroup_subsys_state *)v;
    76		struct cgroup_iter_priv *p = seq->private;
    77	
    78		++*pos;
    79		if (p->terminate)
    80			return NULL;
    81	
    82		if (p->order == BPF_ITER_CGROUP_PRE)
    83			return css_next_descendant_pre(curr, p->start_css);
    84		else if (p->order == BPF_ITER_CGROUP_POST)
    85			return css_next_descendant_post(curr, p->start_css);
    86		else
    87			return curr->parent;
    88	}
    89	
    90	static int __cgroup_iter_seq_show(struct seq_file *seq,
    91					  struct cgroup_subsys_state *css, int in_stop)
    92	{
    93		struct cgroup_iter_priv *p = seq->private;
    94		struct bpf_iter__cgroup ctx;
    95		struct bpf_iter_meta meta;
    96		struct bpf_prog *prog;
    97		int ret = 0;
    98	
    99		/* cgroup is dead, skip this element */
   100		if (css && cgroup_is_dead(css->cgroup))
   101			return 0;
   102	
   103		ctx.meta = &meta;
   104		ctx.cgroup = css ? css->cgroup : NULL;
   105		meta.seq = seq;
   106		prog = bpf_iter_get_info(&meta, in_stop);
   107		if (prog)
   108			ret = bpf_iter_run_prog(prog, &ctx);
   109	
   110		/* if prog returns > 0, terminate after this element. */
   111		if (ret != 0)
   112			p->terminate = true;
   113	
   114		return 0;
   115	}
   116	
   117	static int cgroup_iter_seq_show(struct seq_file *seq, void *v)
   118	{
   119		return __cgroup_iter_seq_show(seq, (struct cgroup_subsys_state *)v,
   120					      false);
   121	}
   122	
   123	static const struct seq_operations cgroup_iter_seq_ops = {
   124		.start  = cgroup_iter_seq_start,
   125		.next   = cgroup_iter_seq_next,
   126		.stop   = cgroup_iter_seq_stop,
   127		.show   = cgroup_iter_seq_show,
   128	};
   129	
   130	BTF_ID_LIST_SINGLE(bpf_cgroup_btf_id, struct, cgroup)
   131	
   132	static int cgroup_iter_seq_init(void *priv, struct bpf_iter_aux_info *aux)
   133	{
   134		struct cgroup_iter_priv *p = (struct cgroup_iter_priv *)priv;
   135		struct cgroup *cgrp = aux->cgroup.start;
   136	
   137		p->start_css = &cgrp->self;
   138		p->terminate = false;
   139		p->order = aux->cgroup.order;
   140		return 0;
   141	}
   142	
   143	static const struct bpf_iter_seq_info cgroup_iter_seq_info = {
   144		.seq_ops                = &cgroup_iter_seq_ops,
   145		.init_seq_private       = cgroup_iter_seq_init,
   146		.seq_priv_size          = sizeof(struct cgroup_iter_priv),
   147	};
   148	
   149	static int bpf_iter_attach_cgroup(struct bpf_prog *prog,
   150					  union bpf_iter_link_info *linfo,
   151					  struct bpf_iter_aux_info *aux)
   152	{
   153		int fd = linfo->cgroup.cgroup_fd;
   154		struct cgroup *cgrp;
   155	
   156		if (fd)
 > 157			cgrp = cgroup_get_from_fd(fd);
   158		else /* walk the entire hierarchy by default. */
   159			cgrp = cgroup_get_from_path("/");
   160	
   161		if (IS_ERR(cgrp))
   162			return PTR_ERR(cgrp);
   163	
   164		aux->cgroup.start = cgrp;
   165		aux->cgroup.order = linfo->cgroup.traversal_order;
   166		return 0;
   167	}
   168
kernel test robot June 11, 2022, 12:55 p.m. UTC | #4
Hi Yosry,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on bpf-next/master]

url:    https://github.com/intel-lab-lkp/linux/commits/Yosry-Ahmed/bpf-rstat-cgroup-hierarchical-stats/20220611-034720
base:   https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
config: m68k-randconfig-r004-20220611 (https://download.01.org/0day-ci/archive/20220611/202206112000.LRgcxlpN-lkp@intel.com/config)
compiler: m68k-linux-gcc (GCC) 11.3.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/619857fd1ec4f351376ffcaaec20acc9aae9486f
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Yosry-Ahmed/bpf-rstat-cgroup-hierarchical-stats/20220611-034720
        git checkout 619857fd1ec4f351376ffcaaec20acc9aae9486f
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.3.0 make.cross W=1 O=build_dir ARCH=m68k SHELL=/bin/bash kernel/bpf/ kernel/cgroup/

If you fix the issue, kindly add following tag where applicable
Reported-by: kernel test robot <lkp@intel.com>

All error/warnings (new ones prefixed by >>):

   In file included from kernel/bpf/cgroup_iter.c:9:
>> kernel/bpf/../cgroup/cgroup-internal.h:78:41: error: field 'iter' has incomplete type
      78 |                 struct css_task_iter    iter;
         |                                         ^~~~
   kernel/bpf/../cgroup/cgroup-internal.h: In function 'cgroup_is_dead':
>> kernel/bpf/../cgroup/cgroup-internal.h:188:22: error: invalid use of undefined type 'const struct cgroup'
     188 |         return !(cgrp->self.flags & CSS_ONLINE);
         |                      ^~
>> kernel/bpf/../cgroup/cgroup-internal.h:188:37: error: 'CSS_ONLINE' undeclared (first use in this function); did you mean 'N_ONLINE'?
     188 |         return !(cgrp->self.flags & CSS_ONLINE);
         |                                     ^~~~~~~~~~
         |                                     N_ONLINE
   kernel/bpf/../cgroup/cgroup-internal.h:188:37: note: each undeclared identifier is reported only once for each function it appears in
   kernel/bpf/../cgroup/cgroup-internal.h: In function 'notify_on_release':
>> kernel/bpf/../cgroup/cgroup-internal.h:193:25: error: 'CGRP_NOTIFY_ON_RELEASE' undeclared (first use in this function)
     193 |         return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
         |                         ^~~~~~~~~~~~~~~~~~~~~~
   kernel/bpf/../cgroup/cgroup-internal.h:193:54: error: invalid use of undefined type 'const struct cgroup'
     193 |         return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
         |                                                      ^~
   kernel/bpf/../cgroup/cgroup-internal.h: In function 'put_css_set':
>> kernel/bpf/../cgroup/cgroup-internal.h:207:39: error: invalid use of undefined type 'struct css_set'
     207 |         if (refcount_dec_not_one(&cset->refcount))
         |                                       ^~
   kernel/bpf/../cgroup/cgroup-internal.h: In function 'get_css_set':
   kernel/bpf/../cgroup/cgroup-internal.h:220:27: error: invalid use of undefined type 'struct css_set'
     220 |         refcount_inc(&cset->refcount);
         |                           ^~
   kernel/bpf/../cgroup/cgroup-internal.h: At top level:
>> kernel/bpf/../cgroup/cgroup-internal.h:284:22: error: array type has incomplete element type 'struct cftype'
     284 | extern struct cftype cgroup1_base_files[];
         |                      ^~~~~~~~~~~~~~~~~~
   kernel/bpf/cgroup_iter.c: In function 'cgroup_iter_seq_start':
>> kernel/bpf/cgroup_iter.c:55:24: error: implicit declaration of function 'css_next_descendant_pre' [-Werror=implicit-function-declaration]
      55 |                 return css_next_descendant_pre(NULL, p->start_css);
         |                        ^~~~~~~~~~~~~~~~~~~~~~~
>> kernel/bpf/cgroup_iter.c:55:24: warning: returning 'int' from a function with return type 'void *' makes pointer from integer without a cast [-Wint-conversion]
      55 |                 return css_next_descendant_pre(NULL, p->start_css);
         |                        ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
>> kernel/bpf/cgroup_iter.c:57:24: error: implicit declaration of function 'css_next_descendant_post' [-Werror=implicit-function-declaration]
      57 |                 return css_next_descendant_post(NULL, p->start_css);
         |                        ^~~~~~~~~~~~~~~~~~~~~~~~
   kernel/bpf/cgroup_iter.c:57:24: warning: returning 'int' from a function with return type 'void *' makes pointer from integer without a cast [-Wint-conversion]
      57 |                 return css_next_descendant_post(NULL, p->start_css);
         |                        ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   kernel/bpf/cgroup_iter.c: In function 'cgroup_iter_seq_next':
   kernel/bpf/cgroup_iter.c:83:24: warning: returning 'int' from a function with return type 'void *' makes pointer from integer without a cast [-Wint-conversion]
      83 |                 return css_next_descendant_pre(curr, p->start_css);
         |                        ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   kernel/bpf/cgroup_iter.c:85:24: warning: returning 'int' from a function with return type 'void *' makes pointer from integer without a cast [-Wint-conversion]
      85 |                 return css_next_descendant_post(curr, p->start_css);
         |                        ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
>> kernel/bpf/cgroup_iter.c:87:28: error: invalid use of undefined type 'struct cgroup_subsys_state'
      87 |                 return curr->parent;
         |                            ^~
   kernel/bpf/cgroup_iter.c: In function '__cgroup_iter_seq_show':
   kernel/bpf/cgroup_iter.c:100:38: error: invalid use of undefined type 'struct cgroup_subsys_state'
     100 |         if (css && cgroup_is_dead(css->cgroup))
         |                                      ^~
   kernel/bpf/cgroup_iter.c:104:31: error: invalid use of undefined type 'struct cgroup_subsys_state'
     104 |         ctx.cgroup = css ? css->cgroup : NULL;
         |                               ^~
   kernel/bpf/cgroup_iter.c: In function 'cgroup_iter_seq_init':
>> kernel/bpf/cgroup_iter.c:137:29: error: invalid use of undefined type 'struct cgroup'
     137 |         p->start_css = &cgrp->self;
         |                             ^~
   kernel/bpf/cgroup_iter.c: In function 'bpf_iter_attach_cgroup':
>> kernel/bpf/cgroup_iter.c:157:24: error: implicit declaration of function 'cgroup_get_from_fd'; did you mean 'cgroup_get_from_id'? [-Werror=implicit-function-declaration]
     157 |                 cgrp = cgroup_get_from_fd(fd);
         |                        ^~~~~~~~~~~~~~~~~~
         |                        cgroup_get_from_id
>> kernel/bpf/cgroup_iter.c:157:22: warning: assignment to 'struct cgroup *' from 'int' makes pointer from integer without a cast [-Wint-conversion]
     157 |                 cgrp = cgroup_get_from_fd(fd);
         |                      ^
>> kernel/bpf/cgroup_iter.c:159:24: error: implicit declaration of function 'cgroup_get_from_path'; did you mean 'cgroup_get_from_id'? [-Werror=implicit-function-declaration]
     159 |                 cgrp = cgroup_get_from_path("/");
         |                        ^~~~~~~~~~~~~~~~~~~~
         |                        cgroup_get_from_id
   kernel/bpf/cgroup_iter.c:159:22: warning: assignment to 'struct cgroup *' from 'int' makes pointer from integer without a cast [-Wint-conversion]
     159 |                 cgrp = cgroup_get_from_path("/");
         |                      ^
   kernel/bpf/cgroup_iter.c: In function 'bpf_iter_cgroup_show_fdinfo':
>> kernel/bpf/cgroup_iter.c:190:9: error: implicit declaration of function 'cgroup_path_ns'; did you mean 'cgroup_parent'? [-Werror=implicit-function-declaration]
     190 |         cgroup_path_ns(aux->cgroup.start, buf, PATH_MAX,
         |         ^~~~~~~~~~~~~~
         |         cgroup_parent
   kernel/bpf/cgroup_iter.c: In function 'cgroup_iter_seq_next':
   kernel/bpf/cgroup_iter.c:88:1: error: control reaches end of non-void function [-Werror=return-type]
      88 | }
         | ^
   cc1: some warnings being treated as errors


vim +/iter +78 kernel/bpf/../cgroup/cgroup-internal.h

0d2b5955b36250 Tejun Heo       2022-01-06   68  
0d2b5955b36250 Tejun Heo       2022-01-06   69  struct cgroup_file_ctx {
e57457641613fe Tejun Heo       2022-01-06   70  	struct cgroup_namespace	*ns;
e57457641613fe Tejun Heo       2022-01-06   71  
0d2b5955b36250 Tejun Heo       2022-01-06   72  	struct {
0d2b5955b36250 Tejun Heo       2022-01-06   73  		void			*trigger;
0d2b5955b36250 Tejun Heo       2022-01-06   74  	} psi;
0d2b5955b36250 Tejun Heo       2022-01-06   75  
0d2b5955b36250 Tejun Heo       2022-01-06   76  	struct {
0d2b5955b36250 Tejun Heo       2022-01-06   77  		bool			started;
0d2b5955b36250 Tejun Heo       2022-01-06  @78  		struct css_task_iter	iter;
0d2b5955b36250 Tejun Heo       2022-01-06   79  	} procs;
0d2b5955b36250 Tejun Heo       2022-01-06   80  
0d2b5955b36250 Tejun Heo       2022-01-06   81  	struct {
0d2b5955b36250 Tejun Heo       2022-01-06   82  		struct cgroup_pidlist	*pidlist;
0d2b5955b36250 Tejun Heo       2022-01-06   83  	} procs1;
0d2b5955b36250 Tejun Heo       2022-01-06   84  };
0d2b5955b36250 Tejun Heo       2022-01-06   85  
0a268dbd7932c7 Tejun Heo       2016-12-27   86  /*
0a268dbd7932c7 Tejun Heo       2016-12-27   87   * A cgroup can be associated with multiple css_sets as different tasks may
0a268dbd7932c7 Tejun Heo       2016-12-27   88   * belong to different cgroups on different hierarchies.  In the other
0a268dbd7932c7 Tejun Heo       2016-12-27   89   * direction, a css_set is naturally associated with multiple cgroups.
0a268dbd7932c7 Tejun Heo       2016-12-27   90   * This M:N relationship is represented by the following link structure
0a268dbd7932c7 Tejun Heo       2016-12-27   91   * which exists for each association and allows traversing the associations
0a268dbd7932c7 Tejun Heo       2016-12-27   92   * from both sides.
0a268dbd7932c7 Tejun Heo       2016-12-27   93   */
0a268dbd7932c7 Tejun Heo       2016-12-27   94  struct cgrp_cset_link {
0a268dbd7932c7 Tejun Heo       2016-12-27   95  	/* the cgroup and css_set this link associates */
0a268dbd7932c7 Tejun Heo       2016-12-27   96  	struct cgroup		*cgrp;
0a268dbd7932c7 Tejun Heo       2016-12-27   97  	struct css_set		*cset;
0a268dbd7932c7 Tejun Heo       2016-12-27   98  
0a268dbd7932c7 Tejun Heo       2016-12-27   99  	/* list of cgrp_cset_links anchored at cgrp->cset_links */
0a268dbd7932c7 Tejun Heo       2016-12-27  100  	struct list_head	cset_link;
0a268dbd7932c7 Tejun Heo       2016-12-27  101  
0a268dbd7932c7 Tejun Heo       2016-12-27  102  	/* list of cgrp_cset_links anchored at css_set->cgrp_links */
0a268dbd7932c7 Tejun Heo       2016-12-27  103  	struct list_head	cgrp_link;
0a268dbd7932c7 Tejun Heo       2016-12-27  104  };
0a268dbd7932c7 Tejun Heo       2016-12-27  105  
e595cd706982bf Tejun Heo       2017-01-15  106  /* used to track tasks and csets during migration */
e595cd706982bf Tejun Heo       2017-01-15  107  struct cgroup_taskset {
e595cd706982bf Tejun Heo       2017-01-15  108  	/* the src and dst cset list running through cset->mg_node */
e595cd706982bf Tejun Heo       2017-01-15  109  	struct list_head	src_csets;
e595cd706982bf Tejun Heo       2017-01-15  110  	struct list_head	dst_csets;
e595cd706982bf Tejun Heo       2017-01-15  111  
610467270fb368 Tejun Heo       2017-07-08  112  	/* the number of tasks in the set */
610467270fb368 Tejun Heo       2017-07-08  113  	int			nr_tasks;
610467270fb368 Tejun Heo       2017-07-08  114  
e595cd706982bf Tejun Heo       2017-01-15  115  	/* the subsys currently being processed */
e595cd706982bf Tejun Heo       2017-01-15  116  	int			ssid;
e595cd706982bf Tejun Heo       2017-01-15  117  
e595cd706982bf Tejun Heo       2017-01-15  118  	/*
e595cd706982bf Tejun Heo       2017-01-15  119  	 * Fields for cgroup_taskset_*() iteration.
e595cd706982bf Tejun Heo       2017-01-15  120  	 *
e595cd706982bf Tejun Heo       2017-01-15  121  	 * Before migration is committed, the target migration tasks are on
e595cd706982bf Tejun Heo       2017-01-15  122  	 * ->mg_tasks of the csets on ->src_csets.  After, on ->mg_tasks of
e595cd706982bf Tejun Heo       2017-01-15  123  	 * the csets on ->dst_csets.  ->csets point to either ->src_csets
e595cd706982bf Tejun Heo       2017-01-15  124  	 * or ->dst_csets depending on whether migration is committed.
e595cd706982bf Tejun Heo       2017-01-15  125  	 *
e595cd706982bf Tejun Heo       2017-01-15  126  	 * ->cur_csets and ->cur_task point to the current task position
e595cd706982bf Tejun Heo       2017-01-15  127  	 * during iteration.
e595cd706982bf Tejun Heo       2017-01-15  128  	 */
e595cd706982bf Tejun Heo       2017-01-15  129  	struct list_head	*csets;
e595cd706982bf Tejun Heo       2017-01-15  130  	struct css_set		*cur_cset;
e595cd706982bf Tejun Heo       2017-01-15  131  	struct task_struct	*cur_task;
e595cd706982bf Tejun Heo       2017-01-15  132  };
e595cd706982bf Tejun Heo       2017-01-15  133  
e595cd706982bf Tejun Heo       2017-01-15  134  /* migration context also tracks preloading */
e595cd706982bf Tejun Heo       2017-01-15  135  struct cgroup_mgctx {
e595cd706982bf Tejun Heo       2017-01-15  136  	/*
e595cd706982bf Tejun Heo       2017-01-15  137  	 * Preloaded source and destination csets.  Used to guarantee
e595cd706982bf Tejun Heo       2017-01-15  138  	 * atomic success or failure on actual migration.
e595cd706982bf Tejun Heo       2017-01-15  139  	 */
e595cd706982bf Tejun Heo       2017-01-15  140  	struct list_head	preloaded_src_csets;
e595cd706982bf Tejun Heo       2017-01-15  141  	struct list_head	preloaded_dst_csets;
e595cd706982bf Tejun Heo       2017-01-15  142  
e595cd706982bf Tejun Heo       2017-01-15  143  	/* tasks and csets to migrate */
e595cd706982bf Tejun Heo       2017-01-15  144  	struct cgroup_taskset	tset;
bfc2cf6f61fcea Tejun Heo       2017-01-15  145  
bfc2cf6f61fcea Tejun Heo       2017-01-15  146  	/* subsystems affected by migration */
bfc2cf6f61fcea Tejun Heo       2017-01-15  147  	u16			ss_mask;
e595cd706982bf Tejun Heo       2017-01-15  148  };
e595cd706982bf Tejun Heo       2017-01-15  149  
e595cd706982bf Tejun Heo       2017-01-15  150  #define CGROUP_TASKSET_INIT(tset)						\
e595cd706982bf Tejun Heo       2017-01-15  151  {										\
e595cd706982bf Tejun Heo       2017-01-15  152  	.src_csets		= LIST_HEAD_INIT(tset.src_csets),		\
e595cd706982bf Tejun Heo       2017-01-15  153  	.dst_csets		= LIST_HEAD_INIT(tset.dst_csets),		\
e595cd706982bf Tejun Heo       2017-01-15  154  	.csets			= &tset.src_csets,				\
e595cd706982bf Tejun Heo       2017-01-15  155  }
e595cd706982bf Tejun Heo       2017-01-15  156  
e595cd706982bf Tejun Heo       2017-01-15  157  #define CGROUP_MGCTX_INIT(name)							\
e595cd706982bf Tejun Heo       2017-01-15  158  {										\
e595cd706982bf Tejun Heo       2017-01-15  159  	LIST_HEAD_INIT(name.preloaded_src_csets),				\
e595cd706982bf Tejun Heo       2017-01-15  160  	LIST_HEAD_INIT(name.preloaded_dst_csets),				\
e595cd706982bf Tejun Heo       2017-01-15  161  	CGROUP_TASKSET_INIT(name.tset),						\
e595cd706982bf Tejun Heo       2017-01-15  162  }
e595cd706982bf Tejun Heo       2017-01-15  163  
e595cd706982bf Tejun Heo       2017-01-15  164  #define DEFINE_CGROUP_MGCTX(name)						\
e595cd706982bf Tejun Heo       2017-01-15  165  	struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)
e595cd706982bf Tejun Heo       2017-01-15  166  
0a268dbd7932c7 Tejun Heo       2016-12-27  167  extern struct mutex cgroup_mutex;
0a268dbd7932c7 Tejun Heo       2016-12-27  168  extern spinlock_t css_set_lock;
0a268dbd7932c7 Tejun Heo       2016-12-27  169  extern struct cgroup_subsys *cgroup_subsys[];
0a268dbd7932c7 Tejun Heo       2016-12-27  170  extern struct list_head cgroup_roots;
0a268dbd7932c7 Tejun Heo       2016-12-27  171  extern struct file_system_type cgroup_fs_type;
0a268dbd7932c7 Tejun Heo       2016-12-27  172  
0a268dbd7932c7 Tejun Heo       2016-12-27  173  /* iterate across the hierarchies */
0a268dbd7932c7 Tejun Heo       2016-12-27  174  #define for_each_root(root)						\
0a268dbd7932c7 Tejun Heo       2016-12-27  175  	list_for_each_entry((root), &cgroup_roots, root_list)
0a268dbd7932c7 Tejun Heo       2016-12-27  176  
0a268dbd7932c7 Tejun Heo       2016-12-27  177  /**
0a268dbd7932c7 Tejun Heo       2016-12-27  178   * for_each_subsys - iterate all enabled cgroup subsystems
0a268dbd7932c7 Tejun Heo       2016-12-27  179   * @ss: the iteration cursor
0a268dbd7932c7 Tejun Heo       2016-12-27  180   * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
0a268dbd7932c7 Tejun Heo       2016-12-27  181   */
0a268dbd7932c7 Tejun Heo       2016-12-27  182  #define for_each_subsys(ss, ssid)					\
0a268dbd7932c7 Tejun Heo       2016-12-27  183  	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT &&		\
0a268dbd7932c7 Tejun Heo       2016-12-27  184  	     (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
0a268dbd7932c7 Tejun Heo       2016-12-27  185  
0a268dbd7932c7 Tejun Heo       2016-12-27  186  static inline bool cgroup_is_dead(const struct cgroup *cgrp)
0a268dbd7932c7 Tejun Heo       2016-12-27  187  {
0a268dbd7932c7 Tejun Heo       2016-12-27 @188  	return !(cgrp->self.flags & CSS_ONLINE);
0a268dbd7932c7 Tejun Heo       2016-12-27  189  }
0a268dbd7932c7 Tejun Heo       2016-12-27  190  
0a268dbd7932c7 Tejun Heo       2016-12-27  191  static inline bool notify_on_release(const struct cgroup *cgrp)
0a268dbd7932c7 Tejun Heo       2016-12-27  192  {
0a268dbd7932c7 Tejun Heo       2016-12-27 @193  	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
0a268dbd7932c7 Tejun Heo       2016-12-27  194  }
0a268dbd7932c7 Tejun Heo       2016-12-27  195  
dcfe149b9f45aa Tejun Heo       2016-12-27  196  void put_css_set_locked(struct css_set *cset);
dcfe149b9f45aa Tejun Heo       2016-12-27  197  
dcfe149b9f45aa Tejun Heo       2016-12-27  198  static inline void put_css_set(struct css_set *cset)
dcfe149b9f45aa Tejun Heo       2016-12-27  199  {
dcfe149b9f45aa Tejun Heo       2016-12-27  200  	unsigned long flags;
dcfe149b9f45aa Tejun Heo       2016-12-27  201  
dcfe149b9f45aa Tejun Heo       2016-12-27  202  	/*
dcfe149b9f45aa Tejun Heo       2016-12-27  203  	 * Ensure that the refcount doesn't hit zero while any readers
dcfe149b9f45aa Tejun Heo       2016-12-27  204  	 * can see it. Similar to atomic_dec_and_lock(), but for an
dcfe149b9f45aa Tejun Heo       2016-12-27  205  	 * rwlock
dcfe149b9f45aa Tejun Heo       2016-12-27  206  	 */
4b9502e63b5e2b Elena Reshetova 2017-03-08 @207  	if (refcount_dec_not_one(&cset->refcount))
dcfe149b9f45aa Tejun Heo       2016-12-27  208  		return;
dcfe149b9f45aa Tejun Heo       2016-12-27  209  
dcfe149b9f45aa Tejun Heo       2016-12-27  210  	spin_lock_irqsave(&css_set_lock, flags);
dcfe149b9f45aa Tejun Heo       2016-12-27  211  	put_css_set_locked(cset);
dcfe149b9f45aa Tejun Heo       2016-12-27  212  	spin_unlock_irqrestore(&css_set_lock, flags);
dcfe149b9f45aa Tejun Heo       2016-12-27  213  }
dcfe149b9f45aa Tejun Heo       2016-12-27  214
Yonghong Song June 28, 2022, 4:09 a.m. UTC | #5
On 6/10/22 12:44 PM, Yosry Ahmed wrote:
> From: Hao Luo <haoluo@google.com>
> 
> Cgroup_iter is a type of bpf_iter. It walks over cgroups in two modes:
> 
>   - walking a cgroup's descendants.
>   - walking a cgroup's ancestors.

The implementation has another choice, BPF_ITER_CGROUP_PARENT_UP.
We should add it here as well.

> 
> When attaching cgroup_iter, one can set a cgroup to the iter_link
> created from attaching. This cgroup is passed as a file descriptor and
> serves as the starting point of the walk. If no cgroup is specified,
> the starting point will be the root cgroup.
> 
> For walking descendants, one can specify the order: either pre-order or
> post-order. For walking ancestors, the walk starts at the specified
> cgroup and ends at the root.
> 
> One can also terminate the walk early by returning 1 from the iter
> program.
> 
> Note that because walking cgroup hierarchy holds cgroup_mutex, the iter
> program is called with cgroup_mutex held.

Overall looks good to me with a few nits below.

Acked-by: Yonghong Song <yhs@fb.com>

> 
> Signed-off-by: Hao Luo <haoluo@google.com>
> Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
> ---
>   include/linux/bpf.h            |   8 ++
>   include/uapi/linux/bpf.h       |  21 +++
>   kernel/bpf/Makefile            |   2 +-
>   kernel/bpf/cgroup_iter.c       | 235 +++++++++++++++++++++++++++++++++
>   tools/include/uapi/linux/bpf.h |  21 +++
>   5 files changed, 286 insertions(+), 1 deletion(-)
>   create mode 100644 kernel/bpf/cgroup_iter.c
> 
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 8e6092d0ea956..48d8e836b9748 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -44,6 +44,7 @@ struct kobject;
>   struct mem_cgroup;
>   struct module;
>   struct bpf_func_state;
> +struct cgroup;
>   
>   extern struct idr btf_idr;
>   extern spinlock_t btf_idr_lock;
> @@ -1590,7 +1591,14 @@ int bpf_obj_get_user(const char __user *pathname, int flags);
>   	int __init bpf_iter_ ## target(args) { return 0; }
>   
>   struct bpf_iter_aux_info {
> +	/* for map_elem iter */
>   	struct bpf_map *map;
> +
> +	/* for cgroup iter */
> +	struct {
> +		struct cgroup *start; /* starting cgroup */
> +		int order;
> +	} cgroup;
>   };
>   
>   typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog,
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index f4009dbdf62da..4fd05cde19116 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -87,10 +87,27 @@ struct bpf_cgroup_storage_key {
>   	__u32	attach_type;		/* program attach type (enum bpf_attach_type) */
>   };
>   
> +enum bpf_iter_cgroup_traversal_order {
> +	BPF_ITER_CGROUP_PRE = 0,	/* pre-order traversal */
> +	BPF_ITER_CGROUP_POST,		/* post-order traversal */
> +	BPF_ITER_CGROUP_PARENT_UP,	/* traversal of ancestors up to the root */
> +};
> +
>   union bpf_iter_link_info {
>   	struct {
>   		__u32	map_fd;
>   	} map;
> +
> +	/* cgroup_iter walks either the live descendants of a cgroup subtree, or the ancestors
> +	 * of a given cgroup.
> +	 */
> +	struct {
> +		/* Cgroup file descriptor. This is root of the subtree if for walking the
> +		 * descendants; this is the starting cgroup if for walking the ancestors.
> +		 */
> +		__u32	cgroup_fd;
> +		__u32	traversal_order;
> +	} cgroup;
>   };
>   
>   /* BPF syscall commands, see bpf(2) man-page for more details. */
> @@ -6050,6 +6067,10 @@ struct bpf_link_info {
>   				struct {
>   					__u32 map_id;
>   				} map;
> +				struct {
> +					__u32 traversal_order;
> +					__aligned_u64 cgroup_id;
> +				} cgroup;
>   			};
>   		} iter;
>   		struct  {
> diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
> index 057ba8e01e70f..9741b9314fb46 100644
> --- a/kernel/bpf/Makefile
> +++ b/kernel/bpf/Makefile
> @@ -8,7 +8,7 @@ CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy)
>   
>   obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o
>   obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o
> -obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
> +obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o cgroup_iter.o
>   obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
>   obj-${CONFIG_BPF_LSM}	  += bpf_inode_storage.o
>   obj-$(CONFIG_BPF_SYSCALL) += disasm.o
> diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c
> new file mode 100644
> index 0000000000000..88deb655efa71
> --- /dev/null
> +++ b/kernel/bpf/cgroup_iter.c
> @@ -0,0 +1,235 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/* Copyright (c) 2022 Google */
> +#include <linux/bpf.h>
> +#include <linux/btf_ids.h>
> +#include <linux/cgroup.h>
> +#include <linux/kernel.h>
> +#include <linux/seq_file.h>
> +
> +#include "../cgroup/cgroup-internal.h"  /* cgroup_mutex and cgroup_is_dead */
> +
> +/* cgroup_iter provides two modes of traversal to the cgroup hierarchy.
> + *
> + *  1. Walk the descendants of a cgroup.
> + *  2. Walk the ancestors of a cgroup.

three modes here?

> + *
> + * For walking descendants, cgroup_iter can walk in either pre-order or
> + * post-order. For walking ancestors, the iter walks up from a cgroup to
> + * the root.
> + *
> + * The iter program can terminate the walk early by returning 1. Walk
> + * continues if prog returns 0.
> + *
> + * The prog can check (seq->num == 0) to determine whether this is
> + * the first element. The prog may also be passed a NULL cgroup,
> + * which means the walk has completed and the prog has a chance to
> + * do post-processing, such as outputing an epilogue.
> + *
> + * Note: the iter_prog is called with cgroup_mutex held.
> + */
> +
> +struct bpf_iter__cgroup {
> +	__bpf_md_ptr(struct bpf_iter_meta *, meta);
> +	__bpf_md_ptr(struct cgroup *, cgroup);
> +};
> +
> +struct cgroup_iter_priv {
> +	struct cgroup_subsys_state *start_css;
> +	bool terminate;
> +	int order;
> +};
> +
> +static void *cgroup_iter_seq_start(struct seq_file *seq, loff_t *pos)
> +{
> +	struct cgroup_iter_priv *p = seq->private;
> +
> +	mutex_lock(&cgroup_mutex);
> +
> +	/* support only one session */
> +	if (*pos > 0)
> +		return NULL;
> +
> +	++*pos;
> +	p->terminate = false;
> +	if (p->order == BPF_ITER_CGROUP_PRE)
> +		return css_next_descendant_pre(NULL, p->start_css);
> +	else if (p->order == BPF_ITER_CGROUP_POST)
> +		return css_next_descendant_post(NULL, p->start_css);
> +	else /* BPF_ITER_CGROUP_PARENT_UP */
> +		return p->start_css;
> +}
> +
> +static int __cgroup_iter_seq_show(struct seq_file *seq,
> +				  struct cgroup_subsys_state *css, int in_stop);
> +
> +static void cgroup_iter_seq_stop(struct seq_file *seq, void *v)
> +{
> +	/* pass NULL to the prog for post-processing */
> +	if (!v)
> +		__cgroup_iter_seq_show(seq, NULL, true);
> +	mutex_unlock(&cgroup_mutex);
> +}
> +
> +static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
> +{
> +	struct cgroup_subsys_state *curr = (struct cgroup_subsys_state *)v;
> +	struct cgroup_iter_priv *p = seq->private;
> +
> +	++*pos;
> +	if (p->terminate)
> +		return NULL;
> +
> +	if (p->order == BPF_ITER_CGROUP_PRE)
> +		return css_next_descendant_pre(curr, p->start_css);
> +	else if (p->order == BPF_ITER_CGROUP_POST)
> +		return css_next_descendant_post(curr, p->start_css);
> +	else
> +		return curr->parent;
> +}
> +
> +static int __cgroup_iter_seq_show(struct seq_file *seq,
> +				  struct cgroup_subsys_state *css, int in_stop)
> +{
> +	struct cgroup_iter_priv *p = seq->private;
> +	struct bpf_iter__cgroup ctx;
> +	struct bpf_iter_meta meta;
> +	struct bpf_prog *prog;
> +	int ret = 0;
> +
> +	/* cgroup is dead, skip this element */
> +	if (css && cgroup_is_dead(css->cgroup))
> +		return 0;
> +
> +	ctx.meta = &meta;
> +	ctx.cgroup = css ? css->cgroup : NULL;
> +	meta.seq = seq;
> +	prog = bpf_iter_get_info(&meta, in_stop);
> +	if (prog)
> +		ret = bpf_iter_run_prog(prog, &ctx);
> +
> +	/* if prog returns > 0, terminate after this element. */
> +	if (ret != 0)
> +		p->terminate = true;
> +
> +	return 0;
> +}
> +
> +static int cgroup_iter_seq_show(struct seq_file *seq, void *v)
> +{
> +	return __cgroup_iter_seq_show(seq, (struct cgroup_subsys_state *)v,
> +				      false);
> +}
> +
> +static const struct seq_operations cgroup_iter_seq_ops = {
> +	.start  = cgroup_iter_seq_start,
> +	.next   = cgroup_iter_seq_next,
> +	.stop   = cgroup_iter_seq_stop,
> +	.show   = cgroup_iter_seq_show,
> +};
> +
> +BTF_ID_LIST_SINGLE(bpf_cgroup_btf_id, struct, cgroup)
> +
> +static int cgroup_iter_seq_init(void *priv, struct bpf_iter_aux_info *aux)
> +{
> +	struct cgroup_iter_priv *p = (struct cgroup_iter_priv *)priv;
> +	struct cgroup *cgrp = aux->cgroup.start;
> +
> +	p->start_css = &cgrp->self;
> +	p->terminate = false;
> +	p->order = aux->cgroup.order;
> +	return 0;
> +}
> +
> +static const struct bpf_iter_seq_info cgroup_iter_seq_info = {
> +	.seq_ops                = &cgroup_iter_seq_ops,
> +	.init_seq_private       = cgroup_iter_seq_init,
> +	.seq_priv_size          = sizeof(struct cgroup_iter_priv),
> +};
> +
> +static int bpf_iter_attach_cgroup(struct bpf_prog *prog,
> +				  union bpf_iter_link_info *linfo,
> +				  struct bpf_iter_aux_info *aux)
> +{
> +	int fd = linfo->cgroup.cgroup_fd;
> +	struct cgroup *cgrp;
> +
> +	if (fd)
> +		cgrp = cgroup_get_from_fd(fd);
> +	else /* walk the entire hierarchy by default. */
> +		cgrp = cgroup_get_from_path("/");
> +
> +	if (IS_ERR(cgrp))
> +		return PTR_ERR(cgrp);
> +
> +	aux->cgroup.start = cgrp;
> +	aux->cgroup.order = linfo->cgroup.traversal_order;

The legality of traversal_order should be checked.

> +	return 0;
> +}
> +
> +static void bpf_iter_detach_cgroup(struct bpf_iter_aux_info *aux)
> +{
> +	cgroup_put(aux->cgroup.start);
> +}
> +
> +static void bpf_iter_cgroup_show_fdinfo(const struct bpf_iter_aux_info *aux,
> +					struct seq_file *seq)
> +{
> +	char *buf;
> +
> +	buf = kzalloc(PATH_MAX, GFP_KERNEL);
> +	if (!buf) {
> +		seq_puts(seq, "cgroup_path:\n");

This is a really unlikely case. maybe "cgroup_path:<unknown>"?

> +		goto show_order;
> +	}
> +
> +	/* If cgroup_path_ns() fails, buf will be an empty string, cgroup_path
> +	 * will print nothing.
> +	 *
> +	 * Path is in the calling process's cgroup namespace.
> +	 */
> +	cgroup_path_ns(aux->cgroup.start, buf, PATH_MAX,
> +		       current->nsproxy->cgroup_ns);
> +	seq_printf(seq, "cgroup_path:\t%s\n", buf);
> +	kfree(buf);
> +
> +show_order:
> +	if (aux->cgroup.order == BPF_ITER_CGROUP_PRE)
> +		seq_puts(seq, "traversal_order: pre\n");
> +	else if (aux->cgroup.order == BPF_ITER_CGROUP_POST)
> +		seq_puts(seq, "traversal_order: post\n");
> +	else /* BPF_ITER_CGROUP_PARENT_UP */
> +		seq_puts(seq, "traversal_order: parent_up\n");
> +}
> +
[...]
Yonghong Song June 28, 2022, 4:14 a.m. UTC | #6
On 6/10/22 12:44 PM, Yosry Ahmed wrote:
> From: Hao Luo <haoluo@google.com>
> 
> Cgroup_iter is a type of bpf_iter. It walks over cgroups in two modes:
> 
>   - walking a cgroup's descendants.
>   - walking a cgroup's ancestors.
> 
> When attaching cgroup_iter, one can set a cgroup to the iter_link
> created from attaching. This cgroup is passed as a file descriptor and
> serves as the starting point of the walk. If no cgroup is specified,
> the starting point will be the root cgroup.
> 
> For walking descendants, one can specify the order: either pre-order or
> post-order. For walking ancestors, the walk starts at the specified
> cgroup and ends at the root.
> 
> One can also terminate the walk early by returning 1 from the iter
> program.
> 
> Note that because walking cgroup hierarchy holds cgroup_mutex, the iter
> program is called with cgroup_mutex held.
> 
> Signed-off-by: Hao Luo <haoluo@google.com>
> Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
> ---
>   include/linux/bpf.h            |   8 ++
>   include/uapi/linux/bpf.h       |  21 +++
>   kernel/bpf/Makefile            |   2 +-
>   kernel/bpf/cgroup_iter.c       | 235 +++++++++++++++++++++++++++++++++
>   tools/include/uapi/linux/bpf.h |  21 +++
>   5 files changed, 286 insertions(+), 1 deletion(-)
>   create mode 100644 kernel/bpf/cgroup_iter.c
> 
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 8e6092d0ea956..48d8e836b9748 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -44,6 +44,7 @@ struct kobject;
>   struct mem_cgroup;
>   struct module;
>   struct bpf_func_state;
> +struct cgroup;
>   
>   extern struct idr btf_idr;
>   extern spinlock_t btf_idr_lock;
> @@ -1590,7 +1591,14 @@ int bpf_obj_get_user(const char __user *pathname, int flags);
>   	int __init bpf_iter_ ## target(args) { return 0; }
>   
>   struct bpf_iter_aux_info {
> +	/* for map_elem iter */
>   	struct bpf_map *map;
> +
> +	/* for cgroup iter */
> +	struct {
> +		struct cgroup *start; /* starting cgroup */
> +		int order;
> +	} cgroup;
>   };
>   
[...]
> +
> +static void *cgroup_iter_seq_start(struct seq_file *seq, loff_t *pos)
> +{
> +	struct cgroup_iter_priv *p = seq->private;
> +
> +	mutex_lock(&cgroup_mutex);
> +
> +	/* support only one session */
> +	if (*pos > 0)
> +		return NULL;
> +
> +	++*pos;
> +	p->terminate = false;
> +	if (p->order == BPF_ITER_CGROUP_PRE)
> +		return css_next_descendant_pre(NULL, p->start_css);
> +	else if (p->order == BPF_ITER_CGROUP_POST)
> +		return css_next_descendant_post(NULL, p->start_css);
> +	else /* BPF_ITER_CGROUP_PARENT_UP */
> +		return p->start_css;
> +}
> +
> +static int __cgroup_iter_seq_show(struct seq_file *seq,
> +				  struct cgroup_subsys_state *css, int in_stop);
> +
> +static void cgroup_iter_seq_stop(struct seq_file *seq, void *v)
> +{
> +	/* pass NULL to the prog for post-processing */
> +	if (!v)
> +		__cgroup_iter_seq_show(seq, NULL, true);
> +	mutex_unlock(&cgroup_mutex);
> +}
> +
> +static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
> +{
> +	struct cgroup_subsys_state *curr = (struct cgroup_subsys_state *)v;
> +	struct cgroup_iter_priv *p = seq->private;
> +
> +	++*pos;
> +	if (p->terminate)
> +		return NULL;
> +
> +	if (p->order == BPF_ITER_CGROUP_PRE)
> +		return css_next_descendant_pre(curr, p->start_css);
> +	else if (p->order == BPF_ITER_CGROUP_POST)
> +		return css_next_descendant_post(curr, p->start_css);
> +	else
> +		return curr->parent;
> +}
> +
> +static int __cgroup_iter_seq_show(struct seq_file *seq,
> +				  struct cgroup_subsys_state *css, int in_stop)
> +{
> +	struct cgroup_iter_priv *p = seq->private;
> +	struct bpf_iter__cgroup ctx;
> +	struct bpf_iter_meta meta;
> +	struct bpf_prog *prog;
> +	int ret = 0;
> +
> +	/* cgroup is dead, skip this element */
> +	if (css && cgroup_is_dead(css->cgroup))
> +		return 0;
> +
> +	ctx.meta = &meta;
> +	ctx.cgroup = css ? css->cgroup : NULL;
> +	meta.seq = seq;
> +	prog = bpf_iter_get_info(&meta, in_stop);
> +	if (prog)
> +		ret = bpf_iter_run_prog(prog, &ctx);

Do we need to do anything special to ensure bpf program gets
up-to-date stat from ctx.cgroup?

> +
> +	/* if prog returns > 0, terminate after this element. */
> +	if (ret != 0)
> +		p->terminate = true;
> +
> +	return 0;
> +}
> +
[...]
Yosry Ahmed June 28, 2022, 6:03 a.m. UTC | #7
On Mon, Jun 27, 2022 at 9:14 PM Yonghong Song <yhs@fb.com> wrote:
>
>
>
> On 6/10/22 12:44 PM, Yosry Ahmed wrote:
> > From: Hao Luo <haoluo@google.com>
> >
> > Cgroup_iter is a type of bpf_iter. It walks over cgroups in two modes:
> >
> >   - walking a cgroup's descendants.
> >   - walking a cgroup's ancestors.
> >
> > When attaching cgroup_iter, one can set a cgroup to the iter_link
> > created from attaching. This cgroup is passed as a file descriptor and
> > serves as the starting point of the walk. If no cgroup is specified,
> > the starting point will be the root cgroup.
> >
> > For walking descendants, one can specify the order: either pre-order or
> > post-order. For walking ancestors, the walk starts at the specified
> > cgroup and ends at the root.
> >
> > One can also terminate the walk early by returning 1 from the iter
> > program.
> >
> > Note that because walking cgroup hierarchy holds cgroup_mutex, the iter
> > program is called with cgroup_mutex held.
> >
> > Signed-off-by: Hao Luo <haoluo@google.com>
> > Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
> > ---
> >   include/linux/bpf.h            |   8 ++
> >   include/uapi/linux/bpf.h       |  21 +++
> >   kernel/bpf/Makefile            |   2 +-
> >   kernel/bpf/cgroup_iter.c       | 235 +++++++++++++++++++++++++++++++++
> >   tools/include/uapi/linux/bpf.h |  21 +++
> >   5 files changed, 286 insertions(+), 1 deletion(-)
> >   create mode 100644 kernel/bpf/cgroup_iter.c
> >
> > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > index 8e6092d0ea956..48d8e836b9748 100644
> > --- a/include/linux/bpf.h
> > +++ b/include/linux/bpf.h
> > @@ -44,6 +44,7 @@ struct kobject;
> >   struct mem_cgroup;
> >   struct module;
> >   struct bpf_func_state;
> > +struct cgroup;
> >
> >   extern struct idr btf_idr;
> >   extern spinlock_t btf_idr_lock;
> > @@ -1590,7 +1591,14 @@ int bpf_obj_get_user(const char __user *pathname, int flags);
> >       int __init bpf_iter_ ## target(args) { return 0; }
> >
> >   struct bpf_iter_aux_info {
> > +     /* for map_elem iter */
> >       struct bpf_map *map;
> > +
> > +     /* for cgroup iter */
> > +     struct {
> > +             struct cgroup *start; /* starting cgroup */
> > +             int order;
> > +     } cgroup;
> >   };
> >
> [...]
> > +
> > +static void *cgroup_iter_seq_start(struct seq_file *seq, loff_t *pos)
> > +{
> > +     struct cgroup_iter_priv *p = seq->private;
> > +
> > +     mutex_lock(&cgroup_mutex);
> > +
> > +     /* support only one session */
> > +     if (*pos > 0)
> > +             return NULL;
> > +
> > +     ++*pos;
> > +     p->terminate = false;
> > +     if (p->order == BPF_ITER_CGROUP_PRE)
> > +             return css_next_descendant_pre(NULL, p->start_css);
> > +     else if (p->order == BPF_ITER_CGROUP_POST)
> > +             return css_next_descendant_post(NULL, p->start_css);
> > +     else /* BPF_ITER_CGROUP_PARENT_UP */
> > +             return p->start_css;
> > +}
> > +
> > +static int __cgroup_iter_seq_show(struct seq_file *seq,
> > +                               struct cgroup_subsys_state *css, int in_stop);
> > +
> > +static void cgroup_iter_seq_stop(struct seq_file *seq, void *v)
> > +{
> > +     /* pass NULL to the prog for post-processing */
> > +     if (!v)
> > +             __cgroup_iter_seq_show(seq, NULL, true);
> > +     mutex_unlock(&cgroup_mutex);
> > +}
> > +
> > +static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
> > +{
> > +     struct cgroup_subsys_state *curr = (struct cgroup_subsys_state *)v;
> > +     struct cgroup_iter_priv *p = seq->private;
> > +
> > +     ++*pos;
> > +     if (p->terminate)
> > +             return NULL;
> > +
> > +     if (p->order == BPF_ITER_CGROUP_PRE)
> > +             return css_next_descendant_pre(curr, p->start_css);
> > +     else if (p->order == BPF_ITER_CGROUP_POST)
> > +             return css_next_descendant_post(curr, p->start_css);
> > +     else
> > +             return curr->parent;
> > +}
> > +
> > +static int __cgroup_iter_seq_show(struct seq_file *seq,
> > +                               struct cgroup_subsys_state *css, int in_stop)
> > +{
> > +     struct cgroup_iter_priv *p = seq->private;
> > +     struct bpf_iter__cgroup ctx;
> > +     struct bpf_iter_meta meta;
> > +     struct bpf_prog *prog;
> > +     int ret = 0;
> > +
> > +     /* cgroup is dead, skip this element */
> > +     if (css && cgroup_is_dead(css->cgroup))
> > +             return 0;
> > +
> > +     ctx.meta = &meta;
> > +     ctx.cgroup = css ? css->cgroup : NULL;
> > +     meta.seq = seq;
> > +     prog = bpf_iter_get_info(&meta, in_stop);
> > +     if (prog)
> > +             ret = bpf_iter_run_prog(prog, &ctx);
>
> Do we need to do anything special to ensure bpf program gets
> up-to-date stat from ctx.cgroup?

Later patches in the series add cgroup_flush_rstat() kfunc which
flushes cgroup stats that use rstat (e.g. memcg stats). It can be
called directly from the bpf program if needed.

It would be better to leave this to the bpf program, it's an
unnecessary toll to flush the stats for any cgroup_iter program, that
could be not accessing stats, or stats that are not maintained using
rstat.

>
> > +
> > +     /* if prog returns > 0, terminate after this element. */
> > +     if (ret != 0)
> > +             p->terminate = true;
> > +
> > +     return 0;
> > +}
> > +
> [...]
Yosry Ahmed June 28, 2022, 6:06 a.m. UTC | #8
On Mon, Jun 27, 2022 at 9:09 PM Yonghong Song <yhs@fb.com> wrote:
>
>
>
> On 6/10/22 12:44 PM, Yosry Ahmed wrote:
> > From: Hao Luo <haoluo@google.com>
> >
> > Cgroup_iter is a type of bpf_iter. It walks over cgroups in two modes:
> >
> >   - walking a cgroup's descendants.
> >   - walking a cgroup's ancestors.
>
> The implementation has another choice, BPF_ITER_CGROUP_PARENT_UP.
> We should add it here as well.
>

BPF_ITER_CGROUP_PARENT_UP is expressed here, I think what's actually
missing here (and down below where only 2 modes are specified again)
is that walking descendants is broken down into two separate modes,
pre and post order traversals.

> >
> > When attaching cgroup_iter, one can set a cgroup to the iter_link
> > created from attaching. This cgroup is passed as a file descriptor and
> > serves as the starting point of the walk. If no cgroup is specified,
> > the starting point will be the root cgroup.
> >
> > For walking descendants, one can specify the order: either pre-order or
> > post-order. For walking ancestors, the walk starts at the specified
> > cgroup and ends at the root.
> >
> > One can also terminate the walk early by returning 1 from the iter
> > program.
> >
> > Note that because walking cgroup hierarchy holds cgroup_mutex, the iter
> > program is called with cgroup_mutex held.
>
> Overall looks good to me with a few nits below.
>
> Acked-by: Yonghong Song <yhs@fb.com>
>
> >
> > Signed-off-by: Hao Luo <haoluo@google.com>
> > Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
> > ---
> >   include/linux/bpf.h            |   8 ++
> >   include/uapi/linux/bpf.h       |  21 +++
> >   kernel/bpf/Makefile            |   2 +-
> >   kernel/bpf/cgroup_iter.c       | 235 +++++++++++++++++++++++++++++++++
> >   tools/include/uapi/linux/bpf.h |  21 +++
> >   5 files changed, 286 insertions(+), 1 deletion(-)
> >   create mode 100644 kernel/bpf/cgroup_iter.c
> >
> > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > index 8e6092d0ea956..48d8e836b9748 100644
> > --- a/include/linux/bpf.h
> > +++ b/include/linux/bpf.h
> > @@ -44,6 +44,7 @@ struct kobject;
> >   struct mem_cgroup;
> >   struct module;
> >   struct bpf_func_state;
> > +struct cgroup;
> >
> >   extern struct idr btf_idr;
> >   extern spinlock_t btf_idr_lock;
> > @@ -1590,7 +1591,14 @@ int bpf_obj_get_user(const char __user *pathname, int flags);
> >       int __init bpf_iter_ ## target(args) { return 0; }
> >
> >   struct bpf_iter_aux_info {
> > +     /* for map_elem iter */
> >       struct bpf_map *map;
> > +
> > +     /* for cgroup iter */
> > +     struct {
> > +             struct cgroup *start; /* starting cgroup */
> > +             int order;
> > +     } cgroup;
> >   };
> >
> >   typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog,
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index f4009dbdf62da..4fd05cde19116 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -87,10 +87,27 @@ struct bpf_cgroup_storage_key {
> >       __u32   attach_type;            /* program attach type (enum bpf_attach_type) */
> >   };
> >
> > +enum bpf_iter_cgroup_traversal_order {
> > +     BPF_ITER_CGROUP_PRE = 0,        /* pre-order traversal */
> > +     BPF_ITER_CGROUP_POST,           /* post-order traversal */
> > +     BPF_ITER_CGROUP_PARENT_UP,      /* traversal of ancestors up to the root */
> > +};
> > +
> >   union bpf_iter_link_info {
> >       struct {
> >               __u32   map_fd;
> >       } map;
> > +
> > +     /* cgroup_iter walks either the live descendants of a cgroup subtree, or the ancestors
> > +      * of a given cgroup.
> > +      */
> > +     struct {
> > +             /* Cgroup file descriptor. This is root of the subtree if for walking the
> > +              * descendants; this is the starting cgroup if for walking the ancestors.
> > +              */
> > +             __u32   cgroup_fd;
> > +             __u32   traversal_order;
> > +     } cgroup;
> >   };
> >
> >   /* BPF syscall commands, see bpf(2) man-page for more details. */
> > @@ -6050,6 +6067,10 @@ struct bpf_link_info {
> >                               struct {
> >                                       __u32 map_id;
> >                               } map;
> > +                             struct {
> > +                                     __u32 traversal_order;
> > +                                     __aligned_u64 cgroup_id;
> > +                             } cgroup;
> >                       };
> >               } iter;
> >               struct  {
> > diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
> > index 057ba8e01e70f..9741b9314fb46 100644
> > --- a/kernel/bpf/Makefile
> > +++ b/kernel/bpf/Makefile
> > @@ -8,7 +8,7 @@ CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy)
> >
> >   obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o
> >   obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o
> > -obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
> > +obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o cgroup_iter.o
> >   obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
> >   obj-${CONFIG_BPF_LSM}         += bpf_inode_storage.o
> >   obj-$(CONFIG_BPF_SYSCALL) += disasm.o
> > diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c
> > new file mode 100644
> > index 0000000000000..88deb655efa71
> > --- /dev/null
> > +++ b/kernel/bpf/cgroup_iter.c
> > @@ -0,0 +1,235 @@
> > +// SPDX-License-Identifier: GPL-2.0-only
> > +/* Copyright (c) 2022 Google */
> > +#include <linux/bpf.h>
> > +#include <linux/btf_ids.h>
> > +#include <linux/cgroup.h>
> > +#include <linux/kernel.h>
> > +#include <linux/seq_file.h>
> > +
> > +#include "../cgroup/cgroup-internal.h"  /* cgroup_mutex and cgroup_is_dead */
> > +
> > +/* cgroup_iter provides two modes of traversal to the cgroup hierarchy.
> > + *
> > + *  1. Walk the descendants of a cgroup.
> > + *  2. Walk the ancestors of a cgroup.
>
> three modes here?
>
> > + *
> > + * For walking descendants, cgroup_iter can walk in either pre-order or
> > + * post-order. For walking ancestors, the iter walks up from a cgroup to
> > + * the root.
> > + *
> > + * The iter program can terminate the walk early by returning 1. Walk
> > + * continues if prog returns 0.
> > + *
> > + * The prog can check (seq->num == 0) to determine whether this is
> > + * the first element. The prog may also be passed a NULL cgroup,
> > + * which means the walk has completed and the prog has a chance to
> > + * do post-processing, such as outputing an epilogue.
> > + *
> > + * Note: the iter_prog is called with cgroup_mutex held.
> > + */
> > +
> > +struct bpf_iter__cgroup {
> > +     __bpf_md_ptr(struct bpf_iter_meta *, meta);
> > +     __bpf_md_ptr(struct cgroup *, cgroup);
> > +};
> > +
> > +struct cgroup_iter_priv {
> > +     struct cgroup_subsys_state *start_css;
> > +     bool terminate;
> > +     int order;
> > +};
> > +
> > +static void *cgroup_iter_seq_start(struct seq_file *seq, loff_t *pos)
> > +{
> > +     struct cgroup_iter_priv *p = seq->private;
> > +
> > +     mutex_lock(&cgroup_mutex);
> > +
> > +     /* support only one session */
> > +     if (*pos > 0)
> > +             return NULL;
> > +
> > +     ++*pos;
> > +     p->terminate = false;
> > +     if (p->order == BPF_ITER_CGROUP_PRE)
> > +             return css_next_descendant_pre(NULL, p->start_css);
> > +     else if (p->order == BPF_ITER_CGROUP_POST)
> > +             return css_next_descendant_post(NULL, p->start_css);
> > +     else /* BPF_ITER_CGROUP_PARENT_UP */
> > +             return p->start_css;
> > +}
> > +
> > +static int __cgroup_iter_seq_show(struct seq_file *seq,
> > +                               struct cgroup_subsys_state *css, int in_stop);
> > +
> > +static void cgroup_iter_seq_stop(struct seq_file *seq, void *v)
> > +{
> > +     /* pass NULL to the prog for post-processing */
> > +     if (!v)
> > +             __cgroup_iter_seq_show(seq, NULL, true);
> > +     mutex_unlock(&cgroup_mutex);
> > +}
> > +
> > +static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
> > +{
> > +     struct cgroup_subsys_state *curr = (struct cgroup_subsys_state *)v;
> > +     struct cgroup_iter_priv *p = seq->private;
> > +
> > +     ++*pos;
> > +     if (p->terminate)
> > +             return NULL;
> > +
> > +     if (p->order == BPF_ITER_CGROUP_PRE)
> > +             return css_next_descendant_pre(curr, p->start_css);
> > +     else if (p->order == BPF_ITER_CGROUP_POST)
> > +             return css_next_descendant_post(curr, p->start_css);
> > +     else
> > +             return curr->parent;
> > +}
> > +
> > +static int __cgroup_iter_seq_show(struct seq_file *seq,
> > +                               struct cgroup_subsys_state *css, int in_stop)
> > +{
> > +     struct cgroup_iter_priv *p = seq->private;
> > +     struct bpf_iter__cgroup ctx;
> > +     struct bpf_iter_meta meta;
> > +     struct bpf_prog *prog;
> > +     int ret = 0;
> > +
> > +     /* cgroup is dead, skip this element */
> > +     if (css && cgroup_is_dead(css->cgroup))
> > +             return 0;
> > +
> > +     ctx.meta = &meta;
> > +     ctx.cgroup = css ? css->cgroup : NULL;
> > +     meta.seq = seq;
> > +     prog = bpf_iter_get_info(&meta, in_stop);
> > +     if (prog)
> > +             ret = bpf_iter_run_prog(prog, &ctx);
> > +
> > +     /* if prog returns > 0, terminate after this element. */
> > +     if (ret != 0)
> > +             p->terminate = true;
> > +
> > +     return 0;
> > +}
> > +
> > +static int cgroup_iter_seq_show(struct seq_file *seq, void *v)
> > +{
> > +     return __cgroup_iter_seq_show(seq, (struct cgroup_subsys_state *)v,
> > +                                   false);
> > +}
> > +
> > +static const struct seq_operations cgroup_iter_seq_ops = {
> > +     .start  = cgroup_iter_seq_start,
> > +     .next   = cgroup_iter_seq_next,
> > +     .stop   = cgroup_iter_seq_stop,
> > +     .show   = cgroup_iter_seq_show,
> > +};
> > +
> > +BTF_ID_LIST_SINGLE(bpf_cgroup_btf_id, struct, cgroup)
> > +
> > +static int cgroup_iter_seq_init(void *priv, struct bpf_iter_aux_info *aux)
> > +{
> > +     struct cgroup_iter_priv *p = (struct cgroup_iter_priv *)priv;
> > +     struct cgroup *cgrp = aux->cgroup.start;
> > +
> > +     p->start_css = &cgrp->self;
> > +     p->terminate = false;
> > +     p->order = aux->cgroup.order;
> > +     return 0;
> > +}
> > +
> > +static const struct bpf_iter_seq_info cgroup_iter_seq_info = {
> > +     .seq_ops                = &cgroup_iter_seq_ops,
> > +     .init_seq_private       = cgroup_iter_seq_init,
> > +     .seq_priv_size          = sizeof(struct cgroup_iter_priv),
> > +};
> > +
> > +static int bpf_iter_attach_cgroup(struct bpf_prog *prog,
> > +                               union bpf_iter_link_info *linfo,
> > +                               struct bpf_iter_aux_info *aux)
> > +{
> > +     int fd = linfo->cgroup.cgroup_fd;
> > +     struct cgroup *cgrp;
> > +
> > +     if (fd)
> > +             cgrp = cgroup_get_from_fd(fd);
> > +     else /* walk the entire hierarchy by default. */
> > +             cgrp = cgroup_get_from_path("/");
> > +
> > +     if (IS_ERR(cgrp))
> > +             return PTR_ERR(cgrp);
> > +
> > +     aux->cgroup.start = cgrp;
> > +     aux->cgroup.order = linfo->cgroup.traversal_order;
>
> The legality of traversal_order should be checked.
>
> > +     return 0;
> > +}
> > +
> > +static void bpf_iter_detach_cgroup(struct bpf_iter_aux_info *aux)
> > +{
> > +     cgroup_put(aux->cgroup.start);
> > +}
> > +
> > +static void bpf_iter_cgroup_show_fdinfo(const struct bpf_iter_aux_info *aux,
> > +                                     struct seq_file *seq)
> > +{
> > +     char *buf;
> > +
> > +     buf = kzalloc(PATH_MAX, GFP_KERNEL);
> > +     if (!buf) {
> > +             seq_puts(seq, "cgroup_path:\n");
>
> This is a really unlikely case. maybe "cgroup_path:<unknown>"?
>
> > +             goto show_order;
> > +     }
> > +
> > +     /* If cgroup_path_ns() fails, buf will be an empty string, cgroup_path
> > +      * will print nothing.
> > +      *
> > +      * Path is in the calling process's cgroup namespace.
> > +      */
> > +     cgroup_path_ns(aux->cgroup.start, buf, PATH_MAX,
> > +                    current->nsproxy->cgroup_ns);
> > +     seq_printf(seq, "cgroup_path:\t%s\n", buf);
> > +     kfree(buf);
> > +
> > +show_order:
> > +     if (aux->cgroup.order == BPF_ITER_CGROUP_PRE)
> > +             seq_puts(seq, "traversal_order: pre\n");
> > +     else if (aux->cgroup.order == BPF_ITER_CGROUP_POST)
> > +             seq_puts(seq, "traversal_order: post\n");
> > +     else /* BPF_ITER_CGROUP_PARENT_UP */
> > +             seq_puts(seq, "traversal_order: parent_up\n");
> > +}
> > +
> [...]
Yonghong Song June 28, 2022, 5:03 p.m. UTC | #9
On 6/27/22 11:03 PM, Yosry Ahmed wrote:
> On Mon, Jun 27, 2022 at 9:14 PM Yonghong Song <yhs@fb.com> wrote:
>>
>>
>>
>> On 6/10/22 12:44 PM, Yosry Ahmed wrote:
>>> From: Hao Luo <haoluo@google.com>
>>>
>>> Cgroup_iter is a type of bpf_iter. It walks over cgroups in two modes:
>>>
>>>    - walking a cgroup's descendants.
>>>    - walking a cgroup's ancestors.
>>>
>>> When attaching cgroup_iter, one can set a cgroup to the iter_link
>>> created from attaching. This cgroup is passed as a file descriptor and
>>> serves as the starting point of the walk. If no cgroup is specified,
>>> the starting point will be the root cgroup.
>>>
>>> For walking descendants, one can specify the order: either pre-order or
>>> post-order. For walking ancestors, the walk starts at the specified
>>> cgroup and ends at the root.
>>>
>>> One can also terminate the walk early by returning 1 from the iter
>>> program.
>>>
>>> Note that because walking cgroup hierarchy holds cgroup_mutex, the iter
>>> program is called with cgroup_mutex held.
>>>
>>> Signed-off-by: Hao Luo <haoluo@google.com>
>>> Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
>>> ---
>>>    include/linux/bpf.h            |   8 ++
>>>    include/uapi/linux/bpf.h       |  21 +++
>>>    kernel/bpf/Makefile            |   2 +-
>>>    kernel/bpf/cgroup_iter.c       | 235 +++++++++++++++++++++++++++++++++
>>>    tools/include/uapi/linux/bpf.h |  21 +++
>>>    5 files changed, 286 insertions(+), 1 deletion(-)
>>>    create mode 100644 kernel/bpf/cgroup_iter.c
>>>
>>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>>> index 8e6092d0ea956..48d8e836b9748 100644
>>> --- a/include/linux/bpf.h
>>> +++ b/include/linux/bpf.h
>>> @@ -44,6 +44,7 @@ struct kobject;
>>>    struct mem_cgroup;
>>>    struct module;
>>>    struct bpf_func_state;
>>> +struct cgroup;
>>>
>>>    extern struct idr btf_idr;
>>>    extern spinlock_t btf_idr_lock;
>>> @@ -1590,7 +1591,14 @@ int bpf_obj_get_user(const char __user *pathname, int flags);
>>>        int __init bpf_iter_ ## target(args) { return 0; }
>>>
>>>    struct bpf_iter_aux_info {
>>> +     /* for map_elem iter */
>>>        struct bpf_map *map;
>>> +
>>> +     /* for cgroup iter */
>>> +     struct {
>>> +             struct cgroup *start; /* starting cgroup */
>>> +             int order;
>>> +     } cgroup;
>>>    };
>>>
>> [...]
>>> +
>>> +static void *cgroup_iter_seq_start(struct seq_file *seq, loff_t *pos)
>>> +{
>>> +     struct cgroup_iter_priv *p = seq->private;
>>> +
>>> +     mutex_lock(&cgroup_mutex);
>>> +
>>> +     /* support only one session */
>>> +     if (*pos > 0)
>>> +             return NULL;
>>> +
>>> +     ++*pos;
>>> +     p->terminate = false;
>>> +     if (p->order == BPF_ITER_CGROUP_PRE)
>>> +             return css_next_descendant_pre(NULL, p->start_css);
>>> +     else if (p->order == BPF_ITER_CGROUP_POST)
>>> +             return css_next_descendant_post(NULL, p->start_css);
>>> +     else /* BPF_ITER_CGROUP_PARENT_UP */
>>> +             return p->start_css;
>>> +}
>>> +
>>> +static int __cgroup_iter_seq_show(struct seq_file *seq,
>>> +                               struct cgroup_subsys_state *css, int in_stop);
>>> +
>>> +static void cgroup_iter_seq_stop(struct seq_file *seq, void *v)
>>> +{
>>> +     /* pass NULL to the prog for post-processing */
>>> +     if (!v)
>>> +             __cgroup_iter_seq_show(seq, NULL, true);
>>> +     mutex_unlock(&cgroup_mutex);
>>> +}
>>> +
>>> +static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
>>> +{
>>> +     struct cgroup_subsys_state *curr = (struct cgroup_subsys_state *)v;
>>> +     struct cgroup_iter_priv *p = seq->private;
>>> +
>>> +     ++*pos;
>>> +     if (p->terminate)
>>> +             return NULL;
>>> +
>>> +     if (p->order == BPF_ITER_CGROUP_PRE)
>>> +             return css_next_descendant_pre(curr, p->start_css);
>>> +     else if (p->order == BPF_ITER_CGROUP_POST)
>>> +             return css_next_descendant_post(curr, p->start_css);
>>> +     else
>>> +             return curr->parent;
>>> +}
>>> +
>>> +static int __cgroup_iter_seq_show(struct seq_file *seq,
>>> +                               struct cgroup_subsys_state *css, int in_stop)
>>> +{
>>> +     struct cgroup_iter_priv *p = seq->private;
>>> +     struct bpf_iter__cgroup ctx;
>>> +     struct bpf_iter_meta meta;
>>> +     struct bpf_prog *prog;
>>> +     int ret = 0;
>>> +
>>> +     /* cgroup is dead, skip this element */
>>> +     if (css && cgroup_is_dead(css->cgroup))
>>> +             return 0;
>>> +
>>> +     ctx.meta = &meta;
>>> +     ctx.cgroup = css ? css->cgroup : NULL;
>>> +     meta.seq = seq;
>>> +     prog = bpf_iter_get_info(&meta, in_stop);
>>> +     if (prog)
>>> +             ret = bpf_iter_run_prog(prog, &ctx);
>>
>> Do we need to do anything special to ensure bpf program gets
>> up-to-date stat from ctx.cgroup?
> 
> Later patches in the series add cgroup_flush_rstat() kfunc which
> flushes cgroup stats that use rstat (e.g. memcg stats). It can be
> called directly from the bpf program if needed.
> 
> It would be better to leave this to the bpf program, it's an
> unnecessary toll to flush the stats for any cgroup_iter program, that
> could be not accessing stats, or stats that are not maintained using
> rstat.

Okay, this should work.

> 
>>
>>> +
>>> +     /* if prog returns > 0, terminate after this element. */
>>> +     if (ret != 0)
>>> +             p->terminate = true;
>>> +
>>> +     return 0;
>>> +}
>>> +
>> [...]
Hao Luo July 7, 2022, 11:33 p.m. UTC | #10
On Mon, Jun 27, 2022 at 9:09 PM Yonghong Song <yhs@fb.com> wrote:
>
>
>
> On 6/10/22 12:44 PM, Yosry Ahmed wrote:
> > From: Hao Luo <haoluo@google.com>
> >
> > Cgroup_iter is a type of bpf_iter. It walks over cgroups in two modes:
> >
> >   - walking a cgroup's descendants.
> >   - walking a cgroup's ancestors.
>
> The implementation has another choice, BPF_ITER_CGROUP_PARENT_UP.
> We should add it here as well.
>

Sorry about the late reply. I meant to write two modes: walking up and
walking down. And two sub-modes when walking down: pre-order and
post-order.

But it seems this is confusing. I'm going to use three modes in the
next version: UP, PRE and POST.

Besides, since this patch modifies the bpf_iter_link_info, and that
breaks the btf_dump selftest, I need to include the fix of the
selftest in this patch.

> >
> > When attaching cgroup_iter, one can set a cgroup to the iter_link
> > created from attaching. This cgroup is passed as a file descriptor and
> > serves as the starting point of the walk. If no cgroup is specified,
> > the starting point will be the root cgroup.
> >
> > For walking descendants, one can specify the order: either pre-order or
> > post-order. For walking ancestors, the walk starts at the specified
> > cgroup and ends at the root.
> >
> > One can also terminate the walk early by returning 1 from the iter
> > program.
> >
> > Note that because walking cgroup hierarchy holds cgroup_mutex, the iter
> > program is called with cgroup_mutex held.
>
> Overall looks good to me with a few nits below.
>
> Acked-by: Yonghong Song <yhs@fb.com>
>
> >
> > Signed-off-by: Hao Luo <haoluo@google.com>
> > Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
> > ---
[...]
> > +
> > +/* cgroup_iter provides two modes of traversal to the cgroup hierarchy.
> > + *
> > + *  1. Walk the descendants of a cgroup.
> > + *  2. Walk the ancestors of a cgroup.
>
> three modes here?
>

Same here. Will use 'three modes' in the next version.

> > + *
[...]
> > +
> > +static int bpf_iter_attach_cgroup(struct bpf_prog *prog,
> > +                               union bpf_iter_link_info *linfo,
> > +                               struct bpf_iter_aux_info *aux)
> > +{
> > +     int fd = linfo->cgroup.cgroup_fd;
> > +     struct cgroup *cgrp;
> > +
> > +     if (fd)
> > +             cgrp = cgroup_get_from_fd(fd);
> > +     else /* walk the entire hierarchy by default. */
> > +             cgrp = cgroup_get_from_path("/");
> > +
> > +     if (IS_ERR(cgrp))
> > +             return PTR_ERR(cgrp);
> > +
> > +     aux->cgroup.start = cgrp;
> > +     aux->cgroup.order = linfo->cgroup.traversal_order;
>
> The legality of traversal_order should be checked.
>

Sounds good. Will do.

> > +     return 0;
> > +}
> > +
[...]
> > +static void bpf_iter_cgroup_show_fdinfo(const struct bpf_iter_aux_info *aux,
> > +                                     struct seq_file *seq)
> > +{
> > +     char *buf;
> > +
> > +     buf = kzalloc(PATH_MAX, GFP_KERNEL);
> > +     if (!buf) {
> > +             seq_puts(seq, "cgroup_path:\n");
>
> This is a really unlikely case. maybe "cgroup_path:<unknown>"?
>

Sure, no problem. This is a path that is unlikely to hit.

> > +             goto show_order;
[...]
Hao Luo July 7, 2022, 11:36 p.m. UTC | #11
On Mon, Jun 27, 2022 at 9:14 PM Yonghong Song <yhs@fb.com> wrote:
>
> > +static int __cgroup_iter_seq_show(struct seq_file *seq,
> > +                               struct cgroup_subsys_state *css, int in_stop)
> > +{
> > +     struct cgroup_iter_priv *p = seq->private;
> > +     struct bpf_iter__cgroup ctx;
> > +     struct bpf_iter_meta meta;
> > +     struct bpf_prog *prog;
> > +     int ret = 0;
> > +
> > +     /* cgroup is dead, skip this element */
> > +     if (css && cgroup_is_dead(css->cgroup))
> > +             return 0;
> > +
> > +     ctx.meta = &meta;
> > +     ctx.cgroup = css ? css->cgroup : NULL;
> > +     meta.seq = seq;
> > +     prog = bpf_iter_get_info(&meta, in_stop);
> > +     if (prog)
> > +             ret = bpf_iter_run_prog(prog, &ctx);
>
> Do we need to do anything special to ensure bpf program gets
> up-to-date stat from ctx.cgroup?
>

Let's leave that to be handled by bpf programs. The kfunc rstat_flush
can be called to sync stats, if using rstat.

> > +
> > +     /* if prog returns > 0, terminate after this element. */
> > +     if (ret != 0)
> > +             p->terminate = true;
> > +
> > +     return 0;
> > +}
> > +
> [...]
diff mbox series

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 8e6092d0ea956..48d8e836b9748 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -44,6 +44,7 @@  struct kobject;
 struct mem_cgroup;
 struct module;
 struct bpf_func_state;
+struct cgroup;
 
 extern struct idr btf_idr;
 extern spinlock_t btf_idr_lock;
@@ -1590,7 +1591,14 @@  int bpf_obj_get_user(const char __user *pathname, int flags);
 	int __init bpf_iter_ ## target(args) { return 0; }
 
 struct bpf_iter_aux_info {
+	/* for map_elem iter */
 	struct bpf_map *map;
+
+	/* for cgroup iter */
+	struct {
+		struct cgroup *start; /* starting cgroup */
+		int order;
+	} cgroup;
 };
 
 typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog,
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f4009dbdf62da..4fd05cde19116 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -87,10 +87,27 @@  struct bpf_cgroup_storage_key {
 	__u32	attach_type;		/* program attach type (enum bpf_attach_type) */
 };
 
+enum bpf_iter_cgroup_traversal_order {
+	BPF_ITER_CGROUP_PRE = 0,	/* pre-order traversal */
+	BPF_ITER_CGROUP_POST,		/* post-order traversal */
+	BPF_ITER_CGROUP_PARENT_UP,	/* traversal of ancestors up to the root */
+};
+
 union bpf_iter_link_info {
 	struct {
 		__u32	map_fd;
 	} map;
+
+	/* cgroup_iter walks either the live descendants of a cgroup subtree, or the ancestors
+	 * of a given cgroup.
+	 */
+	struct {
+		/* Cgroup file descriptor. This is root of the subtree if for walking the
+		 * descendants; this is the starting cgroup if for walking the ancestors.
+		 */
+		__u32	cgroup_fd;
+		__u32	traversal_order;
+	} cgroup;
 };
 
 /* BPF syscall commands, see bpf(2) man-page for more details. */
@@ -6050,6 +6067,10 @@  struct bpf_link_info {
 				struct {
 					__u32 map_id;
 				} map;
+				struct {
+					__u32 traversal_order;
+					__aligned_u64 cgroup_id;
+				} cgroup;
 			};
 		} iter;
 		struct  {
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 057ba8e01e70f..9741b9314fb46 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -8,7 +8,7 @@  CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy)
 
 obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o
-obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
+obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o cgroup_iter.o
 obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
 obj-${CONFIG_BPF_LSM}	  += bpf_inode_storage.o
 obj-$(CONFIG_BPF_SYSCALL) += disasm.o
diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c
new file mode 100644
index 0000000000000..88deb655efa71
--- /dev/null
+++ b/kernel/bpf/cgroup_iter.c
@@ -0,0 +1,235 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2022 Google */
+#include <linux/bpf.h>
+#include <linux/btf_ids.h>
+#include <linux/cgroup.h>
+#include <linux/kernel.h>
+#include <linux/seq_file.h>
+
+#include "../cgroup/cgroup-internal.h"  /* cgroup_mutex and cgroup_is_dead */
+
+/* cgroup_iter provides two modes of traversal to the cgroup hierarchy.
+ *
+ *  1. Walk the descendants of a cgroup.
+ *  2. Walk the ancestors of a cgroup.
+ *
+ * For walking descendants, cgroup_iter can walk in either pre-order or
+ * post-order. For walking ancestors, the iter walks up from a cgroup to
+ * the root.
+ *
+ * The iter program can terminate the walk early by returning 1. Walk
+ * continues if prog returns 0.
+ *
+ * The prog can check (seq->num == 0) to determine whether this is
+ * the first element. The prog may also be passed a NULL cgroup,
+ * which means the walk has completed and the prog has a chance to
+ * do post-processing, such as outputing an epilogue.
+ *
+ * Note: the iter_prog is called with cgroup_mutex held.
+ */
+
+struct bpf_iter__cgroup {
+	__bpf_md_ptr(struct bpf_iter_meta *, meta);
+	__bpf_md_ptr(struct cgroup *, cgroup);
+};
+
+struct cgroup_iter_priv {
+	struct cgroup_subsys_state *start_css;
+	bool terminate;
+	int order;
+};
+
+static void *cgroup_iter_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct cgroup_iter_priv *p = seq->private;
+
+	mutex_lock(&cgroup_mutex);
+
+	/* support only one session */
+	if (*pos > 0)
+		return NULL;
+
+	++*pos;
+	p->terminate = false;
+	if (p->order == BPF_ITER_CGROUP_PRE)
+		return css_next_descendant_pre(NULL, p->start_css);
+	else if (p->order == BPF_ITER_CGROUP_POST)
+		return css_next_descendant_post(NULL, p->start_css);
+	else /* BPF_ITER_CGROUP_PARENT_UP */
+		return p->start_css;
+}
+
+static int __cgroup_iter_seq_show(struct seq_file *seq,
+				  struct cgroup_subsys_state *css, int in_stop);
+
+static void cgroup_iter_seq_stop(struct seq_file *seq, void *v)
+{
+	/* pass NULL to the prog for post-processing */
+	if (!v)
+		__cgroup_iter_seq_show(seq, NULL, true);
+	mutex_unlock(&cgroup_mutex);
+}
+
+static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct cgroup_subsys_state *curr = (struct cgroup_subsys_state *)v;
+	struct cgroup_iter_priv *p = seq->private;
+
+	++*pos;
+	if (p->terminate)
+		return NULL;
+
+	if (p->order == BPF_ITER_CGROUP_PRE)
+		return css_next_descendant_pre(curr, p->start_css);
+	else if (p->order == BPF_ITER_CGROUP_POST)
+		return css_next_descendant_post(curr, p->start_css);
+	else
+		return curr->parent;
+}
+
+static int __cgroup_iter_seq_show(struct seq_file *seq,
+				  struct cgroup_subsys_state *css, int in_stop)
+{
+	struct cgroup_iter_priv *p = seq->private;
+	struct bpf_iter__cgroup ctx;
+	struct bpf_iter_meta meta;
+	struct bpf_prog *prog;
+	int ret = 0;
+
+	/* cgroup is dead, skip this element */
+	if (css && cgroup_is_dead(css->cgroup))
+		return 0;
+
+	ctx.meta = &meta;
+	ctx.cgroup = css ? css->cgroup : NULL;
+	meta.seq = seq;
+	prog = bpf_iter_get_info(&meta, in_stop);
+	if (prog)
+		ret = bpf_iter_run_prog(prog, &ctx);
+
+	/* if prog returns > 0, terminate after this element. */
+	if (ret != 0)
+		p->terminate = true;
+
+	return 0;
+}
+
+static int cgroup_iter_seq_show(struct seq_file *seq, void *v)
+{
+	return __cgroup_iter_seq_show(seq, (struct cgroup_subsys_state *)v,
+				      false);
+}
+
+static const struct seq_operations cgroup_iter_seq_ops = {
+	.start  = cgroup_iter_seq_start,
+	.next   = cgroup_iter_seq_next,
+	.stop   = cgroup_iter_seq_stop,
+	.show   = cgroup_iter_seq_show,
+};
+
+BTF_ID_LIST_SINGLE(bpf_cgroup_btf_id, struct, cgroup)
+
+static int cgroup_iter_seq_init(void *priv, struct bpf_iter_aux_info *aux)
+{
+	struct cgroup_iter_priv *p = (struct cgroup_iter_priv *)priv;
+	struct cgroup *cgrp = aux->cgroup.start;
+
+	p->start_css = &cgrp->self;
+	p->terminate = false;
+	p->order = aux->cgroup.order;
+	return 0;
+}
+
+static const struct bpf_iter_seq_info cgroup_iter_seq_info = {
+	.seq_ops                = &cgroup_iter_seq_ops,
+	.init_seq_private       = cgroup_iter_seq_init,
+	.seq_priv_size          = sizeof(struct cgroup_iter_priv),
+};
+
+static int bpf_iter_attach_cgroup(struct bpf_prog *prog,
+				  union bpf_iter_link_info *linfo,
+				  struct bpf_iter_aux_info *aux)
+{
+	int fd = linfo->cgroup.cgroup_fd;
+	struct cgroup *cgrp;
+
+	if (fd)
+		cgrp = cgroup_get_from_fd(fd);
+	else /* walk the entire hierarchy by default. */
+		cgrp = cgroup_get_from_path("/");
+
+	if (IS_ERR(cgrp))
+		return PTR_ERR(cgrp);
+
+	aux->cgroup.start = cgrp;
+	aux->cgroup.order = linfo->cgroup.traversal_order;
+	return 0;
+}
+
+static void bpf_iter_detach_cgroup(struct bpf_iter_aux_info *aux)
+{
+	cgroup_put(aux->cgroup.start);
+}
+
+static void bpf_iter_cgroup_show_fdinfo(const struct bpf_iter_aux_info *aux,
+					struct seq_file *seq)
+{
+	char *buf;
+
+	buf = kzalloc(PATH_MAX, GFP_KERNEL);
+	if (!buf) {
+		seq_puts(seq, "cgroup_path:\n");
+		goto show_order;
+	}
+
+	/* If cgroup_path_ns() fails, buf will be an empty string, cgroup_path
+	 * will print nothing.
+	 *
+	 * Path is in the calling process's cgroup namespace.
+	 */
+	cgroup_path_ns(aux->cgroup.start, buf, PATH_MAX,
+		       current->nsproxy->cgroup_ns);
+	seq_printf(seq, "cgroup_path:\t%s\n", buf);
+	kfree(buf);
+
+show_order:
+	if (aux->cgroup.order == BPF_ITER_CGROUP_PRE)
+		seq_puts(seq, "traversal_order: pre\n");
+	else if (aux->cgroup.order == BPF_ITER_CGROUP_POST)
+		seq_puts(seq, "traversal_order: post\n");
+	else /* BPF_ITER_CGROUP_PARENT_UP */
+		seq_puts(seq, "traversal_order: parent_up\n");
+}
+
+static int bpf_iter_cgroup_fill_link_info(const struct bpf_iter_aux_info *aux,
+					  struct bpf_link_info *info)
+{
+	info->iter.cgroup.traversal_order = aux->cgroup.order;
+	info->iter.cgroup.cgroup_id = cgroup_id(aux->cgroup.start);
+	return 0;
+}
+
+DEFINE_BPF_ITER_FUNC(cgroup, struct bpf_iter_meta *meta,
+		     struct cgroup *cgroup)
+
+static struct bpf_iter_reg bpf_cgroup_reg_info = {
+	.target			= "cgroup",
+	.attach_target		= bpf_iter_attach_cgroup,
+	.detach_target		= bpf_iter_detach_cgroup,
+	.show_fdinfo		= bpf_iter_cgroup_show_fdinfo,
+	.fill_link_info		= bpf_iter_cgroup_fill_link_info,
+	.ctx_arg_info_size	= 1,
+	.ctx_arg_info		= {
+		{ offsetof(struct bpf_iter__cgroup, cgroup),
+		  PTR_TO_BTF_ID },
+	},
+	.seq_info		= &cgroup_iter_seq_info,
+};
+
+static int __init bpf_cgroup_iter_init(void)
+{
+	bpf_cgroup_reg_info.ctx_arg_info[0].btf_id = bpf_cgroup_btf_id[0];
+	return bpf_iter_reg_target(&bpf_cgroup_reg_info);
+}
+
+late_initcall(bpf_cgroup_iter_init);
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index f4009dbdf62da..4fd05cde19116 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -87,10 +87,27 @@  struct bpf_cgroup_storage_key {
 	__u32	attach_type;		/* program attach type (enum bpf_attach_type) */
 };
 
+enum bpf_iter_cgroup_traversal_order {
+	BPF_ITER_CGROUP_PRE = 0,	/* pre-order traversal */
+	BPF_ITER_CGROUP_POST,		/* post-order traversal */
+	BPF_ITER_CGROUP_PARENT_UP,	/* traversal of ancestors up to the root */
+};
+
 union bpf_iter_link_info {
 	struct {
 		__u32	map_fd;
 	} map;
+
+	/* cgroup_iter walks either the live descendants of a cgroup subtree, or the ancestors
+	 * of a given cgroup.
+	 */
+	struct {
+		/* Cgroup file descriptor. This is root of the subtree if for walking the
+		 * descendants; this is the starting cgroup if for walking the ancestors.
+		 */
+		__u32	cgroup_fd;
+		__u32	traversal_order;
+	} cgroup;
 };
 
 /* BPF syscall commands, see bpf(2) man-page for more details. */
@@ -6050,6 +6067,10 @@  struct bpf_link_info {
 				struct {
 					__u32 map_id;
 				} map;
+				struct {
+					__u32 traversal_order;
+					__aligned_u64 cgroup_id;
+				} cgroup;
 			};
 		} iter;
 		struct  {