[v4,bpf-next,1/3] bpf: Add kmem_cache iterator

Message ID 20241002180956.1781008-2-namhyung@kernel.org (mailing list archive)
State Superseded
Delegated to: BPF
Series bpf: Add kmem_cache iterator and kfunc

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 213 this patch: 213
netdev/build_tools success Errors and warnings before: 0 this patch: 0
netdev/cc_maintainers success CCed 13 of 13 maintainers
netdev/build_clang success Errors and warnings before: 272 this patch: 272
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 6965 this patch: 6965
netdev/checkpatch warning CHECK: Comparison to NULL could be written "!ctx.s"; WARNING: added, moved or deleted file(s), does MAINTAINERS need updating?; WARNING: line length of 81 exceeds 80 columns; WARNING: line length of 82 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-17 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-18 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17

Commit Message

Namhyung Kim Oct. 2, 2024, 6:09 p.m. UTC
The new "kmem_cache" iterator will traverse the list of slab caches
and call attached BPF programs for each entry.  It should check the
argument (ctx.s) if it's NULL before using it.

Now the iteration grabs the slab_mutex only while it traverses the
list, and releases the mutex when it runs the BPF program.  The
kmem_cache entry is protected by a refcount during the execution.

It includes the internal "mm/slab.h" header to access kmem_cache,
slab_caches and slab_mutex.  Hope that's OK with the mm folks.

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
I've removed the Acked-by's from Roman and Vlastimil since the code has
changed to not hold the slab_mutex while running the BPF program and to
manage the refcount.  Please review this change again!

 include/linux/btf_ids.h      |   1 +
 kernel/bpf/Makefile          |   1 +
 kernel/bpf/kmem_cache_iter.c | 174 +++++++++++++++++++++++++++++++++++
 3 files changed, 176 insertions(+)
 create mode 100644 kernel/bpf/kmem_cache_iter.c
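
For illustration, a minimal BPF program attaching to the new iterator
could look like the sketch below.  It follows the usual bpf_iter
conventions and the context struct added by this patch; the program
name and output format are made up, and the selftest added later in
this series may differ:

// SPDX-License-Identifier: GPL-2.0
/* Illustrative sketch only -- not part of this patch.  Assumes a
 * vmlinux.h generated from a kernel with this patch applied.
 */
#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char _license[] SEC("license") = "GPL";

SEC("iter/kmem_cache")
int dump_kmem_cache(struct bpf_iter__kmem_cache *ctx)
{
	struct seq_file *seq = ctx->meta->seq;
	struct kmem_cache *s = ctx->s;

	/* ctx->s is PTR_TO_BTF_ID_OR_NULL: it is NULL on the final
	 * invocation after the last entry, so check it before use.
	 */
	if (s == NULL)
		return 0;

	BPF_SEQ_PRINTF(seq, "%s: object_size=%u\n", s->name, s->object_size);
	return 0;
}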

Comments

Vlastimil Babka Oct. 3, 2024, 7:35 a.m. UTC | #1
On 10/2/24 20:09, Namhyung Kim wrote:
> The new "kmem_cache" iterator will traverse the list of slab caches
> and call attached BPF programs for each entry.  The BPF program should
> check whether the argument (ctx.s) is NULL before using it.
> 
> Now the iteration grabs the slab_mutex only while it traverses the
> list, and releases the mutex when it runs the BPF program.  The
> kmem_cache entry is protected by a refcount during the execution.
> 
> It includes the internal "mm/slab.h" header to access kmem_cache,
> slab_caches and slab_mutex.  Hope that's OK with the mm folks.
> 
> Signed-off-by: Namhyung Kim <namhyung@kernel.org>

Acked-by: Vlastimil Babka <vbabka@suse.cz> #mm/slab
Song Liu Oct. 4, 2024, 8:33 p.m. UTC | #2
On Wed, Oct 2, 2024 at 11:09 AM Namhyung Kim <namhyung@kernel.org> wrote:
>
[...]
> +
> +       mutex_lock(&slab_mutex);
> +
> +       /*
> +        * Find an entry at the given position in the slab_caches list instead

Nit: multi-line comment style in BPF code starts the text on the first line: "/* Find ...".

> +        * of keeping a reference (of the last visited entry, if any) out of
> +        * slab_mutex. It might miss something if one is deleted in the middle
> +        * while it releases the lock.  But it should be rare and there's not
> +        * much we can do about it.
> +        */
> +       list_for_each_entry(s, &slab_caches, list) {
> +               if (cnt == *pos) {
> +                       /*
> +                        * Make sure this entry remains in the list by getting
> +                        * a new reference count.  Note that boot_cache entries
> +                        * have a negative refcount, so don't touch them.
> +                        */
> +                       if (s->refcount > 0)
> +                               s->refcount++;
> +                       found = true;
> +                       break;
> +               }
> +               cnt++;
> +       }
> +       mutex_unlock(&slab_mutex);
> +
> +       if (!found)
> +               return NULL;
> +
> +       ++*pos;
> +       return s;
> +}
> +
> +static void kmem_cache_iter_seq_stop(struct seq_file *seq, void *v)
> +{
> +       struct bpf_iter_meta meta;
> +       struct bpf_iter__kmem_cache ctx = {
> +               .meta = &meta,
> +               .s = v,
> +       };
> +       struct bpf_prog *prog;
> +       bool destroy = false;
> +
> +       meta.seq = seq;
> +       prog = bpf_iter_get_info(&meta, true);
> +       if (prog)
> +               bpf_iter_run_prog(prog, &ctx);
> +
> +       if (ctx.s == NULL)
> +               return;
> +
> +       mutex_lock(&slab_mutex);
> +
> +       /* Skip kmem_cache_destroy() for active entries */
> +       if (ctx.s->refcount > 1)
> +               ctx.s->refcount--;
> +       else if (ctx.s->refcount == 1)
> +               destroy = true;
> +
> +       mutex_unlock(&slab_mutex);
> +
> +       if (destroy)
> +               kmem_cache_destroy(ctx.s);
> +}
> +
> +static void *kmem_cache_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
> +{
> +       struct kmem_cache *s = v;
> +       struct kmem_cache *next = NULL;
> +       bool destroy = false;
> +
> +       ++*pos;
> +
> +       mutex_lock(&slab_mutex);
> +
> +       if (list_last_entry(&slab_caches, struct kmem_cache, list) != s) {
> +               next = list_next_entry(s, list);
> +               if (next->refcount > 0)
> +                       next->refcount++;

What if next->refcount <=0? Shall we find next of next?

> +       }
> +
> +       /* Skip kmem_cache_destroy() for active entries */
> +       if (s->refcount > 1)
> +               s->refcount--;
> +       else if (s->refcount == 1)
> +               destroy = true;
> +
> +       mutex_unlock(&slab_mutex);
> +
> +       if (destroy)
> +               kmem_cache_destroy(s);
> +
> +       return next;
> +}
[...]
Song Liu Oct. 4, 2024, 8:45 p.m. UTC | #3
On Wed, Oct 2, 2024 at 11:09 AM Namhyung Kim <namhyung@kernel.org> wrote:
>
[...]
> +
> +static void *kmem_cache_iter_seq_start(struct seq_file *seq, loff_t *pos)
> +{
> +       loff_t cnt = 0;
> +       bool found = false;
> +       struct kmem_cache *s;
> +
> +       mutex_lock(&slab_mutex);
> +
> +       /*
> +        * Find an entry at the given position in the slab_caches list instead
> +        * of keeping a reference (of the last visited entry, if any) out of
> +        * slab_mutex. It might miss something if one is deleted in the middle
> +        * while it releases the lock.  But it should be rare and there's not
> +        * much we can do about it.
> +        */
> +       list_for_each_entry(s, &slab_caches, list) {
> +               if (cnt == *pos) {
> +                       /*
> +                        * Make sure this entry remains in the list by getting
> +                        * a new reference count.  Note that boot_cache entries
> +                        * have a negative refcount, so don't touch them.
> +                        */
> +                       if (s->refcount > 0)
> +                               s->refcount++;
> +                       found = true;
> +                       break;
> +               }
> +               cnt++;
> +       }
> +       mutex_unlock(&slab_mutex);
> +
> +       if (!found)
> +               return NULL;
> +
> +       ++*pos;

This should be

if (*pos == 0)
    ++*pos;

> +       return s;
> +}
> +
> +static void kmem_cache_iter_seq_stop(struct seq_file *seq, void *v)
[...]
Namhyung Kim Oct. 4, 2024, 9:37 p.m. UTC | #4
Hi Song,

On Fri, Oct 04, 2024 at 01:33:19PM -0700, Song Liu wrote:
> On Wed, Oct 2, 2024 at 11:09 AM Namhyung Kim <namhyung@kernel.org> wrote:
> >
> [...]
> > +
> > +       mutex_lock(&slab_mutex);
> > +
> > +       /*
> > +        * Find an entry at the given position in the slab_caches list instead
> 
> Nit: style of multi-line comment: "/* Find ...".

Ok, will update.

> 
> > +        * of keeping a reference (of the last visited entry, if any) out of
> > +        * slab_mutex. It might miss something if one is deleted in the middle
> > +        * while it releases the lock.  But it should be rare and there's not
> > +        * much we can do about it.
> > +        */
> > +       list_for_each_entry(s, &slab_caches, list) {
> > +               if (cnt == *pos) {
> > +                       /*
> > +                        * Make sure this entry remains in the list by getting
> > +                        * a new reference count.  Note that boot_cache entries
> > +                        * have a negative refcount, so don't touch them.
> > +                        */
> > +                       if (s->refcount > 0)
> > +                               s->refcount++;
> > +                       found = true;
> > +                       break;
> > +               }
> > +               cnt++;
> > +       }
> > +       mutex_unlock(&slab_mutex);
> > +
> > +       if (!found)
> > +               return NULL;
> > +
> > +       ++*pos;
> > +       return s;
> > +}
> > +
> > +static void kmem_cache_iter_seq_stop(struct seq_file *seq, void *v)
> > +{
> > +       struct bpf_iter_meta meta;
> > +       struct bpf_iter__kmem_cache ctx = {
> > +               .meta = &meta,
> > +               .s = v,
> > +       };
> > +       struct bpf_prog *prog;
> > +       bool destroy = false;
> > +
> > +       meta.seq = seq;
> > +       prog = bpf_iter_get_info(&meta, true);
> > +       if (prog)
> > +               bpf_iter_run_prog(prog, &ctx);
> > +
> > +       if (ctx.s == NULL)
> > +               return;
> > +
> > +       mutex_lock(&slab_mutex);
> > +
> > +       /* Skip kmem_cache_destroy() for active entries */
> > +       if (ctx.s->refcount > 1)
> > +               ctx.s->refcount--;
> > +       else if (ctx.s->refcount == 1)
> > +               destroy = true;
> > +
> > +       mutex_unlock(&slab_mutex);
> > +
> > +       if (destroy)
> > +               kmem_cache_destroy(ctx.s);
> > +}
> > +
> > +static void *kmem_cache_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
> > +{
> > +       struct kmem_cache *s = v;
> > +       struct kmem_cache *next = NULL;
> > +       bool destroy = false;
> > +
> > +       ++*pos;
> > +
> > +       mutex_lock(&slab_mutex);
> > +
> > +       if (list_last_entry(&slab_caches, struct kmem_cache, list) != s) {
> > +               next = list_next_entry(s, list);
> > +               if (next->refcount > 0)
> > +                       next->refcount++;
> 
> What if next->refcount <=0? Shall we find next of next?

The slab_mutex should protect the refcount == 0 case, so it won't be
seen here.  A negative refcount means it's a boot_cache and we
shouldn't touch its refcount.

Thanks,
Namhyung

> 
> > +       }
> > +
> > +       /* Skip kmem_cache_destroy() for active entries */
> > +       if (s->refcount > 1)
> > +               s->refcount--;
> > +       else if (s->refcount == 1)
> > +               destroy = true;
> > +
> > +       mutex_unlock(&slab_mutex);
> > +
> > +       if (destroy)
> > +               kmem_cache_destroy(s);
> > +
> > +       return next;
> > +}
> [...]
Namhyung Kim Oct. 4, 2024, 9:42 p.m. UTC | #5
On Fri, Oct 04, 2024 at 01:45:09PM -0700, Song Liu wrote:
> On Wed, Oct 2, 2024 at 11:09 AM Namhyung Kim <namhyung@kernel.org> wrote:
> >
> [...]
> > +
> > +static void *kmem_cache_iter_seq_start(struct seq_file *seq, loff_t *pos)
> > +{
> > +       loff_t cnt = 0;
> > +       bool found = false;
> > +       struct kmem_cache *s;
> > +
> > +       mutex_lock(&slab_mutex);
> > +
> > +       /*
> > +        * Find an entry at the given position in the slab_caches list instead
> > +        * of keeping a reference (of the last visited entry, if any) out of
> > +        * slab_mutex. It might miss something if one is deleted in the middle
> > +        * while it releases the lock.  But it should be rare and there's not
> > +        * much we can do about it.
> > +        */
> > +       list_for_each_entry(s, &slab_caches, list) {
> > +               if (cnt == *pos) {
> > +                       /*
> > +                        * Make sure this entry remains in the list by getting
> > +                        * a new reference count.  Note that boot_cache entries
> > +                        * have a negative refcount, so don't touch them.
> > +                        */
> > +                       if (s->refcount > 0)
> > +                               s->refcount++;
> > +                       found = true;
> > +                       break;
> > +               }
> > +               cnt++;
> > +       }
> > +       mutex_unlock(&slab_mutex);
> > +
> > +       if (!found)
> > +               return NULL;
> > +
> > +       ++*pos;
> 
> This should be
> 
> if (*pos == 0)
>     ++*pos;

Oh, I thought there was a check for seq->count after the
seq->op->show() for the ->start() case.  I need to check this logic
again, thanks for pointing this out.

Thanks,
Namhyung

> 
> > +       return s;
> > +}
> > +
> > +static void kmem_cache_iter_seq_stop(struct seq_file *seq, void *v)
> [...]
Song Liu Oct. 4, 2024, 9:46 p.m. UTC | #6
On Fri, Oct 4, 2024 at 2:37 PM Namhyung Kim <namhyung@kernel.org> wrote:
>
> Hi Song,
>
> On Fri, Oct 04, 2024 at 01:33:19PM -0700, Song Liu wrote:
[...]
> > > +
> > > +static void *kmem_cache_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
> > > +{
> > > +       struct kmem_cache *s = v;
> > > +       struct kmem_cache *next = NULL;
> > > +       bool destroy = false;
> > > +
> > > +       ++*pos;
> > > +
> > > +       mutex_lock(&slab_mutex);
> > > +
> > > +       if (list_last_entry(&slab_caches, struct kmem_cache, list) != s) {
> > > +               next = list_next_entry(s, list);
> > > +               if (next->refcount > 0)
> > > +                       next->refcount++;
> >
> > What if next->refcount <=0? Shall we find next of next?
>
> The slab_mutex should protect the refcount == 0 case, so it won't be
> seen here.  A negative refcount means it's a boot_cache and we
> shouldn't touch its refcount.

I see. Thanks for the explanation!

Please add a comment here, and maybe also add

  WARN_ON_ONCE(next->refcount == 0);

Song
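
Applied to kmem_cache_iter_seq_next(), the requested comment and
warning would make the hunk read roughly like this (a sketch of the
suggestion, not necessarily the final code):

	if (list_last_entry(&slab_caches, struct kmem_cache, list) != s) {
		next = list_next_entry(s, list);
		/* An entry's refcount only drops to zero under slab_mutex,
		 * right before it is removed from the list, so it cannot be
		 * seen here while the mutex is held.  A negative refcount
		 * marks a boot_cache, which must not be touched.
		 */
		WARN_ON_ONCE(next->refcount == 0);
		if (next->refcount > 0)
			next->refcount++;
	}
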
Namhyung Kim Oct. 4, 2024, 11:29 p.m. UTC | #7
On Fri, Oct 04, 2024 at 02:46:43PM -0700, Song Liu wrote:
> On Fri, Oct 4, 2024 at 2:37 PM Namhyung Kim <namhyung@kernel.org> wrote:
> >
> > Hi Song,
> >
> > On Fri, Oct 04, 2024 at 01:33:19PM -0700, Song Liu wrote:
> [...]
> > > > +
> > > > +static void *kmem_cache_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
> > > > +{
> > > > +       struct kmem_cache *s = v;
> > > > +       struct kmem_cache *next = NULL;
> > > > +       bool destroy = false;
> > > > +
> > > > +       ++*pos;
> > > > +
> > > > +       mutex_lock(&slab_mutex);
> > > > +
> > > > +       if (list_last_entry(&slab_caches, struct kmem_cache, list) != s) {
> > > > +               next = list_next_entry(s, list);
> > > > +               if (next->refcount > 0)
> > > > +                       next->refcount++;
> > >
> > > What if next->refcount <=0? Shall we find next of next?
> >
> > The slab_mutex should protect the refcount == 0 case, so it won't be
> > seen here.  A negative refcount means it's a boot_cache and we
> > shouldn't touch its refcount.
> 
> I see. Thanks for the explanation!
> 
> Please add a comment here, and maybe also add
> 
>   WARN_ON_ONCE(next->refcount == 0);

Sure, thanks for your review!
Namhyung
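
For completeness, reading the iterator output from userspace follows
the usual bpf_iter flow.  Below is an illustrative sketch using
standard libbpf calls; the skeleton header and program name match the
hypothetical BPF program sketched above:

#include <stdio.h>
#include <unistd.h>
#include <bpf/libbpf.h>
#include "kmem_cache_iter.skel.h" /* hypothetical bpftool-generated skeleton */

int main(void)
{
	struct kmem_cache_iter *skel;
	struct bpf_link *link;
	char buf[4096];
	ssize_t n;
	int iter_fd;

	skel = kmem_cache_iter__open_and_load();
	if (!skel)
		return 1;

	/* Attach the program to the "kmem_cache" iterator target */
	link = bpf_program__attach_iter(skel->progs.dump_kmem_cache, NULL);
	if (!link)
		goto out;

	/* Each read() drives the seq_file machinery, which runs the BPF
	 * program once per kmem_cache (and once more with s == NULL).
	 */
	iter_fd = bpf_iter_create(bpf_link__fd(link));
	if (iter_fd >= 0) {
		while ((n = read(iter_fd, buf, sizeof(buf))) > 0)
			fwrite(buf, 1, n, stdout);
		close(iter_fd);
	}

	bpf_link__destroy(link);
out:
	kmem_cache_iter__destroy(skel);
	return 0;
}
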
Patch

diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h
index c0e3e1426a82f5c4..139bdececdcfaefb 100644
--- a/include/linux/btf_ids.h
+++ b/include/linux/btf_ids.h
@@ -283,5 +283,6 @@  extern u32 btf_tracing_ids[];
 extern u32 bpf_cgroup_btf_id[];
 extern u32 bpf_local_storage_map_btf_id[];
 extern u32 btf_bpf_map_id[];
+extern u32 bpf_kmem_cache_btf_id[];
 
 #endif
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 9b9c151b5c826b31..105328f0b9c04e37 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -52,3 +52,4 @@  obj-$(CONFIG_BPF_PRELOAD) += preload/
 obj-$(CONFIG_BPF_SYSCALL) += relo_core.o
 obj-$(CONFIG_BPF_SYSCALL) += btf_iter.o
 obj-$(CONFIG_BPF_SYSCALL) += btf_relocate.o
+obj-$(CONFIG_BPF_SYSCALL) += kmem_cache_iter.o
diff --git a/kernel/bpf/kmem_cache_iter.c b/kernel/bpf/kmem_cache_iter.c
new file mode 100644
index 0000000000000000..e103d25175126ab0
--- /dev/null
+++ b/kernel/bpf/kmem_cache_iter.c
@@ -0,0 +1,174 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2024 Google */
+#include <linux/bpf.h>
+#include <linux/btf_ids.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/seq_file.h>
+
+#include "../../mm/slab.h" /* kmem_cache, slab_caches and slab_mutex */
+
+struct bpf_iter__kmem_cache {
+	__bpf_md_ptr(struct bpf_iter_meta *, meta);
+	__bpf_md_ptr(struct kmem_cache *, s);
+};
+
+static void *kmem_cache_iter_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	loff_t cnt = 0;
+	bool found = false;
+	struct kmem_cache *s;
+
+	mutex_lock(&slab_mutex);
+
+	/*
+	 * Find an entry at the given position in the slab_caches list instead
+	 * of keeping a reference (of the last visited entry, if any) out of
+	 * slab_mutex. It might miss something if one is deleted in the middle
+	 * while it releases the lock.  But it should be rare and there's not
+	 * much we can do about it.
+	 */
+	list_for_each_entry(s, &slab_caches, list) {
+		if (cnt == *pos) {
+			/*
+			 * Make sure this entry remains in the list by getting
+			 * a new reference count.  Note that boot_cache entries
+			 * have a negative refcount, so don't touch them.
+			 */
+			if (s->refcount > 0)
+				s->refcount++;
+			found = true;
+			break;
+		}
+		cnt++;
+	}
+	mutex_unlock(&slab_mutex);
+
+	if (!found)
+		return NULL;
+
+	++*pos;
+	return s;
+}
+
+static void kmem_cache_iter_seq_stop(struct seq_file *seq, void *v)
+{
+	struct bpf_iter_meta meta;
+	struct bpf_iter__kmem_cache ctx = {
+		.meta = &meta,
+		.s = v,
+	};
+	struct bpf_prog *prog;
+	bool destroy = false;
+
+	meta.seq = seq;
+	prog = bpf_iter_get_info(&meta, true);
+	if (prog)
+		bpf_iter_run_prog(prog, &ctx);
+
+	if (ctx.s == NULL)
+		return;
+
+	mutex_lock(&slab_mutex);
+
+	/* Skip kmem_cache_destroy() for active entries */
+	if (ctx.s->refcount > 1)
+		ctx.s->refcount--;
+	else if (ctx.s->refcount == 1)
+		destroy = true;
+
+	mutex_unlock(&slab_mutex);
+
+	if (destroy)
+		kmem_cache_destroy(ctx.s);
+}
+
+static void *kmem_cache_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct kmem_cache *s = v;
+	struct kmem_cache *next = NULL;
+	bool destroy = false;
+
+	++*pos;
+
+	mutex_lock(&slab_mutex);
+
+	if (list_last_entry(&slab_caches, struct kmem_cache, list) != s) {
+		next = list_next_entry(s, list);
+		if (next->refcount > 0)
+			next->refcount++;
+	}
+
+	/* Skip kmem_cache_destroy() for active entries */
+	if (s->refcount > 1)
+		s->refcount--;
+	else if (s->refcount == 1)
+		destroy = true;
+
+	mutex_unlock(&slab_mutex);
+
+	if (destroy)
+		kmem_cache_destroy(s);
+
+	return next;
+}
+
+static int kmem_cache_iter_seq_show(struct seq_file *seq, void *v)
+{
+	struct bpf_iter_meta meta;
+	struct bpf_iter__kmem_cache ctx = {
+		.meta = &meta,
+		.s = v,
+	};
+	struct bpf_prog *prog;
+	int ret = 0;
+
+	meta.seq = seq;
+	prog = bpf_iter_get_info(&meta, false);
+	if (prog)
+		ret = bpf_iter_run_prog(prog, &ctx);
+
+	return ret;
+}
+
+static const struct seq_operations kmem_cache_iter_seq_ops = {
+	.start  = kmem_cache_iter_seq_start,
+	.next   = kmem_cache_iter_seq_next,
+	.stop   = kmem_cache_iter_seq_stop,
+	.show   = kmem_cache_iter_seq_show,
+};
+
+BTF_ID_LIST_GLOBAL_SINGLE(bpf_kmem_cache_btf_id, struct, kmem_cache)
+
+static const struct bpf_iter_seq_info kmem_cache_iter_seq_info = {
+	.seq_ops		= &kmem_cache_iter_seq_ops,
+};
+
+static void bpf_iter_kmem_cache_show_fdinfo(const struct bpf_iter_aux_info *aux,
+					    struct seq_file *seq)
+{
+	seq_puts(seq, "kmem_cache iter\n");
+}
+
+DEFINE_BPF_ITER_FUNC(kmem_cache, struct bpf_iter_meta *meta,
+		     struct kmem_cache *s)
+
+static struct bpf_iter_reg bpf_kmem_cache_reg_info = {
+	.target			= "kmem_cache",
+	.feature		= BPF_ITER_RESCHED,
+	.show_fdinfo		= bpf_iter_kmem_cache_show_fdinfo,
+	.ctx_arg_info_size	= 1,
+	.ctx_arg_info		= {
+		{ offsetof(struct bpf_iter__kmem_cache, s),
+		  PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
+	},
+	.seq_info		= &kmem_cache_iter_seq_info,
+};
+
+static int __init bpf_kmem_cache_iter_init(void)
+{
+	bpf_kmem_cache_reg_info.ctx_arg_info[0].btf_id = bpf_kmem_cache_btf_id[0];
+	return bpf_iter_reg_target(&bpf_kmem_cache_reg_info);
+}
+
+late_initcall(bpf_kmem_cache_iter_init);