Message ID | 22bededbd502e0df45326a54b3056941de65a101.1617831474.git.dxu@dxuuu.xyz (mailing list archive) |
---|---|
State | RFC |
Delegated to: | BPF |
Series | bpf: Add page cache iterator |
On Wed, Apr 07, 2021 at 02:46:11PM -0700, Daniel Xu wrote: > +struct bpf_iter_seq_pagecache_info { > + struct mnt_namespace *ns; > + struct radix_tree_root superblocks; Why are you adding a new radix tree? Use an XArray instead. > +static struct page *goto_next_page(struct bpf_iter_seq_pagecache_info *info) > +{ > + struct page *page, *ret = NULL; > + unsigned long idx; > + > + rcu_read_lock(); > +retry: > + BUG_ON(!info->cur_inode); > + ret = NULL; > + xa_for_each_start(&info->cur_inode->i_data.i_pages, idx, page, > + info->cur_page_idx) { > + if (!page_cache_get_speculative(page)) > + continue; Why do you feel the need to poke around in i_pages directly? Is there something wrong with find_get_entries()? > +static int __pagecache_seq_show(struct seq_file *seq, struct page *page, > + bool in_stop) > +{ > + struct bpf_iter_meta meta; > + struct bpf_iter__pagecache ctx; > + struct bpf_prog *prog; > + > + meta.seq = seq; > + prog = bpf_iter_get_info(&meta, in_stop); > + if (!prog) > + return 0; > + > + meta.seq = seq; > + ctx.meta = &meta; > + ctx.page = page; > + return bpf_iter_run_prog(prog, &ctx); I'm not really keen on the idea of random BPF programs being able to poke at pages in the page cache like this. From your initial description, it sounded like all you needed was a list of which pages are present. > + INIT_RADIX_TREE(&info->superblocks, GFP_KERNEL); > + > + spin_lock(&info->ns->ns_lock); > + list_for_each_entry(mnt, &info->ns->list, mnt_list) { > + sb = mnt->mnt.mnt_sb; > + > + /* The same mount may be mounted in multiple places */ > + if (radix_tree_lookup(&info->superblocks, (unsigned long)sb)) > + continue; > + > + err = radix_tree_insert(&info->superblocks, > + (unsigned long)sb, (void *)1); > + if (err) > + goto out; > + } > + > + radix_tree_for_each_slot(slot, &info->superblocks, &iter, 0) { > + sb = (struct super_block *)iter.index; > + atomic_inc(&sb->s_active); > + } Uh. What on earth made you think this was a good way to use the radix tree? And, no, the XArray doesn't change that. If you don't understand why this is so bad, call xa_dump() on it after constructing it. I'll wait.
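For reference, a mechanical translation of the posted radix-tree bookkeeping to the XArray API might look like the sketch below. Function names are made up, xa_init() of the array and cleanup are elided, and it deliberately keeps the pointer-as-index scheme that Matthew objects to separately at the end of this mail.

        #include <linux/fs.h>
        #include <linux/gfp.h>
        #include <linux/xarray.h>

        /* Sketch only: remember a superblock once, keyed by its address. */
        static int remember_sb(struct xarray *seen, struct super_block *sb)
        {
                /* The same superblock may back several mounts; store it once */
                if (xa_load(seen, (unsigned long)sb))
                        return 0;
                return xa_err(xa_store(seen, (unsigned long)sb,
                                       xa_mk_value(1), GFP_KERNEL));
        }

        /* Walk every superblock previously recorded in the XArray. */
        static void for_each_seen_sb(struct xarray *seen,
                                     void (*fn)(struct super_block *sb))
        {
                unsigned long index;
                void *entry;

                xa_for_each(seen, index, entry)
                        fn((struct super_block *)index);
        }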
On Wed, Apr 07, 2021 at 02:46:11PM -0700, Daniel Xu wrote: > This commit introduces the bpf page cache iterator. This iterator allows > users to run a bpf prog against each page in the "page cache". > Internally, the "page cache" is extremely tied to VFS superblock + inode > combo. Because of this, iter_pagecache will only examine pages in the > caller's mount namespace. > > Signed-off-by: Daniel Xu <dxu@dxuuu.xyz> > --- > kernel/bpf/Makefile | 2 +- > kernel/bpf/pagecache_iter.c | 293 ++++++++++++++++++++++++++++++++++++ > 2 files changed, 294 insertions(+), 1 deletion(-) > create mode 100644 kernel/bpf/pagecache_iter.c > > diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile > index 7f33098ca63f..3deb6a8d3f75 100644 > --- a/kernel/bpf/Makefile > +++ b/kernel/bpf/Makefile > @@ -6,7 +6,7 @@ cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse > endif > CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy) > > -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o > +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o pagecache_iter.o map_iter.o task_iter.o prog_iter.o > obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o > obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o > obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o > diff --git a/kernel/bpf/pagecache_iter.c b/kernel/bpf/pagecache_iter.c > new file mode 100644 > index 000000000000..8442ab0d4221 > --- /dev/null > +++ b/kernel/bpf/pagecache_iter.c > @@ -0,0 +1,293 @@ > +// SPDX-License-Identifier: GPL-2.0-only > +/* Copyright (c) 2021 Facebook */ > + > +#include <linux/bpf.h> > +#include <linux/btf_ids.h> > +#include <linux/init.h> > +#include <linux/mm_types.h> > +#include <linux/mnt_namespace.h> > +#include <linux/nsproxy.h> > +#include <linux/pagemap.h> > +#include <linux/radix-tree.h> > +#include <linux/seq_file.h> > +#include "../../fs/mount.h" This is a private header on purpose. Outside of fs/ poking around in struct mount or struct mount_namespace should not be done. 
> + > +struct bpf_iter_seq_pagecache_info { > + struct mnt_namespace *ns; > + struct radix_tree_root superblocks; > + struct super_block *cur_sb; > + struct inode *cur_inode; > + unsigned long cur_page_idx; > +}; > + > +static struct super_block *goto_next_sb(struct bpf_iter_seq_pagecache_info *info) > +{ > + struct super_block *sb = NULL; > + struct radix_tree_iter iter; > + void **slot; > + > + radix_tree_for_each_slot(slot, &info->superblocks, &iter, > + ((unsigned long)info->cur_sb + 1)) { > + sb = (struct super_block *)iter.index; > + break; > + } > + > + info->cur_sb = sb; > + info->cur_inode = NULL; > + info->cur_page_idx = 0; > + return sb; > +} > + > +static bool inode_unusual(struct inode *inode) { > + return ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || > + (inode->i_mapping->nrpages == 0)); > +} > + > +static struct inode *goto_next_inode(struct bpf_iter_seq_pagecache_info *info) > +{ > + struct inode *prev_inode = info->cur_inode; > + struct inode *inode; > + > +retry: > + BUG_ON(!info->cur_sb); > + spin_lock(&info->cur_sb->s_inode_list_lock); > + > + if (!info->cur_inode) { > + list_for_each_entry(inode, &info->cur_sb->s_inodes, i_sb_list) { > + spin_lock(&inode->i_lock); > + if (inode_unusual(inode)) { > + spin_unlock(&inode->i_lock); > + continue; > + } > + __iget(inode); > + spin_unlock(&inode->i_lock); > + info->cur_inode = inode; > + break; > + } > + } else { > + inode = info->cur_inode; > + info->cur_inode = NULL; > + list_for_each_entry_continue(inode, &info->cur_sb->s_inodes, > + i_sb_list) { > + spin_lock(&inode->i_lock); > + if (inode_unusual(inode)) { > + spin_unlock(&inode->i_lock); > + continue; > + } > + __iget(inode); > + spin_unlock(&inode->i_lock); > + info->cur_inode = inode; > + break; > + } > + } > + > + /* Seen all inodes in this superblock */ > + if (!info->cur_inode) { > + spin_unlock(&info->cur_sb->s_inode_list_lock); > + if (!goto_next_sb(info)) { > + inode = NULL; > + goto out; > + } > + > + goto retry; > + } > + > + spin_unlock(&info->cur_sb->s_inode_list_lock); > + info->cur_page_idx = 0; > +out: > + iput(prev_inode); > + return info->cur_inode; > +} > + > +static struct page *goto_next_page(struct bpf_iter_seq_pagecache_info *info) > +{ > + struct page *page, *ret = NULL; > + unsigned long idx; > + > + rcu_read_lock(); > +retry: > + BUG_ON(!info->cur_inode); > + ret = NULL; > + xa_for_each_start(&info->cur_inode->i_data.i_pages, idx, page, > + info->cur_page_idx) { > + if (!page_cache_get_speculative(page)) > + continue; > + > + ret = page; > + info->cur_page_idx = idx + 1; > + break; > + } > + > + if (!ret) { > + /* Seen all inodes and superblocks */ > + if (!goto_next_inode(info)) > + goto out; > + > + goto retry; > + } > + > +out: > + rcu_read_unlock(); > + return ret; > +} > + > +static void *pagecache_seq_start(struct seq_file *seq, loff_t *pos) > +{ > + struct bpf_iter_seq_pagecache_info *info = seq->private; > + struct page *page; > + > + if (!info->cur_sb && !goto_next_sb(info)) > + return NULL; > + if (!info->cur_inode && !goto_next_inode(info)) > + return NULL; > + > + page = goto_next_page(info); > + if (!page) > + return NULL; > + > + if (*pos == 0) > + ++*pos; > + > + return page; > + > +} > + > +static void *pagecache_seq_next(struct seq_file *seq, void *v, loff_t *pos) > +{ > + struct bpf_iter_seq_pagecache_info *info = seq->private; > + struct page *page; > + > + ++*pos; > + put_page((struct page *)v); > + page = goto_next_page(info); > + if (!page) > + return NULL; > + > + return page; > +} > + > +struct bpf_iter__pagecache 
{ > + __bpf_md_ptr(struct bpf_iter_meta *, meta); > + __bpf_md_ptr(struct page *, page); > +}; > + > +DEFINE_BPF_ITER_FUNC(pagecache, struct bpf_iter_meta *meta, struct page *page) > + > +static int __pagecache_seq_show(struct seq_file *seq, struct page *page, > + bool in_stop) > +{ > + struct bpf_iter_meta meta; > + struct bpf_iter__pagecache ctx; > + struct bpf_prog *prog; > + > + meta.seq = seq; > + prog = bpf_iter_get_info(&meta, in_stop); > + if (!prog) > + return 0; > + > + meta.seq = seq; > + ctx.meta = &meta; > + ctx.page = page; > + return bpf_iter_run_prog(prog, &ctx); > +} > + > +static int pagecache_seq_show(struct seq_file *seq, void *v) > +{ > + return __pagecache_seq_show(seq, v, false); > +} > + > +static void pagecache_seq_stop(struct seq_file *seq, void *v) > +{ > + (void)__pagecache_seq_show(seq, v, true); > + if (v) > + put_page((struct page *)v); > +} > + > +static int init_seq_pagecache(void *priv_data, struct bpf_iter_aux_info *aux) > +{ > + struct bpf_iter_seq_pagecache_info *info = priv_data; > + struct radix_tree_iter iter; > + struct super_block *sb; > + struct mount *mnt; > + void **slot; > + int err; > + > + info->ns = current->nsproxy->mnt_ns; > + get_mnt_ns(info->ns); > + INIT_RADIX_TREE(&info->superblocks, GFP_KERNEL); > + > + spin_lock(&info->ns->ns_lock); > + list_for_each_entry(mnt, &info->ns->list, mnt_list) { Not just are there helpers for taking ns_lock static inline void lock_ns_list(struct mnt_namespace *ns) static inline void unlock_ns_list(struct mnt_namespace *ns) they are private to fs/namespace.c because it's the only place that should ever walk this list. This seems buggy: why is it ok here to only take ns_lock and not also namespace_sem like mnt_already_visible() and __is_local_mountpoint() or the relevant proc iterators? I might be missing something. > + sb = mnt->mnt.mnt_sb; > + > + /* The same mount may be mounted in multiple places */ > + if (radix_tree_lookup(&info->superblocks, (unsigned long)sb)) > + continue; > + > + err = radix_tree_insert(&info->superblocks, > + (unsigned long)sb, (void *)1); > + if (err) > + goto out; > + } > + > + radix_tree_for_each_slot(slot, &info->superblocks, &iter, 0) { > + sb = (struct super_block *)iter.index; > + atomic_inc(&sb->s_active); It also isn't nice that you mess with sb->s_active directly. Imho, this is poking around in a lot of fs/ specific stuff that other parts of the kernel should not care about or have access to. Christian
On Wed, Apr 07, 2021 at 02:46:11PM -0700, Daniel Xu wrote: > +static void fini_seq_pagecache(void *priv_data) > +{ > + struct bpf_iter_seq_pagecache_info *info = priv_data; > + struct radix_tree_iter iter; > + struct super_block *sb; > + void **slot; > + > + radix_tree_for_each_slot(slot, &info->superblocks, &iter, 0) { > + sb = (struct super_block *)iter.index; > + atomic_dec(&sb->s_active); > + radix_tree_delete(&info->superblocks, iter.index); > + } ... and if in the meanwhile all other contributors to ->s_active have gone away, that will result in...? IOW, NAK. The objects you are playing with have non-trivial lifecycle and poking into the guts of data structures without bothering to understand it is not a good idea. Rule of the thumb: if your code ends up using fields that are otherwise handled by a small part of codebase, the odds are that you need to be bloody careful. In particular, ->ns_lock has 3 users - all in fs/namespace.c. ->list/->mnt_list: all users in fs/namespace.c and fs/pnode.c. ->s_active: majority in fs/super.c, with several outliers in filesystems and safety of those is not trivial. Any time you see that kind of pattern, you are risking to reprise a scene from The Modern Times - the one with Charlie taking a trip through the guts of machinery.
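For comparison, core VFS code that needs to visit live superblocks without touching ->s_active by hand goes through fs/super.c's iterate_supers(), which pins each superblock with ->s_count and holds ->s_umount shared around the callback. A minimal sketch with a hypothetical callback:

        #include <linux/fs.h>

        /* hypothetical callback: runs with sb->s_umount held shared, sb known alive */
        static void note_sb_pagecache(struct super_block *sb, void *arg)
        {
                unsigned long *nr_sbs = arg;

                (*nr_sbs)++;
                /* walking sb->s_inodes here would still need s_inode_list_lock */
        }

        static unsigned long count_live_superblocks(void)
        {
                unsigned long nr_sbs = 0;

                iterate_supers(note_sb_pagecache, &nr_sbs);
                return nr_sbs;
        }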
On Thu, Apr 08, 2021 at 07:14:01AM +0100, Matthew Wilcox wrote: > On Wed, Apr 07, 2021 at 02:46:11PM -0700, Daniel Xu wrote: > > +struct bpf_iter_seq_pagecache_info { > > + struct mnt_namespace *ns; > > + struct radix_tree_root superblocks; > > Why are you adding a new radix tree? Use an XArray instead. Ah right, sorry. Will do. > > +static struct page *goto_next_page(struct bpf_iter_seq_pagecache_info *info) > > +{ > > + struct page *page, *ret = NULL; > > + unsigned long idx; > > + > > + rcu_read_lock(); > > +retry: > > + BUG_ON(!info->cur_inode); > > + ret = NULL; > > + xa_for_each_start(&info->cur_inode->i_data.i_pages, idx, page, > > + info->cur_page_idx) { > > + if (!page_cache_get_speculative(page)) > > + continue; > > Why do you feel the need to poke around in i_pages directly? Is there > something wrong with find_get_entries()? No reason other than I didn't know about the latter. Thanks for the hint. find_get_entries() seems to return a pagevec of entries which would complicate the iteration (a 4th layer of things to iterate over). But I did find find_get_pages_range() which I think can be used to find 1 page at a time. I'll look into it further. > > +static int __pagecache_seq_show(struct seq_file *seq, struct page *page, > > + bool in_stop) > > +{ > > + struct bpf_iter_meta meta; > > + struct bpf_iter__pagecache ctx; > > + struct bpf_prog *prog; > > + > > + meta.seq = seq; > > + prog = bpf_iter_get_info(&meta, in_stop); > > + if (!prog) > > + return 0; > > + > > + meta.seq = seq; > > + ctx.meta = &meta; > > + ctx.page = page; > > + return bpf_iter_run_prog(prog, &ctx); > > I'm not really keen on the idea of random BPF programs being able to poke > at pages in the page cache like this. From your initial description, > it sounded like all you needed was a list of which pages are present. Could you elaborate on what "list of which pages are present" implies? The overall goal with this patch is to detect duplicate content in the page cache. So anything that helps achieve that goal I would (in theory) be OK with. My understanding is the user would need to hash the contents of each page in the page cache. And BPF provides the flexibility such that this work could be reused for currently unanticipated use cases. Furthermore, bpf programs could already look at all the pages in the page cache by hooking into tracepoint:filemap:mm_filemap_add_to_page_cache, albeit at a much slower rate. I figure the downside of adding this page cache iterator is we're explicitly condoning the behavior. > > + INIT_RADIX_TREE(&info->superblocks, GFP_KERNEL); > > + > > + spin_lock(&info->ns->ns_lock); > > + list_for_each_entry(mnt, &info->ns->list, mnt_list) { > > + sb = mnt->mnt.mnt_sb; > > + > > + /* The same mount may be mounted in multiple places */ > > + if (radix_tree_lookup(&info->superblocks, (unsigned long)sb)) > > + continue; > > + > > + err = radix_tree_insert(&info->superblocks, > > + (unsigned long)sb, (void *)1); > > + if (err) > > + goto out; > > + } > > + > > + radix_tree_for_each_slot(slot, &info->superblocks, &iter, 0) { > > + sb = (struct super_block *)iter.index; > > + atomic_inc(&sb->s_active); > > + } > > Uh. What on earth made you think this was a good way to use the radix > tree? And, no, the XArray doesn't change that. The idea behind the radix tree was to deduplicate the mounts by superblock. Because a single filesystem may be mounted in different locations. I didn't find a set data structure I could reuse so I figured radix tree / xarray would work too. 
Happy to take any better ideas too. > If you don't understand why this is so bad, call xa_dump() on it after > constructing it. I'll wait. I did a dump and got the following results: http://ix.io/2VpY . I received a hint that you may be referring to how the xarray/radix tree would be as large as the largest pointer. To my uneducated eye it doesn't look like that's the case in this dump. Could you please clarify? <...> Thanks, Daniel
On Thu, Apr 08, 2021 at 10:19:35AM +0200, Christian Brauner wrote: > On Wed, Apr 07, 2021 at 02:46:11PM -0700, Daniel Xu wrote: > > This commit introduces the bpf page cache iterator. This iterator allows > > users to run a bpf prog against each page in the "page cache". > > Internally, the "page cache" is extremely tied to VFS superblock + inode > > combo. Because of this, iter_pagecache will only examine pages in the > > caller's mount namespace. > > > > Signed-off-by: Daniel Xu <dxu@dxuuu.xyz> > > --- > > kernel/bpf/Makefile | 2 +- > > kernel/bpf/pagecache_iter.c | 293 ++++++++++++++++++++++++++++++++++++ > > 2 files changed, 294 insertions(+), 1 deletion(-) > > create mode 100644 kernel/bpf/pagecache_iter.c <...> > > > > +static int init_seq_pagecache(void *priv_data, struct bpf_iter_aux_info *aux) > > +{ > > + struct bpf_iter_seq_pagecache_info *info = priv_data; > > + struct radix_tree_iter iter; > > + struct super_block *sb; > > + struct mount *mnt; > > + void **slot; > > + int err; > > + > > + info->ns = current->nsproxy->mnt_ns; > > + get_mnt_ns(info->ns); > > + INIT_RADIX_TREE(&info->superblocks, GFP_KERNEL); > > + > > + spin_lock(&info->ns->ns_lock); > > + list_for_each_entry(mnt, &info->ns->list, mnt_list) { > > Not just are there helpers for taking ns_lock > static inline void lock_ns_list(struct mnt_namespace *ns) > static inline void unlock_ns_list(struct mnt_namespace *ns) > they are private to fs/namespace.c because it's the only place that > should ever walk this list. Thanks for the hints. Would it be acceptable to add some helpers to fs/namespace.c to allow walking the list? IIUC the only way to find a list of mounts is by looking at the mount namespace. And walking each mount and looking at each `struct super_block`'s inode's `struct address_space` seemed like the cleanest way to walk the page cache. > This seems buggy: why is it ok here to only take ns_lock and not also > namespace_sem like mnt_already_visible() and __is_local_mountpoint() > or the relevant proc iterators? I might be missing something. Thanks for the hints. I'll take a closer look at the locking. Most probably I didn't get it right. I should have also mentioned in the cover letter that I'm fairly sure I messed up the locking somewhere. > > > + sb = mnt->mnt.mnt_sb; > > + > > + /* The same mount may be mounted in multiple places */ > > + if (radix_tree_lookup(&info->superblocks, (unsigned long)sb)) > > + continue; > > + > > + err = radix_tree_insert(&info->superblocks, > > + (unsigned long)sb, (void *)1); > > + if (err) > > + goto out; > > + } > > + > > + radix_tree_for_each_slot(slot, &info->superblocks, &iter, 0) { > > + sb = (struct super_block *)iter.index; > > + atomic_inc(&sb->s_active); > > It also isn't nice that you mess with sb->s_active directly. > > Imho, this is poking around in a lot of fs/ specific stuff that other > parts of the kernel should not care about or have access to. Re above: do you think it'd be appropriate to add more helpers to fs/ ? <...> Thanks, Daniel
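One existing way to walk a mount tree from outside fs/namespace.c (used by kernel/audit_tree.c) is to clone the tree with collect_mounts() and walk the private copy with iterate_mounts(), rather than adding new accessors for ns->list. Whether that fits this iterator is an open question; the callback below is made up for illustration.

        #include <linux/err.h>
        #include <linux/mount.h>
        #include <linux/path.h>

        /* made-up callback: runs against each mount in the cloned, private tree */
        static int record_sb(struct vfsmount *mnt, void *arg)
        {
                /* deduplicate mnt->mnt_sb into whatever set structure 'arg' wraps */
                return 0;       /* non-zero would stop the walk */
        }

        static int walk_mounts_under(const struct path *root, void *arg)
        {
                struct vfsmount *tree = collect_mounts(root);

                if (IS_ERR(tree))
                        return PTR_ERR(tree);
                iterate_mounts(record_sb, arg, tree);
                drop_collected_mounts(tree);
                return 0;
        }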
On Thu, Apr 08, 2021 at 04:45:37PM +0000, Al Viro wrote: > On Wed, Apr 07, 2021 at 02:46:11PM -0700, Daniel Xu wrote: > > > +static void fini_seq_pagecache(void *priv_data) > > +{ > > + struct bpf_iter_seq_pagecache_info *info = priv_data; > > + struct radix_tree_iter iter; > > + struct super_block *sb; > > + void **slot; > > + > > + radix_tree_for_each_slot(slot, &info->superblocks, &iter, 0) { > > + sb = (struct super_block *)iter.index; > > + atomic_dec(&sb->s_active); > > + radix_tree_delete(&info->superblocks, iter.index); > > + } > > ... and if in the meanwhile all other contributors to ->s_active have > gone away, that will result in...? Ah right, sorry. Nobody will clean up the super_block. > IOW, NAK. The objects you are playing with have non-trivial lifecycle > and poking into the guts of data structures without bothering to > understand it is not a good idea. > > Rule of the thumb: if your code ends up using fields that are otherwise > handled by a small part of codebase, the odds are that you need to be > bloody careful. In particular, ->ns_lock has 3 users - all in > fs/namespace.c. ->list/->mnt_list: all users in fs/namespace.c and > fs/pnode.c. ->s_active: majority in fs/super.c, with several outliers > in filesystems and safety of those is not trivial. > > Any time you see that kind of pattern, you are risking to reprise > a scene from The Modern Times - the one with Charlie taking a trip > through the guts of machinery. I'll take a closer look at the lifetime semantics. Hopefully the overall goal of the patch is ok. Happy to iterate on the implementation details until it's correct. Thanks, Daniel
On Thu, Apr 08, 2021 at 01:49:35PM -0700, Daniel Xu wrote: > Ah right, sorry. Nobody will clean up the super_block. > > > IOW, NAK. The objects you are playing with have non-trivial lifecycle > > and poking into the guts of data structures without bothering to > > understand it is not a good idea. > > > > Rule of the thumb: if your code ends up using fields that are otherwise > > handled by a small part of codebase, the odds are that you need to be > > bloody careful. In particular, ->ns_lock has 3 users - all in > > fs/namespace.c. ->list/->mnt_list: all users in fs/namespace.c and > > fs/pnode.c. ->s_active: majority in fs/super.c, with several outliers > > in filesystems and safety of those is not trivial. > > > > Any time you see that kind of pattern, you are risking to reprise > > a scene from The Modern Times - the one with Charlie taking a trip > > through the guts of machinery. > > I'll take a closer look at the lifetime semantics. > > Hopefully the overall goal of the patch is ok. Happy to iterate on the > implementation details until it's correct. That depends. Note that bumping ->s_active means that umount of that sucker will *NOT* shut it down - that would happen only on the thread doing the final deactivation. What's more, having e.g. a USB stick mounted, doing umount(1), having it complete successfully, pulling the damn thing out and getting writes lost would make for a nasty surprise for users. With your approach it seems to be inevitable. Holding namespace_sem through the entire thing would prevent that, but it's a non-starter for other reasons (starting with "it's a system-wide lock, so that'd be highly antisocial"). Are there any limits on what could be done to the pages, anyway? Because if it's "anything user wanted to do", it's *really* not feasible.
On Thu, Apr 08, 2021 at 12:48:49PM -0700, Daniel Xu wrote: > No reason other than I didn't know about the latter. Thanks for the > hint. find_get_entries() seems to return a pagevec of entries which > would complicate the iteration (a 4th layer of things to iterate over). > > But I did find find_get_pages_range() which I think can be used to find > 1 page at a time. I'll look into it further. Please don't, that's going to be a pagevec too. > > I'm not really keen on the idea of random BPF programs being able to poke > > at pages in the page cache like this. From your initial description, > > it sounded like all you needed was a list of which pages are present. > > Could you elaborate on what "list of which pages are present" implies? > The overall goal with this patch is to detect duplicate content in the > page cache. So anything that helps achieve that goal I would (in theory) > be OK with. > > My understanding is the user would need to hash the contents > of each page in the page cache. And BPF provides the flexibility such > that this work could be reused for currently unanticipated use cases. But if you need the contents, then you'll need to kmap() the pages. I don't see people being keen on exposing kmap() to bpf either. I think you're much better off providing an interface that returns a hash of each page to the BPF program. > Furthermore, bpf programs could already look at all the pages in the > page cache by hooking into tracepoint:filemap:mm_filemap_add_to_page_cache, > albeit at a much slower rate. I figure the downside of adding this > page cache iterator is we're explicitly condoning the behavior. That should never have been exposed. It's only supposed to be for error injection. If people have started actually using it for something, then it's time we delete that tracepoint. > The idea behind the radix tree was to deduplicate the mounts by > superblock. Because a single filesystem may be mounted in different > locations. I didn't find a set data structure I could reuse so I > figured radix tree / xarray would work too. > > Happy to take any better ideas too. > > > If you don't understand why this is so bad, call xa_dump() on it after > > constructing it. I'll wait. > > I did a dump and got the following results: http://ix.io/2VpY . > > I receieved a hint that you may be referring to how the xarray/radix > tree would be as large as the largest pointer. To my uneducated eye it > doesn't look like that's the case in this dump. Could you please > clarify? We get seven nodes per 4kB page. $ grep -c 'value 0' 2VpY 15 $ grep -c node 2VpY 43 so we use 6+1/7 pages in order to store 15 values. That's 387 cache lines, for the amount of data that could fit in two. Liam and I are working on a data structure that would support doing something along these lines in an efficient manner, but it's not ready yet.
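If the kernel side hashed each page itself and only exposed the digest, the iterator-facing piece might look roughly like this. pagecache_page_hash() is a made-up name, xxh64() is the in-tree xxHash helper, and the sketch assumes an order-0 page that the iterator already holds a reference on.

        #include <linux/highmem.h>
        #include <linux/xxhash.h>

        /* made-up helper: hash one pinned, order-0 page cache page */
        static u64 pagecache_page_hash(struct page *page)
        {
                void *kaddr;
                u64 hash;

                kaddr = kmap_local_page(page);  /* kmap_atomic() on pre-5.11 kernels */
                hash = xxh64(kaddr, PAGE_SIZE, 0);
                kunmap_local(kaddr);
                return hash;
        }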
On Wed, Apr 07, 2021 at 02:46:11PM -0700, Daniel Xu wrote: > This commit introduces the bpf page cache iterator. This iterator allows > users to run a bpf prog against each page in the "page cache". > Internally, the "page cache" is extremely tied to VFS superblock + inode > combo. Because of this, iter_pagecache will only examine pages in the > caller's mount namespace. No, it does not just examine pages with in the callers mount namespace, because .... > +static struct inode *goto_next_inode(struct bpf_iter_seq_pagecache_info *info) > +{ > + struct inode *prev_inode = info->cur_inode; > + struct inode *inode; > + > +retry: > + BUG_ON(!info->cur_sb); > + spin_lock(&info->cur_sb->s_inode_list_lock); > + > + if (!info->cur_inode) { > + list_for_each_entry(inode, &info->cur_sb->s_inodes, i_sb_list) { ... this is an "all inodes on the superblock" walk. This will also iterate inodes in other mount namespaces that point to the same superblock. IOWs, if you have different parts of the same filesystem mounted into hundreds of container mount namespaces, this script will not just iterate the local mount name space, it will iterate every inode in every mount namespace. And, of course, if the same files are mounted into multiple containers (think read-only bind mounts using idmapping) then you have zero indication of which container is actually using them, just that there are hundreds of paths to the same inode. And every container will appear to be using exactly the same amount of page cache. IOWs, the stats this generates provide no insight into page cache usage across mount namespaces in many situations, and it leaks information about page cache usage across mount namespace boundaries. And that's before I say "iterating all inodes in a superblock is bad" because it causes lock contention and interrupts normal usage. We avoid s_inodes lists walks as much as we possibly can, and the last thing we want is for userspace to be able to trivially instigate long running walks of the s_inodes list. Remember, we can have hundreds of millions of inodes on this list.... > + spin_lock(&inode->i_lock); > + if (inode_unusual(inode)) { > + spin_unlock(&inode->i_lock); > + continue; > + } > + __iget(inode); > + spin_unlock(&inode->i_lock); This can spin long enough to trigger livelock warnings. Even if it's not held that long, it can cause unexpected long tail latencies in memory reclaim and inode instantiation. Every s_inodes list walk has cond_resched() built into it now.... > + info->ns = current->nsproxy->mnt_ns; > + get_mnt_ns(info->ns); > + INIT_RADIX_TREE(&info->superblocks, GFP_KERNEL); > + > + spin_lock(&info->ns->ns_lock); > + list_for_each_entry(mnt, &info->ns->list, mnt_list) { > + sb = mnt->mnt.mnt_sb; > + > + /* The same mount may be mounted in multiple places */ > + if (radix_tree_lookup(&info->superblocks, (unsigned long)sb)) > + continue; > + > + err = radix_tree_insert(&info->superblocks, > + (unsigned long)sb, (void *)1); And just because nobody has pointed it out yet: radix_tree_insert() will do GFP_KERNEL memory allocations inside the spinlock being held here. ---- You said that you didn't take the "walk the LRUs" approach because walking superblocks "seemed simpler". It's not. Page cache residency and accounting is managed by memcgs, not by mount namespaces. That is, containers usually have a memcg associated with them to control memory usage of the container. 
The page cache used by a container is accounted directly to the memcg, and memory reclaim can find all the file-backed page cache pages associated with a memcg very quickly (via mem_cgroup_lruvec()). This will find pages associated directly with the memcg, so it gives you a fairly accurate picture of the page cache usage within the container. This has none of the issues that arise from "sb != mnt_ns" that walking superblocks and inode lists have, and it doesn't require you to play games with mounts, superblocks and inode references.... Cheers, Dave.
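A rough sketch of the direction Dave describes: reach the per-memcg, per-node lruvecs via mem_cgroup_lruvec() and walk the file LRU lists. Locking (the per-lruvec lru_lock), page isolation, and compound-page handling are all elided, and the lruvec internals differ across kernel versions, so treat this as illustrative pseudocode in C rather than a working iterator.

        #include <linux/memcontrol.h>
        #include <linux/mm.h>
        #include <linux/mmzone.h>
        #include <linux/nodemask.h>

        static unsigned long memcg_file_lru_pages(struct mem_cgroup *memcg)
        {
                unsigned long nr = 0;
                int nid;

                for_each_online_node(nid) {
                        struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
                        struct page *page;

                        /* would need the lruvec's lru_lock held around each walk */
                        list_for_each_entry(page, &lruvec->lists[LRU_INACTIVE_FILE], lru)
                                nr++;
                        list_for_each_entry(page, &lruvec->lists[LRU_ACTIVE_FILE], lru)
                                nr++;
                }
                return nr;
        }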
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 7f33098ca63f..3deb6a8d3f75 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -6,7 +6,7 @@ cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse endif CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy) -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o pagecache_iter.o map_iter.o task_iter.o prog_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o diff --git a/kernel/bpf/pagecache_iter.c b/kernel/bpf/pagecache_iter.c new file mode 100644 index 000000000000..8442ab0d4221 --- /dev/null +++ b/kernel/bpf/pagecache_iter.c @@ -0,0 +1,293 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2021 Facebook */ + +#include <linux/bpf.h> +#include <linux/btf_ids.h> +#include <linux/init.h> +#include <linux/mm_types.h> +#include <linux/mnt_namespace.h> +#include <linux/nsproxy.h> +#include <linux/pagemap.h> +#include <linux/radix-tree.h> +#include <linux/seq_file.h> +#include "../../fs/mount.h" + +struct bpf_iter_seq_pagecache_info { + struct mnt_namespace *ns; + struct radix_tree_root superblocks; + struct super_block *cur_sb; + struct inode *cur_inode; + unsigned long cur_page_idx; +}; + +static struct super_block *goto_next_sb(struct bpf_iter_seq_pagecache_info *info) +{ + struct super_block *sb = NULL; + struct radix_tree_iter iter; + void **slot; + + radix_tree_for_each_slot(slot, &info->superblocks, &iter, + ((unsigned long)info->cur_sb + 1)) { + sb = (struct super_block *)iter.index; + break; + } + + info->cur_sb = sb; + info->cur_inode = NULL; + info->cur_page_idx = 0; + return sb; +} + +static bool inode_unusual(struct inode *inode) { + return ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || + (inode->i_mapping->nrpages == 0)); +} + +static struct inode *goto_next_inode(struct bpf_iter_seq_pagecache_info *info) +{ + struct inode *prev_inode = info->cur_inode; + struct inode *inode; + +retry: + BUG_ON(!info->cur_sb); + spin_lock(&info->cur_sb->s_inode_list_lock); + + if (!info->cur_inode) { + list_for_each_entry(inode, &info->cur_sb->s_inodes, i_sb_list) { + spin_lock(&inode->i_lock); + if (inode_unusual(inode)) { + spin_unlock(&inode->i_lock); + continue; + } + __iget(inode); + spin_unlock(&inode->i_lock); + info->cur_inode = inode; + break; + } + } else { + inode = info->cur_inode; + info->cur_inode = NULL; + list_for_each_entry_continue(inode, &info->cur_sb->s_inodes, + i_sb_list) { + spin_lock(&inode->i_lock); + if (inode_unusual(inode)) { + spin_unlock(&inode->i_lock); + continue; + } + __iget(inode); + spin_unlock(&inode->i_lock); + info->cur_inode = inode; + break; + } + } + + /* Seen all inodes in this superblock */ + if (!info->cur_inode) { + spin_unlock(&info->cur_sb->s_inode_list_lock); + if (!goto_next_sb(info)) { + inode = NULL; + goto out; + } + + goto retry; + } + + spin_unlock(&info->cur_sb->s_inode_list_lock); + info->cur_page_idx = 0; +out: + iput(prev_inode); + return info->cur_inode; +} + +static struct page *goto_next_page(struct bpf_iter_seq_pagecache_info *info) +{ + struct page *page, *ret = NULL; + unsigned long idx; + + rcu_read_lock(); +retry: + BUG_ON(!info->cur_inode); + ret = NULL; + 
xa_for_each_start(&info->cur_inode->i_data.i_pages, idx, page, + info->cur_page_idx) { + if (!page_cache_get_speculative(page)) + continue; + + ret = page; + info->cur_page_idx = idx + 1; + break; + } + + if (!ret) { + /* Seen all inodes and superblocks */ + if (!goto_next_inode(info)) + goto out; + + goto retry; + } + +out: + rcu_read_unlock(); + return ret; +} + +static void *pagecache_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct bpf_iter_seq_pagecache_info *info = seq->private; + struct page *page; + + if (!info->cur_sb && !goto_next_sb(info)) + return NULL; + if (!info->cur_inode && !goto_next_inode(info)) + return NULL; + + page = goto_next_page(info); + if (!page) + return NULL; + + if (*pos == 0) + ++*pos; + + return page; + +} + +static void *pagecache_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct bpf_iter_seq_pagecache_info *info = seq->private; + struct page *page; + + ++*pos; + put_page((struct page *)v); + page = goto_next_page(info); + if (!page) + return NULL; + + return page; +} + +struct bpf_iter__pagecache { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct page *, page); +}; + +DEFINE_BPF_ITER_FUNC(pagecache, struct bpf_iter_meta *meta, struct page *page) + +static int __pagecache_seq_show(struct seq_file *seq, struct page *page, + bool in_stop) +{ + struct bpf_iter_meta meta; + struct bpf_iter__pagecache ctx; + struct bpf_prog *prog; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, in_stop); + if (!prog) + return 0; + + meta.seq = seq; + ctx.meta = &meta; + ctx.page = page; + return bpf_iter_run_prog(prog, &ctx); +} + +static int pagecache_seq_show(struct seq_file *seq, void *v) +{ + return __pagecache_seq_show(seq, v, false); +} + +static void pagecache_seq_stop(struct seq_file *seq, void *v) +{ + (void)__pagecache_seq_show(seq, v, true); + if (v) + put_page((struct page *)v); +} + +static int init_seq_pagecache(void *priv_data, struct bpf_iter_aux_info *aux) +{ + struct bpf_iter_seq_pagecache_info *info = priv_data; + struct radix_tree_iter iter; + struct super_block *sb; + struct mount *mnt; + void **slot; + int err; + + info->ns = current->nsproxy->mnt_ns; + get_mnt_ns(info->ns); + INIT_RADIX_TREE(&info->superblocks, GFP_KERNEL); + + spin_lock(&info->ns->ns_lock); + list_for_each_entry(mnt, &info->ns->list, mnt_list) { + sb = mnt->mnt.mnt_sb; + + /* The same mount may be mounted in multiple places */ + if (radix_tree_lookup(&info->superblocks, (unsigned long)sb)) + continue; + + err = radix_tree_insert(&info->superblocks, + (unsigned long)sb, (void *)1); + if (err) + goto out; + } + + radix_tree_for_each_slot(slot, &info->superblocks, &iter, 0) { + sb = (struct super_block *)iter.index; + atomic_inc(&sb->s_active); + } + + err = 0; +out: + spin_unlock(&info->ns->ns_lock); + return err; +} + +static void fini_seq_pagecache(void *priv_data) +{ + struct bpf_iter_seq_pagecache_info *info = priv_data; + struct radix_tree_iter iter; + struct super_block *sb; + void **slot; + + radix_tree_for_each_slot(slot, &info->superblocks, &iter, 0) { + sb = (struct super_block *)iter.index; + atomic_dec(&sb->s_active); + radix_tree_delete(&info->superblocks, iter.index); + } + + put_mnt_ns(info->ns); +} + +static const struct seq_operations pagecache_seq_ops = { + .start = pagecache_seq_start, + .next = pagecache_seq_next, + .stop = pagecache_seq_stop, + .show = pagecache_seq_show, +}; + +static const struct bpf_iter_seq_info pagecache_seq_info = { + .seq_ops = &pagecache_seq_ops, + .init_seq_private = init_seq_pagecache, + 
.fini_seq_private = fini_seq_pagecache, + .seq_priv_size = sizeof(struct bpf_iter_seq_pagecache_info), +}; + +static struct bpf_iter_reg pagecache_reg_info = { + .target = "pagecache", + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__pagecache, page), + PTR_TO_BTF_ID_OR_NULL }, + }, + .seq_info = &pagecache_seq_info, +}; + +BTF_ID_LIST(btf_page_id) +BTF_ID(struct, page) + +static int __init bpf_pagecache_iter_init(void) +{ + pagecache_reg_info.ctx_arg_info[0].btf_id = *btf_page_id; + return bpf_iter_reg_target(&pagecache_reg_info); +} + +late_initcall(bpf_pagecache_iter_init);
This commit introduces the bpf page cache iterator. This iterator allows users to run a bpf prog against each page in the "page cache". Internally, the "page cache" is extremely tied to VFS superblock + inode combo. Because of this, iter_pagecache will only examine pages in the caller's mount namespace. Signed-off-by: Daniel Xu <dxu@dxuuu.xyz> --- kernel/bpf/Makefile | 2 +- kernel/bpf/pagecache_iter.c | 293 ++++++++++++++++++++++++++++++++++++ 2 files changed, 294 insertions(+), 1 deletion(-) create mode 100644 kernel/bpf/pagecache_iter.c
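Not part of the posted series: a sketch of the BPF program a user might attach to this iterator, following the usual libbpf iter conventions. The "iter/pagecache" section name follows the DEFINE_BPF_ITER_FUNC(pagecache, ...) in the patch, and struct bpf_iter__pagecache would come from a vmlinux.h generated against a kernel carrying this patch.

        // SPDX-License-Identifier: GPL-2.0
        #include "vmlinux.h"
        #include <bpf/bpf_helpers.h>
        #include <bpf/bpf_core_read.h>
        #include <bpf/bpf_tracing.h>

        char LICENSE[] SEC("license") = "GPL";

        SEC("iter/pagecache")
        int dump_pagecache(struct bpf_iter__pagecache *ctx)
        {
                struct seq_file *seq = ctx->meta->seq;
                struct page *page = ctx->page;

                if (!page)
                        return 0;

                /* print the page's flags; a real user would hash contents, etc. */
                BPF_SEQ_PRINTF(seq, "page flags: %lx\n", BPF_CORE_READ(page, flags));
                return 0;
        }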