diff mbox series

[RFC,bpf-next,1/1] bpf: Introduce iter_pagecache

Message ID 22bededbd502e0df45326a54b3056941de65a101.1617831474.git.dxu@dxuuu.xyz (mailing list archive)
State RFC
Delegated to: BPF
Headers show
Series bpf: Add page cache iterator | expand

Checks

Context Check Description
netdev/cover_letter success Link
netdev/fixes_present success Link
netdev/patch_count success Link
netdev/tree_selection success Clearly marked for bpf-next
netdev/subject_prefix success Link
netdev/cc_maintainers warning 8 maintainers not CCed: netdev@vger.kernel.org kpsingh@kernel.org daniel@iogearbox.net andrii@kernel.org kafai@fb.com ast@kernel.org john.fastabend@gmail.com songliubraving@fb.com
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff success Link
netdev/module_param success Was 0 now: 0
netdev/build_32bit fail Errors and warnings before: 1 this patch: 13
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/verify_fixes success Link
netdev/checkpatch fail CHECK: Alignment should match open parenthesis CHECK: Blank lines aren't necessary before a close brace '}' CHECK: spaces preferred around that '|' (ctx:VxV) ERROR: code indent should use tabs where possible ERROR: open brace '{' following function definitions go on the next line WARNING: Avoid crashing the kernel - try using WARN_ON & recovery code rather than BUG() or BUG_ON() WARNING: added, moved or deleted file(s), does MAINTAINERS need updating? WARNING: line length of 81 exceeds 80 columns
netdev/build_allmodconfig_warn fail Errors and warnings before: 1 this patch: 13
netdev/header_inline success Link

Commit Message

Daniel Xu April 7, 2021, 9:46 p.m. UTC
This commit introduces the bpf page cache iterator. This iterator allows
users to run a bpf prog against each page in the "page cache".
Internally, the "page cache" is extremely tied to VFS superblock + inode
combo. Because of this, iter_pagecache will only examine pages in the
caller's mount namespace.

Signed-off-by: Daniel Xu <dxu@dxuuu.xyz>
---
 kernel/bpf/Makefile         |   2 +-
 kernel/bpf/pagecache_iter.c | 293 ++++++++++++++++++++++++++++++++++++
 2 files changed, 294 insertions(+), 1 deletion(-)
 create mode 100644 kernel/bpf/pagecache_iter.c

Comments

Matthew Wilcox April 8, 2021, 6:14 a.m. UTC | #1
On Wed, Apr 07, 2021 at 02:46:11PM -0700, Daniel Xu wrote:
> +struct bpf_iter_seq_pagecache_info {
> +	struct mnt_namespace *ns;
> +	struct radix_tree_root superblocks;

Why are you adding a new radix tree?  Use an XArray instead.

> +static struct page *goto_next_page(struct bpf_iter_seq_pagecache_info *info)
> +{
> +	struct page *page, *ret = NULL;
> +	unsigned long idx;
> +
> +	rcu_read_lock();
> +retry:
> +	BUG_ON(!info->cur_inode);
> +	ret = NULL;
> +	xa_for_each_start(&info->cur_inode->i_data.i_pages, idx, page,
> +			  info->cur_page_idx) {
> +		if (!page_cache_get_speculative(page))
> +			continue;

Why do you feel the need to poke around in i_pages directly?  Is there
something wrong with find_get_entries()?

> +static int __pagecache_seq_show(struct seq_file *seq, struct page *page,
> +				bool in_stop)
> +{
> +	struct bpf_iter_meta meta;
> +	struct bpf_iter__pagecache ctx;
> +	struct bpf_prog *prog;
> +
> +	meta.seq = seq;
> +	prog = bpf_iter_get_info(&meta, in_stop);
> +	if (!prog)
> +		return 0;
> +
> +	meta.seq = seq;
> +	ctx.meta = &meta;
> +	ctx.page = page;
> +	return bpf_iter_run_prog(prog, &ctx);

I'm not really keen on the idea of random BPF programs being able to poke
at pages in the page cache like this.  From your initial description,
it sounded like all you needed was a list of which pages are present.

> +	INIT_RADIX_TREE(&info->superblocks, GFP_KERNEL);
> +
> +	spin_lock(&info->ns->ns_lock);
> +	list_for_each_entry(mnt, &info->ns->list, mnt_list) {
> +		sb = mnt->mnt.mnt_sb;
> +
> +		/* The same mount may be mounted in multiple places */
> +		if (radix_tree_lookup(&info->superblocks, (unsigned long)sb))
> +			continue;
> +
> +		err = radix_tree_insert(&info->superblocks,
> +				        (unsigned long)sb, (void *)1);
> +		if (err)
> +			goto out;
> +	}
> +
> +	radix_tree_for_each_slot(slot, &info->superblocks, &iter, 0) {
> +		sb = (struct super_block *)iter.index;
> +		atomic_inc(&sb->s_active);
> +	}

Uh.  What on earth made you think this was a good way to use the radix
tree?  And, no, the XArray doesn't change that.

If you don't understand why this is so bad, call xa_dump() on it after
constructing it.  I'll wait.
Christian Brauner April 8, 2021, 8:19 a.m. UTC | #2
On Wed, Apr 07, 2021 at 02:46:11PM -0700, Daniel Xu wrote:
> This commit introduces the bpf page cache iterator. This iterator allows
> users to run a bpf prog against each page in the "page cache".
> Internally, the "page cache" is extremely tied to VFS superblock + inode
> combo. Because of this, iter_pagecache will only examine pages in the
> caller's mount namespace.
> 
> Signed-off-by: Daniel Xu <dxu@dxuuu.xyz>
> ---
>  kernel/bpf/Makefile         |   2 +-
>  kernel/bpf/pagecache_iter.c | 293 ++++++++++++++++++++++++++++++++++++
>  2 files changed, 294 insertions(+), 1 deletion(-)
>  create mode 100644 kernel/bpf/pagecache_iter.c
> 
> diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
> index 7f33098ca63f..3deb6a8d3f75 100644
> --- a/kernel/bpf/Makefile
> +++ b/kernel/bpf/Makefile
> @@ -6,7 +6,7 @@ cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse
>  endif
>  CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy)
>  
> -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o
> +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o pagecache_iter.o map_iter.o task_iter.o prog_iter.o
>  obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
>  obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
>  obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
> diff --git a/kernel/bpf/pagecache_iter.c b/kernel/bpf/pagecache_iter.c
> new file mode 100644
> index 000000000000..8442ab0d4221
> --- /dev/null
> +++ b/kernel/bpf/pagecache_iter.c
> @@ -0,0 +1,293 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/* Copyright (c) 2021 Facebook */
> +
> +#include <linux/bpf.h>
> +#include <linux/btf_ids.h>
> +#include <linux/init.h>
> +#include <linux/mm_types.h>
> +#include <linux/mnt_namespace.h>
> +#include <linux/nsproxy.h>
> +#include <linux/pagemap.h>
> +#include <linux/radix-tree.h>
> +#include <linux/seq_file.h>
> +#include "../../fs/mount.h"

This is a private header on purpose. Outside of fs/ poking around in
struct mount or struct mount_namespace should not be done.

> +
> +struct bpf_iter_seq_pagecache_info {
> +	struct mnt_namespace *ns;
> +	struct radix_tree_root superblocks;
> +	struct super_block *cur_sb;
> +	struct inode *cur_inode;
> +	unsigned long cur_page_idx;
> +};
> +
> +static struct super_block *goto_next_sb(struct bpf_iter_seq_pagecache_info *info)
> +{
> +	struct super_block *sb = NULL;
> +	struct radix_tree_iter iter;
> +	void **slot;
> +
> +	radix_tree_for_each_slot(slot, &info->superblocks, &iter,
> +				 ((unsigned long)info->cur_sb + 1)) {
> +		sb = (struct super_block *)iter.index;
> +		break;
> +	}
> +
> +	info->cur_sb = sb;
> +	info->cur_inode = NULL;
> +	info->cur_page_idx = 0;
> +	return sb;
> +}
> +
> +static bool inode_unusual(struct inode *inode) {
> +	return ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
> +		(inode->i_mapping->nrpages == 0));
> +}
> +
> +static struct inode *goto_next_inode(struct bpf_iter_seq_pagecache_info *info)
> +{
> +	struct inode *prev_inode = info->cur_inode;
> +	struct inode *inode;
> +
> +retry:
> +	BUG_ON(!info->cur_sb);
> +	spin_lock(&info->cur_sb->s_inode_list_lock);
> +
> +	if (!info->cur_inode) {
> +		list_for_each_entry(inode, &info->cur_sb->s_inodes, i_sb_list) {
> +			spin_lock(&inode->i_lock);
> +			if (inode_unusual(inode)) {
> +				spin_unlock(&inode->i_lock);
> +				continue;
> +			}
> +			__iget(inode);
> +			spin_unlock(&inode->i_lock);
> +			info->cur_inode = inode;
> +			break;
> +		}
> +	} else {
> +		inode = info->cur_inode;
> +		info->cur_inode = NULL;
> +		list_for_each_entry_continue(inode, &info->cur_sb->s_inodes,
> +					     i_sb_list) {
> +			spin_lock(&inode->i_lock);
> +			if (inode_unusual(inode)) {
> +				spin_unlock(&inode->i_lock);
> +				continue;
> +			}
> +			__iget(inode);
> +			spin_unlock(&inode->i_lock);
> +			info->cur_inode = inode;
> +			break;
> +		}
> +	}
> +
> +	/* Seen all inodes in this superblock */
> +	if (!info->cur_inode) {
> +		spin_unlock(&info->cur_sb->s_inode_list_lock);
> +		if (!goto_next_sb(info)) {
> +			inode = NULL;
> +			goto out;
> +		}
> +
> +		goto retry;
> +	}
> +
> +	spin_unlock(&info->cur_sb->s_inode_list_lock);
> +	info->cur_page_idx = 0;
> +out:
> +	iput(prev_inode);
> +	return info->cur_inode;
> +}
> +
> +static struct page *goto_next_page(struct bpf_iter_seq_pagecache_info *info)
> +{
> +	struct page *page, *ret = NULL;
> +	unsigned long idx;
> +
> +	rcu_read_lock();
> +retry:
> +	BUG_ON(!info->cur_inode);
> +	ret = NULL;
> +	xa_for_each_start(&info->cur_inode->i_data.i_pages, idx, page,
> +			  info->cur_page_idx) {
> +		if (!page_cache_get_speculative(page))
> +			continue;
> +
> +		ret = page;
> +		info->cur_page_idx = idx + 1;
> +		break;
> +	}
> +
> +	if (!ret) {
> +		/* Seen all inodes and superblocks */
> +		if (!goto_next_inode(info))
> +			goto out;
> +
> +		goto retry;
> +	}
> +
> +out:
> +	rcu_read_unlock();
> +	return ret;
> +}
> +
> +static void *pagecache_seq_start(struct seq_file *seq, loff_t *pos)
> +{
> +	struct bpf_iter_seq_pagecache_info *info = seq->private;
> +	struct page *page;
> +
> +	if (!info->cur_sb && !goto_next_sb(info))
> +		return NULL;
> +	if (!info->cur_inode && !goto_next_inode(info))
> +		return NULL;
> +
> +	page = goto_next_page(info);
> +	if (!page)
> +		return NULL;
> +
> +	if (*pos == 0)
> +		++*pos;
> +
> +	return page;
> +
> +}
> +
> +static void *pagecache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
> +{
> +	struct bpf_iter_seq_pagecache_info *info = seq->private;
> +	struct page *page;
> +
> +	++*pos;
> +	put_page((struct page *)v);
> +	page = goto_next_page(info);
> +	if (!page)
> +		return NULL;
> +
> +	return page;
> +}
> +
> +struct bpf_iter__pagecache {
> +	__bpf_md_ptr(struct bpf_iter_meta *, meta);
> +	__bpf_md_ptr(struct page *, page);
> +};
> +
> +DEFINE_BPF_ITER_FUNC(pagecache, struct bpf_iter_meta *meta, struct page *page)
> +
> +static int __pagecache_seq_show(struct seq_file *seq, struct page *page,
> +				bool in_stop)
> +{
> +	struct bpf_iter_meta meta;
> +	struct bpf_iter__pagecache ctx;
> +	struct bpf_prog *prog;
> +
> +	meta.seq = seq;
> +	prog = bpf_iter_get_info(&meta, in_stop);
> +	if (!prog)
> +		return 0;
> +
> +	meta.seq = seq;
> +	ctx.meta = &meta;
> +	ctx.page = page;
> +	return bpf_iter_run_prog(prog, &ctx);
> +}
> +
> +static int pagecache_seq_show(struct seq_file *seq, void *v)
> +{
> +	return __pagecache_seq_show(seq, v, false);
> +}
> +
> +static void pagecache_seq_stop(struct seq_file *seq, void *v)
> +{
> +	(void)__pagecache_seq_show(seq, v, true);
> +	if (v)
> +		put_page((struct page *)v);
> +}
> +
> +static int init_seq_pagecache(void *priv_data, struct bpf_iter_aux_info *aux)
> +{
> +	struct bpf_iter_seq_pagecache_info *info = priv_data;
> +	struct radix_tree_iter iter;
> +	struct super_block *sb;
> +	struct mount *mnt;
> +	void **slot;
> +	int err;
> +
> +	info->ns = current->nsproxy->mnt_ns;
> +	get_mnt_ns(info->ns);
> +	INIT_RADIX_TREE(&info->superblocks, GFP_KERNEL);
> +
> +	spin_lock(&info->ns->ns_lock);
> +	list_for_each_entry(mnt, &info->ns->list, mnt_list) {

Not just are there helpers for taking ns_lock
static inline void lock_ns_list(struct mnt_namespace *ns)
static inline void unlock_ns_list(struct mnt_namespace *ns)
they are private to fs/namespace.c because it's the only place that
should ever walk this list.

This seems buggy: why is it ok here to only take ns_lock and not also
namespace_sem like mnt_already_visible() and __is_local_mountpoint() or
the relevant proc iterators? I might be missing something.

> +		sb = mnt->mnt.mnt_sb;
> +
> +		/* The same mount may be mounted in multiple places */
> +		if (radix_tree_lookup(&info->superblocks, (unsigned long)sb))
> +			continue;
> +
> +		err = radix_tree_insert(&info->superblocks,
> +				        (unsigned long)sb, (void *)1);
> +		if (err)
> +			goto out;
> +	}
> +
> +	radix_tree_for_each_slot(slot, &info->superblocks, &iter, 0) {
> +		sb = (struct super_block *)iter.index;
> +		atomic_inc(&sb->s_active);

It also isn't nice that you mess with sb->s_active directly.

Imho, this is poking around in a lot of fs/ specific stuff that other
parts of the kernel should not care about or have access to.

Christian
Al Viro April 8, 2021, 4:45 p.m. UTC | #3
On Wed, Apr 07, 2021 at 02:46:11PM -0700, Daniel Xu wrote:

> +static void fini_seq_pagecache(void *priv_data)
> +{
> +	struct bpf_iter_seq_pagecache_info *info = priv_data;
> +	struct radix_tree_iter iter;
> +	struct super_block *sb;
> +	void **slot;
> +
> +	radix_tree_for_each_slot(slot, &info->superblocks, &iter, 0) {
> +		sb = (struct super_block *)iter.index;
> +		atomic_dec(&sb->s_active);
> +		radix_tree_delete(&info->superblocks, iter.index);
> +	}

... and if in the meanwhile all other contributors to ->s_active have
gone away, that will result in...?

IOW, NAK.  The objects you are playing with have non-trivial lifecycle
and poking into the guts of data structures without bothering to
understand it is not a good idea.

Rule of the thumb: if your code ends up using fields that are otherwise
handled by a small part of codebase, the odds are that you need to be
bloody careful.  In particular, ->ns_lock has 3 users - all in
fs/namespace.c.  ->list/->mnt_list: all users in fs/namespace.c and
fs/pnode.c.  ->s_active: majority in fs/super.c, with several outliers
in filesystems and safety of those is not trivial.

Any time you see that kind of pattern, you are risking to reprise
a scene from The Modern Times - the one with Charlie taking a trip
through the guts of machinery.
Daniel Xu April 8, 2021, 7:48 p.m. UTC | #4
On Thu, Apr 08, 2021 at 07:14:01AM +0100, Matthew Wilcox wrote:
> On Wed, Apr 07, 2021 at 02:46:11PM -0700, Daniel Xu wrote:
> > +struct bpf_iter_seq_pagecache_info {
> > +	struct mnt_namespace *ns;
> > +	struct radix_tree_root superblocks;
> 
> Why are you adding a new radix tree?  Use an XArray instead.

Ah right, sorry. Will do.

> > +static struct page *goto_next_page(struct bpf_iter_seq_pagecache_info *info)
> > +{
> > +	struct page *page, *ret = NULL;
> > +	unsigned long idx;
> > +
> > +	rcu_read_lock();
> > +retry:
> > +	BUG_ON(!info->cur_inode);
> > +	ret = NULL;
> > +	xa_for_each_start(&info->cur_inode->i_data.i_pages, idx, page,
> > +			  info->cur_page_idx) {
> > +		if (!page_cache_get_speculative(page))
> > +			continue;
> 
> Why do you feel the need to poke around in i_pages directly?  Is there
> something wrong with find_get_entries()?

No reason other than I didn't know about the latter. Thanks for the
hint. find_get_entries() seems to return a pagevec of entries which
would complicate the iteration (a 4th layer of things to iterate over).

But I did find find_get_pages_range() which I think can be used to find
1 page at a time. I'll look into it further.

> > +static int __pagecache_seq_show(struct seq_file *seq, struct page *page,
> > +				bool in_stop)
> > +{
> > +	struct bpf_iter_meta meta;
> > +	struct bpf_iter__pagecache ctx;
> > +	struct bpf_prog *prog;
> > +
> > +	meta.seq = seq;
> > +	prog = bpf_iter_get_info(&meta, in_stop);
> > +	if (!prog)
> > +		return 0;
> > +
> > +	meta.seq = seq;
> > +	ctx.meta = &meta;
> > +	ctx.page = page;
> > +	return bpf_iter_run_prog(prog, &ctx);
> 
> I'm not really keen on the idea of random BPF programs being able to poke
> at pages in the page cache like this.  From your initial description,
> it sounded like all you needed was a list of which pages are present.

Could you elaborate on what "list of which pages are present" implies?
The overall goal with this patch is to detect duplicate content in the
page cache. So anything that helps achieve that goal I would (in theory)
be OK with.

My understanding is the user would need to hash the contents
of each page in the page cache. And BPF provides the flexibility such
that this work could be reused for currently unanticipated use cases.

Furthermore, bpf programs could already look at all the pages in the
page cache by hooking into tracepoint:filemap:mm_filemap_add_to_page_cache,
albeit at a much slower rate. I figure the downside of adding this
page cache iterator is we're explicitly condoning the behavior.

> > +	INIT_RADIX_TREE(&info->superblocks, GFP_KERNEL);
> > +
> > +	spin_lock(&info->ns->ns_lock);
> > +	list_for_each_entry(mnt, &info->ns->list, mnt_list) {
> > +		sb = mnt->mnt.mnt_sb;
> > +
> > +		/* The same mount may be mounted in multiple places */
> > +		if (radix_tree_lookup(&info->superblocks, (unsigned long)sb))
> > +			continue;
> > +
> > +		err = radix_tree_insert(&info->superblocks,
> > +				        (unsigned long)sb, (void *)1);
> > +		if (err)
> > +			goto out;
> > +	}
> > +
> > +	radix_tree_for_each_slot(slot, &info->superblocks, &iter, 0) {
> > +		sb = (struct super_block *)iter.index;
> > +		atomic_inc(&sb->s_active);
> > +	}
> 
> Uh.  What on earth made you think this was a good way to use the radix
> tree?  And, no, the XArray doesn't change that.

The idea behind the radix tree was to deduplicate the mounts by
superblock. Because a single filesystem may be mounted in different
locations. I didn't find a set data structure I could reuse so I
figured radix tree / xarray would work too.

Happy to take any better ideas too.

> If you don't understand why this is so bad, call xa_dump() on it after
> constructing it.  I'll wait.

I did a dump and got the following results: http://ix.io/2VpY .

I received a hint that you may be referring to how the xarray/radix
tree would be as large as the largest pointer. To my uneducated eye it
doesn't look like that's the case in this dump. Could you please
clarify?

<...>

Thanks,
Daniel
Daniel Xu April 8, 2021, 8:44 p.m. UTC | #5
On Thu, Apr 08, 2021 at 10:19:35AM +0200, Christian Brauner wrote:
> On Wed, Apr 07, 2021 at 02:46:11PM -0700, Daniel Xu wrote:
> > This commit introduces the bpf page cache iterator. This iterator allows
> > users to run a bpf prog against each page in the "page cache".
> > Internally, the "page cache" is extremely tied to VFS superblock + inode
> > combo. Because of this, iter_pagecache will only examine pages in the
> > caller's mount namespace.
> > 
> > Signed-off-by: Daniel Xu <dxu@dxuuu.xyz>
> > ---
> >  kernel/bpf/Makefile         |   2 +-
> >  kernel/bpf/pagecache_iter.c | 293 ++++++++++++++++++++++++++++++++++++
> >  2 files changed, 294 insertions(+), 1 deletion(-)
> >  create mode 100644 kernel/bpf/pagecache_iter.c

<...>

> > 
> > +static int init_seq_pagecache(void *priv_data, struct bpf_iter_aux_info *aux)
> > +{
> > +	struct bpf_iter_seq_pagecache_info *info = priv_data;
> > +	struct radix_tree_iter iter;
> > +	struct super_block *sb;
> > +	struct mount *mnt;
> > +	void **slot;
> > +	int err;
> > +
> > +	info->ns = current->nsproxy->mnt_ns;
> > +	get_mnt_ns(info->ns);
> > +	INIT_RADIX_TREE(&info->superblocks, GFP_KERNEL);
> > +
> > +	spin_lock(&info->ns->ns_lock);
> > +	list_for_each_entry(mnt, &info->ns->list, mnt_list) {
> 
> Not just are there helpers for taking ns_lock
> static inline void lock_ns_list(struct mnt_namespace *ns)
> static inline void unlock_ns_list(struct mnt_namespace *ns)
> they are private to fs/namespace.c because it's the only place that
> should ever walk this list.

Thanks for the hints. Would it be acceptable to add some helpers to
fs/namespace.c to allow walking the list?

IIUC the only way to find a list of mounts is by looking at the mount
namespace. And walking each mount and looking at each `struct
super_block`'s inode's `struct address_space` seemed like the cleanest
way to walk the page cache.

> This seems buggy: why is it ok here to only take ns_lock and not also
> namespace_sem like mnt_already_visible() and __is_local_mountpoint()
> or the relevant proc iterators? I might be missing something.

Thanks for the hints. I'll take a closer look at the locking. Most
probably I didn't get it right.

I should have also mentioned in the cover letter that I'm fairly sure I
messed up the locking somewhere.

> 
> > +		sb = mnt->mnt.mnt_sb;
> > +
> > +		/* The same mount may be mounted in multiple places */
> > +		if (radix_tree_lookup(&info->superblocks, (unsigned long)sb))
> > +			continue;
> > +
> > +		err = radix_tree_insert(&info->superblocks,
> > +				        (unsigned long)sb, (void *)1);
> > +		if (err)
> > +			goto out;
> > +	}
> > +
> > +	radix_tree_for_each_slot(slot, &info->superblocks, &iter, 0) {
> > +		sb = (struct super_block *)iter.index;
> > +		atomic_inc(&sb->s_active);
> 
> It also isn't nice that you mess with sb->s_active directly.
> 
> Imho, this is poking around in a lot of fs/ specific stuff that other
> parts of the kernel should not care about or have access to.

Re above: do you think it'd be appropriate to add more helpers to fs/ ?

<...>

Thanks,
Daniel
Daniel Xu April 8, 2021, 8:49 p.m. UTC | #6
On Thu, Apr 08, 2021 at 04:45:37PM +0000, Al Viro wrote:
> On Wed, Apr 07, 2021 at 02:46:11PM -0700, Daniel Xu wrote:
> 
> > +static void fini_seq_pagecache(void *priv_data)
> > +{
> > +	struct bpf_iter_seq_pagecache_info *info = priv_data;
> > +	struct radix_tree_iter iter;
> > +	struct super_block *sb;
> > +	void **slot;
> > +
> > +	radix_tree_for_each_slot(slot, &info->superblocks, &iter, 0) {
> > +		sb = (struct super_block *)iter.index;
> > +		atomic_dec(&sb->s_active);
> > +		radix_tree_delete(&info->superblocks, iter.index);
> > +	}
> 
> ... and if in the meanwhile all other contributors to ->s_active have
> gone away, that will result in...?

Ah right, sorry. Nobody will clean up the super_block.

> IOW, NAK.  The objects you are playing with have non-trivial lifecycle
> and poking into the guts of data structures without bothering to
> understand it is not a good idea.
> 
> Rule of the thumb: if your code ends up using fields that are otherwise
> handled by a small part of codebase, the odds are that you need to be
> bloody careful.  In particular, ->ns_lock has 3 users - all in
> fs/namespace.c.  ->list/->mnt_list: all users in fs/namespace.c and
> fs/pnode.c.  ->s_active: majority in fs/super.c, with several outliers
> in filesystems and safety of those is not trivial.
> 
> Any time you see that kind of pattern, you are risking to reprise
> a scene from The Modern Times - the one with Charlie taking a trip
> through the guts of machinery.

I'll take a closer look at the lifetime semantics.

Hopefully the overall goal of the patch is ok. Happy to iterate on the
implementation details until it's correct.

Thanks,
Daniel
Al Viro April 8, 2021, 9:04 p.m. UTC | #7
On Thu, Apr 08, 2021 at 01:49:35PM -0700, Daniel Xu wrote:

> Ah right, sorry. Nobody will clean up the super_block.
> 
> > IOW, NAK.  The objects you are playing with have non-trivial lifecycle
> > and poking into the guts of data structures without bothering to
> > understand it is not a good idea.
> > 
> > Rule of the thumb: if your code ends up using fields that are otherwise
> > handled by a small part of codebase, the odds are that you need to be
> > bloody careful.  In particular, ->ns_lock has 3 users - all in
> > fs/namespace.c.  ->list/->mnt_list: all users in fs/namespace.c and
> > fs/pnode.c.  ->s_active: majority in fs/super.c, with several outliers
> > in filesystems and safety of those is not trivial.
> > 
> > Any time you see that kind of pattern, you are risking to reprise
> > a scene from The Modern Times - the one with Charlie taking a trip
> > through the guts of machinery.
> 
> I'll take a closer look at the lifetime semantics.
> 
> Hopefully the overall goal of the patch is ok. Happy to iterate on the
> implementation details until it's correct.

That depends.  Note that bumping ->s_active means that umount of that
sucker will *NOT* shut it down - that would happen only on the thread
doing the final deactivation.  What's more, having e.g. a USB stick
mounted, doing umount(1), having it complete successfully, pulling the
damn thing out and getting writes lost would make for a nasty surprise
for users.

With your approach it seems to be inevitable.  Holding namespace_sem
through the entire thing would prevent that, but's it's a non-starter
for other reasons (starting with "it's a system-wide lock, so that'd
be highly antisocial").  Are there any limits on what could be done
to the pages, anyway?  Because if it's "anything user wanted to do",
it's *really* not feasible.
Matthew Wilcox April 8, 2021, 9:29 p.m. UTC | #8
On Thu, Apr 08, 2021 at 12:48:49PM -0700, Daniel Xu wrote:
> No reason other than I didn't know about the latter. Thanks for the
> hint. find_get_entries() seems to return a pagevec of entries which
> would complicate the iteration (a 4th layer of things to iterate over).
> 
> But I did find find_get_pages_range() which I think can be used to find
> 1 page at a time. I'll look into it further.

Please don't, that's going to be a pagevec too.

> > I'm not really keen on the idea of random BPF programs being able to poke
> > at pages in the page cache like this.  From your initial description,
> > it sounded like all you needed was a list of which pages are present.
> 
> Could you elaborate on what "list of which pages are present" implies?
> The overall goal with this patch is to detect duplicate content in the
> page cache. So anything that helps achieve that goal I would (in theory)
> be OK with.
> 
> My understanding is the user would need to hash the contents
> of each page in the page cache. And BPF provides the flexibility such
> that this work could be reused for currently unanticipated use cases.

But if you need the contents, then you'll need to kmap() the pages.
I don't see people being keen on exposing kmap() to bpf either.  I think
you're much better off providing an interface that returns a hash of
each page to the BPF program.

> Furthermore, bpf programs could already look at all the pages in the
> page cache by hooking into tracepoint:filemap:mm_filemap_add_to_page_cache,
> albeit at a much slower rate. I figure the downside of adding this
> page cache iterator is we're explicitly condoning the behavior.

That should never have been exposed.  It's only supposed to be for error
injection.  If people have started actually using it for something,
then it's time we delete that tracepoint.

> The idea behind the radix tree was to deduplicate the mounts by
> superblock. Because a single filesystem may be mounted in different
> locations. I didn't find a set data structure I could reuse so I
> figured radix tree / xarray would work too.
> 
> Happy to take any better ideas too.
> 
> > If you don't understand why this is so bad, call xa_dump() on it after
> > constructing it.  I'll wait.
> 
> I did a dump and got the following results: http://ix.io/2VpY .
> 
> I received a hint that you may be referring to how the xarray/radix
> tree would be as large as the largest pointer. To my uneducated eye it
> doesn't look like that's the case in this dump. Could you please
> clarify?

We get seven nodes per 4kB page.

$ grep -c 'value 0' 2VpY 
15
$ grep -c node 2VpY 
43

so we use 6+1/7 pages in order to store 15 values.  That's 387 cache
lines, for the amount of data that could fit in two.

Liam and I are working on a data structure that would support doing
something along these lines in an efficient manner, but it's not
ready yet.
Dave Chinner April 8, 2021, 10:11 p.m. UTC | #9
On Wed, Apr 07, 2021 at 02:46:11PM -0700, Daniel Xu wrote:
> This commit introduces the bpf page cache iterator. This iterator allows
> users to run a bpf prog against each page in the "page cache".
> Internally, the "page cache" is extremely tied to VFS superblock + inode
> combo. Because of this, iter_pagecache will only examine pages in the
> caller's mount namespace.

No, it does not just examine pages within the caller's mount
namespace, because ....

> +static struct inode *goto_next_inode(struct bpf_iter_seq_pagecache_info *info)
> +{
> +	struct inode *prev_inode = info->cur_inode;
> +	struct inode *inode;
> +
> +retry:
> +	BUG_ON(!info->cur_sb);
> +	spin_lock(&info->cur_sb->s_inode_list_lock);
> +
> +	if (!info->cur_inode) {
> +		list_for_each_entry(inode, &info->cur_sb->s_inodes, i_sb_list) {

... this is an "all inodes on the superblock" walk.  This will also
iterate inodes in other mount namespaces that point to the same
superblock.

IOWs, if you have different parts of the same filesystem mounted
into hundreds of container mount namespaces, this script will not
just iterate the local mount name space, it will iterate every inode
in every mount namespace.

And, of course, if the same files are mounted into multiple
containers (think read-only bind mounts using idmapping) then you
have zero indication of which container is actually using them, just
that there are hundreds of paths to the same inode. And every
container will appear to be using exactly the same amount of page cache.

IOWs, the stats this generates provide no insight into page cache
usage across mount namespaces in many situations, and it leaks
information about page cache usage across mount namespace
boundaries.

And that's before I say "iterating all inodes in a superblock is
bad" because it causes lock contention and interrupts normal usage.
We avoid s_inodes lists walks as much as we possibly can, and the
last thing we want is for userspace to be able to trivially
instigate long running walks of the s_inodes list. Remember, we can
have hundreds of millions of inodes on this list....

> +			spin_lock(&inode->i_lock);
> +			if (inode_unusual(inode)) {
> +				spin_unlock(&inode->i_lock);
> +				continue;
> +			}
> +			__iget(inode);
> +			spin_unlock(&inode->i_lock);

This can spin long enough to trigger livelock warnings. Even if it's
not held that long, it can cause unexpected long tail latencies in
memory reclaim and inode instantiation. Every s_inodes list walk has
cond_resched() built into it now....

> +	info->ns = current->nsproxy->mnt_ns;
> +	get_mnt_ns(info->ns);
> +	INIT_RADIX_TREE(&info->superblocks, GFP_KERNEL);
> +
> +	spin_lock(&info->ns->ns_lock);
> +	list_for_each_entry(mnt, &info->ns->list, mnt_list) {
> +		sb = mnt->mnt.mnt_sb;
> +
> +		/* The same mount may be mounted in multiple places */
> +		if (radix_tree_lookup(&info->superblocks, (unsigned long)sb))
> +			continue;
> +
> +		err = radix_tree_insert(&info->superblocks,
> +				        (unsigned long)sb, (void *)1);

And just because nobody has pointed it out yet: radix_tree_insert()
will do GFP_KERNEL memory allocations inside the spinlock being held
here.

----

You said that you didn't take the "walk the LRUs" approach because
walking superblocks "seemed simpler". It's not. Page cache residency
and accounting is managed by memcgs, not by mount namespaces.

That is, containers usually have a memcg associated with them to control
memory usage of the container. The page cache used by a container is
accounted directly to the memcg, and memory reclaim can find all the
file-backed page cache pages associated with a memcg very quickly
(via mem_cgroup_lruvec()).  This will find pages associated directly
with the memcg, so it gives you a fairly accurate picture of the
page cache usage within the container.

This has none of the issues that arise from "sb != mnt_ns" that
walking superblocks and inode lists have, and it doesn't require you
to play games with mounts, superblocks and inode references....

Cheers,

Dave.
diff mbox series

Patch

diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 7f33098ca63f..3deb6a8d3f75 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -6,7 +6,7 @@  cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse
 endif
 CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy)
 
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o pagecache_iter.o map_iter.o task_iter.o prog_iter.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
 obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
 obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
diff --git a/kernel/bpf/pagecache_iter.c b/kernel/bpf/pagecache_iter.c
new file mode 100644
index 000000000000..8442ab0d4221
--- /dev/null
+++ b/kernel/bpf/pagecache_iter.c
@@ -0,0 +1,293 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2021 Facebook */
+
+#include <linux/bpf.h>
+#include <linux/btf_ids.h>
+#include <linux/init.h>
+#include <linux/mm_types.h>
+#include <linux/mnt_namespace.h>
+#include <linux/nsproxy.h>
+#include <linux/pagemap.h>
+#include <linux/radix-tree.h>
+#include <linux/seq_file.h>
+#include "../../fs/mount.h"
+
+/*
+ * Per-iterator state, kept in the seq_file private data across reads.
+ * The cur_* fields record the resume position of the three-level
+ * (superblock, inode, page-index) walk.
+ */
+struct bpf_iter_seq_pagecache_info {
+	struct mnt_namespace *ns;		/* caller's mount ns, pinned via get_mnt_ns() */
+	struct radix_tree_root superblocks;	/* sb set; the sb pointer value is the key */
+	struct super_block *cur_sb;		/* sb currently being walked, or NULL */
+	struct inode *cur_inode;		/* referenced inode being walked, or NULL */
+	unsigned long cur_page_idx;		/* next page index within cur_inode */
+};
+
+/*
+ * Advance to the next superblock in the snapshot and reset the inode/page
+ * cursors.  The radix tree index IS the sb pointer value, so starting the
+ * walk at cur_sb + 1 yields the next sb in pointer order.  Returns NULL
+ * when all superblocks have been visited.
+ */
+static struct super_block *goto_next_sb(struct bpf_iter_seq_pagecache_info *info)
+{
+	struct super_block *sb = NULL;
+	struct radix_tree_iter iter;
+	void **slot;
+
+	/* Only the first slot at-or-after cur_sb + 1 is wanted */
+	radix_tree_for_each_slot(slot, &info->superblocks, &iter,
+				 ((unsigned long)info->cur_sb + 1)) {
+		sb = (struct super_block *)iter.index;
+		break;
+	}
+
+	info->cur_sb = sb;
+	info->cur_inode = NULL;
+	info->cur_page_idx = 0;
+	return sb;
+}
+
+/*
+ * Skip inodes that are being freed or are not yet fully instantiated, and
+ * inodes whose mapping holds no pages (nothing for the iterator to show).
+ */
+static bool inode_unusual(struct inode *inode)
+{
+	return ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
+		(inode->i_mapping->nrpages == 0));
+}
+
+/*
+ * Advance cur_inode to the next usable inode on cur_sb's s_inodes list,
+ * taking a reference (__iget) before s_inode_list_lock is dropped so the
+ * inode cannot be freed while the caller walks its pages.  Falls through
+ * to the next superblock when the current list is exhausted.  The ref on
+ * the previously returned inode is dropped on every exit path.
+ *
+ * NOTE(review): s_inode_list_lock is held across the whole list scan; on
+ * large inode lists this can stall reclaim/inode instantiation — consider
+ * batching with periodic lock drops.
+ */
+static struct inode *goto_next_inode(struct bpf_iter_seq_pagecache_info *info)
+{
+	struct inode *prev_inode = info->cur_inode;
+	struct inode *inode;
+
+retry:
+	/* The caller must have positioned cur_sb first */
+	if (WARN_ON_ONCE(!info->cur_sb)) {
+		info->cur_inode = NULL;
+		goto out;
+	}
+	spin_lock(&info->cur_sb->s_inode_list_lock);
+
+	/* Resume after cur_inode if set, otherwise start from the list head */
+	inode = list_prepare_entry(info->cur_inode, &info->cur_sb->s_inodes,
+				   i_sb_list);
+	info->cur_inode = NULL;
+	list_for_each_entry_continue(inode, &info->cur_sb->s_inodes,
+				     i_sb_list) {
+		spin_lock(&inode->i_lock);
+		if (inode_unusual(inode)) {
+			spin_unlock(&inode->i_lock);
+			continue;
+		}
+		/* Pin the inode before releasing the list lock */
+		__iget(inode);
+		spin_unlock(&inode->i_lock);
+		info->cur_inode = inode;
+		break;
+	}
+
+	/* Seen all inodes in this superblock; move to the next one */
+	if (!info->cur_inode) {
+		spin_unlock(&info->cur_sb->s_inode_list_lock);
+		if (!goto_next_sb(info)) {
+			inode = NULL;
+			goto out;
+		}
+
+		goto retry;
+	}
+
+	spin_unlock(&info->cur_sb->s_inode_list_lock);
+	info->cur_page_idx = 0;
+out:
+	/* Drop the ref taken when prev_inode was returned (iput(NULL) is a no-op) */
+	iput(prev_inode);
+	return info->cur_inode;
+}
+
+/*
+ * Return the next page in cur_inode's mapping at or after cur_page_idx,
+ * holding a speculative page reference the caller must put.  When the
+ * mapping is exhausted, advances to the next inode/superblock.  Returns
+ * NULL, with no reference held, once the whole walk is complete.
+ *
+ * The inode advance must happen OUTSIDE the RCU read-side section:
+ * goto_next_inode() calls iput(), which may sleep (the original code
+ * called it under rcu_read_lock()).
+ */
+static struct page *goto_next_page(struct bpf_iter_seq_pagecache_info *info)
+{
+	struct page *page, *ret = NULL;
+	unsigned long idx;
+
+retry:
+	if (WARN_ON_ONCE(!info->cur_inode))
+		return NULL;
+
+	ret = NULL;
+	rcu_read_lock();
+	xa_for_each_start(&info->cur_inode->i_data.i_pages, idx, page,
+			  info->cur_page_idx) {
+		/* Page may be mid-free; skip it if the refcount can't be taken */
+		if (!page_cache_get_speculative(page))
+			continue;
+
+		ret = page;
+		info->cur_page_idx = idx + 1;
+		break;
+	}
+	rcu_read_unlock();
+
+	if (!ret) {
+		/* Mapping exhausted; NULL here means all sbs/inodes are done */
+		if (!goto_next_inode(info))
+			return NULL;
+
+		goto retry;
+	}
+
+	return ret;
+}
+
+/*
+ * seq_file->start: prime the superblock and inode cursors on the first
+ * call and return the next page (with a reference held), or NULL at EOF.
+ */
+static void *pagecache_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct bpf_iter_seq_pagecache_info *info = seq->private;
+	struct page *page;
+
+	if (!info->cur_sb && !goto_next_sb(info))
+		return NULL;
+	if (!info->cur_inode && !goto_next_inode(info))
+		return NULL;
+
+	page = goto_next_page(info);
+	if (!page)
+		return NULL;
+
+	/* seq_file convention: a position of 0 means "show this element" */
+	if (*pos == 0)
+		++*pos;
+
+	return page;
+}
+
+/*
+ * seq_file->next: release the page returned last time, then hand back the
+ * following one (or NULL when the walk is finished).
+ */
+static void *pagecache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct bpf_iter_seq_pagecache_info *info = seq->private;
+
+	++*pos;
+	put_page((struct page *)v);
+	return goto_next_page(info);
+}
+
+/* Context passed to the bpf program for every element of the iteration */
+struct bpf_iter__pagecache {
+	__bpf_md_ptr(struct bpf_iter_meta *, meta);
+	__bpf_md_ptr(struct page *, page);	/* NULL on the final (stop) callback */
+};
+
+DEFINE_BPF_ITER_FUNC(pagecache, struct bpf_iter_meta *meta, struct page *page)
+
+/*
+ * Run the attached bpf prog against one page.  @in_stop marks the final
+ * invocation (page == NULL), letting the prog flush any pending state.
+ * Returns 0 when no prog should run for this element.
+ */
+static int __pagecache_seq_show(struct seq_file *seq, struct page *page,
+				bool in_stop)
+{
+	struct bpf_iter_meta meta;
+	struct bpf_iter__pagecache ctx;
+	struct bpf_prog *prog;
+
+	/* meta.seq was assigned twice in the original; once is enough */
+	meta.seq = seq;
+	prog = bpf_iter_get_info(&meta, in_stop);
+	if (!prog)
+		return 0;
+
+	ctx.meta = &meta;
+	ctx.page = page;
+	return bpf_iter_run_prog(prog, &ctx);
+}
+
+/* seq_file->show: run the prog for a regular (non-stop) element */
+static int pagecache_seq_show(struct seq_file *seq, void *v)
+{
+	return __pagecache_seq_show(seq, v, false);
+}
+
+/*
+ * seq_file->stop: give the prog its final (in_stop) callback, then drop
+ * the reference on the page still held from start/next, if any.
+ */
+static void pagecache_seq_stop(struct seq_file *seq, void *v)
+{
+	(void)__pagecache_seq_show(seq, v, true);
+	if (v)
+		put_page((struct page *)v);
+}
+
+/*
+ * Snapshot the superblocks visible in the caller's mount namespace into
+ * info->superblocks and pin each one via s_active so none can be torn
+ * down mid-iteration.  Undone by fini_seq_pagecache().
+ *
+ * Fixes vs. the original: the tree is initialized with GFP_ATOMIC because
+ * radix_tree_insert() otherwise allocates with GFP_KERNEL while ns_lock
+ * (a spinlock) is held, and the error path now unwinds the tree entries
+ * and the mnt_ns reference instead of leaking them.
+ */
+static int init_seq_pagecache(void *priv_data, struct bpf_iter_aux_info *aux)
+{
+	struct bpf_iter_seq_pagecache_info *info = priv_data;
+	struct radix_tree_iter iter;
+	struct super_block *sb;
+	struct mount *mnt;
+	void **slot;
+	int err;
+
+	info->ns = current->nsproxy->mnt_ns;
+	get_mnt_ns(info->ns);
+	/* GFP_ATOMIC: insertions below run under the ns_lock spinlock */
+	INIT_RADIX_TREE(&info->superblocks, GFP_ATOMIC);
+
+	spin_lock(&info->ns->ns_lock);
+	list_for_each_entry(mnt, &info->ns->list, mnt_list) {
+		sb = mnt->mnt.mnt_sb;
+
+		/* The same mount may be mounted in multiple places */
+		if (radix_tree_lookup(&info->superblocks, (unsigned long)sb))
+			continue;
+
+		err = radix_tree_insert(&info->superblocks,
+					(unsigned long)sb, (void *)1);
+		if (err)
+			goto out;
+	}
+
+	/* Pin every collected sb; dropped again in fini_seq_pagecache() */
+	radix_tree_for_each_slot(slot, &info->superblocks, &iter, 0) {
+		sb = (struct super_block *)iter.index;
+		atomic_inc(&sb->s_active);
+	}
+
+	err = 0;
+out:
+	spin_unlock(&info->ns->ns_lock);
+	if (err) {
+		/* No s_active refs were taken on this path; just unwind */
+		radix_tree_for_each_slot(slot, &info->superblocks, &iter, 0)
+			radix_tree_delete(&info->superblocks, iter.index);
+		put_mnt_ns(info->ns);
+	}
+	return err;
+}
+
+/*
+ * Drop the references taken in init_seq_pagecache(): one s_active per
+ * superblock in the snapshot, plus the mount-namespace reference.
+ *
+ * NOTE(review): pairing atomic_inc(&sb->s_active) with a bare
+ * atomic_dec() skips deactivate_super()'s teardown if this was the last
+ * active reference — confirm whether deactivate_super() is required here.
+ */
+static void fini_seq_pagecache(void *priv_data)
+{
+	struct bpf_iter_seq_pagecache_info *info = priv_data;
+	struct radix_tree_iter iter;
+	struct super_block *sb;
+	void **slot;
+
+	radix_tree_for_each_slot(slot, &info->superblocks, &iter, 0) {
+		sb = (struct super_block *)iter.index;
+		atomic_dec(&sb->s_active);
+		radix_tree_delete(&info->superblocks, iter.index);
+	}
+
+	put_mnt_ns(info->ns);
+}
+
+/* seq_file callbacks driving the page walk */
+static const struct seq_operations pagecache_seq_ops = {
+	.start	= pagecache_seq_start,
+	.next	= pagecache_seq_next,
+	.stop	= pagecache_seq_stop,
+	.show	= pagecache_seq_show,
+};
+
+/* Glue between the bpf_iter core and the seq_file implementation above */
+static const struct bpf_iter_seq_info pagecache_seq_info = {
+	.seq_ops		= &pagecache_seq_ops,
+	.init_seq_private	= init_seq_pagecache,
+	.fini_seq_private	= fini_seq_pagecache,
+	.seq_priv_size		= sizeof(struct bpf_iter_seq_pagecache_info),
+};
+
+/* Registration record for the "pagecache" iter target; the page ctx arg
+ * may be NULL (stop callback), hence PTR_TO_BTF_ID_OR_NULL.
+ */
+static struct bpf_iter_reg pagecache_reg_info = {
+	.target			= "pagecache",
+	.ctx_arg_info_size	= 1,
+	.ctx_arg_info		= {
+		{ offsetof(struct bpf_iter__pagecache, page),
+		  PTR_TO_BTF_ID_OR_NULL },
+	},
+	.seq_info		= &pagecache_seq_info,
+};
+
+/* BTF id of struct page, resolved at build time for the ctx arg below */
+BTF_ID_LIST(btf_page_id)
+BTF_ID(struct, page)
+
+/* Register the iterator target with the bpf_iter core at late init */
+static int __init bpf_pagecache_iter_init(void)
+{
+	pagecache_reg_info.ctx_arg_info[0].btf_id = *btf_page_id;
+	return bpf_iter_reg_target(&pagecache_reg_info);
+}
+
+late_initcall(bpf_pagecache_iter_init);