diff mbox series

[bpf-next,2/4] bpf: Make bpf inode storage available to tracing program

Message ID 20241112082600.298035-3-song@kernel.org (mailing list archive)
State New
Headers show
Series Make inode storage available to tracing prog | expand

Commit Message

Song Liu Nov. 12, 2024, 8:25 a.m. UTC
inode storage can be useful for non-LSM programs. For example, file* tools
from bcc/libbpf-tools can use inode storage instead of hash maps; fanotify
fastpath [1] can also use inode storage to store useful data.

Make inode storage available for tracing programs. Move bpf inode storage
from a security blob to inode->i_bpf_storage, and adjust related code
accordingly.

[1] https://lore.kernel.org/linux-fsdevel/20241029231244.2834368-1-song@kernel.org/
Signed-off-by: Song Liu <song@kernel.org>
---
 fs/inode.c                     |  1 +
 include/linux/bpf.h            |  9 +++++++++
 include/linux/bpf_lsm.h        | 29 -----------------------------
 include/linux/fs.h             |  4 ++++
 kernel/bpf/Makefile            |  3 +--
 kernel/bpf/bpf_inode_storage.c | 32 +++++---------------------------
 kernel/bpf/bpf_lsm.c           |  4 ----
 kernel/trace/bpf_trace.c       |  4 ++++
 security/bpf/hooks.c           |  6 ------
 9 files changed, 24 insertions(+), 68 deletions(-)

Comments

Christian Brauner Nov. 13, 2024, 10:19 a.m. UTC | #1
On Tue, Nov 12, 2024 at 12:25:56AM -0800, Song Liu wrote:
> inode storage can be useful for non-LSM program. For example, file* tools
> from bcc/libbpf-tools can use inode storage instead of hash map; fanotify
> fastpath [1] can also use inode storage to store useful data.
> 
> Make inode storage available for tracing program. Move bpf inode storage
> from a security blob to inode->i_bpf_storage, and adjust related code
> accordingly.
> 
> [1] https://lore.kernel.org/linux-fsdevel/20241029231244.2834368-1-song@kernel.org/
> Signed-off-by: Song Liu <song@kernel.org>
> ---
>  fs/inode.c                     |  1 +
>  include/linux/bpf.h            |  9 +++++++++
>  include/linux/bpf_lsm.h        | 29 -----------------------------
>  include/linux/fs.h             |  4 ++++
>  kernel/bpf/Makefile            |  3 +--
>  kernel/bpf/bpf_inode_storage.c | 32 +++++---------------------------
>  kernel/bpf/bpf_lsm.c           |  4 ----
>  kernel/trace/bpf_trace.c       |  4 ++++
>  security/bpf/hooks.c           |  6 ------
>  9 files changed, 24 insertions(+), 68 deletions(-)
> 
> diff --git a/fs/inode.c b/fs/inode.c
> index 8dabb224f941..3c679578169f 100644
> --- a/fs/inode.c
> +++ b/fs/inode.c
> @@ -286,6 +286,7 @@ static struct inode *alloc_inode(struct super_block *sb)
>  void __destroy_inode(struct inode *inode)
>  {
>  	BUG_ON(inode_has_buffers(inode));
> +	bpf_inode_storage_free(inode);
>  	inode_detach_wb(inode);
>  	security_inode_free(inode);
>  	fsnotify_inode_delete(inode);
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 1b84613b10ac..0b31d2e74df6 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -2672,6 +2672,7 @@ struct bpf_link *bpf_link_by_id(u32 id);
>  const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id,
>  						 const struct bpf_prog *prog);
>  void bpf_task_storage_free(struct task_struct *task);
> +void bpf_inode_storage_free(struct inode *inode);
>  void bpf_cgrp_storage_free(struct cgroup *cgroup);
>  bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog);
>  const struct btf_func_model *
> @@ -2942,6 +2943,10 @@ static inline void bpf_task_storage_free(struct task_struct *task)
>  {
>  }
>  
> +static inline void bpf_inode_storage_free(struct inode *inode)
> +{
> +}
> +
>  static inline bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog)
>  {
>  	return false;
> @@ -3305,6 +3310,10 @@ extern const struct bpf_func_proto bpf_task_storage_get_recur_proto;
>  extern const struct bpf_func_proto bpf_task_storage_get_proto;
>  extern const struct bpf_func_proto bpf_task_storage_delete_recur_proto;
>  extern const struct bpf_func_proto bpf_task_storage_delete_proto;
> +extern const struct bpf_func_proto bpf_inode_storage_get_proto;
> +extern const struct bpf_func_proto bpf_inode_storage_get_recur_proto;
> +extern const struct bpf_func_proto bpf_inode_storage_delete_proto;
> +extern const struct bpf_func_proto bpf_inode_storage_delete_recur_proto;
>  extern const struct bpf_func_proto bpf_for_each_map_elem_proto;
>  extern const struct bpf_func_proto bpf_btf_find_by_name_kind_proto;
>  extern const struct bpf_func_proto bpf_sk_setsockopt_proto;
> diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h
> index aefcd6564251..a819c2f0a062 100644
> --- a/include/linux/bpf_lsm.h
> +++ b/include/linux/bpf_lsm.h
> @@ -19,31 +19,12 @@
>  #include <linux/lsm_hook_defs.h>
>  #undef LSM_HOOK
>  
> -struct bpf_storage_blob {
> -	struct bpf_local_storage __rcu *storage;
> -};
> -
> -extern struct lsm_blob_sizes bpf_lsm_blob_sizes;
> -
>  int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog,
>  			const struct bpf_prog *prog);
>  
>  bool bpf_lsm_is_sleepable_hook(u32 btf_id);
>  bool bpf_lsm_is_trusted(const struct bpf_prog *prog);
>  
> -static inline struct bpf_storage_blob *bpf_inode(
> -	const struct inode *inode)
> -{
> -	if (unlikely(!inode->i_security))
> -		return NULL;
> -
> -	return inode->i_security + bpf_lsm_blob_sizes.lbs_inode;
> -}
> -
> -extern const struct bpf_func_proto bpf_inode_storage_get_proto;
> -extern const struct bpf_func_proto bpf_inode_storage_delete_proto;
> -void bpf_inode_storage_free(struct inode *inode);
> -
>  void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, bpf_func_t *bpf_func);
>  
>  int bpf_lsm_get_retval_range(const struct bpf_prog *prog,
> @@ -66,16 +47,6 @@ static inline int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog,
>  	return -EOPNOTSUPP;
>  }
>  
> -static inline struct bpf_storage_blob *bpf_inode(
> -	const struct inode *inode)
> -{
> -	return NULL;
> -}
> -
> -static inline void bpf_inode_storage_free(struct inode *inode)
> -{
> -}
> -
>  static inline void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog,
>  					   bpf_func_t *bpf_func)
>  {
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 3559446279c1..479097e4dd5b 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -79,6 +79,7 @@ struct fs_context;
>  struct fs_parameter_spec;
>  struct fileattr;
>  struct iomap_ops;
> +struct bpf_local_storage;
>  
>  extern void __init inode_init(void);
>  extern void __init inode_init_early(void);
> @@ -648,6 +649,9 @@ struct inode {
>  #ifdef CONFIG_SECURITY
>  	void			*i_security;
>  #endif
> +#ifdef CONFIG_BPF_SYSCALL
> +	struct bpf_local_storage __rcu *i_bpf_storage;
> +#endif

Sorry, we're not growing struct inode for this. It just keeps getting
bigger. Last cycle we freed up 8 bytes to shrink it and we're not going
to waste them on special-purpose stuff. We already NAKed someone else's
pet field here.
Song Liu Nov. 13, 2024, 2:15 p.m. UTC | #2
Hi Christian, 

Thanks for your review. 

> On Nov 13, 2024, at 2:19 AM, Christian Brauner <brauner@kernel.org> wrote:
[...]

>> diff --git a/include/linux/fs.h b/include/linux/fs.h
>> index 3559446279c1..479097e4dd5b 100644
>> --- a/include/linux/fs.h
>> +++ b/include/linux/fs.h
>> @@ -79,6 +79,7 @@ struct fs_context;
>> struct fs_parameter_spec;
>> struct fileattr;
>> struct iomap_ops;
>> +struct bpf_local_storage;
>> 
>> extern void __init inode_init(void);
>> extern void __init inode_init_early(void);
>> @@ -648,6 +649,9 @@ struct inode {
>> #ifdef CONFIG_SECURITY
>> void *i_security;
>> #endif
>> +#ifdef CONFIG_BPF_SYSCALL
>> + struct bpf_local_storage __rcu *i_bpf_storage;
>> +#endif
> 
> Sorry, we're not growing struct inode for this. It just keeps getting
> bigger. Last cycle we freed up 8 bytes to shrink it and we're not going
> to waste them on special-purpose stuff. We already NAKed someone else's
> pet field here.

Would it be acceptable if we union i_bpf_storage with i_security?
IOW, if CONFIG_SECURITY is enabled, we will use existing logic. 
If CONFIG_SECURITY is not enabled, we will use i_bpf_storage. 
Given that the majority of default configs have CONFIG_SECURITY=y, this 
will not grow the inode for most users. OTOH, users with the 
CONFIG_SECURITY=n && CONFIG_BPF_SYSCALL=y combination can still 
use inode local storage in the tracing BPF programs. 

Does this make sense?

Thanks,
Song
Casey Schaufler Nov. 13, 2024, 6:29 p.m. UTC | #3
On 11/13/2024 6:15 AM, Song Liu wrote:
> Hi Christian, 
>
> Thanks for your review. 
>
>> On Nov 13, 2024, at 2:19 AM, Christian Brauner <brauner@kernel.org> wrote:
> [...]
>
>>> diff --git a/include/linux/fs.h b/include/linux/fs.h
>>> index 3559446279c1..479097e4dd5b 100644
>>> --- a/include/linux/fs.h
>>> +++ b/include/linux/fs.h
>>> @@ -79,6 +79,7 @@ struct fs_context;
>>> struct fs_parameter_spec;
>>> struct fileattr;
>>> struct iomap_ops;
>>> +struct bpf_local_storage;
>>>
>>> extern void __init inode_init(void);
>>> extern void __init inode_init_early(void);
>>> @@ -648,6 +649,9 @@ struct inode {
>>> #ifdef CONFIG_SECURITY
>>> void *i_security;
>>> #endif
>>> +#ifdef CONFIG_BPF_SYSCALL
>>> + struct bpf_local_storage __rcu *i_bpf_storage;
>>> +#endif
>> Sorry, we're not growing struct inode for this. It just keeps getting
>> bigger. Last cycle we freed up 8 bytes to shrink it and we're not going
>> to waste them on special-purpose stuff. We already NAKed someone else's
>> pet field here.
> Would it be acceptable if we union i_bpf_storage with i_security?

No!

> IOW, if CONFIG_SECURITY is enabled, we will use existing logic. 
> If CONFIG_SECURITY is not enabled, we will use i_bpf_storage. 
> Given majority of default configs have CONFIG_SECURITY=y, this 
> will not grow inode for most users. OTOH, users with 
> CONFIG_SECURITY=n && CONFIG_BPF_SYSCALL=y combination can still 
> use inode local storage in the tracing BPF programs. 
>
> Does this make sense?

All it would take is one BPF programmer assuming that CONFIG_SECURITY=n
is the norm for this to blow up spectacularly.

>
> Thanks,
> Song 
>
Song Liu Nov. 13, 2024, 7 p.m. UTC | #4
On Wed, Nov 13, 2024 at 10:30 AM Casey Schaufler <casey@schaufler-ca.com> wrote:
>
> On 11/13/2024 6:15 AM, Song Liu wrote:
> > Hi Christian,
> >
> > Thanks for your review.
> >
> >> On Nov 13, 2024, at 2:19 AM, Christian Brauner <brauner@kernel.org> wrote:
> > [...]
> >
> >>> diff --git a/include/linux/fs.h b/include/linux/fs.h
> >>> index 3559446279c1..479097e4dd5b 100644
> >>> --- a/include/linux/fs.h
> >>> +++ b/include/linux/fs.h
> >>> @@ -79,6 +79,7 @@ struct fs_context;
> >>> struct fs_parameter_spec;
> >>> struct fileattr;
> >>> struct iomap_ops;
> >>> +struct bpf_local_storage;
> >>>
> >>> extern void __init inode_init(void);
> >>> extern void __init inode_init_early(void);
> >>> @@ -648,6 +649,9 @@ struct inode {
> >>> #ifdef CONFIG_SECURITY
> >>> void *i_security;
> >>> #endif
> >>> +#ifdef CONFIG_BPF_SYSCALL
> >>> + struct bpf_local_storage __rcu *i_bpf_storage;
> >>> +#endif
> >> Sorry, we're not growing struct inode for this. It just keeps getting
> >> bigger. Last cycle we freed up 8 bytes to shrink it and we're not going
> >> to waste them on special-purpose stuff. We already NAKed someone else's
> >> pet field here.
> > Would it be acceptable if we union i_bpf_storage with i_security?
>
> No!
>
> > IOW, if CONFIG_SECURITY is enabled, we will use existing logic.
> > If CONFIG_SECURITY is not enabled, we will use i_bpf_storage.
> > Given majority of default configs have CONFIG_SECURITY=y, this
> > will not grow inode for most users. OTOH, users with
> > CONFIG_SECURITY=n && CONFIG_BPF_SYSCALL=y combination can still
> > use inode local storage in the tracing BPF programs.
> >
> > Does this make sense?
>
> All it would take is one BPF programmer assuming that CONFIG_SECURITY=n
> is the norm for this to blow up spectacularly.

I seriously don't understand what would blow up and how. Can you be
more specific?

Thanks,
Song
Song Liu Nov. 14, 2024, 9:11 p.m. UTC | #5
Hi Christian, 

> On Nov 13, 2024, at 2:19 AM, Christian Brauner <brauner@kernel.org> wrote:

[...]

>> static inline void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog,
>>   bpf_func_t *bpf_func)
>> {
>> diff --git a/include/linux/fs.h b/include/linux/fs.h
>> index 3559446279c1..479097e4dd5b 100644
>> --- a/include/linux/fs.h
>> +++ b/include/linux/fs.h
>> @@ -79,6 +79,7 @@ struct fs_context;
>> struct fs_parameter_spec;
>> struct fileattr;
>> struct iomap_ops;
>> +struct bpf_local_storage;
>> 
>> extern void __init inode_init(void);
>> extern void __init inode_init_early(void);
>> @@ -648,6 +649,9 @@ struct inode {
>> #ifdef CONFIG_SECURITY
>> void *i_security;
>> #endif
>> +#ifdef CONFIG_BPF_SYSCALL
>> + struct bpf_local_storage __rcu *i_bpf_storage;
>> +#endif
> 
> Sorry, we're not growing struct inode for this. It just keeps getting
> bigger. Last cycle we freed up 8 bytes to shrink it and we're not going
> to waste them on special-purpose stuff. We already NAKed someone else's
> pet field here.

Per other discussions in this thread, I am implementing the following:

#ifdef CONFIG_SECURITY
        void                    *i_security;
#elif CONFIG_BPF_SYSCALL
        struct bpf_local_storage __rcu *i_bpf_storage;
#endif

However, it is a bit trickier than I thought. Specifically, we need 
to deal with the following scenarios:
 
1. CONFIG_SECURITY=y && CONFIG_BPF_LSM=n && CONFIG_BPF_SYSCALL=y
2. CONFIG_SECURITY=y && CONFIG_BPF_LSM=y && CONFIG_BPF_SYSCALL=y but 
   bpf lsm is not enabled at boot time. 

AFAICT, we need to modify how lsm blobs are managed in the 
CONFIG_BPF_SYSCALL=y && CONFIG_BPF_LSM=n case. The solution, even
if it gets accepted, doesn't really save any memory. Instead of 
growing struct inode by 8 bytes, the solution will allocate 8
more bytes to inode->i_security. So the total memory consumption
is the same, but the memory is more fragmented. 

Therefore, I think we should really step back and consider adding
the i_bpf_storage to struct inode. While this does increase the
size of struct inode by 8 bytes, it may end up with less overall
memory consumption for the system. This is why. 

When the user cannot use inode local storage, the alternative is 
to use hash maps (using the inode pointer as the key). AFAICT, all hash maps 
come with non-trivial overhead, in memory consumption, in access 
latency, and in extra code to manage the memory. OTOH, inode local 
storage doesn't have these issues, and is usually much more efficient: 
 - memory is only allocated for inodes with actual data, 
 - O(1) latency, 
 - per inode data is freed automatically when the inode is evicted. 
Please refer to [1] where Amir mentioned all the work needed to 
properly manage a hash map, and I explained why we don't need to 
worry about these with inode local storage. 

Besides reducing memory consumption, i_bpf_storage also shortens 
the pointer chain to access inode local storage. Before this set, 
inode local storage is available at 
inode->i_security+offset(struct bpf_storage_blob)->storage. After 
this set, inode local storage is simply at inode->i_bpf_storage. 

At the moment, we are using bpf local storage with 
task_struct->bpf_storage, struct sock->sk_bpf_storage, 
struct cgroup->bpf_cgrp_storage. All of these turned out to be 
successful and helped users to use memory more efficiently. I 
think we can see the same benefits with struct inode->i_bpf_storage. 

I hope these make sense, and you will consider adding i_bpf_storage.
Please let me know if anything above is not clear. 

Thanks,
Song

[1] https://lore.kernel.org/linux-fsdevel/CAOQ4uxjXjjkKMa1xcPyxE5vxh1U5oGZJWtofRCwp-3ikCA6cgg@mail.gmail.com/
Jan Kara Nov. 15, 2024, 11:19 a.m. UTC | #6
Hi Song!

On Thu 14-11-24 21:11:57, Song Liu wrote:
> > On Nov 13, 2024, at 2:19 AM, Christian Brauner <brauner@kernel.org> wrote:
> >> static inline void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog,
> >>   bpf_func_t *bpf_func)
> >> {
> >> diff --git a/include/linux/fs.h b/include/linux/fs.h
> >> index 3559446279c1..479097e4dd5b 100644
> >> --- a/include/linux/fs.h
> >> +++ b/include/linux/fs.h
> >> @@ -79,6 +79,7 @@ struct fs_context;
> >> struct fs_parameter_spec;
> >> struct fileattr;
> >> struct iomap_ops;
> >> +struct bpf_local_storage;
> >> 
> >> extern void __init inode_init(void);
> >> extern void __init inode_init_early(void);
> >> @@ -648,6 +649,9 @@ struct inode {
> >> #ifdef CONFIG_SECURITY
> >> void *i_security;
> >> #endif
> >> +#ifdef CONFIG_BPF_SYSCALL
> >> + struct bpf_local_storage __rcu *i_bpf_storage;
> >> +#endif
> > 
> > Sorry, we're not growing struct inode for this. It just keeps getting
> > bigger. Last cycle we freed up 8 bytes to shrink it and we're not going
> > to waste them on special-purpose stuff. We already NAKed someone else's
> > pet field here.
> 
> Per other discussions in this thread, I am implementing the following:
> 
> #ifdef CONFIG_SECURITY
>         void                    *i_security;
> #elif CONFIG_BPF_SYSCALL
>         struct bpf_local_storage __rcu *i_bpf_storage;
> #endif
> 
> However, it is a bit trickier than I thought. Specifically, we need 
> to deal with the following scenarios:
>  
> 1. CONFIG_SECURITY=y && CONFIG_BPF_LSM=n && CONFIG_BPF_SYSCALL=y
> 2. CONFIG_SECURITY=y && CONFIG_BPF_LSM=y && CONFIG_BPF_SYSCALL=y but 
>    bpf lsm is not enabled at boot time. 
> 
> AFAICT, we need to modify how lsm blob are managed with 
> CONFIG_BPF_SYSCALL=y && CONFIG_BPF_LSM=n case. The solution, even
> if it gets accepted, doesn't really save any memory. Instead of 
> growing struct inode by 8 bytes, the solution will allocate 8
> more bytes to inode->i_security. So the total memory consumption
> is the same, but the memory is more fragmented. 

I guess you've found a better solution for this based on James' suggestion.

> Therefore, I think we should really step back and consider adding
> the i_bpf_storage to struct inode. While this does increase the
> size of struct inode by 8 bytes, it may end up with less overall
> memory consumption for the system. This is why. 
>
> When the user cannot use inode local storage, the alternative is 
> to use hash maps (use inode pointer as key). AFAICT, all hash maps 
> comes with non-trivial overhead, in memory consumption, in access 
> latency, and in extra code to manage the memory. OTOH, inode local 
> storage doesn't have these issue, and is usually much more efficient: 
>  - memory is only allocated for inodes with actual data, 
>  - O(1) latency, 
>  - per inode data is freed automatically when the inode is evicted. 
> Please refer to [1] where Amir mentioned all the work needed to 
> properly manage a hash map, and I explained why we don't need to 
> worry about these with inode local storage. 

Well, but here you are speaking of a situation where bpf inode storage
space gets actually used for most inodes. Then I agree i_bpf_storage is the
most economic solution. But I'd also expect that for vast majority of
systems the bpf inode storage isn't used at all and if it does get used, it
is used only for a small fraction of inodes. So we are weighting 8 bytes
per inode for all those users that don't need it against more significant
memory savings for users that actually do need per inode bpf storage. A
factor in this is that a lot of people are running some distribution kernel
which generally enables most config options that are at least somewhat
useful. So hiding the cost behind CONFIG_FOO doesn't really help such
people.
 
I'm personally not *so* hung up about a pointer in struct inode but I can
see why Christian is and I agree adding a pointer there isn't a win for
everybody.

Longer term, I think it may be beneficial to come up with a way to attach
private info to the inode in a way that doesn't cost us one pointer per
funcionality that may possibly attach info to the inode. We already have
i_crypt_info, i_verity_info, i_flctx, i_security, etc. It's always a tough
call where the space overhead for everybody is worth the runtime &
complexity overhead for users using the functionality...

								Honza
Song Liu Nov. 15, 2024, 5:35 p.m. UTC | #7
Hi Jan, 

> On Nov 15, 2024, at 3:19 AM, Jan Kara <jack@suse.cz> wrote:

[...]

>> AFAICT, we need to modify how lsm blob are managed with 
>> CONFIG_BPF_SYSCALL=y && CONFIG_BPF_LSM=n case. The solution, even
>> if it gets accepted, doesn't really save any memory. Instead of 
>> growing struct inode by 8 bytes, the solution will allocate 8
>> more bytes to inode->i_security. So the total memory consumption
>> is the same, but the memory is more fragmented.
> 
> I guess you've found a better solution for this based on James' suggestion.
> 
>> Therefore, I think we should really step back and consider adding
>> the i_bpf_storage to struct inode. While this does increase the
>> size of struct inode by 8 bytes, it may end up with less overall
>> memory consumption for the system. This is why. 
>> 
>> When the user cannot use inode local storage, the alternative is 
>> to use hash maps (use inode pointer as key). AFAICT, all hash maps 
>> comes with non-trivial overhead, in memory consumption, in access 
>> latency, and in extra code to manage the memory. OTOH, inode local 
>> storage doesn't have these issue, and is usually much more efficient: 
>> - memory is only allocated for inodes with actual data, 
>> - O(1) latency, 
>> - per inode data is freed automatically when the inode is evicted. 
>> Please refer to [1] where Amir mentioned all the work needed to 
>> properly manage a hash map, and I explained why we don't need to 
>> worry about these with inode local storage.
> 
> Well, but here you are speaking of a situation where bpf inode storage
> space gets actually used for most inodes. Then I agree i_bpf_storage is the
> most economic solution. But I'd also expect that for vast majority of
> systems the bpf inode storage isn't used at all and if it does get used, it
> is used only for a small fraction of inodes. So we are weighting 8 bytes
> per inode for all those users that don't need it against more significant
> memory savings for users that actually do need per inode bpf storage. A
> factor in this is that a lot of people are running some distribution kernel
> which generally enables most config options that are at least somewhat
> useful. So hiding the cost behind CONFIG_FOO doesn't really help such
> people.

Agreed that an extra pointer will be used if there are no actual users
of it. However, in the longer term, "most users do not use bpf inode
storage" may not be true. As kernel engineers, we may not always notice 
when user space is using some BPF features. For example, systemd has
a BPF LSM program "restrict_filesystems" [1]. It is enabled if the 
user has lsm=bpf in the kernel args. I personally noticed it as a 
surprise when we enabled lsm=bpf. 

> I'm personally not *so* hung up about a pointer in struct inode but I can
> see why Christian is and I agree adding a pointer there isn't a win for
> everybody.

I can also understand Christian's motivation. However, I am a bit
frustrated because a similar approach (adding a pointer to the struct) 
worked fine for other popular data structures: task_struct, sock, 
cgroup. 

> Longer term, I think it may be beneficial to come up with a way to attach
> private info to the inode in a way that doesn't cost us one pointer per
> funcionality that may possibly attach info to the inode. We already have
> i_crypt_info, i_verity_info, i_flctx, i_security, etc. It's always a tough
> call where the space overhead for everybody is worth the runtime &
> complexity overhead for users using the functionality...

It does seem to be the right long-term solution, and I am willing to 
work on it. However, I would really appreciate some positive feedback
on the idea, so that I have better confidence that my weeks of work 
have a better chance of being worth it. 

Thanks,
Song

[1] https://github.com/systemd/systemd/blob/main/src/core/bpf/restrict_fs/restrict-fs.bpf.c
Jeff Layton Nov. 19, 2024, 2:21 p.m. UTC | #8
On Fri, 2024-11-15 at 17:35 +0000, Song Liu wrote:
> Hi Jan, 
> 
> > On Nov 15, 2024, at 3:19 AM, Jan Kara <jack@suse.cz> wrote:
> 
> [...]
> 
> > > AFAICT, we need to modify how lsm blob are managed with 
> > > CONFIG_BPF_SYSCALL=y && CONFIG_BPF_LSM=n case. The solution, even
> > > if it gets accepted, doesn't really save any memory. Instead of 
> > > growing struct inode by 8 bytes, the solution will allocate 8
> > > more bytes to inode->i_security. So the total memory consumption
> > > is the same, but the memory is more fragmented.
> > 
> > I guess you've found a better solution for this based on James' suggestion.
> > 
> > > Therefore, I think we should really step back and consider adding
> > > the i_bpf_storage to struct inode. While this does increase the
> > > size of struct inode by 8 bytes, it may end up with less overall
> > > memory consumption for the system. This is why. 
> > > 
> > > When the user cannot use inode local storage, the alternative is 
> > > to use hash maps (use inode pointer as key). AFAICT, all hash maps 
> > > comes with non-trivial overhead, in memory consumption, in access 
> > > latency, and in extra code to manage the memory. OTOH, inode local 
> > > storage doesn't have these issue, and is usually much more efficient: 
> > > - memory is only allocated for inodes with actual data, 
> > > - O(1) latency, 
> > > - per inode data is freed automatically when the inode is evicted. 
> > > Please refer to [1] where Amir mentioned all the work needed to 
> > > properly manage a hash map, and I explained why we don't need to 
> > > worry about these with inode local storage.
> > 
> > Well, but here you are speaking of a situation where bpf inode storage
> > space gets actually used for most inodes. Then I agree i_bpf_storage is the
> > most economic solution. But I'd also expect that for vast majority of
> > systems the bpf inode storage isn't used at all and if it does get used, it
> > is used only for a small fraction of inodes. So we are weighting 8 bytes
> > per inode for all those users that don't need it against more significant
> > memory savings for users that actually do need per inode bpf storage. A
> > factor in this is that a lot of people are running some distribution kernel
> > which generally enables most config options that are at least somewhat
> > useful. So hiding the cost behind CONFIG_FOO doesn't really help such
> > people.
> 
> Agreed that an extra pointer will be used if there is no actual users
> of it. However, in longer term, "most users do not use bpf inode
> storage" may not be true. As kernel engineers, we may not always notice 
> when user space is using some BPF features. For example, systemd has
> a BPF LSM program "restrict_filesystems" [1]. It is enabled if the 
> user have lsm=bpf in kernel args. I personally noticed it as a 
> surprise when we enabled lsm=bpf. 
> 
> > I'm personally not *so* hung up about a pointer in struct inode but I can
> > see why Christian is and I agree adding a pointer there isn't a win for
> > everybody.
> 
> I can also understand Christian's motivation. However, I am a bit
> frustrated because similar approach (adding a pointer to the struct) 
> worked fine for other popular data structures: task_struct, sock, 
> cgroup. 
> 

There are (usually) a lot more inodes on a host than all of those other
structs combined. Worse, struct inode is often embedded in other
structs, and adding fields can cause alignment problems there.


> > Longer term, I think it may be beneficial to come up with a way to attach
> > private info to the inode in a way that doesn't cost us one pointer per
> > funcionality that may possibly attach info to the inode. We already have
> > i_crypt_info, i_verity_info, i_flctx, i_security, etc. It's always a tough
> > call where the space overhead for everybody is worth the runtime &
> > complexity overhead for users using the functionality...
> 
> It does seem to be the right long term solution, and I am willing to 
> work on it. However, I would really appreciate some positive feedback
> on the idea, so that I have better confidence my weeks of work has a 
> better chance to worth it. 
> 
> Thanks,
> Song
> 
> [1] https://github.com/systemd/systemd/blob/main/src/core/bpf/restrict_fs/restrict-fs.bpf.c

fsnotify is somewhat similar to file locking in that few inodes on the
machine actually utilize these fields.

For file locking, we allocate and populate the inode->i_flctx field on
an as-needed basis. The kernel then hangs on to that struct until the
inode is freed. We could do something similar here. We have this now:

#ifdef CONFIG_FSNOTIFY
        __u32                   i_fsnotify_mask; /* all events this inode cares about */
        /* 32-bit hole reserved for expanding i_fsnotify_mask */
        struct fsnotify_mark_connector __rcu    *i_fsnotify_marks;
#endif

What if you were to turn these fields into a pointer to a new struct:

	struct fsnotify_inode_context {
		struct fsnotify_mark_connector __rcu	*i_fsnotify_marks;
		struct bpf_local_storage __rcu		*i_bpf_storage;
		__u32                   		i_fsnotify_mask; /* all events this inode cares about */
	};

Then whenever you have to populate any of these fields, you just
allocate one of these structs and set the inode up to point to it.
They're tiny too, so don't bother freeing it until the inode is
deallocated.

It'd mean rejiggering a fair bit of fsnotify code, but it would give
the fsnotify code an easier way to expand per-inode info in the future.
It would also slightly shrink struct inode too.
Amir Goldstein Nov. 19, 2024, 3:25 p.m. UTC | #9
On Tue, Nov 19, 2024 at 3:21 PM Jeff Layton <jlayton@kernel.org> wrote:
>
> On Fri, 2024-11-15 at 17:35 +0000, Song Liu wrote:
> > Hi Jan,
> >
> > > On Nov 15, 2024, at 3:19 AM, Jan Kara <jack@suse.cz> wrote:
> >
> > [...]
> >
> > > > AFAICT, we need to modify how lsm blob are managed with
> > > > CONFIG_BPF_SYSCALL=y && CONFIG_BPF_LSM=n case. The solution, even
> > > > if it gets accepted, doesn't really save any memory. Instead of
> > > > growing struct inode by 8 bytes, the solution will allocate 8
> > > > more bytes to inode->i_security. So the total memory consumption
> > > > is the same, but the memory is more fragmented.
> > >
> > > I guess you've found a better solution for this based on James' suggestion.
> > >
> > > > Therefore, I think we should really step back and consider adding
> > > > the i_bpf_storage to struct inode. While this does increase the
> > > > size of struct inode by 8 bytes, it may end up with less overall
> > > > memory consumption for the system. This is why.
> > > >
> > > > When the user cannot use inode local storage, the alternative is
> > > > to use hash maps (use inode pointer as key). AFAICT, all hash maps
> > > > comes with non-trivial overhead, in memory consumption, in access
> > > > latency, and in extra code to manage the memory. OTOH, inode local
> > > > storage doesn't have these issue, and is usually much more efficient:
> > > > - memory is only allocated for inodes with actual data,
> > > > - O(1) latency,
> > > > - per inode data is freed automatically when the inode is evicted.
> > > > Please refer to [1] where Amir mentioned all the work needed to
> > > > properly manage a hash map, and I explained why we don't need to
> > > > worry about these with inode local storage.
> > >
> > > Well, but here you are speaking of a situation where bpf inode storage
> > > space gets actually used for most inodes. Then I agree i_bpf_storage is the
> > > most economic solution. But I'd also expect that for vast majority of
> > > systems the bpf inode storage isn't used at all and if it does get used, it
> > > is used only for a small fraction of inodes. So we are weighting 8 bytes
> > > per inode for all those users that don't need it against more significant
> > > memory savings for users that actually do need per inode bpf storage. A
> > > factor in this is that a lot of people are running some distribution kernel
> > > which generally enables most config options that are at least somewhat
> > > useful. So hiding the cost behind CONFIG_FOO doesn't really help such
> > > people.
> >
> > Agreed that an extra pointer will be used if there is no actual users
> > of it. However, in longer term, "most users do not use bpf inode
> > storage" may not be true. As kernel engineers, we may not always notice
> > when user space is using some BPF features. For example, systemd has
> a BPF LSM program "restrict_filesystems" [1]. It is enabled if the
> user has lsm=bpf in kernel args. I personally noticed it as a
> > surprise when we enabled lsm=bpf.
> >
> > > I'm personally not *so* hung up about a pointer in struct inode but I can
> > > see why Christian is and I agree adding a pointer there isn't a win for
> > > everybody.
> >
> > I can also understand Christian's motivation. However, I am a bit
> > frustrated because similar approach (adding a pointer to the struct)
> > worked fine for other popular data structures: task_struct, sock,
> > cgroup.
> >
>
> There are (usually) a lot more inodes on a host than all of those other
> structs combined. Worse, struct inode is often embedded in other
> structs, and adding fields can cause alignment problems there.
>
>
> > > Longer term, I think it may be beneficial to come up with a way to attach
> > > private info to the inode in a way that doesn't cost us one pointer per
> > > funcionality that may possibly attach info to the inode. We already have
> > > i_crypt_info, i_verity_info, i_flctx, i_security, etc. It's always a tough
> > > call where the space overhead for everybody is worth the runtime &
> > > complexity overhead for users using the functionality...
> >
> > It does seem to be the right long term solution, and I am willing to
> > work on it. However, I would really appreciate some positive feedback
> > on the idea, so that I have better confidence my weeks of work has a
> > better chance to worth it.
> >
> > Thanks,
> > Song
> >
> > [1] https://github.com/systemd/systemd/blob/main/src/core/bpf/restrict_fs/restrict-fs.bpf.c
>
> fsnotify is somewhat similar to file locking in that few inodes on the
> machine actually utilize these fields.
>
> For file locking, we allocate and populate the inode->i_flctx field on
> an as-needed basis. The kernel then hangs on to that struct until the
> inode is freed. We could do something similar here. We have this now:
>
> #ifdef CONFIG_FSNOTIFY
>         __u32                   i_fsnotify_mask; /* all events this inode cares about */
>         /* 32-bit hole reserved for expanding i_fsnotify_mask */
>         struct fsnotify_mark_connector __rcu    *i_fsnotify_marks;
> #endif
>
> What if you were to turn these fields into a pointer to a new struct:
>
>         struct fsnotify_inode_context {
>                 struct fsnotify_mark_connector __rcu    *i_fsnotify_marks;
>                 struct bpf_local_storage __rcu          *i_bpf_storage;
>                 __u32                                   i_fsnotify_mask; /* all events this inode cares about */
>         };
>

The extra indirection is going to hurt for i_fsnotify_mask
it is being accessed frequently in fsnotify hooks, so I wouldn't move it
into a container, but it could be moved to the hole after i_state.

> Then whenever you have to populate any of these fields, you just
> allocate one of these structs and set the inode up to point to it.
> They're tiny too, so don't bother freeing it until the inode is
> deallocated.
>
> It'd mean rejiggering a fair bit of fsnotify code, but it would give
> the fsnotify code an easier way to expand per-inode info in the future.
> It would also slightly shrink struct inode too.

This was already done for s_fsnotify_marks, so you can follow the recipe
of 07a3b8d0bf72 ("fsnotify: lazy attach fsnotify_sb_info state to sb")
and create an fsnotify_inode_info container.

Thanks,
Amir.
Amir Goldstein Nov. 19, 2024, 3:30 p.m. UTC | #10
On Tue, Nov 19, 2024 at 4:25 PM Amir Goldstein <amir73il@gmail.com> wrote:
>
> On Tue, Nov 19, 2024 at 3:21 PM Jeff Layton <jlayton@kernel.org> wrote:
> >
> > On Fri, 2024-11-15 at 17:35 +0000, Song Liu wrote:
> > > Hi Jan,
> > >
> > > > On Nov 15, 2024, at 3:19 AM, Jan Kara <jack@suse.cz> wrote:
> > >
> > > [...]
> > >
> > > > > AFAICT, we need to modify how lsm blob are managed with
> > > > > CONFIG_BPF_SYSCALL=y && CONFIG_BPF_LSM=n case. The solution, even
> > > > > if it gets accepted, doesn't really save any memory. Instead of
> > > > > growing struct inode by 8 bytes, the solution will allocate 8
> > > > > more bytes to inode->i_security. So the total memory consumption
> > > > > is the same, but the memory is more fragmented.
> > > >
> > > > I guess you've found a better solution for this based on James' suggestion.
> > > >
> > > > > Therefore, I think we should really step back and consider adding
> > > > > the i_bpf_storage to struct inode. While this does increase the
> > > > > size of struct inode by 8 bytes, it may end up with less overall
> > > > > memory consumption for the system. This is why.
> > > > >
> > > > > When the user cannot use inode local storage, the alternative is
> > > > > to use hash maps (use inode pointer as key). AFAICT, all hash maps
> > > > > comes with non-trivial overhead, in memory consumption, in access
> > > > > latency, and in extra code to manage the memory. OTOH, inode local
> > > > > storage doesn't have these issue, and is usually much more efficient:
> > > > > - memory is only allocated for inodes with actual data,
> > > > > - O(1) latency,
> > > > > - per inode data is freed automatically when the inode is evicted.
> > > > > Please refer to [1] where Amir mentioned all the work needed to
> > > > > properly manage a hash map, and I explained why we don't need to
> > > > > worry about these with inode local storage.
> > > >
> > > > Well, but here you are speaking of a situation where bpf inode storage
> > > > space gets actually used for most inodes. Then I agree i_bpf_storage is the
> > > > most economic solution. But I'd also expect that for vast majority of
> > > > systems the bpf inode storage isn't used at all and if it does get used, it
> > > > is used only for a small fraction of inodes. So we are weighting 8 bytes
> > > > per inode for all those users that don't need it against more significant
> > > > memory savings for users that actually do need per inode bpf storage. A
> > > > factor in this is that a lot of people are running some distribution kernel
> > > > which generally enables most config options that are at least somewhat
> > > > useful. So hiding the cost behind CONFIG_FOO doesn't really help such
> > > > people.
> > >
> > > Agreed that an extra pointer will be used if there is no actual users
> > > of it. However, in longer term, "most users do not use bpf inode
> > > storage" may not be true. As kernel engineers, we may not always notice
> > > when user space is using some BPF features. For example, systemd has
> > > a BPF LSM program "restrict_filesystems" [1]. It is enabled if the
> > > user have lsm=bpf in kernel args. I personally noticed it as a
> > > surprise when we enabled lsm=bpf.
> > >
> > > > I'm personally not *so* hung up about a pointer in struct inode but I can
> > > > see why Christian is and I agree adding a pointer there isn't a win for
> > > > everybody.
> > >
> > > I can also understand Christian's motivation. However, I am a bit
> > > frustrated because similar approach (adding a pointer to the struct)
> > > worked fine for other popular data structures: task_struct, sock,
> > > cgroup.
> > >
> >
> > There are (usually) a lot more inodes on a host than all of those other
> > structs combined. Worse, struct inode is often embedded in other
> > structs, and adding fields can cause alignment problems there.
> >
> >
> > > > Longer term, I think it may be beneficial to come up with a way to attach
> > > > private info to the inode in a way that doesn't cost us one pointer per
> > > > funcionality that may possibly attach info to the inode. We already have
> > > > i_crypt_info, i_verity_info, i_flctx, i_security, etc. It's always a tough
> > > > call where the space overhead for everybody is worth the runtime &
> > > > complexity overhead for users using the functionality...
> > >
> > > It does seem to be the right long term solution, and I am willing to
> > > work on it. However, I would really appreciate some positive feedback
> > > on the idea, so that I have better confidence my weeks of work has a
> > > better chance to worth it.
> > >
> > > Thanks,
> > > Song
> > >
> > > [1] https://github.com/systemd/systemd/blob/main/src/core/bpf/restrict_fs/restrict-fs.bpf.c
> >
> > fsnotify is somewhat similar to file locking in that few inodes on the
> > machine actually utilize these fields.
> >
> > For file locking, we allocate and populate the inode->i_flctx field on
> > an as-needed basis. The kernel then hangs on to that struct until the
> > inode is freed. We could do something similar here. We have this now:
> >
> > #ifdef CONFIG_FSNOTIFY
> >         __u32                   i_fsnotify_mask; /* all events this inode cares about */
> >         /* 32-bit hole reserved for expanding i_fsnotify_mask */
> >         struct fsnotify_mark_connector __rcu    *i_fsnotify_marks;
> > #endif
> >
> > What if you were to turn these fields into a pointer to a new struct:
> >
> >         struct fsnotify_inode_context {
> >                 struct fsnotify_mark_connector __rcu    *i_fsnotify_marks;
> >                 struct bpf_local_storage __rcu          *i_bpf_storage;
> >                 __u32                                   i_fsnotify_mask; /* all events this inode cares about */
> >         };
> >
>
> The extra indirection is going to hurt for i_fsnotify_mask
> it is being accessed frequently in fsnotify hooks, so I wouldn't move it
> into a container, but it could be moved to the hole after i_state.
>
> > Then whenever you have to populate any of these fields, you just
> > allocate one of these structs and set the inode up to point to it.
> > They're tiny too, so don't bother freeing it until the inode is
> > deallocated.
> >
> > It'd mean rejiggering a fair bit of fsnotify code, but it would give
> > the fsnotify code an easier way to expand per-inode info in the future.
> > It would also slightly shrink struct inode too.
>
> This was already done for s_fsnotify_marks, so you can follow the recipe
> of 07a3b8d0bf72 ("fsnotify: lazy attach fsnotify_sb_info state to sb")
> and create an fsnotify_inode_info container.
>

On second thought, fsnotify_sb_info container is allocated and attached
in the context of userspace adding a mark.

If you will need to allocate and attach fsnotify_inode_info in the context of
a fast path fanotify hook in order to add the inode to the map, I don't
think that is going to fly??

Thanks,
Amir.
Song Liu Nov. 19, 2024, 9:53 p.m. UTC | #11
Hi Jeff and Amir, 

Thanks for your inputs!

> On Nov 19, 2024, at 7:30 AM, Amir Goldstein <amir73il@gmail.com> wrote:
> 
> On Tue, Nov 19, 2024 at 4:25 PM Amir Goldstein <amir73il@gmail.com> wrote:
>> 
>> On Tue, Nov 19, 2024 at 3:21 PM Jeff Layton <jlayton@kernel.org> wrote:
>>> 

[...]

>>> Longer term, I think it may be beneficial to come up with a way to attach
>>>>> private info to the inode in a way that doesn't cost us one pointer per
>>>>> funcionality that may possibly attach info to the inode. We already have
>>>>> i_crypt_info, i_verity_info, i_flctx, i_security, etc. It's always a tough
>>>>> call where the space overhead for everybody is worth the runtime &
>>>>> complexity overhead for users using the functionality...
>>>> 
>>>> It does seem to be the right long term solution, and I am willing to
>>>> work on it. However, I would really appreciate some positive feedback
>>>> on the idea, so that I have better confidence my weeks of work has a
>>>> better chance to worth it.
>>>> 
>>>> Thanks,
>>>> Song
>>>> 
>>>> [1] https://github.com/systemd/systemd/blob/main/src/core/bpf/restrict_fs/restrict-fs.bpf.c
>>> 
>>> fsnotify is somewhat similar to file locking in that few inodes on the
>>> machine actually utilize these fields.
>>> 
>>> For file locking, we allocate and populate the inode->i_flctx field on
>>> an as-needed basis. The kernel then hangs on to that struct until the
>>> inode is freed.

If we have some universal on-demand per-inode memory allocator, 
I guess we can move i_flctx to it?

>>> We could do something similar here. We have this now:
>>> 
>>> #ifdef CONFIG_FSNOTIFY
>>>        __u32                   i_fsnotify_mask; /* all events this inode cares about */
>>>        /* 32-bit hole reserved for expanding i_fsnotify_mask */
>>>        struct fsnotify_mark_connector __rcu    *i_fsnotify_marks;
>>> #endif

And maybe some fsnotify fields too?

With a couple of users, I think it justifies having some universal
on-demand allocator. 

>>> What if you were to turn these fields into a pointer to a new struct:
>>> 
>>>        struct fsnotify_inode_context {
>>>                struct fsnotify_mark_connector __rcu    *i_fsnotify_marks;
>>>                struct bpf_local_storage __rcu          *i_bpf_storage;
>>>                __u32                                   i_fsnotify_mask; /* all events this inode cares about */
>>>        };
>>> 
>> 
>> The extra indirection is going to hurt for i_fsnotify_mask
>> it is being accessed frequently in fsnotify hooks, so I wouldn't move it
>> into a container, but it could be moved to the hole after i_state.

>>> Then whenever you have to populate any of these fields, you just
>>> allocate one of these structs and set the inode up to point to it.
>>> They're tiny too, so don't bother freeing it until the inode is
>>> deallocated.
>>> 
>>> It'd mean rejiggering a fair bit of fsnotify code, but it would give
>>> the fsnotify code an easier way to expand per-inode info in the future.
>>> It would also slightly shrink struct inode too.

I am hoping to make i_bpf_storage available to tracing programs. 
Therefore, I would rather not limit it to fsnotify context. We can
still use the universal on-demand allocator.

>> 
>> This was already done for s_fsnotify_marks, so you can follow the recipe
>> of 07a3b8d0bf72 ("fsnotify: lazy attach fsnotify_sb_info state to sb")
>> and create an fsnotify_inode_info container.
>> 
> 
> On second thought, fsnotify_sb_info container is allocated and attached
> in the context of userspace adding a mark.
> 
> If you will need allocate and attach fsnotify_inode_info in the content of
> fast path fanotify hook in order to add the inode to the map, I don't
> think that is going to fly??

Do you mean we may not be able to allocate memory in the fast path 
hook? AFAICT, the fast path is still in the process context, so I 
think this is not a problem?

Thanks,
Song
Amir Goldstein Nov. 20, 2024, 9:19 a.m. UTC | #12
On Tue, Nov 19, 2024 at 10:53 PM Song Liu <songliubraving@meta.com> wrote:
>
> Hi Jeff and Amir,
>
> Thanks for your inputs!
>
> > On Nov 19, 2024, at 7:30 AM, Amir Goldstein <amir73il@gmail.com> wrote:
> >
> > On Tue, Nov 19, 2024 at 4:25 PM Amir Goldstein <amir73il@gmail.com> wrote:
> >>
> >> On Tue, Nov 19, 2024 at 3:21 PM Jeff Layton <jlayton@kernel.org> wrote:
> >>>
>
> [...]
>
> >>> Longer term, I think it may be beneficial to come up with a way to attach
> >>>>> private info to the inode in a way that doesn't cost us one pointer per
> >>>>> funcionality that may possibly attach info to the inode. We already have
> >>>>> i_crypt_info, i_verity_info, i_flctx, i_security, etc. It's always a tough
> >>>>> call where the space overhead for everybody is worth the runtime &
> >>>>> complexity overhead for users using the functionality...
> >>>>
> >>>> It does seem to be the right long term solution, and I am willing to
> >>>> work on it. However, I would really appreciate some positive feedback
> >>>> on the idea, so that I have better confidence my weeks of work has a
> >>>> better chance to worth it.
> >>>>
> >>>> Thanks,
> >>>> Song
> >>>>
> >>>> [1] https://github.com/systemd/systemd/blob/main/src/core/bpf/restrict_fs/restrict-fs.bpf.c
> >>>
> >>> fsnotify is somewhat similar to file locking in that few inodes on the
> >>> machine actually utilize these fields.
> >>>
> >>> For file locking, we allocate and populate the inode->i_flctx field on
> >>> an as-needed basis. The kernel then hangs on to that struct until the
> >>> inode is freed.
>
> If we have some universal on-demand per-inode memory allocator,
> I guess we can move i_flctx to it?
>
> >>> We could do something similar here. We have this now:
> >>>
> >>> #ifdef CONFIG_FSNOTIFY
> >>>        __u32                   i_fsnotify_mask; /* all events this inode cares about */
> >>>        /* 32-bit hole reserved for expanding i_fsnotify_mask */
> >>>        struct fsnotify_mark_connector __rcu    *i_fsnotify_marks;
> >>> #endif
>
> And maybe some fsnotify fields too?
>
> With a couple users, I think it justifies to have some universal
> on-demond allocator.
>
> >>> What if you were to turn these fields into a pointer to a new struct:
> >>>
> >>>        struct fsnotify_inode_context {
> >>>                struct fsnotify_mark_connector __rcu    *i_fsnotify_marks;
> >>>                struct bpf_local_storage __rcu          *i_bpf_storage;
> >>>                __u32                                   i_fsnotify_mask; /* all events this inode cares about */
> >>>        };
> >>>
> >>
> >> The extra indirection is going to hurt for i_fsnotify_mask
> >> it is being accessed frequently in fsnotify hooks, so I wouldn't move it
> >> into a container, but it could be moved to the hole after i_state.
>
> >>> Then whenever you have to populate any of these fields, you just
> >>> allocate one of these structs and set the inode up to point to it.
> >>> They're tiny too, so don't bother freeing it until the inode is
> >>> deallocated.
> >>>
> >>> It'd mean rejiggering a fair bit of fsnotify code, but it would give
> >>> the fsnotify code an easier way to expand per-inode info in the future.
> >>> It would also slightly shrink struct inode too.
>
> I am hoping to make i_bpf_storage available to tracing programs.
> Therefore, I would rather not limit it to fsnotify context. We can
> still use the universal on-demand allocator.
>
> >>
> >> This was already done for s_fsnotify_marks, so you can follow the recipe
> >> of 07a3b8d0bf72 ("fsnotify: lazy attach fsnotify_sb_info state to sb")
> >> and create an fsnotify_inode_info container.
> >>
> >
> > On second thought, fsnotify_sb_info container is allocated and attached
> > in the context of userspace adding a mark.
> >
> > If you will need allocate and attach fsnotify_inode_info in the content of
> > fast path fanotify hook in order to add the inode to the map, I don't
> > think that is going to fly??
>
> Do you mean we may not be able to allocate memory in the fast path
> hook? AFAICT, the fast path is still in the process context, so I
> think this is not a problem?

Right. that should be ok.

Thanks,
Amir.
Christian Brauner Nov. 20, 2024, 9:28 a.m. UTC | #13
On Tue, Nov 19, 2024 at 09:53:20PM +0000, Song Liu wrote:
> Hi Jeff and Amir, 
> 
> Thanks for your inputs!
> 
> > On Nov 19, 2024, at 7:30 AM, Amir Goldstein <amir73il@gmail.com> wrote:
> > 
> > On Tue, Nov 19, 2024 at 4:25 PM Amir Goldstein <amir73il@gmail.com> wrote:
> >> 
> >> On Tue, Nov 19, 2024 at 3:21 PM Jeff Layton <jlayton@kernel.org> wrote:
> >>> 
> 
> [...]
> 
> >>> Longer term, I think it may be beneficial to come up with a way to attach
> >>>>> private info to the inode in a way that doesn't cost us one pointer per
> >>>>> funcionality that may possibly attach info to the inode. We already have
> >>>>> i_crypt_info, i_verity_info, i_flctx, i_security, etc. It's always a tough
> >>>>> call where the space overhead for everybody is worth the runtime &
> >>>>> complexity overhead for users using the functionality...
> >>>> 
> >>>> It does seem to be the right long term solution, and I am willing to
> >>>> work on it. However, I would really appreciate some positive feedback
> >>>> on the idea, so that I have better confidence my weeks of work has a
> >>>> better chance to worth it.
> >>>> 
> >>>> Thanks,
> >>>> Song
> >>>> 
> >>>> [1] https://github.com/systemd/systemd/blob/main/src/core/bpf/restrict_fs/restrict-fs.bpf.c
> >>> 
> >>> fsnotify is somewhat similar to file locking in that few inodes on the
> >>> machine actually utilize these fields.
> >>> 
> >>> For file locking, we allocate and populate the inode->i_flctx field on
> >>> an as-needed basis. The kernel then hangs on to that struct until the
> >>> inode is freed.
> 
> If we have some universal on-demand per-inode memory allocator, 
> I guess we can move i_flctx to it?
> 
> >>> We could do something similar here. We have this now:
> >>> 
> >>> #ifdef CONFIG_FSNOTIFY
> >>>        __u32                   i_fsnotify_mask; /* all events this inode cares about */
> >>>        /* 32-bit hole reserved for expanding i_fsnotify_mask */
> >>>        struct fsnotify_mark_connector __rcu    *i_fsnotify_marks;
> >>> #endif
> 
> And maybe some fsnotify fields too?
> 
> With a couple users, I think it justifies to have some universal
> on-demond allocator. 
> 
> >>> What if you were to turn these fields into a pointer to a new struct:
> >>> 
> >>>        struct fsnotify_inode_context {
> >>>                struct fsnotify_mark_connector __rcu    *i_fsnotify_marks;
> >>>                struct bpf_local_storage __rcu          *i_bpf_storage;
> >>>                __u32                                   i_fsnotify_mask; /* all events this inode cares about */
> >>>        };
> >>> 
> >> 
> >> The extra indirection is going to hurt for i_fsnotify_mask
> >> it is being accessed frequently in fsnotify hooks, so I wouldn't move it
> >> into a container, but it could be moved to the hole after i_state.
> 
> >>> Then whenever you have to populate any of these fields, you just
> >>> allocate one of these structs and set the inode up to point to it.
> >>> They're tiny too, so don't bother freeing it until the inode is
> >>> deallocated.
> >>> 
> >>> It'd mean rejiggering a fair bit of fsnotify code, but it would give
> >>> the fsnotify code an easier way to expand per-inode info in the future.
> >>> It would also slightly shrink struct inode too.
> 
> I am hoping to make i_bpf_storage available to tracing programs. 
> Therefore, I would rather not limit it to fsnotify context. We can
> still use the universal on-demand allocator.

Can't we just do something like:

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 7e29433c5ecc..cc05a5485365 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -627,6 +627,12 @@ is_uncached_acl(struct posix_acl *acl)
 #define IOP_DEFAULT_READLINK   0x0010
 #define IOP_MGTIME     0x0020

+struct inode_addons {
+        struct fsnotify_mark_connector __rcu    *i_fsnotify_marks;
+        struct bpf_local_storage __rcu          *i_bpf_storage;
+        __u32                                   i_fsnotify_mask; /* all events this inode cares about */
+};
+
 /*
  * Keep mostly read-only and often accessed (especially for
  * the RCU path lookup and 'stat' data) fields at the beginning
@@ -731,12 +737,7 @@ struct inode {
                unsigned                i_dir_seq;
        };

-
-#ifdef CONFIG_FSNOTIFY
-       __u32                   i_fsnotify_mask; /* all events this inode cares about */
-       /* 32-bit hole reserved for expanding i_fsnotify_mask */
-       struct fsnotify_mark_connector __rcu    *i_fsnotify_marks;
-#endif
+       struct inode_addons *i_addons;

 #ifdef CONFIG_FS_ENCRYPTION
        struct fscrypt_inode_info       *i_crypt_info;

Then when either fsnotify or bpf needs that storage they can do a
cmpxchg() based allocation for struct inode_addons just like I did with
f_owner:

int file_f_owner_allocate(struct file *file)
{
	struct fown_struct *f_owner;

	f_owner = file_f_owner(file);
	if (f_owner)
		return 0;

	f_owner = kzalloc(sizeof(struct fown_struct), GFP_KERNEL);
	if (!f_owner)
		return -ENOMEM;

	rwlock_init(&f_owner->lock);
	f_owner->file = file;
	/* If someone else raced us, drop our allocation. */
	if (unlikely(cmpxchg(&file->f_owner, NULL, f_owner)))
		kfree(f_owner);
	return 0;
}

The internal allocations for specific fields are up to the subsystem
ofc. Does that make sense?

> >>>        struct fsnotify_inode_context {
> >>>                struct fsnotify_mark_connector __rcu    *i_fsnotify_marks;
> >>>                struct bpf_local_storage __rcu          *i_bpf_storage;
> >>>                __u32                                   i_fsnotify_mask; /* all events this inode cares about */
> >>>        };

> 
> >> 
> >> This was already done for s_fsnotify_marks, so you can follow the recipe
> >> of 07a3b8d0bf72 ("fsnotify: lazy attach fsnotify_sb_info state to sb")
> >> and create an fsnotify_inode_info container.
> >> 
> > 
> > On second thought, fsnotify_sb_info container is allocated and attached
> > in the context of userspace adding a mark.
> > 
> > If you will need allocate and attach fsnotify_inode_info in the content of
> > fast path fanotify hook in order to add the inode to the map, I don't
> > think that is going to fly??
> 
> Do you mean we may not be able to allocate memory in the fast path 
> hook? AFAICT, the fast path is still in the process context, so I 
> think this is not a problem?
> 
> Thanks,
> Song 
>
Amir Goldstein Nov. 20, 2024, 11:19 a.m. UTC | #14
On Wed, Nov 20, 2024 at 10:28 AM Christian Brauner <brauner@kernel.org> wrote:
>
> On Tue, Nov 19, 2024 at 09:53:20PM +0000, Song Liu wrote:
> > Hi Jeff and Amir,
> >
> > Thanks for your inputs!
> >
> > > On Nov 19, 2024, at 7:30 AM, Amir Goldstein <amir73il@gmail.com> wrote:
> > >
> > > On Tue, Nov 19, 2024 at 4:25 PM Amir Goldstein <amir73il@gmail.com> wrote:
> > >>
> > >> On Tue, Nov 19, 2024 at 3:21 PM Jeff Layton <jlayton@kernel.org> wrote:
> > >>>
> >
> > [...]
> >
> > >>> Longer term, I think it may be beneficial to come up with a way to attach
> > >>>>> private info to the inode in a way that doesn't cost us one pointer per
> > >>>>> funcionality that may possibly attach info to the inode. We already have
> > >>>>> i_crypt_info, i_verity_info, i_flctx, i_security, etc. It's always a tough
> > >>>>> call where the space overhead for everybody is worth the runtime &
> > >>>>> complexity overhead for users using the functionality...
> > >>>>
> > >>>> It does seem to be the right long term solution, and I am willing to
> > >>>> work on it. However, I would really appreciate some positive feedback
> > >>>> on the idea, so that I have better confidence my weeks of work has a
> > >>>> better chance to worth it.
> > >>>>
> > >>>> Thanks,
> > >>>> Song
> > >>>>
> > >>>> [1] https://github.com/systemd/systemd/blob/main/src/core/bpf/restrict_fs/restrict-fs.bpf.c
> > >>>
> > >>> fsnotify is somewhat similar to file locking in that few inodes on the
> > >>> machine actually utilize these fields.
> > >>>
> > >>> For file locking, we allocate and populate the inode->i_flctx field on
> > >>> an as-needed basis. The kernel then hangs on to that struct until the
> > >>> inode is freed.
> >
> > If we have some universal on-demand per-inode memory allocator,
> > I guess we can move i_flctx to it?
> >
> > >>> We could do something similar here. We have this now:
> > >>>
> > >>> #ifdef CONFIG_FSNOTIFY
> > >>>        __u32                   i_fsnotify_mask; /* all events this inode cares about */
> > >>>        /* 32-bit hole reserved for expanding i_fsnotify_mask */
> > >>>        struct fsnotify_mark_connector __rcu    *i_fsnotify_marks;
> > >>> #endif
> >
> > And maybe some fsnotify fields too?
> >
> > With a couple users, I think it justifies to have some universal
> > on-demand allocator.
> >
> > >>> What if you were to turn these fields into a pointer to a new struct:
> > >>>
> > >>>        struct fsnotify_inode_context {
> > >>>                struct fsnotify_mark_connector __rcu    *i_fsnotify_marks;
> > >>>                struct bpf_local_storage __rcu          *i_bpf_storage;
> > >>>                __u32                                   i_fsnotify_mask; /* all events this inode cares about */
> > >>>        };
> > >>>
> > >>
> > >> The extra indirection is going to hurt for i_fsnotify_mask
> > >> it is being accessed frequently in fsnotify hooks, so I wouldn't move it
> > >> into a container, but it could be moved to the hole after i_state.
> >
> > >>> Then whenever you have to populate any of these fields, you just
> > >>> allocate one of these structs and set the inode up to point to it.
> > >>> They're tiny too, so don't bother freeing it until the inode is
> > >>> deallocated.
> > >>>
> > >>> It'd mean rejiggering a fair bit of fsnotify code, but it would give
> > >>> the fsnotify code an easier way to expand per-inode info in the future.
> > >>> It would also slightly shrink struct inode too.
> >
> > I am hoping to make i_bpf_storage available to tracing programs.
> > Therefore, I would rather not limit it to fsnotify context. We can
> > still use the universal on-demand allocator.
>
> Can't we just do something like:
>
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 7e29433c5ecc..cc05a5485365 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -627,6 +627,12 @@ is_uncached_acl(struct posix_acl *acl)
>  #define IOP_DEFAULT_READLINK   0x0010
>  #define IOP_MGTIME     0x0020
>
> +struct inode_addons {
> +        struct fsnotify_mark_connector __rcu    *i_fsnotify_marks;
> +        struct bpf_local_storage __rcu          *i_bpf_storage;
> +        __u32                                   i_fsnotify_mask; /* all events this inode cares about */
> +};
> +
>  /*
>   * Keep mostly read-only and often accessed (especially for
>   * the RCU path lookup and 'stat' data) fields at the beginning
> @@ -731,12 +737,7 @@ struct inode {
>                 unsigned                i_dir_seq;
>         };
>
> -
> -#ifdef CONFIG_FSNOTIFY
> -       __u32                   i_fsnotify_mask; /* all events this inode cares about */
> -       /* 32-bit hole reserved for expanding i_fsnotify_mask */
> -       struct fsnotify_mark_connector __rcu    *i_fsnotify_marks;
> -#endif
> +       struct inode_addons *i_addons;
>
>  #ifdef CONFIG_FS_ENCRYPTION
>         struct fscrypt_inode_info       *i_crypt_info;
>
> Then when either fsnotify or bpf needs that storage they can do a
> cmpxchg() based allocation for struct inode_addons just like I did with
> f_owner:
>
> int file_f_owner_allocate(struct file *file)
> {
>         struct fown_struct *f_owner;
>
>         f_owner = file_f_owner(file);
>         if (f_owner)
>                 return 0;
>
>         f_owner = kzalloc(sizeof(struct fown_struct), GFP_KERNEL);
>         if (!f_owner)
>                 return -ENOMEM;
>
>         rwlock_init(&f_owner->lock);
>         f_owner->file = file;
>         /* If someone else raced us, drop our allocation. */
>         if (unlikely(cmpxchg(&file->f_owner, NULL, f_owner)))
>                 kfree(f_owner);
>         return 0;
> }
>
> The internal allocations for specific fields are up to the subsystem
> ofc. Does that make sense?
>

Maybe, but as I wrote, i_fsnotify_mask should not be moved out
of inode struct, because it is accessed in fast paths of fsnotify vfs
hooks, where we do not want to have to deref another context,
but i_fsnotify_mask can be moved to the hole after i_state.

And why stop at i_fsnotify/i_bpf?
If you go to "addons" why not also move i_security/i_crypt/i_verify?
Need to have some common rationale behind those decisions.

Thanks,
Amir.
Song Liu Nov. 21, 2024, 8:08 a.m. UTC | #15
> On Nov 20, 2024, at 1:28 AM, Christian Brauner <brauner@kernel.org> wrote:

[...]

>>>>> Then whenever you have to populate any of these fields, you just
>>>>> allocate one of these structs and set the inode up to point to it.
>>>>> They're tiny too, so don't bother freeing it until the inode is
>>>>> deallocated.
>>>>> 
>>>>> It'd mean rejiggering a fair bit of fsnotify code, but it would give
>>>>> the fsnotify code an easier way to expand per-inode info in the future.
>>>>> It would also slightly shrink struct inode too.
>> 
>> I am hoping to make i_bpf_storage available to tracing programs. 
>> Therefore, I would rather not limit it to fsnotify context. We can
>> still use the universal on-demand allocator.
> 
> Can't we just do something like:
> 
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 7e29433c5ecc..cc05a5485365 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -627,6 +627,12 @@ is_uncached_acl(struct posix_acl *acl)
> #define IOP_DEFAULT_READLINK   0x0010
> #define IOP_MGTIME     0x0020
> 
> +struct inode_addons {
> +        struct fsnotify_mark_connector __rcu    *i_fsnotify_marks;
> +        struct bpf_local_storage __rcu          *i_bpf_storage;
> +        __u32                                   i_fsnotify_mask; /* all events this inode cares about */
> +};
> +
> /*
>  * Keep mostly read-only and often accessed (especially for
>  * the RCU path lookup and 'stat' data) fields at the beginning
> @@ -731,12 +737,7 @@ struct inode {
>                unsigned                i_dir_seq;
>        };
> 
> -
> -#ifdef CONFIG_FSNOTIFY
> -       __u32                   i_fsnotify_mask; /* all events this inode cares about */
> -       /* 32-bit hole reserved for expanding i_fsnotify_mask */
> -       struct fsnotify_mark_connector __rcu    *i_fsnotify_marks;
> -#endif
> +       struct inode_addons *i_addons;
> 
> #ifdef CONFIG_FS_ENCRYPTION
>        struct fscrypt_inode_info       *i_crypt_info;
> 
> Then when either fsnotify or bpf needs that storage they can do a
> cmpxchg() based allocation for struct inode_addons just like I did with
> f_owner:
> 
> int file_f_owner_allocate(struct file *file)
> {
> struct fown_struct *f_owner;
> 
> f_owner = file_f_owner(file);
> if (f_owner)
> return 0;
> 
> f_owner = kzalloc(sizeof(struct fown_struct), GFP_KERNEL);
> if (!f_owner)
> return -ENOMEM;
> 
> rwlock_init(&f_owner->lock);
> f_owner->file = file;
> /* If someone else raced us, drop our allocation. */
> if (unlikely(cmpxchg(&file->f_owner, NULL, f_owner)))
> kfree(f_owner);
> return 0;
> }
> 
> The internal allocations for specific fields are up to the subsystem
> ofc. Does that make sense?

This works for fsnotify/fanotify. However, for tracing use cases, 
this is not as reliable as other (task, cgroup, sock) local storage. 
BPF tracing programs need to work in any contexts, including NMI. 
Therefore, doing kzalloc(GFP_KERNEL) is not always safe for 
tracing use cases. OTOH, bpf local storage works in NMI. If we have 
a i_bpf_storage pointer in struct inode, bpf inode storage will work 
in NMI. 

Thanks,
Song
Christian Brauner Nov. 21, 2024, 8:43 a.m. UTC | #16
On Wed, Nov 20, 2024 at 12:19:51PM +0100, Amir Goldstein wrote:
> On Wed, Nov 20, 2024 at 10:28 AM Christian Brauner <brauner@kernel.org> wrote:
> >
> > On Tue, Nov 19, 2024 at 09:53:20PM +0000, Song Liu wrote:
> > > Hi Jeff and Amir,
> > >
> > > Thanks for your inputs!
> > >
> > > > On Nov 19, 2024, at 7:30 AM, Amir Goldstein <amir73il@gmail.com> wrote:
> > > >
> > > > On Tue, Nov 19, 2024 at 4:25 PM Amir Goldstein <amir73il@gmail.com> wrote:
> > > >>
> > > >> On Tue, Nov 19, 2024 at 3:21 PM Jeff Layton <jlayton@kernel.org> wrote:
> > > >>>
> > >
> > > [...]
> > >
> > > >>> Longer term, I think it may be beneficial to come up with a way to attach
> > > >>>>> private info to the inode in a way that doesn't cost us one pointer per
> > > >>>>> funcionality that may possibly attach info to the inode. We already have
> > > >>>>> i_crypt_info, i_verity_info, i_flctx, i_security, etc. It's always a tough
> > > >>>>> call where the space overhead for everybody is worth the runtime &
> > > >>>>> complexity overhead for users using the functionality...
> > > >>>>
> > > >>>> It does seem to be the right long term solution, and I am willing to
> > > >>>> work on it. However, I would really appreciate some positive feedback
> > > >>>> on the idea, so that I have better confidence my weeks of work has a
> > > >>>> better chance to worth it.
> > > >>>>
> > > >>>> Thanks,
> > > >>>> Song
> > > >>>>
> > > >>>> [1] https://github.com/systemd/systemd/blob/main/src/core/bpf/restrict_fs/restrict-fs.bpf.c
> > > >>>
> > > >>> fsnotify is somewhat similar to file locking in that few inodes on the
> > > >>> machine actually utilize these fields.
> > > >>>
> > > >>> For file locking, we allocate and populate the inode->i_flctx field on
> > > >>> an as-needed basis. The kernel then hangs on to that struct until the
> > > >>> inode is freed.
> > >
> > > If we have some universal on-demand per-inode memory allocator,
> > > I guess we can move i_flctx to it?
> > >
> > > >>> We could do something similar here. We have this now:
> > > >>>
> > > >>> #ifdef CONFIG_FSNOTIFY
> > > >>>        __u32                   i_fsnotify_mask; /* all events this inode cares about */
> > > >>>        /* 32-bit hole reserved for expanding i_fsnotify_mask */
> > > >>>        struct fsnotify_mark_connector __rcu    *i_fsnotify_marks;
> > > >>> #endif
> > >
> > > And maybe some fsnotify fields too?
> > >
> > > With a couple users, I think it justifies to have some universal
> > > on-demand allocator.
> > >
> > > >>> What if you were to turn these fields into a pointer to a new struct:
> > > >>>
> > > >>>        struct fsnotify_inode_context {
> > > >>>                struct fsnotify_mark_connector __rcu    *i_fsnotify_marks;
> > > >>>                struct bpf_local_storage __rcu          *i_bpf_storage;
> > > >>>                __u32                                   i_fsnotify_mask; /* all events this inode cares about */
> > > >>>        };
> > > >>>
> > > >>
> > > >> The extra indirection is going to hurt for i_fsnotify_mask
> > > >> it is being accessed frequently in fsnotify hooks, so I wouldn't move it
> > > >> into a container, but it could be moved to the hole after i_state.
> > >
> > > >>> Then whenever you have to populate any of these fields, you just
> > > >>> allocate one of these structs and set the inode up to point to it.
> > > >>> They're tiny too, so don't bother freeing it until the inode is
> > > >>> deallocated.
> > > >>>
> > > >>> It'd mean rejiggering a fair bit of fsnotify code, but it would give
> > > >>> the fsnotify code an easier way to expand per-inode info in the future.
> > > >>> It would also slightly shrink struct inode too.
> > >
> > > I am hoping to make i_bpf_storage available to tracing programs.
> > > Therefore, I would rather not limit it to fsnotify context. We can
> > > still use the universal on-demand allocator.
> >
> > Can't we just do something like:
> >
> > diff --git a/include/linux/fs.h b/include/linux/fs.h
> > index 7e29433c5ecc..cc05a5485365 100644
> > --- a/include/linux/fs.h
> > +++ b/include/linux/fs.h
> > @@ -627,6 +627,12 @@ is_uncached_acl(struct posix_acl *acl)
> >  #define IOP_DEFAULT_READLINK   0x0010
> >  #define IOP_MGTIME     0x0020
> >
> > +struct inode_addons {
> > +        struct fsnotify_mark_connector __rcu    *i_fsnotify_marks;
> > +        struct bpf_local_storage __rcu          *i_bpf_storage;
> > +        __u32                                   i_fsnotify_mask; /* all events this inode cares about */
> > +};
> > +
> >  /*
> >   * Keep mostly read-only and often accessed (especially for
> >   * the RCU path lookup and 'stat' data) fields at the beginning
> > @@ -731,12 +737,7 @@ struct inode {
> >                 unsigned                i_dir_seq;
> >         };
> >
> > -
> > -#ifdef CONFIG_FSNOTIFY
> > -       __u32                   i_fsnotify_mask; /* all events this inode cares about */
> > -       /* 32-bit hole reserved for expanding i_fsnotify_mask */
> > -       struct fsnotify_mark_connector __rcu    *i_fsnotify_marks;
> > -#endif
> > +       struct inode_addons *i_addons;
> >
> >  #ifdef CONFIG_FS_ENCRYPTION
> >         struct fscrypt_inode_info       *i_crypt_info;
> >
> > Then when either fsnotify or bpf needs that storage they can do a
> > cmpxchg() based allocation for struct inode_addons just like I did with
> > f_owner:
> >
> > int file_f_owner_allocate(struct file *file)
> > {
> >         struct fown_struct *f_owner;
> >
> >         f_owner = file_f_owner(file);
> >         if (f_owner)
> >                 return 0;
> >
> >         f_owner = kzalloc(sizeof(struct fown_struct), GFP_KERNEL);
> >         if (!f_owner)
> >                 return -ENOMEM;
> >
> >         rwlock_init(&f_owner->lock);
> >         f_owner->file = file;
> >         /* If someone else raced us, drop our allocation. */
> >         if (unlikely(cmpxchg(&file->f_owner, NULL, f_owner)))
> >                 kfree(f_owner);
> >         return 0;
> > }
> >
> > The internal allocations for specific fields are up to the subsystem
> > ofc. Does that make sense?
> >
> 
> Maybe, but as I wrote, i_fsnotify_mask should not be moved out
> of inode struct, because it is accessed in fast paths of fsnotify vfs
> hooks, where we do not want to have to deref another context,
> but i_fsnotify_mask can be moved to the hole after i_state.
> 
> And why stop at i_fsnotify/i_bpf?
> If you go to "addons" why not also move i_security/i_crypt/i_verify?
> Need to have some common rationale behind those decisions.

The rationale is that we need a mechanism to stop bloating our
structures with ever more stuff somehow. What happens to older members
of struct inode is a cleanup matter and then it needs to be seen what
can be moved into a substruct and not mind the additional pointer chase.

It's just a generalization of your proposal in a way because I don't
understand why you would move the bpf stuff into fsnotify specific
parts.
Christian Brauner Nov. 21, 2024, 9:04 a.m. UTC | #17
On Wed, Nov 13, 2024 at 02:15:20PM +0000, Song Liu wrote:
> Hi Christian, 
> 
> Thanks for your review. 
> 
> > On Nov 13, 2024, at 2:19 AM, Christian Brauner <brauner@kernel.org> wrote:
> [...]
> 
> >> diff --git a/include/linux/fs.h b/include/linux/fs.h
> >> index 3559446279c1..479097e4dd5b 100644
> >> --- a/include/linux/fs.h
> >> +++ b/include/linux/fs.h
> >> @@ -79,6 +79,7 @@ struct fs_context;
> >> struct fs_parameter_spec;
> >> struct fileattr;
> >> struct iomap_ops;
> >> +struct bpf_local_storage;
> >> 
> >> extern void __init inode_init(void);
> >> extern void __init inode_init_early(void);
> >> @@ -648,6 +649,9 @@ struct inode {
> >> #ifdef CONFIG_SECURITY
> >> void *i_security;
> >> #endif
> >> +#ifdef CONFIG_BPF_SYSCALL
> >> + struct bpf_local_storage __rcu *i_bpf_storage;
> >> +#endif
> > 
> > Sorry, we're not growing struct inode for this. It just keeps getting
> > bigger. Last cycle we freed up 8 bytes to shrink it and we're not going
> > to waste them on special-purpose stuff. We already NAKed someone else's
> > pet field here.
> 
> Would it be acceptable if we union i_bpf_storage with i_security?

I have no quarrels with this if this is acceptable to you.
Christian Brauner Nov. 21, 2024, 9:14 a.m. UTC | #18
> I'm personally not *so* hung up about a pointer in struct inode but I can
> see why Christian is and I agree adding a pointer there isn't a win for
> everybody.

Maybe I'm annoying here but I feel that I have to be. Because it's
trivial to grow structs it's rather cumbersome work to get them to
shrink. I just looked at struct task_struct again and it has four bpf
related structures/pointers in there. Ok, struct task_struct is
everyone's favorite struct to bloat so whatever.

For the VFS the structures we maintain longterm that need to be so
generic as to be usable by so many different filesystems have a tendency
to grow over time.

With some we are very strict, i.e., nobody would dare to grow struct
dentry and that's mostly because we have people that really care about
this and have an eye on that and ofc also because it's costly.

But for some structures we simply have no one caring about them that
much. So what happens is that with the ever growing list of features we
bloat them over time. There need to be some reasonable boundaries on
what we accept or not and the criteria I have been using is how
generically useful to filesystems or our infrastructre this is (roughly)
and this is rather very special-case so I'm wary of wasting 8 bytes in
struct inode that we fought rather hard to get back: Jeff's timespec
conversion and my i_state conversion.

I'm not saying it's out of the question but I want to exhaust all other
options and I'm not yet sure we have.

> Longer term, I think it may be beneficial to come up with a way to attach
> private info to the inode in a way that doesn't cost us one pointer per
> funcionality that may possibly attach info to the inode. We already have

Agreed.
diff mbox series

Patch

diff --git a/fs/inode.c b/fs/inode.c
index 8dabb224f941..3c679578169f 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -286,6 +286,7 @@  static struct inode *alloc_inode(struct super_block *sb)
 void __destroy_inode(struct inode *inode)
 {
 	BUG_ON(inode_has_buffers(inode));
+	bpf_inode_storage_free(inode);
 	inode_detach_wb(inode);
 	security_inode_free(inode);
 	fsnotify_inode_delete(inode);
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 1b84613b10ac..0b31d2e74df6 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2672,6 +2672,7 @@  struct bpf_link *bpf_link_by_id(u32 id);
 const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id,
 						 const struct bpf_prog *prog);
 void bpf_task_storage_free(struct task_struct *task);
+void bpf_inode_storage_free(struct inode *inode);
 void bpf_cgrp_storage_free(struct cgroup *cgroup);
 bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog);
 const struct btf_func_model *
@@ -2942,6 +2943,10 @@  static inline void bpf_task_storage_free(struct task_struct *task)
 {
 }
 
+static inline void bpf_inode_storage_free(struct inode *inode)
+{
+}
+
 static inline bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog)
 {
 	return false;
@@ -3305,6 +3310,10 @@  extern const struct bpf_func_proto bpf_task_storage_get_recur_proto;
 extern const struct bpf_func_proto bpf_task_storage_get_proto;
 extern const struct bpf_func_proto bpf_task_storage_delete_recur_proto;
 extern const struct bpf_func_proto bpf_task_storage_delete_proto;
+extern const struct bpf_func_proto bpf_inode_storage_get_proto;
+extern const struct bpf_func_proto bpf_inode_storage_get_recur_proto;
+extern const struct bpf_func_proto bpf_inode_storage_delete_proto;
+extern const struct bpf_func_proto bpf_inode_storage_delete_recur_proto;
 extern const struct bpf_func_proto bpf_for_each_map_elem_proto;
 extern const struct bpf_func_proto bpf_btf_find_by_name_kind_proto;
 extern const struct bpf_func_proto bpf_sk_setsockopt_proto;
diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h
index aefcd6564251..a819c2f0a062 100644
--- a/include/linux/bpf_lsm.h
+++ b/include/linux/bpf_lsm.h
@@ -19,31 +19,12 @@ 
 #include <linux/lsm_hook_defs.h>
 #undef LSM_HOOK
 
-struct bpf_storage_blob {
-	struct bpf_local_storage __rcu *storage;
-};
-
-extern struct lsm_blob_sizes bpf_lsm_blob_sizes;
-
 int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog,
 			const struct bpf_prog *prog);
 
 bool bpf_lsm_is_sleepable_hook(u32 btf_id);
 bool bpf_lsm_is_trusted(const struct bpf_prog *prog);
 
-static inline struct bpf_storage_blob *bpf_inode(
-	const struct inode *inode)
-{
-	if (unlikely(!inode->i_security))
-		return NULL;
-
-	return inode->i_security + bpf_lsm_blob_sizes.lbs_inode;
-}
-
-extern const struct bpf_func_proto bpf_inode_storage_get_proto;
-extern const struct bpf_func_proto bpf_inode_storage_delete_proto;
-void bpf_inode_storage_free(struct inode *inode);
-
 void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, bpf_func_t *bpf_func);
 
 int bpf_lsm_get_retval_range(const struct bpf_prog *prog,
@@ -66,16 +47,6 @@  static inline int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog,
 	return -EOPNOTSUPP;
 }
 
-static inline struct bpf_storage_blob *bpf_inode(
-	const struct inode *inode)
-{
-	return NULL;
-}
-
-static inline void bpf_inode_storage_free(struct inode *inode)
-{
-}
-
 static inline void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog,
 					   bpf_func_t *bpf_func)
 {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3559446279c1..479097e4dd5b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -79,6 +79,7 @@  struct fs_context;
 struct fs_parameter_spec;
 struct fileattr;
 struct iomap_ops;
+struct bpf_local_storage;
 
 extern void __init inode_init(void);
 extern void __init inode_init_early(void);
@@ -648,6 +649,9 @@  struct inode {
 #ifdef CONFIG_SECURITY
 	void			*i_security;
 #endif
+#ifdef CONFIG_BPF_SYSCALL
+	struct bpf_local_storage __rcu *i_bpf_storage;
+#endif
 
 	/* Stat data, not accessed from path walking */
 	unsigned long		i_ino;
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 105328f0b9c0..73604c7130f1 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -10,8 +10,7 @@  obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o
 obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o
 obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
-obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
-obj-${CONFIG_BPF_LSM}	  += bpf_inode_storage.o
+obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o bpf_inode_storage.o
 obj-$(CONFIG_BPF_SYSCALL) += disasm.o mprog.o
 obj-$(CONFIG_BPF_JIT) += trampoline.o
 obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index 44ccebc745e5..8d5a9bfe6643 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -21,16 +21,11 @@ 
 
 DEFINE_BPF_STORAGE_CACHE(inode_cache);
 
-static struct bpf_local_storage __rcu **
-inode_storage_ptr(void *owner)
+static struct bpf_local_storage __rcu **inode_storage_ptr(void *owner)
 {
 	struct inode *inode = owner;
-	struct bpf_storage_blob *bsb;
 
-	bsb = bpf_inode(inode);
-	if (!bsb)
-		return NULL;
-	return &bsb->storage;
+	return &inode->i_bpf_storage;
 }
 
 static struct bpf_local_storage_data *inode_storage_lookup(struct inode *inode,
@@ -39,14 +34,9 @@  static struct bpf_local_storage_data *inode_storage_lookup(struct inode *inode,
 {
 	struct bpf_local_storage *inode_storage;
 	struct bpf_local_storage_map *smap;
-	struct bpf_storage_blob *bsb;
-
-	bsb = bpf_inode(inode);
-	if (!bsb)
-		return NULL;
 
 	inode_storage =
-		rcu_dereference_check(bsb->storage, bpf_rcu_lock_held());
+		rcu_dereference_check(inode->i_bpf_storage, bpf_rcu_lock_held());
 	if (!inode_storage)
 		return NULL;
 
@@ -57,15 +47,10 @@  static struct bpf_local_storage_data *inode_storage_lookup(struct inode *inode,
 void bpf_inode_storage_free(struct inode *inode)
 {
 	struct bpf_local_storage *local_storage;
-	struct bpf_storage_blob *bsb;
-
-	bsb = bpf_inode(inode);
-	if (!bsb)
-		return;
 
 	rcu_read_lock();
 
-	local_storage = rcu_dereference(bsb->storage);
+	local_storage = rcu_dereference(inode->i_bpf_storage);
 	if (!local_storage) {
 		rcu_read_unlock();
 		return;
@@ -95,8 +80,6 @@  static long bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key,
 
 	if (fd_empty(f))
 		return -EBADF;
-	if (!inode_storage_ptr(file_inode(fd_file(f))))
-		return -EBADF;
 
 	sdata = bpf_local_storage_update(file_inode(fd_file(f)),
 					 (struct bpf_local_storage_map *)map,
@@ -136,12 +119,7 @@  BPF_CALL_5(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
 	if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE))
 		return (unsigned long)NULL;
 
-	/* explicitly check that the inode_storage_ptr is not
-	 * NULL as inode_storage_lookup returns NULL in this case and
-	 * bpf_local_storage_update expects the owner to have a
-	 * valid storage pointer.
-	 */
-	if (!inode || !inode_storage_ptr(inode))
+	if (!inode)
 		return (unsigned long)NULL;
 
 	sdata = inode_storage_lookup(inode, map, true);
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index 3bc61628ab25..6b6e0730e515 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -231,10 +231,6 @@  bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	}
 
 	switch (func_id) {
-	case BPF_FUNC_inode_storage_get:
-		return &bpf_inode_storage_get_proto;
-	case BPF_FUNC_inode_storage_delete:
-		return &bpf_inode_storage_delete_proto;
 #ifdef CONFIG_NET
 	case BPF_FUNC_sk_storage_get:
 		return &bpf_sk_storage_get_proto;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 949a3870946c..262bd101ea0b 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1553,6 +1553,10 @@  bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		if (bpf_prog_check_recur(prog))
 			return &bpf_task_storage_delete_recur_proto;
 		return &bpf_task_storage_delete_proto;
+	case BPF_FUNC_inode_storage_get:
+		return &bpf_inode_storage_get_proto;
+	case BPF_FUNC_inode_storage_delete:
+		return &bpf_inode_storage_delete_proto;
 	case BPF_FUNC_for_each_map_elem:
 		return &bpf_for_each_map_elem_proto;
 	case BPF_FUNC_snprintf:
diff --git a/security/bpf/hooks.c b/security/bpf/hooks.c
index db759025abe1..67719a04bb0b 100644
--- a/security/bpf/hooks.c
+++ b/security/bpf/hooks.c
@@ -12,7 +12,6 @@  static struct security_hook_list bpf_lsm_hooks[] __ro_after_init = {
 	LSM_HOOK_INIT(NAME, bpf_lsm_##NAME),
 	#include <linux/lsm_hook_defs.h>
 	#undef LSM_HOOK
-	LSM_HOOK_INIT(inode_free_security, bpf_inode_storage_free),
 };
 
 static const struct lsm_id bpf_lsmid = {
@@ -28,12 +27,7 @@  static int __init bpf_lsm_init(void)
 	return 0;
 }
 
-struct lsm_blob_sizes bpf_lsm_blob_sizes __ro_after_init = {
-	.lbs_inode = sizeof(struct bpf_storage_blob),
-};
-
 DEFINE_LSM(bpf) = {
 	.name = "bpf",
 	.init = bpf_lsm_init,
-	.blobs = &bpf_lsm_blob_sizes
 };