diff mbox series

[v3,fanotify,1/2] fanotify: Introduce fanotify filter

Message ID 20241122225958.1775625-2-song@kernel.org (mailing list archive)
State Handled Elsewhere
Headers show
Series Fanotify in kernel filter | expand

Commit Message

Song Liu Nov. 22, 2024, 10:59 p.m. UTC
fanotify filter enables handling fanotify events within the kernel, and
thus saves a trip to the user space. fanotify filter can be useful in
many use cases. For example, if a user is only interested in events for
some files inside a directory, a filter can be used to filter out
irrelevant events.

fanotify filter is attached to fsnotify_group. At most one filter can
be attached to a fsnotify_group. Attaching and detaching a filter is
controlled by two new ioctls on the fanotify fds: FAN_IOC_ADD_FILTER
and FAN_IOC_DEL_FILTER.

fanotify filter is packaged in a kernel module. In the future, it is
also possible to package fanotify filter in a BPF program. Since loading
modules requires CAP_SYS_ADMIN, _loading_ fanotify filter in kernel
modules is limited to CAP_SYS_ADMIN. However, non-CAP_SYS_ADMIN users
can _attach_ filters loaded by a sys admin to their fanotify fds. The owner
of the fanotify filter can use the flag FAN_FILTER_F_SYS_ADMIN_ONLY to
make a filter available only to users with CAP_SYS_ADMIN.

To make fanotify filters more flexible, a filter can take arguments at
attach time.

sysfs entry /sys/kernel/fanotify_filter is added to help users know
which fanotify filters are available. At the moment, these files are
added for each filter: flags, desc, and init_args.

Signed-off-by: Song Liu <song@kernel.org>
---
 fs/notify/fanotify/Kconfig           |  13 ++
 fs/notify/fanotify/Makefile          |   1 +
 fs/notify/fanotify/fanotify.c        |  44 +++-
 fs/notify/fanotify/fanotify_filter.c | 289 +++++++++++++++++++++++++++
 fs/notify/fanotify/fanotify_user.c   |   7 +
 include/linux/fanotify.h             | 128 ++++++++++++
 include/linux/fsnotify_backend.h     |   6 +-
 include/uapi/linux/fanotify.h        |  36 ++++
 8 files changed, 520 insertions(+), 4 deletions(-)
 create mode 100644 fs/notify/fanotify/fanotify_filter.c

Comments

Amir Goldstein Nov. 24, 2024, 6:25 a.m. UTC | #1
On Sat, Nov 23, 2024 at 12:00 AM Song Liu <song@kernel.org> wrote:
>
> fanotify filter enables handling fanotify events within the kernel, and
> thus saves a trip to the user space. fanotify filter can be useful in
> many use cases. For example, if a user is only interested in events for
> some files in side a directory, a filter can be used to filter out
> irrelevant events.
>
> fanotify filter is attached to fsnotify_group. At most one filter can
> be attached to a fsnotify_group. The attach/detach of filter are
> controlled by two new ioctls on the fanotify fds: FAN_IOC_ADD_FILTER
> and FAN_IOC_DEL_FILTER.
>
> fanotify filter is packaged in a kernel module. In the future, it is
> also possible to package fanotify filter in a BPF program. Since loading
> modules requires CAP_SYS_ADMIN, _loading_ fanotify filter in kernel
> modules is limited to CAP_SYS_ADMIN. However, non-SYS_CAP_ADMIN users
> can _attach_ filter loaded by sys admin to their fanotify fds. The owner
> of the fanotify fitler can use flag FAN_FILTER_F_SYS_ADMIN_ONLY to
> make a filter available only to users with CAP_SYS_ADMIN.
>
> To make fanotify filters more flexible, a filter can take arguments at
> attach time.
>
> sysfs entry /sys/kernel/fanotify_filter is added to help users know
> which fanotify filters are available. At the moment, these files are
> added for each filter: flags, desc, and init_args.

It's a shame that we have fanotify knobs both at /proc/sys/fs/fanotify/ and
in sysfs, but I understand we don't want to make more use of proc for this.

Still I would add the filter files under a new /sys/fs/fanotify/ dir and not
directly under /sys/kernel/

>
> Signed-off-by: Song Liu <song@kernel.org>
> ---
>  fs/notify/fanotify/Kconfig           |  13 ++
>  fs/notify/fanotify/Makefile          |   1 +
>  fs/notify/fanotify/fanotify.c        |  44 +++-
>  fs/notify/fanotify/fanotify_filter.c | 289 +++++++++++++++++++++++++++
>  fs/notify/fanotify/fanotify_user.c   |   7 +
>  include/linux/fanotify.h             | 128 ++++++++++++
>  include/linux/fsnotify_backend.h     |   6 +-
>  include/uapi/linux/fanotify.h        |  36 ++++
>  8 files changed, 520 insertions(+), 4 deletions(-)
>  create mode 100644 fs/notify/fanotify/fanotify_filter.c
>
> diff --git a/fs/notify/fanotify/Kconfig b/fs/notify/fanotify/Kconfig
> index 0e36aaf379b7..abfd59d95f49 100644
> --- a/fs/notify/fanotify/Kconfig
> +++ b/fs/notify/fanotify/Kconfig
> @@ -24,3 +24,16 @@ config FANOTIFY_ACCESS_PERMISSIONS
>            hierarchical storage management systems.
>
>            If unsure, say N.
> +
> +config FANOTIFY_FILTER
> +       bool "fanotify in kernel filter"
> +       depends on FANOTIFY
> +       default y
> +       help
> +          Say Y here if you want to use fanotify in kernel filter.
> +          The filter can be implemented in a kernel module or a
> +          BPF program. The filter can speed up fanotify in many
> +          use cases. For example, when the listener is only interested in
> +          a subset of events.
> +
> +          If unsure, say Y.
> \ No newline at end of file
> diff --git a/fs/notify/fanotify/Makefile b/fs/notify/fanotify/Makefile
> index 25ef222915e5..d95ec0aeffb5 100644
> --- a/fs/notify/fanotify/Makefile
> +++ b/fs/notify/fanotify/Makefile
> @@ -1,2 +1,3 @@
>  # SPDX-License-Identifier: GPL-2.0-only
>  obj-$(CONFIG_FANOTIFY)         += fanotify.o fanotify_user.o
> +obj-$(CONFIG_FANOTIFY_FILTER)  += fanotify_filter.o
> diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
> index 224bccaab4cc..c70184cd2d45 100644
> --- a/fs/notify/fanotify/fanotify.c
> +++ b/fs/notify/fanotify/fanotify.c
> @@ -18,6 +18,8 @@
>
>  #include "fanotify.h"
>
> +extern struct srcu_struct fsnotify_mark_srcu;
> +
>  static bool fanotify_path_equal(const struct path *p1, const struct path *p2)
>  {
>         return p1->mnt == p2->mnt && p1->dentry == p2->dentry;
> @@ -888,6 +890,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask,
>         struct fsnotify_event *fsn_event;
>         __kernel_fsid_t fsid = {};
>         u32 match_mask = 0;
> +       struct fanotify_filter_hook *filter_hook __maybe_unused;
>
>         BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS);
>         BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY);
> @@ -921,6 +924,39 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask,
>         pr_debug("%s: group=%p mask=%x report_mask=%x\n", __func__,
>                  group, mask, match_mask);
>
> +       if (FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS))
> +               fsid = fanotify_get_fsid(iter_info);
> +
> +#ifdef CONFIG_FANOTIFY_FILTER
> +       filter_hook = srcu_dereference(group->fanotify_data.filter_hook, &fsnotify_mark_srcu);

Do we actually need the sleeping rcu protection for calling the hook?
Can regular rcu read side be nested inside srcu read side?

Jan,

I don't remember why srcu is needed since we are not holding it
when waiting for userspace anymore?

> +       if (filter_hook) {
> +               struct fanotify_filter_event filter_event = {
> +                       .mask = mask,
> +                       .data = data,
> +                       .data_type = data_type,
> +                       .dir = dir,
> +                       .file_name = file_name,
> +                       .fsid = &fsid,
> +                       .match_mask = match_mask,
> +               };
> +
> +               ret = filter_hook->ops->filter(group, filter_hook, &filter_event);
> +
> +               /*
> +                * The filter may return
> +                * - FAN_FILTER_RET_SEND_TO_USERSPACE => continue the rest;
> +                * - FAN_FILTER_RET_SKIP_EVENT => return 0 now;
> +                * - < 0 error => return error now.
> +                *
> +                * For the latter two cases, we can just return ret.
> +                */
> +               BUILD_BUG_ON(FAN_FILTER_RET_SKIP_EVENT != 0);
> +
> +               if (ret != FAN_FILTER_RET_SEND_TO_USERSPACE)
> +                       return ret;
> +       }
> +#endif
> +
>         if (fanotify_is_perm_event(mask)) {
>                 /*
>                  * fsnotify_prepare_user_wait() fails if we race with mark
> @@ -930,9 +966,6 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask,
>                         return 0;
>         }
>
> -       if (FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS))
> -               fsid = fanotify_get_fsid(iter_info);
> -
>         event = fanotify_alloc_event(group, mask, data, data_type, dir,
>                                      file_name, &fsid, match_mask);
>         ret = -ENOMEM;
> @@ -976,6 +1009,11 @@ static void fanotify_free_group_priv(struct fsnotify_group *group)
>
>         if (mempool_initialized(&group->fanotify_data.error_events_pool))
>                 mempool_exit(&group->fanotify_data.error_events_pool);
> +
> +#ifdef CONFIG_FANOTIFY_FILTER
> +       if (group->fanotify_data.filter_hook)
> +               fanotify_filter_hook_free(group->fanotify_data.filter_hook);
> +#endif
>  }
>
>  static void fanotify_free_path_event(struct fanotify_event *event)
> diff --git a/fs/notify/fanotify/fanotify_filter.c b/fs/notify/fanotify/fanotify_filter.c
> new file mode 100644
> index 000000000000..9215612e2fcb
> --- /dev/null
> +++ b/fs/notify/fanotify/fanotify_filter.c
> @@ -0,0 +1,289 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#include <linux/fanotify.h>
> +#include <linux/kobject.h>
> +#include <linux/module.h>
> +
> +#include "fanotify.h"
> +
> +extern struct srcu_struct fsnotify_mark_srcu;
> +
> +static DEFINE_SPINLOCK(filter_list_lock);
> +static LIST_HEAD(filter_list);
> +
> +static struct kobject *fan_filter_root_kobj;
> +
> +static struct {
> +       enum fanotify_filter_flags flag;
> +       const char *name;
> +} fanotify_filter_flags_names[] = {
> +       {
> +               .flag = FAN_FILTER_F_SYS_ADMIN_ONLY,
> +               .name = "SYS_ADMIN_ONLY",
> +       }
> +};
> +
> +static ssize_t flags_show(struct kobject *kobj,
> +                         struct kobj_attribute *attr, char *buf)
> +{
> +       struct fanotify_filter_ops *ops;
> +       ssize_t len = 0;
> +       int i;
> +
> +       ops = container_of(kobj, struct fanotify_filter_ops, kobj);
> +       for (i = 0; i < ARRAY_SIZE(fanotify_filter_flags_names); i++) {
> +               if (ops->flags & fanotify_filter_flags_names[i].flag) {
> +                       len += sysfs_emit_at(buf, len, "%s%s", len ? " " : "",
> +                                            fanotify_filter_flags_names[i].name);
> +               }
> +       }
> +       len += sysfs_emit_at(buf, len, "\n");
> +       return len;
> +}
> +
> +static ssize_t desc_show(struct kobject *kobj,
> +                        struct kobj_attribute *attr, char *buf)
> +{
> +       struct fanotify_filter_ops *ops;
> +
> +       ops = container_of(kobj, struct fanotify_filter_ops, kobj);
> +
> +       return sysfs_emit(buf, "%s\n", ops->desc ?: "N/A");
> +}
> +
> +static ssize_t init_args_show(struct kobject *kobj,
> +                             struct kobj_attribute *attr, char *buf)
> +{
> +       struct fanotify_filter_ops *ops;
> +
> +       ops = container_of(kobj, struct fanotify_filter_ops, kobj);
> +
> +       return sysfs_emit(buf, "%s\n", ops->init_args ?: "N/A");
> +}
> +
> +static struct kobj_attribute flags_kobj_attr = __ATTR_RO(flags);
> +static struct kobj_attribute desc_kobj_attr = __ATTR_RO(desc);
> +static struct kobj_attribute init_args_kobj_attr = __ATTR_RO(init_args);
> +
> +static struct attribute *fan_filter_attrs[] = {
> +       &flags_kobj_attr.attr,
> +       &desc_kobj_attr.attr,
> +       &init_args_kobj_attr.attr,
> +       NULL,
> +};
> +ATTRIBUTE_GROUPS(fan_filter);
> +
> +static void fan_filter_kobj_release(struct kobject *kobj)
> +{
> +}
> +
> +static const struct kobj_type fan_filter_ktype = {
> +       .release = fan_filter_kobj_release,
> +       .sysfs_ops = &kobj_sysfs_ops,
> +       .default_groups = fan_filter_groups,
> +};
> +
> +static struct fanotify_filter_ops *fanotify_filter_find(const char *name)
> +{
> +       struct fanotify_filter_ops *ops;
> +
> +       list_for_each_entry(ops, &filter_list, list) {
> +               if (!strcmp(ops->name, name))
> +                       return ops;
> +       }
> +       return NULL;
> +}
> +
> +static void __fanotify_filter_unregister(struct fanotify_filter_ops *ops)
> +{
> +       spin_lock(&filter_list_lock);
> +       list_del_init(&ops->list);
> +       spin_unlock(&filter_list_lock);
> +}
> +
> +/*
> + * fanotify_filter_register - Register a new filter.
> + *
> + * Add a filter to the filter_list. These filters are
> + * available for all users in the system.
> + *
> + * @ops:       pointer to fanotify_filter_ops to add.
> + *
> + * Returns:
> + *     0       - on success;
> + *     -EEXIST - filter of the same name already exists.
> + *     -ENODEV - fanotify filter was not properly initialized.
> + */
> +int fanotify_filter_register(struct fanotify_filter_ops *ops)
> +{
> +       int ret;
> +
> +       if (!fan_filter_root_kobj)
> +               return -ENODEV;
> +
> +       spin_lock(&filter_list_lock);
> +       if (fanotify_filter_find(ops->name)) {
> +               /* cannot register two filters with the same name */
> +               spin_unlock(&filter_list_lock);
> +               return -EEXIST;
> +       }
> +       list_add_tail(&ops->list, &filter_list);
> +       spin_unlock(&filter_list_lock);
> +
> +
> +       kobject_init(&ops->kobj, &fan_filter_ktype);
> +       ret = kobject_add(&ops->kobj, fan_filter_root_kobj, "%s", ops->name);
> +       if (ret) {
> +               __fanotify_filter_unregister(ops);
> +               return ret;
> +       }
> +       return 0;
> +}
> +EXPORT_SYMBOL_GPL(fanotify_filter_register);
> +
> +/*
> + * fanotify_filter_unregister - Unregister a filter.
> + *
> + * Remove a filter from filter_list.
> + *
> + * @ops:       pointer to fanotify_filter_ops to remove.
> + */
> +void fanotify_filter_unregister(struct fanotify_filter_ops *ops)
> +{
> +       kobject_put(&ops->kobj);
> +       __fanotify_filter_unregister(ops);
> +}
> +EXPORT_SYMBOL_GPL(fanotify_filter_unregister);
> +
> +/*
> + * fanotify_filter_add - Add a filter to fsnotify_group.
> + *
> + * Add a filter from filter_list to a fsnotify_group.
> + *
> + * @group:     fsnotify_group to attach the filter to
> + * @argp:      fanotify_filter_args that specifies the filter
> + *             and the init arguments of the filter.
> + *
> + * Returns:
> + *     0       - on success;
> + *     -EBUSY  - a filter is already attached to the group.
> + */
> +int fanotify_filter_add(struct fsnotify_group *group,
> +                       struct fanotify_filter_args __user *argp)
> +{
> +       struct fanotify_filter_hook *filter_hook;
> +       struct fanotify_filter_ops *filter_ops;
> +       struct fanotify_filter_args args;
> +       void *init_args = NULL;
> +       int ret = 0;
> +
> +       ret = copy_from_user(&args, argp, sizeof(args));
> +       if (ret)
> +               return -EFAULT;
> +
> +       if (args.init_args_size > FAN_FILTER_ARGS_MAX)
> +               return -EINVAL;
> +
> +       args.name[FAN_FILTER_NAME_MAX - 1] = '\0';
> +
> +       fsnotify_group_lock(group);
> +
> +       if (rcu_access_pointer(group->fanotify_data.filter_hook)) {
> +               fsnotify_group_unlock(group);
> +               return -EBUSY;
> +       }
> +
> +       filter_hook = kzalloc(sizeof(*filter_hook), GFP_KERNEL);
> +       if (!filter_hook) {
> +               ret = -ENOMEM;
> +               goto out;
> +       }
> +
> +       spin_lock(&filter_list_lock);
> +       filter_ops = fanotify_filter_find(args.name);
> +       if (!filter_ops || !try_module_get(filter_ops->owner)) {
> +               spin_unlock(&filter_list_lock);
> +               ret = -ENOENT;
> +               goto err_free_hook;
> +       }
> +       spin_unlock(&filter_list_lock);
> +
> +       if (!capable(CAP_SYS_ADMIN) && (filter_ops->flags & FAN_FILTER_F_SYS_ADMIN_ONLY)) {

1. feels better to opt-in for UNPRIV (and maybe later on) rather than
make it the default.
2. need to check that filter_ops->flags has only "known" flags
3. probably need to add filter_ops->version check in case we want to
change the ABI

> +               ret = -EPERM;
> +               goto err_module_put;
> +       }
> +
> +       if (filter_ops->filter_init) {
> +               if (args.init_args_size != filter_ops->init_args_size) {
> +                       ret = -EINVAL;
> +                       goto err_module_put;
> +               }
> +               if (args.init_args_size) {
> +                       init_args = kzalloc(args.init_args_size, GFP_KERNEL);
> +                       if (!init_args) {
> +                               ret = -ENOMEM;
> +                               goto err_module_put;
> +                       }
> +                       if (copy_from_user(init_args, (void __user *)args.init_args,
> +                                          args.init_args_size)) {
> +                               ret = -EFAULT;
> +                               goto err_free_args;
> +                       }
> +
> +               }
> +               ret = filter_ops->filter_init(group, filter_hook, init_args);
> +               if (ret)
> +                       goto err_free_args;
> +               kfree(init_args);
> +       }
> +       filter_hook->ops = filter_ops;
> +       rcu_assign_pointer(group->fanotify_data.filter_hook, filter_hook);
> +
> +out:
> +       fsnotify_group_unlock(group);
> +       return ret;
> +
> +err_free_args:
> +       kfree(init_args);
> +err_module_put:
> +       module_put(filter_ops->owner);
> +err_free_hook:
> +       kfree(filter_hook);
> +       goto out;
> +}
> +
> +void fanotify_filter_hook_free(struct fanotify_filter_hook *filter_hook)
> +{
> +       if (filter_hook->ops->filter_free)
> +               filter_hook->ops->filter_free(filter_hook);
> +
> +       module_put(filter_hook->ops->owner);
> +       kfree(filter_hook);
> +}
> +
> +/*
> + * fanotify_filter_del - Delete a filter from fsnotify_group.
> + */
> +void fanotify_filter_del(struct fsnotify_group *group)
> +{
> +       struct fanotify_filter_hook *filter_hook;
> +
> +       fsnotify_group_lock(group);
> +       filter_hook = group->fanotify_data.filter_hook;
> +       if (!filter_hook)
> +               goto out;
> +
> +       rcu_assign_pointer(group->fanotify_data.filter_hook, NULL);
> +       fanotify_filter_hook_free(filter_hook);

The read side is protected with srcu and there is no srcu/rcu delay of freeing.
You will either need something along the lines of
fsnotify_connector_destroy_workfn() with synchronize_srcu()
or use regular rcu delay and read side (assuming that it can be nested inside
srcu read side?).

Thanks,
Amir.
Song Liu Nov. 24, 2024, 7:08 p.m. UTC | #2
> On Nov 23, 2024, at 10:25 PM, Amir Goldstein <amir73il@gmail.com> wrote:
> 
> On Sat, Nov 23, 2024 at 12:00 AM Song Liu <song@kernel.org> wrote:

[...]

>> 
>> To make fanotify filters more flexible, a filter can take arguments at
>> attach time.
>> 
>> sysfs entry /sys/kernel/fanotify_filter is added to help users know
>> which fanotify filters are available. At the moment, these files are
>> added for each filter: flags, desc, and init_args.
> 
> It's a shame that we have fanotify knobs at /proc/sys/fs/fanotify/ and
> in sysfs, but understand we don't want to make more use of proc for this.
> 
> Still I would add the filter files under a new /sys/fs/fanotify/ dir and not
> directly under /sys/kernel/

I don't have a strong preference either way. We can create it under
/sys/fs/fanotify if that makes more sense. 

> 
>> 
>> Signed-off-by: Song Liu <song@kernel.org>
>> ---
>> fs/notify/fanotify/Kconfig           |  13 ++
>> fs/notify/fanotify/Makefile          |   1 +
>> fs/notify/fanotify/fanotify.c        |  44 +++-
>> fs/notify/fanotify/fanotify_filter.c | 289 +++++++++++++++++++++++++++
>> fs/notify/fanotify/fanotify_user.c   |   7 +
>> include/linux/fanotify.h             | 128 ++++++++++++
>> include/linux/fsnotify_backend.h     |   6 +-
>> include/uapi/linux/fanotify.h        |  36 ++++
>> 8 files changed, 520 insertions(+), 4 deletions(-)
>> create mode 100644 fs/notify/fanotify/fanotify_filter.c
[...]
>> 
>>        BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS);
>>        BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY);
>> @@ -921,6 +924,39 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask,
>>        pr_debug("%s: group=%p mask=%x report_mask=%x\n", __func__,
>>                 group, mask, match_mask);
>> 
>> +       if (FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS))
>> +               fsid = fanotify_get_fsid(iter_info);
>> +
>> +#ifdef CONFIG_FANOTIFY_FILTER
>> +       filter_hook = srcu_dereference(group->fanotify_data.filter_hook, &fsnotify_mark_srcu);
> 
> Do we actually need the sleeping rcu protection for calling the hook?
> Can regular rcu read side be nested inside srcu read side?

I was thinking the filter function can still sleep, for example, to 
read an xattr. 

> 
> Jan,
> 
> I don't remember why srcu is needed since we are not holding it
> when waiting for userspace anymore?
> 
>> +       if (filter_hook) {
>> +               struct fanotify_filter_event filter_event = {
>> +                       .mask = mask,
>> +                       .data = data,
>> +                       .data_type = data_type,
>> +                       .dir = dir,
>> +                       .file_name = file_name,
>> +                       .fsid = &fsid,
>> +                       .match_mask = match_mask,
>> +               };

[...]

>> +
>> +       spin_lock(&filter_list_lock);
>> +       filter_ops = fanotify_filter_find(args.name);
>> +       if (!filter_ops || !try_module_get(filter_ops->owner)) {
>> +               spin_unlock(&filter_list_lock);
>> +               ret = -ENOENT;
>> +               goto err_free_hook;
>> +       }
>> +       spin_unlock(&filter_list_lock);
>> +
>> +       if (!capable(CAP_SYS_ADMIN) && (filter_ops->flags & FAN_FILTER_F_SYS_ADMIN_ONLY)) {
> 
> 1. feels better to opt-in for UNPRIV (and maybe later on) rather than
> make it the default.

Sure. 

> 2. need to check that filter_ops->flags has only "known" flags

Agreed. 

> 3. probably need to add filter_ops->version check in case we want to
> change the ABI

I think we can let the author of the filter handle versioning 
inside filter_init function. Many users may not need any logic
for compatibility. 

> 
>> +               ret = -EPERM;
>> +               goto err_module_put;
>> +       }
>> +

[...]

>> +
>> +/*
>> + * fanotify_filter_del - Delete a filter from fsnotify_group.
>> + */
>> +void fanotify_filter_del(struct fsnotify_group *group)
>> +{
>> +       struct fanotify_filter_hook *filter_hook;
>> +
>> +       fsnotify_group_lock(group);
>> +       filter_hook = group->fanotify_data.filter_hook;
>> +       if (!filter_hook)
>> +               goto out;
>> +
>> +       rcu_assign_pointer(group->fanotify_data.filter_hook, NULL);
>> +       fanotify_filter_hook_free(filter_hook);
> 
> The read side is protected with srcu and there is no srcu/rcu delay of freeing.
> You will either need something along the lines of
> fsnotify_connector_destroy_workfn() with synchronize_srcu()

Yeah, we do need a synchronize_srcu() here. I will fix this in 
the next version. 

> or use regular rcu delay and read side (assuming that it can be nested inside
> srcu read side?).

Thanks for the review!

Song
kernel test robot Nov. 25, 2024, 11:01 a.m. UTC | #3
Hi Song,

kernel test robot noticed the following build warnings:

[auto build test WARNING on jack-fs/fsnotify]
[also build test WARNING on linus/master v6.12 next-20241125]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Song-Liu/fanotify-Introduce-fanotify-filter/20241125-110818
base:   https://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs.git fsnotify
patch link:    https://lore.kernel.org/r/20241122225958.1775625-2-song%40kernel.org
patch subject: [PATCH v3 fanotify 1/2] fanotify: Introduce fanotify filter
config: i386-randconfig-001-20241125 (https://download.01.org/0day-ci/archive/20241125/202411251801.nLqLjFGW-lkp@intel.com/config)
compiler: gcc-12 (Debian 12.2.0-14) 12.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20241125/202411251801.nLqLjFGW-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202411251801.nLqLjFGW-lkp@intel.com/

All warnings (new ones prefixed by >>):

   fs/notify/fanotify/fanotify_filter.c: In function 'fanotify_filter_add':
>> fs/notify/fanotify/fanotify_filter.c:226:55: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast]
     226 |                         if (copy_from_user(init_args, (void __user *)args.init_args,
         |                                                       ^


vim +226 fs/notify/fanotify/fanotify_filter.c

   156	
   157	/*
   158	 * fanotify_filter_add - Add a filter to fsnotify_group.
   159	 *
   160	 * Add a filter from filter_list to a fsnotify_group.
   161	 *
   162	 * @group:	fsnotify_group that will have add
   163	 * @argp:	fanotify_filter_args that specifies the filter
   164	 *		and the init arguments of the filter.
   165	 *
   166	 * Returns:
   167	 *	0	- on success;
   168	 *	-EEXIST	- filter of the same name already exists.
   169	 */
   170	int fanotify_filter_add(struct fsnotify_group *group,
   171				struct fanotify_filter_args __user *argp)
   172	{
   173		struct fanotify_filter_hook *filter_hook;
   174		struct fanotify_filter_ops *filter_ops;
   175		struct fanotify_filter_args args;
   176		void *init_args = NULL;
   177		int ret = 0;
   178	
   179		ret = copy_from_user(&args, argp, sizeof(args));
   180		if (ret)
   181			return -EFAULT;
   182	
   183		if (args.init_args_size > FAN_FILTER_ARGS_MAX)
   184			return -EINVAL;
   185	
   186		args.name[FAN_FILTER_NAME_MAX - 1] = '\0';
   187	
   188		fsnotify_group_lock(group);
   189	
   190		if (rcu_access_pointer(group->fanotify_data.filter_hook)) {
   191			fsnotify_group_unlock(group);
   192			return -EBUSY;
   193		}
   194	
   195		filter_hook = kzalloc(sizeof(*filter_hook), GFP_KERNEL);
   196		if (!filter_hook) {
   197			ret = -ENOMEM;
   198			goto out;
   199		}
   200	
   201		spin_lock(&filter_list_lock);
   202		filter_ops = fanotify_filter_find(args.name);
   203		if (!filter_ops || !try_module_get(filter_ops->owner)) {
   204			spin_unlock(&filter_list_lock);
   205			ret = -ENOENT;
   206			goto err_free_hook;
   207		}
   208		spin_unlock(&filter_list_lock);
   209	
   210		if (!capable(CAP_SYS_ADMIN) && (filter_ops->flags & FAN_FILTER_F_SYS_ADMIN_ONLY)) {
   211			ret = -EPERM;
   212			goto err_module_put;
   213		}
   214	
   215		if (filter_ops->filter_init) {
   216			if (args.init_args_size != filter_ops->init_args_size) {
   217				ret = -EINVAL;
   218				goto err_module_put;
   219			}
   220			if (args.init_args_size) {
   221				init_args = kzalloc(args.init_args_size, GFP_KERNEL);
   222				if (!init_args) {
   223					ret = -ENOMEM;
   224					goto err_module_put;
   225				}
 > 226				if (copy_from_user(init_args, (void __user *)args.init_args,
   227						   args.init_args_size)) {
   228					ret = -EFAULT;
   229					goto err_free_args;
   230				}
   231	
   232			}
   233			ret = filter_ops->filter_init(group, filter_hook, init_args);
   234			if (ret)
   235				goto err_free_args;
   236			kfree(init_args);
   237		}
   238		filter_hook->ops = filter_ops;
   239		rcu_assign_pointer(group->fanotify_data.filter_hook, filter_hook);
   240	
   241	out:
   242		fsnotify_group_unlock(group);
   243		return ret;
   244	
   245	err_free_args:
   246		kfree(init_args);
   247	err_module_put:
   248		module_put(filter_ops->owner);
   249	err_free_hook:
   250		kfree(filter_hook);
   251		goto out;
   252	}
   253
kernel test robot Nov. 26, 2024, 12:06 a.m. UTC | #4
Hi Song,

kernel test robot noticed the following build warnings:

[auto build test WARNING on jack-fs/fsnotify]
[also build test WARNING on linus/master v6.12 next-20241125]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Song-Liu/fanotify-Introduce-fanotify-filter/20241125-110818
base:   https://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs.git fsnotify
patch link:    https://lore.kernel.org/r/20241122225958.1775625-2-song%40kernel.org
patch subject: [PATCH v3 fanotify 1/2] fanotify: Introduce fanotify filter
config: x86_64-randconfig-123-20241125 (https://download.01.org/0day-ci/archive/20241126/202411260746.XEdTrLMj-lkp@intel.com/config)
compiler: gcc-12 (Debian 12.2.0-14) 12.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20241126/202411260746.XEdTrLMj-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202411260746.XEdTrLMj-lkp@intel.com/

sparse warnings: (new ones prefixed by >>)
>> fs/notify/fanotify/fanotify_filter.c:271:21: sparse: sparse: incorrect type in assignment (different address spaces) @@     expected struct fanotify_filter_hook *filter_hook @@     got struct fanotify_filter_hook [noderef] __rcu *filter_hook @@
   fs/notify/fanotify/fanotify_filter.c:271:21: sparse:     expected struct fanotify_filter_hook *filter_hook
   fs/notify/fanotify/fanotify_filter.c:271:21: sparse:     got struct fanotify_filter_hook [noderef] __rcu *filter_hook
   fs/notify/fanotify/fanotify_filter.c: note: in included file (through include/linux/kobject.h, include/linux/fanotify.h):
   include/linux/list.h:83:21: sparse: sparse: self-comparison always evaluates to true
--
>> fs/notify/fanotify/fanotify.c:1015:63: sparse: sparse: incorrect type in argument 1 (different address spaces) @@     expected struct fanotify_filter_hook *filter_hook @@     got struct fanotify_filter_hook [noderef] __rcu *filter_hook @@
   fs/notify/fanotify/fanotify.c:1015:63: sparse:     expected struct fanotify_filter_hook *filter_hook
   fs/notify/fanotify/fanotify.c:1015:63: sparse:     got struct fanotify_filter_hook [noderef] __rcu *filter_hook

vim +271 fs/notify/fanotify/fanotify_filter.c

   262	
   263	/*
   264	 * fanotify_filter_del - Delete a filter from fsnotify_group.
   265	 */
   266	void fanotify_filter_del(struct fsnotify_group *group)
   267	{
   268		struct fanotify_filter_hook *filter_hook;
   269	
   270		fsnotify_group_lock(group);
 > 271		filter_hook = group->fanotify_data.filter_hook;
   272		if (!filter_hook)
   273			goto out;
   274	
   275		rcu_assign_pointer(group->fanotify_data.filter_hook, NULL);
   276		fanotify_filter_hook_free(filter_hook);
   277	
   278	out:
   279		fsnotify_group_unlock(group);
   280	}
   281
Jan Kara Nov. 28, 2024, 4:37 p.m. UTC | #5
On Sun 24-11-24 07:25:20, Amir Goldstein wrote:
> On Sat, Nov 23, 2024 at 12:00 AM Song Liu <song@kernel.org> wrote:
...
> > @@ -921,6 +924,39 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask,
> >         pr_debug("%s: group=%p mask=%x report_mask=%x\n", __func__,
> >                  group, mask, match_mask);
> >
> > +       if (FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS))
> > +               fsid = fanotify_get_fsid(iter_info);
> > +
> > +#ifdef CONFIG_FANOTIFY_FILTER
> > +       filter_hook = srcu_dereference(group->fanotify_data.filter_hook, &fsnotify_mark_srcu);
> 
> Do we actually need the sleeping rcu protection for calling the hook?
> Can regular rcu read side be nested inside srcu read side?

Nesting rcu inside srcu is fine.

> Jan,
> 
> I don't remember why srcu is needed since we are not holding it
> when waiting for userspace anymore?

You need srcu to access marks or notification group unless you grab a
reference to them (which is what waiting for permission event reply code
does).
 
								Honza
diff mbox series

Patch

diff --git a/fs/notify/fanotify/Kconfig b/fs/notify/fanotify/Kconfig
index 0e36aaf379b7..abfd59d95f49 100644
--- a/fs/notify/fanotify/Kconfig
+++ b/fs/notify/fanotify/Kconfig
@@ -24,3 +24,16 @@  config FANOTIFY_ACCESS_PERMISSIONS
 	   hierarchical storage management systems.
 
 	   If unsure, say N.
+
+config FANOTIFY_FILTER
+	bool "fanotify in kernel filter"
+	depends on FANOTIFY
+	default y
+	help
+	   Say Y here if you want to use the fanotify in-kernel filter.
+	   The filter can be implemented in a kernel module or a
+	   BPF program. The filter can speed up fanotify in many
+	   use cases. For example, when the listener is only interested in
+	   a subset of events.
+
+	   If unsure, say Y.
\ No newline at end of file
diff --git a/fs/notify/fanotify/Makefile b/fs/notify/fanotify/Makefile
index 25ef222915e5..d95ec0aeffb5 100644
--- a/fs/notify/fanotify/Makefile
+++ b/fs/notify/fanotify/Makefile
@@ -1,2 +1,3 @@ 
 # SPDX-License-Identifier: GPL-2.0-only
 obj-$(CONFIG_FANOTIFY)		+= fanotify.o fanotify_user.o
+obj-$(CONFIG_FANOTIFY_FILTER)	+= fanotify_filter.o
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 224bccaab4cc..c70184cd2d45 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -18,6 +18,8 @@ 
 
 #include "fanotify.h"
 
+extern struct srcu_struct fsnotify_mark_srcu;
+
 static bool fanotify_path_equal(const struct path *p1, const struct path *p2)
 {
 	return p1->mnt == p2->mnt && p1->dentry == p2->dentry;
@@ -888,6 +890,7 @@  static int fanotify_handle_event(struct fsnotify_group *group, u32 mask,
 	struct fsnotify_event *fsn_event;
 	__kernel_fsid_t fsid = {};
 	u32 match_mask = 0;
+	struct fanotify_filter_hook *filter_hook __maybe_unused;
 
 	BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS);
 	BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY);
@@ -921,6 +924,39 @@  static int fanotify_handle_event(struct fsnotify_group *group, u32 mask,
 	pr_debug("%s: group=%p mask=%x report_mask=%x\n", __func__,
 		 group, mask, match_mask);
 
+	if (FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS))
+		fsid = fanotify_get_fsid(iter_info);
+
+#ifdef CONFIG_FANOTIFY_FILTER
+	filter_hook = srcu_dereference(group->fanotify_data.filter_hook, &fsnotify_mark_srcu);
+	if (filter_hook) {
+		struct fanotify_filter_event filter_event = {
+			.mask = mask,
+			.data = data,
+			.data_type = data_type,
+			.dir = dir,
+			.file_name = file_name,
+			.fsid = &fsid,
+			.match_mask = match_mask,
+		};
+
+		ret = filter_hook->ops->filter(group, filter_hook, &filter_event);
+
+		/*
+		 * The filter may return
+		 * - FAN_FILTER_RET_SEND_TO_USERSPACE => continue the rest;
+		 * - FAN_FILTER_RET_SKIP_EVENT => return 0 now;
+		 * - < 0 error => return error now.
+		 *
+		 * For the latter two cases, we can just return ret.
+		 */
+		BUILD_BUG_ON(FAN_FILTER_RET_SKIP_EVENT != 0);
+
+		if (ret != FAN_FILTER_RET_SEND_TO_USERSPACE)
+			return ret;
+	}
+#endif
+
 	if (fanotify_is_perm_event(mask)) {
 		/*
 		 * fsnotify_prepare_user_wait() fails if we race with mark
@@ -930,9 +966,6 @@  static int fanotify_handle_event(struct fsnotify_group *group, u32 mask,
 			return 0;
 	}
 
-	if (FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS))
-		fsid = fanotify_get_fsid(iter_info);
-
 	event = fanotify_alloc_event(group, mask, data, data_type, dir,
 				     file_name, &fsid, match_mask);
 	ret = -ENOMEM;
@@ -976,6 +1009,11 @@  static void fanotify_free_group_priv(struct fsnotify_group *group)
 
 	if (mempool_initialized(&group->fanotify_data.error_events_pool))
 		mempool_exit(&group->fanotify_data.error_events_pool);
+
+#ifdef CONFIG_FANOTIFY_FILTER
+	if (group->fanotify_data.filter_hook)
+		fanotify_filter_hook_free(group->fanotify_data.filter_hook);
+#endif
 }
 
 static void fanotify_free_path_event(struct fanotify_event *event)
diff --git a/fs/notify/fanotify/fanotify_filter.c b/fs/notify/fanotify/fanotify_filter.c
new file mode 100644
index 000000000000..9215612e2fcb
--- /dev/null
+++ b/fs/notify/fanotify/fanotify_filter.c
@@ -0,0 +1,289 @@ 
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/fanotify.h>
+#include <linux/kobject.h>
+#include <linux/module.h>
+
+#include "fanotify.h"
+
+extern struct srcu_struct fsnotify_mark_srcu;
+
+static DEFINE_SPINLOCK(filter_list_lock);
+static LIST_HEAD(filter_list);
+
+static struct kobject *fan_filter_root_kobj;
+
+static struct {
+	enum fanotify_filter_flags flag;
+	const char *name;
+} fanotify_filter_flags_names[] = {
+	{
+		.flag = FAN_FILTER_F_SYS_ADMIN_ONLY,
+		.name = "SYS_ADMIN_ONLY",
+	}
+};
+
+static ssize_t flags_show(struct kobject *kobj,
+			  struct kobj_attribute *attr, char *buf)
+{
+	struct fanotify_filter_ops *ops;
+	ssize_t len = 0;
+	int i;
+
+	ops = container_of(kobj, struct fanotify_filter_ops, kobj);
+	for (i = 0; i < ARRAY_SIZE(fanotify_filter_flags_names); i++) {
+		if (ops->flags & fanotify_filter_flags_names[i].flag) {
+			len += sysfs_emit_at(buf, len, "%s%s", len ? " " : "",
+					     fanotify_filter_flags_names[i].name);
+		}
+	}
+	len += sysfs_emit_at(buf, len, "\n");
+	return len;
+}
+
+static ssize_t desc_show(struct kobject *kobj,
+			 struct kobj_attribute *attr, char *buf)
+{
+	struct fanotify_filter_ops *ops;
+
+	ops = container_of(kobj, struct fanotify_filter_ops, kobj);
+
+	return sysfs_emit(buf, "%s\n", ops->desc ?: "N/A");
+}
+
+static ssize_t init_args_show(struct kobject *kobj,
+			      struct kobj_attribute *attr, char *buf)
+{
+	struct fanotify_filter_ops *ops;
+
+	ops = container_of(kobj, struct fanotify_filter_ops, kobj);
+
+	return sysfs_emit(buf, "%s\n", ops->init_args ?: "N/A");
+}
+
+static struct kobj_attribute flags_kobj_attr = __ATTR_RO(flags);
+static struct kobj_attribute desc_kobj_attr = __ATTR_RO(desc);
+static struct kobj_attribute init_args_kobj_attr = __ATTR_RO(init_args);
+
+static struct attribute *fan_filter_attrs[] = {
+	&flags_kobj_attr.attr,
+	&desc_kobj_attr.attr,
+	&init_args_kobj_attr.attr,
+	NULL,
+};
+ATTRIBUTE_GROUPS(fan_filter);
+
+static void fan_filter_kobj_release(struct kobject *kobj)
+{
+}
+
+static const struct kobj_type fan_filter_ktype = {
+	.release = fan_filter_kobj_release,
+	.sysfs_ops = &kobj_sysfs_ops,
+	.default_groups = fan_filter_groups,
+};
+
+static struct fanotify_filter_ops *fanotify_filter_find(const char *name)
+{
+	struct fanotify_filter_ops *ops;
+
+	list_for_each_entry(ops, &filter_list, list) {
+		if (!strcmp(ops->name, name))
+			return ops;
+	}
+	return NULL;
+}
+
+static void __fanotify_filter_unregister(struct fanotify_filter_ops *ops)
+{
+	spin_lock(&filter_list_lock);
+	list_del_init(&ops->list);
+	spin_unlock(&filter_list_lock);
+}
+
+/*
+ * fanotify_filter_register - Register a new filter.
+ *
+ * Add a filter to the filter_list. These filters are
+ * available to all users in the system.
+ *
+ * @ops:	pointer to fanotify_filter_ops to add.
+ *
+ * Returns:
+ *	0	- on success;
+ *	-EEXIST	- filter of the same name already exists.
+ *	-ENODEV	- fanotify filter was not properly initialized.
+ */
+int fanotify_filter_register(struct fanotify_filter_ops *ops)
+{
+	int ret;
+
+	if (!fan_filter_root_kobj)
+		return -ENODEV;
+
+	spin_lock(&filter_list_lock);
+	if (fanotify_filter_find(ops->name)) {
+		/* cannot register two filters with the same name */
+		spin_unlock(&filter_list_lock);
+		return -EEXIST;
+	}
+	list_add_tail(&ops->list, &filter_list);
+	spin_unlock(&filter_list_lock);
+
+
+	kobject_init(&ops->kobj, &fan_filter_ktype);
+	ret = kobject_add(&ops->kobj, fan_filter_root_kobj, "%s", ops->name);
+	if (ret) {
+		__fanotify_filter_unregister(ops);
+		return ret;
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(fanotify_filter_register);
+
+/*
+ * fanotify_filter_unregister - Unregister a filter.
+ *
+ * Remove a filter from filter_list.
+ *
+ * @ops:	pointer to fanotify_filter_ops to remove.
+ */
+void fanotify_filter_unregister(struct fanotify_filter_ops *ops)
+{
+	kobject_put(&ops->kobj);
+	__fanotify_filter_unregister(ops);
+}
+EXPORT_SYMBOL_GPL(fanotify_filter_unregister);
+
+/*
+ * fanotify_filter_add - Add a filter to fsnotify_group.
+ *
+ * Add a filter from filter_list to a fsnotify_group.
+ *
+ * @group:	fsnotify_group to attach the filter to
+ * @argp:	fanotify_filter_args that specifies the filter
+ *		and the init arguments of the filter.
+ *
+ * Returns:
+ *	0	- on success;
+ *	-EBUSY	- @group already has a filter attached.
+ */
+int fanotify_filter_add(struct fsnotify_group *group,
+			struct fanotify_filter_args __user *argp)
+{
+	struct fanotify_filter_hook *filter_hook;
+	struct fanotify_filter_ops *filter_ops;
+	struct fanotify_filter_args args;
+	void *init_args = NULL;
+	int ret = 0;
+
+	ret = copy_from_user(&args, argp, sizeof(args));
+	if (ret)
+		return -EFAULT;
+
+	if (args.init_args_size > FAN_FILTER_ARGS_MAX)
+		return -EINVAL;
+
+	args.name[FAN_FILTER_NAME_MAX - 1] = '\0';
+
+	fsnotify_group_lock(group);
+
+	if (rcu_access_pointer(group->fanotify_data.filter_hook)) {
+		fsnotify_group_unlock(group);
+		return -EBUSY;
+	}
+
+	filter_hook = kzalloc(sizeof(*filter_hook), GFP_KERNEL);
+	if (!filter_hook) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	spin_lock(&filter_list_lock);
+	filter_ops = fanotify_filter_find(args.name);
+	if (!filter_ops || !try_module_get(filter_ops->owner)) {
+		spin_unlock(&filter_list_lock);
+		ret = -ENOENT;
+		goto err_free_hook;
+	}
+	spin_unlock(&filter_list_lock);
+
+	if (!capable(CAP_SYS_ADMIN) && (filter_ops->flags & FAN_FILTER_F_SYS_ADMIN_ONLY)) {
+		ret = -EPERM;
+		goto err_module_put;
+	}
+
+	if (filter_ops->filter_init) {
+		if (args.init_args_size != filter_ops->init_args_size) {
+			ret = -EINVAL;
+			goto err_module_put;
+		}
+		if (args.init_args_size) {
+			init_args = kzalloc(args.init_args_size, GFP_KERNEL);
+			if (!init_args) {
+				ret = -ENOMEM;
+				goto err_module_put;
+			}
+			if (copy_from_user(init_args, (void __user *)args.init_args,
+					   args.init_args_size)) {
+				ret = -EFAULT;
+				goto err_free_args;
+			}
+
+		}
+		ret = filter_ops->filter_init(group, filter_hook, init_args);
+		if (ret)
+			goto err_free_args;
+		kfree(init_args);
+	}
+	filter_hook->ops = filter_ops;
+	rcu_assign_pointer(group->fanotify_data.filter_hook, filter_hook);
+
+out:
+	fsnotify_group_unlock(group);
+	return ret;
+
+err_free_args:
+	kfree(init_args);
+err_module_put:
+	module_put(filter_ops->owner);
+err_free_hook:
+	kfree(filter_hook);
+	goto out;
+}
+
+void fanotify_filter_hook_free(struct fanotify_filter_hook *filter_hook)
+{
+	if (filter_hook->ops->filter_free)
+		filter_hook->ops->filter_free(filter_hook);
+
+	module_put(filter_hook->ops->owner);
+	kfree(filter_hook);
+}
+
+/*
+ * fanotify_filter_del - Delete a filter from fsnotify_group.
+ */
+void fanotify_filter_del(struct fsnotify_group *group)
+{
+	struct fanotify_filter_hook *filter_hook;
+
+	fsnotify_group_lock(group);
+	filter_hook = group->fanotify_data.filter_hook;
+	if (!filter_hook)
+		goto out;
+
+	rcu_assign_pointer(group->fanotify_data.filter_hook, NULL);
+	fanotify_filter_hook_free(filter_hook);
+
+out:
+	fsnotify_group_unlock(group);
+}
+
+static int __init fanotify_filter_init(void)
+{
+	fan_filter_root_kobj = kobject_create_and_add("fanotify_filter", kernel_kobj);
+	if (!fan_filter_root_kobj)
+		return -ENOMEM;
+	return 0;
+}
+device_initcall(fanotify_filter_init);
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 8e2d43fc6f7c..6445256541d2 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -987,6 +987,13 @@  static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long ar
 		spin_unlock(&group->notification_lock);
 		ret = put_user(send_len, (int __user *) p);
 		break;
+	case FAN_IOC_ADD_FILTER:
+		ret = fanotify_filter_add(group, p);
+		break;
+	case FAN_IOC_DEL_FILTER:
+		fanotify_filter_del(group);
+		ret = 0;
+		break;
 	}
 
 	return ret;
diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h
index 89ff45bd6f01..d3d9af81d2c5 100644
--- a/include/linux/fanotify.h
+++ b/include/linux/fanotify.h
@@ -2,6 +2,7 @@ 
 #ifndef _LINUX_FANOTIFY_H
 #define _LINUX_FANOTIFY_H
 
+#include <linux/kobject.h>
 #include <linux/sysctl.h>
 #include <uapi/linux/fanotify.h>
 
@@ -136,4 +137,131 @@ 
 #undef FAN_ALL_PERM_EVENTS
 #undef FAN_ALL_OUTGOING_EVENTS
 
+struct fsnotify_group;
+struct qstr;
+struct inode;
+struct fanotify_filter_hook;
+
+/*
+ * Event passed to fanotify filter
+ *
+ * @mask:	event type and flags
+ * @data:	object that event happened on
+ * @data_type:	type of object for fanotify_data_XXX() accessors
+ * @dir:	optional directory associated with event -
+ *		if @file_name is not NULL, this is the directory that
+ *		@file_name is relative to
+ * @file_name:	optional file name associated with event
+ * @match_mask:	mark types of this group that matched the event
+ */
+struct fanotify_filter_event {
+	u32 mask;
+	const void *data;
+	int data_type;
+	struct inode *dir;
+	const struct qstr *file_name;
+	__kernel_fsid_t *fsid;
+	u32 match_mask;
+};
+
+/*
+ * fanotify filter should implement these ops.
+ *
+ * filter - Main call for the filter.
+ * @group:	The group being notified
+ * @filter_hook:	fanotify_filter_hook for the attach on @group.
+ * Returns: enum fanotify_filter_return.
+ *
+ * filter_init - Initialize the fanotify_filter_hook.
+ * @group:	The group that is getting the filter
+ * @hook:	fanotify_filter_hook to be initialized
+ * @args:	Arguments used to initialize @hook
+ *
+ * filter_free - Free the fanotify_filter_hook.
+ * @hook:	fanotify_filter_hook to be freed.
+ *
+ * @name:	Name of the fanotify_filter_ops. This needs to be unique
+ *		in the system
+ * @owner:	Owner module of this fanotify_filter_ops
+ * @list:	Attach to global list of fanotify_filter_ops
+ * @flags:	Flags for the fanotify_filter_ops
+ * @init_args_size: expected size of @args of filter_init()
+ * @desc:	Description of what this filter does (optional)
+ * @init_args:	Description of the init_args in a string (optional)
+ */
+struct fanotify_filter_ops {
+	int (*filter)(struct fsnotify_group *group,
+		      struct fanotify_filter_hook *filter_hook,
+		      struct fanotify_filter_event *filter_event);
+	int (*filter_init)(struct fsnotify_group *group,
+			   struct fanotify_filter_hook *hook,
+			   void *args);
+	void (*filter_free)(struct fanotify_filter_hook *hook);
+
+	char name[FAN_FILTER_NAME_MAX];
+	struct module *owner;
+	struct list_head list;
+	u32 flags;
+	u32 init_args_size;
+	const char *desc;
+	const char *init_args;
+
+	/* internal */
+	struct kobject kobj;
+};
+
+/* Flags for fanotify_filter_ops->flags */
+enum fanotify_filter_flags {
+	/* CAP_SYS_ADMIN is required to use this filter */
+	FAN_FILTER_F_SYS_ADMIN_ONLY = BIT(0),
+
+	FAN_FILTER_F_ALL = FAN_FILTER_F_SYS_ADMIN_ONLY,
+};
+
+/*
+ * Hook that attaches fanotify_filter_ops to a group.
+ * @ops:	the ops
+ * @data:	per group data used by the ops
+ */
+struct fanotify_filter_hook {
+	struct fanotify_filter_ops *ops;
+	void *data;
+};
+
+#ifdef CONFIG_FANOTIFY_FILTER
+
+int fanotify_filter_register(struct fanotify_filter_ops *ops);
+void fanotify_filter_unregister(struct fanotify_filter_ops *ops);
+int fanotify_filter_add(struct fsnotify_group *group,
+			struct fanotify_filter_args __user *args);
+void fanotify_filter_del(struct fsnotify_group *group);
+void fanotify_filter_hook_free(struct fanotify_filter_hook *filter_hook);
+
+#else /* CONFIG_FANOTIFY_FILTER */
+
+static inline int fanotify_filter_register(struct fanotify_filter_ops *ops)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline void fanotify_filter_unregister(struct fanotify_filter_ops *ops)
+{
+}
+
+static inline int fanotify_filter_add(struct fsnotify_group *group,
+				      struct fanotify_filter_args __user *args)
+{
+	return -ENOENT;
+}
+
+static inline void fanotify_filter_del(struct fsnotify_group *group)
+{
+}
+
+static inline void fanotify_filter_hook_free(struct fanotify_filter_hook *filter_hook)
+{
+}
+
+#endif /* CONFIG_FANOTIFY_FILTER */
+
 #endif /* _LINUX_FANOTIFY_H */
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 3ecf7768e577..8cc2d6f737a6 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -117,6 +117,7 @@  struct fsnotify_fname;
 struct fsnotify_iter_info;
 
 struct mem_cgroup;
+struct fanotify_filter_hook;
 
 /*
  * Each group much define these ops.  The fsnotify infrastructure will call
@@ -255,8 +256,11 @@  struct fsnotify_group {
 			int f_flags; /* event_f_flags from fanotify_init() */
 			struct ucounts *ucounts;
 			mempool_t error_events_pool;
+#ifdef CONFIG_FANOTIFY_FILTER
+			struct fanotify_filter_hook __rcu *filter_hook;
+#endif /* CONFIG_FANOTIFY_FILTER */
 		} fanotify_data;
-#endif /* CONFIG_FANOTIFY */
+#endif
 	};
 };
 
diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h
index 34f221d3a1b9..33fd493558c3 100644
--- a/include/uapi/linux/fanotify.h
+++ b/include/uapi/linux/fanotify.h
@@ -3,6 +3,7 @@ 
 #define _UAPI_LINUX_FANOTIFY_H
 
 #include <linux/types.h>
+#include <linux/ioctl.h>
 
 /* the following events that user-space can register for */
 #define FAN_ACCESS		0x00000001	/* File was accessed */
@@ -243,4 +244,39 @@  struct fanotify_response_info_audit_rule {
 				(long)(meta)->event_len >= (long)FAN_EVENT_METADATA_LEN && \
 				(long)(meta)->event_len <= (long)(len))
 
+/*
+ * fanotify filter.
+ * Please check out include/linux/fanotify.h for more information about
+ * the kernel module API.
+ */
+
+/* Return value of filter */
+enum fanotify_filter_return {
+	/* The event should NOT be sent to user space */
+	FAN_FILTER_RET_SKIP_EVENT = 0,
+	/* The event should be sent to user space */
+	FAN_FILTER_RET_SEND_TO_USERSPACE = 1,
+};
+
+#define FAN_FILTER_NAME_MAX 64
+#define FAN_FILTER_ARGS_MAX 64
+
+/* These are the arguments used to add a filter to a group. */
+struct fanotify_filter_args {
+	char name[FAN_FILTER_NAME_MAX];
+
+	/*
+	 * user space pointer to the init args of the filter,
+	 * up to init_args_size bytes (<= FAN_FILTER_ARGS_MAX).
+	 */
+	__u64 init_args;
+	/* size of init_args */
+	__u32 init_args_size;
+} __attribute__((__packed__));
+
+#define FAN_IOC_MAGIC 'F'
+
+#define FAN_IOC_ADD_FILTER _IOW(FAN_IOC_MAGIC, 0, struct fanotify_filter_args)
+#define FAN_IOC_DEL_FILTER _IOW(FAN_IOC_MAGIC, 1, char[FAN_FILTER_NAME_MAX])
+
 #endif /* _UAPI_LINUX_FANOTIFY_H */