diff mbox series

[v2] security: add trace event for cap_capable

Message ID 20241025151128.1854905-1-linux@jordanrome.com (mailing list archive)
State Handled Elsewhere
Headers show
Series [v2] security: add trace event for cap_capable | expand

Commit Message

Jordan Rome Oct. 25, 2024, 3:11 p.m. UTC
In cases where we want a stable way to observe/trace
cap_capable (e.g. protection from inlining and API updates),
add a tracepoint that passes:
- The credentials used
- The user namespace of the resource being accessed
- The user namespace that has the capability to access the
targeted resource
- The capability to check for
- Bitmask of options defined in include/linux/security.h
- The return value of the check

Signed-off-by: Jordan Rome <linux@jordanrome.com>
---
 MAINTAINERS                       |  1 +
 include/trace/events/capability.h | 60 +++++++++++++++++++++++++++++++
 security/commoncap.c              | 31 +++++++++++-----
 3 files changed, 84 insertions(+), 8 deletions(-)
 create mode 100644 include/trace/events/capability.h

--
2.43.5

Comments

Andrii Nakryiko Oct. 25, 2024, 6:37 p.m. UTC | #1
On Fri, Oct 25, 2024 at 8:15 AM Jordan Rome <linux@jordanrome.com> wrote:
>
> In cases where we want a stable way to observe/trace
> cap_capable (e.g. protection from inlining and API updates)
> add a tracepoint that passes:
> - The credentials used
> - The user namespace of the resource being accessed
> - The user namespace that has the capability to access the
> targeted resource
> - The capability to check for
> - Bitmask of options defined in include/linux/security.h
> - The return value of the check
>
> Signed-off-by: Jordan Rome <linux@jordanrome.com>
> ---
>  MAINTAINERS                       |  1 +
>  include/trace/events/capability.h | 60 +++++++++++++++++++++++++++++++
>  security/commoncap.c              | 31 +++++++++++-----
>  3 files changed, 84 insertions(+), 8 deletions(-)
>  create mode 100644 include/trace/events/capability.h
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index cc40a9d9b8cd..210e9076c858 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -4994,6 +4994,7 @@ M:        Serge Hallyn <serge@hallyn.com>
>  L:     linux-security-module@vger.kernel.org
>  S:     Supported
>  F:     include/linux/capability.h
> +F:     include/trace/events/capability.h
>  F:     include/uapi/linux/capability.h
>  F:     kernel/capability.c
>  F:     security/commoncap.c
> diff --git a/include/trace/events/capability.h b/include/trace/events/capability.h
> new file mode 100644
> index 000000000000..e706ce690c38
> --- /dev/null
> +++ b/include/trace/events/capability.h
> @@ -0,0 +1,60 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#undef TRACE_SYSTEM
> +#define TRACE_SYSTEM capability
> +
> +#if !defined(_TRACE_CAPABILITY_H) || defined(TRACE_HEADER_MULTI_READ)
> +#define _TRACE_CAPABILITY_H
> +
> +#include <linux/cred.h>
> +#include <linux/tracepoint.h>
> +#include <linux/user_namespace.h>
> +
> +/**
> + * cap_capable - called after it's determined if a task has a particular
> + * effective capability
> + *
> + * @cred: The credentials used
> + * @targ_ns: The user namespace of the resource being accessed
> + * @capable_ns: The user namespace in which the credential provides the
> + *              capability to access the targeted resource.
> + *              This will be NULL if ret is not 0.
> + * @cap: The capability to check for
> + * @opts: Bitmask of options defined in include/linux/security.h
> + * @ret: The return value of the check: 0 if it does, -ve if it does not
> + *
> + * Allows to trace calls to cap_capable in commoncap.c
> + */
> +TRACE_EVENT(cap_capable,
> +
> +       TP_PROTO(const struct cred *cred, struct user_namespace *targ_ns,
> +               struct user_namespace *capable_ns, int cap, unsigned int opts, int ret),
> +
> +       TP_ARGS(cred, targ_ns, capable_ns, cap, opts, ret),
> +
> +       TP_STRUCT__entry(
> +               __field(const struct cred *, cred)
> +               __field(struct user_namespace *, targ_ns)
> +               __field(struct user_namespace *, capable_ns)
> +               __field(int, cap)
> +               __field(unsigned int, opts)
> +               __field(int, ret)
> +       ),
> +
> +       TP_fast_assign(
> +               __entry->cred       = cred;
> +               __entry->targ_ns    = targ_ns;
> +               __entry->capable_ns = capable_ns;
> +               __entry->cap        = cap;
> +               __entry->opts       = opts;
> +               __entry->ret        = ret;
> +       ),
> +
> +       TP_printk("cred %p, targ_ns %p, capable_ns %p, cap %d, opts %u, ret %d",
> +               __entry->cred, __entry->targ_ns, __entry->capable_ns, __entry->cap,
> +               __entry->opts, __entry->ret)
> +);
> +
> +#endif /* _TRACE_CAPABILITY_H */
> +
> +/* This part must be outside protection */
> +#include <trace/define_trace.h>
> diff --git a/security/commoncap.c b/security/commoncap.c
> index 162d96b3a676..12c3ddfe0d6e 100644
> --- a/security/commoncap.c
> +++ b/security/commoncap.c
> @@ -27,6 +27,9 @@
>  #include <linux/mnt_idmapping.h>
>  #include <uapi/linux/lsm.h>
>
> +#define CREATE_TRACE_POINTS
> +#include <trace/events/capability.h>
> +
>  /*
>   * If a non-root user executes a setuid-root binary in
>   * !secure(SECURE_NOROOT) mode, then we raise capabilities.
> @@ -52,7 +55,7 @@ static void warn_setuid_and_fcaps_mixed(const char *fname)
>  /**
>   * cap_capable - Determine whether a task has a particular effective capability
>   * @cred: The credentials to use
> - * @targ_ns:  The user namespace in which we need the capability
> + * @targ_ns:  The user namespace of the resource being accessed
>   * @cap: The capability to check for
>   * @opts: Bitmask of options defined in include/linux/security.h
>   *
> @@ -68,6 +71,7 @@ int cap_capable(const struct cred *cred, struct user_namespace *targ_ns,
>                 int cap, unsigned int opts)
>  {
>         struct user_namespace *ns = targ_ns;
> +       int ret = -EPERM;
>
>         /* See if cred has the capability in the target user namespace
>          * by examining the target user namespace and all of the target
> @@ -75,22 +79,32 @@ int cap_capable(const struct cred *cred, struct user_namespace *targ_ns,
>          */
>         for (;;) {
>                 /* Do we have the necessary capabilities? */
> -               if (ns == cred->user_ns)
> -                       return cap_raised(cred->cap_effective, cap) ? 0 : -EPERM;
> +               if (ns == cred->user_ns) {
> +                       if (cap_raised(cred->cap_effective, cap))
> +                               ret = 0;
> +                       else
> +                               ns = NULL;

This is a bit unfortunate :( so maybe all we needed was `ns =
ns->parent` for that one use case, and keep the original `ret ? NULL :
ns` inside trace_cap_capable().

But whatever security folks prefer, I'm fine with either.

Acked-by: Andrii Nakryiko <andrii@kernel.org>

> +                       break;
> +               }
>
>                 /*
>                  * If we're already at a lower level than we're looking for,
>                  * we're done searching.
>                  */
> -               if (ns->level <= cred->user_ns->level)
> -                       return -EPERM;
> +               if (ns->level <= cred->user_ns->level) {
> +                       ns = NULL;
> +                       break;
> +               }
>
>                 /*
>                  * The owner of the user namespace in the parent of the
>                  * user namespace has all caps.
>                  */
> -               if ((ns->parent == cred->user_ns) && uid_eq(ns->owner, cred->euid))
> -                       return 0;
> +               if ((ns->parent == cred->user_ns) && uid_eq(ns->owner, cred->euid)) {
> +                       ns = ns->parent;
> +                       ret = 0;
> +                       break;
> +               }
>
>                 /*
>                  * If you have a capability in a parent user ns, then you have
> @@ -99,7 +113,8 @@ int cap_capable(const struct cred *cred, struct user_namespace *targ_ns,
>                 ns = ns->parent;
>         }
>
> -       /* We never get here */
> +       trace_cap_capable(cred, targ_ns, ns, cap, opts, ret);
> +       return ret;
>  }
>
>  /**
> --
> 2.43.5
>
>
Serge E. Hallyn Oct. 25, 2024, 7:52 p.m. UTC | #2
On Fri, Oct 25, 2024 at 11:37:59AM -0700, Andrii Nakryiko wrote:
> On Fri, Oct 25, 2024 at 8:15 AM Jordan Rome <linux@jordanrome.com> wrote:
> >
> > In cases where we want a stable way to observe/trace
> > cap_capable (e.g. protection from inlining and API updates)
> > add a tracepoint that passes:
> > - The credentials used
> > - The user namespace of the resource being accessed
> > - The user namespace that has the capability to access the
> > targeted resource
> > - The capability to check for
> > - Bitmask of options defined in include/linux/security.h
> > - The return value of the check
> >
> > Signed-off-by: Jordan Rome <linux@jordanrome.com>
> > ---
> >  MAINTAINERS                       |  1 +
> >  include/trace/events/capability.h | 60 +++++++++++++++++++++++++++++++
> >  security/commoncap.c              | 31 +++++++++++-----
> >  3 files changed, 84 insertions(+), 8 deletions(-)
> >  create mode 100644 include/trace/events/capability.h
> >
> > diff --git a/MAINTAINERS b/MAINTAINERS
> > index cc40a9d9b8cd..210e9076c858 100644
> > --- a/MAINTAINERS
> > +++ b/MAINTAINERS
> > @@ -4994,6 +4994,7 @@ M:        Serge Hallyn <serge@hallyn.com>
> >  L:     linux-security-module@vger.kernel.org
> >  S:     Supported
> >  F:     include/linux/capability.h
> > +F:     include/trace/events/capability.h
> >  F:     include/uapi/linux/capability.h
> >  F:     kernel/capability.c
> >  F:     security/commoncap.c
> > diff --git a/include/trace/events/capability.h b/include/trace/events/capability.h
> > new file mode 100644
> > index 000000000000..e706ce690c38
> > --- /dev/null
> > +++ b/include/trace/events/capability.h
> > @@ -0,0 +1,60 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +#undef TRACE_SYSTEM
> > +#define TRACE_SYSTEM capability
> > +
> > +#if !defined(_TRACE_CAPABILITY_H) || defined(TRACE_HEADER_MULTI_READ)
> > +#define _TRACE_CAPABILITY_H
> > +
> > +#include <linux/cred.h>
> > +#include <linux/tracepoint.h>
> > +#include <linux/user_namespace.h>
> > +
> > +/**
> > + * cap_capable - called after it's determined if a task has a particular
> > + * effective capability
> > + *
> > + * @cred: The credentials used
> > + * @targ_ns: The user namespace of the resource being accessed
> > + * @capable_ns: The user namespace in which the credential provides the
> > + *              capability to access the targeted resource.
> > + *              This will be NULL if ret is not 0.
> > + * @cap: The capability to check for
> > + * @opts: Bitmask of options defined in include/linux/security.h
> > + * @ret: The return value of the check: 0 if it does, -ve if it does not
> > + *
> > + * Allows to trace calls to cap_capable in commoncap.c
> > + */
> > +TRACE_EVENT(cap_capable,
> > +
> > +       TP_PROTO(const struct cred *cred, struct user_namespace *targ_ns,
> > +               struct user_namespace *capable_ns, int cap, unsigned int opts, int ret),
> > +
> > +       TP_ARGS(cred, targ_ns, capable_ns, cap, opts, ret),
> > +
> > +       TP_STRUCT__entry(
> > +               __field(const struct cred *, cred)
> > +               __field(struct user_namespace *, targ_ns)
> > +               __field(struct user_namespace *, capable_ns)
> > +               __field(int, cap)
> > +               __field(unsigned int, opts)
> > +               __field(int, ret)
> > +       ),
> > +
> > +       TP_fast_assign(
> > +               __entry->cred       = cred;
> > +               __entry->targ_ns    = targ_ns;
> > +               __entry->capable_ns = capable_ns;
> > +               __entry->cap        = cap;
> > +               __entry->opts       = opts;
> > +               __entry->ret        = ret;
> > +       ),
> > +
> > +       TP_printk("cred %p, targ_ns %p, capable_ns %p, cap %d, opts %u, ret %d",
> > +               __entry->cred, __entry->targ_ns, __entry->capable_ns, __entry->cap,
> > +               __entry->opts, __entry->ret)
> > +);
> > +
> > +#endif /* _TRACE_CAPABILITY_H */
> > +
> > +/* This part must be outside protection */
> > +#include <trace/define_trace.h>
> > diff --git a/security/commoncap.c b/security/commoncap.c
> > index 162d96b3a676..12c3ddfe0d6e 100644
> > --- a/security/commoncap.c
> > +++ b/security/commoncap.c
> > @@ -27,6 +27,9 @@
> >  #include <linux/mnt_idmapping.h>
> >  #include <uapi/linux/lsm.h>
> >
> > +#define CREATE_TRACE_POINTS
> > +#include <trace/events/capability.h>
> > +
> >  /*
> >   * If a non-root user executes a setuid-root binary in
> >   * !secure(SECURE_NOROOT) mode, then we raise capabilities.
> > @@ -52,7 +55,7 @@ static void warn_setuid_and_fcaps_mixed(const char *fname)
> >  /**
> >   * cap_capable - Determine whether a task has a particular effective capability
> >   * @cred: The credentials to use
> > - * @targ_ns:  The user namespace in which we need the capability
> > + * @targ_ns:  The user namespace of the resource being accessed
> >   * @cap: The capability to check for
> >   * @opts: Bitmask of options defined in include/linux/security.h
> >   *
> > @@ -68,6 +71,7 @@ int cap_capable(const struct cred *cred, struct user_namespace *targ_ns,
> >                 int cap, unsigned int opts)
> >  {
> >         struct user_namespace *ns = targ_ns;
> > +       int ret = -EPERM;
> >
> >         /* See if cred has the capability in the target user namespace
> >          * by examining the target user namespace and all of the target
> > @@ -75,22 +79,32 @@ int cap_capable(const struct cred *cred, struct user_namespace *targ_ns,
> >          */
> >         for (;;) {
> >                 /* Do we have the necessary capabilities? */
> > -               if (ns == cred->user_ns)
> > -                       return cap_raised(cred->cap_effective, cap) ? 0 : -EPERM;
> > +               if (ns == cred->user_ns) {
> > +                       if (cap_raised(cred->cap_effective, cap))
> > +                               ret = 0;
> > +                       else
> > +                               ns = NULL;
> 
> This is a bit unfortunate :( so maybe all we needed was `ns =
> ns->parent` for that one use case, and keep the original `ret ? NULL :
> ns` inside trace_cap_capable().

Yeah, that would be fine with me.  Or maybe just doing

	/* in case of an error, trace should show ns=NULL */
	if (ret)
		ns = NULL;

right above the trace_cap_capable() call would be clearer.

> But whatever security folks prefer, I'm fine with either.
> 
> Acked-by: Andrii Nakryiko <andrii@kernel.org>
> 
> > +                       break;
> > +               }
> >
> >                 /*
> >                  * If we're already at a lower level than we're looking for,
> >                  * we're done searching.
> >                  */
> > -               if (ns->level <= cred->user_ns->level)
> > -                       return -EPERM;
> > +               if (ns->level <= cred->user_ns->level) {
> > +                       ns = NULL;
> > +                       break;
> > +               }
> >
> >                 /*
> >                  * The owner of the user namespace in the parent of the
> >                  * user namespace has all caps.
> >                  */
> > -               if ((ns->parent == cred->user_ns) && uid_eq(ns->owner, cred->euid))
> > -                       return 0;
> > +               if ((ns->parent == cred->user_ns) && uid_eq(ns->owner, cred->euid)) {
> > +                       ns = ns->parent;
> > +                       ret = 0;
> > +                       break;
> > +               }
> >
> >                 /*
> >                  * If you have a capability in a parent user ns, then you have
> > @@ -99,7 +113,8 @@ int cap_capable(const struct cred *cred, struct user_namespace *targ_ns,
> >                 ns = ns->parent;
> >         }
> >
> > -       /* We never get here */
> > +       trace_cap_capable(cred, targ_ns, ns, cap, opts, ret);
> > +       return ret;
> >  }
> >
> >  /**
> > --
> > 2.43.5
> >
> >
Jordan Rome Oct. 25, 2024, 8:24 p.m. UTC | #3
On Fri, Oct 25, 2024 at 3:52 PM Serge E. Hallyn <serge@hallyn.com> wrote:
>
> On Fri, Oct 25, 2024 at 11:37:59AM -0700, Andrii Nakryiko wrote:
> > On Fri, Oct 25, 2024 at 8:15 AM Jordan Rome <linux@jordanrome.com> wrote:
> > >
> > > In cases where we want a stable way to observe/trace
> > > cap_capable (e.g. protection from inlining and API updates)
> > > add a tracepoint that passes:
> > > - The credentials used
> > > - The user namespace of the resource being accessed
> > > - The user namespace that has the capability to access the
> > > targeted resource
> > > - The capability to check for
> > > - Bitmask of options defined in include/linux/security.h
> > > - The return value of the check
> > >
> > > Signed-off-by: Jordan Rome <linux@jordanrome.com>
> > > ---
> > >  MAINTAINERS                       |  1 +
> > >  include/trace/events/capability.h | 60 +++++++++++++++++++++++++++++++
> > >  security/commoncap.c              | 31 +++++++++++-----
> > >  3 files changed, 84 insertions(+), 8 deletions(-)
> > >  create mode 100644 include/trace/events/capability.h
> > >
> > > diff --git a/MAINTAINERS b/MAINTAINERS
> > > index cc40a9d9b8cd..210e9076c858 100644
> > > --- a/MAINTAINERS
> > > +++ b/MAINTAINERS
> > > @@ -4994,6 +4994,7 @@ M:        Serge Hallyn <serge@hallyn.com>
> > >  L:     linux-security-module@vger.kernel.org
> > >  S:     Supported
> > >  F:     include/linux/capability.h
> > > +F:     include/trace/events/capability.h
> > >  F:     include/uapi/linux/capability.h
> > >  F:     kernel/capability.c
> > >  F:     security/commoncap.c
> > > diff --git a/include/trace/events/capability.h b/include/trace/events/capability.h
> > > new file mode 100644
> > > index 000000000000..e706ce690c38
> > > --- /dev/null
> > > +++ b/include/trace/events/capability.h
> > > @@ -0,0 +1,60 @@
> > > +/* SPDX-License-Identifier: GPL-2.0 */
> > > +#undef TRACE_SYSTEM
> > > +#define TRACE_SYSTEM capability
> > > +
> > > +#if !defined(_TRACE_CAPABILITY_H) || defined(TRACE_HEADER_MULTI_READ)
> > > +#define _TRACE_CAPABILITY_H
> > > +
> > > +#include <linux/cred.h>
> > > +#include <linux/tracepoint.h>
> > > +#include <linux/user_namespace.h>
> > > +
> > > +/**
> > > + * cap_capable - called after it's determined if a task has a particular
> > > + * effective capability
> > > + *
> > > + * @cred: The credentials used
> > > + * @targ_ns: The user namespace of the resource being accessed
> > > + * @capable_ns: The user namespace in which the credential provides the
> > > + *              capability to access the targeted resource.
> > > + *              This will be NULL if ret is not 0.
> > > + * @cap: The capability to check for
> > > + * @opts: Bitmask of options defined in include/linux/security.h
> > > + * @ret: The return value of the check: 0 if it does, -ve if it does not
> > > + *
> > > + * Allows to trace calls to cap_capable in commoncap.c
> > > + */
> > > +TRACE_EVENT(cap_capable,
> > > +
> > > +       TP_PROTO(const struct cred *cred, struct user_namespace *targ_ns,
> > > +               struct user_namespace *capable_ns, int cap, unsigned int opts, int ret),
> > > +
> > > +       TP_ARGS(cred, targ_ns, capable_ns, cap, opts, ret),
> > > +
> > > +       TP_STRUCT__entry(
> > > +               __field(const struct cred *, cred)
> > > +               __field(struct user_namespace *, targ_ns)
> > > +               __field(struct user_namespace *, capable_ns)
> > > +               __field(int, cap)
> > > +               __field(unsigned int, opts)
> > > +               __field(int, ret)
> > > +       ),
> > > +
> > > +       TP_fast_assign(
> > > +               __entry->cred       = cred;
> > > +               __entry->targ_ns    = targ_ns;
> > > +               __entry->capable_ns = capable_ns;
> > > +               __entry->cap        = cap;
> > > +               __entry->opts       = opts;
> > > +               __entry->ret        = ret;
> > > +       ),
> > > +
> > > +       TP_printk("cred %p, targ_ns %p, capable_ns %p, cap %d, opts %u, ret %d",
> > > +               __entry->cred, __entry->targ_ns, __entry->capable_ns, __entry->cap,
> > > +               __entry->opts, __entry->ret)
> > > +);
> > > +
> > > +#endif /* _TRACE_CAPABILITY_H */
> > > +
> > > +/* This part must be outside protection */
> > > +#include <trace/define_trace.h>
> > > diff --git a/security/commoncap.c b/security/commoncap.c
> > > index 162d96b3a676..12c3ddfe0d6e 100644
> > > --- a/security/commoncap.c
> > > +++ b/security/commoncap.c
> > > @@ -27,6 +27,9 @@
> > >  #include <linux/mnt_idmapping.h>
> > >  #include <uapi/linux/lsm.h>
> > >
> > > +#define CREATE_TRACE_POINTS
> > > +#include <trace/events/capability.h>
> > > +
> > >  /*
> > >   * If a non-root user executes a setuid-root binary in
> > >   * !secure(SECURE_NOROOT) mode, then we raise capabilities.
> > > @@ -52,7 +55,7 @@ static void warn_setuid_and_fcaps_mixed(const char *fname)
> > >  /**
> > >   * cap_capable - Determine whether a task has a particular effective capability
> > >   * @cred: The credentials to use
> > > - * @targ_ns:  The user namespace in which we need the capability
> > > + * @targ_ns:  The user namespace of the resource being accessed
> > >   * @cap: The capability to check for
> > >   * @opts: Bitmask of options defined in include/linux/security.h
> > >   *
> > > @@ -68,6 +71,7 @@ int cap_capable(const struct cred *cred, struct user_namespace *targ_ns,
> > >                 int cap, unsigned int opts)
> > >  {
> > >         struct user_namespace *ns = targ_ns;
> > > +       int ret = -EPERM;
> > >
> > >         /* See if cred has the capability in the target user namespace
> > >          * by examining the target user namespace and all of the target
> > > @@ -75,22 +79,32 @@ int cap_capable(const struct cred *cred, struct user_namespace *targ_ns,
> > >          */
> > >         for (;;) {
> > >                 /* Do we have the necessary capabilities? */
> > > -               if (ns == cred->user_ns)
> > > -                       return cap_raised(cred->cap_effective, cap) ? 0 : -EPERM;
> > > +               if (ns == cred->user_ns) {
> > > +                       if (cap_raised(cred->cap_effective, cap))
> > > +                               ret = 0;
> > > +                       else
> > > +                               ns = NULL;
> >
> > This is a bit unfortunate :( so maybe all we needed was `ns =
> > ns->parent` for that one use case, and keep the original `ret ? NULL :
> > ns` inside trace_cap_capable().
>
> Yeah, that would be fine with me.  Or maybe just doing
>
>         /* in case of an error, trace should show ns=NULL */
>         if (ret)
>                 ns = NULL;
>
> right above the trace_cap_capable() call would be clearer.

I feel like having less trace-specific logic in this function would be
a good thing, so I'm for Andrii's suggestion of doing the ret check
there, but I'm also fine to do what security folks prefer :)

>
> > But whatever security folks prefer, I'm fine with either.
> >
> > Acked-by: Andrii Nakryiko <andrii@kernel.org>
> >
> > > +                       break;
> > > +               }
> > >
> > >                 /*
> > >                  * If we're already at a lower level than we're looking for,
> > >                  * we're done searching.
> > >                  */
> > > -               if (ns->level <= cred->user_ns->level)
> > > -                       return -EPERM;
> > > +               if (ns->level <= cred->user_ns->level) {
> > > +                       ns = NULL;
> > > +                       break;
> > > +               }
> > >
> > >                 /*
> > >                  * The owner of the user namespace in the parent of the
> > >                  * user namespace has all caps.
> > >                  */
> > > -               if ((ns->parent == cred->user_ns) && uid_eq(ns->owner, cred->euid))
> > > -                       return 0;
> > > +               if ((ns->parent == cred->user_ns) && uid_eq(ns->owner, cred->euid)) {
> > > +                       ns = ns->parent;
> > > +                       ret = 0;
> > > +                       break;
> > > +               }
> > >
> > >                 /*
> > >                  * If you have a capability in a parent user ns, then you have
> > > @@ -99,7 +113,8 @@ int cap_capable(const struct cred *cred, struct user_namespace *targ_ns,
> > >                 ns = ns->parent;
> > >         }
> > >
> > > -       /* We never get here */
> > > +       trace_cap_capable(cred, targ_ns, ns, cap, opts, ret);
> > > +       return ret;
> > >  }
> > >
> > >  /**
> > > --
> > > 2.43.5
> > >
> > >
Serge E. Hallyn Oct. 26, 2024, 10:09 a.m. UTC | #4
On Fri, Oct 25, 2024 at 04:24:05PM -0400, Jordan Rome wrote:
> On Fri, Oct 25, 2024 at 3:52 PM Serge E. Hallyn <serge@hallyn.com> wrote:
> >
> > On Fri, Oct 25, 2024 at 11:37:59AM -0700, Andrii Nakryiko wrote:
> > > On Fri, Oct 25, 2024 at 8:15 AM Jordan Rome <linux@jordanrome.com> wrote:
> > > >
> > > > In cases where we want a stable way to observe/trace
> > > > cap_capable (e.g. protection from inlining and API updates)
> > > > add a tracepoint that passes:
> > > > - The credentials used
> > > > - The user namespace of the resource being accessed
> > > > - The user namespace that has the capability to access the
> > > > targeted resource
> > > > - The capability to check for
> > > > - Bitmask of options defined in include/linux/security.h
> > > > - The return value of the check
> > > >
> > > > Signed-off-by: Jordan Rome <linux@jordanrome.com>
> > > > ---
> > > >  MAINTAINERS                       |  1 +
> > > >  include/trace/events/capability.h | 60 +++++++++++++++++++++++++++++++
> > > >  security/commoncap.c              | 31 +++++++++++-----
> > > >  3 files changed, 84 insertions(+), 8 deletions(-)
> > > >  create mode 100644 include/trace/events/capability.h
> > > >
> > > > diff --git a/MAINTAINERS b/MAINTAINERS
> > > > index cc40a9d9b8cd..210e9076c858 100644
> > > > --- a/MAINTAINERS
> > > > +++ b/MAINTAINERS
> > > > @@ -4994,6 +4994,7 @@ M:        Serge Hallyn <serge@hallyn.com>
> > > >  L:     linux-security-module@vger.kernel.org
> > > >  S:     Supported
> > > >  F:     include/linux/capability.h
> > > > +F:     include/trace/events/capability.h
> > > >  F:     include/uapi/linux/capability.h
> > > >  F:     kernel/capability.c
> > > >  F:     security/commoncap.c
> > > > diff --git a/include/trace/events/capability.h b/include/trace/events/capability.h
> > > > new file mode 100644
> > > > index 000000000000..e706ce690c38
> > > > --- /dev/null
> > > > +++ b/include/trace/events/capability.h
> > > > @@ -0,0 +1,60 @@
> > > > +/* SPDX-License-Identifier: GPL-2.0 */
> > > > +#undef TRACE_SYSTEM
> > > > +#define TRACE_SYSTEM capability
> > > > +
> > > > +#if !defined(_TRACE_CAPABILITY_H) || defined(TRACE_HEADER_MULTI_READ)
> > > > +#define _TRACE_CAPABILITY_H
> > > > +
> > > > +#include <linux/cred.h>
> > > > +#include <linux/tracepoint.h>
> > > > +#include <linux/user_namespace.h>
> > > > +
> > > > +/**
> > > > + * cap_capable - called after it's determined if a task has a particular
> > > > + * effective capability
> > > > + *
> > > > + * @cred: The credentials used
> > > > + * @targ_ns: The user namespace of the resource being accessed
> > > > + * @capable_ns: The user namespace in which the credential provides the
> > > > + *              capability to access the targeted resource.
> > > > + *              This will be NULL if ret is not 0.
> > > > + * @cap: The capability to check for
> > > > + * @opts: Bitmask of options defined in include/linux/security.h
> > > > + * @ret: The return value of the check: 0 if it does, -ve if it does not
> > > > + *
> > > > + * Allows to trace calls to cap_capable in commoncap.c
> > > > + */
> > > > +TRACE_EVENT(cap_capable,
> > > > +
> > > > +       TP_PROTO(const struct cred *cred, struct user_namespace *targ_ns,
> > > > +               struct user_namespace *capable_ns, int cap, unsigned int opts, int ret),
> > > > +
> > > > +       TP_ARGS(cred, targ_ns, capable_ns, cap, opts, ret),
> > > > +
> > > > +       TP_STRUCT__entry(
> > > > +               __field(const struct cred *, cred)
> > > > +               __field(struct user_namespace *, targ_ns)
> > > > +               __field(struct user_namespace *, capable_ns)
> > > > +               __field(int, cap)
> > > > +               __field(unsigned int, opts)
> > > > +               __field(int, ret)
> > > > +       ),
> > > > +
> > > > +       TP_fast_assign(
> > > > +               __entry->cred       = cred;
> > > > +               __entry->targ_ns    = targ_ns;
> > > > +               __entry->capable_ns = capable_ns;
> > > > +               __entry->cap        = cap;
> > > > +               __entry->opts       = opts;
> > > > +               __entry->ret        = ret;
> > > > +       ),
> > > > +
> > > > +       TP_printk("cred %p, targ_ns %p, capable_ns %p, cap %d, opts %u, ret %d",
> > > > +               __entry->cred, __entry->targ_ns, __entry->capable_ns, __entry->cap,
> > > > +               __entry->opts, __entry->ret)
> > > > +);
> > > > +
> > > > +#endif /* _TRACE_CAPABILITY_H */
> > > > +
> > > > +/* This part must be outside protection */
> > > > +#include <trace/define_trace.h>
> > > > diff --git a/security/commoncap.c b/security/commoncap.c
> > > > index 162d96b3a676..12c3ddfe0d6e 100644
> > > > --- a/security/commoncap.c
> > > > +++ b/security/commoncap.c
> > > > @@ -27,6 +27,9 @@
> > > >  #include <linux/mnt_idmapping.h>
> > > >  #include <uapi/linux/lsm.h>
> > > >
> > > > +#define CREATE_TRACE_POINTS
> > > > +#include <trace/events/capability.h>
> > > > +
> > > >  /*
> > > >   * If a non-root user executes a setuid-root binary in
> > > >   * !secure(SECURE_NOROOT) mode, then we raise capabilities.
> > > > @@ -52,7 +55,7 @@ static void warn_setuid_and_fcaps_mixed(const char *fname)
> > > >  /**
> > > >   * cap_capable - Determine whether a task has a particular effective capability
> > > >   * @cred: The credentials to use
> > > > - * @targ_ns:  The user namespace in which we need the capability
> > > > + * @targ_ns:  The user namespace of the resource being accessed
> > > >   * @cap: The capability to check for
> > > >   * @opts: Bitmask of options defined in include/linux/security.h
> > > >   *
> > > > @@ -68,6 +71,7 @@ int cap_capable(const struct cred *cred, struct user_namespace *targ_ns,
> > > >                 int cap, unsigned int opts)
> > > >  {
> > > >         struct user_namespace *ns = targ_ns;
> > > > +       int ret = -EPERM;
> > > >
> > > >         /* See if cred has the capability in the target user namespace
> > > >          * by examining the target user namespace and all of the target
> > > > @@ -75,22 +79,32 @@ int cap_capable(const struct cred *cred, struct user_namespace *targ_ns,
> > > >          */
> > > >         for (;;) {
> > > >                 /* Do we have the necessary capabilities? */
> > > > -               if (ns == cred->user_ns)
> > > > -                       return cap_raised(cred->cap_effective, cap) ? 0 : -EPERM;
> > > > +               if (ns == cred->user_ns) {
> > > > +                       if (cap_raised(cred->cap_effective, cap))
> > > > +                               ret = 0;
> > > > +                       else
> > > > +                               ns = NULL;
> > >
> > > This is a bit unfortunate :( so maybe all we needed was `ns =
> > > ns->parent` for that one use case, and keep the original `ret ? NULL :
> > > ns` inside trace_cap_capable().
> >
> > Yeah, that would be fine with me.  Or maybe just doing
> >
> >         /* in case of an error, trace should show ns=NULL */
> >         if (ret)
> >                 ns = NULL;
> >
> > right above the trace_cap_capable() call would be clearer.
> 
> I feel like having less trace specific logic in this function would be
> a good thing,
> so I'm for Andrii's suggestion of doing the ret check there but also
> fine to do what security folks prefer :)

I think a comment is needed to remind us (me) in 2 years why the
setting of ns to NULL is there.  But the comment of trace_cap_capable()
probably suffices, so sure, go with Andrii's suggestion.  And then

Reviewed-by: Serge Hallyn <serge@hallyn.com>

for the capability code.

thanks,
-serge
Jordan Rome Oct. 26, 2024, 11:22 a.m. UTC | #5
On Sat, Oct 26, 2024 at 6:10 AM Serge E. Hallyn <serge@hallyn.com> wrote:
>
> On Fri, Oct 25, 2024 at 04:24:05PM -0400, Jordan Rome wrote:
> > On Fri, Oct 25, 2024 at 3:52 PM Serge E. Hallyn <serge@hallyn.com> wrote:
> > >
> > > On Fri, Oct 25, 2024 at 11:37:59AM -0700, Andrii Nakryiko wrote:
> > > > On Fri, Oct 25, 2024 at 8:15 AM Jordan Rome <linux@jordanrome.com> wrote:
> > > > >
> > > > > In cases where we want a stable way to observe/trace
> > > > > cap_capable (e.g. protection from inlining and API updates)
> > > > > add a tracepoint that passes:
> > > > > - The credentials used
> > > > > - The user namespace of the resource being accessed
> > > > > - The user namespace that has the capability to access the
> > > > > targeted resource
> > > > > - The capability to check for
> > > > > - Bitmask of options defined in include/linux/security.h
> > > > > - The return value of the check
> > > > >
> > > > > Signed-off-by: Jordan Rome <linux@jordanrome.com>
> > > > > ---
> > > > >  MAINTAINERS                       |  1 +
> > > > >  include/trace/events/capability.h | 60 +++++++++++++++++++++++++++++++
> > > > >  security/commoncap.c              | 31 +++++++++++-----
> > > > >  3 files changed, 84 insertions(+), 8 deletions(-)
> > > > >  create mode 100644 include/trace/events/capability.h
> > > > >
> > > > > diff --git a/MAINTAINERS b/MAINTAINERS
> > > > > index cc40a9d9b8cd..210e9076c858 100644
> > > > > --- a/MAINTAINERS
> > > > > +++ b/MAINTAINERS
> > > > > @@ -4994,6 +4994,7 @@ M:        Serge Hallyn <serge@hallyn.com>
> > > > >  L:     linux-security-module@vger.kernel.org
> > > > >  S:     Supported
> > > > >  F:     include/linux/capability.h
> > > > > +F:     include/trace/events/capability.h
> > > > >  F:     include/uapi/linux/capability.h
> > > > >  F:     kernel/capability.c
> > > > >  F:     security/commoncap.c
> > > > > diff --git a/include/trace/events/capability.h b/include/trace/events/capability.h
> > > > > new file mode 100644
> > > > > index 000000000000..e706ce690c38
> > > > > --- /dev/null
> > > > > +++ b/include/trace/events/capability.h
> > > > > @@ -0,0 +1,60 @@
> > > > > +/* SPDX-License-Identifier: GPL-2.0 */
> > > > > +#undef TRACE_SYSTEM
> > > > > +#define TRACE_SYSTEM capability
> > > > > +
> > > > > +#if !defined(_TRACE_CAPABILITY_H) || defined(TRACE_HEADER_MULTI_READ)
> > > > > +#define _TRACE_CAPABILITY_H
> > > > > +
> > > > > +#include <linux/cred.h>
> > > > > +#include <linux/tracepoint.h>
> > > > > +#include <linux/user_namespace.h>
> > > > > +
> > > > > +/**
> > > > > + * cap_capable - called after it's determined if a task has a particular
> > > > > + * effective capability
> > > > > + *
> > > > > + * @cred: The credentials used
> > > > > + * @targ_ns: The user namespace of the resource being accessed
> > > > > + * @capable_ns: The user namespace in which the credential provides the
> > > > > + *              capability to access the targeted resource.
> > > > > + *              This will be NULL if ret is not 0.
> > > > > + * @cap: The capability to check for
> > > > > + * @opts: Bitmask of options defined in include/linux/security.h
> > > > > + * @ret: The return value of the check: 0 if it does, -ve if it does not
> > > > > + *
> > > > > + * Allows to trace calls to cap_capable in commoncap.c
> > > > > + */
> > > > > +TRACE_EVENT(cap_capable,
> > > > > +
> > > > > +       TP_PROTO(const struct cred *cred, struct user_namespace *targ_ns,
> > > > > +               struct user_namespace *capable_ns, int cap, unsigned int opts, int ret),
> > > > > +
> > > > > +       TP_ARGS(cred, targ_ns, capable_ns, cap, opts, ret),
> > > > > +
> > > > > +       TP_STRUCT__entry(
> > > > > +               __field(const struct cred *, cred)
> > > > > +               __field(struct user_namespace *, targ_ns)
> > > > > +               __field(struct user_namespace *, capable_ns)
> > > > > +               __field(int, cap)
> > > > > +               __field(unsigned int, opts)
> > > > > +               __field(int, ret)
> > > > > +       ),
> > > > > +
> > > > > +       TP_fast_assign(
> > > > > +               __entry->cred       = cred;
> > > > > +               __entry->targ_ns    = targ_ns;
> > > > > +               __entry->capable_ns = capable_ns;
> > > > > +               __entry->cap        = cap;
> > > > > +               __entry->opts       = opts;
> > > > > +               __entry->ret        = ret;
> > > > > +       ),
> > > > > +
> > > > > +       TP_printk("cred %p, targ_ns %p, capable_ns %p, cap %d, opts %u, ret %d",
> > > > > +               __entry->cred, __entry->targ_ns, __entry->capable_ns, __entry->cap,
> > > > > +               __entry->opts, __entry->ret)
> > > > > +);
> > > > > +
> > > > > +#endif /* _TRACE_CAPABILITY_H */
> > > > > +
> > > > > +/* This part must be outside protection */
> > > > > +#include <trace/define_trace.h>
> > > > > diff --git a/security/commoncap.c b/security/commoncap.c
> > > > > index 162d96b3a676..12c3ddfe0d6e 100644
> > > > > --- a/security/commoncap.c
> > > > > +++ b/security/commoncap.c
> > > > > @@ -27,6 +27,9 @@
> > > > >  #include <linux/mnt_idmapping.h>
> > > > >  #include <uapi/linux/lsm.h>
> > > > >
> > > > > +#define CREATE_TRACE_POINTS
> > > > > +#include <trace/events/capability.h>
> > > > > +
> > > > >  /*
> > > > >   * If a non-root user executes a setuid-root binary in
> > > > >   * !secure(SECURE_NOROOT) mode, then we raise capabilities.
> > > > > @@ -52,7 +55,7 @@ static void warn_setuid_and_fcaps_mixed(const char *fname)
> > > > >  /**
> > > > >   * cap_capable - Determine whether a task has a particular effective capability
> > > > >   * @cred: The credentials to use
> > > > > - * @targ_ns:  The user namespace in which we need the capability
> > > > > + * @targ_ns:  The user namespace of the resource being accessed
> > > > >   * @cap: The capability to check for
> > > > >   * @opts: Bitmask of options defined in include/linux/security.h
> > > > >   *
> > > > > @@ -68,6 +71,7 @@ int cap_capable(const struct cred *cred, struct user_namespace *targ_ns,
> > > > >                 int cap, unsigned int opts)
> > > > >  {
> > > > >         struct user_namespace *ns = targ_ns;
> > > > > +       int ret = -EPERM;
> > > > >
> > > > >         /* See if cred has the capability in the target user namespace
> > > > >          * by examining the target user namespace and all of the target
> > > > > @@ -75,22 +79,32 @@ int cap_capable(const struct cred *cred, struct user_namespace *targ_ns,
> > > > >          */
> > > > >         for (;;) {
> > > > >                 /* Do we have the necessary capabilities? */
> > > > > -               if (ns == cred->user_ns)
> > > > > -                       return cap_raised(cred->cap_effective, cap) ? 0 : -EPERM;
> > > > > +               if (ns == cred->user_ns) {
> > > > > +                       if (cap_raised(cred->cap_effective, cap))
> > > > > +                               ret = 0;
> > > > > +                       else
> > > > > +                               ns = NULL;
> > > >
> > > > This is a bit unfortunate :( so maybe all we needed was `ns =
> > > > ns->parent` for that one use case, and keep the original `ret ? NULL :
> > > > ns` inside trace_cap_capable().
> > >
> > > Yeah, that would be fine with me.  Or maybe just doing
> > >
> > >         /* in case of an error, trace should show ns=NULL */
> > >         if (ret)
> > >                 ns = NULL;
> > >
> > > right above the trace_cap_capable() call would be clearer.
> >
> > I feel like having less trace specific logic in this function would be
> > a good thing,
> > so I'm for Andrii's suggestion of doing the ret check there but also
> > fine to do what security folks prefer :)
>
> I think a comment is needed to remind us (me) in 2 years why the
> > setting of ns to NULL is there.  But the comment of trace_cap_capable()
> probably suffices, so sure, go with Andrii's suggestion.  And then
>
> Reviewed-by: Serge Hallyn <serge@hallyn.com>
>
> for the capability code.
>
> thanks,
> -serge

I think we're suggesting to not set ns = NULL here and instead
check the ret value in the trace code e.g.
`__entry->capable_ns = ret ? NULL : capable_ns;`

I think the only trace-specific thing, which I can add a comment
for, is this part `ns = ns->parent;` after we already set the ret to 0.
Serge E. Hallyn Oct. 26, 2024, 1 p.m. UTC | #6
On Sat, Oct 26, 2024 at 07:22:29AM -0400, Jordan Rome wrote:
> On Sat, Oct 26, 2024 at 6:10 AM Serge E. Hallyn <serge@hallyn.com> wrote:
> >
> > On Fri, Oct 25, 2024 at 04:24:05PM -0400, Jordan Rome wrote:
> > > On Fri, Oct 25, 2024 at 3:52 PM Serge E. Hallyn <serge@hallyn.com> wrote:
> > > >
> > > > On Fri, Oct 25, 2024 at 11:37:59AM -0700, Andrii Nakryiko wrote:
> > > > > On Fri, Oct 25, 2024 at 8:15 AM Jordan Rome <linux@jordanrome.com> wrote:
> > > > > >
> > > > > > In cases where we want a stable way to observe/trace
> > > > > > cap_capable (e.g. protection from inlining and API updates)
> > > > > > add a tracepoint that passes:
> > > > > > - The credentials used
> > > > > > - The user namespace of the resource being accessed
> > > > > > - The user namespace that has the capability to access the
> > > > > > targeted resource
> > > > > > - The capability to check for
> > > > > > - Bitmask of options defined in include/linux/security.h
> > > > > > - The return value of the check
> > > > > >
> > > > > > Signed-off-by: Jordan Rome <linux@jordanrome.com>
> > > > > > ---
> > > > > >  MAINTAINERS                       |  1 +
> > > > > >  include/trace/events/capability.h | 60 +++++++++++++++++++++++++++++++
> > > > > >  security/commoncap.c              | 31 +++++++++++-----
> > > > > >  3 files changed, 84 insertions(+), 8 deletions(-)
> > > > > >  create mode 100644 include/trace/events/capability.h
> > > > > >
> > > > > > diff --git a/MAINTAINERS b/MAINTAINERS
> > > > > > index cc40a9d9b8cd..210e9076c858 100644
> > > > > > --- a/MAINTAINERS
> > > > > > +++ b/MAINTAINERS
> > > > > > @@ -4994,6 +4994,7 @@ M:        Serge Hallyn <serge@hallyn.com>
> > > > > >  L:     linux-security-module@vger.kernel.org
> > > > > >  S:     Supported
> > > > > >  F:     include/linux/capability.h
> > > > > > +F:     include/trace/events/capability.h
> > > > > >  F:     include/uapi/linux/capability.h
> > > > > >  F:     kernel/capability.c
> > > > > >  F:     security/commoncap.c
> > > > > > diff --git a/include/trace/events/capability.h b/include/trace/events/capability.h
> > > > > > new file mode 100644
> > > > > > index 000000000000..e706ce690c38
> > > > > > --- /dev/null
> > > > > > +++ b/include/trace/events/capability.h
> > > > > > @@ -0,0 +1,60 @@
> > > > > > +/* SPDX-License-Identifier: GPL-2.0 */
> > > > > > +#undef TRACE_SYSTEM
> > > > > > +#define TRACE_SYSTEM capability
> > > > > > +
> > > > > > +#if !defined(_TRACE_CAPABILITY_H) || defined(TRACE_HEADER_MULTI_READ)
> > > > > > +#define _TRACE_CAPABILITY_H
> > > > > > +
> > > > > > +#include <linux/cred.h>
> > > > > > +#include <linux/tracepoint.h>
> > > > > > +#include <linux/user_namespace.h>
> > > > > > +
> > > > > > +/**
> > > > > > + * cap_capable - called after it's determined if a task has a particular
> > > > > > + * effective capability
> > > > > > + *
> > > > > > + * @cred: The credentials used
> > > > > > + * @targ_ns: The user namespace of the resource being accessed
> > > > > > + * @capable_ns: The user namespace in which the credential provides the
> > > > > > + *              capability to access the targeted resource.
> > > > > > + *              This will be NULL if ret is not 0.
> > > > > > + * @cap: The capability to check for
> > > > > > + * @opts: Bitmask of options defined in include/linux/security.h
> > > > > > + * @ret: The return value of the check: 0 if it does, -ve if it does not
> > > > > > + *
> > > > > > + * Allows to trace calls to cap_capable in commoncap.c
> > > > > > + */
> > > > > > +TRACE_EVENT(cap_capable,
> > > > > > +
> > > > > > +       TP_PROTO(const struct cred *cred, struct user_namespace *targ_ns,
> > > > > > +               struct user_namespace *capable_ns, int cap, unsigned int opts, int ret),
> > > > > > +
> > > > > > +       TP_ARGS(cred, targ_ns, capable_ns, cap, opts, ret),
> > > > > > +
> > > > > > +       TP_STRUCT__entry(
> > > > > > +               __field(const struct cred *, cred)
> > > > > > +               __field(struct user_namespace *, targ_ns)
> > > > > > +               __field(struct user_namespace *, capable_ns)
> > > > > > +               __field(int, cap)
> > > > > > +               __field(unsigned int, opts)
> > > > > > +               __field(int, ret)
> > > > > > +       ),
> > > > > > +
> > > > > > +       TP_fast_assign(
> > > > > > +               __entry->cred       = cred;
> > > > > > +               __entry->targ_ns    = targ_ns;
> > > > > > +               __entry->capable_ns = capable_ns;
> > > > > > +               __entry->cap        = cap;
> > > > > > +               __entry->opts       = opts;
> > > > > > +               __entry->ret        = ret;
> > > > > > +       ),
> > > > > > +
> > > > > > +       TP_printk("cred %p, targ_ns %p, capable_ns %p, cap %d, opts %u, ret %d",
> > > > > > +               __entry->cred, __entry->targ_ns, __entry->capable_ns, __entry->cap,
> > > > > > +               __entry->opts, __entry->ret)
> > > > > > +);
> > > > > > +
> > > > > > +#endif /* _TRACE_CAPABILITY_H */
> > > > > > +
> > > > > > +/* This part must be outside protection */
> > > > > > +#include <trace/define_trace.h>
> > > > > > diff --git a/security/commoncap.c b/security/commoncap.c
> > > > > > index 162d96b3a676..12c3ddfe0d6e 100644
> > > > > > --- a/security/commoncap.c
> > > > > > +++ b/security/commoncap.c
> > > > > > @@ -27,6 +27,9 @@
> > > > > >  #include <linux/mnt_idmapping.h>
> > > > > >  #include <uapi/linux/lsm.h>
> > > > > >
> > > > > > +#define CREATE_TRACE_POINTS
> > > > > > +#include <trace/events/capability.h>
> > > > > > +
> > > > > >  /*
> > > > > >   * If a non-root user executes a setuid-root binary in
> > > > > >   * !secure(SECURE_NOROOT) mode, then we raise capabilities.
> > > > > > @@ -52,7 +55,7 @@ static void warn_setuid_and_fcaps_mixed(const char *fname)
> > > > > >  /**
> > > > > >   * cap_capable - Determine whether a task has a particular effective capability
> > > > > >   * @cred: The credentials to use
> > > > > > - * @targ_ns:  The user namespace in which we need the capability
> > > > > > + * @targ_ns:  The user namespace of the resource being accessed
> > > > > >   * @cap: The capability to check for
> > > > > >   * @opts: Bitmask of options defined in include/linux/security.h
> > > > > >   *
> > > > > > @@ -68,6 +71,7 @@ int cap_capable(const struct cred *cred, struct user_namespace *targ_ns,
> > > > > >                 int cap, unsigned int opts)
> > > > > >  {
> > > > > >         struct user_namespace *ns = targ_ns;
> > > > > > +       int ret = -EPERM;
> > > > > >
> > > > > >         /* See if cred has the capability in the target user namespace
> > > > > >          * by examining the target user namespace and all of the target
> > > > > > @@ -75,22 +79,32 @@ int cap_capable(const struct cred *cred, struct user_namespace *targ_ns,
> > > > > >          */
> > > > > >         for (;;) {
> > > > > >                 /* Do we have the necessary capabilities? */
> > > > > > -               if (ns == cred->user_ns)
> > > > > > -                       return cap_raised(cred->cap_effective, cap) ? 0 : -EPERM;
> > > > > > +               if (ns == cred->user_ns) {
> > > > > > +                       if (cap_raised(cred->cap_effective, cap))
> > > > > > +                               ret = 0;
> > > > > > +                       else
> > > > > > +                               ns = NULL;
> > > > >
> > > > > This is a bit unfortunate :( so maybe all we needed was `ns =
> > > > > ns->parent` for that one use case, and keep the original `ret ? NULL :
> > > > > ns` inside trace_cap_capable().
> > > >
> > > > Yeah, that would be fine with me.  Or maybe just doing
> > > >
> > > >         /* in case of an error, trace should show ns=NULL */
> > > >         if (ret)
> > > >                 ns = NULL;
> > > >
> > > > right above the trace_cap_capable() call would be clearer.
> > >
> > > I feel like having less trace specific logic in this function would be
> > > a good thing,
> > > so I'm for Andrii's suggestion of doing the ret check there but also
> > > fine to do what security folks prefer :)
> >
> > I think a comment is needed to remind us (me) in 2 years why the
> > > setting of ns to NULL is there.  But the comment of trace_cap_capable()
> > probably suffices, so sure, go with Andrii's suggestion.  And then
> >
> > Reviewed-by: Serge Hallyn <serge@hallyn.com>
> >
> > for the capability code.
> >
> > thanks,
> > -serge
> 
> I think we're suggesting to not set ns = NULL here and instead
> check the ret value in the trace code e.g.
> `__entry->capable_ns = ret ? NULL : capable_ns;`

Perfect.  Was originally going to suggest this, but then thought well
the rest of the ns logic is purely capability not tracing related.
But since the comment is in trace_cap_capable(), putting the assignment
there makes sense.

> I think the only trace-specific thing, which I can add a comment
> for, is this part `ns = ns->parent;` after we already set the ret to 0.
Jordan Rome Oct. 27, 2024, 6:21 p.m. UTC | #7
On Sat, Oct 26, 2024 at 9:00 AM Serge E. Hallyn <serge@hallyn.com> wrote:
>
> On Sat, Oct 26, 2024 at 07:22:29AM -0400, Jordan Rome wrote:
> > On Sat, Oct 26, 2024 at 6:10 AM Serge E. Hallyn <serge@hallyn.com> wrote:
> > >
> > > On Fri, Oct 25, 2024 at 04:24:05PM -0400, Jordan Rome wrote:
> > > > On Fri, Oct 25, 2024 at 3:52 PM Serge E. Hallyn <serge@hallyn.com> wrote:
> > > > >
> > > > > On Fri, Oct 25, 2024 at 11:37:59AM -0700, Andrii Nakryiko wrote:
> > > > > > On Fri, Oct 25, 2024 at 8:15 AM Jordan Rome <linux@jordanrome.com> wrote:
> > > > > > >
> > > > > > > In cases where we want a stable way to observe/trace
> > > > > > > cap_capable (e.g. protection from inlining and API updates)
> > > > > > > add a tracepoint that passes:
> > > > > > > - The credentials used
> > > > > > > - The user namespace of the resource being accessed
> > > > > > > - The user namespace that has the capability to access the
> > > > > > > targeted resource
> > > > > > > - The capability to check for
> > > > > > > - Bitmask of options defined in include/linux/security.h
> > > > > > > - The return value of the check
> > > > > > >
> > > > > > > Signed-off-by: Jordan Rome <linux@jordanrome.com>
> > > > > > > ---
> > > > > > >  MAINTAINERS                       |  1 +
> > > > > > >  include/trace/events/capability.h | 60 +++++++++++++++++++++++++++++++
> > > > > > >  security/commoncap.c              | 31 +++++++++++-----
> > > > > > >  3 files changed, 84 insertions(+), 8 deletions(-)
> > > > > > >  create mode 100644 include/trace/events/capability.h
> > > > > > >
> > > > > > > diff --git a/MAINTAINERS b/MAINTAINERS
> > > > > > > index cc40a9d9b8cd..210e9076c858 100644
> > > > > > > --- a/MAINTAINERS
> > > > > > > +++ b/MAINTAINERS
> > > > > > > @@ -4994,6 +4994,7 @@ M:        Serge Hallyn <serge@hallyn.com>
> > > > > > >  L:     linux-security-module@vger.kernel.org
> > > > > > >  S:     Supported
> > > > > > >  F:     include/linux/capability.h
> > > > > > > +F:     include/trace/events/capability.h
> > > > > > >  F:     include/uapi/linux/capability.h
> > > > > > >  F:     kernel/capability.c
> > > > > > >  F:     security/commoncap.c
> > > > > > > diff --git a/include/trace/events/capability.h b/include/trace/events/capability.h
> > > > > > > new file mode 100644
> > > > > > > index 000000000000..e706ce690c38
> > > > > > > --- /dev/null
> > > > > > > +++ b/include/trace/events/capability.h
> > > > > > > @@ -0,0 +1,60 @@
> > > > > > > +/* SPDX-License-Identifier: GPL-2.0 */
> > > > > > > +#undef TRACE_SYSTEM
> > > > > > > +#define TRACE_SYSTEM capability
> > > > > > > +
> > > > > > > +#if !defined(_TRACE_CAPABILITY_H) || defined(TRACE_HEADER_MULTI_READ)
> > > > > > > +#define _TRACE_CAPABILITY_H
> > > > > > > +
> > > > > > > +#include <linux/cred.h>
> > > > > > > +#include <linux/tracepoint.h>
> > > > > > > +#include <linux/user_namespace.h>
> > > > > > > +
> > > > > > > +/**
> > > > > > > + * cap_capable - called after it's determined if a task has a particular
> > > > > > > + * effective capability
> > > > > > > + *
> > > > > > > + * @cred: The credentials used
> > > > > > > + * @targ_ns: The user namespace of the resource being accessed
> > > > > > > + * @capable_ns: The user namespace in which the credential provides the
> > > > > > > + *              capability to access the targeted resource.
> > > > > > > + *              This will be NULL if ret is not 0.
> > > > > > > + * @cap: The capability to check for
> > > > > > > + * @opts: Bitmask of options defined in include/linux/security.h
> > > > > > > + * @ret: The return value of the check: 0 if it does, -ve if it does not
> > > > > > > + *
> > > > > > > + * Allows to trace calls to cap_capable in commoncap.c
> > > > > > > + */
> > > > > > > +TRACE_EVENT(cap_capable,
> > > > > > > +
> > > > > > > +       TP_PROTO(const struct cred *cred, struct user_namespace *targ_ns,
> > > > > > > +               struct user_namespace *capable_ns, int cap, unsigned int opts, int ret),
> > > > > > > +
> > > > > > > +       TP_ARGS(cred, targ_ns, capable_ns, cap, opts, ret),
> > > > > > > +
> > > > > > > +       TP_STRUCT__entry(
> > > > > > > +               __field(const struct cred *, cred)
> > > > > > > +               __field(struct user_namespace *, targ_ns)
> > > > > > > +               __field(struct user_namespace *, capable_ns)
> > > > > > > +               __field(int, cap)
> > > > > > > +               __field(unsigned int, opts)
> > > > > > > +               __field(int, ret)
> > > > > > > +       ),
> > > > > > > +
> > > > > > > +       TP_fast_assign(
> > > > > > > +               __entry->cred       = cred;
> > > > > > > +               __entry->targ_ns    = targ_ns;
> > > > > > > +               __entry->capable_ns = capable_ns;
> > > > > > > +               __entry->cap        = cap;
> > > > > > > +               __entry->opts       = opts;
> > > > > > > +               __entry->ret        = ret;
> > > > > > > +       ),
> > > > > > > +
> > > > > > > +       TP_printk("cred %p, targ_ns %p, capable_ns %p, cap %d, opts %u, ret %d",
> > > > > > > +               __entry->cred, __entry->targ_ns, __entry->capable_ns, __entry->cap,
> > > > > > > +               __entry->opts, __entry->ret)
> > > > > > > +);
> > > > > > > +
> > > > > > > +#endif /* _TRACE_CAPABILITY_H */
> > > > > > > +
> > > > > > > +/* This part must be outside protection */
> > > > > > > +#include <trace/define_trace.h>
> > > > > > > diff --git a/security/commoncap.c b/security/commoncap.c
> > > > > > > index 162d96b3a676..12c3ddfe0d6e 100644
> > > > > > > --- a/security/commoncap.c
> > > > > > > +++ b/security/commoncap.c
> > > > > > > @@ -27,6 +27,9 @@
> > > > > > >  #include <linux/mnt_idmapping.h>
> > > > > > >  #include <uapi/linux/lsm.h>
> > > > > > >
> > > > > > > +#define CREATE_TRACE_POINTS
> > > > > > > +#include <trace/events/capability.h>
> > > > > > > +
> > > > > > >  /*
> > > > > > >   * If a non-root user executes a setuid-root binary in
> > > > > > >   * !secure(SECURE_NOROOT) mode, then we raise capabilities.
> > > > > > > @@ -52,7 +55,7 @@ static void warn_setuid_and_fcaps_mixed(const char *fname)
> > > > > > >  /**
> > > > > > >   * cap_capable - Determine whether a task has a particular effective capability
> > > > > > >   * @cred: The credentials to use
> > > > > > > - * @targ_ns:  The user namespace in which we need the capability
> > > > > > > + * @targ_ns:  The user namespace of the resource being accessed
> > > > > > >   * @cap: The capability to check for
> > > > > > >   * @opts: Bitmask of options defined in include/linux/security.h
> > > > > > >   *
> > > > > > > @@ -68,6 +71,7 @@ int cap_capable(const struct cred *cred, struct user_namespace *targ_ns,
> > > > > > >                 int cap, unsigned int opts)
> > > > > > >  {
> > > > > > >         struct user_namespace *ns = targ_ns;
> > > > > > > +       int ret = -EPERM;
> > > > > > >
> > > > > > >         /* See if cred has the capability in the target user namespace
> > > > > > >          * by examining the target user namespace and all of the target
> > > > > > > @@ -75,22 +79,32 @@ int cap_capable(const struct cred *cred, struct user_namespace *targ_ns,
> > > > > > >          */
> > > > > > >         for (;;) {
> > > > > > >                 /* Do we have the necessary capabilities? */
> > > > > > > -               if (ns == cred->user_ns)
> > > > > > > -                       return cap_raised(cred->cap_effective, cap) ? 0 : -EPERM;
> > > > > > > +               if (ns == cred->user_ns) {
> > > > > > > +                       if (cap_raised(cred->cap_effective, cap))
> > > > > > > +                               ret = 0;
> > > > > > > +                       else
> > > > > > > +                               ns = NULL;
> > > > > >
> > > > > > This is a bit unfortunate :( so maybe all we needed was `ns =
> > > > > > ns->parent` for that one use case, and keep the original `ret ? NULL :
> > > > > > ns` inside trace_cap_capable().
> > > > >
> > > > > Yeah, that would be fine with me.  Or maybe just doing
> > > > >
> > > > >         /* in case of an error, trace should show ns=NULL */
> > > > >         if (ret)
> > > > >                 ns = NULL;
> > > > >
> > > > > right above the trace_cap_capable() call would be clearer.
> > > >
> > > > I feel like having less trace specific logic in this function would be
> > > > a good thing,
> > > > so I'm for Andrii's suggestion of doing the ret check there but also
> > > > fine to do what security folks prefer :)
> > >
> > > I think a comment is needed to remind us (me) in 2 years why the
> > > > setting of ns to NULL is there.  But the comment of trace_cap_capable()
> > > probably suffices, so sure, go with Andrii's suggestion.  And then
> > >
> > > Reviewed-by: Serge Hallyn <serge@hallyn.com>
> > >
> > > for the capability code.
> > >
> > > thanks,
> > > -serge
> >
> > I think we're suggesting to not set ns = NULL here and instead
> > check the ret value in the trace code e.g.
> > `__entry->capable_ns = ret ? NULL : capable_ns;`
>
> Perfect.  Was originally going to suggest this, but then thought well
> the rest of the ns logic is purely capability not tracing related.
> But since the comment is in trace_cap_capable(), putting the assignment
> there makes sense.
>

Actually, I had another idea. What about just having a separate
variable in the `cap_capable` function for `capable_ns` that only gets
set if ret is 0? Then we're not changing the `ns` variable at all for
the purposes of the trace function.

> > I think the only trace-specific thing, which I can add a comment
> > for, is this part `ns = ns->parent;` after we already set the ret to 0.
sergeh@kernel.org Oct. 28, 2024, 4:51 p.m. UTC | #8
On Sun, Oct 27, 2024 at 02:21:26PM -0400, Jordan Rome wrote:
> On Sat, Oct 26, 2024 at 9:00 AM Serge E. Hallyn <serge@hallyn.com> wrote:
> >
> > On Sat, Oct 26, 2024 at 07:22:29AM -0400, Jordan Rome wrote:
> > > On Sat, Oct 26, 2024 at 6:10 AM Serge E. Hallyn <serge@hallyn.com> wrote:
> > > >
> > > > On Fri, Oct 25, 2024 at 04:24:05PM -0400, Jordan Rome wrote:
> > > > > On Fri, Oct 25, 2024 at 3:52 PM Serge E. Hallyn <serge@hallyn.com> wrote:
> > > > > >
> > > > > > On Fri, Oct 25, 2024 at 11:37:59AM -0700, Andrii Nakryiko wrote:
> > > > > > > On Fri, Oct 25, 2024 at 8:15 AM Jordan Rome <linux@jordanrome.com> wrote:
> > > > > > > >
> > > > > > > > In cases where we want a stable way to observe/trace
> > > > > > > > cap_capable (e.g. protection from inlining and API updates)
> > > > > > > > add a tracepoint that passes:
> > > > > > > > - The credentials used
> > > > > > > > - The user namespace of the resource being accessed
> > > > > > > > - The user namespace that has the capability to access the
> > > > > > > > targeted resource
> > > > > > > > - The capability to check for
> > > > > > > > - Bitmask of options defined in include/linux/security.h
> > > > > > > > - The return value of the check
> > > > > > > >
> > > > > > > > Signed-off-by: Jordan Rome <linux@jordanrome.com>
> > > > > > > > ---
> > > > > > > >  MAINTAINERS                       |  1 +
> > > > > > > >  include/trace/events/capability.h | 60 +++++++++++++++++++++++++++++++
> > > > > > > >  security/commoncap.c              | 31 +++++++++++-----
> > > > > > > >  3 files changed, 84 insertions(+), 8 deletions(-)
> > > > > > > >  create mode 100644 include/trace/events/capability.h
> > > > > > > >
> > > > > > > > diff --git a/MAINTAINERS b/MAINTAINERS
> > > > > > > > index cc40a9d9b8cd..210e9076c858 100644
> > > > > > > > --- a/MAINTAINERS
> > > > > > > > +++ b/MAINTAINERS
> > > > > > > > @@ -4994,6 +4994,7 @@ M:        Serge Hallyn <serge@hallyn.com>
> > > > > > > >  L:     linux-security-module@vger.kernel.org
> > > > > > > >  S:     Supported
> > > > > > > >  F:     include/linux/capability.h
> > > > > > > > +F:     include/trace/events/capability.h
> > > > > > > >  F:     include/uapi/linux/capability.h
> > > > > > > >  F:     kernel/capability.c
> > > > > > > >  F:     security/commoncap.c
> > > > > > > > diff --git a/include/trace/events/capability.h b/include/trace/events/capability.h
> > > > > > > > new file mode 100644
> > > > > > > > index 000000000000..e706ce690c38
> > > > > > > > --- /dev/null
> > > > > > > > +++ b/include/trace/events/capability.h
> > > > > > > > @@ -0,0 +1,60 @@
> > > > > > > > +/* SPDX-License-Identifier: GPL-2.0 */
> > > > > > > > +#undef TRACE_SYSTEM
> > > > > > > > +#define TRACE_SYSTEM capability
> > > > > > > > +
> > > > > > > > +#if !defined(_TRACE_CAPABILITY_H) || defined(TRACE_HEADER_MULTI_READ)
> > > > > > > > +#define _TRACE_CAPABILITY_H
> > > > > > > > +
> > > > > > > > +#include <linux/cred.h>
> > > > > > > > +#include <linux/tracepoint.h>
> > > > > > > > +#include <linux/user_namespace.h>
> > > > > > > > +
> > > > > > > > +/**
> > > > > > > > + * cap_capable - called after it's determined if a task has a particular
> > > > > > > > + * effective capability
> > > > > > > > + *
> > > > > > > > + * @cred: The credentials used
> > > > > > > > + * @targ_ns: The user namespace of the resource being accessed
> > > > > > > > + * @capable_ns: The user namespace in which the credential provides the
> > > > > > > > + *              capability to access the targeted resource.
> > > > > > > > + *              This will be NULL if ret is not 0.
> > > > > > > > + * @cap: The capability to check for
> > > > > > > > + * @opts: Bitmask of options defined in include/linux/security.h
> > > > > > > > + * @ret: The return value of the check: 0 if it does, -ve if it does not
> > > > > > > > + *
> > > > > > > > + * Allows to trace calls to cap_capable in commoncap.c
> > > > > > > > + */
> > > > > > > > +TRACE_EVENT(cap_capable,
> > > > > > > > +
> > > > > > > > +       TP_PROTO(const struct cred *cred, struct user_namespace *targ_ns,
> > > > > > > > +               struct user_namespace *capable_ns, int cap, unsigned int opts, int ret),
> > > > > > > > +
> > > > > > > > +       TP_ARGS(cred, targ_ns, capable_ns, cap, opts, ret),
> > > > > > > > +
> > > > > > > > +       TP_STRUCT__entry(
> > > > > > > > +               __field(const struct cred *, cred)
> > > > > > > > +               __field(struct user_namespace *, targ_ns)
> > > > > > > > +               __field(struct user_namespace *, capable_ns)
> > > > > > > > +               __field(int, cap)
> > > > > > > > +               __field(unsigned int, opts)
> > > > > > > > +               __field(int, ret)
> > > > > > > > +       ),
> > > > > > > > +
> > > > > > > > +       TP_fast_assign(
> > > > > > > > +               __entry->cred       = cred;
> > > > > > > > +               __entry->targ_ns    = targ_ns;
> > > > > > > > +               __entry->capable_ns = capable_ns;
> > > > > > > > +               __entry->cap        = cap;
> > > > > > > > +               __entry->opts       = opts;
> > > > > > > > +               __entry->ret        = ret;
> > > > > > > > +       ),
> > > > > > > > +
> > > > > > > > +       TP_printk("cred %p, targ_ns %p, capable_ns %p, cap %d, opts %u, ret %d",
> > > > > > > > +               __entry->cred, __entry->targ_ns, __entry->capable_ns, __entry->cap,
> > > > > > > > +               __entry->opts, __entry->ret)
> > > > > > > > +);
> > > > > > > > +
> > > > > > > > +#endif /* _TRACE_CAPABILITY_H */
> > > > > > > > +
> > > > > > > > +/* This part must be outside protection */
> > > > > > > > +#include <trace/define_trace.h>
> > > > > > > > diff --git a/security/commoncap.c b/security/commoncap.c
> > > > > > > > index 162d96b3a676..12c3ddfe0d6e 100644
> > > > > > > > --- a/security/commoncap.c
> > > > > > > > +++ b/security/commoncap.c
> > > > > > > > @@ -27,6 +27,9 @@
> > > > > > > >  #include <linux/mnt_idmapping.h>
> > > > > > > >  #include <uapi/linux/lsm.h>
> > > > > > > >
> > > > > > > > +#define CREATE_TRACE_POINTS
> > > > > > > > +#include <trace/events/capability.h>
> > > > > > > > +
> > > > > > > >  /*
> > > > > > > >   * If a non-root user executes a setuid-root binary in
> > > > > > > >   * !secure(SECURE_NOROOT) mode, then we raise capabilities.
> > > > > > > > @@ -52,7 +55,7 @@ static void warn_setuid_and_fcaps_mixed(const char *fname)
> > > > > > > >  /**
> > > > > > > >   * cap_capable - Determine whether a task has a particular effective capability
> > > > > > > >   * @cred: The credentials to use
> > > > > > > > - * @targ_ns:  The user namespace in which we need the capability
> > > > > > > > + * @targ_ns:  The user namespace of the resource being accessed
> > > > > > > >   * @cap: The capability to check for
> > > > > > > >   * @opts: Bitmask of options defined in include/linux/security.h
> > > > > > > >   *
> > > > > > > > @@ -68,6 +71,7 @@ int cap_capable(const struct cred *cred, struct user_namespace *targ_ns,
> > > > > > > >                 int cap, unsigned int opts)
> > > > > > > >  {
> > > > > > > >         struct user_namespace *ns = targ_ns;
> > > > > > > > +       int ret = -EPERM;
> > > > > > > >
> > > > > > > >         /* See if cred has the capability in the target user namespace
> > > > > > > >          * by examining the target user namespace and all of the target
> > > > > > > > @@ -75,22 +79,32 @@ int cap_capable(const struct cred *cred, struct user_namespace *targ_ns,
> > > > > > > >          */
> > > > > > > >         for (;;) {
> > > > > > > >                 /* Do we have the necessary capabilities? */
> > > > > > > > -               if (ns == cred->user_ns)
> > > > > > > > -                       return cap_raised(cred->cap_effective, cap) ? 0 : -EPERM;
> > > > > > > > +               if (ns == cred->user_ns) {
> > > > > > > > +                       if (cap_raised(cred->cap_effective, cap))
> > > > > > > > +                               ret = 0;
> > > > > > > > +                       else
> > > > > > > > +                               ns = NULL;
> > > > > > >
> > > > > > > This is a bit unfortunate :( so maybe all we needed was `ns =
> > > > > > > ns->parent` for that one use case, and keep the original `ret ? NULL :
> > > > > > > ns` inside trace_cap_capable().
> > > > > >
> > > > > > Yeah, that would be fine with me.  Or maybe just doing
> > > > > >
> > > > > >         /* in case of an error, trace should show ns=NULL */
> > > > > >         if (ret)
> > > > > >                 ns = NULL;
> > > > > >
> > > > > > right above the trace_cap_capable() call would be clearer.
> > > > >
> > > > > I feel like having less trace-specific logic in this function would be
> > > > > a good thing,
> > > > > so I'm for Andrii's suggestion of doing the ret check there but also
> > > > > fine to do what security folks prefer :)
> > > >
> > > > I think a comment is needed to remind us (me) in 2 years why the
> > > > setting of ns to NULL is there.  But the comment of trace_cap_capable()
> > > > probably suffices, so sure, go with Andrii's suggestion.  And then
> > > >
> > > > Reviewed-by: Serge Hallyn <serge@hallyn.com>
> > > >
> > > > for the capability code.
> > > >
> > > > thanks,
> > > > -serge
> > >
> > > I think we're suggesting to not set ns = NULL here and instead
> > > check the ret value in the trace code e.g.
> > > `__entry->capable_ns = ret ? NULL : capable_ns;`
> >
> > Perfect.  Was originally going to suggest this, but then thought well
> > the rest of the ns logic is purely capability not tracing related.
> > But since the comment is in trace_cap_capable(), putting the assignment
> > there makes sense.
> >
> 
> Actually, I had another idea. What about just having a separate
> variable in the `cap_capable` function for `capable_ns` that only gets
> set if ret is 0? Then we're not changing the `ns` variable at all for
> the purposes of the trace function.

FWIW that sounds great.

-serge
diff mbox series

Patch

diff --git a/MAINTAINERS b/MAINTAINERS
index cc40a9d9b8cd..210e9076c858 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4994,6 +4994,7 @@  M:	Serge Hallyn <serge@hallyn.com>
 L:	linux-security-module@vger.kernel.org
 S:	Supported
 F:	include/linux/capability.h
+F:	include/trace/events/capability.h
 F:	include/uapi/linux/capability.h
 F:	kernel/capability.c
 F:	security/commoncap.c
diff --git a/include/trace/events/capability.h b/include/trace/events/capability.h
new file mode 100644
index 000000000000..e706ce690c38
--- /dev/null
+++ b/include/trace/events/capability.h
@@ -0,0 +1,60 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM capability
+
+#if !defined(_TRACE_CAPABILITY_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_CAPABILITY_H
+
+#include <linux/cred.h>
+#include <linux/tracepoint.h>
+#include <linux/user_namespace.h>
+
+/**
+ * cap_capable - called after it's determined if a task has a particular
+ * effective capability
+ *
+ * @cred: The credentials used
+ * @targ_ns: The user namespace of the resource being accessed
+ * @capable_ns: The user namespace in which the credential provides the
+ *              capability to access the targeted resource.
+ *              This will be NULL if ret is not 0.
+ * @cap: The capability to check for
+ * @opts: Bitmask of options defined in include/linux/security.h
+ * @ret: The return value of the check: 0 if it does, -ve if it does not
+ *
+ * Allows to trace calls to cap_capable in commoncap.c
+ */
+TRACE_EVENT(cap_capable,
+
+	TP_PROTO(const struct cred *cred, struct user_namespace *targ_ns,
+		struct user_namespace *capable_ns, int cap, unsigned int opts, int ret),
+
+	TP_ARGS(cred, targ_ns, capable_ns, cap, opts, ret),
+
+	TP_STRUCT__entry(
+		__field(const struct cred *, cred)
+		__field(struct user_namespace *, targ_ns)
+		__field(struct user_namespace *, capable_ns)
+		__field(int, cap)
+		__field(unsigned int, opts)
+		__field(int, ret)
+	),
+
+	TP_fast_assign(
+		__entry->cred       = cred;
+		__entry->targ_ns    = targ_ns;
+		__entry->capable_ns = capable_ns;
+		__entry->cap        = cap;
+		__entry->opts       = opts;
+		__entry->ret        = ret;
+	),
+
+	TP_printk("cred %p, targ_ns %p, capable_ns %p, cap %d, opts %u, ret %d",
+		__entry->cred, __entry->targ_ns, __entry->capable_ns, __entry->cap,
+		__entry->opts, __entry->ret)
+);
+
+#endif /* _TRACE_CAPABILITY_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/security/commoncap.c b/security/commoncap.c
index 162d96b3a676..12c3ddfe0d6e 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -27,6 +27,9 @@ 
 #include <linux/mnt_idmapping.h>
 #include <uapi/linux/lsm.h>

+#define CREATE_TRACE_POINTS
+#include <trace/events/capability.h>
+
 /*
  * If a non-root user executes a setuid-root binary in
  * !secure(SECURE_NOROOT) mode, then we raise capabilities.
@@ -52,7 +55,7 @@  static void warn_setuid_and_fcaps_mixed(const char *fname)
 /**
  * cap_capable - Determine whether a task has a particular effective capability
  * @cred: The credentials to use
- * @targ_ns:  The user namespace in which we need the capability
+ * @targ_ns:  The user namespace of the resource being accessed
  * @cap: The capability to check for
  * @opts: Bitmask of options defined in include/linux/security.h
  *
@@ -68,6 +71,7 @@  int cap_capable(const struct cred *cred, struct user_namespace *targ_ns,
 		int cap, unsigned int opts)
 {
 	struct user_namespace *ns = targ_ns;
+	int ret = -EPERM;

 	/* See if cred has the capability in the target user namespace
 	 * by examining the target user namespace and all of the target
@@ -75,22 +79,32 @@  int cap_capable(const struct cred *cred, struct user_namespace *targ_ns,
 	 */
 	for (;;) {
 		/* Do we have the necessary capabilities? */
-		if (ns == cred->user_ns)
-			return cap_raised(cred->cap_effective, cap) ? 0 : -EPERM;
+		if (ns == cred->user_ns) {
+			if (cap_raised(cred->cap_effective, cap))
+				ret = 0;
+			else
+				ns = NULL;
+			break;
+		}

 		/*
 		 * If we're already at a lower level than we're looking for,
 		 * we're done searching.
 		 */
-		if (ns->level <= cred->user_ns->level)
-			return -EPERM;
+		if (ns->level <= cred->user_ns->level) {
+			ns = NULL;
+			break;
+		}

 		/*
 		 * The owner of the user namespace in the parent of the
 		 * user namespace has all caps.
 		 */
-		if ((ns->parent == cred->user_ns) && uid_eq(ns->owner, cred->euid))
-			return 0;
+		if ((ns->parent == cred->user_ns) && uid_eq(ns->owner, cred->euid)) {
+			ns = ns->parent;
+			ret = 0;
+			break;
+		}

 		/*
 		 * If you have a capability in a parent user ns, then you have
@@ -99,7 +113,8 @@  int cap_capable(const struct cred *cred, struct user_namespace *targ_ns,
 		ns = ns->parent;
 	}

-	/* We never get here */
+	trace_cap_capable(cred, targ_ns, ns, cap, opts, ret);
+	return ret;
 }

 /**