diff mbox

devpts: Add ptmx_uid and ptmx_gid options

Message ID b321c0c2729d1c2a72aea319b077dce7afd79698.1424480579.git.luto@amacapital.net (mailing list archive)
State New, archived
Headers show

Commit Message

Andy Lutomirski Feb. 21, 2015, 1:04 a.m. UTC
It's currently impossible to mount devpts in a user namespace that
has no root user, since ptmx can't be created.  This adds options
ptmxuid and ptmxgid that override the default uid and gid of 0.

These options are not shown in mountinfo because they have no effect
other than setting the initial ownership of ptmx, and, in particular, it
wouldn't make any sense to change them on remount.  Instead, we
disallow them on remount.

This could be changed, but we'd probably want to fix the userns
behavior of uid and gid at the same time if we did so.

Signed-off-by: Andy Lutomirski <luto@amacapital.net>
---
 Documentation/filesystems/devpts.txt |  4 +++
 fs/devpts/inode.c                    | 58 ++++++++++++++++++++++++++----------
 2 files changed, 46 insertions(+), 16 deletions(-)

Comments

Andy Lutomirski March 26, 2015, 7:29 p.m. UTC | #1
Ping?  It's been over a month.

On Fri, Feb 20, 2015 at 5:04 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> It's currently impossible to mount devpts in a user namespace that
> has no root user, since ptmx can't be created.  This adds options
> ptmx_uid and ptmx_gid that override the default uid and gid of 0.
>
> These options are not shown in mountinfo because they have no effect
> other than changing the initial mode of ptmx, and, in particular, it
> wouldn't make any sense to change them on remount.  Instead, we
> disallow them on remount.
>
> This could be changed, but we'd probably want to fix the userns
> behavior of uid and gid at the same time if we did so.
>
> Signed-off-by: Andy Lutomirski <luto@amacapital.net>
> ---
>  Documentation/filesystems/devpts.txt |  4 +++
>  fs/devpts/inode.c                    | 58 ++++++++++++++++++++++++++----------
>  2 files changed, 46 insertions(+), 16 deletions(-)
>
> diff --git a/Documentation/filesystems/devpts.txt b/Documentation/filesystems/devpts.txt
> index 68dffd87f9b7..7808e77d0d72 100644
> --- a/Documentation/filesystems/devpts.txt
> +++ b/Documentation/filesystems/devpts.txt
> @@ -121,6 +121,10 @@ once), following user-space issues should be noted.
>
>         chmod 666 /dev/pts/ptmx
>
> +   The ownership for /dev/pts/ptmx can be specified using the ptmxuid
> +   and ptmxgid options.  Both default to zero, which, in user namespaces
> +   that have no root user, will cause mounting to fail.
> +
>  7. A mount of devpts without the 'newinstance' option results in binding to
>     initial kernel mount.  This behavior while preserving legacy semantics,
>     does not provide strict isolation in a container environment. i.e by
> diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
> index cfe8466f7fef..b60d1438c660 100644
> --- a/fs/devpts/inode.c
> +++ b/fs/devpts/inode.c
> @@ -102,6 +102,8 @@ struct pts_mount_opts {
>         int setgid;
>         kuid_t   uid;
>         kgid_t   gid;
> +       uid_t ptmx_uid;
> +       gid_t ptmx_gid;
>         umode_t mode;
>         umode_t ptmxmode;
>         int newinstance;
> @@ -109,8 +111,8 @@ struct pts_mount_opts {
>  };
>
>  enum {
> -       Opt_uid, Opt_gid, Opt_mode, Opt_ptmxmode, Opt_newinstance,  Opt_max,
> -       Opt_err
> +       Opt_uid, Opt_gid, Opt_ptmx_uid, Opt_ptmx_gid, Opt_mode, Opt_ptmxmode,
> +       Opt_newinstance,  Opt_max, Opt_err,
>  };
>
>  static const match_table_t tokens = {
> @@ -118,6 +120,8 @@ static const match_table_t tokens = {
>         {Opt_gid, "gid=%u"},
>         {Opt_mode, "mode=%o"},
>  #ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
> +       {Opt_ptmx_uid, "ptmxuid=%u"},
> +       {Opt_ptmx_gid, "ptmxgid=%u"},
>         {Opt_ptmxmode, "ptmxmode=%o"},
>         {Opt_newinstance, "newinstance"},
>         {Opt_max, "max=%d"},
> @@ -162,14 +166,17 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)
>         char *p;
>         kuid_t uid;
>         kgid_t gid;
> -
> -       opts->setuid  = 0;
> -       opts->setgid  = 0;
> -       opts->uid     = GLOBAL_ROOT_UID;
> -       opts->gid     = GLOBAL_ROOT_GID;
> -       opts->mode    = DEVPTS_DEFAULT_MODE;
> +       bool setptmxid = false;
> +
> +       opts->setuid   = 0;
> +       opts->setgid   = 0;
> +       opts->uid      = GLOBAL_ROOT_UID;
> +       opts->gid      = GLOBAL_ROOT_GID;
> +       opts->ptmx_uid = 0;
> +       opts->ptmx_gid = 0;
> +       opts->mode     = DEVPTS_DEFAULT_MODE;
>         opts->ptmxmode = DEVPTS_DEFAULT_PTMX_MODE;
> -       opts->max     = NR_UNIX98_PTY_MAX;
> +       opts->max      = NR_UNIX98_PTY_MAX;
>
>         /* newinstance makes sense only on initial mount */
>         if (op == PARSE_MOUNT)
> @@ -209,6 +216,22 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)
>                         opts->mode = option & S_IALLUGO;
>                         break;
>  #ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
> +               case Opt_ptmx_uid:
> +                       if (match_int(&args[0], &option))
> +                               return -EINVAL;
> +                       if (op != PARSE_MOUNT)
> +                               return -EINVAL;
> +                       opts->ptmx_uid = option;
> +                       setptmxid = true;
> +                       break;
> +               case Opt_ptmx_gid:
> +                       if (match_int(&args[0], &option))
> +                               return -EINVAL;
> +                       if (op != PARSE_MOUNT)
> +                               return -EINVAL;
> +                       opts->ptmx_gid = option;
> +                       setptmxid = true;
> +                       break;
>                 case Opt_ptmxmode:
>                         if (match_octal(&args[0], &option))
>                                 return -EINVAL;
> @@ -232,6 +255,9 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)
>                 }
>         }
>
> +       if (setptmxid && !opts->newinstance)
> +               return -EINVAL;
> +
>         return 0;
>  }
>
> @@ -245,12 +271,12 @@ static int mknod_ptmx(struct super_block *sb)
>         struct dentry *root = sb->s_root;
>         struct pts_fs_info *fsi = DEVPTS_SB(sb);
>         struct pts_mount_opts *opts = &fsi->mount_opts;
> -       kuid_t root_uid;
> -       kgid_t root_gid;
> +       kuid_t ptmx_uid;
> +       kgid_t ptmx_gid;
>
> -       root_uid = make_kuid(current_user_ns(), 0);
> -       root_gid = make_kgid(current_user_ns(), 0);
> -       if (!uid_valid(root_uid) || !gid_valid(root_gid))
> +       ptmx_uid = make_kuid(current_user_ns(), opts->ptmx_uid);
> +       ptmx_gid = make_kgid(current_user_ns(), opts->ptmx_gid);
> +       if (!uid_valid(ptmx_uid) || !gid_valid(ptmx_gid))
>                 return -EINVAL;
>
>         mutex_lock(&root->d_inode->i_mutex);
> @@ -282,8 +308,8 @@ static int mknod_ptmx(struct super_block *sb)
>
>         mode = S_IFCHR|opts->ptmxmode;
>         init_special_inode(inode, mode, MKDEV(TTYAUX_MAJOR, 2));
> -       inode->i_uid = root_uid;
> -       inode->i_gid = root_gid;
> +       inode->i_uid = ptmx_uid;
> +       inode->i_gid = ptmx_gid;
>
>         d_add(dentry, inode);
>
> --
> 2.3.0
>
James Bottomley March 27, 2015, 9:03 a.m. UTC | #2
On Thu, 2015-03-26 at 12:29 -0700, Andy Lutomirski wrote:
> Ping?  It's been over a month.

I think we all looked at this and thought "that's not a problem".  The
reason is that we all bring up full OS containers with devpts already
mounted by the host.  Even when you run from init in the Container, the
OS always seems to be prepared to find devpts already mounted.  Can you
explain a bit more what the actual problem you're trying to solve is?  I
think also I'm a bit dubious about allowing this type of flexibility
because the more latitude container root has, the more scope there is
for a security problem.

> On Fri, Feb 20, 2015 at 5:04 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> > It's currently impossible to mount devpts in a user namespace that
> > has no root user, since ptmx can't be created.

This is where I stopped reading because it's not true ... because it is
possible, you just do it from the host as real root.

James

>   This adds options
> > ptmx_uid and ptmx_gid that override the default uid and gid of 0.
> >
> > These options are not shown in mountinfo because they have no effect
> > other than changing the initial mode of ptmx, and, in particular, it
> > wouldn't make any sense to change them on remount.  Instead, we
> > disallow them on remount.
> >
> > This could be changed, but we'd probably want to fix the userns
> > behavior of uid and gid at the same time if we did so.
> >
> > Signed-off-by: Andy Lutomirski <luto@amacapital.net>
> > ---
> >  Documentation/filesystems/devpts.txt |  4 +++
> >  fs/devpts/inode.c                    | 58 ++++++++++++++++++++++++++----------
> >  2 files changed, 46 insertions(+), 16 deletions(-)
> >
> > diff --git a/Documentation/filesystems/devpts.txt b/Documentation/filesystems/devpts.txt
> > index 68dffd87f9b7..7808e77d0d72 100644
> > --- a/Documentation/filesystems/devpts.txt
> > +++ b/Documentation/filesystems/devpts.txt
> > @@ -121,6 +121,10 @@ once), following user-space issues should be noted.
> >
> >         chmod 666 /dev/pts/ptmx
> >
> > +   The ownership for /dev/pts/ptmx can be specified using the ptmxuid
> > +   and ptmxgid options.  Both default to zero, which, in user namespaces
> > +   that have no root user, will cause mounting to fail.
> > +
> >  7. A mount of devpts without the 'newinstance' option results in binding to
> >     initial kernel mount.  This behavior while preserving legacy semantics,
> >     does not provide strict isolation in a container environment. i.e by
> > diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
> > index cfe8466f7fef..b60d1438c660 100644
> > --- a/fs/devpts/inode.c
> > +++ b/fs/devpts/inode.c
> > @@ -102,6 +102,8 @@ struct pts_mount_opts {
> >         int setgid;
> >         kuid_t   uid;
> >         kgid_t   gid;
> > +       uid_t ptmx_uid;
> > +       gid_t ptmx_gid;
> >         umode_t mode;
> >         umode_t ptmxmode;
> >         int newinstance;
> > @@ -109,8 +111,8 @@ struct pts_mount_opts {
> >  };
> >
> >  enum {
> > -       Opt_uid, Opt_gid, Opt_mode, Opt_ptmxmode, Opt_newinstance,  Opt_max,
> > -       Opt_err
> > +       Opt_uid, Opt_gid, Opt_ptmx_uid, Opt_ptmx_gid, Opt_mode, Opt_ptmxmode,
> > +       Opt_newinstance,  Opt_max, Opt_err,
> >  };
> >
> >  static const match_table_t tokens = {
> > @@ -118,6 +120,8 @@ static const match_table_t tokens = {
> >         {Opt_gid, "gid=%u"},
> >         {Opt_mode, "mode=%o"},
> >  #ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
> > +       {Opt_ptmx_uid, "ptmxuid=%u"},
> > +       {Opt_ptmx_gid, "ptmxgid=%u"},
> >         {Opt_ptmxmode, "ptmxmode=%o"},
> >         {Opt_newinstance, "newinstance"},
> >         {Opt_max, "max=%d"},
> > @@ -162,14 +166,17 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)
> >         char *p;
> >         kuid_t uid;
> >         kgid_t gid;
> > -
> > -       opts->setuid  = 0;
> > -       opts->setgid  = 0;
> > -       opts->uid     = GLOBAL_ROOT_UID;
> > -       opts->gid     = GLOBAL_ROOT_GID;
> > -       opts->mode    = DEVPTS_DEFAULT_MODE;
> > +       bool setptmxid = false;
> > +
> > +       opts->setuid   = 0;
> > +       opts->setgid   = 0;
> > +       opts->uid      = GLOBAL_ROOT_UID;
> > +       opts->gid      = GLOBAL_ROOT_GID;
> > +       opts->ptmx_uid = 0;
> > +       opts->ptmx_gid = 0;
> > +       opts->mode     = DEVPTS_DEFAULT_MODE;
> >         opts->ptmxmode = DEVPTS_DEFAULT_PTMX_MODE;
> > -       opts->max     = NR_UNIX98_PTY_MAX;
> > +       opts->max      = NR_UNIX98_PTY_MAX;
> >
> >         /* newinstance makes sense only on initial mount */
> >         if (op == PARSE_MOUNT)
> > @@ -209,6 +216,22 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)
> >                         opts->mode = option & S_IALLUGO;
> >                         break;
> >  #ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
> > +               case Opt_ptmx_uid:
> > +                       if (match_int(&args[0], &option))
> > +                               return -EINVAL;
> > +                       if (op != PARSE_MOUNT)
> > +                               return -EINVAL;
> > +                       opts->ptmx_uid = option;
> > +                       setptmxid = true;
> > +                       break;
> > +               case Opt_ptmx_gid:
> > +                       if (match_int(&args[0], &option))
> > +                               return -EINVAL;
> > +                       if (op != PARSE_MOUNT)
> > +                               return -EINVAL;
> > +                       opts->ptmx_gid = option;
> > +                       setptmxid = true;
> > +                       break;
> >                 case Opt_ptmxmode:
> >                         if (match_octal(&args[0], &option))
> >                                 return -EINVAL;
> > @@ -232,6 +255,9 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)
> >                 }
> >         }
> >
> > +       if (setptmxid && !opts->newinstance)
> > +               return -EINVAL;
> > +
> >         return 0;
> >  }
> >
> > @@ -245,12 +271,12 @@ static int mknod_ptmx(struct super_block *sb)
> >         struct dentry *root = sb->s_root;
> >         struct pts_fs_info *fsi = DEVPTS_SB(sb);
> >         struct pts_mount_opts *opts = &fsi->mount_opts;
> > -       kuid_t root_uid;
> > -       kgid_t root_gid;
> > +       kuid_t ptmx_uid;
> > +       kgid_t ptmx_gid;
> >
> > -       root_uid = make_kuid(current_user_ns(), 0);
> > -       root_gid = make_kgid(current_user_ns(), 0);
> > -       if (!uid_valid(root_uid) || !gid_valid(root_gid))
> > +       ptmx_uid = make_kuid(current_user_ns(), opts->ptmx_uid);
> > +       ptmx_gid = make_kgid(current_user_ns(), opts->ptmx_gid);
> > +       if (!uid_valid(ptmx_uid) || !gid_valid(ptmx_gid))
> >                 return -EINVAL;
> >
> >         mutex_lock(&root->d_inode->i_mutex);
> > @@ -282,8 +308,8 @@ static int mknod_ptmx(struct super_block *sb)
> >
> >         mode = S_IFCHR|opts->ptmxmode;
> >         init_special_inode(inode, mode, MKDEV(TTYAUX_MAJOR, 2));
> > -       inode->i_uid = root_uid;
> > -       inode->i_gid = root_gid;
> > +       inode->i_uid = ptmx_uid;
> > +       inode->i_gid = ptmx_gid;
> >
> >         d_add(dentry, inode);
> >
> > --
> > 2.3.0
> >
> 
> 
> 



--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander Larsson March 31, 2015, 7:57 a.m. UTC | #3
On fre, 2015-03-27 at 10:03 +0100, James Bottomley wrote:
> 
> > On Fri, Feb 20, 2015 at 5:04 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> > > It's currently impossible to mount devpts in a user namespace that
> > > has no root user, since ptmx can't be created.
> 
> This is where I stopped reading because it's not true ... because it is
> possible, you just do it from the host as real root.

The point is being able to set up a container as a user, not requiring
the setup to be run as root at all. In my case the container is a desktop
application which will be started by the user, and will run as the user.
There is no root involved in the call chain at all.
Andy Lutomirski March 31, 2015, 1:06 p.m. UTC | #4
On Tue, Mar 31, 2015 at 12:57 AM, Alexander Larsson <alexl@redhat.com> wrote:
> On fre, 2015-03-27 at 10:03 +0100, James Bottomley
>>
>> > On Fri, Feb 20, 2015 at 5:04 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>> > > It's currently impossible to mount devpts in a user namespace that
>> > > has no root user, since ptmx can't be created.
>>
>> This is where I stopped reading because it's not true ... because it is
>> possible, you just do it from the host as real root.
>
> The point is being able to set up a container as a user, not requiring
> the setup to be run as root at all. In my case container is a desktop
> application which will be started by the user, and will run as the user.
> There is no root involved in the call chain at all.

Even more precisely, I think the status quo has less to do with the
privilege of the userns' creator and more to do with the configuration
of the userns.  Anyone (root or otherwise) can create a userns with or
without a "root" inside.  With the current code, if you create a
userns with no inner root, you can't mount devpts inside.

This gives no security benefit exactly because nonroot users *can*
create user namespaces that contain a root user.  In the use case for
this patch, the user doesn't want to do that and then gets stuck with
the current code.


--Andy


>
> --
> =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
>  Alexander Larsson                                            Red Hat, Inc
>        alexl@redhat.com            alexander.larsson@gmail.com
> He's a war-weary Catholic boxer for the 21st century. She's a beautiful
> renegade magician's assistant looking for love in all the wrong places.
> They fight crime!
>
James Bottomley March 31, 2015, 1:07 p.m. UTC | #5
On Tue, 2015-03-31 at 09:57 +0200, Alexander Larsson wrote:
> On fre, 2015-03-27 at 10:03 +0100, James Bottomley 
> > 
> > > On Fri, Feb 20, 2015 at 5:04 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> > > > It's currently impossible to mount devpts in a user namespace that
> > > > has no root user, since ptmx can't be created.
> > 
> > This is where I stopped reading because it's not true ... because it is
> > possible, you just do it from the host as real root.
> 
> The point is being able to set up a container as a user, not requiring
> the setup to be run as root at all. In my case container is a desktop
> application which will be started by the user, and will run as the user.
> There is no root involved in the call chain at all.

I don't really like that use case:  Most container setups are under the
control of an orchestration system (like LXC, OpenVZ or even Docker).
You typically get the orchestration system to do the dangerous
operations (mount being one of the bigger dangers) because it has the
capacity to vet them.  I can see the value in allowing a user to set up
a container without an oversight system, but at the same time you're
increasing the security vulnerability of the system.  Security is often
a result of policy, so now this embeds policy into the kernel.  I
strongly feel we should define the list of things we expect an
unsupervised (as in with no orchestration system) container to do and
then revisit this once we've given it some thought.

James


--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander Larsson March 31, 2015, 1:11 p.m. UTC | #6
On tis, 2015-03-31 at 16:07 +0300, James Bottomley wrote:
> On Tue, 2015-03-31 at 09:57 +0200, Alexander Larsson wrote:
> > On fre, 2015-03-27 at 10:03 +0100, James Bottomley 
> > > 
> > > > On Fri, Feb 20, 2015 at 5:04 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> > > > > It's currently impossible to mount devpts in a user namespace that
> > > > > has no root user, since ptmx can't be created.
> > > 
> > > This is where I stopped reading because it's not true ... because it is
> > > possible, you just do it from the host as real root.
> > 
> > The point is being able to set up a container as a user, not requiring
> > the setup to be run as root at all. In my case container is a desktop
> > application which will be started by the user, and will run as the user.
> > There is no root involved in the call chain at all.
> 
> I don't really like that use case:  Most container setups are under the
> control of an orchestration system (like LXC, OpenVZ or even Docker).

Well, I'm doing something different from a server side orchestration
framework. I'm doing sandboxed desktop apps.
Andy Lutomirski March 31, 2015, 1:12 p.m. UTC | #7
On Tue, Mar 31, 2015 at 6:07 AM, James Bottomley
<James.Bottomley@hansenpartnership.com> wrote:
> On Tue, 2015-03-31 at 09:57 +0200, Alexander Larsson wrote:
>> On fre, 2015-03-27 at 10:03 +0100, James Bottomley
>> >
>> > > On Fri, Feb 20, 2015 at 5:04 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>> > > > It's currently impossible to mount devpts in a user namespace that
>> > > > has no root user, since ptmx can't be created.
>> >
>> > This is where I stopped reading because it's not true ... because it is
>> > possible, you just do it from the host as real root.
>>
>> The point is being able to set up a container as a user, not requiring
>> the setup to be run as root at all. In my case container is a desktop
>> application which will be started by the user, and will run as the user.
>> There is no root involved in the call chain at all.
>
> I don't really like that use case:  Most container setups are under the
> control of an orchestration system (like LXC, OpenVZ or even Docker).
> You typically get the orchestration system to do the dangerous
> operations (mount being one of the bigger dangers) because it has the
> capacity to vet them.  I can see the value in allowing a user to set up
> a container without an oversight system, but at the same time you're
> increasing the security vulnerability of the system.  Security is often
> a result of policy, so now this embeds policy into the kernel.  I
> strongly feel we should define the list of things we expect an
> unsupervised (as in with no orchestration system) container to do and
> then revisit this once we've given it some thought.

Try thinking "sandbox", not "container".  The ability to create
sandboxes without some root-installed orchestration is incredibly
valuable.

In any event, this ship sailed quite a while ago.  devpts is one of the
smallish number of important missing features.

--Andy
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
James Bottomley March 31, 2015, 1:23 p.m. UTC | #8
On Tue, 2015-03-31 at 06:12 -0700, Andy Lutomirski wrote:
> On Tue, Mar 31, 2015 at 6:07 AM, James Bottomley
> <James.Bottomley@hansenpartnership.com> wrote:
> > On Tue, 2015-03-31 at 09:57 +0200, Alexander Larsson wrote:
> >> On fre, 2015-03-27 at 10:03 +0100, James Bottomley
> >> >
> >> > > On Fri, Feb 20, 2015 at 5:04 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> >> > > > It's currently impossible to mount devpts in a user namespace that
> >> > > > has no root user, since ptmx can't be created.
> >> >
> >> > This is where I stopped reading because it's not true ... because it is
> >> > possible, you just do it from the host as real root.
> >>
> >> The point is being able to set up a container as a user, not requiring
> >> the setup to be run as root at all. In my case container is a desktop
> >> application which will be started by the user, and will run as the user.
> >> There is no root involved in the call chain at all.
> >
> > I don't really like that use case:  Most container setups are under the
> > control of an orchestration system (like LXC, OpenVZ or even Docker).
> > You typically get the orchestration system to do the dangerous
> > operations (mount being one of the bigger dangers) because it has the
> > capacity to vet them.  I can see the value in allowing a user to set up
> > a container without an oversight system, but at the same time you're
> > increasing the security vulnerability of the system.  Security is often
> > a result of policy, so now this embeds policy into the kernel.  I
> > strongly feel we should define the list of things we expect an
> > unsupervised (as in with no orchestration system) container to do and
> > then revisit this once we've given it some thought.
> 
> Try thinking "sandbox", not "container".  The ability to create
> sandboxes without some root-installed orchestration is incredibly
> valuable.

A container is anything that uses the various container APIs (mostly
cgroups and namespaces), so the set of possible containers overlaps the
set of possible sandboxes.

> In any event, this ship sailed quite awhile ago.  devpts is one of the
> smallish number of important missing features.

I'm not saying "don't do it"; I'm saying think carefully about the
allowable features we permit an unprivileged user to take advantage of.
This one feels strange to me in that you're asking to give an
unprivileged user in a container more abilities than an unprivileged
user outside a container (a non-root user can't mount /dev/ptmx today).
This would mean that every unprivileged container user can now interfere
with the tty subsystem.

James


--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andy Lutomirski March 31, 2015, 1:44 p.m. UTC | #9
On Tue, Mar 31, 2015 at 6:23 AM, James Bottomley
<James.Bottomley@hansenpartnership.com> wrote:
> On Tue, 2015-03-31 at 06:12 -0700, Andy Lutomirski wrote:
>> On Tue, Mar 31, 2015 at 6:07 AM, James Bottomley
>> <James.Bottomley@hansenpartnership.com> wrote:
>> > On Tue, 2015-03-31 at 09:57 +0200, Alexander Larsson wrote:
>> >> On fre, 2015-03-27 at 10:03 +0100, James Bottomley
>> >> >
>> >> > > On Fri, Feb 20, 2015 at 5:04 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>> >> > > > It's currently impossible to mount devpts in a user namespace that
>> >> > > > has no root user, since ptmx can't be created.
>> >> >
>> >> > This is where I stopped reading because it's not true ... because it is
>> >> > possible, you just do it from the host as real root.
>> >>
>> >> The point is being able to set up a container as a user, not requiring
>> >> the setup to be run as root at all. In my case container is a desktop
>> >> application which will be started by the user, and will run as the user.
>> >> There is no root involved in the call chain at all.
>> >
>> > I don't really like that use case:  Most container setups are under the
>> > control of an orchestration system (like LXC, OpenVZ or even Docker).
>> > You typically get the orchestration system to do the dangerous
>> > operations (mount being one of the bigger dangers) because it has the
>> > capacity to vet them.  I can see the value in allowing a user to set up
>> > a container without an oversight system, but at the same time you're
>> > increasing the security vulnerability of the system.  Security is often
>> > a result of policy, so now this embeds policy into the kernel.  I
>> > strongly feel we should define the list of things we expect an
>> > unsupervised (as in with no orchestration system) container to do and
>> > then revisit this once we've given it some thought.
>>
>> Try thinking "sandbox", not "container".  The ability to create
>> sandboxes without some root-installed orchestration is incredibly
>> valuable.
>
> A container is anything that uses the various container APIs (mostly
> cgroups and namespaces), so the set of possible containers overlaps the
> set of possible sandboxes.
>
>> In any event, this ship sailed quite awhile ago.  devpts is one of the
>> smallish number of important missing features.
>
> I'm not saying "don't do it" I'm saying think carefully about the
> allowable features we permit an unprivileged user to take advantage of.
> This one feels strange to me in that you're asking to give an
> unprivileged user in a container more abilities than an unprivileged
> user outside a container (a non-root user can't mount /dev/ptmx today).
> This would mean that every unprivileged container user can now interfere
> with the tty subsystem.

That is true, but this is already the case.  The current code is:

        root_uid = make_kuid(current_user_ns(), 0);
        root_gid = make_kgid(current_user_ns(), 0);

Unprivileged tasks can make a userns and map themselves as "0" inside,
at which point the code I quoted will work fine.  The failure only
happens if they opt not to map anything at all as "0", as many
sandboxes will do.

--Andy
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
James Bottomley March 31, 2015, 1:55 p.m. UTC | #10
On Tue, 2015-03-31 at 06:44 -0700, Andy Lutomirski wrote:
> On Tue, Mar 31, 2015 at 6:23 AM, James Bottomley
> <James.Bottomley@hansenpartnership.com> wrote:
> > On Tue, 2015-03-31 at 06:12 -0700, Andy Lutomirski wrote:
> >> On Tue, Mar 31, 2015 at 6:07 AM, James Bottomley
> >> <James.Bottomley@hansenpartnership.com> wrote:
> >> > On Tue, 2015-03-31 at 09:57 +0200, Alexander Larsson wrote:
> >> >> On fre, 2015-03-27 at 10:03 +0100, James Bottomley
> >> >> >
> >> >> > > On Fri, Feb 20, 2015 at 5:04 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> >> >> > > > It's currently impossible to mount devpts in a user namespace that
> >> >> > > > has no root user, since ptmx can't be created.
> >> >> >
> >> >> > This is where I stopped reading because it's not true ... because it is
> >> >> > possible, you just do it from the host as real root.
> >> >>
> >> >> The point is being able to set up a container as a user, not requiring
> >> >> the setup to be run as root at all. In my case container is a desktop
> >> >> application which will be started by the user, and will run as the user.
> >> >> There is no root involved in the call chain at all.
> >> >
> >> > I don't really like that use case:  Most container setups are under the
> >> > control of an orchestration system (like LXC, OpenVZ or even Docker).
> >> > You typically get the orchestration system to do the dangerous
> >> > operations (mount being one of the bigger dangers) because it has the
> >> > capacity to vet them.  I can see the value in allowing a user to set up
> >> > a container without an oversight system, but at the same time you're
> >> > increasing the security vulnerability of the system.  Security is often
> >> > a result of policy, so now this embeds policy into the kernel.  I
> >> > strongly feel we should define the list of things we expect an
> >> > unsupervised (as in with no orchestration system) container to do and
> >> > then revisit this once we've given it some thought.
> >>
> >> Try thinking "sandbox", not "container".  The ability to create
> >> sandboxes without some root-installed orchestration is incredibly
> >> valuable.
> >
> > A container is anything that uses the various container APIs (mostly
> > cgroups and namespaces), so the set of possible containers overlaps the
> > set of possible sandboxes.
> >
> >> In any event, this ship sailed quite awhile ago.  devpts is one of the
> >> smallish number of important missing features.
> >
> > I'm not saying "don't do it" I'm saying think carefully about the
> > allowable features we permit an unprivileged user to take advantage of.
> > This one feels strange to me in that you're asking to give an
> > unprivileged user in a container more abilities than an unprivileged
> > user outside a container (a non-root user can't mount /dev/ptmx today).
> > This would mean that every unprivileged container user can now interfere
> > with the tty subsystem.
> 
> That is true, but this is already the case.  The current code is:
> 
>         root_uid = make_kuid(current_user_ns(), 0);
>         root_gid = make_kgid(current_user_ns(), 0);
> 
> Unprivileged tasks can make a userns and map themselves as "0" inside,
> at which point the code I quoted will work fine.  The failure only
> happens if they opt not to map anything at all as "0", as many
> sandboxes will do.

Yes, I know.  However remember we use containers to host VPSs which
themselves can have non-root users.  I don't want a non root user inside
the VPS to be able to muck with the tty subsystem.  Your patch allows
that.  It will effectively relax security of a VPS container which is
highly undesirable.  We need the security of an operating system
container to be the same as it would be for an unvirtualized operating
system otherwise people get nasty surprises.

The fact that container root can mount /dev/ptmx is fine to me, because
container root is a privileged user inside the container.  There's still
no way, short of a privilege escalation, that a non-root container user
can become container root.

James


--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andy Lutomirski March 31, 2015, 1:59 p.m. UTC | #11
On Tue, Mar 31, 2015 at 6:55 AM, James Bottomley
<James.Bottomley@hansenpartnership.com> wrote:
> On Tue, 2015-03-31 at 06:44 -0700, Andy Lutomirski wrote:
>> On Tue, Mar 31, 2015 at 6:23 AM, James Bottomley
>> <James.Bottomley@hansenpartnership.com> wrote:
>> > On Tue, 2015-03-31 at 06:12 -0700, Andy Lutomirski wrote:
>> >> On Tue, Mar 31, 2015 at 6:07 AM, James Bottomley
>> >> <James.Bottomley@hansenpartnership.com> wrote:
>> >> > On Tue, 2015-03-31 at 09:57 +0200, Alexander Larsson wrote:
>> >> >> On fre, 2015-03-27 at 10:03 +0100, James Bottomley
>> >> >> >
>> >> >> > > On Fri, Feb 20, 2015 at 5:04 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>> >> >> > > > It's currently impossible to mount devpts in a user namespace that
>> >> >> > > > has no root user, since ptmx can't be created.
>> >> >> >
>> >> >> > This is where I stopped reading because it's not true ... because it is
>> >> >> > possible, you just do it from the host as real root.
>> >> >>
>> >> >> The point is being able to set up a container as a user, not requiring
>> >> >> the setup to be run as root at all. In my case container is a desktop
>> >> >> application which will be started by the user, and will run as the user.
>> >> >> There is no root involved in the call chain at all.
>> >> >
>> >> > I don't really like that use case:  Most container setups are under the
>> >> > control of an orchestration system (like LXC, OpenVZ or even Docker).
>> >> > You typically get the orchestration system to do the dangerous
>> >> > operations (mount being one of the bigger dangers) because it has the
>> >> > capacity to vet them.  I can see the value in allowing a user to set up
>> >> > a container without an oversight system, but at the same time you're
>> >> > increasing the security vulnerability of the system.  Security is often
>> >> > a result of policy, so now this embeds policy into the kernel.  I
>> >> > strongly feel we should define the list of things we expect an
>> >> > unsupervised (as in with no orchestration system) container to do and
>> >> > then revisit this once we've given it some thought.
>> >>
>> >> Try thinking "sandbox", not "container".  The ability to create
>> >> sandboxes without some root-installed orchestration is incredibly
>> >> valuable.
>> >
>> > A container is anything that uses the various container APIs (mostly
>> > cgroups and namespaces), so the set of possible containers overlaps the
>> > set of possible sandboxes.
>> >
>> >> In any event, this ship sailed quite awhile ago.  devpts is one of the
>> >> smallish number of important missing features.
>> >
>> > I'm not saying "don't do it" I'm saying think carefully about the
>> > allowable features we permit an unprivileged user to take advantage of.
>> > This one feels strange to me in that you're asking to give an
>> > unprivileged user in a container more abilities than an unprivileged
>> > user outside a container (a non-root user can't mount /dev/ptmx today).
>> > This would mean that every unprivileged container user can now interfere
>> > with the tty subsystem.
>>
>> That is true, but this is already the case.  The current code is:
>>
>>         root_uid = make_kuid(current_user_ns(), 0);
>>         root_gid = make_kgid(current_user_ns(), 0);
>>
>> Unprivileged tasks can make a userns and map themselves as "0" inside,
>> at which point the code I quoted will work fine.  The failure only
>> happens if they opt not to map anything at all as "0", as many
>> sandboxes will do.
>
> Yes, I know.  However remember we use containers to host VPSs which
> themselves can have non-root users.  I don't want a non root user inside
> the VPS to be able to muck with the tty subsystem.  Your patch allows
> that.

I don't think that this is correct.  That user can already create a
nested userns and map themselves as 0 inside it.  Then they can mount
devpts.

--Andy

>  It will effectively relax security of a VPS container which is
> highly undesirable.  We need the security of an operating system
> container to be the same as it would be for an unvirtualized operating
> system otherwise people get nasty surprises.
>
> The fact that container root can mount /dev/ptmx is fine to me, because
> container root is a privileged user inside the container.  There's still
> no way, short of a privilege escalation, than a non-root container user
> can become container root.
>
> James
>
>
James Bottomley March 31, 2015, 2:08 p.m. UTC | #12
On Tue, 2015-03-31 at 06:59 -0700, Andy Lutomirski wrote:
> On Tue, Mar 31, 2015 at 6:55 AM, James Bottomley
> <James.Bottomley@hansenpartnership.com> wrote:
> > On Tue, 2015-03-31 at 06:44 -0700, Andy Lutomirski wrote:
> >> On Tue, Mar 31, 2015 at 6:23 AM, James Bottomley
> >> <James.Bottomley@hansenpartnership.com> wrote:
> >> > On Tue, 2015-03-31 at 06:12 -0700, Andy Lutomirski wrote:
> >> >> On Tue, Mar 31, 2015 at 6:07 AM, James Bottomley
> >> >> <James.Bottomley@hansenpartnership.com> wrote:
> >> >> > On Tue, 2015-03-31 at 09:57 +0200, Alexander Larsson wrote:
> >> >> >> On fre, 2015-03-27 at 10:03 +0100, James Bottomley
> >> >> >> >
> >> >> >> > > On Fri, Feb 20, 2015 at 5:04 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> >> >> >> > > > It's currently impossible to mount devpts in a user namespace that
> >> >> >> > > > has no root user, since ptmx can't be created.
> >> >> >> >
> >> >> >> > This is where I stopped reading because it's not true ... because it is
> >> >> >> > possible, you just do it from the host as real root.
> >> >> >>
> >> >> >> The point is being able to set up a container as a user, not requiring
> >> >> >> the setup to be run as root at all. In my case container is a desktop
> >> >> >> application which will be started by the user, and will run as the user.
> >> >> >> There is no root involved in the call chain at all.
> >> >> >
> >> >> > I don't really like that use case:  Most container setups are under the
> >> >> > control of an orchestration system (like LXC, OpenVZ or even Docker).
> >> >> > You typically get the orchestration system to do the dangerous
> >> >> > operations (mount being one of the bigger dangers) because it has the
> >> >> > capacity to vet them.  I can see the value in allowing a user to set up
> >> >> > a container without an oversight system, but at the same time you're
> >> >> > increasing the security vulnerability of the system.  Security is often
> >> >> > a result of policy, so now this embeds policy into the kernel.  I
> >> >> > strongly feel we should define the list of things we expect an
> >> >> > unsupervised (as in with no orchestration system) container to do and
> >> >> > then revisit this once we've given it some thought.
> >> >>
> >> >> Try thinking "sandbox", not "container".  The ability to create
> >> >> sandboxes without some root-installed orchestration is incredibly
> >> >> valuable.
> >> >
> >> > A container is anything that uses the various container APIs (mostly
> >> > cgroups and namespaces), so the set of possible containers overlaps the
> >> > set of possible sandboxes.
> >> >
> >> >> In any event, this ship sailed quite awhile ago.  devpts is one of the
> >> >> smallish number of important missing features.
> >> >
> >> > I'm not saying "don't do it" I'm saying think carefully about the
> >> > allowable features we permit an unprivileged user to take advantage of.
> >> > This one feels strange to me in that you're asking to give an
> >> > unprivileged user in a container more abilities than an unprivileged
> >> > user outside a container (a non-root user can't mount /dev/ptmx today).
> >> > This would mean that every unprivileged container user can now interfere
> >> > with the tty subsystem.
> >>
> >> That is true, but this is already the case.  The current code is:
> >>
> >>         root_uid = make_kuid(current_user_ns(), 0);
> >>         root_gid = make_kgid(current_user_ns(), 0);
> >>
> >> Unprivileged tasks can make a userns and map themselves as "0" inside,
> >> at which point the code I quoted will work fine.  The failure only
> >> happens if they opt not to map anything at all as "0", as many
> >> sandboxes will do.
> >
> > Yes, I know.  However remember we use containers to host VPSs which
> > themselves can have non-root users.  I don't want a non root user inside
> > the VPS to be able to muck with the tty subsystem.  Your patch allows
> > that.
> 
> I don't think that this is correct.  That user can already create a
> nested userns and map themselves as 0 inside it.  Then they can mount
> devpts.

I don't mind if they create a container and control the isolated ttys in
that sub container in the VPS; that's fine.  I do mind if they get
access to the ttys in the VPS.

If you can convince me (and the rest of Linux) that the tty subsystem
should be mountable by an unprivileged user generally, then what you
propose is OK.

James


--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander Larsson March 31, 2015, 2:17 p.m. UTC | #13
On tis, 2015-03-31 at 17:08 +0300, James Bottomley wrote:
> On Tue, 2015-03-31 at 06:59 -0700, Andy Lutomirski wrote:
> > 
> > I don't think that this is correct.  That user can already create a
> > nested userns and map themselves as 0 inside it.  Then they can mount
> > devpts.
> 
> I don't mind if they create a container and control the isolated ttys in
> that sub container in the VPS; that's fine.  I do mind if they get
> access to the ttys in the VPS.
> 
> If you can convince me (and the rest of Linux) that the tty subsystem
> should be mountable by an unprivileged user generally, then what you
> propose is OK.

That is controlled by the general rights to mount stuff. I.e. unless you
have CAP_SYS_ADMIN in the VPS container you will not be able to mount
devpts there. You can only do it in a subcontainer where you got
permissions to mount via using user namespaces.
James Bottomley April 2, 2015, 10:12 a.m. UTC | #14
On Tue, 2015-03-31 at 16:17 +0200, Alexander Larsson wrote:
> On tis, 2015-03-31 at 17:08 +0300, James Bottomley wrote:
> > On Tue, 2015-03-31 at 06:59 -0700, Andy Lutomirski wrote:
> > > 
> > > I don't think that this is correct.  That user can already create a
> > > nested userns and map themselves as 0 inside it.  Then they can mount
> > > devpts.
> > 
> > I don't mind if they create a container and control the isolated ttys in
> > that sub container in the VPS; that's fine.  I do mind if they get
> > access to the ttys in the VPS.
> > 
> > If you can convince me (and the rest of Linux) that the tty subsystem
> > should be mountable by an unprivileged user generally, then what you
> > propose is OK.
> 
> That is controlled by the general rights to mount stuff. I.e. unless you
> have CAP_SYS_ADMIN in the VPS container you will not be able to mount
> devpts there. You can only do it in a subcontainer where you got
> permissions to mount via using user namespaces.

OK let me try again.  Fine, if you want to speak capabilities, you've
given a non-root user an unexpected capability (the capability of
creating a ptmx device).  But you haven't used a capability separation
to do this, you've just hard coded it via a mount parameter mechanism.

If you want to do this thing, do it properly, so it's acceptable to the
whole of Linux, not a special corner case for one particular type of
container.

Security breaches are created when people code in special, little used,
corner cases because they don't get as thoroughly tested and inspected
as generally applicable mechanisms.

What you want is to be able to use the tty subsystem as a non root user:
fine, but set that up globally, don't hide it in containers so a lot
fewer people care.

James


--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andy Lutomirski April 2, 2015, 2:06 p.m. UTC | #15
On Thu, Apr 2, 2015 at 3:12 AM, James Bottomley
<James.Bottomley@hansenpartnership.com> wrote:
> On Tue, 2015-03-31 at 16:17 +0200, Alexander Larsson wrote:
>> On tis, 2015-03-31 at 17:08 +0300, James Bottomley wrote:
>> > On Tue, 2015-03-31 at 06:59 -0700, Andy Lutomirski wrote:
>> > >
>> > > I don't think that this is correct.  That user can already create a
>> > > nested userns and map themselves as 0 inside it.  Then they can mount
>> > > devpts.
>> >
>> > I don't mind if they create a container and control the isolated ttys in
>> > that sub container in the VPS; that's fine.  I do mind if they get
>> > access to the ttys in the VPS.
>> >
>> > If you can convince me (and the rest of Linux) that the tty subsystem
>> > should be mountable by an unprivileged user generally, then what you
>> > propose is OK.
>>
>> That is controlled by the general rights to mount stuff. I.e. unless you
>> have CAP_SYS_ADMIN in the VPS container you will not be able to mount
>> devpts there. You can only do it in a subcontainer where you got
>> permissions to mount via using user namespaces.
>
> OK let me try again.  Fine, if you want to speak capabilities, you've
> given a non-root user an unexpected capability (the capability of
> creating a ptmx device).  But you haven't used a capability separation
> to do this, you've just hard coded it via a mount parameter mechanism.
>
> If you want to do this thing, do it properly, so it's acceptable to the
> whole of Linux, not a special corner case for one particular type of
> container.
>
> Security breaches are created when people code in special, little used,
> corner cases because they don't get as thoroughly tested and inspected
> as generally applicable mechanisms.
>
> What you want is to be able to use the tty subsystem as a non root user:
> fine, but set that up globally, don't hide it in containers so a lot
> fewer people care.

I tend to agree, and not just for the tty subsystem.  This is an
attack surface issue.  With unprivileged user namespaces, unprivileged
users can create mount namespaces (probably a good thing for bind
mounts, etc), network namespaces (reasonably safe by themselves),
network interfaces and iptables rules (scary), fresh
instances/superblocks of some filesystems (scariness depends on the fs
-- tmpfs is probably fine), and more.

I think we should have real controls for this, and this is mostly
Eric's domain.  Eric?  A silly issue that sometimes prevents devpts
from being mountable isn't a real control, though.

--Andy

>
> James
>
>
Alexander Larsson April 2, 2015, 2:29 p.m. UTC | #16
On Thu, 2015-04-02 at 07:06 -0700, Andy Lutomirski wrote:
> On Thu, Apr 2, 2015 at 3:12 AM, James Bottomley
> <James.Bottomley@hansenpartnership.com> wrote:
> > On Tue, 2015-03-31 at 16:17 +0200, Alexander Larsson wrote:
> >> On tis, 2015-03-31 at 17:08 +0300, James Bottomley wrote:
> >> > On Tue, 2015-03-31 at 06:59 -0700, Andy Lutomirski wrote:
> >> > >
> >> > > I don't think that this is correct.  That user can already create a
> >> > > nested userns and map themselves as 0 inside it.  Then they can mount
> >> > > devpts.
> >> >
> >> > I don't mind if they create a container and control the isolated ttys in
> >> > that sub container in the VPS; that's fine.  I do mind if they get
> >> > access to the ttys in the VPS.
> >> >
> >> > If you can convince me (and the rest of Linux) that the tty subsystem
> >> > should be mountable by an unprivileged user generally, then what you
> >> > propose is OK.
> >>
> >> That is controlled by the general rights to mount stuff. I.e. unless you
> >> have CAP_SYS_ADMIN in the VPS container you will not be able to mount
> >> devpts there. You can only do it in a subcontainer where you got
> >> permissions to mount via using user namespaces.
> >
> > OK let me try again.  Fine, if you want to speak capabilities, you've
> > given a non-root user an unexpected capability (the capability of
> > creating a ptmx device).  But you haven't used a capability separation
> > to do this, you've just hard coded it via a mount parameter mechanism.
> >
> > If you want to do this thing, do it properly, so it's acceptable to the
> > whole of Linux, not a special corner case for one particular type of
> > container.
> >
> > Security breaches are created when people code in special, little used,
> > corner cases because they don't get as thoroughly tested and inspected
> > as generally applicable mechanisms.
> >
> > What you want is to be able to use the tty subsystem as a non root user:
> > fine, but set that up globally, don't hide it in containers so a lot
> > fewer people care.
> 
> I tend to agree, and not just for the tty subsystem.  This is an
> attack surface issue.  With unprivileged user namespaces, unprivileged
> users can create mount namespaces (probably a good thing for bind
> mounts, etc), network namespaces (reasonably safe by themselves),
> network interfaces and iptables rules (scary), fresh
> instances/superblocks of some filesystems (scariness depends on the fs
> -- tmpfs is probably fine), and more.
> 
> I think we should have real controls for this, and this is mostly
> Eric's domain.  Eric?  A silly issue that sometimes prevents devpts
> from being mountable isn't a real control, though.

I'm honestly surprised that non-root is allowed to mount things in
general with user namespaces. This was long disabled for non-root use in
Fedora, but it is now enabled.

For instance, using loopback mounted files you could probably attack
some of the less well tested filesystem implementations by feeding them
fuzzed data.

Anyway, I don't see how this affects devpts though. If you're running in
a container (or uncontained), as a regular user with no mount
capabilities you can already mount a devpts filesystem if you create a
subcontainer with user namespaces and map your uid to 0 in the
subcontainer. Then you get a new ptmx device that you can do whatever
you want with. The mount option would let you do the same, except be
your regular uid in the subcontainer.

The only difference outside of the subcontainer arises if the outer
container has no uid 0 mapped, yet the user has CAP_SYS_ADMIN rights in
that container: then he can mount devpts in the outer container, where
before he could only mount it in an inner container.

--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andy Lutomirski April 2, 2015, 2:33 p.m. UTC | #17
On Thu, Apr 2, 2015 at 7:29 AM, Alexander Larsson <alexl@redhat.com> wrote:
> On Thu, 2015-04-02 at 07:06 -0700, Andy Lutomirski wrote:
>> On Thu, Apr 2, 2015 at 3:12 AM, James Bottomley
>> <James.Bottomley@hansenpartnership.com> wrote:
>> > On Tue, 2015-03-31 at 16:17 +0200, Alexander Larsson wrote:
>> >> On tis, 2015-03-31 at 17:08 +0300, James Bottomley wrote:
>> >> > On Tue, 2015-03-31 at 06:59 -0700, Andy Lutomirski wrote:
>> >> > >
>> >> > > I don't think that this is correct.  That user can already create a
>> >> > > nested userns and map themselves as 0 inside it.  Then they can mount
>> >> > > devpts.
>> >> >
>> >> > I don't mind if they create a container and control the isolated ttys in
>> >> > that sub container in the VPS; that's fine.  I do mind if they get
>> >> > access to the ttys in the VPS.
>> >> >
>> >> > If you can convince me (and the rest of Linux) that the tty subsystem
>> >> > should be mountable by an unprivileged user generally, then what you
>> >> > propose is OK.
>> >>
>> >> That is controlled by the general rights to mount stuff. I.e. unless you
>> >> have CAP_SYS_ADMIN in the VPS container you will not be able to mount
>> >> devpts there. You can only do it in a subcontainer where you got
>> >> permissions to mount via using user namespaces.
>> >
>> > OK let me try again.  Fine, if you want to speak capabilities, you've
>> > given a non-root user an unexpected capability (the capability of
>> > creating a ptmx device).  But you haven't used a capability separation
>> > to do this, you've just hard coded it via a mount parameter mechanism.
>> >
>> > If you want to do this thing, do it properly, so it's acceptable to the
>> > whole of Linux, not a special corner case for one particular type of
>> > container.
>> >
>> > Security breaches are created when people code in special, little used,
>> > corner cases because they don't get as thoroughly tested and inspected
>> > as generally applicable mechanisms.
>> >
>> > What you want is to be able to use the tty subsystem as a non root user:
>> > fine, but set that up globally, don't hide it in containers so a lot
>> > fewer people care.
>>
>> I tend to agree, and not just for the tty subsystem.  This is an
>> attack surface issue.  With unprivileged user namespaces, unprivileged
>> users can create mount namespaces (probably a good thing for bind
>> mounts, etc), network namespaces (reasonably safe by themselves),
>> network interfaces and iptables rules (scary), fresh
>> instances/superblocks of some filesystems (scariness depends on the fs
>> -- tmpfs is probably fine), and more.
>>
>> I think we should have real controls for this, and this is mostly
>> Eric's domain.  Eric?  A silly issue that sometimes prevents devpts
>> from being mountable isn't a real control, though.
>
> I'm honestly surprised that non-root is allowed to mount things in
> general with user namespaces. This was long disabled use for non-root in
> Fedora, but it is now enabled.
>
> For instance, using loopback mounted files you could probably attack
> some of the less well tested filesystem implementations by feeding them
> fuzzed data.
>

You actually can't do that right now.  Filesystems have to opt in to
being mounted in unprivileged user namespaces, and no filesystems with
backing stores have opted in.  devpts has, but it's buggy without this
patch IMO.

> Anyway, I don't see how this affects devpts though. If you're running in
> a container (or uncontained), as a regular users with no mount
> capabilities you can already mount a devpts filesystem if you create a
> subbcontainer with user namespaces and map your uid to 0 in the
> subcontainer. Then you get a new ptmx device that you can do whatever
> you want with. The mount option would let you do the same, except be
> your regular uid in the subcontainer.
>
> The only difference outside of the subcontainer is that if the outer
> container has no uid 0 mapped, yet the user has CAP_SYSADMIN rights in
> that container. Then he can mount devpts in the outer container where he
> before could only mount it in an inner container.
>

Agreed.  Also, devpts doesn't seem scary at all to me from a userns
perspective.  Regular users on normal systems can already use ptmx,
and AFAICS basically all of the attack surface is already available
through the normal /dev/ptmx node.

--Andy
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Serge E. Hallyn April 2, 2015, 3:49 p.m. UTC | #18
Quoting Andy Lutomirski (luto@amacapital.net):
> On Thu, Apr 2, 2015 at 7:29 AM, Alexander Larsson <alexl@redhat.com> wrote:
> > On Thu, 2015-04-02 at 07:06 -0700, Andy Lutomirski wrote:
> >> On Thu, Apr 2, 2015 at 3:12 AM, James Bottomley
> >> <James.Bottomley@hansenpartnership.com> wrote:
> >> > On Tue, 2015-03-31 at 16:17 +0200, Alexander Larsson wrote:
> >> >> On tis, 2015-03-31 at 17:08 +0300, James Bottomley wrote:
> >> >> > On Tue, 2015-03-31 at 06:59 -0700, Andy Lutomirski wrote:
> >> >> > >
> >> >> > > I don't think that this is correct.  That user can already create a
> >> >> > > nested userns and map themselves as 0 inside it.  Then they can mount
> >> >> > > devpts.
> >> >> >
> >> >> > I don't mind if they create a container and control the isolated ttys in
> >> >> > that sub container in the VPS; that's fine.  I do mind if they get
> >> >> > access to the ttys in the VPS.
> >> >> >
> >> >> > If you can convince me (and the rest of Linux) that the tty subsystem
> >> >> > should be mountable by an unprivileged user generally, then what you
> >> >> > propose is OK.
> >> >>
> >> >> That is controlled by the general rights to mount stuff. I.e. unless you
> >> >> have CAP_SYS_ADMIN in the VPS container you will not be able to mount
> >> >> devpts there. You can only do it in a subcontainer where you got
> >> >> permissions to mount via using user namespaces.
> >> >
> >> > OK let me try again.  Fine, if you want to speak capabilities, you've
> >> > given a non-root user an unexpected capability (the capability of
> >> > creating a ptmx device).  But you haven't used a capability separation
> >> > to do this, you've just hard coded it via a mount parameter mechanism.
> >> >
> >> > If you want to do this thing, do it properly, so it's acceptable to the
> >> > whole of Linux, not a special corner case for one particular type of
> >> > container.
> >> >
> >> > Security breaches are created when people code in special, little used,
> >> > corner cases because they don't get as thoroughly tested and inspected
> >> > as generally applicable mechanisms.
> >> >
> >> > What you want is to be able to use the tty subsystem as a non root user:
> >> > fine, but set that up globally, don't hide it in containers so a lot
> >> > fewer people care.
> >>
> >> I tend to agree, and not just for the tty subsystem.  This is an
> >> attack surface issue.  With unprivileged user namespaces, unprivileged
> >> users can create mount namespaces (probably a good thing for bind
> >> mounts, etc), network namespaces (reasonably safe by themselves),
> >> network interfaces and iptables rules (scary), fresh
> >> instances/superblocks of some filesystems (scariness depends on the fs
> >> -- tmpfs is probably fine), and more.
> >>
> >> I think we should have real controls for this, and this is mostly
> >> Eric's domain.  Eric?  A silly issue that sometimes prevents devpts
> >> from being mountable isn't a real control, though.
> >
> > I'm honestly surprised that non-root is allowed to mount things in
> > general with user namespaces. This was long disabled use for non-root in
> > Fedora, but it is now enabled.
> >
> > For instance, using loopback mounted files you could probably attack
> > some of the less well tested filesystem implementations by feeding them
> > fuzzed data.
> >
> 
> You actually can't do that right now.  Filesystems have to opt in to
> being mounted in unprivileged user namespaces, and no filesystems with
> backing stores have opted in.  devpts has, but it's buggy without this
> patch IMO.
> 
> > Anyway, I don't see how this affects devpts though. If you're running in
> > a container (or uncontained), as a regular users with no mount
> > capabilities you can already mount a devpts filesystem if you create a
> > subbcontainer with user namespaces and map your uid to 0 in the
> > subcontainer. Then you get a new ptmx device that you can do whatever
> > you want with. The mount option would let you do the same, except be
> > your regular uid in the subcontainer.
> >
> > The only difference outside of the subcontainer is that if the outer
> > container has no uid 0 mapped, yet the user has CAP_SYSADMIN rights in
> > that container. Then he can mount devpts in the outer container where he
> > before could only mount it in an inner container.
> >
> 
> Agreed.  Also, devpts doesn't seem scary at all to me from a userns
> perspective.  Regular users on normal systems can already use ptmx,
> and AFAICS basically all of the attack surface is already available
> through the normal /dev/ptmx node.

I've been ignoring this thread bc I was pretty sure I had acked the
original patch.  If you don't have a record of that (or I'm plain wrong
and never did) please let me know.
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric W. Biederman April 2, 2015, 6:27 p.m. UTC | #19
Andy Lutomirski <luto@amacapital.net> writes:

> On Thu, Apr 2, 2015 at 7:29 AM, Alexander Larsson <alexl@redhat.com> wrote:
>> On Thu, 2015-04-02 at 07:06 -0700, Andy Lutomirski wrote:
>>> On Thu, Apr 2, 2015 at 3:12 AM, James Bottomley
>>> <James.Bottomley@hansenpartnership.com> wrote:
>>> > On Tue, 2015-03-31 at 16:17 +0200, Alexander Larsson wrote:
>>> >> On tis, 2015-03-31 at 17:08 +0300, James Bottomley wrote:
>>> >> > On Tue, 2015-03-31 at 06:59 -0700, Andy Lutomirski wrote:
>>> >> > >
>>> >> > > I don't think that this is correct.  That user can already create a
>>> >> > > nested userns and map themselves as 0 inside it.  Then they can mount
>>> >> > > devpts.
>>> >> >
>>> >> > I don't mind if they create a container and control the isolated ttys in
>>> >> > that sub container in the VPS; that's fine.  I do mind if they get
>>> >> > access to the ttys in the VPS.
>>> >> >
>>> >> > If you can convince me (and the rest of Linux) that the tty subsystem
>>> >> > should be mountable by an unprivileged user generally, then what you
>>> >> > propose is OK.
>>> >>
>>> >> That is controlled by the general rights to mount stuff. I.e. unless you
>>> >> have CAP_SYS_ADMIN in the VPS container you will not be able to mount
>>> >> devpts there. You can only do it in a subcontainer where you got
>>> >> permissions to mount via using user namespaces.
>>> >
>>> > OK let me try again.  Fine, if you want to speak capabilities, you've
>>> > given a non-root user an unexpected capability (the capability of
>>> > creating a ptmx device).  But you haven't used a capability separation
>>> > to do this, you've just hard coded it via a mount parameter mechanism.
>>> >
>>> > If you want to do this thing, do it properly, so it's acceptable to the
>>> > whole of Linux, not a special corner case for one particular type of
>>> > container.
>>> >
>>> > Security breaches are created when people code in special, little used,
>>> > corner cases because they don't get as thoroughly tested and inspected
>>> > as generally applicable mechanisms.
>>> >
>>> > What you want is to be able to use the tty subsystem as a non root user:
>>> > fine, but set that up globally, don't hide it in containers so a lot
>>> > fewer people care.
>>>
>>> I tend to agree, and not just for the tty subsystem.  This is an
>>> attack surface issue.  With unprivileged user namespaces, unprivileged
>>> users can create mount namespaces (probably a good thing for bind
>>> mounts, etc), network namespaces (reasonably safe by themselves),
>>> network interfaces and iptables rules (scary), fresh
>>> instances/superblocks of some filesystems (scariness depends on the fs
>>> -- tmpfs is probably fine), and more.
>>>
>>> I think we should have real controls for this, and this is mostly
>>> Eric's domain.  Eric?  A silly issue that sometimes prevents devpts
>>> from being mountable isn't a real control, though.

I thought the controls for limiting how much of the userspace API
an application could use were called seccomp and seccomp2.

Do we need something like a PAM module so that we can set up these
controls during login?

>> I'm honestly surprised that non-root is allowed to mount things in
>> general with user namespaces. This was long disabled use for non-root in
>> Fedora, but it is now enabled.
>>
>> For instance, using loopback mounted files you could probably attack
>> some of the less well tested filesystem implementations by feeding them
>> fuzzed data.
>>
>
> You actually can't do that right now.  Filesystems have to opt in to
> being mounted in unprivileged user namespaces, and no filesystems with
> backing stores have opted in.  devpts has, but it's buggy without this
> patch IMO.

Arguably you should use two user namespaces.  The first to do what you
want to as root the second to run as the uid you want to run as.

>> Anyway, I don't see how this affects devpts though. If you're running in
>> a container (or uncontained), as a regular users with no mount
>> capabilities you can already mount a devpts filesystem if you create a
>> subbcontainer with user namespaces and map your uid to 0 in the
>> subcontainer. Then you get a new ptmx device that you can do whatever
>> you want with. The mount option would let you do the same, except be
>> your regular uid in the subcontainer.
>>
>> The only difference outside of the subcontainer is that if the outer
>> container has no uid 0 mapped, yet the user has CAP_SYSADMIN rights in
>> that container. Then he can mount devpts in the outer container where he
>> before could only mount it in an inner container.
>>
>
> Agreed.  Also, devpts doesn't seem scary at all to me from a userns
> perspective.  Regular users on normal systems can already use ptmx,
> and AFAICS basically all of the attack surface is already available
> through the normal /dev/ptmx node.

My only real take is that there are a lot more places that you need to
tweak beyond devpts.  So this patch seemed lacking and boring.

Beyond that until I get the mount namespace sorted out things are pretty
much in a feature freeze because I can't multitask well enough to do
complicated patches and take feature patches.

Eric

--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander Larsson May 18, 2015, 9:04 p.m. UTC | #20
On tor, 2015-03-26 at 12:29 -0700, Andy Lutomirski wrote:
> Ping?  It's been over a month.

Ping again. I've tested this with 
https://github.com/alexlarsson/xdg-app/tree/wip/userns
and this is the final kernel change needed to allow desktop sandboxing
without any raised privileges (setuid etc).

So, 
Tested-by: alexl@redhat.com

And please, can we get some eyeballs on this, it really is very useful
(and very simple too).
Andy Lutomirski May 27, 2015, 9:32 p.m. UTC | #21
On Thu, Apr 2, 2015 at 11:27 AM, Eric W. Biederman
<ebiederm@xmission.com> wrote:
> Andy Lutomirski <luto@amacapital.net> writes:
>
>> On Thu, Apr 2, 2015 at 7:29 AM, Alexander Larsson <alexl@redhat.com> wrote:
>>> On Thu, 2015-04-02 at 07:06 -0700, Andy Lutomirski wrote:
>>>> On Thu, Apr 2, 2015 at 3:12 AM, James Bottomley
>>>> <James.Bottomley@hansenpartnership.com> wrote:
>>>> > On Tue, 2015-03-31 at 16:17 +0200, Alexander Larsson wrote:
>>>> >> On tis, 2015-03-31 at 17:08 +0300, James Bottomley wrote:
>>>> >> > On Tue, 2015-03-31 at 06:59 -0700, Andy Lutomirski wrote:
>>>> >> > >
>>>> >> > > I don't think that this is correct.  That user can already create a
>>>> >> > > nested userns and map themselves as 0 inside it.  Then they can mount
>>>> >> > > devpts.
>>>> >> >
>>>> >> > I don't mind if they create a container and control the isolated ttys in
>>>> >> > that sub container in the VPS; that's fine.  I do mind if they get
>>>> >> > access to the ttys in the VPS.
>>>> >> >
>>>> >> > If you can convince me (and the rest of Linux) that the tty subsystem
>>>> >> > should be mountable by an unprivileged user generally, then what you
>>>> >> > propose is OK.
>>>> >>
>>>> >> That is controlled by the general rights to mount stuff. I.e. unless you
>>>> >> have CAP_SYS_ADMIN in the VPS container you will not be able to mount
>>>> >> devpts there. You can only do it in a subcontainer where you got
>>>> >> permissions to mount via using user namespaces.
>>>> >
>>>> > OK let me try again.  Fine, if you want to speak capabilities, you've
>>>> > given a non-root user an unexpected capability (the capability of
>>>> > creating a ptmx device).  But you haven't used a capability separation
>>>> > to do this, you've just hard coded it via a mount parameter mechanism.
>>>> >
>>>> > If you want to do this thing, do it properly, so it's acceptable to the
>>>> > whole of Linux, not a special corner case for one particular type of
>>>> > container.
>>>> >
>>>> > Security breaches are created when people code in special, little used,
>>>> > corner cases because they don't get as thoroughly tested and inspected
>>>> > as generally applicable mechanisms.
>>>> >
>>>> > What you want is to be able to use the tty subsystem as a non root user:
>>>> > fine, but set that up globally, don't hide it in containers so a lot
>>>> > fewer people care.
>>>>
>>>> I tend to agree, and not just for the tty subsystem.  This is an
>>>> attack surface issue.  With unprivileged user namespaces, unprivileged
>>>> users can create mount namespaces (probably a good thing for bind
>>>> mounts, etc), network namespaces (reasonably safe by themselves),
>>>> network interfaces and iptables rules (scary), fresh
>>>> instances/superblocks of some filesystems (scariness depends on the fs
>>>> -- tmpfs is probably fine), and more.
>>>>
>>>> I think we should have real controls for this, and this is mostly
>>>> Eric's domain.  Eric?  A silly issue that sometimes prevents devpts
>>>> from being mountable isn't a real control, though.
>
> I thought the controls for limiting how much of the userspace API
> an application could use were called seccomp and seccomp2.
>
> Do we need something like a PAM module so that we can set up these
> controls during login?
>
>>> I'm honestly surprised that non-root is allowed to mount things in
>>> general with user namespaces. This was long disabled use for non-root in
>>> Fedora, but it is now enabled.
>>>
>>> For instance, using loopback mounted files you could probably attack
>>> some of the less well tested filesystem implementations by feeding them
>>> fuzzed data.
>>>
>>
>> You actually can't do that right now.  Filesystems have to opt in to
>> being mounted in unprivileged user namespaces, and no filesystems with
>> backing stores have opted in.  devpts has, but it's buggy without this
>> patch IMO.
>
> Arguably you should use two user namespaces.  The first to do what you
> want to as root the second to run as the uid you want to run as.
>
>>> Anyway, I don't see how this affects devpts though. If you're running in
>>> a container (or uncontained), as a regular users with no mount
>>> capabilities you can already mount a devpts filesystem if you create a
>>> subbcontainer with user namespaces and map your uid to 0 in the
>>> subcontainer. Then you get a new ptmx device that you can do whatever
>>> you want with. The mount option would let you do the same, except be
>>> your regular uid in the subcontainer.
>>>
>>> The only difference outside of the subcontainer is that if the outer
>>> container has no uid 0 mapped, yet the user has CAP_SYSADMIN rights in
>>> that container. Then he can mount devpts in the outer container where he
>>> before could only mount it in an inner container.
>>>
>>
>> Agreed.  Also, devpts doesn't seem scary at all to me from a userns
>> perspective.  Regular users on normal systems can already use ptmx,
>> and AFAICS basically all of the attack surface is already available
>> through the normal /dev/ptmx node.
>
> My only real take is that there are a lot more places that you need to
> tweak beyond devpts.  So this patch seemed lacking and boring.
>
> Beyond that until I get the mount namespace sorted out things are pretty
> much in a feature freeze because I can't multitask well enough to do
> complicated patches and take feature patches.
>

Eric, do you think you have time now to take a look at this patch?

--Andy
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric W. Biederman May 28, 2015, 4:44 p.m. UTC | #22
Andy Lutomirski <luto@amacapital.net> writes:

> On Thu, Apr 2, 2015 at 11:27 AM, Eric W. Biederman
> <ebiederm@xmission.com> wrote:
>> Andy Lutomirski <luto@amacapital.net> writes:
>>
>>> On Thu, Apr 2, 2015 at 7:29 AM, Alexander Larsson <alexl@redhat.com> wrote:
>>>> On Thu, 2015-04-02 at 07:06 -0700, Andy Lutomirski wrote:
>>>>> On Thu, Apr 2, 2015 at 3:12 AM, James Bottomley
>>>>> <James.Bottomley@hansenpartnership.com> wrote:
>>>>> > On Tue, 2015-03-31 at 16:17 +0200, Alexander Larsson wrote:
>>>>> >> On tis, 2015-03-31 at 17:08 +0300, James Bottomley wrote:
>>>>> >> > On Tue, 2015-03-31 at 06:59 -0700, Andy Lutomirski wrote:
>>>>> >> > >
>>>>> >> > > I don't think that this is correct.  That user can already create a
>>>>> >> > > nested userns and map themselves as 0 inside it.  Then they can mount
>>>>> >> > > devpts.
>>>>> >> >
>>>>> >> > I don't mind if they create a container and control the isolated ttys in
>>>>> >> > that sub container in the VPS; that's fine.  I do mind if they get
>>>>> >> > access to the ttys in the VPS.
>>>>> >> >
>>>>> >> > If you can convince me (and the rest of Linux) that the tty subsystem
>>>>> >> > should be mountable by an unprivileged user generally, then what you
>>>>> >> > propose is OK.
>>>>> >>
>>>>> >> That is controlled by the general rights to mount stuff. I.e. unless you
>>>>> >> have CAP_SYS_ADMIN in the VPS container you will not be able to mount
>>>>> >> devpts there. You can only do it in a subcontainer where you got
>>>>> >> permissions to mount via using user namespaces.
>>>>> >
>>>>> > OK let me try again.  Fine, if you want to speak capabilities, you've
>>>>> > given a non-root user an unexpected capability (the capability of
>>>>> > creating a ptmx device).  But you haven't used a capability separation
>>>>> > to do this, you've just hard coded it via a mount parameter mechanism.
>>>>> >
>>>>> > If you want to do this thing, do it properly, so it's acceptable to the
>>>>> > whole of Linux, not a special corner case for one particular type of
>>>>> > container.
>>>>> >
>>>>> > Security breaches are created when people code in special, little used,
>>>>> > corner cases because they don't get as thoroughly tested and inspected
>>>>> > as generally applicable mechanisms.
>>>>> >
>>>>> > What you want is to be able to use the tty subsystem as a non root user:
>>>>> > fine, but set that up globally, don't hide it in containers so a lot
>>>>> > fewer people care.
>>>>>
>>>>> I tend to agree, and not just for the tty subsystem.  This is an
>>>>> attack surface issue.  With unprivileged user namespaces, unprivileged
>>>>> users can create mount namespaces (probably a good thing for bind
>>>>> mounts, etc), network namespaces (reasonably safe by themselves),
>>>>> network interfaces and iptables rules (scary), fresh
>>>>> instances/superblocks of some filesystems (scariness depends on the fs
>>>>> -- tmpfs is probably fine), and more.
>>>>>
>>>>> I think we should have real controls for this, and this is mostly
>>>>> Eric's domain.  Eric?  A silly issue that sometimes prevents devpts
>>>>> from being mountable isn't a real control, though.
>>
>> I thought the controls for limiting how much of the userspace API
>> an application could use were called seccomp and seccomp2.
>>
>> Do we need something like a PAM module so that we can set up these
>> controls during login?
>>
>>>> I'm honestly surprised that non-root is allowed to mount things in
>>>> general with user namespaces. This was long disabled use for non-root in
>>>> Fedora, but it is now enabled.
>>>>
>>>> For instance, using loopback mounted files you could probably attack
>>>> some of the less well tested filesystem implementations by feeding them
>>>> fuzzed data.
>>>>
>>>
>>> You actually can't do that right now.  Filesystems have to opt in to
>>> being mounted in unprivileged user namespaces, and no filesystems with
>>> backing stores have opted in.  devpts has, but it's buggy without this
>>> patch IMO.
>>
>> Arguably you should use two user namespaces.  The first to do what you
>> want to as root the second to run as the uid you want to run as.
>>
>>>> Anyway, I don't see how this affects devpts though. If you're running in
>>>> a container (or uncontained), as a regular users with no mount
>>>> capabilities you can already mount a devpts filesystem if you create a
>>>> subbcontainer with user namespaces and map your uid to 0 in the
>>>> subcontainer. Then you get a new ptmx device that you can do whatever
>>>> you want with. The mount option would let you do the same, except be
>>>> your regular uid in the subcontainer.
>>>>
>>>> The only difference outside of the subcontainer is that if the outer
>>>> container has no uid 0 mapped, yet the user has CAP_SYSADMIN rights in
>>>> that container. Then he can mount devpts in the outer container where he
>>>> before could only mount it in an inner container.
>>>>
>>>
>>> Agreed.  Also, devpts doesn't seem scary at all to me from a userns
>>> perspective.  Regular users on normal systems can already use ptmx,
>>> and AFAICS basically all of the attack surface is already available
>>> through the normal /dev/ptmx node.
>>
>> My only real take is that there are a lot more places that you need to
>> tweak beyond devpts.  So this patch seemed lacking and boring.
>>
>> Beyond that until I get the mount namespace sorted out things are pretty
>> much in a feature freeze because I can't multitask well enough to do
>> complicated patches and take feature patches.
>>
>
> Eric, do you think you have time now to take a look at this patch?

I am much closer.  Escaping bind mounts is still not yet fixed but I
have code that almost works.

My gut feel still says that two user namespaces one where your 0 is
mapped to your uid and a second where your uid is identity mapped is the
preferable configuration, and makes this patch unnecessary.

I don't think I have heard anyone describe why using a pair of user
namespaces is a problem.

Conceptually as the patch is an efficiency hack on something we can
already do I don't have any semantic grounds to refuse it.  There remain
maintenance concerns (how much else will need this kind of hack), code
complexity concerns, and "is the patch buggy" concerns.

Eric
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander Larsson May 28, 2015, 5:01 p.m. UTC | #23
On Thu, 2015-05-28 at 11:44 -0500, Eric W. Biederman wrote:
> Andy Lutomirski <luto@amacapital.net> writes:
> 
> > On Thu, Apr 2, 2015 at 11:27 AM, Eric W. Biederman
> > <ebiederm@xmission.com> wrote:
> > > Andy Lutomirski <luto@amacapital.net> writes:
> > > 
> > > > On Thu, Apr 2, 2015 at 7:29 AM, Alexander Larsson <
> > > > alexl@redhat.com> wrote:
> > > > > On Thu, 2015-04-02 at 07:06 -0700, Andy Lutomirski wrote:
> > > > > > On Thu, Apr 2, 2015 at 3:12 AM, James Bottomley
> > > > > > <James.Bottomley@hansenpartnership.com> wrote:
> > > > > > > On Tue, 2015-03-31 at 16:17 +0200, Alexander Larsson 
> > > > > > > wrote:
> > > > > > > > On tis, 2015-03-31 at 17:08 +0300, James Bottomley 
> > > > > > > > wrote:
> > > > > > > > > On Tue, 2015-03-31 at 06:59 -0700, Andy Lutomirski 
> > > > > > > > > wrote:
> > > > > > > > > > 
> > > > > > > > > > I don't think that this is correct.  That user can 
> > > > > > > > > > already create a
> > > > > > > > > > nested userns and map themselves as 0 inside it. 
> > > > > > > > > >  Then they can mount
> > > > > > > > > > devpts.
> > > > > > > > > 
> > > > > > > > > I don't mind if they create a container and control 
> > > > > > > > > the isolated ttys in
> > > > > > > > > that sub container in the VPS; that's fine.  I do 
> > > > > > > > > mind if they get
> > > > > > > > > access to the ttys in the VPS.
> > > > > > > > > 
> > > > > > > > > If you can convince me (and the rest of Linux) that 
> > > > > > > > > the tty subsystem
> > > > > > > > > should be mountable by an unprivileged user 
> > > > > > > > > generally, then what you
> > > > > > > > > propose is OK.
> > > > > > > > 
> > > > > > > > That is controlled by the general rights to mount 
> > > > > > > > stuff. I.e. unless you
> > > > > > > > have CAP_SYS_ADMIN in the VPS container you will not be 
> > > > > > > > able to mount
> > > > > > > > devpts there. You can only do it in a subcontainer 
> > > > > > > > where you got
> > > > > > > > permissions to mount via using user namespaces.
> > > > > > > 
> > > > > > > OK let me try again.  Fine, if you want to speak 
> > > > > > > capabilities, you've
> > > > > > > given a non-root user an unexpected capability (the 
> > > > > > > capability of
> > > > > > > creating a ptmx device).  But you haven't used a 
> > > > > > > capability separation
> > > > > > > to do this, you've just hard coded it via a mount 
> > > > > > > parameter mechanism.
> > > > > > > 
> > > > > > > If you want to do this thing, do it properly, so it's 
> > > > > > > acceptable to the
> > > > > > > whole of Linux, not a special corner case for one 
> > > > > > > particular type of
> > > > > > > container.
> > > > > > > 
> > > > > > > Security breaches are created when people code in 
> > > > > > > special, little used,
> > > > > > > corner cases because they don't get as thoroughly tested 
> > > > > > > and inspected
> > > > > > > as generally applicable mechanisms.
> > > > > > > 
> > > > > > > What you want is to be able to use the tty subsystem as a 
> > > > > > > non root user:
> > > > > > > fine, but set that up globally, don't hide it in 
> > > > > > > containers so a lot
> > > > > > > fewer people care.
> > > > > > 
> > > > > > I tend to agree, and not just for the tty subsystem.  This 
> > > > > > is an
> > > > > > attack surface issue.  With unprivileged user namespaces, 
> > > > > > unprivileged
> > > > > > users can create mount namespaces (probably a good thing 
> > > > > > for bind
> > > > > > mounts, etc), network namespaces (reasonably safe by 
> > > > > > themselves),
> > > > > > network interfaces and iptables rules (scary), fresh
> > > > > > instances/superblocks of some filesystems (scariness 
> > > > > > depends on the fs
> > > > > > -- tmpfs is probably fine), and more.
> > > > > > 
> > > > > > I think we should have real controls for this, and this is 
> > > > > > mostly
> > > > > > Eric's domain.  Eric?  A silly issue that sometimes 
> > > > > > prevents devpts
> > > > > > from being mountable isn't a real control, though.
> > > 
> > > I thought the controls for limiting how much of the userspace API
> > > an application could use were called seccomp and seccomp2.
> > > 
> > > Do we need something like a PAM module so that we can set up 
> > > these
> > > controls during login?
> > > 
> > > > > I'm honestly surprised that non-root is allowed to mount 
> > > > > things in
> > > > > general with user namespaces. This was long disabled use for 
> > > > > non-root in
> > > > > Fedora, but it is now enabled.
> > > > > 
> > > > > For instance, using loopback mounted files you could probably 
> > > > > attack
> > > > > some of the less well tested filesystem implementations by 
> > > > > feeding them
> > > > > fuzzed data.
> > > > > 
> > > > 
> > > > You actually can't do that right now.  Filesystems have to opt 
> > > > in to
> > > > being mounted in unprivileged user namespaces, and no 
> > > > filesystems with
> > > > backing stores have opted in.  devpts has, but it's buggy 
> > > > without this
> > > > patch IMO.
> > > 
> > > Arguably you should use two user namespaces.  The first to do 
> > > what you
> > > want to as root the second to run as the uid you want to run as.
> > > 
> > > > > Anyway, I don't see how this affects devpts though. If you're 
> > > > > running in
> > > > > a container (or uncontained), as a regular users with no 
> > > > > mount
> > > > > capabilities you can already mount a devpts filesystem if you 
> > > > > create a
> > > > > subbcontainer with user namespaces and map your uid to 0 in 
> > > > > the
> > > > > subcontainer. Then you get a new ptmx device that you can do 
> > > > > whatever
> > > > > you want with. The mount option would let you do the same, 
> > > > > except be
> > > > > your regular uid in the subcontainer.
> > > > > 
> > > > > The only difference outside of the subcontainer is that if 
> > > > > the outer
> > > > > container has no uid 0 mapped, yet the user has CAP_SYSADMIN 
> > > > > rights in
> > > > > that container. Then he can mount devpts in the outer 
> > > > > container where he
> > > > > before could only mount it in an inner container.
> > > > > 
> > > > 
> > > > Agreed.  Also, devpts doesn't seem scary at all to me from a 
> > > > userns
> > > > perspective.  Regular users on normal systems can already use 
> > > > ptmx,
> > > > and AFAICS basically all of the attack surface is already 
> > > > available
> > > > through the normal /dev/ptmx node.
> > > 
> > > My only real take is that there are a lot more places that you 
> > > need to
> > > tweak beyond devpts.  So this patch seemed lacking and boring.
> > > 
> > > Beyond that until I get the mount namespace sorted out things are 
> > > pretty
> > > much in a feature freeze because I can't multitask well enough to 
> > > do
> > > complicated patches and take feature patches.
> > > 
> > 
> > Eric, do you think you have time now to take a look at this patch?
> 
> I am much closer.  Escaping bind mounts is still not yet fixed but I
> have code that almost works.
> 
> My gut feel still says that two user namespaces one where your 0 is
> mapped to your uid and a second where your uid is identity mapped is 
> the
> preferrable configuration, and makes this patch unnecessary.

I don't really understand this. My usecase is that I want a desktop app
sandbox, it should run as the actual user that is running the graphical
session mapped to its real uid. In this namespace i want a /dev/pts so
that i can e.g. shell out to ssh and feed it a password on the tty
prompt or similar. And i don't want to bind-mount in the host /dev/pts,
because then the sandbox can read from the ttys of other apps.

Where does the second namespace enter into this? 

--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric W. Biederman May 28, 2015, 5:14 p.m. UTC | #24
Alexander Larsson <alexl@redhat.com> writes:

> On Thu, 2015-05-28 at 11:44 -0500, Eric W. Biederman wrote:
>> Andy Lutomirski <luto@amacapital.net> writes:
>> 
>> > On Thu, Apr 2, 2015 at 11:27 AM, Eric W. Biederman
>> > <ebiederm@xmission.com> wrote:
>> > > Andy Lutomirski <luto@amacapital.net> writes:
>> > > 
>> > > > On Thu, Apr 2, 2015 at 7:29 AM, Alexander Larsson <
>> > > > alexl@redhat.com> wrote:
>> > > > > On Thu, 2015-04-02 at 07:06 -0700, Andy Lutomirski wrote:
>> > > > > > On Thu, Apr 2, 2015 at 3:12 AM, James Bottomley
>> > > > > > <James.Bottomley@hansenpartnership.com> wrote:
>> > > > > > > On Tue, 2015-03-31 at 16:17 +0200, Alexander Larsson 
>> > > > > > > wrote:
>> > > > > > > > On tis, 2015-03-31 at 17:08 +0300, James Bottomley 
>> > > > > > > > wrote:
>> > > > > > > > > On Tue, 2015-03-31 at 06:59 -0700, Andy Lutomirski 
>> > > > > > > > > wrote:
>> > > > > > > > > > 
>> > > > > > > > > > I don't think that this is correct.  That user can 
>> > > > > > > > > > already create a
>> > > > > > > > > > nested userns and map themselves as 0 inside it. 
>> > > > > > > > > >  Then they can mount
>> > > > > > > > > > devpts.
>> > > > > > > > > 
>> > > > > > > > > I don't mind if they create a container and control 
>> > > > > > > > > the isolated ttys in
>> > > > > > > > > that sub container in the VPS; that's fine.  I do 
>> > > > > > > > > mind if they get
>> > > > > > > > > access to the ttys in the VPS.
>> > > > > > > > > 
>> > > > > > > > > If you can convince me (and the rest of Linux) that 
>> > > > > > > > > the tty subsystem
>> > > > > > > > > should be mountable by an unprivileged user 
>> > > > > > > > > generally, then what you
>> > > > > > > > > propose is OK.
>> > > > > > > > 
>> > > > > > > > That is controlled by the general rights to mount 
>> > > > > > > > stuff. I.e. unless you
>> > > > > > > > have CAP_SYS_ADMIN in the VPS container you will not be 
>> > > > > > > > able to mount
>> > > > > > > > devpts there. You can only do it in a subcontainer 
>> > > > > > > > where you got
>> > > > > > > > permissions to mount via using user namespaces.
>> > > > > > > 
>> > > > > > > OK let me try again.  Fine, if you want to speak 
>> > > > > > > capabilities, you've
>> > > > > > > given a non-root user an unexpected capability (the 
>> > > > > > > capability of
>> > > > > > > creating a ptmx device).  But you haven't used a 
>> > > > > > > capability separation
>> > > > > > > to do this, you've just hard coded it via a mount 
>> > > > > > > parameter mechanism.
>> > > > > > > 
>> > > > > > > If you want to do this thing, do it properly, so it's 
>> > > > > > > acceptable to the
>> > > > > > > whole of Linux, not a special corner case for one 
>> > > > > > > particular type of
>> > > > > > > container.
>> > > > > > > 
>> > > > > > > Security breaches are created when people code in 
>> > > > > > > special, little used,
>> > > > > > > corner cases because they don't get as thoroughly tested 
>> > > > > > > and inspected
>> > > > > > > as generally applicable mechanisms.
>> > > > > > > 
>> > > > > > > What you want is to be able to use the tty subsystem as a 
>> > > > > > > non root user:
>> > > > > > > fine, but set that up globally, don't hide it in 
>> > > > > > > containers so a lot
>> > > > > > > fewer people care.
>> > > > > > 
>> > > > > > I tend to agree, and not just for the tty subsystem.  This 
>> > > > > > is an
>> > > > > > attack surface issue.  With unprivileged user namespaces, 
>> > > > > > unprivileged
>> > > > > > users can create mount namespaces (probably a good thing 
>> > > > > > for bind
>> > > > > > mounts, etc), network namespaces (reasonably safe by 
>> > > > > > themselves),
>> > > > > > network interfaces and iptables rules (scary), fresh
>> > > > > > instances/superblocks of some filesystems (scariness 
>> > > > > > depends on the fs
>> > > > > > -- tmpfs is probably fine), and more.
>> > > > > > 
>> > > > > > I think we should have real controls for this, and this is 
>> > > > > > mostly
>> > > > > > Eric's domain.  Eric?  A silly issue that sometimes 
>> > > > > > prevents devpts
>> > > > > > from being mountable isn't a real control, though.
>> > > 
>> > > I thought the controls for limiting how much of the userspace API
>> > > an application could use were called seccomp and seccomp2.
>> > > 
>> > > Do we need something like a PAM module so that we can set up 
>> > > these
>> > > controls during login?
>> > > 
>> > > > > I'm honestly surprised that non-root is allowed to mount 
>> > > > > things in
>> > > > > general with user namespaces. This was long disabled use for 
>> > > > > non-root in
>> > > > > Fedora, but it is now enabled.
>> > > > > 
>> > > > > For instance, using loopback mounted files you could probably 
>> > > > > attack
>> > > > > some of the less well tested filesystem implementations by 
>> > > > > feeding them
>> > > > > fuzzed data.
>> > > > > 
>> > > > 
>> > > > You actually can't do that right now.  Filesystems have to opt 
>> > > > in to
>> > > > being mounted in unprivileged user namespaces, and no 
>> > > > filesystems with
>> > > > backing stores have opted in.  devpts has, but it's buggy 
>> > > > without this
>> > > > patch IMO.
>> > > 
>> > > Arguably you should use two user namespaces.  The first to do 
>> > > what you
>> > > want to as root the second to run as the uid you want to run as.
>> > > 
>> > > > > Anyway, I don't see how this affects devpts though. If you're 
>> > > > > running in
>> > > > > a container (or uncontained), as a regular users with no 
>> > > > > mount
>> > > > > capabilities you can already mount a devpts filesystem if you 
>> > > > > create a
>> > > > > subbcontainer with user namespaces and map your uid to 0 in 
>> > > > > the
>> > > > > subcontainer. Then you get a new ptmx device that you can do 
>> > > > > whatever
>> > > > > you want with. The mount option would let you do the same, 
>> > > > > except be
>> > > > > your regular uid in the subcontainer.
>> > > > > 
>> > > > > The only difference outside of the subcontainer is that if 
>> > > > > the outer
>> > > > > container has no uid 0 mapped, yet the user has CAP_SYSADMIN 
>> > > > > rights in
>> > > > > that container. Then he can mount devpts in the outer 
>> > > > > container where he
>> > > > > before could only mount it in an inner container.
>> > > > > 
>> > > > 
>> > > > Agreed.  Also, devpts doesn't seem scary at all to me from a 
>> > > > userns
>> > > > perspective.  Regular users on normal systems can already use 
>> > > > ptmx,
>> > > > and AFAICS basically all of the attack surface is already 
>> > > > available
>> > > > through the normal /dev/ptmx node.
>> > > 
>> > > My only real take is that there are a lot more places that you 
>> > > need to
>> > > tweak beyond devpts.  So this patch seemed lacking and boring.
>> > > 
>> > > Beyond that until I get the mount namespace sorted out things are 
>> > > pretty
>> > > much in a feature freeze because I can't multitask well enough to 
>> > > do
>> > > complicated patches and take feature patches.
>> > > 
>> > 
>> > Eric, do you think you have time now to take a look at this patch?
>> 
>> I am much closer.  Escaping bind mounts is still not yet fixed but I
>> have code that almost works.
>> 
>> My gut feel still says that two user namespaces one where your 0 is
>> mapped to your uid and a second where your uid is identity mapped is 
>> the
>> preferrable configuration, and makes this patch unnecessary.
>
> I don't really understand this. My usecase is that I want a desktop app
> sandbox, it should run as the actual user that is running the graphical
> session mapped to its real uid. In this namespace i want a /dev/pts so
> that i can e.g. shell out to ssh and feed it a password on the tty
> prompt or similar. And i don't want to bind-mount in the host /dev/pts,
> because then the sandbox can read from the ttys of other apps.
>
> Where does the second namespace enter into this? 

Step a.  Create a user namespace where uid 0 is mapped to your
real uid, and set up your sandbox (aka mount /dev/pts and everything
else).

Step b.  Create a nested user namespace where your uid is identity
mapped and run your desktop application.  You can even drop all caps in
your namespace.

Or basically:
    unshare(CLONE_NEWUSER)
    
    map 0 to real_uid
    set things up.
    
    unshare(CLONE_NEWUSER)
    map real_uid to 0 (Because I am assuming we are
                      single threaded in the nested context)
    
    drop caps
    exec /path/to/my/sandboxed/application

Eric
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andy Lutomirski May 28, 2015, 5:30 p.m. UTC | #25
On Thu, May 28, 2015 at 10:01 AM, Alexander Larsson <alexl@redhat.com> wrote:
> On Thu, 2015-05-28 at 11:44 -0500, Eric W. Biederman wrote:
>> Andy Lutomirski <luto@amacapital.net> writes:
>>
>> > On Thu, Apr 2, 2015 at 11:27 AM, Eric W. Biederman
>> > <ebiederm@xmission.com> wrote:
>> > > Andy Lutomirski <luto@amacapital.net> writes:
>> > >
>> > > > On Thu, Apr 2, 2015 at 7:29 AM, Alexander Larsson <
>> > > > alexl@redhat.com> wrote:
>> > > > > On Thu, 2015-04-02 at 07:06 -0700, Andy Lutomirski wrote:
>> > > > > > On Thu, Apr 2, 2015 at 3:12 AM, James Bottomley
>> > > > > > <James.Bottomley@hansenpartnership.com> wrote:
>> > > > > > > On Tue, 2015-03-31 at 16:17 +0200, Alexander Larsson
>> > > > > > > wrote:
>> > > > > > > > On tis, 2015-03-31 at 17:08 +0300, James Bottomley
>> > > > > > > > wrote:
>> > > > > > > > > On Tue, 2015-03-31 at 06:59 -0700, Andy Lutomirski
>> > > > > > > > > wrote:
>> > > > > > > > > >
>> > > > > > > > > > I don't think that this is correct.  That user can
>> > > > > > > > > > already create a
>> > > > > > > > > > nested userns and map themselves as 0 inside it.
>> > > > > > > > > >  Then they can mount
>> > > > > > > > > > devpts.
>> > > > > > > > >
>> > > > > > > > > I don't mind if they create a container and control
>> > > > > > > > > the isolated ttys in
>> > > > > > > > > that sub container in the VPS; that's fine.  I do
>> > > > > > > > > mind if they get
>> > > > > > > > > access to the ttys in the VPS.
>> > > > > > > > >
>> > > > > > > > > If you can convince me (and the rest of Linux) that
>> > > > > > > > > the tty subsystem
>> > > > > > > > > should be mountable by an unprivileged user
>> > > > > > > > > generally, then what you
>> > > > > > > > > propose is OK.
>> > > > > > > >
>> > > > > > > > That is controlled by the general rights to mount
>> > > > > > > > stuff. I.e. unless you
>> > > > > > > > have CAP_SYS_ADMIN in the VPS container you will not be
>> > > > > > > > able to mount
>> > > > > > > > devpts there. You can only do it in a subcontainer
>> > > > > > > > where you got
>> > > > > > > > permissions to mount via using user namespaces.
>> > > > > > >
>> > > > > > > OK let me try again.  Fine, if you want to speak
>> > > > > > > capabilities, you've
>> > > > > > > given a non-root user an unexpected capability (the
>> > > > > > > capability of
>> > > > > > > creating a ptmx device).  But you haven't used a
>> > > > > > > capability separation
>> > > > > > > to do this, you've just hard coded it via a mount
>> > > > > > > parameter mechanism.
>> > > > > > >
>> > > > > > > If you want to do this thing, do it properly, so it's
>> > > > > > > acceptable to the
>> > > > > > > whole of Linux, not a special corner case for one
>> > > > > > > particular type of
>> > > > > > > container.
>> > > > > > >
>> > > > > > > Security breaches are created when people code in
>> > > > > > > special, little used,
>> > > > > > > corner cases because they don't get as thoroughly tested
>> > > > > > > and inspected
>> > > > > > > as generally applicable mechanisms.
>> > > > > > >
>> > > > > > > What you want is to be able to use the tty subsystem as a
>> > > > > > > non root user:
>> > > > > > > fine, but set that up globally, don't hide it in
>> > > > > > > containers so a lot
>> > > > > > > fewer people care.
>> > > > > >
>> > > > > > I tend to agree, and not just for the tty subsystem.  This
>> > > > > > is an
>> > > > > > attack surface issue.  With unprivileged user namespaces,
>> > > > > > unprivileged
>> > > > > > users can create mount namespaces (probably a good thing
>> > > > > > for bind
>> > > > > > mounts, etc), network namespaces (reasonably safe by
>> > > > > > themselves),
>> > > > > > network interfaces and iptables rules (scary), fresh
>> > > > > > instances/superblocks of some filesystems (scariness
>> > > > > > depends on the fs
>> > > > > > -- tmpfs is probably fine), and more.
>> > > > > >
>> > > > > > I think we should have real controls for this, and this is
>> > > > > > mostly
>> > > > > > Eric's domain.  Eric?  A silly issue that sometimes
>> > > > > > prevents devpts
>> > > > > > from being mountable isn't a real control, though.
>> > >
>> > > I thought the controls for limiting how much of the userspace API
>> > > an application could use were called seccomp and seccomp2.
>> > >
>> > > Do we need something like a PAM module so that we can set up
>> > > these
>> > > controls during login?
>> > >
>> > > > > I'm honestly surprised that non-root is allowed to mount
>> > > > > things in
>> > > > > general with user namespaces. This was long disabled use for
>> > > > > non-root in
>> > > > > Fedora, but it is now enabled.
>> > > > >
>> > > > > For instance, using loopback mounted files you could probably
>> > > > > attack
>> > > > > some of the less well tested filesystem implementations by
>> > > > > feeding them
>> > > > > fuzzed data.
>> > > > >
>> > > >
>> > > > You actually can't do that right now.  Filesystems have to opt
>> > > > in to
>> > > > being mounted in unprivileged user namespaces, and no
>> > > > filesystems with
>> > > > backing stores have opted in.  devpts has, but it's buggy
>> > > > without this
>> > > > patch IMO.
>> > >
>> > > Arguably you should use two user namespaces.  The first to do
>> > > what you
>> > > want to as root the second to run as the uid you want to run as.
>> > >
>> > > > > Anyway, I don't see how this affects devpts though. If you're
>> > > > > running in
>> > > > > a container (or uncontained), as a regular users with no
>> > > > > mount
>> > > > > capabilities you can already mount a devpts filesystem if you
>> > > > > create a
>> > > > > subbcontainer with user namespaces and map your uid to 0 in
>> > > > > the
>> > > > > subcontainer. Then you get a new ptmx device that you can do
>> > > > > whatever
>> > > > > you want with. The mount option would let you do the same,
>> > > > > except be
>> > > > > your regular uid in the subcontainer.
>> > > > >
>> > > > > The only difference outside of the subcontainer is that if
>> > > > > the outer
>> > > > > container has no uid 0 mapped, yet the user has CAP_SYSADMIN
>> > > > > rights in
>> > > > > that container. Then he can mount devpts in the outer
>> > > > > container where he
>> > > > > before could only mount it in an inner container.
>> > > > >
>> > > >
>> > > > Agreed.  Also, devpts doesn't seem scary at all to me from a
>> > > > userns
>> > > > perspective.  Regular users on normal systems can already use
>> > > > ptmx,
>> > > > and AFAICS basically all of the attack surface is already
>> > > > available
>> > > > through the normal /dev/ptmx node.
>> > >
>> > > My only real take is that there are a lot more places that you
>> > > need to
>> > > tweak beyond devpts.  So this patch seemed lacking and boring.
>> > >
>> > > Beyond that until I get the mount namespace sorted out things are
>> > > pretty
>> > > much in a feature freeze because I can't multitask well enough to
>> > > do
>> > > complicated patches and take feature patches.
>> > >
>> >
>> > Eric, do you think you have time now to take a look at this patch?
>>
>> I am much closer.  Escaping bind mounts is still not yet fixed but I
>> have code that almost works.
>>
>> My gut feel still says that two user namespaces one where your 0 is
>> mapped to your uid and a second where your uid is identity mapped is
>> the
>> preferrable configuration, and makes this patch unnecessary.
>
> I don't really understand this. My usecase is that I want a desktop app
> sandbox, it should run as the actual user that is running the graphical
> session mapped to its real uid. In this namespace i want a /dev/pts so
> that i can e.g. shell out to ssh and feed it a password on the tty
> prompt or similar. And i don't want to bind-mount in the host /dev/pts,
> because then the sandbox can read from the ttys of other apps.
>
> Where does the second namespace enter into this?
>

I think Eric is suggesting making a user namespace that maps your uid
as 0, then making a mount namespace and mounting devpts, then making
*another* user namespace that maps your uid (seen as 0) back to
whatever nonzero number you want.

That would probably work, but I think it's really ugly.

--Andy
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander Larsson May 28, 2015, 5:35 p.m. UTC | #26
On Thu, 2015-05-28 at 12:14 -0500, Eric W. Biederman wrote:
> Alexander Larsson <alexl@redhat.com> writes:
> 
> > On Thu, 2015-05-28 at 11:44 -0500, Eric W. Biederman wrote:
> > > Andy Lutomirski <luto@amacapital.net> writes:
> > > 
> > > > On Thu, Apr 2, 2015 at 11:27 AM, Eric W. Biederman
> > > > <ebiederm@xmission.com> wrote:
> > > > > Andy Lutomirski <luto@amacapital.net> writes:
> > > > > 
> > > > > > On Thu, Apr 2, 2015 at 7:29 AM, Alexander Larsson <
> > > > > > alexl@redhat.com> wrote:
> > > > > > > On Thu, 2015-04-02 at 07:06 -0700, Andy Lutomirski wrote:
> > > > > > > > On Thu, Apr 2, 2015 at 3:12 AM, James Bottomley
> > > > > > > > <James.Bottomley@hansenpartnership.com> wrote:
> > > > > > > > > On Tue, 2015-03-31 at 16:17 +0200, Alexander Larsson 
> > > > > > > > > wrote:
> > > > > > > > > > On tis, 2015-03-31 at 17:08 +0300, James Bottomley 
> > > > > > > > > > wrote:
> > > > > > > > > > > On Tue, 2015-03-31 at 06:59 -0700, Andy 
> > > > > > > > > > > Lutomirski 
> > > > > > > > > > > wrote:
> > > > > > > > > > > > 
> > > > > > > > > > > > I don't think that this is correct.  That user 
> > > > > > > > > > > > can 
> > > > > > > > > > > > already create a
> > > > > > > > > > > > nested userns and map themselves as 0 inside 
> > > > > > > > > > > > it. 
> > > > > > > > > > > >  Then they can mount
> > > > > > > > > > > > devpts.
> > > > > > > > > > > 
> > > > > > > > > > > I don't mind if they create a container and 
> > > > > > > > > > > control 
> > > > > > > > > > > the isolated ttys in
> > > > > > > > > > > that sub container in the VPS; that's fine.  I do 
> > > > > > > > > > > 
> > > > > > > > > > > mind if they get
> > > > > > > > > > > access to the ttys in the VPS.
> > > > > > > > > > > 
> > > > > > > > > > > If you can convince me (and the rest of Linux) 
> > > > > > > > > > > that 
> > > > > > > > > > > the tty subsystem
> > > > > > > > > > > should be mountable by an unprivileged user 
> > > > > > > > > > > generally, then what you
> > > > > > > > > > > propose is OK.
> > > > > > > > > > 
> > > > > > > > > > That is controlled by the general rights to mount 
> > > > > > > > > > stuff. I.e. unless you
> > > > > > > > > > have CAP_SYS_ADMIN in the VPS container you will 
> > > > > > > > > > not be 
> > > > > > > > > > able to mount
> > > > > > > > > > devpts there. You can only do it in a subcontainer 
> > > > > > > > > > where you got
> > > > > > > > > > permissions to mount via using user namespaces.
> > > > > > > > > 
> > > > > > > > > OK let me try again.  Fine, if you want to speak 
> > > > > > > > > capabilities, you've
> > > > > > > > > given a non-root user an unexpected capability (the 
> > > > > > > > > capability of
> > > > > > > > > creating a ptmx device).  But you haven't used a 
> > > > > > > > > capability separation
> > > > > > > > > to do this, you've just hard coded it via a mount 
> > > > > > > > > parameter mechanism.
> > > > > > > > > 
> > > > > > > > > If you want to do this thing, do it properly, so it's 
> > > > > > > > > 
> > > > > > > > > acceptable to the
> > > > > > > > > whole of Linux, not a special corner case for one 
> > > > > > > > > particular type of
> > > > > > > > > container.
> > > > > > > > > 
> > > > > > > > > Security breaches are created when people code in 
> > > > > > > > > special, little used,
> > > > > > > > > corner cases because they don't get as thoroughly 
> > > > > > > > > tested 
> > > > > > > > > and inspected
> > > > > > > > > as generally applicable mechanisms.
> > > > > > > > > 
> > > > > > > > > What you want is to be able to use the tty subsystem 
> > > > > > > > > as a 
> > > > > > > > > non root user:
> > > > > > > > > fine, but set that up globally, don't hide it in 
> > > > > > > > > containers so a lot
> > > > > > > > > fewer people care.
> > > > > > > > 
> > > > > > > > I tend to agree, and not just for the tty subsystem. 
> > > > > > > >  This 
> > > > > > > > is an
> > > > > > > > attack surface issue.  With unprivileged user 
> > > > > > > > namespaces, 
> > > > > > > > unprivileged
> > > > > > > > users can create mount namespaces (probably a good 
> > > > > > > > thing 
> > > > > > > > for bind
> > > > > > > > mounts, etc), network namespaces (reasonably safe by 
> > > > > > > > themselves),
> > > > > > > > network interfaces and iptables rules (scary), fresh
> > > > > > > > instances/superblocks of some filesystems (scariness 
> > > > > > > > depends on the fs
> > > > > > > > -- tmpfs is probably fine), and more.
> > > > > > > > 
> > > > > > > > I think we should have real controls for this, and this 
> > > > > > > > is 
> > > > > > > > mostly
> > > > > > > > Eric's domain.  Eric?  A silly issue that sometimes 
> > > > > > > > prevents devpts
> > > > > > > > from being mountable isn't a real control, though.
> > > > > 
> > > > > I thought the controls for limiting how much of the userspace 
> > > > > API
> > > > > an application could use were called seccomp and seccomp2.
> > > > > 
> > > > > Do we need something like a PAM module so that we can set up 
> > > > > these
> > > > > controls during login?
> > > > > 
> > > > > > > I'm honestly surprised that non-root is allowed to mount 
> > > > > > > things in
> > > > > > > general with user namespaces. This was long disabled use 
> > > > > > > for 
> > > > > > > non-root in
> > > > > > > Fedora, but it is now enabled.
> > > > > > > 
> > > > > > > For instance, using loopback mounted files you could 
> > > > > > > probably 
> > > > > > > attack
> > > > > > > some of the less well tested filesystem implementations 
> > > > > > > by 
> > > > > > > feeding them
> > > > > > > fuzzed data.
> > > > > > > 
> > > > > > 
> > > > > > You actually can't do that right now.  Filesystems have to 
> > > > > > opt 
> > > > > > in to
> > > > > > being mounted in unprivileged user namespaces, and no 
> > > > > > filesystems with
> > > > > > backing stores have opted in.  devpts has, but it's buggy 
> > > > > > without this
> > > > > > patch IMO.
> > > > > 
> > > > > Arguably you should use two user namespaces.  The first to do 
> > > > > 
> > > > > what you
> > > > > want to as root the second to run as the uid you want to run 
> > > > > as.
> > > > > 
> > > > > > > Anyway, I don't see how this affects devpts though. If 
> > > > > > > you're 
> > > > > > > running in
> > > > > > > a container (or uncontained), as a regular users with no 
> > > > > > > mount
> > > > > > > capabilities you can already mount a devpts filesystem if 
> > > > > > > you 
> > > > > > > create a
> > > > > > > subbcontainer with user namespaces and map your uid to 0 
> > > > > > > in 
> > > > > > > the
> > > > > > > subcontainer. Then you get a new ptmx device that you can 
> > > > > > > do 
> > > > > > > whatever
> > > > > > > you want with. The mount option would let you do the 
> > > > > > > same, 
> > > > > > > except be
> > > > > > > your regular uid in the subcontainer.
> > > > > > > 
> > > > > > > The only difference outside of the subcontainer is that 
> > > > > > > if 
> > > > > > > the outer
> > > > > > > container has no uid 0 mapped, yet the user has 
> > > > > > > CAP_SYSADMIN 
> > > > > > > rights in
> > > > > > > that container. Then he can mount devpts in the outer 
> > > > > > > container where he
> > > > > > > before could only mount it in an inner container.
> > > > > > > 
> > > > > > 
> > > > > > Agreed.  Also, devpts doesn't seem scary at all to me from 
> > > > > > a 
> > > > > > userns
> > > > > > perspective.  Regular users on normal systems can already 
> > > > > > use 
> > > > > > ptmx,
> > > > > > and AFAICS basically all of the attack surface is already 
> > > > > > available
> > > > > > through the normal /dev/ptmx node.
> > > > > 
> > > > > My only real take is that there are a lot more places that 
> > > > > you 
> > > > > need to
> > > > > tweak beyond devpts.  So this patch seemed lacking and 
> > > > > boring.
> > > > > 
> > > > > Beyond that until I get the mount namespace sorted out things 
> > > > > are 
> > > > > pretty
> > > > > much in a feature freeze because I can't multitask well 
> > > > > enough to 
> > > > > do
> > > > > complicated patches and take feature patches.
> > > > > 
> > > > 
> > > > Eric, do you think you have time now to take a look at this 
> > > > patch?
> > > 
> > > I am much closer.  Escaping bind mounts is still not yet fixed 
> > > but I
> > > have code that almost works.
> > > 
> > > My gut feel still says that two user namespaces one where your 0 
> > > is
> > > mapped to your uid and a second where your uid is identity mapped 
> > > is 
> > > the
> > > preferrable configuration, and makes this patch unnecessary.
> > 
> > I don't really understand this. My usecase is that I want a desktop 
> > app
> > sandbox, it should run as the actual user that is running the 
> > graphical
> > session mapped to its real uid. In this namespace i want a /dev/pts 
> > so
> > that i can e.g. shell out to ssh and feed it a password on the tty
> > prompt or similar. And i don't want to bind-mount in the host 
> > /dev/pts,
> > because then the sandbox can read from the ttys of other apps.
> > 
> > Where does the second namespace enter into this? 
> 
> Step a.  Create create a user namespace where uid 0 is mapped to your
> real uid, and set up your sandbox (aka mount /dev/pts and everything
> else).
> 
> Step b.  Create a nested user namespace where your uid is identity
> mapped and run your desktop application.  You can even drop all caps 
> in
> your namespace.
> 
> Or basically:
>     unshare(CLONE_NEWUSER)
>     
>     map 0 to real_uid
>     set things up.
>     
>     unshare(CLONE_NEWUSER)
>     map real_uid to 0 (Because I am assuming we are
>                       single threaded in the nested context)
>     
>     drop caps
>     exec /path/to/my/sandboxed/application

Thanks. I'll try that.

--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander Larsson May 28, 2015, 8:06 p.m. UTC | #27
On Thu, 2015-05-28 at 12:14 -0500, Eric W. Biederman wrote:
> 
> > Where does the second namespace enter into this? 
> 
> Step a.  Create create a user namespace where uid 0 is mapped to your
> real uid, and set up your sandbox (aka mount /dev/pts and everything
> else).
> 
> Step b.  Create a nested user namespace where your uid is identity
> mapped and run your desktop application.  You can even drop all caps 
> in
> your namespace.

Just tried this. It's not the nicest, and it doubles the number of
namespaces in action for each sandbox, but it does work.
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Kenton Varda May 28, 2015, 8:17 p.m. UTC | #28
On Thu, May 28, 2015 at 1:06 PM, Alexander Larsson <alexl@redhat.com> wrote:
> On Thu, 2015-05-28 at 12:14 -0500, Eric W. Biederman wrote:
>>
>> > Where does the second namespace enter into this?
>>
>> Step a.  Create create a user namespace where uid 0 is mapped to your
>> real uid, and set up your sandbox (aka mount /dev/pts and everything
>> else).
>>
>> Step b.  Create a nested user namespace where your uid is identity
>> mapped and run your desktop application.  You can even drop all caps
>> in
>> your namespace.
>
> Just tried this. Its not the nicest, and it doubles the number of
> namespaces in action for each sandbox, but it does work.

How much overhead is involved in each user namespace? Is there any
system-wide limit on total namespaces, other than RAM? Is there
(non-negligible) CPU overhead for each syscall seeking permissions in
the namespace?

-Kenton
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric W. Biederman May 28, 2015, 9:50 p.m. UTC | #29
Kenton Varda <kenton@sandstorm.io> writes:

> On Thu, May 28, 2015 at 1:06 PM, Alexander Larsson <alexl@redhat.com> wrote:
>> On Thu, 2015-05-28 at 12:14 -0500, Eric W. Biederman wrote:
>>>
>>> > Where does the second namespace enter into this?
>>>
>>> Step a.  Create create a user namespace where uid 0 is mapped to your
>>> real uid, and set up your sandbox (aka mount /dev/pts and everything
>>> else).
>>>
>>> Step b.  Create a nested user namespace where your uid is identity
>>> mapped and run your desktop application.  You can even drop all caps
>>> in
>>> your namespace.
>>
>> Just tried this. Its not the nicest, and it doubles the number of
>> namespaces in action for each sandbox, but it does work.
>
> How much overhead is involved in each user namespace?

sizeof(struct user_namespace).

> Is there any system-wide limit on total namespaces, other than RAM? 

There is a system-wide maximum depth, but not count.

>  Is there
> (non-negligible) CPU overhead for each syscall seeking permissions in
> the namespace?

ns_capable(ns, X) in some cases can walk up from a starting user
namespace to the initial user namespace.  (The only non-constant
operation I am aware of).  However unless the user namespace depth is
deep it should still take a negligible amount of time.

Eric
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/Documentation/filesystems/devpts.txt b/Documentation/filesystems/devpts.txt
index 68dffd87f9b7..7808e77d0d72 100644
--- a/Documentation/filesystems/devpts.txt
+++ b/Documentation/filesystems/devpts.txt
@@ -121,6 +121,10 @@  once), following user-space issues should be noted.
 
 	chmod 666 /dev/pts/ptmx
 
+   The ownership for /dev/pts/ptmx can be specified using the ptmxuid
+   and ptmxgid options.  Both default to zero, which, in user namespaces
+   that have no root user, will cause mounting to fail.
+
 7. A mount of devpts without the 'newinstance' option results in binding to
    initial kernel mount.  This behavior while preserving legacy semantics,
    does not provide strict isolation in a container environment. i.e by
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index cfe8466f7fef..b60d1438c660 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -102,6 +102,8 @@  struct pts_mount_opts {
 	int setgid;
 	kuid_t   uid;
 	kgid_t   gid;
+	uid_t ptmx_uid;
+	gid_t ptmx_gid;
 	umode_t mode;
 	umode_t ptmxmode;
 	int newinstance;
@@ -109,8 +111,8 @@  struct pts_mount_opts {
 };
 
 enum {
-	Opt_uid, Opt_gid, Opt_mode, Opt_ptmxmode, Opt_newinstance,  Opt_max,
-	Opt_err
+	Opt_uid, Opt_gid, Opt_ptmx_uid, Opt_ptmx_gid, Opt_mode, Opt_ptmxmode,
+	Opt_newinstance,  Opt_max, Opt_err,
 };
 
 static const match_table_t tokens = {
@@ -118,6 +120,8 @@  static const match_table_t tokens = {
 	{Opt_gid, "gid=%u"},
 	{Opt_mode, "mode=%o"},
 #ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
+	{Opt_ptmx_uid, "ptmxuid=%u"},
+	{Opt_ptmx_gid, "ptmxgid=%u"},
 	{Opt_ptmxmode, "ptmxmode=%o"},
 	{Opt_newinstance, "newinstance"},
 	{Opt_max, "max=%d"},
@@ -162,14 +166,17 @@  static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)
 	char *p;
 	kuid_t uid;
 	kgid_t gid;
-
-	opts->setuid  = 0;
-	opts->setgid  = 0;
-	opts->uid     = GLOBAL_ROOT_UID;
-	opts->gid     = GLOBAL_ROOT_GID;
-	opts->mode    = DEVPTS_DEFAULT_MODE;
+	bool setptmxid = false;
+
+	opts->setuid   = 0;
+	opts->setgid   = 0;
+	opts->uid      = GLOBAL_ROOT_UID;
+	opts->gid      = GLOBAL_ROOT_GID;
+	opts->ptmx_uid = 0;
+	opts->ptmx_gid = 0;
+	opts->mode     = DEVPTS_DEFAULT_MODE;
 	opts->ptmxmode = DEVPTS_DEFAULT_PTMX_MODE;
-	opts->max     = NR_UNIX98_PTY_MAX;
+	opts->max      = NR_UNIX98_PTY_MAX;
 
 	/* newinstance makes sense only on initial mount */
 	if (op == PARSE_MOUNT)
@@ -209,6 +216,22 @@  static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)
 			opts->mode = option & S_IALLUGO;
 			break;
 #ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
+		case Opt_ptmx_uid:
+			if (match_int(&args[0], &option))
+				return -EINVAL;
+			if (op != PARSE_MOUNT)
+				return -EINVAL;
+			opts->ptmx_uid = option;
+			setptmxid = true;
+			break;
+		case Opt_ptmx_gid:
+			if (match_int(&args[0], &option))
+				return -EINVAL;
+			if (op != PARSE_MOUNT)
+				return -EINVAL;
+			opts->ptmx_gid = option;
+			setptmxid = true;
+			break;
 		case Opt_ptmxmode:
 			if (match_octal(&args[0], &option))
 				return -EINVAL;
@@ -232,6 +255,9 @@  static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)
 		}
 	}
 
+	if (setptmxid && !opts->newinstance)
+		return -EINVAL;
+
 	return 0;
 }
 
@@ -245,12 +271,12 @@  static int mknod_ptmx(struct super_block *sb)
 	struct dentry *root = sb->s_root;
 	struct pts_fs_info *fsi = DEVPTS_SB(sb);
 	struct pts_mount_opts *opts = &fsi->mount_opts;
-	kuid_t root_uid;
-	kgid_t root_gid;
+	kuid_t ptmx_uid;
+	kgid_t ptmx_gid;
 
-	root_uid = make_kuid(current_user_ns(), 0);
-	root_gid = make_kgid(current_user_ns(), 0);
-	if (!uid_valid(root_uid) || !gid_valid(root_gid))
+	ptmx_uid = make_kuid(current_user_ns(), opts->ptmx_uid);
+	ptmx_gid = make_kgid(current_user_ns(), opts->ptmx_gid);
+	if (!uid_valid(ptmx_uid) || !gid_valid(ptmx_gid))
 		return -EINVAL;
 
 	mutex_lock(&root->d_inode->i_mutex);
@@ -282,8 +308,8 @@  static int mknod_ptmx(struct super_block *sb)
 
 	mode = S_IFCHR|opts->ptmxmode;
 	init_special_inode(inode, mode, MKDEV(TTYAUX_MAJOR, 2));
-	inode->i_uid = root_uid;
-	inode->i_gid = root_gid;
+	inode->i_uid = ptmx_uid;
+	inode->i_gid = ptmx_gid;
 
 	d_add(dentry, inode);