diff mbox series

[v6,02/11] fs: add O_ALLOW_ENCODED open flag

Message ID 977fd16687d8b0474fd9c442f79c23f53783e403.1605723568.git.osandov@fb.com (mailing list archive)
State New, archived
Headers show
Series fs: interface for directly reading/writing compressed data | expand

Commit Message

Omar Sandoval Nov. 18, 2020, 7:18 p.m. UTC
From: Omar Sandoval <osandov@fb.com>

The upcoming RWF_ENCODED operation introduces some security concerns:

1. Compressed writes will pass arbitrary data to decompression
   algorithms in the kernel.
2. Compressed reads can leak truncated/hole punched data.

Therefore, we need to require privilege for RWF_ENCODED. It's not
possible to do the permissions checks at the time of the read or write
because, e.g., io_uring submits IO from a worker thread. So, add an open
flag which requires CAP_SYS_ADMIN. It can also be set and cleared with
fcntl(). The flag is not cleared in any way on fork or exec. It must be
combined with O_CLOEXEC when opening to avoid accidental leaks (if
needed, it may be set without O_CLOEXEC by using fcntl()).

Note that the usual issue that unknown open flags are ignored doesn't
really matter for O_ALLOW_ENCODED; if the kernel doesn't support
O_ALLOW_ENCODED, then it doesn't support RWF_ENCODED, either.

Signed-off-by: Omar Sandoval <osandov@fb.com>
---
 arch/alpha/include/uapi/asm/fcntl.h  |  1 +
 arch/parisc/include/uapi/asm/fcntl.h |  1 +
 arch/sparc/include/uapi/asm/fcntl.h  |  1 +
 fs/fcntl.c                           | 10 ++++++++--
 fs/namei.c                           |  4 ++++
 fs/open.c                            |  7 +++++++
 include/linux/fcntl.h                |  2 +-
 include/uapi/asm-generic/fcntl.h     |  4 ++++
 8 files changed, 27 insertions(+), 3 deletions(-)

Comments

Amir Goldstein Nov. 19, 2020, 7:02 a.m. UTC | #1
On Wed, Nov 18, 2020 at 9:18 PM Omar Sandoval <osandov@osandov.com> wrote:
>
> From: Omar Sandoval <osandov@fb.com>
>
> The upcoming RWF_ENCODED operation introduces some security concerns:
>
> 1. Compressed writes will pass arbitrary data to decompression
>    algorithms in the kernel.
> 2. Compressed reads can leak truncated/hole punched data.
>
> Therefore, we need to require privilege for RWF_ENCODED. It's not
> possible to do the permissions checks at the time of the read or write
> because, e.g., io_uring submits IO from a worker thread. So, add an open
> flag which requires CAP_SYS_ADMIN. It can also be set and cleared with
> fcntl(). The flag is not cleared in any way on fork or exec. It must be
> combined with O_CLOEXEC when opening to avoid accidental leaks (if
> needed, it may be set without O_CLOEXEC by using fnctl()).
>
> Note that the usual issue that unknown open flags are ignored doesn't
> really matter for O_ALLOW_ENCODED; if the kernel doesn't support
> O_ALLOW_ENCODED, then it doesn't support RWF_ENCODED, either.
>
> Signed-off-by: Omar Sandoval <osandov@fb.com>
> ---
>  arch/alpha/include/uapi/asm/fcntl.h  |  1 +
>  arch/parisc/include/uapi/asm/fcntl.h |  1 +
>  arch/sparc/include/uapi/asm/fcntl.h  |  1 +
>  fs/fcntl.c                           | 10 ++++++++--
>  fs/namei.c                           |  4 ++++
>  fs/open.c                            |  7 +++++++
>  include/linux/fcntl.h                |  2 +-
>  include/uapi/asm-generic/fcntl.h     |  4 ++++
>  8 files changed, 27 insertions(+), 3 deletions(-)
>
> diff --git a/arch/alpha/include/uapi/asm/fcntl.h b/arch/alpha/include/uapi/asm/fcntl.h
> index 50bdc8e8a271..391e0d112e41 100644
> --- a/arch/alpha/include/uapi/asm/fcntl.h
> +++ b/arch/alpha/include/uapi/asm/fcntl.h
> @@ -34,6 +34,7 @@
>
>  #define O_PATH         040000000
>  #define __O_TMPFILE    0100000000
> +#define O_ALLOW_ENCODED        0200000000
>
>  #define F_GETLK                7
>  #define F_SETLK                8
> diff --git a/arch/parisc/include/uapi/asm/fcntl.h b/arch/parisc/include/uapi/asm/fcntl.h
> index 03dee816cb13..72ea9bdf5f04 100644
> --- a/arch/parisc/include/uapi/asm/fcntl.h
> +++ b/arch/parisc/include/uapi/asm/fcntl.h
> @@ -19,6 +19,7 @@
>
>  #define O_PATH         020000000
>  #define __O_TMPFILE    040000000
> +#define O_ALLOW_ENCODED        100000000
>
>  #define F_GETLK64      8
>  #define F_SETLK64      9
> diff --git a/arch/sparc/include/uapi/asm/fcntl.h b/arch/sparc/include/uapi/asm/fcntl.h
> index 67dae75e5274..ac3e8c9cb32c 100644
> --- a/arch/sparc/include/uapi/asm/fcntl.h
> +++ b/arch/sparc/include/uapi/asm/fcntl.h
> @@ -37,6 +37,7 @@
>
>  #define O_PATH         0x1000000
>  #define __O_TMPFILE    0x2000000
> +#define O_ALLOW_ENCODED        0x8000000
>
>  #define F_GETOWN       5       /*  for sockets. */
>  #define F_SETOWN       6       /*  for sockets. */
> diff --git a/fs/fcntl.c b/fs/fcntl.c
> index 19ac5baad50f..9302f68fe698 100644
> --- a/fs/fcntl.c
> +++ b/fs/fcntl.c
> @@ -30,7 +30,8 @@
>  #include <asm/siginfo.h>
>  #include <linux/uaccess.h>
>
> -#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME)
> +#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME | \
> +                   O_ALLOW_ENCODED)
>
>  static int setfl(int fd, struct file * filp, unsigned long arg)
>  {
> @@ -49,6 +50,11 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
>                 if (!inode_owner_or_capable(inode))
>                         return -EPERM;
>
> +       /* O_ALLOW_ENCODED can only be set by superuser */
> +       if ((arg & O_ALLOW_ENCODED) && !(filp->f_flags & O_ALLOW_ENCODED) &&
> +           !capable(CAP_SYS_ADMIN))
> +               return -EPERM;
> +
>         /* required for strict SunOS emulation */
>         if (O_NONBLOCK != O_NDELAY)
>                if (arg & O_NDELAY)
> @@ -1033,7 +1039,7 @@ static int __init fcntl_init(void)
>          * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
>          * is defined as O_NONBLOCK on some platforms and not on others.
>          */
> -       BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ !=
> +       BUILD_BUG_ON(22 - 1 /* for O_RDONLY being 0 */ !=
>                 HWEIGHT32(
>                         (VALID_OPEN_FLAGS & ~(O_NONBLOCK | O_NDELAY)) |
>                         __FMODE_EXEC | __FMODE_NONOTIFY));
> diff --git a/fs/namei.c b/fs/namei.c
> index d4a6dd772303..fbf64ce61088 100644
> --- a/fs/namei.c
> +++ b/fs/namei.c
> @@ -2890,6 +2890,10 @@ static int may_open(const struct path *path, int acc_mode, int flag)
>         if (flag & O_NOATIME && !inode_owner_or_capable(inode))
>                 return -EPERM;
>
> +       /* O_ALLOW_ENCODED can only be set by superuser */
> +       if ((flag & O_ALLOW_ENCODED) && !capable(CAP_SYS_ADMIN))
> +               return -EPERM;
> +
>         return 0;
>  }
>
> diff --git a/fs/open.c b/fs/open.c
> index 9af548fb841b..f2863aaf78e7 100644
> --- a/fs/open.c
> +++ b/fs/open.c
> @@ -1040,6 +1040,13 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op)
>                 acc_mode = 0;
>         }
>
> +       /*
> +        * O_ALLOW_ENCODED must be combined with O_CLOEXEC to avoid accidentally
> +        * leaking encoded I/O privileges.
> +        */
> +       if ((how->flags & (O_ALLOW_ENCODED | O_CLOEXEC)) == O_ALLOW_ENCODED)
> +               return -EINVAL;
> +


dup() can also result in an accidental leak.
We could fail dup() of fd without O_CLOEXEC. Should we?

If we should, then what error code should it be? We could return EPERM,
but since we do allow clearing O_CLOEXEC or setting O_ALLOW_ENCODED
after open, EPERM seems a tad harsh.
EINVAL seems inappropriate because the error has nothing to do with
input args of dup() and EBADF would also be confusing.

Thanks,
Amir.
Jann Horn Nov. 20, 2020, 11:41 p.m. UTC | #2
On Thu, Nov 19, 2020 at 8:03 AM Amir Goldstein <amir73il@gmail.com> wrote:
> On Wed, Nov 18, 2020 at 9:18 PM Omar Sandoval <osandov@osandov.com> wrote:
> > The upcoming RWF_ENCODED operation introduces some security concerns:
> >
> > 1. Compressed writes will pass arbitrary data to decompression
> >    algorithms in the kernel.
> > 2. Compressed reads can leak truncated/hole punched data.
> >
> > Therefore, we need to require privilege for RWF_ENCODED. It's not
> > possible to do the permissions checks at the time of the read or write
> > because, e.g., io_uring submits IO from a worker thread. So, add an open
> > flag which requires CAP_SYS_ADMIN. It can also be set and cleared with
> > fcntl(). The flag is not cleared in any way on fork or exec. It must be
> > combined with O_CLOEXEC when opening to avoid accidental leaks (if
> > needed, it may be set without O_CLOEXEC by using fnctl()).
> >
> > Note that the usual issue that unknown open flags are ignored doesn't
> > really matter for O_ALLOW_ENCODED; if the kernel doesn't support
> > O_ALLOW_ENCODED, then it doesn't support RWF_ENCODED, either.
[...]
> > diff --git a/fs/open.c b/fs/open.c
> > index 9af548fb841b..f2863aaf78e7 100644
> > --- a/fs/open.c
> > +++ b/fs/open.c
> > @@ -1040,6 +1040,13 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op)
> >                 acc_mode = 0;
> >         }
> >
> > +       /*
> > +        * O_ALLOW_ENCODED must be combined with O_CLOEXEC to avoid accidentally
> > +        * leaking encoded I/O privileges.
> > +        */
> > +       if ((how->flags & (O_ALLOW_ENCODED | O_CLOEXEC)) == O_ALLOW_ENCODED)
> > +               return -EINVAL;
> > +
>
>
> dup() can also result in accidental leak.
> We could fail dup() of fd without O_CLOEXEC. Should we?
>
> If we should than what error code should it be? We could return EPERM,
> but since we do allow to clear O_CLOEXEC or set O_ALLOW_ENCODED
> after open, EPERM seems a tad harsh.
> EINVAL seems inappropriate because the error has nothing to do with
> input args of dup() and EBADF would also be confusing.

This seems very arbitrary to me. Sure, leaking these file descriptors
wouldn't be great, but there are plenty of other types of file
descriptors that are probably more sensitive. (Writable file
descriptors to databases, to important configuration files, to
io_uring instances, and so on.) So I don't see why this specific
feature should impose such special rules on it.
Omar Sandoval Nov. 30, 2020, 7:26 p.m. UTC | #3
On Sat, Nov 21, 2020 at 12:41:23AM +0100, Jann Horn wrote:
> On Thu, Nov 19, 2020 at 8:03 AM Amir Goldstein <amir73il@gmail.com> wrote:
> > On Wed, Nov 18, 2020 at 9:18 PM Omar Sandoval <osandov@osandov.com> wrote:
> > > The upcoming RWF_ENCODED operation introduces some security concerns:
> > >
> > > 1. Compressed writes will pass arbitrary data to decompression
> > >    algorithms in the kernel.
> > > 2. Compressed reads can leak truncated/hole punched data.
> > >
> > > Therefore, we need to require privilege for RWF_ENCODED. It's not
> > > possible to do the permissions checks at the time of the read or write
> > > because, e.g., io_uring submits IO from a worker thread. So, add an open
> > > flag which requires CAP_SYS_ADMIN. It can also be set and cleared with
> > > fcntl(). The flag is not cleared in any way on fork or exec. It must be
> > > combined with O_CLOEXEC when opening to avoid accidental leaks (if
> > > needed, it may be set without O_CLOEXEC by using fnctl()).
> > >
> > > Note that the usual issue that unknown open flags are ignored doesn't
> > > really matter for O_ALLOW_ENCODED; if the kernel doesn't support
> > > O_ALLOW_ENCODED, then it doesn't support RWF_ENCODED, either.
> [...]
> > > diff --git a/fs/open.c b/fs/open.c
> > > index 9af548fb841b..f2863aaf78e7 100644
> > > --- a/fs/open.c
> > > +++ b/fs/open.c
> > > @@ -1040,6 +1040,13 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op)
> > >                 acc_mode = 0;
> > >         }
> > >
> > > +       /*
> > > +        * O_ALLOW_ENCODED must be combined with O_CLOEXEC to avoid accidentally
> > > +        * leaking encoded I/O privileges.
> > > +        */
> > > +       if ((how->flags & (O_ALLOW_ENCODED | O_CLOEXEC)) == O_ALLOW_ENCODED)
> > > +               return -EINVAL;
> > > +
> >
> >
> > dup() can also result in accidental leak.
> > We could fail dup() of fd without O_CLOEXEC. Should we?
> >
> > If we should than what error code should it be? We could return EPERM,
> > but since we do allow to clear O_CLOEXEC or set O_ALLOW_ENCODED
> > after open, EPERM seems a tad harsh.
> > EINVAL seems inappropriate because the error has nothing to do with
> > input args of dup() and EBADF would also be confusing.
> 
> This seems very arbitrary to me. Sure, leaking these file descriptors
> wouldn't be great, but there are plenty of other types of file
> descriptors that are probably more sensitive. (Writable file
> descriptors to databases, to important configuration files, to
> io_uring instances, and so on.) So I don't see why this specific
> feature should impose such special rules on it.

I agree with Jann. I'm okay with the O_CLOEXEC-on-open requirement if it
makes people more comfortable, but I don't think we should be bending
over backwards to block it anywhere else.
Amir Goldstein Dec. 1, 2020, 8:15 a.m. UTC | #4
On Mon, Nov 30, 2020 at 9:26 PM Omar Sandoval <osandov@osandov.com> wrote:
>
> On Sat, Nov 21, 2020 at 12:41:23AM +0100, Jann Horn wrote:
> > On Thu, Nov 19, 2020 at 8:03 AM Amir Goldstein <amir73il@gmail.com> wrote:
> > > On Wed, Nov 18, 2020 at 9:18 PM Omar Sandoval <osandov@osandov.com> wrote:
> > > > The upcoming RWF_ENCODED operation introduces some security concerns:
> > > >
> > > > 1. Compressed writes will pass arbitrary data to decompression
> > > >    algorithms in the kernel.
> > > > 2. Compressed reads can leak truncated/hole punched data.
> > > >
> > > > Therefore, we need to require privilege for RWF_ENCODED. It's not
> > > > possible to do the permissions checks at the time of the read or write
> > > > because, e.g., io_uring submits IO from a worker thread. So, add an open
> > > > flag which requires CAP_SYS_ADMIN. It can also be set and cleared with
> > > > fcntl(). The flag is not cleared in any way on fork or exec. It must be
> > > > combined with O_CLOEXEC when opening to avoid accidental leaks (if
> > > > needed, it may be set without O_CLOEXEC by using fnctl()).
> > > >
> > > > Note that the usual issue that unknown open flags are ignored doesn't
> > > > really matter for O_ALLOW_ENCODED; if the kernel doesn't support
> > > > O_ALLOW_ENCODED, then it doesn't support RWF_ENCODED, either.
> > [...]
> > > > diff --git a/fs/open.c b/fs/open.c
> > > > index 9af548fb841b..f2863aaf78e7 100644
> > > > --- a/fs/open.c
> > > > +++ b/fs/open.c
> > > > @@ -1040,6 +1040,13 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op)
> > > >                 acc_mode = 0;
> > > >         }
> > > >
> > > > +       /*
> > > > +        * O_ALLOW_ENCODED must be combined with O_CLOEXEC to avoid accidentally
> > > > +        * leaking encoded I/O privileges.
> > > > +        */
> > > > +       if ((how->flags & (O_ALLOW_ENCODED | O_CLOEXEC)) == O_ALLOW_ENCODED)
> > > > +               return -EINVAL;
> > > > +
> > >
> > >
> > > dup() can also result in accidental leak.
> > > We could fail dup() of fd without O_CLOEXEC. Should we?
> > >
> > > If we should than what error code should it be? We could return EPERM,
> > > but since we do allow to clear O_CLOEXEC or set O_ALLOW_ENCODED
> > > after open, EPERM seems a tad harsh.
> > > EINVAL seems inappropriate because the error has nothing to do with
> > > input args of dup() and EBADF would also be confusing.
> >
> > This seems very arbitrary to me. Sure, leaking these file descriptors
> > wouldn't be great, but there are plenty of other types of file
> > descriptors that are probably more sensitive. (Writable file
> > descriptors to databases, to important configuration files, to
> > io_uring instances, and so on.) So I don't see why this specific
> > feature should impose such special rules on it.
>
> I agree with Jann. I'm okay with the O_CLOEXEC-on-open requirement if it
> makes people more comfortable, but I don't think we should be bending
> over backwards to block it anywhere else.

I'm fine with or without the O_CLOEXEC-on-open requirement.
Just pointing out the weirdness.

Thanks,
Amir.
Omar Sandoval Dec. 1, 2020, 8:31 p.m. UTC | #5
On Tue, Dec 01, 2020 at 10:15:58AM +0200, Amir Goldstein wrote:
> On Mon, Nov 30, 2020 at 9:26 PM Omar Sandoval <osandov@osandov.com> wrote:
> >
> > On Sat, Nov 21, 2020 at 12:41:23AM +0100, Jann Horn wrote:
> > > On Thu, Nov 19, 2020 at 8:03 AM Amir Goldstein <amir73il@gmail.com> wrote:
> > > > On Wed, Nov 18, 2020 at 9:18 PM Omar Sandoval <osandov@osandov.com> wrote:
> > > > > The upcoming RWF_ENCODED operation introduces some security concerns:
> > > > >
> > > > > 1. Compressed writes will pass arbitrary data to decompression
> > > > >    algorithms in the kernel.
> > > > > 2. Compressed reads can leak truncated/hole punched data.
> > > > >
> > > > > Therefore, we need to require privilege for RWF_ENCODED. It's not
> > > > > possible to do the permissions checks at the time of the read or write
> > > > > because, e.g., io_uring submits IO from a worker thread. So, add an open
> > > > > flag which requires CAP_SYS_ADMIN. It can also be set and cleared with
> > > > > fcntl(). The flag is not cleared in any way on fork or exec. It must be
> > > > > combined with O_CLOEXEC when opening to avoid accidental leaks (if
> > > > > needed, it may be set without O_CLOEXEC by using fnctl()).
> > > > >
> > > > > Note that the usual issue that unknown open flags are ignored doesn't
> > > > > really matter for O_ALLOW_ENCODED; if the kernel doesn't support
> > > > > O_ALLOW_ENCODED, then it doesn't support RWF_ENCODED, either.
> > > [...]
> > > > > diff --git a/fs/open.c b/fs/open.c
> > > > > index 9af548fb841b..f2863aaf78e7 100644
> > > > > --- a/fs/open.c
> > > > > +++ b/fs/open.c
> > > > > @@ -1040,6 +1040,13 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op)
> > > > >                 acc_mode = 0;
> > > > >         }
> > > > >
> > > > > +       /*
> > > > > +        * O_ALLOW_ENCODED must be combined with O_CLOEXEC to avoid accidentally
> > > > > +        * leaking encoded I/O privileges.
> > > > > +        */
> > > > > +       if ((how->flags & (O_ALLOW_ENCODED | O_CLOEXEC)) == O_ALLOW_ENCODED)
> > > > > +               return -EINVAL;
> > > > > +
> > > >
> > > >
> > > > dup() can also result in accidental leak.
> > > > We could fail dup() of fd without O_CLOEXEC. Should we?
> > > >
> > > > If we should than what error code should it be? We could return EPERM,
> > > > but since we do allow to clear O_CLOEXEC or set O_ALLOW_ENCODED
> > > > after open, EPERM seems a tad harsh.
> > > > EINVAL seems inappropriate because the error has nothing to do with
> > > > input args of dup() and EBADF would also be confusing.
> > >
> > > This seems very arbitrary to me. Sure, leaking these file descriptors
> > > wouldn't be great, but there are plenty of other types of file
> > > descriptors that are probably more sensitive. (Writable file
> > > descriptors to databases, to important configuration files, to
> > > io_uring instances, and so on.) So I don't see why this specific
> > > feature should impose such special rules on it.
> >
> > I agree with Jann. I'm okay with the O_CLOEXEC-on-open requirement if it
> > makes people more comfortable, but I don't think we should be bending
> > over backwards to block it anywhere else.
> 
> I'm fine with or without the O_CLOEXEC-on-open requirement.
> Just pointing out the weirdness.

I agree, it's weird to enforce it in one place but not in others, so I
think I might as well drop the O_CLOEXEC requirement altogether.
diff mbox series

Patch

diff --git a/arch/alpha/include/uapi/asm/fcntl.h b/arch/alpha/include/uapi/asm/fcntl.h
index 50bdc8e8a271..391e0d112e41 100644
--- a/arch/alpha/include/uapi/asm/fcntl.h
+++ b/arch/alpha/include/uapi/asm/fcntl.h
@@ -34,6 +34,7 @@ 
 
 #define O_PATH		040000000
 #define __O_TMPFILE	0100000000
+#define O_ALLOW_ENCODED	0200000000
 
 #define F_GETLK		7
 #define F_SETLK		8
diff --git a/arch/parisc/include/uapi/asm/fcntl.h b/arch/parisc/include/uapi/asm/fcntl.h
index 03dee816cb13..72ea9bdf5f04 100644
--- a/arch/parisc/include/uapi/asm/fcntl.h
+++ b/arch/parisc/include/uapi/asm/fcntl.h
@@ -19,6 +19,7 @@ 
 
 #define O_PATH		020000000
 #define __O_TMPFILE	040000000
+#define O_ALLOW_ENCODED	0100000000
 
 #define F_GETLK64	8
 #define F_SETLK64	9
diff --git a/arch/sparc/include/uapi/asm/fcntl.h b/arch/sparc/include/uapi/asm/fcntl.h
index 67dae75e5274..ac3e8c9cb32c 100644
--- a/arch/sparc/include/uapi/asm/fcntl.h
+++ b/arch/sparc/include/uapi/asm/fcntl.h
@@ -37,6 +37,7 @@ 
 
 #define O_PATH		0x1000000
 #define __O_TMPFILE	0x2000000
+#define O_ALLOW_ENCODED	0x8000000
 
 #define F_GETOWN	5	/*  for sockets. */
 #define F_SETOWN	6	/*  for sockets. */
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 19ac5baad50f..9302f68fe698 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -30,7 +30,8 @@ 
 #include <asm/siginfo.h>
 #include <linux/uaccess.h>
 
-#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME)
+#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME | \
+		    O_ALLOW_ENCODED)
 
 static int setfl(int fd, struct file * filp, unsigned long arg)
 {
@@ -49,6 +50,11 @@  static int setfl(int fd, struct file * filp, unsigned long arg)
 		if (!inode_owner_or_capable(inode))
 			return -EPERM;
 
+	/* O_ALLOW_ENCODED can only be set by superuser */
+	if ((arg & O_ALLOW_ENCODED) && !(filp->f_flags & O_ALLOW_ENCODED) &&
+	    !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
 	/* required for strict SunOS emulation */
 	if (O_NONBLOCK != O_NDELAY)
 	       if (arg & O_NDELAY)
@@ -1033,7 +1039,7 @@  static int __init fcntl_init(void)
 	 * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
 	 * is defined as O_NONBLOCK on some platforms and not on others.
 	 */
-	BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ !=
+	BUILD_BUG_ON(22 - 1 /* for O_RDONLY being 0 */ !=
 		HWEIGHT32(
 			(VALID_OPEN_FLAGS & ~(O_NONBLOCK | O_NDELAY)) |
 			__FMODE_EXEC | __FMODE_NONOTIFY));
diff --git a/fs/namei.c b/fs/namei.c
index d4a6dd772303..fbf64ce61088 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2890,6 +2890,10 @@  static int may_open(const struct path *path, int acc_mode, int flag)
 	if (flag & O_NOATIME && !inode_owner_or_capable(inode))
 		return -EPERM;
 
+	/* O_ALLOW_ENCODED can only be set by superuser */
+	if ((flag & O_ALLOW_ENCODED) && !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
 	return 0;
 }
 
diff --git a/fs/open.c b/fs/open.c
index 9af548fb841b..f2863aaf78e7 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1040,6 +1040,13 @@  inline int build_open_flags(const struct open_how *how, struct open_flags *op)
 		acc_mode = 0;
 	}
 
+	/*
+	 * O_ALLOW_ENCODED must be combined with O_CLOEXEC to avoid accidentally
+	 * leaking encoded I/O privileges.
+	 */
+	if ((how->flags & (O_ALLOW_ENCODED | O_CLOEXEC)) == O_ALLOW_ENCODED)
+		return -EINVAL;
+
 	/*
 	 * O_SYNC is implemented as __O_SYNC|O_DSYNC.  As many places only
 	 * check for O_DSYNC if the need any syncing at all we enforce it's
diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h
index 921e750843e6..dc66c557b7d0 100644
--- a/include/linux/fcntl.h
+++ b/include/linux/fcntl.h
@@ -10,7 +10,7 @@ 
 	(O_RDONLY | O_WRONLY | O_RDWR | O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC | \
 	 O_APPEND | O_NDELAY | O_NONBLOCK | __O_SYNC | O_DSYNC | \
 	 FASYNC	| O_DIRECT | O_LARGEFILE | O_DIRECTORY | O_NOFOLLOW | \
-	 O_NOATIME | O_CLOEXEC | O_PATH | __O_TMPFILE)
+	 O_NOATIME | O_CLOEXEC | O_PATH | __O_TMPFILE | O_ALLOW_ENCODED)
 
 /* List of all valid flags for the how->upgrade_mask argument: */
 #define VALID_UPGRADE_FLAGS \
diff --git a/include/uapi/asm-generic/fcntl.h b/include/uapi/asm-generic/fcntl.h
index 9dc0bf0c5a6e..75321c7a66ac 100644
--- a/include/uapi/asm-generic/fcntl.h
+++ b/include/uapi/asm-generic/fcntl.h
@@ -89,6 +89,10 @@ 
 #define __O_TMPFILE	020000000
 #endif
 
+#ifndef O_ALLOW_ENCODED
+#define O_ALLOW_ENCODED	040000000
+#endif
+
 /* a horrid kludge trying to make sure that this will fail on old kernels */
 #define O_TMPFILE (__O_TMPFILE | O_DIRECTORY)
 #define O_TMPFILE_MASK (__O_TMPFILE | O_DIRECTORY | O_CREAT)