diff mbox

linux-user: add option to intercept execve() syscalls

Message ID 1453091602-21843-1-git-send-email-petrosagg@gmail.com (mailing list archive)
State New, archived
Headers show

Commit Message

Petros Angelatos Jan. 18, 2016, 4:33 a.m. UTC
From: Petros Angelatos <petrosagg@resin.io>

In order for one to use QEMU user mode emulation under a chroot, it is
required to use binfmt_misc. This can be avoided by QEMU never doing a
raw execve() to the host system.

Introduce a new option, -execve=path, that sets the absolute path to the
QEMU interpreter and enables execve() interception. When a guest process
tries to call execve(), qemu_execve() is called instead.

qemu_execve() will prepend the interpreter set with -execve, similar to
what binfmt_misc would do, and then pass the modified execve() to the
host.

It is necessary to parse hashbang scripts in that function otherwise
the kernel will try to run the interpreter of a script without QEMU and
get an invalid exec format error.

Signed-off-by: Petros Angelatos <petrosagg@resin.io>
---
 linux-user/main.c    |   8 ++++
 linux-user/qemu.h    |   1 +
 linux-user/syscall.c | 111 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 119 insertions(+), 1 deletion(-)

Comments

Laurent Vivier Jan. 20, 2016, 11:34 p.m. UTC | #1
Hi Petros,

Le 18/01/2016 05:33, Petros Angelatos a écrit :
> From: Petros Angelatos <petrosagg@resin.io>
> 
> In order for one to use QEMU user mode emulation under a chroot, it is
> required to use binfmt_misc. This can be avoided by QEMU never doing a
> raw execve() to the host system.

Are there some reasons to not use binfmt_misc when we are able to do
chroot ?

Moreover binfmt_misc allows to execute binaries that cannot be read, I
think it is not possible with an userspace solution. And binfmt_misc
also allows to use credential and security tokens from the binaries, not
from the interpreter (See [1]), it is useful to run commands like "sudo".

With this solution, you can't mix several interpreters in your chroot: I
guess LXC has templates to create (ubuntu) containers where some
binaries are statically linked native ones to manage syscalls (like
netlink) that are not supported by qemu linux-user.

That said, I have nothing against the idea but I don't understand what
it is useful for... do you have some use cases ?

I think it is better to use kernel mechanisms when they are available...

Laurent
[1] https://patchwork.ozlabs.org/patch/215941/
    https://www.kernel.org/doc/Documentation/binfmt_misc.txt

> Introduce a new option, -execve=path, that sets the absolute path to the
> QEMU interpreter and enables execve() interception. When a guest process
> tries to call execve(), qemu_execve() is called instead.
> 
> qemu_execve() will prepend the interpreter set with -execve, similar to
> what binfmt_misc would do, and then pass the modified execve() to the
> host.
> 
> It is necessary to parse hashbang scripts in that function otherwise
> the kernel will try to run the interpreter of a script without QEMU and
> get an invalid exec format error.
> 
> Signed-off-by: Petros Angelatos <petrosagg@resin.io>
> ---
>  linux-user/main.c    |   8 ++++
>  linux-user/qemu.h    |   1 +
>  linux-user/syscall.c | 111 ++++++++++++++++++++++++++++++++++++++++++++++++++-
>  3 files changed, 119 insertions(+), 1 deletion(-)
> 
> diff --git a/linux-user/main.c b/linux-user/main.c
> index ee12035..5951279 100644
> --- a/linux-user/main.c
> +++ b/linux-user/main.c
> @@ -79,6 +79,7 @@ static void usage(int exitcode);
>  
>  static const char *interp_prefix = CONFIG_QEMU_INTERP_PREFIX;
>  const char *qemu_uname_release;
> +const char *qemu_execve_path;
>  
>  /* XXX: on x86 MAP_GROWSDOWN only works if ESP <= address + 32, so
>     we allocate a bigger stack. Need a better solution, for example
> @@ -3828,6 +3829,11 @@ static void handle_arg_guest_base(const char *arg)
>      have_guest_base = 1;
>  }
>  
> +static void handle_arg_execve(const char *arg)
> +{
> +    qemu_execve_path = strdup(arg);
> +}
> +
>  static void handle_arg_reserved_va(const char *arg)
>  {
>      char *p;
> @@ -3913,6 +3919,8 @@ static const struct qemu_argument arg_table[] = {
>       "uname",      "set qemu uname release string to 'uname'"},
>      {"B",          "QEMU_GUEST_BASE",  true,  handle_arg_guest_base,
>       "address",    "set guest_base address to 'address'"},
> +    {"execve",     "QEMU_EXECVE",      true,   handle_arg_execve,
> +     "path",       "use interpreter at 'path' when a process calls execve()"},
>      {"R",          "QEMU_RESERVED_VA", true,  handle_arg_reserved_va,
>       "size",       "reserve 'size' bytes for guest virtual address space"},
>      {"d",          "QEMU_LOG",         true,  handle_arg_log,
> diff --git a/linux-user/qemu.h b/linux-user/qemu.h
> index bd90cc3..0d9b058 100644
> --- a/linux-user/qemu.h
> +++ b/linux-user/qemu.h
> @@ -140,6 +140,7 @@ void init_task_state(TaskState *ts);
>  void task_settid(TaskState *);
>  void stop_all_tasks(void);
>  extern const char *qemu_uname_release;
> +extern const char *qemu_execve_path;
>  extern unsigned long mmap_min_addr;
>  
>  /* ??? See if we can avoid exposing so much of the loader internals.  */
> diff --git a/linux-user/syscall.c b/linux-user/syscall.c
> index 0cbace4..d0b5442 100644
> --- a/linux-user/syscall.c
> +++ b/linux-user/syscall.c
> @@ -5854,6 +5854,109 @@ static target_timer_t get_timer_id(abi_long arg)
>      return timerid;
>  }
>  
> +#define BINPRM_BUF_SIZE 128
> +
> +/* qemu_execve() Must return target values and target errnos. */
> +static abi_long qemu_execve(char *filename, char *argv[],
> +                  char *envp[])
> +{
> +    char *i_arg = NULL, *i_name = NULL;
> +    char **new_argp;
> +    int argc, fd, ret, i, offset = 3;
> +    char *cp;
> +    char buf[BINPRM_BUF_SIZE];
> +
> +    for (argc = 0; argv[argc] != NULL; argc++) {
> +        /* nothing */ ;
> +    }
> +
> +    fd = open(filename, O_RDONLY);
> +    if (fd == -1) {
> +        return -ENOENT;
> +    }
> +
> +    ret = read(fd, buf, BINPRM_BUF_SIZE);
> +    if (ret == -1) {
> +        close(fd);
> +        return -ENOENT;
> +    }
> +
> +    close(fd);
> +
> +    /* adapted from the kernel
> +     * https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/fs/binfmt_script.c
> +     */
> +    if ((buf[0] == '#') && (buf[1] == '!')) {
> +        /*
> +         * This section does the #! interpretation.
> +         * Sorta complicated, but hopefully it will work.  -TYT
> +         */
> +
> +        buf[BINPRM_BUF_SIZE - 1] = '\0';
> +        cp = strchr(buf, '\n');
> +        if (cp == NULL) {
> +            cp = buf+BINPRM_BUF_SIZE-1;
> +        }
> +        *cp = '\0';
> +        while (cp > buf) {
> +            cp--;
> +            if ((*cp == ' ') || (*cp == '\t')) {
> +                *cp = '\0';
> +            } else {
> +                break;
> +            }
> +        }
> +        for (cp = buf+2; (*cp == ' ') || (*cp == '\t'); cp++) {
> +            /* nothing */ ;
> +        }
> +        if (*cp == '\0') {
> +            return -ENOEXEC; /* No interpreter name found */
> +        }
> +        i_name = cp;
> +        i_arg = NULL;
> +        for ( ; *cp && (*cp != ' ') && (*cp != '\t'); cp++) {
> +            /* nothing */ ;
> +        }
> +        while ((*cp == ' ') || (*cp == '\t')) {
> +            *cp++ = '\0';
> +        }
> +        if (*cp) {
> +            i_arg = cp;
> +        }
> +
> +        if (i_arg) {
> +            offset = 5;
> +        } else {
> +            offset = 4;
> +        }
> +    }
> +
> +    new_argp = alloca((argc + offset + 1) * sizeof(void *));
> +
> +    /* Copy the original arguments with offset */
> +    for (i = 0; i < argc; i++) {
> +        new_argp[i + offset] = argv[i];
> +    }
> +
> +    new_argp[0] = strdup(qemu_execve_path);
> +    new_argp[1] = strdup("-0");
> +    new_argp[offset] = filename;
> +    new_argp[argc + offset] = NULL;
> +
> +    if (i_name) {
> +        new_argp[2] = i_name;
> +        new_argp[3] = i_name;
> +
> +        if (i_arg) {
> +            new_argp[4] = i_arg;
> +        }
> +    } else {
> +        new_argp[2] = argv[0];
> +    }
> +
> +    return get_errno(execve(qemu_execve_path, new_argp, envp));
> +}
> +
>  /* do_syscall() should always have a single exit point at the end so
>     that actions, such as logging of syscall results, can be performed.
>     All errnos that do_syscall() returns must be -TARGET_<errcode>. */
> @@ -6113,7 +6216,13 @@ abi_long do_syscall(void *cpu_env, int num, abi_long arg1,
>  
>              if (!(p = lock_user_string(arg1)))
>                  goto execve_efault;
> -            ret = get_errno(execve(p, argp, envp));
> +
> +            if (qemu_execve_path && *qemu_execve_path) {
> +                ret = get_errno(qemu_execve(p, argp, envp));
> +            } else {
> +                ret = get_errno(execve(p, argp, envp));
> +            }
> +
>              unlock_user(p, arg1, 0);
>  
>              goto execve_end;
>
Petros Angelatos Jan. 21, 2016, 1:49 a.m. UTC | #2
Hi Laurent,

> Are there some reasons to not use binfmt_misc when we are able to do
> chroot ?
>
> Moreover binfmt_misc allows to execute binaries that cannot be read, I
> think it is not possible with an userspace solution. And binfmt_misc
> also allows to use credential and security tokens from the binaries, not
> from the interpreter (See [1]), it is useful to run commands like "sudo".
>
> With this solution, you can't mix several interpreters in your chroot: I
> guess LXC has templates to create (ubuntu) containers where some
> binaries are statically linked native ones to manage syscalls (like
> netlink) that are not supported by qemu linux-user.
>
> I think it is better to use kernel mechanisms when they are available...

Definitely! If binfmt_misc is available as an option then it is a
superior choice for all the reasons and edge cases you outlined above.
However, binfmt_misc isn't always a choice. Even if you have enough
permissions to do a chroot() the module might just not be available or
you might not have enough permissions to load it and configure it.

Maybe it would make sense to add some caveats of this option in the
documentation?

> That said, I have nothing against the idea but I don't understand what
> it is useful for... do you have some use cases ?

My main use case for this is opening up the possibility to build ARM
Docker images using hosted services like Dockerhub where you don't
have access to the kernel. Also, it allows projects on Github to use
services like Travis CI to do continuous integration and testing on
foreign architectures.

I assume it will also work under a fakechroot although I haven't tested this.

I've written a blog post[1] on the details of the usecase and it has
been found to be very useful from people in the Docker for ARM
community.

[1] https://resin.io/blog/building-arm-containers-on-any-x86-machine-even-dockerhub/

Best,
Petros
Laurent Vivier Jan. 21, 2016, 10:12 a.m. UTC | #3
Le 18/01/2016 05:33, Petros Angelatos a écrit :
> From: Petros Angelatos <petrosagg@resin.io>
> 
> In order for one to use QEMU user mode emulation under a chroot, it is
> required to use binfmt_misc. This can be avoided by QEMU never doing a
> raw execve() to the host system.
> 
> Introduce a new option, -execve=path, that sets the absolute path to the
> QEMU interpreter and enables execve() interception. When a guest process
> tries to call execve(), qemu_execve() is called instead.
> 
> qemu_execve() will prepend the interpreter set with -execve, similar to
> what binfmt_misc would do, and then pass the modified execve() to the
> host.
> 
> It is necessary to parse hashbang scripts in that function otherwise
> the kernel will try to run the interpreter of a script without QEMU and
> get an invalid exec format error.
> 
> Signed-off-by: Petros Angelatos <petrosagg@resin.io>
> ---
>  linux-user/main.c    |   8 ++++
>  linux-user/qemu.h    |   1 +
>  linux-user/syscall.c | 111 ++++++++++++++++++++++++++++++++++++++++++++++++++-
>  3 files changed, 119 insertions(+), 1 deletion(-)
> 
> diff --git a/linux-user/main.c b/linux-user/main.c
> index ee12035..5951279 100644
> --- a/linux-user/main.c
> +++ b/linux-user/main.c
> @@ -79,6 +79,7 @@ static void usage(int exitcode);
>  
>  static const char *interp_prefix = CONFIG_QEMU_INTERP_PREFIX;
>  const char *qemu_uname_release;
> +const char *qemu_execve_path;
>  
>  /* XXX: on x86 MAP_GROWSDOWN only works if ESP <= address + 32, so
>     we allocate a bigger stack. Need a better solution, for example
> @@ -3828,6 +3829,11 @@ static void handle_arg_guest_base(const char *arg)
>      have_guest_base = 1;
>  }
>  
> +static void handle_arg_execve(const char *arg)
> +{
> +    qemu_execve_path = strdup(arg);

I think you can use the parameter just as an on/off switch and
realpath(argv[0]) as qemu_execve_path.

I don't see any reason to use other binary than the one in use.

> +}
> +
>  static void handle_arg_reserved_va(const char *arg)
>  {
>      char *p;
> @@ -3913,6 +3919,8 @@ static const struct qemu_argument arg_table[] = {
>       "uname",      "set qemu uname release string to 'uname'"},
>      {"B",          "QEMU_GUEST_BASE",  true,  handle_arg_guest_base,
>       "address",    "set guest_base address to 'address'"},
> +    {"execve",     "QEMU_EXECVE",      true,   handle_arg_execve,
> +     "path",       "use interpreter at 'path' when a process calls execve()"},
>      {"R",          "QEMU_RESERVED_VA", true,  handle_arg_reserved_va,
>       "size",       "reserve 'size' bytes for guest virtual address space"},
>      {"d",          "QEMU_LOG",         true,  handle_arg_log,
> diff --git a/linux-user/qemu.h b/linux-user/qemu.h
> index bd90cc3..0d9b058 100644
> --- a/linux-user/qemu.h
> +++ b/linux-user/qemu.h
> @@ -140,6 +140,7 @@ void init_task_state(TaskState *ts);
>  void task_settid(TaskState *);
>  void stop_all_tasks(void);
>  extern const char *qemu_uname_release;
> +extern const char *qemu_execve_path;
>  extern unsigned long mmap_min_addr;
>  
>  /* ??? See if we can avoid exposing so much of the loader internals.  */
> diff --git a/linux-user/syscall.c b/linux-user/syscall.c
> index 0cbace4..d0b5442 100644
> --- a/linux-user/syscall.c
> +++ b/linux-user/syscall.c
> @@ -5854,6 +5854,109 @@ static target_timer_t get_timer_id(abi_long arg)
>      return timerid;
>  }
>  
> +#define BINPRM_BUF_SIZE 128

This is defined in <linux/binfmts.h>

> +/* qemu_execve() Must return target values and target errnos. */
> +static abi_long qemu_execve(char *filename, char *argv[],
> +                  char *envp[])
> +{
> +    char *i_arg = NULL, *i_name = NULL;
> +    char **new_argp;
> +    int argc, fd, ret, i, offset = 3;
> +    char *cp;
> +    char buf[BINPRM_BUF_SIZE];
> +
> +    for (argc = 0; argv[argc] != NULL; argc++) {
> +        /* nothing */ ;
> +    }
> +
> +    fd = open(filename, O_RDONLY);
> +    if (fd == -1) {
> +        return -ENOENT;

return -errno; ?

> +    }
> +
> +    ret = read(fd, buf, BINPRM_BUF_SIZE);
> +    if (ret == -1) {
> +        close(fd);
> +        return -ENOENT;

return -errno; ?

> +    }
> +
> +    close(fd);
> +
> +    /* adapted from the kernel
> +     * https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/fs/binfmt_script.c
> +     */
> +    if ((buf[0] == '#') && (buf[1] == '!')) {

what happens if read() < 2 ?

> +        /*
> +         * This section does the #! interpretation.
> +         * Sorta complicated, but hopefully it will work.  -TYT
> +         */
> +
> +        buf[BINPRM_BUF_SIZE - 1] = '\0';
> +        cp = strchr(buf, '\n');
> +        if (cp == NULL) {
> +            cp = buf+BINPRM_BUF_SIZE-1;
> +        }
> +        *cp = '\0';
> +        while (cp > buf) {
> +            cp--;
> +            if ((*cp == ' ') || (*cp == '\t')) {
> +                *cp = '\0';
> +            } else {
> +                break;
> +            }
> +        }
> +        for (cp = buf+2; (*cp == ' ') || (*cp == '\t'); cp++) {
> +            /* nothing */ ;
> +        }
> +        if (*cp == '\0') {
> +            return -ENOEXEC; /* No interpreter name found */
> +        }
> +        i_name = cp;
> +        i_arg = NULL;
> +        for ( ; *cp && (*cp != ' ') && (*cp != '\t'); cp++) {
> +            /* nothing */ ;
> +        }
> +        while ((*cp == ' ') || (*cp == '\t')) {
> +            *cp++ = '\0';
> +        }
> +        if (*cp) {
> +            i_arg = cp;
> +        }
> +
> +        if (i_arg) {
> +            offset = 5;
> +        } else {
> +            offset = 4;
> +        }
> +    }
> +
> +    new_argp = alloca((argc + offset + 1) * sizeof(void *));
> +
> +    /* Copy the original arguments with offset */
> +    for (i = 0; i < argc; i++) {
> +        new_argp[i + offset] = argv[i];
> +    }
> +
> +    new_argp[0] = strdup(qemu_execve_path);
> +    new_argp[1] = strdup("-0");
> +    new_argp[offset] = filename;
> +    new_argp[argc + offset] = NULL;
> +
> +    if (i_name) {
> +        new_argp[2] = i_name;
> +        new_argp[3] = i_name;
> +
> +        if (i_arg) {
> +            new_argp[4] = i_arg;
> +        }
> +    } else {
> +        new_argp[2] = argv[0];
> +    }
> +
> +    return get_errno(execve(qemu_execve_path, new_argp, envp));

duplicate get_errno() with the caller.

> +}
> +
>  /* do_syscall() should always have a single exit point at the end so
>     that actions, such as logging of syscall results, can be performed.
>     All errnos that do_syscall() returns must be -TARGET_<errcode>. */
> @@ -6113,7 +6216,13 @@ abi_long do_syscall(void *cpu_env, int num, abi_long arg1,
>  
>              if (!(p = lock_user_string(arg1)))
>                  goto execve_efault;
> -            ret = get_errno(execve(p, argp, envp));
> +
> +            if (qemu_execve_path && *qemu_execve_path) {
> +                ret = get_errno(qemu_execve(p, argp, envp));
> +            } else {
> +                ret = get_errno(execve(p, argp, envp));
> +            }
> +

what do you think of:

    ret = qemu_execve(p, argp, envp);

and in qemu_execve():

    if (qemu_execve_path == NULL || *qemu_execve_path == 0) {
        return get_errno(execve(p, argp, envp));
    }

so all the logic is in the function.

>              unlock_user(p, arg1, 0);
>  
>              goto execve_end;
>
Petros Angelatos Jan. 22, 2016, 10:01 a.m. UTC | #4
>> diff --git a/linux-user/main.c b/linux-user/main.c
>> index ee12035..5951279 100644
>> --- a/linux-user/main.c
>> +++ b/linux-user/main.c
>> @@ -79,6 +79,7 @@ static void usage(int exitcode);
>>
>>  static const char *interp_prefix = CONFIG_QEMU_INTERP_PREFIX;
>>  const char *qemu_uname_release;
>> +const char *qemu_execve_path;
>>
>>  /* XXX: on x86 MAP_GROWSDOWN only works if ESP <= address + 32, so
>>     we allocate a bigger stack. Need a better solution, for example
>> @@ -3828,6 +3829,11 @@ static void handle_arg_guest_base(const char *arg)
>>      have_guest_base = 1;
>>  }
>>
>> +static void handle_arg_execve(const char *arg)
>> +{
>> +    qemu_execve_path = strdup(arg);
>
> I think you can use the parameter just as an on/off switch and
> realpath(argv[0]) as qemu_execve_path.
>
> I don't see any reason to use other binary than the one in use.

This was my initial approach too, but argv[0] can be just the filename
like "qemu-arm-static". And while I could add extra logic to look this
up in the PATH, someone could run it from a completely different
location. Then I looked for a way to get the path of the current
executable but every platform has its own way of doing that and I
didn't want to add all these cases.

https://stackoverflow.com/questions/1023306/finding-current-executables-path-without-proc-self-exe

>> diff --git a/linux-user/syscall.c b/linux-user/syscall.c
>> index 0cbace4..d0b5442 100644
>> --- a/linux-user/syscall.c
>> +++ b/linux-user/syscall.c
>> @@ -5854,6 +5854,109 @@ static target_timer_t get_timer_id(abi_long arg)
>>      return timerid;
>>  }
>>
>> +#define BINPRM_BUF_SIZE 128
>
> This is defined in <linux/binfmts.h>

Got it, I'll add this header and remove the definition.

>
>> +/* qemu_execve() Must return target values and target errnos. */
>> +static abi_long qemu_execve(char *filename, char *argv[],
>> +                  char *envp[])
>> +{
>> +    char *i_arg = NULL, *i_name = NULL;
>> +    char **new_argp;
>> +    int argc, fd, ret, i, offset = 3;
>> +    char *cp;
>> +    char buf[BINPRM_BUF_SIZE];
>> +
>> +    for (argc = 0; argv[argc] != NULL; argc++) {
>> +        /* nothing */ ;
>> +    }
>> +
>> +    fd = open(filename, O_RDONLY);
>> +    if (fd == -1) {
>> +        return -ENOENT;
>
> return -errno; ?

Will fix in v2

>> +    ret = read(fd, buf, BINPRM_BUF_SIZE);
>> +    if (ret == -1) {
>> +        close(fd);
>> +        return -ENOENT;
>
> return -errno; ?

Will fix in v2

>> +    }
>> +
>> +    close(fd);
>> +
>> +    /* adapted from the kernel
>> +     * https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/fs/binfmt_script.c
>> +     */
>> +    if ((buf[0] == '#') && (buf[1] == '!')) {
>
> what happens if read() < 2 ?

Hm, the easy option is for qemu_execve to return ENOEXEC or EIO.
Otherwise I could retry the read N times? I'm not sure how to handle
this if we don't want to return an error.

>> +    /* Copy the original arguments with offset */
>> +    for (i = 0; i < argc; i++) {
>> +        new_argp[i + offset] = argv[i];
>> +    }
>> +
>> +    new_argp[0] = strdup(qemu_execve_path);
>> +    new_argp[1] = strdup("-0");
>> +    new_argp[offset] = filename;
>> +    new_argp[argc + offset] = NULL;
>> +
>> +    if (i_name) {
>> +        new_argp[2] = i_name;
>> +        new_argp[3] = i_name;
>> +
>> +        if (i_arg) {
>> +            new_argp[4] = i_arg;
>> +        }
>> +    } else {
>> +        new_argp[2] = argv[0];
>> +    }
>> +
>> +    return get_errno(execve(qemu_execve_path, new_argp, envp));
>
> duplicate get_errno() with the caller.

I'll add the logic proposed bellow in this function and remove the
duplicate get_errno() from the caller.

>>  /* do_syscall() should always have a single exit point at the end so
>>     that actions, such as logging of syscall results, can be performed.
>>     All errnos that do_syscall() returns must be -TARGET_<errcode>. */
>> @@ -6113,7 +6216,13 @@ abi_long do_syscall(void *cpu_env, int num, abi_long arg1,
>>
>>              if (!(p = lock_user_string(arg1)))
>>                  goto execve_efault;
>> -            ret = get_errno(execve(p, argp, envp));
>> +
>> +            if (qemu_execve_path && *qemu_execve_path) {
>> +                ret = get_errno(qemu_execve(p, argp, envp));
>> +            } else {
>> +                ret = get_errno(execve(p, argp, envp));
>> +            }
>> +
>
> what do you think of:
>
>     ret = qemu_execve(p, argp, envp);
>
> and in qemu_execve():
>
>     if (qemu_execve_path == NULL || *qemu_execve_path == 0) {
>         return get_errno(execve(p, argp, envp));
>     }
>
> so all the logic is in the function.

Sounds good, I'll include this in v2

Since I'm new to this style of contribution I have a couple of
questions. Is it ok that I deleted part of the patch for my reply to
code review, or should I have replied inline without deleting
anything? Should I send v2 after we resolve all the issues here or
should I send a v2 with proposed fixes but lacking the ones pending
replies?

Thanks,
Petros
Laurent Vivier Jan. 22, 2016, 10:33 a.m. UTC | #5
Le 22/01/2016 11:01, Petros Angelatos a écrit :
>>> diff --git a/linux-user/main.c b/linux-user/main.c
>>> index ee12035..5951279 100644
>>> --- a/linux-user/main.c
>>> +++ b/linux-user/main.c
>>> @@ -79,6 +79,7 @@ static void usage(int exitcode);
>>>
>>>  static const char *interp_prefix = CONFIG_QEMU_INTERP_PREFIX;
>>>  const char *qemu_uname_release;
>>> +const char *qemu_execve_path;
>>>
>>>  /* XXX: on x86 MAP_GROWSDOWN only works if ESP <= address + 32, so
>>>     we allocate a bigger stack. Need a better solution, for example
>>> @@ -3828,6 +3829,11 @@ static void handle_arg_guest_base(const char *arg)
>>>      have_guest_base = 1;
>>>  }
>>>
>>> +static void handle_arg_execve(const char *arg)
>>> +{
>>> +    qemu_execve_path = strdup(arg);
>>
>> I think you can use the parameter just as an on/off switch and
>> realpath(argv[0]) as qemu_execve_path.
>>
>> I don't see any reason to use other binary than the one in use.
> 
> This was my initial approach too, but argv[0] can be just the filename
> like "qemu-arm-static". And while I could add extra logic to look this
> up in the PATH, someone could run it from a completely different
> location. Then I looked for a way to get the path of the current
> executable but every platform has its own way of doing that and I
> didn't want to add all these cases.
> 
> https://stackoverflow.com/questions/1023306/finding-current-executables-path-without-proc-self-exe

linux-user works only on linux.
qemu uses glib-2.0, so you can use g_find_program_in_path().

>>> diff --git a/linux-user/syscall.c b/linux-user/syscall.c
>>> index 0cbace4..d0b5442 100644
>>> --- a/linux-user/syscall.c
>>> +++ b/linux-user/syscall.c
>>> @@ -5854,6 +5854,109 @@ static target_timer_t get_timer_id(abi_long arg)
>>>      return timerid;
>>>  }
>>>
>>> +#define BINPRM_BUF_SIZE 128
>>
>> This is defined in <linux/binfmts.h>
> 
> Got it, I'll add this header and remove the definition.
> 
>>
>>> +/* qemu_execve() Must return target values and target errnos. */
>>> +static abi_long qemu_execve(char *filename, char *argv[],
>>> +                  char *envp[])
>>> +{
>>> +    char *i_arg = NULL, *i_name = NULL;
>>> +    char **new_argp;
>>> +    int argc, fd, ret, i, offset = 3;
>>> +    char *cp;
>>> +    char buf[BINPRM_BUF_SIZE];
>>> +
>>> +    for (argc = 0; argv[argc] != NULL; argc++) {
>>> +        /* nothing */ ;
>>> +    }
>>> +
>>> +    fd = open(filename, O_RDONLY);
>>> +    if (fd == -1) {
>>> +        return -ENOENT;
>>
>> return -errno; ?
> 
> Will fix in v2
> 
>>> +    ret = read(fd, buf, BINPRM_BUF_SIZE);
>>> +    if (ret == -1) {
>>> +        close(fd);
>>> +        return -ENOENT;
>>
>> return -errno; ?
> 
> Will fix in v2
> 
>>> +    }
>>> +
>>> +    close(fd);
>>> +
>>> +    /* adapted from the kernel
>>> +     * https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/fs/binfmt_script.c
>>> +     */
>>> +    if ((buf[0] == '#') && (buf[1] == '!')) {
>>
>> what happens if read() < 2 ?
> 
> Hm, the easy option is for qemu_execve to return ENOEXEC or EIO.
> Otherwise I could retry the read N times? I'm not sure how to handle
> this if we don't want to return an error.

if we have less than 2 bytes, we can guess it is not executable...

>>> +    /* Copy the original arguments with offset */
>>> +    for (i = 0; i < argc; i++) {
>>> +        new_argp[i + offset] = argv[i];
>>> +    }
>>> +
>>> +    new_argp[0] = strdup(qemu_execve_path);
>>> +    new_argp[1] = strdup("-0");
>>> +    new_argp[offset] = filename;
>>> +    new_argp[argc + offset] = NULL;
>>> +
>>> +    if (i_name) {
>>> +        new_argp[2] = i_name;
>>> +        new_argp[3] = i_name;
>>> +
>>> +        if (i_arg) {
>>> +            new_argp[4] = i_arg;
>>> +        }
>>> +    } else {
>>> +        new_argp[2] = argv[0];
>>> +    }
>>> +
>>> +    return get_errno(execve(qemu_execve_path, new_argp, envp));
>>
>> duplicate get_errno() with the caller.
> 
> I'll add the logic proposed bellow in this function and remove the
> duplicate get_errno() from the caller.
> 
>>>  /* do_syscall() should always have a single exit point at the end so
>>>     that actions, such as logging of syscall results, can be performed.
>>>     All errnos that do_syscall() returns must be -TARGET_<errcode>. */
>>> @@ -6113,7 +6216,13 @@ abi_long do_syscall(void *cpu_env, int num, abi_long arg1,
>>>
>>>              if (!(p = lock_user_string(arg1)))
>>>                  goto execve_efault;
>>> -            ret = get_errno(execve(p, argp, envp));
>>> +
>>> +            if (qemu_execve_path && *qemu_execve_path) {
>>> +                ret = get_errno(qemu_execve(p, argp, envp));
>>> +            } else {
>>> +                ret = get_errno(execve(p, argp, envp));
>>> +            }
>>> +
>>
>> what do you think of:
>>
>>     ret = qemu_execve(p, argp, envp);
>>
>> and in qemu_execve():
>>
>>     if (qemu_execve_path == NULL || *qemu_execve_path == 0) {
>>         return get_errno(execve(p, argp, envp));
>>     }
>>
>> so all the logic is in the function.
> 
> Sounds good, I'll include this in v2
> 
> Since I'm new to this style of contribution I have a couple of

http://wiki.qemu.org/Contribute/SubmitAPatch

> questions. Is it ok that I deleted part of the patch for my reply to
> code review, or should I have replied inline without deleting

Generally, it's better to not delete parts. So, someone tacking the mail
thread at a moment can read the whole history in the last mail.

> anything? Should I send v2 after we resolve all the issues here or
> should I send a v2 with proposed fixes but lacking the ones pending
> replies?

Do as you want, but more you send versions, more you (can) have reviews.

BTW, I'm not the linux-user maintainer (Riku is), so I can just review
and comment, no more.

> 
> Thanks,
> Petros

Laurent
Peter Maydell Jan. 22, 2016, 10:47 a.m. UTC | #6
On 22 January 2016 at 10:33, Laurent Vivier <laurent@vivier.eu> wrote:
> Le 22/01/2016 11:01, Petros Angelatos a écrit :
>> This was my initial approach too, but argv[0] can be just the filename
>> like "qemu-arm-static". And while I could add extra logic to look this
>> up in the PATH, someone could run it from a completely different
>> location. Then I looked for a way to get the path of the current
>> executable but every platform has its own way of doing that and I
>> didn't want to add all these cases.
>>
>> https://stackoverflow.com/questions/1023306/finding-current-executables-path-without-proc-self-exe
>
> linux-user works only on linux.
> qemu uses glib-2.0, so you can use g_find_program_in_path().

If QEMU was started via execle() to set the environment of the
executed process and that specified environment has a different
PATH, then g_find_program_in_path() will give the wrong answer.
Using AT_EXECFN (perhaps with a fallback to /proc/self/exe) seems
like a better approach to me.

>> questions. Is it ok that I deleted part of the patch for my reply to
>> code review, or should I have replied inline without deleting
>
> Generally, it's better to not delete parts. So, someone tacking the mail
> thread at a moment can read the whole history in the last mail.

I tend to happily delete parts and assume that readers have
access to the thread (via the archive or in their mail readers).
Not deleting bits makes it hard to read replies if there's
a conversation about a small part of a large patch.

thanks
-- PMM
Laurent Vivier Jan. 22, 2016, 11 a.m. UTC | #7
Le 22/01/2016 11:47, Peter Maydell a écrit :
> On 22 January 2016 at 10:33, Laurent Vivier <laurent@vivier.eu> wrote:
>> Le 22/01/2016 11:01, Petros Angelatos a écrit :
>>> This was my initial approach too, but argv[0] can be just the filename
>>> like "qemu-arm-static". And while I could add extra logic to look this
>>> up in the PATH, someone could run it from a completely different
>>> location. Then I looked for a way to get the path of the current
>>> executable but every platform has its own way of doing that and I
>>> didn't want to add all these cases.
>>>
>>> https://stackoverflow.com/questions/1023306/finding-current-executables-path-without-proc-self-exe
>>
>> linux-user works only on linux.
>> qemu uses glib-2.0, so you can use g_find_program_in_path().
> 
> If QEMU was started via execle() to set the environment of the
> executed process and that specified environment has a different
> PATH, then g_find_program_in_path() will give the wrong answer.
> Using AT_EXECFN (perhaps with a fallback to /proc/self/exe) seems
> like a better approach to me.

I agree, you can use getauxval(AT_EXECFN).

>>> questions. Is it ok that I deleted part of the patch for my reply to
>>> code review, or should I have replied inline without deleting
>>
>> Generally, it's better to not delete parts. So, someone tacking the mail
>> thread at a moment can read the whole history in the last mail.
> 
> I tend to happily delete parts and assume that readers have
> access to the thread (via the archive or in their mail readers).
> Not deleting bits makes it hard to read replies if there's
> a conversation about a small part of a large patch.

Yes, I do that also... :)

Laurent
Petros Angelatos Jan. 27, 2016, 8:50 a.m. UTC | #8
All raised issues fixed, I just sent v2 :)

On Fri, Jan 22, 2016 at 3:00 AM, Laurent Vivier <laurent@vivier.eu> wrote:
>
>
> Le 22/01/2016 11:47, Peter Maydell a écrit :
>> On 22 January 2016 at 10:33, Laurent Vivier <laurent@vivier.eu> wrote:
>>> Le 22/01/2016 11:01, Petros Angelatos a écrit :
>>>> This was my initial approach too, but argv[0] can be just the filename
>>>> like "qemu-arm-static". And while I could add extra logic to look this
>>>> up in the PATH, someone could run it from a completely different
>>>> location. Then I looked for a way to get the path of the current
>>>> executable but every platform has its own way of doing that and I
>>>> didn't want to add all these cases.
>>>>
>>>> https://stackoverflow.com/questions/1023306/finding-current-executables-path-without-proc-self-exe
>>>
>>> linux-user works only on linux.
>>> qemu uses glib-2.0, so you can use g_find_program_in_path().
>>
>> If QEMU was started via execle() to set the environment of the
>> executed process and that specified environment has a different
>> PATH, then g_find_program_in_path() will give the wrong answer.
>> Using AT_EXECFN (perhaps with a fallback to /proc/self/exe) seems
>> like a better approach to me.
>
> I agree, you can use getauxval(AT_EXECFN).
>
>>>> questions. Is it ok that I deleted part of the patch for my reply to
>>>> code review, or should I have replied inline without deleting
>>>
>>> Generally, it's better to not delete parts. So, someone tacking the mail
>>> thread at a moment can read the whole history in the last mail.
>>
>> I tend to happily delete parts and assume that readers have
>> access to the thread (via the archive or in their mail readers).
>> Not deleting bits makes it hard to read replies if there's
>> a conversation about a small part of a large patch.
>
> Yes, I do that also... :)
>
> Laurent
>
diff mbox

Patch

diff --git a/linux-user/main.c b/linux-user/main.c
index ee12035..5951279 100644
--- a/linux-user/main.c
+++ b/linux-user/main.c
@@ -79,6 +79,7 @@  static void usage(int exitcode);
 
 static const char *interp_prefix = CONFIG_QEMU_INTERP_PREFIX;
 const char *qemu_uname_release;
+const char *qemu_execve_path;
 
 /* XXX: on x86 MAP_GROWSDOWN only works if ESP <= address + 32, so
    we allocate a bigger stack. Need a better solution, for example
@@ -3828,6 +3829,11 @@  static void handle_arg_guest_base(const char *arg)
     have_guest_base = 1;
 }
 
+static void handle_arg_execve(const char *arg)
+{
+    qemu_execve_path = strdup(arg);
+}
+
 static void handle_arg_reserved_va(const char *arg)
 {
     char *p;
@@ -3913,6 +3919,8 @@  static const struct qemu_argument arg_table[] = {
      "uname",      "set qemu uname release string to 'uname'"},
     {"B",          "QEMU_GUEST_BASE",  true,  handle_arg_guest_base,
      "address",    "set guest_base address to 'address'"},
+    {"execve",     "QEMU_EXECVE",      true,   handle_arg_execve,
+     "path",       "use interpreter at 'path' when a process calls execve()"},
     {"R",          "QEMU_RESERVED_VA", true,  handle_arg_reserved_va,
      "size",       "reserve 'size' bytes for guest virtual address space"},
     {"d",          "QEMU_LOG",         true,  handle_arg_log,
diff --git a/linux-user/qemu.h b/linux-user/qemu.h
index bd90cc3..0d9b058 100644
--- a/linux-user/qemu.h
+++ b/linux-user/qemu.h
@@ -140,6 +140,7 @@  void init_task_state(TaskState *ts);
 void task_settid(TaskState *);
 void stop_all_tasks(void);
 extern const char *qemu_uname_release;
+extern const char *qemu_execve_path;
 extern unsigned long mmap_min_addr;
 
 /* ??? See if we can avoid exposing so much of the loader internals.  */
diff --git a/linux-user/syscall.c b/linux-user/syscall.c
index 0cbace4..d0b5442 100644
--- a/linux-user/syscall.c
+++ b/linux-user/syscall.c
@@ -5854,6 +5854,109 @@  static target_timer_t get_timer_id(abi_long arg)
     return timerid;
 }
 
+#define BINPRM_BUF_SIZE 128
+
+/* qemu_execve() Must return target values and target errnos. */
+static abi_long qemu_execve(char *filename, char *argv[],
+                  char *envp[])
+{
+    char *i_arg = NULL, *i_name = NULL;
+    char **new_argp;
+    int argc, fd, ret, i, offset = 3;
+    char *cp;
+    char buf[BINPRM_BUF_SIZE];
+
+    for (argc = 0; argv[argc] != NULL; argc++) {
+        /* nothing */ ;
+    }
+
+    fd = open(filename, O_RDONLY);
+    if (fd == -1) {
+        return -ENOENT;
+    }
+
+    ret = read(fd, buf, BINPRM_BUF_SIZE);
+    if (ret == -1) {
+        close(fd);
+        return -ENOENT;
+    }
+
+    close(fd);
+
+    /* adapted from the kernel
+     * https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/fs/binfmt_script.c
+     */
+    if ((buf[0] == '#') && (buf[1] == '!')) {
+        /*
+         * This section does the #! interpretation.
+         * Sorta complicated, but hopefully it will work.  -TYT
+         */
+
+        buf[BINPRM_BUF_SIZE - 1] = '\0';
+        cp = strchr(buf, '\n');
+        if (cp == NULL) {
+            cp = buf+BINPRM_BUF_SIZE-1;
+        }
+        *cp = '\0';
+        while (cp > buf) {
+            cp--;
+            if ((*cp == ' ') || (*cp == '\t')) {
+                *cp = '\0';
+            } else {
+                break;
+            }
+        }
+        for (cp = buf+2; (*cp == ' ') || (*cp == '\t'); cp++) {
+            /* nothing */ ;
+        }
+        if (*cp == '\0') {
+            return -ENOEXEC; /* No interpreter name found */
+        }
+        i_name = cp;
+        i_arg = NULL;
+        for ( ; *cp && (*cp != ' ') && (*cp != '\t'); cp++) {
+            /* nothing */ ;
+        }
+        while ((*cp == ' ') || (*cp == '\t')) {
+            *cp++ = '\0';
+        }
+        if (*cp) {
+            i_arg = cp;
+        }
+
+        if (i_arg) {
+            offset = 5;
+        } else {
+            offset = 4;
+        }
+    }
+
+    new_argp = alloca((argc + offset + 1) * sizeof(void *));
+
+    /* Copy the original arguments with offset */
+    for (i = 0; i < argc; i++) {
+        new_argp[i + offset] = argv[i];
+    }
+
+    new_argp[0] = strdup(qemu_execve_path);
+    new_argp[1] = strdup("-0");
+    new_argp[offset] = filename;
+    new_argp[argc + offset] = NULL;
+
+    if (i_name) {
+        new_argp[2] = i_name;
+        new_argp[3] = i_name;
+
+        if (i_arg) {
+            new_argp[4] = i_arg;
+        }
+    } else {
+        new_argp[2] = argv[0];
+    }
+
+    return get_errno(execve(qemu_execve_path, new_argp, envp));
+}
+
 /* do_syscall() should always have a single exit point at the end so
    that actions, such as logging of syscall results, can be performed.
    All errnos that do_syscall() returns must be -TARGET_<errcode>. */
@@ -6113,7 +6216,13 @@  abi_long do_syscall(void *cpu_env, int num, abi_long arg1,
 
             if (!(p = lock_user_string(arg1)))
                 goto execve_efault;
-            ret = get_errno(execve(p, argp, envp));
+
+            if (qemu_execve_path && *qemu_execve_path) {
+                ret = get_errno(qemu_execve(p, argp, envp));
+            } else {
+                ret = get_errno(execve(p, argp, envp));
+            }
+
             unlock_user(p, arg1, 0);
 
             goto execve_end;