Message ID | 20241030141037.375897-1-goldstein.w.n@gmail.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | linux-user: Add option to run `execve`d programs through QEMU | expand |
On Wed, Oct 30, 2024 at 9:10 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > The new option '-qemu-children' makes it so that on `execve` the child > process will be launch by the same `qemu` executable that is currently > running along with its current commandline arguments. > > The motivation for the change is to make it so that plugins running > through `qemu` can continue to run on children. Why not just > `binfmt`?: Plugins can be desirable regardless of system/architecture > emulation, and can sometimes be useful for elf files that can run > natively. Enabling `binfmt` for all natively runnable elf files may > not be desirable. > > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> > --- > linux-user/main.c | 21 ++++++ > linux-user/syscall.c | 21 ++++-- > linux-user/user-internals.h | 4 ++ > tests/tcg/multiarch/Makefile.target | 8 +++ > .../linux/linux-execve-qemu-children.c | 68 +++++++++++++++++++ > 5 files changed, 117 insertions(+), 5 deletions(-) > create mode 100644 tests/tcg/multiarch/linux/linux-execve-qemu-children.c > > diff --git a/linux-user/main.c b/linux-user/main.c > index 8143a0d4b0..5e3d41dc2b 100644 > --- a/linux-user/main.c > +++ b/linux-user/main.c > @@ -81,6 +81,10 @@ unsigned long mmap_min_addr; > uintptr_t guest_base; > bool have_guest_base; > > +bool qemu_dup_for_children; > +int qemu_argc; > +char **qemu_argv; > + > /* > * Used to implement backwards-compatibility for the `-strace`, and > * QEMU_STRACE options. Without this, the QEMU_LOG can be overwritten by > @@ -451,6 +455,11 @@ static void handle_arg_jitdump(const char *arg) > perf_enable_jitdump(); > } > > +static void handle_arg_qemu_children(const char *arg) > +{ > + qemu_dup_for_children = true; > +} > + > static QemuPluginList plugins = QTAILQ_HEAD_INITIALIZER(plugins); > > #ifdef CONFIG_PLUGIN > @@ -526,6 +535,10 @@ static const struct qemu_argument arg_table[] = { > "", "Generate a /tmp/perf-${pid}.map file for perf"}, > {"jitdump", "QEMU_JITDUMP", false, handle_arg_jitdump, > "", "Generate a jit-${pid}.dump file for perf"}, > + {"qemu-children", > + "QEMU_CHILDREN", false, handle_arg_qemu_children, > + "", "Run child processes (created with execve) with qemu " > + "(as instantiated for the parent)"}, > {NULL, NULL, false, NULL, NULL, NULL} > }; > > @@ -729,6 +742,14 @@ int main(int argc, char **argv, char **envp) > > optind = parse_args(argc, argv); > > + if (qemu_dup_for_children) { > + qemu_argc = optind; > + qemu_argv = g_new0(char *, qemu_argc); > + for (i = 0; i < optind; ++i) { > + qemu_argv[i] = strdup(argv[i]); > + } > + } > + > qemu_set_log_filename_flags(last_log_filename, > last_log_mask | (enable_strace * LOG_STRACE), > &error_fatal); > diff --git a/linux-user/syscall.c b/linux-user/syscall.c > index 59b2080b98..96b105e9ce 100644 > --- a/linux-user/syscall.c > +++ b/linux-user/syscall.c > @@ -8550,13 +8550,14 @@ static int do_execv(CPUArchState *cpu_env, int dirfd, > abi_long pathname, abi_long guest_argp, > abi_long guest_envp, int flags, bool is_execveat) > { > - int ret; > + int ret, argp_offset; > char **argp, **envp; > int argc, envc; > abi_ulong gp; > abi_ulong addr; > char **q; > void *p; > + bool through_qemu = dirfd == AT_FDCWD && qemu_dup_for_children; > > argc = 0; > > @@ -8580,10 +8581,12 @@ static int do_execv(CPUArchState *cpu_env, int dirfd, > envc++; > } > > - argp = g_new0(char *, argc + 1); > + argp_offset = through_qemu ? qemu_argc : 0; > + argp = g_new0(char *, argc + argp_offset + 1); > envp = g_new0(char *, envc + 1); > > - for (gp = guest_argp, q = argp; gp; gp += sizeof(abi_ulong), q++) { > + for (gp = guest_argp, q = argp + argp_offset; > + gp; gp += sizeof(abi_ulong), q++) { > if (get_user_ual(addr, gp)) { > goto execve_efault; > } > @@ -8628,9 +8631,16 @@ static int do_execv(CPUArchState *cpu_env, int dirfd, > } > > const char *exe = p; > - if (is_proc_myself(p, "exe")) { > + if (through_qemu) { > + int i; > + for (i = 0; i < argp_offset; ++i) { > + argp[i] = qemu_argv[i]; > + } > + exe = qemu_argv[0]; > + } else if (is_proc_myself(p, "exe")) { > exe = exec_path; > } > + > ret = is_execveat > ? safe_execveat(dirfd, exe, argp, envp, flags) > : safe_execve(exe, argp, envp); > @@ -8644,7 +8654,8 @@ execve_efault: > ret = -TARGET_EFAULT; > > execve_end: > - for (gp = guest_argp, q = argp; *q; gp += sizeof(abi_ulong), q++) { > + for (gp = guest_argp, q = argp + argp_offset; > + *q; gp += sizeof(abi_ulong), q++) { > if (get_user_ual(addr, gp) || !addr) { > break; > } > diff --git a/linux-user/user-internals.h b/linux-user/user-internals.h > index 46ffc093f4..ed3ed666a0 100644 > --- a/linux-user/user-internals.h > +++ b/linux-user/user-internals.h > @@ -30,6 +30,10 @@ void stop_all_tasks(void); > extern const char *qemu_uname_release; > extern unsigned long mmap_min_addr; > > +extern bool qemu_dup_for_children; > +extern int qemu_argc; > +extern char **qemu_argv; > + > typedef struct IOCTLEntry IOCTLEntry; > > typedef abi_long do_ioctl_fn(const IOCTLEntry *ie, uint8_t *buf_temp, > diff --git a/tests/tcg/multiarch/Makefile.target b/tests/tcg/multiarch/Makefile.target > index 78b83d5575..0e220953e7 100644 > --- a/tests/tcg/multiarch/Makefile.target > +++ b/tests/tcg/multiarch/Makefile.target > @@ -30,6 +30,14 @@ run-float_%: float_% > $(call conditional-diff-out,$<,$(SRC_PATH)/tests/tcg/$(TARGET_NAME)/$<.ref) > > > +run-linux-execve-qemu-children: linux-execve-qemu-children > + $(call run-test,$<, $(QEMU) $(QEMU_OPTS) -qemu-children $< $(QEMU) 0) > + $(call run-test,$<, $(QEMU) $(QEMU_OPTS) $< linux-execve 0 skip) > + > +run-plugin-linux-execve-qemu-children-with-%: linux-execve-qemu-children > + $(call run-test,$<, $(QEMU) $(QEMU_OPTS) -qemu-children $< $(QEMU) 0) > + $(call run-test,$<, $(QEMU) $(QEMU_OPTS) $< linux-execve 0 skip) > + > testthread: LDFLAGS+=-lpthread > > threadcount: LDFLAGS+=-lpthread > diff --git a/tests/tcg/multiarch/linux/linux-execve-qemu-children.c b/tests/tcg/multiarch/linux/linux-execve-qemu-children.c > new file mode 100644 > index 0000000000..60d6537666 > --- /dev/null > +++ b/tests/tcg/multiarch/linux/linux-execve-qemu-children.c > @@ -0,0 +1,68 @@ > +#include <assert.h> > +#include <errno.h> > +#include <fcntl.h> > +#include <libgen.h> > +#include <malloc.h> > +#include <stdio.h> > +#include <string.h> > +#include <unistd.h> > + > +#define MAX_COMM_SIZE (4096) > + > +int > +main(int argc, char ** argv, char ** envp) { > + int fd; > + char next_arg[2]; > + char * buf; > + ssize_t off; > + const char * expec_comm; > + assert(argc == 3 || argc == 4); > + fd = open("/proc/self/comm", O_RDONLY); > + assert(fd > 0); > + > + buf = calloc(MAX_COMM_SIZE + 1, 1); > + assert(buf != NULL); > + > + off = 0; > + for (;;) { > + ssize_t res = read(fd, buf + off, 1); > + if (res < 0 && errno != EAGAIN) { > + perror("Failed to read comm"); > + return -1; > + } > + if (res == 0) { > + break; > + } > + > + off += res; > + > + if (off >= MAX_COMM_SIZE) { > + fprintf(stderr, "/proc/self/comm too large for test\n"); > + return -1; > + } > + } > + assert(off && buf[off] == '\0' && buf[off - 1] == '\n'); > + buf[off - 1] = '\0'; > + expec_comm = basename(argv[1]); > + if (argc == 3 && strncmp(buf, expec_comm, strlen(expec_comm))) { > + fprintf(stderr, > + "Didn't propagate qemu settings\nComm: '%s'\nExpec: '%s'\n", > + buf, expec_comm); > + return -1; > + } > + free(buf); > + next_arg[0] = argv[2][0]; > + next_arg[1] = '\0'; > + if (next_arg[0] == '9') { > + return 0; > + } > + next_arg[0] += 1; > + char * next_args[] = { argv[0], argv[1], next_arg, NULL }; > + int eres = execve(argv[0], &next_args[0], envp); > + if (eres != 0) { > + fprintf(stderr, "Unable to execve: %d/%d -> %s\n", eres, errno, > + strerror(errno)); > + return -1; > + } > + return 0; > +} > -- > 2.43.0 > Added test that tests both old behavior (no propagation of qemu) and new behavior (propagation of qemu + cmdline). Tested on Aarch64 + Linux with: ``` make check-tcg ```
On 10/30/24 14:10, Noah Goldstein wrote: > The new option '-qemu-children' makes it so that on `execve` the child > process will be launch by the same `qemu` executable that is currently > running along with its current commandline arguments. > > The motivation for the change is to make it so that plugins running > through `qemu` can continue to run on children. Why not just > `binfmt`?: Plugins can be desirable regardless of system/architecture > emulation, and can sometimes be useful for elf files that can run > natively. Enabling `binfmt` for all natively runnable elf files may > not be desirable. > > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> > --- > linux-user/main.c | 21 ++++++ > linux-user/syscall.c | 21 ++++-- > linux-user/user-internals.h | 4 ++ > tests/tcg/multiarch/Makefile.target | 8 +++ > .../linux/linux-execve-qemu-children.c | 68 +++++++++++++++++++ > 5 files changed, 117 insertions(+), 5 deletions(-) > create mode 100644 tests/tcg/multiarch/linux/linux-execve-qemu-children.c > > diff --git a/linux-user/main.c b/linux-user/main.c > index 8143a0d4b0..5e3d41dc2b 100644 > --- a/linux-user/main.c > +++ b/linux-user/main.c > @@ -81,6 +81,10 @@ unsigned long mmap_min_addr; > uintptr_t guest_base; > bool have_guest_base; > > +bool qemu_dup_for_children; > +int qemu_argc; > +char **qemu_argv; > + > /* > * Used to implement backwards-compatibility for the `-strace`, and > * QEMU_STRACE options. Without this, the QEMU_LOG can be overwritten by > @@ -451,6 +455,11 @@ static void handle_arg_jitdump(const char *arg) > perf_enable_jitdump(); > } > > +static void handle_arg_qemu_children(const char *arg) > +{ > + qemu_dup_for_children = true; > +} > + > static QemuPluginList plugins = QTAILQ_HEAD_INITIALIZER(plugins); > > #ifdef CONFIG_PLUGIN > @@ -526,6 +535,10 @@ static const struct qemu_argument arg_table[] = { > "", "Generate a /tmp/perf-${pid}.map file for perf"}, > {"jitdump", "QEMU_JITDUMP", false, handle_arg_jitdump, > "", "Generate a jit-${pid}.dump file for perf"}, > + {"qemu-children", > + "QEMU_CHILDREN", false, handle_arg_qemu_children, > + "", "Run child processes (created with execve) with qemu " > + "(as instantiated for the parent)"}, > {NULL, NULL, false, NULL, NULL, NULL} > }; > > @@ -729,6 +742,14 @@ int main(int argc, char **argv, char **envp) > > optind = parse_args(argc, argv); > > + if (qemu_dup_for_children) { > + qemu_argc = optind; > + qemu_argv = g_new0(char *, qemu_argc); > + for (i = 0; i < optind; ++i) { > + qemu_argv[i] = strdup(argv[i]); g_strdup. > + bool through_qemu = dirfd == AT_FDCWD && qemu_dup_for_children; Why is this limited to AT_FDCWD? Why not for execvat too? > @@ -8628,9 +8631,16 @@ static int do_execv(CPUArchState *cpu_env, int dirfd, > } > > const char *exe = p; > - if (is_proc_myself(p, "exe")) { > + if (through_qemu) { > + int i; > + for (i = 0; i < argp_offset; ++i) { > + argp[i] = qemu_argv[i]; > + } > + exe = qemu_argv[0]; > + } else if (is_proc_myself(p, "exe")) { > exe = exec_path; > } > + You still need to handle is_proc_myself, for the guest binary. I wonder if those two cases are related. Do we need to also add an argument so that we can pass the executable to the next qemu via file descriptor? I.e. execvat becomes f = openat() execv(qemu, "-execfd", f) and is_proc_myself uses execfd, which we already have open. r~
On Tue, Nov 5, 2024 at 5:37 AM Richard Henderson <richard.henderson@linaro.org> wrote: > > On 10/30/24 14:10, Noah Goldstein wrote: > > The new option '-qemu-children' makes it so that on `execve` the child > > process will be launch by the same `qemu` executable that is currently > > running along with its current commandline arguments. > > > > The motivation for the change is to make it so that plugins running > > through `qemu` can continue to run on children. Why not just > > `binfmt`?: Plugins can be desirable regardless of system/architecture > > emulation, and can sometimes be useful for elf files that can run > > natively. Enabling `binfmt` for all natively runnable elf files may > > not be desirable. > > > > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> > > --- > > linux-user/main.c | 21 ++++++ > > linux-user/syscall.c | 21 ++++-- > > linux-user/user-internals.h | 4 ++ > > tests/tcg/multiarch/Makefile.target | 8 +++ > > .../linux/linux-execve-qemu-children.c | 68 +++++++++++++++++++ > > 5 files changed, 117 insertions(+), 5 deletions(-) > > create mode 100644 tests/tcg/multiarch/linux/linux-execve-qemu-children.c > > > > diff --git a/linux-user/main.c b/linux-user/main.c > > index 8143a0d4b0..5e3d41dc2b 100644 > > --- a/linux-user/main.c > > +++ b/linux-user/main.c > > @@ -81,6 +81,10 @@ unsigned long mmap_min_addr; > > uintptr_t guest_base; > > bool have_guest_base; > > > > +bool qemu_dup_for_children; > > +int qemu_argc; > > +char **qemu_argv; > > + > > /* > > * Used to implement backwards-compatibility for the `-strace`, and > > * QEMU_STRACE options. Without this, the QEMU_LOG can be overwritten by > > @@ -451,6 +455,11 @@ static void handle_arg_jitdump(const char *arg) > > perf_enable_jitdump(); > > } > > > > +static void handle_arg_qemu_children(const char *arg) > > +{ > > + qemu_dup_for_children = true; > > +} > > + > > static QemuPluginList plugins = QTAILQ_HEAD_INITIALIZER(plugins); > > > > #ifdef CONFIG_PLUGIN > > @@ -526,6 +535,10 @@ static const struct qemu_argument arg_table[] = { > > "", "Generate a /tmp/perf-${pid}.map file for perf"}, > > {"jitdump", "QEMU_JITDUMP", false, handle_arg_jitdump, > > "", "Generate a jit-${pid}.dump file for perf"}, > > + {"qemu-children", > > + "QEMU_CHILDREN", false, handle_arg_qemu_children, > > + "", "Run child processes (created with execve) with qemu " > > + "(as instantiated for the parent)"}, > > {NULL, NULL, false, NULL, NULL, NULL} > > }; > > > > @@ -729,6 +742,14 @@ int main(int argc, char **argv, char **envp) > > > > optind = parse_args(argc, argv); > > > > + if (qemu_dup_for_children) { > > + qemu_argc = optind; > > + qemu_argv = g_new0(char *, qemu_argc); > > + for (i = 0; i < optind; ++i) { > > + qemu_argv[i] = strdup(argv[i]); > > g_strdup. ack > > > + bool through_qemu = dirfd == AT_FDCWD && qemu_dup_for_children; > > Why is this limited to AT_FDCWD? Why not for execvat too? > We could, initially it was because AFAICT qemu doesn't support executing a program relative to another dir, but it would be simple enough to just join the relative program path and path dirfd points to. Want me to add support? > > @@ -8628,9 +8631,16 @@ static int do_execv(CPUArchState *cpu_env, int dirfd, > > } > > > > const char *exe = p; > > - if (is_proc_myself(p, "exe")) { > > + if (through_qemu) { > > + int i; > > + for (i = 0; i < argp_offset; ++i) { > > + argp[i] = qemu_argv[i]; > > + } > > + exe = qemu_argv[0]; > > + } else if (is_proc_myself(p, "exe")) { > > exe = exec_path; > > } > > + > > You still need to handle is_proc_myself, for the guest binary. > > I wonder if those two cases are related. Do we need to also add an argument so that we > can pass the executable to the next qemu via file descriptor? I.e. execvat becomes > > f = openat() > execv(qemu, "-execfd", f) > > and is_proc_myself uses execfd, which we already have open. > > > r~
On Tue, Nov 5, 2024 at 5:48 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > On Tue, Nov 5, 2024 at 5:37 AM Richard Henderson > <richard.henderson@linaro.org> wrote: > > > > On 10/30/24 14:10, Noah Goldstein wrote: > > > The new option '-qemu-children' makes it so that on `execve` the child > > > process will be launch by the same `qemu` executable that is currently > > > running along with its current commandline arguments. > > > > > > The motivation for the change is to make it so that plugins running > > > through `qemu` can continue to run on children. Why not just > > > `binfmt`?: Plugins can be desirable regardless of system/architecture > > > emulation, and can sometimes be useful for elf files that can run > > > natively. Enabling `binfmt` for all natively runnable elf files may > > > not be desirable. > > > > > > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> > > > --- > > > linux-user/main.c | 21 ++++++ > > > linux-user/syscall.c | 21 ++++-- > > > linux-user/user-internals.h | 4 ++ > > > tests/tcg/multiarch/Makefile.target | 8 +++ > > > .../linux/linux-execve-qemu-children.c | 68 +++++++++++++++++++ > > > 5 files changed, 117 insertions(+), 5 deletions(-) > > > create mode 100644 tests/tcg/multiarch/linux/linux-execve-qemu-children.c > > > > > > diff --git a/linux-user/main.c b/linux-user/main.c > > > index 8143a0d4b0..5e3d41dc2b 100644 > > > --- a/linux-user/main.c > > > +++ b/linux-user/main.c > > > @@ -81,6 +81,10 @@ unsigned long mmap_min_addr; > > > uintptr_t guest_base; > > > bool have_guest_base; > > > > > > +bool qemu_dup_for_children; > > > +int qemu_argc; > > > +char **qemu_argv; > > > + > > > /* > > > * Used to implement backwards-compatibility for the `-strace`, and > > > * QEMU_STRACE options. Without this, the QEMU_LOG can be overwritten by > > > @@ -451,6 +455,11 @@ static void handle_arg_jitdump(const char *arg) > > > perf_enable_jitdump(); > > > } > > > > > > +static void handle_arg_qemu_children(const char *arg) > > > +{ > > > + qemu_dup_for_children = true; > > > +} > > > + > > > static QemuPluginList plugins = QTAILQ_HEAD_INITIALIZER(plugins); > > > > > > #ifdef CONFIG_PLUGIN > > > @@ -526,6 +535,10 @@ static const struct qemu_argument arg_table[] = { > > > "", "Generate a /tmp/perf-${pid}.map file for perf"}, > > > {"jitdump", "QEMU_JITDUMP", false, handle_arg_jitdump, > > > "", "Generate a jit-${pid}.dump file for perf"}, > > > + {"qemu-children", > > > + "QEMU_CHILDREN", false, handle_arg_qemu_children, > > > + "", "Run child processes (created with execve) with qemu " > > > + "(as instantiated for the parent)"}, > > > {NULL, NULL, false, NULL, NULL, NULL} > > > }; > > > > > > @@ -729,6 +742,14 @@ int main(int argc, char **argv, char **envp) > > > > > > optind = parse_args(argc, argv); > > > > > > + if (qemu_dup_for_children) { > > > + qemu_argc = optind; > > > + qemu_argv = g_new0(char *, qemu_argc); > > > + for (i = 0; i < optind; ++i) { > > > + qemu_argv[i] = strdup(argv[i]); > > > > g_strdup. > ack > > > > > + bool through_qemu = dirfd == AT_FDCWD && qemu_dup_for_children; > > > > Why is this limited to AT_FDCWD? Why not for execvat too? > > > > We could, initially it was because AFAICT qemu doesn't support executing a > program relative to another dir, but it would be simple enough to just join > the relative program path and path dirfd points to. > > Want me to add support? > > > @@ -8628,9 +8631,16 @@ static int do_execv(CPUArchState *cpu_env, int dirfd, > > > } > > > > > > const char *exe = p; > > > - if (is_proc_myself(p, "exe")) { > > > + if (through_qemu) { > > > + int i; > > > + for (i = 0; i < argp_offset; ++i) { > > > + argp[i] = qemu_argv[i]; > > > + } > > > + exe = qemu_argv[0]; > > > + } else if (is_proc_myself(p, "exe")) { > > > exe = exec_path; > > > } > > > + > > > > You still need to handle is_proc_myself, for the guest binary. Would this by handled by basically do: ``` if (is_proc_myself(p, "exe")) { exe = exec_path; if (through_qemu) argp[argp_offset] = exec_path; } ``` Or am I missing something? > > > > I wonder if those two cases are related. Do we need to also add an argument so that we > > can pass the executable to the next qemu via file descriptor? I.e. execvat becomes > > > > f = openat() > > execv(qemu, "-execfd", f) > > > > and is_proc_myself uses execfd, which we already have open. How does passing a fd from one process to another work? > > > > > > r~
On 11/5/24 23:54, Noah Goldstein wrote: >>> You still need to handle is_proc_myself, for the guest binary. > > Would this by handled by basically do: > > ``` > if (is_proc_myself(p, "exe")) { > exe = exec_path; > if (through_qemu) > argp[argp_offset] = exec_path; > } > ``` > Or am I missing something? Something like that, yes. >>> I wonder if those two cases are related. Do we need to also add an argument so that we >>> can pass the executable to the next qemu via file descriptor? I.e. execvat becomes >>> >>> f = openat() >>> execv(qemu, "-execfd", f) >>> >>> and is_proc_myself uses execfd, which we already have open. > > How does passing a fd from one process to another work? As long as the fd is not marked O_CLOEXEC, it stays open in the new process. Providing the number via command-line, or whatever, is sufficient for the new process to know what is going on. I now realize this is necessary for the AT_EMPTY_PATH flag, where we only have the file descriptor. r~
On Wed, Nov 6, 2024 at 3:38 AM Richard Henderson <richard.henderson@linaro.org> wrote: > > On 11/5/24 23:54, Noah Goldstein wrote: > >>> You still need to handle is_proc_myself, for the guest binary. > > > > Would this by handled by basically do: > > > > ``` > > if (is_proc_myself(p, "exe")) { > > exe = exec_path; > > if (through_qemu) > > argp[argp_offset] = exec_path; > > } > > ``` > > Or am I missing something? > > Something like that, yes. > > >>> I wonder if those two cases are related. Do we need to also add an argument so that we > >>> can pass the executable to the next qemu via file descriptor? I.e. execvat becomes > >>> > >>> f = openat() > >>> execv(qemu, "-execfd", f) > >>> > >>> and is_proc_myself uses execfd, which we already have open. > > > > How does passing a fd from one process to another work? > As long as the fd is not marked O_CLOEXEC, it stays open in the new process. Providing > the number via command-line, or whatever, is sufficient for the new process to know what > is going on. Err I guess I was thinking its a bit weird having an option that is only really applicable if qemu is a child process. I.e the `-execfd` argument is not usable from commandline. > > I now realize this is necessary for the AT_EMPTY_PATH flag, where we only have the file > descriptor. We could also do something along the lines of: ``` fd = openat(dirfd, exe); char new_exe[PATH_MAX]; char fd_path[PATH_MAX]; sprintf(fd_path, "/proc/self/fd/%d", fd); readlink(fd_path, new_exe, PATH_MAX); exe = new_exe; ``` > > > r~
On 11/6/24 17:03, Noah Goldstein wrote: > On Wed, Nov 6, 2024 at 3:38 AM Richard Henderson > <richard.henderson@linaro.org> wrote: >> >> On 11/5/24 23:54, Noah Goldstein wrote: >>>>> You still need to handle is_proc_myself, for the guest binary. >>> >>> Would this by handled by basically do: >>> >>> ``` >>> if (is_proc_myself(p, "exe")) { >>> exe = exec_path; >>> if (through_qemu) >>> argp[argp_offset] = exec_path; >>> } >>> ``` >>> Or am I missing something? >> >> Something like that, yes. >> >>>>> I wonder if those two cases are related. Do we need to also add an argument so that we >>>>> can pass the executable to the next qemu via file descriptor? I.e. execvat becomes >>>>> >>>>> f = openat() >>>>> execv(qemu, "-execfd", f) >>>>> >>>>> and is_proc_myself uses execfd, which we already have open. >>> >>> How does passing a fd from one process to another work? >> As long as the fd is not marked O_CLOEXEC, it stays open in the new process. Providing >> the number via command-line, or whatever, is sufficient for the new process to know what >> is going on. > > Err I guess I was thinking its a bit weird having an option that is > only really applicable > if qemu is a child process. I.e the `-execfd` argument is not usable > from commandline. qemu-foo -execfd 3 3< /some/file Or perhaps opened via other tooling. >> I now realize this is necessary for the AT_EMPTY_PATH flag, where we only have the file >> descriptor. > > We could also do something along the lines of: > > ``` > fd = openat(dirfd, exe); > char new_exe[PATH_MAX]; > char fd_path[PATH_MAX]; > sprintf(fd_path, "/proc/self/fd/%d", fd); > readlink(fd_path, new_exe, PATH_MAX); Reading the link doesn't always work. Reading or passing the link means AT_SYMLINK_NOFOLLOW isn't honored. r~
On Wed, Nov 6, 2024 at 11:26 AM Richard Henderson <richard.henderson@linaro.org> wrote: > > On 11/6/24 17:03, Noah Goldstein wrote: > > On Wed, Nov 6, 2024 at 3:38 AM Richard Henderson > > <richard.henderson@linaro.org> wrote: > >> > >> On 11/5/24 23:54, Noah Goldstein wrote: > >>>>> You still need to handle is_proc_myself, for the guest binary. > >>> > >>> Would this by handled by basically do: > >>> > >>> ``` > >>> if (is_proc_myself(p, "exe")) { > >>> exe = exec_path; > >>> if (through_qemu) > >>> argp[argp_offset] = exec_path; > >>> } > >>> ``` > >>> Or am I missing something? > >> > >> Something like that, yes. > >> > >>>>> I wonder if those two cases are related. Do we need to also add an argument so that we > >>>>> can pass the executable to the next qemu via file descriptor? I.e. execvat becomes > >>>>> > >>>>> f = openat() > >>>>> execv(qemu, "-execfd", f) > >>>>> > >>>>> and is_proc_myself uses execfd, which we already have open. > >>> > >>> How does passing a fd from one process to another work? > >> As long as the fd is not marked O_CLOEXEC, it stays open in the new process. Providing > >> the number via command-line, or whatever, is sufficient for the new process to know what > >> is going on. > > > > Err I guess I was thinking its a bit weird having an option that is > > only really applicable > > if qemu is a child process. I.e the `-execfd` argument is not usable > > from commandline. > > qemu-foo -execfd 3 3< /some/file > > Or perhaps opened via other tooling. > > >> I now realize this is necessary for the AT_EMPTY_PATH flag, where we only have the file > >> descriptor. > > > > We could also do something along the lines of: > > > > ``` > > fd = openat(dirfd, exe); > > char new_exe[PATH_MAX]; > > char fd_path[PATH_MAX]; > > sprintf(fd_path, "/proc/self/fd/%d", fd); > > readlink(fd_path, new_exe, PATH_MAX); > > Reading the link doesn't always work. > Reading or passing the link means AT_SYMLINK_NOFOLLOW isn't honored. Okay, fair enough, I will get started on adding `-execfd` > > > r~
On Wed, Nov 6, 2024 at 11:53 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > On Wed, Nov 6, 2024 at 11:26 AM Richard Henderson > <richard.henderson@linaro.org> wrote: > > > > On 11/6/24 17:03, Noah Goldstein wrote: > > > On Wed, Nov 6, 2024 at 3:38 AM Richard Henderson > > > <richard.henderson@linaro.org> wrote: > > >> > > >> On 11/5/24 23:54, Noah Goldstein wrote: > > >>>>> You still need to handle is_proc_myself, for the guest binary. > > >>> > > >>> Would this by handled by basically do: > > >>> > > >>> ``` > > >>> if (is_proc_myself(p, "exe")) { > > >>> exe = exec_path; > > >>> if (through_qemu) > > >>> argp[argp_offset] = exec_path; > > >>> } > > >>> ``` > > >>> Or am I missing something? > > >> > > >> Something like that, yes. > > >> > > >>>>> I wonder if those two cases are related. Do we need to also add an argument so that we > > >>>>> can pass the executable to the next qemu via file descriptor? I.e. execvat becomes > > >>>>> > > >>>>> f = openat() > > >>>>> execv(qemu, "-execfd", f) > > >>>>> > > >>>>> and is_proc_myself uses execfd, which we already have open. > > >>> > > >>> How does passing a fd from one process to another work? > > >> As long as the fd is not marked O_CLOEXEC, it stays open in the new process. Providing > > >> the number via command-line, or whatever, is sufficient for the new process to know what > > >> is going on. > > > > > > Err I guess I was thinking its a bit weird having an option that is > > > only really applicable > > > if qemu is a child process. I.e the `-execfd` argument is not usable > > > from commandline. > > > > qemu-foo -execfd 3 3< /some/file > > > > Or perhaps opened via other tooling. > > > > >> I now realize this is necessary for the AT_EMPTY_PATH flag, where we only have the file > > >> descriptor. > > > > > > We could also do something along the lines of: > > > > > > ``` > > > fd = openat(dirfd, exe); > > > char new_exe[PATH_MAX]; > > > char fd_path[PATH_MAX]; > > > sprintf(fd_path, "/proc/self/fd/%d", fd); > > > readlink(fd_path, new_exe, PATH_MAX); > > > > Reading the link doesn't always work. > > Reading or passing the link means AT_SYMLINK_NOFOLLOW isn't honored. > > Okay, fair enough, I will get started on adding `-execfd` Question about impl regarding handling of `-execfd` with/without a program name. 1) `-execfd` + program name ie: `qemu -execfd <some_fd> ls -a`. 2) `-execfd` without program name i.e: `qemu -execfd <some_fd> -a`. Do you want to allow both of these? If you want to allow (1), what should we use for `argv[0]`/`exec_path`. The program pass ("ls") or `readlink(<some_fd>)`?
On 11/6/24 18:13, Noah Goldstein wrote: > Question about impl regarding handling of `-execfd` with/without a program name. > > 1) `-execfd` + program name ie: `qemu -execfd <some_fd> ls -a`. > 2) `-execfd` without program name i.e: `qemu -execfd <some_fd> -a`. > > Do you want to allow both of these? If you want to allow (1), what should > we use for `argv[0]`/`exec_path`. The program pass ("ls") or > `readlink(<some_fd>)`? The canonical response is, examine the kernel source. We're not implementing this in a vacuum, you're replicating execveat(2). I suspect the answer is (1), to be compared with syscall(__NR_execveat, some_fd, "", &["ls", "-a"], env, AT_EMPTY_PATH); r~
On Wed, Nov 6, 2024 at 3:10 PM Richard Henderson <richard.henderson@linaro.org> wrote: > > On 11/6/24 18:13, Noah Goldstein wrote: > > Question about impl regarding handling of `-execfd` with/without a program name. > > > > 1) `-execfd` + program name ie: `qemu -execfd <some_fd> ls -a`. > > 2) `-execfd` without program name i.e: `qemu -execfd <some_fd> -a`. > > > > Do you want to allow both of these? If you want to allow (1), what should > > we use for `argv[0]`/`exec_path`. The program pass ("ls") or > > `readlink(<some_fd>)`? > > The canonical response is, examine the kernel source. > We're not implementing this in a vacuum, you're replicating execveat(2). > > I suspect the answer is (1), to be compared with > > syscall(__NR_execveat, some_fd, "", &["ls", "-a"], env, AT_EMPTY_PATH); Err, I think the reference for '-execfd' is `fexecve`: https://man7.org/linux/man-pages/man3/fexecve.3.html Which doesn't take a path. So I guess we just interpret the "ls" as argv[0] but not as "exec_path". > > > r~
On Wed, Nov 6, 2024 at 3:30 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > On Wed, Nov 6, 2024 at 3:10 PM Richard Henderson > <richard.henderson@linaro.org> wrote: > > > > On 11/6/24 18:13, Noah Goldstein wrote: > > > Question about impl regarding handling of `-execfd` with/without a program name. > > > > > > 1) `-execfd` + program name ie: `qemu -execfd <some_fd> ls -a`. > > > 2) `-execfd` without program name i.e: `qemu -execfd <some_fd> -a`. > > > > > > Do you want to allow both of these? If you want to allow (1), what should > > > we use for `argv[0]`/`exec_path`. The program pass ("ls") or > > > `readlink(<some_fd>)`? > > > > The canonical response is, examine the kernel source. > > We're not implementing this in a vacuum, you're replicating execveat(2). > > > > I suspect the answer is (1), to be compared with > > > > syscall(__NR_execveat, some_fd, "", &["ls", "-a"], env, AT_EMPTY_PATH); > > Err, I think the reference for '-execfd' is `fexecve`: > https://man7.org/linux/man-pages/man3/fexecve.3.html > > Which doesn't take a path. So I guess we just interpret the "ls" as > argv[0] but not > as "exec_path". One more point, what should the behavior be if we have AT_EXECFD from binfmt-misc? > > > > > > r~
On 11/6/24 21:30, Noah Goldstein wrote: > On Wed, Nov 6, 2024 at 3:10 PM Richard Henderson > <richard.henderson@linaro.org> wrote: >> >> On 11/6/24 18:13, Noah Goldstein wrote: >>> Question about impl regarding handling of `-execfd` with/without a program name. >>> >>> 1) `-execfd` + program name ie: `qemu -execfd <some_fd> ls -a`. >>> 2) `-execfd` without program name i.e: `qemu -execfd <some_fd> -a`. >>> >>> Do you want to allow both of these? If you want to allow (1), what should >>> we use for `argv[0]`/`exec_path`. The program pass ("ls") or >>> `readlink(<some_fd>)`? >> >> The canonical response is, examine the kernel source. >> We're not implementing this in a vacuum, you're replicating execveat(2). >> >> I suspect the answer is (1), to be compared with >> >> syscall(__NR_execveat, some_fd, "", &["ls", "-a"], env, AT_EMPTY_PATH); > > Err, I think the reference for '-execfd' is `fexecve`: > https://man7.org/linux/man-pages/man3/fexecve.3.html No, fexecve(3) is a glibc function which (nowadays) uses the execveat(2) syscall that we want to emulate. > Which doesn't take a path... ... corresponding to the "" and AT_EMPTY_PATH above. > So I guess we just interpret the "ls" as argv[0] but not as "exec_path". But your conclusion is correct. r~
On 11/6/24 23:49, Noah Goldstein wrote: > On Wed, Nov 6, 2024 at 3:30 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: >> >> On Wed, Nov 6, 2024 at 3:10 PM Richard Henderson >> <richard.henderson@linaro.org> wrote: >>> >>> On 11/6/24 18:13, Noah Goldstein wrote: >>>> Question about impl regarding handling of `-execfd` with/without a program name. >>>> >>>> 1) `-execfd` + program name ie: `qemu -execfd <some_fd> ls -a`. >>>> 2) `-execfd` without program name i.e: `qemu -execfd <some_fd> -a`. >>>> >>>> Do you want to allow both of these? If you want to allow (1), what should >>>> we use for `argv[0]`/`exec_path`. The program pass ("ls") or >>>> `readlink(<some_fd>)`? >>> >>> The canonical response is, examine the kernel source. >>> We're not implementing this in a vacuum, you're replicating execveat(2). >>> >>> I suspect the answer is (1), to be compared with >>> >>> syscall(__NR_execveat, some_fd, "", &["ls", "-a"], env, AT_EMPTY_PATH); >> >> Err, I think the reference for '-execfd' is `fexecve`: >> https://man7.org/linux/man-pages/man3/fexecve.3.html >> >> Which doesn't take a path. So I guess we just interpret the "ls" as >> argv[0] but not >> as "exec_path". > > One more point, what should the behavior be if we have > AT_EXECFD from binfmt-misc? You mean precedence of AT_EXECFD vs the command-line option? Arbitrary, since it would be a usage error to have both. You'd have to do something silly with the binfmt-misc rule for that to happen. Perhaps static int execfd = -1; // option processing // main if (execfd < 0) { errno = 0; execfd = qemu_getauxval(AT_EXECFD); if (errno != 0) { execfd = open(...); } } just because that's a simple change to what's currently present. r~
diff --git a/linux-user/main.c b/linux-user/main.c index 8143a0d4b0..5e3d41dc2b 100644 --- a/linux-user/main.c +++ b/linux-user/main.c @@ -81,6 +81,10 @@ unsigned long mmap_min_addr; uintptr_t guest_base; bool have_guest_base; +bool qemu_dup_for_children; +int qemu_argc; +char **qemu_argv; + /* * Used to implement backwards-compatibility for the `-strace`, and * QEMU_STRACE options. Without this, the QEMU_LOG can be overwritten by @@ -451,6 +455,11 @@ static void handle_arg_jitdump(const char *arg) perf_enable_jitdump(); } +static void handle_arg_qemu_children(const char *arg) +{ + qemu_dup_for_children = true; +} + static QemuPluginList plugins = QTAILQ_HEAD_INITIALIZER(plugins); #ifdef CONFIG_PLUGIN @@ -526,6 +535,10 @@ static const struct qemu_argument arg_table[] = { "", "Generate a /tmp/perf-${pid}.map file for perf"}, {"jitdump", "QEMU_JITDUMP", false, handle_arg_jitdump, "", "Generate a jit-${pid}.dump file for perf"}, + {"qemu-children", + "QEMU_CHILDREN", false, handle_arg_qemu_children, + "", "Run child processes (created with execve) with qemu " + "(as instantiated for the parent)"}, {NULL, NULL, false, NULL, NULL, NULL} }; @@ -729,6 +742,14 @@ int main(int argc, char **argv, char **envp) optind = parse_args(argc, argv); + if (qemu_dup_for_children) { + qemu_argc = optind; + qemu_argv = g_new0(char *, qemu_argc); + for (i = 0; i < optind; ++i) { + qemu_argv[i] = strdup(argv[i]); + } + } + qemu_set_log_filename_flags(last_log_filename, last_log_mask | (enable_strace * LOG_STRACE), &error_fatal); diff --git a/linux-user/syscall.c b/linux-user/syscall.c index 59b2080b98..96b105e9ce 100644 --- a/linux-user/syscall.c +++ b/linux-user/syscall.c @@ -8550,13 +8550,14 @@ static int do_execv(CPUArchState *cpu_env, int dirfd, abi_long pathname, abi_long guest_argp, abi_long guest_envp, int flags, bool is_execveat) { - int ret; + int ret, argp_offset; char **argp, **envp; int argc, envc; abi_ulong gp; abi_ulong addr; char **q; void *p; + bool through_qemu = dirfd == AT_FDCWD && qemu_dup_for_children; argc = 0; @@ -8580,10 +8581,12 @@ static int do_execv(CPUArchState *cpu_env, int dirfd, envc++; } - argp = g_new0(char *, argc + 1); + argp_offset = through_qemu ? qemu_argc : 0; + argp = g_new0(char *, argc + argp_offset + 1); envp = g_new0(char *, envc + 1); - for (gp = guest_argp, q = argp; gp; gp += sizeof(abi_ulong), q++) { + for (gp = guest_argp, q = argp + argp_offset; + gp; gp += sizeof(abi_ulong), q++) { if (get_user_ual(addr, gp)) { goto execve_efault; } @@ -8628,9 +8631,16 @@ static int do_execv(CPUArchState *cpu_env, int dirfd, } const char *exe = p; - if (is_proc_myself(p, "exe")) { + if (through_qemu) { + int i; + for (i = 0; i < argp_offset; ++i) { + argp[i] = qemu_argv[i]; + } + exe = qemu_argv[0]; + } else if (is_proc_myself(p, "exe")) { exe = exec_path; } + ret = is_execveat ? safe_execveat(dirfd, exe, argp, envp, flags) : safe_execve(exe, argp, envp); @@ -8644,7 +8654,8 @@ execve_efault: ret = -TARGET_EFAULT; execve_end: - for (gp = guest_argp, q = argp; *q; gp += sizeof(abi_ulong), q++) { + for (gp = guest_argp, q = argp + argp_offset; + *q; gp += sizeof(abi_ulong), q++) { if (get_user_ual(addr, gp) || !addr) { break; } diff --git a/linux-user/user-internals.h b/linux-user/user-internals.h index 46ffc093f4..ed3ed666a0 100644 --- a/linux-user/user-internals.h +++ b/linux-user/user-internals.h @@ -30,6 +30,10 @@ void stop_all_tasks(void); extern const char *qemu_uname_release; extern unsigned long mmap_min_addr; +extern bool qemu_dup_for_children; +extern int qemu_argc; +extern char **qemu_argv; + typedef struct IOCTLEntry IOCTLEntry; typedef abi_long do_ioctl_fn(const IOCTLEntry *ie, uint8_t *buf_temp, diff --git a/tests/tcg/multiarch/Makefile.target b/tests/tcg/multiarch/Makefile.target index 78b83d5575..0e220953e7 100644 --- a/tests/tcg/multiarch/Makefile.target +++ b/tests/tcg/multiarch/Makefile.target @@ -30,6 +30,14 @@ run-float_%: float_% $(call conditional-diff-out,$<,$(SRC_PATH)/tests/tcg/$(TARGET_NAME)/$<.ref) +run-linux-execve-qemu-children: linux-execve-qemu-children + $(call run-test,$<, $(QEMU) $(QEMU_OPTS) -qemu-children $< $(QEMU) 0) + $(call run-test,$<, $(QEMU) $(QEMU_OPTS) $< linux-execve 0 skip) + +run-plugin-linux-execve-qemu-children-with-%: linux-execve-qemu-children + $(call run-test,$<, $(QEMU) $(QEMU_OPTS) -qemu-children $< $(QEMU) 0) + $(call run-test,$<, $(QEMU) $(QEMU_OPTS) $< linux-execve 0 skip) + testthread: LDFLAGS+=-lpthread threadcount: LDFLAGS+=-lpthread diff --git a/tests/tcg/multiarch/linux/linux-execve-qemu-children.c b/tests/tcg/multiarch/linux/linux-execve-qemu-children.c new file mode 100644 index 0000000000..60d6537666 --- /dev/null +++ b/tests/tcg/multiarch/linux/linux-execve-qemu-children.c @@ -0,0 +1,68 @@ +#include <assert.h> +#include <errno.h> +#include <fcntl.h> +#include <libgen.h> +#include <malloc.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> + +#define MAX_COMM_SIZE (4096) + +int +main(int argc, char ** argv, char ** envp) { + int fd; + char next_arg[2]; + char * buf; + ssize_t off; + const char * expec_comm; + assert(argc == 3 || argc == 4); + fd = open("/proc/self/comm", O_RDONLY); + assert(fd > 0); + + buf = calloc(MAX_COMM_SIZE + 1, 1); + assert(buf != NULL); + + off = 0; + for (;;) { + ssize_t res = read(fd, buf + off, 1); + if (res < 0 && errno != EAGAIN) { + perror("Failed to read comm"); + return -1; + } + if (res == 0) { + break; + } + + off += res; + + if (off >= MAX_COMM_SIZE) { + fprintf(stderr, "/proc/self/comm too large for test\n"); + return -1; + } + } + assert(off && buf[off] == '\0' && buf[off - 1] == '\n'); + buf[off - 1] = '\0'; + expec_comm = basename(argv[1]); + if (argc == 3 && strncmp(buf, expec_comm, strlen(expec_comm))) { + fprintf(stderr, + "Didn't propagate qemu settings\nComm: '%s'\nExpec: '%s'\n", + buf, expec_comm); + return -1; + } + free(buf); + next_arg[0] = argv[2][0]; + next_arg[1] = '\0'; + if (next_arg[0] == '9') { + return 0; + } + next_arg[0] += 1; + char * next_args[] = { argv[0], argv[1], next_arg, NULL }; + int eres = execve(argv[0], &next_args[0], envp); + if (eres != 0) { + fprintf(stderr, "Unable to execve: %d/%d -> %s\n", eres, errno, + strerror(errno)); + return -1; + } + return 0; +}
The new option '-qemu-children' makes it so that on `execve` the child process will be launch by the same `qemu` executable that is currently running along with its current commandline arguments. The motivation for the change is to make it so that plugins running through `qemu` can continue to run on children. Why not just `binfmt`?: Plugins can be desirable regardless of system/architecture emulation, and can sometimes be useful for elf files that can run natively. Enabling `binfmt` for all natively runnable elf files may not be desirable. Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> --- linux-user/main.c | 21 ++++++ linux-user/syscall.c | 21 ++++-- linux-user/user-internals.h | 4 ++ tests/tcg/multiarch/Makefile.target | 8 +++ .../linux/linux-execve-qemu-children.c | 68 +++++++++++++++++++ 5 files changed, 117 insertions(+), 5 deletions(-) create mode 100644 tests/tcg/multiarch/linux/linux-execve-qemu-children.c