diff mbox

[PULL,v2,22/38] linux-user: Provide safe_syscall for fixing races between signals and syscalls

Message ID 4d330cee37a21aabfc619a1948953559e66951a4.1464353863.git.riku.voipio@linaro.org (mailing list archive)
State New, archived
Headers show

Commit Message

Riku Voipio May 27, 2016, 1 p.m. UTC
From: Timothy E Baldwin <T.E.Baldwin99@members.leeds.ac.uk>

If a signal is delivered immediately before a blocking system call the
handler will only be called after the system call returns, which may be a
long time later or never.

This is fixed by using a function (safe_syscall) that checks if a guest
signal is pending prior to making a system call, and if so does not call the
system call and returns -TARGET_ERESTARTSYS. If a signal is received between
the check and the system call host_signal_handler() rewinds execution to
before the check. This rewinding has the effect of closing the race window
so that safe_syscall will reliably either (a) go into the host syscall
with no unprocessed guest signals pending or or (b) return
-TARGET_ERESTARTSYS so that the caller can deal with the signals.
Implementing this requires a per-host-architecture assembly language
fragment.

This will also resolve the mishandling of the SA_RESTART flag where
we would restart a host system call and not call the guest signal handler
until the syscall finally completed -- syscall restarting now always
happens at the guest syscall level so the guest signal handler will run.
(The host syscall will never be restarted because if the host kernel
rewinds the PC to point at the syscall insn for a restart then our
host_signal_handler() will see this and arrange the guest PC rewind.)

This commit contains the infrastructure for implementing safe_syscall
and the assembly language fragment for x86-64, but does not change any
syscalls to use it.

Signed-off-by: Timothy Edward Baldwin <T.E.Baldwin99@members.leeds.ac.uk>
Message-id: 1441497448-32489-14-git-send-email-T.E.Baldwin99@members.leeds.ac.uk
[PMM:
 * Avoid having an architecture if-ladder in configure by putting
   linux-user/host/$(ARCH) on the include path and including
   safe-syscall.inc.S from it
 * Avoid ifdef ladder in signal.c by creating new hostdep.h to hold
   host-architecture-specific things
 * Added copyright/license header to safe-syscall.inc.S
 * Rewrote commit message
 * Added comments to safe-syscall.inc.S
 * Changed calling convention of safe_syscall() to match syscall()
   (returns -1 and host error in errno on failure)
 * Added a long comment in qemu.h about how to use safe_syscall()
   to implement guest syscalls.
]
RV: squashed Peters "fixup! linux-user: compile on non-x86-64 hosts"
patch
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
---
 Makefile.target                           |   7 +-
 linux-user/Makefile.objs                  |   3 +-
 linux-user/host/generic/hostdep.h         |  20 +++++
 linux-user/host/x86_64/hostdep.h          |  38 +++++++++
 linux-user/host/x86_64/safe-syscall.inc.S |  81 +++++++++++++++++++
 linux-user/qemu.h                         | 127 +++++++++++++++++++++++++++++-
 linux-user/safe-syscall.S                 |  30 +++++++
 linux-user/signal.c                       |  10 +++
 linux-user/syscall.c                      |  47 +++++++++++
 9 files changed, 360 insertions(+), 3 deletions(-)
 create mode 100644 linux-user/host/generic/hostdep.h
 create mode 100644 linux-user/host/x86_64/hostdep.h
 create mode 100644 linux-user/host/x86_64/safe-syscall.inc.S
 create mode 100644 linux-user/safe-syscall.S
diff mbox

Patch

diff --git a/Makefile.target b/Makefile.target
index 34ddb7e..5b80dd7 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -108,7 +108,12 @@  obj-$(CONFIG_LIBDECNUMBER) += libdecnumber/dpd/decimal128.o
 
 ifdef CONFIG_LINUX_USER
 
-QEMU_CFLAGS+=-I$(SRC_PATH)/linux-user/$(TARGET_ABI_DIR) -I$(SRC_PATH)/linux-user
+# Note that we only add linux-user/host/$ARCH if it exists, and
+# that it must come before linux-user/host/generic in the search path.
+QEMU_CFLAGS+=-I$(SRC_PATH)/linux-user/$(TARGET_ABI_DIR) \
+             $(patsubst %,-I%,$(wildcard $(SRC_PATH)/linux-user/host/$(ARCH))) \
+             -I$(SRC_PATH)/linux-user/host/generic \
+             -I$(SRC_PATH)/linux-user
 
 obj-y += linux-user/
 obj-y += gdbstub.o thunk.o user-exec.o
diff --git a/linux-user/Makefile.objs b/linux-user/Makefile.objs
index fd50217..8c93058 100644
--- a/linux-user/Makefile.objs
+++ b/linux-user/Makefile.objs
@@ -1,5 +1,6 @@ 
 obj-y = main.o syscall.o strace.o mmap.o signal.o \
-	elfload.o linuxload.o uaccess.o uname.o
+	elfload.o linuxload.o uaccess.o uname.o \
+	safe-syscall.o
 
 obj-$(TARGET_HAS_BFLT) += flatload.o
 obj-$(TARGET_I386) += vm86.o
diff --git a/linux-user/host/generic/hostdep.h b/linux-user/host/generic/hostdep.h
new file mode 100644
index 0000000..cfabc35
--- /dev/null
+++ b/linux-user/host/generic/hostdep.h
@@ -0,0 +1,20 @@ 
+/*
+ * hostdep.h : fallback generic version of header for things
+ * which are dependent on the host architecture
+ *
+ *  * Written by Peter Maydell <peter.maydell@linaro.org>
+ *
+ * Copyright (C) 2016 Linaro Limited
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef QEMU_HOSTDEP_H
+#define QEMU_HOSTDEP_H
+
+/* This is the fallback header which is only used if the host
+ * architecture doesn't provide one in linux-user/host/$ARCH.
+ */
+
+#endif
diff --git a/linux-user/host/x86_64/hostdep.h b/linux-user/host/x86_64/hostdep.h
new file mode 100644
index 0000000..9dfbf3a
--- /dev/null
+++ b/linux-user/host/x86_64/hostdep.h
@@ -0,0 +1,38 @@ 
+/*
+ * hostdep.h : things which are dependent on the host architecture
+ *
+ *  * Written by Peter Maydell <peter.maydell@linaro.org>
+ *
+ * Copyright (C) 2016 Linaro Limited
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef QEMU_HOSTDEP_H
+#define QEMU_HOSTDEP_H
+
+/* We have a safe-syscall.inc.S */
+#define HAVE_SAFE_SYSCALL
+
+#ifndef __ASSEMBLER__
+
+/* These are defined by the safe-syscall.inc.S file */
+extern char safe_syscall_start[];
+extern char safe_syscall_end[];
+
+/* Adjust the signal context to rewind out of safe-syscall if we're in it */
+static inline void rewind_if_in_safe_syscall(void *puc)
+{
+    struct ucontext *uc = puc;
+    greg_t *pcreg = &uc->uc_mcontext.gregs[REG_RIP];
+
+    if (*pcreg > (uintptr_t)safe_syscall_start
+        && *pcreg < (uintptr_t)safe_syscall_end) {
+        *pcreg = (uintptr_t)safe_syscall_start;
+    }
+}
+
+#endif /* __ASSEMBLER__ */
+
+#endif
diff --git a/linux-user/host/x86_64/safe-syscall.inc.S b/linux-user/host/x86_64/safe-syscall.inc.S
new file mode 100644
index 0000000..dde434c
--- /dev/null
+++ b/linux-user/host/x86_64/safe-syscall.inc.S
@@ -0,0 +1,81 @@ 
+/*
+ * safe-syscall.inc.S : host-specific assembly fragment
+ * to handle signals occurring at the same time as system calls.
+ * This is intended to be included by linux-user/safe-syscall.S
+ *
+ * Copyright (C) 2015 Timothy Edward Baldwin <T.E.Baldwin99@members.leeds.ac.uk>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+        .global safe_syscall_base
+        .global safe_syscall_start
+        .global safe_syscall_end
+        .type   safe_syscall_base, @function
+
+        /* This is the entry point for making a system call. The calling
+         * convention here is that of a C varargs function with the
+         * first argument an 'int *' to the signal_pending flag, the
+         * second one the system call number (as a 'long'), and all further
+         * arguments being syscall arguments (also 'long').
+         * We return a long which is the syscall's return value, which
+         * may be negative-errno on failure. Conversion to the
+         * -1-and-errno-set convention is done by the calling wrapper.
+         */
+safe_syscall_base:
+        /* This saves a frame pointer and aligns the stack for the syscall.
+         * (It's unclear if the syscall ABI has the same stack alignment
+         * requirements as the userspace function call ABI, but better safe than
+         * sorry. Appendix A2 of http://www.x86-64.org/documentation/abi.pdf
+         * does not list any ABI differences regarding stack alignment.)
+         */
+        push    %rbp
+
+        /* The syscall calling convention isn't the same as the
+         * C one:
+         * we enter with rdi == *signal_pending
+         *               rsi == syscall number
+         *               rdx, rcx, r8, r9, (stack), (stack) == syscall arguments
+         *               and return the result in rax
+         * and the syscall instruction needs
+         *               rax == syscall number
+         *               rdi, rsi, rdx, r10, r8, r9 == syscall arguments
+         *               and returns the result in rax
+         * Shuffle everything around appropriately.
+         * Note that syscall will trash rcx and r11.
+         */
+        mov     %rsi, %rax /* syscall number */
+        mov     %rdi, %rbp /* signal_pending pointer */
+        /* and the syscall arguments */
+        mov     %rdx, %rdi
+        mov     %rcx, %rsi
+        mov     %r8,  %rdx
+        mov     %r9,  %r10
+        mov     16(%rsp), %r8
+        mov     24(%rsp), %r9
+
+        /* This next sequence of code works in conjunction with the
+         * rewind_if_safe_syscall_function(). If a signal is taken
+         * and the interrupted PC is anywhere between 'safe_syscall_start'
+         * and 'safe_syscall_end' then we rewind it to 'safe_syscall_start'.
+         * The code sequence must therefore be able to cope with this, and
+         * the syscall instruction must be the final one in the sequence.
+         */
+safe_syscall_start:
+        /* if signal_pending is non-zero, don't do the call */
+        testl   $1, (%rbp)
+        jnz     return_ERESTARTSYS
+        syscall
+safe_syscall_end:
+        /* code path for having successfully executed the syscall */
+        pop     %rbp
+        ret
+
+return_ERESTARTSYS:
+        /* code path when we didn't execute the syscall */
+        mov     $-TARGET_ERESTARTSYS, %rax
+        pop     %rbp
+        ret
+
+        .size   safe_syscall_base, .-safe_syscall_base
diff --git a/linux-user/qemu.h b/linux-user/qemu.h
index 208c63e..f09b750 100644
--- a/linux-user/qemu.h
+++ b/linux-user/qemu.h
@@ -1,7 +1,7 @@ 
 #ifndef QEMU_H
 #define QEMU_H
 
-
+#include "hostdep.h"
 #include "cpu.h"
 #include "exec/exec-all.h"
 #include "exec/cpu_ldst.h"
@@ -205,6 +205,131 @@  unsigned long init_guest_space(unsigned long host_start,
 
 #include "qemu/log.h"
 
+/* safe_syscall.S */
+
+/**
+ * safe_syscall:
+ * @int number: number of system call to make
+ * ...: arguments to the system call
+ *
+ * Call a system call if guest signal not pending.
+ * This has the same API as the libc syscall() function, except that it
+ * may return -1 with errno == TARGET_ERESTARTSYS if a signal was pending.
+ *
+ * Returns: the system call result, or -1 with an error code in errno
+ * (Errnos are host errnos; we rely on TARGET_ERESTARTSYS not clashing
+ * with any of the host errno values.)
+ */
+
+/* A guide to using safe_syscall() to handle interactions between guest
+ * syscalls and guest signals:
+ *
+ * Guest syscalls come in two flavours:
+ *
+ * (1) Non-interruptible syscalls
+ *
+ * These are guest syscalls that never get interrupted by signals and
+ * so never return EINTR. They can be implemented straightforwardly in
+ * QEMU: just make sure that if the implementation code has to make any
+ * blocking calls that those calls are retried if they return EINTR.
+ * It's also OK to implement these with safe_syscall, though it will be
+ * a little less efficient if a signal is delivered at the 'wrong' moment.
+ *
+ * (2) Interruptible syscalls
+ *
+ * These are guest syscalls that can be interrupted by signals and
+ * for which we need to either return EINTR or arrange for the guest
+ * syscall to be restarted. This category includes both syscalls which
+ * always restart (and in the kernel return -ERESTARTNOINTR), ones
+ * which only restart if there is no handler (kernel returns -ERESTARTNOHAND
+ * or -ERESTART_RESTARTBLOCK), and the most common kind which restart
+ * if the handler was registered with SA_RESTART (kernel returns
+ * -ERESTARTSYS). System calls which are only interruptible in some
+ * situations (like 'open') also need to be handled this way.
+ *
+ * Here it is important that the host syscall is made
+ * via this safe_syscall() function, and *not* via the host libc.
+ * If the host libc is used then the implementation will appear to work
+ * most of the time, but there will be a race condition where a
+ * signal could arrive just before we make the host syscall inside libc,
+ * and then then guest syscall will not correctly be interrupted.
+ * Instead the implementation of the guest syscall can use the safe_syscall
+ * function but otherwise just return the result or errno in the usual
+ * way; the main loop code will take care of restarting the syscall
+ * if appropriate.
+ *
+ * (If the implementation needs to make multiple host syscalls this is
+ * OK; any which might really block must be via safe_syscall(); for those
+ * which are only technically blocking (ie which we know in practice won't
+ * stay in the host kernel indefinitely) it's OK to use libc if necessary.
+ * You must be able to cope with backing out correctly if some safe_syscall
+ * you make in the implementation returns either -TARGET_ERESTARTSYS or
+ * EINTR though.)
+ *
+ *
+ * How and why the safe_syscall implementation works:
+ *
+ * The basic setup is that we make the host syscall via a known
+ * section of host native assembly. If a signal occurs, our signal
+ * handler checks the interrupted host PC against the addresse of that
+ * known section. If the PC is before or at the address of the syscall
+ * instruction then we change the PC to point at a "return
+ * -TARGET_ERESTARTSYS" code path instead, and then exit the signal handler
+ * (causing the safe_syscall() call to immediately return that value).
+ * Then in the main.c loop if we see this magic return value we adjust
+ * the guest PC to wind it back to before the system call, and invoke
+ * the guest signal handler as usual.
+ *
+ * This winding-back will happen in two cases:
+ * (1) signal came in just before we took the host syscall (a race);
+ *   in this case we'll take the guest signal and have another go
+ *   at the syscall afterwards, and this is indistinguishable for the
+ *   guest from the timing having been different such that the guest
+ *   signal really did win the race
+ * (2) signal came in while the host syscall was blocking, and the
+ *   host kernel decided the syscall should be restarted;
+ *   in this case we want to restart the guest syscall also, and so
+ *   rewinding is the right thing. (Note that "restart" semantics mean
+ *   "first call the signal handler, then reattempt the syscall".)
+ * The other situation to consider is when a signal came in while the
+ * host syscall was blocking, and the host kernel decided that the syscall
+ * should not be restarted; in this case QEMU's host signal handler will
+ * be invoked with the PC pointing just after the syscall instruction,
+ * with registers indicating an EINTR return; the special code in the
+ * handler will not kick in, and we will return EINTR to the guest as
+ * we should.
+ *
+ * Notice that we can leave the host kernel to make the decision for
+ * us about whether to do a restart of the syscall or not; we do not
+ * need to check SA_RESTART flags in QEMU or distinguish the various
+ * kinds of restartability.
+ */
+#ifdef HAVE_SAFE_SYSCALL
+/* The core part of this function is implemented in assembly */
+extern long safe_syscall_base(int *pending, long number, ...);
+
+#define safe_syscall(...)                                               \
+    ({                                                                  \
+        long ret_;                                                      \
+        int *psp_ = &((TaskState *)thread_cpu->opaque)->signal_pending; \
+        ret_ = safe_syscall_base(psp_, __VA_ARGS__);                    \
+        if (is_error(ret_)) {                                           \
+            errno = -ret_;                                              \
+            ret_ = -1;                                                  \
+        }                                                               \
+        ret_;                                                           \
+    })
+
+#else
+
+/* Fallback for architectures which don't yet provide a safe-syscall assembly
+ * fragment; note that this is racy!
+ * This should go away when all host architectures have been updated.
+ */
+#define safe_syscall syscall
+
+#endif
+
 /* syscall.c */
 int host_to_target_waitstatus(int status);
 
diff --git a/linux-user/safe-syscall.S b/linux-user/safe-syscall.S
new file mode 100644
index 0000000..b5df625
--- /dev/null
+++ b/linux-user/safe-syscall.S
@@ -0,0 +1,30 @@ 
+/*
+ * safe-syscall.S : include the host-specific assembly fragment
+ * to handle signals occurring at the same time as system calls.
+ *
+ * Written by Peter Maydell <peter.maydell@linaro.org>
+ *
+ * Copyright (C) 2016 Linaro Limited
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "hostdep.h"
+#include "errno_defs.h"
+
+/* We have the correct host directory on our include path
+ * so that this will pull in the right fragment for the architecture.
+ */
+#ifdef HAVE_SAFE_SYSCALL
+#include "safe-syscall.inc.S"
+#endif
+
+/* We must specifically say that we're happy for the stack to not be
+ * executable, otherwise the toolchain will default to assuming our
+ * assembly needs an executable stack and the whole QEMU binary will
+ * needlessly end up with one. This should be the last thing in this file.
+ */
+#if defined(__linux__) && defined(__ELF__)
+.section        .note.GNU-stack, "", %progbits
+#endif
diff --git a/linux-user/signal.c b/linux-user/signal.c
index 9e85550..ff4de4f 100644
--- a/linux-user/signal.c
+++ b/linux-user/signal.c
@@ -561,6 +561,13 @@  int queue_signal(CPUArchState *env, int sig, target_siginfo_t *info)
     }
 }
 
+#ifndef HAVE_SAFE_SYSCALL
+static inline void rewind_if_in_safe_syscall(void *puc)
+{
+    /* Default version: never rewind */
+}
+#endif
+
 static void host_signal_handler(int host_signum, siginfo_t *info,
                                 void *puc)
 {
@@ -581,6 +588,9 @@  static void host_signal_handler(int host_signum, siginfo_t *info,
     if (sig < 1 || sig > TARGET_NSIG)
         return;
     trace_user_host_signal(env, host_signum, sig);
+
+    rewind_if_in_safe_syscall(puc);
+
     host_to_target_siginfo_noswap(&tinfo, info);
     if (queue_signal(env, sig, &tinfo) == 1) {
         /* interrupt the virtual CPU as soon as possible */
diff --git a/linux-user/syscall.c b/linux-user/syscall.c
index ced519d..b6a8ed6 100644
--- a/linux-user/syscall.c
+++ b/linux-user/syscall.c
@@ -660,6 +660,53 @@  char *target_strerror(int err)
     return strerror(target_to_host_errno(err));
 }
 
+#define safe_syscall0(type, name) \
+static type safe_##name(void) \
+{ \
+    return safe_syscall(__NR_##name); \
+}
+
+#define safe_syscall1(type, name, type1, arg1) \
+static type safe_##name(type1 arg1) \
+{ \
+    return safe_syscall(__NR_##name, arg1); \
+}
+
+#define safe_syscall2(type, name, type1, arg1, type2, arg2) \
+static type safe_##name(type1 arg1, type2 arg2) \
+{ \
+    return safe_syscall(__NR_##name, arg1, arg2); \
+}
+
+#define safe_syscall3(type, name, type1, arg1, type2, arg2, type3, arg3) \
+static type safe_##name(type1 arg1, type2 arg2, type3 arg3) \
+{ \
+    return safe_syscall(__NR_##name, arg1, arg2, arg3); \
+}
+
+#define safe_syscall4(type, name, type1, arg1, type2, arg2, type3, arg3, \
+    type4, arg4) \
+static type safe_##name(type1 arg1, type2 arg2, type3 arg3, type4 arg4) \
+{ \
+    return safe_syscall(__NR_##name, arg1, arg2, arg3, arg4); \
+}
+
+#define safe_syscall5(type, name, type1, arg1, type2, arg2, type3, arg3, \
+    type4, arg4, type5, arg5) \
+static type safe_##name(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \
+    type5 arg5) \
+{ \
+    return safe_syscall(__NR_##name, arg1, arg2, arg3, arg4, arg5); \
+}
+
+#define safe_syscall6(type, name, type1, arg1, type2, arg2, type3, arg3, \
+    type4, arg4, type5, arg5, type6, arg6) \
+static type safe_##name(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \
+    type5 arg5, type6 arg6) \
+{ \
+    return safe_syscall(__NR_##name, arg1, arg2, arg3, arg4, arg5, arg6); \
+}
+
 static inline int host_to_target_sock_type(int host_type)
 {
     int target_type;