diff mbox

[1/6] oslib-posix: add helpers for stack alloc and free

Message ID 1467272240-32123-2-git-send-email-pl@kamp.de (mailing list archive)
State New, archived
Headers show

Commit Message

Peter Lieven June 30, 2016, 7:37 a.m. UTC
Suggested-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Lieven <pl@kamp.de>
---
 include/sysemu/os-posix.h | 24 ++++++++++++++++++++++++
 util/oslib-posix.c        | 22 ++++++++++++++++++++++
 2 files changed, 46 insertions(+)

Comments

Richard Henderson July 1, 2016, 8:12 p.m. UTC | #1
On 06/30/2016 12:37 AM, Peter Lieven wrote:
> +void *qemu_alloc_stack(size_t sz)
> +{
> +    /* allocate sz bytes plus one extra page for a guard
> +     * page at the bottom of the stack */
> +    void *ptr = mmap(NULL, sz + getpagesize(), PROT_NONE,
> +                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
> +    if (ptr == MAP_FAILED) {
> +        abort();
> +    }
> +    if (mmap(ptr + getpagesize(), sz, PROT_READ | PROT_WRITE,
> +        MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0) == MAP_FAILED) {
> +        abort();
> +    }

Why two mmap instead of mmap + mprotect?


r~
Richard Henderson July 1, 2016, 8:49 p.m. UTC | #2
On 07/01/2016 01:12 PM, Richard Henderson wrote:
> On 06/30/2016 12:37 AM, Peter Lieven wrote:
>> +void *qemu_alloc_stack(size_t sz)
>> +{
>> +    /* allocate sz bytes plus one extra page for a guard
>> +     * page at the bottom of the stack */
>> +    void *ptr = mmap(NULL, sz + getpagesize(), PROT_NONE,
>> +                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
>> +    if (ptr == MAP_FAILED) {
>> +        abort();
>> +    }
>> +    if (mmap(ptr + getpagesize(), sz, PROT_READ | PROT_WRITE,
>> +        MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0) == MAP_FAILED) {
>> +        abort();
>> +    }

Rare platforms now, but fwiw, this is incorrect for hppa and ia64.

For hppa, stack grows up, so the guard page needs to be at the top.

For ia64, there are two stacks, the "normal" program stack (grows down) and the 
register window stack (grows up).  The guard page goes in between.

See e.g. glibc/nptl/allocatestack.c

#ifdef NEED_SEPARATE_REGISTER_STACK
           char *guard = mem + (((size - guardsize) / 2) & ~pagesize_m1);
#elif _STACK_GROWS_DOWN
           char *guard = mem;
#elif _STACK_GROWS_UP
           char *guard = (char *) (((uintptr_t) pd - guardsize) & ~pagesize_m1);
#endif
           if (mprotect (guard, guardsize, PROT_NONE) != 0)



r~
Peter Lieven July 4, 2016, 6:16 a.m. UTC | #3
Am 01.07.2016 um 22:12 schrieb Richard Henderson:
> On 06/30/2016 12:37 AM, Peter Lieven wrote:
>> +void *qemu_alloc_stack(size_t sz)
>> +{
>> +    /* allocate sz bytes plus one extra page for a guard
>> +     * page at the bottom of the stack */
>> +    void *ptr = mmap(NULL, sz + getpagesize(), PROT_NONE,
>> +                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
>> +    if (ptr == MAP_FAILED) {
>> +        abort();
>> +    }
>> +    if (mmap(ptr + getpagesize(), sz, PROT_READ | PROT_WRITE,
>> +        MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0) == MAP_FAILED) {
>> +        abort();
>> +    }
>
> Why two mmap instead of mmap + mprotect?
>
>
> r~

I was looking at qemu_ram_mmap too much. mprotect seems the cleaner solution.

Thanks,
Peter
Peter Lieven July 4, 2016, 6:18 a.m. UTC | #4
Am 01.07.2016 um 22:49 schrieb Richard Henderson:
> On 07/01/2016 01:12 PM, Richard Henderson wrote:
>> On 06/30/2016 12:37 AM, Peter Lieven wrote:
>>> +void *qemu_alloc_stack(size_t sz)
>>> +{
>>> +    /* allocate sz bytes plus one extra page for a guard
>>> +     * page at the bottom of the stack */
>>> +    void *ptr = mmap(NULL, sz + getpagesize(), PROT_NONE,
>>> +                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
>>> +    if (ptr == MAP_FAILED) {
>>> +        abort();
>>> +    }
>>> +    if (mmap(ptr + getpagesize(), sz, PROT_READ | PROT_WRITE,
>>> +        MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0) == MAP_FAILED) {
>>> +        abort();
>>> +    }
>
> Rare platforms now, but fwiw, this is incorrect for hppa and ia64.
>
> For hppa, stack grows up, so the guard page needs to be at the top.
>
> For ia64, there are two stacks, the "normal" program stack (grows down) and the register window stack (grows up).  The guard page goes in between.
>
> See e.g. glibc/nptl/allocatestack.c
>
> #ifdef NEED_SEPARATE_REGISTER_STACK
>           char *guard = mem + (((size - guardsize) / 2) & ~pagesize_m1);
> #elif _STACK_GROWS_DOWN
>           char *guard = mem;
> #elif _STACK_GROWS_UP
>           char *guard = (char *) (((uintptr_t) pd - guardsize) & ~pagesize_m1);
> #endif
>           if (mprotect (guard, guardsize, PROT_NONE) != 0)

It seems that ia64 needs even more care when allocating a stack, right?
Would you think it is ok to only handle _STACK_GROWS_DOWN and _STACK_GROWS_UP ?

Peter
Paolo Bonzini July 4, 2016, 10:19 a.m. UTC | #5
On 04/07/2016 08:18, Peter Lieven wrote:
> Am 01.07.2016 um 22:49 schrieb Richard Henderson:
>> On 07/01/2016 01:12 PM, Richard Henderson wrote:
>>> On 06/30/2016 12:37 AM, Peter Lieven wrote:
>>>> +void *qemu_alloc_stack(size_t sz)
>>>> +{
>>>> +    /* allocate sz bytes plus one extra page for a guard
>>>> +     * page at the bottom of the stack */
>>>> +    void *ptr = mmap(NULL, sz + getpagesize(), PROT_NONE,
>>>> +                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
>>>> +    if (ptr == MAP_FAILED) {
>>>> +        abort();
>>>> +    }
>>>> +    if (mmap(ptr + getpagesize(), sz, PROT_READ | PROT_WRITE,
>>>> +        MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0) ==
>>>> MAP_FAILED) {
>>>> +        abort();
>>>> +    }
>>
>> Rare platforms now, but fwiw, this is incorrect for hppa and ia64.
>>
>> For hppa, stack grows up, so the guard page needs to be at the top.
>>
>> For ia64, there are two stacks, the "normal" program stack (grows
>> down) and the register window stack (grows up).  The guard page goes
>> in between.
>>
>> See e.g. glibc/nptl/allocatestack.c
>>
>> #ifdef NEED_SEPARATE_REGISTER_STACK
>>           char *guard = mem + (((size - guardsize) / 2) & ~pagesize_m1);
>> #elif _STACK_GROWS_DOWN
>>           char *guard = mem;
>> #elif _STACK_GROWS_UP
>>           char *guard = (char *) (((uintptr_t) pd - guardsize) &
>> ~pagesize_m1);
>> #endif
>>           if (mprotect (guard, guardsize, PROT_NONE) != 0)
> 
> It seems that ia64 needs even more care when allocating a stack, right?

No, you just pass the stack and the runtime takes care of initializing
the two stack pointers:

    uc.uc_link = &old_uc;
    uc.uc_stack.ss_sp = co->stack;
    uc.uc_stack.ss_size = stack_size;
    uc.uc_stack.ss_flags = 0;

FWIW, I've heard about some experiments with "split" stacks on x86 too.
In this case variables that escaped went on the "grows up" part, while
everything else (parameters, spills and call return addresses) stayed on
the hardware "grows down" part.  The result is more secure and actually
even faster because of better TLB locality.

Paolo

> Would you think it is ok to only handle _STACK_GROWS_DOWN and
> _STACK_GROWS_UP ?
> 
> Peter
Peter Lieven July 4, 2016, 10:25 a.m. UTC | #6
Am 04.07.2016 um 12:19 schrieb Paolo Bonzini:
>
> On 04/07/2016 08:18, Peter Lieven wrote:
>> Am 01.07.2016 um 22:49 schrieb Richard Henderson:
>>> On 07/01/2016 01:12 PM, Richard Henderson wrote:
>>>> On 06/30/2016 12:37 AM, Peter Lieven wrote:
>>>>> +void *qemu_alloc_stack(size_t sz)
>>>>> +{
>>>>> +    /* allocate sz bytes plus one extra page for a guard
>>>>> +     * page at the bottom of the stack */
>>>>> +    void *ptr = mmap(NULL, sz + getpagesize(), PROT_NONE,
>>>>> +                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
>>>>> +    if (ptr == MAP_FAILED) {
>>>>> +        abort();
>>>>> +    }
>>>>> +    if (mmap(ptr + getpagesize(), sz, PROT_READ | PROT_WRITE,
>>>>> +        MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0) ==
>>>>> MAP_FAILED) {
>>>>> +        abort();
>>>>> +    }
>>> Rare platforms now, but fwiw, this is incorrect for hppa and ia64.
>>>
>>> For hppa, stack grows up, so the guard page needs to be at the top.
>>>
>>> For ia64, there are two stacks, the "normal" program stack (grows
>>> down) and the register window stack (grows up).  The guard page goes
>>> in between.
>>>
>>> See e.g. glibc/nptl/allocatestack.c
>>>
>>> #ifdef NEED_SEPARATE_REGISTER_STACK
>>>            char *guard = mem + (((size - guardsize) / 2) & ~pagesize_m1);
>>> #elif _STACK_GROWS_DOWN
>>>            char *guard = mem;
>>> #elif _STACK_GROWS_UP
>>>            char *guard = (char *) (((uintptr_t) pd - guardsize) &
>>> ~pagesize_m1);
>>> #endif
>>>            if (mprotect (guard, guardsize, PROT_NONE) != 0)
>> It seems that ia64 needs even more care when allocating a stack, right?
> No, you just pass the stack and the runtime takes care of initializing
> the two stack pointers:
>
>      uc.uc_link = &old_uc;
>      uc.uc_stack.ss_sp = co->stack;
>      uc.uc_stack.ss_size = stack_size;
>      uc.uc_stack.ss_flags = 0;
>
> FWIW, I've heard about some experiments with "split" stacks on x86 too.
> In this case variables that escaped went on the "grows up" part, while
> everything else (parameters, spills and call return addresses) stayed on
> the hardware "grows down" part.  The result is more secure and actually
> even faster because of better TLB locality.

So, you would basically copy the if/elif part from allocatestack.c ?

In this case I would also count the guardpage as part of the stack, so
that the usable stack size is actually reduced by one page?

Peter
Paolo Bonzini July 4, 2016, 10:34 a.m. UTC | #7
On 04/07/2016 12:25, Peter Lieven wrote:
>>>
>> No, you just pass the stack and the runtime takes care of initializing
>> the two stack pointers:
>>
>>      uc.uc_link = &old_uc;
>>      uc.uc_stack.ss_sp = co->stack;
>>      uc.uc_stack.ss_size = stack_size;
>>      uc.uc_stack.ss_flags = 0;
> 
> So, you would basically copy the if/elif part from allocatestack.c ?

Yes, but note that _STACK_GROWS_{DOWN,UP} and
NEED_SEPARATE_REGISTER_STACK are glibc-specific.  You need to use
HOST_IA64 and HOST_HPPA.

> In this case I would also count the guardpage as part of the stack, so
> that the usable stack size is actually reduced by one page?

Yes.

Paolo
Peter Lieven July 4, 2016, 10:35 a.m. UTC | #8
Am 04.07.2016 um 12:34 schrieb Paolo Bonzini:
>
> On 04/07/2016 12:25, Peter Lieven wrote:
>>> No, you just pass the stack and the runtime takes care of initializing
>>> the two stack pointers:
>>>
>>>       uc.uc_link = &old_uc;
>>>       uc.uc_stack.ss_sp = co->stack;
>>>       uc.uc_stack.ss_size = stack_size;
>>>       uc.uc_stack.ss_flags = 0;
>> So, you would basically copy the if/elif part from allocatestack.c ?
> Yes, but note that _STACK_GROWS_{DOWN,UP} and
> NEED_SEPARATE_REGISTER_STACK are glibc-specific.  You need to use
> HOST_IA64 and HOST_HPPA.

Is HOST_HPPA the only supported target where the stack grows up?

Peter
Paolo Bonzini July 4, 2016, 10:48 a.m. UTC | #9
On 04/07/2016 12:35, Peter Lieven wrote:
> Am 04.07.2016 um 12:34 schrieb Paolo Bonzini:
>>
>> On 04/07/2016 12:25, Peter Lieven wrote:
>>>> No, you just pass the stack and the runtime takes care of initializing
>>>> the two stack pointers:
>>>>
>>>>       uc.uc_link = &old_uc;
>>>>       uc.uc_stack.ss_sp = co->stack;
>>>>       uc.uc_stack.ss_size = stack_size;
>>>>       uc.uc_stack.ss_flags = 0;
>>> So, you would basically copy the if/elif part from allocatestack.c ?
>> Yes, but note that _STACK_GROWS_{DOWN,UP} and
>> NEED_SEPARATE_REGISTER_STACK are glibc-specific.  You need to use
>> HOST_IA64 and HOST_HPPA.
> 
> Is HOST_HPPA the only supported target where the stack grows up?

Yes, I think so.

Paolo
diff mbox

Patch

diff --git a/include/sysemu/os-posix.h b/include/sysemu/os-posix.h
index 9c7dfdf..34d102c 100644
--- a/include/sysemu/os-posix.h
+++ b/include/sysemu/os-posix.h
@@ -60,4 +60,28 @@  int qemu_utimens(const char *path, const qemu_timespec *times);
 
 bool is_daemonized(void);
 
+/**
+ * qemu_alloc_stack:
+ * @sz: size of required stack in bytes
+ *
+ * Allocate memory that can be used as a stack, for instance for
+ * coroutines. If the memory cannot be allocated, this function
+ * will abort (like g_malloc()). The function will register a
+ * guard page right below the stack memory to catch stack overflows.
+ *
+ * The allocated stack must be freed with qemu_free_stack().
+ *
+ * Returns: pointer to (the lowest address of) the stack memory.
+ */
+void *qemu_alloc_stack(size_t sz);
+
+/**
+ * qemu_free_stack:
+ * @stack: stack to free
+ * @sz: size of stack in bytes
+ *
+ * Free a stack allocated via qemu_alloc_stack().
+ */
+void qemu_free_stack(void *stack, size_t sz);
+
 #endif
diff --git a/util/oslib-posix.c b/util/oslib-posix.c
index e2e1d4d..a567a7d 100644
--- a/util/oslib-posix.c
+++ b/util/oslib-posix.c
@@ -497,3 +497,25 @@  pid_t qemu_fork(Error **errp)
     }
     return pid;
 }
+
+void *qemu_alloc_stack(size_t sz)
+{
+    /* allocate sz bytes plus one extra page for a guard
+     * page at the bottom of the stack */
+    void *ptr = mmap(NULL, sz + getpagesize(), PROT_NONE,
+                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    if (ptr == MAP_FAILED) {
+        abort();
+    }
+    if (mmap(ptr + getpagesize(), sz, PROT_READ | PROT_WRITE,
+        MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0) == MAP_FAILED) {
+        abort();
+    }
+    return ptr + getpagesize();
+}
+
+void qemu_free_stack(void *stack, size_t sz)
+{
+    /* unmap the stack plus the extra guard page */
+    munmap(stack - getpagesize(), sz + getpagesize());
+}