diff mbox series

[v2] tcg: Fix execution on Apple Silicon

Message ID 20210103145055.11074-1-r.bolshakov@yadro.com (mailing list archive)
State New, archived
Headers show
Series [v2] tcg: Fix execution on Apple Silicon | expand

Commit Message

Roman Bolshakov Jan. 3, 2021, 2:50 p.m. UTC
Pages can't be both write and executable at the same time on Apple
Silicon. macOS provides public API to switch write protection [1] for
JIT applications, like TCG.

1. https://developer.apple.com/documentation/apple_silicon/porting_just-in-time_compilers_to_apple_silicon

Signed-off-by: Roman Bolshakov <r.bolshakov@yadro.com>
---
v1: https://lists.gnu.org/archive/html/qemu-devel/2021-01/msg00073.html
Changes since v1:

 - Pruned not needed fiddling with W^X and dropped symmetry from write
   lock/unlock and renamed related functions.
   Similar approach is used in JavaScriptCore [1].

 - Moved jit helper functions to util/osdep
                                                                                                                                                  As outlined in osdep.h, this matches to (2):                                                                                                                                                                                                                                                    * In an ideal world this header would contain only:                                                                                            *  (1) things which everybody needs                                                                                                            *  (2) things without which code would work on most platforms but                                                                              *      fail to compile or misbehave on a minority of host OSes

 - Fixed a checkpatch error

 - Limit new behaviour only to macOS 11.0 and above, because of the
   following declarations:

   __API_AVAILABLE(macos(11.0))
   __API_UNAVAILABLE(ios, tvos, watchos)
   void pthread_jit_write_protect_np(int enabled);

   __API_AVAILABLE(macos(11.0))
   __API_UNAVAILABLE(ios, tvos, watchos)
   int pthread_jit_write_protect_supported_np(void);

 1. https://bugs.webkit.org/attachment.cgi?id=402515&action=prettypatch

 accel/tcg/cpu-exec.c      |  2 ++
 accel/tcg/translate-all.c |  6 ++++++
 include/qemu/osdep.h      |  3 +++
 tcg/tcg.c                 |  1 +
 util/osdep.c              | 22 ++++++++++++++++++++++
 5 files changed, 34 insertions(+)

Comments

Joelle van Dyne Jan. 3, 2021, 4:52 p.m. UTC | #1
MAC_OS_VERSION_11_0 is always defined. You can see in
usr/include/AvailabilityVersions.h

...

#define MAC_OS_X_VERSION_10_15      101500
#define MAC_OS_X_VERSION_10_15_1    101501
#define MAC_OS_X_VERSION_10_16      101600
#define MAC_OS_VERSION_11_0         110000

The proper way is to do an __builtin_available check but that assumes
you have the symbol for pthread_jit_write_protect_np which you won't
if building on 10.15, so you need a configure time check as well. I
have a newer version of my patch that I haven't submitted yet because
I was waiting for some other patches to go in first, but I can
decouple it from the iOS stuff and submit it as a separate patchset.

-j

On Sun, Jan 3, 2021 at 6:54 AM Roman Bolshakov <r.bolshakov@yadro.com> wrote:
>
> Pages can't be both write and executable at the same time on Apple
> Silicon. macOS provides public API to switch write protection [1] for
> JIT applications, like TCG.
>
> 1. https://developer.apple.com/documentation/apple_silicon/porting_just-in-time_compilers_to_apple_silicon
>
> Signed-off-by: Roman Bolshakov <r.bolshakov@yadro.com>
> ---
> v1: https://lists.gnu.org/archive/html/qemu-devel/2021-01/msg00073.html
> Changes since v1:
>
>  - Pruned not needed fiddling with W^X and dropped symmetry from write
>    lock/unlock and renamed related functions.
>    Similar approach is used in JavaScriptCore [1].
>
>  - Moved jit helper functions to util/osdep
>                                                                                                                                                   As outlined in osdep.h, this matches to (2):                                                                                                                                                                                                                                                    * In an ideal world this header would contain only:                                                                                            *  (1) things which everybody needs                                                                                                            *  (2) things without which code would work on most platforms but                                                                              *      fail to compile or misbehave on a minority of host OSes
>
>  - Fixed a checkpatch error
>
>  - Limit new behaviour only to macOS 11.0 and above, because of the
>    following declarations:
>
>    __API_AVAILABLE(macos(11.0))
>    __API_UNAVAILABLE(ios, tvos, watchos)
>    void pthread_jit_write_protect_np(int enabled);
>
>    __API_AVAILABLE(macos(11.0))
>    __API_UNAVAILABLE(ios, tvos, watchos)
>    int pthread_jit_write_protect_supported_np(void);
>
>  1. https://bugs.webkit.org/attachment.cgi?id=402515&action=prettypatch
>
>  accel/tcg/cpu-exec.c      |  2 ++
>  accel/tcg/translate-all.c |  6 ++++++
>  include/qemu/osdep.h      |  3 +++
>  tcg/tcg.c                 |  1 +
>  util/osdep.c              | 22 ++++++++++++++++++++++
>  5 files changed, 34 insertions(+)
>
> diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
> index 8689c54499..374060eb45 100644
> --- a/accel/tcg/cpu-exec.c
> +++ b/accel/tcg/cpu-exec.c
> @@ -175,6 +175,7 @@ static inline tcg_target_ulong cpu_tb_exec(CPUState *cpu, TranslationBlock *itb)
>      }
>  #endif /* DEBUG_DISAS */
>
> +    qemu_thread_jit_execute();
>      ret = tcg_qemu_tb_exec(env, tb_ptr);
>      cpu->can_do_io = 1;
>      last_tb = (TranslationBlock *)(ret & ~TB_EXIT_MASK);
> @@ -382,6 +383,7 @@ static inline void tb_add_jump(TranslationBlock *tb, int n,
>  {
>      uintptr_t old;
>
> +    qemu_thread_jit_write();
>      assert(n < ARRAY_SIZE(tb->jmp_list_next));
>      qemu_spin_lock(&tb_next->jmp_lock);
>
> diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
> index b7d50a73d4..88ae5d35ef 100644
> --- a/accel/tcg/translate-all.c
> +++ b/accel/tcg/translate-all.c
> @@ -1072,6 +1072,9 @@ static inline void *alloc_code_gen_buffer(void)
>      size_t size = tcg_ctx->code_gen_buffer_size;
>      void *buf;
>
> +#if defined(__APPLE__) && defined(MAC_OS_VERSION_11_0)
> +    flags |= MAP_JIT;
> +#endif
>      buf = mmap(NULL, size, prot, flags, -1, 0);
>      if (buf == MAP_FAILED) {
>          return NULL;
> @@ -1485,7 +1488,9 @@ static void do_tb_phys_invalidate(TranslationBlock *tb, bool rm_from_page_list)
>
>  static void tb_phys_invalidate__locked(TranslationBlock *tb)
>  {
> +    qemu_thread_jit_write();
>      do_tb_phys_invalidate(tb, true);
> +    qemu_thread_jit_execute();
>  }
>
>  /* invalidate one TB
> @@ -1687,6 +1692,7 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
>  #endif
>
>      assert_memory_lock();
> +    qemu_thread_jit_write();
>
>      phys_pc = get_page_addr_code(env, pc);
>
> diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
> index f9ec8c84e9..89abebcf5d 100644
> --- a/include/qemu/osdep.h
> +++ b/include/qemu/osdep.h
> @@ -686,4 +686,7 @@ char *qemu_get_host_name(Error **errp);
>   */
>  size_t qemu_get_host_physmem(void);
>
> +void qemu_thread_jit_write(void);
> +void qemu_thread_jit_execute(void);
> +
>  #endif
> diff --git a/tcg/tcg.c b/tcg/tcg.c
> index 43c6cf8f52..ab8488f5d5 100644
> --- a/tcg/tcg.c
> +++ b/tcg/tcg.c
> @@ -1065,6 +1065,7 @@ void tcg_prologue_init(TCGContext *s)
>      s->pool_labels = NULL;
>  #endif
>
> +    qemu_thread_jit_write();
>      /* Generate the prologue.  */
>      tcg_target_qemu_prologue(s);
>
> diff --git a/util/osdep.c b/util/osdep.c
> index 66d01b9160..80ec7185da 100644
> --- a/util/osdep.c
> +++ b/util/osdep.c
> @@ -606,3 +606,25 @@ writev(int fd, const struct iovec *iov, int iov_cnt)
>      return readv_writev(fd, iov, iov_cnt, true);
>  }
>  #endif
> +
> +#if defined(__APPLE__) && defined(MAC_OS_VERSION_11_0)
> +static inline void qemu_thread_jit_write_protect(bool enabled)
> +{
> +    if (pthread_jit_write_protect_supported_np()) {
> +        pthread_jit_write_protect_np(enabled);
> +    }
> +}
> +
> +void qemu_thread_jit_execute(void)
> +{
> +    qemu_thread_jit_write_protect(true);
> +}
> +
> +void qemu_thread_jit_write(void)
> +{
> +    qemu_thread_jit_write_protect(false);
> +}
> +#else
> +void qemu_thread_jit_write(void) {}
> +void qemu_thread_jit_execute(void) {}
> +#endif
> --
> 2.29.2
>
Roman Bolshakov Jan. 3, 2021, 9:21 p.m. UTC | #2
On Sun, Jan 03, 2021 at 08:52:52AM -0800, Joelle van Dyne wrote:
> MAC_OS_VERSION_11_0 is always defined. You can see in
> usr/include/AvailabilityVersions.h
> 

It's not defined on my old MPB that has Catalina (10.15.7). The last
entries are:

#define MAC_OS_X_VERSION_10_15      101500
#define MAC_OS_X_VERSION_10_15_1    101501

I was able to compile the patch on Catalina without any issues (and I've
checked Catalina SDK doesn't provide pthread_jit_write_protect).

> ...
> 
> #define MAC_OS_X_VERSION_10_15      101500
> #define MAC_OS_X_VERSION_10_15_1    101501
> #define MAC_OS_X_VERSION_10_16      101600
> #define MAC_OS_VERSION_11_0         110000
> 
> The proper way is to do an __builtin_available check but that assumes
> you have the symbol for pthread_jit_write_protect_np which you won't
> if building on 10.15, so you need a configure time check as well.

__builtin_available is a clang extension and I'm not sure if it's
available on GCC. But I can surely add a config-time check in v3 if you
find it more preferred for iOS host support.

> I have a newer version of my patch that I haven't submitted yet
> because I was waiting for some other patches to go in first, but I can
> decouple it from the iOS stuff and submit it as a separate patchset.
> 

I'm sorry I stepped in... I didn't want to bother anyone during NY
holidays and couldn't ask for new patch revision. So I hacked it for
myself because I recently got M1 laptop and some spare time off work. In
the patch I wanted to avoid conflicts with your iOS host support patches
by limiting the patch only to macOS.

Hopefully, qemu_thread_jit_write/execute provides the room to add
reverse-enginereed implementation of pthread_jit_write_protect_np for
iOS 13 in UTM app.

Thanks,
Roman
Alex Bennée Jan. 4, 2021, 3:23 p.m. UTC | #3
Roman Bolshakov <r.bolshakov@yadro.com> writes:

> Pages can't be both write and executable at the same time on Apple
> Silicon. macOS provides public API to switch write protection [1] for
> JIT applications, like TCG.
>
> 1. https://developer.apple.com/documentation/apple_silicon/porting_just-in-time_compilers_to_apple_silicon
>
> Signed-off-by: Roman Bolshakov <r.bolshakov@yadro.com>
> ---
> v1: https://lists.gnu.org/archive/html/qemu-devel/2021-01/msg00073.html
> Changes since v1:
>
>  - Pruned not needed fiddling with W^X and dropped symmetry from write
>    lock/unlock and renamed related functions.
>    Similar approach is used in JavaScriptCore [1].
>
>  - Moved jit helper functions to util/osdep
>                                                                                                                                                   As outlined in osdep.h, this matches to (2):                                                                                                                                                                                                                                                    * In an ideal world this header would contain only:                                                                                            *  (1) things which everybody needs                                                                                                            *  (2) things without which code would work on most platforms but                                                                              *      fail to compile or misbehave on a minority of host OSes
>
>  - Fixed a checkpatch error
>
>  - Limit new behaviour only to macOS 11.0 and above, because of the
>    following declarations:
>
>    __API_AVAILABLE(macos(11.0))
>    __API_UNAVAILABLE(ios, tvos, watchos)
>    void pthread_jit_write_protect_np(int enabled);
>
>    __API_AVAILABLE(macos(11.0))
>    __API_UNAVAILABLE(ios, tvos, watchos)
>    int pthread_jit_write_protect_supported_np(void);
>
>  1. https://bugs.webkit.org/attachment.cgi?id=402515&action=prettypatch
>
>  accel/tcg/cpu-exec.c      |  2 ++
>  accel/tcg/translate-all.c |  6 ++++++
>  include/qemu/osdep.h      |  3 +++
>  tcg/tcg.c                 |  1 +
>  util/osdep.c              | 22 ++++++++++++++++++++++
>  5 files changed, 34 insertions(+)
>
> diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
> index 8689c54499..374060eb45 100644
> --- a/accel/tcg/cpu-exec.c
> +++ b/accel/tcg/cpu-exec.c
> @@ -175,6 +175,7 @@ static inline tcg_target_ulong cpu_tb_exec(CPUState *cpu, TranslationBlock *itb)
>      }
>  #endif /* DEBUG_DISAS */
>  
> +    qemu_thread_jit_execute();
>      ret = tcg_qemu_tb_exec(env, tb_ptr);
>      cpu->can_do_io = 1;
>      last_tb = (TranslationBlock *)(ret & ~TB_EXIT_MASK);
> @@ -382,6 +383,7 @@ static inline void tb_add_jump(TranslationBlock *tb, int n,
>  {
>      uintptr_t old;
>  
> +    qemu_thread_jit_write();
>      assert(n < ARRAY_SIZE(tb->jmp_list_next));
>      qemu_spin_lock(&tb_next->jmp_lock);
>  
> diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
> index b7d50a73d4..88ae5d35ef 100644
> --- a/accel/tcg/translate-all.c
> +++ b/accel/tcg/translate-all.c
> @@ -1072,6 +1072,9 @@ static inline void *alloc_code_gen_buffer(void)
>      size_t size = tcg_ctx->code_gen_buffer_size;
>      void *buf;
>  
> +#if defined(__APPLE__) && defined(MAC_OS_VERSION_11_0)
> +    flags |= MAP_JIT;
> +#endif
>      buf = mmap(NULL, size, prot, flags, -1, 0);
>      if (buf == MAP_FAILED) {
>          return NULL;
> @@ -1485,7 +1488,9 @@ static void do_tb_phys_invalidate(TranslationBlock *tb, bool rm_from_page_list)
>  
>  static void tb_phys_invalidate__locked(TranslationBlock *tb)
>  {
> +    qemu_thread_jit_write();
>      do_tb_phys_invalidate(tb, true);
> +    qemu_thread_jit_execute();
>  }
>  
>  /* invalidate one TB
> @@ -1687,6 +1692,7 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
>  #endif
>  
>      assert_memory_lock();
> +    qemu_thread_jit_write();
>  
>      phys_pc = get_page_addr_code(env, pc);
>  
> diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
> index f9ec8c84e9..89abebcf5d 100644
> --- a/include/qemu/osdep.h
> +++ b/include/qemu/osdep.h
> @@ -686,4 +686,7 @@ char *qemu_get_host_name(Error **errp);
>   */
>  size_t qemu_get_host_physmem(void);
>  
> +void qemu_thread_jit_write(void);
> +void qemu_thread_jit_execute(void);
> +
>  #endif
> diff --git a/tcg/tcg.c b/tcg/tcg.c
> index 43c6cf8f52..ab8488f5d5 100644
> --- a/tcg/tcg.c
> +++ b/tcg/tcg.c
> @@ -1065,6 +1065,7 @@ void tcg_prologue_init(TCGContext *s)
>      s->pool_labels = NULL;
>  #endif
>  
> +    qemu_thread_jit_write();
>      /* Generate the prologue.  */
>      tcg_target_qemu_prologue(s);
>  
> diff --git a/util/osdep.c b/util/osdep.c
> index 66d01b9160..80ec7185da 100644
> --- a/util/osdep.c
> +++ b/util/osdep.c
> @@ -606,3 +606,25 @@ writev(int fd, const struct iovec *iov, int iov_cnt)
>      return readv_writev(fd, iov, iov_cnt, true);
>  }
>  #endif
> +
> +#if defined(__APPLE__) && defined(MAC_OS_VERSION_11_0)
> +static inline void qemu_thread_jit_write_protect(bool enabled)
> +{
> +    if (pthread_jit_write_protect_supported_np()) {
> +        pthread_jit_write_protect_np(enabled);
> +    }
> +}
> +
> +void qemu_thread_jit_execute(void)
> +{
> +    qemu_thread_jit_write_protect(true);
> +}
> +
> +void qemu_thread_jit_write(void)
> +{
> +    qemu_thread_jit_write_protect(false);
> +}

What happens if you emulate a -smp 2 ARM guest? In this case MTTCG
should be enabled (same guest ordering) but you run a risk of attempting
to execute code while write is enabled.

Is there any way to only change the mapping for the parts of the TB
cache used by a thread? Otherwise we'll need additional logic in
default_mttcg_enabled to ensure we don't accidentally enable it on Apple
silicon.

> +#else
> +void qemu_thread_jit_write(void) {}
> +void qemu_thread_jit_execute(void) {}
> +#endif
Alexander Graf Jan. 4, 2021, 6:39 p.m. UTC | #4
On 04.01.21 16:23, Alex Bennée wrote:
> Roman Bolshakov <r.bolshakov@yadro.com> writes:
>
>> Pages can't be both write and executable at the same time on Apple
>> Silicon. macOS provides public API to switch write protection [1] for
>> JIT applications, like TCG.
>>
>> 1. https://developer.apple.com/documentation/apple_silicon/porting_just-in-time_compilers_to_apple_silicon
>>
>> Signed-off-by: Roman Bolshakov <r.bolshakov@yadro.com>
>> ---
>> v1: https://lists.gnu.org/archive/html/qemu-devel/2021-01/msg00073.html
>> Changes since v1:
>>
>>   - Pruned not needed fiddling with W^X and dropped symmetry from write
>>     lock/unlock and renamed related functions.
>>     Similar approach is used in JavaScriptCore [1].
>>
>>   - Moved jit helper functions to util/osdep
>>                                                                                                                                                    As outlined in osdep.h, this matches to (2):                                                                                                                                                                                                                                                    * In an ideal world this header would contain only:                                                                                            *  (1) things which everybody needs                                                                                                            *  (2) things without which code would work on most platforms but                                                                              *      fail to compile or misbehave on a minority of host OSes
>>
>>   - Fixed a checkpatch error
>>
>>   - Limit new behaviour only to macOS 11.0 and above, because of the
>>     following declarations:
>>
>>     __API_AVAILABLE(macos(11.0))
>>     __API_UNAVAILABLE(ios, tvos, watchos)
>>     void pthread_jit_write_protect_np(int enabled);
>>
>>     __API_AVAILABLE(macos(11.0))
>>     __API_UNAVAILABLE(ios, tvos, watchos)
>>     int pthread_jit_write_protect_supported_np(void);
>>
>>   1. https://bugs.webkit.org/attachment.cgi?id=402515&action=prettypatch
>>
>>   accel/tcg/cpu-exec.c      |  2 ++
>>   accel/tcg/translate-all.c |  6 ++++++
>>   include/qemu/osdep.h      |  3 +++
>>   tcg/tcg.c                 |  1 +
>>   util/osdep.c              | 22 ++++++++++++++++++++++
>>   5 files changed, 34 insertions(+)
>>
>> diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
>> index 8689c54499..374060eb45 100644
>> --- a/accel/tcg/cpu-exec.c
>> +++ b/accel/tcg/cpu-exec.c
>> @@ -175,6 +175,7 @@ static inline tcg_target_ulong cpu_tb_exec(CPUState *cpu, TranslationBlock *itb)
>>       }
>>   #endif /* DEBUG_DISAS */
>>   
>> +    qemu_thread_jit_execute();
>>       ret = tcg_qemu_tb_exec(env, tb_ptr);
>>       cpu->can_do_io = 1;
>>       last_tb = (TranslationBlock *)(ret & ~TB_EXIT_MASK);
>> @@ -382,6 +383,7 @@ static inline void tb_add_jump(TranslationBlock *tb, int n,
>>   {
>>       uintptr_t old;
>>   
>> +    qemu_thread_jit_write();
>>       assert(n < ARRAY_SIZE(tb->jmp_list_next));
>>       qemu_spin_lock(&tb_next->jmp_lock);
>>   
>> diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
>> index b7d50a73d4..88ae5d35ef 100644
>> --- a/accel/tcg/translate-all.c
>> +++ b/accel/tcg/translate-all.c
>> @@ -1072,6 +1072,9 @@ static inline void *alloc_code_gen_buffer(void)
>>       size_t size = tcg_ctx->code_gen_buffer_size;
>>       void *buf;
>>   
>> +#if defined(__APPLE__) && defined(MAC_OS_VERSION_11_0)
>> +    flags |= MAP_JIT;
>> +#endif
>>       buf = mmap(NULL, size, prot, flags, -1, 0);
>>       if (buf == MAP_FAILED) {
>>           return NULL;
>> @@ -1485,7 +1488,9 @@ static void do_tb_phys_invalidate(TranslationBlock *tb, bool rm_from_page_list)
>>   
>>   static void tb_phys_invalidate__locked(TranslationBlock *tb)
>>   {
>> +    qemu_thread_jit_write();
>>       do_tb_phys_invalidate(tb, true);
>> +    qemu_thread_jit_execute();
>>   }
>>   
>>   /* invalidate one TB
>> @@ -1687,6 +1692,7 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
>>   #endif
>>   
>>       assert_memory_lock();
>> +    qemu_thread_jit_write();
>>   
>>       phys_pc = get_page_addr_code(env, pc);
>>   
>> diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
>> index f9ec8c84e9..89abebcf5d 100644
>> --- a/include/qemu/osdep.h
>> +++ b/include/qemu/osdep.h
>> @@ -686,4 +686,7 @@ char *qemu_get_host_name(Error **errp);
>>    */
>>   size_t qemu_get_host_physmem(void);
>>   
>> +void qemu_thread_jit_write(void);
>> +void qemu_thread_jit_execute(void);
>> +
>>   #endif
>> diff --git a/tcg/tcg.c b/tcg/tcg.c
>> index 43c6cf8f52..ab8488f5d5 100644
>> --- a/tcg/tcg.c
>> +++ b/tcg/tcg.c
>> @@ -1065,6 +1065,7 @@ void tcg_prologue_init(TCGContext *s)
>>       s->pool_labels = NULL;
>>   #endif
>>   
>> +    qemu_thread_jit_write();
>>       /* Generate the prologue.  */
>>       tcg_target_qemu_prologue(s);
>>   
>> diff --git a/util/osdep.c b/util/osdep.c
>> index 66d01b9160..80ec7185da 100644
>> --- a/util/osdep.c
>> +++ b/util/osdep.c
>> @@ -606,3 +606,25 @@ writev(int fd, const struct iovec *iov, int iov_cnt)
>>       return readv_writev(fd, iov, iov_cnt, true);
>>   }
>>   #endif
>> +
>> +#if defined(__APPLE__) && defined(MAC_OS_VERSION_11_0)


Will this be defined in future versions?


>> +static inline void qemu_thread_jit_write_protect(bool enabled)
>> +{
>> +    if (pthread_jit_write_protect_supported_np()) {


Do we need this call? Sounds like extra overhead to me.


>> +        pthread_jit_write_protect_np(enabled);
>> +    }
>> +}
>> +
>> +void qemu_thread_jit_execute(void)
>> +{
>> +    qemu_thread_jit_write_protect(true);
>> +}
>> +
>> +void qemu_thread_jit_write(void)
>> +{
>> +    qemu_thread_jit_write_protect(false);
>> +}
> What happens if you emulate a -smp 2 ARM guest? In this case MTTCG
> should be enabled (same guest ordering) but you run a risk of attempting
> to execute code while write is enabled.
>
> Is there any way to only change the mapping for the parts of the TB
> cache used by a thread? Otherwise we'll need additional logic in
> default_mttcg_enabled to ensure we don't accidentally enable it on Apple
> silicon.


The actual protection logic is per thread, so the MTTCG side thread 
won't be affected by the flips.

Given this super specific semantic that is impossible to mimic on other 
platforms, we should probably name the functions accordingly and make 
sure people understand this is *only* for macos.

Also, is there anything this patch doesn't do that the one from Joelle 
does? It seems a bit ... short.


Alex
Alex Bennée Jan. 4, 2021, 8:28 p.m. UTC | #5
Alexander Graf <agraf@csgraf.de> writes:

> On 04.01.21 16:23, Alex Bennée wrote:
>> Roman Bolshakov <r.bolshakov@yadro.com> writes:
>>
>>> Pages can't be both write and executable at the same time on Apple
>>> Silicon. macOS provides public API to switch write protection [1] for
>>> JIT applications, like TCG.
>>>
>>> 1. https://developer.apple.com/documentation/apple_silicon/porting_just-in-time_compilers_to_apple_silicon
>>>
>>> Signed-off-by: Roman Bolshakov <r.bolshakov@yadro.com>
>>> ---
>>> v1: https://lists.gnu.org/archive/html/qemu-devel/2021-01/msg00073.html
>>> Changes since v1:
>>>
>>>   - Pruned not needed fiddling with W^X and dropped symmetry from write
>>>     lock/unlock and renamed related functions.
>>>     Similar approach is used in JavaScriptCore [1].
>>>
>>>   - Moved jit helper functions to util/osdep
>>>                                                                                                                                                    As outlined in osdep.h, this matches to (2):                                                                                                                                                                                                                                                    * In an ideal world this header would contain only:                                                                                            *  (1) things which everybody needs                                                                                                            *  (2) things without which code would work on most platforms but                                                                              *      fail to compile or misbehave on a minority of host OSes
>>>
>>>   - Fixed a checkpatch error
>>>
>>>   - Limit new behaviour only to macOS 11.0 and above, because of the
>>>     following declarations:
>>>
>>>     __API_AVAILABLE(macos(11.0))
>>>     __API_UNAVAILABLE(ios, tvos, watchos)
>>>     void pthread_jit_write_protect_np(int enabled);
>>>
>>>     __API_AVAILABLE(macos(11.0))
>>>     __API_UNAVAILABLE(ios, tvos, watchos)
>>>     int pthread_jit_write_protect_supported_np(void);
>>>
>>>   1. https://bugs.webkit.org/attachment.cgi?id=402515&action=prettypatch
>>>
>>>   accel/tcg/cpu-exec.c      |  2 ++
>>>   accel/tcg/translate-all.c |  6 ++++++
>>>   include/qemu/osdep.h      |  3 +++
>>>   tcg/tcg.c                 |  1 +
>>>   util/osdep.c              | 22 ++++++++++++++++++++++
>>>   5 files changed, 34 insertions(+)
>>>
>>> diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
>>> index 8689c54499..374060eb45 100644
>>> --- a/accel/tcg/cpu-exec.c
>>> +++ b/accel/tcg/cpu-exec.c
>>> @@ -175,6 +175,7 @@ static inline tcg_target_ulong cpu_tb_exec(CPUState *cpu, TranslationBlock *itb)
>>>       }
>>>   #endif /* DEBUG_DISAS */
>>>   
>>> +    qemu_thread_jit_execute();
>>>       ret = tcg_qemu_tb_exec(env, tb_ptr);
>>>       cpu->can_do_io = 1;
>>>       last_tb = (TranslationBlock *)(ret & ~TB_EXIT_MASK);
>>> @@ -382,6 +383,7 @@ static inline void tb_add_jump(TranslationBlock *tb, int n,
>>>   {
>>>       uintptr_t old;
>>>   
>>> +    qemu_thread_jit_write();
>>>       assert(n < ARRAY_SIZE(tb->jmp_list_next));
>>>       qemu_spin_lock(&tb_next->jmp_lock);
>>>   
>>> diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
>>> index b7d50a73d4..88ae5d35ef 100644
>>> --- a/accel/tcg/translate-all.c
>>> +++ b/accel/tcg/translate-all.c
>>> @@ -1072,6 +1072,9 @@ static inline void *alloc_code_gen_buffer(void)
>>>       size_t size = tcg_ctx->code_gen_buffer_size;
>>>       void *buf;
>>>   
>>> +#if defined(__APPLE__) && defined(MAC_OS_VERSION_11_0)
>>> +    flags |= MAP_JIT;
>>> +#endif
>>>       buf = mmap(NULL, size, prot, flags, -1, 0);
>>>       if (buf == MAP_FAILED) {
>>>           return NULL;
>>> @@ -1485,7 +1488,9 @@ static void do_tb_phys_invalidate(TranslationBlock *tb, bool rm_from_page_list)
>>>   
>>>   static void tb_phys_invalidate__locked(TranslationBlock *tb)
>>>   {
>>> +    qemu_thread_jit_write();
>>>       do_tb_phys_invalidate(tb, true);
>>> +    qemu_thread_jit_execute();
>>>   }
>>>   
>>>   /* invalidate one TB
>>> @@ -1687,6 +1692,7 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
>>>   #endif
>>>   
>>>       assert_memory_lock();
>>> +    qemu_thread_jit_write();
>>>   
>>>       phys_pc = get_page_addr_code(env, pc);
>>>   
>>> diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
>>> index f9ec8c84e9..89abebcf5d 100644
>>> --- a/include/qemu/osdep.h
>>> +++ b/include/qemu/osdep.h
>>> @@ -686,4 +686,7 @@ char *qemu_get_host_name(Error **errp);
>>>    */
>>>   size_t qemu_get_host_physmem(void);
>>>   
>>> +void qemu_thread_jit_write(void);
>>> +void qemu_thread_jit_execute(void);
>>> +
>>>   #endif
>>> diff --git a/tcg/tcg.c b/tcg/tcg.c
>>> index 43c6cf8f52..ab8488f5d5 100644
>>> --- a/tcg/tcg.c
>>> +++ b/tcg/tcg.c
>>> @@ -1065,6 +1065,7 @@ void tcg_prologue_init(TCGContext *s)
>>>       s->pool_labels = NULL;
>>>   #endif
>>>   
>>> +    qemu_thread_jit_write();
>>>       /* Generate the prologue.  */
>>>       tcg_target_qemu_prologue(s);
>>>   
>>> diff --git a/util/osdep.c b/util/osdep.c
>>> index 66d01b9160..80ec7185da 100644
>>> --- a/util/osdep.c
>>> +++ b/util/osdep.c
>>> @@ -606,3 +606,25 @@ writev(int fd, const struct iovec *iov, int iov_cnt)
>>>       return readv_writev(fd, iov, iov_cnt, true);
>>>   }
>>>   #endif
>>> +
>>> +#if defined(__APPLE__) && defined(MAC_OS_VERSION_11_0)
>
>
> Will this be defined in future versions?
>
>
>>> +static inline void qemu_thread_jit_write_protect(bool enabled)
>>> +{
>>> +    if (pthread_jit_write_protect_supported_np()) {
>
>
> Do we need this call? Sounds like extra overhead to me.
>
>
>>> +        pthread_jit_write_protect_np(enabled);
>>> +    }
>>> +}
>>> +
>>> +void qemu_thread_jit_execute(void)
>>> +{
>>> +    qemu_thread_jit_write_protect(true);
>>> +}
>>> +
>>> +void qemu_thread_jit_write(void)
>>> +{
>>> +    qemu_thread_jit_write_protect(false);
>>> +}
>> What happens if you emulate a -smp 2 ARM guest? In this case MTTCG
>> should be enabled (same guest ordering) but you run a risk of attempting
>> to execute code while write is enabled.
>>
>> Is there any way to only change the mapping for the parts of the TB
>> cache used by a thread? Otherwise we'll need additional logic in
>> default_mttcg_enabled to ensure we don't accidentally enable it on Apple
>> silicon.
>
>
> The actual protection logic is per thread, so the MTTCG side thread 
> won't be affected by the flips.

Just to be clear you are saying each thread has it's own mappings with
potentially different protection per page? I had always assumed the
mappings where per-process.

I'm not sure what you mean by side-thread. Code generation is in the
context of the running thread and while each tb region can only be
written by one thread all threads can run code residing in all regions.

> Given this super specific semantic that is impossible to mimic on other 
> platforms, we should probably name the functions accordingly and make 
> sure people understand this is *only* for macos.
>
> Also, is there anything this patch doesn't do that the one from Joelle 
> does? It seems a bit ... short.
>
>
> Alex
Joelle van Dyne Jan. 5, 2021, 2:02 a.m. UTC | #6
Tested-by: Joelle van Dyne <j@getutm.app>

It works for me. But one thing is that if you build it with the macOS
11.x SDK it won't run on < 11.x. This is why apple recommends
something like:

        if (__builtin_available(macOS 11, *)) {
            pthread_jit_write_protect_np();
        }

You still need a compile time check like MAC_OS_VERSION_11_0 to
support linking with older SDKs.

On Sun, Jan 3, 2021 at 6:54 AM Roman Bolshakov <r.bolshakov@yadro.com> wrote:
>
> Pages can't be both write and executable at the same time on Apple
> Silicon. macOS provides public API to switch write protection [1] for
> JIT applications, like TCG.
>
> 1. https://developer.apple.com/documentation/apple_silicon/porting_just-in-time_compilers_to_apple_silicon
>
> Signed-off-by: Roman Bolshakov <r.bolshakov@yadro.com>
> ---
> v1: https://lists.gnu.org/archive/html/qemu-devel/2021-01/msg00073.html
> Changes since v1:
>
>  - Pruned not needed fiddling with W^X and dropped symmetry from write
>    lock/unlock and renamed related functions.
>    Similar approach is used in JavaScriptCore [1].
>
>  - Moved jit helper functions to util/osdep
>                                                                                                                                                   As outlined in osdep.h, this matches to (2):                                                                                                                                                                                                                                                    * In an ideal world this header would contain only:                                                                                            *  (1) things which everybody needs                                                                                                            *  (2) things without which code would work on most platforms but                                                                              *      fail to compile or misbehave on a minority of host OSes
>
>  - Fixed a checkpatch error
>
>  - Limit new behaviour only to macOS 11.0 and above, because of the
>    following declarations:
>
>    __API_AVAILABLE(macos(11.0))
>    __API_UNAVAILABLE(ios, tvos, watchos)
>    void pthread_jit_write_protect_np(int enabled);
>
>    __API_AVAILABLE(macos(11.0))
>    __API_UNAVAILABLE(ios, tvos, watchos)
>    int pthread_jit_write_protect_supported_np(void);
>
>  1. https://bugs.webkit.org/attachment.cgi?id=402515&action=prettypatch
>
>  accel/tcg/cpu-exec.c      |  2 ++
>  accel/tcg/translate-all.c |  6 ++++++
>  include/qemu/osdep.h      |  3 +++
>  tcg/tcg.c                 |  1 +
>  util/osdep.c              | 22 ++++++++++++++++++++++
>  5 files changed, 34 insertions(+)
>
> diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
> index 8689c54499..374060eb45 100644
> --- a/accel/tcg/cpu-exec.c
> +++ b/accel/tcg/cpu-exec.c
> @@ -175,6 +175,7 @@ static inline tcg_target_ulong cpu_tb_exec(CPUState *cpu, TranslationBlock *itb)
>      }
>  #endif /* DEBUG_DISAS */
>
> +    qemu_thread_jit_execute();
>      ret = tcg_qemu_tb_exec(env, tb_ptr);
>      cpu->can_do_io = 1;
>      last_tb = (TranslationBlock *)(ret & ~TB_EXIT_MASK);
> @@ -382,6 +383,7 @@ static inline void tb_add_jump(TranslationBlock *tb, int n,
>  {
>      uintptr_t old;
>
> +    qemu_thread_jit_write();
>      assert(n < ARRAY_SIZE(tb->jmp_list_next));
>      qemu_spin_lock(&tb_next->jmp_lock);
>
> diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
> index b7d50a73d4..88ae5d35ef 100644
> --- a/accel/tcg/translate-all.c
> +++ b/accel/tcg/translate-all.c
> @@ -1072,6 +1072,9 @@ static inline void *alloc_code_gen_buffer(void)
>      size_t size = tcg_ctx->code_gen_buffer_size;
>      void *buf;
>
> +#if defined(__APPLE__) && defined(MAC_OS_VERSION_11_0)
> +    flags |= MAP_JIT;
> +#endif
>      buf = mmap(NULL, size, prot, flags, -1, 0);
>      if (buf == MAP_FAILED) {
>          return NULL;
> @@ -1485,7 +1488,9 @@ static void do_tb_phys_invalidate(TranslationBlock *tb, bool rm_from_page_list)
>
>  static void tb_phys_invalidate__locked(TranslationBlock *tb)
>  {
> +    qemu_thread_jit_write();
>      do_tb_phys_invalidate(tb, true);
> +    qemu_thread_jit_execute();
>  }
>
>  /* invalidate one TB
> @@ -1687,6 +1692,7 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
>  #endif
>
>      assert_memory_lock();
> +    qemu_thread_jit_write();
>
>      phys_pc = get_page_addr_code(env, pc);
>
> diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
> index f9ec8c84e9..89abebcf5d 100644
> --- a/include/qemu/osdep.h
> +++ b/include/qemu/osdep.h
> @@ -686,4 +686,7 @@ char *qemu_get_host_name(Error **errp);
>   */
>  size_t qemu_get_host_physmem(void);
>
> +void qemu_thread_jit_write(void);
> +void qemu_thread_jit_execute(void);
> +
>  #endif
> diff --git a/tcg/tcg.c b/tcg/tcg.c
> index 43c6cf8f52..ab8488f5d5 100644
> --- a/tcg/tcg.c
> +++ b/tcg/tcg.c
> @@ -1065,6 +1065,7 @@ void tcg_prologue_init(TCGContext *s)
>      s->pool_labels = NULL;
>  #endif
>
> +    qemu_thread_jit_write();
>      /* Generate the prologue.  */
>      tcg_target_qemu_prologue(s);
>
> diff --git a/util/osdep.c b/util/osdep.c
> index 66d01b9160..80ec7185da 100644
> --- a/util/osdep.c
> +++ b/util/osdep.c
> @@ -606,3 +606,25 @@ writev(int fd, const struct iovec *iov, int iov_cnt)
>      return readv_writev(fd, iov, iov_cnt, true);
>  }
>  #endif
> +
> +#if defined(__APPLE__) && defined(MAC_OS_VERSION_11_0)
> +static inline void qemu_thread_jit_write_protect(bool enabled)
> +{
> +    if (pthread_jit_write_protect_supported_np()) {
> +        pthread_jit_write_protect_np(enabled);
> +    }
> +}
> +
> +void qemu_thread_jit_execute(void)
> +{
> +    qemu_thread_jit_write_protect(true);
> +}
> +
> +void qemu_thread_jit_write(void)
> +{
> +    qemu_thread_jit_write_protect(false);
> +}
> +#else
> +void qemu_thread_jit_write(void) {}
> +void qemu_thread_jit_execute(void) {}
> +#endif
> --
> 2.29.2
>
Roman Bolshakov Jan. 5, 2021, 8:34 p.m. UTC | #7
On Mon, Jan 04, 2021 at 03:23:07PM +0000, Alex Bennée wrote:
> 
> Roman Bolshakov <r.bolshakov@yadro.com> writes:
> 
> > Pages can't be both write and executable at the same time on Apple
> > Silicon. macOS provides public API to switch write protection [1] for
> > JIT applications, like TCG.
> >
> > 1. https://developer.apple.com/documentation/apple_silicon/porting_just-in-time_compilers_to_apple_silicon
> >
> > Signed-off-by: Roman Bolshakov <r.bolshakov@yadro.com>
> > ---
> > v1: https://lists.gnu.org/archive/html/qemu-devel/2021-01/msg00073.html
> > Changes since v1:
> >
> >  - Pruned not needed fiddling with W^X and dropped symmetry from write
> >    lock/unlock and renamed related functions.
> >    Similar approach is used in JavaScriptCore [1].
> >
> >  - Moved jit helper functions to util/osdep
> >                                                                                                                                                   As outlined in osdep.h, this matches to (2):                                                                                                                                                                                                                                                    * In an ideal world this header would contain only:                                                                                            *  (1) things which everybody needs                                                                                                            *  (2) things without which code would work on most platforms but                                                                              *      fail to compile or misbehave on a minority of host OSes
> >
> >  - Fixed a checkpatch error
> >
> >  - Limit new behaviour only to macOS 11.0 and above, because of the
> >    following declarations:
> >
> >    __API_AVAILABLE(macos(11.0))
> >    __API_UNAVAILABLE(ios, tvos, watchos)
> >    void pthread_jit_write_protect_np(int enabled);
> >
> >    __API_AVAILABLE(macos(11.0))
> >    __API_UNAVAILABLE(ios, tvos, watchos)
> >    int pthread_jit_write_protect_supported_np(void);
> >
> >  1. https://bugs.webkit.org/attachment.cgi?id=402515&action=prettypatch
> >
> >  accel/tcg/cpu-exec.c      |  2 ++
> >  accel/tcg/translate-all.c |  6 ++++++
> >  include/qemu/osdep.h      |  3 +++
> >  tcg/tcg.c                 |  1 +
> >  util/osdep.c              | 22 ++++++++++++++++++++++
> >  5 files changed, 34 insertions(+)
> >
> > diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
> > index 8689c54499..374060eb45 100644
> > --- a/accel/tcg/cpu-exec.c
> > +++ b/accel/tcg/cpu-exec.c
> > @@ -175,6 +175,7 @@ static inline tcg_target_ulong cpu_tb_exec(CPUState *cpu, TranslationBlock *itb)
> >      }
> >  #endif /* DEBUG_DISAS */
> >  
> > +    qemu_thread_jit_execute();
> >      ret = tcg_qemu_tb_exec(env, tb_ptr);
> >      cpu->can_do_io = 1;
> >      last_tb = (TranslationBlock *)(ret & ~TB_EXIT_MASK);
> > @@ -382,6 +383,7 @@ static inline void tb_add_jump(TranslationBlock *tb, int n,
> >  {
> >      uintptr_t old;
> >  
> > +    qemu_thread_jit_write();
> >      assert(n < ARRAY_SIZE(tb->jmp_list_next));
> >      qemu_spin_lock(&tb_next->jmp_lock);
> >  
> > diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
> > index b7d50a73d4..88ae5d35ef 100644
> > --- a/accel/tcg/translate-all.c
> > +++ b/accel/tcg/translate-all.c
> > @@ -1072,6 +1072,9 @@ static inline void *alloc_code_gen_buffer(void)
> >      size_t size = tcg_ctx->code_gen_buffer_size;
> >      void *buf;
> >  
> > +#if defined(__APPLE__) && defined(MAC_OS_VERSION_11_0)
> > +    flags |= MAP_JIT;
> > +#endif
> >      buf = mmap(NULL, size, prot, flags, -1, 0);
> >      if (buf == MAP_FAILED) {
> >          return NULL;
> > @@ -1485,7 +1488,9 @@ static void do_tb_phys_invalidate(TranslationBlock *tb, bool rm_from_page_list)
> >  
> >  static void tb_phys_invalidate__locked(TranslationBlock *tb)
> >  {
> > +    qemu_thread_jit_write();
> >      do_tb_phys_invalidate(tb, true);
> > +    qemu_thread_jit_execute();
> >  }
> >  
> >  /* invalidate one TB
> > @@ -1687,6 +1692,7 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
> >  #endif
> >  
> >      assert_memory_lock();
> > +    qemu_thread_jit_write();
> >  
> >      phys_pc = get_page_addr_code(env, pc);
> >  
> > diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
> > index f9ec8c84e9..89abebcf5d 100644
> > --- a/include/qemu/osdep.h
> > +++ b/include/qemu/osdep.h
> > @@ -686,4 +686,7 @@ char *qemu_get_host_name(Error **errp);
> >   */
> >  size_t qemu_get_host_physmem(void);
> >  
> > +void qemu_thread_jit_write(void);
> > +void qemu_thread_jit_execute(void);
> > +
> >  #endif
> > diff --git a/tcg/tcg.c b/tcg/tcg.c
> > index 43c6cf8f52..ab8488f5d5 100644
> > --- a/tcg/tcg.c
> > +++ b/tcg/tcg.c
> > @@ -1065,6 +1065,7 @@ void tcg_prologue_init(TCGContext *s)
> >      s->pool_labels = NULL;
> >  #endif
> >  
> > +    qemu_thread_jit_write();
> >      /* Generate the prologue.  */
> >      tcg_target_qemu_prologue(s);
> >  
> > diff --git a/util/osdep.c b/util/osdep.c
> > index 66d01b9160..80ec7185da 100644
> > --- a/util/osdep.c
> > +++ b/util/osdep.c
> > @@ -606,3 +606,25 @@ writev(int fd, const struct iovec *iov, int iov_cnt)
> >      return readv_writev(fd, iov, iov_cnt, true);
> >  }
> >  #endif
> > +
> > +#if defined(__APPLE__) && defined(MAC_OS_VERSION_11_0)
> > +static inline void qemu_thread_jit_write_protect(bool enabled)
> > +{
> > +    if (pthread_jit_write_protect_supported_np()) {
> > +        pthread_jit_write_protect_np(enabled);
> > +    }
> > +}
> > +
> > +void qemu_thread_jit_execute(void)
> > +{
> > +    qemu_thread_jit_write_protect(true);
> > +}
> > +
> > +void qemu_thread_jit_write(void)
> > +{
> > +    qemu_thread_jit_write_protect(false);
> > +}
> 
> What happens if you emulate a -smp 2 ARM guest? In this case MTTCG
> should be enabled (same guest ordering) but you run a risk of attempting
> to execute code while write is enabled.
> 

Hi Alex,

Thanks for providing a hint. Ubuntu ARM with -smp 4 boots and works. I
can see 4 CPU in the guest and use the VM without any crashes (but it
requires patience as it's much slower compared to hvf).

> Is there any way to only change the mapping for the parts of the TB
> cache used by a thread? Otherwise we'll need additional logic in
> default_mttcg_enabled to ensure we don't accidentally enable it on Apple
> silicon.

I'm not sure I understand the question. The mappings are changed only
for the thread that invokes pthread_jit_write_protect_np(). Each thread
has its own permissions for MAP_JIT region. As far as I understand MTTCG
works fine with the series as I've seen 376% CPU utilization at times
with -smp 4, regardless whether MTTCG is specified explicitly
(-accel tcg,thread=multi) or not. Respectively, default ARM on ARM is
MTTCG and it works fine, we don't need to disable it :)

Thanks,
Roman

> 
> > +#else
> > +void qemu_thread_jit_write(void) {}
> > +void qemu_thread_jit_execute(void) {}
> > +#endif
> 
> 
> -- 
> Alex Bennée
Roman Bolshakov Jan. 5, 2021, 9:39 p.m. UTC | #8
On Mon, Jan 04, 2021 at 07:39:13PM +0100, Alexander Graf wrote:
> 
> On 04.01.21 16:23, Alex Bennée wrote:
> > Roman Bolshakov <r.bolshakov@yadro.com> writes:
> > 
> > > Pages can't be both write and executable at the same time on Apple
> > > Silicon. macOS provides public API to switch write protection [1] for
> > > JIT applications, like TCG.
> > > 
> > > 1. https://developer.apple.com/documentation/apple_silicon/porting_just-in-time_compilers_to_apple_silicon
> > > 
> > > Signed-off-by: Roman Bolshakov <r.bolshakov@yadro.com>
> > > ---
> > > v1: https://lists.gnu.org/archive/html/qemu-devel/2021-01/msg00073.html
> > > Changes since v1:
> > > 
> > >   - Pruned not needed fiddling with W^X and dropped symmetry from write
> > >     lock/unlock and renamed related functions.
> > >     Similar approach is used in JavaScriptCore [1].
> > > 
> > >   - Moved jit helper functions to util/osdep
> > >                                                                                                                                                    As outlined in osdep.h, this matches to (2):                                                                                                                                                                                                                                                    * In an ideal world this header would contain only:                                                                                            *  (1) things which everybody needs                                                                                                            *  (2) things without which code would work on most platforms but                                                                              *      fail to compile or misbehave on a minority of host OSes
> > > 
> > >   - Fixed a checkpatch error
> > > 
> > >   - Limit new behaviour only to macOS 11.0 and above, because of the
> > >     following declarations:
> > > 
> > >     __API_AVAILABLE(macos(11.0))
> > >     __API_UNAVAILABLE(ios, tvos, watchos)
> > >     void pthread_jit_write_protect_np(int enabled);
> > > 
> > >     __API_AVAILABLE(macos(11.0))
> > >     __API_UNAVAILABLE(ios, tvos, watchos)
> > >     int pthread_jit_write_protect_supported_np(void);
> > > 
> > >   1. https://bugs.webkit.org/attachment.cgi?id=402515&action=prettypatch
> > > 
> > >   accel/tcg/cpu-exec.c      |  2 ++
> > >   accel/tcg/translate-all.c |  6 ++++++
> > >   include/qemu/osdep.h      |  3 +++
> > >   tcg/tcg.c                 |  1 +
> > >   util/osdep.c              | 22 ++++++++++++++++++++++
> > >   5 files changed, 34 insertions(+)
> > > 
> > > diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
> > > index 8689c54499..374060eb45 100644
> > > --- a/accel/tcg/cpu-exec.c
> > > +++ b/accel/tcg/cpu-exec.c
> > > @@ -175,6 +175,7 @@ static inline tcg_target_ulong cpu_tb_exec(CPUState *cpu, TranslationBlock *itb)
> > >       }
> > >   #endif /* DEBUG_DISAS */
> > > +    qemu_thread_jit_execute();
> > >       ret = tcg_qemu_tb_exec(env, tb_ptr);
> > >       cpu->can_do_io = 1;
> > >       last_tb = (TranslationBlock *)(ret & ~TB_EXIT_MASK);
> > > @@ -382,6 +383,7 @@ static inline void tb_add_jump(TranslationBlock *tb, int n,
> > >   {
> > >       uintptr_t old;
> > > +    qemu_thread_jit_write();
> > >       assert(n < ARRAY_SIZE(tb->jmp_list_next));
> > >       qemu_spin_lock(&tb_next->jmp_lock);
> > > diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
> > > index b7d50a73d4..88ae5d35ef 100644
> > > --- a/accel/tcg/translate-all.c
> > > +++ b/accel/tcg/translate-all.c
> > > @@ -1072,6 +1072,9 @@ static inline void *alloc_code_gen_buffer(void)
> > >       size_t size = tcg_ctx->code_gen_buffer_size;
> > >       void *buf;
> > > +#if defined(__APPLE__) && defined(MAC_OS_VERSION_11_0)
> > > +    flags |= MAP_JIT;
> > > +#endif
> > >       buf = mmap(NULL, size, prot, flags, -1, 0);
> > >       if (buf == MAP_FAILED) {
> > >           return NULL;
> > > @@ -1485,7 +1488,9 @@ static void do_tb_phys_invalidate(TranslationBlock *tb, bool rm_from_page_list)
> > >   static void tb_phys_invalidate__locked(TranslationBlock *tb)
> > >   {
> > > +    qemu_thread_jit_write();
> > >       do_tb_phys_invalidate(tb, true);
> > > +    qemu_thread_jit_execute();
> > >   }
> > >   /* invalidate one TB
> > > @@ -1687,6 +1692,7 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
> > >   #endif
> > >       assert_memory_lock();
> > > +    qemu_thread_jit_write();
> > >       phys_pc = get_page_addr_code(env, pc);
> > > diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
> > > index f9ec8c84e9..89abebcf5d 100644
> > > --- a/include/qemu/osdep.h
> > > +++ b/include/qemu/osdep.h
> > > @@ -686,4 +686,7 @@ char *qemu_get_host_name(Error **errp);
> > >    */
> > >   size_t qemu_get_host_physmem(void);
> > > +void qemu_thread_jit_write(void);
> > > +void qemu_thread_jit_execute(void);
> > > +
> > >   #endif
> > > diff --git a/tcg/tcg.c b/tcg/tcg.c
> > > index 43c6cf8f52..ab8488f5d5 100644
> > > --- a/tcg/tcg.c
> > > +++ b/tcg/tcg.c
> > > @@ -1065,6 +1065,7 @@ void tcg_prologue_init(TCGContext *s)
> > >       s->pool_labels = NULL;
> > >   #endif
> > > +    qemu_thread_jit_write();
> > >       /* Generate the prologue.  */
> > >       tcg_target_qemu_prologue(s);
> > > diff --git a/util/osdep.c b/util/osdep.c
> > > index 66d01b9160..80ec7185da 100644
> > > --- a/util/osdep.c
> > > +++ b/util/osdep.c
> > > @@ -606,3 +606,25 @@ writev(int fd, const struct iovec *iov, int iov_cnt)
> > >       return readv_writev(fd, iov, iov_cnt, true);
> > >   }
> > >   #endif
> > > +
> > > +#if defined(__APPLE__) && defined(MAC_OS_VERSION_11_0)
> 
> 
> Will this be defined in future versions?
> 

Yes, it will be. But we might explicitly specify upper and lower bounds:

https://github.com/phracker/MacOSX-SDKs/blob/master/MacOSX10.15.sdk/usr/include/AvailabilityMacros.h

> 
> > > +static inline void qemu_thread_jit_write_protect(bool enabled)
> > > +{
> > > +    if (pthread_jit_write_protect_supported_np()) {
> 
> 
> Do we need this call? Sounds like extra overhead to me.
> 

We don't need it on ARM, but Apple uses it for x86_64 in JavaScriptCore:

https://bugs.webkit.org/attachment.cgi?id=402515&action=prettypatch

> 
> > > +        pthread_jit_write_protect_np(enabled);
> > > +    }
> > > +}
> > > +
> > > +void qemu_thread_jit_execute(void)
> > > +{
> > > +    qemu_thread_jit_write_protect(true);
> > > +}
> > > +
> > > +void qemu_thread_jit_write(void)
> > > +{
> > > +    qemu_thread_jit_write_protect(false);
> > > +}
> > What happens if you emulate a -smp 2 ARM guest? In this case MTTCG
> > should be enabled (same guest ordering) but you run a risk of attempting
> > to execute code while write is enabled.
> > 
> > Is there any way to only change the mapping for the parts of the TB
> > cache used by a thread? Otherwise we'll need additional logic in
> > default_mttcg_enabled to ensure we don't accidentally enable it on Apple
> > silicon.
> 
> 
> The actual protection logic is per thread, so the MTTCG side thread won't be
> affected by the flips.
> 

That's correct.

> Given this super specific semantic that is impossible to mimic on other
> platforms, we should probably name the functions accordingly and make sure
> people understand this is *only* for macos.
> 

I intended to name them accordingly :) I'm sorry if it's not clear. What
do you propose instead?

> Also, is there anything this patch doesn't do that the one from Joelle does?
> It seems a bit ... short.
> 

I tried to minimize amount of W^X fiddling to the bare minimum required
for correct operation of TCG. So if you remove any of
qemu_thread_jit_*() calls in the patch, TCG won't work.

Thanks,
Roman
Roman Bolshakov Jan. 5, 2021, 9:47 p.m. UTC | #9
On Mon, Jan 04, 2021 at 08:28:08PM +0000, Alex Bennée wrote:
> 
> Alexander Graf <agraf@csgraf.de> writes:
> 
> > On 04.01.21 16:23, Alex Bennée wrote:
> >> Roman Bolshakov <r.bolshakov@yadro.com> writes:
> >>
> >>> Pages can't be both write and executable at the same time on Apple
> >>> Silicon. macOS provides public API to switch write protection [1] for
> >>> JIT applications, like TCG.
> >>>
> >>> 1. https://developer.apple.com/documentation/apple_silicon/porting_just-in-time_compilers_to_apple_silicon
> >>>
> >>> Signed-off-by: Roman Bolshakov <r.bolshakov@yadro.com>
> >>> ---
> >>> v1: https://lists.gnu.org/archive/html/qemu-devel/2021-01/msg00073.html
> >>> Changes since v1:
> >>>
> >>>   - Pruned not needed fiddling with W^X and dropped symmetry from write
> >>>     lock/unlock and renamed related functions.
> >>>     Similar approach is used in JavaScriptCore [1].
> >>>
> >>>   - Moved jit helper functions to util/osdep
> >>>                                                                                                                                                    As outlined in osdep.h, this matches to (2):                                                                                                                                                                                                                                                    * In an ideal world this header would contain only:                                                                                            *  (1) things which everybody needs                                                                                                            *  (2) things without which code would work on most platforms but                                                                              *      fail to compile or misbehave on a minority of host OSes
> >>>
> >>>   - Fixed a checkpatch error
> >>>
> >>>   - Limit new behaviour only to macOS 11.0 and above, because of the
> >>>     following declarations:
> >>>
> >>>     __API_AVAILABLE(macos(11.0))
> >>>     __API_UNAVAILABLE(ios, tvos, watchos)
> >>>     void pthread_jit_write_protect_np(int enabled);
> >>>
> >>>     __API_AVAILABLE(macos(11.0))
> >>>     __API_UNAVAILABLE(ios, tvos, watchos)
> >>>     int pthread_jit_write_protect_supported_np(void);
> >>>
> >>>   1. https://bugs.webkit.org/attachment.cgi?id=402515&action=prettypatch
> >>>
> >>>   accel/tcg/cpu-exec.c      |  2 ++
> >>>   accel/tcg/translate-all.c |  6 ++++++
> >>>   include/qemu/osdep.h      |  3 +++
> >>>   tcg/tcg.c                 |  1 +
> >>>   util/osdep.c              | 22 ++++++++++++++++++++++
> >>>   5 files changed, 34 insertions(+)
> >>>
> >>> diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
> >>> index 8689c54499..374060eb45 100644
> >>> --- a/accel/tcg/cpu-exec.c
> >>> +++ b/accel/tcg/cpu-exec.c
> >>> @@ -175,6 +175,7 @@ static inline tcg_target_ulong cpu_tb_exec(CPUState *cpu, TranslationBlock *itb)
> >>>       }
> >>>   #endif /* DEBUG_DISAS */
> >>>   
> >>> +    qemu_thread_jit_execute();
> >>>       ret = tcg_qemu_tb_exec(env, tb_ptr);
> >>>       cpu->can_do_io = 1;
> >>>       last_tb = (TranslationBlock *)(ret & ~TB_EXIT_MASK);
> >>> @@ -382,6 +383,7 @@ static inline void tb_add_jump(TranslationBlock *tb, int n,
> >>>   {
> >>>       uintptr_t old;
> >>>   
> >>> +    qemu_thread_jit_write();
> >>>       assert(n < ARRAY_SIZE(tb->jmp_list_next));
> >>>       qemu_spin_lock(&tb_next->jmp_lock);
> >>>   
> >>> diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
> >>> index b7d50a73d4..88ae5d35ef 100644
> >>> --- a/accel/tcg/translate-all.c
> >>> +++ b/accel/tcg/translate-all.c
> >>> @@ -1072,6 +1072,9 @@ static inline void *alloc_code_gen_buffer(void)
> >>>       size_t size = tcg_ctx->code_gen_buffer_size;
> >>>       void *buf;
> >>>   
> >>> +#if defined(__APPLE__) && defined(MAC_OS_VERSION_11_0)
> >>> +    flags |= MAP_JIT;
> >>> +#endif
> >>>       buf = mmap(NULL, size, prot, flags, -1, 0);
> >>>       if (buf == MAP_FAILED) {
> >>>           return NULL;
> >>> @@ -1485,7 +1488,9 @@ static void do_tb_phys_invalidate(TranslationBlock *tb, bool rm_from_page_list)
> >>>   
> >>>   static void tb_phys_invalidate__locked(TranslationBlock *tb)
> >>>   {
> >>> +    qemu_thread_jit_write();
> >>>       do_tb_phys_invalidate(tb, true);
> >>> +    qemu_thread_jit_execute();
> >>>   }
> >>>   
> >>>   /* invalidate one TB
> >>> @@ -1687,6 +1692,7 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
> >>>   #endif
> >>>   
> >>>       assert_memory_lock();
> >>> +    qemu_thread_jit_write();
> >>>   
> >>>       phys_pc = get_page_addr_code(env, pc);
> >>>   
> >>> diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
> >>> index f9ec8c84e9..89abebcf5d 100644
> >>> --- a/include/qemu/osdep.h
> >>> +++ b/include/qemu/osdep.h
> >>> @@ -686,4 +686,7 @@ char *qemu_get_host_name(Error **errp);
> >>>    */
> >>>   size_t qemu_get_host_physmem(void);
> >>>   
> >>> +void qemu_thread_jit_write(void);
> >>> +void qemu_thread_jit_execute(void);
> >>> +
> >>>   #endif
> >>> diff --git a/tcg/tcg.c b/tcg/tcg.c
> >>> index 43c6cf8f52..ab8488f5d5 100644
> >>> --- a/tcg/tcg.c
> >>> +++ b/tcg/tcg.c
> >>> @@ -1065,6 +1065,7 @@ void tcg_prologue_init(TCGContext *s)
> >>>       s->pool_labels = NULL;
> >>>   #endif
> >>>   
> >>> +    qemu_thread_jit_write();
> >>>       /* Generate the prologue.  */
> >>>       tcg_target_qemu_prologue(s);
> >>>   
> >>> diff --git a/util/osdep.c b/util/osdep.c
> >>> index 66d01b9160..80ec7185da 100644
> >>> --- a/util/osdep.c
> >>> +++ b/util/osdep.c
> >>> @@ -606,3 +606,25 @@ writev(int fd, const struct iovec *iov, int iov_cnt)
> >>>       return readv_writev(fd, iov, iov_cnt, true);
> >>>   }
> >>>   #endif
> >>> +
> >>> +#if defined(__APPLE__) && defined(MAC_OS_VERSION_11_0)
> >
> >
> > Will this be defined in future versions?
> >
> >
> >>> +static inline void qemu_thread_jit_write_protect(bool enabled)
> >>> +{
> >>> +    if (pthread_jit_write_protect_supported_np()) {
> >
> >
> > Do we need this call? Sounds like extra overhead to me.
> >
> >
> >>> +        pthread_jit_write_protect_np(enabled);
> >>> +    }
> >>> +}
> >>> +
> >>> +void qemu_thread_jit_execute(void)
> >>> +{
> >>> +    qemu_thread_jit_write_protect(true);
> >>> +}
> >>> +
> >>> +void qemu_thread_jit_write(void)
> >>> +{
> >>> +    qemu_thread_jit_write_protect(false);
> >>> +}
> >> What happens if you emulate a -smp 2 ARM guest? In this case MTTCG
> >> should be enabled (same guest ordering) but you run a risk of attempting
> >> to execute code while write is enabled.
> >>
> >> Is there any way to only change the mapping for the parts of the TB
> >> cache used by a thread? Otherwise we'll need additional logic in
> >> default_mttcg_enabled to ensure we don't accidentally enable it on Apple
> >> silicon.
> >
> >
> > The actual protection logic is per thread, so the MTTCG side thread 
> > won't be affected by the flips.
> 
> Just to be clear you are saying each thread has it's own mappings with
> potentially different protection per page? I had always assumed the
> mappings where per-process.
> 

The JIT mapping is one per-process but permissions for the mapping are
controlled per-thread.

Here's decription from pthread_jit_write_protect_supported_np(3)
manpage:

DESCRIPTION
     The pthread_jit_write_protect_supported_np() function returns
     whether the pthread_jit_write_protect_np() API is supported on this
     platform. If pthread_jit_write_protect_np() API is supported on
     this platform, pthread_jit_write_protect_np() must be called to
     toggle per-thread write protection on the MAP_JIT region before the
     thread writes to or executes from the MAP_JIT region.

     The pthread_jit_write_protect_np() function sets whether MAP_JIT
     region write protection is enabled for this thread.

     On platforms where pthread_jit_write_protect_supported_np() is
     true, MAP_JIT regions are never writeable and executable
     simultaneously.  When write protection is enabled for the thread,
     writes by the thread to the MAP_JIT region are denied and the
     MAP_JIT region is executable.  When write protection is disabled
     for the thread, writes by the thread to the MAP_JIT region are
     allowed and the MAP_JIT region is not executable.  Pass a non-zero
     value for the enabled parameter to enable thread JIT region write
     protection and allow execution. Pass a zero value for the enabled
     parameter to disable thread JIT write protection and deny
     execution.

     On platforms where pthread_jit_write_protect_supported_np() is not
     supported, MAP_JIT regions are both simultaenously writeable and
     executable. Calls to pthread_jit_write_protect_np() are no-ops on
     unsupported platforms.

Regards,
Roman

> I'm not sure what you mean by side-thread. Code generation is in the
> context of the running thread and while each tb region can only be
> written by one thread all threads can run code residing in all regions.
>
Roman Bolshakov Jan. 5, 2021, 9:50 p.m. UTC | #10
On Mon, Jan 04, 2021 at 06:02:50PM -0800, Joelle van Dyne wrote:
> Tested-by: Joelle van Dyne <j@getutm.app>
> 
> It works for me. But one thing is that if you build it with the macOS
> 11.x SDK it won't run on < 11.x. This is why apple recommends
> something like:
> 
>         if (__builtin_available(macOS 11, *)) {
>             pthread_jit_write_protect_np();
>         }
> 
> You still need a compile time check like MAC_OS_VERSION_11_0 to
> support linking with older SDKs.
> 

I'll address the issue in v3. Thanks for catching it.

Regards,
Roman

> On Sun, Jan 3, 2021 at 6:54 AM Roman Bolshakov <r.bolshakov@yadro.com> wrote:
> >
> > Pages can't be both write and executable at the same time on Apple
> > Silicon. macOS provides public API to switch write protection [1] for
> > JIT applications, like TCG.
> >
> > 1. https://developer.apple.com/documentation/apple_silicon/porting_just-in-time_compilers_to_apple_silicon
> >
> > Signed-off-by: Roman Bolshakov <r.bolshakov@yadro.com>
> > ---
> > v1: https://lists.gnu.org/archive/html/qemu-devel/2021-01/msg00073.html
> > Changes since v1:
> >
> >  - Pruned not needed fiddling with W^X and dropped symmetry from write
> >    lock/unlock and renamed related functions.
> >    Similar approach is used in JavaScriptCore [1].
> >
> >  - Moved jit helper functions to util/osdep
> >                                                                                                                                                   As outlined in osdep.h, this matches to (2):                                                                                                                                                                                                                                                    * In an ideal world this header would contain only:                                                                                            *  (1) things which everybody needs                                                                                                            *  (2) things without which code would work on most platforms but                                                                              *      fail to compile or misbehave on a minority of host OSes
> >
> >  - Fixed a checkpatch error
> >
> >  - Limit new behaviour only to macOS 11.0 and above, because of the
> >    following declarations:
> >
> >    __API_AVAILABLE(macos(11.0))
> >    __API_UNAVAILABLE(ios, tvos, watchos)
> >    void pthread_jit_write_protect_np(int enabled);
> >
> >    __API_AVAILABLE(macos(11.0))
> >    __API_UNAVAILABLE(ios, tvos, watchos)
> >    int pthread_jit_write_protect_supported_np(void);
> >
> >  1. https://bugs.webkit.org/attachment.cgi?id=402515&action=prettypatch
> >
> >  accel/tcg/cpu-exec.c      |  2 ++
> >  accel/tcg/translate-all.c |  6 ++++++
> >  include/qemu/osdep.h      |  3 +++
> >  tcg/tcg.c                 |  1 +
> >  util/osdep.c              | 22 ++++++++++++++++++++++
> >  5 files changed, 34 insertions(+)
> >
> > diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
> > index 8689c54499..374060eb45 100644
> > --- a/accel/tcg/cpu-exec.c
> > +++ b/accel/tcg/cpu-exec.c
> > @@ -175,6 +175,7 @@ static inline tcg_target_ulong cpu_tb_exec(CPUState *cpu, TranslationBlock *itb)
> >      }
> >  #endif /* DEBUG_DISAS */
> >
> > +    qemu_thread_jit_execute();
> >      ret = tcg_qemu_tb_exec(env, tb_ptr);
> >      cpu->can_do_io = 1;
> >      last_tb = (TranslationBlock *)(ret & ~TB_EXIT_MASK);
> > @@ -382,6 +383,7 @@ static inline void tb_add_jump(TranslationBlock *tb, int n,
> >  {
> >      uintptr_t old;
> >
> > +    qemu_thread_jit_write();
> >      assert(n < ARRAY_SIZE(tb->jmp_list_next));
> >      qemu_spin_lock(&tb_next->jmp_lock);
> >
> > diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
> > index b7d50a73d4..88ae5d35ef 100644
> > --- a/accel/tcg/translate-all.c
> > +++ b/accel/tcg/translate-all.c
> > @@ -1072,6 +1072,9 @@ static inline void *alloc_code_gen_buffer(void)
> >      size_t size = tcg_ctx->code_gen_buffer_size;
> >      void *buf;
> >
> > +#if defined(__APPLE__) && defined(MAC_OS_VERSION_11_0)
> > +    flags |= MAP_JIT;
> > +#endif
> >      buf = mmap(NULL, size, prot, flags, -1, 0);
> >      if (buf == MAP_FAILED) {
> >          return NULL;
> > @@ -1485,7 +1488,9 @@ static void do_tb_phys_invalidate(TranslationBlock *tb, bool rm_from_page_list)
> >
> >  static void tb_phys_invalidate__locked(TranslationBlock *tb)
> >  {
> > +    qemu_thread_jit_write();
> >      do_tb_phys_invalidate(tb, true);
> > +    qemu_thread_jit_execute();
> >  }
> >
> >  /* invalidate one TB
> > @@ -1687,6 +1692,7 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
> >  #endif
> >
> >      assert_memory_lock();
> > +    qemu_thread_jit_write();
> >
> >      phys_pc = get_page_addr_code(env, pc);
> >
> > diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
> > index f9ec8c84e9..89abebcf5d 100644
> > --- a/include/qemu/osdep.h
> > +++ b/include/qemu/osdep.h
> > @@ -686,4 +686,7 @@ char *qemu_get_host_name(Error **errp);
> >   */
> >  size_t qemu_get_host_physmem(void);
> >
> > +void qemu_thread_jit_write(void);
> > +void qemu_thread_jit_execute(void);
> > +
> >  #endif
> > diff --git a/tcg/tcg.c b/tcg/tcg.c
> > index 43c6cf8f52..ab8488f5d5 100644
> > --- a/tcg/tcg.c
> > +++ b/tcg/tcg.c
> > @@ -1065,6 +1065,7 @@ void tcg_prologue_init(TCGContext *s)
> >      s->pool_labels = NULL;
> >  #endif
> >
> > +    qemu_thread_jit_write();
> >      /* Generate the prologue.  */
> >      tcg_target_qemu_prologue(s);
> >
> > diff --git a/util/osdep.c b/util/osdep.c
> > index 66d01b9160..80ec7185da 100644
> > --- a/util/osdep.c
> > +++ b/util/osdep.c
> > @@ -606,3 +606,25 @@ writev(int fd, const struct iovec *iov, int iov_cnt)
> >      return readv_writev(fd, iov, iov_cnt, true);
> >  }
> >  #endif
> > +
> > +#if defined(__APPLE__) && defined(MAC_OS_VERSION_11_0)
> > +static inline void qemu_thread_jit_write_protect(bool enabled)
> > +{
> > +    if (pthread_jit_write_protect_supported_np()) {
> > +        pthread_jit_write_protect_np(enabled);
> > +    }
> > +}
> > +
> > +void qemu_thread_jit_execute(void)
> > +{
> > +    qemu_thread_jit_write_protect(true);
> > +}
> > +
> > +void qemu_thread_jit_write(void)
> > +{
> > +    qemu_thread_jit_write_protect(false);
> > +}
> > +#else
> > +void qemu_thread_jit_write(void) {}
> > +void qemu_thread_jit_execute(void) {}
> > +#endif
> > --
> > 2.29.2
> >
Alex Bennée Jan. 5, 2021, 11:37 p.m. UTC | #11
Roman Bolshakov <r.bolshakov@yadro.com> writes:

> On Mon, Jan 04, 2021 at 03:23:07PM +0000, Alex Bennée wrote:
>> 
>> Roman Bolshakov <r.bolshakov@yadro.com> writes:
>> 
>> > Pages can't be both write and executable at the same time on Apple
>> > Silicon. macOS provides public API to switch write protection [1] for
>> > JIT applications, like TCG.
>> >
>> > 1. https://developer.apple.com/documentation/apple_silicon/porting_just-in-time_compilers_to_apple_silicon
>> >
>> > Signed-off-by: Roman Bolshakov <r.bolshakov@yadro.com>
>> > ---
>> > v1: https://lists.gnu.org/archive/html/qemu-devel/2021-01/msg00073.html
>> > Changes since v1:
>> >
>> >  - Pruned not needed fiddling with W^X and dropped symmetry from write
>> >    lock/unlock and renamed related functions.
>> >    Similar approach is used in JavaScriptCore [1].
>> >
>> >  - Moved jit helper functions to util/osdep
>> >                                                                                                                                                   As outlined in osdep.h, this matches to (2):                                                                                                                                                                                                                                                    * In an ideal world this header would contain only:                                                                                            *  (1) things which everybody needs                                                                                                            *  (2) things without which code would work on most platforms but                                                                              *      fail to compile or misbehave on a minority of host OSes
>> >
>> >  - Fixed a checkpatch error
>> >
>> >  - Limit new behaviour only to macOS 11.0 and above, because of the
>> >    following declarations:
>> >
>> >    __API_AVAILABLE(macos(11.0))
>> >    __API_UNAVAILABLE(ios, tvos, watchos)
>> >    void pthread_jit_write_protect_np(int enabled);
>> >
>> >    __API_AVAILABLE(macos(11.0))
>> >    __API_UNAVAILABLE(ios, tvos, watchos)
>> >    int pthread_jit_write_protect_supported_np(void);
>> >
>> >  1. https://bugs.webkit.org/attachment.cgi?id=402515&action=prettypatch
>> >
>> >  accel/tcg/cpu-exec.c      |  2 ++
>> >  accel/tcg/translate-all.c |  6 ++++++
>> >  include/qemu/osdep.h      |  3 +++
>> >  tcg/tcg.c                 |  1 +
>> >  util/osdep.c              | 22 ++++++++++++++++++++++
>> >  5 files changed, 34 insertions(+)
>> >
>> > diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
>> > index 8689c54499..374060eb45 100644
>> > --- a/accel/tcg/cpu-exec.c
>> > +++ b/accel/tcg/cpu-exec.c
>> > @@ -175,6 +175,7 @@ static inline tcg_target_ulong cpu_tb_exec(CPUState *cpu, TranslationBlock *itb)
>> >      }
>> >  #endif /* DEBUG_DISAS */
>> >  
>> > +    qemu_thread_jit_execute();
>> >      ret = tcg_qemu_tb_exec(env, tb_ptr);
>> >      cpu->can_do_io = 1;
>> >      last_tb = (TranslationBlock *)(ret & ~TB_EXIT_MASK);
>> > @@ -382,6 +383,7 @@ static inline void tb_add_jump(TranslationBlock *tb, int n,
>> >  {
>> >      uintptr_t old;
>> >  
>> > +    qemu_thread_jit_write();
>> >      assert(n < ARRAY_SIZE(tb->jmp_list_next));
>> >      qemu_spin_lock(&tb_next->jmp_lock);
>> >  
>> > diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
>> > index b7d50a73d4..88ae5d35ef 100644
>> > --- a/accel/tcg/translate-all.c
>> > +++ b/accel/tcg/translate-all.c
>> > @@ -1072,6 +1072,9 @@ static inline void *alloc_code_gen_buffer(void)
>> >      size_t size = tcg_ctx->code_gen_buffer_size;
>> >      void *buf;
>> >  
>> > +#if defined(__APPLE__) && defined(MAC_OS_VERSION_11_0)
>> > +    flags |= MAP_JIT;
>> > +#endif
>> >      buf = mmap(NULL, size, prot, flags, -1, 0);
>> >      if (buf == MAP_FAILED) {
>> >          return NULL;
>> > @@ -1485,7 +1488,9 @@ static void do_tb_phys_invalidate(TranslationBlock *tb, bool rm_from_page_list)
>> >  
>> >  static void tb_phys_invalidate__locked(TranslationBlock *tb)
>> >  {
>> > +    qemu_thread_jit_write();
>> >      do_tb_phys_invalidate(tb, true);
>> > +    qemu_thread_jit_execute();
>> >  }
>> >  
>> >  /* invalidate one TB
>> > @@ -1687,6 +1692,7 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
>> >  #endif
>> >  
>> >      assert_memory_lock();
>> > +    qemu_thread_jit_write();
>> >  
>> >      phys_pc = get_page_addr_code(env, pc);
>> >  
>> > diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
>> > index f9ec8c84e9..89abebcf5d 100644
>> > --- a/include/qemu/osdep.h
>> > +++ b/include/qemu/osdep.h
>> > @@ -686,4 +686,7 @@ char *qemu_get_host_name(Error **errp);
>> >   */
>> >  size_t qemu_get_host_physmem(void);
>> >  
>> > +void qemu_thread_jit_write(void);
>> > +void qemu_thread_jit_execute(void);
>> > +
>> >  #endif
>> > diff --git a/tcg/tcg.c b/tcg/tcg.c
>> > index 43c6cf8f52..ab8488f5d5 100644
>> > --- a/tcg/tcg.c
>> > +++ b/tcg/tcg.c
>> > @@ -1065,6 +1065,7 @@ void tcg_prologue_init(TCGContext *s)
>> >      s->pool_labels = NULL;
>> >  #endif
>> >  
>> > +    qemu_thread_jit_write();
>> >      /* Generate the prologue.  */
>> >      tcg_target_qemu_prologue(s);
>> >  
>> > diff --git a/util/osdep.c b/util/osdep.c
>> > index 66d01b9160..80ec7185da 100644
>> > --- a/util/osdep.c
>> > +++ b/util/osdep.c
>> > @@ -606,3 +606,25 @@ writev(int fd, const struct iovec *iov, int iov_cnt)
>> >      return readv_writev(fd, iov, iov_cnt, true);
>> >  }
>> >  #endif
>> > +
>> > +#if defined(__APPLE__) && defined(MAC_OS_VERSION_11_0)
>> > +static inline void qemu_thread_jit_write_protect(bool enabled)
>> > +{
>> > +    if (pthread_jit_write_protect_supported_np()) {
>> > +        pthread_jit_write_protect_np(enabled);
>> > +    }
>> > +}
>> > +
>> > +void qemu_thread_jit_execute(void)
>> > +{
>> > +    qemu_thread_jit_write_protect(true);
>> > +}
>> > +
>> > +void qemu_thread_jit_write(void)
>> > +{
>> > +    qemu_thread_jit_write_protect(false);
>> > +}
>> 
>> What happens if you emulate a -smp 2 ARM guest? In this case MTTCG
>> should be enabled (same guest ordering) but you run a risk of attempting
>> to execute code while write is enabled.
>> 
>
> Hi Alex,
>
> Thanks for providing a hint. Ubuntu ARM with -smp 4 boots and works. I
> can see 4 CPU in the guest and use the VM without any crashes (but it
> requires patience as it's much slower compared to hvf).
>
>> Is there any way to only change the mapping for the parts of the TB
>> cache used by a thread? Otherwise we'll need additional logic in
>> default_mttcg_enabled to ensure we don't accidentally enable it on Apple
>> silicon.
>
> I'm not sure I understand the question. The mappings are changed only
> for the thread that invokes pthread_jit_write_protect_np(). Each thread
> has its own permissions for MAP_JIT region.

Ahh that was the bit I was unsure of. If two threads can have different
permissions at the same time then it will be fine ;-)


> As far as I understand MTTCG
> works fine with the series as I've seen 376% CPU utilization at times
> with -smp 4, regardless whether MTTCG is specified explicitly
> (-accel tcg,thread=multi) or not. Respectively, default ARM on ARM is
> MTTCG and it works fine, we don't need to disable it :)
>
> Thanks,
> Roman
>
>> 
>> > +#else
>> > +void qemu_thread_jit_write(void) {}
>> > +void qemu_thread_jit_execute(void) {}
>> > +#endif
>> 
>> 
>> -- 
>> Alex Bennée
diff mbox series

Patch

diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
index 8689c54499..374060eb45 100644
--- a/accel/tcg/cpu-exec.c
+++ b/accel/tcg/cpu-exec.c
@@ -175,6 +175,7 @@  static inline tcg_target_ulong cpu_tb_exec(CPUState *cpu, TranslationBlock *itb)
     }
 #endif /* DEBUG_DISAS */
 
+    qemu_thread_jit_execute();
     ret = tcg_qemu_tb_exec(env, tb_ptr);
     cpu->can_do_io = 1;
     last_tb = (TranslationBlock *)(ret & ~TB_EXIT_MASK);
@@ -382,6 +383,7 @@  static inline void tb_add_jump(TranslationBlock *tb, int n,
 {
     uintptr_t old;
 
+    qemu_thread_jit_write();
     assert(n < ARRAY_SIZE(tb->jmp_list_next));
     qemu_spin_lock(&tb_next->jmp_lock);
 
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index b7d50a73d4..88ae5d35ef 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -1072,6 +1072,9 @@  static inline void *alloc_code_gen_buffer(void)
     size_t size = tcg_ctx->code_gen_buffer_size;
     void *buf;
 
+#if defined(__APPLE__) && defined(MAC_OS_VERSION_11_0)
+    flags |= MAP_JIT;
+#endif
     buf = mmap(NULL, size, prot, flags, -1, 0);
     if (buf == MAP_FAILED) {
         return NULL;
@@ -1485,7 +1488,9 @@  static void do_tb_phys_invalidate(TranslationBlock *tb, bool rm_from_page_list)
 
 static void tb_phys_invalidate__locked(TranslationBlock *tb)
 {
+    qemu_thread_jit_write();
     do_tb_phys_invalidate(tb, true);
+    qemu_thread_jit_execute();
 }
 
 /* invalidate one TB
@@ -1687,6 +1692,7 @@  TranslationBlock *tb_gen_code(CPUState *cpu,
 #endif
 
     assert_memory_lock();
+    qemu_thread_jit_write();
 
     phys_pc = get_page_addr_code(env, pc);
 
diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
index f9ec8c84e9..89abebcf5d 100644
--- a/include/qemu/osdep.h
+++ b/include/qemu/osdep.h
@@ -686,4 +686,7 @@  char *qemu_get_host_name(Error **errp);
  */
 size_t qemu_get_host_physmem(void);
 
+void qemu_thread_jit_write(void);
+void qemu_thread_jit_execute(void);
+
 #endif
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 43c6cf8f52..ab8488f5d5 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -1065,6 +1065,7 @@  void tcg_prologue_init(TCGContext *s)
     s->pool_labels = NULL;
 #endif
 
+    qemu_thread_jit_write();
     /* Generate the prologue.  */
     tcg_target_qemu_prologue(s);
 
diff --git a/util/osdep.c b/util/osdep.c
index 66d01b9160..80ec7185da 100644
--- a/util/osdep.c
+++ b/util/osdep.c
@@ -606,3 +606,25 @@  writev(int fd, const struct iovec *iov, int iov_cnt)
     return readv_writev(fd, iov, iov_cnt, true);
 }
 #endif
+
+#if defined(__APPLE__) && defined(MAC_OS_VERSION_11_0)
+static inline void qemu_thread_jit_write_protect(bool enabled)
+{
+    if (pthread_jit_write_protect_supported_np()) {
+        pthread_jit_write_protect_np(enabled);
+    }
+}
+
+void qemu_thread_jit_execute(void)
+{
+    qemu_thread_jit_write_protect(true);
+}
+
+void qemu_thread_jit_write(void)
+{
+    qemu_thread_jit_write_protect(false);
+}
+#else
+void qemu_thread_jit_write(void) {}
+void qemu_thread_jit_execute(void) {}
+#endif