diff mbox

[RFC,v2,10/11] tcg: drop global lock during TCG code execution

Message ID 1459870344-16773-11-git-send-email-alex.bennee@linaro.org (mailing list archive)
State New, archived
Headers show

Commit Message

Alex Bennée April 5, 2016, 3:32 p.m. UTC
From: Jan Kiszka <jan.kiszka@siemens.com>

This finally allows TCG to benefit from the iothread introduction: Drop
the global mutex while running pure TCG CPU code. Reacquire the lock
when entering MMIO or PIO emulation, or when leaving the TCG loop.

We have to revert a few optimizations for the current TCG threading
model, namely kicking the TCG thread in qemu_mutex_lock_iothread and not
kicking it in qemu_cpu_kick. We also need to disable RAM block
reordering until we have a more efficient locking mechanism at hand.

Still, a Linux x86 UP guest and my Musicpal ARM model boot fine here.
These numbers demonstrate where we gain something:

20338 jan       20   0  331m  75m 6904 R   99  0.9   0:50.95 qemu-system-arm
20337 jan       20   0  331m  75m 6904 S   20  0.9   0:26.50 qemu-system-arm

The guest CPU was fully loaded, but the iothread could still run mostly
independently on a second core. Without the patch we don't get beyond

32206 jan       20   0  330m  73m 7036 R   82  0.9   1:06.00 qemu-system-arm
32204 jan       20   0  330m  73m 7036 S   21  0.9   0:17.03 qemu-system-arm

We don't benefit significantly, though, when the guest is not fully
loading a host CPU.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Message-Id: <1439220437-23957-10-git-send-email-fred.konrad@greensocs.com>
[FK: Rebase, fix qemu_devices_reset deadlock, rm address_space_* mutex]
Signed-off-by: KONRAD Frederic <fred.konrad@greensocs.com>
[EGC: fixed iothread lock for cpu-exec IRQ handling]
Signed-off-by: Emilio G. Cota <cota@braap.org>
[AJB: -smp single-threaded fix, rm old info from commit msg]
Signed-off-by: Alex Bennée <alex.bennee@linaro.org>

---
v1 (ajb):
  - SMP failure now fixed by previous commit
Changes from Fred Konrad (mttcg-v7 via paolo):
  * Rebase on the current HEAD.
  * Fixes a deadlock in qemu_devices_reset().
  * Remove the mutex in address_space_*
v2 (ajb):
  - merge with tcg: grab iothread lock in cpu-exec interrupt handling
  - use existing fns for tracking lock state
  - lock iothread for mem_region
    - add assert on mem region modification
    - ensure smm_helper holds iothread
  - Add JK s-o-b
  - Fix-up FK s-o-b annotation
---
 cpu-exec.c               | 12 ++++++++++++
 cpus.c                   | 26 +++-----------------------
 cputlb.c                 |  1 +
 exec.c                   | 12 ++++++++++++
 hw/i386/kvmvapic.c       |  3 +++
 memory.c                 |  2 ++
 softmmu_template.h       | 17 +++++++++++++++++
 target-i386/smm_helper.c |  7 +++++++
 translate-all.c          | 11 +++++++++--
 9 files changed, 66 insertions(+), 25 deletions(-)

Comments

Sergey Fedorov May 24, 2016, 9:28 p.m. UTC | #1
On 05/04/16 18:32, Alex Bennée wrote:
> diff --git a/cpu-exec.c b/cpu-exec.c
> index bd50fef..f558508 100644
> --- a/cpu-exec.c
> +++ b/cpu-exec.c
> @@ -28,6 +28,7 @@
>  #include "qemu/rcu.h"
>  #include "exec/tb-hash.h"
>  #include "exec/log.h"
> +#include "qemu/main-loop.h"
>  #if defined(TARGET_I386) && !defined(CONFIG_USER_ONLY)
>  #include "hw/i386/apic.h"
>  #endif
> @@ -452,6 +453,8 @@ int cpu_exec(CPUState *cpu)
>              for(;;) {
>                  interrupt_request = cpu->interrupt_request;
>                  if (unlikely(interrupt_request)) {
> +                    qemu_mutex_lock_iothread();
> +
>                      if (unlikely(cpu->singlestep_enabled & SSTEP_NOIRQ)) {
>                          /* Mask out external interrupts for this step. */
>                          interrupt_request &= ~CPU_INTERRUPT_SSTEP_MASK;
> @@ -504,6 +507,11 @@ int cpu_exec(CPUState *cpu)
>                             the program flow was changed */
>                          next_tb = 0;
>                      }
> +
> +                    if (qemu_mutex_iothread_locked()) {
> +                        qemu_mutex_unlock_iothread();
> +                    }
> +

Why do we need to check for "qemu_mutex_iothread_locked()" here?
iothread lock is always held here, isn't it?

>                  }
>                  if (unlikely(cpu->exit_request
>                               || replay_has_interrupt())) {
(snip)
> diff --git a/cputlb.c b/cputlb.c
> index 466663b..1412049 100644
> --- a/cputlb.c
> +++ b/cputlb.c
> @@ -18,6 +18,7 @@
>   */
>  
>  #include "qemu/osdep.h"
> +#include "qemu/main-loop.h"

No need in this include.

>  #include "cpu.h"
>  #include "exec/exec-all.h"
>  #include "exec/memory.h"
> diff --git a/exec.c b/exec.c
> index c46c123..9e83673 100644
> --- a/exec.c
> +++ b/exec.c
(snip)
> @@ -2347,8 +2353,14 @@ static void io_mem_init(void)
>      memory_region_init_io(&io_mem_rom, NULL, &unassigned_mem_ops, NULL, NULL, UINT64_MAX);
>      memory_region_init_io(&io_mem_unassigned, NULL, &unassigned_mem_ops, NULL,
>                            NULL, UINT64_MAX);
> +
> +    /* io_mem_notdirty calls tb_invalidate_phys_page_fast,
> +     * which must be called without the iothread mutex.
> +     */

notdirty_mem_write() operates on page dirty flags. Is it safe to do so
out of iothread lock?

>      memory_region_init_io(&io_mem_notdirty, NULL, &notdirty_mem_ops, NULL,
>                            NULL, UINT64_MAX);
> +    memory_region_clear_global_locking(&io_mem_notdirty);
> +
>      memory_region_init_io(&io_mem_watch, NULL, &watch_mem_ops, NULL,
>                            NULL, UINT64_MAX);
>  }
(snip)
> diff --git a/translate-all.c b/translate-all.c
> index 935d24c..0c377bb 100644
> --- a/translate-all.c
> +++ b/translate-all.c
> @@ -1318,6 +1318,7 @@ void tb_invalidate_phys_range(tb_page_addr_t start, tb_page_addr_t end)
>   * this TB.
>   *
>   * Called with mmap_lock held for user-mode emulation
> + * If called from generated code, iothread mutex must not be held.

What does that mean? If iothread mutex is not required by the function
then why mention it here at all?

Kind regards,
Sergey
Paolo Bonzini May 25, 2016, 10:33 a.m. UTC | #2
On 24/05/2016 23:28, Sergey Fedorov wrote:
> On 05/04/16 18:32, Alex Bennée wrote:
>> diff --git a/cpu-exec.c b/cpu-exec.c
>> index bd50fef..f558508 100644
>> --- a/cpu-exec.c
>> +++ b/cpu-exec.c
>> @@ -28,6 +28,7 @@
>>  #include "qemu/rcu.h"
>>  #include "exec/tb-hash.h"
>>  #include "exec/log.h"
>> +#include "qemu/main-loop.h"
>>  #if defined(TARGET_I386) && !defined(CONFIG_USER_ONLY)
>>  #include "hw/i386/apic.h"
>>  #endif
>> @@ -452,6 +453,8 @@ int cpu_exec(CPUState *cpu)
>>              for(;;) {
>>                  interrupt_request = cpu->interrupt_request;
>>                  if (unlikely(interrupt_request)) {
>> +                    qemu_mutex_lock_iothread();
>> +
>>                      if (unlikely(cpu->singlestep_enabled & SSTEP_NOIRQ)) {
>>                          /* Mask out external interrupts for this step. */
>>                          interrupt_request &= ~CPU_INTERRUPT_SSTEP_MASK;
>> @@ -504,6 +507,11 @@ int cpu_exec(CPUState *cpu)
>>                             the program flow was changed */
>>                          next_tb = 0;
>>                      }
>> +
>> +                    if (qemu_mutex_iothread_locked()) {
>> +                        qemu_mutex_unlock_iothread();
>> +                    }
>> +
> 
> Why do we need to check for "qemu_mutex_iothread_locked()" here?
> iothread lock is always held here, isn't it?

Correct.

>>                  }
>>                  if (unlikely(cpu->exit_request
>>                               || replay_has_interrupt())) {
> (snip)
>> diff --git a/cputlb.c b/cputlb.c
>> index 466663b..1412049 100644
>> --- a/cputlb.c
>> +++ b/cputlb.c
>> @@ -18,6 +18,7 @@
>>   */
>>  
>>  #include "qemu/osdep.h"
>> +#include "qemu/main-loop.h"
> 
> No need in this include.
> 
>>  #include "cpu.h"
>>  #include "exec/exec-all.h"
>>  #include "exec/memory.h"
>> diff --git a/exec.c b/exec.c
>> index c46c123..9e83673 100644
>> --- a/exec.c
>> +++ b/exec.c
> (snip)
>> @@ -2347,8 +2353,14 @@ static void io_mem_init(void)
>>      memory_region_init_io(&io_mem_rom, NULL, &unassigned_mem_ops, NULL, NULL, UINT64_MAX);
>>      memory_region_init_io(&io_mem_unassigned, NULL, &unassigned_mem_ops, NULL,
>>                            NULL, UINT64_MAX);
>> +
>> +    /* io_mem_notdirty calls tb_invalidate_phys_page_fast,
>> +     * which must be called without the iothread mutex.
>> +     */
> 
> notdirty_mem_write() operates on page dirty flags. Is it safe to do so
> out of iothread lock?

Yes, see commit 5f2cb94 ("memory: make
cpu_physical_memory_sync_dirty_bitmap() fully atomic", 2014-12-02) and
those before.

>>      memory_region_init_io(&io_mem_notdirty, NULL, &notdirty_mem_ops, NULL,
>>                            NULL, UINT64_MAX);
>> +    memory_region_clear_global_locking(&io_mem_notdirty);
>> +
>>      memory_region_init_io(&io_mem_watch, NULL, &watch_mem_ops, NULL,
>>                            NULL, UINT64_MAX);
>>  }
> (snip)
>> diff --git a/translate-all.c b/translate-all.c
>> index 935d24c..0c377bb 100644
>> --- a/translate-all.c
>> +++ b/translate-all.c
>> @@ -1318,6 +1318,7 @@ void tb_invalidate_phys_range(tb_page_addr_t start, tb_page_addr_t end)
>>   * this TB.
>>   *
>>   * Called with mmap_lock held for user-mode emulation
>> + * If called from generated code, iothread mutex must not be held.
> 
> What does that mean? If iothread mutex is not required by the function
> then why mention it here at all?

If this function can take the iothread mutex itself, this would cause a
deadlock.  I'm not sure if it could though.

Another possibility is that this function can longjmp out into cpu_exec,
and then the iothread mutex would not be released.  I think this is more
likely.

Thanks,

Paolo
Alex Bennée May 25, 2016, 11:07 a.m. UTC | #3
Paolo Bonzini <pbonzini@redhat.com> writes:

> On 24/05/2016 23:28, Sergey Fedorov wrote:
>> On 05/04/16 18:32, Alex Bennée wrote:
>>> diff --git a/cpu-exec.c b/cpu-exec.c
>>> index bd50fef..f558508 100644
>>> --- a/cpu-exec.c
>>> +++ b/cpu-exec.c
>>> @@ -28,6 +28,7 @@
>>>  #include "qemu/rcu.h"
>>>  #include "exec/tb-hash.h"
>>>  #include "exec/log.h"
>>> +#include "qemu/main-loop.h"
>>>  #if defined(TARGET_I386) && !defined(CONFIG_USER_ONLY)
>>>  #include "hw/i386/apic.h"
>>>  #endif
>>> @@ -452,6 +453,8 @@ int cpu_exec(CPUState *cpu)
>>>              for(;;) {
>>>                  interrupt_request = cpu->interrupt_request;
>>>                  if (unlikely(interrupt_request)) {
>>> +                    qemu_mutex_lock_iothread();
>>> +
>>>                      if (unlikely(cpu->singlestep_enabled & SSTEP_NOIRQ)) {
>>>                          /* Mask out external interrupts for this step. */
>>>                          interrupt_request &= ~CPU_INTERRUPT_SSTEP_MASK;
>>> @@ -504,6 +507,11 @@ int cpu_exec(CPUState *cpu)
>>>                             the program flow was changed */
>>>                          next_tb = 0;
>>>                      }
>>> +
>>> +                    if (qemu_mutex_iothread_locked()) {
>>> +                        qemu_mutex_unlock_iothread();
>>> +                    }
>>> +
>>
>> Why do we need to check for "qemu_mutex_iothread_locked()" here?
>> iothread lock is always held here, isn't it?
>
> Correct.

I'll fix that for the next iteration. The main else leg still needs to
check though as it doesn't know if the mutex was grabbed before a
siglongjmp.


>
>>>                  }
>>>                  if (unlikely(cpu->exit_request
>>>                               || replay_has_interrupt())) {
>> (snip)
>>> diff --git a/cputlb.c b/cputlb.c
>>> index 466663b..1412049 100644
>>> --- a/cputlb.c
>>> +++ b/cputlb.c
>>> @@ -18,6 +18,7 @@
>>>   */
>>>
>>>  #include "qemu/osdep.h"
>>> +#include "qemu/main-loop.h"
>>
>> No need in this include.
>>
>>>  #include "cpu.h"
>>>  #include "exec/exec-all.h"
>>>  #include "exec/memory.h"
>>> diff --git a/exec.c b/exec.c
>>> index c46c123..9e83673 100644
>>> --- a/exec.c
>>> +++ b/exec.c
>> (snip)
>>> @@ -2347,8 +2353,14 @@ static void io_mem_init(void)
>>>      memory_region_init_io(&io_mem_rom, NULL, &unassigned_mem_ops, NULL, NULL, UINT64_MAX);
>>>      memory_region_init_io(&io_mem_unassigned, NULL, &unassigned_mem_ops, NULL,
>>>                            NULL, UINT64_MAX);
>>> +
>>> +    /* io_mem_notdirty calls tb_invalidate_phys_page_fast,
>>> +     * which must be called without the iothread mutex.
>>> +     */
>>
>> notdirty_mem_write() operates on page dirty flags. Is it safe to do so
>> out of iothread lock?
>
> Yes, see commit 5f2cb94 ("memory: make
> cpu_physical_memory_sync_dirty_bitmap() fully atomic", 2014-12-02) and
> those before.
>
>>>      memory_region_init_io(&io_mem_notdirty, NULL, &notdirty_mem_ops, NULL,
>>>                            NULL, UINT64_MAX);
>>> +    memory_region_clear_global_locking(&io_mem_notdirty);
>>> +
>>>      memory_region_init_io(&io_mem_watch, NULL, &watch_mem_ops, NULL,
>>>                            NULL, UINT64_MAX);
>>>  }
>> (snip)
>>> diff --git a/translate-all.c b/translate-all.c
>>> index 935d24c..0c377bb 100644
>>> --- a/translate-all.c
>>> +++ b/translate-all.c
>>> @@ -1318,6 +1318,7 @@ void tb_invalidate_phys_range(tb_page_addr_t start, tb_page_addr_t end)
>>>   * this TB.
>>>   *
>>>   * Called with mmap_lock held for user-mode emulation
>>> + * If called from generated code, iothread mutex must not be held.
>>
>> What does that mean? If iothread mutex is not required by the function
>> then why mention it here at all?
>
> If this function can take the iothread mutex itself, this would cause a
> deadlock.  I'm not sure if it could though.
>
> Another possibility is that this function can longjmp out into cpu_exec,
> and then the iothread mutex would not be released.  I think this is more
> likely.

Any longjmp should:

            tb_lock_reset();
            if (qemu_mutex_iothread_locked()) {
                qemu_mutex_unlock_iothread();
            }

To reset the locks.

>
> Thanks,
>
> Paolo


--
Alex Bennée
Paolo Bonzini May 25, 2016, 12:46 p.m. UTC | #4
On 25/05/2016 13:07, Alex Bennée wrote:
>>>> + * If called from generated code, iothread mutex must not be held.
>>>
>>> What does that mean? If iothread mutex is not required by the function
>>> then why mention it here at all?
>>
>> If this function can take the iothread mutex itself, this would cause a
>> deadlock.  I'm not sure if it could though.
>>
>> Another possibility is that this function can longjmp out into cpu_exec,
>> and then the iothread mutex would not be released.  I think this is more
>> likely.
>
> Any longjmp should:
> 
>             tb_lock_reset();
>             if (qemu_mutex_iothread_locked()) {
>                 qemu_mutex_unlock_iothread();
>             }
> 
> To reset the locks.
> 

Ok, so I think these comments are stale, dating to before you added the
unlock_iothread in the sigsetjmp else.

Thanks,

Paolo
diff mbox

Patch

diff --git a/cpu-exec.c b/cpu-exec.c
index bd50fef..f558508 100644
--- a/cpu-exec.c
+++ b/cpu-exec.c
@@ -28,6 +28,7 @@ 
 #include "qemu/rcu.h"
 #include "exec/tb-hash.h"
 #include "exec/log.h"
+#include "qemu/main-loop.h"
 #if defined(TARGET_I386) && !defined(CONFIG_USER_ONLY)
 #include "hw/i386/apic.h"
 #endif
@@ -452,6 +453,8 @@  int cpu_exec(CPUState *cpu)
             for(;;) {
                 interrupt_request = cpu->interrupt_request;
                 if (unlikely(interrupt_request)) {
+                    qemu_mutex_lock_iothread();
+
                     if (unlikely(cpu->singlestep_enabled & SSTEP_NOIRQ)) {
                         /* Mask out external interrupts for this step. */
                         interrupt_request &= ~CPU_INTERRUPT_SSTEP_MASK;
@@ -504,6 +507,11 @@  int cpu_exec(CPUState *cpu)
                            the program flow was changed */
                         next_tb = 0;
                     }
+
+                    if (qemu_mutex_iothread_locked()) {
+                        qemu_mutex_unlock_iothread();
+                    }
+
                 }
                 if (unlikely(cpu->exit_request
                              || replay_has_interrupt())) {
@@ -609,8 +617,12 @@  int cpu_exec(CPUState *cpu)
             g_assert(env == &x86_cpu->env);
 #endif
 #endif /* buggy compiler */
+
             cpu->can_do_io = 1;
             tb_lock_reset();
+            if (qemu_mutex_iothread_locked()) {
+                qemu_mutex_unlock_iothread();
+            }
         }
     } /* for(;;) */
 
diff --git a/cpus.c b/cpus.c
index e22bb77..02fae13 100644
--- a/cpus.c
+++ b/cpus.c
@@ -920,8 +920,6 @@  static void qemu_kvm_init_cpu_signals(CPUState *cpu)
 #endif /* _WIN32 */
 
 static QemuMutex qemu_global_mutex;
-static QemuCond qemu_io_proceeded_cond;
-static unsigned iothread_requesting_mutex;
 
 static QemuThread io_thread;
 
@@ -937,7 +935,6 @@  void qemu_init_cpu_loop(void)
     qemu_cond_init(&qemu_cpu_cond);
     qemu_cond_init(&qemu_pause_cond);
     qemu_cond_init(&qemu_work_cond);
-    qemu_cond_init(&qemu_io_proceeded_cond);
     qemu_mutex_init(&qemu_global_mutex);
 
     qemu_thread_get_self(&io_thread);
@@ -1049,10 +1046,6 @@  static void qemu_tcg_wait_io_event(CPUState *cpu)
         qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
     }
 
-    while (iothread_requesting_mutex) {
-        qemu_cond_wait(&qemu_io_proceeded_cond, &qemu_global_mutex);
-    }
-
     CPU_FOREACH(cpu) {
         qemu_wait_io_event_common(cpu);
     }
@@ -1319,22 +1312,7 @@  bool qemu_mutex_iothread_locked(void)
 
 void qemu_mutex_lock_iothread(void)
 {
-    atomic_inc(&iothread_requesting_mutex);
-    /* In the simple case there is no need to bump the VCPU thread out of
-     * TCG code execution.
-     */
-    if (!tcg_enabled() || qemu_in_vcpu_thread() ||
-        !first_cpu || !first_cpu->created) {
-        qemu_mutex_lock(&qemu_global_mutex);
-        atomic_dec(&iothread_requesting_mutex);
-    } else {
-        if (qemu_mutex_trylock(&qemu_global_mutex)) {
-            qemu_cpu_kick_no_halt();
-            qemu_mutex_lock(&qemu_global_mutex);
-        }
-        atomic_dec(&iothread_requesting_mutex);
-        qemu_cond_broadcast(&qemu_io_proceeded_cond);
-    }
+    qemu_mutex_lock(&qemu_global_mutex);
     iothread_locked = true;
 }
 
@@ -1580,7 +1558,9 @@  static int tcg_cpu_exec(CPUState *cpu)
         cpu->icount_decr.u16.low = decr;
         cpu->icount_extra = count;
     }
+    qemu_mutex_unlock_iothread();
     ret = cpu_exec(cpu);
+    qemu_mutex_lock_iothread();
 #ifdef CONFIG_PROFILER
     tcg_time += profile_getclock() - ti;
 #endif
diff --git a/cputlb.c b/cputlb.c
index 466663b..1412049 100644
--- a/cputlb.c
+++ b/cputlb.c
@@ -18,6 +18,7 @@ 
  */
 
 #include "qemu/osdep.h"
+#include "qemu/main-loop.h"
 #include "cpu.h"
 #include "exec/exec-all.h"
 #include "exec/memory.h"
diff --git a/exec.c b/exec.c
index c46c123..9e83673 100644
--- a/exec.c
+++ b/exec.c
@@ -2112,6 +2112,12 @@  static void check_watchpoint(int offset, int len, MemTxAttrs attrs, int flags)
                 }
                 cpu->watchpoint_hit = wp;
 
+                /*
+                 * Unlock iothread mutex before calling cpu_loop_exit
+                 * or cpu_resume_from_signal.
+                 */
+                qemu_mutex_unlock_iothread();
+
                 /* Unlocked by cpu_loop_exit or cpu_resume_from_signal.  */
                 tb_lock();
                 tb_check_watchpoint(cpu);
@@ -2347,8 +2353,14 @@  static void io_mem_init(void)
     memory_region_init_io(&io_mem_rom, NULL, &unassigned_mem_ops, NULL, NULL, UINT64_MAX);
     memory_region_init_io(&io_mem_unassigned, NULL, &unassigned_mem_ops, NULL,
                           NULL, UINT64_MAX);
+
+    /* io_mem_notdirty calls tb_invalidate_phys_page_fast,
+     * which must be called without the iothread mutex.
+     */
     memory_region_init_io(&io_mem_notdirty, NULL, &notdirty_mem_ops, NULL,
                           NULL, UINT64_MAX);
+    memory_region_clear_global_locking(&io_mem_notdirty);
+
     memory_region_init_io(&io_mem_watch, NULL, &watch_mem_ops, NULL,
                           NULL, UINT64_MAX);
 }
diff --git a/hw/i386/kvmvapic.c b/hw/i386/kvmvapic.c
index 7c0d542..a0439a8 100644
--- a/hw/i386/kvmvapic.c
+++ b/hw/i386/kvmvapic.c
@@ -447,6 +447,9 @@  static void patch_instruction(VAPICROMState *s, X86CPU *cpu, target_ulong ip)
     resume_all_vcpus();
 
     if (!kvm_enabled()) {
+        /* Unlock iothread mutex before calling cpu_resume_from_signal.  */
+        qemu_mutex_unlock_iothread();
+
         /* Unlocked by cpu_resume_from_signal.  */
         tb_lock();
         cs->current_tb = NULL;
diff --git a/memory.c b/memory.c
index f76f85d..2eae609 100644
--- a/memory.c
+++ b/memory.c
@@ -916,6 +916,8 @@  void memory_region_transaction_commit(void)
     AddressSpace *as;
 
     assert(memory_region_transaction_depth);
+    assert(qemu_mutex_iothread_locked());
+
     --memory_region_transaction_depth;
     if (!memory_region_transaction_depth) {
         if (memory_region_update_pending) {
diff --git a/softmmu_template.h b/softmmu_template.h
index 208f808..0b6c609 100644
--- a/softmmu_template.h
+++ b/softmmu_template.h
@@ -151,6 +151,7 @@  static inline DATA_TYPE glue(io_read, SUFFIX)(CPUArchState *env,
     CPUState *cpu = ENV_GET_CPU(env);
     hwaddr physaddr = iotlbentry->addr;
     MemoryRegion *mr = iotlb_to_region(cpu, physaddr, iotlbentry->attrs);
+    bool locked = false;
 
     physaddr = (physaddr & TARGET_PAGE_MASK) + addr;
     cpu->mem_io_pc = retaddr;
@@ -159,8 +160,15 @@  static inline DATA_TYPE glue(io_read, SUFFIX)(CPUArchState *env,
     }
 
     cpu->mem_io_vaddr = addr;
+    if (mr->global_locking) {
+        qemu_mutex_lock_iothread();
+        locked = true;
+    }
     memory_region_dispatch_read(mr, physaddr, &val, 1 << SHIFT,
                                 iotlbentry->attrs);
+    if (locked) {
+        qemu_mutex_unlock_iothread();
+    }
     return val;
 }
 #endif
@@ -358,6 +366,7 @@  static inline void glue(io_write, SUFFIX)(CPUArchState *env,
     CPUState *cpu = ENV_GET_CPU(env);
     hwaddr physaddr = iotlbentry->addr;
     MemoryRegion *mr = iotlb_to_region(cpu, physaddr, iotlbentry->attrs);
+    bool locked = false;
 
     physaddr = (physaddr & TARGET_PAGE_MASK) + addr;
     if (mr != &io_mem_rom && mr != &io_mem_notdirty && !cpu->can_do_io) {
@@ -366,8 +375,16 @@  static inline void glue(io_write, SUFFIX)(CPUArchState *env,
 
     cpu->mem_io_vaddr = addr;
     cpu->mem_io_pc = retaddr;
+
+    if (mr->global_locking) {
+        qemu_mutex_lock_iothread();
+        locked = true;
+    }
     memory_region_dispatch_write(mr, physaddr, val, 1 << SHIFT,
                                  iotlbentry->attrs);
+    if (locked) {
+        qemu_mutex_unlock_iothread();
+    }
 }
 
 void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
diff --git a/target-i386/smm_helper.c b/target-i386/smm_helper.c
index 4dd6a2c..6a5489b 100644
--- a/target-i386/smm_helper.c
+++ b/target-i386/smm_helper.c
@@ -18,6 +18,7 @@ 
  */
 
 #include "qemu/osdep.h"
+#include "qemu/main-loop.h"
 #include "cpu.h"
 #include "exec/helper-proto.h"
 #include "exec/log.h"
@@ -42,11 +43,14 @@  void helper_rsm(CPUX86State *env)
 #define SMM_REVISION_ID 0x00020000
 #endif
 
+/* Called with iothread lock taken */
 void cpu_smm_update(X86CPU *cpu)
 {
     CPUX86State *env = &cpu->env;
     bool smm_enabled = (env->hflags & HF_SMM_MASK);
 
+    g_assert(qemu_mutex_iothread_locked());
+
     if (cpu->smram) {
         memory_region_set_enabled(cpu->smram, smm_enabled);
     }
@@ -333,7 +337,10 @@  void helper_rsm(CPUX86State *env)
     }
     env->hflags2 &= ~HF2_SMM_INSIDE_NMI_MASK;
     env->hflags &= ~HF_SMM_MASK;
+
+    qemu_mutex_lock_iothread();
     cpu_smm_update(cpu);
+    qemu_mutex_unlock_iothread();
 
     qemu_log_mask(CPU_LOG_INT, "SMM: after RSM\n");
     log_cpu_state_mask(CPU_LOG_INT, CPU(cpu), CPU_DUMP_CCOP);
diff --git a/translate-all.c b/translate-all.c
index 935d24c..0c377bb 100644
--- a/translate-all.c
+++ b/translate-all.c
@@ -1318,6 +1318,7 @@  void tb_invalidate_phys_range(tb_page_addr_t start, tb_page_addr_t end)
  * this TB.
  *
  * Called with mmap_lock held for user-mode emulation
+ * If called from generated code, iothread mutex must not be held.
  */
 void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end,
                                    int is_cpu_write_access)
@@ -1430,7 +1431,9 @@  void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end,
 }
 
 #ifdef CONFIG_SOFTMMU
-/* len must be <= 8 and start must be a multiple of len */
+/* len must be <= 8 and start must be a multiple of len.
+ * Called via softmmu_template.h, with iothread mutex not held.
+ */
 void tb_invalidate_phys_page_fast(tb_page_addr_t start, int len)
 {
     PageDesc *p;
@@ -1579,6 +1582,7 @@  static TranslationBlock *tb_find_pc(uintptr_t tc_ptr)
 }
 
 #if !defined(CONFIG_USER_ONLY)
+/* If called from generated code, iothread mutex must not be held.  */
 void tb_invalidate_phys_addr(AddressSpace *as, hwaddr addr)
 {
     ram_addr_t ram_addr;
@@ -1625,7 +1629,10 @@  void tb_check_watchpoint(CPUState *cpu)
 
 #ifndef CONFIG_USER_ONLY
 /* in deterministic execution mode, instructions doing device I/Os
-   must be at the end of the TB */
+ * must be at the end of the TB.
+ *
+ * Called by softmmu_template.h, with iothread mutex not held.
+ */
 void cpu_io_recompile(CPUState *cpu, uintptr_t retaddr)
 {
 #if defined(TARGET_MIPS) || defined(TARGET_SH4)