[v2,3/4] xen/x86: Replace remaining mandatory barriers with SMP barriers

Message ID 1502882530-31700-4-git-send-email-andrew.cooper3@citrix.com (mailing list archive)
State New, archived

Commit Message

Andrew Cooper Aug. 16, 2017, 11:22 a.m. UTC
There is no functional change.  Xen currently assigns smp_* meaning to
the non-smp_* barriers.

All of these uses are just to deal with shared memory between multiple
processors, so use the smp_*() variants, which are the correct barriers for
the purpose.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
CC: Jan Beulich <JBeulich@suse.com>

v2:
 * Drop more unnecessary barriers, rather than converting them to smp
---
 xen/arch/x86/acpi/cpu_idle.c      |  8 ++++----
 xen/arch/x86/cpu/mcheck/barrier.c | 10 +++++-----
 xen/arch/x86/cpu/mcheck/mctelem.c |  4 ++--
 xen/arch/x86/genapic/x2apic.c     |  6 +++---
 xen/arch/x86/hpet.c               |  2 +-
 xen/arch/x86/hvm/ioreq.c          |  4 ++--
 xen/arch/x86/irq.c                |  4 ++--
 xen/arch/x86/smpboot.c            | 12 ++++++------
 xen/arch/x86/time.c               |  8 ++++----
 xen/include/asm-x86/desc.h        |  8 ++++----
 xen/include/asm-x86/system.h      |  2 +-
 11 files changed, 34 insertions(+), 34 deletions(-)
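
For context, the x86 barrier macros in xen/include/asm-x86/system.h currently
look roughly like the sketch below (paraphrased; the exact fence instructions
behind the mandatory barriers are an assumption here, while the smp_* aliasing
is visible in the hunk at the end of this patch):

    /* Paraphrased sketch -- mandatory-barrier definitions assumed. */
    #define mb()            asm volatile ( "mfence" : : : "memory" )
    #define rmb()           asm volatile ( "lfence" : : : "memory" )
    #define wmb()           asm volatile ( "sfence" : : : "memory" )

    /* The smp_* variants are today plain aliases of the mandatory ones. */
    #define smp_mb()        mb()
    #define smp_rmb()       rmb()
    #define smp_wmb()       wmb()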

Comments

Dario Faggioli Aug. 16, 2017, 3:42 p.m. UTC | #1
On Wed, 2017-08-16 at 12:22 +0100, Andrew Cooper wrote:
> There is no functional change.  Xen currently assignes smp_* meaning
> to
> the non-smp_* barriers.
> 
> All of these uses are just to deal with shared memory between
> multiple
> processors, so use the smp_*() which are the correct barriers for the
> purpose.
> 
FWIW, I had to deal with barriers recently, and this is much
appreciated! :-)

> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
>
Reviewed-by: Dario Faggioli <dario.faggioli@citrix.com>

Regards,
Dario
Jan Beulich Aug. 17, 2017, 8:37 a.m. UTC | #2
>>> On 16.08.17 at 13:22, <andrew.cooper3@citrix.com> wrote:
> There is no functional change.  Xen currently assignes smp_* meaning to
> the non-smp_* barriers.
> 
> All of these uses are just to deal with shared memory between multiple
> processors, so use the smp_*() which are the correct barriers for the 
> purpose.

Taking this together with ...

> --- a/xen/arch/x86/acpi/cpu_idle.c
> +++ b/xen/arch/x86/acpi/cpu_idle.c
> @@ -390,9 +390,9 @@ void mwait_idle_with_hints(unsigned int eax, unsigned int ecx)
>  
>      if ( boot_cpu_has(X86_FEATURE_CLFLUSH_MONITOR) )
>      {
> -        mb();
> +        smp_mb();
>          clflush((void *)&mwait_wakeup(cpu));
> -        mb();
> +        smp_mb();
>      }

See commit 48d32458bc ("x86, idle: add barriers to CLFLUSH
workaround") for why these better stay the way they are.

> @@ -755,10 +755,10 @@ void acpi_dead_idle(void)
>               * instruction, hence memory fence is necessary to make sure all 
>               * load/store visible before flush cache line.
>               */
> -            mb();
> +            smp_mb();
>              clflush(mwait_ptr);
>              __monitor(mwait_ptr, 0, 0);
> -            mb();
> +            smp_mb();
>              __mwait(cx->address, 0);

... the comment the tail of which is in context here, I'm rather
surprised you convert these: They're there strictly for
correctness on a single processor (the need for prior memory
accesses to be visible isn't limited to the CPUs in the system).

In both cases, while smp_mb() and mb() are the same, I'd rather
keep the distinction at use sites with the assumption that the
smp_* ones would expand to just barrier() when !CONFIG_SMP (a
configuration we currently simply don't allow). The only alternative
I see would be to open-code the fences.
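
Something along these lines, purely as a hypothetical sketch (we have no
!CONFIG_SMP build today, so the names and placement are illustrative only):

    #ifdef CONFIG_SMP
    #define smp_mb()        mb()        /* real fence: other CPUs observe us */
    #define smp_wmb()       wmb()
    #else
    #define smp_mb()        barrier()   /* compiler barrier only */
    #define smp_wmb()       barrier()
    #endif
    /*
     * mb()/wmb() would remain real fences either way, which is what the
     * MONITOR/CLFLUSH interaction above needs even with a single CPU.
     */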

> --- a/xen/arch/x86/hvm/ioreq.c
> +++ b/xen/arch/x86/hvm/ioreq.c
> @@ -91,7 +91,7 @@ static bool_t hvm_wait_for_io(struct hvm_ioreq_vcpu *sv, ioreq_t *p)
>      {
>          unsigned int state = p->state;
>  
> -        rmb();
> +        smp_rmb();
>          switch ( state )
>          {
>          case STATE_IOREQ_NONE:
> @@ -1327,7 +1327,7 @@ static int hvm_send_buffered_ioreq(struct hvm_ioreq_server *s, ioreq_t *p)
>      }
>  
>      /* Make the ioreq_t visible /before/ write_pointer. */
> -    wmb();
> +    smp_wmb();
>      pg->ptrs.write_pointer += qw ? 2 : 1;

I agree with these changes, but it needs to be clear that their
counterparts cannot be smp_?mb().
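
E.g. a consumer along these lines (a purely hypothetical sketch of an
external emulator's loop; process_ioreq() is a stand-in name):

    /* The emulator may be an ordinary (possibly UP) process, so its read of
     * write_pointer must be ordered before reading the slots by a real
     * fence, not one an SMP-conditional definition could compile away. */
    while ( read_pointer != pg->ptrs.write_pointer )
    {
        rmb();
        process_ioreq(&pg->buf_ioreq[read_pointer % IOREQ_BUFFER_SLOT_NUM]);
        read_pointer++;
    }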

> --- a/xen/arch/x86/time.c
> +++ b/xen/arch/x86/time.c
> @@ -976,10 +976,10 @@ static void __update_vcpu_system_time(struct vcpu *v, int force)
>  
>      /* 1. Update guest kernel version. */
>      _u.version = u->version = version_update_begin(u->version);
> -    wmb();
> +    smp_wmb();
>      /* 2. Update all other guest kernel fields. */
>      *u = _u;
> -    wmb();
> +    smp_wmb();
>      /* 3. Update guest kernel version. */
>      u->version = version_update_end(u->version);
>  
> @@ -1006,10 +1006,10 @@ bool update_secondary_system_time(struct vcpu *v,
>          update_guest_memory_policy(v, &policy);
>          return false;
>      }
> -    wmb();
> +    smp_wmb();
>      /* 2. Update all other userspace fields. */
>      __copy_to_guest(user_u, u, 1);
> -    wmb();
> +    smp_wmb();
>      /* 3. Update userspace version. */
>      u->version = version_update_end(u->version);
>      __copy_field_to_guest(user_u, u, version);

Same for these.
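
The reader being, roughly (a hypothetical sketch of the consuming side):

    /* Hypothetical sketch of a reader of the version protocol above; it
     * needs real fences even if the consumer itself is uniprocessor. */
    do {
        version = u->version;
        rmb();
        local = *u;          /* snapshot all other fields */
        rmb();
    } while ( (version & 1) || (version != u->version) );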

So with the cpu_idle.c changes dropped or replaced by open-coded
fences:
Reviewed-by: Jan Beulich <jbeulich@suse.com>

Jan
Andrew Cooper Aug. 17, 2017, 9:35 a.m. UTC | #3
On 17/08/17 09:37, Jan Beulich wrote:
>>>> On 16.08.17 at 13:22, <andrew.cooper3@citrix.com> wrote:
>> There is no functional change.  Xen currently assignes smp_* meaning to
>> the non-smp_* barriers.
>>
>> All of these uses are just to deal with shared memory between multiple
>> processors, so use the smp_*() which are the correct barriers for the 
>> purpose.
> Taking this together with ...
>
>> --- a/xen/arch/x86/acpi/cpu_idle.c
>> +++ b/xen/arch/x86/acpi/cpu_idle.c
>> @@ -390,9 +390,9 @@ void mwait_idle_with_hints(unsigned int eax, unsigned int ecx)
>>  
>>      if ( boot_cpu_has(X86_FEATURE_CLFLUSH_MONITOR) )
>>      {
>> -        mb();
>> +        smp_mb();
>>          clflush((void *)&mwait_wakeup(cpu));
>> -        mb();
>> +        smp_mb();
>>      }
> See commit 48d32458bc ("x86, idle: add barriers to CLFLUSH
> workaround") for why these better stay the way they are.
>
>> @@ -755,10 +755,10 @@ void acpi_dead_idle(void)
>>               * instruction, hence memory fence is necessary to make sure all 
>>               * load/store visible before flush cache line.
>>               */
>> -            mb();
>> +            smp_mb();
>>              clflush(mwait_ptr);
>>              __monitor(mwait_ptr, 0, 0);
>> -            mb();
>> +            smp_mb();
>>              __mwait(cx->address, 0);
> ... the comment the tail of which is in context here, I'm rather
> surprised you convert these: They're there strictly for
> correctness on a single processor (the need for prior memory
> accesses to be visible isn't limited to the CPUs in the system).
>
> In both cases, while smp_mb() and mb() are the same, I'd rather
> keep the distinction at use sites with the assumption that the
> smp_* ones would expand to just barrier() when !CONFIG_SMP (a
> configuration we currently simply don't allow). The only alternative
> I see would be to open-code the fences.

Yeah - in hindsight they should logically stay as mb() (even though, as
you say, there is no functional change).

>
>> --- a/xen/arch/x86/hvm/ioreq.c
>> +++ b/xen/arch/x86/hvm/ioreq.c
>> @@ -91,7 +91,7 @@ static bool_t hvm_wait_for_io(struct hvm_ioreq_vcpu *sv, ioreq_t *p)
>>      {
>>          unsigned int state = p->state;
>>  
>> -        rmb();
>> +        smp_rmb();
>>          switch ( state )
>>          {
>>          case STATE_IOREQ_NONE:
>> @@ -1327,7 +1327,7 @@ static int hvm_send_buffered_ioreq(struct hvm_ioreq_server *s, ioreq_t *p)
>>      }
>>  
>>      /* Make the ioreq_t visible /before/ write_pointer. */
>> -    wmb();
>> +    smp_wmb();
>>      pg->ptrs.write_pointer += qw ? 2 : 1;
> I agree with these changes, but it needs to be clear that their
> counterparts cannot be smp_?mb().
>
>> --- a/xen/arch/x86/time.c
>> +++ b/xen/arch/x86/time.c
>> @@ -976,10 +976,10 @@ static void __update_vcpu_system_time(struct vcpu *v, int force)
>>  
>>      /* 1. Update guest kernel version. */
>>      _u.version = u->version = version_update_begin(u->version);
>> -    wmb();
>> +    smp_wmb();
>>      /* 2. Update all other guest kernel fields. */
>>      *u = _u;
>> -    wmb();
>> +    smp_wmb();
>>      /* 3. Update guest kernel version. */
>>      u->version = version_update_end(u->version);
>>  
>> @@ -1006,10 +1006,10 @@ bool update_secondary_system_time(struct vcpu *v,
>>          update_guest_memory_policy(v, &policy);
>>          return false;
>>      }
>> -    wmb();
>> +    smp_wmb();
>>      /* 2. Update all other userspace fields. */
>>      __copy_to_guest(user_u, u, 1);
>> -    wmb();
>> +    smp_wmb();
>>      /* 3. Update userspace version. */
>>      u->version = version_update_end(u->version);
>>      __copy_field_to_guest(user_u, u, version);
> Same fore these.

Why?  The guest side of this protocol is just reads.

Irrespective, how do you suggest I make things more clear?

~Andrew
Jan Beulich Aug. 17, 2017, 10:01 a.m. UTC | #4
>>> On 17.08.17 at 11:35, <andrew.cooper3@citrix.com> wrote:
> On 17/08/17 09:37, Jan Beulich wrote:
>>>>> On 16.08.17 at 13:22, <andrew.cooper3@citrix.com> wrote:
>>> --- a/xen/arch/x86/time.c
>>> +++ b/xen/arch/x86/time.c
>>> @@ -976,10 +976,10 @@ static void __update_vcpu_system_time(struct vcpu *v, int force)
>>>  
>>>      /* 1. Update guest kernel version. */
>>>      _u.version = u->version = version_update_begin(u->version);
>>> -    wmb();
>>> +    smp_wmb();
>>>      /* 2. Update all other guest kernel fields. */
>>>      *u = _u;
>>> -    wmb();
>>> +    smp_wmb();
>>>      /* 3. Update guest kernel version. */
>>>      u->version = version_update_end(u->version);
>>>  
>>> @@ -1006,10 +1006,10 @@ bool update_secondary_system_time(struct vcpu *v,
>>>          update_guest_memory_policy(v, &policy);
>>>          return false;
>>>      }
>>> -    wmb();
>>> +    smp_wmb();
>>>      /* 2. Update all other userspace fields. */
>>>      __copy_to_guest(user_u, u, 1);
>>> -    wmb();
>>> +    smp_wmb();
>>>      /* 3. Update userspace version. */
>>>      u->version = version_update_end(u->version);
>>>      __copy_field_to_guest(user_u, u, version);
>> Same fore these.
> 
> Why?  The guest side of this protocol is just reads.

As always (and as you keep stressing) barriers make sense almost
exclusively when they're being used on both sides. This applies
here too. It's just that even a non-SMP consumer would need
barriers; that's along the lines of why Linux has gained virt_mb().
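
For reference, a paraphrased sketch of what Linux does in its
include/asm-generic/barrier.h (details may differ):

    /* virt_* barriers order accesses to memory shared with a hypervisor or
     * other agent, so they keep SMP strength even in a UP kernel where the
     * plain smp_* ones reduce to compiler barriers. */
    #define virt_mb()       __smp_mb()
    #define virt_rmb()      __smp_rmb()
    #define virt_wmb()      __smp_wmb()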

> Irrespective, how do you suggest I make things more clear?

Well, it was more a remark than a request for you to change
anything. I agree there's little point in adding a comment on the
hypervisor side of things.

Jan

Patch

diff --git a/xen/arch/x86/acpi/cpu_idle.c b/xen/arch/x86/acpi/cpu_idle.c
index 5879ad6..dea834c 100644
--- a/xen/arch/x86/acpi/cpu_idle.c
+++ b/xen/arch/x86/acpi/cpu_idle.c
@@ -390,9 +390,9 @@  void mwait_idle_with_hints(unsigned int eax, unsigned int ecx)
 
     if ( boot_cpu_has(X86_FEATURE_CLFLUSH_MONITOR) )
     {
-        mb();
+        smp_mb();
         clflush((void *)&mwait_wakeup(cpu));
-        mb();
+        smp_mb();
     }
 
     __monitor((void *)&mwait_wakeup(cpu), 0, 0);
@@ -755,10 +755,10 @@  void acpi_dead_idle(void)
              * instruction, hence memory fence is necessary to make sure all 
              * load/store visible before flush cache line.
              */
-            mb();
+            smp_mb();
             clflush(mwait_ptr);
             __monitor(mwait_ptr, 0, 0);
-            mb();
+            smp_mb();
             __mwait(cx->address, 0);
         }
     }
diff --git a/xen/arch/x86/cpu/mcheck/barrier.c b/xen/arch/x86/cpu/mcheck/barrier.c
index 7de8e45..a7e5b19 100644
--- a/xen/arch/x86/cpu/mcheck/barrier.c
+++ b/xen/arch/x86/cpu/mcheck/barrier.c
@@ -12,7 +12,7 @@  void mce_barrier_init(struct mce_softirq_barrier *bar)
 void mce_barrier_dec(struct mce_softirq_barrier *bar)
 {
     atomic_inc(&bar->outgen);
-    wmb();
+    smp_wmb();
     atomic_dec(&bar->val);
 }
 
@@ -24,12 +24,12 @@  void mce_barrier_enter(struct mce_softirq_barrier *bar, bool wait)
         return;
     atomic_inc(&bar->ingen);
     gen = atomic_read(&bar->outgen);
-    mb();
+    smp_mb();
     atomic_inc(&bar->val);
     while ( atomic_read(&bar->val) != num_online_cpus() &&
             atomic_read(&bar->outgen) == gen )
     {
-            mb();
+            smp_mb();
             mce_panic_check();
     }
 }
@@ -42,12 +42,12 @@  void mce_barrier_exit(struct mce_softirq_barrier *bar, bool wait)
         return;
     atomic_inc(&bar->outgen);
     gen = atomic_read(&bar->ingen);
-    mb();
+    smp_mb();
     atomic_dec(&bar->val);
     while ( atomic_read(&bar->val) != 0 &&
             atomic_read(&bar->ingen) == gen )
     {
-            mb();
+            smp_mb();
             mce_panic_check();
     }
 }
diff --git a/xen/arch/x86/cpu/mcheck/mctelem.c b/xen/arch/x86/cpu/mcheck/mctelem.c
index 1731514..b071dc8 100644
--- a/xen/arch/x86/cpu/mcheck/mctelem.c
+++ b/xen/arch/x86/cpu/mcheck/mctelem.c
@@ -501,9 +501,9 @@  static void mctelem_append_processing(mctelem_class_t which)
 		ltep->mcte_prev = *procltp;
 		*procltp = dangling[target];
 	}
-	wmb();
+	smp_wmb();
 	dangling[target] = NULL;
-	wmb();
+	smp_wmb();
 }
 
 mctelem_cookie_t mctelem_consume_oldest_begin(mctelem_class_t which)
diff --git a/xen/arch/x86/genapic/x2apic.c b/xen/arch/x86/genapic/x2apic.c
index 5fffb31..4779b0d 100644
--- a/xen/arch/x86/genapic/x2apic.c
+++ b/xen/arch/x86/genapic/x2apic.c
@@ -106,12 +106,12 @@  static void send_IPI_mask_x2apic_phys(const cpumask_t *cpumask, int vector)
      * CPU is seen by notified remote CPUs. The WRMSR contained within
      * apic_icr_write() can otherwise be executed early.
      * 
-     * The reason mb() is sufficient here is subtle: the register arguments
+     * The reason smp_mb() is sufficient here is subtle: the register arguments
      * to WRMSR must depend on a memory read executed after the barrier. This
      * is guaranteed by cpu_physical_id(), which reads from a global array (and
      * so cannot be hoisted above the barrier even by a clever compiler).
      */
-    mb();
+    smp_mb();
 
     local_irq_save(flags);
 
@@ -135,7 +135,7 @@  static void send_IPI_mask_x2apic_cluster(const cpumask_t *cpumask, int vector)
     const cpumask_t *cluster_cpus;
     unsigned long flags;
 
-    mb(); /* See above for an explanation. */
+    smp_mb(); /* See above for an explanation. */
 
     local_irq_save(flags);
 
diff --git a/xen/arch/x86/hpet.c b/xen/arch/x86/hpet.c
index 8229c63..bc7a851 100644
--- a/xen/arch/x86/hpet.c
+++ b/xen/arch/x86/hpet.c
@@ -608,7 +608,7 @@  void __init hpet_broadcast_init(void)
         hpet_events[i].shift = 32;
         hpet_events[i].next_event = STIME_MAX;
         spin_lock_init(&hpet_events[i].lock);
-        wmb();
+        smp_wmb();
         hpet_events[i].event_handler = handle_hpet_broadcast;
 
         hpet_events[i].msi.msi_attrib.maskbit = 1;
diff --git a/xen/arch/x86/hvm/ioreq.c b/xen/arch/x86/hvm/ioreq.c
index b2a8b0e..e9851f6 100644
--- a/xen/arch/x86/hvm/ioreq.c
+++ b/xen/arch/x86/hvm/ioreq.c
@@ -91,7 +91,7 @@  static bool_t hvm_wait_for_io(struct hvm_ioreq_vcpu *sv, ioreq_t *p)
     {
         unsigned int state = p->state;
 
-        rmb();
+        smp_rmb();
         switch ( state )
         {
         case STATE_IOREQ_NONE:
@@ -1327,7 +1327,7 @@  static int hvm_send_buffered_ioreq(struct hvm_ioreq_server *s, ioreq_t *p)
     }
 
     /* Make the ioreq_t visible /before/ write_pointer. */
-    wmb();
+    smp_wmb();
     pg->ptrs.write_pointer += qw ? 2 : 1;
 
     /* Canonicalize read/write pointers to prevent their overflow. */
diff --git a/xen/arch/x86/irq.c b/xen/arch/x86/irq.c
index 57e6c18..ee9afd8 100644
--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -759,9 +759,9 @@  void irq_set_affinity(struct irq_desc *desc, const cpumask_t *mask)
     
     ASSERT(spin_is_locked(&desc->lock));
     desc->status &= ~IRQ_MOVE_PENDING;
-    wmb();
+    smp_wmb();
     cpumask_copy(desc->arch.pending_mask, mask);
-    wmb();
+    smp_wmb();
     desc->status |= IRQ_MOVE_PENDING;
 }
 
diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c
index 5b094b4..ee17f6d 100644
--- a/xen/arch/x86/smpboot.c
+++ b/xen/arch/x86/smpboot.c
@@ -79,7 +79,7 @@  static enum cpu_state {
     CPU_STATE_CALLIN,   /* slave -> master: Completed phase 2 */
     CPU_STATE_ONLINE    /* master -> slave: Go fully online now. */
 } cpu_state;
-#define set_cpu_state(state) do { mb(); cpu_state = (state); } while (0)
+#define set_cpu_state(state) do { smp_mb(); cpu_state = (state); } while (0)
 
 void *stack_base[NR_CPUS];
 
@@ -126,7 +126,7 @@  static void synchronize_tsc_master(unsigned int slave)
     for ( i = 1; i <= 5; i++ )
     {
         tsc_value = rdtsc_ordered();
-        wmb();
+        smp_wmb();
         atomic_inc(&tsc_count);
         while ( atomic_read(&tsc_count) != (i<<1) )
             cpu_relax();
@@ -151,7 +151,7 @@  static void synchronize_tsc_slave(unsigned int slave)
     {
         while ( atomic_read(&tsc_count) != ((i<<1)-1) )
             cpu_relax();
-        rmb();
+        smp_rmb();
         /*
          * If a CPU has been physically hotplugged, we may as well write
          * to its TSC in spite of X86_FEATURE_TSC_RELIABLE. The platform does
@@ -553,13 +553,13 @@  static int do_boot_cpu(int apicid, int cpu)
         }
         else if ( cpu_state == CPU_STATE_DEAD )
         {
-            rmb();
+            smp_rmb();
             rc = cpu_error;
         }
         else
         {
             boot_error = 1;
-            mb();
+            smp_mb();
             if ( bootsym(trampoline_cpu_started) == 0xA5 )
                 /* trampoline started but...? */
                 printk("Stuck ??\n");
@@ -577,7 +577,7 @@  static int do_boot_cpu(int apicid, int cpu)
 
     /* mark "stuck" area as not stuck */
     bootsym(trampoline_cpu_started) = 0;
-    mb();
+    smp_mb();
 
     smpboot_restore_warm_reset_vector();
 
diff --git a/xen/arch/x86/time.c b/xen/arch/x86/time.c
index b988b94..a7d7d77 100644
--- a/xen/arch/x86/time.c
+++ b/xen/arch/x86/time.c
@@ -976,10 +976,10 @@  static void __update_vcpu_system_time(struct vcpu *v, int force)
 
     /* 1. Update guest kernel version. */
     _u.version = u->version = version_update_begin(u->version);
-    wmb();
+    smp_wmb();
     /* 2. Update all other guest kernel fields. */
     *u = _u;
-    wmb();
+    smp_wmb();
     /* 3. Update guest kernel version. */
     u->version = version_update_end(u->version);
 
@@ -1006,10 +1006,10 @@  bool update_secondary_system_time(struct vcpu *v,
         update_guest_memory_policy(v, &policy);
         return false;
     }
-    wmb();
+    smp_wmb();
     /* 2. Update all other userspace fields. */
     __copy_to_guest(user_u, u, 1);
-    wmb();
+    smp_wmb();
     /* 3. Update userspace version. */
     u->version = version_update_end(u->version);
     __copy_field_to_guest(user_u, u, version);
diff --git a/xen/include/asm-x86/desc.h b/xen/include/asm-x86/desc.h
index da924bf..9956aae 100644
--- a/xen/include/asm-x86/desc.h
+++ b/xen/include/asm-x86/desc.h
@@ -128,10 +128,10 @@  static inline void _write_gate_lower(volatile idt_entry_t *gate,
 #define _set_gate(gate_addr,type,dpl,addr)               \
 do {                                                     \
     (gate_addr)->a = 0;                                  \
-    wmb(); /* disable gate /then/ rewrite */             \
+    smp_wmb(); /* disable gate /then/ rewrite */         \
     (gate_addr)->b =                                     \
         ((unsigned long)(addr) >> 32);                   \
-    wmb(); /* rewrite /then/ enable gate */              \
+    smp_wmb(); /* rewrite /then/ enable gate */          \
     (gate_addr)->a =                                     \
         (((unsigned long)(addr) & 0xFFFF0000UL) << 32) | \
         ((unsigned long)(dpl) << 45) |                   \
@@ -174,11 +174,11 @@  static inline void _update_gate_addr_lower(idt_entry_t *gate, void *addr)
 #define _set_tssldt_desc(desc,addr,limit,type)           \
 do {                                                     \
     (desc)[0].b = (desc)[1].b = 0;                       \
-    wmb(); /* disable entry /then/ rewrite */            \
+    smp_wmb(); /* disable entry /then/ rewrite */        \
     (desc)[0].a =                                        \
         ((u32)(addr) << 16) | ((u32)(limit) & 0xFFFF);   \
     (desc)[1].a = (u32)(((unsigned long)(addr)) >> 32);  \
-    wmb(); /* rewrite /then/ enable entry */             \
+    smp_wmb(); /* rewrite /then/ enable entry */         \
     (desc)[0].b =                                        \
         ((u32)(addr) & 0xFF000000U) |                    \
         ((u32)(type) << 8) | 0x8000U |                   \
diff --git a/xen/include/asm-x86/system.h b/xen/include/asm-x86/system.h
index eb498f5..9cb6fd7 100644
--- a/xen/include/asm-x86/system.h
+++ b/xen/include/asm-x86/system.h
@@ -183,7 +183,7 @@  static always_inline unsigned long __xadd(
 #define smp_wmb()       wmb()
 
 #define set_mb(var, value) do { xchg(&var, value); } while (0)
-#define set_wmb(var, value) do { var = value; wmb(); } while (0)
+#define set_wmb(var, value) do { var = value; smp_wmb(); } while (0)
 
 #define local_irq_disable()     asm volatile ( "cli" : : : "memory" )
 #define local_irq_enable()      asm volatile ( "sti" : : : "memory" )