
[for-4.7] x86/emulate: synchronize LOCKed instruction emulation

Message ID 1460550374-4344-1-git-send-email-rcojocaru@bitdefender.com (mailing list archive)
State New, archived

Commit Message

Razvan Cojocaru April 13, 2016, 12:26 p.m. UTC
LOCK-prefixed instructions are currently allowed to run in parallel
in x86_emulate(), which can lead the guest into an undefined state.
This patch fixes the issue.

Signed-off-by: Razvan Cojocaru <rcojocaru@bitdefender.com>
---
 tools/tests/x86_emulator/test_x86_emulator.c | 12 ++++++++++++
 xen/arch/x86/hvm/emulate.c                   | 26 ++++++++++++++++++++++++++
 xen/arch/x86/mm.c                            |  3 +++
 xen/arch/x86/mm/shadow/common.c              |  4 ++++
 xen/arch/x86/x86_emulate/x86_emulate.c       | 23 ++++++++++++++++++++++-
 xen/arch/x86/x86_emulate/x86_emulate.h       |  8 ++++++++
 xen/common/domain.c                          |  2 ++
 xen/include/asm-x86/domain.h                 |  4 ++++
 xen/include/asm-x86/hvm/emulate.h            |  3 +++
 9 files changed, 84 insertions(+), 1 deletion(-)
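
For orientation, the hook pair this adds to the emulator interface looks
roughly as follows. This is a sketch only: the hook names appear in the review
below, but the exact signatures and the percpu-rwlock wiring shown here are
assumptions, not the literal patch.

    /* Sketch: signatures and wiring are assumptions, not the exact patch. */
    struct x86_emulate_ops {
        /* ... existing hooks ... */
        void (*smp_lock)(int lock_prefix);
        void (*smp_unlock)(int lock_prefix);
    };

    /* Possible HVM implementation: LOCKed instructions take the per-domain
     * percpu rwlock for writing (exclusive), everything else for reading. */
    static void hvmemul_smp_lock(int lock_prefix)
    {
        struct domain *d = current->domain;

        if ( lock_prefix )
            percpu_write_lock(emulate_locked_rwlock, &d->arch.emulate_lock);
        else
            percpu_read_lock(emulate_locked_rwlock, &d->arch.emulate_lock);
    }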

Comments

Jan Beulich April 14, 2016, 4:35 a.m. UTC | #1
>>> Razvan Cojocaru <rcojocaru@bitdefender.com> 04/13/16 7:53 PM >>>
>LOCK-prefixed instructions are currently allowed to run in parallel
>in x86_emulate(), which can lead the guest into an undefined state.
>This patch fixes the issue.

... by ... (read: Too brief a description)

>--- a/xen/arch/x86/hvm/emulate.c
>+++ b/xen/arch/x86/hvm/emulate.c
>@@ -25,6 +25,8 @@
> #include <asm/hvm/svm/svm.h>
> #include <asm/vm_event.h>
> 
>+DEFINE_PERCPU_RWLOCK_GLOBAL(emulate_locked_rwlock);

You should try hard to make this static.

>--- a/xen/arch/x86/mm.c
>+++ b/xen/arch/x86/mm.c
>@@ -112,6 +112,7 @@
> #include <asm/ldt.h>
> #include <asm/x86_emulate.h>
> #include <asm/e820.h>
>+#include <asm/hvm/emulate.h>
 
This header shouldn't be included here. You need to move the declarations
elsewhere for them to be usable here.

>@@ -1589,6 +1591,8 @@ x86_emulate(
>     }
>  done_prefixes:
> 
>+    ops->smp_lock(lock_prefix);
>+
>     if ( rex_prefix & REX_W )
>         op_bytes = 8;
 
Already from the context it is obvious that the lock can be taken later.

>@@ -2380,7 +2390,12 @@ x86_emulate(
>         }
>         /* Write back the memory destination with implicit LOCK prefix. */
>         dst.val = src.val;
>-        lock_prefix = 1;
>+        if ( !lock_prefix )
>+        {
>+            ops->smp_unlock(lock_prefix);
>+            lock_prefix = 1;
>+            ops->smp_lock(lock_prefix);
>+        }
         
This can't be correct: You need to take the write lock _before_ reading the
memory location.
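
Concretely, a correct upgrade would have to look more like the following
sketch (illustrative only, not the submitted code), with the destination
(re)read under the write lock:

    /* Sketch: for an implicit-LOCK destination, the exclusive lock must
     * cover the read of the memory operand, not just the writeback. */
    if ( !lock_prefix )
    {
        ops->smp_unlock(0);   /* drop the shared (read) lock */
        lock_prefix = 1;
        ops->smp_lock(1);     /* take the exclusive (write) lock */
        /* ... only now read the memory destination ... */
    }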

>@@ -3859,6 +3874,9 @@ x86_emulate(
> done:
>     _put_fpu();
>     put_stub(stub);
>+
>+    ops->smp_unlock(lock_prefix);

And just like above from the context alone it is clear that the unlock can
happen earlier. Locked regions should always be as small as possible.

>--- a/xen/common/domain.c
>+++ b/xen/common/domain.c
>@@ -272,6 +272,8 @@ struct domain *domain_create(domid_t domid, unsigned int domcr_flags,
> 
>     TRACE_1D(TRC_DOM0_DOM_ADD, d->domain_id);
> 
>+    percpu_rwlock_resource_init(&d->arch.emulate_lock, emulate_locked_rwlock);

I cannot see how this would build on ARM.

Overall I can see why you want this, but what is the performance impact? After
all you're basically making the emulator act like very old systems using a bus
lock (i.e. a global one) instead of utilizing cacheline exclusive ownership. Plus
you still don't (and cannot, afaict) deal with one LOCKed instruction getting
emulated and another in parallel getting executed by the CPU without emulation
(granted this shouldn't normally happen, but we also can't exclude that case).

With the above I'm also not convinced this is an issue that absolutely needs to
be addressed in 4.7 - it's not a regression, and I suppose not a problem for
guests that aren't under introspection.

Jan
Razvan Cojocaru April 14, 2016, 5:56 a.m. UTC | #2
On 04/14/16 07:35, Jan Beulich wrote:
>>>> Razvan Cojocaru <rcojocaru@bitdefender.com> 04/13/16 7:53 PM >>>
>> LOCK-prefixed instructions are currently allowed to run in parallel
>> in x86_emulate(), which can lead the guest into an undefined state.
>> This patch fixes the issue.
> 
> ... by ... (read: Too brief a description)

I'll make it more detailed.

>> --- a/xen/arch/x86/hvm/emulate.c
>> +++ b/xen/arch/x86/hvm/emulate.c
>> @@ -25,6 +25,8 @@
>>  #include <asm/hvm/svm/svm.h>
>>  #include <asm/vm_event.h>
>> 
>> +DEFINE_PERCPU_RWLOCK_GLOBAL(emulate_locked_rwlock);
> 
> You should try hard to make this static.

I'll try.

>> --- a/xen/arch/x86/mm.c
>> +++ b/xen/arch/x86/mm.c
>> @@ -112,6 +112,7 @@
>>  #include <asm/ldt.h>
>>  #include <asm/x86_emulate.h>
>>  #include <asm/e820.h>
>> +#include <asm/hvm/emulate.h>
>  
> This header shouldn't be included here. You need to move the declarations
> elsewhere for them to be usable here.

Understood. Is there any place particularly fitting you could suggest?
Thanks!

>> @@ -1589,6 +1591,8 @@ x86_emulate(
>>     }
>>  done_prefixes:
>> 
>> +    ops->smp_lock(lock_prefix);
>> +
>>     if ( rex_prefix & REX_W )
>>         op_bytes = 8;
>  
> Already from the context it is obvious that the lock can be taken later.

I'll take it later.

>> @@ -2380,7 +2390,12 @@ x86_emulate(
>>         }
>>         /* Write back the memory destination with implicit LOCK prefix. */
>>         dst.val = src.val;
>> -        lock_prefix = 1;
>> +        if ( !lock_prefix )
>> +        {
>> +            ops->smp_unlock(lock_prefix);
>> +            lock_prefix = 1;
>> +            ops->smp_lock(lock_prefix);
>> +        }
>          
> This can't be correct: You need to take the write lock _before_ reading the
> memory location.

Right.

>> @@ -3859,6 +3874,9 @@ x86_emulate(
>>  done:
>>     _put_fpu();
>>     put_stub(stub);
>> +
>> +    ops->smp_unlock(lock_prefix);
> 
> And just like above from the context alone it is clear that the unlock can
> happen earlier. Locked regions should always be as small as possible.

I'll move it up. I was just following the return paths.

>> --- a/xen/common/domain.c
>> +++ b/xen/common/domain.c
>> @@ -272,6 +272,8 @@ struct domain *domain_create(domid_t domid, unsigned int domcr_flags,
>> 
>>     TRACE_1D(TRC_DOM0_DOM_ADD, d->domain_id);
>> 
>> +    percpu_rwlock_resource_init(&d->arch.emulate_lock, emulate_locked_rwlock);
> 
> I cannot see how this would build on ARM.

I'll add separate functions for ARM and x86, with a no-op for ARM.

> Overall I can see why you want this, but what is the performance impact? After
> all you're basically making the emulator act like very old systems using a bus
> lock (i.e. a global one) instead of utilizing cacheline exclusive ownership. Plus
> you still don't (and cannot, afaict) deal with one LOCKed instruction getting
> emulated and another in parallel getting executed by the CPU without emulation
> (granted this shouldn't normally happen, but we also can't exclude that case).

I was under the impression that for reads, with the new percpu_rwlocks
the performance impact is close to zero, with some performance impact
for writes - but writes should, for one, be more infrequent, and then
the added safety factor should make up for that.
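
(For reference, why the read side is nearly free: a simplified model of the
percpu-rwlock fast path, not Xen's exact code; this_cpu_slot() is a made-up
stand-in for the per-CPU bookkeeping.)

    /* Simplified model only: a reader usually just records itself in a
     * per-CPU slot, so no shared cacheline is written unless a writer
     * is active. */
    static void percpu_read_lock_sketch(struct percpu_rwlock_model *l)
    {
        this_cpu_slot(l) = l;              /* cheap per-CPU store */
        smp_mb();                          /* order against writer check */
        if ( unlikely(l->writer_activating) )
        {
            this_cpu_slot(l) = NULL;       /* undo, take the slow path */
            read_lock(&l->rwlock);
        }
    }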

This indeed doesn't guard against LOCKed instructions being run in
parallel with and without emulation, however that is a case that should
almost never occur - at least not with introspection, where currently
all emulation happens as a result of EPT faults - so either all
instructions hitting a restricted page are emulated, or all are run
directly. As long as all emulation can safely run in parallel and all
parallel non-emulation is also safe, it should be alright. But, yes,
this patch doesn't cover the case you're mentioning.

> With the above I'm also not convinced this is an issue that absolutely needs to
> be addressed in 4.7 - it's not a regression, and I suppose not a problem for
> guests that aren't under introspection.

The issue can also affect the pagetable and mmio hooks. Since Xen does
emulate outside of introspection, I thought that this has some immediate
importance. But obviously you know more about the code, so if you prefer
I can drop the "for-4.7" tag.


Thanks,
Razvan
Jürgen Groß April 14, 2016, 6:09 a.m. UTC | #3
On 14/04/16 07:56, Razvan Cojocaru wrote:
> This indeed doesn't guard against LOCKed instructions being run in
> parallel with and without emulation, however that is a case that should
> almost never occur - at least not with introspection, where currently
> all emulation happens as a result of EPT faults - so either all
> instructions hitting a restricted page are emulated, or all are run
> directly. As long as all emulation can safely run in parallel and all
> parallel non-emulation is also safe, it should be alright. But, yes,
> this patch doesn't cover the case you're mentioning.

What about grant pages? There could be parallel accesses from different
domains, one being introspected, the other not.


Juergen
Razvan Cojocaru April 14, 2016, 6:31 a.m. UTC | #4
On 04/14/16 09:09, Juergen Gross wrote:
> On 14/04/16 07:56, Razvan Cojocaru wrote:
>> This indeed doesn't guard against LOCKed instructions being run in
>> parallel with and without emulation, however that is a case that should
>> almost never occur - at least not with introspection, where currently
>> all emulation happens as a result of EPT faults - so either all
>> instructions hitting a restricted page are emulated, or all are run
>> directly. As long as all emulation can safely run in parallel and all
>> parallel non-emulation is also safe, it should be alright. But, yes,
>> this patch doesn't cover the case you're mentioning.
> 
> What about grant pages? There could be parallel accesses from different
> domains, one being introspected, the other not.

I'm not familiar with the code there, but the main issue is, I think,
LOCKed instructions that access (read / write) the same memory area - as
long as that doesn't happen, it should be fine, which may be the reason
why it hasn't caused problems so far.

While not perfect, I believe that the added safety is worth the small
performance impact for writes. I feel that going from unsafe parallel
emulation to safe parallel emulation is a good step to take, at least
until the problem can be fixed completely by more complex measures.


Thanks,
Razvan
Jürgen Groß April 14, 2016, 7:46 a.m. UTC | #5
On 14/04/16 08:31, Razvan Cojocaru wrote:
> On 04/14/16 09:09, Juergen Gross wrote:
>> On 14/04/16 07:56, Razvan Cojocaru wrote:
>>> This indeed doesn't guard against LOCKed instructions being run in
>>> parallel with and without emulation, however that is a case that should
>>> almost never occur - at least not with introspection, where currently
>>> all emulation happens as a result of EPT faults - so either all
>>> instructions hitting a restricted page are emulated, or all are run
>>> directly. As long as all emulation can safely run in parallel and all
>>> parallel non-emulation is also safe, it should be alright. But, yes,
>>> this patch doesn't cover the case you're mentioning.
>>
>> What about grant pages? There could be parallel accesses from different
>> domains, one being introspected, the other not.
> 
> I'm not familiar with the code there, but the main issue is, I think,
> LOCKed instructions that access (read / write) the same memory area - as
> long as that doesn't happen, it should be fine, which may be the reason
> why it hasn't caused problems so far.

Depends on the guest, I suppose. :-)

I've been bitten by this before in my former position: we had a custom
pv-driver in dom0 which wasn't using LOCKed instructions accessing a
grant page. Reason was dom0 had one vcpu only and the Linux kernel
patched all LOCKs away as it didn't deem them necessary. This
resulted in a very hard to debug communication failure between domU
and dom0.

> While not perfect, I believe that the added safety is worth the small
> performance impact for writes. I feel that going from unsafe parallel
> emulation to safe parallel emulation is a good step to take, at least
> until the problem can be fixed completely by more complex measures.

I'm fine with you saying for your use case the solution is good enough.

Just wanted to point out a possible problem. This might not happen
for most guest types, but you can't be sure. :-)


Juergen
Andrew Cooper April 14, 2016, 8:01 a.m. UTC | #6
On 14/04/2016 08:46, Juergen Gross wrote:
> On 14/04/16 08:31, Razvan Cojocaru wrote:
>> On 04/14/16 09:09, Juergen Gross wrote:
>>> On 14/04/16 07:56, Razvan Cojocaru wrote:
>>>> This indeed doesn't guard against LOCKed instructions being run in
>>>> parallel with and without emulation, however that is a case that should
>>>> almost never occur - at least not with introspection, where currently
>>>> all emulation happens as a result of EPT faults - so either all
>>>> instructions hitting a restricted page are emulated, or all are run
>>>> directly. As long as all emulation can safely run in parallel and all
>>>> parallel non-emulation is also safe, it should be alright. But, yes,
>>>> this patch doesn't cover the case you're mentioning.
>>> What about grant pages? There could be parallel accesses from different
>>> domains, one being introspected, the other not.
>> I'm not familiar with the code there, but the main issue is, I think,
>> LOCKed instructions that access (read / write) the same memory area - as
>> long as that doesn't happen, it should be fine, which may be the reason
>> why it hasn't caused problems so far.
> Depends on the guest, I suppose. :-)
>
> I've been bitten by this before in my former position: we had a custom
> pv-driver in dom0 which wasn't using LOCKed instructions accessing a
> grant page. Reason was dom0 had one vcpu only and the Linux kernel
> patched all LOCKs away as it didn't deem them necessary. This
> resulted in a very hard to debug communication failure between domU
> and dom0.
>
>> While not perfect, I believe that the added safety is worth the small
>> performance impact for writes. I feel that going from unsafe parallel
>> emulation to safe parallel emulation is a good step to take, at least
>> until the problem can be fixed completely by more complex measures.
> I'm fine with you saying for your use case the solution is good enough.
>
> Just wanted to point out a possible problem. This might not happen
> for most guest types, but you can't be sure. :-)

But accesses into a mapped grant don't trap for emulation.  Why would
locks here be any different to usual?

~Andrew
Andrew Cooper April 14, 2016, 8:07 a.m. UTC | #7
On 14/04/2016 06:56, Razvan Cojocaru wrote:
>
>>> --- a/xen/common/domain.c
>>> +++ b/xen/common/domain.c
>>> @@ -272,6 +272,8 @@ struct domain *domain_create(domid_t domid, unsigned int domcr_flags,
>>> 
>>>     TRACE_1D(TRC_DOM0_DOM_ADD, d->domain_id);
>>> 
>>> +    percpu_rwlock_resource_init(&d->arch.emulate_lock, emulate_locked_rwlock);
>> I cannot see how this would build on ARM.
> I'll add separate functions for ARM and x86, with a no-op for ARM.

Please move this line into arch_domain_create() arch/x86/domain.c

Strictly speaking, common code should never reference ->arch.  The bits
of ->arch which common code does touch should move up into common struct
domain.

~Andrew
Razvan Cojocaru April 14, 2016, 8:09 a.m. UTC | #8
On 04/14/2016 11:07 AM, Andrew Cooper wrote:
> On 14/04/2016 06:56, Razvan Cojocaru wrote:
>>
>>>> --- a/xen/common/domain.c
>>>> +++ b/xen/common/domain.c
>>>> @@ -272,6 +272,8 @@ struct domain *domain_create(domid_t domid, unsigned int domcr_flags,
>>>> 
>>>>     TRACE_1D(TRC_DOM0_DOM_ADD, d->domain_id);
>>>> 
>>>> +    percpu_rwlock_resource_init(&d->arch.emulate_lock, emulate_locked_rwlock);
>>> I cannot see how this would build on ARM.
>> I'll add separate functions for ARM and x86, with a no-op for ARM.
> 
> Please move this line into arch_domain_create() arch/x86/domain.c
> 
> Strictly speaking, common code should never reference ->arch.  The bits
> of ->arch which common code does touch should move up into common struct
> domain.

Of course, my bad. Will move it.


Thanks,
Razvan
Jürgen Groß April 14, 2016, 8:18 a.m. UTC | #9
On 14/04/16 10:01, Andrew Cooper wrote:
> On 14/04/2016 08:46, Juergen Gross wrote:
>> On 14/04/16 08:31, Razvan Cojocaru wrote:
>>> On 04/14/16 09:09, Juergen Gross wrote:
>>>> On 14/04/16 07:56, Razvan Cojocaru wrote:
>>>>> This indeed doesn't guard against LOCKed instructions being run in
>>>>> parallel with and without emulation, however that is a case that should
>>>>> almost never occur - at least not with introspection, where currently
>>>>> all emulation happens as a result of EPT faults - so either all
>>>>> instructions hitting a restricted page are emulated, or all are run
>>>>> directly. As long as all emulation can safely run in parallel and all
>>>>> parallel non-emulation is also safe, it should be alright. But, yes,
>>>>> this patch doesn't cover the case you're mentioning.
>>>> What about grant pages? There could be parallel accesses from different
>>>> domains, one being introspected, the other not.
>>> I'm not familiar with the code there, but the main issue is, I think,
>>> LOCKed instructions that access (read / write) the same memory area - as
>>> long as that doesn't happen, it should be fine, which may be the reason
>>> why it hasn't caused problems so far.
>> Depends on the guest, I suppose. :-)
>>
>> I've been bitten by this before in my former position: we had a custom
>> pv-driver in dom0 which wasn't using LOCKed instructions accessing a
>> grant page. Reason was dom0 had one vcpu only and the Linux kernel
>> patched all LOCKs away as it didn't deem them necessary. This
>> resulted in a very hard to debug communication failure between domU
>> and dom0.
>>
>>> While not perfect, I believe that the added safety is worth the small
>>> performance impact for writes. I feel that going from unsafe parallel
>>> emulation to safe parallel emulation is a good step to take, at least
>>> until the problem can be fixed completely by more complex measures.
>> I'm fine with you saying for your use case the solution is good enough.
>>
>> Just wanted to point out a possible problem. This might not happen
>> for most guest types, but you can't be sure. :-)
> 
> But accesses into a mapped grant don't trap for emulation.  Why would
> locks here be any different to usual?

With memory introspection switched on they will trap, won't they?


Juergen
Razvan Cojocaru April 14, 2016, 8:25 a.m. UTC | #10
On 04/14/2016 11:18 AM, Juergen Gross wrote:
> On 14/04/16 10:01, Andrew Cooper wrote:
>> On 14/04/2016 08:46, Juergen Gross wrote:
>>> On 14/04/16 08:31, Razvan Cojocaru wrote:
>>>> On 04/14/16 09:09, Juergen Gross wrote:
>>>>> On 14/04/16 07:56, Razvan Cojocaru wrote:
>>>>>> This indeed doesn't guard against LOCKed instructions being run in
>>>>>> parallel with and without emulation, however that is a case that should
>>>>>> almost never occur - at least not with introspection, where currently
>>>>>> all emulation happens as a result of EPT faults - so either all
>>>>>> instructions hitting a restricted page are emulated, or all are run
>>>>>> directly. As long as all emulation can safely run in parallel and all
>>>>>> parallel non-emulation is also safe, it should be alright. But, yes,
>>>>>> this patch doesn't cover the case you're mentioning.
>>>>> What about grant pages? There could be parallel accesses from different
>>>>> domains, one being introspected, the other not.
>>>> I'm not familiar with the code there, but the main issue is, I think,
>>>> LOCKed instructions that access (read / write) the same memory area - as
>>>> long as that doesn't happen, it should be fine, which may be the reason
>>>> why it hasn't caused problems so far.
>>> Depends on the guest, I suppose. :-)
>>>
>>> I've been bitten by this before in my former position: we had a custom
>>> pv-driver in dom0 which wasn't using LOCKed instructions accessing a
>>> grant page. Reason was dom0 had one vcpu only and the Linux kernel
>>> patched all LOCKs away as it didn't deem them necessary. This
>>> resulted in a very hard to debug communication failure between domU
>>> and dom0.
>>>
>>>> While not perfect, I believe that the added safety is worth the small
>>>> performance impact for writes. I feel that going from unsafe parallel
>>>> emulation to safe parallel emulation is a good step to take, at least
>>>> until the problem can be fixed completely by more complex measures.
>>> I'm fine with you saying for your use case the solution is good enough.
>>>
>>> Just wanted to point out a possible problem. This might not happen
>>> for most guest types, but you can't be sure. :-)
>>
>> But accesses into a mapped grant don't trap for emulation.  Why would
>> locks here be any different to usual?
> 
> With memory introspection switched on they will trap, won't they?

Only write or execute instructions referencing a handful of an HVM
guest's pages trap for emulation with our introspection application,
otherwise performance would be terrible - we don't trap all instructions
for emulation.


Thanks,
Razvan
Razvan Cojocaru April 14, 2016, 8:51 a.m. UTC | #11
On 04/14/2016 07:35 AM, Jan Beulich wrote:
>> --- a/xen/arch/x86/hvm/emulate.c
>> +++ b/xen/arch/x86/hvm/emulate.c
>> @@ -25,6 +25,8 @@
>>  #include <asm/hvm/svm/svm.h>
>>  #include <asm/vm_event.h>
>> 
>> +DEFINE_PERCPU_RWLOCK_GLOBAL(emulate_locked_rwlock);
> You should try hard to make this static.

On second thought, this would make the code somewhat more convoluted, as
the functions in emulate.c need to access this variable, and so does the
code in domain.c that initializes the lock when the domain is created.

What I've done is similar to what I found in the current source code with:

arch/x86/mm/p2m.c:DEFINE_PERCPU_RWLOCK_GLOBAL(p2m_percpu_rwlock);
common/grant_table.c:DEFINE_PERCPU_RWLOCK_GLOBAL(grant_rwlock);

But I could add a function to emulate.h, for example:

void init_emulate_smp_lock();

and call that from arch/x86/domain.c (as suggested by Andrew Cooper),
which would allow domain.c to stop referencing emulate_locked_rwlock
directly.
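
A minimal sketch of that arrangement follows; the struct domain parameter
(which Razvan's bare declaration above omits) and the assumption that
DEFINE_PERCPU_RWLOCK_GLOBAL composes with static are both guesses, not
settled interface:

    /* Sketch: keeps the lock variable private to the file defining it. */
    static DEFINE_PERCPU_RWLOCK_GLOBAL(emulate_locked_rwlock);

    void init_emulate_smp_lock(struct domain *d)
    {
        percpu_rwlock_resource_init(&d->arch.emulate_lock,
                                    emulate_locked_rwlock);
    }

    /* ... called from arch_domain_create() in arch/x86/domain.c:
     *     init_emulate_smp_lock(d);
     */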

Would you prefer this?


Thanks,
Razvan
Razvan Cojocaru April 14, 2016, 9:08 a.m. UTC | #12
On 04/14/2016 08:56 AM, Razvan Cojocaru wrote:
>>> --- a/xen/arch/x86/mm.c
>>> +++ b/xen/arch/x86/mm.c
>>> @@ -112,6 +112,7 @@
>>>  #include <asm/ldt.h>
>>>  #include <asm/x86_emulate.h>
>>>  #include <asm/e820.h>
>>> +#include <asm/hvm/emulate.h>
>> 
>> > This header shouldn't be included here. You need to move the declarations
>> > elsewhere for them to be usable here.
> Understood. Is there any place particularly fitting you could suggest?

Actually, I've removed that #include and the code continues to compile,
so it would appear that it is somehow #included indirectly, which makes
the declarations accessible implicitly.


Thanks,
Razvan
David Vrabel April 14, 2016, 10:35 a.m. UTC | #13
On 13/04/16 13:26, Razvan Cojocaru wrote:
> LOCK-prefixed instructions are currently allowed to run in parallel
> in x86_emulate(), which can lead the guest into an undefined state.
> This patch fixes the issue.

Is this sufficient?  What if another VCPU is running on another PCPU and
doing an unemulated LOCK-prefixed instruction to the same memory address?

This other VCPU could be for another domain (or Xen for that matter).

David
Razvan Cojocaru April 14, 2016, 11:43 a.m. UTC | #14
On 04/14/2016 01:35 PM, David Vrabel wrote:
> On 13/04/16 13:26, Razvan Cojocaru wrote:
>> LOCK-prefixed instructions are currently allowed to run in parallel
>> in x86_emulate(), which can lead the guest into an undefined state.
>> This patch fixes the issue.
> 
> Is this sufficient?  What if another VCPU is running on another PCPU and
> doing an unemulated LOCK-prefixed instruction to the same memory address?
> 
> This other VCPU could be for another domain (or Xen for that matter).

The patch is only sufficient for parallel runs of emulated instructions,
as previously stated. It is, however, able to prevent nasty guest lockups.

This is what happened in a previous thread where I was hunting down the
issue and initially thought that the xen-access.c model was broken when
used with emulation, and even proceeded to check that the ring buffer
memory accesses are synchronized properly. They were alright, the
problem was in fact LOCKed instruction emulation happening in parallel,
i.e. a race condition there.

This is less obvious if we signal that vm_event responses are available
immediately after processing each one (which greatly reduces the chances
of a race happening), and more obvious with guests that have 2 (or more)
VCPUs where all of them are paused waiting for a vm_event reply, and all
of them are woken up at the same time, after processing all of the
events, and asked to emulate.

I do believe that emulating in this manner could occur somewhere else in Xen,
so I hope to make emulation generally safer.

As for not fixing the _whole_ issue, as Jan has rightly pointed out,
that's a rather difficult thing to do.

I will add a comment in the V2 of the patch to clearly state the
limitations of the patch, as well as more information about how the
patch proposes to fix the issue described (as requested by Jan Beulich).


Thanks,
Razvan
Jan Beulich April 14, 2016, 3:31 p.m. UTC | #15
>>> Razvan Cojocaru <rcojocaru@bitdefender.com> 04/14/16 10:50 AM >>>
>On 04/14/2016 07:35 AM, Jan Beulich wrote:
>>> --- a/xen/arch/x86/hvm/emulate.c
>>> +++ b/xen/arch/x86/hvm/emulate.c
>>> @@ -25,6 +25,8 @@
>>>  #include <asm/hvm/svm/svm.h>
>>>  #include <asm/vm_event.h>
>>> 
>>> +DEFINE_PERCPU_RWLOCK_GLOBAL(emulate_locked_rwlock);
>> You should try hard to make this static.
>
>On second though, this would make the code somewhat more convoluted, as
>the functions in emulate.c need to access this variable, and so does the
>code in domain.c that initializes the lock when the domain is created.
>
>What I've done is similar to what I found in the current source code with:
>
>arch/x86/mm/p2m.c:DEFINE_PERCPU_RWLOCK_GLOBAL(p2m_percpu_rwlock);
>common/grant_table.c:DEFINE_PERCPU_RWLOCK_GLOBAL(grant_rwlock);

Well, that's why I said "you should try hard" instead of "you have to".

>But I could add a function to emulate.h, for example:
>
>void init_emulate_smp_lock();

Exactly (just that this is perhaps again the wrong header, considering you
also need this for PV emulation).

>and call that from arch/x86/domain.c (as suggested by Andrew Cooper),
>which would allow domain.c to stop referencing emulate_locked_rwlock
>directly.

Right, and to be honest I can't really see how my earlier comment could
have been taken for "add an ARM stub".

Jan
Jan Beulich April 14, 2016, 3:33 p.m. UTC | #16
>>> Razvan Cojocaru <rcojocaru@bitdefender.com> 04/14/16 11:08 AM >>>
>On 04/14/2016 08:56 AM, Razvan Cojocaru wrote:
>>>> --- a/xen/arch/x86/mm.c
>>>> +++ b/xen/arch/x86/mm.c
>>>> @@ -112,6 +112,7 @@
>>>>  #include <asm/ldt.h>
>>>>  #include <asm/x86_emulate.h>
>>>>  #include <asm/e820.h>
>>>> +#include <asm/hvm/emulate.h>
>>> 
>>> > This header shouldn't be included here. You need to move the declarations
>>> > elsewhere for them to be usable here.
>> Understood. Is there any place particularly fitting you could suggest?
>
>Actually, I've removed that #include and the code continues to compile,
>so it would appear that it is somehow #included indirectly, which makes
>the declarations accessible implicitly.

But - see my other reply just sent - it is _still_ the wrong header then. Stuff
shared between PV and HVM doesn't belong in HVM specific files. There are
a few counterexamples, but we shouldn't add more.

Jan
Jan Beulich April 14, 2016, 3:40 p.m. UTC | #17
>>> Razvan Cojocaru <rcojocaru@bitdefender.com> 04/14/16 1:43 PM >>>
>On 04/14/2016 01:35 PM, David Vrabel wrote:
>> On 13/04/16 13:26, Razvan Cojocaru wrote:
>>> LOCK-prefixed instructions are currently allowed to run in parallel
>>> in x86_emulate(), which can lead the guest into an undefined state.
>>> This patch fixes the issue.
>> 
>> Is this sufficient?  What if another VCPU is running on another PCPU and
>> doing an unemulated LOCK-prefixed instruction to the same memory address?
>> 
>> This other VCPU could be for another domain (or Xen for that matter).
>
>The patch is only sufficient for parallel runs of emulated instructions,
>as previously stated. It is, however, able to prevent nasty guest lockups.
>
>This is what happened in a previous thread where I was hunting down the
>issue and initially thought that the xen-access.c model was broken when
>used with emulation, and even proceeded to check that the ring buffer
>memory accesses are synchronized properly. They were alright, the
>problem was in fact LOCKed instruction emulation happening in parallel,
>i.e. a race condition there.
>
>This is less obvious if we signal that vm_event responses are available
>immediately after processing each one (which greatly reduces the chances
>of a race happening), and more obvious with guests that have 2 (or more)
>VCPUs where all of them are paused waiting for a vm_event reply, and all
>of them are woken up at the same time, after processing all of the
>events, and asked to emulate.
>
>I do believe that emulating in this manner could occur somewhere else in Xen,
>so I hope to make emulation generally safer.
>
>As for not fixing the _whole_ issue, as Jan has rightly pointed out,
>that's a rather difficult thing to do.

To be honest, just having remembered that we do the write back for locked
instructions using CMPXCHG, I'd first of all like to see a proper description
of "the _whole_ issue".

Jan
Razvan Cojocaru April 14, 2016, 3:40 p.m. UTC | #18
On 04/14/2016 06:31 PM, Jan Beulich wrote:
>>>> Razvan Cojocaru <rcojocaru@bitdefender.com> 04/14/16 10:50 AM >>>
>> On 04/14/2016 07:35 AM, Jan Beulich wrote:
>>>> --- a/xen/arch/x86/hvm/emulate.c
>>>>> +++ b/xen/arch/x86/hvm/emulate.c
>>>>> @@ -25,6 +25,8 @@
>>>  >#include <asm/hvm/svm/svm.h>
>>>  >#include <asm/vm_event.h>
>>>  >
>>>>> +DEFINE_PERCPU_RWLOCK_GLOBAL(emulate_locked_rwlock);
>>> You should try hard to make this static.
>>
>> On second thought, this would make the code somewhat more convoluted, as
>> the functions in emulate.c need to access this variable, and so does the
>> code in domain.c that initializes the lock when the domain is created.
>>
>> What I've done is similar to what I found in the current source code with:
>>
>> arch/x86/mm/p2m.c:DEFINE_PERCPU_RWLOCK_GLOBAL(p2m_percpu_rwlock);
>> common/grant_table.c:DEFINE_PERCPU_RWLOCK_GLOBAL(grant_rwlock);
> 
> Well, that's why I said "you should try hard" instead of "you have to".
> 
>> But I could add a function to emulate.h, for example:
>>
>> void init_emulate_smp_lock();
> 
> Exactly (just that this is perhaps again the wrong header, considering you
> also need this for PV emulation).

I understand, I'll look for a more suitable place for the function
declarations and definitions then (unless a specific place is requested).


Thanks,
Razvan
Jan Beulich April 14, 2016, 3:44 p.m. UTC | #19
>>>> Razvan Cojocaru <rcojocaru@bitdefender.com> 04/14/16 7:57 AM >>>
>On 04/14/16 07:35, Jan Beulich wrote:
>>>>> Razvan Cojocaru <rcojocaru@bitdefender.com> 04/13/16 7:53 PM >>>
>>> @@ -1589,6 +1591,8 @@ x86_emulate(
>>>     }
>>>  done_prefixes:
>>> 
>>> +    ops->smp_lock(lock_prefix);
>>> +
>>>     if ( rex_prefix & REX_W )
>>>         op_bytes = 8;
>> 
>> Already from the context it is obvious that the lock can be taken later.
>
>I'll take it later.

And that alone won't suffice: Instructions not touching memory shouldn't take
any lock at all. Nor should instructions that only read or only write memory.

>> Overall I can see why you want this, but what is the performance impact? After
>> all you're basically making the emulator act like very old systems using a bus
>> lock (i.e. a global one) instead of utilizing cacheline exclusive ownership. Plus
>> you still don't (and cannot, afaict) deal with one LOCKed instruction getting
>> emulated and another in parallel getting executed by the CPU without emulation
>> (granted this shouldn't normally happen, but we also can't exclude that case).
>
>I was under the impression that for reads, with the new percpu_rwlocks
>the performance impact is close to zero, with some performance impact
>for writes - but writes should, for one, be more infrequent, and then
>the added safety factor should make up for that.

That's the performance effect on the hypervisor you talk about. But what about
the guest, which all of a sudden gets another domain-wide lock applied?

Jan
Razvan Cojocaru April 14, 2016, 3:45 p.m. UTC | #20
On 04/14/2016 06:40 PM, Jan Beulich wrote:
>>>> Razvan Cojocaru <rcojocaru@bitdefender.com> 04/14/16 1:43 PM >>>
>> On 04/14/2016 01:35 PM, David Vrabel wrote:
>>> On 13/04/16 13:26, Razvan Cojocaru wrote:
>>>> LOCK-prefixed instructions are currently allowed to run in parallel
>>>> in x86_emulate(), which can lead the guest into an undefined state.
>>>> This patch fixes the issue.
>>>
>>> Is this sufficient?  What if another VCPU is running on another PCPU and
>>> doing an unemulated LOCK-prefixed instruction to the same memory address?
>>>
>>> This other VCPU could be for another domain (or Xen for that matter).
>>
>> The patch is only sufficient for parallel runs of emulated instructions,
>> as previously stated. It is, however, able to prevent nasty guest lockups.
>>
>> This is what happened in a previous thread where I was hunting down the
>> issue and initially thought that the xen-access.c model was broken when
>> used with emulation, and even proceeded to check that the ring buffer
>> memory accesses are synchronized properly. They were alright, the
>> problem was in fact LOCKed instruction emulation happening in parallel,
>> i.e. a race condition there.
>>
>> This is less obvious if we signal that vm_event responses are available
>> immediately after processing each one (which greatly reduces the chances
>> of a race happening), and more obvious with guests that have 2 (or more)
>> VCPUs where all of them are paused waiting for a vm_event reply, and all
>> of them are woken up at the same time, after processing all of the
>> events, and asked to emulate.
>>
>> I do believe that emulating in this manner could occur somewhere else in Xen,
>> so I hope to make emulation generally safer.
>>
>> As for not fixing the _whole_ issue, as Jan has rightly pointed out,
>> that's a rather difficult thing to do.
> 
> To be honest, just having remembered that we do the write back for locked
> instructions using CMPXCHG, I'd first of all like to see a proper description
> of "the _whole_ issue".

I believe at least part of the issue has to do with the comment on line
1013 from xen/arch/x86/hvm/emulate.c:

 994 static int hvmemul_cmpxchg(
 995     enum x86_segment seg,
 996     unsigned long offset,
 997     void *p_old,
 998     void *p_new,
 999     unsigned int bytes,
1000     struct x86_emulate_ctxt *ctxt)
1001 {
1002     struct hvm_emulate_ctxt *hvmemul_ctxt =
1003         container_of(ctxt, struct hvm_emulate_ctxt, ctxt);
1004
1005     if ( unlikely(hvmemul_ctxt->set_context) )
1006     {
1007         int rc = set_context_data(p_new, bytes);
1008
1009         if ( rc != X86EMUL_OKAY )
1010             return rc;
1011     }
1012
1013     /* Fix this in case the guest is really relying on r-m-w atomicity. */
1014     return hvmemul_write(seg, offset, p_new, bytes, ctxt);
1015 }


Thanks,
Razvan
Andrew Cooper April 14, 2016, 3:45 p.m. UTC | #21
On 14/04/16 16:40, Jan Beulich wrote:
>>>> Razvan Cojocaru <rcojocaru@bitdefender.com> 04/14/16 1:43 PM >>>
>> On 04/14/2016 01:35 PM, David Vrabel wrote:
>>> On 13/04/16 13:26, Razvan Cojocaru wrote:
>>>> LOCK-prefixed instructions are currently allowed to run in parallel
>>>> in x86_emulate(), which can lead the guest into an undefined state.
>>>> This patch fixes the issue.
>>> Is this sufficient?  What if another VCPU is running on another PCPU and
>>> doing an unemulated LOCK-prefixed instruction to the same memory address?
>>>
>>> This other VCPU could be for another domain (or Xen for that matter).
>> The patch is only sufficient for parallel runs of emulated instructions,
>> as previously stated. It is, however, able to prevent nasty guest lockups.
>>
>> This is what happened in a previous thread where I was hunting down the
>> issue and initially thought that the xen-access.c model was broken when
>> used with emulation, and even proceeded to check that the ring buffer
>> memory accesses are synchronized properly. They were alright, the
>> problem was in fact LOCKed instruction emulation happening in parallel,
>> i.e. a race condition there.
>>
>> This is less obvious if we signal that vm_event responses are available
>> immediately after processing each one (which greatly reduces the chances
>> of a race happening), and more obvious with guests that have 2 (or more)
>> VCPUs where all of them are paused waiting for a vm_event reply, and all
>> of them are woken up at the same time, after processing all of the
>> events, and asked to emulate.
>>
>> I do believe that emulating in this manner could occur somewhere else in Xen,
>> so I hope to make emulation generally safer.
>>
>> As for not fixing the _whole_ issue, as Jan has rightly pointed out,
>> that's a rather difficult thing to do.
> To be honest, just having remembered that we do the write back for locked
> instructions using CMPXCHG, I'd first of all like to see a proper description
> of "the _whole_ issue".

All emulated instructions with a lock prefix end up calling into
hvmemul_cmpxchg().

I suspect the issue is to do with the implementation of
hvmemul_cmpxchg(), which contains a TODO from 2008 of

/* Fix this in case the guest is really relying on r-m-w atomicity. */

which, amongst other things, never updates *p_old.


Short of having the instruction emulator convert any locked instruction
into a stub, I can't think of a solution which works for both emulated
and non-emulated instructions.
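
Very roughly, such a stub approach might look like the following; this is
purely conceptual, and every helper name here is hypothetical rather than a
real Xen API:

    /* Conceptual sketch only: all helper names are made up. */
    uint8_t *stub = alloc_exec_stub();
    memcpy(stub, insn_bytes, insn_len);    /* the guest's LOCKed insn */
    /* Rewrite the memory operand to target a hypervisor mapping of the
     * guest frame, then let the CPU's own LOCK# handling provide
     * atomicity against emulated and non-emulated accesses alike. */
    rewrite_memory_operand(stub, map_guest_frame(gpa));
    execute_stub(stub);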

~Andrew
Razvan Cojocaru April 14, 2016, 4 p.m. UTC | #22
On 04/14/2016 06:44 PM, Jan Beulich wrote:
>>>>> Razvan Cojocaru <rcojocaru@bitdefender.com> 04/14/16 7:57 AM >>>
>> On 04/14/16 07:35, Jan Beulich wrote:
>>>>>> Razvan Cojocaru <rcojocaru@bitdefender.com> 04/13/16 7:53 PM >>>
>>>> @@ -1589,6 +1591,8 @@ x86_emulate(
>>>>     }
>>>>  done_prefixes:
>>>> 
>>>> +    ops->smp_lock(lock_prefix);
>>>> +
>>>>     if ( rex_prefix & REX_W )
>>>>         op_bytes = 8;
>>> 
>>> Already from the context it is obvious that the lock can be taken later.
>>
>> I'll take it later.
> 
> And that alone won't suffice: Instructions not touching memory shouldn't take
>> any lock at all. Nor should instructions that only read or only write memory.
> 
>>> Overall I can see why you want this, but what is the performance impact? After
>>> all you're basically making the emulator act like very old systems using a bus
>>> lock (i.e. a global one) instead of utilizing cacheline exclusive ownership. Plus
>>> you still don't (and cannot, afaict) deal with one LOCKed instruction getting
>>> emulated and another in parallel getting executed by the CPU without emulation
>>> (granted this shouldn't normally happen, but we also can't exclude that case).
>>
>> I was under the impression that for reads, with the new percpu_rwlocks
>> the performance impact is close to zero, with some performance impact
>> for writes - but writes should, for one, be more infrequent, and then
>> the added safety factor should make up for that.
> 
> That's the performance effect on the hypervisor you talk about. But what about
> the guest, which all of a sudden gets another domain-wide lock applied?

Well, yes, there's bound to be some performance loss - but I think the
question is: is the added safety worth it? As always, there are
trade-offs. I am quite possibly missing something, but surely a slightly
slower running guest is better than a fast but unstable one.

As for the patch, it's definitely perfectible, and I appreciate all
suggestions to make it the best it can be (or even to scrap it for a
better solution).


Thanks,
Razvan
Jan Beulich April 14, 2016, 4:08 p.m. UTC | #23
>>> Razvan Cojocaru <rcojocaru@bitdefender.com> 04/14/16 5:45 PM >>>
>On 04/14/2016 06:40 PM, Jan Beulich wrote:
>> To be honest, just having remembered that we do the write back for locked
>> instructions using CMPXCHG, I'd first of all like to see a proper description
>> of "the _whole_ issue".
>
>I believe at least part of the issue has to do with the comment on line
>1013 from xen/arch/x86/hvm/emulate.c:
>
> 994 static int hvmemul_cmpxchg(
> 995     enum x86_segment seg,
> 996     unsigned long offset,
> 997     void *p_old,
> 998     void *p_new,
> 999     unsigned int bytes,
>1000     struct x86_emulate_ctxt *ctxt)
>1001 {
>1002     struct hvm_emulate_ctxt *hvmemul_ctxt =
>1003         container_of(ctxt, struct hvm_emulate_ctxt, ctxt);
>1004
>1005     if ( unlikely(hvmemul_ctxt->set_context) )
>1006     {
>1007         int rc = set_context_data(p_new, bytes);
>1008
>1009         if ( rc != X86EMUL_OKAY )
>1010             return rc;
>1011     }
>1012
>1013     /* Fix this in case the guest is really relying on r-m-w atomicity. */
>1014     return hvmemul_write(seg, offset, p_new, bytes, ctxt);
>1015 }

Ah, so _that's_ where the problem wants to be fixed then (leaving - afaict -
PV emulation paths completely unaffected).
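
(For illustration, a fix in that spot might take roughly this shape: map the
guest frame and do a real CMPXCHG, updating *p_old on failure. This is a
sketch; the 4-byte-only handling and the use of X86EMUL_RETRY as the failure
indication are assumptions.)

    /* Sketch only: a genuinely atomic writeback for a 4-byte operand. */
    static int hvmemul_cmpxchg_sketch(paddr_t gpa, void *p_old, void *p_new)
    {
        void *map = map_domain_page(_mfn(paddr_to_pfn(gpa)));
        uint32_t *addr = map + (gpa & ~PAGE_MASK);
        uint32_t old = *(uint32_t *)p_old;
        uint32_t cur = cmpxchg(addr, old, *(uint32_t *)p_new);

        unmap_domain_page(map);

        if ( cur != old )
        {
            *(uint32_t *)p_old = cur;  /* report the value actually seen */
            return X86EMUL_RETRY;      /* failure indication: an assumption */
        }

        return X86EMUL_OKAY;
    }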

Jan
Jan Beulich April 14, 2016, 4:09 p.m. UTC | #24
>>> Andrew Cooper <andrew.cooper3@citrix.com> 04/14/16 5:46 PM >>>
>Short of having the instruction emulator convert any locked instruction
>into a stub, I can't think of a solution which works for both emulated
>and non-emulated instructions.

That's my understanding too.

Jan
Jan Beulich April 14, 2016, 4:11 p.m. UTC | #25
>>> Razvan Cojocaru <rcojocaru@bitdefender.com> 04/14/16 6:00 PM >>>
>On 04/14/2016 06:44 PM, Jan Beulich wrote:
>> That's the performance effect on the hypervisor you talk about. But what about
>> the guest, which all of a sudden gets another domain-wide lock applied?
>
>Well, yes, there's bound to be some performance loss - but I think the
>question is: is the added safety worth it? As always, there are
>trade-offs. I am quite possibly missing something, but surely a slightly
>slower, running guest is better than a fast unstable one.
>
>As for the patch, it's definitely perfectible, and I appreciate all
>suggestions to make it the best it can be (or even to scrap it for a
>better solution).

That's the whole thing here: If what you have presented comes at a
reasonable price, we may well take it with obvious issues fixed. Yet if the
price is quite high, investing effort into finding a better solution is imo not
just warranted, but necessary.

Jan
Razvan Cojocaru April 18, 2016, 12:14 p.m. UTC | #26
On 04/14/2016 07:08 PM, Jan Beulich wrote:
>>>> Razvan Cojocaru <rcojocaru@bitdefender.com> 04/14/16 5:45 PM >>>
>> On 04/14/2016 06:40 PM, Jan Beulich wrote:
>>> To be honest, just having remembered that we do the write back for locked
>>> instructions using CMPXCHG, I'd first of all like to see a proper description
>>> of "the _whole_ issue".
>>
>> I believe at least part of the issue has to do with the comment on line
>> 1013 from xen/arch/x86/hvm/emulate.c:
>>
>> 994 static int hvmemul_cmpxchg(
>> 995     enum x86_segment seg,
>> 996     unsigned long offset,
>> 997     void *p_old,
>> 998     void *p_new,
>> 999     unsigned int bytes,
>> 1000     struct x86_emulate_ctxt *ctxt)
>> 1001 {
>> 1002     struct hvm_emulate_ctxt *hvmemul_ctxt =
>> 1003         container_of(ctxt, struct hvm_emulate_ctxt, ctxt);
>> 1004
>> 1005     if ( unlikely(hvmemul_ctxt->set_context) )
>> 1006     {
>> 1007         int rc = set_context_data(p_new, bytes);
>> 1008
>> 1009         if ( rc != X86EMUL_OKAY )
>> 1010             return rc;
>> 1011     }
>> 1012
>> 1013     /* Fix this in case the guest is really relying on r-m-w atomicity. */
>> 1014     return hvmemul_write(seg, offset, p_new, bytes, ctxt);
>> 1015 }
> 
> Ah, so _that's_ where the problem wants to be fixed then (leaving - afaict -
> PV emulation paths completely unaffected).

That's what I had hoped too, an early version of this patch simply used
a spinlock that locked / unlocked on entry / exit of hvmemul_cmpxchg().
Even with this patch, I've just tried it again with all ops->smp_lock()
/ ops->smp_unlock() calls commented out in x86_emulate(), and
hvmemul_cmpxchg() modified as follows:

 994 static int hvmemul_cmpxchg(
 995     enum x86_segment seg,
 996     unsigned long offset,
 997     void *p_old,
 998     void *p_new,
 999     unsigned int bytes,
1000     struct x86_emulate_ctxt *ctxt)
1001 {
1002     struct hvm_emulate_ctxt *hvmemul_ctxt =
1003         container_of(ctxt, struct hvm_emulate_ctxt, ctxt);
1004     int rc;
1005
1006     emulate_smp_lock(1);
1007
1008     if ( unlikely(hvmemul_ctxt->set_context) )
1009     {
1010         int rc = set_context_data(p_new, bytes);
1011
1012         if ( rc != X86EMUL_OKAY )
1013             return rc;
1014     }
1015
1016     /* Fix this in case the guest is really relying on r-m-w atomicity. */
1017     rc = hvmemul_write(seg, offset, p_new, bytes, ctxt);
1018
1019     emulate_smp_unlock(1);
1020
1021     return rc;
1022 }

Unfortunately, with just this the guest still hangs, while with read and
write locking in x86_emulate() it does not.


Thanks,
Razvan
Jan Beulich April 18, 2016, 4:45 p.m. UTC | #27
>>> Razvan Cojocaru <rcojocaru@bitdefender.com> 04/18/16 2:40 PM >>>
>On 04/14/2016 07:08 PM, Jan Beulich wrote:
>>>>> Razvan Cojocaru <rcojocaru@bitdefender.com> 04/14/16 5:45 PM >>>
>>> On 04/14/2016 06:40 PM, Jan Beulich wrote:
>>>> To be honest, just having remembered that we do the write back for locked
>>>> instructions using CMPXCHG, I'd first of all like to see a proper description
>>>> of "the _whole_ issue".
>>>
>>> I believe at least part of the issue has to do with the comment on line
>>> 1013 from xen/arch/x86/hvm/emulate.c:
>>>
>>> 994 static int hvmemul_cmpxchg(
>>> 995     enum x86_segment seg,
>>> 996     unsigned long offset,
>>> 997     void *p_old,
>>> 998     void *p_new,
>>> 999     unsigned int bytes,
>>> 1000     struct x86_emulate_ctxt *ctxt)
>>> 1001 {
>>> 1002     struct hvm_emulate_ctxt *hvmemul_ctxt =
>>> 1003         container_of(ctxt, struct hvm_emulate_ctxt, ctxt);
>>> 1004
>>> 1005     if ( unlikely(hvmemul_ctxt->set_context) )
>>> 1006     {
>>> 1007         int rc = set_context_data(p_new, bytes);
>>> 1008
>>> 1009         if ( rc != X86EMUL_OKAY )
>>> 1010             return rc;
>>> 1011     }
>>> 1012
>>> 1013     /* Fix this in case the guest is really relying on r-m-w atomicity. */
>>> 1014     return hvmemul_write(seg, offset, p_new, bytes, ctxt);
>>> 1015 }
>> 
>> Ah, so _that's_ where the problem wants to be fixed then (leaving - afaict -
>> PV emulation paths completely unaffected).
>
>That's what I had hoped too, an early version of this patch simply used
>a spinlock that locked / unlocked on entry / exit of hvmemul_cmpxchg().
>Even with this patch, I've just tried it again with all ops->smp_lock()
>/ ops->smp_unlock() calls commented out in x86_emulate(), and
>hvmemul_cmpxchg() modified as follows:
>
> 994 static int hvmemul_cmpxchg(
> 995     enum x86_segment seg,
> 996     unsigned long offset,
> 997     void *p_old,
> 998     void *p_new,
> 999     unsigned int bytes,
>1000     struct x86_emulate_ctxt *ctxt)
>1001 {
>1002     struct hvm_emulate_ctxt *hvmemul_ctxt =
>1003         container_of(ctxt, struct hvm_emulate_ctxt, ctxt);
>1004     int rc;
>1005
>1006     emulate_smp_lock(1);
>1007
>1008     if ( unlikely(hvmemul_ctxt->set_context) )
>1009     {
>1010         int rc = set_context_data(p_new, bytes);
>1011
>1012         if ( rc != X86EMUL_OKAY )
>1013             return rc;
>1014     }
>1015
>1016     /* Fix this in case the guest is really relying on r-m-w atomicity. */
>1017     rc = hvmemul_write(seg, offset, p_new, bytes, ctxt);
>1018
>1019     emulate_smp_unlock(1);
>1020
>1021     return rc;
>1022 }
>
>Unfortunately, with just this the guest still hangs, while with read and
>write locking in x86_emulate() it does not.

That's unexpected at least at first glance, but justifying some variant of
the patch you have submitted would require understanding why the above
change isn't enough and can't be suitably extended to be sufficient.

Jan
Razvan Cojocaru April 19, 2016, 11:01 a.m. UTC | #28
On 04/18/2016 07:45 PM, Jan Beulich wrote:
>>>> Razvan Cojocaru <rcojocaru@bitdefender.com> 04/18/16 2:40 PM >>>
>> On 04/14/2016 07:08 PM, Jan Beulich wrote:
>>>>>> Razvan Cojocaru <rcojocaru@bitdefender.com> 04/14/16 5:45 PM >>>
>>>> On 04/14/2016 06:40 PM, Jan Beulich wrote:
>>>>> To be honest, just having remembered that we do the write back for locked
>>>>> instructions using CMPXCHG, I'd first of all like to see a proper description
>>>>> of "the _whole_ issue".
>>>>
>>>> I believe at least part of the issue has to do with the comment on line
>>>> 1013 from xen/arch/x86/hvm/emulate.c:
>>>>
>>>> 994 static int hvmemul_cmpxchg(
>>>> 995     enum x86_segment seg,
>>>> 996     unsigned long offset,
>>>> 997     void *p_old,
>>>> 998     void *p_new,
>>>> 999     unsigned int bytes,
>>>> 1000     struct x86_emulate_ctxt *ctxt)
>>>> 1001 {
>>>> 1002     struct hvm_emulate_ctxt *hvmemul_ctxt =
>>>> 1003         container_of(ctxt, struct hvm_emulate_ctxt, ctxt);
>>>> 1004
>>>> 1005     if ( unlikely(hvmemul_ctxt->set_context) )
>>>> 1006     {
>>>> 1007         int rc = set_context_data(p_new, bytes);
>>>> 1008
>>>> 1009         if ( rc != X86EMUL_OKAY )
>>>> 1010             return rc;
>>>> 1011     }
>>>> 1012
>>>> 1013     /* Fix this in case the guest is really relying on r-m-w atomicity. */
>>>> 1014     return hvmemul_write(seg, offset, p_new, bytes, ctxt);
>>>> 1015 }
>>>
>>> Ah, so _that's_ where the problem wants to be fixed then (leaving - afaict -
>>> PV emulation paths completely unaffected).
>>
>> That's what I had hoped too, an early version of this patch simply used
>> a spinlock that locked / unlocked on entry / exit of hvmemul_cmpxchg().
>> Even with this patch, I've just tried it again with all ops->smp_lock()
>> / ops->smp_unlock() calls commented out in x86_emulate(), and
>> hvmemul_cmpxchg() modified as follows:
>>
>> 994 static int hvmemul_cmpxchg(
>> 995     enum x86_segment seg,
>> 996     unsigned long offset,
>> 997     void *p_old,
>> 998     void *p_new,
>> 999     unsigned int bytes,
>> 1000     struct x86_emulate_ctxt *ctxt)
>> 1001 {
>> 1002     struct hvm_emulate_ctxt *hvmemul_ctxt =
>> 1003         container_of(ctxt, struct hvm_emulate_ctxt, ctxt);
>> 1004     int rc;
>> 1005
>> 1006     emulate_smp_lock(1);
>> 1007
>> 1008     if ( unlikely(hvmemul_ctxt->set_context) )
>> 1009     {
>> 1010         int rc = set_context_data(p_new, bytes);
>> 1011
>> 1012         if ( rc != X86EMUL_OKAY )
>> 1013             return rc;
>> 1014     }
>> 1015
>> 1016     /* Fix this in case the guest is really relying on r-m-w atomicity. */
>> 1017     rc = hvmemul_write(seg, offset, p_new, bytes, ctxt);
>> 1018
>> 1019     emulate_smp_unlock(1);
>> 1020
>> 1021     return rc;
>> 1022 }
>>
>> Unfortunately, with just this the guest still hangs, while with read and
>> write locking in x86_emulate() it does not.
> 
> That's unexpected at least at first glance, but justifying some variant of
> the patch you have submitted would require understanding why the above
> change isn't enough and can't be suitably extended to be sufficient.

I think this might be because the LOCK prefix should guarantee that the
instruction that follows it has exclusive use of shared memory (for both
reads and writes) but I might be misreading the docs:

(From the Intel SDM) "Causes the processor’s LOCK# signal to be asserted
during execution of the accompanying instruction (turns the instruction
into an atomic instruction). In a multiprocessor environment, the LOCK#
signal ensures that the processor has exclusive use of any shared memory
while the signal is asserted."

Using a spin lock (or the rwlock in the manner described above) in
hvmemul_cmpxchg() only prevents two LOCKed instructions from running at
the same time, but presumably even non-LOCKed instructions just reading
that memory area should be prevented from running until the LOCKed
instruction is done (hence the guest boots with the rwlock taken in
x86_emulate(), but not when locking happens only in hvmemul_cmpxchg()).
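
That is, the pairing the patch relies on is roughly the following (a sketch,
using the patch's hook names):

    /* Every emulated access holds at least the read lock, so a LOCKed
     * instruction (write lock) excludes even plain emulated reads: */
    ops->smp_lock(lock_prefix);    /* read lock if !lock_prefix, else write */
    /* ... fetch operands, execute, write back ... */
    ops->smp_unlock(lock_prefix);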

Hopefully I haven't misunderstood the issue.


Thanks,
Razvan
Jan Beulich April 19, 2016, 4:35 p.m. UTC | #29
>>> Razvan Cojocaru <rcojocaru@bitdefender.com> 04/19/16 1:01 PM >>>
>I think this might be because the LOCK prefix should guarantee that the
>instruction that follows it has exclusive use of shared memory (for both
>reads and writes) but I might be misreading the docs:

LOCK definitely has no effect on anything other than the instruction it gets
applied to.

Jan

>(From the Intel SDM) "Causes the processor’s LOCK# signal to be asserted
>during execution of the accompanying instruction (turns the instruction
>into an atomic instruction). In a multiprocessor environment, the LOCK#
>signal ensures that the processor has exclusive use of any shared memory
>while the signal is asserted."
>
>Using a spin lock (or the rwlock in the manner described above) in
>hvmemul_cmpxchg() only prevents two LOCKed instructions from running at
>the same time, but presumably even non-LOCKed instructions just reading
>that memory area should be prevented from running until the LOCKed
>instruction is done (hence the guest boots with the rwlock taken in
>x86_emulate(), but not when locking happens only in hvmemul_cmpxchg()).
>
>Hopefully I haven't misunderstood the issue.
>
>
>Thanks,
>Razvan
George Dunlap April 26, 2016, 4:03 p.m. UTC | #30
On 19/04/16 17:35, Jan Beulich wrote:
>>>> Razvan Cojocaru <rcojocaru@bitdefender.com> 04/19/16 1:01 PM >>>
>> I think this might be because the LOCK prefix should guarantee that the
>> instruction that follows it has exclusive use of shared memory (for both
>> reads and writes) but I might be misreading the docs:
> 
> LOCK definitely has no effect on anything other than the instruction it gets
> applied to.

Sorry I wasn't involved in this discussion -- what was the conclusion here?

FWIW Andy's suggestion of using a stub seemed like the most robust
solution, if that could be made to work.

If you're going to submit a patch substantially similar to this one, let
me know so I can review the mm bits of the original patch.

 -George
Razvan Cojocaru April 26, 2016, 5:23 p.m. UTC | #31
On 04/26/16 19:03, George Dunlap wrote:
> On 19/04/16 17:35, Jan Beulich wrote:
>>>>> Razvan Cojocaru <rcojocaru@bitdefender.com> 04/19/16 1:01 PM >>>
>>> I think this might be because the LOCK prefix should guarantee that the
>>> instruction that follows it has exclusive use of shared memory (for both
>>> reads and writes) but I might be misreading the docs:
>>
>> LOCK definitely has no effect on anything other than the instruction it gets
>> applied to.
> 
> Sorry I wasn't involved in this discussion -- what was the conclusion here?
> 
> FWIW Andy's suggestion of using a stub seemed like the most robust
> solution, if that could be made to work.
> 
> If you're going to submit a patch substantially similar to this one, let
> me know so I can review the mm bits of the original patch.

I'm not really sure.

Regarding this version of the patch, Jan has asked for more information
on the performance impact, but I'm not sure how to obtain it in a
rigorous manner. If it is decided that a version of this patch is
desirable, I can go on fixing the issues we've found and address the
comments we've had so far and submit a new version.

I'm not familiar with what the stub solution would imply, so I'm afraid
I can't comment on that. This is not code I've had that much contact
with prior to stumbling into this problem.


Thanks,
Razvan
Andrew Cooper April 26, 2016, 5:39 p.m. UTC | #32
On 26/04/16 18:23, Razvan Cojocaru wrote:
> On 04/26/16 19:03, George Dunlap wrote:
>> On 19/04/16 17:35, Jan Beulich wrote:
>>>>>> Razvan Cojocaru <rcojocaru@bitdefender.com> 04/19/16 1:01 PM >>>
>>>> I think this might be because the LOCK prefix should guarantee that the
>>>> instruction that follows it has exclusive use of shared memory (for both
>>>> reads and writes) but I might be misreading the docs:
>>> LOCK definitely has no effect on anything other than the instruction it gets
>>> applied to.
>> Sorry I wasn't involved in this discussion -- what was the conclusion here?
>>
>> FWIW Andy's suggestion of using a stub seemed like the most robust
>> solution, if that could be made to work.
>>
>> If you're going to submit a patch substantially similar to this one, let
>> me know so I can review the mm bits of the original patch.
> I'm not really sure.
>
> Regarding this version of the patch, Jan has asked for more information
> on the performance impact, but I'm not sure how to obtain it in a
> rigorous manner. If it is decided that a version of this patch is
> desirable, I can go on fixing the issues we've found and address the
> comments we've had so far and submit a new version.

XenServer did performance testing.  No observable impact for normal VM
workloads (which is to be expected, as an OS wouldn't normally LOCK the
instructions it uses for MMIO).  The per-cpu rwlocks have ~0 overhead
when the lock isn't held for writing.
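
For reference, the reader fast path is roughly the following - a
simplified sketch from memory, not the exact Xen code, with approximate
names.  Readers normally just set a per-CPU marker, and only fall back
to the embedded rwlock while a writer is activating:

    /* Sketch: reader side - ~free unless a writer is active. */
    void percpu_read_lock_sketch(percpu_rwlock_t *l)
    {
        this_cpu(rwlock_ptr) = l;              /* mark this CPU as reading */
        smp_mb();
        if ( unlikely(l->writer_activating) )  /* writer racing with us?   */
        {
            this_cpu(rwlock_ptr) = NULL;       /* undo the fast path...    */
            read_lock(&l->rwlock);             /* ...and take the slow one */
        }
    }

    /* Sketch: writer side - pays the full cost. */
    void percpu_write_lock_sketch(percpu_rwlock_t *l)
    {
        unsigned int cpu;

        write_lock(&l->rwlock);          /* exclude other writers          */
        l->writer_activating = 1;
        smp_mb();
        for_each_present_cpu ( cpu )     /* wait out all fast-path readers */
            while ( per_cpu(rwlock_ptr, cpu) == l )
                cpu_relax();
    }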

>
> I'm not familiar with what the stub solution would imply, so I'm afraid
> I can't comment on that. This is not code I've had that much contact
> with prior to stumbling into this problem.

As for the fix I suggested, it's probably prohibitive to fix the current
emulator, given the plans for a rewrite.  (And on that note, I really
need to write a design doc and post it to the list.)

~Andrew
Jan Beulich April 27, 2016, 6:22 a.m. UTC | #33
>>> On 26.04.16 at 19:23, <rcojocaru@bitdefender.com> wrote:
> On 04/26/16 19:03, George Dunlap wrote:
>> On 19/04/16 17:35, Jan Beulich wrote:
>>>>>> Razvan Cojocaru <rcojocaru@bitdefender.com> 04/19/16 1:01 PM >>>
>>>> I think this might be because the LOCK prefix should guarantee that the
>>>> instruction that follows it has exclusive use of shared memory (for both
>>>> reads and writes) but I might be misreading the docs:
>>>
>>> LOCK definitely has no effect on other than the instruction it gets applied
>>> to.
>> 
>> Sorry I wasn't involved in this discussion -- what was the conclusion here?
>> 
>> FWIW Andy's suggestion of using a stub seemed like the most robust
>> solution, if that could be made to work.
>> 
>> If you're going to submit a patch substantially similar to this one, let
>> me know so I can review the mm bits of the original patch.
> 
> I'm not really sure.
> 
> Regarding this version of the patch, Jan has asked for more information
> on the performance impact, but I'm not sure how to obtain it in a
> rigorous manner.

That was only one half, which Andrew has now answered. The
other was the not understood issue with a later variant you had
tried.

Jan
Jan Beulich April 27, 2016, 6:25 a.m. UTC | #34
>>> On 26.04.16 at 19:39, <andrew.cooper3@citrix.com> wrote:
> On 26/04/16 18:23, Razvan Cojocaru wrote:
>> Regarding this version of the patch, Jan has asked for more information
>> on the performance impact, but I'm not sure how to obtain it in a
>> rigorous manner. If it is decided that a version of this patch is
>> desirable, I can go on fixing the issues we've found and address the
>> comments we've had so far and submit a new version.
> 
> XenServer did performance testing.  No observable impact for normal VM
> workloads (which is to be expected, as an OS wouldn't normally LOCK the
> instructions it uses for MMIO).  The per-cpu rwlocks have ~0 overhead
> when the lock isn't held for writing.

So how about PV guests doing locked page table updates, or the
impact on log-dirty mode?

Jan
Razvan Cojocaru April 27, 2016, 7:14 a.m. UTC | #35
On 04/27/2016 09:22 AM, Jan Beulich wrote:
>>>> On 26.04.16 at 19:23, <rcojocaru@bitdefender.com> wrote:
>> On 04/26/16 19:03, George Dunlap wrote:
>>> On 19/04/16 17:35, Jan Beulich wrote:
>>>>>>> Razvan Cojocaru <rcojocaru@bitdefender.com> 04/19/16 1:01 PM >>>
>>>>> I think this might be because the LOCK prefix should guarantee that the
>>>>> instruction that follows it has exclusive use of shared memory (for both
>>>>> reads and writes) but I might be misreading the docs:
>>>>
>>>> LOCK definitely has no effect on other than the instruction it gets applied
>>>> to.
>>>
>>> Sorry I wasn't involved in this discussion -- what was the conclusion here?
>>>
>>> FWIW Andy's suggestion of using a stub seemed like the most robust
>>> solution, if that could be made to work.
>>>
>>> If you're going to submit a patch substantially similar to this one, let
>>> me know so I can review the mm bits of the original patch.
>>
>> I'm not really sure.
>>
>> Regarding this version of the patch, Jan has asked for more information
>> on the performance impact, but I'm not sure how to obtain it in a
>> rigorous manner.
> 
> That was only one half, which Andrew has now answered. The
> other was the not understood issue with a later variant you had
> tried.

Right, there's the fact that this version (with read / write locking the
whole function) works whereas just synchronizing hvmemul_cmpxchg() with a
spin lock does not. It is not yet fully clear why (the part of the
conversation at the very top of this email was an early attempt to
elucidate it).


Thanks,
Razvan
Andrew Cooper April 27, 2016, 7:36 a.m. UTC | #36
On 27/04/2016 07:25, Jan Beulich wrote:
>>>> On 26.04.16 at 19:39, <andrew.cooper3@citrix.com> wrote:
>> On 26/04/16 18:23, Razvan Cojocaru wrote:
>>> Regarding this version of the patch, Jan has asked for more information
>>> on the performance impact, but I'm not sure how to obtain it in a
>>> rigorous manner. If it is decided that a version of this patch is
>>> desirable, I can go on fixing the issues we've found and address the
>>> comments we've had so far and submit a new version.
>> XenServer did performance testing.  No observable impact for normal VM
>> workloads (which is to be expected, as an OS wouldn't normally LOCK the
>> instructions it uses for MMIO).  The per-cpu rwlocks have ~0 overhead
>> when the lock isn't held for writing.
> So how about PV guests doing locked page table updates, or the
> impact on log-dirty mode?

As I said - no observable change.

Emulated pagetable writes are already a slowpath for PV guests, so they
are avoided (hypercalls are several times faster).  Logdirty doesn't
directly cause any instructions to be emulated.  For hap guests, the
EPT/NPT violation is resolved and the guest re-entered.  For PV guests
and shadow hvm, we do get emulated pagetable writes, but how many of
those are actually locked?  The majority, in Xen at least, are just
explicit-width mov's.

~Andrew
Razvan Cojocaru May 3, 2016, 2:20 p.m. UTC | #37
On 04/27/2016 10:14 AM, Razvan Cojocaru wrote:
> On 04/27/2016 09:22 AM, Jan Beulich wrote:
>>>>> On 26.04.16 at 19:23, <rcojocaru@bitdefender.com> wrote:
>>> On 04/26/16 19:03, George Dunlap wrote:
>>>> On 19/04/16 17:35, Jan Beulich wrote:
>>>>>>>> Razvan Cojocaru <rcojocaru@bitdefender.com> 04/19/16 1:01 PM >>>
>>>>>> I think this might be because the LOCK prefix should guarantee that the
>>>>>> instruction that follows it has exclusive use of shared memory (for both
>>>>>> reads and writes) but I might be misreading the docs:
>>>>>
>>>>> LOCK definitely has no effect on other than the instruction it gets applied
>>>>> to.
>>>>
>>>> Sorry I wasn't involved in this discussion -- what was the conclusion here?
>>>>
>>>> FWIW Andy's suggestion of using a stub seemed like the most robust
>>>> solution, if that could be made to work.
>>>>
>>>> If you're going to submit a patch substantially similar to this one, let
>>>> me know so I can review the mm bits of the original patch.
>>>
>>> I'm not really sure.
>>>
>>> Regarding this version of the patch, Jan has asked for more information
>>> on the performance impact, but I'm not sure how to obtain it in a
>>> rigorous manner.
>>
>> That was only one half, which Andrew has now answered. The
>> other was the not understood issue with a later variant you had
>> tried.
> 
> Right, there's the fact that this version (with read / write locking the
> whole function) works whereas just synchronizing hvmemul_cmpxchg() with a
> spin lock does not. It is not yet fully clear why (the part of the
> conversation at the very top of this email was an early attempt to
> elucidate it).

I've kept experimenting with the patch but can't quite figure out why
minimizing the lock scope to the writeback part would not be sufficient,
but it isn't.

I.e. with this code:

3824  writeback:
3825     ops->smp_lock(lock_prefix);
3826     switch ( dst.type )
3827     {
3828     case OP_REG:
3829         /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */
3830         switch ( dst.bytes )
3831         {
3832         case 1: *(uint8_t  *)dst.reg = (uint8_t)dst.val; break;
3833         case 2: *(uint16_t *)dst.reg = (uint16_t)dst.val; break;
3834         case 4: *dst.reg = (uint32_t)dst.val; break; /* 64b: zero-ext */
3835         case 8: *dst.reg = dst.val; break;
3836         }
3837         break;
3838     case OP_MEM:
3839         if ( !(d & Mov) && (dst.orig_val == dst.val) &&
3840              !ctxt->force_writeback )
3841             /* nothing to do */;
3842         else if ( lock_prefix )
3843             rc = ops->cmpxchg(
3844                 dst.mem.seg, dst.mem.off, &dst.orig_val,
3845                 &dst.val, dst.bytes, ctxt);
3846         else
3847             rc = ops->write(
3848                 dst.mem.seg, dst.mem.off, &dst.val, dst.bytes, ctxt);
3849         if ( rc != 0 )
3850         {
3851             ops->smp_unlock(lock_prefix);
3852             goto done;
3853         }
3854     default:
3855         break;
3856     }
3857     ops->smp_unlock(lock_prefix);

I can still reproduce the guest hang. But if I lock at the very
beginning of x86_emulate() and unlock before each return, no more hangs.

I would appreciate any suggestions on how to go about trying to narrow
the scope of the lock, or figure out what subtlety is at work here.


Thanks,
Razvan
Jan Beulich May 3, 2016, 2:30 p.m. UTC | #38
>>> On 03.05.16 at 16:20, <rcojocaru@bitdefender.com> wrote:
> I've kept experimenting with the patch but can't quite figure out why
> minimizing the lock scope to the writeback part would not be sufficient,
> but it isn't.
> 
> I.e. with this code:
> 
> 3824  writeback:
> 3825     ops->smp_lock(lock_prefix);
> 3826     switch ( dst.type )
> 3827     {
> 3828     case OP_REG:
> 3829         /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */
> 3830         switch ( dst.bytes )
> 3831         {
> 3832         case 1: *(uint8_t  *)dst.reg = (uint8_t)dst.val; break;
> 3833         case 2: *(uint16_t *)dst.reg = (uint16_t)dst.val; break;
> 3834         case 4: *dst.reg = (uint32_t)dst.val; break; /* 64b: zero-ext */
> 3835         case 8: *dst.reg = dst.val; break;
> 3836         }
> 3837         break;
> 3838     case OP_MEM:
> 3839         if ( !(d & Mov) && (dst.orig_val == dst.val) &&
> 3840              !ctxt->force_writeback )
> 3841             /* nothing to do */;
> 3842         else if ( lock_prefix )
> 3843             rc = ops->cmpxchg(
> 3844                 dst.mem.seg, dst.mem.off, &dst.orig_val,
> 3845                 &dst.val, dst.bytes, ctxt);
> 3846         else
> 3847             rc = ops->write(
> 3848                 dst.mem.seg, dst.mem.off, &dst.val, dst.bytes, ctxt);
> 3849         if ( rc != 0 )
> 3850         {
> 3851             ops->smp_unlock(lock_prefix);
> 3852             goto done;
> 3853         }
> 3854     default:
> 3855         break;
> 3856     }
> 3857     ops->smp_unlock(lock_prefix);
> 
> I can still reproduce the guest hang. But if I lock at the very
> beginning of x86_emulate() and unlock before each return, no more hangs.

Isn't that obvious? Locked instructions are necessarily
read-modify-write ones, and hence the lock needs to be taken
before the read, and dropped after the write. But remember, I'll
continue to show opposition to this getting "fixed" this way (in the
emulator itself), as long as no proper explanation can be given
why making hvmemul_cmpxchg() do what its name says isn't all
we need (and hence why i386-like bus lock behavior is needed).
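
Schematically (pseudo-code), the lock has to cover the full RMW:

    smp_lock();             /* taken before the read...         */
    old = read(mem);
    new = op(old);          /* modify                           */
    write(mem, new);
    smp_unlock();           /* ...dropped only after the write  */

Locking just the writeback still lets another vCPU's read and modify
slip in between this vCPU's read and its write.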

Jan
Razvan Cojocaru May 3, 2016, 2:41 p.m. UTC | #39
On 05/03/2016 05:30 PM, Jan Beulich wrote:
>>>> On 03.05.16 at 16:20, <rcojocaru@bitdefender.com> wrote:
>> I've kept experimenting with the patch but can't quite figure out why
>> minimizing the lock scope to the writeback part would not be sufficient,
>> but it isn't.
>>
>> I.e. with this code:
>>
>> 3824  writeback:
>> 3825     ops->smp_lock(lock_prefix);
>> 3826     switch ( dst.type )
>> 3827     {
>> 3828     case OP_REG:
>> 3829         /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */
>> 3830         switch ( dst.bytes )
>> 3831         {
>> 3832         case 1: *(uint8_t  *)dst.reg = (uint8_t)dst.val; break;
>> 3833         case 2: *(uint16_t *)dst.reg = (uint16_t)dst.val; break;
>> 3834         case 4: *dst.reg = (uint32_t)dst.val; break; /* 64b: zero-ext */
>> 3835         case 8: *dst.reg = dst.val; break;
>> 3836         }
>> 3837         break;
>> 3838     case OP_MEM:
>> 3839         if ( !(d & Mov) && (dst.orig_val == dst.val) &&
>> 3840              !ctxt->force_writeback )
>> 3841             /* nothing to do */;
>> 3842         else if ( lock_prefix )
>> 3843             rc = ops->cmpxchg(
>> 3844                 dst.mem.seg, dst.mem.off, &dst.orig_val,
>> 3845                 &dst.val, dst.bytes, ctxt);
>> 3846         else
>> 3847             rc = ops->write(
>> 3848                 dst.mem.seg, dst.mem.off, &dst.val, dst.bytes, ctxt);
>> 3849         if ( rc != 0 )
>> 3850         {
>> 3851             ops->smp_unlock(lock_prefix);
>> 3852             goto done;
>> 3853         }
>> 3854     default:
>> 3855         break;
>> 3856     }
>> 3857     ops->smp_unlock(lock_prefix);
>>
>> I can still reproduce the guest hang. But if I lock at the very
>> beginning of x86_emulate() and unlock before each return, no more hangs.
> 
> Isn't that obvious? Locked instructions are necessarily
> read-modify-write ones, and hence the lock needs to be taken
> before the read, and dropped after the write. But remember, I'll
> continue to show opposition to this getting "fixed" this way (in the
> emulator itself), as long as no proper explanation can be given
> why making hvmemul_cmpxchg() do what its name says isn't all
> we need (and hence why i386-like bus lock behavior is needed).

Yes, that's what I thought, but I previously described my attempt to
lock _only_ hvmemul_cmpxchg() (which failed) - and the failure of that
change to address the issue was considered curious.
I've probably not been able to explain clearly what I've tried, or have
misunderstood the answer, and took it to mean that for some reason a
similar change is supposed to be able to fix it.

Obviously locking hvmemul_cmpxchg() would have only affected the OP_MEM
case above (with lock_prefix == 1), actually an even smaller scope than
the new one, and with no read locking either.

I guess the question now is what avenues are there to make
hvmemul_cmpxchg() do what its name says - I'm certainly open to trying
out any alternatives - my main concern is to have the problem fixed in
the best way possible, certainly not to have any specific version of
this patch make it into Xen.


Thanks,
Razvan
Jan Beulich May 3, 2016, 3:13 p.m. UTC | #40
>>> On 03.05.16 at 16:41, <rcojocaru@bitdefender.com> wrote:
> On 05/03/2016 05:30 PM, Jan Beulich wrote:
>>>>> On 03.05.16 at 16:20, <rcojocaru@bitdefender.com> wrote:
>>> I've kept experimenting with the patch but can't quite figure out why
>>> minimizing the lock scope to the writeback part would not be sufficient,
>>> but it isn't.
>>>
>>> I.e. with this code:
>>>
>>> 3824  writeback:
>>> 3825     ops->smp_lock(lock_prefix);
>>> 3826     switch ( dst.type )
>>> 3827     {
>>> 3828     case OP_REG:
>>> 3829         /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */
>>> 3830         switch ( dst.bytes )
>>> 3831         {
>>> 3832         case 1: *(uint8_t  *)dst.reg = (uint8_t)dst.val; break;
>>> 3833         case 2: *(uint16_t *)dst.reg = (uint16_t)dst.val; break;
>>> 3834         case 4: *dst.reg = (uint32_t)dst.val; break; /* 64b: zero-ext */
>>> 3835         case 8: *dst.reg = dst.val; break;
>>> 3836         }
>>> 3837         break;
>>> 3838     case OP_MEM:
>>> 3839         if ( !(d & Mov) && (dst.orig_val == dst.val) &&
>>> 3840              !ctxt->force_writeback )
>>> 3841             /* nothing to do */;
>>> 3842         else if ( lock_prefix )
>>> 3843             rc = ops->cmpxchg(
>>> 3844                 dst.mem.seg, dst.mem.off, &dst.orig_val,
>>> 3845                 &dst.val, dst.bytes, ctxt);
>>> 3846         else
>>> 3847             rc = ops->write(
>>> 3848                 dst.mem.seg, dst.mem.off, &dst.val, dst.bytes, ctxt);
>>> 3849         if ( rc != 0 )
>>> 3850         {
>>> 3851             ops->smp_unlock(lock_prefix);
>>> 3852             goto done;
>>> 3853         }
>>> 3854     default:
>>> 3855         break;
>>> 3856     }
>>> 3857     ops->smp_unlock(lock_prefix);
>>>
>>> I can still reproduce the guest hang. But if I lock at the very
>>> beginning of x86_emulate() and unlock before each return, no more hangs.
>> 
>> Isn't that obvious? Locked instructions are necessarily
>> read-modify-write ones, and hence the lock needs to be taken
>> before the read, and dropped after the write. But remember, I'll
>> continue to show opposition to this getting "fixed" this way (in the
>> emulator itself), as long as no proper explanation can be given
>> why making hvmemul_cmpxchg() do what its name says isn't all
>> we need (and hence why i386-like bus lock behavior is needed).
> 
> Yes, that's what I thought, but at a previous time I've described my
> attempt to lock _only_ hvmemul_cmpxchg() (which failed) - and the
> failure of that change to address the issue has been considered curious.
> I've probably not been able to explain clearly what I've tried, or have
> misunderstood the answer, and took it to mean that for some reason a
> similar change is supposed to be able to fix it.
> 
> Obviously locking hvmemul_cmpxchg() would have only affected the OP_MEM
> case above (with lock_prefix == 1), actually an even smaller scope than
> the new one, and with no read locking either.

Yeah, I may have got confused there too (resulting in me confusing
you, perhaps): Adding a spin lock to hvmemul_cmpxchg() of course
won't help. Making it use (or force use of) LOCK CMPXCHG would, I
suppose.

> I guess the question now is what avenues are there to make
> hvmemul_cmpxchg() do what its name says - I'm certainly open to trying
> out any alternatives - my main concern is to have the problem fixed in
> the best way possible, certainly not to have any specific version of
> this patch make it into Xen.

I guess making it no longer call hvmemul_write(), copying most of
that function's code while suitably replacing just the call to
hvm_copy_to_guest_virt() (with the assumption that the MMIO
paths at least for now don't need strict LOCK handling) is the only
viable route.
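
I.e. something along these lines (very rough sketch only -
hvmemul_map_guest_addr()/unmap() are made-up placeholders here, and
size handling, error paths and the MMIO fallback are all omitted):

    static int hvmemul_cmpxchg(
        enum x86_segment seg, unsigned long offset,
        void *p_old, void *p_new, unsigned int bytes,
        struct x86_emulate_ctxt *ctxt)
    {
        unsigned long old = 0, new = 0;
        void *map;

        memcpy(&old, p_old, bytes);
        memcpy(&new, p_new, bytes);

        /* Hypothetical helper: translate and map the guest address. */
        map = hvmemul_map_guest_addr(seg, offset, bytes, ctxt);
        if ( !map )
            return X86EMUL_UNHANDLEABLE;

        /* A real LOCK CMPXCHG on the mapping, not a plain write. */
        if ( __cmpxchg(map, old, new, bytes) != old )
        {
            unmap(map);                /* hypothetical */
            return X86EMUL_RETRY;      /* lost the race - re-execute */
        }

        unmap(map);
        return X86EMUL_OKAY;
    }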

Jan
Razvan Cojocaru May 4, 2016, 11:32 a.m. UTC | #41
On 05/03/2016 06:13 PM, Jan Beulich wrote:
>>>> On 03.05.16 at 16:41, <rcojocaru@bitdefender.com> wrote:
>> On 05/03/2016 05:30 PM, Jan Beulich wrote:
>>>>>> On 03.05.16 at 16:20, <rcojocaru@bitdefender.com> wrote:
>>>> I've kept experimenting with the patch but can't quite figure out why
>>>> minimizing the lock scope to the writeback part would not be sufficient,
>>>> but it isn't.
>>>>
>>>> I.e. with this code:
>>>>
>>>> 3824  writeback:
>>>> 3825     ops->smp_lock(lock_prefix);
>>>> 3826     switch ( dst.type )
>>>> 3827     {
>>>> 3828     case OP_REG:
>>>> 3829         /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */
>>>> 3830         switch ( dst.bytes )
>>>> 3831         {
>>>> 3832         case 1: *(uint8_t  *)dst.reg = (uint8_t)dst.val; break;
>>>> 3833         case 2: *(uint16_t *)dst.reg = (uint16_t)dst.val; break;
>>>> 3834         case 4: *dst.reg = (uint32_t)dst.val; break; /* 64b: zero-ext */
>>>> 3835         case 8: *dst.reg = dst.val; break;
>>>> 3836         }
>>>> 3837         break;
>>>> 3838     case OP_MEM:
>>>> 3839         if ( !(d & Mov) && (dst.orig_val == dst.val) &&
>>>> 3840              !ctxt->force_writeback )
>>>> 3841             /* nothing to do */;
>>>> 3842         else if ( lock_prefix )
>>>> 3843             rc = ops->cmpxchg(
>>>> 3844                 dst.mem.seg, dst.mem.off, &dst.orig_val,
>>>> 3845                 &dst.val, dst.bytes, ctxt);
>>>> 3846         else
>>>> 3847             rc = ops->write(
>>>> 3848                 dst.mem.seg, dst.mem.off, &dst.val, dst.bytes, ctxt);
>>>> 3849         if ( rc != 0 )
>>>> 3850         {
>>>> 3851             ops->smp_unlock(lock_prefix);
>>>> 3852             goto done;
>>>> 3853         }
>>>> 3854     default:
>>>> 3855         break;
>>>> 3856     }
>>>> 3857     ops->smp_unlock(lock_prefix);
>>>>
>>>> I can still reproduce the guest hang. But if I lock at the very
>>>> beginning of x86_emulate() and unlock before each return, no more hangs.
>>>
>>> Isn't that obvious? Locked instructions are necessarily
>>> read-modify-write ones, and hence the lock needs to be taken
>>> before the read, and dropped after the write. But remember, I'll
>>> continue to show opposition to this getting "fixed" this way (in the
>>> emulator itself), as long as no proper explanation can be given
>>> why making hvmemul_cmpxchg() do what its name says isn't all
>>> we need (and hence why i386-like bus lock behavior is needed).
>>
>> Yes, that's what I thought, but at a previous time I've described my
>> attempt to lock _only_ hvmemul_cmpxchg() (which failed) - and the
>> failure of that change to address the issue has been considered curious.
>> I've probably not been able to explain clearly what I've tried, or have
>> misunderstood the answer, and took it to mean that for some reason a
>> similar change is supposed to be able to fix it.
>>
>> Obviously locking hvmemul_cmpxchg() would have only affected the OP_MEM
>> case above (with lock_prefix == 1), actually an even smaller scope than
>> the new one, and with no read locking either.
> 
> Yeah, I may have got confused there too (resulting in me confusing
> you, perhaps): Adding a spin lock to hvmemul_cmpxchg() of course
> won't help. Making it use (or force us of) LOCK CMPXCHG would, I
> suppose.
> 
>> I guess the question now is what avenues are there to make
>> hvmemul_cmpxchg() do what its name says - I'm certainly open to trying
>> out any alternatives - my main concern is to have the problem fixed in
>> the best way possible, certainly not to have any specific version of
>> this patch make it into Xen.
> 
> I guess making it no longer call hvmemul_write(), copying most of
> that function's code while suitably replacing just the call to
> hvm_copy_to_guest_virt() (with the assumption that the MMIO
> paths at least for now don't need strict LOCK handling) is the only
> viable route.

Thank you for clearing that up!

But while implementing a stub that falls back to the actual LOCK CMPXCHG
and replacing hvm_copy_to_guest_virt() with it would indeed be an
improvement (with the added advantage of being able to treat
non-emulated LOCK CMPXCHG cases), I don't understand how that would
solve the read-modify-write atomicity problem.

AFAICT, this would only solve the write problem. Assuming we have VCPU1
and VCPU2 emulating a LOCKed instruction expecting rmw atomicity, the
stub alone would not prevent this:

VCPU1: read, modify
VCPU2: read, modify, write
VCPU1: write

Moreover, since reads and writes are not synchronized, it would be
possible for VCPU2's read to occur while VCPU1 writes, and VCPU2 could
read part of the old data + part of the new data (a torn read).

So the problem originally addressed by the patch would still need to be
addressed like that: with a read / write lock covering all the relevant
parts of x86_emulate(). Unless I'm mistaken, the stub part is only
needed to make sure that CMPXCHG alone does not race when a VCPU
emulates and another does not.


Thanks,
Razvan
Jan Beulich May 4, 2016, 1:42 p.m. UTC | #42
>>> On 04.05.16 at 13:32, <rcojocaru@bitdefender.com> wrote:
> But while implementing a stub that falls back to the actual LOCK CMPXCHG
> and replacing hvm_copy_to_guest_virt() with it would indeed be an
> improvement (with the added advantage of being able to treat
> non-emulated LOCK CMPXCHG cases), I don't understand how that would
> solve the read-modify-write atomicity problem.
> 
> AFAICT, this would only solve the write problem. Assuming we have VCPU1
> and VCPU2 emulating a LOCKed instruction expecting rmw atomicity, the
> stub alone would not prevent this:
> 
> VCPU1: read, modify
> VCPU2: read, modify, write
> VCPU1: write

I'm not sure I follow what you mean here: Does the above represent
what the guest does, or what the hypervisor does as steps to emulate
a _single_ guest instruction? In the former case, I don't see what
you're after. And in the latter case I don't understand why you think
using CMPXCHG instead of WRITE wouldn't help.

Jan

> Moreover, since reads and writes are not synchronized, it would be
> possible for VCPU2's read to occur while VCPU1 writes, and VCPU1 could
> read part of the old data + part of the new data.
> 
> So the problem originally addressed by the patch would still need to be
> addressed like that: with a read / write lock covering all the relevant
> parts of x86_emulate(). Unless I'm mistaken, the stub part is only
> needed to make sure that CMPXCHG alone does not race when a VCPU
> emulates and another does not.
> 
> 
> Thanks,
> Razvan
Razvan Cojocaru May 5, 2016, 9:25 a.m. UTC | #43
On 05/04/2016 04:42 PM, Jan Beulich wrote:
>>>> On 04.05.16 at 13:32, <rcojocaru@bitdefender.com> wrote:
>> But while implementing a stub that falls back to the actual LOCK CMPXCHG
>> and replacing hvm_copy_to_guest_virt() with it would indeed be an
>> improvement (with the added advantage of being able to treat
>> non-emulated LOCK CMPXCHG cases), I don't understand how that would
>> solve the read-modify-write atomicity problem.
>>
>> AFAICT, this would only solve the write problem. Assuming we have VCPU1
>> and VCPU2 emulating a LOCKed instruction expecting rmw atomicity, the
>> stub alone would not prevent this:
>>
>> VCPU1: read, modify
>> VCPU2: read, modify, write
>> VCPU1: write
> 
> I'm not sure I follow what you mean here: Does the above represent
> what the guest does, or what the hypervisor does as steps to emulate
> a _single_ guest instruction? In the former case, I don't see what
> you're after. And in the latter case I don't understand why you think
> using CMPXCHG instead of WRITE wouldn't help.

Briefly, this is the scenario: assuming a guest with two VCPUs and an
introspection application that has restricted access to a page, the
guest runs two LOCK instructions that touch the page, causing a page
fault for each instruction. This further translates to two EPT fault
vm_events being placed in the ring buffer.

By the time the introspection application polls the event channel, both
VCPUs are paused, waiting for replies to the vm_events.

If the monitoring application processes both events (puts both replies,
with the emulate option on, in the ring buffer), then signals the event
channel, it is possible that both VCPUs get woken up, ending up running
x86_emulate() simultaneously.

In this case, my understanding is that just using CMPXCHG will not work
(although it is clearly superior to the current implementation), because
the read part and the write part of x86_emulate() (when LOCKed
instructions are involved) should be executed atomically, but writing
the CMPXCHG stub would only make sure that two simultaneous writes won't
occur.

In other words, this would still be possible (atomicity would still not
be guaranteed for LOCKed instructions):

VCPU1: read
VCPU2: read, write
VCPU1: write

when what we want for LOCKed instructions is:

VCPU1: read, write
VCPU2: read, write
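
To make the difference concrete (made-up example): with the first
interleaving, if both VCPUs emulate "lock incl (counter)" and counter
starts at 0, an increment is lost:

    v1 = counter;      /* VCPU1 reads 0                          */
    v2 = counter;      /* VCPU2 reads 0                          */
    counter = v2 + 1;  /* VCPU2 writes back 1                    */
    counter = v1 + 1;  /* VCPU1 also writes 1 - should now be 2  */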

Am I misunderstanding how x86_emulate() works?


Thanks,
Razvan
Jan Beulich May 5, 2016, 4:38 p.m. UTC | #44
>>> Razvan Cojocaru <rcojocaru@bitdefender.com> 05/05/16 11:24 AM >>>
>On 05/04/2016 04:42 PM, Jan Beulich wrote:
>>>>> On 04.05.16 at 13:32, <rcojocaru@bitdefender.com> wrote:
>>> But while implementing a stub that falls back to the actual LOCK CMPXCHG
>>> and replacing hvm_copy_to_guest_virt() with it would indeed be an
>>> improvement (with the added advantage of being able to treat
>>> non-emulated LOCK CMPXCHG cases), I don't understand how that would
>>> solve the read-modify-write atomicity problem.
>>>
>>> AFAICT, this would only solve the write problem. Assuming we have VCPU1
>>> and VCPU2 emulating a LOCKed instruction expecting rmw atomicity, the
>>> stub alone would not prevent this:
>>>
>>> VCPU1: read, modify
>>> VCPU2: read, modify, write
>>> VCPU1: write
>> 
>> I'm not sure I follow what you mean here: Does the above represent
>> what the guest does, or what the hypervisor does as steps to emulate
>> a _single_ guest instruction? In the former case, I don't see what
>> you're after. And in the latter case I don't understand why you think
>> using CMPXCHG instead of WRITE wouldn't help.
>
>Briefly, this is the scenario: assuming a guest with two VCPUs and an
>introspection application that has restricted access to a page, the
>guest runs two LOCK instructions that touch the page, causing a page
>fault for each instruction. This further translates to two EPT fault
>vm_events being placed in the ring buffer.
>
>By the time the introspection application polls the event channel, both
>VCPUs are paused, waiting for replies to the vm_events.
>
>If the monitoring application processes both events (puts both replies,
>with the emulate option on, in the ring buffer), then signals the event
>channel, it is possible that both VCPUs get woken up, ending up running
>x86_emulate() simultaneously.
>
>In this case, my understanding is that just using CMPXCHG will not work
>(although it is clearly superior to the current implementation), because
>the read part and the write part of x86_emulate() (when LOCKed
>instructions are involved) should be executed atomically, but writing
>the CMPXCHG stub would only make sure that two simultaneous writes won't
>occur.
>
>In other words, this would still be possible (atomicity would still not
>be guaranteed for LOCKed instructions):
>
>VCPU1: read
>VCPU2: read, write
>VCPU1: write
>
>when what we want for LOCKed instructions is:
>
>VCPU1: read, write
>VCPU2: read, write

Okay, in short I take this to mean "single instruction" as the answer to
my actual question.

>Am I misunderstanding how x86_emulate() works?

No, but I suppose you're misunderstanding what I'm trying to suggest. What you
write above is not what will result when using CMPXCHG. Instead what we'll have
is

vCPU1: read
vCPU2: read
vCPU2: cmpxchg
vCPU1: cmpxchg

Note that the second cmpxchg will fail unless the first one wrote back an
unchanged value. Hence vCPU1 will be told to re-execute the instruction.
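
In pseudo-code, each LOCKed RMW emulation then effectively becomes
(sketch only, not the actual emulator flow):

    do {
        old = read(mem);        /* both vCPUs may observe the same value */
        new = op(old);          /* compute the instruction's result      */
    } while ( cmpxchg(mem, old, new) != old );  /* raced - try again */

Only one of two racing vCPUs can succeed for a given observed value;
the loser re-reads and retries, which gives the LOCK semantics.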

Jan
Wei Liu May 13, 2016, 3:27 p.m. UTC | #45
We're approaching the release date so I would like to wrap this up.

As I understand it, there is indeed an issue in the emulator, but a
proper fix that takes all cases into consideration has not been proposed.

Should we make this a blocker for the release? I'm inclined to say no
because it has been like this for a long time and doesn't seem to cause
trouble for our main use case.

Thoughts?

Wei.
Jan Beulich May 13, 2016, 3:51 p.m. UTC | #46
>>> On 13.05.16 at 17:27, <wei.liu2@citrix.com> wrote:
> We're approaching the release date so I would like to wrap this up.
> 
> As I understand it, there is indeed an issue in the emulator, but a
> proper fix that take into consideration all cases has not been proposed.
> 
> Should we make this a blocker for the release? I'm inclined to say no
> because it has been like this for a long time and doesn't seem to cause
> trouble for our main use case.
> 
> Thoughts?

I agree.

Jan

Patch

diff --git a/tools/tests/x86_emulator/test_x86_emulator.c b/tools/tests/x86_emulator/test_x86_emulator.c
index 86e298f..22e963b 100644
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -9,6 +9,8 @@ 
 
 #define __packed __attribute__((packed))
 
+typedef bool bool_t;
+
 #include "x86_emulate/x86_emulate.h"
 #include "blowfish.h"
 
@@ -160,6 +162,14 @@  int get_fpu(
     return X86EMUL_OKAY;
 }
 
+static void smp_lock(bool_t locked)
+{
+}
+
+static void smp_unlock(bool_t locked)
+{
+}
+
 static struct x86_emulate_ops emulops = {
     .read       = read,
     .insn_fetch = fetch,
@@ -167,6 +177,8 @@  static struct x86_emulate_ops emulops = {
     .cmpxchg    = cmpxchg,
     .cpuid      = cpuid,
     .get_fpu    = get_fpu,
+    .smp_lock   = smp_lock,
+    .smp_unlock = smp_unlock,
 };
 
 int main(int argc, char **argv)
diff --git a/xen/arch/x86/hvm/emulate.c b/xen/arch/x86/hvm/emulate.c
index cc0b841..02096d5 100644
--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -25,6 +25,8 @@ 
 #include <asm/hvm/svm/svm.h>
 #include <asm/vm_event.h>
 
+DEFINE_PERCPU_RWLOCK_GLOBAL(emulate_locked_rwlock);
+
 static void hvmtrace_io_assist(const ioreq_t *p)
 {
     unsigned int size, event;
@@ -1616,6 +1618,26 @@  static int hvmemul_vmfunc(
     return rc;
 }
 
+void emulate_smp_lock(bool_t locked)
+{
+    struct domain *d = current->domain;
+
+    if ( locked )
+        percpu_write_lock(emulate_locked_rwlock, &d->arch.emulate_lock);
+    else
+        percpu_read_lock(emulate_locked_rwlock, &d->arch.emulate_lock);
+}
+
+void emulate_smp_unlock(bool_t locked)
+{
+    struct domain *d = current->domain;
+
+    if ( locked )
+        percpu_write_unlock(emulate_locked_rwlock, &d->arch.emulate_lock);
+    else
+        percpu_read_unlock(emulate_locked_rwlock, &d->arch.emulate_lock);
+}
+
 static const struct x86_emulate_ops hvm_emulate_ops = {
     .read          = hvmemul_read,
     .insn_fetch    = hvmemul_insn_fetch,
@@ -1641,6 +1663,8 @@  static const struct x86_emulate_ops hvm_emulate_ops = {
     .put_fpu       = hvmemul_put_fpu,
     .invlpg        = hvmemul_invlpg,
     .vmfunc        = hvmemul_vmfunc,
+    .smp_lock      = emulate_smp_lock,
+    .smp_unlock    = emulate_smp_unlock,
 };
 
 static const struct x86_emulate_ops hvm_emulate_ops_no_write = {
@@ -1668,6 +1692,8 @@  static const struct x86_emulate_ops hvm_emulate_ops_no_write = {
     .put_fpu       = hvmemul_put_fpu,
     .invlpg        = hvmemul_invlpg,
     .vmfunc        = hvmemul_vmfunc,
+    .smp_lock      = emulate_smp_lock,
+    .smp_unlock    = emulate_smp_unlock,
 };
 
 static int _hvm_emulate_one(struct hvm_emulate_ctxt *hvmemul_ctxt,
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index bca7532..52a3c5d 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -112,6 +112,7 @@ 
 #include <asm/ldt.h>
 #include <asm/x86_emulate.h>
 #include <asm/e820.h>
+#include <asm/hvm/emulate.h>
 #include <asm/hypercall.h>
 #include <asm/shared.h>
 #include <asm/mem_sharing.h>
@@ -5319,6 +5320,8 @@  static const struct x86_emulate_ops ptwr_emulate_ops = {
     .insn_fetch = ptwr_emulated_read,
     .write      = ptwr_emulated_write,
     .cmpxchg    = ptwr_emulated_cmpxchg,
+    .smp_lock   = emulate_smp_lock,
+    .smp_unlock = emulate_smp_unlock,
 };
 
 /* Write page fault handler: check if guest is trying to modify a PTE. */
diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c
index ec87fb4..6d18430 100644
--- a/xen/arch/x86/mm/shadow/common.c
+++ b/xen/arch/x86/mm/shadow/common.c
@@ -283,6 +283,8 @@  static const struct x86_emulate_ops hvm_shadow_emulator_ops = {
     .insn_fetch = hvm_emulate_insn_fetch,
     .write      = hvm_emulate_write,
     .cmpxchg    = hvm_emulate_cmpxchg,
+    .smp_lock   = emulate_smp_lock,
+    .smp_unlock = emulate_smp_unlock,
 };
 
 static int
@@ -351,6 +353,8 @@  static const struct x86_emulate_ops pv_shadow_emulator_ops = {
     .insn_fetch = pv_emulate_read,
     .write      = pv_emulate_write,
     .cmpxchg    = pv_emulate_cmpxchg,
+    .smp_lock   = emulate_smp_lock,
+    .smp_unlock = emulate_smp_unlock,
 };
 
 const struct x86_emulate_ops *shadow_init_emulation(
diff --git a/xen/arch/x86/x86_emulate/x86_emulate.c b/xen/arch/x86/x86_emulate/x86_emulate.c
index 10a2959..aab934f 100644
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -1520,6 +1520,8 @@  x86_emulate(
     struct operand ea = { .type = OP_MEM, .reg = REG_POISON };
     ea.mem.seg = x86_seg_ds; /* gcc may reject anon union initializer */
 
+    ASSERT(ops->smp_lock && ops->smp_unlock);
+
     ctxt->retire.byte = 0;
 
     op_bytes = def_op_bytes = ad_bytes = def_ad_bytes = ctxt->addr_size/8;
@@ -1589,6 +1591,8 @@  x86_emulate(
     }
  done_prefixes:
 
+    ops->smp_lock(lock_prefix);
+
     if ( rex_prefix & REX_W )
         op_bytes = 8;
 
@@ -2052,7 +2056,10 @@  x86_emulate(
         generate_exception_if(mode_64bit() && !twobyte, EXC_UD, -1);
         fail_if(ops->read_segment == NULL);
         if ( (rc = ops->read_segment(src.val, &reg, ctxt)) != 0 )
+        {
+            ops->smp_unlock(lock_prefix);
             return rc;
+        }
         /* 64-bit mode: PUSH defaults to a 64-bit operand. */
         if ( mode_64bit() && (op_bytes == 4) )
             op_bytes = 8;
@@ -2074,7 +2081,10 @@  x86_emulate(
                               &dst.val, op_bytes, ctxt, ops)) != 0 )
             goto done;
         if ( (rc = load_seg(src.val, dst.val, 0, NULL, ctxt, ops)) != 0 )
+        {
+            ops->smp_unlock(lock_prefix);
             return rc;
+        }
         break;
 
     case 0x0e: /* push %%cs */
@@ -2380,7 +2390,12 @@  x86_emulate(
         }
         /* Write back the memory destination with implicit LOCK prefix. */
         dst.val = src.val;
-        lock_prefix = 1;
+        if ( !lock_prefix )
+        {
+            ops->smp_unlock(lock_prefix);
+            lock_prefix = 1;
+            ops->smp_lock(lock_prefix);
+        }
         break;
 
     case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */
@@ -3859,6 +3874,9 @@  x86_emulate(
  done:
     _put_fpu();
     put_stub(stub);
+
+    ops->smp_unlock(lock_prefix);
+
     return rc;
 
  twobyte_insn:
@@ -4767,5 +4785,8 @@  x86_emulate(
  cannot_emulate:
     _put_fpu();
     put_stub(stub);
+
+    ops->smp_unlock(lock_prefix);
+
     return X86EMUL_UNHANDLEABLE;
 }
diff --git a/xen/arch/x86/x86_emulate/x86_emulate.h b/xen/arch/x86/x86_emulate/x86_emulate.h
index 3a1bb46..e515840 100644
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -400,6 +400,14 @@  struct x86_emulate_ops
     /* vmfunc: Emulate VMFUNC via given set of EAX ECX inputs */
     int (*vmfunc)(
         struct x86_emulate_ctxt *ctxt);
+
+    /* smp_lock: Take a write lock if locked, read lock otherwise. */
+    void (*smp_lock)(
+        bool_t locked);
+
+    /* smp_unlock: Write unlock if locked, read unlock otherwise. */
+    void (*smp_unlock)(
+        bool_t locked);
 };
 
 struct cpu_user_regs;
diff --git a/xen/common/domain.c b/xen/common/domain.c
index 45273d4..0f98256 100644
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -272,6 +272,8 @@  struct domain *domain_create(domid_t domid, unsigned int domcr_flags,
 
     TRACE_1D(TRC_DOM0_DOM_ADD, d->domain_id);
 
+    percpu_rwlock_resource_init(&d->arch.emulate_lock, emulate_locked_rwlock);
+
     lock_profile_register_struct(LOCKPROF_TYPE_PERDOM, d, domid, "Domain");
 
     if ( (err = xsm_alloc_security_domain(d)) != 0 )
diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
index d393ed2..04312ae 100644
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -271,6 +271,8 @@  struct monitor_write_data {
     uint64_t cr4;
 };
 
+DECLARE_PERCPU_RWLOCK_GLOBAL(emulate_locked_rwlock);
+
 struct arch_domain
 {
     struct page_info *perdomain_l3_pg;
@@ -409,6 +411,8 @@  struct arch_domain
 
     /* Emulated devices enabled bitmap. */
     uint32_t emulation_flags;
+
+    percpu_rwlock_t emulate_lock;
 } __cacheline_aligned;
 
 #define has_vlapic(d)      (!!((d)->arch.emulation_flags & XEN_X86_EMU_LAPIC))
diff --git a/xen/include/asm-x86/hvm/emulate.h b/xen/include/asm-x86/hvm/emulate.h
index 142d1b6..863f01d 100644
--- a/xen/include/asm-x86/hvm/emulate.h
+++ b/xen/include/asm-x86/hvm/emulate.h
@@ -67,6 +67,9 @@  int hvmemul_do_pio_buffer(uint16_t port,
 void hvm_dump_emulation_state(const char *prefix,
                               struct hvm_emulate_ctxt *hvmemul_ctxt);
 
+void emulate_smp_lock(bool_t locked);
+void emulate_smp_unlock(bool_t locked);
+
 #endif /* __ASM_X86_HVM_EMULATE_H__ */
 
 /*