Message ID | 1489593882-18025-1-git-send-email-rcojocaru@bitdefender.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
>>> On 15.03.17 at 17:04, <rcojocaru@bitdefender.com> wrote: > --- > Changes since V1: > - Added Andrew Cooper's credit, as he's kept the patch current > througout non-trivial code changes since the initial patch. > - Significantly more patch testing (with XenServer). > - Restricted lock scope. Not by much, as it seems. In particular you continue to take the lock even for instructions not accessing memory at all. Also, by "reworked" I did assume you mean converted to at least the cmpxchg based model. > --- a/tools/tests/x86_emulator/test_x86_emulator.c > +++ b/tools/tests/x86_emulator/test_x86_emulator.c > @@ -283,6 +283,14 @@ static int read_msr( > return X86EMUL_UNHANDLEABLE; > } > > +static void smp_lock(bool locked) > +{ > +} > + > +static void smp_unlock(bool locked) > +{ > +} I don't think the hooks should be a requirement, and hence these shouldn't be needed. > --- a/xen/arch/x86/domain.c > +++ b/xen/arch/x86/domain.c > @@ -529,6 +529,8 @@ int arch_domain_create(struct domain *d, unsigned int domcr_flags, > if ( config == NULL && !is_idle_domain(d) ) > return -EINVAL; > > + percpu_rwlock_resource_init(&d->arch.emulate_lock, emulate_locked_rwlock); This should move into the same file as where the lock and the hook functions live, so that the variable can be static. I'm not sure ... > --- a/xen/arch/x86/hvm/emulate.c > +++ b/xen/arch/x86/hvm/emulate.c > @@ -24,6 +24,8 @@ > #include <asm/hvm/svm/svm.h> > #include <asm/vm_event.h> > > +DEFINE_PERCPU_RWLOCK_GLOBAL(emulate_locked_rwlock); ... this is the right file, though, considering the wide (including PV) use of it. > @@ -1731,6 +1755,8 @@ static const struct x86_emulate_ops hvm_emulate_ops_no_write = { > .put_fpu = hvmemul_put_fpu, > .invlpg = hvmemul_invlpg, > .vmfunc = hvmemul_vmfunc, > + .smp_lock = emulate_smp_lock, > + .smp_unlock = emulate_smp_unlock, > }; No need for the hooks here. 
> @@ -5485,6 +5487,8 @@ static const struct x86_emulate_ops mmio_ro_emulate_ops = { > .write = mmio_ro_emulated_write, > .validate = pv_emul_is_mem_write, > .cpuid = pv_emul_cpuid, > + .smp_lock = emulate_smp_lock, > + .smp_unlock = emulate_smp_unlock, > }; Nor here. > @@ -5524,6 +5528,8 @@ static const struct x86_emulate_ops mmcfg_intercept_ops = { > .write = mmcfg_intercept_write, > .validate = pv_emul_is_mem_write, > .cpuid = pv_emul_cpuid, > + .smp_lock = emulate_smp_lock, > + .smp_unlock = emulate_smp_unlock, > }; Not sure about this one, but generally I'd expect no LOCKed accesses to MMCFG space. > --- a/xen/arch/x86/traps.c > +++ b/xen/arch/x86/traps.c > @@ -2957,6 +2957,8 @@ static const struct x86_emulate_ops priv_op_ops = { > .write_msr = priv_op_write_msr, > .cpuid = pv_emul_cpuid, > .wbinvd = priv_op_wbinvd, > + .smp_lock = emulate_smp_lock, > + .smp_unlock = emulate_smp_unlock, > }; No need for the hooks again. > @@ -3065,6 +3065,8 @@ x86_emulate( > d = state.desc; > #define state (&state) > > + ops->smp_lock(lock_prefix); There's a "goto complete_insn" upwards from here, which therefore bypasses the acquire, but goes through the release path. Also this is still too early to take the lock. > @@ -7925,8 +7932,11 @@ x86_emulate( > ctxt->regs->eflags &= ~X86_EFLAGS_RF; > > done: > + ops->smp_unlock(lock_prefix); And this, imo, is too late (except for covering error exits coming here). I don't think you can avoid having a local tracking variable. > --- a/xen/arch/x86/x86_emulate/x86_emulate.h > +++ b/xen/arch/x86/x86_emulate/x86_emulate.h > @@ -448,6 +448,14 @@ struct x86_emulate_ops > /* vmfunc: Emulate VMFUNC via given set of EAX ECX inputs */ > int (*vmfunc)( > struct x86_emulate_ctxt *ctxt); > + > + /* smp_lock: Take a write lock if locked, read lock otherwise. */ > + void (*smp_lock)( > + bool locked); > + > + /* smp_unlock: Write unlock if locked, read unlock otherwise. 
*/ > + void (*smp_unlock)( > + bool locked); > }; All hooks should take a ctxt pointer. Jan
On 03/15/2017 06:30 PM, Jan Beulich wrote: >>>> On 15.03.17 at 17:04, <rcojocaru@bitdefender.com> wrote: >> --- >> Changes since V1: >> - Added Andrew Cooper's credit, as he's kept the patch current >> througout non-trivial code changes since the initial patch. >> - Significantly more patch testing (with XenServer). >> - Restricted lock scope. > > Not by much, as it seems. In particular you continue to take the > lock even for instructions not accessing memory at all. I'll take a closer look. > Also, by "reworked" I did assume you mean converted to at least the > cmpxchg based model. I haven't been able to follow the latest emulator changes closely, could you please clarify what you mean by "the cmpxchg model"? Thanks. >> --- a/tools/tests/x86_emulator/test_x86_emulator.c >> +++ b/tools/tests/x86_emulator/test_x86_emulator.c >> @@ -283,6 +283,14 @@ static int read_msr( >> return X86EMUL_UNHANDLEABLE; >> } >> >> +static void smp_lock(bool locked) >> +{ >> +} >> + >> +static void smp_unlock(bool locked) >> +{ >> +} > > I don't think the hooks should be a requirement, and hence these > shouldn't be needed. I'll make them optional. >> --- a/xen/arch/x86/domain.c >> +++ b/xen/arch/x86/domain.c >> @@ -529,6 +529,8 @@ int arch_domain_create(struct domain *d, unsigned int domcr_flags, >> if ( config == NULL && !is_idle_domain(d) ) >> return -EINVAL; >> >> + percpu_rwlock_resource_init(&d->arch.emulate_lock, emulate_locked_rwlock); > > This should move into the same file as where the lock and the hook > functions live, so that the variable can be static. I'm not sure ... > >> --- a/xen/arch/x86/hvm/emulate.c >> +++ b/xen/arch/x86/hvm/emulate.c >> @@ -24,6 +24,8 @@ >> #include <asm/hvm/svm/svm.h> >> #include <asm/vm_event.h> >> >> +DEFINE_PERCPU_RWLOCK_GLOBAL(emulate_locked_rwlock); > > ... this is the right file, though, considering the wide (including PV) > use of it. I'll hunt for a better place for it. 
>> @@ -3065,6 +3065,8 @@ x86_emulate( >> d = state.desc; >> #define state (&state) >> >> + ops->smp_lock(lock_prefix); > > There's a "goto complete_insn" upwards from here, which therefore > bypasses the acquire, but goes through the release path. Also this is > still too early to take the lock. True, it appears that there have been changes in staging since the last test. I'll need to follow the locking patch carefully. >> @@ -7925,8 +7932,11 @@ x86_emulate( >> ctxt->regs->eflags &= ~X86_EFLAGS_RF; >> >> done: >> + ops->smp_unlock(lock_prefix); > > And this, imo, is too late (except for covering error exits coming > here). I don't think you can avoid having a local tracking variable. Fair enough. Will use one. >> --- a/xen/arch/x86/x86_emulate/x86_emulate.h >> +++ b/xen/arch/x86/x86_emulate/x86_emulate.h >> @@ -448,6 +448,14 @@ struct x86_emulate_ops >> /* vmfunc: Emulate VMFUNC via given set of EAX ECX inputs */ >> int (*vmfunc)( >> struct x86_emulate_ctxt *ctxt); >> + >> + /* smp_lock: Take a write lock if locked, read lock otherwise. */ >> + void (*smp_lock)( >> + bool locked); >> + >> + /* smp_unlock: Write unlock if locked, read unlock otherwise. */ >> + void (*smp_unlock)( >> + bool locked); >> }; > > All hooks should take a ctxt pointer. I'll try to adapt them. Thanks, Razvan
>>> On 15.03.17 at 17:46, <rcojocaru@bitdefender.com> wrote: > On 03/15/2017 06:30 PM, Jan Beulich wrote: >>>>> On 15.03.17 at 17:04, <rcojocaru@bitdefender.com> wrote: >>> --- >>> Changes since V1: >>> - Added Andrew Cooper's credit, as he's kept the patch current >>> througout non-trivial code changes since the initial patch. >>> - Significantly more patch testing (with XenServer). >>> - Restricted lock scope. >> >> Not by much, as it seems. In particular you continue to take the >> lock even for instructions not accessing memory at all. > > I'll take a closer look. > >> Also, by "reworked" I did assume you mean converted to at least the >> cmpxchg based model. > > I haven't been able to follow the latest emulator changes closely, could > you please clarify what you mean by "the cmpxchg model"? Thanks. This is unrelated to any recent changes. The idea is to make the ->cmpxchg() hook actually behave like what its name says. It's being used for LOCKed insn writeback already, and it could therefore simply force a retry of the full instruction if the compare part of it fails. It may need to be given another parameter, to allow the hook function to tell LOCKed from "normal" uses. Jan
On 03/15/2017 06:57 PM, Jan Beulich wrote: >>>> On 15.03.17 at 17:46, <rcojocaru@bitdefender.com> wrote: >> On 03/15/2017 06:30 PM, Jan Beulich wrote: >>>>>> On 15.03.17 at 17:04, <rcojocaru@bitdefender.com> wrote: >>>> --- >>>> Changes since V1: >>>> - Added Andrew Cooper's credit, as he's kept the patch current >>>> througout non-trivial code changes since the initial patch. >>>> - Significantly more patch testing (with XenServer). >>>> - Restricted lock scope. >>> >>> Not by much, as it seems. In particular you continue to take the >>> lock even for instructions not accessing memory at all. >> >> I'll take a closer look. >> >>> Also, by "reworked" I did assume you mean converted to at least the >>> cmpxchg based model. >> >> I haven't been able to follow the latest emulator changes closely, could >> you please clarify what you mean by "the cmpxchg model"? Thanks. > > This is unrelated to any recent changes. The idea is to make the > ->cmpxchg() hook actually behave like what its name says. It's > being used for LOCKed insn writeback already, and it could > therefore simply force a retry of the full instruction if the compare > part of it fails. It may need to be given another parameter, to > allow the hook function to tell LOCKed from "normal" uses. I assume this is what you have in mind? static int hvmemul_cmpxchg( enum x86_segment seg, unsigned long offset, void *p_old, void *p_new, unsigned int bytes, struct x86_emulate_ctxt *ctxt) { /* Fix this in case the guest is really relying on r-m-w atomicity. 
*/ uint64_t read; int rc; rc = hvmemul_read(seg, offset, &read, bytes, ctxt); if ( rc != X86EMUL_OKAY ) return rc; switch( bytes ) { case 1: if ( *(uint8_t *)read != *(uint8_t *)p_old ) { *(uint8_t *)p_old = *(uint8_t *)&read; return X86EMUL_RETRY; } break; case 2: if ( *(uint16_t *)read != *(uint16_t *)p_old ) { *(uint16_t *)p_old = *(uint16_t *)&read; return X86EMUL_RETRY; } break; case 4: if ( *(uint32_t *)&read != *(uint32_t *)p_old ) { *(uint32_t *)p_old = *(uint32_t *)&read; return X86EMUL_RETRY; } break; case 8: if ( read != *(uint64_t *)p_old ) { *(uint64_t *)p_old = read; return X86EMUL_RETRY; } break; } return hvmemul_write(seg, offset, p_new, bytes, ctxt); } I can rip this off and send this one small patch independently of the larger LOCK one if you think it's a good idea. Thanks, Razvan
On 03/21/2017 05:38 PM, Razvan Cojocaru wrote: > On 03/15/2017 06:57 PM, Jan Beulich wrote: >>>>> On 15.03.17 at 17:46, <rcojocaru@bitdefender.com> wrote: >>> On 03/15/2017 06:30 PM, Jan Beulich wrote: >>>>>>> On 15.03.17 at 17:04, <rcojocaru@bitdefender.com> wrote: >>>>> --- >>>>> Changes since V1: >>>>> - Added Andrew Cooper's credit, as he's kept the patch current >>>>> througout non-trivial code changes since the initial patch. >>>>> - Significantly more patch testing (with XenServer). >>>>> - Restricted lock scope. >>>> >>>> Not by much, as it seems. In particular you continue to take the >>>> lock even for instructions not accessing memory at all. >>> >>> I'll take a closer look. >>> >>>> Also, by "reworked" I did assume you mean converted to at least the >>>> cmpxchg based model. >>> >>> I haven't been able to follow the latest emulator changes closely, could >>> you please clarify what you mean by "the cmpxchg model"? Thanks. >> >> This is unrelated to any recent changes. The idea is to make the >> ->cmpxchg() hook actually behave like what its name says. It's >> being used for LOCKed insn writeback already, and it could >> therefore simply force a retry of the full instruction if the compare >> part of it fails. It may need to be given another parameter, to >> allow the hook function to tell LOCKed from "normal" uses. > > I assume this is what you have in mind? > > > static int hvmemul_cmpxchg( > enum x86_segment seg, > unsigned long offset, > void *p_old, > void *p_new, > unsigned int bytes, > struct x86_emulate_ctxt *ctxt) > { > /* Fix this in case the guest is really relying on r-m-w atomicity. 
*/ > uint64_t read; > int rc; > > rc = hvmemul_read(seg, offset, &read, bytes, ctxt); > > if ( rc != X86EMUL_OKAY ) > return rc; > > switch( bytes ) > { > case 1: > if ( *(uint8_t *)read != *(uint8_t *)p_old ) > { > *(uint8_t *)p_old = *(uint8_t *)&read; > return X86EMUL_RETRY; > } > break; > case 2: > if ( *(uint16_t *)read != *(uint16_t *)p_old ) > { > *(uint16_t *)p_old = *(uint16_t *)&read; > return X86EMUL_RETRY; > } > break; Sorry, forgot to add & to "read" for the two cases above. Thanks, Razvan
>>> On 21.03.17 at 16:38, <rcojocaru@bitdefender.com> wrote: > On 03/15/2017 06:57 PM, Jan Beulich wrote: >>>>> On 15.03.17 at 17:46, <rcojocaru@bitdefender.com> wrote: >>> On 03/15/2017 06:30 PM, Jan Beulich wrote: >>>>>>> On 15.03.17 at 17:04, <rcojocaru@bitdefender.com> wrote: >>>>> --- >>>>> Changes since V1: >>>>> - Added Andrew Cooper's credit, as he's kept the patch current >>>>> througout non-trivial code changes since the initial patch. >>>>> - Significantly more patch testing (with XenServer). >>>>> - Restricted lock scope. >>>> >>>> Not by much, as it seems. In particular you continue to take the >>>> lock even for instructions not accessing memory at all. >>> >>> I'll take a closer look. >>> >>>> Also, by "reworked" I did assume you mean converted to at least the >>>> cmpxchg based model. >>> >>> I haven't been able to follow the latest emulator changes closely, could >>> you please clarify what you mean by "the cmpxchg model"? Thanks. >> >> This is unrelated to any recent changes. The idea is to make the >> ->cmpxchg() hook actually behave like what its name says. It's >> being used for LOCKed insn writeback already, and it could >> therefore simply force a retry of the full instruction if the compare >> part of it fails. It may need to be given another parameter, to >> allow the hook function to tell LOCKed from "normal" uses. > > I assume this is what you have in mind? Hmm, not exactly. I'd expect an actual CMPXCHG instruction to be used, with a LOCK prefix when the original access had one. There should certainly not be any tail call to hvmemul_write() anymore. Jan
On 03/21/2017 06:29 PM, Jan Beulich wrote: >>>> On 21.03.17 at 16:38, <rcojocaru@bitdefender.com> wrote: >> On 03/15/2017 06:57 PM, Jan Beulich wrote: >>>>>> On 15.03.17 at 17:46, <rcojocaru@bitdefender.com> wrote: >>>> On 03/15/2017 06:30 PM, Jan Beulich wrote: >>>>>>>> On 15.03.17 at 17:04, <rcojocaru@bitdefender.com> wrote: >>>>>> --- >>>>>> Changes since V1: >>>>>> - Added Andrew Cooper's credit, as he's kept the patch current >>>>>> througout non-trivial code changes since the initial patch. >>>>>> - Significantly more patch testing (with XenServer). >>>>>> - Restricted lock scope. >>>>> >>>>> Not by much, as it seems. In particular you continue to take the >>>>> lock even for instructions not accessing memory at all. >>>> >>>> I'll take a closer look. >>>> >>>>> Also, by "reworked" I did assume you mean converted to at least the >>>>> cmpxchg based model. >>>> >>>> I haven't been able to follow the latest emulator changes closely, could >>>> you please clarify what you mean by "the cmpxchg model"? Thanks. >>> >>> This is unrelated to any recent changes. The idea is to make the >>> ->cmpxchg() hook actually behave like what its name says. It's >>> being used for LOCKed insn writeback already, and it could >>> therefore simply force a retry of the full instruction if the compare >>> part of it fails. It may need to be given another parameter, to >>> allow the hook function to tell LOCKed from "normal" uses. >> >> I assume this is what you have in mind? > > Hmm, not exactly. I'd expect an actual CMPXCHG instruction to > be used, with a LOCK prefix when the original access had one. > There should certainly not be any tail call to hvmemul_write() > anymore. There seems to be a misunderstanding here: if I understand this reply correctly, you'd like hvmemul_cmpxchg() to become a stub that really runs CMPXCHG - but if that comes with the smp_lock part as well, it doesn't matter (except for the now-ignored p_old part, i.e. the comparison). 
I've initially asked if I should bring the old XenServer-queued LOCK patch back, and I've understood that it could serve a purpose, at least as a temporary workaround until your and Andrew's emulator changes make it unnecessary. However, it appears that you've understood this to mean the addition of the CMPXCHG stub (speaking of which, could you please point out examples of implementing such stubs in the Xen code? I have never done one of those before.) Thanks, Razvan
>>> On 21.03.17 at 17:44, <rcojocaru@bitdefender.com> wrote: > On 03/21/2017 06:29 PM, Jan Beulich wrote: >>>>> On 21.03.17 at 16:38, <rcojocaru@bitdefender.com> wrote: >>> On 03/15/2017 06:57 PM, Jan Beulich wrote: >>>>>>> On 15.03.17 at 17:46, <rcojocaru@bitdefender.com> wrote: >>>>> On 03/15/2017 06:30 PM, Jan Beulich wrote: >>>>>>>>> On 15.03.17 at 17:04, <rcojocaru@bitdefender.com> wrote: >>>>>>> --- >>>>>>> Changes since V1: >>>>>>> - Added Andrew Cooper's credit, as he's kept the patch current >>>>>>> througout non-trivial code changes since the initial patch. >>>>>>> - Significantly more patch testing (with XenServer). >>>>>>> - Restricted lock scope. >>>>>> >>>>>> Not by much, as it seems. In particular you continue to take the >>>>>> lock even for instructions not accessing memory at all. >>>>> >>>>> I'll take a closer look. >>>>> >>>>>> Also, by "reworked" I did assume you mean converted to at least the >>>>>> cmpxchg based model. >>>>> >>>>> I haven't been able to follow the latest emulator changes closely, could >>>>> you please clarify what you mean by "the cmpxchg model"? Thanks. >>>> >>>> This is unrelated to any recent changes. The idea is to make the >>>> ->cmpxchg() hook actually behave like what its name says. It's >>>> being used for LOCKed insn writeback already, and it could >>>> therefore simply force a retry of the full instruction if the compare >>>> part of it fails. It may need to be given another parameter, to >>>> allow the hook function to tell LOCKed from "normal" uses. >>> >>> I assume this is what you have in mind? >> >> Hmm, not exactly. I'd expect an actual CMPXCHG instruction to >> be used, with a LOCK prefix when the original access had one. >> There should certainly not be any tail call to hvmemul_write() >> anymore. 
> > There seems to be a misunderstanding here: if I understand this reply > correctly, you'd like hvmemul_cmpxchg() to become a stub that really > runs CMPXCHG - but if that comes with the smp_lock part as well, it > doesn't matter (except for the now-ignored p_old part, i.e. the comparison). > > I've initially asked if I should bring the old XenServer-queued LOCK > patch back, and I've understood that it could serve a purpose, at least > as a temporary workaround until your, and Andrew's emulator changes, > make it unrequired. Well, yes, as said earlier (still visible in context above) I had originally assumed "reworked" would mean making the cmpxchg hook actually match its name. > However, it appears that you've understood this to > mean the addition of the CMPXCHG stub (speaking of which, could you > please point out examples of implementing such stubs in the Xen code? I > have never done one of those before.) There's no talk about stubs here. All I had hoped would have been done _instead_ of the smp-lock approach was to use the CMPXCHG instruction inside the hook function, at least for RAM case (I think we could live with MMIO still being forwarded to the write handler). So all this isn't to say that the smp-lock approach might not be worthwhile to take on its own, provided the locked region gets shrunk as much as possible without losing correctness. The CMPXCHG model would just not have this locking granularity problem in the first place. Jan
On 03/21/2017 07:02 PM, Jan Beulich wrote: >>>> On 21.03.17 at 17:44, <rcojocaru@bitdefender.com> wrote: >> On 03/21/2017 06:29 PM, Jan Beulich wrote: >>>>>> On 21.03.17 at 16:38, <rcojocaru@bitdefender.com> wrote: >>>> On 03/15/2017 06:57 PM, Jan Beulich wrote: >>>>>>>> On 15.03.17 at 17:46, <rcojocaru@bitdefender.com> wrote: >>>>>> On 03/15/2017 06:30 PM, Jan Beulich wrote: >>>>>>>>>> On 15.03.17 at 17:04, <rcojocaru@bitdefender.com> wrote: >>>>>>>> --- >>>>>>>> Changes since V1: >>>>>>>> - Added Andrew Cooper's credit, as he's kept the patch current >>>>>>>> througout non-trivial code changes since the initial patch. >>>>>>>> - Significantly more patch testing (with XenServer). >>>>>>>> - Restricted lock scope. >>>>>>> >>>>>>> Not by much, as it seems. In particular you continue to take the >>>>>>> lock even for instructions not accessing memory at all. >>>>>> >>>>>> I'll take a closer look. >>>>>> >>>>>>> Also, by "reworked" I did assume you mean converted to at least the >>>>>>> cmpxchg based model. >>>>>> >>>>>> I haven't been able to follow the latest emulator changes closely, could >>>>>> you please clarify what you mean by "the cmpxchg model"? Thanks. >>>>> >>>>> This is unrelated to any recent changes. The idea is to make the >>>>> ->cmpxchg() hook actually behave like what its name says. It's >>>>> being used for LOCKed insn writeback already, and it could >>>>> therefore simply force a retry of the full instruction if the compare >>>>> part of it fails. It may need to be given another parameter, to >>>>> allow the hook function to tell LOCKed from "normal" uses. >>>> >>>> I assume this is what you have in mind? >>> >>> Hmm, not exactly. I'd expect an actual CMPXCHG instruction to >>> be used, with a LOCK prefix when the original access had one. >>> There should certainly not be any tail call to hvmemul_write() >>> anymore. 
>> >> There seems to be a misunderstanding here: if I understand this reply >> correctly, you'd like hvmemul_cmpxchg() to become a stub that really >> runs CMPXCHG - but if that comes with the smp_lock part as well, it >> doesn't matter (except for the now-ignored p_old part, i.e. the comparison). >> >> I've initially asked if I should bring the old XenServer-queued LOCK >> patch back, and I've understood that it could serve a purpose, at least >> as a temporary workaround until your, and Andrew's emulator changes, >> make it unrequired. > > Well, yes, as said earlier (still visible in context above) I had > originally assumed "reworked" would mean making the cmpxchg > hook actually match its name. > >> However, it appears that you've understood this to >> mean the addition of the CMPXCHG stub (speaking of which, could you >> please point out examples of implementing such stubs in the Xen code? I >> have never done one of those before.) > > There's no talk about stubs here. All I had hoped would have been > done _instead_ of the smp-lock approach was to use the CMPXCHG > instruction inside the hook function, at least for RAM case (I think > we could live with MMIO still being forwarded to the write handler). > > So all this isn't to say that the smp-lock approach might not be > worthwhile to take on its own, provided the locked region gets > shrunk as much as possible without losing correctness. The > CMPXCHG model would just not have this locking granularity > problem in the first place. Sadly, I've now written this (rough) patch: http://pastebin.com/3DJ5WYt0 only to find that it does not solve our issue. With multiple processors per guest and heavy emulation at boot time, the VM got stuck at roughly the same point in its life as before the patch. Looks like more than CMPXCHG needs synchronizing. So it would appear that the smp_lock patch is the only way to bring this under control. Either that, or my patch misses something. Single-processor guests work just fine. 
Thanks, Razvan
>>> On 22.03.17 at 18:22, <rcojocaru@bitdefender.com> wrote: > On 03/21/2017 07:02 PM, Jan Beulich wrote: >>>>> On 21.03.17 at 17:44, <rcojocaru@bitdefender.com> wrote: >>> On 03/21/2017 06:29 PM, Jan Beulich wrote: >>>>>>> On 21.03.17 at 16:38, <rcojocaru@bitdefender.com> wrote: >>>>> On 03/15/2017 06:57 PM, Jan Beulich wrote: >>>>>>>>> On 15.03.17 at 17:46, <rcojocaru@bitdefender.com> wrote: >>>>>>> On 03/15/2017 06:30 PM, Jan Beulich wrote: >>>>>>>>>>> On 15.03.17 at 17:04, <rcojocaru@bitdefender.com> wrote: >>>>>>>>> --- >>>>>>>>> Changes since V1: >>>>>>>>> - Added Andrew Cooper's credit, as he's kept the patch current >>>>>>>>> througout non-trivial code changes since the initial patch. >>>>>>>>> - Significantly more patch testing (with XenServer). >>>>>>>>> - Restricted lock scope. >>>>>>>> >>>>>>>> Not by much, as it seems. In particular you continue to take the >>>>>>>> lock even for instructions not accessing memory at all. >>>>>>> >>>>>>> I'll take a closer look. >>>>>>> >>>>>>>> Also, by "reworked" I did assume you mean converted to at least the >>>>>>>> cmpxchg based model. >>>>>>> >>>>>>> I haven't been able to follow the latest emulator changes closely, could >>>>>>> you please clarify what you mean by "the cmpxchg model"? Thanks. >>>>>> >>>>>> This is unrelated to any recent changes. The idea is to make the >>>>>> ->cmpxchg() hook actually behave like what its name says. It's >>>>>> being used for LOCKed insn writeback already, and it could >>>>>> therefore simply force a retry of the full instruction if the compare >>>>>> part of it fails. It may need to be given another parameter, to >>>>>> allow the hook function to tell LOCKed from "normal" uses. >>>>> >>>>> I assume this is what you have in mind? >>>> >>>> Hmm, not exactly. I'd expect an actual CMPXCHG instruction to >>>> be used, with a LOCK prefix when the original access had one. >>>> There should certainly not be any tail call to hvmemul_write() >>>> anymore. 
>>> >>> There seems to be a misunderstanding here: if I understand this reply >>> correctly, you'd like hvmemul_cmpxchg() to become a stub that really >>> runs CMPXCHG - but if that comes with the smp_lock part as well, it >>> doesn't matter (except for the now-ignored p_old part, i.e. the comparison). >>> >>> I've initially asked if I should bring the old XenServer-queued LOCK >>> patch back, and I've understood that it could serve a purpose, at least >>> as a temporary workaround until your, and Andrew's emulator changes, >>> make it unrequired. >> >> Well, yes, as said earlier (still visible in context above) I had >> originally assumed "reworked" would mean making the cmpxchg >> hook actually match its name. >> >>> However, it appears that you've understood this to >>> mean the addition of the CMPXCHG stub (speaking of which, could you >>> please point out examples of implementing such stubs in the Xen code? I >>> have never done one of those before.) >> >> There's no talk about stubs here. All I had hoped would have been >> done _instead_ of the smp-lock approach was to use the CMPXCHG >> instruction inside the hook function, at least for RAM case (I think >> we could live with MMIO still being forwarded to the write handler). >> >> So all this isn't to say that the smp-lock approach might not be >> worthwhile to take on its own, provided the locked region gets >> shrunk as much as possible without losing correctness. The >> CMPXCHG model would just not have this locking granularity >> problem in the first place. > > Sadly, I've now written this (rough) patch: > > http://pastebin.com/3DJ5WYt0 > > only to find that it does not solve our issue. With multiple processors > per guest and heavy emulation at boot time, the VM got stuck at roughly > the same point in its life as before the patch. Looks like more than > CMPXCHG needs synchronizing. > > So it would appear that the smp_lock patch is the only way to bring this > under control. 
Either that, or my patch misses something. > Single-processor guests work just fine. Well, first of all the code to return RETRY is commented out. I may guess that you've tried with it not commented out, but I can't be sure. And then "stuck" of course can mean many things, so a better description of the state the VM is in and/or the operations that it's stuck on would be needed. Jan
On 03/23/2017 10:19 AM, Jan Beulich wrote: >>>> On 22.03.17 at 18:22, <rcojocaru@bitdefender.com> wrote: >> On 03/21/2017 07:02 PM, Jan Beulich wrote: >>>>>> On 21.03.17 at 17:44, <rcojocaru@bitdefender.com> wrote: >>>> On 03/21/2017 06:29 PM, Jan Beulich wrote: >>>>>>>> On 21.03.17 at 16:38, <rcojocaru@bitdefender.com> wrote: >>>>>> On 03/15/2017 06:57 PM, Jan Beulich wrote: >>>>>>>>>> On 15.03.17 at 17:46, <rcojocaru@bitdefender.com> wrote: >>>>>>>> On 03/15/2017 06:30 PM, Jan Beulich wrote: >>>>>>>>>>>> On 15.03.17 at 17:04, <rcojocaru@bitdefender.com> wrote: >>>>>>>>>> --- >>>>>>>>>> Changes since V1: >>>>>>>>>> - Added Andrew Cooper's credit, as he's kept the patch current >>>>>>>>>> througout non-trivial code changes since the initial patch. >>>>>>>>>> - Significantly more patch testing (with XenServer). >>>>>>>>>> - Restricted lock scope. >>>>>>>>> >>>>>>>>> Not by much, as it seems. In particular you continue to take the >>>>>>>>> lock even for instructions not accessing memory at all. >>>>>>>> >>>>>>>> I'll take a closer look. >>>>>>>> >>>>>>>>> Also, by "reworked" I did assume you mean converted to at least the >>>>>>>>> cmpxchg based model. >>>>>>>> >>>>>>>> I haven't been able to follow the latest emulator changes closely, could >>>>>>>> you please clarify what you mean by "the cmpxchg model"? Thanks. >>>>>>> >>>>>>> This is unrelated to any recent changes. The idea is to make the >>>>>>> ->cmpxchg() hook actually behave like what its name says. It's >>>>>>> being used for LOCKed insn writeback already, and it could >>>>>>> therefore simply force a retry of the full instruction if the compare >>>>>>> part of it fails. It may need to be given another parameter, to >>>>>>> allow the hook function to tell LOCKed from "normal" uses. >>>>>> >>>>>> I assume this is what you have in mind? >>>>> >>>>> Hmm, not exactly. I'd expect an actual CMPXCHG instruction to >>>>> be used, with a LOCK prefix when the original access had one. 
>>>>> There should certainly not be any tail call to hvmemul_write() >>>>> anymore. >>>> >>>> There seems to be a misunderstanding here: if I understand this reply >>>> correctly, you'd like hvmemul_cmpxchg() to become a stub that really >>>> runs CMPXCHG - but if that comes with the smp_lock part as well, it >>>> doesn't matter (except for the now-ignored p_old part, i.e. the comparison). >>>> >>>> I've initially asked if I should bring the old XenServer-queued LOCK >>>> patch back, and I've understood that it could serve a purpose, at least >>>> as a temporary workaround until your, and Andrew's emulator changes, >>>> make it unrequired. >>> >>> Well, yes, as said earlier (still visible in context above) I had >>> originally assumed "reworked" would mean making the cmpxchg >>> hook actually match its name. >>> >>>> However, it appears that you've understood this to >>>> mean the addition of the CMPXCHG stub (speaking of which, could you >>>> please point out examples of implementing such stubs in the Xen code? I >>>> have never done one of those before.) >>> >>> There's no talk about stubs here. All I had hoped would have been >>> done _instead_ of the smp-lock approach was to use the CMPXCHG >>> instruction inside the hook function, at least for RAM case (I think >>> we could live with MMIO still being forwarded to the write handler). >>> >>> So all this isn't to say that the smp-lock approach might not be >>> worthwhile to take on its own, provided the locked region gets >>> shrunk as much as possible without losing correctness. The >>> CMPXCHG model would just not have this locking granularity >>> problem in the first place. >> >> Sadly, I've now written this (rough) patch: >> >> http://pastebin.com/3DJ5WYt0 >> >> only to find that it does not solve our issue. With multiple processors >> per guest and heavy emulation at boot time, the VM got stuck at roughly >> the same point in its life as before the patch. Looks like more than >> CMPXCHG needs synchronizing. 
>> >> So it would appear that the smp_lock patch is the only way to bring this >> under control. Either that, or my patch misses something. >> Single-processor guests work just fine. > > Well, first of all the code to return RETRY is commented out. I > may guess that you've tried with it not commented out, but I > can't be sure. Indeed I have tried with it on, with the condition for success being at first "val != old", and then, as it is in the pasted code, "val == new". Both of these caused BSODs and random crashes. The guest only boots without any apparent issues (and with 1 VCPU only) after I've stopped returning RETRY. > And then "stuck" of course can mean many things, so a better > description of the state the VM is in and/or the operations that > it's stuck on would be needed. I agree, but on a first look it would appear that the hang is just like the one without the patch, when multiple processors are involved. This also is in line with my previous observation that when only cmpxchg() is locked, the problem still occurs (you may remember that initially I've tried to only synchronize the cmpxchg() call in x86_emulate()). I'll try to give the patch a few more spins and compare the guest stuck state with the stuck state without the patch. Thanks, Razvan
>>> On 23.03.17 at 09:27, <rcojocaru@bitdefender.com> wrote: > On 03/23/2017 10:19 AM, Jan Beulich wrote: >>>>> On 22.03.17 at 18:22, <rcojocaru@bitdefender.com> wrote: >>> On 03/21/2017 07:02 PM, Jan Beulich wrote: >>>>>>> On 21.03.17 at 17:44, <rcojocaru@bitdefender.com> wrote: >>>>> On 03/21/2017 06:29 PM, Jan Beulich wrote: >>>>>>>>> On 21.03.17 at 16:38, <rcojocaru@bitdefender.com> wrote: >>>>>>> On 03/15/2017 06:57 PM, Jan Beulich wrote: >>>>>>>>>>> On 15.03.17 at 17:46, <rcojocaru@bitdefender.com> wrote: >>>>>>>>> On 03/15/2017 06:30 PM, Jan Beulich wrote: >>>>>>>>>>>>> On 15.03.17 at 17:04, <rcojocaru@bitdefender.com> wrote: >>>>>>>>>>> --- >>>>>>>>>>> Changes since V1: >>>>>>>>>>> - Added Andrew Cooper's credit, as he's kept the patch current >>>>>>>>>>> througout non-trivial code changes since the initial patch. >>>>>>>>>>> - Significantly more patch testing (with XenServer). >>>>>>>>>>> - Restricted lock scope. >>>>>>>>>> >>>>>>>>>> Not by much, as it seems. In particular you continue to take the >>>>>>>>>> lock even for instructions not accessing memory at all. >>>>>>>>> >>>>>>>>> I'll take a closer look. >>>>>>>>> >>>>>>>>>> Also, by "reworked" I did assume you mean converted to at least the >>>>>>>>>> cmpxchg based model. >>>>>>>>> >>>>>>>>> I haven't been able to follow the latest emulator changes closely, could >>>>>>>>> you please clarify what you mean by "the cmpxchg model"? Thanks. >>>>>>>> >>>>>>>> This is unrelated to any recent changes. The idea is to make the >>>>>>>> ->cmpxchg() hook actually behave like what its name says. It's >>>>>>>> being used for LOCKed insn writeback already, and it could >>>>>>>> therefore simply force a retry of the full instruction if the compare >>>>>>>> part of it fails. It may need to be given another parameter, to >>>>>>>> allow the hook function to tell LOCKed from "normal" uses. >>>>>>> >>>>>>> I assume this is what you have in mind? >>>>>> >>>>>> Hmm, not exactly. 
I'd expect an actual CMPXCHG instruction to >>>>>> be used, with a LOCK prefix when the original access had one. >>>>>> There should certainly not be any tail call to hvmemul_write() >>>>>> anymore. >>>>> >>>>> There seems to be a misunderstanding here: if I understand this reply >>>>> correctly, you'd like hvmemul_cmpxchg() to become a stub that really >>>>> runs CMPXCHG - but if that comes with the smp_lock part as well, it >>>>> doesn't matter (except for the now-ignored p_old part, i.e. the comparison). >>>>> >>>>> I've initially asked if I should bring the old XenServer-queued LOCK >>>>> patch back, and I've understood that it could serve a purpose, at least >>>>> as a temporary workaround until your, and Andrew's emulator changes, >>>>> make it unrequired. >>>> >>>> Well, yes, as said earlier (still visible in context above) I had >>>> originally assumed "reworked" would mean making the cmpxchg >>>> hook actually match its name. >>>> >>>>> However, it appears that you've understood this to >>>>> mean the addition of the CMPXCHG stub (speaking of which, could you >>>>> please point out examples of implementing such stubs in the Xen code? I >>>>> have never done one of those before.) >>>> >>>> There's no talk about stubs here. All I had hoped would have been >>>> done _instead_ of the smp-lock approach was to use the CMPXCHG >>>> instruction inside the hook function, at least for RAM case (I think >>>> we could live with MMIO still being forwarded to the write handler). >>>> >>>> So all this isn't to say that the smp-lock approach might not be >>>> worthwhile to take on its own, provided the locked region gets >>>> shrunk as much as possible without losing correctness. The >>>> CMPXCHG model would just not have this locking granularity >>>> problem in the first place. >>> >>> Sadly, I've now written this (rough) patch: >>> >>> http://pastebin.com/3DJ5WYt0 >>> >>> only to find that it does not solve our issue. 
With multiple processors >>> per guest and heavy emulation at boot time, the VM got stuck at roughly >>> the same point in its life as before the patch. Looks like more than >>> CMPXCHG needs synchronizing. >>> >>> So it would appear that the smp_lock patch is the only way to bring this >>> under control. Either that, or my patch misses something. >>> Single-processor guests work just fine. >> >> Well, first of all the code to return RETRY is commented out. I >> may guess that you've tried with it not commented out, but I >> can't be sure. > > Indeed I have tried with it on, with the condition for success being at > first "val != old", and then, as it is in the pasted code, "val == new". > Both of these caused BSODs and random crashes. The guest only boots > without any apparent issues (and with 1 VCPU only) after I've stopped > returning RETRY. "val == new" is clearly wrong: Whether to retry depends on whether the cmpxchg was unsuccessful. >> And then "stuck" of course can mean many things, so a better >> description of the state the VM is in and/or the operations that >> it's stuck on would be needed. > > I agree, but on a first look it would appear that the hang is just like > the one without the patch, when multiple processors are involved. This > also is in line with my previous observation that when only cmpxchg() is > locked, the problem still occurs (you may remember that initially I've > tried to only synchronize the cmpxchg() call in x86_emulate()). > > I'll try to give the patch a few more spins and compare the guest stuck > state with the stuck state without the patch. Are you sure you don't run into a case of a RMW insn's memory access crossing a page boundary? Your code does not handle this case at all afaics. Other than that I can't see any obvious omission. Jan
On 03/23/2017 10:45 AM, Jan Beulich wrote: >>>> On 23.03.17 at 09:27, <rcojocaru@bitdefender.com> wrote: >> On 03/23/2017 10:19 AM, Jan Beulich wrote: >>>>>> On 22.03.17 at 18:22, <rcojocaru@bitdefender.com> wrote: >>>> On 03/21/2017 07:02 PM, Jan Beulich wrote: >>>>>>>> On 21.03.17 at 17:44, <rcojocaru@bitdefender.com> wrote: >>>>>> On 03/21/2017 06:29 PM, Jan Beulich wrote: >>>>>>>>>> On 21.03.17 at 16:38, <rcojocaru@bitdefender.com> wrote: >>>>>>>> On 03/15/2017 06:57 PM, Jan Beulich wrote: >>>>>>>>>>>> On 15.03.17 at 17:46, <rcojocaru@bitdefender.com> wrote: >>>>>>>>>> On 03/15/2017 06:30 PM, Jan Beulich wrote: >>>>>>>>>>>>>> On 15.03.17 at 17:04, <rcojocaru@bitdefender.com> wrote: >>>>>>>>>>>> --- >>>>>>>>>>>> Changes since V1: >>>>>>>>>>>> - Added Andrew Cooper's credit, as he's kept the patch current >>>>>>>>>>>> througout non-trivial code changes since the initial patch. >>>>>>>>>>>> - Significantly more patch testing (with XenServer). >>>>>>>>>>>> - Restricted lock scope. >>>>>>>>>>> >>>>>>>>>>> Not by much, as it seems. In particular you continue to take the >>>>>>>>>>> lock even for instructions not accessing memory at all. >>>>>>>>>> >>>>>>>>>> I'll take a closer look. >>>>>>>>>> >>>>>>>>>>> Also, by "reworked" I did assume you mean converted to at least the >>>>>>>>>>> cmpxchg based model. >>>>>>>>>> >>>>>>>>>> I haven't been able to follow the latest emulator changes closely, could >>>>>>>>>> you please clarify what you mean by "the cmpxchg model"? Thanks. >>>>>>>>> >>>>>>>>> This is unrelated to any recent changes. The idea is to make the >>>>>>>>> ->cmpxchg() hook actually behave like what its name says. It's >>>>>>>>> being used for LOCKed insn writeback already, and it could >>>>>>>>> therefore simply force a retry of the full instruction if the compare >>>>>>>>> part of it fails. It may need to be given another parameter, to >>>>>>>>> allow the hook function to tell LOCKed from "normal" uses. 
>>>>>>>> >>>>>>>> I assume this is what you have in mind? >>>>>>> >>>>>>> Hmm, not exactly. I'd expect an actual CMPXCHG instruction to >>>>>>> be used, with a LOCK prefix when the original access had one. >>>>>>> There should certainly not be any tail call to hvmemul_write() >>>>>>> anymore. >>>>>> >>>>>> There seems to be a misunderstanding here: if I understand this reply >>>>>> correctly, you'd like hvmemul_cmpxchg() to become a stub that really >>>>>> runs CMPXCHG - but if that comes with the smp_lock part as well, it >>>>>> doesn't matter (except for the now-ignored p_old part, i.e. the comparison). >>>>>> >>>>>> I've initially asked if I should bring the old XenServer-queued LOCK >>>>>> patch back, and I've understood that it could serve a purpose, at least >>>>>> as a temporary workaround until your, and Andrew's emulator changes, >>>>>> make it unrequired. >>>>> >>>>> Well, yes, as said earlier (still visible in context above) I had >>>>> originally assumed "reworked" would mean making the cmpxchg >>>>> hook actually match its name. >>>>> >>>>>> However, it appears that you've understood this to >>>>>> mean the addition of the CMPXCHG stub (speaking of which, could you >>>>>> please point out examples of implementing such stubs in the Xen code? I >>>>>> have never done one of those before.) >>>>> >>>>> There's no talk about stubs here. All I had hoped would have been >>>>> done _instead_ of the smp-lock approach was to use the CMPXCHG >>>>> instruction inside the hook function, at least for RAM case (I think >>>>> we could live with MMIO still being forwarded to the write handler). >>>>> >>>>> So all this isn't to say that the smp-lock approach might not be >>>>> worthwhile to take on its own, provided the locked region gets >>>>> shrunk as much as possible without losing correctness. The >>>>> CMPXCHG model would just not have this locking granularity >>>>> problem in the first place. 
>>>> >>>> Sadly, I've now written this (rough) patch: >>>> >>>> http://pastebin.com/3DJ5WYt0 >>>> >>>> only to find that it does not solve our issue. With multiple processors >>>> per guest and heavy emulation at boot time, the VM got stuck at roughly >>>> the same point in its life as before the patch. Looks like more than >>>> CMPXCHG needs synchronizing. >>>> >>>> So it would appear that the smp_lock patch is the only way to bring this >>>> under control. Either that, or my patch misses something. >>>> Single-processor guests work just fine. >>> >>> Well, first of all the code to return RETRY is commented out. I >>> may guess that you've tried with it not commented out, but I >>> can't be sure. >> >> Indeed I have tried with it on, with the condition for success being at >> first "val != old", and then, as it is in the pasted code, "val == new". >> Both of these caused BSODs and random crashes. The guest only boots >> without any apparent issues (and with 1 VCPU only) after I've stopped >> returning RETRY. > > "val == new" is clearly wrong: Whether to retry depends on whether > the cmpxchg was unsuccessful. You're right, it looks like using "val == old" as a test for success (which is of course the only correct test) and return RETRY on fail works (though I'm still seeing a BSOD very seldom, I'll need to test it carefully and see what's going on). Thanks for the help! Razvan
>>>>> Sadly, I've now written this (rough) patch: >>>>> >>>>> http://pastebin.com/3DJ5WYt0 >>>>> >>>>> only to find that it does not solve our issue. With multiple processors >>>>> per guest and heavy emulation at boot time, the VM got stuck at roughly >>>>> the same point in its life as before the patch. Looks like more than >>>>> CMPXCHG needs synchronizing. >>>>> >>>>> So it would appear that the smp_lock patch is the only way to bring this >>>>> under control. Either that, or my patch misses something. >>>>> Single-processor guests work just fine. >>>> >>>> Well, first of all the code to return RETRY is commented out. I >>>> may guess that you've tried with it not commented out, but I >>>> can't be sure. >>> >>> Indeed I have tried with it on, with the condition for success being at >>> first "val != old", and then, as it is in the pasted code, "val == new". >>> Both of these caused BSODs and random crashes. The guest only boots >>> without any apparent issues (and with 1 VCPU only) after I've stopped >>> returning RETRY. >> >> "val == new" is clearly wrong: Whether to retry depends on whether >> the cmpxchg was unsuccessful. > > You're right, it looks like using "val == old" as a test for success > (which is of course the only correct test) and return RETRY on fail > works (though I'm still seeing a BSOD very seldom, I'll need to test it > carefully and see what's going on). I've made a race more probable by resuming the VCPU only after processing all the events in the ring buffer and now all my Windows 7 32-bit guests BSOD with IRQL_NOT_LESS_OR_EQUAL every time at boot with the updated patch. Changing the guests' configuration to only use 1 VCPU again solves the issue, and also resuming the VCPU after treating each vm_event makes the BSOD much less likely to appear (hence the illusion that the problem is fixed). 
As for an instruction's crossing a page boundary, I'm not sure how that would affect things here - we're simply emulating whatever instructions cause page faults in interesting pages at boot, we're not yet preventing writes at this point. And if it were such an issue, would it have been fixed (and it is) by the smp_lock version of the patch? Thanks, Razvan
>>> On 23.03.17 at 11:21, <rcojocaru@bitdefender.com> wrote: >>>>>> Sadly, I've now written this (rough) patch: >>>>>> >>>>>> http://pastebin.com/3DJ5WYt0 >>>>>> >>>>>> only to find that it does not solve our issue. With multiple processors >>>>>> per guest and heavy emulation at boot time, the VM got stuck at roughly >>>>>> the same point in its life as before the patch. Looks like more than >>>>>> CMPXCHG needs synchronizing. >>>>>> >>>>>> So it would appear that the smp_lock patch is the only way to bring this >>>>>> under control. Either that, or my patch misses something. >>>>>> Single-processor guests work just fine. >>>>> >>>>> Well, first of all the code to return RETRY is commented out. I >>>>> may guess that you've tried with it not commented out, but I >>>>> can't be sure. >>>> >>>> Indeed I have tried with it on, with the condition for success being at >>>> first "val != old", and then, as it is in the pasted code, "val == new". >>>> Both of these caused BSODs and random crashes. The guest only boots >>>> without any apparent issues (and with 1 VCPU only) after I've stopped >>>> returning RETRY. >>> >>> "val == new" is clearly wrong: Whether to retry depends on whether >>> the cmpxchg was unsuccessful. >> >> You're right, it looks like using "val == old" as a test for success >> (which is of course the only correct test) and return RETRY on fail >> works (though I'm still seeing a BSOD very seldom, I'll need to test it >> carefully and see what's going on). > > I've made a race more probable by resuming the VCPU only after > processing all the events in the ring buffer and now all my Windows 7 > 32-bit guests BSOD with IRQL_NOT_LESS_OR_EQUAL every time at boot with > the updated patch. > > Changing the guests' configuration to only use 1 VCPU again solves the > issue, and also resuming the VCPU after treating each vm_event makes the > BSOD much less likely to appear (hence the illusion that the problem is > fixed). 
Well, I can only repeat: We need to understand what it is that goes wrong. > As for an instruction's crossing a page boundary, I'm not sure how that > would affect things here - we're simply emulating whatever instructions > cause page faults in interesting pages at boot, we're not yet preventing > writes at this point. And if it were such an issue, would it have been > fixed (and it is) by the smp_lock version of the patch? Yes, it likely would have been: The problem with your code is that you map just a single page even when the original access crosses a page boundary, yet then access the returned pointer as if you had mapped two adjacent pages. Jan
On 03/23/2017 03:23 PM, Jan Beulich wrote: >>>> On 23.03.17 at 11:21, <rcojocaru@bitdefender.com> wrote: >>>>>>> Sadly, I've now written this (rough) patch: >>>>>>> >>>>>>> http://pastebin.com/3DJ5WYt0 >>>>>>> >>>>>>> only to find that it does not solve our issue. With multiple processors >>>>>>> per guest and heavy emulation at boot time, the VM got stuck at roughly >>>>>>> the same point in its life as before the patch. Looks like more than >>>>>>> CMPXCHG needs synchronizing. >>>>>>> >>>>>>> So it would appear that the smp_lock patch is the only way to bring this >>>>>>> under control. Either that, or my patch misses something. >>>>>>> Single-processor guests work just fine. >>>>>> >>>>>> Well, first of all the code to return RETRY is commented out. I >>>>>> may guess that you've tried with it not commented out, but I >>>>>> can't be sure. >>>>> >>>>> Indeed I have tried with it on, with the condition for success being at >>>>> first "val != old", and then, as it is in the pasted code, "val == new". >>>>> Both of these caused BSODs and random crashes. The guest only boots >>>>> without any apparent issues (and with 1 VCPU only) after I've stopped >>>>> returning RETRY. >>>> >>>> "val == new" is clearly wrong: Whether to retry depends on whether >>>> the cmpxchg was unsuccessful. >>> >>> You're right, it looks like using "val == old" as a test for success >>> (which is of course the only correct test) and return RETRY on fail >>> works (though I'm still seeing a BSOD very seldom, I'll need to test it >>> carefully and see what's going on). >> >> I've made a race more probable by resuming the VCPU only after >> processing all the events in the ring buffer and now all my Windows 7 >> 32-bit guests BSOD with IRQL_NOT_LESS_OR_EQUAL every time at boot with >> the updated patch. 
>> >> Changing the guests' configuration to only use 1 VCPU again solves the >> issue, and also resuming the VCPU after treating each vm_event makes the >> BSOD much less likely to appear (hence the illusion that the problem is >> fixed). > > Well, I can only repeat: We need to understand what it is that > goes wrong. > >> As for an instruction's crossing a page boundary, I'm not sure how that >> would affect things here - we're simply emulating whatever instructions >> cause page faults in interesting pages at boot, we're not yet preventing >> writes at this point. And if it were such an issue, would it have been >> fixed (and it is) by the smp_lock version of the patch? > > Yes, it likely would have been: The problem with your code is that > you map just a single page even when the original access crosses > a page boundary, yet then access the returned pointer as if you > had mapped two adjacent pages. Oh, right, the patch code, of course. For some reason I was thinking about a rather complicated scenario related to emulating an instruction touching several protected pages. Yes, you're right, the patch does not correctly handle the case where we'd write beyond the mapped page. I'll check if that's what happens. Thanks, Razvan
On 03/23/2017 03:23 PM, Jan Beulich wrote: >>>> On 23.03.17 at 11:21, <rcojocaru@bitdefender.com> wrote: >>>>>>> Sadly, I've now written this (rough) patch: >>>>>>> >>>>>>> http://pastebin.com/3DJ5WYt0 >>>>>>> >>>>>>> only to find that it does not solve our issue. With multiple processors >>>>>>> per guest and heavy emulation at boot time, the VM got stuck at roughly >>>>>>> the same point in its life as before the patch. Looks like more than >>>>>>> CMPXCHG needs synchronizing. >>>>>>> >>>>>>> So it would appear that the smp_lock patch is the only way to bring this >>>>>>> under control. Either that, or my patch misses something. >>>>>>> Single-processor guests work just fine. >>>>>> >>>>>> Well, first of all the code to return RETRY is commented out. I >>>>>> may guess that you've tried with it not commented out, but I >>>>>> can't be sure. >>>>> >>>>> Indeed I have tried with it on, with the condition for success being at >>>>> first "val != old", and then, as it is in the pasted code, "val == new". >>>>> Both of these caused BSODs and random crashes. The guest only boots >>>>> without any apparent issues (and with 1 VCPU only) after I've stopped >>>>> returning RETRY. >>>> >>>> "val == new" is clearly wrong: Whether to retry depends on whether >>>> the cmpxchg was unsuccessful. >>> >>> You're right, it looks like using "val == old" as a test for success >>> (which is of course the only correct test) and return RETRY on fail >>> works (though I'm still seeing a BSOD very seldom, I'll need to test it >>> carefully and see what's going on). >> >> I've made a race more probable by resuming the VCPU only after >> processing all the events in the ring buffer and now all my Windows 7 >> 32-bit guests BSOD with IRQL_NOT_LESS_OR_EQUAL every time at boot with >> the updated patch. 
>> >> Changing the guests' configuration to only use 1 VCPU again solves the >> issue, and also resuming the VCPU after treating each vm_event makes the >> BSOD much less likely to appear (hence the illusion that the problem is >> fixed). > > Well, I can only repeat: We need to understand what it is that > goes wrong. > >> As for an instruction's crossing a page boundary, I'm not sure how that >> would affect things here - we're simply emulating whatever instructions >> cause page faults in interesting pages at boot, we're not yet preventing >> writes at this point. And if it were such an issue, would it have been >> fixed (and it is) by the smp_lock version of the patch? > > Yes, it likely would have been: The problem with your code is that > you map just a single page even when the original access crosses > a page boundary, yet then access the returned pointer as if you > had mapped two adjacent pages. I've printed out all the offsets in the new cmpxchg hook, and the largest one I've seen across many guest runs was 4064, with a size of 8, which would not cause the write to go beyond the page. So this is not what's causing the issue. Since it's working fine with 1 VCPU per guest, this I think also validates the new implementation of cmpxchg, since if it would have been wrong the guest would have run into trouble soon. 
This is borne out by my before-and-after printk()s: (XEN) offset: 156, size: 4 (XEN) offset: 1708, size: 4 (XEN) - [0] mem: 0, old: 0, new: 1, lock: 1 (XEN) - [1] mem: 3, old: 3, new: 4, lock: 1 (XEN) + [0] mem: 1, old: 0, new: 1, val: 0 (XEN) + [1] mem: 4, old: 3, new: 4, val: 3 (XEN) returning X86EMUL_OKAY (XEN) returning X86EMUL_OKAY (XEN) offset: 1708, size: 4 (XEN) - [1] mem: 4, old: 4, new: 3, lock: 1 (XEN) offset: 204, size: 4 (XEN) + [1] mem: 3, old: 4, new: 3, val: 4 (XEN) - [0] mem: 3539, old: 3539, new: 353a, lock: 1 (XEN) returning X86EMUL_OKAY (XEN) + [0] mem: 353a, old: 3539, new: 353a, val: 3539 (XEN) returning X86EMUL_OKAY (XEN) offset: 1708, size: 4 (XEN) - [1] mem: 3, old: 3, new: 4, lock: 1 (XEN) + [1] mem: 4, old: 3, new: 4, val: 3 (XEN) returning X86EMUL_OKAY With 2 VCPUs, the guest can't seem to make up its mind to either BSOD with IRQL_NOT_LESS_OR_EQUAL (when I return RETRY on old != val ) or BAD_POOL_HEADER (when I always return OKAY), or simply get stuck. These are all situations that happen on boot, fairly quickly (my Windows 7 32-bit guest doesn't go beyond the "Starting Windows" screen). 
The last time it got stuck I've collected this info with successive runs of xenctx: # ./xenctx -a 6 cs:eip: 0008:82a4209b flags: 00200046 cid z p ss:esp: 0010:8273f9bc eax: 00000000 ebx: 00000000 ecx: 82782480 edx: 000003fd esi: 80ba0020 edi: 00000000 ebp: 8273fa08 ds: 0023 es: 0023 fs: 0030 gs: 0000 cr0: 8001003b cr2: 8d486408 cr3: 00185000 cr4: 000406f9 dr0: 00000000 dr1: 00000000 dr2: 00000000 dr3: 00000000 dr6: fffe0ff0 dr7: 00000400 Code (instr addr 82a4209b) 14 a3 a2 82 cc cc cc cc cc cc cc cc cc cc 33 c0 8b 54 24 04 ec <c2> 04 00 8b ff 33 c0 8b 54 24 04 # ./xenctx -a 6 cs:eip: 0008:8c5065d6 flags: 00200246 cid i z p ss:esp: 0010:8273fb9c eax: 00000000 ebx: 8d662338 ecx: 850865f0 edx: 00000000 esi: 40008000 edi: 82742d20 ebp: 8273fc20 ds: 0023 es: 0023 fs: 0030 gs: 0000 cr0: 8001003b cr2: 8d486408 cr3: 00185000 cr4: 000406f9 dr0: 00000000 dr1: 00000000 dr2: 00000000 dr3: 00000000 dr6: fffe0ff0 dr7: 00000400 Code (instr addr 8c5065d6) 47 fc 83 c7 14 4e 75 ef 5f 5e c3 cc cc cc cc cc cc 8b ff fb f4 <c3> cc cc cc cc cc 8b ff 55 8b ec However, this is likely irrelevant since by now we're past the trigger event. I'm not sure where to go from here. Thanks, Razvan
>>> On 23.03.17 at 16:54, <rcojocaru@bitdefender.com> wrote: > I'm not sure where to go from here. Well, without finding where things start to go wrong, I don't think we can make any progress here. Finding that may admittedly be a rather tedious process. Jan
On 03/23/2017 06:20 PM, Jan Beulich wrote: >>>> On 23.03.17 at 16:54, <rcojocaru@bitdefender.com> wrote: >> I'm not sure where to go from here. > > Well, without finding where things start to go wrong, I don't think > we can make any progress here. Finding that may admittedly be > a rather tedious process. I may have stumbled across a clue: hvm_emulate_one_vm_event() currently does nothing when hvm_emulate_one() returns RETRY. This was fine before, but if the cmpxchg hook can propagate RETRY upwards, a failed emulation attempt causes the guest to retry the instruction, generate another page-fault, and so on. I've now wrapped the body of hvm_emulate_one_vm_event() in a do { /* ... */ } while ( rc == X86EMUL_RETRY ); and my guest seems to be booting. At this point I'm not sure if this is the best solution (should I just retry indefinitely until it succeeds?) or if this is just luck, but it at least feels like a step in the right direction. Thanks, Razvan
diff --git a/tools/tests/x86_emulator/test_x86_emulator.c b/tools/tests/x86_emulator/test_x86_emulator.c index 04332bb..86b79a1 100644 --- a/tools/tests/x86_emulator/test_x86_emulator.c +++ b/tools/tests/x86_emulator/test_x86_emulator.c @@ -283,6 +283,14 @@ static int read_msr( return X86EMUL_UNHANDLEABLE; } +static void smp_lock(bool locked) +{ +} + +static void smp_unlock(bool locked) +{ +} + static struct x86_emulate_ops emulops = { .read = read, .insn_fetch = fetch, @@ -293,6 +301,8 @@ static struct x86_emulate_ops emulops = { .read_cr = emul_test_read_cr, .read_msr = read_msr, .get_fpu = emul_test_get_fpu, + .smp_lock = smp_lock, + .smp_unlock = smp_unlock, }; int main(int argc, char **argv) diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c index 479aee6..55010f4 100644 --- a/xen/arch/x86/domain.c +++ b/xen/arch/x86/domain.c @@ -529,6 +529,8 @@ int arch_domain_create(struct domain *d, unsigned int domcr_flags, if ( config == NULL && !is_idle_domain(d) ) return -EINVAL; + percpu_rwlock_resource_init(&d->arch.emulate_lock, emulate_locked_rwlock); + d->arch.s3_integrity = !!(domcr_flags & DOMCRF_s3_integrity); INIT_LIST_HEAD(&d->arch.pdev_list); diff --git a/xen/arch/x86/hvm/emulate.c b/xen/arch/x86/hvm/emulate.c index f36d7c9..d5bfbf1 100644 --- a/xen/arch/x86/hvm/emulate.c +++ b/xen/arch/x86/hvm/emulate.c @@ -24,6 +24,8 @@ #include <asm/hvm/svm/svm.h> #include <asm/vm_event.h> +DEFINE_PERCPU_RWLOCK_GLOBAL(emulate_locked_rwlock); + static void hvmtrace_io_assist(const ioreq_t *p) { unsigned int size, event; @@ -1682,6 +1684,26 @@ static int hvmemul_vmfunc( return rc; } +void emulate_smp_lock(bool locked) +{ + struct domain *d = current->domain; + + if ( locked ) + percpu_write_lock(emulate_locked_rwlock, &d->arch.emulate_lock); + else + percpu_read_lock(emulate_locked_rwlock, &d->arch.emulate_lock); +} + +void emulate_smp_unlock(bool locked) +{ + struct domain *d = current->domain; + + if ( locked ) + percpu_write_unlock(emulate_locked_rwlock, 
&d->arch.emulate_lock); + else + percpu_read_unlock(emulate_locked_rwlock, &d->arch.emulate_lock); +} + static const struct x86_emulate_ops hvm_emulate_ops = { .read = hvmemul_read, .insn_fetch = hvmemul_insn_fetch, @@ -1706,6 +1728,8 @@ static const struct x86_emulate_ops hvm_emulate_ops = { .put_fpu = hvmemul_put_fpu, .invlpg = hvmemul_invlpg, .vmfunc = hvmemul_vmfunc, + .smp_lock = emulate_smp_lock, + .smp_unlock = emulate_smp_unlock, }; static const struct x86_emulate_ops hvm_emulate_ops_no_write = { @@ -1731,6 +1755,8 @@ static const struct x86_emulate_ops hvm_emulate_ops_no_write = { .put_fpu = hvmemul_put_fpu, .invlpg = hvmemul_invlpg, .vmfunc = hvmemul_vmfunc, + .smp_lock = emulate_smp_lock, + .smp_unlock = emulate_smp_unlock, }; static int _hvm_emulate_one(struct hvm_emulate_ctxt *hvmemul_ctxt, diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c index 7bc951d..2fb3325 100644 --- a/xen/arch/x86/mm.c +++ b/xen/arch/x86/mm.c @@ -5369,6 +5369,8 @@ static const struct x86_emulate_ops ptwr_emulate_ops = { .cmpxchg = ptwr_emulated_cmpxchg, .validate = pv_emul_is_mem_write, .cpuid = pv_emul_cpuid, + .smp_lock = emulate_smp_lock, + .smp_unlock = emulate_smp_unlock, }; /* Write page fault handler: check if guest is trying to modify a PTE. */ @@ -5485,6 +5487,8 @@ static const struct x86_emulate_ops mmio_ro_emulate_ops = { .write = mmio_ro_emulated_write, .validate = pv_emul_is_mem_write, .cpuid = pv_emul_cpuid, + .smp_lock = emulate_smp_lock, + .smp_unlock = emulate_smp_unlock, }; int mmcfg_intercept_write( @@ -5524,6 +5528,8 @@ static const struct x86_emulate_ops mmcfg_intercept_ops = { .write = mmcfg_intercept_write, .validate = pv_emul_is_mem_write, .cpuid = pv_emul_cpuid, + .smp_lock = emulate_smp_lock, + .smp_unlock = emulate_smp_unlock, }; /* Check if guest is trying to modify a r/o MMIO page. 
*/ diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c index d078d78..3a2f02e 100644 --- a/xen/arch/x86/mm/shadow/common.c +++ b/xen/arch/x86/mm/shadow/common.c @@ -310,6 +310,8 @@ static const struct x86_emulate_ops hvm_shadow_emulator_ops = { .write = hvm_emulate_write, .cmpxchg = hvm_emulate_cmpxchg, .cpuid = hvmemul_cpuid, + .smp_lock = emulate_smp_lock, + .smp_unlock = emulate_smp_unlock, }; const struct x86_emulate_ops *shadow_init_emulation( diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c index 08b0070..8bdc8c8 100644 --- a/xen/arch/x86/traps.c +++ b/xen/arch/x86/traps.c @@ -2957,6 +2957,8 @@ static const struct x86_emulate_ops priv_op_ops = { .write_msr = priv_op_write_msr, .cpuid = pv_emul_cpuid, .wbinvd = priv_op_wbinvd, + .smp_lock = emulate_smp_lock, + .smp_unlock = emulate_smp_unlock, }; static int emulate_privileged_op(struct cpu_user_regs *regs) diff --git a/xen/arch/x86/x86_emulate/x86_emulate.c b/xen/arch/x86/x86_emulate/x86_emulate.c index 4872f19..bec4af7 100644 --- a/xen/arch/x86/x86_emulate/x86_emulate.c +++ b/xen/arch/x86/x86_emulate/x86_emulate.c @@ -3037,7 +3037,7 @@ x86_emulate( struct x86_emulate_stub stub = {}; DECLARE_ALIGNED(mmval_t, mmval); - ASSERT(ops->read); + ASSERT(ops->read && ops->smp_lock && ops->smp_unlock); rc = x86_decode(&state, ctxt, ops); if ( rc != X86EMUL_OKAY ) @@ -3065,6 +3065,8 @@ x86_emulate( d = state.desc; #define state (&state) + ops->smp_lock(lock_prefix); + generate_exception_if(state->not_64bit && mode_64bit(), EXC_UD); if ( ea.type == OP_REG ) @@ -3593,6 +3595,12 @@ x86_emulate( break; case 0x86 ... 0x87: xchg: /* xchg */ + if ( !lock_prefix ) + { + ops->smp_unlock(lock_prefix); + lock_prefix = 1; + ops->smp_lock(lock_prefix); + } /* Write back the register source. */ switch ( dst.bytes ) { @@ -3603,7 +3611,6 @@ x86_emulate( } /* Write back the memory destination with implicit LOCK prefix. 
*/ dst.val = src.val; - lock_prefix = 1; break; case 0xc6: /* Grp11: mov / xabort */ @@ -7925,8 +7932,11 @@ x86_emulate( ctxt->regs->eflags &= ~X86_EFLAGS_RF; done: + ops->smp_unlock(lock_prefix); + _put_fpu(); put_stub(stub); + return rc; #undef state } diff --git a/xen/arch/x86/x86_emulate/x86_emulate.h b/xen/arch/x86/x86_emulate/x86_emulate.h index 6e98453..3f8bb38 100644 --- a/xen/arch/x86/x86_emulate/x86_emulate.h +++ b/xen/arch/x86/x86_emulate/x86_emulate.h @@ -448,6 +448,14 @@ struct x86_emulate_ops /* vmfunc: Emulate VMFUNC via given set of EAX ECX inputs */ int (*vmfunc)( struct x86_emulate_ctxt *ctxt); + + /* smp_lock: Take a write lock if locked, read lock otherwise. */ + void (*smp_lock)( + bool locked); + + /* smp_unlock: Write unlock if locked, read unlock otherwise. */ + void (*smp_unlock)( + bool locked); }; struct cpu_user_regs; diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h index ff5267f..2afccab 100644 --- a/xen/include/asm-x86/domain.h +++ b/xen/include/asm-x86/domain.h @@ -271,6 +271,8 @@ struct monitor_write_data { uint64_t cr4; }; +DECLARE_PERCPU_RWLOCK_GLOBAL(emulate_locked_rwlock); + struct arch_domain { struct page_info *perdomain_l3_pg; @@ -413,6 +415,8 @@ struct arch_domain /* Emulated devices enabled bitmap. */ uint32_t emulation_flags; + + percpu_rwlock_t emulate_lock; } __cacheline_aligned; #define has_vlapic(d) (!!((d)->arch.emulation_flags & XEN_X86_EMU_LAPIC)) diff --git a/xen/include/asm-x86/hvm/emulate.h b/xen/include/asm-x86/hvm/emulate.h index 88d6b70..b29a47e 100644 --- a/xen/include/asm-x86/hvm/emulate.h +++ b/xen/include/asm-x86/hvm/emulate.h @@ -93,6 +93,9 @@ int hvmemul_do_pio_buffer(uint16_t port, void hvm_dump_emulation_state(const char *prefix, struct hvm_emulate_ctxt *hvmemul_ctxt); +void emulate_smp_lock(bool locked); +void emulate_smp_unlock(bool locked); + #endif /* __ASM_X86_HVM_EMULATE_H__ */ /*