Message ID | b567e068-dcab-b294-9706-ffbecb36de3c@suse.com
---|---
State | Superseded
Series | x86emul: misc additions
On 04.04.2023 16:55, Jan Beulich wrote:
> These are "compound" instructions to issue a series of RDMSR / WRMSR
> respectively. In the emulator we can therefore implement them by using
> the existing msr_{read,write}() hooks. The memory accesses utilize that
> the HVM ->read() / ->write() hooks are already linear-address
> (x86_seg_none) aware (by way of hvmemul_virtual_to_linear() handling
> this case).
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> TODO: Use VMX tertiary execution control (once bit is known; see
>       //todo-s) and then further adjust cpufeatureset.h.

Argh, should have Cc-ed Kevin and Jun, even if there weren't this
issue.

Jan

> RFC: In vmx_vmexit_handler() handling is forwarded to the emulator
>      blindly. Alternatively we could consult the exit qualification and
>      process just a single MSR at a time (without involving the
>      emulator), exiting back to the guest after every iteration. (I
>      don't think a mix of both models makes a lot of sense.)
>
> RFC: For PV priv_op_ops would need to gain proper read/write hooks,
>      which doesn't look desirable (albeit there we could refuse to
>      handle anything else than x86_seg_none); we may want to consider to
>      instead not support the feature for PV guests, requiring e.g. Linux
>      to process the lists in new pvops hooks.
>
> RFC: I wasn't sure whether to add preemption checks to the loops -
>      thoughts?
>
> With the VMX side of the spec still unclear (tertiary execution control
> bit unspecified in ISE 046) we can't enable the insn yet for (HVM) guest
> use. The precise behavior of MSR_BARRIER is also not spelled out, so the
> (minimal) implementation is a guess for now.
>
> [...]
On 04/04/2023 3:55 pm, Jan Beulich wrote:
> These are "compound" instructions to issue a series of RDMSR / WRMSR
> respectively. In the emulator we can therefore implement them by using
> the existing msr_{read,write}() hooks. The memory accesses utilize that
> the HVM ->read() / ->write() hooks are already linear-address
> (x86_seg_none) aware (by way of hvmemul_virtual_to_linear() handling
> this case).
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> TODO: Use VMX tertiary execution control (once bit is known; see
>       //todo-s) and then further adjust cpufeatureset.h.
>
> RFC: In vmx_vmexit_handler() handling is forwarded to the emulator
>      blindly. Alternatively we could consult the exit qualification and
>      process just a single MSR at a time (without involving the
>      emulator), exiting back to the guest after every iteration. (I
>      don't think a mix of both models makes a lot of sense.)

{RD,WR}MSRLIST are supposed to be used for context switch paths. They
really shouldn't be exiting in the common case.

What matters here is the conditional probability of a second MSR being
intercepted too, given that one already has been. And I don't have a
clue how to answer this.

I would not expect Introspection to be intercepting a fastpath MSR.
(And if it does, frankly it can live with the consequences.)

For future scenarios such as reloading PMU/LBR/whatever, these will be
all-or-nothing and we'd expect to have them eagerly in context anyway.

If I were going to guess, I'd say that probably MSR_XSS or MSR_SPEC_CTRL
will be giving us the most grief here, because they're both ones that
are liable to be touched on a context switch path, and have split
host/guest bits.

> RFC: For PV priv_op_ops would need to gain proper read/write hooks,
>      which doesn't look desirable (albeit there we could refuse to
>      handle anything else than x86_seg_none); we may want to consider to
>      instead not support the feature for PV guests, requiring e.g. Linux
>      to process the lists in new pvops hooks.

Ah - funny you should ask. See patch 2. These are even better reasons
not to support on PV guests.

> RFC: I wasn't sure whether to add preemption checks to the loops -
>      thoughts?

I'd be tempted to. Mostly because a guest can exert 64* longest MSR
worth of pressure here, along with associated emulation overhead.

64* "write hypercall page" sounds expensive. So too does 64* MSR_PAT,
given all the EPT actions behind the scenes.

It's probably one of those.

> With the VMX side of the spec still unclear (tertiary execution control
> bit unspecified in ISE 046) we can't enable the insn yet for (HVM) guest
> use. The precise behavior of MSR_BARRIER is also not spelled out, so the
> (minimal) implementation is a guess for now.

MSRs are expensive for several reasons. The %ecx decode alone isn't
cheap, nor is the reserved bit checking, and that's before starting the
action itself.

What the list form gets you is the ability to pipeline the checks on the
following MSR while performing the action of the current one. I suspect
there are plans to try and parallelise the actions too if possible.

As I understand it, BARRIER just halts the pipelining of processing the
next MSR.

~Andrew
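The operand convention, as exercised by the test code in this series:
%rsi points at a table of 64-bit MSR indices, %rdi at a parallel table
of data values, and %rcx carries a bitmap selecting which of the 64
slots to process, with the CPU clearing each bit as the corresponding
MSR completes. As an illustration of the context-switch usage described
above, a minimal guest-side sketch (not from the patch; the instruction
is encoded by bytes since assembler support can't be assumed, and this
requires CPL0 on a CPU advertising CPUID.7.1:EAX[27]):

#include <stdint.h>

/* Parallel tables: slot n holds the n-th MSR index and its value. */
static uint64_t msr_index[64];
static uint64_t msr_data[64];

/*
 * WRMSRLIST is F3 0F 01 C6 with implicit operands: %rsi = index table,
 * %rdi = data table, %rcx = bitmap of valid slots.  The CPU clears
 * bits as slots retire, so %rcx is both input and output.
 */
static inline uint64_t wrmsrlist(uint64_t bitmap)
{
    asm volatile ( ".byte 0xf3, 0x0f, 0x01, 0xc6"
                   : "+c" (bitmap)
                   : "S" (msr_index), "D" (msr_data)
                   : "memory" );
    return bitmap; /* non-zero only if the list was cut short */
}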
On 07.04.2023 00:03, Andrew Cooper wrote:
> On 04/04/2023 3:55 pm, Jan Beulich wrote:
>> ---
>> TODO: Use VMX tertiary execution control (once bit is known; see
>>       //todo-s) and then further adjust cpufeatureset.h.
>>
>> RFC: In vmx_vmexit_handler() handling is forwarded to the emulator
>>      blindly. Alternatively we could consult the exit qualification and
>>      process just a single MSR at a time (without involving the
>>      emulator), exiting back to the guest after every iteration. (I
>>      don't think a mix of both models makes a lot of sense.)
>
> {RD,WR}MSRLIST are supposed to be used for context switch paths. They
> really shouldn't be exiting in the common case.
>
> What matters here is the conditional probability of a second MSR being
> intercepted too, given that one already has been. And I don't have a
> clue how to answer this.
>
> I would not expect Introspection to be intercepting a fastpath MSR.
> (And if it does, frankly it can live with the consequences.)
>
> For future scenarios such as reloading PMU/LBR/whatever, these will be
> all-or-nothing and we'd expect to have them eagerly in context anyway.
>
> If I were going to guess, I'd say that probably MSR_XSS or MSR_SPEC_CTRL
> will be giving us the most grief here, because they're both ones that
> are liable to be touched on a context switch path, and have split
> host/guest bits.

I'm not really certain, but I'm tending to interpret your reply as
agreement with the choice made (and hence not as a request to change
anything in this regard); clarification appreciated.

>> RFC: For PV priv_op_ops would need to gain proper read/write hooks,
>>      which doesn't look desirable (albeit there we could refuse to
>>      handle anything else than x86_seg_none); we may want to consider to
>>      instead not support the feature for PV guests, requiring e.g. Linux
>>      to process the lists in new pvops hooks.
>
> Ah - funny you should ask. See patch 2. These are even better reasons
> not to support on PV guests.

Yeah, with PV dropped there it's quite obvious to not consider it here
either. I'll drop the remark.

>> RFC: I wasn't sure whether to add preemption checks to the loops -
>>      thoughts?
>
> I'd be tempted to. Mostly because a guest can exert 64* longest MSR
> worth of pressure here, along with associated emulation overhead.
>
> 64* "write hypercall page" sounds expensive. So too does 64* MSR_PAT,
> given all the EPT actions behind the scenes.
>
> It's probably one of those.

Which leaves me inclined to add preemption checking to writes, but keep
reads as they are. Thoughts?

Jan
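Since the emulation loops clear %rcx bits as individual MSRs complete,
the instruction is restartable, and a preemption check could stop the
loop early and let the guest re-execute with the shrunk bitmap. A
sketch of how the wrmsrlist loop might look with such a check, assuming
local_events_need_delivery() as the predicate and X86EMUL_RETRY as the
bail-out (whether that mechanism is appropriate from this helper is
exactly the open RFC question):

        case vex_f3: /* wrmsrlist */
            /* prologue (feature/mode/alignment checks) as in the patch */
            while ( regs->r(cx) )
            {
                n = __builtin_ffsl(regs->r(cx)) - 1;
                if ( (rc = ops->read(x86_seg_none, regs->r(si) + n * 8,
                                     &msr_val, 8, ctxt)) != X86EMUL_OKAY )
                    break;
                generate_exception_if(msr_val != (uint32_t)msr_val,
                                      X86_EXC_GP, 0);
                base = msr_val;
                if ( (rc = ops->read(x86_seg_none, regs->r(di) + n * 8,
                                     &msr_val, 8, ctxt)) != X86EMUL_OKAY ||
                     (rc = ops->write_msr(base, msr_val, ctxt)) != X86EMUL_OKAY )
                    break;
                regs->r(cx) &= ~(1UL << n);

                /* Assumed addition: expose progress in the architectural
                 * %rcx (mirroring the rdmsrlist failure path) and bail,
                 * re-entering the insn once pending events are handled. */
                if ( regs->r(cx) && local_events_need_delivery() )
                {
                    ctxt->regs->r(cx) = regs->r(cx);
                    rc = X86EMUL_RETRY;
                    break;
                }
            }
            goto done;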
--- a/tools/libs/light/libxl_cpuid.c
+++ b/tools/libs/light/libxl_cpuid.c
@@ -240,6 +240,7 @@ int libxl_cpuid_parse_config(libxl_cpuid
         {"lkgs",         0x00000007,  1, CPUID_REG_EAX, 18,  1},
         {"wrmsrns",      0x00000007,  1, CPUID_REG_EAX, 19,  1},
         {"avx-ifma",     0x00000007,  1, CPUID_REG_EAX, 23,  1},
+        {"msrlist",      0x00000007,  1, CPUID_REG_EAX, 27,  1},
 
         {"avx-vnni-int8",0x00000007,  1, CPUID_REG_EDX,  4,  1},
         {"avx-ne-convert",0x00000007, 1, CPUID_REG_EDX,  5,  1},
--- a/tools/misc/xen-cpuid.c
+++ b/tools/misc/xen-cpuid.c
@@ -195,6 +195,8 @@ static const char *const str_7a1[32] =
     [18] = "lkgs",             [19] = "wrmsrns",
 
     /* 22 */                   [23] = "avx-ifma",
+
+    /* 26 */                   [27] = "msrlist",
 };
 
 static const char *const str_e21a[32] =
--- a/tools/tests/x86_emulator/predicates.c
+++ b/tools/tests/x86_emulator/predicates.c
@@ -342,6 +342,8 @@ static const struct {
     { { 0x01, 0xc4 }, { 2, 2 }, F, N },         /* vmxoff */
     { { 0x01, 0xc5 }, { 2, 2 }, F, N },         /* pconfig */
     { { 0x01, 0xc6 }, { 2, 2 }, F, N },         /* wrmsrns */
+    { { 0x01, 0xc6 }, { 0, 2 }, F, W, pfx_f2 }, /* rdmsrlist */
+    { { 0x01, 0xc6 }, { 0, 2 }, F, R, pfx_f3 }, /* wrmsrlist */
     { { 0x01, 0xc8 }, { 2, 2 }, F, N },         /* monitor */
     { { 0x01, 0xc9 }, { 2, 2 }, F, N },         /* mwait */
     { { 0x01, 0xca }, { 2, 2 }, F, N },         /* clac */
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -589,6 +589,7 @@ static int read(
     default:
         if ( !is_x86_user_segment(seg) )
             return X86EMUL_UNHANDLEABLE;
+    case x86_seg_none:
         bytes_read += bytes;
         break;
     }
@@ -619,7 +620,7 @@ static int write(
     if ( verbose )
         printf("** %s(%u, %p,, %u,)\n", __func__, seg, (void *)offset, bytes);
 
-    if ( !is_x86_user_segment(seg) )
+    if ( !is_x86_user_segment(seg) && seg != x86_seg_none )
         return X86EMUL_UNHANDLEABLE;
     memcpy((void *)offset, p_data, bytes);
     return X86EMUL_OKAY;
@@ -711,6 +712,10 @@ static int read_msr(
 {
     switch ( reg )
     {
+    case 0x0000002f: /* BARRIER */
+        *val = 0;
+        return X86EMUL_OKAY;
+
     case 0xc0000080: /* EFER */
         *val = ctxt->addr_size > 32 ? 0x500 /* LME|LMA */ : 0;
         return X86EMUL_OKAY;
@@ -1499,9 +1504,53 @@ int main(int argc, char **argv)
              (gs_base != 0x0000111122224444UL) ||
              gs_base_shadow )
             goto fail;
+        printf("okay\n");
 
         cp.extd.nscb = i;
         emulops.write_segment = NULL;
+
+        printf("%-40s", "Testing rdmsrlist...");
+        instr[0] = 0xf2; instr[1] = 0x0f; instr[2] = 0x01; instr[3] = 0xc6;
+        regs.rip = (unsigned long)&instr[0];
+        regs.rsi = (unsigned long)(res + 0x80);
+        regs.rdi = (unsigned long)(res + 0x80 + 0x40 * 2);
+        regs.rcx = 0x0002000100008000UL;
+        gs_base_shadow = 0x0000222244446666UL;
+        memset(res + 0x80, ~0, 0x40 * 8 * 2);
+        res[0x80 + 0x0f * 2] = 0xc0000101; /* GS_BASE */
+        res[0x80 + 0x0f * 2 + 1] = 0;
+        res[0x80 + 0x20 * 2] = 0xc0000102; /* SHADOW_GS_BASE */
+        res[0x80 + 0x20 * 2 + 1] = 0;
+        res[0x80 + 0x31 * 2] = 0x2f; /* BARRIER */
+        res[0x80 + 0x31 * 2 + 1] = 0;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( (rc != X86EMUL_OKAY) ||
+             (regs.rip != (unsigned long)&instr[4]) ||
+             regs.rcx ||
+             (res[0x80 + (0x40 + 0x0f) * 2] != (unsigned int)gs_base) ||
+             (res[0x80 + (0x40 + 0x0f) * 2 + 1] != (gs_base >> (8 * sizeof(int)))) ||
+             (res[0x80 + (0x40 + 0x20) * 2] != (unsigned int)gs_base_shadow) ||
+             (res[0x80 + (0x40 + 0x20) * 2 + 1] != (gs_base_shadow >> (8 * sizeof(int)))) ||
+             res[0x80 + (0x40 + 0x31) * 2] || res[0x80 + (0x40 + 0x31) * 2 + 1] )
+            goto fail;
+        printf("okay\n");
+
+        printf("%-40s", "Testing wrmsrlist...");
+        instr[0] = 0xf3; instr[1] = 0x0f; instr[2] = 0x01; instr[3] = 0xc6;
+        regs.eip = (unsigned long)&instr[0];
+        regs.rsi -= 0x11 * 8;
+        regs.rdi -= 0x11 * 8;
+        regs.rcx = 0x0002000100000000UL;
+        res[0x80 + 0x0f * 2] = 0xc0000102; /* SHADOW_GS_BASE */
+        res[0x80 + 0x20 * 2] = 0xc0000101; /* GS_BASE */
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( (rc != X86EMUL_OKAY) ||
+             (regs.rip != (unsigned long)&instr[4]) ||
+             regs.rcx ||
+             (gs_base != 0x0000222244446666UL) ||
+             (gs_base_shadow != 0x0000111122224444UL) )
+            goto fail;
+
         emulops.write_msr = NULL;
 #endif
         printf("okay\n");
--- a/tools/tests/x86_emulator/x86-emulate.c
+++ b/tools/tests/x86_emulator/x86-emulate.c
@@ -88,6 +88,7 @@ bool emul_test_init(void)
     cp.feat.rdpid = true;
     cp.feat.lkgs = true;
     cp.feat.wrmsrns = true;
+    cp.feat.msrlist = true;
     cp.extd.clzero = true;
 
     if ( cpu_has_xsave )
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -835,6 +835,17 @@ static void cf_check vmx_cpuid_policy_ch
     else
         vmx_set_msr_intercept(v, MSR_PKRS, VMX_MSR_RW);
 
+    if ( cp->feat.msrlist )
+    {
+        vmx_clear_msr_intercept(v, MSR_BARRIER, VMX_MSR_RW);
+        //todo enable MSRLIST tertiary execution control
+    }
+    else
+    {
+        vmx_set_msr_intercept(v, MSR_BARRIER, VMX_MSR_RW);
+        //todo disable MSRLIST tertiary execution control
+    }
+
  out:
     vmx_vmcs_exit(v);
 
@@ -3705,6 +3716,22 @@ gp_fault:
     return X86EMUL_EXCEPTION;
 }
 
+static bool cf_check is_msrlist(
+    const struct x86_emulate_state *state, const struct x86_emulate_ctxt *ctxt)
+{
+
+    if ( ctxt->opcode == X86EMUL_OPC(0x0f, 0x01) )
+    {
+        unsigned int rm, reg;
+        int mode = x86_insn_modrm(state, &rm, &reg);
+
+        /* This also includes WRMSRNS; should be okay. */
+        return mode == 3 && rm == 6 && !reg;
+    }
+
+    return false;
+}
+
 static void vmx_do_extint(struct cpu_user_regs *regs)
 {
     unsigned long vector;
@@ -4513,6 +4540,17 @@ void vmx_vmexit_handler(struct cpu_user_
         }
         break;
 
+    case EXIT_REASON_RDMSRLIST:
+    case EXIT_REASON_WRMSRLIST:
+        if ( vmx_guest_x86_mode(v) != 8 || !currd->arch.cpuid->feat.msrlist )
+        {
+            ASSERT_UNREACHABLE();
+            hvm_inject_hw_exception(TRAP_invalid_op, X86_EVENT_NO_EC);
+        }
+        else if ( !hvm_emulate_one_insn(is_msrlist, "MSR list") )
+            hvm_inject_hw_exception(TRAP_gp_fault, 0);
+        break;
+
     case EXIT_REASON_VMXOFF:
     case EXIT_REASON_VMXON:
     case EXIT_REASON_VMCLEAR:
--- a/xen/arch/x86/include/asm/hvm/vmx/vmx.h
+++ b/xen/arch/x86/include/asm/hvm/vmx/vmx.h
@@ -211,6 +211,8 @@ static inline void pi_clear_sn(struct pi
 #define EXIT_REASON_XRSTORS             64
 #define EXIT_REASON_BUS_LOCK            74
 #define EXIT_REASON_NOTIFY              75
+#define EXIT_REASON_RDMSRLIST           78
+#define EXIT_REASON_WRMSRLIST           79
 /* Remember to also update VMX_PERF_EXIT_REASON_SIZE! */
 
 /*
--- a/xen/arch/x86/include/asm/msr-index.h
+++ b/xen/arch/x86/include/asm/msr-index.h
@@ -24,6 +24,8 @@
 #define APIC_BASE_ENABLE                (_AC(1, ULL) << 11)
 #define APIC_BASE_ADDR_MASK             0x000ffffffffff000ULL
 
+#define MSR_BARRIER                     0x0000002f
+
 #define MSR_TEST_CTRL                   0x00000033
 #define TEST_CTRL_SPLITLOCK_DETECT      (_AC(1, ULL) << 29)
 #define TEST_CTRL_SPLITLOCK_DISABLE     (_AC(1, ULL) << 31)
--- a/xen/arch/x86/include/asm/perfc_defn.h
+++ b/xen/arch/x86/include/asm/perfc_defn.h
@@ -6,7 +6,7 @@ PERFCOUNTER_ARRAY(exceptions,
 
 #ifdef CONFIG_HVM
 
-#define VMX_PERF_EXIT_REASON_SIZE 76
+#define VMX_PERF_EXIT_REASON_SIZE 80
 #define VMEXIT_NPF_PERFC          143
 #define SVM_PERF_EXIT_REASON_SIZE (VMEXIT_NPF_PERFC + 1)
 PERFCOUNTER_ARRAY(vmexits,              "vmexits",
--- a/xen/arch/x86/msr.c
+++ b/xen/arch/x86/msr.c
@@ -223,6 +223,12 @@ int guest_rdmsr(struct vcpu *v, uint32_t
     case MSR_AMD_PPIN:
         goto gp_fault;
 
+    case MSR_BARRIER:
+        if ( !cp->feat.msrlist )
+            goto gp_fault;
+        *val = 0;
+        break;
+
     case MSR_IA32_FEATURE_CONTROL:
         /*
          * Architecturally, availability of this MSR is enumerated by the
@@ -493,6 +499,7 @@ int guest_wrmsr(struct vcpu *v, uint32_t
         uint64_t rsvd;
 
         /* Read-only */
+    case MSR_BARRIER:
     case MSR_IA32_PLATFORM_ID:
     case MSR_CORE_CAPABILITIES:
     case MSR_INTEL_CORE_THREAD_COUNT:
--- a/xen/arch/x86/x86_emulate/0f01.c
+++ b/xen/arch/x86/x86_emulate/0f01.c
@@ -40,6 +40,7 @@ int x86emul_0f01(struct x86_emulate_stat
     switch ( s->modrm )
     {
         unsigned long base, limit, cr0, cr0w, cr4;
+        unsigned int n;
        struct segment_register sreg;
         uint64_t msr_val;
 
@@ -54,6 +55,56 @@ int x86emul_0f01(struct x86_emulate_stat
                                  ((uint64_t)regs->r(dx) << 32) | regs->eax,
                                  ctxt);
             goto done;
+
+        case vex_f3: /* wrmsrlist */
+            vcpu_must_have(msrlist);
+            generate_exception_if(!mode_64bit(), X86_EXC_UD);
+            generate_exception_if(!mode_ring0() || (regs->r(si) & 7) ||
+                                  (regs->r(di) & 7),
+                                  X86_EXC_GP, 0);
+            fail_if(!ops->write_msr);
+            while ( regs->r(cx) )
+            {
+                n = __builtin_ffsl(regs->r(cx)) - 1;
+                if ( (rc = ops->read(x86_seg_none, regs->r(si) + n * 8,
+                                     &msr_val, 8, ctxt)) != X86EMUL_OKAY )
+                    break;
+                generate_exception_if(msr_val != (uint32_t)msr_val,
+                                      X86_EXC_GP, 0);
+                base = msr_val;
+                if ( (rc = ops->read(x86_seg_none, regs->r(di) + n * 8,
+                                     &msr_val, 8, ctxt)) != X86EMUL_OKAY ||
+                     (rc = ops->write_msr(base, msr_val, ctxt)) != X86EMUL_OKAY )
+                    break;
+                regs->r(cx) &= ~(1UL << n);
+            }
+            goto done;
+
+        case vex_f2: /* rdmsrlist */
+            vcpu_must_have(msrlist);
+            generate_exception_if(!mode_64bit(), X86_EXC_UD);
+            generate_exception_if(!mode_ring0() || (regs->r(si) & 7) ||
+                                  (regs->r(di) & 7),
+                                  X86_EXC_GP, 0);
+            fail_if(!ops->read_msr || !ops->write);
+            while ( regs->r(cx) )
+            {
+                n = __builtin_ffsl(regs->r(cx)) - 1;
+                if ( (rc = ops->read(x86_seg_none, regs->r(si) + n * 8,
+                                     &msr_val, 8, ctxt)) != X86EMUL_OKAY )
+                    break;
+                generate_exception_if(msr_val != (uint32_t)msr_val,
+                                      X86_EXC_GP, 0);
+                if ( (rc = ops->read_msr(msr_val, &msr_val,
+                                         ctxt)) != X86EMUL_OKAY ||
+                     (rc = ops->write(x86_seg_none, regs->r(di) + n * 8,
+                                      &msr_val, 8, ctxt)) != X86EMUL_OKAY )
+                    break;
+                regs->r(cx) &= ~(1UL << n);
+            }
+            if ( rc != X86EMUL_OKAY )
+                ctxt->regs->r(cx) = regs->r(cx);
+            goto done;
         }
         generate_exception(X86_EXC_UD);
 
--- a/xen/arch/x86/x86_emulate/private.h
+++ b/xen/arch/x86/x86_emulate/private.h
@@ -600,6 +600,7 @@ amd_like(const struct x86_emulate_ctxt *
 #define vcpu_has_lkgs()        (ctxt->cpuid->feat.lkgs)
 #define vcpu_has_wrmsrns()     (ctxt->cpuid->feat.wrmsrns)
 #define vcpu_has_avx_ifma()    (ctxt->cpuid->feat.avx_ifma)
+#define vcpu_has_msrlist()     (ctxt->cpuid->feat.msrlist)
 #define vcpu_has_avx_vnni_int8()  (ctxt->cpuid->feat.avx_vnni_int8)
 #define vcpu_has_avx_ne_convert() (ctxt->cpuid->feat.avx_ne_convert)
 
--- a/xen/arch/x86/x86_emulate/util.c
+++ b/xen/arch/x86/x86_emulate/util.c
@@ -112,6 +112,9 @@ bool cf_check x86_insn_is_mem_access(con
         break;
 
     case X86EMUL_OPC(0x0f, 0x01):
+        /* {RD,WR}MSRLIST */
+        if ( mode_64bit() && s->modrm == 0xc6 )
+            return s->vex.pfx >= vex_f3;
         /* Cover CLZERO. */
         return (s->modrm_rm & 7) == 4 && (s->modrm_reg & 7) == 7;
     }
@@ -172,7 +175,11 @@ bool cf_check x86_insn_is_mem_write(cons
     case 0xff: /* Grp5 */
         break;
 
-    case X86EMUL_OPC(0x0f, 0x01): /* CLZERO is the odd one. */
+    case X86EMUL_OPC(0x0f, 0x01):
+        /* RDMSRLIST */
+        if ( mode_64bit() && s->modrm == 0xc6 )
+            return s->vex.pfx == vex_f2;
+        /* CLZERO is another odd one. */
         return (s->modrm_rm & 7) == 4 && (s->modrm_reg & 7) == 7;
 
     default:
--- a/xen/include/public/arch-x86/cpufeatureset.h
+++ b/xen/include/public/arch-x86/cpufeatureset.h
@@ -286,6 +286,7 @@ XEN_CPUFEATURE(FRED, 10*32+17) /
 XEN_CPUFEATURE(LKGS,         10*32+18) /*S  Load Kernel GS Base */
 XEN_CPUFEATURE(WRMSRNS,      10*32+19) /*A  WRMSR Non-Serialising */
 XEN_CPUFEATURE(AVX_IFMA,     10*32+23) /*A  AVX-IFMA Instructions */
+XEN_CPUFEATURE(MSRLIST,      10*32+27) /*   MSR list instructions */
 
 /* AMD-defined CPU features, CPUID level 0x80000021.eax, word 11 */
 XEN_CPUFEATURE(LFENCE_DISPATCH,    11*32+ 2) /*A  LFENCE always serializing */
These are "compound" instructions to issue a series of RDMSR / WRMSR respectively. In the emulator we can therefore implement them by using the existing msr_{read,write}() hooks. The memory accesses utilize that the HVM ->read() / ->write() hooks are already linear-address (x86_seg_none) aware (by way of hvmemul_virtual_to_linear() handling this case). Signed-off-by: Jan Beulich <jbeulich@suse.com> --- TODO: Use VMX tertiary execution control (once bit is known; see //todo-s) and then further adjust cpufeatureset.h. RFC: In vmx_vmexit_handler() handling is forwarded to the emulator blindly. Alternatively we could consult the exit qualification and process just a single MSR at a time (without involving the emulator), exiting back to the guest after every iteration. (I don't think a mix of both models makes a lot of sense.) RFC: For PV priv_op_ops would need to gain proper read/write hooks, which doesn't look desirable (albeit there we could refuse to handle anything else than x86_seg_none); we may want to consider to instead not support the feature for PV guests, requiring e.g. Linux to process the lists in new pvops hooks. RFC: I wasn't sure whether to add preemption checks to the loops - thoughts? With the VMX side of the spec still unclear (tertiary execution control bit unspecified in ISE 046) we can't enable the insn yet for (HVM) guest use. The precise behavior of MSR_BARRIER is also not spelled out, so the (minimal) implementation is a guess for now.
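For reference, the semantics the two emulation loops implement can be
summarised in a stand-alone C model (illustrative only; the callback
type and error convention below are assumptions, not Xen code):

#include <stdint.h>

typedef int (*msr_read_t)(uint32_t idx, uint64_t *val);

/*
 * RDMSRLIST model: for each set bit n in *bitmap, slot n of idx_tbl
 * names an MSR (upper 32 bits must be zero, else #GP(0)) and slot n
 * of val_tbl receives its value.  Bits clear as slots complete, so
 * *bitmap records progress if the walk is cut short.  WRMSRLIST is
 * the same walk with the MSR read replaced by a write from val_tbl.
 */
static int rdmsrlist_model(const uint64_t idx_tbl[64], uint64_t val_tbl[64],
                           uint64_t *bitmap, msr_read_t read_msr)
{
    while ( *bitmap )
    {
        unsigned int n = __builtin_ctzll(*bitmap); /* lowest set bit */
        uint64_t idx = idx_tbl[n];
        int rc;

        if ( idx != (uint32_t)idx )
            return -1; /* #GP(0) in hardware */
        if ( (rc = read_msr(idx, &val_tbl[n])) != 0 )
            return rc;
        *bitmap &= ~(1ULL << n);
    }
    return 0;
}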