[9/9] x86emul+VMX: support {RD,WR}MSRLIST

Message ID b567e068-dcab-b294-9706-ffbecb36de3c@suse.com (mailing list archive)
State Superseded
Series x86emul: misc additions

Commit Message

Jan Beulich April 4, 2023, 2:55 p.m. UTC
These are "compound" instructions to issue a series of RDMSR / WRMSR
operations respectively. In the emulator we can therefore implement
them using the existing msr_{read,write}() hooks. The memory accesses
rely on the HVM ->read() / ->write() hooks already being linear-address
(x86_seg_none) aware (by way of hvmemul_virtual_to_linear() handling
this case).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
TODO: Use VMX tertiary execution control (once bit is known; see
      //todo-s) and then further adjust cpufeatureset.h.

RFC: In vmx_vmexit_handler() handling is forwarded to the emulator
     blindly. Alternatively we could consult the exit qualification and
     process just a single MSR at a time (without involving the
     emulator), exiting back to the guest after every iteration. (I
     don't think a mix of both models makes a lot of sense.)

RFC: For PV, priv_op_ops would need to gain proper read/write hooks,
     which doesn't look desirable (albeit there we could refuse to
     handle anything other than x86_seg_none); we may instead want
     not to support the feature for PV guests, requiring e.g. Linux
     to process the lists in new pvops hooks.

RFC: I wasn't sure whether to add preemption checks to the loops -
     thoughts?

With the VMX side of the spec still unclear (tertiary execution control
bit unspecified in ISE 046) we can't enable the insn yet for (HVM) guest
use. The precise behavior of MSR_BARRIER is also not spelled out, so the
(minimal) implementation is a guess for now.
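
For orientation, a sketch of guest-side usage matching the semantics
implemented here: rSI points at a table of MSR indices, rDI at a table
of data slots (both 8-byte aligned, at most 64 entries), and rCX is a
bitmap selecting which entries to process, with each bit cleared as the
corresponding access completes. The wrappers below are illustrative
only, not part of the patch; with no assembler mnemonic support
assumed, the opcode bytes are the ones the test harness emits.

#include <stdint.h>

/* Hypothetical guest-side wrappers; 64-bit mode, ring 0 only. */
static inline void wrmsrlist(const uint64_t *idx, const uint64_t *val,
                             uint64_t bitmap)
{
    /* f3 0f 01 c6 = WRMSRLIST; rCX (the bitmap) is updated in place. */
    asm volatile ( ".byte 0xf3, 0x0f, 0x01, 0xc6"
                   : "+c" (bitmap)
                   : "S" (idx), "D" (val)
                   : "memory" );
}

static inline void rdmsrlist(const uint64_t *idx, uint64_t *val,
                             uint64_t bitmap)
{
    /* f2 0f 01 c6 = RDMSRLIST; each value read is stored to val[n]. */
    asm volatile ( ".byte 0xf2, 0x0f, 0x01, 0xc6"
                   : "+c" (bitmap)
                   : "S" (idx), "D" (val)
                   : "memory" );
}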

Comments

Jan Beulich April 4, 2023, 2:57 p.m. UTC | #1
On 04.04.2023 16:55, Jan Beulich wrote:
> These are "compound" instructions to issue a series of RDMSR / WRMSR
> operations respectively. In the emulator we can therefore implement
> them using the existing msr_{read,write}() hooks. The memory accesses
> rely on the HVM ->read() / ->write() hooks already being linear-address
> (x86_seg_none) aware (by way of hvmemul_virtual_to_linear() handling
> this case).
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> TODO: Use VMX tertiary execution control (once bit is known; see
>       //todo-s) and then further adjust cpufeatureset.h.

Argh, should have Cc-ed Kevin and Jun, even if it weren't for this issue.

Jan

Andrew Cooper April 6, 2023, 10:03 p.m. UTC | #2
On 04/04/2023 3:55 pm, Jan Beulich wrote:
> These are "compound" instructions to issue a series of RDMSR / WRMSR
> operations respectively. In the emulator we can therefore implement
> them using the existing msr_{read,write}() hooks. The memory accesses
> rely on the HVM ->read() / ->write() hooks already being linear-address
> (x86_seg_none) aware (by way of hvmemul_virtual_to_linear() handling
> this case).
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> TODO: Use VMX tertiary execution control (once bit is known; see
>       //todo-s) and then further adjust cpufeatureset.h.
>
> RFC: In vmx_vmexit_handler() handling is forwarded to the emulator
>      blindly. Alternatively we could consult the exit qualification and
>      process just a single MSR at a time (without involving the
>      emulator), exiting back to the guest after every iteration. (I
>      don't think a mix of both models makes a lot of sense.)

{RD,WR}MSRLIST are supposed to be used for context switch paths.  They
really shouldn't be exiting in the common case.

What matters here is the conditional probability of a second MSR being
intercepted too, given that one already has been.  And I don't have a
clue how to answer this.

I would not expect Introspection to be intercepting a fastpath MSR. 
(And if it does, frankly it can live with the consequences.)

For future scenarios such as reloading PMU/LBR/whatever, these will be
all-or-nothing and we'd expect to have them eagerly in context anyway.

If I were going to guess, I'd say that probably MSR_XSS or MSR_SPEC_CTRL
will be giving us the most grief here, because they're both ones that
are liable to be touched on a context switch path, and have split
host/guest bits.

> RFC: For PV, priv_op_ops would need to gain proper read/write hooks,
>      which doesn't look desirable (albeit there we could refuse to
>      handle anything other than x86_seg_none); we may instead want
>      not to support the feature for PV guests, requiring e.g. Linux
>      to process the lists in new pvops hooks.

Ah - funny you should ask.  See patch 2.  These are even better reasons
not to support on PV guests.

> RFC: I wasn't sure whether to add preemption checks to the loops -
>      thoughts?

I'd be tempted to.  Mostly because a guest can exert 64* longest MSR
worth of pressure here, along with associated emulation overhead.

64* "write hypercall page" sounds expensive.  So too does 64* MSR_PAT,
given all the EPT actions behind the scenes.

It's probably one of those

> With the VMX side of the spec still unclear (tertiary execution control
> bit unspecified in ISE 046) we can't enable the insn yet for (HVM) guest
> use. The precise behavior of MSR_BARRIER is also not spelled out, so the
> (minimal) implementation is a guess for now.

MSRs are expensive for several reasons.  The %ecx decode alone isn't
cheap, nor is the reserved bit checking, and that's before starting the
action itself.

What the list form gets you is the ability to pipeline the checks on the
following MSR while performing the action of the current one.  I suspect
there are plans to try and parallelise the actions too if possible.

As I understand it, BARRIER simply halts the pipelining of the next
MSR's processing.
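
For illustration, the rdmsrlist test in the patch builds exactly such a
mixed list, with a BARRIER entry alongside real MSRs. A sketch of the
resulting table layout (hypothetical variable names):

    /* Index/data tables as populated by the test harness. */
    uint64_t idx[64], dat[64];
    idx[0x0f] = 0xc0000101;  /* GS_BASE */
    idx[0x20] = 0xc0000102;  /* SHADOW_GS_BASE */
    idx[0x31] = 0x0000002f;  /* BARRIER - reads back as zero */
    uint64_t bitmap = (1UL << 0x0f) | (1UL << 0x20) | (1UL << 0x31);
    /* == 0x0002000100008000, the rCX value the test loads. */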

~Andrew
Jan Beulich April 17, 2023, 1:02 p.m. UTC | #3
On 07.04.2023 00:03, Andrew Cooper wrote:
> On 04/04/2023 3:55 pm, Jan Beulich wrote:
>> ---
>> TODO: Use VMX tertiary execution control (once bit is known; see
>>       //todo-s) and then further adjust cpufeatureset.h.
>>
>> RFC: In vmx_vmexit_handler() handling is forwarded to the emulator
>>      blindly. Alternatively we could consult the exit qualification and
>>      process just a single MSR at a time (without involving the
>>      emulator), exiting back to the guest after every iteration. (I
>>      don't think a mix of both models makes a lot of sense.)
> 
> {RD,WR}MSRLIST are supposed to be used for context switch paths.  They
> really shouldn't be exiting in the common case.
> 
> What matters here is the conditional probability of a second MSR being
> intercepted too, given that one already has been.  And I don't have a
> clue how to answer this.
> 
> I would not expect Introspection to be intercepting a fastpath MSR. 
> (And if it does, frankly it can live with the consequences.)
> 
> For future scenarios such as reloading PMU/LBR/whatever, these will be
> all-or-nothing and we'd expect to have them eagerly in context anyway.
> 
> If I were going to guess, I'd say that probably MSR_XSS or MSR_SPEC_CTRL
> will be giving us the most grief here, because they're both ones that
> are liable to be touched on a context switch path, and have split
> host/guest bits.

I'm not really certain, but I'm tending to interpret your reply as
agreement with the choice made (and hence not as a request to change
anything in this regard); clarification appreciated.

>> RFC: For PV, priv_op_ops would need to gain proper read/write hooks,
>>      which doesn't look desirable (albeit there we could refuse to
>>      handle anything other than x86_seg_none); we may instead want
>>      not to support the feature for PV guests, requiring e.g. Linux
>>      to process the lists in new pvops hooks.
> 
> Ah - funny you should ask.  See patch 2.  These are even better reasons
> not to support on PV guests.

Yeah, with PV dropped there, it's quite obvious not to consider it here
either. I'll drop the remark.

>> RFC: I wasn't sure whether to add preemption checks to the loops -
>>      thoughts?
> 
> I'd be tempted to.  Mostly because a guest can exert 64* longest MSR
> worth of pressure here, along with associated emulation overhead.
> 
> 64* "write hypercall page" sounds expensive.  So too does 64* MSR_PAT,
> given all the EPT actions behind the scenes.
> 
> It's probably one of those

Which leaves me inclined to add preemption checking to writes, but
keep reads as they are. Thoughts?
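
For concreteness, one possible shape for such a check (a sketch only,
leaning on the architectural restartability - rCX tracks progress - and
on the emulator's X86EMUL_RETRY; local_events_need_delivery() is used
purely as an illustrative predicate, as the emulator core is also built
for the test harness):

    /* Inside the wrmsrlist loop, after clearing bit n in regs->r(cx): */
    if ( regs->r(cx) && local_events_need_delivery() )
    {
        ctxt->regs->r(cx) = regs->r(cx); /* record partial progress */
        rc = X86EMUL_RETRY;              /* rIP left unmodified */
        break;
    }

This mirrors the rdmsrlist failure path in the patch, which already
writes the updated bitmap back so a restarted instruction continues
where it left off.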

Jan

Patch

--- a/tools/libs/light/libxl_cpuid.c
+++ b/tools/libs/light/libxl_cpuid.c
@@ -240,6 +240,7 @@  int libxl_cpuid_parse_config(libxl_cpuid
         {"lkgs",         0x00000007,  1, CPUID_REG_EAX, 18,  1},
         {"wrmsrns",      0x00000007,  1, CPUID_REG_EAX, 19,  1},
         {"avx-ifma",     0x00000007,  1, CPUID_REG_EAX, 23,  1},
+        {"msrlist",      0x00000007,  1, CPUID_REG_EAX, 27,  1},
 
         {"avx-vnni-int8",0x00000007,  1, CPUID_REG_EDX,  4,  1},
         {"avx-ne-convert",0x00000007, 1, CPUID_REG_EDX,  5,  1},
--- a/tools/misc/xen-cpuid.c
+++ b/tools/misc/xen-cpuid.c
@@ -195,6 +195,8 @@  static const char *const str_7a1[32] =
     [18] = "lkgs",          [19] = "wrmsrns",
 
     /* 22 */                [23] = "avx-ifma",
+
+    /* 26 */                [27] = "msrlist",
 };
 
 static const char *const str_e21a[32] =
--- a/tools/tests/x86_emulator/predicates.c
+++ b/tools/tests/x86_emulator/predicates.c
@@ -342,6 +342,8 @@  static const struct {
     { { 0x01, 0xc4 }, { 2, 2 }, F, N }, /* vmxoff */
     { { 0x01, 0xc5 }, { 2, 2 }, F, N }, /* pconfig */
     { { 0x01, 0xc6 }, { 2, 2 }, F, N }, /* wrmsrns */
+    { { 0x01, 0xc6 }, { 0, 2 }, F, W, pfx_f2 }, /* rdmsrlist */
+    { { 0x01, 0xc6 }, { 0, 2 }, F, R, pfx_f3 }, /* wrmsrlist */
     { { 0x01, 0xc8 }, { 2, 2 }, F, N }, /* monitor */
     { { 0x01, 0xc9 }, { 2, 2 }, F, N }, /* mwait */
     { { 0x01, 0xca }, { 2, 2 }, F, N }, /* clac */
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -589,6 +589,7 @@  static int read(
     default:
         if ( !is_x86_user_segment(seg) )
             return X86EMUL_UNHANDLEABLE;
+    case x86_seg_none:
         bytes_read += bytes;
         break;
     }
@@ -619,7 +620,7 @@  static int write(
     if ( verbose )
         printf("** %s(%u, %p,, %u,)\n", __func__, seg, (void *)offset, bytes);
 
-    if ( !is_x86_user_segment(seg) )
+    if ( !is_x86_user_segment(seg) && seg != x86_seg_none )
         return X86EMUL_UNHANDLEABLE;
     memcpy((void *)offset, p_data, bytes);
     return X86EMUL_OKAY;
@@ -711,6 +712,10 @@  static int read_msr(
 {
     switch ( reg )
     {
+    case 0x0000002f: /* BARRIER */
+        *val = 0;
+        return X86EMUL_OKAY;
+
     case 0xc0000080: /* EFER */
         *val = ctxt->addr_size > 32 ? 0x500 /* LME|LMA */ : 0;
         return X86EMUL_OKAY;
@@ -1499,9 +1504,53 @@  int main(int argc, char **argv)
          (gs_base != 0x0000111122224444UL) ||
          gs_base_shadow )
         goto fail;
+    printf("okay\n");
 
     cp.extd.nscb = i;
     emulops.write_segment = NULL;
+
+    printf("%-40s", "Testing rdmsrlist...");
+    instr[0] = 0xf2; instr[1] = 0x0f; instr[2] = 0x01; instr[3] = 0xc6;
+    regs.rip = (unsigned long)&instr[0];
+    regs.rsi = (unsigned long)(res + 0x80);
+    regs.rdi = (unsigned long)(res + 0x80 + 0x40 * 2);
+    regs.rcx = 0x0002000100008000UL;
+    gs_base_shadow = 0x0000222244446666UL;
+    memset(res + 0x80, ~0, 0x40 * 8 * 2);
+    res[0x80 + 0x0f * 2] = 0xc0000101; /* GS_BASE */
+    res[0x80 + 0x0f * 2 + 1] = 0;
+    res[0x80 + 0x20 * 2] = 0xc0000102; /* SHADOW_GS_BASE */
+    res[0x80 + 0x20 * 2 + 1] = 0;
+    res[0x80 + 0x31 * 2] = 0x2f; /* BARRIER */
+    res[0x80 + 0x31 * 2 + 1] = 0;
+    rc = x86_emulate(&ctxt, &emulops);
+    if ( (rc != X86EMUL_OKAY) ||
+         (regs.rip != (unsigned long)&instr[4]) ||
+         regs.rcx ||
+         (res[0x80 + (0x40 + 0x0f) * 2] != (unsigned int)gs_base) ||
+         (res[0x80 + (0x40 + 0x0f) * 2 + 1] != (gs_base >> (8 * sizeof(int)))) ||
+         (res[0x80 + (0x40 + 0x20) * 2] != (unsigned int)gs_base_shadow) ||
+         (res[0x80 + (0x40 + 0x20) * 2 + 1] != (gs_base_shadow >> (8 * sizeof(int)))) ||
+         res[0x80 + (0x40 + 0x31) * 2] || res[0x80 + (0x40 + 0x31) * 2 + 1] )
+        goto fail;
+    printf("okay\n");
+
+    printf("%-40s", "Testing wrmsrlist...");
+    instr[0] = 0xf3; instr[1] = 0x0f; instr[2] = 0x01; instr[3] = 0xc6;
+    regs.eip = (unsigned long)&instr[0];
+    regs.rsi -= 0x11 * 8;
+    regs.rdi -= 0x11 * 8;
+    regs.rcx = 0x0002000100000000UL;
+    res[0x80 + 0x0f * 2] = 0xc0000102; /* SHADOW_GS_BASE */
+    res[0x80 + 0x20 * 2] = 0xc0000101; /* GS_BASE */
+    rc = x86_emulate(&ctxt, &emulops);
+    if ( (rc != X86EMUL_OKAY) ||
+         (regs.rip != (unsigned long)&instr[4]) ||
+         regs.rcx ||
+         (gs_base != 0x0000222244446666UL) ||
+         (gs_base_shadow != 0x0000111122224444UL) )
+        goto fail;
+
     emulops.write_msr     = NULL;
 #endif
     printf("okay\n");
--- a/tools/tests/x86_emulator/x86-emulate.c
+++ b/tools/tests/x86_emulator/x86-emulate.c
@@ -88,6 +88,7 @@  bool emul_test_init(void)
     cp.feat.rdpid = true;
     cp.feat.lkgs = true;
     cp.feat.wrmsrns = true;
+    cp.feat.msrlist = true;
     cp.extd.clzero = true;
 
     if ( cpu_has_xsave )
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -835,6 +835,17 @@  static void cf_check vmx_cpuid_policy_ch
     else
         vmx_set_msr_intercept(v, MSR_PKRS, VMX_MSR_RW);
 
+    if ( cp->feat.msrlist )
+    {
+        vmx_clear_msr_intercept(v, MSR_BARRIER, VMX_MSR_RW);
+        //todo enable MSRLIST tertiary execution control
+    }
+    else
+    {
+        vmx_set_msr_intercept(v, MSR_BARRIER, VMX_MSR_RW);
+        //todo disable MSRLIST tertiary execution control
+    }
+
  out:
     vmx_vmcs_exit(v);
 
@@ -3705,6 +3716,22 @@  gp_fault:
     return X86EMUL_EXCEPTION;
 }
 
+static bool cf_check is_msrlist(
+    const struct x86_emulate_state *state, const struct x86_emulate_ctxt *ctxt)
+{
+
+    if ( ctxt->opcode == X86EMUL_OPC(0x0f, 0x01) )
+    {
+        unsigned int rm, reg;
+        int mode = x86_insn_modrm(state, &rm, &reg);
+
+        /* This also includes WRMSRNS; should be okay. */
+        return mode == 3 && rm == 6 && !reg;
+    }
+
+    return false;
+}
+
 static void vmx_do_extint(struct cpu_user_regs *regs)
 {
     unsigned long vector;
@@ -4513,6 +4540,17 @@  void vmx_vmexit_handler(struct cpu_user_
         }
         break;
 
+    case EXIT_REASON_RDMSRLIST:
+    case EXIT_REASON_WRMSRLIST:
+        if ( vmx_guest_x86_mode(v) != 8 || !currd->arch.cpuid->feat.msrlist )
+        {
+            ASSERT_UNREACHABLE();
+            hvm_inject_hw_exception(TRAP_invalid_op, X86_EVENT_NO_EC);
+        }
+        else if ( !hvm_emulate_one_insn(is_msrlist, "MSR list") )
+            hvm_inject_hw_exception(TRAP_gp_fault, 0);
+        break;
+
     case EXIT_REASON_VMXOFF:
     case EXIT_REASON_VMXON:
     case EXIT_REASON_VMCLEAR:
--- a/xen/arch/x86/include/asm/hvm/vmx/vmx.h
+++ b/xen/arch/x86/include/asm/hvm/vmx/vmx.h
@@ -211,6 +211,8 @@  static inline void pi_clear_sn(struct pi
 #define EXIT_REASON_XRSTORS             64
 #define EXIT_REASON_BUS_LOCK            74
 #define EXIT_REASON_NOTIFY              75
+#define EXIT_REASON_RDMSRLIST           78
+#define EXIT_REASON_WRMSRLIST           79
 /* Remember to also update VMX_PERF_EXIT_REASON_SIZE! */
 
 /*
--- a/xen/arch/x86/include/asm/msr-index.h
+++ b/xen/arch/x86/include/asm/msr-index.h
@@ -24,6 +24,8 @@ 
 #define  APIC_BASE_ENABLE                   (_AC(1, ULL) << 11)
 #define  APIC_BASE_ADDR_MASK                0x000ffffffffff000ULL
 
+#define MSR_BARRIER                         0x0000002f
+
 #define MSR_TEST_CTRL                       0x00000033
 #define  TEST_CTRL_SPLITLOCK_DETECT         (_AC(1, ULL) << 29)
 #define  TEST_CTRL_SPLITLOCK_DISABLE        (_AC(1, ULL) << 31)
--- a/xen/arch/x86/include/asm/perfc_defn.h
+++ b/xen/arch/x86/include/asm/perfc_defn.h
@@ -6,7 +6,7 @@  PERFCOUNTER_ARRAY(exceptions,
 
 #ifdef CONFIG_HVM
 
-#define VMX_PERF_EXIT_REASON_SIZE 76
+#define VMX_PERF_EXIT_REASON_SIZE 80
 #define VMEXIT_NPF_PERFC 143
 #define SVM_PERF_EXIT_REASON_SIZE (VMEXIT_NPF_PERFC + 1)
 PERFCOUNTER_ARRAY(vmexits,              "vmexits",
--- a/xen/arch/x86/msr.c
+++ b/xen/arch/x86/msr.c
@@ -223,6 +223,12 @@  int guest_rdmsr(struct vcpu *v, uint32_t
     case MSR_AMD_PPIN:
         goto gp_fault;
 
+    case MSR_BARRIER:
+        if ( !cp->feat.msrlist )
+            goto gp_fault;
+        *val = 0;
+        break;
+
     case MSR_IA32_FEATURE_CONTROL:
         /*
          * Architecturally, availability of this MSR is enumerated by the
@@ -493,6 +499,7 @@  int guest_wrmsr(struct vcpu *v, uint32_t
         uint64_t rsvd;
 
         /* Read-only */
+    case MSR_BARRIER:
     case MSR_IA32_PLATFORM_ID:
     case MSR_CORE_CAPABILITIES:
     case MSR_INTEL_CORE_THREAD_COUNT:
--- a/xen/arch/x86/x86_emulate/0f01.c
+++ b/xen/arch/x86/x86_emulate/0f01.c
@@ -40,6 +40,7 @@  int x86emul_0f01(struct x86_emulate_stat
     switch ( s->modrm )
     {
         unsigned long base, limit, cr0, cr0w, cr4;
+        unsigned int n;
         struct segment_register sreg;
         uint64_t msr_val;
 
@@ -54,6 +55,56 @@  int x86emul_0f01(struct x86_emulate_stat
                                 ((uint64_t)regs->r(dx) << 32) | regs->eax,
                                 ctxt);
             goto done;
+
+        case vex_f3: /* wrmsrlist */
+            vcpu_must_have(msrlist);
+            generate_exception_if(!mode_64bit(), X86_EXC_UD);
+            generate_exception_if(!mode_ring0() || (regs->r(si) & 7) ||
+                                  (regs->r(di) & 7),
+                                  X86_EXC_GP, 0);
+            fail_if(!ops->write_msr);
+            while ( regs->r(cx) )
+            {
+                n = __builtin_ffsl(regs->r(cx)) - 1;
+                if ( (rc = ops->read(x86_seg_none, regs->r(si) + n * 8,
+                                     &msr_val, 8, ctxt)) != X86EMUL_OKAY )
+                    break;
+                generate_exception_if(msr_val != (uint32_t)msr_val,
+                                      X86_EXC_GP, 0);
+                base = msr_val;
+                if ( (rc = ops->read(x86_seg_none, regs->r(di) + n * 8,
+                                     &msr_val, 8, ctxt)) != X86EMUL_OKAY ||
+                     (rc = ops->write_msr(base, msr_val, ctxt)) != X86EMUL_OKAY )
+                    break;
+                regs->r(cx) &= ~(1UL << n);
+            }
+            goto done;
+
+        case vex_f2: /* rdmsrlist */
+            vcpu_must_have(msrlist);
+            generate_exception_if(!mode_64bit(), X86_EXC_UD);
+            generate_exception_if(!mode_ring0() || (regs->r(si) & 7) ||
+                                  (regs->r(di) & 7),
+                                  X86_EXC_GP, 0);
+            fail_if(!ops->read_msr || !ops->write);
+            while ( regs->r(cx) )
+            {
+                n = __builtin_ffsl(regs->r(cx)) - 1;
+                if ( (rc = ops->read(x86_seg_none, regs->r(si) + n * 8,
+                                     &msr_val, 8, ctxt)) != X86EMUL_OKAY )
+                    break;
+                generate_exception_if(msr_val != (uint32_t)msr_val,
+                                      X86_EXC_GP, 0);
+                if ( (rc = ops->read_msr(msr_val, &msr_val,
+                                         ctxt)) != X86EMUL_OKAY ||
+                     (rc = ops->write(x86_seg_none, regs->r(di) + n * 8,
+                                      &msr_val, 8, ctxt)) != X86EMUL_OKAY )
+                    break;
+                regs->r(cx) &= ~(1UL << n);
+            }
+            if ( rc != X86EMUL_OKAY )
+                ctxt->regs->r(cx) = regs->r(cx);
+            goto done;
         }
         generate_exception(X86_EXC_UD);
 
--- a/xen/arch/x86/x86_emulate/private.h
+++ b/xen/arch/x86/x86_emulate/private.h
@@ -600,6 +600,7 @@  amd_like(const struct x86_emulate_ctxt *
 #define vcpu_has_lkgs()        (ctxt->cpuid->feat.lkgs)
 #define vcpu_has_wrmsrns()     (ctxt->cpuid->feat.wrmsrns)
 #define vcpu_has_avx_ifma()    (ctxt->cpuid->feat.avx_ifma)
+#define vcpu_has_msrlist()     (ctxt->cpuid->feat.msrlist)
 #define vcpu_has_avx_vnni_int8() (ctxt->cpuid->feat.avx_vnni_int8)
 #define vcpu_has_avx_ne_convert() (ctxt->cpuid->feat.avx_ne_convert)
 
--- a/xen/arch/x86/x86_emulate/util.c
+++ b/xen/arch/x86/x86_emulate/util.c
@@ -112,6 +112,9 @@  bool cf_check x86_insn_is_mem_access(con
         break;
 
     case X86EMUL_OPC(0x0f, 0x01):
+        /* {RD,WR}MSRLIST */
+        if ( mode_64bit() && s->modrm == 0xc6 )
+            return s->vex.pfx >= vex_f3;
         /* Cover CLZERO. */
         return (s->modrm_rm & 7) == 4 && (s->modrm_reg & 7) == 7;
     }
@@ -172,7 +175,11 @@  bool cf_check x86_insn_is_mem_write(cons
         case 0xff: /* Grp5 */
             break;
 
-        case X86EMUL_OPC(0x0f, 0x01): /* CLZERO is the odd one. */
+        case X86EMUL_OPC(0x0f, 0x01):
+            /* RDMSRLIST */
+            if ( mode_64bit() && s->modrm == 0xc6 )
+                return s->vex.pfx == vex_f2;
+            /* CLZERO is another odd one. */
             return (s->modrm_rm & 7) == 4 && (s->modrm_reg & 7) == 7;
 
         default:
--- a/xen/include/public/arch-x86/cpufeatureset.h
+++ b/xen/include/public/arch-x86/cpufeatureset.h
@@ -286,6 +286,7 @@  XEN_CPUFEATURE(FRED,         10*32+17) /
 XEN_CPUFEATURE(LKGS,         10*32+18) /*S  Load Kernel GS Base */
 XEN_CPUFEATURE(WRMSRNS,      10*32+19) /*A  WRMSR Non-Serialising */
 XEN_CPUFEATURE(AVX_IFMA,     10*32+23) /*A  AVX-IFMA Instructions */
+XEN_CPUFEATURE(MSRLIST,      10*32+27) /*   MSR list instructions */
 
 /* AMD-defined CPU features, CPUID level 0x80000021.eax, word 11 */
 XEN_CPUFEATURE(LFENCE_DISPATCH,    11*32+ 2) /*A  LFENCE always serializing */