Message ID | 5eca97e6a3978cf4dcf1cff21be6ec8b639a66b9.1708933498.git.isaku.yamahata@intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | [v19,001/130] x86/virt/tdx: Rename _offset to _member for TD_SYSINFO_MAP() macro | expand |
On Mon, Feb 26, 2024 at 12:25:41AM -0800, isaku.yamahata@intel.com wrote: >From: Isaku Yamahata <isaku.yamahata@intel.com> ... > >TDX requires additional parameters for TDX VM for confidential execution to >protect the confidentiality of its memory contents and CPU state from any >other software, including VMM. When creating a guest TD VM before creating >vcpu, the number of vcpu, TSC frequency (the values are the same among >vcpus, and it can't change.) CPUIDs which the TDX module emulates. Guest >TDs can trust those CPUIDs and sha384 values for measurement. > >Add a new subcommand, KVM_TDX_INIT_VM, to pass parameters for the TDX >guest. It assigns an encryption key to the TDX guest for memory >encryption. TDX encrypts memory per guest basis. The device model, say >qemu, passes per-VM parameters for the TDX guest. The maximum number of >vcpus, TSC frequency (TDX guest has fixed VM-wide TSC frequency, not per >vcpu. The TDX guest can not change it.), attributes (production or debug), >available extended features (which configure guest XCR0, IA32_XSS MSR), >CPUIDs, sha384 measurements, etc. > >Call this subcommand before creating vcpu and KVM_SET_CPUID2, i.e. CPUID >configurations aren't available yet. So CPUIDs configuration values need >to be passed in struct kvm_tdx_init_vm. The device model's responsibility >to make this CPUID config for KVM_TDX_INIT_VM and KVM_SET_CPUID2. > >Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com> >Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com> the SOB chain makes no sense. >+static void setup_tdparams_cpuids(struct kvm_cpuid2 *cpuid, >+ struct td_params *td_params) >+{ >+ int i; >+ >+ /* >+ * td_params.cpuid_values: The number and the order of cpuid_value must >+ * be same to the one of struct tdsysinfo.{num_cpuid_config, cpuid_configs} >+ * It's assumed that td_params was zeroed. >+ */ >+ for (i = 0; i < tdx_info->num_cpuid_config; i++) { >+ const struct kvm_tdx_cpuid_config *c = &tdx_info->cpuid_configs[i]; >+ /* KVM_TDX_CPUID_NO_SUBLEAF means index = 0. */ >+ u32 index = c->sub_leaf == KVM_TDX_CPUID_NO_SUBLEAF ? 0 : c->sub_leaf; >+ const struct kvm_cpuid_entry2 *entry = >+ kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, >+ c->leaf, index); >+ struct tdx_cpuid_value *value = &td_params->cpuid_values[i]; >+ >+ if (!entry) >+ continue; >+ >+ /* >+ * tdsysinfo.cpuid_configs[].{eax, ebx, ecx, edx} >+ * bit 1 means it can be configured to zero or one. >+ * bit 0 means it must be zero. >+ * Mask out non-configurable bits. >+ */ >+ value->eax = entry->eax & c->eax; >+ value->ebx = entry->ebx & c->ebx; >+ value->ecx = entry->ecx & c->ecx; >+ value->edx = entry->edx & c->edx; Any reason to mask off non-configurable bits rather than return an error? this is misleading to userspace because guest sees the values emulated by TDX module instead of the values passed from userspace (i.e., the request from userspace isn't done but there is no indication of that to userspace). >+ } >+} >+ >+static int setup_tdparams_xfam(struct kvm_cpuid2 *cpuid, struct td_params *td_params) >+{ >+ const struct kvm_cpuid_entry2 *entry; >+ u64 guest_supported_xcr0; >+ u64 guest_supported_xss; >+ >+ /* Setup td_params.xfam */ >+ entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, 0xd, 0); >+ if (entry) >+ guest_supported_xcr0 = (entry->eax | ((u64)entry->edx << 32)); >+ else >+ guest_supported_xcr0 = 0; >+ guest_supported_xcr0 &= kvm_caps.supported_xcr0; >+ >+ entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, 0xd, 1); >+ if (entry) >+ guest_supported_xss = (entry->ecx | ((u64)entry->edx << 32)); >+ else >+ guest_supported_xss = 0; >+ >+ /* >+ * PT and CET can be exposed to TD guest regardless of KVM's XSS, PT >+ * and, CET support. >+ */ >+ guest_supported_xss &= >+ (kvm_caps.supported_xss | XFEATURE_MASK_PT | TDX_TD_XFAM_CET); >+ >+ td_params->xfam = guest_supported_xcr0 | guest_supported_xss; >+ if (td_params->xfam & XFEATURE_MASK_LBR) { >+ /* >+ * TODO: once KVM supports LBR(save/restore LBR related >+ * registers around TDENTER), remove this guard. >+ */ >+#define MSG_LBR "TD doesn't support LBR yet. KVM needs to save/restore IA32_LBR_DEPTH properly.\n" >+ pr_warn(MSG_LBR); Drop the pr_warn() because userspace can trigger it at will. I don't think KVM needs to relay TDX module capabilities to userspace as-is. KVM should advertise a feature only if both TDX module's and KVM's support are in place. if KVM masked out LBR and PERFMON, it should be a problem of userspace and we don't need to warn here. >+ return -EOPNOTSUPP; >+ } >+ >+ return 0; >+} >+ >+static int setup_tdparams(struct kvm *kvm, struct td_params *td_params, >+ struct kvm_tdx_init_vm *init_vm) >+{ >+ struct kvm_cpuid2 *cpuid = &init_vm->cpuid; >+ int ret; >+ >+ if (kvm->created_vcpus) >+ return -EBUSY; -EINVAL >+ >+ if (init_vm->attributes & TDX_TD_ATTRIBUTE_PERFMON) { >+ /* >+ * TODO: save/restore PMU related registers around TDENTER. >+ * Once it's done, remove this guard. >+ */ >+#define MSG_PERFMON "TD doesn't support perfmon yet. KVM needs to save/restore host perf registers properly.\n" >+ pr_warn(MSG_PERFMON); drop the pr_warn(). >+ return -EOPNOTSUPP; >+ } >+ >+ td_params->max_vcpus = kvm->max_vcpus; >+ td_params->attributes = init_vm->attributes; >+ td_params->exec_controls = TDX_CONTROL_FLAG_NO_RBP_MOD; >+ td_params->tsc_frequency = TDX_TSC_KHZ_TO_25MHZ(kvm->arch.default_tsc_khz); >+ >+ ret = setup_tdparams_eptp_controls(cpuid, td_params); >+ if (ret) >+ return ret; >+ setup_tdparams_cpuids(cpuid, td_params); >+ ret = setup_tdparams_xfam(cpuid, td_params); >+ if (ret) >+ return ret; >+ >+#define MEMCPY_SAME_SIZE(dst, src) \ >+ do { \ >+ BUILD_BUG_ON(sizeof(dst) != sizeof(src)); \ >+ memcpy((dst), (src), sizeof(dst)); \ >+ } while (0) >+ >+ MEMCPY_SAME_SIZE(td_params->mrconfigid, init_vm->mrconfigid); >+ MEMCPY_SAME_SIZE(td_params->mrowner, init_vm->mrowner); >+ MEMCPY_SAME_SIZE(td_params->mrownerconfig, init_vm->mrownerconfig); >+ >+ return 0; >+} >+ >+static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params, >+ u64 *seamcall_err) > { > struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); >+ struct tdx_module_args out; > cpumask_var_t packages; > unsigned long *tdcs_pa = NULL; > unsigned long tdr_pa = 0; >@@ -426,6 +581,7 @@ static int __tdx_td_init(struct kvm *kvm) > int ret, i; > u64 err; > >+ *seamcall_err = 0; > ret = tdx_guest_keyid_alloc(); > if (ret < 0) > return ret; >@@ -540,10 +696,23 @@ static int __tdx_td_init(struct kvm *kvm) > } > } > >- /* >- * Note, TDH_MNG_INIT cannot be invoked here. TDH_MNG_INIT requires a dedicated >- * ioctl() to define the configure CPUID values for the TD. >- */ >+ err = tdh_mng_init(kvm_tdx->tdr_pa, __pa(td_params), &out); >+ if ((err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_INVALID) { >+ /* >+ * Because a user gives operands, don't warn. >+ * Return a hint to the user because it's sometimes hard for the >+ * user to figure out which operand is invalid. SEAMCALL status >+ * code includes which operand caused invalid operand error. >+ */ >+ *seamcall_err = err; >+ ret = -EINVAL; >+ goto teardown; >+ } else if (WARN_ON_ONCE(err)) { >+ pr_tdx_error(TDH_MNG_INIT, err, &out); >+ ret = -EIO; >+ goto teardown; >+ } >+ > return 0; > > /* >@@ -586,6 +755,76 @@ static int __tdx_td_init(struct kvm *kvm) > return ret; > } > >+static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd) >+{ >+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); >+ struct kvm_tdx_init_vm *init_vm = NULL; no need to initialize it to NULL. >+ struct td_params *td_params = NULL; >+ int ret; >+ >+ BUILD_BUG_ON(sizeof(*init_vm) != 8 * 1024); >+ BUILD_BUG_ON(sizeof(struct td_params) != 1024); >+ >+ if (is_hkid_assigned(kvm_tdx)) >+ return -EINVAL; >+ >+ if (cmd->flags) >+ return -EINVAL; >+ >+ init_vm = kzalloc(sizeof(*init_vm) + >+ sizeof(init_vm->cpuid.entries[0]) * KVM_MAX_CPUID_ENTRIES, >+ GFP_KERNEL); no need to zero the memory given ... >+ if (!init_vm) >+ return -ENOMEM; >+ if (copy_from_user(init_vm, (void __user *)cmd->data, sizeof(*init_vm))) { ... this. >+ ret = -EFAULT; >+ goto out; >+ } >+ if (init_vm->cpuid.nent > KVM_MAX_CPUID_ENTRIES) { >+ ret = -E2BIG; >+ goto out; >+ } >+ if (copy_from_user(init_vm->cpuid.entries, >+ (void __user *)cmd->data + sizeof(*init_vm), >+ flex_array_size(init_vm, cpuid.entries, init_vm->cpuid.nent))) { >+ ret = -EFAULT; >+ goto out; >+ } >+ >+ if (memchr_inv(init_vm->reserved, 0, sizeof(init_vm->reserved))) { >+ ret = -EINVAL; >+ goto out; >+ }
On 2/26/2024 4:25 PM, isaku.yamahata@intel.com wrote: ... > +static int setup_tdparams_xfam(struct kvm_cpuid2 *cpuid, struct td_params *td_params) > +{ > + const struct kvm_cpuid_entry2 *entry; > + u64 guest_supported_xcr0; > + u64 guest_supported_xss; > + > + /* Setup td_params.xfam */ > + entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, 0xd, 0); > + if (entry) > + guest_supported_xcr0 = (entry->eax | ((u64)entry->edx << 32)); > + else > + guest_supported_xcr0 = 0; > + guest_supported_xcr0 &= kvm_caps.supported_xcr0; > + > + entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, 0xd, 1); > + if (entry) > + guest_supported_xss = (entry->ecx | ((u64)entry->edx << 32)); > + else > + guest_supported_xss = 0; > + > + /* > + * PT and CET can be exposed to TD guest regardless of KVM's XSS, PT > + * and, CET support. > + */ > + guest_supported_xss &= > + (kvm_caps.supported_xss | XFEATURE_MASK_PT | TDX_TD_XFAM_CET); > + > + td_params->xfam = guest_supported_xcr0 | guest_supported_xss; > + if (td_params->xfam & XFEATURE_MASK_LBR) { > + /* > + * TODO: once KVM supports LBR(save/restore LBR related > + * registers around TDENTER), remove this guard. > + */ > +#define MSG_LBR "TD doesn't support LBR yet. KVM needs to save/restore IA32_LBR_DEPTH properly.\n" > + pr_warn(MSG_LBR); > + return -EOPNOTSUPP; This unsupported behavior is totally decided by KVM even if TDX module supports it. I think we need to reflect it in tdx_info->xfam_fixed0, which gets reported to userspace via KVM_TDX_CAPABILITIES. So userspace will aware that LBR is not supported for TDs. > + } > + > + return 0; > +} > + > +static int setup_tdparams(struct kvm *kvm, struct td_params *td_params, > + struct kvm_tdx_init_vm *init_vm) > +{ > + struct kvm_cpuid2 *cpuid = &init_vm->cpuid; > + int ret; > + > + if (kvm->created_vcpus) > + return -EBUSY; > + > + if (init_vm->attributes & TDX_TD_ATTRIBUTE_PERFMON) { > + /* > + * TODO: save/restore PMU related registers around TDENTER. > + * Once it's done, remove this guard. > + */ > +#define MSG_PERFMON "TD doesn't support perfmon yet. KVM needs to save/restore host perf registers properly.\n" > + pr_warn(MSG_PERFMON); > + return -EOPNOTSUPP; similar as above, we need reflect it in tdx_info->attributes_fixed0 > + } > +
On Wed, Mar 20, 2024 at 02:12:49PM +0800, Chao Gao <chao.gao@intel.com> wrote: > >+static void setup_tdparams_cpuids(struct kvm_cpuid2 *cpuid, > >+ struct td_params *td_params) > >+{ > >+ int i; > >+ > >+ /* > >+ * td_params.cpuid_values: The number and the order of cpuid_value must > >+ * be same to the one of struct tdsysinfo.{num_cpuid_config, cpuid_configs} > >+ * It's assumed that td_params was zeroed. > >+ */ > >+ for (i = 0; i < tdx_info->num_cpuid_config; i++) { > >+ const struct kvm_tdx_cpuid_config *c = &tdx_info->cpuid_configs[i]; > >+ /* KVM_TDX_CPUID_NO_SUBLEAF means index = 0. */ > >+ u32 index = c->sub_leaf == KVM_TDX_CPUID_NO_SUBLEAF ? 0 : c->sub_leaf; > >+ const struct kvm_cpuid_entry2 *entry = > >+ kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, > >+ c->leaf, index); > >+ struct tdx_cpuid_value *value = &td_params->cpuid_values[i]; > >+ > >+ if (!entry) > >+ continue; > >+ > >+ /* > >+ * tdsysinfo.cpuid_configs[].{eax, ebx, ecx, edx} > >+ * bit 1 means it can be configured to zero or one. > >+ * bit 0 means it must be zero. > >+ * Mask out non-configurable bits. > >+ */ > >+ value->eax = entry->eax & c->eax; > >+ value->ebx = entry->ebx & c->ebx; > >+ value->ecx = entry->ecx & c->ecx; > >+ value->edx = entry->edx & c->edx; > > Any reason to mask off non-configurable bits rather than return an error? this > is misleading to userspace because guest sees the values emulated by TDX module > instead of the values passed from userspace (i.e., the request from userspace > isn't done but there is no indication of that to userspace). Ok, I'll eliminate them. If user space passes wrong cpuids, TDX module will return error. I'll leave the error check to the TDX module. > >+ } > >+} > >+ > >+static int setup_tdparams_xfam(struct kvm_cpuid2 *cpuid, struct td_params *td_params) > >+{ > >+ const struct kvm_cpuid_entry2 *entry; > >+ u64 guest_supported_xcr0; > >+ u64 guest_supported_xss; > >+ > >+ /* Setup td_params.xfam */ > >+ entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, 0xd, 0); > >+ if (entry) > >+ guest_supported_xcr0 = (entry->eax | ((u64)entry->edx << 32)); > >+ else > >+ guest_supported_xcr0 = 0; > >+ guest_supported_xcr0 &= kvm_caps.supported_xcr0; > >+ > >+ entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, 0xd, 1); > >+ if (entry) > >+ guest_supported_xss = (entry->ecx | ((u64)entry->edx << 32)); > >+ else > >+ guest_supported_xss = 0; > >+ > >+ /* > >+ * PT and CET can be exposed to TD guest regardless of KVM's XSS, PT > >+ * and, CET support. > >+ */ > >+ guest_supported_xss &= > >+ (kvm_caps.supported_xss | XFEATURE_MASK_PT | TDX_TD_XFAM_CET); > >+ > >+ td_params->xfam = guest_supported_xcr0 | guest_supported_xss; > >+ if (td_params->xfam & XFEATURE_MASK_LBR) { > >+ /* > >+ * TODO: once KVM supports LBR(save/restore LBR related > >+ * registers around TDENTER), remove this guard. > >+ */ > >+#define MSG_LBR "TD doesn't support LBR yet. KVM needs to save/restore IA32_LBR_DEPTH properly.\n" > >+ pr_warn(MSG_LBR); > > Drop the pr_warn() because userspace can trigger it at will. > > I don't think KVM needs to relay TDX module capabilities to userspace as-is. > KVM should advertise a feature only if both TDX module's and KVM's support > are in place. if KVM masked out LBR and PERFMON, it should be a problem of > userspace and we don't need to warn here. Makes sense. Drop those message and don't advertise those features to user space.
On Wed, Mar 20, 2024 at 04:15:23PM +0800, Xiaoyao Li <xiaoyao.li@intel.com> wrote: > On 2/26/2024 4:25 PM, isaku.yamahata@intel.com wrote: > > ... > > > +static int setup_tdparams_xfam(struct kvm_cpuid2 *cpuid, struct td_params *td_params) > > +{ > > + const struct kvm_cpuid_entry2 *entry; > > + u64 guest_supported_xcr0; > > + u64 guest_supported_xss; > > + > > + /* Setup td_params.xfam */ > > + entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, 0xd, 0); > > + if (entry) > > + guest_supported_xcr0 = (entry->eax | ((u64)entry->edx << 32)); > > + else > > + guest_supported_xcr0 = 0; > > + guest_supported_xcr0 &= kvm_caps.supported_xcr0; > > + > > + entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, 0xd, 1); > > + if (entry) > > + guest_supported_xss = (entry->ecx | ((u64)entry->edx << 32)); > > + else > > + guest_supported_xss = 0; > > + > > + /* > > + * PT and CET can be exposed to TD guest regardless of KVM's XSS, PT > > + * and, CET support. > > + */ > > + guest_supported_xss &= > > + (kvm_caps.supported_xss | XFEATURE_MASK_PT | TDX_TD_XFAM_CET); > > + > > + td_params->xfam = guest_supported_xcr0 | guest_supported_xss; > > + if (td_params->xfam & XFEATURE_MASK_LBR) { > > + /* > > + * TODO: once KVM supports LBR(save/restore LBR related > > + * registers around TDENTER), remove this guard. > > + */ > > +#define MSG_LBR "TD doesn't support LBR yet. KVM needs to save/restore IA32_LBR_DEPTH properly.\n" > > + pr_warn(MSG_LBR); > > + return -EOPNOTSUPP; > > This unsupported behavior is totally decided by KVM even if TDX module > supports it. I think we need to reflect it in tdx_info->xfam_fixed0, which > gets reported to userspace via KVM_TDX_CAPABILITIES. So userspace will aware > that LBR is not supported for TDs. Yes, we can suppress KVM unpported features. I replied at https://lore.kernel.org/kvm/20240321155513.GL1994522@ls.amr.corp.intel.com/ So far we used KVM_TDX_CAPABILITIES for feature enumeration. I'm wondering about KVM_GET_DEVICE_ATTR [1]. It's future extensible. It's also consistent with SEV. [1] https://lore.kernel.org/r/20240226190344.787149-7-pbonzini@redhat.com
On Mon, 2024-02-26 at 00:25 -0800, isaku.yamahata@intel.com wrote: > +struct kvm_tdx_init_vm { > + __u64 attributes; > + __u64 mrconfigid[6]; /* sha384 digest */ > + __u64 mrowner[6]; /* sha384 digest */ > + __u64 mrownerconfig[6]; /* sha384 digest */ > + /* > + * For future extensibility to make sizeof(struct kvm_tdx_init_vm) = 8KB. > + * This should be enough given sizeof(TD_PARAMS) = 1024. > + * 8KB was chosen given because > + * sizeof(struct kvm_cpuid_entry2) * KVM_MAX_CPUID_ENTRIES(=256) = 8KB. > + */ > + __u64 reserved[1004]; This is insane. You said you want to reserve 8K for CPUID entries, but how can these 1004 * 8 bytes be used for CPUID entries since ... > + > + /* > + * Call KVM_TDX_INIT_VM before vcpu creation, thus before > + * KVM_SET_CPUID2. > + * This configuration supersedes KVM_SET_CPUID2s for VCPUs because the > + * TDX module directly virtualizes those CPUIDs without VMM. The user > + * space VMM, e.g. qemu, should make KVM_SET_CPUID2 consistent with > + * those values. If it doesn't, KVM may have wrong idea of vCPUIDs of > + * the guest, and KVM may wrongly emulate CPUIDs or MSRs that the TDX > + * module doesn't virtualize. > + */ > + struct kvm_cpuid2 cpuid; ... they are actually placed right after here? > +}; > +
On Fri, Mar 22, 2024 at 11:20:01AM +0000, "Huang, Kai" <kai.huang@intel.com> wrote: > On Mon, 2024-02-26 at 00:25 -0800, isaku.yamahata@intel.com wrote: > > +struct kvm_tdx_init_vm { > > + __u64 attributes; > > + __u64 mrconfigid[6]; /* sha384 digest */ > > + __u64 mrowner[6]; /* sha384 digest */ > > + __u64 mrownerconfig[6]; /* sha384 digest */ > > + /* > > + * For future extensibility to make sizeof(struct kvm_tdx_init_vm) = 8KB. > > + * This should be enough given sizeof(TD_PARAMS) = 1024. > > + * 8KB was chosen given because > > + * sizeof(struct kvm_cpuid_entry2) * KVM_MAX_CPUID_ENTRIES(=256) = 8KB. > > + */ > > + __u64 reserved[1004]; > > This is insane. > > You said you want to reserve 8K for CPUID entries, but how can these 1004 * 8 > bytes be used for CPUID entries since ... I tried to overestimate it. It's too much, how about to make it 1024, reserved[109]?
On Fri, 2024-03-22 at 18:22 -0700, Yamahata, Isaku wrote: > On Fri, Mar 22, 2024 at 11:20:01AM +0000, > "Huang, Kai" <kai.huang@intel.com> wrote: > > > On Mon, 2024-02-26 at 00:25 -0800, isaku.yamahata@intel.com wrote: > > > +struct kvm_tdx_init_vm { > > > + __u64 attributes; > > > + __u64 mrconfigid[6]; /* sha384 digest */ > > > + __u64 mrowner[6]; /* sha384 digest */ > > > + __u64 mrownerconfig[6]; /* sha384 digest */ > > > + /* > > > + * For future extensibility to make sizeof(struct kvm_tdx_init_vm) = 8KB. > > > + * This should be enough given sizeof(TD_PARAMS) = 1024. > > > + * 8KB was chosen given because > > > + * sizeof(struct kvm_cpuid_entry2) * KVM_MAX_CPUID_ENTRIES(=256) = 8KB. > > > + */ > > > + __u64 reserved[1004]; > > > > This is insane. > > > > You said you want to reserve 8K for CPUID entries, but how can these 1004 * 8 > > bytes be used for CPUID entries since ... > > I tried to overestimate it. It's too much, how about to make it > 1024, reserved[109]? > I am not sure why we need 1024B either. IIUC, the inputs here in 'kvm_tdx_init_vm' should be a subset of the members in TD_PARAMS. This IOCTL() isn't intended to carry any additional input besides these defined in TD_PARAMS, right? If so, then it seems to me you "at most" only need to reserve the space for the members excluding the CPUID entries, because for the CPUID entries we will always pass them as a flexible array at the end of the structure. Based on the spec, the "non-CPUID-entry" part only occupies 256 bytes. To me it seems we have no reason to reserve more space than 256 bytes.
On Mon, Mar 25, 2024 at 10:39:10AM +0000, "Huang, Kai" <kai.huang@intel.com> wrote: > On Fri, 2024-03-22 at 18:22 -0700, Yamahata, Isaku wrote: > > On Fri, Mar 22, 2024 at 11:20:01AM +0000, > > "Huang, Kai" <kai.huang@intel.com> wrote: > > > > > On Mon, 2024-02-26 at 00:25 -0800, isaku.yamahata@intel.com wrote: > > > > +struct kvm_tdx_init_vm { > > > > + __u64 attributes; > > > > + __u64 mrconfigid[6]; /* sha384 digest */ > > > > + __u64 mrowner[6]; /* sha384 digest */ > > > > + __u64 mrownerconfig[6]; /* sha384 digest */ > > > > + /* > > > > + * For future extensibility to make sizeof(struct kvm_tdx_init_vm) = 8KB. > > > > + * This should be enough given sizeof(TD_PARAMS) = 1024. > > > > + * 8KB was chosen given because > > > > + * sizeof(struct kvm_cpuid_entry2) * KVM_MAX_CPUID_ENTRIES(=256) = 8KB. > > > > + */ > > > > + __u64 reserved[1004]; > > > > > > This is insane. > > > > > > You said you want to reserve 8K for CPUID entries, but how can these 1004 * 8 > > > bytes be used for CPUID entries since ... > > > > I tried to overestimate it. It's too much, how about to make it > > 1024, reserved[109]? > > > > I am not sure why we need 1024B either. > > IIUC, the inputs here in 'kvm_tdx_init_vm' should be a subset of the members in > TD_PARAMS. This IOCTL() isn't intended to carry any additional input besides > these defined in TD_PARAMS, right? > > If so, then it seems to me you "at most" only need to reserve the space for the > members excluding the CPUID entries, because for the CPUID entries we will > always pass them as a flexible array at the end of the structure. > > Based on the spec, the "non-CPUID-entry" part only occupies 256 bytes. To me it > seems we have no reason to reserve more space than 256 bytes. Ok, I'll make it 256 bytes. The alternative is to use key-value. The user space loops to set all necessary parameters. Something like as follows. KVM_TDX_SET_VM_PARAM struct kvm_tdx_vm_param { /* TDCS metadata field. */ __u64 field_id; /* * value for attributes or data less or qeual to __u64. * pointer for sha384, cpuid, or data larger than __u64. */ __u64 value_or_ptr; };
On Thu, 2024-03-21 at 08:55 -0700, Isaku Yamahata wrote: > On Wed, Mar 20, 2024 at 02:12:49PM +0800, > Chao Gao <chao.gao@intel.com> wrote: > > > > +static void setup_tdparams_cpuids(struct kvm_cpuid2 *cpuid, > > > + struct td_params *td_params) > > > +{ > > > + int i; > > > + > > > + /* > > > + * td_params.cpuid_values: The number and the order of cpuid_value must > > > + * be same to the one of struct tdsysinfo.{num_cpuid_config, cpuid_configs} > > > + * It's assumed that td_params was zeroed. > > > + */ > > > + for (i = 0; i < tdx_info->num_cpuid_config; i++) { > > > + const struct kvm_tdx_cpuid_config *c = &tdx_info->cpuid_configs[i]; > > > + /* KVM_TDX_CPUID_NO_SUBLEAF means index = 0. */ > > > + u32 index = c->sub_leaf == KVM_TDX_CPUID_NO_SUBLEAF ? 0 : c->sub_leaf; > > > + const struct kvm_cpuid_entry2 *entry = > > > + kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, > > > + c->leaf, index); > > > + struct tdx_cpuid_value *value = &td_params->cpuid_values[i]; > > > + > > > + if (!entry) > > > + continue; > > > + > > > + /* > > > + * tdsysinfo.cpuid_configs[].{eax, ebx, ecx, edx} > > > + * bit 1 means it can be configured to zero or one. > > > + * bit 0 means it must be zero. > > > + * Mask out non-configurable bits. > > > + */ > > > + value->eax = entry->eax & c->eax; > > > + value->ebx = entry->ebx & c->ebx; > > > + value->ecx = entry->ecx & c->ecx; > > > + value->edx = entry->edx & c->edx; > > > > Any reason to mask off non-configurable bits rather than return an error? this > > is misleading to userspace because guest sees the values emulated by TDX module > > instead of the values passed from userspace (i.e., the request from userspace > > isn't done but there is no indication of that to userspace). > > Ok, I'll eliminate them. If user space passes wrong cpuids, TDX module will > return error. I'll leave the error check to the TDX module. I was just looking at this. Agreed. It breaks the selftests though.
On 3/28/2024 9:12 AM, Edgecombe, Rick P wrote: > On Thu, 2024-03-21 at 08:55 -0700, Isaku Yamahata wrote: >> On Wed, Mar 20, 2024 at 02:12:49PM +0800, >> Chao Gao <chao.gao@intel.com> wrote: >> >>>> +static void setup_tdparams_cpuids(struct kvm_cpuid2 *cpuid, >>>> + struct td_params *td_params) >>>> +{ >>>> + int i; >>>> + >>>> + /* >>>> + * td_params.cpuid_values: The number and the order of cpuid_value must >>>> + * be same to the one of struct tdsysinfo.{num_cpuid_config, cpuid_configs} >>>> + * It's assumed that td_params was zeroed. >>>> + */ >>>> + for (i = 0; i < tdx_info->num_cpuid_config; i++) { >>>> + const struct kvm_tdx_cpuid_config *c = &tdx_info->cpuid_configs[i]; >>>> + /* KVM_TDX_CPUID_NO_SUBLEAF means index = 0. */ >>>> + u32 index = c->sub_leaf == KVM_TDX_CPUID_NO_SUBLEAF ? 0 : c->sub_leaf; >>>> + const struct kvm_cpuid_entry2 *entry = >>>> + kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, >>>> + c->leaf, index); >>>> + struct tdx_cpuid_value *value = &td_params->cpuid_values[i]; >>>> + >>>> + if (!entry) >>>> + continue; >>>> + >>>> + /* >>>> + * tdsysinfo.cpuid_configs[].{eax, ebx, ecx, edx} >>>> + * bit 1 means it can be configured to zero or one. >>>> + * bit 0 means it must be zero. >>>> + * Mask out non-configurable bits. >>>> + */ >>>> + value->eax = entry->eax & c->eax; >>>> + value->ebx = entry->ebx & c->ebx; >>>> + value->ecx = entry->ecx & c->ecx; >>>> + value->edx = entry->edx & c->edx; >>> >>> Any reason to mask off non-configurable bits rather than return an error? this >>> is misleading to userspace because guest sees the values emulated by TDX module >>> instead of the values passed from userspace (i.e., the request from userspace >>> isn't done but there is no indication of that to userspace). >> >> Ok, I'll eliminate them. If user space passes wrong cpuids, TDX module will >> return error. I'll leave the error check to the TDX module. > > I was just looking at this. Agreed. It breaks the selftests though. If all you prefer to go this direction, then please update the error handling of this specific SEAMCALL.
On Thu, 2024-03-28 at 09:36 +0800, Xiaoyao Li wrote: > > > > Any reason to mask off non-configurable bits rather than return an error? this > > > > is misleading to userspace because guest sees the values emulated by TDX module > > > > instead of the values passed from userspace (i.e., the request from userspace > > > > isn't done but there is no indication of that to userspace). > > > > > > Ok, I'll eliminate them. If user space passes wrong cpuids, TDX module will > > > return error. I'll leave the error check to the TDX module. > > > > I was just looking at this. Agreed. It breaks the selftests though. > > If all you prefer to go this direction, then please update the error > handling of this specific SEAMCALL. What do you mean by SEAMCALL, TDH_MNG_INIT? Can you be more specific?
On 3/29/2024 2:26 AM, Edgecombe, Rick P wrote: > On Thu, 2024-03-28 at 09:36 +0800, Xiaoyao Li wrote: >>>>> Any reason to mask off non-configurable bits rather than return an error? this >>>>> is misleading to userspace because guest sees the values emulated by TDX module >>>>> instead of the values passed from userspace (i.e., the request from userspace >>>>> isn't done but there is no indication of that to userspace). >>>> >>>> Ok, I'll eliminate them. If user space passes wrong cpuids, TDX module will >>>> return error. I'll leave the error check to the TDX module. >>> >>> I was just looking at this. Agreed. It breaks the selftests though. >> >> If all you prefer to go this direction, then please update the error >> handling of this specific SEAMCALL. > > What do you mean by SEAMCALL, TDH_MNG_INIT? Can you be more specific? Sorry. I missed the fact that current patch already has the specific handling for TDX_OPERAND_INVALID for TDH.MNG.INIT. I need to update QEMU to match the new behavior.
On 26/02/2024 9:25 pm, isaku.yamahata@intel.com wrote: > From: Isaku Yamahata <isaku.yamahata@intel.com> > > TDX requires additional parameters for TDX VM for confidential execution to > protect the confidentiality of its memory contents and CPU state from any > other software, including VMM. Hmm.. not only "confidentiality" but also "integrity". And the "per-VM" TDX initializaiton here actually has nothing to do with "crypto-protection", because the establishment of the key has already been done before reaching here. I would just say: After the crypto-protection key has been configured, TDX requires a VM-scope initialization as a step of creating the TDX guest. This "per-VM" TDX initialization does the global configurations/features that the TDX guest can support, such as guest's CPUIDs (emulated by the TDX module), the maximum number of vcpus etc. When creating a guest TD VM before creating > vcpu, the number of vcpu, TSC frequency (the values are the same among > vcpus, and it can't change.) CPUIDs which the TDX module emulates. I cannot parse this sentence. It doesn't look like a sentence to me. Guest > TDs can trust those CPUIDs and sha384 values for measurement. Trustness is not about the "guest can trust", but the "people using the guest can trust". Just remove it. If you want to emphasize the attestation, you can add something like: " It also passes the VM's measurement and hash of the signer etc and the hardware only allows to initialize the TDX guest when that match. " > > Add a new subcommand, KVM_TDX_INIT_VM, to pass parameters for the TDX > guest. [...] It assigns an encryption key to the TDX guest for memory > encryption. TDX encrypts memory per guest basis. No it doesn't. The key has been programmed already in your previous patch. The device model, say > qemu, passes per-VM parameters for the TDX guest. This is implied by your first sentence of this paragraph. The maximum number of > vcpus, TSC frequency (TDX guest has fixed VM-wide TSC frequency, not per > vcpu. The TDX guest can not change it.), attributes (production or debug), > available extended features (which configure guest XCR0, IA32_XSS MSR), > CPUIDs, sha384 measurements, etc. This is not a sentence. > > Call this subcommand before creating vcpu and KVM_SET_CPUID2, i.e. CPUID > configurations aren't available yet. " This "per-VM" TDX initialization must be done before any "vcpu-scope" TDX initialization. To match this better, require the KVM_TDX_INIT_VM IOCTL() to be done before KVM creates any vcpus. Note KVM configures the VM's CPUIDs in KVM_SET_CPUID2 via vcpu. The downside of this approach is KVM will need to do some enforcement later to make sure the consisntency between the CPUIDs passed here and the CPUIDs done in KVM_SET_CPUID2. " So CPUIDs configuration values need > to be passed in struct kvm_tdx_init_vm. The device model's responsibility > to make this CPUID config for KVM_TDX_INIT_VM and KVM_SET_CPUID2. And I would leave how to handle KVM_SET_CPUID2 to the patch that actually enforces the consisntency. > > Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com> Missing Co-developed-by tag for Xiaoyao. > Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com> [...] > > +struct kvm_cpuid_entry2 *kvm_find_cpuid_entry2( > + struct kvm_cpuid_entry2 *entries, int nent, u32 function, u64 index) > +{ > + return cpuid_entry2_find(entries, nent, function, index); > +} > +EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry2); Not sure whether we can export cpuid_entry2_find() directly? No strong opinion of course. But if we want to expose the wrapper, looks ... > + > struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu, > u32 function, u32 index) > { > diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h > index 856e3037e74f..215d1c68c6d1 100644 > --- a/arch/x86/kvm/cpuid.h > +++ b/arch/x86/kvm/cpuid.h > @@ -13,6 +13,8 @@ void kvm_set_cpu_caps(void); > > void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu); > void kvm_update_pv_runtime(struct kvm_vcpu *vcpu); > +struct kvm_cpuid_entry2 *kvm_find_cpuid_entry2(struct kvm_cpuid_entry2 *entries, > + int nent, u32 function, u64 index); > struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu, > u32 function, u32 index); > struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, ... __kvm_find_cpuid_entry() would fit better? > diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c > index 1cf2b15da257..b11f105db3cd 100644 > --- a/arch/x86/kvm/vmx/tdx.c > +++ b/arch/x86/kvm/vmx/tdx.c > @@ -8,7 +8,6 @@ > #include "mmu.h" > #include "tdx_arch.h" > #include "tdx.h" > -#include "tdx_ops.h" ?? If it isn't needed, then it shouldn't be included in some previous patch. > #include "x86.h" > > #undef pr_fmt > @@ -350,18 +349,21 @@ static int tdx_do_tdh_mng_key_config(void *param) > return 0; > } > > -static int __tdx_td_init(struct kvm *kvm); > - > int tdx_vm_init(struct kvm *kvm) > { > + /* > + * This function initializes only KVM software construct. It doesn't > + * initialize TDX stuff, e.g. TDCS, TDR, TDCX, HKID etc. > + * It is handled by KVM_TDX_INIT_VM, __tdx_td_init(). > + */ > + > /* > * TDX has its own limit of the number of vcpus in addition to > * KVM_MAX_VCPUS. > */ > kvm->max_vcpus = min(kvm->max_vcpus, TDX_MAX_VCPUS); > > - /* Place holder for TDX specific logic. */ > - return __tdx_td_init(kvm); > + return 0; ?? I don't quite understand. What's wrong of still calling __tdx_td_init() in tdx_vm_init()? If there's anything preventing doing __tdx_td_init() from tdx_vm_init(), then it's wrong to implement that in your previous patch. [...]
On Mon, 2024-02-26 at 00:25 -0800, isaku.yamahata@intel.com wrote: > +static int setup_tdparams_xfam(struct kvm_cpuid2 *cpuid, struct td_params *td_params) > +{ > + const struct kvm_cpuid_entry2 *entry; > + u64 guest_supported_xcr0; > + u64 guest_supported_xss; > + > + /* Setup td_params.xfam */ > + entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, 0xd, 0); > + if (entry) > + guest_supported_xcr0 = (entry->eax | ((u64)entry->edx << 32)); > + else > + guest_supported_xcr0 = 0; > + guest_supported_xcr0 &= kvm_caps.supported_xcr0; > + > + entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, 0xd, 1); > + if (entry) > + guest_supported_xss = (entry->ecx | ((u64)entry->edx << 32)); > + else > + guest_supported_xss = 0; > + > + /* > + * PT and CET can be exposed to TD guest regardless of KVM's XSS, PT > + * and, CET support. > + */ > + guest_supported_xss &= > + (kvm_caps.supported_xss | XFEATURE_MASK_PT | TDX_TD_XFAM_CET); So this enables features based on xss support in the passed CPUID, but these features are not dependent xsave. You could have CET without xsave support. And in fact Kernel IBT doesn't use it. To utilize CPUID leafs to configure features, but diverge from the HW meaning seems like asking for trouble. > + > + td_params->xfam = guest_supported_xcr0 | guest_supported_xss; > + if (td_params->xfam & XFEATURE_MASK_LBR) { > + /* > + * TODO: once KVM supports LBR(save/restore LBR related > + * registers around TDENTER), remove this guard. > + */ > +#define MSG_LBR "TD doesn't support LBR yet. KVM needs to save/restore IA32_LBR_DEPTH > properly.\n" > + pr_warn(MSG_LBR); > + return -EOPNOTSUPP; > + } > + > + return 0; > +} > + > +static int setup_tdparams(struct kvm *kvm, struct td_params *td_params, > + struct kvm_tdx_init_vm *init_vm) > +{ > + struct kvm_cpuid2 *cpuid = &init_vm->cpuid; > + int ret; > + > + if (kvm->created_vcpus) > + return -EBUSY; > + > + if (init_vm->attributes & TDX_TD_ATTRIBUTE_PERFMON) { > + /* > + * TODO: save/restore PMU related registers around TDENTER. > + * Once it's done, remove this guard. > + */ > +#define MSG_PERFMON "TD doesn't support perfmon yet. KVM needs to save/restore host perf > registers properly.\n" > + pr_warn(MSG_PERFMON); We need to remove the TODOs and a warn doesn't seem appropriate. > + return -EOPNOTSUPP; > + } > + > + td_params->max_vcpus = kvm->max_vcpus; > + td_params->attributes = init_vm->attributes; Don't we need to sanitize this for a selection of features known to KVM. For example what if something else like TDX_TD_ATTRIBUTE_PERFMON is added to a future TDX module and then suddenly userspace can configure it. So xfam is how to control features that are tied to save (CET, etc). And ATTRIBUTES are tied to features without xsave support (PKS, etc). If we are going to use CPUID for specifying which features should get enabled in the TDX module, we should match the arch definitions of the leafs. For things like CET whether xfam controls the value of multiple CPUID leafs, then we need should check that they are all set to some consistent values and otherwise reject them. So for CET we would need to check the SHSTK and IBT bits, as well as two XCR0 bits. If we are going to do that for XFAM based features, then why not do the same for ATTRIBUTE based features? We would need something like GET_SUPPORTED_CPUID for TDX, but also since some features can be forced on we would need to expose something like GET_SUPPORTED_CPUID_REQUIRED as well. > + td_params->exec_controls = TDX_CONTROL_FLAG_NO_RBP_MOD; > + td_params->tsc_frequency = TDX_TSC_KHZ_TO_25MHZ(kvm->arch.default_tsc_khz);
On Thu, Apr 04, 2024 at 12:59:45PM +1300, "Huang, Kai" <kai.huang@intel.com> wrote: > > > On 26/02/2024 9:25 pm, isaku.yamahata@intel.com wrote: > > From: Isaku Yamahata <isaku.yamahata@intel.com> > > > > TDX requires additional parameters for TDX VM for confidential execution to > > protect the confidentiality of its memory contents and CPU state from any > > other software, including VMM. > > Hmm.. not only "confidentiality" but also "integrity". And the "per-VM" TDX > initializaiton here actually has nothing to do with "crypto-protection", > because the establishment of the key has already been done before reaching > here. > > I would just say: > > After the crypto-protection key has been configured, TDX requires a VM-scope > initialization as a step of creating the TDX guest. This "per-VM" TDX > initialization does the global configurations/features that the TDX guest > can support, such as guest's CPUIDs (emulated by the TDX module), the > maximum number of vcpus etc. > > > > > When creating a guest TD VM before creating > > vcpu, the number of vcpu, TSC frequency (the values are the same among > > vcpus, and it can't change.) CPUIDs which the TDX module emulates. > > I cannot parse this sentence. It doesn't look like a sentence to me. > > Guest > > TDs can trust those CPUIDs and sha384 values for measurement. > > Trustness is not about the "guest can trust", but the "people using the > guest can trust". > > Just remove it. > > If you want to emphasize the attestation, you can add something like: > > " > It also passes the VM's measurement and hash of the signer etc and the > hardware only allows to initialize the TDX guest when that match. > " > > > > > Add a new subcommand, KVM_TDX_INIT_VM, to pass parameters for the TDX > > guest. > > [...] > > It assigns an encryption key to the TDX guest for memory > > encryption. TDX encrypts memory per guest basis. > > No it doesn't. The key has been programmed already in your previous patch. > > The device model, say > > qemu, passes per-VM parameters for the TDX guest. > > This is implied by your first sentence of this paragraph. > > The maximum number of > > vcpus, TSC frequency (TDX guest has fixed VM-wide TSC frequency, not per > > vcpu. The TDX guest can not change it.), attributes (production or debug), > > available extended features (which configure guest XCR0, IA32_XSS MSR), > > CPUIDs, sha384 measurements, etc. > > This is not a sentence. > > > > > Call this subcommand before creating vcpu and KVM_SET_CPUID2, i.e. CPUID > > configurations aren't available yet. > > " > This "per-VM" TDX initialization must be done before any "vcpu-scope" TDX > initialization. To match this better, require the KVM_TDX_INIT_VM IOCTL() > to be done before KVM creates any vcpus. > > Note KVM configures the VM's CPUIDs in KVM_SET_CPUID2 via vcpu. The > downside of this approach is KVM will need to do some enforcement later to > make sure the consisntency between the CPUIDs passed here and the CPUIDs > done in KVM_SET_CPUID2. > " Thanks for the draft. Let me update it. > So CPUIDs configuration values need > > to be passed in struct kvm_tdx_init_vm. The device model's responsibility > > to make this CPUID config for KVM_TDX_INIT_VM and KVM_SET_CPUID2. > > And I would leave how to handle KVM_SET_CPUID2 to the patch that actually > enforces the consisntency. Yes, that's a different discussion. > > +struct kvm_cpuid_entry2 *kvm_find_cpuid_entry2( > > + struct kvm_cpuid_entry2 *entries, int nent, u32 function, u64 index) > > +{ > > + return cpuid_entry2_find(entries, nent, function, index); > > +} > > +EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry2); > > Not sure whether we can export cpuid_entry2_find() directly? > > No strong opinion of course. > > But if we want to expose the wrapper, looks ... Almost all KVM exported symbols have kvm_ prefix. I'm afraid that cpuid is too common. We can rename the function directly without wrapper. > > + > > struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu, > > u32 function, u32 index) > > { > > diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h > > index 856e3037e74f..215d1c68c6d1 100644 > > --- a/arch/x86/kvm/cpuid.h > > +++ b/arch/x86/kvm/cpuid.h > > @@ -13,6 +13,8 @@ void kvm_set_cpu_caps(void); > > void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu); > > void kvm_update_pv_runtime(struct kvm_vcpu *vcpu); > > +struct kvm_cpuid_entry2 *kvm_find_cpuid_entry2(struct kvm_cpuid_entry2 *entries, > > + int nent, u32 function, u64 index); > > struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu, > > u32 function, u32 index); > struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, > > ... __kvm_find_cpuid_entry() would fit better? Ok, let's rename it. > > diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c > > index 1cf2b15da257..b11f105db3cd 100644 > > --- a/arch/x86/kvm/vmx/tdx.c > > +++ b/arch/x86/kvm/vmx/tdx.c > > @@ -8,7 +8,6 @@ > > #include "mmu.h" > > #include "tdx_arch.h" > > #include "tdx.h" > > -#include "tdx_ops.h" > > ?? > > If it isn't needed, then it shouldn't be included in some previous patch. Will fix. > > #include "x86.h" > > #undef pr_fmt > > @@ -350,18 +349,21 @@ static int tdx_do_tdh_mng_key_config(void *param) > > return 0; > > } > > -static int __tdx_td_init(struct kvm *kvm); > > - > > int tdx_vm_init(struct kvm *kvm) > > { > > + /* > > + * This function initializes only KVM software construct. It doesn't > > + * initialize TDX stuff, e.g. TDCS, TDR, TDCX, HKID etc. > > + * It is handled by KVM_TDX_INIT_VM, __tdx_td_init(). > > + */ > > + > > /* > > * TDX has its own limit of the number of vcpus in addition to > > * KVM_MAX_VCPUS. > > */ > > kvm->max_vcpus = min(kvm->max_vcpus, TDX_MAX_VCPUS); > > - /* Place holder for TDX specific logic. */ > > - return __tdx_td_init(kvm); > > + return 0; > > ?? > > I don't quite understand. What's wrong of still calling __tdx_td_init() in > tdx_vm_init()? > > If there's anything preventing doing __tdx_td_init() from tdx_vm_init(), > then it's wrong to implement that in your previous patch. Yes. As discussed the previous patch is too big, we need to break the previous patch and this patch.
On Mon, Apr 08, 2024 at 06:38:56PM +0000, "Edgecombe, Rick P" <rick.p.edgecombe@intel.com> wrote: > On Mon, 2024-02-26 at 00:25 -0800, isaku.yamahata@intel.com wrote: > > +static int setup_tdparams_xfam(struct kvm_cpuid2 *cpuid, struct td_params *td_params) > > +{ > > + const struct kvm_cpuid_entry2 *entry; > > + u64 guest_supported_xcr0; > > + u64 guest_supported_xss; > > + > > + /* Setup td_params.xfam */ > > + entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, 0xd, 0); > > + if (entry) > > + guest_supported_xcr0 = (entry->eax | ((u64)entry->edx << 32)); > > + else > > + guest_supported_xcr0 = 0; > > + guest_supported_xcr0 &= kvm_caps.supported_xcr0; > > + > > + entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, 0xd, 1); > > + if (entry) > > + guest_supported_xss = (entry->ecx | ((u64)entry->edx << 32)); > > + else > > + guest_supported_xss = 0; > > + > > + /* > > + * PT and CET can be exposed to TD guest regardless of KVM's XSS, PT > > + * and, CET support. > > + */ > > + guest_supported_xss &= > > + (kvm_caps.supported_xss | XFEATURE_MASK_PT | TDX_TD_XFAM_CET); > > So this enables features based on xss support in the passed CPUID, but these features are not > dependent xsave. You could have CET without xsave support. And in fact Kernel IBT doesn't use it. To > utilize CPUID leafs to configure features, but diverge from the HW meaning seems like asking for > trouble. TDX module checks the consistency. KVM can rely on it not to re-implement it. The TDX Base Architecture specification describes what check is done. Table 11.4: Extended Features Enumeration and Execution Control > > + > > + td_params->xfam = guest_supported_xcr0 | guest_supported_xss; > > + if (td_params->xfam & XFEATURE_MASK_LBR) { > > + /* > > + * TODO: once KVM supports LBR(save/restore LBR related > > + * registers around TDENTER), remove this guard. > > + */ > > +#define MSG_LBR "TD doesn't support LBR yet. KVM needs to save/restore IA32_LBR_DEPTH > > properly.\n" > > + pr_warn(MSG_LBR); > > + return -EOPNOTSUPP; > > + } > > + > > + return 0; > > +} > > + > > +static int setup_tdparams(struct kvm *kvm, struct td_params *td_params, > > + struct kvm_tdx_init_vm *init_vm) > > +{ > > + struct kvm_cpuid2 *cpuid = &init_vm->cpuid; > > + int ret; > > + > > + if (kvm->created_vcpus) > > + return -EBUSY; > > + > > + if (init_vm->attributes & TDX_TD_ATTRIBUTE_PERFMON) { > > + /* > > + * TODO: save/restore PMU related registers around TDENTER. > > + * Once it's done, remove this guard. > > + */ > > +#define MSG_PERFMON "TD doesn't support perfmon yet. KVM needs to save/restore host perf > > registers properly.\n" > > + pr_warn(MSG_PERFMON); > > We need to remove the TODOs and a warn doesn't seem appropriate. Sure, let me drop them. > > + return -EOPNOTSUPP; > > + } > > + > > + td_params->max_vcpus = kvm->max_vcpus; > > + td_params->attributes = init_vm->attributes; > > Don't we need to sanitize this for a selection of features known to KVM. For example what if > something else like TDX_TD_ATTRIBUTE_PERFMON is added to a future TDX module and then suddenly > userspace can configure it. > > So xfam is how to control features that are tied to save (CET, etc). And ATTRIBUTES are tied to > features without xsave support (PKS, etc). > > If we are going to use CPUID for specifying which features should get enabled in the TDX module, we > should match the arch definitions of the leafs. For things like CET whether xfam controls the value > of multiple CPUID leafs, then we need should check that they are all set to some consistent values > and otherwise reject them. So for CET we would need to check the SHSTK and IBT bits, as well as two > XCR0 bits. > > If we are going to do that for XFAM based features, then why not do the same for ATTRIBUTE based > features? > > We would need something like GET_SUPPORTED_CPUID for TDX, but also since some features can be forced > on we would need to expose something like GET_SUPPORTED_CPUID_REQUIRED as well. I agree to reject attributes unknown to KVM. Let's add the check. The TDX module checks consistency between attributes, xfam, and cpuids as described in the spec, KVM can rely on it. When TDX module finds inconsistency (or anything bad), it returns error as SEAMCALL error status code. It includes which cpuid is bad. KVM returns it to the userspace VMM in struct kvm_tdx_cmd.error. We don't have to re-implement similar checks.
On Thu, 2024-04-11 at 12:26 -0700, Isaku Yamahata wrote: > > > > So this enables features based on xss support in the passed CPUID, but these > > features are not > > dependent xsave. You could have CET without xsave support. And in fact > > Kernel IBT doesn't use it. To > > utilize CPUID leafs to configure features, but diverge from the HW meaning > > seems like asking for > > trouble. > > TDX module checks the consistency. KVM can rely on it not to re-implement it. > The TDX Base Architecture specification describes what check is done. > Table 11.4: Extended Features Enumeration and Execution Control The point is that it is an strange interface. Why not take XFAM as a specific field in struct kvm_tdx_init_vm?
On Thu, Apr 11, 2024 at 07:51:55PM +0000, "Edgecombe, Rick P" <rick.p.edgecombe@intel.com> wrote: > On Thu, 2024-04-11 at 12:26 -0700, Isaku Yamahata wrote: > > > > > > So this enables features based on xss support in the passed CPUID, but these > > > features are not > > > dependent xsave. You could have CET without xsave support. And in fact > > > Kernel IBT doesn't use it. To > > > utilize CPUID leafs to configure features, but diverge from the HW meaning > > > seems like asking for > > > trouble. > > > > TDX module checks the consistency. KVM can rely on it not to re-implement it. > > The TDX Base Architecture specification describes what check is done. > > Table 11.4: Extended Features Enumeration and Execution Control > > The point is that it is an strange interface. Why not take XFAM as a specific > field in struct kvm_tdx_init_vm? Now I see your point. Yes, we can add xfam to struct kvm_tdx_init_vm and move the burden to create xfam from the kernel to the user space.
On Thu, 2024-04-11 at 13:46 -0700, Isaku Yamahata wrote: > On Thu, Apr 11, 2024 at 07:51:55PM +0000, > "Edgecombe, Rick P" <rick.p.edgecombe@intel.com> wrote: > > > On Thu, 2024-04-11 at 12:26 -0700, Isaku Yamahata wrote: > > > > > > > > So this enables features based on xss support in the passed CPUID, but > > > > these > > > > features are not > > > > dependent xsave. You could have CET without xsave support. And in fact > > > > Kernel IBT doesn't use it. To > > > > utilize CPUID leafs to configure features, but diverge from the HW > > > > meaning > > > > seems like asking for > > > > trouble. > > > > > > TDX module checks the consistency. KVM can rely on it not to re-implement > > > it. > > > The TDX Base Architecture specification describes what check is done. > > > Table 11.4: Extended Features Enumeration and Execution Control > > > > The point is that it is an strange interface. Why not take XFAM as a > > specific > > field in struct kvm_tdx_init_vm? > > Now I see your point. Yes, we can add xfam to struct kvm_tdx_init_vm and > move the burden to create xfam from the kernel to the user space. Oh, right. Qemu would have to figure out how to take it's CPUID based model and convert it to XFAM. I still think it would be better, but I was only thinking of it from KVM's perspective. We can see how the API discussion on the PUCK call resolves.
On Mon, Feb 26, 2024 at 12:25:41AM -0800, isaku.yamahata@intel.com wrote: > @@ -725,6 +967,17 @@ static int __init tdx_module_setup(void) > > tdx_info->nr_tdcs_pages = tdcs_base_size / PAGE_SIZE; > > + /* > + * Make TDH.VP.ENTER preserve RBP so that the stack unwinder > + * always work around it. Query the feature. > + */ > + if (!(tdx_info->features0 & MD_FIELD_ID_FEATURES0_NO_RBP_MOD) && > + !IS_ENABLED(CONFIG_FRAME_POINTER)) { I think it supposed to be IS_ENABLED(CONFIG_FRAME_POINTER). "!" shouldn't be here. > + pr_err("Too old version of TDX module. Consider upgrade.\n"); > + ret = -EOPNOTSUPP; > + goto error_out; > + } > + > return 0; >
On 17.05.24 16:32, Kirill A. Shutemov wrote: > On Mon, Feb 26, 2024 at 12:25:41AM -0800, isaku.yamahata@intel.com wrote: >> @@ -725,6 +967,17 @@ static int __init tdx_module_setup(void) >> >> tdx_info->nr_tdcs_pages = tdcs_base_size / PAGE_SIZE; >> >> + /* >> + * Make TDH.VP.ENTER preserve RBP so that the stack unwinder >> + * always work around it. Query the feature. >> + */ >> + if (!(tdx_info->features0 & MD_FIELD_ID_FEATURES0_NO_RBP_MOD) && >> + !IS_ENABLED(CONFIG_FRAME_POINTER)) { > > I think it supposed to be IS_ENABLED(CONFIG_FRAME_POINTER). "!" shouldn't > be here. No, I don't think so. With CONFIG_FRAME_POINTER %rbp is being saved and restored, so there is no problem in case the seamcall is clobbering it. Juergen > >> + pr_err("Too old version of TDX module. Consider upgrade.\n"); >> + ret = -EOPNOTSUPP; >> + goto error_out; >> + } >> + >> return 0; >> >
On Fri, May 17, 2024 at 04:37:16PM +0200, Juergen Gross wrote: > On 17.05.24 16:32, Kirill A. Shutemov wrote: > > On Mon, Feb 26, 2024 at 12:25:41AM -0800, isaku.yamahata@intel.com wrote: > > > @@ -725,6 +967,17 @@ static int __init tdx_module_setup(void) > > > tdx_info->nr_tdcs_pages = tdcs_base_size / PAGE_SIZE; > > > + /* > > > + * Make TDH.VP.ENTER preserve RBP so that the stack unwinder > > > + * always work around it. Query the feature. > > > + */ > > > + if (!(tdx_info->features0 & MD_FIELD_ID_FEATURES0_NO_RBP_MOD) && > > > + !IS_ENABLED(CONFIG_FRAME_POINTER)) { > > > > I think it supposed to be IS_ENABLED(CONFIG_FRAME_POINTER). "!" shouldn't > > be here. > > No, I don't think so. > > With CONFIG_FRAME_POINTER %rbp is being saved and restored, so there is no > problem in case the seamcall is clobbering it. Could you check setup_tdparams() in your tree? Commit [SEAM-WORKAROUND] KVM: TDX: Don't use NO_RBP_MOD for backward compatibility in my tree comments out the setting TDX_CONTROL_FLAG_NO_RBP_MOD. I now remember there was problem in EDK2 using RBP. So the patch is temporary until EDK2 is fixed.
On 17.05.24 16:53, Kirill A. Shutemov wrote: > On Fri, May 17, 2024 at 04:37:16PM +0200, Juergen Gross wrote: >> On 17.05.24 16:32, Kirill A. Shutemov wrote: >>> On Mon, Feb 26, 2024 at 12:25:41AM -0800, isaku.yamahata@intel.com wrote: >>>> @@ -725,6 +967,17 @@ static int __init tdx_module_setup(void) >>>> tdx_info->nr_tdcs_pages = tdcs_base_size / PAGE_SIZE; >>>> + /* >>>> + * Make TDH.VP.ENTER preserve RBP so that the stack unwinder >>>> + * always work around it. Query the feature. >>>> + */ >>>> + if (!(tdx_info->features0 & MD_FIELD_ID_FEATURES0_NO_RBP_MOD) && >>>> + !IS_ENABLED(CONFIG_FRAME_POINTER)) { >>> >>> I think it supposed to be IS_ENABLED(CONFIG_FRAME_POINTER). "!" shouldn't >>> be here. >> >> No, I don't think so. >> >> With CONFIG_FRAME_POINTER %rbp is being saved and restored, so there is no >> problem in case the seamcall is clobbering it. > > Could you check setup_tdparams() in your tree? > > Commit > > [SEAM-WORKAROUND] KVM: TDX: Don't use NO_RBP_MOD for backward compatibility > > in my tree comments out the setting TDX_CONTROL_FLAG_NO_RBP_MOD. > > I now remember there was problem in EDK2 using RBP. So the patch is > temporary until EDK2 is fixed. > I have the following line in setup_tdparams() (not commented out): td_params->exec_controls = TDX_CONTROL_FLAG_NO_RBP_MOD; Juergen
On Fri, May 17, 2024 at 05:00:19PM +0200, Jürgen Groß wrote: > On 17.05.24 16:53, Kirill A. Shutemov wrote: > > On Fri, May 17, 2024 at 04:37:16PM +0200, Juergen Gross wrote: > > > On 17.05.24 16:32, Kirill A. Shutemov wrote: > > > > On Mon, Feb 26, 2024 at 12:25:41AM -0800, isaku.yamahata@intel.com wrote: > > > > > @@ -725,6 +967,17 @@ static int __init tdx_module_setup(void) > > > > > tdx_info->nr_tdcs_pages = tdcs_base_size / PAGE_SIZE; > > > > > + /* > > > > > + * Make TDH.VP.ENTER preserve RBP so that the stack unwinder > > > > > + * always work around it. Query the feature. > > > > > + */ > > > > > + if (!(tdx_info->features0 & MD_FIELD_ID_FEATURES0_NO_RBP_MOD) && > > > > > + !IS_ENABLED(CONFIG_FRAME_POINTER)) { > > > > > > > > I think it supposed to be IS_ENABLED(CONFIG_FRAME_POINTER). "!" shouldn't > > > > be here. > > > > > > No, I don't think so. > > > > > > With CONFIG_FRAME_POINTER %rbp is being saved and restored, so there is no > > > problem in case the seamcall is clobbering it. > > > > Could you check setup_tdparams() in your tree? > > > > Commit > > > > [SEAM-WORKAROUND] KVM: TDX: Don't use NO_RBP_MOD for backward compatibility > > > > in my tree comments out the setting TDX_CONTROL_FLAG_NO_RBP_MOD. > > > > I now remember there was problem in EDK2 using RBP. So the patch is > > temporary until EDK2 is fixed. > > > > I have the following line in setup_tdparams() (not commented out): > > td_params->exec_controls = TDX_CONTROL_FLAG_NO_RBP_MOD; Could you check if it is visible from the guest side? It is zero for me. diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index c1cb90369915..f65993a6066d 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -822,13 +822,33 @@ static bool tdx_enc_status_change_finish(unsigned long vaddr, int numpages, return true; } +#define TDG_VM_RD 7 + +#define TDCS_CONFIG_FLAGS 0x1110000300000016 + +#define TDCS_CONFIG_NO_RBP_MOD BIT_ULL(2) + +/* Read TD-scoped metadata */ +static inline u64 tdg_vm_rd(u64 field, u64 *value) +{ + struct tdx_module_args args = { + .rdx = field, + }; + u64 ret; + + ret = __tdcall_ret(TDG_VM_RD, &args); + *value = args.r8; + + return ret; +} + void __init tdx_early_init(void) { struct tdx_module_args args = { .rdx = TDCS_NOTIFY_ENABLES, .r9 = -1ULL, }; - u64 cc_mask; + u64 cc_mask, config; u32 eax, sig[3]; cpuid_count(TDX_CPUID_LEAF_ID, 0, &eax, &sig[0], &sig[2], &sig[1]); @@ -893,4 +913,7 @@ void __init tdx_early_init(void) x86_cpuinit.parallel_bringup = false; pr_info("Guest detected\n"); + + tdg_vm_rd(TDCS_CONFIG_FLAGS, &config); + printk("NO_RBP_MOD: %#llx\n", config & TDCS_CONFIG_NO_RBP_MOD); }
On Fri, May 17, 2024 at 07:25:01PM +0300, Kirill A. Shutemov wrote: > On Fri, May 17, 2024 at 05:00:19PM +0200, Jürgen Groß wrote: > > On 17.05.24 16:53, Kirill A. Shutemov wrote: > > > On Fri, May 17, 2024 at 04:37:16PM +0200, Juergen Gross wrote: > > > > On 17.05.24 16:32, Kirill A. Shutemov wrote: > > > > > On Mon, Feb 26, 2024 at 12:25:41AM -0800, isaku.yamahata@intel.com wrote: > > > > > > @@ -725,6 +967,17 @@ static int __init tdx_module_setup(void) > > > > > > tdx_info->nr_tdcs_pages = tdcs_base_size / PAGE_SIZE; > > > > > > + /* > > > > > > + * Make TDH.VP.ENTER preserve RBP so that the stack unwinder > > > > > > + * always work around it. Query the feature. > > > > > > + */ > > > > > > + if (!(tdx_info->features0 & MD_FIELD_ID_FEATURES0_NO_RBP_MOD) && > > > > > > + !IS_ENABLED(CONFIG_FRAME_POINTER)) { > > > > > > > > > > I think it supposed to be IS_ENABLED(CONFIG_FRAME_POINTER). "!" shouldn't > > > > > be here. > > > > > > > > No, I don't think so. > > > > > > > > With CONFIG_FRAME_POINTER %rbp is being saved and restored, so there is no > > > > problem in case the seamcall is clobbering it. > > > > > > Could you check setup_tdparams() in your tree? > > > > > > Commit > > > > > > [SEAM-WORKAROUND] KVM: TDX: Don't use NO_RBP_MOD for backward compatibility > > > > > > in my tree comments out the setting TDX_CONTROL_FLAG_NO_RBP_MOD. > > > > > > I now remember there was problem in EDK2 using RBP. So the patch is > > > temporary until EDK2 is fixed. > > > > > > > I have the following line in setup_tdparams() (not commented out): > > > > td_params->exec_controls = TDX_CONTROL_FLAG_NO_RBP_MOD; > > Could you check if it is visible from the guest side? Jürgen, have you tried it?
On 23.05.24 12:35, Kirill A. Shutemov wrote: > On Fri, May 17, 2024 at 07:25:01PM +0300, Kirill A. Shutemov wrote: >> On Fri, May 17, 2024 at 05:00:19PM +0200, Jürgen Groß wrote: >>> On 17.05.24 16:53, Kirill A. Shutemov wrote: >>>> On Fri, May 17, 2024 at 04:37:16PM +0200, Juergen Gross wrote: >>>>> On 17.05.24 16:32, Kirill A. Shutemov wrote: >>>>>> On Mon, Feb 26, 2024 at 12:25:41AM -0800, isaku.yamahata@intel.com wrote: >>>>>>> @@ -725,6 +967,17 @@ static int __init tdx_module_setup(void) >>>>>>> tdx_info->nr_tdcs_pages = tdcs_base_size / PAGE_SIZE; >>>>>>> + /* >>>>>>> + * Make TDH.VP.ENTER preserve RBP so that the stack unwinder >>>>>>> + * always work around it. Query the feature. >>>>>>> + */ >>>>>>> + if (!(tdx_info->features0 & MD_FIELD_ID_FEATURES0_NO_RBP_MOD) && >>>>>>> + !IS_ENABLED(CONFIG_FRAME_POINTER)) { >>>>>> >>>>>> I think it supposed to be IS_ENABLED(CONFIG_FRAME_POINTER). "!" shouldn't >>>>>> be here. >>>>> >>>>> No, I don't think so. >>>>> >>>>> With CONFIG_FRAME_POINTER %rbp is being saved and restored, so there is no >>>>> problem in case the seamcall is clobbering it. >>>> >>>> Could you check setup_tdparams() in your tree? >>>> >>>> Commit >>>> >>>> [SEAM-WORKAROUND] KVM: TDX: Don't use NO_RBP_MOD for backward compatibility >>>> >>>> in my tree comments out the setting TDX_CONTROL_FLAG_NO_RBP_MOD. >>>> >>>> I now remember there was problem in EDK2 using RBP. So the patch is >>>> temporary until EDK2 is fixed. >>>> >>> >>> I have the following line in setup_tdparams() (not commented out): >>> >>> td_params->exec_controls = TDX_CONTROL_FLAG_NO_RBP_MOD; >> >> Could you check if it is visible from the guest side? > > Jürgen, have you tried it? > No, I'm not yet at the point where I can boot a guest successfully. Juergen
On 18/05/2024 4:25 am, Kirill A. Shutemov wrote: > On Fri, May 17, 2024 at 05:00:19PM +0200, Jürgen Groß wrote: >> On 17.05.24 16:53, Kirill A. Shutemov wrote: >>> On Fri, May 17, 2024 at 04:37:16PM +0200, Juergen Gross wrote: >>>> On 17.05.24 16:32, Kirill A. Shutemov wrote: >>>>> On Mon, Feb 26, 2024 at 12:25:41AM -0800, isaku.yamahata@intel.com wrote: >>>>>> @@ -725,6 +967,17 @@ static int __init tdx_module_setup(void) >>>>>> tdx_info->nr_tdcs_pages = tdcs_base_size / PAGE_SIZE; >>>>>> + /* >>>>>> + * Make TDH.VP.ENTER preserve RBP so that the stack unwinder >>>>>> + * always work around it. Query the feature. >>>>>> + */ >>>>>> + if (!(tdx_info->features0 & MD_FIELD_ID_FEATURES0_NO_RBP_MOD) && >>>>>> + !IS_ENABLED(CONFIG_FRAME_POINTER)) { >>>>> >>>>> I think it supposed to be IS_ENABLED(CONFIG_FRAME_POINTER). "!" shouldn't >>>>> be here. >>>> >>>> No, I don't think so. >>>> >>>> With CONFIG_FRAME_POINTER %rbp is being saved and restored, so there is no >>>> problem in case the seamcall is clobbering it. >>> >>> Could you check setup_tdparams() in your tree? >>> >>> Commit >>> >>> [SEAM-WORKAROUND] KVM: TDX: Don't use NO_RBP_MOD for backward compatibility >>> >>> in my tree comments out the setting TDX_CONTROL_FLAG_NO_RBP_MOD. >>> >>> I now remember there was problem in EDK2 using RBP. So the patch is >>> temporary until EDK2 is fixed. >>> >> >> I have the following line in setup_tdparams() (not commented out): >> >> td_params->exec_controls = TDX_CONTROL_FLAG_NO_RBP_MOD; > > Could you check if it is visible from the guest side? > > It is zero for me. > > diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c > index c1cb90369915..f65993a6066d 100644 > --- a/arch/x86/coco/tdx/tdx.c > +++ b/arch/x86/coco/tdx/tdx.c > @@ -822,13 +822,33 @@ static bool tdx_enc_status_change_finish(unsigned long vaddr, int numpages, > return true; > } > > +#define TDG_VM_RD 7 > + > +#define TDCS_CONFIG_FLAGS 0x1110000300000016 > + Hi Kirill, Where did you get this metadata field ID value from? I assume you meant below one, from which the ID is 0x9110000300000016? Or is there anything special rule for using metadata field ID in the guest? { "Control Structure": "TDCS", "Class": "Execution Controls", "Field Name": "CONFIG_FLAGS", "Description": [ "Non-attested TD configuration flags" ], "Type": "64b bitmap", "VM Applicability": null, "Mutability": "TDH.MNG.INIT/ TDH.IMPORT.STATE.IMMUTABLE", "Initial Value": "From TDH.MNG.INIT input", "Field Size (Bytes)": "8", "Num Fields": "1", "Num Elements": "1", "Element Size (Bytes)": "8", "Overall Size (Bytes)": "8", "Base FIELD_ID (Hex)": "0x9110000300000016", "Host VMM Access for a Production TD": "RO", "Host VMM Access for a Debug TD": "RO", "Guest Access": "RO", "Migration TD Access": "RO", "Host VMM Rd Mask for a Production TD ": "-1", "Host VMM Wr Mask for a Production TD ": "0", "Host VMM Rd Mask for a Debug TD ": "-1", "Host VMM Wr Mask for a Debug TD ": "0", "Guest Rd Mask": "-1", "Guest Wr Mask": "0", "Migration TD Rd Mask": "-1", "Migration TD wr Mask": "0" },
On Fri, May 24, 2024 at 11:37:15AM +1200, Huang, Kai wrote: > > > On 18/05/2024 4:25 am, Kirill A. Shutemov wrote: > > On Fri, May 17, 2024 at 05:00:19PM +0200, Jürgen Groß wrote: > > > On 17.05.24 16:53, Kirill A. Shutemov wrote: > > > > On Fri, May 17, 2024 at 04:37:16PM +0200, Juergen Gross wrote: > > > > > On 17.05.24 16:32, Kirill A. Shutemov wrote: > > > > > > On Mon, Feb 26, 2024 at 12:25:41AM -0800, isaku.yamahata@intel.com wrote: > > > > > > > @@ -725,6 +967,17 @@ static int __init tdx_module_setup(void) > > > > > > > tdx_info->nr_tdcs_pages = tdcs_base_size / PAGE_SIZE; > > > > > > > + /* > > > > > > > + * Make TDH.VP.ENTER preserve RBP so that the stack unwinder > > > > > > > + * always work around it. Query the feature. > > > > > > > + */ > > > > > > > + if (!(tdx_info->features0 & MD_FIELD_ID_FEATURES0_NO_RBP_MOD) && > > > > > > > + !IS_ENABLED(CONFIG_FRAME_POINTER)) { > > > > > > > > > > > > I think it supposed to be IS_ENABLED(CONFIG_FRAME_POINTER). "!" shouldn't > > > > > > be here. > > > > > > > > > > No, I don't think so. > > > > > > > > > > With CONFIG_FRAME_POINTER %rbp is being saved and restored, so there is no > > > > > problem in case the seamcall is clobbering it. > > > > > > > > Could you check setup_tdparams() in your tree? > > > > > > > > Commit > > > > > > > > [SEAM-WORKAROUND] KVM: TDX: Don't use NO_RBP_MOD for backward compatibility > > > > > > > > in my tree comments out the setting TDX_CONTROL_FLAG_NO_RBP_MOD. > > > > > > > > I now remember there was problem in EDK2 using RBP. So the patch is > > > > temporary until EDK2 is fixed. > > > > > > > > > > I have the following line in setup_tdparams() (not commented out): > > > > > > td_params->exec_controls = TDX_CONTROL_FLAG_NO_RBP_MOD; > > > > Could you check if it is visible from the guest side? > > > > It is zero for me. > > > > diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c > > index c1cb90369915..f65993a6066d 100644 > > --- a/arch/x86/coco/tdx/tdx.c > > +++ b/arch/x86/coco/tdx/tdx.c > > @@ -822,13 +822,33 @@ static bool tdx_enc_status_change_finish(unsigned long vaddr, int numpages, > > return true; > > } > > +#define TDG_VM_RD 7 > > + > > +#define TDCS_CONFIG_FLAGS 0x1110000300000016 > > + > > Hi Kirill, > > Where did you get this metadata field ID value from? I assume you meant > below one, from which the ID is 0x9110000300000016? The ID has changed in recent JSON ABI definitions. Looks fishy. I will find out what is going on.
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index e28189c81691..9ac0246bd974 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -570,6 +570,7 @@ struct kvm_pmu_event_filter { /* Trust Domain eXtension sub-ioctl() commands. */ enum kvm_tdx_cmd_id { KVM_TDX_CAPABILITIES = 0, + KVM_TDX_INIT_VM, KVM_TDX_CMD_NR_MAX, }; @@ -621,4 +622,30 @@ struct kvm_tdx_capabilities { struct kvm_tdx_cpuid_config cpuid_configs[]; }; +struct kvm_tdx_init_vm { + __u64 attributes; + __u64 mrconfigid[6]; /* sha384 digest */ + __u64 mrowner[6]; /* sha384 digest */ + __u64 mrownerconfig[6]; /* sha384 digest */ + /* + * For future extensibility to make sizeof(struct kvm_tdx_init_vm) = 8KB. + * This should be enough given sizeof(TD_PARAMS) = 1024. + * 8KB was chosen given because + * sizeof(struct kvm_cpuid_entry2) * KVM_MAX_CPUID_ENTRIES(=256) = 8KB. + */ + __u64 reserved[1004]; + + /* + * Call KVM_TDX_INIT_VM before vcpu creation, thus before + * KVM_SET_CPUID2. + * This configuration supersedes KVM_SET_CPUID2s for VCPUs because the + * TDX module directly virtualizes those CPUIDs without VMM. The user + * space VMM, e.g. qemu, should make KVM_SET_CPUID2 consistent with + * those values. If it doesn't, KVM may have wrong idea of vCPUIDs of + * the guest, and KVM may wrongly emulate CPUIDs or MSRs that the TDX + * module doesn't virtualize. + */ + struct kvm_cpuid2 cpuid; +}; + #endif /* _ASM_X86_KVM_H */ diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index adba49afb5fe..8cdcd6f406aa 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -1443,6 +1443,13 @@ int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, return r; } +struct kvm_cpuid_entry2 *kvm_find_cpuid_entry2( + struct kvm_cpuid_entry2 *entries, int nent, u32 function, u64 index) +{ + return cpuid_entry2_find(entries, nent, function, index); +} +EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry2); + struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu, u32 function, u32 index) { diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 856e3037e74f..215d1c68c6d1 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -13,6 +13,8 @@ void kvm_set_cpu_caps(void); void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu); void kvm_update_pv_runtime(struct kvm_vcpu *vcpu); +struct kvm_cpuid_entry2 *kvm_find_cpuid_entry2(struct kvm_cpuid_entry2 *entries, + int nent, u32 function, u64 index); struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu, u32 function, u32 index); struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 1cf2b15da257..b11f105db3cd 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -8,7 +8,6 @@ #include "mmu.h" #include "tdx_arch.h" #include "tdx.h" -#include "tdx_ops.h" #include "x86.h" #undef pr_fmt @@ -350,18 +349,21 @@ static int tdx_do_tdh_mng_key_config(void *param) return 0; } -static int __tdx_td_init(struct kvm *kvm); - int tdx_vm_init(struct kvm *kvm) { + /* + * This function initializes only KVM software construct. It doesn't + * initialize TDX stuff, e.g. TDCS, TDR, TDCX, HKID etc. + * It is handled by KVM_TDX_INIT_VM, __tdx_td_init(). + */ + /* * TDX has its own limit of the number of vcpus in addition to * KVM_MAX_VCPUS. */ kvm->max_vcpus = min(kvm->max_vcpus, TDX_MAX_VCPUS); - /* Place holder for TDX specific logic. */ - return __tdx_td_init(kvm); + return 0; } static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd) @@ -416,9 +418,162 @@ static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd) return ret; } -static int __tdx_td_init(struct kvm *kvm) +static int setup_tdparams_eptp_controls(struct kvm_cpuid2 *cpuid, + struct td_params *td_params) +{ + const struct kvm_cpuid_entry2 *entry; + int max_pa = 36; + + entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, 0x80000008, 0); + if (entry) + max_pa = entry->eax & 0xff; + + td_params->eptp_controls = VMX_EPTP_MT_WB; + /* + * No CPU supports 4-level && max_pa > 48. + * "5-level paging and 5-level EPT" section 4.1 4-level EPT + * "4-level EPT is limited to translating 48-bit guest-physical + * addresses." + * cpu_has_vmx_ept_5levels() check is just in case. + */ + if (!cpu_has_vmx_ept_5levels() && max_pa > 48) + return -EINVAL; + if (cpu_has_vmx_ept_5levels() && max_pa > 48) { + td_params->eptp_controls |= VMX_EPTP_PWL_5; + td_params->exec_controls |= TDX_EXEC_CONTROL_MAX_GPAW; + } else { + td_params->eptp_controls |= VMX_EPTP_PWL_4; + } + + return 0; +} + +static void setup_tdparams_cpuids(struct kvm_cpuid2 *cpuid, + struct td_params *td_params) +{ + int i; + + /* + * td_params.cpuid_values: The number and the order of cpuid_value must + * be same to the one of struct tdsysinfo.{num_cpuid_config, cpuid_configs} + * It's assumed that td_params was zeroed. + */ + for (i = 0; i < tdx_info->num_cpuid_config; i++) { + const struct kvm_tdx_cpuid_config *c = &tdx_info->cpuid_configs[i]; + /* KVM_TDX_CPUID_NO_SUBLEAF means index = 0. */ + u32 index = c->sub_leaf == KVM_TDX_CPUID_NO_SUBLEAF ? 0 : c->sub_leaf; + const struct kvm_cpuid_entry2 *entry = + kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, + c->leaf, index); + struct tdx_cpuid_value *value = &td_params->cpuid_values[i]; + + if (!entry) + continue; + + /* + * tdsysinfo.cpuid_configs[].{eax, ebx, ecx, edx} + * bit 1 means it can be configured to zero or one. + * bit 0 means it must be zero. + * Mask out non-configurable bits. + */ + value->eax = entry->eax & c->eax; + value->ebx = entry->ebx & c->ebx; + value->ecx = entry->ecx & c->ecx; + value->edx = entry->edx & c->edx; + } +} + +static int setup_tdparams_xfam(struct kvm_cpuid2 *cpuid, struct td_params *td_params) +{ + const struct kvm_cpuid_entry2 *entry; + u64 guest_supported_xcr0; + u64 guest_supported_xss; + + /* Setup td_params.xfam */ + entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, 0xd, 0); + if (entry) + guest_supported_xcr0 = (entry->eax | ((u64)entry->edx << 32)); + else + guest_supported_xcr0 = 0; + guest_supported_xcr0 &= kvm_caps.supported_xcr0; + + entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, 0xd, 1); + if (entry) + guest_supported_xss = (entry->ecx | ((u64)entry->edx << 32)); + else + guest_supported_xss = 0; + + /* + * PT and CET can be exposed to TD guest regardless of KVM's XSS, PT + * and, CET support. + */ + guest_supported_xss &= + (kvm_caps.supported_xss | XFEATURE_MASK_PT | TDX_TD_XFAM_CET); + + td_params->xfam = guest_supported_xcr0 | guest_supported_xss; + if (td_params->xfam & XFEATURE_MASK_LBR) { + /* + * TODO: once KVM supports LBR(save/restore LBR related + * registers around TDENTER), remove this guard. + */ +#define MSG_LBR "TD doesn't support LBR yet. KVM needs to save/restore IA32_LBR_DEPTH properly.\n" + pr_warn(MSG_LBR); + return -EOPNOTSUPP; + } + + return 0; +} + +static int setup_tdparams(struct kvm *kvm, struct td_params *td_params, + struct kvm_tdx_init_vm *init_vm) +{ + struct kvm_cpuid2 *cpuid = &init_vm->cpuid; + int ret; + + if (kvm->created_vcpus) + return -EBUSY; + + if (init_vm->attributes & TDX_TD_ATTRIBUTE_PERFMON) { + /* + * TODO: save/restore PMU related registers around TDENTER. + * Once it's done, remove this guard. + */ +#define MSG_PERFMON "TD doesn't support perfmon yet. KVM needs to save/restore host perf registers properly.\n" + pr_warn(MSG_PERFMON); + return -EOPNOTSUPP; + } + + td_params->max_vcpus = kvm->max_vcpus; + td_params->attributes = init_vm->attributes; + td_params->exec_controls = TDX_CONTROL_FLAG_NO_RBP_MOD; + td_params->tsc_frequency = TDX_TSC_KHZ_TO_25MHZ(kvm->arch.default_tsc_khz); + + ret = setup_tdparams_eptp_controls(cpuid, td_params); + if (ret) + return ret; + setup_tdparams_cpuids(cpuid, td_params); + ret = setup_tdparams_xfam(cpuid, td_params); + if (ret) + return ret; + +#define MEMCPY_SAME_SIZE(dst, src) \ + do { \ + BUILD_BUG_ON(sizeof(dst) != sizeof(src)); \ + memcpy((dst), (src), sizeof(dst)); \ + } while (0) + + MEMCPY_SAME_SIZE(td_params->mrconfigid, init_vm->mrconfigid); + MEMCPY_SAME_SIZE(td_params->mrowner, init_vm->mrowner); + MEMCPY_SAME_SIZE(td_params->mrownerconfig, init_vm->mrownerconfig); + + return 0; +} + +static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params, + u64 *seamcall_err) { struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + struct tdx_module_args out; cpumask_var_t packages; unsigned long *tdcs_pa = NULL; unsigned long tdr_pa = 0; @@ -426,6 +581,7 @@ static int __tdx_td_init(struct kvm *kvm) int ret, i; u64 err; + *seamcall_err = 0; ret = tdx_guest_keyid_alloc(); if (ret < 0) return ret; @@ -540,10 +696,23 @@ static int __tdx_td_init(struct kvm *kvm) } } - /* - * Note, TDH_MNG_INIT cannot be invoked here. TDH_MNG_INIT requires a dedicated - * ioctl() to define the configure CPUID values for the TD. - */ + err = tdh_mng_init(kvm_tdx->tdr_pa, __pa(td_params), &out); + if ((err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_INVALID) { + /* + * Because a user gives operands, don't warn. + * Return a hint to the user because it's sometimes hard for the + * user to figure out which operand is invalid. SEAMCALL status + * code includes which operand caused invalid operand error. + */ + *seamcall_err = err; + ret = -EINVAL; + goto teardown; + } else if (WARN_ON_ONCE(err)) { + pr_tdx_error(TDH_MNG_INIT, err, &out); + ret = -EIO; + goto teardown; + } + return 0; /* @@ -586,6 +755,76 @@ static int __tdx_td_init(struct kvm *kvm) return ret; } +static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + struct kvm_tdx_init_vm *init_vm = NULL; + struct td_params *td_params = NULL; + int ret; + + BUILD_BUG_ON(sizeof(*init_vm) != 8 * 1024); + BUILD_BUG_ON(sizeof(struct td_params) != 1024); + + if (is_hkid_assigned(kvm_tdx)) + return -EINVAL; + + if (cmd->flags) + return -EINVAL; + + init_vm = kzalloc(sizeof(*init_vm) + + sizeof(init_vm->cpuid.entries[0]) * KVM_MAX_CPUID_ENTRIES, + GFP_KERNEL); + if (!init_vm) + return -ENOMEM; + if (copy_from_user(init_vm, (void __user *)cmd->data, sizeof(*init_vm))) { + ret = -EFAULT; + goto out; + } + if (init_vm->cpuid.nent > KVM_MAX_CPUID_ENTRIES) { + ret = -E2BIG; + goto out; + } + if (copy_from_user(init_vm->cpuid.entries, + (void __user *)cmd->data + sizeof(*init_vm), + flex_array_size(init_vm, cpuid.entries, init_vm->cpuid.nent))) { + ret = -EFAULT; + goto out; + } + + if (memchr_inv(init_vm->reserved, 0, sizeof(init_vm->reserved))) { + ret = -EINVAL; + goto out; + } + if (init_vm->cpuid.padding) { + ret = -EINVAL; + goto out; + } + + td_params = kzalloc(sizeof(struct td_params), GFP_KERNEL); + if (!td_params) { + ret = -ENOMEM; + goto out; + } + + ret = setup_tdparams(kvm, td_params, init_vm); + if (ret) + goto out; + + ret = __tdx_td_init(kvm, td_params, &cmd->error); + if (ret) + goto out; + + kvm_tdx->tsc_offset = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_OFFSET); + kvm_tdx->attributes = td_params->attributes; + kvm_tdx->xfam = td_params->xfam; + +out: + /* kfree() accepts NULL. */ + kfree(init_vm); + kfree(td_params); + return ret; +} + int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) { struct kvm_tdx_cmd tdx_cmd; @@ -602,6 +841,9 @@ int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) case KVM_TDX_CAPABILITIES: r = tdx_get_capabilities(&tdx_cmd); break; + case KVM_TDX_INIT_VM: + r = tdx_td_init(kvm, &tdx_cmd); + break; default: r = -EINVAL; goto out; @@ -725,6 +967,17 @@ static int __init tdx_module_setup(void) tdx_info->nr_tdcs_pages = tdcs_base_size / PAGE_SIZE; + /* + * Make TDH.VP.ENTER preserve RBP so that the stack unwinder + * always work around it. Query the feature. + */ + if (!(tdx_info->features0 & MD_FIELD_ID_FEATURES0_NO_RBP_MOD) && + !IS_ENABLED(CONFIG_FRAME_POINTER)) { + pr_err("Too old version of TDX module. Consider upgrade.\n"); + ret = -EOPNOTSUPP; + goto error_out; + } + return 0; error_out: diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h index ae117f864cfb..184fe394da86 100644 --- a/arch/x86/kvm/vmx/tdx.h +++ b/arch/x86/kvm/vmx/tdx.h @@ -12,7 +12,11 @@ struct kvm_tdx { unsigned long tdr_pa; unsigned long *tdcs_pa; + u64 attributes; + u64 xfam; int hkid; + + u64 tsc_offset; }; struct vcpu_tdx { @@ -39,6 +43,20 @@ static inline struct vcpu_tdx *to_tdx(struct kvm_vcpu *vcpu) { return container_of(vcpu, struct vcpu_tdx, vcpu); } + +static __always_inline u64 td_tdcs_exec_read64(struct kvm_tdx *kvm_tdx, u32 field) +{ + struct tdx_module_args out; + u64 err; + + err = tdh_mng_rd(kvm_tdx->tdr_pa, TDCS_EXEC(field), &out); + if (unlikely(err)) { + pr_err("TDH_MNG_RD[EXEC.0x%x] failed: 0x%llx\n", field, err); + return 0; + } + return out.r8; +} + #else struct kvm_tdx { struct kvm kvm; diff --git a/arch/x86/kvm/vmx/tdx_arch.h b/arch/x86/kvm/vmx/tdx_arch.h index e2c1a6f429d7..efc3c61c14ab 100644 --- a/arch/x86/kvm/vmx/tdx_arch.h +++ b/arch/x86/kvm/vmx/tdx_arch.h @@ -117,6 +117,12 @@ struct tdx_cpuid_value { #define TDX_TD_ATTRIBUTE_KL BIT_ULL(31) #define TDX_TD_ATTRIBUTE_PERFMON BIT_ULL(63) +/* + * TODO: Once XFEATURE_CET_{U, S} in arch/x86/include/asm/fpu/types.h is + * defined, Replace these with define ones. + */ +#define TDX_TD_XFAM_CET (BIT(11) | BIT(12)) + /* * TD_PARAMS is provided as an input to TDH_MNG_INIT, the size of which is 1024B. */