Message ID | d6a21fe6ea9eb53c24b6527ef8e5a07f0c2e8806.1708933498.git.isaku.yamahata@intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | [v19,001/130] x86/virt/tdx: Rename _offset to _member for TD_SYSINFO_MAP() macro | expand |
>+/* VMM can pass one 64bit auxiliary data to vcpu via RCX for guest BIOS. */ >+static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx) >+{ >+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); >+ struct vcpu_tdx *tdx = to_tdx(vcpu); >+ unsigned long *tdvpx_pa = NULL; >+ unsigned long tdvpr_pa; >+ unsigned long va; >+ int ret, i; >+ u64 err; >+ >+ if (is_td_vcpu_created(tdx)) >+ return -EINVAL; >+ >+ /* >+ * vcpu_free method frees allocated pages. Avoid partial setup so >+ * that the method can't handle it. >+ */ >+ va = __get_free_page(GFP_KERNEL_ACCOUNT); >+ if (!va) >+ return -ENOMEM; >+ tdvpr_pa = __pa(va); >+ >+ tdvpx_pa = kcalloc(tdx_info->nr_tdvpx_pages, sizeof(*tdx->tdvpx_pa), >+ GFP_KERNEL_ACCOUNT); >+ if (!tdvpx_pa) { >+ ret = -ENOMEM; >+ goto free_tdvpr; >+ } >+ for (i = 0; i < tdx_info->nr_tdvpx_pages; i++) { >+ va = __get_free_page(GFP_KERNEL_ACCOUNT); >+ if (!va) { >+ ret = -ENOMEM; >+ goto free_tdvpx; >+ } >+ tdvpx_pa[i] = __pa(va); >+ } >+ >+ err = tdh_vp_create(kvm_tdx->tdr_pa, tdvpr_pa); >+ if (KVM_BUG_ON(err, vcpu->kvm)) { >+ ret = -EIO; >+ pr_tdx_error(TDH_VP_CREATE, err, NULL); >+ goto free_tdvpx; >+ } >+ tdx->tdvpr_pa = tdvpr_pa; >+ >+ tdx->tdvpx_pa = tdvpx_pa; >+ for (i = 0; i < tdx_info->nr_tdvpx_pages; i++) { Can you merge the for-loop above into this one? then ... >+ err = tdh_vp_addcx(tdx->tdvpr_pa, tdvpx_pa[i]); >+ if (KVM_BUG_ON(err, vcpu->kvm)) { >+ pr_tdx_error(TDH_VP_ADDCX, err, NULL); >+ for (; i < tdx_info->nr_tdvpx_pages; i++) { >+ free_page((unsigned long)__va(tdvpx_pa[i])); >+ tdvpx_pa[i] = 0; >+ } ... no need to free remaining pages. 
>+ /* vcpu_free method frees TDVPX and TDR donated to TDX */ >+ return -EIO; >+ } >+ } >+ >+ err = tdh_vp_init(tdx->tdvpr_pa, vcpu_rcx); >+ if (KVM_BUG_ON(err, vcpu->kvm)) { >+ pr_tdx_error(TDH_VP_INIT, err, NULL); >+ return -EIO; >+ } >+ >+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; >+ tdx->td_vcpu_created = true; >+ return 0; >+ >+free_tdvpx: >+ for (i = 0; i < tdx_info->nr_tdvpx_pages; i++) { >+ if (tdvpx_pa[i]) >+ free_page((unsigned long)__va(tdvpx_pa[i])); >+ tdvpx_pa[i] = 0; >+ } >+ kfree(tdvpx_pa); >+ tdx->tdvpx_pa = NULL; >+free_tdvpr: >+ if (tdvpr_pa) >+ free_page((unsigned long)__va(tdvpr_pa)); >+ tdx->tdvpr_pa = 0; >+ >+ return ret; >+} >+ >+int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) >+{ >+ struct msr_data apic_base_msr; >+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); >+ struct vcpu_tdx *tdx = to_tdx(vcpu); >+ struct kvm_tdx_cmd cmd; >+ int ret; >+ >+ if (tdx->initialized) >+ return -EINVAL; >+ >+ if (!is_hkid_assigned(kvm_tdx) || is_td_finalized(kvm_tdx)) These checks look random; e.g., I am not sure why is_td_created() isn't checked here. A few helper functions and boolean variables are added to track which stage the TD or TD vCPU is in. e.g., is_hkid_assigned() is_td_finalized() is_td_created() tdx->initialized td_vcpu_created Instead of doing this, I am wondering if adding two state machines for TD and TD vCPU would make the implementation clear and easy to extend. >+ return -EINVAL; >+ >+ if (copy_from_user(&cmd, argp, sizeof(cmd))) >+ return -EFAULT; >+ >+ if (cmd.error) >+ return -EINVAL; >+ >+ /* Currently only KVM_TDX_INTI_VCPU is defined for vcpu operation. */ >+ if (cmd.flags || cmd.id != KVM_TDX_INIT_VCPU) >+ return -EINVAL; Even though KVM_TDX_INIT_VCPU is the only supported command, it is worthwhile to use a switch-case statement. New commands can be added easily without the need to refactor this function first. >+ >+ /* >+ * As TDX requires X2APIC, set local apic mode to X2APIC. User space >+ * VMM, e.g. 
qemu, is required to set CPUID[0x1].ecx.X2APIC=1 by >+ * KVM_SET_CPUID2. Otherwise kvm_set_apic_base() will fail. >+ */ >+ apic_base_msr = (struct msr_data) { >+ .host_initiated = true, >+ .data = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC | >+ (kvm_vcpu_is_reset_bsp(vcpu) ? MSR_IA32_APICBASE_BSP : 0), >+ }; >+ if (kvm_set_apic_base(vcpu, &apic_base_msr)) >+ return -EINVAL; Exporting kvm_vcpu_is_reset_bsp() and kvm_set_apic_base() should be done here (rather than in a previous patch). >+ >+ ret = tdx_td_vcpu_init(vcpu, (u64)cmd.data); >+ if (ret) >+ return ret; >+ >+ tdx->initialized = true; >+ return 0; >+} >+ >diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c >index c002761bb662..2bd4b7c8fa51 100644 >--- a/arch/x86/kvm/x86.c >+++ b/arch/x86/kvm/x86.c >@@ -6274,6 +6274,12 @@ long kvm_arch_vcpu_ioctl(struct file *filp, > case KVM_SET_DEVICE_ATTR: > r = kvm_vcpu_ioctl_device_attr(vcpu, ioctl, argp); > break; >+ case KVM_MEMORY_ENCRYPT_OP: >+ r = -ENOTTY; Maybe -EINVAL is better. Because previously trying to call this on vCPU fd failed with -EINVAL given ... >+ if (!kvm_x86_ops.vcpu_mem_enc_ioctl) >+ goto out; >+ r = kvm_x86_ops.vcpu_mem_enc_ioctl(vcpu, argp); >+ break; > default: > r = -EINVAL; ... this. > } >-- >2.25.1 > >
On Thu, Mar 21, 2024 at 01:43:14PM +0800, Chao Gao <chao.gao@intel.com> wrote: > >+/* VMM can pass one 64bit auxiliary data to vcpu via RCX for guest BIOS. */ > >+static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx) > >+{ > >+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); > >+ struct vcpu_tdx *tdx = to_tdx(vcpu); > >+ unsigned long *tdvpx_pa = NULL; > >+ unsigned long tdvpr_pa; > >+ unsigned long va; > >+ int ret, i; > >+ u64 err; > >+ > >+ if (is_td_vcpu_created(tdx)) > >+ return -EINVAL; > >+ > >+ /* > >+ * vcpu_free method frees allocated pages. Avoid partial setup so > >+ * that the method can't handle it. > >+ */ > >+ va = __get_free_page(GFP_KERNEL_ACCOUNT); > >+ if (!va) > >+ return -ENOMEM; > >+ tdvpr_pa = __pa(va); > >+ > >+ tdvpx_pa = kcalloc(tdx_info->nr_tdvpx_pages, sizeof(*tdx->tdvpx_pa), > >+ GFP_KERNEL_ACCOUNT); > >+ if (!tdvpx_pa) { > >+ ret = -ENOMEM; > >+ goto free_tdvpr; > >+ } > >+ for (i = 0; i < tdx_info->nr_tdvpx_pages; i++) { > >+ va = __get_free_page(GFP_KERNEL_ACCOUNT); > >+ if (!va) { > >+ ret = -ENOMEM; > >+ goto free_tdvpx; > >+ } > >+ tdvpx_pa[i] = __pa(va); > >+ } > >+ > >+ err = tdh_vp_create(kvm_tdx->tdr_pa, tdvpr_pa); > >+ if (KVM_BUG_ON(err, vcpu->kvm)) { > >+ ret = -EIO; > >+ pr_tdx_error(TDH_VP_CREATE, err, NULL); > >+ goto free_tdvpx; > >+ } > >+ tdx->tdvpr_pa = tdvpr_pa; > >+ > >+ tdx->tdvpx_pa = tdvpx_pa; > >+ for (i = 0; i < tdx_info->nr_tdvpx_pages; i++) { > > Can you merge the for-loop above into this one? then ... > > >+ err = tdh_vp_addcx(tdx->tdvpr_pa, tdvpx_pa[i]); > >+ if (KVM_BUG_ON(err, vcpu->kvm)) { > >+ pr_tdx_error(TDH_VP_ADDCX, err, NULL); > > >+ for (; i < tdx_info->nr_tdvpx_pages; i++) { > >+ free_page((unsigned long)__va(tdvpx_pa[i])); > >+ tdvpx_pa[i] = 0; > >+ } > > ... no need to free remaining pages. Makes sense. Let me clean up this. 
> >+ /* vcpu_free method frees TDVPX and TDR donated to TDX */ > >+ return -EIO; > >+ } > >+ } > >+ > >+ err = tdh_vp_init(tdx->tdvpr_pa, vcpu_rcx); > >+ if (KVM_BUG_ON(err, vcpu->kvm)) { > >+ pr_tdx_error(TDH_VP_INIT, err, NULL); > >+ return -EIO; > >+ } > >+ > >+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; > >+ tdx->td_vcpu_created = true; > >+ return 0; > >+ > >+free_tdvpx: > >+ for (i = 0; i < tdx_info->nr_tdvpx_pages; i++) { > >+ if (tdvpx_pa[i]) > >+ free_page((unsigned long)__va(tdvpx_pa[i])); > >+ tdvpx_pa[i] = 0; > >+ } > >+ kfree(tdvpx_pa); > >+ tdx->tdvpx_pa = NULL; > >+free_tdvpr: > >+ if (tdvpr_pa) > >+ free_page((unsigned long)__va(tdvpr_pa)); > >+ tdx->tdvpr_pa = 0; > >+ > >+ return ret; > >+} > >+ > >+int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) > >+{ > >+ struct msr_data apic_base_msr; > >+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); > >+ struct vcpu_tdx *tdx = to_tdx(vcpu); > >+ struct kvm_tdx_cmd cmd; > >+ int ret; > >+ > >+ if (tdx->initialized) > >+ return -EINVAL; > >+ > >+ if (!is_hkid_assigned(kvm_tdx) || is_td_finalized(kvm_tdx)) > > These checks look random e.g., I am not sure why is_td_created() isn't check here. > > A few helper functions and boolean variables are added to track which stage the > TD or TD vCPU is in. e.g., > > is_hkid_assigned() > is_td_finalized() > is_td_created() > tdx->initialized > td_vcpu_created > > Insteading of doing this, I am wondering if adding two state machines for > TD and TD vCPU would make the implementation clear and easy to extend. Let me look into the state machine. Originally I hoped we wouldn't need one, but this seems to warrant a state machine. > >+ return -EINVAL; > >+ > >+ if (copy_from_user(&cmd, argp, sizeof(cmd))) > >+ return -EFAULT; > >+ > >+ if (cmd.error) > >+ return -EINVAL; > >+ > >+ /* Currently only KVM_TDX_INTI_VCPU is defined for vcpu operation. 
*/ > >+ if (cmd.flags || cmd.id != KVM_TDX_INIT_VCPU) > >+ return -EINVAL; > > Even though KVM_TD_INIT_VCPU is the only supported command, it is worthwhile to > use a switch-case statement. New commands can be added easily without the need > to refactor this function first. Yes. For KVM_MAP_MEMORY, I will make KVM_TDX_INIT_MEM_REGION vcpu ioctl instead of vm ioctl because it is consistent and scalable. We'll have switch statement in the next respin. > >+ > >+ /* > >+ * As TDX requires X2APIC, set local apic mode to X2APIC. User space > >+ * VMM, e.g. qemu, is required to set CPUID[0x1].ecx.X2APIC=1 by > >+ * KVM_SET_CPUID2. Otherwise kvm_set_apic_base() will fail. > >+ */ > >+ apic_base_msr = (struct msr_data) { > >+ .host_initiated = true, > >+ .data = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC | > >+ (kvm_vcpu_is_reset_bsp(vcpu) ? MSR_IA32_APICBASE_BSP : 0), > >+ }; > >+ if (kvm_set_apic_base(vcpu, &apic_base_msr)) > >+ return -EINVAL; > > Exporting kvm_vcpu_is_reset_bsp() and kvm_set_apic_base() should be done > here (rather than in a previous patch). Sure. > >+ > >+ ret = tdx_td_vcpu_init(vcpu, (u64)cmd.data); > >+ if (ret) > >+ return ret; > >+ > >+ tdx->initialized = true; > >+ return 0; > >+} > >+ > > >diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > >index c002761bb662..2bd4b7c8fa51 100644 > >--- a/arch/x86/kvm/x86.c > >+++ b/arch/x86/kvm/x86.c > >@@ -6274,6 +6274,12 @@ long kvm_arch_vcpu_ioctl(struct file *filp, > > case KVM_SET_DEVICE_ATTR: > > r = kvm_vcpu_ioctl_device_attr(vcpu, ioctl, argp); > > break; > >+ case KVM_MEMORY_ENCRYPT_OP: > >+ r = -ENOTTY; > > Maybe -EINVAL is better. Because previously trying to call this on vCPU fd > failed with -EINVAL given ... Oh, ok. Will change it. I followed VM ioctl case as default value. But vcpu ioctl seems to have -EINVAL as default value.
On Mon, 2024-02-26 at 00:25 -0800, isaku.yamahata@intel.com wrote: > +/* VMM can pass one 64bit auxiliary data to vcpu via RCX for guest BIOS. */ > +static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx) > +{ > + struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); > + struct vcpu_tdx *tdx = to_tdx(vcpu); > + unsigned long *tdvpx_pa = NULL; > + unsigned long tdvpr_pa; I think we could drop these local variables and just use tdx->tdvpr_pa and tdx->tdvpx_pa. Then we don't have to have the assignments later. > + unsigned long va; > + int ret, i; > + u64 err; > + > + if (is_td_vcpu_created(tdx)) > + return -EINVAL; > + > + /* > + * vcpu_free method frees allocated pages. Avoid partial setup so > + * that the method can't handle it. > + */ > + va = __get_free_page(GFP_KERNEL_ACCOUNT); > + if (!va) > + return -ENOMEM; > + tdvpr_pa = __pa(va); > + > + tdvpx_pa = kcalloc(tdx_info->nr_tdvpx_pages, sizeof(*tdx->tdvpx_pa), > + GFP_KERNEL_ACCOUNT); > + if (!tdvpx_pa) { > + ret = -ENOMEM; > + goto free_tdvpr; > + } > + for (i = 0; i < tdx_info->nr_tdvpx_pages; i++) { > + va = __get_free_page(GFP_KERNEL_ACCOUNT); > + if (!va) { > + ret = -ENOMEM; > + goto free_tdvpx; > + } > + tdvpx_pa[i] = __pa(va); > + } > + > + err = tdh_vp_create(kvm_tdx->tdr_pa, tdvpr_pa); > + if (KVM_BUG_ON(err, vcpu->kvm)) { > + ret = -EIO; > + pr_tdx_error(TDH_VP_CREATE, err, NULL); > + goto free_tdvpx; > + } > + tdx->tdvpr_pa = tdvpr_pa; > + > + tdx->tdvpx_pa = tdvpx_pa; Or alternatively let's move these to right before they are used. 
(in the current branch > + for (i = 0; i < tdx_info->nr_tdvpx_pages; i++) { > + err = tdh_vp_addcx(tdx->tdvpr_pa, tdvpx_pa[i]); > + if (KVM_BUG_ON(err, vcpu->kvm)) { > + pr_tdx_error(TDH_VP_ADDCX, err, NULL); > + for (; i < tdx_info->nr_tdvpx_pages; i++) { > + free_page((unsigned long)__va(tdvpx_pa[i])); > + tdvpx_pa[i] = 0; > + } > + /* vcpu_free method frees TDVPX and TDR donated to TDX */ > + return -EIO; > + } > + } > > In the current branch tdh_vp_init() takes struct vcpu_tdx, so they would be moved right here. What do you think? > + > + err = tdh_vp_init(tdx->tdvpr_pa, vcpu_rcx); > + if (KVM_BUG_ON(err, vcpu->kvm)) { > + pr_tdx_error(TDH_VP_INIT, err, NULL); > + return -EIO; > + } > + > + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; > + tdx->td_vcpu_created = true; > + return 0; > + > +free_tdvpx: > + for (i = 0; i < tdx_info->nr_tdvpx_pages; i++) { > + if (tdvpx_pa[i]) > + free_page((unsigned long)__va(tdvpx_pa[i])); > + tdvpx_pa[i] = 0; > + } > + kfree(tdvpx_pa); > + tdx->tdvpx_pa = NULL; > +free_tdvpr: > + if (tdvpr_pa) > + free_page((unsigned long)__va(tdvpr_pa)); > + tdx->tdvpr_pa = 0; > + > + return ret; > +}
On Wed, Mar 27, 2024 at 12:27:03AM +0000, "Edgecombe, Rick P" <rick.p.edgecombe@intel.com> wrote: > On Mon, 2024-02-26 at 00:25 -0800, isaku.yamahata@intel.com wrote: > > +/* VMM can pass one 64bit auxiliary data to vcpu via RCX for guest BIOS. */ > > +static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx) > > +{ > > + struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); > > + struct vcpu_tdx *tdx = to_tdx(vcpu); > > + unsigned long *tdvpx_pa = NULL; > > + unsigned long tdvpr_pa; > > > I think we could drop theselocal variables and just use tdx->tdvpr_pa and tdx->tdvpx_pa. Then we > don't have to have the assignments later. Yes, let me clean it up. The old version acquired spin lock in the middle. Now we don't have it. > > + unsigned long va; > > + int ret, i; > > + u64 err; > > + > > + if (is_td_vcpu_created(tdx)) > > + return -EINVAL; > > + > > + /* > > + * vcpu_free method frees allocated pages. Avoid partial setup so > > + * that the method can't handle it. > > + */ > > + va = __get_free_page(GFP_KERNEL_ACCOUNT); > > + if (!va) > > + return -ENOMEM; > > + tdvpr_pa = __pa(va); > > + > > + tdvpx_pa = kcalloc(tdx_info->nr_tdvpx_pages, sizeof(*tdx->tdvpx_pa), > > + GFP_KERNEL_ACCOUNT); > > + if (!tdvpx_pa) { > > + ret = -ENOMEM; > > + goto free_tdvpr; > > + } > > + for (i = 0; i < tdx_info->nr_tdvpx_pages; i++) { > > + va = __get_free_page(GFP_KERNEL_ACCOUNT); > > + if (!va) { > > + ret = -ENOMEM; > > + goto free_tdvpx; > > + } > > + tdvpx_pa[i] = __pa(va); > > + } > > + > > + err = tdh_vp_create(kvm_tdx->tdr_pa, tdvpr_pa); > > + if (KVM_BUG_ON(err, vcpu->kvm)) { > > + ret = -EIO; > > + pr_tdx_error(TDH_VP_CREATE, err, NULL); > > + goto free_tdvpx; > > + } > > + tdx->tdvpr_pa = tdvpr_pa; > > + > > + tdx->tdvpx_pa = tdvpx_pa; > > Or alternatively let's move these to right before they are used. 
(in the current branch > > > + for (i = 0; i < tdx_info->nr_tdvpx_pages; i++) { > > + err = tdh_vp_addcx(tdx->tdvpr_pa, tdvpx_pa[i]); > > + if (KVM_BUG_ON(err, vcpu->kvm)) { > > + pr_tdx_error(TDH_VP_ADDCX, err, NULL); > > + for (; i < tdx_info->nr_tdvpx_pages; i++) { > > + free_page((unsigned long)__va(tdvpx_pa[i])); > > + tdvpx_pa[i] = 0; > > + } > > + /* vcpu_free method frees TDVPX and TDR donated to TDX */ > > + return -EIO; > > + } > > + } > > > > > In the current branch tdh_vp_init() takes struct vcpu_tdx, so they would be moved right here. > > What do you think? Yes, I should revise the error recovery path.
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index f78200492a3d..a8e96804a252 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -129,6 +129,7 @@ KVM_X86_OP(leave_smm) KVM_X86_OP(enable_smi_window) #endif KVM_X86_OP(mem_enc_ioctl) +KVM_X86_OP_OPTIONAL(vcpu_mem_enc_ioctl) KVM_X86_OP_OPTIONAL(mem_enc_register_region) KVM_X86_OP_OPTIONAL(mem_enc_unregister_region) KVM_X86_OP_OPTIONAL(vm_copy_enc_context_from) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 0e2408a4707e..5da3c211955d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1778,6 +1778,7 @@ struct kvm_x86_ops { #endif int (*mem_enc_ioctl)(struct kvm *kvm, void __user *argp); + int (*vcpu_mem_enc_ioctl)(struct kvm_vcpu *vcpu, void __user *argp); int (*mem_enc_register_region)(struct kvm *kvm, struct kvm_enc_region *argp); int (*mem_enc_unregister_region)(struct kvm *kvm, struct kvm_enc_region *argp); int (*vm_copy_enc_context_from)(struct kvm *kvm, unsigned int source_fd); diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 9ac0246bd974..4000a2e087a8 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -571,6 +571,7 @@ struct kvm_pmu_event_filter { enum kvm_tdx_cmd_id { KVM_TDX_CAPABILITIES = 0, KVM_TDX_INIT_VM, + KVM_TDX_INIT_VCPU, KVM_TDX_CMD_NR_MAX, }; diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c index 5796fb45433f..d0f75020579f 100644 --- a/arch/x86/kvm/vmx/main.c +++ b/arch/x86/kvm/vmx/main.c @@ -131,6 +131,14 @@ static int vt_mem_enc_ioctl(struct kvm *kvm, void __user *argp) return tdx_vm_ioctl(kvm, argp); } +static int vt_vcpu_mem_enc_ioctl(struct kvm_vcpu *vcpu, void __user *argp) +{ + if (!is_td_vcpu(vcpu)) + return -EINVAL; + + return tdx_vcpu_ioctl(vcpu, argp); +} + #define VMX_REQUIRED_APICV_INHIBITS \ (BIT(APICV_INHIBIT_REASON_DISABLE)| \ 
BIT(APICV_INHIBIT_REASON_ABSENT) | \ @@ -291,6 +299,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .get_untagged_addr = vmx_get_untagged_addr, .mem_enc_ioctl = vt_mem_enc_ioctl, + .vcpu_mem_enc_ioctl = vt_vcpu_mem_enc_ioctl, }; struct kvm_x86_init_ops vt_init_ops __initdata = { diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 51283d2cd011..aa1da51b8af7 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -49,6 +49,7 @@ struct tdx_info { u64 xfam_fixed1; u8 nr_tdcs_pages; + u8 nr_tdvpx_pages; u16 num_cpuid_config; /* This must the last member. */ @@ -104,6 +105,11 @@ static __always_inline hpa_t set_hkid_to_hpa(hpa_t pa, u16 hkid) return pa | ((hpa_t)hkid << boot_cpu_data.x86_phys_bits); } +static inline bool is_td_vcpu_created(struct vcpu_tdx *tdx) +{ + return tdx->td_vcpu_created; +} + static inline bool is_td_created(struct kvm_tdx *kvm_tdx) { return kvm_tdx->tdr_pa; @@ -121,6 +127,11 @@ static inline bool is_hkid_assigned(struct kvm_tdx *kvm_tdx) return kvm_tdx->hkid > 0; } +static inline bool is_td_finalized(struct kvm_tdx *kvm_tdx) +{ + return kvm_tdx->finalized; +} + static void tdx_clear_page(unsigned long page_pa) { const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0))); @@ -399,7 +410,32 @@ int tdx_vcpu_create(struct kvm_vcpu *vcpu) void tdx_vcpu_free(struct kvm_vcpu *vcpu) { - /* This is stub for now. More logic will come. */ + struct vcpu_tdx *tdx = to_tdx(vcpu); + int i; + + /* + * This methods can be called when vcpu allocation/initialization + * failed. So it's possible that hkid, tdvpx and tdvpr are not assigned + * yet. 
+ */ + if (is_hkid_assigned(to_kvm_tdx(vcpu->kvm))) { + WARN_ON_ONCE(tdx->tdvpx_pa); + WARN_ON_ONCE(tdx->tdvpr_pa); + return; + } + + if (tdx->tdvpx_pa) { + for (i = 0; i < tdx_info->nr_tdvpx_pages; i++) { + if (tdx->tdvpx_pa[i]) + tdx_reclaim_control_page(tdx->tdvpx_pa[i]); + } + kfree(tdx->tdvpx_pa); + tdx->tdvpx_pa = NULL; + } + if (tdx->tdvpr_pa) { + tdx_reclaim_control_page(tdx->tdvpr_pa); + tdx->tdvpr_pa = 0; + } } void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) @@ -408,8 +444,13 @@ void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) /* Ignore INIT silently because TDX doesn't support INIT event. */ if (init_event) return; + if (KVM_BUG_ON(is_td_vcpu_created(to_tdx(vcpu)), vcpu->kvm)) + return; - /* This is stub for now. More logic will come here. */ + /* + * Don't update mp_state to runnable because more initialization + * is needed by TDX_VCPU_INIT. + */ } static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd) @@ -904,6 +945,137 @@ int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) return r; } +/* VMM can pass one 64bit auxiliary data to vcpu via RCX for guest BIOS. */ +static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); + struct vcpu_tdx *tdx = to_tdx(vcpu); + unsigned long *tdvpx_pa = NULL; + unsigned long tdvpr_pa; + unsigned long va; + int ret, i; + u64 err; + + if (is_td_vcpu_created(tdx)) + return -EINVAL; + + /* + * vcpu_free method frees allocated pages. Avoid partial setup so + * that the method can't handle it. 
+ */ + va = __get_free_page(GFP_KERNEL_ACCOUNT); + if (!va) + return -ENOMEM; + tdvpr_pa = __pa(va); + + tdvpx_pa = kcalloc(tdx_info->nr_tdvpx_pages, sizeof(*tdx->tdvpx_pa), + GFP_KERNEL_ACCOUNT); + if (!tdvpx_pa) { + ret = -ENOMEM; + goto free_tdvpr; + } + for (i = 0; i < tdx_info->nr_tdvpx_pages; i++) { + va = __get_free_page(GFP_KERNEL_ACCOUNT); + if (!va) { + ret = -ENOMEM; + goto free_tdvpx; + } + tdvpx_pa[i] = __pa(va); + } + + err = tdh_vp_create(kvm_tdx->tdr_pa, tdvpr_pa); + if (KVM_BUG_ON(err, vcpu->kvm)) { + ret = -EIO; + pr_tdx_error(TDH_VP_CREATE, err, NULL); + goto free_tdvpx; + } + tdx->tdvpr_pa = tdvpr_pa; + + tdx->tdvpx_pa = tdvpx_pa; + for (i = 0; i < tdx_info->nr_tdvpx_pages; i++) { + err = tdh_vp_addcx(tdx->tdvpr_pa, tdvpx_pa[i]); + if (KVM_BUG_ON(err, vcpu->kvm)) { + pr_tdx_error(TDH_VP_ADDCX, err, NULL); + for (; i < tdx_info->nr_tdvpx_pages; i++) { + free_page((unsigned long)__va(tdvpx_pa[i])); + tdvpx_pa[i] = 0; + } + /* vcpu_free method frees TDVPX and TDR donated to TDX */ + return -EIO; + } + } + + err = tdh_vp_init(tdx->tdvpr_pa, vcpu_rcx); + if (KVM_BUG_ON(err, vcpu->kvm)) { + pr_tdx_error(TDH_VP_INIT, err, NULL); + return -EIO; + } + + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + tdx->td_vcpu_created = true; + return 0; + +free_tdvpx: + for (i = 0; i < tdx_info->nr_tdvpx_pages; i++) { + if (tdvpx_pa[i]) + free_page((unsigned long)__va(tdvpx_pa[i])); + tdvpx_pa[i] = 0; + } + kfree(tdvpx_pa); + tdx->tdvpx_pa = NULL; +free_tdvpr: + if (tdvpr_pa) + free_page((unsigned long)__va(tdvpr_pa)); + tdx->tdvpr_pa = 0; + + return ret; +} + +int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) +{ + struct msr_data apic_base_msr; + struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); + struct vcpu_tdx *tdx = to_tdx(vcpu); + struct kvm_tdx_cmd cmd; + int ret; + + if (tdx->initialized) + return -EINVAL; + + if (!is_hkid_assigned(kvm_tdx) || is_td_finalized(kvm_tdx)) + return -EINVAL; + + if (copy_from_user(&cmd, argp, sizeof(cmd))) + return 
-EFAULT; + + if (cmd.error) + return -EINVAL; + + /* Currently only KVM_TDX_INTI_VCPU is defined for vcpu operation. */ + if (cmd.flags || cmd.id != KVM_TDX_INIT_VCPU) + return -EINVAL; + + /* + * As TDX requires X2APIC, set local apic mode to X2APIC. User space + * VMM, e.g. qemu, is required to set CPUID[0x1].ecx.X2APIC=1 by + * KVM_SET_CPUID2. Otherwise kvm_set_apic_base() will fail. + */ + apic_base_msr = (struct msr_data) { + .host_initiated = true, + .data = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC | + (kvm_vcpu_is_reset_bsp(vcpu) ? MSR_IA32_APICBASE_BSP : 0), + }; + if (kvm_set_apic_base(vcpu, &apic_base_msr)) + return -EINVAL; + + ret = tdx_td_vcpu_init(vcpu, (u64)cmd.data); + if (ret) + return ret; + + tdx->initialized = true; + return 0; +} + #define TDX_MD_MAP(_fid, _ptr) \ { .fid = MD_FIELD_ID_##_fid, \ .ptr = (_ptr), } @@ -953,13 +1125,14 @@ static int tdx_md_read(struct tdx_md_map *maps, int nr_maps) static int __init tdx_module_setup(void) { - u16 num_cpuid_config, tdcs_base_size; + u16 num_cpuid_config, tdcs_base_size, tdvps_base_size; int ret; u32 i; struct tdx_md_map mds[] = { TDX_MD_MAP(NUM_CPUID_CONFIG, &num_cpuid_config), TDX_MD_MAP(TDCS_BASE_SIZE, &tdcs_base_size), + TDX_MD_MAP(TDVPS_BASE_SIZE, &tdvps_base_size), }; struct tdx_metadata_field_mapping fields[] = { @@ -1013,6 +1186,11 @@ static int __init tdx_module_setup(void) } tdx_info->nr_tdcs_pages = tdcs_base_size / PAGE_SIZE; + /* + * TDVPS = TDVPR(4K page) + TDVPX(multiple 4K pages). + * -1 for TDVPR. 
+ */ + tdx_info->nr_tdvpx_pages = tdvps_base_size / PAGE_SIZE - 1; /* * Make TDH.VP.ENTER preserve RBP so that the stack unwinder diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h index 173ed19207fb..d3077151252c 100644 --- a/arch/x86/kvm/vmx/tdx.h +++ b/arch/x86/kvm/vmx/tdx.h @@ -17,12 +17,20 @@ struct kvm_tdx { u64 xfam; int hkid; + bool finalized; + u64 tsc_offset; }; struct vcpu_tdx { struct kvm_vcpu vcpu; + unsigned long tdvpr_pa; + unsigned long *tdvpx_pa; + bool td_vcpu_created; + + bool initialized; + /* * Dummy to make pmu_intel not corrupt memory. * TODO: Support PMU for TDX. Future work. diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h index bb73a9b5b354..f5820f617b2e 100644 --- a/arch/x86/kvm/vmx/x86_ops.h +++ b/arch/x86/kvm/vmx/x86_ops.h @@ -150,6 +150,8 @@ int tdx_vm_ioctl(struct kvm *kvm, void __user *argp); int tdx_vcpu_create(struct kvm_vcpu *vcpu); void tdx_vcpu_free(struct kvm_vcpu *vcpu); void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event); + +int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp); #else static inline int tdx_hardware_setup(struct kvm_x86_ops *x86_ops) { return -EOPNOTSUPP; } static inline void tdx_hardware_unsetup(void) {} @@ -169,6 +171,8 @@ static inline int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) { return -EOP static inline int tdx_vcpu_create(struct kvm_vcpu *vcpu) { return -EOPNOTSUPP; } static inline void tdx_vcpu_free(struct kvm_vcpu *vcpu) {} static inline void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) {} + +static inline int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) { return -EOPNOTSUPP; } #endif #endif /* __KVM_X86_VMX_X86_OPS_H */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c002761bb662..2bd4b7c8fa51 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6274,6 +6274,12 @@ long kvm_arch_vcpu_ioctl(struct file *filp, case KVM_SET_DEVICE_ATTR: r = kvm_vcpu_ioctl_device_attr(vcpu, ioctl, argp); break; + case 
KVM_MEMORY_ENCRYPT_OP: + r = -ENOTTY; + if (!kvm_x86_ops.vcpu_mem_enc_ioctl) + goto out; + r = kvm_x86_ops.vcpu_mem_enc_ioctl(vcpu, argp); + break; default: r = -EINVAL; }