@@ -21,6 +21,7 @@ KVM_X86_OP(has_emulated_msr)
KVM_X86_OP(vcpu_after_set_cpuid)
KVM_X86_OP(vm_init)
KVM_X86_OP_OPTIONAL(vm_destroy)
+KVM_X86_OP_OPTIONAL(vm_free)
KVM_X86_OP_OPTIONAL_RET0(vcpu_precreate)
KVM_X86_OP(vcpu_create)
KVM_X86_OP(vcpu_free)
@@ -1647,6 +1647,7 @@ struct kvm_x86_ops {
unsigned int vm_size;
int (*vm_init)(struct kvm *kvm);
void (*vm_destroy)(struct kvm *kvm);
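+	/* Called at the end of kvm_arch_destroy_vm(), after all other teardown */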
+ void (*vm_free)(struct kvm *kvm);
/* Create, but do not attach this VCPU */
int (*vcpu_precreate)(struct kvm *kvm);
@@ -92,6 +92,8 @@ config KVM_SW_PROTECTED_VM
config KVM_INTEL
tristate "KVM for Intel (and compatible) processors support"
depends on KVM && IA32_FEAT_CTL
+ select KVM_GENERIC_PRIVATE_MEM if INTEL_TDX_HOST
+ select KVM_GENERIC_MEMORY_ATTRIBUTES if INTEL_TDX_HOST
help
Provides support for KVM on processors equipped with Intel's VT
extensions, a.k.a. Virtual Machine Extensions (VMX).
@@ -41,6 +41,28 @@ static __init int vt_hardware_setup(void)
return 0;
}
+static int vt_vm_init(struct kvm *kvm)
+{
+ if (is_td(kvm))
+ return tdx_vm_init(kvm);
+
+ return vmx_vm_init(kvm);
+}
+
+static void vt_vm_destroy(struct kvm *kvm)
+{
+	if (is_td(kvm)) {
+		tdx_mmu_release_hkid(kvm);
+		return;
+	}
+
+	vmx_vm_destroy(kvm);
+}
+
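+/*
+ * vm_free runs at the very end of kvm_arch_destroy_vm(), after KVM has
+ * freed all other VM resources, so TDX can safely reclaim the TD control
+ * pages here.
+ */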
+static void vt_vm_free(struct kvm *kvm)
+{
+ if (is_td(kvm))
+ tdx_vm_free(kvm);
+}
+
static int vt_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
{
if (!is_td(kvm))
@@ -72,8 +94,10 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
.has_emulated_msr = vmx_has_emulated_msr,
.vm_size = sizeof(struct kvm_vmx),
- .vm_init = vmx_vm_init,
- .vm_destroy = vmx_vm_destroy,
+
+ .vm_init = vt_vm_init,
+ .vm_destroy = vt_vm_destroy,
+ .vm_free = vt_vm_free,
.vcpu_precreate = vmx_vcpu_precreate,
.vcpu_create = vmx_vcpu_create,
@@ -110,6 +110,285 @@ static int init_kvm_tdx_caps(const struct tdx_sys_info_td_conf *td_conf,
return 0;
}
+/*
+ * Some SEAMCALLs acquire the TDX module's global lock and can fail with
+ * TDX_OPERAND_BUSY. Use a global mutex to serialize these SEAMCALLs.
+ */
+static DEFINE_MUTEX(tdx_lock);
+
+/* Maximum number of retries to attempt for SEAMCALLs. */
+#define TDX_SEAMCALL_RETRIES 10000
+
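+/*
+ * A TDX private host key ID (HKID) is encoded into the physical address
+ * bits above the platform's maximum physical-address width.
+ */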
+static __always_inline hpa_t set_hkid_to_hpa(hpa_t pa, u16 hkid)
+{
+ return pa | ((hpa_t)hkid << boot_cpu_data.x86_phys_bits);
+}
+
+static inline void tdx_hkid_free(struct kvm_tdx *kvm_tdx)
+{
+ tdx_guest_keyid_free(kvm_tdx->hkid);
+ kvm_tdx->hkid = -1;
+}
+
+static inline bool is_hkid_assigned(struct kvm_tdx *kvm_tdx)
+{
+ return kvm_tdx->hkid > 0;
+}
+
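+/*
+ * Zero a previously private page with MOVDIR64B, using the kernel zero
+ * page as the 64-byte source of each store.
+ */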
+static void tdx_clear_page(unsigned long page_pa)
+{
+ const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
+ void *page = __va(page_pa);
+ unsigned long i;
+
+ /*
+ * The page could have been poisoned. MOVDIR64B also clears
+ * the poison bit so the kernel can safely use the page again.
+ */
+ for (i = 0; i < PAGE_SIZE; i += 64)
+ movdir64b(page + i, zero_page);
+	/*
+	 * MOVDIR64B stores go through a WC buffer. Use a memory barrier to
+	 * prevent subsequent reads from seeing potentially poisoned cache
+	 * lines.
+	 */
+ __mb();
+}
+
+/* TDH.PHYMEM.PAGE.RECLAIM is allowed only when destroying the TD. */
+static int __tdx_reclaim_page(hpa_t pa)
+{
+ u64 err, rcx, rdx, r8;
+ int i;
+
+ for (i = TDX_SEAMCALL_RETRIES; i > 0; i--) {
+ err = tdh_phymem_page_reclaim(pa, &rcx, &rdx, &r8);
+
+		/*
+		 * TDH.PHYMEM.PAGE.RECLAIM is allowed only when the TD is in
+		 * the shutdown state, i.e. while the TD is being destroyed.
+		 * TDH.PHYMEM.PAGE.RECLAIM takes the TDR and the target page
+		 * as operands; because the TD is being destroyed, contention
+		 * on the TDR is rare.
+		 */
+ switch (err) {
+ case TDX_OPERAND_BUSY | TDX_OPERAND_ID_RCX:
+ case TDX_OPERAND_BUSY | TDX_OPERAND_ID_TDR:
+ cond_resched();
+ continue;
+ default:
+ goto out;
+ }
+ }
+
+out:
+ if (WARN_ON_ONCE(err)) {
+ pr_tdx_error_3(TDH_PHYMEM_PAGE_RECLAIM, err, rcx, rdx, r8);
+ return -EIO;
+ }
+ return 0;
+}
+
+static int tdx_reclaim_page(hpa_t pa)
+{
+ int r;
+
+ r = __tdx_reclaim_page(pa);
+ if (!r)
+ tdx_clear_page(pa);
+ return r;
+}
+
+/*
+ * Reclaim a TD control page that is crypto-protected by the TDX guest's
+ * private KeyID. Assume the cache associated with the TDX private KeyID
+ * has been flushed.
+ */
+static void tdx_reclaim_control_page(unsigned long ctrl_page_pa)
+{
+	/*
+	 * Leak the page if the kernel fails to reclaim it; the kernel can
+	 * no longer use it safely.
+	 */
+ if (tdx_reclaim_page(ctrl_page_pa))
+ return;
+
+ free_page((unsigned long)__va(ctrl_page_pa));
+}
+
+static void smp_func_do_phymem_cache_wb(void *unused)
+{
+ u64 err = 0;
+ bool resume;
+ int i;
+
+	/*
+	 * TDH.PHYMEM.CACHE.WB flushes caches associated with any TDX private
+	 * KeyID on the package or core. The TDX module may not finish the
+	 * cache flush and instead return TDX_INTERRUPTED_RESUMABLE. The
+	 * kernel should retry until it returns success, without rescheduling
+	 * in between.
+	 */
+ for (i = TDX_SEAMCALL_RETRIES; i > 0; i--) {
+ resume = !!err;
+ err = tdh_phymem_cache_wb(resume);
+ switch (err) {
+ case TDX_INTERRUPTED_RESUMABLE:
+ continue;
+ case TDX_NO_HKID_READY_TO_WBCACHE:
+ err = TDX_SUCCESS; /* Already done by other thread */
+ fallthrough;
+ default:
+ goto out;
+ }
+ }
+
+out:
+ if (WARN_ON_ONCE(err))
+ pr_tdx_error(TDH_PHYMEM_CACHE_WB, err);
+}
+
+void tdx_mmu_release_hkid(struct kvm *kvm)
+{
+ bool packages_allocated, targets_allocated;
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ cpumask_var_t packages, targets;
+ u64 err;
+ int i;
+
+ if (!is_hkid_assigned(kvm_tdx))
+ return;
+
+ /* KeyID has been allocated but guest is not yet configured */
+ if (!kvm_tdx->tdr_pa) {
+ tdx_hkid_free(kvm_tdx);
+ return;
+ }
+
+ packages_allocated = zalloc_cpumask_var(&packages, GFP_KERNEL);
+ targets_allocated = zalloc_cpumask_var(&targets, GFP_KERNEL);
+ cpus_read_lock();
+
+	/*
+	 * TDH.PHYMEM.CACHE.WB tries to acquire the TDX module's global lock
+	 * and can fail with TDX_OPERAND_BUSY when it doesn't get the lock.
+	 * Multiple TDX guests can be destroyed simultaneously; take the
+	 * mutex to prevent concurrent destruction from hitting that error.
+	 */
+ mutex_lock(&tdx_lock);
+
+	/*
+	 * Releasing the HKID is done in vm_destroy(). At this point all
+	 * vCPU fds have been released, so there are no more vCPU
+	 * associations with this TD.
+	 */
+ for_each_online_cpu(i) {
+ if (packages_allocated &&
+ cpumask_test_and_set_cpu(topology_physical_package_id(i),
+ packages))
+ continue;
+ if (targets_allocated)
+ cpumask_set_cpu(i, targets);
+ }
+ if (targets_allocated)
+ on_each_cpu_mask(targets, smp_func_do_phymem_cache_wb, NULL, true);
+ else
+ on_each_cpu(smp_func_do_phymem_cache_wb, NULL, true);
+ /*
+ * In the case of error in smp_func_do_phymem_cache_wb(), the following
+ * tdh_mng_key_freeid() will fail.
+ */
+ err = tdh_mng_key_freeid(kvm_tdx->tdr_pa);
+ if (KVM_BUG_ON(err, kvm)) {
+ pr_tdx_error(TDH_MNG_KEY_FREEID, err);
+ pr_err("tdh_mng_key_freeid() failed. HKID %d is leaked.\n",
+ kvm_tdx->hkid);
+ } else {
+ tdx_hkid_free(kvm_tdx);
+ }
+
+ mutex_unlock(&tdx_lock);
+ cpus_read_unlock();
+ free_cpumask_var(targets);
+ free_cpumask_var(packages);
+}
+
+static void tdx_reclaim_td_control_pages(struct kvm *kvm)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ u64 err;
+ int i;
+
+	/*
+	 * tdx_mmu_release_hkid() failed to reclaim the HKID, meaning
+	 * something went badly wrong with the TDX module. Give up freeing
+	 * the TD pages. tdx_mmu_release_hkid() already warned, so don't
+	 * warn again.
+	 */
+ if (is_hkid_assigned(kvm_tdx))
+ return;
+
+ if (kvm_tdx->tdcs_pa) {
+ for (i = 0; i < kvm_tdx->nr_tdcs_pages; i++) {
+ if (!kvm_tdx->tdcs_pa[i])
+ continue;
+
+ tdx_reclaim_control_page(kvm_tdx->tdcs_pa[i]);
+ }
+ kfree(kvm_tdx->tdcs_pa);
+ kvm_tdx->tdcs_pa = NULL;
+ }
+
+ if (!kvm_tdx->tdr_pa)
+ return;
+
+ if (__tdx_reclaim_page(kvm_tdx->tdr_pa))
+ return;
+
+	/*
+	 * Use a SEAMCALL to ask the TDX module to flush the cache based on
+	 * the KeyID. The TDX module may access the TDR while operating on
+	 * the TD (especially when reclaiming the TDCS).
+	 */
+ err = tdh_phymem_page_wbinvd_tdr(kvm_tdx->tdr_pa);
+ if (KVM_BUG_ON(err, kvm)) {
+ pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err);
+ return;
+ }
+ tdx_clear_page(kvm_tdx->tdr_pa);
+
+ free_page((unsigned long)__va(kvm_tdx->tdr_pa));
+ kvm_tdx->tdr_pa = 0;
+}
+
+void tdx_vm_free(struct kvm *kvm)
+{
+ tdx_reclaim_td_control_pages(kvm);
+}
+
+static int tdx_do_tdh_mng_key_config(void *param)
+{
+ struct kvm_tdx *kvm_tdx = param;
+ u64 err;
+
+ /* TDX_RND_NO_ENTROPY related retries are handled by sc_retry() */
+ err = tdh_mng_key_config(kvm_tdx->tdr_pa);
+
+ if (KVM_BUG_ON(err, &kvm_tdx->kvm)) {
+ pr_tdx_error(TDH_MNG_KEY_CONFIG, err);
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static int __tdx_td_init(struct kvm *kvm);
+
+int tdx_vm_init(struct kvm *kvm)
+{
+ kvm->arch.has_private_mem = true;
+
+	/* Placeholder for additional TDX-specific logic. */
+ return __tdx_td_init(kvm);
+}
+
static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd)
{
const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
@@ -158,6 +437,180 @@ static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd)
return ret;
}
+static int __tdx_td_init(struct kvm *kvm)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ cpumask_var_t packages;
+ unsigned long *tdcs_pa = NULL;
+ unsigned long tdr_pa = 0;
+ unsigned long va;
+ int ret, i;
+ u64 err;
+
+ ret = tdx_guest_keyid_alloc();
+ if (ret < 0)
+ return ret;
+ kvm_tdx->hkid = ret;
+
+	ret = -ENOMEM;
+
+	va = __get_free_page(GFP_KERNEL_ACCOUNT);
+	if (!va)
+		goto free_hkid;
+ tdr_pa = __pa(va);
+
+ kvm_tdx->nr_tdcs_pages = tdx_sysinfo->td_ctrl.tdcs_base_size / PAGE_SIZE;
+	tdcs_pa = kcalloc(kvm_tdx->nr_tdcs_pages, sizeof(*kvm_tdx->tdcs_pa),
+			  GFP_KERNEL_ACCOUNT);
+ if (!tdcs_pa)
+ goto free_tdr;
+
+ for (i = 0; i < kvm_tdx->nr_tdcs_pages; i++) {
+ va = __get_free_page(GFP_KERNEL_ACCOUNT);
+ if (!va)
+ goto free_tdcs;
+ tdcs_pa[i] = __pa(va);
+ }
+
+ if (!zalloc_cpumask_var(&packages, GFP_KERNEL)) {
+ ret = -ENOMEM;
+ goto free_tdcs;
+ }
+
+ cpus_read_lock();
+
+	/*
+	 * At least one CPU in each package must be online in order to
+	 * program the host key ID for all packages. Check that this holds.
+	 */
+ for_each_present_cpu(i)
+ cpumask_set_cpu(topology_physical_package_id(i), packages);
+ for_each_online_cpu(i)
+ cpumask_clear_cpu(topology_physical_package_id(i), packages);
+ if (!cpumask_empty(packages)) {
+ ret = -EIO;
+		/*
+		 * Warn, because it's hard for a human operator to figure out
+		 * the reason otherwise.
+		 */
+#define MSG_ALLPKG "All packages need to have online CPU to create TD. Online CPU and retry.\n"
+ pr_warn_ratelimited(MSG_ALLPKG);
+ goto free_packages;
+ }
+
+	/*
+	 * TDH.MNG.CREATE tries to grab the TDX module's global lock and
+	 * fails with TDX_OPERAND_BUSY when it cannot. Take tdx_lock to
+	 * prevent that failure.
+	 */
+ mutex_lock(&tdx_lock);
+ kvm_tdx->tdr_pa = tdr_pa;
+ err = tdh_mng_create(kvm_tdx->tdr_pa, kvm_tdx->hkid);
+ mutex_unlock(&tdx_lock);
+
+ if (err == TDX_RND_NO_ENTROPY) {
+ ret = -EAGAIN;
+ goto free_packages;
+ }
+
+ if (WARN_ON_ONCE(err)) {
+ pr_tdx_error(TDH_MNG_CREATE, err);
+ ret = -EIO;
+ goto free_packages;
+ }
+
+ for_each_online_cpu(i) {
+ int pkg = topology_physical_package_id(i);
+
+ if (cpumask_test_and_set_cpu(pkg, packages))
+ continue;
+
+		/*
+		 * Program the memory controller in the package with an
+		 * encryption key associated with the TDX private host key ID
+		 * assigned to this TDR. Concurrent operations on the same
+		 * memory controller result in TDX_OPERAND_BUSY. No locking is
+		 * needed beyond the cpus_read_lock() above, as it serializes
+		 * against hotplug and the first online CPU of each package is
+		 * always used; two CPUs in the same socket never try to
+		 * program the key concurrently.
+		 */
+ ret = smp_call_on_cpu(i, tdx_do_tdh_mng_key_config,
+ kvm_tdx, true);
+ if (ret)
+ break;
+ }
+ cpus_read_unlock();
+ free_cpumask_var(packages);
+ if (ret) {
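+		/*
+		 * No TDCS pages have been added via TDH.MNG.ADDCX yet, so
+		 * reset 'i' so that the teardown path frees all of them.
+		 */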
+ i = 0;
+ goto teardown;
+ }
+
+ kvm_tdx->tdcs_pa = tdcs_pa;
+ for (i = 0; i < kvm_tdx->nr_tdcs_pages; i++) {
+ err = tdh_mng_addcx(kvm_tdx->tdr_pa, tdcs_pa[i]);
+ if (err == TDX_RND_NO_ENTROPY) {
+ /* Here it's hard to allow userspace to retry. */
+ ret = -EBUSY;
+ goto teardown;
+ }
+ if (WARN_ON_ONCE(err)) {
+ pr_tdx_error(TDH_MNG_ADDCX, err);
+ ret = -EIO;
+ goto teardown;
+ }
+ }
+
+	/*
+	 * Note, TDH_MNG_INIT cannot be invoked here. TDH_MNG_INIT requires
+	 * a dedicated ioctl() to configure the CPUID values for the TD.
+	 */
+ return 0;
+
+	/*
+	 * The sequence for freeing resources from a partially initialized TD
+	 * varies based on where in the initialization flow failure occurred.
+	 * Simply use the full teardown and destroy path, which naturally
+	 * handles partial initialization.
+	 */
+teardown:
+ /* Only free pages not yet added, so start at 'i' */
+ for (; i < kvm_tdx->nr_tdcs_pages; i++) {
+ if (tdcs_pa[i]) {
+ free_page((unsigned long)__va(tdcs_pa[i]));
+ tdcs_pa[i] = 0;
+ }
+ }
+ if (!kvm_tdx->tdcs_pa)
+ kfree(tdcs_pa);
+
+ tdx_mmu_release_hkid(kvm);
+ tdx_reclaim_td_control_pages(kvm);
+
+ return ret;
+
+free_packages:
+ cpus_read_unlock();
+ free_cpumask_var(packages);
+
+free_tdcs:
+ for (i = 0; i < kvm_tdx->nr_tdcs_pages; i++) {
+ if (tdcs_pa[i])
+ free_page((unsigned long)__va(tdcs_pa[i]));
+ }
+ kfree(tdcs_pa);
+ kvm_tdx->tdcs_pa = NULL;
+
+free_tdr:
+ if (tdr_pa)
+ free_page((unsigned long)__va(tdr_pa));
+ kvm_tdx->tdr_pa = 0;
+
+free_hkid:
+ tdx_hkid_free(kvm_tdx);
+
+ return ret;
+}
+
int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
{
struct kvm_tdx_cmd tdx_cmd;
@@ -251,6 +704,11 @@ static int __init __tdx_bringup(void)
{
int r;
+ if (!cpu_feature_enabled(X86_FEATURE_MOVDIR64B)) {
+		pr_warn("MOVDIR64B is required for TDX\n");
+ return -EOPNOTSUPP;
+ }
+
if (!enable_ept) {
pr_err("Cannot enable TDX with EPT disabled.\n");
return -EINVAL;
@@ -12,7 +12,12 @@ extern bool enable_tdx;
struct kvm_tdx {
struct kvm kvm;
- /* TDX specific members follow. */
+
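+	/* Physical addresses of the TDR page and the TDCS pages */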
+ unsigned long tdr_pa;
+ unsigned long *tdcs_pa;
+
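+	/* TDX private host key ID (HKID); -1 when not assigned */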
+ int hkid;
+ u8 nr_tdcs_pages;
};
struct vcpu_tdx {
@@ -119,8 +119,14 @@ void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu);
void vmx_setup_mce(struct kvm_vcpu *vcpu);
#ifdef CONFIG_INTEL_TDX_HOST
+int tdx_vm_init(struct kvm *kvm);
+void tdx_mmu_release_hkid(struct kvm *kvm);
+void tdx_vm_free(struct kvm *kvm);
int tdx_vm_ioctl(struct kvm *kvm, void __user *argp);
#else
+static inline int tdx_vm_init(struct kvm *kvm) { return -EOPNOTSUPP; }
+static inline void tdx_mmu_release_hkid(struct kvm *kvm) {}
+static inline void tdx_vm_free(struct kvm *kvm) {}
static inline int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) { return -EOPNOTSUPP; }
#endif
@@ -12883,6 +12883,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
kvm_page_track_cleanup(kvm);
kvm_xen_destroy_vm(kvm);
kvm_hv_destroy_vm(kvm);
+ static_call_cond(kvm_x86_vm_free)(kvm);
}
static void memslot_rmap_free(struct kvm_memory_slot *slot)