@@ -114,10 +114,12 @@ static inline u64 sc_retry(sc_func_t func, u64 fn,
bool platform_tdx_enabled(void);
int tdx_cpu_enable(void);
int tdx_enable(void);
+void tdx_reset_memory(void);
#else
static inline bool platform_tdx_enabled(void) { return false; }
static inline int tdx_cpu_enable(void) { return -ENODEV; }
static inline int tdx_enable(void) { return -ENODEV; }
+static inline void tdx_reset_memory(void) { }
#endif /* CONFIG_INTEL_TDX_HOST */
#endif /* !__ASSEMBLY__ */
@@ -28,6 +28,7 @@
#include <asm/setup.h>
#include <asm/set_memory.h>
#include <asm/cpu.h>
+#include <asm/tdx.h>
#ifdef CONFIG_ACPI
/*
@@ -301,9 +302,24 @@ void machine_kexec(struct kimage *image)
void *control_page;
int save_ftrace_enabled;
+ /*
+ * For platforms with TDX "partial write machine check" erratum,
+ * all TDX private pages need to be converted back to normal
+	 * before booting to the new kernel, otherwise the new kernel
+	 * may get unexpected machine checks.
+ *
+ * But skip this when preserve_context is on. The second kernel
+ * shouldn't write to the first kernel's memory anyway. Skipping
+ * this also avoids killing TDX in the first kernel, which would
+ * require more complicated handling.
+ */
#ifdef CONFIG_KEXEC_JUMP
if (image->preserve_context)
save_processor_state();
+ else
+ tdx_reset_memory();
+#else
+ tdx_reset_memory();
#endif
save_ftrace_enabled = __ftrace_enabled_save();
@@ -25,6 +25,7 @@
#include <linux/align.h>
#include <linux/sort.h>
#include <linux/log2.h>
+#include <linux/reboot.h>
#include <asm/msr-index.h>
#include <asm/msr.h>
#include <asm/page.h>
@@ -46,6 +47,8 @@ static LIST_HEAD(tdx_memlist);
static struct tdmr_info_list tdx_tdmr_list;
+static bool tdx_rebooting;
+
typedef void (*sc_err_func_t)(u64 fn, u64 err, struct tdx_module_args *args);
static inline void seamcall_err(u64 fn, u64 err, struct tdx_module_args *args)
@@ -1159,6 +1162,9 @@ static int __tdx_enable(void)
{
int ret;
+ if (tdx_rebooting)
+ return -EAGAIN;
+
ret = init_tdx_module();
if (ret) {
pr_err("module initialization failed (%d)\n", ret);
@@ -1217,6 +1223,69 @@ int tdx_enable(void)
}
EXPORT_SYMBOL_GPL(tdx_enable);
+/*
+ * Convert TDX private pages back to normal on platforms with the
+ * "partial write machine check" erratum.
+ *
+ * Called from machine_kexec() before booting to the new kernel.
+ */
+void tdx_reset_memory(void)
+{
+	if (!platform_tdx_enabled())
+		return;
+
+	/*
+	 * Kernel reads/writes to TDX private memory don't
+	 * cause machine checks on hardware w/o this erratum.
+	 */
+	if (!boot_cpu_has_bug(X86_BUG_TDX_PW_MCE))
+		return;
+
+	/* Called during kexec when only the rebooting CPU is alive */
+	WARN_ON_ONCE(num_online_cpus() != 1);
+
+	/*
+	 * tdx_reboot_notifier() waits for any ongoing TDX module
+	 * initialization to finish, and module initialization is
+	 * rejected after that.  Therefore @tdx_module_status is
+	 * stable here and can be read w/o holding lock.
+	 */
+	if (tdx_module_status != TDX_MODULE_INITIALIZED)
+		return;
+
+	/*
+	 * Convert PAMTs back to normal.  All other cpus are already
+	 * dead and TDMRs/PAMTs are stable.
+	 *
+	 * Ideally it's better to cover all types of TDX private pages
+	 * here, but it's impractical:
+	 *
+	 *  - There's no existing infrastructure to tell whether a page
+	 *    is TDX private memory or not.
+	 *
+	 *  - Using SEAMCALL to query the TDX module isn't feasible:
+	 *    - VMX has been turned off by reaching here so SEAMCALL
+	 *      cannot be made;
+	 *    - Even if SEAMCALL could be made the result from TDX module
+	 *      may not be accurate (e.g., remote CPU can be stopped while
+	 *      the kernel is in the middle of reclaiming a TDX private
+	 *      page and doing MOVDIR64B).
+	 *
+	 * One temporary solution could be just converting all memory
+	 * pages, but it's problematic too, because not all pages are
+	 * mapped as writable in the direct mapping.  It can be done by
+	 * switching to the identity mapping for kexec() or a new page
+	 * table which maps all pages as writable, but the complexity is
+	 * overkill.
+	 *
+	 * Thus instead of doing something dramatic to convert all pages,
+	 * only convert PAMTs here.  Other kernel components which use
+	 * TDX need to do the conversion on their own by intercepting the
+	 * reboot/shutdown notifier (KVM already does that).
+	 */
+	tdmrs_reset_pamt_all(&tdx_tdmr_list);
+}
+
static int __init record_keyid_partitioning(u32 *tdx_keyid_start,
u32 *nr_tdx_keyids)
{
@@ -1295,6 +1364,21 @@ static struct notifier_block tdx_memory_nb = {
.notifier_call = tdx_memory_notifier,
};
+static int tdx_reboot_notifier(struct notifier_block *nb, unsigned long mode,
+			       void *unused)
+{
+	/* Wait for any ongoing TDX module initialization to finish */
+	mutex_lock(&tdx_module_lock);
+	tdx_rebooting = true;
+	mutex_unlock(&tdx_module_lock);
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block tdx_reboot_nb = {
+	.notifier_call = tdx_reboot_notifier,
+};
+
static int __init tdx_init(void)
{
u32 tdx_keyid_start, nr_tdx_keyids;
@@ -1325,6 +1409,14 @@ static int __init tdx_init(void)
return -ENODEV;
}
+ err = register_reboot_notifier(&tdx_reboot_nb);
+ if (err) {
+ pr_err("initialization failed: register_reboot_notifier() failed (%d)\n",
+ err);
+ unregister_memory_notifier(&tdx_memory_nb);
+ return -ENODEV;
+ }
+
/*
* Just use the first TDX KeyID as the 'global KeyID' and
* leave the rest for TDX guests.