Message ID | 999b47f30fbe2535c37a5e8d602c6c27ac6212dd.1687784645.git.kai.huang@intel.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | TDX host kernel support | expand |
On Tue, Jun 27, 2023 at 02:12:39AM +1200, Kai Huang wrote: > +static int tdx_memory_notifier(struct notifier_block *nb, unsigned long action, > + void *v) > +{ > + struct memory_notify *mn = v; > + > + if (action != MEM_GOING_ONLINE) > + return NOTIFY_OK; So offlining TDX memory is ok? > + > + /* > + * Empty list means TDX isn't enabled. Allow any memory > + * to go online. > + */ > + if (list_empty(&tdx_memlist)) > + return NOTIFY_OK; > + > + /* > + * The TDX memory configuration is static and can not be > + * changed. Reject onlining any memory which is outside of > + * the static configuration whether it supports TDX or not. > + */ > + return is_tdx_memory(mn->start_pfn, mn->start_pfn + mn->nr_pages) ? > + NOTIFY_OK : NOTIFY_BAD; if (is_tdx_memory(...)) return NOTIFY_OK; return NOTIFY_BAD; > +}
On Wed, 2023-06-28 at 16:17 +0200, Peter Zijlstra wrote: > On Tue, Jun 27, 2023 at 02:12:39AM +1200, Kai Huang wrote: > > > +static int tdx_memory_notifier(struct notifier_block *nb, unsigned long action, > > + void *v) > > +{ > > + struct memory_notify *mn = v; > > + > > + if (action != MEM_GOING_ONLINE) > > + return NOTIFY_OK; > > So offlining TDX memory is ok? Yes. We want to support normal software memory hotplug logic even when TDX is enabled. User can offline part of memory and then online it again. > > > + > > + /* > > + * Empty list means TDX isn't enabled. Allow any memory > > + * to go online. > > + */ > > + if (list_empty(&tdx_memlist)) > > + return NOTIFY_OK; > > + > > + /* > > + * The TDX memory configuration is static and can not be > > + * changed. Reject onlining any memory which is outside of > > + * the static configuration whether it supports TDX or not. > > + */ > > + return is_tdx_memory(mn->start_pfn, mn->start_pfn + mn->nr_pages) ? > > + NOTIFY_OK : NOTIFY_BAD; > > if (is_tdx_memory(...)) > return NOTIFY_OK; > > return NOTIFY_BAD; > Sure will do. Thanks! >
[...] > +/* All TDX-usable memory regions. Protected by mem_hotplug_lock. */ > +static LIST_HEAD(tdx_memlist); > + > /* > * Wrapper of __seamcall() to convert SEAMCALL leaf function error code > * to kernel error code. @seamcall_ret and @out contain the SEAMCALL > @@ -204,6 +214,79 @@ static int tdx_get_sysinfo(struct tdsysinfo_struct *sysinfo, > return 0; > } > > +/* > + * Add a memory region as a TDX memory block. The caller must make sure > + * all memory regions are added in address ascending order and don't > + * overlap. > + */ > +static int add_tdx_memblock(struct list_head *tmb_list, unsigned long start_pfn, > + unsigned long end_pfn) > +{ > + struct tdx_memblock *tmb; > + > + tmb = kmalloc(sizeof(*tmb), GFP_KERNEL); > + if (!tmb) > + return -ENOMEM; > + > + INIT_LIST_HEAD(&tmb->list); > + tmb->start_pfn = start_pfn; > + tmb->end_pfn = end_pfn; > + > + /* @tmb_list is protected by mem_hotplug_lock */ If the list is static and independent of memory hotplug, why does it have to be protected? I assume because the memory notifier might currently trigger before building the list. Not sure if that is the right approach. See below. > + list_add_tail(&tmb->list, tmb_list); > + return 0; > +} > + > +static void free_tdx_memlist(struct list_head *tmb_list) > +{ > + /* @tmb_list is protected by mem_hotplug_lock */ > + while (!list_empty(tmb_list)) { > + struct tdx_memblock *tmb = list_first_entry(tmb_list, > + struct tdx_memblock, list); > + > + list_del(&tmb->list); > + kfree(tmb); > + } > +} > + > +/* > + * Ensure that all memblock memory regions are convertible to TDX > + * memory. Once this has been established, stash the memblock > + * ranges off in a secondary structure because memblock is modified > + * in memory hotplug while TDX memory regions are fixed. 
> + */ > +static int build_tdx_memlist(struct list_head *tmb_list) > +{ > + unsigned long start_pfn, end_pfn; > + int i, ret; > + > + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { > + /* > + * The first 1MB is not reported as TDX convertible memory. > + * Although the first 1MB is always reserved and won't end up > + * to the page allocator, it is still in memblock's memory > + * regions. Skip them manually to exclude them as TDX memory. > + */ > + start_pfn = max(start_pfn, PHYS_PFN(SZ_1M)); > + if (start_pfn >= end_pfn) > + continue; > + > + /* > + * Add the memory regions as TDX memory. The regions in > + * memblock has already guaranteed they are in address > + * ascending order and don't overlap. > + */ > + ret = add_tdx_memblock(tmb_list, start_pfn, end_pfn); > + if (ret) > + goto err; > + } So at the time init_tdx_module() is called, you simply go over all memblocks. But how can you be sure that they are TDX-capable? While the memory notifier will deny onlining new memory blocks, add_memory() already happened and added a new memory block to the system (and to memblock). See add_memory_resource(). It might be cleaner to build the list once during module init (before any memory hotplug can happen and before we tear down memblock) and not require ARCH_KEEP_MEMBLOCK. Essentially, before registering the notifier. So the list is really static. But maybe I am missing something. > + > + return 0; > +err: > + free_tdx_memlist(tmb_list); > + return ret; > +} > + > static int init_tdx_module(void) > { > struct tdsysinfo_struct *sysinfo; > @@ -230,10 +313,25 @@ static int init_tdx_module(void) > if (ret) > goto out; [...] > > +struct tdx_memblock { > + struct list_head list; > + unsigned long start_pfn; > + unsigned long end_pfn; > +}; If it's never consumed by someone else, maybe keep it local to the c file? > + > struct tdx_module_output; > u64 __seamcall(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9, > struct tdx_module_output *out);
On Tue, 2023-07-11 at 13:38 +0200, David Hildenbrand wrote: > > [...] > > > +/* All TDX-usable memory regions. Protected by mem_hotplug_lock. */ > > +static LIST_HEAD(tdx_memlist); > > + > > /* > > * Wrapper of __seamcall() to convert SEAMCALL leaf function error code > > * to kernel error code. @seamcall_ret and @out contain the SEAMCALL > > @@ -204,6 +214,79 @@ static int tdx_get_sysinfo(struct tdsysinfo_struct *sysinfo, > > return 0; > > } > > > > +/* > > + * Add a memory region as a TDX memory block. The caller must make sure > > + * all memory regions are added in address ascending order and don't > > + * overlap. > > + */ > > +static int add_tdx_memblock(struct list_head *tmb_list, unsigned long start_pfn, > > + unsigned long end_pfn) > > +{ > > + struct tdx_memblock *tmb; > > + > > + tmb = kmalloc(sizeof(*tmb), GFP_KERNEL); > > + if (!tmb) > > + return -ENOMEM; > > + > > + INIT_LIST_HEAD(&tmb->list); > > + tmb->start_pfn = start_pfn; > > + tmb->end_pfn = end_pfn; > > + > > + /* @tmb_list is protected by mem_hotplug_lock */ > > If the list is static and independent of memory hotplug, why does it > have to be protected? Thanks for review! The @tdx_memlist itself is a static variable, but the elements in the list are built during module initialization, so we need to protect the list from memory hotplug code path. > > I assume because the memory notifier might currently trigger before > building the list. > > Not sure if that is the right approach. See below. > > > + list_add_tail(&tmb->list, tmb_list); > > + return 0; > > +} > > + > > +static void free_tdx_memlist(struct list_head *tmb_list) > > +{ > > + /* @tmb_list is protected by mem_hotplug_lock */ > > + while (!list_empty(tmb_list)) { > > + struct tdx_memblock *tmb = list_first_entry(tmb_list, > > + struct tdx_memblock, list); > > + > > + list_del(&tmb->list); > > + kfree(tmb); > > + } > > +} > > + > > +/* > > + * Ensure that all memblock memory regions are convertible to TDX > > + * memory. 
Once this has been established, stash the memblock > > + * ranges off in a secondary structure because memblock is modified > > + * in memory hotplug while TDX memory regions are fixed. > > + */ > > +static int build_tdx_memlist(struct list_head *tmb_list) > > +{ > > + unsigned long start_pfn, end_pfn; > > + int i, ret; > > + > > + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { > > + /* > > + * The first 1MB is not reported as TDX convertible memory. > > + * Although the first 1MB is always reserved and won't end up > > + * to the page allocator, it is still in memblock's memory > > + * regions. Skip them manually to exclude them as TDX memory. > > + */ > > + start_pfn = max(start_pfn, PHYS_PFN(SZ_1M)); > > + if (start_pfn >= end_pfn) > > + continue; > > + > > + /* > > + * Add the memory regions as TDX memory. The regions in > > + * memblock has already guaranteed they are in address > > + * ascending order and don't overlap. > > + */ > > + ret = add_tdx_memblock(tmb_list, start_pfn, end_pfn); > > + if (ret) > > + goto err; > > + } > > So at the time init_tdx_module() is called, you simply go over all > memblocks. > > But how can you be sure that they are TDX-capable? If any memory isn't TDX-capable, the later SEAMCALL TDH.SYS.CONFIG will fail. There's no explicit check to see whether all memblocks are within CMRs here, but depends on the TDH.SYS.CONFIG to do that. This is mainly for code simplicity. > > While the memory notifier will deny onlining new memory blocks, > add_memory() already happened and added a new memory block to the system > (and to memblock). See add_memory_resource(). Yes but this is fine, as long as they are not "plugged" into the buddy system. > > It might be cleaner to build the list once during module init (before > any memory hotplug can happen and before we tear down memblock) and not > require ARCH_KEEP_MEMBLOCK. Essentially, before registering the > notifier. So the list is really static. 
This can be another solution. In fact I tried this before. But one problem is when TDX module initialization happens, some memory may already have been hot-added and/or become online. So during module initialization, we cannot simply pass the TDX memblocks built during kernel boot to the TDX module, but need to verify the current memblocks (this will require ARCH_KEEP_MEMBLOCK) or the online memory_blocks don't contain any memory that isn't in TDX memblocks. To me this approach isn't simpler than the current approach. > > But maybe I am missing something. > > > + > > + return 0; > > +err: > > + free_tdx_memlist(tmb_list); > > + return ret; > > +} > > + > > static int init_tdx_module(void) > > { > > struct tdsysinfo_struct *sysinfo; > > @@ -230,10 +313,25 @@ static int init_tdx_module(void) > > if (ret) > > goto out; > > [...] > > > > > +struct tdx_memblock { > > + struct list_head list; > > + unsigned long start_pfn; > > + unsigned long end_pfn; > > +}; > > If it's never consumed by someone else, maybe keep it local to the c file? We can, and actually I did this in the old versions, but I changed to put here because there's another structure 'struct tdmr_info_list' being added in later patch. Also, if we move this structure to .c file, then we should move all kernel-defined structures/type declarations to the .c file too (for those architecture structures I want to keep them in tdx.h as they are lengthy and can be used by KVM in the future). I somehow found it's not easy to read too. But I am fine with either way. Kirill/Dave, do you have any comments? > > > + > > struct tdx_module_output; > > u64 __seamcall(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9, > > struct tdx_module_output *out); >
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f0f3f1a2c8e0..2226d8a4c749 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1958,6 +1958,7 @@ config INTEL_TDX_HOST depends on X86_64 depends on KVM_INTEL depends on X86_X2APIC + select ARCH_KEEP_MEMBLOCK help Intel Trust Domain Extensions (TDX) protects guest VMs from malicious host and certain physical attacks. This option enables necessary TDX diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 16babff771bd..fd94f8186b9c 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1159,6 +1159,8 @@ void __init setup_arch(char **cmdline_p) * * Moreover, on machines with SandyBridge graphics or in setups that use * crashkernel the entire 1M is reserved anyway. + * + * Note the host kernel TDX also requires the first 1MB being reserved. */ x86_platform.realmode_reserve(); diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index a2129cbe056e..127036f06752 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -17,6 +17,13 @@ #include <linux/spinlock.h> #include <linux/percpu-defs.h> #include <linux/mutex.h> +#include <linux/list.h> +#include <linux/slab.h> +#include <linux/memblock.h> +#include <linux/memory.h> +#include <linux/minmax.h> +#include <linux/sizes.h> +#include <linux/pfn.h> #include <asm/msr-index.h> #include <asm/msr.h> #include <asm/archrandom.h> @@ -35,6 +42,9 @@ static DEFINE_PER_CPU(bool, tdx_lp_initialized); static enum tdx_module_status_t tdx_module_status; static DEFINE_MUTEX(tdx_module_lock); +/* All TDX-usable memory regions. Protected by mem_hotplug_lock. */ +static LIST_HEAD(tdx_memlist); + /* * Wrapper of __seamcall() to convert SEAMCALL leaf function error code * to kernel error code. @seamcall_ret and @out contain the SEAMCALL @@ -204,6 +214,79 @@ static int tdx_get_sysinfo(struct tdsysinfo_struct *sysinfo, return 0; } +/* + * Add a memory region as a TDX memory block. 
The caller must make sure + * all memory regions are added in address ascending order and don't + * overlap. + */ +static int add_tdx_memblock(struct list_head *tmb_list, unsigned long start_pfn, + unsigned long end_pfn) +{ + struct tdx_memblock *tmb; + + tmb = kmalloc(sizeof(*tmb), GFP_KERNEL); + if (!tmb) + return -ENOMEM; + + INIT_LIST_HEAD(&tmb->list); + tmb->start_pfn = start_pfn; + tmb->end_pfn = end_pfn; + + /* @tmb_list is protected by mem_hotplug_lock */ + list_add_tail(&tmb->list, tmb_list); + return 0; +} + +static void free_tdx_memlist(struct list_head *tmb_list) +{ + /* @tmb_list is protected by mem_hotplug_lock */ + while (!list_empty(tmb_list)) { + struct tdx_memblock *tmb = list_first_entry(tmb_list, + struct tdx_memblock, list); + + list_del(&tmb->list); + kfree(tmb); + } +} + +/* + * Ensure that all memblock memory regions are convertible to TDX + * memory. Once this has been established, stash the memblock + * ranges off in a secondary structure because memblock is modified + * in memory hotplug while TDX memory regions are fixed. + */ +static int build_tdx_memlist(struct list_head *tmb_list) +{ + unsigned long start_pfn, end_pfn; + int i, ret; + + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { + /* + * The first 1MB is not reported as TDX convertible memory. + * Although the first 1MB is always reserved and won't end up + * to the page allocator, it is still in memblock's memory + * regions. Skip them manually to exclude them as TDX memory. + */ + start_pfn = max(start_pfn, PHYS_PFN(SZ_1M)); + if (start_pfn >= end_pfn) + continue; + + /* + * Add the memory regions as TDX memory. The regions in + * memblock has already guaranteed they are in address + * ascending order and don't overlap. 
+ */ + ret = add_tdx_memblock(tmb_list, start_pfn, end_pfn); + if (ret) + goto err; + } + + return 0; +err: + free_tdx_memlist(tmb_list); + return ret; +} + static int init_tdx_module(void) { struct tdsysinfo_struct *sysinfo; @@ -230,10 +313,25 @@ static int init_tdx_module(void) if (ret) goto out; + /* + * To keep things simple, assume that all TDX-protected memory + * will come from the page allocator. Make sure all pages in the + * page allocator are TDX-usable memory. + * + * Build the list of "TDX-usable" memory regions which cover all + * pages in the page allocator to guarantee that. Do it while + * holding mem_hotplug_lock read-lock as the memory hotplug code + * path reads the @tdx_memlist to reject any new memory. + */ + get_online_mems(); + + ret = build_tdx_memlist(&tdx_memlist); + if (ret) + goto out_put_tdxmem; + /* * TODO: * - * - Build the list of TDX-usable memory regions. * - Construct a list of "TD Memory Regions" (TDMRs) to cover * all TDX-usable memory regions. * - Configure the TDMRs and the global KeyID to the TDX module. @@ -243,6 +341,12 @@ static int init_tdx_module(void) * Return error before all steps are done. */ ret = -EINVAL; +out_put_tdxmem: + /* + * @tdx_memlist is written here and read at memory hotplug time. + * Lock out memory hotplug code while building it. + */ + put_online_mems(); out: /* * For now both @sysinfo and @cmr_array are only used during @@ -339,6 +443,54 @@ static int __init record_keyid_partitioning(u32 *tdx_keyid_start, return 0; } +static bool is_tdx_memory(unsigned long start_pfn, unsigned long end_pfn) +{ + struct tdx_memblock *tmb; + + /* + * This check assumes that the start_pfn<->end_pfn range does not + * cross multiple @tdx_memlist entries. A single memory online + * event across multiple memblocks (from which @tdx_memlist + * entries are derived at the time of module initialization) is + * not possible. 
This is because memory offline/online is done + * on granularity of 'struct memory_block', and the hotpluggable + * memory region (one memblock) must be multiple of memory_block. + */ + list_for_each_entry(tmb, &tdx_memlist, list) { + if (start_pfn >= tmb->start_pfn && end_pfn <= tmb->end_pfn) + return true; + } + return false; +} + +static int tdx_memory_notifier(struct notifier_block *nb, unsigned long action, + void *v) +{ + struct memory_notify *mn = v; + + if (action != MEM_GOING_ONLINE) + return NOTIFY_OK; + + /* + * Empty list means TDX isn't enabled. Allow any memory + * to go online. + */ + if (list_empty(&tdx_memlist)) + return NOTIFY_OK; + + /* + * The TDX memory configuration is static and can not be + * changed. Reject onlining any memory which is outside of + * the static configuration whether it supports TDX or not. + */ + return is_tdx_memory(mn->start_pfn, mn->start_pfn + mn->nr_pages) ? + NOTIFY_OK : NOTIFY_BAD; +} + +static struct notifier_block tdx_memory_nb = { + .notifier_call = tdx_memory_notifier, +}; + static int __init tdx_init(void) { u32 tdx_keyid_start, nr_tdx_keyids; @@ -362,6 +514,13 @@ static int __init tdx_init(void) return -ENODEV; } + err = register_memory_notifier(&tdx_memory_nb); + if (err) { + pr_info("initialization failed: register_memory_notifier() failed (%d)\n", + err); + return -ENODEV; + } + /* * Just use the first TDX KeyID as the 'global KeyID' and * leave the rest for TDX guests. diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h index 8ab2d40971ea..37ee7c5dce1c 100644 --- a/arch/x86/virt/vmx/tdx/tdx.h +++ b/arch/x86/virt/vmx/tdx/tdx.h @@ -94,6 +94,12 @@ enum tdx_module_status_t { TDX_MODULE_ERROR }; +struct tdx_memblock { + struct list_head list; + unsigned long start_pfn; + unsigned long end_pfn; +}; + struct tdx_module_output; u64 __seamcall(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9, struct tdx_module_output *out);