| Message ID | 4e108968c3294189ad150f62df1f146168036342.1685887183.git.kai.huang@intel.com (mailing list archive) |
|---|---|
| State | New, archived |
| Series | TDX host kernel support |
On Mon, Jun 05, 2023 at 02:27:25AM +1200, Kai Huang wrote:
> diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
> index fa9fa8bc581a..5f0499ba5d67 100644
> --- a/arch/x86/virt/vmx/tdx/tdx.c
> +++ b/arch/x86/virt/vmx/tdx/tdx.c
> @@ -265,7 +265,7 @@ static int tdx_get_sysinfo(struct tdsysinfo_struct *sysinfo,
>   * overlap.
>   */
>  static int add_tdx_memblock(struct list_head *tmb_list, unsigned long start_pfn,
> -                            unsigned long end_pfn)
> +                            unsigned long end_pfn, int nid)
>  {
>          struct tdx_memblock *tmb;
> 
> @@ -276,6 +276,7 @@ static int add_tdx_memblock(struct list_head *tmb_list, unsigned long start_pfn,
>          INIT_LIST_HEAD(&tmb->list);
>          tmb->start_pfn = start_pfn;
>          tmb->end_pfn = end_pfn;
> +        tmb->nid = nid;
> 
>          /* @tmb_list is protected by mem_hotplug_lock */
>          list_add_tail(&tmb->list, tmb_list);
> @@ -303,9 +304,9 @@ static void free_tdx_memlist(struct list_head *tmb_list)
>  static int build_tdx_memlist(struct list_head *tmb_list)
>  {
>          unsigned long start_pfn, end_pfn;
> -        int i, ret;
> +        int i, nid, ret;
> 
> -        for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
> +        for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
>                  /*
>                   * The first 1MB is not reported as TDX convertible memory.
>                   * Although the first 1MB is always reserved and won't end up
> @@ -321,7 +322,7 @@ static int build_tdx_memlist(struct list_head *tmb_list)
>                   * memblock has already guaranteed they are in address
>                   * ascending order and don't overlap.
>                   */
> -                ret = add_tdx_memblock(tmb_list, start_pfn, end_pfn);
> +                ret = add_tdx_memblock(tmb_list, start_pfn, end_pfn, nid);
>                  if (ret)
>                          goto err;
>          }

These three hunks and the change to struct tdx_memblock look unrelated.
Why not fold them into 09/20?

> @@ -472,6 +473,202 @@ static int fill_out_tdmrs(struct list_head *tmb_list,
>          return 0;
>  }
> 
> +/*
> + * Calculate PAMT size given a TDMR and a page size.  The returned
> + * PAMT size is always aligned up to 4K page boundary.
> + */
> +static unsigned long tdmr_get_pamt_sz(struct tdmr_info *tdmr, int pgsz,
> +                                      u16 pamt_entry_size)
> +{
> +        unsigned long pamt_sz, nr_pamt_entries;
> +
> +        switch (pgsz) {
> +        case TDX_PS_4K:
> +                nr_pamt_entries = tdmr->size >> PAGE_SHIFT;
> +                break;
> +        case TDX_PS_2M:
> +                nr_pamt_entries = tdmr->size >> PMD_SHIFT;
> +                break;
> +        case TDX_PS_1G:
> +                nr_pamt_entries = tdmr->size >> PUD_SHIFT;
> +                break;
> +        default:
> +                WARN_ON_ONCE(1);
> +                return 0;
> +        }
> +
> +        pamt_sz = nr_pamt_entries * pamt_entry_size;
> +        /* TDX requires PAMT size must be 4K aligned */
> +        pamt_sz = ALIGN(pamt_sz, PAGE_SIZE);
> +
> +        return pamt_sz;
> +}
> +
> +/*
> + * Locate a NUMA node which should hold the allocation of the @tdmr
> + * PAMT.  This node will have some memory covered by the TDMR.  The
> + * relative amount of memory covered is not considered.
> + */
> +static int tdmr_get_nid(struct tdmr_info *tdmr, struct list_head *tmb_list)
> +{
> +        struct tdx_memblock *tmb;
> +
> +        /*
> +         * A TDMR must cover at least part of one TMB.  That TMB will end
> +         * after the TDMR begins.  But, that TMB may have started before
> +         * the TDMR.  Find the next 'tmb' that _ends_ after this TDMR
> +         * begins.  Ignore 'tmb' start addresses.  They are irrelevant.
> +         */
> +        list_for_each_entry(tmb, tmb_list, list) {
> +                if (tmb->end_pfn > PHYS_PFN(tdmr->base))
> +                        return tmb->nid;
> +        }
> +
> +        /*
> +         * Fall back to allocating the TDMR's metadata from node 0 when
> +         * no TDX memory block can be found.  This should never happen
> +         * since TDMRs originate from TDX memory blocks.
> +         */
> +        pr_warn("TDMR [0x%llx, 0x%llx): unable to find local NUMA node for PAMT allocation, fallback to use node 0.\n",
> +                tdmr->base, tdmr_end(tdmr));
> +        return 0;
> +}
> +
> +#define TDX_PS_NR        (TDX_PS_1G + 1)

This should be next to the rest of the TDX_PS_* definitions.

> +
> +/*
> + * Allocate PAMTs from the local NUMA node of some memory in @tmb_list
> + * within @tdmr, and set up PAMTs for @tdmr.
> + */
> +static int tdmr_set_up_pamt(struct tdmr_info *tdmr,
> +                            struct list_head *tmb_list,
> +                            u16 pamt_entry_size)
> +{
> +        unsigned long pamt_base[TDX_PS_NR];
> +        unsigned long pamt_size[TDX_PS_NR];
> +        unsigned long tdmr_pamt_base;
> +        unsigned long tdmr_pamt_size;
> +        struct page *pamt;
> +        int pgsz, nid;
> +
> +        nid = tdmr_get_nid(tdmr, tmb_list);
> +
> +        /*
> +         * Calculate the PAMT size for each TDX supported page size
> +         * and the total PAMT size.
> +         */
> +        tdmr_pamt_size = 0;
> +        for (pgsz = TDX_PS_4K; pgsz <= TDX_PS_1G ; pgsz++) {

"< TDX_PS_NR" instead of "<= TDX_PS_1G".

> +                pamt_size[pgsz] = tdmr_get_pamt_sz(tdmr, pgsz,
> +                                        pamt_entry_size);
> +                tdmr_pamt_size += pamt_size[pgsz];
> +        }
> +
> +        /*
> +         * Allocate one chunk of physically contiguous memory for all
> +         * PAMTs.  This helps minimize the PAMT's use of reserved areas
> +         * in overlapped TDMRs.
> +         */
> +        pamt = alloc_contig_pages(tdmr_pamt_size >> PAGE_SHIFT, GFP_KERNEL,
> +                        nid, &node_online_map);
> +        if (!pamt)
> +                return -ENOMEM;
> +
> +        /*
> +         * Break the contiguous allocation back up into the
> +         * individual PAMTs for each page size.
> +         */
> +        tdmr_pamt_base = page_to_pfn(pamt) << PAGE_SHIFT;
> +        for (pgsz = TDX_PS_4K; pgsz <= TDX_PS_1G; pgsz++) {
> +                pamt_base[pgsz] = tdmr_pamt_base;
> +                tdmr_pamt_base += pamt_size[pgsz];
> +        }
> +
> +        tdmr->pamt_4k_base = pamt_base[TDX_PS_4K];
> +        tdmr->pamt_4k_size = pamt_size[TDX_PS_4K];
> +        tdmr->pamt_2m_base = pamt_base[TDX_PS_2M];
> +        tdmr->pamt_2m_size = pamt_size[TDX_PS_2M];
> +        tdmr->pamt_1g_base = pamt_base[TDX_PS_1G];
> +        tdmr->pamt_1g_size = pamt_size[TDX_PS_1G];
> +
> +        return 0;
> +}
> +
> +static void tdmr_get_pamt(struct tdmr_info *tdmr, unsigned long *pamt_pfn,
> +                          unsigned long *pamt_npages)
> +{
> +        unsigned long pamt_base, pamt_sz;
> +
> +        /*
> +         * The PAMT was allocated in one contiguous unit.  The 4K PAMT
> +         * should always point to the beginning of that allocation.
> +         */
> +        pamt_base = tdmr->pamt_4k_base;
> +        pamt_sz = tdmr->pamt_4k_size + tdmr->pamt_2m_size + tdmr->pamt_1g_size;
> +
> +        *pamt_pfn = PHYS_PFN(pamt_base);
> +        *pamt_npages = pamt_sz >> PAGE_SHIFT;
> +}
> +
> +static void tdmr_free_pamt(struct tdmr_info *tdmr)
> +{
> +        unsigned long pamt_pfn, pamt_npages;
> +
> +        tdmr_get_pamt(tdmr, &pamt_pfn, &pamt_npages);
> +
> +        /* Do nothing if PAMT hasn't been allocated for this TDMR */
> +        if (!pamt_npages)
> +                return;
> +
> +        if (WARN_ON_ONCE(!pamt_pfn))
> +                return;
> +
> +        free_contig_range(pamt_pfn, pamt_npages);
> +}
> +
> +static void tdmrs_free_pamt_all(struct tdmr_info_list *tdmr_list)
> +{
> +        int i;
> +
> +        for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
> +                tdmr_free_pamt(tdmr_entry(tdmr_list, i));
> +}
> +
> +/* Allocate and set up PAMTs for all TDMRs */
> +static int tdmrs_set_up_pamt_all(struct tdmr_info_list *tdmr_list,
> +                                 struct list_head *tmb_list,
> +                                 u16 pamt_entry_size)
> +{
> +        int i, ret = 0;
> +
> +        for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
> +                ret = tdmr_set_up_pamt(tdmr_entry(tdmr_list, i), tmb_list,
> +                                pamt_entry_size);
> +                if (ret)
> +                        goto err;
> +        }
> +
> +        return 0;
> +err:
> +        tdmrs_free_pamt_all(tdmr_list);
> +        return ret;
> +}
> +
> +static unsigned long tdmrs_count_pamt_pages(struct tdmr_info_list *tdmr_list)
> +{
> +        unsigned long pamt_npages = 0;
> +        int i;
> +
> +        for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
> +                unsigned long pfn, npages;
> +
> +                tdmr_get_pamt(tdmr_entry(tdmr_list, i), &pfn, &npages);
> +                pamt_npages += npages;
> +        }
> +
> +        return pamt_npages;
> +}
> +
>  /*
>   * Construct a list of TDMRs on the preallocated space in @tdmr_list
>   * to cover all TDX memory regions in @tmb_list based on the TDX module
> @@ -487,10 +684,13 @@ static int construct_tdmrs(struct list_head *tmb_list,
>          if (ret)
>                  return ret;
> 
> +        ret = tdmrs_set_up_pamt_all(tdmr_list, tmb_list,
> +                        sysinfo->pamt_entry_size);
> +        if (ret)
> +                return ret;
>          /*
>           * TODO:
>           *
> -         *  - Allocate and set up PAMTs for each TDMR.
>           *  - Designate reserved areas for each TDMR.
>           *
>           * Return -EINVAL until constructing TDMRs is done
> @@ -547,6 +747,11 @@ static int init_tdx_module(void)
>           * Return error before all steps are done.
>           */
>          ret = -EINVAL;
> +        if (ret)
> +                tdmrs_free_pamt_all(&tdx_tdmr_list);
> +        else
> +                pr_info("%lu KBs allocated for PAMT.\n",
> +                        tdmrs_count_pamt_pages(&tdx_tdmr_list) * 4);

"* 4"? This is very cryptic. procfs uses "<< (PAGE_SHIFT - 10)", which is
slightly less magic to me. And just make the helper return kilobytes to
begin with, if this is its only caller.

>  out_free_tdmrs:
>          if (ret)
>                  free_tdmr_list(&tdx_tdmr_list);
> diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h
> index c20848e76469..e8110e1a9980 100644
> --- a/arch/x86/virt/vmx/tdx/tdx.h
> +++ b/arch/x86/virt/vmx/tdx/tdx.h
> @@ -133,6 +133,7 @@ struct tdx_memblock {
>          struct list_head list;
>          unsigned long start_pfn;
>          unsigned long end_pfn;
> +        int nid;
>  };
> 
>  struct tdmr_info_list {
> --
> 2.40.1
>
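To make the two small review points above concrete, here is a sketch of what Kirill is asking for. The 0/1/2 values for TDX_PS_4K/2M/1G match the page-size encoding the series uses elsewhere, but treat the exact values and header placement as illustrative assumptions:

```c
/* tdx.h: keep TDX_PS_NR next to the other TDX_PS_* definitions */
#define TDX_PS_4K	0	/* encoding assumed from the TDX ABI */
#define TDX_PS_2M	1
#define TDX_PS_1G	2
#define TDX_PS_NR	(TDX_PS_1G + 1)

/* tdmr_set_up_pamt(): bound the loop by the count, not the last value */
for (pgsz = TDX_PS_4K; pgsz < TDX_PS_NR; pgsz++) {
	pamt_size[pgsz] = tdmr_get_pamt_sz(tdmr, pgsz, pamt_entry_size);
	tdmr_pamt_size += pamt_size[pgsz];
}
```

Iterating up to TDX_PS_NR also keeps the loop bound in sync with the pamt_base[]/pamt_size[] array dimensions, so adding a new page size later cannot silently leave an array slot uninitialized.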
On 6/8/23 16:24, kirill.shutemov@linux.intel.com wrote:
>>          ret = -EINVAL;
>> +        if (ret)
>> +                tdmrs_free_pamt_all(&tdx_tdmr_list);
>> +        else
>> +                pr_info("%lu KBs allocated for PAMT.\n",
>> +                        tdmrs_count_pamt_pages(&tdx_tdmr_list) * 4);
> "* 4"? This is very cryptic. procfs uses "<< (PAGE_SHIFT - 10)", which is
> slightly less magic to me. And just make the helper return kilobytes to
> begin with, if this is its only caller.

Let's look at where this data comes from:

+static unsigned long tdmrs_count_pamt_pages(struct tdmr_info_list *tdmr_list)
+{
+        unsigned long pamt_npages = 0;
+        int i;
+
+        for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
+                unsigned long pfn, npages;
+
+                tdmr_get_pamt(tdmr_entry(tdmr_list, i), &pfn, &npages);
+                pamt_npages += npages;
+        }

OK, so tdmr_get_pamt() is getting it in pages.  How is it *stored*?

+static void tdmr_get_pamt(struct tdmr_info *tdmr, unsigned long *pamt_pfn,
+                          unsigned long *pamt_npages)
+{
...
+        pamt_sz = tdmr->pamt_4k_size + tdmr->pamt_2m_size + tdmr->pamt_1g_size;
+
+        *pamt_pfn = PHYS_PFN(pamt_base);
+        *pamt_npages = pamt_sz >> PAGE_SHIFT;
+}

Oh, it's actually stored in bytes.  So to print it out you actually
convert it from bytes->pages->kbytes.  Not the best.

If tdmr_get_pamt() just returned 'pamt_size_bytes', you could do one
conversion at:

	free_contig_range(pamt_pfn, pamt_size_bytes >> PAGE_SHIFT);

and since tdmrs_count_pamt_pages() has only one caller you can just make
it: tdmrs_count_pamt_kb().  The print becomes:

	pr_info("%lu KBs allocated for PAMT.\n",
		tdmrs_count_pamt_kb(&tdx_tdmr_list));

and tdmrs_count_pamt_kb() does something super fancy like:

	return pamt_size_bytes / 1024;

which makes total, complete, obvious sense and needs zero explanation.
On Thu, 2023-06-08 at 16:43 -0700, Dave Hansen wrote:
> On 6/8/23 16:24, kirill.shutemov@linux.intel.com wrote:
> > >          ret = -EINVAL;
> > > +        if (ret)
> > > +                tdmrs_free_pamt_all(&tdx_tdmr_list);
> > > +        else
> > > +                pr_info("%lu KBs allocated for PAMT.\n",
> > > +                        tdmrs_count_pamt_pages(&tdx_tdmr_list) * 4);
> > "* 4"? This is very cryptic. procfs uses "<< (PAGE_SHIFT - 10)", which is
> > slightly less magic to me. And just make the helper return kilobytes to
> > begin with, if this is its only caller.
>
> Let's look at where this data comes from:
>
> +static unsigned long tdmrs_count_pamt_pages(struct tdmr_info_list *tdmr_list)
> +{
> +        unsigned long pamt_npages = 0;
> +        int i;
> +
> +        for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
> +                unsigned long pfn, npages;
> +
> +                tdmr_get_pamt(tdmr_entry(tdmr_list, i), &pfn, &npages);
> +                pamt_npages += npages;
> +        }
>
> OK, so tdmr_get_pamt() is getting it in pages.  How is it *stored*?
>
> +static void tdmr_get_pamt(struct tdmr_info *tdmr, unsigned long *pamt_pfn,
> +                          unsigned long *pamt_npages)
> +{
> ...
> +        pamt_sz = tdmr->pamt_4k_size + tdmr->pamt_2m_size + tdmr->pamt_1g_size;
> +
> +        *pamt_pfn = PHYS_PFN(pamt_base);
> +        *pamt_npages = pamt_sz >> PAGE_SHIFT;
> +}
>
> Oh, it's actually stored in bytes.  So to print it out you actually
> convert it from bytes->pages->kbytes.  Not the best.
>
> If tdmr_get_pamt() just returned 'pamt_size_bytes', you could do one
> conversion at:
>
> 	free_contig_range(pamt_pfn, pamt_size_bytes >> PAGE_SHIFT);

I thought making tdmr_get_pamt() return pamt_pfn and pamt_npages would
make it clearer that PAMTs must be in 4K granularity, but I guess it
doesn't matter much.

If we return bytes for the PAMT size, I think we should also return the
physical address instead of the PFN for the PAMT start.  I'll change
tdmr_get_pamt() to return the physical address and the size in bytes for
the PAMT location and size, respectively.  Please let me know if you have
any comments.

>
> and since tdmrs_count_pamt_pages() has only one caller you can just make
> it: tdmrs_count_pamt_kb().  The print becomes:
>
> 	pr_info("%lu KBs allocated for PAMT.\n",
> 		tdmrs_count_pamt_kb(&tdx_tdmr_list));
>
> and tdmrs_count_pamt_kb() does something super fancy like:
>
> 	return pamt_size_bytes / 1024;
>
> which makes total, complete, obvious sense and needs zero explanation.

Will do.  Thanks for the feedback.
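A minimal sketch of the refactor Kai describes, assuming tdmr_get_pamt() switches to returning a physical address and a size in bytes (the function bodies here are illustrative, not the final patch):

```c
static void tdmr_get_pamt(struct tdmr_info *tdmr, unsigned long *pamt_base,
			  unsigned long *pamt_size)
{
	/* The 4K PAMT sits at the start of the one contiguous allocation */
	*pamt_base = tdmr->pamt_4k_base;
	*pamt_size = tdmr->pamt_4k_size + tdmr->pamt_2m_size +
		     tdmr->pamt_1g_size;
}

static void tdmr_free_pamt(struct tdmr_info *tdmr)
{
	unsigned long pamt_base, pamt_size;

	tdmr_get_pamt(tdmr, &pamt_base, &pamt_size);

	/* Do nothing if PAMT hasn't been allocated for this TDMR */
	if (!pamt_size)
		return;

	if (WARN_ON_ONCE(!pamt_base))
		return;

	/* The single bytes->pages conversion happens at the point of use */
	free_contig_range(pamt_base >> PAGE_SHIFT, pamt_size >> PAGE_SHIFT);
}

static unsigned long tdmrs_count_pamt_kb(struct tdmr_info_list *tdmr_list)
{
	unsigned long pamt_size = 0;
	int i;

	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
		unsigned long base, size;

		tdmr_get_pamt(tdmr_entry(tdmr_list, i), &base, &size);
		pamt_size += size;
	}

	return pamt_size / 1024;
}
```

With this shape, the pr_info() prints tdmrs_count_pamt_kb(&tdx_tdmr_list) directly, with no magic multiplier anywhere.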
On 4.06.23 17:27, Kai Huang wrote:

<snip>

>  /*
>   * Construct a list of TDMRs on the preallocated space in @tdmr_list
>   * to cover all TDX memory regions in @tmb_list based on the TDX module
> @@ -487,10 +684,13 @@ static int construct_tdmrs(struct list_head *tmb_list,
>          if (ret)
>                  return ret;
> 
> +        ret = tdmrs_set_up_pamt_all(tdmr_list, tmb_list,
> +                        sysinfo->pamt_entry_size);
> +        if (ret)
> +                return ret;
>          /*
>           * TODO:
>           *
> -         *  - Allocate and set up PAMTs for each TDMR.
>           *  - Designate reserved areas for each TDMR.
>           *
>           * Return -EINVAL until constructing TDMRs is done
> @@ -547,6 +747,11 @@ static int init_tdx_module(void)
>           * Return error before all steps are done.
>           */
>          ret = -EINVAL;
> +        if (ret)
> +                tdmrs_free_pamt_all(&tdx_tdmr_list);
> +        else
> +                pr_info("%lu KBs allocated for PAMT.\n",
> +                        tdmrs_count_pamt_pages(&tdx_tdmr_list) * 4);

Why not put the pr_info() right after the 'if (ret)' check following
tdmrs_set_up_pamt_all(), and make the tdmrs_free_pamt_all() call in the
error path unconditional?

It seems the main reason for having a bunch of conditionals in the exit
path is that you share the put_online_mems() between the success and
failure cases. If you simply add:

	put_online_mems();
	return 0;

	// failure labels follow

then you can do without the 'if (ret)' checks and have straight-line
code doing the error handling. A sketch of this follows below.

>  out_free_tdmrs:
>          if (ret)
>                  free_tdmr_list(&tdx_tdmr_list);
> diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h
> index c20848e76469..e8110e1a9980 100644
> --- a/arch/x86/virt/vmx/tdx/tdx.h
> +++ b/arch/x86/virt/vmx/tdx/tdx.h
> @@ -133,6 +133,7 @@ struct tdx_memblock {
>          struct list_head list;
>          unsigned long start_pfn;
>          unsigned long end_pfn;
> +        int nid;
>  };
> 
>  struct tdmr_info_list {
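A sketch of the init_tdx_module() exit-path shape Nikolay is suggesting. The err_free_pamts label is hypothetical, and tdmrs_count_pamt_kb() assumes the rename agreed on in the earlier subthread; later initialization steps would jump to err_free_pamts on failure:

```c
	ret = construct_tdmrs(&tdx_memlist, &tdx_tdmr_list, sysinfo);
	if (ret)
		goto out_free_tdmrs;

	pr_info("%lu KBs allocated for PAMT.\n",
		tdmrs_count_pamt_kb(&tdx_tdmr_list));

	/* ... remaining init steps, jumping to err_free_pamts on error ... */

	put_online_mems();
	return 0;

err_free_pamts:
	tdmrs_free_pamt_all(&tdx_tdmr_list);
out_free_tdmrs:
	free_tdmr_list(&tdx_tdmr_list);
	/* ... earlier cleanup labels ... */
	put_online_mems();
	return ret;
```

The success path falls straight through and returns, so every cleanup call after the labels runs unconditionally, with no 'if (ret)' tests in the teardown code.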
On Fri, 2023-06-09 at 02:24 +0300, kirill.shutemov@linux.intel.com wrote:
> On Mon, Jun 05, 2023 at 02:27:25AM +1200, Kai Huang wrote:
> > diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
> > index fa9fa8bc581a..5f0499ba5d67 100644
> > --- a/arch/x86/virt/vmx/tdx/tdx.c
> > +++ b/arch/x86/virt/vmx/tdx/tdx.c
> > @@ -265,7 +265,7 @@ static int tdx_get_sysinfo(struct tdsysinfo_struct *sysinfo,
> >   * overlap.
> >   */
> >  static int add_tdx_memblock(struct list_head *tmb_list, unsigned long start_pfn,
> > -                            unsigned long end_pfn)
> > +                            unsigned long end_pfn, int nid)
> >  {
> >          struct tdx_memblock *tmb;
> > 
> > @@ -276,6 +276,7 @@ static int add_tdx_memblock(struct list_head *tmb_list, unsigned long start_pfn,
> >          INIT_LIST_HEAD(&tmb->list);
> >          tmb->start_pfn = start_pfn;
> >          tmb->end_pfn = end_pfn;
> > +        tmb->nid = nid;
> > 
> >          /* @tmb_list is protected by mem_hotplug_lock */
> >          list_add_tail(&tmb->list, tmb_list);
> > @@ -303,9 +304,9 @@ static void free_tdx_memlist(struct list_head *tmb_list)
> >  static int build_tdx_memlist(struct list_head *tmb_list)
> >  {
> >          unsigned long start_pfn, end_pfn;
> > -        int i, ret;
> > +        int i, nid, ret;
> > 
> > -        for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
> > +        for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
> >                  /*
> >                   * The first 1MB is not reported as TDX convertible memory.
> >                   * Although the first 1MB is always reserved and won't end up
> > @@ -321,7 +322,7 @@ static int build_tdx_memlist(struct list_head *tmb_list)
> >                   * memblock has already guaranteed they are in address
> >                   * ascending order and don't overlap.
> >                   */
> > -                ret = add_tdx_memblock(tmb_list, start_pfn, end_pfn);
> > +                ret = add_tdx_memblock(tmb_list, start_pfn, end_pfn, nid);
> >                  if (ret)
> >                          goto err;
> >          }
> 
> These three hunks and the change to struct tdx_memblock look unrelated.
> Why not fold them into 09/20?

Sorry, I missed replying to this.

The @nid is used to try to allocate the PAMT from the local node.  It
only gets used in this patch.  Originally (in v7) I had it in patch 09,
but Dave suggested moving it to this patch (see the first comment at the
link below):

https://lore.kernel.org/lkml/8e6803f5-bec6-843d-f3c4-75006ffd0d2f@intel.com/

> > +
> > +#define TDX_PS_NR        (TDX_PS_1G + 1)
> 
> This should be next to the rest of the TDX_PS_* definitions.

Done.
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 2226d8a4c749..ad364f01de33 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1959,6 +1959,7 @@ config INTEL_TDX_HOST
 	depends on KVM_INTEL
 	depends on X86_X2APIC
 	select ARCH_KEEP_MEMBLOCK
+	depends on CONTIG_ALLOC
 	help
 	  Intel Trust Domain Extensions (TDX) protects guest VMs from malicious
 	  host and certain physical attacks. This option enables necessary TDX
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index fa9fa8bc581a..5f0499ba5d67 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -265,7 +265,7 @@ static int tdx_get_sysinfo(struct tdsysinfo_struct *sysinfo,
  * overlap.
  */
 static int add_tdx_memblock(struct list_head *tmb_list, unsigned long start_pfn,
-                            unsigned long end_pfn)
+                            unsigned long end_pfn, int nid)
 {
         struct tdx_memblock *tmb;

@@ -276,6 +276,7 @@ static int add_tdx_memblock(struct list_head *tmb_list, unsigned long start_pfn,
         INIT_LIST_HEAD(&tmb->list);
         tmb->start_pfn = start_pfn;
         tmb->end_pfn = end_pfn;
+        tmb->nid = nid;

         /* @tmb_list is protected by mem_hotplug_lock */
         list_add_tail(&tmb->list, tmb_list);
@@ -303,9 +304,9 @@ static void free_tdx_memlist(struct list_head *tmb_list)
 static int build_tdx_memlist(struct list_head *tmb_list)
 {
         unsigned long start_pfn, end_pfn;
-        int i, ret;
+        int i, nid, ret;

-        for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
+        for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
                 /*
                  * The first 1MB is not reported as TDX convertible memory.
                  * Although the first 1MB is always reserved and won't end up
@@ -321,7 +322,7 @@ static int build_tdx_memlist(struct list_head *tmb_list)
                  * memblock has already guaranteed they are in address
                  * ascending order and don't overlap.
                  */
-                ret = add_tdx_memblock(tmb_list, start_pfn, end_pfn);
+                ret = add_tdx_memblock(tmb_list, start_pfn, end_pfn, nid);
                 if (ret)
                         goto err;
         }
@@ -472,6 +473,202 @@ static int fill_out_tdmrs(struct list_head *tmb_list,
         return 0;
 }

+/*
+ * Calculate PAMT size given a TDMR and a page size.  The returned
+ * PAMT size is always aligned up to 4K page boundary.
+ */
+static unsigned long tdmr_get_pamt_sz(struct tdmr_info *tdmr, int pgsz,
+                                      u16 pamt_entry_size)
+{
+        unsigned long pamt_sz, nr_pamt_entries;
+
+        switch (pgsz) {
+        case TDX_PS_4K:
+                nr_pamt_entries = tdmr->size >> PAGE_SHIFT;
+                break;
+        case TDX_PS_2M:
+                nr_pamt_entries = tdmr->size >> PMD_SHIFT;
+                break;
+        case TDX_PS_1G:
+                nr_pamt_entries = tdmr->size >> PUD_SHIFT;
+                break;
+        default:
+                WARN_ON_ONCE(1);
+                return 0;
+        }
+
+        pamt_sz = nr_pamt_entries * pamt_entry_size;
+        /* TDX requires PAMT size must be 4K aligned */
+        pamt_sz = ALIGN(pamt_sz, PAGE_SIZE);
+
+        return pamt_sz;
+}
+
+/*
+ * Locate a NUMA node which should hold the allocation of the @tdmr
+ * PAMT.  This node will have some memory covered by the TDMR.  The
+ * relative amount of memory covered is not considered.
+ */
+static int tdmr_get_nid(struct tdmr_info *tdmr, struct list_head *tmb_list)
+{
+        struct tdx_memblock *tmb;
+
+        /*
+         * A TDMR must cover at least part of one TMB.  That TMB will end
+         * after the TDMR begins.  But, that TMB may have started before
+         * the TDMR.  Find the next 'tmb' that _ends_ after this TDMR
+         * begins.  Ignore 'tmb' start addresses.  They are irrelevant.
+         */
+        list_for_each_entry(tmb, tmb_list, list) {
+                if (tmb->end_pfn > PHYS_PFN(tdmr->base))
+                        return tmb->nid;
+        }
+
+        /*
+         * Fall back to allocating the TDMR's metadata from node 0 when
+         * no TDX memory block can be found.  This should never happen
+         * since TDMRs originate from TDX memory blocks.
+         */
+        pr_warn("TDMR [0x%llx, 0x%llx): unable to find local NUMA node for PAMT allocation, fallback to use node 0.\n",
+                tdmr->base, tdmr_end(tdmr));
+        return 0;
+}
+
+#define TDX_PS_NR        (TDX_PS_1G + 1)
+
+/*
+ * Allocate PAMTs from the local NUMA node of some memory in @tmb_list
+ * within @tdmr, and set up PAMTs for @tdmr.
+ */
+static int tdmr_set_up_pamt(struct tdmr_info *tdmr,
+                            struct list_head *tmb_list,
+                            u16 pamt_entry_size)
+{
+        unsigned long pamt_base[TDX_PS_NR];
+        unsigned long pamt_size[TDX_PS_NR];
+        unsigned long tdmr_pamt_base;
+        unsigned long tdmr_pamt_size;
+        struct page *pamt;
+        int pgsz, nid;
+
+        nid = tdmr_get_nid(tdmr, tmb_list);
+
+        /*
+         * Calculate the PAMT size for each TDX supported page size
+         * and the total PAMT size.
+         */
+        tdmr_pamt_size = 0;
+        for (pgsz = TDX_PS_4K; pgsz <= TDX_PS_1G ; pgsz++) {
+                pamt_size[pgsz] = tdmr_get_pamt_sz(tdmr, pgsz,
+                                        pamt_entry_size);
+                tdmr_pamt_size += pamt_size[pgsz];
+        }
+
+        /*
+         * Allocate one chunk of physically contiguous memory for all
+         * PAMTs.  This helps minimize the PAMT's use of reserved areas
+         * in overlapped TDMRs.
+         */
+        pamt = alloc_contig_pages(tdmr_pamt_size >> PAGE_SHIFT, GFP_KERNEL,
+                        nid, &node_online_map);
+        if (!pamt)
+                return -ENOMEM;
+
+        /*
+         * Break the contiguous allocation back up into the
+         * individual PAMTs for each page size.
+         */
+        tdmr_pamt_base = page_to_pfn(pamt) << PAGE_SHIFT;
+        for (pgsz = TDX_PS_4K; pgsz <= TDX_PS_1G; pgsz++) {
+                pamt_base[pgsz] = tdmr_pamt_base;
+                tdmr_pamt_base += pamt_size[pgsz];
+        }
+
+        tdmr->pamt_4k_base = pamt_base[TDX_PS_4K];
+        tdmr->pamt_4k_size = pamt_size[TDX_PS_4K];
+        tdmr->pamt_2m_base = pamt_base[TDX_PS_2M];
+        tdmr->pamt_2m_size = pamt_size[TDX_PS_2M];
+        tdmr->pamt_1g_base = pamt_base[TDX_PS_1G];
+        tdmr->pamt_1g_size = pamt_size[TDX_PS_1G];
+
+        return 0;
+}
+
+static void tdmr_get_pamt(struct tdmr_info *tdmr, unsigned long *pamt_pfn,
+                          unsigned long *pamt_npages)
+{
+        unsigned long pamt_base, pamt_sz;
+
+        /*
+         * The PAMT was allocated in one contiguous unit.  The 4K PAMT
+         * should always point to the beginning of that allocation.
+         */
+        pamt_base = tdmr->pamt_4k_base;
+        pamt_sz = tdmr->pamt_4k_size + tdmr->pamt_2m_size + tdmr->pamt_1g_size;
+
+        *pamt_pfn = PHYS_PFN(pamt_base);
+        *pamt_npages = pamt_sz >> PAGE_SHIFT;
+}
+
+static void tdmr_free_pamt(struct tdmr_info *tdmr)
+{
+        unsigned long pamt_pfn, pamt_npages;
+
+        tdmr_get_pamt(tdmr, &pamt_pfn, &pamt_npages);
+
+        /* Do nothing if PAMT hasn't been allocated for this TDMR */
+        if (!pamt_npages)
+                return;
+
+        if (WARN_ON_ONCE(!pamt_pfn))
+                return;
+
+        free_contig_range(pamt_pfn, pamt_npages);
+}
+
+static void tdmrs_free_pamt_all(struct tdmr_info_list *tdmr_list)
+{
+        int i;
+
+        for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
+                tdmr_free_pamt(tdmr_entry(tdmr_list, i));
+}
+
+/* Allocate and set up PAMTs for all TDMRs */
+static int tdmrs_set_up_pamt_all(struct tdmr_info_list *tdmr_list,
+                                 struct list_head *tmb_list,
+                                 u16 pamt_entry_size)
+{
+        int i, ret = 0;
+
+        for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
+                ret = tdmr_set_up_pamt(tdmr_entry(tdmr_list, i), tmb_list,
+                                pamt_entry_size);
+                if (ret)
+                        goto err;
+        }
+
+        return 0;
+err:
+        tdmrs_free_pamt_all(tdmr_list);
+        return ret;
+}
+
+static unsigned long tdmrs_count_pamt_pages(struct tdmr_info_list *tdmr_list)
+{
+        unsigned long pamt_npages = 0;
+        int i;
+
+        for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
+                unsigned long pfn, npages;
+
+                tdmr_get_pamt(tdmr_entry(tdmr_list, i), &pfn, &npages);
+                pamt_npages += npages;
+        }
+
+        return pamt_npages;
+}
+
 /*
  * Construct a list of TDMRs on the preallocated space in @tdmr_list
  * to cover all TDX memory regions in @tmb_list based on the TDX module
@@ -487,10 +684,13 @@ static int construct_tdmrs(struct list_head *tmb_list,
         if (ret)
                 return ret;

+        ret = tdmrs_set_up_pamt_all(tdmr_list, tmb_list,
+                        sysinfo->pamt_entry_size);
+        if (ret)
+                return ret;
         /*
          * TODO:
          *
-         *  - Allocate and set up PAMTs for each TDMR.
          *  - Designate reserved areas for each TDMR.
          *
          * Return -EINVAL until constructing TDMRs is done
@@ -547,6 +747,11 @@ static int init_tdx_module(void)
          * Return error before all steps are done.
          */
         ret = -EINVAL;
+        if (ret)
+                tdmrs_free_pamt_all(&tdx_tdmr_list);
+        else
+                pr_info("%lu KBs allocated for PAMT.\n",
+                        tdmrs_count_pamt_pages(&tdx_tdmr_list) * 4);
 out_free_tdmrs:
         if (ret)
                 free_tdmr_list(&tdx_tdmr_list);
diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h
index c20848e76469..e8110e1a9980 100644
--- a/arch/x86/virt/vmx/tdx/tdx.h
+++ b/arch/x86/virt/vmx/tdx/tdx.h
@@ -133,6 +133,7 @@ struct tdx_memblock {
         struct list_head list;
         unsigned long start_pfn;
         unsigned long end_pfn;
+        int nid;
 };

 struct tdmr_info_list {
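For a rough sense of the scale the pr_info() reports: the PAMT overhead is dominated by the 4K page size, at one pamt_entry_size per 4K page of TDMR-covered memory. The program below mirrors tdmr_get_pamt_sz()'s arithmetic for a single hypothetical TDMR; the 16-byte entry size is an assumption for illustration (the real value comes from the TDX module via sysinfo->pamt_entry_size), and the kernel additionally 4K-aligns each per-page-size PAMT:

```c
#include <stdio.h>

/* Sketch of tdmr_get_pamt_sz()'s math for one hypothetical 64 GiB TDMR,
 * assuming a 16-byte PAMT entry (the TDX module reports the real size).
 */
int main(void)
{
	unsigned long long tdmr_size = 64ULL << 30;	/* 64 GiB */
	unsigned long long entry_size = 16;		/* bytes, assumed */
	unsigned long long pamt_4k = (tdmr_size >> 12) * entry_size;
	unsigned long long pamt_2m = (tdmr_size >> 21) * entry_size;
	unsigned long long pamt_1g = (tdmr_size >> 30) * entry_size;
	unsigned long long total = pamt_4k + pamt_2m + pamt_1g;

	printf("PAMT for 64 GiB TDMR: %llu KB (~%.2f%% overhead)\n",
	       total / 1024, 100.0 * total / tdmr_size);
	return 0;
}
```

Under these assumptions the total comes to roughly 256 MiB per 64 GiB TDMR, about 0.4% of covered memory, which is why the series bothers to allocate PAMTs from the TDMR's local NUMA node rather than letting them land anywhere.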