diff mbox series

[v11,12/20] x86/virt/tdx: Allocate and set up PAMTs for TDMRs

Message ID 4e108968c3294189ad150f62df1f146168036342.1685887183.git.kai.huang@intel.com (mailing list archive)
State New, archived
Series TDX host kernel support

Commit Message

Huang, Kai June 4, 2023, 2:27 p.m. UTC
The TDX module uses additional metadata to record things like which
guest "owns" a given page of memory.  This metadata, referred as
Physical Address Metadata Table (PAMT), essentially serves as the
'struct page' for the TDX module.  PAMTs are not reserved by hardware
up front.  They must be allocated by the kernel and then given to the
TDX module during module initialization.

TDX supports 3 page sizes: 4K, 2M, and 1G.  Each "TD Memory Region"
(TDMR) has 3 PAMTs to track the 3 supported page sizes.  Each PAMT must
be a physically contiguous area from a Convertible Memory Region (CMR).
However, the PAMTs which track pages in one TDMR do not need to reside
within that TDMR but can be anywhere in CMRs.  If one PAMT overlaps with
any TDMR, the overlapping part must be reported as a reserved area in
that particular TDMR.

Use alloc_contig_pages() since the PAMT must be a physically contiguous
area and it may be potentially large (~1/256th of the size of the given
TDMR).  The downside is that alloc_contig_pages() may fail at runtime.
One (bad) mitigation is to launch a TDX guest early during system boot
so that those PAMTs get allocated early, but the only real fix is to add
a boot option to allocate or reserve PAMTs during kernel boot.
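
To see where the ~1/256th figure comes from: with a 16-byte PAMT entry
for 4K pages (an entry size consistent with the 1/256 ratio; the real
value comes from the TDX module's 'pamt_entry_size'), a 64GB TDMR needs:

	64GB / 4KB   = 16M 4K pages -> 16M PAMT entries
	16M * 16B    = 256MB of 4K PAMT
	256MB / 64GB = 1/256 of the TDMR

The 2M and 1G PAMTs add comparatively little on top of that.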

It is imperfect but will be improved on later.

TDX only supports a limited number of reserved areas per TDMR to cover
both PAMTs and memory holes within the given TDMR.  If many PAMTs are
allocated within a single TDMR, the reserved areas may not be sufficient
to cover all of them.

Adopt the following policies when allocating PAMTs for a given TDMR:

  - Allocate the three PAMTs of the TDMR in one contiguous chunk to
    minimize the total number of reserved areas consumed for PAMTs (see
    the layout sketch below).
  - Try to allocate the PAMT from the local node of the TDMR first, for
    better NUMA locality.
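
A rough picture of the per-TDMR allocation (sizes illustrative, not to
scale):

	 pamt_4k_base           pamt_2m_base   pamt_1g_base
	 v                      v              v
	+-----------------------+--------------+--------------+
	|        4K PAMT        |    2M PAMT   |    1G PAMT   |
	+-----------------------+--------------+--------------+
	`----------- one alloc_contig_pages() chunk ----------'

At most one reserved area is consumed per overlapping TDMR for the whole
chunk, instead of up to three for separate allocations.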

Also dump out how many pages are allocated for PAMTs when the TDX module
is initialized successfully.  This helps answer the eternal "where did
all my memory go?" questions.

Signed-off-by: Kai Huang <kai.huang@intel.com>
Reviewed-by: Isaku Yamahata <isaku.yamahata@intel.com>
Reviewed-by: Dave Hansen <dave.hansen@linux.intel.com>
---

v10 -> v11:
 - No update

v9 -> v10:
 - Removed code change in disable_tdx_module() as it doesn't exist
   anymore.

v8 -> v9:
 - Added TDX_PS_NR macro instead of open-coding (Dave).
 - Better alignment of 'pamt_entry_size' in tdmr_set_up_pamt() (Dave).
 - Changed to print out PAMTs in "KBs" instead of "pages" (Dave).
 - Added Dave's Reviewed-by.

v7 -> v8: (Dave)
 - Changelog:
  - Added a sentence to state PAMT allocation will be improved.
  - Others suggested by Dave.
 - Moved 'nid' of 'struct tdx_memblock' to this patch.
 - Improved comments around tdmr_get_nid().
 - WARN_ON_ONCE() -> pr_warn() in tdmr_get_nid().
 - Other changes due to 'struct tdmr_info_list'.

v6 -> v7:
 - Changes due to using macros instead of 'enum' for TDX supported page
   sizes.

v5 -> v6:
 - Rebase due to using 'tdx_memblock' instead of memblock.
 - 'int pamt_entry_nr' -> 'unsigned long nr_pamt_entries' (Dave/Sagis).
 - Improved comment around tdmr_get_nid() (Dave).
 - Improved comment in tdmr_set_up_pamt() around breaking the PAMT
   into PAMTs for 4K/2M/1G (Dave).
 - tdmrs_get_pamt_pages() -> tdmrs_count_pamt_pages() (Dave).   

v3 -> v5 (no feedback on v4):
 - Used memblock to get the NUMA node for given TDMR.
 - Removed tdmr_get_pamt_sz() helper but use open-code instead.
 - Changed to use 'switch .. case..' for each TDX supported page size in
   tdmr_get_pamt_sz() (the original __tdmr_get_pamt_sz()).
 - Added printing out memory used for PAMT allocation when TDX module is
   initialized successfully.
 - Explained downside of alloc_contig_pages() in changelog.
 - Addressed other minor comments.


---
 arch/x86/Kconfig            |   1 +
 arch/x86/virt/vmx/tdx/tdx.c | 215 +++++++++++++++++++++++++++++++++++-
 arch/x86/virt/vmx/tdx/tdx.h |   1 +
 3 files changed, 212 insertions(+), 5 deletions(-)

Comments

Kirill A. Shutemov June 8, 2023, 11:24 p.m. UTC | #1
On Mon, Jun 05, 2023 at 02:27:25AM +1200, Kai Huang wrote:
> diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
> index fa9fa8bc581a..5f0499ba5d67 100644
> --- a/arch/x86/virt/vmx/tdx/tdx.c
> +++ b/arch/x86/virt/vmx/tdx/tdx.c
> @@ -265,7 +265,7 @@ static int tdx_get_sysinfo(struct tdsysinfo_struct *sysinfo,
>   * overlap.
>   */
>  static int add_tdx_memblock(struct list_head *tmb_list, unsigned long start_pfn,
> -			    unsigned long end_pfn)
> +			    unsigned long end_pfn, int nid)
>  {
>  	struct tdx_memblock *tmb;
>  
> @@ -276,6 +276,7 @@ static int add_tdx_memblock(struct list_head *tmb_list, unsigned long start_pfn,
>  	INIT_LIST_HEAD(&tmb->list);
>  	tmb->start_pfn = start_pfn;
>  	tmb->end_pfn = end_pfn;
> +	tmb->nid = nid;
>  
>  	/* @tmb_list is protected by mem_hotplug_lock */
>  	list_add_tail(&tmb->list, tmb_list);
> @@ -303,9 +304,9 @@ static void free_tdx_memlist(struct list_head *tmb_list)
>  static int build_tdx_memlist(struct list_head *tmb_list)
>  {
>  	unsigned long start_pfn, end_pfn;
> -	int i, ret;
> +	int i, nid, ret;
>  
> -	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
> +	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
>  		/*
>  		 * The first 1MB is not reported as TDX convertible memory.
>  		 * Although the first 1MB is always reserved and won't end up
> @@ -321,7 +322,7 @@ static int build_tdx_memlist(struct list_head *tmb_list)
>  		 * memblock has already guaranteed they are in address
>  		 * ascending order and don't overlap.
>  		 */
> -		ret = add_tdx_memblock(tmb_list, start_pfn, end_pfn);
> +		ret = add_tdx_memblock(tmb_list, start_pfn, end_pfn, nid);
>  		if (ret)
>  			goto err;
>  	}


These three hunks and the change to struct tdx_memblock look unrelated.
Why not fold them into 09/20?

> @@ -472,6 +473,202 @@ static int fill_out_tdmrs(struct list_head *tmb_list,
>  	return 0;
>  }
>  
> +/*
> + * Calculate PAMT size given a TDMR and a page size.  The returned
> + * PAMT size is always aligned up to 4K page boundary.
> + */
> +static unsigned long tdmr_get_pamt_sz(struct tdmr_info *tdmr, int pgsz,
> +				      u16 pamt_entry_size)
> +{
> +	unsigned long pamt_sz, nr_pamt_entries;
> +
> +	switch (pgsz) {
> +	case TDX_PS_4K:
> +		nr_pamt_entries = tdmr->size >> PAGE_SHIFT;
> +		break;
> +	case TDX_PS_2M:
> +		nr_pamt_entries = tdmr->size >> PMD_SHIFT;
> +		break;
> +	case TDX_PS_1G:
> +		nr_pamt_entries = tdmr->size >> PUD_SHIFT;
> +		break;
> +	default:
> +		WARN_ON_ONCE(1);
> +		return 0;
> +	}
> +
> +	pamt_sz = nr_pamt_entries * pamt_entry_size;
> +	/* TDX requires PAMT size must be 4K aligned */
> +	pamt_sz = ALIGN(pamt_sz, PAGE_SIZE);
> +
> +	return pamt_sz;
> +}
> +
> +/*
> + * Locate a NUMA node which should hold the allocation of the @tdmr
> + * PAMT.  This node will have some memory covered by the TDMR.  The
> + * relative amount of memory covered is not considered.
> + */
> +static int tdmr_get_nid(struct tdmr_info *tdmr, struct list_head *tmb_list)
> +{
> +	struct tdx_memblock *tmb;
> +
> +	/*
> +	 * A TDMR must cover at least part of one TMB.  That TMB will end
> +	 * after the TDMR begins.  But, that TMB may have started before
> +	 * the TDMR.  Find the next 'tmb' that _ends_ after this TDMR
> +	 * begins.  Ignore 'tmb' start addresses.  They are irrelevant.
> +	 */
> +	list_for_each_entry(tmb, tmb_list, list) {
> +		if (tmb->end_pfn > PHYS_PFN(tdmr->base))
> +			return tmb->nid;
> +	}
> +
> +	/*
> +	 * Fall back to allocating the TDMR's metadata from node 0 when
> +	 * no TDX memory block can be found.  This should never happen
> +	 * since TDMRs originate from TDX memory blocks.
> +	 */
> +	pr_warn("TDMR [0x%llx, 0x%llx): unable to find local NUMA node for PAMT allocation, fallback to use node 0.\n",
> +			tdmr->base, tdmr_end(tdmr));
> +	return 0;
> +}
> +
> +#define TDX_PS_NR	(TDX_PS_1G + 1)

This should be next to the rest of the TDX_PS_* definitions.
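
i.e. grouped with the existing page size macros, something like this
(a sketch, assuming the current definitions in the header):

	#define TDX_PS_4K	0
	#define TDX_PS_2M	1
	#define TDX_PS_1G	2
	#define TDX_PS_NR	(TDX_PS_1G + 1)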

> +
> +/*
> + * Allocate PAMTs from the local NUMA node of some memory in @tmb_list
> + * within @tdmr, and set up PAMTs for @tdmr.
> + */
> +static int tdmr_set_up_pamt(struct tdmr_info *tdmr,
> +			    struct list_head *tmb_list,
> +			    u16 pamt_entry_size)
> +{
> +	unsigned long pamt_base[TDX_PS_NR];
> +	unsigned long pamt_size[TDX_PS_NR];
> +	unsigned long tdmr_pamt_base;
> +	unsigned long tdmr_pamt_size;
> +	struct page *pamt;
> +	int pgsz, nid;
> +
> +	nid = tdmr_get_nid(tdmr, tmb_list);
> +
> +	/*
> +	 * Calculate the PAMT size for each TDX supported page size
> +	 * and the total PAMT size.
> +	 */
> +	tdmr_pamt_size = 0;
> +	for (pgsz = TDX_PS_4K; pgsz <= TDX_PS_1G ; pgsz++) {

"< TDX_PS_NR" instead of "<= TDX_PS_1G".

> +		pamt_size[pgsz] = tdmr_get_pamt_sz(tdmr, pgsz,
> +					pamt_entry_size);
> +		tdmr_pamt_size += pamt_size[pgsz];
> +	}
> +
> +	/*
> +	 * Allocate one chunk of physically contiguous memory for all
> +	 * PAMTs.  This helps minimize the PAMT's use of reserved areas
> +	 * in overlapped TDMRs.
> +	 */
> +	pamt = alloc_contig_pages(tdmr_pamt_size >> PAGE_SHIFT, GFP_KERNEL,
> +			nid, &node_online_map);
> +	if (!pamt)
> +		return -ENOMEM;
> +
> +	/*
> +	 * Break the contiguous allocation back up into the
> +	 * individual PAMTs for each page size.
> +	 */
> +	tdmr_pamt_base = page_to_pfn(pamt) << PAGE_SHIFT;
> +	for (pgsz = TDX_PS_4K; pgsz <= TDX_PS_1G; pgsz++) {
> +		pamt_base[pgsz] = tdmr_pamt_base;
> +		tdmr_pamt_base += pamt_size[pgsz];
> +	}
> +
> +	tdmr->pamt_4k_base = pamt_base[TDX_PS_4K];
> +	tdmr->pamt_4k_size = pamt_size[TDX_PS_4K];
> +	tdmr->pamt_2m_base = pamt_base[TDX_PS_2M];
> +	tdmr->pamt_2m_size = pamt_size[TDX_PS_2M];
> +	tdmr->pamt_1g_base = pamt_base[TDX_PS_1G];
> +	tdmr->pamt_1g_size = pamt_size[TDX_PS_1G];
> +
> +	return 0;
> +}
> +
> +static void tdmr_get_pamt(struct tdmr_info *tdmr, unsigned long *pamt_pfn,
> +			  unsigned long *pamt_npages)
> +{
> +	unsigned long pamt_base, pamt_sz;
> +
> +	/*
> +	 * The PAMT was allocated in one contiguous unit.  The 4K PAMT
> +	 * should always point to the beginning of that allocation.
> +	 */
> +	pamt_base = tdmr->pamt_4k_base;
> +	pamt_sz = tdmr->pamt_4k_size + tdmr->pamt_2m_size + tdmr->pamt_1g_size;
> +
> +	*pamt_pfn = PHYS_PFN(pamt_base);
> +	*pamt_npages = pamt_sz >> PAGE_SHIFT;
> +}
> +
> +static void tdmr_free_pamt(struct tdmr_info *tdmr)
> +{
> +	unsigned long pamt_pfn, pamt_npages;
> +
> +	tdmr_get_pamt(tdmr, &pamt_pfn, &pamt_npages);
> +
> +	/* Do nothing if PAMT hasn't been allocated for this TDMR */
> +	if (!pamt_npages)
> +		return;
> +
> +	if (WARN_ON_ONCE(!pamt_pfn))
> +		return;
> +
> +	free_contig_range(pamt_pfn, pamt_npages);
> +}
> +
> +static void tdmrs_free_pamt_all(struct tdmr_info_list *tdmr_list)
> +{
> +	int i;
> +
> +	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
> +		tdmr_free_pamt(tdmr_entry(tdmr_list, i));
> +}
> +
> +/* Allocate and set up PAMTs for all TDMRs */
> +static int tdmrs_set_up_pamt_all(struct tdmr_info_list *tdmr_list,
> +				 struct list_head *tmb_list,
> +				 u16 pamt_entry_size)
> +{
> +	int i, ret = 0;
> +
> +	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
> +		ret = tdmr_set_up_pamt(tdmr_entry(tdmr_list, i), tmb_list,
> +				pamt_entry_size);
> +		if (ret)
> +			goto err;
> +	}
> +
> +	return 0;
> +err:
> +	tdmrs_free_pamt_all(tdmr_list);
> +	return ret;
> +}
> +
> +static unsigned long tdmrs_count_pamt_pages(struct tdmr_info_list *tdmr_list)
> +{
> +	unsigned long pamt_npages = 0;
> +	int i;
> +
> +	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
> +		unsigned long pfn, npages;
> +
> +		tdmr_get_pamt(tdmr_entry(tdmr_list, i), &pfn, &npages);
> +		pamt_npages += npages;
> +	}
> +
> +	return pamt_npages;
> +}
> +
>  /*
>   * Construct a list of TDMRs on the preallocated space in @tdmr_list
>   * to cover all TDX memory regions in @tmb_list based on the TDX module
> @@ -487,10 +684,13 @@ static int construct_tdmrs(struct list_head *tmb_list,
>  	if (ret)
>  		return ret;
>  
> +	ret = tdmrs_set_up_pamt_all(tdmr_list, tmb_list,
> +			sysinfo->pamt_entry_size);
> +	if (ret)
> +		return ret;
>  	/*
>  	 * TODO:
>  	 *
> -	 *  - Allocate and set up PAMTs for each TDMR.
>  	 *  - Designate reserved areas for each TDMR.
>  	 *
>  	 * Return -EINVAL until constructing TDMRs is done
> @@ -547,6 +747,11 @@ static int init_tdx_module(void)
>  	 *  Return error before all steps are done.
>  	 */
>  	ret = -EINVAL;
> +	if (ret)
> +		tdmrs_free_pamt_all(&tdx_tdmr_list);
> +	else
> +		pr_info("%lu KBs allocated for PAMT.\n",
> +				tdmrs_count_pamt_pages(&tdx_tdmr_list) * 4);

"* 4"? This is very cryptic. procfs uses "<< (PAGE_SHIFT - 10)" which
slightly less magic to me. And just make the helper that returns kilobytes
to begin with, if it is the only caller.

>  out_free_tdmrs:
>  	if (ret)
>  		free_tdmr_list(&tdx_tdmr_list);
> diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h
> index c20848e76469..e8110e1a9980 100644
> --- a/arch/x86/virt/vmx/tdx/tdx.h
> +++ b/arch/x86/virt/vmx/tdx/tdx.h
> @@ -133,6 +133,7 @@ struct tdx_memblock {
>  	struct list_head list;
>  	unsigned long start_pfn;
>  	unsigned long end_pfn;
> +	int nid;
>  };
>  
>  struct tdmr_info_list {
> -- 
> 2.40.1
>
Dave Hansen June 8, 2023, 11:43 p.m. UTC | #2
On 6/8/23 16:24, kirill.shutemov@linux.intel.com wrote:
>>  	ret = -EINVAL;
>> +	if (ret)
>> +		tdmrs_free_pamt_all(&tdx_tdmr_list);
>> +	else
>> +		pr_info("%lu KBs allocated for PAMT.\n",
>> +				tdmrs_count_pamt_pages(&tdx_tdmr_list) * 4);
> "* 4"? This is very cryptic. procfs uses "<< (PAGE_SHIFT - 10)" which
> slightly less magic to me. And just make the helper that returns kilobytes
> to begin with, if it is the only caller.

Let's look at where this data comes from:

+static unsigned long tdmrs_count_pamt_pages(struct tdmr_info_list
*tdmr_list)
+{
+	unsigned long pamt_npages = 0;
+	int i;
+
+	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
+		unsigned long pfn, npages;
+
+		tdmr_get_pamt(tdmr_entry(tdmr_list, i), &pfn, &npages);
+		pamt_npages += npages;
+	}

OK, so tdmr_get_pamt() is getting it in pages.  How is it *stored*?

+static void tdmr_get_pamt(struct tdmr_info *tdmr, unsigned long *pamt_pfn,
+			  unsigned long *pamt_npages)
+{
...
+	pamt_sz = tdmr->pamt_4k_size + tdmr->pamt_2m_size + tdmr->pamt_1g_size;
+	*pamt_pfn = PHYS_PFN(pamt_base);
+	*pamt_npages = pamt_sz >> PAGE_SHIFT;
+}

Oh, it's actually stored in bytes.  So to print it out you actually
convert it from bytes->pages->kbytes.  Not the best.

If tdmr_get_pamt() just returned 'pamt_size_bytes', you could do one
conversion at:

	free_contig_range(pamt_pfn, pamt_size_bytes >> PAGE_SHIFT);

and since tdmrs_count_pamt_pages() has only one caller you can just make
it:  tdmrs_count_pamt_kb().  The print becomes:

	pr_info("%lu KBs allocated for PAMT.\n",
		tdmrs_count_pamt_kb(&tdx_tdmr_list) * 4);

and tdmrs_count_pamt_kb() does something super fancy like:

	return pamt_size_bytes / 1024;

which makes total complete obvious sense and needs zero explanation.
Huang, Kai June 12, 2023, 2:52 a.m. UTC | #3
On Thu, 2023-06-08 at 16:43 -0700, Dave Hansen wrote:
> On 6/8/23 16:24, kirill.shutemov@linux.intel.com wrote:
> > >  	ret = -EINVAL;
> > > +	if (ret)
> > > +		tdmrs_free_pamt_all(&tdx_tdmr_list);
> > > +	else
> > > +		pr_info("%lu KBs allocated for PAMT.\n",
> > > +				tdmrs_count_pamt_pages(&tdx_tdmr_list) * 4);
> > "* 4"? This is very cryptic. procfs uses "<< (PAGE_SHIFT - 10)" which
> > slightly less magic to me. And just make the helper that returns kilobytes
> > to begin with, if it is the only caller.
> 
> Let's look at where this data comes from:
> 
> +static unsigned long tdmrs_count_pamt_pages(struct tdmr_info_list
> *tdmr_list)
> +{
> +	unsigned long pamt_npages = 0;
> +	int i;
> +
> +	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
> +		unsigned long pfn, npages;
> +
> +		tdmr_get_pamt(tdmr_entry(tdmr_list, i), &pfn, &npages);
> +		pamt_npages += npages;
> +	}
> 
> OK, so tdmr_get_pamt() is getting it in pages.  How is it *stored*?
> 
> +static void tdmr_get_pamt(struct tdmr_info *tdmr, unsigned long *pamt_pfn,
> +			  unsigned long *pamt_npages)
> +{
> ...
> +	pamt_sz = tdmr->pamt_4k_size + tdmr->pamt_2m_size + tdmr->pamt_1g_size;
> +	*pamt_pfn = PHYS_PFN(pamt_base);
> +	*pamt_npages = pamt_sz >> PAGE_SHIFT;
> +}
> 
> Oh, it's actually stored in bytes.  So to print it out you actually
> convert it from bytes->pages->kbytes.  Not the best.
> 
> If tdmr_get_pamt() just returned 'pamt_size_bytes', you could do one
> conversion at:
> 
> 	free_contig_range(pamt_pfn, pamt_size_bytes >> PAGE_SHIFT);

I thought making tdmr_get_pamt() return pamt_pfn and pamt_npages would make
it clearer that PAMTs must be allocated in 4K granularity, but I guess it
doesn't matter anyway.

If we return bytes for the PAMT size, I think we should also return the
physical address instead of the PFN for the PAMT start?

I'll change tdmr_get_pamt() to return the physical address and the size in
bytes for the PAMT location and size respectively.  Please let me know if
you have any comments.
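
Something like the following (a rough, untested sketch):

	static void tdmr_get_pamt(struct tdmr_info *tdmr,
				  unsigned long *pamt_base,
				  unsigned long *pamt_size)
	{
		/*
		 * The PAMT was allocated in one contiguous unit.  The 4K PAMT
		 * should always point to the beginning of that allocation.
		 */
		*pamt_base = tdmr->pamt_4k_base;
		*pamt_size = tdmr->pamt_4k_size + tdmr->pamt_2m_size +
			     tdmr->pamt_1g_size;
	}

	static void tdmr_free_pamt(struct tdmr_info *tdmr)
	{
		unsigned long pamt_base, pamt_size;

		tdmr_get_pamt(tdmr, &pamt_base, &pamt_size);

		/* Do nothing if PAMT hasn't been allocated for this TDMR */
		if (!pamt_size)
			return;

		if (WARN_ON_ONCE(!pamt_base))
			return;

		/* Convert to pages only at the free_contig_range() boundary */
		free_contig_range(PHYS_PFN(pamt_base), pamt_size >> PAGE_SHIFT);
	}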

> 
> and since tdmrs_count_pamt_pages() has only one caller you can just make
> it:  tdmrs_count_pamt_kb().  The print becomes:
> 
> 	pr_info("%lu KBs allocated for PAMT.\n",
> 		tdmrs_count_pamt_kb(&tdx_tdmr_list));
> 
> and tdmrs_count_pamt_kb() does something super fancy like:
> 
> 	return pamt_size_bytes / 1024;
> 
> which makes total complete obvious sense and needs zero explanation.

Will do.
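
i.e. roughly (untested, building on the reworked tdmr_get_pamt() above):

	static unsigned long tdmrs_count_pamt_kb(struct tdmr_info_list *tdmr_list)
	{
		unsigned long pamt_size = 0;
		int i;

		for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
			unsigned long base, size;

			tdmr_get_pamt(tdmr_entry(tdmr_list, i), &base, &size);
			pamt_size += size;
		}

		/* @pamt_size is in bytes; report kilobytes */
		return pamt_size / 1024;
	}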

Thanks for the feedback.
Nikolay Borisov June 15, 2023, 7:48 a.m. UTC | #4
On 4.06.23 at 17:27, Kai Huang wrote:

<snip>

>   /*
>    * Construct a list of TDMRs on the preallocated space in @tdmr_list
>    * to cover all TDX memory regions in @tmb_list based on the TDX module
> @@ -487,10 +684,13 @@ static int construct_tdmrs(struct list_head *tmb_list,
>   	if (ret)
>   		return ret;
>   
> +	ret = tdmrs_set_up_pamt_all(tdmr_list, tmb_list,
> +			sysinfo->pamt_entry_size);
> +	if (ret)
> +		return ret;
>   	/*
>   	 * TODO:
>   	 *
> -	 *  - Allocate and set up PAMTs for each TDMR.
>   	 *  - Designate reserved areas for each TDMR.
>   	 *
>   	 * Return -EINVAL until constructing TDMRs is done
> @@ -547,6 +747,11 @@ static int init_tdx_module(void)
>   	 *  Return error before all steps are done.
>   	 */
>   	ret = -EINVAL;
> +	if (ret)
> +		tdmrs_free_pamt_all(&tdx_tdmr_list);
> +	else
> +		pr_info("%lu KBs allocated for PAMT.\n",
> +				tdmrs_count_pamt_pages(&tdx_tdmr_list) * 4);

Why not put the pr_info() right after the 'if (ret)' check following
tdmrs_set_up_pamt_all(), and make the tdmrs_free_pamt_all() call
unconditional?

It seems the main reason for having a bunch of conditionals in the exit
path is that you share the put_online_mems() call in both the success and
failure cases. If you simply add:


put_online_mems();
return 0;

// failure labels follow

Then you can make do without the 'if (ret)' checks and have straight-line
code doing the error handling.
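
Roughly like this (a sketch only; 'out_free_pamts' is an invented label
name for illustration):

	/* ... all init steps done, each failing step jumps to a label ... */

	pr_info("%lu KBs allocated for PAMT.\n",
			tdmrs_count_pamt_pages(&tdx_tdmr_list) * 4);
	put_online_mems();
	return 0;

	/* failure labels follow, each undoing one step unconditionally */
out_free_pamts:
	tdmrs_free_pamt_all(&tdx_tdmr_list);
out_free_tdmrs:
	free_tdmr_list(&tdx_tdmr_list);
	/* ... earlier unwind labels ... */
	put_online_mems();
	return ret;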

>   out_free_tdmrs:
>   	if (ret)
>   		free_tdmr_list(&tdx_tdmr_list);
> diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h
> index c20848e76469..e8110e1a9980 100644
> --- a/arch/x86/virt/vmx/tdx/tdx.h
> +++ b/arch/x86/virt/vmx/tdx/tdx.h
> @@ -133,6 +133,7 @@ struct tdx_memblock {
>   	struct list_head list;
>   	unsigned long start_pfn;
>   	unsigned long end_pfn;
> +	int nid;
>   };
>   
>   struct tdmr_info_list {
Huang, Kai June 25, 2023, 3:38 p.m. UTC | #5
On Fri, 2023-06-09 at 02:24 +0300, kirill.shutemov@linux.intel.com wrote:
> On Mon, Jun 05, 2023 at 02:27:25AM +1200, Kai Huang wrote:
> > diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
> > index fa9fa8bc581a..5f0499ba5d67 100644
> > --- a/arch/x86/virt/vmx/tdx/tdx.c
> > +++ b/arch/x86/virt/vmx/tdx/tdx.c
> > @@ -265,7 +265,7 @@ static int tdx_get_sysinfo(struct tdsysinfo_struct *sysinfo,
> >   * overlap.
> >   */
> >  static int add_tdx_memblock(struct list_head *tmb_list, unsigned long start_pfn,
> > -			    unsigned long end_pfn)
> > +			    unsigned long end_pfn, int nid)
> >  {
> >  	struct tdx_memblock *tmb;
> >  
> > @@ -276,6 +276,7 @@ static int add_tdx_memblock(struct list_head *tmb_list, unsigned long start_pfn,
> >  	INIT_LIST_HEAD(&tmb->list);
> >  	tmb->start_pfn = start_pfn;
> >  	tmb->end_pfn = end_pfn;
> > +	tmb->nid = nid;
> >  
> >  	/* @tmb_list is protected by mem_hotplug_lock */
> >  	list_add_tail(&tmb->list, tmb_list);
> > @@ -303,9 +304,9 @@ static void free_tdx_memlist(struct list_head *tmb_list)
> >  static int build_tdx_memlist(struct list_head *tmb_list)
> >  {
> >  	unsigned long start_pfn, end_pfn;
> > -	int i, ret;
> > +	int i, nid, ret;
> >  
> > -	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
> > +	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
> >  		/*
> >  		 * The first 1MB is not reported as TDX convertible memory.
> >  		 * Although the first 1MB is always reserved and won't end up
> > @@ -321,7 +322,7 @@ static int build_tdx_memlist(struct list_head *tmb_list)
> >  		 * memblock has already guaranteed they are in address
> >  		 * ascending order and don't overlap.
> >  		 */
> > -		ret = add_tdx_memblock(tmb_list, start_pfn, end_pfn);
> > +		ret = add_tdx_memblock(tmb_list, start_pfn, end_pfn, nid);
> >  		if (ret)
> >  			goto err;
> >  	}
> 
> 
> These three hunks and the change to struct tdx_memblock look unrelated.
> Why not fold them into 09/20?

Sorry I missed replying to this.

The @nid is used to try to allocate the PAMT from the TDMR's local node.  It
only gets used in this patch.  Originally (in v7) I had it in patch 09 but
Dave suggested moving it to this patch (see the first comment in the link
below):

https://lore.kernel.org/lkml/8e6803f5-bec6-843d-f3c4-75006ffd0d2f@intel.com/

> > +
> > +#define TDX_PS_NR	(TDX_PS_1G + 1)
> 
> This should be next to the rest of the TDX_PS_* definitions.
> 

Done.

Patch

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 2226d8a4c749..ad364f01de33 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1959,6 +1959,7 @@  config INTEL_TDX_HOST
 	depends on KVM_INTEL
 	depends on X86_X2APIC
 	select ARCH_KEEP_MEMBLOCK
+	depends on CONTIG_ALLOC
 	help
 	  Intel Trust Domain Extensions (TDX) protects guest VMs from malicious
 	  host and certain physical attacks.  This option enables necessary TDX
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index fa9fa8bc581a..5f0499ba5d67 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -265,7 +265,7 @@  static int tdx_get_sysinfo(struct tdsysinfo_struct *sysinfo,
  * overlap.
  */
 static int add_tdx_memblock(struct list_head *tmb_list, unsigned long start_pfn,
-			    unsigned long end_pfn)
+			    unsigned long end_pfn, int nid)
 {
 	struct tdx_memblock *tmb;
 
@@ -276,6 +276,7 @@  static int add_tdx_memblock(struct list_head *tmb_list, unsigned long start_pfn,
 	INIT_LIST_HEAD(&tmb->list);
 	tmb->start_pfn = start_pfn;
 	tmb->end_pfn = end_pfn;
+	tmb->nid = nid;
 
 	/* @tmb_list is protected by mem_hotplug_lock */
 	list_add_tail(&tmb->list, tmb_list);
@@ -303,9 +304,9 @@  static void free_tdx_memlist(struct list_head *tmb_list)
 static int build_tdx_memlist(struct list_head *tmb_list)
 {
 	unsigned long start_pfn, end_pfn;
-	int i, ret;
+	int i, nid, ret;
 
-	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
+	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
 		/*
 		 * The first 1MB is not reported as TDX convertible memory.
 		 * Although the first 1MB is always reserved and won't end up
@@ -321,7 +322,7 @@  static int build_tdx_memlist(struct list_head *tmb_list)
 		 * memblock has already guaranteed they are in address
 		 * ascending order and don't overlap.
 		 */
-		ret = add_tdx_memblock(tmb_list, start_pfn, end_pfn);
+		ret = add_tdx_memblock(tmb_list, start_pfn, end_pfn, nid);
 		if (ret)
 			goto err;
 	}
@@ -472,6 +473,202 @@  static int fill_out_tdmrs(struct list_head *tmb_list,
 	return 0;
 }
 
+/*
+ * Calculate PAMT size given a TDMR and a page size.  The returned
+ * PAMT size is always aligned up to 4K page boundary.
+ */
+static unsigned long tdmr_get_pamt_sz(struct tdmr_info *tdmr, int pgsz,
+				      u16 pamt_entry_size)
+{
+	unsigned long pamt_sz, nr_pamt_entries;
+
+	switch (pgsz) {
+	case TDX_PS_4K:
+		nr_pamt_entries = tdmr->size >> PAGE_SHIFT;
+		break;
+	case TDX_PS_2M:
+		nr_pamt_entries = tdmr->size >> PMD_SHIFT;
+		break;
+	case TDX_PS_1G:
+		nr_pamt_entries = tdmr->size >> PUD_SHIFT;
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		return 0;
+	}
+
+	pamt_sz = nr_pamt_entries * pamt_entry_size;
+	/* TDX requires PAMT size must be 4K aligned */
+	pamt_sz = ALIGN(pamt_sz, PAGE_SIZE);
+
+	return pamt_sz;
+}
+
+/*
+ * Locate a NUMA node which should hold the allocation of the @tdmr
+ * PAMT.  This node will have some memory covered by the TDMR.  The
+ * relative amount of memory covered is not considered.
+ */
+static int tdmr_get_nid(struct tdmr_info *tdmr, struct list_head *tmb_list)
+{
+	struct tdx_memblock *tmb;
+
+	/*
+	 * A TDMR must cover at least part of one TMB.  That TMB will end
+	 * after the TDMR begins.  But, that TMB may have started before
+	 * the TDMR.  Find the next 'tmb' that _ends_ after this TDMR
+	 * begins.  Ignore 'tmb' start addresses.  They are irrelevant.
+	 */
+	list_for_each_entry(tmb, tmb_list, list) {
+		if (tmb->end_pfn > PHYS_PFN(tdmr->base))
+			return tmb->nid;
+	}
+
+	/*
+	 * Fall back to allocating the TDMR's metadata from node 0 when
+	 * no TDX memory block can be found.  This should never happen
+	 * since TDMRs originate from TDX memory blocks.
+	 */
+	pr_warn("TDMR [0x%llx, 0x%llx): unable to find local NUMA node for PAMT allocation, fallback to use node 0.\n",
+			tdmr->base, tdmr_end(tdmr));
+	return 0;
+}
+
+#define TDX_PS_NR	(TDX_PS_1G + 1)
+
+/*
+ * Allocate PAMTs from the local NUMA node of some memory in @tmb_list
+ * within @tdmr, and set up PAMTs for @tdmr.
+ */
+static int tdmr_set_up_pamt(struct tdmr_info *tdmr,
+			    struct list_head *tmb_list,
+			    u16 pamt_entry_size)
+{
+	unsigned long pamt_base[TDX_PS_NR];
+	unsigned long pamt_size[TDX_PS_NR];
+	unsigned long tdmr_pamt_base;
+	unsigned long tdmr_pamt_size;
+	struct page *pamt;
+	int pgsz, nid;
+
+	nid = tdmr_get_nid(tdmr, tmb_list);
+
+	/*
+	 * Calculate the PAMT size for each TDX supported page size
+	 * and the total PAMT size.
+	 */
+	tdmr_pamt_size = 0;
+	for (pgsz = TDX_PS_4K; pgsz <= TDX_PS_1G ; pgsz++) {
+		pamt_size[pgsz] = tdmr_get_pamt_sz(tdmr, pgsz,
+					pamt_entry_size);
+		tdmr_pamt_size += pamt_size[pgsz];
+	}
+
+	/*
+	 * Allocate one chunk of physically contiguous memory for all
+	 * PAMTs.  This helps minimize the PAMT's use of reserved areas
+	 * in overlapped TDMRs.
+	 */
+	pamt = alloc_contig_pages(tdmr_pamt_size >> PAGE_SHIFT, GFP_KERNEL,
+			nid, &node_online_map);
+	if (!pamt)
+		return -ENOMEM;
+
+	/*
+	 * Break the contiguous allocation back up into the
+	 * individual PAMTs for each page size.
+	 */
+	tdmr_pamt_base = page_to_pfn(pamt) << PAGE_SHIFT;
+	for (pgsz = TDX_PS_4K; pgsz <= TDX_PS_1G; pgsz++) {
+		pamt_base[pgsz] = tdmr_pamt_base;
+		tdmr_pamt_base += pamt_size[pgsz];
+	}
+
+	tdmr->pamt_4k_base = pamt_base[TDX_PS_4K];
+	tdmr->pamt_4k_size = pamt_size[TDX_PS_4K];
+	tdmr->pamt_2m_base = pamt_base[TDX_PS_2M];
+	tdmr->pamt_2m_size = pamt_size[TDX_PS_2M];
+	tdmr->pamt_1g_base = pamt_base[TDX_PS_1G];
+	tdmr->pamt_1g_size = pamt_size[TDX_PS_1G];
+
+	return 0;
+}
+
+static void tdmr_get_pamt(struct tdmr_info *tdmr, unsigned long *pamt_pfn,
+			  unsigned long *pamt_npages)
+{
+	unsigned long pamt_base, pamt_sz;
+
+	/*
+	 * The PAMT was allocated in one contiguous unit.  The 4K PAMT
+	 * should always point to the beginning of that allocation.
+	 */
+	pamt_base = tdmr->pamt_4k_base;
+	pamt_sz = tdmr->pamt_4k_size + tdmr->pamt_2m_size + tdmr->pamt_1g_size;
+
+	*pamt_pfn = PHYS_PFN(pamt_base);
+	*pamt_npages = pamt_sz >> PAGE_SHIFT;
+}
+
+static void tdmr_free_pamt(struct tdmr_info *tdmr)
+{
+	unsigned long pamt_pfn, pamt_npages;
+
+	tdmr_get_pamt(tdmr, &pamt_pfn, &pamt_npages);
+
+	/* Do nothing if PAMT hasn't been allocated for this TDMR */
+	if (!pamt_npages)
+		return;
+
+	if (WARN_ON_ONCE(!pamt_pfn))
+		return;
+
+	free_contig_range(pamt_pfn, pamt_npages);
+}
+
+static void tdmrs_free_pamt_all(struct tdmr_info_list *tdmr_list)
+{
+	int i;
+
+	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
+		tdmr_free_pamt(tdmr_entry(tdmr_list, i));
+}
+
+/* Allocate and set up PAMTs for all TDMRs */
+static int tdmrs_set_up_pamt_all(struct tdmr_info_list *tdmr_list,
+				 struct list_head *tmb_list,
+				 u16 pamt_entry_size)
+{
+	int i, ret = 0;
+
+	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
+		ret = tdmr_set_up_pamt(tdmr_entry(tdmr_list, i), tmb_list,
+				pamt_entry_size);
+		if (ret)
+			goto err;
+	}
+
+	return 0;
+err:
+	tdmrs_free_pamt_all(tdmr_list);
+	return ret;
+}
+
+static unsigned long tdmrs_count_pamt_pages(struct tdmr_info_list *tdmr_list)
+{
+	unsigned long pamt_npages = 0;
+	int i;
+
+	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
+		unsigned long pfn, npages;
+
+		tdmr_get_pamt(tdmr_entry(tdmr_list, i), &pfn, &npages);
+		pamt_npages += npages;
+	}
+
+	return pamt_npages;
+}
+
 /*
  * Construct a list of TDMRs on the preallocated space in @tdmr_list
  * to cover all TDX memory regions in @tmb_list based on the TDX module
@@ -487,10 +684,13 @@  static int construct_tdmrs(struct list_head *tmb_list,
 	if (ret)
 		return ret;
 
+	ret = tdmrs_set_up_pamt_all(tdmr_list, tmb_list,
+			sysinfo->pamt_entry_size);
+	if (ret)
+		return ret;
 	/*
 	 * TODO:
 	 *
-	 *  - Allocate and set up PAMTs for each TDMR.
 	 *  - Designate reserved areas for each TDMR.
 	 *
 	 * Return -EINVAL until constructing TDMRs is done
@@ -547,6 +747,11 @@  static int init_tdx_module(void)
 	 *  Return error before all steps are done.
 	 */
 	ret = -EINVAL;
+	if (ret)
+		tdmrs_free_pamt_all(&tdx_tdmr_list);
+	else
+		pr_info("%lu KBs allocated for PAMT.\n",
+				tdmrs_count_pamt_pages(&tdx_tdmr_list) * 4);
 out_free_tdmrs:
 	if (ret)
 		free_tdmr_list(&tdx_tdmr_list);
diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h
index c20848e76469..e8110e1a9980 100644
--- a/arch/x86/virt/vmx/tdx/tdx.h
+++ b/arch/x86/virt/vmx/tdx/tdx.h
@@ -133,6 +133,7 @@  struct tdx_memblock {
 	struct list_head list;
 	unsigned long start_pfn;
 	unsigned long end_pfn;
+	int nid;
 };
 
 struct tdmr_info_list {