diff mbox series

[v5,15/22] x86/virt/tdx: Allocate and set up PAMTs for TDMRs

Message ID c504a8acd06dc455050c25e2a4cc70aef5eb9358.1655894131.git.kai.huang@intel.com (mailing list archive)
State New, archived
Headers show
Series TDX host kernel support | expand

Commit Message

Huang, Kai June 22, 2022, 11:17 a.m. UTC
The TDX module uses additional metadata to record things like which
guest "owns" a given page of memory.  This metadata, referred as
Physical Address Metadata Table (PAMT), essentially serves as the
'struct page' for the TDX module.  PAMTs are not reserved by hardware
up front.  They must be allocated by the kernel and then given to the
TDX module.

TDX supports 3 page sizes: 4K, 2M, and 1G.  Each "TD Memory Region"
(TDMR) has 3 PAMTs to track the 3 supported page sizes.  Each PAMT must
be a physically contiguous area from a Convertible Memory Region (CMR).
However, the PAMTs which track pages in one TDMR do not need to reside
within that TDMR but can be anywhere in CMRs.  If one PAMT overlaps with
any TDMR, the overlapping part must be reported as a reserved area in
that particular TDMR.

Use alloc_contig_pages() since PAMT must be a physically contiguous area
and it may be potentially large (~1/256th of the size of the given TDMR).
The downside is alloc_contig_pages() may fail at runtime.  One (bad)
mitigation is to launch a TD guest early during system boot to get those
PAMTs allocated at early time, but the only way to fix is to add a boot
option to allocate or reserve PAMTs during kernel boot.

TDX only supports a limited number of reserved areas per TDMR to cover
both PAMTs and memory holes within the given TDMR.  If many PAMTs are
allocated within a single TDMR, the reserved areas may not be sufficient
to cover all of them.

Adopt the following policies when allocating PAMTs for a given TDMR:

  - Allocate three PAMTs of the TDMR in one contiguous chunk to minimize
    the total number of reserved areas consumed for PAMTs.
  - Try to first allocate PAMT from the local node of the TDMR for better
    NUMA locality.

Also dump out how many pages are allocated for PAMTs when the TDX module
is initialized successfully.

Signed-off-by: Kai Huang <kai.huang@intel.com>
---

- v3 -> v5 (no feedback on v4):
 - Used memblock to get the NUMA node for given TDMR.
 - Removed tdmr_get_pamt_sz() helper but use open-code instead.
 - Changed to use 'switch .. case..' for each TDX supported page size in
   tdmr_get_pamt_sz() (the original __tdmr_get_pamt_sz()).
 - Added printing out memory used for PAMT allocation when TDX module is
   initialized successfully.
 - Explained downside of alloc_contig_pages() in changelog.
 - Addressed other minor comments.

---
 arch/x86/Kconfig            |   1 +
 arch/x86/virt/vmx/tdx/tdx.c | 200 ++++++++++++++++++++++++++++++++++++
 2 files changed, 201 insertions(+)

Comments

Dave Hansen June 24, 2022, 8:13 p.m. UTC | #1
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index 4988a91d5283..ec496e96d120 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -1973,6 +1973,7 @@ config INTEL_TDX_HOST
>  	depends on CPU_SUP_INTEL
>  	depends on X86_64
>  	depends on KVM_INTEL
> +	depends on CONTIG_ALLOC
>  	select ARCH_HAS_CC_PLATFORM
>  	select ARCH_KEEP_MEMBLOCK
>  	help
> diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
> index fd9f449b5395..36260dd7e69f 100644
> --- a/arch/x86/virt/vmx/tdx/tdx.c
> +++ b/arch/x86/virt/vmx/tdx/tdx.c
> @@ -558,6 +558,196 @@ static int create_tdmrs(struct tdmr_info *tdmr_array, int *tdmr_num)
>  	return 0;
>  }
>  
> +/* Page sizes supported by TDX */
> +enum tdx_page_sz {
> +	TDX_PG_4K,
> +	TDX_PG_2M,
> +	TDX_PG_1G,
> +	TDX_PG_MAX,
> +};

Are these the same constants as the magic numbers in Kirill's
try_accept_one()?

> +/*
> + * Calculate PAMT size given a TDMR and a page size.  The returned
> + * PAMT size is always aligned up to 4K page boundary.
> + */
> +static unsigned long tdmr_get_pamt_sz(struct tdmr_info *tdmr,
> +				      enum tdx_page_sz pgsz)
> +{
> +	unsigned long pamt_sz;
> +	int pamt_entry_nr;

'nr_pamt_entries', please.

> +	switch (pgsz) {
> +	case TDX_PG_4K:
> +		pamt_entry_nr = tdmr->size >> PAGE_SHIFT;
> +		break;
> +	case TDX_PG_2M:
> +		pamt_entry_nr = tdmr->size >> PMD_SHIFT;
> +		break;
> +	case TDX_PG_1G:
> +		pamt_entry_nr = tdmr->size >> PUD_SHIFT;
> +		break;
> +	default:
> +		WARN_ON_ONCE(1);
> +		return 0;
> +	}
> +
> +	pamt_sz = pamt_entry_nr * tdx_sysinfo.pamt_entry_size;
> +	/* TDX requires PAMT size must be 4K aligned */
> +	pamt_sz = ALIGN(pamt_sz, PAGE_SIZE);
> +
> +	return pamt_sz;
> +}
> +
> +/*
> + * Pick a NUMA node on which to allocate this TDMR's metadata.
> + *
> + * This is imprecise since TDMRs are 1G aligned and NUMA nodes might
> + * not be.  If the TDMR covers more than one node, just use the _first_
> + * one.  This can lead to small areas of off-node metadata for some
> + * memory.
> + */
> +static int tdmr_get_nid(struct tdmr_info *tdmr)
> +{
> +	unsigned long start_pfn, end_pfn;
> +	int i, nid;
> +
> +	/* Find the first memory region covered by the TDMR */
> +	memblock_for_each_tdx_mem_pfn_range(i, &start_pfn, &end_pfn, &nid) {
> +		if (end_pfn > (tdmr_start(tdmr) >> PAGE_SHIFT))
> +			return nid;
> +	}
> +
> +	/*
> +	 * No memory region found for this TDMR.  It cannot happen since
> +	 * when one TDMR is created, it must cover at least one (or
> +	 * partial) memory region.
> +	 */
> +	WARN_ON_ONCE(1);
> +	return 0;
> +}

You should really describe what you are doing.  At first glance "return
0;" looks like "declare success".  How about something like this?

	/*
	 * Fall back to allocating the TDMR from node 0 when no memblock
	 * can be found.  This should never happen since TDMRs originate
	 * from the memblocks.
	 */

Does that miss any of the points you were trying to make?

> +static int tdmr_set_up_pamt(struct tdmr_info *tdmr)
> +{
> +	unsigned long pamt_base[TDX_PG_MAX];
> +	unsigned long pamt_size[TDX_PG_MAX];
> +	unsigned long tdmr_pamt_base;
> +	unsigned long tdmr_pamt_size;
> +	enum tdx_page_sz pgsz;
> +	struct page *pamt;
> +	int nid;
> +
> +	nid = tdmr_get_nid(tdmr);
> +
> +	/*
> +	 * Calculate the PAMT size for each TDX supported page size
> +	 * and the total PAMT size.
> +	 */
> +	tdmr_pamt_size = 0;
> +	for (pgsz = TDX_PG_4K; pgsz < TDX_PG_MAX; pgsz++) {
> +		pamt_size[pgsz] = tdmr_get_pamt_sz(tdmr, pgsz);
> +		tdmr_pamt_size += pamt_size[pgsz];
> +	}
> +
> +	/*
> +	 * Allocate one chunk of physically contiguous memory for all
> +	 * PAMTs.  This helps minimize the PAMT's use of reserved areas
> +	 * in overlapped TDMRs.
> +	 */
> +	pamt = alloc_contig_pages(tdmr_pamt_size >> PAGE_SHIFT, GFP_KERNEL,
> +			nid, &node_online_map);
> +	if (!pamt)
> +		return -ENOMEM;

I'm not sure it's worth mentioning, but this doesn't really need to be
GFP_KERNEL.  __GFP_HIGHMEM would actually be just fine.  But,
considering that this is 64-bit only, that's just a technicality.

> +	/* Calculate PAMT base and size for all supported page sizes. */

That comment isn't doing much good.  If you say anything here it should be:

	/*
	 * Break the contiguous allocation back up into
	 * the individual PAMTs for each page size:
	 */

Also, this is *not* "calculating size".  That's done above.

> +	tdmr_pamt_base = page_to_pfn(pamt) << PAGE_SHIFT;
> +	for (pgsz = TDX_PG_4K; pgsz < TDX_PG_MAX; pgsz++) {
> +		pamt_base[pgsz] = tdmr_pamt_base;
> +		tdmr_pamt_base += pamt_size[pgsz];
> +	}
> +
> +	tdmr->pamt_4k_base = pamt_base[TDX_PG_4K];
> +	tdmr->pamt_4k_size = pamt_size[TDX_PG_4K];
> +	tdmr->pamt_2m_base = pamt_base[TDX_PG_2M];
> +	tdmr->pamt_2m_size = pamt_size[TDX_PG_2M];
> +	tdmr->pamt_1g_base = pamt_base[TDX_PG_1G];
> +	tdmr->pamt_1g_size = pamt_size[TDX_PG_1G];
> +
> +	return 0;
> +}
>
> +static void tdmr_get_pamt(struct tdmr_info *tdmr, unsigned long *pamt_pfn,
> +			  unsigned long *pamt_npages)
> +{
> +	unsigned long pamt_base, pamt_sz;
> +
> +	/*
> +	 * The PAMT was allocated in one contiguous unit.  The 4K PAMT
> +	 * should always point to the beginning of that allocation.
> +	 */
> +	pamt_base = tdmr->pamt_4k_base;
> +	pamt_sz = tdmr->pamt_4k_size + tdmr->pamt_2m_size + tdmr->pamt_1g_size;
> +
> +	*pamt_pfn = pamt_base >> PAGE_SHIFT;
> +	*pamt_npages = pamt_sz >> PAGE_SHIFT;
> +}
> +
> +static void tdmr_free_pamt(struct tdmr_info *tdmr)
> +{
> +	unsigned long pamt_pfn, pamt_npages;
> +
> +	tdmr_get_pamt(tdmr, &pamt_pfn, &pamt_npages);
> +
> +	/* Do nothing if PAMT hasn't been allocated for this TDMR */
> +	if (!pamt_npages)
> +		return;
> +
> +	if (WARN_ON_ONCE(!pamt_pfn))
> +		return;
> +
> +	free_contig_range(pamt_pfn, pamt_npages);
> +}
> +
> +static void tdmrs_free_pamt_all(struct tdmr_info *tdmr_array, int tdmr_num)
> +{
> +	int i;
> +
> +	for (i = 0; i < tdmr_num; i++)
> +		tdmr_free_pamt(tdmr_array_entry(tdmr_array, i));
> +}
> +
> +/* Allocate and set up PAMTs for all TDMRs */
> +static int tdmrs_set_up_pamt_all(struct tdmr_info *tdmr_array, int tdmr_num)
> +{
> +	int i, ret = 0;
> +
> +	for (i = 0; i < tdmr_num; i++) {
> +		ret = tdmr_set_up_pamt(tdmr_array_entry(tdmr_array, i));
> +		if (ret)
> +			goto err;
> +	}
> +
> +	return 0;
> +err:
> +	tdmrs_free_pamt_all(tdmr_array, tdmr_num);
> +	return ret;
> +}
> +
> +static unsigned long tdmrs_get_pamt_pages(struct tdmr_info *tdmr_array,
> +					  int tdmr_num)

"get" is for refcounting.  tdmrs_count_pamt_pages() would be preferable.

> +{
> +	unsigned long pamt_npages = 0;
> +	int i;
> +
> +	for (i = 0; i < tdmr_num; i++) {
> +		unsigned long pfn, npages;
> +
> +		tdmr_get_pamt(tdmr_array_entry(tdmr_array, i), &pfn, &npages);
> +		pamt_npages += npages;
> +	}
> +
> +	return pamt_npages;
> +}
> +
>  /*
>   * Construct an array of TDMRs to cover all memory regions in memblock.
>   * This makes sure all pages managed by the page allocator are TDX
> @@ -572,8 +762,13 @@ static int construct_tdmrs_memeblock(struct tdmr_info *tdmr_array,
>  	if (ret)
>  		goto err;
>  
> +	ret = tdmrs_set_up_pamt_all(tdmr_array, *tdmr_num);
> +	if (ret)
> +		goto err;
> +
>  	/* Return -EINVAL until constructing TDMRs is done */
>  	ret = -EINVAL;
> +	tdmrs_free_pamt_all(tdmr_array, *tdmr_num);
>  err:
>  	return ret;
>  }
> @@ -644,6 +839,11 @@ static int init_tdx_module(void)
>  	 * process are done.
>  	 */
>  	ret = -EINVAL;
> +	if (ret)
> +		tdmrs_free_pamt_all(tdmr_array, tdmr_num);
> +	else
> +		pr_info("%lu pages allocated for PAMT.\n",
> +				tdmrs_get_pamt_pages(tdmr_array, tdmr_num));
>  out_free_tdmrs:
>  	/*
>  	 * The array of TDMRs is freed no matter the initialization is

The rest looks OK.
Huang, Kai June 27, 2022, 10:31 a.m. UTC | #2
On Fri, 2022-06-24 at 13:13 -0700, Dave Hansen wrote:
> > diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> > index 4988a91d5283..ec496e96d120 100644
> > --- a/arch/x86/Kconfig
> > +++ b/arch/x86/Kconfig
> > @@ -1973,6 +1973,7 @@ config INTEL_TDX_HOST
> >  	depends on CPU_SUP_INTEL
> >  	depends on X86_64
> >  	depends on KVM_INTEL
> > +	depends on CONTIG_ALLOC
> >  	select ARCH_HAS_CC_PLATFORM
> >  	select ARCH_KEEP_MEMBLOCK
> >  	help
> > diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
> > index fd9f449b5395..36260dd7e69f 100644
> > --- a/arch/x86/virt/vmx/tdx/tdx.c
> > +++ b/arch/x86/virt/vmx/tdx/tdx.c
> > @@ -558,6 +558,196 @@ static int create_tdmrs(struct tdmr_info *tdmr_array,
> > int *tdmr_num)
> >  	return 0;
> >  }
> >  
> > +/* Page sizes supported by TDX */
> > +enum tdx_page_sz {
> > +	TDX_PG_4K,
> > +	TDX_PG_2M,
> > +	TDX_PG_1G,
> > +	TDX_PG_MAX,
> > +};
> 
> Are these the same constants as the magic numbers in Kirill's
> try_accept_one()?

try_accept_once() uses 'enum pg_level' PG_LEVEL_{4K,2M,1G} directly.  They can
be used directly too, but 'enum pg_level' has more than we need here:

enum pg_level {
        PG_LEVEL_NONE,                                                         
        PG_LEVEL_4K,                                                           
        PG_LEVEL_2M,                                                           
        PG_LEVEL_1G,
        PG_LEVEL_512G,                                                         
        PG_LEVEL_NUM                                                           
}; 

It has PG_LEVEL_NONE, so PG_LEVEL_4K starts with 1.

Below in tdmr_set_up_pamt(), I have two local arrays to store the base/size for
all TDX supported page sizes:

	unsigned long pamt_base[TDX_PG_MAX];
	unsigned long pamt_size[TDX_PG_MAX]; 

And a loop to calculate the size of PAMT for each page size:

	for (pgsz = TDX_PG_4K; pgsz < TDX_PG_MAX; pgsz++) {
		pamt_size[pgsz] = tdmr_get_pamt_sz(tdmr, pgsz);
		...
	}

And later a similar loop to get the PAMT base of each page size too.

I can change them to:

	/*
	 * TDX only supports 4K, 2M and 1G page, but doesn't
	 * support 512G page size.
	 */
#define TDX_PG_LEVEL_MAX	PG_LEVEL_512G

	unsigned long pamt_base[TDX_PG_LEVEL_MAX];
	unsigned long pamt_size[TDX_PG_LEVEL_MAX];

And change the loop to:

	for (pgsz = PG_LEVEL_4K; pgsz < TDX_PG_LEVEL_MAX; pgsz++) {
		pamt_size[pgsz] = tdmr_get_pamt_sz(tdmr, pgsz);
		...
	}

This would waste one 'unsigned long' for both pamt_base and pamt_size array, as
entry 0 isn't used for both of them.  Or we explicitly -1 array index:

	for (pgsz = PG_LEVEL_4K; pgsz < TDX_PG_LEVEL_MAX; pgsz++) {
		pamt_size[pgsz - 1] = tdmr_get_pamt_sz(tdmr, pgsz);
		...
	}

What's your opinion? 

> > +/*
> > + * Calculate PAMT size given a TDMR and a page size.  The returned
> > + * PAMT size is always aligned up to 4K page boundary.
> > + */
> > +static unsigned long tdmr_get_pamt_sz(struct tdmr_info *tdmr,
> > +				      enum tdx_page_sz pgsz)
> > +{
> > +	unsigned long pamt_sz;
> > +	int pamt_entry_nr;
> 
> 'nr_pamt_entries', please.

OK.

> 
> > +	switch (pgsz) {
> > +	case TDX_PG_4K:
> > +		pamt_entry_nr = tdmr->size >> PAGE_SHIFT;
> > +		break;
> > +	case TDX_PG_2M:
> > +		pamt_entry_nr = tdmr->size >> PMD_SHIFT;
> > +		break;
> > +	case TDX_PG_1G:
> > +		pamt_entry_nr = tdmr->size >> PUD_SHIFT;
> > +		break;
> > +	default:
> > +		WARN_ON_ONCE(1);
> > +		return 0;
> > +	}
> > +
> > +	pamt_sz = pamt_entry_nr * tdx_sysinfo.pamt_entry_size;
> > +	/* TDX requires PAMT size must be 4K aligned */
> > +	pamt_sz = ALIGN(pamt_sz, PAGE_SIZE);
> > +
> > +	return pamt_sz;
> > +}
> > +
> > +/*
> > + * Pick a NUMA node on which to allocate this TDMR's metadata.
> > + *
> > + * This is imprecise since TDMRs are 1G aligned and NUMA nodes might
> > + * not be.  If the TDMR covers more than one node, just use the _first_
> > + * one.  This can lead to small areas of off-node metadata for some
> > + * memory.
> > + */
> > +static int tdmr_get_nid(struct tdmr_info *tdmr)
> > +{
> > +	unsigned long start_pfn, end_pfn;
> > +	int i, nid;
> > +
> > +	/* Find the first memory region covered by the TDMR */
> > +	memblock_for_each_tdx_mem_pfn_range(i, &start_pfn, &end_pfn, &nid)
> > {
> > +		if (end_pfn > (tdmr_start(tdmr) >> PAGE_SHIFT))
> > +			return nid;
> > +	}
> > +
> > +	/*
> > +	 * No memory region found for this TDMR.  It cannot happen since
> > +	 * when one TDMR is created, it must cover at least one (or
> > +	 * partial) memory region.
> > +	 */
> > +	WARN_ON_ONCE(1);
> > +	return 0;
> > +}
> 
> You should really describe what you are doing.  At first glance "return
> 0;" looks like "declare success".  How about something like this?
> 
> 	/*
> 	 * Fall back to allocating the TDMR from node 0 when no memblock
> 	 * can be found.  This should never happen since TDMRs originate
> 	 * from the memblocks.
> 	 */
> 
> Does that miss any of the points you were trying to make?

No. Your comments looks better and will use yours.  Thanks.

> 
> > +static int tdmr_set_up_pamt(struct tdmr_info *tdmr)
> > +{
> > +	unsigned long pamt_base[TDX_PG_MAX];
> > +	unsigned long pamt_size[TDX_PG_MAX];
> > +	unsigned long tdmr_pamt_base;
> > +	unsigned long tdmr_pamt_size;
> > +	enum tdx_page_sz pgsz;
> > +	struct page *pamt;
> > +	int nid;
> > +
> > +	nid = tdmr_get_nid(tdmr);
> > +
> > +	/*
> > +	 * Calculate the PAMT size for each TDX supported page size
> > +	 * and the total PAMT size.
> > +	 */
> > +	tdmr_pamt_size = 0;
> > +	for (pgsz = TDX_PG_4K; pgsz < TDX_PG_MAX; pgsz++) {
> > +		pamt_size[pgsz] = tdmr_get_pamt_sz(tdmr, pgsz);
> > +		tdmr_pamt_size += pamt_size[pgsz];
> > +	}
> > +
> > +	/*
> > +	 * Allocate one chunk of physically contiguous memory for all
> > +	 * PAMTs.  This helps minimize the PAMT's use of reserved areas
> > +	 * in overlapped TDMRs.
> > +	 */
> > +	pamt = alloc_contig_pages(tdmr_pamt_size >> PAGE_SHIFT, GFP_KERNEL,
> > +			nid, &node_online_map);
> > +	if (!pamt)
> > +		return -ENOMEM;
> 
> I'm not sure it's worth mentioning, but this doesn't really need to be
> GFP_KERNEL.  __GFP_HIGHMEM would actually be just fine.  But,
> considering that this is 64-bit only, that's just a technicality.



> 
> > +	/* Calculate PAMT base and size for all supported page sizes. */
> 
> That comment isn't doing much good.  If you say anything here it should be:
> 
> 	/*
> 	 * Break the contiguous allocation back up into
> 	 * the individual PAMTs for each page size:
> 	 */
> 
> Also, this is *not* "calculating size".  That's done above.

Thanks will use this comment.

> 
> > +	tdmr_pamt_base = page_to_pfn(pamt) << PAGE_SHIFT;
> > +	for (pgsz = TDX_PG_4K; pgsz < TDX_PG_MAX; pgsz++) {
> > +		pamt_base[pgsz] = tdmr_pamt_base;
> > +		tdmr_pamt_base += pamt_size[pgsz];
> > +	}
> > +
> > +	tdmr->pamt_4k_base = pamt_base[TDX_PG_4K];
> > +	tdmr->pamt_4k_size = pamt_size[TDX_PG_4K];
> > +	tdmr->pamt_2m_base = pamt_base[TDX_PG_2M];
> > +	tdmr->pamt_2m_size = pamt_size[TDX_PG_2M];
> > +	tdmr->pamt_1g_base = pamt_base[TDX_PG_1G];
> > +	tdmr->pamt_1g_size = pamt_size[TDX_PG_1G];
> > +
> > +	return 0;
> > +}
> > 
> > +static void tdmr_get_pamt(struct tdmr_info *tdmr, unsigned long *pamt_pfn,
> > +			  unsigned long *pamt_npages)
> > +{
> > +	unsigned long pamt_base, pamt_sz;
> > +
> > +	/*
> > +	 * The PAMT was allocated in one contiguous unit.  The 4K PAMT
> > +	 * should always point to the beginning of that allocation.
> > +	 */
> > +	pamt_base = tdmr->pamt_4k_base;
> > +	pamt_sz = tdmr->pamt_4k_size + tdmr->pamt_2m_size + tdmr-
> > >pamt_1g_size;
> > +
> > +	*pamt_pfn = pamt_base >> PAGE_SHIFT;
> > +	*pamt_npages = pamt_sz >> PAGE_SHIFT;
> > +}
> > +
> > +static void tdmr_free_pamt(struct tdmr_info *tdmr)
> > +{
> > +	unsigned long pamt_pfn, pamt_npages;
> > +
> > +	tdmr_get_pamt(tdmr, &pamt_pfn, &pamt_npages);
> > +
> > +	/* Do nothing if PAMT hasn't been allocated for this TDMR */
> > +	if (!pamt_npages)
> > +		return;
> > +
> > +	if (WARN_ON_ONCE(!pamt_pfn))
> > +		return;
> > +
> > +	free_contig_range(pamt_pfn, pamt_npages);
> > +}
> > +
> > +static void tdmrs_free_pamt_all(struct tdmr_info *tdmr_array, int tdmr_num)
> > +{
> > +	int i;
> > +
> > +	for (i = 0; i < tdmr_num; i++)
> > +		tdmr_free_pamt(tdmr_array_entry(tdmr_array, i));
> > +}
> > +
> > +/* Allocate and set up PAMTs for all TDMRs */
> > +static int tdmrs_set_up_pamt_all(struct tdmr_info *tdmr_array, int
> > tdmr_num)
> > +{
> > +	int i, ret = 0;
> > +
> > +	for (i = 0; i < tdmr_num; i++) {
> > +		ret = tdmr_set_up_pamt(tdmr_array_entry(tdmr_array, i));
> > +		if (ret)
> > +			goto err;
> > +	}
> > +
> > +	return 0;
> > +err:
> > +	tdmrs_free_pamt_all(tdmr_array, tdmr_num);
> > +	return ret;
> > +}
> > +
> > +static unsigned long tdmrs_get_pamt_pages(struct tdmr_info *tdmr_array,
> > +					  int tdmr_num)
> 
> "get" is for refcounting.  tdmrs_count_pamt_pages() would be preferable.

Will use count.  Thanks.

> 
> > +{
> > +	unsigned long pamt_npages = 0;
> > +	int i;
> > +
> > +	for (i = 0; i < tdmr_num; i++) {
> > +		unsigned long pfn, npages;
> > +
> > +		tdmr_get_pamt(tdmr_array_entry(tdmr_array, i), &pfn,
> > &npages);
> > +		pamt_npages += npages;
> > +	}
> > +
> > +	return pamt_npages;
> > +}
> > +
> >  /*
> >   * Construct an array of TDMRs to cover all memory regions in memblock.
> >   * This makes sure all pages managed by the page allocator are TDX
> > @@ -572,8 +762,13 @@ static int construct_tdmrs_memeblock(struct tdmr_info
> > *tdmr_array,
> >  	if (ret)
> >  		goto err;
> >  
> > +	ret = tdmrs_set_up_pamt_all(tdmr_array, *tdmr_num);
> > +	if (ret)
> > +		goto err;
> > +
> >  	/* Return -EINVAL until constructing TDMRs is done */
> >  	ret = -EINVAL;
> > +	tdmrs_free_pamt_all(tdmr_array, *tdmr_num);
> >  err:
> >  	return ret;
> >  }
> > @@ -644,6 +839,11 @@ static int init_tdx_module(void)
> >  	 * process are done.
> >  	 */
> >  	ret = -EINVAL;
> > +	if (ret)
> > +		tdmrs_free_pamt_all(tdmr_array, tdmr_num);
> > +	else
> > +		pr_info("%lu pages allocated for PAMT.\n",
> > +				tdmrs_get_pamt_pages(tdmr_array,
> > tdmr_num));
> >  out_free_tdmrs:
> >  	/*
> >  	 * The array of TDMRs is freed no matter the initialization is
> 
> The rest looks OK.

Thanks.
Dave Hansen June 27, 2022, 8:41 p.m. UTC | #3
On 6/27/22 03:31, Kai Huang wrote:
>>> +/* Page sizes supported by TDX */
>>> +enum tdx_page_sz {
>>> +	TDX_PG_4K,
>>> +	TDX_PG_2M,
>>> +	TDX_PG_1G,
>>> +	TDX_PG_MAX,
>>> +};
>> Are these the same constants as the magic numbers in Kirill's
>> try_accept_one()?
> try_accept_once() uses 'enum pg_level' PG_LEVEL_{4K,2M,1G} directly.  They can
> be used directly too, but 'enum pg_level' has more than we need here:

I meant this:

+       switch (level) {
+       case PG_LEVEL_4K:
+               page_size = 0;
+               break;

Because TDX_PG_4K==page_size==0, and for this:

+       case PG_LEVEL_2M:
+               page_size = 1;

where TDX_PG_2M==page_size==1

See?

Are Kirill's magic 0/1/2 numbers the same as

	TDX_PG_4K,
	TDX_PG_2M,
	TDX_PG_1G,

?
Huang, Kai June 27, 2022, 10:50 p.m. UTC | #4
On Mon, 2022-06-27 at 13:41 -0700, Dave Hansen wrote:
> On 6/27/22 03:31, Kai Huang wrote:
> > > > +/* Page sizes supported by TDX */
> > > > +enum tdx_page_sz {
> > > > +	TDX_PG_4K,
> > > > +	TDX_PG_2M,
> > > > +	TDX_PG_1G,
> > > > +	TDX_PG_MAX,
> > > > +};
> > > Are these the same constants as the magic numbers in Kirill's
> > > try_accept_one()?
> > try_accept_once() uses 'enum pg_level' PG_LEVEL_{4K,2M,1G} directly.  They can
> > be used directly too, but 'enum pg_level' has more than we need here:
> 
> I meant this:
> 
> +       switch (level) {
> +       case PG_LEVEL_4K:
> +               page_size = 0;
> +               break;
> 
> Because TDX_PG_4K==page_size==0, and for this:
> 
> +       case PG_LEVEL_2M:
> +               page_size = 1;
> 
> where TDX_PG_2M==page_size==1
> 
> See?
> 
> Are Kirill's magic 0/1/2 numbers the same as
> 
> 	TDX_PG_4K,
> 	TDX_PG_2M,
> 	TDX_PG_1G,
> 
> ?

Yes they are the same.  Kirill uses 0/1/2 as input of TDX_ACCEPT_PAGE TDCALL. 
Here I only need them to distinguish different page sizes.

Do you mean we should put TDX_PG_4K/2M/1G definition to asm/tdx.h, and
try_accept_one() should use them instead of magic 0/1/2?
Dave Hansen June 27, 2022, 10:57 p.m. UTC | #5
On 6/27/22 15:50, Kai Huang wrote:
>> Are Kirill's magic 0/1/2 numbers the same as
>>
>> 	TDX_PG_4K,
>> 	TDX_PG_2M,
>> 	TDX_PG_1G,
>>
>> ?
> Yes they are the same.  Kirill uses 0/1/2 as input of TDX_ACCEPT_PAGE TDCALL. 
> Here I only need them to distinguish different page sizes.
> 
> Do you mean we should put TDX_PG_4K/2M/1G definition to asm/tdx.h, and
> try_accept_one() should use them instead of magic 0/1/2?

I honestly don't care how you do it as long as the magic numbers go away
(within reason).
Huang, Kai June 27, 2022, 11:05 p.m. UTC | #6
On Mon, 2022-06-27 at 15:57 -0700, Dave Hansen wrote:
> On 6/27/22 15:50, Kai Huang wrote:
> > > Are Kirill's magic 0/1/2 numbers the same as
> > > 
> > > 	TDX_PG_4K,
> > > 	TDX_PG_2M,
> > > 	TDX_PG_1G,
> > > 
> > > ?
> > Yes they are the same.  Kirill uses 0/1/2 as input of TDX_ACCEPT_PAGE TDCALL. 
> > Here I only need them to distinguish different page sizes.
> > 
> > Do you mean we should put TDX_PG_4K/2M/1G definition to asm/tdx.h, and
> > try_accept_one() should use them instead of magic 0/1/2?
> 
> I honestly don't care how you do it as long as the magic numbers go away
> (within reason).

OK.  I'll write a patch to replace 0/1/2 magic numbers in try_accept_one().
Xiaoyao Li June 28, 2022, 12:48 a.m. UTC | #7
On 6/28/2022 4:41 AM, Dave Hansen wrote:
> On 6/27/22 03:31, Kai Huang wrote:
>>>> +/* Page sizes supported by TDX */
>>>> +enum tdx_page_sz {
>>>> +	TDX_PG_4K,
>>>> +	TDX_PG_2M,
>>>> +	TDX_PG_1G,
>>>> +	TDX_PG_MAX,
>>>> +};
>>> Are these the same constants as the magic numbers in Kirill's
>>> try_accept_one()?
>> try_accept_once() uses 'enum pg_level' PG_LEVEL_{4K,2M,1G} directly.  They can
>> be used directly too, but 'enum pg_level' has more than we need here:
> 
> I meant this:
> 
> +       switch (level) {
> +       case PG_LEVEL_4K:
> +               page_size = 0;
> +               break;
> 
> Because TDX_PG_4K==page_size==0, and for this:
> 
> +       case PG_LEVEL_2M:
> +               page_size = 1;

here we can just do

	page_size = level - 1;

or
	
	tdx_page_level = level - 1;

yes, TDX's page level definition is one level smaller of Linux's definition.

> where TDX_PG_2M==page_size==1
> 
> See?
> 
> Are Kirill's magic 0/1/2 numbers the same as
> 
> 	TDX_PG_4K,
> 	TDX_PG_2M,
> 	TDX_PG_1G,
> 
> ?
Dave Hansen June 28, 2022, 5:03 p.m. UTC | #8
On 6/27/22 17:48, Xiaoyao Li wrote:
>>
>> I meant this:
>>
>> +       switch (level) {
>> +       case PG_LEVEL_4K:
>> +               page_size = 0;
>> +               break;
>>
>> Because TDX_PG_4K==page_size==0, and for this:
>>
>> +       case PG_LEVEL_2M:
>> +               page_size = 1;
> 
> here we can just do
> 
>     page_size = level - 1;
> 
> or
>     
>     tdx_page_level = level - 1;
> 
> yes, TDX's page level definition is one level smaller of Linux's
> definition.

Uhh.  No.

The 'page_size' is in the kernel/TDX-module ABI.  It can't change.
PG_LEVEL_* is just some random internal Linux enum.  It *CAN* change.

There's a *MASSIVE* difference between the two.  What you suggest will
probably actually work.  But, it will work accidentally and may break in
horribly confusing ways in the future.

It's the difference between hacking something together and actually
writing code that will keep working for a long time.  Please, take a
minute and reflect on this.  Please.
Sagi Shahar Aug. 17, 2022, 10:46 p.m. UTC | #9
On Wed, Jun 22, 2022 at 4:19 AM Kai Huang <kai.huang@intel.com> wrote:
>
> The TDX module uses additional metadata to record things like which
> guest "owns" a given page of memory.  This metadata, referred as
> Physical Address Metadata Table (PAMT), essentially serves as the
> 'struct page' for the TDX module.  PAMTs are not reserved by hardware
> up front.  They must be allocated by the kernel and then given to the
> TDX module.
>
> TDX supports 3 page sizes: 4K, 2M, and 1G.  Each "TD Memory Region"
> (TDMR) has 3 PAMTs to track the 3 supported page sizes.  Each PAMT must
> be a physically contiguous area from a Convertible Memory Region (CMR).
> However, the PAMTs which track pages in one TDMR do not need to reside
> within that TDMR but can be anywhere in CMRs.  If one PAMT overlaps with
> any TDMR, the overlapping part must be reported as a reserved area in
> that particular TDMR.
>
> Use alloc_contig_pages() since PAMT must be a physically contiguous area
> and it may be potentially large (~1/256th of the size of the given TDMR).
> The downside is alloc_contig_pages() may fail at runtime.  One (bad)
> mitigation is to launch a TD guest early during system boot to get those
> PAMTs allocated at early time, but the only way to fix is to add a boot
> option to allocate or reserve PAMTs during kernel boot.
>
> TDX only supports a limited number of reserved areas per TDMR to cover
> both PAMTs and memory holes within the given TDMR.  If many PAMTs are
> allocated within a single TDMR, the reserved areas may not be sufficient
> to cover all of them.
>
> Adopt the following policies when allocating PAMTs for a given TDMR:
>
>   - Allocate three PAMTs of the TDMR in one contiguous chunk to minimize
>     the total number of reserved areas consumed for PAMTs.
>   - Try to first allocate PAMT from the local node of the TDMR for better
>     NUMA locality.
>
> Also dump out how many pages are allocated for PAMTs when the TDX module
> is initialized successfully.
>
> Signed-off-by: Kai Huang <kai.huang@intel.com>
> ---
>
> - v3 -> v5 (no feedback on v4):
>  - Used memblock to get the NUMA node for given TDMR.
>  - Removed tdmr_get_pamt_sz() helper but use open-code instead.
>  - Changed to use 'switch .. case..' for each TDX supported page size in
>    tdmr_get_pamt_sz() (the original __tdmr_get_pamt_sz()).
>  - Added printing out memory used for PAMT allocation when TDX module is
>    initialized successfully.
>  - Explained downside of alloc_contig_pages() in changelog.
>  - Addressed other minor comments.
>
> ---
>  arch/x86/Kconfig            |   1 +
>  arch/x86/virt/vmx/tdx/tdx.c | 200 ++++++++++++++++++++++++++++++++++++
>  2 files changed, 201 insertions(+)
>
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index 4988a91d5283..ec496e96d120 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -1973,6 +1973,7 @@ config INTEL_TDX_HOST
>         depends on CPU_SUP_INTEL
>         depends on X86_64
>         depends on KVM_INTEL
> +       depends on CONTIG_ALLOC
>         select ARCH_HAS_CC_PLATFORM
>         select ARCH_KEEP_MEMBLOCK
>         help
> diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
> index fd9f449b5395..36260dd7e69f 100644
> --- a/arch/x86/virt/vmx/tdx/tdx.c
> +++ b/arch/x86/virt/vmx/tdx/tdx.c
> @@ -558,6 +558,196 @@ static int create_tdmrs(struct tdmr_info *tdmr_array, int *tdmr_num)
>         return 0;
>  }
>
> +/* Page sizes supported by TDX */
> +enum tdx_page_sz {
> +       TDX_PG_4K,
> +       TDX_PG_2M,
> +       TDX_PG_1G,
> +       TDX_PG_MAX,
> +};
> +
> +/*
> + * Calculate PAMT size given a TDMR and a page size.  The returned
> + * PAMT size is always aligned up to 4K page boundary.
> + */
> +static unsigned long tdmr_get_pamt_sz(struct tdmr_info *tdmr,
> +                                     enum tdx_page_sz pgsz)
> +{
> +       unsigned long pamt_sz;
> +       int pamt_entry_nr;
            ^
This should be an 'unsigned long'. Otherwise you get an integer
overflow for large memory machines.

> +
> +       switch (pgsz) {
> +       case TDX_PG_4K:
> +               pamt_entry_nr = tdmr->size >> PAGE_SHIFT;
> +               break;
> +       case TDX_PG_2M:
> +               pamt_entry_nr = tdmr->size >> PMD_SHIFT;
> +               break;
> +       case TDX_PG_1G:
> +               pamt_entry_nr = tdmr->size >> PUD_SHIFT;
> +               break;
> +       default:
> +               WARN_ON_ONCE(1);
> +               return 0;
> +       }
> +
> +       pamt_sz = pamt_entry_nr * tdx_sysinfo.pamt_entry_size;
> +       /* TDX requires PAMT size must be 4K aligned */
> +       pamt_sz = ALIGN(pamt_sz, PAGE_SIZE);
> +
> +       return pamt_sz;
> +}
> +
> +/*
> + * Pick a NUMA node on which to allocate this TDMR's metadata.
> + *
> + * This is imprecise since TDMRs are 1G aligned and NUMA nodes might
> + * not be.  If the TDMR covers more than one node, just use the _first_
> + * one.  This can lead to small areas of off-node metadata for some
> + * memory.
> + */
> +static int tdmr_get_nid(struct tdmr_info *tdmr)
> +{
> +       unsigned long start_pfn, end_pfn;
> +       int i, nid;
> +
> +       /* Find the first memory region covered by the TDMR */
> +       memblock_for_each_tdx_mem_pfn_range(i, &start_pfn, &end_pfn, &nid) {
> +               if (end_pfn > (tdmr_start(tdmr) >> PAGE_SHIFT))
> +                       return nid;
> +       }
> +
> +       /*
> +        * No memory region found for this TDMR.  It cannot happen since
> +        * when one TDMR is created, it must cover at least one (or
> +        * partial) memory region.
> +        */
> +       WARN_ON_ONCE(1);
> +       return 0;
> +}
> +
> +static int tdmr_set_up_pamt(struct tdmr_info *tdmr)
> +{
> +       unsigned long pamt_base[TDX_PG_MAX];
> +       unsigned long pamt_size[TDX_PG_MAX];
> +       unsigned long tdmr_pamt_base;
> +       unsigned long tdmr_pamt_size;
> +       enum tdx_page_sz pgsz;
> +       struct page *pamt;
> +       int nid;
> +
> +       nid = tdmr_get_nid(tdmr);
> +
> +       /*
> +        * Calculate the PAMT size for each TDX supported page size
> +        * and the total PAMT size.
> +        */
> +       tdmr_pamt_size = 0;
> +       for (pgsz = TDX_PG_4K; pgsz < TDX_PG_MAX; pgsz++) {
> +               pamt_size[pgsz] = tdmr_get_pamt_sz(tdmr, pgsz);
> +               tdmr_pamt_size += pamt_size[pgsz];
> +       }
> +
> +       /*
> +        * Allocate one chunk of physically contiguous memory for all
> +        * PAMTs.  This helps minimize the PAMT's use of reserved areas
> +        * in overlapped TDMRs.
> +        */
> +       pamt = alloc_contig_pages(tdmr_pamt_size >> PAGE_SHIFT, GFP_KERNEL,
> +                       nid, &node_online_map);
> +       if (!pamt)
> +               return -ENOMEM;
> +
> +       /* Calculate PAMT base and size for all supported page sizes. */
> +       tdmr_pamt_base = page_to_pfn(pamt) << PAGE_SHIFT;
> +       for (pgsz = TDX_PG_4K; pgsz < TDX_PG_MAX; pgsz++) {
> +               pamt_base[pgsz] = tdmr_pamt_base;
> +               tdmr_pamt_base += pamt_size[pgsz];
> +       }
> +
> +       tdmr->pamt_4k_base = pamt_base[TDX_PG_4K];
> +       tdmr->pamt_4k_size = pamt_size[TDX_PG_4K];
> +       tdmr->pamt_2m_base = pamt_base[TDX_PG_2M];
> +       tdmr->pamt_2m_size = pamt_size[TDX_PG_2M];
> +       tdmr->pamt_1g_base = pamt_base[TDX_PG_1G];
> +       tdmr->pamt_1g_size = pamt_size[TDX_PG_1G];
> +
> +       return 0;
> +}
> +
> +static void tdmr_get_pamt(struct tdmr_info *tdmr, unsigned long *pamt_pfn,
> +                         unsigned long *pamt_npages)
> +{
> +       unsigned long pamt_base, pamt_sz;
> +
> +       /*
> +        * The PAMT was allocated in one contiguous unit.  The 4K PAMT
> +        * should always point to the beginning of that allocation.
> +        */
> +       pamt_base = tdmr->pamt_4k_base;
> +       pamt_sz = tdmr->pamt_4k_size + tdmr->pamt_2m_size + tdmr->pamt_1g_size;
> +
> +       *pamt_pfn = pamt_base >> PAGE_SHIFT;
> +       *pamt_npages = pamt_sz >> PAGE_SHIFT;
> +}
> +
> +static void tdmr_free_pamt(struct tdmr_info *tdmr)
> +{
> +       unsigned long pamt_pfn, pamt_npages;
> +
> +       tdmr_get_pamt(tdmr, &pamt_pfn, &pamt_npages);
> +
> +       /* Do nothing if PAMT hasn't been allocated for this TDMR */
> +       if (!pamt_npages)
> +               return;
> +
> +       if (WARN_ON_ONCE(!pamt_pfn))
> +               return;
> +
> +       free_contig_range(pamt_pfn, pamt_npages);
> +}
> +
> +static void tdmrs_free_pamt_all(struct tdmr_info *tdmr_array, int tdmr_num)
> +{
> +       int i;
> +
> +       for (i = 0; i < tdmr_num; i++)
> +               tdmr_free_pamt(tdmr_array_entry(tdmr_array, i));
> +}
> +
> +/* Allocate and set up PAMTs for all TDMRs */
> +static int tdmrs_set_up_pamt_all(struct tdmr_info *tdmr_array, int tdmr_num)
> +{
> +       int i, ret = 0;
> +
> +       for (i = 0; i < tdmr_num; i++) {
> +               ret = tdmr_set_up_pamt(tdmr_array_entry(tdmr_array, i));
> +               if (ret)
> +                       goto err;
> +       }
> +
> +       return 0;
> +err:
> +       tdmrs_free_pamt_all(tdmr_array, tdmr_num);
> +       return ret;
> +}
> +
> +static unsigned long tdmrs_get_pamt_pages(struct tdmr_info *tdmr_array,
> +                                         int tdmr_num)
> +{
> +       unsigned long pamt_npages = 0;
> +       int i;
> +
> +       for (i = 0; i < tdmr_num; i++) {
> +               unsigned long pfn, npages;
> +
> +               tdmr_get_pamt(tdmr_array_entry(tdmr_array, i), &pfn, &npages);
> +               pamt_npages += npages;
> +       }
> +
> +       return pamt_npages;
> +}
> +
>  /*
>   * Construct an array of TDMRs to cover all memory regions in memblock.
>   * This makes sure all pages managed by the page allocator are TDX
> @@ -572,8 +762,13 @@ static int construct_tdmrs_memeblock(struct tdmr_info *tdmr_array,
>         if (ret)
>                 goto err;
>
> +       ret = tdmrs_set_up_pamt_all(tdmr_array, *tdmr_num);
> +       if (ret)
> +               goto err;
> +
>         /* Return -EINVAL until constructing TDMRs is done */
>         ret = -EINVAL;
> +       tdmrs_free_pamt_all(tdmr_array, *tdmr_num);
>  err:
>         return ret;
>  }
> @@ -644,6 +839,11 @@ static int init_tdx_module(void)
>          * process are done.
>          */
>         ret = -EINVAL;
> +       if (ret)
> +               tdmrs_free_pamt_all(tdmr_array, tdmr_num);
> +       else
> +               pr_info("%lu pages allocated for PAMT.\n",
> +                               tdmrs_get_pamt_pages(tdmr_array, tdmr_num));
>  out_free_tdmrs:
>         /*
>          * The array of TDMRs is freed no matter the initialization is
> --
> 2.36.1
>
Huang, Kai Aug. 17, 2022, 11:43 p.m. UTC | #10
On Wed, 2022-08-17 at 15:46 -0700, Sagi Shahar wrote:
> > +/*
> > + * Calculate PAMT size given a TDMR and a page size.  The returned
> > + * PAMT size is always aligned up to 4K page boundary.
> > + */
> > +static unsigned long tdmr_get_pamt_sz(struct tdmr_info *tdmr,
> > +                                     enum tdx_page_sz pgsz)
> > +{
> > +       unsigned long pamt_sz;
> > +       int pamt_entry_nr;
>             ^
> This should be an 'unsigned long'. Otherwise you get an integer
> overflow for large memory machines.

Agreed.  Thanks.

> 
> > +
> > +       switch (pgsz) {
> > +       case TDX_PG_4K:
> > +               pamt_entry_nr = tdmr->size >> PAGE_SHIFT;
> > +               break;
> > +       case TDX_PG_2M:
> > +               pamt_entry_nr = tdmr->size >> PMD_SHIFT;
> > +               break;
> > +       case TDX_PG_1G:
> > +               pamt_entry_nr = tdmr->size >> PUD_SHIFT;
> > +               break;
> > +       default:
> > +               WARN_ON_ONCE(1);
> > +               return 0;
> > +       }
> > +
> > +       pamt_sz = pamt_entry_nr * tdx_sysinfo.pamt_entry_size;
> > +       /* TDX requires PAMT size must be 4K aligned */
> > +       pamt_sz = ALIGN(pamt_sz, PAGE_SIZE);
> > +
> > +       return pamt_sz;
> > +}
> > +
diff mbox series

Patch

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 4988a91d5283..ec496e96d120 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1973,6 +1973,7 @@  config INTEL_TDX_HOST
 	depends on CPU_SUP_INTEL
 	depends on X86_64
 	depends on KVM_INTEL
+	depends on CONTIG_ALLOC
 	select ARCH_HAS_CC_PLATFORM
 	select ARCH_KEEP_MEMBLOCK
 	help
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index fd9f449b5395..36260dd7e69f 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -558,6 +558,196 @@  static int create_tdmrs(struct tdmr_info *tdmr_array, int *tdmr_num)
 	return 0;
 }
 
+/* Page sizes supported by TDX */
+enum tdx_page_sz {
+	TDX_PG_4K,
+	TDX_PG_2M,
+	TDX_PG_1G,
+	TDX_PG_MAX,
+};
+
+/*
+ * Calculate PAMT size given a TDMR and a page size.  The returned
+ * PAMT size is always aligned up to 4K page boundary.
+ */
+static unsigned long tdmr_get_pamt_sz(struct tdmr_info *tdmr,
+				      enum tdx_page_sz pgsz)
+{
+	unsigned long pamt_sz;
+	int pamt_entry_nr;
+
+	switch (pgsz) {
+	case TDX_PG_4K:
+		pamt_entry_nr = tdmr->size >> PAGE_SHIFT;
+		break;
+	case TDX_PG_2M:
+		pamt_entry_nr = tdmr->size >> PMD_SHIFT;
+		break;
+	case TDX_PG_1G:
+		pamt_entry_nr = tdmr->size >> PUD_SHIFT;
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		return 0;
+	}
+
+	pamt_sz = pamt_entry_nr * tdx_sysinfo.pamt_entry_size;
+	/* TDX requires PAMT size must be 4K aligned */
+	pamt_sz = ALIGN(pamt_sz, PAGE_SIZE);
+
+	return pamt_sz;
+}
+
+/*
+ * Pick a NUMA node on which to allocate this TDMR's metadata.
+ *
+ * This is imprecise since TDMRs are 1G aligned and NUMA nodes might
+ * not be.  If the TDMR covers more than one node, just use the _first_
+ * one.  This can lead to small areas of off-node metadata for some
+ * memory.
+ */
+static int tdmr_get_nid(struct tdmr_info *tdmr)
+{
+	unsigned long start_pfn, end_pfn;
+	int i, nid;
+
+	/* Find the first memory region covered by the TDMR */
+	memblock_for_each_tdx_mem_pfn_range(i, &start_pfn, &end_pfn, &nid) {
+		if (end_pfn > (tdmr_start(tdmr) >> PAGE_SHIFT))
+			return nid;
+	}
+
+	/*
+	 * No memory region found for this TDMR.  It cannot happen since
+	 * when one TDMR is created, it must cover at least one (or
+	 * partial) memory region.
+	 */
+	WARN_ON_ONCE(1);
+	return 0;
+}
+
+static int tdmr_set_up_pamt(struct tdmr_info *tdmr)
+{
+	unsigned long pamt_base[TDX_PG_MAX];
+	unsigned long pamt_size[TDX_PG_MAX];
+	unsigned long tdmr_pamt_base;
+	unsigned long tdmr_pamt_size;
+	enum tdx_page_sz pgsz;
+	struct page *pamt;
+	int nid;
+
+	nid = tdmr_get_nid(tdmr);
+
+	/*
+	 * Calculate the PAMT size for each TDX supported page size
+	 * and the total PAMT size.
+	 */
+	tdmr_pamt_size = 0;
+	for (pgsz = TDX_PG_4K; pgsz < TDX_PG_MAX; pgsz++) {
+		pamt_size[pgsz] = tdmr_get_pamt_sz(tdmr, pgsz);
+		tdmr_pamt_size += pamt_size[pgsz];
+	}
+
+	/*
+	 * Allocate one chunk of physically contiguous memory for all
+	 * PAMTs.  This helps minimize the PAMT's use of reserved areas
+	 * in overlapped TDMRs.
+	 */
+	pamt = alloc_contig_pages(tdmr_pamt_size >> PAGE_SHIFT, GFP_KERNEL,
+			nid, &node_online_map);
+	if (!pamt)
+		return -ENOMEM;
+
+	/* Calculate PAMT base and size for all supported page sizes. */
+	tdmr_pamt_base = page_to_pfn(pamt) << PAGE_SHIFT;
+	for (pgsz = TDX_PG_4K; pgsz < TDX_PG_MAX; pgsz++) {
+		pamt_base[pgsz] = tdmr_pamt_base;
+		tdmr_pamt_base += pamt_size[pgsz];
+	}
+
+	tdmr->pamt_4k_base = pamt_base[TDX_PG_4K];
+	tdmr->pamt_4k_size = pamt_size[TDX_PG_4K];
+	tdmr->pamt_2m_base = pamt_base[TDX_PG_2M];
+	tdmr->pamt_2m_size = pamt_size[TDX_PG_2M];
+	tdmr->pamt_1g_base = pamt_base[TDX_PG_1G];
+	tdmr->pamt_1g_size = pamt_size[TDX_PG_1G];
+
+	return 0;
+}
+
+static void tdmr_get_pamt(struct tdmr_info *tdmr, unsigned long *pamt_pfn,
+			  unsigned long *pamt_npages)
+{
+	unsigned long pamt_base, pamt_sz;
+
+	/*
+	 * The PAMT was allocated in one contiguous unit.  The 4K PAMT
+	 * should always point to the beginning of that allocation.
+	 */
+	pamt_base = tdmr->pamt_4k_base;
+	pamt_sz = tdmr->pamt_4k_size + tdmr->pamt_2m_size + tdmr->pamt_1g_size;
+
+	*pamt_pfn = pamt_base >> PAGE_SHIFT;
+	*pamt_npages = pamt_sz >> PAGE_SHIFT;
+}
+
+static void tdmr_free_pamt(struct tdmr_info *tdmr)
+{
+	unsigned long pamt_pfn, pamt_npages;
+
+	tdmr_get_pamt(tdmr, &pamt_pfn, &pamt_npages);
+
+	/* Do nothing if PAMT hasn't been allocated for this TDMR */
+	if (!pamt_npages)
+		return;
+
+	if (WARN_ON_ONCE(!pamt_pfn))
+		return;
+
+	free_contig_range(pamt_pfn, pamt_npages);
+}
+
+static void tdmrs_free_pamt_all(struct tdmr_info *tdmr_array, int tdmr_num)
+{
+	int i;
+
+	for (i = 0; i < tdmr_num; i++)
+		tdmr_free_pamt(tdmr_array_entry(tdmr_array, i));
+}
+
+/* Allocate and set up PAMTs for all TDMRs */
+static int tdmrs_set_up_pamt_all(struct tdmr_info *tdmr_array, int tdmr_num)
+{
+	int i, ret = 0;
+
+	for (i = 0; i < tdmr_num; i++) {
+		ret = tdmr_set_up_pamt(tdmr_array_entry(tdmr_array, i));
+		if (ret)
+			goto err;
+	}
+
+	return 0;
+err:
+	tdmrs_free_pamt_all(tdmr_array, tdmr_num);
+	return ret;
+}
+
+static unsigned long tdmrs_get_pamt_pages(struct tdmr_info *tdmr_array,
+					  int tdmr_num)
+{
+	unsigned long pamt_npages = 0;
+	int i;
+
+	for (i = 0; i < tdmr_num; i++) {
+		unsigned long pfn, npages;
+
+		tdmr_get_pamt(tdmr_array_entry(tdmr_array, i), &pfn, &npages);
+		pamt_npages += npages;
+	}
+
+	return pamt_npages;
+}
+
 /*
  * Construct an array of TDMRs to cover all memory regions in memblock.
  * This makes sure all pages managed by the page allocator are TDX
@@ -572,8 +762,13 @@  static int construct_tdmrs_memeblock(struct tdmr_info *tdmr_array,
 	if (ret)
 		goto err;
 
+	ret = tdmrs_set_up_pamt_all(tdmr_array, *tdmr_num);
+	if (ret)
+		goto err;
+
 	/* Return -EINVAL until constructing TDMRs is done */
 	ret = -EINVAL;
+	tdmrs_free_pamt_all(tdmr_array, *tdmr_num);
 err:
 	return ret;
 }
@@ -644,6 +839,11 @@  static int init_tdx_module(void)
 	 * process are done.
 	 */
 	ret = -EINVAL;
+	if (ret)
+		tdmrs_free_pamt_all(tdmr_array, tdmr_num);
+	else
+		pr_info("%lu pages allocated for PAMT.\n",
+				tdmrs_get_pamt_pages(tdmr_array, tdmr_num));
 out_free_tdmrs:
 	/*
 	 * The array of TDMRs is freed no matter the initialization is