diff mbox series

[v5,12/22] x86/virt/tdx: Convert all memory regions in memblock to TDX memory

Message ID 8288396be7fedd10521a28531e138579594d757a.1655894131.git.kai.huang@intel.com (mailing list archive)
State New, archived
Headers show
Series TDX host kernel support | expand

Commit Message

Huang, Kai June 22, 2022, 11:17 a.m. UTC
The TDX module reports a list of Convertible Memory Regions (CMR) to
identify which memory regions can be used as TDX memory, but they are
not automatically usable to the TDX module.  The kernel needs to choose
which convertible memory regions to be TDX memory and configure those
regions by passing an array of "TD Memory Regions" (TDMR) to the TDX
module.

To avoid having to modify the page allocator to distinguish TDX and
non-TDX memory allocation, convert all memory regions in the memblock to
TDX memory.  As the first step, sanity check that all memory regions in
memblock are fully covered by CMRs so the above conversion is guaranteed
to work.  This also works because both ACPI memory hotplug (reported as
BIOS bug) and driver-managed memory hotplug are prevented when TDX is
enabled by BIOS, so no new non-TDX-convertible memory can end up in
the page allocator.

Select ARCH_KEEP_MEMBLOCK when CONFIG_INTEL_TDX_HOST to keep memblock
after boot so it can be used during the TDX module initialization.

Also, explicitly exclude memory regions below the first 1MB as TDX memory
because those regions may not be reported as convertible memory.  This
is OK as the first 1MB is always reserved during kernel boot and won't
end up in the page allocator.

Signed-off-by: Kai Huang <kai.huang@intel.com>
---

- v4 -> v5 (no feedback on v4):
 - Changed to use memblock from e820.
 - Simplified changelog a lot.

---
 arch/x86/Kconfig            |   1 +
 arch/x86/virt/vmx/tdx/tdx.c | 100 ++++++++++++++++++++++++++++++++++++
 2 files changed, 101 insertions(+)

Comments

Dave Hansen June 24, 2022, 7:40 p.m. UTC | #1
On 6/22/22 04:17, Kai Huang wrote:
...
> Also, explicitly exclude memory regions below first 1MB as TDX memory
> because those regions may not be reported as convertible memory.  This
> is OK as the first 1MB is always reserved during kernel boot and won't
> end up to the page allocator.

Are you sure?  I wasn't for a few minutes until I found reserve_real_mode()

Could we point to that in this changelog, please?

> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index efa830853e98..4988a91d5283 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -1974,6 +1974,7 @@ config INTEL_TDX_HOST
>  	depends on X86_64
>  	depends on KVM_INTEL
>  	select ARCH_HAS_CC_PLATFORM
> +	select ARCH_KEEP_MEMBLOCK
>  	help
>  	  Intel Trust Domain Extensions (TDX) protects guest VMs from malicious
>  	  host and certain physical attacks.  This option enables necessary TDX
> diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
> index 1bc97756bc0d..2b20d4a7a62b 100644
> --- a/arch/x86/virt/vmx/tdx/tdx.c
> +++ b/arch/x86/virt/vmx/tdx/tdx.c
> @@ -15,6 +15,8 @@
>  #include <linux/cpumask.h>
>  #include <linux/smp.h>
>  #include <linux/atomic.h>
> +#include <linux/sizes.h>
> +#include <linux/memblock.h>
>  #include <asm/cpufeatures.h>
>  #include <asm/cpufeature.h>
>  #include <asm/msr-index.h>
> @@ -338,6 +340,91 @@ static int tdx_get_sysinfo(struct tdsysinfo_struct *tdsysinfo,
>  	return check_cmrs(cmr_array, actual_cmr_num);
>  }
>  
> +/*
> + * Skip the memory region below 1MB.  Return true if the entire
> + * region is skipped.  Otherwise, the updated range is returned.
> + */
> +static bool pfn_range_skip_lowmem(unsigned long *p_start_pfn,
> +				  unsigned long *p_end_pfn)
> +{
> +	u64 start, end;
> +
> +	start = *p_start_pfn << PAGE_SHIFT;
> +	end = *p_end_pfn << PAGE_SHIFT;
> +
> +	if (start < SZ_1M)
> +		start = SZ_1M;
> +
> +	if (start >= end)
> +		return true;
> +
> +	*p_start_pfn = (start >> PAGE_SHIFT);
> +
> +	return false;
> +}
> +
> +/*
> + * Walks over all memblock memory regions that are intended to be
> + * converted to TDX memory.  Essentially, it is all memblock memory
> + * regions excluding the low memory below 1MB.
> + *
> + * This is because on some TDX platforms the low memory below 1MB is
> + * not included in CMRs.  Excluding the low 1MB can still guarantee
> + * that the pages managed by the page allocator are always TDX memory,
> + * as the low 1MB is reserved during kernel boot and won't end up to
> + * the ZONE_DMA (see reserve_real_mode()).
> + */
> +#define memblock_for_each_tdx_mem_pfn_range(i, p_start, p_end, p_nid)	\
> +	for_each_mem_pfn_range(i, MAX_NUMNODES, p_start, p_end, p_nid)	\
> +		if (!pfn_range_skip_lowmem(p_start, p_end))

Let's summarize where we are at this point:

1. All RAM is described in memblocks
2. Some memblocks are reserved and some are free
3. The lower 1MB is marked reserved
4. for_each_mem_pfn_range() walks all reserved and free memblocks, so we
   have to exclude the lower 1MB as a special case.

That seems superficially rather ridiculous.  Shouldn't we just pick a
memblock iterator that skips the 1MB?  Surely there is such a thing.
Or, should we be doing something different with the 1MB in the memblock
structure?

> +/* Check whether first range is the subrange of the second */
> +static bool is_subrange(u64 r1_start, u64 r1_end, u64 r2_start, u64 r2_end)
> +{
> +	return r1_start >= r2_start && r1_end <= r2_end;
> +}
> +
> +/* Check whether address range is covered by any CMR or not. */
> +static bool range_covered_by_cmr(struct cmr_info *cmr_array, int cmr_num,
> +				 u64 start, u64 end)
> +{
> +	int i;
> +
> +	for (i = 0; i < cmr_num; i++) {
> +		struct cmr_info *cmr = &cmr_array[i];
> +
> +		if (is_subrange(start, end, cmr->base, cmr->base + cmr->size))
> +			return true;
> +	}
> +
> +	return false;
> +}
> +
> +/*
> + * Check whether all memory regions in memblock are TDX convertible
> + * memory.  Return 0 if all memory regions are convertible, or error.
> + */
> +static int check_memblock_tdx_convertible(void)
> +{
> +	unsigned long start_pfn, end_pfn;
> +	int i;
> +
> +	memblock_for_each_tdx_mem_pfn_range(i, &start_pfn, &end_pfn, NULL) {
> +		u64 start, end;
> +
> +		start = start_pfn << PAGE_SHIFT;
> +		end = end_pfn << PAGE_SHIFT;
> +		if (!range_covered_by_cmr(tdx_cmr_array, tdx_cmr_num, start,
> +					end)) {
> +			pr_err("[0x%llx, 0x%llx) is not fully convertible memory\n",
> +					start, end);
> +			return -EINVAL;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
>  /*
>   * Detect and initialize the TDX module.
>   *
> @@ -371,6 +458,19 @@ static int init_tdx_module(void)
>  	if (ret)
>  		goto out;
>  
> +	/*
> +	 * To avoid having to modify the page allocator to distinguish
> +	 * TDX and non-TDX memory allocation, convert all memory regions
> +	 * in memblock to TDX memory to make sure all pages managed by
> +	 * the page allocator are TDX memory.
> +	 *
> +	 * Sanity check all memory regions are fully covered by CMRs to
> +	 * make sure they are truly convertible.
> +	 */
> +	ret = check_memblock_tdx_convertible();
> +	if (ret)
> +		goto out;
> +
>  	/*
>  	 * Return -EINVAL until all steps of TDX module initialization
>  	 * process are done.
Huang, Kai June 27, 2022, 6:16 a.m. UTC | #2
On Fri, 2022-06-24 at 12:40 -0700, Dave Hansen wrote:
> On 6/22/22 04:17, Kai Huang wrote:
> ...
> > Also, explicitly exclude memory regions below first 1MB as TDX memory
> > because those regions may not be reported as convertible memory.  This
> > is OK as the first 1MB is always reserved during kernel boot and won't
> > end up to the page allocator.
> 
> Are you sure?  I wasn't for a few minutes until I found reserve_real_mode()
> 
> Could we point to that in this changelog, please?

OK will explicitly point out reserve_real_mode().

> 
> > diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> > index efa830853e98..4988a91d5283 100644
> > --- a/arch/x86/Kconfig
> > +++ b/arch/x86/Kconfig
> > @@ -1974,6 +1974,7 @@ config INTEL_TDX_HOST
> >  	depends on X86_64
> >  	depends on KVM_INTEL
> >  	select ARCH_HAS_CC_PLATFORM
> > +	select ARCH_KEEP_MEMBLOCK
> >  	help
> >  	  Intel Trust Domain Extensions (TDX) protects guest VMs from malicious
> >  	  host and certain physical attacks.  This option enables necessary TDX
> > diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
> > index 1bc97756bc0d..2b20d4a7a62b 100644
> > --- a/arch/x86/virt/vmx/tdx/tdx.c
> > +++ b/arch/x86/virt/vmx/tdx/tdx.c
> > @@ -15,6 +15,8 @@
> >  #include <linux/cpumask.h>
> >  #include <linux/smp.h>
> >  #include <linux/atomic.h>
> > +#include <linux/sizes.h>
> > +#include <linux/memblock.h>
> >  #include <asm/cpufeatures.h>
> >  #include <asm/cpufeature.h>
> >  #include <asm/msr-index.h>
> > @@ -338,6 +340,91 @@ static int tdx_get_sysinfo(struct tdsysinfo_struct *tdsysinfo,
> >  	return check_cmrs(cmr_array, actual_cmr_num);
> >  }
> >  
> > +/*
> > + * Skip the memory region below 1MB.  Return true if the entire
> > + * region is skipped.  Otherwise, the updated range is returned.
> > + */
> > +static bool pfn_range_skip_lowmem(unsigned long *p_start_pfn,
> > +				  unsigned long *p_end_pfn)
> > +{
> > +	u64 start, end;
> > +
> > +	start = *p_start_pfn << PAGE_SHIFT;
> > +	end = *p_end_pfn << PAGE_SHIFT;
> > +
> > +	if (start < SZ_1M)
> > +		start = SZ_1M;
> > +
> > +	if (start >= end)
> > +		return true;
> > +
> > +	*p_start_pfn = (start >> PAGE_SHIFT);
> > +
> > +	return false;
> > +}
> > +
> > +/*
> > + * Walks over all memblock memory regions that are intended to be
> > + * converted to TDX memory.  Essentially, it is all memblock memory
> > + * regions excluding the low memory below 1MB.
> > + *
> > + * This is because on some TDX platforms the low memory below 1MB is
> > + * not included in CMRs.  Excluding the low 1MB can still guarantee
> > + * that the pages managed by the page allocator are always TDX memory,
> > + * as the low 1MB is reserved during kernel boot and won't end up to
> > + * the ZONE_DMA (see reserve_real_mode()).
> > + */
> > +#define memblock_for_each_tdx_mem_pfn_range(i, p_start, p_end, p_nid)	\
> > +	for_each_mem_pfn_range(i, MAX_NUMNODES, p_start, p_end, p_nid)	\
> > +		if (!pfn_range_skip_lowmem(p_start, p_end))
> 
> Let's summarize where we are at this point:
> 
> 1. All RAM is described in memblocks
> 2. Some memblocks are reserved and some are free
> 3. The lower 1MB is marked reserved
> 4. for_each_mem_pfn_range() walks all reserved and free memblocks, so we
>    have to exclude the lower 1MB as a special case.
> 
> That seems superficially rather ridiculous.  Shouldn't we just pick a
> memblock iterator that skips the 1MB?  Surely there is such a thing.

Perhaps you are suggesting we should always loop the _free_ ranges so we don't
need to care about the first 1MB which is reserved?

The problem is some reserved memory regions are actually later freed to the page
allocator, for example, initrd.  So to cover all those 'late-freed-reserved-
regions', I used for_each_mem_pfn_range(), instead of for_each_free_mem_range().

Btw, I do have a checkpatch warning around this code:

ERROR: Macros with complex values should be enclosed in parentheses
#109: FILE: arch/x86/virt/vmx/tdx/tdx.c:377:
+#define memblock_for_each_tdx_mem_pfn_range(i, p_start, p_end, p_nid)	\
+	for_each_mem_pfn_range(i, MAX_NUMNODES, p_start, p_end, p_nid)	\
+		if (!pfn_range_skip_lowmem(p_start, p_end))

But it looks like a false positive to me.

> Or, should we be doing something different with the 1MB in the memblock
> structure?

memblock APIs are used by other kernel components.  I don't think we should
modify memblock code behaviour for TDX.  Do you have any specific suggestion?

One possible option I can think of is to explicitly "register" memory regions as
TDX memory when they are first freed to the page allocator.  Those regions
include:

1) memblock_free_all();

Where majority of the pages are freed to page allocator from memblock.

2) memblock_free_late();

Which covers regions freed to page allocator after 1).

3) free_init_pages();

Which is explicitly used for some reserved areas such as initrd and part of
kernel image.

This will require new data structures to represent TDX memblock and the code to
create, insert and merge contiguous TDX memblocks, etc.  The advantage is we can
just iterate those TDX memblocks when constructing TDMRs.
Huang, Kai July 7, 2022, 2:37 a.m. UTC | #3
> > > +/*
> > > + * Walks over all memblock memory regions that are intended to be
> > > + * converted to TDX memory.  Essentially, it is all memblock memory
> > > + * regions excluding the low memory below 1MB.
> > > + *
> > > + * This is because on some TDX platforms the low memory below 1MB is
> > > + * not included in CMRs.  Excluding the low 1MB can still guarantee
> > > + * that the pages managed by the page allocator are always TDX memory,
> > > + * as the low 1MB is reserved during kernel boot and won't end up to
> > > + * the ZONE_DMA (see reserve_real_mode()).
> > > + */
> > > +#define memblock_for_each_tdx_mem_pfn_range(i, p_start, p_end, p_nid)	\
> > > +	for_each_mem_pfn_range(i, MAX_NUMNODES, p_start, p_end, p_nid)	\
> > > +		if (!pfn_range_skip_lowmem(p_start, p_end))
> > 
> > Let's summarize where we are at this point:
> > 
> > 1. All RAM is described in memblocks
> > 2. Some memblocks are reserved and some are free
> > 3. The lower 1MB is marked reserved
> > 4. for_each_mem_pfn_range() walks all reserved and free memblocks, so we
> >    have to exclude the lower 1MB as a special case.
> > 
> > That seems superficially rather ridiculous.  Shouldn't we just pick a
> > memblock iterator that skips the 1MB?  Surely there is such a thing.
> 
> Perhaps you are suggesting we should always loop the _free_ ranges so we don't
> need to care about the first 1MB which is reserved?
> 
> The problem is some reserved memory regions are actually later freed to the page
> allocator, for example, initrd.  So to cover all those 'late-freed-reserved-
> regions', I used for_each_mem_pfn_range(), instead of for_each_free_mem_range().
> 
> Btw, I do have a checkpatch warning around this code:
> 
> ERROR: Macros with complex values should be enclosed in parentheses
> #109: FILE: arch/x86/virt/vmx/tdx/tdx.c:377:
> +#define memblock_for_each_tdx_mem_pfn_range(i, p_start, p_end, p_nid)	\
> +	for_each_mem_pfn_range(i, MAX_NUMNODES, p_start, p_end, p_nid)	\
> +		if (!pfn_range_skip_lowmem(p_start, p_end))
> 
> But it looks like a false positive to me.

Hi Dave,

Sorry to ping. Just double check, any comments around here, ..

> 
> > Or, should we be doing something different with the 1MB in the memblock
> > structure?
> 
> memblock APIs are used by other kernel components.  I don't think we should
> modify memblock code behaviour for TDX.  Do you have any specific suggestion?
> 
> One possible option I can think is explicitly "register" memory regions as TDX
> memory when they are firstly freed to the page allocator.  

[...]

> 
> This will require new data structures to represent TDX memblock and the code to
> create, insert and merge contiguous TDX memblocks, etc.  The advantage is we can
> just iterate those TDX memblocks when constructing TDMRs.
> 
> 

And here?
Dave Hansen July 7, 2022, 2:26 p.m. UTC | #4
On 6/26/22 23:16, Kai Huang wrote:
> On Fri, 2022-06-24 at 12:40 -0700, Dave Hansen wrote:
>>> +/*
>>> + * Walks over all memblock memory regions that are intended to be
>>> + * converted to TDX memory.  Essentially, it is all memblock memory
>>> + * regions excluding the low memory below 1MB.
>>> + *
>>> + * This is because on some TDX platforms the low memory below 1MB is
>>> + * not included in CMRs.  Excluding the low 1MB can still guarantee
>>> + * that the pages managed by the page allocator are always TDX memory,
>>> + * as the low 1MB is reserved during kernel boot and won't end up to
>>> + * the ZONE_DMA (see reserve_real_mode()).
>>> + */
>>> +#define memblock_for_each_tdx_mem_pfn_range(i, p_start, p_end, p_nid)	\
>>> +	for_each_mem_pfn_range(i, MAX_NUMNODES, p_start, p_end, p_nid)	\
>>> +		if (!pfn_range_skip_lowmem(p_start, p_end))
>>
>> Let's summarize where we are at this point:
>>
>> 1. All RAM is described in memblocks
>> 2. Some memblocks are reserved and some are free
>> 3. The lower 1MB is marked reserved
>> 4. for_each_mem_pfn_range() walks all reserved and free memblocks, so we
>>    have to exclude the lower 1MB as a special case.
>>
>> That seems superficially rather ridiculous.  Shouldn't we just pick a
>> memblock iterator that skips the 1MB?  Surely there is such a thing.
> 
> Perhaps you are suggesting we should always loop the _free_ ranges so we don't
> need to care about the first 1MB which is reserved?
> 
> The problem is some reserved memory regions are actually later freed to the page
> allocator, for example, initrd.  So to cover all those 'late-freed-reserved-
> regions', I used for_each_mem_pfn_range(), instead of for_each_free_mem_range().

Why not just entirely remove the lower 1MB from the memblock structure
on TDX systems?  Do something equivalent to adding this on the kernel
command line:

	memmap=1M$0x0

> Btw, I do have a checkpatch warning around this code:
> 
> ERROR: Macros with complex values should be enclosed in parentheses
> #109: FILE: arch/x86/virt/vmx/tdx/tdx.c:377:
> +#define memblock_for_each_tdx_mem_pfn_range(i, p_start, p_end, p_nid)	\
> +	for_each_mem_pfn_range(i, MAX_NUMNODES, p_start, p_end, p_nid)	\
> +		if (!pfn_range_skip_lowmem(p_start, p_end))
> 
> But it looks like a false positive to me.

I think it doesn't like the if().
Jürgen Groß July 7, 2022, 2:36 p.m. UTC | #5
On 07.07.22 16:26, Dave Hansen wrote:
> On 6/26/22 23:16, Kai Huang wrote:
>> On Fri, 2022-06-24 at 12:40 -0700, Dave Hansen wrote:
>>>> +/*
>>>> + * Walks over all memblock memory regions that are intended to be
>>>> + * converted to TDX memory.  Essentially, it is all memblock memory
>>>> + * regions excluding the low memory below 1MB.
>>>> + *
>>>> + * This is because on some TDX platforms the low memory below 1MB is
>>>> + * not included in CMRs.  Excluding the low 1MB can still guarantee
>>>> + * that the pages managed by the page allocator are always TDX memory,
>>>> + * as the low 1MB is reserved during kernel boot and won't end up to
>>>> + * the ZONE_DMA (see reserve_real_mode()).
>>>> + */
>>>> +#define memblock_for_each_tdx_mem_pfn_range(i, p_start, p_end, p_nid)	\
>>>> +	for_each_mem_pfn_range(i, MAX_NUMNODES, p_start, p_end, p_nid)	\
>>>> +		if (!pfn_range_skip_lowmem(p_start, p_end))
>>>
>>> Let's summarize where we are at this point:
>>>
>>> 1. All RAM is described in memblocks
>>> 2. Some memblocks are reserved and some are free
>>> 3. The lower 1MB is marked reserved
>>> 4. for_each_mem_pfn_range() walks all reserved and free memblocks, so we
>>>     have to exclude the lower 1MB as a special case.
>>>
>>> That seems superficially rather ridiculous.  Shouldn't we just pick a
>>> memblock iterator that skips the 1MB?  Surely there is such a thing.
>>
>> Perhaps you are suggesting we should always loop the _free_ ranges so we don't
>> need to care about the first 1MB which is reserved?
>>
>> The problem is some reserved memory regions are actually later freed to the page
>> allocator, for example, initrd.  So to cover all those 'late-freed-reserved-
>> regions', I used for_each_mem_pfn_range(), instead of for_each_free_mem_range().
> 
> Why not just entirely remove the lower 1MB from the memblock structure
> on TDX systems?  Do something equivalent to adding this on the kernel
> command line:
> 
> 	memmap=1M$0x0
> 
>> Btw, I do have a checkpatch warning around this code:
>>
>> ERROR: Macros with complex values should be enclosed in parentheses
>> #109: FILE: arch/x86/virt/vmx/tdx/tdx.c:377:
>> +#define memblock_for_each_tdx_mem_pfn_range(i, p_start, p_end, p_nid)	\
>> +	for_each_mem_pfn_range(i, MAX_NUMNODES, p_start, p_end, p_nid)	\
>> +		if (!pfn_range_skip_lowmem(p_start, p_end))
>>
>> But it looks like a false positive to me.
> 
> I think it doesn't like the if().

I think it is right.

Consider:

if (a)
     memblock_for_each_tdx_mem_pfn_range(...)
         func();
else
     other_func();


Juergen
Huang, Kai July 7, 2022, 11:34 p.m. UTC | #6
On Thu, 2022-07-07 at 07:26 -0700, Dave Hansen wrote:
> On 6/26/22 23:16, Kai Huang wrote:
> > On Fri, 2022-06-24 at 12:40 -0700, Dave Hansen wrote:
> > > > +/*
> > > > + * Walks over all memblock memory regions that are intended to be
> > > > + * converted to TDX memory.  Essentially, it is all memblock memory
> > > > + * regions excluding the low memory below 1MB.
> > > > + *
> > > > + * This is because on some TDX platforms the low memory below 1MB is
> > > > + * not included in CMRs.  Excluding the low 1MB can still guarantee
> > > > + * that the pages managed by the page allocator are always TDX memory,
> > > > + * as the low 1MB is reserved during kernel boot and won't end up to
> > > > + * the ZONE_DMA (see reserve_real_mode()).
> > > > + */
> > > > +#define memblock_for_each_tdx_mem_pfn_range(i, p_start, p_end, p_nid)	\
> > > > +	for_each_mem_pfn_range(i, MAX_NUMNODES, p_start, p_end, p_nid)	\
> > > > +		if (!pfn_range_skip_lowmem(p_start, p_end))
> > > 
> > > Let's summarize where we are at this point:
> > > 
> > > 1. All RAM is described in memblocks
> > > 2. Some memblocks are reserved and some are free
> > > 3. The lower 1MB is marked reserved
> > > 4. for_each_mem_pfn_range() walks all reserved and free memblocks, so we
> > >    have to exclude the lower 1MB as a special case.
> > > 
> > > That seems superficially rather ridiculous.  Shouldn't we just pick a
> > > memblock iterator that skips the 1MB?  Surely there is such a thing.
> > 
> > Perhaps you are suggesting we should always loop the _free_ ranges so we don't
> > need to care about the first 1MB which is reserved?
> > 
> > The problem is some reserved memory regions are actually later freed to the page
> > allocator, for example, initrd.  So to cover all those 'late-freed-reserved-
> > regions', I used for_each_mem_pfn_range(), instead of for_each_free_mem_range().
> 
> Why not just entirely remove the lower 1MB from the memblock structure
> on TDX systems?  Do something equivalent to adding this on the kernel
> command line:
> 
> 	memmap=1M$0x0

I will explore this option.  Thanks!

> 
> > Btw, I do have a checkpatch warning around this code:
> > 
> > ERROR: Macros with complex values should be enclosed in parentheses
> > #109: FILE: arch/x86/virt/vmx/tdx/tdx.c:377:
> > +#define memblock_for_each_tdx_mem_pfn_range(i, p_start, p_end, p_nid)	\
> > +	for_each_mem_pfn_range(i, MAX_NUMNODES, p_start, p_end, p_nid)	\
> > +		if (!pfn_range_skip_lowmem(p_start, p_end))
> > 
> > But it looks like a false positive to me.
> 
> I think it doesn't like the if().

Yes. I'll explore your suggestion above and I hope this can be avoided.
Huang, Kai July 7, 2022, 11:42 p.m. UTC | #7
On Thu, 2022-07-07 at 16:36 +0200, Juergen Gross wrote:
> On 07.07.22 16:26, Dave Hansen wrote:
> > On 6/26/22 23:16, Kai Huang wrote:
> > > On Fri, 2022-06-24 at 12:40 -0700, Dave Hansen wrote:
> > > > > +/*
> > > > > + * Walks over all memblock memory regions that are intended to be
> > > > > + * converted to TDX memory.  Essentially, it is all memblock memory
> > > > > + * regions excluding the low memory below 1MB.
> > > > > + *
> > > > > + * This is because on some TDX platforms the low memory below 1MB is
> > > > > + * not included in CMRs.  Excluding the low 1MB can still guarantee
> > > > > + * that the pages managed by the page allocator are always TDX memory,
> > > > > + * as the low 1MB is reserved during kernel boot and won't end up to
> > > > > + * the ZONE_DMA (see reserve_real_mode()).
> > > > > + */
> > > > > +#define memblock_for_each_tdx_mem_pfn_range(i, p_start, p_end, p_nid)	\
> > > > > +	for_each_mem_pfn_range(i, MAX_NUMNODES, p_start, p_end, p_nid)	\
> > > > > +		if (!pfn_range_skip_lowmem(p_start, p_end))
> > > > 
> > > > Let's summarize where we are at this point:
> > > > 
> > > > 1. All RAM is described in memblocks
> > > > 2. Some memblocks are reserved and some are free
> > > > 3. The lower 1MB is marked reserved
> > > > 4. for_each_mem_pfn_range() walks all reserved and free memblocks, so we
> > > >     have to exclude the lower 1MB as a special case.
> > > > 
> > > > That seems superficially rather ridiculous.  Shouldn't we just pick a
> > > > memblock iterator that skips the 1MB?  Surely there is such a thing.
> > > 
> > > Perhaps you are suggesting we should always loop the _free_ ranges so we don't
> > > need to care about the first 1MB which is reserved?
> > > 
> > > The problem is some reserved memory regions are actually later freed to the page
> > > allocator, for example, initrd.  So to cover all those 'late-freed-reserved-
> > > regions', I used for_each_mem_pfn_range(), instead of for_each_free_mem_range().
> > 
> > Why not just entirely remove the lower 1MB from the memblock structure
> > on TDX systems?  Do something equivalent to adding this on the kernel
> > command line:
> > 
> > 	memmap=1M$0x0
> > 
> > > Btw, I do have a checkpatch warning around this code:
> > > 
> > > ERROR: Macros with complex values should be enclosed in parentheses
> > > #109: FILE: arch/x86/virt/vmx/tdx/tdx.c:377:
> > > +#define memblock_for_each_tdx_mem_pfn_range(i, p_start, p_end, p_nid)	\
> > > +	for_each_mem_pfn_range(i, MAX_NUMNODES, p_start, p_end, p_nid)	\
> > > +		if (!pfn_range_skip_lowmem(p_start, p_end))
> > > 
> > > But it looks like a false positive to me.
> > 
> > I think it doesn't like the if().
> 
> I think it is right.
> 
> Consider:
> 
> if (a)
>      memblock_for_each_tdx_mem_pfn_range(...)
>          func();
> else
>      other_func();
> 
> 
> 
Interesting case.  Thanks.

Yes we will require explicit { } around memblock_for_each_tdx_mem_pfn_range() in
this case.
Huang, Kai Aug. 3, 2022, 1:30 a.m. UTC | #8
On Fri, 2022-07-08 at 11:34 +1200, Kai Huang wrote:
> > Why not just entirely remove the lower 1MB from the memblock structure
> > on TDX systems?  Do something equivalent to adding this on the kernel
> > command line:
> > 
> >  	memmap=1M$0x0
> 
> I will explore this option.  Thanks!

Hi Dave,

After investigating and testing, we cannot simply remove first 1MB from e820
table which is similar to what 'memmap=1M$0x0' does, as the kernel needs low
memory as trampoline to bring up all APs.

Currently I am doing below:

--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -65,6 +65,17 @@ void __init reserve_real_mode(void)
         * setup_arch().
         */
        memblock_reserve(0, SZ_1M);
+
+       /*
+        * As one step of initializing the TDX module (on-demand), the
+        * kernel will later verify all memory regions in memblock are
+        * truly TDX-capable and convert all of them to TDX memory.
+        * The first 1MB may not be enumerated as TDX-capable memory.
+        * To avoid failure to verify, explicitly remove the first 1MB
+        * from memblock for a TDX (BIOS) enabled system.
+        */
+       if (platform_tdx_enabled())
+               memblock_remove(0, SZ_1M);

I tested it and it worked (I didn't observe any problem), but am I missing
something?

Also, regarding to whether we can remove platform_tdx_enabled() at all, I looked
into the spec again and there's no MSR or CPUID from which we can check TDX is
enabled by BIOS -- except checking the SEAMRR_MASK MSR, which is basically
platform_tdx_enabled() also did.

Checking MSR_MTRRcap.SEAMRR bit isn't enough as it will be true as long as the
hardware supports SEAMRR, but it doesn't tell whether SEAMRR(TDX) is enabled by
BIOS.

So if above code is reasonable, I think we can still detect TDX during boot and
keep platform_tdx_enabled().  

It also detects TDX KeyIDs, which isn't necessary for removing the first 1MB
here (nor for kexec() support), but detecting TDX KeyIDs must be done anyway
either during kernel boot or during initializing TDX module.

Detecting TDX KeyIDs at boot time also has an advantage that in the future we can
expose KeyIDs via /sysfs and userspace can know how many TDs the machine can
support w/o having to initialize the TDX module first (we received such a
requirement from a customer, but yes, it is arguable).

Any comments?
Dave Hansen Aug. 3, 2022, 2:22 p.m. UTC | #9
On 8/2/22 18:30, Kai Huang wrote:
> On Fri, 2022-07-08 at 11:34 +1200, Kai Huang wrote:
>>> Why not just entirely remove the lower 1MB from the memblock structure
>>> on TDX systems?  Do something equivalent to adding this on the kernel
>>> command line:
>>>
>>>  	memmap=1M$0x0
>> I will explore this option.  Thanks!
> Hi Dave,
> 
> After investigating and testing, we cannot simply remove first 1MB from e820
> table which is similar to what 'memmap=1M$0x0' does, as the kernel needs low
> memory as trampoline to bring up all APs.

OK, so don't remove it, but reserve it so that the trampoline code can
use it.
Huang, Kai Aug. 3, 2022, 10:14 p.m. UTC | #10
On Wed, 2022-08-03 at 07:22 -0700, Dave Hansen wrote:
> On 8/2/22 18:30, Kai Huang wrote:
> > On Fri, 2022-07-08 at 11:34 +1200, Kai Huang wrote:
> > > > Why not just entirely remove the lower 1MB from the memblock structure
> > > > on TDX systems?  Do something equivalent to adding this on the kernel
> > > > command line:
> > > > 
> > > >  	memmap=1M$0x0
> > > I will explore this option.  Thanks!
> > Hi Dave,
> > 
> > After investigating and testing, we cannot simply remove first 1MB from e820
> > table which is similar to what 'memmap=1M$0x0' does, as the kernel needs low
> > memory as trampoline to bring up all APs.
> 
> OK, so don't remove it, but reserve it so that the trampoline code can
> use it.

It's already reserved in the existing reserve_real_mode().  What we need is to
*remove* the first 1MB from memblock.memory, so that the
for_each_mem_pfn_range() will just not get any memory below 1MB.  Otherwise we
need to explicitly skip the first 1MB in TDX code like what I did in this
series.
diff mbox series

Patch

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index efa830853e98..4988a91d5283 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1974,6 +1974,7 @@  config INTEL_TDX_HOST
 	depends on X86_64
 	depends on KVM_INTEL
 	select ARCH_HAS_CC_PLATFORM
+	select ARCH_KEEP_MEMBLOCK
 	help
 	  Intel Trust Domain Extensions (TDX) protects guest VMs from malicious
 	  host and certain physical attacks.  This option enables necessary TDX
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index 1bc97756bc0d..2b20d4a7a62b 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -15,6 +15,8 @@ 
 #include <linux/cpumask.h>
 #include <linux/smp.h>
 #include <linux/atomic.h>
+#include <linux/sizes.h>
+#include <linux/memblock.h>
 #include <asm/cpufeatures.h>
 #include <asm/cpufeature.h>
 #include <asm/msr-index.h>
@@ -338,6 +340,91 @@  static int tdx_get_sysinfo(struct tdsysinfo_struct *tdsysinfo,
 	return check_cmrs(cmr_array, actual_cmr_num);
 }
 
+/*
+ * Skip the memory region below 1MB.  Return true if the entire
+ * region is skipped.  Otherwise, the updated range is returned.
+ */
+static bool pfn_range_skip_lowmem(unsigned long *p_start_pfn,
+				  unsigned long *p_end_pfn)
+{
+	u64 start, end;
+
+	start = *p_start_pfn << PAGE_SHIFT;
+	end = *p_end_pfn << PAGE_SHIFT;
+
+	if (start < SZ_1M)
+		start = SZ_1M;
+
+	if (start >= end)
+		return true;
+
+	*p_start_pfn = (start >> PAGE_SHIFT);
+
+	return false;
+}
+
+/*
+ * Walks over all memblock memory regions that are intended to be
+ * converted to TDX memory.  Essentially, it is all memblock memory
+ * regions excluding the low memory below 1MB.
+ *
+ * This is because on some TDX platforms the low memory below 1MB is
+ * not included in CMRs.  Excluding the low 1MB can still guarantee
+ * that the pages managed by the page allocator are always TDX memory,
+ * as the low 1MB is reserved during kernel boot and won't end up to
+ * the ZONE_DMA (see reserve_real_mode()).
+ */
+#define memblock_for_each_tdx_mem_pfn_range(i, p_start, p_end, p_nid)	\
+	for_each_mem_pfn_range(i, MAX_NUMNODES, p_start, p_end, p_nid)	\
+		if (!pfn_range_skip_lowmem(p_start, p_end))
+
+/* Check whether first range is the subrange of the second */
+static bool is_subrange(u64 r1_start, u64 r1_end, u64 r2_start, u64 r2_end)
+{
+	return r1_start >= r2_start && r1_end <= r2_end;
+}
+
+/* Check whether address range is covered by any CMR or not. */
+static bool range_covered_by_cmr(struct cmr_info *cmr_array, int cmr_num,
+				 u64 start, u64 end)
+{
+	int i;
+
+	for (i = 0; i < cmr_num; i++) {
+		struct cmr_info *cmr = &cmr_array[i];
+
+		if (is_subrange(start, end, cmr->base, cmr->base + cmr->size))
+			return true;
+	}
+
+	return false;
+}
+
+/*
+ * Check whether all memory regions in memblock are TDX convertible
+ * memory.  Return 0 if all memory regions are convertible, or error.
+ */
+static int check_memblock_tdx_convertible(void)
+{
+	unsigned long start_pfn, end_pfn;
+	int i;
+
+	memblock_for_each_tdx_mem_pfn_range(i, &start_pfn, &end_pfn, NULL) {
+		u64 start, end;
+
+		start = start_pfn << PAGE_SHIFT;
+		end = end_pfn << PAGE_SHIFT;
+		if (!range_covered_by_cmr(tdx_cmr_array, tdx_cmr_num, start,
+					end)) {
+			pr_err("[0x%llx, 0x%llx) is not fully convertible memory\n",
+					start, end);
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
 /*
  * Detect and initialize the TDX module.
  *
@@ -371,6 +458,19 @@  static int init_tdx_module(void)
 	if (ret)
 		goto out;
 
+	/*
+	 * To avoid having to modify the page allocator to distinguish
+	 * TDX and non-TDX memory allocation, convert all memory regions
+	 * in memblock to TDX memory to make sure all pages managed by
+	 * the page allocator are TDX memory.
+	 *
+	 * Sanity check all memory regions are fully covered by CMRs to
+	 * make sure they are truly convertible.
+	 */
+	ret = check_memblock_tdx_convertible();
+	if (ret)
+		goto out;
+
 	/*
 	 * Return -EINVAL until all steps of TDX module initialization
 	 * process are done.