[RFC,22/22] x86/kaslr: Add option to extend KASLR range from 1GB to 3GB

Message ID 20170718223333.110371-23-thgarnie@google.com (mailing list archive)
State New, archived

Commit Message

Thomas Garnier July 18, 2017, 10:33 p.m. UTC
Add a new CONFIG_RANDOMIZE_BASE_LARGE option to benefit from PIE
support. It increases the KASLR range from 1GB to 3GB. The new range
starts at 0xffffffff00000000, just above the EFI memory region. This
option is off by default.

The boot code is adapted to create the appropriate page table spanning
three PUD pages.
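
As a quick sanity check of that count, here is a standalone userspace sketch
(not kernel code) that applies the same rounding as the patch's pud_count()
macro to the new base and image size:

	#include <stdio.h>

	#define PUD_SHIFT 30
	#define PUD_SIZE  (1UL << PUD_SHIFT)                /* 1GB mapped per PUD entry */
	/* Round a size up to whole PUD entries, as pud_count() in the patch does */
	#define pud_count(x) ((((x) + (PUD_SIZE - 1)) & ~(PUD_SIZE - 1)) >> PUD_SHIFT)

	int main(void)
	{
		unsigned long base  = 0xffffffff00000000UL;     /* new __START_KERNEL_map */
		unsigned long image = 3UL * 1024 * 1024 * 1024; /* KERNEL_IMAGE_SIZE */

		printf("PUD entries: %lu\n", pud_count(image)); /* prints 3 */
		printf("image ends:  %#lx\n", base + image);    /* 0xffffffffc0000000 */
		return 0;
	}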

The relocation table uses 64-bit integers generated by the updated
relocation tool with the large-reloc option.
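
In the decompressor, handle_relocations() walks that table from the end of
the image and patches each fixup site; with the wider entries the core of
the walk looks roughly like this (a simplified sketch, with bounds checks
and the 64-bit/inverse relocation passes elided):

	typedef long rel_t;	/* 64-bit entries under CONFIG_RANDOMIZE_BASE_LARGE, int otherwise */
	rel_t *reloc;

	/* The table is terminated by a zero entry; each entry holds the kernel
	 * virtual address of a 32-bit fixup site. */
	for (reloc = output + output_len - sizeof(*reloc); *reloc; reloc--) {
		unsigned long ptr = (unsigned long)(*reloc + map); /* virt -> loaded addr */
		*(uint32_t *)ptr += delta;                         /* apply the load delta */
	}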

Signed-off-by: Thomas Garnier <thgarnie@google.com>
---
 arch/x86/Kconfig                     | 21 +++++++++++++++++++++
 arch/x86/boot/compressed/Makefile    |  5 +++++
 arch/x86/boot/compressed/misc.c      | 10 +++++++++-
 arch/x86/include/asm/page_64_types.h |  9 +++++++++
 arch/x86/kernel/head64.c             | 18 ++++++++++++++----
 arch/x86/kernel/head_64.S            | 11 ++++++++++-
 6 files changed, 68 insertions(+), 6 deletions(-)

Comments

Baoquan He July 19, 2017, 12:10 p.m. UTC | #1
On 07/18/17 at 03:33pm, Thomas Garnier wrote:

>  quiet_cmd_relocs = RELOCS  $@
>        cmd_relocs = $(CMD_RELOCS) $< > $@;$(CMD_RELOCS) --abs-relocs $<
>  $(obj)/vmlinux.relocs: vmlinux FORCE
> diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
> index a0838ab929f2..0a0c80ab1842 100644
> --- a/arch/x86/boot/compressed/misc.c
> +++ b/arch/x86/boot/compressed/misc.c
> @@ -170,10 +170,18 @@ void __puthex(unsigned long value)
>  }
>  
>  #if CONFIG_X86_NEED_RELOCS
> +
> +/* Large randomization goes lower than -2G and uses a large relocation table */
> +#ifdef CONFIG_RANDOMIZE_BASE_LARGE
> +typedef long rel_t;
> +#else
> +typedef int rel_t;
> +#endif
> +
>  static void handle_relocations(void *output, unsigned long output_len,
>  			       unsigned long virt_addr)
>  {
> -	int *reloc;
> +	rel_t *reloc;
>  	unsigned long delta, map, ptr;
>  	unsigned long min_addr = (unsigned long)output;
>  	unsigned long max_addr = min_addr + (VO___bss_start - VO__text);
> diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
> index 3f5f08b010d0..6b65f846dd64 100644
> --- a/arch/x86/include/asm/page_64_types.h
> +++ b/arch/x86/include/asm/page_64_types.h
> @@ -48,7 +48,11 @@
>  #define __PAGE_OFFSET           __PAGE_OFFSET_BASE
>  #endif /* CONFIG_RANDOMIZE_MEMORY */
>  
> +#ifdef CONFIG_RANDOMIZE_BASE_LARGE
> +#define __START_KERNEL_map	_AC(0xffffffff00000000, UL)
> +#else
>  #define __START_KERNEL_map	_AC(0xffffffff80000000, UL)
> +#endif /* CONFIG_RANDOMIZE_BASE_LARGE */
>  
>  /* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
>  #ifdef CONFIG_X86_5LEVEL
> @@ -65,9 +69,14 @@
>   * 512MiB by default, leaving 1.5GiB for modules once the page tables
>   * are fully set up. If kernel ASLR is configured, it can extend the
>   * kernel page table mapping, reducing the size of the modules area.
> + * On PIE, we relocate the binary 2G lower, so add this extra space.
>   */
>  #if defined(CONFIG_RANDOMIZE_BASE)
> +#ifdef CONFIG_RANDOMIZE_BASE_LARGE
> +#define KERNEL_IMAGE_SIZE	(_AC(3, UL) * 1024 * 1024 * 1024)
> +#else
>  #define KERNEL_IMAGE_SIZE	(1024 * 1024 * 1024)
> +#endif
>  #else
>  #define KERNEL_IMAGE_SIZE	(512 * 1024 * 1024)
>  #endif
> diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
> index 4103e90ff128..235c3f7b46c7 100644
> --- a/arch/x86/kernel/head64.c
> +++ b/arch/x86/kernel/head64.c
> @@ -39,6 +39,7 @@ static unsigned int __initdata next_early_pgt;
>  pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX);
>  
>  #define __head	__section(.head.text)
> +#define pud_count(x)   (((x + (PUD_SIZE - 1)) & ~(PUD_SIZE - 1)) >> PUD_SHIFT)
>  
>  static void __head *fixup_pointer(void *ptr, unsigned long physaddr)
>  {
> @@ -54,6 +55,8 @@ unsigned long _text_offset = (unsigned long)(_text - __START_KERNEL_map);
>  void __head notrace __startup_64(unsigned long physaddr)
>  {
>  	unsigned long load_delta, *p;
> +	unsigned long level3_kernel_start, level3_kernel_count;
> +	unsigned long level3_fixmap_start;
>  	pgdval_t *pgd;
>  	p4dval_t *p4d;
>  	pudval_t *pud;
> @@ -74,6 +77,11 @@ void __head notrace __startup_64(unsigned long physaddr)
>  	if (load_delta & ~PMD_PAGE_MASK)
>  		for (;;);
>  
> +	/* Look at the randomization spread to adapt the page tables used */
> +	level3_kernel_start = pud_index(__START_KERNEL_map);
> +	level3_kernel_count = pud_count(KERNEL_IMAGE_SIZE);
> +	level3_fixmap_start = level3_kernel_start + level3_kernel_count;
> +
>  	/* Fixup the physical addresses in the page table */
>  
>  	pgd = fixup_pointer(&early_top_pgt, physaddr);
> @@ -85,8 +93,9 @@ void __head notrace __startup_64(unsigned long physaddr)
>  	}
>  
>  	pud = fixup_pointer(&level3_kernel_pgt, physaddr);
> -	pud[510] += load_delta;
> -	pud[511] += load_delta;
> +	for (i = 0; i < level3_kernel_count; i++)
> +		pud[level3_kernel_start + i] += load_delta;
> +	pud[level3_fixmap_start] += load_delta;
>  
>  	pmd = fixup_pointer(level2_fixmap_pgt, physaddr);
>  	pmd[506] += load_delta;
> @@ -137,7 +146,7 @@ void __head notrace __startup_64(unsigned long physaddr)
>  	 */
>  
>  	pmd = fixup_pointer(level2_kernel_pgt, physaddr);
> -	for (i = 0; i < PTRS_PER_PMD; i++) {
> +	for (i = 0; i < PTRS_PER_PMD * level3_kernel_count; i++) {
>  		if (pmd[i] & _PAGE_PRESENT)
>  			pmd[i] += load_delta;

Wow, this is dangerous. The three pud entries of level3_kernel_pgt all point
into level2_kernel_pgt; that runs out of bounds of level2_kernel_pgt and
overwrites the data that follows.

And if only one page is used for level2_kernel_pgt, and the kernel is
randomized so that it crosses a pud entry boundary between -4G and -1G, it
won't work well.

>  	}
> @@ -268,7 +277,8 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
>  	 */
>  	BUILD_BUG_ON(MODULES_VADDR < __START_KERNEL_map);
>  	BUILD_BUG_ON(MODULES_VADDR - __START_KERNEL_map < KERNEL_IMAGE_SIZE);
> -	BUILD_BUG_ON(MODULES_LEN + KERNEL_IMAGE_SIZE > 2*PUD_SIZE);
> +	BUILD_BUG_ON(!IS_ENABLED(CONFIG_RANDOMIZE_BASE_LARGE) &&
> +		     MODULES_LEN + KERNEL_IMAGE_SIZE > 2*PUD_SIZE);
>  	BUILD_BUG_ON((__START_KERNEL_map & ~PMD_MASK) != 0);
>  	BUILD_BUG_ON((MODULES_VADDR & ~PMD_MASK) != 0);
>  	BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
> diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
> index 4d0a7e68bfe8..e8b2d6706eca 100644
> --- a/arch/x86/kernel/head_64.S
> +++ b/arch/x86/kernel/head_64.S
> @@ -39,11 +39,15 @@
>  
>  #define p4d_index(x)	(((x) >> P4D_SHIFT) & (PTRS_PER_P4D-1))
>  #define pud_index(x)	(((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
> +#define pud_count(x)   (((x + (PUD_SIZE - 1)) & ~(PUD_SIZE - 1)) >> PUD_SHIFT)
>  
>  PGD_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE)
>  PGD_START_KERNEL = pgd_index(__START_KERNEL_map)
>  L3_START_KERNEL = pud_index(__START_KERNEL_map)
>  
> +/* Adapt the page table L3 space based on the randomization range */
> +L3_KERNEL_ENTRY_COUNT = pud_count(KERNEL_IMAGE_SIZE)
> +
>  	.text
>  	__HEAD
>  	.code64
> @@ -396,7 +400,12 @@ NEXT_PAGE(level4_kernel_pgt)
>  NEXT_PAGE(level3_kernel_pgt)
>  	.fill	L3_START_KERNEL,8,0
>  	/* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
> -	.quad	level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
> +	i = 0
> +	.rept	L3_KERNEL_ENTRY_COUNT
> +	.quad	level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE \
> +		+ PAGE_SIZE*i
> +	i = i + 1
> +	.endr
>  	.quad	level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
>  
>  NEXT_PAGE(level2_kernel_pgt)
> -- 
> 2.13.2.932.g7449e964c-goog
>
Baoquan He July 19, 2017, 1:49 p.m. UTC | #2
On 07/19/17 at 08:10pm, Baoquan He wrote:
> On 07/18/17 at 03:33pm, Thomas Garnier wrote:
> 
> >  quiet_cmd_relocs = RELOCS  $@
> >        cmd_relocs = $(CMD_RELOCS) $< > $@;$(CMD_RELOCS) --abs-relocs $<
> >  $(obj)/vmlinux.relocs: vmlinux FORCE
> > diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
> > index a0838ab929f2..0a0c80ab1842 100644
> > --- a/arch/x86/boot/compressed/misc.c
> > +++ b/arch/x86/boot/compressed/misc.c
> > @@ -170,10 +170,18 @@ void __puthex(unsigned long value)
> >  }
> >  
> >  #if CONFIG_X86_NEED_RELOCS
> > +
> > +/* Large randomization goes lower than -2G and uses a large relocation table */
> > +#ifdef CONFIG_RANDOMIZE_BASE_LARGE
> > +typedef long rel_t;
> > +#else
> > +typedef int rel_t;
> > +#endif
> > +
> >  static void handle_relocations(void *output, unsigned long output_len,
> >  			       unsigned long virt_addr)
> >  {
> > -	int *reloc;
> > +	rel_t *reloc;
> >  	unsigned long delta, map, ptr;
> >  	unsigned long min_addr = (unsigned long)output;
> >  	unsigned long max_addr = min_addr + (VO___bss_start - VO__text);
> > diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
> > index 3f5f08b010d0..6b65f846dd64 100644
> > --- a/arch/x86/include/asm/page_64_types.h
> > +++ b/arch/x86/include/asm/page_64_types.h
> > @@ -48,7 +48,11 @@
> >  #define __PAGE_OFFSET           __PAGE_OFFSET_BASE
> >  #endif /* CONFIG_RANDOMIZE_MEMORY */
> >  
> > +#ifdef CONFIG_RANDOMIZE_BASE_LARGE
> > +#define __START_KERNEL_map	_AC(0xffffffff00000000, UL)
> > +#else
> >  #define __START_KERNEL_map	_AC(0xffffffff80000000, UL)
> > +#endif /* CONFIG_RANDOMIZE_BASE_LARGE */
> >  
> >  /* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
> >  #ifdef CONFIG_X86_5LEVEL
> > @@ -65,9 +69,14 @@
> >   * 512MiB by default, leaving 1.5GiB for modules once the page tables
> >   * are fully set up. If kernel ASLR is configured, it can extend the
> >   * kernel page table mapping, reducing the size of the modules area.
> > + * On PIE, we relocate the binary 2G lower, so add this extra space.
> >   */
> >  #if defined(CONFIG_RANDOMIZE_BASE)
> > +#ifdef CONFIG_RANDOMIZE_BASE_LARGE
> > +#define KERNEL_IMAGE_SIZE	(_AC(3, UL) * 1024 * 1024 * 1024)
> > +#else
> >  #define KERNEL_IMAGE_SIZE	(1024 * 1024 * 1024)
> > +#endif
> >  #else
> >  #define KERNEL_IMAGE_SIZE	(512 * 1024 * 1024)
> >  #endif
> > diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
> > index 4103e90ff128..235c3f7b46c7 100644
> > --- a/arch/x86/kernel/head64.c
> > +++ b/arch/x86/kernel/head64.c
> > @@ -39,6 +39,7 @@ static unsigned int __initdata next_early_pgt;
> >  pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX);
> >  
> >  #define __head	__section(.head.text)
> > +#define pud_count(x)   (((x + (PUD_SIZE - 1)) & ~(PUD_SIZE - 1)) >> PUD_SHIFT)
> >  
> >  static void __head *fixup_pointer(void *ptr, unsigned long physaddr)
> >  {
> > @@ -54,6 +55,8 @@ unsigned long _text_offset = (unsigned long)(_text - __START_KERNEL_map);
> >  void __head notrace __startup_64(unsigned long physaddr)
> >  {
> >  	unsigned long load_delta, *p;
> > +	unsigned long level3_kernel_start, level3_kernel_count;
> > +	unsigned long level3_fixmap_start;
> >  	pgdval_t *pgd;
> >  	p4dval_t *p4d;
> >  	pudval_t *pud;
> > @@ -74,6 +77,11 @@ void __head notrace __startup_64(unsigned long physaddr)
> >  	if (load_delta & ~PMD_PAGE_MASK)
> >  		for (;;);
> >  
> > +	/* Look at the randomization spread to adapt the page tables used */
> > +	level3_kernel_start = pud_index(__START_KERNEL_map);
> > +	level3_kernel_count = pud_count(KERNEL_IMAGE_SIZE);
> > +	level3_fixmap_start = level3_kernel_start + level3_kernel_count;
> > +
> >  	/* Fixup the physical addresses in the page table */
> >  
> >  	pgd = fixup_pointer(&early_top_pgt, physaddr);
> > @@ -85,8 +93,9 @@ void __head notrace __startup_64(unsigned long physaddr)
> >  	}
> >  
> >  	pud = fixup_pointer(&level3_kernel_pgt, physaddr);
> > -	pud[510] += load_delta;
> > -	pud[511] += load_delta;
> > +	for (i = 0; i < level3_kernel_count; i++)
> > +		pud[level3_kernel_start + i] += load_delta;
> > +	pud[level3_fixmap_start] += load_delta;
> >  
> >  	pmd = fixup_pointer(level2_fixmap_pgt, physaddr);
> >  	pmd[506] += load_delta;
> > @@ -137,7 +146,7 @@ void __head notrace __startup_64(unsigned long physaddr)
> >  	 */
> >  
> >  	pmd = fixup_pointer(level2_kernel_pgt, physaddr);
> > -	for (i = 0; i < PTRS_PER_PMD; i++) {
> > +	for (i = 0; i < PTRS_PER_PMD * level3_kernel_count; i++) {
> >  		if (pmd[i] & _PAGE_PRESENT)
> >  			pmd[i] += load_delta;
> 
> Wow, this is dangerous. The three pud entries of level3_kernel_pgt all
> point into level2_kernel_pgt; that runs out of bounds of level2_kernel_pgt
> and overwrites the data that follows.
> 
> And if only one page is used for level2_kernel_pgt, and the kernel is
> randomized so that it crosses a pud entry boundary between -4G and -1G, it
> won't work well.

Sorry, I was wrong; the size of level2_kernel_pgt is decided by
KERNEL_IMAGE_SIZE, so it's not a problem. Please ignore this comment.
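
For reference, level2_kernel_pgt is sized from KERNEL_IMAGE_SIZE in
head_64.S; its definition in the tree this series is based on is essentially:

	NEXT_PAGE(level2_kernel_pgt)
		PMDS(0, __PAGE_KERNEL_LARGE_EXEC, KERNEL_IMAGE_SIZE/PMD_SIZE)

With KERNEL_IMAGE_SIZE at 3GB and PMD_SIZE at 2MB, that is 1536 PMD entries,
i.e. three 4KB pages, matching the three pud entries above.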

> 
> >  	}
> > @@ -268,7 +277,8 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
> >  	 */
> >  	BUILD_BUG_ON(MODULES_VADDR < __START_KERNEL_map);
> >  	BUILD_BUG_ON(MODULES_VADDR - __START_KERNEL_map < KERNEL_IMAGE_SIZE);
> > -	BUILD_BUG_ON(MODULES_LEN + KERNEL_IMAGE_SIZE > 2*PUD_SIZE);
> > +	BUILD_BUG_ON(!IS_ENABLED(CONFIG_RANDOMIZE_BASE_LARGE) &&
> > +		     MODULES_LEN + KERNEL_IMAGE_SIZE > 2*PUD_SIZE);
> >  	BUILD_BUG_ON((__START_KERNEL_map & ~PMD_MASK) != 0);
> >  	BUILD_BUG_ON((MODULES_VADDR & ~PMD_MASK) != 0);
> >  	BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
> > diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
> > index 4d0a7e68bfe8..e8b2d6706eca 100644
> > --- a/arch/x86/kernel/head_64.S
> > +++ b/arch/x86/kernel/head_64.S
> > @@ -39,11 +39,15 @@
> >  
> >  #define p4d_index(x)	(((x) >> P4D_SHIFT) & (PTRS_PER_P4D-1))
> >  #define pud_index(x)	(((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
> > +#define pud_count(x)   (((x + (PUD_SIZE - 1)) & ~(PUD_SIZE - 1)) >> PUD_SHIFT)
> >  
> >  PGD_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE)
> >  PGD_START_KERNEL = pgd_index(__START_KERNEL_map)
> >  L3_START_KERNEL = pud_index(__START_KERNEL_map)
> >  
> > +/* Adapt page table L3 space based on range of randomization */
> > +L3_KERNEL_ENTRY_COUNT = pud_count(KERNEL_IMAGE_SIZE)
> > +
> >  	.text
> >  	__HEAD
> >  	.code64
> > @@ -396,7 +400,12 @@ NEXT_PAGE(level4_kernel_pgt)
> >  NEXT_PAGE(level3_kernel_pgt)
> >  	.fill	L3_START_KERNEL,8,0
> >  	/* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
> > -	.quad	level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
> > +	i = 0
> > +	.rept	L3_KERNEL_ENTRY_COUNT
> > +	.quad	level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE \
> > +		+ PAGE_SIZE*i
> > +	i = i + 1
> > +	.endr
> >  	.quad	level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
> >  
> >  NEXT_PAGE(level2_kernel_pgt)
> > -- 
> > 2.13.2.932.g7449e964c-goog
> >

Patch

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 60d161391d5a..8054eef76dfc 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2096,6 +2096,27 @@  config X86_MODULE_PLTS
 	select X86_MODULE_MODEL_LARGE
 	select HAVE_MOD_ARCH_SPECIFIC
 
+config RANDOMIZE_BASE_LARGE
+	bool "Increase the randomization range of the kernel image"
+	depends on X86_64 && RANDOMIZE_BASE
+	select X86_PIE
+	select X86_MODULE_PLTS if MODULES
+	default n
+	---help---
+	  Build the kernel as a Position Independent Executable (PIE) and
+	  increase the available randomization range from 1GB to 3GB.
+
+	  This option impacts performance of CPU-intensive kernel workloads by
+	  up to 10% due to PIE-generated code. The impact on user-mode
+	  processes and typical usage is significantly less (0.50% when
+	  building the kernel).
+
+	  The kernel and modules will generate slightly more assembly (a 1 to
+	  2% increase in the .text sections). The vmlinux binary will be
+	  significantly smaller due to fewer relocations.
+
+	  If unsure, say N.
+
 config HOTPLUG_CPU
 	bool "Support for hot-pluggable CPUs"
 	depends on SMP
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 2c860ad4fe06..8f4317864e98 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -111,7 +111,12 @@  $(obj)/vmlinux.bin: vmlinux FORCE
 
 targets += $(patsubst $(obj)/%,%,$(vmlinux-objs-y)) vmlinux.bin.all vmlinux.relocs
 
# Large randomization requires a bigger relocation table
+ifeq ($(CONFIG_RANDOMIZE_BASE_LARGE),y)
+CMD_RELOCS = arch/x86/tools/relocs --large-reloc
+else
 CMD_RELOCS = arch/x86/tools/relocs
+endif
 quiet_cmd_relocs = RELOCS  $@
       cmd_relocs = $(CMD_RELOCS) $< > $@;$(CMD_RELOCS) --abs-relocs $<
 $(obj)/vmlinux.relocs: vmlinux FORCE
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index a0838ab929f2..0a0c80ab1842 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -170,10 +170,18 @@  void __puthex(unsigned long value)
 }
 
 #if CONFIG_X86_NEED_RELOCS
+
+/* Large randomization goes lower than -2G and uses a large relocation table */
+#ifdef CONFIG_RANDOMIZE_BASE_LARGE
+typedef long rel_t;
+#else
+typedef int rel_t;
+#endif
+
 static void handle_relocations(void *output, unsigned long output_len,
 			       unsigned long virt_addr)
 {
-	int *reloc;
+	rel_t *reloc;
 	unsigned long delta, map, ptr;
 	unsigned long min_addr = (unsigned long)output;
 	unsigned long max_addr = min_addr + (VO___bss_start - VO__text);
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 3f5f08b010d0..6b65f846dd64 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -48,7 +48,11 @@ 
 #define __PAGE_OFFSET           __PAGE_OFFSET_BASE
 #endif /* CONFIG_RANDOMIZE_MEMORY */
 
+#ifdef CONFIG_RANDOMIZE_BASE_LARGE
+#define __START_KERNEL_map	_AC(0xffffffff00000000, UL)
+#else
 #define __START_KERNEL_map	_AC(0xffffffff80000000, UL)
+#endif /* CONFIG_RANDOMIZE_BASE_LARGE */
 
 /* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
 #ifdef CONFIG_X86_5LEVEL
@@ -65,9 +69,14 @@ 
  * 512MiB by default, leaving 1.5GiB for modules once the page tables
  * are fully set up. If kernel ASLR is configured, it can extend the
  * kernel page table mapping, reducing the size of the modules area.
+ * On PIE, we relocate the binary 2G lower, so add this extra space.
  */
 #if defined(CONFIG_RANDOMIZE_BASE)
+#ifdef CONFIG_RANDOMIZE_BASE_LARGE
+#define KERNEL_IMAGE_SIZE	(_AC(3, UL) * 1024 * 1024 * 1024)
+#else
 #define KERNEL_IMAGE_SIZE	(1024 * 1024 * 1024)
+#endif
 #else
 #define KERNEL_IMAGE_SIZE	(512 * 1024 * 1024)
 #endif
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 4103e90ff128..235c3f7b46c7 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -39,6 +39,7 @@  static unsigned int __initdata next_early_pgt;
 pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX);
 
 #define __head	__section(.head.text)
+#define pud_count(x)   (((x + (PUD_SIZE - 1)) & ~(PUD_SIZE - 1)) >> PUD_SHIFT)
 
 static void __head *fixup_pointer(void *ptr, unsigned long physaddr)
 {
@@ -54,6 +55,8 @@  unsigned long _text_offset = (unsigned long)(_text - __START_KERNEL_map);
 void __head notrace __startup_64(unsigned long physaddr)
 {
 	unsigned long load_delta, *p;
+	unsigned long level3_kernel_start, level3_kernel_count;
+	unsigned long level3_fixmap_start;
 	pgdval_t *pgd;
 	p4dval_t *p4d;
 	pudval_t *pud;
@@ -74,6 +77,11 @@  void __head notrace __startup_64(unsigned long physaddr)
 	if (load_delta & ~PMD_PAGE_MASK)
 		for (;;);
 
+	/* Look at the randomization spread to adapt the page tables used */
+	level3_kernel_start = pud_index(__START_KERNEL_map);
+	level3_kernel_count = pud_count(KERNEL_IMAGE_SIZE);
+	level3_fixmap_start = level3_kernel_start + level3_kernel_count;
+
 	/* Fixup the physical addresses in the page table */
 
 	pgd = fixup_pointer(&early_top_pgt, physaddr);
@@ -85,8 +93,9 @@  void __head notrace __startup_64(unsigned long physaddr)
 	}
 
 	pud = fixup_pointer(&level3_kernel_pgt, physaddr);
-	pud[510] += load_delta;
-	pud[511] += load_delta;
+	for (i = 0; i < level3_kernel_count; i++)
+		pud[level3_kernel_start + i] += load_delta;
+	pud[level3_fixmap_start] += load_delta;
 
 	pmd = fixup_pointer(level2_fixmap_pgt, physaddr);
 	pmd[506] += load_delta;
@@ -137,7 +146,7 @@  void __head notrace __startup_64(unsigned long physaddr)
 	 */
 
 	pmd = fixup_pointer(level2_kernel_pgt, physaddr);
-	for (i = 0; i < PTRS_PER_PMD; i++) {
+	for (i = 0; i < PTRS_PER_PMD * level3_kernel_count; i++) {
 		if (pmd[i] & _PAGE_PRESENT)
 			pmd[i] += load_delta;
 	}
@@ -268,7 +277,8 @@  asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
 	 */
 	BUILD_BUG_ON(MODULES_VADDR < __START_KERNEL_map);
 	BUILD_BUG_ON(MODULES_VADDR - __START_KERNEL_map < KERNEL_IMAGE_SIZE);
-	BUILD_BUG_ON(MODULES_LEN + KERNEL_IMAGE_SIZE > 2*PUD_SIZE);
+	BUILD_BUG_ON(!IS_ENABLED(CONFIG_RANDOMIZE_BASE_LARGE) &&
+		     MODULES_LEN + KERNEL_IMAGE_SIZE > 2*PUD_SIZE);
 	BUILD_BUG_ON((__START_KERNEL_map & ~PMD_MASK) != 0);
 	BUILD_BUG_ON((MODULES_VADDR & ~PMD_MASK) != 0);
 	BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 4d0a7e68bfe8..e8b2d6706eca 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -39,11 +39,15 @@ 
 
 #define p4d_index(x)	(((x) >> P4D_SHIFT) & (PTRS_PER_P4D-1))
 #define pud_index(x)	(((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
+#define pud_count(x)   (((x + (PUD_SIZE - 1)) & ~(PUD_SIZE - 1)) >> PUD_SHIFT)
 
 PGD_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE)
 PGD_START_KERNEL = pgd_index(__START_KERNEL_map)
 L3_START_KERNEL = pud_index(__START_KERNEL_map)
 
+/* Adapt the page table L3 space based on the randomization range */
+L3_KERNEL_ENTRY_COUNT = pud_count(KERNEL_IMAGE_SIZE)
+
 	.text
 	__HEAD
 	.code64
@@ -396,7 +400,12 @@  NEXT_PAGE(level4_kernel_pgt)
 NEXT_PAGE(level3_kernel_pgt)
 	.fill	L3_START_KERNEL,8,0
 	/* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
-	.quad	level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
+	i = 0
+	.rept	L3_KERNEL_ENTRY_COUNT
+	.quad	level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE \
+		+ PAGE_SIZE*i
+	i = i + 1
+	.endr
 	.quad	level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
 
 NEXT_PAGE(level2_kernel_pgt)