diff mbox

[v7u1,31/31] x86, 64bit, mm: hibernate use generic mapping_init

Message ID 1357260531-11115-32-git-send-email-yinghai@kernel.org (mailing list archive)
State Not Applicable, archived
Headers show

Commit Message

Yinghai Lu Jan. 4, 2013, 12:48 a.m. UTC
Make it only map range in pfn_mapped array.

and it has kernel mapping with EXEC.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: Rafael J. Wysocki <rjw@sisk.pl>
Cc: linux-pm@vger.kernel.org
---
 arch/x86/power/hibernate_64.c |   66 ++++++++++++++---------------------------
 1 file changed, 22 insertions(+), 44 deletions(-)

Comments

Rafael Wysocki Jan. 4, 2013, 11:43 a.m. UTC | #1
On Thursday, January 03, 2013 04:48:51 PM Yinghai Lu wrote:
> Make it only map range in pfn_mapped array.

Can you please explain why that should be sufficient?

Have you tested it?

> and it has kernel mapping with EXEC.

That's because it needs to execute code from one of those pages and it
doesn't know in advance which one that's going to be.

Thanks,
Rafael


> Signed-off-by: Yinghai Lu <yinghai@kernel.org>
> Cc: Pavel Machek <pavel@ucw.cz>
> Cc: Rafael J. Wysocki <rjw@sisk.pl>
> Cc: linux-pm@vger.kernel.org
> ---
>  arch/x86/power/hibernate_64.c |   66 ++++++++++++++---------------------------
>  1 file changed, 22 insertions(+), 44 deletions(-)
> 
> diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
> index 460f314..a0fde91 100644
> --- a/arch/x86/power/hibernate_64.c
> +++ b/arch/x86/power/hibernate_64.c
> @@ -11,6 +11,8 @@
>  #include <linux/gfp.h>
>  #include <linux/smp.h>
>  #include <linux/suspend.h>
> +
> +#include <asm/init.h>
>  #include <asm/proto.h>
>  #include <asm/page.h>
>  #include <asm/pgtable.h>
> @@ -39,41 +41,21 @@ pgd_t *temp_level4_pgt;
>  
>  void *relocated_restore_code;
>  
> -static int res_phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
> +static void *alloc_pgt_page(void *context)
>  {
> -	long i, j;
> -
> -	i = pud_index(address);
> -	pud = pud + i;
> -	for (; i < PTRS_PER_PUD; pud++, i++) {
> -		unsigned long paddr;
> -		pmd_t *pmd;
> -
> -		paddr = address + i*PUD_SIZE;
> -		if (paddr >= end)
> -			break;
> -
> -		pmd = (pmd_t *)get_safe_page(GFP_ATOMIC);
> -		if (!pmd)
> -			return -ENOMEM;
> -		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
> -		for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) {
> -			unsigned long pe;
> -
> -			if (paddr >= end)
> -				break;
> -			pe = __PAGE_KERNEL_LARGE_EXEC | paddr;
> -			pe &= __supported_pte_mask;
> -			set_pmd(pmd, __pmd(pe));
> -		}
> -	}
> -	return 0;
> +	return (void *)get_safe_page(GFP_ATOMIC);
>  }
>
>  static int set_up_temporary_mappings(void)
>  {
> -	unsigned long start, end, next;
> -	int error;
> +	struct x86_mapping_info info = {
> +		.alloc_pgt_page	= alloc_pgt_page,
> +		.pmd_flag	= __PAGE_KERNEL_LARGE_EXEC,
> +		.kernel_mapping = true,
> +	};
> +	unsigned long mstart, mend;
> +	int result;
> +	int i;
>  
>  	temp_level4_pgt = (pgd_t *)get_safe_page(GFP_ATOMIC);
>  	if (!temp_level4_pgt)
> @@ -84,21 +66,17 @@ static int set_up_temporary_mappings(void)
>  		init_level4_pgt[pgd_index(__START_KERNEL_map)]);
>  
>  	/* Set up the direct mapping from scratch */
> -	start = (unsigned long)pfn_to_kaddr(0);
> -	end = (unsigned long)pfn_to_kaddr(max_pfn);
> -
> -	for (; start < end; start = next) {
> -		pud_t *pud = (pud_t *)get_safe_page(GFP_ATOMIC);
> -		if (!pud)
> -			return -ENOMEM;
> -		next = start + PGDIR_SIZE;
> -		if (next > end)
> -			next = end;
> -		if ((error = res_phys_pud_init(pud, __pa(start), __pa(next))))
> -			return error;
> -		set_pgd(temp_level4_pgt + pgd_index(start),
> -			mk_kernel_pgd(__pa(pud)));
> +	for (i = 0; i < nr_pfn_mapped; i++) {
> +		mstart = pfn_mapped[i].start << PAGE_SHIFT;
> +		mend   = pfn_mapped[i].end << PAGE_SHIFT;
> +
> +		result = kernel_ident_mapping_init(&info, temp_level4_pgt,
> +						   mstart, mend);
> +
> +		if (result)
> +			return result;
>  	}
> +
>  	return 0;
>  }
>  
>
Yinghai Lu Jan. 4, 2013, 9:59 p.m. UTC | #2
On Fri, Jan 4, 2013 at 3:43 AM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
> On Thursday, January 03, 2013 04:48:51 PM Yinghai Lu wrote:
>> Make it only map range in pfn_mapped array.
>
> Can you please explain why that should be sufficient?

It is needed.

http://git.kernel.org/?p=linux/kernel/git/tip/tip.git;a=commitdiff;h=66520ebc2df3fe52eb4792f8101fac573b766baf

Subject: [PATCH] x86, mm: Only direct map addresses that are marked as
 E820_RAM

Currently direct mappings are created for [ 0 to max_low_pfn<<PAGE_SHIFT )
and [ 4GB to max_pfn<<PAGE_SHIFT ), which may include regions that are not
backed by actual DRAM. This is fine for holes under 4GB which are covered
by fixed and variable range MTRRs to be UC. However, we run into trouble
on higher memory addresses which cannot be covered by MTRRs.

Our system with 1TB of RAM has an e820 that looks like this:

 BIOS-e820: [mem 0x0000000000000000-0x00000000000983ff] usable
 BIOS-e820: [mem 0x0000000000098400-0x000000000009ffff] reserved
 BIOS-e820: [mem 0x00000000000d0000-0x00000000000fffff] reserved
 BIOS-e820: [mem 0x0000000000100000-0x00000000c7ebffff] usable
 BIOS-e820: [mem 0x00000000c7ec0000-0x00000000c7ed7fff] ACPI data
 BIOS-e820: [mem 0x00000000c7ed8000-0x00000000c7ed9fff] ACPI NVS
 BIOS-e820: [mem 0x00000000c7eda000-0x00000000c7ffffff] reserved
 BIOS-e820: [mem 0x00000000fec00000-0x00000000fec0ffff] reserved
 BIOS-e820: [mem 0x00000000fee00000-0x00000000fee00fff] reserved
 BIOS-e820: [mem 0x00000000fff00000-0x00000000ffffffff] reserved
 BIOS-e820: [mem 0x0000000100000000-0x000000e037ffffff] usable
 BIOS-e820: [mem 0x000000e038000000-0x000000fcffffffff] reserved
 BIOS-e820: [mem 0x0000010000000000-0x0000011ffeffffff] usable

and so direct mappings are created for huge memory hole between
0x000000e038000000 to 0x0000010000000000. Even though the kernel never
generates memory accesses in that region, since the page tables mark
them incorrectly as being WB, our (AMD) processor ends up causing a MCE
while doing some memory bookkeeping/optimizations around that area.

This patch iterates through e820 and only direct maps ranges that are
marked as E820_RAM, and keeps track of those pfn ranges. Depending on
the alignment of E820 ranges, this may possibly result in using smaller
size (i.e. 4K instead of 2M or 1G) page tables.

>
> Have you tested it?
>

No

will update to

Subject: [PATCH] x86, 64bit, mm: hibernate use generic mapping_init

We should not set mapping for all under max_pfn.
That causes same problem that is fixed by
        x86, mm: Only direct map addresses that are marked as E820_RAM

Make it only map range in pfn_mapped array.
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rafael Wysocki Jan. 4, 2013, 10:07 p.m. UTC | #3
On Friday, January 04, 2013 01:59:33 PM Yinghai Lu wrote:
> On Fri, Jan 4, 2013 at 3:43 AM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
> > On Thursday, January 03, 2013 04:48:51 PM Yinghai Lu wrote:
> >> Make it only map range in pfn_mapped array.
> >
> > Can you please explain why that should be sufficient?
> 
> It is needed.
> 
> http://git.kernel.org/?p=linux/kernel/git/tip/tip.git;a=commitdiff;h=66520ebc2df3fe52eb4792f8101fac573b766baf
> 
> Subject: [PATCH] x86, mm: Only direct map addresses that are marked as
>  E820_RAM
> 
> Currently direct mappings are created for [ 0 to max_low_pfn<<PAGE_SHIFT )
> and [ 4GB to max_pfn<<PAGE_SHIFT ), which may include regions that are not
> backed by actual DRAM. This is fine for holes under 4GB which are covered
> by fixed and variable range MTRRs to be UC. However, we run into trouble
> on higher memory addresses which cannot be covered by MTRRs.
> 
> Our system with 1TB of RAM has an e820 that looks like this:
> 
>  BIOS-e820: [mem 0x0000000000000000-0x00000000000983ff] usable
>  BIOS-e820: [mem 0x0000000000098400-0x000000000009ffff] reserved
>  BIOS-e820: [mem 0x00000000000d0000-0x00000000000fffff] reserved
>  BIOS-e820: [mem 0x0000000000100000-0x00000000c7ebffff] usable
>  BIOS-e820: [mem 0x00000000c7ec0000-0x00000000c7ed7fff] ACPI data
>  BIOS-e820: [mem 0x00000000c7ed8000-0x00000000c7ed9fff] ACPI NVS
>  BIOS-e820: [mem 0x00000000c7eda000-0x00000000c7ffffff] reserved
>  BIOS-e820: [mem 0x00000000fec00000-0x00000000fec0ffff] reserved
>  BIOS-e820: [mem 0x00000000fee00000-0x00000000fee00fff] reserved
>  BIOS-e820: [mem 0x00000000fff00000-0x00000000ffffffff] reserved
>  BIOS-e820: [mem 0x0000000100000000-0x000000e037ffffff] usable
>  BIOS-e820: [mem 0x000000e038000000-0x000000fcffffffff] reserved
>  BIOS-e820: [mem 0x0000010000000000-0x0000011ffeffffff] usable
> 
> and so direct mappings are created for huge memory hole between
> 0x000000e038000000 to 0x0000010000000000. Even though the kernel never
> generates memory accesses in that region, since the page tables mark
> them incorrectly as being WB, our (AMD) processor ends up causing a MCE
> while doing some memory bookkeeping/optimizations around that area.
> 
> This patch iterates through e820 and only direct maps ranges that are
> marked as E820_RAM, and keeps track of those pfn ranges. Depending on
> the alignment of E820 ranges, this may possibly result in using smaller
> size (i.e. 4K instead of 2M or 1G) page tables.
> 
> >
> > Have you tested it?
> >
> 
> No
> 
> will update to
> 
> Subject: [PATCH] x86, 64bit, mm: hibernate use generic mapping_init
> 
> We should not set mapping for all under max_pfn.
> That causes same problem that is fixed by
>         x86, mm: Only direct map addresses that are marked as E820_RAM
> 
> Make it only map range in pfn_mapped array.

OK

Thanks,
Rafael
diff mbox

Patch

diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index 460f314..a0fde91 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -11,6 +11,8 @@ 
 #include <linux/gfp.h>
 #include <linux/smp.h>
 #include <linux/suspend.h>
+
+#include <asm/init.h>
 #include <asm/proto.h>
 #include <asm/page.h>
 #include <asm/pgtable.h>
@@ -39,41 +41,21 @@  pgd_t *temp_level4_pgt;
 
 void *relocated_restore_code;
 
-static int res_phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
+static void *alloc_pgt_page(void *context)
 {
-	long i, j;
-
-	i = pud_index(address);
-	pud = pud + i;
-	for (; i < PTRS_PER_PUD; pud++, i++) {
-		unsigned long paddr;
-		pmd_t *pmd;
-
-		paddr = address + i*PUD_SIZE;
-		if (paddr >= end)
-			break;
-
-		pmd = (pmd_t *)get_safe_page(GFP_ATOMIC);
-		if (!pmd)
-			return -ENOMEM;
-		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
-		for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) {
-			unsigned long pe;
-
-			if (paddr >= end)
-				break;
-			pe = __PAGE_KERNEL_LARGE_EXEC | paddr;
-			pe &= __supported_pte_mask;
-			set_pmd(pmd, __pmd(pe));
-		}
-	}
-	return 0;
+	return (void *)get_safe_page(GFP_ATOMIC);
 }
 
 static int set_up_temporary_mappings(void)
 {
-	unsigned long start, end, next;
-	int error;
+	struct x86_mapping_info info = {
+		.alloc_pgt_page	= alloc_pgt_page,
+		.pmd_flag	= __PAGE_KERNEL_LARGE_EXEC,
+		.kernel_mapping = true,
+	};
+	unsigned long mstart, mend;
+	int result;
+	int i;
 
 	temp_level4_pgt = (pgd_t *)get_safe_page(GFP_ATOMIC);
 	if (!temp_level4_pgt)
@@ -84,21 +66,17 @@  static int set_up_temporary_mappings(void)
 		init_level4_pgt[pgd_index(__START_KERNEL_map)]);
 
 	/* Set up the direct mapping from scratch */
-	start = (unsigned long)pfn_to_kaddr(0);
-	end = (unsigned long)pfn_to_kaddr(max_pfn);
-
-	for (; start < end; start = next) {
-		pud_t *pud = (pud_t *)get_safe_page(GFP_ATOMIC);
-		if (!pud)
-			return -ENOMEM;
-		next = start + PGDIR_SIZE;
-		if (next > end)
-			next = end;
-		if ((error = res_phys_pud_init(pud, __pa(start), __pa(next))))
-			return error;
-		set_pgd(temp_level4_pgt + pgd_index(start),
-			mk_kernel_pgd(__pa(pud)));
+	for (i = 0; i < nr_pfn_mapped; i++) {
+		mstart = pfn_mapped[i].start << PAGE_SHIFT;
+		mend   = pfn_mapped[i].end << PAGE_SHIFT;
+
+		result = kernel_ident_mapping_init(&info, temp_level4_pgt,
+						   mstart, mend);
+
+		if (result)
+			return result;
 	}
+
 	return 0;
 }