diff mbox series

[v3,3/4] arm64: kdump: support more than one crash kernel regions

Message ID 20190409102819.121335-4-chenzhou10@huawei.com (mailing list archive)
State New, archived
Headers show
Series support reserving crashkernel above 4G on arm64 kdump | expand

Commit Message

chenzhou April 9, 2019, 10:28 a.m. UTC
After commit ("arm64: kdump: support reserving crashkernel above 4G"),
there may be two crash kernel regions: one below 4G, the other above 4G.

The crash dump kernel reads more than one crash kernel region via a dtb
property under node /chosen:
linux,usable-memory-range = <BASE1 SIZE1 [BASE2 SIZE2]>

Signed-off-by: Chen Zhou <chenzhou10@huawei.com>
---
 arch/arm64/mm/init.c     | 66 ++++++++++++++++++++++++++++++++++++++++--------
 include/linux/memblock.h |  6 +++++
 mm/memblock.c            |  7 ++---
 3 files changed, 66 insertions(+), 13 deletions(-)

Comments

Mike Rapoport April 10, 2019, 1:09 p.m. UTC | #1
Hi,

On Tue, Apr 09, 2019 at 06:28:18PM +0800, Chen Zhou wrote:
> After commit (arm64: kdump: support reserving crashkernel above 4G),
> there may be two crash kernel regions, one is below 4G, the other is
> above 4G.
> 
> Crash dump kernel reads more than one crash kernel regions via a dtb
> property under node /chosen,
> linux,usable-memory-range = <BASE1 SIZE1 [BASE2 SIZE2]>
> 
> Signed-off-by: Chen Zhou <chenzhou10@huawei.com>
> ---
>  arch/arm64/mm/init.c     | 66 ++++++++++++++++++++++++++++++++++++++++--------
>  include/linux/memblock.h |  6 +++++
>  mm/memblock.c            |  7 ++---
>  3 files changed, 66 insertions(+), 13 deletions(-)
> 
> diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
> index 3bebddf..0f18665 100644
> --- a/arch/arm64/mm/init.c
> +++ b/arch/arm64/mm/init.c
> @@ -65,6 +65,11 @@ phys_addr_t arm64_dma_phys_limit __ro_after_init;
>  
>  #ifdef CONFIG_KEXEC_CORE
>  
> +/* at most two crash kernel regions, low_region and high_region */
> +#define CRASH_MAX_USABLE_RANGES	2
> +#define LOW_REGION_IDX			0
> +#define HIGH_REGION_IDX			1
> +
>  /*
>   * reserve_crashkernel() - reserves memory for crash kernel
>   *
> @@ -297,8 +302,8 @@ static int __init early_init_dt_scan_usablemem(unsigned long node,
>  		const char *uname, int depth, void *data)
>  {
>  	struct memblock_region *usablemem = data;
> -	const __be32 *reg;
> -	int len;
> +	const __be32 *reg, *endp;
> +	int len, nr = 0;
>  
>  	if (depth != 1 || strcmp(uname, "chosen") != 0)
>  		return 0;
> @@ -307,22 +312,63 @@ static int __init early_init_dt_scan_usablemem(unsigned long node,
>  	if (!reg || (len < (dt_root_addr_cells + dt_root_size_cells)))
>  		return 1;
>  
> -	usablemem->base = dt_mem_next_cell(dt_root_addr_cells, &reg);
> -	usablemem->size = dt_mem_next_cell(dt_root_size_cells, &reg);
> +	endp = reg + (len / sizeof(__be32));
> +	while ((endp - reg) >= (dt_root_addr_cells + dt_root_size_cells)) {
> +		usablemem[nr].base = dt_mem_next_cell(dt_root_addr_cells, &reg);
> +		usablemem[nr].size = dt_mem_next_cell(dt_root_size_cells, &reg);
> +
> +		if (++nr >= CRASH_MAX_USABLE_RANGES)
> +			break;
> +	}
>  
>  	return 1;
>  }
>  
>  static void __init fdt_enforce_memory_region(void)
>  {
> -	struct memblock_region reg = {
> -		.size = 0,
> -	};
> +	int i, cnt = 0;
> +	struct memblock_region regs[CRASH_MAX_USABLE_RANGES];

I only now noticed that fdt_enforce_memory_region() uses memblock_region to
pass the ranges around. If we'd switch to memblock_type instead, the
implementation of memblock_cap_memory_ranges() would be really
straightforward. Can you check if the below patch works for you? 

From e476d584098e31273af573e1a78e308880c5cf28 Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.ibm.com>
Date: Wed, 10 Apr 2019 16:02:32 +0300
Subject: [PATCH] memblock: extend memblock_cap_memory_range to multiple ranges

memblock_cap_memory_range() removes all the memory except the range
passed to it. Extend this function to receive a memblock_type with the
regions that should be kept. This allows switching to simple iteration over
the memblock arrays with 'for_each_mem_range' to remove the unneeded memory.

Enable use of this function in arm64 for reservation of multiple regions for
the crash kernel.

Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
---
 arch/arm64/mm/init.c     | 34 ++++++++++++++++++++++++----------
 include/linux/memblock.h |  2 +-
 mm/memblock.c            | 45 ++++++++++++++++++++++-----------------------
 3 files changed, 47 insertions(+), 34 deletions(-)

diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 6bc1350..30a496f 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -64,6 +64,10 @@ EXPORT_SYMBOL(memstart_addr);
 phys_addr_t arm64_dma_phys_limit __ro_after_init;
 
 #ifdef CONFIG_KEXEC_CORE
+
+/* at most two crash kernel regions, low_region and high_region */
+#define CRASH_MAX_USABLE_RANGES	2
+
 /*
  * reserve_crashkernel() - reserves memory for crash kernel
  *
@@ -280,9 +284,9 @@ early_param("mem", early_mem);
 static int __init early_init_dt_scan_usablemem(unsigned long node,
 		const char *uname, int depth, void *data)
 {
-	struct memblock_region *usablemem = data;
-	const __be32 *reg;
-	int len;
+	struct memblock_type *usablemem = data;
+	const __be32 *reg, *endp;
+	int len, nr = 0;
 
 	if (depth != 1 || strcmp(uname, "chosen") != 0)
 		return 0;
@@ -291,22 +295,32 @@ static int __init early_init_dt_scan_usablemem(unsigned long node,
 	if (!reg || (len < (dt_root_addr_cells + dt_root_size_cells)))
 		return 1;
 
-	usablemem->base = dt_mem_next_cell(dt_root_addr_cells, &reg);
-	usablemem->size = dt_mem_next_cell(dt_root_size_cells, &reg);
+	endp = reg + (len / sizeof(__be32));
+	while ((endp - reg) >= (dt_root_addr_cells + dt_root_size_cells)) {
+		unsigned long base = dt_mem_next_cell(dt_root_addr_cells, &reg);
+		unsigned long size = dt_mem_next_cell(dt_root_size_cells, &reg);
 
+		if (memblock_add_range(usablemem, base, size, NUMA_NO_NODE,
+				       MEMBLOCK_NONE))
+			return 0;
+		if (++nr >= CRASH_MAX_USABLE_RANGES)
+			break;
+	}
 	return 1;
 }
 
 static void __init fdt_enforce_memory_region(void)
 {
-	struct memblock_region reg = {
-		.size = 0,
+	struct memblock_region usable_regions[CRASH_MAX_USABLE_RANGES];
+	struct memblock_type usablemem = {
+		.max = CRASH_MAX_USABLE_RANGES,
+		.regions = usable_regions,
 	};
 
-	of_scan_flat_dt(early_init_dt_scan_usablemem, &reg);
+	of_scan_flat_dt(early_init_dt_scan_usablemem, &usablemem);
 
-	if (reg.size)
-		memblock_cap_memory_range(reg.base, reg.size);
+	if (usablemem.cnt)
+		memblock_cap_memory_ranges(&usablemem);
 }
 
 void __init arm64_memblock_init(void)
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 294d5d8..a803ae9 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -404,7 +404,7 @@ phys_addr_t memblock_mem_size(unsigned long limit_pfn);
 phys_addr_t memblock_start_of_DRAM(void);
 phys_addr_t memblock_end_of_DRAM(void);
 void memblock_enforce_memory_limit(phys_addr_t memory_limit);
-void memblock_cap_memory_range(phys_addr_t base, phys_addr_t size);
+void memblock_cap_memory_ranges(struct memblock_type *regions_to_keep);
 void memblock_mem_limit_remove_map(phys_addr_t limit);
 bool memblock_is_memory(phys_addr_t addr);
 bool memblock_is_map_memory(phys_addr_t addr);
diff --git a/mm/memblock.c b/mm/memblock.c
index e7665cf..83d84d4 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1605,36 +1605,34 @@ void __init memblock_enforce_memory_limit(phys_addr_t limit)
 			      PHYS_ADDR_MAX);
 }
 
-void __init memblock_cap_memory_range(phys_addr_t base, phys_addr_t size)
+void __init memblock_cap_memory_ranges(struct memblock_type *regions_to_keep)
 {
-	int start_rgn, end_rgn;
-	int i, ret;
-
-	if (!size)
-		return;
-
-	ret = memblock_isolate_range(&memblock.memory, base, size,
-						&start_rgn, &end_rgn);
-	if (ret)
-		return;
-
-	/* remove all the MAP regions */
-	for (i = memblock.memory.cnt - 1; i >= end_rgn; i--)
-		if (!memblock_is_nomap(&memblock.memory.regions[i]))
-			memblock_remove_region(&memblock.memory, i);
+	phys_addr_t start, end;
+	u64 i;
 
-	for (i = start_rgn - 1; i >= 0; i--)
-		if (!memblock_is_nomap(&memblock.memory.regions[i]))
-			memblock_remove_region(&memblock.memory, i);
+	/* truncate memory while skipping NOMAP regions */
+	for_each_mem_range(i, &memblock.memory, regions_to_keep, NUMA_NO_NODE,
+			   MEMBLOCK_NONE, &start, &end, NULL)
+		memblock_remove(start, end);
 
 	/* truncate the reserved regions */
-	memblock_remove_range(&memblock.reserved, 0, base);
-	memblock_remove_range(&memblock.reserved,
-			base + size, PHYS_ADDR_MAX);
+	for_each_mem_range(i, &memblock.reserved, regions_to_keep, NUMA_NO_NODE,
+			   MEMBLOCK_NONE, &start, &end, NULL)
+		memblock_remove_range(&memblock.reserved, start, end);
 }
 
 void __init memblock_mem_limit_remove_map(phys_addr_t limit)
 {
+	struct memblock_region rgn = {
+		.base = 0,
+	};
+
+	struct memblock_type region_to_keep = {
+		.cnt = 1,
+		.max = 1,
+		.regions = &rgn,
+	};
+
 	phys_addr_t max_addr;
 
 	if (!limit)
@@ -1646,7 +1644,8 @@ void __init memblock_mem_limit_remove_map(phys_addr_t limit)
 	if (max_addr == PHYS_ADDR_MAX)
 		return;
 
-	memblock_cap_memory_range(0, max_addr);
+	region_to_keep.regions[0].size = max_addr;
+	memblock_cap_memory_ranges(&region_to_keep);
 }
 
 static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr)
chenzhou April 11, 2019, 12:17 p.m. UTC | #2
Hi Mike,

This overall looks good.
Replacing memblock_cap_memory_range() with memblock_cap_memory_ranges() was what I wanted
to do in v1; sorry for not expressing that clearly.

But there are some issues, noted below. After fixing them, the patch works correctly.

On 2019/4/10 21:09, Mike Rapoport wrote:
> Hi,
> 
> On Tue, Apr 09, 2019 at 06:28:18PM +0800, Chen Zhou wrote:
>> After commit (arm64: kdump: support reserving crashkernel above 4G),
>> there may be two crash kernel regions, one is below 4G, the other is
>> above 4G.
>>
>> Crash dump kernel reads more than one crash kernel regions via a dtb
>> property under node /chosen,
>> linux,usable-memory-range = <BASE1 SIZE1 [BASE2 SIZE2]>
>>
>> Signed-off-by: Chen Zhou <chenzhou10@huawei.com>
>> ---
>>  arch/arm64/mm/init.c     | 66 ++++++++++++++++++++++++++++++++++++++++--------
>>  include/linux/memblock.h |  6 +++++
>>  mm/memblock.c            |  7 ++---
>>  3 files changed, 66 insertions(+), 13 deletions(-)
>>
>> diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
>> index 3bebddf..0f18665 100644
>> --- a/arch/arm64/mm/init.c
>> +++ b/arch/arm64/mm/init.c
>> @@ -65,6 +65,11 @@ phys_addr_t arm64_dma_phys_limit __ro_after_init;
>>  
>>  #ifdef CONFIG_KEXEC_CORE
>>  
>> +/* at most two crash kernel regions, low_region and high_region */
>> +#define CRASH_MAX_USABLE_RANGES	2
>> +#define LOW_REGION_IDX			0
>> +#define HIGH_REGION_IDX			1
>> +
>>  /*
>>   * reserve_crashkernel() - reserves memory for crash kernel
>>   *
>> @@ -297,8 +302,8 @@ static int __init early_init_dt_scan_usablemem(unsigned long node,
>>  		const char *uname, int depth, void *data)
>>  {
>>  	struct memblock_region *usablemem = data;
>> -	const __be32 *reg;
>> -	int len;
>> +	const __be32 *reg, *endp;
>> +	int len, nr = 0;
>>  
>>  	if (depth != 1 || strcmp(uname, "chosen") != 0)
>>  		return 0;
>> @@ -307,22 +312,63 @@ static int __init early_init_dt_scan_usablemem(unsigned long node,
>>  	if (!reg || (len < (dt_root_addr_cells + dt_root_size_cells)))
>>  		return 1;
>>  
>> -	usablemem->base = dt_mem_next_cell(dt_root_addr_cells, &reg);
>> -	usablemem->size = dt_mem_next_cell(dt_root_size_cells, &reg);
>> +	endp = reg + (len / sizeof(__be32));
>> +	while ((endp - reg) >= (dt_root_addr_cells + dt_root_size_cells)) {
>> +		usablemem[nr].base = dt_mem_next_cell(dt_root_addr_cells, &reg);
>> +		usablemem[nr].size = dt_mem_next_cell(dt_root_size_cells, &reg);
>> +
>> +		if (++nr >= CRASH_MAX_USABLE_RANGES)
>> +			break;
>> +	}
>>  
>>  	return 1;
>>  }
>>  
>>  static void __init fdt_enforce_memory_region(void)
>>  {
>> -	struct memblock_region reg = {
>> -		.size = 0,
>> -	};
>> +	int i, cnt = 0;
>> +	struct memblock_region regs[CRASH_MAX_USABLE_RANGES];
> 
> I only now noticed that fdt_enforce_memory_region() uses memblock_region to
> pass the ranges around. If we'd switch to memblock_type instead, the
> implementation of memblock_cap_memory_ranges() would be really
> straightforward. Can you check if the below patch works for you? 
> 
>>From e476d584098e31273af573e1a78e308880c5cf28 Mon Sep 17 00:00:00 2001
> From: Mike Rapoport <rppt@linux.ibm.com>
> Date: Wed, 10 Apr 2019 16:02:32 +0300
> Subject: [PATCH] memblock: extend memblock_cap_memory_range to multiple ranges
> 
> The memblock_cap_memory_range() removes all the memory except the range
> passed to it. Extend this function to recieve memblock_type with the
> regions that should be kept. This allows switching to simple iteration over
> memblock arrays with 'for_each_mem_range' to remove the unneeded memory.
> 
> Enable use of this function in arm64 for reservation of multile regions for
> the crash kernel.
> 
> Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
> ---
>  arch/arm64/mm/init.c     | 34 ++++++++++++++++++++++++----------
>  include/linux/memblock.h |  2 +-
>  mm/memblock.c            | 45 ++++++++++++++++++++++-----------------------
>  3 files changed, 47 insertions(+), 34 deletions(-)
> 
>  
> -void __init memblock_cap_memory_range(phys_addr_t base, phys_addr_t size)
> +void __init memblock_cap_memory_ranges(struct memblock_type *regions_to_keep)
>  {
> -	int start_rgn, end_rgn;
> -	int i, ret;
> -
> -	if (!size)
> -		return;
> -
> -	ret = memblock_isolate_range(&memblock.memory, base, size,
> -						&start_rgn, &end_rgn);
> -	if (ret)
> -		return;
> -
> -	/* remove all the MAP regions */
> -	for (i = memblock.memory.cnt - 1; i >= end_rgn; i--)
> -		if (!memblock_is_nomap(&memblock.memory.regions[i]))
> -			memblock_remove_region(&memblock.memory, i);
> +	phys_addr_t start, end;
> +	u64 i;
>  
> -	for (i = start_rgn - 1; i >= 0; i--)
> -		if (!memblock_is_nomap(&memblock.memory.regions[i]))
> -			memblock_remove_region(&memblock.memory, i);
> +	/* truncate memory while skipping NOMAP regions */
> +	for_each_mem_range(i, &memblock.memory, regions_to_keep, NUMA_NO_NODE,
> +			   MEMBLOCK_NONE, &start, &end, NULL)
> +		memblock_remove(start, end);

1. Use memblock_remove(start, end - start) instead of memblock_remove(start, end):
memblock_remove() takes a base and a size, not an end address.

2. There is another hidden issue. We cannot mix __next_mem_range() (called by for_each_mem_range)
with remove operations, because __next_mem_range() keeps the iteration index across calls. If we
remove regions between __next_mem_range() calls, that index can end up pointing at the wrong region.

Therefore, we could do the remove operations after for_each_mem_range(), like this (solution A):
 void __init memblock_cap_memory_ranges(struct memblock_type *regions_to_keep)
 {
-	phys_addr_t start, end;
-	u64 i;
+	phys_addr_t start[INIT_MEMBLOCK_RESERVED_REGIONS * 2];
+	phys_addr_t end[INIT_MEMBLOCK_RESERVED_REGIONS * 2];
+	u64 i, nr = 0;

 	/* truncate memory while skipping NOMAP regions */
 	for_each_mem_range(i, &memblock.memory, regions_to_keep, NUMA_NO_NODE,
-			   MEMBLOCK_NONE, &start, &end, NULL)
-		memblock_remove(start, end);
+			   MEMBLOCK_NONE, &start[nr], &end[nr], NULL)
+		nr++;
+	for (i = 0; i < nr; i++)
+		memblock_remove(start[i], end[i] - start[i]);

 	/* truncate the reserved regions */
+	nr = 0;
 	for_each_mem_range(i, &memblock.reserved, regions_to_keep, NUMA_NO_NODE,
-			   MEMBLOCK_NONE, &start, &end, NULL)
-		memblock_remove_range(&memblock.reserved, start, end);
+			   MEMBLOCK_NONE, &start[nr], &end[nr], NULL)
+		nr++;
+	for (i = 0; i < nr; i++)
+		memblock_remove_range(&memblock.reserved, start[i],
+				end[i] - start[i]);
 }

But a warning occurs when compiling:
  CALL    scripts/atomic/check-atomics.sh
  CALL    scripts/checksyscalls.sh
  CHK     include/generated/compile.h
  CC      mm/memblock.o
mm/memblock.c: In function ‘memblock_cap_memory_ranges’:
mm/memblock.c:1635:1: warning: the frame size of 36912 bytes is larger than 2048 bytes [-Wframe-larger-than=]
 }

Another solution is my implementation from v1 (solution B):
+void __init memblock_cap_memory_ranges(struct memblock_type *regions_to_keep)
+{
+   int start_rgn[INIT_MEMBLOCK_REGIONS], end_rgn[INIT_MEMBLOCK_REGIONS];
+   int i, j, ret, nr = 0;
+   memblock_region *regs = regions_to_keep->regions;
+
+   nr = regions_to_keep -> cnt;
+   if (!nr)
+       return;
+
+   /* remove all the MAP regions */
+   for (i = memblock.memory.cnt - 1; i >= end_rgn[nr - 1]; i--)
+       if (!memblock_is_nomap(&memblock.memory.regions[i]))
+           memblock_remove_region(&memblock.memory, i);
+
+   for (i = nr - 1; i > 0; i--)
+       for (j = start_rgn[i] - 1; j >= end_rgn[i - 1]; j--)
+           if (!memblock_is_nomap(&memblock.memory.regions[j]))
+               memblock_remove_region(&memblock.memory, j);
+
+   for (i = start_rgn[0] - 1; i >= 0; i--)
+       if (!memblock_is_nomap(&memblock.memory.regions[i]))
+           memblock_remove_region(&memblock.memory, i);
+
+   /* truncate the reserved regions */
+   memblock_remove_range(&memblock.reserved, 0, regs[0].base);
+
+   for (i = nr - 1; i > 0; i--)
+       memblock_remove_range(&memblock.reserved,
+               regs[i - 1].base + regs[i - 1].size,
+		regs[i].base - regs[i - 1].base - regs[i - 1].size);
+
+   memblock_remove_range(&memblock.reserved,
+           regs[nr - 1].base + regs[nr - 1].size, PHYS_ADDR_MAX);
+}

Solution A: 	phys_addr_t start[INIT_MEMBLOCK_RESERVED_REGIONS * 2];
		phys_addr_t end[INIT_MEMBLOCK_RESERVED_REGIONS * 2];
start and end hold physical addresses.

Solution B: 	int start_rgn[INIT_MEMBLOCK_REGIONS], end_rgn[INIT_MEMBLOCK_REGIONS];
start_rgn and end_rgn hold region indexes.

Solution B does fewer remove operations and compiles without the frame-size warning.
I think solution B is better; could you give some suggestions?

>  
>  	/* truncate the reserved regions */
> -	memblock_remove_range(&memblock.reserved, 0, base);
> -	memblock_remove_range(&memblock.reserved,
> -			base + size, PHYS_ADDR_MAX);
> +	for_each_mem_range(i, &memblock.reserved, regions_to_keep, NUMA_NO_NODE,
> +			   MEMBLOCK_NONE, &start, &end, NULL)
> +		memblock_remove_range(&memblock.reserved, start, end);

There are the same issues as above.

>  }
>  
>  void __init memblock_mem_limit_remove_map(phys_addr_t limit)
>  {
> +	struct memblock_region rgn = {
> +		.base = 0,
> +	};
> +
> +	struct memblock_type region_to_keep = {
> +		.cnt = 1,
> +		.max = 1,
> +		.regions = &rgn,
> +	};
> +
>  	phys_addr_t max_addr;
>  
>  	if (!limit)
> @@ -1646,7 +1644,8 @@ void __init memblock_mem_limit_remove_map(phys_addr_t limit)
>  	if (max_addr == PHYS_ADDR_MAX)
>  		return;
>  
> -	memblock_cap_memory_range(0, max_addr);
> +	region_to_keep.regions[0].size = max_addr;
> +	memblock_cap_memory_ranges(&region_to_keep);
>  }
>  
>  static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr)
> 

Thanks,
Chen Zhou
chenzhou April 13, 2019, 8:14 a.m. UTC | #3
Hi Mike,

On 2019/4/11 20:17, Chen Zhou wrote:
> Hi Mike,
> 
> This overall looks well.
> Replacing memblock_cap_memory_range() with memblock_cap_memory_ranges() was what i wanted
> to do in v1, sorry for don't express that clearly.
> 
> But there are some issues as below. After fixing this, it can work correctly.
> 
> On 2019/4/10 21:09, Mike Rapoport wrote:
>> Hi,
>>
>> On Tue, Apr 09, 2019 at 06:28:18PM +0800, Chen Zhou wrote:
>>> After commit (arm64: kdump: support reserving crashkernel above 4G),
>>> there may be two crash kernel regions, one is below 4G, the other is
>>> above 4G.
>>>
>>> Crash dump kernel reads more than one crash kernel regions via a dtb
>>> property under node /chosen,
>>> linux,usable-memory-range = <BASE1 SIZE1 [BASE2 SIZE2]>
>>>
>>> Signed-off-by: Chen Zhou <chenzhou10@huawei.com>
>>> ---
>>>  arch/arm64/mm/init.c     | 66 ++++++++++++++++++++++++++++++++++++++++--------
>>>  include/linux/memblock.h |  6 +++++
>>>  mm/memblock.c            |  7 ++---
>>>  3 files changed, 66 insertions(+), 13 deletions(-)
>>>
>>> diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
>>> index 3bebddf..0f18665 100644
>>> --- a/arch/arm64/mm/init.c
>>> +++ b/arch/arm64/mm/init.c
>>> @@ -65,6 +65,11 @@ phys_addr_t arm64_dma_phys_limit __ro_after_init;
>>>  
>>>  #ifdef CONFIG_KEXEC_CORE
>>>  
>>> +/* at most two crash kernel regions, low_region and high_region */
>>> +#define CRASH_MAX_USABLE_RANGES	2
>>> +#define LOW_REGION_IDX			0
>>> +#define HIGH_REGION_IDX			1
>>> +
>>>  /*
>>>   * reserve_crashkernel() - reserves memory for crash kernel
>>>   *
>>> @@ -297,8 +302,8 @@ static int __init early_init_dt_scan_usablemem(unsigned long node,
>>>  		const char *uname, int depth, void *data)
>>>  {
>>>  	struct memblock_region *usablemem = data;
>>> -	const __be32 *reg;
>>> -	int len;
>>> +	const __be32 *reg, *endp;
>>> +	int len, nr = 0;
>>>  
>>>  	if (depth != 1 || strcmp(uname, "chosen") != 0)
>>>  		return 0;
>>> @@ -307,22 +312,63 @@ static int __init early_init_dt_scan_usablemem(unsigned long node,
>>>  	if (!reg || (len < (dt_root_addr_cells + dt_root_size_cells)))
>>>  		return 1;
>>>  
>>> -	usablemem->base = dt_mem_next_cell(dt_root_addr_cells, &reg);
>>> -	usablemem->size = dt_mem_next_cell(dt_root_size_cells, &reg);
>>> +	endp = reg + (len / sizeof(__be32));
>>> +	while ((endp - reg) >= (dt_root_addr_cells + dt_root_size_cells)) {
>>> +		usablemem[nr].base = dt_mem_next_cell(dt_root_addr_cells, &reg);
>>> +		usablemem[nr].size = dt_mem_next_cell(dt_root_size_cells, &reg);
>>> +
>>> +		if (++nr >= CRASH_MAX_USABLE_RANGES)
>>> +			break;
>>> +	}
>>>  
>>>  	return 1;
>>>  }
>>>  
>>>  static void __init fdt_enforce_memory_region(void)
>>>  {
>>> -	struct memblock_region reg = {
>>> -		.size = 0,
>>> -	};
>>> +	int i, cnt = 0;
>>> +	struct memblock_region regs[CRASH_MAX_USABLE_RANGES];
>>
>> I only now noticed that fdt_enforce_memory_region() uses memblock_region to
>> pass the ranges around. If we'd switch to memblock_type instead, the
>> implementation of memblock_cap_memory_ranges() would be really
>> straightforward. Can you check if the below patch works for you? 
>>
>> >From e476d584098e31273af573e1a78e308880c5cf28 Mon Sep 17 00:00:00 2001
>> From: Mike Rapoport <rppt@linux.ibm.com>
>> Date: Wed, 10 Apr 2019 16:02:32 +0300
>> Subject: [PATCH] memblock: extend memblock_cap_memory_range to multiple ranges
>>
>> The memblock_cap_memory_range() removes all the memory except the range
>> passed to it. Extend this function to recieve memblock_type with the
>> regions that should be kept. This allows switching to simple iteration over
>> memblock arrays with 'for_each_mem_range' to remove the unneeded memory.
>>
>> Enable use of this function in arm64 for reservation of multile regions for
>> the crash kernel.
>>
>> Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
>> ---
>>  arch/arm64/mm/init.c     | 34 ++++++++++++++++++++++++----------
>>  include/linux/memblock.h |  2 +-
>>  mm/memblock.c            | 45 ++++++++++++++++++++++-----------------------
>>  3 files changed, 47 insertions(+), 34 deletions(-)
>>
>>  
>> -void __init memblock_cap_memory_range(phys_addr_t base, phys_addr_t size)
>> +void __init memblock_cap_memory_ranges(struct memblock_type *regions_to_keep)
>>  {
>> -	int start_rgn, end_rgn;
>> -	int i, ret;
>> -
>> -	if (!size)
>> -		return;
>> -
>> -	ret = memblock_isolate_range(&memblock.memory, base, size,
>> -						&start_rgn, &end_rgn);
>> -	if (ret)
>> -		return;
>> -
>> -	/* remove all the MAP regions */
>> -	for (i = memblock.memory.cnt - 1; i >= end_rgn; i--)
>> -		if (!memblock_is_nomap(&memblock.memory.regions[i]))
>> -			memblock_remove_region(&memblock.memory, i);
>> +	phys_addr_t start, end;
>> +	u64 i;
>>  
>> -	for (i = start_rgn - 1; i >= 0; i--)
>> -		if (!memblock_is_nomap(&memblock.memory.regions[i]))
>> -			memblock_remove_region(&memblock.memory, i);
>> +	/* truncate memory while skipping NOMAP regions */
>> +	for_each_mem_range(i, &memblock.memory, regions_to_keep, NUMA_NO_NODE,
>> +			   MEMBLOCK_NONE, &start, &end, NULL)
>> +		memblock_remove(start, end);
> 
> 1. use memblock_remove(start, size) instead of memblock_remove(start, end).
> 
> 2. There is a another hidden issue. We couldn't mix __next_mem_range()(called by for_each_mem_range) operation
> with remove operation because __next_mem_range() records the index of last time. If we do remove between
> __next_mem_range(), the index may be mess.
> 
> Therefore, we could do remove operation after for_each_mem_range like this, solution A:
>  void __init memblock_cap_memory_ranges(struct memblock_type *regions_to_keep)
>  {
> -	phys_addr_t start, end;
> -	u64 i;
> +	phys_addr_t start[INIT_MEMBLOCK_RESERVED_REGIONS * 2];
> +	phys_addr_t end[INIT_MEMBLOCK_RESERVED_REGIONS * 2];
> +	u64 i, nr = 0;
> 
>  	/* truncate memory while skipping NOMAP regions */
>  	for_each_mem_range(i, &memblock.memory, regions_to_keep, NUMA_NO_NODE,
> -			   MEMBLOCK_NONE, &start, &end, NULL)
> -		memblock_remove(start, end);
> +			   MEMBLOCK_NONE, &start[nr], &end[nr], NULL)
> +		nr++;
> +	for (i = 0; i < nr; i++)
> +		memblock_remove(start[i], end[i] - start[i]);
> 
>  	/* truncate the reserved regions */
> +	nr = 0;
>  	for_each_mem_range(i, &memblock.reserved, regions_to_keep, NUMA_NO_NODE,
> -			   MEMBLOCK_NONE, &start, &end, NULL)
> -		memblock_remove_range(&memblock.reserved, start, end);
> +			   MEMBLOCK_NONE, &start[nr], &end[nr], NULL)
> +		nr++;
> +	for (i = 0; i < nr; i++)
> +		memblock_remove_range(&memblock.reserved, start[i],
> +				end[i] - start[i]);
>  }
> 
> But a warning occurs when compiling:
>   CALL    scripts/atomic/check-atomics.sh
>   CALL    scripts/checksyscalls.sh
>   CHK     include/generated/compile.h
>   CC      mm/memblock.o
> mm/memblock.c: In function ‘memblock_cap_memory_ranges’:
> mm/memblock.c:1635:1: warning: the frame size of 36912 bytes is larger than 2048 bytes [-Wframe-larger-than=]
>  }
> 
> another solution is my implementation in v1, solution B:
> +void __init memblock_cap_memory_ranges(struct memblock_type *regions_to_keep)

----------
> +{
> +   int start_rgn[INIT_MEMBLOCK_REGIONS], end_rgn[INIT_MEMBLOCK_REGIONS];
> +   int i, j, ret, nr = 0;
> +   memblock_region *regs = regions_to_keep->regions;
> +
> +   nr = regions_to_keep -> cnt;
> +   if (!nr)
> +       return;
----------
Sorry, I sent a draft by mistake; I mixed the draft up with my tested version.
These lines should be replaced with the following:

+       int start_rgn[INIT_MEMBLOCK_REGIONS], end_rgn[INIT_MEMBLOCK_REGIONS];
+       int i, j, ret, nr = 0;
+       struct memblock_region *regs = regions_to_keep->regions;
+
+       for (i = 0; i < regions_to_keep->cnt; i++) {
+               ret = memblock_isolate_range(&memblock.memory, regs[i].base,
+                               regs[i].size, &start_rgn[i], &end_rgn[i]);
+               if (ret)
+                       break;
+               nr++;
+       }
+       if (!nr)
+               return;
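
For reference, folding that fix into the rest of the v1 code quoted further below, the whole of
solution B would read roughly like this (an untested sketch assembled from the quoted pieces; it
assumes regions_to_keep->regions[] is sorted by base):

void __init memblock_cap_memory_ranges(struct memblock_type *regions_to_keep)
{
	int start_rgn[INIT_MEMBLOCK_REGIONS], end_rgn[INIT_MEMBLOCK_REGIONS];
	int i, j, ret, nr = 0;
	struct memblock_region *regs = regions_to_keep->regions;

	/* isolate every range we want to keep so it gets its own region(s) */
	for (i = 0; i < regions_to_keep->cnt; i++) {
		ret = memblock_isolate_range(&memblock.memory, regs[i].base,
				regs[i].size, &start_rgn[i], &end_rgn[i]);
		if (ret)
			break;
		nr++;
	}
	if (!nr)
		return;

	/* remove all the MAP regions above, between and below the kept ranges */
	for (i = memblock.memory.cnt - 1; i >= end_rgn[nr - 1]; i--)
		if (!memblock_is_nomap(&memblock.memory.regions[i]))
			memblock_remove_region(&memblock.memory, i);

	for (i = nr - 1; i > 0; i--)
		for (j = start_rgn[i] - 1; j >= end_rgn[i - 1]; j--)
			if (!memblock_is_nomap(&memblock.memory.regions[j]))
				memblock_remove_region(&memblock.memory, j);

	for (i = start_rgn[0] - 1; i >= 0; i--)
		if (!memblock_is_nomap(&memblock.memory.regions[i]))
			memblock_remove_region(&memblock.memory, i);

	/* truncate the reserved regions outside and between the kept ranges */
	memblock_remove_range(&memblock.reserved, 0, regs[0].base);

	for (i = nr - 1; i > 0; i--)
		memblock_remove_range(&memblock.reserved,
				regs[i - 1].base + regs[i - 1].size,
				regs[i].base - regs[i - 1].base - regs[i - 1].size);

	memblock_remove_range(&memblock.reserved,
			regs[nr - 1].base + regs[nr - 1].size, PHYS_ADDR_MAX);
}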

Thanks,
Chen Zhou

> +
> +   /* remove all the MAP regions */
> +   for (i = memblock.memory.cnt - 1; i >= end_rgn[nr - 1]; i--)
> +       if (!memblock_is_nomap(&memblock.memory.regions[i]))
> +           memblock_remove_region(&memblock.memory, i);
> +
> +   for (i = nr - 1; i > 0; i--)
> +       for (j = start_rgn[i] - 1; j >= end_rgn[i - 1]; j--)
> +           if (!memblock_is_nomap(&memblock.memory.regions[j]))
> +               memblock_remove_region(&memblock.memory, j);
> +
> +   for (i = start_rgn[0] - 1; i >= 0; i--)
> +       if (!memblock_is_nomap(&memblock.memory.regions[i]))
> +           memblock_remove_region(&memblock.memory, i);
> +
> +   /* truncate the reserved regions */
> +   memblock_remove_range(&memblock.reserved, 0, regs[0].base);
> +
> +   for (i = nr - 1; i > 0; i--)
> +       memblock_remove_range(&memblock.reserved,
> +               regs[i - 1].base + regs[i - 1].size,
> +		regs[i].base - regs[i - 1].base - regs[i - 1].size);
> +
> +   memblock_remove_range(&memblock.reserved,
> +           regs[nr - 1].base + regs[nr - 1].size, PHYS_ADDR_MAX);
> +}
> 
> solution A: 	phys_addr_t start[INIT_MEMBLOCK_RESERVED_REGIONS * 2];
> 		phys_addr_t end[INIT_MEMBLOCK_RESERVED_REGIONS * 2];
> start, end is physical addr
> 
> solution B: 	int start_rgn[INIT_MEMBLOCK_REGIONS], end_rgn[INIT_MEMBLOCK_REGIONS];
> start_rgn, end_rgn is rgn index		
> 
> Solution B do less remove operations and with no warning comparing to solution A.
> I think solution B is better, could you give some suggestions?
> 
>>  
>>  	/* truncate the reserved regions */
>> -	memblock_remove_range(&memblock.reserved, 0, base);
>> -	memblock_remove_range(&memblock.reserved,
>> -			base + size, PHYS_ADDR_MAX);
>> +	for_each_mem_range(i, &memblock.reserved, regions_to_keep, NUMA_NO_NODE,
>> +			   MEMBLOCK_NONE, &start, &end, NULL)
>> +		memblock_remove_range(&memblock.reserved, start, end);
> 
> There are the same issues as above.
> 
>>  }
>>  
>>  void __init memblock_mem_limit_remove_map(phys_addr_t limit)
>>  {
>> +	struct memblock_region rgn = {
>> +		.base = 0,
>> +	};
>> +
>> +	struct memblock_type region_to_keep = {
>> +		.cnt = 1,
>> +		.max = 1,
>> +		.regions = &rgn,
>> +	};
>> +
>>  	phys_addr_t max_addr;
>>  
>>  	if (!limit)
>> @@ -1646,7 +1644,8 @@ void __init memblock_mem_limit_remove_map(phys_addr_t limit)
>>  	if (max_addr == PHYS_ADDR_MAX)
>>  		return;
>>  
>> -	memblock_cap_memory_range(0, max_addr);
>> +	region_to_keep.regions[0].size = max_addr;
>> +	memblock_cap_memory_ranges(&region_to_keep);
>>  }
>>  
>>  static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr)
>>
> 
> Thanks,
> Chen Zhou
>
Mike Rapoport April 14, 2019, 12:10 p.m. UTC | #4
Hi,

On Thu, Apr 11, 2019 at 08:17:43PM +0800, Chen Zhou wrote:
> Hi Mike,
> 
> This overall looks well.
> Replacing memblock_cap_memory_range() with memblock_cap_memory_ranges() was what i wanted
> to do in v1, sorry for don't express that clearly.

I didn't object to memblock_cap_memory_ranges() in general; I was worried
about its complexity and hoped that we could find a simpler solution.
 
> But there are some issues as below. After fixing this, it can work correctly.
> 
> On 2019/4/10 21:09, Mike Rapoport wrote:
> > Hi,
> > 
> > On Tue, Apr 09, 2019 at 06:28:18PM +0800, Chen Zhou wrote:
> >> After commit (arm64: kdump: support reserving crashkernel above 4G),
> >> there may be two crash kernel regions, one is below 4G, the other is
> >> above 4G.
> >>
> >> Crash dump kernel reads more than one crash kernel regions via a dtb
> >> property under node /chosen,
> >> linux,usable-memory-range = <BASE1 SIZE1 [BASE2 SIZE2]>
> >>
> >> Signed-off-by: Chen Zhou <chenzhou10@huawei.com>
> >> ---
> >>  arch/arm64/mm/init.c     | 66 ++++++++++++++++++++++++++++++++++++++++--------
> >>  include/linux/memblock.h |  6 +++++
> >>  mm/memblock.c            |  7 ++---
> >>  3 files changed, 66 insertions(+), 13 deletions(-)
> >>
> >> diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
> >> index 3bebddf..0f18665 100644
> >> --- a/arch/arm64/mm/init.c
> >> +++ b/arch/arm64/mm/init.c
> >> @@ -65,6 +65,11 @@ phys_addr_t arm64_dma_phys_limit __ro_after_init;
> >>  
> >>  #ifdef CONFIG_KEXEC_CORE
> >>  
> >> +/* at most two crash kernel regions, low_region and high_region */
> >> +#define CRASH_MAX_USABLE_RANGES	2
> >> +#define LOW_REGION_IDX			0
> >> +#define HIGH_REGION_IDX			1
> >> +
> >>  /*
> >>   * reserve_crashkernel() - reserves memory for crash kernel
> >>   *
> >> @@ -297,8 +302,8 @@ static int __init early_init_dt_scan_usablemem(unsigned long node,
> >>  		const char *uname, int depth, void *data)
> >>  {
> >>  	struct memblock_region *usablemem = data;
> >> -	const __be32 *reg;
> >> -	int len;
> >> +	const __be32 *reg, *endp;
> >> +	int len, nr = 0;
> >>  
> >>  	if (depth != 1 || strcmp(uname, "chosen") != 0)
> >>  		return 0;
> >> @@ -307,22 +312,63 @@ static int __init early_init_dt_scan_usablemem(unsigned long node,
> >>  	if (!reg || (len < (dt_root_addr_cells + dt_root_size_cells)))
> >>  		return 1;
> >>  
> >> -	usablemem->base = dt_mem_next_cell(dt_root_addr_cells, &reg);
> >> -	usablemem->size = dt_mem_next_cell(dt_root_size_cells, &reg);
> >> +	endp = reg + (len / sizeof(__be32));
> >> +	while ((endp - reg) >= (dt_root_addr_cells + dt_root_size_cells)) {
> >> +		usablemem[nr].base = dt_mem_next_cell(dt_root_addr_cells, &reg);
> >> +		usablemem[nr].size = dt_mem_next_cell(dt_root_size_cells, &reg);
> >> +
> >> +		if (++nr >= CRASH_MAX_USABLE_RANGES)
> >> +			break;
> >> +	}
> >>  
> >>  	return 1;
> >>  }
> >>  
> >>  static void __init fdt_enforce_memory_region(void)
> >>  {
> >> -	struct memblock_region reg = {
> >> -		.size = 0,
> >> -	};
> >> +	int i, cnt = 0;
> >> +	struct memblock_region regs[CRASH_MAX_USABLE_RANGES];
> > 
> > I only now noticed that fdt_enforce_memory_region() uses memblock_region to
> > pass the ranges around. If we'd switch to memblock_type instead, the
> > implementation of memblock_cap_memory_ranges() would be really
> > straightforward. Can you check if the below patch works for you? 
> > 
> >>From e476d584098e31273af573e1a78e308880c5cf28 Mon Sep 17 00:00:00 2001
> > From: Mike Rapoport <rppt@linux.ibm.com>
> > Date: Wed, 10 Apr 2019 16:02:32 +0300
> > Subject: [PATCH] memblock: extend memblock_cap_memory_range to multiple ranges
> > 
> > The memblock_cap_memory_range() removes all the memory except the range
> > passed to it. Extend this function to recieve memblock_type with the
> > regions that should be kept. This allows switching to simple iteration over
> > memblock arrays with 'for_each_mem_range' to remove the unneeded memory.
> > 
> > Enable use of this function in arm64 for reservation of multile regions for
> > the crash kernel.
> > 
> > Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
> > ---
> >  arch/arm64/mm/init.c     | 34 ++++++++++++++++++++++++----------
> >  include/linux/memblock.h |  2 +-
> >  mm/memblock.c            | 45 ++++++++++++++++++++++-----------------------
> >  3 files changed, 47 insertions(+), 34 deletions(-)
> > 
> >  
> > -void __init memblock_cap_memory_range(phys_addr_t base, phys_addr_t size)
> > +void __init memblock_cap_memory_ranges(struct memblock_type *regions_to_keep)
> >  {
> > -	int start_rgn, end_rgn;
> > -	int i, ret;
> > -
> > -	if (!size)
> > -		return;
> > -
> > -	ret = memblock_isolate_range(&memblock.memory, base, size,
> > -						&start_rgn, &end_rgn);
> > -	if (ret)
> > -		return;
> > -
> > -	/* remove all the MAP regions */
> > -	for (i = memblock.memory.cnt - 1; i >= end_rgn; i--)
> > -		if (!memblock_is_nomap(&memblock.memory.regions[i]))
> > -			memblock_remove_region(&memblock.memory, i);
> > +	phys_addr_t start, end;
> > +	u64 i;
> >  
> > -	for (i = start_rgn - 1; i >= 0; i--)
> > -		if (!memblock_is_nomap(&memblock.memory.regions[i]))
> > -			memblock_remove_region(&memblock.memory, i);
> > +	/* truncate memory while skipping NOMAP regions */
> > +	for_each_mem_range(i, &memblock.memory, regions_to_keep, NUMA_NO_NODE,
> > +			   MEMBLOCK_NONE, &start, &end, NULL)
> > +		memblock_remove(start, end);
> 
> 1. use memblock_remove(start, size) instead of memblock_remove(start, end).
> 
> 2. There is a another hidden issue. We couldn't mix __next_mem_range()(called by for_each_mem_range) operation
> with remove operation because __next_mem_range() records the index of last time. If we do remove between
> __next_mem_range(), the index may be mess.

Oops, I've really missed that :)
 
> Therefore, we could do remove operation after for_each_mem_range like this, solution A:
>  void __init memblock_cap_memory_ranges(struct memblock_type *regions_to_keep)
>  {
> -	phys_addr_t start, end;
> -	u64 i;
> +	phys_addr_t start[INIT_MEMBLOCK_RESERVED_REGIONS * 2];
> +	phys_addr_t end[INIT_MEMBLOCK_RESERVED_REGIONS * 2];
> +	u64 i, nr = 0;
> 
>  	/* truncate memory while skipping NOMAP regions */
>  	for_each_mem_range(i, &memblock.memory, regions_to_keep, NUMA_NO_NODE,
> -			   MEMBLOCK_NONE, &start, &end, NULL)
> -		memblock_remove(start, end);
> +			   MEMBLOCK_NONE, &start[nr], &end[nr], NULL)
> +		nr++;
> +	for (i = 0; i < nr; i++)
> +		memblock_remove(start[i], end[i] - start[i]);
> 
>  	/* truncate the reserved regions */
> +	nr = 0;
>  	for_each_mem_range(i, &memblock.reserved, regions_to_keep, NUMA_NO_NODE,
> -			   MEMBLOCK_NONE, &start, &end, NULL)
> -		memblock_remove_range(&memblock.reserved, start, end);
> +			   MEMBLOCK_NONE, &start[nr], &end[nr], NULL)
> +		nr++;
> +	for (i = 0; i < nr; i++)
> +		memblock_remove_range(&memblock.reserved, start[i],
> +				end[i] - start[i]);
>  }
> 
> But a warning occurs when compiling:
>   CALL    scripts/atomic/check-atomics.sh
>   CALL    scripts/checksyscalls.sh
>   CHK     include/generated/compile.h
>   CC      mm/memblock.o
> mm/memblock.c: In function ‘memblock_cap_memory_ranges’:
> mm/memblock.c:1635:1: warning: the frame size of 36912 bytes is larger than 2048 bytes [-Wframe-larger-than=]
>  }
> 
> another solution is my implementation in v1, solution B:
> +void __init memblock_cap_memory_ranges(struct memblock_type *regions_to_keep)
> +{
> +   int start_rgn[INIT_MEMBLOCK_REGIONS], end_rgn[INIT_MEMBLOCK_REGIONS];
> +   int i, j, ret, nr = 0;
> +   memblock_region *regs = regions_to_keep->regions;
> +
> +   nr = regions_to_keep -> cnt;
> +   if (!nr)
> +       return;
> +
> +   /* remove all the MAP regions */
> +   for (i = memblock.memory.cnt - 1; i >= end_rgn[nr - 1]; i--)
> +       if (!memblock_is_nomap(&memblock.memory.regions[i]))
> +           memblock_remove_region(&memblock.memory, i);
> +
> +   for (i = nr - 1; i > 0; i--)
> +       for (j = start_rgn[i] - 1; j >= end_rgn[i - 1]; j--)
> +           if (!memblock_is_nomap(&memblock.memory.regions[j]))
> +               memblock_remove_region(&memblock.memory, j);
> +
> +   for (i = start_rgn[0] - 1; i >= 0; i--)
> +       if (!memblock_is_nomap(&memblock.memory.regions[i]))
> +           memblock_remove_region(&memblock.memory, i);
> +
> +   /* truncate the reserved regions */
> +   memblock_remove_range(&memblock.reserved, 0, regs[0].base);
> +
> +   for (i = nr - 1; i > 0; i--)
> +       memblock_remove_range(&memblock.reserved,
> +               regs[i - 1].base + regs[i - 1].size,
> +		regs[i].base - regs[i - 1].base - regs[i - 1].size);
> +
> +   memblock_remove_range(&memblock.reserved,
> +           regs[nr - 1].base + regs[nr - 1].size, PHYS_ADDR_MAX);
> +}
> 
> solution A: 	phys_addr_t start[INIT_MEMBLOCK_RESERVED_REGIONS * 2];
> 		phys_addr_t end[INIT_MEMBLOCK_RESERVED_REGIONS * 2];
> start, end is physical addr
> 
> solution B: 	int start_rgn[INIT_MEMBLOCK_REGIONS], end_rgn[INIT_MEMBLOCK_REGIONS];
> start_rgn, end_rgn is rgn index		
> 
> Solution B do less remove operations and with no warning comparing to solution A.
> I think solution B is better, could you give some suggestions?
 
Solution B is indeed better than solution A, but I'm still worried by the
relatively large arrays on the stack and the number of loops :(

The very least we could do is call memblock_cap_memory_range() to drop
the memory before and after the ranges we'd like to keep.
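
For illustration, a minimal sketch of that idea (untested; it assumes
regions_to_keep->regions[] is sorted by base and, unlike the current code, it
does not special-case NOMAP regions in the holes):

void __init memblock_cap_memory_ranges(struct memblock_type *regions_to_keep)
{
	struct memblock_region *regs = regions_to_keep->regions;
	int i, nr = regions_to_keep->cnt;
	phys_addr_t base, span;

	if (!nr)
		return;

	/* drop everything below the first kept range and above the last one */
	base = regs[0].base;
	span = regs[nr - 1].base + regs[nr - 1].size - base;
	memblock_cap_memory_range(base, span);

	/* punch out the holes between consecutive kept ranges */
	for (i = 1; i < nr; i++) {
		phys_addr_t hole_base = regs[i - 1].base + regs[i - 1].size;
		phys_addr_t hole_size = regs[i].base - hole_base;

		if (hole_size) {
			memblock_remove(hole_base, hole_size);
			memblock_remove_range(&memblock.reserved,
					      hole_base, hole_size);
		}
	}
}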

> >  
> >  	/* truncate the reserved regions */
> > -	memblock_remove_range(&memblock.reserved, 0, base);
> > -	memblock_remove_range(&memblock.reserved,
> > -			base + size, PHYS_ADDR_MAX);
> > +	for_each_mem_range(i, &memblock.reserved, regions_to_keep, NUMA_NO_NODE,
> > +			   MEMBLOCK_NONE, &start, &end, NULL)
> > +		memblock_remove_range(&memblock.reserved, start, end);
> 
> There are the same issues as above.
> 
> >  }
> >  
> >  void __init memblock_mem_limit_remove_map(phys_addr_t limit)
> >  {
> > +	struct memblock_region rgn = {
> > +		.base = 0,
> > +	};
> > +
> > +	struct memblock_type region_to_keep = {
> > +		.cnt = 1,
> > +		.max = 1,
> > +		.regions = &rgn,
> > +	};
> > +
> >  	phys_addr_t max_addr;
> >  
> >  	if (!limit)
> > @@ -1646,7 +1644,8 @@ void __init memblock_mem_limit_remove_map(phys_addr_t limit)
> >  	if (max_addr == PHYS_ADDR_MAX)
> >  		return;
> >  
> > -	memblock_cap_memory_range(0, max_addr);
> > +	region_to_keep.regions[0].size = max_addr;
> > +	memblock_cap_memory_ranges(&region_to_keep);
> >  }
> >  
> >  static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr)
> > 
> 
> Thanks,
> Chen Zhou
>
Mike Rapoport April 14, 2019, 12:13 p.m. UTC | #5
Hi,

On Tue, Apr 09, 2019 at 06:28:18PM +0800, Chen Zhou wrote:
> After commit (arm64: kdump: support reserving crashkernel above 4G),
> there may be two crash kernel regions, one is below 4G, the other is
> above 4G.
> 
> Crash dump kernel reads more than one crash kernel regions via a dtb
> property under node /chosen,
> linux,usable-memory-range = <BASE1 SIZE1 [BASE2 SIZE2]>

Somehow I've missed that previously, but how is this supposed to work on
EFI systems?
 
> Signed-off-by: Chen Zhou <chenzhou10@huawei.com>
> ---
>  arch/arm64/mm/init.c     | 66 ++++++++++++++++++++++++++++++++++++++++--------
>  include/linux/memblock.h |  6 +++++
>  mm/memblock.c            |  7 ++---
>  3 files changed, 66 insertions(+), 13 deletions(-)
> 
> diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
> index 3bebddf..0f18665 100644
> --- a/arch/arm64/mm/init.c
> +++ b/arch/arm64/mm/init.c
> @@ -65,6 +65,11 @@ phys_addr_t arm64_dma_phys_limit __ro_after_init;
>  
>  #ifdef CONFIG_KEXEC_CORE
>  
> +/* at most two crash kernel regions, low_region and high_region */
> +#define CRASH_MAX_USABLE_RANGES	2
> +#define LOW_REGION_IDX			0
> +#define HIGH_REGION_IDX			1
> +
>  /*
>   * reserve_crashkernel() - reserves memory for crash kernel
>   *
> @@ -297,8 +302,8 @@ static int __init early_init_dt_scan_usablemem(unsigned long node,
>  		const char *uname, int depth, void *data)
>  {
>  	struct memblock_region *usablemem = data;
> -	const __be32 *reg;
> -	int len;
> +	const __be32 *reg, *endp;
> +	int len, nr = 0;
>  
>  	if (depth != 1 || strcmp(uname, "chosen") != 0)
>  		return 0;
> @@ -307,22 +312,63 @@ static int __init early_init_dt_scan_usablemem(unsigned long node,
>  	if (!reg || (len < (dt_root_addr_cells + dt_root_size_cells)))
>  		return 1;
>  
> -	usablemem->base = dt_mem_next_cell(dt_root_addr_cells, &reg);
> -	usablemem->size = dt_mem_next_cell(dt_root_size_cells, &reg);
> +	endp = reg + (len / sizeof(__be32));
> +	while ((endp - reg) >= (dt_root_addr_cells + dt_root_size_cells)) {
> +		usablemem[nr].base = dt_mem_next_cell(dt_root_addr_cells, &reg);
> +		usablemem[nr].size = dt_mem_next_cell(dt_root_size_cells, &reg);
> +
> +		if (++nr >= CRASH_MAX_USABLE_RANGES)
> +			break;
> +	}
>  
>  	return 1;
>  }
>  
>  static void __init fdt_enforce_memory_region(void)
>  {
> -	struct memblock_region reg = {
> -		.size = 0,
> -	};
> +	int i, cnt = 0;
> +	struct memblock_region regs[CRASH_MAX_USABLE_RANGES];
> +
> +	memset(regs, 0, sizeof(regs));
> +	of_scan_flat_dt(early_init_dt_scan_usablemem, regs);
> +
> +	for (i = 0; i < CRASH_MAX_USABLE_RANGES; i++)
> +		if (regs[i].size)
> +			cnt++;
> +		else
> +			break;
> +
> +	if (cnt - 1 == LOW_REGION_IDX)
> +		memblock_cap_memory_range(regs[LOW_REGION_IDX].base,
> +				regs[LOW_REGION_IDX].size);
> +	else if (cnt - 1 == HIGH_REGION_IDX) {
> +		/*
> +		 * Two crash kernel regions, cap the memory range
> +		 * [regs[LOW_REGION_IDX].base, regs[HIGH_REGION_IDX].end]
> +		 * and then remove the memory range in the middle.
> +		 */
> +		int start_rgn, end_rgn, i, ret;
> +		phys_addr_t mid_base, mid_size;
> +
> +		mid_base = regs[LOW_REGION_IDX].base + regs[LOW_REGION_IDX].size;
> +		mid_size = regs[HIGH_REGION_IDX].base - mid_base;
> +		ret = memblock_isolate_range(&memblock.memory, mid_base,
> +				mid_size, &start_rgn, &end_rgn);
>  
> -	of_scan_flat_dt(early_init_dt_scan_usablemem, &reg);
> +		if (ret)
> +			return;
>  
> -	if (reg.size)
> -		memblock_cap_memory_range(reg.base, reg.size);
> +		memblock_cap_memory_range(regs[LOW_REGION_IDX].base,
> +				regs[HIGH_REGION_IDX].base -
> +				regs[LOW_REGION_IDX].base +
> +				regs[HIGH_REGION_IDX].size);
> +		for (i = end_rgn - 1; i >= start_rgn; i--) {
> +			if (!memblock_is_nomap(&memblock.memory.regions[i]))
> +				memblock_remove_region(&memblock.memory, i);
> +		}
> +		memblock_remove_range(&memblock.reserved, mid_base,
> +				mid_base + mid_size);
> +	}
>  }
>  
>  void __init arm64_memblock_init(void)
> diff --git a/include/linux/memblock.h b/include/linux/memblock.h
> index 294d5d8..787d252 100644
> --- a/include/linux/memblock.h
> +++ b/include/linux/memblock.h
> @@ -110,9 +110,15 @@ void memblock_discard(void);
>  
>  phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end,
>  				   phys_addr_t size, phys_addr_t align);
> +void memblock_remove_region(struct memblock_type *type, unsigned long r);
>  void memblock_allow_resize(void);
>  int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid);
>  int memblock_add(phys_addr_t base, phys_addr_t size);
> +int memblock_isolate_range(struct memblock_type *type,
> +					phys_addr_t base, phys_addr_t size,
> +					int *start_rgn, int *end_rgn);
> +int memblock_remove_range(struct memblock_type *type,
> +					phys_addr_t base, phys_addr_t size);
>  int memblock_remove(phys_addr_t base, phys_addr_t size);
>  int memblock_free(phys_addr_t base, phys_addr_t size);
>  int memblock_reserve(phys_addr_t base, phys_addr_t size);
> diff --git a/mm/memblock.c b/mm/memblock.c
> index e7665cf..1846e2d 100644
> --- a/mm/memblock.c
> +++ b/mm/memblock.c
> @@ -357,7 +357,8 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
>  	return ret;
>  }
>  
> -static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r)
> +void __init_memblock memblock_remove_region(struct memblock_type *type,
> +					unsigned long r)
>  {
>  	type->total_size -= type->regions[r].size;
>  	memmove(&type->regions[r], &type->regions[r + 1],
> @@ -724,7 +725,7 @@ int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
>   * Return:
>   * 0 on success, -errno on failure.
>   */
> -static int __init_memblock memblock_isolate_range(struct memblock_type *type,
> +int __init_memblock memblock_isolate_range(struct memblock_type *type,
>  					phys_addr_t base, phys_addr_t size,
>  					int *start_rgn, int *end_rgn)
>  {
> @@ -784,7 +785,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
>  	return 0;
>  }
>  
> -static int __init_memblock memblock_remove_range(struct memblock_type *type,
> +int __init_memblock memblock_remove_range(struct memblock_type *type,
>  					  phys_addr_t base, phys_addr_t size)
>  {
>  	int start_rgn, end_rgn;
> -- 
> 2.7.4
>
chenzhou April 15, 2019, 2:05 a.m. UTC | #6
Hi Mike,

On 2019/4/14 20:10, Mike Rapoport wrote:
> Hi,
> 
> On Thu, Apr 11, 2019 at 08:17:43PM +0800, Chen Zhou wrote:
>> Hi Mike,
>>
>> This overall looks well.
>> Replacing memblock_cap_memory_range() with memblock_cap_memory_ranges() was what i wanted
>> to do in v1, sorry for don't express that clearly.
> 
> I didn't object to memblock_cap_memory_ranges() in general, I was worried
> about it's complexity and I hoped that we could find a simpler solution.
>  
>> But there are some issues as below. After fixing this, it can work correctly.
>>
>> On 2019/4/10 21:09, Mike Rapoport wrote:
>>> Hi,
>>>
>>> On Tue, Apr 09, 2019 at 06:28:18PM +0800, Chen Zhou wrote:
>>>> After commit (arm64: kdump: support reserving crashkernel above 4G),
>>>> there may be two crash kernel regions, one is below 4G, the other is
>>>> above 4G.
>>>>
>>>> Crash dump kernel reads more than one crash kernel regions via a dtb
>>>> property under node /chosen,
>>>> linux,usable-memory-range = <BASE1 SIZE1 [BASE2 SIZE2]>
>>>>
>>>> Signed-off-by: Chen Zhou <chenzhou10@huawei.com>
>>>> ---
>>>>  arch/arm64/mm/init.c     | 66 ++++++++++++++++++++++++++++++++++++++++--------
>>>>  include/linux/memblock.h |  6 +++++
>>>>  mm/memblock.c            |  7 ++---
>>>>  3 files changed, 66 insertions(+), 13 deletions(-)
>>>>
>>>> diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
>>>> index 3bebddf..0f18665 100644
>>>> --- a/arch/arm64/mm/init.c
>>>> +++ b/arch/arm64/mm/init.c
>>>> @@ -65,6 +65,11 @@ phys_addr_t arm64_dma_phys_limit __ro_after_init;
>>>>  
>>>>  #ifdef CONFIG_KEXEC_CORE
>>>>  
>>>> +/* at most two crash kernel regions, low_region and high_region */
>>>> +#define CRASH_MAX_USABLE_RANGES	2
>>>> +#define LOW_REGION_IDX			0
>>>> +#define HIGH_REGION_IDX			1
>>>> +
>>>>  /*
>>>>   * reserve_crashkernel() - reserves memory for crash kernel
>>>>   *
>>>> @@ -297,8 +302,8 @@ static int __init early_init_dt_scan_usablemem(unsigned long node,
>>>>  		const char *uname, int depth, void *data)
>>>>  {
>>>>  	struct memblock_region *usablemem = data;
>>>> -	const __be32 *reg;
>>>> -	int len;
>>>> +	const __be32 *reg, *endp;
>>>> +	int len, nr = 0;
>>>>  
>>>>  	if (depth != 1 || strcmp(uname, "chosen") != 0)
>>>>  		return 0;
>>>> @@ -307,22 +312,63 @@ static int __init early_init_dt_scan_usablemem(unsigned long node,
>>>>  	if (!reg || (len < (dt_root_addr_cells + dt_root_size_cells)))
>>>>  		return 1;
>>>>  
>>>> -	usablemem->base = dt_mem_next_cell(dt_root_addr_cells, &reg);
>>>> -	usablemem->size = dt_mem_next_cell(dt_root_size_cells, &reg);
>>>> +	endp = reg + (len / sizeof(__be32));
>>>> +	while ((endp - reg) >= (dt_root_addr_cells + dt_root_size_cells)) {
>>>> +		usablemem[nr].base = dt_mem_next_cell(dt_root_addr_cells, &reg);
>>>> +		usablemem[nr].size = dt_mem_next_cell(dt_root_size_cells, &reg);
>>>> +
>>>> +		if (++nr >= CRASH_MAX_USABLE_RANGES)
>>>> +			break;
>>>> +	}
>>>>  
>>>>  	return 1;
>>>>  }
>>>>  
>>>>  static void __init fdt_enforce_memory_region(void)
>>>>  {
>>>> -	struct memblock_region reg = {
>>>> -		.size = 0,
>>>> -	};
>>>> +	int i, cnt = 0;
>>>> +	struct memblock_region regs[CRASH_MAX_USABLE_RANGES];
>>>
>>> I only now noticed that fdt_enforce_memory_region() uses memblock_region to
>>> pass the ranges around. If we'd switch to memblock_type instead, the
>>> implementation of memblock_cap_memory_ranges() would be really
>>> straightforward. Can you check if the below patch works for you? 
>>>
>>> >From e476d584098e31273af573e1a78e308880c5cf28 Mon Sep 17 00:00:00 2001
>>> From: Mike Rapoport <rppt@linux.ibm.com>
>>> Date: Wed, 10 Apr 2019 16:02:32 +0300
>>> Subject: [PATCH] memblock: extend memblock_cap_memory_range to multiple ranges
>>>
>>> The memblock_cap_memory_range() removes all the memory except the range
>>> passed to it. Extend this function to recieve memblock_type with the
>>> regions that should be kept. This allows switching to simple iteration over
>>> memblock arrays with 'for_each_mem_range' to remove the unneeded memory.
>>>
>>> Enable use of this function in arm64 for reservation of multile regions for
>>> the crash kernel.
>>>
>>> Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
>>> ---
>>>  arch/arm64/mm/init.c     | 34 ++++++++++++++++++++++++----------
>>>  include/linux/memblock.h |  2 +-
>>>  mm/memblock.c            | 45 ++++++++++++++++++++++-----------------------
>>>  3 files changed, 47 insertions(+), 34 deletions(-)
>>>
>>>  
>>> -void __init memblock_cap_memory_range(phys_addr_t base, phys_addr_t size)
>>> +void __init memblock_cap_memory_ranges(struct memblock_type *regions_to_keep)
>>>  {
>>> -	int start_rgn, end_rgn;
>>> -	int i, ret;
>>> -
>>> -	if (!size)
>>> -		return;
>>> -
>>> -	ret = memblock_isolate_range(&memblock.memory, base, size,
>>> -						&start_rgn, &end_rgn);
>>> -	if (ret)
>>> -		return;
>>> -
>>> -	/* remove all the MAP regions */
>>> -	for (i = memblock.memory.cnt - 1; i >= end_rgn; i--)
>>> -		if (!memblock_is_nomap(&memblock.memory.regions[i]))
>>> -			memblock_remove_region(&memblock.memory, i);
>>> +	phys_addr_t start, end;
>>> +	u64 i;
>>>  
>>> -	for (i = start_rgn - 1; i >= 0; i--)
>>> -		if (!memblock_is_nomap(&memblock.memory.regions[i]))
>>> -			memblock_remove_region(&memblock.memory, i);
>>> +	/* truncate memory while skipping NOMAP regions */
>>> +	for_each_mem_range(i, &memblock.memory, regions_to_keep, NUMA_NO_NODE,
>>> +			   MEMBLOCK_NONE, &start, &end, NULL)
>>> +		memblock_remove(start, end);
>>
>> 1. use memblock_remove(start, size) instead of memblock_remove(start, end).
>>
>> 2. There is a another hidden issue. We couldn't mix __next_mem_range()(called by for_each_mem_range) operation
>> with remove operation because __next_mem_range() records the index of last time. If we do remove between
>> __next_mem_range(), the index may be mess.
> 
> Oops, I've really missed that :)
>  
>> Therefore, we could do remove operation after for_each_mem_range like this, solution A:
>>  void __init memblock_cap_memory_ranges(struct memblock_type *regions_to_keep)
>>  {
>> -	phys_addr_t start, end;
>> -	u64 i;
>> +	phys_addr_t start[INIT_MEMBLOCK_RESERVED_REGIONS * 2];
>> +	phys_addr_t end[INIT_MEMBLOCK_RESERVED_REGIONS * 2];
>> +	u64 i, nr = 0;
>>
>>  	/* truncate memory while skipping NOMAP regions */
>>  	for_each_mem_range(i, &memblock.memory, regions_to_keep, NUMA_NO_NODE,
>> -			   MEMBLOCK_NONE, &start, &end, NULL)
>> -		memblock_remove(start, end);
>> +			   MEMBLOCK_NONE, &start[nr], &end[nr], NULL)
>> +		nr++;
>> +	for (i = 0; i < nr; i++)
>> +		memblock_remove(start[i], end[i] - start[i]);
>>
>>  	/* truncate the reserved regions */
>> +	nr = 0;
>>  	for_each_mem_range(i, &memblock.reserved, regions_to_keep, NUMA_NO_NODE,
>> -			   MEMBLOCK_NONE, &start, &end, NULL)
>> -		memblock_remove_range(&memblock.reserved, start, end);
>> +			   MEMBLOCK_NONE, &start[nr], &end[nr], NULL)
>> +		nr++;
>> +	for (i = 0; i < nr; i++)
>> +		memblock_remove_range(&memblock.reserved, start[i],
>> +				end[i] - start[i]);
>>  }
>>
>> But a warning occurs when compiling:
>>   CALL    scripts/atomic/check-atomics.sh
>>   CALL    scripts/checksyscalls.sh
>>   CHK     include/generated/compile.h
>>   CC      mm/memblock.o
>> mm/memblock.c: In function ‘memblock_cap_memory_ranges’:
>> mm/memblock.c:1635:1: warning: the frame size of 36912 bytes is larger than 2048 bytes [-Wframe-larger-than=]
>>  }
>>
>> another solution is my implementation in v1, solution B:
>> +void __init memblock_cap_memory_ranges(struct memblock_type *regions_to_keep)
>> +{
>> +   int start_rgn[INIT_MEMBLOCK_REGIONS], end_rgn[INIT_MEMBLOCK_REGIONS];
>> +   int i, j, ret, nr = 0;
>> +   memblock_region *regs = regions_to_keep->regions;
>> +
>> +   nr = regions_to_keep -> cnt;
>> +   if (!nr)
>> +       return;
>> +
>> +   /* remove all the MAP regions */
>> +   for (i = memblock.memory.cnt - 1; i >= end_rgn[nr - 1]; i--)
>> +       if (!memblock_is_nomap(&memblock.memory.regions[i]))
>> +           memblock_remove_region(&memblock.memory, i);
>> +
>> +   for (i = nr - 1; i > 0; i--)
>> +       for (j = start_rgn[i] - 1; j >= end_rgn[i - 1]; j--)
>> +           if (!memblock_is_nomap(&memblock.memory.regions[j]))
>> +               memblock_remove_region(&memblock.memory, j);
>> +
>> +   for (i = start_rgn[0] - 1; i >= 0; i--)
>> +       if (!memblock_is_nomap(&memblock.memory.regions[i]))
>> +           memblock_remove_region(&memblock.memory, i);
>> +
>> +   /* truncate the reserved regions */
>> +   memblock_remove_range(&memblock.reserved, 0, regs[0].base);
>> +
>> +   for (i = nr - 1; i > 0; i--)
>> +       memblock_remove_range(&memblock.reserved,
>> +               regs[i - 1].base + regs[i - 1].size,
>> +		regs[i].base - regs[i - 1].base - regs[i - 1].size);
>> +
>> +   memblock_remove_range(&memblock.reserved,
>> +           regs[nr - 1].base + regs[nr - 1].size, PHYS_ADDR_MAX);
>> +}
>>
>> solution A: 	phys_addr_t start[INIT_MEMBLOCK_RESERVED_REGIONS * 2];
>> 		phys_addr_t end[INIT_MEMBLOCK_RESERVED_REGIONS * 2];
>> start, end is physical addr
>>
>> solution B: 	int start_rgn[INIT_MEMBLOCK_REGIONS], end_rgn[INIT_MEMBLOCK_REGIONS];
>> start_rgn, end_rgn is rgn index		
>>
>> Solution B do less remove operations and with no warning comparing to solution A.
>> I think solution B is better, could you give some suggestions?
>  
> Solution B is indeed better that solution A, but I'm still worried by
> relatively large arrays on stack and the amount of loops :(
> 
> The very least we could do is to call memblock_cap_memory_range() to drop
> the memory before and after the ranges we'd like to keep.

1. Relatively large arrays:
as I said above, start_rgn and end_rgn hold region indexes, so we could use an unsigned char type.

2. Loops:
the loops exist either way; a solution that appears to have fewer loops may simply encapsulate them better.
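
For example, narrowing the index arrays would look like this (assuming
INIT_MEMBLOCK_REGIONS still fits in a byte; it is 128 by default):

	/* region indexes, not addresses, so a byte per entry is enough */
	unsigned char start_rgn[INIT_MEMBLOCK_REGIONS];
	unsigned char end_rgn[INIT_MEMBLOCK_REGIONS];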

Thanks,
Chen Zhou

> 
>>>  
>>>  	/* truncate the reserved regions */
>>> -	memblock_remove_range(&memblock.reserved, 0, base);
>>> -	memblock_remove_range(&memblock.reserved,
>>> -			base + size, PHYS_ADDR_MAX);
>>> +	for_each_mem_range(i, &memblock.reserved, regions_to_keep, NUMA_NO_NODE,
>>> +			   MEMBLOCK_NONE, &start, &end, NULL)
>>> +		memblock_remove_range(&memblock.reserved, start, end);
>>
>> There are the same issues as above.
>>
>>>  }
>>>  
>>>  void __init memblock_mem_limit_remove_map(phys_addr_t limit)
>>>  {
>>> +	struct memblock_region rgn = {
>>> +		.base = 0,
>>> +	};
>>> +
>>> +	struct memblock_type region_to_keep = {
>>> +		.cnt = 1,
>>> +		.max = 1,
>>> +		.regions = &rgn,
>>> +	};
>>> +
>>>  	phys_addr_t max_addr;
>>>  
>>>  	if (!limit)
>>> @@ -1646,7 +1644,8 @@ void __init memblock_mem_limit_remove_map(phys_addr_t limit)
>>>  	if (max_addr == PHYS_ADDR_MAX)
>>>  		return;
>>>  
>>> -	memblock_cap_memory_range(0, max_addr);
>>> +	region_to_keep.regions[0].size = max_addr;
>>> +	memblock_cap_memory_ranges(&region_to_keep);
>>>  }
>>>  
>>>  static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr)
>>>
>>
>> Thanks,
>> Chen Zhou
>>
>
chenzhou April 15, 2019, 2:27 a.m. UTC | #7
Hi Mike,

On 2019/4/14 20:13, Mike Rapoport wrote:
> Hi,
> 
> On Tue, Apr 09, 2019 at 06:28:18PM +0800, Chen Zhou wrote:
>> After commit (arm64: kdump: support reserving crashkernel above 4G),
>> there may be two crash kernel regions, one is below 4G, the other is
>> above 4G.
>>
>> Crash dump kernel reads more than one crash kernel regions via a dtb
>> property under node /chosen,
>> linux,usable-memory-range = <BASE1 SIZE1 [BASE2 SIZE2]>
> 
> Somehow I've missed that previously, but how is this supposed to work on
> EFI systems?

Regardless of how the system boots, the arm64 kernel always has an FDT pointer (__fdt_pointer),
and the file /sys/firmware/fdt is created in a late_initcall.

On EFI systems, kexec-tools reads and updates /sys/firmware/fdt so that kdump can
boot the capture kernel.

To support more than one crash kernel region, kexec-tools is changed accordingly.
Details are below:
http://lists.infradead.org/pipermail/kexec/2019-April/022792.html
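
For example (addresses made up purely for illustration, assuming two address cells and
two size cells), a 256M low region at 0xc0000000 plus a 1G high region at 0x2000000000
would show up in the updated dtb as
linux,usable-memory-range = <0x0 0xc0000000 0x0 0x10000000 0x20 0x0 0x0 0x40000000>.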

Thanks,
Chen Zhou

>  
>> Signed-off-by: Chen Zhou <chenzhou10@huawei.com>
>> ---
>>  arch/arm64/mm/init.c     | 66 ++++++++++++++++++++++++++++++++++++++++--------
>>  include/linux/memblock.h |  6 +++++
>>  mm/memblock.c            |  7 ++---
>>  3 files changed, 66 insertions(+), 13 deletions(-)
>>
>> diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
>> index 3bebddf..0f18665 100644
>> --- a/arch/arm64/mm/init.c
>> +++ b/arch/arm64/mm/init.c
>> @@ -65,6 +65,11 @@ phys_addr_t arm64_dma_phys_limit __ro_after_init;
>>  
>>  #ifdef CONFIG_KEXEC_CORE
>>  
>> +/* at most two crash kernel regions, low_region and high_region */
>> +#define CRASH_MAX_USABLE_RANGES	2
>> +#define LOW_REGION_IDX			0
>> +#define HIGH_REGION_IDX			1
>> +
>>  /*
>>   * reserve_crashkernel() - reserves memory for crash kernel
>>   *
>> @@ -297,8 +302,8 @@ static int __init early_init_dt_scan_usablemem(unsigned long node,
>>  		const char *uname, int depth, void *data)
>>  {
>>  	struct memblock_region *usablemem = data;
>> -	const __be32 *reg;
>> -	int len;
>> +	const __be32 *reg, *endp;
>> +	int len, nr = 0;
>>  
>>  	if (depth != 1 || strcmp(uname, "chosen") != 0)
>>  		return 0;
>> @@ -307,22 +312,63 @@ static int __init early_init_dt_scan_usablemem(unsigned long node,
>>  	if (!reg || (len < (dt_root_addr_cells + dt_root_size_cells)))
>>  		return 1;
>>  
>> -	usablemem->base = dt_mem_next_cell(dt_root_addr_cells, &reg);
>> -	usablemem->size = dt_mem_next_cell(dt_root_size_cells, &reg);
>> +	endp = reg + (len / sizeof(__be32));
>> +	while ((endp - reg) >= (dt_root_addr_cells + dt_root_size_cells)) {
>> +		usablemem[nr].base = dt_mem_next_cell(dt_root_addr_cells, &reg);
>> +		usablemem[nr].size = dt_mem_next_cell(dt_root_size_cells, &reg);
>> +
>> +		if (++nr >= CRASH_MAX_USABLE_RANGES)
>> +			break;
>> +	}
>>  
>>  	return 1;
>>  }
>>  
>>  static void __init fdt_enforce_memory_region(void)
>>  {
>> -	struct memblock_region reg = {
>> -		.size = 0,
>> -	};
>> +	int i, cnt = 0;
>> +	struct memblock_region regs[CRASH_MAX_USABLE_RANGES];
>> +
>> +	memset(regs, 0, sizeof(regs));
>> +	of_scan_flat_dt(early_init_dt_scan_usablemem, regs);
>> +
>> +	for (i = 0; i < CRASH_MAX_USABLE_RANGES; i++)
>> +		if (regs[i].size)
>> +			cnt++;
>> +		else
>> +			break;
>> +
>> +	if (cnt - 1 == LOW_REGION_IDX)
>> +		memblock_cap_memory_range(regs[LOW_REGION_IDX].base,
>> +				regs[LOW_REGION_IDX].size);
>> +	else if (cnt - 1 == HIGH_REGION_IDX) {
>> +		/*
>> +		 * Two crash kernel regions, cap the memory range
>> +		 * [regs[LOW_REGION_IDX].base, regs[HIGH_REGION_IDX].end]
>> +		 * and then remove the memory range in the middle.
>> +		 */
>> +		int start_rgn, end_rgn, i, ret;
>> +		phys_addr_t mid_base, mid_size;
>> +
>> +		mid_base = regs[LOW_REGION_IDX].base + regs[LOW_REGION_IDX].size;
>> +		mid_size = regs[HIGH_REGION_IDX].base - mid_base;
>> +		ret = memblock_isolate_range(&memblock.memory, mid_base,
>> +				mid_size, &start_rgn, &end_rgn);
>>  
>> -	of_scan_flat_dt(early_init_dt_scan_usablemem, &reg);
>> +		if (ret)
>> +			return;
>>  
>> -	if (reg.size)
>> -		memblock_cap_memory_range(reg.base, reg.size);
>> +		memblock_cap_memory_range(regs[LOW_REGION_IDX].base,
>> +				regs[HIGH_REGION_IDX].base -
>> +				regs[LOW_REGION_IDX].base +
>> +				regs[HIGH_REGION_IDX].size);
>> +		for (i = end_rgn - 1; i >= start_rgn; i--) {
>> +			if (!memblock_is_nomap(&memblock.memory.regions[i]))
>> +				memblock_remove_region(&memblock.memory, i);
>> +		}
>> +		memblock_remove_range(&memblock.reserved, mid_base,
>> +				mid_base + mid_size);
>> +	}
>>  }
>>  
>>  void __init arm64_memblock_init(void)
>> diff --git a/include/linux/memblock.h b/include/linux/memblock.h
>> index 294d5d8..787d252 100644
>> --- a/include/linux/memblock.h
>> +++ b/include/linux/memblock.h
>> @@ -110,9 +110,15 @@ void memblock_discard(void);
>>  
>>  phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end,
>>  				   phys_addr_t size, phys_addr_t align);
>> +void memblock_remove_region(struct memblock_type *type, unsigned long r);
>>  void memblock_allow_resize(void);
>>  int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid);
>>  int memblock_add(phys_addr_t base, phys_addr_t size);
>> +int memblock_isolate_range(struct memblock_type *type,
>> +					phys_addr_t base, phys_addr_t size,
>> +					int *start_rgn, int *end_rgn);
>> +int memblock_remove_range(struct memblock_type *type,
>> +					phys_addr_t base, phys_addr_t size);
>>  int memblock_remove(phys_addr_t base, phys_addr_t size);
>>  int memblock_free(phys_addr_t base, phys_addr_t size);
>>  int memblock_reserve(phys_addr_t base, phys_addr_t size);
>> diff --git a/mm/memblock.c b/mm/memblock.c
>> index e7665cf..1846e2d 100644
>> --- a/mm/memblock.c
>> +++ b/mm/memblock.c
>> @@ -357,7 +357,8 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
>>  	return ret;
>>  }
>>  
>> -static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r)
>> +void __init_memblock memblock_remove_region(struct memblock_type *type,
>> +					unsigned long r)
>>  {
>>  	type->total_size -= type->regions[r].size;
>>  	memmove(&type->regions[r], &type->regions[r + 1],
>> @@ -724,7 +725,7 @@ int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
>>   * Return:
>>   * 0 on success, -errno on failure.
>>   */
>> -static int __init_memblock memblock_isolate_range(struct memblock_type *type,
>> +int __init_memblock memblock_isolate_range(struct memblock_type *type,
>>  					phys_addr_t base, phys_addr_t size,
>>  					int *start_rgn, int *end_rgn)
>>  {
>> @@ -784,7 +785,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
>>  	return 0;
>>  }
>>  
>> -static int __init_memblock memblock_remove_range(struct memblock_type *type,
>> +int __init_memblock memblock_remove_range(struct memblock_type *type,
>>  					  phys_addr_t base, phys_addr_t size)
>>  {
>>  	int start_rgn, end_rgn;
>> -- 
>> 2.7.4
>>
>
Mike Rapoport April 15, 2019, 4:55 a.m. UTC | #8
On Mon, Apr 15, 2019 at 10:27:30AM +0800, Chen Zhou wrote:
> Hi Mike,
> 
> On 2019/4/14 20:13, Mike Rapoport wrote:
> > Hi,
> > 
> > On Tue, Apr 09, 2019 at 06:28:18PM +0800, Chen Zhou wrote:
> >> After commit (arm64: kdump: support reserving crashkernel above 4G),
> >> there may be two crash kernel regions, one is below 4G, the other is
> >> above 4G.
> >>
> >> Crash dump kernel reads more than one crash kernel regions via a dtb
> >> property under node /chosen,
> >> linux,usable-memory-range = <BASE1 SIZE1 [BASE2 SIZE2]>
> > 
> > Somehow I've missed that previously, but how is this supposed to work on
> > EFI systems?
> 
> Regardless of how the system boots, the arm64 kernel always has an FDT pointer (__fdt_pointer),
> and the file /sys/firmware/fdt is created in a late_initcall.
> 
> On EFI systems, kexec-tools reads and updates /sys/firmware/fdt so that kdump can
> boot the capture kernel.
> 
> To support more than one crash kernel region, kexec-tools is changed accordingly.
> Details are below:
> http://lists.infradead.org/pipermail/kexec/2019-April/022792.html
 
Thanks for the clarification!

> Thanks,
> Chen Zhou
> 
> >  
> >> Signed-off-by: Chen Zhou <chenzhou10@huawei.com>
> >> ---
> >>  arch/arm64/mm/init.c     | 66 ++++++++++++++++++++++++++++++++++++++++--------
> >>  include/linux/memblock.h |  6 +++++
> >>  mm/memblock.c            |  7 ++---
> >>  3 files changed, 66 insertions(+), 13 deletions(-)
> >>
> >> diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
> >> index 3bebddf..0f18665 100644
> >> --- a/arch/arm64/mm/init.c
> >> +++ b/arch/arm64/mm/init.c
> >> @@ -65,6 +65,11 @@ phys_addr_t arm64_dma_phys_limit __ro_after_init;
> >>  
> >>  #ifdef CONFIG_KEXEC_CORE
> >>  
> >> +/* at most two crash kernel regions, low_region and high_region */
> >> +#define CRASH_MAX_USABLE_RANGES	2
> >> +#define LOW_REGION_IDX			0
> >> +#define HIGH_REGION_IDX			1
> >> +
> >>  /*
> >>   * reserve_crashkernel() - reserves memory for crash kernel
> >>   *
> >> @@ -297,8 +302,8 @@ static int __init early_init_dt_scan_usablemem(unsigned long node,
> >>  		const char *uname, int depth, void *data)
> >>  {
> >>  	struct memblock_region *usablemem = data;
> >> -	const __be32 *reg;
> >> -	int len;
> >> +	const __be32 *reg, *endp;
> >> +	int len, nr = 0;
> >>  
> >>  	if (depth != 1 || strcmp(uname, "chosen") != 0)
> >>  		return 0;
> >> @@ -307,22 +312,63 @@ static int __init early_init_dt_scan_usablemem(unsigned long node,
> >>  	if (!reg || (len < (dt_root_addr_cells + dt_root_size_cells)))
> >>  		return 1;
> >>  
> >> -	usablemem->base = dt_mem_next_cell(dt_root_addr_cells, &reg);
> >> -	usablemem->size = dt_mem_next_cell(dt_root_size_cells, &reg);
> >> +	endp = reg + (len / sizeof(__be32));
> >> +	while ((endp - reg) >= (dt_root_addr_cells + dt_root_size_cells)) {
> >> +		usablemem[nr].base = dt_mem_next_cell(dt_root_addr_cells, &reg);
> >> +		usablemem[nr].size = dt_mem_next_cell(dt_root_size_cells, &reg);
> >> +
> >> +		if (++nr >= CRASH_MAX_USABLE_RANGES)
> >> +			break;
> >> +	}
> >>  
> >>  	return 1;
> >>  }
> >>  
> >>  static void __init fdt_enforce_memory_region(void)
> >>  {
> >> -	struct memblock_region reg = {
> >> -		.size = 0,
> >> -	};
> >> +	int i, cnt = 0;
> >> +	struct memblock_region regs[CRASH_MAX_USABLE_RANGES];
> >> +
> >> +	memset(regs, 0, sizeof(regs));
> >> +	of_scan_flat_dt(early_init_dt_scan_usablemem, regs);
> >> +
> >> +	for (i = 0; i < CRASH_MAX_USABLE_RANGES; i++)
> >> +		if (regs[i].size)
> >> +			cnt++;
> >> +		else
> >> +			break;
> >> +
> >> +	if (cnt - 1 == LOW_REGION_IDX)
> >> +		memblock_cap_memory_range(regs[LOW_REGION_IDX].base,
> >> +				regs[LOW_REGION_IDX].size);
> >> +	else if (cnt - 1 == HIGH_REGION_IDX) {
> >> +		/*
> >> +		 * Two crash kernel regions, cap the memory range
> >> +		 * [regs[LOW_REGION_IDX].base, regs[HIGH_REGION_IDX].end]
> >> +		 * and then remove the memory range in the middle.
> >> +		 */
> >> +		int start_rgn, end_rgn, i, ret;
> >> +		phys_addr_t mid_base, mid_size;
> >> +
> >> +		mid_base = regs[LOW_REGION_IDX].base + regs[LOW_REGION_IDX].size;
> >> +		mid_size = regs[HIGH_REGION_IDX].base - mid_base;
> >> +		ret = memblock_isolate_range(&memblock.memory, mid_base,
> >> +				mid_size, &start_rgn, &end_rgn);
> >>  
> >> -	of_scan_flat_dt(early_init_dt_scan_usablemem, &reg);
> >> +		if (ret)
> >> +			return;
> >>  
> >> -	if (reg.size)
> >> -		memblock_cap_memory_range(reg.base, reg.size);
> >> +		memblock_cap_memory_range(regs[LOW_REGION_IDX].base,
> >> +				regs[HIGH_REGION_IDX].base -
> >> +				regs[LOW_REGION_IDX].base +
> >> +				regs[HIGH_REGION_IDX].size);
> >> +		for (i = end_rgn - 1; i >= start_rgn; i--) {
> >> +			if (!memblock_is_nomap(&memblock.memory.regions[i]))
> >> +				memblock_remove_region(&memblock.memory, i);
> >> +		}
> >> +		memblock_remove_range(&memblock.reserved, mid_base,
> >> +				mid_base + mid_size);
> >> +	}
> >>  }
> >>  
> >>  void __init arm64_memblock_init(void)
> >> diff --git a/include/linux/memblock.h b/include/linux/memblock.h
> >> index 294d5d8..787d252 100644
> >> --- a/include/linux/memblock.h
> >> +++ b/include/linux/memblock.h
> >> @@ -110,9 +110,15 @@ void memblock_discard(void);
> >>  
> >>  phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end,
> >>  				   phys_addr_t size, phys_addr_t align);
> >> +void memblock_remove_region(struct memblock_type *type, unsigned long r);
> >>  void memblock_allow_resize(void);
> >>  int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid);
> >>  int memblock_add(phys_addr_t base, phys_addr_t size);
> >> +int memblock_isolate_range(struct memblock_type *type,
> >> +					phys_addr_t base, phys_addr_t size,
> >> +					int *start_rgn, int *end_rgn);
> >> +int memblock_remove_range(struct memblock_type *type,
> >> +					phys_addr_t base, phys_addr_t size);
> >>  int memblock_remove(phys_addr_t base, phys_addr_t size);
> >>  int memblock_free(phys_addr_t base, phys_addr_t size);
> >>  int memblock_reserve(phys_addr_t base, phys_addr_t size);
> >> diff --git a/mm/memblock.c b/mm/memblock.c
> >> index e7665cf..1846e2d 100644
> >> --- a/mm/memblock.c
> >> +++ b/mm/memblock.c
> >> @@ -357,7 +357,8 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
> >>  	return ret;
> >>  }
> >>  
> >> -static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r)
> >> +void __init_memblock memblock_remove_region(struct memblock_type *type,
> >> +					unsigned long r)
> >>  {
> >>  	type->total_size -= type->regions[r].size;
> >>  	memmove(&type->regions[r], &type->regions[r + 1],
> >> @@ -724,7 +725,7 @@ int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
> >>   * Return:
> >>   * 0 on success, -errno on failure.
> >>   */
> >> -static int __init_memblock memblock_isolate_range(struct memblock_type *type,
> >> +int __init_memblock memblock_isolate_range(struct memblock_type *type,
> >>  					phys_addr_t base, phys_addr_t size,
> >>  					int *start_rgn, int *end_rgn)
> >>  {
> >> @@ -784,7 +785,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
> >>  	return 0;
> >>  }
> >>  
> >> -static int __init_memblock memblock_remove_range(struct memblock_type *type,
> >> +int __init_memblock memblock_remove_range(struct memblock_type *type,
> >>  					  phys_addr_t base, phys_addr_t size)
> >>  {
> >>  	int start_rgn, end_rgn;
> >> -- 
> >> 2.7.4
> >>
> > 
>
Mike Rapoport April 15, 2019, 5:04 a.m. UTC | #9
Hi,

On Mon, Apr 15, 2019 at 10:05:18AM +0800, Chen Zhou wrote:
> Hi Mike,
> 
> On 2019/4/14 20:10, Mike Rapoport wrote:
> >>
> >> solution A: 	phys_addr_t start[INIT_MEMBLOCK_RESERVED_REGIONS * 2];
> >> 		phys_addr_t end[INIT_MEMBLOCK_RESERVED_REGIONS * 2];
> >> start and end are physical addresses
> >>
> >> solution B: 	int start_rgn[INIT_MEMBLOCK_REGIONS], end_rgn[INIT_MEMBLOCK_REGIONS];
> >> start_rgn and end_rgn are region indexes
> >>
> >> Solution B does fewer remove operations and produces no warning compared with solution A.
> >> I think solution B is better; could you give some suggestions?
> >  
> > Solution B is indeed better than solution A, but I'm still worried by
> > the relatively large arrays on the stack and the number of loops :(
> > 
> > The very least we could do is to call memblock_cap_memory_range() to drop
> > the memory before and after the ranges we'd like to keep.
> 
> 1. relatively large arrays
> As I said above, start_rgn and end_rgn hold region indexes, so we could use the unsigned char type.

Let's stick to int for now

> 2. loops
> The loops are unavoidable; a solution with fewer visible loops would just be hiding them behind better encapsulation.

Of course the loops are there; I just hoped we could get rid of the nested
loop and get away with single passes in all cases.
Apparently it's not the case :(

> Thanks,
> Chen Zhou
>
diff mbox series

Patch

diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 3bebddf..0f18665 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -65,6 +65,11 @@  phys_addr_t arm64_dma_phys_limit __ro_after_init;
 
 #ifdef CONFIG_KEXEC_CORE
 
+/* at most two crash kernel regions, low_region and high_region */
+#define CRASH_MAX_USABLE_RANGES	2
+#define LOW_REGION_IDX			0
+#define HIGH_REGION_IDX			1
+
 /*
  * reserve_crashkernel() - reserves memory for crash kernel
  *
@@ -297,8 +302,8 @@  static int __init early_init_dt_scan_usablemem(unsigned long node,
 		const char *uname, int depth, void *data)
 {
 	struct memblock_region *usablemem = data;
-	const __be32 *reg;
-	int len;
+	const __be32 *reg, *endp;
+	int len, nr = 0;
 
 	if (depth != 1 || strcmp(uname, "chosen") != 0)
 		return 0;
@@ -307,22 +312,63 @@  static int __init early_init_dt_scan_usablemem(unsigned long node,
 	if (!reg || (len < (dt_root_addr_cells + dt_root_size_cells)))
 		return 1;
 
-	usablemem->base = dt_mem_next_cell(dt_root_addr_cells, &reg);
-	usablemem->size = dt_mem_next_cell(dt_root_size_cells, &reg);
+	endp = reg + (len / sizeof(__be32));
+	while ((endp - reg) >= (dt_root_addr_cells + dt_root_size_cells)) {
+		usablemem[nr].base = dt_mem_next_cell(dt_root_addr_cells, &reg);
+		usablemem[nr].size = dt_mem_next_cell(dt_root_size_cells, &reg);
+
+		if (++nr >= CRASH_MAX_USABLE_RANGES)
+			break;
+	}
 
 	return 1;
 }
 
 static void __init fdt_enforce_memory_region(void)
 {
-	struct memblock_region reg = {
-		.size = 0,
-	};
+	int i, cnt = 0;
+	struct memblock_region regs[CRASH_MAX_USABLE_RANGES];
+
+	memset(regs, 0, sizeof(regs));
+	of_scan_flat_dt(early_init_dt_scan_usablemem, regs);
+
+	for (i = 0; i < CRASH_MAX_USABLE_RANGES; i++)
+		if (regs[i].size)
+			cnt++;
+		else
+			break;
+
+	if (cnt - 1 == LOW_REGION_IDX)
+		memblock_cap_memory_range(regs[LOW_REGION_IDX].base,
+				regs[LOW_REGION_IDX].size);
+	else if (cnt - 1 == HIGH_REGION_IDX) {
+		/*
+		 * Two crash kernel regions, cap the memory range
+		 * [regs[LOW_REGION_IDX].base, regs[HIGH_REGION_IDX].end]
+		 * and then remove the memory range in the middle.
+		 */
+		int start_rgn, end_rgn, i, ret;
+		phys_addr_t mid_base, mid_size;
+
+		mid_base = regs[LOW_REGION_IDX].base + regs[LOW_REGION_IDX].size;
+		mid_size = regs[HIGH_REGION_IDX].base - mid_base;
+		ret = memblock_isolate_range(&memblock.memory, mid_base,
+				mid_size, &start_rgn, &end_rgn);
 
-	of_scan_flat_dt(early_init_dt_scan_usablemem, &reg);
+		if (ret)
+			return;
 
-	if (reg.size)
-		memblock_cap_memory_range(reg.base, reg.size);
+		memblock_cap_memory_range(regs[LOW_REGION_IDX].base,
+				regs[HIGH_REGION_IDX].base -
+				regs[LOW_REGION_IDX].base +
+				regs[HIGH_REGION_IDX].size);
+		for (i = end_rgn - 1; i >= start_rgn; i--) {
+			if (!memblock_is_nomap(&memblock.memory.regions[i]))
+				memblock_remove_region(&memblock.memory, i);
+		}
+		memblock_remove_range(&memblock.reserved, mid_base,
+				mid_base + mid_size);
+	}
 }
 
 void __init arm64_memblock_init(void)
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 294d5d8..787d252 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -110,9 +110,15 @@  void memblock_discard(void);
 
 phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end,
 				   phys_addr_t size, phys_addr_t align);
+void memblock_remove_region(struct memblock_type *type, unsigned long r);
 void memblock_allow_resize(void);
 int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid);
 int memblock_add(phys_addr_t base, phys_addr_t size);
+int memblock_isolate_range(struct memblock_type *type,
+					phys_addr_t base, phys_addr_t size,
+					int *start_rgn, int *end_rgn);
+int memblock_remove_range(struct memblock_type *type,
+					phys_addr_t base, phys_addr_t size);
 int memblock_remove(phys_addr_t base, phys_addr_t size);
 int memblock_free(phys_addr_t base, phys_addr_t size);
 int memblock_reserve(phys_addr_t base, phys_addr_t size);
diff --git a/mm/memblock.c b/mm/memblock.c
index e7665cf..1846e2d 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -357,7 +357,8 @@  phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
 	return ret;
 }
 
-static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r)
+void __init_memblock memblock_remove_region(struct memblock_type *type,
+					unsigned long r)
 {
 	type->total_size -= type->regions[r].size;
 	memmove(&type->regions[r], &type->regions[r + 1],
@@ -724,7 +725,7 @@  int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
  * Return:
  * 0 on success, -errno on failure.
  */
-static int __init_memblock memblock_isolate_range(struct memblock_type *type,
+int __init_memblock memblock_isolate_range(struct memblock_type *type,
 					phys_addr_t base, phys_addr_t size,
 					int *start_rgn, int *end_rgn)
 {
@@ -784,7 +785,7 @@  static int __init_memblock memblock_isolate_range(struct memblock_type *type,
 	return 0;
 }
 
-static int __init_memblock memblock_remove_range(struct memblock_type *type,
+int __init_memblock memblock_remove_range(struct memblock_type *type,
 					  phys_addr_t base, phys_addr_t size)
 {
 	int start_rgn, end_rgn;