diff mbox series

[v24,3/6] arm64: kdump: Reimplement crashkernel=X

Message ID 20220506114402.365-4-thunder.leizhen@huawei.com (mailing list archive)
State New, archived
Headers show
Series support reserving crashkernel above 4G on arm64 kdump | expand

Commit Message

Leizhen (ThunderTown) May 6, 2022, 11:43 a.m. UTC
From: Chen Zhou <chenzhou10@huawei.com>

There are the following issues in arm64 kdump:
1. We use crashkernel=X to reserve the crashkernel in the DMA zone, which
will fail when there is not enough low memory.
2. If the crashkernel is reserved above the DMA zone, the crash dump
kernel will fail to boot because there is no low memory available
for allocation.

To solve these issues, introduce crashkernel=X,[high,low].
The "crashkernel=X,high" is used to select a region above DMA zone, and
the "crashkernel=Y,low" is used to allocate specified size low memory.

Signed-off-by: Chen Zhou <chenzhou10@huawei.com>
Co-developed-by: Zhen Lei <thunder.leizhen@huawei.com>
Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
---
 arch/arm64/kernel/machine_kexec.c      |  9 +++-
 arch/arm64/kernel/machine_kexec_file.c | 12 ++++-
 arch/arm64/mm/init.c                   | 63 +++++++++++++++++++++++---
 3 files changed, 74 insertions(+), 10 deletions(-)

Comments

Baoquan He May 6, 2022, 11:10 p.m. UTC | #1
On 05/06/22 at 07:43pm, Zhen Lei wrote:
......  
> @@ -118,8 +162,7 @@ static void __init reserve_crashkernel(void)
>  	if (crash_base)
>  		crash_max = crash_base + crash_size;
>  
> -	/* Current arm64 boot protocol requires 2MB alignment */
> -	crash_base = memblock_phys_alloc_range(crash_size, SZ_2M,
> +	crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN,
>  					       crash_base, crash_max);
>  	if (!crash_base) {
>  		pr_warn("cannot allocate crashkernel (size:0x%llx)\n",
> @@ -127,6 +170,11 @@ static void __init reserve_crashkernel(void)
>  		return;
>  	}
>  

There are corner cases missed, e.g.:
1) ,high and ,low are specified, but CONFIG_ZONE_DMA|DMA32 is not enabled;
2) ,high and ,low are specified, but the whole system memory is under 4G.

The check below can filter them out:
        
	if (crash_base > arm64_dma_phys_limit && crash_low_size &&
	    reserve_crashkernel_low(crash_low_size)) {

What's your opinion? Leave it and add documentation to notify the user, or fix it
with a code change?

I would suggest merging this series, Lei can add this corner case
handling on top. Since this is a newly added support, we don't have
to make it one step. Doing step by step can make reviewing easier.

> +	if (crash_low_size && reserve_crashkernel_low(crash_low_size)) {
> +		memblock_phys_free(crash_base, crash_size);
> +		return;
> +	}
> +
>  	pr_info("crashkernel reserved: 0x%016llx - 0x%016llx (%lld MB)\n",
>  		crash_base, crash_base + crash_size, crash_size >> 20);
>  
> @@ -135,6 +183,9 @@ static void __init reserve_crashkernel(void)
>  	 * map. Inform kmemleak so that it won't try to access it.
>  	 */
>  	kmemleak_ignore_phys(crash_base);
> +	if (crashk_low_res.end)
> +		kmemleak_ignore_phys(crashk_low_res.start);
> +
>  	crashk_res.start = crash_base;
>  	crashk_res.end = crash_base + crash_size - 1;
>  	insert_resource(&iomem_resource, &crashk_res);
> -- 
> 2.25.1
>
Leizhen (ThunderTown) May 7, 2022, 1:34 a.m. UTC | #2
On 2022/5/7 7:10, Baoquan He wrote:
> On 05/06/22 at 07:43pm, Zhen Lei wrote:
> ......  
>> @@ -118,8 +162,7 @@ static void __init reserve_crashkernel(void)
>>  	if (crash_base)
>>  		crash_max = crash_base + crash_size;
>>  
>> -	/* Current arm64 boot protocol requires 2MB alignment */
>> -	crash_base = memblock_phys_alloc_range(crash_size, SZ_2M,
>> +	crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN,
>>  					       crash_base, crash_max);
>>  	if (!crash_base) {
>>  		pr_warn("cannot allocate crashkernel (size:0x%llx)\n",
>> @@ -127,6 +170,11 @@ static void __init reserve_crashkernel(void)
>>  		return;
>>  	}
>>  
> 
> There's corner case missed, e.g
> 1) ,high and ,low are specified, CONFIG_ZONE_DMA|DMA32 is not enabled;
> 2) ,high and ,low are specified, the whole system memory is under 4G.
> 
> Below judgement can filter them away:
>         
> 	if (crash_base > arm64_dma_phys_limit && crash_low_size &&
> 	    reserve_crashkernel_low(crash_low_size)) {
> 
> What's your opinion? Leave it and add document to notice user, or fix it
> with code change?

I think maybe we can leave it unchanged. If the user configures two memory ranges,
we had better reserve both. Otherwise, the user will be confused when querying the
reserved ranges. Currently, crash_low_size is non-zero only when 'crashkernel=Y,low'
is explicitly configured.

> 
> I would suggest merging this series, Lei can add this corner case
> handling on top. Since this is a newly added support, we don't have
> to make it one step. Doing step by step can make reviewing easier.
> 
>> +	if (crash_low_size && reserve_crashkernel_low(crash_low_size)) {
>> +		memblock_phys_free(crash_base, crash_size);
>> +		return;
>> +	}
>> +
>>  	pr_info("crashkernel reserved: 0x%016llx - 0x%016llx (%lld MB)\n",
>>  		crash_base, crash_base + crash_size, crash_size >> 20);
>>  
>> @@ -135,6 +183,9 @@ static void __init reserve_crashkernel(void)
>>  	 * map. Inform kmemleak so that it won't try to access it.
>>  	 */
>>  	kmemleak_ignore_phys(crash_base);
>> +	if (crashk_low_res.end)
>> +		kmemleak_ignore_phys(crashk_low_res.start);
>> +
>>  	crashk_res.start = crash_base;
>>  	crashk_res.end = crash_base + crash_size - 1;
>>  	insert_resource(&iomem_resource, &crashk_res);
>> -- 
>> 2.25.1
>>
> 
> .
>
Baoquan He May 7, 2022, 2:07 a.m. UTC | #3
On 05/07/22 at 09:34am, Leizhen (ThunderTown) wrote:
> 
> 
> On 2022/5/7 7:10, Baoquan He wrote:
> > On 05/06/22 at 07:43pm, Zhen Lei wrote:
> > ......  
> >> @@ -118,8 +162,7 @@ static void __init reserve_crashkernel(void)
> >>  	if (crash_base)
> >>  		crash_max = crash_base + crash_size;
> >>  
> >> -	/* Current arm64 boot protocol requires 2MB alignment */
> >> -	crash_base = memblock_phys_alloc_range(crash_size, SZ_2M,
> >> +	crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN,
> >>  					       crash_base, crash_max);
> >>  	if (!crash_base) {
> >>  		pr_warn("cannot allocate crashkernel (size:0x%llx)\n",
> >> @@ -127,6 +170,11 @@ static void __init reserve_crashkernel(void)
> >>  		return;
> >>  	}
> >>  
> > 
> > There's corner case missed, e.g
> > 1) ,high and ,low are specified, CONFIG_ZONE_DMA|DMA32 is not enabled;
> > 2) ,high and ,low are specified, the whole system memory is under 4G.
> > 
> > Below judgement can filter them away:
> >         
> > 	if (crash_base > arm64_dma_phys_limit && crash_low_size &&
> > 	    reserve_crashkernel_low(crash_low_size)) {
> > 
> > What's your opinion? Leave it and add document to notice user, or fix it
> > with code change?
> 
> I think maybe we can leave it unchanged. If the user configures two memory ranges,
> we'd better apply for two. Otherwise, he'll be confused when he inquires. Currently,
> crash_low_size is non-zero only when 'crashkernel=Y,low' is explicitly configured.

Then the user needs to know the system information, e.g. how much high
memory and low memory there is, and whether CONFIG_ZONE_DMA|DMA32 is
enabled. And we need to describe these cases in the documentation. Any
corner case or exception needs to be noted if we don't handle it in code.

I care about this very much because we have CI with existing test cases
to run on the system, and QA will check these manually too. Support
engineers need detailed documentation if anything special happens.
Anything unclear or uncovered will be reported as a bug to our kernel
developers. I guess your company does something similar.

This crashkernel,high and crashkernel,low reservation is special if we
allow ,high and ,low to exist in the same zone. Imagine that on a system
with CONFIG_ZONE_DMA|DMA32 disabled, someone copies crashkernel=512M,high
and crashkernel=128M,low from another system, and gets
crash_res at [5G, 5G+512M], while crash_low_res is at [6G, 6G+128M]. Guess
how they will judge us.

> 
> > 
> > I would suggest merging this series, Lei can add this corner case
> > handling on top. Since this is a newly added support, we don't have
> > to make it one step. Doing step by step can make reviewing easier.
> > 
> >> +	if (crash_low_size && reserve_crashkernel_low(crash_low_size)) {
> >> +		memblock_phys_free(crash_base, crash_size);
> >> +		return;
> >> +	}
> >> +
> >>  	pr_info("crashkernel reserved: 0x%016llx - 0x%016llx (%lld MB)\n",
> >>  		crash_base, crash_base + crash_size, crash_size >> 20);
> >>  
> >> @@ -135,6 +183,9 @@ static void __init reserve_crashkernel(void)
> >>  	 * map. Inform kmemleak so that it won't try to access it.
> >>  	 */
> >>  	kmemleak_ignore_phys(crash_base);
> >> +	if (crashk_low_res.end)
> >> +		kmemleak_ignore_phys(crashk_low_res.start);
> >> +
> >>  	crashk_res.start = crash_base;
> >>  	crashk_res.end = crash_base + crash_size - 1;
> >>  	insert_resource(&iomem_resource, &crashk_res);
> >> -- 
> >> 2.25.1
> >>
> > 
> > .
> > 
> 
> -- 
> Regards,
>   Zhen Lei
>
Leizhen (ThunderTown) May 7, 2022, 3:37 a.m. UTC | #4
On 2022/5/7 10:07, Baoquan He wrote:
> On 05/07/22 at 09:34am, Leizhen (ThunderTown) wrote:
>>
>>
>> On 2022/5/7 7:10, Baoquan He wrote:
>>> On 05/06/22 at 07:43pm, Zhen Lei wrote:
>>> ......  
>>>> @@ -118,8 +162,7 @@ static void __init reserve_crashkernel(void)
>>>>  	if (crash_base)
>>>>  		crash_max = crash_base + crash_size;
>>>>  
>>>> -	/* Current arm64 boot protocol requires 2MB alignment */
>>>> -	crash_base = memblock_phys_alloc_range(crash_size, SZ_2M,
>>>> +	crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN,
>>>>  					       crash_base, crash_max);
>>>>  	if (!crash_base) {
>>>>  		pr_warn("cannot allocate crashkernel (size:0x%llx)\n",
>>>> @@ -127,6 +170,11 @@ static void __init reserve_crashkernel(void)
>>>>  		return;
>>>>  	}
>>>>  
>>>
>>> There's corner case missed, e.g
>>> 1) ,high and ,low are specified, CONFIG_ZONE_DMA|DMA32 is not enabled;
>>> 2) ,high and ,low are specified, the whole system memory is under 4G.
>>>
>>> Below judgement can filter them away:
>>>         
>>> 	if (crash_base > arm64_dma_phys_limit && crash_low_size &&
>>> 	    reserve_crashkernel_low(crash_low_size)) {
>>>
>>> What's your opinion? Leave it and add document to notice user, or fix it
>>> with code change?
>>
>> I think maybe we can leave it unchanged. If the user configures two memory ranges,
>> we'd better apply for two. Otherwise, he'll be confused when he inquires. Currently,
>> crash_low_size is non-zero only when 'crashkernel=Y,low' is explicitly configured.
> 
> Then user need know the system information, e.g how much is the high
> memory, low memory, if CONFIG_ZONE_DMA|DMA32 is enabled. And we need
> describe these cases in document. Any corner case or exception need
> be noted if we don't handle it in code.
> 
> Caring about this very much because we have CI with existed test cases
> to run on the system, and QA will check these manually too. Support
> engineer need detailed document if anything special but happened.
> Anything unclear or uncovered will be reported as bug to our kernel dev.
> Guess your company do the similar thing like this.
> 
> This crashkerne,high and crashkernel,low reservation is special if we
> allow ,high, ,low existing in the same zone. Imagine on system with
> CONFIG_ZONE_DMA|DMA32 disabled, people copy the crashkernel=512M,high
> and crashkernel=128M,low from other system, and he could get
> crash_res at [5G, 5G+512M], while crash_low_res at [6G, 6G+128M]. Guess
> how they will judge us.

OK, I got it.

> 
>>
>>>
>>> I would suggest merging this series, Lei can add this corner case
>>> handling on top. Since this is a newly added support, we don't have
>>> to make it one step. Doing step by step can make reviewing easier.
>>>
>>>> +	if (crash_low_size && reserve_crashkernel_low(crash_low_size)) {
>>>> +		memblock_phys_free(crash_base, crash_size);
>>>> +		return;
>>>> +	}
>>>> +
>>>>  	pr_info("crashkernel reserved: 0x%016llx - 0x%016llx (%lld MB)\n",
>>>>  		crash_base, crash_base + crash_size, crash_size >> 20);
>>>>  
>>>> @@ -135,6 +183,9 @@ static void __init reserve_crashkernel(void)
>>>>  	 * map. Inform kmemleak so that it won't try to access it.
>>>>  	 */
>>>>  	kmemleak_ignore_phys(crash_base);
>>>> +	if (crashk_low_res.end)
>>>> +		kmemleak_ignore_phys(crashk_low_res.start);
>>>> +
>>>>  	crashk_res.start = crash_base;
>>>>  	crashk_res.end = crash_base + crash_size - 1;
>>>>  	insert_resource(&iomem_resource, &crashk_res);
>>>> -- 
>>>> 2.25.1
>>>>
>>>
>>> .
>>>
>>
>> -- 
>> Regards,
>>   Zhen Lei
>>
> 
> .
>
Leizhen (ThunderTown) May 7, 2022, 9:35 a.m. UTC | #5
On 2022/5/7 11:37, Leizhen (ThunderTown) wrote:
> 
> 
> On 2022/5/7 10:07, Baoquan He wrote:
>> On 05/07/22 at 09:34am, Leizhen (ThunderTown) wrote:
>>>
>>>
>>> On 2022/5/7 7:10, Baoquan He wrote:
>>>> On 05/06/22 at 07:43pm, Zhen Lei wrote:
>>>> ......  
>>>>> @@ -118,8 +162,7 @@ static void __init reserve_crashkernel(void)
>>>>>  	if (crash_base)
>>>>>  		crash_max = crash_base + crash_size;
>>>>>  
>>>>> -	/* Current arm64 boot protocol requires 2MB alignment */
>>>>> -	crash_base = memblock_phys_alloc_range(crash_size, SZ_2M,
>>>>> +	crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN,
>>>>>  					       crash_base, crash_max);
>>>>>  	if (!crash_base) {
>>>>>  		pr_warn("cannot allocate crashkernel (size:0x%llx)\n",
>>>>> @@ -127,6 +170,11 @@ static void __init reserve_crashkernel(void)
>>>>>  		return;
>>>>>  	}
>>>>>  
>>>>
>>>> There's corner case missed, e.g
>>>> 1) ,high and ,low are specified, CONFIG_ZONE_DMA|DMA32 is not enabled;
>>>> 2) ,high and ,low are specified, the whole system memory is under 4G.
>>>>
>>>> Below judgement can filter them away:
>>>>         
>>>> 	if (crash_base > arm64_dma_phys_limit && crash_low_size &&
>>>> 	    reserve_crashkernel_low(crash_low_size)) {
>>>>
>>>> What's your opinion? Leave it and add document to notice user, or fix it
>>>> with code change?

I decided to modify the code and the documentation. But the code changes
aren't what you suggested, for the following reasons:
1. The memory allocated for 'high' may be partially under 4G. So the low
   memory may not be enough. Of course, it's rare.
2. The second kernel can work properly only when both the high and low memory
   are successfully reserved. For example, high=128M, low=128M, but the
   second kernel needs 256M.

So for the cases you listed:
1) ,high and ,low are specified, CONFIG_ZONE_DMA|DMA32 is not enabled;
   --> Follow your suggestion: ignore crashkernel=Y,low and don't allocate low memory.

@@ -100,6 +100,14 @@ static int __init reserve_crashkernel_low(unsigned long long low_size)
 {
        unsigned long long low_base;

+       /*
+        * The kernel does not have any DMA zone, so the range of each DMA
+        * zone is unknown. Please make sure both CONFIG_ZONE_DMA and
+        * CONFIG_ZONE_DMA32 are also not set in the second kernel.
+        */
+       if (!IS_ENABLED(CONFIG_ZONE_DMA) && !IS_ENABLED(CONFIG_ZONE_DMA32))
+               return 0;
+

2) ,high and ,low are specified, the whole system memory is under 4G.
   --> Two memory ranges will be allocated, with the sizes that 'high' and 'low' specify.
   --> Yes, the memory of 'low' may be above 'high', but 'high' is just a hint to allocate
   --> from the top and try high memory first. Of course, this may cause kexec to fail to load,
   --> because the 'low' memory, which is small, will be used to store the Image, etc.
   --> But the memory of 'low' being above 'high' is almost impossible: we use the memblock API to
   --> allocate memory from top to bottom, so 'low' above 'high' would need a sizeable memory block
   --> (128M, 256M?) to be freed during the init phase.
   --> Maybe I should add: crash_max = min(crash_base, CRASH_ADDR_LOW_MAX);
   --> to make sure the memory of 'low' is always under 'high'.

>>>
>>> I think maybe we can leave it unchanged. If the user configures two memory ranges,
>>> we'd better apply for two. Otherwise, he'll be confused when he inquires. Currently,
>>> crash_low_size is non-zero only when 'crashkernel=Y,low' is explicitly configured.
>>
>> Then user need know the system information, e.g how much is the high
>> memory, low memory, if CONFIG_ZONE_DMA|DMA32 is enabled. And we need
>> describe these cases in document. Any corner case or exception need
>> be noted if we don't handle it in code.
>>
>> Caring about this very much because we have CI with existed test cases
>> to run on the system, and QA will check these manually too. Support
>> engineer need detailed document if anything special but happened.
>> Anything unclear or uncovered will be reported as bug to our kernel dev.
>> Guess your company do the similar thing like this.
>>
>> This crashkerne,high and crashkernel,low reservation is special if we
>> allow ,high, ,low existing in the same zone. Imagine on system with
>> CONFIG_ZONE_DMA|DMA32 disabled, people copy the crashkernel=512M,high
>> and crashkernel=128M,low from other system, and he could get
>> crash_res at [5G, 5G+512M], while crash_low_res at [6G, 6G+128M]. Guess
>> how they will judge us.
> 
> OK, I got it.
> 
>>
>>>
>>>>
>>>> I would suggest merging this series, Lei can add this corner case
>>>> handling on top. Since this is a newly added support, we don't have
>>>> to make it one step. Doing step by step can make reviewing easier.
>>>>
>>>>> +	if (crash_low_size && reserve_crashkernel_low(crash_low_size)) {
>>>>> +		memblock_phys_free(crash_base, crash_size);
>>>>> +		return;
>>>>> +	}
>>>>> +
>>>>>  	pr_info("crashkernel reserved: 0x%016llx - 0x%016llx (%lld MB)\n",
>>>>>  		crash_base, crash_base + crash_size, crash_size >> 20);
>>>>>  
>>>>> @@ -135,6 +183,9 @@ static void __init reserve_crashkernel(void)
>>>>>  	 * map. Inform kmemleak so that it won't try to access it.
>>>>>  	 */
>>>>>  	kmemleak_ignore_phys(crash_base);
>>>>> +	if (crashk_low_res.end)
>>>>> +		kmemleak_ignore_phys(crashk_low_res.start);
>>>>> +
>>>>>  	crashk_res.start = crash_base;
>>>>>  	crashk_res.end = crash_base + crash_size - 1;
>>>>>  	insert_resource(&iomem_resource, &crashk_res);
>>>>> -- 
>>>>> 2.25.1
>>>>>
>>>>
>>>> .
>>>>
>>>
>>> -- 
>>> Regards,
>>>   Zhen Lei
>>>
>>
>> .
>>
>
Leizhen (ThunderTown) May 7, 2022, 11:49 a.m. UTC | #6
On 2022/5/7 17:35, Leizhen (ThunderTown) wrote:
> 
> 
> On 2022/5/7 11:37, Leizhen (ThunderTown) wrote:
>>
>>
>> On 2022/5/7 10:07, Baoquan He wrote:
>>> On 05/07/22 at 09:34am, Leizhen (ThunderTown) wrote:
>>>>
>>>>
>>>> On 2022/5/7 7:10, Baoquan He wrote:
>>>>> On 05/06/22 at 07:43pm, Zhen Lei wrote:
>>>>> ......  
>>>>>> @@ -118,8 +162,7 @@ static void __init reserve_crashkernel(void)
>>>>>>  	if (crash_base)
>>>>>>  		crash_max = crash_base + crash_size;
>>>>>>  
>>>>>> -	/* Current arm64 boot protocol requires 2MB alignment */
>>>>>> -	crash_base = memblock_phys_alloc_range(crash_size, SZ_2M,
>>>>>> +	crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN,
>>>>>>  					       crash_base, crash_max);
>>>>>>  	if (!crash_base) {
>>>>>>  		pr_warn("cannot allocate crashkernel (size:0x%llx)\n",
>>>>>> @@ -127,6 +170,11 @@ static void __init reserve_crashkernel(void)
>>>>>>  		return;
>>>>>>  	}
>>>>>>  
>>>>>
>>>>> There's corner case missed, e.g
>>>>> 1) ,high and ,low are specified, CONFIG_ZONE_DMA|DMA32 is not enabled;
>>>>> 2) ,high and ,low are specified, the whole system memory is under 4G.
>>>>>
>>>>> Below judgement can filter them away:
>>>>>         
>>>>> 	if (crash_base > arm64_dma_phys_limit && crash_low_size &&
>>>>> 	    reserve_crashkernel_low(crash_low_size)) {
>>>>>
>>>>> What's your opinion? Leave it and add document to notice user, or fix it
>>>>> with code change?
> 
> I decided to modify the code and document. But the code changes aren't what
> you suggested. For the following reasons:
> 1. The memory allocated for 'high' may be partially under 4G. So the low
>    memory may not be enough. Of course, it's rare.
> 2. The second kernel can work properly only when the high and low memory
>    are successfully applied for. For example, high=128M, low=128M, but the
>    second kernel need 256M.
> 
> So for the cases you listed:
> 1) ,high and ,low are specified, CONFIG_ZONE_DMA|DMA32 is not enabled;
>    --> Follow you suggestion, ignore crashkernel=Y,low, don't allocate low memory.
> 
> @@ -100,6 +100,14 @@ static int __init reserve_crashkernel_low(unsigned long long low_size)
>  {
>         unsigned long long low_base;
> 
> +       /*
> +        * The kernel does not have any DMA zone, so the range of each DMA
> +        * zone is unknown. Please make sure both CONFIG_ZONE_DMA and
> +        * CONFIG_ZONE_DMA32 are also not set in the second kernel.
> +        */
> +       if (!IS_ENABLED(CONFIG_ZONE_DMA) && !IS_ENABLED(CONFIG_ZONE_DMA32))
> +               return 0;
> +
> 
> 2) ,high and ,low are specified, the whole system memory is under 4G.
>    --> two memory ranges will be allocated, the size is what 'high' and 'low' specified.
>    --> Yes, the memory of 'low' may be above 'high', but the 'high' just hint allocation
>    --> from top, try high memory first. Of course, this may cause kexec to fail to load.
>    --> Because the memory of 'low' with small size will be used to store Image, etc..
>    --> But the memory of 'low' above 'high' is almost impossible, we use memblock API to
>    --> allocate memory from top to bottem, 'low' above 'high' need a sizeable memory block
>    --> (128M, 256M?) to be freed at init phase.
>    -->  Maybe I should add: crash_max = min(crash_base, CRASH_ADDR_LOW_MAX);
>    --> to make sure the memory of 'low' is always under 'high'

I have added the min() above.

Test result:
1) ,high and ,low are specified, CONFIG_ZONE_DMA|DMA32 is not enabled;
root@localhost:~# dmesg | grep crash
[    0.000000] crashkernel reserved: 0x0000000420000000 - 0x0000000440000000 (512 MB)
[    0.000000] Kernel command line: console=ttyAMA0 root=/dev/vda rw panic_on_oops=1 oops=panic crashkernel=512M,high crashkernel=128M,low

2) ,high and ,low are specified, the whole system memory is under 4G.
root@localhost:~# dmesg | grep crash
[    0.000000] crashkernel tmp reserved: 0x00000000f2800000 - 0x00000000fa800000 (128 MB)
[    0.000000] crashkernel low memory reserved: 0xca800000 - 0xd2800000 (128 MB)
[    0.000000] crashkernel reserved: 0x00000000d2800000 - 0x00000000f2800000 (512 MB)
[    0.000000] Kernel command line: console=ttyAMA0 root=/dev/vda rw panic_on_oops=1 oops=panic crashkernel=512M,high crashkernel=128M,low

test stub for 2):

diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 5cb73bbd286b100..abbde2158a0976a 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -147,6 +147,7 @@ static void __init reserve_crashkernel(void)
        unsigned long long crash_max = CRASH_ADDR_LOW_MAX;
        char *cmdline = boot_command_line;
        int ret;
+       unsigned long long tmp_base;

        if (!IS_ENABLED(CONFIG_KEXEC_CORE))
                return;
@@ -179,6 +180,11 @@ static void __init reserve_crashkernel(void)
        if (crash_base)
                crash_max = crash_base + crash_size;

+       tmp_base = memblock_phys_alloc_range(crash_low_size, CRASH_ALIGN, crash_base, crash_max);
+       BUG_ON(!tmp_base);
+       pr_info("crashkernel tmp reserved: 0x%016llx - 0x%016llx (%lld MB)\n",
+               tmp_base, tmp_base + crash_low_size, crash_low_size >> 20);
+
        crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN,
                                               crash_base, crash_max);
        if (!crash_base) {
@@ -186,6 +192,7 @@ static void __init reserve_crashkernel(void)
                        crash_size);
                return;
        }
+       memblock_phys_free(tmp_base, crash_low_size);

        if (crash_low_size && reserve_crashkernel_low(crash_low_size, crash_base)) {
                memblock_phys_free(crash_base, crash_size);

> 
>>>>
>>>> I think maybe we can leave it unchanged. If the user configures two memory ranges,
>>>> we'd better apply for two. Otherwise, he'll be confused when he inquires. Currently,
>>>> crash_low_size is non-zero only when 'crashkernel=Y,low' is explicitly configured.
>>>
>>> Then user need know the system information, e.g how much is the high
>>> memory, low memory, if CONFIG_ZONE_DMA|DMA32 is enabled. And we need
>>> describe these cases in document. Any corner case or exception need
>>> be noted if we don't handle it in code.
>>>
>>> Caring about this very much because we have CI with existed test cases
>>> to run on the system, and QA will check these manually too. Support
>>> engineer need detailed document if anything special but happened.
>>> Anything unclear or uncovered will be reported as bug to our kernel dev.
>>> Guess your company do the similar thing like this.
>>>
>>> This crashkerne,high and crashkernel,low reservation is special if we
>>> allow ,high, ,low existing in the same zone. Imagine on system with
>>> CONFIG_ZONE_DMA|DMA32 disabled, people copy the crashkernel=512M,high
>>> and crashkernel=128M,low from other system, and he could get
>>> crash_res at [5G, 5G+512M], while crash_low_res at [6G, 6G+128M]. Guess
>>> how they will judge us.
>>
>> OK, I got it.
>>
>>>
>>>>
>>>>>
>>>>> I would suggest merging this series, Lei can add this corner case
>>>>> handling on top. Since this is a newly added support, we don't have
>>>>> to make it one step. Doing step by step can make reviewing easier.
>>>>>
>>>>>> +	if (crash_low_size && reserve_crashkernel_low(crash_low_size)) {
>>>>>> +		memblock_phys_free(crash_base, crash_size);
>>>>>> +		return;
>>>>>> +	}
>>>>>> +
>>>>>>  	pr_info("crashkernel reserved: 0x%016llx - 0x%016llx (%lld MB)\n",
>>>>>>  		crash_base, crash_base + crash_size, crash_size >> 20);
>>>>>>  
>>>>>> @@ -135,6 +183,9 @@ static void __init reserve_crashkernel(void)
>>>>>>  	 * map. Inform kmemleak so that it won't try to access it.
>>>>>>  	 */
>>>>>>  	kmemleak_ignore_phys(crash_base);
>>>>>> +	if (crashk_low_res.end)
>>>>>> +		kmemleak_ignore_phys(crashk_low_res.start);
>>>>>> +
>>>>>>  	crashk_res.start = crash_base;
>>>>>>  	crashk_res.end = crash_base + crash_size - 1;
>>>>>>  	insert_resource(&iomem_resource, &crashk_res);
>>>>>> -- 
>>>>>> 2.25.1
>>>>>>
>>>>>
>>>>> .
>>>>>
>>>>
>>>> -- 
>>>> Regards,
>>>>   Zhen Lei
>>>>
>>>
>>> .
>>>
>>
>
Leizhen (ThunderTown) May 7, 2022, 12:20 p.m. UTC | #7
On 2022/5/7 19:49, Leizhen (ThunderTown) wrote:
> 
> 
> On 2022/5/7 17:35, Leizhen (ThunderTown) wrote:
>>
>>
>> On 2022/5/7 11:37, Leizhen (ThunderTown) wrote:
>>>
>>>
>>> On 2022/5/7 10:07, Baoquan He wrote:
>>>> On 05/07/22 at 09:34am, Leizhen (ThunderTown) wrote:
>>>>>
>>>>>
>>>>> On 2022/5/7 7:10, Baoquan He wrote:
>>>>>> On 05/06/22 at 07:43pm, Zhen Lei wrote:
>>>>>> ......  
>>>>>>> @@ -118,8 +162,7 @@ static void __init reserve_crashkernel(void)
>>>>>>>  	if (crash_base)
>>>>>>>  		crash_max = crash_base + crash_size;
>>>>>>>  
>>>>>>> -	/* Current arm64 boot protocol requires 2MB alignment */
>>>>>>> -	crash_base = memblock_phys_alloc_range(crash_size, SZ_2M,
>>>>>>> +	crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN,
>>>>>>>  					       crash_base, crash_max);
>>>>>>>  	if (!crash_base) {
>>>>>>>  		pr_warn("cannot allocate crashkernel (size:0x%llx)\n",
>>>>>>> @@ -127,6 +170,11 @@ static void __init reserve_crashkernel(void)
>>>>>>>  		return;
>>>>>>>  	}
>>>>>>>  
>>>>>>
>>>>>> There's corner case missed, e.g
>>>>>> 1) ,high and ,low are specified, CONFIG_ZONE_DMA|DMA32 is not enabled;
>>>>>> 2) ,high and ,low are specified, the whole system memory is under 4G.
>>>>>>
>>>>>> Below judgement can filter them away:
>>>>>>         
>>>>>> 	if (crash_base > arm64_dma_phys_limit && crash_low_size &&
>>>>>> 	    reserve_crashkernel_low(crash_low_size)) {
>>>>>>
>>>>>> What's your opinion? Leave it and add document to notice user, or fix it
>>>>>> with code change?

I've now got the patch ready as suggested, to be as consistent as possible
with x86.

I'll just wait for Catalin's response next Monday: a separate patch or v25?



>>
>> I decided to modify the code and document. But the code changes aren't what
>> you suggested. For the following reasons:
>> 1. The memory allocated for 'high' may be partially under 4G. So the low
>>    memory may not be enough. Of course, it's rare.
>> 2. The second kernel can work properly only when the high and low memory
>>    are successfully applied for. For example, high=128M, low=128M, but the
>>    second kernel need 256M.
>>
>> So for the cases you listed:
>> 1) ,high and ,low are specified, CONFIG_ZONE_DMA|DMA32 is not enabled;
>>    --> Follow you suggestion, ignore crashkernel=Y,low, don't allocate low memory.
>>
>> @@ -100,6 +100,14 @@ static int __init reserve_crashkernel_low(unsigned long long low_size)
>>  {
>>         unsigned long long low_base;
>>
>> +       /*
>> +        * The kernel does not have any DMA zone, so the range of each DMA
>> +        * zone is unknown. Please make sure both CONFIG_ZONE_DMA and
>> +        * CONFIG_ZONE_DMA32 are also not set in the second kernel.
>> +        */
>> +       if (!IS_ENABLED(CONFIG_ZONE_DMA) && !IS_ENABLED(CONFIG_ZONE_DMA32))
>> +               return 0;
>> +
>>
>> 2) ,high and ,low are specified, the whole system memory is under 4G.
>>    --> two memory ranges will be allocated, the size is what 'high' and 'low' specified.
>>    --> Yes, the memory of 'low' may be above 'high', but the 'high' just hint allocation
>>    --> from top, try high memory first. Of course, this may cause kexec to fail to load.
>>    --> Because the memory of 'low' with small size will be used to store Image, etc..
>>    --> But the memory of 'low' above 'high' is almost impossible, we use memblock API to
>>    --> allocate memory from top to bottem, 'low' above 'high' need a sizeable memory block
>>    --> (128M, 256M?) to be freed at init phase.
>>    -->  Maybe I should add: crash_max = min(crash_base, CRASH_ADDR_LOW_MAX);
>>    --> to make sure the memory of 'low' is always under 'high'
> 
> I have added the min() above.
> 
> Test result:
> 1) ,high and ,low are specified, CONFIG_ZONE_DMA|DMA32 is not enabled;
> root@localhost:~# dmesg | grep crash
> [    0.000000] crashkernel reserved: 0x0000000420000000 - 0x0000000440000000 (512 MB)
> [    0.000000] Kernel command line: console=ttyAMA0 root=/dev/vda rw panic_on_oops=1 oops=panic crashkernel=512M,high crashkernel=128M,low
> 
> 2) ,high and ,low are specified, the whole system memory is under 4G.
> root@localhost:~# dmesg | grep crash
> [    0.000000] crashkernel tmp reserved: 0x00000000f2800000 - 0x00000000fa800000 (128 MB)
> [    0.000000] crashkernel low memory reserved: 0xca800000 - 0xd2800000 (128 MB)
> [    0.000000] crashkernel reserved: 0x00000000d2800000 - 0x00000000f2800000 (512 MB)
> [    0.000000] Kernel command line: console=ttyAMA0 root=/dev/vda rw panic_on_oops=1 oops=panic crashkernel=512M,high crashkernel=128M,low
> 
> test stub for 2):
> 
> diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
> index 5cb73bbd286b100..abbde2158a0976a 100644
> --- a/arch/arm64/mm/init.c
> +++ b/arch/arm64/mm/init.c
> @@ -147,6 +147,7 @@ static void __init reserve_crashkernel(void)
>         unsigned long long crash_max = CRASH_ADDR_LOW_MAX;
>         char *cmdline = boot_command_line;
>         int ret;
> +       unsigned long long tmp_base;
> 
>         if (!IS_ENABLED(CONFIG_KEXEC_CORE))
>                 return;
> @@ -179,6 +180,11 @@ static void __init reserve_crashkernel(void)
>         if (crash_base)
>                 crash_max = crash_base + crash_size;
> 
> +       tmp_base = memblock_phys_alloc_range(crash_low_size, CRASH_ALIGN, crash_base, crash_max);
> +       BUG_ON(!tmp_base);
> +       pr_info("crashkernel tmp reserved: 0x%016llx - 0x%016llx (%lld MB)\n",
> +               tmp_base, tmp_base + crash_low_size, crash_low_size >> 20);
> +
>         crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN,
>                                                crash_base, crash_max);
>         if (!crash_base) {
> @@ -186,6 +192,7 @@ static void __init reserve_crashkernel(void)
>                         crash_size);
>                 return;
>         }
> +       memblock_phys_free(tmp_base, crash_low_size);
> 
>         if (crash_low_size && reserve_crashkernel_low(crash_low_size, crash_base)) {
>                 memblock_phys_free(crash_base, crash_size);
> 
>>
>>>>>
>>>>> I think maybe we can leave it unchanged. If the user configures two memory ranges,
>>>>> we'd better apply for two. Otherwise, he'll be confused when he inquires. Currently,
>>>>> crash_low_size is non-zero only when 'crashkernel=Y,low' is explicitly configured.
>>>>
>>>> Then user need know the system information, e.g how much is the high
>>>> memory, low memory, if CONFIG_ZONE_DMA|DMA32 is enabled. And we need
>>>> describe these cases in document. Any corner case or exception need
>>>> be noted if we don't handle it in code.
>>>>
>>>> Caring about this very much because we have CI with existed test cases
>>>> to run on the system, and QA will check these manually too. Support
>>>> engineer need detailed document if anything special but happened.
>>>> Anything unclear or uncovered will be reported as bug to our kernel dev.
>>>> Guess your company do the similar thing like this.
>>>>
>>>> This crashkerne,high and crashkernel,low reservation is special if we
>>>> allow ,high, ,low existing in the same zone. Imagine on system with
>>>> CONFIG_ZONE_DMA|DMA32 disabled, people copy the crashkernel=512M,high
>>>> and crashkernel=128M,low from other system, and he could get
>>>> crash_res at [5G, 5G+512M], while crash_low_res at [6G, 6G+128M]. Guess
>>>> how they will judge us.
>>>
>>> OK, I got it.
>>>
>>>>
>>>>>
>>>>>>
>>>>>> I would suggest merging this series, Lei can add this corner case
>>>>>> handling on top. Since this is a newly added support, we don't have
>>>>>> to make it one step. Doing step by step can make reviewing easier.
>>>>>>
>>>>>>> +	if (crash_low_size && reserve_crashkernel_low(crash_low_size)) {
>>>>>>> +		memblock_phys_free(crash_base, crash_size);
>>>>>>> +		return;
>>>>>>> +	}
>>>>>>> +
>>>>>>>  	pr_info("crashkernel reserved: 0x%016llx - 0x%016llx (%lld MB)\n",
>>>>>>>  		crash_base, crash_base + crash_size, crash_size >> 20);
>>>>>>>  
>>>>>>> @@ -135,6 +183,9 @@ static void __init reserve_crashkernel(void)
>>>>>>>  	 * map. Inform kmemleak so that it won't try to access it.
>>>>>>>  	 */
>>>>>>>  	kmemleak_ignore_phys(crash_base);
>>>>>>> +	if (crashk_low_res.end)
>>>>>>> +		kmemleak_ignore_phys(crashk_low_res.start);
>>>>>>> +
>>>>>>>  	crashk_res.start = crash_base;
>>>>>>>  	crashk_res.end = crash_base + crash_size - 1;
>>>>>>>  	insert_resource(&iomem_resource, &crashk_res);
>>>>>>> -- 
>>>>>>> 2.25.1
>>>>>>>
>>>>>>
>>>>>> .
>>>>>>
>>>>>
>>>>> -- 
>>>>> Regards,
>>>>>   Zhen Lei
>>>>>
>>>>
>>>> .
>>>>
>>>
>>
>
Baoquan He May 7, 2022, 1:22 p.m. UTC | #8
On 05/07/22 at 05:35pm, Leizhen (ThunderTown) wrote:
> 
> 
> On 2022/5/7 11:37, Leizhen (ThunderTown) wrote:
> > 
> > 
> > On 2022/5/7 10:07, Baoquan He wrote:
> >> On 05/07/22 at 09:34am, Leizhen (ThunderTown) wrote:
> >>>
> >>>
> >>> On 2022/5/7 7:10, Baoquan He wrote:
> >>>> On 05/06/22 at 07:43pm, Zhen Lei wrote:
> >>>> ......  
> >>>>> @@ -118,8 +162,7 @@ static void __init reserve_crashkernel(void)
> >>>>>  	if (crash_base)
> >>>>>  		crash_max = crash_base + crash_size;
> >>>>>  
> >>>>> -	/* Current arm64 boot protocol requires 2MB alignment */
> >>>>> -	crash_base = memblock_phys_alloc_range(crash_size, SZ_2M,
> >>>>> +	crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN,
> >>>>>  					       crash_base, crash_max);
> >>>>>  	if (!crash_base) {
> >>>>>  		pr_warn("cannot allocate crashkernel (size:0x%llx)\n",
> >>>>> @@ -127,6 +170,11 @@ static void __init reserve_crashkernel(void)
> >>>>>  		return;
> >>>>>  	}
> >>>>>  
> >>>>
> >>>> There's corner case missed, e.g
> >>>> 1) ,high and ,low are specified, CONFIG_ZONE_DMA|DMA32 is not enabled;
> >>>> 2) ,high and ,low are specified, the whole system memory is under 4G.
> >>>>
> >>>> Below judgement can filter them away:
> >>>>         
> >>>> 	if (crash_base > arm64_dma_phys_limit && crash_low_size &&
> >>>> 	    reserve_crashkernel_low(crash_low_size)) {
> >>>>
> >>>> What's your opinion? Leave it and add document to notice user, or fix it
> >>>> with code change?
> 
> I decided to modify the code and document. But the code changes aren't what
> you suggested. For the following reasons:

Hi Lei,

I would say let's merge this version first, then add the rest step
by step. Crashkernel= is not a simple parameter; expecting to get it done
in one step is not realistic. Otherwise, we will end up in a mess of
discussing and handling all the cases at once. Let's slow down and get
the basic support added.

> 1. The memory allocated for 'high' may be partially under 4G. So the low
>    memory may not be enough. Of course, it's rare.

No, let's forget the under-4G or above-4G thing on arm64 and use
arm64_dma_phys_limit instead. It's basically the equivalent of 4G
on x86, while talking about 4G on arm64 will cause confusion.

And I may not get what you are saying the 'high' partially under 4G
thing, could you be more specific or give an example?

> 2. The second kernel can work properly only when the high and low memory
>    are successfully applied for. For example, high=128M, low=128M, but the
>    second kernel need 256M.

I may not get this either. We usually won't split our memory requirement
into ,high and ,low region. ,high is the main place to accommodate kernel
image, initrd, and user space program's memory allocation. ,low is for
DMA during kernel bootup.

We probably should not encourage or guide users to use it like this, if I
understood you correctly. That will complicate the crashkernel= usage more.

> 
> So for the cases you listed:
> 1) ,high and ,low are specified, CONFIG_ZONE_DMA|DMA32 is not enabled;
>    --> Follow you suggestion, ignore crashkernel=Y,low, don't allocate low memory.
> 
> @@ -100,6 +100,14 @@ static int __init reserve_crashkernel_low(unsigned long long low_size)
>  {
>         unsigned long long low_base;
> 
> +       /*
> +        * The kernel does not have any DMA zone, so the range of each DMA
> +        * zone is unknown. Please make sure both CONFIG_ZONE_DMA and
> +        * CONFIG_ZONE_DMA32 are also not set in the second kernel.
> +        */
> +       if (!IS_ENABLED(CONFIG_ZONE_DMA) && !IS_ENABLED(CONFIG_ZONE_DMA32))
> +               return 0;
> +
> 
> 2) ,high and ,low are specified, the whole system memory is under 4G.
>    --> two memory ranges will be allocated, the size is what 'high' and 'low' specified.
>    --> Yes, the memory of 'low' may be above 'high', but the 'high' just hint allocation
>    --> from top, try high memory first. Of course, this may cause kexec to fail to load.
>    --> Because the memory of 'low' with small size will be used to store Image, etc..
>    --> But the memory of 'low' above 'high' is almost impossible, we use memblock API to
>    --> allocate memory from top to bottem, 'low' above 'high' need a sizeable memory block
>    --> (128M, 256M?) to be freed at init phase.

Not really. Please think about the case of crashkernel=1G,high
crashkernel=128M,low. memblock top-down allocation finds a lower position
for 1G, but a higher position for 128M because of memory fragmentation.
It's a simple but reasonable scenario.

>    -->  Maybe I should add: crash_max = min(crash_base, CRASH_ADDR_LOW_MAX);
>    --> to make sure the memory of 'low' is always under 'high'

I would say let's not scatter these details into different places.
Like what I changed, it's much easier and code is more understandable.
Let's discuss this after this series accepted. A new series can be
posted to handle these. 

> 
> >>>
> >>> I think maybe we can leave it unchanged. If the user configures two memory ranges,
> >>> we'd better apply for two. Otherwise, he'll be confused when he inquires. Currently,
> >>> crash_low_size is non-zero only when 'crashkernel=Y,low' is explicitly configured.
> >>
> >> Then user need know the system information, e.g how much is the high
> >> memory, low memory, if CONFIG_ZONE_DMA|DMA32 is enabled. And we need
> >> describe these cases in document. Any corner case or exception need
> >> be noted if we don't handle it in code.
> >>
> >> Caring about this very much because we have CI with existed test cases
> >> to run on the system, and QA will check these manually too. Support
> >> engineer need detailed document if anything special but happened.
> >> Anything unclear or uncovered will be reported as bug to our kernel dev.
> >> Guess your company do the similar thing like this.
> >>
> >> This crashkerne,high and crashkernel,low reservation is special if we
> >> allow ,high, ,low existing in the same zone. Imagine on system with
> >> CONFIG_ZONE_DMA|DMA32 disabled, people copy the crashkernel=512M,high
> >> and crashkernel=128M,low from other system, and he could get
> >> crash_res at [5G, 5G+512M], while crash_low_res at [6G, 6G+128M]. Guess
> >> how they will judge us.
> > 
> > OK, I got it.
> > 
> >>
> >>>
> >>>>
> >>>> I would suggest merging this series, Lei can add this corner case
> >>>> handling on top. Since this is a newly added support, we don't have
> >>>> to make it one step. Doing step by step can make reviewing easier.
> >>>>
> >>>>> +	if (crash_low_size && reserve_crashkernel_low(crash_low_size)) {
> >>>>> +		memblock_phys_free(crash_base, crash_size);
> >>>>> +		return;
> >>>>> +	}
> >>>>> +
> >>>>>  	pr_info("crashkernel reserved: 0x%016llx - 0x%016llx (%lld MB)\n",
> >>>>>  		crash_base, crash_base + crash_size, crash_size >> 20);
> >>>>>  
> >>>>> @@ -135,6 +183,9 @@ static void __init reserve_crashkernel(void)
> >>>>>  	 * map. Inform kmemleak so that it won't try to access it.
> >>>>>  	 */
> >>>>>  	kmemleak_ignore_phys(crash_base);
> >>>>> +	if (crashk_low_res.end)
> >>>>> +		kmemleak_ignore_phys(crashk_low_res.start);
> >>>>> +
> >>>>>  	crashk_res.start = crash_base;
> >>>>>  	crashk_res.end = crash_base + crash_size - 1;
> >>>>>  	insert_resource(&iomem_resource, &crashk_res);
> >>>>> -- 
> >>>>> 2.25.1
> >>>>>
> >>>>
> >>>> .
> >>>>
> >>>
> >>> -- 
> >>> Regards,
> >>>   Zhen Lei
> >>>
> >>
> >> .
> >>
> > 
> 
> -- 
> Regards,
>   Zhen Lei
>
John Donnelly May 7, 2022, 5:30 p.m. UTC | #9
On 5/6/22 6:10 PM, Baoquan He wrote:
> On 05/06/22 at 07:43pm, Zhen Lei wrote:
> ......
>> @@ -118,8 +162,7 @@ static void __init reserve_crashkernel(void)
>>   	if (crash_base)
>>   		crash_max = crash_base + crash_size;
>>   
>> -	/* Current arm64 boot protocol requires 2MB alignment */
>> -	crash_base = memblock_phys_alloc_range(crash_size, SZ_2M,
>> +	crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN,
>>   					       crash_base, crash_max);
>>   	if (!crash_base) {
>>   		pr_warn("cannot allocate crashkernel (size:0x%llx)\n",
>> @@ -127,6 +170,11 @@ static void __init reserve_crashkernel(void)
>>   		return;
>>   	}
>>   
> 
> There's corner case missed, e.g
> 1) ,high and ,low are specified, CONFIG_ZONE_DMA|DMA32 is not enabled;
> 2) ,high and ,low are specified, the whole system memory is under 4G.
> 
> Below judgement can filter them away:
>          
> 	if (crash_base > arm64_dma_phys_limit && crash_low_size &&
> 	    reserve_crashkernel_low(crash_low_size)) {
> 
> What's your opinion? Leave it and add document to notice user, or fix it
> with code change?
>
> I would suggest merging this series, Lei can add this corner case
> handling on top. Since this is a newly added support, we don't have
> to make it one step. Doing step by step can make reviewing easier.

Let's get this added and tested with a broader audience. It has been in
review since March 4th, 2019 - 3+ years. I applaud Zhen for his
endurance and patience in carrying this for so long.


> 
>> +	if (crash_low_size && reserve_crashkernel_low(crash_low_size)) {
>> +		memblock_phys_free(crash_base, crash_size);
>> +		return;
>> +	}
>> +
>>   	pr_info("crashkernel reserved: 0x%016llx - 0x%016llx (%lld MB)\n",
>>   		crash_base, crash_base + crash_size, crash_size >> 20);
>>   
>> @@ -135,6 +183,9 @@ static void __init reserve_crashkernel(void)
>>   	 * map. Inform kmemleak so that it won't try to access it.
>>   	 */
>>   	kmemleak_ignore_phys(crash_base);
>> +	if (crashk_low_res.end)
>> +		kmemleak_ignore_phys(crashk_low_res.start);
>> +
>>   	crashk_res.start = crash_base;
>>   	crashk_res.end = crash_base + crash_size - 1;
>>   	insert_resource(&iomem_resource, &crashk_res);
>> -- 
>> 2.25.1
>>
>
Catalin Marinas May 7, 2022, 6:50 p.m. UTC | #10
On Sat, May 07, 2022 at 07:10:32AM +0800, Baoquan He wrote:
> On 05/06/22 at 07:43pm, Zhen Lei wrote:
> ......  
> > @@ -118,8 +162,7 @@ static void __init reserve_crashkernel(void)
> >  	if (crash_base)
> >  		crash_max = crash_base + crash_size;
> >  
> > -	/* Current arm64 boot protocol requires 2MB alignment */
> > -	crash_base = memblock_phys_alloc_range(crash_size, SZ_2M,
> > +	crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN,
> >  					       crash_base, crash_max);
> >  	if (!crash_base) {
> >  		pr_warn("cannot allocate crashkernel (size:0x%llx)\n",
> > @@ -127,6 +170,11 @@ static void __init reserve_crashkernel(void)
> >  		return;
> >  	}
> >  
> 
> There's corner case missed, e.g
> 1) ,high and ,low are specified, CONFIG_ZONE_DMA|DMA32 is not enabled;
> 2) ,high and ,low are specified, the whole system memory is under 4G.

My view of ,low is that it should only be used to override the default
ZONE_DMA allocation if that one is not suitable. If no ZONE_DMA exists
or everything is ZONE_DMA, ignore it altogether. That's a specialist
case for people that know more about the memory layout, otherwise
crashkernel=X works in most cases with crashkernel=X,high as an
alternative to allow high allocation.

> I would suggest merging this series, Lei can add this corner case
> handling on top. Since this is a newly added support, we don't have
> to make it one step. Doing step by step can make reviewing easier.

I agree.
Baoquan He May 9, 2022, 4:04 a.m. UTC | #11
On 05/07/22 at 07:50pm, Catalin Marinas wrote:
> On Sat, May 07, 2022 at 07:10:32AM +0800, Baoquan He wrote:
> > On 05/06/22 at 07:43pm, Zhen Lei wrote:
> > ......  
> > > @@ -118,8 +162,7 @@ static void __init reserve_crashkernel(void)
> > >  	if (crash_base)
> > >  		crash_max = crash_base + crash_size;
> > >  
> > > -	/* Current arm64 boot protocol requires 2MB alignment */
> > > -	crash_base = memblock_phys_alloc_range(crash_size, SZ_2M,
> > > +	crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN,
> > >  					       crash_base, crash_max);
> > >  	if (!crash_base) {
> > >  		pr_warn("cannot allocate crashkernel (size:0x%llx)\n",
> > > @@ -127,6 +170,11 @@ static void __init reserve_crashkernel(void)
> > >  		return;
> > >  	}
> > >  
> > 
> > There's corner case missed, e.g
> > 1) ,high and ,low are specified, CONFIG_ZONE_DMA|DMA32 is not enabled;
> > 2) ,high and ,low are specified, the whole system memory is under 4G.
> 
> My view of ,low is that it should only used to override the default
> ZONE_DMA allocation if that one is not suitable. If no ZONE_DMA exists
> or everything is ZONE_DMA, ignore it altogether. That's a specialist
> case for people that know more about the memory layout, otherwise
> crashkernel=X works in most case with crashkernel=X,high as an
> alternative to allow high allocation.

Totally agree with the conclusion.
diff mbox series

Patch

diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c
index e16b248699d5c3c..19c2d487cb08feb 100644
--- a/arch/arm64/kernel/machine_kexec.c
+++ b/arch/arm64/kernel/machine_kexec.c
@@ -329,8 +329,13 @@  bool crash_is_nosave(unsigned long pfn)
 
 	/* in reserved memory? */
 	addr = __pfn_to_phys(pfn);
-	if ((addr < crashk_res.start) || (crashk_res.end < addr))
-		return false;
+	if ((addr < crashk_res.start) || (crashk_res.end < addr)) {
+		if (!crashk_low_res.end)
+			return false;
+
+		if ((addr < crashk_low_res.start) || (crashk_low_res.end < addr))
+			return false;
+	}
 
 	if (!kexec_crash_image)
 		return true;
diff --git a/arch/arm64/kernel/machine_kexec_file.c b/arch/arm64/kernel/machine_kexec_file.c
index 59c648d51848886..889951291cc0f9c 100644
--- a/arch/arm64/kernel/machine_kexec_file.c
+++ b/arch/arm64/kernel/machine_kexec_file.c
@@ -65,10 +65,18 @@  static int prepare_elf_headers(void **addr, unsigned long *sz)
 
 	/* Exclude crashkernel region */
 	ret = crash_exclude_mem_range(cmem, crashk_res.start, crashk_res.end);
+	if (ret)
+		goto out;
+
+	if (crashk_low_res.end) {
+		ret = crash_exclude_mem_range(cmem, crashk_low_res.start, crashk_low_res.end);
+		if (ret)
+			goto out;
+	}
 
-	if (!ret)
-		ret =  crash_prepare_elf64_headers(cmem, true, addr, sz);
+	ret = crash_prepare_elf64_headers(cmem, true, addr, sz);
 
+out:
 	kfree(cmem);
 	return ret;
 }
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 51863f1448c6989..18ba66c90991ea0 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -90,6 +90,32 @@  phys_addr_t __ro_after_init arm64_dma_phys_limit;
 phys_addr_t __ro_after_init arm64_dma_phys_limit = PHYS_MASK + 1;
 #endif
 
+/* Current arm64 boot protocol requires 2MB alignment */
+#define CRASH_ALIGN			SZ_2M
+
+#define CRASH_ADDR_LOW_MAX		arm64_dma_phys_limit
+#define CRASH_ADDR_HIGH_MAX		(PHYS_MASK + 1)
+
+static int __init reserve_crashkernel_low(unsigned long long low_size)
+{
+	unsigned long long low_base;
+
+	low_base = memblock_phys_alloc_range(low_size, CRASH_ALIGN, 0, CRASH_ADDR_LOW_MAX);
+	if (!low_base) {
+		pr_err("cannot allocate crashkernel low memory (size:0x%llx).\n", low_size);
+		return -ENOMEM;
+	}
+
+	pr_info("crashkernel low memory reserved: 0x%08llx - 0x%08llx (%lld MB)\n",
+		low_base, low_base + low_size, low_size >> 20);
+
+	crashk_low_res.start = low_base;
+	crashk_low_res.end   = low_base + low_size - 1;
+	insert_resource(&iomem_resource, &crashk_low_res);
+
+	return 0;
+}
+
 /*
  * reserve_crashkernel() - reserves memory for crash kernel
  *
@@ -100,17 +126,35 @@  phys_addr_t __ro_after_init arm64_dma_phys_limit = PHYS_MASK + 1;
 static void __init reserve_crashkernel(void)
 {
 	unsigned long long crash_base, crash_size;
-	unsigned long long crash_max = arm64_dma_phys_limit;
+	unsigned long long crash_low_size = 0;
+	unsigned long long crash_max = CRASH_ADDR_LOW_MAX;
+	char *cmdline = boot_command_line;
 	int ret;
 
 	if (!IS_ENABLED(CONFIG_KEXEC_CORE))
 		return;
 
-	ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
+	/* crashkernel=X[@offset] */
+	ret = parse_crashkernel(cmdline, memblock_phys_mem_size(),
 				&crash_size, &crash_base);
-	/* no crashkernel= or invalid value specified */
-	if (ret || !crash_size)
+	if (ret == -ENOENT) {
+		ret = parse_crashkernel_high(cmdline, 0, &crash_size, &crash_base);
+		if (ret || !crash_size)
+			return;
+
+		/*
+		 * crashkernel=Y,low can be specified or not, but invalid value
+		 * is not allowed.
+		 */
+		ret = parse_crashkernel_low(cmdline, 0, &crash_low_size, &crash_base);
+		if (ret && (ret != -ENOENT))
+			return;
+
+		crash_max = CRASH_ADDR_HIGH_MAX;
+	} else if (ret || !crash_size) {
+		/* The specified value is invalid */
 		return;
+	}
 
 	crash_size = PAGE_ALIGN(crash_size);
 
@@ -118,8 +162,7 @@  static void __init reserve_crashkernel(void)
 	if (crash_base)
 		crash_max = crash_base + crash_size;
 
-	/* Current arm64 boot protocol requires 2MB alignment */
-	crash_base = memblock_phys_alloc_range(crash_size, SZ_2M,
+	crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN,
 					       crash_base, crash_max);
 	if (!crash_base) {
 		pr_warn("cannot allocate crashkernel (size:0x%llx)\n",
@@ -127,6 +170,11 @@  static void __init reserve_crashkernel(void)
 		return;
 	}
 
+	if (crash_low_size && reserve_crashkernel_low(crash_low_size)) {
+		memblock_phys_free(crash_base, crash_size);
+		return;
+	}
+
 	pr_info("crashkernel reserved: 0x%016llx - 0x%016llx (%lld MB)\n",
 		crash_base, crash_base + crash_size, crash_size >> 20);
 
@@ -135,6 +183,9 @@  static void __init reserve_crashkernel(void)
 	 * map. Inform kmemleak so that it won't try to access it.
 	 */
 	kmemleak_ignore_phys(crash_base);
+	if (crashk_low_res.end)
+		kmemleak_ignore_phys(crashk_low_res.start);
+
 	crashk_res.start = crash_base;
 	crashk_res.end = crash_base + crash_size - 1;
 	insert_resource(&iomem_resource, &crashk_res);