diff mbox series

[v23,3/6] arm64: kdump: Reimplement crashkernel=X

Message ID 20220505091845.167-4-thunder.leizhen@huawei.com (mailing list archive)
State New, archived
Headers show
Series support reserving crashkernel above 4G on arm64 kdump | expand

Commit Message

Leizhen (ThunderTown) May 5, 2022, 9:18 a.m. UTC
From: Chen Zhou <chenzhou10@huawei.com>

The following issues exist in arm64 kdump:
1. We use crashkernel=X to reserve crashkernel memory in the DMA zone, which
will fail when there is not enough low memory.
2. If the crashkernel is reserved above the DMA zone, the crash dump
kernel will fail to boot because there is no low memory available
for allocation.

To solve these issues, introduce crashkernel=X,[high,low].
The "crashkernel=X,high" is used to select a region above DMA zone, and
the "crashkernel=Y,low" is used to allocate specified size low memory.

Signed-off-by: Chen Zhou <chenzhou10@huawei.com>
Co-developed-by: Zhen Lei <thunder.leizhen@huawei.com>
Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
---
 arch/arm64/kernel/machine_kexec.c      |  9 +++-
 arch/arm64/kernel/machine_kexec_file.c | 12 ++++-
 arch/arm64/mm/init.c                   | 62 +++++++++++++++++++++++---
 3 files changed, 72 insertions(+), 11 deletions(-)

Comments

Catalin Marinas May 5, 2022, 5:01 p.m. UTC | #1
On Thu, May 05, 2022 at 05:18:42PM +0800, Zhen Lei wrote:
> From: Chen Zhou <chenzhou10@huawei.com>
> 
> There are following issues in arm64 kdump:
> 1. We use crashkernel=X to reserve crashkernel in DMA zone, which
> will fail when there is not enough low memory.
> 2. If reserving crashkernel above DMA zone, in this case, crash dump
> kernel will fail to boot because there is no low memory available
> for allocation.
> 
> To solve these issues, introduce crashkernel=X,[high,low].
> The "crashkernel=X,high" is used to select a region above DMA zone, and
> the "crashkernel=Y,low" is used to allocate specified size low memory.

Thanks for posting the simplified version, though the discussion with
Baoquan is still ongoing. AFAICT there is no fallback if crashkernel=
fails. The advantage with this series is cleaner code, we set the limits
during parsing and don't have to adjust them if some of the first
allocation failed.

> diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
> index 51863f1448c6989..11406f3e1443168 100644
> --- a/arch/arm64/mm/init.c
> +++ b/arch/arm64/mm/init.c
> @@ -90,6 +90,32 @@ phys_addr_t __ro_after_init arm64_dma_phys_limit;
>  phys_addr_t __ro_after_init arm64_dma_phys_limit = PHYS_MASK + 1;
>  #endif
>  
> +/* Current arm64 boot protocol requires 2MB alignment */
> +#define CRASH_ALIGN			SZ_2M
> +
> +#define CRASH_ADDR_LOW_MAX		arm64_dma_phys_limit
> +#define CRASH_ADDR_HIGH_MAX		memblock.current_limit

Better use memblock_get_current_limit() if you need to or just
MEMBLOCK_ALLOC_ANYWHERE, memblock.current_limit is just a memblock
internal. But I think we can go for (PHYS_MASK + 1) if you need
something other than MEMBLOCK_ALLOC_ANYWHERE, memblock knows what to
allocate anyway.

> +static int __init reserve_crashkernel_low(unsigned long long low_size)
> +{
> +	unsigned long long low_base;
> +
> +	low_base = memblock_phys_alloc_range(low_size, CRASH_ALIGN, 0, CRASH_ADDR_LOW_MAX);
> +	if (!low_base) {
> +		pr_err("cannot allocate crashkernel low memory (size:0x%llx).\n", low_size);
> +		return -ENOMEM;
> +	}
> +
> +	pr_info("crashkernel low memory reserved: 0x%08llx - 0x%08llx (%lld MB)\n",
> +		low_base, low_base + low_size, low_size >> 20);
> +
> +	crashk_low_res.start = low_base;
> +	crashk_low_res.end   = low_base + low_size - 1;
> +	insert_resource(&iomem_resource, &crashk_low_res);
> +
> +	return 0;
> +}
> +
>  /*
>   * reserve_crashkernel() - reserves memory for crash kernel
>   *
> @@ -100,17 +126,32 @@ phys_addr_t __ro_after_init arm64_dma_phys_limit = PHYS_MASK + 1;
>  static void __init reserve_crashkernel(void)
>  {
>  	unsigned long long crash_base, crash_size;
> -	unsigned long long crash_max = arm64_dma_phys_limit;
> +	unsigned long long crash_low_size = 0;
> +	unsigned long long crash_max = CRASH_ADDR_LOW_MAX;
> +	char *cmdline = boot_command_line;
>  	int ret;
>  
>  	if (!IS_ENABLED(CONFIG_KEXEC_CORE))
>  		return;
>  
> -	ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
> +	/* crashkernel=X[@offset] */
> +	ret = parse_crashkernel(cmdline, memblock_phys_mem_size(),
>  				&crash_size, &crash_base);
> -	/* no crashkernel= or invalid value specified */
> -	if (ret || !crash_size)
> -		return;
> +	if (ret || !crash_size) {

I think we should check for ret == -ENOENT only. If the crashkernel=
exists but is malformed or the size is 0, we shouldn't bother with
high/low at all.

> +		ret = parse_crashkernel_high(cmdline, 0, &crash_size, &crash_base);
> +		if (ret || !crash_size)
> +			return;
> +
> +		/*
> +		 * crashkernel=Y,low can be specified or not, but invalid value
> +		 * is not allowed.
> +		 */
> +		ret = parse_crashkernel_low(cmdline, 0, &crash_low_size, &crash_base);
> +		if (ret && (ret != -ENOENT))
> +			return;
> +
> +		crash_max = CRASH_ADDR_HIGH_MAX;
> +	}
>  
>  	crash_size = PAGE_ALIGN(crash_size);
>  
> @@ -118,8 +159,7 @@ static void __init reserve_crashkernel(void)
>  	if (crash_base)
>  		crash_max = crash_base + crash_size;
>  
> -	/* Current arm64 boot protocol requires 2MB alignment */
> -	crash_base = memblock_phys_alloc_range(crash_size, SZ_2M,
> +	crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN,
>  					       crash_base, crash_max);
>  	if (!crash_base) {
>  		pr_warn("cannot allocate crashkernel (size:0x%llx)\n",

I personally like this but let's see how the other thread goes. I guess
if we want a fallback, it would come just before the check above:

	if (!crash_base && crash_max != CRASH_ADDR_HIGH_MAX) {
		/* attempt high allocation with default low */
		if (!crash_low_size)
			crash_low_size = some default;
		crash_max = CRASH_ADDR_LOW_MAX;
		crash_base = memblock_phys_alloc_range();
	}

Well, I guess we end up with your earlier proposal but I think I
understand it better now ;).
Leizhen (ThunderTown) May 6, 2022, 3:22 a.m. UTC | #2
On 2022/5/6 1:01, Catalin Marinas wrote:
> On Thu, May 05, 2022 at 05:18:42PM +0800, Zhen Lei wrote:
>> From: Chen Zhou <chenzhou10@huawei.com>
>>
>> There are following issues in arm64 kdump:
>> 1. We use crashkernel=X to reserve crashkernel in DMA zone, which
>> will fail when there is not enough low memory.
>> 2. If reserving crashkernel above DMA zone, in this case, crash dump
>> kernel will fail to boot because there is no low memory available
>> for allocation.
>>
>> To solve these issues, introduce crashkernel=X,[high,low].
>> The "crashkernel=X,high" is used to select a region above DMA zone, and
>> the "crashkernel=Y,low" is used to allocate specified size low memory.
> 
> Thanks for posting the simplified version, though the discussion with
> Baoquan is still ongoing. AFAICT there is no fallback if crashkernel=
> fails. The advantage with this series is cleaner code, we set the limits
> during parsing and don't have to adjust them if some of the first
> allocation failed.

Yes, I'm currently implementing it in the simplest version, providing only
the most basic functions, because the conclusions of this part of the discussion
are clear. I think I can send the fallback, default low size, and mapping optimization
patches separately after this basic version is merged. These three functions can
be discussed separately.

> 
>> diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
>> index 51863f1448c6989..11406f3e1443168 100644
>> --- a/arch/arm64/mm/init.c
>> +++ b/arch/arm64/mm/init.c
>> @@ -90,6 +90,32 @@ phys_addr_t __ro_after_init arm64_dma_phys_limit;
>>  phys_addr_t __ro_after_init arm64_dma_phys_limit = PHYS_MASK + 1;
>>  #endif
>>  
>> +/* Current arm64 boot protocol requires 2MB alignment */
>> +#define CRASH_ALIGN			SZ_2M
>> +
>> +#define CRASH_ADDR_LOW_MAX		arm64_dma_phys_limit
>> +#define CRASH_ADDR_HIGH_MAX		memblock.current_limit
> 
> Better use memblock_get_current_limit() if you need to or just
> MEMBLOCK_ALLOC_ANYWHERE, memblock.current_limit is just a memblock
> internal. But I think we can go for (PHYS_MASK + 1) if you need
> something other than MEMBLOCK_ALLOC_ANYWHERE, memblock knows what to
> allocate anyway.

Yes, it would be better to use (PHYS_MASK + 1).

> 
>> +static int __init reserve_crashkernel_low(unsigned long long low_size)
>> +{
>> +	unsigned long long low_base;
>> +
>> +	low_base = memblock_phys_alloc_range(low_size, CRASH_ALIGN, 0, CRASH_ADDR_LOW_MAX);
>> +	if (!low_base) {
>> +		pr_err("cannot allocate crashkernel low memory (size:0x%llx).\n", low_size);
>> +		return -ENOMEM;
>> +	}
>> +
>> +	pr_info("crashkernel low memory reserved: 0x%08llx - 0x%08llx (%lld MB)\n",
>> +		low_base, low_base + low_size, low_size >> 20);
>> +
>> +	crashk_low_res.start = low_base;
>> +	crashk_low_res.end   = low_base + low_size - 1;
>> +	insert_resource(&iomem_resource, &crashk_low_res);
>> +
>> +	return 0;
>> +}
>> +
>>  /*
>>   * reserve_crashkernel() - reserves memory for crash kernel
>>   *
>> @@ -100,17 +126,32 @@ phys_addr_t __ro_after_init arm64_dma_phys_limit = PHYS_MASK + 1;
>>  static void __init reserve_crashkernel(void)
>>  {
>>  	unsigned long long crash_base, crash_size;
>> -	unsigned long long crash_max = arm64_dma_phys_limit;
>> +	unsigned long long crash_low_size = 0;
>> +	unsigned long long crash_max = CRASH_ADDR_LOW_MAX;
>> +	char *cmdline = boot_command_line;
>>  	int ret;
>>  
>>  	if (!IS_ENABLED(CONFIG_KEXEC_CORE))
>>  		return;
>>  
>> -	ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
>> +	/* crashkernel=X[@offset] */
>> +	ret = parse_crashkernel(cmdline, memblock_phys_mem_size(),
>>  				&crash_size, &crash_base);
>> -	/* no crashkernel= or invalid value specified */
>> -	if (ret || !crash_size)
>> -		return;
>> +	if (ret || !crash_size) {
> 
> I think we should check for ret == -ENOENT only. If the crashkernel=
> exists but is malformed or the size is 0, we shouldn't bother with
> high/low at all.

That's right.

> 
>> +		ret = parse_crashkernel_high(cmdline, 0, &crash_size, &crash_base);
>> +		if (ret || !crash_size)
>> +			return;
>> +
>> +		/*
>> +		 * crashkernel=Y,low can be specified or not, but invalid value
>> +		 * is not allowed.
>> +		 */
>> +		ret = parse_crashkernel_low(cmdline, 0, &crash_low_size, &crash_base);
>> +		if (ret && (ret != -ENOENT))
>> +			return;
>> +
>> +		crash_max = CRASH_ADDR_HIGH_MAX;
>> +	}
>>  
>>  	crash_size = PAGE_ALIGN(crash_size);
>>  
>> @@ -118,8 +159,7 @@ static void __init reserve_crashkernel(void)
>>  	if (crash_base)
>>  		crash_max = crash_base + crash_size;
>>  
>> -	/* Current arm64 boot protocol requires 2MB alignment */
>> -	crash_base = memblock_phys_alloc_range(crash_size, SZ_2M,
>> +	crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN,
>>  					       crash_base, crash_max);
>>  	if (!crash_base) {
>>  		pr_warn("cannot allocate crashkernel (size:0x%llx)\n",
> 
> I personally like this but let's see how the other thread goes. I guess

Me too. This fallback complicates the code logic more than just a little.
I'm not sure why someone would rather add a fallback than change the boot
options to crashkernel=X,[high|low]. Perhaps falling back to high/low is a more
compatible and extensible mode when crashkernel=X fails to reserve memory, and
the code logic would be much clearer.

//parse crashkernel=X		//To simplify the discussion, Ignore [@offset]
crash_base = memblock_phys_alloc_range()
if (!crash_base || /* crashkernel=X is not specified */) {
	//parse crashkernel=X,[high,low]
	//reserve high/low memory
}

So that, the following three modes are supported:
1) crashkernel=X[@offset]
2) crashkernel=X,high crashkernel=X,low
3) crashkernel=X[@offset] crashkernel=X,high [crashkernel=Y,low]

For case 3), try "crashkernel=X[@offset]" first; if that does not work, fall back
to "crashkernel=X,high crashkernel=X,low". This looks better than the old "crashkernel=X"
fallback, i.e. select a region under 4G first, and fall back to reserving a region above 4G.

Note: when the X of crashkernel=X and crashkernel=X,high are the same, it's equivalent
to the old "crashkernel=X" fallback.

> if we want a fallback, it would come just before the check the above:
> 
> 	if (!crash_base && crash_max != CRASH_ADDR_HIGH_MAX) {
> 		/* attempt high allocation with default low */
> 		if (!crash_low_size)
> 			crash_low_size = some default;
> 		crash_max = CRASH_ADDR_LOW_MAX;

crash_max = CRASH_ADDR_HIGH_MAX; We should fallback to high memory now.

> 		crash_base = memblock_phys_alloc_range();
> 	}
> 
> Well, I guess we end up with your earlier proposal but I think I
> understand it better now ;).
>
Catalin Marinas May 6, 2022, 11:06 a.m. UTC | #3
On Fri, May 06, 2022 at 11:22:51AM +0800, Leizhen (ThunderTown) wrote:
> On 2022/5/6 1:01, Catalin Marinas wrote:
> > On Thu, May 05, 2022 at 05:18:42PM +0800, Zhen Lei wrote:
> >> From: Chen Zhou <chenzhou10@huawei.com>
> >>
> >> There are following issues in arm64 kdump:
> >> 1. We use crashkernel=X to reserve crashkernel in DMA zone, which
> >> will fail when there is not enough low memory.
> >> 2. If reserving crashkernel above DMA zone, in this case, crash dump
> >> kernel will fail to boot because there is no low memory available
> >> for allocation.
> >>
> >> To solve these issues, introduce crashkernel=X,[high,low].
> >> The "crashkernel=X,high" is used to select a region above DMA zone, and
> >> the "crashkernel=Y,low" is used to allocate specified size low memory.
> > 
> > Thanks for posting the simplified version, though the discussion with
> > Baoquan is still ongoing. AFAICT there is no fallback if crashkernel=
> > fails. The advantage with this series is cleaner code, we set the limits
> > during parsing and don't have to adjust them if some of the first
> > allocation failed.
> 
> Yes, I'm currently implementing it in the simplest version, providing only
> the most basic functions. Because the conclusions of this part of the discussion
> are clear. I think I can send the fallback, default low size, and mapping optimization
> patches separately after this basic version is merged. These three functions can
> be discussed separately.

This works for me. If we decide to go for fallbacks, it can be done as a
separate patch.

> >> +		ret = parse_crashkernel_high(cmdline, 0, &crash_size, &crash_base);
> >> +		if (ret || !crash_size)
> >> +			return;
> >> +
> >> +		/*
> >> +		 * crashkernel=Y,low can be specified or not, but invalid value
> >> +		 * is not allowed.
> >> +		 */
> >> +		ret = parse_crashkernel_low(cmdline, 0, &crash_low_size, &crash_base);
> >> +		if (ret && (ret != -ENOENT))
> >> +			return;
> >> +
> >> +		crash_max = CRASH_ADDR_HIGH_MAX;
> >> +	}
> >>  
> >>  	crash_size = PAGE_ALIGN(crash_size);
> >>  
> >> @@ -118,8 +159,7 @@ static void __init reserve_crashkernel(void)
> >>  	if (crash_base)
> >>  		crash_max = crash_base + crash_size;
> >>  
> >> -	/* Current arm64 boot protocol requires 2MB alignment */
> >> -	crash_base = memblock_phys_alloc_range(crash_size, SZ_2M,
> >> +	crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN,
> >>  					       crash_base, crash_max);
> >>  	if (!crash_base) {
> >>  		pr_warn("cannot allocate crashkernel (size:0x%llx)\n",
> > 
> > I personally like this but let's see how the other thread goes. I guess
> 
> Me too. This fallback complicates code logic more than just a little.
> I'm not sure why someone would rather add fallback than change the bootup
> options to crashkernel=X,[high|low]. Perhaps fallback to high/low is a better
> compatible and extended mode when crashkernel=X fails to reserve memory. And
> the code logic will be much clearer.
> 
> //parse crashkernel=X		//To simplify the discussion, Ignore [@offset]
> crash_base = memblock_phys_alloc_range()
> if (!crash_base || /* crashkernel=X is not specified */) {
> 	//parse crashkernel=X,[high,low]
> 	//reserve high/low memory
> }
> 
> So that, the following three modes are supported:
> 1) crashkernel=X[@offset]
> 2) crashkernel=X,high crashkernel=X,low
> 3) crashkernel=X[@offset] crashkernel=X,high [crashkernel=Y,low]

The whole interface isn't great but if we add fall-back options, I'd
rather stick close to what x86 does. IOW, if crashkernel=X is provided,
ignore explicit high/low (so 3 does not exist).

(if I had added it from the beginning, I'd have removed 'high'
completely and allow crashkernel=X to fall-back to 'high' with an
optional explicit 'low' or 'dma' if the default is not sufficient; but I
think there's too much bikeshedding already)

> > if we want a fallback, it would come just before the check the above:
> > 
> > 	if (!crash_base && crash_max != CRASH_ADDR_HIGH_MAX) {
> > 		/* attempt high allocation with default low */
> > 		if (!crash_low_size)
> > 			crash_low_size = some default;
> > 		crash_max = CRASH_ADDR_LOW_MAX;
> 
> crash_max = CRASH_ADDR_HIGH_MAX; We should fallback to high memory now.

Yes, that's the idea.

Anyway, please post the current series with the minor updates I
mentioned and we can add a fallback patch (or two) on top.

Thanks.
Leizhen (ThunderTown) May 6, 2022, 12:35 p.m. UTC | #4
On 2022/5/6 19:06, Catalin Marinas wrote:
> On Fri, May 06, 2022 at 11:22:51AM +0800, Leizhen (ThunderTown) wrote:
>> On 2022/5/6 1:01, Catalin Marinas wrote:
>>> On Thu, May 05, 2022 at 05:18:42PM +0800, Zhen Lei wrote:
>>>> From: Chen Zhou <chenzhou10@huawei.com>
>>>>
>>>> There are following issues in arm64 kdump:
>>>> 1. We use crashkernel=X to reserve crashkernel in DMA zone, which
>>>> will fail when there is not enough low memory.
>>>> 2. If reserving crashkernel above DMA zone, in this case, crash dump
>>>> kernel will fail to boot because there is no low memory available
>>>> for allocation.
>>>>
>>>> To solve these issues, introduce crashkernel=X,[high,low].
>>>> The "crashkernel=X,high" is used to select a region above DMA zone, and
>>>> the "crashkernel=Y,low" is used to allocate specified size low memory.
>>>
>>> Thanks for posting the simplified version, though the discussion with
>>> Baoquan is still ongoing. AFAICT there is no fallback if crashkernel=
>>> fails. The advantage with this series is cleaner code, we set the limits
>>> during parsing and don't have to adjust them if some of the first
>>> allocation failed.
>>
>> Yes, I'm currently implementing it in the simplest version, providing only
>> the most basic functions. Because the conclusions of this part of the discussion
>> are clear. I think I can send the fallback, default low size, and mapping optimization
>> patches separately after this basic version is merged. These three functions can
>> be discussed separately.
> 
> This works for me. If we decide to go for fallbacks, it can be done as a
> separate patch.
> 
>>>> +		ret = parse_crashkernel_high(cmdline, 0, &crash_size, &crash_base);
>>>> +		if (ret || !crash_size)
>>>> +			return;
>>>> +
>>>> +		/*
>>>> +		 * crashkernel=Y,low can be specified or not, but invalid value
>>>> +		 * is not allowed.
>>>> +		 */
>>>> +		ret = parse_crashkernel_low(cmdline, 0, &crash_low_size, &crash_base);
>>>> +		if (ret && (ret != -ENOENT))
>>>> +			return;
>>>> +
>>>> +		crash_max = CRASH_ADDR_HIGH_MAX;
>>>> +	}
>>>>  
>>>>  	crash_size = PAGE_ALIGN(crash_size);
>>>>  
>>>> @@ -118,8 +159,7 @@ static void __init reserve_crashkernel(void)
>>>>  	if (crash_base)
>>>>  		crash_max = crash_base + crash_size;
>>>>  
>>>> -	/* Current arm64 boot protocol requires 2MB alignment */
>>>> -	crash_base = memblock_phys_alloc_range(crash_size, SZ_2M,
>>>> +	crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN,
>>>>  					       crash_base, crash_max);
>>>>  	if (!crash_base) {
>>>>  		pr_warn("cannot allocate crashkernel (size:0x%llx)\n",
>>>
>>> I personally like this but let's see how the other thread goes. I guess
>>
>> Me too. This fallback complicates code logic more than just a little.
>> I'm not sure why someone would rather add fallback than change the bootup
>> options to crashkernel=X,[high|low]. Perhaps fallback to high/low is a better
>> compatible and extended mode when crashkernel=X fails to reserve memory. And
>> the code logic will be much clearer.
>>
>> //parse crashkernel=X		//To simplify the discussion, Ignore [@offset]
>> crash_base = memblock_phys_alloc_range()
>> if (!crash_base || /* crashkernel=X is not specified */) {
>> 	//parse crashkernel=X,[high,low]
>> 	//reserve high/low memory
>> }
>>
>> So that, the following three modes are supported:
>> 1) crashkernel=X[@offset]
>> 2) crashkernel=X,high crashkernel=X,low
>> 3) crashkernel=X[@offset] crashkernel=X,high [crashkernel=Y,low]
> 
> The whole interface isn't great but if we add fall-back options, I'd
> rather stick close to what x86 does. IOW, if crashkernel=X is provided,
> ignore explicit high/low (so 3 does not exist).
> 
> (if I had added it from the beginning, I'd have removed 'high'
> completely and allow crashkernel=X to fall-back to 'high' with an
> optional explicit 'low' or 'dma' if the default is not sufficient; but I

Er, my idea almost coincides with yours. When 3) removes 'high', it's the same
way you think. Of course, I haven't thought of deleting 'high' yet. So your
idea is more perfect.

> think there's too much bikeshedding already)

Yeah, the oldest prince has royal power. There's no choice now.

> 
>>> if we want a fallback, it would come just before the check the above:
>>>
>>> 	if (!crash_base && crash_max != CRASH_ADDR_HIGH_MAX) {
>>> 		/* attempt high allocation with default low */
>>> 		if (!crash_low_size)
>>> 			crash_low_size = some default;
>>> 		crash_max = CRASH_ADDR_LOW_MAX;
>>
>> crash_max = CRASH_ADDR_HIGH_MAX; We should fallback to high memory now.
> 
> Yes, that's the idea.
> 
> Anyway, please post the current series with the minor updates I
> mentioned and we can add a fallback patch (or two) on top.
> 
> Thanks.
>
Baoquan He May 6, 2022, 1:16 p.m. UTC | #5
On 05/06/22 at 11:22am, Leizhen (ThunderTown) wrote:
......  
> >> @@ -118,8 +159,7 @@ static void __init reserve_crashkernel(void)
> >>  	if (crash_base)
> >>  		crash_max = crash_base + crash_size;
> >>  
> >> -	/* Current arm64 boot protocol requires 2MB alignment */
> >> -	crash_base = memblock_phys_alloc_range(crash_size, SZ_2M,
> >> +	crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN,
> >>  					       crash_base, crash_max);
> >>  	if (!crash_base) {
> >>  		pr_warn("cannot allocate crashkernel (size:0x%llx)\n",
> > 
> > I personally like this but let's see how the other thread goes. I guess
> 
> Me too. This fallback complicates code logic more than just a little.
> I'm not sure why someone would rather add fallback than change the bootup
> options to crashkernel=X,[high|low]. Perhaps fallback to high/low is a better
> compatible and extended mode when crashkernel=X fails to reserve memory. And
> the code logic will be much clearer.

The fallback does complicate the code; it was not there from the
beginning, but was added later. The original crashkernel=xM could only reserve
low memory under 896M on x86, to stay backward compatible with the case in which
the normal kernel is x86_64 while the kdump kernel could be i386. Then customers
asked why crashkernel=xM can't be put anywhere, so that they don't
need to know the details of limited low memory and huge high memory
in the system.

The implementation of the fallback is truly complicated, but its use is
quite simple, and it makes the crashkernel reservation setting simple.
Most users don't need to know about the crashkernel=,high and ,low options, unless
the crashkernel region is too big. Nobody wants to take away 1G or more
from low memory for kdump just in case a bad thing happens, while the normal
kernel itself is seriously impacted by limited low memory.

> 
> //parse crashkernel=X		//To simplify the discussion, Ignore [@offset]
> crash_base = memblock_phys_alloc_range()
> if (!crash_base || /* crashkernel=X is not specified */) {
> 	//parse crashkernel=X,[high,low]
> 	//reserve high/low memory
> }
> 
> So that, the following three modes are supported:
> 1) crashkernel=X[@offset]
> 2) crashkernel=X,high crashkernel=X,low
> 3) crashkernel=X[@offset] crashkernel=X,high [crashkernel=Y,low]
> 
> For case 3), try "crashkernel=X[@offset]" first, if it can not work, fallback
> to "crashkernel=X,high crashkernel=X,low". This looks better than the old "crashkernel=X"
> fallback ---- Select a region under 4G first, and fall back to reserve region above 4G.

Don't get it. Aren't they the same?

> 
> Note: when the X of crashkernel=X and crashkernel=X,high are the same, It's equivalent
> to the old "crashkernel=X" fallback.
> 
> > if we want a fallback, it would come just before the check the above:
> > 
> > 	if (!crash_base && crash_max != CRASH_ADDR_HIGH_MAX) {
> > 		/* attempt high allocation with default low */
> > 		if (!crash_low_size)
> > 			crash_low_size = some default;
> > 		crash_max = CRASH_ADDR_LOW_MAX;
> 
> crash_max = CRASH_ADDR_HIGH_MAX; We should fallback to high memory now.
> 
> > 		crash_base = memblock_phys_alloc_range();
> > 	}
> > 
> > Well, I guess we end up with your earlier proposal but I think I
> > understand it better now ;).
> > 
> 
> -- 
> Regards,
>   Zhen Lei
>
Catalin Marinas May 6, 2022, 5:45 p.m. UTC | #6
On Fri, May 06, 2022 at 09:16:08PM +0800, Baoquan He wrote:
> On 05/06/22 at 11:22am, Leizhen (ThunderTown) wrote:
> ......  
> > >> @@ -118,8 +159,7 @@ static void __init reserve_crashkernel(void)
> > >>  	if (crash_base)
> > >>  		crash_max = crash_base + crash_size;
> > >>  
> > >> -	/* Current arm64 boot protocol requires 2MB alignment */
> > >> -	crash_base = memblock_phys_alloc_range(crash_size, SZ_2M,
> > >> +	crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN,
> > >>  					       crash_base, crash_max);
> > >>  	if (!crash_base) {
> > >>  		pr_warn("cannot allocate crashkernel (size:0x%llx)\n",
> > > 
> > > I personally like this but let's see how the other thread goes. I guess
> > 
> > Me too. This fallback complicates code logic more than just a little.
> > I'm not sure why someone would rather add fallback than change the bootup
> > options to crashkernel=X,[high|low]. Perhaps fallback to high/low is a better
> > compatible and extended mode when crashkernel=X fails to reserve memory. And
> > the code logic will be much clearer.
> 
> The fallback does complicates code, while it was not made at the
> beginning, but added later. The original crahskernel=xM can only reserve
> low memory under 896M on x86 to be back compatible with the case in which
> normal kernel is x86_64, while kdump kernel could be i386. Then customer
> complained why crashkernel=xM can't be put anywhere so that they don't
> need to know the details of limited low memory and huge high memory fact 
> in system.
> 
> The implementation of fallback is truly complicated, but its use is
> quite simple. And it makes crashkernel reservation setting simple.
> Most of users don't need to know crashkernel=,high, ,low things, unless
> the crashkernel region is too big. Nobody wants to take away 1G or more
> from low memory for kdump just in case bad thing happens, while normal
> kernel itself is seriously impacted by limited low memory.

IIUC, that's exactly what happens even on x86, it may take away a
significant chunk of the low memory. Let's say we have 1.2GB of 'low'
memory (below 4GB) on an arm64 platform. A crashkernel=1G would succeed
in a low allocation, pretty much affecting the whole system. It would
only fall back to 'high' _if_ you pass something like crashkernel=1.2G
so that the low allocation fails. So if I got this right, I find the
fall-back from crashkernel=X pretty useless, we shouldn't even try it.

It makes more sense if crashkernel=X,high is a hint to attempt a high
allocation first with a default low (overridden by a ,low option) or
even fall-back to low if there's no memory above 4GB.

Could you please have a look at Zhen Lei's latest series without any
fall-backs? I'd like to queue that if you are happy with it. We can then
look at adding some fall-back options on top.

IMO, we should only aim for:

	crashkernel=X		ZONE_DMA allocation, no fall-back
	crashkernel=X,high	hint for high allocation, small default
				low, fall back to low if alloc fails
	crashkernel=X,low	control the default low allocation, only
				high is passed

With the above, I'd expect admins to just go for crashkernel=X,high on
modern hardware with up to date kexec tools and it does the right thing.
The crashkernel=X can lead to unexpected results if it eats up all the
low memory. Let's say this option is for backwards compatibility only.

Thanks.
Baoquan He May 7, 2022, 10:45 a.m. UTC | #7
On 05/06/22 at 06:45pm, Catalin Marinas wrote:
> On Fri, May 06, 2022 at 09:16:08PM +0800, Baoquan He wrote:
> > On 05/06/22 at 11:22am, Leizhen (ThunderTown) wrote:
> > ......  
> > > >> @@ -118,8 +159,7 @@ static void __init reserve_crashkernel(void)
> > > >>  	if (crash_base)
> > > >>  		crash_max = crash_base + crash_size;
> > > >>  
> > > >> -	/* Current arm64 boot protocol requires 2MB alignment */
> > > >> -	crash_base = memblock_phys_alloc_range(crash_size, SZ_2M,
> > > >> +	crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN,
> > > >>  					       crash_base, crash_max);
> > > >>  	if (!crash_base) {
> > > >>  		pr_warn("cannot allocate crashkernel (size:0x%llx)\n",
> > > > 
> > > > I personally like this but let's see how the other thread goes. I guess
> > > 
> > > Me too. This fallback complicates code logic more than just a little.
> > > I'm not sure why someone would rather add fallback than change the bootup
> > > options to crashkernel=X,[high|low]. Perhaps fallback to high/low is a better
> > > compatible and extended mode when crashkernel=X fails to reserve memory. And
> > > the code logic will be much clearer.
> > 
> > The fallback does complicates code, while it was not made at the
> > beginning, but added later. The original crahskernel=xM can only reserve
> > low memory under 896M on x86 to be back compatible with the case in which
> > normal kernel is x86_64, while kdump kernel could be i386. Then customer
> > complained why crashkernel=xM can't be put anywhere so that they don't
> > need to know the details of limited low memory and huge high memory fact 
> > in system.
> > 
> > The implementation of fallback is truly complicated, but its use is
> > quite simple. And it makes crashkernel reservation setting simple.
> > Most of users don't need to know crashkernel=,high, ,low things, unless
> > the crashkernel region is too big. Nobody wants to take away 1G or more
> > from low memory for kdump just in case bad thing happens, while normal
> > kernel itself is seriously impacted by limited low memory.
> 
> IIUC, that's exactly what happens even on x86, it may take away a
> significant chunk of the low memory. Let's say we have 1.2GB of 'low'
> memory (below 4GB) on an arm64 platform. A crashkernel=1G would succeed
> in a low allocation, pretty much affecting the whole system. It would
> only fall back to 'high' _if_ you pass something like crashkernel=1.2G
> so that the low allocation fails. So if I got this right, I find the
> fall-back from crashkernel=X pretty useless, we shouldn't even try it.

Most of the time, it's not easy to get 1G of contiguous low memory. On x86,
firmware is mapped into the low 4G virt address range, and system initialization
will take some of it too. On arm64 it's hard too, e.g. the physical
memory may start at the 1G or 2G position, firmware needs to be mapped
under 4G too, and kernel initialization has its own cost. We are also eager to see
crashkernel=,high support because we got a bug report where, on an arm64
server, the physical memory is scattered under the low 4G so that the
biggest contiguous region is less than 300M. (Not sure if it's a prototype
machine, so I would not say its name in public.) In this case, we need
the fallback implementation to make our default crashkernel=xM setting
succeed in getting the required memory from above 4G.

So from our experience and the feedback given by customers, crashkernel=xM as
a default setting is the first choice: it is very easy to use and can
satisfy 99% of needs. If a big crashkernel reservation is required,
considering that low memory is limited and precious while most of the time
high memory is huge, crashkernel=,high is recommended. The price is about
200M or less of memory for DMA, however much high memory is required, 2G
or more. Believe me, this kind of big memory requirement happens on very
few machines, because the vmcore dumping tool makedumpfile uses the cyclic
buffer method by default, which does not require much memory, unless the user
has their own dumping tool or another dumping method which requires a lot of
memory.

crashkernel=xM, judging both by its name and by the actual need, had
better get the fallback mechanism so that it can be placed anywhere.

> 
> It makes more sense if crashkernel=X,high is a hint to attempt a high
> allocation first with a default low (overridden by a ,low option) or
> even fall-back to low if there's no memory above 4GB.

Hmm, maybe not so much. Please also consider that high-end servers usually
carry tons of devices, and rebooting them takes half an hour or even
longer. Imagine a lab with hundreds of servers: a single round of OS upgrades
would need to attempt the high allocation first on each machine and then decide
what to set. That would drive the operator mad. So we give them the simplest
way, crashkernel=xM, to make it work. If you want to optimize the memory
usage and you know each system well, then please use crashkernel=,high
and crashkernel=,low to do so.

In our distros, the policy is that if the default crashkernel=xM setting from
OS installation doesn't work well, e.g. OOM or reserving so much memory
that it is wasted, a bug can be reported. If crashkernel=,high and
crashkernel=,low don't work well, you have to sort it out yourself.

> 
> Could you please have a look at Zhen Lei's latest series without any
> fall-backs? I'd like to queue that if you are happy with it. We can then
> look at adding some fall-back options on top.

I am fine with the v24, except for the corner case I pointed out. I
personally suggest merging the v24 series, then fixing the corner case
and adding the fallback on top, step by step.

Thanks
Baoquan

> 
> IMO, we should only aim for:
> 
> 	crashkernel=X		ZONE_DMA allocation, no fall-back
> 	crashkernel=X,high	hint for high allocation, small default
> 				low, fall back to low if alloc fails
> 	crashkernel=X,low	control the default low allocation, only
> 				high is passed
> 
> With the above, I'd expect admins to just go for crashkernel=X,high on
> modern hardware with up to date kexec tools and it does the right thing.
> The crashkernel=X can lead to unexpected results if it eats up all the
> low memory. Let's say this option is for backwards compatibility only.
> 
> Thanks.
> 
> -- 
> Catalin
>
diff mbox series

Patch

diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c
index e16b248699d5c3c..19c2d487cb08feb 100644
--- a/arch/arm64/kernel/machine_kexec.c
+++ b/arch/arm64/kernel/machine_kexec.c
@@ -329,8 +329,13 @@  bool crash_is_nosave(unsigned long pfn)
 
 	/* in reserved memory? */
 	addr = __pfn_to_phys(pfn);
-	if ((addr < crashk_res.start) || (crashk_res.end < addr))
-		return false;
+	if ((addr < crashk_res.start) || (crashk_res.end < addr)) {
+		if (!crashk_low_res.end)
+			return false;
+
+		if ((addr < crashk_low_res.start) || (crashk_low_res.end < addr))
+			return false;
+	}
 
 	if (!kexec_crash_image)
 		return true;
diff --git a/arch/arm64/kernel/machine_kexec_file.c b/arch/arm64/kernel/machine_kexec_file.c
index 59c648d51848886..889951291cc0f9c 100644
--- a/arch/arm64/kernel/machine_kexec_file.c
+++ b/arch/arm64/kernel/machine_kexec_file.c
@@ -65,10 +65,18 @@  static int prepare_elf_headers(void **addr, unsigned long *sz)
 
 	/* Exclude crashkernel region */
 	ret = crash_exclude_mem_range(cmem, crashk_res.start, crashk_res.end);
+	if (ret)
+		goto out;
+
+	if (crashk_low_res.end) {
+		ret = crash_exclude_mem_range(cmem, crashk_low_res.start, crashk_low_res.end);
+		if (ret)
+			goto out;
+	}
 
-	if (!ret)
-		ret =  crash_prepare_elf64_headers(cmem, true, addr, sz);
+	ret = crash_prepare_elf64_headers(cmem, true, addr, sz);
 
+out:
 	kfree(cmem);
 	return ret;
 }
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 51863f1448c6989..11406f3e1443168 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -90,6 +90,32 @@  phys_addr_t __ro_after_init arm64_dma_phys_limit;
 phys_addr_t __ro_after_init arm64_dma_phys_limit = PHYS_MASK + 1;
 #endif
 
+/* Current arm64 boot protocol requires 2MB alignment */
+#define CRASH_ALIGN			SZ_2M
+
+#define CRASH_ADDR_LOW_MAX		arm64_dma_phys_limit
+#define CRASH_ADDR_HIGH_MAX		memblock.current_limit
+
+static int __init reserve_crashkernel_low(unsigned long long low_size)
+{
+	unsigned long long low_base;
+
+	low_base = memblock_phys_alloc_range(low_size, CRASH_ALIGN, 0, CRASH_ADDR_LOW_MAX);
+	if (!low_base) {
+		pr_err("cannot allocate crashkernel low memory (size:0x%llx).\n", low_size);
+		return -ENOMEM;
+	}
+
+	pr_info("crashkernel low memory reserved: 0x%08llx - 0x%08llx (%lld MB)\n",
+		low_base, low_base + low_size, low_size >> 20);
+
+	crashk_low_res.start = low_base;
+	crashk_low_res.end   = low_base + low_size - 1;
+	insert_resource(&iomem_resource, &crashk_low_res);
+
+	return 0;
+}
+
 /*
  * reserve_crashkernel() - reserves memory for crash kernel
  *
@@ -100,17 +126,32 @@  phys_addr_t __ro_after_init arm64_dma_phys_limit = PHYS_MASK + 1;
 static void __init reserve_crashkernel(void)
 {
 	unsigned long long crash_base, crash_size;
-	unsigned long long crash_max = arm64_dma_phys_limit;
+	unsigned long long crash_low_size = 0;
+	unsigned long long crash_max = CRASH_ADDR_LOW_MAX;
+	char *cmdline = boot_command_line;
 	int ret;
 
 	if (!IS_ENABLED(CONFIG_KEXEC_CORE))
 		return;
 
-	ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
+	/* crashkernel=X[@offset] */
+	ret = parse_crashkernel(cmdline, memblock_phys_mem_size(),
 				&crash_size, &crash_base);
-	/* no crashkernel= or invalid value specified */
-	if (ret || !crash_size)
-		return;
+	if (ret || !crash_size) {
+		ret = parse_crashkernel_high(cmdline, 0, &crash_size, &crash_base);
+		if (ret || !crash_size)
+			return;
+
+		/*
+		 * crashkernel=Y,low can be specified or not, but invalid value
+		 * is not allowed.
+		 */
+		ret = parse_crashkernel_low(cmdline, 0, &crash_low_size, &crash_base);
+		if (ret && (ret != -ENOENT))
+			return;
+
+		crash_max = CRASH_ADDR_HIGH_MAX;
+	}
 
 	crash_size = PAGE_ALIGN(crash_size);
 
@@ -118,8 +159,7 @@  static void __init reserve_crashkernel(void)
 	if (crash_base)
 		crash_max = crash_base + crash_size;
 
-	/* Current arm64 boot protocol requires 2MB alignment */
-	crash_base = memblock_phys_alloc_range(crash_size, SZ_2M,
+	crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN,
 					       crash_base, crash_max);
 	if (!crash_base) {
 		pr_warn("cannot allocate crashkernel (size:0x%llx)\n",
@@ -127,6 +167,11 @@  static void __init reserve_crashkernel(void)
 		return;
 	}
 
+	if (crash_low_size && reserve_crashkernel_low(crash_low_size)) {
+		memblock_phys_free(crash_base, crash_size);
+		return;
+	}
+
 	pr_info("crashkernel reserved: 0x%016llx - 0x%016llx (%lld MB)\n",
 		crash_base, crash_base + crash_size, crash_size >> 20);
 
@@ -135,6 +180,9 @@  static void __init reserve_crashkernel(void)
 	 * map. Inform kmemleak so that it won't try to access it.
 	 */
 	kmemleak_ignore_phys(crash_base);
+	if (crashk_low_res.end)
+		kmemleak_ignore_phys(crashk_low_res.start);
+
 	crashk_res.start = crash_base;
 	crashk_res.end = crash_base + crash_size - 1;
 	insert_resource(&iomem_resource, &crashk_res);