diff mbox

[v9,07/11] arm64: kexec_file: add crash dump support

Message ID 20180425062629.29404-8-takahiro.akashi@linaro.org (mailing list archive)
State New, archived
Headers show

Commit Message

AKASHI Takahiro April 25, 2018, 6:26 a.m. UTC
Enabling crash dump (kdump) includes
* prepare contents of ELF header of a core dump file, /proc/vmcore,
  using crash_prepare_elf64_headers(), and
* add two device tree properties, "linux,usable-memory-range" and
  "linux,elfcorehdr", which represent repsectively a memory range
  to be used by crash dump kernel and the header's location

Signed-off-by: AKASHI Takahiro <takahiro.akashi@linaro.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
---
 arch/arm64/include/asm/kexec.h         |   4 +
 arch/arm64/kernel/kexec_image.c        |   9 +-
 arch/arm64/kernel/machine_kexec_file.c | 202 +++++++++++++++++++++++++
 3 files changed, 213 insertions(+), 2 deletions(-)

Comments

James Morse May 15, 2018, 5:11 p.m. UTC | #1
Hi Akashi,

On 25/04/18 07:26, AKASHI Takahiro wrote:
> Enabling crash dump (kdump) includes
> * prepare contents of ELF header of a core dump file, /proc/vmcore,
>   using crash_prepare_elf64_headers(), and
> * add two device tree properties, "linux,usable-memory-range" and
>   "linux,elfcorehdr", which represent repsectively a memory range

(Nit: respectively)


>   to be used by crash dump kernel and the header's location

>  arch/arm64/include/asm/kexec.h         |   4 +
>  arch/arm64/kernel/kexec_image.c        |   9 +-
>  arch/arm64/kernel/machine_kexec_file.c | 202 +++++++++++++++++++++++++

In this patch, machine_kexec_file.c gains its own private fdt array encoder.


> diff --git a/arch/arm64/kernel/machine_kexec_file.c b/arch/arm64/kernel/machine_kexec_file.c
> index 37c0a9dc2e47..ec674f4d267c 100644
> --- a/arch/arm64/kernel/machine_kexec_file.c
> +++ b/arch/arm64/kernel/machine_kexec_file.c
> @@ -76,6 +81,78 @@ int arch_kexec_walk_mem(struct kexec_buf *kbuf,
>  	return ret;
>  }
>  
> +static int __init arch_kexec_file_init(void)
> +{
> +	/* Those values are used later on loading the kernel */
> +	__dt_root_addr_cells = dt_root_addr_cells;
> +	__dt_root_size_cells = dt_root_size_cells;
> +
> +	return 0;
> +}
> +late_initcall(arch_kexec_file_init);

If we need these is it worth taking them out of __initdata? I note they've been
'temporary' for quite a long time.


> +
> +#define FDT_ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))
> +#define FDT_TAGALIGN(x)	(FDT_ALIGN((x), FDT_TAGSIZE))
> +
> +static int fdt_prop_len(const char *prop_name, int len)
> +{
> +	return (strlen(prop_name) + 1) +
> +		sizeof(struct fdt_property) +
> +		FDT_TAGALIGN(len);
> +}

This stuff should really be in libfdt.h. Those macros come from
libfdt_internal.h, so we're probably doing something wrong here.


> +static bool cells_size_fitted(unsigned long base, unsigned long size)
> +{
> +	/* if *_cells >= 2, cells can hold 64-bit values anyway */
> +	if ((__dt_root_addr_cells == 1) && (base >= (1ULL << 32)))
> +		return false;
> +
> +	if ((__dt_root_size_cells == 1) && (size >= (1ULL << 32)))
> +		return false;

Using '> U32_MAX' here may be more readable.


> +	return true;
> +}
> +
> +static void fill_property(void *buf, u64 val64, int cells)
> +{
> +	u32 val32;
> +
> +	if (cells == 1) {
> +		val32 = cpu_to_fdt32((u32)val64);
> +		memcpy(buf, &val32, sizeof(val32));
> +	} else {

> +		memset(buf, 0, cells * sizeof(u32) - sizeof(u64));
> +		buf += cells * sizeof(u32) - sizeof(u64);

Is this trying to clear the 'top' cells and shuffle the pointer to point at the
'bottom' 2? I'm pretty sure this isn't endian safe.

Do we really expect a system to have #address-cells > 2?


> +		val64 = cpu_to_fdt64(val64);
> +		memcpy(buf, &val64, sizeof(val64));
> +	}
> +}
> +
> +static int fdt_setprop_range(void *fdt, int nodeoffset, const char *name,
> +				unsigned long addr, unsigned long size)

(the device-tree spec describes a 'ranges' property, which had me confused. This
is encoding a prop-encoded-array)

> +{
> +	void *buf, *prop;
> +	size_t buf_size;
> +	int result;
> +
> +	buf_size = (__dt_root_addr_cells + __dt_root_size_cells) * sizeof(u32);
> +	prop = buf = vmalloc(buf_size);

virtual memory allocation for something less than PAGE_SIZE?


> +	if (!buf)
> +		return -ENOMEM;
> +
> +	fill_property(prop, addr, __dt_root_addr_cells);
> +	prop += __dt_root_addr_cells * sizeof(u32);
> +
> +	fill_property(prop, size, __dt_root_size_cells);
> +
> +	result = fdt_setprop(fdt, nodeoffset, name, buf, buf_size);
> +
> +	vfree(buf);
> +
> +	return result;
> +}

Doesn't this stuff belong in libfdt? I guess there is no 'add array element' api
because this is the first time we've wanted to create a node with more than
key=fixed-size-value.

I don't think this belongs in arch C code. Do we have a plan for getting libfdt
to support encoding prop-arrays? Can we put it somewhere anyone else duplicating
this will find it, until we can (re)move it?

I have no idea how that happens... it looks like the devicetree list is the
place to ask.


>  static int setup_dtb(struct kimage *image,
>  		unsigned long initrd_load_addr, unsigned long initrd_len,
>  		char *cmdline, unsigned long cmdline_len,
> @@ -88,10 +165,26 @@ static int setup_dtb(struct kimage *image,
>  	int range_len;
>  	int ret;
>  
> +	/* check ranges against root's #address-cells and #size-cells */
> +	if (image->type == KEXEC_TYPE_CRASH &&
> +		(!cells_size_fitted(image->arch.elf_load_addr,
> +				image->arch.elf_headers_sz) ||
> +		 !cells_size_fitted(crashk_res.start,
> +				crashk_res.end - crashk_res.start + 1))) {
> +		pr_err("Crash memory region doesn't fit into DT's root cell sizes.\n");
> +		ret = -EINVAL;
> +		goto out_err;
> +	}

To check I've understood this properly: This can happen if the firmware provided
a DTB with 32bit address/size cells, but at least some of the memory requires 64
bit address/size cells. This could only happen on a UEFI system where the
firmware-DTB doesn't describe memory. ACPI-only systems would have the EFIstub DT.


>  	/* duplicate dt blob */
>  	buf_size = fdt_totalsize(initial_boot_params);
>  	range_len = (__dt_root_addr_cells + __dt_root_size_cells) * sizeof(u32);
>  
> +	if (image->type == KEXEC_TYPE_CRASH)
> +		buf_size += fdt_prop_len("linux,elfcorehdr", range_len)
> +				+ fdt_prop_len("linux,usable-memory-range",
> +								range_len);
> +
>  	if (initrd_load_addr)
>  		buf_size += fdt_prop_len("linux,initrd-start", sizeof(u64))
>  				+ fdt_prop_len("linux,initrd-end", sizeof(u64));
> @@ -113,6 +206,23 @@ static int setup_dtb(struct kimage *image,
>  	if (nodeoffset < 0)
>  		goto out_err;
>  
> +	if (image->type == KEXEC_TYPE_CRASH) {
> +		/* add linux,elfcorehdr */
> +		ret = fdt_setprop_range(buf, nodeoffset, "linux,elfcorehdr",
> +				image->arch.elf_load_addr,
> +				image->arch.elf_headers_sz);
> +		if (ret)
> +			goto out_err;
> +
> +		/* add linux,usable-memory-range */
> +		ret = fdt_setprop_range(buf, nodeoffset,
> +				"linux,usable-memory-range",
> +				crashk_res.start,
> +				crashk_res.end - crashk_res.start + 1);

Don't you need to add "linux,usable-memory-range" to the buf_size estimate?


> +		if (ret)
> +			goto out_err;
> +	}

> @@ -148,17 +258,109 @@ static int setup_dtb(struct kimage *image,

> +static struct crash_mem *get_crash_memory_ranges(void)
> +{
> +	unsigned int nr_ranges;
> +	struct crash_mem *cmem;
> +
> +	nr_ranges = 1; /* for exclusion of crashkernel region */
> +	walk_system_ram_res(0, -1, &nr_ranges, get_nr_ranges_callback);
> +
> +	cmem = vmalloc(sizeof(struct crash_mem) +
> +			sizeof(struct crash_mem_range) * nr_ranges);
> +	if (!cmem)
> +		return NULL;
> +
> +	cmem->max_nr_ranges = nr_ranges;
> +	cmem->nr_ranges = 0;
> +	walk_system_ram_res(0, -1, cmem, add_mem_range_callback);
> +
> +	/* Exclude crashkernel region */
> +	if (crash_exclude_mem_range(cmem, crashk_res.start, crashk_res.end)) {
> +		vfree(cmem);
> +		return NULL;
> +	}
> +
> +	return cmem;
> +}

Could this function be included in prepare_elf_headers() so that the alloc() and
free() occur together?


> +static int prepare_elf_headers(void **addr, unsigned long *sz)
> +{
> +	struct crash_mem *cmem;
> +	int ret = 0;
> +
> +	cmem = get_crash_memory_ranges();
> +	if (!cmem)
> +		return -ENOMEM;
> +
> +	ret =  crash_prepare_elf64_headers(cmem, true, addr, sz);
> +
> +	vfree(cmem);

> +	return ret;
> +}

All this is moving memory-range information from core-code's
walk_system_ram_res() into core-code's struct crash_mem, and excluding
crashk_res, which again is accessible to the core code.

It looks like this is duplicated in arch/x86 and arch/arm64 because arm64
doesn't have a second 'crashk_low_res' region, and always wants elf64, instead
of when IS_ENABLED(CONFIG_X86_64).
If we can abstract just those two, more of this could be moved to core code
where powerpc can make use of it if they want to support kdump with
kexec_file_load().

But it's getting late for cross-architecture dependencies, so let's put that on the
for-later list. (assuming there isn't a powerpc-kdump series out there adding a
third copy of this)


Thanks,

James
James Morse May 15, 2018, 5:12 p.m. UTC | #2
Hi guys,

(CC: +RobH, devicetree list)

On 25/04/18 07:26, AKASHI Takahiro wrote:
> Enabling crash dump (kdump) includes
> * prepare contents of ELF header of a core dump file, /proc/vmcore,
>   using crash_prepare_elf64_headers(), and
> * add two device tree properties, "linux,usable-memory-range" and
>   "linux,elfcorehdr", which represent repsectively a memory range
>   to be used by crash dump kernel and the header's location

kexec_file_load() on arm64 needs to be able to add a prop-encoded array to
the FDT, but there doesn't appear to be a libfdt helper to do this.

Akashi's code below adds fdt_setprop_range() to the arch code, and duplicates
bits of libfdt_internal.h to do the work.

How should this be done? I'm assuming this is something we need a new API in
libfdt.h for. How do these come about, and is there an interim step we can use
until then?

Thanks!

James

> diff --git a/arch/arm64/kernel/machine_kexec_file.c b/arch/arm64/kernel/machine_kexec_file.c
> index 37c0a9dc2e47..ec674f4d267c 100644
> --- a/arch/arm64/kernel/machine_kexec_file.c
> +++ b/arch/arm64/kernel/machine_kexec_file.c
> @@ -76,6 +81,78 @@ int arch_kexec_walk_mem(struct kexec_buf *kbuf,
>  	return ret;
>  }
>  
> +static int __init arch_kexec_file_init(void)
> +{
> +	/* Those values are used later on loading the kernel */
> +	__dt_root_addr_cells = dt_root_addr_cells;
> +	__dt_root_size_cells = dt_root_size_cells;
> +
> +	return 0;
> +}
> +late_initcall(arch_kexec_file_init);
> +
> +#define FDT_ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))
> +#define FDT_TAGALIGN(x)	(FDT_ALIGN((x), FDT_TAGSIZE))
> +
> +static int fdt_prop_len(const char *prop_name, int len)
> +{
> +	return (strlen(prop_name) + 1) +
> +		sizeof(struct fdt_property) +
> +		FDT_TAGALIGN(len);
> +}
> +
> +static bool cells_size_fitted(unsigned long base, unsigned long size)
> +{
> +	/* if *_cells >= 2, cells can hold 64-bit values anyway */
> +	if ((__dt_root_addr_cells == 1) && (base >= (1ULL << 32)))
> +		return false;
> +
> +	if ((__dt_root_size_cells == 1) && (size >= (1ULL << 32)))
> +		return false;
> +
> +	return true;
> +}
> +
> +static void fill_property(void *buf, u64 val64, int cells)
> +{
> +	u32 val32;
> +
> +	if (cells == 1) {
> +		val32 = cpu_to_fdt32((u32)val64);
> +		memcpy(buf, &val32, sizeof(val32));
> +	} else {
> +		memset(buf, 0, cells * sizeof(u32) - sizeof(u64));
> +		buf += cells * sizeof(u32) - sizeof(u64);
> +
> +		val64 = cpu_to_fdt64(val64);
> +		memcpy(buf, &val64, sizeof(val64));
> +	}
> +}
> +
> +static int fdt_setprop_range(void *fdt, int nodeoffset, const char *name,
> +				unsigned long addr, unsigned long size)
> +{
> +	void *buf, *prop;
> +	size_t buf_size;
> +	int result;
> +
> +	buf_size = (__dt_root_addr_cells + __dt_root_size_cells) * sizeof(u32);
> +	prop = buf = vmalloc(buf_size);
> +	if (!buf)
> +		return -ENOMEM;
> +
> +	fill_property(prop, addr, __dt_root_addr_cells);
> +	prop += __dt_root_addr_cells * sizeof(u32);
> +
> +	fill_property(prop, size, __dt_root_size_cells);
> +
> +	result = fdt_setprop(fdt, nodeoffset, name, buf, buf_size);
> +
> +	vfree(buf);
> +
> +	return result;
> +}
> +
>  static int setup_dtb(struct kimage *image,
>  		unsigned long initrd_load_addr, unsigned long initrd_len,
>  		char *cmdline, unsigned long cmdline_len,
> @@ -88,10 +165,26 @@ static int setup_dtb(struct kimage *image,
>  	int range_len;
>  	int ret;
>  
> +	/* check ranges against root's #address-cells and #size-cells */
> +	if (image->type == KEXEC_TYPE_CRASH &&
> +		(!cells_size_fitted(image->arch.elf_load_addr,
> +				image->arch.elf_headers_sz) ||
> +		 !cells_size_fitted(crashk_res.start,
> +				crashk_res.end - crashk_res.start + 1))) {
> +		pr_err("Crash memory region doesn't fit into DT's root cell sizes.\n");
> +		ret = -EINVAL;
> +		goto out_err;
> +	}
> +
>  	/* duplicate dt blob */
>  	buf_size = fdt_totalsize(initial_boot_params);
>  	range_len = (__dt_root_addr_cells + __dt_root_size_cells) * sizeof(u32);
>  
> +	if (image->type == KEXEC_TYPE_CRASH)
> +		buf_size += fdt_prop_len("linux,elfcorehdr", range_len)
> +				+ fdt_prop_len("linux,usable-memory-range",
> +								range_len);
> +
>  	if (initrd_load_addr)
>  		buf_size += fdt_prop_len("linux,initrd-start", sizeof(u64))
>  				+ fdt_prop_len("linux,initrd-end", sizeof(u64));
> @@ -113,6 +206,23 @@ static int setup_dtb(struct kimage *image,
>  	if (nodeoffset < 0)
>  		goto out_err;
>  
> +	if (image->type == KEXEC_TYPE_CRASH) {
> +		/* add linux,elfcorehdr */
> +		ret = fdt_setprop_range(buf, nodeoffset, "linux,elfcorehdr",
> +				image->arch.elf_load_addr,
> +				image->arch.elf_headers_sz);
> +		if (ret)
> +			goto out_err;
> +
> +		/* add linux,usable-memory-range */
> +		ret = fdt_setprop_range(buf, nodeoffset,
> +				"linux,usable-memory-range",
> +				crashk_res.start,
> +				crashk_res.end - crashk_res.start + 1);
> +		if (ret)
> +			goto out_err;
> +	}
> +
>  	/* add bootargs */
>  	if (cmdline) {
>  		ret = fdt_setprop(buf, nodeoffset, "bootargs",
James Morse May 16, 2018, 8:34 a.m. UTC | #3
Hi Akashi,

On 15/05/18 18:11, James Morse wrote:
> On 25/04/18 07:26, AKASHI Takahiro wrote:
>> Enabling crash dump (kdump) includes
>> * prepare contents of ELF header of a core dump file, /proc/vmcore,
>>   using crash_prepare_elf64_headers(), and
>> * add two device tree properties, "linux,usable-memory-range" and
>>   "linux,elfcorehdr", which represent repsectively a memory range
>>   to be used by crash dump kernel and the header's location

>> diff --git a/arch/arm64/kernel/machine_kexec_file.c b/arch/arm64/kernel/machine_kexec_file.c
>> index 37c0a9dc2e47..ec674f4d267c 100644
>> --- a/arch/arm64/kernel/machine_kexec_file.c
>> +++ b/arch/arm64/kernel/machine_kexec_file.c
>> @@ -76,6 +81,78 @@ int arch_kexec_walk_mem(struct kexec_buf *kbuf,

>> +static void fill_property(void *buf, u64 val64, int cells)
>> +{
>> +	u32 val32;
>> +
>> +	if (cells == 1) {
>> +		val32 = cpu_to_fdt32((u32)val64);
>> +		memcpy(buf, &val32, sizeof(val32));
>> +	} else {
> 
>> +		memset(buf, 0, cells * sizeof(u32) - sizeof(u64));
>> +		buf += cells * sizeof(u32) - sizeof(u64);
> 
> Is this trying to clear the 'top' cells and shuffle the pointer to point at the
> 'bottom' 2? I'm pretty sure this isn't endian safe.

It came to me at 2am: this only works on big-endian, which is exactly what you
want as that is the DT format.


> Do we really expect a system to have #address-cells > 2?


Thanks,

James
James Morse May 16, 2018, 10:06 a.m. UTC | #4
Hi Akashi,

On 15/05/18 18:11, James Morse wrote:
> On 25/04/18 07:26, AKASHI Takahiro wrote:
>> Enabling crash dump (kdump) includes
>> * prepare contents of ELF header of a core dump file, /proc/vmcore,
>>   using crash_prepare_elf64_headers(), and
>> * add two device tree properties, "linux,usable-memory-range" and
>>   "linux,elfcorehdr", which represent repsectively a memory range
>>   to be used by crash dump kernel and the header's location

>> diff --git a/arch/arm64/kernel/machine_kexec_file.c b/arch/arm64/kernel/machine_kexec_file.c
>> index 37c0a9dc2e47..ec674f4d267c 100644
>> --- a/arch/arm64/kernel/machine_kexec_file.c
>> +++ b/arch/arm64/kernel/machine_kexec_file.c

>> +static struct crash_mem *get_crash_memory_ranges(void)
>> +{
>> +	unsigned int nr_ranges;
>> +	struct crash_mem *cmem;
>> +
>> +	nr_ranges = 1; /* for exclusion of crashkernel region */
>> +	walk_system_ram_res(0, -1, &nr_ranges, get_nr_ranges_callback);
>> +
>> +	cmem = vmalloc(sizeof(struct crash_mem) +
>> +			sizeof(struct crash_mem_range) * nr_ranges);
>> +	if (!cmem)
>> +		return NULL;
>> +
>> +	cmem->max_nr_ranges = nr_ranges;
>> +	cmem->nr_ranges = 0;
>> +	walk_system_ram_res(0, -1, cmem, add_mem_range_callback);
>> +
>> +	/* Exclude crashkernel region */
>> +	if (crash_exclude_mem_range(cmem, crashk_res.start, crashk_res.end)) {
>> +		vfree(cmem);
>> +		return NULL;
>> +	}
>> +
>> +	return cmem;
>> +}
> 
> Could this function be included in prepare_elf_headers() so that the alloc() and
> free() occur together.
> 
> 
>> +static int prepare_elf_headers(void **addr, unsigned long *sz)
>> +{
>> +	struct crash_mem *cmem;
>> +	int ret = 0;
>> +
>> +	cmem = get_crash_memory_ranges();
>> +	if (!cmem)
>> +		return -ENOMEM;
>> +
>> +	ret =  crash_prepare_elf64_headers(cmem, true, addr, sz);
>> +
>> +	vfree(cmem);
> 
>> +	return ret;
>> +}
> 
> All this is moving memory-range information from core-code's
> walk_system_ram_res() into core-code's struct crash_mem, and excluding
> crashk_res, which again is accessible to the core code.
> 
> It looks like this is duplicated in arch/x86 and arch/arm64 because arm64
> doesn't have a second 'crashk_low_res' region, and always wants elf64, instead
> of when IS_ENABLED(CONFIG_X86_64).

Thinking about it some more: don't we want to walk memblock here, not
walk_system_ram_res()? What we want is a list of not-nomap regions that the
kernel may have been using, to form part of vmcore.
walk_system_ram_res() is becoming a murkier list of maybe-nomap, maybe-reserved.

I think we should walk the same list here as we do in patch 4.


Thanks,

James
AKASHI Takahiro May 18, 2018, 9:50 a.m. UTC | #5
On Wed, May 16, 2018 at 11:06:02AM +0100, James Morse wrote:
> Hi Akashi,
> 
> On 15/05/18 18:11, James Morse wrote:
> > On 25/04/18 07:26, AKASHI Takahiro wrote:
> >> Enabling crash dump (kdump) includes
> >> * prepare contents of ELF header of a core dump file, /proc/vmcore,
> >>   using crash_prepare_elf64_headers(), and
> >> * add two device tree properties, "linux,usable-memory-range" and
> >>   "linux,elfcorehdr", which represent repsectively a memory range
> >>   to be used by crash dump kernel and the header's location
> 
> >> diff --git a/arch/arm64/kernel/machine_kexec_file.c b/arch/arm64/kernel/machine_kexec_file.c
> >> index 37c0a9dc2e47..ec674f4d267c 100644
> >> --- a/arch/arm64/kernel/machine_kexec_file.c
> >> +++ b/arch/arm64/kernel/machine_kexec_file.c
> 
> >> +static struct crash_mem *get_crash_memory_ranges(void)
> >> +{
> >> +	unsigned int nr_ranges;
> >> +	struct crash_mem *cmem;
> >> +
> >> +	nr_ranges = 1; /* for exclusion of crashkernel region */
> >> +	walk_system_ram_res(0, -1, &nr_ranges, get_nr_ranges_callback);
> >> +
> >> +	cmem = vmalloc(sizeof(struct crash_mem) +
> >> +			sizeof(struct crash_mem_range) * nr_ranges);
> >> +	if (!cmem)
> >> +		return NULL;
> >> +
> >> +	cmem->max_nr_ranges = nr_ranges;
> >> +	cmem->nr_ranges = 0;
> >> +	walk_system_ram_res(0, -1, cmem, add_mem_range_callback);
> >> +
> >> +	/* Exclude crashkernel region */
> >> +	if (crash_exclude_mem_range(cmem, crashk_res.start, crashk_res.end)) {
> >> +		vfree(cmem);
> >> +		return NULL;
> >> +	}
> >> +
> >> +	return cmem;
> >> +}
> > 
> > Could this function be included in prepare_elf_headers() so that the alloc() and
> > free() occur together.
> > 
> > 
> >> +static int prepare_elf_headers(void **addr, unsigned long *sz)
> >> +{
> >> +	struct crash_mem *cmem;
> >> +	int ret = 0;
> >> +
> >> +	cmem = get_crash_memory_ranges();
> >> +	if (!cmem)
> >> +		return -ENOMEM;
> >> +
> >> +	ret =  crash_prepare_elf64_headers(cmem, true, addr, sz);
> >> +
> >> +	vfree(cmem);
> > 
> >> +	return ret;
> >> +}
> > 
> > All this is moving memory-range information from core-code's
> > walk_system_ram_res() into core-code's struct crash_mem, and excluding
> > crashk_res, which again is accessible to the core code.
> > 
> > It looks like this is duplicated in arch/x86 and arch/arm64 because arm64
> > doesn't have a second 'crashk_low_res' region, and always wants elf64, instead
> > of when IS_ENABLED(CONFIG_X86_64).
> 
> Thinking about it some more: don't we want to walk memblock here, not
> walk_system_ram_res()? What we want is a list of not-nomap regions that the
> kernel may have been using, to form part of vmcore.
> walk_system_ram_res() is becoming a murkier list of maybe-nomap, maybe-reserved.
> 
> I think we should walk the same list here as we do in patch 4.

For consistency, yes.
I missed that.

-Takahiro AKASHI

> 
> 
> Thanks,
> 
> James
AKASHI Takahiro May 18, 2018, 9:58 a.m. UTC | #6
On Wed, May 16, 2018 at 09:34:41AM +0100, James Morse wrote:
> Hi Akashi,
> 
> On 15/05/18 18:11, James Morse wrote:
> > On 25/04/18 07:26, AKASHI Takahiro wrote:
> >> Enabling crash dump (kdump) includes
> >> * prepare contents of ELF header of a core dump file, /proc/vmcore,
> >>   using crash_prepare_elf64_headers(), and
> >> * add two device tree properties, "linux,usable-memory-range" and
> >>   "linux,elfcorehdr", which represent repsectively a memory range
> >>   to be used by crash dump kernel and the header's location
> 
> >> diff --git a/arch/arm64/kernel/machine_kexec_file.c b/arch/arm64/kernel/machine_kexec_file.c
> >> index 37c0a9dc2e47..ec674f4d267c 100644
> >> --- a/arch/arm64/kernel/machine_kexec_file.c
> >> +++ b/arch/arm64/kernel/machine_kexec_file.c
> >> @@ -76,6 +81,78 @@ int arch_kexec_walk_mem(struct kexec_buf *kbuf,
> 
> >> +static void fill_property(void *buf, u64 val64, int cells)
> >> +{
> >> +	u32 val32;
> >> +
> >> +	if (cells == 1) {
> >> +		val32 = cpu_to_fdt32((u32)val64);
> >> +		memcpy(buf, &val32, sizeof(val32));
> >> +	} else {
> > 
> >> +		memset(buf, 0, cells * sizeof(u32) - sizeof(u64));
> >> +		buf += cells * sizeof(u32) - sizeof(u64);
> > 
> > Is this trying to clear the 'top' cells and shuffle the pointer to point at the
> > 'bottom' 2? I'm pretty sure this isn't endian safe.
> 
> It came to me at 2am: this only works on big-endian, which is exactly what you
> want as that is the DT format.

Oops, I was almost tricked as I haven't tested kexec on BE
for a long time :)

Thanks,
-Takahiro AKASHI

> 
> > Do we really expect a system to have #address-cells > 2?
> 
> 
> Thanks,
> 
> James
AKASHI Takahiro May 18, 2018, 10:39 a.m. UTC | #7
On Tue, May 15, 2018 at 06:11:15PM +0100, James Morse wrote:
> Hi Akashi,
> 
> On 25/04/18 07:26, AKASHI Takahiro wrote:
> > Enabling crash dump (kdump) includes
> > * prepare contents of ELF header of a core dump file, /proc/vmcore,
> >   using crash_prepare_elf64_headers(), and
> > * add two device tree properties, "linux,usable-memory-range" and
> >   "linux,elfcorehdr", which represent repsectively a memory range
> 
> (Nit: respectively)

Will fix.

> 
> >   to be used by crash dump kernel and the header's location
> 
> >  arch/arm64/include/asm/kexec.h         |   4 +
> >  arch/arm64/kernel/kexec_image.c        |   9 +-
> >  arch/arm64/kernel/machine_kexec_file.c | 202 +++++++++++++++++++++++++
> 
> In this patch, machine_kexec_file.c gains its own private fdt array encoder.

See below.

> 
> > diff --git a/arch/arm64/kernel/machine_kexec_file.c b/arch/arm64/kernel/machine_kexec_file.c
> > index 37c0a9dc2e47..ec674f4d267c 100644
> > --- a/arch/arm64/kernel/machine_kexec_file.c
> > +++ b/arch/arm64/kernel/machine_kexec_file.c
> > @@ -76,6 +81,78 @@ int arch_kexec_walk_mem(struct kexec_buf *kbuf,
> >  	return ret;
> >  }
> >  
> > +static int __init arch_kexec_file_init(void)
> > +{
> > +	/* Those values are used later on loading the kernel */
> > +	__dt_root_addr_cells = dt_root_addr_cells;
> > +	__dt_root_size_cells = dt_root_size_cells;
> > +
> > +	return 0;
> > +}
> > +late_initcall(arch_kexec_file_init);
> 
> If we need these is it worth taking them out of __initdata? I note they've been
> 'temporary' for quite a long time.

I think that I had some reason that I didn't do that, but don't remember now.
If there's no problem, I will take your suggestion.

> 
> > +
> > +#define FDT_ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))
> > +#define FDT_TAGALIGN(x)	(FDT_ALIGN((x), FDT_TAGSIZE))
> > +
> > +static int fdt_prop_len(const char *prop_name, int len)
> > +{
> > +	return (strlen(prop_name) + 1) +
> > +		sizeof(struct fdt_property) +
> > +		FDT_TAGALIGN(len);
> > +}
> 
> This stuff should really be in libfdt.h  Those macros come from
> libfdt_internal.h, so we're probably doing something wrong here.
> 
> 
> > +static bool cells_size_fitted(unsigned long base, unsigned long size)
> > +{
> > +	/* if *_cells >= 2, cells can hold 64-bit values anyway */
> > +	if ((__dt_root_addr_cells == 1) && (base >= (1ULL << 32)))
> > +		return false;
> > +
> > +	if ((__dt_root_size_cells == 1) && (size >= (1ULL << 32)))
> > +		return false;
> 
> Using '> U32_MAX' here may be more readable.

OK

> 
> > +	return true;
> > +}
> > +
> > +static void fill_property(void *buf, u64 val64, int cells)
> > +{
> > +	u32 val32;
> > +
> > +	if (cells == 1) {
> > +		val32 = cpu_to_fdt32((u32)val64);
> > +		memcpy(buf, &val32, sizeof(val32));
> > +	} else {
> 
> > +		memset(buf, 0, cells * sizeof(u32) - sizeof(u64));
> > +		buf += cells * sizeof(u32) - sizeof(u64);
> 
> Is this trying to clear the 'top' cells and shuffle the pointer to point at the
> 'bottom' 2? I'm pretty sure this isn't endian safe.
> 
> Do we really expect a system to have #address-cells > 2?

I don't know, but just for safety.

> 
> > +		val64 = cpu_to_fdt64(val64);
> > +		memcpy(buf, &val64, sizeof(val64));
> > +	}
> > +}
> > +
> > +static int fdt_setprop_range(void *fdt, int nodeoffset, const char *name,
> > +				unsigned long addr, unsigned long size)
> 
> (the device-tree spec describes a 'ranges' property, which had me confused. This
> is encoding a prop-encoded-array)

Should we rename it to, say, fdt_setprop_reg()?


> > +{
> > +	void *buf, *prop;
> > +	size_t buf_size;
> > +	int result;
> > +
> > +	buf_size = (__dt_root_addr_cells + __dt_root_size_cells) * sizeof(u32);
> > +	prop = buf = vmalloc(buf_size);
> 
> virtual memory allocation for something less than PAGE_SIZE?

I've never cared about that. Let me think again.

> 
> > +	if (!buf)
> > +		return -ENOMEM;
> > +
> > +	fill_property(prop, addr, __dt_root_addr_cells);
> > +	prop += __dt_root_addr_cells * sizeof(u32);
> > +
> > +	fill_property(prop, size, __dt_root_size_cells);
> > +
> > +	result = fdt_setprop(fdt, nodeoffset, name, buf, buf_size);
> > +
> > +	vfree(buf);
> > +
> > +	return result;
> > +}
> 
> Doesn't this stuff belong in libfdt? I guess there is no 'add array element' api
> because this the first time we've wanted to create a node with more than
> key=fixed-size-value.
> 
> I don't think this belongs in arch C code. Do we have a plan for getting libfdt
> to support encoding prop-arrays? Can we put it somewhere anyone else duplicating
> this will find it, until we can (re)move it?

I will temporarily move all fdt-related stuff to a separate file, but

> I have no idea how that happens... it looks like the devicetree list is the
> place to ask.

should we always sync with the original dtc/libfdt repository?

> 
> >  static int setup_dtb(struct kimage *image,
> >  		unsigned long initrd_load_addr, unsigned long initrd_len,
> >  		char *cmdline, unsigned long cmdline_len,
> > @@ -88,10 +165,26 @@ static int setup_dtb(struct kimage *image,
> >  	int range_len;
> >  	int ret;
> >  
> > +	/* check ranges against root's #address-cells and #size-cells */
> > +	if (image->type == KEXEC_TYPE_CRASH &&
> > +		(!cells_size_fitted(image->arch.elf_load_addr,
> > +				image->arch.elf_headers_sz) ||
> > +		 !cells_size_fitted(crashk_res.start,
> > +				crashk_res.end - crashk_res.start + 1))) {
> > +		pr_err("Crash memory region doesn't fit into DT's root cell sizes.\n");
> > +		ret = -EINVAL;
> > +		goto out_err;
> > +	}
> 
> To check I've understood this properly: This can happen if the firmware provided
> a DTB with 32bit address/size cells, but at least some of the memory requires 64
> bit address/size cells. This could only happen on a UEFI system where the
> firmware-DTB doesn't describe memory. ACPI-only systems would have the EFIstub DT.

Probably, yes. I assumed the case where #address-cells and #size-cells
were just missing in fdt.

> 
> >  	/* duplicate dt blob */
> >  	buf_size = fdt_totalsize(initial_boot_params);
> >  	range_len = (__dt_root_addr_cells + __dt_root_size_cells) * sizeof(u32);
> >  
> > +	if (image->type == KEXEC_TYPE_CRASH)
> > +		buf_size += fdt_prop_len("linux,elfcorehdr", range_len)
> > +				+ fdt_prop_len("linux,usable-memory-range",
> > +								range_len);

                                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

> > +
> >  	if (initrd_load_addr)
> >  		buf_size += fdt_prop_len("linux,initrd-start", sizeof(u64))
> >  				+ fdt_prop_len("linux,initrd-end", sizeof(u64));
> > @@ -113,6 +206,23 @@ static int setup_dtb(struct kimage *image,
> >  	if (nodeoffset < 0)
> >  		goto out_err;
> >  
> > +	if (image->type == KEXEC_TYPE_CRASH) {
> > +		/* add linux,elfcorehdr */
> > +		ret = fdt_setprop_range(buf, nodeoffset, "linux,elfcorehdr",
> > +				image->arch.elf_load_addr,
> > +				image->arch.elf_headers_sz);
> > +		if (ret)
> > +			goto out_err;
> > +
> > +		/* add linux,usable-memory-range */
> > +		ret = fdt_setprop_range(buf, nodeoffset,
> > +				"linux,usable-memory-range",
> > +				crashk_res.start,
> > +				crashk_res.end - crashk_res.start + 1);
> 
> Don't you need to add "linux,usable-memory-range" to the buf_size estimate?

I think the code exists. See above.

> 
> > +		if (ret)
> > +			goto out_err;
> > +	}
> 
> > @@ -148,17 +258,109 @@ static int setup_dtb(struct kimage *image,
> 
> > +static struct crash_mem *get_crash_memory_ranges(void)
> > +{
> > +	unsigned int nr_ranges;
> > +	struct crash_mem *cmem;
> > +
> > +	nr_ranges = 1; /* for exclusion of crashkernel region */
> > +	walk_system_ram_res(0, -1, &nr_ranges, get_nr_ranges_callback);
> > +
> > +	cmem = vmalloc(sizeof(struct crash_mem) +
> > +			sizeof(struct crash_mem_range) * nr_ranges);
> > +	if (!cmem)
> > +		return NULL;
> > +
> > +	cmem->max_nr_ranges = nr_ranges;
> > +	cmem->nr_ranges = 0;
> > +	walk_system_ram_res(0, -1, cmem, add_mem_range_callback);
> > +
> > +	/* Exclude crashkernel region */
> > +	if (crash_exclude_mem_range(cmem, crashk_res.start, crashk_res.end)) {
> > +		vfree(cmem);
> > +		return NULL;
> > +	}
> > +
> > +	return cmem;
> > +}
> 
> Could this function be included in prepare_elf_headers() so that the alloc() and
> free() occur together.


Or should we aim for arm64 and x86 to have similar-looking code?

> 
> > +static int prepare_elf_headers(void **addr, unsigned long *sz)
> > +{
> > +	struct crash_mem *cmem;
> > +	int ret = 0;
> > +
> > +	cmem = get_crash_memory_ranges();
> > +	if (!cmem)
> > +		return -ENOMEM;
> > +
> > +	ret =  crash_prepare_elf64_headers(cmem, true, addr, sz);
> > +
> > +	vfree(cmem);
> 
> > +	return ret;
> > +}
> 
> All this is moving memory-range information from core-code's
> walk_system_ram_res() into core-code's struct crash_mem, and excluding
> crashk_res, which again is accessible to the core code.
> 
> It looks like this is duplicated in arch/x86 and arch/arm64 because arm64
> doesn't have a second 'crashk_low_res' region, and always wants elf64, instead
> of when IS_ENABLED(CONFIG_X86_64).
> If we can abstract just those two, more of this could be moved to core code
> where powerpc can make use of it if they want to support kdump with
> kexec_file_load().
> 
> But, its getting late for cross-architecture dependencies, lets put that on the
> for-later list. (assuming there isn't a powerpc-kdump series out there adding a
> third copy of this)

Sure. X86 code has so many exceptional lines in the code :)

Thanks,
-Takahiro AKASHI


> 
> Thanks,
> 
> James
Rob Herring (Arm) May 18, 2018, 3:35 p.m. UTC | #8
On Tue, May 15, 2018 at 06:12:59PM +0100, James Morse wrote:
> Hi guys,
> 
> (CC: +RobH, devicetree list)

Thanks.

> On 25/04/18 07:26, AKASHI Takahiro wrote:
> > Enabling crash dump (kdump) includes
> > * prepare contents of ELF header of a core dump file, /proc/vmcore,
> >   using crash_prepare_elf64_headers(), and
> > * add two device tree properties, "linux,usable-memory-range" and
> >   "linux,elfcorehdr", which represent repsectively a memory range
> >   to be used by crash dump kernel and the header's location

BTW, I intend to move the existing parsing of these out of the arch code.
Please don't add more DT handling to arch/ unless it is *really* arch 
specific. I'd assume that the next arch to add kexec support will use 
these bindings instead of the powerpc way.

> kexec_file_load() on arm64 needs to be able to create a prop encoded array to
> the FDT, but there doesn't appear to be a libfdt helper to do this.
> 
> Akashi's code below adds fdt_setprop_range() to the arch code, and duplicates
> bits of libfdt_internal.h to do the work.
> 
> How should this be done? I'm assuming this is something we need a new API in
> libfdt.h for. How do these come about, and is there an interim step we can use
> until then?

Submit patches to upstream dtc and then we can pull it in. Ahead of that 
you can add it to drivers/of/fdt.c (or maybe fdt_address.c because 
that's really what this is dealing with).

libfdt has only recently gained the beginnings of address handling.

> 
> Thanks!
> 
> James
> 
> > diff --git a/arch/arm64/kernel/machine_kexec_file.c b/arch/arm64/kernel/machine_kexec_file.c
> > index 37c0a9dc2e47..ec674f4d267c 100644
> > --- a/arch/arm64/kernel/machine_kexec_file.c
> > +++ b/arch/arm64/kernel/machine_kexec_file.c
> > @@ -76,6 +81,78 @@ int arch_kexec_walk_mem(struct kexec_buf *kbuf,
> >  	return ret;
> >  }
> >  
> > +static int __init arch_kexec_file_init(void)
> > +{
> > +	/* Those values are used later on loading the kernel */
> > +	__dt_root_addr_cells = dt_root_addr_cells;
> > +	__dt_root_size_cells = dt_root_size_cells;

I intend to make dt_root_*_cells private, so don't add another user 
outside of drivers/of/.

> > +
> > +	return 0;
> > +}
> > +late_initcall(arch_kexec_file_init);
> > +
> > +#define FDT_ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))
> > +#define FDT_TAGALIGN(x)	(FDT_ALIGN((x), FDT_TAGSIZE))
> > +
> > +static int fdt_prop_len(const char *prop_name, int len)
> > +{
> > +	return (strlen(prop_name) + 1) +
> > +		sizeof(struct fdt_property) +
> > +		FDT_TAGALIGN(len);
> > +}
> > +
> > +static bool cells_size_fitted(unsigned long base, unsigned long size)

I can't imagine this would happen. However, when this is moved to 
drivers/of/ or dtc, these need to be u64 types to work on 32-bit.

> > +{
> > +	/* if *_cells >= 2, cells can hold 64-bit values anyway */
> > +	if ((__dt_root_addr_cells == 1) && (base >= (1ULL << 32)))
> > +		return false;
> > +
> > +	if ((__dt_root_size_cells == 1) && (size >= (1ULL << 32)))
> > +		return false;
> > +
> > +	return true;
> > +}
> > +
> > +static void fill_property(void *buf, u64 val64, int cells)
> > +{
> > +	u32 val32;

This should be a __be32 or fdt32 type. So should buf.

> > +
> > +	if (cells == 1) {
> > +		val32 = cpu_to_fdt32((u32)val64);
> > +		memcpy(buf, &val32, sizeof(val32));
> > +	} else {
> > +		memset(buf, 0, cells * sizeof(u32) - sizeof(u64));
> > +		buf += cells * sizeof(u32) - sizeof(u64);
> > +
> > +		val64 = cpu_to_fdt64(val64);
> > +		memcpy(buf, &val64, sizeof(val64));

Look how of_read_number() is implemented. You should be able to do 
something similar here looping and avoiding the if/else.

> > +	}
> > +}
> > +
> > +static int fdt_setprop_range(void *fdt, int nodeoffset, const char *name,
> > +				unsigned long addr, unsigned long size)

A very generic sounding function, but really only works on addresses in 
children of the root node.

> > +{
> > +	void *buf, *prop;
> > +	size_t buf_size;
> > +	int result;
> > +
> > +	buf_size = (__dt_root_addr_cells + __dt_root_size_cells) * sizeof(u32);
> > +	prop = buf = vmalloc(buf_size);

This can go on the stack instead (and would be required to work in
libfdt).

> > +	if (!buf)
> > +		return -ENOMEM;
> > +
> > +	fill_property(prop, addr, __dt_root_addr_cells);
> > +	prop += __dt_root_addr_cells * sizeof(u32);
> > +
> > +	fill_property(prop, size, __dt_root_size_cells);
> > +
> > +	result = fdt_setprop(fdt, nodeoffset, name, buf, buf_size);
> > +
> > +	vfree(buf);
> > +
> > +	return result;
> > +}
> > +
> >  static int setup_dtb(struct kimage *image,
> >  		unsigned long initrd_load_addr, unsigned long initrd_len,
> >  		char *cmdline, unsigned long cmdline_len,
> > @@ -88,10 +165,26 @@ static int setup_dtb(struct kimage *image,
> >  	int range_len;
> >  	int ret;
> >  
> > +	/* check ranges against root's #address-cells and #size-cells */
> > +	if (image->type == KEXEC_TYPE_CRASH &&
> > +		(!cells_size_fitted(image->arch.elf_load_addr,
> > +				image->arch.elf_headers_sz) ||
> > +		 !cells_size_fitted(crashk_res.start,
> > +				crashk_res.end - crashk_res.start + 1))) {
> > +		pr_err("Crash memory region doesn't fit into DT's root cell sizes.\n");
> > +		ret = -EINVAL;
> > +		goto out_err;
> > +	}
> > +
> >  	/* duplicate dt blob */
> >  	buf_size = fdt_totalsize(initial_boot_params);
> >  	range_len = (__dt_root_addr_cells + __dt_root_size_cells) * sizeof(u32);
> >  
> > +	if (image->type == KEXEC_TYPE_CRASH)
> > +		buf_size += fdt_prop_len("linux,elfcorehdr", range_len)
> > +				+ fdt_prop_len("linux,usable-memory-range",
> > +								range_len);
> > +
> >  	if (initrd_load_addr)
> >  		buf_size += fdt_prop_len("linux,initrd-start", sizeof(u64))
> >  				+ fdt_prop_len("linux,initrd-end", sizeof(u64));
> > @@ -113,6 +206,23 @@ static int setup_dtb(struct kimage *image,
> >  	if (nodeoffset < 0)
> >  		goto out_err;
> >  
> > +	if (image->type == KEXEC_TYPE_CRASH) {
> > +		/* add linux,elfcorehdr */
> > +		ret = fdt_setprop_range(buf, nodeoffset, "linux,elfcorehdr",
> > +				image->arch.elf_load_addr,
> > +				image->arch.elf_headers_sz);
> > +		if (ret)
> > +			goto out_err;
> > +
> > +		/* add linux,usable-memory-range */
> > +		ret = fdt_setprop_range(buf, nodeoffset,
> > +				"linux,usable-memory-range",
> > +				crashk_res.start,
> > +				crashk_res.end - crashk_res.start + 1);
> > +		if (ret)
> > +			goto out_err;
> > +	}
> > +
> >  	/* add bootargs */
> >  	if (cmdline) {
> >  		ret = fdt_setprop(buf, nodeoffset, "bootargs",
>
James Morse May 18, 2018, 4 p.m. UTC | #9
Hi Akashi,

On 18/05/18 11:39, AKASHI Takahiro wrote:
> On Tue, May 15, 2018 at 06:11:15PM +0100, James Morse wrote:
>> On 25/04/18 07:26, AKASHI Takahiro wrote:
>>> Enabling crash dump (kdump) includes
>>> * prepare contents of ELF header of a core dump file, /proc/vmcore,
>>>   using crash_prepare_elf64_headers(), and
>>> * add two device tree properties, "linux,usable-memory-range" and
>>>   "linux,elfcorehdr", which represent repsectively a memory range

>>> diff --git a/arch/arm64/kernel/machine_kexec_file.c b/arch/arm64/kernel/machine_kexec_file.c
>>> index 37c0a9dc2e47..ec674f4d267c 100644
>>> --- a/arch/arm64/kernel/machine_kexec_file.c
>>> +++ b/arch/arm64/kernel/machine_kexec_file.c

>>> +static void fill_property(void *buf, u64 val64, int cells)
>>> +{
>>> +	u32 val32;
>>> +
>>> +	if (cells == 1) {
>>> +		val32 = cpu_to_fdt32((u32)val64);
>>> +		memcpy(buf, &val32, sizeof(val32));
>>> +	} else {
>>
>>> +		memset(buf, 0, cells * sizeof(u32) - sizeof(u64));
>>> +		buf += cells * sizeof(u32) - sizeof(u64);
>>
>> Is this trying to clear the 'top' cells and shuffle the pointer to point at the
>> 'bottom' 2? I'm pretty sure this isn't endian safe.
>>
>> Do we really expect a system to have #address-cells > 2?
> 
> I don't know, but just for safety.

Okay, so this is aiming to be a cover-all-cases library function.


>>> +		val64 = cpu_to_fdt64(val64);
>>> +		memcpy(buf, &val64, sizeof(val64));
>>> +	}
>>> +}
>>> +
>>> +static int fdt_setprop_range(void *fdt, int nodeoffset, const char *name,
>>> +				unsigned long addr, unsigned long size)
>>
>> (the device-tree spec describes a 'ranges' property, which had me confused. This
>> is encoding a prop-encoded-array)
> 
> Should we rename it to, say, fdt_setprop_reg()?

Sure, but I'd really like this code to come from libfdt. I'm hoping for some
temporary workaround, let's see what the DT folk say.


>>> +	if (!buf)
>>> +		return -ENOMEM;
>>> +
>>> +	fill_property(prop, addr, __dt_root_addr_cells);
>>> +	prop += __dt_root_addr_cells * sizeof(u32);
>>> +
>>> +	fill_property(prop, size, __dt_root_size_cells);
>>> +
>>> +	result = fdt_setprop(fdt, nodeoffset, name, buf, buf_size);
>>> +
>>> +	vfree(buf);
>>> +
>>> +	return result;
>>> +}
>>
>> Doesn't this stuff belong in libfdt? I guess there is no 'add array element' api
>> because this the first time we've wanted to create a node with more than
>> key=fixed-size-value.
>>
>> I don't think this belongs in arch C code. Do we have a plan for getting libfdt
>> to support encoding prop-arrays? Can we put it somewhere anyone else duplicating
>> this will find it, until we can (re)move it?
> 
> I will temporarily move all fdt-related stuff to a separate file, but
> 
>> I have no idea how that happens... it looks like the devicetree list is the
>> place to ask.
> 
> should we always sync with the original dtc/libfdt repository?

I thought so, libfdt is one of those external libraries that the kernel
consumes, like acpica. For acpica at least the rule is changes go upstream, then
get sync'd back.


>>>  static int setup_dtb(struct kimage *image,
>>>  		unsigned long initrd_load_addr, unsigned long initrd_len,
>>>  		char *cmdline, unsigned long cmdline_len,
>>> @@ -88,10 +165,26 @@ static int setup_dtb(struct kimage *image,
>>>  	int range_len;
>>>  	int ret;
>>>  
>>> +	/* check ranges against root's #address-cells and #size-cells */
>>> +	if (image->type == KEXEC_TYPE_CRASH &&
>>> +		(!cells_size_fitted(image->arch.elf_load_addr,
>>> +				image->arch.elf_headers_sz) ||
>>> +		 !cells_size_fitted(crashk_res.start,
>>> +				crashk_res.end - crashk_res.start + 1))) {
>>> +		pr_err("Crash memory region doesn't fit into DT's root cell sizes.\n");
>>> +		ret = -EINVAL;
>>> +		goto out_err;
>>> +	}
>>
>> To check I've understood this properly: This can happen if the firmware provided
>> a DTB with 32bit address/size cells, but at least some of the memory requires 64
>> bit address/size cells. This could only happen on a UEFI system where the
>> firmware-DTB doesn't describe memory. ACPI-only systems would have the EFIstub DT.
> 
> Probably, yes. I assumed the case where #address-cells and #size-cells
> were just missing in fdt.

Ah, that's another one. I just wanted to check we could boot on a system where
this can happen.


>>>  	/* duplicate dt blob */
>>>  	buf_size = fdt_totalsize(initial_boot_params);
>>>  	range_len = (__dt_root_addr_cells + __dt_root_size_cells) * sizeof(u32);
>>>  
>>> +	if (image->type == KEXEC_TYPE_CRASH)
>>> +		buf_size += fdt_prop_len("linux,elfcorehdr", range_len)
>>> +				+ fdt_prop_len("linux,usable-memory-range",
>>> +								range_len);

>                                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[...]

>> Don't you need to add "linux,usable-memory-range" to the buf_size estimate?
> 
> I think the code exists. See above.

Sorry, turns out I can't read!


>>> +		if (ret)
>>> +			goto out_err;
>>> +	}
>>
>>> @@ -148,17 +258,109 @@ static int setup_dtb(struct kimage *image,
>>
>>> +static struct crash_mem *get_crash_memory_ranges(void)
>>> +{
>>> +	unsigned int nr_ranges;
>>> +	struct crash_mem *cmem;
>>> +
>>> +	nr_ranges = 1; /* for exclusion of crashkernel region */
>>> +	walk_system_ram_res(0, -1, &nr_ranges, get_nr_ranges_callback);
>>> +
>>> +	cmem = vmalloc(sizeof(struct crash_mem) +
>>> +			sizeof(struct crash_mem_range) * nr_ranges);
>>> +	if (!cmem)
>>> +		return NULL;
>>> +
>>> +	cmem->max_nr_ranges = nr_ranges;
>>> +	cmem->nr_ranges = 0;
>>> +	walk_system_ram_res(0, -1, cmem, add_mem_range_callback);
>>> +
>>> +	/* Exclude crashkernel region */
>>> +	if (crash_exclude_mem_range(cmem, crashk_res.start, crashk_res.end)) {
>>> +		vfree(cmem);
>>> +		return NULL;
>>> +	}
>>> +
>>> +	return cmem;
>>> +}
>>
>> Could this function be included in prepare_elf_headers() so that the alloc() and
>> free() occur together.
> 
> Or aiming that arm64 and x86 have similar-look code?

What's the advantage in things looking the same? If they are the same, it
probably shouldn't be in per-arch code. Otherwise it should be as simple as
possible, or else we can't spot the bugs/leaks.

But I think walking memblock here will remove all 'looks the same' properties here.


>>> +static int prepare_elf_headers(void **addr, unsigned long *sz)
>>> +{
>>> +	struct crash_mem *cmem;
>>> +	int ret = 0;
>>> +
>>> +	cmem = get_crash_memory_ranges();
>>> +	if (!cmem)
>>> +		return -ENOMEM;
>>> +
>>> +	ret =  crash_prepare_elf64_headers(cmem, true, addr, sz);
>>> +
>>> +	vfree(cmem);
>>
>>> +	return ret;
>>> +}
>>
>> All this is moving memory-range information from core-code's
>> walk_system_ram_res() into core-code's struct crash_mem, and excluding
>> crashk_res, which again is accessible to the core code.
>>
>> It looks like this is duplicated in arch/x86 and arch/arm64 because arm64
>> doesn't have a second 'crashk_low_res' region, and always wants elf64, instead
>> of when IS_ENABLED(CONFIG_X86_64).
>> If we can abstract just those two, more of this could be moved to core code
>> where powerpc can make use of it if they want to support kdump with
>> kexec_file_load().
>>
>> But, its getting late for cross-architecture dependencies, lets put that on the
>> for-later list. (assuming there isn't a powerpc-kdump series out there adding a
>> third copy of this)
> 
> Sure. X86 code has so many exceptional lines in the code :)

They also pass the e820 'usable-memory' map on the cmdline...


Thanks,

James
AKASHI Takahiro May 21, 2018, 9:46 a.m. UTC | #10
James,

On Fri, May 18, 2018 at 05:00:55PM +0100, James Morse wrote:
> Hi Akashi,
> 
> On 18/05/18 11:39, AKASHI Takahiro wrote:
> > On Tue, May 15, 2018 at 06:11:15PM +0100, James Morse wrote:
> >> On 25/04/18 07:26, AKASHI Takahiro wrote:
> >>> Enabling crash dump (kdump) includes
> >>> * prepare contents of ELF header of a core dump file, /proc/vmcore,
> >>>   using crash_prepare_elf64_headers(), and
> >>> * add two device tree properties, "linux,usable-memory-range" and
> >>>   "linux,elfcorehdr", which represent repsectively a memory range
> 
> >>> diff --git a/arch/arm64/kernel/machine_kexec_file.c b/arch/arm64/kernel/machine_kexec_file.c
> >>> index 37c0a9dc2e47..ec674f4d267c 100644
> >>> --- a/arch/arm64/kernel/machine_kexec_file.c
> >>> +++ b/arch/arm64/kernel/machine_kexec_file.c
> 
> >>> +static void fill_property(void *buf, u64 val64, int cells)
> >>> +{
> >>> +	u32 val32;
> >>> +
> >>> +	if (cells == 1) {
> >>> +		val32 = cpu_to_fdt32((u32)val64);
> >>> +		memcpy(buf, &val32, sizeof(val32));
> >>> +	} else {
> >>
> >>> +		memset(buf, 0, cells * sizeof(u32) - sizeof(u64));
> >>> +		buf += cells * sizeof(u32) - sizeof(u64);
> >>
> >> Is this trying to clear the 'top' cells and shuffle the pointer to point at the
> >> 'bottom' 2? I'm pretty sure this isn't endian safe.
> >>
> >> Do we really expect a system to have #address-cells > 2?
> > 
> > I don't know, but just for safety.
> 
> Okay, so this is aiming to be a cover-all-cases library function.
> 
> 
> >>> +		val64 = cpu_to_fdt64(val64);
> >>> +		memcpy(buf, &val64, sizeof(val64));
> >>> +	}
> >>> +}
> >>> +
> >>> +static int fdt_setprop_range(void *fdt, int nodeoffset, const char *name,
> >>> +				unsigned long addr, unsigned long size)
> >>
> >> (the device-tree spec describes a 'ranges' property, which had me confused. This
> >> is encoding a prop-encoded-array)
> > 
> > Should we rename it to, say, fdt_setprop_reg()?
> 
> Sure, but I'd really like this code to come from libfdt. I'm hoping for some
> temporary workaround, lets see what the DT folk say.

OK, I will follow Rob's suggestion.

> >>> +	if (!buf)
> >>> +		return -ENOMEM;
> >>> +
> >>> +	fill_property(prop, addr, __dt_root_addr_cells);
> >>> +	prop += __dt_root_addr_cells * sizeof(u32);
> >>> +
> >>> +	fill_property(prop, size, __dt_root_size_cells);
> >>> +
> >>> +	result = fdt_setprop(fdt, nodeoffset, name, buf, buf_size);
> >>> +
> >>> +	vfree(buf);
> >>> +
> >>> +	return result;
> >>> +}
> >>
> >> Doesn't this stuff belong in libfdt? I guess there is no 'add array element' api
> >> because this the first time we've wanted to create a node with more than
> >> key=fixed-size-value.
> >>
> >> I don't think this belongs in arch C code. Do we have a plan for getting libfdt
> >> to support encoding prop-arrays? Can we put it somewhere anyone else duplicating
> >> this will find it, until we can (re)move it?
> > 
> > I will temporarily move all fdt-related stuff to a separate file, but
> > 
> >> I have no idea how that happens... it looks like the devicetree list is the
> >> place to ask.
> > 
> > should we always sync with the original dtc/libfdt repository?
> 
> I thought so, libfdt is one of those external libraries that the kernel
> consumes, like acpica. For acpica at least the rule is changes go upstream, then
> get sync'd back.

Same as above.

> >>>  static int setup_dtb(struct kimage *image,
> >>>  		unsigned long initrd_load_addr, unsigned long initrd_len,
> >>>  		char *cmdline, unsigned long cmdline_len,
> >>> @@ -88,10 +165,26 @@ static int setup_dtb(struct kimage *image,
> >>>  	int range_len;
> >>>  	int ret;
> >>>  
> >>> +	/* check ranges against root's #address-cells and #size-cells */
> >>> +	if (image->type == KEXEC_TYPE_CRASH &&
> >>> +		(!cells_size_fitted(image->arch.elf_load_addr,
> >>> +				image->arch.elf_headers_sz) ||
> >>> +		 !cells_size_fitted(crashk_res.start,
> >>> +				crashk_res.end - crashk_res.start + 1))) {
> >>> +		pr_err("Crash memory region doesn't fit into DT's root cell sizes.\n");
> >>> +		ret = -EINVAL;
> >>> +		goto out_err;
> >>> +	}
> >>
> >> To check I've understood this properly: This can happen if the firmware provided
> >> a DTB with 32bit address/size cells, but at least some of the memory requires 64
> >> bit address/size cells. This could only happen on a UEFI system where the
> >> firmware-DTB doesn't describe memory. ACPI-only systems would have the EFIstub DT.
> > 
> > Probably, yes. I assumed the case where #address-cells and #size-cells
> > were just missing in fdt.
> 
> Ah, that's another one. I just wanted to check we could boot on a system where
> this can happen.
> 
> 
> >>>  	/* duplicate dt blob */
> >>>  	buf_size = fdt_totalsize(initial_boot_params);
> >>>  	range_len = (__dt_root_addr_cells + __dt_root_size_cells) * sizeof(u32);
> >>>  
> >>> +	if (image->type == KEXEC_TYPE_CRASH)
> >>> +		buf_size += fdt_prop_len("linux,elfcorehdr", range_len)
> >>> +				+ fdt_prop_len("linux,usable-memory-range",
> >>> +								range_len);
> 
> >                                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> [...]
> 
> >> Don't you need to add "linux,usable-memory-range" to the buf_size estimate?
> > 
> > I think the code exists. See above.
> 
> Sorry, turns out I can't read!
> 
> 
> >>> +		if (ret)
> >>> +			goto out_err;
> >>> +	}
> >>
> >>> @@ -148,17 +258,109 @@ static int setup_dtb(struct kimage *image,
> >>
> >>> +static struct crash_mem *get_crash_memory_ranges(void)
> >>> +{
> >>> +	unsigned int nr_ranges;
> >>> +	struct crash_mem *cmem;
> >>> +
> >>> +	nr_ranges = 1; /* for exclusion of crashkernel region */
> >>> +	walk_system_ram_res(0, -1, &nr_ranges, get_nr_ranges_callback);
> >>> +
> >>> +	cmem = vmalloc(sizeof(struct crash_mem) +
> >>> +			sizeof(struct crash_mem_range) * nr_ranges);
> >>> +	if (!cmem)
> >>> +		return NULL;
> >>> +
> >>> +	cmem->max_nr_ranges = nr_ranges;
> >>> +	cmem->nr_ranges = 0;
> >>> +	walk_system_ram_res(0, -1, cmem, add_mem_range_callback);
> >>> +
> >>> +	/* Exclude crashkernel region */
> >>> +	if (crash_exclude_mem_range(cmem, crashk_res.start, crashk_res.end)) {
> >>> +		vfree(cmem);
> >>> +		return NULL;
> >>> +	}
> >>> +
> >>> +	return cmem;
> >>> +}
> >>
> >> Could this function be included in prepare_elf_headers() so that the alloc() and
> >> free() occur together.
> > 
> > Or aiming that arm64 and x86 have similar-look code?
> 
> What's the advantage in things looking the same? If they are the same, it
> probably shouldn't be in per-arch code. Otherwise it should be as simple as
> possible, otherwise we can't spot the bugs/leaks.
> 
> But I think walking memblock here will remove all 'looks the same' properties here.

OK, I will unfold the function in prepare_elf_headers().

> 
> >>> +static int prepare_elf_headers(void **addr, unsigned long *sz)
> >>> +{
> >>> +	struct crash_mem *cmem;
> >>> +	int ret = 0;
> >>> +
> >>> +	cmem = get_crash_memory_ranges();
> >>> +	if (!cmem)
> >>> +		return -ENOMEM;
> >>> +
> >>> +	ret =  crash_prepare_elf64_headers(cmem, true, addr, sz);
> >>> +
> >>> +	vfree(cmem);
> >>
> >>> +	return ret;
> >>> +}
> >>
> >> All this is moving memory-range information from core-code's
> >> walk_system_ram_res() into core-code's struct crash_mem, and excluding
> >> crashk_res, which again is accessible to the core code.
> >>
> >> It looks like this is duplicated in arch/x86 and arch/arm64 because arm64
> >> doesn't have a second 'crashk_low_res' region, and always wants elf64, instead
> >> of when IS_ENABLED(CONFIG_X86_64).
> >> If we can abstract just those two, more of this could be moved to core code
> >> where powerpc can make use of it if they want to support kdump with
> >> kexec_file_load().
> >>
> >> But, its getting late for cross-architecture dependencies, lets put that on the
> >> for-later list. (assuming there isn't a powerpc-kdump series out there adding a
> >> third copy of this)
> > 
> > Sure. X86 code has so many exceptional lines in the code :)
> 
> They also pass the e820 'usable-memory' map on the cmdline...

Well, according to a past comment from Dave (Red Hat), this type of kernel
parameter is old-style, and x86 now has a dedicated memory region
passed for this purpose.

Thanks,
-Takahiro AKASHI

> 
> Thanks,
> 
> James
AKASHI Takahiro May 21, 2018, 10:14 a.m. UTC | #11
Hi Rob,

On Fri, May 18, 2018 at 10:35:52AM -0500, Rob Herring wrote:
> On Tue, May 15, 2018 at 06:12:59PM +0100, James Morse wrote:
> > Hi guys,
> > 
> > (CC: +RobH, devicetree list)
> 
> Thanks.
> 
> > On 25/04/18 07:26, AKASHI Takahiro wrote:
> > > Enabling crash dump (kdump) includes
> > > * prepare contents of ELF header of a core dump file, /proc/vmcore,
> > >   using crash_prepare_elf64_headers(), and
> > > * add two device tree properties, "linux,usable-memory-range" and
> > >   "linux,elfcorehdr", which represent repsectively a memory range
> > >   to be used by crash dump kernel and the header's location
> 
> BTW, I intend to move existing parsing these out of the arch code. 
> Please don't add more DT handling to arch/ unless it is *really* arch 
> specific. I'd assume that the next arch to add kexec support will use 
> these bindings instead of the powerpc way.

So do you expect all the fdt-related stuff in my current implementation
for arm64 to be put into libfdt, or at least drivers/of, from the beginning?

I'm not sure how arch-specific the properties here are. For instance,
it is only arm64 that uses "linux,usable-memory-range" right now but
if some other arch follows, it is no longer arch-specific.
# I remember that you didn't like this property :)

> > kexec_file_load() on arm64 needs to be able to create a prop encoded array to
> > the FDT, but there doesn't appear to be a libfdt helper to do this.
> > 
> > Akashi's code below adds fdt_setprop_range() to the arch code, and duplicates
> > bits of libfdt_internal.h to do the work.
> > 
> > How should this be done? I'm assuming this is something we need a new API in
> > libfdt.h for. How do these come about, and is there an interim step we can use
> > until then?
> 
> Submit patches to upstream dtc and then we can pull it in. Ahead of that 
> you can add it to drivers/of/fdt.c (or maybe fdt_address.c because 
> that's really what this is dealing with).

OK, I'm going to try to follow your suggestion.

> libfdt has only recently gained the beginnings of address handling.
> 
> > 
> > Thanks!
> > 
> > James
> > 
> > > diff --git a/arch/arm64/kernel/machine_kexec_file.c b/arch/arm64/kernel/machine_kexec_file.c
> > > index 37c0a9dc2e47..ec674f4d267c 100644
> > > --- a/arch/arm64/kernel/machine_kexec_file.c
> > > +++ b/arch/arm64/kernel/machine_kexec_file.c
> > > @@ -76,6 +81,78 @@ int arch_kexec_walk_mem(struct kexec_buf *kbuf,
> > >  	return ret;
> > >  }
> > >  
> > > +static int __init arch_kexec_file_init(void)
> > > +{
> > > +	/* Those values are used later on loading the kernel */
> > > +	__dt_root_addr_cells = dt_root_addr_cells;
> > > +	__dt_root_size_cells = dt_root_size_cells;
> 
> I intend to make dt_root_*_cells private, so don't add another user 
> outside of drivers/of/.

Once cells_size_fitted() moves to drivers/of, there will be no users.

> > > +
> > > +	return 0;
> > > +}
> > > +late_initcall(arch_kexec_file_init);
> > > +
> > > +#define FDT_ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))
> > > +#define FDT_TAGALIGN(x)	(FDT_ALIGN((x), FDT_TAGSIZE))
> > > +
> > > +static int fdt_prop_len(const char *prop_name, int len)
> > > +{
> > > +	return (strlen(prop_name) + 1) +
> > > +		sizeof(struct fdt_property) +
> > > +		FDT_TAGALIGN(len);
> > > +}
> > > +
> > > +static bool cells_size_fitted(unsigned long base, unsigned long size)
> 
> I can't imagine this would happen. However, when this is moved to 
> drivers/of/ or dtc, these need to be u64 types to work on 32-bit.

OK.

> > > +	/* if *_cells >= 2, cells can hold 64-bit values anyway */
> > > +	if ((__dt_root_addr_cells == 1) && (base >= (1ULL << 32)))
> > > +		return false;
> > > +
> > > +	if ((__dt_root_size_cells == 1) && (size >= (1ULL << 32)))
> > > +		return false;
> > > +
> > > +	return true;
> > > +}
> > > +
> > > +static void fill_property(void *buf, u64 val64, int cells)
> > > +{
> > > +	u32 val32;
> 
> This should be a __be32 or fdt32 type. So should buf.

OK for val32, but buf is a local pointer address.

> > > +
> > > +	if (cells == 1) {
> > > +		val32 = cpu_to_fdt32((u32)val64);
> > > +		memcpy(buf, &val32, sizeof(val32));
> > > +	} else {
> > > +		memset(buf, 0, cells * sizeof(u32) - sizeof(u64));
> > > +		buf += cells * sizeof(u32) - sizeof(u64);
> > > +
> > > +		val64 = cpu_to_fdt64(val64);
> > > +		memcpy(buf, &val64, sizeof(val64));
> 
> Look how of_read_number() is implemented. You should be able to do 
> something similar here looping and avoiding the if/else.

Ah, excellent!

> > > +	}
> > > +}
> > > +
> > > +static int fdt_setprop_range(void *fdt, int nodeoffset, const char *name,
> > > +				unsigned long addr, unsigned long size)
> 
> A very generic sounding function, but really only works on addresses in 
> children of the root node.
> 
> > > +{
> > > +	void *buf, *prop;
> > > +	size_t buf_size;
> > > +	int result;
> > > +
> > > +	buf_size = (__dt_root_addr_cells + __dt_root_size_cells) * sizeof(u32);
> > > +	prop = buf = vmalloc(buf_size);
> 
> This can go on the stack instead (and would be required to to work in 
> libfdt).

Well, I can't agree with you here since, as far as I understand, there is
an ongoing effort to purge all variable-length arrays on the local
stack from the kernel code.

Thank you for your review.
-Takahiro AKASHI

> > > +	if (!buf)
> > > +		return -ENOMEM;
> > > +
> > > +	fill_property(prop, addr, __dt_root_addr_cells);
> > > +	prop += __dt_root_addr_cells * sizeof(u32);
> > > +
> > > +	fill_property(prop, size, __dt_root_size_cells);
> > > +
> > > +	result = fdt_setprop(fdt, nodeoffset, name, buf, buf_size);
> > > +
> > > +	vfree(buf);
> > > +
> > > +	return result;
> > > +}
> > > +
> > >  static int setup_dtb(struct kimage *image,
> > >  		unsigned long initrd_load_addr, unsigned long initrd_len,
> > >  		char *cmdline, unsigned long cmdline_len,
> > > @@ -88,10 +165,26 @@ static int setup_dtb(struct kimage *image,
> > >  	int range_len;
> > >  	int ret;
> > >  
> > > +	/* check ranges against root's #address-cells and #size-cells */
> > > +	if (image->type == KEXEC_TYPE_CRASH &&
> > > +		(!cells_size_fitted(image->arch.elf_load_addr,
> > > +				image->arch.elf_headers_sz) ||
> > > +		 !cells_size_fitted(crashk_res.start,
> > > +				crashk_res.end - crashk_res.start + 1))) {
> > > +		pr_err("Crash memory region doesn't fit into DT's root cell sizes.\n");
> > > +		ret = -EINVAL;
> > > +		goto out_err;
> > > +	}
> > > +
> > >  	/* duplicate dt blob */
> > >  	buf_size = fdt_totalsize(initial_boot_params);
> > >  	range_len = (__dt_root_addr_cells + __dt_root_size_cells) * sizeof(u32);
> > >  
> > > +	if (image->type == KEXEC_TYPE_CRASH)
> > > +		buf_size += fdt_prop_len("linux,elfcorehdr", range_len)
> > > +				+ fdt_prop_len("linux,usable-memory-range",
> > > +								range_len);
> > > +
> > >  	if (initrd_load_addr)
> > >  		buf_size += fdt_prop_len("linux,initrd-start", sizeof(u64))
> > >  				+ fdt_prop_len("linux,initrd-end", sizeof(u64));
> > > @@ -113,6 +206,23 @@ static int setup_dtb(struct kimage *image,
> > >  	if (nodeoffset < 0)
> > >  		goto out_err;
> > >  
> > > +	if (image->type == KEXEC_TYPE_CRASH) {
> > > +		/* add linux,elfcorehdr */
> > > +		ret = fdt_setprop_range(buf, nodeoffset, "linux,elfcorehdr",
> > > +				image->arch.elf_load_addr,
> > > +				image->arch.elf_headers_sz);
> > > +		if (ret)
> > > +			goto out_err;
> > > +
> > > +		/* add linux,usable-memory-range */
> > > +		ret = fdt_setprop_range(buf, nodeoffset,
> > > +				"linux,usable-memory-range",
> > > +				crashk_res.start,
> > > +				crashk_res.end - crashk_res.start + 1);
> > > +		if (ret)
> > > +			goto out_err;
> > > +	}
> > > +
> > >  	/* add bootargs */
> > >  	if (cmdline) {
> > >  		ret = fdt_setprop(buf, nodeoffset, "bootargs",
> >
Rob Herring (Arm) May 24, 2018, 2:25 p.m. UTC | #12
On Mon, May 21, 2018 at 5:14 AM, AKASHI Takahiro
<takahiro.akashi@linaro.org> wrote:
> Hi Rob,
>
> On Fri, May 18, 2018 at 10:35:52AM -0500, Rob Herring wrote:
>> On Tue, May 15, 2018 at 06:12:59PM +0100, James Morse wrote:
>> > Hi guys,
>> >
>> > (CC: +RobH, devicetree list)
>>
>> Thanks.
>>
>> > On 25/04/18 07:26, AKASHI Takahiro wrote:
>> > > Enabling crash dump (kdump) includes
>> > > * prepare contents of ELF header of a core dump file, /proc/vmcore,
>> > >   using crash_prepare_elf64_headers(), and
>> > > * add two device tree properties, "linux,usable-memory-range" and
>> > >   "linux,elfcorehdr", which represent repsectively a memory range
>> > >   to be used by crash dump kernel and the header's location
>>
>> BTW, I intend to move existing parsing these out of the arch code.
>> Please don't add more DT handling to arch/ unless it is *really* arch
>> specific. I'd assume that the next arch to add kexec support will use
>> these bindings instead of the powerpc way.
>
> So do you expect all the fdt-related stuff in my current implementation
> for arm64 to be put into libfdt, or at least drivers/of, from the beginning?

Yes.

> I'm not sure how arch-specific the properties here are. For instance,
> it is only arm64 that uses "linux,usable-memory-range" right now but
> if some other arch follows, it is no more arch-specific.
> # I remember that you didn't like this property :)

The question I guess is what will the next arch use. I don't think any
other DT based arch supports crashdump or kexec yet.

>> > > +{
>> > > + void *buf, *prop;
>> > > + size_t buf_size;
>> > > + int result;
>> > > +
>> > > + buf_size = (__dt_root_addr_cells + __dt_root_size_cells) * sizeof(u32);
>> > > + prop = buf = vmalloc(buf_size);
>>
>> This can go on the stack instead (and would be required to to work in
>> libfdt).
>
> Well, I can't agree with you here since we are now in effort, as far as
> I correctly understand, of purging all the variable-sized arrays on a local
> stack out of the kernel code.

You don't need a variable sized array. The array size just needs to
be the maximum size (16 bytes).

Rob
diff mbox

Patch

diff --git a/arch/arm64/include/asm/kexec.h b/arch/arm64/include/asm/kexec.h
index 3cba4161818a..77f05bcf6a42 100644
--- a/arch/arm64/include/asm/kexec.h
+++ b/arch/arm64/include/asm/kexec.h
@@ -100,6 +100,10 @@  struct kimage_arch {
 	int kern_segment;
 	phys_addr_t dtb_mem;
 	void *dtb_buf;
+	/* Core ELF header buffer */
+	void *elf_headers;
+	unsigned long elf_headers_sz;
+	unsigned long elf_load_addr;
 };
 
 /**
diff --git a/arch/arm64/kernel/kexec_image.c b/arch/arm64/kernel/kexec_image.c
index 4dd524ad6611..2b3baf7285e0 100644
--- a/arch/arm64/kernel/kexec_image.c
+++ b/arch/arm64/kernel/kexec_image.c
@@ -39,8 +39,13 @@  static void *image_load(struct kimage *image,
 
 	/* Load the kernel */
 	kbuf.image = image;
-	kbuf.buf_min = 0;
-	kbuf.buf_max = ULONG_MAX;
+	if (image->type == KEXEC_TYPE_CRASH) {
+		kbuf.buf_min = crashk_res.start;
+		kbuf.buf_max = crashk_res.end + 1;
+	} else {
+		kbuf.buf_min = 0;
+		kbuf.buf_max = ULONG_MAX;
+	}
 	kbuf.top_down = false;
 
 	kbuf.buffer = kernel;
diff --git a/arch/arm64/kernel/machine_kexec_file.c b/arch/arm64/kernel/machine_kexec_file.c
index 37c0a9dc2e47..ec674f4d267c 100644
--- a/arch/arm64/kernel/machine_kexec_file.c
+++ b/arch/arm64/kernel/machine_kexec_file.c
@@ -17,6 +17,7 @@ 
 #include <linux/memblock.h>
 #include <linux/of_fdt.h>
 #include <linux/types.h>
+#include <linux/vmalloc.h>
 #include <asm/byteorder.h>
 
 static int __dt_root_addr_cells;
@@ -32,6 +33,10 @@  int arch_kimage_file_post_load_cleanup(struct kimage *image)
 	vfree(image->arch.dtb_buf);
 	image->arch.dtb_buf = NULL;
 
+	vfree(image->arch.elf_headers);
+	image->arch.elf_headers = NULL;
+	image->arch.elf_headers_sz = 0;
+
 	return kexec_image_post_load_cleanup_default(image);
 }
 
@@ -76,6 +81,78 @@  int arch_kexec_walk_mem(struct kexec_buf *kbuf,
 	return ret;
 }
 
+static int __init arch_kexec_file_init(void)
+{
+	/* Those values are used later on loading the kernel */
+	__dt_root_addr_cells = dt_root_addr_cells;
+	__dt_root_size_cells = dt_root_size_cells;
+
+	return 0;
+}
+late_initcall(arch_kexec_file_init);
+
+#define FDT_ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))
+#define FDT_TAGALIGN(x)	(FDT_ALIGN((x), FDT_TAGSIZE))
+
+static int fdt_prop_len(const char *prop_name, int len)
+{
+	return (strlen(prop_name) + 1) +
+		sizeof(struct fdt_property) +
+		FDT_TAGALIGN(len);
+}
+
+static bool cells_size_fitted(unsigned long base, unsigned long size)
+{
+	/* if *_cells >= 2, cells can hold 64-bit values anyway */
+	if ((__dt_root_addr_cells == 1) && (base >= (1ULL << 32)))
+		return false;
+
+	if ((__dt_root_size_cells == 1) && (size >= (1ULL << 32)))
+		return false;
+
+	return true;
+}
+
+static void fill_property(void *buf, u64 val64, int cells)
+{
+	u32 val32;
+
+	if (cells == 1) {
+		val32 = cpu_to_fdt32((u32)val64);
+		memcpy(buf, &val32, sizeof(val32));
+	} else {
+		memset(buf, 0, cells * sizeof(u32) - sizeof(u64));
+		buf += cells * sizeof(u32) - sizeof(u64);
+
+		val64 = cpu_to_fdt64(val64);
+		memcpy(buf, &val64, sizeof(val64));
+	}
+}
+
+static int fdt_setprop_range(void *fdt, int nodeoffset, const char *name,
+				unsigned long addr, unsigned long size)
+{
+	void *buf, *prop;
+	size_t buf_size;
+	int result;
+
+	buf_size = (__dt_root_addr_cells + __dt_root_size_cells) * sizeof(u32);
+	prop = buf = vmalloc(buf_size);
+	if (!buf)
+		return -ENOMEM;
+
+	fill_property(prop, addr, __dt_root_addr_cells);
+	prop += __dt_root_addr_cells * sizeof(u32);
+
+	fill_property(prop, size, __dt_root_size_cells);
+
+	result = fdt_setprop(fdt, nodeoffset, name, buf, buf_size);
+
+	vfree(buf);
+
+	return result;
+}
+
 static int setup_dtb(struct kimage *image,
 		unsigned long initrd_load_addr, unsigned long initrd_len,
 		char *cmdline, unsigned long cmdline_len,
@@ -88,10 +165,26 @@  static int setup_dtb(struct kimage *image,
 	int range_len;
 	int ret;
 
+	/* check ranges against root's #address-cells and #size-cells */
+	if (image->type == KEXEC_TYPE_CRASH &&
+		(!cells_size_fitted(image->arch.elf_load_addr,
+				image->arch.elf_headers_sz) ||
+		 !cells_size_fitted(crashk_res.start,
+				crashk_res.end - crashk_res.start + 1))) {
+		pr_err("Crash memory region doesn't fit into DT's root cell sizes.\n");
+		ret = -EINVAL;
+		goto out_err;
+	}
+
 	/* duplicate dt blob */
 	buf_size = fdt_totalsize(initial_boot_params);
 	range_len = (__dt_root_addr_cells + __dt_root_size_cells) * sizeof(u32);
 
+	if (image->type == KEXEC_TYPE_CRASH)
+		buf_size += fdt_prop_len("linux,elfcorehdr", range_len)
+				+ fdt_prop_len("linux,usable-memory-range",
+								range_len);
+
 	if (initrd_load_addr)
 		buf_size += fdt_prop_len("linux,initrd-start", sizeof(u64))
 				+ fdt_prop_len("linux,initrd-end", sizeof(u64));
@@ -113,6 +206,23 @@  static int setup_dtb(struct kimage *image,
 	if (nodeoffset < 0)
 		goto out_err;
 
+	if (image->type == KEXEC_TYPE_CRASH) {
+		/* add linux,elfcorehdr */
+		ret = fdt_setprop_range(buf, nodeoffset, "linux,elfcorehdr",
+				image->arch.elf_load_addr,
+				image->arch.elf_headers_sz);
+		if (ret)
+			goto out_err;
+
+		/* add linux,usable-memory-range */
+		ret = fdt_setprop_range(buf, nodeoffset,
+				"linux,usable-memory-range",
+				crashk_res.start,
+				crashk_res.end - crashk_res.start + 1);
+		if (ret)
+			goto out_err;
+	}
+
 	/* add bootargs */
 	if (cmdline) {
 		ret = fdt_setprop(buf, nodeoffset, "bootargs",
@@ -148,17 +258,109 @@  static int setup_dtb(struct kimage *image,
 	return ret;
 }
 
+static int get_nr_ranges_callback(struct resource *res, void *arg)
+{
+	unsigned int *nr_ranges = arg;
+
+	(*nr_ranges)++;
+	return 0;
+}
+
+static int add_mem_range_callback(struct resource *res, void *arg)
+{
+	struct crash_mem *cmem = arg;
+
+	cmem->ranges[cmem->nr_ranges].start = res->start;
+	cmem->ranges[cmem->nr_ranges].end = res->end;
+	cmem->nr_ranges++;
+
+	return 0;
+}
+
+static struct crash_mem *get_crash_memory_ranges(void)
+{
+	unsigned int nr_ranges;
+	struct crash_mem *cmem;
+
+	nr_ranges = 1; /* for exclusion of crashkernel region */
+	walk_system_ram_res(0, -1, &nr_ranges, get_nr_ranges_callback);
+
+	cmem = vmalloc(sizeof(struct crash_mem) +
+			sizeof(struct crash_mem_range) * nr_ranges);
+	if (!cmem)
+		return NULL;
+
+	cmem->max_nr_ranges = nr_ranges;
+	cmem->nr_ranges = 0;
+	walk_system_ram_res(0, -1, cmem, add_mem_range_callback);
+
+	/* Exclude crashkernel region */
+	if (crash_exclude_mem_range(cmem, crashk_res.start, crashk_res.end)) {
+		vfree(cmem);
+		return NULL;
+	}
+
+	return cmem;
+}
+
+static int prepare_elf_headers(void **addr, unsigned long *sz)
+{
+	struct crash_mem *cmem;
+	int ret = 0;
+
+	cmem = get_crash_memory_ranges();
+	if (!cmem)
+		return -ENOMEM;
+
+	ret =  crash_prepare_elf64_headers(cmem, true, addr, sz);
+
+	vfree(cmem);
+	return ret;
+}
+
 int load_other_segments(struct kimage *image,
 			char *initrd, unsigned long initrd_len,
 			char *cmdline, unsigned long cmdline_len)
 {
 	struct kexec_segment *kern_seg;
 	struct kexec_buf kbuf;
+	void *hdrs_addr;
+	unsigned long hdrs_sz;
 	unsigned long initrd_load_addr = 0;
 	char *dtb = NULL;
 	unsigned long dtb_len = 0;
 	int ret = 0;
 
+	/* load elf core header */
+	if (image->type == KEXEC_TYPE_CRASH) {
+		ret = prepare_elf_headers(&hdrs_addr, &hdrs_sz);
+		if (ret) {
+			pr_err("Preparing elf core header failed\n");
+			goto out_err;
+		}
+
+		kbuf.image = image;
+		kbuf.buffer = hdrs_addr;
+		kbuf.bufsz = hdrs_sz;
+		kbuf.memsz = hdrs_sz;
+		kbuf.buf_align = PAGE_SIZE;
+		kbuf.buf_min = crashk_res.start;
+		kbuf.buf_max = crashk_res.end + 1;
+		kbuf.top_down = true;
+
+		ret = kexec_add_buffer(&kbuf);
+		if (ret) {
+			vfree(hdrs_addr);
+			goto out_err;
+		}
+		image->arch.elf_headers = hdrs_addr;
+		image->arch.elf_headers_sz = hdrs_sz;
+		image->arch.elf_load_addr = kbuf.mem;
+
+		pr_debug("Loaded elf core header at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+				 image->arch.elf_load_addr, hdrs_sz, hdrs_sz);
+	}
+
 	kern_seg = &image->segment[image->arch.kern_segment];
 	kbuf.image = image;
 	/* not allocate anything below the kernel */