diff mbox

[v31,09/12] arm64: kdump: provide /proc/vmcore file

Message ID 20170201124630.6016-8-takahiro.akashi@linaro.org (mailing list archive)
State New, archived
Headers show

Commit Message

AKASHI Takahiro Feb. 1, 2017, 12:46 p.m. UTC
Add arch-specific functions to provide a dump file, /proc/vmcore.

This file is in ELF format and its ELF header needs to be prepared by
userspace tools, like kexec-tools, in adance. The primary kernel is
responsible to allocate the region with reserve_elfcorehdr() at boot time
and advertize its location to crash dump kernel via a new device-tree
property, "linux,elfcorehdr".

Then crash dump kernel will access the primary kernel's memory with
copy_oldmem_page(), which feeds the data page-by-page by ioremap'ing it
since it does not reside in linear mapping on crash dump kernel.

We also need our own elfcorehdr_read() here since the header is placed
within crash dump kernel's usable memory.

Signed-off-by: AKASHI Takahiro <takahiro.akashi@linaro.org>
Reviewed-by: James Morse <james.morse@arm.com>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/Kconfig             | 11 +++++++
 arch/arm64/kernel/Makefile     |  1 +
 arch/arm64/kernel/crash_dump.c | 71 ++++++++++++++++++++++++++++++++++++++++++
 arch/arm64/mm/init.c           | 53 +++++++++++++++++++++++++++++++
 4 files changed, 136 insertions(+)
 create mode 100644 arch/arm64/kernel/crash_dump.c

Comments

Mark Rutland Feb. 1, 2017, 7:21 p.m. UTC | #1
On Wed, Feb 01, 2017 at 09:46:28PM +0900, AKASHI Takahiro wrote:
> Add arch-specific functions to provide a dump file, /proc/vmcore.
> 
> This file is in ELF format and its ELF header needs to be prepared by
> userspace tools, like kexec-tools, in adance. The primary kernel is
> responsible to allocate the region with reserve_elfcorehdr() at boot time
> and advertize its location to crash dump kernel via a new device-tree
> property, "linux,elfcorehdr".

> +static int __init early_init_dt_scan_elfcorehdr(unsigned long node,
> +		const char *uname, int depth, void *data)
> +{
> +	const __be32 *reg;
> +	int len;
> +
> +	if (depth != 1 || strcmp(uname, "chosen") != 0)
> +		return 0;
> +
> +	reg = of_get_flat_dt_prop(node, "linux,elfcorehdr", &len);
> +	if (!reg || (len < (dt_root_addr_cells + dt_root_size_cells)))
> +		return 1;
> +
> +	elfcorehdr_addr = dt_mem_next_cell(dt_root_addr_cells, &reg);
> +	elfcorehdr_size = dt_mem_next_cell(dt_root_size_cells, &reg);
> +
> +	return 1;
> +}
> +
> +/*
> + * reserve_elfcorehdr() - reserves memory for elf core header
> + *
> + * This function reserves elf core header given in "elfcorehdr=" kernel
> + * command line parameter. This region contains all the information about
> + * primary kernel's core image and is used by a dump capture kernel to
> + * access the system memory on primary kernel.
> + */
> +static void __init reserve_elfcorehdr(void)
> +{
> +	of_scan_flat_dt(early_init_dt_scan_elfcorehdr, NULL);
> +
> +	if (!elfcorehdr_size)
> +		return;
> +
> +	if (memblock_is_region_reserved(elfcorehdr_addr, elfcorehdr_size)) {
> +		pr_warn("elfcorehdr is overlapped\n");
> +		return;
> +	}
> +
> +	memblock_reserve(elfcorehdr_addr, elfcorehdr_size);
> +
> +	pr_info("Reserving %lldKB of memory at 0x%llx for elfcorehdr\n",
> +		elfcorehdr_size >> 10, elfcorehdr_addr);
> +}

This doesn't seem right to me.  The logic here doesn't match the commit
message, the comment above reserve_elfcorehdr() doesn't match the
implementation, and this doesn't match my understanding of how this was
intended to be used from the DT binding.

I had assumed that we'd treat this in much the same way as the
linux,reserved-memory-region property, with the primary kernel either
dynamically allocating the region or using a command line option, and
the base being exposed to userspace via /sys/ or /proc/ somehow.

Why is that not the case?

Thanks,
Mark.
AKASHI Takahiro Feb. 2, 2017, 6:24 a.m. UTC | #2
On Wed, Feb 01, 2017 at 07:21:22PM +0000, Mark Rutland wrote:
> On Wed, Feb 01, 2017 at 09:46:28PM +0900, AKASHI Takahiro wrote:
> > Add arch-specific functions to provide a dump file, /proc/vmcore.
> > 
> > This file is in ELF format and its ELF header needs to be prepared by
> > userspace tools, like kexec-tools, in adance. The primary kernel is
> > responsible to allocate the region with reserve_elfcorehdr() at boot time
> > and advertize its location to crash dump kernel via a new device-tree
> > property, "linux,elfcorehdr".
> 
> > +static int __init early_init_dt_scan_elfcorehdr(unsigned long node,
> > +		const char *uname, int depth, void *data)
> > +{
> > +	const __be32 *reg;
> > +	int len;
> > +
> > +	if (depth != 1 || strcmp(uname, "chosen") != 0)
> > +		return 0;
> > +
> > +	reg = of_get_flat_dt_prop(node, "linux,elfcorehdr", &len);
> > +	if (!reg || (len < (dt_root_addr_cells + dt_root_size_cells)))
> > +		return 1;
> > +
> > +	elfcorehdr_addr = dt_mem_next_cell(dt_root_addr_cells, &reg);
> > +	elfcorehdr_size = dt_mem_next_cell(dt_root_size_cells, &reg);
> > +
> > +	return 1;
> > +}
> > +
> > +/*
> > + * reserve_elfcorehdr() - reserves memory for elf core header
> > + *
> > + * This function reserves elf core header given in "elfcorehdr=" kernel
> > + * command line parameter. This region contains all the information about
> > + * primary kernel's core image and is used by a dump capture kernel to
> > + * access the system memory on primary kernel.
> > + */
> > +static void __init reserve_elfcorehdr(void)
> > +{
> > +	of_scan_flat_dt(early_init_dt_scan_elfcorehdr, NULL);
> > +
> > +	if (!elfcorehdr_size)
> > +		return;
> > +
> > +	if (memblock_is_region_reserved(elfcorehdr_addr, elfcorehdr_size)) {
> > +		pr_warn("elfcorehdr is overlapped\n");
> > +		return;
> > +	}
> > +
> > +	memblock_reserve(elfcorehdr_addr, elfcorehdr_size);
> > +
> > +	pr_info("Reserving %lldKB of memory at 0x%llx for elfcorehdr\n",
> > +		elfcorehdr_size >> 10, elfcorehdr_addr);
> > +}
> 
> This doesn't seem right to me.  The logic here doesn't match the commit
> message, the comment above reserve_elfcorehdr() doesn't match the
> implementation, and this doesn't match my understanding of how this was
> intended to be used from the DT binding.

Surely the commit message was wrong/misleading; It should say

===8<===
Arch-specific functions are added to allow for implementing a crash dump
file interface, /proc/vmcore, which can be viewed as a ELF file.

A user space tool, like kexec-tools, is responsible for allocating
a separate region for the core's ELF header within crash kdump kernel 
memory and filling it in when executing kexec_load().

Then, its location will be advertised to crash dump kernel via a new
device-tree property, "linux,elfcorehdr", and crash dump kernel preserves
the region for later use with reserve_elfcorehdr() at boot time.

On crash dump kernel, /proc/vmcore will access the primary kernel's memory
with copy_oldmem_page(), which feeds the data page-by-page by ioremap'ing
it since it does not reside in linear mapping on crash dump kernel.

Meanwhile, elfcorehdr_read() is simple as the region is always mapped.
===>8===

Does this make things clear?

> I had assumed that we'd treat this in much the same way as the
> linux,reserved-memory-region property, with the primary kernel either
> dynamically allocating the region or using a command line option, and
> the base being exposed to userspace via /sys/ or /proc/ somehow.

I didn't get the point here, but please note that the data in ELF core header
is produced by kexec-tools (who knows its location, too), and consumed solely
by the crash dump kernel.

Thanks,
-Takahiro AKASHI

> Why is that not the case?
> 
> Thanks,
> Mark.
Mark Rutland Feb. 2, 2017, 12:03 p.m. UTC | #3
On Thu, Feb 02, 2017 at 03:24:09PM +0900, AKASHI Takahiro wrote:
> On Wed, Feb 01, 2017 at 07:21:22PM +0000, Mark Rutland wrote:
> > On Wed, Feb 01, 2017 at 09:46:28PM +0900, AKASHI Takahiro wrote:
> > > Add arch-specific functions to provide a dump file, /proc/vmcore.
> > > 
> > > This file is in ELF format and its ELF header needs to be prepared by
> > > userspace tools, like kexec-tools, in adance. The primary kernel is
> > > responsible to allocate the region with reserve_elfcorehdr() at boot time
> > > and advertize its location to crash dump kernel via a new device-tree
> > > property, "linux,elfcorehdr".
> > 
> > > +static int __init early_init_dt_scan_elfcorehdr(unsigned long node,
> > > +		const char *uname, int depth, void *data)
> > > +{
> > > +	const __be32 *reg;
> > > +	int len;
> > > +
> > > +	if (depth != 1 || strcmp(uname, "chosen") != 0)
> > > +		return 0;
> > > +
> > > +	reg = of_get_flat_dt_prop(node, "linux,elfcorehdr", &len);
> > > +	if (!reg || (len < (dt_root_addr_cells + dt_root_size_cells)))
> > > +		return 1;
> > > +
> > > +	elfcorehdr_addr = dt_mem_next_cell(dt_root_addr_cells, &reg);
> > > +	elfcorehdr_size = dt_mem_next_cell(dt_root_size_cells, &reg);
> > > +
> > > +	return 1;
> > > +}
> > > +
> > > +/*
> > > + * reserve_elfcorehdr() - reserves memory for elf core header
> > > + *
> > > + * This function reserves elf core header given in "elfcorehdr=" kernel
> > > + * command line parameter. This region contains all the information about
> > > + * primary kernel's core image and is used by a dump capture kernel to
> > > + * access the system memory on primary kernel.
> > > + */
> > > +static void __init reserve_elfcorehdr(void)
> > > +{
> > > +	of_scan_flat_dt(early_init_dt_scan_elfcorehdr, NULL);
> > > +
> > > +	if (!elfcorehdr_size)
> > > +		return;
> > > +
> > > +	if (memblock_is_region_reserved(elfcorehdr_addr, elfcorehdr_size)) {
> > > +		pr_warn("elfcorehdr is overlapped\n");
> > > +		return;
> > > +	}
> > > +
> > > +	memblock_reserve(elfcorehdr_addr, elfcorehdr_size);
> > > +
> > > +	pr_info("Reserving %lldKB of memory at 0x%llx for elfcorehdr\n",
> > > +		elfcorehdr_size >> 10, elfcorehdr_addr);
> > > +}
> > 
> > This doesn't seem right to me.  The logic here doesn't match the commit
> > message, the comment above reserve_elfcorehdr() doesn't match the
> > implementation, and this doesn't match my understanding of how this was
> > intended to be used from the DT binding.
> 
> Surely the commit message was wrong/misleading; It should say
> 
> ===8<===
> Arch-specific functions are added to allow for implementing a crash dump
> file interface, /proc/vmcore, which can be viewed as a ELF file.
> 
> A user space tool, like kexec-tools, is responsible for allocating
> a separate region for the core's ELF header within crash kdump kernel 
> memory and filling it in when executing kexec_load().
> 
> Then, its location will be advertised to crash dump kernel via a new
> device-tree property, "linux,elfcorehdr", and crash dump kernel preserves
> the region for later use with reserve_elfcorehdr() at boot time.
> 
> On crash dump kernel, /proc/vmcore will access the primary kernel's memory
> with copy_oldmem_page(), which feeds the data page-by-page by ioremap'ing
> it since it does not reside in linear mapping on crash dump kernel.
> 
> Meanwhile, elfcorehdr_read() is simple as the region is always mapped.
> ===>8===
> 
> Does this make things clear?

Yes. That sounds roughly like what I was expecting. Thanks for
clarifiying that!

Can you also fix the comment above reserve_elfcorehdr()? It refers to a
non-existent kernel command line option.

... at the same time, can we change the print there to something like:

	pr_info("elfcorehdr found and reserved at 0x%016llx - 0x%016llx (%lld KB)\n",
		elfcorehdr_addr, elfcorehdr_addr + elfcorehdr_size, 
		elfcorehdr_size >> 10);

... that makes it clear that we've found an existing elfcorehdr, rather
than we're reserving space to later fill with an elfcorehdr, and
printing the rgion base and size in that way aligns with what we do
elsewhere.

With all that:

Acked-by: Mark Rutland <mark.rutland@arm.com>

Thanks,
Mark.
Mark Rutland Feb. 2, 2017, 12:08 p.m. UTC | #4
On Thu, Feb 02, 2017 at 12:03:20PM +0000, Mark Rutland wrote:
> On Thu, Feb 02, 2017 at 03:24:09PM +0900, AKASHI Takahiro wrote:
> > On Wed, Feb 01, 2017 at 07:21:22PM +0000, Mark Rutland wrote:
> > > On Wed, Feb 01, 2017 at 09:46:28PM +0900, AKASHI Takahiro wrote:

> > > > +/*
> > > > + * reserve_elfcorehdr() - reserves memory for elf core header
> > > > + *
> > > > + * This function reserves elf core header given in "elfcorehdr=" kernel
> > > > + * command line parameter. This region contains all the information about
> > > > + * primary kernel's core image and is used by a dump capture kernel to
> > > > + * access the system memory on primary kernel.
> > > > + */

> Can you also fix the comment above reserve_elfcorehdr()? It refers to a
> non-existent kernel command line option.

... or rather, one that's handled by core code, not
reserve_elfcorehdr().

So more specifically, it would be nicer to have something like:

	/*
	 * reserve_elfcorehdr() - reserves memory for elf core header
	 *
	 * This function reserves the memory occupied by an elf core header
	 * described in the device tree. This region contains all the
	 * information about primary kernel's core image and is used by a dump
	 * capture kernel to access the system memory on primary kernel.
	 */

Thanks,
Mark.
AKASHI Takahiro Feb. 2, 2017, 2:39 p.m. UTC | #5
On Thu, Feb 02, 2017 at 12:08:50PM +0000, Mark Rutland wrote:
> On Thu, Feb 02, 2017 at 12:03:20PM +0000, Mark Rutland wrote:
> > On Thu, Feb 02, 2017 at 03:24:09PM +0900, AKASHI Takahiro wrote:
> > > On Wed, Feb 01, 2017 at 07:21:22PM +0000, Mark Rutland wrote:
> > > > On Wed, Feb 01, 2017 at 09:46:28PM +0900, AKASHI Takahiro wrote:
> 
> > > > > +/*
> > > > > + * reserve_elfcorehdr() - reserves memory for elf core header
> > > > > + *
> > > > > + * This function reserves elf core header given in "elfcorehdr=" kernel
> > > > > + * command line parameter. This region contains all the information about
> > > > > + * primary kernel's core image and is used by a dump capture kernel to
> > > > > + * access the system memory on primary kernel.
> > > > > + */
> 
> > Can you also fix the comment above reserve_elfcorehdr()? It refers to a
> > non-existent kernel command line option.
> 
> ... or rather, one that's handled by core code, not
> reserve_elfcorehdr().
> 
> So more specifically, it would be nicer to have something like:
> 
> 	/*
> 	 * reserve_elfcorehdr() - reserves memory for elf core header
> 	 *
> 	 * This function reserves the memory occupied by an elf core header
> 	 * described in the device tree. This region contains all the
> 	 * information about primary kernel's core image and is used by a dump
> 	 * capture kernel to access the system memory on primary kernel.
> 	 */

Perfect!

Thanks a lot,
-Takahiro AKASHI


> Thanks,
> Mark.
diff mbox

Patch

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 111742126897..2bd6a1a062b9 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -693,6 +693,17 @@  config KEXEC
 	  but it is independent of the system firmware.   And like a reboot
 	  you can start any kernel with it, not just Linux.
 
+config CRASH_DUMP
+	bool "Build kdump crash kernel"
+	help
+	  Generate crash dump after being started by kexec. This should
+	  be normally only set in special crash dump kernels which are
+	  loaded in the main kernel with kexec-tools into a specially
+	  reserved region and then later executed after a crash by
+	  kdump/kexec.
+
+	  For more details see Documentation/kdump/kdump.txt
+
 config XEN_DOM0
 	def_bool y
 	depends on XEN
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index 7d66bbaafc0c..6a7384eee08d 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -50,6 +50,7 @@  arm64-obj-$(CONFIG_RANDOMIZE_BASE)	+= kaslr.o
 arm64-obj-$(CONFIG_HIBERNATION)		+= hibernate.o hibernate-asm.o
 arm64-obj-$(CONFIG_KEXEC)		+= machine_kexec.o relocate_kernel.o	\
 					   cpu-reset.o
+arm64-obj-$(CONFIG_CRASH_DUMP)		+= crash_dump.o
 
 obj-y					+= $(arm64-obj-y) vdso/ probes/
 obj-m					+= $(arm64-obj-m)
diff --git a/arch/arm64/kernel/crash_dump.c b/arch/arm64/kernel/crash_dump.c
new file mode 100644
index 000000000000..c3d5a21c081e
--- /dev/null
+++ b/arch/arm64/kernel/crash_dump.c
@@ -0,0 +1,71 @@ 
+/*
+ * Routines for doing kexec-based kdump
+ *
+ * Copyright (C) 2014 Linaro Limited
+ * Author: AKASHI Takahiro <takahiro.akashi@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/crash_dump.h>
+#include <linux/errno.h>
+#include <linux/io.h>
+#include <linux/memblock.h>
+#include <linux/uaccess.h>
+#include <asm/memory.h>
+
+/**
+ * copy_oldmem_page() - copy one page from old kernel memory
+ * @pfn: page frame number to be copied
+ * @buf: buffer where the copied page is placed
+ * @csize: number of bytes to copy
+ * @offset: offset in bytes into the page
+ * @userbuf: if set, @buf is in a user address space
+ *
+ * This function copies one page from old kernel memory into buffer pointed by
+ * @buf. If @buf is in userspace, set @userbuf to %1. Returns number of bytes
+ * copied or negative error in case of failure.
+ */
+ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
+			 size_t csize, unsigned long offset,
+			 int userbuf)
+{
+	void *vaddr;
+
+	if (!csize)
+		return 0;
+
+	vaddr = memremap(__pfn_to_phys(pfn), PAGE_SIZE, MEMREMAP_WB);
+	if (!vaddr)
+		return -ENOMEM;
+
+	if (userbuf) {
+		if (copy_to_user((char __user *)buf, vaddr + offset, csize)) {
+			memunmap(vaddr);
+			return -EFAULT;
+		}
+	} else {
+		memcpy(buf, vaddr + offset, csize);
+	}
+
+	memunmap(vaddr);
+
+	return csize;
+}
+
+/**
+ * elfcorehdr_read - read from ELF core header
+ * @buf: buffer where the data is placed
+ * @csize: number of bytes to read
+ * @ppos: address in the memory
+ *
+ * This function reads @count bytes from elf core header which exists
+ * on crash dump kernel's memory.
+ */
+ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos)
+{
+	memcpy(buf, phys_to_virt((phys_addr_t)*ppos), count);
+	return count;
+}
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 2aba75dc7720..323b87197e18 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -38,6 +38,7 @@ 
 #include <linux/swiotlb.h>
 #include <linux/vmalloc.h>
 #include <linux/kexec.h>
+#include <linux/crash_dump.h>
 
 #include <asm/boot.h>
 #include <asm/fixmap.h>
@@ -135,6 +136,56 @@  static void __init reserve_crashkernel(void)
 }
 #endif /* CONFIG_KEXEC_CORE */
 
+#ifdef CONFIG_CRASH_DUMP
+static int __init early_init_dt_scan_elfcorehdr(unsigned long node,
+		const char *uname, int depth, void *data)
+{
+	const __be32 *reg;
+	int len;
+
+	if (depth != 1 || strcmp(uname, "chosen") != 0)
+		return 0;
+
+	reg = of_get_flat_dt_prop(node, "linux,elfcorehdr", &len);
+	if (!reg || (len < (dt_root_addr_cells + dt_root_size_cells)))
+		return 1;
+
+	elfcorehdr_addr = dt_mem_next_cell(dt_root_addr_cells, &reg);
+	elfcorehdr_size = dt_mem_next_cell(dt_root_size_cells, &reg);
+
+	return 1;
+}
+
+/*
+ * reserve_elfcorehdr() - reserves memory for elf core header
+ *
+ * This function reserves elf core header given in "elfcorehdr=" kernel
+ * command line parameter. This region contains all the information about
+ * primary kernel's core image and is used by a dump capture kernel to
+ * access the system memory on primary kernel.
+ */
+static void __init reserve_elfcorehdr(void)
+{
+	of_scan_flat_dt(early_init_dt_scan_elfcorehdr, NULL);
+
+	if (!elfcorehdr_size)
+		return;
+
+	if (memblock_is_region_reserved(elfcorehdr_addr, elfcorehdr_size)) {
+		pr_warn("elfcorehdr is overlapped\n");
+		return;
+	}
+
+	memblock_reserve(elfcorehdr_addr, elfcorehdr_size);
+
+	pr_info("Reserving %lldKB of memory at 0x%llx for elfcorehdr\n",
+		elfcorehdr_size >> 10, elfcorehdr_addr);
+}
+#else
+static void __init reserve_elfcorehdr(void)
+{
+}
+#endif /* CONFIG_CRASH_DUMP */
 /*
  * Return the maximum physical address for ZONE_DMA (DMA_BIT_MASK(32)). It
  * currently assumes that for memory starting above 4G, 32-bit devices will
@@ -393,6 +444,8 @@  void __init arm64_memblock_init(void)
 
 	reserve_crashkernel();
 
+	reserve_elfcorehdr();
+
 	dma_contiguous_reserve(arm64_dma_phys_limit);
 
 	memblock_allow_resize();