Message ID | 20191223152349.180172-2-chenzhou10@huawei.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | support reserving crashkernel above 4G on arm64 kdump | expand |
Hi, On 12/23/19 at 11:23pm, Chen Zhou wrote: > In preparation for supporting reserve_crashkernel_low in arm64 as > x86_64 does, move reserve_crashkernel_low() into kernel/crash_core.c. > > Note, in arm64, we reserve low memory if and only if crashkernel=X,low > is specified. Different with x86_64, don't set low memory automatically. Do you have any reason for the difference? I'd expect we have same logic if possible and remove some of the ifdefs. > > Reported-by: kbuild test robot <lkp@intel.com> > Signed-off-by: Chen Zhou <chenzhou10@huawei.com> > --- > arch/x86/kernel/setup.c | 62 ++++----------------------------- > include/linux/crash_core.h | 3 ++ > include/linux/kexec.h | 2 -- > kernel/crash_core.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++ > kernel/kexec_core.c | 17 --------- > 5 files changed, 96 insertions(+), 75 deletions(-) > > diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c > index cedfe20..5f38942 100644 > --- a/arch/x86/kernel/setup.c > +++ b/arch/x86/kernel/setup.c > @@ -486,59 +486,6 @@ static void __init memblock_x86_reserve_range_setup_data(void) > # define CRASH_ADDR_HIGH_MAX SZ_64T > #endif > > -static int __init reserve_crashkernel_low(void) > -{ > -#ifdef CONFIG_X86_64 > - unsigned long long base, low_base = 0, low_size = 0; > - unsigned long total_low_mem; > - int ret; > - > - total_low_mem = memblock_mem_size(1UL << (32 - PAGE_SHIFT)); > - > - /* crashkernel=Y,low */ > - ret = parse_crashkernel_low(boot_command_line, total_low_mem, &low_size, &base); > - if (ret) { > - /* > - * two parts from kernel/dma/swiotlb.c: > - * -swiotlb size: user-specified with swiotlb= or default. > - * > - * -swiotlb overflow buffer: now hardcoded to 32k. We round it > - * to 8M for other buffers that may need to stay low too. Also > - * make sure we allocate enough extra low memory so that we > - * don't run out of DMA buffers for 32-bit devices. > - */ > - low_size = max(swiotlb_size_or_default() + (8UL << 20), 256UL << 20); > - } else { > - /* passed with crashkernel=0,low ? */ > - if (!low_size) > - return 0; > - } > - > - low_base = memblock_find_in_range(0, 1ULL << 32, low_size, CRASH_ALIGN); > - if (!low_base) { > - pr_err("Cannot reserve %ldMB crashkernel low memory, please try smaller size.\n", > - (unsigned long)(low_size >> 20)); > - return -ENOMEM; > - } > - > - ret = memblock_reserve(low_base, low_size); > - if (ret) { > - pr_err("%s: Error reserving crashkernel low memblock.\n", __func__); > - return ret; > - } > - > - pr_info("Reserving %ldMB of low memory at %ldMB for crashkernel (System low RAM: %ldMB)\n", > - (unsigned long)(low_size >> 20), > - (unsigned long)(low_base >> 20), > - (unsigned long)(total_low_mem >> 20)); > - > - crashk_low_res.start = low_base; > - crashk_low_res.end = low_base + low_size - 1; > - insert_resource(&iomem_resource, &crashk_low_res); > -#endif > - return 0; > -} > - > static void __init reserve_crashkernel(void) > { > unsigned long long crash_size, crash_base, total_mem; > @@ -602,9 +549,12 @@ static void __init reserve_crashkernel(void) > return; > } > > - if (crash_base >= (1ULL << 32) && reserve_crashkernel_low()) { > - memblock_free(crash_base, crash_size); > - return; > + if (crash_base >= (1ULL << 32)) { > + if (reserve_crashkernel_low()) { > + memblock_free(crash_base, crash_size); > + return; > + } > + insert_resource(&iomem_resource, &crashk_low_res); Some specific reason to move insert_resouce out of the reserve_crashkernel_low function? > } > > pr_info("Reserving %ldMB of memory at %ldMB for crashkernel (System RAM: %ldMB)\n", > diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h > index 525510a..4df8c0b 100644 > --- a/include/linux/crash_core.h > +++ b/include/linux/crash_core.h > @@ -63,6 +63,8 @@ phys_addr_t paddr_vmcoreinfo_note(void); > extern unsigned char *vmcoreinfo_data; > extern size_t vmcoreinfo_size; > extern u32 *vmcoreinfo_note; > +extern struct resource crashk_res; > +extern struct resource crashk_low_res; > > Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, > void *data, size_t data_len); > @@ -74,5 +76,6 @@ int parse_crashkernel_high(char *cmdline, unsigned long long system_ram, > unsigned long long *crash_size, unsigned long long *crash_base); > int parse_crashkernel_low(char *cmdline, unsigned long long system_ram, > unsigned long long *crash_size, unsigned long long *crash_base); > +int __init reserve_crashkernel_low(void); > > #endif /* LINUX_CRASH_CORE_H */ > diff --git a/include/linux/kexec.h b/include/linux/kexec.h > index 1776eb2..5d5d963 100644 > --- a/include/linux/kexec.h > +++ b/include/linux/kexec.h > @@ -330,8 +330,6 @@ extern int kexec_load_disabled; > > /* Location of a reserved region to hold the crash kernel. > */ > -extern struct resource crashk_res; > -extern struct resource crashk_low_res; > extern note_buf_t __percpu *crash_notes; > > /* flag to track if kexec reboot is in progress */ > diff --git a/kernel/crash_core.c b/kernel/crash_core.c > index 9f1557b..eb72fd6 100644 > --- a/kernel/crash_core.c > +++ b/kernel/crash_core.c > @@ -7,6 +7,8 @@ > #include <linux/crash_core.h> > #include <linux/utsname.h> > #include <linux/vmalloc.h> > +#include <linux/memblock.h> > +#include <linux/swiotlb.h> > > #include <asm/page.h> > #include <asm/sections.h> > @@ -19,6 +21,22 @@ u32 *vmcoreinfo_note; > /* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */ > static unsigned char *vmcoreinfo_data_safecopy; > > +/* Location of the reserved area for the crash kernel */ > +struct resource crashk_res = { > + .name = "Crash kernel", > + .start = 0, > + .end = 0, > + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, > + .desc = IORES_DESC_CRASH_KERNEL > +}; > +struct resource crashk_low_res = { > + .name = "Crash kernel", > + .start = 0, > + .end = 0, > + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, > + .desc = IORES_DESC_CRASH_KERNEL > +}; > + > /* > * parsing the "crashkernel" commandline > * > @@ -292,6 +310,75 @@ int __init parse_crashkernel_low(char *cmdline, > "crashkernel=", suffix_tbl[SUFFIX_LOW]); > } > > +#if defined(CONFIG_X86_64) > +#define CRASH_ALIGN SZ_16M > +#elif defined(CONFIG_ARM64) > +#define CRASH_ALIGN SZ_2M > +#endif I think no need to have the #ifdef, although I can not think out of reason we have 16M for X86, maybe move it to 2M as well if no other objections. Then it will be easier to reserve crashkernel successfully considering nowadays we have KASLR and other stuff it becomes harder. > + > +int __init reserve_crashkernel_low(void) > +{ > +#if defined(CONFIG_X86_64) || defined(CONFIG_ARM64) > + unsigned long long base, low_base = 0, low_size = 0; > + unsigned long total_low_mem; > + int ret; > + > + total_low_mem = memblock_mem_size(1UL << (32 - PAGE_SHIFT)); > + > + /* crashkernel=Y,low */ > + ret = parse_crashkernel_low(boot_command_line, total_low_mem, &low_size, > + &base); > + if (ret) { > +#ifdef CONFIG_X86_64 > + /* > + * two parts from lib/swiotlb.c: > + * -swiotlb size: user-specified with swiotlb= or default. > + * > + * -swiotlb overflow buffer: now hardcoded to 32k. We round it > + * to 8M for other buffers that may need to stay low too. Also > + * make sure we allocate enough extra low memory so that we > + * don't run out of DMA buffers for 32-bit devices. > + */ > + low_size = max(swiotlb_size_or_default() + (8UL << 20), > + 256UL << 20); > +#else > + /* > + * in arm64, reserve low memory if and only if crashkernel=X,low > + * specified. > + */ > + return -EINVAL; > +#endif As said before, can you explore about why it needs different logic, it would be good to keep two arches same. > + } else { > + /* passed with crashkernel=0,low ? */ > + if (!low_size) > + return 0; > + } > + > + low_base = memblock_find_in_range(0, 1ULL << 32, low_size, CRASH_ALIGN); > + if (!low_base) { > + pr_err("Cannot reserve %ldMB crashkernel low memory, please try smaller size.\n", > + (unsigned long)(low_size >> 20)); > + return -ENOMEM; > + } > + > + ret = memblock_reserve(low_base, low_size); > + if (ret) { > + pr_err("%s: Error reserving crashkernel low memblock.\n", > + __func__); > + return ret; > + } > + > + pr_info("Reserving %ldMB of low memory at %ldMB for crashkernel (System low RAM: %ldMB)\n", > + (unsigned long)(low_size >> 20), > + (unsigned long)(low_base >> 20), > + (unsigned long)(total_low_mem >> 20)); > + > + crashk_low_res.start = low_base; > + crashk_low_res.end = low_base + low_size - 1; > +#endif > + return 0; > +} > + > Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, > void *data, size_t data_len) > { > diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c > index 15d70a9..458d093 100644 > --- a/kernel/kexec_core.c > +++ b/kernel/kexec_core.c > @@ -53,23 +53,6 @@ note_buf_t __percpu *crash_notes; > /* Flag to indicate we are going to kexec a new kernel */ > bool kexec_in_progress = false; > > - > -/* Location of the reserved area for the crash kernel */ > -struct resource crashk_res = { > - .name = "Crash kernel", > - .start = 0, > - .end = 0, > - .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, > - .desc = IORES_DESC_CRASH_KERNEL > -}; > -struct resource crashk_low_res = { > - .name = "Crash kernel", > - .start = 0, > - .end = 0, > - .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, > - .desc = IORES_DESC_CRASH_KERNEL > -}; > - > int kexec_should_crash(struct task_struct *p) > { > /* > -- > 2.7.4 > Thanks Dave
Hi Dave On 2019/12/27 13:54, Dave Young wrote: > Hi, > On 12/23/19 at 11:23pm, Chen Zhou wrote: >> In preparation for supporting reserve_crashkernel_low in arm64 as >> x86_64 does, move reserve_crashkernel_low() into kernel/crash_core.c. >> >> Note, in arm64, we reserve low memory if and only if crashkernel=X,low >> is specified. Different with x86_64, don't set low memory automatically. > > Do you have any reason for the difference? I'd expect we have same > logic if possible and remove some of the ifdefs. In x86_64, if we reserve crashkernel above 4G, then we call reserve_crashkernel_low() to reserve low memory. In arm64, to simplify, we call reserve_crashkernel_low() at the beginning of reserve_crashkernel() and then relax the arm64_dma32_phys_limit if reserve_crashkernel_low() allocated something. In this case, if reserve crashkernel below 4G there will be 256M low memory set automatically and this needs extra considerations. previous discusses: https://lkml.org/lkml/2019/6/5/670 https://lkml.org/lkml/2019/6/13/229 > >> >> Reported-by: kbuild test robot <lkp@intel.com> >> Signed-off-by: Chen Zhou <chenzhou10@huawei.com> >> --- >> arch/x86/kernel/setup.c | 62 ++++----------------------------- >> include/linux/crash_core.h | 3 ++ >> include/linux/kexec.h | 2 -- >> kernel/crash_core.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++ >> kernel/kexec_core.c | 17 --------- >> 5 files changed, 96 insertions(+), 75 deletions(-) >> >> diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c >> index cedfe20..5f38942 100644 >> --- a/arch/x86/kernel/setup.c >> +++ b/arch/x86/kernel/setup.c >> @@ -486,59 +486,6 @@ static void __init memblock_x86_reserve_range_setup_data(void) >> # define CRASH_ADDR_HIGH_MAX SZ_64T >> #endif >> >> -static int __init reserve_crashkernel_low(void) >> -{ >> -#ifdef CONFIG_X86_64 >> - unsigned long long base, low_base = 0, low_size = 0; >> - unsigned long total_low_mem; >> - int ret; >> - >> - total_low_mem = memblock_mem_size(1UL << (32 - PAGE_SHIFT)); >> - >> - /* crashkernel=Y,low */ >> - ret = parse_crashkernel_low(boot_command_line, total_low_mem, &low_size, &base); >> - if (ret) { >> - /* >> - * two parts from kernel/dma/swiotlb.c: >> - * -swiotlb size: user-specified with swiotlb= or default. >> - * >> - * -swiotlb overflow buffer: now hardcoded to 32k. We round it >> - * to 8M for other buffers that may need to stay low too. Also >> - * make sure we allocate enough extra low memory so that we >> - * don't run out of DMA buffers for 32-bit devices. >> - */ >> - low_size = max(swiotlb_size_or_default() + (8UL << 20), 256UL << 20); >> - } else { >> - /* passed with crashkernel=0,low ? */ >> - if (!low_size) >> - return 0; >> - } >> - >> - low_base = memblock_find_in_range(0, 1ULL << 32, low_size, CRASH_ALIGN); >> - if (!low_base) { >> - pr_err("Cannot reserve %ldMB crashkernel low memory, please try smaller size.\n", >> - (unsigned long)(low_size >> 20)); >> - return -ENOMEM; >> - } >> - >> - ret = memblock_reserve(low_base, low_size); >> - if (ret) { >> - pr_err("%s: Error reserving crashkernel low memblock.\n", __func__); >> - return ret; >> - } >> - >> - pr_info("Reserving %ldMB of low memory at %ldMB for crashkernel (System low RAM: %ldMB)\n", >> - (unsigned long)(low_size >> 20), >> - (unsigned long)(low_base >> 20), >> - (unsigned long)(total_low_mem >> 20)); >> - >> - crashk_low_res.start = low_base; >> - crashk_low_res.end = low_base + low_size - 1; >> - insert_resource(&iomem_resource, &crashk_low_res); >> -#endif >> - return 0; >> -} >> - >> static void __init reserve_crashkernel(void) >> { >> unsigned long long crash_size, crash_base, total_mem; >> @@ -602,9 +549,12 @@ static void __init reserve_crashkernel(void) >> return; >> } >> >> - if (crash_base >= (1ULL << 32) && reserve_crashkernel_low()) { >> - memblock_free(crash_base, crash_size); >> - return; >> + if (crash_base >= (1ULL << 32)) { >> + if (reserve_crashkernel_low()) { >> + memblock_free(crash_base, crash_size); >> + return; >> + } >> + insert_resource(&iomem_resource, &crashk_low_res); > > Some specific reason to move insert_resouce out of the > reserve_crashkernel_low function? No specific reason. I just exposed arm64 "Crash kernel low" in request_standard_resources() as other resources, so did this change. > >> } >> >> pr_info("Reserving %ldMB of memory at %ldMB for crashkernel (System RAM: %ldMB)\n", >> diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h >> index 525510a..4df8c0b 100644 >> --- a/include/linux/crash_core.h >> +++ b/include/linux/crash_core.h >> @@ -63,6 +63,8 @@ phys_addr_t paddr_vmcoreinfo_note(void); >> extern unsigned char *vmcoreinfo_data; >> extern size_t vmcoreinfo_size; >> extern u32 *vmcoreinfo_note; >> +extern struct resource crashk_res; >> +extern struct resource crashk_low_res; >> >> Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, >> void *data, size_t data_len); >> @@ -74,5 +76,6 @@ int parse_crashkernel_high(char *cmdline, unsigned long long system_ram, >> unsigned long long *crash_size, unsigned long long *crash_base); >> int parse_crashkernel_low(char *cmdline, unsigned long long system_ram, >> unsigned long long *crash_size, unsigned long long *crash_base); >> +int __init reserve_crashkernel_low(void); >> >> #endif /* LINUX_CRASH_CORE_H */ >> diff --git a/include/linux/kexec.h b/include/linux/kexec.h >> index 1776eb2..5d5d963 100644 >> --- a/include/linux/kexec.h >> +++ b/include/linux/kexec.h >> @@ -330,8 +330,6 @@ extern int kexec_load_disabled; >> >> /* Location of a reserved region to hold the crash kernel. >> */ >> -extern struct resource crashk_res; >> -extern struct resource crashk_low_res; >> extern note_buf_t __percpu *crash_notes; >> >> /* flag to track if kexec reboot is in progress */ >> diff --git a/kernel/crash_core.c b/kernel/crash_core.c >> index 9f1557b..eb72fd6 100644 >> --- a/kernel/crash_core.c >> +++ b/kernel/crash_core.c >> @@ -7,6 +7,8 @@ >> #include <linux/crash_core.h> >> #include <linux/utsname.h> >> #include <linux/vmalloc.h> >> +#include <linux/memblock.h> >> +#include <linux/swiotlb.h> >> >> #include <asm/page.h> >> #include <asm/sections.h> >> @@ -19,6 +21,22 @@ u32 *vmcoreinfo_note; >> /* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */ >> static unsigned char *vmcoreinfo_data_safecopy; >> >> +/* Location of the reserved area for the crash kernel */ >> +struct resource crashk_res = { >> + .name = "Crash kernel", >> + .start = 0, >> + .end = 0, >> + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, >> + .desc = IORES_DESC_CRASH_KERNEL >> +}; >> +struct resource crashk_low_res = { >> + .name = "Crash kernel", >> + .start = 0, >> + .end = 0, >> + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, >> + .desc = IORES_DESC_CRASH_KERNEL >> +}; >> + >> /* >> * parsing the "crashkernel" commandline >> * >> @@ -292,6 +310,75 @@ int __init parse_crashkernel_low(char *cmdline, >> "crashkernel=", suffix_tbl[SUFFIX_LOW]); >> } >> >> +#if defined(CONFIG_X86_64) >> +#define CRASH_ALIGN SZ_16M >> +#elif defined(CONFIG_ARM64) >> +#define CRASH_ALIGN SZ_2M >> +#endif > > I think no need to have the #ifdef, although I can not think out of > reason we have 16M for X86, maybe move it to 2M as well if no other > objections. Then it will be easier to reserve crashkernel successfully > considering nowadays we have KASLR and other stuff it becomes harder. I also don't figure out why it is 16M in x86. > >> + >> +int __init reserve_crashkernel_low(void) >> +{ >> +#if defined(CONFIG_X86_64) || defined(CONFIG_ARM64) >> + unsigned long long base, low_base = 0, low_size = 0; >> + unsigned long total_low_mem; >> + int ret; >> + >> + total_low_mem = memblock_mem_size(1UL << (32 - PAGE_SHIFT)); >> + >> + /* crashkernel=Y,low */ >> + ret = parse_crashkernel_low(boot_command_line, total_low_mem, &low_size, >> + &base); >> + if (ret) { >> +#ifdef CONFIG_X86_64 >> + /* >> + * two parts from lib/swiotlb.c: >> + * -swiotlb size: user-specified with swiotlb= or default. >> + * >> + * -swiotlb overflow buffer: now hardcoded to 32k. We round it >> + * to 8M for other buffers that may need to stay low too. Also >> + * make sure we allocate enough extra low memory so that we >> + * don't run out of DMA buffers for 32-bit devices. >> + */ >> + low_size = max(swiotlb_size_or_default() + (8UL << 20), >> + 256UL << 20); >> +#else >> + /* >> + * in arm64, reserve low memory if and only if crashkernel=X,low >> + * specified. >> + */ >> + return -EINVAL; >> +#endif > > As said before, can you explore about why it needs different logic, it > would be good to keep two arches same. > >> + } else { >> + /* passed with crashkernel=0,low ? */ >> + if (!low_size) >> + return 0; >> + } >> + >> + low_base = memblock_find_in_range(0, 1ULL << 32, low_size, CRASH_ALIGN); >> + if (!low_base) { >> + pr_err("Cannot reserve %ldMB crashkernel low memory, please try smaller size.\n", >> + (unsigned long)(low_size >> 20)); >> + return -ENOMEM; >> + } >> + >> + ret = memblock_reserve(low_base, low_size); >> + if (ret) { >> + pr_err("%s: Error reserving crashkernel low memblock.\n", >> + __func__); >> + return ret; >> + } >> + >> + pr_info("Reserving %ldMB of low memory at %ldMB for crashkernel (System low RAM: %ldMB)\n", >> + (unsigned long)(low_size >> 20), >> + (unsigned long)(low_base >> 20), >> + (unsigned long)(total_low_mem >> 20)); >> + >> + crashk_low_res.start = low_base; >> + crashk_low_res.end = low_base + low_size - 1; >> +#endif >> + return 0; >> +} >> + >> Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, >> void *data, size_t data_len) >> { >> diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c >> index 15d70a9..458d093 100644 >> --- a/kernel/kexec_core.c >> +++ b/kernel/kexec_core.c >> @@ -53,23 +53,6 @@ note_buf_t __percpu *crash_notes; >> /* Flag to indicate we are going to kexec a new kernel */ >> bool kexec_in_progress = false; >> >> - >> -/* Location of the reserved area for the crash kernel */ >> -struct resource crashk_res = { >> - .name = "Crash kernel", >> - .start = 0, >> - .end = 0, >> - .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, >> - .desc = IORES_DESC_CRASH_KERNEL >> -}; >> -struct resource crashk_low_res = { >> - .name = "Crash kernel", >> - .start = 0, >> - .end = 0, >> - .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, >> - .desc = IORES_DESC_CRASH_KERNEL >> -}; >> - >> int kexec_should_crash(struct task_struct *p) >> { >> /* >> -- >> 2.7.4 >> > > Thanks > Dave > > > . > Thanks, Chen Zhou
On 12/27/19 at 07:04pm, Chen Zhou wrote: > Hi Dave > > On 2019/12/27 13:54, Dave Young wrote: > > Hi, > > On 12/23/19 at 11:23pm, Chen Zhou wrote: > >> In preparation for supporting reserve_crashkernel_low in arm64 as > >> x86_64 does, move reserve_crashkernel_low() into kernel/crash_core.c. > >> > >> Note, in arm64, we reserve low memory if and only if crashkernel=X,low > >> is specified. Different with x86_64, don't set low memory automatically. > > > > Do you have any reason for the difference? I'd expect we have same > > logic if possible and remove some of the ifdefs. > > In x86_64, if we reserve crashkernel above 4G, then we call reserve_crashkernel_low() > to reserve low memory. > > In arm64, to simplify, we call reserve_crashkernel_low() at the beginning of reserve_crashkernel() > and then relax the arm64_dma32_phys_limit if reserve_crashkernel_low() allocated something. > In this case, if reserve crashkernel below 4G there will be 256M low memory set automatically > and this needs extra considerations. Sorry that I did not read the old thread details and thought that is arch dependent. But rethink about that, it would be better that we can have same semantic about crashkernel parameters across arches. If we make them different then it causes confusion, especially for distributions. OTOH, I thought if we reserve high memory then the low memory should be needed. There might be some exceptions, but I do not know the exact one, can we make the behavior same, and special case those systems which do not need low memory reservation. > > previous discusses: > https://lkml.org/lkml/2019/6/5/670 > https://lkml.org/lkml/2019/6/13/229 Another concern from James: " With both crashk_low_res and crashk_res, we end up with two entries in /proc/iomem called "Crash kernel". Because its sorted by address, and kexec-tools stops searching when it find "Crash kernel", you are always going to get the kernel placed in the lower portion. " The kexec-tools code is iterating all "Crash kernel" ranges and add them in an array. In X86 code, it uses the higher range to locate memory. > > > > >> > >> Reported-by: kbuild test robot <lkp@intel.com> > >> Signed-off-by: Chen Zhou <chenzhou10@huawei.com> > >> --- > >> arch/x86/kernel/setup.c | 62 ++++----------------------------- > >> include/linux/crash_core.h | 3 ++ > >> include/linux/kexec.h | 2 -- > >> kernel/crash_core.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++ > >> kernel/kexec_core.c | 17 --------- > >> 5 files changed, 96 insertions(+), 75 deletions(-) > >> > >> diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c > >> index cedfe20..5f38942 100644 > >> --- a/arch/x86/kernel/setup.c > >> +++ b/arch/x86/kernel/setup.c > >> @@ -486,59 +486,6 @@ static void __init memblock_x86_reserve_range_setup_data(void) > >> # define CRASH_ADDR_HIGH_MAX SZ_64T > >> #endif > >> > >> -static int __init reserve_crashkernel_low(void) > >> -{ > >> -#ifdef CONFIG_X86_64 > >> - unsigned long long base, low_base = 0, low_size = 0; > >> - unsigned long total_low_mem; > >> - int ret; > >> - > >> - total_low_mem = memblock_mem_size(1UL << (32 - PAGE_SHIFT)); > >> - > >> - /* crashkernel=Y,low */ > >> - ret = parse_crashkernel_low(boot_command_line, total_low_mem, &low_size, &base); > >> - if (ret) { > >> - /* > >> - * two parts from kernel/dma/swiotlb.c: > >> - * -swiotlb size: user-specified with swiotlb= or default. > >> - * > >> - * -swiotlb overflow buffer: now hardcoded to 32k. We round it > >> - * to 8M for other buffers that may need to stay low too. Also > >> - * make sure we allocate enough extra low memory so that we > >> - * don't run out of DMA buffers for 32-bit devices. > >> - */ > >> - low_size = max(swiotlb_size_or_default() + (8UL << 20), 256UL << 20); > >> - } else { > >> - /* passed with crashkernel=0,low ? */ > >> - if (!low_size) > >> - return 0; > >> - } > >> - > >> - low_base = memblock_find_in_range(0, 1ULL << 32, low_size, CRASH_ALIGN); > >> - if (!low_base) { > >> - pr_err("Cannot reserve %ldMB crashkernel low memory, please try smaller size.\n", > >> - (unsigned long)(low_size >> 20)); > >> - return -ENOMEM; > >> - } > >> - > >> - ret = memblock_reserve(low_base, low_size); > >> - if (ret) { > >> - pr_err("%s: Error reserving crashkernel low memblock.\n", __func__); > >> - return ret; > >> - } > >> - > >> - pr_info("Reserving %ldMB of low memory at %ldMB for crashkernel (System low RAM: %ldMB)\n", > >> - (unsigned long)(low_size >> 20), > >> - (unsigned long)(low_base >> 20), > >> - (unsigned long)(total_low_mem >> 20)); > >> - > >> - crashk_low_res.start = low_base; > >> - crashk_low_res.end = low_base + low_size - 1; > >> - insert_resource(&iomem_resource, &crashk_low_res); > >> -#endif > >> - return 0; > >> -} > >> - > >> static void __init reserve_crashkernel(void) > >> { > >> unsigned long long crash_size, crash_base, total_mem; > >> @@ -602,9 +549,12 @@ static void __init reserve_crashkernel(void) > >> return; > >> } > >> > >> - if (crash_base >= (1ULL << 32) && reserve_crashkernel_low()) { > >> - memblock_free(crash_base, crash_size); > >> - return; > >> + if (crash_base >= (1ULL << 32)) { > >> + if (reserve_crashkernel_low()) { > >> + memblock_free(crash_base, crash_size); > >> + return; > >> + } > >> + insert_resource(&iomem_resource, &crashk_low_res); > > > > Some specific reason to move insert_resouce out of the > > reserve_crashkernel_low function? > > No specific reason. > I just exposed arm64 "Crash kernel low" in request_standard_resources() as other resources, > so did this change. Ok. > > > > >> } > >> > >> pr_info("Reserving %ldMB of memory at %ldMB for crashkernel (System RAM: %ldMB)\n", > >> diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h > >> index 525510a..4df8c0b 100644 > >> --- a/include/linux/crash_core.h > >> +++ b/include/linux/crash_core.h > >> @@ -63,6 +63,8 @@ phys_addr_t paddr_vmcoreinfo_note(void); > >> extern unsigned char *vmcoreinfo_data; > >> extern size_t vmcoreinfo_size; > >> extern u32 *vmcoreinfo_note; > >> +extern struct resource crashk_res; > >> +extern struct resource crashk_low_res; > >> > >> Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, > >> void *data, size_t data_len); > >> @@ -74,5 +76,6 @@ int parse_crashkernel_high(char *cmdline, unsigned long long system_ram, > >> unsigned long long *crash_size, unsigned long long *crash_base); > >> int parse_crashkernel_low(char *cmdline, unsigned long long system_ram, > >> unsigned long long *crash_size, unsigned long long *crash_base); > >> +int __init reserve_crashkernel_low(void); > >> > >> #endif /* LINUX_CRASH_CORE_H */ > >> diff --git a/include/linux/kexec.h b/include/linux/kexec.h > >> index 1776eb2..5d5d963 100644 > >> --- a/include/linux/kexec.h > >> +++ b/include/linux/kexec.h > >> @@ -330,8 +330,6 @@ extern int kexec_load_disabled; > >> > >> /* Location of a reserved region to hold the crash kernel. > >> */ > >> -extern struct resource crashk_res; > >> -extern struct resource crashk_low_res; > >> extern note_buf_t __percpu *crash_notes; > >> > >> /* flag to track if kexec reboot is in progress */ > >> diff --git a/kernel/crash_core.c b/kernel/crash_core.c > >> index 9f1557b..eb72fd6 100644 > >> --- a/kernel/crash_core.c > >> +++ b/kernel/crash_core.c > >> @@ -7,6 +7,8 @@ > >> #include <linux/crash_core.h> > >> #include <linux/utsname.h> > >> #include <linux/vmalloc.h> > >> +#include <linux/memblock.h> > >> +#include <linux/swiotlb.h> > >> > >> #include <asm/page.h> > >> #include <asm/sections.h> > >> @@ -19,6 +21,22 @@ u32 *vmcoreinfo_note; > >> /* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */ > >> static unsigned char *vmcoreinfo_data_safecopy; > >> > >> +/* Location of the reserved area for the crash kernel */ > >> +struct resource crashk_res = { > >> + .name = "Crash kernel", > >> + .start = 0, > >> + .end = 0, > >> + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, > >> + .desc = IORES_DESC_CRASH_KERNEL > >> +}; > >> +struct resource crashk_low_res = { > >> + .name = "Crash kernel", > >> + .start = 0, > >> + .end = 0, > >> + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, > >> + .desc = IORES_DESC_CRASH_KERNEL > >> +}; > >> + > >> /* > >> * parsing the "crashkernel" commandline > >> * > >> @@ -292,6 +310,75 @@ int __init parse_crashkernel_low(char *cmdline, > >> "crashkernel=", suffix_tbl[SUFFIX_LOW]); > >> } > >> > >> +#if defined(CONFIG_X86_64) > >> +#define CRASH_ALIGN SZ_16M > >> +#elif defined(CONFIG_ARM64) > >> +#define CRASH_ALIGN SZ_2M > >> +#endif > > > > I think no need to have the #ifdef, although I can not think out of > > reason we have 16M for X86, maybe move it to 2M as well if no other > > objections. Then it will be easier to reserve crashkernel successfully > > considering nowadays we have KASLR and other stuff it becomes harder. > > I also don't figure out why it is 16M in x86. IMHO, if we do not know why and in theory it should work with 2M, can you do some basic testing and move it to 2M? We can easily move back to 16M if someone really report something, but if we do not change it will always stay there but we do not know why. > > > > >> + > >> +int __init reserve_crashkernel_low(void) > >> +{ > >> +#if defined(CONFIG_X86_64) || defined(CONFIG_ARM64) > >> + unsigned long long base, low_base = 0, low_size = 0; > >> + unsigned long total_low_mem; > >> + int ret; > >> + > >> + total_low_mem = memblock_mem_size(1UL << (32 - PAGE_SHIFT)); > >> + > >> + /* crashkernel=Y,low */ > >> + ret = parse_crashkernel_low(boot_command_line, total_low_mem, &low_size, > >> + &base); > >> + if (ret) { > >> +#ifdef CONFIG_X86_64 > >> + /* > >> + * two parts from lib/swiotlb.c: > >> + * -swiotlb size: user-specified with swiotlb= or default. > >> + * > >> + * -swiotlb overflow buffer: now hardcoded to 32k. We round it > >> + * to 8M for other buffers that may need to stay low too. Also > >> + * make sure we allocate enough extra low memory so that we > >> + * don't run out of DMA buffers for 32-bit devices. > >> + */ > >> + low_size = max(swiotlb_size_or_default() + (8UL << 20), > >> + 256UL << 20); > >> +#else > >> + /* > >> + * in arm64, reserve low memory if and only if crashkernel=X,low > >> + * specified. > >> + */ > >> + return -EINVAL; > >> +#endif > > > > As said before, can you explore about why it needs different logic, it > > would be good to keep two arches same. > > > >> + } else { > >> + /* passed with crashkernel=0,low ? */ > >> + if (!low_size) > >> + return 0; > >> + } > >> + > >> + low_base = memblock_find_in_range(0, 1ULL << 32, low_size, CRASH_ALIGN); > >> + if (!low_base) { > >> + pr_err("Cannot reserve %ldMB crashkernel low memory, please try smaller size.\n", > >> + (unsigned long)(low_size >> 20)); > >> + return -ENOMEM; > >> + } > >> + > >> + ret = memblock_reserve(low_base, low_size); > >> + if (ret) { > >> + pr_err("%s: Error reserving crashkernel low memblock.\n", > >> + __func__); > >> + return ret; > >> + } > >> + > >> + pr_info("Reserving %ldMB of low memory at %ldMB for crashkernel (System low RAM: %ldMB)\n", > >> + (unsigned long)(low_size >> 20), > >> + (unsigned long)(low_base >> 20), > >> + (unsigned long)(total_low_mem >> 20)); > >> + > >> + crashk_low_res.start = low_base; > >> + crashk_low_res.end = low_base + low_size - 1; > >> +#endif > >> + return 0; > >> +} > >> + > >> Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, > >> void *data, size_t data_len) > >> { > >> diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c > >> index 15d70a9..458d093 100644 > >> --- a/kernel/kexec_core.c > >> +++ b/kernel/kexec_core.c > >> @@ -53,23 +53,6 @@ note_buf_t __percpu *crash_notes; > >> /* Flag to indicate we are going to kexec a new kernel */ > >> bool kexec_in_progress = false; > >> > >> - > >> -/* Location of the reserved area for the crash kernel */ > >> -struct resource crashk_res = { > >> - .name = "Crash kernel", > >> - .start = 0, > >> - .end = 0, > >> - .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, > >> - .desc = IORES_DESC_CRASH_KERNEL > >> -}; > >> -struct resource crashk_low_res = { > >> - .name = "Crash kernel", > >> - .start = 0, > >> - .end = 0, > >> - .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, > >> - .desc = IORES_DESC_CRASH_KERNEL > >> -}; > >> - > >> int kexec_should_crash(struct task_struct *p) > >> { > >> /* > >> -- > >> 2.7.4 > >> > > > > Thanks > > Dave > > > > > > . > > > Thanks, > Chen Zhou > Thanks Dave
Hi Dave, On 2019/12/28 17:32, Dave Young wrote: > On 12/27/19 at 07:04pm, Chen Zhou wrote: >> Hi Dave >> >> On 2019/12/27 13:54, Dave Young wrote: >>> Hi, >>> On 12/23/19 at 11:23pm, Chen Zhou wrote: >>>> In preparation for supporting reserve_crashkernel_low in arm64 as >>>> x86_64 does, move reserve_crashkernel_low() into kernel/crash_core.c. >>>> >>>> Note, in arm64, we reserve low memory if and only if crashkernel=X,low >>>> is specified. Different with x86_64, don't set low memory automatically. >>> >>> Do you have any reason for the difference? I'd expect we have same >>> logic if possible and remove some of the ifdefs. >> >> In x86_64, if we reserve crashkernel above 4G, then we call reserve_crashkernel_low() >> to reserve low memory. >> >> In arm64, to simplify, we call reserve_crashkernel_low() at the beginning of reserve_crashkernel() >> and then relax the arm64_dma32_phys_limit if reserve_crashkernel_low() allocated something. >> In this case, if reserve crashkernel below 4G there will be 256M low memory set automatically >> and this needs extra considerations. > > Sorry that I did not read the old thread details and thought that is > arch dependent. But rethink about that, it would be better that we can > have same semantic about crashkernel parameters across arches. If we > make them different then it causes confusion, especially for > distributions. > > OTOH, I thought if we reserve high memory then the low memory should be > needed. There might be some exceptions, but I do not know the exact > one, can we make the behavior same, and special case those systems which > do not need low memory reservation. > I thought like this and did implement with crashkernel parameters arch independent. This is my v4: https://lkml.org/lkml/2019/5/6/1361, i implemented according to x86_64's behavior. >> >> previous discusses: >> https://lkml.org/lkml/2019/6/5/670 >> https://lkml.org/lkml/2019/6/13/229 > > Another concern from James: > " > With both crashk_low_res and crashk_res, we end up with two entries in /proc/iomem called > "Crash kernel". Because its sorted by address, and kexec-tools stops searching when it > find "Crash kernel", you are always going to get the kernel placed in the lower portion. > " > > The kexec-tools code is iterating all "Crash kernel" ranges and add them > in an array. In X86 code, it uses the higher range to locate memory. We also discussed about this: https://lkml.org/lkml/2019/6/13/227. I guess James's opinion is that kexec-tools should take forward compatibility into account. "But we can't rely on people updating user-space when they update the kernel!" -- James > >> >>> >>>> >>>> Reported-by: kbuild test robot <lkp@intel.com> >>>> Signed-off-by: Chen Zhou <chenzhou10@huawei.com> >>>> --- >>>> arch/x86/kernel/setup.c | 62 ++++----------------------------- >>>> include/linux/crash_core.h | 3 ++ >>>> include/linux/kexec.h | 2 -- >>>> kernel/crash_core.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++ >>>> kernel/kexec_core.c | 17 --------- >>>> 5 files changed, 96 insertions(+), 75 deletions(-) >>>> >>>> diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c >>>> index cedfe20..5f38942 100644 >>>> --- a/arch/x86/kernel/setup.c >>>> +++ b/arch/x86/kernel/setup.c >>>> @@ -486,59 +486,6 @@ static void __init memblock_x86_reserve_range_setup_data(void) >>>> # define CRASH_ADDR_HIGH_MAX SZ_64T >>>> #endif >>>> >>>> -static int __init reserve_crashkernel_low(void) >>>> -{ >>>> -#ifdef CONFIG_X86_64 >>>> - unsigned long long base, low_base = 0, low_size = 0; >>>> - unsigned long total_low_mem; >>>> - int ret; >>>> - >>>> - total_low_mem = memblock_mem_size(1UL << (32 - PAGE_SHIFT)); >>>> - >>>> - /* crashkernel=Y,low */ >>>> - ret = parse_crashkernel_low(boot_command_line, total_low_mem, &low_size, &base); >>>> - if (ret) { >>>> - /* >>>> - * two parts from kernel/dma/swiotlb.c: >>>> - * -swiotlb size: user-specified with swiotlb= or default. >>>> - * >>>> - * -swiotlb overflow buffer: now hardcoded to 32k. We round it >>>> - * to 8M for other buffers that may need to stay low too. Also >>>> - * make sure we allocate enough extra low memory so that we >>>> - * don't run out of DMA buffers for 32-bit devices. >>>> - */ >>>> - low_size = max(swiotlb_size_or_default() + (8UL << 20), 256UL << 20); >>>> - } else { >>>> - /* passed with crashkernel=0,low ? */ >>>> - if (!low_size) >>>> - return 0; >>>> - } >>>> - >>>> - low_base = memblock_find_in_range(0, 1ULL << 32, low_size, CRASH_ALIGN); >>>> - if (!low_base) { >>>> - pr_err("Cannot reserve %ldMB crashkernel low memory, please try smaller size.\n", >>>> - (unsigned long)(low_size >> 20)); >>>> - return -ENOMEM; >>>> - } >>>> - >>>> - ret = memblock_reserve(low_base, low_size); >>>> - if (ret) { >>>> - pr_err("%s: Error reserving crashkernel low memblock.\n", __func__); >>>> - return ret; >>>> - } >>>> - >>>> - pr_info("Reserving %ldMB of low memory at %ldMB for crashkernel (System low RAM: %ldMB)\n", >>>> - (unsigned long)(low_size >> 20), >>>> - (unsigned long)(low_base >> 20), >>>> - (unsigned long)(total_low_mem >> 20)); >>>> - >>>> - crashk_low_res.start = low_base; >>>> - crashk_low_res.end = low_base + low_size - 1; >>>> - insert_resource(&iomem_resource, &crashk_low_res); >>>> -#endif >>>> - return 0; >>>> -} >>>> - >>>> static void __init reserve_crashkernel(void) >>>> { >>>> unsigned long long crash_size, crash_base, total_mem; >>>> @@ -602,9 +549,12 @@ static void __init reserve_crashkernel(void) >>>> return; >>>> } >>>> >>>> - if (crash_base >= (1ULL << 32) && reserve_crashkernel_low()) { >>>> - memblock_free(crash_base, crash_size); >>>> - return; >>>> + if (crash_base >= (1ULL << 32)) { >>>> + if (reserve_crashkernel_low()) { >>>> + memblock_free(crash_base, crash_size); >>>> + return; >>>> + } >>>> + insert_resource(&iomem_resource, &crashk_low_res); >>> >>> Some specific reason to move insert_resouce out of the >>> reserve_crashkernel_low function? >> >> No specific reason. >> I just exposed arm64 "Crash kernel low" in request_standard_resources() as other resources, >> so did this change. > > Ok. > >> >>> >>>> } >>>> >>>> pr_info("Reserving %ldMB of memory at %ldMB for crashkernel (System RAM: %ldMB)\n", >>>> diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h >>>> index 525510a..4df8c0b 100644 >>>> --- a/include/linux/crash_core.h >>>> +++ b/include/linux/crash_core.h >>>> @@ -63,6 +63,8 @@ phys_addr_t paddr_vmcoreinfo_note(void); >>>> extern unsigned char *vmcoreinfo_data; >>>> extern size_t vmcoreinfo_size; >>>> extern u32 *vmcoreinfo_note; >>>> +extern struct resource crashk_res; >>>> +extern struct resource crashk_low_res; >>>> >>>> Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, >>>> void *data, size_t data_len); >>>> @@ -74,5 +76,6 @@ int parse_crashkernel_high(char *cmdline, unsigned long long system_ram, >>>> unsigned long long *crash_size, unsigned long long *crash_base); >>>> int parse_crashkernel_low(char *cmdline, unsigned long long system_ram, >>>> unsigned long long *crash_size, unsigned long long *crash_base); >>>> +int __init reserve_crashkernel_low(void); >>>> >>>> #endif /* LINUX_CRASH_CORE_H */ >>>> diff --git a/include/linux/kexec.h b/include/linux/kexec.h >>>> index 1776eb2..5d5d963 100644 >>>> --- a/include/linux/kexec.h >>>> +++ b/include/linux/kexec.h >>>> @@ -330,8 +330,6 @@ extern int kexec_load_disabled; >>>> >>>> /* Location of a reserved region to hold the crash kernel. >>>> */ >>>> -extern struct resource crashk_res; >>>> -extern struct resource crashk_low_res; >>>> extern note_buf_t __percpu *crash_notes; >>>> >>>> /* flag to track if kexec reboot is in progress */ >>>> diff --git a/kernel/crash_core.c b/kernel/crash_core.c >>>> index 9f1557b..eb72fd6 100644 >>>> --- a/kernel/crash_core.c >>>> +++ b/kernel/crash_core.c >>>> @@ -7,6 +7,8 @@ >>>> #include <linux/crash_core.h> >>>> #include <linux/utsname.h> >>>> #include <linux/vmalloc.h> >>>> +#include <linux/memblock.h> >>>> +#include <linux/swiotlb.h> >>>> >>>> #include <asm/page.h> >>>> #include <asm/sections.h> >>>> @@ -19,6 +21,22 @@ u32 *vmcoreinfo_note; >>>> /* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */ >>>> static unsigned char *vmcoreinfo_data_safecopy; >>>> >>>> +/* Location of the reserved area for the crash kernel */ >>>> +struct resource crashk_res = { >>>> + .name = "Crash kernel", >>>> + .start = 0, >>>> + .end = 0, >>>> + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, >>>> + .desc = IORES_DESC_CRASH_KERNEL >>>> +}; >>>> +struct resource crashk_low_res = { >>>> + .name = "Crash kernel", >>>> + .start = 0, >>>> + .end = 0, >>>> + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, >>>> + .desc = IORES_DESC_CRASH_KERNEL >>>> +}; >>>> + >>>> /* >>>> * parsing the "crashkernel" commandline >>>> * >>>> @@ -292,6 +310,75 @@ int __init parse_crashkernel_low(char *cmdline, >>>> "crashkernel=", suffix_tbl[SUFFIX_LOW]); >>>> } >>>> >>>> +#if defined(CONFIG_X86_64) >>>> +#define CRASH_ALIGN SZ_16M >>>> +#elif defined(CONFIG_ARM64) >>>> +#define CRASH_ALIGN SZ_2M >>>> +#endif >>> >>> I think no need to have the #ifdef, although I can not think out of >>> reason we have 16M for X86, maybe move it to 2M as well if no other >>> objections. Then it will be easier to reserve crashkernel successfully >>> considering nowadays we have KASLR and other stuff it becomes harder. >> >> I also don't figure out why it is 16M in x86. > > IMHO, if we do not know why and in theory it should work with 2M, can > you do some basic testing and move it to 2M? > > We can easily move back to 16M if someone really report something, but > if we do not change it will always stay there but we do not know why. Ok. I will do some test later. > >> >>> >>>> + >>>> +int __init reserve_crashkernel_low(void) >>>> +{ >>>> +#if defined(CONFIG_X86_64) || defined(CONFIG_ARM64) >>>> + unsigned long long base, low_base = 0, low_size = 0; >>>> + unsigned long total_low_mem; >>>> + int ret; >>>> + >>>> + total_low_mem = memblock_mem_size(1UL << (32 - PAGE_SHIFT)); >>>> + >>>> + /* crashkernel=Y,low */ >>>> + ret = parse_crashkernel_low(boot_command_line, total_low_mem, &low_size, >>>> + &base); >>>> + if (ret) { >>>> +#ifdef CONFIG_X86_64 >>>> + /* >>>> + * two parts from lib/swiotlb.c: >>>> + * -swiotlb size: user-specified with swiotlb= or default. >>>> + * >>>> + * -swiotlb overflow buffer: now hardcoded to 32k. We round it >>>> + * to 8M for other buffers that may need to stay low too. Also >>>> + * make sure we allocate enough extra low memory so that we >>>> + * don't run out of DMA buffers for 32-bit devices. >>>> + */ >>>> + low_size = max(swiotlb_size_or_default() + (8UL << 20), >>>> + 256UL << 20); >>>> +#else >>>> + /* >>>> + * in arm64, reserve low memory if and only if crashkernel=X,low >>>> + * specified. >>>> + */ >>>> + return -EINVAL; >>>> +#endif >>> >>> As said before, can you explore about why it needs different logic, it >>> would be good to keep two arches same. >>> >>>> + } else { >>>> + /* passed with crashkernel=0,low ? */ >>>> + if (!low_size) >>>> + return 0; >>>> + } >>>> + >>>> + low_base = memblock_find_in_range(0, 1ULL << 32, low_size, CRASH_ALIGN); >>>> + if (!low_base) { >>>> + pr_err("Cannot reserve %ldMB crashkernel low memory, please try smaller size.\n", >>>> + (unsigned long)(low_size >> 20)); >>>> + return -ENOMEM; >>>> + } >>>> + >>>> + ret = memblock_reserve(low_base, low_size); >>>> + if (ret) { >>>> + pr_err("%s: Error reserving crashkernel low memblock.\n", >>>> + __func__); >>>> + return ret; >>>> + } >>>> + >>>> + pr_info("Reserving %ldMB of low memory at %ldMB for crashkernel (System low RAM: %ldMB)\n", >>>> + (unsigned long)(low_size >> 20), >>>> + (unsigned long)(low_base >> 20), >>>> + (unsigned long)(total_low_mem >> 20)); >>>> + >>>> + crashk_low_res.start = low_base; >>>> + crashk_low_res.end = low_base + low_size - 1; >>>> +#endif >>>> + return 0; >>>> +} >>>> + >>>> Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, >>>> void *data, size_t data_len) >>>> { >>>> diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c >>>> index 15d70a9..458d093 100644 >>>> --- a/kernel/kexec_core.c >>>> +++ b/kernel/kexec_core.c >>>> @@ -53,23 +53,6 @@ note_buf_t __percpu *crash_notes; >>>> /* Flag to indicate we are going to kexec a new kernel */ >>>> bool kexec_in_progress = false; >>>> >>>> - >>>> -/* Location of the reserved area for the crash kernel */ >>>> -struct resource crashk_res = { >>>> - .name = "Crash kernel", >>>> - .start = 0, >>>> - .end = 0, >>>> - .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, >>>> - .desc = IORES_DESC_CRASH_KERNEL >>>> -}; >>>> -struct resource crashk_low_res = { >>>> - .name = "Crash kernel", >>>> - .start = 0, >>>> - .end = 0, >>>> - .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, >>>> - .desc = IORES_DESC_CRASH_KERNEL >>>> -}; >>>> - >>>> int kexec_should_crash(struct task_struct *p) >>>> { >>>> /* >>>> -- >>>> 2.7.4 >>>> >>> >>> Thanks >>> Dave >>> >>> >>> . >>> >> Thanks, >> Chen Zhou >> > > Thanks > Dave > > Thanks, Chen Zhou
Hi guys, On 28/12/2019 09:32, Dave Young wrote: > On 12/27/19 at 07:04pm, Chen Zhou wrote: >> On 2019/12/27 13:54, Dave Young wrote: >>> On 12/23/19 at 11:23pm, Chen Zhou wrote: >>>> In preparation for supporting reserve_crashkernel_low in arm64 as >>>> x86_64 does, move reserve_crashkernel_low() into kernel/crash_core.c. >>>> >>>> Note, in arm64, we reserve low memory if and only if crashkernel=X,low >>>> is specified. Different with x86_64, don't set low memory automatically. >>> >>> Do you have any reason for the difference? I'd expect we have same >>> logic if possible and remove some of the ifdefs. >> >> In x86_64, if we reserve crashkernel above 4G, then we call reserve_crashkernel_low() >> to reserve low memory. >> >> In arm64, to simplify, we call reserve_crashkernel_low() at the beginning of reserve_crashkernel() >> and then relax the arm64_dma32_phys_limit if reserve_crashkernel_low() allocated something. >> In this case, if reserve crashkernel below 4G there will be 256M low memory set automatically >> and this needs extra considerations. > Sorry that I did not read the old thread details and thought that is > arch dependent. But rethink about that, it would be better that we can > have same semantic about crashkernel parameters across arches. If we > make them different then it causes confusion, especially for > distributions. Surely distros also want one crashkernel* string they can use on all platforms without having to detect the kernel version, platform or changeable memory layout... > OTOH, I thought if we reserve high memory then the low memory should be > needed. There might be some exceptions, but I do not know the exact > one, > can we make the behavior same, and special case those systems which > do not need low memory reservation. Its tricky to work out which systems are the 'normal' ones. We don't have a fixed memory layout for arm64. Some systems have no memory below 4G. Others have no memory above 4G. Chen Zhou's machine has some memory below 4G, but its too precious to reserve a large chunk for kdump. Without any memory below 4G some of the drivers won't work. I don't see what distros can set as their default for all platforms if high/low are mutually exclusive with the 'crashkernel=' in use today. How did x86 navigate this, ... or was it so long ago? No one else has reported a problem with the existing placement logic, hence treating this 'low' thing as the 'in addition' special case. >> previous discusses: >> https://lkml.org/lkml/2019/6/5/670 >> https://lkml.org/lkml/2019/6/13/229 > > Another concern from James: > " > With both crashk_low_res and crashk_res, we end up with two entries in /proc/iomem called > "Crash kernel". Because its sorted by address, and kexec-tools stops searching when it > find "Crash kernel", you are always going to get the kernel placed in the lower portion. > " > > The kexec-tools code is iterating all "Crash kernel" ranges and add them > in an array. In X86 code, it uses the higher range to locate memory. Then my hurried reading of what the user-space code does was wrong! If kexec-tools places the kernel in the low region, there may not be enough memory left for whatever purpose it was reserved for. This was the motivation for giving it a different name. Thanks, James
> On Jan 16, 2020, at 9:17 AM, James Morse <james.morse@arm.com> wrote: > > Hi guys, > > On 28/12/2019 09:32, Dave Young wrote: >> On 12/27/19 at 07:04pm, Chen Zhou wrote: >>> On 2019/12/27 13:54, Dave Young wrote: >>>> On 12/23/19 at 11:23pm, Chen Zhou wrote: >>>>> In preparation for supporting reserve_crashkernel_low in arm64 as >>>>> x86_64 does, move reserve_crashkernel_low() into kernel/crash_core.c. >>>>> >>>>> Note, in arm64, we reserve low memory if and only if crashkernel=X,low >>>>> is specified. Different with x86_64, don't set low memory automatically. >>>> >>>> Do you have any reason for the difference? I'd expect we have same >>>> logic if possible and remove some of the ifdefs. >>> >>> In x86_64, if we reserve crashkernel above 4G, then we call reserve_crashkernel_low() >>> to reserve low memory. >>> >>> In arm64, to simplify, we call reserve_crashkernel_low() at the beginning of reserve_crashkernel() >>> and then relax the arm64_dma32_phys_limit if reserve_crashkernel_low() allocated something. >>> In this case, if reserve crashkernel below 4G there will be 256M low memory set automatically >>> and this needs extra considerations. > >> Sorry that I did not read the old thread details and thought that is >> arch dependent. But rethink about that, it would be better that we can >> have same semantic about crashkernel parameters across arches. If we >> make them different then it causes confusion, especially for >> distributions. > > Surely distros also want one crashkernel* string they can use on all platforms without > having to detect the kernel version, platform or changeable memory layout... > > >> OTOH, I thought if we reserve high memory then the low memory should be >> needed. There might be some exceptions, but I do not know the exact >> one, > >> can we make the behavior same, and special case those systems which >> do not need low memory reservation. > > Its tricky to work out which systems are the 'normal' ones. > > We don't have a fixed memory layout for arm64. Some systems have no memory below 4G. > Others have no memory above 4G. > > Chen Zhou's machine has some memory below 4G, but its too precious to reserve a large > chunk for kdump. Without any memory below 4G some of the drivers won't work. > > I don't see what distros can set as their default for all platforms if high/low are > mutually exclusive with the 'crashkernel=' in use today. How did x86 navigate this, ... or > was it so long ago? > > No one else has reported a problem with the existing placement logic, hence treating this > 'low' thing as the 'in addition' special case. Hi, I am seeing similar Arm crash dump issues on 5.4 kernels where we need rather large amount of crashkernel memory reserved that is not available below 4GB ( The maximum reserved size appears to be around 768M ) . When I pick memory range higher than 4GB , I see adapters that fail to initialize : There is no low-memory <4G memory for DMA ; [ 11.506792] kworker/0:14: page allocation failure: order:0, mode:0x104(GFP_DMA32|__GFP_ZERO), nodemask=(null),cpuset=/,mems_allowed=0 [ 11.518793] CPU: 0 PID: 150 Comm: kworker/0:14 Not tainted 5.4.0-1948.3.el8uek.aarch64 #1 [ 11.526955] Hardware name: To be filled by O.E.M. Saber/Saber, BIOS 0ACKL025 01/18/2019 [ 11.534948] Workqueue: events work_for_cpu_fn [ 11.539291] Call trace: [ 11.541727] dump_backtrace+0x0/0x18c [ 11.545376] show_stack+0x24/0x30 [ 11.548679] dump_stack+0xbc/0xe0 [ 11.551982] warn_alloc+0xf0/0x15c [ 11.555370] __alloc_pages_slowpath+0xb4c/0xb84 [ 11.559887] __alloc_pages_nodemask+0x2d0/0x330 [ 11.564405] alloc_pages_current+0x8c/0xf8 [ 11.568496] ttm_bo_device_init+0x188/0x220 [ttm] [ 11.573187] drm_vram_mm_init+0x58/0x80 [drm_vram_helper] [ 11.578572] drm_vram_helper_alloc_mm+0x64/0xb0 [drm_vram_helper] [ 11.584655] ast_mm_init+0x38/0x80 [ast] [ 11.588566] ast_driver_load+0x474/0xa70 [ast] [ 11.593029] drm_dev_register+0x144/0x1c8 [drm] [ 11.597573] drm_get_pci_dev+0xa4/0x168 [drm] [ 11.601919] ast_pci_probe+0x8c/0x9c [ast] [ 11.606004] local_pci_probe+0x44/0x98 [ 11.609739] work_for_cpu_fn+0x20/0x30 [ 11.613474] process_one_work+0x1c4/0x41c [ 11.617470] worker_thread+0x150/0x4b0 [ 11.621206] kthread+0x110/0x114 [ 11.624422] ret_from_fork+0x10/0x18 This failure is related to a graphics adapter. The more complex kdump configurations that use networking stack to NFS mount a filesystem to dump to , or use ssh to copy to another machine, require more crashkernel memory reservations than perhaps the “default*” settings of a minimal kdump that creates a minimal vmcore to local storage in /var/crash. If crashkernel is too small I get Out of Memory issues and the entire vmcore process fails. ( *default kdump setting I assume are a minimal vmcore to /var/crash using primary boot device where /root is located ) > > >>> previous discusses: >>> https://urldefense.proofpoint.com/v2/url?u=https-3A__lkml.org_lkml_2019_6_5_670&d=DwICAg&c=RoP1YumCXCgaWHvlZYR8PZh8Bv7qIrMUB65eapI_JnE&r=t2fPg9D87F7D8jm0_3CG9yoiIKdRg4qc_thBw4bzMhc&m=jOAu1DTDpohsWszalfTCYx46eGF19TSWVLchN5yBPgk&s=gS9BLOkmj78lP5L7SP6_VLHwvP249uWKaE2R7N7sxgM&e= >>> https://urldefense.proofpoint.com/v2/url?u=https-3A__lkml.org_lkml_2019_6_13_229&d=DwICAg&c=RoP1YumCXCgaWHvlZYR8PZh8Bv7qIrMUB65eapI_JnE&r=t2fPg9D87F7D8jm0_3CG9yoiIKdRg4qc_thBw4bzMhc&m=jOAu1DTDpohsWszalfTCYx46eGF19TSWVLchN5yBPgk&s=U1Nis29n3A7XSBzED53fiE4MDAv5NlxYp1UorvvBOOw&e= >> >> Another concern from James: >> " >> With both crashk_low_res and crashk_res, we end up with two entries in /proc/iomem called >> "Crash kernel". Because its sorted by address, and kexec-tools stops searching when it >> find "Crash kernel", you are always going to get the kernel placed in the lower portion. >> " >> >> The kexec-tools code is iterating all "Crash kernel" ranges and add them >> in an array. In X86 code, it uses the higher range to locate memory. > > Then my hurried reading of what the user-space code does was wrong! > > If kexec-tools places the kernel in the low region, there may not be enough memory left > for whatever purpose it was reserved for. This was the motivation for giving it a > different name. > > > Thanks, > > James > > _______________________________________________ > kexec mailing list > kexec@lists.infradead.org > https://urldefense.proofpoint.com/v2/url?u=http-3A__lists.infradead.org_mailman_listinfo_kexec&d=DwICAg&c=RoP1YumCXCgaWHvlZYR8PZh8Bv7qIrMUB65eapI_JnE&r=t2fPg9D87F7D8jm0_3CG9yoiIKdRg4qc_thBw4bzMhc&m=jOAu1DTDpohsWszalfTCYx46eGF19TSWVLchN5yBPgk&s=bqp02iQDP_Ez-XvLIvj-IPHqbbZwMPlDgmEcG8vhXFE&e=
On 01/16/20 at 03:17pm, James Morse wrote: > Hi guys, > > On 28/12/2019 09:32, Dave Young wrote: > > On 12/27/19 at 07:04pm, Chen Zhou wrote: > >> On 2019/12/27 13:54, Dave Young wrote: > >>> On 12/23/19 at 11:23pm, Chen Zhou wrote: > >>>> In preparation for supporting reserve_crashkernel_low in arm64 as > >>>> x86_64 does, move reserve_crashkernel_low() into kernel/crash_core.c. > >>>> > >>>> Note, in arm64, we reserve low memory if and only if crashkernel=X,low > >>>> is specified. Different with x86_64, don't set low memory automatically. > >>> > >>> Do you have any reason for the difference? I'd expect we have same > >>> logic if possible and remove some of the ifdefs. > >> > >> In x86_64, if we reserve crashkernel above 4G, then we call reserve_crashkernel_low() > >> to reserve low memory. > >> > >> In arm64, to simplify, we call reserve_crashkernel_low() at the beginning of reserve_crashkernel() > >> and then relax the arm64_dma32_phys_limit if reserve_crashkernel_low() allocated something. > >> In this case, if reserve crashkernel below 4G there will be 256M low memory set automatically > >> and this needs extra considerations. > > > Sorry that I did not read the old thread details and thought that is > > arch dependent. But rethink about that, it would be better that we can > > have same semantic about crashkernel parameters across arches. If we > > make them different then it causes confusion, especially for > > distributions. > > Surely distros also want one crashkernel* string they can use on all platforms without > having to detect the kernel version, platform or changeable memory layout... > > > > OTOH, I thought if we reserve high memory then the low memory should be > > needed. There might be some exceptions, but I do not know the exact > > one, > > > can we make the behavior same, and special case those systems which > > do not need low memory reservation. > > Its tricky to work out which systems are the 'normal' ones. > > We don't have a fixed memory layout for arm64. Some systems have no memory below 4G. > Others have no memory above 4G. > > Chen Zhou's machine has some memory below 4G, but its too precious to reserve a large > chunk for kdump. Without any memory below 4G some of the drivers won't work. > > I don't see what distros can set as their default for all platforms if high/low are > mutually exclusive with the 'crashkernel=' in use today. How did x86 navigate this, ... or > was it so long ago? It is very rare for such machine without any low memory in X86, at least from what I know, so the current way just works fine. Since arm64 is quite different, I would agree with current way proposed in the patch, but a question is, for those arm64 systems how can admin know if low crashkernel memory is needed or not? and just skip the low reservation for machine with high memory installed only? > > No one else has reported a problem with the existing placement logic, hence treating this > 'low' thing as the 'in addition' special case. > > > >> previous discusses: > >> https://lkml.org/lkml/2019/6/5/670 > >> https://lkml.org/lkml/2019/6/13/229 > > > > Another concern from James: > > " > > With both crashk_low_res and crashk_res, we end up with two entries in /proc/iomem called > > "Crash kernel". Because its sorted by address, and kexec-tools stops searching when it > > find "Crash kernel", you are always going to get the kernel placed in the lower portion. > > " > > > > The kexec-tools code is iterating all "Crash kernel" ranges and add them > > in an array. In X86 code, it uses the higher range to locate memory. > > Then my hurried reading of what the user-space code does was wrong! > > If kexec-tools places the kernel in the low region, there may not be enough memory left > for whatever purpose it was reserved for. This was the motivation for giving it a > different name. Agreed, it is still a potential problem though. Say we have both low and high reserved. Kdump kernel boots up, the kernel and drivers, applications will use memory, I'm not sure if there is a memory allocation policy to let them all use high mem first.. Anyway that is beyond the kexec-tools and resource name. Thanks Dave
> On Jan 16, 2020, at 9:47 AM, John Donnelly <john.p.donnelly@oracle.com> wrote: > > > >> On Jan 16, 2020, at 9:17 AM, James Morse <james.morse@arm.com> wrote: >> >> Hi guys, >> >> On 28/12/2019 09:32, Dave Young wrote: >>> On 12/27/19 at 07:04pm, Chen Zhou wrote: >>>> On 2019/12/27 13:54, Dave Young wrote: >>>>> On 12/23/19 at 11:23pm, Chen Zhou wrote: >>>>>> In preparation for supporting reserve_crashkernel_low in arm64 as >>>>>> x86_64 does, move reserve_crashkernel_low() into kernel/crash_core.c. >>>>>> >>>>>> Note, in arm64, we reserve low memory if and only if crashkernel=X,low >>>>>> is specified. Different with x86_64, don't set low memory automatically. >>>>> >>>>> Do you have any reason for the difference? I'd expect we have same >>>>> logic if possible and remove some of the ifdefs. >>>> >>>> In x86_64, if we reserve crashkernel above 4G, then we call reserve_crashkernel_low() >>>> to reserve low memory. >>>> >>>> In arm64, to simplify, we call reserve_crashkernel_low() at the beginning of reserve_crashkernel() >>>> and then relax the arm64_dma32_phys_limit if reserve_crashkernel_low() allocated something. >>>> In this case, if reserve crashkernel below 4G there will be 256M low memory set automatically >>>> and this needs extra considerations. >> >>> Sorry that I did not read the old thread details and thought that is >>> arch dependent. But rethink about that, it would be better that we can >>> have same semantic about crashkernel parameters across arches. If we >>> make them different then it causes confusion, especially for >>> distributions. >> >> Surely distros also want one crashkernel* string they can use on all platforms without >> having to detect the kernel version, platform or changeable memory layout... >> >> >>> OTOH, I thought if we reserve high memory then the low memory should be >>> needed. There might be some exceptions, but I do not know the exact >>> one, >> >>> can we make the behavior same, and special case those systems which >>> do not need low memory reservation. >> >> Its tricky to work out which systems are the 'normal' ones. >> >> We don't have a fixed memory layout for arm64. Some systems have no memory below 4G. >> Others have no memory above 4G. >> >> Chen Zhou's machine has some memory below 4G, but its too precious to reserve a large >> chunk for kdump. Without any memory below 4G some of the drivers won't work. >> >> I don't see what distros can set as their default for all platforms if high/low are >> mutually exclusive with the 'crashkernel=' in use today. How did x86 navigate this, ... or >> was it so long ago? >> >> No one else has reported a problem with the existing placement logic, hence treating this >> 'low' thing as the 'in addition' special case. > > > Hi, > > I am seeing similar Arm crash dump issues on 5.4 kernels where we need rather large amount of crashkernel memory reserved that is not available below 4GB ( The maximum reserved size appears to be around 768M ) . When I pick memory range higher than 4GB , I see adapters that fail to initialize : > > > There is no low-memory <4G memory for DMA ; > > [ 11.506792] kworker/0:14: page allocation failure: order:0, > mode:0x104(GFP_DMA32|__GFP_ZERO), nodemask=(null),cpuset=/,mems_allowed=0 > [ 11.518793] CPU: 0 PID: 150 Comm: kworker/0:14 Not tainted > 5.4.0-1948.3.el8uek.aarch64 #1 > [ 11.526955] Hardware name: To be filled by O.E.M. Saber/Saber, BIOS > 0ACKL025 01/18/2019 > [ 11.534948] Workqueue: events work_for_cpu_fn > [ 11.539291] Call trace: > [ 11.541727] dump_backtrace+0x0/0x18c > [ 11.545376] show_stack+0x24/0x30 > [ 11.548679] dump_stack+0xbc/0xe0 > [ 11.551982] warn_alloc+0xf0/0x15c > [ 11.555370] __alloc_pages_slowpath+0xb4c/0xb84 > [ 11.559887] __alloc_pages_nodemask+0x2d0/0x330 > [ 11.564405] alloc_pages_current+0x8c/0xf8 > [ 11.568496] ttm_bo_device_init+0x188/0x220 [ttm] > [ 11.573187] drm_vram_mm_init+0x58/0x80 [drm_vram_helper] > [ 11.578572] drm_vram_helper_alloc_mm+0x64/0xb0 [drm_vram_helper] > [ 11.584655] ast_mm_init+0x38/0x80 [ast] > [ 11.588566] ast_driver_load+0x474/0xa70 [ast] > [ 11.593029] drm_dev_register+0x144/0x1c8 [drm] > [ 11.597573] drm_get_pci_dev+0xa4/0x168 [drm] > [ 11.601919] ast_pci_probe+0x8c/0x9c [ast] > [ 11.606004] local_pci_probe+0x44/0x98 > [ 11.609739] work_for_cpu_fn+0x20/0x30 > [ 11.613474] process_one_work+0x1c4/0x41c > [ 11.617470] worker_thread+0x150/0x4b0 > [ 11.621206] kthread+0x110/0x114 > [ 11.624422] ret_from_fork+0x10/0x18 > > This failure is related to a graphics adapter. > > The more complex kdump configurations that use networking stack to NFS mount a filesystem to dump to , or use ssh to copy to another machine, require more crashkernel memory reservations than perhaps the “default*” settings of a minimal kdump that creates a minimal vmcore to local storage in /var/crash. If crashkernel is too small I get Out of Memory issues and the entire vmcore process fails. > > ( *default kdump setting I assume are a minimal vmcore to /var/crash using primary boot device where /root is located ) > Hi Chen, I was able to unit test these series of kernel patches applied to a 5.4.17 test kernel along with the kexec CLI change : 0001-arm64-kdump-add-another-DT-property-to-crash-dump-ke.patch Applied to : kexec-tools-2.0.19-12.0.4.el8.src.rpm And obtained a vmcore using this cmdline : BOOT_IMAGE=(hd6,gpt2)/vmlinuz-5.4.17-4-uek6m_ol8-jpdonnel+ root=/dev/mapper/ol01-root ro crashkernel=2048M@35G crashkernel=250M,low rd.lvm.lv=ol01/root rd.lvm.lv=ol01/swap console=ttyS4 loglevel=7 Can you add : Tested-by: John Donnelly <John.p.donnelly@oracle.com> How can we get these changes included into an rc kernel release ? Thanks, John. > > > >> >> >>>> previous discusses: >>>> https://urldefense.proofpoint.com/v2/url?u=https-3A__lkml.org_lkml_2019_6_5_670&d=DwICAg&c=RoP1YumCXCgaWHvlZYR8PZh8Bv7qIrMUB65eapI_JnE&r=t2fPg9D87F7D8jm0_3CG9yoiIKdRg4qc_thBw4bzMhc&m=jOAu1DTDpohsWszalfTCYx46eGF19TSWVLchN5yBPgk&s=gS9BLOkmj78lP5L7SP6_VLHwvP249uWKaE2R7N7sxgM&e= >>>> https://urldefense.proofpoint.com/v2/url?u=https-3A__lkml.org_lkml_2019_6_13_229&d=DwICAg&c=RoP1YumCXCgaWHvlZYR8PZh8Bv7qIrMUB65eapI_JnE&r=t2fPg9D87F7D8jm0_3CG9yoiIKdRg4qc_thBw4bzMhc&m=jOAu1DTDpohsWszalfTCYx46eGF19TSWVLchN5yBPgk&s=U1Nis29n3A7XSBzED53fiE4MDAv5NlxYp1UorvvBOOw&e= >>> >>> Another concern from James: >>> " >>> With both crashk_low_res and crashk_res, we end up with two entries in /proc/iomem called >>> "Crash kernel". Because its sorted by address, and kexec-tools stops searching when it >>> find "Crash kernel", you are always going to get the kernel placed in the lower portion. >>> " >>> >>> The kexec-tools code is iterating all "Crash kernel" ranges and add them >>> in an array. In X86 code, it uses the higher range to locate memory. >> >> Then my hurried reading of what the user-space code does was wrong! >> >> If kexec-tools places the kernel in the low region, there may not be enough memory left >> for whatever purpose it was reserved for. This was the motivation for giving it a >> different name. >> >> >> Thanks, >> >> James >> >> _______________________________________________ >> kexec mailing list >> kexec@lists.infradead.org >> https://urldefense.proofpoint.com/v2/url?u=http-3A__lists.infradead.org_mailman_listinfo_kexec&d=DwICAg&c=RoP1YumCXCgaWHvlZYR8PZh8Bv7qIrMUB65eapI_JnE&r=t2fPg9D87F7D8jm0_3CG9yoiIKdRg4qc_thBw4bzMhc&m=jOAu1DTDpohsWszalfTCYx46eGF19TSWVLchN5yBPgk&s=bqp02iQDP_Ez-XvLIvj-IPHqbbZwMPlDgmEcG8vhXFE&e= > > > _______________________________________________ > kexec mailing list > kexec@lists.infradead.org > https://urldefense.proofpoint.com/v2/url?u=http-3A__lists.infradead.org_mailman_listinfo_kexec&d=DwIGaQ&c=RoP1YumCXCgaWHvlZYR8PZh8Bv7qIrMUB65eapI_JnE&r=t2fPg9D87F7D8jm0_3CG9yoiIKdRg4qc_thBw4bzMhc&m=whm9_BOrgAjJvBn0Ey_brHhFg2YMU_P0HF02dhgdgwU&s=vLar_m5JbicYwwuo6N84ZiBDGZUPM8bBLSPLQBtPZNY&e=
On 2020/2/24 23:25, John Donnelly wrote: > > >> On Jan 16, 2020, at 9:47 AM, John Donnelly <john.p.donnelly@oracle.com> wrote: >> >> >> >>> On Jan 16, 2020, at 9:17 AM, James Morse <james.morse@arm.com> wrote: >>> >>> Hi guys, >>> >>> On 28/12/2019 09:32, Dave Young wrote: >>>> On 12/27/19 at 07:04pm, Chen Zhou wrote: >>>>> On 2019/12/27 13:54, Dave Young wrote: >>>>>> On 12/23/19 at 11:23pm, Chen Zhou wrote: >>>>>>> In preparation for supporting reserve_crashkernel_low in arm64 as >>>>>>> x86_64 does, move reserve_crashkernel_low() into kernel/crash_core.c. >>>>>>> >>>>>>> Note, in arm64, we reserve low memory if and only if crashkernel=X,low >>>>>>> is specified. Different with x86_64, don't set low memory automatically. >>>>>> >>>>>> Do you have any reason for the difference? I'd expect we have same >>>>>> logic if possible and remove some of the ifdefs. >>>>> >>>>> In x86_64, if we reserve crashkernel above 4G, then we call reserve_crashkernel_low() >>>>> to reserve low memory. >>>>> >>>>> In arm64, to simplify, we call reserve_crashkernel_low() at the beginning of reserve_crashkernel() >>>>> and then relax the arm64_dma32_phys_limit if reserve_crashkernel_low() allocated something. >>>>> In this case, if reserve crashkernel below 4G there will be 256M low memory set automatically >>>>> and this needs extra considerations. >>> >>>> Sorry that I did not read the old thread details and thought that is >>>> arch dependent. But rethink about that, it would be better that we can >>>> have same semantic about crashkernel parameters across arches. If we >>>> make them different then it causes confusion, especially for >>>> distributions. >>> >>> Surely distros also want one crashkernel* string they can use on all platforms without >>> having to detect the kernel version, platform or changeable memory layout... >>> >>> >>>> OTOH, I thought if we reserve high memory then the low memory should be >>>> needed. There might be some exceptions, but I do not know the exact >>>> one, >>> >>>> can we make the behavior same, and special case those systems which >>>> do not need low memory reservation. >>> >>> Its tricky to work out which systems are the 'normal' ones. >>> >>> We don't have a fixed memory layout for arm64. Some systems have no memory below 4G. >>> Others have no memory above 4G. >>> >>> Chen Zhou's machine has some memory below 4G, but its too precious to reserve a large >>> chunk for kdump. Without any memory below 4G some of the drivers won't work. >>> >>> I don't see what distros can set as their default for all platforms if high/low are >>> mutually exclusive with the 'crashkernel=' in use today. How did x86 navigate this, ... or >>> was it so long ago? >>> >>> No one else has reported a problem with the existing placement logic, hence treating this >>> 'low' thing as the 'in addition' special case. >> >> >> Hi, >> >> I am seeing similar Arm crash dump issues on 5.4 kernels where we need rather large amount of crashkernel memory reserved that is not available below 4GB ( The maximum reserved size appears to be around 768M ) . When I pick memory range higher than 4GB , I see adapters that fail to initialize : >> >> >> There is no low-memory <4G memory for DMA ; >> >> [ 11.506792] kworker/0:14: page allocation failure: order:0, >> mode:0x104(GFP_DMA32|__GFP_ZERO), nodemask=(null),cpuset=/,mems_allowed=0 >> [ 11.518793] CPU: 0 PID: 150 Comm: kworker/0:14 Not tainted >> 5.4.0-1948.3.el8uek.aarch64 #1 >> [ 11.526955] Hardware name: To be filled by O.E.M. Saber/Saber, BIOS >> 0ACKL025 01/18/2019 >> [ 11.534948] Workqueue: events work_for_cpu_fn >> [ 11.539291] Call trace: >> [ 11.541727] dump_backtrace+0x0/0x18c >> [ 11.545376] show_stack+0x24/0x30 >> [ 11.548679] dump_stack+0xbc/0xe0 >> [ 11.551982] warn_alloc+0xf0/0x15c >> [ 11.555370] __alloc_pages_slowpath+0xb4c/0xb84 >> [ 11.559887] __alloc_pages_nodemask+0x2d0/0x330 >> [ 11.564405] alloc_pages_current+0x8c/0xf8 >> [ 11.568496] ttm_bo_device_init+0x188/0x220 [ttm] >> [ 11.573187] drm_vram_mm_init+0x58/0x80 [drm_vram_helper] >> [ 11.578572] drm_vram_helper_alloc_mm+0x64/0xb0 [drm_vram_helper] >> [ 11.584655] ast_mm_init+0x38/0x80 [ast] >> [ 11.588566] ast_driver_load+0x474/0xa70 [ast] >> [ 11.593029] drm_dev_register+0x144/0x1c8 [drm] >> [ 11.597573] drm_get_pci_dev+0xa4/0x168 [drm] >> [ 11.601919] ast_pci_probe+0x8c/0x9c [ast] >> [ 11.606004] local_pci_probe+0x44/0x98 >> [ 11.609739] work_for_cpu_fn+0x20/0x30 >> [ 11.613474] process_one_work+0x1c4/0x41c >> [ 11.617470] worker_thread+0x150/0x4b0 >> [ 11.621206] kthread+0x110/0x114 >> [ 11.624422] ret_from_fork+0x10/0x18 >> >> This failure is related to a graphics adapter. >> >> The more complex kdump configurations that use networking stack to NFS mount a filesystem to dump to , or use ssh to copy to another machine, require more crashkernel memory reservations than perhaps the “default*” settings of a minimal kdump that creates a minimal vmcore to local storage in /var/crash. If crashkernel is too small I get Out of Memory issues and the entire vmcore process fails. >> >> ( *default kdump setting I assume are a minimal vmcore to /var/crash using primary boot device where /root is located ) >> > Hi Chen, > > > I was able to unit test these series of kernel patches applied to a 5.4.17 test kernel along with the kexec CLI change : > > 0001-arm64-kdump-add-another-DT-property-to-crash-dump-ke.patch > > Applied to : > > kexec-tools-2.0.19-12.0.4.el8.src.rpm > > And obtained a vmcore using this cmdline : > > BOOT_IMAGE=(hd6,gpt2)/vmlinuz-5.4.17-4-uek6m_ol8-jpdonnel+ root=/dev/mapper/ol01-root ro crashkernel=2048M@35G crashkernel=250M,low rd.lvm.lv=ol01/root rd.lvm.lv=ol01/swap console=ttyS4 loglevel=7 > > Can you add : > > Tested-by: John Donnelly <John.p.donnelly@oracle.com> > > > How can we get these changes included into an rc kernel release ? > > Thanks, > > John. Hi all, Friendly ping... > > >> >> >> >>> >>> >>>>> previous discusses: >>>>> https://urldefense.proofpoint.com/v2/url?u=https-3A__lkml.org_lkml_2019_6_5_670&d=DwICAg&c=RoP1YumCXCgaWHvlZYR8PZh8Bv7qIrMUB65eapI_JnE&r=t2fPg9D87F7D8jm0_3CG9yoiIKdRg4qc_thBw4bzMhc&m=jOAu1DTDpohsWszalfTCYx46eGF19TSWVLchN5yBPgk&s=gS9BLOkmj78lP5L7SP6_VLHwvP249uWKaE2R7N7sxgM&e= >>>>> https://urldefense.proofpoint.com/v2/url?u=https-3A__lkml.org_lkml_2019_6_13_229&d=DwICAg&c=RoP1YumCXCgaWHvlZYR8PZh8Bv7qIrMUB65eapI_JnE&r=t2fPg9D87F7D8jm0_3CG9yoiIKdRg4qc_thBw4bzMhc&m=jOAu1DTDpohsWszalfTCYx46eGF19TSWVLchN5yBPgk&s=U1Nis29n3A7XSBzED53fiE4MDAv5NlxYp1UorvvBOOw&e= >>>> >>>> Another concern from James: >>>> " >>>> With both crashk_low_res and crashk_res, we end up with two entries in /proc/iomem called >>>> "Crash kernel". Because its sorted by address, and kexec-tools stops searching when it >>>> find "Crash kernel", you are always going to get the kernel placed in the lower portion. >>>> " >>>> >>>> The kexec-tools code is iterating all "Crash kernel" ranges and add them >>>> in an array. In X86 code, it uses the higher range to locate memory. >>> >>> Then my hurried reading of what the user-space code does was wrong! >>> >>> If kexec-tools places the kernel in the low region, there may not be enough memory left >>> for whatever purpose it was reserved for. This was the motivation for giving it a >>> different name. >>> >>> >>> Thanks, >>> >>> James >>> >>> _______________________________________________ >>> kexec mailing list >>> kexec@lists.infradead.org >>> https://urldefense.proofpoint.com/v2/url?u=http-3A__lists.infradead.org_mailman_listinfo_kexec&d=DwICAg&c=RoP1YumCXCgaWHvlZYR8PZh8Bv7qIrMUB65eapI_JnE&r=t2fPg9D87F7D8jm0_3CG9yoiIKdRg4qc_thBw4bzMhc&m=jOAu1DTDpohsWszalfTCYx46eGF19TSWVLchN5yBPgk&s=bqp02iQDP_Ez-XvLIvj-IPHqbbZwMPlDgmEcG8vhXFE&e= >> >> >> _______________________________________________ >> kexec mailing list >> kexec@lists.infradead.org >> https://urldefense.proofpoint.com/v2/url?u=http-3A__lists.infradead.org_mailman_listinfo_kexec&d=DwIGaQ&c=RoP1YumCXCgaWHvlZYR8PZh8Bv7qIrMUB65eapI_JnE&r=t2fPg9D87F7D8jm0_3CG9yoiIKdRg4qc_thBw4bzMhc&m=whm9_BOrgAjJvBn0Ey_brHhFg2YMU_P0HF02dhgdgwU&s=vLar_m5JbicYwwuo6N84ZiBDGZUPM8bBLSPLQBtPZNY&e= > > > . >
Hi Dave, On 2019/12/31 9:39, Chen Zhou wrote: > Hi Dave, > > On 2019/12/28 17:32, Dave Young wrote: >> On 12/27/19 at 07:04pm, Chen Zhou wrote: >>> Hi Dave >>> >>> On 2019/12/27 13:54, Dave Young wrote: >>>> Hi, >>>> On 12/23/19 at 11:23pm, Chen Zhou wrote: >>>>> In preparation for supporting reserve_crashkernel_low in arm64 as >>>>> x86_64 does, move reserve_crashkernel_low() into kernel/crash_core.c. >>>>> >>>>> Note, in arm64, we reserve low memory if and only if crashkernel=X,low >>>>> is specified. Different with x86_64, don't set low memory automatically. >>>> >>>> Do you have any reason for the difference? I'd expect we have same >>>> logic if possible and remove some of the ifdefs. >>> >>> In x86_64, if we reserve crashkernel above 4G, then we call reserve_crashkernel_low() >>> to reserve low memory. >>> >>> In arm64, to simplify, we call reserve_crashkernel_low() at the beginning of reserve_crashkernel() >>> and then relax the arm64_dma32_phys_limit if reserve_crashkernel_low() allocated something. >>> In this case, if reserve crashkernel below 4G there will be 256M low memory set automatically >>> and this needs extra considerations. >> >> Sorry that I did not read the old thread details and thought that is >> arch dependent. But rethink about that, it would be better that we can >> have same semantic about crashkernel parameters across arches. If we >> make them different then it causes confusion, especially for >> distributions. >> >> OTOH, I thought if we reserve high memory then the low memory should be >> needed. There might be some exceptions, but I do not know the exact >> one, can we make the behavior same, and special case those systems which >> do not need low memory reservation. >> > I thought like this and did implement with crashkernel parameters arch independent. > This is my v4: https://lkml.org/lkml/2019/5/6/1361, i implemented according to x86_64's > behavior. > >>> >>> previous discusses: >>> https://lkml.org/lkml/2019/6/5/670 >>> https://lkml.org/lkml/2019/6/13/229 >> >> Another concern from James: >> " >> With both crashk_low_res and crashk_res, we end up with two entries in /proc/iomem called >> "Crash kernel". Because its sorted by address, and kexec-tools stops searching when it >> find "Crash kernel", you are always going to get the kernel placed in the lower portion. >> " >> >> The kexec-tools code is iterating all "Crash kernel" ranges and add them >> in an array. In X86 code, it uses the higher range to locate memory. > > We also discussed about this: https://lkml.org/lkml/2019/6/13/227. > I guess James's opinion is that kexec-tools should take forward compatibility into account. > "But we can't rely on people updating user-space when they update the kernel!" -- James > >> >>> >>>> >>>>> >>>>> Reported-by: kbuild test robot <lkp@intel.com> >>>>> Signed-off-by: Chen Zhou <chenzhou10@huawei.com> >>>>> --- >>>>> arch/x86/kernel/setup.c | 62 ++++----------------------------- >>>>> include/linux/crash_core.h | 3 ++ >>>>> include/linux/kexec.h | 2 -- >>>>> kernel/crash_core.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++ >>>>> kernel/kexec_core.c | 17 --------- >>>>> 5 files changed, 96 insertions(+), 75 deletions(-) >>>>> >>>>> diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c >>>>> index cedfe20..5f38942 100644 >>>>> --- a/arch/x86/kernel/setup.c >>>>> +++ b/arch/x86/kernel/setup.c >>>>> @@ -486,59 +486,6 @@ static void __init memblock_x86_reserve_range_setup_data(void) >>>>> # define CRASH_ADDR_HIGH_MAX SZ_64T >>>>> #endif >>>>> >>>>> -static int __init reserve_crashkernel_low(void) >>>>> -{ >>>>> -#ifdef CONFIG_X86_64 >>>>> - unsigned long long base, low_base = 0, low_size = 0; >>>>> - unsigned long total_low_mem; >>>>> - int ret; >>>>> - >>>>> - total_low_mem = memblock_mem_size(1UL << (32 - PAGE_SHIFT)); >>>>> - >>>>> - /* crashkernel=Y,low */ >>>>> - ret = parse_crashkernel_low(boot_command_line, total_low_mem, &low_size, &base); >>>>> - if (ret) { >>>>> - /* >>>>> - * two parts from kernel/dma/swiotlb.c: >>>>> - * -swiotlb size: user-specified with swiotlb= or default. >>>>> - * >>>>> - * -swiotlb overflow buffer: now hardcoded to 32k. We round it >>>>> - * to 8M for other buffers that may need to stay low too. Also >>>>> - * make sure we allocate enough extra low memory so that we >>>>> - * don't run out of DMA buffers for 32-bit devices. >>>>> - */ >>>>> - low_size = max(swiotlb_size_or_default() + (8UL << 20), 256UL << 20); >>>>> - } else { >>>>> - /* passed with crashkernel=0,low ? */ >>>>> - if (!low_size) >>>>> - return 0; >>>>> - } >>>>> - >>>>> - low_base = memblock_find_in_range(0, 1ULL << 32, low_size, CRASH_ALIGN); >>>>> - if (!low_base) { >>>>> - pr_err("Cannot reserve %ldMB crashkernel low memory, please try smaller size.\n", >>>>> - (unsigned long)(low_size >> 20)); >>>>> - return -ENOMEM; >>>>> - } >>>>> - >>>>> - ret = memblock_reserve(low_base, low_size); >>>>> - if (ret) { >>>>> - pr_err("%s: Error reserving crashkernel low memblock.\n", __func__); >>>>> - return ret; >>>>> - } >>>>> - >>>>> - pr_info("Reserving %ldMB of low memory at %ldMB for crashkernel (System low RAM: %ldMB)\n", >>>>> - (unsigned long)(low_size >> 20), >>>>> - (unsigned long)(low_base >> 20), >>>>> - (unsigned long)(total_low_mem >> 20)); >>>>> - >>>>> - crashk_low_res.start = low_base; >>>>> - crashk_low_res.end = low_base + low_size - 1; >>>>> - insert_resource(&iomem_resource, &crashk_low_res); >>>>> -#endif >>>>> - return 0; >>>>> -} >>>>> - >>>>> static void __init reserve_crashkernel(void) >>>>> { >>>>> unsigned long long crash_size, crash_base, total_mem; >>>>> @@ -602,9 +549,12 @@ static void __init reserve_crashkernel(void) >>>>> return; >>>>> } >>>>> >>>>> - if (crash_base >= (1ULL << 32) && reserve_crashkernel_low()) { >>>>> - memblock_free(crash_base, crash_size); >>>>> - return; >>>>> + if (crash_base >= (1ULL << 32)) { >>>>> + if (reserve_crashkernel_low()) { >>>>> + memblock_free(crash_base, crash_size); >>>>> + return; >>>>> + } >>>>> + insert_resource(&iomem_resource, &crashk_low_res); >>>> >>>> Some specific reason to move insert_resouce out of the >>>> reserve_crashkernel_low function? >>> >>> No specific reason. >>> I just exposed arm64 "Crash kernel low" in request_standard_resources() as other resources, >>> so did this change. >> >> Ok. >> >>> >>>> >>>>> } >>>>> >>>>> pr_info("Reserving %ldMB of memory at %ldMB for crashkernel (System RAM: %ldMB)\n", >>>>> diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h >>>>> index 525510a..4df8c0b 100644 >>>>> --- a/include/linux/crash_core.h >>>>> +++ b/include/linux/crash_core.h >>>>> @@ -63,6 +63,8 @@ phys_addr_t paddr_vmcoreinfo_note(void); >>>>> extern unsigned char *vmcoreinfo_data; >>>>> extern size_t vmcoreinfo_size; >>>>> extern u32 *vmcoreinfo_note; >>>>> +extern struct resource crashk_res; >>>>> +extern struct resource crashk_low_res; >>>>> >>>>> Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, >>>>> void *data, size_t data_len); >>>>> @@ -74,5 +76,6 @@ int parse_crashkernel_high(char *cmdline, unsigned long long system_ram, >>>>> unsigned long long *crash_size, unsigned long long *crash_base); >>>>> int parse_crashkernel_low(char *cmdline, unsigned long long system_ram, >>>>> unsigned long long *crash_size, unsigned long long *crash_base); >>>>> +int __init reserve_crashkernel_low(void); >>>>> >>>>> #endif /* LINUX_CRASH_CORE_H */ >>>>> diff --git a/include/linux/kexec.h b/include/linux/kexec.h >>>>> index 1776eb2..5d5d963 100644 >>>>> --- a/include/linux/kexec.h >>>>> +++ b/include/linux/kexec.h >>>>> @@ -330,8 +330,6 @@ extern int kexec_load_disabled; >>>>> >>>>> /* Location of a reserved region to hold the crash kernel. >>>>> */ >>>>> -extern struct resource crashk_res; >>>>> -extern struct resource crashk_low_res; >>>>> extern note_buf_t __percpu *crash_notes; >>>>> >>>>> /* flag to track if kexec reboot is in progress */ >>>>> diff --git a/kernel/crash_core.c b/kernel/crash_core.c >>>>> index 9f1557b..eb72fd6 100644 >>>>> --- a/kernel/crash_core.c >>>>> +++ b/kernel/crash_core.c >>>>> @@ -7,6 +7,8 @@ >>>>> #include <linux/crash_core.h> >>>>> #include <linux/utsname.h> >>>>> #include <linux/vmalloc.h> >>>>> +#include <linux/memblock.h> >>>>> +#include <linux/swiotlb.h> >>>>> >>>>> #include <asm/page.h> >>>>> #include <asm/sections.h> >>>>> @@ -19,6 +21,22 @@ u32 *vmcoreinfo_note; >>>>> /* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */ >>>>> static unsigned char *vmcoreinfo_data_safecopy; >>>>> >>>>> +/* Location of the reserved area for the crash kernel */ >>>>> +struct resource crashk_res = { >>>>> + .name = "Crash kernel", >>>>> + .start = 0, >>>>> + .end = 0, >>>>> + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, >>>>> + .desc = IORES_DESC_CRASH_KERNEL >>>>> +}; >>>>> +struct resource crashk_low_res = { >>>>> + .name = "Crash kernel", >>>>> + .start = 0, >>>>> + .end = 0, >>>>> + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, >>>>> + .desc = IORES_DESC_CRASH_KERNEL >>>>> +}; >>>>> + >>>>> /* >>>>> * parsing the "crashkernel" commandline >>>>> * >>>>> @@ -292,6 +310,75 @@ int __init parse_crashkernel_low(char *cmdline, >>>>> "crashkernel=", suffix_tbl[SUFFIX_LOW]); >>>>> } >>>>> >>>>> +#if defined(CONFIG_X86_64) >>>>> +#define CRASH_ALIGN SZ_16M >>>>> +#elif defined(CONFIG_ARM64) >>>>> +#define CRASH_ALIGN SZ_2M >>>>> +#endif >>>> >>>> I think no need to have the #ifdef, although I can not think out of >>>> reason we have 16M for X86, maybe move it to 2M as well if no other >>>> objections. Then it will be easier to reserve crashkernel successfully >>>> considering nowadays we have KASLR and other stuff it becomes harder. >>> >>> I also don't figure out why it is 16M in x86. >> >> IMHO, if we do not know why and in theory it should work with 2M, can >> you do some basic testing and move it to 2M? >> >> We can easily move back to 16M if someone really report something, but >> if we do not change it will always stay there but we do not know why. > > Ok. I will do some test later. Recently, i tested with 2M alignment in x86 and the system works well. Besides, i found memblock_find_in_range() in reserve_crashkernel() restrict the lower bound of the range to "CRASH_ALIGN". If we can make memblock_find_in_range() search from the start of memory? The code is as follows: static void __init reserve_crashkernel(void) { ... if (!high) crash_base = memblock_find_in_range(CRASH_ALIGN, CRASH_ADDR_LOW_MAX, crash_size, CRASH_ALIGN); if (!crash_base) crash_base = memblock_find_in_range(CRASH_ALIGN, CRASH_ADDR_HIGH_MAX, crash_size, CRASH_ALIGN); Thanks, Chen Zhou > >> >>> >>>> >>>>> + >>>>> +int __init reserve_crashkernel_low(void) >>>>> +{ >>>>> +#if defined(CONFIG_X86_64) || defined(CONFIG_ARM64) >>>>> + unsigned long long base, low_base = 0, low_size = 0; >>>>> + unsigned long total_low_mem; >>>>> + int ret; >>>>> + >>>>> + total_low_mem = memblock_mem_size(1UL << (32 - PAGE_SHIFT)); >>>>> + >>>>> + /* crashkernel=Y,low */ >>>>> + ret = parse_crashkernel_low(boot_command_line, total_low_mem, &low_size, >>>>> + &base); >>>>> + if (ret) { >>>>> +#ifdef CONFIG_X86_64 >>>>> + /* >>>>> + * two parts from lib/swiotlb.c: >>>>> + * -swiotlb size: user-specified with swiotlb= or default. >>>>> + * >>>>> + * -swiotlb overflow buffer: now hardcoded to 32k. We round it >>>>> + * to 8M for other buffers that may need to stay low too. Also >>>>> + * make sure we allocate enough extra low memory so that we >>>>> + * don't run out of DMA buffers for 32-bit devices. >>>>> + */ >>>>> + low_size = max(swiotlb_size_or_default() + (8UL << 20), >>>>> + 256UL << 20); >>>>> +#else >>>>> + /* >>>>> + * in arm64, reserve low memory if and only if crashkernel=X,low >>>>> + * specified. >>>>> + */ >>>>> + return -EINVAL; >>>>> +#endif >>>> >>>> As said before, can you explore about why it needs different logic, it >>>> would be good to keep two arches same. >>>> >>>>> + } else { >>>>> + /* passed with crashkernel=0,low ? */ >>>>> + if (!low_size) >>>>> + return 0; >>>>> + } >>>>> + >>>>> + low_base = memblock_find_in_range(0, 1ULL << 32, low_size, CRASH_ALIGN); >>>>> + if (!low_base) { >>>>> + pr_err("Cannot reserve %ldMB crashkernel low memory, please try smaller size.\n", >>>>> + (unsigned long)(low_size >> 20)); >>>>> + return -ENOMEM; >>>>> + } >>>>> + >>>>> + ret = memblock_reserve(low_base, low_size); >>>>> + if (ret) { >>>>> + pr_err("%s: Error reserving crashkernel low memblock.\n", >>>>> + __func__); >>>>> + return ret; >>>>> + } >>>>> + >>>>> + pr_info("Reserving %ldMB of low memory at %ldMB for crashkernel (System low RAM: %ldMB)\n", >>>>> + (unsigned long)(low_size >> 20), >>>>> + (unsigned long)(low_base >> 20), >>>>> + (unsigned long)(total_low_mem >> 20)); >>>>> + >>>>> + crashk_low_res.start = low_base; >>>>> + crashk_low_res.end = low_base + low_size - 1; >>>>> +#endif >>>>> + return 0; >>>>> +} >>>>> + >>>>> Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, >>>>> void *data, size_t data_len) >>>>> { >>>>> diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c >>>>> index 15d70a9..458d093 100644 >>>>> --- a/kernel/kexec_core.c >>>>> +++ b/kernel/kexec_core.c >>>>> @@ -53,23 +53,6 @@ note_buf_t __percpu *crash_notes; >>>>> /* Flag to indicate we are going to kexec a new kernel */ >>>>> bool kexec_in_progress = false; >>>>> >>>>> - >>>>> -/* Location of the reserved area for the crash kernel */ >>>>> -struct resource crashk_res = { >>>>> - .name = "Crash kernel", >>>>> - .start = 0, >>>>> - .end = 0, >>>>> - .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, >>>>> - .desc = IORES_DESC_CRASH_KERNEL >>>>> -}; >>>>> -struct resource crashk_low_res = { >>>>> - .name = "Crash kernel", >>>>> - .start = 0, >>>>> - .end = 0, >>>>> - .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, >>>>> - .desc = IORES_DESC_CRASH_KERNEL >>>>> -}; >>>>> - >>>>> int kexec_should_crash(struct task_struct *p) >>>>> { >>>>> /* >>>>> -- >>>>> 2.7.4 >>>>> >>>> >>>> Thanks >>>> Dave >>>> >>>> >>>> . >>>> >>> Thanks, >>> Chen Zhou >>> >> >> Thanks >> Dave >> >> > > Thanks, > Chen Zhou >
Hi Dave/James, On 2020/1/17 11:58, Dave Young wrote: > On 01/16/20 at 03:17pm, James Morse wrote: >> Hi guys, >> >> On 28/12/2019 09:32, Dave Young wrote: >>> On 12/27/19 at 07:04pm, Chen Zhou wrote: >>>> On 2019/12/27 13:54, Dave Young wrote: >>>>> On 12/23/19 at 11:23pm, Chen Zhou wrote: >>>>>> In preparation for supporting reserve_crashkernel_low in arm64 as >>>>>> x86_64 does, move reserve_crashkernel_low() into kernel/crash_core.c. >>>>>> >>>>>> Note, in arm64, we reserve low memory if and only if crashkernel=X,low >>>>>> is specified. Different with x86_64, don't set low memory automatically. >>>>> >>>>> Do you have any reason for the difference? I'd expect we have same >>>>> logic if possible and remove some of the ifdefs. >>>> >>>> In x86_64, if we reserve crashkernel above 4G, then we call reserve_crashkernel_low() >>>> to reserve low memory. >>>> >>>> In arm64, to simplify, we call reserve_crashkernel_low() at the beginning of reserve_crashkernel() >>>> and then relax the arm64_dma32_phys_limit if reserve_crashkernel_low() allocated something. >>>> In this case, if reserve crashkernel below 4G there will be 256M low memory set automatically >>>> and this needs extra considerations. >> >>> Sorry that I did not read the old thread details and thought that is >>> arch dependent. But rethink about that, it would be better that we can >>> have same semantic about crashkernel parameters across arches. If we >>> make them different then it causes confusion, especially for >>> distributions. >> >> Surely distros also want one crashkernel* string they can use on all platforms without >> having to detect the kernel version, platform or changeable memory layout... >> >> >>> OTOH, I thought if we reserve high memory then the low memory should be >>> needed. There might be some exceptions, but I do not know the exact >>> one, >> >>> can we make the behavior same, and special case those systems which >>> do not need low memory reservation. >> >> Its tricky to work out which systems are the 'normal' ones. >> >> We don't have a fixed memory layout for arm64. Some systems have no memory below 4G. >> Others have no memory above 4G. >> >> Chen Zhou's machine has some memory below 4G, but its too precious to reserve a large >> chunk for kdump. Without any memory below 4G some of the drivers won't work. >> >> I don't see what distros can set as their default for all platforms if high/low are >> mutually exclusive with the 'crashkernel=' in use today. How did x86 navigate this, ... or >> was it so long ago? > > It is very rare for such machine without any low memory in X86, at least > from what I know, so the current way just works fine. > > Since arm64 is quite different, I would agree with current way > proposed in the patch, but a question is, for those arm64 systems how can > admin know if low crashkernel memory is needed or not? and just skip the > low reservation for machine with high memory installed only? Specified size low memory is for crash dump kernel devices. I think admin should know if there are devices needing low memory in crash dump kernel. James, any suggestions? Thanks, Chen Zhou > >> >> No one else has reported a problem with the existing placement logic, hence treating this >> 'low' thing as the 'in addition' special case. >> >> >>>> previous discusses: >>>> https://lkml.org/lkml/2019/6/5/670 >>>> https://lkml.org/lkml/2019/6/13/229 >>> >>> Another concern from James: >>> " >>> With both crashk_low_res and crashk_res, we end up with two entries in /proc/iomem called >>> "Crash kernel". Because its sorted by address, and kexec-tools stops searching when it >>> find "Crash kernel", you are always going to get the kernel placed in the lower portion. >>> " >>> >>> The kexec-tools code is iterating all "Crash kernel" ranges and add them >>> in an array. In X86 code, it uses the higher range to locate memory. >> >> Then my hurried reading of what the user-space code does was wrong! >> >> If kexec-tools places the kernel in the low region, there may not be enough memory left >> for whatever purpose it was reserved for. This was the motivation for giving it a >> different name. > > Agreed, it is still a potential problem though. Say we have both low > and high reserved. Kdump kernel boots up, the kernel and drivers, > applications will use memory, I'm not sure if there is a memory > allocation policy to let them all use high mem first.. Anyway that is > beyond the kexec-tools and resource name. > > Thanks > Dave > > > . >
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index cedfe20..5f38942 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -486,59 +486,6 @@ static void __init memblock_x86_reserve_range_setup_data(void) # define CRASH_ADDR_HIGH_MAX SZ_64T #endif -static int __init reserve_crashkernel_low(void) -{ -#ifdef CONFIG_X86_64 - unsigned long long base, low_base = 0, low_size = 0; - unsigned long total_low_mem; - int ret; - - total_low_mem = memblock_mem_size(1UL << (32 - PAGE_SHIFT)); - - /* crashkernel=Y,low */ - ret = parse_crashkernel_low(boot_command_line, total_low_mem, &low_size, &base); - if (ret) { - /* - * two parts from kernel/dma/swiotlb.c: - * -swiotlb size: user-specified with swiotlb= or default. - * - * -swiotlb overflow buffer: now hardcoded to 32k. We round it - * to 8M for other buffers that may need to stay low too. Also - * make sure we allocate enough extra low memory so that we - * don't run out of DMA buffers for 32-bit devices. - */ - low_size = max(swiotlb_size_or_default() + (8UL << 20), 256UL << 20); - } else { - /* passed with crashkernel=0,low ? */ - if (!low_size) - return 0; - } - - low_base = memblock_find_in_range(0, 1ULL << 32, low_size, CRASH_ALIGN); - if (!low_base) { - pr_err("Cannot reserve %ldMB crashkernel low memory, please try smaller size.\n", - (unsigned long)(low_size >> 20)); - return -ENOMEM; - } - - ret = memblock_reserve(low_base, low_size); - if (ret) { - pr_err("%s: Error reserving crashkernel low memblock.\n", __func__); - return ret; - } - - pr_info("Reserving %ldMB of low memory at %ldMB for crashkernel (System low RAM: %ldMB)\n", - (unsigned long)(low_size >> 20), - (unsigned long)(low_base >> 20), - (unsigned long)(total_low_mem >> 20)); - - crashk_low_res.start = low_base; - crashk_low_res.end = low_base + low_size - 1; - insert_resource(&iomem_resource, &crashk_low_res); -#endif - return 0; -} - static void __init reserve_crashkernel(void) { unsigned long long crash_size, crash_base, total_mem; @@ -602,9 +549,12 @@ static void __init reserve_crashkernel(void) return; } - if (crash_base >= (1ULL << 32) && reserve_crashkernel_low()) { - memblock_free(crash_base, crash_size); - return; + if (crash_base >= (1ULL << 32)) { + if (reserve_crashkernel_low()) { + memblock_free(crash_base, crash_size); + return; + } + insert_resource(&iomem_resource, &crashk_low_res); } pr_info("Reserving %ldMB of memory at %ldMB for crashkernel (System RAM: %ldMB)\n", diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index 525510a..4df8c0b 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -63,6 +63,8 @@ phys_addr_t paddr_vmcoreinfo_note(void); extern unsigned char *vmcoreinfo_data; extern size_t vmcoreinfo_size; extern u32 *vmcoreinfo_note; +extern struct resource crashk_res; +extern struct resource crashk_low_res; Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, void *data, size_t data_len); @@ -74,5 +76,6 @@ int parse_crashkernel_high(char *cmdline, unsigned long long system_ram, unsigned long long *crash_size, unsigned long long *crash_base); int parse_crashkernel_low(char *cmdline, unsigned long long system_ram, unsigned long long *crash_size, unsigned long long *crash_base); +int __init reserve_crashkernel_low(void); #endif /* LINUX_CRASH_CORE_H */ diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 1776eb2..5d5d963 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -330,8 +330,6 @@ extern int kexec_load_disabled; /* Location of a reserved region to hold the crash kernel. */ -extern struct resource crashk_res; -extern struct resource crashk_low_res; extern note_buf_t __percpu *crash_notes; /* flag to track if kexec reboot is in progress */ diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 9f1557b..eb72fd6 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -7,6 +7,8 @@ #include <linux/crash_core.h> #include <linux/utsname.h> #include <linux/vmalloc.h> +#include <linux/memblock.h> +#include <linux/swiotlb.h> #include <asm/page.h> #include <asm/sections.h> @@ -19,6 +21,22 @@ u32 *vmcoreinfo_note; /* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */ static unsigned char *vmcoreinfo_data_safecopy; +/* Location of the reserved area for the crash kernel */ +struct resource crashk_res = { + .name = "Crash kernel", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, + .desc = IORES_DESC_CRASH_KERNEL +}; +struct resource crashk_low_res = { + .name = "Crash kernel", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, + .desc = IORES_DESC_CRASH_KERNEL +}; + /* * parsing the "crashkernel" commandline * @@ -292,6 +310,75 @@ int __init parse_crashkernel_low(char *cmdline, "crashkernel=", suffix_tbl[SUFFIX_LOW]); } +#if defined(CONFIG_X86_64) +#define CRASH_ALIGN SZ_16M +#elif defined(CONFIG_ARM64) +#define CRASH_ALIGN SZ_2M +#endif + +int __init reserve_crashkernel_low(void) +{ +#if defined(CONFIG_X86_64) || defined(CONFIG_ARM64) + unsigned long long base, low_base = 0, low_size = 0; + unsigned long total_low_mem; + int ret; + + total_low_mem = memblock_mem_size(1UL << (32 - PAGE_SHIFT)); + + /* crashkernel=Y,low */ + ret = parse_crashkernel_low(boot_command_line, total_low_mem, &low_size, + &base); + if (ret) { +#ifdef CONFIG_X86_64 + /* + * two parts from lib/swiotlb.c: + * -swiotlb size: user-specified with swiotlb= or default. + * + * -swiotlb overflow buffer: now hardcoded to 32k. We round it + * to 8M for other buffers that may need to stay low too. Also + * make sure we allocate enough extra low memory so that we + * don't run out of DMA buffers for 32-bit devices. + */ + low_size = max(swiotlb_size_or_default() + (8UL << 20), + 256UL << 20); +#else + /* + * in arm64, reserve low memory if and only if crashkernel=X,low + * specified. + */ + return -EINVAL; +#endif + } else { + /* passed with crashkernel=0,low ? */ + if (!low_size) + return 0; + } + + low_base = memblock_find_in_range(0, 1ULL << 32, low_size, CRASH_ALIGN); + if (!low_base) { + pr_err("Cannot reserve %ldMB crashkernel low memory, please try smaller size.\n", + (unsigned long)(low_size >> 20)); + return -ENOMEM; + } + + ret = memblock_reserve(low_base, low_size); + if (ret) { + pr_err("%s: Error reserving crashkernel low memblock.\n", + __func__); + return ret; + } + + pr_info("Reserving %ldMB of low memory at %ldMB for crashkernel (System low RAM: %ldMB)\n", + (unsigned long)(low_size >> 20), + (unsigned long)(low_base >> 20), + (unsigned long)(total_low_mem >> 20)); + + crashk_low_res.start = low_base; + crashk_low_res.end = low_base + low_size - 1; +#endif + return 0; +} + Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, void *data, size_t data_len) { diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 15d70a9..458d093 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -53,23 +53,6 @@ note_buf_t __percpu *crash_notes; /* Flag to indicate we are going to kexec a new kernel */ bool kexec_in_progress = false; - -/* Location of the reserved area for the crash kernel */ -struct resource crashk_res = { - .name = "Crash kernel", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, - .desc = IORES_DESC_CRASH_KERNEL -}; -struct resource crashk_low_res = { - .name = "Crash kernel", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, - .desc = IORES_DESC_CRASH_KERNEL -}; - int kexec_should_crash(struct task_struct *p) { /*
In preparation for supporting reserve_crashkernel_low in arm64 as x86_64 does, move reserve_crashkernel_low() into kernel/crash_core.c. Note, in arm64, we reserve low memory if and only if crashkernel=X,low is specified. Different with x86_64, don't set low memory automatically. Reported-by: kbuild test robot <lkp@intel.com> Signed-off-by: Chen Zhou <chenzhou10@huawei.com> --- arch/x86/kernel/setup.c | 62 ++++----------------------------- include/linux/crash_core.h | 3 ++ include/linux/kexec.h | 2 -- kernel/crash_core.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++ kernel/kexec_core.c | 17 --------- 5 files changed, 96 insertions(+), 75 deletions(-)