diff mbox

[v26,2/7] arm64: kdump: implement machine_crash_shutdown()

Message ID 20160907042908.6232-3-takahiro.akashi@linaro.org (mailing list archive)
State New, archived
Headers show

Commit Message

AKASHI Takahiro Sept. 7, 2016, 4:29 a.m. UTC
Primary kernel calls machine_crash_shutdown() to shut down non-boot cpus
and save registers' status in per-cpu ELF notes before starting crash
dump kernel. See kernel_kexec().
Even if not all secondary cpus have shut down, we do kdump anyway.

As we don't have to make non-boot(crashed) cpus offline (to preserve
correct status of cpus at crash dump) before shutting down, this patch
also adds a variant of smp_send_stop().

Signed-off-by: AKASHI Takahiro <takahiro.akashi@linaro.org>
---
 arch/arm64/include/asm/hardirq.h  |  2 +-
 arch/arm64/include/asm/kexec.h    | 41 ++++++++++++++++++++++++-
 arch/arm64/include/asm/smp.h      |  2 ++
 arch/arm64/kernel/machine_kexec.c | 56 ++++++++++++++++++++++++++++++++--
 arch/arm64/kernel/smp.c           | 63 +++++++++++++++++++++++++++++++++++++++
 5 files changed, 159 insertions(+), 5 deletions(-)

Comments

James Morse Sept. 14, 2016, 6:09 p.m. UTC | #1
Hi Akashi,

(CC: Marc who knows how this irqchip wizardry works
 Cover letter: https://www.spinics.net/lists/arm-kernel/msg529520.html )

On 07/09/16 05:29, AKASHI Takahiro wrote:
> Primary kernel calls machine_crash_shutdown() to shut down non-boot cpus
> and save registers' status in per-cpu ELF notes before starting crash
> dump kernel. See kernel_kexec().
> Even if not all secondary cpus have shut down, we do kdump anyway.
> 
> As we don't have to make non-boot(crashed) cpus offline (to preserve
> correct status of cpus at crash dump) before shutting down, this patch
> also adds a variant of smp_send_stop().
> 
> Signed-off-by: AKASHI Takahiro <takahiro.akashi@linaro.org>
> ---
>  arch/arm64/include/asm/hardirq.h  |  2 +-
>  arch/arm64/include/asm/kexec.h    | 41 ++++++++++++++++++++++++-
>  arch/arm64/include/asm/smp.h      |  2 ++
>  arch/arm64/kernel/machine_kexec.c | 56 ++++++++++++++++++++++++++++++++--
>  arch/arm64/kernel/smp.c           | 63 +++++++++++++++++++++++++++++++++++++++
>  5 files changed, 159 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/hardirq.h b/arch/arm64/include/asm/hardirq.h
> index 8740297..1473fc2 100644
> --- a/arch/arm64/include/asm/hardirq.h
> +++ b/arch/arm64/include/asm/hardirq.h
> @@ -20,7 +20,7 @@
>  #include <linux/threads.h>
>  #include <asm/irq.h>
>  
> -#define NR_IPI	6
> +#define NR_IPI	7
>  
>  typedef struct {
>  	unsigned int __softirq_pending;
> diff --git a/arch/arm64/include/asm/kexec.h b/arch/arm64/include/asm/kexec.h
> index 04744dc..a908958 100644
> --- a/arch/arm64/include/asm/kexec.h
> +++ b/arch/arm64/include/asm/kexec.h
> @@ -40,7 +40,46 @@
>  static inline void crash_setup_regs(struct pt_regs *newregs,
>  				    struct pt_regs *oldregs)
>  {
> -	/* Empty routine needed to avoid build errors. */
> +	if (oldregs) {
> +		memcpy(newregs, oldregs, sizeof(*newregs));
> +	} else {
> +		u64 tmp1, tmp2;
> +
> +		__asm__ __volatile__ (
> +			"stp	 x0,   x1, [%2, #16 *  0]\n"
> +			"stp	 x2,   x3, [%2, #16 *  1]\n"
> +			"stp	 x4,   x5, [%2, #16 *  2]\n"
> +			"stp	 x6,   x7, [%2, #16 *  3]\n"
> +			"stp	 x8,   x9, [%2, #16 *  4]\n"
> +			"stp	x10,  x11, [%2, #16 *  5]\n"
> +			"stp	x12,  x13, [%2, #16 *  6]\n"
> +			"stp	x14,  x15, [%2, #16 *  7]\n"
> +			"stp	x16,  x17, [%2, #16 *  8]\n"
> +			"stp	x18,  x19, [%2, #16 *  9]\n"
> +			"stp	x20,  x21, [%2, #16 * 10]\n"
> +			"stp	x22,  x23, [%2, #16 * 11]\n"
> +			"stp	x24,  x25, [%2, #16 * 12]\n"
> +			"stp	x26,  x27, [%2, #16 * 13]\n"
> +			"stp	x28,  x29, [%2, #16 * 14]\n"
> +			"mov	 %0,  sp\n"
> +			"stp	x30,  %0,  [%2, #16 * 15]\n"
> +
> +			"/* faked current PSTATE */\n"
> +			"mrs	 %0, CurrentEL\n"
> +			"mrs	 %1, DAIF\n"
> +			"orr	 %0, %0, %1\n"
> +			"mrs	 %1, NZCV\n"
> +			"orr	 %0, %0, %1\n"
> +

What about SPSEL? While we don't use it, it is correctly preserved for
everything except a CPU that calls panic()...


> +			/* pc */
> +			"adr	 %1, 1f\n"
> +		"1:\n"
> +			"stp	 %1, %0,   [%2, #16 * 16]\n"
> +			: "=r" (tmp1), "=r" (tmp2), "+r" (newregs)
> +			:
> +			: "memory"
> +		);
> +	}
>  }
>  
>  #endif /* __ASSEMBLY__ */
> diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h
> index 0226447..6b0f2c7 100644
> --- a/arch/arm64/include/asm/smp.h
> +++ b/arch/arm64/include/asm/smp.h
> @@ -136,6 +136,8 @@ static inline void cpu_panic_kernel(void)
>   */
>  bool cpus_are_stuck_in_kernel(void);
>  
> +extern void smp_send_crash_stop(void);
> +
>  #endif /* ifndef __ASSEMBLY__ */
>  
>  #endif /* ifndef __ASM_SMP_H */
> diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c
> index bc96c8a..8ac9dba8 100644
> --- a/arch/arm64/kernel/machine_kexec.c
> +++ b/arch/arm64/kernel/machine_kexec.c
> @@ -9,6 +9,9 @@
>   * published by the Free Software Foundation.
>   */
>  
> +#include <linux/interrupt.h>
> +#include <linux/irq.h>
> +#include <linux/kernel.h>
>  #include <linux/kexec.h>
>  #include <linux/smp.h>
>  
> @@ -22,6 +25,7 @@
>  extern const unsigned char arm64_relocate_new_kernel[];
>  extern const unsigned long arm64_relocate_new_kernel_size;
>  
> +bool in_crash_kexec;

static?


>  static unsigned long kimage_start;
>  
>  /**
> @@ -148,7 +152,8 @@ void machine_kexec(struct kimage *kimage)
>  	/*
>  	 * New cpus may have become stuck_in_kernel after we loaded the image.
>  	 */
> -	BUG_ON(cpus_are_stuck_in_kernel() || (num_online_cpus() > 1));
> +	BUG_ON((cpus_are_stuck_in_kernel() || (num_online_cpus() > 1)) &&
> +			!WARN_ON(in_crash_kexec));

In the kdump case, num_online_cpus() is unchanged as ipi_cpu_crash_stop()
doesn't update the online cpu masks, so this WARN_ON always fires. This is
confusing as the 'failed to stop secondary CPUs' message doesn't appear, because
those CPUs did stop, and waiting_for_crash_ipi has the expected value...


>  
>  	reboot_code_buffer_phys = page_to_phys(kimage->control_code_page);
>  	reboot_code_buffer = phys_to_virt(reboot_code_buffer_phys);
> @@ -200,13 +205,58 @@ void machine_kexec(struct kimage *kimage)
>  	 * relocation is complete.
>  	 */
>  
> -	cpu_soft_restart(1, reboot_code_buffer_phys, kimage->head,
> +	cpu_soft_restart(!in_crash_kexec, reboot_code_buffer_phys, kimage->head,
>  		kimage_start, 0);
>  
>  	BUG(); /* Should never get here. */
>  }



> +static void machine_kexec_mask_interrupts(void)
> +{
> +	unsigned int i;
> +	struct irq_desc *desc;
> +
> +	for_each_irq_desc(i, desc) {
> +		struct irq_chip *chip;
> +		int ret;
> +
> +		chip = irq_desc_get_chip(desc);
> +		if (!chip)
> +			continue;
> +
> +		/*
> +		 * First try to remove the active state. If this
> +		 * fails, try to EOI the interrupt.
> +		 */
> +		ret = irq_set_irqchip_state(i, IRQCHIP_STATE_ACTIVE, false);
> +
> +		if (ret && irqd_irq_inprogress(&desc->irq_data) &&
> +		    chip->irq_eoi)
> +			chip->irq_eoi(&desc->irq_data);
> +
> +		if (chip->irq_mask)
> +			chip->irq_mask(&desc->irq_data);
> +
> +		if (chip->irq_disable && !irqd_irq_disabled(&desc->irq_data))
> +			chip->irq_disable(&desc->irq_data);
> +	}
> +}

This function is over my head ... I have no idea how this works, I can only
comment that its different to the version under arch/arm

/me adds Marc Z to CC.


> +/**
> + * machine_crash_shutdown - shutdown non-crashing cpus and save registers
> + */
>  void machine_crash_shutdown(struct pt_regs *regs)
>  {
> -	/* Empty routine needed to avoid build errors. */
> +	local_irq_disable();
> +
> +	in_crash_kexec = true;
> +
> +	/* shutdown non-crashing cpus */
> +	smp_send_crash_stop();
> +
> +	/* for crashing cpu */
> +	crash_save_cpu(regs, smp_processor_id());
> +	machine_kexec_mask_interrupts();
> +
> +	pr_info("Starting crashdump kernel...\n");
>  }
> diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
> index d93d433..b401b25 100644
> --- a/arch/arm64/kernel/smp.c
> +++ b/arch/arm64/kernel/smp.c
> @@ -37,6 +37,7 @@
>  #include <linux/completion.h>
>  #include <linux/of.h>
>  #include <linux/irq_work.h>
> +#include <linux/kexec.h>
>  
>  #include <asm/alternative.h>
>  #include <asm/atomic.h>
> @@ -71,6 +72,7 @@ enum ipi_msg_type {
>  	IPI_RESCHEDULE,
>  	IPI_CALL_FUNC,
>  	IPI_CPU_STOP,
> +	IPI_CPU_CRASH_STOP,
>  	IPI_TIMER,
>  	IPI_IRQ_WORK,
>  	IPI_WAKEUP
> @@ -734,6 +736,7 @@ static const char *ipi_types[NR_IPI] __tracepoint_string = {
>  	S(IPI_RESCHEDULE, "Rescheduling interrupts"),
>  	S(IPI_CALL_FUNC, "Function call interrupts"),
>  	S(IPI_CPU_STOP, "CPU stop interrupts"),
> +	S(IPI_CPU_CRASH_STOP, "CPU stop (for crash dump) interrupts"),
>  	S(IPI_TIMER, "Timer broadcast interrupts"),
>  	S(IPI_IRQ_WORK, "IRQ work interrupts"),
>  	S(IPI_WAKEUP, "CPU wake-up interrupts"),
> @@ -808,6 +811,29 @@ static void ipi_cpu_stop(unsigned int cpu)
>  		cpu_relax();
>  }
>  
> +#ifdef CONFIG_KEXEC_CORE
> +static atomic_t waiting_for_crash_ipi;
> +#endif
> +
> +static void ipi_cpu_crash_stop(unsigned int cpu, struct pt_regs *regs)
> +{
> +#ifdef CONFIG_KEXEC_CORE
> +	crash_save_cpu(regs, cpu);
> +
> +	atomic_dec(&waiting_for_crash_ipi);
> +
> +	local_irq_disable();
> +
> +#ifdef CONFIG_HOTPLUG_CPU
> +	if (cpu_ops[cpu]->cpu_die)
> +		cpu_ops[cpu]->cpu_die(cpu);
> +#endif
> +
> +	/* just in case */
> +	cpu_park_loop();
> +#endif
> +}
> +
>  /*
>   * Main handler for inter-processor interrupts
>   */
> @@ -838,6 +864,15 @@ void handle_IPI(int ipinr, struct pt_regs *regs)
>  		irq_exit();
>  		break;
>  
> +	case IPI_CPU_CRASH_STOP:
> +		if (IS_ENABLED(CONFIG_KEXEC_CORE)) {
> +			irq_enter();
> +			ipi_cpu_crash_stop(cpu, regs);
> +
> +			unreachable();
> +		}
> +		break;
> +
>  #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
>  	case IPI_TIMER:
>  		irq_enter();
> @@ -910,6 +945,34 @@ void smp_send_stop(void)
>  			   cpumask_pr_args(cpu_online_mask));
>  }
>  
> +#ifdef CONFIG_KEXEC_CORE
> +void smp_send_crash_stop(void)
> +{
> +	cpumask_t mask;
> +	unsigned long timeout;
> +
> +	if (num_online_cpus() == 1)
> +		return;
> +
> +	cpumask_copy(&mask, cpu_online_mask);
> +	cpumask_clear_cpu(smp_processor_id(), &mask);
> +
> +	atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
> +
> +	pr_crit("SMP: stopping secondary CPUs\n");
> +	smp_cross_call(&mask, IPI_CPU_CRASH_STOP);
> +
> +	/* Wait up to one second for other CPUs to stop */
> +	timeout = USEC_PER_SEC;
> +	while ((atomic_read(&waiting_for_crash_ipi) > 0) && timeout--)
> +		udelay(1);
> +
> +	if (atomic_read(&waiting_for_crash_ipi) > 0)
> +		pr_warning("SMP: failed to stop secondary CPUs %*pbl\n",
> +			   cpumask_pr_args(cpu_online_mask));
> +}
> +#endif

This is very similar to smp_send_stop() which also has the timeout. Is it
possible to merge them? You could use in_crash_kexec to choose the IPI type.


> +
>  /*
>   * not supported here
>   */
> 


Reviewed-by: James Morse <james.morse@arm.com>


Thanks,

James
Marc Zyngier Sept. 15, 2016, 8:13 a.m. UTC | #2
Hi James,

Thanks for cc-ing me.

On 14/09/16 19:09, James Morse wrote:
> Hi Akashi,
> 
> (CC: Marc who knows how this irqchip wizardry works
>  Cover letter: https://www.spinics.net/lists/arm-kernel/msg529520.html )
> 
> On 07/09/16 05:29, AKASHI Takahiro wrote:
>> Primary kernel calls machine_crash_shutdown() to shut down non-boot cpus
>> and save registers' status in per-cpu ELF notes before starting crash
>> dump kernel. See kernel_kexec().
>> Even if not all secondary cpus have shut down, we do kdump anyway.
>>
>> As we don't have to make non-boot(crashed) cpus offline (to preserve
>> correct status of cpus at crash dump) before shutting down, this patch
>> also adds a variant of smp_send_stop().
>>
>> Signed-off-by: AKASHI Takahiro <takahiro.akashi@linaro.org>
>> ---
>>  arch/arm64/include/asm/hardirq.h  |  2 +-
>>  arch/arm64/include/asm/kexec.h    | 41 ++++++++++++++++++++++++-
>>  arch/arm64/include/asm/smp.h      |  2 ++
>>  arch/arm64/kernel/machine_kexec.c | 56 ++++++++++++++++++++++++++++++++--
>>  arch/arm64/kernel/smp.c           | 63 +++++++++++++++++++++++++++++++++++++++
>>  5 files changed, 159 insertions(+), 5 deletions(-)

[...]

>> +static void machine_kexec_mask_interrupts(void)
>> +{
>> +	unsigned int i;
>> +	struct irq_desc *desc;
>> +
>> +	for_each_irq_desc(i, desc) {
>> +		struct irq_chip *chip;
>> +		int ret;
>> +
>> +		chip = irq_desc_get_chip(desc);
>> +		if (!chip)
>> +			continue;
>> +
>> +		/*
>> +		 * First try to remove the active state. If this
>> +		 * fails, try to EOI the interrupt.
>> +		 */
>> +		ret = irq_set_irqchip_state(i, IRQCHIP_STATE_ACTIVE, false);
>> +
>> +		if (ret && irqd_irq_inprogress(&desc->irq_data) &&
>> +		    chip->irq_eoi)
>> +			chip->irq_eoi(&desc->irq_data);
>> +
>> +		if (chip->irq_mask)
>> +			chip->irq_mask(&desc->irq_data);
>> +
>> +		if (chip->irq_disable && !irqd_irq_disabled(&desc->irq_data))
>> +			chip->irq_disable(&desc->irq_data);
>> +	}
>> +}
> 
> This function is over my head ... I have no idea how this works, I can only
> comment that its different to the version under arch/arm
> 
> /me adds Marc Z to CC.

I wrote the damn code! ;-)

The main idea is that simply EOIing an interrupt is not good enough if
the interrupt has been offloaded to a VM. It needs to be actively
deactivated for the state machine to be reset.

But realistically, even that is not enough. What we need is a way to
completely shut off the GIC, irrespective of the state of the various
interrupts. A "panic button" of some sort, with no return.

That would probably work for GICv3 (assuming that we don't need to
involve the secure side of things), but anything GICv2 based would be
difficult to deal with (you cannot access the other CPU private
interrupt configuration). Maybe that'd be enough, maybe not. Trying to
boot a crash kernel is like buying a lottery ticket anyway (and with
similar odds...).

I'll have a look.

Thanks,

	M.
AKASHI Takahiro Sept. 16, 2016, 3:21 a.m. UTC | #3
James,

Thank you for your review.

On Wed, Sep 14, 2016 at 07:09:33PM +0100, James Morse wrote:
> Hi Akashi,
> 
> (CC: Marc who knows how this irqchip wizardry works
>  Cover letter: https://www.spinics.net/lists/arm-kernel/msg529520.html )
> 
> On 07/09/16 05:29, AKASHI Takahiro wrote:
> > Primary kernel calls machine_crash_shutdown() to shut down non-boot cpus
> > and save registers' status in per-cpu ELF notes before starting crash
> > dump kernel. See kernel_kexec().
> > Even if not all secondary cpus have shut down, we do kdump anyway.
> > 
> > As we don't have to make non-boot(crashed) cpus offline (to preserve
> > correct status of cpus at crash dump) before shutting down, this patch
> > also adds a variant of smp_send_stop().
> > 
> > Signed-off-by: AKASHI Takahiro <takahiro.akashi@linaro.org>
> > ---
> >  arch/arm64/include/asm/hardirq.h  |  2 +-
> >  arch/arm64/include/asm/kexec.h    | 41 ++++++++++++++++++++++++-
> >  arch/arm64/include/asm/smp.h      |  2 ++
> >  arch/arm64/kernel/machine_kexec.c | 56 ++++++++++++++++++++++++++++++++--
> >  arch/arm64/kernel/smp.c           | 63 +++++++++++++++++++++++++++++++++++++++
> >  5 files changed, 159 insertions(+), 5 deletions(-)
> > 
> > diff --git a/arch/arm64/include/asm/hardirq.h b/arch/arm64/include/asm/hardirq.h
> > index 8740297..1473fc2 100644
> > --- a/arch/arm64/include/asm/hardirq.h
> > +++ b/arch/arm64/include/asm/hardirq.h
> > @@ -20,7 +20,7 @@
> >  #include <linux/threads.h>
> >  #include <asm/irq.h>
> >  
> > -#define NR_IPI	6
> > +#define NR_IPI	7
> >  
> >  typedef struct {
> >  	unsigned int __softirq_pending;
> > diff --git a/arch/arm64/include/asm/kexec.h b/arch/arm64/include/asm/kexec.h
> > index 04744dc..a908958 100644
> > --- a/arch/arm64/include/asm/kexec.h
> > +++ b/arch/arm64/include/asm/kexec.h
> > @@ -40,7 +40,46 @@
> >  static inline void crash_setup_regs(struct pt_regs *newregs,
> >  				    struct pt_regs *oldregs)
> >  {
> > -	/* Empty routine needed to avoid build errors. */
> > +	if (oldregs) {
> > +		memcpy(newregs, oldregs, sizeof(*newregs));
> > +	} else {
> > +		u64 tmp1, tmp2;
> > +
> > +		__asm__ __volatile__ (
> > +			"stp	 x0,   x1, [%2, #16 *  0]\n"
> > +			"stp	 x2,   x3, [%2, #16 *  1]\n"
> > +			"stp	 x4,   x5, [%2, #16 *  2]\n"
> > +			"stp	 x6,   x7, [%2, #16 *  3]\n"
> > +			"stp	 x8,   x9, [%2, #16 *  4]\n"
> > +			"stp	x10,  x11, [%2, #16 *  5]\n"
> > +			"stp	x12,  x13, [%2, #16 *  6]\n"
> > +			"stp	x14,  x15, [%2, #16 *  7]\n"
> > +			"stp	x16,  x17, [%2, #16 *  8]\n"
> > +			"stp	x18,  x19, [%2, #16 *  9]\n"
> > +			"stp	x20,  x21, [%2, #16 * 10]\n"
> > +			"stp	x22,  x23, [%2, #16 * 11]\n"
> > +			"stp	x24,  x25, [%2, #16 * 12]\n"
> > +			"stp	x26,  x27, [%2, #16 * 13]\n"
> > +			"stp	x28,  x29, [%2, #16 * 14]\n"
> > +			"mov	 %0,  sp\n"
> > +			"stp	x30,  %0,  [%2, #16 * 15]\n"
> > +
> > +			"/* faked current PSTATE */\n"
> > +			"mrs	 %0, CurrentEL\n"
> > +			"mrs	 %1, DAIF\n"
> > +			"orr	 %0, %0, %1\n"
> > +			"mrs	 %1, NZCV\n"
> > +			"orr	 %0, %0, %1\n"
> > +
> 
> What about SPSEL? While we don't use it, it is correctly preserved for
> everything except a CPU that calls panic()...

My comment above might be confusing, but what I want to fake
here is "spsr" as pt_regs.pstate is normally set based on spsr_el1.
So there is no corresponding field of SPSEL in spsr.

> 
> > +			/* pc */
> > +			"adr	 %1, 1f\n"
> > +		"1:\n"
> > +			"stp	 %1, %0,   [%2, #16 * 16]\n"
> > +			: "=r" (tmp1), "=r" (tmp2), "+r" (newregs)
> > +			:
> > +			: "memory"
> > +		);
> > +	}
> >  }
> >  
> >  #endif /* __ASSEMBLY__ */
> > diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h
> > index 0226447..6b0f2c7 100644
> > --- a/arch/arm64/include/asm/smp.h
> > +++ b/arch/arm64/include/asm/smp.h
> > @@ -136,6 +136,8 @@ static inline void cpu_panic_kernel(void)
> >   */
> >  bool cpus_are_stuck_in_kernel(void);
> >  
> > +extern void smp_send_crash_stop(void);
> > +
> >  #endif /* ifndef __ASSEMBLY__ */
> >  
> >  #endif /* ifndef __ASM_SMP_H */
> > diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c
> > index bc96c8a..8ac9dba8 100644
> > --- a/arch/arm64/kernel/machine_kexec.c
> > +++ b/arch/arm64/kernel/machine_kexec.c
> > @@ -9,6 +9,9 @@
> >   * published by the Free Software Foundation.
> >   */
> >  
> > +#include <linux/interrupt.h>
> > +#include <linux/irq.h>
> > +#include <linux/kernel.h>
> >  #include <linux/kexec.h>
> >  #include <linux/smp.h>
> >  
> > @@ -22,6 +25,7 @@
> >  extern const unsigned char arm64_relocate_new_kernel[];
> >  extern const unsigned long arm64_relocate_new_kernel_size;
> >  
> > +bool in_crash_kexec;
> 
> static?

Yes, it can be.

> >  static unsigned long kimage_start;
> >  
> >  /**
> > @@ -148,7 +152,8 @@ void machine_kexec(struct kimage *kimage)
> >  	/*
> >  	 * New cpus may have become stuck_in_kernel after we loaded the image.
> >  	 */
> > -	BUG_ON(cpus_are_stuck_in_kernel() || (num_online_cpus() > 1));
> > +	BUG_ON((cpus_are_stuck_in_kernel() || (num_online_cpus() > 1)) &&
> > +			!WARN_ON(in_crash_kexec));
> 
> In the kdump case, num_online_cpus() is unchanged as ipi_cpu_crash_stop()
> doesn't update the online cpu masks, so this WARN_ON always fires. This is
> confusing as the 'failed to stop secondary CPUs' message doesn't appear, because
> those CPUs did stop, and waiting_for_crash_ipi has the expected value...

Good catch! I've never noticed that the message was wrong.
The line should be changed to:
	BUG_ON((cpus_are_stuck_in_kernel() || (num_online_cpus() > 1)) &&
			!in_crash_kexec);

> >  
> >  	reboot_code_buffer_phys = page_to_phys(kimage->control_code_page);
> >  	reboot_code_buffer = phys_to_virt(reboot_code_buffer_phys);
> > @@ -200,13 +205,58 @@ void machine_kexec(struct kimage *kimage)
> >  	 * relocation is complete.
> >  	 */
> >  
> > -	cpu_soft_restart(1, reboot_code_buffer_phys, kimage->head,
> > +	cpu_soft_restart(!in_crash_kexec, reboot_code_buffer_phys, kimage->head,
> >  		kimage_start, 0);
> >  
> >  	BUG(); /* Should never get here. */
> >  }
> 
> 
> 
> > +static void machine_kexec_mask_interrupts(void)
> > +{
> > +	unsigned int i;
> > +	struct irq_desc *desc;
> > +
> > +	for_each_irq_desc(i, desc) {
> > +		struct irq_chip *chip;
> > +		int ret;
> > +
> > +		chip = irq_desc_get_chip(desc);
> > +		if (!chip)
> > +			continue;
> > +
> > +		/*
> > +		 * First try to remove the active state. If this
> > +		 * fails, try to EOI the interrupt.
> > +		 */
> > +		ret = irq_set_irqchip_state(i, IRQCHIP_STATE_ACTIVE, false);
> > +
> > +		if (ret && irqd_irq_inprogress(&desc->irq_data) &&
> > +		    chip->irq_eoi)
> > +			chip->irq_eoi(&desc->irq_data);
> > +
> > +		if (chip->irq_mask)
> > +			chip->irq_mask(&desc->irq_data);
> > +
> > +		if (chip->irq_disable && !irqd_irq_disabled(&desc->irq_data))
> > +			chip->irq_disable(&desc->irq_data);
> > +	}
> > +}
> 
> This function is over my head ... I have no idea how this works, I can only
> comment that its different to the version under arch/arm
> 
> /me adds Marc Z to CC.

This function was once borrowed from arch/arm, then dropped temporarily
and revamped based on Marc's comment IIRC.
So I would like to defer to Marc.

> 
> > +/**
> > + * machine_crash_shutdown - shutdown non-crashing cpus and save registers
> > + */
> >  void machine_crash_shutdown(struct pt_regs *regs)
> >  {
> > -	/* Empty routine needed to avoid build errors. */
> > +	local_irq_disable();
> > +
> > +	in_crash_kexec = true;
> > +
> > +	/* shutdown non-crashing cpus */
> > +	smp_send_crash_stop();
> > +
> > +	/* for crashing cpu */
> > +	crash_save_cpu(regs, smp_processor_id());
> > +	machine_kexec_mask_interrupts();
> > +
> > +	pr_info("Starting crashdump kernel...\n");
> >  }
> > diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
> > index d93d433..b401b25 100644
> > --- a/arch/arm64/kernel/smp.c
> > +++ b/arch/arm64/kernel/smp.c
> > @@ -37,6 +37,7 @@
> >  #include <linux/completion.h>
> >  #include <linux/of.h>
> >  #include <linux/irq_work.h>
> > +#include <linux/kexec.h>
> >  
> >  #include <asm/alternative.h>
> >  #include <asm/atomic.h>
> > @@ -71,6 +72,7 @@ enum ipi_msg_type {
> >  	IPI_RESCHEDULE,
> >  	IPI_CALL_FUNC,
> >  	IPI_CPU_STOP,
> > +	IPI_CPU_CRASH_STOP,
> >  	IPI_TIMER,
> >  	IPI_IRQ_WORK,
> >  	IPI_WAKEUP
> > @@ -734,6 +736,7 @@ static const char *ipi_types[NR_IPI] __tracepoint_string = {
> >  	S(IPI_RESCHEDULE, "Rescheduling interrupts"),
> >  	S(IPI_CALL_FUNC, "Function call interrupts"),
> >  	S(IPI_CPU_STOP, "CPU stop interrupts"),
> > +	S(IPI_CPU_CRASH_STOP, "CPU stop (for crash dump) interrupts"),
> >  	S(IPI_TIMER, "Timer broadcast interrupts"),
> >  	S(IPI_IRQ_WORK, "IRQ work interrupts"),
> >  	S(IPI_WAKEUP, "CPU wake-up interrupts"),
> > @@ -808,6 +811,29 @@ static void ipi_cpu_stop(unsigned int cpu)
> >  		cpu_relax();
> >  }
> >  
> > +#ifdef CONFIG_KEXEC_CORE
> > +static atomic_t waiting_for_crash_ipi;
> > +#endif
> > +
> > +static void ipi_cpu_crash_stop(unsigned int cpu, struct pt_regs *regs)
> > +{
> > +#ifdef CONFIG_KEXEC_CORE
> > +	crash_save_cpu(regs, cpu);
> > +
> > +	atomic_dec(&waiting_for_crash_ipi);
> > +
> > +	local_irq_disable();
> > +
> > +#ifdef CONFIG_HOTPLUG_CPU
> > +	if (cpu_ops[cpu]->cpu_die)
> > +		cpu_ops[cpu]->cpu_die(cpu);
> > +#endif
> > +
> > +	/* just in case */
> > +	cpu_park_loop();
> > +#endif
> > +}
> > +
> >  /*
> >   * Main handler for inter-processor interrupts
> >   */
> > @@ -838,6 +864,15 @@ void handle_IPI(int ipinr, struct pt_regs *regs)
> >  		irq_exit();
> >  		break;
> >  
> > +	case IPI_CPU_CRASH_STOP:
> > +		if (IS_ENABLED(CONFIG_KEXEC_CORE)) {
> > +			irq_enter();
> > +			ipi_cpu_crash_stop(cpu, regs);
> > +
> > +			unreachable();
> > +		}
> > +		break;
> > +
> >  #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
> >  	case IPI_TIMER:
> >  		irq_enter();
> > @@ -910,6 +945,34 @@ void smp_send_stop(void)
> >  			   cpumask_pr_args(cpu_online_mask));
> >  }
> >  
> > +#ifdef CONFIG_KEXEC_CORE
> > +void smp_send_crash_stop(void)
> > +{
> > +	cpumask_t mask;
> > +	unsigned long timeout;
> > +
> > +	if (num_online_cpus() == 1)
> > +		return;
> > +
> > +	cpumask_copy(&mask, cpu_online_mask);
> > +	cpumask_clear_cpu(smp_processor_id(), &mask);
> > +
> > +	atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
> > +
> > +	pr_crit("SMP: stopping secondary CPUs\n");
> > +	smp_cross_call(&mask, IPI_CPU_CRASH_STOP);
> > +
> > +	/* Wait up to one second for other CPUs to stop */
> > +	timeout = USEC_PER_SEC;
> > +	while ((atomic_read(&waiting_for_crash_ipi) > 0) && timeout--)
> > +		udelay(1);
> > +
> > +	if (atomic_read(&waiting_for_crash_ipi) > 0)
> > +		pr_warning("SMP: failed to stop secondary CPUs %*pbl\n",
> > +			   cpumask_pr_args(cpu_online_mask));
> > +}
> > +#endif
> 
> This is very similar to smp_send_stop() which also has the timeout. Is it
> possible to merge them? You could use in_crash_kexec to choose the IPI type.

Yeah, we could merge them along with ipi_cpu_(crash_)stop().
But the resulting code would be quite noisy if each line
is switched by "if (in_crash_kexec)."
Otherwise, we may have one big "if" like:
void smp_send_stop(void)
{
    if (in_crash_kexec)
        ...
    else
        ...
}
It seems to me that it is not much different from the current code.
What do you think?

-Takahiro AKASHI


> 
> > +
> >  /*
> >   * not supported here
> >   */
> > 
> 
> 
> Reviewed-by: James Morse <james.morse@arm.com>
> 
> 
> Thanks,
> 
> James
> 
> 
>
James Morse Sept. 16, 2016, 2:49 p.m. UTC | #4
On 16/09/16 04:21, AKASHI Takahiro wrote:
> On Wed, Sep 14, 2016 at 07:09:33PM +0100, James Morse wrote:
>> On 07/09/16 05:29, AKASHI Takahiro wrote:
>>> Primary kernel calls machine_crash_shutdown() to shut down non-boot cpus
>>> and save registers' status in per-cpu ELF notes before starting crash
>>> dump kernel. See kernel_kexec().
>>> Even if not all secondary cpus have shut down, we do kdump anyway.
>>>
>>> As we don't have to make non-boot(crashed) cpus offline (to preserve
>>> correct status of cpus at crash dump) before shutting down, this patch
>>> also adds a variant of smp_send_stop().

>>> diff --git a/arch/arm64/include/asm/kexec.h b/arch/arm64/include/asm/kexec.h
>>> index 04744dc..a908958 100644
>>> --- a/arch/arm64/include/asm/kexec.h
>>> +++ b/arch/arm64/include/asm/kexec.h
>>> @@ -40,7 +40,46 @@
>>>  static inline void crash_setup_regs(struct pt_regs *newregs,
>>>  				    struct pt_regs *oldregs)
>>>  {
>>> -	/* Empty routine needed to avoid build errors. */
>>> +	if (oldregs) {
>>> +		memcpy(newregs, oldregs, sizeof(*newregs));
>>> +	} else {
>>> +		u64 tmp1, tmp2;
>>> +
>>> +		__asm__ __volatile__ (
>>> +			"stp	 x0,   x1, [%2, #16 *  0]\n"
>>> +			"stp	 x2,   x3, [%2, #16 *  1]\n"
>>> +			"stp	 x4,   x5, [%2, #16 *  2]\n"
>>> +			"stp	 x6,   x7, [%2, #16 *  3]\n"
>>> +			"stp	 x8,   x9, [%2, #16 *  4]\n"
>>> +			"stp	x10,  x11, [%2, #16 *  5]\n"
>>> +			"stp	x12,  x13, [%2, #16 *  6]\n"
>>> +			"stp	x14,  x15, [%2, #16 *  7]\n"
>>> +			"stp	x16,  x17, [%2, #16 *  8]\n"
>>> +			"stp	x18,  x19, [%2, #16 *  9]\n"
>>> +			"stp	x20,  x21, [%2, #16 * 10]\n"
>>> +			"stp	x22,  x23, [%2, #16 * 11]\n"
>>> +			"stp	x24,  x25, [%2, #16 * 12]\n"
>>> +			"stp	x26,  x27, [%2, #16 * 13]\n"
>>> +			"stp	x28,  x29, [%2, #16 * 14]\n"
>>> +			"mov	 %0,  sp\n"
>>> +			"stp	x30,  %0,  [%2, #16 * 15]\n"
>>> +
>>> +			"/* faked current PSTATE */\n"
>>> +			"mrs	 %0, CurrentEL\n"
>>> +			"mrs	 %1, DAIF\n"
>>> +			"orr	 %0, %0, %1\n"
>>> +			"mrs	 %1, NZCV\n"
>>> +			"orr	 %0, %0, %1\n"
>>> +
>>
>> What about SPSEL? While we don't use it, it is correctly preserved for
>> everything except a CPU that calls panic()...
> 
> My comment above might be confusing, but what I want to fake
> here is "spsr" as pt_regs.pstate is normally set based on spsr_el1.
> So there is no corresponding field of SPSEL in spsr.

Here is my logic, I may have missed something obvious, see what you think:

SPSR_EL{1,2} shows the CPU mode 'M' in bits 0-4, From aarch64 bit 4 is always 0.
From the register definitions in the ARM-ARM C5.2, likely values in 0-3 are:
0100 EL1t
0101 EL1h
1000 EL2t
1001 EL2h

I'm pretty sure this least significant bit is what SPSEL changes, so it does get
implicitly recorded in SPSR.
CurrentEL returns a value in bits 0-3, of which 0-1 are RES0, so we lose the
difference between EL?t and EL?h.


> 
>>
>>> +			/* pc */
>>> +			"adr	 %1, 1f\n"
>>> +		"1:\n"
>>> +			"stp	 %1, %0,   [%2, #16 * 16]\n"
>>> +			: "=r" (tmp1), "=r" (tmp2), "+r" (newregs)
>>> +			:
>>> +			: "memory"

Do you need the memory clobber? This asm only modifies values in newregs.


>>> +		);
>>> +	}
>>>  }
>>>  
>>>  #endif /* __ASSEMBLY__ */


>>> diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c

>>> +#ifdef CONFIG_KEXEC_CORE
>>> +void smp_send_crash_stop(void)
>>> +{
>>> +	cpumask_t mask;
>>> +	unsigned long timeout;
>>> +
>>> +	if (num_online_cpus() == 1)
>>> +		return;
>>> +
>>> +	cpumask_copy(&mask, cpu_online_mask);
>>> +	cpumask_clear_cpu(smp_processor_id(), &mask);
>>> +
>>> +	atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
>>> +
>>> +	pr_crit("SMP: stopping secondary CPUs\n");
>>> +	smp_cross_call(&mask, IPI_CPU_CRASH_STOP);
>>> +
>>> +	/* Wait up to one second for other CPUs to stop */
>>> +	timeout = USEC_PER_SEC;
>>> +	while ((atomic_read(&waiting_for_crash_ipi) > 0) && timeout--)
>>> +		udelay(1);
>>> +
>>> +	if (atomic_read(&waiting_for_crash_ipi) > 0)
>>> +		pr_warning("SMP: failed to stop secondary CPUs %*pbl\n",
>>> +			   cpumask_pr_args(cpu_online_mask));
>>> +}
>>> +#endif
>>
>> This is very similar to smp_send_stop() which also has the timeout. Is it
>> possible to merge them? You could use in_crash_kexec to choose the IPI type.
> 
> Yeah, we could merge them along with ipi_cpu_(crash_)stop().
> But the resulting code would be quite noisy if each line
> is switched by "if (in_crash_kexec)."
> Otherwise, we may have one big "if" like:
> void smp_send_stop(void)
> {
>     if (in_crash_kexec)
>         ...
>     else
>         ...
> }
> It seems to me that it is not much different from the current code.
> What do you think?

Hmm, yes, its too fiddly to keep the existing behaviour of both.

The problems are ipi_cpu_stop() doesn't call cpu_die(), (I can't see a good
reason for this, but more archaeology is needed), and ipi_cpu_crash_stop()
doesn't modify the online cpu mask.

I don't suggest we do this yet, but it could be future cleanup if it's proved to
be safe:
smp_send_stop() is only called from:  machine_halt(), machine_power_off(),
machine_restart() and panic(). In all those cases the CPUs are never expected to
come back, so we can probably merge the IPIs.  This involves modifying the
online cpu mask during kdump, (which I think is fine as it uses the atomic
bitops so we won't get blocked on a lock), and promoting in_crash_kexec to some
atomic type.

But I think we should leave it as it is for now,


Thanks,

James
AKASHI Takahiro Sept. 20, 2016, 7:36 a.m. UTC | #5
On Fri, Sep 16, 2016 at 03:49:31PM +0100, James Morse wrote:
> On 16/09/16 04:21, AKASHI Takahiro wrote:
> > On Wed, Sep 14, 2016 at 07:09:33PM +0100, James Morse wrote:
> >> On 07/09/16 05:29, AKASHI Takahiro wrote:
> >>> Primary kernel calls machine_crash_shutdown() to shut down non-boot cpus
> >>> and save registers' status in per-cpu ELF notes before starting crash
> >>> dump kernel. See kernel_kexec().
> >>> Even if not all secondary cpus have shut down, we do kdump anyway.
> >>>
> >>> As we don't have to make non-boot(crashed) cpus offline (to preserve
> >>> correct status of cpus at crash dump) before shutting down, this patch
> >>> also adds a variant of smp_send_stop().
> 
> >>> diff --git a/arch/arm64/include/asm/kexec.h b/arch/arm64/include/asm/kexec.h
> >>> index 04744dc..a908958 100644
> >>> --- a/arch/arm64/include/asm/kexec.h
> >>> +++ b/arch/arm64/include/asm/kexec.h
> >>> @@ -40,7 +40,46 @@
> >>>  static inline void crash_setup_regs(struct pt_regs *newregs,
> >>>  				    struct pt_regs *oldregs)
> >>>  {
> >>> -	/* Empty routine needed to avoid build errors. */
> >>> +	if (oldregs) {
> >>> +		memcpy(newregs, oldregs, sizeof(*newregs));
> >>> +	} else {
> >>> +		u64 tmp1, tmp2;
> >>> +
> >>> +		__asm__ __volatile__ (
> >>> +			"stp	 x0,   x1, [%2, #16 *  0]\n"
> >>> +			"stp	 x2,   x3, [%2, #16 *  1]\n"
> >>> +			"stp	 x4,   x5, [%2, #16 *  2]\n"
> >>> +			"stp	 x6,   x7, [%2, #16 *  3]\n"
> >>> +			"stp	 x8,   x9, [%2, #16 *  4]\n"
> >>> +			"stp	x10,  x11, [%2, #16 *  5]\n"
> >>> +			"stp	x12,  x13, [%2, #16 *  6]\n"
> >>> +			"stp	x14,  x15, [%2, #16 *  7]\n"
> >>> +			"stp	x16,  x17, [%2, #16 *  8]\n"
> >>> +			"stp	x18,  x19, [%2, #16 *  9]\n"
> >>> +			"stp	x20,  x21, [%2, #16 * 10]\n"
> >>> +			"stp	x22,  x23, [%2, #16 * 11]\n"
> >>> +			"stp	x24,  x25, [%2, #16 * 12]\n"
> >>> +			"stp	x26,  x27, [%2, #16 * 13]\n"
> >>> +			"stp	x28,  x29, [%2, #16 * 14]\n"
> >>> +			"mov	 %0,  sp\n"
> >>> +			"stp	x30,  %0,  [%2, #16 * 15]\n"
> >>> +
> >>> +			"/* faked current PSTATE */\n"
> >>> +			"mrs	 %0, CurrentEL\n"
> >>> +			"mrs	 %1, DAIF\n"
> >>> +			"orr	 %0, %0, %1\n"
> >>> +			"mrs	 %1, NZCV\n"
> >>> +			"orr	 %0, %0, %1\n"
> >>> +
> >>
> >> What about SPSEL? While we don't use it, it is correctly preserved for
> >> everything except a CPU that calls panic()...
> > 
> > My comment above might be confusing, but what I want to fake
> > here is "spsr" as pt_regs.pstate is normally set based on spsr_el1.
> > So there is no corresponding field of SPSEL in spsr.
> 
> Here is my logic, I may have missed something obvious, see what you think:
> 
> SPSR_EL{1,2} shows the CPU mode 'M' in bits 0-4, From aarch64 bit 4 is always 0.
> From the register definitions in the ARM-ARM C5.2, likely values in 0-3 are:
> 0100 EL1t
> 0101 EL1h
> 1000 EL2t
> 1001 EL2h
> 
> I'm pretty sure this least significant bit is what SPSEL changes, so it does get
> implicitly recorded in SPSR.
> CurrentEL returns a value in bits 0-3, of which 0-1 are RES0, so we lose the
> difference between EL?t and EL?h.

OK.
SPSel will be added assuming that CurrentEL is never 0 here.

> 
> > 
> >>
> >>> +			/* pc */
> >>> +			"adr	 %1, 1f\n"
> >>> +		"1:\n"
> >>> +			"stp	 %1, %0,   [%2, #16 * 16]\n"
> >>> +			: "=r" (tmp1), "=r" (tmp2), "+r" (newregs)
> >>> +			:
> >>> +			: "memory"
> 
> Do you need the memory clobber? This asm only modifies values in newregs.

What about this (including the change above):
|                       "/* faked current PSTATE */\n"
|                       "mrs     %0, CurrentEL\n"
|                       "mrs     %1, SPSEL\n"
|                       "orr     %0, %0, %1\n"
|                       "mrs     %1, DAIF\n"
|                       "orr     %0, %0, %1\n"
|                       "mrs     %1, NZCV\n"
|                       "orr     %0, %0, %1\n"
|                       /* pc */
|                       "adr     %1, 1f\n"
|               "1:\n"
|                       "stp     %1, %0,   [%2, #16 * 16]\n"
|                       : "+r" (tmp1), "+r" (tmp2)
|                       : "r" (newregs)
|                       : "memory"


> 
> >>> +		);
> >>> +	}
> >>>  }
> >>>  
> >>>  #endif /* __ASSEMBLY__ */
> 
> 
> >>> diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
> 
> >>> +#ifdef CONFIG_KEXEC_CORE
> >>> +void smp_send_crash_stop(void)
> >>> +{
> >>> +	cpumask_t mask;
> >>> +	unsigned long timeout;
> >>> +
> >>> +	if (num_online_cpus() == 1)
> >>> +		return;
> >>> +
> >>> +	cpumask_copy(&mask, cpu_online_mask);
> >>> +	cpumask_clear_cpu(smp_processor_id(), &mask);
> >>> +
> >>> +	atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
> >>> +
> >>> +	pr_crit("SMP: stopping secondary CPUs\n");
> >>> +	smp_cross_call(&mask, IPI_CPU_CRASH_STOP);
> >>> +
> >>> +	/* Wait up to one second for other CPUs to stop */
> >>> +	timeout = USEC_PER_SEC;
> >>> +	while ((atomic_read(&waiting_for_crash_ipi) > 0) && timeout--)
> >>> +		udelay(1);
> >>> +
> >>> +	if (atomic_read(&waiting_for_crash_ipi) > 0)
> >>> +		pr_warning("SMP: failed to stop secondary CPUs %*pbl\n",
> >>> +			   cpumask_pr_args(cpu_online_mask));
> >>> +}
> >>> +#endif
> >>
> >> This is very similar to smp_send_stop() which also has the timeout. Is it
> >> possible to merge them? You could use in_crash_kexec to choose the IPI type.
> > 
> > Yeah, we could merge them along with ipi_cpu_(crash_)stop().
> > But the resulting code would be quite noisy if each line
> > is switched by "if (in_crash_kexec)."
> > Otherwise, we may have one big "if" like:
> > void smp_send_stop(void)
> > {
> >     if (in_crash_kexec)
> >         ...
> >     else
> >         ...
> > }
> > It seems to me that it is not much different from the current code.
> > What do you think?
> 
> Hmm, yes, its too fiddly to keep the existing behaviour of both.
> 
> The problems are ipi_cpu_stop() doesn't call cpu_die(), (I can't see a good
> reason for this, but more archaeology is needed), and ipi_cpu_crash_stop()
> doesn't modify the online cpu mask.
> 
> I don't suggest we do this yet, but it could be future cleanup if it's proved to
> be safe:
> smp_send_stop() is only called from:  machine_halt(), machine_power_off(),
> machine_restart() and panic(). In all those cases the CPUs are never expected to
> come back, so we can probably merge the IPIs.  This involves modifying the
> online cpu mask during kdump, (which I think is fine as it uses the atomic
> bitops so we won't get blocked on a lock), and promoting in_crash_kexec to some
> atomic type.
> 
> But I think we should leave it as it is for now,

Sure.

Thanks,
-Takahiro AKASHI

> 
> Thanks,
> 
> James
>
diff mbox

Patch

diff --git a/arch/arm64/include/asm/hardirq.h b/arch/arm64/include/asm/hardirq.h
index 8740297..1473fc2 100644
--- a/arch/arm64/include/asm/hardirq.h
+++ b/arch/arm64/include/asm/hardirq.h
@@ -20,7 +20,7 @@ 
 #include <linux/threads.h>
 #include <asm/irq.h>
 
-#define NR_IPI	6
+#define NR_IPI	7
 
 typedef struct {
 	unsigned int __softirq_pending;
diff --git a/arch/arm64/include/asm/kexec.h b/arch/arm64/include/asm/kexec.h
index 04744dc..a908958 100644
--- a/arch/arm64/include/asm/kexec.h
+++ b/arch/arm64/include/asm/kexec.h
@@ -40,7 +40,46 @@ 
 static inline void crash_setup_regs(struct pt_regs *newregs,
 				    struct pt_regs *oldregs)
 {
-	/* Empty routine needed to avoid build errors. */
+	if (oldregs) {
+		memcpy(newregs, oldregs, sizeof(*newregs));
+	} else {
+		u64 tmp1, tmp2;
+
+		__asm__ __volatile__ (
+			"stp	 x0,   x1, [%2, #16 *  0]\n"
+			"stp	 x2,   x3, [%2, #16 *  1]\n"
+			"stp	 x4,   x5, [%2, #16 *  2]\n"
+			"stp	 x6,   x7, [%2, #16 *  3]\n"
+			"stp	 x8,   x9, [%2, #16 *  4]\n"
+			"stp	x10,  x11, [%2, #16 *  5]\n"
+			"stp	x12,  x13, [%2, #16 *  6]\n"
+			"stp	x14,  x15, [%2, #16 *  7]\n"
+			"stp	x16,  x17, [%2, #16 *  8]\n"
+			"stp	x18,  x19, [%2, #16 *  9]\n"
+			"stp	x20,  x21, [%2, #16 * 10]\n"
+			"stp	x22,  x23, [%2, #16 * 11]\n"
+			"stp	x24,  x25, [%2, #16 * 12]\n"
+			"stp	x26,  x27, [%2, #16 * 13]\n"
+			"stp	x28,  x29, [%2, #16 * 14]\n"
+			"mov	 %0,  sp\n"
+			"stp	x30,  %0,  [%2, #16 * 15]\n"
+
+			"/* faked current PSTATE */\n"
+			"mrs	 %0, CurrentEL\n"
+			"mrs	 %1, DAIF\n"
+			"orr	 %0, %0, %1\n"
+			"mrs	 %1, NZCV\n"
+			"orr	 %0, %0, %1\n"
+
+			/* pc */
+			"adr	 %1, 1f\n"
+		"1:\n"
+			"stp	 %1, %0,   [%2, #16 * 16]\n"
+			: "=r" (tmp1), "=r" (tmp2), "+r" (newregs)
+			:
+			: "memory"
+		);
+	}
 }
 
 #endif /* __ASSEMBLY__ */
diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h
index 0226447..6b0f2c7 100644
--- a/arch/arm64/include/asm/smp.h
+++ b/arch/arm64/include/asm/smp.h
@@ -136,6 +136,8 @@  static inline void cpu_panic_kernel(void)
  */
 bool cpus_are_stuck_in_kernel(void);
 
+extern void smp_send_crash_stop(void);
+
 #endif /* ifndef __ASSEMBLY__ */
 
 #endif /* ifndef __ASM_SMP_H */
diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c
index bc96c8a..8ac9dba8 100644
--- a/arch/arm64/kernel/machine_kexec.c
+++ b/arch/arm64/kernel/machine_kexec.c
@@ -9,6 +9,9 @@ 
  * published by the Free Software Foundation.
  */
 
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/kernel.h>
 #include <linux/kexec.h>
 #include <linux/smp.h>
 
@@ -22,6 +25,7 @@ 
 extern const unsigned char arm64_relocate_new_kernel[];
 extern const unsigned long arm64_relocate_new_kernel_size;
 
+bool in_crash_kexec;
 static unsigned long kimage_start;
 
 /**
@@ -148,7 +152,8 @@  void machine_kexec(struct kimage *kimage)
 	/*
 	 * New cpus may have become stuck_in_kernel after we loaded the image.
 	 */
-	BUG_ON(cpus_are_stuck_in_kernel() || (num_online_cpus() > 1));
+	BUG_ON((cpus_are_stuck_in_kernel() || (num_online_cpus() > 1)) &&
+			!WARN_ON(in_crash_kexec));
 
 	reboot_code_buffer_phys = page_to_phys(kimage->control_code_page);
 	reboot_code_buffer = phys_to_virt(reboot_code_buffer_phys);
@@ -200,13 +205,58 @@  void machine_kexec(struct kimage *kimage)
 	 * relocation is complete.
 	 */
 
-	cpu_soft_restart(1, reboot_code_buffer_phys, kimage->head,
+	cpu_soft_restart(!in_crash_kexec, reboot_code_buffer_phys, kimage->head,
 		kimage_start, 0);
 
 	BUG(); /* Should never get here. */
 }
 
+static void machine_kexec_mask_interrupts(void)
+{
+	unsigned int i;
+	struct irq_desc *desc;
+
+	for_each_irq_desc(i, desc) {
+		struct irq_chip *chip;
+		int ret;
+
+		chip = irq_desc_get_chip(desc);
+		if (!chip)
+			continue;
+
+		/*
+		 * First try to remove the active state. If this
+		 * fails, try to EOI the interrupt.
+		 */
+		ret = irq_set_irqchip_state(i, IRQCHIP_STATE_ACTIVE, false);
+
+		if (ret && irqd_irq_inprogress(&desc->irq_data) &&
+		    chip->irq_eoi)
+			chip->irq_eoi(&desc->irq_data);
+
+		if (chip->irq_mask)
+			chip->irq_mask(&desc->irq_data);
+
+		if (chip->irq_disable && !irqd_irq_disabled(&desc->irq_data))
+			chip->irq_disable(&desc->irq_data);
+	}
+}
+
+/**
+ * machine_crash_shutdown - shutdown non-crashing cpus and save registers
+ */
 void machine_crash_shutdown(struct pt_regs *regs)
 {
-	/* Empty routine needed to avoid build errors. */
+	local_irq_disable();
+
+	in_crash_kexec = true;
+
+	/* shutdown non-crashing cpus */
+	smp_send_crash_stop();
+
+	/* for crashing cpu */
+	crash_save_cpu(regs, smp_processor_id());
+	machine_kexec_mask_interrupts();
+
+	pr_info("Starting crashdump kernel...\n");
 }
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index d93d433..b401b25 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -37,6 +37,7 @@ 
 #include <linux/completion.h>
 #include <linux/of.h>
 #include <linux/irq_work.h>
+#include <linux/kexec.h>
 
 #include <asm/alternative.h>
 #include <asm/atomic.h>
@@ -71,6 +72,7 @@  enum ipi_msg_type {
 	IPI_RESCHEDULE,
 	IPI_CALL_FUNC,
 	IPI_CPU_STOP,
+	IPI_CPU_CRASH_STOP,
 	IPI_TIMER,
 	IPI_IRQ_WORK,
 	IPI_WAKEUP
@@ -734,6 +736,7 @@  static const char *ipi_types[NR_IPI] __tracepoint_string = {
 	S(IPI_RESCHEDULE, "Rescheduling interrupts"),
 	S(IPI_CALL_FUNC, "Function call interrupts"),
 	S(IPI_CPU_STOP, "CPU stop interrupts"),
+	S(IPI_CPU_CRASH_STOP, "CPU stop (for crash dump) interrupts"),
 	S(IPI_TIMER, "Timer broadcast interrupts"),
 	S(IPI_IRQ_WORK, "IRQ work interrupts"),
 	S(IPI_WAKEUP, "CPU wake-up interrupts"),
@@ -808,6 +811,29 @@  static void ipi_cpu_stop(unsigned int cpu)
 		cpu_relax();
 }
 
+#ifdef CONFIG_KEXEC_CORE
+static atomic_t waiting_for_crash_ipi;
+#endif
+
+static void ipi_cpu_crash_stop(unsigned int cpu, struct pt_regs *regs)
+{
+#ifdef CONFIG_KEXEC_CORE
+	crash_save_cpu(regs, cpu);
+
+	atomic_dec(&waiting_for_crash_ipi);
+
+	local_irq_disable();
+
+#ifdef CONFIG_HOTPLUG_CPU
+	if (cpu_ops[cpu]->cpu_die)
+		cpu_ops[cpu]->cpu_die(cpu);
+#endif
+
+	/* just in case */
+	cpu_park_loop();
+#endif
+}
+
 /*
  * Main handler for inter-processor interrupts
  */
@@ -838,6 +864,15 @@  void handle_IPI(int ipinr, struct pt_regs *regs)
 		irq_exit();
 		break;
 
+	case IPI_CPU_CRASH_STOP:
+		if (IS_ENABLED(CONFIG_KEXEC_CORE)) {
+			irq_enter();
+			ipi_cpu_crash_stop(cpu, regs);
+
+			unreachable();
+		}
+		break;
+
 #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
 	case IPI_TIMER:
 		irq_enter();
@@ -910,6 +945,34 @@  void smp_send_stop(void)
 			   cpumask_pr_args(cpu_online_mask));
 }
 
+#ifdef CONFIG_KEXEC_CORE
+void smp_send_crash_stop(void)
+{
+	cpumask_t mask;
+	unsigned long timeout;
+
+	if (num_online_cpus() == 1)
+		return;
+
+	cpumask_copy(&mask, cpu_online_mask);
+	cpumask_clear_cpu(smp_processor_id(), &mask);
+
+	atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
+
+	pr_crit("SMP: stopping secondary CPUs\n");
+	smp_cross_call(&mask, IPI_CPU_CRASH_STOP);
+
+	/* Wait up to one second for other CPUs to stop */
+	timeout = USEC_PER_SEC;
+	while ((atomic_read(&waiting_for_crash_ipi) > 0) && timeout--)
+		udelay(1);
+
+	if (atomic_read(&waiting_for_crash_ipi) > 0)
+		pr_warning("SMP: failed to stop secondary CPUs %*pbl\n",
+			   cpumask_pr_args(cpu_online_mask));
+}
+#endif
+
 /*
  * not supported here
  */