@@ -35,6 +35,7 @@
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/hvcall.h>
+#include <asm/mce.h>
#define KVM_MAX_VCPUS NR_CPUS
#define KVM_MAX_VCORES NR_CPUS
@@ -637,6 +638,7 @@ struct kvm_vcpu_arch {
int thread_cpu;
bool timer_running;
wait_queue_head_t cpu_run;
+ struct machine_check_event mce_evt; /* Valid if trap == 0x200 */
struct kvm_vcpu_arch_shared *shared;
#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_KVM_BOOK3S_PR_POSSIBLE)
@@ -15,6 +15,7 @@
#include <linux/export.h>
#include <asm/setup.h>
+#include <asm/mce.h>
/* We export this macro for external modules like Alsa to know if
* ppc_md.feature_call is implemented or not
@@ -112,6 +113,12 @@ struct machdep_calls {
/* Called during machine check exception to retrive fixup address. */
bool (*mce_check_early_recovery)(struct pt_regs *regs);
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+ /* Called after KVM interrupt handler finishes handling MCE for guest */
+ int (*machine_check_exception_guest)
+ (struct machine_check_event *evt);
+#endif
+
/* Motherboard/chipset features. This is a kind of general purpose
* hook used to control some machine specific features (like reset
* lines, chip power control, etc...).
@@ -17,6 +17,7 @@
#ifndef __ASSEMBLY__
#include <linux/notifier.h>
+#include <asm/mce.h>
/* We calculate number of sg entries based on PAGE_SIZE */
#define SG_ENTRIES_PER_NODE ((PAGE_SIZE - 16) / sizeof(struct opal_sg_entry))
@@ -276,6 +277,9 @@ extern int opal_hmi_handler_init(void);
extern int opal_event_init(void);
extern int opal_machine_check(struct pt_regs *regs);
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+extern int opal_machine_check_guest(struct machine_check_event *evt);
+#endif
extern bool opal_mce_check_early_recovery(struct pt_regs *regs);
extern int opal_hmi_exception_early(struct pt_regs *regs);
extern int opal_handle_hmi_exception(struct pt_regs *regs);
@@ -57,6 +57,9 @@ struct kvm_regs {
#define KVM_SREGS_E_FSL_PIDn (1 << 0) /* PID1/PID2 */
+/* flags for kvm_run.flags */
+#define KVM_RUN_PPC_NMI_RECOVERED (1 << 0)
+
/*
* Feature bits indicate which sections of the sregs struct are valid,
* both in KVM_GET_SREGS and KVM_SET_SREGS. On KVM_SET_SREGS, registers
@@ -954,15 +954,22 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
r = RESUME_GUEST;
break;
case BOOK3S_INTERRUPT_MACHINE_CHECK:
+ /* Exit to the host (userspace) with KVM_EXIT_NMI as exit reason */
+ run->exit_reason = KVM_EXIT_NMI;
+ run->hw.hardware_exit_reason = vcpu->arch.trap;
+ if (vcpu->arch.mce_evt.disposition == MCE_DISPOSITION_RECOVERED)
+ run->flags |= KVM_RUN_PPC_NMI_RECOVERED;
+ else
+ run->flags &= ~KVM_RUN_PPC_NMI_RECOVERED;
+
+ r = RESUME_HOST;
/*
- * Deliver a machine check interrupt to the guest.
- * We have to do this, even if the host has handled the
- * machine check, because machine checks use SRR0/1 and
- * the interrupt might have trashed guest state in them.
+ * Invoke host-kernel handler to perform any host-side
+ * handling before exiting the guest.
*/
- kvmppc_book3s_queue_irqprio(vcpu,
- BOOK3S_INTERRUPT_MACHINE_CHECK);
- r = RESUME_GUEST;
+ if (ppc_md.machine_check_exception_guest)
+ ppc_md.machine_check_exception_guest(
+ &vcpu->arch.mce_evt);
break;
case BOOK3S_INTERRUPT_PROGRAM:
{
@@ -133,8 +133,20 @@ static long kvmppc_realmode_mc_power7(struct kvm_vcpu *vcpu)
* interrupt (for unhandled errors) or will continue from
* current HSRR0 (for handled errors) in guest. Hence
* queue up the event so that we can log it from host console later.
+ * If QEMU supports the FWNMI capability, hook the MCE event onto
+ * the vcpu structure instead.
*/
- machine_check_queue_event();
+ if (vcpu->arch.fwnmi_enabled) {
+ /*
+ * Hook up the mce event on to vcpu structure.
+ * First clear the old event.
+ */
+ memset(&vcpu->arch.mce_evt, 0, sizeof(vcpu->arch.mce_evt));
+ if (get_mce_event(&mce_evt, MCE_EVENT_RELEASE)) {
+ vcpu->arch.mce_evt = mce_evt;
+ }
+ } else
+ machine_check_queue_event();
return handled;
}
@@ -134,21 +134,18 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
stb r0, HSTATE_HWTHREAD_REQ(r13)
/*
- * For external and machine check interrupts, we need
- * to call the Linux handler to process the interrupt.
- * We do that by jumping to absolute address 0x500 for
- * external interrupts, or the machine_check_fwnmi label
- * for machine checks (since firmware might have patched
- * the vector area at 0x200). The [h]rfid at the end of the
- * handler will return to the book3s_hv_interrupts.S code.
- * For other interrupts we do the rfid to get back
- * to the book3s_hv_interrupts.S code here.
+ * For external interrupts we need to call the Linux
+ * handler to process the interrupt. We do that by jumping
+ * to absolute address 0x500 for external interrupts.
+ * The [h]rfid at the end of the handler will return to
+ * the book3s_hv_interrupts.S code. For other interrupts
+ * we do the rfid to get back to the book3s_hv_interrupts.S
+ * code here.
*/
ld r8, 112+PPC_LR_STKOFF(r1)
addi r1, r1, 112
ld r7, HSTATE_HOST_MSR(r13)
- cmpwi cr1, r12, BOOK3S_INTERRUPT_MACHINE_CHECK
cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL
beq 11f
cmpwi r12, BOOK3S_INTERRUPT_H_DOORBELL
@@ -163,7 +160,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
mtmsrd r6, 1 /* Clear RI in MSR */
mtsrr0 r8
mtsrr1 r7
- beq cr1, 13f /* machine check */
+ /*
+ * BOOK3S_INTERRUPT_MACHINE_CHECK is handled at the
+ * time of guest exit
+ */
RFI
/* On POWER7, we have external interrupts set to use HSRR0/1 */
@@ -171,8 +171,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
mtspr SPRN_HSRR1, r7
ba 0x500
-13: b machine_check_fwnmi
-
14: mtspr SPRN_HSRR0, r8
mtspr SPRN_HSRR1, r7
b hmi_exception_after_realmode
@@ -2338,15 +2336,13 @@ machine_check_realmode:
ld r9, HSTATE_KVM_VCPU(r13)
li r12, BOOK3S_INTERRUPT_MACHINE_CHECK
/*
- * Deliver unhandled/fatal (e.g. UE) MCE errors to guest through
- * machine check interrupt (set HSRR0 to 0x200). And for handled
- * errors (no-fatal), just go back to guest execution with current
- * HSRR0 instead of exiting guest. This new approach will inject
- * machine check to guest for fatal error causing guest to crash.
- *
- * The old code used to return to host for unhandled errors which
- * was causing guest to hang with soft lockups inside guest and
- * makes it difficult to recover guest instance.
+ * Deliver unhandled/fatal (e.g. UE) MCE errors to the guest either
+ * through a machine check interrupt (set HSRR0 to 0x200) or, if the
+ * guest is FWNMI capable, by exiting the guest with the KVM_EXIT_NMI
+ * exit reason. For handled (non-fatal) errors in a guest that is not
+ * FWNMI capable, just go back to guest execution with the current
+ * HSRR0. This approach reports machine check errors to the guest,
+ * enabling the guest kernel to handle such errors suitably.
*
* if we receive machine check with MSR(RI=0) then deliver it to
* guest as machine check causing guest to crash.
@@ -2354,13 +2350,18 @@ machine_check_realmode:
ld r11, VCPU_MSR(r9)
rldicl. r0, r11, 64-MSR_HV_LG, 63 /* check if it happened in HV mode */
bne mc_cont /* if so, exit to host */
+ /* Check if guest is capable of handling NMI exit */
+ ld r10, VCPU_KVM(r9) /* r10 is scratch: rewritten by andi. below */
+ lbz r10, KVM_FWNMI(r10) /* base must not be r0: RA=0 reads as 0 */
+ cmpdi r10, 1 /* FWNMI capable? */
+ beq mc_cont /* if so, exit to host */
andi. r10, r11, MSR_RI /* check for unrecoverable exception */
beq 1f /* Deliver a machine check to guest */
ld r10, VCPU_PC(r9)
cmpdi r3, 0 /* Did we handle MCE ? */
bne 2f /* Continue guest execution. */
/* If not, deliver a machine check. SRR0/1 are already set */
-1: li r10, BOOK3S_INTERRUPT_MACHINE_CHECK
+1: li r10, BOOK3S_INTERRUPT_MACHINE_CHECK /* RI=0 path: target of "beq 1f" */
bl kvmppc_msr_interrupt
2: b fast_interrupt_c_return
@@ -488,6 +488,32 @@ int opal_machine_check(struct pt_regs *regs)
return 0;
}
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+/*
+ * opal_machine_check_guest() - host-side hook for a machine check
+ * exception that occurred in a guest.
+ *
+ * KVM invokes this hook from host virtual mode at guest exit time,
+ * before exiting the guest with the KVM_EXIT_NMI reason, so the host
+ * can do its own handling before the exception is passed on to the
+ * guest.
+ *
+ * For now the host only prints the event information; the guest kernel
+ * is expected to attempt recovery itself. Always returns 0.
+ */
+int opal_machine_check_guest(struct machine_check_event *evt)
+{
+	/* Only events in the MCE_V1 format are printed in full. */
+	if (evt->version == MCE_V1)
+		machine_check_print_event_info(evt);
+	else
+		pr_err("Machine Check Exception, Unknown event version %d !\n",
+		       evt->version);
+
+	return 0;
+}
+#endif
+
/* Early hmi handler called in real mode. */
int opal_hmi_exception_early(struct pt_regs *regs)
{
@@ -264,6 +264,9 @@ static void __init pnv_setup_machdep_opal(void)
ppc_md.mce_check_early_recovery = opal_mce_check_early_recovery;
ppc_md.hmi_exception_early = opal_hmi_exception_early;
ppc_md.handle_hmi_exception = opal_handle_hmi_exception;
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+ ppc_md.machine_check_exception_guest = opal_machine_check_guest;
+#endif
}
static int __init pnv_probe(void)