@@ -35,6 +35,7 @@
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/hvcall.h>
+#include <asm/mce.h>
#define KVM_MAX_VCPUS NR_CPUS
#define KVM_MAX_VCORES NR_CPUS
@@ -660,6 +661,7 @@ struct kvm_vcpu_arch {
int thread_cpu;
bool timer_running;
wait_queue_head_t cpu_run;
+ struct machine_check_event mce_evt; /* Valid if trap == 0x200 */
struct kvm_vcpu_arch_shared *shared;
#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_KVM_BOOK3S_PR_POSSIBLE)
@@ -15,6 +15,7 @@
#include <linux/export.h>
#include <asm/setup.h>
+#include <asm/mce.h>
/* We export this macro for external modules like Alsa to know if
* ppc_md.feature_call is implemented or not
@@ -112,6 +113,12 @@ struct machdep_calls {
/* Called during machine check exception to retrieve fixup address. */
bool (*mce_check_early_recovery)(struct pt_regs *regs);
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+ /* Called after KVM interrupt handler finishes handling MCE for guest */
+ int (*machine_check_exception_guest)
+ (struct machine_check_event *evt);
+#endif
+
/* Motherboard/chipset features. This is a kind of general purpose
* hook used to control some machine specific features (like reset
* lines, chip power control, etc...).
@@ -17,6 +17,7 @@
#ifndef __ASSEMBLY__
#include <linux/notifier.h>
+#include <asm/mce.h>
/* We calculate number of sg entries based on PAGE_SIZE */
#define SG_ENTRIES_PER_NODE ((PAGE_SIZE - 16) / sizeof(struct opal_sg_entry))
@@ -279,6 +280,9 @@ extern int opal_hmi_handler_init(void);
extern int opal_event_init(void);
extern int opal_machine_check(struct pt_regs *regs);
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+extern int opal_machine_check_guest(struct machine_check_event *evt);
+#endif
extern bool opal_mce_check_early_recovery(struct pt_regs *regs);
extern int opal_hmi_exception_early(struct pt_regs *regs);
extern int opal_handle_hmi_exception(struct pt_regs *regs);
@@ -57,6 +57,12 @@ struct kvm_regs {
#define KVM_SREGS_E_FSL_PIDn (1 << 0) /* PID1/PID2 */
+/* flags for kvm_run.flags */
+#define KVM_RUN_PPC_NMI_DISP_MASK (3 << 0)
+#define KVM_RUN_PPC_NMI_DISP_FULLY_RECOV (1 << 0)
+#define KVM_RUN_PPC_NMI_DISP_LIMITED_RECOV (2 << 0)
+#define KVM_RUN_PPC_NMI_DISP_NOT_RECOV (3 << 0)
+
/*
* Feature bits indicate which sections of the sregs struct are valid,
* both in KVM_GET_SREGS and KVM_SET_SREGS. On KVM_SET_SREGS, registers
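
To illustrate how a userspace VMM is expected to consume the new
KVM_RUN_PPC_NMI_DISP_* bits on a KVM_EXIT_NMI exit, here is a minimal
sketch; handle_nmi_exit and deliver_fwnmi_machine_check are illustrative
names, not part of this patch:

	/* Userspace (VMM) side -- illustrative sketch only */
	static void handle_nmi_exit(struct kvm_run *run)
	{
		switch (run->flags & KVM_RUN_PPC_NMI_DISP_MASK) {
		case KVM_RUN_PPC_NMI_DISP_FULLY_RECOV:
			/* Host fully recovered the error; just resume the guest. */
			break;
		case KVM_RUN_PPC_NMI_DISP_LIMITED_RECOV:
		case KVM_RUN_PPC_NMI_DISP_NOT_RECOV:
		default:
			/* Build an RTAS error log and deliver an FWNMI
			 * machine check to the guest (illustrative helper). */
			deliver_fwnmi_machine_check(run);
			break;
		}
	}
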
@@ -968,15 +968,25 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
r = RESUME_GUEST;
break;
case BOOK3S_INTERRUPT_MACHINE_CHECK:
+ /* Exit the guest with KVM_EXIT_NMI as the exit reason */
+ run->exit_reason = KVM_EXIT_NMI;
+ run->hw.hardware_exit_reason = vcpu->arch.trap;
+ /* Clear out the old NMI status from run->flags */
+ run->flags &= ~KVM_RUN_PPC_NMI_DISP_MASK;
+ /* Now set the NMI status */
+ if (vcpu->arch.mce_evt.disposition == MCE_DISPOSITION_RECOVERED)
+ run->flags |= KVM_RUN_PPC_NMI_DISP_FULLY_RECOV;
+ else
+ run->flags |= KVM_RUN_PPC_NMI_DISP_NOT_RECOV;
+
+ r = RESUME_HOST;
/*
- * Deliver a machine check interrupt to the guest.
- * We have to do this, even if the host has handled the
- * machine check, because machine checks use SRR0/1 and
- * the interrupt might have trashed guest state in them.
+ * Invoke the host-kernel handler to perform any host-side
+ * handling before exiting the guest.
*/
- kvmppc_book3s_queue_irqprio(vcpu,
- BOOK3S_INTERRUPT_MACHINE_CHECK);
- r = RESUME_GUEST;
+ if (ppc_md.machine_check_exception_guest)
+ ppc_md.machine_check_exception_guest(
+ &vcpu->arch.mce_evt);
break;
case BOOK3S_INTERRUPT_PROGRAM:
{
@@ -130,12 +130,28 @@ static long kvmppc_realmode_mc_power7(struct kvm_vcpu *vcpu)
out:
/*
+ * For a guest that supports the FWNMI capability, hook the MCE event
+ * into the vcpu structure. We are going to exit the guest with the
+ * KVM_EXIT_NMI exit reason, and on the way out we will pull this event
+ * from the vcpu structure and print it from thread 0 of the core/subcore.
+ *
+ * For a guest that does not support the FWNMI capability (old QEMU):
* we are now going to enter the guest either through a machine check
* interrupt (for unhandled errors) or continue from the current HSRR0
* (for handled errors). Hence queue up the event so that we can log
* it from the host console later.
*/
- machine_check_queue_event();
+ if (vcpu->kvm->arch.fwnmi_enabled) {
+ /*
+ * Hook the MCE event onto the vcpu structure,
+ * clearing out any stale event first.
+ */
+ memset(&vcpu->arch.mce_evt, 0, sizeof(vcpu->arch.mce_evt));
+ if (get_mce_event(&mce_evt, MCE_EVENT_RELEASE))
+ vcpu->arch.mce_evt = mce_evt;
+ } else {
+ machine_check_queue_event();
+ }
return handled;
}
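
Nothing in this excerpt sets kvm->arch.fwnmi_enabled; presumably the rest
of the series ties it to a VM capability that userspace opts in to. A
sketch of that opt-in, assuming a KVM_CAP_PPC_FWNMI capability (the name
is an assumption; it is not defined in this patch):

	/* Userspace side -- illustrative sketch only */
	struct kvm_enable_cap cap = {
		.cap = KVM_CAP_PPC_FWNMI,	/* assumed capability name */
	};

	if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap) < 0)
		fprintf(stderr, "FWNMI unsupported; using legacy 0x200 delivery\n");

A VMM that never enables the capability keeps the fallback path above: the
event is queued with machine_check_queue_event() and logged from the host
console.
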
@@ -134,21 +134,18 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
stb r0, HSTATE_HWTHREAD_REQ(r13)
/*
- * For external and machine check interrupts, we need
- * to call the Linux handler to process the interrupt.
- * We do that by jumping to absolute address 0x500 for
- * external interrupts, or the machine_check_fwnmi label
- * for machine checks (since firmware might have patched
- * the vector area at 0x200). The [h]rfid at the end of the
- * handler will return to the book3s_hv_interrupts.S code.
- * For other interrupts we do the rfid to get back
- * to the book3s_hv_interrupts.S code here.
+ * For external interrupts we need to call the Linux
+ * handler to process the interrupt, which we do by
+ * jumping to absolute address 0x500. The [h]rfid at
+ * the end of the handler will return to the
+ * book3s_hv_interrupts.S code. For other interrupts
+ * we do the rfid to get back to the
+ * book3s_hv_interrupts.S code here.
*/
ld r8, 112+PPC_LR_STKOFF(r1)
addi r1, r1, 112
ld r7, HSTATE_HOST_MSR(r13)
- cmpwi cr1, r12, BOOK3S_INTERRUPT_MACHINE_CHECK
cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL
beq 11f
cmpwi r12, BOOK3S_INTERRUPT_H_DOORBELL
@@ -163,7 +160,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
mtmsrd r6, 1 /* Clear RI in MSR */
mtsrr0 r8
mtsrr1 r7
- beq cr1, 13f /* machine check */
+ /*
+ * BOOK3S_INTERRUPT_MACHINE_CHECK is now handled at the time of
+ * guest exit, so we no longer branch to machine_check_fwnmi here.
+ */
RFI
/* On POWER7, we have external interrupts set to use HSRR0/1 */
@@ -171,8 +171,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
mtspr SPRN_HSRR1, r7
ba 0x500
-13: b machine_check_fwnmi
-
14: mtspr SPRN_HSRR0, r8
mtspr SPRN_HSRR1, r7
b hmi_exception_after_realmode
@@ -2394,22 +2392,32 @@ machine_check_realmode:
ld r9, HSTATE_KVM_VCPU(r13)
li r12, BOOK3S_INTERRUPT_MACHINE_CHECK
/*
- * Deliver unhandled/fatal (e.g. UE) MCE errors to guest through
- * machine check interrupt (set HSRR0 to 0x200). And for handled
- * errors (no-fatal), just go back to guest execution with current
- * HSRR0 instead of exiting guest. This new approach will inject
- * machine check to guest for fatal error causing guest to crash.
- *
- * The old code used to return to host for unhandled errors which
- * was causing guest to hang with soft lockups inside guest and
- * makes it difficult to recover guest instance.
+ * For a guest that is FWNMI capable, deliver all MCE errors
+ * (handled or unhandled) by exiting the guest with the KVM_EXIT_NMI
+ * exit reason. This new approach lets the machine check be injected
+ * into the guest address space with additional information in the
+ * form of an RTAS event, enabling the guest kernel to handle such
+ * errors suitably.
*
+ * For a guest that is not FWNMI capable (old QEMU), fall back to
+ * the old behaviour for backward compatibility:
+ * deliver unhandled/fatal (e.g. UE) MCE errors to the guest
+ * through a machine check interrupt (set HSRR0 to 0x200);
+ * for handled (non-fatal) errors, just go back to guest execution
+ * with the current HSRR0.
* If we receive a machine check with MSR(RI=0) then deliver it to
* the guest as a machine check, causing the guest to crash.
*/
ld r11, VCPU_MSR(r9)
rldicl. r0, r11, 64-MSR_HV_LG, 63 /* check if it happened in HV mode */
bne mc_cont /* if so, exit to host */
+ /* Check if guest is capable of handling NMI exit */
+ ld r10, VCPU_KVM(r9)
+ lbz r10, KVM_FWNMI(r10)
+ cmpdi r10, 1 /* FWNMI capable? */
+ beq mc_cont /* if so, exit with KVM_EXIT_NMI. */
+
+ /* if not, fall through for backward compatibility. */
andi. r10, r11, MSR_RI /* check for unrecoverable exception */
beq 1f /* Deliver a machine check to guest */
ld r10, VCPU_PC(r9)
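
The KVM_FWNMI offset used by the lbz above is not defined anywhere in
this excerpt; it would have to come from an asm-offsets.c entry for the
fwnmi_enabled field referenced earlier, roughly as follows (an assumed
addition, shown only to make the asm readable):

	/* arch/powerpc/kernel/asm-offsets.c -- assumed entry */
	DEFINE(KVM_FWNMI, offsetof(struct kvm, arch.fwnmi_enabled));
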
@@ -488,6 +488,32 @@ int opal_machine_check(struct pt_regs *regs)
return 0;
}
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+/*
+ * opal_machine_check_guest() is a hook invoked at the time of guest
+ * exit to facilitate host-side handling of a machine check exception
+ * that occurred in the guest, before the exception is passed on to
+ * the guest. It is invoked from host virtual mode by KVM (before
+ * exiting the guest with the KVM_EXIT_NMI reason) for machine check
+ * exceptions that occur in the guest.
+ *
+ * Currently no action is performed in the host other than printing
+ * the event information. The machine check exception is passed on
+ * to the guest kernel, which will attempt recovery.
+ */
+int opal_machine_check_guest(struct machine_check_event *evt)
+{
+ /* Print things out */
+ if (evt->version != MCE_V1) {
+ pr_err("Machine Check Exception, Unknown event version %d !\n",
+ evt->version);
+ return 0;
+ }
+ machine_check_print_event_info(evt);
+ return 0;
+}
+#endif
+
/* Early hmi handler called in real mode. */
int opal_hmi_exception_early(struct pt_regs *regs)
{
@@ -264,6 +264,9 @@ static void __init pnv_setup_machdep_opal(void)
ppc_md.mce_check_early_recovery = opal_mce_check_early_recovery;
ppc_md.hmi_exception_early = opal_hmi_exception_early;
ppc_md.handle_hmi_exception = opal_handle_hmi_exception;
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+ ppc_md.machine_check_exception_guest = opal_machine_check_guest;
+#endif
}
static int __init pnv_probe(void)