Message ID | 1487712121-16688-11-git-send-email-tbaicar@codeaurora.org (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Hi Tyler, On 21/02/17 21:22, Tyler Baicar wrote: > Currently external aborts are unsupported by the guest abort > handling. Add handling for SEAs so that the host kernel reports > SEAs which occur in the guest kernel. > diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h > index e22089f..33a77509 100644 > --- a/arch/arm/include/asm/kvm_arm.h > +++ b/arch/arm/include/asm/kvm_arm.h > @@ -187,6 +187,7 @@ > #define FSC_FAULT (0x04) > #define FSC_ACCESS (0x08) > #define FSC_PERM (0x0c) > +#define FSC_EXTABT (0x10) arm64 has ESR_ELx_FSC_EXTABT which is used in inject_abt64(), but for matching an external abort coming from hardware the range is wider. Looking at the ARM-ARMs 'ISS encoding for an exception from an Instruction Abort' in 'D7.2.27 ESR_ELx, Exception Syndrome Register (ELx)' (page D7-1954 of version 'k'...iss10775), the ten flavours of you Synchronous abort you hooked with do_sea() in patch 4 occupy 0x10 to 0x1f... > diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c > index a5265ed..04f1dd50 100644 > --- a/arch/arm/kvm/mmu.c > +++ b/arch/arm/kvm/mmu.c > @@ -29,6 +29,7 @@ > #include <asm/kvm_asm.h> > #include <asm/kvm_emulate.h> > #include <asm/virt.h> > +#include <asm/system_misc.h> > > #include "trace.h" > > @@ -1444,8 +1445,21 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run) > > /* Check the stage-2 fault is trans. fault or write fault */ > fault_status = kvm_vcpu_trap_get_fault_type(vcpu); ... kvm_vcpu_trap_get_fault_type() on both arm and arm64 masks the HSR/ESR_EL2 with 0x3c ... > - if (fault_status != FSC_FAULT && fault_status != FSC_PERM && > - fault_status != FSC_ACCESS) { > + > + /* The host kernel will handle the synchronous external abort. There > + * is no need to pass the error into the guest. > + */ > + if (fault_status == FSC_EXTABT) { ... but here we only check for 'Synchronous external abort, not on a translation table walk'. Are the other types relevant? 
If so we need some helper as this range is sparse and 'all other values are reserved'. The aarch32 HSR format is slightly different. (G6-4411 ISS encoding from an exception from a Data Abort). If not, can we change patch 4 to check this type too so we don't call out to APEI for a fault type we know isn't relevant. > + if(handle_guest_sea((unsigned long)fault_ipa, > + kvm_vcpu_get_hsr(vcpu))) { > + kvm_err("Failed to handle guest SEA, FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n", > + kvm_vcpu_trap_get_class(vcpu), > + (unsigned long)kvm_vcpu_trap_get_fault(vcpu), > + (unsigned long)kvm_vcpu_get_hsr(vcpu)); > + return -EFAULT; > + } > + } else if (fault_status != FSC_FAULT && fault_status != FSC_PERM && > + fault_status != FSC_ACCESS) { > kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n", > kvm_vcpu_trap_get_class(vcpu), > (unsigned long)kvm_vcpu_trap_get_fault(vcpu), > diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c > index b2d57fc..403277b 100644 > --- a/arch/arm64/mm/fault.c > +++ b/arch/arm64/mm/fault.c > @@ -602,6 +602,24 @@ static const char *fault_name(unsigned int esr) > } > > /* > + * Handle Synchronous External Aborts that occur in a guest kernel. > + */ > +int handle_guest_sea(unsigned long addr, unsigned int esr) > +{ > + if(IS_ENABLED(HAVE_ACPI_APEI_SEA)) { > + nmi_enter(); > + ghes_notify_sea(); > + nmi_exit(); This nmi stuff was needed for synchronous aborts that may have interrupted APEI's interrupts-masked code. We want to avoid trying to take the same set of locks, hence taking the in_nmi() path through APEI. Here we know we interrupted a guest, so there is no risk that we have interrupted APEI on the host. ghes_notify_sea() can safely take the normal path. Thanks, James
Hi Tyler, On 2017/2/22 5:22, Tyler Baicar wrote: > Currently external aborts are unsupported by the guest abort > handling. Add handling for SEAs so that the host kernel reports > SEAs which occur in the guest kernel. > > Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org> > --- > arch/arm/include/asm/kvm_arm.h | 1 + > arch/arm/include/asm/system_misc.h | 5 +++++ > arch/arm/kvm/mmu.c | 18 ++++++++++++++++-- > arch/arm64/include/asm/kvm_arm.h | 1 + > arch/arm64/include/asm/system_misc.h | 2 ++ > arch/arm64/mm/fault.c | 18 ++++++++++++++++++ > 6 files changed, 43 insertions(+), 2 deletions(-) > > diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h > index e22089f..33a77509 100644 > --- a/arch/arm/include/asm/kvm_arm.h > +++ b/arch/arm/include/asm/kvm_arm.h > @@ -187,6 +187,7 @@ > #define FSC_FAULT (0x04) > #define FSC_ACCESS (0x08) > #define FSC_PERM (0x0c) > +#define FSC_EXTABT (0x10) > > /* Hyp Prefetch Fault Address Register (HPFAR/HDFAR) */ > #define HPFAR_MASK (~0xf) > diff --git a/arch/arm/include/asm/system_misc.h b/arch/arm/include/asm/system_misc.h > index a3d61ad..ea45d94 100644 > --- a/arch/arm/include/asm/system_misc.h > +++ b/arch/arm/include/asm/system_misc.h > @@ -24,4 +24,9 @@ > > #endif /* !__ASSEMBLY__ */ > > +static inline int handle_guest_sea(unsigned long addr, unsigned int esr) > +{ > + return -1; > +} > + > #endif /* __ASM_ARM_SYSTEM_MISC_H */ > diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c > index a5265ed..04f1dd50 100644 > --- a/arch/arm/kvm/mmu.c > +++ b/arch/arm/kvm/mmu.c > @@ -29,6 +29,7 @@ > #include <asm/kvm_asm.h> > #include <asm/kvm_emulate.h> > #include <asm/virt.h> > +#include <asm/system_misc.h> > > #include "trace.h" > > @@ -1444,8 +1445,21 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run) > > /* Check the stage-2 fault is trans. 
fault or write fault */ > fault_status = kvm_vcpu_trap_get_fault_type(vcpu); > - if (fault_status != FSC_FAULT && fault_status != FSC_PERM && > - fault_status != FSC_ACCESS) { > + > + /* The host kernel will handle the synchronous external abort. There > + * is no need to pass the error into the guest. > + */ Can we inject an sea into the guest, so that the guest can kill the application which causes the error if the guest won't be terminated later. I'm not sure whether ghes_handle_memory_failure() called in ghes_do_proc() will kill the qemu process. I think it only kill user processes marked with PF_MCE_PROCESS & PF_MCE_EARLY. > + if (fault_status == FSC_EXTABT) { > + if(handle_guest_sea((unsigned long)fault_ipa, > + kvm_vcpu_get_hsr(vcpu))) { > + kvm_err("Failed to handle guest SEA, FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n", > + kvm_vcpu_trap_get_class(vcpu), > + (unsigned long)kvm_vcpu_trap_get_fault(vcpu), > + (unsigned long)kvm_vcpu_get_hsr(vcpu)); > + return -EFAULT; > + } > + } else if (fault_status != FSC_FAULT && fault_status != FSC_PERM && > + fault_status != FSC_ACCESS) { > kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n", > kvm_vcpu_trap_get_class(vcpu), > (unsigned long)kvm_vcpu_trap_get_fault(vcpu), > diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h > index 2a2752b..2b11d59 100644 > --- a/arch/arm64/include/asm/kvm_arm.h > +++ b/arch/arm64/include/asm/kvm_arm.h > @@ -201,6 +201,7 @@ > #define FSC_FAULT ESR_ELx_FSC_FAULT > #define FSC_ACCESS ESR_ELx_FSC_ACCESS > #define FSC_PERM ESR_ELx_FSC_PERM > +#define FSC_EXTABT ESR_ELx_FSC_EXTABT > > /* Hyp Prefetch Fault Address Register (HPFAR/HDFAR) */ > #define HPFAR_MASK (~UL(0xf)) > diff --git a/arch/arm64/include/asm/system_misc.h b/arch/arm64/include/asm/system_misc.h > index bc81243..5b2cecd1 100644 > --- a/arch/arm64/include/asm/system_misc.h > +++ b/arch/arm64/include/asm/system_misc.h > @@ -58,4 +58,6 @@ void hook_debug_fault_code(int nr, int (*fn)(unsigned long, 
unsigned int, > > #endif /* __ASSEMBLY__ */ > > +int handle_guest_sea(unsigned long addr, unsigned int esr); > + > #endif /* __ASM_SYSTEM_MISC_H */ > diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c > index b2d57fc..403277b 100644 > --- a/arch/arm64/mm/fault.c > +++ b/arch/arm64/mm/fault.c > @@ -602,6 +602,24 @@ static const char *fault_name(unsigned int esr) > } > > /* > + * Handle Synchronous External Aborts that occur in a guest kernel. > + */ > +int handle_guest_sea(unsigned long addr, unsigned int esr) > +{ > + /* > + * synchronize_rcu() will wait for nmi_exit(), so no need to > + * rcu_read_lock(). > + */ > + if(IS_ENABLED(HAVE_ACPI_APEI_SEA)) { > + nmi_enter(); > + ghes_notify_sea(); > + nmi_exit(); > + } > + > + return 0; > +} > + > +/* > * Dispatch a data abort to the relevant handler. > */ > asmlinkage void __exception do_mem_abort(unsigned long addr, unsigned int esr, > Thanks, Wang Xiongfeng
@@ -1444,8 +1445,21 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run) /* Check the stage-2 fault is trans. fault or write fault */ fault_status = kvm_vcpu_trap_get_fault_type(vcpu); - if (fault_status != FSC_FAULT && fault_status != FSC_PERM && - fault_status != FSC_ACCESS) { + + /* The host kernel will handle the synchronous external abort. There + * is no need to pass the error into the guest. + */ + if (fault_status == FSC_EXTABT) { + if(handle_guest_sea((unsigned long)fault_ipa, + kvm_vcpu_get_hsr(vcpu))) { + kvm_err("Failed to handle guest SEA, FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n", + kvm_vcpu_trap_get_class(vcpu), + (unsigned long)kvm_vcpu_trap_get_fault(vcpu), + (unsigned long)kvm_vcpu_get_hsr(vcpu)); + return -EFAULT; + } + } else if (fault_status != FSC_FAULT && fault_status != FSC_PERM && + fault_status != FSC_ACCESS) { kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n", kvm_vcpu_trap_get_class(vcpu), (unsigned long)kvm_vcpu_trap_get_fault(vcpu), If the error is an SEA and we want to inject the SEA into the guest, then once the handling has finished, should we return directly instead of continuing? As shown below: if (fault_status == FSC_EXTABT) { if(handle_guest_sea((unsigned long)fault_ipa, kvm_vcpu_get_hsr(vcpu))) { kvm_err("Failed to handle guest SEA, FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n", kvm_vcpu_trap_get_class(vcpu), (unsigned long)kvm_vcpu_trap_get_fault(vcpu), (unsigned long)kvm_vcpu_get_hsr(vcpu)); return -EFAULT; } else return 1; 2017-02-24 18:42 GMT+08:00 James Morse <james.morse@arm.com>: > Hi Tyler, > > On 21/02/17 21:22, Tyler Baicar wrote: >> Currently external aborts are unsupported by the guest abort >> handling. Add handling for SEAs so that the host kernel reports >> SEAs which occur in the guest kernel. 
> >> diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h >> index e22089f..33a77509 100644 >> --- a/arch/arm/include/asm/kvm_arm.h >> +++ b/arch/arm/include/asm/kvm_arm.h >> @@ -187,6 +187,7 @@ >> #define FSC_FAULT (0x04) >> #define FSC_ACCESS (0x08) >> #define FSC_PERM (0x0c) >> +#define FSC_EXTABT (0x10) > > arm64 has ESR_ELx_FSC_EXTABT which is used in inject_abt64(), but for matching > an external abort coming from hardware the range is wider. > > Looking at the ARM-ARMs 'ISS encoding for an exception from an Instruction > Abort' in 'D7.2.27 ESR_ELx, Exception Syndrome Register (ELx)' (page D7-1954 of > version 'k'...iss10775), the ten flavours of you Synchronous abort you hooked > with do_sea() in patch 4 occupy 0x10 to 0x1f... > > >> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c >> index a5265ed..04f1dd50 100644 >> --- a/arch/arm/kvm/mmu.c >> +++ b/arch/arm/kvm/mmu.c >> @@ -29,6 +29,7 @@ >> #include <asm/kvm_asm.h> >> #include <asm/kvm_emulate.h> >> #include <asm/virt.h> >> +#include <asm/system_misc.h> >> >> #include "trace.h" >> >> @@ -1444,8 +1445,21 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run) >> >> /* Check the stage-2 fault is trans. fault or write fault */ >> fault_status = kvm_vcpu_trap_get_fault_type(vcpu); > > ... kvm_vcpu_trap_get_fault_type() on both arm and arm64 masks the HSR/ESR_EL2 > with 0x3c ... > > >> - if (fault_status != FSC_FAULT && fault_status != FSC_PERM && >> - fault_status != FSC_ACCESS) { >> + >> + /* The host kernel will handle the synchronous external abort. There >> + * is no need to pass the error into the guest. >> + */ >> + if (fault_status == FSC_EXTABT) { > > ... but here we only check for 'Synchronous external abort, not on a translation > table walk'. Are the other types relevant? > > If so we need some helper as this range is sparse and 'all other values are > reserved'. The aarch32 HSR format is slightly different. 
(G6-4411 ISS encoding > from an exception from a Data Abort). > > If not, can we change patch 4 to check this type too so we don't call out to > APEI for a fault type we know isn't relevant. > > >> + if(handle_guest_sea((unsigned long)fault_ipa, >> + kvm_vcpu_get_hsr(vcpu))) { >> + kvm_err("Failed to handle guest SEA, FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n", >> + kvm_vcpu_trap_get_class(vcpu), >> + (unsigned long)kvm_vcpu_trap_get_fault(vcpu), >> + (unsigned long)kvm_vcpu_get_hsr(vcpu)); >> + return -EFAULT; >> + } >> + } else if (fault_status != FSC_FAULT && fault_status != FSC_PERM && >> + fault_status != FSC_ACCESS) { >> kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n", >> kvm_vcpu_trap_get_class(vcpu), >> (unsigned long)kvm_vcpu_trap_get_fault(vcpu), > >> diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c >> index b2d57fc..403277b 100644 >> --- a/arch/arm64/mm/fault.c >> +++ b/arch/arm64/mm/fault.c >> @@ -602,6 +602,24 @@ static const char *fault_name(unsigned int esr) >> } >> >> /* >> + * Handle Synchronous External Aborts that occur in a guest kernel. >> + */ >> +int handle_guest_sea(unsigned long addr, unsigned int esr) >> +{ > >> + if(IS_ENABLED(HAVE_ACPI_APEI_SEA)) { >> + nmi_enter(); >> + ghes_notify_sea(); >> + nmi_exit(); > > This nmi stuff was needed for synchronous aborts that may have interrupted > APEI's interrupts-masked code. We want to avoid trying to take the same set of > locks, hence taking the in_nmi() path through APEI. Here we know we interrupted > a guest, so there is no risk that we have interrupted APEI on the host. > ghes_notify_sea() can safely take the normal path. > > > Thanks, > > James
Hi Wang Xiongfeng, On 25/02/17 07:15, Xiongfeng Wang wrote: > On 2017/2/22 5:22, Tyler Baicar wrote: >> Currently external aborts are unsupported by the guest abort >> handling. Add handling for SEAs so that the host kernel reports >> SEAs which occur in the guest kernel. >> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c >> index a5265ed..04f1dd50 100644 >> --- a/arch/arm/kvm/mmu.c >> +++ b/arch/arm/kvm/mmu.c >> @@ -1444,8 +1445,21 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run) >> >> /* Check the stage-2 fault is trans. fault or write fault */ >> fault_status = kvm_vcpu_trap_get_fault_type(vcpu); >> - if (fault_status != FSC_FAULT && fault_status != FSC_PERM && >> - fault_status != FSC_ACCESS) { >> + >> + /* The host kernel will handle the synchronous external abort. There >> + * is no need to pass the error into the guest. >> + */ > Can we inject an sea into the guest, so that the guest can kill the > application which causes the error if the guest won't be terminated > later. I'm not sure whether ghes_handle_memory_failure() called in > ghes_do_proc() will kill the qemu process. I think it only kill user > processes marked with PF_MCE_PROCESS & PF_MCE_EARLY. My understanding is the pages will get unmapped and recovered where possible (e.g. re-read from disk), and the user space process will get SIGBUS/SIGSEGV when it next tries to access that page, which could be some time later. These flags in find_early_kill_thread() are a way to make the memory-failure code signal the process early, before it does any recovery. The 'MCE' makes me think it's x86 specific. (early and late are described more in [0]) Guests are a special case as QEMU may never access the faulty memory itself, so it won't receive the 'late' signal. It looks like ARM/arm64 KVM lacks support for KVM_PFN_ERR_HWPOISON which sends SIGBUS from KVM's fault-handling code. I have patches to add support for this which I intend to send at rc1. 
[0] suggests 'KVM qemu' sets these MCE flags to take the 'early' path, but given x86's KVM_PFN_ERR_HWPOISON, this may be out of date. Either way, once QEMU gets a signal indicating the virtual address, it can generate its own APEI CPER records and use the KVM APIs to mock up a Synchronous External Abort (or inject an IRQ or run the vcpu waiting for the guest's polling thread to come round, whichever was described to the guest via the HEST/GHES tables). We can't hand the APEI CPER records we have in the kernel to the guest, as they hold a host physical address, and maybe a host virtual address. We don't know where in guest memory we could write new APEI CPER records as these locations have to be reserved in the guest's UEFI memory map, and only QEMU knows where they are. To deliver RAS events to a guest we have to get QEMU involved. Thanks, James [0] https://www.kernel.org/doc/Documentation/vm/hwpoison.txt
Hi James, On 2017/2/27 21:58, James Morse wrote: > Hi Wang Xiongfeng, > > On 25/02/17 07:15, Xiongfeng Wang wrote: >> On 2017/2/22 5:22, Tyler Baicar wrote: >>> Currently external aborts are unsupported by the guest abort >>> handling. Add handling for SEAs so that the host kernel reports >>> SEAs which occur in the guest kernel. > >>> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c >>> index a5265ed..04f1dd50 100644 >>> --- a/arch/arm/kvm/mmu.c >>> +++ b/arch/arm/kvm/mmu.c >>> @@ -1444,8 +1445,21 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run) >>> >>> /* Check the stage-2 fault is trans. fault or write fault */ >>> fault_status = kvm_vcpu_trap_get_fault_type(vcpu); >>> - if (fault_status != FSC_FAULT && fault_status != FSC_PERM && >>> - fault_status != FSC_ACCESS) { >>> + >>> + /* The host kernel will handle the synchronous external abort. There >>> + * is no need to pass the error into the guest. >>> + */ > >> Can we inject an sea into the guest, so that the guest can kill the >> application which causes the error if the guest won't be terminated >> later. I'm not sure whether ghes_handle_memory_failure() called in >> ghes_do_proc() will kill the qemu process. I think it only kill user >> processes marked with PF_MCE_PROCESS & PF_MCE_EARLY. > > My understanding is the pages will get unmapped and recovered where possible > (e.g. re-read from disk), the user space process will get SIGBUS/SIGSEV when it > next tries to access that page, which could be some time later. > These flags in find_early_kill_thread() are a way to make the memory-failure > code signal the process early, before it does any recovery. The 'MCE' makes me > think its x86 specific. > (early and late are described more in [0]) > > > Guests are a special case as QEMU may never access the faulty memory itself, so > it won't receive the 'late' signal. It looks like ARM/arm64 KVM lacks support > for KVM_PFN_ERR_HWPOISON which sends SIGBUS from KVM's fault-handling code. 
I > have patches to add support for this which I intend to send at rc1. > > [0] suggests 'KVM qemu' sets these MCE flags to take the 'early' path, but given > x86s KVM_PFN_ERR_HWPOISON, this may be out of date. > > > Either way, once QEMU gets a signal indicating the virtual address, it can > generate its own APEI CPER records and use the KVM APIs to mock up an > Synchronous External Abort, (or inject an IRQ or run the vcpu waiting for the > guest's polling thread to come round, whichever was described to the guest via > the HEST/GHES tables). > > We can't hand the APEI CPER records we have in the kernel to the guest, as they > hold a host physical address, and maybe a host virtual address. We don't know > where in guest memory we could write new APEI CPER records as these locations > have to be reserved in the guests-UEFI memory map, and only QEMU knows where > they are. > > To deliver RAS events to a guest we have to get QEMU involved. Thanks for your reply! I have another idea about the handling procedure of SEA. Can we divide the SEA handling procedure into two procedures? The first procedure does the more urgent work, including sending SIGBUS to the user process or panicking, just as PATCH 04/10 does. The second procedure does the APEI analysis work, including calling memory_failure. The second procedure is executed when actual errors are detected in memory, such as when a 2-bit ECC error is detected on a memory read or write, in which case a fault handling interrupt is generated by the memory controller, as the RAS Extension specification says. We can route this fault handling interrupt into EL3. After the BIOS has filled the HEST table, it can notify the OS with an IRQ. And the second procedure is executed in the IRQ handler. The notification type of HEST/GHES tables is GSIV. When an uncorrectable data error is detected on write data, a fault handling interrupt is generated, and no SEA is generated, as section 6.4.4 of the RAS Extension specification says. 
In this situation, the second procedure should be executed since the error occurs in memory. In the ARM/arm64 KVM situation, when an SEA takes place, an SEA is injected into the guest OS directly in kvm_handle_guest_abort(). And the guest OS can execute the first procedure. When the host OS executes the second procedure and analyses the HEST table, it sends SIGBUS to the qemu process in memory_failure(). And the qemu process can mock up a HEST table with the IPA of the error data. Then the qemu process can notify the guest OS with an IRQ, and the second procedure is executed in the guest OS. Is this idea reasonable? Thanks! Wang Xiongfeng
Hi, On 28/02/17 06:25, Xiongfeng Wang wrote: > On 2017/2/27 21:58, James Morse wrote: >> On 25/02/17 07:15, Xiongfeng Wang wrote: >>> Can we inject an sea into the guest, so that the guest can kill the >>> application which causes the error if the guest won't be terminated >>> later. I'm not sure whether ghes_handle_memory_failure() called in >>> ghes_do_proc() will kill the qemu process. I think it only kill user >>> processes marked with PF_MCE_PROCESS & PF_MCE_EARLY. >> >> My understanding is the pages will get unmapped and recovered where possible >> (e.g. re-read from disk), the user space process will get SIGBUS/SIGSEV when it >> next tries to access that page, which could be some time later. >> These flags in find_early_kill_thread() are a way to make the memory-failure >> code signal the process early, before it does any recovery. The 'MCE' makes me >> think its x86 specific. >> (early and late are described more in [0]) >> >> >> Guests are a special case as QEMU may never access the faulty memory itself, so >> it won't receive the 'late' signal. It looks like ARM/arm64 KVM lacks support >> for KVM_PFN_ERR_HWPOISON which sends SIGBUS from KVM's fault-handling code. I >> have patches to add support for this which I intend to send at rc1. >> >> [0] suggests 'KVM qemu' sets these MCE flags to take the 'early' path, but given >> x86s KVM_PFN_ERR_HWPOISON, this may be out of date. >> >> >> Either way, once QEMU gets a signal indicating the virtual address, it can >> generate its own APEI CPER records and use the KVM APIs to mock up an >> Synchronous External Abort, (or inject an IRQ or run the vcpu waiting for the >> guest's polling thread to come round, whichever was described to the guest via >> the HEST/GHES tables). >> >> We can't hand the APEI CPER records we have in the kernel to the guest, as they >> hold a host physical address, and maybe a host virtual address. 
We don't know >> where in guest memory we could write new APEI CPER records as these locations >> have to be reserved in the guests-UEFI memory map, and only QEMU knows where >> they are. >> >> To deliver RAS events to a guest we have to get QEMU involved. > I have another idea about the handling procedure of SEA. Can we divide > the SEA handing procedure into two procedures? The first procedure does > the more urgent work, including sending SIGBUS to user process or panic, > just as PATCH 04/10 does. How do we know which user processes to signal? (There may be more than one - we need a memory address to find them). How do we know if this error is serious and we should panic? This information is in the APEI CPER records. > The second procedure does the APEI analysis > work, including calling memory_failure. The second procedure is executed > when actual errors detected in memory, such as a 2-bit ECC error is > detected on memory read or write, in which case, a fault handling > interrupt is generated by the memory controller, as RAS Extension > specification says. You are splitting the APEI notification and the processing of records. One has to happen immediately after the other because we want to contain the error. > We can route this fault handling interrupt into EL3. After BIOS has > filled the HEST table, it can notify OS with an IRQ. And the second > procedure is executed in the IRQ handler. The notification type of > HEST/GHES tables is GSIV. > > When uncorrectable data error is detected on write data, a fault > handling interrupt is generated, and no SEA is generated, This sounds more like ACPI's firmware first error handling. Yes errors should be routed to EL3 where firmware can do some platform-specific work, then describe them to the host OS via CPER records. By doing this, you could prevent a hardware-generated External Abort reaching the host OS, but you still need to notify the OS via one of the mechanisms in '18.3.2.9 Hardware Error Notification'. 
If the error is synchronous (we read a bad page of memory instead of it being detected on background DRAM scrub) we need to notify the OS synchronously. Using SEA would be a firmware-generated External Abort delivered to EL2/EL1. However the notification is done it needs to match one of the GHES records in the HEST, so firmware has to decide which notification methods it will use before we boot the OS. > In ARM/arm64 KVM situation, when an SEA takes place, an SEA is injected > into guest os directly in kvm_handle_guest_abort(). And the guest os can > execute the first procedure. What can the guest do with this? Without the APEI CPER records it doesn't know what happened. Was it unrecoverable memory corruption? In which case killing the running task is a start... Which memory ranges should we mark as unusable? Maybe it was something more catastrophic for the running CPU, in which case we should panic(). > When the host OS executes the second procedure and analyses the HEST > table, it sends SIGBUS to qemu process in memory_failure(). And the qemu > process can mock up a HEST table with IPA of the error data. Then the > qemu process can notify the guest OS with an IRQ, and the second > procedure is executed in guest OS. Is this idea reasonable? So we tell the guest something happened, and it should wait a while to find out what... I don't think this will work. It is best to not run the guest until Qemu has done its work and called VCPU_RUN again. This way we only notify the guest once the records are available for processing. This is how APEI's firmware-first works between the host OS and EL3, it should be the same between a guest and QEMU (which plays the part of firmware for the guest). Can you share more details of the problem you are trying to solve? I don't think we can get RAS support in a guest 'for free', somewhere along the line we need support from Qemu. Thanks, James
Hello James, On 2/24/2017 3:42 AM, James Morse wrote: > On 21/02/17 21:22, Tyler Baicar wrote: >> Currently external aborts are unsupported by the guest abort >> handling. Add handling for SEAs so that the host kernel reports >> SEAs which occur in the guest kernel. >> diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h >> index e22089f..33a77509 100644 >> --- a/arch/arm/include/asm/kvm_arm.h >> +++ b/arch/arm/include/asm/kvm_arm.h >> @@ -187,6 +187,7 @@ >> #define FSC_FAULT (0x04) >> #define FSC_ACCESS (0x08) >> #define FSC_PERM (0x0c) >> +#define FSC_EXTABT (0x10) > arm64 has ESR_ELx_FSC_EXTABT which is used in inject_abt64(), but for matching > an external abort coming from hardware the range is wider. > > Looking at the ARM-ARMs 'ISS encoding for an exception from an Instruction > Abort' in 'D7.2.27 ESR_ELx, Exception Syndrome Register (ELx)' (page D7-1954 of > version 'k'...iss10775), the ten flavours of you Synchronous abort you hooked > with do_sea() in patch 4 occupy 0x10 to 0x1f... > > >> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c >> index a5265ed..04f1dd50 100644 >> --- a/arch/arm/kvm/mmu.c >> +++ b/arch/arm/kvm/mmu.c >> @@ -29,6 +29,7 @@ >> #include <asm/kvm_asm.h> >> #include <asm/kvm_emulate.h> >> #include <asm/virt.h> >> +#include <asm/system_misc.h> >> >> #include "trace.h" >> >> @@ -1444,8 +1445,21 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run) >> >> /* Check the stage-2 fault is trans. fault or write fault */ >> fault_status = kvm_vcpu_trap_get_fault_type(vcpu); > ... kvm_vcpu_trap_get_fault_type() on both arm and arm64 masks the HSR/ESR_EL2 > with 0x3c ... > > >> - if (fault_status != FSC_FAULT && fault_status != FSC_PERM && >> - fault_status != FSC_ACCESS) { >> + >> + /* The host kernel will handle the synchronous external abort. There >> + * is no need to pass the error into the guest. >> + */ >> + if (fault_status == FSC_EXTABT) { > ... 
but here we only check for 'Synchronous external abort, not on a translation > table walk'. Are the other types relevant? I believe the others should be relevant here as well. I haven't been able to test the other types within a guest though. > If so we need some helper as this range is sparse and 'all other values are > reserved'. The aarch32 HSR format is slightly different. (G6-4411 ISS encoding > from an exception from a Data Abort). I can add a helper so that this if statement matches any of the 10 FSC values which are mapped to the do_sea in the host code. > If not, can we change patch 4 to check this type too so we don't call out to > APEI for a fault type we know isn't relevant. > > >> + if(handle_guest_sea((unsigned long)fault_ipa, >> + kvm_vcpu_get_hsr(vcpu))) { >> + kvm_err("Failed to handle guest SEA, FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n", >> + kvm_vcpu_trap_get_class(vcpu), >> + (unsigned long)kvm_vcpu_trap_get_fault(vcpu), >> + (unsigned long)kvm_vcpu_get_hsr(vcpu)); >> + return -EFAULT; >> + } >> + } else if (fault_status != FSC_FAULT && fault_status != FSC_PERM && >> + fault_status != FSC_ACCESS) { >> kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n", >> kvm_vcpu_trap_get_class(vcpu), >> (unsigned long)kvm_vcpu_trap_get_fault(vcpu), >> diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c >> index b2d57fc..403277b 100644 >> --- a/arch/arm64/mm/fault.c >> +++ b/arch/arm64/mm/fault.c >> @@ -602,6 +602,24 @@ static const char *fault_name(unsigned int esr) >> } >> >> /* >> + * Handle Synchronous External Aborts that occur in a guest kernel. >> + */ >> +int handle_guest_sea(unsigned long addr, unsigned int esr) >> +{ >> + if(IS_ENABLED(HAVE_ACPI_APEI_SEA)) { >> + nmi_enter(); >> + ghes_notify_sea(); >> + nmi_exit(); > This nmi stuff was needed for synchronous aborts that may have interrupted > APEI's interrupts-masked code. We want to avoid trying to take the same set of > locks, hence taking the in_nmi() path through APEI. 
Here we know we interrupted > a guest, so there is no risk that we have interrupted APEI on the host. > ghes_notify_sea() can safely take the normal path. Makes sense, I can remove the nmi_* calls here. Thanks, Tyler
Hi James, On 2017/2/28 21:21, James Morse wrote: > Hi, > > On 28/02/17 06:25, Xiongfeng Wang wrote: >> On 2017/2/27 21:58, James Morse wrote: >>> On 25/02/17 07:15, Xiongfeng Wang wrote: >>>> Can we inject an sea into the guest, so that the guest can kill the >>>> application which causes the error if the guest won't be terminated >>>> later. I'm not sure whether ghes_handle_memory_failure() called in >>>> ghes_do_proc() will kill the qemu process. I think it only kill user >>>> processes marked with PF_MCE_PROCESS & PF_MCE_EARLY. >>> >>> My understanding is the pages will get unmapped and recovered where possible >>> (e.g. re-read from disk), the user space process will get SIGBUS/SIGSEV when it >>> next tries to access that page, which could be some time later. >>> These flags in find_early_kill_thread() are a way to make the memory-failure >>> code signal the process early, before it does any recovery. The 'MCE' makes me >>> think its x86 specific. >>> (early and late are described more in [0]) >>> >>> >>> Guests are a special case as QEMU may never access the faulty memory itself, so >>> it won't receive the 'late' signal. It looks like ARM/arm64 KVM lacks support >>> for KVM_PFN_ERR_HWPOISON which sends SIGBUS from KVM's fault-handling code. I >>> have patches to add support for this which I intend to send at rc1. >>> >>> [0] suggests 'KVM qemu' sets these MCE flags to take the 'early' path, but given >>> x86s KVM_PFN_ERR_HWPOISON, this may be out of date. >>> >>> >>> Either way, once QEMU gets a signal indicating the virtual address, it can >>> generate its own APEI CPER records and use the KVM APIs to mock up an >>> Synchronous External Abort, (or inject an IRQ or run the vcpu waiting for the >>> guest's polling thread to come round, whichever was described to the guest via >>> the HEST/GHES tables). >>> >>> We can't hand the APEI CPER records we have in the kernel to the guest, as they >>> hold a host physical address, and maybe a host virtual address. 
We don't know >>> where in guest memory we could write new APEI CPER records as these locations >>> have to be reserved in the guests-UEFI memory map, and only QEMU knows where >>> they are. >>> >>> To deliver RAS events to a guest we have to get QEMU involved. > >> I have another idea about the handling procedure of SEA. Can we divide >> the SEA handing procedure into two procedures? The first procedure does >> the more urgent work, including sending SIGBUS to user process or panic, >> just as PATCH 04/10 does. > > How do we know which user processes to signal? (There may be more than one - we > need a memory address to find them). > How do we know if this error is serious and we should panic? > This information is in the APEI CPER records. > Since the SEA exception is synchronous, the current user process is the one to be signaled if the exception is taken from EL0. Certainly, the error memory may be mapped to several processes. When another process read that area again, another SEA will be generated, and that process will be signaled. Also we can get the virtual address of the error data from FAR_EL1. When the user process is signaled, virtual address is attached, and the process can register its own signal handler if it can handle the error according to the virtual address of the error data. We can determine where the exception is taken from according to CPSR stored in the stack. And if the exception is taken from EL1, the error is in the kernel space now, and we are going to consume it, so we need to panic now. > >> The second procedure does the APEI analysis >> work, including calling memory_failure. The second procedure is executed >> when actual errors detected in memory, such as a 2-bit ECC error is >> detected on memory read or write, in which case, a fault handling >> interrupt is generated by the memory controller, as RAS Extension >> specification says. > > You are splitting the APEI notification and the processing of records. 
One has > to happen immediately after the other because we want to contain the error. > My understanding is that processing of records is not so urgent since the process access the error data has been killed (The first procedure is executed in SEA exception handler). Other codes won't access the error data, so the error won't be consumed and propagated. > >> We can route this fault handling interrupt into EL3. After BIOS has >> filled the HEST table, it can notify OS with an IRQ. And the second >> procedure is executed in the IRQ handler. The notification type of >> HEST/GHES tables is GSIV. >> >> When uncorrectable data error is detected on write data, a fault >> handling interrupt is generated, and no SEA is generated, > > This sounds more like ACPI's firmware first error handling. Yes errors should be > routed to EL3 where firmware can do some platform-specific work, then describe > them to the host OS via CPER records. Yes , I'm saying the ACPI's firmware first error handling. > By doing this, you could prevent a hardware-generated External Abort reaching > the host OS, but you still need to notify the OS via one of the mechanisms in > '18.3.2.9 Hardware Error Notification'. Yes, the BIOS will notify OS with GSIV notify type, which will rely on Shiju's patch 'acpi: apei: handle GSIV notification type' > > If the error is synchronous (we read a bad page of memory instead of it being > detected on background DRAM scrub) we need to notify the OS synchronously. Using > SEA would be a firmware-generated External Abort delivered to EL2/EL1. Yes, the first procedure is executed in SEA exception handler and is synchronous. The second procedure won't access the error data and is not so urgent, so it may not need to be synchronous. > > However the notification is done it needs to match one of the GHES records in > the HEST, so firmware has to decide which notification methods it will use > before we boot the OS. 
> > >> In ARM/arm64 KVM situation, when an SEA takes place, an SEA is injected >> into guest os directly in kvm_handle_guest_abort(). And the guest os can >> execute the first procedure. > > What can the guest do with this? Without the APEI CPER records it doesn't know > what happened. Was it unrecoverable memory corruption? In which case killing the > running task is a start... Which memory ranges should we mark as unusable? Maybe > it was something more catastrophic for the running CPU, in which case we should > panic(). > If an SEA is injected into guest OS, the guest OS will jump to the SEA exception entry when the context switched to guest OS. And the CPSR and FAR_EL1 are recovered according to the content of vcpu. Then the guest OS can signal a process or panic. If another guest process read the error data, another SEA will be generated and it will be single too. Without QEMU involved, the drawback is that no APEI table can be mocked up in guest OS, and no memory_failure() will be called. So the memory of error data will be released into buddy system and assigned to another process. If the error was caused by instantaneous radiation or electromagnetic, the memory is usable again if it is written with a correct data. If the memory has wore out and a correct data is written, the ECC error may occurs again with high possibility. Before a 2-bit ECC error is reported, much more 1-bit errors will be reported. This is report to host os, the host os can determine the memory node has worn out and hot-plug out the memory node, and guest os may be terminated if its memory data can't be migrated. Of course, it is better to get QEMU involved, so the memory_failure can be executed in guest OS. But before that implemented, can we add SEA injection in kvm_handle_guest_abort()? > >> When the host OS executes the second procedure and analyses the HEST >> table, it sends SIGBUS to qemu process in memory_failure(). 
And the qemu >> process can mock up a HEST table with IPA of the error data. Then the >> qemu process can notify the guest OS with an IRQ, and the second >> procedure is executed in guest OS. Is this idea reasonable? > > So we tell the guest something happened, and it should wait a while to find out > what... I don't think this will work. It is best to not run the guest until Qemu > has done its work and called VCPU_RUN again. This way we only notify the guest > once the records are available for processing. This is how APEI's firmware-first > works between the host OS and EL3, it should be the same between a guest and > QEMU (which plays the part of firmware for the guest). > > > Can you share more details of the problem you are trying to solve? I don't think > we can get RAS support in a guest 'for free', somewhere along the line we need > support from Qemu. > Thanks, Wang Xiongfeng .
On 01/03/17 02:31, Xiongfeng Wang wrote: [lot of things] > If an SEA is injected into guest OS, the guest OS will jump to the SEA > exception entry when the context switched to guest OS. And the CPSR and > FAR_EL1 are recovered according to the content of vcpu. Then the guest > OS can signal a process or panic. If another guest process read the > error data, another SEA will be generated and it will be single too. > > Without QEMU involved, the drawback is that no APEI table can be mocked > up in guest OS, and no memory_failure() will be called. So the memory of > error data will be released into buddy system and assigned to another > process. If the error was caused by instantaneous radiation or > electromagnetic, the memory is usable again if it is written with a > correct data. If the memory has wore out and a correct data is written, > the ECC error may occurs again with high possibility. Before a 2-bit ECC > error is reported, much more 1-bit errors will be reported. This is > report to host os, the host os can determine the memory node has worn > out and hot-plug out the memory node, and guest os may be terminated if > its memory data can't be migrated. > > Of course, it is better to get QEMU involved, so the memory_failure can > be executed in guest OS. But before that implemented, can we add SEA > injection in kvm_handle_guest_abort()? No. I will strongly object to that. This is a platform decision to forward SEAs, not an architectural one. The core KVM code is only concerned about implementing the ARM architecture, and not something that is firmware dependent. Thanks, M.
Hi James, > Hi Wang Xiongfeng, > > On 25/02/17 07:15, Xiongfeng Wang wrote: >> On 2017/2/22 5:22, Tyler Baicar wrote: >>> Currently external aborts are unsupported by the guest abort >>> handling. Add handling for SEAs so that the host kernel reports >>> SEAs which occur in the guest kernel. > >>> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c >>> index a5265ed..04f1dd50 100644 >>> --- a/arch/arm/kvm/mmu.c >>> +++ b/arch/arm/kvm/mmu.c >>> @@ -1444,8 +1445,21 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run) >>> >>> /* Check the stage-2 fault is trans. fault or write fault */ >>> fault_status = kvm_vcpu_trap_get_fault_type(vcpu); >>> - if (fault_status != FSC_FAULT && fault_status != FSC_PERM && >>> - fault_status != FSC_ACCESS) { >>> + >>> + /* The host kernel will handle the synchronous external abort. There >>> + * is no need to pass the error into the guest. >>> + */ > >> Can we inject an sea into the guest, so that the guest can kill the >> application which causes the error if the guest won't be terminated >> later. I'm not sure whether ghes_handle_memory_failure() called in >> ghes_do_proc() will kill the qemu process. I think it only kill user >> processes marked with PF_MCE_PROCESS & PF_MCE_EARLY. > > My understanding is the pages will get unmapped and recovered where possible > (e.g. re-read from disk), the user space process will get SIGBUS/SIGSEV when it > next tries to access that page, which could be some time later. > These flags in find_early_kill_thread() are a way to make the memory-failure > code signal the process early, before it does any recovery. The 'MCE' makes me > think its x86 specific. > (early and late are described more in [0]) > > > Guests are a special case as QEMU may never access the faulty memory itself, so > it won't receive the 'late' signal. It looks like ARM/arm64 KVM lacks support > for KVM_PFN_ERR_HWPOISON which sends SIGBUS from KVM's fault-handling code. 
I > have patches to add support for this which I intend to send at rc1. could you push this patch to opensource? > > [0] suggests 'KVM qemu' sets these MCE flags to take the 'early' path, but given > x86s KVM_PFN_ERR_HWPOISON, this may be out of date. > > > Either way, once QEMU gets a signal indicating the virtual address, it can > generate its own APEI CPER records and use the KVM APIs to mock up an > Synchronous External Abort, (or inject an IRQ or run the vcpu waiting for the > guest's polling thread to come round, whichever was described to the guest via > the HEST/GHES tables). > > We can't hand the APEI CPER records we have in the kernel to the guest, as they > hold a host physical address, and maybe a host virtual address. We don't know > where in guest memory we could write new APEI CPER records as these locations > have to be reserved in the guests-UEFI memory map, and only QEMU knows where > they are. > > To deliver RAS events to a guest we have to get QEMU involved. > > > Thanks, > > James > > > [0] https://www.kernel.org/doc/Documentation/vm/hwpoison.txt >
Hi Marc, On 2017/3/2 17:39, Marc Zyngier wrote: > On 01/03/17 02:31, Xiongfeng Wang wrote: > > [lot of things] > >> If an SEA is injected into guest OS, the guest OS will jump to the SEA >> exception entry when the context switched to guest OS. And the CPSR and >> FAR_EL1 are recovered according to the content of vcpu. Then the guest >> OS can signal a process or panic. If another guest process read the >> error data, another SEA will be generated and it will be single too. >> >> Without QEMU involved, the drawback is that no APEI table can be mocked >> up in guest OS, and no memory_failure() will be called. So the memory of >> error data will be released into buddy system and assigned to another >> process. If the error was caused by instantaneous radiation or >> electromagnetic, the memory is usable again if it is written with a >> correct data. If the memory has wore out and a correct data is written, >> the ECC error may occurs again with high possibility. Before a 2-bit ECC >> error is reported, much more 1-bit errors will be reported. This is >> report to host os, the host os can determine the memory node has worn >> out and hot-plug out the memory node, and guest os may be terminated if >> its memory data can't be migrated. >> >> Of course, it is better to get QEMU involved, so the memory_failure can >> be executed in guest OS. But before that implemented, can we add SEA >> injection in kvm_handle_guest_abort()? > > No. I will strongly object to that. This is a platform decision to > forward SEAs, not an architectural one. The core KVM code is only > concerned about implementing the ARM architecture, and not something > that is firmware dependent. > Thanks for the reply! I'm not sure if there exists some misunderstanding here. I was saying that APEI stuff is not included in the core KVM code, but SEA injection can be included, just like the SEI injection in the core KVM code. Yes, APEI is firmware dependent, but I think SEA is not. 
And the processing for SEA doesn't depend on whether APEI is implemented. Thanks, Wang Xiongfeng .
Hi Tyler, On 28/02/17 19:43, Baicar, Tyler wrote: > On 2/24/2017 3:42 AM, James Morse wrote: >> On 21/02/17 21:22, Tyler Baicar wrote: >>> Currently external aborts are unsupported by the guest abort >>> handling. Add handling for SEAs so that the host kernel reports >>> SEAs which occur in the guest kernel. >>> diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c >>> index b2d57fc..403277b 100644 >>> --- a/arch/arm64/mm/fault.c >>> +++ b/arch/arm64/mm/fault.c >>> @@ -602,6 +602,24 @@ static const char *fault_name(unsigned int esr) >>> } >>> >>> /* >>> + * Handle Synchronous External Aborts that occur in a guest kernel. >>> + */ >>> +int handle_guest_sea(unsigned long addr, unsigned int esr) >>> +{ >>> + if(IS_ENABLED(HAVE_ACPI_APEI_SEA)) { >>> + nmi_enter(); >>> + ghes_notify_sea(); >>> + nmi_exit(); >> This nmi stuff was needed for synchronous aborts that may have interrupted >> APEI's interrupts-masked code. We want to avoid trying to take the same set of >> locks, hence taking the in_nmi() path through APEI. Here we know we interrupted >> a guest, so there is no risk that we have interrupted APEI on the host. >> ghes_notify_sea() can safely take the normal path. > Makes sense, I can remove the nmi_* calls here. Just occurs to me: if we do this we need to add the rcu_read_lock() in ghes_notify_sea() as its not protected by the rcu/nmi weirdness. Thanks, James
Hello James, On 3/6/2017 3:28 AM, James Morse wrote: > On 28/02/17 19:43, Baicar, Tyler wrote: >> On 2/24/2017 3:42 AM, James Morse wrote: >>> On 21/02/17 21:22, Tyler Baicar wrote: >>>> Currently external aborts are unsupported by the guest abort >>>> handling. Add handling for SEAs so that the host kernel reports >>>> SEAs which occur in the guest kernel. >>>> diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c >>>> index b2d57fc..403277b 100644 >>>> --- a/arch/arm64/mm/fault.c >>>> +++ b/arch/arm64/mm/fault.c >>>> @@ -602,6 +602,24 @@ static const char *fault_name(unsigned int esr) >>>> } >>>> >>>> /* >>>> + * Handle Synchronous External Aborts that occur in a guest kernel. >>>> + */ >>>> +int handle_guest_sea(unsigned long addr, unsigned int esr) >>>> +{ >>>> + if(IS_ENABLED(HAVE_ACPI_APEI_SEA)) { >>>> + nmi_enter(); >>>> + ghes_notify_sea(); >>>> + nmi_exit(); >>> This nmi stuff was needed for synchronous aborts that may have interrupted >>> APEI's interrupts-masked code. We want to avoid trying to take the same set of >>> locks, hence taking the in_nmi() path through APEI. Here we know we interrupted >>> a guest, so there is no risk that we have interrupted APEI on the host. >>> ghes_notify_sea() can safely take the normal path. >> Makes sense, I can remove the nmi_* calls here. > Just occurs to me: if we do this we need to add the rcu_read_lock() in > ghes_notify_sea() as its not protected by the rcu/nmi weirdness. > True, would you suggest leaving these nmi_* calls or adding the rcu_* calls? And since that's only needed for this KVM case, shouldn't the rcu_* calls just replace the nmi_* calls here (outside of ghes_notify_sea)? Thanks, Tyler
Hi James, > Guests are a special case as QEMU may never access the faulty memory itself, so > it won't receive the 'late' signal. It looks like ARM/arm64 KVM lacks support > for KVM_PFN_ERR_HWPOISON which sends SIGBUS from KVM's fault-handling code. I > have patches to add support for this which I intend to send at rc1. > > [0] suggests 'KVM qemu' sets these MCE flags to take the 'early' path, but given > x86s KVM_PFN_ERR_HWPOISON, this may be out of date. > > > Either way, once QEMU gets a signal indicating the virtual address, it can > generate its own APEI CPER records and use the KVM APIs to mock up an > Synchronous External Abort, (or inject an IRQ or run the vcpu waiting for the > guest's polling thread to come round, whichever was described to the guest via > the HEST/GHES tables). > I have another question about the SIGBUS signal: can QEMU always get a SIGBUS when needed? I know of one circumstance in which SIGBUS is sent. ghes_handle_memory_failure(), called from ghes_do_proc(), will send SIGBUS to QEMU, but this only happens when there is a memory section in the GHES, that is, when the section type is CPER_SEC_PLATFORM_MEM. Consider this case: a load error in a guest application causes an SEA, and the firmware takes it. The firmware begins to scan the error records and fill the GHES, but the error record in the memory node has already been read by another handler. The firmware won't add a memory section to the GHES, so ghes_handle_memory_failure() won't be called. I mean that we may not be able to rely on ghes_handle_memory_failure() to send SIGBUS to QEMU. Should we add some other code to send SIGBUS in handle_guest_abort()? I don't know whether the ARM/arm64 KVM_PFN_ERR_HWPOISON you mentioned above will cover all the cases. Thanks, Wang Xiongfeng .
Hi Wang Xiongfeng, On 22/03/17 02:46, Xiongfeng Wang wrote: >> Guests are a special case as QEMU may never access the faulty memory itself, so >> it won't receive the 'late' signal. It looks like ARM/arm64 KVM lacks support >> for KVM_PFN_ERR_HWPOISON which sends SIGBUS from KVM's fault-handling code. I >> have patches to add support for this which I intend to send at rc1. >> >> [0] suggests 'KVM qemu' sets these MCE flags to take the 'early' path, but given >> x86s KVM_PFN_ERR_HWPOISON, this may be out of date. >> >> >> Either way, once QEMU gets a signal indicating the virtual address, it can >> generate its own APEI CPER records and use the KVM APIs to mock up an >> Synchronous External Abort, (or inject an IRQ or run the vcpu waiting for the >> guest's polling thread to come round, whichever was described to the guest via >> the HEST/GHES tables). >> > > I have another confusion about the SIGBUS signal. Can QEMU always get a SIGBUS when needed. > I know one circumstance which will send SIGBUS. The ghes_handle_memory_failure() in > ghes_do_proc() will send SIGBUS to QEMU, but this only happens when there exists memory section > in ghes, that is the section type is CPER_SEC_PLATFORM_MEM. > Suppose this case, an load error in guest application causes an SEA, and the firmware take it. > The firmware begin to scan the error record and fill the ghes. But the error record in memory node > has been read by other handler. (this looks like a race) > The firmware won't add memory section in ghes, so ghes_handle_memory_failure() won't be called. I think this would be a firmware bug. Firmware can reserve as much memory as it needs for writing CPER records, there should not be a case where 'the memory' is currently being processed by another handler. The memory firmware uses to write CPER records too shouldn't be published to the OS until it has finished. 
Once firmware has finished writing the CPER records it can update the memory pointed to by GHES->ErrorStatusAddress with the location of the CPER records and invoke the Notification method for this GHES. (SEI, SEA, IRQ etc). We should always get a complete set of CPER records to describe the error. If firmware uses GHESv2 it can get an 'ack' write from APEI once it has finished processing the records. Once it gets this, firmware knows it can re-use the memory. (Obviously each GHES entry can only process one error at a time. Firmware should either handle this, or have one entry for each Error Source that can happen independently) > I mean that we may not rely on ghes_handle_memory_failure() to send SIGBUS to QEMU. Whether we should > add some other code to send SIGBUS in handle_guest_abort(). I don't know whether the ARM/arm64 > KVM_PFN_ERR_HWPOISON you mentioned above will cover all the cases. The SIGBUS routine is part of the kernel's recovery method for memory errors. It should cover all the errors reported with this CPER_SEC_PLATFORM_MEM. Back to the race you describe. It shouldn't matter if one CPU processes an error for guest memory while a vcpu is running on another. This may happen if the error was detected by DRAM's background scrub. If we don't treat KVM/Qemu as anything special the memory_failure()->SIGBUS path will happen regardless of whether the fault interrupted the guest or not. There are other types of error such as PCIe, CPU, BUS error etc. If it's possible to recover from these we may need additional code in the kernel. This shouldn't necessarily treat KVM as a special case. Thanks, James
Hi James, On 2017/3/22 19:14, James Morse wrote: > Hi Wang Xiongfeng, > > On 22/03/17 02:46, Xiongfeng Wang wrote: >>> Guests are a special case as QEMU may never access the faulty memory itself, so >>> it won't receive the 'late' signal. It looks like ARM/arm64 KVM lacks support >>> for KVM_PFN_ERR_HWPOISON which sends SIGBUS from KVM's fault-handling code. I >>> have patches to add support for this which I intend to send at rc1. >>> >>> [0] suggests 'KVM qemu' sets these MCE flags to take the 'early' path, but given >>> x86s KVM_PFN_ERR_HWPOISON, this may be out of date. >>> >>> >>> Either way, once QEMU gets a signal indicating the virtual address, it can >>> generate its own APEI CPER records and use the KVM APIs to mock up an >>> Synchronous External Abort, (or inject an IRQ or run the vcpu waiting for the >>> guest's polling thread to come round, whichever was described to the guest via >>> the HEST/GHES tables). >>> >> >> I have another confusion about the SIGBUS signal. Can QEMU always get a SIGBUS when needed. >> I know one circumstance which will send SIGBUS. The ghes_handle_memory_failure() in >> ghes_do_proc() will send SIGBUS to QEMU, but this only happens when there exists memory section >> in ghes, that is the section type is CPER_SEC_PLATFORM_MEM. >> Suppose this case, an load error in guest application causes an SEA, and the firmware take it. >> The firmware begin to scan the error record and fill the ghes. But the error record in memory node >> has been read by other handler. > > (this looks like a race) > >> The firmware won't add memory section in ghes, so ghes_handle_memory_failure() won't be called. > > I think this would be a firmware bug. Firmware can reserve as much memory as it > needs for writing CPER records, there should not be a case where 'the memory' is > currently being processed by another handler. I have a question here: Consider this case, the memory controller first detected a memory error, but it has not been consumed. 
So it will not generate the SEA. The memory error may be reported to the OS by an IRQ with a MEM section in the CPER record; then, after a while, the error data is loaded into the cache and consumed, at which point the SEA is generated. Is it possible that the CPER record for the SEA contains only a processor section and no MEM section? Obviously there are two different GHES entries above, one for SEA and another for IRQ/GSIV. Could we assume that there is a MEM section in the SEA GHES table? > > The memory firmware uses to write CPER records too shouldn't be published to the > OS until it has finished. Once firmware has finished writing the CPER records it > can update the memory pointed to by GHES->ErrorStatusAddress with the location > of the CPER records and invoke the Notification method for this GHES. (SEI, SEA, > IRQ etc). We should always get a complete set of CPER records to describe the error. > Does it mean that the BIOS has the responsibility to ensure that the GHES table has complete error info, including memory, bus, TLB, cache and other related error info?
diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h index e22089f..33a77509 100644 --- a/arch/arm/include/asm/kvm_arm.h +++ b/arch/arm/include/asm/kvm_arm.h @@ -187,6 +187,7 @@ #define FSC_FAULT (0x04) #define FSC_ACCESS (0x08) #define FSC_PERM (0x0c) +#define FSC_EXTABT (0x10) /* Hyp Prefetch Fault Address Register (HPFAR/HDFAR) */ #define HPFAR_MASK (~0xf) diff --git a/arch/arm/include/asm/system_misc.h b/arch/arm/include/asm/system_misc.h index a3d61ad..ea45d94 100644 --- a/arch/arm/include/asm/system_misc.h +++ b/arch/arm/include/asm/system_misc.h @@ -24,4 +24,9 @@ #endif /* !__ASSEMBLY__ */ +static inline int handle_guest_sea(unsigned long addr, unsigned int esr) +{ + return -1; +} + #endif /* __ASM_ARM_SYSTEM_MISC_H */ diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index a5265ed..04f1dd50 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -29,6 +29,7 @@ #include <asm/kvm_asm.h> #include <asm/kvm_emulate.h> #include <asm/virt.h> +#include <asm/system_misc.h> #include "trace.h" @@ -1444,8 +1445,21 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run) /* Check the stage-2 fault is trans. fault or write fault */ fault_status = kvm_vcpu_trap_get_fault_type(vcpu); - if (fault_status != FSC_FAULT && fault_status != FSC_PERM && - fault_status != FSC_ACCESS) { + + /* The host kernel will handle the synchronous external abort. There + * is no need to pass the error into the guest. 
+ */ + if (fault_status == FSC_EXTABT) { + if(handle_guest_sea((unsigned long)fault_ipa, + kvm_vcpu_get_hsr(vcpu))) { + kvm_err("Failed to handle guest SEA, FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n", + kvm_vcpu_trap_get_class(vcpu), + (unsigned long)kvm_vcpu_trap_get_fault(vcpu), + (unsigned long)kvm_vcpu_get_hsr(vcpu)); + return -EFAULT; + } + } else if (fault_status != FSC_FAULT && fault_status != FSC_PERM && + fault_status != FSC_ACCESS) { kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n", kvm_vcpu_trap_get_class(vcpu), (unsigned long)kvm_vcpu_trap_get_fault(vcpu), diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 2a2752b..2b11d59 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -201,6 +201,7 @@ #define FSC_FAULT ESR_ELx_FSC_FAULT #define FSC_ACCESS ESR_ELx_FSC_ACCESS #define FSC_PERM ESR_ELx_FSC_PERM +#define FSC_EXTABT ESR_ELx_FSC_EXTABT /* Hyp Prefetch Fault Address Register (HPFAR/HDFAR) */ #define HPFAR_MASK (~UL(0xf)) diff --git a/arch/arm64/include/asm/system_misc.h b/arch/arm64/include/asm/system_misc.h index bc81243..5b2cecd1 100644 --- a/arch/arm64/include/asm/system_misc.h +++ b/arch/arm64/include/asm/system_misc.h @@ -58,4 +58,6 @@ void hook_debug_fault_code(int nr, int (*fn)(unsigned long, unsigned int, #endif /* __ASSEMBLY__ */ +int handle_guest_sea(unsigned long addr, unsigned int esr); + #endif /* __ASM_SYSTEM_MISC_H */ diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index b2d57fc..403277b 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -602,6 +602,24 @@ static const char *fault_name(unsigned int esr) } /* + * Handle Synchronous External Aborts that occur in a guest kernel. + */ +int handle_guest_sea(unsigned long addr, unsigned int esr) +{ + /* + * synchronize_rcu() will wait for nmi_exit(), so no need to + * rcu_read_lock(). 
+ */ + if(IS_ENABLED(HAVE_ACPI_APEI_SEA)) { + nmi_enter(); + ghes_notify_sea(); + nmi_exit(); + } + + return 0; +} + +/* * Dispatch a data abort to the relevant handler. */ asmlinkage void __exception do_mem_abort(unsigned long addr, unsigned int esr,
Currently external aborts are unsupported by the guest abort handling. Add handling for SEAs so that the host kernel reports SEAs which occur in the guest kernel. Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org> --- arch/arm/include/asm/kvm_arm.h | 1 + arch/arm/include/asm/system_misc.h | 5 +++++ arch/arm/kvm/mmu.c | 18 ++++++++++++++++-- arch/arm64/include/asm/kvm_arm.h | 1 + arch/arm64/include/asm/system_misc.h | 2 ++ arch/arm64/mm/fault.c | 18 ++++++++++++++++++ 6 files changed, 43 insertions(+), 2 deletions(-)