Message ID | 20191219040612.28431-1-gshan@redhat.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | [RFC] hw/arm/virt: Support NMI injection | expand |
On 12/19/19 3:06 PM, Gavin Shan wrote: > This supports NMI injection for virtual machine and currently it's only > supported on GICv3 controller, which is emulated by qemu or host kernel. > The design is highlighted as below: > > * The NMI is identified by its priority (0x20). In the guest (linux) > kernel, the GICC_PMR is set to 0x80, to block all interrupts except > the NMIs when the external interrupt is disabled. It means the FIQ > and IRQ bit in PSTATE isn't touched when the functionality (NMI) is > functional. > * LPIs aren't considered as NMIs because of their nature. It means NMI > is either SPI or PPI. Besides, the NMIs are injected in round-robin > fashion is there are multiple NMIs existing. > * When the GICv3 controller is emulated by qemu, the interrupt states > (e.g. enabled, priority) is fetched from the corresponding data struct > directly. However, we have to pause all CPUs to fetch the interrupt > states from host in advance if the GICv3 controller is emulated by > host. > > The testing scenario is to tweak guest (linux) kernel where the pl011 SPI > can be enabled as NMI by request_nmi(). Check "/proc/interrupts" after injecting > several NMIs, to see if the interrupt count is increased or not. The result > is just as expected. > > Signed-off-by: Gavin Shan <gshan@redhat.com> > --- > hw/arm/virt.c | 24 ++++++++ > hw/intc/arm_gicv3.c | 76 ++++++++++++++++++++++++ > hw/intc/arm_gicv3_kvm.c | 92 ++++++++++++++++++++++++++++++ > include/hw/intc/arm_gicv3_common.h | 2 + > 4 files changed, 194 insertions(+) > Peter, could you please take a look when you have free time? Thanks in advance for your comments :) Thanks, Gavin > diff --git a/hw/arm/virt.c b/hw/arm/virt.c > index 39ab5f47e0..fc58ee70b4 100644 > --- a/hw/arm/virt.c > +++ b/hw/arm/virt.c > @@ -71,6 +71,8 @@ > #include "hw/mem/pc-dimm.h" > #include "hw/mem/nvdimm.h" > #include "hw/acpi/generic_event_device.h" > +#include "hw/nmi.h" > +#include "hw/intc/arm_gicv3.h" > > #define DEFINE_VIRT_MACHINE_LATEST(major, minor, latest) \ > static void virt_##major##_##minor##_class_init(ObjectClass *oc, \ > @@ -1980,6 +1982,25 @@ static void virt_machine_device_unplug_request_cb(HotplugHandler *hotplug_dev, > " type: %s", object_get_typename(OBJECT(dev))); > } > > +static void virt_nmi(NMIState *n, int cpu_index, Error **errp) > +{ > + VirtMachineState *vms = VIRT_MACHINE(n); > + ARMGICv3CommonClass *agcc; > + > + if (vms->gic_version != 3) { > + error_setg(errp, "NMI is only supported by GICv3"); > + return; > + } > + > + agcc = ARM_GICV3_COMMON_GET_CLASS(vms->gic); > + if (agcc->inject_nmi) { > + agcc->inject_nmi(vms->gic, cpu_index, errp); > + } else { > + error_setg(errp, "NMI injection isn't supported by %s", > + object_get_typename(OBJECT(vms->gic))); > + } > +} > + > static HotplugHandler *virt_machine_get_hotplug_handler(MachineState *machine, > DeviceState *dev) > { > @@ -2025,6 +2046,7 @@ static void virt_machine_class_init(ObjectClass *oc, void *data) > { > MachineClass *mc = MACHINE_CLASS(oc); > HotplugHandlerClass *hc = HOTPLUG_HANDLER_CLASS(oc); > + NMIClass *nc = NMI_CLASS(oc); > > mc->init = machvirt_init; > /* Start with max_cpus set to 512, which is the maximum supported by KVM. > @@ -2051,6 +2073,7 @@ static void virt_machine_class_init(ObjectClass *oc, void *data) > hc->pre_plug = virt_machine_device_pre_plug_cb; > hc->plug = virt_machine_device_plug_cb; > hc->unplug_request = virt_machine_device_unplug_request_cb; > + nc->nmi_monitor_handler = virt_nmi; > mc->numa_mem_supported = true; > mc->auto_enable_numa_with_memhp = true; > } > @@ -2136,6 +2159,7 @@ static const TypeInfo virt_machine_info = { > .instance_init = virt_instance_init, > .interfaces = (InterfaceInfo[]) { > { TYPE_HOTPLUG_HANDLER }, > + { TYPE_NMI }, > { } > }, > }; > diff --git a/hw/intc/arm_gicv3.c b/hw/intc/arm_gicv3.c > index 66eaa97198..d3409cb6ef 100644 > --- a/hw/intc/arm_gicv3.c > +++ b/hw/intc/arm_gicv3.c > @@ -338,6 +338,81 @@ static void gicv3_set_irq(void *opaque, int irq, int level) > } > } > > +static bool arm_gicv3_inject_nmi_once(GICv3State*s, int start, int end) > +{ > + GICv3CPUState *cs; > + int irq_count = (s->num_irq + (GIC_INTERNAL * (s->num_cpu - 1))); > + int i, cpu, irq; > + > + /* SPIs */ > + for (i = start; (i < end) && (i < (s->num_irq - GIC_INTERNAL)); i++) { > + if (gicv3_gicd_enabled_test(s, i + GIC_INTERNAL) && > + s->gicd_ipriority[i + GIC_INTERNAL] == 0x20) { > + > + /* > + * Reset the level and toggling the pending bit will ensure > + * the interrupt is queued. > + */ > + if (gicv3_gicd_level_test(s, i + GIC_INTERNAL)) { > + gicv3_set_irq(s, i, false); > + } > + > + gicv3_gicd_pending_set(s, i + GIC_INTERNAL); > + gicv3_set_irq(s, i, true); > + > + s->last_nmi_index = (i + 1); > + return true; > + } > + } > + > + /* PPIs */ > + if (start < (s->num_irq - GIC_INTERNAL)) { > + start = (s->num_irq - GIC_INTERNAL); > + } > + > + for (i = start; (i < end) && (i < irq_count); i++) { > + cpu = (i - ((s->num_irq - GIC_INTERNAL))) / GIC_INTERNAL; > + irq = (i - ((s->num_irq - GIC_INTERNAL))) % GIC_INTERNAL; > + cs = &s->cpu[cpu]; > + > + if ((cs->gicr_ienabler0 & (1 << irq)) && > + cs->gicr_ipriorityr[irq] == 0x20) { > + > + if (extract32(cs->level, irq, 1)) { > + gicv3_set_irq(s, i, false); > + } > + > + deposit32(cs->gicr_ipendr0, irq, 1, 1); > + gicv3_set_irq(s, i, true); > + > + s->last_nmi_index = (i + 1); > + if (s->last_nmi_index > irq_count) { > + s->last_nmi_index = 0; > + } > + > + return true; > + } > + } > + > + return false; > +} > + > +static void arm_gicv3_inject_nmi(DeviceState *dev, int cpu_index, Error **errp) > +{ > + GICv3State *s = ARM_GICV3(dev); > + int irq_count = (s->num_irq + (GIC_INTERNAL * (s->num_cpu - 1))); > + bool injected; > + > + injected = arm_gicv3_inject_nmi_once(s, s->last_nmi_index, irq_count); > + if (!injected) { > + injected = arm_gicv3_inject_nmi_once(s, 0, s->last_nmi_index); > + } > + > + if (!injected) { > + error_setg(errp, "No NMI found"); > + } > +} > + > static void arm_gicv3_post_load(GICv3State *s) > { > /* Recalculate our cached idea of the current highest priority > @@ -395,6 +470,7 @@ static void arm_gicv3_class_init(ObjectClass *klass, void *data) > ARMGICv3CommonClass *agcc = ARM_GICV3_COMMON_CLASS(klass); > ARMGICv3Class *agc = ARM_GICV3_CLASS(klass); > > + agcc->inject_nmi = arm_gicv3_inject_nmi; > agcc->post_load = arm_gicv3_post_load; > device_class_set_parent_realize(dc, arm_gic_realize, &agc->parent_realize); > } > diff --git a/hw/intc/arm_gicv3_kvm.c b/hw/intc/arm_gicv3_kvm.c > index 9c7f4ab871..b076d67c52 100644 > --- a/hw/intc/arm_gicv3_kvm.c > +++ b/hw/intc/arm_gicv3_kvm.c > @@ -31,6 +31,7 @@ > #include "gicv3_internal.h" > #include "vgic_common.h" > #include "migration/blocker.h" > +#include "sysemu/cpus.h" > > #ifdef DEBUG_GICV3_KVM > #define DPRINTF(fmt, ...) \ > @@ -506,6 +507,96 @@ static void kvm_arm_gicv3_put(GICv3State *s) > } > } > > +static bool kvm_arm_gicv3_inject_nmi_once(GICv3State *s, int start, int end) > +{ > + GICv3CPUState *cs; > + int irq_count = (s->num_irq + (GIC_INTERNAL * (s->num_cpu - 1))); > + int i, cpu, irq; > + > + /* SPIs */ > + for (i = start; (i < end) && (i < (s->num_irq - GIC_INTERNAL)); i++) { > + if (gicv3_gicd_enabled_test(s, i + GIC_INTERNAL) && > + s->gicd_ipriority[i + GIC_INTERNAL] == 0x20) { > + kvm_arm_gicv3_set_irq(s, i, true); > + > + s->last_nmi_index = (i + 1); > + return true; > + } > + } > + > + /* PPIs */ > + if (start < (s->num_irq - GIC_INTERNAL)) { > + start = (s->num_irq - GIC_INTERNAL); > + } > + > + for (i = start; (i < end) && (i < irq_count); i++) { > + cpu = (i - ((s->num_irq - GIC_INTERNAL))) / GIC_INTERNAL; > + irq = (i - ((s->num_irq - GIC_INTERNAL))) % GIC_INTERNAL; > + cs = &s->cpu[cpu]; > + > + if ((cs->gicr_ienabler0 & (1 << irq)) && > + cs->gicr_ipriorityr[irq] == 0x20) { > + kvm_arm_gicv3_set_irq(s, i, true); > + > + s->last_nmi_index = (i + 1); > + if (s->last_nmi_index > irq_count) { > + s->last_nmi_index = 0; > + } > + > + return true; > + } > + } > + > + return false; > +} > + > +static void kvm_arm_gicv3_snapshot(GICv3State *s) > +{ > + GICv3CPUState *c; > + uint32_t val; > + int i, j; > + > + pause_all_vcpus(); > + > + kvm_dist_getbmp(s, GICD_ISENABLER, s->enabled); > + kvm_dist_get_priority(s, GICD_IPRIORITYR, s->gicd_ipriority); > + for (i = 0; i < s->num_cpu; i++) { > + c = &s->cpu[i]; > + > + kvm_gicr_access(s, GICR_ISENABLER0, i, &val, false); > + c->gicr_ienabler0 = val; > + > + for (j = 0; j < GIC_INTERNAL; j += 4) { > + kvm_gicr_access(s, GICR_IPRIORITYR + j, i, &val, false); > + c->gicr_ipriorityr[j] = extract32(val, 0, 8); > + c->gicr_ipriorityr[j + 1] = extract32(val, 8, 8); > + c->gicr_ipriorityr[j + 2] = extract32(val, 16, 8); > + c->gicr_ipriorityr[j + 3] = extract32(val, 24, 8); > + } > + } > + > + resume_all_vcpus(); > +} > + > +static void kvm_arm_gicv3_inject_nmi(DeviceState *dev, > + int cpu_index, Error **errp) > +{ > + GICv3State *s = KVM_ARM_GICV3(dev); > + int irq_count = (s->num_irq + (GIC_INTERNAL * (s->num_cpu - 1))); > + bool injected; > + > + kvm_arm_gicv3_snapshot(s); > + > + injected = kvm_arm_gicv3_inject_nmi_once(s, s->last_nmi_index, irq_count); > + if (!injected) { > + injected = kvm_arm_gicv3_inject_nmi_once(s, 0, s->last_nmi_index); > + } > + > + if (!injected) { > + error_setg(errp, "No NMI found"); > + } > +} > + > static void kvm_arm_gicv3_get(GICv3State *s) > { > uint32_t regl, regh, reg; > @@ -882,6 +973,7 @@ static void kvm_arm_gicv3_class_init(ObjectClass *klass, void *data) > ARMGICv3CommonClass *agcc = ARM_GICV3_COMMON_CLASS(klass); > KVMARMGICv3Class *kgc = KVM_ARM_GICV3_CLASS(klass); > > + agcc->inject_nmi = kvm_arm_gicv3_inject_nmi; > agcc->pre_save = kvm_arm_gicv3_get; > agcc->post_load = kvm_arm_gicv3_put; > device_class_set_parent_realize(dc, kvm_arm_gicv3_realize, > diff --git a/include/hw/intc/arm_gicv3_common.h b/include/hw/intc/arm_gicv3_common.h > index 31ec9a1ae4..0ae9c45aa2 100644 > --- a/include/hw/intc/arm_gicv3_common.h > +++ b/include/hw/intc/arm_gicv3_common.h > @@ -225,6 +225,7 @@ struct GICv3State { > > int dev_fd; /* kvm device fd if backed by kvm vgic support */ > Error *migration_blocker; > + int last_nmi_index; > > /* Distributor */ > > @@ -291,6 +292,7 @@ typedef struct ARMGICv3CommonClass { > SysBusDeviceClass parent_class; > /*< public >*/ > > + void (*inject_nmi)(DeviceState *dev, int cpu_index, Error **errp); > void (*pre_save)(GICv3State *s); > void (*post_load)(GICv3State *s); > } ARMGICv3CommonClass; >
On Thu, 19 Dec 2019 at 04:06, Gavin Shan <gshan@redhat.com> wrote: > This supports NMI injection for virtual machine and currently it's only > supported on GICv3 controller, which is emulated by qemu or host kernel. > The design is highlighted as below: > > * The NMI is identified by its priority (0x20). In the guest (linux) > kernel, the GICC_PMR is set to 0x80, to block all interrupts except > the NMIs when the external interrupt is disabled. It means the FIQ > and IRQ bit in PSTATE isn't touched when the functionality (NMI) is > functional. > * LPIs aren't considered as NMIs because of their nature. It means NMI > is either SPI or PPI. Besides, the NMIs are injected in round-robin > fashion is there are multiple NMIs existing. > * When the GICv3 controller is emulated by qemu, the interrupt states > (e.g. enabled, priority) is fetched from the corresponding data struct > directly. However, we have to pause all CPUs to fetch the interrupt > states from host in advance if the GICv3 controller is emulated by > host. > > The testing scenario is to tweak guest (linux) kernel where the pl011 SPI > can be enabled as NMI by request_nmi(). Check "/proc/interrupts" after injecting > several NMIs, to see if the interrupt count is increased or not. The result > is just as expected. So, QEMU is trying to emulate actual hardware. None of this looks to me like what GICv3 hardware does... If you want to have the virt board send an interrupt, do it the usual way by wiring up a qemu_irq from some device to the GIC, please. (More generally, there is no concept of an "NMI" in the GIC; there are just interrupts at varying possible guest-programmable priority levels.) thanks -- PMM
[including more folks into the discussion] > On Fri, 17 Jan 2020 at 14:00, Peter Maydell <peter.maydell@linaro.org> wrote: >> On Thu, 19 Dec 2019 at 04:06, Gavin Shan <gshan@redhat.com> wrote: >>> This supports NMI injection for virtual machine and currently it's only >>> supported on GICv3 controller, which is emulated by qemu or host kernel. >>> The design is highlighted as below: >>> >>> * The NMI is identified by its priority (0x20). In the guest (linux) >>> kernel, the GICC_PMR is set to 0x80, to block all interrupts except >>> the NMIs when the external interrupt is disabled. It means the FIQ >>> and IRQ bit in PSTATE isn't touched when the functionality (NMI) is >>> functional. >>> * LPIs aren't considered as NMIs because of their nature. It means NMI >>> is either SPI or PPI. Besides, the NMIs are injected in round-robin >>> fashion is there are multiple NMIs existing. >>> * When the GICv3 controller is emulated by qemu, the interrupt states >>> (e.g. enabled, priority) is fetched from the corresponding data struct >>> directly. However, we have to pause all CPUs to fetch the interrupt >>> states from host in advance if the GICv3 controller is emulated by >>> host. >>> >>> The testing scenario is to tweak guest (linux) kernel where the pl011 SPI >>> can be enabled as NMI by request_nmi(). Check "/proc/interrupts" after injecting >>> several NMIs, to see if the interrupt count is increased or not. The result >>> is just as expected. >>> > > So, QEMU is trying to emulate actual hardware. None of this > looks to me like what GICv3 hardware does... If you want to > have the virt board send an interrupt, do it the usual way > by wiring up a qemu_irq from some device to the GIC, please. > (More generally, there is no concept of an "NMI" in the GIC; > there are just interrupts at varying possible guest-programmable > priority levels.) > Peter, I missed to read your reply in time and apologies for late response. Yes, there is no concept of "NMI" in the GIC from hardware perspective. However, NMI has been supported from the software by kernel commit bc3c03ccb4641 ("arm64: Enable the support of pseudo-NMIs"). The NMIs have higher priority than normal ones. NMIs are deliverable after local_irq_disable() because the SYS_ICC_PMR_EL1 is tweaked so that normal interrupts are masked only. It's unclear about the purpose of "nmi" QMP/HMP command. It's why I put a RFC tag. The command has been supported by multiple architects including x86/ppc. However, they are having different behaviors. The system will be restarted on ppc with this command, but a NMI is injected through LAPIC on x86. So I'm not sure what architect (system reset on ppc or injecting NMI on x86) aarch64 should follow. Thanks, Gavin
Hi, On 1/28/20 7:48 AM, Gavin Shan wrote: > [including more folks into the discussion] > >> On Fri, 17 Jan 2020 at 14:00, Peter Maydell <peter.maydell@linaro.org> >> wrote: >>> On Thu, 19 Dec 2019 at 04:06, Gavin Shan <gshan@redhat.com> wrote: >>>> This supports NMI injection for virtual machine and currently it's only >>>> supported on GICv3 controller, which is emulated by qemu or host >>>> kernel. >>>> The design is highlighted as below: >>>> >>>> * The NMI is identified by its priority (0x20). In the guest (linux) >>>> kernel, the GICC_PMR is set to 0x80, to block all interrupts except >>>> the NMIs when the external interrupt is disabled. It means the FIQ >>>> and IRQ bit in PSTATE isn't touched when the functionality (NMI) is >>>> functional. >>>> * LPIs aren't considered as NMIs because of their nature. It means NMI >>>> is either SPI or PPI. Besides, the NMIs are injected in round-robin >>>> fashion is there are multiple NMIs existing. >>>> * When the GICv3 controller is emulated by qemu, the interrupt states >>>> (e.g. enabled, priority) is fetched from the corresponding data struct >>>> directly. However, we have to pause all CPUs to fetch the interrupt >>>> states from host in advance if the GICv3 controller is emulated by >>>> host. >>>> >>>> The testing scenario is to tweak guest (linux) kernel where the >>>> pl011 SPI >>>> can be enabled as NMI by request_nmi(). Check "/proc/interrupts" >>>> after injecting >>>> several NMIs, to see if the interrupt count is increased or not. The >>>> result >>>> is just as expected. >>>> >> >> So, QEMU is trying to emulate actual hardware. None of this >> looks to me like what GICv3 hardware does... If you want to >> have the virt board send an interrupt, do it the usual way >> by wiring up a qemu_irq from some device to the GIC, please. >> (More generally, there is no concept of an "NMI" in the GIC; >> there are just interrupts at varying possible guest-programmable >> priority levels.) >> > > Peter, I missed to read your reply in time and apologies for late response. > > Yes, there is no concept of "NMI" in the GIC from hardware perspective. > However, NMI has been supported from the software by kernel commit > bc3c03ccb4641 ("arm64: Enable the support of pseudo-NMIs"). The NMIs > have higher priority than normal ones. NMIs are deliverable after > local_irq_disable() because the SYS_ICC_PMR_EL1 is tweaked so that > normal interrupts are masked only. > > It's unclear about the purpose of "nmi" QMP/HMP command. It's why I > put a RFC tag. The command has been supported by multiple architects > including x86/ppc. However, they are having different behaviors. The > system will be restarted on ppc with this command, but a NMI is injected > through LAPIC on x86. So I'm not sure what architect (system reset on > ppc or injecting NMI on x86) aarch64 should follow. arm_pmu driver was reworked to use pseudo-NMIs. I don't know the exact status of this work though (https://patchwork.kernel.org/cover/11047407/). So we cannot use any random NMI for guest injection. I wonder whether we should implement the KVM_NMI vcpu ioctl once we have agreed on which behavior is expected upon NMI injection. However the kernel doc says this ioctl only is well defined if "KVM_CREATE_IRQCHIP has not been called" (?). Thanks Eric > > Thanks, > Gavin > >
Hi Gavin, On 1/28/20 6:48 AM, Gavin Shan wrote: > [including more folks into the discussion] > >> On Fri, 17 Jan 2020 at 14:00, Peter Maydell <peter.maydell@linaro.org> >> wrote: >>> On Thu, 19 Dec 2019 at 04:06, Gavin Shan <gshan@redhat.com> wrote: >>>> This supports NMI injection for virtual machine and currently it's only >>>> supported on GICv3 controller, which is emulated by qemu or host >>>> kernel. >>>> The design is highlighted as below: >>>> >>>> * The NMI is identified by its priority (0x20). In the guest (linux) >>>> kernel, the GICC_PMR is set to 0x80, to block all interrupts except >>>> the NMIs when the external interrupt is disabled. It means the FIQ >>>> and IRQ bit in PSTATE isn't touched when the functionality (NMI) is >>>> functional. >>>> * LPIs aren't considered as NMIs because of their nature. It means NMI >>>> is either SPI or PPI. Besides, the NMIs are injected in round-robin >>>> fashion is there are multiple NMIs existing. >>>> * When the GICv3 controller is emulated by qemu, the interrupt states >>>> (e.g. enabled, priority) is fetched from the corresponding data struct >>>> directly. However, we have to pause all CPUs to fetch the interrupt >>>> states from host in advance if the GICv3 controller is emulated by >>>> host. >>>> >>>> The testing scenario is to tweak guest (linux) kernel where the >>>> pl011 SPI >>>> can be enabled as NMI by request_nmi(). Check "/proc/interrupts" >>>> after injecting >>>> several NMIs, to see if the interrupt count is increased or not. The >>>> result >>>> is just as expected. >>>> >> >> So, QEMU is trying to emulate actual hardware. None of this >> looks to me like what GICv3 hardware does... If you want to >> have the virt board send an interrupt, do it the usual way >> by wiring up a qemu_irq from some device to the GIC, please. >> (More generally, there is no concept of an "NMI" in the GIC; >> there are just interrupts at varying possible guest-programmable >> priority levels.) >> > > Peter, I missed to read your reply in time and apologies for late response. > > Yes, there is no concept of "NMI" in the GIC from hardware perspective. > However, NMI has been supported from the software by kernel commit > bc3c03ccb4641 ("arm64: Enable the support of pseudo-NMIs"). The NMIs > have higher priority than normal ones. NMIs are deliverable after > local_irq_disable() because the SYS_ICC_PMR_EL1 is tweaked so that > normal interrupts are masked only. > > It's unclear about the purpose of "nmi" QMP/HMP command. It's why I > put a RFC tag. The command has been supported by multiple architects > including x86/ppc. However, they are having different behaviors. The > system will be restarted on ppc with this command, but a NMI is injected > through LAPIC on x86. So I'm not sure what architect (system reset on > ppc or injecting NMI on x86) aarch64 should follow. > As Peter stated, there is no NMI concept on aarch64 hardware. The pseudo-NMI in the Linux port is purely a software concept. The OS itself decides which interrupts should have the "NMI" properties and sets them up accordingly. For QEMU to inject a pseudo-NMI into the guest would require it not only to know that the guest supports that feature. But also how such an interrupt has to be set up (currently there is no guaranty that the priority used for the NMI and the mask should stay the same across Linux version as it is purely internal to GICv3/arm64, no generic kAPI nor uAPI have access to it). And also, you would probably need to know what is handling the NMI you are injecting. QEMU shouldn't try to guess "that might be dealt as an NMI, lets raise it". I'm not familiar with the QMP/HMP nor the inner workings of QEMU, but if for some reason QEMU requires to trigger an NMI-like mechanic on aarch64, a proper way might be through para-virt. Having some "qemu-nmi-driver" in linux which calls "request_nmi()" and does the proper handling expected by QEMU. Cheers,
Gavin, Eric, On 2020-01-28 08:05, Auger Eric wrote: > Hi, > > On 1/28/20 7:48 AM, Gavin Shan wrote: >> [including more folks into the discussion] >> >>> On Fri, 17 Jan 2020 at 14:00, Peter Maydell >>> <peter.maydell@linaro.org> >>> wrote: >>>> On Thu, 19 Dec 2019 at 04:06, Gavin Shan <gshan@redhat.com> wrote: >>>>> This supports NMI injection for virtual machine and currently it's >>>>> only >>>>> supported on GICv3 controller, which is emulated by qemu or host >>>>> kernel. >>>>> The design is highlighted as below: >>>>> >>>>> * The NMI is identified by its priority (0x20). In the guest >>>>> (linux) >>>>> kernel, the GICC_PMR is set to 0x80, to block all interrupts except >>>>> the NMIs when the external interrupt is disabled. It means the FIQ >>>>> and IRQ bit in PSTATE isn't touched when the functionality (NMI) is >>>>> functional. >>>>> * LPIs aren't considered as NMIs because of their nature. It means >>>>> NMI >>>>> is either SPI or PPI. Besides, the NMIs are injected in round-robin >>>>> fashion is there are multiple NMIs existing. >>>>> * When the GICv3 controller is emulated by qemu, the interrupt >>>>> states >>>>> (e.g. enabled, priority) is fetched from the corresponding data >>>>> struct >>>>> directly. However, we have to pause all CPUs to fetch the interrupt >>>>> states from host in advance if the GICv3 controller is emulated by >>>>> host. >>>>> >>>>> The testing scenario is to tweak guest (linux) kernel where the >>>>> pl011 SPI >>>>> can be enabled as NMI by request_nmi(). Check "/proc/interrupts" >>>>> after injecting >>>>> several NMIs, to see if the interrupt count is increased or not. >>>>> The >>>>> result >>>>> is just as expected. >>>>> >>> >>> So, QEMU is trying to emulate actual hardware. None of this >>> looks to me like what GICv3 hardware does... If you want to >>> have the virt board send an interrupt, do it the usual way >>> by wiring up a qemu_irq from some device to the GIC, please. >>> (More generally, there is no concept of an "NMI" in the GIC; >>> there are just interrupts at varying possible guest-programmable >>> priority levels.) >>> >> >> Peter, I missed to read your reply in time and apologies for late >> response. >> >> Yes, there is no concept of "NMI" in the GIC from hardware >> perspective. >> However, NMI has been supported from the software by kernel commit >> bc3c03ccb4641 ("arm64: Enable the support of pseudo-NMIs"). The NMIs >> have higher priority than normal ones. NMIs are deliverable after >> local_irq_disable() because the SYS_ICC_PMR_EL1 is tweaked so that >> normal interrupts are masked only. And none of that is an NMI. This is a completely SW-defined mechanism, and you can't rely on this to inject something that would behave as a NMI in in a guest. I thought the "pseudo" prefix would give it away :-(. >> >> It's unclear about the purpose of "nmi" QMP/HMP command. It's why I >> put a RFC tag. The command has been supported by multiple architects >> including x86/ppc. However, they are having different behaviors. The >> system will be restarted on ppc with this command, but a NMI is >> injected >> through LAPIC on x86. So I'm not sure what architect (system reset on >> ppc or injecting NMI on x86) aarch64 should follow. The x86 NMI has no equivalent on ARM, full stop. And the only thing that the ARM implementation should follow is the letter of the architecture, without added concepts. > arm_pmu driver was reworked to use pseudo-NMIs. I don't know the exact > status of this work though > (https://patchwork.kernel.org/cover/11047407/). So we cannot use any > random NMI for guest injection. > > I wonder whether we should implement the KVM_NMI vcpu ioctl once we > have > agreed on which behavior is expected upon NMI injection. However the > kernel doc says this ioctl only is well defined if "KVM_CREATE_IRQCHIP > has not been called" (?). But what architectural concept would you map your KVM_NMI to? The number of things you can do is pretty limited: - Reset: we already have this - Interrupt: you don't get to decide the priority or the group - SError: Pretty much fatal in all cases You *could* try something like SDEI [1], but that's a pretty terrible interface too. M. [1] https://static.docs.arm.com/den0054/a/ARM_DEN0054A_Software_Delegated_Exception_Interface.pdf
Hi Marc, On 1/28/20 10:25 AM, Marc Zyngier wrote: > Gavin, Eric, > > On 2020-01-28 08:05, Auger Eric wrote: >> Hi, >> >> On 1/28/20 7:48 AM, Gavin Shan wrote: >>> [including more folks into the discussion] >>> >>>> On Fri, 17 Jan 2020 at 14:00, Peter Maydell <peter.maydell@linaro.org> >>>> wrote: >>>>> On Thu, 19 Dec 2019 at 04:06, Gavin Shan <gshan@redhat.com> wrote: >>>>>> This supports NMI injection for virtual machine and currently it's >>>>>> only >>>>>> supported on GICv3 controller, which is emulated by qemu or host >>>>>> kernel. >>>>>> The design is highlighted as below: >>>>>> >>>>>> * The NMI is identified by its priority (0x20). In the guest (linux) >>>>>> kernel, the GICC_PMR is set to 0x80, to block all interrupts except >>>>>> the NMIs when the external interrupt is disabled. It means the FIQ >>>>>> and IRQ bit in PSTATE isn't touched when the functionality (NMI) is >>>>>> functional. >>>>>> * LPIs aren't considered as NMIs because of their nature. It means >>>>>> NMI >>>>>> is either SPI or PPI. Besides, the NMIs are injected in round-robin >>>>>> fashion is there are multiple NMIs existing. >>>>>> * When the GICv3 controller is emulated by qemu, the interrupt states >>>>>> (e.g. enabled, priority) is fetched from the corresponding data >>>>>> struct >>>>>> directly. However, we have to pause all CPUs to fetch the interrupt >>>>>> states from host in advance if the GICv3 controller is emulated by >>>>>> host. >>>>>> >>>>>> The testing scenario is to tweak guest (linux) kernel where the >>>>>> pl011 SPI >>>>>> can be enabled as NMI by request_nmi(). Check "/proc/interrupts" >>>>>> after injecting >>>>>> several NMIs, to see if the interrupt count is increased or not. The >>>>>> result >>>>>> is just as expected. >>>>>> >>>> >>>> So, QEMU is trying to emulate actual hardware. None of this >>>> looks to me like what GICv3 hardware does... If you want to >>>> have the virt board send an interrupt, do it the usual way >>>> by wiring up a qemu_irq from some device to the GIC, please. >>>> (More generally, there is no concept of an "NMI" in the GIC; >>>> there are just interrupts at varying possible guest-programmable >>>> priority levels.) >>>> >>> >>> Peter, I missed to read your reply in time and apologies for late >>> response. >>> >>> Yes, there is no concept of "NMI" in the GIC from hardware perspective. >>> However, NMI has been supported from the software by kernel commit >>> bc3c03ccb4641 ("arm64: Enable the support of pseudo-NMIs"). The NMIs >>> have higher priority than normal ones. NMIs are deliverable after >>> local_irq_disable() because the SYS_ICC_PMR_EL1 is tweaked so that >>> normal interrupts are masked only. > > And none of that is an NMI. This is a completely SW-defined mechanism, > and you can't rely on this to inject something that would behave as > a NMI in in a guest. I thought the "pseudo" prefix would give it away :-(. > >>> >>> It's unclear about the purpose of "nmi" QMP/HMP command. It's why I >>> put a RFC tag. The command has been supported by multiple architects >>> including x86/ppc. However, they are having different behaviors. The >>> system will be restarted on ppc with this command, but a NMI is injected >>> through LAPIC on x86. So I'm not sure what architect (system reset on >>> ppc or injecting NMI on x86) aarch64 should follow. > > The x86 NMI has no equivalent on ARM, full stop. And the only thing that > the ARM implementation should follow is the letter of the architecture, > without added concepts. > >> arm_pmu driver was reworked to use pseudo-NMIs. I don't know the exact >> status of this work though >> (https://patchwork.kernel.org/cover/11047407/). So we cannot use any >> random NMI for guest injection. >> >> I wonder whether we should implement the KVM_NMI vcpu ioctl once we have >> agreed on which behavior is expected upon NMI injection. However the >> kernel doc says this ioctl only is well defined if "KVM_CREATE_IRQCHIP >> has not been called" (?). > > But what architectural concept would you map your KVM_NMI to? The number > of things you can do is pretty limited: > > - Reset: we already have this > - Interrupt: you don't get to decide the priority or the group > - SError: Pretty much fatal in all cases > > You *could* try something like SDEI [1], but that's a pretty terrible > interface too. Thank you for the pointer. So Gavin, not sure the QEMU QMP/HMP NMI command is relevant on ARM (at least at this point)? Thanks Eric > > M. > > [1] > https://static.docs.arm.com/den0054/a/ARM_DEN0054A_Software_Delegated_Exception_Interface.pdf >
On Tue, 28 Jan 2020 at 10:56, Auger Eric <eric.auger@redhat.com> wrote: > On 1/28/20 10:25 AM, Marc Zyngier wrote: > > You *could* try something like SDEI [1], but that's a pretty terrible > > interface too. > > Thank you for the pointer. There was a patchset recently that had an SDEI implementation, but I would strongly prefer not to have QEMU itself take on the job of firmware API implementation, and the facilities provided are somewhere between awkward and impossible to implement from within a guest firmware blob :-/ thanks -- PMM
On 2020-01-28 10:59, Peter Maydell wrote: > On Tue, 28 Jan 2020 at 10:56, Auger Eric <eric.auger@redhat.com> wrote: >> On 1/28/20 10:25 AM, Marc Zyngier wrote: >> > You *could* try something like SDEI [1], but that's a pretty terrible >> > interface too. >> >> Thank you for the pointer. > > There was a patchset recently that had an SDEI implementation, > but I would strongly prefer not to have QEMU itself take > on the job of firmware API implementation, and the facilities > provided are somewhere between awkward and impossible to > implement from within a guest firmware blob :-/ Hey, I said it was terrible! ;-) M.
On 28/01/2020 17:48, Gavin Shan wrote: > [including more folks into the discussion] > >> On Fri, 17 Jan 2020 at 14:00, Peter Maydell <peter.maydell@linaro.org> >> wrote: >>> On Thu, 19 Dec 2019 at 04:06, Gavin Shan <gshan@redhat.com> wrote: >>>> This supports NMI injection for virtual machine and currently it's only >>>> supported on GICv3 controller, which is emulated by qemu or host >>>> kernel. >>>> The design is highlighted as below: >>>> >>>> * The NMI is identified by its priority (0x20). In the guest (linux) >>>> kernel, the GICC_PMR is set to 0x80, to block all interrupts except >>>> the NMIs when the external interrupt is disabled. It means the FIQ >>>> and IRQ bit in PSTATE isn't touched when the functionality (NMI) is >>>> functional. >>>> * LPIs aren't considered as NMIs because of their nature. It means NMI >>>> is either SPI or PPI. Besides, the NMIs are injected in round-robin >>>> fashion is there are multiple NMIs existing. >>>> * When the GICv3 controller is emulated by qemu, the interrupt states >>>> (e.g. enabled, priority) is fetched from the corresponding data struct >>>> directly. However, we have to pause all CPUs to fetch the interrupt >>>> states from host in advance if the GICv3 controller is emulated by >>>> host. >>>> >>>> The testing scenario is to tweak guest (linux) kernel where the >>>> pl011 SPI >>>> can be enabled as NMI by request_nmi(). Check "/proc/interrupts" >>>> after injecting >>>> several NMIs, to see if the interrupt count is increased or not. The >>>> result >>>> is just as expected. >>>> >> >> So, QEMU is trying to emulate actual hardware. None of this >> looks to me like what GICv3 hardware does... If you want to >> have the virt board send an interrupt, do it the usual way >> by wiring up a qemu_irq from some device to the GIC, please. >> (More generally, there is no concept of an "NMI" in the GIC; >> there are just interrupts at varying possible guest-programmable >> priority levels.) >> > > Peter, I missed to read your reply in time and apologies for late response. > > Yes, there is no concept of "NMI" in the GIC from hardware perspective. > However, NMI has been supported from the software by kernel commit > bc3c03ccb4641 ("arm64: Enable the support of pseudo-NMIs"). The NMIs > have higher priority than normal ones. NMIs are deliverable after > local_irq_disable() because the SYS_ICC_PMR_EL1 is tweaked so that > normal interrupts are masked only. > > It's unclear about the purpose of "nmi" QMP/HMP command. It's why I > put a RFC tag. The command has been supported by multiple architects > including x86/ppc. However, they are having different behaviors. The > system will be restarted on ppc with this command, We inject "system reset" as it is the closest thing to the idea of NMI (could be a "machine check"). The system behaviour is configurable on POWERPC, it is either kdump (store a system dump and reboot) or simple reboot or activate XMON (in-kernel debugger, needs to be enabled beforehand). The injector in QEMU is called NMIClass::nmi_monitor_handler and as the name suggests it is not an NMI (the hardware concept which x86 may be still has and others do not) but an "nmi" command of the QEMU monitor which is rather a debug tool - "kick an unresponsive guest" - for us (POWERPC). > but a NMI is injected > through LAPIC on x86. So I'm not sure what architect (system reset on > ppc or injecting NMI on x86) aarch64 should follow. I'd say whatever triggers in-kernel debugger or kdump but I am not familiar with ARM at all :)
On 1/28/20 9:56 PM, Auger Eric wrote: > Hi Marc, > On 1/28/20 10:25 AM, Marc Zyngier wrote: >> Gavin, Eric, >> >> On 2020-01-28 08:05, Auger Eric wrote: >>> Hi, >>> >>> On 1/28/20 7:48 AM, Gavin Shan wrote: >>>> [including more folks into the discussion] >>>> >>>>> On Fri, 17 Jan 2020 at 14:00, Peter Maydell <peter.maydell@linaro.org> >>>>> wrote: >>>>>> On Thu, 19 Dec 2019 at 04:06, Gavin Shan <gshan@redhat.com> wrote: >>>>>>> This supports NMI injection for virtual machine and currently it's >>>>>>> only >>>>>>> supported on GICv3 controller, which is emulated by qemu or host >>>>>>> kernel. >>>>>>> The design is highlighted as below: >>>>>>> >>>>>>> * The NMI is identified by its priority (0x20). In the guest (linux) >>>>>>> kernel, the GICC_PMR is set to 0x80, to block all interrupts except >>>>>>> the NMIs when the external interrupt is disabled. It means the FIQ >>>>>>> and IRQ bit in PSTATE isn't touched when the functionality (NMI) is >>>>>>> functional. >>>>>>> * LPIs aren't considered as NMIs because of their nature. It means >>>>>>> NMI >>>>>>> is either SPI or PPI. Besides, the NMIs are injected in round-robin >>>>>>> fashion is there are multiple NMIs existing. >>>>>>> * When the GICv3 controller is emulated by qemu, the interrupt states >>>>>>> (e.g. enabled, priority) is fetched from the corresponding data >>>>>>> struct >>>>>>> directly. However, we have to pause all CPUs to fetch the interrupt >>>>>>> states from host in advance if the GICv3 controller is emulated by >>>>>>> host. >>>>>>> >>>>>>> The testing scenario is to tweak guest (linux) kernel where the >>>>>>> pl011 SPI >>>>>>> can be enabled as NMI by request_nmi(). Check "/proc/interrupts" >>>>>>> after injecting >>>>>>> several NMIs, to see if the interrupt count is increased or not. The >>>>>>> result >>>>>>> is just as expected. >>>>>>> >>>>> >>>>> So, QEMU is trying to emulate actual hardware. None of this >>>>> looks to me like what GICv3 hardware does... If you want to >>>>> have the virt board send an interrupt, do it the usual way >>>>> by wiring up a qemu_irq from some device to the GIC, please. >>>>> (More generally, there is no concept of an "NMI" in the GIC; >>>>> there are just interrupts at varying possible guest-programmable >>>>> priority levels.) >>>>> >>>> >>>> Peter, I missed to read your reply in time and apologies for late >>>> response. >>>> >>>> Yes, there is no concept of "NMI" in the GIC from hardware perspective. >>>> However, NMI has been supported from the software by kernel commit >>>> bc3c03ccb4641 ("arm64: Enable the support of pseudo-NMIs"). The NMIs >>>> have higher priority than normal ones. NMIs are deliverable after >>>> local_irq_disable() because the SYS_ICC_PMR_EL1 is tweaked so that >>>> normal interrupts are masked only. >> >> And none of that is an NMI. This is a completely SW-defined mechanism, >> and you can't rely on this to inject something that would behave as >> a NMI in in a guest. I thought the "pseudo" prefix would give it away :-(. >> Marc, thanks for the explanation. >>>> >>>> It's unclear about the purpose of "nmi" QMP/HMP command. It's why I >>>> put a RFC tag. The command has been supported by multiple architects >>>> including x86/ppc. However, they are having different behaviors. The >>>> system will be restarted on ppc with this command, but a NMI is injected >>>> through LAPIC on x86. So I'm not sure what architect (system reset on >>>> ppc or injecting NMI on x86) aarch64 should follow. >> >> The x86 NMI has no equivalent on ARM, full stop. And the only thing that >> the ARM implementation should follow is the letter of the architecture, >> without added concepts. >> >>> arm_pmu driver was reworked to use pseudo-NMIs. I don't know the exact >>> status of this work though >>> (https://patchwork.kernel.org/cover/11047407/). So we cannot use any >>> random NMI for guest injection. >>> >>> I wonder whether we should implement the KVM_NMI vcpu ioctl once we have >>> agreed on which behavior is expected upon NMI injection. However the >>> kernel doc says this ioctl only is well defined if "KVM_CREATE_IRQCHIP >>> has not been called" (?). >> >> But what architectural concept would you map your KVM_NMI to? The number >> of things you can do is pretty limited: >> >> - Reset: we already have this >> - Interrupt: you don't get to decide the priority or the group >> - SError: Pretty much fatal in all cases >> >> You *could* try something like SDEI [1], but that's a pretty terrible >> interface too. > > Thank you for the pointer. > > So Gavin, not sure the QEMU QMP/HMP NMI command is relevant on ARM (at > least at this point)? > Yes, the primary concern is what behavior we should have for ARM when QMP/HMP "nmi" command is executed. After that's determined, I can dig into SDEI if needed. As Alexey said in another reply, it's used to force the guest to have crash dump or drop into in-kernel debugger (xmon) on PowerPC. However, x86 guest will receive NMI after the command is issued. This RFC patch is following x86 to inject "pseudo" NMIs, but it seems incorrect. So the question is what behavior we should have for ARM when QMP/HMP "nmi" command is issued? I'm expecting more input in this regard :) Thanks, Gavin > Thanks > > Eric > > >> >> M. >> >> [1] >> https://static.docs.arm.com/den0054/a/ARM_DEN0054A_Software_Delegated_Exception_Interface.pdf >> >
On 1/29/20 1:44 PM, Alexey Kardashevskiy wrote: > > > On 28/01/2020 17:48, Gavin Shan wrote: >> [including more folks into the discussion] >> >>> On Fri, 17 Jan 2020 at 14:00, Peter Maydell <peter.maydell@linaro.org> >>> wrote: >>>> On Thu, 19 Dec 2019 at 04:06, Gavin Shan <gshan@redhat.com> wrote: >>>>> This supports NMI injection for virtual machine and currently it's only >>>>> supported on GICv3 controller, which is emulated by qemu or host >>>>> kernel. >>>>> The design is highlighted as below: >>>>> >>>>> * The NMI is identified by its priority (0x20). In the guest (linux) >>>>> kernel, the GICC_PMR is set to 0x80, to block all interrupts except >>>>> the NMIs when the external interrupt is disabled. It means the FIQ >>>>> and IRQ bit in PSTATE isn't touched when the functionality (NMI) is >>>>> functional. >>>>> * LPIs aren't considered as NMIs because of their nature. It means NMI >>>>> is either SPI or PPI. Besides, the NMIs are injected in round-robin >>>>> fashion is there are multiple NMIs existing. >>>>> * When the GICv3 controller is emulated by qemu, the interrupt states >>>>> (e.g. enabled, priority) is fetched from the corresponding data struct >>>>> directly. However, we have to pause all CPUs to fetch the interrupt >>>>> states from host in advance if the GICv3 controller is emulated by >>>>> host. >>>>> >>>>> The testing scenario is to tweak guest (linux) kernel where the >>>>> pl011 SPI >>>>> can be enabled as NMI by request_nmi(). Check "/proc/interrupts" >>>>> after injecting >>>>> several NMIs, to see if the interrupt count is increased or not. The >>>>> result >>>>> is just as expected. >>>>> >>> >>> So, QEMU is trying to emulate actual hardware. None of this >>> looks to me like what GICv3 hardware does... If you want to >>> have the virt board send an interrupt, do it the usual way >>> by wiring up a qemu_irq from some device to the GIC, please. >>> (More generally, there is no concept of an "NMI" in the GIC; >>> there are just interrupts at varying possible guest-programmable >>> priority levels.) >>> >> >> Peter, I missed to read your reply in time and apologies for late response. >> >> Yes, there is no concept of "NMI" in the GIC from hardware perspective. >> However, NMI has been supported from the software by kernel commit >> bc3c03ccb4641 ("arm64: Enable the support of pseudo-NMIs"). The NMIs >> have higher priority than normal ones. NMIs are deliverable after >> local_irq_disable() because the SYS_ICC_PMR_EL1 is tweaked so that >> normal interrupts are masked only. >> >> It's unclear about the purpose of "nmi" QMP/HMP command. It's why I >> put a RFC tag. The command has been supported by multiple architects >> including x86/ppc. However, they are having different behaviors. The >> system will be restarted on ppc with this command, > > We inject "system reset" as it is the closest thing to the idea of NMI > (could be a "machine check"). > > The system behaviour is configurable on POWERPC, it is either kdump > (store a system dump and reboot) or simple reboot or activate XMON > (in-kernel debugger, needs to be enabled beforehand). > > The injector in QEMU is called NMIClass::nmi_monitor_handler and as the > name suggests it is not an NMI (the hardware concept which x86 may be > still has and others do not) but an "nmi" command of the QEMU monitor > which is rather a debug tool - "kick an unresponsive guest" - for us > (POWERPC). > Alexey, thanks for the explanation. The behavior for PowerPC is clear now :) > >> but a NMI is injected >> through LAPIC on x86. So I'm not sure what architect (system reset on >> ppc or injecting NMI on x86) aarch64 should follow. > > I'd say whatever triggers in-kernel debugger or kdump but I am not > familiar with ARM at all :) > For x86, the behavior is really depending the NMI handler. Currently, it seems nothing other than outputting below messages. However, it's configurable to get a system crash via "/proc/sys/kernel/unknown_nmi_panic" (qemu) nmi [ 6731.137504] Uhhuh. NMI received for unknown reason 30 on CPU 0. [ 6731.137511] Do you have a strange power saving mode enabled? [ 6731.137512] Dazed and confused, but trying to continue guest# cat /proc/sys/kernel/unknown_nmi_panic 0 guest# echo 1 > /proc/sys/kernel/unknown_nmi_panic (qemu) nmi [ 6852.848600] Do you have a strange power saving mode enabled? [ 6852.848601] Kernel panic - not syncing: NMI: Not continuing [ 6852.848602] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.4.0-rc6-gshan+ #21 [ 6852.848604] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.12.0-48-4 [ 6852.848604] Call Trace: [ 6852.848605] <NMI> [ 6852.848606] dump_stack+0x6d/0x9a [ 6852.848607] panic+0x101/0x2e3 [ 6852.848608] nmi_panic.cold+0xc/0xc [ 6852.848609] unknown_nmi_error.cold+0x46/0x57 [ 6852.848609] default_do_nmi+0xda/0x110 [ 6852.848610] do_nmi+0x16e/0x1d0 [ 6852.848611] end_repeat_nmi+0x16/0x1a [ 6852.848625] RIP: 0010:native_safe_halt+0xe/0x10 [ 6852.848628] Code: 7b ff ff ff eb bd 90 90 90 90 90 90 e9 07 00 00 00 0f 00 2d 56 bc5 [ 6852.848639] RSP: 0018:ffffffffba603e10 EFLAGS: 00000246 [ 6852.848642] RAX: ffffffffb9ccbdb0 RBX: 0000000000000000 RCX: 0000000000000001 [ 6852.848643] RDX: 00000000000202ce RSI: 0000000000000083 RDI: 0000000000000000 [ 6852.848644] RBP: ffffffffba603e30 R08: 0000063b8ee46b61 R09: 0000000000000201 [ 6852.848645] R10: ffff9e29be53866c R11: 0000000000000018 R12: 0000000000000000 [ 6852.848646] R13: ffffffffba611780 R14: 0000000000000000 R15: 0000000000000000 [ 6852.848647] ? __sched_text_end+0x1/0x1 [ 6852.848648] ? native_safe_halt+0xe/0x10 [ 6852.848649] ? native_safe_halt+0xe/0x10 [ 6852.848650] </NMI> [ 6852.848650] ? default_idle+0x20/0x140 [ 6852.848651] arch_cpu_idle+0x15/0x20 [ 6852.848652] default_idle_call+0x23/0x30 [ 6852.848653] do_idle+0x1fb/0x270 [ 6852.848654] cpu_startup_entry+0x20/0x30 [ 6852.848655] rest_init+0xae/0xb0 [ 6852.848656] arch_call_rest_init+0xe/0x1b [ 6852.848657] start_kernel+0x4dd/0x4fd [ 6852.848658] x86_64_start_reservations+0x24/0x26 [ 6852.848658] x86_64_start_kernel+0x75/0x79 [ 6852.848659] secondary_startup_64+0xa4/0xb0 [ 6852.849153] Kernel Offset: 0x38400000 from 0xffffffff81000000 (relocation range: 0x) Thanks, Gavin
On 1/28/20 7:29 PM, Julien Thierry wrote: > Hi Gavin, > > On 1/28/20 6:48 AM, Gavin Shan wrote: >> [including more folks into the discussion] >> >>> On Fri, 17 Jan 2020 at 14:00, Peter Maydell <peter.maydell@linaro.org> wrote: >>>> On Thu, 19 Dec 2019 at 04:06, Gavin Shan <gshan@redhat.com> wrote: >>>>> This supports NMI injection for virtual machine and currently it's only >>>>> supported on GICv3 controller, which is emulated by qemu or host kernel. >>>>> The design is highlighted as below: >>>>> >>>>> * The NMI is identified by its priority (0x20). In the guest (linux) >>>>> kernel, the GICC_PMR is set to 0x80, to block all interrupts except >>>>> the NMIs when the external interrupt is disabled. It means the FIQ >>>>> and IRQ bit in PSTATE isn't touched when the functionality (NMI) is >>>>> functional. >>>>> * LPIs aren't considered as NMIs because of their nature. It means NMI >>>>> is either SPI or PPI. Besides, the NMIs are injected in round-robin >>>>> fashion is there are multiple NMIs existing. >>>>> * When the GICv3 controller is emulated by qemu, the interrupt states >>>>> (e.g. enabled, priority) is fetched from the corresponding data struct >>>>> directly. However, we have to pause all CPUs to fetch the interrupt >>>>> states from host in advance if the GICv3 controller is emulated by >>>>> host. >>>>> >>>>> The testing scenario is to tweak guest (linux) kernel where the pl011 SPI >>>>> can be enabled as NMI by request_nmi(). Check "/proc/interrupts" after injecting >>>>> several NMIs, to see if the interrupt count is increased or not. The result >>>>> is just as expected. >>>>> >>> >>> So, QEMU is trying to emulate actual hardware. None of this >>> looks to me like what GICv3 hardware does... If you want to >>> have the virt board send an interrupt, do it the usual way >>> by wiring up a qemu_irq from some device to the GIC, please. >>> (More generally, there is no concept of an "NMI" in the GIC; >>> there are just interrupts at varying possible guest-programmable >>> priority levels.) >>> >> >> Peter, I missed to read your reply in time and apologies for late response. >> >> Yes, there is no concept of "NMI" in the GIC from hardware perspective. >> However, NMI has been supported from the software by kernel commit >> bc3c03ccb4641 ("arm64: Enable the support of pseudo-NMIs"). The NMIs >> have higher priority than normal ones. NMIs are deliverable after >> local_irq_disable() because the SYS_ICC_PMR_EL1 is tweaked so that >> normal interrupts are masked only. >> >> It's unclear about the purpose of "nmi" QMP/HMP command. It's why I >> put a RFC tag. The command has been supported by multiple architects >> including x86/ppc. However, they are having different behaviors. The >> system will be restarted on ppc with this command, but a NMI is injected >> through LAPIC on x86. So I'm not sure what architect (system reset on >> ppc or injecting NMI on x86) aarch64 should follow. >> > > As Peter stated, there is no NMI concept on aarch64 hardware. The pseudo-NMI in the Linux port is purely a software concept. The OS itself decides which interrupts should have the "NMI" properties and sets them up accordingly. > > For QEMU to inject a pseudo-NMI into the guest would require it not only to know that the guest supports that feature. But also how such an interrupt has to be set up (currently there is no guaranty that the priority used for the NMI and the mask should stay the same across Linux version as it is purely internal to GICv3/arm64, no generic kAPI nor uAPI have access to it). And also, you would probably need to know what is handling the NMI you are injecting. > > QEMU shouldn't try to guess "that might be dealt as an NMI, lets raise it". > > I'm not familiar with the QMP/HMP nor the inner workings of QEMU, but if for some reason QEMU requires to trigger an NMI-like mechanic on aarch64, a proper way might be through para-virt. Having some "qemu-nmi-driver" in linux which calls "request_nmi()" and does the proper handling expected by QEMU. > > Cheers, > Julien, thanks for the explanation. The question we're not sure if NMI should be injected on receiving HMP/QMP "nmi" command. It means it's not clear what behavior we should have for this command on ARM. However, I have one more unrelated question: "pseudo" NMI on ARM64 should be PPI? I mean SPI can't be "pseudo" NMI. Thanks, Gavin
On 1/29/20 3:46 AM, Gavin Shan wrote: > On 1/28/20 7:29 PM, Julien Thierry wrote: >> Hi Gavin, >> >> On 1/28/20 6:48 AM, Gavin Shan wrote: >>> [including more folks into the discussion] >>> >>>> On Fri, 17 Jan 2020 at 14:00, Peter Maydell >>>> <peter.maydell@linaro.org> wrote: >>>>> On Thu, 19 Dec 2019 at 04:06, Gavin Shan <gshan@redhat.com> wrote: >>>>>> This supports NMI injection for virtual machine and currently it's >>>>>> only >>>>>> supported on GICv3 controller, which is emulated by qemu or host >>>>>> kernel. >>>>>> The design is highlighted as below: >>>>>> >>>>>> * The NMI is identified by its priority (0x20). In the guest (linux) >>>>>> kernel, the GICC_PMR is set to 0x80, to block all interrupts except >>>>>> the NMIs when the external interrupt is disabled. It means the FIQ >>>>>> and IRQ bit in PSTATE isn't touched when the functionality (NMI) is >>>>>> functional. >>>>>> * LPIs aren't considered as NMIs because of their nature. It means >>>>>> NMI >>>>>> is either SPI or PPI. Besides, the NMIs are injected in round-robin >>>>>> fashion is there are multiple NMIs existing. >>>>>> * When the GICv3 controller is emulated by qemu, the interrupt states >>>>>> (e.g. enabled, priority) is fetched from the corresponding data >>>>>> struct >>>>>> directly. However, we have to pause all CPUs to fetch the interrupt >>>>>> states from host in advance if the GICv3 controller is emulated by >>>>>> host. >>>>>> >>>>>> The testing scenario is to tweak guest (linux) kernel where the >>>>>> pl011 SPI >>>>>> can be enabled as NMI by request_nmi(). Check "/proc/interrupts" >>>>>> after injecting >>>>>> several NMIs, to see if the interrupt count is increased or not. >>>>>> The result >>>>>> is just as expected. >>>>>> >>>> >>>> So, QEMU is trying to emulate actual hardware. None of this >>>> looks to me like what GICv3 hardware does... If you want to >>>> have the virt board send an interrupt, do it the usual way >>>> by wiring up a qemu_irq from some device to the GIC, please. >>>> (More generally, there is no concept of an "NMI" in the GIC; >>>> there are just interrupts at varying possible guest-programmable >>>> priority levels.) >>>> >>> >>> Peter, I missed to read your reply in time and apologies for late >>> response. >>> >>> Yes, there is no concept of "NMI" in the GIC from hardware perspective. >>> However, NMI has been supported from the software by kernel commit >>> bc3c03ccb4641 ("arm64: Enable the support of pseudo-NMIs"). The NMIs >>> have higher priority than normal ones. NMIs are deliverable after >>> local_irq_disable() because the SYS_ICC_PMR_EL1 is tweaked so that >>> normal interrupts are masked only. >>> >>> It's unclear about the purpose of "nmi" QMP/HMP command. It's why I >>> put a RFC tag. The command has been supported by multiple architects >>> including x86/ppc. However, they are having different behaviors. The >>> system will be restarted on ppc with this command, but a NMI is injected >>> through LAPIC on x86. So I'm not sure what architect (system reset on >>> ppc or injecting NMI on x86) aarch64 should follow. >>> >> >> As Peter stated, there is no NMI concept on aarch64 hardware. The >> pseudo-NMI in the Linux port is purely a software concept. The OS >> itself decides which interrupts should have the "NMI" properties and >> sets them up accordingly. >> >> For QEMU to inject a pseudo-NMI into the guest would require it not >> only to know that the guest supports that feature. But also how such >> an interrupt has to be set up (currently there is no guaranty that the >> priority used for the NMI and the mask should stay the same across >> Linux version as it is purely internal to GICv3/arm64, no generic kAPI >> nor uAPI have access to it). And also, you would probably need to know >> what is handling the NMI you are injecting. >> >> QEMU shouldn't try to guess "that might be dealt as an NMI, lets raise >> it". >> >> I'm not familiar with the QMP/HMP nor the inner workings of QEMU, but >> if for some reason QEMU requires to trigger an NMI-like mechanic on >> aarch64, a proper way might be through para-virt. Having some >> "qemu-nmi-driver" in linux which calls "request_nmi()" and does the >> proper handling expected by QEMU. >> >> Cheers, >> > > Julien, thanks for the explanation. The question we're not sure if NMI > should > be injected on receiving HMP/QMP "nmi" command. It means it's not clear > what > behavior we should have for this command on ARM. However, I have one more > unrelated question: "pseudo" NMI on ARM64 should be PPI? I mean SPI can't > be "pseudo" NMI. > I'm not sure I understand why you say "SPI can't be "pseudo" NMI". Currently both PPI and SPI are supported in the "pseudo" NMI scheme. Do you think that should not be the case? If so, can you elaborate? Thanks,
On 2020-01-29 02:44, Alexey Kardashevskiy wrote: > On 28/01/2020 17:48, Gavin Shan wrote: >> but a NMI is injected >> through LAPIC on x86. So I'm not sure what architect (system reset on >> ppc or injecting NMI on x86) aarch64 should follow. > > I'd say whatever triggers in-kernel debugger or kdump but I am not > familiar with ARM at all :) All that is completely OS specific, and has no relation to the architecture. As I mentioned in another part of the thread, the closest thing to this would be to implement SDEI together with an IMPDEF mechanism to enter it (or even generate a RAS error). On the other hand, SDEI is pretty horrible, and means either KVM or QEMU acting like a firmware for the guest. To say that I'm not keen is a massive understatement. M.
On 1/29/20 6:57 PM, Julien Thierry wrote: > On 1/29/20 3:46 AM, Gavin Shan wrote: >> On 1/28/20 7:29 PM, Julien Thierry wrote: .../... >> >> Julien, thanks for the explanation. The question we're not sure if NMI should >> be injected on receiving HMP/QMP "nmi" command. It means it's not clear what >> behavior we should have for this command on ARM. However, I have one more >> unrelated question: "pseudo" NMI on ARM64 should be PPI? I mean SPI can't >> be "pseudo" NMI. >> > > I'm not sure I understand why you say "SPI can't be "pseudo" NMI". Currently both PPI and SPI are supported in the "pseudo" NMI scheme. Do you think that should not be the case? If so, can you elaborate? > > Thanks, > Julien, NMI interrupt is connected to the system by request_nmi() where we have a check as below. -EINVAL will be returned from request_nmi() on those interrupts whose descriptors aren't marked with IRQ_NOAUTOEN. SPI falls into this category. Please refer to below code snippet extracted from gic_irq_domain_map() for more details. kernel/irq/manage.c::request_nmi() int request_nmi(unsigned int irq, irq_handler_t handler, unsigned long irqflags, const char *name, void *dev_id) { : if (!desc || irq_settings_can_autoenable(desc) || !irq_settings_can_request(desc) || WARN_ON(irq_settings_is_per_cpu_devid(desc)) || !irq_supports_nmi(desc)) return -EINVAL; : } acpi_register_gsi irq_create_fwspec_mapping irq_domain_alloc_irqs __irq_domain_alloc_irqs irq_domain_alloc_irqs_hierarchy gic_irq_domain_alloc # irq_domain_ops::alloc gic_irq_domain_map drivers/irqchip/irq-gic-v3.c::gic_irq_domain_map() static int gic_irq_domain_map(struct irq_domain *d, unsigned int irq, irq_hw_number_t hw) { : switch (__get_intid_range(hw)) { case PPI_RANGE: case EPPI_RANGE: irq_set_percpu_devid(irq); irq_domain_set_info(d, irq, hw, chip, d->host_data, handle_percpu_devid_irq, NULL, NULL); irq_set_status_flags(irq, IRQ_NOAUTOEN); <<<< this flag break; case SPI_RANGE: case ESPI_RANGE: irq_domain_set_info(d, irq, hw, chip, d->host_data, handle_fasteoi_irq, NULL, NULL); irq_set_probe(irq); irqd_set_single_target(irq_desc_get_irq_data(irq_to_desc(irq))); break; : } : } Thanks, Gavin
Gavin, On 2020-01-29 21:54, Gavin Shan wrote: > On 1/29/20 6:57 PM, Julien Thierry wrote: >> On 1/29/20 3:46 AM, Gavin Shan wrote: >>> On 1/28/20 7:29 PM, Julien Thierry wrote: > > .../... > >>> >>> Julien, thanks for the explanation. The question we're not sure if >>> NMI should >>> be injected on receiving HMP/QMP "nmi" command. It means it's not >>> clear what >>> behavior we should have for this command on ARM. However, I have one >>> more >>> unrelated question: "pseudo" NMI on ARM64 should be PPI? I mean SPI >>> can't >>> be "pseudo" NMI. >>> >> >> I'm not sure I understand why you say "SPI can't be "pseudo" NMI". >> Currently both PPI and SPI are supported in the "pseudo" NMI scheme. >> Do you think that should not be the case? If so, can you elaborate? >> >> Thanks, >> > > Julien, NMI interrupt is connected to the system by request_nmi() where > we have > a check as below. -EINVAL will be returned from request_nmi() on those > interrupts > whose descriptors aren't marked with IRQ_NOAUTOEN. SPI falls into this > category. The IRQ_NOAUTOEN is set on PPIs because you can't enable them all at once, for obvious reasons. This doesn't mean you cannot set it on other interrupt classes, including SPIs. It is actually a fairly common thing to do when you want to decouple requesting the interrupt from the enabling, if you do not want the interrupt to be able to fire right away. M.
On 1/30/20 9:58 PM, Marc Zyngier wrote: > On 2020-01-29 21:54, Gavin Shan wrote: >> On 1/29/20 6:57 PM, Julien Thierry wrote: >>> On 1/29/20 3:46 AM, Gavin Shan wrote: >>>> On 1/28/20 7:29 PM, Julien Thierry wrote: >> >> .../... >> >>>> >>>> Julien, thanks for the explanation. The question we're not sure if NMI should >>>> be injected on receiving HMP/QMP "nmi" command. It means it's not clear what >>>> behavior we should have for this command on ARM. However, I have one more >>>> unrelated question: "pseudo" NMI on ARM64 should be PPI? I mean SPI can't >>>> be "pseudo" NMI. >>>> >>> >>> I'm not sure I understand why you say "SPI can't be "pseudo" NMI". Currently both PPI and SPI are supported in the "pseudo" NMI scheme. Do you think that should not be the case? If so, can you elaborate? >>> >>> Thanks, >>> >> >> Julien, NMI interrupt is connected to the system by request_nmi() where we have >> a check as below. -EINVAL will be returned from request_nmi() on those >> interrupts >> whose descriptors aren't marked with IRQ_NOAUTOEN. SPI falls into this category. > > The IRQ_NOAUTOEN is set on PPIs because you can't enable them all at once, > for obvious reasons. > > This doesn't mean you cannot set it on other interrupt classes, including SPIs. > It is actually a fairly common thing to do when you want to decouple requesting > the interrupt from the enabling, if you do not want the interrupt to be able to > fire right away. > > M. Marc, Ok, thanks for the details, which make things clear. Thanks, Gavin
On 1/29/20 8:04 PM, Marc Zyngier wrote: > On 2020-01-29 02:44, Alexey Kardashevskiy wrote: >> On 28/01/2020 17:48, Gavin Shan wrote: >>> but a NMI is injected >>> through LAPIC on x86. So I'm not sure what architect (system reset on >>> ppc or injecting NMI on x86) aarch64 should follow. >> >> I'd say whatever triggers in-kernel debugger or kdump but I am not >> familiar with ARM at all :) > > All that is completely OS specific, and has no relation to the architecture. > As I mentioned in another part of the thread, the closest thing to this > would be to implement SDEI together with an IMPDEF mechanism to enter it > (or even generate a RAS error). > > On the other hand, SDEI is pretty horrible, and means either KVM or QEMU > acting like a firmware for the guest. To say that I'm not keen is a massive > understatement. > > M. Marc, could you please explain a bit about "IMPDEF mechanism"? I'm not sure if it means a non-standard SDEI event should be used, corresponding to the HMP/QMP "nmi" command. Also, If I'm correct, you agree that a crash dump should be triggered on arm64 guest once HMP/QMP "nmi" command is issued? I also dig into SDEI a bit. It seems the SDEI support in QEMU isn't upstream yet: https://patchew.org/QEMU/20191105091056.9541-1-guoheyi@huawei.com/ Thanks, Gavin
On 2020-01-31 06:59, Gavin Shan wrote: > On 1/29/20 8:04 PM, Marc Zyngier wrote: >> On 2020-01-29 02:44, Alexey Kardashevskiy wrote: >>> On 28/01/2020 17:48, Gavin Shan wrote: >>>> but a NMI is injected >>>> through LAPIC on x86. So I'm not sure what architect (system reset >>>> on >>>> ppc or injecting NMI on x86) aarch64 should follow. >>> >>> I'd say whatever triggers in-kernel debugger or kdump but I am not >>> familiar with ARM at all :) >> >> All that is completely OS specific, and has no relation to the >> architecture. >> As I mentioned in another part of the thread, the closest thing to >> this >> would be to implement SDEI together with an IMPDEF mechanism to enter >> it >> (or even generate a RAS error). >> >> On the other hand, SDEI is pretty horrible, and means either KVM or >> QEMU >> acting like a firmware for the guest. To say that I'm not keen is a >> massive >> understatement. >> >> M. > > Marc, could you please explain a bit about "IMPDEF mechanism"? I'm not > sure if > it means a non-standard SDEI event should be used, corresponding to the > HMP/QMP > "nmi" command. SDEI doesn't quite describe *why* and *how* you enter it. You just get there by some mean (SError, Group-0 interrupt, or IMPlementation DEFined mechanism). It is then for the SDEI implementation to sort it out and enter the OS using the registered entry point. > Also, If I'm correct, you agree that a crash dump should be triggered > on arm64 > guest once HMP/QMP "nmi" command is issued? No, I don't agree. QEMU and KVM are OS agnostic, and there is nothing in the ARMv8 architecture that talks about "crash dumps". If your "nmi" command generates a SDEI event and that event gets dispatched to the guest, it is entirely the guest's responsibility to do whatever it wants. We should stay clear of assuming behaviours. > I also dig into SDEI a bit. It seems the SDEI support in QEMU isn't > upstream yet: > > https://patchew.org/QEMU/20191105091056.9541-1-guoheyi@huawei.com/ And I'm glad. SDEI, as I said, is absolutely horrible. I'm also very fortunate to have been CC'd on this series on an email address I cannot read. This would have huge impacts on both QEMU and KVM, and this needs more than a knee jerk reaction to support a QEMU command. And to be honest, if what you require is the guest kernel to panic, why don't you just implement a QEMU-specific driver in Linux that does just that? Some kind of watchdog driver, maybe? Thanks, M.
On 1/31/20 8:39 PM, Marc Zyngier wrote: > On 2020-01-31 06:59, Gavin Shan wrote: >> On 1/29/20 8:04 PM, Marc Zyngier wrote: >>> On 2020-01-29 02:44, Alexey Kardashevskiy wrote: >>>> On 28/01/2020 17:48, Gavin Shan wrote: >>>>> but a NMI is injected >>>>> through LAPIC on x86. So I'm not sure what architect (system reset on >>>>> ppc or injecting NMI on x86) aarch64 should follow. >>>> >>>> I'd say whatever triggers in-kernel debugger or kdump but I am not >>>> familiar with ARM at all :) >>> >>> All that is completely OS specific, and has no relation to the architecture. >>> As I mentioned in another part of the thread, the closest thing to this >>> would be to implement SDEI together with an IMPDEF mechanism to enter it >>> (or even generate a RAS error). >>> >>> On the other hand, SDEI is pretty horrible, and means either KVM or QEMU >>> acting like a firmware for the guest. To say that I'm not keen is a massive >>> understatement. >>> >>> M. >> >> Marc, could you please explain a bit about "IMPDEF mechanism"? I'm not sure if >> it means a non-standard SDEI event should be used, corresponding to the HMP/QMP >> "nmi" command. > > SDEI doesn't quite describe *why* and *how* you enter it. You just get there by > some mean (SError, Group-0 interrupt, or IMPlementation DEFined mechanism). > It is then for the SDEI implementation to sort it out and enter the OS using the > registered entry point. > Thanks for the explanation, which make things much clearer. >> Also, If I'm correct, you agree that a crash dump should be triggered on arm64 >> guest once HMP/QMP "nmi" command is issued? > > No, I don't agree. QEMU and KVM are OS agnostic, and there is nothing in the ARMv8 > architecture that talks about "crash dumps". If your "nmi" command generates > a SDEI event and that event gets dispatched to the guest, it is entirely the guest's > responsibility to do whatever it wants. We should stay clear of assuming behaviours. > Ok. Thank you for your clarification. >> I also dig into SDEI a bit. It seems the SDEI support in QEMU isn't upstream yet: >> >> https://patchew.org/QEMU/20191105091056.9541-1-guoheyi@huawei.com/ > > And I'm glad. SDEI, as I said, is absolutely horrible. I'm also very fortunate > to have been CC'd on this series on an email address I cannot read. > This would have huge impacts on both QEMU and KVM, and this needs more than > a knee jerk reaction to support a QEMU command. > > And to be honest, if what you require is the guest kernel to panic, why don't > you just implement a QEMU-specific driver in Linux that does just that? > Some kind of watchdog driver, maybe? > Marc, sorry for the delay and didn't come to you in time because I wanted to figure out the mechanism, which helps to get similar output as x86/ppc: NMI (or reset exception) is received as indication to errors, possibly panic and restart the guest kernel. The mechanism I figured out is to inject SError to guest, as below snippet shows. It helps to get a panic and guest rebooting, which looks similar to what x86/ppc have. I can post the patch as RFC if it's right direction to go :) Note: I'm still investigating the code to see how SError can be injected when TCG is used. I think we need same function when TCG is enabled, or it's something for future. static void do_inject_nmi_kvm(CPUState *cpu, Error **errp) { struct kvm_vcpu_events events; int ret; : memset(&events, 0, sizeof(events)); events.exception.serror_pending = 1; ret = kvm_vcpu_ioctl(CPU(cpu), KVM_SET_VCPU_EVENTS, &events); : } # echo 1 > /proc/sys/kernel/panic # (qemu) nmi [ 812.510613] SError Interrupt on CPU0, code 0xbf000000 -- SError [ 812.510617] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.5.0-rc2-00187-gf72202430e30 #2 [ 812.510617] Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015 [ 812.510618] pstate: 40400085 (nZcv daIf +PAN -UAO) [ 812.510619] pc : cpu_do_idle+0x48/0x58 [ 812.510619] lr : arch_cpu_idle+0x30/0x238 [ 812.510620] sp : ffff8000112c3e80 [ 812.510620] pmr_save: 000000f0 [ 812.510621] x29: ffff8000112c3e80 x28: 0000000040e10018 [ 812.510622] x27: 000000033e50d340 x26: 0000000000000000 [ 812.510623] x25: 0000000000000000 x24: ffff8000112ca21c [ 812.510624] x23: ffff800010f98cb8 x22: ffff8000112c98c8 [ 812.510625] x21: ffff8000112ca1f8 x20: 0000000000000001 [ 812.510626] x19: ffff800010f86018 x18: 0000000000000000 [ 812.510627] x17: 0000000000000000 x16: 0000000000000000 [ 812.510628] x15: 0000000000000000 x14: 0000000000000000 [ 812.510629] x13: 0000000000000000 x12: ffff0002fe640100 [ 812.510630] x11: ffffffffffffffff x10: 00000000000009f0 [ 812.510631] x9 : ffff800010088928 x8 : ffff8000112d28d0 [ 812.510632] x7 : ffff8002ed63a000 x6 : 0000002fe2092876 [ 812.510633] x5 : 00ffffffffffffff x4 : ffff8002ed63a000 [ 812.510634] x3 : 0000000000001bce x2 : 00000000000000f0 [ 812.510635] x1 : 0000000000000000 x0 : 0000000000000060 [ 812.510636] Kernel panic - not syncing: Asynchronous SError Interrupt [ 812.510637] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.5.0-rc2-00187-gf72202430e30 #2 [ 812.510638] Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015 [ 812.510638] Call trace: [ 812.510639] dump_backtrace+0x0/0x1a8 [ 812.510639] show_stack+0x1c/0x28 [ 812.510640] dump_stack+0xbc/0x100 [ 812.510640] panic+0x168/0x370 [ 812.510640] nmi_panic+0x68/0x98 [ 812.510641] arm64_serror_panic+0x84/0x90 [ 812.510641] do_serror+0x88/0x140 [ 812.510642] el1_error+0x8c/0x108 [ 812.510642] cpu_do_idle+0x48/0x58 [ 812.510643] default_idle_call+0x44/0x4c [ 812.510643] do_idle+0x228/0x2d0 [ 812.510644] cpu_startup_entry+0x28/0x48 [ 812.510644] rest_init+0xdc/0xe8 [ 812.510645] arch_call_rest_init+0x14/0x1c [ 812.510645] start_kernel+0x494/0x4c0 [ 812.510675] SMP: stopping secondary CPUs [ 812.510676] Kernel Offset: disabled [ 812.510676] CPU features: 0x04402,22000438 [ 812.510677] Memory Limit: none : <guest is rebooted> Thanks, Gavin
On Tue, 4 Feb 2020 at 03:51, Gavin Shan <gshan@redhat.com> wrote: > Note: I'm still investigating the code to see how SError can be injected when TCG > is used. I think we need same function when TCG is enabled, or it's something for > future. TCG doesn't currently implement SError -- it could be added, but there's a bit of plumbing you'd need to do to get it to work and to ensure the exception is taken, routed and masked correctly. thanks -- PMM
Peter Maydell <peter.maydell@linaro.org> 于2020年2月4日周二 下午9:22写道: > On Tue, 4 Feb 2020 at 03:51, Gavin Shan <gshan@redhat.com> wrote: > > Note: I'm still investigating the code to see how SError can be injected > when TCG > > is used. I think we need same function when TCG is enabled, or it's > something for > > future. > > TCG doesn't currently implement SError -- it could be added, but > there's a bit of plumbing you'd need to do to get it to work and to > ensure the exception is taken, routed and masked correctly. > > Thanks, Peter. Yeah, I will post v2 RFC patch shortly. Please let me know if there are anything wrong :) Thanks, Gavin > thanks > -- PMM >
On Tue, 4 Feb 2020 14:51:39 +1100 Gavin Shan <gshan@redhat.com> wrote: [...] > The mechanism I figured out is to inject SError to guest, as below snippet shows. It > helps to get a panic and guest rebooting, which looks similar to what x86/p > pc have. I think being able to inject a SError is a reasonable thing to have for QEMU. After all, the RAS spec[1] says that "An SError interrupt can also be generated for IMPLEMENTATION DEFINED causes." What I'm more concerned about is how this is going to coexist with the RAS extension itself (which is mandated with ARMv8.2), and whether we should classify this with a particular syndrome (by populating the ESR field in the event injection structure), if only to be able to distinguish the SW-injected SError from a RAS error propagated by KVM. It looks to me that this SError could be described as a "Recoverable Error", but I'd like someone who knows more about the RAS extension to comment on this. Thanks, M. [1] https://static.docs.arm.com/ddi0587/cb/2019_07_05_DD_0587_C_b.pdf
diff --git a/hw/arm/virt.c b/hw/arm/virt.c index 39ab5f47e0..fc58ee70b4 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -71,6 +71,8 @@ #include "hw/mem/pc-dimm.h" #include "hw/mem/nvdimm.h" #include "hw/acpi/generic_event_device.h" +#include "hw/nmi.h" +#include "hw/intc/arm_gicv3.h" #define DEFINE_VIRT_MACHINE_LATEST(major, minor, latest) \ static void virt_##major##_##minor##_class_init(ObjectClass *oc, \ @@ -1980,6 +1982,25 @@ static void virt_machine_device_unplug_request_cb(HotplugHandler *hotplug_dev, " type: %s", object_get_typename(OBJECT(dev))); } +static void virt_nmi(NMIState *n, int cpu_index, Error **errp) +{ + VirtMachineState *vms = VIRT_MACHINE(n); + ARMGICv3CommonClass *agcc; + + if (vms->gic_version != 3) { + error_setg(errp, "NMI is only supported by GICv3"); + return; + } + + agcc = ARM_GICV3_COMMON_GET_CLASS(vms->gic); + if (agcc->inject_nmi) { + agcc->inject_nmi(vms->gic, cpu_index, errp); + } else { + error_setg(errp, "NMI injection isn't supported by %s", + object_get_typename(OBJECT(vms->gic))); + } +} + static HotplugHandler *virt_machine_get_hotplug_handler(MachineState *machine, DeviceState *dev) { @@ -2025,6 +2046,7 @@ static void virt_machine_class_init(ObjectClass *oc, void *data) { MachineClass *mc = MACHINE_CLASS(oc); HotplugHandlerClass *hc = HOTPLUG_HANDLER_CLASS(oc); + NMIClass *nc = NMI_CLASS(oc); mc->init = machvirt_init; /* Start with max_cpus set to 512, which is the maximum supported by KVM. @@ -2051,6 +2073,7 @@ static void virt_machine_class_init(ObjectClass *oc, void *data) hc->pre_plug = virt_machine_device_pre_plug_cb; hc->plug = virt_machine_device_plug_cb; hc->unplug_request = virt_machine_device_unplug_request_cb; + nc->nmi_monitor_handler = virt_nmi; mc->numa_mem_supported = true; mc->auto_enable_numa_with_memhp = true; } @@ -2136,6 +2159,7 @@ static const TypeInfo virt_machine_info = { .instance_init = virt_instance_init, .interfaces = (InterfaceInfo[]) { { TYPE_HOTPLUG_HANDLER }, + { TYPE_NMI }, { } }, }; diff --git a/hw/intc/arm_gicv3.c b/hw/intc/arm_gicv3.c index 66eaa97198..d3409cb6ef 100644 --- a/hw/intc/arm_gicv3.c +++ b/hw/intc/arm_gicv3.c @@ -338,6 +338,81 @@ static void gicv3_set_irq(void *opaque, int irq, int level) } } +static bool arm_gicv3_inject_nmi_once(GICv3State*s, int start, int end) +{ + GICv3CPUState *cs; + int irq_count = (s->num_irq + (GIC_INTERNAL * (s->num_cpu - 1))); + int i, cpu, irq; + + /* SPIs */ + for (i = start; (i < end) && (i < (s->num_irq - GIC_INTERNAL)); i++) { + if (gicv3_gicd_enabled_test(s, i + GIC_INTERNAL) && + s->gicd_ipriority[i + GIC_INTERNAL] == 0x20) { + + /* + * Reset the level and toggling the pending bit will ensure + * the interrupt is queued. + */ + if (gicv3_gicd_level_test(s, i + GIC_INTERNAL)) { + gicv3_set_irq(s, i, false); + } + + gicv3_gicd_pending_set(s, i + GIC_INTERNAL); + gicv3_set_irq(s, i, true); + + s->last_nmi_index = (i + 1); + return true; + } + } + + /* PPIs */ + if (start < (s->num_irq - GIC_INTERNAL)) { + start = (s->num_irq - GIC_INTERNAL); + } + + for (i = start; (i < end) && (i < irq_count); i++) { + cpu = (i - ((s->num_irq - GIC_INTERNAL))) / GIC_INTERNAL; + irq = (i - ((s->num_irq - GIC_INTERNAL))) % GIC_INTERNAL; + cs = &s->cpu[cpu]; + + if ((cs->gicr_ienabler0 & (1 << irq)) && + cs->gicr_ipriorityr[irq] == 0x20) { + + if (extract32(cs->level, irq, 1)) { + gicv3_set_irq(s, i, false); + } + + deposit32(cs->gicr_ipendr0, irq, 1, 1); + gicv3_set_irq(s, i, true); + + s->last_nmi_index = (i + 1); + if (s->last_nmi_index > irq_count) { + s->last_nmi_index = 0; + } + + return true; + } + } + + return false; +} + +static void arm_gicv3_inject_nmi(DeviceState *dev, int cpu_index, Error **errp) +{ + GICv3State *s = ARM_GICV3(dev); + int irq_count = (s->num_irq + (GIC_INTERNAL * (s->num_cpu - 1))); + bool injected; + + injected = arm_gicv3_inject_nmi_once(s, s->last_nmi_index, irq_count); + if (!injected) { + injected = arm_gicv3_inject_nmi_once(s, 0, s->last_nmi_index); + } + + if (!injected) { + error_setg(errp, "No NMI found"); + } +} + static void arm_gicv3_post_load(GICv3State *s) { /* Recalculate our cached idea of the current highest priority @@ -395,6 +470,7 @@ static void arm_gicv3_class_init(ObjectClass *klass, void *data) ARMGICv3CommonClass *agcc = ARM_GICV3_COMMON_CLASS(klass); ARMGICv3Class *agc = ARM_GICV3_CLASS(klass); + agcc->inject_nmi = arm_gicv3_inject_nmi; agcc->post_load = arm_gicv3_post_load; device_class_set_parent_realize(dc, arm_gic_realize, &agc->parent_realize); } diff --git a/hw/intc/arm_gicv3_kvm.c b/hw/intc/arm_gicv3_kvm.c index 9c7f4ab871..b076d67c52 100644 --- a/hw/intc/arm_gicv3_kvm.c +++ b/hw/intc/arm_gicv3_kvm.c @@ -31,6 +31,7 @@ #include "gicv3_internal.h" #include "vgic_common.h" #include "migration/blocker.h" +#include "sysemu/cpus.h" #ifdef DEBUG_GICV3_KVM #define DPRINTF(fmt, ...) \ @@ -506,6 +507,96 @@ static void kvm_arm_gicv3_put(GICv3State *s) } } +static bool kvm_arm_gicv3_inject_nmi_once(GICv3State *s, int start, int end) +{ + GICv3CPUState *cs; + int irq_count = (s->num_irq + (GIC_INTERNAL * (s->num_cpu - 1))); + int i, cpu, irq; + + /* SPIs */ + for (i = start; (i < end) && (i < (s->num_irq - GIC_INTERNAL)); i++) { + if (gicv3_gicd_enabled_test(s, i + GIC_INTERNAL) && + s->gicd_ipriority[i + GIC_INTERNAL] == 0x20) { + kvm_arm_gicv3_set_irq(s, i, true); + + s->last_nmi_index = (i + 1); + return true; + } + } + + /* PPIs */ + if (start < (s->num_irq - GIC_INTERNAL)) { + start = (s->num_irq - GIC_INTERNAL); + } + + for (i = start; (i < end) && (i < irq_count); i++) { + cpu = (i - ((s->num_irq - GIC_INTERNAL))) / GIC_INTERNAL; + irq = (i - ((s->num_irq - GIC_INTERNAL))) % GIC_INTERNAL; + cs = &s->cpu[cpu]; + + if ((cs->gicr_ienabler0 & (1 << irq)) && + cs->gicr_ipriorityr[irq] == 0x20) { + kvm_arm_gicv3_set_irq(s, i, true); + + s->last_nmi_index = (i + 1); + if (s->last_nmi_index > irq_count) { + s->last_nmi_index = 0; + } + + return true; + } + } + + return false; +} + +static void kvm_arm_gicv3_snapshot(GICv3State *s) +{ + GICv3CPUState *c; + uint32_t val; + int i, j; + + pause_all_vcpus(); + + kvm_dist_getbmp(s, GICD_ISENABLER, s->enabled); + kvm_dist_get_priority(s, GICD_IPRIORITYR, s->gicd_ipriority); + for (i = 0; i < s->num_cpu; i++) { + c = &s->cpu[i]; + + kvm_gicr_access(s, GICR_ISENABLER0, i, &val, false); + c->gicr_ienabler0 = val; + + for (j = 0; j < GIC_INTERNAL; j += 4) { + kvm_gicr_access(s, GICR_IPRIORITYR + j, i, &val, false); + c->gicr_ipriorityr[j] = extract32(val, 0, 8); + c->gicr_ipriorityr[j + 1] = extract32(val, 8, 8); + c->gicr_ipriorityr[j + 2] = extract32(val, 16, 8); + c->gicr_ipriorityr[j + 3] = extract32(val, 24, 8); + } + } + + resume_all_vcpus(); +} + +static void kvm_arm_gicv3_inject_nmi(DeviceState *dev, + int cpu_index, Error **errp) +{ + GICv3State *s = KVM_ARM_GICV3(dev); + int irq_count = (s->num_irq + (GIC_INTERNAL * (s->num_cpu - 1))); + bool injected; + + kvm_arm_gicv3_snapshot(s); + + injected = kvm_arm_gicv3_inject_nmi_once(s, s->last_nmi_index, irq_count); + if (!injected) { + injected = kvm_arm_gicv3_inject_nmi_once(s, 0, s->last_nmi_index); + } + + if (!injected) { + error_setg(errp, "No NMI found"); + } +} + static void kvm_arm_gicv3_get(GICv3State *s) { uint32_t regl, regh, reg; @@ -882,6 +973,7 @@ static void kvm_arm_gicv3_class_init(ObjectClass *klass, void *data) ARMGICv3CommonClass *agcc = ARM_GICV3_COMMON_CLASS(klass); KVMARMGICv3Class *kgc = KVM_ARM_GICV3_CLASS(klass); + agcc->inject_nmi = kvm_arm_gicv3_inject_nmi; agcc->pre_save = kvm_arm_gicv3_get; agcc->post_load = kvm_arm_gicv3_put; device_class_set_parent_realize(dc, kvm_arm_gicv3_realize, diff --git a/include/hw/intc/arm_gicv3_common.h b/include/hw/intc/arm_gicv3_common.h index 31ec9a1ae4..0ae9c45aa2 100644 --- a/include/hw/intc/arm_gicv3_common.h +++ b/include/hw/intc/arm_gicv3_common.h @@ -225,6 +225,7 @@ struct GICv3State { int dev_fd; /* kvm device fd if backed by kvm vgic support */ Error *migration_blocker; + int last_nmi_index; /* Distributor */ @@ -291,6 +292,7 @@ typedef struct ARMGICv3CommonClass { SysBusDeviceClass parent_class; /*< public >*/ + void (*inject_nmi)(DeviceState *dev, int cpu_index, Error **errp); void (*pre_save)(GICv3State *s); void (*post_load)(GICv3State *s); } ARMGICv3CommonClass;
This supports NMI injection for virtual machine and currently it's only supported on GICv3 controller, which is emulated by qemu or host kernel. The design is highlighted as below: * The NMI is identified by its priority (0x20). In the guest (linux) kernel, the GICC_PMR is set to 0x80, to block all interrupts except the NMIs when the external interrupt is disabled. It means the FIQ and IRQ bit in PSTATE isn't touched when the functionality (NMI) is functional. * LPIs aren't considered as NMIs because of their nature. It means NMI is either SPI or PPI. Besides, the NMIs are injected in round-robin fashion is there are multiple NMIs existing. * When the GICv3 controller is emulated by qemu, the interrupt states (e.g. enabled, priority) is fetched from the corresponding data struct directly. However, we have to pause all CPUs to fetch the interrupt states from host in advance if the GICv3 controller is emulated by host. The testing scenario is to tweak guest (linux) kernel where the pl011 SPI can be enabled as NMI by request_nmi(). Check "/proc/interrupts" after injecting several NMIs, to see if the interrupt count is increased or not. The result is just as expected. Signed-off-by: Gavin Shan <gshan@redhat.com> --- hw/arm/virt.c | 24 ++++++++ hw/intc/arm_gicv3.c | 76 ++++++++++++++++++++++++ hw/intc/arm_gicv3_kvm.c | 92 ++++++++++++++++++++++++++++++ include/hw/intc/arm_gicv3_common.h | 2 + 4 files changed, 194 insertions(+)