diff mbox series

[4/5] ppc/spapr: Don't kill the guest if a recovered FWNMI machine check delivery fails

Message ID 20200317050215.159334-5-npiggin@gmail.com (mailing list archive)
State New, archived
Headers show
Series FWNMI follow up patches | expand

Commit Message

Nicholas Piggin March 17, 2020, 5:02 a.m. UTC
Try to be tolerant of errors if the machine check had been recovered
by the host.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 hw/ppc/spapr_events.c | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

Comments

Greg Kurz March 17, 2020, 4:57 p.m. UTC | #1
On Tue, 17 Mar 2020 15:02:14 +1000
Nicholas Piggin <npiggin@gmail.com> wrote:

> Try to be tolerant of errors if the machine check had been recovered
> by the host.
> 
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
> ---

Same comment as previous patch on multi-line error strings and
warn_report() in the !recovered case.

>  hw/ppc/spapr_events.c | 25 ++++++++++++++++++-------
>  1 file changed, 18 insertions(+), 7 deletions(-)
> 
> diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
> index d35151eeb0..3f524cb0ca 100644
> --- a/hw/ppc/spapr_events.c
> +++ b/hw/ppc/spapr_events.c
> @@ -807,13 +807,20 @@ static void spapr_mce_dispatch_elog(PowerPCCPU *cpu, bool recovered)
>      /* get rtas addr from fdt */
>      rtas_addr = spapr_get_rtas_addr();
>      if (!rtas_addr) {
> -        warn_report("FWNMI: Unable to deliver machine check to guest: "
> -                    "rtas_addr not found.");
> -        qemu_system_guest_panicked(NULL);
> +        if (!recovered) {
> +            warn_report("FWNMI: Unable to deliver machine check to guest: "
> +                        "rtas_addr not found.");
> +            qemu_system_guest_panicked(NULL);
> +        } else {
> +            warn_report("FWNMI: Unable to deliver machine check to guest: "
> +                        "rtas_addr not found. Machine check recovered.");
> +        }
>          g_free(ext_elog);
>          return;
>      }
>  
> +    spapr->fwnmi_machine_check_interlock = cpu->vcpu_id;
> +

I don't understand this change.

>      stq_be_phys(&address_space_memory, rtas_addr + RTAS_ERROR_LOG_OFFSET,
>                  env->gpr[3]);
>      cpu_physical_memory_write(rtas_addr + RTAS_ERROR_LOG_OFFSET +
> @@ -850,9 +857,14 @@ void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered)
>           * that CPU called "ibm,nmi-interlock")
>           */
>          if (spapr->fwnmi_machine_check_interlock == cpu->vcpu_id) {
> -            warn_report("FWNMI: Unable to deliver machine check to guest: "
> -                        "nested machine check.");
> -            qemu_system_guest_panicked(NULL);
> +            if (!recovered) {
> +                warn_report("FWNMI: Unable to deliver machine check to guest: "
> +                            "nested machine check.");
> +                qemu_system_guest_panicked(NULL);
> +            } else {
> +                warn_report("FWNMI: Unable to deliver machine check to guest: "
> +                            "nested machine check. Machine check recovered.");
> +            }
>              return;
>          }
>          qemu_cond_wait_iothread(&spapr->fwnmi_machine_check_interlock_cond);
> @@ -880,7 +892,6 @@ void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered)
>          warn_report("Received a fwnmi while migration was in progress");
>      }
>  
> -    spapr->fwnmi_machine_check_interlock = cpu->vcpu_id;
>      spapr_mce_dispatch_elog(cpu, recovered);
>  }
>
Nicholas Piggin March 19, 2020, 2:29 a.m. UTC | #2
Greg Kurz's on March 18, 2020 2:57 am:
> On Tue, 17 Mar 2020 15:02:14 +1000
> Nicholas Piggin <npiggin@gmail.com> wrote:
> 
>> Try to be tolerant of errors if the machine check had been recovered
>> by the host.
>> 
>> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
>> ---
> 
> Same comment as previous patch on multi-line error strings and
> warn_report() in the !recovered case.
> 
>>  hw/ppc/spapr_events.c | 25 ++++++++++++++++++-------
>>  1 file changed, 18 insertions(+), 7 deletions(-)
>> 
>> diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
>> index d35151eeb0..3f524cb0ca 100644
>> --- a/hw/ppc/spapr_events.c
>> +++ b/hw/ppc/spapr_events.c
>> @@ -807,13 +807,20 @@ static void spapr_mce_dispatch_elog(PowerPCCPU *cpu, bool recovered)
>>      /* get rtas addr from fdt */
>>      rtas_addr = spapr_get_rtas_addr();
>>      if (!rtas_addr) {
>> -        warn_report("FWNMI: Unable to deliver machine check to guest: "
>> -                    "rtas_addr not found.");
>> -        qemu_system_guest_panicked(NULL);
>> +        if (!recovered) {
>> +            warn_report("FWNMI: Unable to deliver machine check to guest: "
>> +                        "rtas_addr not found.");
>> +            qemu_system_guest_panicked(NULL);
>> +        } else {
>> +            warn_report("FWNMI: Unable to deliver machine check to guest: "
>> +                        "rtas_addr not found. Machine check recovered.");
>> +        }
>>          g_free(ext_elog);
>>          return;
>>      }
>>  
>> +    spapr->fwnmi_machine_check_interlock = cpu->vcpu_id;
>> +
> 
> I don't understand this change.

If we bail out without delivering the interrupt, we can't take the
interlock otherwise the guest can never release it.

Thanks,
Nick
diff mbox series

Patch

diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
index d35151eeb0..3f524cb0ca 100644
--- a/hw/ppc/spapr_events.c
+++ b/hw/ppc/spapr_events.c
@@ -807,13 +807,20 @@  static void spapr_mce_dispatch_elog(PowerPCCPU *cpu, bool recovered)
     /* get rtas addr from fdt */
     rtas_addr = spapr_get_rtas_addr();
     if (!rtas_addr) {
-        warn_report("FWNMI: Unable to deliver machine check to guest: "
-                    "rtas_addr not found.");
-        qemu_system_guest_panicked(NULL);
+        if (!recovered) {
+            warn_report("FWNMI: Unable to deliver machine check to guest: "
+                        "rtas_addr not found.");
+            qemu_system_guest_panicked(NULL);
+        } else {
+            warn_report("FWNMI: Unable to deliver machine check to guest: "
+                        "rtas_addr not found. Machine check recovered.");
+        }
         g_free(ext_elog);
         return;
     }
 
+    spapr->fwnmi_machine_check_interlock = cpu->vcpu_id;
+
     stq_be_phys(&address_space_memory, rtas_addr + RTAS_ERROR_LOG_OFFSET,
                 env->gpr[3]);
     cpu_physical_memory_write(rtas_addr + RTAS_ERROR_LOG_OFFSET +
@@ -850,9 +857,14 @@  void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered)
          * that CPU called "ibm,nmi-interlock")
          */
         if (spapr->fwnmi_machine_check_interlock == cpu->vcpu_id) {
-            warn_report("FWNMI: Unable to deliver machine check to guest: "
-                        "nested machine check.");
-            qemu_system_guest_panicked(NULL);
+            if (!recovered) {
+                warn_report("FWNMI: Unable to deliver machine check to guest: "
+                            "nested machine check.");
+                qemu_system_guest_panicked(NULL);
+            } else {
+                warn_report("FWNMI: Unable to deliver machine check to guest: "
+                            "nested machine check. Machine check recovered.");
+            }
             return;
         }
         qemu_cond_wait_iothread(&spapr->fwnmi_machine_check_interlock_cond);
@@ -880,7 +892,6 @@  void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered)
         warn_report("Received a fwnmi while migration was in progress");
     }
 
-    spapr->fwnmi_machine_check_interlock = cpu->vcpu_id;
     spapr_mce_dispatch_elog(cpu, recovered);
 }