@@ -11,7 +11,8 @@
#include "qemu/osdep.h"
#include "hw/acpi/ghes.h"
-int acpi_ghes_memory_errors(uint16_t source_id, uint64_t physical_address)
+int acpi_ghes_memory_errors(uint16_t source_id, uint64_t physical_address,
+ bool retry_allowed)
{
return -1;
}
@@ -391,7 +391,7 @@ static void get_hw_error_offsets(uint64_t ghes_addr,
}
static int ghes_record_cper_errors(const void *cper, size_t len,
- uint16_t source_id)
+ uint16_t source_id, bool retry_allowed)
{
uint64_t cper_addr = 0, read_ack_register_addr = 0, read_ack_register;
AcpiGedState *acpi_ged_state;
@@ -424,6 +424,10 @@ static int ghes_record_cper_errors(const void *cper, size_t len,
/* zero means OSPM does not acknowledge the error */
if (!read_ack_register) {
+ if (retry_allowed) {
+ return 1;
+ }
+
error_report("OSPM does not acknowledge previous error,"
" so can not record CPER for current error anymore");
return -1;
@@ -443,7 +447,8 @@ static int ghes_record_cper_errors(const void *cper, size_t len,
return 0;
}
-int acpi_ghes_memory_errors(uint16_t source_id, uint64_t physical_address)
+int acpi_ghes_memory_errors(uint16_t source_id, uint64_t physical_address,
+ bool retry_allowed)
{
/* Memory Error Section Type */
const uint8_t guid[] =
@@ -468,7 +473,8 @@ int acpi_ghes_memory_errors(uint16_t source_id, uint64_t physical_address)
acpi_ghes_build_append_mem_cper(block, physical_address);
/* Report the error */
- ret = ghes_record_cper_errors(block->data, block->len, source_id);
+ ret = ghes_record_cper_errors(block->data, block->len,
+ source_id, retry_allowed);
g_array_free(block, true);
@@ -74,7 +74,8 @@ void acpi_build_hest(GArray *table_data, GArray *hardware_errors,
const char *oem_id, const char *oem_table_id);
void acpi_ghes_add_fw_cfg(AcpiGhesState *vms, FWCfgState *s,
GArray *hardware_errors);
-int acpi_ghes_memory_errors(uint16_t source_id, uint64_t error_physical_addr);
+int acpi_ghes_memory_errors(uint16_t source_id, uint64_t error_physical_addr,
+ bool retry_allowed);
/**
* acpi_ghes_present: Report whether ACPI GHES table is present
@@ -2387,7 +2387,7 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
*/
if (code == BUS_MCEERR_AR) {
kvm_cpu_synchronize_state(c);
- if (!acpi_ghes_memory_errors(ACPI_HEST_SRC_ID_SEA, paddr)) {
+ if (!acpi_ghes_memory_errors(ACPI_HEST_SRC_ID_SEA, paddr, false)) {
kvm_inject_arm_sea(c);
} else {
error_report("failed to record the error");
Multiple CPER errors can be raised on multiple vCPUs at the same time. The error -1 is returned from ghes_record_cper_errors() and QEMU is terminated due to abort() in kvm_arch_on_sigbus_vcpu(). This isn't the correct or expected behaviour, since the affected vCPU can't proceed with execution. It's reasonable to retry if the reported error is transient, for example when the previously reported CPER error hasn't been claimed by the guest yet. Add one more parameter (@retry_allowed) to acpi_ghes_memory_errors(), which is passed down to ghes_record_cper_errors(). A differentiated error number is returned if the previously reported CPER error hasn't been claimed by the guest: 1 if @retry_allowed is true, -1 otherwise. The caller will retry the request if the returned error number is 1. Signed-off-by: Gavin Shan <gshan@redhat.com> --- hw/acpi/ghes-stub.c | 3 ++- hw/acpi/ghes.c | 12 +++++++++--- include/hw/acpi/ghes.h | 3 ++- target/arm/kvm.c | 2 +- 4 files changed, 14 insertions(+), 6 deletions(-)