@@ -549,6 +549,50 @@ static int host_stage2_idmap(u64 addr)
return ret;
}
+static void host_inject_abort(struct kvm_cpu_context *host_ctxt)
+{
+ u64 spsr = read_sysreg_el2(SYS_SPSR);
+ u64 esr = read_sysreg_el2(SYS_ESR);
+ u64 ventry, ec;
+
+ /* Host handles this at EL1: repaint LOW (EL0) syndromes as same-level (CUR) faults */
+ if ((spsr & PSR_MODE_MASK) != PSR_MODE_EL0t) {
+ ec = ESR_ELx_EC(esr);
+ if (ec == ESR_ELx_EC_DABT_LOW)
+ ec = ESR_ELx_EC_DABT_CUR;
+ else if (ec == ESR_ELx_EC_IABT_LOW)
+ ec = ESR_ELx_EC_IABT_CUR;
+ else
+ WARN_ON(1); /* only data/instruction aborts are expected here */
+ esr &= ~ESR_ELx_EC_MASK;
+ esr |= ec << ESR_ELx_EC_SHIFT;
+ }
+
+ /*
+ * Since S1PTW should only ever be set for stage-2 faults, we're pretty
+ * much guaranteed that it won't be set in ESR_EL1 by the hardware. So,
+ * let's repurpose that bit to let the host abort handler tell this
+ * hypervisor-injected abort apart from normal userspace faults
+ * (see is_pkvm_stage2_abort() on the host side).
+ *
+ * Note: although S1PTW is RES0 at EL1, it is guaranteed by the
+ * architecture to be backed by flops, so it should be safe to use.
+ */
+ esr |= ESR_ELx_S1PTW;
+
+ write_sysreg_el1(esr, SYS_ESR);
+ write_sysreg_el1(spsr, SYS_SPSR);
+ write_sysreg_el1(read_sysreg_el2(SYS_ELR), SYS_ELR); /* faulting PC, as seen by the host */
+ write_sysreg_el1(read_sysreg_el2(SYS_FAR), SYS_FAR); /* faulting address */
+
+ ventry = read_sysreg_el1(SYS_VBAR);
+ ventry += get_except64_offset(spsr, PSR_MODE_EL1h, except_type_sync);
+ write_sysreg_el2(ventry, SYS_ELR); /* ERET lands in the host's sync vector */
+
+ spsr = get_except64_cpsr(spsr, system_supports_mte(),
+ read_sysreg_el1(SYS_SCTLR), PSR_MODE_EL1h);
+ write_sysreg_el2(spsr, SYS_SPSR); /* PSTATE the host observes on entry */
+}
+
void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt)
{
struct kvm_vcpu_fault_info fault;
@@ -560,7 +604,11 @@ void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt)
addr = (fault.hpfar_el2 & HPFAR_MASK) << 8;
ret = host_stage2_idmap(addr);
- BUG_ON(ret && ret != -EAGAIN);
+
+ if (ret == -EPERM)
+ host_inject_abort(host_ctxt);
+ else
+ BUG_ON(ret && ret != -EAGAIN);
}
struct pkvm_mem_transition {
@@ -41,6 +41,7 @@
#include <asm/system_misc.h>
#include <asm/tlbflush.h>
#include <asm/traps.h>
+#include <asm/virt.h>
struct fault_info {
int (*fn)(unsigned long far, unsigned int esr,
@@ -257,6 +258,15 @@ static inline bool is_el1_permission_fault(unsigned long addr, unsigned int esr,
return false;
}
+static bool is_pkvm_stage2_abort(unsigned int esr)
+{
+ /*
+ * S1PTW should only ever be set in ESR_EL1 if the pkvm hypervisor
+ * injected a stage-2 abort -- see host_inject_abort(). Gate on pKVM
+ * being initialized so a stray bit can never be misread before then.
+ */
+ return is_pkvm_initialized() && (esr & ESR_ELx_S1PTW);
+}
+
static bool __kprobes is_spurious_el1_translation_fault(unsigned long addr,
unsigned int esr,
struct pt_regs *regs)
@@ -268,6 +278,9 @@ static bool __kprobes is_spurious_el1_translation_fault(unsigned long addr,
(esr & ESR_ELx_FSC_TYPE) != ESR_ELx_FSC_FAULT)
return false;
+ if (is_pkvm_stage2_abort(esr))
+ return false;
+
local_irq_save(flags);
asm volatile("at s1e1r, %0" :: "r" (addr));
isb();
@@ -383,6 +396,8 @@ static void __do_kernel_fault(unsigned long addr, unsigned int esr,
msg = "read from unreadable memory";
} else if (addr < PAGE_SIZE) {
msg = "NULL pointer dereference";
+ } else if (is_pkvm_stage2_abort(esr)) {
+ msg = "access to hypervisor-protected memory";
} else {
if (kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs))
return;
@@ -572,6 +587,13 @@ static int __kprobes do_page_fault(unsigned long far, unsigned int esr,
addr, esr, regs);
}
+ if (is_pkvm_stage2_abort(esr)) {
+ if (!user_mode(regs))
+ goto no_context;
+ arm64_force_sig_fault(SIGSEGV, SEGV_ACCERR, far, "stage-2 fault");
+ return 0;
+ }
+
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);
/*