@@ -17,6 +17,7 @@ obj-$(CONFIG_HAX) += hax-all.o hax-mem.o hax-posix.o
endif
obj-$(CONFIG_HVF) += hvf/
obj-$(CONFIG_WHPX) += whpx-all.o
+obj-$(CONFIG_NVMM) += nvmm-all.o
endif
obj-$(CONFIG_SEV) += sev.o
obj-$(call lnot,$(CONFIG_SEV)) += sev-stub.o
new file mode 100644
@@ -0,0 +1,1226 @@
+/*
+ * Copyright (c) 2018-2019 Maxime Villard, All rights reserved.
+ *
+ * NetBSD Virtual Machine Monitor (NVMM) accelerator for QEMU.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "cpu.h"
+#include "exec/address-spaces.h"
+#include "exec/ioport.h"
+#include "qemu-common.h"
+#include "strings.h"
+#include "sysemu/accel.h"
+#include "sysemu/nvmm.h"
+#include "sysemu/runstate.h"
+#include "sysemu/sysemu.h"
+#include "sysemu/cpus.h"
+#include "qemu/main-loop.h"
+#include "qemu/error-report.h"
+#include "qemu/queue.h"
+#include "qapi/error.h"
+#include "migration/blocker.h"
+
+#include <nvmm.h>
+
+struct qemu_vcpu {
+ struct nvmm_vcpu vcpu;
+ uint8_t tpr;
+ bool stop;
+
+ /* Window-exiting for INTs/NMIs. */
+ bool int_window_exit;
+ bool nmi_window_exit;
+
+ /* The guest is in an interrupt shadow (POP SS, etc). */
+ bool int_shadow;
+};
+
+struct qemu_machine {
+ struct nvmm_capability cap;
+ struct nvmm_machine mach;
+};
+
+/* -------------------------------------------------------------------------- */
+
+static bool nvmm_allowed;
+static struct qemu_machine qemu_mach;
+
+static struct qemu_vcpu *
+get_qemu_vcpu(CPUState *cpu)
+{
+ return (struct qemu_vcpu *)cpu->hax_vcpu;
+}
+
+static struct nvmm_machine *
+get_nvmm_mach(void)
+{
+ return &qemu_mach.mach;
+}
+
+/* -------------------------------------------------------------------------- */
+
+static void
+nvmm_set_segment(struct nvmm_x64_state_seg *nseg, const SegmentCache *qseg)
+{
+ uint32_t attrib = qseg->flags;
+
+ nseg->selector = qseg->selector;
+ nseg->limit = qseg->limit;
+ nseg->base = qseg->base;
+ nseg->attrib.type = __SHIFTOUT(attrib, DESC_TYPE_MASK);
+ nseg->attrib.s = __SHIFTOUT(attrib, DESC_S_MASK);
+ nseg->attrib.dpl = __SHIFTOUT(attrib, DESC_DPL_MASK);
+ nseg->attrib.p = __SHIFTOUT(attrib, DESC_P_MASK);
+ nseg->attrib.avl = __SHIFTOUT(attrib, DESC_AVL_MASK);
+ nseg->attrib.l = __SHIFTOUT(attrib, DESC_L_MASK);
+ nseg->attrib.def = __SHIFTOUT(attrib, DESC_B_MASK);
+ nseg->attrib.g = __SHIFTOUT(attrib, DESC_G_MASK);
+}
+
+static void
+nvmm_set_registers(CPUState *cpu)
+{
+ struct CPUX86State *env = (CPUArchState *)cpu->env_ptr;
+ struct nvmm_machine *mach = get_nvmm_mach();
+ struct qemu_vcpu *qcpu = get_qemu_vcpu(cpu);
+ struct nvmm_vcpu *vcpu = &qcpu->vcpu;
+ struct nvmm_x64_state *state = vcpu->state;
+ uint64_t bitmap;
+ size_t i;
+ int ret;
+
+ assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));
+
+ /* GPRs. */
+ state->gprs[NVMM_X64_GPR_RAX] = env->regs[R_EAX];
+ state->gprs[NVMM_X64_GPR_RCX] = env->regs[R_ECX];
+ state->gprs[NVMM_X64_GPR_RDX] = env->regs[R_EDX];
+ state->gprs[NVMM_X64_GPR_RBX] = env->regs[R_EBX];
+ state->gprs[NVMM_X64_GPR_RSP] = env->regs[R_ESP];
+ state->gprs[NVMM_X64_GPR_RBP] = env->regs[R_EBP];
+ state->gprs[NVMM_X64_GPR_RSI] = env->regs[R_ESI];
+ state->gprs[NVMM_X64_GPR_RDI] = env->regs[R_EDI];
+#ifdef TARGET_X86_64
+ state->gprs[NVMM_X64_GPR_R8] = env->regs[R_R8];
+ state->gprs[NVMM_X64_GPR_R9] = env->regs[R_R9];
+ state->gprs[NVMM_X64_GPR_R10] = env->regs[R_R10];
+ state->gprs[NVMM_X64_GPR_R11] = env->regs[R_R11];
+ state->gprs[NVMM_X64_GPR_R12] = env->regs[R_R12];
+ state->gprs[NVMM_X64_GPR_R13] = env->regs[R_R13];
+ state->gprs[NVMM_X64_GPR_R14] = env->regs[R_R14];
+ state->gprs[NVMM_X64_GPR_R15] = env->regs[R_R15];
+#endif
+
+ /* RIP and RFLAGS. */
+ state->gprs[NVMM_X64_GPR_RIP] = env->eip;
+ state->gprs[NVMM_X64_GPR_RFLAGS] = env->eflags;
+
+ /* Segments. */
+ nvmm_set_segment(&state->segs[NVMM_X64_SEG_CS], &env->segs[R_CS]);
+ nvmm_set_segment(&state->segs[NVMM_X64_SEG_DS], &env->segs[R_DS]);
+ nvmm_set_segment(&state->segs[NVMM_X64_SEG_ES], &env->segs[R_ES]);
+ nvmm_set_segment(&state->segs[NVMM_X64_SEG_FS], &env->segs[R_FS]);
+ nvmm_set_segment(&state->segs[NVMM_X64_SEG_GS], &env->segs[R_GS]);
+ nvmm_set_segment(&state->segs[NVMM_X64_SEG_SS], &env->segs[R_SS]);
+
+ /* Special segments. */
+ nvmm_set_segment(&state->segs[NVMM_X64_SEG_GDT], &env->gdt);
+ nvmm_set_segment(&state->segs[NVMM_X64_SEG_LDT], &env->ldt);
+ nvmm_set_segment(&state->segs[NVMM_X64_SEG_TR], &env->tr);
+ nvmm_set_segment(&state->segs[NVMM_X64_SEG_IDT], &env->idt);
+
+ /* Control registers. */
+ state->crs[NVMM_X64_CR_CR0] = env->cr[0];
+ state->crs[NVMM_X64_CR_CR2] = env->cr[2];
+ state->crs[NVMM_X64_CR_CR3] = env->cr[3];
+ state->crs[NVMM_X64_CR_CR4] = env->cr[4];
+ state->crs[NVMM_X64_CR_CR8] = qcpu->tpr;
+ state->crs[NVMM_X64_CR_XCR0] = env->xcr0;
+
+ /* Debug registers. */
+ state->drs[NVMM_X64_DR_DR0] = env->dr[0];
+ state->drs[NVMM_X64_DR_DR1] = env->dr[1];
+ state->drs[NVMM_X64_DR_DR2] = env->dr[2];
+ state->drs[NVMM_X64_DR_DR3] = env->dr[3];
+ state->drs[NVMM_X64_DR_DR6] = env->dr[6];
+ state->drs[NVMM_X64_DR_DR7] = env->dr[7];
+
+ /* FPU. */
+ state->fpu.fx_cw = env->fpuc;
+ state->fpu.fx_sw = (env->fpus & ~0x3800) | ((env->fpstt & 0x7) << 11);
+ state->fpu.fx_tw = 0;
+ for (i = 0; i < 8; i++) {
+ state->fpu.fx_tw |= (!env->fptags[i]) << i;
+ }
+ state->fpu.fx_opcode = env->fpop;
+ state->fpu.fx_ip.fa_64 = env->fpip;
+ state->fpu.fx_dp.fa_64 = env->fpdp;
+ state->fpu.fx_mxcsr = env->mxcsr;
+ state->fpu.fx_mxcsr_mask = 0x0000FFFF;
+ assert(sizeof(state->fpu.fx_87_ac) == sizeof(env->fpregs));
+ memcpy(state->fpu.fx_87_ac, env->fpregs, sizeof(env->fpregs));
+ for (i = 0; i < CPU_NB_REGS; i++) {
+ memcpy(&state->fpu.fx_xmm[i].xmm_bytes[0],
+ &env->xmm_regs[i].ZMM_Q(0), 8);
+ memcpy(&state->fpu.fx_xmm[i].xmm_bytes[8],
+ &env->xmm_regs[i].ZMM_Q(1), 8);
+ }
+
+ /* MSRs. */
+ state->msrs[NVMM_X64_MSR_EFER] = env->efer;
+ state->msrs[NVMM_X64_MSR_STAR] = env->star;
+#ifdef TARGET_X86_64
+ state->msrs[NVMM_X64_MSR_LSTAR] = env->lstar;
+ state->msrs[NVMM_X64_MSR_CSTAR] = env->cstar;
+ state->msrs[NVMM_X64_MSR_SFMASK] = env->fmask;
+ state->msrs[NVMM_X64_MSR_KERNELGSBASE] = env->kernelgsbase;
+#endif
+ state->msrs[NVMM_X64_MSR_SYSENTER_CS] = env->sysenter_cs;
+ state->msrs[NVMM_X64_MSR_SYSENTER_ESP] = env->sysenter_esp;
+ state->msrs[NVMM_X64_MSR_SYSENTER_EIP] = env->sysenter_eip;
+ state->msrs[NVMM_X64_MSR_PAT] = env->pat;
+ state->msrs[NVMM_X64_MSR_TSC] = env->tsc;
+
+ bitmap =
+ NVMM_X64_STATE_SEGS |
+ NVMM_X64_STATE_GPRS |
+ NVMM_X64_STATE_CRS |
+ NVMM_X64_STATE_DRS |
+ NVMM_X64_STATE_MSRS |
+ NVMM_X64_STATE_FPU;
+
+ ret = nvmm_vcpu_setstate(mach, vcpu, bitmap);
+ if (ret == -1) {
+ error_report("NVMM: Failed to set virtual processor context,"
+ " error=%d", errno);
+ }
+}
+
+static void
+nvmm_get_segment(SegmentCache *qseg, const struct nvmm_x64_state_seg *nseg)
+{
+ qseg->selector = nseg->selector;
+ qseg->limit = nseg->limit;
+ qseg->base = nseg->base;
+
+ qseg->flags =
+ __SHIFTIN((uint32_t)nseg->attrib.type, DESC_TYPE_MASK) |
+ __SHIFTIN((uint32_t)nseg->attrib.s, DESC_S_MASK) |
+ __SHIFTIN((uint32_t)nseg->attrib.dpl, DESC_DPL_MASK) |
+ __SHIFTIN((uint32_t)nseg->attrib.p, DESC_P_MASK) |
+ __SHIFTIN((uint32_t)nseg->attrib.avl, DESC_AVL_MASK) |
+ __SHIFTIN((uint32_t)nseg->attrib.l, DESC_L_MASK) |
+ __SHIFTIN((uint32_t)nseg->attrib.def, DESC_B_MASK) |
+ __SHIFTIN((uint32_t)nseg->attrib.g, DESC_G_MASK);
+}
+
+static void
+nvmm_get_registers(CPUState *cpu)
+{
+ struct CPUX86State *env = (CPUArchState *)cpu->env_ptr;
+ struct nvmm_machine *mach = get_nvmm_mach();
+ struct qemu_vcpu *qcpu = get_qemu_vcpu(cpu);
+ struct nvmm_vcpu *vcpu = &qcpu->vcpu;
+ X86CPU *x86_cpu = X86_CPU(cpu);
+ struct nvmm_x64_state *state = vcpu->state;
+ uint64_t bitmap, tpr;
+ size_t i;
+ int ret;
+
+ assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));
+
+ bitmap =
+ NVMM_X64_STATE_SEGS |
+ NVMM_X64_STATE_GPRS |
+ NVMM_X64_STATE_CRS |
+ NVMM_X64_STATE_DRS |
+ NVMM_X64_STATE_MSRS |
+ NVMM_X64_STATE_FPU;
+
+ ret = nvmm_vcpu_getstate(mach, vcpu, bitmap);
+ if (ret == -1) {
+ error_report("NVMM: Failed to get virtual processor context,"
+ " error=%d", errno);
+ }
+
+ /* GPRs. */
+ env->regs[R_EAX] = state->gprs[NVMM_X64_GPR_RAX];
+ env->regs[R_ECX] = state->gprs[NVMM_X64_GPR_RCX];
+ env->regs[R_EDX] = state->gprs[NVMM_X64_GPR_RDX];
+ env->regs[R_EBX] = state->gprs[NVMM_X64_GPR_RBX];
+ env->regs[R_ESP] = state->gprs[NVMM_X64_GPR_RSP];
+ env->regs[R_EBP] = state->gprs[NVMM_X64_GPR_RBP];
+ env->regs[R_ESI] = state->gprs[NVMM_X64_GPR_RSI];
+ env->regs[R_EDI] = state->gprs[NVMM_X64_GPR_RDI];
+#ifdef TARGET_X86_64
+ env->regs[R_R8] = state->gprs[NVMM_X64_GPR_R8];
+ env->regs[R_R9] = state->gprs[NVMM_X64_GPR_R9];
+ env->regs[R_R10] = state->gprs[NVMM_X64_GPR_R10];
+ env->regs[R_R11] = state->gprs[NVMM_X64_GPR_R11];
+ env->regs[R_R12] = state->gprs[NVMM_X64_GPR_R12];
+ env->regs[R_R13] = state->gprs[NVMM_X64_GPR_R13];
+ env->regs[R_R14] = state->gprs[NVMM_X64_GPR_R14];
+ env->regs[R_R15] = state->gprs[NVMM_X64_GPR_R15];
+#endif
+
+ /* RIP and RFLAGS. */
+ env->eip = state->gprs[NVMM_X64_GPR_RIP];
+ env->eflags = state->gprs[NVMM_X64_GPR_RFLAGS];
+
+ /* Segments. */
+ nvmm_get_segment(&env->segs[R_ES], &state->segs[NVMM_X64_SEG_ES]);
+ nvmm_get_segment(&env->segs[R_CS], &state->segs[NVMM_X64_SEG_CS]);
+ nvmm_get_segment(&env->segs[R_SS], &state->segs[NVMM_X64_SEG_SS]);
+ nvmm_get_segment(&env->segs[R_DS], &state->segs[NVMM_X64_SEG_DS]);
+ nvmm_get_segment(&env->segs[R_FS], &state->segs[NVMM_X64_SEG_FS]);
+ nvmm_get_segment(&env->segs[R_GS], &state->segs[NVMM_X64_SEG_GS]);
+
+ /* Special segments. */
+ nvmm_get_segment(&env->gdt, &state->segs[NVMM_X64_SEG_GDT]);
+ nvmm_get_segment(&env->ldt, &state->segs[NVMM_X64_SEG_LDT]);
+ nvmm_get_segment(&env->tr, &state->segs[NVMM_X64_SEG_TR]);
+ nvmm_get_segment(&env->idt, &state->segs[NVMM_X64_SEG_IDT]);
+
+ /* Control registers. */
+ env->cr[0] = state->crs[NVMM_X64_CR_CR0];
+ env->cr[2] = state->crs[NVMM_X64_CR_CR2];
+ env->cr[3] = state->crs[NVMM_X64_CR_CR3];
+ env->cr[4] = state->crs[NVMM_X64_CR_CR4];
+ tpr = state->crs[NVMM_X64_CR_CR8];
+ if (tpr != qcpu->tpr) {
+ qcpu->tpr = tpr;
+ cpu_set_apic_tpr(x86_cpu->apic_state, tpr);
+ }
+ env->xcr0 = state->crs[NVMM_X64_CR_XCR0];
+
+ /* Debug registers. */
+ env->dr[0] = state->drs[NVMM_X64_DR_DR0];
+ env->dr[1] = state->drs[NVMM_X64_DR_DR1];
+ env->dr[2] = state->drs[NVMM_X64_DR_DR2];
+ env->dr[3] = state->drs[NVMM_X64_DR_DR3];
+ env->dr[6] = state->drs[NVMM_X64_DR_DR6];
+ env->dr[7] = state->drs[NVMM_X64_DR_DR7];
+
+ /* FPU. */
+ env->fpuc = state->fpu.fx_cw;
+ env->fpstt = (state->fpu.fx_sw >> 11) & 0x7;
+ env->fpus = state->fpu.fx_sw & ~0x3800;
+ for (i = 0; i < 8; i++) {
+ env->fptags[i] = !((state->fpu.fx_tw >> i) & 1);
+ }
+ env->fpop = state->fpu.fx_opcode;
+ env->fpip = state->fpu.fx_ip.fa_64;
+ env->fpdp = state->fpu.fx_dp.fa_64;
+ env->mxcsr = state->fpu.fx_mxcsr;
+ assert(sizeof(state->fpu.fx_87_ac) == sizeof(env->fpregs));
+ memcpy(env->fpregs, state->fpu.fx_87_ac, sizeof(env->fpregs));
+ for (i = 0; i < CPU_NB_REGS; i++) {
+ memcpy(&env->xmm_regs[i].ZMM_Q(0),
+ &state->fpu.fx_xmm[i].xmm_bytes[0], 8);
+ memcpy(&env->xmm_regs[i].ZMM_Q(1),
+ &state->fpu.fx_xmm[i].xmm_bytes[8], 8);
+ }
+
+ /* MSRs. */
+ env->efer = state->msrs[NVMM_X64_MSR_EFER];
+ env->star = state->msrs[NVMM_X64_MSR_STAR];
+#ifdef TARGET_X86_64
+ env->lstar = state->msrs[NVMM_X64_MSR_LSTAR];
+ env->cstar = state->msrs[NVMM_X64_MSR_CSTAR];
+ env->fmask = state->msrs[NVMM_X64_MSR_SFMASK];
+ env->kernelgsbase = state->msrs[NVMM_X64_MSR_KERNELGSBASE];
+#endif
+ env->sysenter_cs = state->msrs[NVMM_X64_MSR_SYSENTER_CS];
+ env->sysenter_esp = state->msrs[NVMM_X64_MSR_SYSENTER_ESP];
+ env->sysenter_eip = state->msrs[NVMM_X64_MSR_SYSENTER_EIP];
+ env->pat = state->msrs[NVMM_X64_MSR_PAT];
+ env->tsc = state->msrs[NVMM_X64_MSR_TSC];
+
+ x86_update_hflags(env);
+}
+
+static bool
+nvmm_can_take_int(CPUState *cpu)
+{
+ struct CPUX86State *env = (CPUArchState *)cpu->env_ptr;
+ struct qemu_vcpu *qcpu = get_qemu_vcpu(cpu);
+ struct nvmm_vcpu *vcpu = &qcpu->vcpu;
+ struct nvmm_machine *mach = get_nvmm_mach();
+
+ if (qcpu->int_window_exit) {
+ return false;
+ }
+
+ if (qcpu->int_shadow || !(env->eflags & IF_MASK)) {
+ struct nvmm_x64_state *state = vcpu->state;
+
+ /* Exit on interrupt window. */
+ nvmm_vcpu_getstate(mach, vcpu, NVMM_X64_STATE_INTR);
+ state->intr.int_window_exiting = 1;
+ nvmm_vcpu_setstate(mach, vcpu, NVMM_X64_STATE_INTR);
+
+ return false;
+ }
+
+ return true;
+}
+
+static bool
+nvmm_can_take_nmi(CPUState *cpu)
+{
+ struct qemu_vcpu *qcpu = get_qemu_vcpu(cpu);
+
+ /*
+ * Contrary to INTs, NMIs always schedule an exit when they are
+ * completed. Therefore, if window-exiting is enabled, it means
+ * NMIs are blocked.
+ */
+ if (qcpu->nmi_window_exit) {
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * Called before the VCPU is run. We inject events generated by the I/O
+ * thread, and synchronize the guest TPR.
+ */
+static void
+nvmm_vcpu_pre_run(CPUState *cpu)
+{
+ struct CPUX86State *env = (CPUArchState *)cpu->env_ptr;
+ struct nvmm_machine *mach = get_nvmm_mach();
+ struct qemu_vcpu *qcpu = get_qemu_vcpu(cpu);
+ struct nvmm_vcpu *vcpu = &qcpu->vcpu;
+ X86CPU *x86_cpu = X86_CPU(cpu);
+ struct nvmm_x64_state *state = vcpu->state;
+ struct nvmm_vcpu_event *event = vcpu->event;
+ bool has_event = false;
+ bool sync_tpr = false;
+ uint8_t tpr;
+ int ret;
+
+ qemu_mutex_lock_iothread();
+
+ tpr = cpu_get_apic_tpr(x86_cpu->apic_state);
+ if (tpr != qcpu->tpr) {
+ qcpu->tpr = tpr;
+ sync_tpr = true;
+ }
+
+ /*
+ * Force the VCPU out of its inner loop to process any INIT requests
+ * or commit pending TPR access.
+ */
+ if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) {
+ cpu->exit_request = 1;
+ }
+
+ if (!has_event && (cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
+ if (nvmm_can_take_nmi(cpu)) {
+ cpu->interrupt_request &= ~CPU_INTERRUPT_NMI;
+ event->type = NVMM_VCPU_EVENT_INTR;
+ event->vector = 2;
+ has_event = true;
+ }
+ }
+
+ if (!has_event && (cpu->interrupt_request & CPU_INTERRUPT_HARD)) {
+ if (nvmm_can_take_int(cpu)) {
+ cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
+ event->type = NVMM_VCPU_EVENT_INTR;
+ event->vector = cpu_get_pic_interrupt(env);
+ has_event = true;
+ }
+ }
+
+ /* Don't want SMIs. */
+ if (cpu->interrupt_request & CPU_INTERRUPT_SMI) {
+ cpu->interrupt_request &= ~CPU_INTERRUPT_SMI;
+ }
+
+ if (sync_tpr) {
+ ret = nvmm_vcpu_getstate(mach, vcpu, NVMM_X64_STATE_CRS);
+ if (ret == -1) {
+ error_report("NVMM: Failed to get CPU state,"
+ " error=%d", errno);
+ }
+
+ state->crs[NVMM_X64_CR_CR8] = qcpu->tpr;
+
+ ret = nvmm_vcpu_setstate(mach, vcpu, NVMM_X64_STATE_CRS);
+ if (ret == -1) {
+ error_report("NVMM: Failed to set CPU state,"
+ " error=%d", errno);
+ }
+ }
+
+ if (has_event) {
+ ret = nvmm_vcpu_inject(mach, vcpu);
+ if (ret == -1) {
+ error_report("NVMM: Failed to inject event,"
+ " error=%d", errno);
+ }
+ }
+
+ qemu_mutex_unlock_iothread();
+}
+
+/*
+ * Called after the VCPU ran. We synchronize the host view of the TPR and
+ * RFLAGS.
+ */
+static void
+nvmm_vcpu_post_run(CPUState *cpu, struct nvmm_vcpu_exit *exit)
+{
+ struct qemu_vcpu *qcpu = get_qemu_vcpu(cpu);
+ struct CPUX86State *env = (CPUArchState *)cpu->env_ptr;
+ X86CPU *x86_cpu = X86_CPU(cpu);
+ uint64_t tpr;
+
+ env->eflags = exit->exitstate.rflags;
+ qcpu->int_shadow = exit->exitstate.int_shadow;
+ qcpu->int_window_exit = exit->exitstate.int_window_exiting;
+ qcpu->nmi_window_exit = exit->exitstate.nmi_window_exiting;
+
+ tpr = exit->exitstate.cr8;
+ if (qcpu->tpr != tpr) {
+ qcpu->tpr = tpr;
+ qemu_mutex_lock_iothread();
+ cpu_set_apic_tpr(x86_cpu->apic_state, qcpu->tpr);
+ qemu_mutex_unlock_iothread();
+ }
+}
+
+/* -------------------------------------------------------------------------- */
+
+static void
+nvmm_io_callback(struct nvmm_io *io)
+{
+ MemTxAttrs attrs = { 0 };
+ int ret;
+
+ ret = address_space_rw(&address_space_io, io->port, attrs, io->data,
+ io->size, !io->in);
+ if (ret != MEMTX_OK) {
+ error_report("NVMM: I/O Transaction Failed "
+ "[%s, port=%u, size=%zu]", (io->in ? "in" : "out"),
+ io->port, io->size);
+ }
+
+ /* Needed, otherwise infinite loop. */
+ current_cpu->vcpu_dirty = false;
+}
+
+static void
+nvmm_mem_callback(struct nvmm_mem *mem)
+{
+ cpu_physical_memory_rw(mem->gpa, mem->data, mem->size, mem->write);
+
+ /* XXX Needed, otherwise infinite loop. */
+ current_cpu->vcpu_dirty = false;
+}
+
+static struct nvmm_assist_callbacks nvmm_callbacks = {
+ .io = nvmm_io_callback,
+ .mem = nvmm_mem_callback
+};
+
+/* -------------------------------------------------------------------------- */
+
+static int
+nvmm_handle_mem(struct nvmm_machine *mach, struct nvmm_vcpu *vcpu)
+{
+ int ret;
+
+ ret = nvmm_assist_mem(mach, vcpu);
+ if (ret == -1) {
+ error_report("NVMM: Mem Assist Failed [gpa=%p]",
+ (void *)vcpu->exit->u.mem.gpa);
+ }
+
+ return ret;
+}
+
+static int
+nvmm_handle_io(struct nvmm_machine *mach, struct nvmm_vcpu *vcpu)
+{
+ int ret;
+
+ ret = nvmm_assist_io(mach, vcpu);
+ if (ret == -1) {
+ error_report("NVMM: I/O Assist Failed [port=%d]",
+ (int)vcpu->exit->u.io.port);
+ }
+
+ return ret;
+}
+
+static int
+nvmm_handle_rdmsr(struct nvmm_machine *mach, CPUState *cpu,
+ struct nvmm_vcpu_exit *exit)
+{
+ struct qemu_vcpu *qcpu = get_qemu_vcpu(cpu);
+ struct nvmm_vcpu *vcpu = &qcpu->vcpu;
+ X86CPU *x86_cpu = X86_CPU(cpu);
+ struct nvmm_x64_state *state = vcpu->state;
+ uint64_t val;
+ int ret;
+
+ switch (exit->u.rdmsr.msr) {
+ case MSR_IA32_APICBASE:
+ val = cpu_get_apic_base(x86_cpu->apic_state);
+ break;
+ case MSR_MTRRcap:
+ case MSR_MTRRdefType:
+ case MSR_MCG_CAP:
+ case MSR_MCG_STATUS:
+ val = 0;
+ break;
+ default: /* More MSRs to add? */
+ val = 0;
+ error_report("NVMM: Unexpected RDMSR 0x%x, ignored",
+ exit->u.rdmsr.msr);
+ break;
+ }
+
+ ret = nvmm_vcpu_getstate(mach, vcpu, NVMM_X64_STATE_GPRS);
+ if (ret == -1) {
+ return -1;
+ }
+
+ state->gprs[NVMM_X64_GPR_RAX] = (val & 0xFFFFFFFF);
+ state->gprs[NVMM_X64_GPR_RDX] = (val >> 32);
+ state->gprs[NVMM_X64_GPR_RIP] = exit->u.rdmsr.npc;
+
+ ret = nvmm_vcpu_setstate(mach, vcpu, NVMM_X64_STATE_GPRS);
+ if (ret == -1) {
+ return -1;
+ }
+
+ return 0;
+}
+
+static int
+nvmm_handle_wrmsr(struct nvmm_machine *mach, CPUState *cpu,
+ struct nvmm_vcpu_exit *exit)
+{
+ struct qemu_vcpu *qcpu = get_qemu_vcpu(cpu);
+ struct nvmm_vcpu *vcpu = &qcpu->vcpu;
+ X86CPU *x86_cpu = X86_CPU(cpu);
+ struct nvmm_x64_state *state = vcpu->state;
+ uint64_t val;
+ int ret;
+
+ val = exit->u.wrmsr.val;
+
+ switch (exit->u.wrmsr.msr) {
+ case MSR_IA32_APICBASE:
+ cpu_set_apic_base(x86_cpu->apic_state, val);
+ break;
+ case MSR_MTRRdefType:
+ case MSR_MCG_STATUS:
+ break;
+ default: /* More MSRs to add? */
+ error_report("NVMM: Unexpected WRMSR 0x%x [val=0x%lx], ignored",
+ exit->u.wrmsr.msr, val);
+ break;
+ }
+
+ ret = nvmm_vcpu_getstate(mach, vcpu, NVMM_X64_STATE_GPRS);
+ if (ret == -1) {
+ return -1;
+ }
+
+ state->gprs[NVMM_X64_GPR_RIP] = exit->u.wrmsr.npc;
+
+ ret = nvmm_vcpu_setstate(mach, vcpu, NVMM_X64_STATE_GPRS);
+ if (ret == -1) {
+ return -1;
+ }
+
+ return 0;
+}
+
+static int
+nvmm_handle_halted(struct nvmm_machine *mach, CPUState *cpu,
+ struct nvmm_vcpu_exit *exit)
+{
+ struct CPUX86State *env = (CPUArchState *)cpu->env_ptr;
+ int ret = 0;
+
+ qemu_mutex_lock_iothread();
+
+ if (!((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
+ (env->eflags & IF_MASK)) &&
+ !(cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
+ cpu->exception_index = EXCP_HLT;
+ cpu->halted = true;
+ ret = 1;
+ }
+
+ qemu_mutex_unlock_iothread();
+
+ return ret;
+}
+
+static int
+nvmm_inject_ud(struct nvmm_machine *mach, struct nvmm_vcpu *vcpu)
+{
+ struct nvmm_vcpu_event *event = vcpu->event;
+
+ event->type = NVMM_VCPU_EVENT_EXCP;
+ event->vector = 6;
+ event->u.excp.error = 0;
+
+ return nvmm_vcpu_inject(mach, vcpu);
+}
+
+static int
+nvmm_vcpu_loop(CPUState *cpu)
+{
+ struct CPUX86State *env = (CPUArchState *)cpu->env_ptr;
+ struct nvmm_machine *mach = get_nvmm_mach();
+ struct qemu_vcpu *qcpu = get_qemu_vcpu(cpu);
+ struct nvmm_vcpu *vcpu = &qcpu->vcpu;
+ X86CPU *x86_cpu = X86_CPU(cpu);
+ struct nvmm_vcpu_exit *exit = vcpu->exit;
+ int ret;
+
+ /*
+ * Some asynchronous events must be handled outside of the inner
+ * VCPU loop. They are handled here.
+ */
+ if (cpu->interrupt_request & CPU_INTERRUPT_INIT) {
+ nvmm_cpu_synchronize_state(cpu);
+ do_cpu_init(x86_cpu);
+ /* set int/nmi windows back to the reset state */
+ }
+ if (cpu->interrupt_request & CPU_INTERRUPT_POLL) {
+ cpu->interrupt_request &= ~CPU_INTERRUPT_POLL;
+ apic_poll_irq(x86_cpu->apic_state);
+ }
+ if (((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
+ (env->eflags & IF_MASK)) ||
+ (cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
+ cpu->halted = false;
+ }
+ if (cpu->interrupt_request & CPU_INTERRUPT_SIPI) {
+ nvmm_cpu_synchronize_state(cpu);
+ do_cpu_sipi(x86_cpu);
+ }
+ if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
+ cpu->interrupt_request &= ~CPU_INTERRUPT_TPR;
+ nvmm_cpu_synchronize_state(cpu);
+ apic_handle_tpr_access_report(x86_cpu->apic_state, env->eip,
+ env->tpr_access_type);
+ }
+
+ if (cpu->halted) {
+ cpu->exception_index = EXCP_HLT;
+ atomic_set(&cpu->exit_request, false);
+ return 0;
+ }
+
+ qemu_mutex_unlock_iothread();
+ cpu_exec_start(cpu);
+
+ /*
+ * Inner VCPU loop.
+ */
+ do {
+ if (cpu->vcpu_dirty) {
+ nvmm_set_registers(cpu);
+ cpu->vcpu_dirty = false;
+ }
+
+ if (qcpu->stop) {
+ cpu->exception_index = EXCP_INTERRUPT;
+ qcpu->stop = false;
+ ret = 1;
+ break;
+ }
+
+ nvmm_vcpu_pre_run(cpu);
+
+ if (atomic_read(&cpu->exit_request)) {
+ qemu_cpu_kick_self();
+ }
+
+ ret = nvmm_vcpu_run(mach, vcpu);
+ if (ret == -1) {
+ error_report("NVMM: Failed to exec a virtual processor,"
+ " error=%d", errno);
+ break;
+ }
+
+ nvmm_vcpu_post_run(cpu, exit);
+
+ switch (exit->reason) {
+ case NVMM_VCPU_EXIT_NONE:
+ break;
+ case NVMM_VCPU_EXIT_MEMORY:
+ ret = nvmm_handle_mem(mach, vcpu);
+ break;
+ case NVMM_VCPU_EXIT_IO:
+ ret = nvmm_handle_io(mach, vcpu);
+ break;
+ case NVMM_VCPU_EXIT_INT_READY:
+ case NVMM_VCPU_EXIT_NMI_READY:
+ case NVMM_VCPU_EXIT_TPR_CHANGED:
+ break;
+ case NVMM_VCPU_EXIT_HALTED:
+ ret = nvmm_handle_halted(mach, cpu, exit);
+ break;
+ case NVMM_VCPU_EXIT_SHUTDOWN:
+ qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
+ cpu->exception_index = EXCP_INTERRUPT;
+ ret = 1;
+ break;
+ case NVMM_VCPU_EXIT_RDMSR:
+ ret = nvmm_handle_rdmsr(mach, cpu, exit);
+ break;
+ case NVMM_VCPU_EXIT_WRMSR:
+ ret = nvmm_handle_wrmsr(mach, cpu, exit);
+ break;
+ case NVMM_VCPU_EXIT_MONITOR:
+ case NVMM_VCPU_EXIT_MWAIT:
+ ret = nvmm_inject_ud(mach, vcpu);
+ break;
+ default:
+ error_report("NVMM: Unexpected VM exit code 0x%lx [hw=0x%lx]",
+ exit->reason, exit->u.inv.hwcode);
+ nvmm_get_registers(cpu);
+ qemu_mutex_lock_iothread();
+ qemu_system_guest_panicked(cpu_get_crash_info(cpu));
+ qemu_mutex_unlock_iothread();
+ ret = -1;
+ break;
+ }
+ } while (ret == 0);
+
+ cpu_exec_end(cpu);
+ qemu_mutex_lock_iothread();
+ current_cpu = cpu;
+
+ atomic_set(&cpu->exit_request, false);
+
+ return ret < 0;
+}
+
+/* -------------------------------------------------------------------------- */
+
+static void
+do_nvmm_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
+{
+ nvmm_get_registers(cpu);
+ cpu->vcpu_dirty = true;
+}
+
+static void
+do_nvmm_cpu_synchronize_post_reset(CPUState *cpu, run_on_cpu_data arg)
+{
+ nvmm_set_registers(cpu);
+ cpu->vcpu_dirty = false;
+}
+
+static void
+do_nvmm_cpu_synchronize_post_init(CPUState *cpu, run_on_cpu_data arg)
+{
+ nvmm_set_registers(cpu);
+ cpu->vcpu_dirty = false;
+}
+
+static void
+do_nvmm_cpu_synchronize_pre_loadvm(CPUState *cpu, run_on_cpu_data arg)
+{
+ cpu->vcpu_dirty = true;
+}
+
+void nvmm_cpu_synchronize_state(CPUState *cpu)
+{
+ if (!cpu->vcpu_dirty) {
+ run_on_cpu(cpu, do_nvmm_cpu_synchronize_state, RUN_ON_CPU_NULL);
+ }
+}
+
+void nvmm_cpu_synchronize_post_reset(CPUState *cpu)
+{
+ run_on_cpu(cpu, do_nvmm_cpu_synchronize_post_reset, RUN_ON_CPU_NULL);
+}
+
+void nvmm_cpu_synchronize_post_init(CPUState *cpu)
+{
+ run_on_cpu(cpu, do_nvmm_cpu_synchronize_post_init, RUN_ON_CPU_NULL);
+}
+
+void nvmm_cpu_synchronize_pre_loadvm(CPUState *cpu)
+{
+ run_on_cpu(cpu, do_nvmm_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL);
+}
+
+/* -------------------------------------------------------------------------- */
+
+static Error *nvmm_migration_blocker;
+
+static void
+nvmm_ipi_signal(int sigcpu)
+{
+ struct qemu_vcpu *qcpu;
+
+ if (current_cpu) {
+ qcpu = get_qemu_vcpu(current_cpu);
+ qcpu->stop = true;
+ }
+}
+
+static void
+nvmm_init_cpu_signals(void)
+{
+ struct sigaction sigact;
+ sigset_t set;
+
+ /* Install the IPI handler. */
+ memset(&sigact, 0, sizeof(sigact));
+ sigact.sa_handler = nvmm_ipi_signal;
+ sigaction(SIG_IPI, &sigact, NULL);
+
+ /* Allow IPIs on the current thread. */
+ sigprocmask(SIG_BLOCK, NULL, &set);
+ sigdelset(&set, SIG_IPI);
+ pthread_sigmask(SIG_SETMASK, &set, NULL);
+}
+
+int
+nvmm_init_vcpu(CPUState *cpu)
+{
+ struct nvmm_machine *mach = get_nvmm_mach();
+ struct nvmm_vcpu_conf_cpuid cpuid;
+ struct nvmm_vcpu_conf_tpr tpr;
+ Error *local_error = NULL;
+ struct qemu_vcpu *qcpu;
+ int ret, err;
+
+ nvmm_init_cpu_signals();
+
+ if (nvmm_migration_blocker == NULL) {
+ error_setg(&nvmm_migration_blocker,
+ "NVMM: Migration not supported");
+
+ (void)migrate_add_blocker(nvmm_migration_blocker, &local_error);
+ if (local_error) {
+ error_report_err(local_error);
+ migrate_del_blocker(nvmm_migration_blocker);
+ error_free(nvmm_migration_blocker);
+ return -EINVAL;
+ }
+ }
+
+ qcpu = g_malloc0(sizeof(*qcpu));
+ if (qcpu == NULL) {
+ error_report("NVMM: Failed to allocate VCPU context.");
+ return -ENOMEM;
+ }
+
+ ret = nvmm_vcpu_create(mach, cpu->cpu_index, &qcpu->vcpu);
+ if (ret == -1) {
+ err = errno;
+ error_report("NVMM: Failed to create a virtual processor,"
+ " error=%d", err);
+ g_free(qcpu);
+ return -err;
+ }
+
+ memset(&cpuid, 0, sizeof(cpuid));
+ cpuid.mask = 1;
+ cpuid.leaf = 0x00000001;
+ cpuid.u.mask.set.edx = CPUID_MCE | CPUID_MCA | CPUID_MTRR;
+ ret = nvmm_vcpu_configure(mach, &qcpu->vcpu, NVMM_VCPU_CONF_CPUID,
+ &cpuid);
+ if (ret == -1) {
+ err = errno;
+ error_report("NVMM: Failed to configure a virtual processor,"
+ " error=%d", err);
+ g_free(qcpu);
+ return -err;
+ }
+
+ ret = nvmm_vcpu_configure(mach, &qcpu->vcpu, NVMM_VCPU_CONF_CALLBACKS,
+ &nvmm_callbacks);
+ if (ret == -1) {
+ err = errno;
+ error_report("NVMM: Failed to configure a virtual processor,"
+ " error=%d", err);
+ g_free(qcpu);
+ return -err;
+ }
+
+ if (qemu_mach.cap.arch.vcpu_conf_support & NVMM_CAP_ARCH_VCPU_CONF_TPR) {
+ memset(&tpr, 0, sizeof(tpr));
+ tpr.exit_changed = 1;
+ ret = nvmm_vcpu_configure(mach, &qcpu->vcpu, NVMM_VCPU_CONF_TPR, &tpr);
+ if (ret == -1) {
+ err = errno;
+ error_report("NVMM: Failed to configure a virtual processor,"
+ " error=%d", err);
+ g_free(qcpu);
+ return -err;
+ }
+ }
+
+ cpu->vcpu_dirty = true;
+ cpu->hax_vcpu = (struct hax_vcpu_state *)qcpu;
+
+ return 0;
+}
+
+int
+nvmm_vcpu_exec(CPUState *cpu)
+{
+ int ret, fatal;
+
+ while (1) {
+ if (cpu->exception_index >= EXCP_INTERRUPT) {
+ ret = cpu->exception_index;
+ cpu->exception_index = -1;
+ break;
+ }
+
+ fatal = nvmm_vcpu_loop(cpu);
+
+ if (fatal) {
+ error_report("NVMM: Failed to execute a VCPU.");
+ abort();
+ }
+ }
+
+ return ret;
+}
+
+void
+nvmm_destroy_vcpu(CPUState *cpu)
+{
+ struct nvmm_machine *mach = get_nvmm_mach();
+ struct qemu_vcpu *qcpu = get_qemu_vcpu(cpu);
+
+ nvmm_vcpu_destroy(mach, &qcpu->vcpu);
+ g_free(cpu->hax_vcpu);
+}
+
+/* -------------------------------------------------------------------------- */
+
+static void
+nvmm_update_mapping(hwaddr start_pa, ram_addr_t size, uintptr_t hva,
+ bool add, bool rom, const char *name)
+{
+ struct nvmm_machine *mach = get_nvmm_mach();
+ int ret, prot;
+
+ if (add) {
+ prot = PROT_READ | PROT_EXEC;
+ if (!rom) {
+ prot |= PROT_WRITE;
+ }
+ ret = nvmm_gpa_map(mach, hva, start_pa, size, prot);
+ } else {
+ ret = nvmm_gpa_unmap(mach, hva, start_pa, size);
+ }
+
+ if (ret == -1) {
+ error_report("NVMM: Failed to %s GPA range '%s' PA:%p, "
+ "Size:%p bytes, HostVA:%p, error=%d",
+ (add ? "map" : "unmap"), name, (void *)(uintptr_t)start_pa,
+ (void *)size, (void *)hva, errno);
+ }
+}
+
+static void
+nvmm_process_section(MemoryRegionSection *section, int add)
+{
+ MemoryRegion *mr = section->mr;
+ hwaddr start_pa = section->offset_within_address_space;
+ ram_addr_t size = int128_get64(section->size);
+ unsigned int delta;
+ uintptr_t hva;
+
+ if (!memory_region_is_ram(mr)) {
+ return;
+ }
+
+ /* Adjust start_pa and size so that they are page-aligned. */
+ delta = qemu_real_host_page_size - (start_pa & ~qemu_real_host_page_mask);
+ delta &= ~qemu_real_host_page_mask;
+ if (delta > size) {
+ return;
+ }
+ start_pa += delta;
+ size -= delta;
+ size &= qemu_real_host_page_mask;
+ if (!size || (start_pa & ~qemu_real_host_page_mask)) {
+ return;
+ }
+
+ hva = (uintptr_t)memory_region_get_ram_ptr(mr) +
+ section->offset_within_region + delta;
+
+ nvmm_update_mapping(start_pa, size, hva, add,
+ memory_region_is_rom(mr), mr->name);
+}
+
+static void
+nvmm_region_add(MemoryListener *listener, MemoryRegionSection *section)
+{
+ memory_region_ref(section->mr);
+ nvmm_process_section(section, 1);
+}
+
+static void
+nvmm_region_del(MemoryListener *listener, MemoryRegionSection *section)
+{
+ nvmm_process_section(section, 0);
+ memory_region_unref(section->mr);
+}
+
+static void
+nvmm_transaction_begin(MemoryListener *listener)
+{
+ /* nothing */
+}
+
+static void
+nvmm_transaction_commit(MemoryListener *listener)
+{
+ /* nothing */
+}
+
+static void
+nvmm_log_sync(MemoryListener *listener, MemoryRegionSection *section)
+{
+ MemoryRegion *mr = section->mr;
+
+ if (!memory_region_is_ram(mr)) {
+ return;
+ }
+
+ memory_region_set_dirty(mr, 0, int128_get64(section->size));
+}
+
+static MemoryListener nvmm_memory_listener = {
+ .begin = nvmm_transaction_begin,
+ .commit = nvmm_transaction_commit,
+ .region_add = nvmm_region_add,
+ .region_del = nvmm_region_del,
+ .log_sync = nvmm_log_sync,
+ .priority = 10,
+};
+
+static void
+nvmm_ram_block_added(RAMBlockNotifier *n, void *host, size_t size)
+{
+ struct nvmm_machine *mach = get_nvmm_mach();
+ uintptr_t hva = (uintptr_t)host;
+ int ret;
+
+ ret = nvmm_hva_map(mach, hva, size);
+
+ if (ret == -1) {
+ error_report("NVMM: Failed to map HVA, HostVA:%p "
+ "Size:%p bytes, error=%d",
+ (void *)hva, (void *)size, errno);
+ }
+}
+
+static struct RAMBlockNotifier nvmm_ram_notifier = {
+ .ram_block_added = nvmm_ram_block_added
+};
+
+/* -------------------------------------------------------------------------- */
+
+static void
+nvmm_handle_interrupt(CPUState *cpu, int mask)
+{
+ cpu->interrupt_request |= mask;
+
+ if (!qemu_cpu_is_self(cpu)) {
+ qemu_cpu_kick(cpu);
+ }
+}
+
+/* -------------------------------------------------------------------------- */
+
+static int
+nvmm_accel_init(MachineState *ms)
+{
+ int ret, err;
+
+ ret = nvmm_init();
+ if (ret == -1) {
+ err = errno;
+ error_report("NVMM: Initialization failed, error=%d", errno);
+ return -err;
+ }
+
+ ret = nvmm_capability(&qemu_mach.cap);
+ if (ret == -1) {
+ err = errno;
+ error_report("NVMM: Unable to fetch capability, error=%d", errno);
+ return -err;
+ }
+ if (qemu_mach.cap.version != 1) {
+ error_report("NVMM: Unsupported version %u", qemu_mach.cap.version);
+ return -EPROGMISMATCH;
+ }
+ if (qemu_mach.cap.state_size != sizeof(struct nvmm_x64_state)) {
+ error_report("NVMM: Wrong state size %u", qemu_mach.cap.state_size);
+ return -EPROGMISMATCH;
+ }
+
+ ret = nvmm_machine_create(&qemu_mach.mach);
+ if (ret == -1) {
+ err = errno;
+ error_report("NVMM: Machine creation failed, error=%d", errno);
+ return -err;
+ }
+
+ memory_listener_register(&nvmm_memory_listener, &address_space_memory);
+ ram_block_notifier_add(&nvmm_ram_notifier);
+
+ cpu_interrupt_handler = nvmm_handle_interrupt;
+
+ printf("NetBSD Virtual Machine Monitor accelerator is operational\n");
+ return 0;
+}
+
+int
+nvmm_enabled(void)
+{
+ return nvmm_allowed;
+}
+
+static void
+nvmm_accel_class_init(ObjectClass *oc, void *data)
+{
+ AccelClass *ac = ACCEL_CLASS(oc);
+ ac->name = "NVMM";
+ ac->init_machine = nvmm_accel_init;
+ ac->allowed = &nvmm_allowed;
+}
+
+static const TypeInfo nvmm_accel_type = {
+ .name = ACCEL_CLASS_NAME("nvmm"),
+ .parent = TYPE_ACCEL,
+ .class_init = nvmm_accel_class_init,
+};
+
+static void
+nvmm_type_init(void)
+{
+ type_register_static(&nvmm_accel_type);
+}
+
+type_init(nvmm_type_init);