@@ -20,6 +20,7 @@ obj-y += vsyscall/
obj-$(CONFIG_PREEMPTION) += thunk_$(BITS).o
obj-$(CONFIG_IA32_EMULATION) += entry_64_compat.o syscall_32.o
obj-$(CONFIG_X86_X32_ABI) += syscall_x32.o
+obj-$(CONFIG_PVM_GUEST) += entry_64_pvm.o
ifeq ($(CONFIG_X86_64),y)
obj-y += entry_64_switcher.o
new file mode 100644
@@ -0,0 +1,152 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/asm-offsets.h>
+#include <asm/percpu.h>
+#include <asm/pvm_para.h>
+
+#include "calling.h"
+
+/* Construct struct pt_regs on stack */
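+/*
+ * The frame is built from the per-CPU PVCS (struct pvm_vcpu_struct), which
+ * the hypervisor fills when it delivers an event.  @has_cs_ss selects
+ * whether the CS/SS selectors are read from the PVCS; otherwise constant
+ * kernel or user selectors are pushed according to @is_kernel.
+ */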
+.macro PUSH_IRET_FRAME_FROM_PVCS has_cs_ss:req is_kernel:req
+ .if \has_cs_ss == 1
+ movl PER_CPU_VAR(pvm_vcpu_struct + PVCS_user_ss), %ecx
+ andl $0xff, %ecx
+ pushq %rcx /* pt_regs->ss */
+ .elseif \is_kernel == 1
+ pushq $__KERNEL_DS
+ .else
+ pushq $__USER_DS
+ .endif
+
+ pushq PER_CPU_VAR(pvm_vcpu_struct + PVCS_rsp) /* pt_regs->sp */
+ movl PER_CPU_VAR(pvm_vcpu_struct + PVCS_eflags), %ecx
+ pushq %rcx /* pt_regs->flags */
+
+ .if \has_cs_ss == 1
+ movl PER_CPU_VAR(pvm_vcpu_struct + PVCS_user_cs), %ecx
+ andl $0xff, %ecx
+ pushq %rcx /* pt_regs->cs */
+ .elseif \is_kernel == 1
+ pushq $__KERNEL_CS
+ .else
+ pushq $__USER_CS
+ .endif
+
+ pushq PER_CPU_VAR(pvm_vcpu_struct + PVCS_rip) /* pt_regs->ip */
+
+ /* set %rcx, %r11 per PVM event handling specification */
+ movq PER_CPU_VAR(pvm_vcpu_struct + PVCS_rcx), %rcx
+ movq PER_CPU_VAR(pvm_vcpu_struct + PVCS_r11), %r11
+.endm
+
+.code64
+.section .entry.text, "ax"
+
+SYM_CODE_START(entry_SYSCALL_64_pvm)
+ UNWIND_HINT_ENTRY
+ ENDBR
+
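+	/*
+	 * The hypervisor saves the user RIP/RSP/RFLAGS into the PVCS on
+	 * SYSCALL; rebuild the hardware-style IRET frame from it and join
+	 * the common 64-bit SYSCALL path.
+	 */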
+ PUSH_IRET_FRAME_FROM_PVCS has_cs_ss=0 is_kernel=0
+
+ jmp entry_SYSCALL_64_after_hwframe
+SYM_CODE_END(entry_SYSCALL_64_pvm)
+
+/*
+ * The new RIP value that PVM event delivery establishes is
+ * MSR_PVM_EVENT_ENTRY for vector events that occur in user mode.
+ */
+ .align 64
+SYM_CODE_START(pvm_user_event_entry)
+ UNWIND_HINT_ENTRY
+ ENDBR
+
+ PUSH_IRET_FRAME_FROM_PVCS has_cs_ss=1 is_kernel=0
+ /* pt_regs->orig_ax: errcode and vector */
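+	/* (vector in the upper 32 bits, error code in the lower 32 bits) */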
+ pushq PER_CPU_VAR(pvm_vcpu_struct + PVCS_event_errcode)
+
+ PUSH_AND_CLEAR_REGS
+ movq %rsp, %rdi /* %rdi -> pt_regs */
+ call pvm_event
+
+SYM_INNER_LABEL(pvm_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
+ POP_REGS
+
+ /* Copy %rcx, %r11 to the PVM CPU structure. */
+ movq %rcx, PER_CPU_VAR(pvm_vcpu_struct + PVCS_rcx)
+ movq %r11, PER_CPU_VAR(pvm_vcpu_struct + PVCS_r11)
+
+ /* Copy the IRET frame to the PVM CPU structure. */
+ movq 1*8(%rsp), %rcx /* RIP */
+ movq %rcx, PER_CPU_VAR(pvm_vcpu_struct + PVCS_rip)
+ movq 2*8(%rsp), %rcx /* CS */
+ movw %cx, PER_CPU_VAR(pvm_vcpu_struct + PVCS_user_cs)
+ movq 3*8(%rsp), %rcx /* RFLAGS */
+ movl %ecx, PER_CPU_VAR(pvm_vcpu_struct + PVCS_eflags)
+ movq 4*8(%rsp), %rcx /* RSP */
+ movq %rcx, PER_CPU_VAR(pvm_vcpu_struct + PVCS_rsp)
+ movq 5*8(%rsp), %rcx /* SS */
+ movw %cx, PER_CPU_VAR(pvm_vcpu_struct + PVCS_user_ss)
+ /*
+ * We are on the trampoline stack. All regs are live.
+ * We can do future final exit work right here.
+ */
+ STACKLEAK_ERASE_NOCLOBBER
+
+ addq $6*8, %rsp
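+	/*
+	 * The address of the SYSCALL below (pvm_retu_rip) is registered
+	 * with the hypervisor; executing it requests a return to user mode
+	 * using the state written into the PVCS above.
+	 */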
+SYM_INNER_LABEL(pvm_retu_rip, SYM_L_GLOBAL)
+ ANNOTATE_NOENDBR
+ syscall
+SYM_CODE_END(pvm_user_event_entry)
+
+/*
+ * The new RIP value that PVM event delivery establishes is
+ * MSR_PVM_EVENT_ENTRY + 256 for events with vector < 32
+ * that occur in supervisor mode.
+ */
+ .org pvm_user_event_entry+256, 0xcc
+SYM_CODE_START(pvm_kernel_exception_entry)
+ UNWIND_HINT_ENTRY
+ ENDBR
+
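+	/*
+	 * For supervisor-mode exceptions the hypervisor pushes the IRET
+	 * frame, the error code/vector word and the interrupted %rcx/%r11
+	 * onto the current stack.  The %rcx/%r11 slots sit above the IRET
+	 * frame and are rewritten on the return path.
+	 */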
+ /* set %rcx, %r11 per PVM event handling specification */
+ movq 6*8(%rsp), %rcx
+ movq 7*8(%rsp), %r11
+
+ PUSH_AND_CLEAR_REGS
+ movq %rsp, %rdi /* %rdi -> pt_regs */
+ call pvm_event
+
+ jmp pvm_restore_regs_and_return_to_kernel
+SYM_CODE_END(pvm_kernel_exception_entry)
+
+/*
+ * The new RIP value that PVM event delivery establishes is
+ * MSR_PVM_EVENT_ENTRY + 512 for events with vector >= 32
+ * that occur in supervisor mode.
+ */
+ .org pvm_user_event_entry+512, 0xcc
+SYM_CODE_START(pvm_kernel_interrupt_entry)
+ UNWIND_HINT_ENTRY
+ ENDBR
+
+ /* Reserve space for rcx/r11 */
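+	/* (filled in on the return path so the hypervisor can restore them) */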
+ subq $16, %rsp
+
+ PUSH_IRET_FRAME_FROM_PVCS has_cs_ss=0 is_kernel=1
+ /* pt_regs->orig_ax: errcode and vector */
+ pushq PER_CPU_VAR(pvm_vcpu_struct + PVCS_event_errcode)
+
+ PUSH_AND_CLEAR_REGS
+ movq %rsp, %rdi /* %rdi -> pt_regs */
+ call pvm_event
+
+SYM_INNER_LABEL(pvm_restore_regs_and_return_to_kernel, SYM_L_GLOBAL)
+ POP_REGS
+
+ movq %rcx, 6*8(%rsp)
+ movq %r11, 7*8(%rsp)
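+	/*
+	 * The SYSCALL at pvm_rets_rip is the registered return-to-supervisor
+	 * request; the hypervisor restores the IRET frame and %rcx/%r11 from
+	 * the stack slots above.
+	 */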
+SYM_INNER_LABEL(pvm_rets_rip, SYM_L_GLOBAL)
+ ANNOTATE_NOENDBR
+ syscall
+SYM_CODE_END(pvm_kernel_interrupt_entry)
@@ -5,6 +5,8 @@
#include <linux/init.h>
#include <uapi/asm/pvm_para.h>
+#ifndef __ASSEMBLY__
+
#ifdef CONFIG_PVM_GUEST
#include <asm/irqflags.h>
#include <uapi/asm/kvm_para.h>
@@ -72,4 +74,10 @@ static inline bool pvm_kernel_layout_relocate(void)
}
#endif /* CONFIG_PVM_GUEST */
+void entry_SYSCALL_64_pvm(void);
+void pvm_user_event_entry(void);
+void pvm_retu_rip(void);
+void pvm_rets_rip(void);
+#endif /* !__ASSEMBLY__ */
+
#endif /* _ASM_X86_PVM_PARA_H */
@@ -11,14 +11,195 @@
#define pr_fmt(fmt) "pvm-guest: " fmt
#include <linux/mm_types.h>
+#include <linux/nospec.h>
#include <asm/cpufeature.h>
#include <asm/cpu_entry_area.h>
+#include <asm/desc.h>
#include <asm/pvm_para.h>
+#include <asm/traps.h>
+
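+/* Per-CPU shared area (PVCS) through which the hypervisor exchanges event state. */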
+DEFINE_PER_CPU_PAGE_ALIGNED(struct pvm_vcpu_struct, pvm_vcpu_struct);
unsigned long pvm_range_start __initdata;
unsigned long pvm_range_end __initdata;
+static noinstr void pvm_bad_event(struct pt_regs *regs, unsigned long vector,
+ unsigned long error_code)
+{
+ irqentry_state_t irq_state = irqentry_nmi_enter(regs);
+
+ instrumentation_begin();
+
+	/* Panic on events that occurred in kernel mode */
+	if (!user_mode(regs)) {
+		pr_emerg("PANIC: invalid or fatal PVM event;"
+			 " vector %lu error 0x%lx at %04lx:%016lx\n",
+			 vector, error_code, regs->cs, regs->ip);
+ die("invalid or fatal PVM event", regs, error_code);
+ panic("invalid or fatal PVM event");
+ } else {
+ unsigned long flags = oops_begin();
+ int sig = SIGKILL;
+
+		pr_alert("BUG: invalid or fatal PVM event;"
+			 " vector %lu error 0x%lx at %04lx:%016lx\n",
+			 vector, error_code, regs->cs, regs->ip);
+
+		if (__die("Invalid or fatal PVM event", regs, error_code))
+ sig = 0;
+
+ oops_end(flags, regs, sig);
+ }
+ instrumentation_end();
+ irqentry_nmi_exit(regs, irq_state);
+}
+
+DEFINE_IDTENTRY_RAW(pvm_exc_debug)
+{
+ /*
+	 * There's no IST on PVM, but we still need to dispatch
+ * to the correct handler.
+ */
+ if (user_mode(regs))
+ noist_exc_debug(regs);
+ else
+ exc_debug(regs);
+}
+
+#ifdef CONFIG_X86_MCE
+DEFINE_IDTENTRY_RAW(pvm_exc_machine_check)
+{
+ /*
+ * There's no IST on PVM, but we still need to dispatch
+ * to the correct handler.
+ */
+ if (user_mode(regs))
+ noist_exc_machine_check(regs);
+ else
+ exc_machine_check(regs);
+}
+#endif
+
+static noinstr void pvm_exception(struct pt_regs *regs, unsigned long vector,
+ unsigned long error_code)
+{
+	/* Optimize for #PF. That's the only exception which matters performance-wise. */
+ if (likely(vector == X86_TRAP_PF)) {
+ exc_page_fault(regs, error_code);
+ return;
+ }
+
+ switch (vector) {
+ case X86_TRAP_DE: return exc_divide_error(regs);
+ case X86_TRAP_DB: return pvm_exc_debug(regs);
+ case X86_TRAP_NMI: return exc_nmi(regs);
+ case X86_TRAP_BP: return exc_int3(regs);
+ case X86_TRAP_OF: return exc_overflow(regs);
+ case X86_TRAP_BR: return exc_bounds(regs);
+ case X86_TRAP_UD: return exc_invalid_op(regs);
+ case X86_TRAP_NM: return exc_device_not_available(regs);
+ case X86_TRAP_DF: return exc_double_fault(regs, error_code);
+ case X86_TRAP_TS: return exc_invalid_tss(regs, error_code);
+ case X86_TRAP_NP: return exc_segment_not_present(regs, error_code);
+ case X86_TRAP_SS: return exc_stack_segment(regs, error_code);
+ case X86_TRAP_GP: return exc_general_protection(regs, error_code);
+ case X86_TRAP_MF: return exc_coprocessor_error(regs);
+ case X86_TRAP_AC: return exc_alignment_check(regs, error_code);
+ case X86_TRAP_XF: return exc_simd_coprocessor_error(regs);
+#ifdef CONFIG_X86_MCE
+ case X86_TRAP_MC: return pvm_exc_machine_check(regs);
+#endif
+#ifdef CONFIG_X86_CET
+ case X86_TRAP_CP: return exc_control_protection(regs, error_code);
+#endif
+ default: return pvm_bad_event(regs, vector, error_code);
+ }
+}
+
+static noinstr void pvm_handle_INT80_compat(struct pt_regs *regs)
+{
+#ifdef CONFIG_IA32_EMULATION
+ if (ia32_enabled()) {
+ int80_emulation(regs);
+ return;
+ }
+#endif
+ exc_general_protection(regs, 0);
+}
+
+typedef void (*idtentry_t)(struct pt_regs *regs);
+
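+/* Map a system vector to its sysvec_* handler; the table is indexed from FIRST_SYSTEM_VECTOR. */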
+#define SYSVEC(_vector, _function) [_vector - FIRST_SYSTEM_VECTOR] = sysvec_##_function
+
+#define pvm_handle_spurious_interrupt ((idtentry_t)(void *)spurious_interrupt)
+
+static idtentry_t pvm_sysvec_table[NR_SYSTEM_VECTORS] __ro_after_init = {
+ [0 ... NR_SYSTEM_VECTORS-1] = pvm_handle_spurious_interrupt,
+
+ SYSVEC(ERROR_APIC_VECTOR, error_interrupt),
+ SYSVEC(SPURIOUS_APIC_VECTOR, spurious_apic_interrupt),
+ SYSVEC(LOCAL_TIMER_VECTOR, apic_timer_interrupt),
+ SYSVEC(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi),
+
+#ifdef CONFIG_SMP
+ SYSVEC(RESCHEDULE_VECTOR, reschedule_ipi),
+ SYSVEC(CALL_FUNCTION_SINGLE_VECTOR, call_function_single),
+ SYSVEC(CALL_FUNCTION_VECTOR, call_function),
+ SYSVEC(REBOOT_VECTOR, reboot),
+#endif
+#ifdef CONFIG_X86_MCE_THRESHOLD
+ SYSVEC(THRESHOLD_APIC_VECTOR, threshold),
+#endif
+#ifdef CONFIG_X86_MCE_AMD
+ SYSVEC(DEFERRED_ERROR_VECTOR, deferred_error),
+#endif
+#ifdef CONFIG_X86_THERMAL_VECTOR
+ SYSVEC(THERMAL_APIC_VECTOR, thermal),
+#endif
+#ifdef CONFIG_IRQ_WORK
+ SYSVEC(IRQ_WORK_VECTOR, irq_work),
+#endif
+#ifdef CONFIG_HAVE_KVM
+ SYSVEC(POSTED_INTR_VECTOR, kvm_posted_intr_ipi),
+ SYSVEC(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi),
+ SYSVEC(POSTED_INTR_NESTED_VECTOR, kvm_posted_intr_nested_ipi),
+#endif
+};
+
+/*
+ * Some entries in pvm_sysvec_table point to spurious_interrupt(), which
+ * expects the vector number as its second argument.
+ */
+typedef void (*idtentry_x_t)(struct pt_regs *regs, int vector);
+
+static __always_inline void pvm_handle_sysvec(struct pt_regs *regs, unsigned long vector)
+{
+ unsigned int index = array_index_nospec(vector - FIRST_SYSTEM_VECTOR,
+ NR_SYSTEM_VECTORS);
+ idtentry_x_t func = (void *)pvm_sysvec_table[index];
+
+ func(regs, vector);
+}
+
+__visible noinstr void pvm_event(struct pt_regs *regs)
+{
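+	/*
+	 * The entry code stashes the event word in orig_ax: the vector in
+	 * the upper 32 bits and the error code in the lower 32 bits.
+	 */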
+ u32 error_code = regs->orig_ax;
+ u64 vector = regs->orig_ax >> 32;
+
+ /* Invalidate orig_ax so that syscall_get_nr() works correctly */
+ regs->orig_ax = -1;
+
+ if (vector < NUM_EXCEPTION_VECTORS)
+ pvm_exception(regs, vector, error_code);
+ else if (vector >= FIRST_SYSTEM_VECTOR)
+ pvm_handle_sysvec(regs, vector);
+ else if (unlikely(vector == IA32_SYSCALL_VECTOR))
+ pvm_handle_INT80_compat(regs);
+ else
+ common_interrupt(regs, vector);
+}
+
void __init pvm_early_setup(void)
{
if (!pvm_range_end)