
[v9,12/16] ARM: KVM: World-switch implementation

Message ID 20120703090126.27746.5908.stgit@ubuntu (mailing list archive)
State New, archived

Commit Message

Christoffer Dall July 3, 2012, 9:01 a.m. UTC
Provides a complete world-switch implementation to switch to other guests
running in non-secure modes. Includes Hyp exception handlers that
capture the necessary exception information and store it in the VCPU and
KVM structures.

The following Hyp-ABI is also documented in the code:

Hyp-ABI: Switching from host kernel to Hyp-mode:
   Switching to Hyp mode is done through a simple HVC instruction. The
   exception vector code will check that the HVC comes from VMID==0 and if
   so will store the necessary state on the Hyp stack, which will look like
   this (growing downwards, see the hyp_hvc handler):
     ...
     stack_page + 4: spsr (Host-SVC cpsr)
     stack_page    : lr_usr
     --------------: stack bottom

Hyp-ABI: Switching from Hyp-mode to host kernel SVC mode:
   When returning from Hyp mode to SVC mode, another HVC instruction is
   executed from Hyp mode, which is taken in the hyp_svc handler. The
   bottom of the Hyp stack is derived from the Hyp stack pointer (only a single
   page aligned stack is used per CPU) and the initial SVC registers are
   used to restore the host state.

Otherwise, the world-switch is pretty straightforward. All state that
can be modified by the guest is first backed up on the Hyp stack and the
VCPU values are loaded onto the hardware. State that is not loaded, but
theoretically modifiable by the guest, is protected through the
virtualization features to generate a trap and cause software emulation.
Upon return from the guest, all state is restored from the hardware onto
the VCPU struct and the original state is restored from the Hyp stack
onto the hardware.

One controversy may be the back-door call to __irq_svc (the host
kernel's own physical IRQ handler) which is called when a physical IRQ
exception is taken in Hyp mode while running in the guest.

SMP support, using the VMPIDR calculated from the host MPIDR with the low
bits overridden by the KVM vcpu_id, was contributed by Marc Zyngier.

Reuse of VMIDs has been implemented by Antonios Motakis and adapted from
a separate patch into the appropriate patches introducing the
functionality. Note that the VMIDs are stored per VM as required by the ARM
architecture reference manual.

Signed-off-by: Christoffer Dall <c.dall@virtualopensystems.com>
---
 arch/arm/include/asm/kvm_arm.h |   37 ++
 arch/arm/kernel/armksyms.c     |    7 
 arch/arm/kernel/asm-offsets.c  |   43 +++
 arch/arm/kernel/entry-armv.S   |    1 
 arch/arm/kvm/arm.c             |  181 ++++++++++++
 arch/arm/kvm/interrupts.S      |  599 ++++++++++++++++++++++++++++++++++++++++
 6 files changed, 865 insertions(+), 3 deletions(-)



Comments

Avi Kivity July 3, 2012, 10:07 a.m. UTC | #1
On 07/03/2012 12:01 PM, Christoffer Dall wrote:
> Provides complete world-switch implementation to switch to other guests
> running in non-secure modes. Includes Hyp exception handlers that
> capture necessary exception information and stores the information on
> the VCPU and KVM structures.
> 
> The following Hyp-ABI is also documented in the code:
> 
> Hyp-ABI: Switching from host kernel to Hyp-mode:
>    Switching to Hyp mode is done through a simple HVC instructions. The
>    exception vector code will check that the HVC comes from VMID==0 and if
>    so will store the necessary state on the Hyp stack, which will look like
>    this (growing downwards, see the hyp_hvc handler):
>      ...
>      stack_page + 4: spsr (Host-SVC cpsr)
>      stack_page    : lr_usr
>      --------------: stack bottom
> 
> Hyp-ABI: Switching from Hyp-mode to host kernel SVC mode:
>    When returning from Hyp mode to SVC mode, another HVC instruction is
>    executed from Hyp mode, which is taken in the hyp_svc handler. The
>    bottom of the Hyp is derived from the Hyp stack pointer (only a single
>    page aligned stack is used per CPU) and the initial SVC registers are
>    used to restore the host state.
> 
> Otherwise, the world-switch is pretty straight-forward. All state that
> can be modified by the guest is first backed up on the Hyp stack and the
> VCPU values is loaded onto the hardware. State, which is not loaded, but
> theoretically modifiable by the guest is protected through the
> virtualiation features to generate a trap and cause software emulation.
> Upon guest returns, all state is restored from hardware onto the VCPU
> struct and the original state is restored from the Hyp-stack onto the
> hardware.
> 
> One controversy may be the back-door call to __irq_svc (the host
> kernel's own physical IRQ handler) which is called when a physical IRQ
> exception is taken in Hyp mode while running in the guest.
> 
> SMP support using the VMPIDR calculated on the basis of the host MPIDR
> and overriding the low bits with KVM vcpu_id contributed by Marc Zyngier.

He should sign off on this patch then.

> 
> Reuse of VMIDs has been implemented by Antonios Motakis and adapated from
> a separate patch into the appropriate patches introducing the
> functionality. Note that the VMIDs are stored per VM as required by the ARM
> architecture reference manual.

Ditto.

> diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h
> index 220f241..232117c 100644
> --- a/arch/arm/include/asm/kvm_arm.h
> +++ b/arch/arm/include/asm/kvm_arm.h
> @@ -105,6 +105,17 @@
>  #define TTBCR_T0SZ	3
>  #define HTCR_MASK	(TTBCR_T0SZ | TTBCR_IRGN0 | TTBCR_ORGN0 | TTBCR_SH0)
>  
> +/* Hyp System Trap Register */
> +#define HSTR_T(x)	(1 << x)
> +#define HSTR_TTEE	(1 << 16)
> +#define HSTR_TJDBX	(1 << 17)
> +
> +/* Hyp Coprocessor Trap Register */
> +#define HCPTR_TCP(x)	(1 << x)
> +#define HCPTR_TCP_MASK	(0x3fff)
> +#define HCPTR_TASE	(1 << 15)
> +#define HCPTR_TTA	(1 << 20)
> +#define HCPTR_TCPAC	(1 << 31)
>  
>  /* Virtualization Translation Control Register (VTCR) bits */
>  #define VTCR_SH0	(3 << 12)
> @@ -126,5 +137,31 @@
>  #define VTTBR_X		(5 - VTCR_GUEST_T0SZ)
>  #endif
>  
> +/* Hyp Syndrome Register (HSR) bits */
> +#define HSR_EC_SHIFT	(26)
> +#define HSR_EC		(0x3fU << HSR_EC_SHIFT)
> +#define HSR_IL		(1U << 25)
> +#define HSR_ISS		(HSR_IL - 1)
> +#define HSR_ISV_SHIFT	(24)
> +#define HSR_ISV		(1U << HSR_ISV_SHIFT)
> +
> +#define HSR_EC_UNKNOWN	(0x00)
> +#define HSR_EC_WFI	(0x01)
> +#define HSR_EC_CP15_32	(0x03)
> +#define HSR_EC_CP15_64	(0x04)
> +#define HSR_EC_CP14_MR	(0x05)
> +#define HSR_EC_CP14_LS	(0x06)
> +#define HSR_EC_CP_0_13	(0x07)
> +#define HSR_EC_CP10_ID	(0x08)
> +#define HSR_EC_JAZELLE	(0x09)
> +#define HSR_EC_BXJ	(0x0A)
> +#define HSR_EC_CP14_64	(0x0C)
> +#define HSR_EC_SVC_HYP	(0x11)
> +#define HSR_EC_HVC	(0x12)
> +#define HSR_EC_SMC	(0x13)
> +#define HSR_EC_IABT	(0x20)
> +#define HSR_EC_IABT_HYP	(0x21)
> +#define HSR_EC_DABT	(0x24)
> +#define HSR_EC_DABT_HYP	(0x25)
>  
>  #endif /* __ARM_KVM_ARM_H__ */
> diff --git a/arch/arm/kernel/armksyms.c b/arch/arm/kernel/armksyms.c
> index b57c75e..38d3a12 100644
> --- a/arch/arm/kernel/armksyms.c
> +++ b/arch/arm/kernel/armksyms.c
> @@ -48,6 +48,13 @@ extern void __aeabi_ulcmp(void);
>  
>  extern void fpundefinstr(void);
>  
> +#ifdef CONFIG_KVM_ARM_HOST
> +/* This is needed for KVM */
> +extern void __irq_svc(void);
> +
> +EXPORT_SYMBOL_GPL(__irq_svc);
> +#endif
> +
>  	/* platform dependent support */
>  EXPORT_SYMBOL(__udelay);
>  EXPORT_SYMBOL(__const_udelay);
> diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c
> index 1429d89..9c76b53 100644
> --- a/arch/arm/kernel/asm-offsets.c
> +++ b/arch/arm/kernel/asm-offsets.c
> @@ -13,6 +13,7 @@
>  #include <linux/sched.h>
>  #include <linux/mm.h>
>  #include <linux/dma-mapping.h>
> +#include <linux/kvm_host.h>
>  #include <asm/cacheflush.h>
>  #include <asm/glue-df.h>
>  #include <asm/glue-pf.h>
> @@ -144,5 +145,47 @@ int main(void)
>    DEFINE(DMA_BIDIRECTIONAL,	DMA_BIDIRECTIONAL);
>    DEFINE(DMA_TO_DEVICE,		DMA_TO_DEVICE);
>    DEFINE(DMA_FROM_DEVICE,	DMA_FROM_DEVICE);
> +#ifdef CONFIG_KVM_ARM_HOST
> +  DEFINE(VCPU_KVM,		offsetof(struct kvm_vcpu, kvm));
> +  DEFINE(VCPU_MIDR,		offsetof(struct kvm_vcpu, arch.cp15[c0_MIDR]));
> +  DEFINE(VCPU_MPIDR,		offsetof(struct kvm_vcpu, arch.cp15[c0_MPIDR]));
> +  DEFINE(VCPU_SCTLR,		offsetof(struct kvm_vcpu, arch.cp15[c1_SCTLR]));
> +  DEFINE(VCPU_CPACR,		offsetof(struct kvm_vcpu, arch.cp15[c1_CPACR]));
> +  DEFINE(VCPU_TTBR0,		offsetof(struct kvm_vcpu, arch.cp15[c2_TTBR0]));
> +  DEFINE(VCPU_TTBR1,		offsetof(struct kvm_vcpu, arch.cp15[c2_TTBR1]));
> +  DEFINE(VCPU_TTBCR,		offsetof(struct kvm_vcpu, arch.cp15[c2_TTBCR]));
> +  DEFINE(VCPU_DACR,		offsetof(struct kvm_vcpu, arch.cp15[c3_DACR]));
> +  DEFINE(VCPU_DFSR,		offsetof(struct kvm_vcpu, arch.cp15[c5_DFSR]));
> +  DEFINE(VCPU_IFSR,		offsetof(struct kvm_vcpu, arch.cp15[c5_IFSR]));
> +  DEFINE(VCPU_ADFSR,		offsetof(struct kvm_vcpu, arch.cp15[c5_ADFSR]));
> +  DEFINE(VCPU_AIFSR,		offsetof(struct kvm_vcpu, arch.cp15[c5_AIFSR]));
> +  DEFINE(VCPU_DFAR,		offsetof(struct kvm_vcpu, arch.cp15[c6_DFAR]));
> +  DEFINE(VCPU_IFAR,		offsetof(struct kvm_vcpu, arch.cp15[c6_IFAR]));
> +  DEFINE(VCPU_PRRR,		offsetof(struct kvm_vcpu, arch.cp15[c10_PRRR]));
> +  DEFINE(VCPU_NMRR,		offsetof(struct kvm_vcpu, arch.cp15[c10_NMRR]));
> +  DEFINE(VCPU_VBAR,		offsetof(struct kvm_vcpu, arch.cp15[c12_VBAR]));
> +  DEFINE(VCPU_CID,		offsetof(struct kvm_vcpu, arch.cp15[c13_CID]));
> +  DEFINE(VCPU_TID_URW,		offsetof(struct kvm_vcpu, arch.cp15[c13_TID_URW]));
> +  DEFINE(VCPU_TID_URO,		offsetof(struct kvm_vcpu, arch.cp15[c13_TID_URO]));
> +  DEFINE(VCPU_TID_PRIV,		offsetof(struct kvm_vcpu, arch.cp15[c13_TID_PRIV]));
> +  DEFINE(VCPU_REGS,		offsetof(struct kvm_vcpu, arch.regs));
> +  DEFINE(VCPU_USR_REGS,		offsetof(struct kvm_vcpu, arch.regs.usr_regs));
> +  DEFINE(VCPU_SVC_REGS,		offsetof(struct kvm_vcpu, arch.regs.svc_regs));
> +  DEFINE(VCPU_ABT_REGS,		offsetof(struct kvm_vcpu, arch.regs.abt_regs));
> +  DEFINE(VCPU_UND_REGS,		offsetof(struct kvm_vcpu, arch.regs.und_regs));
> +  DEFINE(VCPU_IRQ_REGS,		offsetof(struct kvm_vcpu, arch.regs.irq_regs));
> +  DEFINE(VCPU_FIQ_REGS,		offsetof(struct kvm_vcpu, arch.regs.fiq_regs));
> +  DEFINE(VCPU_PC,		offsetof(struct kvm_vcpu, arch.regs.pc));
> +  DEFINE(VCPU_CPSR,		offsetof(struct kvm_vcpu, arch.regs.cpsr));
> +  DEFINE(VCPU_IRQ_LINES,	offsetof(struct kvm_vcpu, arch.irq_lines));
> +  DEFINE(VCPU_HSR,		offsetof(struct kvm_vcpu, arch.hsr));
> +  DEFINE(VCPU_HDFAR,		offsetof(struct kvm_vcpu, arch.hdfar));
> +  DEFINE(VCPU_HIFAR,		offsetof(struct kvm_vcpu, arch.hifar));
> +  DEFINE(VCPU_HPFAR,		offsetof(struct kvm_vcpu, arch.hpfar));
> +  DEFINE(VCPU_PC_IPA,		offsetof(struct kvm_vcpu, arch.pc_ipa));
> +  DEFINE(VCPU_PC_IPA2,		offsetof(struct kvm_vcpu, arch.pc_ipa2));
> +  DEFINE(VCPU_HYP_PC,		offsetof(struct kvm_vcpu, arch.hyp_pc));
> +  DEFINE(KVM_VTTBR,		offsetof(struct kvm, arch.vttbr));
> +#endif
>    return 0; 
>  }
> diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
> index 437f0c4..db029bb 100644
> --- a/arch/arm/kernel/entry-armv.S
> +++ b/arch/arm/kernel/entry-armv.S
> @@ -209,6 +209,7 @@ __dabt_svc:
>  ENDPROC(__dabt_svc)
>  
>  	.align	5
> +	.globl __irq_svc
>  __irq_svc:
>  	svc_entry
>  	irq_handler
> diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
> index 8b024ee..4687690 100644
> --- a/arch/arm/kvm/arm.c
> +++ b/arch/arm/kvm/arm.c
> @@ -37,12 +37,19 @@
>  #include <asm/mman.h>
>  #include <asm/idmap.h>
>  #include <asm/tlbflush.h>
> +#include <asm/cputype.h>
>  #include <asm/kvm_arm.h>
>  #include <asm/kvm_asm.h>
>  #include <asm/kvm_mmu.h>
> +#include <asm/kvm_emulate.h>
>  
>  static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page);
>  
> +/* The VMID used in the VTTBR */
> +static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1);
> +static u8 kvm_next_vmid;
> +DEFINE_SPINLOCK(kvm_vmid_lock);

static, too.

> +
> +
> +/**
> + * check_new_vmid_gen - check that the VMID is still valid
> + * @kvm: The VM's VMID to checkt
> + *
> + * return true if there is a new generation of VMIDs being used
> + *
> + * The hardware supports only 256 values with the value zero reserved for the
> + * host, so we check if an assigned value belongs to a previous generation,
> + * which which requires us to assign a new value. If we're the first to use a
> + * VMID for the new generation, we must flush necessary caches and TLBs on all
> + * CPUs.
> + */
> +static bool check_new_vmid_gen(struct kvm *kvm)
> +{
> +	return unlikely(kvm->arch.vmid_gen != atomic64_read(&kvm_vmid_gen));
> +}

Better have the name indicate what a true return value means, like
'need_new_vmid_gen()'.
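
That is, something along these lines (same body as above, just renamed):

	static bool need_new_vmid_gen(struct kvm *kvm)
	{
		return unlikely(kvm->arch.vmid_gen != atomic64_read(&kvm_vmid_gen));
	}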

> +
> +/**
> + * update_vttbr - Update the VTTBR with a valid VMID before the guest runs
> + * @kvm	The guest that we are about to run
> + *
> + * Called from kvm_arch_vcpu_ioctl_run before entering the guest to ensure the
> + * VM has a valid VMID, otherwise assigns a new one and flushes corresponding
> + * caches and TLBs.
> + */
> +static void update_vttbr(struct kvm *kvm)
> +{
> +	phys_addr_t pgd_phys;
> +
> +	if (!check_new_vmid_gen(kvm))
> +		return;
> +
> +	spin_lock(&kvm_vmid_lock);
> +
> +	/* First user of a new VMID generation? */
> +	if (unlikely(kvm_next_vmid == 0)) {
> +		atomic64_inc(&kvm_vmid_gen);
> +		kvm_next_vmid = 1;
> +
> +		/* This does nothing on UP */
> +		smp_call_function(reset_vm_context, NULL, 1);
> +
> +		/*
> +		 * On SMP we know no other CPUs can use this CPU's or
> +		 * each other's VMID since the kvm_vmid_lock blocks
> +		 * them from reentry to the guest.
> +		 */
> +
> +		reset_vm_context(NULL);

on_each_cpu() will combine the two lines above.
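
For example (a minimal sketch; on_each_cpu() runs the callback on every
online CPU, including the calling one):

	on_each_cpu(reset_vm_context, NULL, 1);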

> +	}
> +
> +	kvm->arch.vmid_gen = atomic64_read(&kvm_vmid_gen);
> +	kvm->arch.vmid = kvm_next_vmid;
> +	kvm_next_vmid++;
> +
> +	/* update vttbr to be used with the new vmid */
> +	pgd_phys = virt_to_phys(kvm->arch.pgd);
> +	kvm->arch.vttbr = pgd_phys & ((1LLU << 40) - 1)
> +			  & ~((2 << VTTBR_X) - 1);
> +	kvm->arch.vttbr |= (u64)(kvm->arch.vmid) << 48;
> +
> +	spin_unlock(&kvm_vmid_lock);
> +}
> +
> +/*
> + * Return 0 to return to guest, < 0 on error, exit_reason ( > 0) on proper
> + * exit to QEMU.
> + */
> +static int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
> +		       int exception_index)
> +{
> +	return -EINVAL;

x86 returns KVM_EXIT_INTERNAL_ERROR when it encounters an unhandlable
exit.  -EINVAL indicates that the user has done something wrong, which
isn't the case here.
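
A rough, untested sketch of what that could look like here (assuming the
ret-based loop convention discussed below):

	static int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
			       int exception_index)
	{
		/* Unhandlable exit: report it to userspace, don't fail the ioctl */
		run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
		return 0;	/* 0 means "return to userspace" */
	}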

> +}
> +
> +/*
> + * Return 0 to proceed with guest entry
> + */
> +static int vcpu_pre_guest_enter(struct kvm_vcpu *vcpu, int *exit_reason)
> +{
> +	if (signal_pending(current)) {
> +		*exit_reason = KVM_EXIT_INTR;
> +		return -EINTR;
> +	}
> +
> +	if (check_new_vmid_gen(vcpu->kvm))
> +		return 1;
> +
> +	BUG_ON(__vcpu_mode(*vcpu_cpsr(vcpu)) == 0xf);
> +
>  	return 0;
>  }
>  
> +/**
> + * kvm_arch_vcpu_ioctl_run - the main VCPU run function to execute guest code
> + * @vcpu:	The VCPU pointer
> + * @run:	The kvm_run structure pointer used for userspace state exchange
> + *
> + * This function is called through the VCPU_RUN ioctl called from user space. It
> + * will execute VM code in a loop until the time slice for the process is used
> + * or some emulation is needed from user space in which case the function will
> + * return with return value 0 and with the kvm_run structure filled in with the
> + * required data for the requested emulation.
> + */
>  int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
>  {
> -	return -EINVAL;
> +	int ret = 0;
> +	int exit_reason;
> +	sigset_t sigsaved;
> +
> +	if (vcpu->sigset_active)
> +		sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
> +

We should move this to common code.  But I don't mind if this is done
post merge.

> +	exit_reason = KVM_EXIT_UNKNOWN;
> +	while (exit_reason == KVM_EXIT_UNKNOWN) {

Looping over 'ret' is more in line with x86 and clearer IMO.  x86 uses
the convention: < 0 -> return to userspace with error, 0 -> return to
userspace, 1 -> loop.
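
In rough outline (a sketch of the control flow only, not the exact code):

	ret = 1;
	while (ret > 0) {
		/* cond_resched(), update_vttbr(), disable IRQs, enter guest... */
		ret = handle_exit(vcpu, run, exception_index);
	}
	/* handle_exit() fills in run->exit_reason before returning <= 0 */
	return ret;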

> +		/*
> +		 * Check conditions before entering the guest
> +		 */
> +		cond_resched();
> +
> +		update_vttbr(vcpu->kvm);
> +
> +		local_irq_disable();
> +
> +		/* Re-check atomic conditions */
> +		ret = vcpu_pre_guest_enter(vcpu, &exit_reason);
> +		if (ret != 0) {
> +			local_irq_enable();
> +			preempt_enable();
> +			continue;

See - you continue, only to break out of the loop due to a side effect
on exit_reason.

> +		}
> +
> +		/**************************************************************
> +		 * Enter the guest
> +		 */
> +		trace_kvm_entry(vcpu->arch.regs.pc);
> +		kvm_guest_enter();
> +		vcpu->mode = IN_GUEST_MODE;
> +
> +		ret = __kvm_vcpu_run(vcpu);
> +
> +		vcpu->mode = OUTSIDE_GUEST_MODE;
> +		vcpu->stat.exits++;

The tracepoint above should be sufficient for statistics.

> +		kvm_guest_exit();
> +		trace_kvm_exit(vcpu->arch.regs.pc);
> +		local_irq_enable();
> +
> +		/*
> +		 * Back from guest
> +		 *************************************************************/
> +
> +		ret = handle_exit(vcpu, run, ret);
> +		if (ret < 0) {
> +			kvm_err("Error in handle_exit\n");
> +			break;
> +		} else {
> +			exit_reason = ret; /* 0 == KVM_EXIT_UNKNOWN */
> +		}
> +	}
> +
> +	if (vcpu->sigset_active)
> +		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
> +
> +	run->exit_reason = exit_reason;
> +	return ret;
>  }
>
Christoffer Dall July 25, 2012, 2:16 p.m. UTC | #2
On Tue, Jul 3, 2012 at 12:07 PM, Avi Kivity <avi@redhat.com> wrote:
> On 07/03/2012 12:01 PM, Christoffer Dall wrote:
>> Provides complete world-switch implementation to switch to other guests
>> running in non-secure modes. Includes Hyp exception handlers that
>> capture necessary exception information and stores the information on
>> the VCPU and KVM structures.
>>
>> The following Hyp-ABI is also documented in the code:
>>
>> Hyp-ABI: Switching from host kernel to Hyp-mode:
>>    Switching to Hyp mode is done through a simple HVC instructions. The
>>    exception vector code will check that the HVC comes from VMID==0 and if
>>    so will store the necessary state on the Hyp stack, which will look like
>>    this (growing downwards, see the hyp_hvc handler):
>>      ...
>>      stack_page + 4: spsr (Host-SVC cpsr)
>>      stack_page    : lr_usr
>>      --------------: stack bottom
>>
>> Hyp-ABI: Switching from Hyp-mode to host kernel SVC mode:
>>    When returning from Hyp mode to SVC mode, another HVC instruction is
>>    executed from Hyp mode, which is taken in the hyp_svc handler. The
>>    bottom of the Hyp is derived from the Hyp stack pointer (only a single
>>    page aligned stack is used per CPU) and the initial SVC registers are
>>    used to restore the host state.
>>
>> Otherwise, the world-switch is pretty straight-forward. All state that
>> can be modified by the guest is first backed up on the Hyp stack and the
>> VCPU values is loaded onto the hardware. State, which is not loaded, but
>> theoretically modifiable by the guest is protected through the
>> virtualiation features to generate a trap and cause software emulation.
>> Upon guest returns, all state is restored from hardware onto the VCPU
>> struct and the original state is restored from the Hyp-stack onto the
>> hardware.
>>
>> One controversy may be the back-door call to __irq_svc (the host
>> kernel's own physical IRQ handler) which is called when a physical IRQ
>> exception is taken in Hyp mode while running in the guest.
>>
>> SMP support using the VMPIDR calculated on the basis of the host MPIDR
>> and overriding the low bits with KVM vcpu_id contributed by Marc Zyngier.
>
> He should sign off on this patch then.
>
>>
>> Reuse of VMIDs has been implemented by Antonios Motakis and adapated from
>> a separate patch into the appropriate patches introducing the
>> functionality. Note that the VMIDs are stored per VM as required by the ARM
>> architecture reference manual.
>
> Ditto.
>
>> diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h
>> index 220f241..232117c 100644
>> --- a/arch/arm/include/asm/kvm_arm.h
>> +++ b/arch/arm/include/asm/kvm_arm.h
>> @@ -105,6 +105,17 @@
>>  #define TTBCR_T0SZ   3
>>  #define HTCR_MASK    (TTBCR_T0SZ | TTBCR_IRGN0 | TTBCR_ORGN0 | TTBCR_SH0)
>>
>> +/* Hyp System Trap Register */
>> +#define HSTR_T(x)    (1 << x)
>> +#define HSTR_TTEE    (1 << 16)
>> +#define HSTR_TJDBX   (1 << 17)
>> +
>> +/* Hyp Coprocessor Trap Register */
>> +#define HCPTR_TCP(x) (1 << x)
>> +#define HCPTR_TCP_MASK       (0x3fff)
>> +#define HCPTR_TASE   (1 << 15)
>> +#define HCPTR_TTA    (1 << 20)
>> +#define HCPTR_TCPAC  (1 << 31)
>>
>>  /* Virtualization Translation Control Register (VTCR) bits */
>>  #define VTCR_SH0     (3 << 12)
>> @@ -126,5 +137,31 @@
>>  #define VTTBR_X              (5 - VTCR_GUEST_T0SZ)
>>  #endif
>>
>> +/* Hyp Syndrome Register (HSR) bits */
>> +#define HSR_EC_SHIFT (26)
>> +#define HSR_EC               (0x3fU << HSR_EC_SHIFT)
>> +#define HSR_IL               (1U << 25)
>> +#define HSR_ISS              (HSR_IL - 1)
>> +#define HSR_ISV_SHIFT        (24)
>> +#define HSR_ISV              (1U << HSR_ISV_SHIFT)
>> +
>> +#define HSR_EC_UNKNOWN       (0x00)
>> +#define HSR_EC_WFI   (0x01)
>> +#define HSR_EC_CP15_32       (0x03)
>> +#define HSR_EC_CP15_64       (0x04)
>> +#define HSR_EC_CP14_MR       (0x05)
>> +#define HSR_EC_CP14_LS       (0x06)
>> +#define HSR_EC_CP_0_13       (0x07)
>> +#define HSR_EC_CP10_ID       (0x08)
>> +#define HSR_EC_JAZELLE       (0x09)
>> +#define HSR_EC_BXJ   (0x0A)
>> +#define HSR_EC_CP14_64       (0x0C)
>> +#define HSR_EC_SVC_HYP       (0x11)
>> +#define HSR_EC_HVC   (0x12)
>> +#define HSR_EC_SMC   (0x13)
>> +#define HSR_EC_IABT  (0x20)
>> +#define HSR_EC_IABT_HYP      (0x21)
>> +#define HSR_EC_DABT  (0x24)
>> +#define HSR_EC_DABT_HYP      (0x25)
>>
>>  #endif /* __ARM_KVM_ARM_H__ */
>> diff --git a/arch/arm/kernel/armksyms.c b/arch/arm/kernel/armksyms.c
>> index b57c75e..38d3a12 100644
>> --- a/arch/arm/kernel/armksyms.c
>> +++ b/arch/arm/kernel/armksyms.c
>> @@ -48,6 +48,13 @@ extern void __aeabi_ulcmp(void);
>>
>>  extern void fpundefinstr(void);
>>
>> +#ifdef CONFIG_KVM_ARM_HOST
>> +/* This is needed for KVM */
>> +extern void __irq_svc(void);
>> +
>> +EXPORT_SYMBOL_GPL(__irq_svc);
>> +#endif
>> +
>>       /* platform dependent support */
>>  EXPORT_SYMBOL(__udelay);
>>  EXPORT_SYMBOL(__const_udelay);
>> diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c
>> index 1429d89..9c76b53 100644
>> --- a/arch/arm/kernel/asm-offsets.c
>> +++ b/arch/arm/kernel/asm-offsets.c
>> @@ -13,6 +13,7 @@
>>  #include <linux/sched.h>
>>  #include <linux/mm.h>
>>  #include <linux/dma-mapping.h>
>> +#include <linux/kvm_host.h>
>>  #include <asm/cacheflush.h>
>>  #include <asm/glue-df.h>
>>  #include <asm/glue-pf.h>
>> @@ -144,5 +145,47 @@ int main(void)
>>    DEFINE(DMA_BIDIRECTIONAL,  DMA_BIDIRECTIONAL);
>>    DEFINE(DMA_TO_DEVICE,              DMA_TO_DEVICE);
>>    DEFINE(DMA_FROM_DEVICE,    DMA_FROM_DEVICE);
>> +#ifdef CONFIG_KVM_ARM_HOST
>> +  DEFINE(VCPU_KVM,           offsetof(struct kvm_vcpu, kvm));
>> +  DEFINE(VCPU_MIDR,          offsetof(struct kvm_vcpu, arch.cp15[c0_MIDR]));
>> +  DEFINE(VCPU_MPIDR,         offsetof(struct kvm_vcpu, arch.cp15[c0_MPIDR]));
>> +  DEFINE(VCPU_SCTLR,         offsetof(struct kvm_vcpu, arch.cp15[c1_SCTLR]));
>> +  DEFINE(VCPU_CPACR,         offsetof(struct kvm_vcpu, arch.cp15[c1_CPACR]));
>> +  DEFINE(VCPU_TTBR0,         offsetof(struct kvm_vcpu, arch.cp15[c2_TTBR0]));
>> +  DEFINE(VCPU_TTBR1,         offsetof(struct kvm_vcpu, arch.cp15[c2_TTBR1]));
>> +  DEFINE(VCPU_TTBCR,         offsetof(struct kvm_vcpu, arch.cp15[c2_TTBCR]));
>> +  DEFINE(VCPU_DACR,          offsetof(struct kvm_vcpu, arch.cp15[c3_DACR]));
>> +  DEFINE(VCPU_DFSR,          offsetof(struct kvm_vcpu, arch.cp15[c5_DFSR]));
>> +  DEFINE(VCPU_IFSR,          offsetof(struct kvm_vcpu, arch.cp15[c5_IFSR]));
>> +  DEFINE(VCPU_ADFSR,         offsetof(struct kvm_vcpu, arch.cp15[c5_ADFSR]));
>> +  DEFINE(VCPU_AIFSR,         offsetof(struct kvm_vcpu, arch.cp15[c5_AIFSR]));
>> +  DEFINE(VCPU_DFAR,          offsetof(struct kvm_vcpu, arch.cp15[c6_DFAR]));
>> +  DEFINE(VCPU_IFAR,          offsetof(struct kvm_vcpu, arch.cp15[c6_IFAR]));
>> +  DEFINE(VCPU_PRRR,          offsetof(struct kvm_vcpu, arch.cp15[c10_PRRR]));
>> +  DEFINE(VCPU_NMRR,          offsetof(struct kvm_vcpu, arch.cp15[c10_NMRR]));
>> +  DEFINE(VCPU_VBAR,          offsetof(struct kvm_vcpu, arch.cp15[c12_VBAR]));
>> +  DEFINE(VCPU_CID,           offsetof(struct kvm_vcpu, arch.cp15[c13_CID]));
>> +  DEFINE(VCPU_TID_URW,               offsetof(struct kvm_vcpu, arch.cp15[c13_TID_URW]));
>> +  DEFINE(VCPU_TID_URO,               offsetof(struct kvm_vcpu, arch.cp15[c13_TID_URO]));
>> +  DEFINE(VCPU_TID_PRIV,              offsetof(struct kvm_vcpu, arch.cp15[c13_TID_PRIV]));
>> +  DEFINE(VCPU_REGS,          offsetof(struct kvm_vcpu, arch.regs));
>> +  DEFINE(VCPU_USR_REGS,              offsetof(struct kvm_vcpu, arch.regs.usr_regs));
>> +  DEFINE(VCPU_SVC_REGS,              offsetof(struct kvm_vcpu, arch.regs.svc_regs));
>> +  DEFINE(VCPU_ABT_REGS,              offsetof(struct kvm_vcpu, arch.regs.abt_regs));
>> +  DEFINE(VCPU_UND_REGS,              offsetof(struct kvm_vcpu, arch.regs.und_regs));
>> +  DEFINE(VCPU_IRQ_REGS,              offsetof(struct kvm_vcpu, arch.regs.irq_regs));
>> +  DEFINE(VCPU_FIQ_REGS,              offsetof(struct kvm_vcpu, arch.regs.fiq_regs));
>> +  DEFINE(VCPU_PC,            offsetof(struct kvm_vcpu, arch.regs.pc));
>> +  DEFINE(VCPU_CPSR,          offsetof(struct kvm_vcpu, arch.regs.cpsr));
>> +  DEFINE(VCPU_IRQ_LINES,     offsetof(struct kvm_vcpu, arch.irq_lines));
>> +  DEFINE(VCPU_HSR,           offsetof(struct kvm_vcpu, arch.hsr));
>> +  DEFINE(VCPU_HDFAR,         offsetof(struct kvm_vcpu, arch.hdfar));
>> +  DEFINE(VCPU_HIFAR,         offsetof(struct kvm_vcpu, arch.hifar));
>> +  DEFINE(VCPU_HPFAR,         offsetof(struct kvm_vcpu, arch.hpfar));
>> +  DEFINE(VCPU_PC_IPA,                offsetof(struct kvm_vcpu, arch.pc_ipa));
>> +  DEFINE(VCPU_PC_IPA2,               offsetof(struct kvm_vcpu, arch.pc_ipa2));
>> +  DEFINE(VCPU_HYP_PC,                offsetof(struct kvm_vcpu, arch.hyp_pc));
>> +  DEFINE(KVM_VTTBR,          offsetof(struct kvm, arch.vttbr));
>> +#endif
>>    return 0;
>>  }
>> diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
>> index 437f0c4..db029bb 100644
>> --- a/arch/arm/kernel/entry-armv.S
>> +++ b/arch/arm/kernel/entry-armv.S
>> @@ -209,6 +209,7 @@ __dabt_svc:
>>  ENDPROC(__dabt_svc)
>>
>>       .align  5
>> +     .globl __irq_svc
>>  __irq_svc:
>>       svc_entry
>>       irq_handler
>> diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
>> index 8b024ee..4687690 100644
>> --- a/arch/arm/kvm/arm.c
>> +++ b/arch/arm/kvm/arm.c
>> @@ -37,12 +37,19 @@
>>  #include <asm/mman.h>
>>  #include <asm/idmap.h>
>>  #include <asm/tlbflush.h>
>> +#include <asm/cputype.h>
>>  #include <asm/kvm_arm.h>
>>  #include <asm/kvm_asm.h>
>>  #include <asm/kvm_mmu.h>
>> +#include <asm/kvm_emulate.h>
>>
>>  static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page);
>>
>> +/* The VMID used in the VTTBR */
>> +static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1);
>> +static u8 kvm_next_vmid;
>> +DEFINE_SPINLOCK(kvm_vmid_lock);
>
> static, too.
>
>> +
>> +
>> +/**
>> + * check_new_vmid_gen - check that the VMID is still valid
>> + * @kvm: The VM's VMID to checkt
>> + *
>> + * return true if there is a new generation of VMIDs being used
>> + *
>> + * The hardware supports only 256 values with the value zero reserved for the
>> + * host, so we check if an assigned value belongs to a previous generation,
>> + * which which requires us to assign a new value. If we're the first to use a
>> + * VMID for the new generation, we must flush necessary caches and TLBs on all
>> + * CPUs.
>> + */
>> +static bool check_new_vmid_gen(struct kvm *kvm)
>> +{
>> +     return unlikely(kvm->arch.vmid_gen != atomic64_read(&kvm_vmid_gen));
>> +}
>
> Better have the name indicate what a true return value means, like
> 'need_new_vmid_gen()'.
>
agreed
>> +
>> +/**
>> + * update_vttbr - Update the VTTBR with a valid VMID before the guest runs
>> + * @kvm      The guest that we are about to run
>> + *
>> + * Called from kvm_arch_vcpu_ioctl_run before entering the guest to ensure the
>> + * VM has a valid VMID, otherwise assigns a new one and flushes corresponding
>> + * caches and TLBs.
>> + */
>> +static void update_vttbr(struct kvm *kvm)
>> +{
>> +     phys_addr_t pgd_phys;
>> +
>> +     if (!check_new_vmid_gen(kvm))
>> +             return;
>> +
>> +     spin_lock(&kvm_vmid_lock);
>> +
>> +     /* First user of a new VMID generation? */
>> +     if (unlikely(kvm_next_vmid == 0)) {
>> +             atomic64_inc(&kvm_vmid_gen);
>> +             kvm_next_vmid = 1;
>> +
>> +             /* This does nothing on UP */
>> +             smp_call_function(reset_vm_context, NULL, 1);
>> +
>> +             /*
>> +              * On SMP we know no other CPUs can use this CPU's or
>> +              * each other's VMID since the kvm_vmid_lock blocks
>> +              * them from reentry to the guest.
>> +              */
>> +
>> +             reset_vm_context(NULL);
>
> on_each_cpu() will combine the two lines above.
>
thanks
>> +     }
>> +
>> +     kvm->arch.vmid_gen = atomic64_read(&kvm_vmid_gen);
>> +     kvm->arch.vmid = kvm_next_vmid;
>> +     kvm_next_vmid++;
>> +
>> +     /* update vttbr to be used with the new vmid */
>> +     pgd_phys = virt_to_phys(kvm->arch.pgd);
>> +     kvm->arch.vttbr = pgd_phys & ((1LLU << 40) - 1)
>> +                       & ~((2 << VTTBR_X) - 1);
>> +     kvm->arch.vttbr |= (u64)(kvm->arch.vmid) << 48;
>> +
>> +     spin_unlock(&kvm_vmid_lock);
>> +}
>> +
>> +/*
>> + * Return 0 to return to guest, < 0 on error, exit_reason ( > 0) on proper
>> + * exit to QEMU.
>> + */
>> +static int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
>> +                    int exception_index)
>> +{
>> +     return -EINVAL;
>
> x86 returns KVM_EXIT_INTERNAL_ERROR when it encounters an unhandlable
> exit.  -EINVAL indicates that the user has done something wrong, which
> isn't the case here.
>

ok, fair enough this has been reworked as per your comments.

>> +}
>> +
>> +/*
>> + * Return 0 to proceed with guest entry
>> + */
>> +static int vcpu_pre_guest_enter(struct kvm_vcpu *vcpu, int *exit_reason)
>> +{
>> +     if (signal_pending(current)) {
>> +             *exit_reason = KVM_EXIT_INTR;
>> +             return -EINTR;
>> +     }
>> +
>> +     if (check_new_vmid_gen(vcpu->kvm))
>> +             return 1;
>> +
>> +     BUG_ON(__vcpu_mode(*vcpu_cpsr(vcpu)) == 0xf);
>> +
>>       return 0;
>>  }
>>
>> +/**
>> + * kvm_arch_vcpu_ioctl_run - the main VCPU run function to execute guest code
>> + * @vcpu:    The VCPU pointer
>> + * @run:     The kvm_run structure pointer used for userspace state exchange
>> + *
>> + * This function is called through the VCPU_RUN ioctl called from user space. It
>> + * will execute VM code in a loop until the time slice for the process is used
>> + * or some emulation is needed from user space in which case the function will
>> + * return with return value 0 and with the kvm_run structure filled in with the
>> + * required data for the requested emulation.
>> + */
>>  int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
>>  {
>> -     return -EINVAL;
>> +     int ret = 0;
>> +     int exit_reason;
>> +     sigset_t sigsaved;
>> +
>> +     if (vcpu->sigset_active)
>> +             sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
>> +
>
> We should move this to common code.  But I don't mind if this is done
> post merge.
>
>> +     exit_reason = KVM_EXIT_UNKNOWN;
>> +     while (exit_reason == KVM_EXIT_UNKNOWN) {
>
> Looping over 'ret' is more in line with x86 and clearer IMO.  x86 uses
> the convention: < 0 -> return to userspace with error, 0 -> return to
> userspace, 1 -> loop.
>

I duplicated this scheme now for next patch series.

>> +             /*
>> +              * Check conditions before entering the guest
>> +              */
>> +             cond_resched();
>> +
>> +             update_vttbr(vcpu->kvm);
>> +
>> +             local_irq_disable();
>> +
>> +             /* Re-check atomic conditions */
>> +             ret = vcpu_pre_guest_enter(vcpu, &exit_reason);
>> +             if (ret != 0) {
>> +                     local_irq_enable();
>> +                     preempt_enable();
>> +                     continue;
>
> See - you continue, only to break out of the loop due to a side effect
> on exit_reason.
>

yep, I was never in love with this.

>> +             }
>> +
>> +             /**************************************************************
>> +              * Enter the guest
>> +              */
>> +             trace_kvm_entry(vcpu->arch.regs.pc);
>> +             kvm_guest_enter();
>> +             vcpu->mode = IN_GUEST_MODE;
>> +
>> +             ret = __kvm_vcpu_run(vcpu);
>> +
>> +             vcpu->mode = OUTSIDE_GUEST_MODE;
>> +             vcpu->stat.exits++;
>
> The tracepoint above should be sufficient for statistics.
>

Marc added this one, so I assume there's a valid need for the separate
counter. Marc?

>> +             kvm_guest_exit();
>> +             trace_kvm_exit(vcpu->arch.regs.pc);
>> +             local_irq_enable();
>> +
>> +             /*
>> +              * Back from guest
>> +              *************************************************************/
>> +
>> +             ret = handle_exit(vcpu, run, ret);
>> +             if (ret < 0) {
>> +                     kvm_err("Error in handle_exit\n");
>> +                     break;
>> +             } else {
>> +                     exit_reason = ret; /* 0 == KVM_EXIT_UNKNOWN */
>> +             }
>> +     }
>> +
>> +     if (vcpu->sigset_active)
>> +             sigprocmask(SIG_SETMASK, &sigsaved, NULL);
>> +
>> +     run->exit_reason = exit_reason;
>> +     return ret;
>>  }
>>
>

Thanks!
-Christoffer

Patch

diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h
index 220f241..232117c 100644
--- a/arch/arm/include/asm/kvm_arm.h
+++ b/arch/arm/include/asm/kvm_arm.h
@@ -105,6 +105,17 @@ 
 #define TTBCR_T0SZ	3
 #define HTCR_MASK	(TTBCR_T0SZ | TTBCR_IRGN0 | TTBCR_ORGN0 | TTBCR_SH0)
 
+/* Hyp System Trap Register */
+#define HSTR_T(x)	(1 << x)
+#define HSTR_TTEE	(1 << 16)
+#define HSTR_TJDBX	(1 << 17)
+
+/* Hyp Coprocessor Trap Register */
+#define HCPTR_TCP(x)	(1 << x)
+#define HCPTR_TCP_MASK	(0x3fff)
+#define HCPTR_TASE	(1 << 15)
+#define HCPTR_TTA	(1 << 20)
+#define HCPTR_TCPAC	(1 << 31)
 
 /* Virtualization Translation Control Register (VTCR) bits */
 #define VTCR_SH0	(3 << 12)
@@ -126,5 +137,31 @@ 
 #define VTTBR_X		(5 - VTCR_GUEST_T0SZ)
 #endif
 
+/* Hyp Syndrome Register (HSR) bits */
+#define HSR_EC_SHIFT	(26)
+#define HSR_EC		(0x3fU << HSR_EC_SHIFT)
+#define HSR_IL		(1U << 25)
+#define HSR_ISS		(HSR_IL - 1)
+#define HSR_ISV_SHIFT	(24)
+#define HSR_ISV		(1U << HSR_ISV_SHIFT)
+
+#define HSR_EC_UNKNOWN	(0x00)
+#define HSR_EC_WFI	(0x01)
+#define HSR_EC_CP15_32	(0x03)
+#define HSR_EC_CP15_64	(0x04)
+#define HSR_EC_CP14_MR	(0x05)
+#define HSR_EC_CP14_LS	(0x06)
+#define HSR_EC_CP_0_13	(0x07)
+#define HSR_EC_CP10_ID	(0x08)
+#define HSR_EC_JAZELLE	(0x09)
+#define HSR_EC_BXJ	(0x0A)
+#define HSR_EC_CP14_64	(0x0C)
+#define HSR_EC_SVC_HYP	(0x11)
+#define HSR_EC_HVC	(0x12)
+#define HSR_EC_SMC	(0x13)
+#define HSR_EC_IABT	(0x20)
+#define HSR_EC_IABT_HYP	(0x21)
+#define HSR_EC_DABT	(0x24)
+#define HSR_EC_DABT_HYP	(0x25)
 
 #endif /* __ARM_KVM_ARM_H__ */
diff --git a/arch/arm/kernel/armksyms.c b/arch/arm/kernel/armksyms.c
index b57c75e..38d3a12 100644
--- a/arch/arm/kernel/armksyms.c
+++ b/arch/arm/kernel/armksyms.c
@@ -48,6 +48,13 @@  extern void __aeabi_ulcmp(void);
 
 extern void fpundefinstr(void);
 
+#ifdef CONFIG_KVM_ARM_HOST
+/* This is needed for KVM */
+extern void __irq_svc(void);
+
+EXPORT_SYMBOL_GPL(__irq_svc);
+#endif
+
 	/* platform dependent support */
 EXPORT_SYMBOL(__udelay);
 EXPORT_SYMBOL(__const_udelay);
diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c
index 1429d89..9c76b53 100644
--- a/arch/arm/kernel/asm-offsets.c
+++ b/arch/arm/kernel/asm-offsets.c
@@ -13,6 +13,7 @@ 
 #include <linux/sched.h>
 #include <linux/mm.h>
 #include <linux/dma-mapping.h>
+#include <linux/kvm_host.h>
 #include <asm/cacheflush.h>
 #include <asm/glue-df.h>
 #include <asm/glue-pf.h>
@@ -144,5 +145,47 @@  int main(void)
   DEFINE(DMA_BIDIRECTIONAL,	DMA_BIDIRECTIONAL);
   DEFINE(DMA_TO_DEVICE,		DMA_TO_DEVICE);
   DEFINE(DMA_FROM_DEVICE,	DMA_FROM_DEVICE);
+#ifdef CONFIG_KVM_ARM_HOST
+  DEFINE(VCPU_KVM,		offsetof(struct kvm_vcpu, kvm));
+  DEFINE(VCPU_MIDR,		offsetof(struct kvm_vcpu, arch.cp15[c0_MIDR]));
+  DEFINE(VCPU_MPIDR,		offsetof(struct kvm_vcpu, arch.cp15[c0_MPIDR]));
+  DEFINE(VCPU_SCTLR,		offsetof(struct kvm_vcpu, arch.cp15[c1_SCTLR]));
+  DEFINE(VCPU_CPACR,		offsetof(struct kvm_vcpu, arch.cp15[c1_CPACR]));
+  DEFINE(VCPU_TTBR0,		offsetof(struct kvm_vcpu, arch.cp15[c2_TTBR0]));
+  DEFINE(VCPU_TTBR1,		offsetof(struct kvm_vcpu, arch.cp15[c2_TTBR1]));
+  DEFINE(VCPU_TTBCR,		offsetof(struct kvm_vcpu, arch.cp15[c2_TTBCR]));
+  DEFINE(VCPU_DACR,		offsetof(struct kvm_vcpu, arch.cp15[c3_DACR]));
+  DEFINE(VCPU_DFSR,		offsetof(struct kvm_vcpu, arch.cp15[c5_DFSR]));
+  DEFINE(VCPU_IFSR,		offsetof(struct kvm_vcpu, arch.cp15[c5_IFSR]));
+  DEFINE(VCPU_ADFSR,		offsetof(struct kvm_vcpu, arch.cp15[c5_ADFSR]));
+  DEFINE(VCPU_AIFSR,		offsetof(struct kvm_vcpu, arch.cp15[c5_AIFSR]));
+  DEFINE(VCPU_DFAR,		offsetof(struct kvm_vcpu, arch.cp15[c6_DFAR]));
+  DEFINE(VCPU_IFAR,		offsetof(struct kvm_vcpu, arch.cp15[c6_IFAR]));
+  DEFINE(VCPU_PRRR,		offsetof(struct kvm_vcpu, arch.cp15[c10_PRRR]));
+  DEFINE(VCPU_NMRR,		offsetof(struct kvm_vcpu, arch.cp15[c10_NMRR]));
+  DEFINE(VCPU_VBAR,		offsetof(struct kvm_vcpu, arch.cp15[c12_VBAR]));
+  DEFINE(VCPU_CID,		offsetof(struct kvm_vcpu, arch.cp15[c13_CID]));
+  DEFINE(VCPU_TID_URW,		offsetof(struct kvm_vcpu, arch.cp15[c13_TID_URW]));
+  DEFINE(VCPU_TID_URO,		offsetof(struct kvm_vcpu, arch.cp15[c13_TID_URO]));
+  DEFINE(VCPU_TID_PRIV,		offsetof(struct kvm_vcpu, arch.cp15[c13_TID_PRIV]));
+  DEFINE(VCPU_REGS,		offsetof(struct kvm_vcpu, arch.regs));
+  DEFINE(VCPU_USR_REGS,		offsetof(struct kvm_vcpu, arch.regs.usr_regs));
+  DEFINE(VCPU_SVC_REGS,		offsetof(struct kvm_vcpu, arch.regs.svc_regs));
+  DEFINE(VCPU_ABT_REGS,		offsetof(struct kvm_vcpu, arch.regs.abt_regs));
+  DEFINE(VCPU_UND_REGS,		offsetof(struct kvm_vcpu, arch.regs.und_regs));
+  DEFINE(VCPU_IRQ_REGS,		offsetof(struct kvm_vcpu, arch.regs.irq_regs));
+  DEFINE(VCPU_FIQ_REGS,		offsetof(struct kvm_vcpu, arch.regs.fiq_regs));
+  DEFINE(VCPU_PC,		offsetof(struct kvm_vcpu, arch.regs.pc));
+  DEFINE(VCPU_CPSR,		offsetof(struct kvm_vcpu, arch.regs.cpsr));
+  DEFINE(VCPU_IRQ_LINES,	offsetof(struct kvm_vcpu, arch.irq_lines));
+  DEFINE(VCPU_HSR,		offsetof(struct kvm_vcpu, arch.hsr));
+  DEFINE(VCPU_HDFAR,		offsetof(struct kvm_vcpu, arch.hdfar));
+  DEFINE(VCPU_HIFAR,		offsetof(struct kvm_vcpu, arch.hifar));
+  DEFINE(VCPU_HPFAR,		offsetof(struct kvm_vcpu, arch.hpfar));
+  DEFINE(VCPU_PC_IPA,		offsetof(struct kvm_vcpu, arch.pc_ipa));
+  DEFINE(VCPU_PC_IPA2,		offsetof(struct kvm_vcpu, arch.pc_ipa2));
+  DEFINE(VCPU_HYP_PC,		offsetof(struct kvm_vcpu, arch.hyp_pc));
+  DEFINE(KVM_VTTBR,		offsetof(struct kvm, arch.vttbr));
+#endif
   return 0; 
 }
diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
index 437f0c4..db029bb 100644
--- a/arch/arm/kernel/entry-armv.S
+++ b/arch/arm/kernel/entry-armv.S
@@ -209,6 +209,7 @@  __dabt_svc:
 ENDPROC(__dabt_svc)
 
 	.align	5
+	.globl __irq_svc
 __irq_svc:
 	svc_entry
 	irq_handler
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index 8b024ee..4687690 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -37,12 +37,19 @@ 
 #include <asm/mman.h>
 #include <asm/idmap.h>
 #include <asm/tlbflush.h>
+#include <asm/cputype.h>
 #include <asm/kvm_arm.h>
 #include <asm/kvm_asm.h>
 #include <asm/kvm_mmu.h>
+#include <asm/kvm_emulate.h>
 
 static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page);
 
+/* The VMID used in the VTTBR */
+static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1);
+static u8 kvm_next_vmid;
+DEFINE_SPINLOCK(kvm_vmid_lock);
+
 int kvm_arch_hardware_enable(void *garbage)
 {
 	return 0;
@@ -248,6 +255,12 @@  int __attribute_const__ kvm_target_cpu(void)
 
 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 {
+	int ret;
+
+	ret = kvm_reset_vcpu(vcpu);
+	if (ret < 0)
+		return ret;
+
 	return 0;
 }
 
@@ -290,12 +303,178 @@  int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
 
 int kvm_arch_vcpu_in_guest_mode(struct kvm_vcpu *v)
 {
+	return v->mode == IN_GUEST_MODE;
+}
+
+static void reset_vm_context(void *info)
+{
+	__kvm_flush_vm_context();
+}
+
+/**
+ * check_new_vmid_gen - check that the VMID is still valid
+ * @kvm: The VM's VMID to check
+ *
+ * return true if there is a new generation of VMIDs being used
+ *
+ * The hardware supports only 256 values with the value zero reserved for the
+ * host, so we check if an assigned value belongs to a previous generation,
+ * which requires us to assign a new value. If we're the first to use a
+ * VMID for the new generation, we must flush necessary caches and TLBs on all
+ * CPUs.
+ */
+static bool check_new_vmid_gen(struct kvm *kvm)
+{
+	return unlikely(kvm->arch.vmid_gen != atomic64_read(&kvm_vmid_gen));
+}
+
+/**
+ * update_vttbr - Update the VTTBR with a valid VMID before the guest runs
+ * @kvm	The guest that we are about to run
+ *
+ * Called from kvm_arch_vcpu_ioctl_run before entering the guest to ensure the
+ * VM has a valid VMID, otherwise assigns a new one and flushes corresponding
+ * caches and TLBs.
+ */
+static void update_vttbr(struct kvm *kvm)
+{
+	phys_addr_t pgd_phys;
+
+	if (!check_new_vmid_gen(kvm))
+		return;
+
+	spin_lock(&kvm_vmid_lock);
+
+	/* First user of a new VMID generation? */
+	if (unlikely(kvm_next_vmid == 0)) {
+		atomic64_inc(&kvm_vmid_gen);
+		kvm_next_vmid = 1;
+
+		/* This does nothing on UP */
+		smp_call_function(reset_vm_context, NULL, 1);
+
+		/*
+		 * On SMP we know no other CPUs can use this CPU's or
+		 * each other's VMID since the kvm_vmid_lock blocks
+		 * them from reentry to the guest.
+		 */
+
+		reset_vm_context(NULL);
+	}
+
+	kvm->arch.vmid_gen = atomic64_read(&kvm_vmid_gen);
+	kvm->arch.vmid = kvm_next_vmid;
+	kvm_next_vmid++;
+
+	/* update vttbr to be used with the new vmid */
+	pgd_phys = virt_to_phys(kvm->arch.pgd);
+	kvm->arch.vttbr = pgd_phys & ((1LLU << 40) - 1)
+			  & ~((2 << VTTBR_X) - 1);
+	kvm->arch.vttbr |= (u64)(kvm->arch.vmid) << 48;
+
+	spin_unlock(&kvm_vmid_lock);
+}
+
+/*
+ * Return 0 to return to guest, < 0 on error, exit_reason ( > 0) on proper
+ * exit to QEMU.
+ */
+static int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
+		       int exception_index)
+{
+	return -EINVAL;
+}
+
+/*
+ * Return 0 to proceed with guest entry
+ */
+static int vcpu_pre_guest_enter(struct kvm_vcpu *vcpu, int *exit_reason)
+{
+	if (signal_pending(current)) {
+		*exit_reason = KVM_EXIT_INTR;
+		return -EINTR;
+	}
+
+	if (check_new_vmid_gen(vcpu->kvm))
+		return 1;
+
+	BUG_ON(__vcpu_mode(*vcpu_cpsr(vcpu)) == 0xf);
+
 	return 0;
 }
 
+/**
+ * kvm_arch_vcpu_ioctl_run - the main VCPU run function to execute guest code
+ * @vcpu:	The VCPU pointer
+ * @run:	The kvm_run structure pointer used for userspace state exchange
+ *
+ * This function is called through the VCPU_RUN ioctl called from user space. It
+ * will execute VM code in a loop until the time slice for the process is used
+ * or some emulation is needed from user space in which case the function will
+ * return with return value 0 and with the kvm_run structure filled in with the
+ * required data for the requested emulation.
+ */
 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 {
-	return -EINVAL;
+	int ret = 0;
+	int exit_reason;
+	sigset_t sigsaved;
+
+	if (vcpu->sigset_active)
+		sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
+
+	exit_reason = KVM_EXIT_UNKNOWN;
+	while (exit_reason == KVM_EXIT_UNKNOWN) {
+		/*
+		 * Check conditions before entering the guest
+		 */
+		cond_resched();
+
+		update_vttbr(vcpu->kvm);
+
+		local_irq_disable();
+
+		/* Re-check atomic conditions */
+		ret = vcpu_pre_guest_enter(vcpu, &exit_reason);
+		if (ret != 0) {
+			local_irq_enable();
+			preempt_enable();
+			continue;
+		}
+
+		/**************************************************************
+		 * Enter the guest
+		 */
+		trace_kvm_entry(vcpu->arch.regs.pc);
+		kvm_guest_enter();
+		vcpu->mode = IN_GUEST_MODE;
+
+		ret = __kvm_vcpu_run(vcpu);
+
+		vcpu->mode = OUTSIDE_GUEST_MODE;
+		vcpu->stat.exits++;
+		kvm_guest_exit();
+		trace_kvm_exit(vcpu->arch.regs.pc);
+		local_irq_enable();
+
+		/*
+		 * Back from guest
+		 *************************************************************/
+
+		ret = handle_exit(vcpu, run, ret);
+		if (ret < 0) {
+			kvm_err("Error in handle_exit\n");
+			break;
+		} else {
+			exit_reason = ret; /* 0 == KVM_EXIT_UNKNOWN */
+		}
+	}
+
+	if (vcpu->sigset_active)
+		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+
+	run->exit_reason = exit_reason;
+	return ret;
 }
 
 int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level)
diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S
index b4e1992..a0e370b 100644
--- a/arch/arm/kvm/interrupts.S
+++ b/arch/arm/kvm/interrupts.S
@@ -24,6 +24,11 @@ 
 #include <asm/kvm_asm.h>
 #include <asm/kvm_arm.h>
 
+#define VCPU_USR_REG(_reg_nr)	(VCPU_USR_REGS + (_reg_nr * 4))
+#define VCPU_USR_SP		(VCPU_USR_REG(13))
+#define VCPU_FIQ_REG(_reg_nr)	(VCPU_FIQ_REGS + (_reg_nr * 4))
+#define VCPU_FIQ_SPSR		(VCPU_FIQ_REG(7))
+
 	.text
 	.align	PAGE_SHIFT
 
@@ -35,24 +40,614 @@  __kvm_hyp_code_start:
 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
 
 ENTRY(__kvm_flush_vm_context)
+	hvc	#0			@ switch to hyp-mode
+
+	mov	r0, #0			@ rn parameter for c15 flushes is SBZ
+	mcr     p15, 4, r0, c8, c7, 4   @ Invalidate Non-secure Non-Hyp TLB
+	mcr     p15, 0, r0, c7, c5, 0   @ Invalidate instruction caches
+	dsb
+	isb
+
+	hvc	#0			@ switch back to svc-mode, see hyp_svc
 	bx	lr
+ENDPROC(__kvm_flush_vm_context)
 
 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
 @  Hypervisor world-switch code
 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
 
+/* These are simply for the macros to work - values don't have meaning */
+.equ usr, 0
+.equ svc, 1
+.equ abt, 2
+.equ und, 3
+.equ irq, 4
+.equ fiq, 5
+
+.macro store_mode_state base_reg, mode
+	.if \mode == usr
+	mrs	r2, SP_usr
+	mov	r3, lr
+	stmdb	\base_reg!, {r2, r3}
+	.elseif \mode != fiq
+	mrs	r2, SP_\mode
+	mrs	r3, LR_\mode
+	mrs	r4, SPSR_\mode
+	stmdb	\base_reg!, {r2, r3, r4}
+	.else
+	mrs	r2, r8_fiq
+	mrs	r3, r9_fiq
+	mrs	r4, r10_fiq
+	mrs	r5, r11_fiq
+	mrs	r6, r12_fiq
+	mrs	r7, SP_fiq
+	mrs	r8, LR_fiq
+	mrs	r9, SPSR_fiq
+	stmdb	\base_reg!, {r2-r9}
+	.endif
+.endm
+
+.macro load_mode_state base_reg, mode
+	.if \mode == usr
+	ldmia	\base_reg!, {r2, r3}
+	msr	SP_usr, r2
+	mov	lr, r3
+	.elseif \mode != fiq
+	ldmia	\base_reg!, {r2, r3, r4}
+	msr	SP_\mode, r2
+	msr	LR_\mode, r3
+	msr	SPSR_\mode, r4
+	.else
+	ldmia	\base_reg!, {r2-r9}
+	msr	r8_fiq, r2
+	msr	r9_fiq, r3
+	msr	r10_fiq, r4
+	msr	r11_fiq, r5
+	msr	r12_fiq, r6
+	msr	SP_fiq, r7
+	msr	LR_fiq, r8
+	msr	SPSR_fiq, r9
+	.endif
+.endm
+
+/* Reads cp15 registers from hardware and stores them in memory
+ * @vcpu:   If 0, registers are written in-order to the stack,
+ * 	    otherwise to the VCPU struct pointed to by vcpup
+ * @vcpup:  Register pointing to VCPU struct
+ */
+.macro read_cp15_state vcpu=0, vcpup
+	mrc	p15, 0, r2, c1, c0, 0	@ SCTLR
+	mrc	p15, 0, r3, c1, c0, 2	@ CPACR
+	mrc	p15, 0, r4, c2, c0, 2	@ TTBCR
+	mrc	p15, 0, r5, c3, c0, 0	@ DACR
+	mrrc	p15, 0, r6, r7, c2	@ TTBR 0
+	mrrc	p15, 1, r8, r9, c2	@ TTBR 1
+	mrc	p15, 0, r10, c10, c2, 0	@ PRRR
+	mrc	p15, 0, r11, c10, c2, 1	@ NMRR
+
+	.if \vcpu == 0
+	push	{r2-r11}		@ Push CP15 registers
+	.else
+	str	r2, [\vcpup, #VCPU_SCTLR]
+	str	r3, [\vcpup, #VCPU_CPACR]
+	str	r4, [\vcpup, #VCPU_TTBCR]
+	str	r5, [\vcpup, #VCPU_DACR]
+	add	\vcpup, \vcpup, #VCPU_TTBR0
+	strd	r6, r7, [\vcpup]
+	add	\vcpup, \vcpup, #(VCPU_TTBR1 - VCPU_TTBR0)
+	strd	r8, r9, [\vcpup]
+	sub	\vcpup, \vcpup, #(VCPU_TTBR1)
+	str	r10, [\vcpup, #VCPU_PRRR]
+	str	r11, [\vcpup, #VCPU_NMRR]
+	.endif
+
+	mrc	p15, 0, r2, c13, c0, 1	@ CID
+	mrc	p15, 0, r3, c13, c0, 2	@ TID_URW
+	mrc	p15, 0, r4, c13, c0, 3	@ TID_URO
+	mrc	p15, 0, r5, c13, c0, 4	@ TID_PRIV
+	mrc	p15, 0, r6, c5, c0, 0	@ DFSR
+	mrc	p15, 0, r7, c5, c0, 1	@ IFSR
+	mrc	p15, 0, r8, c5, c1, 0	@ ADFSR
+	mrc	p15, 0, r9, c5, c1, 1	@ AIFSR
+	mrc	p15, 0, r10, c6, c0, 0	@ DFAR
+	mrc	p15, 0, r11, c6, c0, 2	@ IFAR
+	mrc	p15, 0, r12, c12, c0, 0	@ VBAR
+
+	.if \vcpu == 0
+	push	{r2-r12}		@ Push CP15 registers
+	.else
+	str	r2, [\vcpup, #VCPU_CID]
+	str	r3, [\vcpup, #VCPU_TID_URW]
+	str	r4, [\vcpup, #VCPU_TID_URO]
+	str	r5, [\vcpup, #VCPU_TID_PRIV]
+	str	r6, [\vcpup, #VCPU_DFSR]
+	str	r7, [\vcpup, #VCPU_IFSR]
+	str	r8, [\vcpup, #VCPU_ADFSR]
+	str	r9, [\vcpup, #VCPU_AIFSR]
+	str	r10, [\vcpup, #VCPU_DFAR]
+	str	r11, [\vcpup, #VCPU_IFAR]
+	str	r12, [\vcpup, #VCPU_VBAR]
+	.endif
+.endm
+
+/* Reads cp15 registers from memory and writes them to hardware
+ * @vcpu:   If 0, registers are read in-order from the stack,
+ * 	    otherwise from the VCPU struct pointed to by vcpup
+ * @vcpup:  Register pointing to VCPU struct
+ */
+.macro write_cp15_state vcpu=0, vcpup
+	.if \vcpu == 0
+	pop	{r2-r12}
+	.else
+	ldr	r2, [\vcpup, #VCPU_CID]
+	ldr	r3, [\vcpup, #VCPU_TID_URW]
+	ldr	r4, [\vcpup, #VCPU_TID_URO]
+	ldr	r5, [\vcpup, #VCPU_TID_PRIV]
+	ldr	r6, [\vcpup, #VCPU_DFSR]
+	ldr	r7, [\vcpup, #VCPU_IFSR]
+	ldr	r8, [\vcpup, #VCPU_ADFSR]
+	ldr	r9, [\vcpup, #VCPU_AIFSR]
+	ldr	r10, [\vcpup, #VCPU_DFAR]
+	ldr	r11, [\vcpup, #VCPU_IFAR]
+	ldr	r12, [\vcpup, #VCPU_VBAR]
+	.endif
+
+	mcr	p15, 0, r2, c13, c0, 1	@ CID
+	mcr	p15, 0, r3, c13, c0, 2	@ TID_URW
+	mcr	p15, 0, r4, c13, c0, 3	@ TID_URO
+	mcr	p15, 0, r5, c13, c0, 4	@ TID_PRIV
+	mcr	p15, 0, r6, c5, c0, 0	@ DFSR
+	mcr	p15, 0, r7, c5, c0, 1	@ IFSR
+	mcr	p15, 0, r8, c5, c1, 0	@ ADFSR
+	mcr	p15, 0, r9, c5, c1, 1	@ AIFSR
+	mcr	p15, 0, r10, c6, c0, 0	@ DFAR
+	mcr	p15, 0, r11, c6, c0, 2	@ IFAR
+	mcr	p15, 0, r12, c12, c0, 0	@ VBAR
+
+	.if \vcpu == 0
+	pop	{r2-r11}
+	.else
+	ldr	r2, [\vcpup, #VCPU_SCTLR]
+	ldr	r3, [\vcpup, #VCPU_CPACR]
+	ldr	r4, [\vcpup, #VCPU_TTBCR]
+	ldr	r5, [\vcpup, #VCPU_DACR]
+	add	\vcpup, \vcpup, #VCPU_TTBR0
+	ldrd	r6, r7, [\vcpup]
+	add	\vcpup, \vcpup, #(VCPU_TTBR1 - VCPU_TTBR0)
+	ldrd	r8, r9, [\vcpup]
+	sub	\vcpup, \vcpup, #(VCPU_TTBR1)
+	ldr	r10, [\vcpup, #VCPU_PRRR]
+	ldr	r11, [\vcpup, #VCPU_NMRR]
+	.endif
+
+	mcr	p15, 0, r2, c1, c0, 0	@ SCTLR
+	mcr	p15, 0, r3, c1, c0, 2	@ CPACR
+	mcr	p15, 0, r4, c2, c0, 2	@ TTBCR
+	mcr	p15, 0, r5, c3, c0, 0	@ DACR
+	mcrr	p15, 0, r6, r7, c2	@ TTBR 0
+	mcrr	p15, 1, r8, r9, c2	@ TTBR 1
+	mcr	p15, 0, r10, c10, c2, 0	@ PRRR
+	mcr	p15, 0, r11, c10, c2, 1	@ NMRR
+.endm
+
+/* Configures the HSTR (Hyp System Trap Register) on entry/return
+ * (hardware reset value is 0) */
+.macro set_hstr entry
+	mrc	p15, 4, r2, c1, c1, 3
+	ldr	r3, =(HSTR_T(9) | HSTR_T(10) | HSTR_T(11) | HSTR_T(15))
+	.if \entry == 1
+	orr	r2, r2, r3		@ Trap CR{9,10,11,15}
+	.else
+	bic	r2, r2, r3		@ Don't trap any CRx accesses
+	.endif
+	mcr	p15, 4, r2, c1, c1, 3
+.endm
+
+/* Configures the HCPTR (Hyp Coprocessor Trap Register) on entry/return
+ * (hardware reset value is 0) */
+.macro set_hcptr entry
+	mrc	p15, 4, r2, c1, c1, 2
+	ldr	r3, =(HCPTR_TTA)
+	.if \entry == 1
+	orr	r2, r2, r3		@ Trap some coproc-accesses
+	.else
+	bic	r2, r2, r3		@ Don't trap any coproc accesses
+	.endif
+	mcr	p15, 4, r2, c1, c1, 2
+.endm
+
+/* Enable/Disable: stage-2 trans., trap interrupts, trap wfi, trap smc */
+.macro configure_hyp_role entry, vcpu_ptr
+	mrc	p15, 4, r2, c1, c1, 0	@ HCR
+	bic	r2, r2, #HCR_VIRT_EXCP_MASK
+	ldr	r3, =HCR_GUEST_MASK
+	.if \entry == 1
+	orr	r2, r2, r3
+	ldr	r3, [\vcpu_ptr, #VCPU_IRQ_LINES]
+	orr	r2, r2, r3
+	.else
+	bic	r2, r2, r3
+	.endif
+	mcr	p15, 4, r2, c1, c1, 0
+.endm
+
+@ Arguments:
+@  r0: pointer to vcpu struct
 ENTRY(__kvm_vcpu_run)
-	bx	lr
+	hvc	#0			@ switch to hyp-mode
+
+	@ Now we're in Hyp-mode and lr_usr, spsr_hyp are on the stack
+	mrs	r2, sp_usr
+	push	{r2}			@ Push r13_usr
+	push	{r4-r12}		@ Push r4-r12
+
+	store_mode_state sp, svc
+	store_mode_state sp, abt
+	store_mode_state sp, und
+	store_mode_state sp, irq
+	store_mode_state sp, fiq
+
+	@ Store hardware CP15 state and load guest state
+	read_cp15_state
+	write_cp15_state 1, r0
+
+	push	{r0}			@ Push the VCPU pointer
+
+	@ Configure Hyp-role
+	configure_hyp_role 1, r0
+
+	@ Trap coprocessor CRx accesses
+	set_hstr 1
+	set_hcptr 1
+
+	@ Write configured ID register into MIDR alias
+	ldr	r1, [r0, #VCPU_MIDR]
+	mcr	p15, 4, r1, c0, c0, 0
+
+	@ Write guest view of MPIDR into VMPIDR
+	ldr	r1, [r0, #VCPU_MPIDR]
+	mcr	p15, 4, r1, c0, c0, 5
+
+	@ Load guest registers
+	add	r0, r0, #(VCPU_USR_SP)
+	load_mode_state r0, usr
+	load_mode_state r0, svc
+	load_mode_state r0, abt
+	load_mode_state r0, und
+	load_mode_state r0, irq
+	load_mode_state r0, fiq
+
+	@ Load return state (r0 now points to vcpu->arch.regs.pc)
+	ldmia	r0, {r2, r3}
+	msr	ELR_hyp, r2
+	msr	SPSR_cxsf, r3
+
+	@ Set up guest memory translation
+	sub	r1, r0, #(VCPU_PC - VCPU_KVM)	@ r1 points to kvm struct
+	ldr	r1, [r1]
+	add	r1, r1, #KVM_VTTBR
+	ldrd	r2, r3, [r1]
+	mcrr	p15, 6, r2, r3, c2	@ Write VTTBR
+
+	@ Load remaining registers and do the switch
+	sub	r0, r0, #(VCPU_PC - VCPU_USR_REGS)
+	ldmia	r0, {r0-r12}
+	eret
+
+__kvm_vcpu_return:
+	@ Set VMID == 0
+	mov	r2, #0
+	mov	r3, #0
+	mcrr	p15, 6, r2, r3, c2	@ Write VTTBR
+
+	@ Store return state
+	mrs	r2, ELR_hyp
+	mrs	r3, spsr
+	str	r2, [r1, #VCPU_PC]
+	str	r3, [r1, #VCPU_CPSR]
+
+	@ Store guest registers
+	add	r1, r1, #(VCPU_FIQ_SPSR + 4)
+	store_mode_state r1, fiq
+	store_mode_state r1, irq
+	store_mode_state r1, und
+	store_mode_state r1, abt
+	store_mode_state r1, svc
+	store_mode_state r1, usr
+	sub	r1, r1, #(VCPU_USR_REG(13))
+
+	@ Don't trap coprocessor accesses for host kernel
+	set_hstr 0
+	set_hcptr 0
+
+	@ Reset Hyp-role
+	configure_hyp_role 0, r1
+
+	@ Let host read hardware MIDR
+	mrc	p15, 0, r2, c0, c0, 0
+	mcr	p15, 4, r2, c0, c0, 0
+
+	@ Back to hardware MPIDR
+	mrc	p15, 0, r2, c0, c0, 5
+	mcr	p15, 4, r2, c0, c0, 5
+
+	@ Store guest CP15 state and restore host state
+	read_cp15_state 1, r1
+	write_cp15_state
+
+	load_mode_state sp, fiq
+	load_mode_state sp, irq
+	load_mode_state sp, und
+	load_mode_state sp, abt
+	load_mode_state sp, svc
+
+	pop	{r4-r12}		@ Pop r4-r12
+	pop	{r2}			@ Pop r13_usr
+	msr	sp_usr, r2
 
+	ldr	r2, =(~PAGE_MASK)	@ Get svc-cpsr in case we need it for
+	mov	r1, sp			@ the __irq_svc call
+	tst	r1, r2
+	subeq	r2, r1, #0x1000
+	bicne	r2, r1, r2
+	ldr	r2, [r2, #4]		@ r2 = svc_cpsr
+
+	hvc	#0			@ switch back to svc-mode, see hyp_svc
+
+	cmp	r0, #ARM_EXCEPTION_IRQ
+	bxne	lr			@ return to IOCTL
+
+	/*
+	 * It's time to launch the kernel IRQ handler for IRQ exceptions. This
+	 * requires some manipulation though.
+	 *
+	 *  - The easiest entry point to the host handler is __irq_svc.
+	 *  - The __irq_svc expects to be called from SVC mode, which has been
+	 *    switched to from vector_stub code in entry-armv.S. The __irq_svc
+	 *    calls svc_entry which uses values stored in memory and pointed to
+	 *    by r0 to return from handler. We allocate this memory on the
+	 *    stack, which will contain these values:
+	 *      0x8:   cpsr
+	 *      0x4:   return_address
+	 *      0x0:   r0
+	 */
+	adr	r1, irq_kernel_resume	@ Where to resume
+	push	{r0 - r2}
+	mov	r0, sp
+	b	__irq_svc
+
+irq_kernel_resume:
+	pop	{r0}
+	add	sp, sp, #8
+	bx	lr			@ return to IOCTL
 
 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
 @  Hypervisor exception vector and handlers
 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
 
+/*
+ * The KVM/ARM Hypervisor ABI is defined as follows:
+ *
+ * Entry to Hyp mode from the host kernel will happen _only_ when an HVC
+ * instruction is issued since all traps are disabled when running the host
+ * kernel as per the Hyp-mode initialization at boot time.
+ *
+ * HVC instructions cause a trap to the vector page + offset 0x18 (see hyp_hvc
+ * below) when the HVC instruction is called from SVC mode (i.e. a guest or the
+ * host kernel) and they cause a trap to the vector page + offset 0xc when HVC
+ * instructions are called from within Hyp-mode.
+ *
+ * Hyp-ABI: Switching from host kernel to Hyp-mode:
+ *    Switching to Hyp mode is done through a simple HVC instruction. The
+ *    exception vector code will check that the HVC comes from VMID==0 and if
+ *    so will store the necessary state on the Hyp stack, which will look like
+ *    this (growing downwards, see the hyp_hvc handler):
+ *      ...
+ *      stack_page + 4: spsr (Host-SVC cpsr)
+ *      stack_page    : lr_usr
+ *      --------------: stack bottom
+ *
+ * Hyp-ABI: Switching from Hyp-mode to host kernel SVC mode:
+ *    When returning from Hyp mode to SVC mode, another HVC instruction is
+ *    executed from Hyp mode, which is taken in the hyp_svc handler. The
+ *    bottom of the Hyp stack is derived from the Hyp stack pointer (only a single
+ *    page aligned stack is used per CPU) and the initial SVC registers are
+ *    used to restore the host state.
+ *
+ *
+ * Note that the above is used to execute code in Hyp-mode from a host-kernel
+ * point of view, and is a different concept from performing a world-switch and
+ * executing guest code SVC mode (with a VMID != 0).
+ */
+
+@ Handle undef, svc, pabt, or dabt by crashing with a user notice
+.macro bad_exception exception_code, panic_str
+	mrrc	p15, 6, r2, r3, c2	@ Read VTTBR
+	lsr	r3, r3, #16
+	ands	r3, r3, #0xff
+
+	@ COND:neq means we're probably in the guest and we can try fetching
+	@ the vcpu pointer and stuff off the stack and keep our fingers crossed
+	beq	99f
+	mov	r0, #\exception_code
+	pop	{r1}			@ Load VCPU pointer
+	.if \exception_code == ARM_EXCEPTION_DATA_ABORT
+	mrc	p15, 4, r2, c5, c2, 0	@ HSR
+	mrc	p15, 4, r3, c6, c0, 0	@ HDFAR
+	str	r2, [r1, #VCPU_HSR]
+	str	r3, [r1, #VCPU_HDFAR]
+	.endif
+	.if \exception_code == ARM_EXCEPTION_PREF_ABORT
+	mrc	p15, 4, r2, c5, c2, 0	@ HSR
+	mrc	p15, 4, r3, c6, c0, 2	@ HIFAR
+	str	r2, [r1, #VCPU_HSR]
+	str	r3, [r1, #VCPU_HIFAR]
+	.endif
+	mrs	r2, ELR_hyp
+	str	r2, [r1, #VCPU_HYP_PC]
+	b	__kvm_vcpu_return
+
+	@ We were in the host already
+99:	hvc	#0	@ switch to SVC mode
+	ldr	r0, \panic_str
+	mrs	r1, ELR_hyp
+	b	panic
+
+.endm
+
+	.text
+
 	.align 5
 __kvm_hyp_vector:
 	.globl __kvm_hyp_vector
-	nop
+
+	@ Hyp-mode exception vector
+	W(b)	hyp_reset
+	W(b)	hyp_undef
+	W(b)	hyp_svc
+	W(b)	hyp_pabt
+	W(b)	hyp_dabt
+	W(b)	hyp_hvc
+	W(b)	hyp_irq
+	W(b)	hyp_fiq
+
+	.align
+hyp_reset:
+	b	hyp_reset
+
+	.align
+hyp_undef:
+	bad_exception ARM_EXCEPTION_UNDEFINED, und_die_str
+
+	.align
+hyp_svc:
+	@ Can only get here if HVC or SVC is called from Hyp mode, which means
+	@ we want to change mode back to SVC mode.
+	push	{r12}
+	mov	r12, sp
+	bic	r12, r12, #0x0ff
+	bic	r12, r12, #0xf00
+	ldr	lr, [r12, #4]
+	msr	SPSR_csxf, lr
+	ldr	lr, [r12]
+	pop	{r12}
+	eret
+
+	.align
+hyp_pabt:
+	bad_exception ARM_EXCEPTION_PREF_ABORT, pabt_die_str
+
+	.align
+hyp_dabt:
+	bad_exception ARM_EXCEPTION_DATA_ABORT, dabt_die_str
+
+	.align
+hyp_hvc:
+	@ Getting here is either because of a trap from a guest or from calling
+	@ HVC from the host kernel, which means "switch to Hyp mode".
+	push	{r0, r1, r2}
+
+	@ Check syndrome register
+	mrc	p15, 4, r0, c5, c2, 0	@ HSR
+	lsr	r1, r0, #HSR_EC_SHIFT
+	cmp	r1, #HSR_EC_HVC
+	bne	guest_trap		@ Not HVC instr.
+
+	@ Let's check if the HVC came from VMID 0 and allow simple
+	@ switch to Hyp mode
+	mrrc    p15, 6, r1, r2, c2
+	lsr     r2, r2, #16
+	and     r2, r2, #0xff
+	cmp     r2, #0
+	bne	guest_trap		@ Guest called HVC
+
+	@ Store lr_usr,spsr (svc cpsr) on bottom of stack
+	mov	r1, sp
+	bic	r1, r1, #0x0ff
+	bic	r1, r1, #0xf00
+	str	lr, [r1]
+	mrs	lr, spsr
+	str	lr, [r1, #4]
+
+	pop	{r0, r1, r2}
+
+	@ Return to caller in Hyp mode
+	mrs	lr, ELR_hyp
+	mov	pc, lr
+
+guest_trap:
+	ldr	r1, [sp, #12]		@ Load VCPU pointer
+	str	r0, [r1, #VCPU_HSR]
+	add	r1, r1, #VCPU_USR_REG(3)
+	stmia	r1, {r3-r12}
+	sub	r1, r1, #(VCPU_USR_REG(3) - VCPU_USR_REG(0))
+	pop	{r3, r4, r5}
+	add	sp, sp, #4		@ We loaded the VCPU pointer above
+	stmia	r1, {r3, r4, r5}
+	sub	r1, r1, #VCPU_USR_REG(0)
+
+	@ Check if we need the fault information
+	lsr	r2, r0, #HSR_EC_SHIFT
+	cmp	r2, #HSR_EC_IABT
+	beq	2f
+	cmpne	r2, #HSR_EC_DABT
+	bne	1f
+
+	@ For non-valid data aborts, get the offending instr. PA
+	lsr	r2, r0, #HSR_ISV_SHIFT
+	ands	r2, r2, #1
+	bne	2f
+	mrs	r3, ELR_hyp
+	mcr	p15, 0, r3, c7, c8, 0	@ VA to PA, ATS1CPR
+	mrrc	p15, 0, r4, r5, c7	@ PAR
+	add	r6, r1, #VCPU_PC_IPA
+	strd	r4, r5, [r6]
+
+	@ Check if we might have a wide thumb instruction spill-over
+	ldr	r5, =0xfff
+	bic	r4, r3, r5		@ clear page mask
+	sub	r5, r5, #1		@ last 2-byte page boundary, 0xffe
+	cmp	r4, r5
+	bne	2f
+	add	r4, r3, #2		@ _really_ unlikely!
+	mcr	p15, 0, r4, c7, c8, 0	@ VA to PA, ATS1CPR
+	mrrc	p15, 0, r4, r5, c7	@ PAR
+	add	r6, r1, #VCPU_PC_IPA2
+	strd	r4, r5, [r6]
+
+2:	mrc	p15, 4, r2, c6, c0, 0	@ HDFAR
+	mrc	p15, 4, r3, c6, c0, 2	@ HIFAR
+	mrc	p15, 4, r4, c6, c0, 4	@ HPFAR
+	add	r5, r1, #VCPU_HDFAR
+	stmia	r5, {r2, r3, r4}
+
+1:	mov	r0, #ARM_EXCEPTION_HVC
+	b	__kvm_vcpu_return
+
+	.align
+hyp_irq:
+	push	{r0}
+	ldr	r0, [sp, #4]		@ Load VCPU pointer
+	add	r0, r0, #(VCPU_USR_REG(1))
+	stmia	r0, {r1-r12}
+	pop	{r0, r1}		@ r1 == vcpu pointer
+	str	r0, [r1, #VCPU_USR_REG(0)]
+
+	mov	r0, #ARM_EXCEPTION_IRQ
+	b	__kvm_vcpu_return
+
+	.align
+hyp_fiq:
+	b	hyp_fiq
+
+	.ltorg
+
+und_die_str:
+	.ascii	"unexpected undefined exception in Hyp mode at: %#08x"
+pabt_die_str:
+	.ascii	"unexpected prefetch abort in Hyp mode at: %#08x"
+dabt_die_str:
+	.ascii	"unexpected data abort in Hyp mode at: %#08x"
 
 /*
  * The below lines makes sure the HYP mode code fits in a single page (the