diff mbox

[v3,1/8] ARM: KVM: Initial skeleton to compile KVM support

Message ID 20110603150318.17011.82777.stgit@ubuntu (mailing list archive)
State New, archived
Headers show

Commit Message

Christoffer Dall June 3, 2011, 3:03 p.m. UTC
Targets KVM support for Cortex A-15 processors.

Contains no real functionality but all the framework components,
make files, header files and some tracing functionality.
---
 arch/arm/Kconfig                   |    2 
 arch/arm/Makefile                  |    1 
 arch/arm/include/asm/kvm.h         |   65 +++++
 arch/arm/include/asm/kvm_asm.h     |   28 ++
 arch/arm/include/asm/kvm_emulate.h |   89 +++++++
 arch/arm/include/asm/kvm_host.h    |  114 +++++++++
 arch/arm/include/asm/kvm_para.h    |    9 +
 arch/arm/include/asm/unified.h     |   12 +
 arch/arm/kvm/Kconfig               |   44 ++++
 arch/arm/kvm/Makefile              |   13 +
 arch/arm/kvm/arm.c                 |  363 ++++++++++++++++++++++++++++++
 arch/arm/kvm/arm_emulate.c         |   70 ++++++
 arch/arm/kvm/arm_guest.c           |  142 ++++++++++++
 arch/arm/kvm/arm_interrupts.S      |   17 +
 arch/arm/kvm/arm_mmu.c             |    0 
 arch/arm/kvm/trace.c               |  436 ++++++++++++++++++++++++++++++++++++
 arch/arm/kvm/trace.h               |  108 +++++++++
 arch/arm/mach-vexpress/Kconfig     |    1 
 arch/arm/mm/Kconfig                |    7 +
 include/linux/kvm.h                |    1 
 20 files changed, 1522 insertions(+), 0 deletions(-)
 create mode 100644 arch/arm/include/asm/kvm.h
 create mode 100644 arch/arm/include/asm/kvm_asm.h
 create mode 100644 arch/arm/include/asm/kvm_emulate.h
 create mode 100644 arch/arm/include/asm/kvm_host.h
 create mode 100644 arch/arm/include/asm/kvm_para.h
 create mode 100644 arch/arm/kvm/Kconfig
 create mode 100644 arch/arm/kvm/Makefile
 create mode 100644 arch/arm/kvm/arm.c
 create mode 100644 arch/arm/kvm/arm_emulate.c
 create mode 100644 arch/arm/kvm/arm_guest.c
 create mode 100644 arch/arm/kvm/arm_interrupts.S
 create mode 100644 arch/arm/kvm/arm_mmu.c
 create mode 100644 arch/arm/kvm/trace.c
 create mode 100644 arch/arm/kvm/trace.h


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Jan Kiszka June 3, 2011, 3:31 p.m. UTC | #1
On 2011-06-03 17:03, Christoffer Dall wrote:
> Targets KVM support for Cortex A-15 processors.
> 
> Contains no real functionality but all the framework components,
> make files, header files and some tracing functionality.
> ---

...

> diff --git a/include/linux/kvm.h b/include/linux/kvm.h
> index ea2dc1a..d2ab07e 100644
> --- a/include/linux/kvm.h
> +++ b/include/linux/kvm.h
> @@ -310,6 +310,7 @@ struct kvm_translation {
>  struct kvm_interrupt {
>  	/* in */
>  	__u32 irq;
> +	__u8  raise;
>  };

This touches an existing ABI and corrupts the definition of
KVM_INTERRUPT IOCTL. The might exist jurisdictions considering this a
capital crime. :)

You rather have to define a new CPU IRQ injection interface that
supports both raising and lowering and declare its availability via a
KVM_CAP. Don't forget to make it extensible (flags field) so that future
requirements can be added without breaking existing users.

Jan
Jan Kiszka June 3, 2011, 3:53 p.m. UTC | #2
On 2011-06-03 17:31, Jan Kiszka wrote:
> On 2011-06-03 17:03, Christoffer Dall wrote:
>> Targets KVM support for Cortex A-15 processors.
>>
>> Contains no real functionality but all the framework components,
>> make files, header files and some tracing functionality.
>> ---
> 
> ...
> 
>> diff --git a/include/linux/kvm.h b/include/linux/kvm.h
>> index ea2dc1a..d2ab07e 100644
>> --- a/include/linux/kvm.h
>> +++ b/include/linux/kvm.h
>> @@ -310,6 +310,7 @@ struct kvm_translation {
>>  struct kvm_interrupt {
>>  	/* in */
>>  	__u32 irq;
>> +	__u8  raise;
>>  };
> 
> This touches an existing ABI and corrupts the definition of
> KVM_INTERRUPT IOCTL. The might exist jurisdictions considering this a
> capital crime. :)
> 
> You rather have to define a new CPU IRQ injection interface that
> supports both raising and lowering and declare its availability via a
> KVM_CAP. Don't forget to make it extensible (flags field) so that future
> requirements can be added without breaking existing users.

Or much easier (this is what PowerPC is doing): Define irq values in a
way that they include a raise/lower flag.

Jan
Christoffer Dall June 3, 2011, 4:19 p.m. UTC | #3
thanks, I will fix this for next version.

On Fri, Jun 3, 2011 at 5:53 PM, Jan Kiszka <jan.kiszka@siemens.com> wrote:
> On 2011-06-03 17:31, Jan Kiszka wrote:
>> On 2011-06-03 17:03, Christoffer Dall wrote:
>>> Targets KVM support for Cortex A-15 processors.
>>>
>>> Contains no real functionality but all the framework components,
>>> make files, header files and some tracing functionality.
>>> ---
>>
>> ...
>>
>>> diff --git a/include/linux/kvm.h b/include/linux/kvm.h
>>> index ea2dc1a..d2ab07e 100644
>>> --- a/include/linux/kvm.h
>>> +++ b/include/linux/kvm.h
>>> @@ -310,6 +310,7 @@ struct kvm_translation {
>>>  struct kvm_interrupt {
>>>      /* in */
>>>      __u32 irq;
>>> +    __u8  raise;
>>>  };
>>
>> This touches an existing ABI and corrupts the definition of
>> KVM_INTERRUPT IOCTL. The might exist jurisdictions considering this a
>> capital crime. :)
>>
>> You rather have to define a new CPU IRQ injection interface that
>> supports both raising and lowering and declare its availability via a
>> KVM_CAP. Don't forget to make it extensible (flags field) so that future
>> requirements can be added without breaking existing users.
>
> Or much easier (this is what PowerPC is doing): Define irq values in a
> way that they include a raise/lower flag.
>
> Jan
>
> --
> Siemens AG, Corporate Technology, CT T DE IT 1
> Corporate Competence Center Embedded Linux
>
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander Graf June 3, 2011, 4:31 p.m. UTC | #4
On 03.06.2011, at 18:19, Christoffer Dall wrote:

> thanks, I will fix this for next version.

Please be prepared that we might want to model the API as well, so I'd propose you send out an RFC that patches Documentation/kvm with the respective API documentation so we can have some constructive influence :).

Currently, on book3s at least, PPC simply uses the irq field to indicate raise/lower since there's only a single "external interrupt" line that the CPU sees. All other dispatching happens in user space. I don't know if you could use the same model or of you need specific table offsets for different external interrupt vectors.


Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity June 5, 2011, 12:21 p.m. UTC | #5
On 06/03/2011 06:53 PM, Jan Kiszka wrote:
> >>  @@ -310,6 +310,7 @@ struct kvm_translation {
> >>   struct kvm_interrupt {
> >>   	/* in */
> >>   	__u32 irq;
> >>  +	__u8  raise;
> >>   };
> >
> >  This touches an existing ABI and corrupts the definition of
> >  KVM_INTERRUPT IOCTL. The might exist jurisdictions considering this a
> >  capital crime. :)
> >
> >  You rather have to define a new CPU IRQ injection interface that
> >  supports both raising and lowering

This is KVM_IRQ_LINE:



> and declare its availability via a
> >  KVM_CAP. Don't forget to make it extensible (flags field) so that future
> >  requirements can be added without breaking existing users.
>
> Or much easier (this is what PowerPC is doing): Define irq values in a
> way that they include a raise/lower flag.

Much easier and much horribler.
Avi Kivity June 5, 2011, 12:36 p.m. UTC | #6
On 06/03/2011 06:03 PM, Christoffer Dall wrote:
> Targets KVM support for Cortex A-15 processors.
>
> Contains no real functionality but all the framework components,
> make files, header files and some tracing functionality.
>
> +
> +struct kvm_regs {
> +	__u32 regs0_7[8];	/* Unbanked regs. (r0 - r7)	   */
> +	__u32 fiq_regs8_12[5];	/* Banked fiq regs. (r8 - r12)	   */
> +	__u32 usr_regs8_12[5];	/* Banked usr registers (r8 - r12) */
> +	__u32 reg13[6];		/* Banked r13, indexed by MODE_	   */
> +	__u32 reg14[6];		/* Banked r13, indexed by MODE_	   */
> +	__u32 reg15;
> +	__u32 cpsr;
> +	__u32 spsr[5];		/* Banked SPSR,  indexed by MODE_  */
> +	struct {
> +		__u32 c2_base0;
> +		__u32 c2_base1;
> +		__u32 c3_dacr;
> +	} cp15;
> +
> +};
> +
> +struct kvm_sregs {
> +};
> +
> +struct kvm_fpu {
> +};
> +
> +struct kvm_guest_debug_arch {
> +};
> +
> +struct kvm_debug_exit_arch {
> +};

Presumably, to be filled in later?

> +
> +/* Get vcpu register for current mode */
> +#define vcpu_reg(_vcpu, _reg_num) \
> +	(*kvm_vcpu_reg((_vcpu), _reg_num, vcpu_mode(_vcpu)))
> +
> +/* Get vcpu register for specific mode */
> +#define vcpu_reg_m(_vcpu, _reg_num, _mode) \
> +	(*kvm_vcpu_reg(_vcpu, _reg_num, _mode))
> +
> +#define vcpu_cpsr(_vcpu) \
> +	(_vcpu->arch.regs.cpsr)
> +
> +/* Get vcpu SPSR for current mode */
> +#define vcpu_spsr(_vcpu) \
> +	kvm_vcpu_spsr(_vcpu, vcpu_mode(_vcpu))
> +
> +/* Get vcpu SPSR for specific mode */
> +#define vcpu_spsr_m(_vcpu, _mode) \
> +	kvm_vcpu_spsr(_vcpu, _mode)
> +
> +#define MODE_HAS_SPSR(_vcpu) \
> +	 ((vcpu_mode(_vcpu))<  MODE_USR)
> +
> +#define VCPU_MODE_PRIV(_vcpu) \
> +	(((vcpu_mode(_vcpu)) == MODE_USR) ? 0 : 1)

Please use static inlines.  Yes, you'll need more helpers to set 
registers, but it's worth it, especially as some macros evaluate an 
argument multiple times.

> +if VIRTUALIZATION
> +
> +config KVM
> +	bool "Kernel-based Virtual Machine (KVM) support"
> +	select PREEMPT_NOTIFIERS
> +	select ANON_INODES
> +	select KVM_ARM_HOST
> +	select KVM_MMIO
> +	---help---
> +	  Support hosting virtualized guest machines. You will also
> +	  need to select one or more of the processor modules below.
> +
> +	  This module provides access to the hardware capabilities through
> +	  a character device node named /dev/kvm.
> +
> +	  If unsure, say N.

I see you can't support a modular build, which is a pity.

> +
> +static int k_show(struct seq_file *m, void *v)
> +{
> +	print_kvm_debug_info(&seq_printf, m);
> +	return 0;
> +}
> +
> +static void *k_start(struct seq_file *m, loff_t *pos)
> +{
> +	return *pos<  1 ? (void *)1 : NULL;
> +}
> +
> +static void *k_next(struct seq_file *m, void *v, loff_t *pos)
> +{
> +	++*pos;
> +	return NULL;
> +}
> +
> +static void k_stop(struct seq_file *m, void *v)
> +{
> +}
> +
> +static const struct seq_operations kvmproc_op = {
> +	.start	= k_start,
> +	.next	= k_next,
> +	.stop	= k_stop,
> +	.show	= k_show
> +};
> +
> +static int kvm_open(struct inode *inode, struct file *file)
> +{
> +	return seq_open(file,&kvmproc_op);
> +}
> +
> +static const struct file_operations proc_kvm_operations = {
> +	.open		= kvm_open,
> +	.read		= seq_read,
> +	.llseek		= seq_lseek,
> +	.release	= seq_release,
> +};
> +
> +static int arm_init(void)
> +{
> +	int rc = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
> +	if (rc == 0)
> +		proc_create("kvm", 0, NULL,&proc_kvm_operations);
> +	return rc;
> +}

/proc is frowned upon these days.  Is there no better place for this?+
> +/*
> + * Return a pointer to the register number valid in the specified mode of
> + * the virtual CPU.
> + */
> +u32* kvm_vcpu_reg(struct kvm_vcpu *vcpu, u8 reg_num, u32 mode)
> +{
> +	struct kvm_vcpu_regs *regs;
> +	u8 reg_idx;
> +	BUG_ON(reg_num>  15);
> +
> +	regs =&vcpu->arch.regs;
> +
> +	/* The PC is trivial */
> +	if (reg_num == 15)
> +		return&(regs->pc);
> +
> +	/* Non-banked registers */
> +	if (reg_num<  8)
> +		return&(regs->usr_regs[reg_num]);
> +
> +	/* Banked registers r13 and r14 */
> +	if (reg_num>= 13) {
> +		reg_idx = reg_num - 13; /* 0=r13 and 1=r14 */
> +		switch (mode) {
> +		case MODE_FIQ:
> +			return&(regs->fiq_regs[reg_idx + 5]);
> +		case MODE_IRQ:
> +			return&(regs->irq_regs[reg_idx]);
> +		case MODE_SVC:
> +			return&(regs->svc_regs[reg_idx]);
> +		case MODE_ABT:
> +			return&(regs->abt_regs[reg_idx]);
> +		case MODE_UND:
> +			return&(regs->und_regs[reg_idx]);
> +		case MODE_USR:
> +		case MODE_SYS:
> +			return&(regs->usr_regs[reg_idx]);
> +		}
> +	}
> +
> +	/* Banked FIQ registers r8-r12 */
> +	if (reg_num>= 8&&  reg_num<= 12) {
> +		if (mode == MODE_FIQ) {
> +			reg_idx = reg_num - 8; /* 0=r8, ..., 4=r12 */
> +			return&(regs->fiq_regs[reg_idx]);
> +		} else
> +			return&(regs->usr_regs[reg_num]);
> +	}

You could have a static 2D array indexed by mode and register number, 
returning an offsetof() into the vcpu structure.

> +
> +	BUG();
> +	return NULL;
> +}
>
> diff --git a/arch/arm/kvm/trace.c b/arch/arm/kvm/trace.c
> new file mode 100644
> index 0000000..8ea1155
> --- /dev/null
> +++ b/arch/arm/kvm/trace.c
> @@ -0,0 +1,436 @@
> +/*
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License, version 2, as
> + * published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
> + *
> + */
> +#include<linux/types.h>
> +#include<linux/kvm_types.h>
> +#include<linux/kvm_host.h>
> +
> +#include<asm/kvm_emulate.h>
> +#include "trace.h"
> +
> +
> +/******************************************************************************
> + * Simple event counting
> + */
> +
> +struct kvm_event {
> +	unsigned long long cnt;
> +	char *descr;
> +};
> +
> +static struct kvm_event kvm_eventc_log[KVM_EVENTC_ITEMS] =
> +{
> +	{ 0, "switch to guest" },
> +	{ 0, "exit from guest" },
> +	{ 0, "Block VCPU" },
> +	{ 0, "Exit to QEMU for IRQ window" },
> +	{ 0, "Switch VCPU mode" },
> +	{ 0, "VCPU IRQs on" },
> +	{ 0, "VCPU IRQs off" },
> +	{ 0, "Wait-for-interrupts" },
> +	{ 0, "Flush shadow page table" },
> +	{ 0, "Virtual TTBR change" },
> +	{ 0, "Read guest page table entry" },
> +	{ 0, "Map GVA to GFN" },
> +	{ 0, "Virtual DACR change" },
> +	{ 0, "VCPU switch to privileged mode" },
> +	{ 0, "VCPU switch from privileged mode" },
> +	{ 0, "VCPU process ID registers change" },
> +	{ 0, "Emulate Load/Store with translation" },
> +	{ 0, "Emulate MRS" },
> +	{ 0, "Emulate MSR" },
> +	{ 0, "Emulate CPS" },
> +	{ 0, "Need reschedule in execution loop" },
> +	{ 0, "MCR 7,  5, 0 - Invalidate entire I-cache" },
> +	{ 0, "MCR 7,  5, 1 - Invalidate line in I-cache MVA" },
> +	{ 0, "MCR 7,  5, 2 - Invalidate line in I-cache set/way" },
> +	{ 0, "MCR 7,  5, 7 - Flush branch target cache - MVA" },
> +	{ 0, "MCR 7,  6, 0 - Invalidate entire data cache" },
> +	{ 0, "MCR 7,  6, 1 - Invalidate data cache line - MVA" },
> +	{ 0, "MCR 7,  6, 2 - Invalidate data cache line - set/way" },
> +	{ 0, "MCR 7,  7, 0 - Invalidate D- and I-cache" },
> +	{ 0, "MCR 7, 10, 0 - Clean entire data cache" },
> +	{ 0, "MCR 7, 10, 1 - Clean data cache line - MVA" },
> +	{ 0, "MCR 7, 10, 4 - Data Synchronization Barrier (DSB)" },
> +	{ 0, "MCR 7, 14, 0 - Clean and invalidate entire D-cache" },
> +	{ 0, "MCR 7, 14, 1 - Clean and invalidate D-cache line - MVA" },
> +	{ 0, "MCR 7, 15, 0 - Clean and invalidate unified cache" },
> +	{ 0, "MCR 8,  5, 0 - Invalidate instruction TLB" },
> +	{ 0, "MCR 8,  6, 0 - Invalidate data TLB" },
> +	{ 0, "MCR 8,  7, 0 - Invalidate unified TLB" },
> +	{ 0, "Emulate Load-Store multiple" },
> +};
> +
> +void kvm_arm_count_event(unsigned int event)
> +{
> +	if (event>= KVM_EVENTC_ITEMS)
> +		return;
> +
> +	kvm_eventc_log[event].cnt++;
> +}

We've switched to ftrace for this sort of thing.  Simply add a 
tracepoint for each interesting event, and the kernel can provide you with

- a count of events ('perf stat')
- a log of events ('trace-cmd record/report'), possibly with other 
kernel events interspersed
- a running histogram ('kvm_stat')

with near-zero impact when disabled.

See include/trace/events/kvm.h, arch/x86/kvm/trace.h.
Avi Kivity June 5, 2011, 12:52 p.m. UTC | #7
On 06/03/2011 06:03 PM, Christoffer Dall wrote:
> Targets KVM support for Cortex A-15 processors.
>
> Contains no real functionality but all the framework components,
> make files, header files and some tracing functionality.


(series review - please have a cover letter in the future for this stuff)

Looks good in general.  Of course I can't say much about technical 
correctness and will rely on the ARM maintainers for that.

Please document which ioctls are supported in 
Documentation/virtual/kvm/api.txt, and make sure that any ARM-specific 
ioctls (if you ever have any) are documented there.
Avi Kivity June 5, 2011, 2 p.m. UTC | #8
On 06/05/2011 03:52 PM, Avi Kivity wrote:
>
> (series review - please have a cover letter in the future for this stuff)
>

Oh - you did - but in a separate thread.  Oh well.
Christoffer Dall June 5, 2011, 2:13 p.m. UTC | #9
On Sun, Jun 5, 2011 at 4:00 PM, Avi Kivity <avi@redhat.com> wrote:
> On 06/05/2011 03:52 PM, Avi Kivity wrote:
>>
>> (series review - please have a cover letter in the future for this stuff)
>>
>
> Oh - you did - but in a separate thread.  Oh well.

My internet router decided to reset immediately after sending the
cover letter, when it was backup stgit gave up on sending the rest of
the patches, and I didn't think about putting the rest in the same
thread. Sorry about that.

>
> --
> error compiling committee.c: too many arguments to function
>
>
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Kiszka June 5, 2011, 2:13 p.m. UTC | #10
On 2011-06-05 14:21, Avi Kivity wrote:
> On 06/03/2011 06:53 PM, Jan Kiszka wrote:
>> >>  @@ -310,6 +310,7 @@ struct kvm_translation {
>> >>   struct kvm_interrupt {
>> >>       /* in */
>> >>       __u32 irq;
>> >>  +    __u8  raise;
>> >>   };
>> >
>> >  This touches an existing ABI and corrupts the definition of
>> >  KVM_INTERRUPT IOCTL. The might exist jurisdictions considering this a
>> >  capital crime. :)
>> >
>> >  You rather have to define a new CPU IRQ injection interface that
>> >  supports both raising and lowering
> 
> This is KVM_IRQ_LINE:
> 

It's so far associated with in-kernel irqchip input pins, not with
raising CPU IRQs.

> 
> 
>> and declare its availability via a
>> >  KVM_CAP. Don't forget to make it extensible (flags field) so that
>> future
>> >  requirements can be added without breaking existing users.
>>
>> Or much easier (this is what PowerPC is doing): Define irq values in a
>> way that they include a raise/lower flag.
> 
> Much easier and much horribler.
> 

Less horrible than overloading KVM_IRQ_LINE IMHO. The semantics of
kvm_interrupt::irq are in arch hands anyway.

Jan
Avi Kivity June 5, 2011, 2:18 p.m. UTC | #11
On 06/05/2011 05:13 PM, Jan Kiszka wrote:
> On 2011-06-05 14:21, Avi Kivity wrote:
> >  On 06/03/2011 06:53 PM, Jan Kiszka wrote:
> >>  >>   @@ -310,6 +310,7 @@ struct kvm_translation {
> >>  >>    struct kvm_interrupt {
> >>  >>        /* in */
> >>  >>        __u32 irq;
> >>  >>   +    __u8  raise;
> >>  >>    };
> >>  >
> >>  >   This touches an existing ABI and corrupts the definition of
> >>  >   KVM_INTERRUPT IOCTL. The might exist jurisdictions considering this a
> >>  >   capital crime. :)
> >>  >
> >>  >   You rather have to define a new CPU IRQ injection interface that
> >>  >   supports both raising and lowering
> >
> >  This is KVM_IRQ_LINE:
> >
>
> It's so far associated with in-kernel irqchip input pins, not with
> raising CPU IRQs.

It's up to the architecture to define what it's connected to.

Note that with KVM_SET_GSI_ROUTING (bad name for ARM...) we can even 
choose if an irq line is connected to a kernel-emulated interrupt 
controller or to the core's irq input.

> >>  and declare its availability via a
> >>  >   KVM_CAP. Don't forget to make it extensible (flags field) so that
> >>  future
> >>  >   requirements can be added without breaking existing users.
> >>
> >>  Or much easier (this is what PowerPC is doing): Define irq values in a
> >>  way that they include a raise/lower flag.
> >
> >  Much easier and much horribler.
> >
>
> Less horrible than overloading KVM_IRQ_LINE IMHO. The semantics of
> kvm_interrupt::irq are in arch hands anyway.

Something that can be raised or lowered is an irq line.
Avi Kivity June 5, 2011, 2:18 p.m. UTC | #12
On 06/05/2011 05:13 PM, Christoffer Dall wrote:
> On Sun, Jun 5, 2011 at 4:00 PM, Avi Kivity<avi@redhat.com>  wrote:
> >  On 06/05/2011 03:52 PM, Avi Kivity wrote:
> >>
> >>  (series review - please have a cover letter in the future for this stuff)
> >>
> >
> >  Oh - you did - but in a separate thread.  Oh well.
>
> My internet router decided to reset immediately after sending the
> cover letter, when it was backup stgit gave up on sending the rest of
> the patches, and I didn't think about putting the rest in the same
> thread. Sorry about that.

No problem.
Jan Kiszka June 5, 2011, 2:58 p.m. UTC | #13
On 2011-06-05 16:18, Avi Kivity wrote:
> On 06/05/2011 05:13 PM, Jan Kiszka wrote:
>> On 2011-06-05 14:21, Avi Kivity wrote:
>> >  On 06/03/2011 06:53 PM, Jan Kiszka wrote:
>> >>  >>   @@ -310,6 +310,7 @@ struct kvm_translation {
>> >>  >>    struct kvm_interrupt {
>> >>  >>        /* in */
>> >>  >>        __u32 irq;
>> >>  >>   +    __u8  raise;
>> >>  >>    };
>> >>  >
>> >>  >   This touches an existing ABI and corrupts the definition of
>> >>  >   KVM_INTERRUPT IOCTL. The might exist jurisdictions considering
>> this a
>> >>  >   capital crime. :)
>> >>  >
>> >>  >   You rather have to define a new CPU IRQ injection interface that
>> >>  >   supports both raising and lowering
>> >
>> >  This is KVM_IRQ_LINE:
>> >
>>
>> It's so far associated with in-kernel irqchip input pins, not with
>> raising CPU IRQs.
> 
> It's up to the architecture to define what it's connected to.
> 
> Note that with KVM_SET_GSI_ROUTING (bad name for ARM...) we can even
> choose if an irq line is connected to a kernel-emulated interrupt
> controller or to the core's irq input.

Makes some sense: Add KVM_IRQ_ROUTING_CPU, and kvm_irq_routing_entry's
union would require some struct kvm_irq_routing_cpu containing the
target identifier.

However, I would recommend to carefully check the generic irq routing
bits before use - if they still contain some x86/ia64 specifics or
unwanted irqchip_in_kernel().

Jan
Avi Kivity June 5, 2011, 3:10 p.m. UTC | #14
On 06/05/2011 05:58 PM, Jan Kiszka wrote:
> >
> >  Note that with KVM_SET_GSI_ROUTING (bad name for ARM...) we can even
> >  choose if an irq line is connected to a kernel-emulated interrupt
> >  controller or to the core's irq input.
>
> Makes some sense: Add KVM_IRQ_ROUTING_CPU, and kvm_irq_routing_entry's
> union would require some struct kvm_irq_routing_cpu containing the
> target identifier.

Right.  Note it would be the default, so no need to implement 
KVM_SET_GSI_ROUTING just yet.

An additional advantage is that this is a vm ioctl, not a vcpu ioctl, so 
no need to interrupt the vcpu thread in userspace in order to queue an 
interrupt.  Of course, it still happens in the kernel, but it's easier 
for userspace to implement its device model this way.

> However, I would recommend to carefully check the generic irq routing
> bits before use - if they still contain some x86/ia64 specifics or
> unwanted irqchip_in_kernel().

Most likely yes.
Jan Kiszka June 5, 2011, 3:14 p.m. UTC | #15
On 2011-06-05 17:10, Avi Kivity wrote:
> On 06/05/2011 05:58 PM, Jan Kiszka wrote:
>> >
>> >  Note that with KVM_SET_GSI_ROUTING (bad name for ARM...) we can even
>> >  choose if an irq line is connected to a kernel-emulated interrupt
>> >  controller or to the core's irq input.
>>
>> Makes some sense: Add KVM_IRQ_ROUTING_CPU, and kvm_irq_routing_entry's
>> union would require some struct kvm_irq_routing_cpu containing the
>> target identifier.
> 
> Right.  Note it would be the default, so no need to implement
> KVM_SET_GSI_ROUTING just yet.
> 
> An additional advantage is that this is a vm ioctl, not a vcpu ioctl, so
> no need to interrupt the vcpu thread in userspace in order to queue an
> interrupt.  Of course, it still happens in the kernel, but it's easier
> for userspace to implement its device model this way.

So supporting this over existing archs would have some benefits as well,
though a bit smaller if in-kernel irqchip is already implemented.

Jan
Avi Kivity June 5, 2011, 3:18 p.m. UTC | #16
On 06/05/2011 06:14 PM, Jan Kiszka wrote:
> >  An additional advantage is that this is a vm ioctl, not a vcpu ioctl, so
> >  no need to interrupt the vcpu thread in userspace in order to queue an
> >  interrupt.  Of course, it still happens in the kernel, but it's easier
> >  for userspace to implement its device model this way.
>
> So supporting this over existing archs would have some benefits as well,
> though a bit smaller if in-kernel irqchip is already implemented.

Yes, except for s390, of course.
Christoffer Dall June 5, 2011, 4:03 p.m. UTC | #17
On Sun, Jun 5, 2011 at 2:36 PM, Avi Kivity <avi@redhat.com> wrote:
> On 06/03/2011 06:03 PM, Christoffer Dall wrote:
>>
>> Targets KVM support for Cortex A-15 processors.
>>
>> Contains no real functionality but all the framework components,
>> make files, header files and some tracing functionality.
>>
>> +
>> +struct kvm_regs {
>> +       __u32 regs0_7[8];       /* Unbanked regs. (r0 - r7)        */
>> +       __u32 fiq_regs8_12[5];  /* Banked fiq regs. (r8 - r12)     */
>> +       __u32 usr_regs8_12[5];  /* Banked usr registers (r8 - r12) */
>> +       __u32 reg13[6];         /* Banked r13, indexed by MODE_    */
>> +       __u32 reg14[6];         /* Banked r13, indexed by MODE_    */
>> +       __u32 reg15;
>> +       __u32 cpsr;
>> +       __u32 spsr[5];          /* Banked SPSR,  indexed by MODE_  */
>> +       struct {
>> +               __u32 c2_base0;
>> +               __u32 c2_base1;
>> +               __u32 c3_dacr;
>> +       } cp15;
>> +
>> +};
>> +
>> +struct kvm_sregs {
>> +};
>> +
>> +struct kvm_fpu {
>> +};
>> +
>> +struct kvm_guest_debug_arch {
>> +};
>> +
>> +struct kvm_debug_exit_arch {
>> +};
>
> Presumably, to be filled in later?
>

I simply didn't look at these yet and didn't need them yet. I will
look into this later on.

>> +
>> +/* Get vcpu register for current mode */
>> +#define vcpu_reg(_vcpu, _reg_num) \
>> +       (*kvm_vcpu_reg((_vcpu), _reg_num, vcpu_mode(_vcpu)))
>> +
>> +/* Get vcpu register for specific mode */
>> +#define vcpu_reg_m(_vcpu, _reg_num, _mode) \
>> +       (*kvm_vcpu_reg(_vcpu, _reg_num, _mode))
>> +
>> +#define vcpu_cpsr(_vcpu) \
>> +       (_vcpu->arch.regs.cpsr)
>> +
>> +/* Get vcpu SPSR for current mode */
>> +#define vcpu_spsr(_vcpu) \
>> +       kvm_vcpu_spsr(_vcpu, vcpu_mode(_vcpu))
>> +
>> +/* Get vcpu SPSR for specific mode */
>> +#define vcpu_spsr_m(_vcpu, _mode) \
>> +       kvm_vcpu_spsr(_vcpu, _mode)
>> +
>> +#define MODE_HAS_SPSR(_vcpu) \
>> +        ((vcpu_mode(_vcpu))<  MODE_USR)
>> +
>> +#define VCPU_MODE_PRIV(_vcpu) \
>> +       (((vcpu_mode(_vcpu)) == MODE_USR) ? 0 : 1)
>
> Please use static inlines.  Yes, you'll need more helpers to set registers,
> but it's worth it, especially as some macros evaluate an argument multiple
> times.

ok.

>
>> +if VIRTUALIZATION
>> +
>> +config KVM
>> +       bool "Kernel-based Virtual Machine (KVM) support"
>> +       select PREEMPT_NOTIFIERS
>> +       select ANON_INODES
>> +       select KVM_ARM_HOST
>> +       select KVM_MMIO
>> +       ---help---
>> +         Support hosting virtualized guest machines. You will also
>> +         need to select one or more of the processor modules below.
>> +
>> +         This module provides access to the hardware capabilities through
>> +         a character device node named /dev/kvm.
>> +
>> +         If unsure, say N.
>
> I see you can't support a modular build, which is a pity.
>

My concern is that I map in code that needs to be in place for running
in Hyp mode. Of course I just need to pin these, so it should be
possible. I'll look into this as well.

>> +
>> +static int k_show(struct seq_file *m, void *v)
>> +{
>> +       print_kvm_debug_info(&seq_printf, m);
>> +       return 0;
>> +}
>> +
>> +static void *k_start(struct seq_file *m, loff_t *pos)
>> +{
>> +       return *pos<  1 ? (void *)1 : NULL;
>> +}
>> +
>> +static void *k_next(struct seq_file *m, void *v, loff_t *pos)
>> +{
>> +       ++*pos;
>> +       return NULL;
>> +}
>> +
>> +static void k_stop(struct seq_file *m, void *v)
>> +{
>> +}
>> +
>> +static const struct seq_operations kvmproc_op = {
>> +       .start  = k_start,
>> +       .next   = k_next,
>> +       .stop   = k_stop,
>> +       .show   = k_show
>> +};
>> +
>> +static int kvm_open(struct inode *inode, struct file *file)
>> +{
>> +       return seq_open(file,&kvmproc_op);
>> +}
>> +
>> +static const struct file_operations proc_kvm_operations = {
>> +       .open           = kvm_open,
>> +       .read           = seq_read,
>> +       .llseek         = seq_lseek,
>> +       .release        = seq_release,
>> +};
>> +
>> +static int arm_init(void)
>> +{
>> +       int rc = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
>> +       if (rc == 0)
>> +               proc_create("kvm", 0, NULL,&proc_kvm_operations);
>> +       return rc;
>> +}
>
> /proc is frowned upon these days.  Is there no better place for this?+
>>

Yeah, this is actually quite legacy and probably shouldn't have been
included. I will give this a thorough overhaul before next patch
series.

>> +/*
>> + * Return a pointer to the register number valid in the specified mode of
>> + * the virtual CPU.
>> + */
>> +u32* kvm_vcpu_reg(struct kvm_vcpu *vcpu, u8 reg_num, u32 mode)
>> +{
>> +       struct kvm_vcpu_regs *regs;
>> +       u8 reg_idx;
>> +       BUG_ON(reg_num>  15);
>> +
>> +       regs =&vcpu->arch.regs;
>> +
>> +       /* The PC is trivial */
>> +       if (reg_num == 15)
>> +               return&(regs->pc);
>> +
>> +       /* Non-banked registers */
>> +       if (reg_num<  8)
>> +               return&(regs->usr_regs[reg_num]);
>> +
>> +       /* Banked registers r13 and r14 */
>> +       if (reg_num>= 13) {
>> +               reg_idx = reg_num - 13; /* 0=r13 and 1=r14 */
>> +               switch (mode) {
>> +               case MODE_FIQ:
>> +                       return&(regs->fiq_regs[reg_idx + 5]);
>> +               case MODE_IRQ:
>> +                       return&(regs->irq_regs[reg_idx]);
>> +               case MODE_SVC:
>> +                       return&(regs->svc_regs[reg_idx]);
>> +               case MODE_ABT:
>> +                       return&(regs->abt_regs[reg_idx]);
>> +               case MODE_UND:
>> +                       return&(regs->und_regs[reg_idx]);
>> +               case MODE_USR:
>> +               case MODE_SYS:
>> +                       return&(regs->usr_regs[reg_idx]);
>> +               }
>> +       }
>> +
>> +       /* Banked FIQ registers r8-r12 */
>> +       if (reg_num>= 8&&  reg_num<= 12) {
>> +               if (mode == MODE_FIQ) {
>> +                       reg_idx = reg_num - 8; /* 0=r8, ..., 4=r12 */
>> +                       return&(regs->fiq_regs[reg_idx]);
>> +               } else
>> +                       return&(regs->usr_regs[reg_num]);
>> +       }
>
> You could have a static 2D array indexed by mode and register number,
> returning an offsetof() into the vcpu structure.

You think it's simpler or faster? I don't quite see the incentive.
It's not going to be called a whole lot given the Virt. Extensions.

>
>> +
>> +       BUG();
>> +       return NULL;
>> +}
>>
>> diff --git a/arch/arm/kvm/trace.c b/arch/arm/kvm/trace.c
>> new file mode 100644
>> index 0000000..8ea1155

[snip]

>> +
>> +void kvm_arm_count_event(unsigned int event)
>> +{
>> +       if (event>= KVM_EVENTC_ITEMS)
>> +               return;
>> +
>> +       kvm_eventc_log[event].cnt++;
>> +}
>
> We've switched to ftrace for this sort of thing.  Simply add a tracepoint
> for each interesting event, and the kernel can provide you with
>
> - a count of events ('perf stat')
> - a log of events ('trace-cmd record/report'), possibly with other kernel
> events interspersed
> - a running histogram ('kvm_stat')
>
> with near-zero impact when disabled.
>
> See include/trace/events/kvm.h, arch/x86/kvm/trace.h.

Will do, thanks.
>
> --
> error compiling committee.c: too many arguments to function
>
>
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity June 5, 2011, 4:06 p.m. UTC | #18
On 06/05/2011 07:03 PM, Christoffer Dall wrote:
> >>  +
> >>  +       /* Banked FIQ registers r8-r12 */
> >>  +       if (reg_num>= 8&&    reg_num<= 12) {
> >>  +               if (mode == MODE_FIQ) {
> >>  +                       reg_idx = reg_num - 8; /* 0=r8, ..., 4=r12 */
> >>  +                       return&(regs->fiq_regs[reg_idx]);
> >>  +               } else
> >>  +                       return&(regs->usr_regs[reg_num]);
> >>  +       }
> >
> >  You could have a static 2D array indexed by mode and register number,
> >  returning an offsetof() into the vcpu structure.
>
> You think it's simpler or faster? I don't quite see the incentive.
> It's not going to be called a whole lot given the Virt. Extensions.

Yes (those are mostly unpredictable branches), and clearer as well.  But 
I agree it's a rare-enough path that doesn't need optimizing, certainly 
not prematurely.
Christoffer Dall June 5, 2011, 4:24 p.m. UTC | #19
On Sun, Jun 5, 2011 at 4:58 PM, Jan Kiszka <jan.kiszka@web.de> wrote:
> On 2011-06-05 16:18, Avi Kivity wrote:
>> On 06/05/2011 05:13 PM, Jan Kiszka wrote:
>>> On 2011-06-05 14:21, Avi Kivity wrote:
>>> >  On 06/03/2011 06:53 PM, Jan Kiszka wrote:
>>> >>  >>   @@ -310,6 +310,7 @@ struct kvm_translation {
>>> >>  >>    struct kvm_interrupt {
>>> >>  >>        /* in */
>>> >>  >>        __u32 irq;
>>> >>  >>   +    __u8  raise;
>>> >>  >>    };
>>> >>  >
>>> >>  >   This touches an existing ABI and corrupts the definition of
>>> >>  >   KVM_INTERRUPT IOCTL. The might exist jurisdictions considering
>>> this a
>>> >>  >   capital crime. :)
>>> >>  >
>>> >>  >   You rather have to define a new CPU IRQ injection interface that
>>> >>  >   supports both raising and lowering
>>> >
>>> >  This is KVM_IRQ_LINE:
>>> >
>>>
>>> It's so far associated with in-kernel irqchip input pins, not with
>>> raising CPU IRQs.
>>
>> It's up to the architecture to define what it's connected to.
>>
>> Note that with KVM_SET_GSI_ROUTING (bad name for ARM...) we can even
>> choose if an irq line is connected to a kernel-emulated interrupt
>> controller or to the core's irq input.
>
> Makes some sense: Add KVM_IRQ_ROUTING_CPU, and kvm_irq_routing_entry's
> union would require some struct kvm_irq_routing_cpu containing the
> target identifier.
>
> However, I would recommend to carefully check the generic irq routing
> bits before use - if they still contain some x86/ia64 specifics or
> unwanted irqchip_in_kernel().
>
OK, so let me try to get this straight.

We all agree that ruining the ABI is crazy - sorry about even putting
that out there.

On ARM, we are talking about IRQs and FIQs, both of which can be
lowered and raised. In other words, four values.

But instead of this, you want to use KVM_IRQ_LINE and just assume one
CPU to begin with and always assert/deassert the interrupts on that
CPU and later, with SMP support, you want to use KVM_SET_GSI_ROUTING
to control which CPU gets the interrupt - even though GSI is a
non-existing concept for ARM? (Also, I could not seem to find any use
of GSI_ROUTING in current upstream QEMU, but I may be missing
something here?).

I suggest I simply use a flag in the irq field on the kvm_interrupt
struct as suggested by Jan, for now, and if we start emulating GICs
inside the kernel for ARM we can go the extra mile at that point.
IMHO, I think the semantics of actually raising an interrup line on
the VCPU, as opposed to the VM, when it's CPU interrupt we are talking
about is equally important.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoffer Dall June 5, 2011, 4:25 p.m. UTC | #20
On Sun, Jun 5, 2011 at 5:14 PM, Jan Kiszka <jan.kiszka@web.de> wrote:
> On 2011-06-05 17:10, Avi Kivity wrote:
>> On 06/05/2011 05:58 PM, Jan Kiszka wrote:
>>> >
>>> >  Note that with KVM_SET_GSI_ROUTING (bad name for ARM...) we can even
>>> >  choose if an irq line is connected to a kernel-emulated interrupt
>>> >  controller or to the core's irq input.
>>>
>>> Makes some sense: Add KVM_IRQ_ROUTING_CPU, and kvm_irq_routing_entry's
>>> union would require some struct kvm_irq_routing_cpu containing the
>>> target identifier.
>>
>> Right.  Note it would be the default, so no need to implement
>> KVM_SET_GSI_ROUTING just yet.
>>
>> An additional advantage is that this is a vm ioctl, not a vcpu ioctl, so
>> no need to interrupt the vcpu thread in userspace in order to queue an
>> interrupt.  Of course, it still happens in the kernel, but it's easier
>> for userspace to implement its device model this way.
>
> So supporting this over existing archs would have some benefits as well,
> though a bit smaller if in-kernel irqchip is already implemented.
>

Could you elaborate what you mean here? I'm not really following. Are
you suggesting a new arch-generic interface? (Pardon my ignorance).
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity June 5, 2011, 4:28 p.m. UTC | #21
On 06/05/2011 07:25 PM, Christoffer Dall wrote:
> >>
> >>  An additional advantage is that this is a vm ioctl, not a vcpu ioctl, so
> >>  no need to interrupt the vcpu thread in userspace in order to queue an
> >>  interrupt.  Of course, it still happens in the kernel, but it's easier
> >>  for userspace to implement its device model this way.
> >
> >  So supporting this over existing archs would have some benefits as well,
> >  though a bit smaller if in-kernel irqchip is already implemented.
> >
>
> Could you elaborate what you mean here? I'm not really following. Are
> you suggesting a new arch-generic interface? (Pardon my ignorance).

Using KVM_IRQ_LINE everywhere except s390, not just in x86 and ARM.
Alexander Graf June 5, 2011, 4:30 p.m. UTC | #22
On 05.06.2011, at 18:28, Avi Kivity wrote:

> On 06/05/2011 07:25 PM, Christoffer Dall wrote:
>>>> 
>>>> An additional advantage is that this is a vm ioctl, not a vcpu ioctl, so
>>>> no need to interrupt the vcpu thread in userspace in order to queue an
>>>> interrupt.  Of course, it still happens in the kernel, but it's easier
>>>> for userspace to implement its device model this way.
>>> 
>>> So supporting this over existing archs would have some benefits as well,
>>> though a bit smaller if in-kernel irqchip is already implemented.
>>> 
>> 
>> Could you elaborate what you mean here? I'm not really following. Are
>> you suggesting a new arch-generic interface? (Pardon my ignorance).
> 
> Using KVM_IRQ_LINE everywhere except s390, not just in x86 and ARM.

An in-kernel MPIC implementation is coming for PPC, so I don't see any reason to switch from something that works now.


Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity June 5, 2011, 4:31 p.m. UTC | #23
On 06/05/2011 07:24 PM, Christoffer Dall wrote:
> On ARM, we are talking about IRQs and FIQs, both of which can be
> lowered and raised. In other words, four values.

Two lines per cpu, each of which can be raised or lowered.

> But instead of this, you want to use KVM_IRQ_LINE and just assume one
> CPU to begin with and always assert/deassert the interrupts on that
> CPU and later, with SMP support, you want to use KVM_SET_GSI_ROUTING
> to control which CPU gets the interrupt - even though GSI is a
> non-existing concept for ARM? (Also, I could not seem to find any use
> of GSI_ROUTING in current upstream QEMU, but I may be missing
> something here?).

Almost.  vcpu N's IRQ -> KVM_IRQ_LINE(N*2, level).  vcpu N's FIQ -> 
KVM_IRQ_LINE(N*2+1, level), + documentation somewhere.

> I suggest I simply use a flag in the irq field on the kvm_interrupt
> struct as suggested by Jan, for now, and if we start emulating GICs
> inside the kernel for ARM we can go the extra mile at that point.
> IMHO, I think the semantics of actually raising an interrup line on
> the VCPU, as opposed to the VM, when it's CPU interrupt we are talking
> about is equally important.

When you implement an interrupt controller, you can use 
KVM_SET_GSI_ROUTING to change the meaning of the parameter to 
KVM_IRQ_LINE to point to the interrupt controllers.
Avi Kivity June 5, 2011, 4:33 p.m. UTC | #24
On 06/05/2011 07:30 PM, Alexander Graf wrote:
> >>
> >>  Could you elaborate what you mean here? I'm not really following. Are
> >>  you suggesting a new arch-generic interface? (Pardon my ignorance).
> >
> >  Using KVM_IRQ_LINE everywhere except s390, not just in x86 and ARM.
>
> An in-kernel MPIC implementation is coming for PPC, so I don't see any reason to switch from something that works now.

Right, this is spilled milk.

Does the ppc qemu implementation raise KVM_INTERRUPT solely from the 
vcpu thread?
Alexander Graf June 5, 2011, 5:19 p.m. UTC | #25
On 05.06.2011, at 18:33, Avi Kivity wrote:

> On 06/05/2011 07:30 PM, Alexander Graf wrote:
>> >>
>> >>  Could you elaborate what you mean here? I'm not really following. Are
>> >>  you suggesting a new arch-generic interface? (Pardon my ignorance).
>> >
>> >  Using KVM_IRQ_LINE everywhere except s390, not just in x86 and ARM.
>> 
>> An in-kernel MPIC implementation is coming for PPC, so I don't see any reason to switch from something that works now.
> 
> Right, this is spilled milk.
> 
> Does the ppc qemu implementation raise KVM_INTERRUPT solely from the vcpu thread?

Well, without iothread it used to obviously. Now that we have an iothread, it calls ioctl(KVM_INTERRUPT) from a separate thread. The code also doesn't forcefully wake up the vcpu thread, so yes, I think here's a chance for at least delaying interrupt delivery. Chances are pretty slim we don't get out of the vcpu thread at all :).


Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Kiszka June 5, 2011, 5:48 p.m. UTC | #26
On 2011-06-05 19:19, Alexander Graf wrote:
> 
> On 05.06.2011, at 18:33, Avi Kivity wrote:
> 
>> On 06/05/2011 07:30 PM, Alexander Graf wrote:
>>>>>
>>>>>  Could you elaborate what you mean here? I'm not really following. Are
>>>>>  you suggesting a new arch-generic interface? (Pardon my ignorance).
>>>>
>>>>  Using KVM_IRQ_LINE everywhere except s390, not just in x86 and ARM.
>>>
>>> An in-kernel MPIC implementation is coming for PPC, so I don't see any reason to switch from something that works now.
>>
>> Right, this is spilled milk.
>>
>> Does the ppc qemu implementation raise KVM_INTERRUPT solely from the vcpu thread?
> 
> Well, without iothread it used to obviously. Now that we have an iothread, it calls ioctl(KVM_INTERRUPT) from a separate thread. The code also doesn't forcefully wake up the vcpu thread, so yes, I think here's a chance for at least delaying interrupt delivery. Chances are pretty slim we don't get out of the vcpu thread at all :).

There are good chances to run into a deadlock when calling a per-vcpu
IOCTL over a foreign context: calling thread holds qemu_mutex and blocks
on kvm_mutex inside the kernel, target vcpu is running endless guest
loop, holding kvm_mutex, all other qemu threads will sooner or later
block on the global lock. That's at least one pattern you can get on x86
(we had a few of such bugs in the past).

Jan
Alexander Graf June 5, 2011, 5:54 p.m. UTC | #27
On 05.06.2011, at 19:48, Jan Kiszka wrote:

> On 2011-06-05 19:19, Alexander Graf wrote:
>> 
>> On 05.06.2011, at 18:33, Avi Kivity wrote:
>> 
>>> On 06/05/2011 07:30 PM, Alexander Graf wrote:
>>>>>> 
>>>>>> Could you elaborate what you mean here? I'm not really following. Are
>>>>>> you suggesting a new arch-generic interface? (Pardon my ignorance).
>>>>> 
>>>>> Using KVM_IRQ_LINE everywhere except s390, not just in x86 and ARM.
>>>> 
>>>> An in-kernel MPIC implementation is coming for PPC, so I don't see any reason to switch from something that works now.
>>> 
>>> Right, this is spilled milk.
>>> 
>>> Does the ppc qemu implementation raise KVM_INTERRUPT solely from the vcpu thread?
>> 
>> Well, without iothread it used to obviously. Now that we have an iothread, it calls ioctl(KVM_INTERRUPT) from a separate thread. The code also doesn't forcefully wake up the vcpu thread, so yes, I think here's a chance for at least delaying interrupt delivery. Chances are pretty slim we don't get out of the vcpu thread at all :).
> 
> There are good chances to run into a deadlock when calling a per-vcpu
> IOCTL over a foreign context: calling thread holds qemu_mutex and blocks
> on kvm_mutex inside the kernel, target vcpu is running endless guest
> loop, holding kvm_mutex, all other qemu threads will sooner or later
> block on the global lock. That's at least one pattern you can get on x86
> (we had a few of such bugs in the past).

Any recommendations? Should we just signal the main thread when we want to inject an interrupt?


Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Kiszka June 5, 2011, 5:56 p.m. UTC | #28
On 2011-06-05 19:54, Alexander Graf wrote:
> 
> On 05.06.2011, at 19:48, Jan Kiszka wrote:
> 
>> On 2011-06-05 19:19, Alexander Graf wrote:
>>>
>>> On 05.06.2011, at 18:33, Avi Kivity wrote:
>>>
>>>> On 06/05/2011 07:30 PM, Alexander Graf wrote:
>>>>>>>
>>>>>>> Could you elaborate what you mean here? I'm not really following. Are
>>>>>>> you suggesting a new arch-generic interface? (Pardon my ignorance).
>>>>>>
>>>>>> Using KVM_IRQ_LINE everywhere except s390, not just in x86 and ARM.
>>>>>
>>>>> An in-kernel MPIC implementation is coming for PPC, so I don't see any reason to switch from something that works now.
>>>>
>>>> Right, this is spilled milk.
>>>>
>>>> Does the ppc qemu implementation raise KVM_INTERRUPT solely from the vcpu thread?
>>>
>>> Well, without iothread it used to obviously. Now that we have an iothread, it calls ioctl(KVM_INTERRUPT) from a separate thread. The code also doesn't forcefully wake up the vcpu thread, so yes, I think here's a chance for at least delaying interrupt delivery. Chances are pretty slim we don't get out of the vcpu thread at all :).
>>
>> There are good chances to run into a deadlock when calling a per-vcpu
>> IOCTL over a foreign context: calling thread holds qemu_mutex and blocks
>> on kvm_mutex inside the kernel, target vcpu is running endless guest
>> loop, holding kvm_mutex, all other qemu threads will sooner or later
>> block on the global lock. That's at least one pattern you can get on x86
>> (we had a few of such bugs in the past).
> 
> Any recommendations? Should we just signal the main thread when we want to inject an interrupt?

Yep. That's also what x86 does (when using user space irqchips).

Jan
Alexander Graf June 5, 2011, 6 p.m. UTC | #29
On 05.06.2011, at 19:56, Jan Kiszka wrote:

> On 2011-06-05 19:54, Alexander Graf wrote:
>> 
>> On 05.06.2011, at 19:48, Jan Kiszka wrote:
>> 
>>> On 2011-06-05 19:19, Alexander Graf wrote:
>>>> 
>>>> On 05.06.2011, at 18:33, Avi Kivity wrote:
>>>> 
>>>>> On 06/05/2011 07:30 PM, Alexander Graf wrote:
>>>>>>>> 
>>>>>>>> Could you elaborate what you mean here? I'm not really following. Are
>>>>>>>> you suggesting a new arch-generic interface? (Pardon my ignorance).
>>>>>>> 
>>>>>>> Using KVM_IRQ_LINE everywhere except s390, not just in x86 and ARM.
>>>>>> 
>>>>>> An in-kernel MPIC implementation is coming for PPC, so I don't see any reason to switch from something that works now.
>>>>> 
>>>>> Right, this is spilled milk.
>>>>> 
>>>>> Does the ppc qemu implementation raise KVM_INTERRUPT solely from the vcpu thread?
>>>> 
>>>> Well, without iothread it used to obviously. Now that we have an iothread, it calls ioctl(KVM_INTERRUPT) from a separate thread. The code also doesn't forcefully wake up the vcpu thread, so yes, I think here's a chance for at least delaying interrupt delivery. Chances are pretty slim we don't get out of the vcpu thread at all :).
>>> 
>>> There are good chances to run into a deadlock when calling a per-vcpu
>>> IOCTL over a foreign context: calling thread holds qemu_mutex and blocks
>>> on kvm_mutex inside the kernel, target vcpu is running endless guest
>>> loop, holding kvm_mutex, all other qemu threads will sooner or later
>>> block on the global lock. That's at least one pattern you can get on x86
>>> (we had a few of such bugs in the past).
>> 
>> Any recommendations? Should we just signal the main thread when we want to inject an interrupt?
> 
> Yep. That's also what x86 does (when using user space irqchips).

Hrm, ok :). I guess the main reason we don't see major issues is that

  1) people don't use iothread too often yet - is it even enabled by default?
  2) the decrementor interrupt happens in-kernel, so timer interrupts still arrive properly


Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Kiszka June 5, 2011, 6:04 p.m. UTC | #30
On 2011-06-05 20:00, Alexander Graf wrote:
> 
> On 05.06.2011, at 19:56, Jan Kiszka wrote:
> 
>> On 2011-06-05 19:54, Alexander Graf wrote:
>>>
>>> On 05.06.2011, at 19:48, Jan Kiszka wrote:
>>>
>>>> On 2011-06-05 19:19, Alexander Graf wrote:
>>>>>
>>>>> On 05.06.2011, at 18:33, Avi Kivity wrote:
>>>>>
>>>>>> On 06/05/2011 07:30 PM, Alexander Graf wrote:
>>>>>>>>>
>>>>>>>>> Could you elaborate what you mean here? I'm not really following. Are
>>>>>>>>> you suggesting a new arch-generic interface? (Pardon my ignorance).
>>>>>>>>
>>>>>>>> Using KVM_IRQ_LINE everywhere except s390, not just in x86 and ARM.
>>>>>>>
>>>>>>> An in-kernel MPIC implementation is coming for PPC, so I don't see any reason to switch from something that works now.
>>>>>>
>>>>>> Right, this is spilled milk.
>>>>>>
>>>>>> Does the ppc qemu implementation raise KVM_INTERRUPT solely from the vcpu thread?
>>>>>
>>>>> Well, without iothread it used to obviously. Now that we have an iothread, it calls ioctl(KVM_INTERRUPT) from a separate thread. The code also doesn't forcefully wake up the vcpu thread, so yes, I think here's a chance for at least delaying interrupt delivery. Chances are pretty slim we don't get out of the vcpu thread at all :).
>>>>
>>>> There are good chances to run into a deadlock when calling a per-vcpu
>>>> IOCTL over a foreign context: calling thread holds qemu_mutex and blocks
>>>> on kvm_mutex inside the kernel, target vcpu is running endless guest
>>>> loop, holding kvm_mutex, all other qemu threads will sooner or later
>>>> block on the global lock. That's at least one pattern you can get on x86
>>>> (we had a few of such bugs in the past).
>>>
>>> Any recommendations? Should we just signal the main thread when we want to inject an interrupt?
>>
>> Yep. That's also what x86 does (when using user space irqchips).
> 
> Hrm, ok :). I guess the main reason we don't see major issues is that
> 
>   1) people don't use iothread too often yet - is it even enabled by default?

Nope (unless you use qemu-kvm.git next).

>   2) the decrementor interrupt happens in-kernel, so timer interrupts still arrive properly

Means PPC periodically returns to user space?

Jan
Alexander Graf June 5, 2011, 6:12 p.m. UTC | #31
On 05.06.2011, at 20:04, Jan Kiszka wrote:

> On 2011-06-05 20:00, Alexander Graf wrote:
>> 
>> On 05.06.2011, at 19:56, Jan Kiszka wrote:
>> 
>>> On 2011-06-05 19:54, Alexander Graf wrote:
>>>> 
>>>> On 05.06.2011, at 19:48, Jan Kiszka wrote:
>>>> 
>>>>> On 2011-06-05 19:19, Alexander Graf wrote:
>>>>>> 
>>>>>> On 05.06.2011, at 18:33, Avi Kivity wrote:
>>>>>> 
>>>>>>> On 06/05/2011 07:30 PM, Alexander Graf wrote:
>>>>>>>>>> 
>>>>>>>>>> Could you elaborate what you mean here? I'm not really following. Are
>>>>>>>>>> you suggesting a new arch-generic interface? (Pardon my ignorance).
>>>>>>>>> 
>>>>>>>>> Using KVM_IRQ_LINE everywhere except s390, not just in x86 and ARM.
>>>>>>>> 
>>>>>>>> An in-kernel MPIC implementation is coming for PPC, so I don't see any reason to switch from something that works now.
>>>>>>> 
>>>>>>> Right, this is spilled milk.
>>>>>>> 
>>>>>>> Does the ppc qemu implementation raise KVM_INTERRUPT solely from the vcpu thread?
>>>>>> 
>>>>>> Well, without iothread it used to obviously. Now that we have an iothread, it calls ioctl(KVM_INTERRUPT) from a separate thread. The code also doesn't forcefully wake up the vcpu thread, so yes, I think here's a chance for at least delaying interrupt delivery. Chances are pretty slim we don't get out of the vcpu thread at all :).
>>>>> 
>>>>> There are good chances to run into a deadlock when calling a per-vcpu
>>>>> IOCTL over a foreign context: calling thread holds qemu_mutex and blocks
>>>>> on kvm_mutex inside the kernel, target vcpu is running endless guest
>>>>> loop, holding kvm_mutex, all other qemu threads will sooner or later
>>>>> block on the global lock. That's at least one pattern you can get on x86
>>>>> (we had a few of such bugs in the past).
>>>> 
>>>> Any recommendations? Should we just signal the main thread when we want to inject an interrupt?
>>> 
>>> Yep. That's also what x86 does (when using user space irqchips).
>> 
>> Hrm, ok :). I guess the main reason we don't see major issues is that
>> 
>>  1) people don't use iothread too often yet - is it even enabled by default?
> 
> Nope (unless you use qemu-kvm.git next).

Any plans on finally doing that step? Code that isn't enabled by default is pretty prone to not be tested ;). It's a good way to slowly move code upstream, stabilize it there and then finally have it enabled by default. But I don't think this process should last more than 1/2 year. And IIRC with iothread, we're way past that point.

>>  2) the decrementor interrupt happens in-kernel, so timer interrupts still arrive properly
> 
> Means PPC periodically returns to user space?

Hrm. No. It stays in kernel space even more. But at least it continues to live without the need for a timer interrupt from user space, so an endless loop doesn't completely stall the guest. I see your point though.

With VGA we're good thanks to the dirty bitmap update. So we get quite a good number of exits from guest context. -nographic might be problematic though. I don't see how a new key event or a new packet on the network could easily arrive inside the guest there. Sigh.

I'll take a look at it when I get my head around it. Thanks a lot for the insight :).


Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Kiszka June 5, 2011, 6:19 p.m. UTC | #32
On 2011-06-05 20:12, Alexander Graf wrote:
> 
> On 05.06.2011, at 20:04, Jan Kiszka wrote:
> 
>> On 2011-06-05 20:00, Alexander Graf wrote:
>>>
>>> On 05.06.2011, at 19:56, Jan Kiszka wrote:
>>>
>>>> On 2011-06-05 19:54, Alexander Graf wrote:
>>>>>
>>>>> On 05.06.2011, at 19:48, Jan Kiszka wrote:
>>>>>
>>>>>> On 2011-06-05 19:19, Alexander Graf wrote:
>>>>>>>
>>>>>>> On 05.06.2011, at 18:33, Avi Kivity wrote:
>>>>>>>
>>>>>>>> On 06/05/2011 07:30 PM, Alexander Graf wrote:
>>>>>>>>>>>
>>>>>>>>>>> Could you elaborate what you mean here? I'm not really following. Are
>>>>>>>>>>> you suggesting a new arch-generic interface? (Pardon my ignorance).
>>>>>>>>>>
>>>>>>>>>> Using KVM_IRQ_LINE everywhere except s390, not just in x86 and ARM.
>>>>>>>>>
>>>>>>>>> An in-kernel MPIC implementation is coming for PPC, so I don't see any reason to switch from something that works now.
>>>>>>>>
>>>>>>>> Right, this is spilled milk.
>>>>>>>>
>>>>>>>> Does the ppc qemu implementation raise KVM_INTERRUPT solely from the vcpu thread?
>>>>>>>
>>>>>>> Well, without iothread it used to obviously. Now that we have an iothread, it calls ioctl(KVM_INTERRUPT) from a separate thread. The code also doesn't forcefully wake up the vcpu thread, so yes, I think here's a chance for at least delaying interrupt delivery. Chances are pretty slim we don't get out of the vcpu thread at all :).
>>>>>>
>>>>>> There are good chances to run into a deadlock when calling a per-vcpu
>>>>>> IOCTL over a foreign context: calling thread holds qemu_mutex and blocks
>>>>>> on kvm_mutex inside the kernel, target vcpu is running endless guest
>>>>>> loop, holding kvm_mutex, all other qemu threads will sooner or later
>>>>>> block on the global lock. That's at least one pattern you can get on x86
>>>>>> (we had a few of such bugs in the past).
>>>>>
>>>>> Any recommendations? Should we just signal the main thread when we want to inject an interrupt?
>>>>
>>>> Yep. That's also what x86 does (when using user space irqchips).
>>>
>>> Hrm, ok :). I guess the main reason we don't see major issues is that
>>>
>>>  1) people don't use iothread too often yet - is it even enabled by default?
>>
>> Nope (unless you use qemu-kvm.git next).
> 
> Any plans on finally doing that step? Code that isn't enabled by default is pretty prone to not be tested ;). It's a good way to slowly move code upstream, stabilize it there and then finally have it enabled by default. But I don't think this process should last more than 1/2 year. And IIRC with iothread, we're way past that point.

That's getting a bit off-topic for this thread now:

The good news is that 'next' will become 'master' fairly soon (unless
some regression is found), and then we are using QEMU upstream's
iothread code in qemu-kvm. By default.

For upstream, we are still facing TCG performance regressions in
iothread mode, thus it's still default off. $Someone would have to sort
them out, and then we could flip defaults there as well.

Jan
Avi Kivity June 6, 2011, 7:41 a.m. UTC | #33
On 06/05/2011 08:19 PM, Alexander Graf wrote:
> On 05.06.2011, at 18:33, Avi Kivity wrote:
>
> >  On 06/05/2011 07:30 PM, Alexander Graf wrote:
> >>  >>
> >>  >>   Could you elaborate what you mean here? I'm not really following. Are
> >>  >>   you suggesting a new arch-generic interface? (Pardon my ignorance).
> >>  >
> >>  >   Using KVM_IRQ_LINE everywhere except s390, not just in x86 and ARM.
> >>
> >>  An in-kernel MPIC implementation is coming for PPC, so I don't see any reason to switch from something that works now.
> >
> >  Right, this is spilled milk.
> >
> >  Does the ppc qemu implementation raise KVM_INTERRUPT solely from the vcpu thread?
>
> Well, without iothread it used to obviously. Now that we have an iothread, it calls ioctl(KVM_INTERRUPT) from a separate thread.

That's 100% broken, as api.txt states.  Besides consistency, this is to 
allow an eventual move to a syscall interface where the vcpu is derived 
from current rather than an fd parameter.

> The code also doesn't forcefully wake up the vcpu thread, so yes, I think here's a chance for at least delaying interrupt delivery. Chances are pretty slim we don't get out of the vcpu thread at all :).

Maybe slim, but still a major bug.
Avi Kivity June 6, 2011, 7:42 a.m. UTC | #34
On 06/05/2011 08:54 PM, Alexander Graf wrote:
> On 05.06.2011, at 19:48, Jan Kiszka wrote:
>
> >  On 2011-06-05 19:19, Alexander Graf wrote:
> >>
> >>  On 05.06.2011, at 18:33, Avi Kivity wrote:
> >>
> >>>  On 06/05/2011 07:30 PM, Alexander Graf wrote:
> >>>>>>
> >>>>>>  Could you elaborate what you mean here? I'm not really following. Are
> >>>>>>  you suggesting a new arch-generic interface? (Pardon my ignorance).
> >>>>>
> >>>>>  Using KVM_IRQ_LINE everywhere except s390, not just in x86 and ARM.
> >>>>
> >>>>  An in-kernel MPIC implementation is coming for PPC, so I don't see any reason to switch from something that works now.
> >>>
> >>>  Right, this is spilled milk.
> >>>
> >>>  Does the ppc qemu implementation raise KVM_INTERRUPT solely from the vcpu thread?
> >>
> >>  Well, without iothread it used to obviously. Now that we have an iothread, it calls ioctl(KVM_INTERRUPT) from a separate thread. The code also doesn't forcefully wake up the vcpu thread, so yes, I think here's a chance for at least delaying interrupt delivery. Chances are pretty slim we don't get out of the vcpu thread at all :).
> >
> >  There are good chances to run into a deadlock when calling a per-vcpu
> >  IOCTL over a foreign context: calling thread holds qemu_mutex and blocks
> >  on kvm_mutex inside the kernel, target vcpu is running endless guest
> >  loop, holding kvm_mutex, all other qemu threads will sooner or later
> >  block on the global lock. That's at least one pattern you can get on x86
> >  (we had a few of such bugs in the past).
>
> Any recommendations? Should we just signal the main thread when we want to inject an interrupt?
>

Signal the vcpu thread, of course.  There's on_vcpu (or on_cpu, don't 
know how it's called today) for that.
Christoffer Dall June 10, 2011, 8:40 a.m. UTC | #35
On Sun, Jun 5, 2011 at 6:27 PM, Alexander Graf <agraf@suse.de> wrote:
>
> On 05.06.2011, at 18:03, Christoffer Dall wrote:
>
> On Sun, Jun 5, 2011 at 2:36 PM, Avi Kivity <avi@redhat.com> wrote:
>
> On 06/03/2011 06:03 PM, Christoffer Dall wrote:
>
>
> [...]
>
> +if VIRTUALIZATION
>
> +
>
> +config KVM
>
> +       bool "Kernel-based Virtual Machine (KVM) support"
>
> +       select PREEMPT_NOTIFIERS
>
> +       select ANON_INODES
>
> +       select KVM_ARM_HOST
>
> +       select KVM_MMIO
>
> +       ---help---
>
> +         Support hosting virtualized guest machines. You will also
>
> +         need to select one or more of the processor modules below.
>
> +
>
> +         This module provides access to the hardware capabilities through
>
> +         a character device node named /dev/kvm.
>
> +
>
> +         If unsure, say N.
>
> I see you can't support a modular build, which is a pity.
>
>
> My concern is that I map in code that needs to be in place for running
> in Hyp mode. Of course I just need to pin these, so it should be
> possible. I'll look into this as well.
>
> We have similar requirements for PPC, so feel free to check how we did it
> there :). On =M we still include reasonably small parts of the code in the
> kernel as =Y, but keep the big chunks outside as module.
>

OK, so I looked into compiling KVM/ARM as a module and there are a few
obstacles:

I need to export several symbols (identitiy_mapping_add,
pgd_clear_bad, __irq_svc). __irq_svc is tricky because it's in the
entry-armv.S assembly file, so it's not obvious where to export it
from. Recommendations? (dedicated KVM export file as in PPC or simply
adding EXPORT_SYMBOL_GPL(...) or alternative?

Also, iirc, modules are loaded using vmalloc, wouldn't that make it
impossible for me to use virt_to_phys, which I need to do on some of
the code pages for Hypervisor initialization? If any of the ARM people
could give an advise here, it would be much appreciated - of course I
can kmalloc a page and relocate the init code, but I really would like
to avoid this.

Thanks!
-Christoffer
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Catalin Marinas June 10, 2011, 9:23 a.m. UTC | #36
On Fri, Jun 10, 2011 at 09:40:34AM +0100, Christoffer Dall wrote:
> OK, so I looked into compiling KVM/ARM as a module and there are a few
> obstacles:
> 
> I need to export several symbols (identitiy_mapping_add,
> pgd_clear_bad, __irq_svc). __irq_svc is tricky because it's in the
> entry-armv.S assembly file, so it's not obvious where to export it
> from. Recommendations? (dedicated KVM export file as in PPC or simply
> adding EXPORT_SYMBOL_GPL(...) or alternative?

There is arch/arm/kernel/armksyms.c where we export extra symbols coming
from .S files. I haven't looked at the code yet, maybe some of the
symbols could be avoided.

> Also, iirc, modules are loaded using vmalloc, wouldn't that make it
> impossible for me to use virt_to_phys, which I need to do on some of
> the code pages for Hypervisor initialization? If any of the ARM people
> could give an advise here, it would be much appreciated - of course I
> can kmalloc a page and relocate the init code, but I really would like
> to avoid this.

You could alloc a page (or a few) an copy the hypervisor code in there.
Just make sure that it is position independent (which is not difficult
with hand-written assembly).

Alternatively, you could set up the Hyp translation tables to also
include the modules area (pretty much duplicating the corresponding
swapper_pg_dir entries, with some attributes changed).

But for now you can probably ignore this issue and get back to it once
everything else is working.
Alexander Graf June 10, 2011, 9:53 a.m. UTC | #37
Am 10.06.2011 um 11:23 schrieb Catalin Marinas <catalin.marinas@arm.com>:

> On Fri, Jun 10, 2011 at 09:40:34AM +0100, Christoffer Dall wrote:
>> OK, so I looked into compiling KVM/ARM as a module and there are a few
>> obstacles:
>> 
>> I need to export several symbols (identitiy_mapping_add,
>> pgd_clear_bad, __irq_svc). __irq_svc is tricky because it's in the
>> entry-armv.S assembly file, so it's not obvious where to export it
>> from. Recommendations? (dedicated KVM export file as in PPC or simply
>> adding EXPORT_SYMBOL_GPL(...) or alternative?
> 
> There is arch/arm/kernel/armksyms.c where we export extra symbols coming
> from .S files. I haven't looked at the code yet, maybe some of the
> symbols could be avoided.
> 
>> Also, iirc, modules are loaded using vmalloc, wouldn't that make it
>> impossible for me to use virt_to_phys, which I need to do on some of
>> the code pages for Hypervisor initialization? If any of the ARM people
>> could give an advise here, it would be much appreciated - of course I
>> can kmalloc a page and relocate the init code, but I really would like
>> to avoid this.
> 
> You could alloc a page (or a few) an copy the hypervisor code in there.
> Just make sure that it is position independent (which is not difficult
> with hand-written assembly).

What we do on ppc is to select another kernel option =Y when we set kvm=M. Based on that other option, we then add the code that need to run in real mode (linear map) to the main kernel binary.

That way, the guest entry/exit code for example is always in the kernel, while the logic that dispatches what to do on a specific guest exit can stay as module, making development a lot easier.


Alex

> 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Catalin Marinas June 10, 2011, 9:58 a.m. UTC | #38
On Fri, Jun 10, 2011 at 10:53:06AM +0100, Alexander Graf wrote:
> Am 10.06.2011 um 11:23 schrieb Catalin Marinas <catalin.marinas@arm.com>:
> > On Fri, Jun 10, 2011 at 09:40:34AM +0100, Christoffer Dall wrote:
> >> OK, so I looked into compiling KVM/ARM as a module and there are a few
> >> obstacles:
> >> 
> >> I need to export several symbols (identitiy_mapping_add,
> >> pgd_clear_bad, __irq_svc). __irq_svc is tricky because it's in the
> >> entry-armv.S assembly file, so it's not obvious where to export it
> >> from. Recommendations? (dedicated KVM export file as in PPC or simply
> >> adding EXPORT_SYMBOL_GPL(...) or alternative?
> > 
> > There is arch/arm/kernel/armksyms.c where we export extra symbols coming
> > from .S files. I haven't looked at the code yet, maybe some of the
> > symbols could be avoided.
> > 
> >> Also, iirc, modules are loaded using vmalloc, wouldn't that make it
> >> impossible for me to use virt_to_phys, which I need to do on some of
> >> the code pages for Hypervisor initialization? If any of the ARM people
> >> could give an advise here, it would be much appreciated - of course I
> >> can kmalloc a page and relocate the init code, but I really would like
> >> to avoid this.
> > 
> > You could alloc a page (or a few) an copy the hypervisor code in there.
> > Just make sure that it is position independent (which is not difficult
> > with hand-written assembly).
> 
> What we do on ppc is to select another kernel option =Y when we set
> kvm=M. Based on that other option, we then add the code that need to
> run in real mode (linear map) to the main kernel binary.
> 
> That way, the guest entry/exit code for example is always in the
> kernel, while the logic that dispatches what to do on a specific guest
> exit can stay as module, making development a lot easier.

Good point. We could do the same on ARM.

Thanks.
Christoffer Dall June 10, 2011, 11:56 a.m. UTC | #39
On Fri, Jun 10, 2011 at 11:58 AM, Catalin Marinas
<catalin.marinas@arm.com> wrote:
> On Fri, Jun 10, 2011 at 10:53:06AM +0100, Alexander Graf wrote:
>> Am 10.06.2011 um 11:23 schrieb Catalin Marinas <catalin.marinas@arm.com>:
>> > On Fri, Jun 10, 2011 at 09:40:34AM +0100, Christoffer Dall wrote:
>> >> OK, so I looked into compiling KVM/ARM as a module and there are a few
>> >> obstacles:
>> >>
>> >> I need to export several symbols (identitiy_mapping_add,
>> >> pgd_clear_bad, __irq_svc). __irq_svc is tricky because it's in the
>> >> entry-armv.S assembly file, so it's not obvious where to export it
>> >> from. Recommendations? (dedicated KVM export file as in PPC or simply
>> >> adding EXPORT_SYMBOL_GPL(...) or alternative?
>> >
>> > There is arch/arm/kernel/armksyms.c where we export extra symbols coming
>> > from .S files. I haven't looked at the code yet, maybe some of the
>> > symbols could be avoided.
>> >
>> >> Also, iirc, modules are loaded using vmalloc, wouldn't that make it
>> >> impossible for me to use virt_to_phys, which I need to do on some of
>> >> the code pages for Hypervisor initialization? If any of the ARM people
>> >> could give an advise here, it would be much appreciated - of course I
>> >> can kmalloc a page and relocate the init code, but I really would like
>> >> to avoid this.
>> >
>> > You could alloc a page (or a few) an copy the hypervisor code in there.
>> > Just make sure that it is position independent (which is not difficult
>> > with hand-written assembly).
>>
>> What we do on ppc is to select another kernel option =Y when we set
>> kvm=M. Based on that other option, we then add the code that need to
>> run in real mode (linear map) to the main kernel binary.
>>
>> That way, the guest entry/exit code for example is always in the
>> kernel, while the logic that dispatches what to do on a specific guest
>> exit can stay as module, making development a lot easier.
>
> Good point. We could do the same on ARM.

Thanks, I will follow your suggestions for next patch series.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index b410049..b2a2b65 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1951,3 +1951,5 @@  source "security/Kconfig"
 source "crypto/Kconfig"
 
 source "lib/Kconfig"
+
+source "arch/arm/kvm/Kconfig"
diff --git a/arch/arm/Makefile b/arch/arm/Makefile
index 6f7b292..72335fc 100644
--- a/arch/arm/Makefile
+++ b/arch/arm/Makefile
@@ -254,6 +254,7 @@  core-$(CONFIG_VFP)		+= arch/arm/vfp/
 
 # If we have a machine-specific directory, then include it in the build.
 core-y				+= arch/arm/kernel/ arch/arm/mm/ arch/arm/common/
+core-$(CONFIG_KVM) 		+= arch/arm/kvm/
 core-y				+= $(machdirs) $(platdirs)
 
 drivers-$(CONFIG_OPROFILE)      += arch/arm/oprofile/
diff --git a/arch/arm/include/asm/kvm.h b/arch/arm/include/asm/kvm.h
new file mode 100644
index 0000000..8311198
--- /dev/null
+++ b/arch/arm/include/asm/kvm.h
@@ -0,0 +1,65 @@ 
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ */
+
+#ifndef __ARM_KVM_H__
+#define __ARM_KVM_H__
+
+#include <asm/types.h>
+
+/*
+ * Modes used for short-hand mode determinition in the world-switch code and
+ * in emulation code.
+ *
+ * Note: These indices do NOT correspond to the value of the CPSR mode bits!
+ */
+#define MODE_FIQ	0
+#define MODE_IRQ	1
+#define MODE_SVC	2
+#define MODE_ABT	3
+#define MODE_UND	4
+#define MODE_USR	5
+#define MODE_SYS	6
+
+struct kvm_regs {
+	__u32 regs0_7[8];	/* Unbanked regs. (r0 - r7)	   */
+	__u32 fiq_regs8_12[5];	/* Banked fiq regs. (r8 - r12)	   */
+	__u32 usr_regs8_12[5];	/* Banked usr registers (r8 - r12) */
+	__u32 reg13[6];		/* Banked r13, indexed by MODE_	   */
+	__u32 reg14[6];		/* Banked r13, indexed by MODE_	   */
+	__u32 reg15;
+	__u32 cpsr;
+	__u32 spsr[5];		/* Banked SPSR,  indexed by MODE_  */
+	struct {
+		__u32 c2_base0;
+		__u32 c2_base1;
+		__u32 c3_dacr;
+	} cp15;
+
+};
+
+struct kvm_sregs {
+};
+
+struct kvm_fpu {
+};
+
+struct kvm_guest_debug_arch {
+};
+
+struct kvm_debug_exit_arch {
+};
+
+#endif /* __ARM_KVM_H__ */
diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h
new file mode 100644
index 0000000..c3d4458
--- /dev/null
+++ b/arch/arm/include/asm/kvm_asm.h
@@ -0,0 +1,28 @@ 
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ */
+
+#ifndef __ARM_KVM_ASM_H__
+#define __ARM_KVM_ASM_H__
+
+#define ARM_EXCEPTION_RESET	  0
+#define ARM_EXCEPTION_UNDEFINED   1
+#define ARM_EXCEPTION_SOFTWARE    2
+#define ARM_EXCEPTION_PREF_ABORT  3
+#define ARM_EXCEPTION_DATA_ABORT  4
+#define ARM_EXCEPTION_IRQ	  5
+#define ARM_EXCEPTION_FIQ	  6
+
+#endif /* __ARM_KVM_ASM_H__ */
diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h
new file mode 100644
index 0000000..8eed752
--- /dev/null
+++ b/arch/arm/include/asm/kvm_emulate.h
@@ -0,0 +1,89 @@ 
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ */
+
+#ifndef __ARM_KVM_EMULATE_H__
+#define __ARM_KVM_EMULATE_H__
+
+#include <linux/kvm_host.h>
+#include <asm/kvm_asm.h>
+
+u32* kvm_vcpu_reg(struct kvm_vcpu *vcpu, u8 reg_num, u32 mode);
+
+static inline unsigned char vcpu_mode(struct kvm_vcpu *vcpu)
+{
+	u8 modes_table[16] = {
+		MODE_USR,	// 0x0
+		MODE_FIQ,	// 0x1
+		MODE_IRQ,	// 0x2
+		MODE_SVC,	// 0x3
+		0xf, 0xf, 0xf,
+		MODE_ABT,	// 0x7
+		0xf, 0xf, 0xf,
+		MODE_UND,	// 0xb
+		0xf, 0xf, 0xf,
+		MODE_SYS};	// 0xf
+
+	BUG_ON(modes_table[vcpu->arch.regs.cpsr & 0xf] == 0xf);
+	return modes_table[vcpu->arch.regs.cpsr & 0xf];
+}
+
+/*
+ * Return the SPSR for the specified mode of the virtual CPU.
+ */
+static inline u32 kvm_vcpu_spsr(struct kvm_vcpu *vcpu, u32 mode)
+{
+	switch (mode) {
+	case MODE_SVC:
+		return vcpu->arch.regs.svc_regs[2];
+	case MODE_ABT:
+		return vcpu->arch.regs.svc_regs[2];
+	case MODE_UND:
+		return vcpu->arch.regs.svc_regs[2];
+	case MODE_IRQ:
+		return vcpu->arch.regs.svc_regs[2];
+	case MODE_FIQ:
+		return vcpu->arch.regs.fiq_regs[7];
+	default:
+		BUG();
+	}
+}
+
+/* Get vcpu register for current mode */
+#define vcpu_reg(_vcpu, _reg_num) \
+	(*kvm_vcpu_reg((_vcpu), _reg_num, vcpu_mode(_vcpu)))
+
+/* Get vcpu register for specific mode */
+#define vcpu_reg_m(_vcpu, _reg_num, _mode) \
+	(*kvm_vcpu_reg(_vcpu, _reg_num, _mode))
+
+#define vcpu_cpsr(_vcpu) \
+	(_vcpu->arch.regs.cpsr)
+
+/* Get vcpu SPSR for current mode */
+#define vcpu_spsr(_vcpu) \
+	kvm_vcpu_spsr(_vcpu, vcpu_mode(_vcpu))
+
+/* Get vcpu SPSR for specific mode */
+#define vcpu_spsr_m(_vcpu, _mode) \
+	kvm_vcpu_spsr(_vcpu, _mode)
+
+#define MODE_HAS_SPSR(_vcpu) \
+	 ((vcpu_mode(_vcpu)) < MODE_USR)
+
+#define VCPU_MODE_PRIV(_vcpu) \
+	(((vcpu_mode(_vcpu)) == MODE_USR) ? 0 : 1)
+
+#endif /* __ARM_KVM_EMULATE_H__ */
diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
new file mode 100644
index 0000000..6e8a08d
--- /dev/null
+++ b/arch/arm/include/asm/kvm_host.h
@@ -0,0 +1,114 @@ 
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ */
+
+#ifndef __ARM_KVM_HOST_H__
+#define __ARM_KVM_HOST_H__
+
+#define KVM_MAX_VCPUS 1
+#define KVM_MEMORY_SLOTS 32
+#define KVM_PRIVATE_MEM_SLOTS 4
+#define KVM_COALESCED_MMIO_PAGE_OFFSET 1
+
+/* We don't currently support large pages. */
+#define KVM_HPAGE_GFN_SHIFT(x)	0
+#define KVM_NR_PAGE_SIZES	1
+#define KVM_PAGES_PER_HPAGE(x)	(1UL<<31)
+
+struct kvm_vcpu;
+u32* kvm_vcpu_reg(struct kvm_vcpu *vcpu, u8 reg_num, u32 mode);
+
+struct kvm_arch {
+};
+
+#define EXCEPTION_NONE      0
+#define EXCEPTION_RESET     0x80
+#define EXCEPTION_UNDEFINED 0x40
+#define EXCEPTION_SOFTWARE  0x20
+#define EXCEPTION_PREFETCH  0x10
+#define EXCEPTION_DATA      0x08
+#define EXCEPTION_IMPRECISE 0x04
+#define EXCEPTION_IRQ       0x02
+#define EXCEPTION_FIQ       0x01
+
+struct kvm_vcpu_regs {
+	u32 usr_regs[15];	/* R0_usr - R14_usr */
+	u32 svc_regs[3];	/* SP_svc, LR_svc, SPSR_svc */
+	u32 abt_regs[3];	/* SP_abt, LR_abt, SPSR_abt */
+	u32 und_regs[3];	/* SP_und, LR_und, SPSR_und */
+	u32 irq_regs[3];	/* SP_irq, LR_irq, SPSR_irq */
+	u32 fiq_regs[8];	/* R8_fiq - R14_fiq, SPSR_fiq */
+	u32 pc;			/* The program counter (r15) */
+	u32 cpsr;		/* Guest emulated CPSR */
+} __packed;
+
+struct kvm_vcpu_arch {
+	/* Pointer to regs struct on shared page */
+	struct kvm_vcpu_regs regs;
+
+	/* Pointer to cached mode on shared page */
+	unsigned long *mode;
+
+	/* System control coprocessor (cp15) */
+	struct {
+		u32 c1_SCTLR;		/* System Control Register */
+		u32 c1_ACTLR;		/* Auxilliary Control Register */
+		u32 c1_CPACR;		/* Coprocessor Access Control Register */
+		u64 c2_TTBR0;		/* Translation Table Base Register 0 */
+		u64 c2_TTBR1;		/* Translation Table Base Register 1 */
+		u32 c2_TTBCR;		/* Translation Table Base Control Register */
+		u32 c3_DACR;		/* Domain Access Control Register */
+	} cp15;
+
+	u32 exception_pending;  	/* Exception to raise after emulation */
+
+	/* Exception Information */
+	u32 hsr;		/* Hyp Syndrom Register */
+	u32 hdfar;		/* Hyp Data Fault Address Register */
+	u32 hifar;		/* Hyp Inst. Fault Address Register */
+	u32 hpfar;		/* Hyp IPA Fault Address Register */
+
+	/* IO related fields */
+	u32 mmio_rd;
+
+	/* Misc. fields */
+	u32 wait_for_interrupts;
+};
+
+struct kvm_vm_stat {
+	u32 remote_tlb_flush;
+};
+
+struct kvm_vcpu_stat {
+	u32 sum_exits;
+	u32 mmio_exits;
+	u32 dcr_exits;
+	u32 signal_exits;
+	u32 light_exits;
+	/* Account for special types of light exits: */
+	u32 itlb_real_miss_exits;
+	u32 itlb_virt_miss_exits;
+	u32 dtlb_real_miss_exits;
+	u32 dtlb_virt_miss_exits;
+	u32 syscall_exits;
+	u32 isi_exits;
+	u32 dsi_exits;
+	u32 emulated_inst_exits;
+	u32 dec_exits;
+	u32 ext_intr_exits;
+	u32 halt_wakeup;
+};
+
+#endif /* __ARM_KVM_HOST_H__ */
diff --git a/arch/arm/include/asm/kvm_para.h b/arch/arm/include/asm/kvm_para.h
new file mode 100644
index 0000000..7ce5f1c
--- /dev/null
+++ b/arch/arm/include/asm/kvm_para.h
@@ -0,0 +1,9 @@ 
+#ifndef _ASM_X86_KVM_PARA_H
+#define _ASM_X86_KVM_PARA_H
+
+static inline unsigned int kvm_arch_para_features(void)
+{
+	return 0;
+}
+
+#endif /* _ASM_X86_KVM_PARA_H */
diff --git a/arch/arm/include/asm/unified.h b/arch/arm/include/asm/unified.h
index bc63116..14ad6bb 100644
--- a/arch/arm/include/asm/unified.h
+++ b/arch/arm/include/asm/unified.h
@@ -54,6 +54,18 @@ 
 
 #endif	/* CONFIG_THUMB2_KERNEL */
 
+#ifdef CONFIG_KVM
+#ifdef __ASSEMBLY__
+.arch_extension sec
+.arch_extension virt
+#else
+__asm__(
+"	.arch_extension sec\n"
+"	.arch_extension virt\n"
+);
+#endif
+#endif
+
 #ifndef CONFIG_ARM_ASM_UNIFIED
 
 /*
diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig
new file mode 100644
index 0000000..1806a6d
--- /dev/null
+++ b/arch/arm/kvm/Kconfig
@@ -0,0 +1,44 @@ 
+#
+# KVM configuration
+#
+
+source "virt/kvm/Kconfig"
+
+menuconfig VIRTUALIZATION
+	bool "Virtualization"
+	---help---
+	  Say Y here to get to see options for using your Linux host to run
+	  other operating systems inside virtual machines (guests).
+	  This option alone does not add any kernel code.
+
+	  If you say N, all options in this submenu will be skipped and
+	  disabled.
+
+if VIRTUALIZATION
+
+config KVM
+	bool "Kernel-based Virtual Machine (KVM) support"
+	select PREEMPT_NOTIFIERS
+	select ANON_INODES
+	select KVM_ARM_HOST
+	select KVM_MMIO
+	---help---
+	  Support hosting virtualized guest machines. You will also
+	  need to select one or more of the processor modules below.
+
+	  This module provides access to the hardware capabilities through
+	  a character device node named /dev/kvm.
+
+	  If unsure, say N.
+
+config KVM_ARM_HOST
+	bool "KVM host support for ARM cpus."
+	depends on KVM
+	depends on MMU
+	depends on CPU_V7 || ARM_VIRT_EXT
+	---help---
+	  Provides host support for ARM processors.
+
+source drivers/virtio/Kconfig
+
+endif # VIRTUALIZATION
diff --git a/arch/arm/kvm/Makefile b/arch/arm/kvm/Makefile
new file mode 100644
index 0000000..4ff905d
--- /dev/null
+++ b/arch/arm/kvm/Makefile
@@ -0,0 +1,13 @@ 
+#
+# Makefile for Kernel-based Virtual Machine module
+#
+
+EXTRA_CFLAGS += -Ivirt/kvm -Iarch/arm/kvm
+AFLAGS_arm_interrupts.o := -I$(obj)
+
+kvm-arm-y += $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o)
+
+kvm-arm-y += arm.o arm_guest.o arm_interrupts.o arm_mmu.o arm_emulate.o \
+		trace.o
+
+obj-$(CONFIG_KVM) += kvm-arm.o
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
new file mode 100644
index 0000000..2157c1e
--- /dev/null
+++ b/arch/arm/kvm/arm.c
@@ -0,0 +1,363 @@ 
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ */
+
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/kvm_host.h>
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+#include <linux/fs.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/mman.h>
+#include <asm/uaccess.h>
+#include <asm/ptrace.h>
+#include <asm/mman.h>
+
+#include "trace.h"
+
+int kvm_arch_hardware_enable(void *garbage)
+{
+	return 0;
+}
+
+void kvm_arch_hardware_disable(void *garbage)
+{
+}
+
+int kvm_arch_hardware_setup(void)
+{
+	return 0;
+}
+
+void kvm_arch_hardware_unsetup(void)
+{
+}
+
+void kvm_arch_check_processor_compat(void *rtn)
+{
+	*(int *)rtn = 0;
+}
+
+void kvm_arch_sync_events(struct kvm *kvm)
+{
+}
+
+int kvm_arch_init_vm(struct kvm *kvm)
+{
+	return 0;
+}
+
+void kvm_arch_destroy_vm(struct kvm *kvm)
+{
+	int i;
+
+	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+		if (kvm->vcpus[i]) {
+			kvm_arch_vcpu_free(kvm->vcpus[i]);
+			kvm->vcpus[i] = NULL;
+		}
+	}
+}
+
+int kvm_dev_ioctl_check_extension(long ext)
+{
+	int r;
+	switch (ext) {
+	case KVM_CAP_USER_MEMORY:
+	case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
+		r = 1;
+		break;
+	case KVM_CAP_COALESCED_MMIO:
+		r = KVM_COALESCED_MMIO_PAGE_OFFSET;
+		break;
+	default:
+		r = 0;
+		break;
+	}
+	return r;
+}
+
+long kvm_arch_dev_ioctl(struct file *filp,
+			unsigned int ioctl, unsigned long arg)
+{
+	int ret = 0;
+
+	switch (ioctl) {
+	default:
+		ret = -EINVAL;
+	}
+
+	if (ret < 0)
+		printk(KERN_ERR "error processing ARM ioct: %d", ret);
+	return ret;
+}
+
+int kvm_arch_set_memory_region(struct kvm *kvm,
+			       struct kvm_userspace_memory_region *mem,
+			       struct kvm_memory_slot old,
+			       int user_alloc)
+{
+	return 0;
+}
+
+int kvm_arch_prepare_memory_region(struct kvm *kvm,
+				   struct kvm_memory_slot *memslot,
+				   struct kvm_memory_slot old,
+				   struct kvm_userspace_memory_region *mem,
+				   int user_alloc)
+{
+	return 0;
+}
+
+void kvm_arch_commit_memory_region(struct kvm *kvm,
+				   struct kvm_userspace_memory_region *mem,
+				   struct kvm_memory_slot old,
+				   int user_alloc)
+{
+}
+
+void kvm_arch_flush_shadow(struct kvm *kvm)
+{
+	// XXX What should this do?
+}
+
+struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
+{
+	int err;
+	struct kvm_vcpu *vcpu;
+
+	vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
+	if (!vcpu) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	err = kvm_vcpu_init(vcpu, kvm, id);
+	if (err)
+		goto free_vcpu;
+
+	return vcpu;
+free_vcpu:
+	kmem_cache_free(kvm_vcpu_cache, vcpu);
+out:
+	return ERR_PTR(err);
+}
+
+void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
+{
+	KVMARM_NOT_IMPLEMENTED();
+}
+
+void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
+{
+	kvm_arch_vcpu_free(vcpu);
+}
+
+int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
+{
+	return 0;
+}
+
+int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
+{
+	KVMARM_NOT_IMPLEMENTED();
+	return 0;
+}
+
+void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
+{
+}
+
+void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+}
+
+void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
+{
+}
+
+int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
+                                        struct kvm_guest_debug *dbg)
+{
+	return -EINVAL;
+}
+
+
+int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
+				    struct kvm_mp_state *mp_state)
+{
+	return -EINVAL;
+}
+
+int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
+				    struct kvm_mp_state *mp_state)
+{
+	return -EINVAL;
+}
+
+int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
+{
+	return 0;
+}
+
+int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
+{
+	KVMARM_NOT_IMPLEMENTED();
+	return -EINVAL;
+}
+
+static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
+				    struct kvm_interrupt *intr)
+{
+	u32 mask;
+
+	switch (intr->irq) {
+	case EXCEPTION_IRQ:
+		/* IRQ */
+		mask = EXCEPTION_IRQ;
+		break;
+	case EXCEPTION_FIQ:
+		/* FIQ */
+		mask = EXCEPTION_FIQ;
+		break;
+	default:
+		/* Only async exceptions are supported here */
+		return -EINVAL;
+	}
+
+	if (intr->raise) {
+		if (mask == EXCEPTION_IRQ)
+			kvm_trace_activity(101, "raise IRQ");
+		else if (mask == EXCEPTION_FIQ)
+			kvm_trace_activity(102, "raise FIQ");
+		vcpu->arch.exception_pending |= mask;
+		vcpu->arch.wait_for_interrupts = 0;
+	} else {
+		if (mask == EXCEPTION_IRQ)
+			kvm_trace_activity(103, "lower IRQ");
+		else if (mask == EXCEPTION_FIQ)
+			kvm_trace_activity(104, "lower FIQ");
+
+		vcpu->arch.exception_pending &= ~mask;
+	}
+
+	return 0;
+}
+
+long kvm_arch_vcpu_ioctl(struct file *filp,
+			 unsigned int ioctl, unsigned long arg)
+{
+	struct kvm_vcpu *vcpu = filp->private_data;
+	void __user *argp = (void __user *)arg;
+	int r;
+
+	switch (ioctl) {
+	case KVM_S390_STORE_STATUS: {
+		return -EINVAL;
+	}
+	case KVM_INTERRUPT: {
+		struct kvm_interrupt intr;
+
+		r = -EFAULT;
+		if (copy_from_user(&intr, argp, sizeof intr))
+			break;
+		r = kvm_vcpu_ioctl_interrupt(vcpu, &intr);
+		break;
+	}
+	default:
+		r = -EINVAL;
+	}
+
+	return r;
+}
+
+int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
+{
+	return -ENOTSUPP;
+}
+
+long kvm_arch_vm_ioctl(struct file *filp,
+		       unsigned int ioctl, unsigned long arg)
+{
+	printk(KERN_ERR "kvm_arch_vm_ioctl: Unsupported ioctl (%d)\n", ioctl);
+	return -EINVAL;
+}
+
+int kvm_arch_init(void *opaque)
+{
+	return 0;
+}
+
+void kvm_arch_exit(void)
+{
+}
+
+static int k_show(struct seq_file *m, void *v)
+{
+	print_kvm_debug_info(&seq_printf, m);
+	return 0;
+}
+
+static void *k_start(struct seq_file *m, loff_t *pos)
+{
+	return *pos < 1 ? (void *)1 : NULL;
+}
+
+static void *k_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	++*pos;
+	return NULL;
+}
+
+static void k_stop(struct seq_file *m, void *v)
+{
+}
+
+static const struct seq_operations kvmproc_op = {
+	.start	= k_start,
+	.next	= k_next,
+	.stop	= k_stop,
+	.show	= k_show
+};
+
+static int kvm_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &kvmproc_op);
+}
+
+static const struct file_operations proc_kvm_operations = {
+	.open		= kvm_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+static int arm_init(void)
+{
+	int rc = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
+	if (rc == 0)
+		proc_create("kvm", 0, NULL, &proc_kvm_operations);
+	return rc;
+}
+
+static void __exit arm_exit(void)
+{
+	kvm_exit();
+}
+
+module_init(arm_init);
+module_exit(arm_exit)
diff --git a/arch/arm/kvm/arm_emulate.c b/arch/arm/kvm/arm_emulate.c
new file mode 100644
index 0000000..3dd4f08
--- /dev/null
+++ b/arch/arm/kvm/arm_emulate.c
@@ -0,0 +1,70 @@ 
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ */
+
+#include <asm/kvm_emulate.h>
+
+/*
+ * Return a pointer to the register number valid in the specified mode of
+ * the virtual CPU.
+ */
+u32* kvm_vcpu_reg(struct kvm_vcpu *vcpu, u8 reg_num, u32 mode)
+{
+	struct kvm_vcpu_regs *regs;
+	u8 reg_idx;
+	BUG_ON(reg_num > 15);
+
+	regs = &vcpu->arch.regs;
+
+	/* The PC is trivial */
+	if (reg_num == 15)
+		return &(regs->pc);
+
+	/* Non-banked registers */
+	if (reg_num < 8)
+		return &(regs->usr_regs[reg_num]);
+
+	/* Banked registers r13 and r14 */
+	if (reg_num >= 13) {
+		reg_idx = reg_num - 13; /* 0=r13 and 1=r14 */
+		switch (mode) {
+		case MODE_FIQ:
+			return &(regs->fiq_regs[reg_idx + 5]);
+		case MODE_IRQ:
+			return &(regs->irq_regs[reg_idx]);
+		case MODE_SVC:
+			return &(regs->svc_regs[reg_idx]);
+		case MODE_ABT:
+			return &(regs->abt_regs[reg_idx]);
+		case MODE_UND:
+			return &(regs->und_regs[reg_idx]);
+		case MODE_USR:
+		case MODE_SYS:
+			return &(regs->usr_regs[reg_idx]);
+		}
+	}
+
+	/* Banked FIQ registers r8-r12 */
+	if (reg_num >= 8 && reg_num <= 12) {
+		if (mode == MODE_FIQ) {
+			reg_idx = reg_num - 8; /* 0=r8, ..., 4=r12 */
+			return &(regs->fiq_regs[reg_idx]);
+		} else
+			return &(regs->usr_regs[reg_num]);
+	}
+
+	BUG();
+	return NULL;
+}
diff --git a/arch/arm/kvm/arm_guest.c b/arch/arm/kvm/arm_guest.c
new file mode 100644
index 0000000..646f60c
--- /dev/null
+++ b/arch/arm/kvm/arm_guest.c
@@ -0,0 +1,142 @@ 
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ */
+
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/kvm_host.h>
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+#include <linux/fs.h>
+#include <asm/uaccess.h>
+#include <asm/kvm_asm.h>
+#include <asm/kvm_emulate.h>
+
+
+#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
+#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
+
+struct kvm_stats_debugfs_item debugfs_entries[] = {
+};
+
+int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
+{
+	return 0;
+}
+
+int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+	struct kvm_vcpu_regs *vcpu_regs = &vcpu->arch.regs;
+
+	/*
+	 * GPRs and PSRs
+	 */
+	memcpy(regs->regs0_7, &(vcpu_regs->usr_regs[0]), sizeof(u32) * 8);
+	memcpy(regs->usr_regs8_12, &(vcpu_regs->usr_regs[8]), sizeof(u32) * 5);
+	memcpy(regs->fiq_regs8_12, &(vcpu_regs->fiq_regs[0]), sizeof(u32) * 5);
+	regs->reg13[MODE_FIQ] = vcpu_regs->fiq_regs[5];
+	regs->reg14[MODE_FIQ] = vcpu_regs->fiq_regs[6];
+	regs->reg13[MODE_IRQ] = vcpu_regs->irq_regs[0];
+	regs->reg14[MODE_IRQ] = vcpu_regs->irq_regs[1];
+	regs->reg13[MODE_SVC] = vcpu_regs->svc_regs[0];
+	regs->reg14[MODE_SVC] = vcpu_regs->svc_regs[1];
+	regs->reg13[MODE_ABT] = vcpu_regs->abt_regs[0];
+	regs->reg14[MODE_ABT] = vcpu_regs->abt_regs[1];
+	regs->reg13[MODE_UND] = vcpu_regs->und_regs[0];
+	regs->reg14[MODE_UND] = vcpu_regs->und_regs[1];
+	regs->reg13[MODE_USR] = vcpu_regs->usr_regs[0];
+	regs->reg14[MODE_USR] = vcpu_regs->usr_regs[1];
+
+	regs->spsr[MODE_FIQ]  = vcpu_regs->fiq_regs[7];
+	regs->spsr[MODE_IRQ]  = vcpu_regs->irq_regs[2];
+	regs->spsr[MODE_SVC]  = vcpu_regs->svc_regs[2];
+	regs->spsr[MODE_ABT]  = vcpu_regs->abt_regs[2];
+	regs->spsr[MODE_UND]  = vcpu_regs->und_regs[2];
+
+	regs->reg15 = vcpu_regs->pc;
+	regs->cpsr = vcpu_regs->cpsr;
+
+
+	/*
+	 * Co-processor registers.
+	 */
+	regs->cp15.c2_base0 = vcpu->arch.cp15.c2_TTBR0;
+	regs->cp15.c2_base1 = vcpu->arch.cp15.c2_TTBR1;
+	regs->cp15.c3_dacr = vcpu->arch.cp15.c3_DACR;
+
+	return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+	struct kvm_vcpu_regs *vcpu_regs = &vcpu->arch.regs;
+
+	memcpy(&(vcpu_regs->usr_regs[0]), regs->regs0_7, sizeof(u32) * 8);
+	memcpy(&(vcpu_regs->usr_regs[8]), regs->usr_regs8_12, sizeof(u32) * 5);
+	memcpy(&(vcpu_regs->fiq_regs[0]), regs->fiq_regs8_12, sizeof(u32) * 5);
+
+	vcpu_regs->fiq_regs[5] = regs->reg13[MODE_FIQ];
+	vcpu_regs->fiq_regs[6] = regs->reg14[MODE_FIQ];
+	vcpu_regs->irq_regs[0] = regs->reg13[MODE_IRQ];
+	vcpu_regs->irq_regs[1] = regs->reg14[MODE_IRQ];
+	vcpu_regs->svc_regs[0] = regs->reg13[MODE_SVC];
+	vcpu_regs->svc_regs[1] = regs->reg14[MODE_SVC];
+	vcpu_regs->abt_regs[0] = regs->reg13[MODE_ABT];
+	vcpu_regs->abt_regs[1] = regs->reg14[MODE_ABT];
+	vcpu_regs->und_regs[0] = regs->reg13[MODE_UND];
+	vcpu_regs->und_regs[1] = regs->reg14[MODE_UND];
+	vcpu_regs->usr_regs[0] = regs->reg13[MODE_USR];
+	vcpu_regs->usr_regs[1] = regs->reg14[MODE_USR];
+
+	vcpu_regs->fiq_regs[7] = regs->spsr[MODE_FIQ];
+	vcpu_regs->irq_regs[2] = regs->spsr[MODE_IRQ];
+	vcpu_regs->svc_regs[2] = regs->spsr[MODE_SVC];
+	vcpu_regs->abt_regs[2] = regs->spsr[MODE_ABT];
+	vcpu_regs->und_regs[2] = regs->spsr[MODE_UND];
+
+	vcpu_regs->pc = regs->reg15;
+	vcpu_regs->cpsr = regs->cpsr;
+
+	return 0;
+}
+
+int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
+				  struct kvm_sregs *sregs)
+{
+	return -ENOTSUPP;
+}
+
+int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
+				  struct kvm_sregs *sregs)
+{
+	return -ENOTSUPP;
+}
+
+int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
+{
+	return -ENOTSUPP;
+}
+
+int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
+{
+	return -ENOTSUPP;
+}
+
+/* 'linear_address' is actually an encoding of AS|PID|EADDR . */
+int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
+				  struct kvm_translation *tr)
+{
+	return 0;
+}
diff --git a/arch/arm/kvm/arm_interrupts.S b/arch/arm/kvm/arm_interrupts.S
new file mode 100644
index 0000000..073a494
--- /dev/null
+++ b/arch/arm/kvm/arm_interrupts.S
@@ -0,0 +1,17 @@ 
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ */
+#include <asm/asm-offsets.h>
+#include <asm/kvm_asm.h>
diff --git a/arch/arm/kvm/arm_mmu.c b/arch/arm/kvm/arm_mmu.c
new file mode 100644
index 0000000..e69de29
diff --git a/arch/arm/kvm/trace.c b/arch/arm/kvm/trace.c
new file mode 100644
index 0000000..8ea1155
--- /dev/null
+++ b/arch/arm/kvm/trace.c
@@ -0,0 +1,436 @@ 
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ */
+#include <linux/types.h>
+#include <linux/kvm_types.h>
+#include <linux/kvm_host.h>
+
+#include <asm/kvm_emulate.h>
+#include "trace.h"
+
+
+/******************************************************************************
+ * Simple event counting
+ */
+
+struct kvm_event {
+	unsigned long long cnt;
+	char *descr;
+};
+
+static struct kvm_event kvm_eventc_log[KVM_EVENTC_ITEMS] =
+{
+	{ 0, "switch to guest" },
+	{ 0, "exit from guest" },
+	{ 0, "Block VCPU" },
+	{ 0, "Exit to QEMU for IRQ window" },
+	{ 0, "Switch VCPU mode" },
+	{ 0, "VCPU IRQs on" },
+	{ 0, "VCPU IRQs off" },
+	{ 0, "Wait-for-interrupts" },
+	{ 0, "Flush shadow page table" },
+	{ 0, "Virtual TTBR change" },
+	{ 0, "Read guest page table entry" },
+	{ 0, "Map GVA to GFN" },
+	{ 0, "Virtual DACR change" },
+	{ 0, "VCPU switch to privileged mode" },
+	{ 0, "VCPU switch from privileged mode" },
+	{ 0, "VCPU process ID registers change" },
+	{ 0, "Emulate Load/Store with translation" },
+	{ 0, "Emulate MRS" },
+	{ 0, "Emulate MSR" },
+	{ 0, "Emulate CPS" },
+	{ 0, "Need reschedule in execution loop" },
+	{ 0, "MCR 7,  5, 0 - Invalidate entire I-cache" },
+	{ 0, "MCR 7,  5, 1 - Invalidate line in I-cache MVA" },
+	{ 0, "MCR 7,  5, 2 - Invalidate line in I-cache set/way" },
+	{ 0, "MCR 7,  5, 7 - Flush branch target cache - MVA" },
+	{ 0, "MCR 7,  6, 0 - Invalidate entire data cache" },
+	{ 0, "MCR 7,  6, 1 - Invalidate data cache line - MVA" },
+	{ 0, "MCR 7,  6, 2 - Invalidate data cache line - set/way" },
+	{ 0, "MCR 7,  7, 0 - Invalidate D- and I-cache" },
+	{ 0, "MCR 7, 10, 0 - Clean entire data cache" },
+	{ 0, "MCR 7, 10, 1 - Clean data cache line - MVA" },
+	{ 0, "MCR 7, 10, 4 - Data Synchronization Barrier (DSB)" },
+	{ 0, "MCR 7, 14, 0 - Clean and invalidate entire D-cache" },
+	{ 0, "MCR 7, 14, 1 - Clean and invalidate D-cache line - MVA" },
+	{ 0, "MCR 7, 15, 0 - Clean and invalidate unified cache" },
+	{ 0, "MCR 8,  5, 0 - Invalidate instruction TLB" },
+	{ 0, "MCR 8,  6, 0 - Invalidate data TLB" },
+	{ 0, "MCR 8,  7, 0 - Invalidate unified TLB" },
+	{ 0, "Emulate Load-Store multiple" },
+};
+
+void kvm_arm_count_event(unsigned int event)
+{
+	if (event >= KVM_EVENTC_ITEMS)
+		return;
+
+	kvm_eventc_log[event].cnt++;
+}
+
+void kvm_arm_init_eventc(void)
+{
+	unsigned int i;
+
+	for (i = 0; i < KVM_EVENTC_ITEMS; i++)
+		kvm_eventc_log[i].cnt = 0;
+}
+
+struct kvm_event_order {
+	struct kvm_event *event;
+	struct kvm_event_order *next;
+	struct kvm_event_order *prev;
+};
+static struct kvm_event_order event_order[KVM_EVENTC_ITEMS];
+
+static struct kvm_event_order *sort_kvm_event_log(void)
+{
+	unsigned int i;
+	struct kvm_event_order *ptr;
+	struct kvm_event_order head =
+		{ .event = NULL, .next = &head, .prev = &head };
+
+	for (i = 0; i < KVM_EVENTC_ITEMS; i++) {
+		event_order[i].event = &kvm_eventc_log[i];
+		ptr = head.next;
+		while (ptr->event != NULL &&
+		       ptr->event->cnt > kvm_eventc_log[i].cnt) {
+			ptr = ptr->next;
+		}
+		ptr->prev->next = &event_order[i];
+		event_order[i].prev = ptr->prev;
+		event_order[i].next = ptr;
+		ptr->prev = &event_order[i];
+	}
+
+	head.prev->next = NULL; /* Mark end of linked list */
+	return head.next;
+}
+
+/******************************************************************************
+ * Trace ring-buffer local to KVM/ARM
+ */
+
+#define KVM_TRACE_ACTIVITY
+#ifndef KVM_TRACE_ACTIVITY
+void kvm_trace_activity(unsigned int activity, char *fmt, ...)
+{
+}
+#else
+
+#define ACTIVITY_TRACE_ITEMS 50
+#define TRACE_DESCR_LEN 80
+static u32 activity_trace[ACTIVITY_TRACE_ITEMS];
+static u32 activity_trace_cnt[ACTIVITY_TRACE_ITEMS];
+static char activity_trace_descr[ACTIVITY_TRACE_ITEMS][TRACE_DESCR_LEN];
+static int activity_trace_index = 0;
+static bool trace_init = false;
+
+void kvm_trace_activity(unsigned int activity, char *fmt, ...)
+{
+	va_list ap;
+	unsigned int i;
+	char *ptr;
+
+	if (!trace_init) {
+		for (i = 0; i < ACTIVITY_TRACE_ITEMS; i++)
+			activity_trace_descr[i][0] = '\0';
+		trace_init = true;
+	}
+
+	if (activity_trace[activity_trace_index] == activity) {
+		activity_trace_cnt[activity_trace_index]++;
+	} else {
+		activity_trace_index = (activity_trace_index + 1)
+			% ACTIVITY_TRACE_ITEMS;
+		activity_trace[activity_trace_index] = activity;
+		activity_trace_cnt[activity_trace_index] = 0;
+
+		ptr = activity_trace_descr[activity_trace_index];
+		va_start(ap, fmt);
+		vsnprintf(ptr, TRACE_DESCR_LEN, fmt, ap);
+		va_end(ap);
+	}
+}
+#endif
+
+/******************************************************************************
+ * World-switch ring-buffer
+ */
+
+#define WS_TRACE_ITEMS 10
+static u32 ws_trace_enter[WS_TRACE_ITEMS];
+static int ws_trace_enter_index = 0;
+static u32 ws_trace_exit[WS_TRACE_ITEMS];
+static int ws_trace_exit_index = 0;
+static u32 ws_trace_exit_codes[WS_TRACE_ITEMS];
+DEFINE_MUTEX(ws_trace_mutex);
+
+void trace_ws_enter(u32 guest_pc)
+{
+	mutex_lock(&ws_trace_mutex);
+	ws_trace_enter[ws_trace_enter_index++] = guest_pc;
+	if (ws_trace_enter_index >= WS_TRACE_ITEMS)
+		ws_trace_enter_index = 0;
+	mutex_unlock(&ws_trace_mutex);
+}
+
+void trace_ws_exit(u32 guest_pc, u32 exit_code)
+{
+	mutex_lock(&ws_trace_mutex);
+	ws_trace_exit[ws_trace_exit_index] = guest_pc;
+	ws_trace_exit_codes[ws_trace_exit_index++] = exit_code;
+	if (ws_trace_exit_index >= WS_TRACE_ITEMS)
+		ws_trace_exit_index = 0;
+	mutex_unlock(&ws_trace_mutex);
+}
+
+void print_ws_trace(void)
+{
+	int i;
+	mutex_lock(&ws_trace_mutex);
+
+	if (ws_trace_enter_index != ws_trace_exit_index) {
+		kvm_msg("enter and exit WS trace count differ");
+		mutex_unlock(&ws_trace_mutex);
+		return;
+	}
+
+	/* Avoid potential endless loop */
+	if (ws_trace_enter_index < 0 || ws_trace_enter_index >= WS_TRACE_ITEMS) {
+		kvm_msg("ws_trace_enter_index out of bounds: %d",
+				ws_trace_enter_index);
+		mutex_unlock(&ws_trace_mutex);
+		return;
+	}
+
+	for (i = ws_trace_enter_index - 1; i != ws_trace_enter_index; i--) {
+		if (i < 0) {
+			i = WS_TRACE_ITEMS;
+			continue;
+		}
+
+		printk(KERN_ERR "Enter: %08x    Exit: %08x (%d)\n",
+			ws_trace_enter[i],
+			ws_trace_exit[i],
+			ws_trace_exit_codes[i]);
+	}
+	mutex_unlock(&ws_trace_mutex);
+}
+
+/******************************************************************************
+ * Dump total debug info, or write to /proc/kvm
+ */
+
+struct kvm_vcpu *latest_vcpu = NULL;
+
+void print_kvm_debug_info(int (*print_fn)(print_fn_args), struct seq_file *m)
+{
+	int i;
+	struct kvm_vcpu_regs *regs;
+	char *mode = NULL;
+	char *exceptions[7];
+	struct kvm_vcpu *vcpu = latest_vcpu;
+	struct kvm_event_order *ptr;
+
+	print_fn(m, "KVM/ARM runtime info\n");
+	print_fn(m, "======================================================");
+	print_fn(m, "\n\n");
+
+	if (vcpu == NULL) {
+		print_fn(m, "No registered VCPU\n");
+		goto print_ws_hist;
+	}
+
+
+	switch (vcpu_mode(vcpu)) {
+		case MODE_USR:	mode = "USR"; break;
+		case MODE_FIQ:	mode = "FIQ"; break;
+		case MODE_IRQ:	mode = "IRQ"; break;
+		case MODE_SVC:	mode = "SVC"; break;
+		case MODE_ABT:	mode = "ABT"; break;
+		case MODE_UND:	mode = "UND"; break;
+		case MODE_SYS:	mode = "SYS"; break;
+	}
+
+	vcpu_load(vcpu);
+	regs = &vcpu->arch.regs;
+
+	print_fn(m, "Virtual CPU state:\n\n");
+	print_fn(m, "PC is at: \t%08x\n", vcpu_reg(vcpu, 15));
+	print_fn(m, "CPSR:     \t%08x\n(Mode: %s)  (IRQs: %s)  (FIQs: %s) "
+		      "  (Vec: %s)\n",
+		      regs->cpsr, mode,
+		      (regs->cpsr & PSR_I_BIT) ? "off" : "on",
+		      (regs->cpsr & PSR_F_BIT) ? "off" : "on",
+		      (regs->cpsr & PSR_V_BIT) ? "high" : "low");
+
+	for (i = 0; i <= 12; i++) {
+		if ((i % 4) == 0)
+			print_fn(m, "\nregs[%u]: ", i);
+
+		print_fn(m, "\t0x%08x", vcpu_reg_m(vcpu, i, MODE_USR));
+	}
+
+	print_fn(m, "\n\n");
+	print_fn(m, "Banked registers:  \tr13\t\tr14\t\tspsr\n");
+	print_fn(m, "-------------------\t--------\t--------\t--------\n");
+	print_fn(m, "             USR:  \t%08x\t%08x\t////////\n",
+			vcpu_reg_m(vcpu, 13, MODE_USR),
+			vcpu_reg_m(vcpu, 14, MODE_USR));
+	print_fn(m, "             SVC:  \t%08x\t%08x\t%08x\n",
+			vcpu_reg_m(vcpu, 13, MODE_SVC),
+			vcpu_reg_m(vcpu, 14, MODE_SVC),
+			vcpu_spsr_m(vcpu, MODE_SVC));
+	print_fn(m, "             ABT:  \t%08x\t%08x\t%08x\n",
+			vcpu_reg_m(vcpu, 13, MODE_ABT),
+			vcpu_reg_m(vcpu, 14, MODE_ABT),
+			vcpu_spsr_m(vcpu, MODE_ABT));
+	print_fn(m, "             UND:  \t%08x\t%08x\t%08x\n",
+			vcpu_reg_m(vcpu, 13, MODE_UND),
+			vcpu_reg_m(vcpu, 14, MODE_UND),
+			vcpu_spsr_m(vcpu, MODE_UND));
+	print_fn(m, "             IRQ:  \t%08x\t%08x\t%08x\n",
+			vcpu_reg_m(vcpu, 13, MODE_IRQ),
+			vcpu_reg_m(vcpu, 14, MODE_IRQ),
+			vcpu_spsr_m(vcpu, MODE_IRQ));
+	print_fn(m, "             FIQ:  \t%08x\t%08x\t%08x\n",
+			vcpu_reg_m(vcpu, 13, MODE_FIQ),
+			vcpu_reg_m(vcpu, 14, MODE_FIQ),
+			vcpu_spsr_m(vcpu, MODE_FIQ));
+
+	print_fn(m, "\n");
+	print_fn(m, "fiq regs:\t%08x\t%08x\t%08x\t%08x\n"
+			  "         \t%08x\n",
+			vcpu_reg_m(vcpu, 8, MODE_FIQ),
+			vcpu_reg_m(vcpu, 9, MODE_FIQ),
+			vcpu_reg_m(vcpu, 10, MODE_FIQ),
+			vcpu_reg_m(vcpu, 11, MODE_FIQ),
+			vcpu_reg_m(vcpu, 12, MODE_FIQ));
+
+print_ws_hist:
+	/*
+	 * Print world-switch trace circular buffer
+	 */
+	print_fn(m, "\n\nWorld switch history:\n");
+	print_fn(m, "---------------------\n");
+	mutex_lock(&ws_trace_mutex);
+
+	if (ws_trace_enter_index != ws_trace_exit_index ||
+			ws_trace_enter_index < 0 ||
+			ws_trace_enter_index >= WS_TRACE_ITEMS)
+	{
+		mutex_unlock(&ws_trace_mutex);
+		goto print_trace_activity;
+	}
+
+	exceptions[0] = "reset";
+	exceptions[1] = "undefined";
+	exceptions[2] = "software";
+	exceptions[3] = "prefetch abort";
+	exceptions[4] = "data abort";
+	exceptions[5] = "irq";
+	exceptions[6] = "fiq";
+
+	for (i = ws_trace_enter_index - 1; i != ws_trace_enter_index; i--) {
+		if (i < 0) {
+			i = WS_TRACE_ITEMS;
+			continue;
+		}
+
+		print_fn(m, "Enter: %08x    Exit: %08x (%s)\n",
+			ws_trace_enter[i], ws_trace_exit[i],
+			exceptions[ws_trace_exit_codes[i]]);
+	}
+	mutex_unlock(&ws_trace_mutex);
+
+print_trace_activity:
+#ifdef KVM_TRACE_ACTIVITY
+	/*
+	 * Print activity trace
+	 */
+	print_fn(m, "\n\nActivity circular buffer:\n");
+	print_fn(m, "-----------------------------\n");
+	for (i = activity_trace_index - 1; i != activity_trace_index; i--) {
+		if (i < 0) {
+			i = ACTIVITY_TRACE_ITEMS;
+			continue;
+		}
+
+		print_fn(m, "%lu: \t %s\n",
+				activity_trace_cnt[i],
+				activity_trace_descr[i]);
+	}
+#endif
+
+	/*
+	 * Print event counters sorted
+	 */
+	print_fn(m, "\n\nEvent counters:\n");
+	print_fn(m, "-----------------------------\n");
+	ptr = sort_kvm_event_log();
+	while (ptr != NULL) {
+		if (ptr->event->cnt > 0) {
+			print_fn(m, "%12llu  #  %s\n", ptr->event->cnt,
+							ptr->event->descr);
+		}
+		ptr = ptr->next;
+	}
+
+	if (vcpu != NULL) {
+		vcpu_put(vcpu);
+	}
+}
+
+static int __printk_relay(struct seq_file *m, const char *fmt, ...)
+{
+	va_list ap;
+	va_start(ap, fmt);
+	vprintk(fmt, ap);
+	va_end(ap);
+	return 0;
+}
+
+void kvm_dump_vcpu_state(void)
+{
+	print_kvm_debug_info(&__printk_relay, NULL);
+}
+
+/******************************************************************************
+ * Printk-log-wrapping functionality
+ */
+
+#define TMP_LOG_LEN 512
+static char __tmp_log_data[TMP_LOG_LEN];
+DEFINE_MUTEX(__tmp_log_lock);
+void __kvm_print_msg(char *fmt, ...)
+{
+	va_list ap;
+	unsigned int size;
+
+	mutex_lock(&__tmp_log_lock);
+
+	va_start(ap, fmt);
+	size = vsnprintf(__tmp_log_data, TMP_LOG_LEN, fmt, ap);
+	va_end(ap);
+
+	if (size >= TMP_LOG_LEN)
+		printk("Message exceeded log length!\n");
+	else
+		printk("%s", __tmp_log_data);
+
+	mutex_unlock(&__tmp_log_lock);
+}
diff --git a/arch/arm/kvm/trace.h b/arch/arm/kvm/trace.h
new file mode 100644
index 0000000..020240a
--- /dev/null
+++ b/arch/arm/kvm/trace.h
@@ -0,0 +1,108 @@ 
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ *
+ *
+ * This file contains debugging and tracing functions and definitions for KVM/ARM.
+ *
+ */
+#ifndef __ARM_KVM_TRACE_H__
+#define __ARM_KVM_TRACE_H__
+
+#include <linux/types.h>
+#include <linux/kvm_types.h>
+#include <linux/kvm_host.h>
+
+#define EVENT_GUEST_ENTER	0
+#define EVENT_GUEST_EXIT	1
+#define EVENT_VCPU_BLOCK	2
+#define EVENT_IRQ_WINDOW	3
+#define EVENT_SWITCH_MODE	4
+#define EVENT_VCPU_IRQS_ON	5
+#define EVENT_VCPU_IRQS_OFF	6
+#define EVENT_WFI		7
+#define EVENT_FLUSH_SHADOW	8
+#define EVENT_MOD_TTBR		9
+#define EVENT_READ_GUEST_ENTRY	10
+#define EVENT_MAP_GVA_TO_GFN	11
+#define EVENT_DACR_CHANGE	12
+#define EVENT_SWITCH_PRIV	13
+#define EVENT_SWITCH_USER	14
+#define EVENT_VCPU_ASID		15
+#define EVENT_LS_TRANS		16
+#define EVENT_EMUL_MRS		17
+#define EVENT_EMUL_MSR		18
+#define EVENT_EMUL_CPS		19
+#define EVENT_NEED_RESCHED	20
+#define EVENT_MCR_7_5_0		21
+#define EVENT_MCR_7_5_1		22
+#define EVENT_MCR_7_5_2		23
+#define EVENT_MCR_7_5_7		24
+#define EVENT_MCR_7_6_0		25
+#define EVENT_MCR_7_6_1		26
+#define EVENT_MCR_7_6_2		27
+#define EVENT_MCR_7_7_0		28
+#define EVENT_MCR_7_10_0	29
+#define EVENT_MCR_7_10_1	30
+#define EVENT_MCR_7_10_4	31
+#define EVENT_MCR_7_14_0	32
+#define EVENT_MCR_7_14_1	33
+#define EVENT_MCR_7_15_0	34
+#define EVENT_MCR_8_5_X		35
+#define EVENT_MCR_8_6_X		36
+#define EVENT_MCR_8_7_X		37
+#define EVENT_EMUL_LSMULT	38
+
+#define KVM_EVENTC_ITEMS	39
+
+void kvm_arm_init_eventc(void);
+void kvm_arm_count_event(unsigned int event);
+void kvm_dump_vcpu_state(void);
+
+void trace_ws_enter(u32 guest_pc);
+void trace_ws_exit(u32 guest_pc, u32 exit_code);
+
+
+#define print_fn_args struct seq_file *, const char *, ...
+void print_kvm_debug_info(int (*print_fn)(print_fn_args), struct seq_file *m);
+
+
+void __kvm_print_msg(char *_fmt, ...);
+
+#define kvm_err(err, fmt, args...) do {			\
+	__kvm_print_msg(KERN_ERR "KVM error [%s:%d]: (%d) ", \
+			__FUNCTION__, __LINE__, err); \
+	__kvm_print_msg(fmt "\n", ##args); \
+} while (0)
+
+#define __kvm_msg(fmt, args...) do {			\
+	__kvm_print_msg(KERN_ERR "KVM [%s:%d]: ", __FUNCTION__, __LINE__); \
+	__kvm_print_msg(fmt, ##args); \
+} while (0)
+
+#define kvm_msg(__fmt, __args...) __kvm_msg(__fmt "\n", ##__args)
+
+
+#define KVMARM_NOT_IMPLEMENTED() \
+   { \
+	    printk(KERN_ERR "KVM not implemented [%s:%d] in %s \n", \
+		   __FILE__, __LINE__, __FUNCTION__); \
+   }
+
+extern bool trace_gva_to_gfn;
+void print_shadow_mapping(struct kvm_vcpu *vcpu, gva_t gva);
+void print_ws_trace(void);
+void kvm_trace_activity(unsigned int activity, char *fmt, ...);
+
+#endif  /* __ARM_KVM_TRACE_H__ */
diff --git a/arch/arm/mach-vexpress/Kconfig b/arch/arm/mach-vexpress/Kconfig
index cfbc0f1..5b00905 100644
--- a/arch/arm/mach-vexpress/Kconfig
+++ b/arch/arm/mach-vexpress/Kconfig
@@ -33,6 +33,7 @@  config ARCH_VEXPRESS_CA15X4
 	bool "Versatile Express Cortex-A15x4 tile"
 	depends on VEXPRESS_EXTENDED_MEMORY_MAP
 	select CPU_V7
+	select ARM_VIRT_EXT
 	select ARM_GIC
 	select HAVE_ARCH_TIMERS
 
diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig
index 04de742..ad77805 100644
--- a/arch/arm/mm/Kconfig
+++ b/arch/arm/mm/Kconfig
@@ -628,6 +628,13 @@  config ARM_LPAE
 	  Say Y if you have an ARMv7 processor supporting the LPAE page table
 	  format and you would like to access memory beyond the 4GB limit.
 
+config ARM_VIRT_EXT
+	bool "Support for ARM Virtualization Extensions"
+	depends on ARM_LPAE
+	help
+	  Say Y if you have an ARMv7 processor supporting the ARM hardware
+	  Virtualization extensions.
+
 config ARCH_PHYS_ADDR_T_64BIT
 	def_bool ARM_LPAE
 
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index ea2dc1a..d2ab07e 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -310,6 +310,7 @@  struct kvm_translation {
 struct kvm_interrupt {
 	/* in */
 	__u32 irq;
+	__u8  raise;
 };
 
 /* for KVM_GET_DIRTY_LOG */