
[v3,7/8] ARM: KVM: Handle guest faults in KVM

Message ID 20110603150414.17011.72525.stgit@ubuntu (mailing list archive)
State New, archived

Commit Message

Christoffer Dall June 3, 2011, 3:04 p.m. UTC
Handles guest faults in KVM by mapping in the corresponding user pages
in the 2nd stage page tables.

Introduces a new ARM-specific kernel memory type, PAGE_KVM_GUEST, and a
pgprot_guest variable, both used to map 2nd stage memory for KVM guests.
---
 arch/arm/include/asm/pgtable-3level.h |    9 +++
 arch/arm/include/asm/pgtable.h        |    4 ++
 arch/arm/kernel/asm-offsets.c         |    3 +
 arch/arm/kvm/arm_interrupts.S         |   14 +++++
 arch/arm/kvm/arm_mmu.c                |   86 +++++++++++++++++++++++++++++++++
 arch/arm/mm/mmu.c                     |    3 +
 6 files changed, 118 insertions(+), 1 deletions(-)
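
One detail worth noting for the abort path below: on the ARM
virtualization extensions, HPFAR holds bits [39:12] of the faulting IPA
in its bits [31:4], which is why kvm_handle_guest_abort() masks off the
low four bits and shifts left by 8. A worked example (hpfar_to_ipa() is
an illustrative helper for this note, not part of the patch):

#define HPFAR_MASK	(~0xf)

static inline phys_addr_t hpfar_to_ipa(unsigned long hpfar)
{
	/* HPFAR[31:4] = FIPA[39:12], so bit 4 lands at IPA bit 12 */
	return ((phys_addr_t)hpfar & HPFAR_MASK) << 8;
}

/* e.g. hpfar_to_ipa(0x00001230) == 0x00123000 */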



Comments

Avi Kivity June 5, 2011, 12:48 p.m. UTC | #1
On 06/03/2011 06:04 PM, Christoffer Dall wrote:
> Handles guest faults in KVM by mapping in the corresponding user pages
> in the 2nd stage page tables.
>
>
>
> +static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> +			  gfn_t gfn, struct kvm_memory_slot *memslot)
> +{
> +	pfn_t pfn;
> +	pgd_t *pgd;
> +	pmd_t *pmd;
> +	pte_t *pte, new_pte;
> +
> +	pfn = gfn_to_pfn(vcpu->kvm, gfn);
> +
> +	if (is_error_pfn(pfn)) {
> +		kvm_err(-EFAULT, "Guest gfn %u (0x%08lx) does not have "
> +				"corresponding host mapping",
> +				gfn, gfn << PAGE_SHIFT);
> +		return -EFAULT;
> +	}
> +
> +	/* Create 2nd stage page table mapping - Level 1 */
> +	pgd = vcpu->kvm->arch.pgd + pgd_index(fault_ipa);
> +	if (pgd_none(*pgd)) {
> +		pmd = pmd_alloc_one(NULL, fault_ipa);
> +		if (!pmd) {
> +			kvm_err(-ENOMEM, "Cannot allocate 2nd stage pmd");
> +			return -ENOMEM;
> +		}
> +		pgd_populate(NULL, pgd, pmd);
> +		pmd += pmd_index(fault_ipa);
> +	} else
> +		pmd = pmd_offset(pgd, fault_ipa);
> +
> +	/* Create 2nd stage page table mapping - Level 2 */
> +	if (pmd_none(*pmd)) {
> +		pte = pte_alloc_one_kernel(NULL, fault_ipa);
> +		if (!pte) {
> +			kvm_err(-ENOMEM, "Cannot allocate 2nd stage pte");
> +			return -ENOMEM;
> +		}
> +		pmd_populate_kernel(NULL, pmd, pte);
> +		pte += pte_index(fault_ipa);
> +	} else
> +		pte = pte_offset_kernel(pmd, fault_ipa);
> +
> +	/* Create 2nd stage page table mapping - Level 3 */
> +	new_pte = pfn_pte(pfn, PAGE_KVM_GUEST);
> +	set_pte_ext(pte, new_pte, 0);
> +
> +	return 0;
> +}
> +
> +#define HSR_ABT_FS	(0x3f)
> +#define HPFAR_MASK	(~0xf)
>   int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
>   {
> +	unsigned long hsr_ec;
> +	unsigned long fault_status;
> +	phys_addr_t fault_ipa;
> +	struct kvm_memory_slot *memslot = NULL;
> +	bool is_iabt;
> +	gfn_t gfn;
> +
> +	hsr_ec = vcpu->arch.hsr >> HSR_EC_SHIFT;
> +	is_iabt = (hsr_ec == HSR_EC_IABT);
> +
> +	/* Check that the second stage fault is a translation fault */
> +	fault_status = vcpu->arch.hsr & HSR_ABT_FS;
> +	if ((fault_status & 0x3c) != 0x4) {
> +		kvm_err(-EFAULT, "Unsupported fault status: %x",
> +				fault_status & 0x3c);
> +		return -EFAULT;
> +	}
> +
> +	fault_ipa = ((phys_addr_t)vcpu->arch.hpfar & HPFAR_MASK) << 8;
> +
> +	gfn = fault_ipa >> PAGE_SHIFT;
> +	if (!kvm_is_visible_gfn(vcpu->kvm, gfn))
> +		goto io_mem_abort;
> +
> +	memslot = gfn_to_memslot(vcpu->kvm, gfn);
> +	if (memslot->user_alloc)
> +		return user_mem_abort(vcpu, fault_ipa, gfn, memslot);

Non-user_alloc slots should not exist for ARM (and are not supported on
x86 these days, except for a few implementation-internal slots).

> +
> +io_mem_abort:
> +	if (is_iabt) {
> +		kvm_err(-EFAULT, "Inst. abort on I/O address");
> +		return -EFAULT;
> +	}
> +
> +	kvm_msg("I/O address abort...");
>   	KVMARM_NOT_IMPLEMENTED();
>   	return -EINVAL;
>   }

Okay, this is about a zillion times simpler than x86.  Congratulations.

What are your thoughts about mmu notifier support?
Christoffer Dall June 11, 2011, 10:37 a.m. UTC | #2
On Sun, Jun 5, 2011 at 2:48 PM, Avi Kivity <avi@redhat.com> wrote:
> On 06/03/2011 06:04 PM, Christoffer Dall wrote:
>>
>> Handles guest faults in KVM by mapping in the corresponding user pages
>> in the 2nd stage page tables.
>>
>>
>>
>> +static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>> +                         gfn_t gfn, struct kvm_memory_slot *memslot)
>> +{
>> +       pfn_t pfn;
>> +       pgd_t *pgd;
>> +       pmd_t *pmd;
>> +       pte_t *pte, new_pte;
>> +
>> +       pfn = gfn_to_pfn(vcpu->kvm, gfn);
>> +
>> +       if (is_error_pfn(pfn)) {
>> +               kvm_err(-EFAULT, "Guest gfn %u (0x%08lx) does not have "
>> +                               "corresponding host mapping",
>> +                               gfn, gfn << PAGE_SHIFT);
>> +               return -EFAULT;
>> +       }
>> +
>> +       /* Create 2nd stage page table mapping - Level 1 */
>> +       pgd = vcpu->kvm->arch.pgd + pgd_index(fault_ipa);
>> +       if (pgd_none(*pgd)) {
>> +               pmd = pmd_alloc_one(NULL, fault_ipa);
>> +               if (!pmd) {
>> +                       kvm_err(-ENOMEM, "Cannot allocate 2nd stage pmd");
>> +                       return -ENOMEM;
>> +               }
>> +               pgd_populate(NULL, pgd, pmd);
>> +               pmd += pmd_index(fault_ipa);
>> +       } else
>> +               pmd = pmd_offset(pgd, fault_ipa);
>> +
>> +       /* Create 2nd stage page table mapping - Level 2 */
>> +       if (pmd_none(*pmd)) {
>> +               pte = pte_alloc_one_kernel(NULL, fault_ipa);
>> +               if (!pte) {
>> +                       kvm_err(-ENOMEM, "Cannot allocate 2nd stage pte");
>> +                       return -ENOMEM;
>> +               }
>> +               pmd_populate_kernel(NULL, pmd, pte);
>> +               pte += pte_index(fault_ipa);
>> +       } else
>> +               pte = pte_offset_kernel(pmd, fault_ipa);
>> +
>> +       /* Create 2nd stage page table mapping - Level 3 */
>> +       new_pte = pfn_pte(pfn, PAGE_KVM_GUEST);
>> +       set_pte_ext(pte, new_pte, 0);
>> +
>> +       return 0;
>> +}
>> +
>> +#define HSR_ABT_FS     (0x3f)
>> +#define HPFAR_MASK     (~0xf)
>>  int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
>>  {
>> +       unsigned long hsr_ec;
>> +       unsigned long fault_status;
>> +       phys_addr_t fault_ipa;
>> +       struct kvm_memory_slot *memslot = NULL;
>> +       bool is_iabt;
>> +       gfn_t gfn;
>> +
>> +       hsr_ec = vcpu->arch.hsr >> HSR_EC_SHIFT;
>> +       is_iabt = (hsr_ec == HSR_EC_IABT);
>> +
>> +       /* Check that the second stage fault is a translation fault */
>> +       fault_status = vcpu->arch.hsr & HSR_ABT_FS;
>> +       if ((fault_status & 0x3c) != 0x4) {
>> +               kvm_err(-EFAULT, "Unsupported fault status: %x",
>> +                               fault_status & 0x3c);
>> +               return -EFAULT;
>> +       }
>> +
>> +       fault_ipa = ((phys_addr_t)vcpu->arch.hpfar & HPFAR_MASK) << 8;
>> +
>> +       gfn = fault_ipa >> PAGE_SHIFT;
>> +       if (!kvm_is_visible_gfn(vcpu->kvm, gfn))
>> +               goto io_mem_abort;
>> +
>> +       memslot = gfn_to_memslot(vcpu->kvm, gfn);
>> +       if (memslot->user_alloc)
>> +               return user_mem_abort(vcpu, fault_ipa, gfn, memslot);
>
> Non-user_alloc slots should not exist for ARM (and are not supported on
> x86 these days, except for a few implementation-internal slots).

ok, I raise an error when (!memslot->user_alloc) instead now. thanks.
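
A minimal sketch of what that adjustment could look like (the revised
code is not shown in this thread, so the error message and return value
below are assumptions):

	memslot = gfn_to_memslot(vcpu->kvm, gfn);
	if (!memslot->user_alloc) {
		/* assumed wording; all ARM memslots come from userspace */
		kvm_err(-EINVAL, "non user-alloc memslots not supported");
		return -EINVAL;
	}
	return user_mem_abort(vcpu, fault_ipa, gfn, memslot);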

>
>> +
>> +io_mem_abort:
>> +       if (is_iabt) {
>> +               kvm_err(-EFAULT, "Inst. abort on I/O address");
>> +               return -EFAULT;
>> +       }
>> +
>> +       kvm_msg("I/O address abort...");
>>        KVMARM_NOT_IMPLEMENTED();
>>        return -EINVAL;
>>  }
>
> Okay, this is about a zillion times simpler than x86.  Congratulations.

Well, I need to handle the I/O aborts, but it's quite simple. What
makes it much more complicated on x86?

>
> What are your thoughts about mmu notifier support?

For what purpose? There is no swapping on ARM, so the only case that
jumps to my mind is KSM. And I'm not quite there yet :)
Avi Kivity June 12, 2011, 8:24 a.m. UTC | #3
On 06/11/2011 01:37 PM, Christoffer Dall wrote:
> >
> >  Okay, this is about a zillion times simpler than x86.  Congratulations.
>
> Well, I need to handle the I/O aborts, but it's quite simple. What
> makes it much more complicated on x86?

- lack of nested paging on earlier processors
- 97 different paging modes
- lots of extra bits bringing in weird functionality
- lots of optimizations

> >
> >  What are your thoughts about mmu notifier support?
>
> For what purpose? There is no swapping on ARM, so the only case that
> jumps to my mind is KSM. And I'm not quite there yet :)

Really?  I imagine swapping will be needed for server workloads.  mmu
notifiers are also useful for transparent hugepages and page 
migrations.  I imagine these will all follow if ARM servers take off.
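
For context, the arch hooks that the generic KVM mmu-notifier glue
expects (opted into with KVM_ARCH_WANT_MMU_NOTIFIER) looked roughly
like the sketch below at the time. This is a sketch only, not part of
the patch: stage2_clear_pte() is an assumed helper that would zap the
2nd stage mapping for an IPA, and the memslot walk follows the
then-current struct kvm_memslots layout.

int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
{
	int i;
	struct kvm_memslots *slots = kvm_memslots(kvm);

	for (i = 0; i < slots->nmemslots; i++) {
		struct kvm_memory_slot *memslot = &slots->memslots[i];
		unsigned long start = memslot->userspace_addr;
		unsigned long end = start + (memslot->npages << PAGE_SHIFT);

		if (hva >= start && hva < end) {
			gfn_t gfn = memslot->base_gfn +
				    ((hva - start) >> PAGE_SHIFT);

			spin_lock(&kvm->mmu_lock);
			stage2_clear_pte(kvm, (phys_addr_t)gfn << PAGE_SHIFT);
			spin_unlock(&kvm->mmu_lock);
			return 1;	/* tell the caller we flushed */
		}
	}
	return 0;
}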
Christoffer Dall June 12, 2011, 8:57 a.m. UTC | #4
>> >
>> >  What are your thoughts about mmu notifier support?
>>
>> For what purpose? There is no swapping on ARM, so the only case that
>> jumps to my mind is KSM. And I'm not quite there yet :)
>
> Really?  I imagine swapping will be needed for server workloads.  mmu
> notifiers are also useful for transparent hugepages and page migrations.  I
> imagine these will all follow if ARM servers take off.
>

You may be right, but I guess it depends on how ARM servers are going
to be used. I agree though, swapping could very well be useful in an
ARM server scenario, and at that point mmu notifier integration should
be looked into, but it's not on my critical path yet.

Thanks for the input.

Patch

diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h
index 14a3e28..f90d120 100644
--- a/arch/arm/include/asm/pgtable-3level.h
+++ b/arch/arm/include/asm/pgtable-3level.h
@@ -103,4 +103,13 @@ 
  */
 #define L_PGD_SWAPPER		(_AT(pgdval_t, 1) << 55)	/* swapper_pg_dir entry */
 
+/*
+ * 2-nd stage PTE definitions for LPAE.
+ */
+#define L_PTE2_READ		(_AT(pteval_t, 1) << 6)		/* HAP[0] */
+#define L_PTE2_WRITE		(_AT(pteval_t, 1) << 7)		/* HAP[1] */
+#define L_PTE2_NORM_WB		(_AT(pteval_t, 3) << 4)		/* MemAttr[3:2] */
+#define L_PTE2_INNER_WB		(_AT(pteval_t, 3) << 2)		/* MemAttr[1:0] */
+
+
 #endif /* _ASM_PGTABLE_3LEVEL_H */
diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h
index 2906f35..c4e71ff 100644
--- a/arch/arm/include/asm/pgtable.h
+++ b/arch/arm/include/asm/pgtable.h
@@ -75,6 +75,7 @@  extern void __pgd_error(const char *file, int line, pgd_t);
 
 extern pgprot_t		pgprot_user;
 extern pgprot_t		pgprot_kernel;
+extern pgprot_t		pgprot_guest;
 
 #define _MOD_PROT(p, b)	__pgprot(pgprot_val(p) | (b))
 
@@ -88,6 +89,9 @@  extern pgprot_t		pgprot_kernel;
 #define PAGE_KERNEL		_MOD_PROT(pgprot_kernel, L_PTE_XN)
 #define PAGE_KERNEL_EXEC	pgprot_kernel
 #define PAGE_HYP		_MOD_PROT(pgprot_kernel, L_PTE_USER)
+#define PAGE_KVM_GUEST		_MOD_PROT(pgprot_guest, L_PTE2_READ | \
+					  L_PTE2_WRITE | L_PTE2_NORM_WB | \
+					  L_PTE2_INNER_WB)
 
 #define __PAGE_NONE		__pgprot(_L_PTE_DEFAULT | L_PTE_RDONLY | L_PTE_XN)
 #define __PAGE_SHARED		__pgprot(_L_PTE_DEFAULT | L_PTE_USER | L_PTE_XN)
diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c
index 769fa97..9f485aa 100644
--- a/arch/arm/kernel/asm-offsets.c
+++ b/arch/arm/kernel/asm-offsets.c
@@ -139,6 +139,9 @@  int main(void)
   DEFINE(VCPU_PC,		offsetof(struct kvm_vcpu, arch.regs.pc));
   DEFINE(VCPU_CPSR,		offsetof(struct kvm_vcpu, arch.regs.cpsr));
   DEFINE(VCPU_HSR,		offsetof(struct kvm_vcpu, arch.hsr));
+  DEFINE(VCPU_HDFAR,		offsetof(struct kvm_vcpu, arch.hdfar));
+  DEFINE(VCPU_HIFAR,		offsetof(struct kvm_vcpu, arch.hifar));
+  DEFINE(VCPU_HPFAR,		offsetof(struct kvm_vcpu, arch.hpfar));
   DEFINE(KVM_VTTBR,		offsetof(struct kvm, arch.vttbr));
 #endif
   return 0; 
diff --git a/arch/arm/kvm/arm_interrupts.S b/arch/arm/kvm/arm_interrupts.S
index 6d3044c..689b337 100644
--- a/arch/arm/kvm/arm_interrupts.S
+++ b/arch/arm/kvm/arm_interrupts.S
@@ -483,7 +483,19 @@  guest_trap:
 	stmia	r1, {r3, r4, r5}
 	sub	r1, r1, #VCPU_USR_REG(0)
 
-	mov	r0, #ARM_EXCEPTION_HVC
+	@ Check if we need the fault information
+	lsr	r0, r0, #HSR_EC_SHIFT
+	cmp	r0, #HSR_EC_IABT
+	beq	2f
+	cmp	r0, #HSR_EC_DABT
+	beq	2f
+	b	1f
+2:	mrc	p15, 4, r2, c6, c0, 0	@ HDFAR
+	mrc	p15, 4, r3, c6, c0, 2	@ HIFAR
+	mrc	p15, 4, r4, c6, c0, 4	@ HPFAR
+	add	r5, r1, #VCPU_HDFAR
+	stmia	r5, {r2, r3, r4}
+1:	mov	r0, #ARM_EXCEPTION_HVC
 	b	__kvm_vcpu_return
 
 	.align
diff --git a/arch/arm/kvm/arm_mmu.c b/arch/arm/kvm/arm_mmu.c
index 683f971..fe27e59 100644
--- a/arch/arm/kvm/arm_mmu.c
+++ b/arch/arm/kvm/arm_mmu.c
@@ -248,8 +248,94 @@  void kvm_free_stage2_pgd(struct kvm *kvm)
 	KVMARM_NOT_IMPLEMENTED();
 }
 
+static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
+			  gfn_t gfn, struct kvm_memory_slot *memslot)
+{
+	pfn_t pfn;
+	pgd_t *pgd;
+	pmd_t *pmd;
+	pte_t *pte, new_pte;
+
+	pfn = gfn_to_pfn(vcpu->kvm, gfn);
+
+	if (is_error_pfn(pfn)) {
+		kvm_err(-EFAULT, "Guest gfn %u (0x%08lx) does not have "
+				"corresponding host mapping",
+				gfn, gfn << PAGE_SHIFT);
+		return -EFAULT;
+	}
+
+	/* Create 2nd stage page table mapping - Level 1 */
+	pgd = vcpu->kvm->arch.pgd + pgd_index(fault_ipa);
+	if (pgd_none(*pgd)) {
+		pmd = pmd_alloc_one(NULL, fault_ipa);
+		if (!pmd) {
+			kvm_err(-ENOMEM, "Cannot allocate 2nd stage pmd");
+			return -ENOMEM;
+		}
+		pgd_populate(NULL, pgd, pmd);
+		pmd += pmd_index(fault_ipa);
+	} else
+		pmd = pmd_offset(pgd, fault_ipa);
+
+	/* Create 2nd stage page table mapping - Level 2 */
+	if (pmd_none(*pmd)) {
+		pte = pte_alloc_one_kernel(NULL, fault_ipa);
+		if (!pte) {
+			kvm_err(-ENOMEM, "Cannot allocate 2nd stage pte");
+			return -ENOMEM;
+		}
+		pmd_populate_kernel(NULL, pmd, pte);
+		pte += pte_index(fault_ipa);
+	} else
+		pte = pte_offset_kernel(pmd, fault_ipa);
+
+	/* Create 2nd stage page table mapping - Level 3 */
+	new_pte = pfn_pte(pfn, PAGE_KVM_GUEST);
+	set_pte_ext(pte, new_pte, 0);
+
+	return 0;
+}
+
+#define HSR_ABT_FS	(0x3f)
+#define HPFAR_MASK	(~0xf)
 int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
 {
+	unsigned long hsr_ec;
+	unsigned long fault_status;
+	phys_addr_t fault_ipa;
+	struct kvm_memory_slot *memslot = NULL;
+	bool is_iabt;
+	gfn_t gfn;
+
+	hsr_ec = vcpu->arch.hsr >> HSR_EC_SHIFT;
+	is_iabt = (hsr_ec == HSR_EC_IABT);
+
+	/* Check that the second stage fault is a translation fault */
+	fault_status = vcpu->arch.hsr & HSR_ABT_FS;
+	if ((fault_status & 0x3c) != 0x4) {
+		kvm_err(-EFAULT, "Unsupported fault status: %x",
+				fault_status & 0x3c);
+		return -EFAULT;
+	}
+
+	fault_ipa = ((phys_addr_t)vcpu->arch.hpfar & HPFAR_MASK) << 8;
+
+	gfn = fault_ipa >> PAGE_SHIFT;
+	if (!kvm_is_visible_gfn(vcpu->kvm, gfn))
+		goto io_mem_abort;
+
+	memslot = gfn_to_memslot(vcpu->kvm, gfn);
+	if (memslot->user_alloc)
+		return user_mem_abort(vcpu, fault_ipa, gfn, memslot);
+
+io_mem_abort:
+	if (is_iabt) {
+		kvm_err(-EFAULT, "Inst. abort on I/O address");
+		return -EFAULT;
+	}
+
+	kvm_msg("I/O address abort...");
 	KVMARM_NOT_IMPLEMENTED();
 	return -EINVAL;
 }
diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
index d1da559..c5cbcd3 100644
--- a/arch/arm/mm/mmu.c
+++ b/arch/arm/mm/mmu.c
@@ -55,9 +55,11 @@  static unsigned int cachepolicy __initdata = CPOLICY_WRITEBACK;
 static unsigned int ecc_mask __initdata = 0;
 pgprot_t pgprot_user;
 pgprot_t pgprot_kernel;
+pgprot_t pgprot_guest;
 
 EXPORT_SYMBOL(pgprot_user);
 EXPORT_SYMBOL(pgprot_kernel);
+EXPORT_SYMBOL(pgprot_guest);
 
 struct cachepolicy {
 	const char	policy[16];
@@ -497,6 +499,7 @@  static void __init build_mem_type_table(void)
 	pgprot_user   = __pgprot(L_PTE_PRESENT | L_PTE_YOUNG | user_pgprot);
 	pgprot_kernel = __pgprot(L_PTE_PRESENT | L_PTE_YOUNG |
 				 L_PTE_DIRTY | kern_pgprot);
+	pgprot_guest  = __pgprot(L_PTE_PRESENT | L_PTE_YOUNG);
 
 	mem_types[MT_LOW_VECTORS].prot_l1 |= ecc_mask;
 	mem_types[MT_HIGH_VECTORS].prot_l1 |= ecc_mask;