Message ID | 1380832591-4789-2-git-send-email-christoffer.dall@linaro.org (mailing list archive) |
---|---|
State | New, archived |
Hi Christoffer, On 03/10/13 21:36, Christoffer Dall wrote: > Support huge pages in KVM/ARM and KVM/ARM64. The pud_huge checking on > the unmap path may feel a bit silly as the pud_huge check is always > defined to false, but the compiler should be smart about this. > > Note: This deals only with VMAs marked as huge which are allocated by > users through hugetlbfs only. Transparent huge pages can only be > detected by looking at the underlying pages (or the page tables > themselves) and this patch so far simply maps these on a page-by-page > level in the Stage-2 page tables. > > Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org> > > --- > Changelog[v2]: > - Invaliate TLB on clearing stage-2 huge page entries > - Restructure unmap_range to share more code > - Share stage_2 pmd allocation > - Factor out THP support to separate patch > - Fix incorrect gfn_to_pfn_prot and mmu_seq ordering > --- > arch/arm/include/asm/kvm_mmu.h | 17 +++- > arch/arm/kvm/mmu.c | 172 +++++++++++++++++++++++++++++--------- > arch/arm64/include/asm/kvm_mmu.h | 12 ++- > 3 files changed, 157 insertions(+), 44 deletions(-) > > diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h > index 9b28c41..d629fb3 100644 > --- a/arch/arm/include/asm/kvm_mmu.h > +++ b/arch/arm/include/asm/kvm_mmu.h > @@ -62,6 +62,12 @@ phys_addr_t kvm_get_idmap_vector(void); > int kvm_mmu_init(void); > void kvm_clear_hyp_idmap(void); > > +static inline void kvm_set_pmd(pmd_t *pmd, pmd_t new_pmd) > +{ > + *pmd = new_pmd; > + flush_pmd_entry(pmd); > +} > + > static inline void kvm_set_pte(pte_t *pte, pte_t new_pte) > { > *pte = new_pte; > @@ -103,9 +109,15 @@ static inline void kvm_set_s2pte_writable(pte_t *pte) > pte_val(*pte) |= L_PTE_S2_RDWR; > } > > +static inline void kvm_set_s2pmd_writable(pmd_t *pmd) > +{ > + pmd_val(*pmd) |= L_PTE_S2_RDWR; It feels a bit weird to use a PTE specific flag to mess with a PMD. Can you define a new flag for this? > +} > + > struct kvm; > > -static inline void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn) > +static inline void coherent_icache_guest_page(struct kvm *kvm, hva_t hva, > + unsigned long size) > { > /* > * If we are going to insert an instruction page and the icache is > @@ -120,8 +132,7 @@ static inline void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn) > * need any kind of flushing (DDI 0406C.b - Page B3-1392). 
> */ > if (icache_is_pipt()) { > - unsigned long hva = gfn_to_hva(kvm, gfn); > - __cpuc_coherent_user_range(hva, hva + PAGE_SIZE); > + __cpuc_coherent_user_range(hva, hva + size); > } else if (!icache_is_vivt_asid_tagged()) { > /* any kind of VIPT cache */ > __flush_icache_all(); > diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c > index b0de86b..cab031b 100644 > --- a/arch/arm/kvm/mmu.c > +++ b/arch/arm/kvm/mmu.c > @@ -19,6 +19,7 @@ > #include <linux/mman.h> > #include <linux/kvm_host.h> > #include <linux/io.h> > +#include <linux/hugetlb.h> > #include <trace/events/kvm.h> > #include <asm/pgalloc.h> > #include <asm/cacheflush.h> > @@ -41,6 +42,8 @@ static unsigned long hyp_idmap_start; > static unsigned long hyp_idmap_end; > static phys_addr_t hyp_idmap_vector; > > +#define kvm_pmd_huge(_x) (pmd_huge(_x)) > + > static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) > { > /* > @@ -93,19 +96,29 @@ static bool page_empty(void *ptr) > > static void clear_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr) > { > - pmd_t *pmd_table = pmd_offset(pud, 0); > - pud_clear(pud); > - kvm_tlb_flush_vmid_ipa(kvm, addr); > - pmd_free(NULL, pmd_table); > + if (pud_huge(*pud)) { > + pud_clear(pud); > + kvm_tlb_flush_vmid_ipa(kvm, addr); > + } else { > + pmd_t *pmd_table = pmd_offset(pud, 0); > + pud_clear(pud); > + kvm_tlb_flush_vmid_ipa(kvm, addr); > + pmd_free(NULL, pmd_table); > + } > put_page(virt_to_page(pud)); > } > > static void clear_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr) > { > - pte_t *pte_table = pte_offset_kernel(pmd, 0); > - pmd_clear(pmd); > - kvm_tlb_flush_vmid_ipa(kvm, addr); > - pte_free_kernel(NULL, pte_table); > + if (kvm_pmd_huge(*pmd)) { > + pmd_clear(pmd); > + kvm_tlb_flush_vmid_ipa(kvm, addr); > + } else { > + pte_t *pte_table = pte_offset_kernel(pmd, 0); > + pmd_clear(pmd); > + kvm_tlb_flush_vmid_ipa(kvm, addr); > + pte_free_kernel(NULL, pte_table); > + } > put_page(virt_to_page(pmd)); > } > > @@ -136,18 +149,32 @@ static void unmap_range(struct kvm *kvm, pgd_t *pgdp, > continue; > } > > + if (pud_huge(*pud)) { > + /* > + * If we are dealing with a huge pud, just clear it and > + * move on. 
> + */ > + clear_pud_entry(kvm, pud, addr); > + addr = pud_addr_end(addr, end); > + continue; > + } > + > pmd = pmd_offset(pud, addr); > if (pmd_none(*pmd)) { > addr = pmd_addr_end(addr, end); > continue; > } > > - pte = pte_offset_kernel(pmd, addr); > - clear_pte_entry(kvm, pte, addr); > - next = addr + PAGE_SIZE; > + if (!kvm_pmd_huge(*pmd)) { > + pte = pte_offset_kernel(pmd, addr); > + clear_pte_entry(kvm, pte, addr); > + next = addr + PAGE_SIZE; > + } > > - /* If we emptied the pte, walk back up the ladder */ > - if (page_empty(pte)) { > + /* > + * If the pmd entry is to be cleared, walk back up the ladder > + */ > + if (kvm_pmd_huge(*pmd) || page_empty(pte)) { > clear_pmd_entry(kvm, pmd, addr); > next = pmd_addr_end(addr, end); > if (page_empty(pmd) && !page_empty(pud)) { > @@ -420,29 +447,71 @@ void kvm_free_stage2_pgd(struct kvm *kvm) > kvm->arch.pgd = NULL; > } > > - > -static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, > - phys_addr_t addr, const pte_t *new_pte, bool iomap) > +static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, > + phys_addr_t addr) > { > pgd_t *pgd; > pud_t *pud; > pmd_t *pmd; > - pte_t *pte, old_pte; > > - /* Create 2nd stage page table mapping - Level 1 */ > pgd = kvm->arch.pgd + pgd_index(addr); > pud = pud_offset(pgd, addr); > if (pud_none(*pud)) { > if (!cache) > - return 0; /* ignore calls from kvm_set_spte_hva */ > + return NULL; > pmd = mmu_memory_cache_alloc(cache); > pud_populate(NULL, pud, pmd); > get_page(virt_to_page(pud)); > } > > - pmd = pmd_offset(pud, addr); > + return pmd_offset(pud, addr); > +} > + > +static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache > + *cache, phys_addr_t addr, const pmd_t *new_pmd) > +{ > + pmd_t *pmd, old_pmd; > + > + pmd = stage2_get_pmd(kvm, cache, addr); > + VM_BUG_ON(!pmd); > > - /* Create 2nd stage page table mapping - Level 2 */ > + /* > + * Mapping in huge pages should only happen through a fault. If a > + * page is merged into a transparent huge page, the individual > + * subpages of that huge page should be unmapped through MMU > + * notifiers before we get here. > + * > + * Merging of CompoundPages is not supported; they should become > + * splitting first, unmapped, merged, and mapped back in on-demand. > + */ > + VM_BUG_ON(pmd_present(*pmd) && pmd_pfn(*pmd) != pmd_pfn(*new_pmd)); > + > + old_pmd = *pmd; > + kvm_set_pmd(pmd, *new_pmd); > + if (pmd_present(old_pmd)) > + kvm_tlb_flush_vmid_ipa(kvm, addr); > + else > + get_page(virt_to_page(pmd)); > + return 0; > +} > + > +static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, > + phys_addr_t addr, const pte_t *new_pte, bool iomap) > +{ > + pmd_t *pmd; > + pte_t *pte, old_pte; > + > + /* Create stage-2 page table mapping - Level 1 */ > + pmd = stage2_get_pmd(kvm, cache, addr); > + if (!pmd) { > + /* > + * Ignore calls from kvm_set_spte_hva for unallocated > + * address ranges. 
> + */ > + return 0; > + } > + > + /* Create stage-2 page mappings - Level 2 */ > if (pmd_none(*pmd)) { > if (!cache) > return 0; /* ignore calls from kvm_set_spte_hva */ > @@ -508,15 +577,18 @@ out: > } > > static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, > - gfn_t gfn, struct kvm_memory_slot *memslot, > + struct kvm_memory_slot *memslot, > unsigned long fault_status) > { > - pte_t new_pte; > - pfn_t pfn; > int ret; > - bool write_fault, writable; > + bool write_fault, writable, hugetlb = false, force_pte = false; > unsigned long mmu_seq; > + gfn_t gfn = fault_ipa >> PAGE_SHIFT; > + unsigned long hva = gfn_to_hva(vcpu->kvm, gfn); > + struct kvm *kvm = vcpu->kvm; > struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache; > + struct vm_area_struct *vma; > + pfn_t pfn; > > write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu)); > if (fault_status == FSC_PERM && !write_fault) { > @@ -524,6 +596,18 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, > return -EFAULT; > } > > + /* Let's check if we will get back a huge page backed by hugetlbfs */ > + down_read(¤t->mm->mmap_sem); > + vma = find_vma_intersection(current->mm, hva, hva + 1); > + if (is_vm_hugetlb_page(vma)) { > + hugetlb = true; > + gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT; > + } else { > + if (vma->vm_start & ~PMD_MASK) > + force_pte = true; What is this test for exactly? I have the feeling this is actually for THP (since nothing in this patch is actually using force_pte), but I don't really get it. It probably deserves a comment. > + } > + up_read(¤t->mm->mmap_sem); > + > /* We need minimum second+third level pages */ > ret = mmu_topup_memory_cache(memcache, 2, KVM_NR_MEM_OBJS); > if (ret) > @@ -541,26 +625,38 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, > */ > smp_rmb(); > > - pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write_fault, &writable); > + pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable); > if (is_error_pfn(pfn)) > return -EFAULT; > > - new_pte = pfn_pte(pfn, PAGE_S2); > - coherent_icache_guest_page(vcpu->kvm, gfn); > - > - spin_lock(&vcpu->kvm->mmu_lock); > - if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) > + spin_lock(&kvm->mmu_lock); > + if (mmu_notifier_retry(kvm, mmu_seq)) > goto out_unlock; > - if (writable) { > - kvm_set_s2pte_writable(&new_pte); > - kvm_set_pfn_dirty(pfn); > + > + if (hugetlb) { > + pmd_t new_pmd = pfn_pmd(pfn, PAGE_S2); > + new_pmd = pmd_mkhuge(new_pmd); > + if (writable) { > + kvm_set_s2pmd_writable(&new_pmd); > + kvm_set_pfn_dirty(pfn); > + } > + coherent_icache_guest_page(kvm, hva & PMD_MASK, PMD_SIZE); > + ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd); > + } else { > + pte_t new_pte = pfn_pte(pfn, PAGE_S2); > + if (writable) { > + kvm_set_s2pte_writable(&new_pte); > + kvm_set_pfn_dirty(pfn); > + } > + coherent_icache_guest_page(kvm, hva, PAGE_SIZE); > + ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, false); > } > - stage2_set_pte(vcpu->kvm, memcache, fault_ipa, &new_pte, false); > + > > out_unlock: > - spin_unlock(&vcpu->kvm->mmu_lock); > + spin_unlock(&kvm->mmu_lock); > kvm_release_pfn_clean(pfn); > - return 0; > + return ret; > } > > /** > @@ -629,7 +725,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run) > > memslot = gfn_to_memslot(vcpu->kvm, gfn); > > - ret = user_mem_abort(vcpu, fault_ipa, gfn, memslot, fault_status); > + ret = user_mem_abort(vcpu, fault_ipa, memslot, fault_status); > if (ret == 0) > ret = 1; > out_unlock: > diff --git 
a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h > index efe609c..e7f3852 100644 > --- a/arch/arm64/include/asm/kvm_mmu.h > +++ b/arch/arm64/include/asm/kvm_mmu.h > @@ -91,6 +91,7 @@ int kvm_mmu_init(void); > void kvm_clear_hyp_idmap(void); > > #define kvm_set_pte(ptep, pte) set_pte(ptep, pte) > +#define kvm_set_pmd(pmdp, pmd) set_pmd(pmdp, pmd) > > static inline bool kvm_is_write_fault(unsigned long esr) > { > @@ -116,13 +117,18 @@ static inline void kvm_set_s2pte_writable(pte_t *pte) > pte_val(*pte) |= PTE_S2_RDWR; > } > > +static inline void kvm_set_s2pmd_writable(pmd_t *pmd) > +{ > + pmd_val(*pmd) |= PTE_S2_RDWR; > +} Same comment as above. > struct kvm; > > -static inline void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn) > +static inline void coherent_icache_guest_page(struct kvm *kvm, hva_t hva, > + unsigned long size) > { > if (!icache_is_aliasing()) { /* PIPT */ > - unsigned long hva = gfn_to_hva(kvm, gfn); > - flush_icache_range(hva, hva + PAGE_SIZE); > + flush_icache_range(hva, hva + size); > } else if (!icache_is_aivivt()) { /* non ASID-tagged VIVT */ > /* any kind of VIPT cache */ > __flush_icache_all(); It is otherwise starting to look good. M.
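For reference, a dedicated stage-2 PMD write flag along the lines Marc asks for might look like the sketch below. This is only an illustration; the L_PMD_S2_RDWR name and its placement are assumptions, not something defined by this patch. In the LPAE descriptor format the HAP[2:1] permission field sits at bits [7:6] of both page and block descriptors, so the encoding mirrors L_PTE_S2_RDWR with a pmdval_t type.

	/* arch/arm/include/asm/pgtable-3level.h -- hypothetical addition */
	#define L_PMD_S2_RDWR		(_AT(pmdval_t, 3) << 6)	/* HAP[2:1] */

	/* arch/arm/include/asm/kvm_mmu.h -- kvm_set_s2pmd_writable then becomes */
	static inline void kvm_set_s2pmd_writable(pmd_t *pmd)
	{
		pmd_val(*pmd) |= L_PMD_S2_RDWR;
	}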
On Fri, Oct 04, 2013 at 09:50:35AM +0100, Marc Zyngier wrote: > Hi Christoffer, > > On 03/10/13 21:36, Christoffer Dall wrote: > > Support huge pages in KVM/ARM and KVM/ARM64. The pud_huge checking on > > the unmap path may feel a bit silly as the pud_huge check is always > > defined to false, but the compiler should be smart about this. > > > > Note: This deals only with VMAs marked as huge which are allocated by > > users through hugetlbfs only. Transparent huge pages can only be > > detected by looking at the underlying pages (or the page tables > > themselves) and this patch so far simply maps these on a page-by-page > > level in the Stage-2 page tables. > > > > Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org> > > > > --- > > Changelog[v2]: > > - Invaliate TLB on clearing stage-2 huge page entries > > - Restructure unmap_range to share more code > > - Share stage_2 pmd allocation > > - Factor out THP support to separate patch > > - Fix incorrect gfn_to_pfn_prot and mmu_seq ordering > > --- > > arch/arm/include/asm/kvm_mmu.h | 17 +++- > > arch/arm/kvm/mmu.c | 172 +++++++++++++++++++++++++++++--------- > > arch/arm64/include/asm/kvm_mmu.h | 12 ++- > > 3 files changed, 157 insertions(+), 44 deletions(-) > > > > diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h > > index 9b28c41..d629fb3 100644 > > --- a/arch/arm/include/asm/kvm_mmu.h > > +++ b/arch/arm/include/asm/kvm_mmu.h > > @@ -62,6 +62,12 @@ phys_addr_t kvm_get_idmap_vector(void); > > int kvm_mmu_init(void); > > void kvm_clear_hyp_idmap(void); > > > > +static inline void kvm_set_pmd(pmd_t *pmd, pmd_t new_pmd) > > +{ > > + *pmd = new_pmd; > > + flush_pmd_entry(pmd); > > +} > > + > > static inline void kvm_set_pte(pte_t *pte, pte_t new_pte) > > { > > *pte = new_pte; > > @@ -103,9 +109,15 @@ static inline void kvm_set_s2pte_writable(pte_t *pte) > > pte_val(*pte) |= L_PTE_S2_RDWR; > > } > > > > +static inline void kvm_set_s2pmd_writable(pmd_t *pmd) > > +{ > > + pmd_val(*pmd) |= L_PTE_S2_RDWR; > > It feels a bit weird to use a PTE specific flag to mess with a PMD. Can > you define a new flag for this? > yup > > +} > > + > > struct kvm; > > > > -static inline void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn) > > +static inline void coherent_icache_guest_page(struct kvm *kvm, hva_t hva, > > + unsigned long size) > > { > > /* > > * If we are going to insert an instruction page and the icache is > > @@ -120,8 +132,7 @@ static inline void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn) > > * need any kind of flushing (DDI 0406C.b - Page B3-1392). 
> > */ > > if (icache_is_pipt()) { > > - unsigned long hva = gfn_to_hva(kvm, gfn); > > - __cpuc_coherent_user_range(hva, hva + PAGE_SIZE); > > + __cpuc_coherent_user_range(hva, hva + size); > > } else if (!icache_is_vivt_asid_tagged()) { > > /* any kind of VIPT cache */ > > __flush_icache_all(); > > diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c > > index b0de86b..cab031b 100644 > > --- a/arch/arm/kvm/mmu.c > > +++ b/arch/arm/kvm/mmu.c > > @@ -19,6 +19,7 @@ > > #include <linux/mman.h> > > #include <linux/kvm_host.h> > > #include <linux/io.h> > > +#include <linux/hugetlb.h> > > #include <trace/events/kvm.h> > > #include <asm/pgalloc.h> > > #include <asm/cacheflush.h> > > @@ -41,6 +42,8 @@ static unsigned long hyp_idmap_start; > > static unsigned long hyp_idmap_end; > > static phys_addr_t hyp_idmap_vector; > > > > +#define kvm_pmd_huge(_x) (pmd_huge(_x)) > > + > > static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) > > { > > /* > > @@ -93,19 +96,29 @@ static bool page_empty(void *ptr) > > > > static void clear_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr) > > { > > - pmd_t *pmd_table = pmd_offset(pud, 0); > > - pud_clear(pud); > > - kvm_tlb_flush_vmid_ipa(kvm, addr); > > - pmd_free(NULL, pmd_table); > > + if (pud_huge(*pud)) { > > + pud_clear(pud); > > + kvm_tlb_flush_vmid_ipa(kvm, addr); > > + } else { > > + pmd_t *pmd_table = pmd_offset(pud, 0); > > + pud_clear(pud); > > + kvm_tlb_flush_vmid_ipa(kvm, addr); > > + pmd_free(NULL, pmd_table); > > + } > > put_page(virt_to_page(pud)); > > } > > > > static void clear_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr) > > { > > - pte_t *pte_table = pte_offset_kernel(pmd, 0); > > - pmd_clear(pmd); > > - kvm_tlb_flush_vmid_ipa(kvm, addr); > > - pte_free_kernel(NULL, pte_table); > > + if (kvm_pmd_huge(*pmd)) { > > + pmd_clear(pmd); > > + kvm_tlb_flush_vmid_ipa(kvm, addr); > > + } else { > > + pte_t *pte_table = pte_offset_kernel(pmd, 0); > > + pmd_clear(pmd); > > + kvm_tlb_flush_vmid_ipa(kvm, addr); > > + pte_free_kernel(NULL, pte_table); > > + } > > put_page(virt_to_page(pmd)); > > } > > > > @@ -136,18 +149,32 @@ static void unmap_range(struct kvm *kvm, pgd_t *pgdp, > > continue; > > } > > > > + if (pud_huge(*pud)) { > > + /* > > + * If we are dealing with a huge pud, just clear it and > > + * move on. 
> > + */ > > + clear_pud_entry(kvm, pud, addr); > > + addr = pud_addr_end(addr, end); > > + continue; > > + } > > + > > pmd = pmd_offset(pud, addr); > > if (pmd_none(*pmd)) { > > addr = pmd_addr_end(addr, end); > > continue; > > } > > > > - pte = pte_offset_kernel(pmd, addr); > > - clear_pte_entry(kvm, pte, addr); > > - next = addr + PAGE_SIZE; > > + if (!kvm_pmd_huge(*pmd)) { > > + pte = pte_offset_kernel(pmd, addr); > > + clear_pte_entry(kvm, pte, addr); > > + next = addr + PAGE_SIZE; > > + } > > > > - /* If we emptied the pte, walk back up the ladder */ > > - if (page_empty(pte)) { > > + /* > > + * If the pmd entry is to be cleared, walk back up the ladder > > + */ > > + if (kvm_pmd_huge(*pmd) || page_empty(pte)) { > > clear_pmd_entry(kvm, pmd, addr); > > next = pmd_addr_end(addr, end); > > if (page_empty(pmd) && !page_empty(pud)) { > > @@ -420,29 +447,71 @@ void kvm_free_stage2_pgd(struct kvm *kvm) > > kvm->arch.pgd = NULL; > > } > > > > - > > -static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, > > - phys_addr_t addr, const pte_t *new_pte, bool iomap) > > +static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, > > + phys_addr_t addr) > > { > > pgd_t *pgd; > > pud_t *pud; > > pmd_t *pmd; > > - pte_t *pte, old_pte; > > > > - /* Create 2nd stage page table mapping - Level 1 */ > > pgd = kvm->arch.pgd + pgd_index(addr); > > pud = pud_offset(pgd, addr); > > if (pud_none(*pud)) { > > if (!cache) > > - return 0; /* ignore calls from kvm_set_spte_hva */ > > + return NULL; > > pmd = mmu_memory_cache_alloc(cache); > > pud_populate(NULL, pud, pmd); > > get_page(virt_to_page(pud)); > > } > > > > - pmd = pmd_offset(pud, addr); > > + return pmd_offset(pud, addr); > > +} > > + > > +static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache > > + *cache, phys_addr_t addr, const pmd_t *new_pmd) > > +{ > > + pmd_t *pmd, old_pmd; > > + > > + pmd = stage2_get_pmd(kvm, cache, addr); > > + VM_BUG_ON(!pmd); > > > > - /* Create 2nd stage page table mapping - Level 2 */ > > + /* > > + * Mapping in huge pages should only happen through a fault. If a > > + * page is merged into a transparent huge page, the individual > > + * subpages of that huge page should be unmapped through MMU > > + * notifiers before we get here. > > + * > > + * Merging of CompoundPages is not supported; they should become > > + * splitting first, unmapped, merged, and mapped back in on-demand. > > + */ > > + VM_BUG_ON(pmd_present(*pmd) && pmd_pfn(*pmd) != pmd_pfn(*new_pmd)); > > + > > + old_pmd = *pmd; > > + kvm_set_pmd(pmd, *new_pmd); > > + if (pmd_present(old_pmd)) > > + kvm_tlb_flush_vmid_ipa(kvm, addr); > > + else > > + get_page(virt_to_page(pmd)); > > + return 0; > > +} > > + > > +static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, > > + phys_addr_t addr, const pte_t *new_pte, bool iomap) > > +{ > > + pmd_t *pmd; > > + pte_t *pte, old_pte; > > + > > + /* Create stage-2 page table mapping - Level 1 */ > > + pmd = stage2_get_pmd(kvm, cache, addr); > > + if (!pmd) { > > + /* > > + * Ignore calls from kvm_set_spte_hva for unallocated > > + * address ranges. 
> > + */ > > + return 0; > > + } > > + > > + /* Create stage-2 page mappings - Level 2 */ > > if (pmd_none(*pmd)) { > > if (!cache) > > return 0; /* ignore calls from kvm_set_spte_hva */ > > @@ -508,15 +577,18 @@ out: > > } > > > > static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, > > - gfn_t gfn, struct kvm_memory_slot *memslot, > > + struct kvm_memory_slot *memslot, > > unsigned long fault_status) > > { > > - pte_t new_pte; > > - pfn_t pfn; > > int ret; > > - bool write_fault, writable; > > + bool write_fault, writable, hugetlb = false, force_pte = false; > > unsigned long mmu_seq; > > + gfn_t gfn = fault_ipa >> PAGE_SHIFT; > > + unsigned long hva = gfn_to_hva(vcpu->kvm, gfn); > > + struct kvm *kvm = vcpu->kvm; > > struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache; > > + struct vm_area_struct *vma; > > + pfn_t pfn; > > > > write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu)); > > if (fault_status == FSC_PERM && !write_fault) { > > @@ -524,6 +596,18 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, > > return -EFAULT; > > } > > > > + /* Let's check if we will get back a huge page backed by hugetlbfs */ > > + down_read(¤t->mm->mmap_sem); > > + vma = find_vma_intersection(current->mm, hva, hva + 1); > > + if (is_vm_hugetlb_page(vma)) { > > + hugetlb = true; > > + gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT; > > + } else { > > + if (vma->vm_start & ~PMD_MASK) > > + force_pte = true; > > What is this test for exactly? I have the feeling this is actually for > THP (since nothing in this patch is actually using force_pte), but I > don't really get it. It probably deserves a comment. > Yes, it's for THP, sorry it got into the wrong patch. If your VMA is not 2M aligned then there's no correlation between a THP in the process page table being mapped on the PMD level and the same set of pages being mapped in the stage-2 page tables and we simply bail from using THP and map each page in a THP using individual page mappings. (Note that this doesn't break in any way, we just don't get the benefit of huge pages in this case). 
> > + } > > + up_read(¤t->mm->mmap_sem); > > + > > /* We need minimum second+third level pages */ > > ret = mmu_topup_memory_cache(memcache, 2, KVM_NR_MEM_OBJS); > > if (ret) > > @@ -541,26 +625,38 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, > > */ > > smp_rmb(); > > > > - pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write_fault, &writable); > > + pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable); > > if (is_error_pfn(pfn)) > > return -EFAULT; > > > > - new_pte = pfn_pte(pfn, PAGE_S2); > > - coherent_icache_guest_page(vcpu->kvm, gfn); > > - > > - spin_lock(&vcpu->kvm->mmu_lock); > > - if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) > > + spin_lock(&kvm->mmu_lock); > > + if (mmu_notifier_retry(kvm, mmu_seq)) > > goto out_unlock; > > - if (writable) { > > - kvm_set_s2pte_writable(&new_pte); > > - kvm_set_pfn_dirty(pfn); > > + > > + if (hugetlb) { > > + pmd_t new_pmd = pfn_pmd(pfn, PAGE_S2); > > + new_pmd = pmd_mkhuge(new_pmd); > > + if (writable) { > > + kvm_set_s2pmd_writable(&new_pmd); > > + kvm_set_pfn_dirty(pfn); > > + } > > + coherent_icache_guest_page(kvm, hva & PMD_MASK, PMD_SIZE); > > + ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd); > > + } else { > > + pte_t new_pte = pfn_pte(pfn, PAGE_S2); > > + if (writable) { > > + kvm_set_s2pte_writable(&new_pte); > > + kvm_set_pfn_dirty(pfn); > > + } > > + coherent_icache_guest_page(kvm, hva, PAGE_SIZE); > > + ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, false); > > } > > - stage2_set_pte(vcpu->kvm, memcache, fault_ipa, &new_pte, false); > > + > > > > out_unlock: > > - spin_unlock(&vcpu->kvm->mmu_lock); > > + spin_unlock(&kvm->mmu_lock); > > kvm_release_pfn_clean(pfn); > > - return 0; > > + return ret; > > } > > > > /** > > @@ -629,7 +725,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run) > > > > memslot = gfn_to_memslot(vcpu->kvm, gfn); > > > > - ret = user_mem_abort(vcpu, fault_ipa, gfn, memslot, fault_status); > > + ret = user_mem_abort(vcpu, fault_ipa, memslot, fault_status); > > if (ret == 0) > > ret = 1; > > out_unlock: > > diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h > > index efe609c..e7f3852 100644 > > --- a/arch/arm64/include/asm/kvm_mmu.h > > +++ b/arch/arm64/include/asm/kvm_mmu.h > > @@ -91,6 +91,7 @@ int kvm_mmu_init(void); > > void kvm_clear_hyp_idmap(void); > > > > #define kvm_set_pte(ptep, pte) set_pte(ptep, pte) > > +#define kvm_set_pmd(pmdp, pmd) set_pmd(pmdp, pmd) > > > > static inline bool kvm_is_write_fault(unsigned long esr) > > { > > @@ -116,13 +117,18 @@ static inline void kvm_set_s2pte_writable(pte_t *pte) > > pte_val(*pte) |= PTE_S2_RDWR; > > } > > > > +static inline void kvm_set_s2pmd_writable(pmd_t *pmd) > > +{ > > + pmd_val(*pmd) |= PTE_S2_RDWR; > > +} > > Same comment as above. > yup -Christoffer
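To make the alignment point above concrete: a 2M stage-2 block covers the IPA range [fault_ipa & PMD_MASK, (fault_ipa & PMD_MASK) + PMD_SIZE), and it can only be backed by a single host huge mapping when the userspace address has the same offset within its own 2M region, which for a THP means the VMA must start on a PMD_SIZE boundary. A hypothetical helper capturing that reasoning (not code from the patch):

	/*
	 * Illustration only: can this fault be satisfied with a PMD-level
	 * stage-2 block mapping? A misaligned VMA rules it out up front;
	 * otherwise hva and fault_ipa must agree on their offset inside a
	 * 2M (PMD_SIZE) region, or the block would cover the wrong pages.
	 */
	static bool fault_supports_stage2_block(struct vm_area_struct *vma,
						unsigned long hva,
						phys_addr_t fault_ipa)
	{
		if (vma->vm_start & ~PMD_MASK)
			return false;		/* fall back to 4K PTE mappings */
		return ((hva ^ fault_ipa) & ~PMD_MASK) == 0;
	}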
diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 9b28c41..d629fb3 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -62,6 +62,12 @@ phys_addr_t kvm_get_idmap_vector(void); int kvm_mmu_init(void); void kvm_clear_hyp_idmap(void); +static inline void kvm_set_pmd(pmd_t *pmd, pmd_t new_pmd) +{ + *pmd = new_pmd; + flush_pmd_entry(pmd); +} + static inline void kvm_set_pte(pte_t *pte, pte_t new_pte) { *pte = new_pte; @@ -103,9 +109,15 @@ static inline void kvm_set_s2pte_writable(pte_t *pte) pte_val(*pte) |= L_PTE_S2_RDWR; } +static inline void kvm_set_s2pmd_writable(pmd_t *pmd) +{ + pmd_val(*pmd) |= L_PTE_S2_RDWR; +} + struct kvm; -static inline void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn) +static inline void coherent_icache_guest_page(struct kvm *kvm, hva_t hva, + unsigned long size) { /* * If we are going to insert an instruction page and the icache is @@ -120,8 +132,7 @@ static inline void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn) * need any kind of flushing (DDI 0406C.b - Page B3-1392). */ if (icache_is_pipt()) { - unsigned long hva = gfn_to_hva(kvm, gfn); - __cpuc_coherent_user_range(hva, hva + PAGE_SIZE); + __cpuc_coherent_user_range(hva, hva + size); } else if (!icache_is_vivt_asid_tagged()) { /* any kind of VIPT cache */ __flush_icache_all(); diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index b0de86b..cab031b 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -19,6 +19,7 @@ #include <linux/mman.h> #include <linux/kvm_host.h> #include <linux/io.h> +#include <linux/hugetlb.h> #include <trace/events/kvm.h> #include <asm/pgalloc.h> #include <asm/cacheflush.h> @@ -41,6 +42,8 @@ static unsigned long hyp_idmap_start; static unsigned long hyp_idmap_end; static phys_addr_t hyp_idmap_vector; +#define kvm_pmd_huge(_x) (pmd_huge(_x)) + static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) { /* @@ -93,19 +96,29 @@ static bool page_empty(void *ptr) static void clear_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr) { - pmd_t *pmd_table = pmd_offset(pud, 0); - pud_clear(pud); - kvm_tlb_flush_vmid_ipa(kvm, addr); - pmd_free(NULL, pmd_table); + if (pud_huge(*pud)) { + pud_clear(pud); + kvm_tlb_flush_vmid_ipa(kvm, addr); + } else { + pmd_t *pmd_table = pmd_offset(pud, 0); + pud_clear(pud); + kvm_tlb_flush_vmid_ipa(kvm, addr); + pmd_free(NULL, pmd_table); + } put_page(virt_to_page(pud)); } static void clear_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr) { - pte_t *pte_table = pte_offset_kernel(pmd, 0); - pmd_clear(pmd); - kvm_tlb_flush_vmid_ipa(kvm, addr); - pte_free_kernel(NULL, pte_table); + if (kvm_pmd_huge(*pmd)) { + pmd_clear(pmd); + kvm_tlb_flush_vmid_ipa(kvm, addr); + } else { + pte_t *pte_table = pte_offset_kernel(pmd, 0); + pmd_clear(pmd); + kvm_tlb_flush_vmid_ipa(kvm, addr); + pte_free_kernel(NULL, pte_table); + } put_page(virt_to_page(pmd)); } @@ -136,18 +149,32 @@ static void unmap_range(struct kvm *kvm, pgd_t *pgdp, continue; } + if (pud_huge(*pud)) { + /* + * If we are dealing with a huge pud, just clear it and + * move on. 
+ */ + clear_pud_entry(kvm, pud, addr); + addr = pud_addr_end(addr, end); + continue; + } + pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) { addr = pmd_addr_end(addr, end); continue; } - pte = pte_offset_kernel(pmd, addr); - clear_pte_entry(kvm, pte, addr); - next = addr + PAGE_SIZE; + if (!kvm_pmd_huge(*pmd)) { + pte = pte_offset_kernel(pmd, addr); + clear_pte_entry(kvm, pte, addr); + next = addr + PAGE_SIZE; + } - /* If we emptied the pte, walk back up the ladder */ - if (page_empty(pte)) { + /* + * If the pmd entry is to be cleared, walk back up the ladder + */ + if (kvm_pmd_huge(*pmd) || page_empty(pte)) { clear_pmd_entry(kvm, pmd, addr); next = pmd_addr_end(addr, end); if (page_empty(pmd) && !page_empty(pud)) { @@ -420,29 +447,71 @@ void kvm_free_stage2_pgd(struct kvm *kvm) kvm->arch.pgd = NULL; } - -static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, - phys_addr_t addr, const pte_t *new_pte, bool iomap) +static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, + phys_addr_t addr) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; - pte_t *pte, old_pte; - /* Create 2nd stage page table mapping - Level 1 */ pgd = kvm->arch.pgd + pgd_index(addr); pud = pud_offset(pgd, addr); if (pud_none(*pud)) { if (!cache) - return 0; /* ignore calls from kvm_set_spte_hva */ + return NULL; pmd = mmu_memory_cache_alloc(cache); pud_populate(NULL, pud, pmd); get_page(virt_to_page(pud)); } - pmd = pmd_offset(pud, addr); + return pmd_offset(pud, addr); +} + +static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache + *cache, phys_addr_t addr, const pmd_t *new_pmd) +{ + pmd_t *pmd, old_pmd; + + pmd = stage2_get_pmd(kvm, cache, addr); + VM_BUG_ON(!pmd); - /* Create 2nd stage page table mapping - Level 2 */ + /* + * Mapping in huge pages should only happen through a fault. If a + * page is merged into a transparent huge page, the individual + * subpages of that huge page should be unmapped through MMU + * notifiers before we get here. + * + * Merging of CompoundPages is not supported; they should become + * splitting first, unmapped, merged, and mapped back in on-demand. + */ + VM_BUG_ON(pmd_present(*pmd) && pmd_pfn(*pmd) != pmd_pfn(*new_pmd)); + + old_pmd = *pmd; + kvm_set_pmd(pmd, *new_pmd); + if (pmd_present(old_pmd)) + kvm_tlb_flush_vmid_ipa(kvm, addr); + else + get_page(virt_to_page(pmd)); + return 0; +} + +static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, + phys_addr_t addr, const pte_t *new_pte, bool iomap) +{ + pmd_t *pmd; + pte_t *pte, old_pte; + + /* Create stage-2 page table mapping - Level 1 */ + pmd = stage2_get_pmd(kvm, cache, addr); + if (!pmd) { + /* + * Ignore calls from kvm_set_spte_hva for unallocated + * address ranges. 
+ */ + return 0; + } + + /* Create stage-2 page mappings - Level 2 */ if (pmd_none(*pmd)) { if (!cache) return 0; /* ignore calls from kvm_set_spte_hva */ @@ -508,15 +577,18 @@ out: } static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, - gfn_t gfn, struct kvm_memory_slot *memslot, + struct kvm_memory_slot *memslot, unsigned long fault_status) { - pte_t new_pte; - pfn_t pfn; int ret; - bool write_fault, writable; + bool write_fault, writable, hugetlb = false, force_pte = false; unsigned long mmu_seq; + gfn_t gfn = fault_ipa >> PAGE_SHIFT; + unsigned long hva = gfn_to_hva(vcpu->kvm, gfn); + struct kvm *kvm = vcpu->kvm; struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache; + struct vm_area_struct *vma; + pfn_t pfn; write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu)); if (fault_status == FSC_PERM && !write_fault) { @@ -524,6 +596,18 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, return -EFAULT; } + /* Let's check if we will get back a huge page backed by hugetlbfs */ + down_read(¤t->mm->mmap_sem); + vma = find_vma_intersection(current->mm, hva, hva + 1); + if (is_vm_hugetlb_page(vma)) { + hugetlb = true; + gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT; + } else { + if (vma->vm_start & ~PMD_MASK) + force_pte = true; + } + up_read(¤t->mm->mmap_sem); + /* We need minimum second+third level pages */ ret = mmu_topup_memory_cache(memcache, 2, KVM_NR_MEM_OBJS); if (ret) @@ -541,26 +625,38 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, */ smp_rmb(); - pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write_fault, &writable); + pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable); if (is_error_pfn(pfn)) return -EFAULT; - new_pte = pfn_pte(pfn, PAGE_S2); - coherent_icache_guest_page(vcpu->kvm, gfn); - - spin_lock(&vcpu->kvm->mmu_lock); - if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) + spin_lock(&kvm->mmu_lock); + if (mmu_notifier_retry(kvm, mmu_seq)) goto out_unlock; - if (writable) { - kvm_set_s2pte_writable(&new_pte); - kvm_set_pfn_dirty(pfn); + + if (hugetlb) { + pmd_t new_pmd = pfn_pmd(pfn, PAGE_S2); + new_pmd = pmd_mkhuge(new_pmd); + if (writable) { + kvm_set_s2pmd_writable(&new_pmd); + kvm_set_pfn_dirty(pfn); + } + coherent_icache_guest_page(kvm, hva & PMD_MASK, PMD_SIZE); + ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd); + } else { + pte_t new_pte = pfn_pte(pfn, PAGE_S2); + if (writable) { + kvm_set_s2pte_writable(&new_pte); + kvm_set_pfn_dirty(pfn); + } + coherent_icache_guest_page(kvm, hva, PAGE_SIZE); + ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, false); } - stage2_set_pte(vcpu->kvm, memcache, fault_ipa, &new_pte, false); + out_unlock: - spin_unlock(&vcpu->kvm->mmu_lock); + spin_unlock(&kvm->mmu_lock); kvm_release_pfn_clean(pfn); - return 0; + return ret; } /** @@ -629,7 +725,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run) memslot = gfn_to_memslot(vcpu->kvm, gfn); - ret = user_mem_abort(vcpu, fault_ipa, gfn, memslot, fault_status); + ret = user_mem_abort(vcpu, fault_ipa, memslot, fault_status); if (ret == 0) ret = 1; out_unlock: diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index efe609c..e7f3852 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -91,6 +91,7 @@ int kvm_mmu_init(void); void kvm_clear_hyp_idmap(void); #define kvm_set_pte(ptep, pte) set_pte(ptep, pte) +#define kvm_set_pmd(pmdp, pmd) set_pmd(pmdp, pmd) static inline bool kvm_is_write_fault(unsigned long esr) { @@ -116,13 
+117,18 @@ static inline void kvm_set_s2pte_writable(pte_t *pte) pte_val(*pte) |= PTE_S2_RDWR; } +static inline void kvm_set_s2pmd_writable(pmd_t *pmd) +{ + pmd_val(*pmd) |= PTE_S2_RDWR; +} + struct kvm; -static inline void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn) +static inline void coherent_icache_guest_page(struct kvm *kvm, hva_t hva, + unsigned long size) { if (!icache_is_aliasing()) { /* PIPT */ - unsigned long hva = gfn_to_hva(kvm, gfn); - flush_icache_range(hva, hva + PAGE_SIZE); + flush_icache_range(hva, hva + size); } else if (!icache_is_aivivt()) { /* non ASID-tagged VIVT */ /* any kind of VIPT cache */ __flush_icache_all();
Support huge pages in KVM/ARM and KVM/ARM64. The pud_huge checking on the unmap path may feel a bit silly as the pud_huge check is always defined to false, but the compiler should be smart about this.

Note: This deals only with VMAs marked as huge which are allocated by users through hugetlbfs only. Transparent huge pages can only be detected by looking at the underlying pages (or the page tables themselves) and this patch so far simply maps these on a page-by-page level in the Stage-2 page tables.

Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>

---
Changelog[v2]:
 - Invalidate TLB on clearing stage-2 huge page entries
 - Restructure unmap_range to share more code
 - Share stage_2 pmd allocation
 - Factor out THP support to separate patch
 - Fix incorrect gfn_to_pfn_prot and mmu_seq ordering
---
 arch/arm/include/asm/kvm_mmu.h   |  17 +++-
 arch/arm/kvm/mmu.c               | 172 +++++++++++++++++++++++++++++---------
 arch/arm64/include/asm/kvm_mmu.h |  12 ++-
 3 files changed, 157 insertions(+), 44 deletions(-)
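For context on the "VMAs marked as huge ... through hugetlbfs" note above, here is a sketch, not taken from this thread, of how a userspace VMM could back guest RAM with hugetlbfs so that user_mem_abort() sees is_vm_hugetlb_page() succeed and installs 2M stage-2 block mappings. Error handling is trimmed, and vm_fd is assumed to be a KVM VM file descriptor obtained elsewhere via KVM_CREATE_VM.

	#include <stddef.h>
	#include <stdint.h>
	#include <sys/mman.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	/*
	 * Map guest RAM from anonymous hugetlbfs-backed memory (MAP_HUGETLB)
	 * and register it as a memslot; mapping a file on a hugetlbfs mount
	 * would exercise the same stage-2 fault path.
	 */
	static int add_hugepage_memslot(int vm_fd, uint64_t guest_phys, size_t size)
	{
		void *mem = mmap(NULL, size, PROT_READ | PROT_WRITE,
				 MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
		if (mem == MAP_FAILED)
			return -1;

		struct kvm_userspace_memory_region region = {
			.slot            = 0,
			.guest_phys_addr = guest_phys,
			.memory_size     = size,
			.userspace_addr  = (uintptr_t)mem,
		};
		return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
	}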