Message ID | 1406249768-25315-5-git-send-email-m.smarduch@samsung.com (mailing list archive)
---|---
State | New, archived
On Thu, Jul 24, 2014 at 05:56:08PM -0700, Mario Smarduch wrote: > This patch adds support for handling 2nd stage page faults during migration, > it disables faulting in huge pages, and dissolves huge pages to page tables. > In case migration is canceled huge pages will be used again. > > Signed-off-by: Mario Smarduch <m.smarduch@samsung.com> > --- > arch/arm/kvm/mmu.c | 31 +++++++++++++++++++++++++------ > 1 file changed, 25 insertions(+), 6 deletions(-) > > diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c > index ca84331..a17812a 100644 > --- a/arch/arm/kvm/mmu.c > +++ b/arch/arm/kvm/mmu.c > @@ -642,7 +642,8 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache > } > > static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, > - phys_addr_t addr, const pte_t *new_pte, bool iomap) > + phys_addr_t addr, const pte_t *new_pte, bool iomap, > + bool logging_active) > { > pmd_t *pmd; > pte_t *pte, old_pte; > @@ -657,6 +658,15 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, > return 0; > } > > + /* > + * While dirty memory logging, clear PMD entry for huge page and split > + * into smaller pages, to track dirty memory at page granularity. > + */ > + if (logging_active && kvm_pmd_huge(*pmd)) { > + phys_addr_t ipa = pmd_pfn(*pmd) << PAGE_SHIFT; > + clear_pmd_entry(kvm, pmd, ipa); clear_pmd_entry has a VM_BUG_ON(kvm_pmd_huge(*pmd)) so that is definitely not the right thing to call. > + } > + > /* Create stage-2 page mappings - Level 2 */ > if (pmd_none(*pmd)) { > if (!cache) > @@ -709,7 +719,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, > if (ret) > goto out; > spin_lock(&kvm->mmu_lock); > - ret = stage2_set_pte(kvm, &cache, addr, &pte, true); > + ret = stage2_set_pte(kvm, &cache, addr, &pte, true, false); > spin_unlock(&kvm->mmu_lock); > if (ret) > goto out; > @@ -926,6 +936,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, > struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache; > struct vm_area_struct *vma; > pfn_t pfn; > + /* Get logging status, if dirty_bitmap is not NULL then logging is on */ > + #ifdef CONFIG_ARM > + bool logging_active = !!memslot->dirty_bitmap; > + #else > + bool logging_active = false; > + #endif can you make this an inline in the header files for now please? 
> > write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu)); > if (fault_status == FSC_PERM && !write_fault) { > @@ -936,7 +952,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, > /* Let's check if we will get back a huge page backed by hugetlbfs */ > down_read(¤t->mm->mmap_sem); > vma = find_vma_intersection(current->mm, hva, hva + 1); > - if (is_vm_hugetlb_page(vma)) { > + if (is_vm_hugetlb_page(vma) && !logging_active) { > hugetlb = true; > gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT; > } else { > @@ -979,7 +995,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, > spin_lock(&kvm->mmu_lock); > if (mmu_notifier_retry(kvm, mmu_seq)) > goto out_unlock; > - if (!hugetlb && !force_pte) > + if (!hugetlb && !force_pte && !logging_active) > hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa); > > if (hugetlb) { > @@ -998,9 +1014,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, > kvm_set_pfn_dirty(pfn); > } > coherent_cache_guest_page(vcpu, hva, PAGE_SIZE); > - ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, false); > + ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, false, > + logging_active); > } > > + if (write_fault) > + mark_page_dirty(kvm, gfn); > > out_unlock: > spin_unlock(&kvm->mmu_lock); > @@ -1151,7 +1170,7 @@ static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data) > { > pte_t *pte = (pte_t *)data; > > - stage2_set_pte(kvm, NULL, gpa, pte, false); > + stage2_set_pte(kvm, NULL, gpa, pte, false, false); why is logging never active if we are called from MMU notifiers? > } > > > -- > 1.7.9.5 > Thanks, -Christoffer
On 08/11/2014 12:13 PM, Christoffer Dall wrote: > On Thu, Jul 24, 2014 at 05:56:08PM -0700, Mario Smarduch wrote: >> This patch adds support for handling 2nd stage page faults during migration, >> it disables faulting in huge pages, and dissolves huge pages to page tables. >> In case migration is canceled huge pages will be used again. >> >> Signed-off-by: Mario Smarduch <m.smarduch@samsung.com> >> --- >> arch/arm/kvm/mmu.c | 31 +++++++++++++++++++++++++------ >> 1 file changed, 25 insertions(+), 6 deletions(-) >> >> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c >> index ca84331..a17812a 100644 >> --- a/arch/arm/kvm/mmu.c >> +++ b/arch/arm/kvm/mmu.c >> @@ -642,7 +642,8 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache >> } >> >> static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, >> - phys_addr_t addr, const pte_t *new_pte, bool iomap) >> + phys_addr_t addr, const pte_t *new_pte, bool iomap, >> + bool logging_active) >> { >> pmd_t *pmd; >> pte_t *pte, old_pte; >> @@ -657,6 +658,15 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, >> return 0; >> } >> >> + /* >> + * While dirty memory logging, clear PMD entry for huge page and split >> + * into smaller pages, to track dirty memory at page granularity. >> + */ >> + if (logging_active && kvm_pmd_huge(*pmd)) { >> + phys_addr_t ipa = pmd_pfn(*pmd) << PAGE_SHIFT; >> + clear_pmd_entry(kvm, pmd, ipa); > > clear_pmd_entry has a VM_BUG_ON(kvm_pmd_huge(*pmd)) so that is > definitely not the right thing to call. I don't see that in 3.15rc1/rc4 - static void clear_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr) { if (kvm_pmd_huge(*pmd)) { pmd_clear(pmd); kvm_tlb_flush_vmid_ipa(kvm, addr); } else { [....] } I thought the purpose of this function was to clear PMD entry. Also ran hundreds of tests no problems. Hmmm confused. > >> + } >> + >> /* Create stage-2 page mappings - Level 2 */ >> if (pmd_none(*pmd)) { >> if (!cache) >> @@ -709,7 +719,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, >> if (ret) >> goto out; >> spin_lock(&kvm->mmu_lock); >> - ret = stage2_set_pte(kvm, &cache, addr, &pte, true); >> + ret = stage2_set_pte(kvm, &cache, addr, &pte, true, false); >> spin_unlock(&kvm->mmu_lock); >> if (ret) >> goto out; >> @@ -926,6 +936,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, >> struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache; >> struct vm_area_struct *vma; >> pfn_t pfn; >> + /* Get logging status, if dirty_bitmap is not NULL then logging is on */ >> + #ifdef CONFIG_ARM >> + bool logging_active = !!memslot->dirty_bitmap; >> + #else >> + bool logging_active = false; >> + #endif > > can you make this an inline in the header files for now please? Yes definitely. 
> >> >> write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu)); >> if (fault_status == FSC_PERM && !write_fault) { >> @@ -936,7 +952,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, >> /* Let's check if we will get back a huge page backed by hugetlbfs */ >> down_read(¤t->mm->mmap_sem); >> vma = find_vma_intersection(current->mm, hva, hva + 1); >> - if (is_vm_hugetlb_page(vma)) { >> + if (is_vm_hugetlb_page(vma) && !logging_active) { >> hugetlb = true; >> gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT; >> } else { >> @@ -979,7 +995,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, >> spin_lock(&kvm->mmu_lock); >> if (mmu_notifier_retry(kvm, mmu_seq)) >> goto out_unlock; >> - if (!hugetlb && !force_pte) >> + if (!hugetlb && !force_pte && !logging_active) >> hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa); >> >> if (hugetlb) { >> @@ -998,9 +1014,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, >> kvm_set_pfn_dirty(pfn); >> } >> coherent_cache_guest_page(vcpu, hva, PAGE_SIZE); >> - ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, false); >> + ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, false, >> + logging_active); >> } >> >> + if (write_fault) >> + mark_page_dirty(kvm, gfn); >> >> out_unlock: >> spin_unlock(&kvm->mmu_lock); >> @@ -1151,7 +1170,7 @@ static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data) >> { >> pte_t *pte = (pte_t *)data; >> >> - stage2_set_pte(kvm, NULL, gpa, pte, false); >> + stage2_set_pte(kvm, NULL, gpa, pte, false, false); > > why is logging never active if we are called from MMU notifiers? mmu notifiers update sptes, but I don't see how these updates can result in guest dirty pages. Also guest pages are marked dirty from 2nd stage page fault handlers (searching through the code). > >> } >> >> >> -- >> 1.7.9.5 >> > > Thanks, > -Christoffer >
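For readers following the thread: the inline helper being requested here is not part of the posted patch, but it would amount to something like the sketch below. The helper name is made up for illustration, and in practice the two halves of the #ifdef would likely become separate definitions in the arm and arm64 headers; only the dirty_bitmap test itself comes from the patch.

static inline bool kvm_get_logging_state(struct kvm_memory_slot *memslot)
{
#ifdef CONFIG_ARM
	/* A non-NULL dirty_bitmap means dirty logging is enabled for this slot. */
	return !!memslot->dirty_bitmap;
#else
	/* arm64 is not wired up at this point in the series, so always false. */
	return false;
#endif
}

user_mem_abort() would then read "bool logging_active = kvm_get_logging_state(memslot);" instead of open-coding the #ifdef at the declaration site.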
On Mon, Aug 11, 2014 at 06:25:05PM -0700, Mario Smarduch wrote: > On 08/11/2014 12:13 PM, Christoffer Dall wrote: > > On Thu, Jul 24, 2014 at 05:56:08PM -0700, Mario Smarduch wrote: > >> This patch adds support for handling 2nd stage page faults during migration, > >> it disables faulting in huge pages, and dissolves huge pages to page tables. > >> In case migration is canceled huge pages will be used again. > >> > >> Signed-off-by: Mario Smarduch <m.smarduch@samsung.com> > >> --- > >> arch/arm/kvm/mmu.c | 31 +++++++++++++++++++++++++------ > >> 1 file changed, 25 insertions(+), 6 deletions(-) > >> > >> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c > >> index ca84331..a17812a 100644 > >> --- a/arch/arm/kvm/mmu.c > >> +++ b/arch/arm/kvm/mmu.c > >> @@ -642,7 +642,8 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache > >> } > >> > >> static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, > >> - phys_addr_t addr, const pte_t *new_pte, bool iomap) > >> + phys_addr_t addr, const pte_t *new_pte, bool iomap, > >> + bool logging_active) > >> { > >> pmd_t *pmd; > >> pte_t *pte, old_pte; > >> @@ -657,6 +658,15 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, > >> return 0; > >> } > >> > >> + /* > >> + * While dirty memory logging, clear PMD entry for huge page and split > >> + * into smaller pages, to track dirty memory at page granularity. > >> + */ > >> + if (logging_active && kvm_pmd_huge(*pmd)) { > >> + phys_addr_t ipa = pmd_pfn(*pmd) << PAGE_SHIFT; > >> + clear_pmd_entry(kvm, pmd, ipa); > > > > clear_pmd_entry has a VM_BUG_ON(kvm_pmd_huge(*pmd)) so that is > > definitely not the right thing to call. > > I don't see that in 3.15rc1/rc4 - > > static void clear_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr) > { > if (kvm_pmd_huge(*pmd)) { > pmd_clear(pmd); > kvm_tlb_flush_vmid_ipa(kvm, addr); > } else { > [....] > } > > I thought the purpose of this function was to clear PMD entry. Also > ran hundreds of tests no problems. Hmmm confused. > You need to rebase on kvm/next or linus/master, something that contains: 4f853a7 arm/arm64: KVM: Fix and refactor unmap_range > > > >> + } > >> + > >> /* Create stage-2 page mappings - Level 2 */ > >> if (pmd_none(*pmd)) { > >> if (!cache) > >> @@ -709,7 +719,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, > >> if (ret) > >> goto out; > >> spin_lock(&kvm->mmu_lock); > >> - ret = stage2_set_pte(kvm, &cache, addr, &pte, true); > >> + ret = stage2_set_pte(kvm, &cache, addr, &pte, true, false); > >> spin_unlock(&kvm->mmu_lock); > >> if (ret) > >> goto out; > >> @@ -926,6 +936,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, > >> struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache; > >> struct vm_area_struct *vma; > >> pfn_t pfn; > >> + /* Get logging status, if dirty_bitmap is not NULL then logging is on */ > >> + #ifdef CONFIG_ARM > >> + bool logging_active = !!memslot->dirty_bitmap; > >> + #else > >> + bool logging_active = false; > >> + #endif > > > > can you make this an inline in the header files for now please? > > Yes definitely. 
> > > > >> > >> write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu)); > >> if (fault_status == FSC_PERM && !write_fault) { > >> @@ -936,7 +952,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, > >> /* Let's check if we will get back a huge page backed by hugetlbfs */ > >> down_read(¤t->mm->mmap_sem); > >> vma = find_vma_intersection(current->mm, hva, hva + 1); > >> - if (is_vm_hugetlb_page(vma)) { > >> + if (is_vm_hugetlb_page(vma) && !logging_active) { > >> hugetlb = true; > >> gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT; > >> } else { > >> @@ -979,7 +995,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, > >> spin_lock(&kvm->mmu_lock); > >> if (mmu_notifier_retry(kvm, mmu_seq)) > >> goto out_unlock; > >> - if (!hugetlb && !force_pte) > >> + if (!hugetlb && !force_pte && !logging_active) > >> hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa); > >> > >> if (hugetlb) { > >> @@ -998,9 +1014,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, > >> kvm_set_pfn_dirty(pfn); > >> } > >> coherent_cache_guest_page(vcpu, hva, PAGE_SIZE); > >> - ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, false); > >> + ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, false, > >> + logging_active); > >> } > >> > >> + if (write_fault) > >> + mark_page_dirty(kvm, gfn); > >> > >> out_unlock: > >> spin_unlock(&kvm->mmu_lock); > >> @@ -1151,7 +1170,7 @@ static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data) > >> { > >> pte_t *pte = (pte_t *)data; > >> > >> - stage2_set_pte(kvm, NULL, gpa, pte, false); > >> + stage2_set_pte(kvm, NULL, gpa, pte, false, false); > > > > why is logging never active if we are called from MMU notifiers? > > mmu notifiers update sptes, but I don't see how these updates > can result in guest dirty pages. Also guest pages are marked dirty > from 2nd stage page fault handlers (searching through the code). > Ok, then add: /* * We can always call stage2_set_pte with logging_active == false, * because MMU notifiers will have unmapped a huge PMD before calling * ->change_pte() (which in turn calls kvm_set_spte_hva()) and therefore * stage2_set_pte() never needs to clear out a huge PMD through this * calling path. */ Thanks, -Christoffer
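A note on the rebase point above: after commit 4f853a7, the refactored clear_pmd_entry() contains VM_BUG_ON(kvm_pmd_huge(*pmd)), so dissolving a huge page in stage2_set_pte() needs its own small helper rather than a call to clear_pmd_entry(). A minimal sketch of that shape follows; the helper name is hypothetical, but the calls it makes are the existing stage-2 primitives in arch/arm/kvm/mmu.c.

static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
{
	if (!kvm_pmd_huge(*pmd))
		return;

	/* Drop the block mapping and invalidate the stale stage-2 TLB entry. */
	pmd_clear(pmd);
	kvm_tlb_flush_vmid_ipa(kvm, addr);
	/* After the refactor, each mapped entry holds a reference on its table page. */
	put_page(virt_to_page(pmd));
}

With the PMD cleared, the existing pmd_none() path in stage2_set_pte() allocates a page table and maps the faulting address with a 4K page, which is the "dissolve" behaviour the commit message describes.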
On 08/12/2014 02:50 AM, Christoffer Dall wrote: > On Mon, Aug 11, 2014 at 06:25:05PM -0700, Mario Smarduch wrote: >> On 08/11/2014 12:13 PM, Christoffer Dall wrote: >>> On Thu, Jul 24, 2014 at 05:56:08PM -0700, Mario Smarduch wrote: >>>> This patch adds support for handling 2nd stage page faults during migration, >>>> it disables faulting in huge pages, and dissolves huge pages to page tables. >>>> In case migration is canceled huge pages will be used again. >>>> >>>> Signed-off-by: Mario Smarduch <m.smarduch@samsung.com> >>>> --- >>>> arch/arm/kvm/mmu.c | 31 +++++++++++++++++++++++++------ >>>> 1 file changed, 25 insertions(+), 6 deletions(-) >>>> >>>> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c >>>> index ca84331..a17812a 100644 >>>> --- a/arch/arm/kvm/mmu.c >>>> +++ b/arch/arm/kvm/mmu.c >>>> @@ -642,7 +642,8 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache >>>> } >>>> >>>> static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, >>>> - phys_addr_t addr, const pte_t *new_pte, bool iomap) >>>> + phys_addr_t addr, const pte_t *new_pte, bool iomap, >>>> + bool logging_active) >>>> { >>>> pmd_t *pmd; >>>> pte_t *pte, old_pte; >>>> @@ -657,6 +658,15 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, >>>> return 0; >>>> } >>>> >>>> + /* >>>> + * While dirty memory logging, clear PMD entry for huge page and split >>>> + * into smaller pages, to track dirty memory at page granularity. >>>> + */ >>>> + if (logging_active && kvm_pmd_huge(*pmd)) { >>>> + phys_addr_t ipa = pmd_pfn(*pmd) << PAGE_SHIFT; >>>> + clear_pmd_entry(kvm, pmd, ipa); >>> >>> clear_pmd_entry has a VM_BUG_ON(kvm_pmd_huge(*pmd)) so that is >>> definitely not the right thing to call. >> >> I don't see that in 3.15rc1/rc4 - >> >> static void clear_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr) >> { >> if (kvm_pmd_huge(*pmd)) { >> pmd_clear(pmd); >> kvm_tlb_flush_vmid_ipa(kvm, addr); >> } else { >> [....] >> } >> >> I thought the purpose of this function was to clear PMD entry. Also >> ran hundreds of tests no problems. Hmmm confused. >> > > You need to rebase on kvm/next or linus/master, something that contains: > > 4f853a7 arm/arm64: KVM: Fix and refactor unmap_range > >>> >>>> + } >>>> + >>>> /* Create stage-2 page mappings - Level 2 */ >>>> if (pmd_none(*pmd)) { >>>> if (!cache) >>>> @@ -709,7 +719,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, >>>> if (ret) >>>> goto out; >>>> spin_lock(&kvm->mmu_lock); >>>> - ret = stage2_set_pte(kvm, &cache, addr, &pte, true); >>>> + ret = stage2_set_pte(kvm, &cache, addr, &pte, true, false); >>>> spin_unlock(&kvm->mmu_lock); >>>> if (ret) >>>> goto out; >>>> @@ -926,6 +936,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, >>>> struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache; >>>> struct vm_area_struct *vma; >>>> pfn_t pfn; >>>> + /* Get logging status, if dirty_bitmap is not NULL then logging is on */ >>>> + #ifdef CONFIG_ARM >>>> + bool logging_active = !!memslot->dirty_bitmap; >>>> + #else >>>> + bool logging_active = false; >>>> + #endif >>> >>> can you make this an inline in the header files for now please? >> >> Yes definitely. 
>> >>> >>>> >>>> write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu)); >>>> if (fault_status == FSC_PERM && !write_fault) { >>>> @@ -936,7 +952,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, >>>> /* Let's check if we will get back a huge page backed by hugetlbfs */ >>>> down_read(¤t->mm->mmap_sem); >>>> vma = find_vma_intersection(current->mm, hva, hva + 1); >>>> - if (is_vm_hugetlb_page(vma)) { >>>> + if (is_vm_hugetlb_page(vma) && !logging_active) { >>>> hugetlb = true; >>>> gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT; >>>> } else { >>>> @@ -979,7 +995,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, >>>> spin_lock(&kvm->mmu_lock); >>>> if (mmu_notifier_retry(kvm, mmu_seq)) >>>> goto out_unlock; >>>> - if (!hugetlb && !force_pte) >>>> + if (!hugetlb && !force_pte && !logging_active) >>>> hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa); >>>> >>>> if (hugetlb) { >>>> @@ -998,9 +1014,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, >>>> kvm_set_pfn_dirty(pfn); >>>> } >>>> coherent_cache_guest_page(vcpu, hva, PAGE_SIZE); >>>> - ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, false); >>>> + ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, false, >>>> + logging_active); >>>> } >>>> >>>> + if (write_fault) >>>> + mark_page_dirty(kvm, gfn); >>>> >>>> out_unlock: >>>> spin_unlock(&kvm->mmu_lock); >>>> @@ -1151,7 +1170,7 @@ static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data) >>>> { >>>> pte_t *pte = (pte_t *)data; >>>> >>>> - stage2_set_pte(kvm, NULL, gpa, pte, false); >>>> + stage2_set_pte(kvm, NULL, gpa, pte, false, false); >>> >>> why is logging never active if we are called from MMU notifiers? >> >> mmu notifiers update sptes, but I don't see how these updates >> can result in guest dirty pages. Also guest pages are marked dirty >> from 2nd stage page fault handlers (searching through the code). >> > Ok, then add: > > /* > * We can always call stage2_set_pte with logging_active == false, > * because MMU notifiers will have unmapped a huge PMD before calling > * ->change_pte() (which in turn calls kvm_set_spte_hva()) and therefore > * stage2_set_pte() never needs to clear out a huge PMD through this > * calling path. > */ So here on permission change to primary pte's kernel first invalidates related s2ptes followed by ->change_pte calls to synchronize s2ptes. As consequence of invalidation we unmap huge PMDs, if a page falls in that range. Is the comment to point out use of logging flag under various scenarios? Should I add comments on flag use in other places as well? > > Thanks, > -Christoffer >
On Tue, Aug 12, 2014 at 06:27:11PM -0700, Mario Smarduch wrote: > On 08/12/2014 02:50 AM, Christoffer Dall wrote: > > On Mon, Aug 11, 2014 at 06:25:05PM -0700, Mario Smarduch wrote: > >> On 08/11/2014 12:13 PM, Christoffer Dall wrote: > >>> On Thu, Jul 24, 2014 at 05:56:08PM -0700, Mario Smarduch wrote: [...] > >>>> @@ -1151,7 +1170,7 @@ static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data) > >>>> { > >>>> pte_t *pte = (pte_t *)data; > >>>> > >>>> - stage2_set_pte(kvm, NULL, gpa, pte, false); > >>>> + stage2_set_pte(kvm, NULL, gpa, pte, false, false); > >>> > >>> why is logging never active if we are called from MMU notifiers? > >> > >> mmu notifiers update sptes, but I don't see how these updates > >> can result in guest dirty pages. Also guest pages are marked dirty > >> from 2nd stage page fault handlers (searching through the code). > >> > > Ok, then add: > > > > /* > > * We can always call stage2_set_pte with logging_active == false, > > * because MMU notifiers will have unmapped a huge PMD before calling > > * ->change_pte() (which in turn calls kvm_set_spte_hva()) and therefore > > * stage2_set_pte() never needs to clear out a huge PMD through this > > * calling path. > > */ > > So here on permission change to primary pte's kernel first invalidates > related s2ptes followed by ->change_pte calls to synchronize s2ptes. As > consequence of invalidation we unmap huge PMDs, if a page falls in that > range. > > Is the comment to point out use of logging flag under various scenarios? The comment is because when you look at this function it is not obvious why we pass logging_active=false, despite logging may actually be active. This could suggest that the parameter to stage2_set_pte() should be named differently (break_huge_pmds) or something like that, but we can also be satisfied with the comment. > > Should I add comments on flag use in other places as well? > It's always a judgement call. I didn't find it necessarry to put a comment elsewhere because I think it's pretty obivous that we would never care about logging writes to device regions. However, this made me think, are we making sure that we are not marking device mappings as read-only in the wp_range functions? I think it's quite bad if we mark the VCPU interface as read-only for example. -Christoffer
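For reference, the resolution here is simply to carry the comment proposed earlier in the thread; applied to the patch above, the notifier path would read:

static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data)
{
	pte_t *pte = (pte_t *)data;

	/*
	 * We can always call stage2_set_pte with logging_active == false,
	 * because MMU notifiers will have unmapped a huge PMD before calling
	 * ->change_pte() (which in turn calls kvm_set_spte_hva()) and
	 * therefore stage2_set_pte() never needs to clear out a huge PMD
	 * through this calling path.
	 */
	stage2_set_pte(kvm, NULL, gpa, pte, false, false);
}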
On 08/13/2014 12:30 AM, Christoffer Dall wrote: > On Tue, Aug 12, 2014 at 06:27:11PM -0700, Mario Smarduch wrote: >> On 08/12/2014 02:50 AM, Christoffer Dall wrote: >>> On Mon, Aug 11, 2014 at 06:25:05PM -0700, Mario Smarduch wrote: >>>> On 08/11/2014 12:13 PM, Christoffer Dall wrote: >>>>> On Thu, Jul 24, 2014 at 05:56:08PM -0700, Mario Smarduch wrote: > > [...] > >>>>>> @@ -1151,7 +1170,7 @@ static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data) >>>>>> { >>>>>> pte_t *pte = (pte_t *)data; >>>>>> >>>>>> - stage2_set_pte(kvm, NULL, gpa, pte, false); >>>>>> + stage2_set_pte(kvm, NULL, gpa, pte, false, false); >>>>> >>>>> why is logging never active if we are called from MMU notifiers? >>>> >>>> mmu notifiers update sptes, but I don't see how these updates >>>> can result in guest dirty pages. Also guest pages are marked dirty >>>> from 2nd stage page fault handlers (searching through the code). >>>> >>> Ok, then add: >>> >>> /* >>> * We can always call stage2_set_pte with logging_active == false, >>> * because MMU notifiers will have unmapped a huge PMD before calling >>> * ->change_pte() (which in turn calls kvm_set_spte_hva()) and therefore >>> * stage2_set_pte() never needs to clear out a huge PMD through this >>> * calling path. >>> */ >> >> So here on permission change to primary pte's kernel first invalidates >> related s2ptes followed by ->change_pte calls to synchronize s2ptes. As >> consequence of invalidation we unmap huge PMDs, if a page falls in that >> range. >> >> Is the comment to point out use of logging flag under various scenarios? > > The comment is because when you look at this function it is not obvious > why we pass logging_active=false, despite logging may actually be > active. This could suggest that the parameter to stage2_set_pte() > should be named differently (break_huge_pmds) or something like that, > but we can also be satisfied with the comment. Ok I see, I was thinking you thought it was breaking something. Yeah I'll add the comment, in reality this is another use case where a PMD may need to be converted to page table so it makes sense to contrast use cases. > >> >> Should I add comments on flag use in other places as well? >> > > It's always a judgement call. I didn't find it necessarry to put a > comment elsewhere because I think it's pretty obivous that we would > never care about logging writes to device regions. > > However, this made me think, are we making sure that we are not marking > device mappings as read-only in the wp_range functions? I think it's KVM_SET_USER_MEMORY_REGION ioctl doesn't check type of region being installed/operated on (KVM_MEM_LOG_DIRTY_PAGES), in case of QEMU these regions wind up in KVMState->KVMSlot[], when memory_region_add_subregion() is called KVM listener installs it. For migration and dirty page logging QEMU walks the KVMSlot[] array. For QEMU VFIO (PCI) mmap()ing BAR of type IORESOURCE_MEM, causes the memory region to be added to KVMState->KVMSlot[]. In that case it's possible to walk KVMState->KVMSlot[] issue the ioctl and come across a device mapping with normal memory and WP it's s2ptes (VFIO sets unmigrateble state though). But I'm not sure what's there to stop someone calling the ioctl and install a region with device memory type. Most likely though if you installed that kind of region migration would be disabled. But just for logging use not checking memory type could be an issue. > quite bad if we mark the VCPU interface as read-only for example. > > -Christoffer >
On 08/13/2014 06:20 PM, Mario Smarduch wrote: > On 08/13/2014 12:30 AM, Christoffer Dall wrote: >> On Tue, Aug 12, 2014 at 06:27:11PM -0700, Mario Smarduch wrote: >>> On 08/12/2014 02:50 AM, Christoffer Dall wrote: >>>> On Mon, Aug 11, 2014 at 06:25:05PM -0700, Mario Smarduch wrote: >>>>> On 08/11/2014 12:13 PM, Christoffer Dall wrote: >>>>>> On Thu, Jul 24, 2014 at 05:56:08PM -0700, Mario Smarduch wrote: >> >> [...] >> >>>>>>> @@ -1151,7 +1170,7 @@ static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data) >>>>>>> { >>>>>>> pte_t *pte = (pte_t *)data; >>>>>>> >>>>>>> - stage2_set_pte(kvm, NULL, gpa, pte, false); >>>>>>> + stage2_set_pte(kvm, NULL, gpa, pte, false, false); >>>>>> >>>>>> why is logging never active if we are called from MMU notifiers? >>>>> [...] >> The comment is because when you look at this function it is not obvious >> why we pass logging_active=false, despite logging may actually be >> active. This could suggest that the parameter to stage2_set_pte() >> should be named differently (break_huge_pmds) or something like that, >> but we can also be satisfied with the comment. > > Ok I see, I was thinking you thought it was breaking something. > Yeah I'll add the comment, in reality this is another use case > where a PMD may need to be converted to page table so it makes sense > to contrast use cases. > >> >>> >>> Should I add comments on flag use in other places as well? >>> >> >> It's always a judgement call. I didn't find it necessarry to put a >> comment elsewhere because I think it's pretty obivous that we would >> never care about logging writes to device regions. >> >> However, this made me think, are we making sure that we are not marking >> device mappings as read-only in the wp_range functions? I think it's > > KVM_SET_USER_MEMORY_REGION ioctl doesn't check type of region being > installed/operated on (KVM_MEM_LOG_DIRTY_PAGES), in case of QEMU > these regions wind up in KVMState->KVMSlot[], when > memory_region_add_subregion() is called KVM listener installs it. > For migration and dirty page logging QEMU walks the KVMSlot[] array. > > For QEMU VFIO (PCI) mmap()ing BAR of type IORESOURCE_MEM, > causes the memory region to be added to KVMState->KVMSlot[]. > In that case it's possible to walk KVMState->KVMSlot[] issue > the ioctl and come across a device mapping with normal memory and > WP it's s2ptes (VFIO sets unmigrateble state though). > > But I'm not sure what's there to stop someone calling the ioctl and > install a region with device memory type. Most likely though if you > installed that kind of region migration would be disabled. > > But just for logging use not checking memory type could be an issue. Clarifying above a bit, KVM structures like kvm_run or vgic don't go through KVM_SET_USER_MEMORY_REGION interface (can't think of any other KVM structures). VFIO uses KVM_SET_USER_MEMORY_REGION, user_mem_abort() should resolve the fault. I recall VFIO patch series add that support. It should be ok to write protect MMIO regions installed through KVM_SET_USER_MEMORY_REGION. Although at this time I don't know of use case for logging without migration, so this may not be an issue at all at this time. > >> quite bad if we mark the VCPU interface as read-only for example. >> >> -Christoffer >> >
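Some context on the ioctl path being discussed: the userspace side of enabling dirty logging on a slot and pulling the bitmap looks roughly like the sketch below. The slot number, guest-physical base and size are placeholders rather than values taken from QEMU or VFIO; the structures and ioctls are the standard KVM userspace API.

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

#define RAM_SLOT  0                    /* placeholder slot id */
#define RAM_GPA   0x80000000ULL        /* placeholder guest-physical base */
#define RAM_SIZE  (256ULL << 20)       /* placeholder region size */

static int enable_dirty_logging(int vm_fd, void *guest_ram)
{
	struct kvm_userspace_memory_region region = {
		.slot            = RAM_SLOT,
		.flags           = KVM_MEM_LOG_DIRTY_PAGES, /* kernel allocates memslot->dirty_bitmap */
		.guest_phys_addr = RAM_GPA,
		.memory_size     = RAM_SIZE,
		.userspace_addr  = (uintptr_t)guest_ram,
	};

	/* Re-register the slot with the logging flag set. */
	return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
}

static int fetch_dirty_log(int vm_fd, void *bitmap)
{
	struct kvm_dirty_log log = {
		.slot         = RAM_SLOT,
		.dirty_bitmap = bitmap, /* one bit per page in the slot */
	};

	return ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log);
}

As noted above, nothing in this interface checks what backs the region, which is why a VFIO BAR mapping registered through the same ioctl could in principle end up with logging enabled.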
On Wed, Aug 13, 2014 at 06:20:19PM -0700, Mario Smarduch wrote: > On 08/13/2014 12:30 AM, Christoffer Dall wrote: > > On Tue, Aug 12, 2014 at 06:27:11PM -0700, Mario Smarduch wrote: > >> On 08/12/2014 02:50 AM, Christoffer Dall wrote: > >>> On Mon, Aug 11, 2014 at 06:25:05PM -0700, Mario Smarduch wrote: > >>>> On 08/11/2014 12:13 PM, Christoffer Dall wrote: > >>>>> On Thu, Jul 24, 2014 at 05:56:08PM -0700, Mario Smarduch wrote: > > > > [...] > > > >>>>>> @@ -1151,7 +1170,7 @@ static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data) > >>>>>> { > >>>>>> pte_t *pte = (pte_t *)data; > >>>>>> > >>>>>> - stage2_set_pte(kvm, NULL, gpa, pte, false); > >>>>>> + stage2_set_pte(kvm, NULL, gpa, pte, false, false); > >>>>> > >>>>> why is logging never active if we are called from MMU notifiers? > >>>> > >>>> mmu notifiers update sptes, but I don't see how these updates > >>>> can result in guest dirty pages. Also guest pages are marked dirty > >>>> from 2nd stage page fault handlers (searching through the code). > >>>> > >>> Ok, then add: > >>> > >>> /* > >>> * We can always call stage2_set_pte with logging_active == false, > >>> * because MMU notifiers will have unmapped a huge PMD before calling > >>> * ->change_pte() (which in turn calls kvm_set_spte_hva()) and therefore > >>> * stage2_set_pte() never needs to clear out a huge PMD through this > >>> * calling path. > >>> */ > >> > >> So here on permission change to primary pte's kernel first invalidates > >> related s2ptes followed by ->change_pte calls to synchronize s2ptes. As > >> consequence of invalidation we unmap huge PMDs, if a page falls in that > >> range. > >> > >> Is the comment to point out use of logging flag under various scenarios? > > > > The comment is because when you look at this function it is not obvious > > why we pass logging_active=false, despite logging may actually be > > active. This could suggest that the parameter to stage2_set_pte() > > should be named differently (break_huge_pmds) or something like that, > > but we can also be satisfied with the comment. > > Ok I see, I was thinking you thought it was breaking something. > Yeah I'll add the comment, in reality this is another use case > where a PMD may need to be converted to page table so it makes sense > to contrast use cases. > the hidden knowledge is that MMU notifiers will ensure a huge PMD gets unmapped before trying to change the physical backing of the underlying PTEs, so it's a gigantic kernel bug if this gets called on something mapped with a huge PMD. > > > >> > >> Should I add comments on flag use in other places as well? > >> > > > > It's always a judgement call. I didn't find it necessarry to put a > > comment elsewhere because I think it's pretty obivous that we would > > never care about logging writes to device regions. > > > > However, this made me think, are we making sure that we are not marking > > device mappings as read-only in the wp_range functions? I think it's > > KVM_SET_USER_MEMORY_REGION ioctl doesn't check type of region being > installed/operated on (KVM_MEM_LOG_DIRTY_PAGES), in case of QEMU > these regions wind up in KVMState->KVMSlot[], when > memory_region_add_subregion() is called KVM listener installs it. > For migration and dirty page logging QEMU walks the KVMSlot[] array. > > For QEMU VFIO (PCI) mmap()ing BAR of type IORESOURCE_MEM, > causes the memory region to be added to KVMState->KVMSlot[]. 
> In that case it's possible to walk KVMState->KVMSlot[] issue > the ioctl and come across a device mapping with normal memory and > WP it's s2ptes (VFIO sets unmigrateble state though). > > But I'm not sure what's there to stop someone calling the ioctl and > install a region with device memory type. Most likely though if you > installed that kind of region migration would be disabled. > > But just for logging use not checking memory type could be an issue. > I forgot that the current write-protect'ing is limited to the memory region boundaries, so everything should be fine. If user-space write-protects device memory regions, the worst consequence is that it breaks the guest, but that's its own responsibility, so I don't think you need to change anything. -Christoffer
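The boundary argument is easier to see against the write-protect entry point that the other patches in this series add; a rough sketch of its shape is below (stage2_wp_range is the series' page-table walker and is not reproduced here, and the exact function name may differ):

static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
{
	struct kvm_memslots *slots = kvm_memslots(kvm);
	struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
	phys_addr_t start = memslot->base_gfn << PAGE_SHIFT;
	phys_addr_t end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;

	/* Only the GPA range covered by this memslot is ever write-protected. */
	spin_lock(&kvm->mmu_lock);
	stage2_wp_range(kvm, start, end);
	spin_unlock(&kvm->mmu_lock);
	kvm_flush_remote_tlbs(kvm);
}

Because start and end come straight from the memslot being logged, mappings that are never registered for logging, including kernel-internal ones such as the vgic CPU interface set up through kvm_phys_addr_ioremap(), are never touched; if userspace does enable logging on a device region, the damage stays within that region, which is the conclusion above.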
On 08/18/2014 05:51 AM, Christoffer Dall wrote: > On Wed, Aug 13, 2014 at 06:20:19PM -0700, Mario Smarduch wrote: >> On 08/13/2014 12:30 AM, Christoffer Dall wrote: >>> On Tue, Aug 12, 2014 at 06:27:11PM -0700, Mario Smarduch wrote: >>>> On 08/12/2014 02:50 AM, Christoffer Dall wrote: >>>>> On Mon, Aug 11, 2014 at 06:25:05PM -0700, Mario Smarduch wrote: >>>>>> On 08/11/2014 12:13 PM, Christoffer Dall wrote: >>>>>>> On Thu, Jul 24, 2014 at 05:56:08PM -0700, Mario Smarduch wrote: >>> >>> [...] >>> >>>>>>>> @@ -1151,7 +1170,7 @@ static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data) >> >> Ok I see, I was thinking you thought it was breaking something. >> Yeah I'll add the comment, in reality this is another use case >> where a PMD may need to be converted to page table so it makes sense >> to contrast use cases. >> > > the hidden knowledge is that MMU notifiers will ensure a huge PMD gets > unmapped before trying to change the physical backing of the underlying > PTEs, so it's a gigantic kernel bug if this gets called on something > mapped with a huge PMD. > That's a good way of putting it luckily I was aware previously looking at some other features. > >>> >>>> >>>> Should I add comments on flag use in other places as well? >>>> >>> >>> It's always a judgement call. I didn't find it necessarry to put a >>> comment elsewhere because I think it's pretty obivous that we would >>> never care about logging writes to device regions. >>> >>> However, this made me think, are we making sure that we are not marking >>> device mappings as read-only in the wp_range functions? I think it's >> >> KVM_SET_USER_MEMORY_REGION ioctl doesn't check type of region being >> installed/operated on (KVM_MEM_LOG_DIRTY_PAGES), in case of QEMU >> these regions wind up in KVMState->KVMSlot[], when >> memory_region_add_subregion() is called KVM listener installs it. >> For migration and dirty page logging QEMU walks the KVMSlot[] array. >> >> For QEMU VFIO (PCI) mmap()ing BAR of type IORESOURCE_MEM, >> causes the memory region to be added to KVMState->KVMSlot[]. >> In that case it's possible to walk KVMState->KVMSlot[] issue >> the ioctl and come across a device mapping with normal memory and >> WP it's s2ptes (VFIO sets unmigrateble state though). >> >> But I'm not sure what's there to stop someone calling the ioctl and >> install a region with device memory type. Most likely though if you >> installed that kind of region migration would be disabled. >> >> But just for logging use not checking memory type could be an issue. >> > I forgot that the current write-protect'ing is limited to the memory > region boundaries, so everything should be fine. I looked through this way back, but it was worth to revisit. > > If user-space write-protects device memory regions, the worst > consequence is that it breaks the guest, but that's its own > responsibility, so I don't think you need to change anything. Thanks for the detailed review. I'll go off now, rebase and make the needed changes > > -Christoffer >
This patch adds support for handling 2nd stage page faults during migration: it disables faulting in huge pages and dissolves existing huge pages into page tables. If migration is canceled, huge pages will be used again.

Signed-off-by: Mario Smarduch <m.smarduch@samsung.com>
---
 arch/arm/kvm/mmu.c | 31 +++++++++++++++++++++++++------
 1 file changed, 25 insertions(+), 6 deletions(-)

diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index ca84331..a17812a 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -642,7 +642,8 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
 }
 
 static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
-                          phys_addr_t addr, const pte_t *new_pte, bool iomap)
+                          phys_addr_t addr, const pte_t *new_pte, bool iomap,
+                          bool logging_active)
 {
         pmd_t *pmd;
         pte_t *pte, old_pte;
@@ -657,6 +658,15 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
                 return 0;
         }
 
+        /*
+         * While dirty memory logging, clear PMD entry for huge page and split
+         * into smaller pages, to track dirty memory at page granularity.
+         */
+        if (logging_active && kvm_pmd_huge(*pmd)) {
+                phys_addr_t ipa = pmd_pfn(*pmd) << PAGE_SHIFT;
+                clear_pmd_entry(kvm, pmd, ipa);
+        }
+
         /* Create stage-2 page mappings - Level 2 */
         if (pmd_none(*pmd)) {
                 if (!cache)
@@ -709,7 +719,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
                 if (ret)
                         goto out;
                 spin_lock(&kvm->mmu_lock);
-                ret = stage2_set_pte(kvm, &cache, addr, &pte, true);
+                ret = stage2_set_pte(kvm, &cache, addr, &pte, true, false);
                 spin_unlock(&kvm->mmu_lock);
                 if (ret)
                         goto out;
@@ -926,6 +936,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
         struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
         struct vm_area_struct *vma;
         pfn_t pfn;
+        /* Get logging status, if dirty_bitmap is not NULL then logging is on */
+        #ifdef CONFIG_ARM
+        bool logging_active = !!memslot->dirty_bitmap;
+        #else
+        bool logging_active = false;
+        #endif
 
         write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu));
         if (fault_status == FSC_PERM && !write_fault) {
@@ -936,7 +952,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
         /* Let's check if we will get back a huge page backed by hugetlbfs */
         down_read(&current->mm->mmap_sem);
         vma = find_vma_intersection(current->mm, hva, hva + 1);
-        if (is_vm_hugetlb_page(vma)) {
+        if (is_vm_hugetlb_page(vma) && !logging_active) {
                 hugetlb = true;
                 gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT;
         } else {
@@ -979,7 +995,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
         spin_lock(&kvm->mmu_lock);
         if (mmu_notifier_retry(kvm, mmu_seq))
                 goto out_unlock;
-        if (!hugetlb && !force_pte)
+        if (!hugetlb && !force_pte && !logging_active)
                 hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);
 
         if (hugetlb) {
@@ -998,9 +1014,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
                         kvm_set_pfn_dirty(pfn);
                 }
                 coherent_cache_guest_page(vcpu, hva, PAGE_SIZE);
-                ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, false);
+                ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, false,
+                                     logging_active);
         }
 
+        if (write_fault)
+                mark_page_dirty(kvm, gfn);
 
 out_unlock:
         spin_unlock(&kvm->mmu_lock);
@@ -1151,7 +1170,7 @@ static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data)
 {
         pte_t *pte = (pte_t *)data;
 
-        stage2_set_pte(kvm, NULL, gpa, pte, false);
+        stage2_set_pte(kvm, NULL, gpa, pte, false, false);
 }