
[RFC,16/17] KVM: arm64: Enable parallel stage 2 MMU faults

Message ID 20220415215901.1737897-17-oupton@google.com (mailing list archive)
State New, archived
Series KVM: arm64: Parallelize stage 2 fault handling

Commit Message

Oliver Upton April 15, 2022, 9:59 p.m. UTC
Voila! Since the map walkers are able to work in parallel there is no
need to take the write lock on a stage 2 memory abort. Relax locking
on map operations and cross fingers we got it right.

Signed-off-by: Oliver Upton <oupton@google.com>
---
 arch/arm64/kvm/mmu.c | 21 +++------------------
 1 file changed, 3 insertions(+), 18 deletions(-)

Comments

Ben Gardon April 21, 2022, 4:35 p.m. UTC | #1
On Fri, Apr 15, 2022 at 2:59 PM Oliver Upton <oupton@google.com> wrote:
>
> Voila! Since the map walkers are able to work in parallel there is no
> need to take the write lock on a stage 2 memory abort. Relax locking
> on map operations and cross fingers we got it right.

Might be worth a healthy sprinkle of lockdep on the functions taking
"shared" as an argument, just to make sure the wrong value isn't going
down a callstack you didn't expect.
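
For illustration, an assertion keyed on that argument might look roughly
like the sketch below (the helper name is invented, and it assumes the
walker could reach the owning struct kvm, which, as noted downthread, it
currently cannot):

static void stage2_assert_mmu_lock_held(struct kvm *kvm, bool shared)
{
        /* Shared walks run under the read lock, exclusive walks under the write lock. */
        if (shared)
                lockdep_assert_held_read(&kvm->mmu_lock);
        else
                lockdep_assert_held_write(&kvm->mmu_lock);
}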

>
> Signed-off-by: Oliver Upton <oupton@google.com>
> ---
>  arch/arm64/kvm/mmu.c | 21 +++------------------
>  1 file changed, 3 insertions(+), 18 deletions(-)
>
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index 63cf18cdb978..2881051c3743 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -1127,7 +1127,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>         gfn_t gfn;
>         kvm_pfn_t pfn;
>         bool logging_active = memslot_is_logging(memslot);
> -       bool use_read_lock = false;
>         unsigned long fault_level = kvm_vcpu_trap_get_fault_level(vcpu);
>         unsigned long vma_pagesize, fault_granule;
>         enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
> @@ -1162,8 +1161,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>         if (logging_active) {
>                 force_pte = true;
>                 vma_shift = PAGE_SHIFT;
> -               use_read_lock = (fault_status == FSC_PERM && write_fault &&
> -                                fault_granule == PAGE_SIZE);
>         } else {
>                 vma_shift = get_vma_page_shift(vma, hva);
>         }
> @@ -1267,15 +1264,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>         if (exec_fault && device)
>                 return -ENOEXEC;
>
> -       /*
> -        * To reduce MMU contentions and enhance concurrency during dirty
> -        * logging dirty logging, only acquire read lock for permission
> -        * relaxation.
> -        */
> -       if (use_read_lock)
> -               read_lock(&kvm->mmu_lock);
> -       else
> -               write_lock(&kvm->mmu_lock);
> +       read_lock(&kvm->mmu_lock);
> +

Ugh, I wish we could get rid of the analogous ugly block on x86.

>         pgt = vcpu->arch.hw_mmu->pgt;
>         if (mmu_notifier_retry(kvm, mmu_seq))
>                 goto out_unlock;
> @@ -1322,8 +1312,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>         if (fault_status == FSC_PERM && vma_pagesize == fault_granule) {
>                 ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot);
>         } else {
> -               WARN_ONCE(use_read_lock, "Attempted stage-2 map outside of write lock\n");
> -
>                 ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize,
>                                              __pfn_to_phys(pfn), prot,
>                                              mmu_caches, true);
> @@ -1336,10 +1324,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>         }
>
>  out_unlock:
> -       if (use_read_lock)
> -               read_unlock(&kvm->mmu_lock);
> -       else
> -               write_unlock(&kvm->mmu_lock);
> +       read_unlock(&kvm->mmu_lock);
>         kvm_set_pfn_accessed(pfn);
>         kvm_release_pfn_clean(pfn);
>         return ret != -EAGAIN ? ret : 0;
> --
> 2.36.0.rc0.470.gd361397f0d-goog
>
Oliver Upton April 21, 2022, 4:46 p.m. UTC | #2
On Thu, Apr 21, 2022 at 09:35:27AM -0700, Ben Gardon wrote:
> On Fri, Apr 15, 2022 at 2:59 PM Oliver Upton <oupton@google.com> wrote:
> >
> > Voila! Since the map walkers are able to work in parallel there is no
> > need to take the write lock on a stage 2 memory abort. Relax locking
> > on map operations and cross fingers we got it right.
> 
> Might be worth a healthy sprinkle of lockdep on the functions taking
> "shared" as an argument, just to make sure the wrong value isn't going
> down a callstack you didn't expect.

If we're going to go this route we might need to just punch a pointer
to the vCPU through to the stage 2 table walker. All of this plumbing is
built around the idea that there are multiple tables to manage and
needn't be in the context of a vCPU/VM, which is why I went the WARN()
route instead of better lockdep assertions.
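
To illustrate the trade-off (every name below is invented for the
sketch): the pgtable walkers only carry their caller's notion of
"shared", so a WARN() against that flag is about the strongest check
available, whereas a lockdep assertion would need &kvm->mmu_lock in
scope:

struct stage2_walk_ctx {
        bool shared;    /* caller claims the walk runs under the read lock */
};

static void stage2_check_walk_mode(struct stage2_walk_ctx *ctx, bool shared)
{
        /*
         * No struct kvm (and hence no &kvm->mmu_lock) is reachable from
         * here, so lockdep_assert_held_read()/_write() are not options;
         * the walker can only sanity-check its own view of the locking mode.
         */
        WARN_ON_ONCE(ctx->shared != shared);
}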

> >
> > Signed-off-by: Oliver Upton <oupton@google.com>
> > ---
> >  arch/arm64/kvm/mmu.c | 21 +++------------------
> >  1 file changed, 3 insertions(+), 18 deletions(-)
> >
> > diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> > index 63cf18cdb978..2881051c3743 100644
> > --- a/arch/arm64/kvm/mmu.c
> > +++ b/arch/arm64/kvm/mmu.c
> > @@ -1127,7 +1127,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >         gfn_t gfn;
> >         kvm_pfn_t pfn;
> >         bool logging_active = memslot_is_logging(memslot);
> > -       bool use_read_lock = false;
> >         unsigned long fault_level = kvm_vcpu_trap_get_fault_level(vcpu);
> >         unsigned long vma_pagesize, fault_granule;
> >         enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
> > @@ -1162,8 +1161,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >         if (logging_active) {
> >                 force_pte = true;
> >                 vma_shift = PAGE_SHIFT;
> > -               use_read_lock = (fault_status == FSC_PERM && write_fault &&
> > -                                fault_granule == PAGE_SIZE);
> >         } else {
> >                 vma_shift = get_vma_page_shift(vma, hva);
> >         }
> > @@ -1267,15 +1264,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >         if (exec_fault && device)
> >                 return -ENOEXEC;
> >
> > -       /*
> > -        * To reduce MMU contentions and enhance concurrency during dirty
> > -        * logging dirty logging, only acquire read lock for permission
> > -        * relaxation.
> > -        */
> > -       if (use_read_lock)
> > -               read_lock(&kvm->mmu_lock);
> > -       else
> > -               write_lock(&kvm->mmu_lock);
> > +       read_lock(&kvm->mmu_lock);
> > +
> 
> Ugh, I wish we could get rid of the analogous ugly block on x86.

Maybe we could fold it into an MMU macro in the arch-generic scope?
Conditional locking is smelly; I was very pleased to delete these lines :)
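
A rough sketch of what such a helper could look like (macro names are
invented, and this assumes an rwlock-backed mmu_lock, which is not the
case on every x86 configuration):

#define kvm_mmu_lock_cond(kvm, shared)                  \
do {                                                    \
        if (shared)                                     \
                read_lock(&(kvm)->mmu_lock);            \
        else                                            \
                write_lock(&(kvm)->mmu_lock);           \
} while (0)

#define kvm_mmu_unlock_cond(kvm, shared)                \
do {                                                    \
        if (shared)                                     \
                read_unlock(&(kvm)->mmu_lock);          \
        else                                            \
                write_unlock(&(kvm)->mmu_lock);         \
} while (0)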

--
Thanks,
Oliver
Ben Gardon April 21, 2022, 5:03 p.m. UTC | #3
On Thu, Apr 21, 2022 at 9:46 AM Oliver Upton <oupton@google.com> wrote:
>
> On Thu, Apr 21, 2022 at 09:35:27AM -0700, Ben Gardon wrote:
> > On Fri, Apr 15, 2022 at 2:59 PM Oliver Upton <oupton@google.com> wrote:
> > >
> > > Voila! Since the map walkers are able to work in parallel there is no
> > > need to take the write lock on a stage 2 memory abort. Relax locking
> > > on map operations and cross fingers we got it right.
> >
> > Might be worth a healthy sprinkle of lockdep on the functions taking
> > "shared" as an argument, just to make sure the wrong value isn't going
> > down a callstack you didn't expect.
>
> If we're going to go this route we might need to just punch a pointer
> to the vCPU through to the stage 2 table walker. All of this plumbing is
> built around the idea that there are multiple tables to manage and
> needn't be in the context of a vCPU/VM, which is why I went the WARN()
> route instead of better lockdep assertions.

Oh right, it didn't even occur to me that those functions wouldn't
have a vCPU / KVM pointer.

>
> > >
> > > Signed-off-by: Oliver Upton <oupton@google.com>
> > > ---
> > >  arch/arm64/kvm/mmu.c | 21 +++------------------
> > >  1 file changed, 3 insertions(+), 18 deletions(-)
> > >
> > > diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> > > index 63cf18cdb978..2881051c3743 100644
> > > --- a/arch/arm64/kvm/mmu.c
> > > +++ b/arch/arm64/kvm/mmu.c
> > > @@ -1127,7 +1127,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> > >         gfn_t gfn;
> > >         kvm_pfn_t pfn;
> > >         bool logging_active = memslot_is_logging(memslot);
> > > -       bool use_read_lock = false;
> > >         unsigned long fault_level = kvm_vcpu_trap_get_fault_level(vcpu);
> > >         unsigned long vma_pagesize, fault_granule;
> > >         enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
> > > @@ -1162,8 +1161,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> > >         if (logging_active) {
> > >                 force_pte = true;
> > >                 vma_shift = PAGE_SHIFT;
> > > -               use_read_lock = (fault_status == FSC_PERM && write_fault &&
> > > -                                fault_granule == PAGE_SIZE);
> > >         } else {
> > >                 vma_shift = get_vma_page_shift(vma, hva);
> > >         }
> > > @@ -1267,15 +1264,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> > >         if (exec_fault && device)
> > >                 return -ENOEXEC;
> > >
> > > -       /*
> > > -        * To reduce MMU contentions and enhance concurrency during dirty
> > > -        * logging dirty logging, only acquire read lock for permission
> > > -        * relaxation.
> > > -        */
> > > -       if (use_read_lock)
> > > -               read_lock(&kvm->mmu_lock);
> > > -       else
> > > -               write_lock(&kvm->mmu_lock);
> > > +       read_lock(&kvm->mmu_lock);
> > > +
> >
> > Ugh, I wish we could get rid of the analogous ugly block on x86.
>
> Maybe we could fold it into an MMU macro in the arch-generic scope?
> Conditional locking is smelly; I was very pleased to delete these lines :)

Smelly indeed. I don't think hiding it behind a macro would really
help. It's just something we'll have to live with in x86.

>
> --
> Thanks,
> Oliver

Patch

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 63cf18cdb978..2881051c3743 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1127,7 +1127,6 @@  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	gfn_t gfn;
 	kvm_pfn_t pfn;
 	bool logging_active = memslot_is_logging(memslot);
-	bool use_read_lock = false;
 	unsigned long fault_level = kvm_vcpu_trap_get_fault_level(vcpu);
 	unsigned long vma_pagesize, fault_granule;
 	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
@@ -1162,8 +1161,6 @@  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	if (logging_active) {
 		force_pte = true;
 		vma_shift = PAGE_SHIFT;
-		use_read_lock = (fault_status == FSC_PERM && write_fault &&
-				 fault_granule == PAGE_SIZE);
 	} else {
 		vma_shift = get_vma_page_shift(vma, hva);
 	}
@@ -1267,15 +1264,8 @@  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	if (exec_fault && device)
 		return -ENOEXEC;
 
-	/*
-	 * To reduce MMU contentions and enhance concurrency during dirty
-	 * logging dirty logging, only acquire read lock for permission
-	 * relaxation.
-	 */
-	if (use_read_lock)
-		read_lock(&kvm->mmu_lock);
-	else
-		write_lock(&kvm->mmu_lock);
+	read_lock(&kvm->mmu_lock);
+
 	pgt = vcpu->arch.hw_mmu->pgt;
 	if (mmu_notifier_retry(kvm, mmu_seq))
 		goto out_unlock;
@@ -1322,8 +1312,6 @@  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	if (fault_status == FSC_PERM && vma_pagesize == fault_granule) {
 		ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot);
 	} else {
-		WARN_ONCE(use_read_lock, "Attempted stage-2 map outside of write lock\n");
-
 		ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize,
 					     __pfn_to_phys(pfn), prot,
 					     mmu_caches, true);
@@ -1336,10 +1324,7 @@  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	}
 
 out_unlock:
-	if (use_read_lock)
-		read_unlock(&kvm->mmu_lock);
-	else
-		write_unlock(&kvm->mmu_lock);
+	read_unlock(&kvm->mmu_lock);
 	kvm_set_pfn_accessed(pfn);
 	kvm_release_pfn_clean(pfn);
 	return ret != -EAGAIN ? ret : 0;