
[RFC,2/3] KVM: arm64: Add fast path to handle permission relaxation during dirty logging

Message ID 20220110210441.2074798-3-jingzhangos@google.com (mailing list archive)
State New, archived
Series ARM64: Guest performance improvement during dirty logging

Commit Message

Jing Zhang Jan. 10, 2022, 9:04 p.m. UTC
To reduce MMU lock contention during dirty logging, all permission
relaxation operations would be performed under read lock.

Signed-off-by: Jing Zhang <jingzhangos@google.com>
---
 arch/arm64/kvm/mmu.c | 50 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

Comments

Marc Zyngier Jan. 11, 2022, 10:22 a.m. UTC | #1
On Mon, 10 Jan 2022 21:04:40 +0000,
Jing Zhang <jingzhangos@google.com> wrote:
> 
> To reduce MMU lock contention during dirty logging, all permission
> relaxation operations would be performed under read lock.
> 
> Signed-off-by: Jing Zhang <jingzhangos@google.com>
> ---
>  arch/arm64/kvm/mmu.c | 50 ++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 50 insertions(+)
> 
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index cafd5813c949..dd1f43fba4b0 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -1063,6 +1063,54 @@ static int sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn,
>  	return 0;
>  }
>  
> +static bool fast_mark_writable(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> +		struct kvm_memory_slot *memslot, unsigned long fault_status)
> +{
> +	int ret;
> +	bool writable;
> +	bool write_fault = kvm_is_write_fault(vcpu);
> +	gfn_t gfn = fault_ipa >> PAGE_SHIFT;
> +	kvm_pfn_t pfn;
> +	struct kvm *kvm = vcpu->kvm;
> +	bool logging_active = memslot_is_logging(memslot);
> +	unsigned long fault_level = kvm_vcpu_trap_get_fault_level(vcpu);
> +	unsigned long fault_granule;
> +
> +	fault_granule = 1UL << ARM64_HW_PGTABLE_LEVEL_SHIFT(fault_level);
> +
> +	/* Make sure the fault can be handled in the fast path.
> +	 * Only handle write permission fault on non-hugepage during dirty
> +	 * logging period.
> +	 */

Not the correct comment format.
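
For reference, the kernel's multi-line comment style puts the opening /* on a
line by itself, so the block above would be expected to read something like:

	/*
	 * Make sure the fault can be handled in the fast path.
	 * Only handle write permission faults on non-hugepage mappings
	 * during the dirty logging period.
	 */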

> +	if (fault_status != FSC_PERM || fault_granule != PAGE_SIZE
> +			|| !logging_active || !write_fault)
> +		return false;

This is all reinventing the logic that already exists in
user_mem_abort(). I'm sympathetic to the effort not to bloat it even
more, but code duplication doesn't help either.

> +
> +
> +	/* Pin the pfn to make sure it couldn't be freed and be resued for
> +	 * another gfn.
> +	 */
> +	pfn = __gfn_to_pfn_memslot(memslot, gfn, true, NULL,
> +				   write_fault, &writable, NULL);
> +	if (is_error_pfn(pfn) || !writable)
> +		return false;

What happens if we hit a non-writable mapping? Don't we leak a page
reference?
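
Concretely, once __gfn_to_pfn_memslot() has returned a valid pfn the helper
holds a reference that has to be dropped on every exit path; a minimal sketch
of the missing release (not the author's fix):

	if (is_error_pfn(pfn))
		return false;

	if (!writable) {
		/* Drop the reference taken by __gfn_to_pfn_memslot(). */
		kvm_release_pfn_clean(pfn);
		return false;
	}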

> +
> +	read_lock(&kvm->mmu_lock);
> +	ret = kvm_pgtable_stage2_relax_perms(
> +			vcpu->arch.hw_mmu->pgt, fault_ipa, PAGE_HYP);

PAGE_HYP? Err... no. KVM_PGTABLE_PROT_RW, more likely. Yes, they
expand to the same thing, but you are not dealing with nVHE EL2 S1
page tables here.
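
In other words, the call would be expected to use the stage-2 protection
flags, along the lines of:

	ret = kvm_pgtable_stage2_relax_perms(vcpu->arch.hw_mmu->pgt,
					     fault_ipa, KVM_PGTABLE_PROT_RW);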

> +
> +	if (!ret) {
> +		kvm_set_pfn_dirty(pfn);
> +		mark_page_dirty_in_slot(kvm, memslot, gfn);
> +	}
> +	read_unlock(&kvm->mmu_lock);
> +
> +	kvm_set_pfn_accessed(pfn);
> +	kvm_release_pfn_clean(pfn);
> +
> +	return true;
> +}
> +
>  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  			  struct kvm_memory_slot *memslot, unsigned long hva,
>  			  unsigned long fault_status)
> @@ -1085,6 +1133,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
>  	struct kvm_pgtable *pgt;
>  
> +	if (fast_mark_writable(vcpu, fault_ipa, memslot, fault_status))
> +		return 0;
>  	fault_granule = 1UL << ARM64_HW_PGTABLE_LEVEL_SHIFT(fault_level);
>  	write_fault = kvm_is_write_fault(vcpu);
>  	exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);

You are bypassing all sort of checks that I want to keep. Please
integrate this in user_mem_abort instead of this side hack.
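
To sketch that direction (illustrative only, assuming the rwlock conversion
earlier in this series; use_read_lock is a hypothetical local): user_mem_abort()
could keep all of its existing checks and merely pick the lock flavour, e.g.:

	/*
	 * Permission relaxation on an already-mapped page only updates the
	 * existing PTE, so it can run under the read side of mmu_lock.
	 */
	use_read_lock = write_fault && logging_active &&
			fault_status == FSC_PERM &&
			fault_granule == PAGE_SIZE;

	if (use_read_lock)
		read_lock(&kvm->mmu_lock);
	else
		write_lock(&kvm->mmu_lock);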

Thanks,

	M.
Marc Zyngier Jan. 11, 2022, 10:50 a.m. UTC | #2
Coming back to this, as it does bother me.

On Mon, 10 Jan 2022 21:04:40 +0000,
Jing Zhang <jingzhangos@google.com> wrote:
> 
> To reduce MMU lock contention during dirty logging, all permission
> relaxation operations would be performed under read lock.
> 
> Signed-off-by: Jing Zhang <jingzhangos@google.com>
> ---
>  arch/arm64/kvm/mmu.c | 50 ++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 50 insertions(+)
> 
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index cafd5813c949..dd1f43fba4b0 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -1063,6 +1063,54 @@ static int sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn,
>  	return 0;
>  }
>  
> +static bool fast_mark_writable(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> +		struct kvm_memory_slot *memslot, unsigned long fault_status)
> +{
> +	int ret;
> +	bool writable;
> +	bool write_fault = kvm_is_write_fault(vcpu);
> +	gfn_t gfn = fault_ipa >> PAGE_SHIFT;
> +	kvm_pfn_t pfn;
> +	struct kvm *kvm = vcpu->kvm;
> +	bool logging_active = memslot_is_logging(memslot);
> +	unsigned long fault_level = kvm_vcpu_trap_get_fault_level(vcpu);
> +	unsigned long fault_granule;
> +
> +	fault_granule = 1UL << ARM64_HW_PGTABLE_LEVEL_SHIFT(fault_level);
> +
> +	/* Make sure the fault can be handled in the fast path.
> +	 * Only handle write permission fault on non-hugepage during dirty
> +	 * logging period.
> +	 */
> +	if (fault_status != FSC_PERM || fault_granule != PAGE_SIZE
> +			|| !logging_active || !write_fault)
> +		return false;
> +
> +
> +	/* Pin the pfn to make sure it couldn't be freed and be resued for
> +	 * another gfn.
> +	 */
> +	pfn = __gfn_to_pfn_memslot(memslot, gfn, true, NULL,
> +				   write_fault, &writable, NULL);

Why the requirement to be atomic? Once this returns, the page will
have an elevated refcount, atomic or not. Given that we're not in an
environment that requires atomicity (we're fully preemptible at this
stage), I wonder what this is achieving.
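
For comparison, the non-atomic form (the third argument is 'atomic'), which is
what user_mem_abort() already uses, would simply be:

	pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
				   write_fault, &writable, NULL);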

> +	if (is_error_pfn(pfn) || !writable)
> +		return false;
> +
> +	read_lock(&kvm->mmu_lock);

You also dropped the hazarding against a concurrent MMU notifier. Why
is it safe to do so?
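
For reference, user_mem_abort() hazards against a concurrent invalidation by
sampling the notifier sequence count before grabbing the pfn and rechecking it
under mmu_lock; a rough sketch of how this fast path could keep that pattern
(mmu_seq is a local here, error handling trimmed):

	mmu_seq = vcpu->kvm->mmu_notifier_seq;
	/* Sample the sequence count before taking the page reference. */
	smp_rmb();

	pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
				   write_fault, &writable, NULL);

	read_lock(&kvm->mmu_lock);
	if (mmu_notifier_retry(kvm, mmu_seq)) {
		/* An invalidation raced with us; let the guest retry. */
		read_unlock(&kvm->mmu_lock);
		kvm_release_pfn_clean(pfn);
		return false;
	}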

> +	ret = kvm_pgtable_stage2_relax_perms(
> +			vcpu->arch.hw_mmu->pgt, fault_ipa, PAGE_HYP);
> +
> +	if (!ret) {
> +		kvm_set_pfn_dirty(pfn);
> +		mark_page_dirty_in_slot(kvm, memslot, gfn);
> +	}
> +	read_unlock(&kvm->mmu_lock);
> +
> +	kvm_set_pfn_accessed(pfn);
> +	kvm_release_pfn_clean(pfn);
> +
> +	return true;
> +}
> +
>  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  			  struct kvm_memory_slot *memslot, unsigned long hva,
>  			  unsigned long fault_status)
> @@ -1085,6 +1133,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
>  	struct kvm_pgtable *pgt;
>  
> +	if (fast_mark_writable(vcpu, fault_ipa, memslot, fault_status))
> +		return 0;
>  	fault_granule = 1UL << ARM64_HW_PGTABLE_LEVEL_SHIFT(fault_level);
>  	write_fault = kvm_is_write_fault(vcpu);
>  	exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);

Thanks,

	M.
Jing Zhang Jan. 11, 2022, 10:12 p.m. UTC | #3
Hi Marc,

On Tue, Jan 11, 2022 at 2:50 AM Marc Zyngier <maz@kernel.org> wrote:
>
> Coming back to this, as it does bother me.
>
> On Mon, 10 Jan 2022 21:04:40 +0000,
> Jing Zhang <jingzhangos@google.com> wrote:
> >
> > To reduce MMU lock contention during dirty logging, all permission
> > relaxation operations would be performed under read lock.
> >
> > Signed-off-by: Jing Zhang <jingzhangos@google.com>
> > ---
> >  arch/arm64/kvm/mmu.c | 50 ++++++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 50 insertions(+)
> >
> > diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> > index cafd5813c949..dd1f43fba4b0 100644
> > --- a/arch/arm64/kvm/mmu.c
> > +++ b/arch/arm64/kvm/mmu.c
> > @@ -1063,6 +1063,54 @@ static int sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn,
> >       return 0;
> >  }
> >
> > +static bool fast_mark_writable(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> > +             struct kvm_memory_slot *memslot, unsigned long fault_status)
> > +{
> > +     int ret;
> > +     bool writable;
> > +     bool write_fault = kvm_is_write_fault(vcpu);
> > +     gfn_t gfn = fault_ipa >> PAGE_SHIFT;
> > +     kvm_pfn_t pfn;
> > +     struct kvm *kvm = vcpu->kvm;
> > +     bool logging_active = memslot_is_logging(memslot);
> > +     unsigned long fault_level = kvm_vcpu_trap_get_fault_level(vcpu);
> > +     unsigned long fault_granule;
> > +
> > +     fault_granule = 1UL << ARM64_HW_PGTABLE_LEVEL_SHIFT(fault_level);
> > +
> > +     /* Make sure the fault can be handled in the fast path.
> > +      * Only handle write permission fault on non-hugepage during dirty
> > +      * logging period.
> > +      */
> > +     if (fault_status != FSC_PERM || fault_granule != PAGE_SIZE
> > +                     || !logging_active || !write_fault)
> > +             return false;
> > +
> > +
> > +     /* Pin the pfn to make sure it couldn't be freed and be resued for
> > +      * another gfn.
> > +      */
> > +     pfn = __gfn_to_pfn_memslot(memslot, gfn, true, NULL,
> > +                                write_fault, &writable, NULL);
>
> Why the requirement to be atomic? Once this returns, the page will
> have an elevated refcount, atomic or not. Given that we're not in an
> environment that requires atomicity (we're fully preemptible at this
> stage), I wonder what this is achieving.
>
> > +     if (is_error_pfn(pfn) || !writable)
> > +             return false;
> > +
> > +     read_lock(&kvm->mmu_lock);
>
> You also dropped the hazarding against a concurrent MMU notifier. Why
> is it safe to do so?
>
> > +     ret = kvm_pgtable_stage2_relax_perms(
> > +                     vcpu->arch.hw_mmu->pgt, fault_ipa, PAGE_HYP);
> > +
> > +     if (!ret) {
> > +             kvm_set_pfn_dirty(pfn);
> > +             mark_page_dirty_in_slot(kvm, memslot, gfn);
> > +     }
> > +     read_unlock(&kvm->mmu_lock);
> > +
> > +     kvm_set_pfn_accessed(pfn);
> > +     kvm_release_pfn_clean(pfn);
> > +
> > +     return true;
> > +}
> > +
> >  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >                         struct kvm_memory_slot *memslot, unsigned long hva,
> >                         unsigned long fault_status)
> > @@ -1085,6 +1133,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >       enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
> >       struct kvm_pgtable *pgt;
> >
> > +     if (fast_mark_writable(vcpu, fault_ipa, memslot, fault_status))
> > +             return 0;
> >       fault_granule = 1UL << ARM64_HW_PGTABLE_LEVEL_SHIFT(fault_level);
> >       write_fault = kvm_is_write_fault(vcpu);
> >       exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
>
> Thanks,
>
>         M.
>
> --
> Without deviation from the norm, progress is not possible.
Appreciate all the comments here. I'll refactor the patch to implement
the fast path in user_mem_abort and address all the problems you
mentioned.
Thanks,
Jing

Patch

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index cafd5813c949..dd1f43fba4b0 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1063,6 +1063,54 @@  static int sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn,
 	return 0;
 }
 
+static bool fast_mark_writable(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
+		struct kvm_memory_slot *memslot, unsigned long fault_status)
+{
+	int ret;
+	bool writable;
+	bool write_fault = kvm_is_write_fault(vcpu);
+	gfn_t gfn = fault_ipa >> PAGE_SHIFT;
+	kvm_pfn_t pfn;
+	struct kvm *kvm = vcpu->kvm;
+	bool logging_active = memslot_is_logging(memslot);
+	unsigned long fault_level = kvm_vcpu_trap_get_fault_level(vcpu);
+	unsigned long fault_granule;
+
+	fault_granule = 1UL << ARM64_HW_PGTABLE_LEVEL_SHIFT(fault_level);
+
+	/* Make sure the fault can be handled in the fast path.
+	 * Only handle write permission fault on non-hugepage during dirty
+	 * logging period.
+	 */
+	if (fault_status != FSC_PERM || fault_granule != PAGE_SIZE
+			|| !logging_active || !write_fault)
+		return false;
+
+
+	/* Pin the pfn to make sure it couldn't be freed and be resued for
+	 * another gfn.
+	 */
+	pfn = __gfn_to_pfn_memslot(memslot, gfn, true, NULL,
+				   write_fault, &writable, NULL);
+	if (is_error_pfn(pfn) || !writable)
+		return false;
+
+	read_lock(&kvm->mmu_lock);
+	ret = kvm_pgtable_stage2_relax_perms(
+			vcpu->arch.hw_mmu->pgt, fault_ipa, PAGE_HYP);
+
+	if (!ret) {
+		kvm_set_pfn_dirty(pfn);
+		mark_page_dirty_in_slot(kvm, memslot, gfn);
+	}
+	read_unlock(&kvm->mmu_lock);
+
+	kvm_set_pfn_accessed(pfn);
+	kvm_release_pfn_clean(pfn);
+
+	return true;
+}
+
 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 			  struct kvm_memory_slot *memslot, unsigned long hva,
 			  unsigned long fault_status)
@@ -1085,6 +1133,8 @@  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
 	struct kvm_pgtable *pgt;
 
+	if (fast_mark_writable(vcpu, fault_ipa, memslot, fault_status))
+		return 0;
 	fault_granule = 1UL << ARM64_HW_PGTABLE_LEVEL_SHIFT(fault_level);
 	write_fault = kvm_is_write_fault(vcpu);
 	exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);