
[v3,08/11] KVM: arm64: Handle guest_memfd()-backed guest page faults

Message ID 20250211121128.703390-9-tabba@google.com (mailing list archive)
State New
Series KVM: Mapping guest_memfd backed memory at the host for software protected VMs

Commit Message

Fuad Tabba Feb. 11, 2025, 12:11 p.m. UTC
Add arm64 support for handling guest page faults on guest_memfd
backed memslots.

For now, the fault granule is restricted to PAGE_SIZE.

Signed-off-by: Fuad Tabba <tabba@google.com>
---
 arch/arm64/kvm/mmu.c     | 84 ++++++++++++++++++++++++++--------------
 include/linux/kvm_host.h |  5 +++
 virt/kvm/kvm_main.c      |  5 ---
 3 files changed, 61 insertions(+), 33 deletions(-)

Comments

Quentin Perret Feb. 11, 2025, 3:57 p.m. UTC | #1
Hey Fuad,

On Tuesday 11 Feb 2025 at 12:11:24 (+0000), Fuad Tabba wrote:
> Add arm64 support for handling guest page faults on guest_memfd
> backed memslots.
> 
> For now, the fault granule is restricted to PAGE_SIZE.
> 
> Signed-off-by: Fuad Tabba <tabba@google.com>
> ---
>  arch/arm64/kvm/mmu.c     | 84 ++++++++++++++++++++++++++--------------
>  include/linux/kvm_host.h |  5 +++
>  virt/kvm/kvm_main.c      |  5 ---
>  3 files changed, 61 insertions(+), 33 deletions(-)
> 
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index b6c0acb2311c..305060518766 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -1454,6 +1454,33 @@ static bool kvm_vma_mte_allowed(struct vm_area_struct *vma)
>  	return vma->vm_flags & VM_MTE_ALLOWED;
>  }
>  
> +static kvm_pfn_t faultin_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
> +			     gfn_t gfn, bool write_fault, bool *writable,
> +			     struct page **page, bool is_private)
> +{
> +	kvm_pfn_t pfn;
> +	int ret;
> +
> +	if (!is_private)
> +		return __kvm_faultin_pfn(slot, gfn, write_fault ? FOLL_WRITE : 0, writable, page);
> +
> +	*writable = false;
> +
> +	if (WARN_ON_ONCE(write_fault && memslot_is_readonly(slot)))
> +		return KVM_PFN_ERR_NOSLOT_MASK;

I believe this check is superfluous, we should decide to report an MMIO
exit to userspace for write faults to RO memslots and not get anywhere
near user_mem_abort(). And nit but the error code should probably be
KVM_PFN_ERR_RO_FAULT or something instead?
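
(For context, the generic hva/permission check in kvm_handle_guest_abort()
already routes such faults to the MMIO path before user_mem_abort() is
reached; roughly, paraphrasing the existing code, so treat the details as
an approximation:)

	gfn = ipa >> PAGE_SHIFT;
	memslot = gfn_to_memslot(vcpu->kvm, gfn);
	hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
	write_fault = kvm_is_write_fault(vcpu);
	if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
		/*
		 * No memslot, or a write fault to an RO memslot: handled
		 * as MMIO (io_mem_abort()); user_mem_abort() never runs.
		 */
	}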

> +
> +	ret = kvm_gmem_get_pfn(kvm, slot, gfn, &pfn, page, NULL);
> +	if (!ret) {
> +		*writable = write_fault;

In normal KVM, if we're not dirty logging we'll actively map the page as
writable if both the memslot and the userspace mappings are writable.
With gmem, the latter doesn't make much sense, but essentially the
underlying page should really be writable (e.g. no CoW getting in the
way and such?). If so, then perhaps make this

		*writable = !memslot_is_readonly(slot);

Wdyt?

> +		return pfn;
> +	}
> +
> +	if (ret == -EHWPOISON)
> +		return KVM_PFN_ERR_HWPOISON;
> +
> +	return KVM_PFN_ERR_NOSLOT_MASK;
> +}
> +
>  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  			  struct kvm_s2_trans *nested,
>  			  struct kvm_memory_slot *memslot, unsigned long hva,
> @@ -1461,25 +1488,26 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  {
>  	int ret = 0;
>  	bool write_fault, writable;
> -	bool exec_fault, mte_allowed;
> +	bool exec_fault, mte_allowed = false;
>  	bool device = false, vfio_allow_any_uc = false;
>  	unsigned long mmu_seq;
>  	phys_addr_t ipa = fault_ipa;
>  	struct kvm *kvm = vcpu->kvm;
> -	struct vm_area_struct *vma;
> +	struct vm_area_struct *vma = NULL;
>  	short vma_shift;
>  	void *memcache;
> -	gfn_t gfn;
> +	gfn_t gfn = ipa >> PAGE_SHIFT;
>  	kvm_pfn_t pfn;
>  	bool logging_active = memslot_is_logging(memslot);
> -	bool force_pte = logging_active || is_protected_kvm_enabled();
> -	long vma_pagesize, fault_granule;
> +	bool is_private = kvm_mem_is_private(kvm, gfn);

Just trying to understand the locking rule for the xarray behind this.
Is it kvm->srcu that protects it for reads here? Something else?

> +	bool force_pte = logging_active || is_private || is_protected_kvm_enabled();
> +	long vma_pagesize, fault_granule = PAGE_SIZE;
>  	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
>  	struct kvm_pgtable *pgt;
>  	struct page *page;
>  	enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED;
>  
> -	if (fault_is_perm)
> +	if (fault_is_perm && !is_private)

Nit: not strictly necessary I think.

>  		fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu);
>  	write_fault = kvm_is_write_fault(vcpu);
>  	exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
> @@ -1510,24 +1538,30 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  			return ret;
>  	}
>  
> +	mmap_read_lock(current->mm);
> +
>  	/*
>  	 * Let's check if we will get back a huge page backed by hugetlbfs, or
>  	 * get block mapping for device MMIO region.
>  	 */
> -	mmap_read_lock(current->mm);
> -	vma = vma_lookup(current->mm, hva);
> -	if (unlikely(!vma)) {
> -		kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
> -		mmap_read_unlock(current->mm);
> -		return -EFAULT;
> -	}
> +	if (!is_private) {
> +		vma = vma_lookup(current->mm, hva);
> +		if (unlikely(!vma)) {
> +			kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
> +			mmap_read_unlock(current->mm);
> +			return -EFAULT;
> +		}
>  
> -	/*
> -	 * logging_active is guaranteed to never be true for VM_PFNMAP
> -	 * memslots.
> -	 */
> -	if (WARN_ON_ONCE(logging_active && (vma->vm_flags & VM_PFNMAP)))
> -		return -EFAULT;
> +		/*
> +		 * logging_active is guaranteed to never be true for VM_PFNMAP
> +		 * memslots.
> +		 */
> +		if (WARN_ON_ONCE(logging_active && (vma->vm_flags & VM_PFNMAP)))
> +			return -EFAULT;
> +
> +		vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
> +		mte_allowed = kvm_vma_mte_allowed(vma);
> +	}
>  
>  	if (force_pte)
>  		vma_shift = PAGE_SHIFT;
> @@ -1597,18 +1631,13 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  		ipa &= ~(vma_pagesize - 1);
>  	}
>  
> -	gfn = ipa >> PAGE_SHIFT;
> -	mte_allowed = kvm_vma_mte_allowed(vma);
> -
> -	vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
> -
>  	/* Don't use the VMA after the unlock -- it may have vanished */
>  	vma = NULL;
>  
>  	/*
>  	 * Read mmu_invalidate_seq so that KVM can detect if the results of
> -	 * vma_lookup() or __kvm_faultin_pfn() become stale prior to
> -	 * acquiring kvm->mmu_lock.
> +	 * vma_lookup() or faultin_pfn() become stale prior to acquiring
> +	 * kvm->mmu_lock.
>  	 *
>  	 * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
>  	 * with the smp_wmb() in kvm_mmu_invalidate_end().
> @@ -1616,8 +1645,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  	mmu_seq = vcpu->kvm->mmu_invalidate_seq;
>  	mmap_read_unlock(current->mm);
>  
> -	pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0,
> -				&writable, &page);
> +	pfn = faultin_pfn(kvm, memslot, gfn, write_fault, &writable, &page, is_private);
>  	if (pfn == KVM_PFN_ERR_HWPOISON) {
>  		kvm_send_hwpoison_signal(hva, vma_shift);
>  		return 0;
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 39fd6e35c723..415c6274aede 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -1882,6 +1882,11 @@ static inline int memslot_id(struct kvm *kvm, gfn_t gfn)
>  	return gfn_to_memslot(kvm, gfn)->id;
>  }
>  
> +static inline bool memslot_is_readonly(const struct kvm_memory_slot *slot)
> +{
> +	return slot->flags & KVM_MEM_READONLY;
> +}
> +
>  static inline gfn_t
>  hva_to_gfn_memslot(unsigned long hva, struct kvm_memory_slot *slot)
>  {
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 38f0f402ea46..3e40acb9f5c0 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -2624,11 +2624,6 @@ unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
>  	return size;
>  }
>  
> -static bool memslot_is_readonly(const struct kvm_memory_slot *slot)
> -{
> -	return slot->flags & KVM_MEM_READONLY;
> -}
> -
>  static unsigned long __gfn_to_hva_many(const struct kvm_memory_slot *slot, gfn_t gfn,
>  				       gfn_t *nr_pages, bool write)
>  {
> -- 
> 2.48.1.502.g6dc24dfdaf-goog
>
Fuad Tabba Feb. 11, 2025, 4:13 p.m. UTC | #2
Hi Quentin,

On Tue, 11 Feb 2025 at 15:57, Quentin Perret <qperret@google.com> wrote:
>
> Hey Fuad,
>
> On Tuesday 11 Feb 2025 at 12:11:24 (+0000), Fuad Tabba wrote:
> > Add arm64 support for handling guest page faults on guest_memfd
> > backed memslots.
> >
> > For now, the fault granule is restricted to PAGE_SIZE.
> >
> > Signed-off-by: Fuad Tabba <tabba@google.com>
> > ---
> >  arch/arm64/kvm/mmu.c     | 84 ++++++++++++++++++++++++++--------------
> >  include/linux/kvm_host.h |  5 +++
> >  virt/kvm/kvm_main.c      |  5 ---
> >  3 files changed, 61 insertions(+), 33 deletions(-)
> >
> > diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> > index b6c0acb2311c..305060518766 100644
> > --- a/arch/arm64/kvm/mmu.c
> > +++ b/arch/arm64/kvm/mmu.c
> > @@ -1454,6 +1454,33 @@ static bool kvm_vma_mte_allowed(struct vm_area_struct *vma)
> >       return vma->vm_flags & VM_MTE_ALLOWED;
> >  }
> >
> > +static kvm_pfn_t faultin_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
> > +                          gfn_t gfn, bool write_fault, bool *writable,
> > +                          struct page **page, bool is_private)
> > +{
> > +     kvm_pfn_t pfn;
> > +     int ret;
> > +
> > +     if (!is_private)
> > +             return __kvm_faultin_pfn(slot, gfn, write_fault ? FOLL_WRITE : 0, writable, page);
> > +
> > +     *writable = false;
> > +
> > +     if (WARN_ON_ONCE(write_fault && memslot_is_readonly(slot)))
> > +             return KVM_PFN_ERR_NOSLOT_MASK;
>
> I believe this check is superfluous, we should decide to report an MMIO
> exit to userspace for write faults to RO memslots and not get anywhere
> near user_mem_abort(). And nit but the error code should probably be
> KVM_PFN_ERR_RO_FAULT or something instead?

I tried to replicate the behavior of __kvm_faultin_pfn() here (but got
the wrong error!). I think you're right though that in the arm64 case,
this check isn't needed. Should I fix the return error and keep the
warning though?

> > +
> > +     ret = kvm_gmem_get_pfn(kvm, slot, gfn, &pfn, page, NULL);
> > +     if (!ret) {
> > +             *writable = write_fault;
>
> In normal KVM, if we're not dirty logging we'll actively map the page as
> writable if both the memslot and the userspace mappings are writable.
> With gmem, the latter doesn't make much sense, but essentially the
> underlying page should really be writable (e.g. no CoW getting in the
> way and such?). If so, then perhaps make this
>
>                 *writable = !memslot_is_readonly(slot);
>
> Wdyt?

Ack.

> > +             return pfn;
> > +     }
> > +
> > +     if (ret == -EHWPOISON)
> > +             return KVM_PFN_ERR_HWPOISON;
> > +
> > +     return KVM_PFN_ERR_NOSLOT_MASK;
> > +}
> > +
> >  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >                         struct kvm_s2_trans *nested,
> >                         struct kvm_memory_slot *memslot, unsigned long hva,
> > @@ -1461,25 +1488,26 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >  {
> >       int ret = 0;
> >       bool write_fault, writable;
> > -     bool exec_fault, mte_allowed;
> > +     bool exec_fault, mte_allowed = false;
> >       bool device = false, vfio_allow_any_uc = false;
> >       unsigned long mmu_seq;
> >       phys_addr_t ipa = fault_ipa;
> >       struct kvm *kvm = vcpu->kvm;
> > -     struct vm_area_struct *vma;
> > +     struct vm_area_struct *vma = NULL;
> >       short vma_shift;
> >       void *memcache;
> > -     gfn_t gfn;
> > +     gfn_t gfn = ipa >> PAGE_SHIFT;
> >       kvm_pfn_t pfn;
> >       bool logging_active = memslot_is_logging(memslot);
> > -     bool force_pte = logging_active || is_protected_kvm_enabled();
> > -     long vma_pagesize, fault_granule;
> > +     bool is_private = kvm_mem_is_private(kvm, gfn);
>
> Just trying to understand the locking rule for the xarray behind this.
> Is it kvm->srcu that protects it for reads here? Something else?

I'm not sure I follow. Which xarray are you referring to?

>
> > +     bool force_pte = logging_active || is_private || is_protected_kvm_enabled();
> > +     long vma_pagesize, fault_granule = PAGE_SIZE;
> >       enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
> >       struct kvm_pgtable *pgt;
> >       struct page *page;
> >       enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED;
> >
> > -     if (fault_is_perm)
> > +     if (fault_is_perm && !is_private)
>
> Nit: not strictly necessary I think.

You're right.

Thanks,
/fuad

> >               fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu);
> >       write_fault = kvm_is_write_fault(vcpu);
> >       exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
> > @@ -1510,24 +1538,30 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >                       return ret;
> >       }
> >
> > +     mmap_read_lock(current->mm);
> > +
> >       /*
> >        * Let's check if we will get back a huge page backed by hugetlbfs, or
> >        * get block mapping for device MMIO region.
> >        */
> > -     mmap_read_lock(current->mm);
> > -     vma = vma_lookup(current->mm, hva);
> > -     if (unlikely(!vma)) {
> > -             kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
> > -             mmap_read_unlock(current->mm);
> > -             return -EFAULT;
> > -     }
> > +     if (!is_private) {
> > +             vma = vma_lookup(current->mm, hva);
> > +             if (unlikely(!vma)) {
> > +                     kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
> > +                     mmap_read_unlock(current->mm);
> > +                     return -EFAULT;
> > +             }
> >
> > -     /*
> > -      * logging_active is guaranteed to never be true for VM_PFNMAP
> > -      * memslots.
> > -      */
> > -     if (WARN_ON_ONCE(logging_active && (vma->vm_flags & VM_PFNMAP)))
> > -             return -EFAULT;
> > +             /*
> > +              * logging_active is guaranteed to never be true for VM_PFNMAP
> > +              * memslots.
> > +              */
> > +             if (WARN_ON_ONCE(logging_active && (vma->vm_flags & VM_PFNMAP)))
> > +                     return -EFAULT;
> > +
> > +             vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
> > +             mte_allowed = kvm_vma_mte_allowed(vma);
> > +     }
> >
> >       if (force_pte)
> >               vma_shift = PAGE_SHIFT;
> > @@ -1597,18 +1631,13 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >               ipa &= ~(vma_pagesize - 1);
> >       }
> >
> > -     gfn = ipa >> PAGE_SHIFT;
> > -     mte_allowed = kvm_vma_mte_allowed(vma);
> > -
> > -     vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
> > -
> >       /* Don't use the VMA after the unlock -- it may have vanished */
> >       vma = NULL;
> >
> >       /*
> >        * Read mmu_invalidate_seq so that KVM can detect if the results of
> > -      * vma_lookup() or __kvm_faultin_pfn() become stale prior to
> > -      * acquiring kvm->mmu_lock.
> > +      * vma_lookup() or faultin_pfn() become stale prior to acquiring
> > +      * kvm->mmu_lock.
> >        *
> >        * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
> >        * with the smp_wmb() in kvm_mmu_invalidate_end().
> > @@ -1616,8 +1645,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> >       mmu_seq = vcpu->kvm->mmu_invalidate_seq;
> >       mmap_read_unlock(current->mm);
> >
> > -     pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0,
> > -                             &writable, &page);
> > +     pfn = faultin_pfn(kvm, memslot, gfn, write_fault, &writable, &page, is_private);
> >       if (pfn == KVM_PFN_ERR_HWPOISON) {
> >               kvm_send_hwpoison_signal(hva, vma_shift);
> >               return 0;
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index 39fd6e35c723..415c6274aede 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -1882,6 +1882,11 @@ static inline int memslot_id(struct kvm *kvm, gfn_t gfn)
> >       return gfn_to_memslot(kvm, gfn)->id;
> >  }
> >
> > +static inline bool memslot_is_readonly(const struct kvm_memory_slot *slot)
> > +{
> > +     return slot->flags & KVM_MEM_READONLY;
> > +}
> > +
> >  static inline gfn_t
> >  hva_to_gfn_memslot(unsigned long hva, struct kvm_memory_slot *slot)
> >  {
> > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > index 38f0f402ea46..3e40acb9f5c0 100644
> > --- a/virt/kvm/kvm_main.c
> > +++ b/virt/kvm/kvm_main.c
> > @@ -2624,11 +2624,6 @@ unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
> >       return size;
> >  }
> >
> > -static bool memslot_is_readonly(const struct kvm_memory_slot *slot)
> > -{
> > -     return slot->flags & KVM_MEM_READONLY;
> > -}
> > -
> >  static unsigned long __gfn_to_hva_many(const struct kvm_memory_slot *slot, gfn_t gfn,
> >                                      gfn_t *nr_pages, bool write)
> >  {
> > --
> > 2.48.1.502.g6dc24dfdaf-goog
> >
Quentin Perret Feb. 11, 2025, 4:25 p.m. UTC | #3
On Tuesday 11 Feb 2025 at 16:13:27 (+0000), Fuad Tabba wrote:
> Hi Quentin,
> 
> On Tue, 11 Feb 2025 at 15:57, Quentin Perret <qperret@google.com> wrote:
> >
> > Hey Fuad,
> >
> > On Tuesday 11 Feb 2025 at 12:11:24 (+0000), Fuad Tabba wrote:
> > > Add arm64 support for handling guest page faults on guest_memfd
> > > backed memslots.
> > >
> > > For now, the fault granule is restricted to PAGE_SIZE.
> > >
> > > Signed-off-by: Fuad Tabba <tabba@google.com>
> > > ---
> > >  arch/arm64/kvm/mmu.c     | 84 ++++++++++++++++++++++++++--------------
> > >  include/linux/kvm_host.h |  5 +++
> > >  virt/kvm/kvm_main.c      |  5 ---
> > >  3 files changed, 61 insertions(+), 33 deletions(-)
> > >
> > > diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> > > index b6c0acb2311c..305060518766 100644
> > > --- a/arch/arm64/kvm/mmu.c
> > > +++ b/arch/arm64/kvm/mmu.c
> > > @@ -1454,6 +1454,33 @@ static bool kvm_vma_mte_allowed(struct vm_area_struct *vma)
> > >       return vma->vm_flags & VM_MTE_ALLOWED;
> > >  }
> > >
> > > +static kvm_pfn_t faultin_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
> > > +                          gfn_t gfn, bool write_fault, bool *writable,
> > > +                          struct page **page, bool is_private)
> > > +{
> > > +     kvm_pfn_t pfn;
> > > +     int ret;
> > > +
> > > +     if (!is_private)
> > > +             return __kvm_faultin_pfn(slot, gfn, write_fault ? FOLL_WRITE : 0, writable, page);
> > > +
> > > +     *writable = false;
> > > +
> > > +     if (WARN_ON_ONCE(write_fault && memslot_is_readonly(slot)))
> > > +             return KVM_PFN_ERR_NOSLOT_MASK;
> >
> > I believe this check is superfluous, we should decide to report an MMIO
> > exit to userspace for write faults to RO memslots and not get anywhere
> > near user_mem_abort(). And nit but the error code should probably be
> > KVM_PFN_ERR_RO_FAULT or something instead?
> 
> I tried to replicate the behavior of __kvm_faultin_pfn() here (but got
> the wrong error!). I think you're right though that in the arm64 case,
> this check isn't needed. Should I fix the return error and keep the
> warning though?

__kvm_faultin_pfn() will just set *writable to false if it finds an RO
memslot apparently, not return an error. So I'd vote for dropping that
check so we align with that behaviour.

> > > +
> > > +     ret = kvm_gmem_get_pfn(kvm, slot, gfn, &pfn, page, NULL);
> > > +     if (!ret) {
> > > +             *writable = write_fault;
> >
> > In normal KVM, if we're not dirty logging we'll actively map the page as
> > writable if both the memslot and the userspace mappings are writable.
> > With gmem, the latter doesn't make much sense, but essentially the
> > underlying page should really be writable (e.g. no CoW getting in the
> > way and such?). If so, then perhaps make this
> >
> >                 *writable = !memslot_is_readonly(slot);
> >
> > Wdyt?
> 
> Ack.
> 
> > > +             return pfn;
> > > +     }
> > > +
> > > +     if (ret == -EHWPOISON)
> > > +             return KVM_PFN_ERR_HWPOISON;
> > > +
> > > +     return KVM_PFN_ERR_NOSLOT_MASK;
> > > +}
> > > +
> > >  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> > >                         struct kvm_s2_trans *nested,
> > >                         struct kvm_memory_slot *memslot, unsigned long hva,
> > > @@ -1461,25 +1488,26 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> > >  {
> > >       int ret = 0;
> > >       bool write_fault, writable;
> > > -     bool exec_fault, mte_allowed;
> > > +     bool exec_fault, mte_allowed = false;
> > >       bool device = false, vfio_allow_any_uc = false;
> > >       unsigned long mmu_seq;
> > >       phys_addr_t ipa = fault_ipa;
> > >       struct kvm *kvm = vcpu->kvm;
> > > -     struct vm_area_struct *vma;
> > > +     struct vm_area_struct *vma = NULL;
> > >       short vma_shift;
> > >       void *memcache;
> > > -     gfn_t gfn;
> > > +     gfn_t gfn = ipa >> PAGE_SHIFT;
> > >       kvm_pfn_t pfn;
> > >       bool logging_active = memslot_is_logging(memslot);
> > > -     bool force_pte = logging_active || is_protected_kvm_enabled();
> > > -     long vma_pagesize, fault_granule;
> > > +     bool is_private = kvm_mem_is_private(kvm, gfn);
> >
> > Just trying to understand the locking rule for the xarray behind this.
> > Is it kvm->srcu that protects it for reads here? Something else?
> 
> I'm not sure I follow. Which xarray are you referring to?

Sorry, yes, that wasn't clear. I meant that kvm_mem_is_private() calls
kvm_get_memory_attributes() which indexes kvm->mem_attr_array. The
comment in struct kvm indicates that this xarray is protected by RCU for
readers, so I was just checking if we were relying on
kvm_handle_guest_abort() to take srcu_read_lock(&kvm->srcu) for us, or
if there was something else more subtle here.
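
For reference, the generic helpers I have in mind look roughly like this
upstream (paraphrased, and not the variant this series adds -- just to show
where kvm->mem_attr_array comes in):

	static inline unsigned long kvm_get_memory_attributes(struct kvm *kvm, gfn_t gfn)
	{
		return xa_to_value(xa_load(&kvm->mem_attr_array, gfn));
	}

	static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
	{
		return kvm_get_memory_attributes(kvm, gfn) & KVM_MEMORY_ATTRIBUTE_PRIVATE;
	}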

Cheers,
Quentin

> >
> > > +     bool force_pte = logging_active || is_private || is_protected_kvm_enabled();
> > > +     long vma_pagesize, fault_granule = PAGE_SIZE;
> > >       enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
> > >       struct kvm_pgtable *pgt;
> > >       struct page *page;
> > >       enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED;
> > >
> > > -     if (fault_is_perm)
> > > +     if (fault_is_perm && !is_private)
> >
> > Nit: not strictly necessary I think.
> 
> You're right.
> 
> Thanks,
> /fuad
> 
> > >               fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu);
> > >       write_fault = kvm_is_write_fault(vcpu);
> > >       exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
> > > @@ -1510,24 +1538,30 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> > >                       return ret;
> > >       }
> > >
> > > +     mmap_read_lock(current->mm);
> > > +
> > >       /*
> > >        * Let's check if we will get back a huge page backed by hugetlbfs, or
> > >        * get block mapping for device MMIO region.
> > >        */
> > > -     mmap_read_lock(current->mm);
> > > -     vma = vma_lookup(current->mm, hva);
> > > -     if (unlikely(!vma)) {
> > > -             kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
> > > -             mmap_read_unlock(current->mm);
> > > -             return -EFAULT;
> > > -     }
> > > +     if (!is_private) {
> > > +             vma = vma_lookup(current->mm, hva);
> > > +             if (unlikely(!vma)) {
> > > +                     kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
> > > +                     mmap_read_unlock(current->mm);
> > > +                     return -EFAULT;
> > > +             }
> > >
> > > -     /*
> > > -      * logging_active is guaranteed to never be true for VM_PFNMAP
> > > -      * memslots.
> > > -      */
> > > -     if (WARN_ON_ONCE(logging_active && (vma->vm_flags & VM_PFNMAP)))
> > > -             return -EFAULT;
> > > +             /*
> > > +              * logging_active is guaranteed to never be true for VM_PFNMAP
> > > +              * memslots.
> > > +              */
> > > +             if (WARN_ON_ONCE(logging_active && (vma->vm_flags & VM_PFNMAP)))
> > > +                     return -EFAULT;
> > > +
> > > +             vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
> > > +             mte_allowed = kvm_vma_mte_allowed(vma);
> > > +     }
> > >
> > >       if (force_pte)
> > >               vma_shift = PAGE_SHIFT;
> > > @@ -1597,18 +1631,13 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> > >               ipa &= ~(vma_pagesize - 1);
> > >       }
> > >
> > > -     gfn = ipa >> PAGE_SHIFT;
> > > -     mte_allowed = kvm_vma_mte_allowed(vma);
> > > -
> > > -     vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
> > > -
> > >       /* Don't use the VMA after the unlock -- it may have vanished */
> > >       vma = NULL;
> > >
> > >       /*
> > >        * Read mmu_invalidate_seq so that KVM can detect if the results of
> > > -      * vma_lookup() or __kvm_faultin_pfn() become stale prior to
> > > -      * acquiring kvm->mmu_lock.
> > > +      * vma_lookup() or faultin_pfn() become stale prior to acquiring
> > > +      * kvm->mmu_lock.
> > >        *
> > >        * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
> > >        * with the smp_wmb() in kvm_mmu_invalidate_end().
> > > @@ -1616,8 +1645,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> > >       mmu_seq = vcpu->kvm->mmu_invalidate_seq;
> > >       mmap_read_unlock(current->mm);
> > >
> > > -     pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0,
> > > -                             &writable, &page);
> > > +     pfn = faultin_pfn(kvm, memslot, gfn, write_fault, &writable, &page, is_private);
> > >       if (pfn == KVM_PFN_ERR_HWPOISON) {
> > >               kvm_send_hwpoison_signal(hva, vma_shift);
> > >               return 0;
> > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > > index 39fd6e35c723..415c6274aede 100644
> > > --- a/include/linux/kvm_host.h
> > > +++ b/include/linux/kvm_host.h
> > > @@ -1882,6 +1882,11 @@ static inline int memslot_id(struct kvm *kvm, gfn_t gfn)
> > >       return gfn_to_memslot(kvm, gfn)->id;
> > >  }
> > >
> > > +static inline bool memslot_is_readonly(const struct kvm_memory_slot *slot)
> > > +{
> > > +     return slot->flags & KVM_MEM_READONLY;
> > > +}
> > > +
> > >  static inline gfn_t
> > >  hva_to_gfn_memslot(unsigned long hva, struct kvm_memory_slot *slot)
> > >  {
> > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > > index 38f0f402ea46..3e40acb9f5c0 100644
> > > --- a/virt/kvm/kvm_main.c
> > > +++ b/virt/kvm/kvm_main.c
> > > @@ -2624,11 +2624,6 @@ unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
> > >       return size;
> > >  }
> > >
> > > -static bool memslot_is_readonly(const struct kvm_memory_slot *slot)
> > > -{
> > > -     return slot->flags & KVM_MEM_READONLY;
> > > -}
> > > -
> > >  static unsigned long __gfn_to_hva_many(const struct kvm_memory_slot *slot, gfn_t gfn,
> > >                                      gfn_t *nr_pages, bool write)
> > >  {
> > > --
> > > 2.48.1.502.g6dc24dfdaf-goog
> > >
Fuad Tabba Feb. 11, 2025, 4:34 p.m. UTC | #4
Hi Quentin,

On Tue, 11 Feb 2025 at 16:26, Quentin Perret <qperret@google.com> wrote:
>
> On Tuesday 11 Feb 2025 at 16:13:27 (+0000), Fuad Tabba wrote:
> > Hi Quentin,
> >
> > On Tue, 11 Feb 2025 at 15:57, Quentin Perret <qperret@google.com> wrote:
> > >
> > > Hey Fuad,
> > >
> > > On Tuesday 11 Feb 2025 at 12:11:24 (+0000), Fuad Tabba wrote:
> > > > Add arm64 support for handling guest page faults on guest_memfd
> > > > backed memslots.
> > > >
> > > > For now, the fault granule is restricted to PAGE_SIZE.
> > > >
> > > > Signed-off-by: Fuad Tabba <tabba@google.com>
> > > > ---
> > > >  arch/arm64/kvm/mmu.c     | 84 ++++++++++++++++++++++++++--------------
> > > >  include/linux/kvm_host.h |  5 +++
> > > >  virt/kvm/kvm_main.c      |  5 ---
> > > >  3 files changed, 61 insertions(+), 33 deletions(-)
> > > >
> > > > diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> > > > index b6c0acb2311c..305060518766 100644
> > > > --- a/arch/arm64/kvm/mmu.c
> > > > +++ b/arch/arm64/kvm/mmu.c
> > > > @@ -1454,6 +1454,33 @@ static bool kvm_vma_mte_allowed(struct vm_area_struct *vma)
> > > >       return vma->vm_flags & VM_MTE_ALLOWED;
> > > >  }
> > > >
> > > > +static kvm_pfn_t faultin_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
> > > > +                          gfn_t gfn, bool write_fault, bool *writable,
> > > > +                          struct page **page, bool is_private)
> > > > +{
> > > > +     kvm_pfn_t pfn;
> > > > +     int ret;
> > > > +
> > > > +     if (!is_private)
> > > > +             return __kvm_faultin_pfn(slot, gfn, write_fault ? FOLL_WRITE : 0, writable, page);
> > > > +
> > > > +     *writable = false;
> > > > +
> > > > +     if (WARN_ON_ONCE(write_fault && memslot_is_readonly(slot)))
> > > > +             return KVM_PFN_ERR_NOSLOT_MASK;
> > >
> > > I believe this check is superfluous, we should decide to report an MMIO
> > > exit to userspace for write faults to RO memslots and not get anywhere
> > > near user_mem_abort(). And nit but the error code should probably be
> > > KVM_PFN_ERR_RO_FAULT or something instead?
> >
> > I tried to replicate the behavior of __kvm_faultin_pfn() here (but got
> > the wrong error!). I think you're right though that in the arm64 case,
> > this check isn't needed. Should I fix the return error and keep the
> > warning though?
>
> __kvm_faultin_pfn() will just set *writable to false if it find an RO
> memslot apparently, not return an error. So I'd vote for dropping that
> check so we align with that behaviour.

Ack.
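
For the record, with both of your suggestions folded in, the gmem path of
faultin_pfn() would end up looking something like this (only a sketch of
the direction discussed here; the error code for the failure case is a
placeholder, since we didn't settle on one):

	ret = kvm_gmem_get_pfn(kvm, slot, gfn, &pfn, page, NULL);
	if (ret) {
		*writable = false;
		return (ret == -EHWPOISON) ? KVM_PFN_ERR_HWPOISON :
					     KVM_PFN_ERR_FAULT;
	}

	/*
	 * No CoW with gmem, so the pfn is writable unless the memslot
	 * itself is read-only.
	 */
	*writable = !memslot_is_readonly(slot);
	return pfn;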

> > > > +
> > > > +     ret = kvm_gmem_get_pfn(kvm, slot, gfn, &pfn, page, NULL);
> > > > +     if (!ret) {
> > > > +             *writable = write_fault;
> > >
> > > In normal KVM, if we're not dirty logging we'll actively map the page as
> > > writable if both the memslot and the userspace mappings are writable.
> > > With gmem, the latter doesn't make much sense, but essentially the
> > > underlying page should really be writable (e.g. no CoW getting in the
> > > way and such?). If so, then perhaps make this
> > >
> > >                 *writable = !memslot_is_readonly(slot);
> > >
> > > Wdyt?
> >
> > Ack.
> >
> > > > +             return pfn;
> > > > +     }
> > > > +
> > > > +     if (ret == -EHWPOISON)
> > > > +             return KVM_PFN_ERR_HWPOISON;
> > > > +
> > > > +     return KVM_PFN_ERR_NOSLOT_MASK;
> > > > +}
> > > > +
> > > >  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> > > >                         struct kvm_s2_trans *nested,
> > > >                         struct kvm_memory_slot *memslot, unsigned long hva,
> > > > @@ -1461,25 +1488,26 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> > > >  {
> > > >       int ret = 0;
> > > >       bool write_fault, writable;
> > > > -     bool exec_fault, mte_allowed;
> > > > +     bool exec_fault, mte_allowed = false;
> > > >       bool device = false, vfio_allow_any_uc = false;
> > > >       unsigned long mmu_seq;
> > > >       phys_addr_t ipa = fault_ipa;
> > > >       struct kvm *kvm = vcpu->kvm;
> > > > -     struct vm_area_struct *vma;
> > > > +     struct vm_area_struct *vma = NULL;
> > > >       short vma_shift;
> > > >       void *memcache;
> > > > -     gfn_t gfn;
> > > > +     gfn_t gfn = ipa >> PAGE_SHIFT;
> > > >       kvm_pfn_t pfn;
> > > >       bool logging_active = memslot_is_logging(memslot);
> > > > -     bool force_pte = logging_active || is_protected_kvm_enabled();
> > > > -     long vma_pagesize, fault_granule;
> > > > +     bool is_private = kvm_mem_is_private(kvm, gfn);
> > >
> > > Just trying to understand the locking rule for the xarray behind this.
> > > Is it kvm->srcu that protects it for reads here? Something else?
> >
> > I'm not sure I follow. Which xarray are you referring to?
>
> Sorry, yes, that wasn't clear. I meant that kvm_mem_is_private() calls
> kvm_get_memory_attributes() which indexes kvm->mem_attr_array. The
> comment in struct kvm indicates that this xarray is protected by RCU for
> readers, so I was just checking if we were relying on
> kvm_handle_guest_abort() to take srcu_read_lock(&kvm->srcu) for us, or
> if there was something else more subtle here.

I was kind of afraid that people would be confused by this, and I
commented on it in the commit message of the earlier patch:
https://lore.kernel.org/all/20250211121128.703390-6-tabba@google.com/

> Note that the word "private" in the name of the function
> kvm_mem_is_private() doesn't necessarily indicate that the memory
> isn't shared, but is due to the history and evolution of
> guest_memfd and the various names it has received. In effect,
> this function is used to multiplex between the path of a normal
> page fault and the path of a guest_memfd backed page fault.

kvm_mem_is_private() is a property of the memslot itself. No xarrays
harmed in the process :)

Cheers,
/fuad

> Cheers,
> Quentin
>
> > >
> > > > +     bool force_pte = logging_active || is_private || is_protected_kvm_enabled();
> > > > +     long vma_pagesize, fault_granule = PAGE_SIZE;
> > > >       enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
> > > >       struct kvm_pgtable *pgt;
> > > >       struct page *page;
> > > >       enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED;
> > > >
> > > > -     if (fault_is_perm)
> > > > +     if (fault_is_perm && !is_private)
> > >
> > > Nit: not strictly necessary I think.
> >
> > You're right.
> >
> > Thanks,
> > /fuad
> >
> > > >               fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu);
> > > >       write_fault = kvm_is_write_fault(vcpu);
> > > >       exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
> > > > @@ -1510,24 +1538,30 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> > > >                       return ret;
> > > >       }
> > > >
> > > > +     mmap_read_lock(current->mm);
> > > > +
> > > >       /*
> > > >        * Let's check if we will get back a huge page backed by hugetlbfs, or
> > > >        * get block mapping for device MMIO region.
> > > >        */
> > > > -     mmap_read_lock(current->mm);
> > > > -     vma = vma_lookup(current->mm, hva);
> > > > -     if (unlikely(!vma)) {
> > > > -             kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
> > > > -             mmap_read_unlock(current->mm);
> > > > -             return -EFAULT;
> > > > -     }
> > > > +     if (!is_private) {
> > > > +             vma = vma_lookup(current->mm, hva);
> > > > +             if (unlikely(!vma)) {
> > > > +                     kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
> > > > +                     mmap_read_unlock(current->mm);
> > > > +                     return -EFAULT;
> > > > +             }
> > > >
> > > > -     /*
> > > > -      * logging_active is guaranteed to never be true for VM_PFNMAP
> > > > -      * memslots.
> > > > -      */
> > > > -     if (WARN_ON_ONCE(logging_active && (vma->vm_flags & VM_PFNMAP)))
> > > > -             return -EFAULT;
> > > > +             /*
> > > > +              * logging_active is guaranteed to never be true for VM_PFNMAP
> > > > +              * memslots.
> > > > +              */
> > > > +             if (WARN_ON_ONCE(logging_active && (vma->vm_flags & VM_PFNMAP)))
> > > > +                     return -EFAULT;
> > > > +
> > > > +             vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
> > > > +             mte_allowed = kvm_vma_mte_allowed(vma);
> > > > +     }
> > > >
> > > >       if (force_pte)
> > > >               vma_shift = PAGE_SHIFT;
> > > > @@ -1597,18 +1631,13 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> > > >               ipa &= ~(vma_pagesize - 1);
> > > >       }
> > > >
> > > > -     gfn = ipa >> PAGE_SHIFT;
> > > > -     mte_allowed = kvm_vma_mte_allowed(vma);
> > > > -
> > > > -     vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
> > > > -
> > > >       /* Don't use the VMA after the unlock -- it may have vanished */
> > > >       vma = NULL;
> > > >
> > > >       /*
> > > >        * Read mmu_invalidate_seq so that KVM can detect if the results of
> > > > -      * vma_lookup() or __kvm_faultin_pfn() become stale prior to
> > > > -      * acquiring kvm->mmu_lock.
> > > > +      * vma_lookup() or faultin_pfn() become stale prior to acquiring
> > > > +      * kvm->mmu_lock.
> > > >        *
> > > >        * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
> > > >        * with the smp_wmb() in kvm_mmu_invalidate_end().
> > > > @@ -1616,8 +1645,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> > > >       mmu_seq = vcpu->kvm->mmu_invalidate_seq;
> > > >       mmap_read_unlock(current->mm);
> > > >
> > > > -     pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0,
> > > > -                             &writable, &page);
> > > > +     pfn = faultin_pfn(kvm, memslot, gfn, write_fault, &writable, &page, is_private);
> > > >       if (pfn == KVM_PFN_ERR_HWPOISON) {
> > > >               kvm_send_hwpoison_signal(hva, vma_shift);
> > > >               return 0;
> > > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > > > index 39fd6e35c723..415c6274aede 100644
> > > > --- a/include/linux/kvm_host.h
> > > > +++ b/include/linux/kvm_host.h
> > > > @@ -1882,6 +1882,11 @@ static inline int memslot_id(struct kvm *kvm, gfn_t gfn)
> > > >       return gfn_to_memslot(kvm, gfn)->id;
> > > >  }
> > > >
> > > > +static inline bool memslot_is_readonly(const struct kvm_memory_slot *slot)
> > > > +{
> > > > +     return slot->flags & KVM_MEM_READONLY;
> > > > +}
> > > > +
> > > >  static inline gfn_t
> > > >  hva_to_gfn_memslot(unsigned long hva, struct kvm_memory_slot *slot)
> > > >  {
> > > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > > > index 38f0f402ea46..3e40acb9f5c0 100644
> > > > --- a/virt/kvm/kvm_main.c
> > > > +++ b/virt/kvm/kvm_main.c
> > > > @@ -2624,11 +2624,6 @@ unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
> > > >       return size;
> > > >  }
> > > >
> > > > -static bool memslot_is_readonly(const struct kvm_memory_slot *slot)
> > > > -{
> > > > -     return slot->flags & KVM_MEM_READONLY;
> > > > -}
> > > > -
> > > >  static unsigned long __gfn_to_hva_many(const struct kvm_memory_slot *slot, gfn_t gfn,
> > > >                                      gfn_t *nr_pages, bool write)
> > > >  {
> > > > --
> > > > 2.48.1.502.g6dc24dfdaf-goog
> > > >
Quentin Perret Feb. 11, 2025, 4:57 p.m. UTC | #5
On Tuesday 11 Feb 2025 at 16:34:02 (+0000), Fuad Tabba wrote:
> > Sorry, yes, that wasn't clear. I meant that kvm_mem_is_private() calls
> > kvm_get_memory_attributes() which indexes kvm->mem_attr_array. The
> > comment in struct kvm indicates that this xarray is protected by RCU for
> > readers, so I was just checking if we were relying on
> > kvm_handle_guest_abort() to take srcu_read_lock(&kvm->srcu) for us, or
> > if there was something else more subtle here.
> 
> I was kind of afraid that people would be confused by this, and I
> commented on it in the commit message of the earlier patch:
> https://lore.kernel.org/all/20250211121128.703390-6-tabba@google.com/
> 
> > Note that the word "private" in the name of the function
> > kvm_mem_is_private() doesn't necessarily indicate that the memory
> > isn't shared, but is due to the history and evolution of
> > guest_memfd and the various names it has received. In effect,
> > this function is used to multiplex between the path of a normal
> > page fault and the path of a guest_memfd backed page fault.
> 
> kvm_mem_is_private() is property of the memslot itself. No xarrays
> harmed in the process :)

Ah, I see, but could someone enable CONFIG_GENERIC_PRIVATE_MEM and
related and get confused? Should KVM_GENERIC_MEMORY_ATTRIBUTES=n
depend on !ARM64? Or is it KVM_GMEM_SHARED_MEM that needs to depend on
the generic implementation being off?

Thanks,
Quentin
Fuad Tabba Feb. 11, 2025, 5:04 p.m. UTC | #6
Hi Quentin,

On Tue, 11 Feb 2025 at 16:57, Quentin Perret <qperret@google.com> wrote:
>
> On Tuesday 11 Feb 2025 at 16:34:02 (+0000), Fuad Tabba wrote:
> > > Sorry, yes, that wasn't clear. I meant that kvm_mem_is_private() calls
> > > kvm_get_memory_attributes() which indexes kvm->mem_attr_array. The
> > > comment in struct kvm indicates that this xarray is protected by RCU for
> > > readers, so I was just checking if we were relying on
> > > kvm_handle_guest_abort() to take srcu_read_lock(&kvm->srcu) for us, or
> > > if there was something else more subtle here.
> >
> > I was kind of afraid that people would be confused by this, and I
> > commented on it in the commit message of the earlier patch:
> > https://lore.kernel.org/all/20250211121128.703390-6-tabba@google.com/
> >
> > > Note that the word "private" in the name of the function
> > > kvm_mem_is_private() doesn't necessarily indicate that the memory
> > > isn't shared, but is due to the history and evolution of
> > > guest_memfd and the various names it has received. In effect,
> > > this function is used to multiplex between the path of a normal
> > > page fault and the path of a guest_memfd backed page fault.
> >
> > kvm_mem_is_private() is a property of the memslot itself. No xarrays
> > harmed in the process :)
>
> Ah, I see, but could someone enable CONFIG_GENERIC_PRIVATE_MEM and
> related and get confused? Should KVM_GENERIC_MEMORY_ATTRIBUTES=n
> depend on !ARM64? Or is it KVM_GMEM_SHARED_MEM that needs to depend on
> the generic implementation being off?

VMs that have sharing in place don't need
KVM_GENERIC_MEMORY_ATTRIBUTES, since that presents the userspace
view/desire of the state of the folio. It's not necessarily an arm64
thing, for example, CCA would need it, since it behaves like TDX.

I guess that KVM_GMEM_SHARED_MEM and KVM_GENERIC_MEMORY_ATTRIBUTES are
mutually exclusive. I cannot think how the two could be used or useful
together. We could have a check to ensure that both are not enabled at
the same time. The behavior in this patch series is that
KVM_GMEM_SHARED_MEM selects GENERIC_PRIVATE_MEM.

Also, to help reduce the confusion above, I could rename the variable
is_private in user_mem_abort() to is_guestmem. WDYT?

Cheers,
/fuad


> Thanks,
> Quentin
Quentin Perret Feb. 11, 2025, 5:19 p.m. UTC | #7
On Tuesday 11 Feb 2025 at 17:04:05 (+0000), Fuad Tabba wrote:
> Hi Quentin,
> 
> On Tue, 11 Feb 2025 at 16:57, Quentin Perret <qperret@google.com> wrote:
> >
> > On Tuesday 11 Feb 2025 at 16:34:02 (+0000), Fuad Tabba wrote:
> > > > Sorry, yes, that wasn't clear. I meant that kvm_mem_is_private() calls
> > > > kvm_get_memory_attributes() which indexes kvm->mem_attr_array. The
> > > > comment in struct kvm indicates that this xarray is protected by RCU for
> > > > readers, so I was just checking if we were relying on
> > > > kvm_handle_guest_abort() to take srcu_read_lock(&kvm->srcu) for us, or
> > > > if there was something else more subtle here.
> > >
> > > I was kind of afraid that people would be confused by this, and I
> > > commented on it in the commit message of the earlier patch:
> > > https://lore.kernel.org/all/20250211121128.703390-6-tabba@google.com/
> > >
> > > > Note that the word "private" in the name of the function
> > > > kvm_mem_is_private() doesn't necessarily indicate that the memory
> > > > isn't shared, but is due to the history and evolution of
> > > > guest_memfd and the various names it has received. In effect,
> > > > this function is used to multiplex between the path of a normal
> > > > page fault and the path of a guest_memfd backed page fault.
> > >
> > > kvm_mem_is_private() is a property of the memslot itself. No xarrays
> > > harmed in the process :)
> >
> > Ah, I see, but could someone enable CONFIG_GENERIC_PRIVATE_MEM and
> > related and get confused? Should KVM_GENERIC_MEMORY_ATTRIBUTES=n
> > depend on !ARM64? Or is it KVM_GMEM_SHARED_MEM that needs to depend on
> > the generic implementation being off?
> 
> VMs that have sharing in place don't need
> KVM_GENERIC_MEMORY_ATTRIBUTES, since that presents the userspace
> view/desire of the state of the folio. It's not necessarily an arm64
> thing, for example, CCA would need it, since it behaves like TDX.
> 
> I guess that KVM_GMEM_SHARED_MEM and KVM_GENERIC_MEMORY_ATTRIBUTES are
> mutually exclusive. I cannot think how the two could be used or useful
> together. We could have a check to ensure that both are not enabled at
> the same time.

Right, that should be a matter of adding

	depends on !KVM_GENERIC_MEMORY_ATTRIBUTES

to the KVM_GMEM_SHARED_MEM Kconfig I think then.

> The behavior in this patch series is that
> KVM_GMEM_SHARED_MEM selects GENERIC_PRIVATE_MEM.

You meant s/GENERIC_PRIVATE_MEM/KVM_PRIVATE_MEM right?

> Also, to help reduce the confusion above, I could rename the variable
> is_private in user_mem_abort() to is_guestmem. WDYT?

I actually don't mind the variable name in that it is consistent with the
rest of the code, but I do positively hate how the definition of
'private' in this code doesn't match my intuition :-)

Patch

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index b6c0acb2311c..305060518766 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1454,6 +1454,33 @@  static bool kvm_vma_mte_allowed(struct vm_area_struct *vma)
 	return vma->vm_flags & VM_MTE_ALLOWED;
 }
 
+static kvm_pfn_t faultin_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
+			     gfn_t gfn, bool write_fault, bool *writable,
+			     struct page **page, bool is_private)
+{
+	kvm_pfn_t pfn;
+	int ret;
+
+	if (!is_private)
+		return __kvm_faultin_pfn(slot, gfn, write_fault ? FOLL_WRITE : 0, writable, page);
+
+	*writable = false;
+
+	if (WARN_ON_ONCE(write_fault && memslot_is_readonly(slot)))
+		return KVM_PFN_ERR_NOSLOT_MASK;
+
+	ret = kvm_gmem_get_pfn(kvm, slot, gfn, &pfn, page, NULL);
+	if (!ret) {
+		*writable = write_fault;
+		return pfn;
+	}
+
+	if (ret == -EHWPOISON)
+		return KVM_PFN_ERR_HWPOISON;
+
+	return KVM_PFN_ERR_NOSLOT_MASK;
+}
+
 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 			  struct kvm_s2_trans *nested,
 			  struct kvm_memory_slot *memslot, unsigned long hva,
@@ -1461,25 +1488,26 @@  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 {
 	int ret = 0;
 	bool write_fault, writable;
-	bool exec_fault, mte_allowed;
+	bool exec_fault, mte_allowed = false;
 	bool device = false, vfio_allow_any_uc = false;
 	unsigned long mmu_seq;
 	phys_addr_t ipa = fault_ipa;
 	struct kvm *kvm = vcpu->kvm;
-	struct vm_area_struct *vma;
+	struct vm_area_struct *vma = NULL;
 	short vma_shift;
 	void *memcache;
-	gfn_t gfn;
+	gfn_t gfn = ipa >> PAGE_SHIFT;
 	kvm_pfn_t pfn;
 	bool logging_active = memslot_is_logging(memslot);
-	bool force_pte = logging_active || is_protected_kvm_enabled();
-	long vma_pagesize, fault_granule;
+	bool is_private = kvm_mem_is_private(kvm, gfn);
+	bool force_pte = logging_active || is_private || is_protected_kvm_enabled();
+	long vma_pagesize, fault_granule = PAGE_SIZE;
 	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
 	struct kvm_pgtable *pgt;
 	struct page *page;
 	enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED;
 
-	if (fault_is_perm)
+	if (fault_is_perm && !is_private)
 		fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu);
 	write_fault = kvm_is_write_fault(vcpu);
 	exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
@@ -1510,24 +1538,30 @@  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 			return ret;
 	}
 
+	mmap_read_lock(current->mm);
+
 	/*
 	 * Let's check if we will get back a huge page backed by hugetlbfs, or
 	 * get block mapping for device MMIO region.
 	 */
-	mmap_read_lock(current->mm);
-	vma = vma_lookup(current->mm, hva);
-	if (unlikely(!vma)) {
-		kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
-		mmap_read_unlock(current->mm);
-		return -EFAULT;
-	}
+	if (!is_private) {
+		vma = vma_lookup(current->mm, hva);
+		if (unlikely(!vma)) {
+			kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
+			mmap_read_unlock(current->mm);
+			return -EFAULT;
+		}
 
-	/*
-	 * logging_active is guaranteed to never be true for VM_PFNMAP
-	 * memslots.
-	 */
-	if (WARN_ON_ONCE(logging_active && (vma->vm_flags & VM_PFNMAP)))
-		return -EFAULT;
+		/*
+		 * logging_active is guaranteed to never be true for VM_PFNMAP
+		 * memslots.
+		 */
+		if (WARN_ON_ONCE(logging_active && (vma->vm_flags & VM_PFNMAP)))
+			return -EFAULT;
+
+		vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
+		mte_allowed = kvm_vma_mte_allowed(vma);
+	}
 
 	if (force_pte)
 		vma_shift = PAGE_SHIFT;
@@ -1597,18 +1631,13 @@  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		ipa &= ~(vma_pagesize - 1);
 	}
 
-	gfn = ipa >> PAGE_SHIFT;
-	mte_allowed = kvm_vma_mte_allowed(vma);
-
-	vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
-
 	/* Don't use the VMA after the unlock -- it may have vanished */
 	vma = NULL;
 
 	/*
 	 * Read mmu_invalidate_seq so that KVM can detect if the results of
-	 * vma_lookup() or __kvm_faultin_pfn() become stale prior to
-	 * acquiring kvm->mmu_lock.
+	 * vma_lookup() or faultin_pfn() become stale prior to acquiring
+	 * kvm->mmu_lock.
 	 *
 	 * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
 	 * with the smp_wmb() in kvm_mmu_invalidate_end().
@@ -1616,8 +1645,7 @@  static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	mmu_seq = vcpu->kvm->mmu_invalidate_seq;
 	mmap_read_unlock(current->mm);
 
-	pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0,
-				&writable, &page);
+	pfn = faultin_pfn(kvm, memslot, gfn, write_fault, &writable, &page, is_private);
 	if (pfn == KVM_PFN_ERR_HWPOISON) {
 		kvm_send_hwpoison_signal(hva, vma_shift);
 		return 0;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 39fd6e35c723..415c6274aede 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1882,6 +1882,11 @@  static inline int memslot_id(struct kvm *kvm, gfn_t gfn)
 	return gfn_to_memslot(kvm, gfn)->id;
 }
 
+static inline bool memslot_is_readonly(const struct kvm_memory_slot *slot)
+{
+	return slot->flags & KVM_MEM_READONLY;
+}
+
 static inline gfn_t
 hva_to_gfn_memslot(unsigned long hva, struct kvm_memory_slot *slot)
 {
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 38f0f402ea46..3e40acb9f5c0 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2624,11 +2624,6 @@  unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
 	return size;
 }
 
-static bool memslot_is_readonly(const struct kvm_memory_slot *slot)
-{
-	return slot->flags & KVM_MEM_READONLY;
-}
-
 static unsigned long __gfn_to_hva_many(const struct kvm_memory_slot *slot, gfn_t gfn,
 				       gfn_t *nr_pages, bool write)
 {