diff mbox series

[RFC,v11,12/29] KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory

Message ID 20230718234512.1690985-13-seanjc@google.com (mailing list archive)
State New
Headers show
Series KVM: guest_memfd() and per-page attributes | expand

Commit Message

Sean Christopherson July 18, 2023, 11:44 p.m. UTC
TODO

Cc: Fuad Tabba <tabba@google.com>
Cc: Vishal Annapurve <vannapurve@google.com>
Cc: Ackerley Tng <ackerleytng@google.com>
Cc: Jarkko Sakkinen <jarkko@kernel.org>
Cc: Maciej Szmigiero <mail@maciej.szmigiero.name>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: David Hildenbrand <david@redhat.com>
Cc: Quentin Perret <qperret@google.com>
Cc: Michael Roth <michael.roth@amd.com>
Cc: Wang <wei.w.wang@intel.com>
Cc: Liam Merwick <liam.merwick@oracle.com>
Cc: Isaku Yamahata <isaku.yamahata@gmail.com>
Co-developed-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Co-developed-by: Yu Zhang <yu.c.zhang@linux.intel.com>
Signed-off-by: Yu Zhang <yu.c.zhang@linux.intel.com>
Co-developed-by: Chao Peng <chao.p.peng@linux.intel.com>
Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
Co-developed-by: Ackerley Tng <ackerleytng@google.com>
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 include/linux/kvm_host.h   |  48 +++
 include/uapi/linux/kvm.h   |  14 +-
 include/uapi/linux/magic.h |   1 +
 virt/kvm/Kconfig           |   4 +
 virt/kvm/Makefile.kvm      |   1 +
 virt/kvm/guest_mem.c       | 591 +++++++++++++++++++++++++++++++++++++
 virt/kvm/kvm_main.c        |  58 +++-
 virt/kvm/kvm_mm.h          |  38 +++
 8 files changed, 750 insertions(+), 5 deletions(-)
 create mode 100644 virt/kvm/guest_mem.c

Comments

Vishal Annapurve July 19, 2023, 5:21 p.m. UTC | #1
On Tue, Jul 18, 2023 at 4:49 PM Sean Christopherson <seanjc@google.com> wrote:
> ...
> +static int kvm_gmem_error_page(struct address_space *mapping, struct page *page)
> +{
> +       struct list_head *gmem_list = &mapping->private_list;
> +       struct kvm_memory_slot *slot;
> +       struct kvm_gmem *gmem;
> +       unsigned long index;
> +       pgoff_t start, end;
> +       gfn_t gfn;
> +
> +       filemap_invalidate_lock_shared(mapping);
> +
> +       start = page->index;
> +       end = start + thp_nr_pages(page);
> +
> +       list_for_each_entry(gmem, gmem_list, entry) {
> +               xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) {
> +                       for (gfn = start; gfn < end; gfn++) {
> +                               if (WARN_ON_ONCE(gfn < slot->base_gfn ||
> +                                               gfn >= slot->base_gfn + slot->npages))
> +                                       continue;
> +
> +                               /*
> +                                * FIXME: Tell userspace that the *private*
> +                                * memory encountered an error.
> +                                */
> +                               send_sig_mceerr(BUS_MCEERR_AR,
> +                                               (void __user *)gfn_to_hva_memslot(slot, gfn),
> +                                               PAGE_SHIFT, current);

Does it make sense to replicate what happens with MCE handling on
tmpfs backed guest memory:
1) Unmap gpa from guest
2) On the next guest EPT fault, exit to userspace to handle/log the
mce error for the gpa.

IIUC, such MCEs could be asynchronous and "current" might not always
be the intended recipient of this signal.

> +                       }
> +               }
> +       }
> +
> +       filemap_invalidate_unlock_shared(mapping);
> +
> +       return 0;
> +}
> +
Sean Christopherson July 19, 2023, 5:47 p.m. UTC | #2
On Wed, Jul 19, 2023, Vishal Annapurve wrote:
> On Tue, Jul 18, 2023 at 4:49 PM Sean Christopherson <seanjc@google.com> wrote:
> > ...
> > +static int kvm_gmem_error_page(struct address_space *mapping, struct page *page)
> > +{
> > +       struct list_head *gmem_list = &mapping->private_list;
> > +       struct kvm_memory_slot *slot;
> > +       struct kvm_gmem *gmem;
> > +       unsigned long index;
> > +       pgoff_t start, end;
> > +       gfn_t gfn;
> > +
> > +       filemap_invalidate_lock_shared(mapping);
> > +
> > +       start = page->index;
> > +       end = start + thp_nr_pages(page);
> > +
> > +       list_for_each_entry(gmem, gmem_list, entry) {
> > +               xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) {
> > +                       for (gfn = start; gfn < end; gfn++) {
> > +                               if (WARN_ON_ONCE(gfn < slot->base_gfn ||
> > +                                               gfn >= slot->base_gfn + slot->npages))
> > +                                       continue;
> > +
> > +                               /*
> > +                                * FIXME: Tell userspace that the *private*
> > +                                * memory encountered an error.
> > +                                */
> > +                               send_sig_mceerr(BUS_MCEERR_AR,
> > +                                               (void __user *)gfn_to_hva_memslot(slot, gfn),
> > +                                               PAGE_SHIFT, current);
> 
> Does it make sense to replicate what happens with MCE handling on
> tmpfs backed guest memory:
> 1) Unmap gpa from guest
> 2) On the next guest EPT fault, exit to userspace to handle/log the
> mce error for the gpa.

Hmm, yes, that would be much better.  Ah, and kvm_gmem_get_pfn() needs to check
folio_test_hwpoison() and potentially PageHWPoison().  E.g. if the folio is huge,
KVM needs to restrict the mapping to order-0 (target page isn't poisoned), or
return KVM_PFN_ERR_HWPOISON (taget page IS poisoned).

Alternatively, KVM could punch a hole in kvm_gmem_error_page(), but I don't think
we want to do that because that would prevent forwarding the #MC to the guest.
Xiaoyao Li July 20, 2023, 2:45 p.m. UTC | #3
On 7/19/2023 7:44 AM, Sean Christopherson wrote:
> @@ -5134,6 +5167,16 @@ static long kvm_vm_ioctl(struct file *filp,
>   	case KVM_GET_STATS_FD:
>   		r = kvm_vm_ioctl_get_stats_fd(kvm);
>   		break;
> +	case KVM_CREATE_GUEST_MEMFD: {
> +		struct kvm_create_guest_memfd guest_memfd;
> +
> +		r = -EFAULT;
> +		if (copy_from_user(&guest_memfd, argp, sizeof(guest_memfd)))
> +			goto out;
> +
> +		r = kvm_gmem_create(kvm, &guest_memfd);
> +		break;
> +	}

Does it need a new CAP to indicate the support of guest_memfd?

This is patch series introduces 3 new CAPs and it seems any one of them 
can serve as the indicator of guest_memfd.

+#define KVM_CAP_USER_MEMORY2 230
+#define KVM_CAP_MEMORY_ATTRIBUTES 231
+#define KVM_CAP_VM_TYPES 232

or we just go and try the ioctl, the return value will tell the result?
Sean Christopherson July 20, 2023, 3:14 p.m. UTC | #4
On Thu, Jul 20, 2023, Xiaoyao Li wrote:
> On 7/19/2023 7:44 AM, Sean Christopherson wrote:
> > @@ -5134,6 +5167,16 @@ static long kvm_vm_ioctl(struct file *filp,
> >   	case KVM_GET_STATS_FD:
> >   		r = kvm_vm_ioctl_get_stats_fd(kvm);
> >   		break;
> > +	case KVM_CREATE_GUEST_MEMFD: {
> > +		struct kvm_create_guest_memfd guest_memfd;
> > +
> > +		r = -EFAULT;
> > +		if (copy_from_user(&guest_memfd, argp, sizeof(guest_memfd)))
> > +			goto out;
> > +
> > +		r = kvm_gmem_create(kvm, &guest_memfd);
> > +		break;
> > +	}
> 
> Does it need a new CAP to indicate the support of guest_memfd?

Yeah, I meant to add that to the TODO list and forgot (obviously).

> This is patch series introduces 3 new CAPs and it seems any one of them can
> serve as the indicator of guest_memfd.
> 
> +#define KVM_CAP_USER_MEMORY2 230
> +#define KVM_CAP_MEMORY_ATTRIBUTES 231
> +#define KVM_CAP_VM_TYPES 232

The number of new caps being added is the main why I didn't just add another one.
On the other hand, we have room for a few billion caps, so one more isn't a big
deal.  So yeah, KVM_CAP_GUEST_MEMFD is probably the way to go.
Isaku Yamahata July 20, 2023, 9:28 p.m. UTC | #5
On Tue, Jul 18, 2023 at 04:44:55PM -0700,
Sean Christopherson <seanjc@google.com> wrote:

> +static int kvm_gmem_release(struct inode *inode, struct file *file)
> +{
> +	struct kvm_gmem *gmem = file->private_data;
> +	struct kvm_memory_slot *slot;
> +	struct kvm *kvm = gmem->kvm;
> +	unsigned long index;
> +
> +	filemap_invalidate_lock(inode->i_mapping);
> +
> +	/*
> +	 * Prevent concurrent attempts to *unbind* a memslot.  This is the last
> +	 * reference to the file and thus no new bindings can be created, but
> +	 * dereferencing the slot for existing bindings needs to be protected
> +	 * against memslot updates, specifically so that unbind doesn't race
> +	 * and free the memslot (kvm_gmem_get_file() will return NULL).
> +	 */
> +	mutex_lock(&kvm->slots_lock);
> +
> +	xa_for_each(&gmem->bindings, index, slot)
> +		rcu_assign_pointer(slot->gmem.file, NULL);
> +
> +	synchronize_rcu();
> +
> +	/*
> +	 * All in-flight operations are gone and new bindings can be created.
> +	 * Zap all SPTEs pointed at by this file.  Do not free the backing
> +	 * memory, as its lifetime is associated with the inode, not the file.
> +	 */
> +	kvm_gmem_invalidate_begin(gmem, 0, -1ul);
> +	kvm_gmem_invalidate_end(gmem, 0, -1ul);
> +
> +	mutex_unlock(&kvm->slots_lock);
> +
> +	list_del(&gmem->entry);
> +
> +	filemap_invalidate_unlock(inode->i_mapping);
> +
> +	xa_destroy(&gmem->bindings);
> +	kfree(gmem);
> +
> +	kvm_put_kvm(kvm);
> +
> +	return 0;
> +}

The lockdep complains with the filemapping lock and the kvm slot lock.


From bc45eb084a761f93a87ba1f6d3a9949c17adeb31 Mon Sep 17 00:00:00 2001
Message-Id: <bc45eb084a761f93a87ba1f6d3a9949c17adeb31.1689888438.git.isaku.yamahata@intel.com>
From: Isaku Yamahata <isaku.yamahata@intel.com>
Date: Thu, 20 Jul 2023 14:16:21 -0700
Subject: [PATCH] KVM/gmem: Fix locking ordering in kvm_gmem_release()

The lockdep complains the locking order.  Fix kvm_gmem_release()

VM destruction:
- fput()
   ...
   \-kvm_gmem_release()
     \-filemap_invalidate_lock(inode->i_mapping);
       lock(&kvm->slots_lock);

slot creation:
kvm_set_memory_region()
   mutex_lock(&kvm->slots_lock);
   __kvm_set_memory_region(kvm, mem);
    \-kvm_gmem_bind()
      \-filemap_invalidate_lock(inode->i_mapping);

======================================================
WARNING: possible circular locking dependency detected
------------------------------------------------------
...

the existing dependency chain (in reverse order) is:

-> #1 (mapping.invalidate_lock#4){+.+.}-{4:4}:
       ...
       down_write+0x40/0xe0
       kvm_gmem_bind+0xd9/0x1b0 [kvm]
       __kvm_set_memory_region.part.0+0x4fc/0x620 [kvm]
       __kvm_set_memory_region+0x6b/0x90 [kvm]
       kvm_vm_ioctl+0x350/0xa00 [kvm]
       __x64_sys_ioctl+0x95/0xd0
       do_syscall_64+0x39/0x90
       entry_SYSCALL_64_after_hwframe+0x6e/0xd8

-> #0 (&kvm->slots_lock){+.+.}-{4:4}:
       ...
       mutex_lock_nested+0x1b/0x30
       kvm_gmem_release+0x56/0x1b0 [kvm]
       __fput+0x115/0x2e0
       ____fput+0xe/0x20
       task_work_run+0x5e/0xb0
       do_exit+0x2dd/0x5b0
       do_group_exit+0x3b/0xb0
       __x64_sys_exit_group+0x18/0x20
       do_syscall_64+0x39/0x90
       entry_SYSCALL_64_after_hwframe+0x6e/0xd8

other info that might help us debug this:

 Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(mapping.invalidate_lock#4);
                               lock(&kvm->slots_lock);
                               lock(mapping.invalidate_lock#4);
  lock(&kvm->slots_lock);

Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
---
 virt/kvm/guest_mem.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/virt/kvm/guest_mem.c b/virt/kvm/guest_mem.c
index ab91e972e699..772e4631fcd9 100644
--- a/virt/kvm/guest_mem.c
+++ b/virt/kvm/guest_mem.c
@@ -274,8 +274,6 @@ static int kvm_gmem_release(struct inode *inode, struct file *file)
 	struct kvm *kvm = gmem->kvm;
 	unsigned long index;
 
-	filemap_invalidate_lock(inode->i_mapping);
-
 	/*
 	 * Prevent concurrent attempts to *unbind* a memslot.  This is the last
 	 * reference to the file and thus no new bindings can be created, but
@@ -285,6 +283,8 @@ static int kvm_gmem_release(struct inode *inode, struct file *file)
 	 */
 	mutex_lock(&kvm->slots_lock);
 
+	filemap_invalidate_lock(inode->i_mapping);
+
 	xa_for_each(&gmem->bindings, index, slot)
 		rcu_assign_pointer(slot->gmem.file, NULL);
 
@@ -299,12 +299,12 @@ static int kvm_gmem_release(struct inode *inode, struct file *file)
 	kvm_gmem_issue_arch_invalidate(gmem->kvm, file_inode(file), 0, -1ul);
 	kvm_gmem_invalidate_end(gmem, 0, -1ul);
 
-	mutex_unlock(&kvm->slots_lock);
-
 	list_del(&gmem->entry);
 
 	filemap_invalidate_unlock(inode->i_mapping);
 
+	mutex_unlock(&kvm->slots_lock);
+
 	xa_destroy(&gmem->bindings);
 	kfree(gmem);
Yuan Yao July 21, 2023, 6:13 a.m. UTC | #6
On Tue, Jul 18, 2023 at 04:44:55PM -0700, Sean Christopherson wrote:
> TODO
>
> Cc: Fuad Tabba <tabba@google.com>
> Cc: Vishal Annapurve <vannapurve@google.com>
> Cc: Ackerley Tng <ackerleytng@google.com>
> Cc: Jarkko Sakkinen <jarkko@kernel.org>
> Cc: Maciej Szmigiero <mail@maciej.szmigiero.name>
> Cc: Vlastimil Babka <vbabka@suse.cz>
> Cc: David Hildenbrand <david@redhat.com>
> Cc: Quentin Perret <qperret@google.com>
> Cc: Michael Roth <michael.roth@amd.com>
> Cc: Wang <wei.w.wang@intel.com>
> Cc: Liam Merwick <liam.merwick@oracle.com>
> Cc: Isaku Yamahata <isaku.yamahata@gmail.com>
> Co-developed-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> Co-developed-by: Yu Zhang <yu.c.zhang@linux.intel.com>
> Signed-off-by: Yu Zhang <yu.c.zhang@linux.intel.com>
> Co-developed-by: Chao Peng <chao.p.peng@linux.intel.com>
> Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
> Co-developed-by: Ackerley Tng <ackerleytng@google.com>
> Signed-off-by: Ackerley Tng <ackerleytng@google.com>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
>  include/linux/kvm_host.h   |  48 +++
>  include/uapi/linux/kvm.h   |  14 +-
>  include/uapi/linux/magic.h |   1 +
>  virt/kvm/Kconfig           |   4 +
>  virt/kvm/Makefile.kvm      |   1 +
>  virt/kvm/guest_mem.c       | 591 +++++++++++++++++++++++++++++++++++++
>  virt/kvm/kvm_main.c        |  58 +++-
>  virt/kvm/kvm_mm.h          |  38 +++
>  8 files changed, 750 insertions(+), 5 deletions(-)
>  create mode 100644 virt/kvm/guest_mem.c
>
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 97db63da6227..0d1e2ee8ae7a 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -592,8 +592,20 @@ struct kvm_memory_slot {
>  	u32 flags;
>  	short id;
>  	u16 as_id;
> +
> +#ifdef CONFIG_KVM_PRIVATE_MEM
> +	struct {
> +		struct file __rcu *file;
> +		pgoff_t pgoff;
> +	} gmem;
> +#endif
>  };
>
> +static inline bool kvm_slot_can_be_private(const struct kvm_memory_slot *slot)
> +{
> +	return slot && (slot->flags & KVM_MEM_PRIVATE);
> +}
> +
>  static inline bool kvm_slot_dirty_track_enabled(const struct kvm_memory_slot *slot)
>  {
>  	return slot->flags & KVM_MEM_LOG_DIRTY_PAGES;
> @@ -688,6 +700,17 @@ static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu)
>  }
>  #endif
>
> +/*
> + * Arch code must define kvm_arch_has_private_mem if support for private memory
> + * is enabled.
> + */
> +#if !defined(kvm_arch_has_private_mem) && !IS_ENABLED(CONFIG_KVM_PRIVATE_MEM)
> +static inline bool kvm_arch_has_private_mem(struct kvm *kvm)
> +{
> +	return false;
> +}
> +#endif
> +
>  struct kvm_memslots {
>  	u64 generation;
>  	atomic_long_t last_used_slot;
> @@ -1380,6 +1403,7 @@ void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
>  void kvm_mmu_invalidate_begin(struct kvm *kvm);
>  void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end);
>  void kvm_mmu_invalidate_end(struct kvm *kvm);
> +bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
>
>  long kvm_arch_dev_ioctl(struct file *filp,
>  			unsigned int ioctl, unsigned long arg);
> @@ -2313,6 +2337,30 @@ static inline unsigned long kvm_get_memory_attributes(struct kvm *kvm, gfn_t gfn
>
>  bool kvm_arch_post_set_memory_attributes(struct kvm *kvm,
>  					 struct kvm_gfn_range *range);
> +
> +static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
> +{
> +	return IS_ENABLED(CONFIG_KVM_PRIVATE_MEM) &&
> +	       kvm_get_memory_attributes(kvm, gfn) & KVM_MEMORY_ATTRIBUTE_PRIVATE;
> +}
> +#else
> +static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
> +{
> +	return false;
> +}
>  #endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
>
> +#ifdef CONFIG_KVM_PRIVATE_MEM
> +int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
> +			      gfn_t gfn, kvm_pfn_t *pfn, int *max_order);
> +#else
> +static inline int kvm_gmem_get_pfn(struct kvm *kvm,
> +				   struct kvm_memory_slot *slot, gfn_t gfn,
> +				   kvm_pfn_t *pfn, int *max_order)
> +{
> +	KVM_BUG_ON(1, kvm);
> +	return -EIO;
> +}
> +#endif /* CONFIG_KVM_PRIVATE_MEM */
> +
>  #endif
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index f065c57db327..9b344fc98598 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -102,7 +102,10 @@ struct kvm_userspace_memory_region2 {
>  	__u64 guest_phys_addr;
>  	__u64 memory_size;
>  	__u64 userspace_addr;
> -	__u64 pad[16];
> +	__u64 gmem_offset;
> +	__u32 gmem_fd;
> +	__u32 pad1;
> +	__u64 pad2[14];
>  };
>
>  /*
> @@ -112,6 +115,7 @@ struct kvm_userspace_memory_region2 {
>   */
>  #define KVM_MEM_LOG_DIRTY_PAGES	(1UL << 0)
>  #define KVM_MEM_READONLY	(1UL << 1)
> +#define KVM_MEM_PRIVATE		(1UL << 2)
>
>  /* for KVM_IRQ_LINE */
>  struct kvm_irq_level {
> @@ -2284,4 +2288,12 @@ struct kvm_memory_attributes {
>
>  #define KVM_MEMORY_ATTRIBUTE_PRIVATE           (1ULL << 3)
>
> +#define KVM_CREATE_GUEST_MEMFD	_IOWR(KVMIO,  0xd4, struct kvm_create_guest_memfd)
> +
> +struct kvm_create_guest_memfd {
> +	__u64 size;
> +	__u64 flags;
> +	__u64 reserved[6];
> +};
> +
>  #endif /* __LINUX_KVM_H */
> diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
> index 6325d1d0e90f..15041aa7d9ae 100644
> --- a/include/uapi/linux/magic.h
> +++ b/include/uapi/linux/magic.h
> @@ -101,5 +101,6 @@
>  #define DMA_BUF_MAGIC		0x444d4142	/* "DMAB" */
>  #define DEVMEM_MAGIC		0x454d444d	/* "DMEM" */
>  #define SECRETMEM_MAGIC		0x5345434d	/* "SECM" */
> +#define GUEST_MEMORY_MAGIC	0x474d454d	/* "GMEM" */
>
>  #endif /* __LINUX_MAGIC_H__ */
> diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
> index 8375bc49f97d..3ee3205e0b39 100644
> --- a/virt/kvm/Kconfig
> +++ b/virt/kvm/Kconfig
> @@ -103,3 +103,7 @@ config KVM_GENERIC_MMU_NOTIFIER
>  config KVM_GENERIC_MEMORY_ATTRIBUTES
>         select KVM_GENERIC_MMU_NOTIFIER
>         bool
> +
> +config KVM_PRIVATE_MEM
> +       select XARRAY_MULTI
> +       bool
> diff --git a/virt/kvm/Makefile.kvm b/virt/kvm/Makefile.kvm
> index 2c27d5d0c367..a5a61bbe7f4c 100644
> --- a/virt/kvm/Makefile.kvm
> +++ b/virt/kvm/Makefile.kvm
> @@ -12,3 +12,4 @@ kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o
>  kvm-$(CONFIG_HAVE_KVM_IRQ_ROUTING) += $(KVM)/irqchip.o
>  kvm-$(CONFIG_HAVE_KVM_DIRTY_RING) += $(KVM)/dirty_ring.o
>  kvm-$(CONFIG_HAVE_KVM_PFNCACHE) += $(KVM)/pfncache.o
> +kvm-$(CONFIG_KVM_PRIVATE_MEM) += $(KVM)/guest_mem.o
> diff --git a/virt/kvm/guest_mem.c b/virt/kvm/guest_mem.c
> new file mode 100644
> index 000000000000..1b705fd63fa8
> --- /dev/null
> +++ b/virt/kvm/guest_mem.c
> @@ -0,0 +1,591 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#include <linux/backing-dev.h>
> +#include <linux/falloc.h>
> +#include <linux/kvm_host.h>
> +#include <linux/pagemap.h>
> +#include <linux/pseudo_fs.h>
> +
> +#include <uapi/linux/magic.h>
> +
> +#include "kvm_mm.h"
> +
> +static struct vfsmount *kvm_gmem_mnt;
> +
> +struct kvm_gmem {
> +	struct kvm *kvm;
> +	struct xarray bindings;
> +	struct list_head entry;
> +};
> +
> +static struct folio *kvm_gmem_get_folio(struct file *file, pgoff_t index)
> +{
> +	struct folio *folio;
> +
> +	/* TODO: Support huge pages. */
> +	folio = filemap_grab_folio(file->f_mapping, index);
> +	if (!folio)
> +		return NULL;
> +
> +	/*
> +	 * Use the up-to-date flag to track whether or not the memory has been
> +	 * zeroed before being handed off to the guest.  There is no backing
> +	 * storage for the memory, so the folio will remain up-to-date until
> +	 * it's removed.
> +	 *
> +	 * TODO: Skip clearing pages when trusted firmware will do it when
> +	 * assigning memory to the guest.
> +	 */
> +	if (!folio_test_uptodate(folio)) {
> +		unsigned long nr_pages = folio_nr_pages(folio);
> +		unsigned long i;
> +
> +		for (i = 0; i < nr_pages; i++)
> +			clear_highpage(folio_page(folio, i));
> +
> +		folio_mark_uptodate(folio);
> +	}
> +
> +	/*
> +	 * Ignore accessed, referenced, and dirty flags.  The memory is
> +	 * unevictable and there is no storage to write back to.
> +	 */
> +	return folio;
> +}
> +
> +static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
> +				      pgoff_t end)
> +{
> +	struct kvm_memory_slot *slot;
> +	struct kvm *kvm = gmem->kvm;
> +	unsigned long index;
> +	bool flush = false;
> +
> +	KVM_MMU_LOCK(kvm);
> +
> +	kvm_mmu_invalidate_begin(kvm);
> +
> +	xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) {
> +		pgoff_t pgoff = slot->gmem.pgoff;
> +
> +		struct kvm_gfn_range gfn_range = {
> +			.start = slot->base_gfn + max(pgoff, start) - pgoff,
> +			.end = slot->base_gfn + min(pgoff + slot->npages, end) - pgoff,
> +			.slot = slot,
> +			.may_block = true,
> +		};
> +
> +		flush |= kvm_mmu_unmap_gfn_range(kvm, &gfn_range);
> +	}
> +
> +	if (flush)
> +		kvm_flush_remote_tlbs(kvm);
> +
> +	KVM_MMU_UNLOCK(kvm);
> +}
> +
> +static void kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start,
> +				    pgoff_t end)
> +{
> +	struct kvm *kvm = gmem->kvm;
> +
> +	KVM_MMU_LOCK(kvm);
> +	if (xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT))
> +		kvm_mmu_invalidate_end(kvm);
> +	KVM_MMU_UNLOCK(kvm);
> +}
> +
> +static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
> +{
> +	struct list_head *gmem_list = &inode->i_mapping->private_list;
> +	pgoff_t start = offset >> PAGE_SHIFT;
> +	pgoff_t end = (offset + len) >> PAGE_SHIFT;
> +	struct kvm_gmem *gmem;
> +
> +	/*
> +	 * Bindings must stable across invalidation to ensure the start+end
> +	 * are balanced.
> +	 */
> +	filemap_invalidate_lock(inode->i_mapping);
> +
> +	list_for_each_entry(gmem, gmem_list, entry)
> +		kvm_gmem_invalidate_begin(gmem, start, end);
> +
> +	truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1);
> +
> +	list_for_each_entry(gmem, gmem_list, entry)
> +		kvm_gmem_invalidate_end(gmem, start, end);
> +
> +	filemap_invalidate_unlock(inode->i_mapping);
> +
> +	return 0;
> +}
> +
> +static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
> +{
> +	struct address_space *mapping = inode->i_mapping;
> +	pgoff_t start, index, end;
> +	int r;
> +
> +	/* Dedicated guest is immutable by default. */
> +	if (offset + len > i_size_read(inode))
> +		return -EINVAL;
> +
> +	filemap_invalidate_lock_shared(mapping);
> +
> +	start = offset >> PAGE_SHIFT;
> +	end = (offset + len) >> PAGE_SHIFT;
> +
> +	r = 0;
> +	for (index = start; index < end; ) {
> +		struct folio *folio;
> +
> +		if (signal_pending(current)) {
> +			r = -EINTR;
> +			break;
> +		}
> +
> +		folio = kvm_gmem_get_folio(inode, index);
> +		if (!folio) {
> +			r = -ENOMEM;
> +			break;
> +		}
> +
> +		index = folio_next_index(folio);
> +
> +		folio_unlock(folio);
> +		folio_put(folio);
> +
> +		/* 64-bit only, wrapping the index should be impossible. */
> +		if (WARN_ON_ONCE(!index))
> +			break;
> +
> +		cond_resched();
> +	}
> +
> +	filemap_invalidate_unlock_shared(mapping);
> +
> +	return r;
> +}
> +
> +static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
> +			       loff_t len)
> +{
> +	int ret;
> +
> +	if (!(mode & FALLOC_FL_KEEP_SIZE))
> +		return -EOPNOTSUPP;
> +
> +	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
> +		return -EOPNOTSUPP;
> +
> +	if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
> +		return -EINVAL;
> +
> +	if (mode & FALLOC_FL_PUNCH_HOLE)
> +		ret = kvm_gmem_punch_hole(file_inode(file), offset, len);
> +	else
> +		ret = kvm_gmem_allocate(file_inode(file), offset, len);
> +
> +	if (!ret)
> +		file_modified(file);
> +	return ret;
> +}
> +
> +static int kvm_gmem_release(struct inode *inode, struct file *file)
> +{
> +	struct kvm_gmem *gmem = file->private_data;
> +	struct kvm_memory_slot *slot;
> +	struct kvm *kvm = gmem->kvm;
> +	unsigned long index;
> +
> +	filemap_invalidate_lock(inode->i_mapping);
> +
> +	/*
> +	 * Prevent concurrent attempts to *unbind* a memslot.  This is the last
> +	 * reference to the file and thus no new bindings can be created, but
> +	 * dereferencing the slot for existing bindings needs to be protected
> +	 * against memslot updates, specifically so that unbind doesn't race
> +	 * and free the memslot (kvm_gmem_get_file() will return NULL).
> +	 */
> +	mutex_lock(&kvm->slots_lock);
> +
> +	xa_for_each(&gmem->bindings, index, slot)
> +		rcu_assign_pointer(slot->gmem.file, NULL);
> +
> +	synchronize_rcu();
> +
> +	/*
> +	 * All in-flight operations are gone and new bindings can be created.
> +	 * Zap all SPTEs pointed at by this file.  Do not free the backing
> +	 * memory, as its lifetime is associated with the inode, not the file.
> +	 */
> +	kvm_gmem_invalidate_begin(gmem, 0, -1ul);
> +	kvm_gmem_invalidate_end(gmem, 0, -1ul);
> +
> +	mutex_unlock(&kvm->slots_lock);
> +
> +	list_del(&gmem->entry);
> +
> +	filemap_invalidate_unlock(inode->i_mapping);
> +
> +	xa_destroy(&gmem->bindings);
> +	kfree(gmem);
> +
> +	kvm_put_kvm(kvm);
> +
> +	return 0;
> +}
> +
> +static struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot)
> +{
> +	struct file *file;
> +
> +	rcu_read_lock();
> +
> +	file = rcu_dereference(slot->gmem.file);
> +	if (file && !get_file_rcu(file))
> +		file = NULL;
> +
> +	rcu_read_unlock();
> +
> +	return file;
> +}
> +
> +static const struct file_operations kvm_gmem_fops = {
> +	.open		= generic_file_open,
> +	.release	= kvm_gmem_release,
> +	.fallocate	= kvm_gmem_fallocate,
> +};
> +
> +static int kvm_gmem_migrate_folio(struct address_space *mapping,
> +				  struct folio *dst, struct folio *src,
> +				  enum migrate_mode mode)
> +{
> +	WARN_ON_ONCE(1);
> +	return -EINVAL;
> +}
> +
> +static int kvm_gmem_error_page(struct address_space *mapping, struct page *page)
> +{
> +	struct list_head *gmem_list = &mapping->private_list;
> +	struct kvm_memory_slot *slot;
> +	struct kvm_gmem *gmem;
> +	unsigned long index;
> +	pgoff_t start, end;
> +	gfn_t gfn;
> +
> +	filemap_invalidate_lock_shared(mapping);
> +
> +	start = page->index;
> +	end = start + thp_nr_pages(page);
> +
> +	list_for_each_entry(gmem, gmem_list, entry) {
> +		xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) {
> +			for (gfn = start; gfn < end; gfn++) {

Why the start end range used as gfn here ?

the page->index is offset of inode's page cache mapping and
gmem address space, IIUC, gfn calculation should follow same
way as kvm_gmem_invalidate_begin().

> +				if (WARN_ON_ONCE(gfn < slot->base_gfn ||
> +						gfn >= slot->base_gfn + slot->npages))
> +					continue;
> +
> +				/*
> +				 * FIXME: Tell userspace that the *private*
> +				 * memory encountered an error.
> +				 */
> +				send_sig_mceerr(BUS_MCEERR_AR,
> +						(void __user *)gfn_to_hva_memslot(slot, gfn),
> +						PAGE_SHIFT, current);
> +			}
> +		}
> +	}
> +
> +	filemap_invalidate_unlock_shared(mapping);
> +
> +	return 0;
> +}
> +
> +static const struct address_space_operations kvm_gmem_aops = {
> +	.dirty_folio = noop_dirty_folio,
> +#ifdef CONFIG_MIGRATION
> +	.migrate_folio	= kvm_gmem_migrate_folio,
> +#endif
> +	.error_remove_page = kvm_gmem_error_page,
> +};
> +
> +static int  kvm_gmem_getattr(struct mnt_idmap *idmap,
> +			     const struct path *path, struct kstat *stat,
> +			     u32 request_mask, unsigned int query_flags)
> +{
> +	struct inode *inode = path->dentry->d_inode;
> +
> +	/* TODO */
> +	generic_fillattr(idmap, inode, stat);
> +	return 0;
> +}
> +
> +static int kvm_gmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
> +			    struct iattr *attr)
> +{
> +	/* TODO */
> +	return -EINVAL;
> +}
> +static const struct inode_operations kvm_gmem_iops = {
> +	.getattr	= kvm_gmem_getattr,
> +	.setattr	= kvm_gmem_setattr,
> +};
> +
> +static int __kvm_gmem_create(struct kvm *kvm, loff_t size, struct vfsmount *mnt)
> +{
> +	const char *anon_name = "[kvm-gmem]";
> +	const struct qstr qname = QSTR_INIT(anon_name, strlen(anon_name));
> +	struct kvm_gmem *gmem;
> +	struct inode *inode;
> +	struct file *file;
> +	int fd, err;
> +
> +	inode = alloc_anon_inode(mnt->mnt_sb);
> +	if (IS_ERR(inode))
> +		return PTR_ERR(inode);
> +
> +	err = security_inode_init_security_anon(inode, &qname, NULL);
> +	if (err)
> +		goto err_inode;
> +
> +	inode->i_private = (void *)(unsigned long)flags;
> +	inode->i_op = &kvm_gmem_iops;
> +	inode->i_mapping->a_ops = &kvm_gmem_aops;
> +	inode->i_mode |= S_IFREG;
> +	inode->i_size = size;
> +	mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
> +	mapping_set_unevictable(inode->i_mapping);
> +	mapping_set_unmovable(inode->i_mapping);
> +
> +	fd = get_unused_fd_flags(0);
> +	if (fd < 0) {
> +		err = fd;
> +		goto err_inode;
> +	}
> +
> +	file = alloc_file_pseudo(inode, mnt, "kvm-gmem", O_RDWR, &kvm_gmem_fops);
> +	if (IS_ERR(file)) {
> +		err = PTR_ERR(file);
> +		goto err_fd;
> +	}
> +
> +	file->f_flags |= O_LARGEFILE;
> +	file->f_mapping = inode->i_mapping;
> +
> +	gmem = kzalloc(sizeof(*gmem), GFP_KERNEL);
> +	if (!gmem) {
> +		err = -ENOMEM;
> +		goto err_file;
> +	}
> +
> +	kvm_get_kvm(kvm);
> +	gmem->kvm = kvm;
> +	xa_init(&gmem->bindings);
> +
> +	file->private_data = gmem;
> +
> +	list_add(&gmem->entry, &inode->i_mapping->private_list);
> +
> +	fd_install(fd, file);
> +	return fd;
> +
> +err_file:
> +	fput(file);
> +err_fd:
> +	put_unused_fd(fd);
> +err_inode:
> +	iput(inode);
> +	return err;
> +}
> +
> +static bool kvm_gmem_is_valid_size(loff_t size, u64 flags)
> +{
> +	if (size < 0 || !PAGE_ALIGNED(size))
> +		return false;
> +
> +	return true;
> +}
> +
> +int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
> +{
> +	loff_t size = args->size;
> +	u64 flags = args->flags;
> +	u64 valid_flags = 0;
> +
> +	if (flags & ~valid_flags)
> +		return -EINVAL;
> +
> +	if (!kvm_gmem_is_valid_size(size, flags))
> +		return -EINVAL;
> +
> +	return __kvm_gmem_create(kvm, size, flags, kvm_gmem_mnt);
> +}
> +
> +int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
> +		  unsigned int fd, loff_t offset)
> +{
> +	loff_t size = slot->npages << PAGE_SHIFT;
> +	unsigned long start, end, flags;
> +	struct kvm_gmem *gmem;
> +	struct inode *inode;
> +	struct file *file;
> +
> +	BUILD_BUG_ON(sizeof(gfn_t) != sizeof(slot->gmem.pgoff));
> +
> +	file = fget(fd);
> +	if (!file)
> +		return -EINVAL;
> +
> +	if (file->f_op != &kvm_gmem_fops)
> +		goto err;
> +
> +	gmem = file->private_data;
> +	if (gmem->kvm != kvm)
> +		goto err;
> +
> +	inode = file_inode(file);
> +	flags = (unsigned long)inode->i_private;
> +
> +	/*
> +	 * For simplicity, require the offset into the file and the size of the
> +	 * memslot to be aligned to the largest possible page size used to back
> +	 * the file (same as the size of the file itself).
> +	 */
> +	if (!kvm_gmem_is_valid_size(offset, flags) ||
> +	    !kvm_gmem_is_valid_size(size, flags))
> +		goto err;
> +
> +	if (offset + size > i_size_read(inode))
> +		goto err;
> +
> +	filemap_invalidate_lock(inode->i_mapping);
> +
> +	start = offset >> PAGE_SHIFT;
> +	end = start + slot->npages;
> +
> +	if (!xa_empty(&gmem->bindings) &&
> +	    xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) {
> +		filemap_invalidate_unlock(inode->i_mapping);
> +		goto err;
> +	}
> +
> +	/*
> +	 * No synchronize_rcu() needed, any in-flight readers are guaranteed to
> +	 * be see either a NULL file or this new file, no need for them to go
> +	 * away.
> +	 */
> +	rcu_assign_pointer(slot->gmem.file, file);
> +	slot->gmem.pgoff = start;
> +
> +	xa_store_range(&gmem->bindings, start, end - 1, slot, GFP_KERNEL);
> +	filemap_invalidate_unlock(inode->i_mapping);
> +
> +	/*
> +	 * Drop the reference to the file, even on success.  The file pins KVM,
> +	 * not the other way 'round.  Active bindings are invalidated if the
> +	 * file is closed before memslots are destroyed.
> +	 */
> +	fput(file);
> +	return 0;
> +
> +err:
> +	fput(file);
> +	return -EINVAL;
> +}
> +
> +void kvm_gmem_unbind(struct kvm_memory_slot *slot)
> +{
> +	unsigned long start = slot->gmem.pgoff;
> +	unsigned long end = start + slot->npages;
> +	struct kvm_gmem *gmem;
> +	struct file *file;
> +
> +	/*
> +	 * Nothing to do if the underlying file was already closed (or is being
> +	 * closed right now), kvm_gmem_release() invalidates all bindings.
> +	 */
> +	file = kvm_gmem_get_file(slot);
> +	if (!file)
> +		return;
> +
> +	gmem = file->private_data;
> +
> +	filemap_invalidate_lock(file->f_mapping);
> +	xa_store_range(&gmem->bindings, start, end - 1, NULL, GFP_KERNEL);
> +	rcu_assign_pointer(slot->gmem.file, NULL);
> +	synchronize_rcu();
> +	filemap_invalidate_unlock(file->f_mapping);
> +
> +	fput(file);
> +}
> +
> +int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
> +		     gfn_t gfn, kvm_pfn_t *pfn, int *max_order)
> +{
> +	pgoff_t index = gfn - slot->base_gfn + slot->gmem.pgoff;
> +	struct kvm_gmem *gmem;
> +	struct folio *folio;
> +	struct page *page;
> +	struct file *file;
> +
> +	file = kvm_gmem_get_file(slot);
> +	if (!file)
> +		return -EFAULT;
> +
> +	gmem = file->private_data;
> +
> +	if (WARN_ON_ONCE(xa_load(&gmem->bindings, index) != slot)) {
> +		fput(file);
> +		return -EIO;
> +	}
> +
> +	folio = kvm_gmem_get_folio(file_inode(file), index);
> +	if (!folio) {
> +		fput(file);
> +		return -ENOMEM;
> +	}
> +
> +	page = folio_file_page(folio, index);
> +
> +	*pfn = page_to_pfn(page);
> +	*max_order = compound_order(compound_head(page));
> +
> +	folio_unlock(folio);
> +	fput(file);
> +
> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn);
> +
> +static int kvm_gmem_init_fs_context(struct fs_context *fc)
> +{
> +	if (!init_pseudo(fc, GUEST_MEMORY_MAGIC))
> +		return -ENOMEM;
> +
> +	return 0;
> +}
> +
> +static struct file_system_type kvm_gmem_fs = {
> +	.name		 = "kvm_guest_memory",
> +	.init_fs_context = kvm_gmem_init_fs_context,
> +	.kill_sb	 = kill_anon_super,
> +};
> +
> +int kvm_gmem_init(void)
> +{
> +	kvm_gmem_mnt = kern_mount(&kvm_gmem_fs);
> +	if (IS_ERR(kvm_gmem_mnt))
> +		return PTR_ERR(kvm_gmem_mnt);
> +
> +	/* For giggles.  Userspace can never map this anyways. */
> +	kvm_gmem_mnt->mnt_flags |= MNT_NOEXEC;
> +
> +	return 0;
> +}
> +
> +void kvm_gmem_exit(void)
> +{
> +	kern_unmount(kvm_gmem_mnt);
> +	kvm_gmem_mnt = NULL;
> +}
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 1a31bfa025b0..a8686e8473a4 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -761,7 +761,7 @@ void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end)
>  	}
>  }
>
> -static bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
> +bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
>  {
>  	kvm_mmu_invalidate_range_add(kvm, range->start, range->end);
>  	return kvm_unmap_gfn_range(kvm, range);
> @@ -992,6 +992,9 @@ static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
>  /* This does not remove the slot from struct kvm_memslots data structures */
>  static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
>  {
> +	if (slot->flags & KVM_MEM_PRIVATE)
> +		kvm_gmem_unbind(slot);
> +
>  	kvm_destroy_dirty_bitmap(slot);
>
>  	kvm_arch_free_memslot(kvm, slot);
> @@ -1556,10 +1559,18 @@ static void kvm_replace_memslot(struct kvm *kvm,
>  	}
>  }
>
> -static int check_memory_region_flags(const struct kvm_userspace_memory_region2 *mem)
> +static int check_memory_region_flags(struct kvm *kvm,
> +				     const struct kvm_userspace_memory_region2 *mem)
>  {
>  	u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
>
> +	if (kvm_arch_has_private_mem(kvm))
> +		valid_flags |= KVM_MEM_PRIVATE;
> +
> +	/* Dirty logging private memory is not currently supported. */
> +	if (mem->flags & KVM_MEM_PRIVATE)
> +		valid_flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
> +
>  #ifdef __KVM_HAVE_READONLY_MEM
>  	valid_flags |= KVM_MEM_READONLY;
>  #endif
> @@ -1968,7 +1979,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
>  	int as_id, id;
>  	int r;
>
> -	r = check_memory_region_flags(mem);
> +	r = check_memory_region_flags(kvm, mem);
>  	if (r)
>  		return r;
>
> @@ -1987,6 +1998,10 @@ int __kvm_set_memory_region(struct kvm *kvm,
>  	     !access_ok((void __user *)(unsigned long)mem->userspace_addr,
>  			mem->memory_size))
>  		return -EINVAL;
> +	if (mem->flags & KVM_MEM_PRIVATE &&
> +	    (mem->gmem_offset & (PAGE_SIZE - 1) ||
> +	     mem->gmem_offset + mem->memory_size < mem->gmem_offset))
> +		return -EINVAL;
>  	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM)
>  		return -EINVAL;
>  	if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
> @@ -2025,6 +2040,9 @@ int __kvm_set_memory_region(struct kvm *kvm,
>  		if ((kvm->nr_memslot_pages + npages) < kvm->nr_memslot_pages)
>  			return -EINVAL;
>  	} else { /* Modify an existing slot. */
> +		/* Private memslots are immutable, they can only be deleted. */
> +		if (mem->flags & KVM_MEM_PRIVATE)
> +			return -EINVAL;
>  		if ((mem->userspace_addr != old->userspace_addr) ||
>  		    (npages != old->npages) ||
>  		    ((mem->flags ^ old->flags) & KVM_MEM_READONLY))
> @@ -2053,10 +2071,23 @@ int __kvm_set_memory_region(struct kvm *kvm,
>  	new->npages = npages;
>  	new->flags = mem->flags;
>  	new->userspace_addr = mem->userspace_addr;
> +	if (mem->flags & KVM_MEM_PRIVATE) {
> +		r = kvm_gmem_bind(kvm, new, mem->gmem_fd, mem->gmem_offset);
> +		if (r)
> +			goto out;
> +	}
>
>  	r = kvm_set_memslot(kvm, old, new, change);
>  	if (r)
> -		kfree(new);
> +		goto out_restricted;
> +
> +	return 0;
> +
> +out_restricted:
> +	if (mem->flags & KVM_MEM_PRIVATE)
> +		kvm_gmem_unbind(new);
> +out:
> +	kfree(new);
>  	return r;
>  }
>  EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
> @@ -2356,6 +2387,8 @@ static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
>  #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
>  static u64 kvm_supported_mem_attributes(struct kvm *kvm)
>  {
> +	if (kvm_arch_has_private_mem(kvm))
> +		return KVM_MEMORY_ATTRIBUTE_PRIVATE;
>  	return 0;
>  }
>
> @@ -5134,6 +5167,16 @@ static long kvm_vm_ioctl(struct file *filp,
>  	case KVM_GET_STATS_FD:
>  		r = kvm_vm_ioctl_get_stats_fd(kvm);
>  		break;
> +	case KVM_CREATE_GUEST_MEMFD: {
> +		struct kvm_create_guest_memfd guest_memfd;
> +
> +		r = -EFAULT;
> +		if (copy_from_user(&guest_memfd, argp, sizeof(guest_memfd)))
> +			goto out;
> +
> +		r = kvm_gmem_create(kvm, &guest_memfd);
> +		break;
> +	}
>  	default:
>  		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
>  	}
> @@ -6255,12 +6298,17 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
>  	if (r)
>  		goto err_async_pf;
>
> +	r = kvm_gmem_init();
> +	if (r)
> +		goto err_gmem;
> +
>  	kvm_chardev_ops.owner = module;
>
>  	kvm_preempt_ops.sched_in = kvm_sched_in;
>  	kvm_preempt_ops.sched_out = kvm_sched_out;
>
>  	kvm_init_debug();
> +	kvm_gmem_init();
>
>  	r = kvm_vfio_ops_init();
>  	if (WARN_ON_ONCE(r))
> @@ -6281,6 +6329,8 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
>  err_register:
>  	kvm_vfio_ops_exit();
>  err_vfio:
> +	kvm_gmem_exit();
> +err_gmem:
>  	kvm_async_pf_deinit();
>  err_async_pf:
>  	kvm_irqfd_exit();
> diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h
> index 180f1a09e6ba..798f20d612bb 100644
> --- a/virt/kvm/kvm_mm.h
> +++ b/virt/kvm/kvm_mm.h
> @@ -37,4 +37,42 @@ static inline void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm,
>  }
>  #endif /* HAVE_KVM_PFNCACHE */
>
> +#ifdef CONFIG_KVM_PRIVATE_MEM
> +int kvm_gmem_init(void);
> +void kvm_gmem_exit(void);
> +int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args);
> +int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
> +		  unsigned int fd, loff_t offset);
> +void kvm_gmem_unbind(struct kvm_memory_slot *slot);
> +#else
> +static inline int kvm_gmem_init(void)
> +{
> +	return 0;
> +}
> +
> +static inline void kvm_gmem_exit(void)
> +{
> +
> +}
> +
> +static inline int kvm_gmem_create(struct kvm *kvm,
> +				  struct kvm_create_guest_memfd *args)
> +{
> +	return -EOPNOTSUPP;
> +}
> +
> +static inline int kvm_gmem_bind(struct kvm *kvm,
> +					 struct kvm_memory_slot *slot,
> +					 unsigned int fd, loff_t offset)
> +{
> +	WARN_ON_ONCE(1);
> +	return -EIO;
> +}
> +
> +static inline void kvm_gmem_unbind(struct kvm_memory_slot *slot)
> +{
> +	WARN_ON_ONCE(1);
> +}
> +#endif /* CONFIG_KVM_PRIVATE_MEM */
> +
>  #endif /* __KVM_MM_H__ */
> --
> 2.41.0.255.g8b1d071c50-goog
>
Xiaoyao Li July 21, 2023, 3:05 p.m. UTC | #7
On 7/19/2023 7:44 AM, Sean Christopherson wrote:
> @@ -6255,12 +6298,17 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
>   	if (r)
>   		goto err_async_pf;
>   
> +	r = kvm_gmem_init();
> +	if (r)
> +		goto err_gmem;
> +
>   	kvm_chardev_ops.owner = module;
>   
>   	kvm_preempt_ops.sched_in = kvm_sched_in;
>   	kvm_preempt_ops.sched_out = kvm_sched_out;
>   
>   	kvm_init_debug();
> +	kvm_gmem_init();

why kvm_gmem_init() needs to be called again? by mistake?
Xiaoyao Li July 21, 2023, 3:42 p.m. UTC | #8
On 7/21/2023 11:05 PM, Xiaoyao Li wrote:
> On 7/19/2023 7:44 AM, Sean Christopherson wrote:
>> @@ -6255,12 +6298,17 @@ int kvm_init(unsigned vcpu_size, unsigned 
>> vcpu_align, struct module *module)
>>       if (r)
>>           goto err_async_pf;
>> +    r = kvm_gmem_init();
>> +    if (r)
>> +        goto err_gmem;
>> +
>>       kvm_chardev_ops.owner = module;
>>       kvm_preempt_ops.sched_in = kvm_sched_in;
>>       kvm_preempt_ops.sched_out = kvm_sched_out;
>>       kvm_init_debug();
>> +    kvm_gmem_init();
> 
> why kvm_gmem_init() needs to be called again? by mistake?

I'm sure it's a mistake.

I'm testing the gmem QEMU with this series. SW_PROTECTED_VM gets stuck 
in a loop in early OVMF code due to two shared page of OVMF get zapped 
and re-mapped infinitely. Removing the second call of kvm_gmem_init() 
can solve the issue, though I'm not sure about the reason.
Paolo Bonzini July 21, 2023, 5:17 p.m. UTC | #9
On 7/19/23 01:44, Sean Christopherson wrote:
> +	inode = alloc_anon_inode(mnt->mnt_sb);
> +	if (IS_ERR(inode))
> +		return PTR_ERR(inode);
> +
> +	err = security_inode_init_security_anon(inode, &qname, NULL);
> +	if (err)
> +		goto err_inode;
> +

I don't understand the need to have a separate filesystem.  If it is to 
fully setup the inode before it's given a struct file, why not just 
export anon_inode_make_secure_inode instead of 
security_inode_init_security_anon?

Paolo
Sean Christopherson July 21, 2023, 5:42 p.m. UTC | #10
On Fri, Jul 21, 2023, Xiaoyao Li wrote:
> On 7/21/2023 11:05 PM, Xiaoyao Li wrote:
> > On 7/19/2023 7:44 AM, Sean Christopherson wrote:
> > > @@ -6255,12 +6298,17 @@ int kvm_init(unsigned vcpu_size, unsigned
> > > vcpu_align, struct module *module)
> > >       if (r)
> > >           goto err_async_pf;
> > > +    r = kvm_gmem_init();
> > > +    if (r)
> > > +        goto err_gmem;
> > > +
> > >       kvm_chardev_ops.owner = module;
> > >       kvm_preempt_ops.sched_in = kvm_sched_in;
> > >       kvm_preempt_ops.sched_out = kvm_sched_out;
> > >       kvm_init_debug();
> > > +    kvm_gmem_init();
> > 
> > why kvm_gmem_init() needs to be called again? by mistake?
> 
> I'm sure it's a mistake.

Yeah, definitely a bug.

> I'm testing the gmem QEMU with this series. SW_PROTECTED_VM gets stuck in a
> loop in early OVMF code due to two shared page of OVMF get zapped and
> re-mapped infinitely. Removing the second call of kvm_gmem_init() can solve
> the issue, though I'm not sure about the reason.

Not worth investigating unless you want to satiate your curiosity :-)
Sean Christopherson July 21, 2023, 5:50 p.m. UTC | #11
On Fri, Jul 21, 2023, Paolo Bonzini wrote:
> On 7/19/23 01:44, Sean Christopherson wrote:
> > +	inode = alloc_anon_inode(mnt->mnt_sb);
> > +	if (IS_ERR(inode))
> > +		return PTR_ERR(inode);
> > +
> > +	err = security_inode_init_security_anon(inode, &qname, NULL);
> > +	if (err)
> > +		goto err_inode;
> > +
> 
> I don't understand the need to have a separate filesystem.  If it is to
> fully setup the inode before it's given a struct file, why not just export
> anon_inode_make_secure_inode instead of security_inode_init_security_anon?

Ugh, this is why comments are important, I can't remember either.

I suspect I implemented a dedicated filesystem to kinda sorta show that we could
allow userspace to provide the mount point with e.g. NUMA hints[*].  But my
preference would be to not support a userspace provided mount and instead implement
fbind() to let userspace control NUMA and whatnot.

[*] https://lore.kernel.org/all/ef48935e5e6f947f6f0c6d748232b14ef5d5ad70.1681176340.git.ackerleytng@google.com
Isaku Yamahata July 21, 2023, 10:27 p.m. UTC | #12
On Fri, Jul 21, 2023 at 02:13:14PM +0800,
Yuan Yao <yuan.yao@linux.intel.com> wrote:

> On Tue, Jul 18, 2023 at 04:44:55PM -0700, Sean Christopherson wrote:
> > TODO
> >
> > Cc: Fuad Tabba <tabba@google.com>
> > Cc: Vishal Annapurve <vannapurve@google.com>
> > Cc: Ackerley Tng <ackerleytng@google.com>
> > Cc: Jarkko Sakkinen <jarkko@kernel.org>
> > Cc: Maciej Szmigiero <mail@maciej.szmigiero.name>
> > Cc: Vlastimil Babka <vbabka@suse.cz>
> > Cc: David Hildenbrand <david@redhat.com>
> > Cc: Quentin Perret <qperret@google.com>
> > Cc: Michael Roth <michael.roth@amd.com>
> > Cc: Wang <wei.w.wang@intel.com>
> > Cc: Liam Merwick <liam.merwick@oracle.com>
> > Cc: Isaku Yamahata <isaku.yamahata@gmail.com>
> > Co-developed-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> > Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> > Co-developed-by: Yu Zhang <yu.c.zhang@linux.intel.com>
> > Signed-off-by: Yu Zhang <yu.c.zhang@linux.intel.com>
> > Co-developed-by: Chao Peng <chao.p.peng@linux.intel.com>
> > Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
> > Co-developed-by: Ackerley Tng <ackerleytng@google.com>
> > Signed-off-by: Ackerley Tng <ackerleytng@google.com>
> > Signed-off-by: Sean Christopherson <seanjc@google.com>
> > ---
> >  include/linux/kvm_host.h   |  48 +++
> >  include/uapi/linux/kvm.h   |  14 +-
> >  include/uapi/linux/magic.h |   1 +
> >  virt/kvm/Kconfig           |   4 +
> >  virt/kvm/Makefile.kvm      |   1 +
> >  virt/kvm/guest_mem.c       | 591 +++++++++++++++++++++++++++++++++++++
> >  virt/kvm/kvm_main.c        |  58 +++-
> >  virt/kvm/kvm_mm.h          |  38 +++
> >  8 files changed, 750 insertions(+), 5 deletions(-)
> >  create mode 100644 virt/kvm/guest_mem.c
> >
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index 97db63da6227..0d1e2ee8ae7a 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -592,8 +592,20 @@ struct kvm_memory_slot {
> >  	u32 flags;
> >  	short id;
> >  	u16 as_id;
> > +
> > +#ifdef CONFIG_KVM_PRIVATE_MEM
> > +	struct {
> > +		struct file __rcu *file;
> > +		pgoff_t pgoff;
> > +	} gmem;
> > +#endif
> >  };
> >
> > +static inline bool kvm_slot_can_be_private(const struct kvm_memory_slot *slot)
> > +{
> > +	return slot && (slot->flags & KVM_MEM_PRIVATE);
> > +}
> > +
> >  static inline bool kvm_slot_dirty_track_enabled(const struct kvm_memory_slot *slot)
> >  {
> >  	return slot->flags & KVM_MEM_LOG_DIRTY_PAGES;
> > @@ -688,6 +700,17 @@ static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu)
> >  }
> >  #endif
> >
> > +/*
> > + * Arch code must define kvm_arch_has_private_mem if support for private memory
> > + * is enabled.
> > + */
> > +#if !defined(kvm_arch_has_private_mem) && !IS_ENABLED(CONFIG_KVM_PRIVATE_MEM)
> > +static inline bool kvm_arch_has_private_mem(struct kvm *kvm)
> > +{
> > +	return false;
> > +}
> > +#endif
> > +
> >  struct kvm_memslots {
> >  	u64 generation;
> >  	atomic_long_t last_used_slot;
> > @@ -1380,6 +1403,7 @@ void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
> >  void kvm_mmu_invalidate_begin(struct kvm *kvm);
> >  void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end);
> >  void kvm_mmu_invalidate_end(struct kvm *kvm);
> > +bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
> >
> >  long kvm_arch_dev_ioctl(struct file *filp,
> >  			unsigned int ioctl, unsigned long arg);
> > @@ -2313,6 +2337,30 @@ static inline unsigned long kvm_get_memory_attributes(struct kvm *kvm, gfn_t gfn
> >
> >  bool kvm_arch_post_set_memory_attributes(struct kvm *kvm,
> >  					 struct kvm_gfn_range *range);
> > +
> > +static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
> > +{
> > +	return IS_ENABLED(CONFIG_KVM_PRIVATE_MEM) &&
> > +	       kvm_get_memory_attributes(kvm, gfn) & KVM_MEMORY_ATTRIBUTE_PRIVATE;
> > +}
> > +#else
> > +static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
> > +{
> > +	return false;
> > +}
> >  #endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
> >
> > +#ifdef CONFIG_KVM_PRIVATE_MEM
> > +int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
> > +			      gfn_t gfn, kvm_pfn_t *pfn, int *max_order);
> > +#else
> > +static inline int kvm_gmem_get_pfn(struct kvm *kvm,
> > +				   struct kvm_memory_slot *slot, gfn_t gfn,
> > +				   kvm_pfn_t *pfn, int *max_order)
> > +{
> > +	KVM_BUG_ON(1, kvm);
> > +	return -EIO;
> > +}
> > +#endif /* CONFIG_KVM_PRIVATE_MEM */
> > +
> >  #endif
> > diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> > index f065c57db327..9b344fc98598 100644
> > --- a/include/uapi/linux/kvm.h
> > +++ b/include/uapi/linux/kvm.h
> > @@ -102,7 +102,10 @@ struct kvm_userspace_memory_region2 {
> >  	__u64 guest_phys_addr;
> >  	__u64 memory_size;
> >  	__u64 userspace_addr;
> > -	__u64 pad[16];
> > +	__u64 gmem_offset;
> > +	__u32 gmem_fd;
> > +	__u32 pad1;
> > +	__u64 pad2[14];
> >  };
> >
> >  /*
> > @@ -112,6 +115,7 @@ struct kvm_userspace_memory_region2 {
> >   */
> >  #define KVM_MEM_LOG_DIRTY_PAGES	(1UL << 0)
> >  #define KVM_MEM_READONLY	(1UL << 1)
> > +#define KVM_MEM_PRIVATE		(1UL << 2)
> >
> >  /* for KVM_IRQ_LINE */
> >  struct kvm_irq_level {
> > @@ -2284,4 +2288,12 @@ struct kvm_memory_attributes {
> >
> >  #define KVM_MEMORY_ATTRIBUTE_PRIVATE           (1ULL << 3)
> >
> > +#define KVM_CREATE_GUEST_MEMFD	_IOWR(KVMIO,  0xd4, struct kvm_create_guest_memfd)
> > +
> > +struct kvm_create_guest_memfd {
> > +	__u64 size;
> > +	__u64 flags;
> > +	__u64 reserved[6];
> > +};
> > +
> >  #endif /* __LINUX_KVM_H */
> > diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
> > index 6325d1d0e90f..15041aa7d9ae 100644
> > --- a/include/uapi/linux/magic.h
> > +++ b/include/uapi/linux/magic.h
> > @@ -101,5 +101,6 @@
> >  #define DMA_BUF_MAGIC		0x444d4142	/* "DMAB" */
> >  #define DEVMEM_MAGIC		0x454d444d	/* "DMEM" */
> >  #define SECRETMEM_MAGIC		0x5345434d	/* "SECM" */
> > +#define GUEST_MEMORY_MAGIC	0x474d454d	/* "GMEM" */
> >
> >  #endif /* __LINUX_MAGIC_H__ */
> > diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
> > index 8375bc49f97d..3ee3205e0b39 100644
> > --- a/virt/kvm/Kconfig
> > +++ b/virt/kvm/Kconfig
> > @@ -103,3 +103,7 @@ config KVM_GENERIC_MMU_NOTIFIER
> >  config KVM_GENERIC_MEMORY_ATTRIBUTES
> >         select KVM_GENERIC_MMU_NOTIFIER
> >         bool
> > +
> > +config KVM_PRIVATE_MEM
> > +       select XARRAY_MULTI
> > +       bool
> > diff --git a/virt/kvm/Makefile.kvm b/virt/kvm/Makefile.kvm
> > index 2c27d5d0c367..a5a61bbe7f4c 100644
> > --- a/virt/kvm/Makefile.kvm
> > +++ b/virt/kvm/Makefile.kvm
> > @@ -12,3 +12,4 @@ kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o
> >  kvm-$(CONFIG_HAVE_KVM_IRQ_ROUTING) += $(KVM)/irqchip.o
> >  kvm-$(CONFIG_HAVE_KVM_DIRTY_RING) += $(KVM)/dirty_ring.o
> >  kvm-$(CONFIG_HAVE_KVM_PFNCACHE) += $(KVM)/pfncache.o
> > +kvm-$(CONFIG_KVM_PRIVATE_MEM) += $(KVM)/guest_mem.o
> > diff --git a/virt/kvm/guest_mem.c b/virt/kvm/guest_mem.c
> > new file mode 100644
> > index 000000000000..1b705fd63fa8
> > --- /dev/null
> > +++ b/virt/kvm/guest_mem.c
> > @@ -0,0 +1,591 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +#include <linux/backing-dev.h>
> > +#include <linux/falloc.h>
> > +#include <linux/kvm_host.h>
> > +#include <linux/pagemap.h>
> > +#include <linux/pseudo_fs.h>
> > +
> > +#include <uapi/linux/magic.h>
> > +
> > +#include "kvm_mm.h"
> > +
> > +static struct vfsmount *kvm_gmem_mnt;
> > +
> > +struct kvm_gmem {
> > +	struct kvm *kvm;
> > +	struct xarray bindings;
> > +	struct list_head entry;
> > +};
> > +
> > +static struct folio *kvm_gmem_get_folio(struct file *file, pgoff_t index)
> > +{
> > +	struct folio *folio;
> > +
> > +	/* TODO: Support huge pages. */
> > +	folio = filemap_grab_folio(file->f_mapping, index);
> > +	if (!folio)
> > +		return NULL;
> > +
> > +	/*
> > +	 * Use the up-to-date flag to track whether or not the memory has been
> > +	 * zeroed before being handed off to the guest.  There is no backing
> > +	 * storage for the memory, so the folio will remain up-to-date until
> > +	 * it's removed.
> > +	 *
> > +	 * TODO: Skip clearing pages when trusted firmware will do it when
> > +	 * assigning memory to the guest.
> > +	 */
> > +	if (!folio_test_uptodate(folio)) {
> > +		unsigned long nr_pages = folio_nr_pages(folio);
> > +		unsigned long i;
> > +
> > +		for (i = 0; i < nr_pages; i++)
> > +			clear_highpage(folio_page(folio, i));
> > +
> > +		folio_mark_uptodate(folio);
> > +	}
> > +
> > +	/*
> > +	 * Ignore accessed, referenced, and dirty flags.  The memory is
> > +	 * unevictable and there is no storage to write back to.
> > +	 */
> > +	return folio;
> > +}
> > +
> > +static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
> > +				      pgoff_t end)
> > +{
> > +	struct kvm_memory_slot *slot;
> > +	struct kvm *kvm = gmem->kvm;
> > +	unsigned long index;
> > +	bool flush = false;
> > +
> > +	KVM_MMU_LOCK(kvm);
> > +
> > +	kvm_mmu_invalidate_begin(kvm);
> > +
> > +	xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) {
> > +		pgoff_t pgoff = slot->gmem.pgoff;
> > +
> > +		struct kvm_gfn_range gfn_range = {
> > +			.start = slot->base_gfn + max(pgoff, start) - pgoff,
> > +			.end = slot->base_gfn + min(pgoff + slot->npages, end) - pgoff,
> > +			.slot = slot,
> > +			.may_block = true,
> > +		};
> > +
> > +		flush |= kvm_mmu_unmap_gfn_range(kvm, &gfn_range);
> > +	}
> > +
> > +	if (flush)
> > +		kvm_flush_remote_tlbs(kvm);
> > +
> > +	KVM_MMU_UNLOCK(kvm);
> > +}
> > +
> > +static void kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start,
> > +				    pgoff_t end)
> > +{
> > +	struct kvm *kvm = gmem->kvm;
> > +
> > +	KVM_MMU_LOCK(kvm);
> > +	if (xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT))
> > +		kvm_mmu_invalidate_end(kvm);
> > +	KVM_MMU_UNLOCK(kvm);
> > +}
> > +
> > +static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
> > +{
> > +	struct list_head *gmem_list = &inode->i_mapping->private_list;
> > +	pgoff_t start = offset >> PAGE_SHIFT;
> > +	pgoff_t end = (offset + len) >> PAGE_SHIFT;
> > +	struct kvm_gmem *gmem;
> > +
> > +	/*
> > +	 * Bindings must stable across invalidation to ensure the start+end
> > +	 * are balanced.
> > +	 */
> > +	filemap_invalidate_lock(inode->i_mapping);
> > +
> > +	list_for_each_entry(gmem, gmem_list, entry)
> > +		kvm_gmem_invalidate_begin(gmem, start, end);
> > +
> > +	truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1);
> > +
> > +	list_for_each_entry(gmem, gmem_list, entry)
> > +		kvm_gmem_invalidate_end(gmem, start, end);
> > +
> > +	filemap_invalidate_unlock(inode->i_mapping);
> > +
> > +	return 0;
> > +}
> > +
> > +static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
> > +{
> > +	struct address_space *mapping = inode->i_mapping;
> > +	pgoff_t start, index, end;
> > +	int r;
> > +
> > +	/* Dedicated guest is immutable by default. */
> > +	if (offset + len > i_size_read(inode))
> > +		return -EINVAL;
> > +
> > +	filemap_invalidate_lock_shared(mapping);
> > +
> > +	start = offset >> PAGE_SHIFT;
> > +	end = (offset + len) >> PAGE_SHIFT;
> > +
> > +	r = 0;
> > +	for (index = start; index < end; ) {
> > +		struct folio *folio;
> > +
> > +		if (signal_pending(current)) {
> > +			r = -EINTR;
> > +			break;
> > +		}
> > +
> > +		folio = kvm_gmem_get_folio(inode, index);
> > +		if (!folio) {
> > +			r = -ENOMEM;
> > +			break;
> > +		}
> > +
> > +		index = folio_next_index(folio);
> > +
> > +		folio_unlock(folio);
> > +		folio_put(folio);
> > +
> > +		/* 64-bit only, wrapping the index should be impossible. */
> > +		if (WARN_ON_ONCE(!index))
> > +			break;
> > +
> > +		cond_resched();
> > +	}
> > +
> > +	filemap_invalidate_unlock_shared(mapping);
> > +
> > +	return r;
> > +}
> > +
> > +static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
> > +			       loff_t len)
> > +{
> > +	int ret;
> > +
> > +	if (!(mode & FALLOC_FL_KEEP_SIZE))
> > +		return -EOPNOTSUPP;
> > +
> > +	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
> > +		return -EOPNOTSUPP;
> > +
> > +	if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
> > +		return -EINVAL;
> > +
> > +	if (mode & FALLOC_FL_PUNCH_HOLE)
> > +		ret = kvm_gmem_punch_hole(file_inode(file), offset, len);
> > +	else
> > +		ret = kvm_gmem_allocate(file_inode(file), offset, len);
> > +
> > +	if (!ret)
> > +		file_modified(file);
> > +	return ret;
> > +}
> > +
> > +static int kvm_gmem_release(struct inode *inode, struct file *file)
> > +{
> > +	struct kvm_gmem *gmem = file->private_data;
> > +	struct kvm_memory_slot *slot;
> > +	struct kvm *kvm = gmem->kvm;
> > +	unsigned long index;
> > +
> > +	filemap_invalidate_lock(inode->i_mapping);
> > +
> > +	/*
> > +	 * Prevent concurrent attempts to *unbind* a memslot.  This is the last
> > +	 * reference to the file and thus no new bindings can be created, but
> > +	 * dereferencing the slot for existing bindings needs to be protected
> > +	 * against memslot updates, specifically so that unbind doesn't race
> > +	 * and free the memslot (kvm_gmem_get_file() will return NULL).
> > +	 */
> > +	mutex_lock(&kvm->slots_lock);
> > +
> > +	xa_for_each(&gmem->bindings, index, slot)
> > +		rcu_assign_pointer(slot->gmem.file, NULL);
> > +
> > +	synchronize_rcu();
> > +
> > +	/*
> > +	 * All in-flight operations are gone and new bindings can be created.
> > +	 * Zap all SPTEs pointed at by this file.  Do not free the backing
> > +	 * memory, as its lifetime is associated with the inode, not the file.
> > +	 */
> > +	kvm_gmem_invalidate_begin(gmem, 0, -1ul);
> > +	kvm_gmem_invalidate_end(gmem, 0, -1ul);
> > +
> > +	mutex_unlock(&kvm->slots_lock);
> > +
> > +	list_del(&gmem->entry);
> > +
> > +	filemap_invalidate_unlock(inode->i_mapping);
> > +
> > +	xa_destroy(&gmem->bindings);
> > +	kfree(gmem);
> > +
> > +	kvm_put_kvm(kvm);
> > +
> > +	return 0;
> > +}
> > +
> > +static struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot)
> > +{
> > +	struct file *file;
> > +
> > +	rcu_read_lock();
> > +
> > +	file = rcu_dereference(slot->gmem.file);
> > +	if (file && !get_file_rcu(file))
> > +		file = NULL;
> > +
> > +	rcu_read_unlock();
> > +
> > +	return file;
> > +}
> > +
> > +static const struct file_operations kvm_gmem_fops = {
> > +	.open		= generic_file_open,
> > +	.release	= kvm_gmem_release,
> > +	.fallocate	= kvm_gmem_fallocate,
> > +};
> > +
> > +static int kvm_gmem_migrate_folio(struct address_space *mapping,
> > +				  struct folio *dst, struct folio *src,
> > +				  enum migrate_mode mode)
> > +{
> > +	WARN_ON_ONCE(1);
> > +	return -EINVAL;
> > +}
> > +
> > +static int kvm_gmem_error_page(struct address_space *mapping, struct page *page)
> > +{
> > +	struct list_head *gmem_list = &mapping->private_list;
> > +	struct kvm_memory_slot *slot;
> > +	struct kvm_gmem *gmem;
> > +	unsigned long index;
> > +	pgoff_t start, end;
> > +	gfn_t gfn;
> > +
> > +	filemap_invalidate_lock_shared(mapping);
> > +
> > +	start = page->index;
> > +	end = start + thp_nr_pages(page);
> > +
> > +	list_for_each_entry(gmem, gmem_list, entry) {
> > +		xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) {
> > +			for (gfn = start; gfn < end; gfn++) {
> 
> Why the start end range used as gfn here ?
> 
> the page->index is offset of inode's page cache mapping and
> gmem address space, IIUC, gfn calculation should follow same
> way as kvm_gmem_invalidate_begin().

Also instead of sending signal multiple times, we can utilize lsb argument.
Something like this?

diff --git a/virt/kvm/guest_mem.c b/virt/kvm/guest_mem.c
index a14eaac9dbad..8072ac901855 100644
--- a/virt/kvm/guest_mem.c
+++ b/virt/kvm/guest_mem.c
@@ -349,20 +349,35 @@ static int kvm_gmem_error_page(struct address_space *mapping, struct page *page)
        struct kvm_gmem *gmem;
        unsigned long index;
        pgoff_t start, end;
-       gfn_t gfn;
+       unsigned int order;
+       int nr_pages;
+       gfn_t gfn, gfn_end;
 
        filemap_invalidate_lock_shared(mapping);
 
        start = page->index;
        end = start + thp_nr_pages(page);
+       nr_pages = thp_nr_pages(page);
+       order = thp_order(page);
 
        list_for_each_entry(gmem, gmem_list, entry) {
                xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) {
-                       for (gfn = start; gfn < end; gfn++) {
-                               if (WARN_ON_ONCE(gfn < slot->base_gfn ||
-                                               gfn >= slot->base_gfn + slot->npages))
-                                       continue;
+                       gfn = slot->base_gfn + page->index - slot->gmem.pgoff;
 
+                       if (page->index + nr_pages <= slot->gmem.pgoff + slot->npages &&
+                           !(gfn & ~((1ULL << order) - 1))) {
+                               /*
+                                * FIXME: Tell userspace that the *private*
+                                * memory encountered an error.
+                                */
+                               send_sig_mceerr(BUS_MCEERR_AR,
+                                               (void __user *)gfn_to_hva_memslot(slot, gfn),
+                                               order, current);
+                               break;
+                       }
+
+                       gfn_end = min(gfn + nr_pages, slot->base_gfn + slot->npages);
+                       for (; gfn < gfn_end; gfn++) {
                                /*
                                 * FIXME: Tell userspace that the *private*
                                 * memory encountered an error.
Sean Christopherson July 21, 2023, 10:33 p.m. UTC | #13
On Fri, Jul 21, 2023, Isaku Yamahata wrote:
> On Fri, Jul 21, 2023 at 02:13:14PM +0800,
> Yuan Yao <yuan.yao@linux.intel.com> wrote:
> > > +static int kvm_gmem_error_page(struct address_space *mapping, struct page *page)
> > > +{
> > > +	struct list_head *gmem_list = &mapping->private_list;
> > > +	struct kvm_memory_slot *slot;
> > > +	struct kvm_gmem *gmem;
> > > +	unsigned long index;
> > > +	pgoff_t start, end;
> > > +	gfn_t gfn;
> > > +
> > > +	filemap_invalidate_lock_shared(mapping);
> > > +
> > > +	start = page->index;
> > > +	end = start + thp_nr_pages(page);
> > > +
> > > +	list_for_each_entry(gmem, gmem_list, entry) {
> > > +		xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) {
> > > +			for (gfn = start; gfn < end; gfn++) {
> > 
> > Why the start end range used as gfn here ?

Math is hard?  I almost always mess up these types of things, and then catch my
bugs via tests.  But I don't have tests for this particular flow...   Which
reminds me, we need tests for this :-)  Hopefully error injection provides most
of what we need?

> > the page->index is offset of inode's page cache mapping and
> > gmem address space, IIUC, gfn calculation should follow same
> > way as kvm_gmem_invalidate_begin().
> 
> Also instead of sending signal multiple times, we can utilize lsb argument.

As Vishal pointed out, this code shouldn't be sending signals in the first place.
Wang, Wei W July 25, 2023, 3:09 p.m. UTC | #14
On Wednesday, July 19, 2023 7:45 AM, Sean Christopherson wrote:
> +int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
> +		     gfn_t gfn, kvm_pfn_t *pfn, int *max_order) {
> +	pgoff_t index = gfn - slot->base_gfn + slot->gmem.pgoff;
> +	struct kvm_gmem *gmem;
> +	struct folio *folio;
> +	struct page *page;
> +	struct file *file;
> +
> +	file = kvm_gmem_get_file(slot);
> +	if (!file)
> +		return -EFAULT;
> +
> +	gmem = file->private_data;
> +
> +	if (WARN_ON_ONCE(xa_load(&gmem->bindings, index) != slot)) {
> +		fput(file);
> +		return -EIO;
> +	}
> +
> +	folio = kvm_gmem_get_folio(file_inode(file), index);
> +	if (!folio) {
> +		fput(file);
> +		return -ENOMEM;
> +	}
> +
> +	page = folio_file_page(folio, index);
> +
> +	*pfn = page_to_pfn(page);
> +	*max_order = compound_order(compound_head(page));

Maybe better to check if caller provided a buffer to get the max_order:
if (max_order)
	*max_order = compound_order(compound_head(page));

This is what the previous version did (restrictedmem_get_page),
so that callers who only want to get a pfn don't need to define
an unused "order" param.
Sean Christopherson July 25, 2023, 4:03 p.m. UTC | #15
On Tue, Jul 25, 2023, Wei W Wang wrote:
> On Wednesday, July 19, 2023 7:45 AM, Sean Christopherson wrote:
> > +int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
> > +		     gfn_t gfn, kvm_pfn_t *pfn, int *max_order) {
> > +	pgoff_t index = gfn - slot->base_gfn + slot->gmem.pgoff;
> > +	struct kvm_gmem *gmem;
> > +	struct folio *folio;
> > +	struct page *page;
> > +	struct file *file;
> > +
> > +	file = kvm_gmem_get_file(slot);
> > +	if (!file)
> > +		return -EFAULT;
> > +
> > +	gmem = file->private_data;
> > +
> > +	if (WARN_ON_ONCE(xa_load(&gmem->bindings, index) != slot)) {
> > +		fput(file);
> > +		return -EIO;
> > +	}
> > +
> > +	folio = kvm_gmem_get_folio(file_inode(file), index);
> > +	if (!folio) {
> > +		fput(file);
> > +		return -ENOMEM;
> > +	}
> > +
> > +	page = folio_file_page(folio, index);
> > +
> > +	*pfn = page_to_pfn(page);
> > +	*max_order = compound_order(compound_head(page));
> 
> Maybe better to check if caller provided a buffer to get the max_order:
> if (max_order)
> 	*max_order = compound_order(compound_head(page));
> 
> This is what the previous version did (restrictedmem_get_page),
> so that callers who only want to get a pfn don't need to define
> an unused "order" param.

My preference would be to require @max_order.  I can kinda sorta see why a generic
implementation (restrictedmem) would make the param optional, but with gmem being
KVM-internal I think it makes sense to require the param.  Even if pKVM doesn't
_currently_ need/want the order of the backing allocation, presumably that's because
hugepage support is still on the TODO list, not because pKVM fundamentally doesn't
need to know the order of the backing allocation.
Wang, Wei W July 26, 2023, 1:51 a.m. UTC | #16
On Wednesday, July 26, 2023 12:04 AM,  Sean Christopherson wrote:
> On Tue, Jul 25, 2023, Wei W Wang wrote:
> > On Wednesday, July 19, 2023 7:45 AM, Sean Christopherson wrote:
> > > +int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
> > > +		     gfn_t gfn, kvm_pfn_t *pfn, int *max_order) {
> > > +	pgoff_t index = gfn - slot->base_gfn + slot->gmem.pgoff;
> > > +	struct kvm_gmem *gmem;
> > > +	struct folio *folio;
> > > +	struct page *page;
> > > +	struct file *file;
> > > +
> > > +	file = kvm_gmem_get_file(slot);
> > > +	if (!file)
> > > +		return -EFAULT;
> > > +
> > > +	gmem = file->private_data;
> > > +
> > > +	if (WARN_ON_ONCE(xa_load(&gmem->bindings, index) != slot)) {
> > > +		fput(file);
> > > +		return -EIO;
> > > +	}
> > > +
> > > +	folio = kvm_gmem_get_folio(file_inode(file), index);
> > > +	if (!folio) {
> > > +		fput(file);
> > > +		return -ENOMEM;
> > > +	}
> > > +
> > > +	page = folio_file_page(folio, index);
> > > +
> > > +	*pfn = page_to_pfn(page);
> > > +	*max_order = compound_order(compound_head(page));
> >
> > Maybe better to check if caller provided a buffer to get the max_order:
> > if (max_order)
> > 	*max_order = compound_order(compound_head(page));
> >
> > This is what the previous version did (restrictedmem_get_page), so
> > that callers who only want to get a pfn don't need to define an unused
> > "order" param.
> 
> My preference would be to require @max_order.  I can kinda sorta see why a
> generic implementation (restrictedmem) would make the param optional, but
> with gmem being KVM-internal I think it makes sense to require the param.
> Even if pKVM doesn't _currently_ need/want the order of the backing
> allocation, presumably that's because hugepage support is still on the TODO
> list, not because pKVM fundamentally doesn't need to know the order of the
> backing allocation.

Another usage is live migration. The migration flow works with 4KB pages only,
and we only need to get the pfn from the given gfn. "order" doesn't seem to be
useful for this case.
Elliot Berman July 26, 2023, 5:18 p.m. UTC | #17
On 7/18/2023 4:44 PM, Sean Christopherson wrote:
> TODO
  <snip>
> diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
> index 6325d1d0e90f..15041aa7d9ae 100644
> --- a/include/uapi/linux/magic.h
> +++ b/include/uapi/linux/magic.h
> @@ -101,5 +101,6 @@
>   #define DMA_BUF_MAGIC		0x444d4142	/* "DMAB" */
>   #define DEVMEM_MAGIC		0x454d444d	/* "DMEM" */
>   #define SECRETMEM_MAGIC		0x5345434d	/* "SECM" */
> +#define GUEST_MEMORY_MAGIC	0x474d454d	/* "GMEM" */


Should this be:

#define GUEST_MEMORY_KVM_MAGIC

or KVM_GUEST_MEMORY_KVM_MAGIC?

BALLOON_KVM_MAGIC is KVM-specific few lines above.

---

Originally, I was planning to use the generic guest memfd infrastructure 
to support Gunyah hypervisor, however I see that's probably not going to 
be possible now that the guest memfd implementation is KVM-specific. I 
think this is good for both KVM and Gunyah as there will be some Gunyah 
specifics and some KVM specifics in each of implementation, as you 
mentioned in the previous series.

I'll go through series over next week or so and I'll try to find how 
much similar Gunyah guest mem fd implementation would be and we can see 
if it's better to pull whatever that ends up being into a common 
implementation? We could also agree to have completely divergent fd 
implementations like we do for the UAPI. Thoughts?

Thanks,
Elliot

  <snip>
Sean Christopherson July 26, 2023, 7:28 p.m. UTC | #18
On Wed, Jul 26, 2023, Elliot Berman wrote:
> 
> 
> On 7/18/2023 4:44 PM, Sean Christopherson wrote:
> > TODO
>  <snip>
> > diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
> > index 6325d1d0e90f..15041aa7d9ae 100644
> > --- a/include/uapi/linux/magic.h
> > +++ b/include/uapi/linux/magic.h
> > @@ -101,5 +101,6 @@
> >   #define DMA_BUF_MAGIC		0x444d4142	/* "DMAB" */
> >   #define DEVMEM_MAGIC		0x454d444d	/* "DMEM" */
> >   #define SECRETMEM_MAGIC		0x5345434d	/* "SECM" */
> > +#define GUEST_MEMORY_MAGIC	0x474d454d	/* "GMEM" */
> 
> 
> Should this be:
> 
> #define GUEST_MEMORY_KVM_MAGIC
> 
> or KVM_GUEST_MEMORY_KVM_MAGIC?
> 
> BALLOON_KVM_MAGIC is KVM-specific few lines above.

Ah, good point.  My preference would be either KVM_GUEST_MEMORY_MAGIC or
KVM_GUEST_MEMFD_MAGIC.  Though hopefully we don't actually need a dedicated
filesystem, I _think_ it's unnecessary if we don't try to support userspace
mounts.

> ---
> 
> Originally, I was planning to use the generic guest memfd infrastructure to
> support Gunyah hypervisor, however I see that's probably not going to be
> possible now that the guest memfd implementation is KVM-specific. I think
> this is good for both KVM and Gunyah as there will be some Gunyah specifics
> and some KVM specifics in each of implementation, as you mentioned in the
> previous series.

Yeah, that's where my headspace is at too.  Sharing the actual uAPI, and even
internal APIs to some extent, doesn't save all that much, e.g. wiring up an ioctl()
is the easy part.  Whereas I strongly suspect each hypervisor use case will want
different semantics for the uAPI.

> I'll go through series over next week or so and I'll try to find how much
> similar Gunyah guest mem fd implementation would be and we can see if it's
> better to pull whatever that ends up being into a common implementation?

That would be awesome!  

> We could also agree to have completely divergent fd implementations like we
> do for the UAPI. Thoughts?

I'd like to avoid _completely_ divergent implementations, e.g. the majority of
kvm_gmem_allocate() and __kvm_gmem_create() isn't KVM specific.  I think there
would be value in sharing the core allocation logic, even if the other details
are different.  Especially if we fully commit to not supporting migration or
swap, and decide to use xarray directly to manage folios instead of bouncing
through the filemap APIs.

Thanks!
Fuad Tabba July 27, 2023, 10:39 a.m. UTC | #19
Hi Sean,

<snip>
...

> @@ -5134,6 +5167,16 @@ static long kvm_vm_ioctl(struct file *filp,
>         case KVM_GET_STATS_FD:
>                 r = kvm_vm_ioctl_get_stats_fd(kvm);
>                 break;
> +       case KVM_CREATE_GUEST_MEMFD: {
> +               struct kvm_create_guest_memfd guest_memfd;
> +
> +               r = -EFAULT;
> +               if (copy_from_user(&guest_memfd, argp, sizeof(guest_memfd)))
> +                       goto out;
> +
> +               r = kvm_gmem_create(kvm, &guest_memfd);
> +               break;
> +       }

I'm thinking line of sight here, by having this as a vm ioctl (rather
than a system iocl), would it complicate making it possible in the
future to share/donate memory between VMs?

Cheers,
/fuad
Sean Christopherson July 27, 2023, 5:13 p.m. UTC | #20
On Thu, Jul 27, 2023, Fuad Tabba wrote:
> Hi Sean,
> 
> <snip>
> ...
> 
> > @@ -5134,6 +5167,16 @@ static long kvm_vm_ioctl(struct file *filp,
> >         case KVM_GET_STATS_FD:
> >                 r = kvm_vm_ioctl_get_stats_fd(kvm);
> >                 break;
> > +       case KVM_CREATE_GUEST_MEMFD: {
> > +               struct kvm_create_guest_memfd guest_memfd;
> > +
> > +               r = -EFAULT;
> > +               if (copy_from_user(&guest_memfd, argp, sizeof(guest_memfd)))
> > +                       goto out;
> > +
> > +               r = kvm_gmem_create(kvm, &guest_memfd);
> > +               break;
> > +       }
> 
> I'm thinking line of sight here, by having this as a vm ioctl (rather
> than a system iocl), would it complicate making it possible in the
> future to share/donate memory between VMs?

Maybe, but I hope not?

There would still be a primary owner of the memory, i.e. the memory would still
need to be allocated in the context of a specific VM.  And the primary owner should
be able to restrict privileges, e.g. allow a different VM to read but not write
memory.

My current thinking is to (a) tie the lifetime of the backing pages to the inode,
i.e. allow allocations to outlive the original VM, and (b) create a new file each
time memory is shared/donated with a different VM (or other entity in the kernel).

That should make it fairly straightforward to provide different permissions, e.g.
track them per-file, and I think should also avoid the need to change the memslot
binding logic since each VM would have it's own view/bindings.

Copy+pasting a relevant snippet from a lengthier response in a different thread[*]:

  Conceptually, I think KVM should to bind to the file.  The inode is effectively
  the raw underlying physical storage, while the file is the VM's view of that
  storage. 
  
  Practically, I think that gives us a clean, intuitive way to handle intra-host
  migration.  Rather than transfer ownership of the file, instantiate a new file
  for the target VM, using the gmem inode from the source VM, i.e. create a hard
  link.  That'd probably require new uAPI, but I don't think that will be hugely
  problematic.  KVM would need to ensure the new VM's guest_memfd can't be mapped
  until KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM (which would also need to verify the
  memslots/bindings are identical), but that should be easy enough to enforce.
  
  That way, a VM, its memslots, and its SPTEs are tied to the file, while allowing
  the memory and the *contents* of memory to outlive the VM, i.e. be effectively
  transfered to the new target VM.  And we'll maintain the invariant that each
  guest_memfd is bound 1:1 with a single VM.
  
  As above, that should also help us draw the line between mapping memory into a
  VM (file), and freeing/reclaiming the memory (inode).
  
  There will be extra complexity/overhead as we'll have to play nice with the
  possibility of multiple files per inode, e.g. to zap mappings across all files
  when punching a hole, but the extra complexity is quite small, e.g. we can use
  address_space.private_list to keep track of the guest_memfd instances associated
  with the inode.
  
  Setting aside TDX and SNP for the moment, as it's not clear how they'll support
  memory that is "private" but shared between multiple VMs, I think per-VM files
  would work well for sharing gmem between two VMs.  E.g. would allow a give page
  to be bound to a different gfn for each VM, would allow having different permissions
  for each file (e.g. to allow fallocate() only from the original owner).

[*] https://lore.kernel.org/all/ZLGiEfJZTyl7M8mS@google.com
Fuad Tabba July 31, 2023, 1:46 p.m. UTC | #21
Hi Sean,

On Thu, Jul 27, 2023 at 6:13 PM Sean Christopherson <seanjc@google.com> wrote:
>
> On Thu, Jul 27, 2023, Fuad Tabba wrote:
> > Hi Sean,
> >
> > <snip>
> > ...
> >
> > > @@ -5134,6 +5167,16 @@ static long kvm_vm_ioctl(struct file *filp,
> > >         case KVM_GET_STATS_FD:
> > >                 r = kvm_vm_ioctl_get_stats_fd(kvm);
> > >                 break;
> > > +       case KVM_CREATE_GUEST_MEMFD: {
> > > +               struct kvm_create_guest_memfd guest_memfd;
> > > +
> > > +               r = -EFAULT;
> > > +               if (copy_from_user(&guest_memfd, argp, sizeof(guest_memfd)))
> > > +                       goto out;
> > > +
> > > +               r = kvm_gmem_create(kvm, &guest_memfd);
> > > +               break;
> > > +       }
> >
> > I'm thinking line of sight here, by having this as a vm ioctl (rather
> > than a system iocl), would it complicate making it possible in the
> > future to share/donate memory between VMs?
>
> Maybe, but I hope not?
>
> There would still be a primary owner of the memory, i.e. the memory would still
> need to be allocated in the context of a specific VM.  And the primary owner should
> be able to restrict privileges, e.g. allow a different VM to read but not write
> memory.
>
> My current thinking is to (a) tie the lifetime of the backing pages to the inode,
> i.e. allow allocations to outlive the original VM, and (b) create a new file each
> time memory is shared/donated with a different VM (or other entity in the kernel).
>
> That should make it fairly straightforward to provide different permissions, e.g.
> track them per-file, and I think should also avoid the need to change the memslot
> binding logic since each VM would have it's own view/bindings.
>
> Copy+pasting a relevant snippet from a lengthier response in a different thread[*]:
>
>   Conceptually, I think KVM should to bind to the file.  The inode is effectively
>   the raw underlying physical storage, while the file is the VM's view of that
>   storage.

I'm not aware of any implementation of sharing memory between VMs in
KVM before (afaik, since there was no need for one). The following is
me thinking out loud, rather than any strong opinions on my part.

If an allocation can outlive the original VM, then why associate it
with that (or a) VM to begin with? Wouldn't it be more flexible if it
were a system-level construct, which is effectively what it was in
previous iterations of this? This doesn't rule out binding to the
file, and keeping the inode as the underlying physical storage.

The binding of a VM to a guestmem object could happen implicitly with
KVM_SET_USER_MEMORY_REGION2, or we could have a new ioctl specifically
for handling binding.

Cheers,
/fuad


>   Practically, I think that gives us a clean, intuitive way to handle intra-host
>   migration.  Rather than transfer ownership of the file, instantiate a new file
>   for the target VM, using the gmem inode from the source VM, i.e. create a hard
>   link.  That'd probably require new uAPI, but I don't think that will be hugely
>   problematic.  KVM would need to ensure the new VM's guest_memfd can't be mapped
>   until KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM (which would also need to verify the
>   memslots/bindings are identical), but that should be easy enough to enforce.
>
>   That way, a VM, its memslots, and its SPTEs are tied to the file, while allowing
>   the memory and the *contents* of memory to outlive the VM, i.e. be effectively
>   transfered to the new target VM.  And we'll maintain the invariant that each
>   guest_memfd is bound 1:1 with a single VM.
>
>   As above, that should also help us draw the line between mapping memory into a
>   VM (file), and freeing/reclaiming the memory (inode).
>
>   There will be extra complexity/overhead as we'll have to play nice with the
>   possibility of multiple files per inode, e.g. to zap mappings across all files
>   when punching a hole, but the extra complexity is quite small, e.g. we can use
>   address_space.private_list to keep track of the guest_memfd instances associated
>   with the inode.
>
>   Setting aside TDX and SNP for the moment, as it's not clear how they'll support
>   memory that is "private" but shared between multiple VMs, I think per-VM files
>   would work well for sharing gmem between two VMs.  E.g. would allow a give page
>   to be bound to a different gfn for each VM, would allow having different permissions
>   for each file (e.g. to allow fallocate() only from the original owner).
>
> [*] https://lore.kernel.org/all/ZLGiEfJZTyl7M8mS@google.com
>
Fuad Tabba July 31, 2023, 4:23 p.m. UTC | #22
Hi Sean,

On Tue, Jul 25, 2023 at 5:04 PM Sean Christopherson <seanjc@google.com> wrote:
>
> On Tue, Jul 25, 2023, Wei W Wang wrote:
> > On Wednesday, July 19, 2023 7:45 AM, Sean Christopherson wrote:
> > > +int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
> > > +                gfn_t gfn, kvm_pfn_t *pfn, int *max_order) {
> > > +   pgoff_t index = gfn - slot->base_gfn + slot->gmem.pgoff;
> > > +   struct kvm_gmem *gmem;
> > > +   struct folio *folio;
> > > +   struct page *page;
> > > +   struct file *file;
> > > +
> > > +   file = kvm_gmem_get_file(slot);
> > > +   if (!file)
> > > +           return -EFAULT;
> > > +
> > > +   gmem = file->private_data;
> > > +
> > > +   if (WARN_ON_ONCE(xa_load(&gmem->bindings, index) != slot)) {
> > > +           fput(file);
> > > +           return -EIO;
> > > +   }
> > > +
> > > +   folio = kvm_gmem_get_folio(file_inode(file), index);
> > > +   if (!folio) {
> > > +           fput(file);
> > > +           return -ENOMEM;
> > > +   }
> > > +
> > > +   page = folio_file_page(folio, index);
> > > +
> > > +   *pfn = page_to_pfn(page);
> > > +   *max_order = compound_order(compound_head(page));
> >
> > Maybe better to check if caller provided a buffer to get the max_order:
> > if (max_order)
> >       *max_order = compound_order(compound_head(page));
> >
> > This is what the previous version did (restrictedmem_get_page),
> > so that callers who only want to get a pfn don't need to define
> > an unused "order" param.
>
> My preference would be to require @max_order.  I can kinda sorta see why a generic
> implementation (restrictedmem) would make the param optional, but with gmem being
> KVM-internal I think it makes sense to require the param.  Even if pKVM doesn't
> _currently_ need/want the order of the backing allocation, presumably that's because
> hugepage support is still on the TODO list, not because pKVM fundamentally doesn't
> need to know the order of the backing allocation.

You're right that with huge pages pKVM will eventually need to know
the order of the backing allocation, but there is at least one use
case where it doesn't, which I ran into in the previous ports as well
as this one. In pKVM (and in possibly other implementations), the host
needs to access (shared) guest memory that isn't mapped. For that,
I've used kvm_*_get_pfn(), only requiring the pfn, so get the page via
pfn_to_page().

Although it's not that big, my preference would be for max_order to be optional.

Thanks!
/fuad
Ryan Afranji Aug. 3, 2023, 7:15 p.m. UTC | #23
> +static struct folio *kvm_gmem_get_folio(struct file *file, pgoff_t index)
> +{
> +	struct folio *folio;
> +
> +	/* TODO: Support huge pages. */
> +	folio = filemap_grab_folio(file->f_mapping, index);
> +	if (!folio)
> +		return NULL;

In Linux 6.4, filemap_grab_folio() may also return an error value.
Instead of just checking for NULL, "IS_ERR_OR_NULL(folio)" will be needed.
Sean Christopherson Aug. 8, 2023, 9:13 p.m. UTC | #24
On Mon, Aug 07, 2023, Ackerley Tng wrote:
> I’d like to propose an alternative to the refcounting approach between
> the gmem file and associated kvm, where we think of KVM’s memslots as
> users of the gmem file.
> 
> Instead of having the gmem file pin the VM (i.e. take a refcount on
> kvm), we could let memslot take a refcount on the gmem file when the
> memslots are configured.
> 
> Here’s a POC patch that flips the refcounting (and modified selftests in
> the next commit):
> https://github.com/googleprodkernel/linux-cc/commit/7f487b029b89b9f3e9b094a721bc0772f3c8c797
> 
> One side effect of having the gmem file pin the VM is that now the gmem
> file becomes sort of a false handle on the VM:
> 
> + Closing the file destroys the file pointers in the VM and invalidates
>   the pointers

Yeah, this is less than ideal.  But, it's also how things operate today.  KVM
doesn't hold references to VMAs or files, e.g. if userspace munmap()s memory,
any and all SPTEs pointing at the memory are zapped.  The only difference with
gmem is that KVM needs to explicitly invalidate file pointers, instead of that
happening behind the scenes (no more VMAs to find).  Again, I agree the resulting
code is more complex than I would prefer, but from a userspace perspective I
don't see this as problematic.

> + Keeping the file open keeps the VM around in the kernel even though
>   the VM fd may already be closed.

That is perfectly ok.  There is plenty of prior art, as well as plenty of ways
for userspace to shoot itself in the foot.  E.g. open a stats fd for a vCPU and
the VM and all its vCPUs will be kept alive.  And conceptually it's sound,
anything created in the scope of a VM _should_ pin the VM.

> I feel that memslots form a natural way of managing usage of the gmem
> file. When a memslot is created, it is using the file; hence we take a
> refcount on the gmem file, and as memslots are removed, we drop
> refcounts on the gmem file.

Yes and no.  It's definitely more natural *if* the goal is to allow guest_memfd
memory to exist without being attached to a VM.  But I'm not at all convinced
that we want to allow that, or that it has desirable properties.  With TDX and
SNP in particuarly, I'm pretty sure that allowing memory to outlive the VM is
very underisable (more below).

> The KVM pointer is shared among all the bindings in gmem’s xarray, and we can
> enforce that a gmem file is used only with one VM:
> 
> + When binding a memslot to the file, if a kvm pointer exists, it must
>   be the same kvm as the one in this binding
> + When the binding to the last memslot is removed from a file, NULL the
>   kvm pointer.

Nullifying the KVM pointer isn't sufficient, because without additional actions
userspace could extract data from a VM by deleting its memslots and then binding
the guest_memfd to an attacker controlled VM.  Or more likely with TDX and SNP,
induce badness by coercing KVM into mapping memory into a guest with the wrong
ASID/HKID.

I can think of three ways to handle that:

  (a) prevent a different VM from *ever* binding to the gmem instance
  (b) free/zero physical pages when unbinding
  (c) free/zero when binding to a different VM

Option (a) is easy, but that pretty much defeats the purpose of decopuling
guest_memfd from a VM.

Option (b) isn't hard to implement, but it screws up the lifecycle of the memory,
e.g. would require memory when a memslot is deleted.  That isn't necessarily a
deal-breaker, but it runs counter to how KVM memlots currently operate.  Memslots
are basically just weird page tables, e.g. deleting a memslot doesn't have any
impact on the underlying data in memory.  TDX throws a wrench in this as removing
a page from the Secure EPT is effectively destructive to the data (can't be mapped
back in to the VM without zeroing the data), but IMO that's an oddity with TDX and
not necessarily something we want to carry over to other VM types.

There would also be performance implications (probably a non-issue in practice),
and weirdness if/when we get to sharing, linking and/or mmap()ing gmem.  E.g. what
should happen if the last memslot (binding) is deleted, but there outstanding userspace
mappings?

Option (c) is better from a lifecycle perspective, but it adds its own flavor of
complexity, e.g. the performant way to reclaim TDX memory requires the TDMR
(effectively the VM pointer), and so a deferred relcaim doesn't really work for
TDX.  And I'm pretty sure it *can't* work for SNP, because RMP entries must not
outlive the VM; KVM can't reuse an ASID if there are pages assigned to that ASID
in the RMP, i.e. until all memory belonging to the VM has been fully freed.

> Could binding gmem files not on creation, but at memslot configuration
> time be sufficient and simpler?

After working through the flows, I think binding on-demand would simplify the
refcounting (stating the obvious), but complicate the lifecycle of the memory as
well as the contract between KVM and userspace, and would break the separation of
concerns between the inode (physical memory / data) and file (VM's view / mappings).
Vishal Annapurve Aug. 10, 2023, 11:57 p.m. UTC | #25
On Tue, Aug 8, 2023 at 2:13 PM Sean Christopherson <seanjc@google.com> wrote:
> ...

> > + When binding a memslot to the file, if a kvm pointer exists, it must
> >   be the same kvm as the one in this binding
> > + When the binding to the last memslot is removed from a file, NULL the
> >   kvm pointer.
>
> Nullifying the KVM pointer isn't sufficient, because without additional actions
> userspace could extract data from a VM by deleting its memslots and then binding
> the guest_memfd to an attacker controlled VM.  Or more likely with TDX and SNP,
> induce badness by coercing KVM into mapping memory into a guest with the wrong
> ASID/HKID.
>

TDX/SNP have mechanisms i.e. PAMT/RMP tables to ensure that the same
memory is not assigned to two different VMs. Deleting memslots should
also clear out the contents of the memory as the EPT tables will be
zapped in the process and the host will reclaim the memory.

Regards,
Vishal
Sean Christopherson Aug. 11, 2023, 5:44 p.m. UTC | #26
On Thu, Aug 10, 2023, Vishal Annapurve wrote:
> On Tue, Aug 8, 2023 at 2:13 PM Sean Christopherson <seanjc@google.com> wrote:
> > ...
> 
> > > + When binding a memslot to the file, if a kvm pointer exists, it must
> > >   be the same kvm as the one in this binding
> > > + When the binding to the last memslot is removed from a file, NULL the
> > >   kvm pointer.
> >
> > Nullifying the KVM pointer isn't sufficient, because without additional actions
> > userspace could extract data from a VM by deleting its memslots and then binding
> > the guest_memfd to an attacker controlled VM.  Or more likely with TDX and SNP,
> > induce badness by coercing KVM into mapping memory into a guest with the wrong
> > ASID/HKID.
> >
> 
> TDX/SNP have mechanisms i.e. PAMT/RMP tables to ensure that the same
> memory is not assigned to two different VMs.

One of the main reasons we pivoted away from using a flag in "struct page" to
indicate that a page was private was so that KVM could enforce 1:1 VM:page ownership
*without* relying on hardware.

And FWIW, the PAMT provides no protection in this specific case because KVM does
TDH.MEM.PAGE.REMOVE when zapping S-EPT entries, and that marks the page clear in
the PAMT.  The danger there is that physical memory is still encrypted with the
guest's HKID, and so mapping the memory into a different VM, which might not be
a TDX guest!, could lead to corruption and/or poison #MCs.

The HKID issues wouldn't be a problem if v15 is merged as-is, because zapping
S-EPT entries also fully purges and reclaims the page, but as we discussed in
one of the many threads, reclaiming physical memory should be tied to the inode,
i.e. to memory truly being freed, and not to S-EPTs being zapped.  And there is
a very good reason for wanting to do that, as it allows KVM to do the expensive
cache flush + clear outside of mmu_lock.

> Deleting memslots should also clear out the contents of the memory as the EPT
> tables will be zapped in the process

No, deleting a memslot should not clear memory.  As I said in my previous response,
the fact that zapping S-EPT entries is destructive is a limitiation of TDX, not a
feature we want to apply to other VM types.  And that's not even a fundamental
property of TDX, e.g. TDX could remove the limitation, at the cost of consuming
quite a bit more memory, by tracking the exact owner by HKID in the PAMT and
decoupling S-EPT entries from page ownership.

Or in theory, KVM could workaround the limitation by only doing TDH.MEM.RANGE.BLOCK
when zapping S-EPTs.  Hmm, that might actually be worth looking at.

> and the host will reclaim the memory.

There are no guarantees that the host will reclaim the memory.  E.g. QEMU will
delete and re-create memslots for "regular" VMs when emulating option ROMs.  Even
if that use case is nonsensical for confidential VMs (and it probably is nonsensical),
I don't want to define KVM's ABI based on what we *think* userspace will do.
Sean Christopherson Aug. 15, 2023, 8:03 p.m. UTC | #27
On Tue, Aug 15, 2023, Ackerley Tng wrote:
> Sean Christopherson <seanjc@google.com> writes:
> 
> >> I feel that memslots form a natural way of managing usage of the gmem
> >> file. When a memslot is created, it is using the file; hence we take a
> >> refcount on the gmem file, and as memslots are removed, we drop
> >> refcounts on the gmem file.
> >
> > Yes and no.  It's definitely more natural *if* the goal is to allow guest_memfd
> > memory to exist without being attached to a VM.  But I'm not at all convinced
> > that we want to allow that, or that it has desirable properties.  With TDX and
> > SNP in particuarly, I'm pretty sure that allowing memory to outlive the VM is
> > very underisable (more below).
> >
> 
> This is a little confusing, with the file/inode split in gmem where the
> physical memory/data is attached to the inode and the file represents
> the VM's view of that memory, won't the memory outlive the VM?

Doh, I overloaded the term "VM".  By "VM" I meant the virtual machine as a "thing"
the rest of the world sees and interacts with, not the original "struct kvm" object.

Because yes, you're absolutely correct that the memory will outlive "struct kvm",
but it won't outlive the virtual machine, and specifically won't outlive the
ASID (SNP) / HKID (TDX) to which it's bound.

> This [1] POC was built based on that premise, that the gmem inode can be
> linked to another file and handed off to another VM, to facilitate
> intra-host migration, where the point is to save the work of rebuilding
> the VM's memory in the destination VM.
> 
> With this, the bindings don't outlive the VM, but the data/memory
> does. I think this split design you proposed is really nice.
> 
> >> The KVM pointer is shared among all the bindings in gmem’s xarray, and we can
> >> enforce that a gmem file is used only with one VM:
> >>
> >> + When binding a memslot to the file, if a kvm pointer exists, it must
> >>   be the same kvm as the one in this binding
> >> + When the binding to the last memslot is removed from a file, NULL the
> >>   kvm pointer.
> >
> > Nullifying the KVM pointer isn't sufficient, because without additional actions
> > userspace could extract data from a VM by deleting its memslots and then binding
> > the guest_memfd to an attacker controlled VM.  Or more likely with TDX and SNP,
> > induce badness by coercing KVM into mapping memory into a guest with the wrong
> > ASID/HKID.
> >
> > I can think of three ways to handle that:
> >
> >   (a) prevent a different VM from *ever* binding to the gmem instance
> >   (b) free/zero physical pages when unbinding
> >   (c) free/zero when binding to a different VM
> >
> > Option (a) is easy, but that pretty much defeats the purpose of decopuling
> > guest_memfd from a VM.
> >
> > Option (b) isn't hard to implement, but it screws up the lifecycle of the memory,
> > e.g. would require memory when a memslot is deleted.  That isn't necessarily a
> > deal-breaker, but it runs counter to how KVM memlots currently operate.  Memslots
> > are basically just weird page tables, e.g. deleting a memslot doesn't have any
> > impact on the underlying data in memory.  TDX throws a wrench in this as removing
> > a page from the Secure EPT is effectively destructive to the data (can't be mapped
> > back in to the VM without zeroing the data), but IMO that's an oddity with TDX and
> > not necessarily something we want to carry over to other VM types.
> >
> > There would also be performance implications (probably a non-issue in practice),
> > and weirdness if/when we get to sharing, linking and/or mmap()ing gmem.  E.g. what
> > should happen if the last memslot (binding) is deleted, but there outstanding userspace
> > mappings?
> >
> > Option (c) is better from a lifecycle perspective, but it adds its own flavor of
> > complexity, e.g. the performant way to reclaim TDX memory requires the TDMR
> > (effectively the VM pointer), and so a deferred relcaim doesn't really work for
> > TDX.  And I'm pretty sure it *can't* work for SNP, because RMP entries must not
> > outlive the VM; KVM can't reuse an ASID if there are pages assigned to that ASID
> > in the RMP, i.e. until all memory belonging to the VM has been fully freed.
> >
> 
> If we are on the same page that the memory should outlive the VM but not
> the bindings, then associating the gmem inode to a new VM should be a
> feature and not a bug.
> 
> What do we want to defend against here?
> 
> (a) Malicious host VMM
> 
> For a malicious host VMM to read guest memory (with TDX and SNP), it can
> create a new VM with the same HKID/ASID as the victim VM, rebind the
> gmem inode to a VM crafted with an image that dumps the memory.
> 
> I believe it is not possible for userspace to arbitrarily select a
> matching HKID unless userspace uses the intra-host migration ioctls, but if the
> migration ioctl is used, then EPTs are migrated and the memory dumper VM
> can't successfully run a different image from the victim VM. If the
> dumper VM needs to run the same image as the victim VM, then it would be
> a successful migration rather than an attack. (Perhaps we need to clean
> up some #MCs here but that can be a separate patch).

From a guest security perspective, throw TDX and SNP out the window.  As far as
the design of guest_memfd is concerned, I truly do not care what security properties
they provide, I only care about whether or not KVM's support for TDX and SNP is
clean, robust, and functionally correct.

Note, I'm not saying I don't care about TDX/SNP.  What I'm saying is that I don't
want to design something that is beneficial only to what is currently a very
niche class of VMs that require specific flavors of hardware.

> (b) Malicious host kernel
> 
> A malicious host kernel can allow a malicious host VMM to re-use a HKID
> for the dumper VM, but this isn't something a better gmem design can
> defend against.

Yep, completely out-of-scope.

> (c) Attacks using gmem for software-protected VMs
> 
> Attacks using gmem for software-protected VMs are possible since there
> is no real encryption with HKID/ASID (yet?). The selftest for [1]
> actually uses this lack of encryption to test that the destination VM
> can read the source VM's memory after the migration. In the POC [1], as
> long as both destination VM knows where in the inode's memory to read,
> it can read what it wants to.
 
Encryption is not required to protect guest memory from less privileged software.
The selftests don't rely on lack of encryption, they rely on KVM incorporating
host userspace into the TCB.

Just because this RFC doesn't remove the VMM from the TCB for SW-protected VMS,
doesn't mean we _can't_ remove the VMM from the TCB.  pKVM has already shown that
such an implementation is possible.  We didn't tackle pKVM-like support in the
initial implementation because it's non-trivial, doesn't yet have a concrete use
case to fund/drive development, and would have significantly delayed support for
the use cases people do actually care about.

There are certainly benefits from memory being encrypted, but it's neither a
requirement nor a panacea, as proven by the never ending stream of speculative
execution attacks.
 
> This is a problem for software-protected VMs, but I feel that it is also a
> separate issue from gmem's design.

No, I don't want guest_memfd to be just be a vehicle for SNP/TDX VMs.  Having line
of sight to removing host userspace from the TCB is absolutely a must have for me,
and having line of sight to improving KVM's security posture for "regular" VMs is
even more of a must have.  If guest_memfd doesn't provide us a very direct path to
(eventually) achieving those goals, then IMO it's a failure.

Which leads me to:

(d) Buggy components

Today, for all intents and purposes, guest memory *must* be mapped writable in
the VMM, which means it is all too easy for a benign-but-buggy host component to
corrupt guest memory.  There are ways to mitigate potential problems, e.g. by
developing userspace to adhere to the principle of least privilege inasmuch as
possible, but such mitigations would be far less robust than what can be achieved
via guest_memfd, and practically speaking I don't see us (Google, but also KVM in
general) making progress on deprivileging userspace without forcing the issue.

> >> Could binding gmem files not on creation, but at memslot configuration
> >> time be sufficient and simpler?
> >
> > After working through the flows, I think binding on-demand would simplify the
> > refcounting (stating the obvious), but complicate the lifecycle of the memory as
> > well as the contract between KVM and userspace,
> 
> If we are on the same page that the memory should outlive the VM but not
> the bindings, does it still complicate the lifecycle of the memory and
> the userspace/KVM contract? Could it just be a different contract?

Not entirely sure I understand what you're asking.  Does this question go away
with my clarification about struct kvm vs. virtual machine?

> > and would break the separation of
> > concerns between the inode (physical memory / data) and file (VM's view / mappings).
> 
> Binding on-demand is orthogonal to the separation of concerns between
> inode and file, because it can be built regardless of whether we do the
> gmem file/inode split.
> 
> + This flip-the-refcounting POC is built with the file/inode split and
> + In [2] (the delayed binding approach to solve intra-host migration), I
>   also tried flipping the refcounting, and that without the gmem
>   file/inode split. (Refcounting in [2] is buggy because the file can't
>   take a refcount on KVM, but it would work without taking that refcount)
> 
> [1] https://lore.kernel.org/lkml/cover.1691446946.git.ackerleytng@google.com/T/
> [2] https://github.com/googleprodkernel/linux-cc/commit/dd5ac5e53f14a1ef9915c9c1e4cc1006a40b49df
Sean Christopherson Aug. 21, 2023, 7:33 p.m. UTC | #28
On Mon, Aug 21, 2023, Ackerley Tng wrote:
> Sean Christopherson <seanjc@google.com> writes:
> 
> > On Tue, Aug 15, 2023, Ackerley Tng wrote:
> >> Sean Christopherson <seanjc@google.com> writes:
> >> > Nullifying the KVM pointer isn't sufficient, because without additional actions
> >> > userspace could extract data from a VM by deleting its memslots and then binding
> >> > the guest_memfd to an attacker controlled VM.  Or more likely with TDX and SNP,
> >> > induce badness by coercing KVM into mapping memory into a guest with the wrong
> >> > ASID/HKID.
> >> >
> >> > I can think of three ways to handle that:
> >> >
> >> >   (a) prevent a different VM from *ever* binding to the gmem instance
> >> >   (b) free/zero physical pages when unbinding
> >> >   (c) free/zero when binding to a different VM
> >> >
> >> > Option (a) is easy, but that pretty much defeats the purpose of decopuling
> >> > guest_memfd from a VM.
> >> >
> >> > Option (b) isn't hard to implement, but it screws up the lifecycle of the memory,
> >> > e.g. would require memory when a memslot is deleted.  That isn't necessarily a
> >> > deal-breaker, but it runs counter to how KVM memlots currently operate.  Memslots
> >> > are basically just weird page tables, e.g. deleting a memslot doesn't have any
> >> > impact on the underlying data in memory.  TDX throws a wrench in this as removing
> >> > a page from the Secure EPT is effectively destructive to the data (can't be mapped
> >> > back in to the VM without zeroing the data), but IMO that's an oddity with TDX and
> >> > not necessarily something we want to carry over to other VM types.
> >> >
> >> > There would also be performance implications (probably a non-issue in practice),
> >> > and weirdness if/when we get to sharing, linking and/or mmap()ing gmem.  E.g. what
> >> > should happen if the last memslot (binding) is deleted, but there outstanding userspace
> >> > mappings?
> >> >
> >> > Option (c) is better from a lifecycle perspective, but it adds its own flavor of
> >> > complexity, e.g. the performant way to reclaim TDX memory requires the TDMR
> >> > (effectively the VM pointer), and so a deferred relcaim doesn't really work for
> >> > TDX.  And I'm pretty sure it *can't* work for SNP, because RMP entries must not
> >> > outlive the VM; KVM can't reuse an ASID if there are pages assigned to that ASID
> >> > in the RMP, i.e. until all memory belonging to the VM has been fully freed.

...

> I agree with you that nulling the KVM pointer is insufficient to keep
> host userspace out of the TCB. Among the three options (a) preventing a
> different VM (HKID/ASID) from binding to the gmem instance, or zeroing
> the memory either (b) on unbinding, or (c) on binding to another VM
> (HKID/ASID),
> 
> (a) sounds like adding a check issued to TDX/SNP upon binding and this
>     check would just return OK for software-protected VMs (line of sight
>     to removing host userspace from TCB).
> 
> Or, we could go further for software-protected VMs and add tracking in
> the inode to prevent the same inode from being bound to different
> "HKID/ASID"s, perhaps like this:
> 
> + On first binding, store the KVM pointer in the inode - not file (but
>   not hold a refcount)
> + On rebinding, check that the KVM matches the pointer in the inode
> + On intra-host migration, update the KVM pointer in the inode to allow
>   binding to the new struct kvm
> 
> I think you meant associating the file with a struct kvm at creation
> time as an implementation for (a), but technically since the inode is
> the representation of memory, tracking of struct kvm should be with the
> inode instead of the file.
> 
> (b) You're right that this messes up the lifecycle of the memory and
>     wouldn't work with intra-host migration.
> 
> (c) sounds like doing the clearing on a check similar to that of (a)

Sort of, though it's much nastier, because it requires the "old" KVM instance to
be alive enough to support various operations.  I.e. we'd have to make stronger
guarantees about exactly when the handoff/transition could happen.

> If we track struct kvm with the inode, then I think (a), (b) and (c) can
> be independent of the refcounting method. What do you think?

No go.  Because again, the inode (physical memory) is coupled to the virtual machine
as a thing, not to a "struct kvm".  Or more concretely, the inode is coupled to an
ASID or an HKID, and there can be multiple "struct kvm" objects associated with a
single ASID.  And at some point in the future, I suspect we'll have multiple KVM
objects per HKID too.

The current SEV use case is for the migration helper, where two KVM objects share
a single ASID (the "real" VM and the helper).  I suspect TDX will end up with
similar behavior where helper "VMs" can use the HKID of the "real" VM.  For KVM,
that means multiple struct kvm objects being associated with a single HKID.

To prevent use-after-free, KVM "just" needs to ensure the helper instances can't
outlive the real instance, i.e. can't use the HKID/ASID after the owning virtual
machine has been destroyed.

To put it differently, "struct kvm" is a KVM software construct that _usually_,
but not always, is associated 1:1 with a virtual machine.

And FWIW, stashing the pointer without holding a reference would not be a complete
solution, because it couldn't guard against KVM reusing a pointer.  E.g. if a
struct kvm was unbound and then freed, KVM could reuse the same memory for a new
struct kvm, with a different ASID/HKID, and get a false negative on the rebinding
check.
Binbin Wu Aug. 30, 2023, 3:12 p.m. UTC | #29
On 7/19/2023 7:44 AM, Sean Christopherson wrote:

[...]
> +
> +static struct folio *kvm_gmem_get_folio(struct file *file, pgoff_t index)
> +{
> +	struct folio *folio;
> +
> +	/* TODO: Support huge pages. */
> +	folio = filemap_grab_folio(file->f_mapping, index);
> +	if (!folio)
Should use  if ((IS_ERR(folio)) instead.

> +		return NULL;
> +
> +	/*
> +	 * Use the up-to-date flag to track whether or not the memory has been
> +	 * zeroed before being handed off to the guest.  There is no backing
> +	 * storage for the memory, so the folio will remain up-to-date until
> +	 * it's removed.
> +	 *
> +	 * TODO: Skip clearing pages when trusted firmware will do it when
> +	 * assigning memory to the guest.
> +	 */
> +	if (!folio_test_uptodate(folio)) {
> +		unsigned long nr_pages = folio_nr_pages(folio);
> +		unsigned long i;
> +
> +		for (i = 0; i < nr_pages; i++)
> +			clear_highpage(folio_page(folio, i));
> +
> +		folio_mark_uptodate(folio);
> +	}
> +
> +	/*
> +	 * Ignore accessed, referenced, and dirty flags.  The memory is
> +	 * unevictable and there is no storage to write back to.
> +	 */
> +	return folio;
> +}
[...]
> +
> +static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
> +{
> +	struct address_space *mapping = inode->i_mapping;
> +	pgoff_t start, index, end;
> +	int r;
> +
> +	/* Dedicated guest is immutable by default. */
> +	if (offset + len > i_size_read(inode))
> +		return -EINVAL;
> +
> +	filemap_invalidate_lock_shared(mapping);
> +
> +	start = offset >> PAGE_SHIFT;
> +	end = (offset + len) >> PAGE_SHIFT;
> +
> +	r = 0;
> +	for (index = start; index < end; ) {
> +		struct folio *folio;
> +
> +		if (signal_pending(current)) {
> +			r = -EINTR;
> +			break;
> +		}
> +
> +		folio = kvm_gmem_get_folio(inode, index);
> +		if (!folio) {
> +			r = -ENOMEM;
> +			break;
> +		}
> +
> +		index = folio_next_index(folio);
> +
> +		folio_unlock(folio);
> +		folio_put(folio);
May be a dumb question, why we get the folio and then put it immediately?
Will it make the folio be released back to the page allocator?

> +
> +		/* 64-bit only, wrapping the index should be impossible. */
> +		if (WARN_ON_ONCE(!index))
> +			break;
> +
> +		cond_resched();
> +	}
> +
> +	filemap_invalidate_unlock_shared(mapping);
> +
> +	return r;
> +}
> +
[...]
> +
> +int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
> +		  unsigned int fd, loff_t offset)
> +{
> +	loff_t size = slot->npages << PAGE_SHIFT;
> +	unsigned long start, end, flags;
> +	struct kvm_gmem *gmem;
> +	struct inode *inode;
> +	struct file *file;
> +
> +	BUILD_BUG_ON(sizeof(gfn_t) != sizeof(slot->gmem.pgoff));
> +
> +	file = fget(fd);
> +	if (!file)
> +		return -EINVAL;
> +
> +	if (file->f_op != &kvm_gmem_fops)
> +		goto err;
> +
> +	gmem = file->private_data;
> +	if (gmem->kvm != kvm)
> +		goto err;
> +
> +	inode = file_inode(file);
> +	flags = (unsigned long)inode->i_private;
> +
> +	/*
> +	 * For simplicity, require the offset into the file and the size of the
> +	 * memslot to be aligned to the largest possible page size used to back
> +	 * the file (same as the size of the file itself).
> +	 */
> +	if (!kvm_gmem_is_valid_size(offset, flags) ||
> +	    !kvm_gmem_is_valid_size(size, flags))
> +		goto err;
> +
> +	if (offset + size > i_size_read(inode))
> +		goto err;
> +
> +	filemap_invalidate_lock(inode->i_mapping);
> +
> +	start = offset >> PAGE_SHIFT;
> +	end = start + slot->npages;
> +
> +	if (!xa_empty(&gmem->bindings) &&
> +	    xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) {
> +		filemap_invalidate_unlock(inode->i_mapping);
> +		goto err;
> +	}
> +
> +	/*
> +	 * No synchronize_rcu() needed, any in-flight readers are guaranteed to
> +	 * be see either a NULL file or this new file, no need for them to go
> +	 * away.
> +	 */
> +	rcu_assign_pointer(slot->gmem.file, file);
> +	slot->gmem.pgoff = start;
> +
> +	xa_store_range(&gmem->bindings, start, end - 1, slot, GFP_KERNEL);
> +	filemap_invalidate_unlock(inode->i_mapping);
> +
> +	/*
> +	 * Drop the reference to the file, even on success.  The file pins KVM,
> +	 * not the other way 'round.  Active bindings are invalidated if the
an extra ',  or maybe around?


> +	 * file is closed before memslots are destroyed.
> +	 */
> +	fput(file);
> +	return 0;
> +
> +err:
> +	fput(file);
> +	return -EINVAL;
> +}
> +
[...]
> []
Sean Christopherson Sept. 14, 2023, 6:15 p.m. UTC | #30
On Mon, Aug 28, 2023, Ackerley Tng wrote:
> Sean Christopherson <seanjc@google.com> writes:
> >> If we track struct kvm with the inode, then I think (a), (b) and (c) can
> >> be independent of the refcounting method. What do you think?
> >
> > No go.  Because again, the inode (physical memory) is coupled to the virtual machine
> > as a thing, not to a "struct kvm".  Or more concretely, the inode is coupled to an
> > ASID or an HKID, and there can be multiple "struct kvm" objects associated with a
> > single ASID.  And at some point in the future, I suspect we'll have multiple KVM
> > objects per HKID too.
> >
> > The current SEV use case is for the migration helper, where two KVM objects share
> > a single ASID (the "real" VM and the helper).  I suspect TDX will end up with
> > similar behavior where helper "VMs" can use the HKID of the "real" VM.  For KVM,
> > that means multiple struct kvm objects being associated with a single HKID.
> >
> > To prevent use-after-free, KVM "just" needs to ensure the helper instances can't
> > outlive the real instance, i.e. can't use the HKID/ASID after the owning virtual
> > machine has been destroyed.
> >
> > To put it differently, "struct kvm" is a KVM software construct that _usually_,
> > but not always, is associated 1:1 with a virtual machine.
> >
> > And FWIW, stashing the pointer without holding a reference would not be a complete
> > solution, because it couldn't guard against KVM reusing a pointer.  E.g. if a
> > struct kvm was unbound and then freed, KVM could reuse the same memory for a new
> > struct kvm, with a different ASID/HKID, and get a false negative on the rebinding
> > check.
> 
> I agree that inode (physical memory) is coupled to the virtual machine
> as a more generic concept.
> 
> I was hoping that in the absence of CC hardware providing a HKID/ASID,
> the struct kvm pointer could act as a representation of the "virtual
> machine". You're definitely right that KVM could reuse a pointer and so
> that idea doesn't stand.
> 
> I thought about generating UUIDs to represent "virtual machines" in the
> absence of CC hardware, and this UUID could be transferred during
> intra-host migration, but this still doesn't take host userspace out of
> the TCB. A malicious host VMM could just use the migration ioctl to copy
> the UUID to a malicious dumper VM, which would then pass checks with a
> gmem file linked to the malicious dumper VM. This is fine for HKID/ASIDs
> because the memory is encrypted; with UUIDs there's no memory
> encryption.

I don't understand what problem you're trying to solve.  I don't see a need to
provide a single concrete representation/definition of a "virtual machine".  E.g.
there's no need for a formal definition to securely perform intrahost migration,
KVM just needs to ensure that the migration doesn't compromise guest security,
functionality, etc.

That gets a lot more complex if the target KVM instance (module, not "struct kvm")
is a different KVM, e.g. when migrating to a different host.  Then there needs to
be a way to attest that the target is trusted and whatnot, but that still doesn't
require there to be a formal definition of a "virtual machine".

> Circling back to the original topic, was associating the file with
> struct kvm at gmem file creation time meant to constrain the use of the
> gmem file to one struct kvm, or one virtual machine, or something else?

It's meant to keep things as simple as possible (relatively speaking).  A 1:1
association between a KVM instance and a gmem instance means we don't have to
worry about the edge cases and oddities I pointed out earlier in this thread.

> Follow up questions:
> 
> 1. Since the physical memory's representation is the inode and should be
>    coupled to the virtual machine (as a concept, not struct kvm), should
>    the binding/coupling be with the file, or the inode?

Both.  The @kvm instance is bound to a file, because the file is that @kvm's view
of the underlying memory, e.g. effectively provides the translation of guest
addresses to host memory.  The @kvm instance is indirectly bound to the inode
because the file is bound to the inode.

> 2. Should struct kvm still be bound to the file/inode at gmem file
>    creation time, since

Yes.

>    + struct kvm isn't a good representation of a "virtual machine"

I don't see how this is relevant, because as above, I don't see why we need a
canonical represenation of a virtual machine.

>    + we currently don't have anything that really represents a "virtual
>      machine" without hardware support

HKIDs and ASIDs don't provide a "virtual machine" representation either.  E.g. if
a TDX guest is live migrated to a different host, it will likely have a different
HKID, and definitely have a different encryption key, but it's still the same
virtual machine.

> I'd also like to bring up another userspace use case that Google has:
> re-use of gmem files for rebooting guests when the KVM instance is
> destroyed and rebuilt.
>
> When rebooting a VM there are some steps relating to gmem that are
> performance-sensitive:

If we (Google) really cared about performance, then we shouldn't destroy and recreate
the VM in the first place.  E.g. the cost of zapping, freeing, re-allocating and
re-populating SPTEs is far from trivial.  Pulling RESET shouldn't change what
memory that is assigned to a VM, and reseting stats is downright bizarre IMO.

In other words, I think Google's approach of destroying the VM to emulate a reboot
is asinine.  I'm not totally against extending KVM's uAPI to play nice with such
an approach, but I'm not exactly sympathetic either.

> a.      Zeroing pages from the old VM when we close a gmem file/inode
> b. Deallocating pages from the old VM when we close a gmem file/inode
> c.   Allocating pages for the new VM from the new gmem file/inode
> d.      Zeroing pages on page allocation
> 
> We want to reuse the gmem file to save re-allocating pages (b. and c.),
> and one of the two page zeroing allocations (a. or d.).
> 
> Binding the gmem file to a struct kvm on creation time means the gmem
> file can't be reused with another VM on reboot.

Not without KVM's assistance, which userspace will need for TDX and SNP VMs no
matter what, e.g. to ensure the new and old KVM instance get the same HKID/ASID.
And we've already mapped out the more complex case of intrahost migration, so I
don't expect this to be at all challenging to implement.

> Also, host userspace is forced to close the gmem file to allow the old VM to
> be freed.

Yes, but that can happen after the "new" VM has instantiated its file/view of
guest memory.

> For other places where files pin KVM, like the stats fd pinning vCPUs, I
> guess that matters less since there isn't much of a penalty to close and
> re-open the stats fd.
Ackerley Tng Sept. 14, 2023, 11:19 p.m. UTC | #31
Sean Christopherson <seanjc@google.com> writes:

> On Mon, Aug 28, 2023, Ackerley Tng wrote:
>> Sean Christopherson <seanjc@google.com> writes:
>> >> If we track struct kvm with the inode, then I think (a), (b) and (c) can
>> >> be independent of the refcounting method. What do you think?
>> >
>> > No go.  Because again, the inode (physical memory) is coupled to the virtual machine
>> > as a thing, not to a "struct kvm".  Or more concretely, the inode is coupled to an
>> > ASID or an HKID, and there can be multiple "struct kvm" objects associated with a
>> > single ASID.  And at some point in the future, I suspect we'll have multiple KVM
>> > objects per HKID too.
>> >
>> > The current SEV use case is for the migration helper, where two KVM objects share
>> > a single ASID (the "real" VM and the helper).  I suspect TDX will end up with
>> > similar behavior where helper "VMs" can use the HKID of the "real" VM.  For KVM,
>> > that means multiple struct kvm objects being associated with a single HKID.
>> >
>> > To prevent use-after-free, KVM "just" needs to ensure the helper instances can't
>> > outlive the real instance, i.e. can't use the HKID/ASID after the owning virtual
>> > machine has been destroyed.
>> >
>> > To put it differently, "struct kvm" is a KVM software construct that _usually_,
>> > but not always, is associated 1:1 with a virtual machine.
>> >
>> > And FWIW, stashing the pointer without holding a reference would not be a complete
>> > solution, because it couldn't guard against KVM reusing a pointer.  E.g. if a
>> > struct kvm was unbound and then freed, KVM could reuse the same memory for a new
>> > struct kvm, with a different ASID/HKID, and get a false negative on the rebinding
>> > check.
>> 
>> I agree that inode (physical memory) is coupled to the virtual machine
>> as a more generic concept.
>> 
>> I was hoping that in the absence of CC hardware providing a HKID/ASID,
>> the struct kvm pointer could act as a representation of the "virtual
>> machine". You're definitely right that KVM could reuse a pointer and so
>> that idea doesn't stand.
>> 
>> I thought about generating UUIDs to represent "virtual machines" in the
>> absence of CC hardware, and this UUID could be transferred during
>> intra-host migration, but this still doesn't take host userspace out of
>> the TCB. A malicious host VMM could just use the migration ioctl to copy
>> the UUID to a malicious dumper VM, which would then pass checks with a
>> gmem file linked to the malicious dumper VM. This is fine for HKID/ASIDs
>> because the memory is encrypted; with UUIDs there's no memory
>> encryption.
>
> I don't understand what problem you're trying to solve.  I don't see a need to
> provide a single concrete representation/definition of a "virtual machine".  E.g.
> there's no need for a formal definition to securely perform intrahost migration,
> KVM just needs to ensure that the migration doesn't compromise guest security,
> functionality, etc.
>
> That gets a lot more complex if the target KVM instance (module, not "struct kvm")
> is a different KVM, e.g. when migrating to a different host.  Then there needs to
> be a way to attest that the target is trusted and whatnot, but that still doesn't
> require there to be a formal definition of a "virtual machine".
>
>> Circling back to the original topic, was associating the file with
>> struct kvm at gmem file creation time meant to constrain the use of the
>> gmem file to one struct kvm, or one virtual machine, or something else?
>
> It's meant to keep things as simple as possible (relatively speaking).  A 1:1
> association between a KVM instance and a gmem instance means we don't have to
> worry about the edge cases and oddities I pointed out earlier in this thread.
>

I looked through this thread again and re-read the edge cases and
oddities that was pointed out earlier (last paragraph at [1]) and I
think I understand better, and I have just one last clarification.

It was previously mentioned that binding on creation time simplifies the
lifecycle of memory:

"(a) prevent a different VM from *ever* binding to the gmem instance" [1]

Does this actually mean

"prevent a different struct kvm from *ever* binding to this gmem file"

?

If so, then binding on creation

+ Makes the gmem *file* (and just not the bindings xarray) the binding
  between struct kvm and the file.
+ Simplifies the KVM-userspace contract to "this gmem file can only be
  used with this struct kvm"

Binding on creation doesn't offer any way to block the contents of the
inode from being used with another "virtual machine" though, since we
can have more than one gmem file pointing to the same inode, and the
other gmem file is associated with another struct kvm. (And a strut kvm
isn't associated 1:1 with a virtual machine [2])

The point about an inode needing to be coupled to a virtual machine as a
thing [2] led me to try to find a single concrete representation of a
"virtual machine".

Is locking inode contents to a "virtual machine" outside the scope of
gmem? If so, then it is fine to bind on creation time, use a VM ioctl
over a system ioctl, and the method of refcounting in gmem v12 is okay.

[1] https://lore.kernel.org/lkml/ZNKv9ul2I7A4V7IF@google.com/
[2] https://lore.kernel.org/lkml/ZOO782YGRY0YMuPu@google.com/

> <snip>
Sean Christopherson Sept. 15, 2023, 12:33 a.m. UTC | #32
On Thu, Sep 14, 2023, Ackerley Tng wrote:
> Sean Christopherson <seanjc@google.com> writes:
> 
> > On Mon, Aug 28, 2023, Ackerley Tng wrote:
> >> Sean Christopherson <seanjc@google.com> writes:
> >> >> If we track struct kvm with the inode, then I think (a), (b) and (c) can
> >> >> be independent of the refcounting method. What do you think?
> >> >
> >> > No go.  Because again, the inode (physical memory) is coupled to the virtual machine
> >> > as a thing, not to a "struct kvm".  Or more concretely, the inode is coupled to an
> >> > ASID or an HKID, and there can be multiple "struct kvm" objects associated with a
> >> > single ASID.  And at some point in the future, I suspect we'll have multiple KVM
> >> > objects per HKID too.
> >> >
> >> > The current SEV use case is for the migration helper, where two KVM objects share
> >> > a single ASID (the "real" VM and the helper).  I suspect TDX will end up with
> >> > similar behavior where helper "VMs" can use the HKID of the "real" VM.  For KVM,
> >> > that means multiple struct kvm objects being associated with a single HKID.
> >> >
> >> > To prevent use-after-free, KVM "just" needs to ensure the helper instances can't
> >> > outlive the real instance, i.e. can't use the HKID/ASID after the owning virtual
> >> > machine has been destroyed.
> >> >
> >> > To put it differently, "struct kvm" is a KVM software construct that _usually_,
> >> > but not always, is associated 1:1 with a virtual machine.
> >> >
> >> > And FWIW, stashing the pointer without holding a reference would not be a complete
> >> > solution, because it couldn't guard against KVM reusing a pointer.  E.g. if a
> >> > struct kvm was unbound and then freed, KVM could reuse the same memory for a new
> >> > struct kvm, with a different ASID/HKID, and get a false negative on the rebinding
> >> > check.
> >> 
> >> I agree that inode (physical memory) is coupled to the virtual machine
> >> as a more generic concept.
> >> 
> >> I was hoping that in the absence of CC hardware providing a HKID/ASID,
> >> the struct kvm pointer could act as a representation of the "virtual
> >> machine". You're definitely right that KVM could reuse a pointer and so
> >> that idea doesn't stand.
> >> 
> >> I thought about generating UUIDs to represent "virtual machines" in the
> >> absence of CC hardware, and this UUID could be transferred during
> >> intra-host migration, but this still doesn't take host userspace out of
> >> the TCB. A malicious host VMM could just use the migration ioctl to copy
> >> the UUID to a malicious dumper VM, which would then pass checks with a
> >> gmem file linked to the malicious dumper VM. This is fine for HKID/ASIDs
> >> because the memory is encrypted; with UUIDs there's no memory
> >> encryption.
> >
> > I don't understand what problem you're trying to solve.  I don't see a need to
> > provide a single concrete representation/definition of a "virtual machine".  E.g.
> > there's no need for a formal definition to securely perform intrahost migration,
> > KVM just needs to ensure that the migration doesn't compromise guest security,
> > functionality, etc.
> >
> > That gets a lot more complex if the target KVM instance (module, not "struct kvm")
> > is a different KVM, e.g. when migrating to a different host.  Then there needs to
> > be a way to attest that the target is trusted and whatnot, but that still doesn't
> > require there to be a formal definition of a "virtual machine".
> >
> >> Circling back to the original topic, was associating the file with
> >> struct kvm at gmem file creation time meant to constrain the use of the
> >> gmem file to one struct kvm, or one virtual machine, or something else?
> >
> > It's meant to keep things as simple as possible (relatively speaking).  A 1:1
> > association between a KVM instance and a gmem instance means we don't have to
> > worry about the edge cases and oddities I pointed out earlier in this thread.
> >
> 
> I looked through this thread again and re-read the edge cases and
> oddities that was pointed out earlier (last paragraph at [1]) and I
> think I understand better, and I have just one last clarification.
> 
> It was previously mentioned that binding on creation time simplifies the
> lifecycle of memory:
> 
> "(a) prevent a different VM from *ever* binding to the gmem instance" [1]
> 
> Does this actually mean
> 
> "prevent a different struct kvm from *ever* binding to this gmem file"
> 
> ?

Yes.

> If so, then binding on creation
> 
> + Makes the gmem *file* (and just not the bindings xarray) the binding
>   between struct kvm and the file.

Yep.

> + Simplifies the KVM-userspace contract to "this gmem file can only be
>   used with this struct kvm"

Yep.

> Binding on creation doesn't offer any way to block the contents of the
> inode from being used with another "virtual machine" though, since we
> can have more than one gmem file pointing to the same inode, and the
> other gmem file is associated with another struct kvm. (And a strut kvm
> isn't associated 1:1 with a virtual machine [2])

Yep.

> The point about an inode needing to be coupled to a virtual machine as a
> thing [2] led me to try to find a single concrete representation of a
> "virtual machine".
> 
> Is locking inode contents to a "virtual machine" outside the scope of
> gmem?

Yes, because it's not gmem's responsibility to define "secure" (from a guest
perspective) or "safe" (from a platform stability and correctness perspective).

E.g. inserting additional vCPUs into the VM a la the SEV migration helper thing
is comically insecure without some way to attest the helper code.  Building policy
into the host kernel/KVM to do that attestation or otherwise determine what code
is/isn't safe for the guest to run is firmly out-of-scope.  KVM can certainly
provide the tools and help with enforcement, but the policy needs to be defined
elsewhere.  Even for something like pKVM, where KVM is in the TCB, KVM still doesn't
define who/what to trust (though KVM is heavily involved in enforcing security
stuff).

And for platform safety, e.g. not allowing two VMs to use the same HKID (ignoring
helpers for the moment), that's a KVM problem but NOT a gmem problem.  The point
I raised in link[2] about a gmem inode and thus the HKID/ASID associated with the
inode being bound to the "virtual machine" still holds true, but (a) it's not a
1:1 correlation, e.g. a VM could utilize multiple gmem inodes (all with the same
HKID/ASID), and (b) the safety and functional correctness aspects aren't unique
to gmem, e.g. even when when gmem isn't in the picture, KVM needs to make sure it
manages ASIDs correctly.  The only difference with SNP in the picture is that if
KVM screws up ASID management, bad things happen to the host, not (just) the guest.

>  If so, then it is fine to bind on creation time, use a VM ioctl
> over a system ioctl, and the method of refcounting in gmem v12 is okay.
> 
> [1] https://lore.kernel.org/lkml/ZNKv9ul2I7A4V7IF@google.com/
> [2] https://lore.kernel.org/lkml/ZOO782YGRY0YMuPu@google.com/
> 
> > <snip>
diff mbox series

Patch

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 97db63da6227..0d1e2ee8ae7a 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -592,8 +592,20 @@  struct kvm_memory_slot {
 	u32 flags;
 	short id;
 	u16 as_id;
+
+#ifdef CONFIG_KVM_PRIVATE_MEM
+	struct {
+		struct file __rcu *file;
+		pgoff_t pgoff;
+	} gmem;
+#endif
 };
 
+static inline bool kvm_slot_can_be_private(const struct kvm_memory_slot *slot)
+{
+	return slot && (slot->flags & KVM_MEM_PRIVATE);
+}
+
 static inline bool kvm_slot_dirty_track_enabled(const struct kvm_memory_slot *slot)
 {
 	return slot->flags & KVM_MEM_LOG_DIRTY_PAGES;
@@ -688,6 +700,17 @@  static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu)
 }
 #endif
 
+/*
+ * Arch code must define kvm_arch_has_private_mem if support for private memory
+ * is enabled.
+ */
+#if !defined(kvm_arch_has_private_mem) && !IS_ENABLED(CONFIG_KVM_PRIVATE_MEM)
+static inline bool kvm_arch_has_private_mem(struct kvm *kvm)
+{
+	return false;
+}
+#endif
+
 struct kvm_memslots {
 	u64 generation;
 	atomic_long_t last_used_slot;
@@ -1380,6 +1403,7 @@  void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
 void kvm_mmu_invalidate_begin(struct kvm *kvm);
 void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end);
 void kvm_mmu_invalidate_end(struct kvm *kvm);
+bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
 
 long kvm_arch_dev_ioctl(struct file *filp,
 			unsigned int ioctl, unsigned long arg);
@@ -2313,6 +2337,30 @@  static inline unsigned long kvm_get_memory_attributes(struct kvm *kvm, gfn_t gfn
 
 bool kvm_arch_post_set_memory_attributes(struct kvm *kvm,
 					 struct kvm_gfn_range *range);
+
+static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
+{
+	return IS_ENABLED(CONFIG_KVM_PRIVATE_MEM) &&
+	       kvm_get_memory_attributes(kvm, gfn) & KVM_MEMORY_ATTRIBUTE_PRIVATE;
+}
+#else
+static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
+{
+	return false;
+}
 #endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
 
+#ifdef CONFIG_KVM_PRIVATE_MEM
+int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
+			      gfn_t gfn, kvm_pfn_t *pfn, int *max_order);
+#else
+static inline int kvm_gmem_get_pfn(struct kvm *kvm,
+				   struct kvm_memory_slot *slot, gfn_t gfn,
+				   kvm_pfn_t *pfn, int *max_order)
+{
+	KVM_BUG_ON(1, kvm);
+	return -EIO;
+}
+#endif /* CONFIG_KVM_PRIVATE_MEM */
+
 #endif
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index f065c57db327..9b344fc98598 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -102,7 +102,10 @@  struct kvm_userspace_memory_region2 {
 	__u64 guest_phys_addr;
 	__u64 memory_size;
 	__u64 userspace_addr;
-	__u64 pad[16];
+	__u64 gmem_offset;
+	__u32 gmem_fd;
+	__u32 pad1;
+	__u64 pad2[14];
 };
 
 /*
@@ -112,6 +115,7 @@  struct kvm_userspace_memory_region2 {
  */
 #define KVM_MEM_LOG_DIRTY_PAGES	(1UL << 0)
 #define KVM_MEM_READONLY	(1UL << 1)
+#define KVM_MEM_PRIVATE		(1UL << 2)
 
 /* for KVM_IRQ_LINE */
 struct kvm_irq_level {
@@ -2284,4 +2288,12 @@  struct kvm_memory_attributes {
 
 #define KVM_MEMORY_ATTRIBUTE_PRIVATE           (1ULL << 3)
 
+#define KVM_CREATE_GUEST_MEMFD	_IOWR(KVMIO,  0xd4, struct kvm_create_guest_memfd)
+
+struct kvm_create_guest_memfd {
+	__u64 size;
+	__u64 flags;
+	__u64 reserved[6];
+};
+
 #endif /* __LINUX_KVM_H */
diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
index 6325d1d0e90f..15041aa7d9ae 100644
--- a/include/uapi/linux/magic.h
+++ b/include/uapi/linux/magic.h
@@ -101,5 +101,6 @@ 
 #define DMA_BUF_MAGIC		0x444d4142	/* "DMAB" */
 #define DEVMEM_MAGIC		0x454d444d	/* "DMEM" */
 #define SECRETMEM_MAGIC		0x5345434d	/* "SECM" */
+#define GUEST_MEMORY_MAGIC	0x474d454d	/* "GMEM" */
 
 #endif /* __LINUX_MAGIC_H__ */
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 8375bc49f97d..3ee3205e0b39 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -103,3 +103,7 @@  config KVM_GENERIC_MMU_NOTIFIER
 config KVM_GENERIC_MEMORY_ATTRIBUTES
        select KVM_GENERIC_MMU_NOTIFIER
        bool
+
+config KVM_PRIVATE_MEM
+       select XARRAY_MULTI
+       bool
diff --git a/virt/kvm/Makefile.kvm b/virt/kvm/Makefile.kvm
index 2c27d5d0c367..a5a61bbe7f4c 100644
--- a/virt/kvm/Makefile.kvm
+++ b/virt/kvm/Makefile.kvm
@@ -12,3 +12,4 @@  kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o
 kvm-$(CONFIG_HAVE_KVM_IRQ_ROUTING) += $(KVM)/irqchip.o
 kvm-$(CONFIG_HAVE_KVM_DIRTY_RING) += $(KVM)/dirty_ring.o
 kvm-$(CONFIG_HAVE_KVM_PFNCACHE) += $(KVM)/pfncache.o
+kvm-$(CONFIG_KVM_PRIVATE_MEM) += $(KVM)/guest_mem.o
diff --git a/virt/kvm/guest_mem.c b/virt/kvm/guest_mem.c
new file mode 100644
index 000000000000..1b705fd63fa8
--- /dev/null
+++ b/virt/kvm/guest_mem.c
@@ -0,0 +1,591 @@ 
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/backing-dev.h>
+#include <linux/falloc.h>
+#include <linux/kvm_host.h>
+#include <linux/pagemap.h>
+#include <linux/pseudo_fs.h>
+
+#include <uapi/linux/magic.h>
+
+#include "kvm_mm.h"
+
+static struct vfsmount *kvm_gmem_mnt;
+
+struct kvm_gmem {
+	struct kvm *kvm;
+	struct xarray bindings;
+	struct list_head entry;
+};
+
+static struct folio *kvm_gmem_get_folio(struct file *file, pgoff_t index)
+{
+	struct folio *folio;
+
+	/* TODO: Support huge pages. */
+	folio = filemap_grab_folio(file->f_mapping, index);
+	if (!folio)
+		return NULL;
+
+	/*
+	 * Use the up-to-date flag to track whether or not the memory has been
+	 * zeroed before being handed off to the guest.  There is no backing
+	 * storage for the memory, so the folio will remain up-to-date until
+	 * it's removed.
+	 *
+	 * TODO: Skip clearing pages when trusted firmware will do it when
+	 * assigning memory to the guest.
+	 */
+	if (!folio_test_uptodate(folio)) {
+		unsigned long nr_pages = folio_nr_pages(folio);
+		unsigned long i;
+
+		for (i = 0; i < nr_pages; i++)
+			clear_highpage(folio_page(folio, i));
+
+		folio_mark_uptodate(folio);
+	}
+
+	/*
+	 * Ignore accessed, referenced, and dirty flags.  The memory is
+	 * unevictable and there is no storage to write back to.
+	 */
+	return folio;
+}
+
+static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
+				      pgoff_t end)
+{
+	struct kvm_memory_slot *slot;
+	struct kvm *kvm = gmem->kvm;
+	unsigned long index;
+	bool flush = false;
+
+	KVM_MMU_LOCK(kvm);
+
+	kvm_mmu_invalidate_begin(kvm);
+
+	xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) {
+		pgoff_t pgoff = slot->gmem.pgoff;
+
+		struct kvm_gfn_range gfn_range = {
+			.start = slot->base_gfn + max(pgoff, start) - pgoff,
+			.end = slot->base_gfn + min(pgoff + slot->npages, end) - pgoff,
+			.slot = slot,
+			.may_block = true,
+		};
+
+		flush |= kvm_mmu_unmap_gfn_range(kvm, &gfn_range);
+	}
+
+	if (flush)
+		kvm_flush_remote_tlbs(kvm);
+
+	KVM_MMU_UNLOCK(kvm);
+}
+
+static void kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start,
+				    pgoff_t end)
+{
+	struct kvm *kvm = gmem->kvm;
+
+	KVM_MMU_LOCK(kvm);
+	if (xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT))
+		kvm_mmu_invalidate_end(kvm);
+	KVM_MMU_UNLOCK(kvm);
+}
+
+static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
+{
+	struct list_head *gmem_list = &inode->i_mapping->private_list;
+	pgoff_t start = offset >> PAGE_SHIFT;
+	pgoff_t end = (offset + len) >> PAGE_SHIFT;
+	struct kvm_gmem *gmem;
+
+	/*
+	 * Bindings must stable across invalidation to ensure the start+end
+	 * are balanced.
+	 */
+	filemap_invalidate_lock(inode->i_mapping);
+
+	list_for_each_entry(gmem, gmem_list, entry)
+		kvm_gmem_invalidate_begin(gmem, start, end);
+
+	truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1);
+
+	list_for_each_entry(gmem, gmem_list, entry)
+		kvm_gmem_invalidate_end(gmem, start, end);
+
+	filemap_invalidate_unlock(inode->i_mapping);
+
+	return 0;
+}
+
+static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
+{
+	struct address_space *mapping = inode->i_mapping;
+	pgoff_t start, index, end;
+	int r;
+
+	/* Dedicated guest is immutable by default. */
+	if (offset + len > i_size_read(inode))
+		return -EINVAL;
+
+	filemap_invalidate_lock_shared(mapping);
+
+	start = offset >> PAGE_SHIFT;
+	end = (offset + len) >> PAGE_SHIFT;
+
+	r = 0;
+	for (index = start; index < end; ) {
+		struct folio *folio;
+
+		if (signal_pending(current)) {
+			r = -EINTR;
+			break;
+		}
+
+		folio = kvm_gmem_get_folio(inode, index);
+		if (!folio) {
+			r = -ENOMEM;
+			break;
+		}
+
+		index = folio_next_index(folio);
+
+		folio_unlock(folio);
+		folio_put(folio);
+
+		/* 64-bit only, wrapping the index should be impossible. */
+		if (WARN_ON_ONCE(!index))
+			break;
+
+		cond_resched();
+	}
+
+	filemap_invalidate_unlock_shared(mapping);
+
+	return r;
+}
+
+static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
+			       loff_t len)
+{
+	int ret;
+
+	if (!(mode & FALLOC_FL_KEEP_SIZE))
+		return -EOPNOTSUPP;
+
+	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+		return -EOPNOTSUPP;
+
+	if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
+		return -EINVAL;
+
+	if (mode & FALLOC_FL_PUNCH_HOLE)
+		ret = kvm_gmem_punch_hole(file_inode(file), offset, len);
+	else
+		ret = kvm_gmem_allocate(file_inode(file), offset, len);
+
+	if (!ret)
+		file_modified(file);
+	return ret;
+}
+
+static int kvm_gmem_release(struct inode *inode, struct file *file)
+{
+	struct kvm_gmem *gmem = file->private_data;
+	struct kvm_memory_slot *slot;
+	struct kvm *kvm = gmem->kvm;
+	unsigned long index;
+
+	filemap_invalidate_lock(inode->i_mapping);
+
+	/*
+	 * Prevent concurrent attempts to *unbind* a memslot.  This is the last
+	 * reference to the file and thus no new bindings can be created, but
+	 * dereferencing the slot for existing bindings needs to be protected
+	 * against memslot updates, specifically so that unbind doesn't race
+	 * and free the memslot (kvm_gmem_get_file() will return NULL).
+	 */
+	mutex_lock(&kvm->slots_lock);
+
+	xa_for_each(&gmem->bindings, index, slot)
+		rcu_assign_pointer(slot->gmem.file, NULL);
+
+	synchronize_rcu();
+
+	/*
+	 * All in-flight operations are gone and new bindings can be created.
+	 * Zap all SPTEs pointed at by this file.  Do not free the backing
+	 * memory, as its lifetime is associated with the inode, not the file.
+	 */
+	kvm_gmem_invalidate_begin(gmem, 0, -1ul);
+	kvm_gmem_invalidate_end(gmem, 0, -1ul);
+
+	mutex_unlock(&kvm->slots_lock);
+
+	list_del(&gmem->entry);
+
+	filemap_invalidate_unlock(inode->i_mapping);
+
+	xa_destroy(&gmem->bindings);
+	kfree(gmem);
+
+	kvm_put_kvm(kvm);
+
+	return 0;
+}
+
+static struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot)
+{
+	struct file *file;
+
+	rcu_read_lock();
+
+	file = rcu_dereference(slot->gmem.file);
+	if (file && !get_file_rcu(file))
+		file = NULL;
+
+	rcu_read_unlock();
+
+	return file;
+}
+
+static const struct file_operations kvm_gmem_fops = {
+	.open		= generic_file_open,
+	.release	= kvm_gmem_release,
+	.fallocate	= kvm_gmem_fallocate,
+};
+
+static int kvm_gmem_migrate_folio(struct address_space *mapping,
+				  struct folio *dst, struct folio *src,
+				  enum migrate_mode mode)
+{
+	WARN_ON_ONCE(1);
+	return -EINVAL;
+}
+
+static int kvm_gmem_error_page(struct address_space *mapping, struct page *page)
+{
+	struct list_head *gmem_list = &mapping->private_list;
+	struct kvm_memory_slot *slot;
+	struct kvm_gmem *gmem;
+	unsigned long index;
+	pgoff_t start, end;
+	gfn_t gfn;
+
+	filemap_invalidate_lock_shared(mapping);
+
+	start = page->index;
+	end = start + thp_nr_pages(page);
+
+	list_for_each_entry(gmem, gmem_list, entry) {
+		xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) {
+			for (gfn = start; gfn < end; gfn++) {
+				if (WARN_ON_ONCE(gfn < slot->base_gfn ||
+						gfn >= slot->base_gfn + slot->npages))
+					continue;
+
+				/*
+				 * FIXME: Tell userspace that the *private*
+				 * memory encountered an error.
+				 */
+				send_sig_mceerr(BUS_MCEERR_AR,
+						(void __user *)gfn_to_hva_memslot(slot, gfn),
+						PAGE_SHIFT, current);
+			}
+		}
+	}
+
+	filemap_invalidate_unlock_shared(mapping);
+
+	return 0;
+}
+
+static const struct address_space_operations kvm_gmem_aops = {
+	.dirty_folio = noop_dirty_folio,
+#ifdef CONFIG_MIGRATION
+	.migrate_folio	= kvm_gmem_migrate_folio,
+#endif
+	.error_remove_page = kvm_gmem_error_page,
+};
+
+static int  kvm_gmem_getattr(struct mnt_idmap *idmap,
+			     const struct path *path, struct kstat *stat,
+			     u32 request_mask, unsigned int query_flags)
+{
+	struct inode *inode = path->dentry->d_inode;
+
+	/* TODO */
+	generic_fillattr(idmap, inode, stat);
+	return 0;
+}
+
+static int kvm_gmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
+			    struct iattr *attr)
+{
+	/* TODO */
+	return -EINVAL;
+}
+static const struct inode_operations kvm_gmem_iops = {
+	.getattr	= kvm_gmem_getattr,
+	.setattr	= kvm_gmem_setattr,
+};
+
+static int __kvm_gmem_create(struct kvm *kvm, loff_t size, struct vfsmount *mnt)
+{
+	const char *anon_name = "[kvm-gmem]";
+	const struct qstr qname = QSTR_INIT(anon_name, strlen(anon_name));
+	struct kvm_gmem *gmem;
+	struct inode *inode;
+	struct file *file;
+	int fd, err;
+
+	inode = alloc_anon_inode(mnt->mnt_sb);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	err = security_inode_init_security_anon(inode, &qname, NULL);
+	if (err)
+		goto err_inode;
+
+	inode->i_private = (void *)(unsigned long)flags;
+	inode->i_op = &kvm_gmem_iops;
+	inode->i_mapping->a_ops = &kvm_gmem_aops;
+	inode->i_mode |= S_IFREG;
+	inode->i_size = size;
+	mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
+	mapping_set_unevictable(inode->i_mapping);
+	mapping_set_unmovable(inode->i_mapping);
+
+	fd = get_unused_fd_flags(0);
+	if (fd < 0) {
+		err = fd;
+		goto err_inode;
+	}
+
+	file = alloc_file_pseudo(inode, mnt, "kvm-gmem", O_RDWR, &kvm_gmem_fops);
+	if (IS_ERR(file)) {
+		err = PTR_ERR(file);
+		goto err_fd;
+	}
+
+	file->f_flags |= O_LARGEFILE;
+	file->f_mapping = inode->i_mapping;
+
+	gmem = kzalloc(sizeof(*gmem), GFP_KERNEL);
+	if (!gmem) {
+		err = -ENOMEM;
+		goto err_file;
+	}
+
+	kvm_get_kvm(kvm);
+	gmem->kvm = kvm;
+	xa_init(&gmem->bindings);
+
+	file->private_data = gmem;
+
+	list_add(&gmem->entry, &inode->i_mapping->private_list);
+
+	fd_install(fd, file);
+	return fd;
+
+err_file:
+	fput(file);
+err_fd:
+	put_unused_fd(fd);
+err_inode:
+	iput(inode);
+	return err;
+}
+
+static bool kvm_gmem_is_valid_size(loff_t size, u64 flags)
+{
+	if (size < 0 || !PAGE_ALIGNED(size))
+		return false;
+
+	return true;
+}
+
+int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
+{
+	loff_t size = args->size;
+	u64 flags = args->flags;
+	u64 valid_flags = 0;
+
+	if (flags & ~valid_flags)
+		return -EINVAL;
+
+	if (!kvm_gmem_is_valid_size(size, flags))
+		return -EINVAL;
+
+	return __kvm_gmem_create(kvm, size, flags, kvm_gmem_mnt);
+}
+
+int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
+		  unsigned int fd, loff_t offset)
+{
+	loff_t size = slot->npages << PAGE_SHIFT;
+	unsigned long start, end, flags;
+	struct kvm_gmem *gmem;
+	struct inode *inode;
+	struct file *file;
+
+	BUILD_BUG_ON(sizeof(gfn_t) != sizeof(slot->gmem.pgoff));
+
+	file = fget(fd);
+	if (!file)
+		return -EINVAL;
+
+	if (file->f_op != &kvm_gmem_fops)
+		goto err;
+
+	gmem = file->private_data;
+	if (gmem->kvm != kvm)
+		goto err;
+
+	inode = file_inode(file);
+	flags = (unsigned long)inode->i_private;
+
+	/*
+	 * For simplicity, require the offset into the file and the size of the
+	 * memslot to be aligned to the largest possible page size used to back
+	 * the file (same as the size of the file itself).
+	 */
+	if (!kvm_gmem_is_valid_size(offset, flags) ||
+	    !kvm_gmem_is_valid_size(size, flags))
+		goto err;
+
+	if (offset + size > i_size_read(inode))
+		goto err;
+
+	filemap_invalidate_lock(inode->i_mapping);
+
+	start = offset >> PAGE_SHIFT;
+	end = start + slot->npages;
+
+	if (!xa_empty(&gmem->bindings) &&
+	    xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) {
+		filemap_invalidate_unlock(inode->i_mapping);
+		goto err;
+	}
+
+	/*
+	 * No synchronize_rcu() needed, any in-flight readers are guaranteed to
+	 * be see either a NULL file or this new file, no need for them to go
+	 * away.
+	 */
+	rcu_assign_pointer(slot->gmem.file, file);
+	slot->gmem.pgoff = start;
+
+	xa_store_range(&gmem->bindings, start, end - 1, slot, GFP_KERNEL);
+	filemap_invalidate_unlock(inode->i_mapping);
+
+	/*
+	 * Drop the reference to the file, even on success.  The file pins KVM,
+	 * not the other way 'round.  Active bindings are invalidated if the
+	 * file is closed before memslots are destroyed.
+	 */
+	fput(file);
+	return 0;
+
+err:
+	fput(file);
+	return -EINVAL;
+}
+
+void kvm_gmem_unbind(struct kvm_memory_slot *slot)
+{
+	unsigned long start = slot->gmem.pgoff;
+	unsigned long end = start + slot->npages;
+	struct kvm_gmem *gmem;
+	struct file *file;
+
+	/*
+	 * Nothing to do if the underlying file was already closed (or is being
+	 * closed right now), kvm_gmem_release() invalidates all bindings.
+	 */
+	file = kvm_gmem_get_file(slot);
+	if (!file)
+		return;
+
+	gmem = file->private_data;
+
+	filemap_invalidate_lock(file->f_mapping);
+	xa_store_range(&gmem->bindings, start, end - 1, NULL, GFP_KERNEL);
+	rcu_assign_pointer(slot->gmem.file, NULL);
+	synchronize_rcu();
+	filemap_invalidate_unlock(file->f_mapping);
+
+	fput(file);
+}
+
+int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
+		     gfn_t gfn, kvm_pfn_t *pfn, int *max_order)
+{
+	pgoff_t index = gfn - slot->base_gfn + slot->gmem.pgoff;
+	struct kvm_gmem *gmem;
+	struct folio *folio;
+	struct page *page;
+	struct file *file;
+
+	file = kvm_gmem_get_file(slot);
+	if (!file)
+		return -EFAULT;
+
+	gmem = file->private_data;
+
+	if (WARN_ON_ONCE(xa_load(&gmem->bindings, index) != slot)) {
+		fput(file);
+		return -EIO;
+	}
+
+	folio = kvm_gmem_get_folio(file_inode(file), index);
+	if (!folio) {
+		fput(file);
+		return -ENOMEM;
+	}
+
+	page = folio_file_page(folio, index);
+
+	*pfn = page_to_pfn(page);
+	*max_order = compound_order(compound_head(page));
+
+	folio_unlock(folio);
+	fput(file);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn);
+
+static int kvm_gmem_init_fs_context(struct fs_context *fc)
+{
+	if (!init_pseudo(fc, GUEST_MEMORY_MAGIC))
+		return -ENOMEM;
+
+	return 0;
+}
+
+static struct file_system_type kvm_gmem_fs = {
+	.name		 = "kvm_guest_memory",
+	.init_fs_context = kvm_gmem_init_fs_context,
+	.kill_sb	 = kill_anon_super,
+};
+
+int kvm_gmem_init(void)
+{
+	kvm_gmem_mnt = kern_mount(&kvm_gmem_fs);
+	if (IS_ERR(kvm_gmem_mnt))
+		return PTR_ERR(kvm_gmem_mnt);
+
+	/* For giggles.  Userspace can never map this anyways. */
+	kvm_gmem_mnt->mnt_flags |= MNT_NOEXEC;
+
+	return 0;
+}
+
+void kvm_gmem_exit(void)
+{
+	kern_unmount(kvm_gmem_mnt);
+	kvm_gmem_mnt = NULL;
+}
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 1a31bfa025b0..a8686e8473a4 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -761,7 +761,7 @@  void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end)
 	}
 }
 
-static bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
+bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
 {
 	kvm_mmu_invalidate_range_add(kvm, range->start, range->end);
 	return kvm_unmap_gfn_range(kvm, range);
@@ -992,6 +992,9 @@  static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
 /* This does not remove the slot from struct kvm_memslots data structures */
 static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
 {
+	if (slot->flags & KVM_MEM_PRIVATE)
+		kvm_gmem_unbind(slot);
+
 	kvm_destroy_dirty_bitmap(slot);
 
 	kvm_arch_free_memslot(kvm, slot);
@@ -1556,10 +1559,18 @@  static void kvm_replace_memslot(struct kvm *kvm,
 	}
 }
 
-static int check_memory_region_flags(const struct kvm_userspace_memory_region2 *mem)
+static int check_memory_region_flags(struct kvm *kvm,
+				     const struct kvm_userspace_memory_region2 *mem)
 {
 	u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
 
+	if (kvm_arch_has_private_mem(kvm))
+		valid_flags |= KVM_MEM_PRIVATE;
+
+	/* Dirty logging private memory is not currently supported. */
+	if (mem->flags & KVM_MEM_PRIVATE)
+		valid_flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
+
 #ifdef __KVM_HAVE_READONLY_MEM
 	valid_flags |= KVM_MEM_READONLY;
 #endif
@@ -1968,7 +1979,7 @@  int __kvm_set_memory_region(struct kvm *kvm,
 	int as_id, id;
 	int r;
 
-	r = check_memory_region_flags(mem);
+	r = check_memory_region_flags(kvm, mem);
 	if (r)
 		return r;
 
@@ -1987,6 +1998,10 @@  int __kvm_set_memory_region(struct kvm *kvm,
 	     !access_ok((void __user *)(unsigned long)mem->userspace_addr,
 			mem->memory_size))
 		return -EINVAL;
+	if (mem->flags & KVM_MEM_PRIVATE &&
+	    (mem->gmem_offset & (PAGE_SIZE - 1) ||
+	     mem->gmem_offset + mem->memory_size < mem->gmem_offset))
+		return -EINVAL;
 	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM)
 		return -EINVAL;
 	if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
@@ -2025,6 +2040,9 @@  int __kvm_set_memory_region(struct kvm *kvm,
 		if ((kvm->nr_memslot_pages + npages) < kvm->nr_memslot_pages)
 			return -EINVAL;
 	} else { /* Modify an existing slot. */
+		/* Private memslots are immutable, they can only be deleted. */
+		if (mem->flags & KVM_MEM_PRIVATE)
+			return -EINVAL;
 		if ((mem->userspace_addr != old->userspace_addr) ||
 		    (npages != old->npages) ||
 		    ((mem->flags ^ old->flags) & KVM_MEM_READONLY))
@@ -2053,10 +2071,23 @@  int __kvm_set_memory_region(struct kvm *kvm,
 	new->npages = npages;
 	new->flags = mem->flags;
 	new->userspace_addr = mem->userspace_addr;
+	if (mem->flags & KVM_MEM_PRIVATE) {
+		r = kvm_gmem_bind(kvm, new, mem->gmem_fd, mem->gmem_offset);
+		if (r)
+			goto out;
+	}
 
 	r = kvm_set_memslot(kvm, old, new, change);
 	if (r)
-		kfree(new);
+		goto out_restricted;
+
+	return 0;
+
+out_restricted:
+	if (mem->flags & KVM_MEM_PRIVATE)
+		kvm_gmem_unbind(new);
+out:
+	kfree(new);
 	return r;
 }
 EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
@@ -2356,6 +2387,8 @@  static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
 #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
 static u64 kvm_supported_mem_attributes(struct kvm *kvm)
 {
+	if (kvm_arch_has_private_mem(kvm))
+		return KVM_MEMORY_ATTRIBUTE_PRIVATE;
 	return 0;
 }
 
@@ -5134,6 +5167,16 @@  static long kvm_vm_ioctl(struct file *filp,
 	case KVM_GET_STATS_FD:
 		r = kvm_vm_ioctl_get_stats_fd(kvm);
 		break;
+	case KVM_CREATE_GUEST_MEMFD: {
+		struct kvm_create_guest_memfd guest_memfd;
+
+		r = -EFAULT;
+		if (copy_from_user(&guest_memfd, argp, sizeof(guest_memfd)))
+			goto out;
+
+		r = kvm_gmem_create(kvm, &guest_memfd);
+		break;
+	}
 	default:
 		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
 	}
@@ -6255,12 +6298,17 @@  int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
 	if (r)
 		goto err_async_pf;
 
+	r = kvm_gmem_init();
+	if (r)
+		goto err_gmem;
+
 	kvm_chardev_ops.owner = module;
 
 	kvm_preempt_ops.sched_in = kvm_sched_in;
 	kvm_preempt_ops.sched_out = kvm_sched_out;
 
 	kvm_init_debug();
+	kvm_gmem_init();
 
 	r = kvm_vfio_ops_init();
 	if (WARN_ON_ONCE(r))
@@ -6281,6 +6329,8 @@  int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
 err_register:
 	kvm_vfio_ops_exit();
 err_vfio:
+	kvm_gmem_exit();
+err_gmem:
 	kvm_async_pf_deinit();
 err_async_pf:
 	kvm_irqfd_exit();
diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h
index 180f1a09e6ba..798f20d612bb 100644
--- a/virt/kvm/kvm_mm.h
+++ b/virt/kvm/kvm_mm.h
@@ -37,4 +37,42 @@  static inline void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm,
 }
 #endif /* HAVE_KVM_PFNCACHE */
 
+#ifdef CONFIG_KVM_PRIVATE_MEM
+int kvm_gmem_init(void);
+void kvm_gmem_exit(void);
+int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args);
+int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
+		  unsigned int fd, loff_t offset);
+void kvm_gmem_unbind(struct kvm_memory_slot *slot);
+#else
+static inline int kvm_gmem_init(void)
+{
+	return 0;
+}
+
+static inline void kvm_gmem_exit(void)
+{
+
+}
+
+static inline int kvm_gmem_create(struct kvm *kvm,
+				  struct kvm_create_guest_memfd *args)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int kvm_gmem_bind(struct kvm *kvm,
+					 struct kvm_memory_slot *slot,
+					 unsigned int fd, loff_t offset)
+{
+	WARN_ON_ONCE(1);
+	return -EIO;
+}
+
+static inline void kvm_gmem_unbind(struct kvm_memory_slot *slot)
+{
+	WARN_ON_ONCE(1);
+}
+#endif /* CONFIG_KVM_PRIVATE_MEM */
+
 #endif /* __KVM_MM_H__ */