diff mbox series

[v3,03/11] KVM: guest_memfd: Allow host to map guest_memfd() pages

Message ID 20250211121128.703390-4-tabba@google.com (mailing list archive)
State New
Headers show
Series KVM: Mapping guest_memfd backed memory at the host for software protected VMs | expand

Commit Message

Fuad Tabba Feb. 11, 2025, 12:11 p.m. UTC
Add support for mmap() and fault() for guest_memfd backed memory
in the host for VMs that support in-place conversion between
shared and private (shared memory). To that end, this patch adds
the ability to check whether the VM type has that support, and
only allows mapping its memory if that's the case.

Additionally, this behavior is gated with a new configuration
option, CONFIG_KVM_GMEM_SHARED_MEM.

Signed-off-by: Fuad Tabba <tabba@google.com>

---

This patch series will allow shared memory support for software
VMs in x86. It will also introduce a similar VM type for arm64
and allow shared memory support for that. In the future, pKVM
will also support shared memory.
---
 include/linux/kvm_host.h | 11 +++++
 virt/kvm/Kconfig         |  4 ++
 virt/kvm/guest_memfd.c   | 93 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 108 insertions(+)

Comments

Fuad Tabba Feb. 12, 2025, 9:21 a.m. UTC | #1
Hi Ackerley,

On Wed, 12 Feb 2025 at 05:07, Ackerley Tng <ackerleytng@google.com> wrote:
>
> Fuad Tabba <tabba@google.com> writes:
>
> > Add support for mmap() and fault() for guest_memfd backed memory
> > in the host for VMs that support in-place conversion between
> > shared and private (shared memory). To that end, this patch adds
> > the ability to check whether the VM type has that support, and
> > only allows mapping its memory if that's the case.
> >
> > Additionally, this behavior is gated with a new configuration
> > option, CONFIG_KVM_GMEM_SHARED_MEM.
> >
> > Signed-off-by: Fuad Tabba <tabba@google.com>
> >
> > ---
> >
> > This patch series will allow shared memory support for software
> > VMs in x86. It will also introduce a similar VM type for arm64
> > and allow shared memory support for that. In the future, pKVM
> > will also support shared memory.
>
> Thanks, I agree that introducing mmap this way could help in having it
> merged earlier, independently of conversion support, to support testing.
>
> I'll adopt this patch in the next revision of 1G page support for
> guest_memfd.
>
> > ---
> >  include/linux/kvm_host.h | 11 +++++
> >  virt/kvm/Kconfig         |  4 ++
> >  virt/kvm/guest_memfd.c   | 93 ++++++++++++++++++++++++++++++++++++++++
> >  3 files changed, 108 insertions(+)
> >
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index 8b5f28f6efff..438aa3df3175 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -728,6 +728,17 @@ static inline bool kvm_arch_has_private_mem(struct kvm *kvm)
> >  }
> >  #endif
> >
> > +/*
> > + * Arch code must define kvm_arch_gmem_supports_shared_mem if support for
> > + * private memory is enabled and it supports in-place shared/private conversion.
> > + */
> > +#if !defined(kvm_arch_gmem_supports_shared_mem) && !IS_ENABLED(CONFIG_KVM_PRIVATE_MEM)
> > +static inline bool kvm_arch_gmem_supports_shared_mem(struct kvm *kvm)
> > +{
> > +     return false;
> > +}
> > +#endif
>
> Perhaps this could be declared in the #ifdef CONFIG_KVM_PRIVATE_MEM
> block?
>
> Could this be defined as a __weak symbol for architectures to override?
> Or perhaps that can be done once guest_memfd gets refactored separately
> since now the entire guest_memfd.c isn't even compiled if
> CONFIG_KVM_PRIVATE_MEM is not set.

I don't have a strong opinion about this. I did it this way because
that's what the existing code for kvm_arch_has_private_mem() is like.
It seemed logical to follow that pattern.

> > +
> >  #ifndef kvm_arch_has_readonly_mem
> >  static inline bool kvm_arch_has_readonly_mem(struct kvm *kvm)
> >  {
> > diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
> > index 54e959e7d68f..4e759e8020c5 100644
> > --- a/virt/kvm/Kconfig
> > +++ b/virt/kvm/Kconfig
> > @@ -124,3 +124,7 @@ config HAVE_KVM_ARCH_GMEM_PREPARE
> >  config HAVE_KVM_ARCH_GMEM_INVALIDATE
> >         bool
> >         depends on KVM_PRIVATE_MEM
> > +
> > +config KVM_GMEM_SHARED_MEM
> > +       select KVM_PRIVATE_MEM
> > +       bool
> > diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> > index c6f6792bec2a..85467a3ef8ea 100644
> > --- a/virt/kvm/guest_memfd.c
> > +++ b/virt/kvm/guest_memfd.c
> > @@ -317,9 +317,102 @@ void kvm_gmem_handle_folio_put(struct folio *folio)
> >  {
> >       WARN_ONCE(1, "A placeholder that shouldn't trigger. Work in progress.");
> >  }
> > +
> > +static bool kvm_gmem_offset_is_shared(struct file *file, pgoff_t index)
> > +{
> > +     struct kvm_gmem *gmem = file->private_data;
> > +
> > +     /* For now, VMs that support shared memory share all their memory. */
> > +     return kvm_arch_gmem_supports_shared_mem(gmem->kvm);
> > +}
> > +
> > +static vm_fault_t kvm_gmem_fault(struct vm_fault *vmf)
> > +{
> > +     struct inode *inode = file_inode(vmf->vma->vm_file);
> > +     struct folio *folio;
> > +     vm_fault_t ret = VM_FAULT_LOCKED;
> > +
> > +     filemap_invalidate_lock_shared(inode->i_mapping);
> > +
> > +     folio = kvm_gmem_get_folio(inode, vmf->pgoff);
> > +     if (IS_ERR(folio)) {
> > +             ret = VM_FAULT_SIGBUS;
>
> Will it always be a SIGBUS if there is some error getting a folio?

I could try to expand it, and map more of the VM_FAULT_* to the
potential return values from __filemap_get_folio(), i.e.,

-EAGAIN -> VM_FAULT_RETRY
-ENOMEM -> VM_FAULT_OOM

but some don't really map 1:1 if you read the description. For example
is it correct to map
-ENOENT -> VM_FAULT_NOPAGE /* fault installed the pte, not return page */

That said, it might be useful to map at least EAGAIN and ENOMEM.

> > +             goto out_filemap;
> > +     }
> > +
> > +     if (folio_test_hwpoison(folio)) {
> > +             ret = VM_FAULT_HWPOISON;
> > +             goto out_folio;
> > +     }
> > +
> > +     /* Must be called with folio lock held, i.e., after kvm_gmem_get_folio() */
> > +     if (!kvm_gmem_offset_is_shared(vmf->vma->vm_file, vmf->pgoff)) {
> > +             ret = VM_FAULT_SIGBUS;
> > +             goto out_folio;
> > +     }
> > +
> > +     /*
> > +      * Only private folios are marked as "guestmem" so far, and we never
> > +      * expect private folios at this point.
> > +      */
>
> Proposal - rephrase this comment as: before typed folios can be mapped,
> PGTY_guestmem is only tagged on folios so that guest_memfd will receive
> the kvm_gmem_handle_folio_put() callback. The tag is definitely not
> expected when a folio is about to be faulted in.
>
> I propose the above because I think technically when mappability is NONE
> the folio isn't private? Not sure if others see this differently.

A folio whose mappability is NONE is private as far as the host is
concerned. That said, I can rephrase this for clarity.


> > +     if (WARN_ON_ONCE(folio_test_guestmem(folio)))  {
> > +             ret = VM_FAULT_SIGBUS;
> > +             goto out_folio;
> > +     }
> > +
> > +     /* No support for huge pages. */
> > +     if (WARN_ON_ONCE(folio_test_large(folio))) {
> > +             ret = VM_FAULT_SIGBUS;
> > +             goto out_folio;
> > +     }
> > +
> > +     if (!folio_test_uptodate(folio)) {
> > +             clear_highpage(folio_page(folio, 0));
> > +             kvm_gmem_mark_prepared(folio);
> > +     }
> > +
> > +     vmf->page = folio_file_page(folio, vmf->pgoff);
> > +
> > +out_folio:
> > +     if (ret != VM_FAULT_LOCKED) {
> > +             folio_unlock(folio);
> > +             folio_put(folio);
> > +     }
> > +
> > +out_filemap:
> > +     filemap_invalidate_unlock_shared(inode->i_mapping);
> > +
> > +     return ret;
> > +}
> > +
> > +static const struct vm_operations_struct kvm_gmem_vm_ops = {
> > +     .fault = kvm_gmem_fault,
> > +};
> > +
> > +static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma)
> > +{
> > +     struct kvm_gmem *gmem = file->private_data;
> > +
> > +     if (!kvm_arch_gmem_supports_shared_mem(gmem->kvm))
> > +             return -ENODEV;
> > +
> > +     if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) !=
> > +         (VM_SHARED | VM_MAYSHARE)) {
> > +             return -EINVAL;
> > +     }
> > +
> > +     file_accessed(file);
> > +     vm_flags_set(vma, VM_DONTDUMP);
> > +     vma->vm_ops = &kvm_gmem_vm_ops;
> > +
> > +     return 0;
> > +}
> > +#else
> > +#define kvm_gmem_mmap NULL
> >  #endif /* CONFIG_KVM_GMEM_SHARED_MEM */
> >
> >  static struct file_operations kvm_gmem_fops = {
> > +     .mmap           = kvm_gmem_mmap,
>
> I think it's better to surround this with #ifdef
> CONFIG_KVM_GMEM_SHARED_MEM so that when more code gets inserted between
> the struct declaration and the definition of kvm_gmem_mmap() it is more
> obvious that .mmap is only overridden when CONFIG_KVM_GMEM_SHARED_MEM is
> set.

This is a question of style, but I prefer this because it avoids
having more ifdefs. I think that I've seen this pattern elsewhere in
the kernel. That said, if others disagree, I'm happy to change this.

Thank you,
/fuad
>
> >       .open           = generic_file_open,
> >       .release        = kvm_gmem_release,
> >       .fallocate      = kvm_gmem_fallocate,
Peter Xu Feb. 12, 2025, 9:23 p.m. UTC | #2
On Tue, Feb 11, 2025 at 12:11:19PM +0000, Fuad Tabba wrote:
> diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
> index 54e959e7d68f..4e759e8020c5 100644
> --- a/virt/kvm/Kconfig
> +++ b/virt/kvm/Kconfig
> @@ -124,3 +124,7 @@ config HAVE_KVM_ARCH_GMEM_PREPARE
>  config HAVE_KVM_ARCH_GMEM_INVALIDATE
>         bool
>         depends on KVM_PRIVATE_MEM
> +
> +config KVM_GMEM_SHARED_MEM
> +       select KVM_PRIVATE_MEM
> +       bool

No strong opinion here, but this might not be straightforward enough for
any reader to know why a shared mem option will select a private mem..

I wonder would it be clearer if we could have a config for gmem alone, and
select that option no matter how gmem would be consumed.  Then the two
options above could select it.

I'm not sure whether there're too many guest-memfd stuff hard-coded to
PRIVATE_MEM, actually that's what I hit myself both in qemu & kvm when I
wanted to try guest-memfd on QEMU as purely shared (aka no conversions, no
duplicated backends, but in-place).  So pretty much a pure question to ask
here.

The other thing is, currently guest-memfd binding only allows 1:1 binding
to kvm memslots for a specific offset range of gmem, rather than being able
to be mapped in multiple memslots:

kvm_gmem_bind():
	if (!xa_empty(&gmem->bindings) &&
	    xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) {
		filemap_invalidate_unlock(inode->i_mapping);
		goto err;
	}

I didn't dig further yet, but I feel like this won't trivially work with
things like SMRAM when in-place, which can map the same portion of a gmem
range more than once.  I wonder if this is a hard limit for guest-memfd,
and whether you hit anything similar when working on this series.

Thanks,
Fuad Tabba Feb. 13, 2025, 8:24 a.m. UTC | #3
Hi Peter,

On Wed, 12 Feb 2025 at 21:24, Peter Xu <peterx@redhat.com> wrote:
>
> On Tue, Feb 11, 2025 at 12:11:19PM +0000, Fuad Tabba wrote:
> > diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
> > index 54e959e7d68f..4e759e8020c5 100644
> > --- a/virt/kvm/Kconfig
> > +++ b/virt/kvm/Kconfig
> > @@ -124,3 +124,7 @@ config HAVE_KVM_ARCH_GMEM_PREPARE
> >  config HAVE_KVM_ARCH_GMEM_INVALIDATE
> >         bool
> >         depends on KVM_PRIVATE_MEM
> > +
> > +config KVM_GMEM_SHARED_MEM
> > +       select KVM_PRIVATE_MEM
> > +       bool
>
> No strong opinion here, but this might not be straightforward enough for
> any reader to know why a shared mem option will select a private mem..
>
> I wonder would it be clearer if we could have a config for gmem alone, and
> select that option no matter how gmem would be consumed.  Then the two
> options above could select it.
>
> I'm not sure whether there're too many guest-memfd stuff hard-coded to
> PRIVATE_MEM, actually that's what I hit myself both in qemu & kvm when I
> wanted to try guest-memfd on QEMU as purely shared (aka no conversions, no
> duplicated backends, but in-place).  So pretty much a pure question to ask
> here.

Yes, the whole thing with guest_memfd being initially called private
mem has left a few things like this, e.g., config options, function
names. It has caused (and will probably continue to cause) confusion.

In order not to blend bikeshedding over names and the patch series
adding mmap support (i.e., this one), I am planning on sending a
separate patch series to handle the name issue/

> The other thing is, currently guest-memfd binding only allows 1:1 binding
> to kvm memslots for a specific offset range of gmem, rather than being able
> to be mapped in multiple memslots:
>
> kvm_gmem_bind():
>         if (!xa_empty(&gmem->bindings) &&
>             xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) {
>                 filemap_invalidate_unlock(inode->i_mapping);
>                 goto err;
>         }
>
> I didn't dig further yet, but I feel like this won't trivially work with
> things like SMRAM when in-place, which can map the same portion of a gmem
> range more than once.  I wonder if this is a hard limit for guest-memfd,
> and whether you hit anything similar when working on this series.

I haven't thought about this much, but it could be something to tackle later on.

Thank you,
/fuad

> Thanks,
>
> --
> Peter Xu
>

On Wed, 12 Feb 2025 at 21:24, Peter Xu <peterx@redhat.com> wrote:
>
> On Tue, Feb 11, 2025 at 12:11:19PM +0000, Fuad Tabba wrote:
> > diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
> > index 54e959e7d68f..4e759e8020c5 100644
> > --- a/virt/kvm/Kconfig
> > +++ b/virt/kvm/Kconfig
> > @@ -124,3 +124,7 @@ config HAVE_KVM_ARCH_GMEM_PREPARE
> >  config HAVE_KVM_ARCH_GMEM_INVALIDATE
> >         bool
> >         depends on KVM_PRIVATE_MEM
> > +
> > +config KVM_GMEM_SHARED_MEM
> > +       select KVM_PRIVATE_MEM
> > +       bool
>
> No strong opinion here, but this might not be straightforward enough for
> any reader to know why a shared mem option will select a private mem..
>
> I wonder would it be clearer if we could have a config for gmem alone, and
> select that option no matter how gmem would be consumed.  Then the two
> options above could select it.
>
> I'm not sure whether there're too many guest-memfd stuff hard-coded to
> PRIVATE_MEM, actually that's what I hit myself both in qemu & kvm when I
> wanted to try guest-memfd on QEMU as purely shared (aka no conversions, no
> duplicated backends, but in-place).  So pretty much a pure question to ask
> here.
>
> The other thing is, currently guest-memfd binding only allows 1:1 binding
> to kvm memslots for a specific offset range of gmem, rather than being able
> to be mapped in multiple memslots:
>
> kvm_gmem_bind():
>         if (!xa_empty(&gmem->bindings) &&
>             xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) {
>                 filemap_invalidate_unlock(inode->i_mapping);
>                 goto err;
>         }
>
> I didn't dig further yet, but I feel like this won't trivially work with
> things like SMRAM when in-place, which can map the same portion of a gmem
> range more than once.  I wonder if this is a hard limit for guest-memfd,
> and whether you hit anything similar when working on this series.
>
> Thanks,
>
> --
> Peter Xu
>
diff mbox series

Patch

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 8b5f28f6efff..438aa3df3175 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -728,6 +728,17 @@  static inline bool kvm_arch_has_private_mem(struct kvm *kvm)
 }
 #endif
 
+/*
+ * Arch code must define kvm_arch_gmem_supports_shared_mem if support for
+ * private memory is enabled and it supports in-place shared/private conversion.
+ */
+#if !defined(kvm_arch_gmem_supports_shared_mem) && !IS_ENABLED(CONFIG_KVM_PRIVATE_MEM)
+static inline bool kvm_arch_gmem_supports_shared_mem(struct kvm *kvm)
+{
+	return false;
+}
+#endif
+
 #ifndef kvm_arch_has_readonly_mem
 static inline bool kvm_arch_has_readonly_mem(struct kvm *kvm)
 {
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 54e959e7d68f..4e759e8020c5 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -124,3 +124,7 @@  config HAVE_KVM_ARCH_GMEM_PREPARE
 config HAVE_KVM_ARCH_GMEM_INVALIDATE
        bool
        depends on KVM_PRIVATE_MEM
+
+config KVM_GMEM_SHARED_MEM
+       select KVM_PRIVATE_MEM
+       bool
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index c6f6792bec2a..85467a3ef8ea 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -317,9 +317,102 @@  void kvm_gmem_handle_folio_put(struct folio *folio)
 {
 	WARN_ONCE(1, "A placeholder that shouldn't trigger. Work in progress.");
 }
+
+static bool kvm_gmem_offset_is_shared(struct file *file, pgoff_t index)
+{
+	struct kvm_gmem *gmem = file->private_data;
+
+	/* For now, VMs that support shared memory share all their memory. */
+	return kvm_arch_gmem_supports_shared_mem(gmem->kvm);
+}
+
+static vm_fault_t kvm_gmem_fault(struct vm_fault *vmf)
+{
+	struct inode *inode = file_inode(vmf->vma->vm_file);
+	struct folio *folio;
+	vm_fault_t ret = VM_FAULT_LOCKED;
+
+	filemap_invalidate_lock_shared(inode->i_mapping);
+
+	folio = kvm_gmem_get_folio(inode, vmf->pgoff);
+	if (IS_ERR(folio)) {
+		ret = VM_FAULT_SIGBUS;
+		goto out_filemap;
+	}
+
+	if (folio_test_hwpoison(folio)) {
+		ret = VM_FAULT_HWPOISON;
+		goto out_folio;
+	}
+
+	/* Must be called with folio lock held, i.e., after kvm_gmem_get_folio() */
+	if (!kvm_gmem_offset_is_shared(vmf->vma->vm_file, vmf->pgoff)) {
+		ret = VM_FAULT_SIGBUS;
+		goto out_folio;
+	}
+
+	/*
+	 * Only private folios are marked as "guestmem" so far, and we never
+	 * expect private folios at this point.
+	 */
+	if (WARN_ON_ONCE(folio_test_guestmem(folio)))  {
+		ret = VM_FAULT_SIGBUS;
+		goto out_folio;
+	}
+
+	/* No support for huge pages. */
+	if (WARN_ON_ONCE(folio_test_large(folio))) {
+		ret = VM_FAULT_SIGBUS;
+		goto out_folio;
+	}
+
+	if (!folio_test_uptodate(folio)) {
+		clear_highpage(folio_page(folio, 0));
+		kvm_gmem_mark_prepared(folio);
+	}
+
+	vmf->page = folio_file_page(folio, vmf->pgoff);
+
+out_folio:
+	if (ret != VM_FAULT_LOCKED) {
+		folio_unlock(folio);
+		folio_put(folio);
+	}
+
+out_filemap:
+	filemap_invalidate_unlock_shared(inode->i_mapping);
+
+	return ret;
+}
+
+static const struct vm_operations_struct kvm_gmem_vm_ops = {
+	.fault = kvm_gmem_fault,
+};
+
+static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	struct kvm_gmem *gmem = file->private_data;
+
+	if (!kvm_arch_gmem_supports_shared_mem(gmem->kvm))
+		return -ENODEV;
+
+	if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) !=
+	    (VM_SHARED | VM_MAYSHARE)) {
+		return -EINVAL;
+	}
+
+	file_accessed(file);
+	vm_flags_set(vma, VM_DONTDUMP);
+	vma->vm_ops = &kvm_gmem_vm_ops;
+
+	return 0;
+}
+#else
+#define kvm_gmem_mmap NULL
 #endif /* CONFIG_KVM_GMEM_SHARED_MEM */
 
 static struct file_operations kvm_gmem_fops = {
+	.mmap		= kvm_gmem_mmap,
 	.open		= generic_file_open,
 	.release	= kvm_gmem_release,
 	.fallocate	= kvm_gmem_fallocate,