[05/10] guestmemfs: add file mmap callback

Message ID 20240805093245.889357-6-jgowans@amazon.com (mailing list archive)
State New, archived
Series Introduce guestmemfs: persistent in-memory filesystem

Commit Message

Gowans, James Aug. 5, 2024, 9:32 a.m. UTC
Make the file data usable to userspace by adding mmap. That's all that
QEMU needs for guest RAM, so that's all we bother implementing for now.

When mmapping the file the VMA is marked as PFNMAP to indicate that there
are no struct pages for the memory in this VMA. remap_pfn_range() is
used to actually populate the page tables. All PTEs are pre-faulted into
the pgtables at mmap time so that the pgtables are usable when this
virtual address range is given to VFIO's MAP_DMA.

Signed-off-by: James Gowans <jgowans@amazon.com>
---
 fs/guestmemfs/file.c       | 43 +++++++++++++++++++++++++++++++++++++-
 fs/guestmemfs/guestmemfs.c |  2 +-
 fs/guestmemfs/guestmemfs.h |  3 +++
 3 files changed, 46 insertions(+), 2 deletions(-)
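
For context, this is roughly how a VMM would consume the mapping and hand it
to VFIO; a minimal userspace sketch, assuming a VFIO container fd that already
has an IOMMU attached, with the file path and size made up for illustration:

#include <fcntl.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/vfio.h>

/* Map 2 GiB of guest RAM from a guestmemfs file and DMA-map it via VFIO. */
static int map_guest_ram(int container_fd)
{
	size_t len = 2UL << 30;
	int fd = open("/mnt/guestmemfs/guest0-ram", O_RDWR);	/* illustrative path */
	void *ram;
	struct vfio_iommu_type1_dma_map dma_map = {
		.argsz = sizeof(dma_map),
		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
		.iova = 0,
		.size = len,
	};

	if (fd < 0)
		return -1;
	ram = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (ram == MAP_FAILED)
		return -1;
	dma_map.vaddr = (uintptr_t)ram;
	/*
	 * mmap() above pre-faulted all PTEs, so the page tables are already
	 * populated by the time VFIO walks this virtual address range.
	 */
	return ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
}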

Comments

Elliot Berman Oct. 29, 2024, 11:05 p.m. UTC | #1
On Mon, Aug 05, 2024 at 11:32:40AM +0200, James Gowans wrote:
> Make the file data usable to userspace by adding mmap. That's all that
> QEMU needs for guest RAM, so that's all be bother implementing for now.
> 
> When mmaping the file the VMA is marked as PFNMAP to indicate that there
> are no struct pages for the memory in this VMA. Remap_pfn_range() is
> used to actually populate the page tables. All PTEs are pre-faulted into
> the pgtables at mmap time so that the pgtables are usable when this
> virtual address range is given to VFIO's MAP_DMA.

Thanks for sending this out! I'm going through the series with the
intention to see how it might fit within the existing guest_memfd work
for pKVM/CoCo/Gunyah.

It might've been mentioned in the MM alignment session -- you might be
interested to join the guest_memfd bi-weekly call to see how we are
overlapping [1].

[1]: https://lore.kernel.org/kvm/ae794891-fe69-411a-b82e-6963b594a62a@redhat.com/T/

---

Was the decision to pre-fault everything because it was convenient to do
or otherwise intentionally different from hugetlb?

> 
> Signed-off-by: James Gowans <jgowans@amazon.com>
> ---
>  fs/guestmemfs/file.c       | 43 +++++++++++++++++++++++++++++++++++++-
>  fs/guestmemfs/guestmemfs.c |  2 +-
>  fs/guestmemfs/guestmemfs.h |  3 +++
>  3 files changed, 46 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/guestmemfs/file.c b/fs/guestmemfs/file.c
> index 618c93b12196..b1a52abcde65 100644
> --- a/fs/guestmemfs/file.c
> +++ b/fs/guestmemfs/file.c
> @@ -1,6 +1,7 @@
>  // SPDX-License-Identifier: GPL-2.0-only
>  
>  #include "guestmemfs.h"
> +#include <linux/mm.h>
>  
>  static int truncate(struct inode *inode, loff_t newsize)
>  {
> @@ -41,6 +42,46 @@ static int inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct
>  	return 0;
>  }
>  
> +/*
> + * To be able to use PFNMAP VMAs for VFIO DMA mapping we need the page tables
> + * populated with mappings. Pre-fault everything.
> + */
> +static int mmap(struct file *filp, struct vm_area_struct *vma)
> +{
> +	int rc;
> +	unsigned long *mappings_block;
> +	struct guestmemfs_inode *guestmemfs_inode;
> +
> +	guestmemfs_inode = guestmemfs_get_persisted_inode(filp->f_inode->i_sb,
> +			filp->f_inode->i_ino);
> +
> +	mappings_block = guestmemfs_inode->mappings;
> +
> +	/* Remap-pfn-range will mark the range VM_IO */
> +	for (unsigned long vma_addr_offset = vma->vm_start;
> +			vma_addr_offset < vma->vm_end;
> +			vma_addr_offset += PMD_SIZE) {
> +		int block, mapped_block;
> +		unsigned long map_size = min(PMD_SIZE, vma->vm_end - vma_addr_offset);
> +
> +		block = (vma_addr_offset - vma->vm_start) / PMD_SIZE;
> +		mapped_block = *(mappings_block + block);
> +		/*
> +		 * It's wrong to use rempa_pfn_range; this will install PTE-level entries.
> +		 * The whole point of 2 MiB allocs is to improve TLB perf!
> +		 * We should use something like mm/huge_memory.c#insert_pfn_pmd
> +		 * but that is currently static.
> +		 * TODO: figure out the best way to install PMDs.
> +		 */
> +		rc = remap_pfn_range(vma,
> +				vma_addr_offset,
> +				(guestmemfs_base >> PAGE_SHIFT) + (mapped_block * 512),
> +				map_size,
> +				vma->vm_page_prot);
> +	}
> +	return 0;
> +}
> +
>  const struct inode_operations guestmemfs_file_inode_operations = {
>  	.setattr = inode_setattr,
>  	.getattr = simple_getattr,
> @@ -48,5 +89,5 @@ const struct inode_operations guestmemfs_file_inode_operations = {
>  
>  const struct file_operations guestmemfs_file_fops = {
>  	.owner = THIS_MODULE,
> -	.iterate_shared = NULL,
> +	.mmap = mmap,
>  };
> diff --git a/fs/guestmemfs/guestmemfs.c b/fs/guestmemfs/guestmemfs.c
> index c45c796c497a..38f20ad25286 100644
> --- a/fs/guestmemfs/guestmemfs.c
> +++ b/fs/guestmemfs/guestmemfs.c
> @@ -9,7 +9,7 @@
>  #include <linux/memblock.h>
>  #include <linux/statfs.h>
>  
> -static phys_addr_t guestmemfs_base, guestmemfs_size;
> +phys_addr_t guestmemfs_base, guestmemfs_size;
>  struct guestmemfs_sb *psb;
>  
>  static int statfs(struct dentry *root, struct kstatfs *buf)
> diff --git a/fs/guestmemfs/guestmemfs.h b/fs/guestmemfs/guestmemfs.h
> index 7ea03ac8ecca..0f2788ce740e 100644
> --- a/fs/guestmemfs/guestmemfs.h
> +++ b/fs/guestmemfs/guestmemfs.h
> @@ -8,6 +8,9 @@
>  #define GUESTMEMFS_FILENAME_LEN 255
>  #define GUESTMEMFS_PSB(sb) ((struct guestmemfs_sb *)sb->s_fs_info)
>  
> +/* Units of bytes */
> +extern phys_addr_t guestmemfs_base, guestmemfs_size;
> +
>  struct guestmemfs_sb {
>  	/* Inode number */
>  	unsigned long next_free_ino;
> -- 
> 2.34.1
> 
>
Frank van der Linden Oct. 30, 2024, 10:18 p.m. UTC | #2
On Tue, Oct 29, 2024 at 4:06 PM Elliot Berman <quic_eberman@quicinc.com> wrote:
>
> On Mon, Aug 05, 2024 at 11:32:40AM +0200, James Gowans wrote:
> > Make the file data usable to userspace by adding mmap. That's all that
> > QEMU needs for guest RAM, so that's all be bother implementing for now.
> >
> > When mmaping the file the VMA is marked as PFNMAP to indicate that there
> > are no struct pages for the memory in this VMA. Remap_pfn_range() is
> > used to actually populate the page tables. All PTEs are pre-faulted into
> > the pgtables at mmap time so that the pgtables are usable when this
> > virtual address range is given to VFIO's MAP_DMA.
>
> Thanks for sending this out! I'm going through the series with the
> intention to see how it might fit within the existing guest_memfd work
> for pKVM/CoCo/Gunyah.
>
> It might've been mentioned in the MM alignment session -- you might be
> interested to join the guest_memfd bi-weekly call to see how we are
> overlapping [1].
>
> [1]: https://lore.kernel.org/kvm/ae794891-fe69-411a-b82e-6963b594a62a@redhat.com/T/
>
> ---
>
> Was the decision to pre-fault everything because it was convenient to do
> or otherwise intentionally different from hugetlb?
>

It's memory that is placed outside of page allocator control, or
even outside of System RAM - VM_PFNMAP only. So you don't have much of
a choice.

In general, for things like guest memory or persistent memory, even if
struct pages were available, it doesn't seem all that useful to adhere
to the !MAP_POPULATE standard; why go through any faults to begin
with?

For guest_memfd: as I understand it, it's folio-based. And this is
VM_PFNMAP memory without struct pages / folios. So the main task there
is probably to teach guest_memfd about VM_PFNMAP memory. That would be
great, since it then ties in guest_memfd with external guest memory.

- Frank
Gowans, James Oct. 31, 2024, 3:30 p.m. UTC | #3
On Tue, 2024-10-29 at 16:05 -0700, Elliot Berman wrote:
> On Mon, Aug 05, 2024 at 11:32:40AM +0200, James Gowans wrote:
> > Make the file data usable to userspace by adding mmap. That's all that
> > QEMU needs for guest RAM, so that's all be bother implementing for now.
> > 
> > When mmaping the file the VMA is marked as PFNMAP to indicate that there
> > are no struct pages for the memory in this VMA. Remap_pfn_range() is
> > used to actually populate the page tables. All PTEs are pre-faulted into
> > the pgtables at mmap time so that the pgtables are usable when this
> > virtual address range is given to VFIO's MAP_DMA.
> 
> Thanks for sending this out! I'm going through the series with the
> intention to see how it might fit within the existing guest_memfd work
> for pKVM/CoCo/Gunyah.
> 
> It might've been mentioned in the MM alignment session -- you might be
> interested to join the guest_memfd bi-weekly call to see how we are
> overlapping [1].
> 
> [1]: https://lore.kernel.org/kvm/ae794891-fe69-411a-b82e-6963b594a62a@redhat.com/T/

Hi Elliot, yes, I think that there is a lot more overlap with
guest_memfd necessary here. The idea was to extend guestmemfs at some
point to have a guest_memfd style interface, but it was pointed out at
the MM alignment call that doing so would require guestmemfs to
duplicate the API surface of guest_memfd. This is undesirable. Better
would be to have persistence implemented as a custom allocator behind a
normal guest_memfd. I'm not too sure how this would be actually done in
practice, specifically: 
- how the persistent pool would be defined
- how it would be supplied to guest_memfd
- how the guest_memfds would be re-discovered after kexec
But assuming we can figure out some way to do this, I think it's a
better way to go.

I'll join the guest_memfd call shortly to see the developments there and
where persistence would fit best.

Hopefully we can figure out in theory how this could work, then I'll put
together another RFC sketching it out.

JG
Jason Gunthorpe Oct. 31, 2024, 4:06 p.m. UTC | #4
On Thu, Oct 31, 2024 at 03:30:59PM +0000, Gowans, James wrote:
> On Tue, 2024-10-29 at 16:05 -0700, Elliot Berman wrote:
> > On Mon, Aug 05, 2024 at 11:32:40AM +0200, James Gowans wrote:
> > > Make the file data usable to userspace by adding mmap. That's all that
> > > QEMU needs for guest RAM, so that's all be bother implementing for now.
> > > 
> > > When mmaping the file the VMA is marked as PFNMAP to indicate that there
> > > are no struct pages for the memory in this VMA. Remap_pfn_range() is
> > > used to actually populate the page tables. All PTEs are pre-faulted into
> > > the pgtables at mmap time so that the pgtables are usable when this
> > > virtual address range is given to VFIO's MAP_DMA.
> > 
> > Thanks for sending this out! I'm going through the series with the
> > intention to see how it might fit within the existing guest_memfd work
> > for pKVM/CoCo/Gunyah.
> > 
> > It might've been mentioned in the MM alignment session -- you might be
> > interested to join the guest_memfd bi-weekly call to see how we are
> > overlapping [1].
> > 
> > [1]: https://lore.kernel.org/kvm/ae794891-fe69-411a-b82e-6963b594a62a@redhat.com/T/
> 
> Hi Elliot, yes, I think that there is a lot more overlap with
> guest_memfd necessary here. The idea was to extend guestmemfs at some
> point to have a guest_memfd style interface, but it was pointed out at
> the MM alignment call that doing so would require guestmemfs to
> duplicate the API surface of guest_memfd. This is undesirable. Better
> would be to have persistence implemented as a custom allocator behind a
> normal guest_memfd. I'm not too sure how this would be actually done in
> practice, specifically: 
> - how the persistent pool would be defined
> - how it would be supplied to guest_memfd
> - how the guest_memfds would be re-discovered after kexec
> But assuming we can figure out some way to do this, I think it's a
> better way to go.

I think the filesystem interface seemed reasonable; you just want
open() on the filesystem to return a normal guest_memfd and
re-use all of that code to implement it.

When opened through the filesystem guest_memfd would get hooked by the
KHO stuff to manage its memory, somehow.

Really KHO just needs to keep track of the addresses in the
guest_memfd when it serializes, right? So maybe all it needs is a way
to freeze the guest_memfd so its memory map doesn't change anymore,
then a way to extract the addresses from it for serialization?

Jason
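
To make the freeze-and-record idea concrete, here is a very rough sketch of
the kind of per-file state KHO would need to carry across kexec. Everything
below is hypothetical -- guest_memfd_freeze() and
guest_memfd_for_each_extent() don't exist today and only stand in for
whatever interface guest_memfd eventually grows:

#include <linux/fs.h>
#include <linux/types.h>

/* One record per physically contiguous extent backing a guest_memfd. */
struct guestmemfd_kho_extent {
	u64 file_offset;	/* offset within the guest_memfd */
	u64 phys_addr;		/* physical address of the backing memory */
	u64 len;		/* length of the extent in bytes */
};

/*
 * Hypothetical serialisation step: freeze the memfd so its layout can no
 * longer change, then hand each backing extent to KHO for preservation.
 */
static int guestmemfd_kho_serialise(struct file *memfd,
				    int (*emit)(const struct guestmemfd_kho_extent *))
{
	int ret;

	ret = guest_memfd_freeze(memfd);			/* hypothetical */
	if (ret)
		return ret;
	return guest_memfd_for_each_extent(memfd, emit);	/* hypothetical */
}
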
Gowans, James Nov. 1, 2024, 12:55 p.m. UTC | #5
On Wed, 2024-10-30 at 15:18 -0700, Frank van der Linden wrote:
> On Tue, Oct 29, 2024 at 4:06 PM Elliot Berman <quic_eberman@quicinc.com> wrote:
> > 
> > On Mon, Aug 05, 2024 at 11:32:40AM +0200, James Gowans wrote:
> > > Make the file data usable to userspace by adding mmap. That's all that
> > > QEMU needs for guest RAM, so that's all be bother implementing for now.
> > > 
> > > When mmaping the file the VMA is marked as PFNMAP to indicate that there
> > > are no struct pages for the memory in this VMA. Remap_pfn_range() is
> > > used to actually populate the page tables. All PTEs are pre-faulted into
> > > the pgtables at mmap time so that the pgtables are usable when this
> > > virtual address range is given to VFIO's MAP_DMA.
> > 
> > Thanks for sending this out! I'm going through the series with the
> > intention to see how it might fit within the existing guest_memfd work
> > for pKVM/CoCo/Gunyah.
> > 
> > It might've been mentioned in the MM alignment session -- you might be
> > interested to join the guest_memfd bi-weekly call to see how we are
> > overlapping [1].
> > 
> > [1]: https://lore.kernel.org/kvm/ae794891-fe69-411a-b82e-6963b594a62a@redhat.com/T/
> > 
> > ---
> > 
> > Was the decision to pre-fault everything because it was convenient to do
> > or otherwise intentionally different from hugetlb?
> > 
> 
> It's memory that is placed outside of of page allocator control, or
> even outside of System RAM - VM_PFNMAP only. So you don't have much of
> a choice..
> 
> In general, for things like guest memory or persistent memory, even if
> struct pages were available, it doesn't seem all that useful to adhere
> to the !MAP_POPULATE standard, why go through any faults to begin
> with?
> 
> For guest_memfd: as I understand it, it's folio-based. And this is
> VM_PFNMAP memory without struct pages / folios. So the main task there
> is probably to teach guest_memfd about VM_PFNMAP memory. That would be
> great, since it then ties in guest_memfd with external guest memory.

Exactly - I think all of the comments on this series are heading in a
similar direction: let's add a custom reserved (PFNMAP) persistent
memory allocator behind guest_memfd and expose that as a filesystem.
This is what the next version of the patch series will do.

JG
Gowans, James Nov. 1, 2024, 1:01 p.m. UTC | #6
On Thu, 2024-10-31 at 13:06 -0300, Jason Gunthorpe wrote:
> On Thu, Oct 31, 2024 at 03:30:59PM +0000, Gowans, James wrote:
> > On Tue, 2024-10-29 at 16:05 -0700, Elliot Berman wrote:
> > > On Mon, Aug 05, 2024 at 11:32:40AM +0200, James Gowans wrote:
> > > > Make the file data usable to userspace by adding mmap. That's all that
> > > > QEMU needs for guest RAM, so that's all be bother implementing for now.
> > > > 
> > > > When mmaping the file the VMA is marked as PFNMAP to indicate that there
> > > > are no struct pages for the memory in this VMA. Remap_pfn_range() is
> > > > used to actually populate the page tables. All PTEs are pre-faulted into
> > > > the pgtables at mmap time so that the pgtables are usable when this
> > > > virtual address range is given to VFIO's MAP_DMA.
> > > 
> > > Thanks for sending this out! I'm going through the series with the
> > > intention to see how it might fit within the existing guest_memfd work
> > > for pKVM/CoCo/Gunyah.
> > > 
> > > It might've been mentioned in the MM alignment session -- you might be
> > > interested to join the guest_memfd bi-weekly call to see how we are
> > > overlapping [1].
> > > 
> > > [1]: https://lore.kernel.org/kvm/ae794891-fe69-411a-b82e-6963b594a62a@redhat.com/T/
> > 
> > Hi Elliot, yes, I think that there is a lot more overlap with
> > guest_memfd necessary here. The idea was to extend guestmemfs at some
> > point to have a guest_memfd style interface, but it was pointed out at
> > the MM alignment call that doing so would require guestmemfs to
> > duplicate the API surface of guest_memfd. This is undesirable. Better
> > would be to have persistence implemented as a custom allocator behind a
> > normal guest_memfd. I'm not too sure how this would be actually done in
> > practice, specifically:
> > - how the persistent pool would be defined
> > - how it would be supplied to guest_memfd
> > - how the guest_memfds would be re-discovered after kexec
> > But assuming we can figure out some way to do this, I think it's a
> > better way to go.
> 
> I think the filesystem interface seemed reasonable, you just want
> open() on the filesystem to return back a normal guest_memfd and
> re-use all of that code to implement it.
> 
> When opened through the filesystem guest_memfd would get hooked by the
> KHO stuff to manage its memory, somehow.
> 
> Really KHO just needs to keep track of the addresess in the
> guest_memfd when it serializes, right? So maybe all it needs is a way
> to freeze the guest_memfd so it's memory map doesn't change anymore,
> then a way to extract the addresses from it for serialization?

Thanks Jason, that sounds perfect. I'll work on the next rev which will:
- expose a filesystem which owns reserved/persistent memory, just like
this patch.
- rebased on top of the patches which pull out the guest_memfd code into
a library
- rebased on top of the guest_memfd patches which support adding a
different backing allocator (hugetlbfs) to guest_memfd
- when a file in guestmemfs is opened, create a guest_memfd object from
the guest_memfd library code and set guestmemfs as the custom allocator
for the file.
- serialise and re-hydrate the guest_memfds which have been created in
guestmemfs on kexec via KHO.

The main difference is that opening a guestmemfs file won't give a
regular file; rather, it will give a guest_memfd library object. This
will give good code re-use with the guest_memfd library and avoid needing
to re-implement the guest_memfd API surface here.

Sounds like a great path forward. :-)

JG

> 
> Jason
Jason Gunthorpe Nov. 1, 2024, 1:42 p.m. UTC | #7
On Fri, Nov 01, 2024 at 01:01:00PM +0000, Gowans, James wrote:

> Thanks Jason, that sounds perfect. I'll work on the next rev which will:
> - expose a filesystem which owns reserved/persistent memory, just like
> this patch.

Is this step needed?

If the guest memfd is already told to get 1G pages in some normal way,
why do we need a dedicated pool just for the KHO filesystem?

Back to my suggestion, can't KHO simply freeze the guest memfd and
then extract the memory layout, and just use the normal allocator?

Or do you have a hard requirement that only KHO allocated memory can
be preserved across kexec?

Jason
Gowans, James Nov. 2, 2024, 8:24 a.m. UTC | #8
On Fri, 2024-11-01 at 10:42 -0300, Jason Gunthorpe wrote:
> 
> On Fri, Nov 01, 2024 at 01:01:00PM +0000, Gowans, James wrote:
> 
> > Thanks Jason, that sounds perfect. I'll work on the next rev which will:
> > - expose a filesystem which owns reserved/persistent memory, just like
> > this patch.
> 
> Is this step needed?
> 
> If the guest memfd is already told to get 1G pages in some normal way,
> why do we need a dedicated pool just for the KHO filesystem?
> 
> Back to my suggestion, can't KHO simply freeze the guest memfd and
> then extract the memory layout, and just use the normal allocator?
> 
> Or do you have a hard requirement that only KHO allocated memory can
> be preserved across kexec?

KHO can persist any memory ranges which are not MOVABLE. Provided that
guest_memfd does non-movable allocations, serialising and persisting
should be possible.

There are other requirements here, specifically the ability to
*guarantee* GiB-level allocations, to have the guest memory out of the
direct map for secret hiding, and to remove the struct page overhead.
Struct page overhead could be handled via HVO. But considering that the
memory must be out of the direct map, it seems unnecessary to have struct
pages, and unnecessary to have it managed by an existing allocator. The
only existing 1 GiB allocator I know of is hugetlbfs? Let me know if
there's something else that can be used.
That's the main motivation for a separate pool allocated at early boot.
This is quite similar to hugetlbfs, so a natural question is whether we
could use and serialise hugetlbfs instead, but that probably opens
another can of worms of complexity.

There's more than just the guest_memfds and their allocations to
serialise; it's probably useful to be able to have a directory structure
in the filesystem, POSIX file ACLs, and perhaps some other filesystem
metadata. For this reason I still think that having a new filesystem
designed for this use-case which creates guest_memfd objects when files
are opened is the way to go.

Let me know what you think.

JG
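
For reference, a pool like the one described above would typically be carved
out of memblock at early boot, before the buddy allocator takes over. A
hedged sketch -- the "guestmemfs=" parameter name and the error handling are
assumptions, not taken from this series:

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/memblock.h>
#include <linux/sizes.h>
#include "guestmemfs.h"	/* guestmemfs_base / guestmemfs_size */

static int __init parse_guestmemfs_size(char *p)
{
	guestmemfs_size = memparse(p, &p);
	return 0;
}
early_param("guestmemfs", parse_guestmemfs_size);

void __init guestmemfs_reserve(void)
{
	if (!guestmemfs_size)
		return;
	/*
	 * Grab a physically contiguous, 1 GiB aligned range from memblock
	 * before the page allocator exists, so GiB-level backing is
	 * guaranteed regardless of later fragmentation and the range is
	 * never handed to the buddy allocator.
	 */
	guestmemfs_base = memblock_phys_alloc(guestmemfs_size, SZ_1G);
	if (!guestmemfs_base)
		pr_err("guestmemfs: failed to reserve %pa bytes\n",
		       &guestmemfs_size);
}
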
Mike Rapoport Nov. 4, 2024, 10:49 a.m. UTC | #9
On Fri, Nov 01, 2024 at 10:42:02AM -0300, Jason Gunthorpe wrote:
> On Fri, Nov 01, 2024 at 01:01:00PM +0000, Gowans, James wrote:
> 
> > Thanks Jason, that sounds perfect. I'll work on the next rev which will:
> > - expose a filesystem which owns reserved/persistent memory, just like
> > this patch.
> 
> Is this step needed?
> 
> If the guest memfd is already told to get 1G pages in some normal way,
> why do we need a dedicated pool just for the KHO filesystem?
> 
> Back to my suggestion, can't KHO simply freeze the guest memfd and
> then extract the memory layout, and just use the normal allocator?
> 
> Or do you have a hard requirement that only KHO allocated memory can
> be preserved across kexec?

KHO does not allocate memory; it takes the ranges to preserve, makes sure
they are not overwritten during kexec, and ensures they can be retrieved
by the second kernel.
For KHO it does not matter whether the memory comes from a normal or a
special allocator.
 
> Jason
Mike Rapoport Nov. 4, 2024, 11:11 a.m. UTC | #10
On Sat, Nov 02, 2024 at 08:24:15AM +0000, Gowans, James wrote:
> On Fri, 2024-11-01 at 10:42 -0300, Jason Gunthorpe wrote:
> > 
> > On Fri, Nov 01, 2024 at 01:01:00PM +0000, Gowans, James wrote:
> > 
> > > Thanks Jason, that sounds perfect. I'll work on the next rev which will:
> > > - expose a filesystem which owns reserved/persistent memory, just like
> > > this patch.
> > 
> > Is this step needed?
> > 
> > If the guest memfd is already told to get 1G pages in some normal way,
> > why do we need a dedicated pool just for the KHO filesystem?
> > 
> > Back to my suggestion, can't KHO simply freeze the guest memfd and
> > then extract the memory layout, and just use the normal allocator?
> > 
> > Or do you have a hard requirement that only KHO allocated memory can
> > be preserved across kexec?
> 
> KHO can persist any memory ranges which are not MOVABLE. Provided that
> guest_memfd does non-movable allocations then serialising and persisting
> should be possible.
> 
> There are other requirements here, specifically the ability to be
> *guaranteed* GiB-level allocations, have the guest memory out of the
> direct map for secret hiding, and remove the struct page overhead.
> Struct page overhead could be handled via HVO. But considering that the
> memory must be out of the direct map it seems unnecessary to have struct
> pages, and unnecessary to have it managed by an existing allocator.

Having memory out of the direct map does not preclude manipulations of struct
page unless that memory is completely out of kernel control (e.g.
excluded by mem=X), and this is not necessarily the case even for VM hosts.

It's not necessary to manage the memory using an existing allocator,
but I think a specialized allocator should not be a part of guestmemfs.
 
> JG
Jason Gunthorpe Nov. 4, 2024, 2:39 p.m. UTC | #11
On Sat, Nov 02, 2024 at 08:24:15AM +0000, Gowans, James wrote:

> KHO can persist any memory ranges which are not MOVABLE. Provided that
> guest_memfd does non-movable allocations then serialising and persisting
> should be possible.
> 
> There are other requirements here, specifically the ability to be
> *guaranteed* GiB-level allocations, have the guest memory out of the
> direct map for secret hiding, and remove the struct page overhead.
> Struct page overhead could be handled via HVO. 

IMHO this should all be handled as part of normal guestmemfd operation
because it has nothing to do with KHO. Many others have asked for the
same things in guest memfd already.

So I would start by assuming guest memfd will get those things
eventually and design around a 'freeze and record' model for KHO of a
guestmemfd, instead of yet another special memory allocator.

Jason

Patch

diff --git a/fs/guestmemfs/file.c b/fs/guestmemfs/file.c
index 618c93b12196..b1a52abcde65 100644
--- a/fs/guestmemfs/file.c
+++ b/fs/guestmemfs/file.c
@@ -1,6 +1,7 @@ 
 // SPDX-License-Identifier: GPL-2.0-only
 
 #include "guestmemfs.h"
+#include <linux/mm.h>
 
 static int truncate(struct inode *inode, loff_t newsize)
 {
@@ -41,6 +42,46 @@  static int inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct
 	return 0;
 }
 
+/*
+ * To be able to use PFNMAP VMAs for VFIO DMA mapping we need the page tables
+ * populated with mappings. Pre-fault everything.
+ */
+static int mmap(struct file *filp, struct vm_area_struct *vma)
+{
+	int rc;
+	unsigned long *mappings_block;
+	struct guestmemfs_inode *guestmemfs_inode;
+
+	guestmemfs_inode = guestmemfs_get_persisted_inode(filp->f_inode->i_sb,
+			filp->f_inode->i_ino);
+
+	mappings_block = guestmemfs_inode->mappings;
+
+	/* Remap-pfn-range will mark the range VM_IO */
+	for (unsigned long vma_addr_offset = vma->vm_start;
+			vma_addr_offset < vma->vm_end;
+			vma_addr_offset += PMD_SIZE) {
+		int block, mapped_block;
+		unsigned long map_size = min(PMD_SIZE, vma->vm_end - vma_addr_offset);
+
+		block = (vma_addr_offset - vma->vm_start) / PMD_SIZE;
+		mapped_block = *(mappings_block + block);
+		/*
+		 * It's wrong to use remap_pfn_range; this will install PTE-level entries.
+		 * The whole point of 2 MiB allocs is to improve TLB perf!
+		 * We should use something like mm/huge_memory.c#insert_pfn_pmd
+		 * but that is currently static.
+		 * TODO: figure out the best way to install PMDs.
+		 */
+		rc = remap_pfn_range(vma,
+				vma_addr_offset,
+				(guestmemfs_base >> PAGE_SHIFT) + (mapped_block * 512),
+				map_size,
+				vma->vm_page_prot);
+	}
+	return 0;
+}
+
 const struct inode_operations guestmemfs_file_inode_operations = {
 	.setattr = inode_setattr,
 	.getattr = simple_getattr,
@@ -48,5 +89,5 @@  const struct inode_operations guestmemfs_file_inode_operations = {
 
 const struct file_operations guestmemfs_file_fops = {
 	.owner = THIS_MODULE,
-	.iterate_shared = NULL,
+	.mmap = mmap,
 };
diff --git a/fs/guestmemfs/guestmemfs.c b/fs/guestmemfs/guestmemfs.c
index c45c796c497a..38f20ad25286 100644
--- a/fs/guestmemfs/guestmemfs.c
+++ b/fs/guestmemfs/guestmemfs.c
@@ -9,7 +9,7 @@ 
 #include <linux/memblock.h>
 #include <linux/statfs.h>
 
-static phys_addr_t guestmemfs_base, guestmemfs_size;
+phys_addr_t guestmemfs_base, guestmemfs_size;
 struct guestmemfs_sb *psb;
 
 static int statfs(struct dentry *root, struct kstatfs *buf)
diff --git a/fs/guestmemfs/guestmemfs.h b/fs/guestmemfs/guestmemfs.h
index 7ea03ac8ecca..0f2788ce740e 100644
--- a/fs/guestmemfs/guestmemfs.h
+++ b/fs/guestmemfs/guestmemfs.h
@@ -8,6 +8,9 @@ 
 #define GUESTMEMFS_FILENAME_LEN 255
 #define GUESTMEMFS_PSB(sb) ((struct guestmemfs_sb *)sb->s_fs_info)
 
+/* Units of bytes */
+extern phys_addr_t guestmemfs_base, guestmemfs_size;
+
 struct guestmemfs_sb {
 	/* Inode number */
 	unsigned long next_free_ino;
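
A note on the TODO in mmap() above: the static insert_pfn_pmd() has an
exported wrapper, vmf_insert_pfn_pmd(), which can be called from a
huge_fault handler the way DAX does it. A rough, untested sketch of that
direction -- guestmemfs_block_to_pfn() is a hypothetical helper, and the
prototypes assume a ~v6.10-era kernel:

#include <linux/huge_mm.h>	/* vmf_insert_pfn_pmd() */
#include <linux/mm.h>
#include <linux/pfn_t.h>

/*
 * Sketch only: install PMD-level entries lazily from the fault path
 * instead of pre-faulting PTEs with remap_pfn_range(). The mmap()
 * handler would set VM_PFNMAP (and VM_HUGEPAGE) on the VMA and point
 * vma->vm_ops at guestmemfs_vm_ops instead of calling remap_pfn_range().
 */
static vm_fault_t guestmemfs_huge_fault(struct vm_fault *vmf, unsigned int order)
{
	struct vm_area_struct *vma = vmf->vma;
	unsigned long block, pfn;

	if (order != PMD_ORDER)
		return VM_FAULT_FALLBACK;

	/* Which 2 MiB block of the file backs this PMD-aligned address? */
	block = (ALIGN_DOWN(vmf->address, PMD_SIZE) - vma->vm_start) >> PMD_SHIFT;
	/* Hypothetical helper: translate a file block index to a physical PFN. */
	pfn = guestmemfs_block_to_pfn(file_inode(vma->vm_file), block);

	/* One PMD-sized entry, so the mapping actually gets 2 MiB TLB entries. */
	return vmf_insert_pfn_pmd(vmf, pfn_to_pfn_t(pfn),
				  vmf->flags & FAULT_FLAG_WRITE);
}

static const struct vm_operations_struct guestmemfs_vm_ops = {
	/* A real implementation also needs .fault for the 4 KiB fallback path. */
	.huge_fault	= guestmemfs_huge_fault,
};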