diff mbox series

[RFC,v2,3/4] mm/ptshare: Create new mm struct for page table sharing

Message ID 1fd52581f4e4960a4d07cb9784d56659ec139d3c.1682453344.git.khalid.aziz@oracle.com (mailing list archive)
State New
Headers show
Series Add support for sharing page tables across processes (Previously mshare) | expand

Commit Message

Khalid Aziz April 26, 2023, 4:49 p.m. UTC
When a process passes MAP_SHARED_PT flag to mmap(), create a new mm
struct to hold the shareable page table entries for the newly mapped
region.  This new mm is not associated with a task.  Its lifetime is
until the last shared mapping is deleted.  This patch also adds a
new pointer "ptshare_data" to struct address_space which points to
the data structure that will contain pointer to this newly created
mm along with properties of the shared mapping. ptshare_data
maintains a refcount for the shared mapping so that it can be
cleaned up upon last unmap.

Signed-off-by: Khalid Aziz <khalid.aziz@oracle.com>
Suggested-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 include/linux/fs.h |   2 +
 mm/Makefile        |   2 +-
 mm/internal.h      |  14 +++++
 mm/mmap.c          |  72 ++++++++++++++++++++++++++
 mm/ptshare.c       | 126 +++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 215 insertions(+), 1 deletion(-)
 create mode 100644 mm/ptshare.c

Comments

Karim Manaouil June 26, 2023, 8:08 a.m. UTC | #1
On Wed, Apr 26, 2023 at 10:49:50AM -0600, Khalid Aziz wrote:
> When a process passes MAP_SHARED_PT flag to mmap(), create a new mm
> struct to hold the shareable page table entries for the newly mapped
> region.  This new mm is not associated with a task.  Its lifetime is
> until the last shared mapping is deleted.  This patch also adds a
> new pointer "ptshare_data" to struct address_space which points to
> the data structure that will contain pointer to this newly created
> mm along with properties of the shared mapping. ptshare_data
> maintains a refcount for the shared mapping so that it can be
> cleaned up upon last unmap.
>
> Signed-off-by: Khalid Aziz <khalid.aziz@oracle.com>
> Suggested-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> ---
>  include/linux/fs.h |   2 +
>  mm/Makefile        |   2 +-
>  mm/internal.h      |  14 +++++
>  mm/mmap.c          |  72 ++++++++++++++++++++++++++
>  mm/ptshare.c       | 126 +++++++++++++++++++++++++++++++++++++++++++++
>  5 files changed, 215 insertions(+), 1 deletion(-)
>  create mode 100644 mm/ptshare.c
>
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index c85916e9f7db..db8d3257c712 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -422,6 +422,7 @@ extern const struct address_space_operations empty_aops;
>   * @private_lock: For use by the owner of the address_space.
>   * @private_list: For use by the owner of the address_space.
>   * @private_data: For use by the owner of the address_space.
> + * @ptshare_data: For shared page table use
>   */
>  struct address_space {
>       struct inode            *host;
> @@ -443,6 +444,7 @@ struct address_space {
>       spinlock_t              private_lock;
>       struct list_head        private_list;
>       void                    *private_data;
> +     void                    *ptshare_data;
>  } __attribute__((aligned(sizeof(long)))) __randomize_layout;
>       /*
>        * On most architectures that alignment is already the case; but
> diff --git a/mm/Makefile b/mm/Makefile
> index 8e105e5b3e29..d9bb14fdf220 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -40,7 +40,7 @@ mmu-y                       := nommu.o
>  mmu-$(CONFIG_MMU)    := highmem.o memory.o mincore.o \
>                          mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \
>                          msync.o page_vma_mapped.o pagewalk.o \
> -                        pgtable-generic.o rmap.o vmalloc.o
> +                        pgtable-generic.o rmap.o vmalloc.o ptshare.o
>
>
>  ifdef CONFIG_CROSS_MEMORY_ATTACH
> diff --git a/mm/internal.h b/mm/internal.h
> index 4d60d2d5fe19..3efb8738e26f 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -1047,4 +1047,18 @@ static inline bool vma_is_shared(const struct vm_area_struct *vma)
>  {
>       return vma->vm_flags & VM_SHARED_PT;
>  }
> +
> +/*
> + * mm/ptshare.c
> + */
> +struct ptshare_data {
> +     struct mm_struct *mm;
> +     refcount_t refcnt;
> +     unsigned long start;
> +     unsigned long size;
> +     unsigned long mode;
> +};

Why does ptshare_data contain the start address, size and mode of the
mapping? Does it mean ptshare_data can represent only a single mapping
of the file (the one that begins at ptshare_data->start)? What if we
want to share multiple different mappings of the same file (which may
or may not intersect)?

If we choose to use the VMAs in host_mm for that, will this possibly create
a lot of special-cased VMA handling?

> +int ptshare_new_mm(struct file *file, struct vm_area_struct *vma);
> +void ptshare_del_mm(struct vm_area_struct *vm);
> +int ptshare_insert_vma(struct mm_struct *mm, struct vm_area_struct *vma);
>  #endif       /* __MM_INTERNAL_H */
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 8b46d465f8d4..c5e9b7f6de90 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -1382,6 +1382,60 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
>           ((vm_flags & VM_LOCKED) ||
>            (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
>               *populate = len;
> +
> +#if VM_SHARED_PT
> +     /*
> +      * Check if this mapping is a candidate for page table sharing
> +      * at PMD level. It is if following conditions hold:
> +      *      - It is not anonymous mapping
> +      *      - It is not hugetlbfs mapping (for now)
> +      *      - flags conatins MAP_SHARED or MAP_SHARED_VALIDATE and
> +      *        MAP_SHARED_PT
> +      *      - Start address is aligned to PMD size
> +      *      - Mapping size is a multiple of PMD size
> +      */
> +     if (ptshare && file && !is_file_hugepages(file)) {
> +             struct vm_area_struct *vma;
> +
> +             vma = find_vma(mm, addr);
> +             if (!((vma->vm_start | vma->vm_end) & (PMD_SIZE - 1))) {
> +                     struct ptshare_data *info = file->f_mapping->ptshare_data;

This is racy with another process trying to share the same mapping of
the file. It's also racy with the removal (this process can get a
pointer to ptshare_data that's currently being freed).

> +                     /*
> +                      * If this mapping has not been set up for page table
> +                      * sharing yet, do so by creating a new mm to hold the
> +                      * shared page tables for this mapping
> +                      */
> +                     if (info == NULL) {
> +                             int ret;
> +
> +                             ret = ptshare_new_mm(file, vma);
> +                             if (ret < 0)
> +                                     return ret;
> +
> +                             info = file->f_mapping->ptshare_data;
> +                             ret = ptshare_insert_vma(info->mm, vma);
> +                             if (ret < 0)
> +                                     addr = ret;
> +                             else
> +                                     vm_flags_set(vma, VM_SHARED_PT);

Creation might race with another process.

> +                     } else {
> +                             /*
> +                              * Page tables will be shared only if the
> +                              * file is mapped in with the same permissions
> +                              * across all mappers with same starting
> +                              * address and size
> +                              */
> +                             if (((prot & info->mode) == info->mode) &&
> +                                     (addr == info->start) &&
> +                                     (len == info->size)) {
> +                                     vm_flags_set(vma, VM_SHARED_PT);
> +                                     refcount_inc(&info->refcnt);
> +                             }
> +                     }
> +             }
> +     }
> +#endif
> +
>       return addr;
>  }
>
> @@ -2495,6 +2549,22 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
>       if (end == start)
>               return -EINVAL;
>
> +     /*
> +      * Check if this vma uses shared page tables
> +      */
> +     vma = find_vma_intersection(mm, start, end);
> +     if (vma && unlikely(vma_is_shared(vma))) {
> +             struct ptshare_data *info = NULL;
> +
> +             if (vma->vm_file && vma->vm_file->f_mapping)
> +                     info = vma->vm_file->f_mapping->ptshare_data;
> +             /* Don't allow partial munmaps */
> +             if (info && ((start != info->start) || (len != info->size)))
> +                     return -EINVAL;
> +             ptshare_del_mm(vma);
> +     }
> +
> +
>        /* arch_unmap() might do unmaps itself.  */
>       arch_unmap(mm, start, end);
>
> @@ -2664,6 +2734,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
>                       }
>               }
>
> +             if (vm_flags & VM_SHARED_PT)
> +                     vm_flags_set(vma, VM_SHARED_PT);
>               vm_flags = vma->vm_flags;
>       } else if (vm_flags & VM_SHARED) {
>               error = shmem_zero_setup(vma);
> diff --git a/mm/ptshare.c b/mm/ptshare.c
> new file mode 100644
> index 000000000000..f6784268958c
> --- /dev/null
> +++ b/mm/ptshare.c
> @@ -0,0 +1,126 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Share page table entries when possible to reduce the amount of extra
> + * memory consumed by page tables
> + *
> + * Copyright (C) 2022 Oracle Corp. All rights reserved.
> + * Authors:  Khalid Aziz <khalid.aziz@oracle.com>
> + *           Matthew Wilcox <willy@infradead.org>
> + */
> +
> +#include <linux/mm.h>
> +#include <linux/fs.h>
> +#include <asm/pgalloc.h>
> +#include "internal.h"
> +
> +/*
> + * Create a new mm struct that will hold the shared PTEs. Pointer to
> + * this new mm is stored in the data structure ptshare_data which also
> + * includes a refcount for any current references to PTEs in this new
> + * mm. This refcount is used to determine when the mm struct for shared
> + * PTEs can be deleted.
> + */
> +int
> +ptshare_new_mm(struct file *file, struct vm_area_struct *vma)
> +{
> +     struct mm_struct *new_mm;
> +     struct ptshare_data *info = NULL;
> +     int retval = 0;
> +     unsigned long start = vma->vm_start;
> +     unsigned long len = vma->vm_end - vma->vm_start;
> +
> +     new_mm = mm_alloc();
> +     if (!new_mm) {
> +             retval = -ENOMEM;
> +             goto err_free;
> +     }
> +     new_mm->mmap_base = start;
> +     new_mm->task_size = len;
> +     if (!new_mm->task_size)
> +             new_mm->task_size--;
> +
> +     info = kzalloc(sizeof(*info), GFP_KERNEL);
> +     if (!info) {
> +             retval = -ENOMEM;
> +             goto err_free;
> +     }
> +     info->mm = new_mm;
> +     info->start = start;
> +     info->size = len;
> +     refcount_set(&info->refcnt, 1);
> +     file->f_mapping->ptshare_data = info;

Racy assignement. It can lead to a memory leak if another process does
the same concurrently and assigns before or after this one. The new_mm
and ptshare_data of one of them will be lost.

I think this whole process needs to be protected with i_mmap lock.

> +
> +     return retval;
> +
> +err_free:
> +     if (new_mm)
> +             mmput(new_mm);
> +     kfree(info);
> +     return retval;
> +}
> +
> +/*
> + * insert vma into mm holding shared page tables
> + */
> +int
> +ptshare_insert_vma(struct mm_struct *mm, struct vm_area_struct *vma)
> +{
> +     struct vm_area_struct *new_vma;
> +     int err = 0;
> +
> +     new_vma = vm_area_dup(vma);
> +     if (!new_vma)
> +             return -ENOMEM;
> +
> +     new_vma->vm_file = NULL;
> +     /*
> +      * This new vma belongs to host mm, so clear the VM_SHARED_PT
> +      * flag on this so we know this is the host vma when we clean
> +      * up page tables. Do not use THP for page table shared regions
> +      */
> +     vm_flags_clear(new_vma, (VM_SHARED | VM_SHARED_PT));
> +     vm_flags_set(new_vma, VM_NOHUGEPAGE);
> +     new_vma->vm_mm = mm;
> +
> +     err = insert_vm_struct(mm, new_vma);
> +     if (err)
> +             return -ENOMEM;
> +
> +     return err;
> +}
> +
> +/*
> + * Free the mm struct created to hold shared PTEs and associated data
> + * structures
> + */
> +static inline void
> +free_ptshare_mm(struct ptshare_data *info)
> +{
> +     mmput(info->mm);
> +     kfree(info);
> +}
> +
> +/*
> + * This function is called when a reference to the shared PTEs in mm
> + * struct is dropped. It updates refcount and checks to see if last
> + * reference to the mm struct holding shared PTEs has been dropped. If
> + * so, it cleans up the mm struct and associated data structures
> + */
> +void
> +ptshare_del_mm(struct vm_area_struct *vma)
> +{
> +     struct ptshare_data *info;
> +     struct file *file = vma->vm_file;
> +
> +     if (!file || (!file->f_mapping))
> +             return;
> +     info = file->f_mapping->ptshare_data;
> +     WARN_ON(!info);
> +     if (!info)
> +             return;
> +
> +     if (refcount_dec_and_test(&info->refcnt)) {
> +             free_ptshare_mm(info);
> +             file->f_mapping->ptshare_data = NULL;

Maybe those two should be reordered (after keeping a pointer to
ptshare_data). Then setting f_mapping->ptshare_data to NULL can
be performed under a lock and freeing ptshare and host_mm can be
done without a lock.

Cheers
Karim
diff mbox series

Patch

diff --git a/include/linux/fs.h b/include/linux/fs.h
index c85916e9f7db..db8d3257c712 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -422,6 +422,7 @@  extern const struct address_space_operations empty_aops;
  * @private_lock: For use by the owner of the address_space.
  * @private_list: For use by the owner of the address_space.
  * @private_data: For use by the owner of the address_space.
+ * @ptshare_data: For shared page table use
  */
 struct address_space {
 	struct inode		*host;
@@ -443,6 +444,7 @@  struct address_space {
 	spinlock_t		private_lock;
 	struct list_head	private_list;
 	void			*private_data;
+	void			*ptshare_data;
 } __attribute__((aligned(sizeof(long)))) __randomize_layout;
 	/*
 	 * On most architectures that alignment is already the case; but
diff --git a/mm/Makefile b/mm/Makefile
index 8e105e5b3e29..d9bb14fdf220 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -40,7 +40,7 @@  mmu-y			:= nommu.o
 mmu-$(CONFIG_MMU)	:= highmem.o memory.o mincore.o \
 			   mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \
 			   msync.o page_vma_mapped.o pagewalk.o \
-			   pgtable-generic.o rmap.o vmalloc.o
+			   pgtable-generic.o rmap.o vmalloc.o ptshare.o
 
 
 ifdef CONFIG_CROSS_MEMORY_ATTACH
diff --git a/mm/internal.h b/mm/internal.h
index 4d60d2d5fe19..3efb8738e26f 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1047,4 +1047,18 @@  static inline bool vma_is_shared(const struct vm_area_struct *vma)
 {
 	return vma->vm_flags & VM_SHARED_PT;
 }
+
+/*
+ * mm/ptshare.c
+ */
+struct ptshare_data {
+	struct mm_struct *mm;
+	refcount_t refcnt;
+	unsigned long start;
+	unsigned long size;
+	unsigned long mode;
+};
+int ptshare_new_mm(struct file *file, struct vm_area_struct *vma);
+void ptshare_del_mm(struct vm_area_struct *vm);
+int ptshare_insert_vma(struct mm_struct *mm, struct vm_area_struct *vma);
 #endif	/* __MM_INTERNAL_H */
diff --git a/mm/mmap.c b/mm/mmap.c
index 8b46d465f8d4..c5e9b7f6de90 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1382,6 +1382,60 @@  unsigned long do_mmap(struct file *file, unsigned long addr,
 	    ((vm_flags & VM_LOCKED) ||
 	     (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
 		*populate = len;
+
+#if VM_SHARED_PT
+	/*
+	 * Check if this mapping is a candidate for page table sharing
+	 * at PMD level. It is if following conditions hold:
+	 *	- It is not anonymous mapping
+	 *	- It is not hugetlbfs mapping (for now)
+	 *	- flags conatins MAP_SHARED or MAP_SHARED_VALIDATE and
+	 *	  MAP_SHARED_PT
+	 *	- Start address is aligned to PMD size
+	 *	- Mapping size is a multiple of PMD size
+	 */
+	if (ptshare && file && !is_file_hugepages(file)) {
+		struct vm_area_struct *vma;
+
+		vma = find_vma(mm, addr);
+		if (!((vma->vm_start | vma->vm_end) & (PMD_SIZE - 1))) {
+			struct ptshare_data *info = file->f_mapping->ptshare_data;
+			/*
+			 * If this mapping has not been set up for page table
+			 * sharing yet, do so by creating a new mm to hold the
+			 * shared page tables for this mapping
+			 */
+			if (info == NULL) {
+				int ret;
+
+				ret = ptshare_new_mm(file, vma);
+				if (ret < 0)
+					return ret;
+
+				info = file->f_mapping->ptshare_data;
+				ret = ptshare_insert_vma(info->mm, vma);
+				if (ret < 0)
+					addr = ret;
+				else
+					vm_flags_set(vma, VM_SHARED_PT);
+			} else {
+				/*
+				 * Page tables will be shared only if the
+				 * file is mapped in with the same permissions
+				 * across all mappers with same starting
+				 * address and size
+				 */
+				if (((prot & info->mode) == info->mode) &&
+					(addr == info->start) &&
+					(len == info->size)) {
+					vm_flags_set(vma, VM_SHARED_PT);
+					refcount_inc(&info->refcnt);
+				}
+			}
+		}
+	}
+#endif
+
 	return addr;
 }
 
@@ -2495,6 +2549,22 @@  int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
 	if (end == start)
 		return -EINVAL;
 
+	/*
+	 * Check if this vma uses shared page tables
+	 */
+	vma = find_vma_intersection(mm, start, end);
+	if (vma && unlikely(vma_is_shared(vma))) {
+		struct ptshare_data *info = NULL;
+
+		if (vma->vm_file && vma->vm_file->f_mapping)
+			info = vma->vm_file->f_mapping->ptshare_data;
+		/* Don't allow partial munmaps */
+		if (info && ((start != info->start) || (len != info->size)))
+			return -EINVAL;
+		ptshare_del_mm(vma);
+	}
+
+
 	 /* arch_unmap() might do unmaps itself.  */
 	arch_unmap(mm, start, end);
 
@@ -2664,6 +2734,8 @@  unsigned long mmap_region(struct file *file, unsigned long addr,
 			}
 		}
 
+		if (vm_flags & VM_SHARED_PT)
+			vm_flags_set(vma, VM_SHARED_PT);
 		vm_flags = vma->vm_flags;
 	} else if (vm_flags & VM_SHARED) {
 		error = shmem_zero_setup(vma);
diff --git a/mm/ptshare.c b/mm/ptshare.c
new file mode 100644
index 000000000000..f6784268958c
--- /dev/null
+++ b/mm/ptshare.c
@@ -0,0 +1,126 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Share page table entries when possible to reduce the amount of extra
+ * memory consumed by page tables
+ *
+ * Copyright (C) 2022 Oracle Corp. All rights reserved.
+ * Authors:	Khalid Aziz <khalid.aziz@oracle.com>
+ *		Matthew Wilcox <willy@infradead.org>
+ */
+
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <asm/pgalloc.h>
+#include "internal.h"
+
+/*
+ * Create a new mm struct that will hold the shared PTEs. Pointer to
+ * this new mm is stored in the data structure ptshare_data which also
+ * includes a refcount for any current references to PTEs in this new
+ * mm. This refcount is used to determine when the mm struct for shared
+ * PTEs can be deleted.
+ */
+int
+ptshare_new_mm(struct file *file, struct vm_area_struct *vma)
+{
+	struct mm_struct *new_mm;
+	struct ptshare_data *info = NULL;
+	int retval = 0;
+	unsigned long start = vma->vm_start;
+	unsigned long len = vma->vm_end - vma->vm_start;
+
+	new_mm = mm_alloc();
+	if (!new_mm) {
+		retval = -ENOMEM;
+		goto err_free;
+	}
+	new_mm->mmap_base = start;
+	new_mm->task_size = len;
+	if (!new_mm->task_size)
+		new_mm->task_size--;
+
+	info = kzalloc(sizeof(*info), GFP_KERNEL);
+	if (!info) {
+		retval = -ENOMEM;
+		goto err_free;
+	}
+	info->mm = new_mm;
+	info->start = start;
+	info->size = len;
+	refcount_set(&info->refcnt, 1);
+	file->f_mapping->ptshare_data = info;
+
+	return retval;
+
+err_free:
+	if (new_mm)
+		mmput(new_mm);
+	kfree(info);
+	return retval;
+}
+
+/*
+ * insert vma into mm holding shared page tables
+ */
+int
+ptshare_insert_vma(struct mm_struct *mm, struct vm_area_struct *vma)
+{
+	struct vm_area_struct *new_vma;
+	int err = 0;
+
+	new_vma = vm_area_dup(vma);
+	if (!new_vma)
+		return -ENOMEM;
+
+	new_vma->vm_file = NULL;
+	/*
+	 * This new vma belongs to host mm, so clear the VM_SHARED_PT
+	 * flag on this so we know this is the host vma when we clean
+	 * up page tables. Do not use THP for page table shared regions
+	 */
+	vm_flags_clear(new_vma, (VM_SHARED | VM_SHARED_PT));
+	vm_flags_set(new_vma, VM_NOHUGEPAGE);
+	new_vma->vm_mm = mm;
+
+	err = insert_vm_struct(mm, new_vma);
+	if (err)
+		return -ENOMEM;
+
+	return err;
+}
+
+/*
+ * Free the mm struct created to hold shared PTEs and associated data
+ * structures
+ */
+static inline void
+free_ptshare_mm(struct ptshare_data *info)
+{
+	mmput(info->mm);
+	kfree(info);
+}
+
+/*
+ * This function is called when a reference to the shared PTEs in mm
+ * struct is dropped. It updates refcount and checks to see if last
+ * reference to the mm struct holding shared PTEs has been dropped. If
+ * so, it cleans up the mm struct and associated data structures
+ */
+void
+ptshare_del_mm(struct vm_area_struct *vma)
+{
+	struct ptshare_data *info;
+	struct file *file = vma->vm_file;
+
+	if (!file || (!file->f_mapping))
+		return;
+	info = file->f_mapping->ptshare_data;
+	WARN_ON(!info);
+	if (!info)
+		return;
+
+	if (refcount_dec_and_test(&info->refcnt)) {
+		free_ptshare_mm(info);
+		file->f_mapping->ptshare_data = NULL;
+	}
+}