diff mbox series

[v2,1/9] mm: Add msharefs filesystem

Message ID de5566e71e038d95342d00364c6760c7078cb091.1656531090.git.khalid.aziz@oracle.com (mailing list archive)
State New
Headers show
Series Add support for shared PTEs across processes | expand

Commit Message

Khalid Aziz June 29, 2022, 10:53 p.m. UTC
Add a RAM-based filesystem that contains page table sharing
information and files that enable processes to share page tables.
This patch adds the basic filesystem that can be mounted.

Signed-off-by: Khalid Aziz <khalid.aziz@oracle.com>
---
 Documentation/filesystems/msharefs.rst |  19 +++++
 include/uapi/linux/magic.h             |   1 +
 mm/Makefile                            |   2 +-
 mm/mshare.c                            | 103 +++++++++++++++++++++++++
 4 files changed, 124 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/filesystems/msharefs.rst
 create mode 100644 mm/mshare.c

Comments

Darrick J. Wong June 30, 2022, 9:53 p.m. UTC | #1
On Wed, Jun 29, 2022 at 04:53:52PM -0600, Khalid Aziz wrote:
> Add a ram-based filesystem that contains page table sharing
> information and files that enables processes to share page tables.
> This patch adds the basic filesystem that can be mounted.
> 
> Signed-off-by: Khalid Aziz <khalid.aziz@oracle.com>
> ---
>  Documentation/filesystems/msharefs.rst |  19 +++++
>  include/uapi/linux/magic.h             |   1 +
>  mm/Makefile                            |   2 +-
>  mm/mshare.c                            | 103 +++++++++++++++++++++++++
>  4 files changed, 124 insertions(+), 1 deletion(-)
>  create mode 100644 Documentation/filesystems/msharefs.rst
>  create mode 100644 mm/mshare.c
> 
> diff --git a/Documentation/filesystems/msharefs.rst b/Documentation/filesystems/msharefs.rst
> new file mode 100644
> index 000000000000..fd161f67045d
> --- /dev/null
> +++ b/Documentation/filesystems/msharefs.rst
> @@ -0,0 +1,19 @@
> +.. SPDX-License-Identifier: GPL-2.0
> +
> +=====================================================
> +msharefs - a filesystem to support shared page tables
> +=====================================================
> +
> +msharefs is a ram-based filesystem that allows multiple processes to
> +share page table entries for shared pages.
> +
> +msharefs is typically mounted like this::
> +
> +	mount -t msharefs none /sys/fs/mshare
> +
> +When a process calls mshare syscall with a name for the shared address
> +range,

You mean creat()?

> a file with the same name is created under msharefs with that
> +name. This file can be opened by another process, if permissions
> +allow, to query the addresses shared under this range. These files are
> +removed by mshare_unlink syscall and can not be deleted directly.

Oh?

> +Hence these files are created as immutable files.
> diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
> index f724129c0425..2a57a6ec6f3e 100644
> --- a/include/uapi/linux/magic.h
> +++ b/include/uapi/linux/magic.h
> @@ -105,5 +105,6 @@
>  #define Z3FOLD_MAGIC		0x33
>  #define PPC_CMM_MAGIC		0xc7571590
>  #define SECRETMEM_MAGIC		0x5345434d	/* "SECM" */
> +#define MSHARE_MAGIC		0x4d534852	/* "MSHR" */
>  
>  #endif /* __LINUX_MAGIC_H__ */
> diff --git a/mm/Makefile b/mm/Makefile
> index 6f9ffa968a1a..51a2ab9080d9 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -37,7 +37,7 @@ CFLAGS_init-mm.o += $(call cc-disable-warning, override-init)
>  CFLAGS_init-mm.o += $(call cc-disable-warning, initializer-overrides)
>  
>  mmu-y			:= nommu.o
> -mmu-$(CONFIG_MMU)	:= highmem.o memory.o mincore.o \
> +mmu-$(CONFIG_MMU)	:= highmem.o memory.o mincore.o mshare.o \
>  			   mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \
>  			   msync.o page_vma_mapped.o pagewalk.o \
>  			   pgtable-generic.o rmap.o vmalloc.o
> diff --git a/mm/mshare.c b/mm/mshare.c
> new file mode 100644
> index 000000000000..c8fab3869bab
> --- /dev/null
> +++ b/mm/mshare.c

Filesystems are usually supposed to live under fs/; is there some reason
to put it in mm/?

I guess shmfs is in mm so maybe this isn't much of an objection.

Also, should this fs be selectable via a Kconfig option?

--D

> @@ -0,0 +1,103 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Enable copperating processes to share page table between
> + * them to reduce the extra memory consumed by multiple copies
> + * of page tables.
> + *
> + * This code adds an in-memory filesystem - msharefs.
> + * msharefs is used to manage page table sharing
> + *
> + *
> + * Copyright (C) 2022 Oracle Corp. All rights reserved.
> + * Author:	Khalid Aziz <khalid.aziz@oracle.com>
> + *
> + */
> +
> +#include <linux/fs.h>
> +#include <linux/mount.h>
> +#include <linux/syscalls.h>
> +#include <linux/uaccess.h>
> +#include <linux/pseudo_fs.h>
> +#include <linux/fileattr.h>
> +#include <uapi/linux/magic.h>
> +#include <uapi/linux/limits.h>
> +
> +static struct super_block *msharefs_sb;
> +
> +static const struct file_operations msharefs_file_operations = {
> +	.open	= simple_open,
> +	.llseek	= no_llseek,
> +};
> +
> +static int
> +msharefs_d_hash(const struct dentry *dentry, struct qstr *qstr)
> +{
> +	unsigned long hash = init_name_hash(dentry);
> +	const unsigned char *s = qstr->name;
> +	unsigned int len = qstr->len;
> +
> +	while (len--)
> +		hash = partial_name_hash(*s++, hash);
> +	qstr->hash = end_name_hash(hash);
> +	return 0;
> +}
> +
> +static const struct dentry_operations msharefs_d_ops = {
> +	.d_hash = msharefs_d_hash,
> +};
> +
> +static int
> +msharefs_fill_super(struct super_block *sb, struct fs_context *fc)
> +{
> +	static const struct tree_descr empty_descr = {""};
> +	int err;
> +
> +	sb->s_d_op = &msharefs_d_ops;
> +	err = simple_fill_super(sb, MSHARE_MAGIC, &empty_descr);
> +	if (err)
> +		return err;
> +
> +	msharefs_sb = sb;
> +	return 0;
> +}
> +
> +static int
> +msharefs_get_tree(struct fs_context *fc)
> +{
> +	return get_tree_single(fc, msharefs_fill_super);
> +}
> +
> +static const struct fs_context_operations msharefs_context_ops = {
> +	.get_tree	= msharefs_get_tree,
> +};
> +
> +static int
> +mshare_init_fs_context(struct fs_context *fc)
> +{
> +	fc->ops = &msharefs_context_ops;
> +	return 0;
> +}
> +
> +static struct file_system_type mshare_fs = {
> +	.name			= "msharefs",
> +	.init_fs_context	= mshare_init_fs_context,
> +	.kill_sb		= kill_litter_super,
> +};
> +
> +static int
> +mshare_init(void)
> +{
> +	int ret = 0;
> +
> +	ret = sysfs_create_mount_point(fs_kobj, "mshare");
> +	if (ret)
> +		return ret;
> +
> +	ret = register_filesystem(&mshare_fs);
> +	if (ret)
> +		sysfs_remove_mount_point(fs_kobj, "mshare");
> +
> +	return ret;
> +}
> +
> +fs_initcall(mshare_init);
> -- 
> 2.32.0
>
Al Viro June 30, 2022, 10:57 p.m. UTC | #2
On Wed, Jun 29, 2022 at 04:53:52PM -0600, Khalid Aziz wrote:
> +static int
> +msharefs_d_hash(const struct dentry *dentry, struct qstr *qstr)
> +{
> +	unsigned long hash = init_name_hash(dentry);
> +	const unsigned char *s = qstr->name;
> +	unsigned int len = qstr->len;
> +
> +	while (len--)
> +		hash = partial_name_hash(*s++, hash);
> +	qstr->hash = end_name_hash(hash);
> +	return 0;
> +}

What do you need that for and how is it different from letting it
use full_name_hash() (which is what it will do if you leave
dentry_operations->d_hash equal to NULL)?
Khalid Aziz July 1, 2022, 4:05 p.m. UTC | #3
On 6/30/22 15:53, Darrick J. Wong wrote:
> On Wed, Jun 29, 2022 at 04:53:52PM -0600, Khalid Aziz wrote:
>> Add a ram-based filesystem that contains page table sharing
>> information and files that enables processes to share page tables.
>> This patch adds the basic filesystem that can be mounted.
>>
>> Signed-off-by: Khalid Aziz <khalid.aziz@oracle.com>
>> ---
>>   Documentation/filesystems/msharefs.rst |  19 +++++
>>   include/uapi/linux/magic.h             |   1 +
>>   mm/Makefile                            |   2 +-
>>   mm/mshare.c                            | 103 +++++++++++++++++++++++++
>>   4 files changed, 124 insertions(+), 1 deletion(-)
>>   create mode 100644 Documentation/filesystems/msharefs.rst
>>   create mode 100644 mm/mshare.c
>>
>> diff --git a/Documentation/filesystems/msharefs.rst b/Documentation/filesystems/msharefs.rst
>> new file mode 100644
>> index 000000000000..fd161f67045d
>> --- /dev/null
>> +++ b/Documentation/filesystems/msharefs.rst
>> @@ -0,0 +1,19 @@
>> +.. SPDX-License-Identifier: GPL-2.0
>> +
>> +=====================================================
>> +msharefs - a filesystem to support shared page tables
>> +=====================================================
>> +
>> +msharefs is a ram-based filesystem that allows multiple processes to
>> +share page table entries for shared pages.
>> +
>> +msharefs is typically mounted like this::
>> +
>> +	mount -t msharefs none /sys/fs/mshare
>> +
>> +When a process calls mshare syscall with a name for the shared address
>> +range,
> 
> You mean creat()?
> 
>> a file with the same name is created under msharefs with that
>> +name. This file can be opened by another process, if permissions
>> +allow, to query the addresses shared under this range. These files are
>> +removed by mshare_unlink syscall and can not be deleted directly.
> 
> Oh?
> 

msharefs.rst needs to be updated.

>> +Hence these files are created as immutable files.
>> diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
>> index f724129c0425..2a57a6ec6f3e 100644
>> --- a/include/uapi/linux/magic.h
>> +++ b/include/uapi/linux/magic.h
>> @@ -105,5 +105,6 @@
>>   #define Z3FOLD_MAGIC		0x33
>>   #define PPC_CMM_MAGIC		0xc7571590
>>   #define SECRETMEM_MAGIC		0x5345434d	/* "SECM" */
>> +#define MSHARE_MAGIC		0x4d534852	/* "MSHR" */
>>   
>>   #endif /* __LINUX_MAGIC_H__ */
>> diff --git a/mm/Makefile b/mm/Makefile
>> index 6f9ffa968a1a..51a2ab9080d9 100644
>> --- a/mm/Makefile
>> +++ b/mm/Makefile
>> @@ -37,7 +37,7 @@ CFLAGS_init-mm.o += $(call cc-disable-warning, override-init)
>>   CFLAGS_init-mm.o += $(call cc-disable-warning, initializer-overrides)
>>   
>>   mmu-y			:= nommu.o
>> -mmu-$(CONFIG_MMU)	:= highmem.o memory.o mincore.o \
>> +mmu-$(CONFIG_MMU)	:= highmem.o memory.o mincore.o mshare.o \
>>   			   mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \
>>   			   msync.o page_vma_mapped.o pagewalk.o \
>>   			   pgtable-generic.o rmap.o vmalloc.o
>> diff --git a/mm/mshare.c b/mm/mshare.c
>> new file mode 100644
>> index 000000000000..c8fab3869bab
>> --- /dev/null
>> +++ b/mm/mshare.c
> 
> Filesystems are usually supposed to live under fs/; is there some reason
> to put it in mm/?
> 
> I guess shmfs is in mm so maybe this isn't much of an objection.
> 
> Also, should this fs be selectable via a Kconfig option?

Since this filesystem is meant to support an mm feature, I felt it was more appropriate for it to reside under mm/, 
similar to shmfs.

I could add a Kconfig option. The option would be to enable the mshare feature. msharefs would automatically be enabled when
mshare is enabled, i.e. msharefs shouldn't be a visible Kconfig option. Do we see a reason to make mshare an optional
feature? If we can base hugetlbfs page table sharing on mshare in the future, this will not be an optional feature at that
time and the mshare Kconfig option will have to be removed.

Thanks,
Khalid
Khalid Aziz July 1, 2022, 4:08 p.m. UTC | #4
On 6/30/22 16:57, Al Viro wrote:
> On Wed, Jun 29, 2022 at 04:53:52PM -0600, Khalid Aziz wrote:
>> +static int
>> +msharefs_d_hash(const struct dentry *dentry, struct qstr *qstr)
>> +{
>> +	unsigned long hash = init_name_hash(dentry);
>> +	const unsigned char *s = qstr->name;
>> +	unsigned int len = qstr->len;
>> +
>> +	while (len--)
>> +		hash = partial_name_hash(*s++, hash);
>> +	qstr->hash = end_name_hash(hash);
>> +	return 0;
>> +}
> 
> What do you need that for and how is it different from letting it
> use full_name_hash() (which is what it will do if you leave
> dentry_operations->d_hash equal to NULL)?

I don't have a specific reason to use msharefs_d_hash(). If full_name_hash() can work, I don't mind reducing the amount of
code in my patch. I will take a look at it.

Thanks,
Khalid
diff mbox series

Patch

diff --git a/Documentation/filesystems/msharefs.rst b/Documentation/filesystems/msharefs.rst
new file mode 100644
index 000000000000..fd161f67045d
--- /dev/null
+++ b/Documentation/filesystems/msharefs.rst
@@ -0,0 +1,19 @@ 
+.. SPDX-License-Identifier: GPL-2.0
+
+=====================================================
+msharefs - a filesystem to support shared page tables
+=====================================================
+
+msharefs is a ram-based filesystem that allows multiple processes to
+share page table entries for shared pages.
+
+msharefs is typically mounted like this::
+
+	mount -t msharefs none /sys/fs/mshare
+
+When a process calls the mshare syscall with a name for the shared
+address range, a file with that name is created under msharefs. This
+file can be opened by another process, if permissions allow, to query
+the addresses shared under this range. These files are removed by the
+mshare_unlink syscall and cannot be deleted directly. Hence these
+files are created as immutable files.
diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
index f724129c0425..2a57a6ec6f3e 100644
--- a/include/uapi/linux/magic.h
+++ b/include/uapi/linux/magic.h
@@ -105,5 +105,6 @@ 
 #define Z3FOLD_MAGIC		0x33
 #define PPC_CMM_MAGIC		0xc7571590
 #define SECRETMEM_MAGIC		0x5345434d	/* "SECM" */
+#define MSHARE_MAGIC		0x4d534852	/* "MSHR" */
 
 #endif /* __LINUX_MAGIC_H__ */
diff --git a/mm/Makefile b/mm/Makefile
index 6f9ffa968a1a..51a2ab9080d9 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -37,7 +37,7 @@  CFLAGS_init-mm.o += $(call cc-disable-warning, override-init)
 CFLAGS_init-mm.o += $(call cc-disable-warning, initializer-overrides)
 
 mmu-y			:= nommu.o
-mmu-$(CONFIG_MMU)	:= highmem.o memory.o mincore.o \
+mmu-$(CONFIG_MMU)	:= highmem.o memory.o mincore.o mshare.o \
 			   mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \
 			   msync.o page_vma_mapped.o pagewalk.o \
 			   pgtable-generic.o rmap.o vmalloc.o
diff --git a/mm/mshare.c b/mm/mshare.c
new file mode 100644
index 000000000000..c8fab3869bab
--- /dev/null
+++ b/mm/mshare.c
@@ -0,0 +1,103 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Enable cooperating processes to share page tables between
+ * them to reduce the extra memory consumed by multiple copies
+ * of page tables.
+ *
+ * This code adds an in-memory filesystem - msharefs.
+ * msharefs is used to manage page table sharing.
+ *
+ *
+ * Copyright (C) 2022 Oracle Corp. All rights reserved.
+ * Author:	Khalid Aziz <khalid.aziz@oracle.com>
+ *
+ */
+
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/syscalls.h>
+#include <linux/uaccess.h>
+#include <linux/pseudo_fs.h>
+#include <linux/fileattr.h>
+#include <uapi/linux/magic.h>
+#include <uapi/linux/limits.h>
+
+static struct super_block *msharefs_sb;
+
+static const struct file_operations msharefs_file_operations = {
+	.open	= simple_open,
+	.llseek	= no_llseek,
+};
+
+static int
+msharefs_d_hash(const struct dentry *dentry, struct qstr *qstr)
+{
+	unsigned long hash = init_name_hash(dentry);
+	const unsigned char *s = qstr->name;
+	unsigned int len = qstr->len;
+
+	while (len--)
+		hash = partial_name_hash(*s++, hash);
+	qstr->hash = end_name_hash(hash);
+	return 0;
+}
+
+static const struct dentry_operations msharefs_d_ops = {
+	.d_hash = msharefs_d_hash,
+};
+
+static int
+msharefs_fill_super(struct super_block *sb, struct fs_context *fc)
+{
+	static const struct tree_descr empty_descr = {""};
+	int err;
+
+	sb->s_d_op = &msharefs_d_ops;
+	err = simple_fill_super(sb, MSHARE_MAGIC, &empty_descr);
+	if (err)
+		return err;
+
+	msharefs_sb = sb;
+	return 0;
+}
+
+static int
+msharefs_get_tree(struct fs_context *fc)
+{
+	return get_tree_single(fc, msharefs_fill_super);
+}
+
+static const struct fs_context_operations msharefs_context_ops = {
+	.get_tree	= msharefs_get_tree,
+};
+
+static int
+mshare_init_fs_context(struct fs_context *fc)
+{
+	fc->ops = &msharefs_context_ops;
+	return 0;
+}
+
+static struct file_system_type mshare_fs = {
+	.name			= "msharefs",
+	.init_fs_context	= mshare_init_fs_context,
+	.kill_sb		= kill_litter_super,
+};
+
+static int
+mshare_init(void)
+{
+	int ret = 0;
+
+	ret = sysfs_create_mount_point(fs_kobj, "mshare");
+	if (ret)
+		return ret;
+
+	ret = register_filesystem(&mshare_fs);
+	if (ret)
+		sysfs_remove_mount_point(fs_kobj, "mshare");
+
+	return ret;
+}
+
+fs_initcall(mshare_init);