diff mbox series

[RFC,2/6] mm: Add msharefs filesystem

Message ID 32784ee26d895bae2484e15fef205d5590720c4b.1642526745.git.khalid.aziz@oracle.com (mailing list archive)
State New
Headers show
Series Add support for shared PTEs across processes | expand

Commit Message

Khalid Aziz Jan. 18, 2022, 9:19 p.m. UTC
Add a ram-based filesystem that contains the files created for each
shared address range. This patch adds only the filesystem and the
creation of files; page table entries for the shared ranges created by
the mshare syscall are not shared yet.

Signed-off-by: Khalid Aziz <khalid.aziz@oracle.com>
---
 Documentation/filesystems/msharefs.rst |  19 +++
 include/uapi/linux/magic.h             |   1 +
 mm/mshare.c                            | 191 +++++++++++++++++++++++--
 3 files changed, 197 insertions(+), 14 deletions(-)
 create mode 100644 Documentation/filesystems/msharefs.rst
diff mbox series

Patch

diff --git a/Documentation/filesystems/msharefs.rst b/Documentation/filesystems/msharefs.rst
new file mode 100644
index 000000000000..fd161f67045d
--- /dev/null
+++ b/Documentation/filesystems/msharefs.rst
@@ -0,0 +1,19 @@ 
+.. SPDX-License-Identifier: GPL-2.0
+
+=====================================================
+msharefs - a filesystem to support shared page tables
+=====================================================
+
+msharefs is a ram-based filesystem that allows multiple processes to
+share page table entries for shared pages.
+
+msharefs is typically mounted like this::
+
+	mount -t msharefs none /sys/fs/mshare
+
+When a process calls the mshare syscall with a name for the shared
+address range, a file with that name is created under msharefs.
+This file can be opened by another process, if permissions allow, to
+query the addresses shared under this range. These files are removed
+by the mshare_unlink syscall and cannot be deleted directly, which is
+why they are created as immutable files.
diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
index 35687dcb1a42..26a12e33a3c1 100644
--- a/include/uapi/linux/magic.h
+++ b/include/uapi/linux/magic.h
@@ -98,5 +98,6 @@ 
 #define Z3FOLD_MAGIC		0x33
 #define PPC_CMM_MAGIC		0xc7571590
 #define SECRETMEM_MAGIC		0x5345434d	/* "SECM" */
+#define MSHARE_MAGIC		0x4d534852	/* "MSHR" */
 
 #endif /* __LINUX_MAGIC_H__ */
diff --git a/mm/mshare.c b/mm/mshare.c
index c723f8369f06..e48d0f615f9f 100644
--- a/mm/mshare.c
+++ b/mm/mshare.c
@@ -10,20 +10,117 @@ 
  *		Khalid Aziz
  */
 
-#include <linux/anon_inodes.h>
 #include <linux/fs.h>
+#include <linux/mount.h>
 #include <linux/syscalls.h>
+#include <linux/uaccess.h>
+#include <linux/pseudo_fs.h>
+#include <linux/fileattr.h>
+#include <uapi/linux/magic.h>
+#include <uapi/linux/limits.h>
 
-static const struct file_operations mshare_fops = {
+static struct super_block *msharefs_sb;
+
+/*
+ * File operations installed on msharefs inodes: simple_open handles
+ * open and no_llseek handles llseek; no other operation is set here.
+ */
+static const struct file_operations msharefs_file_operations = {
+	.open	= simple_open,
+	.llseek	= no_llseek,
 };
 
+/*
+ * d_hash callback: hash every byte of @qstr->name, starting from
+ * init_name_hash(@dentry), and store the result in @qstr->hash.
+ * Always returns 0.
+ */
+static int
+msharefs_d_hash(const struct dentry *dentry, struct qstr *qstr)
+{
+	const unsigned char *name = qstr->name;
+	unsigned long acc = init_name_hash(dentry);
+	unsigned int i;
+
+	for (i = 0; i < qstr->len; i++)
+		acc = partial_name_hash(name[i], acc);
+
+	qstr->hash = end_name_hash(acc);
+	return 0;
+}
+
+/*
+ * Allocate a dentry called @name under @parent.
+ *
+ * The name is hashed with msharefs_d_hash() before d_alloc() is called.
+ * Returns the new dentry, the hashing error wrapped in ERR_PTR(), or
+ * ERR_PTR(-ENOMEM) when d_alloc() returns NULL.
+ */
+static struct dentry
+*msharefs_alloc_dentry(struct dentry *parent, const char *name)
+{
+	struct dentry *dentry;
+	struct qstr qname;
+	int ret;
+
+	qname.name = name;
+	qname.len = strlen(name);
+
+	ret = msharefs_d_hash(parent, &qname);
+	if (ret)
+		return ERR_PTR(ret);
+
+	dentry = d_alloc(parent, &qname);
+	if (!dentry)
+		return ERR_PTR(-ENOMEM);
+
+	return dentry;
+}
+
+/*
+ * Allocate and initialise an inode on @sb with file mode @mode.
+ *
+ * The inode gets a fresh inode number, is flagged S_IMMUTABLE, is
+ * stamped with the current time, uses msharefs_file_operations, has
+ * size 0 and is owned by the caller's fsuid/fsgid.
+ *
+ * Returns the inode, or NULL when new_inode() returns NULL.
+ */
+static struct inode
+*msharefs_get_inode(struct super_block *sb, int mode)
+{
+	struct inode *node = new_inode(sb);
+
+	if (!node)
+		return NULL;
+
+	node->i_ino = get_next_ino();
+	node->i_mode = mode;
+
+	/*
+	 * Userspace is not supposed to modify msharefs files; reading
+	 * is the only operation intended, hence the immutable flag.
+	 */
+	node->i_flags = S_IMMUTABLE;
+
+	node->i_ctime = current_time(node);
+	node->i_mtime = node->i_ctime;
+	node->i_atime = node->i_ctime;
+
+	node->i_fop = &msharefs_file_operations;
+	node->i_size = 0;
+	node->i_uid = current_fsuid();
+	node->i_gid = current_fsgid();
+
+	return node;
+}
+
+/*
+ * Create a regular file with mode 0400 called @name in the root
+ * directory of the msharefs superblock.
+ *
+ * @flags is accepted but not used yet.
+ *
+ * Returns 0 on success, -ENODEV if no msharefs superblock has been
+ * set up, -ENOMEM if the inode cannot be allocated, or the error
+ * reported by msharefs_alloc_dentry().
+ */
+static int
+mshare_file_create(const char *name, unsigned long flags)
+{
+	struct inode *inode;
+	struct dentry *dentry;
+
+	/*
+	 * msharefs_sb is assigned in msharefs_fill_super(); until that
+	 * has run there is no root directory to create the file in.
+	 */
+	if (!msharefs_sb)
+		return -ENODEV;
+
+	/* msharefs_get_inode() reports failure as NULL, not ERR_PTR() */
+	inode = msharefs_get_inode(msharefs_sb, S_IFREG | 0400);
+	if (!inode)
+		return -ENOMEM;
+
+	dentry = msharefs_alloc_dentry(msharefs_sb->s_root, name);
+	if (IS_ERR(dentry)) {
+		/* Drop the inode that never got attached to a dentry */
+		iput(inode);
+		return PTR_ERR(dentry);
+	}
+
+	d_add(dentry, inode);
+	return 0;
+}
+
 /*
- * mshare syscall. Returns a file descriptor
+ * mshare syscall
  */
-SYSCALL_DEFINE5(mshare, const char *, name, unsigned long, addr,
+SYSCALL_DEFINE5(mshare, const char __user *, name, unsigned long, addr,
 		unsigned long, len, int, oflag, mode_t, mode)
 {
-	int fd;
+	char mshare_name[NAME_MAX];
+	int err;
 
 	/*
 	 * Address range being shared must be aligned to pgdir
@@ -32,15 +129,14 @@  SYSCALL_DEFINE5(mshare, const char *, name, unsigned long, addr,
 	if ((addr | len) & (PGDIR_SIZE - 1))
 		return -EINVAL;
 
-	/*
-	 * Allocate a file descriptor to return
-	 *
-	 * TODO: This code ignores the object name completely. Add
-	 * support for that
-	 */
-	fd = anon_inode_getfd("mshare", &mshare_fops, NULL, O_RDWR);
+	err = copy_from_user(mshare_name, name, NAME_MAX);
+	if (err)
+		goto err_out;
 
-	return fd;
+	err = mshare_file_create(mshare_name, oflag);
+
+err_out:
+	return err;
 }
 
 /*
@@ -48,7 +144,8 @@  SYSCALL_DEFINE5(mshare, const char *, name, unsigned long, addr,
  */
 SYSCALL_DEFINE1(mshare_unlink, const char *, name)
 {
-	int fd;
+	char mshare_name[NAME_MAX];
+	int err;
 
 	/*
 	 * Delete the named object
@@ -56,5 +153,71 @@  SYSCALL_DEFINE1(mshare_unlink, const char *, name)
 	 * TODO: Mark mshare'd range for deletion
 	 *
 	 */
+	err = copy_from_user(mshare_name, name, NAME_MAX);
+	if (err)
+		goto err_out;
+	return 0;
+
+err_out:
+	return err;
+}
+
+/* Dentry operations for msharefs; only the name hash callback is set. */
+static const struct dentry_operations msharefs_d_ops = {
+	.d_hash = msharefs_d_hash,
+};
+
+/*
+ * Fill in the msharefs superblock @sb.
+ *
+ * Installs msharefs_d_ops as the dentry operations, lets
+ * simple_fill_super() set the superblock up with MSHARE_MAGIC and an
+ * empty file table, and on success records @sb in msharefs_sb.
+ * Returns 0 or the error from simple_fill_super().
+ */
+static int
+msharefs_fill_super(struct super_block *sb, struct fs_context *fc)
+{
+	static const struct tree_descr no_files = {""};
+	int ret;
+
+	sb->s_d_op = &msharefs_d_ops;
+
+	ret = simple_fill_super(sb, MSHARE_MAGIC, &no_files);
+	if (!ret)
+		msharefs_sb = sb;
+
+	return ret;
+}
+
+/*
+ * get_tree hook: hands @fc to get_tree_single() with
+ * msharefs_fill_super() as the fill callback and returns its result.
+ */
+static int
+msharefs_get_tree(struct fs_context *fc)
+{
+	return get_tree_single(fc, msharefs_fill_super);
+}
+
+/* fs_context operations for msharefs; only get_tree is provided. */
+static const struct fs_context_operations msharefs_context_ops = {
+	.get_tree	= msharefs_get_tree,
+};
+
+/*
+ * init_fs_context hook for msharefs: installs msharefs_context_ops on
+ * the new fs_context and returns 0.
+ */
+static int
+mshare_init_fs_context(struct fs_context *fc)
+{
+	fc->ops = &msharefs_context_ops;
 	return 0;
 }
+
+/*
+ * Filesystem type registered by mshare_init() under the name
+ * "msharefs"; superblocks are torn down with kill_litter_super.
+ */
+static struct file_system_type mshare_fs = {
+	.name			= "msharefs",
+	.init_fs_context	= mshare_init_fs_context,
+	.kill_sb		= kill_litter_super,
+};
+
+/*
+ * Initcall: create the "mshare" mount point under fs_kobj and register
+ * the msharefs filesystem type. If registration fails, the mount point
+ * is removed again. Returns 0 on success or the first error seen.
+ */
+static int
+mshare_init(void)
+{
+	int err;
+
+	err = sysfs_create_mount_point(fs_kobj, "mshare");
+	if (err)
+		return err;
+
+	err = register_filesystem(&mshare_fs);
+	if (!err)
+		return 0;
+
+	/* Registration failed: undo the mount point created above. */
+	sysfs_remove_mount_point(fs_kobj, "mshare");
+	return err;
+}
+
+fs_initcall(mshare_init);