Message ID | 20201203062949.5484-6-rppt@kernel.org (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | mm: introduce memfd_secret system call to create "secret" memory areas | expand |
On Thu, Dec 03, 2020 at 08:29:44AM +0200, Mike Rapoport wrote: > +static vm_fault_t secretmem_fault(struct vm_fault *vmf) > +{ > + struct address_space *mapping = vmf->vma->vm_file->f_mapping; > + struct inode *inode = file_inode(vmf->vma->vm_file); > + pgoff_t offset = vmf->pgoff; > + vm_fault_t ret = 0; > + unsigned long addr; > + struct page *page; > + int err; > + > + if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode)) > + return vmf_error(-EINVAL); > + > + page = find_get_page(mapping, offset); > + if (!page) { > + > + page = secretmem_alloc_page(vmf->gfp_mask); > + if (!page) > + return vmf_error(-ENOMEM); Just use VM_FAULT_OOM directly. > + err = add_to_page_cache(page, mapping, offset, vmf->gfp_mask); > + if (unlikely(err)) > + goto err_put_page; What if the error is EEXIST because somebody else raced with you to add a new page to the page cache? > + err = set_direct_map_invalid_noflush(page, 1); > + if (err) > + goto err_del_page_cache; Does this work correctly if somebody else has a reference to the page in the meantime? > + addr = (unsigned long)page_address(page); > + flush_tlb_kernel_range(addr, addr + PAGE_SIZE); > + > + __SetPageUptodate(page); Once you've added it to the cache, somebody else can come along and try to lock it. They will set PageWaiter. Now you call __SetPageUptodate and wipe out their PageWaiter bit. So you won't wake them up when you unlock. You can call __SetPageUptodate before adding it to the page cache, but once it's visible to another thread, you can't do that. > + ret = VM_FAULT_LOCKED; > + } > + > + vmf->page = page; You're supposed to return the page locked, so use find_lock_page() instead of find_get_page(). > + return ret; > + > +err_del_page_cache: > + delete_from_page_cache(page); > +err_put_page: > + put_page(page); > + return vmf_error(err); > +}
On Tue, Jan 19, 2021 at 08:22:13PM +0000, Matthew Wilcox wrote: > On Thu, Dec 03, 2020 at 08:29:44AM +0200, Mike Rapoport wrote: > > +static vm_fault_t secretmem_fault(struct vm_fault *vmf) > > +{ > > + struct address_space *mapping = vmf->vma->vm_file->f_mapping; > > + struct inode *inode = file_inode(vmf->vma->vm_file); > > + pgoff_t offset = vmf->pgoff; > > + vm_fault_t ret = 0; > > + unsigned long addr; > > + struct page *page; > > + int err; > > + > > + if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode)) > > + return vmf_error(-EINVAL); > > + > > + page = find_get_page(mapping, offset); > > + if (!page) { > > + > > + page = secretmem_alloc_page(vmf->gfp_mask); > > + if (!page) > > + return vmf_error(-ENOMEM); > > Just use VM_FAULT_OOM directly. Ok. > > + err = add_to_page_cache(page, mapping, offset, vmf->gfp_mask); > > + if (unlikely(err)) > > + goto err_put_page; > > What if the error is EEXIST because somebody else raced with you to add > a new page to the page cache? Right, for -EEXIST I need a retry here, thanks. > > + err = set_direct_map_invalid_noflush(page, 1); > > + if (err) > > + goto err_del_page_cache; > > Does this work correctly if somebody else has a reference to the page > in the meantime? Yes, it does. If somebody else won the race that page was dropped from the direct map and this call would be essentially a nop. And anyway, the very next patch changes the way pages are removed from the direct map ;-) > > + addr = (unsigned long)page_address(page); > > + flush_tlb_kernel_range(addr, addr + PAGE_SIZE); > > + > > + __SetPageUptodate(page); > > Once you've added it to the cache, somebody else can come along and try > to lock it. They will set PageWaiter. Now you call __SetPageUptodate > and wipe out their PageWaiter bit. So you won't wake them up when you > unlock. > > You can call __SetPageUptodate before adding it to the page cache, > but once it's visible to another thread, you can't do that. Will fix. > > + ret = VM_FAULT_LOCKED; > > + } > > + > > + vmf->page = page; > > You're supposed to return the page locked, so use find_lock_page() instead > of find_get_page(). Ok/ > > + return ret; > > + > > +err_del_page_cache: > > + delete_from_page_cache(page); > > +err_put_page: > > + put_page(page); > > + return vmf_error(err); > > +}
On Wed, Jan 20, 2021 at 05:05:10PM +0200, Mike Rapoport wrote: > On Tue, Jan 19, 2021 at 08:22:13PM +0000, Matthew Wilcox wrote: > > On Thu, Dec 03, 2020 at 08:29:44AM +0200, Mike Rapoport wrote: > > > +static vm_fault_t secretmem_fault(struct vm_fault *vmf) > > > +{ > > > + struct address_space *mapping = vmf->vma->vm_file->f_mapping; > > > + struct inode *inode = file_inode(vmf->vma->vm_file); > > > + pgoff_t offset = vmf->pgoff; > > > + vm_fault_t ret = 0; > > > + unsigned long addr; > > > + struct page *page; > > > + int err; > > > + > > > + if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode)) > > > + return vmf_error(-EINVAL); > > > + > > > + page = find_get_page(mapping, offset); > > > + if (!page) { > > > + > > > + page = secretmem_alloc_page(vmf->gfp_mask); > > > + if (!page) > > > + return vmf_error(-ENOMEM); > > > > Just use VM_FAULT_OOM directly. > > Ok. > > > > + err = add_to_page_cache(page, mapping, offset, vmf->gfp_mask); > > > + if (unlikely(err)) > > > + goto err_put_page; > > > > What if the error is EEXIST because somebody else raced with you to add > > a new page to the page cache? > > Right, for -EEXIST I need a retry here, thanks. > > > > + err = set_direct_map_invalid_noflush(page, 1); > > > + if (err) > > > + goto err_del_page_cache; > > > > Does this work correctly if somebody else has a reference to the page > > in the meantime? > > Yes, it does. If somebody else won the race that page was dropped from the > direct map and this call would be essentially a nop. And anyway, the very > next patch changes the way pages are removed from the direct map ;-) What I'm thinking is: thread A page faults doesn't find page allocates page adds page to page cache thread B page faults does find page in page cache set direct map invalid fails deletes from page cache ... ?
On Wed, Jan 20, 2021 at 04:02:10PM +0000, Matthew Wilcox wrote: > On Wed, Jan 20, 2021 at 05:05:10PM +0200, Mike Rapoport wrote: > > On Tue, Jan 19, 2021 at 08:22:13PM +0000, Matthew Wilcox wrote: > > > On Thu, Dec 03, 2020 at 08:29:44AM +0200, Mike Rapoport wrote: > > > > +static vm_fault_t secretmem_fault(struct vm_fault *vmf) > > > > +{ > > > > + struct address_space *mapping = vmf->vma->vm_file->f_mapping; > > > > + struct inode *inode = file_inode(vmf->vma->vm_file); > > > > + pgoff_t offset = vmf->pgoff; > > > > + vm_fault_t ret = 0; > > > > + unsigned long addr; > > > > + struct page *page; > > > > + int err; > > > > + > > > > + if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode)) > > > > + return vmf_error(-EINVAL); > > > > + > > > > + page = find_get_page(mapping, offset); > > > > + if (!page) { > > > > + > > > > + page = secretmem_alloc_page(vmf->gfp_mask); > > > > + if (!page) > > > > + return vmf_error(-ENOMEM); > > > > > > Just use VM_FAULT_OOM directly. > > > > Ok. > > > > > > + err = add_to_page_cache(page, mapping, offset, vmf->gfp_mask); > > > > + if (unlikely(err)) > > > > + goto err_put_page; > > > > > > What if the error is EEXIST because somebody else raced with you to add > > > a new page to the page cache? > > > > Right, for -EEXIST I need a retry here, thanks. > > > > > > + err = set_direct_map_invalid_noflush(page, 1); > > > > + if (err) > > > > + goto err_del_page_cache; > > > > > > Does this work correctly if somebody else has a reference to the page > > > in the meantime? > > > > Yes, it does. If somebody else won the race that page was dropped from the > > direct map and this call would be essentially a nop. And anyway, the very > > next patch changes the way pages are removed from the direct map ;-) > > What I'm thinking is: > > thread A page faults > doesn't find page > allocates page > adds page to page cache > thread B page faults > does find page in page cache > set direct map invalid fails > deletes from page cache > ... ? Hmm, this is not nice indeed...
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 34d5fb82f674..7d781fea79c2 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -41,7 +41,7 @@ config FORCE_DYNAMIC_FTRACE in order to test the non static function tracing in the generic code, as other architectures still use it. But we only need to keep it around for x86_64. No need to keep it - for x86_32. For x86_32, force DYNAMIC_FTRACE. + for x86_32. For x86_32, force DYNAMIC_FTRACE. # # Arch settings # diff --git a/include/linux/secretmem.h b/include/linux/secretmem.h new file mode 100644 index 000000000000..70e7db9f94fe --- /dev/null +++ b/include/linux/secretmem.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _LINUX_SECRETMEM_H +#define _LINUX_SECRETMEM_H + +#ifdef CONFIG_SECRETMEM + +bool vma_is_secretmem(struct vm_area_struct *vma); +bool page_is_secretmem(struct page *page); + +#else + +static inline bool vma_is_secretmem(struct vm_area_struct *vma) +{ + return false; +} + +static inline bool page_is_secretmem(struct page *page) +{ + return false; +} + +#endif /* CONFIG_SECRETMEM */ + +#endif /* _LINUX_SECRETMEM_H */ diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h index f3956fc11de6..35687dcb1a42 100644 --- a/include/uapi/linux/magic.h +++ b/include/uapi/linux/magic.h @@ -97,5 +97,6 @@ #define DEVMEM_MAGIC 0x454d444d /* "DMEM" */ #define Z3FOLD_MAGIC 0x33 #define PPC_CMM_MAGIC 0xc7571590 +#define SECRETMEM_MAGIC 0x5345434d /* "SECM" */ #endif /* __LINUX_MAGIC_H__ */ diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 2dd6cbb8cabc..805fd7a668be 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -353,6 +353,8 @@ COND_SYSCALL(pkey_mprotect); COND_SYSCALL(pkey_alloc); COND_SYSCALL(pkey_free); +/* memfd_secret */ +COND_SYSCALL(memfd_secret); /* * Architecture specific weak syscall entries. diff --git a/mm/Kconfig b/mm/Kconfig index c89c5444924b..d8d170fa5210 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -884,4 +884,7 @@ config ARCH_HAS_HUGEPD config MAPPING_DIRTY_HELPERS bool +config SECRETMEM + def_bool ARCH_HAS_SET_DIRECT_MAP && !EMBEDDED + endmenu diff --git a/mm/Makefile b/mm/Makefile index 6eeb4b29efb8..dfda14c48a75 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -121,3 +121,4 @@ obj-$(CONFIG_MEMFD_CREATE) += memfd.o obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o obj-$(CONFIG_PTDUMP_CORE) += ptdump.o obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o +obj-$(CONFIG_SECRETMEM) += secretmem.o diff --git a/mm/gup.c b/mm/gup.c index 5ec98de1e5de..71164fa83114 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -10,6 +10,7 @@ #include <linux/rmap.h> #include <linux/swap.h> #include <linux/swapops.h> +#include <linux/secretmem.h> #include <linux/sched/signal.h> #include <linux/rwsem.h> @@ -793,6 +794,9 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, struct follow_page_context ctx = { NULL }; struct page *page; + if (vma_is_secretmem(vma)) + return NULL; + page = follow_page_mask(vma, address, foll_flags, &ctx); if (ctx.pgmap) put_dev_pagemap(ctx.pgmap); @@ -923,6 +927,9 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) if (gup_flags & FOLL_ANON && !vma_is_anonymous(vma)) return -EFAULT; + if (vma_is_secretmem(vma)) + return -EFAULT; + if (write) { if (!(vm_flags & VM_WRITE)) { if (!(gup_flags & FOLL_FORCE)) @@ -2196,6 +2203,9 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, VM_BUG_ON(!pfn_valid(pte_pfn(pte))); page = pte_page(pte); + if (page_is_secretmem(page)) + goto pte_unmap; + head = try_grab_compound_head(page, 1, flags); if (!head) goto pte_unmap; diff --git a/mm/secretmem.c b/mm/secretmem.c new file mode 100644 index 000000000000..781aaaca8c70 --- /dev/null +++ b/mm/secretmem.c @@ -0,0 +1,273 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corporation, 2020 + * + * Author: Mike Rapoport <rppt@linux.ibm.com> + */ + +#include <linux/mm.h> +#include <linux/fs.h> +#include <linux/mount.h> +#include <linux/memfd.h> +#include <linux/bitops.h> +#include <linux/printk.h> +#include <linux/pagemap.h> +#include <linux/syscalls.h> +#include <linux/pseudo_fs.h> +#include <linux/secretmem.h> +#include <linux/set_memory.h> +#include <linux/sched/signal.h> + +#include <uapi/linux/magic.h> + +#include <asm/tlbflush.h> + +#include "internal.h" + +#undef pr_fmt +#define pr_fmt(fmt) "secretmem: " fmt + +/* + * Define mode and flag masks to allow validation of the system call + * parameters. + */ +#define SECRETMEM_MODE_MASK (0x0) +#define SECRETMEM_FLAGS_MASK SECRETMEM_MODE_MASK + +struct secretmem_ctx { + unsigned int mode; +}; + +static struct page *secretmem_alloc_page(gfp_t gfp) +{ + /* + * FIXME: use a cache of large pages to reduce the direct map + * fragmentation + */ + return alloc_page(gfp); +} + +static vm_fault_t secretmem_fault(struct vm_fault *vmf) +{ + struct address_space *mapping = vmf->vma->vm_file->f_mapping; + struct inode *inode = file_inode(vmf->vma->vm_file); + pgoff_t offset = vmf->pgoff; + vm_fault_t ret = 0; + unsigned long addr; + struct page *page; + int err; + + if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode)) + return vmf_error(-EINVAL); + + page = find_get_page(mapping, offset); + if (!page) { + + page = secretmem_alloc_page(vmf->gfp_mask); + if (!page) + return vmf_error(-ENOMEM); + + err = add_to_page_cache(page, mapping, offset, vmf->gfp_mask); + if (unlikely(err)) + goto err_put_page; + + err = set_direct_map_invalid_noflush(page, 1); + if (err) + goto err_del_page_cache; + + addr = (unsigned long)page_address(page); + flush_tlb_kernel_range(addr, addr + PAGE_SIZE); + + __SetPageUptodate(page); + + ret = VM_FAULT_LOCKED; + } + + vmf->page = page; + return ret; + +err_del_page_cache: + delete_from_page_cache(page); +err_put_page: + put_page(page); + return vmf_error(err); +} + +static const struct vm_operations_struct secretmem_vm_ops = { + .fault = secretmem_fault, +}; + +static int secretmem_mmap(struct file *file, struct vm_area_struct *vma) +{ + unsigned long len = vma->vm_end - vma->vm_start; + + if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0) + return -EINVAL; + + if (mlock_future_check(vma->vm_mm, vma->vm_flags | VM_LOCKED, len)) + return -EAGAIN; + + vma->vm_ops = &secretmem_vm_ops; + vma->vm_flags |= VM_LOCKED; + + return 0; +} + +bool vma_is_secretmem(struct vm_area_struct *vma) +{ + return vma->vm_ops == &secretmem_vm_ops; +} + +static const struct file_operations secretmem_fops = { + .mmap = secretmem_mmap, +}; + +static bool secretmem_isolate_page(struct page *page, isolate_mode_t mode) +{ + return false; +} + +static int secretmem_migratepage(struct address_space *mapping, + struct page *newpage, struct page *page, + enum migrate_mode mode) +{ + return -EBUSY; +} + +static void secretmem_freepage(struct page *page) +{ + set_direct_map_default_noflush(page, 1); + clear_highpage(page); +} + +static const struct address_space_operations secretmem_aops = { + .freepage = secretmem_freepage, + .migratepage = secretmem_migratepage, + .isolate_page = secretmem_isolate_page, +}; + +bool page_is_secretmem(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + + if (!mapping) + return false; + + return mapping->a_ops == &secretmem_aops; +} + +static struct vfsmount *secretmem_mnt; + +static struct file *secretmem_file_create(unsigned long flags) +{ + struct file *file = ERR_PTR(-ENOMEM); + struct secretmem_ctx *ctx; + struct inode *inode; + + inode = alloc_anon_inode(secretmem_mnt->mnt_sb); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + goto err_free_inode; + + file = alloc_file_pseudo(inode, secretmem_mnt, "secretmem", + O_RDWR, &secretmem_fops); + if (IS_ERR(file)) + goto err_free_ctx; + + mapping_set_unevictable(inode->i_mapping); + + inode->i_mapping->private_data = ctx; + inode->i_mapping->a_ops = &secretmem_aops; + + /* pretend we are a normal file with zero size */ + inode->i_mode |= S_IFREG; + inode->i_size = 0; + + file->private_data = ctx; + + ctx->mode = flags & SECRETMEM_MODE_MASK; + + return file; + +err_free_ctx: + kfree(ctx); +err_free_inode: + iput(inode); + return file; +} + +SYSCALL_DEFINE1(memfd_secret, unsigned long, flags) +{ + struct file *file; + int fd, err; + + /* make sure local flags do not confict with global fcntl.h */ + BUILD_BUG_ON(SECRETMEM_FLAGS_MASK & O_CLOEXEC); + + if (flags & ~(SECRETMEM_FLAGS_MASK | O_CLOEXEC)) + return -EINVAL; + + fd = get_unused_fd_flags(flags & O_CLOEXEC); + if (fd < 0) + return fd; + + file = secretmem_file_create(flags); + if (IS_ERR(file)) { + err = PTR_ERR(file); + goto err_put_fd; + } + + file->f_flags |= O_LARGEFILE; + + fd_install(fd, file); + return fd; + +err_put_fd: + put_unused_fd(fd); + return err; +} + +static void secretmem_evict_inode(struct inode *inode) +{ + struct secretmem_ctx *ctx = inode->i_private; + + truncate_inode_pages_final(&inode->i_data); + clear_inode(inode); + kfree(ctx); +} + +static const struct super_operations secretmem_super_ops = { + .evict_inode = secretmem_evict_inode, +}; + +static int secretmem_init_fs_context(struct fs_context *fc) +{ + struct pseudo_fs_context *ctx = init_pseudo(fc, SECRETMEM_MAGIC); + + if (!ctx) + return -ENOMEM; + ctx->ops = &secretmem_super_ops; + + return 0; +} + +static struct file_system_type secretmem_fs = { + .name = "secretmem", + .init_fs_context = secretmem_init_fs_context, + .kill_sb = kill_anon_super, +}; + +static int secretmem_init(void) +{ + int ret = 0; + + secretmem_mnt = kern_mount(&secretmem_fs); + if (IS_ERR(secretmem_mnt)) + ret = PTR_ERR(secretmem_mnt); + + return ret; +} +fs_initcall(secretmem_init);