Message ID | 5c96dd165d7ec3da14306b8fd857c7eb95a8c3e6.1649370874.git.khalid.aziz@oracle.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | Add support for shared PTEs across processes | expand |
On 4/11/22 09:05, Khalid Aziz wrote: > This patch adds basic page table sharing across tasks by making > mshare syscall. It does this by creating a new mm_struct which > hosts the shared vmas and page tables. This mm_struct is > maintained as long as there is at least one task using the mshare'd > range. It is cleaned up by the last mshare_unlink syscall. This sounds like a really good idea because it (in theory) totally separates the lifetime of the *source* of the page tables from the lifetime of the process that creates the mshare. > diff --git a/mm/internal.h b/mm/internal.h > index cf50a471384e..68f82f0f8b66 100644 > --- a/mm/internal.h > +++ b/mm/internal.h > @@ -718,6 +718,8 @@ void vunmap_range_noflush(unsigned long start, unsigned long end); > int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, > unsigned long addr, int page_nid, int *flags); > > +extern vm_fault_t find_shared_vma(struct vm_area_struct **vma, > + unsigned long *addrp); > static inline bool vma_is_shared(const struct vm_area_struct *vma) > { > return vma->vm_flags & VM_SHARED_PT; > diff --git a/mm/memory.c b/mm/memory.c > index c125c4969913..c77c0d643ea8 100644 > --- a/mm/memory.c > +++ b/mm/memory.c > @@ -4776,6 +4776,7 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, > unsigned int flags, struct pt_regs *regs) > { > vm_fault_t ret; > + bool shared = false; > > __set_current_state(TASK_RUNNING); > > @@ -4785,6 +4786,15 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, > /* do counter updates before entering really critical section. */ > check_sync_rss_stat(current); > > + if (unlikely(vma_is_shared(vma))) { > + ret = find_shared_vma(&vma, &address); > + if (ret) > + return ret; > + if (!vma) > + return VM_FAULT_SIGSEGV; > + shared = true; > + } > + > if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE, > flags & FAULT_FLAG_INSTRUCTION, > flags & FAULT_FLAG_REMOTE)) > @@ -4802,6 +4812,31 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, > else > ret = __handle_mm_fault(vma, address, flags); > > + /* > + * Release the read lock on shared VMA's parent mm unless > + * __handle_mm_fault released the lock already. > + * __handle_mm_fault sets VM_FAULT_RETRY in return value if > + * it released mmap lock. If lock was released, that implies > + * the lock would have been released on task's original mm if > + * this were not a shared PTE vma. To keep lock state consistent, > + * make sure to release the lock on task's original mm > + */ > + if (shared) { > + int release_mmlock = 1; > + > + if (!(ret & VM_FAULT_RETRY)) { > + mmap_read_unlock(vma->vm_mm); > + release_mmlock = 0; > + } else if ((flags & FAULT_FLAG_ALLOW_RETRY) && > + (flags & FAULT_FLAG_RETRY_NOWAIT)) { > + mmap_read_unlock(vma->vm_mm); > + release_mmlock = 0; > + } > + > + if (release_mmlock) > + mmap_read_unlock(current->mm); > + } Are we guaranteed that current->mm == the original vma->vm_mm? Just a quick scan of handle_mm_fault() users shows a few suspect ones like hmm_range_fault() or iommu_v2.c::do_fault(). > diff --git a/mm/mshare.c b/mm/mshare.c > index cd2f7ad24d9d..d1896adcb00f 100644 > --- a/mm/mshare.c > +++ b/mm/mshare.c > @@ -17,18 +17,49 @@ > #include <linux/pseudo_fs.h> > #include <linux/fileattr.h> > #include <linux/refcount.h> > +#include <linux/mman.h> > #include <linux/sched/mm.h> > #include <uapi/linux/magic.h> > #include <uapi/linux/limits.h> > > struct mshare_data { > - struct mm_struct *mm; > + struct mm_struct *mm, *host_mm; > mode_t mode; > refcount_t refcnt; > }; > > static struct super_block *msharefs_sb; > > +/* Returns holding the host mm's lock for read. Caller must release. */ > +vm_fault_t > +find_shared_vma(struct vm_area_struct **vmap, unsigned long *addrp) > +{ > + struct vm_area_struct *vma, *guest = *vmap; > + struct mshare_data *info = guest->vm_private_data; > + struct mm_struct *host_mm = info->mm; > + unsigned long host_addr; > + pgd_t *pgd, *guest_pgd; > + > + host_addr = *addrp - guest->vm_start + host_mm->mmap_base; > + pgd = pgd_offset(host_mm, host_addr); > + guest_pgd = pgd_offset(current->mm, *addrp); > + if (!pgd_same(*guest_pgd, *pgd)) { > + set_pgd(guest_pgd, *pgd); > + return VM_FAULT_NOPAGE; > + } Is digging around in the other process's page tables OK without holding any locks? > + *addrp = host_addr; > + mmap_read_lock(host_mm); > + vma = find_vma(host_mm, host_addr); > + > + /* XXX: expand stack? */ > + if (vma && vma->vm_start > host_addr) > + vma = NULL; > + > + *vmap = vma; > + return 0; > +} > + > static void > msharefs_evict_inode(struct inode *inode) > { > @@ -169,11 +200,13 @@ SYSCALL_DEFINE5(mshare, const char __user *, name, unsigned long, addr, > unsigned long, len, int, oflag, mode_t, mode) > { > struct mshare_data *info; > - struct mm_struct *mm; > struct filename *fname = getname(name); > struct dentry *dentry; > struct inode *inode; > struct qstr namestr; > + struct vm_area_struct *vma, *next, *new_vma; > + struct mm_struct *new_mm; > + unsigned long end; > int err = PTR_ERR(fname); > > /* > @@ -193,6 +226,8 @@ SYSCALL_DEFINE5(mshare, const char __user *, name, unsigned long, addr, > if (IS_ERR(fname)) > goto err_out; > > + end = addr + len; > + > /* > * Does this mshare entry exist already? If it does, calling > * mshare with O_EXCL|O_CREAT is an error > @@ -205,49 +240,165 @@ SYSCALL_DEFINE5(mshare, const char __user *, name, unsigned long, addr, > inode_lock(d_inode(msharefs_sb->s_root)); > dentry = d_lookup(msharefs_sb->s_root, &namestr); > if (dentry && (oflag & (O_EXCL|O_CREAT))) { > + inode = d_inode(dentry); > err = -EEXIST; > dput(dentry); > goto err_unlock_inode; > } > > if (dentry) { > + unsigned long mapaddr, prot = PROT_NONE; > + > inode = d_inode(dentry); > if (inode == NULL) { > + mmap_write_unlock(current->mm); > err = -EINVAL; > goto err_out; > } > info = inode->i_private; > - refcount_inc(&info->refcnt); > dput(dentry); > + > + /* > + * Map in the address range as anonymous mappings > + */ > + oflag &= (O_RDONLY | O_WRONLY | O_RDWR); > + if (oflag & O_RDONLY) > + prot |= PROT_READ; > + else if (oflag & O_WRONLY) > + prot |= PROT_WRITE; > + else if (oflag & O_RDWR) > + prot |= (PROT_READ | PROT_WRITE); > + mapaddr = vm_mmap(NULL, addr, len, prot, > + MAP_FIXED | MAP_SHARED | MAP_ANONYMOUS, 0); > + if (IS_ERR((void *)mapaddr)) { > + err = -EINVAL; > + goto err_out; > + } > + > + refcount_inc(&info->refcnt); > + > + /* > + * Now that we have mmap'd the mshare'd range, update vma > + * flags and vm_mm pointer for this mshare'd range. > + */ > + mmap_write_lock(current->mm); > + vma = find_vma(current->mm, addr); > + if (vma && vma->vm_start < addr) { > + mmap_write_unlock(current->mm); > + err = -EINVAL; > + goto err_out; > + } How do you know that this is the same anonymous VMA that you set up above? Couldn't it have been unmapped and remapped to be something random before the mmap_write_lock() is reacquired? > + while (vma && vma->vm_start < (addr + len)) { > + vma->vm_private_data = info; > + vma->vm_mm = info->mm; > + vma->vm_flags |= VM_SHARED_PT; > + next = vma->vm_next; > + vma = next; > + } This vma is still in the mm->mm_rb tree, right? I'm kinda surprised that it's OK to have a VMA in mm->vm_rb have vma->vm_mm!=mm. > } else { > - mm = mm_alloc(); > - if (!mm) > + unsigned long myaddr; > + struct mm_struct *old_mm; > + > + old_mm = current->mm; > + new_mm = mm_alloc(); > + if (!new_mm) > return -ENOMEM; > info = kzalloc(sizeof(*info), GFP_KERNEL); > if (!info) { > err = -ENOMEM; > goto err_relmm; > } > - mm->mmap_base = addr; > - mm->task_size = addr + len; > - if (!mm->task_size) > - mm->task_size--; > - info->mm = mm; > + new_mm->mmap_base = addr; > + new_mm->task_size = addr + len; > + if (!new_mm->task_size) > + new_mm->task_size--; > + info->mm = new_mm; > + info->host_mm = old_mm; > info->mode = mode; > refcount_set(&info->refcnt, 1); > + > + /* > + * VMAs for this address range may or may not exist. > + * If VMAs exist, they should be marked as shared at > + * this point and page table info should be copied > + * over to newly created mm_struct. TODO: If VMAs do not > + * exist, create them and mark them as shared. > + */ At this point, there are just too many TODO's in this series to look at it seriously. I think what you have here is an entertaining proof-of-concept, but it's looking to me to be obviously still RFC quality. Do you seriously think anyone could even *think* about applying this series at this point?
On 4/11/22 12:48, Dave Hansen wrote: > On 4/11/22 09:05, Khalid Aziz wrote: >> This patch adds basic page table sharing across tasks by making >> mshare syscall. It does this by creating a new mm_struct which >> hosts the shared vmas and page tables. This mm_struct is >> maintained as long as there is at least one task using the mshare'd >> range. It is cleaned up by the last mshare_unlink syscall. > > This sounds like a really good idea because it (in theory) totally > separates the lifetime of the *source* of the page tables from the > lifetime of the process that creates the mshare. > >> diff --git a/mm/internal.h b/mm/internal.h >> index cf50a471384e..68f82f0f8b66 100644 >> --- a/mm/internal.h >> +++ b/mm/internal.h >> @@ -718,6 +718,8 @@ void vunmap_range_noflush(unsigned long start, unsigned long end); >> int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, >> unsigned long addr, int page_nid, int *flags); >> >> +extern vm_fault_t find_shared_vma(struct vm_area_struct **vma, >> + unsigned long *addrp); >> static inline bool vma_is_shared(const struct vm_area_struct *vma) >> { >> return vma->vm_flags & VM_SHARED_PT; >> diff --git a/mm/memory.c b/mm/memory.c >> index c125c4969913..c77c0d643ea8 100644 >> --- a/mm/memory.c >> +++ b/mm/memory.c >> @@ -4776,6 +4776,7 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, >> unsigned int flags, struct pt_regs *regs) >> { >> vm_fault_t ret; >> + bool shared = false; >> >> __set_current_state(TASK_RUNNING); >> >> @@ -4785,6 +4786,15 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, >> /* do counter updates before entering really critical section. */ >> check_sync_rss_stat(current); >> >> + if (unlikely(vma_is_shared(vma))) { >> + ret = find_shared_vma(&vma, &address); >> + if (ret) >> + return ret; >> + if (!vma) >> + return VM_FAULT_SIGSEGV; >> + shared = true; >> + } >> + >> if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE, >> flags & FAULT_FLAG_INSTRUCTION, >> flags & FAULT_FLAG_REMOTE)) >> @@ -4802,6 +4812,31 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, >> else >> ret = __handle_mm_fault(vma, address, flags); >> >> + /* >> + * Release the read lock on shared VMA's parent mm unless >> + * __handle_mm_fault released the lock already. >> + * __handle_mm_fault sets VM_FAULT_RETRY in return value if >> + * it released mmap lock. If lock was released, that implies >> + * the lock would have been released on task's original mm if >> + * this were not a shared PTE vma. To keep lock state consistent, >> + * make sure to release the lock on task's original mm >> + */ >> + if (shared) { >> + int release_mmlock = 1; >> + >> + if (!(ret & VM_FAULT_RETRY)) { >> + mmap_read_unlock(vma->vm_mm); >> + release_mmlock = 0; >> + } else if ((flags & FAULT_FLAG_ALLOW_RETRY) && >> + (flags & FAULT_FLAG_RETRY_NOWAIT)) { >> + mmap_read_unlock(vma->vm_mm); >> + release_mmlock = 0; >> + } >> + >> + if (release_mmlock) >> + mmap_read_unlock(current->mm); >> + } > > Are we guaranteed that current->mm == the original vma->vm_mm? Just a > quick scan of handle_mm_fault() users shows a few suspect ones like > hmm_range_fault() or iommu_v2.c::do_fault(). You are probably right. Safe thing to do would be to save the original vma->vm_mm before calling find_shared_vma() and use this saved value to unlock later if needed. I will fix that. > >> diff --git a/mm/mshare.c b/mm/mshare.c >> index cd2f7ad24d9d..d1896adcb00f 100644 >> --- a/mm/mshare.c >> +++ b/mm/mshare.c >> @@ -17,18 +17,49 @@ >> #include <linux/pseudo_fs.h> >> #include <linux/fileattr.h> >> #include <linux/refcount.h> >> +#include <linux/mman.h> >> #include <linux/sched/mm.h> >> #include <uapi/linux/magic.h> >> #include <uapi/linux/limits.h> >> >> struct mshare_data { >> - struct mm_struct *mm; >> + struct mm_struct *mm, *host_mm; >> mode_t mode; >> refcount_t refcnt; >> }; >> >> static struct super_block *msharefs_sb; >> >> +/* Returns holding the host mm's lock for read. Caller must release. */ >> +vm_fault_t >> +find_shared_vma(struct vm_area_struct **vmap, unsigned long *addrp) >> +{ >> + struct vm_area_struct *vma, *guest = *vmap; >> + struct mshare_data *info = guest->vm_private_data; >> + struct mm_struct *host_mm = info->mm; >> + unsigned long host_addr; >> + pgd_t *pgd, *guest_pgd; >> + >> + host_addr = *addrp - guest->vm_start + host_mm->mmap_base; >> + pgd = pgd_offset(host_mm, host_addr); >> + guest_pgd = pgd_offset(current->mm, *addrp); >> + if (!pgd_same(*guest_pgd, *pgd)) { >> + set_pgd(guest_pgd, *pgd); >> + return VM_FAULT_NOPAGE; >> + } > > Is digging around in the other process's page tables OK without holding > any locks? current->mm should already be locked when handle_mm_fault() is called. "mmap_read_lock(host_mm)" should be moved up to before calling pgd_offset(). I will fix that. > >> + *addrp = host_addr; >> + mmap_read_lock(host_mm); >> + vma = find_vma(host_mm, host_addr); >> + >> + /* XXX: expand stack? */ >> + if (vma && vma->vm_start > host_addr) >> + vma = NULL; >> + >> + *vmap = vma; >> + return 0; >> +} >> + >> static void >> msharefs_evict_inode(struct inode *inode) >> { >> @@ -169,11 +200,13 @@ SYSCALL_DEFINE5(mshare, const char __user *, name, unsigned long, addr, >> unsigned long, len, int, oflag, mode_t, mode) >> { >> struct mshare_data *info; >> - struct mm_struct *mm; >> struct filename *fname = getname(name); >> struct dentry *dentry; >> struct inode *inode; >> struct qstr namestr; >> + struct vm_area_struct *vma, *next, *new_vma; >> + struct mm_struct *new_mm; >> + unsigned long end; >> int err = PTR_ERR(fname); >> >> /* >> @@ -193,6 +226,8 @@ SYSCALL_DEFINE5(mshare, const char __user *, name, unsigned long, addr, >> if (IS_ERR(fname)) >> goto err_out; >> >> + end = addr + len; >> + >> /* >> * Does this mshare entry exist already? If it does, calling >> * mshare with O_EXCL|O_CREAT is an error >> @@ -205,49 +240,165 @@ SYSCALL_DEFINE5(mshare, const char __user *, name, unsigned long, addr, >> inode_lock(d_inode(msharefs_sb->s_root)); >> dentry = d_lookup(msharefs_sb->s_root, &namestr); >> if (dentry && (oflag & (O_EXCL|O_CREAT))) { >> + inode = d_inode(dentry); >> err = -EEXIST; >> dput(dentry); >> goto err_unlock_inode; >> } >> >> if (dentry) { >> + unsigned long mapaddr, prot = PROT_NONE; >> + >> inode = d_inode(dentry); >> if (inode == NULL) { >> + mmap_write_unlock(current->mm); >> err = -EINVAL; >> goto err_out; >> } >> info = inode->i_private; >> - refcount_inc(&info->refcnt); >> dput(dentry); >> + >> + /* >> + * Map in the address range as anonymous mappings >> + */ >> + oflag &= (O_RDONLY | O_WRONLY | O_RDWR); >> + if (oflag & O_RDONLY) >> + prot |= PROT_READ; >> + else if (oflag & O_WRONLY) >> + prot |= PROT_WRITE; >> + else if (oflag & O_RDWR) >> + prot |= (PROT_READ | PROT_WRITE); >> + mapaddr = vm_mmap(NULL, addr, len, prot, >> + MAP_FIXED | MAP_SHARED | MAP_ANONYMOUS, 0); >> + if (IS_ERR((void *)mapaddr)) { >> + err = -EINVAL; >> + goto err_out; >> + } >> + >> + refcount_inc(&info->refcnt); >> + >> + /* >> + * Now that we have mmap'd the mshare'd range, update vma >> + * flags and vm_mm pointer for this mshare'd range. >> + */ >> + mmap_write_lock(current->mm); >> + vma = find_vma(current->mm, addr); >> + if (vma && vma->vm_start < addr) { >> + mmap_write_unlock(current->mm); >> + err = -EINVAL; >> + goto err_out; >> + } > > How do you know that this is the same anonymous VMA that you set up > above? Couldn't it have been unmapped and remapped to be something > random before the mmap_write_lock() is reacquired? Good point. The one check I have after find_vma() is not enough. I need to add more checks to validate this vma. > >> + while (vma && vma->vm_start < (addr + len)) { >> + vma->vm_private_data = info; >> + vma->vm_mm = info->mm; >> + vma->vm_flags |= VM_SHARED_PT; >> + next = vma->vm_next; >> + vma = next; >> + } > > This vma is still in the mm->mm_rb tree, right? I'm kinda surprised > that it's OK to have a VMA in mm->vm_rb have vma->vm_mm!=mm. I will look into what needs to be fixed up here. One possibility is to not change vm_mm. I think I can work without changing vm_mm for donor or client processes as long as vm_mm in the host mm for mshare'd vmas points to the host mm. > >> } else { >> - mm = mm_alloc(); >> - if (!mm) >> + unsigned long myaddr; >> + struct mm_struct *old_mm; >> + >> + old_mm = current->mm; >> + new_mm = mm_alloc(); >> + if (!new_mm) >> return -ENOMEM; >> info = kzalloc(sizeof(*info), GFP_KERNEL); >> if (!info) { >> err = -ENOMEM; >> goto err_relmm; >> } >> - mm->mmap_base = addr; >> - mm->task_size = addr + len; >> - if (!mm->task_size) >> - mm->task_size--; >> - info->mm = mm; >> + new_mm->mmap_base = addr; >> + new_mm->task_size = addr + len; >> + if (!new_mm->task_size) >> + new_mm->task_size--; >> + info->mm = new_mm; >> + info->host_mm = old_mm; >> info->mode = mode; >> refcount_set(&info->refcnt, 1); >> + >> + /* >> + * VMAs for this address range may or may not exist. >> + * If VMAs exist, they should be marked as shared at >> + * this point and page table info should be copied >> + * over to newly created mm_struct. TODO: If VMAs do not >> + * exist, create them and mark them as shared. >> + */ > > At this point, there are just too many TODO's in this series to look at > it seriously. I think what you have here is an entertaining > proof-of-concept, but it's looking to me to be obviously still RFC > quality. Do you seriously think anyone could even *think* about > applying this series at this point? > Fair enough. Some of the TODOs are meant to be reminders for expansion of functionality (like this one is to support calling mshare without having to mmap the address range first), but I should clean these up and I will do that. I appreciate your taking time to provide yuseful feedback. Thanks, Khalid
On Tue, Apr 12, 2022 at 4:07 AM Khalid Aziz <khalid.aziz@oracle.com> wrote: > > This patch adds basic page table sharing across tasks by making > mshare syscall. It does this by creating a new mm_struct which > hosts the shared vmas and page tables. This mm_struct is > maintained as long as there is at least one task using the mshare'd > range. It is cleaned up by the last mshare_unlink syscall. > > Signed-off-by: Khalid Aziz <khalid.aziz@oracle.com> > Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org> > --- > mm/internal.h | 2 + > mm/memory.c | 35 ++++++++++ > mm/mshare.c | 190 ++++++++++++++++++++++++++++++++++++++++++++++---- > 3 files changed, 214 insertions(+), 13 deletions(-) > > diff --git a/mm/internal.h b/mm/internal.h > index cf50a471384e..68f82f0f8b66 100644 > --- a/mm/internal.h > +++ b/mm/internal.h > @@ -718,6 +718,8 @@ void vunmap_range_noflush(unsigned long start, unsigned long end); > int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, > unsigned long addr, int page_nid, int *flags); > > +extern vm_fault_t find_shared_vma(struct vm_area_struct **vma, > + unsigned long *addrp); > static inline bool vma_is_shared(const struct vm_area_struct *vma) > { > return vma->vm_flags & VM_SHARED_PT; > diff --git a/mm/memory.c b/mm/memory.c > index c125c4969913..c77c0d643ea8 100644 > --- a/mm/memory.c > +++ b/mm/memory.c > @@ -4776,6 +4776,7 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, > unsigned int flags, struct pt_regs *regs) > { > vm_fault_t ret; > + bool shared = false; > > __set_current_state(TASK_RUNNING); > > @@ -4785,6 +4786,15 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, > /* do counter updates before entering really critical section. */ > check_sync_rss_stat(current); > > + if (unlikely(vma_is_shared(vma))) { > + ret = find_shared_vma(&vma, &address); > + if (ret) > + return ret; > + if (!vma) > + return VM_FAULT_SIGSEGV; > + shared = true; > + } > + > if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE, > flags & FAULT_FLAG_INSTRUCTION, > flags & FAULT_FLAG_REMOTE)) > @@ -4802,6 +4812,31 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, > else > ret = __handle_mm_fault(vma, address, flags); > > + /* > + * Release the read lock on shared VMA's parent mm unless > + * __handle_mm_fault released the lock already. > + * __handle_mm_fault sets VM_FAULT_RETRY in return value if > + * it released mmap lock. If lock was released, that implies > + * the lock would have been released on task's original mm if > + * this were not a shared PTE vma. To keep lock state consistent, > + * make sure to release the lock on task's original mm > + */ > + if (shared) { > + int release_mmlock = 1; > + > + if (!(ret & VM_FAULT_RETRY)) { > + mmap_read_unlock(vma->vm_mm); > + release_mmlock = 0; > + } else if ((flags & FAULT_FLAG_ALLOW_RETRY) && > + (flags & FAULT_FLAG_RETRY_NOWAIT)) { > + mmap_read_unlock(vma->vm_mm); > + release_mmlock = 0; > + } > + > + if (release_mmlock) > + mmap_read_unlock(current->mm); > + } > + > if (flags & FAULT_FLAG_USER) { > mem_cgroup_exit_user_fault(); > /* > diff --git a/mm/mshare.c b/mm/mshare.c > index cd2f7ad24d9d..d1896adcb00f 100644 > --- a/mm/mshare.c > +++ b/mm/mshare.c > @@ -17,18 +17,49 @@ > #include <linux/pseudo_fs.h> > #include <linux/fileattr.h> > #include <linux/refcount.h> > +#include <linux/mman.h> > #include <linux/sched/mm.h> > #include <uapi/linux/magic.h> > #include <uapi/linux/limits.h> > > struct mshare_data { > - struct mm_struct *mm; > + struct mm_struct *mm, *host_mm; > mode_t mode; > refcount_t refcnt; > }; > > static struct super_block *msharefs_sb; > > +/* Returns holding the host mm's lock for read. Caller must release. */ > +vm_fault_t > +find_shared_vma(struct vm_area_struct **vmap, unsigned long *addrp) > +{ > + struct vm_area_struct *vma, *guest = *vmap; > + struct mshare_data *info = guest->vm_private_data; > + struct mm_struct *host_mm = info->mm; > + unsigned long host_addr; > + pgd_t *pgd, *guest_pgd; > + > + host_addr = *addrp - guest->vm_start + host_mm->mmap_base; > + pgd = pgd_offset(host_mm, host_addr); > + guest_pgd = pgd_offset(current->mm, *addrp); > + if (!pgd_same(*guest_pgd, *pgd)) { > + set_pgd(guest_pgd, *pgd); > + return VM_FAULT_NOPAGE; > + } > + > + *addrp = host_addr; > + mmap_read_lock(host_mm); > + vma = find_vma(host_mm, host_addr); > + > + /* XXX: expand stack? */ > + if (vma && vma->vm_start > host_addr) > + vma = NULL; > + > + *vmap = vma; > + return 0; > +} > + > static void > msharefs_evict_inode(struct inode *inode) > { > @@ -169,11 +200,13 @@ SYSCALL_DEFINE5(mshare, const char __user *, name, unsigned long, addr, > unsigned long, len, int, oflag, mode_t, mode) > { > struct mshare_data *info; > - struct mm_struct *mm; > struct filename *fname = getname(name); > struct dentry *dentry; > struct inode *inode; > struct qstr namestr; > + struct vm_area_struct *vma, *next, *new_vma; > + struct mm_struct *new_mm; > + unsigned long end; > int err = PTR_ERR(fname); > > /* > @@ -193,6 +226,8 @@ SYSCALL_DEFINE5(mshare, const char __user *, name, unsigned long, addr, > if (IS_ERR(fname)) > goto err_out; > > + end = addr + len; > + > /* > * Does this mshare entry exist already? If it does, calling > * mshare with O_EXCL|O_CREAT is an error > @@ -205,49 +240,165 @@ SYSCALL_DEFINE5(mshare, const char __user *, name, unsigned long, addr, > inode_lock(d_inode(msharefs_sb->s_root)); > dentry = d_lookup(msharefs_sb->s_root, &namestr); > if (dentry && (oflag & (O_EXCL|O_CREAT))) { > + inode = d_inode(dentry); > err = -EEXIST; > dput(dentry); > goto err_unlock_inode; > } > > if (dentry) { > + unsigned long mapaddr, prot = PROT_NONE; > + > inode = d_inode(dentry); > if (inode == NULL) { > + mmap_write_unlock(current->mm); > err = -EINVAL; > goto err_out; > } > info = inode->i_private; > - refcount_inc(&info->refcnt); > dput(dentry); > + > + /* > + * Map in the address range as anonymous mappings > + */ > + oflag &= (O_RDONLY | O_WRONLY | O_RDWR); > + if (oflag & O_RDONLY) > + prot |= PROT_READ; > + else if (oflag & O_WRONLY) > + prot |= PROT_WRITE; > + else if (oflag & O_RDWR) > + prot |= (PROT_READ | PROT_WRITE); > + mapaddr = vm_mmap(NULL, addr, len, prot, > + MAP_FIXED | MAP_SHARED | MAP_ANONYMOUS, 0); From the perspective of hardware, do we have to use MAP_FIXED to make sure those processes sharing PTE use the same virtual address for the shared area? or actually we don't necessarily need it? as long as the upper level pgtable entries point to the same lower level pgtable? > + if (IS_ERR((void *)mapaddr)) { > + err = -EINVAL; > + goto err_out; > + } > + > + refcount_inc(&info->refcnt); > + > + /* > + * Now that we have mmap'd the mshare'd range, update vma > + * flags and vm_mm pointer for this mshare'd range. > + */ > + mmap_write_lock(current->mm); > + vma = find_vma(current->mm, addr); > + if (vma && vma->vm_start < addr) { > + mmap_write_unlock(current->mm); > + err = -EINVAL; > + goto err_out; > + } > + > + while (vma && vma->vm_start < (addr + len)) { > + vma->vm_private_data = info; > + vma->vm_mm = info->mm; > + vma->vm_flags |= VM_SHARED_PT; > + next = vma->vm_next; > + vma = next; > + } > } else { > - mm = mm_alloc(); > - if (!mm) > + unsigned long myaddr; > + struct mm_struct *old_mm; > + > + old_mm = current->mm; > + new_mm = mm_alloc(); > + if (!new_mm) > return -ENOMEM; > info = kzalloc(sizeof(*info), GFP_KERNEL); > if (!info) { > err = -ENOMEM; > goto err_relmm; > } > - mm->mmap_base = addr; > - mm->task_size = addr + len; > - if (!mm->task_size) > - mm->task_size--; > - info->mm = mm; > + new_mm->mmap_base = addr; > + new_mm->task_size = addr + len; > + if (!new_mm->task_size) > + new_mm->task_size--; > + info->mm = new_mm; > + info->host_mm = old_mm; > info->mode = mode; > refcount_set(&info->refcnt, 1); > + > + /* > + * VMAs for this address range may or may not exist. > + * If VMAs exist, they should be marked as shared at > + * this point and page table info should be copied > + * over to newly created mm_struct. TODO: If VMAs do not > + * exist, create them and mark them as shared. > + */ > + mmap_write_lock(old_mm); > + vma = find_vma_intersection(old_mm, addr, end); > + if (!vma) { > + err = -EINVAL; > + goto unlock; > + } > + /* > + * TODO: If the currently allocated VMA goes beyond the > + * mshare'd range, this VMA needs to be split. > + * > + * Double check that source VMAs do not extend outside > + * the range > + */ > + vma = find_vma(old_mm, addr + len); > + if (vma && vma->vm_start < (addr + len)) { > + err = -EINVAL; > + goto unlock; > + } > + > + vma = find_vma(old_mm, addr); > + if (vma && vma->vm_start < addr) { > + err = -EINVAL; > + goto unlock; > + } > + > + mmap_write_lock(new_mm); > + while (vma && vma->vm_start < (addr + len)) { > + /* > + * Copy this vma over to host mm > + */ > + vma->vm_private_data = info; > + vma->vm_mm = new_mm; > + vma->vm_flags |= VM_SHARED_PT; > + new_vma = vm_area_dup(vma); > + if (!new_vma) { > + err = -ENOMEM; > + goto unlock; > + } > + err = insert_vm_struct(new_mm, new_vma); > + if (err) > + goto unlock; > + > + vma = vma->vm_next; > + } > + mmap_write_unlock(new_mm); > + > err = mshare_file_create(fname, oflag, info); > if (err) > - goto err_relinfo; > + goto unlock; > + > + /* > + * Copy over current PTEs > + */ > + myaddr = addr; > + while (myaddr < new_mm->task_size) { > + *pgd_offset(new_mm, myaddr) = *pgd_offset(old_mm, myaddr); > + myaddr += PGDIR_SIZE; > + } > + /* > + * TODO: Free the corresponding page table in calling > + * process > + */ > } > > + mmap_write_unlock(current->mm); > inode_unlock(d_inode(msharefs_sb->s_root)); > putname(fname); > return 0; > > -err_relinfo: > +unlock: > + mmap_write_unlock(current->mm); > kfree(info); > err_relmm: > - mmput(mm); > + mmput(new_mm); > err_unlock_inode: > inode_unlock(d_inode(msharefs_sb->s_root)); > err_out: > @@ -294,11 +445,24 @@ SYSCALL_DEFINE1(mshare_unlink, const char *, name) > > /* > * Is this the last reference? > + * TODO: permission checks are needed before proceeding > */ > if (refcount_dec_and_test(&info->refcnt)) { > simple_unlink(d_inode(msharefs_sb->s_root), dentry); > d_drop(dentry); > d_delete(dentry); > + /* > + * TODO: Release all physical pages allocated for this > + * mshare range and release associated page table. If > + * the final unlink happens from the process that created > + * mshare'd range, do we return page tables and pages to > + * that process so the creating process can continue using > + * the address range it had chosen to mshare at some > + * point? > + * > + * TODO: unmap shared vmas from every task that is using > + * this mshare'd range. > + */ > mmput(info->mm); > kfree(info); > } else { > -- > 2.32.0 > Thanks Barry
On Tue, Apr 12, 2022 at 4:07 AM Khalid Aziz <khalid.aziz@oracle.com> wrote: > > This patch adds basic page table sharing across tasks by making > mshare syscall. It does this by creating a new mm_struct which > hosts the shared vmas and page tables. This mm_struct is > maintained as long as there is at least one task using the mshare'd > range. It is cleaned up by the last mshare_unlink syscall. > > Signed-off-by: Khalid Aziz <khalid.aziz@oracle.com> > Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org> > --- > mm/internal.h | 2 + > mm/memory.c | 35 ++++++++++ > mm/mshare.c | 190 ++++++++++++++++++++++++++++++++++++++++++++++---- > 3 files changed, 214 insertions(+), 13 deletions(-) > > diff --git a/mm/internal.h b/mm/internal.h > index cf50a471384e..68f82f0f8b66 100644 > --- a/mm/internal.h > +++ b/mm/internal.h > @@ -718,6 +718,8 @@ void vunmap_range_noflush(unsigned long start, unsigned long end); > int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, > unsigned long addr, int page_nid, int *flags); > > +extern vm_fault_t find_shared_vma(struct vm_area_struct **vma, > + unsigned long *addrp); > static inline bool vma_is_shared(const struct vm_area_struct *vma) > { > return vma->vm_flags & VM_SHARED_PT; > diff --git a/mm/memory.c b/mm/memory.c > index c125c4969913..c77c0d643ea8 100644 > --- a/mm/memory.c > +++ b/mm/memory.c > @@ -4776,6 +4776,7 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, > unsigned int flags, struct pt_regs *regs) > { > vm_fault_t ret; > + bool shared = false; > > __set_current_state(TASK_RUNNING); > > @@ -4785,6 +4786,15 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, > /* do counter updates before entering really critical section. */ > check_sync_rss_stat(current); > > + if (unlikely(vma_is_shared(vma))) { > + ret = find_shared_vma(&vma, &address); > + if (ret) > + return ret; > + if (!vma) > + return VM_FAULT_SIGSEGV; > + shared = true; > + } > + > if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE, > flags & FAULT_FLAG_INSTRUCTION, > flags & FAULT_FLAG_REMOTE)) > @@ -4802,6 +4812,31 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, > else > ret = __handle_mm_fault(vma, address, flags); > > + /* > + * Release the read lock on shared VMA's parent mm unless > + * __handle_mm_fault released the lock already. > + * __handle_mm_fault sets VM_FAULT_RETRY in return value if > + * it released mmap lock. If lock was released, that implies > + * the lock would have been released on task's original mm if > + * this were not a shared PTE vma. To keep lock state consistent, > + * make sure to release the lock on task's original mm > + */ > + if (shared) { > + int release_mmlock = 1; > + > + if (!(ret & VM_FAULT_RETRY)) { > + mmap_read_unlock(vma->vm_mm); > + release_mmlock = 0; > + } else if ((flags & FAULT_FLAG_ALLOW_RETRY) && > + (flags & FAULT_FLAG_RETRY_NOWAIT)) { > + mmap_read_unlock(vma->vm_mm); > + release_mmlock = 0; > + } > + > + if (release_mmlock) > + mmap_read_unlock(current->mm); > + } > + > if (flags & FAULT_FLAG_USER) { > mem_cgroup_exit_user_fault(); > /* > diff --git a/mm/mshare.c b/mm/mshare.c > index cd2f7ad24d9d..d1896adcb00f 100644 > --- a/mm/mshare.c > +++ b/mm/mshare.c > @@ -17,18 +17,49 @@ > #include <linux/pseudo_fs.h> > #include <linux/fileattr.h> > #include <linux/refcount.h> > +#include <linux/mman.h> > #include <linux/sched/mm.h> > #include <uapi/linux/magic.h> > #include <uapi/linux/limits.h> > > struct mshare_data { > - struct mm_struct *mm; > + struct mm_struct *mm, *host_mm; > mode_t mode; > refcount_t refcnt; > }; > > static struct super_block *msharefs_sb; > > +/* Returns holding the host mm's lock for read. Caller must release. */ > +vm_fault_t > +find_shared_vma(struct vm_area_struct **vmap, unsigned long *addrp) > +{ > + struct vm_area_struct *vma, *guest = *vmap; > + struct mshare_data *info = guest->vm_private_data; > + struct mm_struct *host_mm = info->mm; > + unsigned long host_addr; > + pgd_t *pgd, *guest_pgd; > + > + host_addr = *addrp - guest->vm_start + host_mm->mmap_base; > + pgd = pgd_offset(host_mm, host_addr); > + guest_pgd = pgd_offset(current->mm, *addrp); > + if (!pgd_same(*guest_pgd, *pgd)) { > + set_pgd(guest_pgd, *pgd); > + return VM_FAULT_NOPAGE; > + } > + > + *addrp = host_addr; > + mmap_read_lock(host_mm); > + vma = find_vma(host_mm, host_addr); > + > + /* XXX: expand stack? */ > + if (vma && vma->vm_start > host_addr) > + vma = NULL; > + > + *vmap = vma; > + return 0; > +} > + > static void > msharefs_evict_inode(struct inode *inode) > { > @@ -169,11 +200,13 @@ SYSCALL_DEFINE5(mshare, const char __user *, name, unsigned long, addr, > unsigned long, len, int, oflag, mode_t, mode) > { > struct mshare_data *info; > - struct mm_struct *mm; > struct filename *fname = getname(name); > struct dentry *dentry; > struct inode *inode; > struct qstr namestr; > + struct vm_area_struct *vma, *next, *new_vma; > + struct mm_struct *new_mm; > + unsigned long end; > int err = PTR_ERR(fname); > > /* > @@ -193,6 +226,8 @@ SYSCALL_DEFINE5(mshare, const char __user *, name, unsigned long, addr, > if (IS_ERR(fname)) > goto err_out; > > + end = addr + len; > + > /* > * Does this mshare entry exist already? If it does, calling > * mshare with O_EXCL|O_CREAT is an error > @@ -205,49 +240,165 @@ SYSCALL_DEFINE5(mshare, const char __user *, name, unsigned long, addr, > inode_lock(d_inode(msharefs_sb->s_root)); > dentry = d_lookup(msharefs_sb->s_root, &namestr); > if (dentry && (oflag & (O_EXCL|O_CREAT))) { > + inode = d_inode(dentry); > err = -EEXIST; > dput(dentry); > goto err_unlock_inode; > } > > if (dentry) { > + unsigned long mapaddr, prot = PROT_NONE; > + > inode = d_inode(dentry); > if (inode == NULL) { > + mmap_write_unlock(current->mm); > err = -EINVAL; > goto err_out; > } > info = inode->i_private; > - refcount_inc(&info->refcnt); > dput(dentry); > + > + /* > + * Map in the address range as anonymous mappings > + */ > + oflag &= (O_RDONLY | O_WRONLY | O_RDWR); > + if (oflag & O_RDONLY) > + prot |= PROT_READ; > + else if (oflag & O_WRONLY) > + prot |= PROT_WRITE; > + else if (oflag & O_RDWR) > + prot |= (PROT_READ | PROT_WRITE); > + mapaddr = vm_mmap(NULL, addr, len, prot, > + MAP_FIXED | MAP_SHARED | MAP_ANONYMOUS, 0); > + if (IS_ERR((void *)mapaddr)) { > + err = -EINVAL; > + goto err_out; > + } > + > + refcount_inc(&info->refcnt); > + > + /* > + * Now that we have mmap'd the mshare'd range, update vma > + * flags and vm_mm pointer for this mshare'd range. > + */ > + mmap_write_lock(current->mm); > + vma = find_vma(current->mm, addr); > + if (vma && vma->vm_start < addr) { and I don't understand why "vma->vm_start < addr" can happen. does it mean we have given mshare() an address which is not aligned? then we should have return -EINVAL earlier? > + mmap_write_unlock(current->mm); > + err = -EINVAL; > + goto err_out; > + } > + > + while (vma && vma->vm_start < (addr + len)) { > + vma->vm_private_data = info; > + vma->vm_mm = info->mm; > + vma->vm_flags |= VM_SHARED_PT; > + next = vma->vm_next; > + vma = next; > + } > } else { > - mm = mm_alloc(); > - if (!mm) > + unsigned long myaddr; > + struct mm_struct *old_mm; > + > + old_mm = current->mm; > + new_mm = mm_alloc(); > + if (!new_mm) > return -ENOMEM; > info = kzalloc(sizeof(*info), GFP_KERNEL); > if (!info) { > err = -ENOMEM; > goto err_relmm; > } > - mm->mmap_base = addr; > - mm->task_size = addr + len; > - if (!mm->task_size) > - mm->task_size--; > - info->mm = mm; > + new_mm->mmap_base = addr; > + new_mm->task_size = addr + len; > + if (!new_mm->task_size) > + new_mm->task_size--; > + info->mm = new_mm; > + info->host_mm = old_mm; > info->mode = mode; > refcount_set(&info->refcnt, 1); > + > + /* > + * VMAs for this address range may or may not exist. > + * If VMAs exist, they should be marked as shared at > + * this point and page table info should be copied > + * over to newly created mm_struct. TODO: If VMAs do not > + * exist, create them and mark them as shared. > + */ > + mmap_write_lock(old_mm); > + vma = find_vma_intersection(old_mm, addr, end); > + if (!vma) { > + err = -EINVAL; > + goto unlock; > + } > + /* > + * TODO: If the currently allocated VMA goes beyond the > + * mshare'd range, this VMA needs to be split. > + * > + * Double check that source VMAs do not extend outside > + * the range > + */ > + vma = find_vma(old_mm, addr + len); > + if (vma && vma->vm_start < (addr + len)) { > + err = -EINVAL; > + goto unlock; > + } > + > + vma = find_vma(old_mm, addr); > + if (vma && vma->vm_start < addr) { > + err = -EINVAL; > + goto unlock; > + } > + > + mmap_write_lock(new_mm); > + while (vma && vma->vm_start < (addr + len)) { > + /* > + * Copy this vma over to host mm > + */ > + vma->vm_private_data = info; > + vma->vm_mm = new_mm; > + vma->vm_flags |= VM_SHARED_PT; > + new_vma = vm_area_dup(vma); > + if (!new_vma) { > + err = -ENOMEM; > + goto unlock; > + } > + err = insert_vm_struct(new_mm, new_vma); > + if (err) > + goto unlock; > + > + vma = vma->vm_next; > + } > + mmap_write_unlock(new_mm); > + > err = mshare_file_create(fname, oflag, info); > if (err) > - goto err_relinfo; > + goto unlock; > + > + /* > + * Copy over current PTEs > + */ > + myaddr = addr; > + while (myaddr < new_mm->task_size) { > + *pgd_offset(new_mm, myaddr) = *pgd_offset(old_mm, myaddr); > + myaddr += PGDIR_SIZE; > + } > + /* > + * TODO: Free the corresponding page table in calling > + * process > + */ > } > > + mmap_write_unlock(current->mm); > inode_unlock(d_inode(msharefs_sb->s_root)); > putname(fname); > return 0; > > -err_relinfo: > +unlock: > + mmap_write_unlock(current->mm); > kfree(info); > err_relmm: > - mmput(mm); > + mmput(new_mm); > err_unlock_inode: > inode_unlock(d_inode(msharefs_sb->s_root)); > err_out: > @@ -294,11 +445,24 @@ SYSCALL_DEFINE1(mshare_unlink, const char *, name) > > /* > * Is this the last reference? > + * TODO: permission checks are needed before proceeding > */ > if (refcount_dec_and_test(&info->refcnt)) { > simple_unlink(d_inode(msharefs_sb->s_root), dentry); > d_drop(dentry); > d_delete(dentry); > + /* > + * TODO: Release all physical pages allocated for this > + * mshare range and release associated page table. If > + * the final unlink happens from the process that created > + * mshare'd range, do we return page tables and pages to > + * that process so the creating process can continue using > + * the address range it had chosen to mshare at some > + * point? > + * > + * TODO: unmap shared vmas from every task that is using > + * this mshare'd range. > + */ > mmput(info->mm); > kfree(info); > } else { > -- > 2.32.0 > Thanks Barry
On 5/30/22 05:11, Barry Song wrote: > On Tue, Apr 12, 2022 at 4:07 AM Khalid Aziz <khalid.aziz@oracle.com> wrote: >> >> >> @@ -193,6 +226,8 @@ SYSCALL_DEFINE5(mshare, const char __user *, name, unsigned long, addr, >> if (IS_ERR(fname)) >> goto err_out; >> >> + end = addr + len; >> + >> /* >> * Does this mshare entry exist already? If it does, calling >> * mshare with O_EXCL|O_CREAT is an error >> @@ -205,49 +240,165 @@ SYSCALL_DEFINE5(mshare, const char __user *, name, unsigned long, addr, >> inode_lock(d_inode(msharefs_sb->s_root)); >> dentry = d_lookup(msharefs_sb->s_root, &namestr); >> if (dentry && (oflag & (O_EXCL|O_CREAT))) { >> + inode = d_inode(dentry); >> err = -EEXIST; >> dput(dentry); >> goto err_unlock_inode; >> } >> >> if (dentry) { >> + unsigned long mapaddr, prot = PROT_NONE; >> + >> inode = d_inode(dentry); >> if (inode == NULL) { >> + mmap_write_unlock(current->mm); >> err = -EINVAL; >> goto err_out; >> } >> info = inode->i_private; >> - refcount_inc(&info->refcnt); >> dput(dentry); >> + >> + /* >> + * Map in the address range as anonymous mappings >> + */ >> + oflag &= (O_RDONLY | O_WRONLY | O_RDWR); >> + if (oflag & O_RDONLY) >> + prot |= PROT_READ; >> + else if (oflag & O_WRONLY) >> + prot |= PROT_WRITE; >> + else if (oflag & O_RDWR) >> + prot |= (PROT_READ | PROT_WRITE); >> + mapaddr = vm_mmap(NULL, addr, len, prot, >> + MAP_FIXED | MAP_SHARED | MAP_ANONYMOUS, 0); > > From the perspective of hardware, do we have to use MAP_FIXED to make > sure those processes sharing PTE > use the same virtual address for the shared area? or actually we don't > necessarily need it? as long as the > upper level pgtable entries point to the same lower level pgtable? Hi Barry, Sorry, I didn't mean to ignore this. I was out of commission for the last few weeks. All processes sharing an mshare region must use the same virtual address otherwise page table entry for those processes won't be identical and hence can not be shared. Upper bits of virtual address provide index into various level directories. It may be possible to manipulate the various page directories to allow for different virtual addresses across processes and get hardware page table walk to work correctly, but that would be complex and potentially error prone. Thanks, Khalid
On 5/30/22 21:46, Barry Song wrote: > On Tue, Apr 12, 2022 at 4:07 AM Khalid Aziz <khalid.aziz@oracle.com> wrote: >> >> This patch adds basic page table sharing across tasks by making >> mshare syscall. It does this by creating a new mm_struct which >> hosts the shared vmas and page tables. This mm_struct is >> maintained as long as there is at least one task using the mshare'd >> range. It is cleaned up by the last mshare_unlink syscall. >> >> Signed-off-by: Khalid Aziz <khalid.aziz@oracle.com> >> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org> >> --- >> mm/internal.h | 2 + >> mm/memory.c | 35 ++++++++++ >> mm/mshare.c | 190 ++++++++++++++++++++++++++++++++++++++++++++++---- >> 3 files changed, 214 insertions(+), 13 deletions(-) >> >> diff --git a/mm/internal.h b/mm/internal.h >> index cf50a471384e..68f82f0f8b66 100644 >> --- a/mm/internal.h >> +++ b/mm/internal.h >> @@ -718,6 +718,8 @@ void vunmap_range_noflush(unsigned long start, unsigned long end); >> int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, >> unsigned long addr, int page_nid, int *flags); >> >> +extern vm_fault_t find_shared_vma(struct vm_area_struct **vma, >> + unsigned long *addrp); >> static inline bool vma_is_shared(const struct vm_area_struct *vma) >> { >> return vma->vm_flags & VM_SHARED_PT; >> diff --git a/mm/memory.c b/mm/memory.c >> index c125c4969913..c77c0d643ea8 100644 >> --- a/mm/memory.c >> +++ b/mm/memory.c >> @@ -4776,6 +4776,7 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, >> unsigned int flags, struct pt_regs *regs) >> { >> vm_fault_t ret; >> + bool shared = false; >> >> __set_current_state(TASK_RUNNING); >> >> @@ -4785,6 +4786,15 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, >> /* do counter updates before entering really critical section. */ >> check_sync_rss_stat(current); >> >> + if (unlikely(vma_is_shared(vma))) { >> + ret = find_shared_vma(&vma, &address); >> + if (ret) >> + return ret; >> + if (!vma) >> + return VM_FAULT_SIGSEGV; >> + shared = true; >> + } >> + >> if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE, >> flags & FAULT_FLAG_INSTRUCTION, >> flags & FAULT_FLAG_REMOTE)) >> @@ -4802,6 +4812,31 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, >> else >> ret = __handle_mm_fault(vma, address, flags); >> >> + /* >> + * Release the read lock on shared VMA's parent mm unless >> + * __handle_mm_fault released the lock already. >> + * __handle_mm_fault sets VM_FAULT_RETRY in return value if >> + * it released mmap lock. If lock was released, that implies >> + * the lock would have been released on task's original mm if >> + * this were not a shared PTE vma. To keep lock state consistent, >> + * make sure to release the lock on task's original mm >> + */ >> + if (shared) { >> + int release_mmlock = 1; >> + >> + if (!(ret & VM_FAULT_RETRY)) { >> + mmap_read_unlock(vma->vm_mm); >> + release_mmlock = 0; >> + } else if ((flags & FAULT_FLAG_ALLOW_RETRY) && >> + (flags & FAULT_FLAG_RETRY_NOWAIT)) { >> + mmap_read_unlock(vma->vm_mm); >> + release_mmlock = 0; >> + } >> + >> + if (release_mmlock) >> + mmap_read_unlock(current->mm); >> + } >> + >> if (flags & FAULT_FLAG_USER) { >> mem_cgroup_exit_user_fault(); >> /* >> diff --git a/mm/mshare.c b/mm/mshare.c >> index cd2f7ad24d9d..d1896adcb00f 100644 >> --- a/mm/mshare.c >> +++ b/mm/mshare.c >> @@ -17,18 +17,49 @@ >> #include <linux/pseudo_fs.h> >> #include <linux/fileattr.h> >> #include <linux/refcount.h> >> +#include <linux/mman.h> >> #include <linux/sched/mm.h> >> #include <uapi/linux/magic.h> >> #include <uapi/linux/limits.h> >> >> struct mshare_data { >> - struct mm_struct *mm; >> + struct mm_struct *mm, *host_mm; >> mode_t mode; >> refcount_t refcnt; >> }; >> >> static struct super_block *msharefs_sb; >> >> +/* Returns holding the host mm's lock for read. Caller must release. */ >> +vm_fault_t >> +find_shared_vma(struct vm_area_struct **vmap, unsigned long *addrp) >> +{ >> + struct vm_area_struct *vma, *guest = *vmap; >> + struct mshare_data *info = guest->vm_private_data; >> + struct mm_struct *host_mm = info->mm; >> + unsigned long host_addr; >> + pgd_t *pgd, *guest_pgd; >> + >> + host_addr = *addrp - guest->vm_start + host_mm->mmap_base; >> + pgd = pgd_offset(host_mm, host_addr); >> + guest_pgd = pgd_offset(current->mm, *addrp); >> + if (!pgd_same(*guest_pgd, *pgd)) { >> + set_pgd(guest_pgd, *pgd); >> + return VM_FAULT_NOPAGE; >> + } >> + >> + *addrp = host_addr; >> + mmap_read_lock(host_mm); >> + vma = find_vma(host_mm, host_addr); >> + >> + /* XXX: expand stack? */ >> + if (vma && vma->vm_start > host_addr) >> + vma = NULL; >> + >> + *vmap = vma; >> + return 0; >> +} >> + >> static void >> msharefs_evict_inode(struct inode *inode) >> { >> @@ -169,11 +200,13 @@ SYSCALL_DEFINE5(mshare, const char __user *, name, unsigned long, addr, >> unsigned long, len, int, oflag, mode_t, mode) >> { >> struct mshare_data *info; >> - struct mm_struct *mm; >> struct filename *fname = getname(name); >> struct dentry *dentry; >> struct inode *inode; >> struct qstr namestr; >> + struct vm_area_struct *vma, *next, *new_vma; >> + struct mm_struct *new_mm; >> + unsigned long end; >> int err = PTR_ERR(fname); >> >> /* >> @@ -193,6 +226,8 @@ SYSCALL_DEFINE5(mshare, const char __user *, name, unsigned long, addr, >> if (IS_ERR(fname)) >> goto err_out; >> >> + end = addr + len; >> + >> /* >> * Does this mshare entry exist already? If it does, calling >> * mshare with O_EXCL|O_CREAT is an error >> @@ -205,49 +240,165 @@ SYSCALL_DEFINE5(mshare, const char __user *, name, unsigned long, addr, >> inode_lock(d_inode(msharefs_sb->s_root)); >> dentry = d_lookup(msharefs_sb->s_root, &namestr); >> if (dentry && (oflag & (O_EXCL|O_CREAT))) { >> + inode = d_inode(dentry); >> err = -EEXIST; >> dput(dentry); >> goto err_unlock_inode; >> } >> >> if (dentry) { >> + unsigned long mapaddr, prot = PROT_NONE; >> + >> inode = d_inode(dentry); >> if (inode == NULL) { >> + mmap_write_unlock(current->mm); >> err = -EINVAL; >> goto err_out; >> } >> info = inode->i_private; >> - refcount_inc(&info->refcnt); >> dput(dentry); >> + >> + /* >> + * Map in the address range as anonymous mappings >> + */ >> + oflag &= (O_RDONLY | O_WRONLY | O_RDWR); >> + if (oflag & O_RDONLY) >> + prot |= PROT_READ; >> + else if (oflag & O_WRONLY) >> + prot |= PROT_WRITE; >> + else if (oflag & O_RDWR) >> + prot |= (PROT_READ | PROT_WRITE); >> + mapaddr = vm_mmap(NULL, addr, len, prot, >> + MAP_FIXED | MAP_SHARED | MAP_ANONYMOUS, 0); >> + if (IS_ERR((void *)mapaddr)) { >> + err = -EINVAL; >> + goto err_out; >> + } >> + >> + refcount_inc(&info->refcnt); >> + >> + /* >> + * Now that we have mmap'd the mshare'd range, update vma >> + * flags and vm_mm pointer for this mshare'd range. >> + */ >> + mmap_write_lock(current->mm); >> + vma = find_vma(current->mm, addr); >> + if (vma && vma->vm_start < addr) { > > and I don't understand why "vma->vm_start < addr" can happen. > does it mean we have given mshare() an address which is not > aligned? then we should have return -EINVAL earlier? > Yes, this could potentially be caught earlier. I have rewritten the code to new API based upon mmap() entirely without adding new system calls. Much of the above code is no longer needed since mmap takes care of lots of these checks already. I am running some final tests on v2 patch series and will send it out shortly. Thanks, Khalid
diff --git a/mm/internal.h b/mm/internal.h index cf50a471384e..68f82f0f8b66 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -718,6 +718,8 @@ void vunmap_range_noflush(unsigned long start, unsigned long end); int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, unsigned long addr, int page_nid, int *flags); +extern vm_fault_t find_shared_vma(struct vm_area_struct **vma, + unsigned long *addrp); static inline bool vma_is_shared(const struct vm_area_struct *vma) { return vma->vm_flags & VM_SHARED_PT; diff --git a/mm/memory.c b/mm/memory.c index c125c4969913..c77c0d643ea8 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4776,6 +4776,7 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags, struct pt_regs *regs) { vm_fault_t ret; + bool shared = false; __set_current_state(TASK_RUNNING); @@ -4785,6 +4786,15 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, /* do counter updates before entering really critical section. */ check_sync_rss_stat(current); + if (unlikely(vma_is_shared(vma))) { + ret = find_shared_vma(&vma, &address); + if (ret) + return ret; + if (!vma) + return VM_FAULT_SIGSEGV; + shared = true; + } + if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE, flags & FAULT_FLAG_INSTRUCTION, flags & FAULT_FLAG_REMOTE)) @@ -4802,6 +4812,31 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, else ret = __handle_mm_fault(vma, address, flags); + /* + * Release the read lock on shared VMA's parent mm unless + * __handle_mm_fault released the lock already. + * __handle_mm_fault sets VM_FAULT_RETRY in return value if + * it released mmap lock. If lock was released, that implies + * the lock would have been released on task's original mm if + * this were not a shared PTE vma. To keep lock state consistent, + * make sure to release the lock on task's original mm + */ + if (shared) { + int release_mmlock = 1; + + if (!(ret & VM_FAULT_RETRY)) { + mmap_read_unlock(vma->vm_mm); + release_mmlock = 0; + } else if ((flags & FAULT_FLAG_ALLOW_RETRY) && + (flags & FAULT_FLAG_RETRY_NOWAIT)) { + mmap_read_unlock(vma->vm_mm); + release_mmlock = 0; + } + + if (release_mmlock) + mmap_read_unlock(current->mm); + } + if (flags & FAULT_FLAG_USER) { mem_cgroup_exit_user_fault(); /* diff --git a/mm/mshare.c b/mm/mshare.c index cd2f7ad24d9d..d1896adcb00f 100644 --- a/mm/mshare.c +++ b/mm/mshare.c @@ -17,18 +17,49 @@ #include <linux/pseudo_fs.h> #include <linux/fileattr.h> #include <linux/refcount.h> +#include <linux/mman.h> #include <linux/sched/mm.h> #include <uapi/linux/magic.h> #include <uapi/linux/limits.h> struct mshare_data { - struct mm_struct *mm; + struct mm_struct *mm, *host_mm; mode_t mode; refcount_t refcnt; }; static struct super_block *msharefs_sb; +/* Returns holding the host mm's lock for read. Caller must release. */ +vm_fault_t +find_shared_vma(struct vm_area_struct **vmap, unsigned long *addrp) +{ + struct vm_area_struct *vma, *guest = *vmap; + struct mshare_data *info = guest->vm_private_data; + struct mm_struct *host_mm = info->mm; + unsigned long host_addr; + pgd_t *pgd, *guest_pgd; + + host_addr = *addrp - guest->vm_start + host_mm->mmap_base; + pgd = pgd_offset(host_mm, host_addr); + guest_pgd = pgd_offset(current->mm, *addrp); + if (!pgd_same(*guest_pgd, *pgd)) { + set_pgd(guest_pgd, *pgd); + return VM_FAULT_NOPAGE; + } + + *addrp = host_addr; + mmap_read_lock(host_mm); + vma = find_vma(host_mm, host_addr); + + /* XXX: expand stack? */ + if (vma && vma->vm_start > host_addr) + vma = NULL; + + *vmap = vma; + return 0; +} + static void msharefs_evict_inode(struct inode *inode) { @@ -169,11 +200,13 @@ SYSCALL_DEFINE5(mshare, const char __user *, name, unsigned long, addr, unsigned long, len, int, oflag, mode_t, mode) { struct mshare_data *info; - struct mm_struct *mm; struct filename *fname = getname(name); struct dentry *dentry; struct inode *inode; struct qstr namestr; + struct vm_area_struct *vma, *next, *new_vma; + struct mm_struct *new_mm; + unsigned long end; int err = PTR_ERR(fname); /* @@ -193,6 +226,8 @@ SYSCALL_DEFINE5(mshare, const char __user *, name, unsigned long, addr, if (IS_ERR(fname)) goto err_out; + end = addr + len; + /* * Does this mshare entry exist already? If it does, calling * mshare with O_EXCL|O_CREAT is an error @@ -205,49 +240,165 @@ SYSCALL_DEFINE5(mshare, const char __user *, name, unsigned long, addr, inode_lock(d_inode(msharefs_sb->s_root)); dentry = d_lookup(msharefs_sb->s_root, &namestr); if (dentry && (oflag & (O_EXCL|O_CREAT))) { + inode = d_inode(dentry); err = -EEXIST; dput(dentry); goto err_unlock_inode; } if (dentry) { + unsigned long mapaddr, prot = PROT_NONE; + inode = d_inode(dentry); if (inode == NULL) { + mmap_write_unlock(current->mm); err = -EINVAL; goto err_out; } info = inode->i_private; - refcount_inc(&info->refcnt); dput(dentry); + + /* + * Map in the address range as anonymous mappings + */ + oflag &= (O_RDONLY | O_WRONLY | O_RDWR); + if (oflag & O_RDONLY) + prot |= PROT_READ; + else if (oflag & O_WRONLY) + prot |= PROT_WRITE; + else if (oflag & O_RDWR) + prot |= (PROT_READ | PROT_WRITE); + mapaddr = vm_mmap(NULL, addr, len, prot, + MAP_FIXED | MAP_SHARED | MAP_ANONYMOUS, 0); + if (IS_ERR((void *)mapaddr)) { + err = -EINVAL; + goto err_out; + } + + refcount_inc(&info->refcnt); + + /* + * Now that we have mmap'd the mshare'd range, update vma + * flags and vm_mm pointer for this mshare'd range. + */ + mmap_write_lock(current->mm); + vma = find_vma(current->mm, addr); + if (vma && vma->vm_start < addr) { + mmap_write_unlock(current->mm); + err = -EINVAL; + goto err_out; + } + + while (vma && vma->vm_start < (addr + len)) { + vma->vm_private_data = info; + vma->vm_mm = info->mm; + vma->vm_flags |= VM_SHARED_PT; + next = vma->vm_next; + vma = next; + } } else { - mm = mm_alloc(); - if (!mm) + unsigned long myaddr; + struct mm_struct *old_mm; + + old_mm = current->mm; + new_mm = mm_alloc(); + if (!new_mm) return -ENOMEM; info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) { err = -ENOMEM; goto err_relmm; } - mm->mmap_base = addr; - mm->task_size = addr + len; - if (!mm->task_size) - mm->task_size--; - info->mm = mm; + new_mm->mmap_base = addr; + new_mm->task_size = addr + len; + if (!new_mm->task_size) + new_mm->task_size--; + info->mm = new_mm; + info->host_mm = old_mm; info->mode = mode; refcount_set(&info->refcnt, 1); + + /* + * VMAs for this address range may or may not exist. + * If VMAs exist, they should be marked as shared at + * this point and page table info should be copied + * over to newly created mm_struct. TODO: If VMAs do not + * exist, create them and mark them as shared. + */ + mmap_write_lock(old_mm); + vma = find_vma_intersection(old_mm, addr, end); + if (!vma) { + err = -EINVAL; + goto unlock; + } + /* + * TODO: If the currently allocated VMA goes beyond the + * mshare'd range, this VMA needs to be split. + * + * Double check that source VMAs do not extend outside + * the range + */ + vma = find_vma(old_mm, addr + len); + if (vma && vma->vm_start < (addr + len)) { + err = -EINVAL; + goto unlock; + } + + vma = find_vma(old_mm, addr); + if (vma && vma->vm_start < addr) { + err = -EINVAL; + goto unlock; + } + + mmap_write_lock(new_mm); + while (vma && vma->vm_start < (addr + len)) { + /* + * Copy this vma over to host mm + */ + vma->vm_private_data = info; + vma->vm_mm = new_mm; + vma->vm_flags |= VM_SHARED_PT; + new_vma = vm_area_dup(vma); + if (!new_vma) { + err = -ENOMEM; + goto unlock; + } + err = insert_vm_struct(new_mm, new_vma); + if (err) + goto unlock; + + vma = vma->vm_next; + } + mmap_write_unlock(new_mm); + err = mshare_file_create(fname, oflag, info); if (err) - goto err_relinfo; + goto unlock; + + /* + * Copy over current PTEs + */ + myaddr = addr; + while (myaddr < new_mm->task_size) { + *pgd_offset(new_mm, myaddr) = *pgd_offset(old_mm, myaddr); + myaddr += PGDIR_SIZE; + } + /* + * TODO: Free the corresponding page table in calling + * process + */ } + mmap_write_unlock(current->mm); inode_unlock(d_inode(msharefs_sb->s_root)); putname(fname); return 0; -err_relinfo: +unlock: + mmap_write_unlock(current->mm); kfree(info); err_relmm: - mmput(mm); + mmput(new_mm); err_unlock_inode: inode_unlock(d_inode(msharefs_sb->s_root)); err_out: @@ -294,11 +445,24 @@ SYSCALL_DEFINE1(mshare_unlink, const char *, name) /* * Is this the last reference? + * TODO: permission checks are needed before proceeding */ if (refcount_dec_and_test(&info->refcnt)) { simple_unlink(d_inode(msharefs_sb->s_root), dentry); d_drop(dentry); d_delete(dentry); + /* + * TODO: Release all physical pages allocated for this + * mshare range and release associated page table. If + * the final unlink happens from the process that created + * mshare'd range, do we return page tables and pages to + * that process so the creating process can continue using + * the address range it had chosen to mshare at some + * point? + * + * TODO: unmap shared vmas from every task that is using + * this mshare'd range. + */ mmput(info->mm); kfree(info); } else {