Message ID | d7ae3b880dc3a26129486d5680db672289d2695c.1656531090.git.khalid.aziz@oracle.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | Add support for shared PTEs across processes | expand |
On Wed, Jun 29, 2022 at 04:53:57PM -0600, Khalid Aziz wrote: > mmap is used to establish address range for mshare region and map the > region into process's address space. Add basic mmap operation that > supports setting address range. Also fix code to not allocate new > mm_struct for files in msharefs that exist for information and not > for defining a new mshare region. > > Signed-off-by: Khalid Aziz <khalid.aziz@oracle.com> > Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org> > --- > mm/mshare.c | 48 +++++++++++++++++++++++++++++++++++++++++------- > 1 file changed, 41 insertions(+), 7 deletions(-) > > diff --git a/mm/mshare.c b/mm/mshare.c > index d238b68b0576..088a6cab1e93 100644 > --- a/mm/mshare.c > +++ b/mm/mshare.c > @@ -9,7 +9,8 @@ > * > * > * Copyright (C) 2022 Oracle Corp. All rights reserved. > - * Author: Khalid Aziz <khalid.aziz@oracle.com> > + * Authors: Khalid Aziz <khalid.aziz@oracle.com> > + * Matthew Wilcox <willy@infradead.org> > * > */ > > @@ -60,9 +61,36 @@ msharefs_read(struct kiocb *iocb, struct iov_iter *iov) > return ret; > } > > +static int > +msharefs_mmap(struct file *file, struct vm_area_struct *vma) > +{ > + struct mshare_data *info = file->private_data; > + struct mm_struct *mm = info->mm; > + > + /* > + * If this mshare region has been set up once already, bail out > + */ > + if (mm->mmap_base != 0) > + return -EINVAL; > + > + if ((vma->vm_start | vma->vm_end) & (PGDIR_SIZE - 1)) > + return -EINVAL; > + > + mm->mmap_base = vma->vm_start; > + mm->task_size = vma->vm_end - vma->vm_start; > + if (!mm->task_size) > + mm->task_size--; > + info->minfo->start = mm->mmap_base; > + info->minfo->size = mm->task_size; So, uh, if the second mmap() caller decides to ignore the mshare_info, should they get an -EINVAL here since the memory mappings won't be at the same process virtual address? 
> + vma->vm_flags |= VM_SHARED_PT; > + vma->vm_private_data = info; > + return 0; > +} > + > static const struct file_operations msharefs_file_operations = { > .open = msharefs_open, > .read_iter = msharefs_read, > + .mmap = msharefs_mmap, > .llseek = no_llseek, > }; > > @@ -119,7 +147,12 @@ msharefs_fill_mm(struct inode *inode) > goto err_free; > } > info->mm = mm; > - info->minfo = NULL; > + info->minfo = kzalloc(sizeof(struct mshare_info), GFP_KERNEL); > + if (info->minfo == NULL) { > + retval = -ENOMEM; > + goto err_free; > + } > + > refcount_set(&info->refcnt, 1); > inode->i_private = info; > > @@ -128,13 +161,14 @@ msharefs_fill_mm(struct inode *inode) > err_free: > if (mm) > mmput(mm); > + kfree(info->minfo); > kfree(info); > return retval; > } > > static struct inode > *msharefs_get_inode(struct super_block *sb, const struct inode *dir, > - umode_t mode) > + umode_t mode, bool newmm) > { > struct inode *inode = new_inode(sb); > if (inode) { > @@ -147,7 +181,7 @@ static struct inode > case S_IFREG: > inode->i_op = &msharefs_file_inode_ops; > inode->i_fop = &msharefs_file_operations; > - if (msharefs_fill_mm(inode) != 0) { > + if (newmm && msharefs_fill_mm(inode) != 0) { > discard_new_inode(inode); > inode = ERR_PTR(-ENOMEM); > } > @@ -177,7 +211,7 @@ msharefs_mknod(struct user_namespace *mnt_userns, struct inode *dir, > struct inode *inode; > int err = 0; > > - inode = msharefs_get_inode(dir->i_sb, dir, mode); > + inode = msharefs_get_inode(dir->i_sb, dir, mode, true); > if (IS_ERR(inode)) > return PTR_ERR(inode); > > @@ -267,7 +301,7 @@ prepopulate_files(struct super_block *s, struct inode *dir, > if (!dentry) > return -ENOMEM; > > - inode = msharefs_get_inode(s, dir, S_IFREG | files->mode); > + inode = msharefs_get_inode(s, dir, S_IFREG | files->mode, false); I was wondering why the information files were getting their own mshare_data. 
TBH I'm not really sure what the difference is between mshare_data and mshare_info, since those names are not especially distinct. > if (!inode) { > dput(dentry); > return -ENOMEM; > @@ -301,7 +335,7 @@ msharefs_fill_super(struct super_block *sb, struct fs_context *fc) > sb->s_d_op = &msharefs_d_ops; > sb->s_time_gran = 1; > > - inode = msharefs_get_inode(sb, NULL, S_IFDIR | 0777); > + inode = msharefs_get_inode(sb, NULL, S_IFDIR | 0777, false); Is it wise to default to world-writable? Surely whatever userspace software wraps an msharefs can relax permissions as needed. --D > if (!inode) { > err = -ENOMEM; > goto out; > -- > 2.32.0 >
On 6/30/22 15:44, Darrick J. Wong wrote: > On Wed, Jun 29, 2022 at 04:53:57PM -0600, Khalid Aziz wrote: >> mmap is used to establish address range for mshare region and map the >> region into process's address space. Add basic mmap operation that >> supports setting address range. Also fix code to not allocate new >> mm_struct for files in msharefs that exist for information and not >> for defining a new mshare region. >> >> Signed-off-by: Khalid Aziz <khalid.aziz@oracle.com> >> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org> >> --- >> mm/mshare.c | 48 +++++++++++++++++++++++++++++++++++++++++------- >> 1 file changed, 41 insertions(+), 7 deletions(-) >> >> diff --git a/mm/mshare.c b/mm/mshare.c >> index d238b68b0576..088a6cab1e93 100644 >> --- a/mm/mshare.c >> +++ b/mm/mshare.c >> @@ -9,7 +9,8 @@ >> * >> * >> * Copyright (C) 2022 Oracle Corp. All rights reserved. >> - * Author: Khalid Aziz <khalid.aziz@oracle.com> >> + * Authors: Khalid Aziz <khalid.aziz@oracle.com> >> + * Matthew Wilcox <willy@infradead.org> >> * >> */ >> >> @@ -60,9 +61,36 @@ msharefs_read(struct kiocb *iocb, struct iov_iter *iov) >> return ret; >> } >> >> +static int >> +msharefs_mmap(struct file *file, struct vm_area_struct *vma) >> +{ >> + struct mshare_data *info = file->private_data; >> + struct mm_struct *mm = info->mm; >> + >> + /* >> + * If this mshare region has been set up once already, bail out >> + */ >> + if (mm->mmap_base != 0) >> + return -EINVAL; >> + >> + if ((vma->vm_start | vma->vm_end) & (PGDIR_SIZE - 1)) >> + return -EINVAL; >> + >> + mm->mmap_base = vma->vm_start; >> + mm->task_size = vma->vm_end - vma->vm_start; >> + if (!mm->task_size) >> + mm->task_size--; >> + info->minfo->start = mm->mmap_base; >> + info->minfo->size = mm->task_size; > > So, uh, if the second mmap() caller decides to ignore the mshare_info, > should they get an -EINVAL here since the memory mappings won't be at > the same process virtual address? Yes, that is in patch 9. 
A second mmap will result in EINVAL until patch 9 irrespective of address and size passed to mmap. > >> + vma->vm_flags |= VM_SHARED_PT; >> + vma->vm_private_data = info; >> + return 0; >> +} >> + >> static const struct file_operations msharefs_file_operations = { >> .open = msharefs_open, >> .read_iter = msharefs_read, >> + .mmap = msharefs_mmap, >> .llseek = no_llseek, >> }; >> >> @@ -119,7 +147,12 @@ msharefs_fill_mm(struct inode *inode) >> goto err_free; >> } >> info->mm = mm; >> - info->minfo = NULL; >> + info->minfo = kzalloc(sizeof(struct mshare_info), GFP_KERNEL); >> + if (info->minfo == NULL) { >> + retval = -ENOMEM; >> + goto err_free; >> + } >> + >> refcount_set(&info->refcnt, 1); >> inode->i_private = info; >> >> @@ -128,13 +161,14 @@ msharefs_fill_mm(struct inode *inode) >> err_free: >> if (mm) >> mmput(mm); >> + kfree(info->minfo); >> kfree(info); >> return retval; >> } >> >> static struct inode >> *msharefs_get_inode(struct super_block *sb, const struct inode *dir, >> - umode_t mode) >> + umode_t mode, bool newmm) >> { >> struct inode *inode = new_inode(sb); >> if (inode) { >> @@ -147,7 +181,7 @@ static struct inode >> case S_IFREG: >> inode->i_op = &msharefs_file_inode_ops; >> inode->i_fop = &msharefs_file_operations; >> - if (msharefs_fill_mm(inode) != 0) { >> + if (newmm && msharefs_fill_mm(inode) != 0) { >> discard_new_inode(inode); >> inode = ERR_PTR(-ENOMEM); >> } >> @@ -177,7 +211,7 @@ msharefs_mknod(struct user_namespace *mnt_userns, struct inode *dir, >> struct inode *inode; >> int err = 0; >> >> - inode = msharefs_get_inode(dir->i_sb, dir, mode); >> + inode = msharefs_get_inode(dir->i_sb, dir, mode, true); >> if (IS_ERR(inode)) >> return PTR_ERR(inode); >> >> @@ -267,7 +301,7 @@ prepopulate_files(struct super_block *s, struct inode *dir, >> if (!dentry) >> return -ENOMEM; >> >> - inode = msharefs_get_inode(s, dir, S_IFREG | files->mode); >> + inode = msharefs_get_inode(s, dir, S_IFREG | files->mode, false); > > I was wondering why the 
information files were getting their own > mshare_data. > > TBH I'm not really sure what the difference is between mshare_data and > mshare_info, since those names are not especially distinct. mshare_data is a superset and is internal, while mshare_info is what is sent back to userspace when it reads a file representing an mshare region. > >> if (!inode) { >> dput(dentry); >> return -ENOMEM; >> @@ -301,7 +335,7 @@ msharefs_fill_super(struct super_block *sb, struct fs_context *fc) >> sb->s_d_op = &msharefs_d_ops; >> sb->s_time_gran = 1; >> >> - inode = msharefs_get_inode(sb, NULL, S_IFDIR | 0777); >> + inode = msharefs_get_inode(sb, NULL, S_IFDIR | 0777, false); > > Is it wise to default to world-writable? Surely whatever userspace > software wraps an msharefs can relax permissions as needed. > Since this is for the root inode, the default is chosen so that any process can create an mshare region in msharefs, which I think is the most flexible. An individual userspace app can create mshare regions with any permissions it deems fit using open(). Does that make sense? Thanks, Khalid
diff --git a/mm/mshare.c b/mm/mshare.c index d238b68b0576..088a6cab1e93 100644 --- a/mm/mshare.c +++ b/mm/mshare.c @@ -9,7 +9,8 @@ * * * Copyright (C) 2022 Oracle Corp. All rights reserved. - * Author: Khalid Aziz <khalid.aziz@oracle.com> + * Authors: Khalid Aziz <khalid.aziz@oracle.com> + * Matthew Wilcox <willy@infradead.org> * */ @@ -60,9 +61,36 @@ msharefs_read(struct kiocb *iocb, struct iov_iter *iov) return ret; } +static int +msharefs_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct mshare_data *info = file->private_data; + struct mm_struct *mm = info->mm; + + /* + * If this mshare region has been set up once already, bail out + */ + if (mm->mmap_base != 0) + return -EINVAL; + + if ((vma->vm_start | vma->vm_end) & (PGDIR_SIZE - 1)) + return -EINVAL; + + mm->mmap_base = vma->vm_start; + mm->task_size = vma->vm_end - vma->vm_start; + if (!mm->task_size) + mm->task_size--; + info->minfo->start = mm->mmap_base; + info->minfo->size = mm->task_size; + vma->vm_flags |= VM_SHARED_PT; + vma->vm_private_data = info; + return 0; +} + static const struct file_operations msharefs_file_operations = { .open = msharefs_open, .read_iter = msharefs_read, + .mmap = msharefs_mmap, .llseek = no_llseek, }; @@ -119,7 +147,12 @@ msharefs_fill_mm(struct inode *inode) goto err_free; } info->mm = mm; - info->minfo = NULL; + info->minfo = kzalloc(sizeof(struct mshare_info), GFP_KERNEL); + if (info->minfo == NULL) { + retval = -ENOMEM; + goto err_free; + } + refcount_set(&info->refcnt, 1); inode->i_private = info; @@ -128,13 +161,14 @@ msharefs_fill_mm(struct inode *inode) err_free: if (mm) mmput(mm); + kfree(info->minfo); kfree(info); return retval; } static struct inode *msharefs_get_inode(struct super_block *sb, const struct inode *dir, - umode_t mode) + umode_t mode, bool newmm) { struct inode *inode = new_inode(sb); if (inode) { @@ -147,7 +181,7 @@ static struct inode case S_IFREG: inode->i_op = &msharefs_file_inode_ops; inode->i_fop = &msharefs_file_operations; - if 
(msharefs_fill_mm(inode) != 0) { + if (newmm && msharefs_fill_mm(inode) != 0) { discard_new_inode(inode); inode = ERR_PTR(-ENOMEM); } @@ -177,7 +211,7 @@ msharefs_mknod(struct user_namespace *mnt_userns, struct inode *dir, struct inode *inode; int err = 0; - inode = msharefs_get_inode(dir->i_sb, dir, mode); + inode = msharefs_get_inode(dir->i_sb, dir, mode, true); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -267,7 +301,7 @@ prepopulate_files(struct super_block *s, struct inode *dir, if (!dentry) return -ENOMEM; - inode = msharefs_get_inode(s, dir, S_IFREG | files->mode); + inode = msharefs_get_inode(s, dir, S_IFREG | files->mode, false); if (!inode) { dput(dentry); return -ENOMEM; @@ -301,7 +335,7 @@ msharefs_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_d_op = &msharefs_d_ops; sb->s_time_gran = 1; - inode = msharefs_get_inode(sb, NULL, S_IFDIR | 0777); + inode = msharefs_get_inode(sb, NULL, S_IFDIR | 0777, false); if (!inode) { err = -ENOMEM; goto out;