diff mbox series

[RFC,1/3] mm: Add f_ops->populate()

Message ID 20220306053211.135762-2-jarkko@kernel.org (mailing list archive)
State New, archived
Headers show
Series MAP_POPULATE for device memory | expand

Commit Message

Jarkko Sakkinen March 6, 2022, 5:32 a.m. UTC
Sometimes you might want to use MAP_POPULATE to ask a device driver to
initialize the device memory in some specific manner. SGX driver can use
this to request more memory by issuing ENCLS[EAUG] x86 opcode for each
page in the address range.

Add f_ops->populate() with the same parameters as f_ops->mmap() and make
it conditionally called inside call_mmap(). Update call sites
accordingly.
---
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
v3:
-       if (!ret && do_populate && file->f_op->populate)
+       if (!ret && do_populate && file->f_op->populate &&
+           !!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
(reported by Matthew Wilcox)
v2:
-       if (!ret && do_populate)
+       if (!ret && do_populate && file->f_op->populate)
(reported by Jan Harkes)
---
 arch/mips/kernel/vdso.c                    |  2 +-
 drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c |  2 +-
 fs/coda/file.c                             |  2 +-
 fs/overlayfs/file.c                        |  2 +-
 include/linux/fs.h                         | 12 ++++++++++--
 include/linux/mm.h                         |  2 +-
 ipc/shm.c                                  |  2 +-
 mm/mmap.c                                  | 10 +++++-----
 mm/nommu.c                                 |  4 ++--
 9 files changed, 23 insertions(+), 15 deletions(-)

Comments

Greg Kroah-Hartman March 6, 2022, 10:01 a.m. UTC | #1
On Sun, Mar 06, 2022 at 07:32:05AM +0200, Jarkko Sakkinen wrote:
> Sometimes you might want to use MAP_POPULATE to ask a device driver to
> initialize the device memory in some specific manner. SGX driver can use
> this to request more memory by issuing ENCLS[EAUG] x86 opcode for each
> page in the address range.
> 
> Add f_ops->populate() with the same parameters as f_ops->mmap() and make
> it conditionally called inside call_mmap(). Update call sites
> accodingly.
> ---
> Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
> v3:
> -       if (!ret && do_populate && file->f_op->populate)
> +       if (!ret && do_populate && file->f_op->populate &&
> +           !!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
> (reported by Matthew Wilcox)
> v2:
> -       if (!ret && do_populate)
> +       if (!ret && do_populate && file->f_op->populate)
> (reported by Jan Harkes)
> ---
>  arch/mips/kernel/vdso.c                    |  2 +-
>  drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c |  2 +-
>  fs/coda/file.c                             |  2 +-
>  fs/overlayfs/file.c                        |  2 +-
>  include/linux/fs.h                         | 12 ++++++++++--
>  include/linux/mm.h                         |  2 +-
>  ipc/shm.c                                  |  2 +-
>  mm/mmap.c                                  | 10 +++++-----
>  mm/nommu.c                                 |  4 ++--
>  9 files changed, 23 insertions(+), 15 deletions(-)
> 
> diff --git a/arch/mips/kernel/vdso.c b/arch/mips/kernel/vdso.c
> index 3d0cf471f2fe..89f3f3da9abd 100644
> --- a/arch/mips/kernel/vdso.c
> +++ b/arch/mips/kernel/vdso.c
> @@ -102,7 +102,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
>  		base = mmap_region(NULL, STACK_TOP, PAGE_SIZE,
>  				VM_READ | VM_EXEC |
>  				VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC,
> -				0, NULL);
> +				0, NULL, false);
>  		if (IS_ERR_VALUE(base)) {
>  			ret = base;
>  			goto out;
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c b/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c
> index 1b526039a60d..4c71f64d6a79 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c
> @@ -107,7 +107,7 @@ static int i915_gem_dmabuf_mmap(struct dma_buf *dma_buf, struct vm_area_struct *
>  	if (!obj->base.filp)
>  		return -ENODEV;
>  
> -	ret = call_mmap(obj->base.filp, vma);
> +	ret = call_mmap(obj->base.filp, vma, false);
>  	if (ret)
>  		return ret;
>  
> diff --git a/fs/coda/file.c b/fs/coda/file.c
> index 29dd87be2fb8..e14f312fdbf8 100644
> --- a/fs/coda/file.c
> +++ b/fs/coda/file.c
> @@ -173,7 +173,7 @@ coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma)
>  	spin_unlock(&cii->c_lock);
>  
>  	vma->vm_file = get_file(host_file);
> -	ret = call_mmap(vma->vm_file, vma);
> +	ret = call_mmap(vma->vm_file, vma, false);
>  
>  	if (ret) {
>  		/* if call_mmap fails, our caller will put host_file so we
> diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
> index fa125feed0ff..b963a9397e80 100644
> --- a/fs/overlayfs/file.c
> +++ b/fs/overlayfs/file.c
> @@ -503,7 +503,7 @@ static int ovl_mmap(struct file *file, struct vm_area_struct *vma)
>  	vma_set_file(vma, realfile);
>  
>  	old_cred = ovl_override_creds(file_inode(file)->i_sb);
> -	ret = call_mmap(vma->vm_file, vma);
> +	ret = call_mmap(vma->vm_file, vma, false);
>  	revert_creds(old_cred);
>  	ovl_file_accessed(file);
>  
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index e2d892b201b0..2909e2d14af8 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -42,6 +42,7 @@
>  #include <linux/mount.h>
>  #include <linux/cred.h>
>  #include <linux/mnt_idmapping.h>
> +#include <linux/mm.h>
>  
>  #include <asm/byteorder.h>
>  #include <uapi/linux/fs.h>
> @@ -1993,6 +1994,7 @@ struct file_operations {
>  	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
>  	long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
>  	int (*mmap) (struct file *, struct vm_area_struct *);
> +	int (*populate)(struct file *, struct vm_area_struct *);
>  	unsigned long mmap_supported_flags;
>  	int (*open) (struct inode *, struct file *);
>  	int (*flush) (struct file *, fl_owner_t id);
> @@ -2074,9 +2076,15 @@ static inline ssize_t call_write_iter(struct file *file, struct kiocb *kio,
>  	return file->f_op->write_iter(kio, iter);
>  }
>  
> -static inline int call_mmap(struct file *file, struct vm_area_struct *vma)
> +static inline int call_mmap(struct file *file, struct vm_area_struct *vma, bool do_populate)
>  {
> -	return file->f_op->mmap(file, vma);
> +	int ret = file->f_op->mmap(file, vma);
> +
> +	if (!ret && do_populate && file->f_op->populate &&
> +	    !!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
> +		ret = file->f_op->populate(file, vma);
> +
> +	return ret;
>  }
>  
>  extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *);
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 213cc569b192..6c8c036f423b 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -2683,7 +2683,7 @@ extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned lo
>  
>  extern unsigned long mmap_region(struct file *file, unsigned long addr,
>  	unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
> -	struct list_head *uf);
> +	struct list_head *uf, bool do_populate);

As I have said many times before, don't add random boolean flags to
function arguments, as they provide no hint as to what they do at all.
When you read the code, you then have to go back and look up the
function definition here and see what exactly it means and the flow is
broken.

Make function names mean something obvious, for this, if it really is a
good idea to have this new flag (and I doubt it, but that's not my
call), then make this a mmap_region_populate() call to make it obvious
it is something different than the normal mmap_region() call.

But as is, this is pretty horrid, don't you agree?

thanks,

greg k-h
Jarkko Sakkinen March 6, 2022, 5:02 p.m. UTC | #2
On Sun, Mar 06, 2022 at 11:01:36AM +0100, Greg Kroah-Hartman wrote:
> On Sun, Mar 06, 2022 at 07:32:05AM +0200, Jarkko Sakkinen wrote:
> > Sometimes you might want to use MAP_POPULATE to ask a device driver to
> > initialize the device memory in some specific manner. SGX driver can use
> > this to request more memory by issuing ENCLS[EAUG] x86 opcode for each
> > page in the address range.
> > 
> > Add f_ops->populate() with the same parameters as f_ops->mmap() and make
> > it conditionally called inside call_mmap(). Update call sites
> > accodingly.
> > ---
> > Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
> > v3:
> > -       if (!ret && do_populate && file->f_op->populate)
> > +       if (!ret && do_populate && file->f_op->populate &&
> > +           !!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
> > (reported by Matthew Wilcox)
> > v2:
> > -       if (!ret && do_populate)
> > +       if (!ret && do_populate && file->f_op->populate)
> > (reported by Jan Harkes)
> > ---
> >  arch/mips/kernel/vdso.c                    |  2 +-
> >  drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c |  2 +-
> >  fs/coda/file.c                             |  2 +-
> >  fs/overlayfs/file.c                        |  2 +-
> >  include/linux/fs.h                         | 12 ++++++++++--
> >  include/linux/mm.h                         |  2 +-
> >  ipc/shm.c                                  |  2 +-
> >  mm/mmap.c                                  | 10 +++++-----
> >  mm/nommu.c                                 |  4 ++--
> >  9 files changed, 23 insertions(+), 15 deletions(-)
> > 
> > diff --git a/arch/mips/kernel/vdso.c b/arch/mips/kernel/vdso.c
> > index 3d0cf471f2fe..89f3f3da9abd 100644
> > --- a/arch/mips/kernel/vdso.c
> > +++ b/arch/mips/kernel/vdso.c
> > @@ -102,7 +102,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
> >  		base = mmap_region(NULL, STACK_TOP, PAGE_SIZE,
> >  				VM_READ | VM_EXEC |
> >  				VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC,
> > -				0, NULL);
> > +				0, NULL, false);
> >  		if (IS_ERR_VALUE(base)) {
> >  			ret = base;
> >  			goto out;
> > diff --git a/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c b/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c
> > index 1b526039a60d..4c71f64d6a79 100644
> > --- a/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c
> > +++ b/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c
> > @@ -107,7 +107,7 @@ static int i915_gem_dmabuf_mmap(struct dma_buf *dma_buf, struct vm_area_struct *
> >  	if (!obj->base.filp)
> >  		return -ENODEV;
> >  
> > -	ret = call_mmap(obj->base.filp, vma);
> > +	ret = call_mmap(obj->base.filp, vma, false);
> >  	if (ret)
> >  		return ret;
> >  
> > diff --git a/fs/coda/file.c b/fs/coda/file.c
> > index 29dd87be2fb8..e14f312fdbf8 100644
> > --- a/fs/coda/file.c
> > +++ b/fs/coda/file.c
> > @@ -173,7 +173,7 @@ coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma)
> >  	spin_unlock(&cii->c_lock);
> >  
> >  	vma->vm_file = get_file(host_file);
> > -	ret = call_mmap(vma->vm_file, vma);
> > +	ret = call_mmap(vma->vm_file, vma, false);
> >  
> >  	if (ret) {
> >  		/* if call_mmap fails, our caller will put host_file so we
> > diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
> > index fa125feed0ff..b963a9397e80 100644
> > --- a/fs/overlayfs/file.c
> > +++ b/fs/overlayfs/file.c
> > @@ -503,7 +503,7 @@ static int ovl_mmap(struct file *file, struct vm_area_struct *vma)
> >  	vma_set_file(vma, realfile);
> >  
> >  	old_cred = ovl_override_creds(file_inode(file)->i_sb);
> > -	ret = call_mmap(vma->vm_file, vma);
> > +	ret = call_mmap(vma->vm_file, vma, false);
> >  	revert_creds(old_cred);
> >  	ovl_file_accessed(file);
> >  
> > diff --git a/include/linux/fs.h b/include/linux/fs.h
> > index e2d892b201b0..2909e2d14af8 100644
> > --- a/include/linux/fs.h
> > +++ b/include/linux/fs.h
> > @@ -42,6 +42,7 @@
> >  #include <linux/mount.h>
> >  #include <linux/cred.h>
> >  #include <linux/mnt_idmapping.h>
> > +#include <linux/mm.h>
> >  
> >  #include <asm/byteorder.h>
> >  #include <uapi/linux/fs.h>
> > @@ -1993,6 +1994,7 @@ struct file_operations {
> >  	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
> >  	long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
> >  	int (*mmap) (struct file *, struct vm_area_struct *);
> > +	int (*populate)(struct file *, struct vm_area_struct *);
> >  	unsigned long mmap_supported_flags;
> >  	int (*open) (struct inode *, struct file *);
> >  	int (*flush) (struct file *, fl_owner_t id);
> > @@ -2074,9 +2076,15 @@ static inline ssize_t call_write_iter(struct file *file, struct kiocb *kio,
> >  	return file->f_op->write_iter(kio, iter);
> >  }
> >  
> > -static inline int call_mmap(struct file *file, struct vm_area_struct *vma)
> > +static inline int call_mmap(struct file *file, struct vm_area_struct *vma, bool do_populate)
> >  {
> > -	return file->f_op->mmap(file, vma);
> > +	int ret = file->f_op->mmap(file, vma);
> > +
> > +	if (!ret && do_populate && file->f_op->populate &&
> > +	    !!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
> > +		ret = file->f_op->populate(file, vma);
> > +
> > +	return ret;
> >  }
> >  
> >  extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *);
> > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > index 213cc569b192..6c8c036f423b 100644
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -2683,7 +2683,7 @@ extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned lo
> >  
> >  extern unsigned long mmap_region(struct file *file, unsigned long addr,
> >  	unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
> > -	struct list_head *uf);
> > +	struct list_head *uf, bool do_populate);
> 
> As I have said many times before, don't add random boolean flags to
> function arguments, as they provide no hint as to what they do at all.
> When you read the code, you then have to go back and look up the
> function definition here and see what exactly it means and the flow is
> broken.
> 
> Make function names mean something obvious, for this, if it really is a
> good idea to have this new flag (and I doubt it, but that's not my
> call), then make this a mmap_region_populate() call to make it obvious
> it is something different than the notmal mmap_region() call.

I can create:

* mmap_region_populate()
* call_mmap_populate()

This would localize the changes and leave out those boolean parameters.

> But as is, this is pretty horrid, don't you agree?

So can I conclude from this that in general having populate available for
device memory is something horrid, or just the implementation path?

That's the main reason why I made this RFC patch set, to get clear answer
to that question. I.e. if it is in general sense a bad idea, I'll just
create ioctl. If it is the implementation, I'll try to improve it.

Otherwise, I don't know whether or not it is a good idea to include such a
patch in the main SGX2 patch set. I have no intention to forcibly try to push
support for IO memory populate.

> thanks,
> 
> greg k-h

BR, Jarkko
Jarkko Sakkinen March 6, 2022, 5:03 p.m. UTC | #3
On Sun, Mar 06, 2022 at 07:03:00PM +0200, Jarkko Sakkinen wrote:
> On Sun, Mar 06, 2022 at 11:01:36AM +0100, Greg Kroah-Hartman wrote:
> > On Sun, Mar 06, 2022 at 07:32:05AM +0200, Jarkko Sakkinen wrote:
> > > Sometimes you might want to use MAP_POPULATE to ask a device driver to
> > > initialize the device memory in some specific manner. SGX driver can use
> > > this to request more memory by issuing ENCLS[EAUG] x86 opcode for each
> > > page in the address range.
> > > 
> > > Add f_ops->populate() with the same parameters as f_ops->mmap() and make
> > > it conditionally called inside call_mmap(). Update call sites
> > > accodingly.
> > > ---
> > > Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
> > > v3:
> > > -       if (!ret && do_populate && file->f_op->populate)
> > > +       if (!ret && do_populate && file->f_op->populate &&
> > > +           !!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
> > > (reported by Matthew Wilcox)
> > > v2:
> > > -       if (!ret && do_populate)
> > > +       if (!ret && do_populate && file->f_op->populate)
> > > (reported by Jan Harkes)
> > > ---
> > >  arch/mips/kernel/vdso.c                    |  2 +-
> > >  drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c |  2 +-
> > >  fs/coda/file.c                             |  2 +-
> > >  fs/overlayfs/file.c                        |  2 +-
> > >  include/linux/fs.h                         | 12 ++++++++++--
> > >  include/linux/mm.h                         |  2 +-
> > >  ipc/shm.c                                  |  2 +-
> > >  mm/mmap.c                                  | 10 +++++-----
> > >  mm/nommu.c                                 |  4 ++--
> > >  9 files changed, 23 insertions(+), 15 deletions(-)
> > > 
> > > diff --git a/arch/mips/kernel/vdso.c b/arch/mips/kernel/vdso.c
> > > index 3d0cf471f2fe..89f3f3da9abd 100644
> > > --- a/arch/mips/kernel/vdso.c
> > > +++ b/arch/mips/kernel/vdso.c
> > > @@ -102,7 +102,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
> > >  		base = mmap_region(NULL, STACK_TOP, PAGE_SIZE,
> > >  				VM_READ | VM_EXEC |
> > >  				VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC,
> > > -				0, NULL);
> > > +				0, NULL, false);
> > >  		if (IS_ERR_VALUE(base)) {
> > >  			ret = base;
> > >  			goto out;
> > > diff --git a/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c b/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c
> > > index 1b526039a60d..4c71f64d6a79 100644
> > > --- a/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c
> > > +++ b/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c
> > > @@ -107,7 +107,7 @@ static int i915_gem_dmabuf_mmap(struct dma_buf *dma_buf, struct vm_area_struct *
> > >  	if (!obj->base.filp)
> > >  		return -ENODEV;
> > >  
> > > -	ret = call_mmap(obj->base.filp, vma);
> > > +	ret = call_mmap(obj->base.filp, vma, false);
> > >  	if (ret)
> > >  		return ret;
> > >  
> > > diff --git a/fs/coda/file.c b/fs/coda/file.c
> > > index 29dd87be2fb8..e14f312fdbf8 100644
> > > --- a/fs/coda/file.c
> > > +++ b/fs/coda/file.c
> > > @@ -173,7 +173,7 @@ coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma)
> > >  	spin_unlock(&cii->c_lock);
> > >  
> > >  	vma->vm_file = get_file(host_file);
> > > -	ret = call_mmap(vma->vm_file, vma);
> > > +	ret = call_mmap(vma->vm_file, vma, false);
> > >  
> > >  	if (ret) {
> > >  		/* if call_mmap fails, our caller will put host_file so we
> > > diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
> > > index fa125feed0ff..b963a9397e80 100644
> > > --- a/fs/overlayfs/file.c
> > > +++ b/fs/overlayfs/file.c
> > > @@ -503,7 +503,7 @@ static int ovl_mmap(struct file *file, struct vm_area_struct *vma)
> > >  	vma_set_file(vma, realfile);
> > >  
> > >  	old_cred = ovl_override_creds(file_inode(file)->i_sb);
> > > -	ret = call_mmap(vma->vm_file, vma);
> > > +	ret = call_mmap(vma->vm_file, vma, false);
> > >  	revert_creds(old_cred);
> > >  	ovl_file_accessed(file);
> > >  
> > > diff --git a/include/linux/fs.h b/include/linux/fs.h
> > > index e2d892b201b0..2909e2d14af8 100644
> > > --- a/include/linux/fs.h
> > > +++ b/include/linux/fs.h
> > > @@ -42,6 +42,7 @@
> > >  #include <linux/mount.h>
> > >  #include <linux/cred.h>
> > >  #include <linux/mnt_idmapping.h>
> > > +#include <linux/mm.h>
> > >  
> > >  #include <asm/byteorder.h>
> > >  #include <uapi/linux/fs.h>
> > > @@ -1993,6 +1994,7 @@ struct file_operations {
> > >  	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
> > >  	long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
> > >  	int (*mmap) (struct file *, struct vm_area_struct *);
> > > +	int (*populate)(struct file *, struct vm_area_struct *);
> > >  	unsigned long mmap_supported_flags;
> > >  	int (*open) (struct inode *, struct file *);
> > >  	int (*flush) (struct file *, fl_owner_t id);
> > > @@ -2074,9 +2076,15 @@ static inline ssize_t call_write_iter(struct file *file, struct kiocb *kio,
> > >  	return file->f_op->write_iter(kio, iter);
> > >  }
> > >  
> > > -static inline int call_mmap(struct file *file, struct vm_area_struct *vma)
> > > +static inline int call_mmap(struct file *file, struct vm_area_struct *vma, bool do_populate)
> > >  {
> > > -	return file->f_op->mmap(file, vma);
> > > +	int ret = file->f_op->mmap(file, vma);
> > > +
> > > +	if (!ret && do_populate && file->f_op->populate &&
> > > +	    !!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
> > > +		ret = file->f_op->populate(file, vma);
> > > +
> > > +	return ret;
> > >  }
> > >  
> > >  extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *);
> > > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > > index 213cc569b192..6c8c036f423b 100644
> > > --- a/include/linux/mm.h
> > > +++ b/include/linux/mm.h
> > > @@ -2683,7 +2683,7 @@ extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned lo
> > >  
> > >  extern unsigned long mmap_region(struct file *file, unsigned long addr,
> > >  	unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
> > > -	struct list_head *uf);
> > > +	struct list_head *uf, bool do_populate);
> > 
> > As I have said many times before, don't add random boolean flags to
> > function arguments, as they provide no hint as to what they do at all.
> > When you read the code, you then have to go back and look up the
> > function definition here and see what exactly it means and the flow is
> > broken.
> > 
> > Make function names mean something obvious, for this, if it really is a
> > good idea to have this new flag (and I doubt it, but that's not my
> > call), then make this a mmap_region_populate() call to make it obvious
> > it is something different than the notmal mmap_region() call.
> 
> I can create:
> 
> * mmap_region_populate()
> * call_mmap_populate()
> 
> This would localize the changes and leave out those boolean parameters.
> 
> > But as is, this is pretty horrid, don't you agree?
> 
> So can I conclude from this that in general having populate available for
> device memory is something horrid, or just the implementation path?
> 
> That's the main reason why I made this RFC patch set, to get clear answer
> to that question. I.e. if it is in general sense a bad idea, I'll just
> create ioctl. If it is the implementation, I'll try to improve it.
> 
> Otherwise, I don't know whether or not it is good idea to include such
> patch into the main SGX2 patch set. No means enforcibl tryy to push support
                                         ~~~~~
                                         intention

BR, Jarkko
Matthew Wilcox (Oracle) March 6, 2022, 10:43 p.m. UTC | #4
On Sun, Mar 06, 2022 at 07:02:57PM +0200, Jarkko Sakkinen wrote:
> So can I conclude from this that in general having populate available for
> device memory is something horrid, or just the implementation path?

You haven't even attempted to explain what the problem is you're trying
to solve.  You've shown up with some terrible code and said "Hey, is
this a good idea".  No, no, it's not.
Jarkko Sakkinen March 7, 2022, 1:16 p.m. UTC | #5
On Sun, Mar 06, 2022 at 10:43:31PM +0000, Matthew Wilcox wrote:
> On Sun, Mar 06, 2022 at 07:02:57PM +0200, Jarkko Sakkinen wrote:
> > So can I conclude from this that in general having populate available for
> > device memory is something horrid, or just the implementation path?
> 
> You haven't even attempted to explain what the problem is you're trying
> to solve.  You've shown up with some terrible code and said "Hey, is
> this a good idea".  No, no, it's not.

The problem is that in order to add memory to an enclave, which is
essentially a reserved address range in the process's virtual address space,
there are two steps to it:

1. Host side (kernel) does ENCLS[EAUG] to request a new page to be
   added to the enclave.
2. Enclave accepts request with ENCLU[EACCEPT] or ENCLU[EACCEPTCOPY].

In the current SGX2 patch set this is taken care of by the page fault
handler. I.e. the enclave calls ENCLU[EACCEPT] for an empty address
and the #PF handler then does EAUG for a single page.

So if you want to process a batch of pages this generates O(n)
round-trips.

So if there was a way to pre-do a batch of EAUGs, that would allow
loading data into the enclave without page faults happening
constantly.

One solution for this is to simply add an ioctl:

https://lore.kernel.org/linux-sgx/YiLRBglTEbu8cHP9@iki.fi/T/#m195ec84bf85614a140abeee245c5118c22ace8f3

But in practice, when you wanted to use it, you would set up the
parameters so that they match the mmap() range. So for a practical
user space API, having mmap() take care of this would be a much
leaner option.

BR, Jarkko
Jarkko Sakkinen March 7, 2022, 1:26 p.m. UTC | #6
On Mon, Mar 07, 2022 at 03:16:57PM +0200, Jarkko Sakkinen wrote:
> On Sun, Mar 06, 2022 at 10:43:31PM +0000, Matthew Wilcox wrote:
> > On Sun, Mar 06, 2022 at 07:02:57PM +0200, Jarkko Sakkinen wrote:
> > > So can I conclude from this that in general having populate available for
> > > device memory is something horrid, or just the implementation path?
> > 
> > You haven't even attempted to explain what the problem is you're trying
> > to solve.  You've shown up with some terrible code and said "Hey, is
> > this a good idea".  No, no, it's not.
> 
> The problem is that in order to include memory to enclave, which is
> essentially a reserved address range processes virtual address space
> there's two steps into it:
> 
> 1. Host side (kernel) does ENCLS[EAUG] to request a new page to be
>    added to the enclave.
> 2. Enclave accepts request with ENCLU[EACCEPT] or ENCLU[EACCEPTCOPY].
> 
> In the current SGX2 patch set this taken care by the page fault
> handler. I.e. the enclave calls ENCLU[EACCEPT] for an empty address
> and the #PF handler then does EAUG for a single page.
> 
> So if you want to process a batch of pages this generates O(n)
> round-trips.
> 
> So if there was a way pre-do a batch of EAUG's, that would allow
> to load data to the enclave without causing page faults happening
> constantly.
> 
> One solution for this simply add ioctl:
> 
> https://lore.kernel.org/linux-sgx/YiLRBglTEbu8cHP9@iki.fi/T/#m195ec84bf85614a140abeee245c5118c22ace8f3
> 
> But in practice when you wanted to use it, you would setup the
> parameters so that they match the mmap() range. So for pratical
> user space API having mmap() take care of this would be much more
> lean option.

For something like Graphene [1] the lazy #PF based option is probably
the way to go. For the wasm runtime that we're doing in Enarx [2] we get
better performance by having something like this. I.e. most of the time we
take as much as we use.

[1] https://github.com/gramineproject/graphene
[2] https://enarx.dev/

BR, Jarkko
diff mbox series

Patch

diff --git a/arch/mips/kernel/vdso.c b/arch/mips/kernel/vdso.c
index 3d0cf471f2fe..89f3f3da9abd 100644
--- a/arch/mips/kernel/vdso.c
+++ b/arch/mips/kernel/vdso.c
@@ -102,7 +102,7 @@  int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 		base = mmap_region(NULL, STACK_TOP, PAGE_SIZE,
 				VM_READ | VM_EXEC |
 				VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC,
-				0, NULL);
+				0, NULL, false);
 		if (IS_ERR_VALUE(base)) {
 			ret = base;
 			goto out;
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c b/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c
index 1b526039a60d..4c71f64d6a79 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c
@@ -107,7 +107,7 @@  static int i915_gem_dmabuf_mmap(struct dma_buf *dma_buf, struct vm_area_struct *
 	if (!obj->base.filp)
 		return -ENODEV;
 
-	ret = call_mmap(obj->base.filp, vma);
+	ret = call_mmap(obj->base.filp, vma, false);
 	if (ret)
 		return ret;
 
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 29dd87be2fb8..e14f312fdbf8 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -173,7 +173,7 @@  coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma)
 	spin_unlock(&cii->c_lock);
 
 	vma->vm_file = get_file(host_file);
-	ret = call_mmap(vma->vm_file, vma);
+	ret = call_mmap(vma->vm_file, vma, false);
 
 	if (ret) {
 		/* if call_mmap fails, our caller will put host_file so we
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index fa125feed0ff..b963a9397e80 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -503,7 +503,7 @@  static int ovl_mmap(struct file *file, struct vm_area_struct *vma)
 	vma_set_file(vma, realfile);
 
 	old_cred = ovl_override_creds(file_inode(file)->i_sb);
-	ret = call_mmap(vma->vm_file, vma);
+	ret = call_mmap(vma->vm_file, vma, false);
 	revert_creds(old_cred);
 	ovl_file_accessed(file);
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e2d892b201b0..2909e2d14af8 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -42,6 +42,7 @@ 
 #include <linux/mount.h>
 #include <linux/cred.h>
 #include <linux/mnt_idmapping.h>
+#include <linux/mm.h>
 
 #include <asm/byteorder.h>
 #include <uapi/linux/fs.h>
@@ -1993,6 +1994,7 @@  struct file_operations {
 	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
 	long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
 	int (*mmap) (struct file *, struct vm_area_struct *);
+	int (*populate)(struct file *, struct vm_area_struct *);
 	unsigned long mmap_supported_flags;
 	int (*open) (struct inode *, struct file *);
 	int (*flush) (struct file *, fl_owner_t id);
@@ -2074,9 +2076,15 @@  static inline ssize_t call_write_iter(struct file *file, struct kiocb *kio,
 	return file->f_op->write_iter(kio, iter);
 }
 
-static inline int call_mmap(struct file *file, struct vm_area_struct *vma)
+static inline int call_mmap(struct file *file, struct vm_area_struct *vma, bool do_populate)
 {
-	return file->f_op->mmap(file, vma);
+	int ret = file->f_op->mmap(file, vma);
+
+	if (!ret && do_populate && file->f_op->populate &&
+	    !!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
+		ret = file->f_op->populate(file, vma);
+
+	return ret;
 }
 
 extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 213cc569b192..6c8c036f423b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2683,7 +2683,7 @@  extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned lo
 
 extern unsigned long mmap_region(struct file *file, unsigned long addr,
 	unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
-	struct list_head *uf);
+	struct list_head *uf, bool do_populate);
 extern unsigned long do_mmap(struct file *file, unsigned long addr,
 	unsigned long len, unsigned long prot, unsigned long flags,
 	unsigned long pgoff, unsigned long *populate, struct list_head *uf);
diff --git a/ipc/shm.c b/ipc/shm.c
index b3048ebd5c31..89b28f32acf0 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -587,7 +587,7 @@  static int shm_mmap(struct file *file, struct vm_area_struct *vma)
 	if (ret)
 		return ret;
 
-	ret = call_mmap(sfd->file, vma);
+	ret = call_mmap(sfd->file, vma, do_populate);
 	if (ret) {
 		shm_close(vma);
 		return ret;
diff --git a/mm/mmap.c b/mm/mmap.c
index 1e8fdb0b51ed..5eca79957d4c 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1413,6 +1413,7 @@  unsigned long do_mmap(struct file *file, unsigned long addr,
 			unsigned long flags, unsigned long pgoff,
 			unsigned long *populate, struct list_head *uf)
 {
+	bool do_populate = (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE;
 	struct mm_struct *mm = current->mm;
 	vm_flags_t vm_flags;
 	int pkey = 0;
@@ -1579,10 +1580,9 @@  unsigned long do_mmap(struct file *file, unsigned long addr,
 			vm_flags |= VM_NORESERVE;
 	}
 
-	addr = mmap_region(file, addr, len, vm_flags, pgoff, uf);
+	addr = mmap_region(file, addr, len, vm_flags, pgoff, uf, do_populate);
 	if (!IS_ERR_VALUE(addr) &&
-	    ((vm_flags & VM_LOCKED) ||
-	     (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
+	    ((vm_flags & VM_LOCKED) || do_populate))
 		*populate = len;
 	return addr;
 }
@@ -1721,7 +1721,7 @@  static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
 
 unsigned long mmap_region(struct file *file, unsigned long addr,
 		unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
-		struct list_head *uf)
+		struct list_head *uf, bool do_populate)
 {
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma, *prev, *merge;
@@ -1790,7 +1790,7 @@  unsigned long mmap_region(struct file *file, unsigned long addr,
 		}
 
 		vma->vm_file = get_file(file);
-		error = call_mmap(file, vma);
+		error = call_mmap(file, vma, do_populate);
 		if (error)
 			goto unmap_and_free_vma;
 
diff --git a/mm/nommu.c b/mm/nommu.c
index 55a9e48a7a02..a3c20b803c27 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -941,7 +941,7 @@  static int do_mmap_shared_file(struct vm_area_struct *vma)
 {
 	int ret;
 
-	ret = call_mmap(vma->vm_file, vma);
+	ret = call_mmap(vma->vm_file, vma, false);
 	if (ret == 0) {
 		vma->vm_region->vm_top = vma->vm_region->vm_end;
 		return 0;
@@ -972,7 +972,7 @@  static int do_mmap_private(struct vm_area_struct *vma,
 	 * - VM_MAYSHARE will be set if it may attempt to share
 	 */
 	if (capabilities & NOMMU_MAP_DIRECT) {
-		ret = call_mmap(vma->vm_file, vma);
+		ret = call_mmap(vma->vm_file, vma, false);
 		if (ret == 0) {
 			/* shouldn't return success if we're not sharing */
 			BUG_ON(!(vma->vm_flags & VM_MAYSHARE));