diff mbox series

[v7,03/19] gup: Turn fault_in_pages_{readable,writeable} into fault_in_{readable,writeable}

Message ID 20210827164926.1726765-4-agruenba@redhat.com (mailing list archive)
State New, archived
Headers show
Series gfs2: Fix mmap + page fault deadlocks | expand

Commit Message

Andreas Gruenbacher Aug. 27, 2021, 4:49 p.m. UTC
Turn fault_in_pages_{readable,writeable} into versions that return the
number of bytes not faulted in (similar to copy_to_user) instead of
returning a non-zero value when any of the requested pages couldn't be
faulted in.  This supports the existing users that require all pages to
be faulted in as well as new users that are happy if any pages can be
faulted in at all.

Neither of these functions is entirely trivial and it doesn't seem
useful to inline them, so move them to mm/gup.c.

Rename the functions to fault_in_{readable,writeable} to make sure that
this change doesn't silently break things.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
---
 arch/powerpc/kernel/kvm.c           |  3 +-
 arch/powerpc/kernel/signal_32.c     |  4 +-
 arch/powerpc/kernel/signal_64.c     |  2 +-
 arch/x86/kernel/fpu/signal.c        |  7 ++-
 drivers/gpu/drm/armada/armada_gem.c |  7 ++-
 fs/btrfs/ioctl.c                    |  5 +-
 include/linux/pagemap.h             | 57 ++---------------------
 lib/iov_iter.c                      | 10 ++--
 mm/filemap.c                        |  2 +-
 mm/gup.c                            | 72 +++++++++++++++++++++++++++++
 10 files changed, 93 insertions(+), 76 deletions(-)

Comments

Al Viro Aug. 27, 2021, 7:08 p.m. UTC | #1
On Fri, Aug 27, 2021 at 06:49:10PM +0200, Andreas Gruenbacher wrote:
> Turn fault_in_pages_{readable,writeable} into versions that return the
> number of bytes not faulted in (similar to copy_to_user) instead of
> returning a non-zero value when any of the requested pages couldn't be
> faulted in.  This supports the existing users that require all pages to
> be faulted in as well as new users that are happy if any pages can be
> faulted in at all.
> 
> Neither of these functions is entirely trivial and it doesn't seem
> useful to inline them, so move them to mm/gup.c.
> 
> Rename the functions to fault_in_{readable,writeable} to make sure that
> this change doesn't silently break things.

I'm sorry, but this is wrong.  The callers need to be reviewed and
sanitized.  You have several oddball callers (most of them simply
wrong) *and* the ones on a very hot path in write(2).  And _there_
the existing behaviour does the wrong thing for memory poisoning setups.

	Do we have *any* cases where we both need the fault-in at all *and*
would not be better off with "fail only if the first byte couldn't have been
faulted in"?

> diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
> index 0608581967f0..38c3eae40c14 100644
> --- a/arch/powerpc/kernel/signal_32.c
> +++ b/arch/powerpc/kernel/signal_32.c
> @@ -1048,7 +1048,7 @@ SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, old_ctx,
>  	if (new_ctx == NULL)
>  		return 0;
>  	if (!access_ok(new_ctx, ctx_size) ||
> -	    fault_in_pages_readable((u8 __user *)new_ctx, ctx_size))
> +	    fault_in_readable((char __user *)new_ctx, ctx_size))
>  		return -EFAULT;

This is completely pointless.  Look at do_setcontext() there.  Seriously,
it immediately does
        if (!user_read_access_begin(ucp, sizeof(*ucp)))
			return -EFAULT;
so this access_ok() is so much garbage.  Then it does normal unsage_get_...()
stuff, so it doesn't need that fault-in crap at all - it *must* handle
copyin failures, fault-in or not.  Just lose that fault_in_... call and be
done with that.


> @@ -1237,7 +1237,7 @@ SYSCALL_DEFINE3(debug_setcontext, struct ucontext __user *, ctx,
>  #endif
>  
>  	if (!access_ok(ctx, sizeof(*ctx)) ||
> -	    fault_in_pages_readable((u8 __user *)ctx, sizeof(*ctx)))
> +	    fault_in_readable((char __user *)ctx, sizeof(*ctx)))
>  		return -EFAULT;

Ditto.

> diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
> index 1831bba0582e..9f471b4a11e3 100644
> --- a/arch/powerpc/kernel/signal_64.c
> +++ b/arch/powerpc/kernel/signal_64.c
> @@ -688,7 +688,7 @@ SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, old_ctx,
>  	if (new_ctx == NULL)
>  		return 0;
>  	if (!access_ok(new_ctx, ctx_size) ||
> -	    fault_in_pages_readable((u8 __user *)new_ctx, ctx_size))
> +	    fault_in_readable((char __user *)new_ctx, ctx_size))
>  		return -EFAULT;

... and again.

> diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
> index 0ba98e08a029..9233ecc31e2e 100644
> --- a/fs/btrfs/ioctl.c
> +++ b/fs/btrfs/ioctl.c
> @@ -2244,9 +2244,8 @@ static noinline int search_ioctl(struct inode *inode,
>  	key.offset = sk->min_offset;
>  
>  	while (1) {
> -		ret = fault_in_pages_writeable(ubuf + sk_offset,
> -					       *buf_size - sk_offset);
> -		if (ret)
> +		ret = -EFAULT;
> +		if (fault_in_writeable(ubuf + sk_offset, *buf_size - sk_offset))
>  			break;

Really?

> diff --git a/lib/iov_iter.c b/lib/iov_iter.c
> index 25dfc48536d7..069cedd9d7b4 100644
> --- a/lib/iov_iter.c
> +++ b/lib/iov_iter.c
> @@ -191,7 +191,7 @@ static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t b
>  	buf = iov->iov_base + skip;
>  	copy = min(bytes, iov->iov_len - skip);
>  
> -	if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_writeable(buf, copy)) {
> +	if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_writeable(buf, copy)) {

Here we definitely want "fail only if nothing could be faulted in"

>  		kaddr = kmap_atomic(page);
>  		from = kaddr + offset;
>  
> @@ -275,7 +275,7 @@ static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t
>  	buf = iov->iov_base + skip;
>  	copy = min(bytes, iov->iov_len - skip);
>  
> -	if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_readable(buf, copy)) {
> +	if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_readable(buf, copy)) {

Same.

> @@ -446,13 +446,11 @@ int iov_iter_fault_in_readable(const struct iov_iter *i, size_t bytes)
>  			bytes = i->count;
>  		for (p = i->iov, skip = i->iov_offset; bytes; p++, skip = 0) {
>  			size_t len = min(bytes, p->iov_len - skip);
> -			int err;
>  
>  			if (unlikely(!len))
>  				continue;
> -			err = fault_in_pages_readable(p->iov_base + skip, len);
> -			if (unlikely(err))
> -				return err;
> +			if (fault_in_readable(p->iov_base + skip, len))
> +				return -EFAULT;

... and the same, except that here we want failure only if nothing had already
been faulted in.
Filipe Manana Sept. 3, 2021, 2:56 p.m. UTC | #2
On Fri, Aug 27, 2021 at 5:52 PM Andreas Gruenbacher <agruenba@redhat.com> wrote:
>
> Turn fault_in_pages_{readable,writeable} into versions that return the
> number of bytes not faulted in (similar to copy_to_user) instead of
> returning a non-zero value when any of the requested pages couldn't be
> faulted in.  This supports the existing users that require all pages to
> be faulted in as well as new users that are happy if any pages can be
> faulted in at all.
>
> Neither of these functions is entirely trivial and it doesn't seem
> useful to inline them, so move them to mm/gup.c.
>
> Rename the functions to fault_in_{readable,writeable} to make sure that
> this change doesn't silently break things.
>
> Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
> ---
>  arch/powerpc/kernel/kvm.c           |  3 +-
>  arch/powerpc/kernel/signal_32.c     |  4 +-
>  arch/powerpc/kernel/signal_64.c     |  2 +-
>  arch/x86/kernel/fpu/signal.c        |  7 ++-
>  drivers/gpu/drm/armada/armada_gem.c |  7 ++-
>  fs/btrfs/ioctl.c                    |  5 +-
>  include/linux/pagemap.h             | 57 ++---------------------
>  lib/iov_iter.c                      | 10 ++--
>  mm/filemap.c                        |  2 +-
>  mm/gup.c                            | 72 +++++++++++++++++++++++++++++
>  10 files changed, 93 insertions(+), 76 deletions(-)
>
> diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c
> index d89cf802d9aa..6568823cf306 100644
> --- a/arch/powerpc/kernel/kvm.c
> +++ b/arch/powerpc/kernel/kvm.c
> @@ -669,7 +669,8 @@ static void __init kvm_use_magic_page(void)
>         on_each_cpu(kvm_map_magic_page, &features, 1);
>
>         /* Quick self-test to see if the mapping works */
> -       if (fault_in_pages_readable((const char *)KVM_MAGIC_PAGE, sizeof(u32))) {
> +       if (fault_in_readable((const char __user *)KVM_MAGIC_PAGE,
> +                             sizeof(u32))) {
>                 kvm_patching_worked = false;
>                 return;
>         }
> diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
> index 0608581967f0..38c3eae40c14 100644
> --- a/arch/powerpc/kernel/signal_32.c
> +++ b/arch/powerpc/kernel/signal_32.c
> @@ -1048,7 +1048,7 @@ SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, old_ctx,
>         if (new_ctx == NULL)
>                 return 0;
>         if (!access_ok(new_ctx, ctx_size) ||
> -           fault_in_pages_readable((u8 __user *)new_ctx, ctx_size))
> +           fault_in_readable((char __user *)new_ctx, ctx_size))
>                 return -EFAULT;
>
>         /*
> @@ -1237,7 +1237,7 @@ SYSCALL_DEFINE3(debug_setcontext, struct ucontext __user *, ctx,
>  #endif
>
>         if (!access_ok(ctx, sizeof(*ctx)) ||
> -           fault_in_pages_readable((u8 __user *)ctx, sizeof(*ctx)))
> +           fault_in_readable((char __user *)ctx, sizeof(*ctx)))
>                 return -EFAULT;
>
>         /*
> diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
> index 1831bba0582e..9f471b4a11e3 100644
> --- a/arch/powerpc/kernel/signal_64.c
> +++ b/arch/powerpc/kernel/signal_64.c
> @@ -688,7 +688,7 @@ SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, old_ctx,
>         if (new_ctx == NULL)
>                 return 0;
>         if (!access_ok(new_ctx, ctx_size) ||
> -           fault_in_pages_readable((u8 __user *)new_ctx, ctx_size))
> +           fault_in_readable((char __user *)new_ctx, ctx_size))
>                 return -EFAULT;
>
>         /*
> diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
> index 445c57c9c539..ba6bdec81603 100644
> --- a/arch/x86/kernel/fpu/signal.c
> +++ b/arch/x86/kernel/fpu/signal.c
> @@ -205,7 +205,7 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size)
>         fpregs_unlock();
>
>         if (ret) {
> -               if (!fault_in_pages_writeable(buf_fx, fpu_user_xstate_size))
> +               if (!fault_in_writeable(buf_fx, fpu_user_xstate_size))
>                         goto retry;
>                 return -EFAULT;
>         }
> @@ -278,10 +278,9 @@ static int restore_fpregs_from_user(void __user *buf, u64 xrestore,
>                 if (ret != -EFAULT)
>                         return -EINVAL;
>
> -               ret = fault_in_pages_readable(buf, size);
> -               if (!ret)
> +               if (!fault_in_readable(buf, size))
>                         goto retry;
> -               return ret;
> +               return -EFAULT;
>         }
>
>         /*
> diff --git a/drivers/gpu/drm/armada/armada_gem.c b/drivers/gpu/drm/armada/armada_gem.c
> index 21909642ee4c..8fbb25913327 100644
> --- a/drivers/gpu/drm/armada/armada_gem.c
> +++ b/drivers/gpu/drm/armada/armada_gem.c
> @@ -336,7 +336,7 @@ int armada_gem_pwrite_ioctl(struct drm_device *dev, void *data,
>         struct drm_armada_gem_pwrite *args = data;
>         struct armada_gem_object *dobj;
>         char __user *ptr;
> -       int ret;
> +       int ret = 0;
>
>         DRM_DEBUG_DRIVER("handle %u off %u size %u ptr 0x%llx\n",
>                 args->handle, args->offset, args->size, args->ptr);
> @@ -349,9 +349,8 @@ int armada_gem_pwrite_ioctl(struct drm_device *dev, void *data,
>         if (!access_ok(ptr, args->size))
>                 return -EFAULT;
>
> -       ret = fault_in_pages_readable(ptr, args->size);
> -       if (ret)
> -               return ret;
> +       if (fault_in_readable(ptr, args->size))
> +               return -EFAULT;
>
>         dobj = armada_gem_object_lookup(file, args->handle);
>         if (dobj == NULL)
> diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
> index 0ba98e08a029..9233ecc31e2e 100644
> --- a/fs/btrfs/ioctl.c
> +++ b/fs/btrfs/ioctl.c
> @@ -2244,9 +2244,8 @@ static noinline int search_ioctl(struct inode *inode,
>         key.offset = sk->min_offset;
>
>         while (1) {
> -               ret = fault_in_pages_writeable(ubuf + sk_offset,
> -                                              *buf_size - sk_offset);
> -               if (ret)
> +               ret = -EFAULT;
> +               if (fault_in_writeable(ubuf + sk_offset, *buf_size - sk_offset))
>                         break;
>
>                 ret = btrfs_search_forward(root, &key, path, sk->min_transid);
> diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
> index ed02aa522263..7c9edc9694d9 100644
> --- a/include/linux/pagemap.h
> +++ b/include/linux/pagemap.h
> @@ -734,61 +734,10 @@ int wait_on_page_private_2_killable(struct page *page);
>  extern void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter);
>
>  /*
> - * Fault everything in given userspace address range in.
> + * Fault in userspace address range.
>   */
> -static inline int fault_in_pages_writeable(char __user *uaddr, int size)
> -{
> -       char __user *end = uaddr + size - 1;
> -
> -       if (unlikely(size == 0))
> -               return 0;
> -
> -       if (unlikely(uaddr > end))
> -               return -EFAULT;
> -       /*
> -        * Writing zeroes into userspace here is OK, because we know that if
> -        * the zero gets there, we'll be overwriting it.
> -        */
> -       do {
> -               if (unlikely(__put_user(0, uaddr) != 0))
> -                       return -EFAULT;
> -               uaddr += PAGE_SIZE;
> -       } while (uaddr <= end);
> -
> -       /* Check whether the range spilled into the next page. */
> -       if (((unsigned long)uaddr & PAGE_MASK) ==
> -                       ((unsigned long)end & PAGE_MASK))
> -               return __put_user(0, end);
> -
> -       return 0;
> -}
> -
> -static inline int fault_in_pages_readable(const char __user *uaddr, int size)
> -{
> -       volatile char c;
> -       const char __user *end = uaddr + size - 1;
> -
> -       if (unlikely(size == 0))
> -               return 0;
> -
> -       if (unlikely(uaddr > end))
> -               return -EFAULT;
> -
> -       do {
> -               if (unlikely(__get_user(c, uaddr) != 0))
> -                       return -EFAULT;
> -               uaddr += PAGE_SIZE;
> -       } while (uaddr <= end);
> -
> -       /* Check whether the range spilled into the next page. */
> -       if (((unsigned long)uaddr & PAGE_MASK) ==
> -                       ((unsigned long)end & PAGE_MASK)) {
> -               return __get_user(c, end);
> -       }
> -
> -       (void)c;
> -       return 0;
> -}
> +size_t fault_in_writeable(char __user *uaddr, size_t size);
> +size_t fault_in_readable(const char __user *uaddr, size_t size);
>
>  int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
>                                 pgoff_t index, gfp_t gfp_mask);
> diff --git a/lib/iov_iter.c b/lib/iov_iter.c
> index 25dfc48536d7..069cedd9d7b4 100644
> --- a/lib/iov_iter.c
> +++ b/lib/iov_iter.c
> @@ -191,7 +191,7 @@ static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t b
>         buf = iov->iov_base + skip;
>         copy = min(bytes, iov->iov_len - skip);
>
> -       if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_writeable(buf, copy)) {
> +       if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_writeable(buf, copy)) {
>                 kaddr = kmap_atomic(page);
>                 from = kaddr + offset;
>
> @@ -275,7 +275,7 @@ static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t
>         buf = iov->iov_base + skip;
>         copy = min(bytes, iov->iov_len - skip);
>
> -       if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_readable(buf, copy)) {
> +       if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_readable(buf, copy)) {
>                 kaddr = kmap_atomic(page);
>                 to = kaddr + offset;
>
> @@ -446,13 +446,11 @@ int iov_iter_fault_in_readable(const struct iov_iter *i, size_t bytes)
>                         bytes = i->count;
>                 for (p = i->iov, skip = i->iov_offset; bytes; p++, skip = 0) {
>                         size_t len = min(bytes, p->iov_len - skip);
> -                       int err;
>
>                         if (unlikely(!len))
>                                 continue;
> -                       err = fault_in_pages_readable(p->iov_base + skip, len);
> -                       if (unlikely(err))
> -                               return err;
> +                       if (fault_in_readable(p->iov_base + skip, len))
> +                               return -EFAULT;
>                         bytes -= len;
>                 }
>         }
> diff --git a/mm/filemap.c b/mm/filemap.c
> index d1458ecf2f51..4dec3bc7752e 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -88,7 +88,7 @@
>   *    ->lock_page              (access_process_vm)
>   *
>   *  ->i_mutex                  (generic_perform_write)
> - *    ->mmap_lock              (fault_in_pages_readable->do_page_fault)
> + *    ->mmap_lock              (fault_in_readable->do_page_fault)
>   *
>   *  bdi->wb.list_lock
>   *    sb_lock                  (fs/fs-writeback.c)
> diff --git a/mm/gup.c b/mm/gup.c
> index b94717977d17..0cf47955e5a1 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -1672,6 +1672,78 @@ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
>  }
>  #endif /* !CONFIG_MMU */
>
> +/**
> + * fault_in_writeable - fault in userspace address range for writing
> + * @uaddr: start of address range
> + * @size: size of address range
> + *
> + * Returns the number of bytes not faulted in (like copy_to_user() and
> + * copy_from_user()).
> + */
> +size_t fault_in_writeable(char __user *uaddr, size_t size)
> +{
> +       char __user *start = uaddr, *end;
> +
> +       if (unlikely(size == 0))
> +               return 0;
> +       if (!PAGE_ALIGNED(uaddr)) {
> +               if (unlikely(__put_user(0, uaddr) != 0))
> +                       return size;
> +               uaddr = (char __user *)PAGE_ALIGN((unsigned long)uaddr);
> +       }
> +       end = (char __user *)PAGE_ALIGN((unsigned long)start + size);
> +       if (unlikely(end < start))
> +               end = NULL;
> +       while (uaddr != end) {
> +               if (unlikely(__put_user(0, uaddr) != 0))
> +                       goto out;
> +               uaddr += PAGE_SIZE;

Won't we loop endlessly or corrupt some unwanted page when 'end' was
set to NULL?

> +       }
> +
> +out:
> +       if (size > uaddr - start)
> +               return size - (uaddr - start);
> +       return 0;
> +}
> +EXPORT_SYMBOL(fault_in_writeable);
> +
> +/**
> + * fault_in_readable - fault in userspace address range for reading
> + * @uaddr: start of user address range
> + * @size: size of user address range
> + *
> + * Returns the number of bytes not faulted in (like copy_to_user() and
> + * copy_from_user()).
> + */
> +size_t fault_in_readable(const char __user *uaddr, size_t size)
> +{
> +       const char __user *start = uaddr, *end;
> +       volatile char c;
> +
> +       if (unlikely(size == 0))
> +               return 0;
> +       if (!PAGE_ALIGNED(uaddr)) {
> +               if (unlikely(__get_user(c, uaddr) != 0))
> +                       return size;
> +               uaddr = (const char __user *)PAGE_ALIGN((unsigned long)uaddr);
> +       }
> +       end = (const char __user *)PAGE_ALIGN((unsigned long)start + size);
> +       if (unlikely(end < start))
> +               end = NULL;
> +       while (uaddr != end) {

Same kind of issue here, when 'end' was set to NULL?

Thanks.

> +               if (unlikely(__get_user(c, uaddr) != 0))
> +                       goto out;
> +               uaddr += PAGE_SIZE;
> +       }
> +
> +out:
> +       (void)c;
> +       if (size > uaddr - start)
> +               return size - (uaddr - start);
> +       return 0;
> +}
> +EXPORT_SYMBOL(fault_in_readable);
> +
>  /**
>   * get_dump_page() - pin user page in memory while writing it to core dump
>   * @addr: user address
> --
> 2.26.3
>
Andreas Gruenbacher Sept. 28, 2021, 3:02 p.m. UTC | #3
On Fri, Sep 3, 2021 at 4:57 PM Filipe Manana <fdmanana@gmail.com> wrote:
> On Fri, Aug 27, 2021 at 5:52 PM Andreas Gruenbacher <agruenba@redhat.com> wrote:
> >
> > Turn fault_in_pages_{readable,writeable} into versions that return the
> > number of bytes not faulted in (similar to copy_to_user) instead of
> > returning a non-zero value when any of the requested pages couldn't be
> > faulted in.  This supports the existing users that require all pages to
> > be faulted in as well as new users that are happy if any pages can be
> > faulted in at all.
> >
> > Neither of these functions is entirely trivial and it doesn't seem
> > useful to inline them, so move them to mm/gup.c.
> >
> > Rename the functions to fault_in_{readable,writeable} to make sure that
> > this change doesn't silently break things.
> >
> > Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
> > ---
> >  arch/powerpc/kernel/kvm.c           |  3 +-
> >  arch/powerpc/kernel/signal_32.c     |  4 +-
> >  arch/powerpc/kernel/signal_64.c     |  2 +-
> >  arch/x86/kernel/fpu/signal.c        |  7 ++-
> >  drivers/gpu/drm/armada/armada_gem.c |  7 ++-
> >  fs/btrfs/ioctl.c                    |  5 +-
> >  include/linux/pagemap.h             | 57 ++---------------------
> >  lib/iov_iter.c                      | 10 ++--
> >  mm/filemap.c                        |  2 +-
> >  mm/gup.c                            | 72 +++++++++++++++++++++++++++++
> >  10 files changed, 93 insertions(+), 76 deletions(-)
> >
> > diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c
> > index d89cf802d9aa..6568823cf306 100644
> > --- a/arch/powerpc/kernel/kvm.c
> > +++ b/arch/powerpc/kernel/kvm.c
> > @@ -669,7 +669,8 @@ static void __init kvm_use_magic_page(void)
> >         on_each_cpu(kvm_map_magic_page, &features, 1);
> >
> >         /* Quick self-test to see if the mapping works */
> > -       if (fault_in_pages_readable((const char *)KVM_MAGIC_PAGE, sizeof(u32))) {
> > +       if (fault_in_readable((const char __user *)KVM_MAGIC_PAGE,
> > +                             sizeof(u32))) {
> >                 kvm_patching_worked = false;
> >                 return;
> >         }
> > diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
> > index 0608581967f0..38c3eae40c14 100644
> > --- a/arch/powerpc/kernel/signal_32.c
> > +++ b/arch/powerpc/kernel/signal_32.c
> > @@ -1048,7 +1048,7 @@ SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, old_ctx,
> >         if (new_ctx == NULL)
> >                 return 0;
> >         if (!access_ok(new_ctx, ctx_size) ||
> > -           fault_in_pages_readable((u8 __user *)new_ctx, ctx_size))
> > +           fault_in_readable((char __user *)new_ctx, ctx_size))
> >                 return -EFAULT;
> >
> >         /*
> > @@ -1237,7 +1237,7 @@ SYSCALL_DEFINE3(debug_setcontext, struct ucontext __user *, ctx,
> >  #endif
> >
> >         if (!access_ok(ctx, sizeof(*ctx)) ||
> > -           fault_in_pages_readable((u8 __user *)ctx, sizeof(*ctx)))
> > +           fault_in_readable((char __user *)ctx, sizeof(*ctx)))
> >                 return -EFAULT;
> >
> >         /*
> > diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
> > index 1831bba0582e..9f471b4a11e3 100644
> > --- a/arch/powerpc/kernel/signal_64.c
> > +++ b/arch/powerpc/kernel/signal_64.c
> > @@ -688,7 +688,7 @@ SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, old_ctx,
> >         if (new_ctx == NULL)
> >                 return 0;
> >         if (!access_ok(new_ctx, ctx_size) ||
> > -           fault_in_pages_readable((u8 __user *)new_ctx, ctx_size))
> > +           fault_in_readable((char __user *)new_ctx, ctx_size))
> >                 return -EFAULT;
> >
> >         /*
> > diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
> > index 445c57c9c539..ba6bdec81603 100644
> > --- a/arch/x86/kernel/fpu/signal.c
> > +++ b/arch/x86/kernel/fpu/signal.c
> > @@ -205,7 +205,7 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size)
> >         fpregs_unlock();
> >
> >         if (ret) {
> > -               if (!fault_in_pages_writeable(buf_fx, fpu_user_xstate_size))
> > +               if (!fault_in_writeable(buf_fx, fpu_user_xstate_size))
> >                         goto retry;
> >                 return -EFAULT;
> >         }
> > @@ -278,10 +278,9 @@ static int restore_fpregs_from_user(void __user *buf, u64 xrestore,
> >                 if (ret != -EFAULT)
> >                         return -EINVAL;
> >
> > -               ret = fault_in_pages_readable(buf, size);
> > -               if (!ret)
> > +               if (!fault_in_readable(buf, size))
> >                         goto retry;
> > -               return ret;
> > +               return -EFAULT;
> >         }
> >
> >         /*
> > diff --git a/drivers/gpu/drm/armada/armada_gem.c b/drivers/gpu/drm/armada/armada_gem.c
> > index 21909642ee4c..8fbb25913327 100644
> > --- a/drivers/gpu/drm/armada/armada_gem.c
> > +++ b/drivers/gpu/drm/armada/armada_gem.c
> > @@ -336,7 +336,7 @@ int armada_gem_pwrite_ioctl(struct drm_device *dev, void *data,
> >         struct drm_armada_gem_pwrite *args = data;
> >         struct armada_gem_object *dobj;
> >         char __user *ptr;
> > -       int ret;
> > +       int ret = 0;
> >
> >         DRM_DEBUG_DRIVER("handle %u off %u size %u ptr 0x%llx\n",
> >                 args->handle, args->offset, args->size, args->ptr);
> > @@ -349,9 +349,8 @@ int armada_gem_pwrite_ioctl(struct drm_device *dev, void *data,
> >         if (!access_ok(ptr, args->size))
> >                 return -EFAULT;
> >
> > -       ret = fault_in_pages_readable(ptr, args->size);
> > -       if (ret)
> > -               return ret;
> > +       if (fault_in_readable(ptr, args->size))
> > +               return -EFAULT;
> >
> >         dobj = armada_gem_object_lookup(file, args->handle);
> >         if (dobj == NULL)
> > diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
> > index 0ba98e08a029..9233ecc31e2e 100644
> > --- a/fs/btrfs/ioctl.c
> > +++ b/fs/btrfs/ioctl.c
> > @@ -2244,9 +2244,8 @@ static noinline int search_ioctl(struct inode *inode,
> >         key.offset = sk->min_offset;
> >
> >         while (1) {
> > -               ret = fault_in_pages_writeable(ubuf + sk_offset,
> > -                                              *buf_size - sk_offset);
> > -               if (ret)
> > +               ret = -EFAULT;
> > +               if (fault_in_writeable(ubuf + sk_offset, *buf_size - sk_offset))
> >                         break;
> >
> >                 ret = btrfs_search_forward(root, &key, path, sk->min_transid);
> > diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
> > index ed02aa522263..7c9edc9694d9 100644
> > --- a/include/linux/pagemap.h
> > +++ b/include/linux/pagemap.h
> > @@ -734,61 +734,10 @@ int wait_on_page_private_2_killable(struct page *page);
> >  extern void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter);
> >
> >  /*
> > - * Fault everything in given userspace address range in.
> > + * Fault in userspace address range.
> >   */
> > -static inline int fault_in_pages_writeable(char __user *uaddr, int size)
> > -{
> > -       char __user *end = uaddr + size - 1;
> > -
> > -       if (unlikely(size == 0))
> > -               return 0;
> > -
> > -       if (unlikely(uaddr > end))
> > -               return -EFAULT;
> > -       /*
> > -        * Writing zeroes into userspace here is OK, because we know that if
> > -        * the zero gets there, we'll be overwriting it.
> > -        */
> > -       do {
> > -               if (unlikely(__put_user(0, uaddr) != 0))
> > -                       return -EFAULT;
> > -               uaddr += PAGE_SIZE;
> > -       } while (uaddr <= end);
> > -
> > -       /* Check whether the range spilled into the next page. */
> > -       if (((unsigned long)uaddr & PAGE_MASK) ==
> > -                       ((unsigned long)end & PAGE_MASK))
> > -               return __put_user(0, end);
> > -
> > -       return 0;
> > -}
> > -
> > -static inline int fault_in_pages_readable(const char __user *uaddr, int size)
> > -{
> > -       volatile char c;
> > -       const char __user *end = uaddr + size - 1;
> > -
> > -       if (unlikely(size == 0))
> > -               return 0;
> > -
> > -       if (unlikely(uaddr > end))
> > -               return -EFAULT;
> > -
> > -       do {
> > -               if (unlikely(__get_user(c, uaddr) != 0))
> > -                       return -EFAULT;
> > -               uaddr += PAGE_SIZE;
> > -       } while (uaddr <= end);
> > -
> > -       /* Check whether the range spilled into the next page. */
> > -       if (((unsigned long)uaddr & PAGE_MASK) ==
> > -                       ((unsigned long)end & PAGE_MASK)) {
> > -               return __get_user(c, end);
> > -       }
> > -
> > -       (void)c;
> > -       return 0;
> > -}
> > +size_t fault_in_writeable(char __user *uaddr, size_t size);
> > +size_t fault_in_readable(const char __user *uaddr, size_t size);
> >
> >  int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
> >                                 pgoff_t index, gfp_t gfp_mask);
> > diff --git a/lib/iov_iter.c b/lib/iov_iter.c
> > index 25dfc48536d7..069cedd9d7b4 100644
> > --- a/lib/iov_iter.c
> > +++ b/lib/iov_iter.c
> > @@ -191,7 +191,7 @@ static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t b
> >         buf = iov->iov_base + skip;
> >         copy = min(bytes, iov->iov_len - skip);
> >
> > -       if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_writeable(buf, copy)) {
> > +       if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_writeable(buf, copy)) {
> >                 kaddr = kmap_atomic(page);
> >                 from = kaddr + offset;
> >
> > @@ -275,7 +275,7 @@ static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t
> >         buf = iov->iov_base + skip;
> >         copy = min(bytes, iov->iov_len - skip);
> >
> > -       if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_readable(buf, copy)) {
> > +       if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_readable(buf, copy)) {
> >                 kaddr = kmap_atomic(page);
> >                 to = kaddr + offset;
> >
> > @@ -446,13 +446,11 @@ int iov_iter_fault_in_readable(const struct iov_iter *i, size_t bytes)
> >                         bytes = i->count;
> >                 for (p = i->iov, skip = i->iov_offset; bytes; p++, skip = 0) {
> >                         size_t len = min(bytes, p->iov_len - skip);
> > -                       int err;
> >
> >                         if (unlikely(!len))
> >                                 continue;
> > -                       err = fault_in_pages_readable(p->iov_base + skip, len);
> > -                       if (unlikely(err))
> > -                               return err;
> > +                       if (fault_in_readable(p->iov_base + skip, len))
> > +                               return -EFAULT;
> >                         bytes -= len;
> >                 }
> >         }
> > diff --git a/mm/filemap.c b/mm/filemap.c
> > index d1458ecf2f51..4dec3bc7752e 100644
> > --- a/mm/filemap.c
> > +++ b/mm/filemap.c
> > @@ -88,7 +88,7 @@
> >   *    ->lock_page              (access_process_vm)
> >   *
> >   *  ->i_mutex                  (generic_perform_write)
> > - *    ->mmap_lock              (fault_in_pages_readable->do_page_fault)
> > + *    ->mmap_lock              (fault_in_readable->do_page_fault)
> >   *
> >   *  bdi->wb.list_lock
> >   *    sb_lock                  (fs/fs-writeback.c)
> > diff --git a/mm/gup.c b/mm/gup.c
> > index b94717977d17..0cf47955e5a1 100644
> > --- a/mm/gup.c
> > +++ b/mm/gup.c
> > @@ -1672,6 +1672,78 @@ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
> >  }
> >  #endif /* !CONFIG_MMU */
> >
> > +/**
> > + * fault_in_writeable - fault in userspace address range for writing
> > + * @uaddr: start of address range
> > + * @size: size of address range
> > + *
> > + * Returns the number of bytes not faulted in (like copy_to_user() and
> > + * copy_from_user()).
> > + */
> > +size_t fault_in_writeable(char __user *uaddr, size_t size)
> > +{
> > +       char __user *start = uaddr, *end;
> > +
> > +       if (unlikely(size == 0))
> > +               return 0;
> > +       if (!PAGE_ALIGNED(uaddr)) {
> > +               if (unlikely(__put_user(0, uaddr) != 0))
> > +                       return size;
> > +               uaddr = (char __user *)PAGE_ALIGN((unsigned long)uaddr);
> > +       }
> > +       end = (char __user *)PAGE_ALIGN((unsigned long)start + size);
> > +       if (unlikely(end < start))
> > +               end = NULL;
> > +       while (uaddr != end) {
> > +               if (unlikely(__put_user(0, uaddr) != 0))
> > +                       goto out;
> > +               uaddr += PAGE_SIZE;
>
> Won't we loop endlessly or corrupt some unwanted page when 'end' was
> set to NULL?

What do you mean? We set 'end' to NULL when start + size < start
exactly so that the loop will stop when uaddr wraps around.

> > +       }
> > +
> > +out:
> > +       if (size > uaddr - start)
> > +               return size - (uaddr - start);
> > +       return 0;
> > +}
> > +EXPORT_SYMBOL(fault_in_writeable);
> > +
> > +/**
> > + * fault_in_readable - fault in userspace address range for reading
> > + * @uaddr: start of user address range
> > + * @size: size of user address range
> > + *
> > + * Returns the number of bytes not faulted in (like copy_to_user() and
> > + * copy_from_user()).
> > + */
> > +size_t fault_in_readable(const char __user *uaddr, size_t size)
> > +{
> > +       const char __user *start = uaddr, *end;
> > +       volatile char c;
> > +
> > +       if (unlikely(size == 0))
> > +               return 0;
> > +       if (!PAGE_ALIGNED(uaddr)) {
> > +               if (unlikely(__get_user(c, uaddr) != 0))
> > +                       return size;
> > +               uaddr = (const char __user *)PAGE_ALIGN((unsigned long)uaddr);
> > +       }
> > +       end = (const char __user *)PAGE_ALIGN((unsigned long)start + size);
> > +       if (unlikely(end < start))
> > +               end = NULL;
> > +       while (uaddr != end) {
>
> Same kind of issue here, when 'end' was set to NULL?
>
> Thanks.
>
> > +               if (unlikely(__get_user(c, uaddr) != 0))
> > +                       goto out;
> > +               uaddr += PAGE_SIZE;
> > +       }
> > +
> > +out:
> > +       (void)c;
> > +       if (size > uaddr - start)
> > +               return size - (uaddr - start);
> > +       return 0;
> > +}
> > +EXPORT_SYMBOL(fault_in_readable);
> > +
> >  /**
> >   * get_dump_page() - pin user page in memory while writing it to core dump
> >   * @addr: user address
> > --
> > 2.26.3
> >

Thanks,
Andreas
Matthew Wilcox Sept. 28, 2021, 4:37 p.m. UTC | #4
On Tue, Sep 28, 2021 at 05:02:43PM +0200, Andreas Gruenbacher wrote:
> On Fri, Sep 3, 2021 at 4:57 PM Filipe Manana <fdmanana@gmail.com> wrote:
> > On Fri, Aug 27, 2021 at 5:52 PM Andreas Gruenbacher <agruenba@redhat.com> wrote:
> > > +size_t fault_in_writeable(char __user *uaddr, size_t size)
> > > +{
> > > +       char __user *start = uaddr, *end;
> > > +
> > > +       if (unlikely(size == 0))
> > > +               return 0;
> > > +       if (!PAGE_ALIGNED(uaddr)) {
> > > +               if (unlikely(__put_user(0, uaddr) != 0))
> > > +                       return size;
> > > +               uaddr = (char __user *)PAGE_ALIGN((unsigned long)uaddr);
> > > +       }
> > > +       end = (char __user *)PAGE_ALIGN((unsigned long)start + size);
> > > +       if (unlikely(end < start))
> > > +               end = NULL;
> > > +       while (uaddr != end) {
> > > +               if (unlikely(__put_user(0, uaddr) != 0))
> > > +                       goto out;
> > > +               uaddr += PAGE_SIZE;
> >
> > Won't we loop endlessly or corrupt some unwanted page when 'end' was
> > set to NULL?
> 
> What do you mean? We set 'end' to NULL when start + size < start
> exactly so that the loop will stop when uaddr wraps around.

But think about x86-64.  The virtual address space (unless you have 5
level PTs) looks like:

[0, 2^47)		userspace
[2^47, 2^64 - 2^47)	hole
[2^64 - 2^47, 2^64)	kernel space

If we try to copy from the hole we'll get some kind of fault (I forget
the details).  We have to stop at the top of userspace.
Andreas Gruenbacher Sept. 28, 2021, 8:41 p.m. UTC | #5
Hi Willy,

On Tue, Sep 28, 2021 at 6:40 PM Matthew Wilcox <willy@infradead.org> wrote:
> On Tue, Sep 28, 2021 at 05:02:43PM +0200, Andreas Gruenbacher wrote:
> > On Fri, Sep 3, 2021 at 4:57 PM Filipe Manana <fdmanana@gmail.com> wrote:
> > > On Fri, Aug 27, 2021 at 5:52 PM Andreas Gruenbacher <agruenba@redhat.com> wrote:
> > > > +size_t fault_in_writeable(char __user *uaddr, size_t size)
> > > > +{
> > > > +       char __user *start = uaddr, *end;
> > > > +
> > > > +       if (unlikely(size == 0))
> > > > +               return 0;
> > > > +       if (!PAGE_ALIGNED(uaddr)) {
> > > > +               if (unlikely(__put_user(0, uaddr) != 0))
> > > > +                       return size;
> > > > +               uaddr = (char __user *)PAGE_ALIGN((unsigned long)uaddr);
> > > > +       }
> > > > +       end = (char __user *)PAGE_ALIGN((unsigned long)start + size);
> > > > +       if (unlikely(end < start))
> > > > +               end = NULL;
> > > > +       while (uaddr != end) {
> > > > +               if (unlikely(__put_user(0, uaddr) != 0))
> > > > +                       goto out;
> > > > +               uaddr += PAGE_SIZE;
> > >
> > > Won't we loop endlessly or corrupt some unwanted page when 'end' was
> > > set to NULL?
> >
> > What do you mean? We set 'end' to NULL when start + size < start
> > exactly so that the loop will stop when uaddr wraps around.
>
> But think about x86-64.  The virtual address space (unless you have 5
> level PTs) looks like:
>
> [0, 2^47)               userspace
> [2^47, 2^64 - 2^47)     hole
> [2^64 - 2^47, 2^64)     kernel space
>
> If we try to copy from the hole we'll get some kind of fault (I forget
> the details).  We have to stop at the top of userspace.

If you look at the before and after state of this patch,
fault_in_pages_readable and fault_in_pages_writeable did fail an
attempt to fault in a range that wraps with -EFAULT. That's sensible
for a function that returns an all-or-nothing result. We now want to
return how much of the range was (or wasn't) faulted in. We could do
that and still reject ranges that wrap outright. Or we could try to
fault in however much we reasonably can even if the range wraps. The
patch tries the latter, which is where the stopping at NULL is coming
from: when the range wraps, we *definitely* don't want to go any
further.

If the range extends into the hole, we'll get a failure from
__get_user or __put_user where that happens. That's entirely the
expected result, isn't it?

Thanks,
Andreas
diff mbox series

Patch

diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c
index d89cf802d9aa..6568823cf306 100644
--- a/arch/powerpc/kernel/kvm.c
+++ b/arch/powerpc/kernel/kvm.c
@@ -669,7 +669,8 @@  static void __init kvm_use_magic_page(void)
 	on_each_cpu(kvm_map_magic_page, &features, 1);
 
 	/* Quick self-test to see if the mapping works */
-	if (fault_in_pages_readable((const char *)KVM_MAGIC_PAGE, sizeof(u32))) {
+	if (fault_in_readable((const char __user *)KVM_MAGIC_PAGE,
+			      sizeof(u32))) {
 		kvm_patching_worked = false;
 		return;
 	}
diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index 0608581967f0..38c3eae40c14 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -1048,7 +1048,7 @@  SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, old_ctx,
 	if (new_ctx == NULL)
 		return 0;
 	if (!access_ok(new_ctx, ctx_size) ||
-	    fault_in_pages_readable((u8 __user *)new_ctx, ctx_size))
+	    fault_in_readable((char __user *)new_ctx, ctx_size))
 		return -EFAULT;
 
 	/*
@@ -1237,7 +1237,7 @@  SYSCALL_DEFINE3(debug_setcontext, struct ucontext __user *, ctx,
 #endif
 
 	if (!access_ok(ctx, sizeof(*ctx)) ||
-	    fault_in_pages_readable((u8 __user *)ctx, sizeof(*ctx)))
+	    fault_in_readable((char __user *)ctx, sizeof(*ctx)))
 		return -EFAULT;
 
 	/*
diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index 1831bba0582e..9f471b4a11e3 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -688,7 +688,7 @@  SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, old_ctx,
 	if (new_ctx == NULL)
 		return 0;
 	if (!access_ok(new_ctx, ctx_size) ||
-	    fault_in_pages_readable((u8 __user *)new_ctx, ctx_size))
+	    fault_in_readable((char __user *)new_ctx, ctx_size))
 		return -EFAULT;
 
 	/*
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index 445c57c9c539..ba6bdec81603 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -205,7 +205,7 @@  int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size)
 	fpregs_unlock();
 
 	if (ret) {
-		if (!fault_in_pages_writeable(buf_fx, fpu_user_xstate_size))
+		if (!fault_in_writeable(buf_fx, fpu_user_xstate_size))
 			goto retry;
 		return -EFAULT;
 	}
@@ -278,10 +278,9 @@  static int restore_fpregs_from_user(void __user *buf, u64 xrestore,
 		if (ret != -EFAULT)
 			return -EINVAL;
 
-		ret = fault_in_pages_readable(buf, size);
-		if (!ret)
+		if (!fault_in_readable(buf, size))
 			goto retry;
-		return ret;
+		return -EFAULT;
 	}
 
 	/*
diff --git a/drivers/gpu/drm/armada/armada_gem.c b/drivers/gpu/drm/armada/armada_gem.c
index 21909642ee4c..8fbb25913327 100644
--- a/drivers/gpu/drm/armada/armada_gem.c
+++ b/drivers/gpu/drm/armada/armada_gem.c
@@ -336,7 +336,7 @@  int armada_gem_pwrite_ioctl(struct drm_device *dev, void *data,
 	struct drm_armada_gem_pwrite *args = data;
 	struct armada_gem_object *dobj;
 	char __user *ptr;
-	int ret;
+	int ret = 0;
 
 	DRM_DEBUG_DRIVER("handle %u off %u size %u ptr 0x%llx\n",
 		args->handle, args->offset, args->size, args->ptr);
@@ -349,9 +349,8 @@  int armada_gem_pwrite_ioctl(struct drm_device *dev, void *data,
 	if (!access_ok(ptr, args->size))
 		return -EFAULT;
 
-	ret = fault_in_pages_readable(ptr, args->size);
-	if (ret)
-		return ret;
+	if (fault_in_readable(ptr, args->size))
+		return -EFAULT;
 
 	dobj = armada_gem_object_lookup(file, args->handle);
 	if (dobj == NULL)
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 0ba98e08a029..9233ecc31e2e 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2244,9 +2244,8 @@  static noinline int search_ioctl(struct inode *inode,
 	key.offset = sk->min_offset;
 
 	while (1) {
-		ret = fault_in_pages_writeable(ubuf + sk_offset,
-					       *buf_size - sk_offset);
-		if (ret)
+		ret = -EFAULT;
+		if (fault_in_writeable(ubuf + sk_offset, *buf_size - sk_offset))
 			break;
 
 		ret = btrfs_search_forward(root, &key, path, sk->min_transid);
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index ed02aa522263..7c9edc9694d9 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -734,61 +734,10 @@  int wait_on_page_private_2_killable(struct page *page);
 extern void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter);
 
 /*
- * Fault everything in given userspace address range in.
+ * Fault in userspace address range.
  */
-static inline int fault_in_pages_writeable(char __user *uaddr, int size)
-{
-	char __user *end = uaddr + size - 1;
-
-	if (unlikely(size == 0))
-		return 0;
-
-	if (unlikely(uaddr > end))
-		return -EFAULT;
-	/*
-	 * Writing zeroes into userspace here is OK, because we know that if
-	 * the zero gets there, we'll be overwriting it.
-	 */
-	do {
-		if (unlikely(__put_user(0, uaddr) != 0))
-			return -EFAULT;
-		uaddr += PAGE_SIZE;
-	} while (uaddr <= end);
-
-	/* Check whether the range spilled into the next page. */
-	if (((unsigned long)uaddr & PAGE_MASK) ==
-			((unsigned long)end & PAGE_MASK))
-		return __put_user(0, end);
-
-	return 0;
-}
-
-static inline int fault_in_pages_readable(const char __user *uaddr, int size)
-{
-	volatile char c;
-	const char __user *end = uaddr + size - 1;
-
-	if (unlikely(size == 0))
-		return 0;
-
-	if (unlikely(uaddr > end))
-		return -EFAULT;
-
-	do {
-		if (unlikely(__get_user(c, uaddr) != 0))
-			return -EFAULT;
-		uaddr += PAGE_SIZE;
-	} while (uaddr <= end);
-
-	/* Check whether the range spilled into the next page. */
-	if (((unsigned long)uaddr & PAGE_MASK) ==
-			((unsigned long)end & PAGE_MASK)) {
-		return __get_user(c, end);
-	}
-
-	(void)c;
-	return 0;
-}
+size_t fault_in_writeable(char __user *uaddr, size_t size);
+size_t fault_in_readable(const char __user *uaddr, size_t size);
 
 int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
 				pgoff_t index, gfp_t gfp_mask);
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 25dfc48536d7..069cedd9d7b4 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -191,7 +191,7 @@  static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t b
 	buf = iov->iov_base + skip;
 	copy = min(bytes, iov->iov_len - skip);
 
-	if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_writeable(buf, copy)) {
+	if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_writeable(buf, copy)) {
 		kaddr = kmap_atomic(page);
 		from = kaddr + offset;
 
@@ -275,7 +275,7 @@  static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t
 	buf = iov->iov_base + skip;
 	copy = min(bytes, iov->iov_len - skip);
 
-	if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_readable(buf, copy)) {
+	if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_readable(buf, copy)) {
 		kaddr = kmap_atomic(page);
 		to = kaddr + offset;
 
@@ -446,13 +446,11 @@  int iov_iter_fault_in_readable(const struct iov_iter *i, size_t bytes)
 			bytes = i->count;
 		for (p = i->iov, skip = i->iov_offset; bytes; p++, skip = 0) {
 			size_t len = min(bytes, p->iov_len - skip);
-			int err;
 
 			if (unlikely(!len))
 				continue;
-			err = fault_in_pages_readable(p->iov_base + skip, len);
-			if (unlikely(err))
-				return err;
+			if (fault_in_readable(p->iov_base + skip, len))
+				return -EFAULT;
 			bytes -= len;
 		}
 	}
diff --git a/mm/filemap.c b/mm/filemap.c
index d1458ecf2f51..4dec3bc7752e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -88,7 +88,7 @@ 
  *    ->lock_page		(access_process_vm)
  *
  *  ->i_mutex			(generic_perform_write)
- *    ->mmap_lock		(fault_in_pages_readable->do_page_fault)
+ *    ->mmap_lock		(fault_in_readable->do_page_fault)
  *
  *  bdi->wb.list_lock
  *    sb_lock			(fs/fs-writeback.c)
diff --git a/mm/gup.c b/mm/gup.c
index b94717977d17..0cf47955e5a1 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1672,6 +1672,78 @@  static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
 }
 #endif /* !CONFIG_MMU */
 
+/**
+ * fault_in_writeable - fault in userspace address range for writing
+ * @uaddr: start of address range
+ * @size: size of address range
+ *
+ * Returns the number of bytes not faulted in (like copy_to_user() and
+ * copy_from_user()).
+ */
+size_t fault_in_writeable(char __user *uaddr, size_t size)
+{
+	char __user *start = uaddr, *end;
+
+	if (unlikely(size == 0))
+		return 0;
+	if (!PAGE_ALIGNED(uaddr)) {
+		if (unlikely(__put_user(0, uaddr) != 0))
+			return size;
+		uaddr = (char __user *)PAGE_ALIGN((unsigned long)uaddr);
+	}
+	end = (char __user *)PAGE_ALIGN((unsigned long)start + size);
+	if (unlikely(end < start))
+		end = NULL;
+	while (uaddr != end) {
+		if (unlikely(__put_user(0, uaddr) != 0))
+			goto out;
+		uaddr += PAGE_SIZE;
+	}
+
+out:
+	if (size > uaddr - start)
+		return size - (uaddr - start);
+	return 0;
+}
+EXPORT_SYMBOL(fault_in_writeable);
+
+/**
+ * fault_in_readable - fault in userspace address range for reading
+ * @uaddr: start of user address range
+ * @size: size of user address range
+ *
+ * Returns the number of bytes not faulted in (like copy_to_user() and
+ * copy_from_user()).
+ */
+size_t fault_in_readable(const char __user *uaddr, size_t size)
+{
+	const char __user *start = uaddr, *end;
+	volatile char c;
+
+	if (unlikely(size == 0))
+		return 0;
+	if (!PAGE_ALIGNED(uaddr)) {
+		if (unlikely(__get_user(c, uaddr) != 0))
+			return size;
+		uaddr = (const char __user *)PAGE_ALIGN((unsigned long)uaddr);
+	}
+	end = (const char __user *)PAGE_ALIGN((unsigned long)start + size);
+	if (unlikely(end < start))
+		end = NULL;
+	while (uaddr != end) {
+		if (unlikely(__get_user(c, uaddr) != 0))
+			goto out;
+		uaddr += PAGE_SIZE;
+	}
+
+out:
+	(void)c;
+	if (size > uaddr - start)
+		return size - (uaddr - start);
+	return 0;
+}
+EXPORT_SYMBOL(fault_in_readable);
+
 /**
  * get_dump_page() - pin user page in memory while writing it to core dump
  * @addr: user address