diff mbox series

[v2,2/3] iov_iter: introduce iov_iter_pin_user_pages*() routines

Message ID 20200829080853.20337-3-jhubbard@nvidia.com (mailing list archive)
State New, archived
Headers show
Series bio: Direct IO: convert to pin_user_pages_fast() | expand

Commit Message

John Hubbard Aug. 29, 2020, 8:08 a.m. UTC
The new routines are:
    iov_iter_pin_user_pages()
    iov_iter_pin_user_pages_alloc()

and those correspond to these pre-existing routines:
    iov_iter_get_pages()
    iov_iter_get_pages_alloc()

Also, pipe_get_pages() and related are changed so as to pass
down a "use_pup" (use pin_user_page() instead of get_page()) bool
argument.

Unlike the iov_iter_get_pages*() routines, the
iov_iter_pin_user_pages*() routines assert that only ITER_IOVEC or
ITER_PIPE items are passed in. They then call pin_user_page*(), instead
of get_user_pages_fast() or get_page().

Why: In order to incrementally change Direct IO callers from calling
get_user_pages_fast() and put_page(), over to calling
pin_user_pages_fast() and unpin_user_page(), there need to be mid-level
routines that specifically call one or the other systems, for both page
acquisition and page release.

Signed-off-by: John Hubbard <jhubbard@nvidia.com>
---
 include/linux/uio.h |   5 ++
 lib/iov_iter.c      | 110 ++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 107 insertions(+), 8 deletions(-)

Comments

Christoph Hellwig Aug. 29, 2020, 2:58 p.m. UTC | #1
On Sat, Aug 29, 2020 at 01:08:52AM -0700, John Hubbard wrote:
> The new routines are:
>     iov_iter_pin_user_pages()
>     iov_iter_pin_user_pages_alloc()
> 
> and those correspond to these pre-existing routines:
>     iov_iter_get_pages()
>     iov_iter_get_pages_alloc()
> 
> Also, pipe_get_pages() and related are changed so as to pass
> down a "use_pup" (use pin_user_page() instead of get_page()) bool
> argument.
> 
> Unlike the iov_iter_get_pages*() routines, the
> iov_iter_pin_user_pages*() routines assert that only ITER_IOVEC or
> ITER_PIPE items are passed in. They then call pin_user_page*(), instead
> of get_user_pages_fast() or get_page().
> 
> Why: In order to incrementally change Direct IO callers from calling
> get_user_pages_fast() and put_page(), over to calling
> pin_user_pages_fast() and unpin_user_page(), there need to be mid-level
> routines that specifically call one or the other systems, for both page
> acquisition and page release.
> 
> Signed-off-by: John Hubbard <jhubbard@nvidia.com>
> ---
>  include/linux/uio.h |   5 ++
>  lib/iov_iter.c      | 110 ++++++++++++++++++++++++++++++++++++++++----
>  2 files changed, 107 insertions(+), 8 deletions(-)
> 
> diff --git a/include/linux/uio.h b/include/linux/uio.h
> index 3835a8a8e9ea..29b0504a27cc 100644
> --- a/include/linux/uio.h
> +++ b/include/linux/uio.h
> @@ -229,6 +229,11 @@ int iov_iter_npages(const struct iov_iter *i, int maxpages);
>  
>  const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags);
>  
> +ssize_t iov_iter_pin_user_pages(struct iov_iter *i, struct page **pages,
> +			size_t maxsize, unsigned int maxpages, size_t *start);
> +ssize_t iov_iter_pin_user_pages_alloc(struct iov_iter *i, struct page ***pages,
> +			size_t maxsize, size_t *start);
> +
>  static inline size_t iov_iter_count(const struct iov_iter *i)
>  {
>  	return i->count;
> diff --git a/lib/iov_iter.c b/lib/iov_iter.c
> index 5e40786c8f12..f25555eb3279 100644
> --- a/lib/iov_iter.c
> +++ b/lib/iov_iter.c
> @@ -1269,7 +1269,8 @@ static inline ssize_t __pipe_get_pages(struct iov_iter *i,
>  				size_t maxsize,
>  				struct page **pages,
>  				int iter_head,
> -				size_t *start)
> +				size_t *start,
> +				bool use_pup)
>  {
>  	struct pipe_inode_info *pipe = i->pipe;
>  	unsigned int p_mask = pipe->ring_size - 1;
> @@ -1280,7 +1281,11 @@ static inline ssize_t __pipe_get_pages(struct iov_iter *i,
>  	maxsize = n;
>  	n += *start;
>  	while (n > 0) {
> -		get_page(*pages++ = pipe->bufs[iter_head & p_mask].page);
> +		if (use_pup)
> +			pin_user_page(*pages++ = pipe->bufs[iter_head & p_mask].page);
> +		else
> +			get_page(*pages++ = pipe->bufs[iter_head & p_mask].page);

Maybe this would become a little more readable with a local variable
and a little more verbosity:

		struct page *page = pipe->bufs[iter_head & p_mask].page;

		if (use_pup)
			pin_user_page(page);
		else
			get_page(page);

		*pages++ = page;
John Hubbard Aug. 29, 2020, 9:58 p.m. UTC | #2
On 8/29/20 7:58 AM, Christoph Hellwig wrote:
> On Sat, Aug 29, 2020 at 01:08:52AM -0700, John Hubbard wrote:
...
>> @@ -1280,7 +1281,11 @@ static inline ssize_t __pipe_get_pages(struct iov_iter *i,
>>   	maxsize = n;
>>   	n += *start;
>>   	while (n > 0) {
>> -		get_page(*pages++ = pipe->bufs[iter_head & p_mask].page);
>> +		if (use_pup)
>> +			pin_user_page(*pages++ = pipe->bufs[iter_head & p_mask].page);
>> +		else
>> +			get_page(*pages++ = pipe->bufs[iter_head & p_mask].page);
> 
> Maybe this would become a little more readable with a local variable
> and a little more verbosity:
> 
> 		struct page *page = pipe->bufs[iter_head & p_mask].page;
> 
> 		if (use_pup)
> 			pin_user_page(page);
> 		else
> 			get_page(page);
> 
> 		*pages++ = page;
> 

Yes, that is cleaner, I'll change to that, thanks.

thanks,
Al Viro Aug. 30, 2020, 8:17 p.m. UTC | #3
On Sat, Aug 29, 2020 at 01:08:52AM -0700, John Hubbard wrote:
> The new routines are:
>     iov_iter_pin_user_pages()
>     iov_iter_pin_user_pages_alloc()
> 
> and those correspond to these pre-existing routines:
>     iov_iter_get_pages()
>     iov_iter_get_pages_alloc()
> 
> Also, pipe_get_pages() and related are changed so as to pass
> down a "use_pup" (use pin_user_page() instead of get_page()) bool
> argument.
> 
> Unlike the iov_iter_get_pages*() routines, the
> iov_iter_pin_user_pages*() routines assert that only ITER_IOVEC or
> ITER_PIPE items are passed in. They then call pin_user_page*(), instead
> of get_user_pages_fast() or get_page().
> 
> Why: In order to incrementally change Direct IO callers from calling
> get_user_pages_fast() and put_page(), over to calling
> pin_user_pages_fast() and unpin_user_page(), there need to be mid-level
> routines that specifically call one or the other systems, for both page
> acquisition and page release.

Hmm...  Do you plan to kill iov_iter_get_pages* off, eventually getting
rid of that use_pup argument?
John Hubbard Aug. 30, 2020, 8:44 p.m. UTC | #4
On 8/30/20 1:17 PM, Al Viro wrote:
...
>> Why: In order to incrementally change Direct IO callers from calling
>> get_user_pages_fast() and put_page(), over to calling
>> pin_user_pages_fast() and unpin_user_page(), there need to be mid-level
>> routines that specifically call one or the other systems, for both page
>> acquisition and page release.
> 
> Hmm...  Do you plan to kill iov_iter_get_pages* off, eventually getting
> rid of that use_pup argument?
> 

Yes. That is definitely something I'm interested in doing, and in fact,
I started to write words to that effect into the v1 cover letter. I lost
confidence at the last minute, after poking around the remaining call
sites (which are mostly network file systems, plus notably io_uring),
and wondering if I really understood what the hell I was doing. :)

So I decided to reduce the scope of the proposal, until I got some
feedback. Which I now have!

Looking at this again, I see that there are actually *very* few
ITER_KVEC and ITER_BVEC callers, so...yes, maybe this can all be
collapsed down to calling the new functions, which would always use pup,
and lead to the simplification you asked about.

Any advance warnings, advice, design thoughts are definitely welcome
here.


thanks,
diff mbox series

Patch

diff --git a/include/linux/uio.h b/include/linux/uio.h
index 3835a8a8e9ea..29b0504a27cc 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -229,6 +229,11 @@  int iov_iter_npages(const struct iov_iter *i, int maxpages);
 
 const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags);
 
+ssize_t iov_iter_pin_user_pages(struct iov_iter *i, struct page **pages,
+			size_t maxsize, unsigned int maxpages, size_t *start);
+ssize_t iov_iter_pin_user_pages_alloc(struct iov_iter *i, struct page ***pages,
+			size_t maxsize, size_t *start);
+
 static inline size_t iov_iter_count(const struct iov_iter *i)
 {
 	return i->count;
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 5e40786c8f12..f25555eb3279 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -1269,7 +1269,8 @@  static inline ssize_t __pipe_get_pages(struct iov_iter *i,
 				size_t maxsize,
 				struct page **pages,
 				int iter_head,
-				size_t *start)
+				size_t *start,
+				bool use_pup)
 {
 	struct pipe_inode_info *pipe = i->pipe;
 	unsigned int p_mask = pipe->ring_size - 1;
@@ -1280,7 +1281,11 @@  static inline ssize_t __pipe_get_pages(struct iov_iter *i,
 	maxsize = n;
 	n += *start;
 	while (n > 0) {
-		get_page(*pages++ = pipe->bufs[iter_head & p_mask].page);
+		if (use_pup)
+			pin_user_page(*pages++ = pipe->bufs[iter_head & p_mask].page);
+		else
+			get_page(*pages++ = pipe->bufs[iter_head & p_mask].page);
+
 		iter_head++;
 		n -= PAGE_SIZE;
 	}
@@ -1290,7 +1295,7 @@  static inline ssize_t __pipe_get_pages(struct iov_iter *i,
 
 static ssize_t pipe_get_pages(struct iov_iter *i,
 		   struct page **pages, size_t maxsize, unsigned maxpages,
-		   size_t *start)
+		   size_t *start, bool use_pup)
 {
 	unsigned int iter_head, npages;
 	size_t capacity;
@@ -1306,8 +1311,51 @@  static ssize_t pipe_get_pages(struct iov_iter *i,
 	npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
 	capacity = min(npages, maxpages) * PAGE_SIZE - *start;
 
-	return __pipe_get_pages(i, min(maxsize, capacity), pages, iter_head, start);
+	return __pipe_get_pages(i, min(maxsize, capacity), pages, iter_head,
+				start, use_pup);
+}
+
+ssize_t iov_iter_pin_user_pages(struct iov_iter *i,
+		   struct page **pages, size_t maxsize, unsigned int maxpages,
+		   size_t *start)
+{
+	size_t skip = i->iov_offset;
+	const struct iovec *iov;
+	struct iovec v;
+
+	if (unlikely(iov_iter_is_pipe(i)))
+		return pipe_get_pages(i, pages, maxsize, maxpages, start, true);
+	if (unlikely(iov_iter_is_discard(i)))
+		return -EFAULT;
+	if (WARN_ON_ONCE(!iter_is_iovec(i)))
+		return -EFAULT;
+
+	if (unlikely(!maxsize))
+		return 0;
+	maxsize = min(maxsize, i->count);
+
+	iterate_iovec(i, maxsize, v, iov, skip, ({
+		unsigned long addr = (unsigned long)v.iov_base;
+		size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1));
+		int n;
+		int res;
+
+		if (len > maxpages * PAGE_SIZE)
+			len = maxpages * PAGE_SIZE;
+		addr &= ~(PAGE_SIZE - 1);
+		n = DIV_ROUND_UP(len, PAGE_SIZE);
+
+		res = pin_user_pages_fast(addr, n,
+				iov_iter_rw(i) != WRITE ?  FOLL_WRITE : 0,
+				pages);
+		if (unlikely(res < 0))
+			return res;
+		return (res == n ? len : res * PAGE_SIZE) - *start;
+		0;
+	}))
+	return 0;
 }
+EXPORT_SYMBOL(iov_iter_pin_user_pages);
 
 ssize_t iov_iter_get_pages(struct iov_iter *i,
 		   struct page **pages, size_t maxsize, unsigned maxpages,
@@ -1317,7 +1365,7 @@  ssize_t iov_iter_get_pages(struct iov_iter *i,
 		maxsize = i->count;
 
 	if (unlikely(iov_iter_is_pipe(i)))
-		return pipe_get_pages(i, pages, maxsize, maxpages, start);
+		return pipe_get_pages(i, pages, maxsize, maxpages, start, false);
 	if (unlikely(iov_iter_is_discard(i)))
 		return -EFAULT;
 
@@ -1357,7 +1405,7 @@  static struct page **get_pages_array(size_t n)
 
 static ssize_t pipe_get_pages_alloc(struct iov_iter *i,
 		   struct page ***pages, size_t maxsize,
-		   size_t *start)
+		   size_t *start, bool use_pup)
 {
 	struct page **p;
 	unsigned int iter_head, npages;
@@ -1380,7 +1428,7 @@  static ssize_t pipe_get_pages_alloc(struct iov_iter *i,
 	p = get_pages_array(npages);
 	if (!p)
 		return -ENOMEM;
-	n = __pipe_get_pages(i, maxsize, p, iter_head, start);
+	n = __pipe_get_pages(i, maxsize, p, iter_head, start, use_pup);
 	if (n > 0)
 		*pages = p;
 	else
@@ -1388,6 +1436,52 @@  static ssize_t pipe_get_pages_alloc(struct iov_iter *i,
 	return n;
 }
 
+ssize_t iov_iter_pin_user_pages_alloc(struct iov_iter *i,
+		   struct page ***pages, size_t maxsize,
+		   size_t *start)
+{
+	struct page **p;
+	size_t skip = i->iov_offset;
+	const struct iovec *iov;
+	struct iovec v;
+
+	if (unlikely(iov_iter_is_pipe(i)))
+		return pipe_get_pages_alloc(i, pages, maxsize, start, true);
+	if (unlikely(iov_iter_is_discard(i)))
+		return -EFAULT;
+	if (WARN_ON_ONCE(!iter_is_iovec(i)))
+		return -EFAULT;
+
+	if (unlikely(!maxsize))
+		return 0;
+	maxsize = min(maxsize, i->count);
+
+	iterate_iovec(i, maxsize, v, iov, skip, ({
+		unsigned long addr = (unsigned long)v.iov_base;
+		size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1));
+		int n;
+		int res;
+
+		addr &= ~(PAGE_SIZE - 1);
+		n = DIV_ROUND_UP(len, PAGE_SIZE);
+		p = get_pages_array(n);
+		if (!p)
+			return -ENOMEM;
+
+		res = pin_user_pages_fast(addr, n,
+				iov_iter_rw(i) != WRITE ?  FOLL_WRITE : 0, p);
+		if (unlikely(res < 0)) {
+			kvfree(p);
+			return res;
+		}
+		*pages = p;
+		return (res == n ? len : res * PAGE_SIZE) - *start;
+		0;
+	}))
+	return 0;
+}
+EXPORT_SYMBOL(iov_iter_pin_user_pages_alloc);
+
 ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
 		   struct page ***pages, size_t maxsize,
 		   size_t *start)
@@ -1398,7 +1492,7 @@  ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
 		maxsize = i->count;
 
 	if (unlikely(iov_iter_is_pipe(i)))
-		return pipe_get_pages_alloc(i, pages, maxsize, start);
+		return pipe_get_pages_alloc(i, pages, maxsize, start, false);
 	if (unlikely(iov_iter_is_discard(i)))
 		return -EFAULT;