
[v4,4/6] io_uring: rsrc: avoid use of vmas parameter in pin_user_pages()

Message ID 956f4fc2204f23e4c00e9602ded80cb4e7b5df9b.1681831798.git.lstoakes@gmail.com (mailing list archive)
State New
Series remove the vmas parameter from GUP APIs

Commit Message

Lorenzo Stoakes April 18, 2023, 3:49 p.m. UTC
We are shortly to remove pin_user_pages(), and instead perform the required
VMA checks ourselves. In most cases there will be a single VMA so this
should cause no undue impact on an already slow path.

Doing this eliminates the one instance of vmas being used by
pin_user_pages().

Suggested-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Lorenzo Stoakes <lstoakes@gmail.com>
---
 io_uring/rsrc.c | 55 ++++++++++++++++++++++++++++---------------------
 1 file changed, 31 insertions(+), 24 deletions(-)

Comments

David Hildenbrand April 18, 2023, 3:55 p.m. UTC | #1
On 18.04.23 17:49, Lorenzo Stoakes wrote:
> We are shortly to remove pin_user_pages(), and instead perform the required
> VMA checks ourselves. In most cases there will be a single VMA so this
> should cause no undue impact on an already slow path.
> 
> Doing this eliminates the one instance of vmas being used by
> pin_user_pages().
> 
> Suggested-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> Signed-off-by: Lorenzo Stoakes <lstoakes@gmail.com>
> ---
>   io_uring/rsrc.c | 55 ++++++++++++++++++++++++++++---------------------
>   1 file changed, 31 insertions(+), 24 deletions(-)
> 
> diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
> index 7a43aed8e395..3a927df9d913 100644
> --- a/io_uring/rsrc.c
> +++ b/io_uring/rsrc.c
> @@ -1138,12 +1138,37 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
>   	return ret;
>   }
>   
> +static int check_vmas_locked(unsigned long addr, unsigned long len)

TBH, the whole "_locked" suffix is a bit confusing.

I was wondering why you'd want to check whether the VMAs are locked ...

> +{
> +	struct file *file;
> +	VMA_ITERATOR(vmi, current->mm, addr);
> +	struct vm_area_struct *vma = vma_next(&vmi);
> +	unsigned long end = addr + len;
> +
> +	if (WARN_ON_ONCE(!vma))
> +		return -EINVAL;
> +
> +	file = vma->vm_file;
> +	if (file && !is_file_hugepages(file))
> +		return -EOPNOTSUPP;

You'd now be rejecting vma_is_shmem() here, no?
David Hildenbrand April 18, 2023, 3:56 p.m. UTC | #2
On 18.04.23 17:55, David Hildenbrand wrote:
> On 18.04.23 17:49, Lorenzo Stoakes wrote:
>> We are shortly to remove pin_user_pages(), and instead perform the required
>> VMA checks ourselves. In most cases there will be a single VMA so this
>> should cause no undue impact on an already slow path.
>>
>> Doing this eliminates the one instance of vmas being used by
>> pin_user_pages().
>>
>> Suggested-by: Matthew Wilcox (Oracle) <willy@infradead.org>
>> Signed-off-by: Lorenzo Stoakes <lstoakes@gmail.com>
>> ---
>>    io_uring/rsrc.c | 55 ++++++++++++++++++++++++++++---------------------
>>    1 file changed, 31 insertions(+), 24 deletions(-)
>>
>> diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
>> index 7a43aed8e395..3a927df9d913 100644
>> --- a/io_uring/rsrc.c
>> +++ b/io_uring/rsrc.c
>> @@ -1138,12 +1138,37 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
>>    	return ret;
>>    }
>>    
>> +static int check_vmas_locked(unsigned long addr, unsigned long len)
> 
> TBH, the whole "_locked" suffix is a bit confusing.
> 
> I was wondering why you'd want to check whether the VMAs are locked ...

FWIW, "check_vmas_compatible_locked" might be better. But the "_locked" 
is still annoying.
Lorenzo Stoakes April 18, 2023, 4:25 p.m. UTC | #3
On Tue, Apr 18, 2023 at 05:55:48PM +0200, David Hildenbrand wrote:
> On 18.04.23 17:49, Lorenzo Stoakes wrote:
> > We are shortly to remove pin_user_pages(), and instead perform the required
> > VMA checks ourselves. In most cases there will be a single VMA so this
> > should cause no undue impact on an already slow path.
> > 
> > Doing this eliminates the one instance of vmas being used by
> > pin_user_pages().
> > 
> > Suggested-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> > Signed-off-by: Lorenzo Stoakes <lstoakes@gmail.com>
> > ---
> >   io_uring/rsrc.c | 55 ++++++++++++++++++++++++++++---------------------
> >   1 file changed, 31 insertions(+), 24 deletions(-)
> > 
> > diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
> > index 7a43aed8e395..3a927df9d913 100644
> > --- a/io_uring/rsrc.c
> > +++ b/io_uring/rsrc.c
> > @@ -1138,12 +1138,37 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
> >   	return ret;
> >   }
> > +static int check_vmas_locked(unsigned long addr, unsigned long len)
> 
> TBH, the whole "_locked" suffix is a bit confusing.
> 
> I was wondering why you'd want to check whether the VMAs are locked ...
>

Yeah, it's annoying, partly because GUP itself is super inconsistent about
it. The idea is to try to indicate that you need to hold mmap_lock,
obviously... let's drop _locked then, since we're inconsistent with it anyway.

> > +{
> > +	struct file *file;
> > +	VMA_ITERATOR(vmi, current->mm, addr);
> > +	struct vm_area_struct *vma = vma_next(&vmi);
> > +	unsigned long end = addr + len;
> > +
> > +	if (WARN_ON_ONCE(!vma))
> > +		return -EINVAL;
> > +
> > +	file = vma->vm_file;
> > +	if (file && !is_file_hugepages(file))
> > +		return -EOPNOTSUPP;
> 
> You'd now be rejecting vma_is_shmem() here, no?
>

Good spot, I guess I was confused that io_uring would actually want to do
that... not sure who'd want to actually map some shmem for this
purpose!

I'll update to make it match the existing code.

> 
> -- 
> Thanks,
> 
> David / dhildenb
> 

To avoid spam, here's a -fix patch for both:-

----8<----
From 62838d66ee01e631c7c8aa3848b6892d1478c5b6 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lstoakes@gmail.com>
Date: Tue, 18 Apr 2023 17:11:01 +0100
Subject: [PATCH] io_uring: rsrc: avoid use of vmas parameter in
 pin_user_pages()

Rename function to avoid confusion and correct shmem check as suggested by
David.
---
 io_uring/rsrc.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 3a927df9d913..483b975e31b3 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -1138,7 +1138,7 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
 	return ret;
 }

-static int check_vmas_locked(unsigned long addr, unsigned long len)
+static int check_vmas_compatible(unsigned long addr, unsigned long len)
 {
 	struct file *file;
 	VMA_ITERATOR(vmi, current->mm, addr);
@@ -1149,15 +1149,16 @@ static int check_vmas_locked(unsigned long addr, unsigned long len)
 		return -EINVAL;

 	file = vma->vm_file;
-	if (file && !is_file_hugepages(file))
-		return -EOPNOTSUPP;

 	/* don't support file backed memory */
 	for_each_vma_range(vmi, vma, end) {
 		if (vma->vm_file != file)
 			return -EINVAL;

-		if (file && !vma_is_shmem(vma))
+		if (!file)
+			continue;
+
+		if (!vma_is_shmem(vma) && !is_file_hugepages(file))
 			return -EOPNOTSUPP;
 	}

@@ -1185,7 +1186,7 @@ struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages)
 			      pages, NULL);

 	if (pret == nr_pages) {
-		ret = check_vmas_locked(ubuf, len);
+		ret = check_vmas_compatible(ubuf, len);
 		*npages = nr_pages;
 	} else {
 		ret = pret < 0 ? pret : -EFAULT;
--
2.40.0
Jens Axboe April 19, 2023, 4:35 p.m. UTC | #4
On 4/18/23 9:49 AM, Lorenzo Stoakes wrote:
> We are shortly to remove pin_user_pages(), and instead perform the required
> VMA checks ourselves. In most cases there will be a single VMA so this
> should cause no undue impact on an already slow path.
> 
> Doing this eliminates the one instance of vmas being used by
> pin_user_pages().

First up, please don't just send single patches from a series. It's
really annoying when you are trying to get the full picture. Just CC the
whole series, so reviewers don't have to look it up separately.

So when you're doing a respin for what I'll mention below and the issue
that David found, please don't just show us patch 4+5 of the series.

> diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
> index 7a43aed8e395..3a927df9d913 100644
> --- a/io_uring/rsrc.c
> +++ b/io_uring/rsrc.c
> @@ -1138,12 +1138,37 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
>  	return ret;
>  }
>  
> +static int check_vmas_locked(unsigned long addr, unsigned long len)
> +{
> +	struct file *file;
> +	VMA_ITERATOR(vmi, current->mm, addr);
> +	struct vm_area_struct *vma = vma_next(&vmi);
> +	unsigned long end = addr + len;
> +
> +	if (WARN_ON_ONCE(!vma))
> +		return -EINVAL;
> +
> +	file = vma->vm_file;
> +	if (file && !is_file_hugepages(file))
> +		return -EOPNOTSUPP;
> +
> +	/* don't support file backed memory */
> +	for_each_vma_range(vmi, vma, end) {
> +		if (vma->vm_file != file)
> +			return -EINVAL;
> +
> +		if (file && !vma_is_shmem(vma))
> +			return -EOPNOTSUPP;
> +	}
> +
> +	return 0;
> +}

I really dislike this naming. There's no point to doing locked in the
naming here, it just makes people think it's checking whether the vmas
are locked. Which is not at all what it does. Because what else would we
think, there's nothing else in the name that suggests what it is
actually checking.

Don't put implied locking in the naming, the way to do that is to do
something ala:

lockdep_assert_held_read(&current->mm->mmap_lock);

though I don't think it's needed here at all, as there's just one caller
and it's clearly inside. You could even just make a comment instead.

So please rename this to indicate what it's ACTUALLY checking.
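
Putting the rename from the -fix patch above together with an explicit
assertion, an untested sketch of the shape being suggested (illustration
only, not code from the series):

static int check_vmas_compatible(unsigned long addr, unsigned long len)
{
	struct file *file;
	VMA_ITERATOR(vmi, current->mm, addr);
	struct vm_area_struct *vma = vma_next(&vmi);
	unsigned long end = addr + len;

	/* We walk the VMA tree, so the caller must hold mmap_lock. */
	lockdep_assert_held_read(&current->mm->mmap_lock);

	if (WARN_ON_ONCE(!vma))
		return -EINVAL;

	file = vma->vm_file;

	/* Only anon, shmem and hugetlb file mappings are supported. */
	for_each_vma_range(vmi, vma, end) {
		if (vma->vm_file != file)
			return -EINVAL;

		if (!file)
			continue;

		if (!vma_is_shmem(vma) && !is_file_hugepages(file))
			return -EOPNOTSUPP;
	}

	return 0;
}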
Jens Axboe April 19, 2023, 4:59 p.m. UTC | #5
On 4/19/23 10:35 AM, Jens Axboe wrote:
> On 4/18/23 9:49 AM, Lorenzo Stoakes wrote:
>> We are shortly to remove pin_user_pages(), and instead perform the required
>> VMA checks ourselves. In most cases there will be a single VMA so this
>> should cause no undue impact on an already slow path.
>>
>> Doing this eliminates the one instance of vmas being used by
>> pin_user_pages().
> 
> First up, please don't just send single patches from a series. It's
> really annoying when you are trying to get the full picture. Just CC the
> whole series, so reviewers don't have to look it up separately.
> 
> So when you're doing a respin for what I'll mention below and the issue
> that David found, please don't just show us patch 4+5 of the series.

I'll reply here too rather than keep some of this conversation
out-of-band.

I don't necessarily think that making io buffer registration dumber and
less efficient by needing a separate vma lookup after the fact is a huge
deal, as I would imagine most workloads register buffers at setup time
and then don't change them. But if people do switch sets at runtime,
it's not necessarily a slow path. That said, I suspect the other bits
that we do in here, like the GUP, is going to dominate the overhead
anyway.

My main question is, why don't we just have a __pin_user_pages or
something helper that still takes the vmas argument, and drop it from
pin_user_pages() only? That'd still allow the cleanup of the other users
that don't care about the vma at all, while retaining the bundled
functionality for the case/cases that do? That would avoid needing
explicit vma iteration in io_uring.
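
Something with roughly this shape, i.e. keep one variant that still hands
back the vmas and make pin_user_pages() a thin wrapper (hypothetical sketch
of the suggestion, __pin_user_pages() is a made-up name):

long __pin_user_pages(unsigned long start, unsigned long nr_pages,
		      unsigned int gup_flags, struct page **pages,
		      struct vm_area_struct **vmas);

static inline long pin_user_pages(unsigned long start, unsigned long nr_pages,
				  unsigned int gup_flags, struct page **pages)
{
	/* Callers that don't care about vmas get the simpler interface. */
	return __pin_user_pages(start, nr_pages, gup_flags, pages, NULL);
}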
Lorenzo Stoakes April 19, 2023, 5:07 p.m. UTC | #6
On Wed, Apr 19, 2023 at 10:35:12AM -0600, Jens Axboe wrote:
> On 4/18/23 9:49 AM, Lorenzo Stoakes wrote:
> > We are shortly to remove pin_user_pages(), and instead perform the required
> > VMA checks ourselves. In most cases there will be a single VMA so this
> > should cause no undue impact on an already slow path.
> >
> > Doing this eliminates the one instance of vmas being used by
> > pin_user_pages().
>
> First up, please don't just send single patches from a series. It's
> really annoying when you are trying to get the full picture. Just CC the
> whole series, so reviewers don't have to look it up separately.
>

Sorry about that. It's hard to strike the right balance between not
spamming people and giving appropriate context, and different people have
different opinions about how best to do this. In retrospect it would
certainly have been a good idea to include you on the whole series.

> So when you're doing a respin for what I'll mention below and the issue
> that David found, please don't just show us patch 4+5 of the series.

ack

>
> > diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
> > index 7a43aed8e395..3a927df9d913 100644
> > --- a/io_uring/rsrc.c
> > +++ b/io_uring/rsrc.c
> > @@ -1138,12 +1138,37 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
> >  	return ret;
> >  }
> >
> > +static int check_vmas_locked(unsigned long addr, unsigned long len)
> > +{
> > +	struct file *file;
> > +	VMA_ITERATOR(vmi, current->mm, addr);
> > +	struct vm_area_struct *vma = vma_next(&vmi);
> > +	unsigned long end = addr + len;
> > +
> > +	if (WARN_ON_ONCE(!vma))
> > +		return -EINVAL;
> > +
> > +	file = vma->vm_file;
> > +	if (file && !is_file_hugepages(file))
> > +		return -EOPNOTSUPP;
> > +
> > +	/* don't support file backed memory */
> > +	for_each_vma_range(vmi, vma, end) {
> > +		if (vma->vm_file != file)
> > +			return -EINVAL;
> > +
> > +		if (file && !vma_is_shmem(vma))
> > +			return -EOPNOTSUPP;
> > +	}
> > +
> > +	return 0;
> > +}
>
> I really dislike this naming. There's no point to doing locked in the
> naming here, it just makes people think it's checking whether the vmas
> are locked. Which is not at all what it does. Because what else would we
> think, there's nothing else in the name that suggests what it is
> actually checking.
>
> Don't put implied locking in the naming, the way to do that is to do
> something ala:
>
> lockdep_assert_held_read(&current->mm->mmap_lock);
>
> though I don't think it's needed here at all, as there's just one caller
> and it's clearly inside. You could even just make a comment instead.
>
> So please rename this to indicate what it's ACTUALLY checking.

ack will do!

>
> --
> Jens Axboe
>
Lorenzo Stoakes April 19, 2023, 5:23 p.m. UTC | #7
On Wed, Apr 19, 2023 at 10:59:27AM -0600, Jens Axboe wrote:
> On 4/19/23 10:35 AM, Jens Axboe wrote:
> > On 4/18/23 9:49 AM, Lorenzo Stoakes wrote:
> >> We are shortly to remove pin_user_pages(), and instead perform the required
> >> VMA checks ourselves. In most cases there will be a single VMA so this
> >> should cause no undue impact on an already slow path.
> >>
> >> Doing this eliminates the one instance of vmas being used by
> >> pin_user_pages().
> >
> > First up, please don't just send single patches from a series. It's
> > really annoying when you are trying to get the full picture. Just CC the
> > whole series, so reviewers don't have to look it up separately.
> >
> > So when you're doing a respin for what I'll mention below and the issue
> > that David found, please don't just show us patch 4+5 of the series.
>
> I'll reply here too rather than keep some of this conversation
> out-of-band.
>
> I don't necessarily think that making io buffer registration dumber and
> less efficient by needing a separate vma lookup after the fact is a huge
> deal, as I would imagine most workloads register buffers at setup time
> and then don't change them. But if people do switch sets at runtime,
> it's not necessarily a slow path. That said, I suspect the other bits
> that we do in here, like the GUP, is going to dominate the overhead
> anyway.

Thanks, and indeed I expect the GUP will dominate.

>
> My main question is, why don't we just have a __pin_user_pages or
> something helper that still takes the vmas argument, and drop it from
> pin_user_pages() only? That'd still allow the cleanup of the other users
> that don't care about the vma at all, while retaining the bundled
> functionality for the case/cases that do? That would avoid needing
> explicit vma iteration in io_uring.
>

The desire here is to completely eliminate vmas as an externally available
parameter from GUP. While we do have a newly introduced helper that returns
a VMA, doing the lookup manually for all other vma cases (which look up a
single page and vma), that is more so a helper that sits outside of GUP.

Having a separate function that still bundled the vmas would essentially
undermine the purpose of the series altogether which is not just to clean
up some NULL's but rather to eliminate vmas as part of the GUP interface
altogether.

The reason for this is that by doing so we simplify the GUP interface,
eliminate a whole class of possible future bugs with people holding onto
pointers to vmas which may dangle and lead the way to future changes in GUP
which might be more impactful, such as trying to find means to use the fast
paths in more areas with an eye to gradual eradication of the use of
mmap_lock.

While we return VMAs, none of this is possible and it also makes the
interface more confusing - without vmas GUP takes flags which define its
behaviour and in most cases returns page objects. The odd rules about what
can and cannot return vmas under what circumstances are not helpful for new
users.

Another point here is that Jason suggested adding a new
FOLL_ALLOW_BROKEN_FILE_MAPPINGS flag which would, by default, not be
set. This could assert that only shmem/hugetlb file mappings are permitted
which would eliminate the need for you to perform this check at all.

This leads into the larger point that GUP-writing file mappings is
fundamentally broken due to e.g. GUP not honouring write notify so this
check should at least in theory not be necessary.

So it may be the case that should such a flag be added this code will
simply be deleted at a future point :)
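
For illustration only, such a flag could conceptually gate the check inside
GUP's own VMA validation along these lines (the flag and the helper name
are hypothetical, nothing like this exists yet):

static bool file_mapping_allowed(struct vm_area_struct *vma,
				 unsigned int gup_flags)
{
	/* Anonymous mappings are always fine. */
	if (!vma->vm_file)
		return true;

	/* GUP of shmem and hugetlb file mappings is considered OK. */
	if (vma_is_shmem(vma) || is_file_hugepages(vma->vm_file))
		return true;

	/* Anything else only if the caller explicitly opts in. */
	return !!(gup_flags & FOLL_ALLOW_BROKEN_FILE_MAPPINGS);
}

Callers which never set the flag (like io_uring) would then get the
shmem/hugetlb-only behaviour for free and the open-coded check here could
simply go away.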

> --
> Jens Axboe
>
Jens Axboe April 19, 2023, 5:35 p.m. UTC | #8
On 4/19/23 11:23 AM, Lorenzo Stoakes wrote:
> On Wed, Apr 19, 2023 at 10:59:27AM -0600, Jens Axboe wrote:
>> On 4/19/23 10:35 AM, Jens Axboe wrote:
>>> On 4/18/23 9:49 AM, Lorenzo Stoakes wrote:
>>>> We are shortly to remove pin_user_pages(), and instead perform the required
>>>> VMA checks ourselves. In most cases there will be a single VMA so this
>>>> should cause no undue impact on an already slow path.
>>>>
>>>> Doing this eliminates the one instance of vmas being used by
>>>> pin_user_pages().
>>>
>>> First up, please don't just send single patches from a series. It's
>>> really annoying when you are trying to get the full picture. Just CC the
>>> whole series, so reviewers don't have to look it up separately.
>>>
>>> So when you're doing a respin for what I'll mention below and the issue
>>> that David found, please don't just show us patch 4+5 of the series.
>>
>> I'll reply here too rather than keep some of this conversation
>> out-of-band.
>>
>> I don't necessarily think that making io buffer registration dumber and
>> less efficient by needing a separate vma lookup after the fact is a huge
>> deal, as I would imagine most workloads register buffers at setup time
>> and then don't change them. But if people do switch sets at runtime,
>> it's not necessarily a slow path. That said, I suspect the other bits
>> that we do in here, like the GUP, is going to dominate the overhead
>> anyway.
> 
> Thanks, and indeed I expect the GUP will dominate.

Unless you have a lot of vmas... Point is, it's _probably_ not a
problem, but it might and it's making things worse for no real gain as
far as I can tell outside of some notion of "cleaning up the code".

>> My main question is, why don't we just have a __pin_user_pages or
>> something helper that still takes the vmas argument, and drop it from
>> pin_user_pages() only? That'd still allow the cleanup of the other users
>> that don't care about the vma at all, while retaining the bundled
>> functionality for the case/cases that do? That would avoid needing
>> explicit vma iteration in io_uring.
>>
> 
> The desire here is to completely eliminate vmas as an externally available
> parameter from GUP. While we do have a newly introduced helper that returns
> a VMA, doing the lookup manually for all other vma cases (which look up a
> single page and vma), that is more so a helper that sits outside of GUP.
> 
> Having a separate function that still bundled the vmas would essentially
> undermine the purpose of the series altogether which is not just to clean
> up some NULL's but rather to eliminate vmas as part of the GUP interface
> altogether.
> 
> The reason for this is that by doing so we simplify the GUP interface,
> eliminate a whole class of possible future bugs with people holding onto
> pointers to vmas which may dangle and lead the way to future changes in GUP
> which might be more impactful, such as trying to find means to use the fast
> paths in more areas with an eye to gradual eradication of the use of
> mmap_lock.
> 
> While we return VMAs, none of this is possible and it also makes the
> interface more confusing - without vmas GUP takes flags which define its
> behaviour and in most cases returns page objects. The odd rules about what
> can and cannot return vmas under what circumstances are not helpful for new
> users.
> 
> Another point here is that Jason suggested adding a new
> FOLL_ALLOW_BROKEN_FILE_MAPPINGS flag which would, by default, not be
> set. This could assert that only shmem/hugetlb file mappings are permitted
> which would eliminate the need for you to perform this check at all.
> 
> This leads into the larger point that GUP-writing file mappings is
> fundamentally broken due to e.g. GUP not honouring write notify so this
> check should at least in theory not be necessary.
> 
> So it may be the case that should such a flag be added this code will
> simply be deleted at a future point :)

Why don't we do that first then? There's nothing more permanent than a
temporary workaround/fix. Once it's in there, motivation to get rid of
it for most people is zero because they just never see it. Seems like
that'd be a much saner approach rather than the other way around, and
make this patchset simpler/cleaner too as it'd only be removing code in
all of the callers.
Lorenzo Stoakes April 19, 2023, 5:47 p.m. UTC | #9
On Wed, Apr 19, 2023 at 11:35:58AM -0600, Jens Axboe wrote:
> On 4/19/23 11:23 AM, Lorenzo Stoakes wrote:
> > On Wed, Apr 19, 2023 at 10:59:27AM -0600, Jens Axboe wrote:
> >> On 4/19/23 10:35 AM, Jens Axboe wrote:
> >>> On 4/18/23 9:49 AM, Lorenzo Stoakes wrote:
> >>>> We are shortly to remove pin_user_pages(), and instead perform the required
> >>>> VMA checks ourselves. In most cases there will be a single VMA so this
> >>>> should cause no undue impact on an already slow path.
> >>>>
> >>>> Doing this eliminates the one instance of vmas being used by
> >>>> pin_user_pages().
> >>>
> >>> First up, please don't just send single patches from a series. It's
> >>> really annoying when you are trying to get the full picture. Just CC the
> >>> whole series, so reviewers don't have to look it up separately.
> >>>
> >>> So when you're doing a respin for what I'll mention below and the issue
> >>> that David found, please don't just show us patch 4+5 of the series.
> >>
> >> I'll reply here too rather than keep some of this conversation
> >> out-of-band.
> >>
> >> I don't necessarily think that making io buffer registration dumber and
> >> less efficient by needing a separate vma lookup after the fact is a huge
> >> deal, as I would imagine most workloads register buffers at setup time
> >> and then don't change them. But if people do switch sets at runtime,
> >> it's not necessarily a slow path. That said, I suspect the other bits
> >> that we do in here, like the GUP, is going to dominate the overhead
> >> anyway.
> >
> > Thanks, and indeed I expect the GUP will dominate.
>
> Unless you have a lot of vmas... Point is, it's _probably_ not a
> problem, but it might and it's making things worse for no real gain as
> far as I can tell outside of some notion of "cleaning up the code".
>
> >> My main question is, why don't we just have a __pin_user_pages or
> >> something helper that still takes the vmas argument, and drop it from
> >> pin_user_pages() only? That'd still allow the cleanup of the other users
> >> that don't care about the vma at all, while retaining the bundled
> >> functionality for the case/cases that do? That would avoid needing
> >> explicit vma iteration in io_uring.
> >>
> >
> > The desire here is to completely eliminate vmas as an externally available
> > parameter from GUP. While we do have a newly introduced helper that returns
> > a VMA, doing the lookup manually for all other vma cases (which look up a
> > single page and vma), that is more so a helper that sits outside of GUP.
> >
> > Having a separate function that still bundled the vmas would essentially
> > undermine the purpose of the series altogether which is not just to clean
> > up some NULL's but rather to eliminate vmas as part of the GUP interface
> > altogether.
> >
> > The reason for this is that by doing so we simplify the GUP interface,
> > eliminate a whole class of possible future bugs with people holding onto
> > pointers to vmas which may dangle and lead the way to future changes in GUP
> > which might be more impactful, such as trying to find means to use the fast
> > paths in more areas with an eye to gradual eradication of the use of
> > mmap_lock.
> >
> > While we return VMAs, none of this is possible and it also makes the
> > interface more confusing - without vmas GUP takes flags which define its
> > behaviour and in most cases returns page objects. The odd rules about what
> > can and cannot return vmas under what circumstances are not helpful for new
> > users.
> >
> > Another point here is that Jason suggested adding a new
> > FOLL_ALLOW_BROKEN_FILE_MAPPINGS flag which would, by default, not be
> > set. This could assert that only shmem/hugetlb file mappings are permitted
> > which would eliminate the need for you to perform this check at all.
> >
> > This leads into the larger point that GUP-writing file mappings is
> > fundamentally broken due to e.g. GUP not honouring write notify so this
> > check should at least in theory not be necessary.
> >
> > So it may be the case that should such a flag be added this code will
> > simply be deleted at a future point :)
>
> Why don't we do that first then? There's nothing more permanent than a
> temporary workaround/fix. Once it's in there, motivation to get rid of
> it for most people is zero because they just never see it. Seems like
> that'd be a much saner approach rather than the other way around, and
> make this patchset simpler/cleaner too as it'd only be removing code in
> all of the callers.
>

Because I'd then need to audit all GUP callers to see whether they in some
way brokenly access files in order to know which should and should not use
this new flag. It'd change this series from 'remove the vmas parameter' to
something a lot more involved.

I think it's much safer to do the two separately, as I feel that change
would need quite a bit of scrutiny too.

As for temporary, I can assure you I will be looking at introducing this
flag, for what it's worth :) and Jason is certainly minded to do work in
this area also.

> --
> Jens Axboe
>
Jens Axboe April 19, 2023, 5:51 p.m. UTC | #10
On 4/19/23 11:47 AM, Lorenzo Stoakes wrote:
> On Wed, Apr 19, 2023 at 11:35:58AM -0600, Jens Axboe wrote:
>> On 4/19/23 11:23 AM, Lorenzo Stoakes wrote:
>>> On Wed, Apr 19, 2023 at 10:59:27AM -0600, Jens Axboe wrote:
>>>> On 4/19/23 10:35 AM, Jens Axboe wrote:
>>>>> On 4/18/23 9:49 AM, Lorenzo Stoakes wrote:
>>>>>> We are shortly to remove pin_user_pages(), and instead perform the required
>>>>>> VMA checks ourselves. In most cases there will be a single VMA so this
>>>>>> should cause no undue impact on an already slow path.
>>>>>>
>>>>>> Doing this eliminates the one instance of vmas being used by
>>>>>> pin_user_pages().
>>>>>
>>>>> First up, please don't just send single patches from a series. It's
>>>>> really annoying when you are trying to get the full picture. Just CC the
>>>>> whole series, so reviewers don't have to look it up separately.
>>>>>
>>>>> So when you're doing a respin for what I'll mention below and the issue
>>>>> that David found, please don't just show us patch 4+5 of the series.
>>>>
>>>> I'll reply here too rather than keep some of this conversation
>>>> out-of-band.
>>>>
>>>> I don't necessarily think that making io buffer registration dumber and
>>>> less efficient by needing a separate vma lookup after the fact is a huge
>>>> deal, as I would imagine most workloads register buffers at setup time
>>>> and then don't change them. But if people do switch sets at runtime,
>>>> it's not necessarily a slow path. That said, I suspect the other bits
>>>> that we do in here, like the GUP, is going to dominate the overhead
>>>> anyway.
>>>
>>> Thanks, and indeed I expect the GUP will dominate.
>>
>> Unless you have a lot of vmas... Point is, it's _probably_ not a
>> problem, but it might and it's making things worse for no real gain as
>> far as I can tell outside of some notion of "cleaning up the code".
>>
>>>> My main question is, why don't we just have a __pin_user_pages or
>>>> something helper that still takes the vmas argument, and drop it from
>>>> pin_user_pages() only? That'd still allow the cleanup of the other users
>>>> that don't care about the vma at all, while retaining the bundled
>>>> functionality for the case/cases that do? That would avoid needing
>>>> explicit vma iteration in io_uring.
>>>>
>>>
>>> The desire here is to completely eliminate vmas as an externally available
>>> parameter from GUP. While we do have a newly introduced helper that returns
>>> a VMA, doing the lookup manually for all other vma cases (which look up a
>>> single page and vma), that is more so a helper that sits outside of GUP.
>>>
>>> Having a separate function that still bundled the vmas would essentially
>>> undermine the purpose of the series altogether which is not just to clean
>>> up some NULL's but rather to eliminate vmas as part of the GUP interface
>>> altogether.
>>>
>>> The reason for this is that by doing so we simplify the GUP interface,
>>> eliminate a whole class of possible future bugs with people holding onto
>>> pointers to vmas which may dangle and lead the way to future changes in GUP
>>> which might be more impactful, such as trying to find means to use the fast
>>> paths in more areas with an eye to gradual eradication of the use of
>>> mmap_lock.
>>>
>>> While we return VMAs, none of this is possible and it also makes the
>>> interface more confusing - without vmas GUP takes flags which define its
>>> behaviour and in most cases returns page objects. The odd rules about what
>>> can and cannot return vmas under what circumstances are not helpful for new
>>> users.
>>>
>>> Another point here is that Jason suggested adding a new
>>> FOLL_ALLOW_BROKEN_FILE_MAPPINGS flag which would, by default, not be
>>> set. This could assert that only shmem/hugetlb file mappings are permitted
>>> which would eliminate the need for you to perform this check at all.
>>>
>>> This leads into the larger point that GUP-writing file mappings is
>>> fundamentally broken due to e.g. GUP not honouring write notify so this
>>> check should at least in theory not be necessary.
>>>
>>> So it may be the case that should such a flag be added this code will
>>> simply be deleted at a future point :)
>>
>> Why don't we do that first then? There's nothing more permanent than a
>> temporary workaround/fix. Once it's in there, motivation to get rid of
>> it for most people is zero because they just never see it. Seems like
>> that'd be a much saner approach rather than the other way around, and
>> make this patchset simpler/cleaner too as it'd only be removing code in
>> all of the callers.
>>
> 
> Because I'd then need to audit all GUP callers to see whether they in some
> way brokenly access files in order to know which should and should not use
> this new flag. It'd change this series from 'remove the vmas parameter' to
> something a lot more involved.
> 
> I think it's much safer to do the two separately, as I feel that change
> would need quite a bit of scrutiny too.
> 
> As for temporary, I can assure you I will be looking at introducing this
> flag, for what it's worth :) and Jason is certainly minded to do work in
> this area also.

It's either feasible or it's not, and it didn't sound too bad in terms
of getting it done to remove the temporary addition. Since we're now
days away from the merge window and any of this would need to soak in
for-next anyway for a bit, why not just do that other series first? It
really is backward. And this happens sometimes when developing
patchsets, at some point you realize that things would be easier/cleaner
with another prep series first. Nothing wrong with that, but let's not
be hesitant to shift direction a bit when it makes sense to do so.

I keep getting this sense of urgency for a cleanup series. Why not just
do it right from the get-go and make this series simpler? At that point
there would be no discussion on it at all, as it would be a straight
forward cleanup without adding an intermediate step that'd get deleted
later anyway.
Lorenzo Stoakes April 19, 2023, 6:18 p.m. UTC | #11
On Wed, Apr 19, 2023 at 11:51:29AM -0600, Jens Axboe wrote:
> On 4/19/23 11:47 AM, Lorenzo Stoakes wrote:
> > On Wed, Apr 19, 2023 at 11:35:58AM -0600, Jens Axboe wrote:
> >> On 4/19/23 11:23 AM, Lorenzo Stoakes wrote:
> >>> On Wed, Apr 19, 2023 at 10:59:27AM -0600, Jens Axboe wrote:
> >>>> On 4/19/23 10:35 AM, Jens Axboe wrote:
> >>>>> On 4/18/23 9:49 AM, Lorenzo Stoakes wrote:
> >>>>>> We are shortly to remove pin_user_pages(), and instead perform the required
> >>>>>> VMA checks ourselves. In most cases there will be a single VMA so this
> >>>>>> should cause no undue impact on an already slow path.
> >>>>>>
> >>>>>> Doing this eliminates the one instance of vmas being used by
> >>>>>> pin_user_pages().
> >>>>>
> >>>>> First up, please don't just send single patches from a series. It's
> >>>>> really annoying when you are trying to get the full picture. Just CC the
> >>>>> whole series, so reviewers don't have to look it up separately.
> >>>>>
> >>>>> So when you're doing a respin for what I'll mention below and the issue
> >>>>> that David found, please don't just show us patch 4+5 of the series.
> >>>>
> >>>> I'll reply here too rather than keep some of this conversation
> >>>> out-of-band.
> >>>>
> >>>> I don't necessarily think that making io buffer registration dumber and
> >>>> less efficient by needing a separate vma lookup after the fact is a huge
> >>>> deal, as I would imagine most workloads register buffers at setup time
> >>>> and then don't change them. But if people do switch sets at runtime,
> >>>> it's not necessarily a slow path. That said, I suspect the other bits
> >>>> that we do in here, like the GUP, is going to dominate the overhead
> >>>> anyway.
> >>>
> >>> Thanks, and indeed I expect the GUP will dominate.
> >>
> >> Unless you have a lot of vmas... Point is, it's _probably_ not a
> >> problem, but it might and it's making things worse for no real gain as
> >> far as I can tell outside of some notion of "cleaning up the code".
> >>
> >>>> My main question is, why don't we just have a __pin_user_pages or
> >>>> something helper that still takes the vmas argument, and drop it from
> >>>> pin_user_pages() only? That'd still allow the cleanup of the other users
> >>>> that don't care about the vma at all, while retaining the bundled
> >>>> functionality for the case/cases that do? That would avoid needing
> >>>> explicit vma iteration in io_uring.
> >>>>
> >>>
> >>> The desire here is to completely eliminate vmas as an externally available
> >>> parameter from GUP. While we do have a newly introduced helper that returns
> >>> a VMA, doing the lookup manually for all other vma cases (which look up a
> >>> single page and vma), that is more so a helper that sits outside of GUP.
> >>>
> >>> Having a separate function that still bundled the vmas would essentially
> >>> undermine the purpose of the series altogether which is not just to clean
> >>> up some NULL's but rather to eliminate vmas as part of the GUP interface
> >>> altogether.
> >>>
> >>> The reason for this is that by doing so we simplify the GUP interface,
> >>> eliminate a whole class of possible future bugs with people holding onto
> >>> pointers to vmas which may dangle and lead the way to future changes in GUP
> >>> which might be more impactful, such as trying to find means to use the fast
> >>> paths in more areas with an eye to gradual eradication of the use of
> >>> mmap_lock.
> >>>
> >>> While we return VMAs, none of this is possible and it also makes the
> >>> interface more confusing - without vmas GUP takes flags which define its
> >>> behaviour and in most cases returns page objects. The odd rules about what
> >>> can and cannot return vmas under what circumstances are not helpful for new
> >>> users.
> >>>
> >>> Another point here is that Jason suggested adding a new
> >>> FOLL_ALLOW_BROKEN_FILE_MAPPINGS flag which would, by default, not be
> >>> set. This could assert that only shmem/hugetlb file mappings are permitted
> >>> which would eliminate the need for you to perform this check at all.
> >>>
> >>> This leads into the larger point that GUP-writing file mappings is
> >>> fundamentally broken due to e.g. GUP not honouring write notify so this
> >>> check should at least in theory not be necessary.
> >>>
> >>> So it may be the case that should such a flag be added this code will
> >>> simply be deleted at a future point :)
> >>
> >> Why don't we do that first then? There's nothing more permanent than a
> >> temporary workaround/fix. Once it's in there, motivation to get rid of
> >> it for most people is zero because they just never see it. Seems like
> >> that'd be a much saner approach rather than the other way around, and
> >> make this patchset simpler/cleaner too as it'd only be removing code in
> >> all of the callers.
> >>
> >
> > Because I'd then need to audit all GUP callers to see whether they in some
> > way brokenly access files in order to know which should and should not use
> > this new flag. It'd change this series from 'remove the vmas parameter' to
> > something a lot more involved.
> >
> > I think it's much safer to do the two separately, as I feel that change
> > would need quite a bit of scrutiny too.
> >
> > As for temporary, I can assure you I will be looking at introducing this
> > flag, for what it's worth :) and Jason is certainly minded to do work in
> > this area also.
>
> It's either feasible or it's not, and it didn't sound too bad in terms
> of getting it done to remove the temporary addition. Since we're now
> days away from the merge window and any of this would need to soak in
> for-next anyway for a bit, why not just do that other series first? It
> really is backward. And this happens sometimes when developing
> patchsets, at some point you realize that things would be easier/cleaner
> with another prep series first. Nothing wrong with that, but let's not
> be hesitant to shift direction a bit when it makes sense to do so.
>
> I keep getting this sense of urgency for a cleanup series. Why not just
> do it right from the get-go and make this series simpler? At that point
> there would be no discussion on it at all, as it would be a straight
> forward cleanup without adding an intermediate step that'd get deleted
> later anyway.

As I said, I think it is a little more than a cleanup, or at the very least
a cleanup that leads the way to more functional changes (and eliminates a
class of bugs).

I'd also argue that we are doing things right with this patch series as-is,
io_uring is the only sticking point because, believe it or not, it is the
only place in the kernel that uses multiple vmas (it has been interesting
to get a view on GUP use as a whole here).

But obviously if you have concerns about performance I understand (note
that the actual first iteration of this patch set added a flag specifically
to avoid the need for this in order to cause you less trouble :)

I would argue that you're _already_ manipulating VMAs (I do understand your
desire not to do so, obviously); the only thing that would change is how you
get them and the duplicated work (though likely less impactful).

So even if I did the FOLL_ALLOW_BROKEN_FILE_MAPPING patch series first, I
would still need to come along and delete a bunch of your code
afterwards. And unfortunately Pavel's recent change which insists on not
having different vm_file's across VMAs for the buffer would have to be
reverted so I expect it might not be entirely without discussion.

However, if you really do feel that you can't accept this change as-is, I
can put this series on hold and look at FOLL_ALLOW_BROKEN_FILE_MAPPING and
we can return to this afterwards.

>
> --
> Jens Axboe
>
Jason Gunthorpe April 19, 2023, 6:22 p.m. UTC | #12
On Wed, Apr 19, 2023 at 07:18:26PM +0100, Lorenzo Stoakes wrote:

> I'd also argue that we are doing things right with this patch series as-is,
> io_uring is the only sticking point because, believe it or not, it is the
> only place in the kernel that uses multiple vmas (it has been interesting
> to get a view on GUP use as a whole here).

I would say io_uring is the only place trying to open-code bug fixes
for MM problems :\ As Jens says, these sorts of temporary work arounds become
lingering problems that nobody wants to fix properly.

> So even if I did the FOLL_ALLOW_BROKEN_FILE_MAPPING patch series first, I
> would still need to come along and delete a bunch of your code
> afterwards. And unfortunately Pavel's recent change which insists on not
> having different vm_file's across VMAs for the buffer would have to be
> reverted so I expect it might not be entirely without discussion.

Yeah, that should just be reverted.

> However, if you really do feel that you can't accept this change as-is, I
> can put this series on hold and look at FOLL_ALLOW_BROKEN_FILE_MAPPING and
> we can return to this afterwards.

It is probably not as bad as you think, I suspect only RDMA really
wants to set the flag. Maybe something in media too, maybe.

Then again that stuff doesn't work so incredibly badly maybe there
really is no user of it and we should just block it completely.

Jason
Matthew Wilcox April 19, 2023, 6:23 p.m. UTC | #13
On Wed, Apr 19, 2023 at 07:18:26PM +0100, Lorenzo Stoakes wrote:
> So even if I did the FOLL_ALLOW_BROKEN_FILE_MAPPING patch series first, I
> would still need to come along and delete a bunch of your code
> afterwards. And unfortunately Pavel's recent change which insists on not
> having different vm_file's across VMAs for the buffer would have to be
> reverted so I expect it might not be entirely without discussion.

I don't even understand why Pavel wanted to make this change.  The
commit log really doesn't say.

commit edd478269640
Author: Pavel Begunkov <asml.silence@gmail.com>
Date:   Wed Feb 22 14:36:48 2023 +0000

    io_uring/rsrc: disallow multi-source reg buffers

    If two or more mappings go back to back to each other they can be passed
    into io_uring to be registered as a single registered buffer. That would
    even work if mappings came from different sources, e.g. it's possible to
    mix in this way anon pages and pages from shmem or hugetlb. That is not
    a problem but it'd rather be less prone if we forbid such mixing.

    Cc: <stable@vger.kernel.org>
    Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
    Signed-off-by: Jens Axboe <axboe@kernel.dk>

It even says "That is not a problem"!  So why was this patch merged
if it's not fixing a problem?

It's now standing in the way of an actual cleanup.  So why don't we
revert it?  There must be more to it than this ...
Jason Gunthorpe April 19, 2023, 6:24 p.m. UTC | #14
On Wed, Apr 19, 2023 at 07:23:00PM +0100, Matthew Wilcox wrote:
> On Wed, Apr 19, 2023 at 07:18:26PM +0100, Lorenzo Stoakes wrote:
> > So even if I did the FOLL_ALLOW_BROKEN_FILE_MAPPING patch series first, I
> > would still need to come along and delete a bunch of your code
> > afterwards. And unfortunately Pavel's recent change which insists on not
> > having different vm_file's across VMAs for the buffer would have to be
> > reverted so I expect it might not be entirely without discussion.
> 
> I don't even understand why Pavel wanted to make this change.  The
> commit log really doesn't say.
> 
> commit edd478269640
> Author: Pavel Begunkov <asml.silence@gmail.com>
> Date:   Wed Feb 22 14:36:48 2023 +0000
> 
>     io_uring/rsrc: disallow multi-source reg buffers
> 
>     If two or more mappings go back to back to each other they can be passed
>     into io_uring to be registered as a single registered buffer. That would
>     even work if mappings came from different sources, e.g. it's possible to
>     mix in this way anon pages and pages from shmem or hugetlb. That is not
>     a problem but it'd rather be less prone if we forbid such mixing.
> 
>     Cc: <stable@vger.kernel.org>
>     Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
>     Signed-off-by: Jens Axboe <axboe@kernel.dk>
> 
> It even says "That is not a problem"!  So why was this patch merged
> if it's not fixing a problem?
> 
> It's now standing in the way of an actual cleanup.  So why don't we
> revert it?  There must be more to it than this ...

https://lore.kernel.org/all/61ded378-51a8-1dcb-b631-fda1903248a9@gmail.com/

Jason
Matthew Wilcox April 19, 2023, 6:35 p.m. UTC | #15
On Wed, Apr 19, 2023 at 03:24:55PM -0300, Jason Gunthorpe wrote:
> On Wed, Apr 19, 2023 at 07:23:00PM +0100, Matthew Wilcox wrote:
> > On Wed, Apr 19, 2023 at 07:18:26PM +0100, Lorenzo Stoakes wrote:
> > > So even if I did the FOLL_ALLOW_BROKEN_FILE_MAPPING patch series first, I
> > > would still need to come along and delete a bunch of your code
> > > afterwards. And unfortunately Pavel's recent change which insists on not
> > > having different vm_file's across VMAs for the buffer would have to be
> > > reverted so I expect it might not be entirely without discussion.
> > 
> > I don't even understand why Pavel wanted to make this change.  The
> > commit log really doesn't say.
> > 
> > commit edd478269640
> > Author: Pavel Begunkov <asml.silence@gmail.com>
> > Date:   Wed Feb 22 14:36:48 2023 +0000
> > 
> >     io_uring/rsrc: disallow multi-source reg buffers
> > 
> >     If two or more mappings go back to back to each other they can be passed
> >     into io_uring to be registered as a single registered buffer. That would
> >     even work if mappings came from different sources, e.g. it's possible to
> >     mix in this way anon pages and pages from shmem or hugetlb. That is not
> >     a problem but it'd rather be less prone if we forbid such mixing.
> > 
> >     Cc: <stable@vger.kernel.org>
> >     Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
> >     Signed-off-by: Jens Axboe <axboe@kernel.dk>
> > 
> > It even says "That is not a problem"!  So why was this patch merged
> > if it's not fixing a problem?
> > 
> > It's now standing in the way of an actual cleanup.  So why don't we
> > revert it?  There must be more to it than this ...
> 
> https://lore.kernel.org/all/61ded378-51a8-1dcb-b631-fda1903248a9@gmail.com/

So um, it's disallowed because Pavel couldn't understand why it
should be allowed?  This gets less and less convincing.

FWIW, what I was suggesting was that we should have a FOLL_SINGLE_VMA
flag, which would use our shiny new VMA lock infrastructure to look
up and lock _one_ VMA instead of having the caller take the mmap_lock.
Passing that flag would be a tighter restriction than Pavel implemented,
but would certainly relieve some of his mental load.

By the way, even if all pages are from the same VMA, they may still be a
mixture of anon and file pages; think a MAP_PRIVATE of a file when
only some pages have been written to.  Or an anon MAP_SHARED which is
accessible by a child process.
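
A tiny userspace illustration of that last point (hypothetical example,
/tmp/demo is just a throwaway path and 4K pages are assumed): a single
MAP_PRIVATE file mapping in which only some pages have been written
contains a mix of anon CoW pages and file pages, all within one VMA.

#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	size_t page = 4096, len = 4 * page;	/* assume 4K pages */
	int fd = open("/tmp/demo", O_RDWR | O_CREAT, 0600);
	char *buf;

	if (fd < 0 || ftruncate(fd, len))
		return 1;

	buf = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
	if (buf == MAP_FAILED)
		return 1;

	buf[0] = 'x';					/* page 0: CoW'd anon page   */
	(void)*(volatile char *)(buf + 2 * page);	/* page 2: still file-backed */

	/*
	 * Registering [buf, buf + len) as a fixed io_uring buffer would pin
	 * anon and file pages coming from one and the same VMA.
	 */
	munmap(buf, len);
	close(fd);
	return 0;
}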
Lorenzo Stoakes April 19, 2023, 6:45 p.m. UTC | #16
On Wed, Apr 19, 2023 at 07:35:33PM +0100, Matthew Wilcox wrote:
> On Wed, Apr 19, 2023 at 03:24:55PM -0300, Jason Gunthorpe wrote:
> > On Wed, Apr 19, 2023 at 07:23:00PM +0100, Matthew Wilcox wrote:
> > > On Wed, Apr 19, 2023 at 07:18:26PM +0100, Lorenzo Stoakes wrote:
> > > > So even if I did the FOLL_ALLOW_BROKEN_FILE_MAPPING patch series first, I
> > > > would still need to come along and delete a bunch of your code
> > > > afterwards. And unfortunately Pavel's recent change which insists on not
> > > > having different vm_file's across VMAs for the buffer would have to be
> > > > reverted so I expect it might not be entirely without discussion.
> > >
> > > I don't even understand why Pavel wanted to make this change.  The
> > > commit log really doesn't say.
> > >
> > > commit edd478269640
> > > Author: Pavel Begunkov <asml.silence@gmail.com>
> > > Date:   Wed Feb 22 14:36:48 2023 +0000
> > >
> > >     io_uring/rsrc: disallow multi-source reg buffers
> > >
> > >     If two or more mappings go back to back to each other they can be passed
> > >     into io_uring to be registered as a single registered buffer. That would
> > >     even work if mappings came from different sources, e.g. it's possible to
> > >     mix in this way anon pages and pages from shmem or hugetlb. That is not
> > >     a problem but it'd rather be less prone if we forbid such mixing.
> > >
> > >     Cc: <stable@vger.kernel.org>
> > >     Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
> > >     Signed-off-by: Jens Axboe <axboe@kernel.dk>
> > >
> > > It even says "That is not a problem"!  So why was this patch merged
> > > if it's not fixing a problem?
> > >
> > > It's now standing in the way of an actual cleanup.  So why don't we
> > > revert it?  There must be more to it than this ...
> >
> > https://lore.kernel.org/all/61ded378-51a8-1dcb-b631-fda1903248a9@gmail.com/
>
> So um, it's disallowed because Pavel couldn't understand why it
> should be allowed?  This gets less and less convincing.
>
> FWIW, what I was suggesting was that we should have a FOLL_SINGLE_VMA
> flag, which would use our shiny new VMA lock infrastructure to look
> up and lock _one_ VMA instead of having the caller take the mmap_lock.
> Passing that flag would be a tighter restriction than Pavel implemented,
> but would certainly relieve some of his mental load.
>
> By the way, even if all pages are from the same VMA, they may still be a
> mixture of anon and file pages; think a MAP_PRIVATE of a file when
> only some pages have been written to.  Or an anon MAP_SHARED which is
> accessible by a child process.

Indeed, my motive for the series came out of a conversation with you about
vmas being odd (thanks! :); however, I did end up feeling FOLL_SINGLE_VMA
would be too restrictive and would break the uAPI.

For example, imagine if a user (yes it'd be weird) mlock'd some pages in a
buffer and not others, then we'd break their use case. Also (perhaps?) more
feasibly, a user might mix hugetlb and anon pages. So I think that'd be too
restrictive here.

However the idea of just essentially taking what Jens has had to do
open-coded and putting it into GUP as a whole really feels like the right
thing to do.

I do like the idea of a FOLL_SINGLE_VMA for other use cases though, the
majority of which want one and only one page. Perhaps worth taking the
helper added in this series (get_user_page_vma_remote() from [1]) and
replacing it with a full GUP function which has an interface explicitly
for this common single page/vma case.

[1]:https://lore.kernel.org/linux-mm/7c6f1ae88320bf11d2f583178a3d9e653e06ac63.1681831798.git.lstoakes@gmail.com/
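
That is, an interface shaped roughly like the below (my reading of [1];
the exact signature in the posted series may differ):

/* Pin exactly one page at addr and hand back both the page and its VMA. */
struct page *get_user_page_vma_remote(struct mm_struct *mm, unsigned long addr,
				      int gup_flags,
				      struct vm_area_struct **vmap);

Callers that today do a single-page GUP plus a separate vma lookup could
then use the one call and never see a vmas array that might dangle.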
Lorenzo Stoakes April 19, 2023, 6:50 p.m. UTC | #17
On Wed, Apr 19, 2023 at 03:22:19PM -0300, Jason Gunthorpe wrote:
> On Wed, Apr 19, 2023 at 07:18:26PM +0100, Lorenzo Stoakes wrote:
>
> > I'd also argue that we are doing things right with this patch series as-is,
> > io_uring is the only sticking point because, believe it or not, it is the
> > only place in the kernel that uses multiple vmas (it has been interesting
> > to get a view on GUP use as a whole here).
>
> I would say io_uring is the only place trying to open-code bug fixes
> for MM problems :\ As Jens says, these sorts of temporary work arounds become
> lingering problems that nobody wants to fix properly.
>
> > So even if I did the FOLL_ALLOW_BROKEN_FILE_MAPPING patch series first, I
> > would still need to come along and delete a bunch of your code
> > afterwards. And unfortunately Pavel's recent change which insists on not
> > having different vm_file's across VMAs for the buffer would have to be
> > reverted so I expect it might not be entirely without discussion.
>
> Yeah, that should just be reverted.
>
> > However, if you really do feel that you can't accept this change as-is, I
> > can put this series on hold and look at FOLL_ALLOW_BROKEN_FILE_MAPPING and
> > we can return to this afterwards.
>
> It is probably not as bad as you think, I suspect only RDMA really
> wants to set the flag. Maybe something in media too, maybe.
>
> Then again that stuff doesn't work so incredibly badly maybe there
> really is no user of it and we should just block it completely.
>
> Jason

OK, in this case I think we're all agreed that it's best to do this first,
then revisit this series afterwards. I will switch to working on this!

And I will make sure to cc- everyone in :)
Jens Axboe April 19, 2023, 8:15 p.m. UTC | #18
On 4/19/23 12:24 PM, Jason Gunthorpe wrote:
> On Wed, Apr 19, 2023 at 07:23:00PM +0100, Matthew Wilcox wrote:
>> On Wed, Apr 19, 2023 at 07:18:26PM +0100, Lorenzo Stoakes wrote:
>>> So even if I did the FOLL_ALLOW_BROKEN_FILE_MAPPING patch series first, I
>>> would still need to come along and delete a bunch of your code
>>> afterwards. And unfortunately Pavel's recent change which insists on not
>>> having different vm_file's across VMAs for the buffer would have to be
>>> reverted so I expect it might not be entirely without discussion.
>>
>> I don't even understand why Pavel wanted to make this change.  The
>> commit log really doesn't say.
>>
>> commit edd478269640
>> Author: Pavel Begunkov <asml.silence@gmail.com>
>> Date:   Wed Feb 22 14:36:48 2023 +0000
>>
>>     io_uring/rsrc: disallow multi-source reg buffers
>>
>>     If two or more mappings go back to back to each other they can be passed
>>     into io_uring to be registered as a single registered buffer. That would
>>     even work if mappings came from different sources, e.g. it's possible to
>>     mix in this way anon pages and pages from shmem or hugetlb. That is not
>>     a problem but it'd rather be less prone if we forbid such mixing.
>>
>>     Cc: <stable@vger.kernel.org>
>>     Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
>>     Signed-off-by: Jens Axboe <axboe@kernel.dk>
>>
>> It even says "That is not a problem"!  So why was this patch merged
>> if it's not fixing a problem?
>>
>> It's now standing in the way of an actual cleanup.  So why don't we
>> revert it?  There must be more to it than this ...
> 
> https://lore.kernel.org/all/61ded378-51a8-1dcb-b631-fda1903248a9@gmail.com/

Let's just kill that patch then, I can add a revert for 6.4. I had
forgotten about that patch and guess I didn't realize that most of the
issues do in fact just stem from that.
Jens Axboe April 19, 2023, 8:18 p.m. UTC | #19
On 4/19/23 2:15 PM, Jens Axboe wrote:
> On 4/19/23 12:24 PM, Jason Gunthorpe wrote:
>> On Wed, Apr 19, 2023 at 07:23:00PM +0100, Matthew Wilcox wrote:
>>> On Wed, Apr 19, 2023 at 07:18:26PM +0100, Lorenzo Stoakes wrote:
>>>> So even if I did the FOLL_ALLOW_BROKEN_FILE_MAPPING patch series first, I
>>>> would still need to come along and delete a bunch of your code
>>>> afterwards. And unfortunately Pavel's recent change which insists on not
>>>> having different vm_file's across VMAs for the buffer would have to be
>>>> reverted so I expect it might not be entirely without discussion.
>>>
>>> I don't even understand why Pavel wanted to make this change.  The
>>> commit log really doesn't say.
>>>
>>> commit edd478269640
>>> Author: Pavel Begunkov <asml.silence@gmail.com>
>>> Date:   Wed Feb 22 14:36:48 2023 +0000
>>>
>>>     io_uring/rsrc: disallow multi-source reg buffers
>>>
>>>     If two or more mappings go back to back to each other they can be passed
>>>     into io_uring to be registered as a single registered buffer. That would
>>>     even work if mappings came from different sources, e.g. it's possible to
>>>     mix in this way anon pages and pages from shmem or hugetlb. That is not
>>>     a problem but it'd rather be less prone if we forbid such mixing.
>>>
>>>     Cc: <stable@vger.kernel.org>
>>>     Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
>>>     Signed-off-by: Jens Axboe <axboe@kernel.dk>
>>>
>>> It even says "That is not a problem"!  So why was this patch merged
>>> if it's not fixing a problem?
>>>
>>> It's now standing in the way of an actual cleanup.  So why don't we
>>> revert it?  There must be more to it than this ...
>>
>> https://lore.kernel.org/all/61ded378-51a8-1dcb-b631-fda1903248a9@gmail.com/
> 
> Let's just kill that patch, I can add a revert for 6.4. I had
> forgotten about that patch and guess I didn't realize that most of the
> issues do in fact just stem from that.

https://git.kernel.dk/cgit/linux-block/commit/?h=for-6.4/io_uring&id=fbd3aaf37886d3645b1bd6920f6298f5884049f8
Jason Gunthorpe April 19, 2023, 11:22 p.m. UTC | #20
On Wed, Apr 19, 2023 at 07:45:06PM +0100, Lorenzo Stoakes wrote:

> For example, imagine if a user (yes it'd be weird) mlock'd some pages in a
> buffer and not others, then we'd break their use case. Also (perhaps?) more
> feasibly, a user might mix hugetlb and anon pages. So I think that'd be too
> restrictive here.

Yeah, I agree we should not add a broad single-vma restriction to
GUP. It turns any split of a VMA into a potentially uABI breaking
change and we just don't need that headache in the mm..

> I do like the idea of a FOLL_SINGLE_VMA for other use cases though, the
> majority of which want one and one page only. Perhaps worth taking the
> helper added in this series (get_user_page_vma_remote() from [1]) and
> replacing it with a full GUP function which has an interface explicitly
> for this common single page/vma case.

Like I showed in another thread, a function signature that can only do
one page and also returns the VMA would force it to be used properly,
and we don't need a FOLL flag.

Jason
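
[Editorial note: for illustration only, a minimal sketch of the kind of
single-page, VMA-returning signature described above. This is not the
helper from the series; the function name, the exact pin_user_pages_remote()
argument list (assuming the post-series prototype without the vmas
parameter) and the locking expectations are all assumptions.]

#include <linux/err.h>
#include <linux/mm.h>

/*
 * Illustrative sketch only: a single-page variant in the spirit of the
 * get_user_page_vma_remote() helper mentioned elsewhere in the thread.
 */
static struct page *pin_one_user_page(struct mm_struct *mm, unsigned long addr,
                                      unsigned int gup_flags,
                                      struct vm_area_struct **vmap)
{
        struct vm_area_struct *vma;
        struct page *page;
        long ret;

        /* Caller is expected to hold mmap_lock for read. */
        mmap_assert_locked(mm);

        ret = pin_user_pages_remote(mm, addr, 1, gup_flags, &page, NULL);
        if (ret != 1)
                return ERR_PTR(ret < 0 ? ret : -EFAULT);

        vma = vma_lookup(mm, addr);
        if (!vma) {
                unpin_user_page(page);
                return ERR_PTR(-EFAULT);
        }

        *vmap = vma;
        return page;
}

A caller of something shaped like this gets exactly one pinned page plus the
VMA it lives in, so it cannot accidentally span multiple VMAs.
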
Pavel Begunkov April 20, 2023, 1:36 p.m. UTC | #21
On 4/19/23 19:35, Matthew Wilcox wrote:
> On Wed, Apr 19, 2023 at 03:24:55PM -0300, Jason Gunthorpe wrote:
>> On Wed, Apr 19, 2023 at 07:23:00PM +0100, Matthew Wilcox wrote:
>>> On Wed, Apr 19, 2023 at 07:18:26PM +0100, Lorenzo Stoakes wrote:
>>>> So even if I did the FOLL_ALLOW_BROKEN_FILE_MAPPING patch series first, I
>>>> would still need to come along and delete a bunch of your code
>>>> afterwards. And unfortunately Pavel's recent change which insists on not
>>>> having different vm_file's across VMAs for the buffer would have to be
>>>> reverted so I expect it might not be entirely without discussion.
>>>
>>> I don't even understand why Pavel wanted to make this change.  The
>>> commit log really doesn't say.
>>>
>>> commit edd478269640
>>> Author: Pavel Begunkov <asml.silence@gmail.com>
>>> Date:   Wed Feb 22 14:36:48 2023 +0000
>>>
>>>      io_uring/rsrc: disallow multi-source reg buffers
>>>
>>>      If two or more mappings go back to back to each other they can be passed
>>>      into io_uring to be registered as a single registered buffer. That would
>>>      even work if mappings came from different sources, e.g. it's possible to
>>>      mix in this way anon pages and pages from shmem or hugetlb. That is not
>>>      a problem but it'd rather be less prone if we forbid such mixing.
>>>
>>>      Cc: <stable@vger.kernel.org>
>>>      Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
>>>      Signed-off-by: Jens Axboe <axboe@kernel.dk>
>>>
>>> It even says "That is not a problem"!  So why was this patch merged
>>> if it's not fixing a problem?
>>>
>>> It's now standing in the way of an actual cleanup.  So why don't we
>>> revert it?  There must be more to it than this ...
>>
>> https://lore.kernel.org/all/61ded378-51a8-1dcb-b631-fda1903248a9@gmail.com/
> 
> So um, it's disallowed because Pavel couldn't understand why it
> should be allowed?  This gets less and less convincing.

Excuse me? I'm really sorry you "couldn't understand" the explanation
as it has probably been too much of a "mental load", but let me try to
elaborate.

Because what can be registered is currently limited, it's indeed not
a big deal, but that will most certainly change, and I usually and
apparently nonsensically prefer to tighten things up _before_ it becomes
a problem. And again, taking a random set of buffers created for
different purposes and registering it as a single entity is IMHO not a
sane approach.

Take p2pdma for instance: if it had been passed without intermixing,
there might not have been a need for is_pci_p2pdma_page()/etc. checks on
every single page in a bvec. That won't change for p2pdma now, but in
general there might be other such cases in the future.


> FWIW, what I was suggesting was that we should have a FOLL_SINGLE_VMA
> flag, which would use our shiny new VMA lock infrastructure to look
> up and lock _one_ VMA instead of having the caller take the mmap_lock.
> Passing that flag would be a tighter restriction that Pavel implemented,
> but would certainly relieve some of his mental load.
> 
> By the way, even if all pages are from the same VMA, they may still be a
> mixture of anon and file pages; think a MAP_PRIVATE of a file when
> only some pages have been written to.  Or an anon MAP_SHARED which is
> accessible by a child process.
Pavel Begunkov April 20, 2023, 1:37 p.m. UTC | #22
On 4/19/23 21:15, Jens Axboe wrote:
> On 4/19/23 12:24?PM, Jason Gunthorpe wrote:
>> On Wed, Apr 19, 2023 at 07:23:00PM +0100, Matthew Wilcox wrote:
>>> On Wed, Apr 19, 2023 at 07:18:26PM +0100, Lorenzo Stoakes wrote:
>>>> So even if I did the FOLL_ALLOW_BROKEN_FILE_MAPPING patch series first, I
>>>> would still need to come along and delete a bunch of your code
>>>> afterwards. And unfortunately Pavel's recent change which insists on not
>>>> having different vm_file's across VMAs for the buffer would have to be
>>>> reverted so I expect it might not be entirely without discussion.
>>>
>>> I don't even understand why Pavel wanted to make this change.  The
>>> commit log really doesn't say.
>>>
>>> commit edd478269640
>>> Author: Pavel Begunkov <asml.silence@gmail.com>
>>> Date:   Wed Feb 22 14:36:48 2023 +0000
>>>
>>>      io_uring/rsrc: disallow multi-source reg buffers
>>>
>>>      If two or more mappings go back to back to each other they can be passed
>>>      into io_uring to be registered as a single registered buffer. That would
>>>      even work if mappings came from different sources, e.g. it's possible to
>>>      mix in this way anon pages and pages from shmem or hugetlb. That is not
>>>      a problem but it'd rather be less prone if we forbid such mixing.
>>>
>>>      Cc: <stable@vger.kernel.org>
>>>      Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
>>>      Signed-off-by: Jens Axboe <axboe@kernel.dk>
>>>
>>> It even says "That is not a problem"!  So why was this patch merged
>>> if it's not fixing a problem?
>>>
>>> It's now standing in the way of an actual cleanup.  So why don't we
>>> revert it?  There must be more to it than this ...
>>
>> https://lore.kernel.org/all/61ded378-51a8-1dcb-b631-fda1903248a9@gmail.com/
> 
> Let's just kill that patch, I can add a revert for 6.4. I had
> forgotten about that patch and guess I didn't realize that most of the
> issues do in fact just stem from that.

Well, we're now trading uapi sanity for cleanups; I don't know what I
should take away from it.
Lorenzo Stoakes April 20, 2023, 1:57 p.m. UTC | #23
On Wed, Apr 19, 2023 at 08:22:51PM -0300, Jason Gunthorpe wrote:
> On Wed, Apr 19, 2023 at 07:45:06PM +0100, Lorenzo Stoakes wrote:
>
> > For example, imagine if a user (yes it'd be weird) mlock'd some pages in a
> > buffer and not others, then we'd break their use case. Also (perhaps?) more
> > feasibly, a user might mix hugetlb and anon pages. So I think that'd be too
> > restrictive here.
>
> Yeah, I agree we should not add a broad single-vma restriction to
> GUP. It turns any split of a VMA into a potentially uABI breaking
> change and we just don't need that headache in the mm..
>
> > I do like the idea of a FOLL_SINGLE_VMA for other use cases though, the
> > majority of which want one and one page only. Perhaps worth taking the
> > helper added in this series (get_user_page_vma_remote() from [1]) and
> > replacing it with a full GUP function which has an interface explicitly
> > for this common single page/vma case.
>
> Like I showed in another thread a function signature that can only do
> one page and also returns the VMA would force it to be used properly
> and we don't need a FOLL flag.
>

Indeed, the latest spin of the series uses this. The point is that by doing
so we can use per-VMA locks for a common case; I was thinking perhaps as a
separate function call (or perhaps just reusing the wrapper).

This would be entirely separate from all the other work.

> Jason
Lorenzo Stoakes April 20, 2023, 2:19 p.m. UTC | #24
On Thu, Apr 20, 2023 at 02:36:47PM +0100, Pavel Begunkov wrote:
> On 4/19/23 19:35, Matthew Wilcox wrote:
> > On Wed, Apr 19, 2023 at 03:24:55PM -0300, Jason Gunthorpe wrote:
> > > On Wed, Apr 19, 2023 at 07:23:00PM +0100, Matthew Wilcox wrote:
> > > > On Wed, Apr 19, 2023 at 07:18:26PM +0100, Lorenzo Stoakes wrote:
> > > > > So even if I did the FOLL_ALLOW_BROKEN_FILE_MAPPING patch series first, I
> > > > > would still need to come along and delete a bunch of your code
> > > > > afterwards. And unfortunately Pavel's recent change which insists on not
> > > > > having different vm_file's across VMAs for the buffer would have to be
> > > > > reverted so I expect it might not be entirely without discussion.
> > > >
> > > > I don't even understand why Pavel wanted to make this change.  The
> > > > commit log really doesn't say.
> > > >
> > > > commit edd478269640
> > > > Author: Pavel Begunkov <asml.silence@gmail.com>
> > > > Date:   Wed Feb 22 14:36:48 2023 +0000
> > > >
> > > >      io_uring/rsrc: disallow multi-source reg buffers
> > > >
> > > >      If two or more mappings go back to back to each other they can be passed
> > > >      into io_uring to be registered as a single registered buffer. That would
> > > >      even work if mappings came from different sources, e.g. it's possible to
> > > >      mix in this way anon pages and pages from shmem or hugetlb. That is not
> > > >      a problem but it'd rather be less prone if we forbid such mixing.
> > > >
> > > >      Cc: <stable@vger.kernel.org>
> > > >      Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
> > > >      Signed-off-by: Jens Axboe <axboe@kernel.dk>
> > > >
> > > > It even says "That is not a problem"!  So why was this patch merged
> > > > if it's not fixing a problem?
> > > >
> > > > It's now standing in the way of an actual cleanup.  So why don't we
> > > > revert it?  There must be more to it than this ...
> > >
> > > https://lore.kernel.org/all/61ded378-51a8-1dcb-b631-fda1903248a9@gmail.com/
> >
> > So um, it's disallowed because Pavel couldn't understand why it
> > should be allowed?  This gets less and less convincing.
>
> Excuse me? I'm really sorry you "couldn't understand" the explanation
> as it has probably been too much of a "mental load", but let me try to
> elaborate.
>
> Because it's currently limited what can be registered, it's indeed not
> a big deal, but that will most certainly change, and I usually and
> apparently nonsensically prefer to tighten things up _before_ it becomes
> a problem. And again, taking a random set of buffers created for
> different purposes and registering it as a single entity is IMHO not a
> sane approach.
>
> Take p2pdma for instance, if would have been passed without intermixing
> there might not have been is_pci_p2pdma_page()/etc. for every single page
> in a bvec. That's why in general, it won't change for p2pdma but there
> might be other cases in the future.
>

The proposed changes for GUP are equally intended to tighten things up: in
advance of issues (e.g. misuse of the vmas parameter), for the purposes of
future improvements to GUP (optimising how we perform these operations) and,
most pertinently here, to ensure broken uses of GUP do not occur.

So both are 'cleanups' in some sense :) I think it's important to point out
that this change is not simply a desire to improve an interface but has
practical implications going forward (which maybe aren't obvious at this
point yet).

The new approach to these changes goes further in that we essentially
perform the existing check here (anon/shmem/hugetlb) but for _all_
FOLL_WRITE operations in GUP to avoid broken writes to file mappings, at
which point we can just remove the check here altogether (I will post a
series for this soon).
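
[Editorial note: as a rough, hedged sketch of the kind of per-VMA predicate
being described (not the code that will be posted; the helper name, exact
policy and where it would live in GUP are assumptions):]

#include <linux/hugetlb.h>
#include <linux/mm.h>

/*
 * Hypothetical sketch of the FOLL_WRITE-time check described above: only
 * allow pinning for write against anonymous, shmem or hugetlb mappings.
 */
static bool vma_ok_for_write_pin(struct vm_area_struct *vma)
{
        struct file *file = vma->vm_file;

        /* Private anonymous memory and shmem (incl. shared anon) are fine. */
        if (vma_is_anonymous(vma) || vma_is_shmem(vma))
                return true;

        /* hugetlbfs-backed mappings are also accepted. */
        return file && is_file_hugepages(file);
}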

I think that you guys should not have to be performing sanity checks here;
rather, we should handle it in GUP so you don't need to think about it.
It feels like an mm failing that you have had to do so at all.

So I guess the question is, do you feel the requirement for vm_file being
the same should apply across _any_ GUP operation over a range or is it
io_uring-specific?

If the former, then we should do it in GUP alongside the other checks (and
you can comment accordingly on that patch series when it comes); if the
latter, then I would restate my opinion that we shouldn't be prevented from
making improvements to GUP for one caller that wants to iterate over these
VMAs for custom checks, and that should be done separately.

>
> > FWIW, what I was suggesting was that we should have a FOLL_SINGLE_VMA
> > flag, which would use our shiny new VMA lock infrastructure to look
> > up and lock _one_ VMA instead of having the caller take the mmap_lock.
> > Passing that flag would be a tighter restriction that Pavel implemented,
> > but would certainly relieve some of his mental load.
> >
> > By the way, even if all pages are from the same VMA, they may still be a
> > mixture of anon and file pages; think a MAP_PRIVATE of a file when
> > only some pages have been written to.  Or an anon MAP_SHARED which is
> > accessible by a child process.
>
> --
> Pavel Begunkov
Jason Gunthorpe April 20, 2023, 3:31 p.m. UTC | #25
On Thu, Apr 20, 2023 at 03:19:01PM +0100, Lorenzo Stoakes wrote:

> So I guess the question is, do you feel the requirement for vm_file being
> the same should apply across _any_ GUP operation over a range or is it
> io_uring-specific?

No need at all anywhere.

GUP's API contract is you get a jumble of pages with certain
properties. Ie normal GUP gives you a jumble of CPU cache coherent
struct pages.

The job of the GUP caller is to process this jumble.

There is no architectural reason to restrict it beyond that, and
io_uring has no functional need either.

The caller of GUP is, by design, not supposed to know about files, or
VMAs, or other MM details. GUP's purpose is to abstract that.

I would not object if io_uring demanded that all struct pages be a certain
way, like all being P2P or something. But that is a *struct page* based
test, not a VMA test.
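
[Editorial note: for illustration, a page-array check along those lines
might look like the sketch below, run over the array returned by
pin_user_pages(); the helper name and the all-or-nothing P2PDMA policy are
assumptions, not anything posted in this thread.]

#include <linux/errno.h>
#include <linux/memremap.h>
#include <linux/mm.h>

/*
 * Illustrative only: inspect the pinned struct pages themselves rather
 * than the VMAs they came from, e.g. require that the pages are uniformly
 * P2PDMA or uniformly not.
 */
static int check_pages_uniform_p2pdma(struct page **pages, long nr_pages,
                                      bool *is_p2pdma)
{
        long i;

        if (nr_pages <= 0)
                return -EINVAL;

        *is_p2pdma = is_pci_p2pdma_page(pages[0]);
        for (i = 1; i < nr_pages; i++) {
                if (is_pci_p2pdma_page(pages[i]) != *is_p2pdma)
                        return -EOPNOTSUPP;
        }

        return 0;
}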

Checking VMAs is basically io_uring saying it knows better than GUP
and better than the MM on how this abstraction should work.

Jason
diff mbox series

Patch

diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 7a43aed8e395..3a927df9d913 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -1138,12 +1138,37 @@  static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
 	return ret;
 }
 
+static int check_vmas_locked(unsigned long addr, unsigned long len)
+{
+	struct file *file;
+	VMA_ITERATOR(vmi, current->mm, addr);
+	struct vm_area_struct *vma = vma_next(&vmi);
+	unsigned long end = addr + len;
+
+	if (WARN_ON_ONCE(!vma))
+		return -EINVAL;
+
+	file = vma->vm_file;
+	if (file && !is_file_hugepages(file))
+		return -EOPNOTSUPP;
+
+	/* don't support file backed memory */
+	for_each_vma_range(vmi, vma, end) {
+		if (vma->vm_file != file)
+			return -EINVAL;
+
+		if (file && !vma_is_shmem(vma))
+			return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
 struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages)
 {
 	unsigned long start, end, nr_pages;
-	struct vm_area_struct **vmas = NULL;
 	struct page **pages = NULL;
-	int i, pret, ret = -ENOMEM;
+	int pret, ret = -ENOMEM;
 
 	end = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	start = ubuf >> PAGE_SHIFT;
@@ -1153,31 +1178,14 @@  struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages)
 	if (!pages)
 		goto done;
 
-	vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *),
-			      GFP_KERNEL);
-	if (!vmas)
-		goto done;
-
 	ret = 0;
 	mmap_read_lock(current->mm);
+
 	pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
-			      pages, vmas);
-	if (pret == nr_pages) {
-		struct file *file = vmas[0]->vm_file;
+			      pages, NULL);
 
-		/* don't support file backed memory */
-		for (i = 0; i < nr_pages; i++) {
-			if (vmas[i]->vm_file != file) {
-				ret = -EINVAL;
-				break;
-			}
-			if (!file)
-				continue;
-			if (!vma_is_shmem(vmas[i]) && !is_file_hugepages(file)) {
-				ret = -EOPNOTSUPP;
-				break;
-			}
-		}
+	if (pret == nr_pages) {
+		ret = check_vmas_locked(ubuf, len);
 		*npages = nr_pages;
 	} else {
 		ret = pret < 0 ? pret : -EFAULT;
@@ -1194,7 +1202,6 @@  struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages)
 	}
 	ret = 0;
 done:
-	kvfree(vmas);
 	if (ret < 0) {
 		kvfree(pages);
 		pages = ERR_PTR(ret);