diff mbox series

[5/6] powerpc/mmu: drop mmap_sem now that locked_vm is atomic

Message ID 20190402204158.27582-6-daniel.m.jordan@oracle.com (mailing list archive)
State New, archived
Headers show
Series convert locked_vm from unsigned long to atomic64_t | expand

Commit Message

Daniel Jordan April 2, 2019, 8:41 p.m. UTC
With locked_vm now an atomic, there is no need to take mmap_sem as
writer.  Delete and refactor accordingly.

Signed-off-by: Daniel Jordan <daniel.m.jordan@oracle.com>
Cc: Alexey Kardashevskiy <aik@ozlabs.ru>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Paul Mackerras <paulus@samba.org>
Cc: <linux-mm@kvack.org>
Cc: <linuxppc-dev@lists.ozlabs.org>
Cc: <linux-kernel@vger.kernel.org>
---
 arch/powerpc/mm/mmu_context_iommu.c | 27 +++++++++++----------------
 1 file changed, 11 insertions(+), 16 deletions(-)

Comments

Christophe Leroy April 3, 2019, 4:58 a.m. UTC | #1
Le 02/04/2019 à 22:41, Daniel Jordan a écrit :
> With locked_vm now an atomic, there is no need to take mmap_sem as
> writer.  Delete and refactor accordingly.

Could you please detail the change ? It looks like this is not the only 
change. I'm wondering what the consequences are.

Before we did:
- lock
- calculate future value
- check the future value is acceptable
- update value if future value acceptable
- return error if future value non acceptable
- unlock

Now we do:
- atomic update with future (possibly too high) value
- check the new value is acceptable
- atomic update back with older value if new value not acceptable and 
return error

So if a concurrent action wants to increase locked_vm with an acceptable 
step while another one has temporarily set it too high, it will now fail.

I think we should keep the previous approach and do a cmpxchg after 
validating the new value.

Christophe

> 
> Signed-off-by: Daniel Jordan <daniel.m.jordan@oracle.com>
> Cc: Alexey Kardashevskiy <aik@ozlabs.ru>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> Cc: Christoph Lameter <cl@linux.com>
> Cc: Davidlohr Bueso <dave@stgolabs.net>
> Cc: Michael Ellerman <mpe@ellerman.id.au>
> Cc: Paul Mackerras <paulus@samba.org>
> Cc: <linux-mm@kvack.org>
> Cc: <linuxppc-dev@lists.ozlabs.org>
> Cc: <linux-kernel@vger.kernel.org>
> ---
>   arch/powerpc/mm/mmu_context_iommu.c | 27 +++++++++++----------------
>   1 file changed, 11 insertions(+), 16 deletions(-)
> 
> diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c
> index 8038ac24a312..a4ef22b67c07 100644
> --- a/arch/powerpc/mm/mmu_context_iommu.c
> +++ b/arch/powerpc/mm/mmu_context_iommu.c
> @@ -54,34 +54,29 @@ struct mm_iommu_table_group_mem_t {
>   static long mm_iommu_adjust_locked_vm(struct mm_struct *mm,
>   		unsigned long npages, bool incr)
>   {
> -	long ret = 0, locked, lock_limit;
> +	long ret = 0;
> +	unsigned long lock_limit;
>   	s64 locked_vm;
>   
>   	if (!npages)
>   		return 0;
>   
> -	down_write(&mm->mmap_sem);
> -	locked_vm = atomic64_read(&mm->locked_vm);
>   	if (incr) {
> -		locked = locked_vm + npages;
>   		lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> -		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
> +		locked_vm = atomic64_add_return(npages, &mm->locked_vm);
> +		if (locked_vm > lock_limit && !capable(CAP_IPC_LOCK)) {
>   			ret = -ENOMEM;
> -		else
> -			atomic64_add(npages, &mm->locked_vm);
> +			atomic64_sub(npages, &mm->locked_vm);
> +		}
>   	} else {
> -		if (WARN_ON_ONCE(npages > locked_vm))
> -			npages = locked_vm;
> -		atomic64_sub(npages, &mm->locked_vm);
> +		locked_vm = atomic64_sub_return(npages, &mm->locked_vm);
> +		WARN_ON_ONCE(locked_vm < 0);
>   	}
>   
> -	pr_debug("[%d] RLIMIT_MEMLOCK HASH64 %c%ld %ld/%ld\n",
> -			current ? current->pid : 0,
> -			incr ? '+' : '-',
> -			npages << PAGE_SHIFT,
> -			atomic64_read(&mm->locked_vm) << PAGE_SHIFT,
> +	pr_debug("[%d] RLIMIT_MEMLOCK HASH64 %c%lu %lld/%lu\n",
> +			current ? current->pid : 0, incr ? '+' : '-',
> +			npages << PAGE_SHIFT, locked_vm << PAGE_SHIFT,
>   			rlimit(RLIMIT_MEMLOCK));
> -	up_write(&mm->mmap_sem);
>   
>   	return ret;
>   }
>
Daniel Jordan April 3, 2019, 4:40 p.m. UTC | #2
On Wed, Apr 03, 2019 at 06:58:45AM +0200, Christophe Leroy wrote:
> Le 02/04/2019 à 22:41, Daniel Jordan a écrit :
> > With locked_vm now an atomic, there is no need to take mmap_sem as
> > writer.  Delete and refactor accordingly.
> 
> Could you please detail the change ?

Ok, I'll be more specific in the next version, using some of your language in
fact.  :)

> It looks like this is not the only
> change. I'm wondering what the consequences are.
> 
> Before we did:
> - lock
> - calculate future value
> - check the future value is acceptable
> - update value if future value acceptable
> - return error if future value non acceptable
> - unlock
> 
> Now we do:
> - atomic update with future (possibly too high) value
> - check the new value is acceptable
> - atomic update back with older value if new value not acceptable and return
> error
> 
> So if a concurrent action wants to increase locked_vm with an acceptable
> step while another one has temporarily set it too high, it will now fail.
> 
> I think we should keep the previous approach and do a cmpxchg after
> validating the new value.

That's a good idea, and especially worth doing considering that an arbitrary
number of threads that charge a low amount of locked_vm can fail just because
one thread charges lots of it.

pinned_vm appears to be broken the same way, so I can fix it too unless someone
beats me to it.
Davidlohr Bueso April 24, 2019, 2:15 a.m. UTC | #3
On Wed, 03 Apr 2019, Daniel Jordan wrote:

>On Wed, Apr 03, 2019 at 06:58:45AM +0200, Christophe Leroy wrote:
>> Le 02/04/2019 à 22:41, Daniel Jordan a écrit :
>> > With locked_vm now an atomic, there is no need to take mmap_sem as
>> > writer.  Delete and refactor accordingly.
>>
>> Could you please detail the change ?
>
>Ok, I'll be more specific in the next version, using some of your language in
>fact.  :)
>
>> It looks like this is not the only
>> change. I'm wondering what the consequences are.
>>
>> Before we did:
>> - lock
>> - calculate future value
>> - check the future value is acceptable
>> - update value if future value acceptable
>> - return error if future value non acceptable
>> - unlock
>>
>> Now we do:
>> - atomic update with future (possibly too high) value
>> - check the new value is acceptable
>> - atomic update back with older value if new value not acceptable and return
>> error
>>
>> So if a concurrent action wants to increase locked_vm with an acceptable
>> step while another one has temporarily set it too high, it will now fail.
>>
>> I think we should keep the previous approach and do a cmpxchg after
>> validating the new value.

Wouldn't the cmpxchg alternative also be exposed the locked_vm changing between
validating the new value and the cmpxchg() and we'd bogusly fail even when there
is still just because the value changed (I'm assuming we don't hold any locks,
otherwise all this is pointless).

   current_locked = atomic_read(&mm->locked_vm);
   new_locked = current_locked + npages;
   if (new_locked < lock_limit)
      if (cmpxchg(&mm->locked_vm, current_locked, new_locked) == current_locked)
      	 /* ENOMEM */

>
>That's a good idea, and especially worth doing considering that an arbitrary
>number of threads that charge a low amount of locked_vm can fail just because
>one thread charges lots of it.

Yeah but the window for this is quite small, I doubt it would be a real issue.

What if before doing the atomic_add_return(), we first did the racy new_locked
check for ENOMEM, then do the speculative add and cleanup, if necessary. This
would further reduce the scope of the window where false ENOMEM can occur.

>pinned_vm appears to be broken the same way, so I can fix it too unless someone
>beats me to it.

This should not be a surprise for the rdma folks. Cc'ing Jason nonetheless.

Thanks,
Davidlohr
Davidlohr Bueso April 24, 2019, 2:31 a.m. UTC | #4
On Tue, 23 Apr 2019, Bueso wrote:

>On Wed, 03 Apr 2019, Daniel Jordan wrote:
>
>>On Wed, Apr 03, 2019 at 06:58:45AM +0200, Christophe Leroy wrote:
>>>Le 02/04/2019 à 22:41, Daniel Jordan a écrit :
>>>> With locked_vm now an atomic, there is no need to take mmap_sem as
>>>> writer.  Delete and refactor accordingly.
>>>
>>>Could you please detail the change ?
>>
>>Ok, I'll be more specific in the next version, using some of your language in
>>fact.  :)
>>
>>>It looks like this is not the only
>>>change. I'm wondering what the consequences are.
>>>
>>>Before we did:
>>>- lock
>>>- calculate future value
>>>- check the future value is acceptable
>>>- update value if future value acceptable
>>>- return error if future value non acceptable
>>>- unlock
>>>
>>>Now we do:
>>>- atomic update with future (possibly too high) value
>>>- check the new value is acceptable
>>>- atomic update back with older value if new value not acceptable and return
>>>error
>>>
>>>So if a concurrent action wants to increase locked_vm with an acceptable
>>>step while another one has temporarily set it too high, it will now fail.
>>>
>>>I think we should keep the previous approach and do a cmpxchg after
>>>validating the new value.
>
>Wouldn't the cmpxchg alternative also be exposed the locked_vm changing between
>validating the new value and the cmpxchg() and we'd bogusly fail even when there
>is still just because the value changed (I'm assuming we don't hold any locks,
>otherwise all this is pointless).
>
>  current_locked = atomic_read(&mm->locked_vm);
>  new_locked = current_locked + npages;
>  if (new_locked < lock_limit)
>     if (cmpxchg(&mm->locked_vm, current_locked, new_locked) == current_locked)

Err, this being != of course.
Jason Gunthorpe April 24, 2019, 11:10 a.m. UTC | #5
On Tue, Apr 23, 2019 at 07:15:44PM -0700, Davidlohr Bueso wrote:
> On Wed, 03 Apr 2019, Daniel Jordan wrote:
> 
> > On Wed, Apr 03, 2019 at 06:58:45AM +0200, Christophe Leroy wrote:
> > > Le 02/04/2019 à 22:41, Daniel Jordan a écrit :
> > > > With locked_vm now an atomic, there is no need to take mmap_sem as
> > > > writer.  Delete and refactor accordingly.
> > > 
> > > Could you please detail the change ?
> > 
> > Ok, I'll be more specific in the next version, using some of your language in
> > fact.  :)
> > 
> > > It looks like this is not the only
> > > change. I'm wondering what the consequences are.
> > > 
> > > Before we did:
> > > - lock
> > > - calculate future value
> > > - check the future value is acceptable
> > > - update value if future value acceptable
> > > - return error if future value non acceptable
> > > - unlock
> > > 
> > > Now we do:
> > > - atomic update with future (possibly too high) value
> > > - check the new value is acceptable
> > > - atomic update back with older value if new value not acceptable and return
> > > error
> > > 
> > > So if a concurrent action wants to increase locked_vm with an acceptable
> > > step while another one has temporarily set it too high, it will now fail.
> > > 
> > > I think we should keep the previous approach and do a cmpxchg after
> > > validating the new value.
> 
> Wouldn't the cmpxchg alternative also be exposed the locked_vm changing between
> validating the new value and the cmpxchg() and we'd bogusly fail even when there
> is still just because the value changed (I'm assuming we don't hold any locks,
> otherwise all this is pointless).
> 
>   current_locked = atomic_read(&mm->locked_vm);
>   new_locked = current_locked + npages;
>   if (new_locked < lock_limit)
>      if (cmpxchg(&mm->locked_vm, current_locked, new_locked) == current_locked)
>      	 /* ENOMEM */

Well it needs a loop..

again:
   current_locked = atomic_read(&mm->locked_vm);
   new_locked = current_locked + npages;
   if (new_locked < lock_limit)
      if (cmpxchg(&mm->locked_vm, current_locked, new_locked) != current_locked)
            goto again;

So it won't have bogus failures as there is no unwind after
error. Basically this is a load locked/store conditional style of
locking pattern.

> > That's a good idea, and especially worth doing considering that an arbitrary
> > number of threads that charge a low amount of locked_vm can fail just because
> > one thread charges lots of it.
> 
> Yeah but the window for this is quite small, I doubt it would be a real issue.
> 
> What if before doing the atomic_add_return(), we first did the racy new_locked
> check for ENOMEM, then do the speculative add and cleanup, if necessary. This
> would further reduce the scope of the window where false ENOMEM can occur.
> 
> > pinned_vm appears to be broken the same way, so I can fix it too unless someone
> > beats me to it.
> 
> This should not be a surprise for the rdma folks. Cc'ing Jason nonetheless.

I think we accepted this tiny race as a side effect of removing the
lock, which was very beneficial. Really the time window between the
atomic failing and unwind is very small, and there are enough other
ways a hostile user could DOS locked_vm that I don't think it really
matters in practice..

However, the cmpxchg seems better, so a helper to implement that would
probably be the best thing to do.

Jason
Daniel Jordan April 25, 2019, 1:47 a.m. UTC | #6
On Wed, Apr 24, 2019 at 11:10:24AM +0000, Jason Gunthorpe wrote:
> On Tue, Apr 23, 2019 at 07:15:44PM -0700, Davidlohr Bueso wrote:
> > Wouldn't the cmpxchg alternative also be exposed the locked_vm changing between
> > validating the new value and the cmpxchg() and we'd bogusly fail even when there
> > is still just because the value changed (I'm assuming we don't hold any locks,
> > otherwise all this is pointless).

That's true, I hadn't considered that we could retry even when there's enough
locked_vm.  Seems like another one is that RLIMIT_MEMLOCK could change after
it's read.  I guess nothing's going to be perfect.  :/

> Well it needs a loop..
> 
> again:
>    current_locked = atomic_read(&mm->locked_vm);
>    new_locked = current_locked + npages;
>    if (new_locked < lock_limit)
>       if (cmpxchg(&mm->locked_vm, current_locked, new_locked) != current_locked)
>             goto again;
> 
> So it won't have bogus failures as there is no unwind after
> error. Basically this is a load locked/store conditional style of
> locking pattern.

This is basically what I have so far.

> > > That's a good idea, and especially worth doing considering that an arbitrary
> > > number of threads that charge a low amount of locked_vm can fail just because
> > > one thread charges lots of it.
> > 
> > Yeah but the window for this is quite small, I doubt it would be a real issue.
>
> > What if before doing the atomic_add_return(), we first did the racy new_locked
> > check for ENOMEM, then do the speculative add and cleanup, if necessary. This
> > would further reduce the scope of the window where false ENOMEM can occur.

So the upside of this is that there's no retry loop so tasks don't spin under
heavy contention?  Seems better to always guard against false ENOMEM, at least
from the locked_vm side if not from the rlimit changing.

> > > pinned_vm appears to be broken the same way, so I can fix it too unless someone
> > > beats me to it.
> > 
> > This should not be a surprise for the rdma folks. Cc'ing Jason nonetheless.
> 
> I think we accepted this tiny race as a side effect of removing the
> lock, which was very beneficial. Really the time window between the
> atomic failing and unwind is very small, and there are enough other
> ways a hostile user could DOS locked_vm that I don't think it really
> matters in practice..
> 
> However, the cmpxchg seems better, so a helper to implement that would
> probably be the best thing to do.

I've collapsed all the locked_vm users into such a helper and am now working on
converting the pinned_vm users to the same helper.  Taking longer than I
thought.
diff mbox series

Patch

diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c
index 8038ac24a312..a4ef22b67c07 100644
--- a/arch/powerpc/mm/mmu_context_iommu.c
+++ b/arch/powerpc/mm/mmu_context_iommu.c
@@ -54,34 +54,29 @@  struct mm_iommu_table_group_mem_t {
 static long mm_iommu_adjust_locked_vm(struct mm_struct *mm,
 		unsigned long npages, bool incr)
 {
-	long ret = 0, locked, lock_limit;
+	long ret = 0;
+	unsigned long lock_limit;
 	s64 locked_vm;
 
 	if (!npages)
 		return 0;
 
-	down_write(&mm->mmap_sem);
-	locked_vm = atomic64_read(&mm->locked_vm);
 	if (incr) {
-		locked = locked_vm + npages;
 		lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+		locked_vm = atomic64_add_return(npages, &mm->locked_vm);
+		if (locked_vm > lock_limit && !capable(CAP_IPC_LOCK)) {
 			ret = -ENOMEM;
-		else
-			atomic64_add(npages, &mm->locked_vm);
+			atomic64_sub(npages, &mm->locked_vm);
+		}
 	} else {
-		if (WARN_ON_ONCE(npages > locked_vm))
-			npages = locked_vm;
-		atomic64_sub(npages, &mm->locked_vm);
+		locked_vm = atomic64_sub_return(npages, &mm->locked_vm);
+		WARN_ON_ONCE(locked_vm < 0);
 	}
 
-	pr_debug("[%d] RLIMIT_MEMLOCK HASH64 %c%ld %ld/%ld\n",
-			current ? current->pid : 0,
-			incr ? '+' : '-',
-			npages << PAGE_SHIFT,
-			atomic64_read(&mm->locked_vm) << PAGE_SHIFT,
+	pr_debug("[%d] RLIMIT_MEMLOCK HASH64 %c%lu %lld/%lu\n",
+			current ? current->pid : 0, incr ? '+' : '-',
+			npages << PAGE_SHIFT, locked_vm << PAGE_SHIFT,
 			rlimit(RLIMIT_MEMLOCK));
-	up_write(&mm->mmap_sem);
 
 	return ret;
 }