[v2,hmm,01/11] mm/hmm: fix use after free with struct hmm in the mmu notifiers

Message ID 20190606184438.31646-2-jgg@ziepe.ca (mailing list archive)
State New, archived
Series Various revisions from a locking/code review

Commit Message

Jason Gunthorpe June 6, 2019, 6:44 p.m. UTC
From: Jason Gunthorpe <jgg@mellanox.com>

mmu_notifier_unregister_no_release() is not a fence and the mmu_notifier
system will continue to reference hmm->mn until the srcu grace period
expires.

This results in use-after-free races like this:

         CPU0                                     CPU1
                                               __mmu_notifier_invalidate_range_start()
                                                 srcu_read_lock
                                                 hlist_for_each ()
                                                   // mn == hmm->mn
hmm_mirror_unregister()
  hmm_put()
    hmm_free()
      mmu_notifier_unregister_no_release()
         hlist_del_init_rcu(hmm->mn->list)
			                           mn->ops->invalidate_range_start(mn, range);
					             mm_get_hmm()
      mm->hmm = NULL;
      kfree(hmm)
                                                     mutex_lock(&hmm->lock);

Use SRCU to kfree the hmm memory so that the notifiers can rely on hmm
existing. Get the now-safe hmm struct through container_of and directly
check kref_get_unless_zero to lock it against free.

Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
v2:
- Spell 'free' properly (Jerome/Ralph)
---
 include/linux/hmm.h |  1 +
 mm/hmm.c            | 25 +++++++++++++++++++------
 2 files changed, 20 insertions(+), 6 deletions(-)
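
The lifetime pattern the patch adopts, reduced to a generic sketch
(illustrative names only, assuming a DEFINE_SRCU(my_srcu) shared with
the readers; the hmm code itself uses the mmu_notifier core's SRCU via
mmu_notifier_call_srcu()):

	struct obj {
		struct kref kref;
		struct rcu_head rcu;
	};

	static void obj_free_rcu(struct rcu_head *rcu)
	{
		kfree(container_of(rcu, struct obj, rcu));
	}

	static void obj_release(struct kref *kref)
	{
		struct obj *obj = container_of(kref, struct obj, kref);

		/*
		 * Readers inside a my_srcu read section may still hold a
		 * pointer, so defer the kfree() past the grace period.
		 */
		call_srcu(&my_srcu, &obj->rcu, obj_free_rcu);
	}

Readers then take their reference with kref_get_unless_zero(), which
fails once the count has dropped to zero and the free is in flight.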

Comments

John Hubbard June 7, 2019, 2:29 a.m. UTC | #1
On 6/6/19 11:44 AM, Jason Gunthorpe wrote:
> From: Jason Gunthorpe <jgg@mellanox.com>
...
> diff --git a/mm/hmm.c b/mm/hmm.c
> index 8e7403f081f44a..547002f56a163d 100644
> --- a/mm/hmm.c
> +++ b/mm/hmm.c
...
> @@ -125,7 +130,7 @@ static void hmm_free(struct kref *kref)
>  		mm->hmm = NULL;
>  	spin_unlock(&mm->page_table_lock);
>  
> -	kfree(hmm);
> +	mmu_notifier_call_srcu(&hmm->rcu, hmm_free_rcu);


It occurred to me to wonder if it is best to use the MMU notifier's
instance of srcu, instead of creating a separate instance for HMM.
But this really does seem appropriate, since we are after all using
this to synchronize with MMU notifier callbacks. So, fine.


>  }
>  
>  static inline void hmm_put(struct hmm *hmm)
> @@ -153,10 +158,14 @@ void hmm_mm_destroy(struct mm_struct *mm)
>  
>  static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
>  {
> -	struct hmm *hmm = mm_get_hmm(mm);
> +	struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);
>  	struct hmm_mirror *mirror;
>  	struct hmm_range *range;
>  
> +	/* hmm is in progress to free */

Well, sometimes, yes. :)

Maybe this wording is clearer (if we need any comment at all):

	/* Bail out if hmm is in the process of being freed */

> +	if (!kref_get_unless_zero(&hmm->kref))
> +		return;
> +
>  	/* Report this HMM as dying. */
>  	hmm->dead = true;
>  
> @@ -194,13 +203,15 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
>  static int hmm_invalidate_range_start(struct mmu_notifier *mn,
>  			const struct mmu_notifier_range *nrange)
>  {
> -	struct hmm *hmm = mm_get_hmm(nrange->mm);
> +	struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);
>  	struct hmm_mirror *mirror;
>  	struct hmm_update update;
>  	struct hmm_range *range;
>  	int ret = 0;
>  
> -	VM_BUG_ON(!hmm);
> +	/* hmm is in progress to free */

Same here.

> +	if (!kref_get_unless_zero(&hmm->kref))
> +		return 0;
>  
>  	update.start = nrange->start;
>  	update.end = nrange->end;
> @@ -245,9 +256,11 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn,
>  static void hmm_invalidate_range_end(struct mmu_notifier *mn,
>  			const struct mmu_notifier_range *nrange)
>  {
> -	struct hmm *hmm = mm_get_hmm(nrange->mm);
> +	struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);
>  
> -	VM_BUG_ON(!hmm);
> +	/* hmm is in progress to free */

And here.

> +	if (!kref_get_unless_zero(&hmm->kref))
> +		return;
>  
>  	mutex_lock(&hmm->lock);
>  	hmm->notifiers--;
> 

Elegant fix. Regardless of the above chatter I added, you can add:

    Reviewed-by: John Hubbard <jhubbard@nvidia.com>


thanks,
Jason Gunthorpe June 7, 2019, 12:34 p.m. UTC | #2
On Thu, Jun 06, 2019 at 07:29:08PM -0700, John Hubbard wrote:
> On 6/6/19 11:44 AM, Jason Gunthorpe wrote:
> > From: Jason Gunthorpe <jgg@mellanox.com>
> ...
> > diff --git a/mm/hmm.c b/mm/hmm.c
> > index 8e7403f081f44a..547002f56a163d 100644
> > +++ b/mm/hmm.c
> ...
> > @@ -125,7 +130,7 @@ static void hmm_free(struct kref *kref)
> >  		mm->hmm = NULL;
> >  	spin_unlock(&mm->page_table_lock);
> >  
> > -	kfree(hmm);
> > +	mmu_notifier_call_srcu(&hmm->rcu, hmm_free_rcu);
> 
> 
> It occurred to me to wonder if it is best to use the MMU notifier's
> instance of srcu, instead of creating a separate instance for HMM.

It *has* to be the MMU notifier SRCU because we are synchronizing
against the read side of that SRCU inside the mmu notifier code, ie:

int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
        id = srcu_read_lock(&srcu);
        hlist_for_each_entry_rcu(mn, &range->mm->mmu_notifier_mm->list, hlist) {
                if (mn->ops->invalidate_range_start) {
                   ^^^^^

Here 'mn' is really hmm (hmm = container_of(mn, struct hmm,
mmu_notifier)), so we must protect the memory against free for the mmu
notifier core.

Thus we have no choice but to use its SRCU.

CH also pointed out a more elegant solution, which is to get the write
side of the mmap_sem during hmm_mirror_unregister - no notifier
callback can be running in this case. Then we delete the kref, srcu
and so forth.

This is much clearer/saner/better, but.. requires the callers of
hmm_mirror_unregister to be safe to get the mmap_sem write side.

I think this is true, so maybe this patch should be switched, what do
you think?
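
Roughly, that alternative would look like this (a hypothetical sketch
with illustrative field names; as the follow-up below notes, it only
covers invalidate_start/end, not ->release):

	void hmm_mirror_unregister(struct hmm_mirror *mirror)
	{
		struct hmm *hmm = mirror->hmm;
		struct mm_struct *mm = hmm->mm;

		down_write(&mm->mmap_sem);
		/*
		 * With mmap_sem held for write, no invalidate_range_start/end
		 * callback can be running concurrently.
		 */
		mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm);
		up_write(&mm->mmap_sem);

		/* ... so hmm could be freed directly, without kref or SRCU. */
		kfree(hmm);
	}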

> > @@ -153,10 +158,14 @@ void hmm_mm_destroy(struct mm_struct *mm)
> >  
> >  static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
> >  {
> > -	struct hmm *hmm = mm_get_hmm(mm);
> > +	struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);
> >  	struct hmm_mirror *mirror;
> >  	struct hmm_range *range;
> >  
> > +	/* hmm is in progress to free */
> 
> Well, sometimes, yes. :)

I think it is in all cases actually.. The only way we see a 0 kref
and still reach this code path is if another thread has already set up
the hmm_free in the call_srcu..

> Maybe this wording is clearer (if we need any comment at all):

I always find this hard.. This is a very standard pattern when working
with RCU - however in my experience few people actually know the RCU
patterns, and missing the _unless_zero is a common bug I find when
looking at code.
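
For illustration, the difference in a generic sketch (not the hmm
code):

	/*
	 * Buggy in an RCU/SRCU reader: can resurrect an object whose
	 * refcount already hit zero and whose free is queued behind
	 * the grace period.
	 */
	kref_get(&obj->kref);

	/* Correct: refuse the reference once the count is zero. */
	if (!kref_get_unless_zero(&obj->kref))
		return;	/* free is already in flight */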

This is mm/ so I can drop it, what do you think?

Thanks,
Jason
Jason Gunthorpe June 7, 2019, 1:42 p.m. UTC | #3
On Fri, Jun 07, 2019 at 09:34:32AM -0300, Jason Gunthorpe wrote:

> CH also pointed out a more elegant solution, which is to get the write
> side of the mmap_sem during hmm_mirror_unregister - no notifier
> callback can be running in this case. Then we delete the kref, srcu
> and so forth.

Oops, it turns out this is only the case for invalidate_start/end, not
release, so this doesn't help with the SRCU unless we also change
exit_mmap to call release with the mmap sem held.

So I think we have to stick with this for now.

Jason
Ralph Campbell June 7, 2019, 6:12 p.m. UTC | #4
On 6/6/19 11:44 AM, Jason Gunthorpe wrote:
> From: Jason Gunthorpe <jgg@mellanox.com>
...
> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>

You can add
Reviewed-by: Ralph Campbell <rcampbell@nvidia.com>

John Hubbard June 8, 2019, 1:13 a.m. UTC | #5
On 6/7/19 5:34 AM, Jason Gunthorpe wrote:
> On Thu, Jun 06, 2019 at 07:29:08PM -0700, John Hubbard wrote:
>> On 6/6/19 11:44 AM, Jason Gunthorpe wrote:
>>> From: Jason Gunthorpe <jgg@mellanox.com>
>> ...
>>> diff --git a/mm/hmm.c b/mm/hmm.c
>>> index 8e7403f081f44a..547002f56a163d 100644
>>> +++ b/mm/hmm.c
>> ...
>>> @@ -125,7 +130,7 @@ static void hmm_free(struct kref *kref)
>>>  		mm->hmm = NULL;
>>>  	spin_unlock(&mm->page_table_lock);
>>>  
>>> -	kfree(hmm);
>>> +	mmu_notifier_call_srcu(&hmm->rcu, hmm_free_rcu);
>>
>>
>> It occurred to me to wonder if it is best to use the MMU notifier's
>> instance of srcu, instead of creating a separate instance for HMM.
> 
> It *has* to be the MMU notifier SRCU because we are synchronizing
> against the read side of that SRCU inside the mmu notifier code, ie:
> 
> int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
>         id = srcu_read_lock(&srcu);
>         hlist_for_each_entry_rcu(mn, &range->mm->mmu_notifier_mm->list, hlist) {
>                 if (mn->ops->invalidate_range_start) {
>                    ^^^^^
> 
> Here 'mn' is really hmm (hmm = container_of(mn, struct hmm,
> mmu_notifier)), so we must protect the memory against free for the mmu
> notifier core.
> 
> Thus we have no choice but to use its SRCU.

Ah right. It's embarrassingly obvious when you say it out loud. :)
Thanks for explaining.

> 
> CH also pointed out a more elegant solution, which is to get the write
> side of the mmap_sem during hmm_mirror_unregister - no notifier
> callback can be running in this case. Then we delete the kref, srcu
> and so forth.
> 
> This is much clearer/saner/better, but.. requires the callers of
> hmm_mirror_unregister to be safe to get the mmap_sem write side.
> 
> I think this is true, so maybe this patch should be switched, what do
> you think?

OK, your follow-up notes that we'll leave it as is, got it.


thanks,
John Hubbard June 8, 2019, 1:37 a.m. UTC | #6
On 6/7/19 5:34 AM, Jason Gunthorpe wrote:
> On Thu, Jun 06, 2019 at 07:29:08PM -0700, John Hubbard wrote:
>> On 6/6/19 11:44 AM, Jason Gunthorpe wrote:
>>> From: Jason Gunthorpe <jgg@mellanox.com>
>> ...
>>> @@ -153,10 +158,14 @@ void hmm_mm_destroy(struct mm_struct *mm)
>>>  
>>>  static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
>>>  {
>>> -	struct hmm *hmm = mm_get_hmm(mm);
>>> +	struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);
>>>  	struct hmm_mirror *mirror;
>>>  	struct hmm_range *range;
>>>  
>>> +	/* hmm is in progress to free */
>>
>> Well, sometimes, yes. :)
> 
> I think it is in all cases actually.. The only way we see a 0 kref
> and still reach this code path is if another thread has already set up
> the hmm_free in the call_srcu..
> 
>> Maybe this wording is clearer (if we need any comment at all):
> 
> I always find this hard.. This is a very standard pattern when working
> with RCU - however in my experience few people actually know the RCU
> patterns, and missing the _unless_zero is a common bug I find when
> looking at code.
> 
> This is mm/ so I can drop it, what do you think?
> 

I forgot to respond to this section, so catching up now:

I think we're talking about slightly different things. I was just
noting that the comment above the "if" statement was only accurate
if the branch is taken, which is why I recommended this combination
of comment and code:

	/* Bail out if hmm is in the process of being freed */
	if (!kref_get_unless_zero(&hmm->kref))
		return;

As for the actual _unless_zero part, I think that's good to have.
And it's a good reminder if nothing else, even in mm/ code.

thanks,
Christoph Hellwig June 8, 2019, 8:49 a.m. UTC | #7
I still think struct hmm should die.  We already have a structure used
for additional information for drivers having crazily tight integration
into the VM, and it is called struct mmu_notifier_mm.  We really need
to reuse that instead of duplicating it badly.
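
For reference, the structure Christoph refers to was, at the time,
essentially just this (from include/linux/mmu_notifier.h):

	struct mmu_notifier_mm {
		/* all mmu notifiers registered in this mm are queued in this list */
		struct hlist_head list;
		/* to serialize the list modifications and hlist_unhashed */
		spinlock_t lock;
	};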
Jason Gunthorpe June 8, 2019, 11:33 a.m. UTC | #8
On Sat, Jun 08, 2019 at 01:49:48AM -0700, Christoph Hellwig wrote:
> I still think struct hmm should die.  We already have a structure used
> for additional information for drivers having crazily tight integration
> into the VM, and it is called struct mmu_notifier_mm.  We really need
> to reuse that instead of duplicating it badly.

Probably. But at least in ODP we needed something very similar to
'struct hmm' to make our mmu notifier implementation work.

The mmu notifier api really lends itself to having a per-mm structure
in the driver to hold the 'struct mmu_notifier'..

I think I see other drivers doing things like assuming that there
is only one mm in their world (despite being FD based, so this is not
really guaranteed)

So, my first attempt would be an api something like:

   priv = mmu_notifier_attach_mm(ops, current->mm, sizeof(my_priv))
   mmu_notifier_detach_mm(priv);

 ops->invalidate_start(struct mmu_notifier *mn):
   struct p *priv = mmu_notifier_priv(mn);

Such that
 - There is only one priv per mm
 - All the srcu stuff is handled inside mmu notifier
 - It is reference counted, so ops can be attached multiple times to
   the same mm

Then odp's per_mm, and struct hmm (if we keep it at all) is simply a
'priv' in the above.
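
For concreteness, a driver using that hypothetical API might look like
this (mmu_notifier_attach_mm() and mmu_notifier_priv() are the proposed
names from the sketch above, not existing functions):

	struct my_priv {
		atomic_t invalidations;
	};

	static int my_invalidate_start(struct mmu_notifier *mn,
				const struct mmu_notifier_range *range)
	{
		struct my_priv *priv = mmu_notifier_priv(mn);

		/*
		 * The core would keep priv alive across the callback, so
		 * no kref_get_unless_zero() dance in the driver.
		 */
		atomic_inc(&priv->invalidations);
		return 0;
	}

	static const struct mmu_notifier_ops my_ops = {
		.invalidate_range_start = my_invalidate_start,
	};

	/* on some driver FD open path: */
	priv = mmu_notifier_attach_mm(&my_ops, current->mm, sizeof(*priv));
	if (IS_ERR(priv))
		return PTR_ERR(priv);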

I was thinking of looking at this stuff next, once this series is
done.

Jason

Patch

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 092f0234bfe917..688c5ca7068795 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -102,6 +102,7 @@  struct hmm {
 	struct mmu_notifier	mmu_notifier;
 	struct rw_semaphore	mirrors_sem;
 	wait_queue_head_t	wq;
+	struct rcu_head		rcu;
 	long			notifiers;
 	bool			dead;
 };
diff --git a/mm/hmm.c b/mm/hmm.c
index 8e7403f081f44a..547002f56a163d 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -113,6 +113,11 @@  static struct hmm *hmm_get_or_create(struct mm_struct *mm)
 	return NULL;
 }
 
+static void hmm_free_rcu(struct rcu_head *rcu)
+{
+	kfree(container_of(rcu, struct hmm, rcu));
+}
+
 static void hmm_free(struct kref *kref)
 {
 	struct hmm *hmm = container_of(kref, struct hmm, kref);
@@ -125,7 +130,7 @@  static void hmm_free(struct kref *kref)
 		mm->hmm = NULL;
 	spin_unlock(&mm->page_table_lock);
 
-	kfree(hmm);
+	mmu_notifier_call_srcu(&hmm->rcu, hmm_free_rcu);
 }
 
 static inline void hmm_put(struct hmm *hmm)
@@ -153,10 +158,14 @@  void hmm_mm_destroy(struct mm_struct *mm)
 
 static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
 {
-	struct hmm *hmm = mm_get_hmm(mm);
+	struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);
 	struct hmm_mirror *mirror;
 	struct hmm_range *range;
 
+	/* hmm is in progress to free */
+	if (!kref_get_unless_zero(&hmm->kref))
+		return;
+
 	/* Report this HMM as dying. */
 	hmm->dead = true;
 
@@ -194,13 +203,15 @@  static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
 static int hmm_invalidate_range_start(struct mmu_notifier *mn,
 			const struct mmu_notifier_range *nrange)
 {
-	struct hmm *hmm = mm_get_hmm(nrange->mm);
+	struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);
 	struct hmm_mirror *mirror;
 	struct hmm_update update;
 	struct hmm_range *range;
 	int ret = 0;
 
-	VM_BUG_ON(!hmm);
+	/* hmm is in progress to free */
+	if (!kref_get_unless_zero(&hmm->kref))
+		return 0;
 
 	update.start = nrange->start;
 	update.end = nrange->end;
@@ -245,9 +256,11 @@  static int hmm_invalidate_range_start(struct mmu_notifier *mn,
 static void hmm_invalidate_range_end(struct mmu_notifier *mn,
 			const struct mmu_notifier_range *nrange)
 {
-	struct hmm *hmm = mm_get_hmm(nrange->mm);
+	struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);
 
-	VM_BUG_ON(!hmm);
+	/* hmm is in progress to free */
+	if (!kref_get_unless_zero(&hmm->kref))
+		return;
 
 	mutex_lock(&hmm->lock);
 	hmm->notifiers--;