diff mbox

mm: mmu_notifier: re-fix freed page still mapped in secondary MMU

Message ID 516CF235.4060103@linux.vnet.ibm.com (mailing list archive)
State New, archived
Headers show

Commit Message

Xiao Guangrong April 16, 2013, 6:39 a.m. UTC
The commit 751efd8610d3 (mmu_notifier_unregister NULL Pointer deref
and multiple ->release()) breaks the fix:
    3ad3d901bbcfb15a5e4690e55350db0899095a68
    (mm: mmu_notifier: fix freed page still mapped in secondary MMU)

This patch reverts the commit and simply fixes the bug spotted
by that patch

This bug is spotted by commit 751efd8610d3:

Comments

Robin Holt April 16, 2013, 9:31 a.m. UTC | #1
On Tue, Apr 16, 2013 at 02:39:49PM +0800, Xiao Guangrong wrote:
> The commit 751efd8610d3 (mmu_notifier_unregister NULL Pointer deref
> and multiple ->release()) breaks the fix:
>     3ad3d901bbcfb15a5e4690e55350db0899095a68
>     (mm: mmu_notifier: fix freed page still mapped in secondary MMU)

Can you describe how the page is still mapped?  I thought I had all
cases covered.  Whichever call hits first, I thought we had one callout
to the registered notifiers.  Are you saying we need multiple callouts?

Also, shouldn't you be asking for a revert commit and then supply a
subsequent commit for the real fix?  I thought that was the process for
doing a revert.

Thanks,
Robin Holt

> 
> This patch reverts the commit and simply fix the bug spotted
> by that patch
> 
> This bug is spotted by commit 751efd8610d3:
> ======
> There is a race condition between mmu_notifier_unregister() and
> __mmu_notifier_release().
> 
> Assume two tasks, one calling mmu_notifier_unregister() as a result of a
> filp_close() ->flush() callout (task A), and the other calling
> mmu_notifier_release() from an mmput() (task B).
> 
>                     A                               B
> t1                                              srcu_read_lock()
> t2              if (!hlist_unhashed())
> t3                                              srcu_read_unlock()
> t4              srcu_read_lock()
> t5                                              hlist_del_init_rcu()
> t6                                              synchronize_srcu()
> t7              srcu_read_unlock()
> t8              hlist_del_rcu()  <--- NULL pointer deref.
> ======
> 
> This can be fixed by using hlist_del_init_rcu instead of hlist_del_rcu.
> 
> The another issue spotted in the commit is
> "multiple ->release() callouts", we needn't care it too much because
> it is really rare (e.g, can not happen on kvm since mmu-notify is unregistered
> after exit_mmap()) and the later call of multiple ->release should be
> fast since all the pages have already been released by the first call.
> 
> Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
> ---
>  mm/mmu_notifier.c |   81 +++++++++++++++++++++++++++--------------------------
>  1 files changed, 41 insertions(+), 40 deletions(-)
> 
> diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
> index be04122..606777a 100644
> --- a/mm/mmu_notifier.c
> +++ b/mm/mmu_notifier.c
> @@ -40,48 +40,45 @@ void __mmu_notifier_release(struct mm_struct *mm)
>  	int id;
> 
>  	/*
> -	 * srcu_read_lock() here will block synchronize_srcu() in
> -	 * mmu_notifier_unregister() until all registered
> -	 * ->release() callouts this function makes have
> -	 * returned.
> +	 * SRCU here will block mmu_notifier_unregister until
> +	 * ->release returns.
>  	 */
>  	id = srcu_read_lock(&srcu);
> +	hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist)
> +		/*
> +		 * if ->release runs before mmu_notifier_unregister it
> +		 * must be handled as it's the only way for the driver
> +		 * to flush all existing sptes and stop the driver
> +		 * from establishing any more sptes before all the
> +		 * pages in the mm are freed.
> +		 */
> +		if (mn->ops->release)
> +			mn->ops->release(mn, mm);
> +	srcu_read_unlock(&srcu, id);
> +
>  	spin_lock(&mm->mmu_notifier_mm->lock);
>  	while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
>  		mn = hlist_entry(mm->mmu_notifier_mm->list.first,
>  				 struct mmu_notifier,
>  				 hlist);
> -
>  		/*
> -		 * Unlink.  This will prevent mmu_notifier_unregister()
> -		 * from also making the ->release() callout.
> +		 * We arrived before mmu_notifier_unregister so
> +		 * mmu_notifier_unregister will do nothing other than
> +		 * to wait ->release to finish and
> +		 * mmu_notifier_unregister to return.
>  		 */
>  		hlist_del_init_rcu(&mn->hlist);
> -		spin_unlock(&mm->mmu_notifier_mm->lock);
> -
> -		/*
> -		 * Clear sptes. (see 'release' description in mmu_notifier.h)
> -		 */
> -		if (mn->ops->release)
> -			mn->ops->release(mn, mm);
> -
> -		spin_lock(&mm->mmu_notifier_mm->lock);
>  	}
>  	spin_unlock(&mm->mmu_notifier_mm->lock);
> 
>  	/*
> -	 * All callouts to ->release() which we have done are complete.
> -	 * Allow synchronize_srcu() in mmu_notifier_unregister() to complete
> -	 */
> -	srcu_read_unlock(&srcu, id);
> -
> -	/*
> -	 * mmu_notifier_unregister() may have unlinked a notifier and may
> -	 * still be calling out to it.	Additionally, other notifiers
> -	 * may have been active via vmtruncate() et. al. Block here
> -	 * to ensure that all notifier callouts for this mm have been
> -	 * completed and the sptes are really cleaned up before returning
> -	 * to exit_mmap().
> +	 * synchronize_srcu here prevents mmu_notifier_release to
> +	 * return to exit_mmap (which would proceed freeing all pages
> +	 * in the mm) until the ->release method returns, if it was
> +	 * invoked by mmu_notifier_unregister.
> +	 *
> +	 * The mmu_notifier_mm can't go away from under us because one
> +	 * mm_count is hold by exit_mmap.
>  	 */
>  	synchronize_srcu(&srcu);
>  }
> @@ -292,31 +289,35 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
>  {
>  	BUG_ON(atomic_read(&mm->mm_count) <= 0);
> 
> -	spin_lock(&mm->mmu_notifier_mm->lock);
>  	if (!hlist_unhashed(&mn->hlist)) {
> +		/*
> +		 * SRCU here will force exit_mmap to wait ->release to finish
> +		 * before freeing the pages.
> +		 */
>  		int id;
> 
> +		id = srcu_read_lock(&srcu);
>  		/*
> -		 * Ensure we synchronize up with __mmu_notifier_release().
> +		 * exit_mmap will block in mmu_notifier_release to
> +		 * guarantee ->release is called before freeing the
> +		 * pages.
>  		 */
> -		id = srcu_read_lock(&srcu);
> -
> -		hlist_del_rcu(&mn->hlist);
> -		spin_unlock(&mm->mmu_notifier_mm->lock);
> -
>  		if (mn->ops->release)
>  			mn->ops->release(mn, mm);
> +		srcu_read_unlock(&srcu, id);
> 
> +		spin_lock(&mm->mmu_notifier_mm->lock);
>  		/*
> -		 * Allow __mmu_notifier_release() to complete.
> +		 * Can not use list_del_rcu() since __mmu_notifier_release
> +		 * can delete it before we hold the lock.
>  		 */
> -		srcu_read_unlock(&srcu, id);
> -	} else
> +		hlist_del_init_rcu(&mn->hlist);
>  		spin_unlock(&mm->mmu_notifier_mm->lock);
> +	}
> 
>  	/*
> -	 * Wait for any running method to finish, including ->release() if it
> -	 * was run by __mmu_notifier_release() instead of us.
> +	 * Wait any running method to finish, of course including
> +	 * ->release if it was run by mmu_notifier_relase instead of us.
>  	 */
>  	synchronize_srcu(&srcu);
> 
> -- 
> 1.7.7.6
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Xiao Guangrong April 16, 2013, 10:26 a.m. UTC | #2
On 04/16/2013 05:31 PM, Robin Holt wrote:
> On Tue, Apr 16, 2013 at 02:39:49PM +0800, Xiao Guangrong wrote:
>> The commit 751efd8610d3 (mmu_notifier_unregister NULL Pointer deref
>> and multiple ->release()) breaks the fix:
>>     3ad3d901bbcfb15a5e4690e55350db0899095a68
>>     (mm: mmu_notifier: fix freed page still mapped in secondary MMU)
> 
> Can you describe how the page is still mapped?  I thought I had all
> cases covered.  Whichever call hits first, I thought we had one callout
> to the registered notifiers.  Are you saying we need multiple callouts?

No.

Your patch did this:

                hlist_del_init_rcu(&mn->hlist);    1 <======
+               spin_unlock(&mm->mmu_notifier_mm->lock);
+
+               /*
+                * Clear sptes. (see 'release' description in mmu_notifier.h)
+                */
+               if (mn->ops->release)
+                       mn->ops->release(mn, mm);    2 <======
+
+               spin_lock(&mm->mmu_notifier_mm->lock);

At point 1, you delete the notifier, but the page is still on the LRU. Another
cpu can reclaim the page, but without calling ->invalidate_page().

At point 2, you call ->release(); the secondary MMU makes the page Accessed/Dirty,
but that page has already been put on the free-list of the page allocator.

> 
> Also, shouldn't you be asking for a revert commit and then supply a
> subsequent commit for the real fix?  I thought that was the process for
> doing a revert.

Cannot do a pure reversion, since your patch moved hlist_for_each_entry_rcu,
which has been modified since then.

Should I do a pure reversion + hlist_for_each_entry_rcu update first?

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Robin Holt April 16, 2013, 11:25 a.m. UTC | #3
On Tue, Apr 16, 2013 at 06:26:36PM +0800, Xiao Guangrong wrote:
> On 04/16/2013 05:31 PM, Robin Holt wrote:
> > On Tue, Apr 16, 2013 at 02:39:49PM +0800, Xiao Guangrong wrote:
> >> The commit 751efd8610d3 (mmu_notifier_unregister NULL Pointer deref
> >> and multiple ->release()) breaks the fix:
> >>     3ad3d901bbcfb15a5e4690e55350db0899095a68
> >>     (mm: mmu_notifier: fix freed page still mapped in secondary MMU)
> > 
> > Can you describe how the page is still mapped?  I thought I had all
> > cases covered.  Whichever call hits first, I thought we had one callout
> > to the registered notifiers.  Are you saying we need multiple callouts?
> 
> No.
> 
> You patch did this:
> 
>                 hlist_del_init_rcu(&mn->hlist);    1 <======
> +               spin_unlock(&mm->mmu_notifier_mm->lock);
> +
> +               /*
> +                * Clear sptes. (see 'release' description in mmu_notifier.h)
> +                */
> +               if (mn->ops->release)
> +                       mn->ops->release(mn, mm);    2 <======
> +
> +               spin_lock(&mm->mmu_notifier_mm->lock);
> 
> At point 1, you delete the notify, but the page is still on LRU. Other
> cpu can reclaim the page but without call ->invalid_page().
> 
> At point 2, you call ->release(), the secondary MMU make page Accessed/Dirty
> but that page has already been on the free-list of page-alloctor.

That expectation on srcu _REALLY_ needs to be documented better.
Maybe I missed it in the comments, but there is an expectation beyond
the synchronize_srcu().  This code has been extremely poorly described
and I think it is time we fix that up.

I do see that in comments for mmu_notifier_unregister, there is an
expectation upon already having all the spte's removed prior to making
this call.  I think that is also a stale comment as it mentions a lock
which I am not sure ever really existed.

> > Also, shouldn't you be asking for a revert commit and then supply a
> > subsequent commit for the real fix?  I thought that was the process for
> > doing a revert.
> 
> Can not do that pure reversion since your patch moved hlist_for_each_entry_rcu
> which has been modified now.
> 
> Should i do pure-eversion + hlist_for_each_entry_rcu update first?

Let's not go off without considering this first.

It looks like what we really need to do is ensure there is a method
for ensuring that the mmu_notifier remains on the list while callouts
invalidate_page() callouts are being made and also a means of ensuring
that only one ->release() callout is made.

First, is it the case that when kvm calls mmu_notifier_unregister(),
it has already cleared the spte's?  (what does spte stand for anyway)?
If so, then we really need to close the hole in __mmu_notifier_release().
I think we would need to modify code in both _unregister and _release,
but the issue is really _release.


I need to get ready and drive into work.  If you want to float something
out there, that is fine.  Otherwise, I will try to work something up
when I get to the office.

Thanks,
Robin
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Robin Holt April 16, 2013, 11:43 a.m. UTC | #4
Argh.  Taking a step back helped clear my head.

For the -stable releases, I agree we should just go with your
revert-plus-hlist_del_init_rcu patch.  I will give it a test
when I am in the office.

For the v3.10 release, we should work on making this more
correct and completely documented.

Robin

On Tue, Apr 16, 2013 at 06:25:53AM -0500, Robin Holt wrote:
> On Tue, Apr 16, 2013 at 06:26:36PM +0800, Xiao Guangrong wrote:
> > On 04/16/2013 05:31 PM, Robin Holt wrote:
> > > On Tue, Apr 16, 2013 at 02:39:49PM +0800, Xiao Guangrong wrote:
> > >> The commit 751efd8610d3 (mmu_notifier_unregister NULL Pointer deref
> > >> and multiple ->release()) breaks the fix:
> > >>     3ad3d901bbcfb15a5e4690e55350db0899095a68
> > >>     (mm: mmu_notifier: fix freed page still mapped in secondary MMU)
> > > 
> > > Can you describe how the page is still mapped?  I thought I had all
> > > cases covered.  Whichever call hits first, I thought we had one callout
> > > to the registered notifiers.  Are you saying we need multiple callouts?
> > 
> > No.
> > 
> > You patch did this:
> > 
> >                 hlist_del_init_rcu(&mn->hlist);    1 <======
> > +               spin_unlock(&mm->mmu_notifier_mm->lock);
> > +
> > +               /*
> > +                * Clear sptes. (see 'release' description in mmu_notifier.h)
> > +                */
> > +               if (mn->ops->release)
> > +                       mn->ops->release(mn, mm);    2 <======
> > +
> > +               spin_lock(&mm->mmu_notifier_mm->lock);
> > 
> > At point 1, you delete the notify, but the page is still on LRU. Other
> > cpu can reclaim the page but without call ->invalid_page().
> > 
> > At point 2, you call ->release(), the secondary MMU make page Accessed/Dirty
> > but that page has already been on the free-list of page-alloctor.
> 
> That expectation on srcu _REALLY_ needs to be documented better.
> Maybe I missed it in the comments, but there is an expectation beyond
> the synchronize_srcu().  This code has been extremely poorly described
> and I think it is time we fix that up.
> 
> I do see that in comments for mmu_notifier_unregister, there is an
> expectation upon already having all the spte's removed prior to making
> this call.  I think that is also a stale comment as it mentions a lock
> which I am not sure ever really existed.
> 
> > > Also, shouldn't you be asking for a revert commit and then supply a
> > > subsequent commit for the real fix?  I thought that was the process for
> > > doing a revert.
> > 
> > Can not do that pure reversion since your patch moved hlist_for_each_entry_rcu
> > which has been modified now.
> > 
> > Should i do pure-eversion + hlist_for_each_entry_rcu update first?
> 
> Let's not go off without considering this first.
> 
> It looks like what we really need to do is ensure there is a method
> for ensuring that the mmu_notifier remains on the list while callouts
> invalidate_page() callouts are being made and also a means of ensuring
> that only one ->release() callout is made.
> 
> First, is it the case that when kvm calls mmu_notifier_unregister(),
> it has already cleared the spte's?  (what does spte stand for anyway)?
> If so, then we really need to close the hole in __mmu_notifier_release().
> I think we would need to modify code in both _unregister and _release,
> but the issue is really _release.
> 
> 
> I need to get ready and drive into work.  If you want to float something
> out there, that is fine.  Otherwise, I will try to work something up
> when I get to the office.
> 
> Thanks,
> Robin
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Xiao Guangrong April 16, 2013, 1:07 p.m. UTC | #5
On 04/16/2013 07:43 PM, Robin Holt wrote:
> Argh.  Taking a step back helped clear my head.
> 
> For the -stable releases, I agree we should just go with your
> revert-plus-hlist_del_init_rcu patch.  I will give it a test
> when I am in the office.

Okay, I will wait for your test report. Thank you in advance.

> 
> For the v3.10 release, we should work on making this more
> correct and completely documented.

Better documentation is always welcome.

Calling ->release twice is not bad, as I mentioned in the changelog:

it is really rare (e.g, can not happen on kvm since mmu-notify is unregistered
after exit_mmap()) and the later call of multiple ->release should be
fast since all the pages have already been released by the first call.

But, of course, it's great if you have a _light_ way to avoid this.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Simon Jeons April 17, 2013, 11:38 p.m. UTC | #6
Hi Robin,
On 04/16/2013 05:31 PM, Robin Holt wrote:
> On Tue, Apr 16, 2013 at 02:39:49PM +0800, Xiao Guangrong wrote:
>> The commit 751efd8610d3 (mmu_notifier_unregister NULL Pointer deref
>> and multiple ->release()) breaks the fix:
>>      3ad3d901bbcfb15a5e4690e55350db0899095a68
>>      (mm: mmu_notifier: fix freed page still mapped in secondary MMU)
> Can you describe how the page is still mapped?  I thought I had all
> cases covered.  Whichever call hits first, I thought we had one callout
> to the registered notifiers.  Are you saying we need multiple callouts?
>
> Also, shouldn't you be asking for a revert commit and then supply a
> subsequent commit for the real fix?  I thought that was the process for
> doing a revert.

mmu_notifier is used for sync normal pte and spte, correct?

>
> Thanks,
> Robin Holt
>
>> This patch reverts the commit and simply fix the bug spotted
>> by that patch
>>
>> This bug is spotted by commit 751efd8610d3:
>> ======
>> There is a race condition between mmu_notifier_unregister() and
>> __mmu_notifier_release().
>>
>> Assume two tasks, one calling mmu_notifier_unregister() as a result of a
>> filp_close() ->flush() callout (task A), and the other calling
>> mmu_notifier_release() from an mmput() (task B).
>>
>>                      A                               B
>> t1                                              srcu_read_lock()
>> t2              if (!hlist_unhashed())
>> t3                                              srcu_read_unlock()
>> t4              srcu_read_lock()
>> t5                                              hlist_del_init_rcu()
>> t6                                              synchronize_srcu()
>> t7              srcu_read_unlock()
>> t8              hlist_del_rcu()  <--- NULL pointer deref.
>> ======
>>
>> This can be fixed by using hlist_del_init_rcu instead of hlist_del_rcu.
>>
>> The another issue spotted in the commit is
>> "multiple ->release() callouts", we needn't care it too much because
>> it is really rare (e.g, can not happen on kvm since mmu-notify is unregistered
>> after exit_mmap()) and the later call of multiple ->release should be
>> fast since all the pages have already been released by the first call.
>>
>> Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
>> ---
>>   mm/mmu_notifier.c |   81 +++++++++++++++++++++++++++--------------------------
>>   1 files changed, 41 insertions(+), 40 deletions(-)
>>
>> diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
>> index be04122..606777a 100644
>> --- a/mm/mmu_notifier.c
>> +++ b/mm/mmu_notifier.c
>> @@ -40,48 +40,45 @@ void __mmu_notifier_release(struct mm_struct *mm)
>>   	int id;
>>
>>   	/*
>> -	 * srcu_read_lock() here will block synchronize_srcu() in
>> -	 * mmu_notifier_unregister() until all registered
>> -	 * ->release() callouts this function makes have
>> -	 * returned.
>> +	 * SRCU here will block mmu_notifier_unregister until
>> +	 * ->release returns.
>>   	 */
>>   	id = srcu_read_lock(&srcu);
>> +	hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist)
>> +		/*
>> +		 * if ->release runs before mmu_notifier_unregister it
>> +		 * must be handled as it's the only way for the driver
>> +		 * to flush all existing sptes and stop the driver
>> +		 * from establishing any more sptes before all the
>> +		 * pages in the mm are freed.
>> +		 */
>> +		if (mn->ops->release)
>> +			mn->ops->release(mn, mm);
>> +	srcu_read_unlock(&srcu, id);
>> +
>>   	spin_lock(&mm->mmu_notifier_mm->lock);
>>   	while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
>>   		mn = hlist_entry(mm->mmu_notifier_mm->list.first,
>>   				 struct mmu_notifier,
>>   				 hlist);
>> -
>>   		/*
>> -		 * Unlink.  This will prevent mmu_notifier_unregister()
>> -		 * from also making the ->release() callout.
>> +		 * We arrived before mmu_notifier_unregister so
>> +		 * mmu_notifier_unregister will do nothing other than
>> +		 * to wait ->release to finish and
>> +		 * mmu_notifier_unregister to return.
>>   		 */
>>   		hlist_del_init_rcu(&mn->hlist);
>> -		spin_unlock(&mm->mmu_notifier_mm->lock);
>> -
>> -		/*
>> -		 * Clear sptes. (see 'release' description in mmu_notifier.h)
>> -		 */
>> -		if (mn->ops->release)
>> -			mn->ops->release(mn, mm);
>> -
>> -		spin_lock(&mm->mmu_notifier_mm->lock);
>>   	}
>>   	spin_unlock(&mm->mmu_notifier_mm->lock);
>>
>>   	/*
>> -	 * All callouts to ->release() which we have done are complete.
>> -	 * Allow synchronize_srcu() in mmu_notifier_unregister() to complete
>> -	 */
>> -	srcu_read_unlock(&srcu, id);
>> -
>> -	/*
>> -	 * mmu_notifier_unregister() may have unlinked a notifier and may
>> -	 * still be calling out to it.	Additionally, other notifiers
>> -	 * may have been active via vmtruncate() et. al. Block here
>> -	 * to ensure that all notifier callouts for this mm have been
>> -	 * completed and the sptes are really cleaned up before returning
>> -	 * to exit_mmap().
>> +	 * synchronize_srcu here prevents mmu_notifier_release to
>> +	 * return to exit_mmap (which would proceed freeing all pages
>> +	 * in the mm) until the ->release method returns, if it was
>> +	 * invoked by mmu_notifier_unregister.
>> +	 *
>> +	 * The mmu_notifier_mm can't go away from under us because one
>> +	 * mm_count is hold by exit_mmap.
>>   	 */
>>   	synchronize_srcu(&srcu);
>>   }
>> @@ -292,31 +289,35 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
>>   {
>>   	BUG_ON(atomic_read(&mm->mm_count) <= 0);
>>
>> -	spin_lock(&mm->mmu_notifier_mm->lock);
>>   	if (!hlist_unhashed(&mn->hlist)) {
>> +		/*
>> +		 * SRCU here will force exit_mmap to wait ->release to finish
>> +		 * before freeing the pages.
>> +		 */
>>   		int id;
>>
>> +		id = srcu_read_lock(&srcu);
>>   		/*
>> -		 * Ensure we synchronize up with __mmu_notifier_release().
>> +		 * exit_mmap will block in mmu_notifier_release to
>> +		 * guarantee ->release is called before freeing the
>> +		 * pages.
>>   		 */
>> -		id = srcu_read_lock(&srcu);
>> -
>> -		hlist_del_rcu(&mn->hlist);
>> -		spin_unlock(&mm->mmu_notifier_mm->lock);
>> -
>>   		if (mn->ops->release)
>>   			mn->ops->release(mn, mm);
>> +		srcu_read_unlock(&srcu, id);
>>
>> +		spin_lock(&mm->mmu_notifier_mm->lock);
>>   		/*
>> -		 * Allow __mmu_notifier_release() to complete.
>> +		 * Can not use list_del_rcu() since __mmu_notifier_release
>> +		 * can delete it before we hold the lock.
>>   		 */
>> -		srcu_read_unlock(&srcu, id);
>> -	} else
>> +		hlist_del_init_rcu(&mn->hlist);
>>   		spin_unlock(&mm->mmu_notifier_mm->lock);
>> +	}
>>
>>   	/*
>> -	 * Wait for any running method to finish, including ->release() if it
>> -	 * was run by __mmu_notifier_release() instead of us.
>> +	 * Wait any running method to finish, of course including
>> +	 * ->release if it was run by mmu_notifier_relase instead of us.
>>   	 */
>>   	synchronize_srcu(&srcu);
>>
>> -- 
>> 1.7.7.6
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

======
There is a race condition between mmu_notifier_unregister() and
__mmu_notifier_release().

Assume two tasks, one calling mmu_notifier_unregister() as a result of a
filp_close() ->flush() callout (task A), and the other calling
mmu_notifier_release() from an mmput() (task B).

                    A                               B
t1                                              srcu_read_lock()
t2              if (!hlist_unhashed())
t3                                              srcu_read_unlock()
t4              srcu_read_lock()
t5                                              hlist_del_init_rcu()
t6                                              synchronize_srcu()
t7              srcu_read_unlock()
t8              hlist_del_rcu()  <--- NULL pointer deref.
======

This can be fixed by using hlist_del_init_rcu instead of hlist_del_rcu.

Another issue spotted in the commit is
"multiple ->release() callouts"; we needn't care about it too much because
it is really rare (e.g., it cannot happen on kvm since the mmu notifier is
unregistered after exit_mmap()) and a later duplicate call of ->release should
be fast since all the pages have already been released by the first call.

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
---
 mm/mmu_notifier.c |   81 +++++++++++++++++++++++++++--------------------------
 1 files changed, 41 insertions(+), 40 deletions(-)

diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index be04122..606777a 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -40,48 +40,45 @@  void __mmu_notifier_release(struct mm_struct *mm)
 	int id;

 	/*
-	 * srcu_read_lock() here will block synchronize_srcu() in
-	 * mmu_notifier_unregister() until all registered
-	 * ->release() callouts this function makes have
-	 * returned.
+	 * SRCU here will block mmu_notifier_unregister until
+	 * ->release returns.
 	 */
 	id = srcu_read_lock(&srcu);
+	hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist)
+		/*
+		 * if ->release runs before mmu_notifier_unregister it
+		 * must be handled as it's the only way for the driver
+		 * to flush all existing sptes and stop the driver
+		 * from establishing any more sptes before all the
+		 * pages in the mm are freed.
+		 */
+		if (mn->ops->release)
+			mn->ops->release(mn, mm);
+	srcu_read_unlock(&srcu, id);
+
 	spin_lock(&mm->mmu_notifier_mm->lock);
 	while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
 		mn = hlist_entry(mm->mmu_notifier_mm->list.first,
 				 struct mmu_notifier,
 				 hlist);
-
 		/*
-		 * Unlink.  This will prevent mmu_notifier_unregister()
-		 * from also making the ->release() callout.
+		 * We arrived before mmu_notifier_unregister so
+		 * mmu_notifier_unregister will do nothing other than
+		 * to wait ->release to finish and
+		 * mmu_notifier_unregister to return.
 		 */
 		hlist_del_init_rcu(&mn->hlist);
-		spin_unlock(&mm->mmu_notifier_mm->lock);
-
-		/*
-		 * Clear sptes. (see 'release' description in mmu_notifier.h)
-		 */
-		if (mn->ops->release)
-			mn->ops->release(mn, mm);
-
-		spin_lock(&mm->mmu_notifier_mm->lock);
 	}
 	spin_unlock(&mm->mmu_notifier_mm->lock);

 	/*
-	 * All callouts to ->release() which we have done are complete.
-	 * Allow synchronize_srcu() in mmu_notifier_unregister() to complete
-	 */
-	srcu_read_unlock(&srcu, id);
-
-	/*
-	 * mmu_notifier_unregister() may have unlinked a notifier and may
-	 * still be calling out to it.	Additionally, other notifiers
-	 * may have been active via vmtruncate() et. al. Block here
-	 * to ensure that all notifier callouts for this mm have been
-	 * completed and the sptes are really cleaned up before returning
-	 * to exit_mmap().
+	 * synchronize_srcu here prevents mmu_notifier_release to
+	 * return to exit_mmap (which would proceed freeing all pages
+	 * in the mm) until the ->release method returns, if it was
+	 * invoked by mmu_notifier_unregister.
+	 *
+	 * The mmu_notifier_mm can't go away from under us because one
+	 * mm_count is hold by exit_mmap.
 	 */
 	synchronize_srcu(&srcu);
 }
@@ -292,31 +289,35 @@  void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
 {
 	BUG_ON(atomic_read(&mm->mm_count) <= 0);

-	spin_lock(&mm->mmu_notifier_mm->lock);
 	if (!hlist_unhashed(&mn->hlist)) {
+		/*
+		 * SRCU here will force exit_mmap to wait ->release to finish
+		 * before freeing the pages.
+		 */
 		int id;

+		id = srcu_read_lock(&srcu);
 		/*
-		 * Ensure we synchronize up with __mmu_notifier_release().
+		 * exit_mmap will block in mmu_notifier_release to
+		 * guarantee ->release is called before freeing the
+		 * pages.
 		 */
-		id = srcu_read_lock(&srcu);
-
-		hlist_del_rcu(&mn->hlist);
-		spin_unlock(&mm->mmu_notifier_mm->lock);
-
 		if (mn->ops->release)
 			mn->ops->release(mn, mm);
+		srcu_read_unlock(&srcu, id);

+		spin_lock(&mm->mmu_notifier_mm->lock);
 		/*
-		 * Allow __mmu_notifier_release() to complete.
+		 * Can not use list_del_rcu() since __mmu_notifier_release
+		 * can delete it before we hold the lock.
 		 */
-		srcu_read_unlock(&srcu, id);
-	} else
+		hlist_del_init_rcu(&mn->hlist);
 		spin_unlock(&mm->mmu_notifier_mm->lock);
+	}

 	/*
-	 * Wait for any running method to finish, including ->release() if it
-	 * was run by __mmu_notifier_release() instead of us.
+	 * Wait any running method to finish, of course including
+	 * ->release if it was run by mmu_notifier_release instead of us.
 	 */
 	synchronize_srcu(&srcu);