diff mbox

[v13,11/22] vfio iommu: Add blocking notifier to notify DMA_UNMAP

Message ID 20161115201612.103893d7@t450s.home (mailing list archive)
State New, archived
Headers show

Commit Message

Alex Williamson Nov. 16, 2016, 3:16 a.m. UTC
On Wed, 16 Nov 2016 08:16:15 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> On 11/16/2016 3:49 AM, Alex Williamson wrote:
> > On Tue, 15 Nov 2016 20:59:54 +0530
> > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >   
> ...
> 
> >> @@ -854,7 +857,28 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
> >>  		 */
> >>  		if (dma->task->mm != current->mm)
> >>  			break;
> >> +
> >>  		unmapped += dma->size;
> >> +
> >> +		if (iommu->external_domain && !RB_EMPTY_ROOT(&dma->pfn_list)) {
> >> +			struct vfio_iommu_type1_dma_unmap nb_unmap;
> >> +
> >> +			nb_unmap.iova = dma->iova;
> >> +			nb_unmap.size = dma->size;
> >> +
> >> +			/*
> >> +			 * Notifier callback would call vfio_unpin_pages() which
> >> +			 * would acquire iommu->lock. Release lock here and
> >> +			 * reacquire it again.
> >> +			 */
> >> +			mutex_unlock(&iommu->lock);
> >> +			blocking_notifier_call_chain(&iommu->notifier,
> >> +						    VFIO_IOMMU_NOTIFY_DMA_UNMAP,
> >> +						    &nb_unmap);
> >> +			mutex_lock(&iommu->lock);
> >> +			if (WARN_ON(!RB_EMPTY_ROOT(&dma->pfn_list)))
> >> +				break;
> >> +		}  
> > 
> > 
> > Why exactly do we need to notify per vfio_dma rather than per unmap
> > request?  If we do the latter we can send the notify first, limiting us
> > to races where a page is pinned between the notify and the locking,
> > whereas here, even our dma pointer is suspect once we re-acquire the
> > lock, we don't technically know if another unmap could have removed
> > that already.  Perhaps something like this (untested):
> >   
> 
> There are checks to validate unmap request, like v2 check and who is
> calling unmap and is it allowed for that task to unmap. Before these
> checks its not sure that unmap region range which asked for would be
> unmapped all. Notify call should be at the place where its sure that the
> range provided to notify call is definitely going to be removed. My
> change do that.

Ok, but that does not solve the problem.  What about this (untested):

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Alex Williamson Nov. 16, 2016, 3:25 a.m. UTC | #1
On Tue, 15 Nov 2016 20:16:12 -0700
Alex Williamson <alex.williamson@redhat.com> wrote:

> On Wed, 16 Nov 2016 08:16:15 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
> > On 11/16/2016 3:49 AM, Alex Williamson wrote:  
> > > On Tue, 15 Nov 2016 20:59:54 +0530
> > > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > >     
> > ...
> >   
> > >> @@ -854,7 +857,28 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
> > >>  		 */
> > >>  		if (dma->task->mm != current->mm)
> > >>  			break;
> > >> +
> > >>  		unmapped += dma->size;
> > >> +
> > >> +		if (iommu->external_domain && !RB_EMPTY_ROOT(&dma->pfn_list)) {
> > >> +			struct vfio_iommu_type1_dma_unmap nb_unmap;
> > >> +
> > >> +			nb_unmap.iova = dma->iova;
> > >> +			nb_unmap.size = dma->size;
> > >> +
> > >> +			/*
> > >> +			 * Notifier callback would call vfio_unpin_pages() which
> > >> +			 * would acquire iommu->lock. Release lock here and
> > >> +			 * reacquire it again.
> > >> +			 */
> > >> +			mutex_unlock(&iommu->lock);
> > >> +			blocking_notifier_call_chain(&iommu->notifier,
> > >> +						    VFIO_IOMMU_NOTIFY_DMA_UNMAP,
> > >> +						    &nb_unmap);
> > >> +			mutex_lock(&iommu->lock);
> > >> +			if (WARN_ON(!RB_EMPTY_ROOT(&dma->pfn_list)))
> > >> +				break;
> > >> +		}    
> > > 
> > > 
> > > Why exactly do we need to notify per vfio_dma rather than per unmap
> > > request?  If we do the latter we can send the notify first, limiting us
> > > to races where a page is pinned between the notify and the locking,
> > > whereas here, even our dma pointer is suspect once we re-acquire the
> > > lock, we don't technically know if another unmap could have removed
> > > that already.  Perhaps something like this (untested):
> > >     
> > 
> > There are checks to validate unmap request, like v2 check and who is
> > calling unmap and is it allowed for that task to unmap. Before these
> > checks its not sure that unmap region range which asked for would be
> > unmapped all. Notify call should be at the place where its sure that the
> > range provided to notify call is definitely going to be removed. My
> > change do that.  
> 
> Ok, but that does solve the problem.  What about this (untested):

s/does/does not/

BTW, I like how the retries here fill the gap in my previous proposal
where we could still race re-pinning.  We've given it an honest shot or
someone is not participating if we've retried 10 times.  I don't
understand why the test for iommu->external_domain was there; clearly,
if the list is not empty, we need to notify.  Thanks,

Alex

> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index ee9a680..50cafdf 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -782,9 +782,9 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
>  			     struct vfio_iommu_type1_dma_unmap *unmap)
>  {
>  	uint64_t mask;
> -	struct vfio_dma *dma;
> +	struct vfio_dma *dma, *dma_last = NULL;
>  	size_t unmapped = 0;
> -	int ret = 0;
> +	int ret = 0, retries;
>  
>  	mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
>  
> @@ -794,7 +794,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
>  		return -EINVAL;
>  
>  	WARN_ON(mask & PAGE_MASK);
> -
> +again:
>  	mutex_lock(&iommu->lock);
>  
>  	/*
> @@ -851,11 +851,16 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
>  		if (dma->task->mm != current->mm)
>  			break;
>  
> -		unmapped += dma->size;
> -
> -		if (iommu->external_domain && !RB_EMPTY_ROOT(&dma->pfn_list)) {
> +		if (!RB_EMPTY_ROOT(&dma->pfn_list)) {
>  			struct vfio_iommu_type1_dma_unmap nb_unmap;
>  
> +			if (dma_last == dma) {
> +				BUG_ON(++retries > 10);
> +			} else {
> +				dma_last = dma;
> +				retries = 0;
> +			}
> +
>  			nb_unmap.iova = dma->iova;
>  			nb_unmap.size = dma->size;
>  
> @@ -868,11 +873,11 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
>  			blocking_notifier_call_chain(&iommu->notifier,
>  						    VFIO_IOMMU_NOTIFY_DMA_UNMAP,
>  						    &nb_unmap);
> -			mutex_lock(&iommu->lock);
> -			if (WARN_ON(!RB_EMPTY_ROOT(&dma->pfn_list)))
> -				break;
> +			goto again:
>  		}
> +		unmapped += dma->size;
>  		vfio_remove_dma(iommu, dma);
> +
>  	}
>  
>  unlock:

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Kirti Wankhede Nov. 16, 2016, 3:43 a.m. UTC | #2
On 11/16/2016 8:55 AM, Alex Williamson wrote:
> On Tue, 15 Nov 2016 20:16:12 -0700
> Alex Williamson <alex.williamson@redhat.com> wrote:
> 
>> On Wed, 16 Nov 2016 08:16:15 +0530
>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>>
>>> On 11/16/2016 3:49 AM, Alex Williamson wrote:  
>>>> On Tue, 15 Nov 2016 20:59:54 +0530
>>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>>>>     
>>> ...
>>>   
>>>>> @@ -854,7 +857,28 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
>>>>>  		 */
>>>>>  		if (dma->task->mm != current->mm)
>>>>>  			break;
>>>>> +
>>>>>  		unmapped += dma->size;
>>>>> +
>>>>> +		if (iommu->external_domain && !RB_EMPTY_ROOT(&dma->pfn_list)) {
>>>>> +			struct vfio_iommu_type1_dma_unmap nb_unmap;
>>>>> +
>>>>> +			nb_unmap.iova = dma->iova;
>>>>> +			nb_unmap.size = dma->size;
>>>>> +
>>>>> +			/*
>>>>> +			 * Notifier callback would call vfio_unpin_pages() which
>>>>> +			 * would acquire iommu->lock. Release lock here and
>>>>> +			 * reacquire it again.
>>>>> +			 */
>>>>> +			mutex_unlock(&iommu->lock);
>>>>> +			blocking_notifier_call_chain(&iommu->notifier,
>>>>> +						    VFIO_IOMMU_NOTIFY_DMA_UNMAP,
>>>>> +						    &nb_unmap);
>>>>> +			mutex_lock(&iommu->lock);
>>>>> +			if (WARN_ON(!RB_EMPTY_ROOT(&dma->pfn_list)))
>>>>> +				break;
>>>>> +		}    
>>>>
>>>>
>>>> Why exactly do we need to notify per vfio_dma rather than per unmap
>>>> request?  If we do the latter we can send the notify first, limiting us
>>>> to races where a page is pinned between the notify and the locking,
>>>> whereas here, even our dma pointer is suspect once we re-acquire the
>>>> lock, we don't technically know if another unmap could have removed
>>>> that already.  Perhaps something like this (untested):
>>>>     
>>>
>>> There are checks to validate unmap request, like v2 check and who is
>>> calling unmap and is it allowed for that task to unmap. Before these
>>> checks its not sure that unmap region range which asked for would be
>>> unmapped all. Notify call should be at the place where its sure that the
>>> range provided to notify call is definitely going to be removed. My
>>> change do that.  
>>
>> Ok, but that does solve the problem.  What about this (untested):
> 
> s/does/does not/
> 
> BTW, I like how the retries here fill the gap in my previous proposal
> where we could still race re-pinning.  We've given it an honest shot or
> someone is not participating if we've retried 10 times.  I don't
> understand why the test for iommu->external_domain was there, clearly
> if the list is not empty, we need to notify.  Thanks,
> 

Ok. Retrying is good, as it gives a chance to unpin everything. But is it
really required to use BUG_ON(), which would panic the host? I think
WARN_ON should be fine; then, when the container is closed or when the
last group is removed from the container, vfio_iommu_type1_release() is
called and we have a chance to unpin it all.

Thanks,
Kirti

> Alex
> 
>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
>> index ee9a680..50cafdf 100644
>> --- a/drivers/vfio/vfio_iommu_type1.c
>> +++ b/drivers/vfio/vfio_iommu_type1.c
>> @@ -782,9 +782,9 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
>>  			     struct vfio_iommu_type1_dma_unmap *unmap)
>>  {
>>  	uint64_t mask;
>> -	struct vfio_dma *dma;
>> +	struct vfio_dma *dma, *dma_last = NULL;
>>  	size_t unmapped = 0;
>> -	int ret = 0;
>> +	int ret = 0, retries;
>>  
>>  	mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
>>  
>> @@ -794,7 +794,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
>>  		return -EINVAL;
>>  
>>  	WARN_ON(mask & PAGE_MASK);
>> -
>> +again:
>>  	mutex_lock(&iommu->lock);
>>  
>>  	/*
>> @@ -851,11 +851,16 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
>>  		if (dma->task->mm != current->mm)
>>  			break;
>>  
>> -		unmapped += dma->size;
>> -
>> -		if (iommu->external_domain && !RB_EMPTY_ROOT(&dma->pfn_list)) {
>> +		if (!RB_EMPTY_ROOT(&dma->pfn_list)) {
>>  			struct vfio_iommu_type1_dma_unmap nb_unmap;
>>  
>> +			if (dma_last == dma) {
>> +				BUG_ON(++retries > 10);
>> +			} else {
>> +				dma_last = dma;
>> +				retries = 0;
>> +			}
>> +
>>  			nb_unmap.iova = dma->iova;
>>  			nb_unmap.size = dma->size;
>>  
>> @@ -868,11 +873,11 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
>>  			blocking_notifier_call_chain(&iommu->notifier,
>>  						    VFIO_IOMMU_NOTIFY_DMA_UNMAP,
>>  						    &nb_unmap);
>> -			mutex_lock(&iommu->lock);
>> -			if (WARN_ON(!RB_EMPTY_ROOT(&dma->pfn_list)))
>> -				break;
>> +			goto again:
>>  		}
>> +		unmapped += dma->size;
>>  		vfio_remove_dma(iommu, dma);
>> +
>>  	}
>>  
>>  unlock:
> 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alex Williamson Nov. 16, 2016, 3:58 a.m. UTC | #3
On Wed, 16 Nov 2016 09:13:37 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> On 11/16/2016 8:55 AM, Alex Williamson wrote:
> > On Tue, 15 Nov 2016 20:16:12 -0700
> > Alex Williamson <alex.williamson@redhat.com> wrote:
> >   
> >> On Wed, 16 Nov 2016 08:16:15 +0530
> >> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >>  
> >>> On 11/16/2016 3:49 AM, Alex Williamson wrote:    
> >>>> On Tue, 15 Nov 2016 20:59:54 +0530
> >>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >>>>       
> >>> ...
> >>>     
> >>>>> @@ -854,7 +857,28 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
> >>>>>  		 */
> >>>>>  		if (dma->task->mm != current->mm)
> >>>>>  			break;
> >>>>> +
> >>>>>  		unmapped += dma->size;
> >>>>> +
> >>>>> +		if (iommu->external_domain && !RB_EMPTY_ROOT(&dma->pfn_list)) {
> >>>>> +			struct vfio_iommu_type1_dma_unmap nb_unmap;
> >>>>> +
> >>>>> +			nb_unmap.iova = dma->iova;
> >>>>> +			nb_unmap.size = dma->size;
> >>>>> +
> >>>>> +			/*
> >>>>> +			 * Notifier callback would call vfio_unpin_pages() which
> >>>>> +			 * would acquire iommu->lock. Release lock here and
> >>>>> +			 * reacquire it again.
> >>>>> +			 */
> >>>>> +			mutex_unlock(&iommu->lock);
> >>>>> +			blocking_notifier_call_chain(&iommu->notifier,
> >>>>> +						    VFIO_IOMMU_NOTIFY_DMA_UNMAP,
> >>>>> +						    &nb_unmap);
> >>>>> +			mutex_lock(&iommu->lock);
> >>>>> +			if (WARN_ON(!RB_EMPTY_ROOT(&dma->pfn_list)))
> >>>>> +				break;
> >>>>> +		}      
> >>>>
> >>>>
> >>>> Why exactly do we need to notify per vfio_dma rather than per unmap
> >>>> request?  If we do the latter we can send the notify first, limiting us
> >>>> to races where a page is pinned between the notify and the locking,
> >>>> whereas here, even our dma pointer is suspect once we re-acquire the
> >>>> lock, we don't technically know if another unmap could have removed
> >>>> that already.  Perhaps something like this (untested):
> >>>>       
> >>>
> >>> There are checks to validate unmap request, like v2 check and who is
> >>> calling unmap and is it allowed for that task to unmap. Before these
> >>> checks its not sure that unmap region range which asked for would be
> >>> unmapped all. Notify call should be at the place where its sure that the
> >>> range provided to notify call is definitely going to be removed. My
> >>> change do that.    
> >>
> >> Ok, but that does solve the problem.  What about this (untested):  
> > 
> > s/does/does not/
> > 
> > BTW, I like how the retries here fill the gap in my previous proposal
> > where we could still race re-pinning.  We've given it an honest shot or
> > someone is not participating if we've retried 10 times.  I don't
> > understand why the test for iommu->external_domain was there, clearly
> > if the list is not empty, we need to notify.  Thanks,
> >   
> 
> Ok. Retry is good to give a chance to unpin all. But is it really
> required to use BUG_ON() that would panic the host. I think WARN_ON
> should be fine and then when container is closed or when the last group
> is removed from the container, vfio_iommu_type1_release() is called and
> we have a chance to unpin it all.

See my comments on patch 10/22; we need to be vigilant that the vendor
driver is participating.  I don't think we should be cleaning up after
the vendor driver on release.  If we need to do that, it implies we
already have problems in multi-mdev containers, since we'll be left with
pfn_list entries that no longer have an owner.  Thanks,

Alex
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Kirti Wankhede Nov. 16, 2016, 4:16 a.m. UTC | #4
On 11/16/2016 9:28 AM, Alex Williamson wrote:
> On Wed, 16 Nov 2016 09:13:37 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
>> On 11/16/2016 8:55 AM, Alex Williamson wrote:
>>> On Tue, 15 Nov 2016 20:16:12 -0700
>>> Alex Williamson <alex.williamson@redhat.com> wrote:
>>>   
>>>> On Wed, 16 Nov 2016 08:16:15 +0530
>>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>>>>  
>>>>> On 11/16/2016 3:49 AM, Alex Williamson wrote:    
>>>>>> On Tue, 15 Nov 2016 20:59:54 +0530
>>>>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>>>>>>       
>>>>> ...
>>>>>     
>>>>>>> @@ -854,7 +857,28 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
>>>>>>>  		 */
>>>>>>>  		if (dma->task->mm != current->mm)
>>>>>>>  			break;
>>>>>>> +
>>>>>>>  		unmapped += dma->size;
>>>>>>> +
>>>>>>> +		if (iommu->external_domain && !RB_EMPTY_ROOT(&dma->pfn_list)) {
>>>>>>> +			struct vfio_iommu_type1_dma_unmap nb_unmap;
>>>>>>> +
>>>>>>> +			nb_unmap.iova = dma->iova;
>>>>>>> +			nb_unmap.size = dma->size;
>>>>>>> +
>>>>>>> +			/*
>>>>>>> +			 * Notifier callback would call vfio_unpin_pages() which
>>>>>>> +			 * would acquire iommu->lock. Release lock here and
>>>>>>> +			 * reacquire it again.
>>>>>>> +			 */
>>>>>>> +			mutex_unlock(&iommu->lock);
>>>>>>> +			blocking_notifier_call_chain(&iommu->notifier,
>>>>>>> +						    VFIO_IOMMU_NOTIFY_DMA_UNMAP,
>>>>>>> +						    &nb_unmap);
>>>>>>> +			mutex_lock(&iommu->lock);
>>>>>>> +			if (WARN_ON(!RB_EMPTY_ROOT(&dma->pfn_list)))
>>>>>>> +				break;
>>>>>>> +		}      
>>>>>>
>>>>>>
>>>>>> Why exactly do we need to notify per vfio_dma rather than per unmap
>>>>>> request?  If we do the latter we can send the notify first, limiting us
>>>>>> to races where a page is pinned between the notify and the locking,
>>>>>> whereas here, even our dma pointer is suspect once we re-acquire the
>>>>>> lock, we don't technically know if another unmap could have removed
>>>>>> that already.  Perhaps something like this (untested):
>>>>>>       
>>>>>
>>>>> There are checks to validate unmap request, like v2 check and who is
>>>>> calling unmap and is it allowed for that task to unmap. Before these
>>>>> checks its not sure that unmap region range which asked for would be
>>>>> unmapped all. Notify call should be at the place where its sure that the
>>>>> range provided to notify call is definitely going to be removed. My
>>>>> change do that.    
>>>>
>>>> Ok, but that does solve the problem.  What about this (untested):  
>>>
>>> s/does/does not/
>>>
>>> BTW, I like how the retries here fill the gap in my previous proposal
>>> where we could still race re-pinning.  We've given it an honest shot or
>>> someone is not participating if we've retried 10 times.  I don't
>>> understand why the test for iommu->external_domain was there, clearly
>>> if the list is not empty, we need to notify.  Thanks,
>>>   
>>
>> Ok. Retry is good to give a chance to unpin all. But is it really
>> required to use BUG_ON() that would panic the host. I think WARN_ON
>> should be fine and then when container is closed or when the last group
>> is removed from the container, vfio_iommu_type1_release() is called and
>> we have a chance to unpin it all.
> 
> See my comments on patch 10/22, we need to be vigilant that the vendor
> driver is participating.  I don't think we should be cleaning up after
> the vendor driver on release, if we need to do that, it implies we
> already have problems in multi-mdev containers since we'll be left with
> pfn_list entries that no longer have an owner.  Thanks,
> 

If any vendor driver doesn't clean up its pinned pages and there are
entries in pfn_list with no owner, that would be indicated by the
WARN_ON, and it should be fixed in that vendor driver. I still feel it
shouldn't cause a host panic.
When such a warning is seen with multiple mdev devices in a container,
it is easy to isolate which vendor driver is not cleaning up after
itself: the same warning would be seen with a single mdev device in a
container, so to find the culprit, check with one mdev device at a time.
Finally, we have a chance to clean up all residue from
vfio_iommu_type1_release() so that the vfio_iommu_type1 module doesn't
leave any leaks.

Thanks,
Kirti
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alex Williamson Nov. 16, 2016, 4:36 a.m. UTC | #5
On Wed, 16 Nov 2016 09:46:20 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> On 11/16/2016 9:28 AM, Alex Williamson wrote:
> > On Wed, 16 Nov 2016 09:13:37 +0530
> > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >   
> >> On 11/16/2016 8:55 AM, Alex Williamson wrote:  
> >>> On Tue, 15 Nov 2016 20:16:12 -0700
> >>> Alex Williamson <alex.williamson@redhat.com> wrote:
> >>>     
> >>>> On Wed, 16 Nov 2016 08:16:15 +0530
> >>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >>>>    
> >>>>> On 11/16/2016 3:49 AM, Alex Williamson wrote:      
> >>>>>> On Tue, 15 Nov 2016 20:59:54 +0530
> >>>>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >>>>>>         
> >>>>> ...
> >>>>>       
> >>>>>>> @@ -854,7 +857,28 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
> >>>>>>>  		 */
> >>>>>>>  		if (dma->task->mm != current->mm)
> >>>>>>>  			break;
> >>>>>>> +
> >>>>>>>  		unmapped += dma->size;
> >>>>>>> +
> >>>>>>> +		if (iommu->external_domain && !RB_EMPTY_ROOT(&dma->pfn_list)) {
> >>>>>>> +			struct vfio_iommu_type1_dma_unmap nb_unmap;
> >>>>>>> +
> >>>>>>> +			nb_unmap.iova = dma->iova;
> >>>>>>> +			nb_unmap.size = dma->size;
> >>>>>>> +
> >>>>>>> +			/*
> >>>>>>> +			 * Notifier callback would call vfio_unpin_pages() which
> >>>>>>> +			 * would acquire iommu->lock. Release lock here and
> >>>>>>> +			 * reacquire it again.
> >>>>>>> +			 */
> >>>>>>> +			mutex_unlock(&iommu->lock);
> >>>>>>> +			blocking_notifier_call_chain(&iommu->notifier,
> >>>>>>> +						    VFIO_IOMMU_NOTIFY_DMA_UNMAP,
> >>>>>>> +						    &nb_unmap);
> >>>>>>> +			mutex_lock(&iommu->lock);
> >>>>>>> +			if (WARN_ON(!RB_EMPTY_ROOT(&dma->pfn_list)))
> >>>>>>> +				break;
> >>>>>>> +		}        
> >>>>>>
> >>>>>>
> >>>>>> Why exactly do we need to notify per vfio_dma rather than per unmap
> >>>>>> request?  If we do the latter we can send the notify first, limiting us
> >>>>>> to races where a page is pinned between the notify and the locking,
> >>>>>> whereas here, even our dma pointer is suspect once we re-acquire the
> >>>>>> lock, we don't technically know if another unmap could have removed
> >>>>>> that already.  Perhaps something like this (untested):
> >>>>>>         
> >>>>>
> >>>>> There are checks to validate unmap request, like v2 check and who is
> >>>>> calling unmap and is it allowed for that task to unmap. Before these
> >>>>> checks its not sure that unmap region range which asked for would be
> >>>>> unmapped all. Notify call should be at the place where its sure that the
> >>>>> range provided to notify call is definitely going to be removed. My
> >>>>> change do that.      
> >>>>
> >>>> Ok, but that does solve the problem.  What about this (untested):    
> >>>
> >>> s/does/does not/
> >>>
> >>> BTW, I like how the retries here fill the gap in my previous proposal
> >>> where we could still race re-pinning.  We've given it an honest shot or
> >>> someone is not participating if we've retried 10 times.  I don't
> >>> understand why the test for iommu->external_domain was there, clearly
> >>> if the list is not empty, we need to notify.  Thanks,
> >>>     
> >>
> >> Ok. Retry is good to give a chance to unpin all. But is it really
> >> required to use BUG_ON() that would panic the host. I think WARN_ON
> >> should be fine and then when container is closed or when the last group
> >> is removed from the container, vfio_iommu_type1_release() is called and
> >> we have a chance to unpin it all.  
> > 
> > See my comments on patch 10/22, we need to be vigilant that the vendor
> > driver is participating.  I don't think we should be cleaning up after
> > the vendor driver on release, if we need to do that, it implies we
> > already have problems in multi-mdev containers since we'll be left with
> > pfn_list entries that no longer have an owner.  Thanks,
> >   
> 
> If any vendor driver doesn't clean its pinned pages and there are
> entries in pfn_list with no owner, that would be indicated by WARN_ON,
> which should be fixed by that vendor driver. I still feel it shouldn't
> cause host panic.
> When such warning is seen with multiple mdev devices in container, it is
> easy to isolate and find which vendor driver is not cleaning their
> stuff, same warning would be seen with single mdev device in a
> container. To isolate and find which vendor driver is culprit check with
> one mdev device at a time.
> Finally, we have a chance to clean all residue from
> vfio_iommu_type1_release() so that vfio_iommu_type1 module doesn't leave
> any leaks.

How can we claim that we've resolved anything by unpinning the
residue?  In fact, is it actually safe to unpin any residue left by the
vendor driver, or does it imply that we're promoting a simple memory
leak to a security issue because we can't verify whether the vendor
driver has disabled access to that pfn, which may not reference a user
page after we unpin it?  That, in addition to the fact that I don't
need to figure out how to break from the loop with a BUG_ON, is why I
chose that rather than a WARN_ON.  The release path could probably be a
WARN_ON since the user no longer has access to the device, so we have a
consistency error with the vendor driver, but we're probably not
promoting it further by unpinning the pages.  Thanks,

Alex
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Kirti Wankhede Nov. 16, 2016, 3:22 p.m. UTC | #6
On 11/16/2016 10:06 AM, Alex Williamson wrote:
> On Wed, 16 Nov 2016 09:46:20 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
>> On 11/16/2016 9:28 AM, Alex Williamson wrote:
>>> On Wed, 16 Nov 2016 09:13:37 +0530
>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>>>   
>>>> On 11/16/2016 8:55 AM, Alex Williamson wrote:  
>>>>> On Tue, 15 Nov 2016 20:16:12 -0700
>>>>> Alex Williamson <alex.williamson@redhat.com> wrote:
>>>>>     
>>>>>> On Wed, 16 Nov 2016 08:16:15 +0530
>>>>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>>>>>>    
>>>>>>> On 11/16/2016 3:49 AM, Alex Williamson wrote:      
>>>>>>>> On Tue, 15 Nov 2016 20:59:54 +0530
>>>>>>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>>>>>>>>         
>>>>>>> ...
>>>>>>>       
>>>>>>>>> @@ -854,7 +857,28 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
>>>>>>>>>  		 */
>>>>>>>>>  		if (dma->task->mm != current->mm)
>>>>>>>>>  			break;
>>>>>>>>> +
>>>>>>>>>  		unmapped += dma->size;
>>>>>>>>> +
>>>>>>>>> +		if (iommu->external_domain && !RB_EMPTY_ROOT(&dma->pfn_list)) {
>>>>>>>>> +			struct vfio_iommu_type1_dma_unmap nb_unmap;
>>>>>>>>> +
>>>>>>>>> +			nb_unmap.iova = dma->iova;
>>>>>>>>> +			nb_unmap.size = dma->size;
>>>>>>>>> +
>>>>>>>>> +			/*
>>>>>>>>> +			 * Notifier callback would call vfio_unpin_pages() which
>>>>>>>>> +			 * would acquire iommu->lock. Release lock here and
>>>>>>>>> +			 * reacquire it again.
>>>>>>>>> +			 */
>>>>>>>>> +			mutex_unlock(&iommu->lock);
>>>>>>>>> +			blocking_notifier_call_chain(&iommu->notifier,
>>>>>>>>> +						    VFIO_IOMMU_NOTIFY_DMA_UNMAP,
>>>>>>>>> +						    &nb_unmap);
>>>>>>>>> +			mutex_lock(&iommu->lock);
>>>>>>>>> +			if (WARN_ON(!RB_EMPTY_ROOT(&dma->pfn_list)))
>>>>>>>>> +				break;
>>>>>>>>> +		}        
>>>>>>>>
>>>>>>>>
>>>>>>>> Why exactly do we need to notify per vfio_dma rather than per unmap
>>>>>>>> request?  If we do the latter we can send the notify first, limiting us
>>>>>>>> to races where a page is pinned between the notify and the locking,
>>>>>>>> whereas here, even our dma pointer is suspect once we re-acquire the
>>>>>>>> lock, we don't technically know if another unmap could have removed
>>>>>>>> that already.  Perhaps something like this (untested):
>>>>>>>>         
>>>>>>>
>>>>>>> There are checks to validate unmap request, like v2 check and who is
>>>>>>> calling unmap and is it allowed for that task to unmap. Before these
>>>>>>> checks its not sure that unmap region range which asked for would be
>>>>>>> unmapped all. Notify call should be at the place where its sure that the
>>>>>>> range provided to notify call is definitely going to be removed. My
>>>>>>> change do that.      
>>>>>>
>>>>>> Ok, but that does solve the problem.  What about this (untested):    
>>>>>
>>>>> s/does/does not/
>>>>>
>>>>> BTW, I like how the retries here fill the gap in my previous proposal
>>>>> where we could still race re-pinning.  We've given it an honest shot or
>>>>> someone is not participating if we've retried 10 times.  I don't
>>>>> understand why the test for iommu->external_domain was there, clearly
>>>>> if the list is not empty, we need to notify.  Thanks,
>>>>>     
>>>>
>>>> Ok. Retry is good to give a chance to unpin all. But is it really
>>>> required to use BUG_ON() that would panic the host. I think WARN_ON
>>>> should be fine and then when container is closed or when the last group
>>>> is removed from the container, vfio_iommu_type1_release() is called and
>>>> we have a chance to unpin it all.  
>>>
>>> See my comments on patch 10/22, we need to be vigilant that the vendor
>>> driver is participating.  I don't think we should be cleaning up after
>>> the vendor driver on release, if we need to do that, it implies we
>>> already have problems in multi-mdev containers since we'll be left with
>>> pfn_list entries that no longer have an owner.  Thanks,
>>>   
>>
>> If any vendor driver doesn't clean its pinned pages and there are
>> entries in pfn_list with no owner, that would be indicated by WARN_ON,
>> which should be fixed by that vendor driver. I still feel it shouldn't
>> cause host panic.
>> When such warning is seen with multiple mdev devices in container, it is
>> easy to isolate and find which vendor driver is not cleaning their
>> stuff, same warning would be seen with single mdev device in a
>> container. To isolate and find which vendor driver is culprit check with
>> one mdev device at a time.
>> Finally, we have a chance to clean all residue from
>> vfio_iommu_type1_release() so that vfio_iommu_type1 module doesn't leave
>> any leaks.
> 
> How can we claim that we've resolved anything by unpinning the
> residue?  In fact, is it actually safe to unpin any residue left by the
> vendor driver or does it imply that we're promoting a simple memory
> leak to a security issue because we can't verify whether the vendor
> driver has disabled access to that pfn, which may not reference a user
> page after we unpin it.  That, in addition to the fact that I don't
> need to figure out how to break from the loop with a BUG_ON, is why I
> chose that rather than a WARN_ON.  The release path could probably be a
> WARN_ON since the user no longer has access to the device, so we have a
> consistency error with the vendor driver, but we're probably not
> promoting it further by unpinning the pages.  Thanks,
> 

Ok. I agree with the security concern you mentioned.
I am changing to BUG_ON in vfio_dma_do_unmap(), as you suggested, and
replacing 'unpinning remaining pages on detach_group and release' with a
'WARN_ON' if there are pages that are still pinned.

Thanks,
Kirti
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index ee9a680..50cafdf 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -782,9 +782,9 @@  static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
 			     struct vfio_iommu_type1_dma_unmap *unmap)
 {
 	uint64_t mask;
-	struct vfio_dma *dma;
+	struct vfio_dma *dma, *dma_last = NULL;
 	size_t unmapped = 0;
-	int ret = 0;
+	int ret = 0, retries;
 
 	mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
 
@@ -794,7 +794,7 @@  static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
 		return -EINVAL;
 
 	WARN_ON(mask & PAGE_MASK);
-
+again:
 	mutex_lock(&iommu->lock);
 
 	/*
@@ -851,11 +851,16 @@  static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
 		if (dma->task->mm != current->mm)
 			break;
 
-		unmapped += dma->size;
-
-		if (iommu->external_domain && !RB_EMPTY_ROOT(&dma->pfn_list)) {
+		if (!RB_EMPTY_ROOT(&dma->pfn_list)) {
 			struct vfio_iommu_type1_dma_unmap nb_unmap;
 
+			if (dma_last == dma) {
+				BUG_ON(++retries > 10);
+			} else {
+				dma_last = dma;
+				retries = 0;
+			}
+
 			nb_unmap.iova = dma->iova;
 			nb_unmap.size = dma->size;
 
@@ -868,11 +873,11 @@  static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
 			blocking_notifier_call_chain(&iommu->notifier,
 						    VFIO_IOMMU_NOTIFY_DMA_UNMAP,
 						    &nb_unmap);
-			mutex_lock(&iommu->lock);
-			if (WARN_ON(!RB_EMPTY_ROOT(&dma->pfn_list)))
-				break;
+			goto again:
 		}
+		unmapped += dma->size;
 		vfio_remove_dma(iommu, dma);
+
 	}
 
 unlock: