diff mbox series

[1/2] drm/ttm: set ttm_buffer_object pointer as null after it's freed

Message ID 1536569876-27262-1-git-send-email-ray.huang@amd.com (mailing list archive)
State New, archived
Headers show
Series [1/2] drm/ttm: set ttm_buffer_object pointer as null after it's freed | expand

Commit Message

Huang Rui Sept. 10, 2018, 8:57 a.m. UTC
It avoids to be refered again after freed.

Signed-off-by: Huang Rui <ray.huang@amd.com>
Cc: Christian König <christian.koenig@amd.com>
Cc: Tom StDenis <Tom.StDenis@amd.com>
---
 drivers/gpu/drm/ttm/ttm_bo.c | 1 +
 1 file changed, 1 insertion(+)

Comments

Christian König Sept. 10, 2018, 9 a.m. UTC | #1
Hi Ray,

well those patches doesn't make sense, the pointer is only local to the 
function.

Regards,
Christian.

Am 10.09.2018 um 10:57 schrieb Huang Rui:
> It avoids to be refered again after freed.
>
> Signed-off-by: Huang Rui <ray.huang@amd.com>
> Cc: Christian König <christian.koenig@amd.com>
> Cc: Tom StDenis <Tom.StDenis@amd.com>
> ---
>   drivers/gpu/drm/ttm/ttm_bo.c | 1 +
>   1 file changed, 1 insertion(+)
>
> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
> index 138c989..d3ef5f8 100644
> --- a/drivers/gpu/drm/ttm/ttm_bo.c
> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
> @@ -54,6 +54,7 @@ static struct attribute ttm_bo_count = {
>   static void ttm_bo_default_destroy(struct ttm_buffer_object *bo)
>   {
>   	kfree(bo);
> +	bo = NULL;
>   }
>   
>   static inline int ttm_mem_type_from_place(const struct ttm_place *place,
Huang Rui Sept. 10, 2018, 9:23 a.m. UTC | #2
On Mon, Sep 10, 2018 at 11:00:04AM +0200, Christian König wrote:
> Hi Ray,
> 
> well those patches doesn't make sense, the pointer is only local to
> the function.

You're right.
I narrowed it with gdb dump from ttm_bo_bulk_move_lru_tail+0x2b, the
use-after-free should be in below codes: 

man = &bulk->tt[i].first->bdev->man[TTM_PL_TT];
ttm_bo_bulk_move_helper(&bulk->tt[i], &man->lru[i], false);

Is there a case, when orignal bo is destroyed in the bulk pos, but it
doesn't update pos->first pointer, then we still use it during the bulk
moving?

Thanks,
Ray

> 
> Regards,
> Christian.
> 
> Am 10.09.2018 um 10:57 schrieb Huang Rui:
> >It avoids to be refered again after freed.
> >
> >Signed-off-by: Huang Rui <ray.huang@amd.com>
> >Cc: Christian König <christian.koenig@amd.com>
> >Cc: Tom StDenis <Tom.StDenis@amd.com>
> >---
> >  drivers/gpu/drm/ttm/ttm_bo.c | 1 +
> >  1 file changed, 1 insertion(+)
> >
> >diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
> >index 138c989..d3ef5f8 100644
> >--- a/drivers/gpu/drm/ttm/ttm_bo.c
> >+++ b/drivers/gpu/drm/ttm/ttm_bo.c
> >@@ -54,6 +54,7 @@ static struct attribute ttm_bo_count = {
> >  static void ttm_bo_default_destroy(struct ttm_buffer_object *bo)
> >  {
> >  	kfree(bo);
> >+	bo = NULL;
> >  }
> >  static inline int ttm_mem_type_from_place(const struct ttm_place *place,
> 
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Christian König Sept. 10, 2018, 9:25 a.m. UTC | #3
Am 10.09.2018 um 11:23 schrieb Huang Rui:
> On Mon, Sep 10, 2018 at 11:00:04AM +0200, Christian König wrote:
>> Hi Ray,
>>
>> well those patches doesn't make sense, the pointer is only local to
>> the function.
> You're right.
> I narrowed it with gdb dump from ttm_bo_bulk_move_lru_tail+0x2b, the
> use-after-free should be in below codes:
>
> man = &bulk->tt[i].first->bdev->man[TTM_PL_TT];
> ttm_bo_bulk_move_helper(&bulk->tt[i], &man->lru[i], false);
>
> Is there a case, when orignal bo is destroyed in the bulk pos, but it
> doesn't update pos->first pointer, then we still use it during the bulk
> moving?

Only when a per VM BO is freed or the VM destroyed.

The first case should now be handled by "drm/amdgpu: set bulk_moveable 
to false when a per VM is released" and when we use a destroyed VM we 
would see other problems as well.

BTW: Just pushed this commit to the repository, should show up any second.

Christian.

>
> Thanks,
> Ray
>
>> Regards,
>> Christian.
>>
>> Am 10.09.2018 um 10:57 schrieb Huang Rui:
>>> It avoids to be refered again after freed.
>>>
>>> Signed-off-by: Huang Rui <ray.huang@amd.com>
>>> Cc: Christian König <christian.koenig@amd.com>
>>> Cc: Tom StDenis <Tom.StDenis@amd.com>
>>> ---
>>>   drivers/gpu/drm/ttm/ttm_bo.c | 1 +
>>>   1 file changed, 1 insertion(+)
>>>
>>> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
>>> index 138c989..d3ef5f8 100644
>>> --- a/drivers/gpu/drm/ttm/ttm_bo.c
>>> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
>>> @@ -54,6 +54,7 @@ static struct attribute ttm_bo_count = {
>>>   static void ttm_bo_default_destroy(struct ttm_buffer_object *bo)
>>>   {
>>>   	kfree(bo);
>>> +	bo = NULL;
>>>   }
>>>   static inline int ttm_mem_type_from_place(const struct ttm_place *place,
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Huang Rui Sept. 10, 2018, 11:58 a.m. UTC | #4
On Mon, Sep 10, 2018 at 05:25:48PM +0800, Koenig, Christian wrote:
> Am 10.09.2018 um 11:23 schrieb Huang Rui:
> > On Mon, Sep 10, 2018 at 11:00:04AM +0200, Christian König wrote:
> >> Hi Ray,
> >>
> >> well those patches doesn't make sense, the pointer is only local to
> >> the function.
> > You're right.
> > I narrowed it with gdb dump from ttm_bo_bulk_move_lru_tail+0x2b, the
> > use-after-free should be in below codes:
> >
> > man = &bulk->tt[i].first->bdev->man[TTM_PL_TT];
> > ttm_bo_bulk_move_helper(&bulk->tt[i], &man->lru[i], false);
> >
> > Is there a case, when orignal bo is destroyed in the bulk pos, but it
> > doesn't update pos->first pointer, then we still use it during the bulk
> > moving?
> 
> Only when a per VM BO is freed or the VM destroyed.
> 
> The first case should now be handled by "drm/amdgpu: set bulk_moveable 
> to false when a per VM is released" and when we use a destroyed VM we 
> would see other problems as well.

When a VM instance is teared down, all BOs which belong to that VM should
be removed from LRU list. How can we use a destroyed VM when we submmit a
command? You know, we do the bulk moving at the last step of submission.

> 
> BTW: Just pushed this commit to the repository, should show up any second.

I see, thanks.

Thanks,
Ray

> 
> Christian.
> 
> >
> > Thanks,
> > Ray
> >
> >> Regards,
> >> Christian.
> >>
> >> Am 10.09.2018 um 10:57 schrieb Huang Rui:
> >>> It avoids to be refered again after freed.
> >>>
> >>> Signed-off-by: Huang Rui <ray.huang@amd.com>
> >>> Cc: Christian König <christian.koenig@amd.com>
> >>> Cc: Tom StDenis <Tom.StDenis@amd.com>
> >>> ---
> >>>   drivers/gpu/drm/ttm/ttm_bo.c | 1 +
> >>>   1 file changed, 1 insertion(+)
> >>>
> >>> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
> >>> index 138c989..d3ef5f8 100644
> >>> --- a/drivers/gpu/drm/ttm/ttm_bo.c
> >>> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
> >>> @@ -54,6 +54,7 @@ static struct attribute ttm_bo_count = {
> >>>   static void ttm_bo_default_destroy(struct ttm_buffer_object *bo)
> >>>   {
> >>>   	kfree(bo);
> >>> +	bo = NULL;
> >>>   }
> >>>   static inline int ttm_mem_type_from_place(const struct ttm_place *place,
> >> _______________________________________________
> >> amd-gfx mailing list
> >> amd-gfx@lists.freedesktop.org
> >> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>
Huang Rui Sept. 10, 2018, 12:05 p.m. UTC | #5
On Mon, Sep 10, 2018 at 05:25:48PM +0800, Koenig, Christian wrote:
> Am 10.09.2018 um 11:23 schrieb Huang Rui:
> > On Mon, Sep 10, 2018 at 11:00:04AM +0200, Christian König wrote:
> >> Hi Ray,
> >>
> >> well those patches doesn't make sense, the pointer is only local to
> >> the function.
> > You're right.
> > I narrowed it with gdb dump from ttm_bo_bulk_move_lru_tail+0x2b, the
> > use-after-free should be in below codes:
> >
> > man = &bulk->tt[i].first->bdev->man[TTM_PL_TT];
> > ttm_bo_bulk_move_helper(&bulk->tt[i], &man->lru[i], false);
> >
> > Is there a case, when orignal bo is destroyed in the bulk pos, but it
> > doesn't update pos->first pointer, then we still use it during the bulk
> > moving?
> 
> Only when a per VM BO is freed or the VM destroyed.
> 
> The first case should now be handled by "drm/amdgpu: set bulk_moveable 
> to false when a per VM is released" and when we use a destroyed VM we 
> would see other problems as well.
> 

If a VM instance is teared down, all BOs which belong that VM should be
removed from LRU. But how can we submit cmd based on a destroyed VM? You
know, we do the bulk move at last step of submission.


Thanks,
Ray

> BTW: Just pushed this commit to the repository, should show up any second.
> 
> Christian.
> 
> >
> > Thanks,
> > Ray
> >
> >> Regards,
> >> Christian.
> >>
> >> Am 10.09.2018 um 10:57 schrieb Huang Rui:
> >>> It avoids to be refered again after freed.
> >>>
> >>> Signed-off-by: Huang Rui <ray.huang@amd.com>
> >>> Cc: Christian König <christian.koenig@amd.com>
> >>> Cc: Tom StDenis <Tom.StDenis@amd.com>
> >>> ---
> >>>   drivers/gpu/drm/ttm/ttm_bo.c | 1 +
> >>>   1 file changed, 1 insertion(+)
> >>>
> >>> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
> >>> index 138c989..d3ef5f8 100644
> >>> --- a/drivers/gpu/drm/ttm/ttm_bo.c
> >>> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
> >>> @@ -54,6 +54,7 @@ static struct attribute ttm_bo_count = {
> >>>   static void ttm_bo_default_destroy(struct ttm_buffer_object *bo)
> >>>   {
> >>>   	kfree(bo);
> >>> +	bo = NULL;
> >>>   }
> >>>   static inline int ttm_mem_type_from_place(const struct ttm_place *place,
> >> _______________________________________________
> >> amd-gfx mailing list
> >> amd-gfx@lists.freedesktop.org
> >> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>
Christian König Sept. 10, 2018, 12:49 p.m. UTC | #6
Am 10.09.2018 um 14:05 schrieb Huang Rui:
> On Mon, Sep 10, 2018 at 05:25:48PM +0800, Koenig, Christian wrote:
>> Am 10.09.2018 um 11:23 schrieb Huang Rui:
>>> On Mon, Sep 10, 2018 at 11:00:04AM +0200, Christian König wrote:
>>>> Hi Ray,
>>>>
>>>> well those patches doesn't make sense, the pointer is only local to
>>>> the function.
>>> You're right.
>>> I narrowed it with gdb dump from ttm_bo_bulk_move_lru_tail+0x2b, the
>>> use-after-free should be in below codes:
>>>
>>> man = &bulk->tt[i].first->bdev->man[TTM_PL_TT];
>>> ttm_bo_bulk_move_helper(&bulk->tt[i], &man->lru[i], false);
>>>
>>> Is there a case, when orignal bo is destroyed in the bulk pos, but it
>>> doesn't update pos->first pointer, then we still use it during the bulk
>>> moving?
>> Only when a per VM BO is freed or the VM destroyed.
>>
>> The first case should now be handled by "drm/amdgpu: set bulk_moveable
>> to false when a per VM is released" and when we use a destroyed VM we
>> would see other problems as well.
>>
> If a VM instance is teared down, all BOs which belong that VM should be
> removed from LRU. But how can we submit cmd based on a destroyed VM? You
> know, we do the bulk move at last step of submission.

Well exactly that's the point this can't happen :)

Otherwise we would crash because of using freed up memory much earlier 
in the command submission.

The best idea I have to track this down further is to add some 
trace_printk in ttm_bo_bulk_move_helper and amdgpu_bo_destroy and see 
why and when we are actually using a destroyed BO.

Christian.

>
>
> Thanks,
> Ray
>
>> BTW: Just pushed this commit to the repository, should show up any second.
>>
>> Christian.
>>
>>> Thanks,
>>> Ray
>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>> Am 10.09.2018 um 10:57 schrieb Huang Rui:
>>>>> It avoids to be refered again after freed.
>>>>>
>>>>> Signed-off-by: Huang Rui <ray.huang@amd.com>
>>>>> Cc: Christian König <christian.koenig@amd.com>
>>>>> Cc: Tom StDenis <Tom.StDenis@amd.com>
>>>>> ---
>>>>>    drivers/gpu/drm/ttm/ttm_bo.c | 1 +
>>>>>    1 file changed, 1 insertion(+)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
>>>>> index 138c989..d3ef5f8 100644
>>>>> --- a/drivers/gpu/drm/ttm/ttm_bo.c
>>>>> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
>>>>> @@ -54,6 +54,7 @@ static struct attribute ttm_bo_count = {
>>>>>    static void ttm_bo_default_destroy(struct ttm_buffer_object *bo)
>>>>>    {
>>>>>    	kfree(bo);
>>>>> +	bo = NULL;
>>>>>    }
>>>>>    static inline int ttm_mem_type_from_place(const struct ttm_place *place,
>>>> _______________________________________________
>>>> amd-gfx mailing list
>>>> amd-gfx@lists.freedesktop.org
>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> _______________________________________________
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/dri-devel
StDenis, Tom Sept. 10, 2018, 12:59 p.m. UTC | #7
Hi Christian,

Are you adding new traces or turning on existing ones?  Would you like 
me to try them out in my setup?

Tom

On 2018-09-10 8:49 a.m., Christian König wrote:
> Am 10.09.2018 um 14:05 schrieb Huang Rui:
>> On Mon, Sep 10, 2018 at 05:25:48PM +0800, Koenig, Christian wrote:
>>> Am 10.09.2018 um 11:23 schrieb Huang Rui:
>>>> On Mon, Sep 10, 2018 at 11:00:04AM +0200, Christian König wrote:
>>>>> Hi Ray,
>>>>>
>>>>> well those patches doesn't make sense, the pointer is only local to
>>>>> the function.
>>>> You're right.
>>>> I narrowed it with gdb dump from ttm_bo_bulk_move_lru_tail+0x2b, the
>>>> use-after-free should be in below codes:
>>>>
>>>> man = &bulk->tt[i].first->bdev->man[TTM_PL_TT];
>>>> ttm_bo_bulk_move_helper(&bulk->tt[i], &man->lru[i], false);
>>>>
>>>> Is there a case, when orignal bo is destroyed in the bulk pos, but it
>>>> doesn't update pos->first pointer, then we still use it during the bulk
>>>> moving?
>>> Only when a per VM BO is freed or the VM destroyed.
>>>
>>> The first case should now be handled by "drm/amdgpu: set bulk_moveable
>>> to false when a per VM is released" and when we use a destroyed VM we
>>> would see other problems as well.
>>>
>> If a VM instance is teared down, all BOs which belong that VM should be
>> removed from LRU. But how can we submit cmd based on a destroyed VM? You
>> know, we do the bulk move at last step of submission.
> 
> Well exactly that's the point this can't happen :)
> 
> Otherwise we would crash because of using freed up memory much earlier 
> in the command submission.
> 
> The best idea I have to track this down further is to add some 
> trace_printk in ttm_bo_bulk_move_helper and amdgpu_bo_destroy and see 
> why and when we are actually using a destroyed BO.
> 
> Christian.
> 
>>
>>
>> Thanks,
>> Ray
>>
>>> BTW: Just pushed this commit to the repository, should show up any 
>>> second.
>>>
>>> Christian.
>>>
>>>> Thanks,
>>>> Ray
>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>> Am 10.09.2018 um 10:57 schrieb Huang Rui:
>>>>>> It avoids to be refered again after freed.
>>>>>>
>>>>>> Signed-off-by: Huang Rui <ray.huang@amd.com>
>>>>>> Cc: Christian König <christian.koenig@amd.com>
>>>>>> Cc: Tom StDenis <Tom.StDenis@amd.com>
>>>>>> ---
>>>>>>    drivers/gpu/drm/ttm/ttm_bo.c | 1 +
>>>>>>    1 file changed, 1 insertion(+)
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c 
>>>>>> b/drivers/gpu/drm/ttm/ttm_bo.c
>>>>>> index 138c989..d3ef5f8 100644
>>>>>> --- a/drivers/gpu/drm/ttm/ttm_bo.c
>>>>>> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
>>>>>> @@ -54,6 +54,7 @@ static struct attribute ttm_bo_count = {
>>>>>>    static void ttm_bo_default_destroy(struct ttm_buffer_object *bo)
>>>>>>    {
>>>>>>        kfree(bo);
>>>>>> +    bo = NULL;
>>>>>>    }
>>>>>>    static inline int ttm_mem_type_from_place(const struct 
>>>>>> ttm_place *place,
>>>>> _______________________________________________
>>>>> amd-gfx mailing list
>>>>> amd-gfx@lists.freedesktop.org
>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>> _______________________________________________
>> dri-devel mailing list
>> dri-devel@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/dri-devel
>
Christian König Sept. 10, 2018, 1:04 p.m. UTC | #8
Hi Tom,

I'm talking about adding new printks to figure out what the heck is 
going wrong here.

Thanks,
Christian.

Am 10.09.2018 um 14:59 schrieb Tom St Denis:
> Hi Christian,
>
> Are you adding new traces or turning on existing ones?  Would you like 
> me to try them out in my setup?
>
> Tom
>
> On 2018-09-10 8:49 a.m., Christian König wrote:
>> Am 10.09.2018 um 14:05 schrieb Huang Rui:
>>> On Mon, Sep 10, 2018 at 05:25:48PM +0800, Koenig, Christian wrote:
>>>> Am 10.09.2018 um 11:23 schrieb Huang Rui:
>>>>> On Mon, Sep 10, 2018 at 11:00:04AM +0200, Christian König wrote:
>>>>>> Hi Ray,
>>>>>>
>>>>>> well those patches doesn't make sense, the pointer is only local to
>>>>>> the function.
>>>>> You're right.
>>>>> I narrowed it with gdb dump from ttm_bo_bulk_move_lru_tail+0x2b, the
>>>>> use-after-free should be in below codes:
>>>>>
>>>>> man = &bulk->tt[i].first->bdev->man[TTM_PL_TT];
>>>>> ttm_bo_bulk_move_helper(&bulk->tt[i], &man->lru[i], false);
>>>>>
>>>>> Is there a case, when orignal bo is destroyed in the bulk pos, but it
>>>>> doesn't update pos->first pointer, then we still use it during the 
>>>>> bulk
>>>>> moving?
>>>> Only when a per VM BO is freed or the VM destroyed.
>>>>
>>>> The first case should now be handled by "drm/amdgpu: set bulk_moveable
>>>> to false when a per VM is released" and when we use a destroyed VM we
>>>> would see other problems as well.
>>>>
>>> If a VM instance is teared down, all BOs which belong that VM should be
>>> removed from LRU. But how can we submit cmd based on a destroyed VM? 
>>> You
>>> know, we do the bulk move at last step of submission.
>>
>> Well exactly that's the point this can't happen :)
>>
>> Otherwise we would crash because of using freed up memory much 
>> earlier in the command submission.
>>
>> The best idea I have to track this down further is to add some 
>> trace_printk in ttm_bo_bulk_move_helper and amdgpu_bo_destroy and see 
>> why and when we are actually using a destroyed BO.
>>
>> Christian.
>>
>>>
>>>
>>> Thanks,
>>> Ray
>>>
>>>> BTW: Just pushed this commit to the repository, should show up any 
>>>> second.
>>>>
>>>> Christian.
>>>>
>>>>> Thanks,
>>>>> Ray
>>>>>
>>>>>> Regards,
>>>>>> Christian.
>>>>>>
>>>>>> Am 10.09.2018 um 10:57 schrieb Huang Rui:
>>>>>>> It avoids to be refered again after freed.
>>>>>>>
>>>>>>> Signed-off-by: Huang Rui <ray.huang@amd.com>
>>>>>>> Cc: Christian König <christian.koenig@amd.com>
>>>>>>> Cc: Tom StDenis <Tom.StDenis@amd.com>
>>>>>>> ---
>>>>>>>    drivers/gpu/drm/ttm/ttm_bo.c | 1 +
>>>>>>>    1 file changed, 1 insertion(+)
>>>>>>>
>>>>>>> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c 
>>>>>>> b/drivers/gpu/drm/ttm/ttm_bo.c
>>>>>>> index 138c989..d3ef5f8 100644
>>>>>>> --- a/drivers/gpu/drm/ttm/ttm_bo.c
>>>>>>> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
>>>>>>> @@ -54,6 +54,7 @@ static struct attribute ttm_bo_count = {
>>>>>>>    static void ttm_bo_default_destroy(struct ttm_buffer_object *bo)
>>>>>>>    {
>>>>>>>        kfree(bo);
>>>>>>> +    bo = NULL;
>>>>>>>    }
>>>>>>>    static inline int ttm_mem_type_from_place(const struct 
>>>>>>> ttm_place *place,
>>>>>> _______________________________________________
>>>>>> amd-gfx mailing list
>>>>>> amd-gfx@lists.freedesktop.org
>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>> _______________________________________________
>>> dri-devel mailing list
>>> dri-devel@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/dri-devel
>>
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
StDenis, Tom Sept. 10, 2018, 1:05 p.m. UTC | #9
On 2018-09-10 9:04 a.m., Christian König wrote:
> Hi Tom,
> 
> I'm talking about adding new printks to figure out what the heck is 
> going wrong here.
> 
> Thanks,
> Christian.

Hi Christian,

Sure, if you want to send me a simple patch that adds more printk I'll 
gladly give it a try (doubly so since my workstation depends on our 
staging tree to work properly...).

Tom

> 
> Am 10.09.2018 um 14:59 schrieb Tom St Denis:
>> Hi Christian,
>>
>> Are you adding new traces or turning on existing ones?  Would you like 
>> me to try them out in my setup?
>>
>> Tom
>>
>> On 2018-09-10 8:49 a.m., Christian König wrote:
>>> Am 10.09.2018 um 14:05 schrieb Huang Rui:
>>>> On Mon, Sep 10, 2018 at 05:25:48PM +0800, Koenig, Christian wrote:
>>>>> Am 10.09.2018 um 11:23 schrieb Huang Rui:
>>>>>> On Mon, Sep 10, 2018 at 11:00:04AM +0200, Christian König wrote:
>>>>>>> Hi Ray,
>>>>>>>
>>>>>>> well those patches doesn't make sense, the pointer is only local to
>>>>>>> the function.
>>>>>> You're right.
>>>>>> I narrowed it with gdb dump from ttm_bo_bulk_move_lru_tail+0x2b, the
>>>>>> use-after-free should be in below codes:
>>>>>>
>>>>>> man = &bulk->tt[i].first->bdev->man[TTM_PL_TT];
>>>>>> ttm_bo_bulk_move_helper(&bulk->tt[i], &man->lru[i], false);
>>>>>>
>>>>>> Is there a case, when orignal bo is destroyed in the bulk pos, but it
>>>>>> doesn't update pos->first pointer, then we still use it during the 
>>>>>> bulk
>>>>>> moving?
>>>>> Only when a per VM BO is freed or the VM destroyed.
>>>>>
>>>>> The first case should now be handled by "drm/amdgpu: set bulk_moveable
>>>>> to false when a per VM is released" and when we use a destroyed VM we
>>>>> would see other problems as well.
>>>>>
>>>> If a VM instance is teared down, all BOs which belong that VM should be
>>>> removed from LRU. But how can we submit cmd based on a destroyed VM? 
>>>> You
>>>> know, we do the bulk move at last step of submission.
>>>
>>> Well exactly that's the point this can't happen :)
>>>
>>> Otherwise we would crash because of using freed up memory much 
>>> earlier in the command submission.
>>>
>>> The best idea I have to track this down further is to add some 
>>> trace_printk in ttm_bo_bulk_move_helper and amdgpu_bo_destroy and see 
>>> why and when we are actually using a destroyed BO.
>>>
>>> Christian.
>>>
>>>>
>>>>
>>>> Thanks,
>>>> Ray
>>>>
>>>>> BTW: Just pushed this commit to the repository, should show up any 
>>>>> second.
>>>>>
>>>>> Christian.
>>>>>
>>>>>> Thanks,
>>>>>> Ray
>>>>>>
>>>>>>> Regards,
>>>>>>> Christian.
>>>>>>>
>>>>>>> Am 10.09.2018 um 10:57 schrieb Huang Rui:
>>>>>>>> It avoids to be refered again after freed.
>>>>>>>>
>>>>>>>> Signed-off-by: Huang Rui <ray.huang@amd.com>
>>>>>>>> Cc: Christian König <christian.koenig@amd.com>
>>>>>>>> Cc: Tom StDenis <Tom.StDenis@amd.com>
>>>>>>>> ---
>>>>>>>>    drivers/gpu/drm/ttm/ttm_bo.c | 1 +
>>>>>>>>    1 file changed, 1 insertion(+)
>>>>>>>>
>>>>>>>> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c 
>>>>>>>> b/drivers/gpu/drm/ttm/ttm_bo.c
>>>>>>>> index 138c989..d3ef5f8 100644
>>>>>>>> --- a/drivers/gpu/drm/ttm/ttm_bo.c
>>>>>>>> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
>>>>>>>> @@ -54,6 +54,7 @@ static struct attribute ttm_bo_count = {
>>>>>>>>    static void ttm_bo_default_destroy(struct ttm_buffer_object *bo)
>>>>>>>>    {
>>>>>>>>        kfree(bo);
>>>>>>>> +    bo = NULL;
>>>>>>>>    }
>>>>>>>>    static inline int ttm_mem_type_from_place(const struct 
>>>>>>>> ttm_place *place,
>>>>>>> _______________________________________________
>>>>>>> amd-gfx mailing list
>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>> _______________________________________________
>>>> dri-devel mailing list
>>>> dri-devel@lists.freedesktop.org
>>>> https://lists.freedesktop.org/mailman/listinfo/dri-devel
>>>
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>
Christian König Sept. 10, 2018, 1:10 p.m. UTC | #10
Am 10.09.2018 um 15:05 schrieb Tom St Denis:
> On 2018-09-10 9:04 a.m., Christian König wrote:
>> Hi Tom,
>>
>> I'm talking about adding new printks to figure out what the heck is 
>> going wrong here.
>>
>> Thanks,
>> Christian.
>
> Hi Christian,
>
> Sure, if you want to send me a simple patch that adds more printk I'll 
> gladly give it a try (doubly so since my workstation depends on our 
> staging tree to work properly...).

Just add a printk to ttm_bo_bulk_move_helper to print pos->first and 
pos->last.

And another one to amdgpu_bo_destroy to printk the value of tbo.

Christian.

>
> Tom
>
>>
>> Am 10.09.2018 um 14:59 schrieb Tom St Denis:
>>> Hi Christian,
>>>
>>> Are you adding new traces or turning on existing ones?  Would you 
>>> like me to try them out in my setup?
>>>
>>> Tom
>>>
>>> On 2018-09-10 8:49 a.m., Christian König wrote:
>>>> Am 10.09.2018 um 14:05 schrieb Huang Rui:
>>>>> On Mon, Sep 10, 2018 at 05:25:48PM +0800, Koenig, Christian wrote:
>>>>>> Am 10.09.2018 um 11:23 schrieb Huang Rui:
>>>>>>> On Mon, Sep 10, 2018 at 11:00:04AM +0200, Christian König wrote:
>>>>>>>> Hi Ray,
>>>>>>>>
>>>>>>>> well those patches doesn't make sense, the pointer is only 
>>>>>>>> local to
>>>>>>>> the function.
>>>>>>> You're right.
>>>>>>> I narrowed it with gdb dump from ttm_bo_bulk_move_lru_tail+0x2b, 
>>>>>>> the
>>>>>>> use-after-free should be in below codes:
>>>>>>>
>>>>>>> man = &bulk->tt[i].first->bdev->man[TTM_PL_TT];
>>>>>>> ttm_bo_bulk_move_helper(&bulk->tt[i], &man->lru[i], false);
>>>>>>>
>>>>>>> Is there a case, when orignal bo is destroyed in the bulk pos, 
>>>>>>> but it
>>>>>>> doesn't update pos->first pointer, then we still use it during 
>>>>>>> the bulk
>>>>>>> moving?
>>>>>> Only when a per VM BO is freed or the VM destroyed.
>>>>>>
>>>>>> The first case should now be handled by "drm/amdgpu: set 
>>>>>> bulk_moveable
>>>>>> to false when a per VM is released" and when we use a destroyed 
>>>>>> VM we
>>>>>> would see other problems as well.
>>>>>>
>>>>> If a VM instance is teared down, all BOs which belong that VM 
>>>>> should be
>>>>> removed from LRU. But how can we submit cmd based on a destroyed 
>>>>> VM? You
>>>>> know, we do the bulk move at last step of submission.
>>>>
>>>> Well exactly that's the point this can't happen :)
>>>>
>>>> Otherwise we would crash because of using freed up memory much 
>>>> earlier in the command submission.
>>>>
>>>> The best idea I have to track this down further is to add some 
>>>> trace_printk in ttm_bo_bulk_move_helper and amdgpu_bo_destroy and 
>>>> see why and when we are actually using a destroyed BO.
>>>>
>>>> Christian.
>>>>
>>>>>
>>>>>
>>>>> Thanks,
>>>>> Ray
>>>>>
>>>>>> BTW: Just pushed this commit to the repository, should show up 
>>>>>> any second.
>>>>>>
>>>>>> Christian.
>>>>>>
>>>>>>> Thanks,
>>>>>>> Ray
>>>>>>>
>>>>>>>> Regards,
>>>>>>>> Christian.
>>>>>>>>
>>>>>>>> Am 10.09.2018 um 10:57 schrieb Huang Rui:
>>>>>>>>> It avoids to be refered again after freed.
>>>>>>>>>
>>>>>>>>> Signed-off-by: Huang Rui <ray.huang@amd.com>
>>>>>>>>> Cc: Christian König <christian.koenig@amd.com>
>>>>>>>>> Cc: Tom StDenis <Tom.StDenis@amd.com>
>>>>>>>>> ---
>>>>>>>>>    drivers/gpu/drm/ttm/ttm_bo.c | 1 +
>>>>>>>>>    1 file changed, 1 insertion(+)
>>>>>>>>>
>>>>>>>>> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c 
>>>>>>>>> b/drivers/gpu/drm/ttm/ttm_bo.c
>>>>>>>>> index 138c989..d3ef5f8 100644
>>>>>>>>> --- a/drivers/gpu/drm/ttm/ttm_bo.c
>>>>>>>>> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
>>>>>>>>> @@ -54,6 +54,7 @@ static struct attribute ttm_bo_count = {
>>>>>>>>>    static void ttm_bo_default_destroy(struct ttm_buffer_object 
>>>>>>>>> *bo)
>>>>>>>>>    {
>>>>>>>>>        kfree(bo);
>>>>>>>>> +    bo = NULL;
>>>>>>>>>    }
>>>>>>>>>    static inline int ttm_mem_type_from_place(const struct 
>>>>>>>>> ttm_place *place,
>>>>>>>> _______________________________________________
>>>>>>>> amd-gfx mailing list
>>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>>> _______________________________________________
>>>>> dri-devel mailing list
>>>>> dri-devel@lists.freedesktop.org
>>>>> https://lists.freedesktop.org/mailman/listinfo/dri-devel
>>>>
>>>
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>
>
Huang Rui Sept. 11, 2018, 7:34 a.m. UTC | #11
On Mon, Sep 10, 2018 at 09:10:00PM +0800, Koenig, Christian wrote:
> Am 10.09.2018 um 15:05 schrieb Tom St Denis:
> > On 2018-09-10 9:04 a.m., Christian König wrote:
> >> Hi Tom,
> >>
> >> I'm talking about adding new printks to figure out what the heck is 
> >> going wrong here.
> >>
> >> Thanks,
> >> Christian.
> >
> > Hi Christian,
> >
> > Sure, if you want to send me a simple patch that adds more printk I'll 
> > gladly give it a try (doubly so since my workstation depends on our 
> > staging tree to work properly...).
> 
> Just add a printk to ttm_bo_bulk_move_helper to print pos->first and 
> pos->last.
> 
> And another one to amdgpu_bo_destroy to printk the value of tbo.
> 

Hi Tom,

Could you help to add below traces to check when the bo is freed.

8<---
From 919cabfbf4d202876a510cd51caa9c86cf7c8fd5 Mon Sep 17 00:00:00 2001
From: Huang Rui <ray.huang@amd.com>
Date: Tue, 11 Sep 2018 15:24:27 +0800
Subject: [PATCH] drm/amdgpu: add traces for lru bulk move

Signed-off-by: Huang Rui <ray.huang@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c |  2 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h  | 47 ++++++++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c     |  1 +
 3 files changed, 50 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index de990bd..ce28326 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -89,6 +89,8 @@ static void amdgpu_bo_destroy(struct ttm_buffer_object *tbo)
 	struct amdgpu_device *adev = amdgpu_ttm_adev(tbo->bdev);
 	struct amdgpu_bo *bo = ttm_to_amdgpu_bo(tbo);
 
+	trace_amdgpu_bo_destroy(bo);
+
 	if (bo->pin_count > 0)
 		amdgpu_bo_subtract_pin_size(bo);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
index 2e87414..5d93431 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
@@ -383,6 +383,53 @@ TRACE_EVENT(amdgpu_vm_flush,
 		      __entry->vm_hub,__entry->pd_addr)
 );
 
+TRACE_EVENT(amdgpu_vm_lru_bulk_move,
+	    TP_PROTO(struct amdgpu_vm *vm,
+		     struct amdgpu_bo *bo),
+	    TP_ARGS(vm, bo),
+	    TP_STRUCT__entry(
+			     __field(struct amdgpu_bo *, bo)
+			     __field(u32, mem_type)
+			     __field(struct ttm_buffer_object *, tt_first)
+			     __field(struct ttm_buffer_object *, tt_last)
+			     __field(struct ttm_buffer_object *, vram_first)
+			     __field(struct ttm_buffer_object *, vram_last)
+			     __field(struct ttm_buffer_object *, swap_first)
+			     __field(struct ttm_buffer_object *, swap_last)
+			     ),
+
+	    TP_fast_assign(
+			   __entry->bo = bo;
+			   __entry->mem_type = bo->tbo.mem.mem_type;
+			   __entry->tt_first = vm->lru_bulk_move.tt[bo->tbo.priority].first;
+			   __entry->tt_last = vm->lru_bulk_move.tt[bo->tbo.priority].last;
+			   __entry->vram_first = vm->lru_bulk_move.vram[bo->tbo.priority].first;
+			   __entry->vram_last = vm->lru_bulk_move.vram[bo->tbo.priority].last;
+			   __entry->swap_first = vm->lru_bulk_move.swap[bo->tbo.priority].first;
+			   __entry->swap_last = vm->lru_bulk_move.swap[bo->tbo.priority].last;
+			   ),
+	    TP_printk("bo=%p, mem_type=%d, tt_first=%p, tt_last=%p, vram_first=%p, vram_last=%p, swap_first=%p, swap_last=%p",
+		      __entry->bo, __entry->mem_type,
+		      __entry->tt_first, __entry->tt_last,
+		      __entry->vram_first, __entry->vram_last,
+		      __entry->swap_first, __entry->swap_last)
+);
+
+TRACE_EVENT(amdgpu_bo_destroy,
+	    TP_PROTO(struct amdgpu_bo *bo),
+	    TP_ARGS(bo),
+	    TP_STRUCT__entry(
+			     __field(struct amdgpu_bo *, bo)
+			     __field(struct ttm_buffer_object *, tbo)
+			     ),
+
+	    TP_fast_assign(
+			   __entry->bo = bo;
+			   __entry->tbo = &bo->tbo;
+			   ),
+	    TP_printk("bo=%p, tbo=%p", __entry->bo, __entry->tbo)
+);
+
 DECLARE_EVENT_CLASS(amdgpu_pasid,
 	    TP_PROTO(unsigned pasid),
 	    TP_ARGS(pasid),
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index ab95a9c..351bc58 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -391,6 +391,7 @@ void amdgpu_vm_move_to_lru_tail(struct amdgpu_device *adev,
 			continue;
 
 		ttm_bo_move_to_lru_tail(&bo->tbo, &vm->lru_bulk_move);
+		trace_amdgpu_vm_lru_bulk_move(vm, bo);
 		if (bo->shadow)
 			ttm_bo_move_to_lru_tail(&bo->shadow->tbo,
 						&vm->lru_bulk_move);
diff mbox series

Patch

diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index 138c989..d3ef5f8 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -54,6 +54,7 @@  static struct attribute ttm_bo_count = {
 static void ttm_bo_default_destroy(struct ttm_buffer_object *bo)
 {
 	kfree(bo);
+	bo = NULL;
 }
 
 static inline int ttm_mem_type_from_place(const struct ttm_place *place,