diff mbox series

[v2,7/8] drm/amdgpu: Fix sdma code crash post device unplug

Message ID 1592719388-13819-8-git-send-email-andrey.grodzovsky@amd.com (mailing list archive)
State New, archived
Headers show
Series RFC Support hot device unplug in amdgpu | expand

Commit Message

Andrey Grodzovsky June 21, 2020, 6:03 a.m. UTC
entity->rq becomes NULL after the device is unplugged, so just return early
in that case.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

Comments

Daniel Vetter June 22, 2020, 9:55 a.m. UTC | #1
On Sun, Jun 21, 2020 at 02:03:07AM -0400, Andrey Grodzovsky wrote:
> entity->rq becomes null aftre device unplugged so just return early
> in that case.
> 
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>

That looks very deep in amdgpu internals ... how do you even get in here
after the device is fully unplugged on the sw side?

Is this amdkfd doing something stupid because it is entirely unaware of what
amdgpu has done? Something else? It feels like this is just duct-taping
over a more fundamental problem: after hotunplug no one should be able to
even submit anything new, or do bo moves, or well anything really.
-Daniel

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c | 21 ++++++++++++++++-----
>  1 file changed, 16 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
> index 8d9c6fe..d252427 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
> @@ -24,6 +24,7 @@
>  #include "amdgpu_job.h"
>  #include "amdgpu_object.h"
>  #include "amdgpu_trace.h"
> +#include <drm/drm_drv.h>
>  
>  #define AMDGPU_VM_SDMA_MIN_NUM_DW	256u
>  #define AMDGPU_VM_SDMA_MAX_NUM_DW	(16u * 1024u)
> @@ -94,7 +95,12 @@ static int amdgpu_vm_sdma_commit(struct amdgpu_vm_update_params *p,
>  	struct drm_sched_entity *entity;
>  	struct amdgpu_ring *ring;
>  	struct dma_fence *f;
> -	int r;
> +	int r, idx;
> +
> +	if (!drm_dev_enter(p->adev->ddev, &idx)) {
> +		r = -ENODEV;
> +		goto nodev;
> +	}
>  
>  	entity = p->immediate ? &p->vm->immediate : &p->vm->delayed;
>  	ring = container_of(entity->rq->sched, struct amdgpu_ring, sched);
> @@ -104,7 +110,7 @@ static int amdgpu_vm_sdma_commit(struct amdgpu_vm_update_params *p,
>  	WARN_ON(ib->length_dw > p->num_dw_left);
>  	r = amdgpu_job_submit(p->job, entity, AMDGPU_FENCE_OWNER_VM, &f);
>  	if (r)
> -		goto error;
> +		goto job_fail;
>  
>  	if (p->unlocked) {
>  		struct dma_fence *tmp = dma_fence_get(f);
> @@ -118,10 +124,15 @@ static int amdgpu_vm_sdma_commit(struct amdgpu_vm_update_params *p,
>  	if (fence && !p->immediate)
>  		swap(*fence, f);
>  	dma_fence_put(f);
> -	return 0;
>  
> -error:
> -	amdgpu_job_free(p->job);
> +	r = 0;
> +
> +job_fail:
> +	drm_dev_exit(idx);
> +nodev:
> +	if (r)
> +		amdgpu_job_free(p->job);
> +
>  	return r;
>  }
>  
> -- 
> 2.7.4
>
Christian König June 22, 2020, 7:40 p.m. UTC | #2
Am 21.06.20 um 08:03 schrieb Andrey Grodzovsky:
> entity->rq becomes null aftre device unplugged so just return early
> in that case.

Mhm, do you have a backtrace for this?

This should only be called by an IOCTL and IOCTLs should already call 
drm_dev_enter()/exit() on their own...

Christian.

>
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c | 21 ++++++++++++++++-----
>   1 file changed, 16 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
> index 8d9c6fe..d252427 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
> @@ -24,6 +24,7 @@
>   #include "amdgpu_job.h"
>   #include "amdgpu_object.h"
>   #include "amdgpu_trace.h"
> +#include <drm/drm_drv.h>
>   
>   #define AMDGPU_VM_SDMA_MIN_NUM_DW	256u
>   #define AMDGPU_VM_SDMA_MAX_NUM_DW	(16u * 1024u)
> @@ -94,7 +95,12 @@ static int amdgpu_vm_sdma_commit(struct amdgpu_vm_update_params *p,
>   	struct drm_sched_entity *entity;
>   	struct amdgpu_ring *ring;
>   	struct dma_fence *f;
> -	int r;
> +	int r, idx;
> +
> +	if (!drm_dev_enter(p->adev->ddev, &idx)) {
> +		r = -ENODEV;
> +		goto nodev;
> +	}
>   
>   	entity = p->immediate ? &p->vm->immediate : &p->vm->delayed;
>   	ring = container_of(entity->rq->sched, struct amdgpu_ring, sched);
> @@ -104,7 +110,7 @@ static int amdgpu_vm_sdma_commit(struct amdgpu_vm_update_params *p,
>   	WARN_ON(ib->length_dw > p->num_dw_left);
>   	r = amdgpu_job_submit(p->job, entity, AMDGPU_FENCE_OWNER_VM, &f);
>   	if (r)
> -		goto error;
> +		goto job_fail;
>   
>   	if (p->unlocked) {
>   		struct dma_fence *tmp = dma_fence_get(f);
> @@ -118,10 +124,15 @@ static int amdgpu_vm_sdma_commit(struct amdgpu_vm_update_params *p,
>   	if (fence && !p->immediate)
>   		swap(*fence, f);
>   	dma_fence_put(f);
> -	return 0;
>   
> -error:
> -	amdgpu_job_free(p->job);
> +	r = 0;
> +
> +job_fail:
> +	drm_dev_exit(idx);
> +nodev:
> +	if (r)
> +		amdgpu_job_free(p->job);
> +
>   	return r;
>   }
>
Andrey Grodzovsky June 23, 2020, 5:11 a.m. UTC | #3
On 6/22/20 3:40 PM, Christian König wrote:
> Am 21.06.20 um 08:03 schrieb Andrey Grodzovsky:
>> entity->rq becomes null aftre device unplugged so just return early
>> in that case.
>
> Mhm, do you have a backtrace for this?
>
> This should only be called by an IOCTL and IOCTLs should already call 
> drm_dev_enter()/exit() on their own...
>
> Christian.


See below; it's not during an IOCTL but during the release of all GEM objects
when releasing the device. entity->rq becomes NULL because all the GPU schedulers
are marked as not ready during the early PCI remove stage, so the next time an
SDMA job tries to pick a scheduler to run on, nothing is available and it's set to NULL.

Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382648] BUG: kernel NULL pointer 
dereference, address: 0000000000000038
Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382651] #PF: supervisor read 
access in kernel mode
Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382652] #PF: error_code(0x0000) 
- not-present page
Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382653] PGD 0 P4D 0
Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382656] Oops: 0000 [#1] SMP PTI
Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382658] CPU: 6 PID: 2598 Comm: 
llvmpipe-6 Tainted: G           OE     5.6.0-dev+ #51
Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382659] Hardware name: System 
manufacturer System Product Name/RAMPAGE IV FORMULA, BIOS 4804 12/30/2013
Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382700] RIP: 
0010:amdgpu_vm_sdma_commit+0x6c/0x270 [amdgpu]
Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382702] Code: 01 00 00 48 89 ee 
48 c7 c7 ef d4 85 c0 e8 fc 5f e8 ff 48 8b 75 10 48 c7 c7 fd d4 85 c0 e8 ec 5f e8 
ff 48 8b 45 10 41 8b 55 08 <48> 8b 40 38 85 d2 48 8d b8 30 ff ff ff 0f 84 9b 01 
00 00 48 8b 80
Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382704] RSP: 
0018:ffffa88e40f57950 EFLAGS: 00010282
Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382705] RAX: 0000000000000000 
RBX: ffffa88e40f579a8 RCX: 0000000000000001
Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382707] RDX: 0000000000000014 
RSI: ffff94d4d62388e0 RDI: ffff94d4dbd98e30
Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382708] RBP: ffff94d4d2ad3288 
R08: 0000000000000000 R09: 0000000000000001
Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382709] R10: 000000000000001f 
R11: 0000000000000000 R12: ffffa88e40f57a48
Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382710] R13: ffff94d4d627a5e8 
R14: ffff94d4d424d978 R15: 0000000800100020
Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382712] FS: 
00007f30ae694700(0000) GS:ffff94d4dbd80000(0000) knlGS:0000000000000000
Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382713] CS:  0010 DS: 0000 ES: 
0000 CR0: 0000000080050033
Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382714] CR2: 0000000000000038 
CR3: 0000000121810006 CR4: 00000000000606e0
Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382716] Call Trace:
Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382755] 
amdgpu_vm_bo_update_mapping.constprop.30+0x16b/0x230 [amdgpu]
Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382795] 
amdgpu_vm_clear_freed+0xd7/0x210 [amdgpu]
Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382833] 
amdgpu_gem_object_close+0x200/0x2b0 [amdgpu]
Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382856]  ? 
drm_gem_object_handle_put_unlocked+0x90/0x90 [drm]
Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382864]  ? 
drm_gem_object_release_handle+0x2c/0x90 [drm]
Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382872] 
drm_gem_object_release_handle+0x2c/0x90 [drm]
Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382879]  ? 
drm_gem_object_handle_put_unlocked+0x90/0x90 [drm]
Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382882] idr_for_each+0x48/0xd0
Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382885]  ? 
_raw_spin_unlock_irqrestore+0x2d/0x50
Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382893] 
drm_gem_release+0x1c/0x30 [drm]
Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382901] 
drm_file_free+0x21d/0x270 [drm]
Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382908] drm_release+0x67/0xe0 [drm]
Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382912] __fput+0xc6/0x260
Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382916] task_work_run+0x79/0xb0
Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382919] do_exit+0x3d0/0xc40
Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382921]  ? get_signal+0x13d/0xc30
Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382924] do_group_exit+0x47/0xb0
Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382926] get_signal+0x18b/0xc30
Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382929] do_signal+0x36/0x6a0
Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382931]  ? 
__set_task_comm+0x62/0x120
Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382935]  ? 
__x64_sys_futex+0x88/0x180
Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382938] 
exit_to_usermode_loop+0x6f/0xc0
Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382941] do_syscall_64+0x149/0x1c0
Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382943] 
entry_SYSCALL_64_after_hwframe+0x49/0xbe
Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382944] RIP: 0033:0x7f30f7f35360
Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382947] Code: Bad RIP value.


Andrey


>
>>
>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c | 21 ++++++++++++++++-----
>>   1 file changed, 16 insertions(+), 5 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
>> index 8d9c6fe..d252427 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
>> @@ -24,6 +24,7 @@
>>   #include "amdgpu_job.h"
>>   #include "amdgpu_object.h"
>>   #include "amdgpu_trace.h"
>> +#include <drm/drm_drv.h>
>>     #define AMDGPU_VM_SDMA_MIN_NUM_DW    256u
>>   #define AMDGPU_VM_SDMA_MAX_NUM_DW    (16u * 1024u)
>> @@ -94,7 +95,12 @@ static int amdgpu_vm_sdma_commit(struct 
>> amdgpu_vm_update_params *p,
>>       struct drm_sched_entity *entity;
>>       struct amdgpu_ring *ring;
>>       struct dma_fence *f;
>> -    int r;
>> +    int r, idx;
>> +
>> +    if (!drm_dev_enter(p->adev->ddev, &idx)) {
>> +        r = -ENODEV;
>> +        goto nodev;
>> +    }
>>         entity = p->immediate ? &p->vm->immediate : &p->vm->delayed;
>>       ring = container_of(entity->rq->sched, struct amdgpu_ring, sched);
>> @@ -104,7 +110,7 @@ static int amdgpu_vm_sdma_commit(struct 
>> amdgpu_vm_update_params *p,
>>       WARN_ON(ib->length_dw > p->num_dw_left);
>>       r = amdgpu_job_submit(p->job, entity, AMDGPU_FENCE_OWNER_VM, &f);
>>       if (r)
>> -        goto error;
>> +        goto job_fail;
>>         if (p->unlocked) {
>>           struct dma_fence *tmp = dma_fence_get(f);
>> @@ -118,10 +124,15 @@ static int amdgpu_vm_sdma_commit(struct 
>> amdgpu_vm_update_params *p,
>>       if (fence && !p->immediate)
>>           swap(*fence, f);
>>       dma_fence_put(f);
>> -    return 0;
>>   -error:
>> -    amdgpu_job_free(p->job);
>> +    r = 0;
>> +
>> +job_fail:
>> +    drm_dev_exit(idx);
>> +nodev:
>> +    if (r)
>> +        amdgpu_job_free(p->job);
>> +
>>       return r;
>>   }
>
Christian König June 23, 2020, 7:14 a.m. UTC | #4
Am 23.06.20 um 07:11 schrieb Andrey Grodzovsky:
>
> On 6/22/20 3:40 PM, Christian König wrote:
>> Am 21.06.20 um 08:03 schrieb Andrey Grodzovsky:
>>> entity->rq becomes null aftre device unplugged so just return early
>>> in that case.
>>
>> Mhm, do you have a backtrace for this?
>>
>> This should only be called by an IOCTL and IOCTLs should already call 
>> drm_dev_enter()/exit() on their own...
>>
>> Christian.
>
>
> See bellow, it's not during IOCTL but during all GEM objects release 
> when releasing the device. entity->rq becomes null because all the gpu 
> schedulers are marked as not ready during the early pci remove stage 
> and so the next time sdma job tries to pick a scheduler to run nothing 
> is available and it's set to null.

I see. This should then probably go into amdgpu_gem_object_close() 
before we reserve the PD.

See, drm_dev_enter()/exit() are kind of a read-side lock, and with this we
create a nice lock inversion when we do it in the low-level SDMA VM backend.

Christian.

>
> Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382648] BUG: kernel 
> NULL pointer dereference, address: 0000000000000038
> Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382651] #PF: 
> supervisor read access in kernel mode
> Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382652] #PF: 
> error_code(0x0000) - not-present page
> Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382653] PGD 0 P4D 0
> Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382656] Oops: 0000 
> [#1] SMP PTI
> Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382658] CPU: 6 PID: 
> 2598 Comm: llvmpipe-6 Tainted: G           OE     5.6.0-dev+ #51
> Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382659] Hardware name: 
> System manufacturer System Product Name/RAMPAGE IV FORMULA, BIOS 4804 
> 12/30/2013
> Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382700] RIP: 
> 0010:amdgpu_vm_sdma_commit+0x6c/0x270 [amdgpu]
> Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382702] Code: 01 00 00 
> 48 89 ee 48 c7 c7 ef d4 85 c0 e8 fc 5f e8 ff 48 8b 75 10 48 c7 c7 fd 
> d4 85 c0 e8 ec 5f e8 ff 48 8b 45 10 41 8b 55 08 <48> 8b 40 38 85 d2 48 
> 8d b8 30 ff ff ff 0f 84 9b 01 00 00 48 8b 80
> Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382704] RSP: 
> 0018:ffffa88e40f57950 EFLAGS: 00010282
> Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382705] RAX: 
> 0000000000000000 RBX: ffffa88e40f579a8 RCX: 0000000000000001
> Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382707] RDX: 
> 0000000000000014 RSI: ffff94d4d62388e0 RDI: ffff94d4dbd98e30
> Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382708] RBP: 
> ffff94d4d2ad3288 R08: 0000000000000000 R09: 0000000000000001
> Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382709] R10: 
> 000000000000001f R11: 0000000000000000 R12: ffffa88e40f57a48
> Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382710] R13: 
> ffff94d4d627a5e8 R14: ffff94d4d424d978 R15: 0000000800100020
> Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382712] FS: 
> 00007f30ae694700(0000) GS:ffff94d4dbd80000(0000) knlGS:0000000000000000
> Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382713] CS:  0010 DS: 
> 0000 ES: 0000 CR0: 0000000080050033
> Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382714] CR2: 
> 0000000000000038 CR3: 0000000121810006 CR4: 00000000000606e0
> Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382716] Call Trace:
> Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382755] 
> amdgpu_vm_bo_update_mapping.constprop.30+0x16b/0x230 [amdgpu]
> Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382795] 
> amdgpu_vm_clear_freed+0xd7/0x210 [amdgpu]
> Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382833] 
> amdgpu_gem_object_close+0x200/0x2b0 [amdgpu]
> Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382856]  ? 
> drm_gem_object_handle_put_unlocked+0x90/0x90 [drm]
> Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382864]  ? 
> drm_gem_object_release_handle+0x2c/0x90 [drm]
> Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382872] 
> drm_gem_object_release_handle+0x2c/0x90 [drm]
> Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382879]  ? 
> drm_gem_object_handle_put_unlocked+0x90/0x90 [drm]
> Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382882] 
> idr_for_each+0x48/0xd0
> Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382885]  ? 
> _raw_spin_unlock_irqrestore+0x2d/0x50
> Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382893] 
> drm_gem_release+0x1c/0x30 [drm]
> Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382901] 
> drm_file_free+0x21d/0x270 [drm]
> Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382908] 
> drm_release+0x67/0xe0 [drm]
> Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382912] __fput+0xc6/0x260
> Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382916] 
> task_work_run+0x79/0xb0
> Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382919] 
> do_exit+0x3d0/0xc40
> Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382921]  ? 
> get_signal+0x13d/0xc30
> Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382924] 
> do_group_exit+0x47/0xb0
> Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382926] 
> get_signal+0x18b/0xc30
> Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382929] 
> do_signal+0x36/0x6a0
> Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382931]  ? 
> __set_task_comm+0x62/0x120
> Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382935]  ? 
> __x64_sys_futex+0x88/0x180
> Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382938] 
> exit_to_usermode_loop+0x6f/0xc0
> Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382941] 
> do_syscall_64+0x149/0x1c0
> Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382943] 
> entry_SYSCALL_64_after_hwframe+0x49/0xbe
> Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382944] RIP: 
> 0033:0x7f30f7f35360
> Jun  8 11:14:56 ubuntu-1604-test kernel: [   44.382947] Code: Bad RIP 
> value.
>
>
> Andrey
>
>
>>
>>>
>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c | 21 
>>> ++++++++++++++++-----
>>>   1 file changed, 16 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
>>> index 8d9c6fe..d252427 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
>>> @@ -24,6 +24,7 @@
>>>   #include "amdgpu_job.h"
>>>   #include "amdgpu_object.h"
>>>   #include "amdgpu_trace.h"
>>> +#include <drm/drm_drv.h>
>>>     #define AMDGPU_VM_SDMA_MIN_NUM_DW    256u
>>>   #define AMDGPU_VM_SDMA_MAX_NUM_DW    (16u * 1024u)
>>> @@ -94,7 +95,12 @@ static int amdgpu_vm_sdma_commit(struct 
>>> amdgpu_vm_update_params *p,
>>>       struct drm_sched_entity *entity;
>>>       struct amdgpu_ring *ring;
>>>       struct dma_fence *f;
>>> -    int r;
>>> +    int r, idx;
>>> +
>>> +    if (!drm_dev_enter(p->adev->ddev, &idx)) {
>>> +        r = -ENODEV;
>>> +        goto nodev;
>>> +    }
>>>         entity = p->immediate ? &p->vm->immediate : &p->vm->delayed;
>>>       ring = container_of(entity->rq->sched, struct amdgpu_ring, 
>>> sched);
>>> @@ -104,7 +110,7 @@ static int amdgpu_vm_sdma_commit(struct 
>>> amdgpu_vm_update_params *p,
>>>       WARN_ON(ib->length_dw > p->num_dw_left);
>>>       r = amdgpu_job_submit(p->job, entity, AMDGPU_FENCE_OWNER_VM, &f);
>>>       if (r)
>>> -        goto error;
>>> +        goto job_fail;
>>>         if (p->unlocked) {
>>>           struct dma_fence *tmp = dma_fence_get(f);
>>> @@ -118,10 +124,15 @@ static int amdgpu_vm_sdma_commit(struct 
>>> amdgpu_vm_update_params *p,
>>>       if (fence && !p->immediate)
>>>           swap(*fence, f);
>>>       dma_fence_put(f);
>>> -    return 0;
>>>   -error:
>>> -    amdgpu_job_free(p->job);
>>> +    r = 0;
>>> +
>>> +job_fail:
>>> +    drm_dev_exit(idx);
>>> +nodev:
>>> +    if (r)
>>> +        amdgpu_job_free(p->job);
>>> +
>>>       return r;
>>>   }
>>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
diff mbox series

Patch

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
index 8d9c6fe..d252427 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
@@ -24,6 +24,7 @@ 
 #include "amdgpu_job.h"
 #include "amdgpu_object.h"
 #include "amdgpu_trace.h"
+#include <drm/drm_drv.h>
 
 #define AMDGPU_VM_SDMA_MIN_NUM_DW	256u
 #define AMDGPU_VM_SDMA_MAX_NUM_DW	(16u * 1024u)
@@ -94,7 +95,12 @@  static int amdgpu_vm_sdma_commit(struct amdgpu_vm_update_params *p,
 	struct drm_sched_entity *entity;
 	struct amdgpu_ring *ring;
 	struct dma_fence *f;
-	int r;
+	int r, idx;
+
+	if (!drm_dev_enter(p->adev->ddev, &idx)) {
+		r = -ENODEV;
+		goto nodev;
+	}
 
 	entity = p->immediate ? &p->vm->immediate : &p->vm->delayed;
 	ring = container_of(entity->rq->sched, struct amdgpu_ring, sched);
@@ -104,7 +110,7 @@  static int amdgpu_vm_sdma_commit(struct amdgpu_vm_update_params *p,
 	WARN_ON(ib->length_dw > p->num_dw_left);
 	r = amdgpu_job_submit(p->job, entity, AMDGPU_FENCE_OWNER_VM, &f);
 	if (r)
-		goto error;
+		goto job_fail;
 
 	if (p->unlocked) {
 		struct dma_fence *tmp = dma_fence_get(f);
@@ -118,10 +124,15 @@  static int amdgpu_vm_sdma_commit(struct amdgpu_vm_update_params *p,
 	if (fence && !p->immediate)
 		swap(*fence, f);
 	dma_fence_put(f);
-	return 0;
 
-error:
-	amdgpu_job_free(p->job);
+	r = 0;
+
+job_fail:
+	drm_dev_exit(idx);
+nodev:
+	if (r)
+		amdgpu_job_free(p->job);
+
 	return r;
 }