[RFC,v4,02/11] drm/amdgpu: Move scheduler init to after XGMI is ready

Message ID 20220209002320.6077-3-andrey.grodzovsky@amd.com (mailing list archive)
State New, archived
Series: Define and use reset domain for GPU recovery in amdgpu

Commit Message

Andrey Grodzovsky Feb. 9, 2022, 12:23 a.m. UTC
Before we initialize the schedulers we must know which reset
domain we are in - for a single device there is a single
domain per device and so a single wq per device. For XGMI
the reset domain spans the entire XGMI hive and so the
reset wq is per hive.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 45 ++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  | 34 ++--------------
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h   |  2 +
 3 files changed, 51 insertions(+), 30 deletions(-)
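
As background for the commit message above, the sketch below illustrates the workqueue selection the series is driving at: a standalone device gets its own reset workqueue, while every device in an XGMI hive shares the hive-wide one. This is a minimal illustration only - the helper name, the hive parameter and the exact field layout are assumptions for the example, not the driver's actual code:

	/* Sketch only - assumes kernel context; names are illustrative. */
	static int amdgpu_device_pick_reset_wq(struct amdgpu_device *adev,
					       struct amdgpu_hive_info *hive)
	{
		if (hive) {
			/* XGMI: all devices in the hive share one reset wq */
			adev->reset_domain.wq = hive->reset_domain.wq;
		} else {
			/* standalone device: private, single-threaded reset wq */
			adev->reset_domain.wq =
				alloc_ordered_workqueue("amdgpu-reset-dev", 0);
			if (!adev->reset_domain.wq)
				return -ENOMEM;
		}
		return 0;
	}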

Comments

Christian König Feb. 9, 2022, 7:53 a.m. UTC | #1
Am 09.02.22 um 01:23 schrieb Andrey Grodzovsky:
> Before we initialize the schedulers we must know which reset
> domain we are in - for a single device there is a single
> domain per device and so a single wq per device. For XGMI
> the reset domain spans the entire XGMI hive and so the
> reset wq is per hive.
>
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>

One more comment below; with that fixed, Reviewed-by: Christian König 
<christian.koenig@amd.com>.

> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 45 ++++++++++++++++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  | 34 ++--------------
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h   |  2 +
>   3 files changed, 51 insertions(+), 30 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 9704b0e1fd82..00123b0013d3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2287,6 +2287,47 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
>   	return r;
>   }
>   
> +static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
> +{
> +	long timeout;
> +	int r, i;
> +
> +	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> +		struct amdgpu_ring *ring = adev->rings[i];
> +
> +		/* No need to setup the GPU scheduler for rings that don't need it */
> +		if (!ring || ring->no_scheduler)
> +			continue;
> +
> +		switch (ring->funcs->type) {
> +		case AMDGPU_RING_TYPE_GFX:
> +			timeout = adev->gfx_timeout;
> +			break;
> +		case AMDGPU_RING_TYPE_COMPUTE:
> +			timeout = adev->compute_timeout;
> +			break;
> +		case AMDGPU_RING_TYPE_SDMA:
> +			timeout = adev->sdma_timeout;
> +			break;
> +		default:
> +			timeout = adev->video_timeout;
> +			break;
> +		}
> +
> +		r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
> +				   ring->num_hw_submission, amdgpu_job_hang_limit,
> +				   timeout, adev->reset_domain.wq, ring->sched_score, ring->name);
> +		if (r) {
> +			DRM_ERROR("Failed to create scheduler on ring %s.\n",
> +				  ring->name);
> +			return r;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
> +
>   /**
>    * amdgpu_device_ip_init - run init for hardware IPs
>    *
> @@ -2419,6 +2460,10 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
>   		}
>   	}
>   
> +	r = amdgpu_device_init_schedulers(adev);
> +	if (r)
> +		goto init_failed;
> +
>   	/* Don't init kfd if whole hive need to be reset during init */
>   	if (!adev->gmc.xgmi.pending_reset)
>   		amdgpu_amdkfd_device_init(adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> index 45977a72b5dd..fa302540c69a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> @@ -457,8 +457,6 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
>   				  atomic_t *sched_score)
>   {
>   	struct amdgpu_device *adev = ring->adev;
> -	long timeout;
> -	int r;
>   
>   	if (!adev)
>   		return -EINVAL;
> @@ -478,36 +476,12 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
>   	spin_lock_init(&ring->fence_drv.lock);
>   	ring->fence_drv.fences = kcalloc(num_hw_submission * 2, sizeof(void *),
>   					 GFP_KERNEL);
> -	if (!ring->fence_drv.fences)
> -		return -ENOMEM;
>   
> -	/* No need to setup the GPU scheduler for rings that don't need it */
> -	if (ring->no_scheduler)
> -		return 0;
> +	ring->num_hw_submission = num_hw_submission;
> +	ring->sched_score = sched_score;

Let's move this into the caller and then use ring->num_hw_submission in 
the fence code as well.

The maximum number of jobs on the ring is not really fence specific.

Regards,
Christian.

> [remainder of quoted patch snipped]
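
Concretely, Christian's suggestion amounts to something like the sketch below: the caller stores the submission count and score on the ring before calling the fence init, and the fence code reads them back. This is a rough sketch of the suggested direction, with the caller site and final function signature assumed rather than taken from the committed code:

	/* caller side (ring setup code - assumed location), before fence init */
	ring->num_hw_submission = num_hw_submission;
	ring->sched_score = sched_score;
	r = amdgpu_fence_driver_init_ring(ring);
	if (r)
		return r;

	/* fence side: read the ring field instead of taking a parameter */
	ring->fence_drv.fences = kcalloc(ring->num_hw_submission * 2,
					 sizeof(void *), GFP_KERNEL);
	if (!ring->fence_drv.fences)
		return -ENOMEM;
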
Andrey Grodzovsky Feb. 9, 2022, 6:06 p.m. UTC | #2
All comments are fixed and the code is pushed. Thanks to everyone
who helped with the review.

Andrey

On 2022-02-09 02:53, Christian König wrote:
> One more comment below; with that fixed, Reviewed-by: Christian König
> <christian.koenig@amd.com>.
> [remainder of quoted thread snipped]
JingWen Chen Feb. 24, 2022, 6:47 a.m. UTC | #3
Hi Andrey,

Will you port this patch into amd-staging-drm-next?

on 2/10/22 2:06 AM, Andrey Grodzovsky wrote:
> All comments are fixed and the code is pushed. Thanks to everyone
> who helped with the review.
> [remainder of quoted thread snipped]
Andrey Grodzovsky Feb. 24, 2022, 4:24 p.m. UTC | #4
No, because the whole patch set, including this patch, landed in
drm-misc-next and will reach amd-staging-drm-next on the next upstream
rebase, I guess.

Andrey

On 2022-02-24 01:47, JingWen Chen wrote:
> Hi Andrey,
>
> Will you port this patch into amd-staging-drm-next?
> [remainder of quoted thread snipped]
Alex Deucher Feb. 24, 2022, 5:12 p.m. UTC | #5
[Public]

If it applies cleanly, feel free to drop it in.  I'll drop those patches for drm-next since they are already in drm-misc.

Alex
Andrey Grodzovsky Feb. 24, 2022, 5:26 p.m. UTC | #6
No problem if so, but before I do:

JingWen - why do you think this patch is needed as a standalone now? It
has no use without the entire feature around it. Are there some changes
you want to make on top of that code?


Andrey


On 2022-02-24 12:12, Deucher, Alexander wrote:
>
> [Public]
>
> If it applies cleanly, feel free to drop it in. I'll drop those
> patches for drm-next since they are already in drm-misc.
>
> Alex
> [remainder of quoted thread snipped]
JingWen Chen Feb. 25, 2022, 3:13 a.m. UTC | #7
Hi Andrey,

Sorry for being misleading - I meant the whole patch series. We depend on
this series to fix the concurrency issue within the SRIOV TDR sequence.

On 2/25/22 1:26 AM, Andrey Grodzovsky wrote:
> No problem if so, but before I do:
>
> JingWen - why do you think this patch is needed as a standalone now? It
> has no use without the entire feature around it. Are there some changes
> you want to make on top of that code?
> [remainder of quoted thread snipped]

Patch

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 9704b0e1fd82..00123b0013d3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2287,6 +2287,47 @@  static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
 	return r;
 }
 
+static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
+{
+	long timeout;
+	int r, i;
+
+	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
+		struct amdgpu_ring *ring = adev->rings[i];
+
+		/* No need to setup the GPU scheduler for rings that don't need it */
+		if (!ring || ring->no_scheduler)
+			continue;
+
+		switch (ring->funcs->type) {
+		case AMDGPU_RING_TYPE_GFX:
+			timeout = adev->gfx_timeout;
+			break;
+		case AMDGPU_RING_TYPE_COMPUTE:
+			timeout = adev->compute_timeout;
+			break;
+		case AMDGPU_RING_TYPE_SDMA:
+			timeout = adev->sdma_timeout;
+			break;
+		default:
+			timeout = adev->video_timeout;
+			break;
+		}
+
+		r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
+				   ring->num_hw_submission, amdgpu_job_hang_limit,
+				   timeout, adev->reset_domain.wq, ring->sched_score, ring->name);
+		if (r) {
+			DRM_ERROR("Failed to create scheduler on ring %s.\n",
+				  ring->name);
+			return r;
+		}
+	}
+
+	return 0;
+}
+
+
 /**
  * amdgpu_device_ip_init - run init for hardware IPs
  *
@@ -2419,6 +2460,10 @@  static int amdgpu_device_ip_init(struct amdgpu_device *adev)
 		}
 	}
 
+	r = amdgpu_device_init_schedulers(adev);
+	if (r)
+		goto init_failed;
+
 	/* Don't init kfd if whole hive need to be reset during init */
 	if (!adev->gmc.xgmi.pending_reset)
 		amdgpu_amdkfd_device_init(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 45977a72b5dd..fa302540c69a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -457,8 +457,6 @@  int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
 				  atomic_t *sched_score)
 {
 	struct amdgpu_device *adev = ring->adev;
-	long timeout;
-	int r;
 
 	if (!adev)
 		return -EINVAL;
@@ -478,36 +476,12 @@  int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
 	spin_lock_init(&ring->fence_drv.lock);
 	ring->fence_drv.fences = kcalloc(num_hw_submission * 2, sizeof(void *),
 					 GFP_KERNEL);
-	if (!ring->fence_drv.fences)
-		return -ENOMEM;
 
-	/* No need to setup the GPU scheduler for rings that don't need it */
-	if (ring->no_scheduler)
-		return 0;
+	ring->num_hw_submission = num_hw_submission;
+	ring->sched_score = sched_score;
 
-	switch (ring->funcs->type) {
-	case AMDGPU_RING_TYPE_GFX:
-		timeout = adev->gfx_timeout;
-		break;
-	case AMDGPU_RING_TYPE_COMPUTE:
-		timeout = adev->compute_timeout;
-		break;
-	case AMDGPU_RING_TYPE_SDMA:
-		timeout = adev->sdma_timeout;
-		break;
-	default:
-		timeout = adev->video_timeout;
-		break;
-	}
-
-	r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
-			   num_hw_submission, amdgpu_job_hang_limit,
-			   timeout, NULL, sched_score, ring->name);
-	if (r) {
-		DRM_ERROR("Failed to create scheduler on ring %s.\n",
-			  ring->name);
-		return r;
-	}
+	if (!ring->fence_drv.fences)
+		return -ENOMEM;
 
 	return 0;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index fae7d185ad0d..7f20ce73a243 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -251,6 +251,8 @@  struct amdgpu_ring {
 	bool			has_compute_vm_bug;
 	bool			no_scheduler;
 	int			hw_prio;
+	unsigned 		num_hw_submission;
+	atomic_t		*sched_score;
 };
 
 #define amdgpu_ring_parse_cs(r, p, ib) ((r)->funcs->parse_cs((p), (ib)))
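
A note on why the shared workqueue matters here: drm_sched runs its job-timeout handling on the workqueue passed as the timeout_wq argument of drm_sched_init() (NULL falls back to the system workqueue). If every scheduler in a reset domain registers the same ordered (single-threaded) workqueue, their GPU-reset handlers are serialized by construction. A minimal sketch, assuming kernel context and an illustrative workqueue name:

	/* one ordered wq per reset domain - shared by every ring in an XGMI hive */
	struct workqueue_struct *reset_wq =
		alloc_ordered_workqueue("amdgpu-reset-domain", 0);

	/* each scheduler in the domain registers the same timeout wq */
	r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
			   ring->num_hw_submission, amdgpu_job_hang_limit,
			   timeout, reset_wq, ring->sched_score, ring->name);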