diff mbox series

[RFC,v2,8/8] drm/amd/virt: Drop concurrent GPU reset protection for SRIOV

Message ID 20211222221400.790842-4-andrey.grodzovsky@amd.com (mailing list archive)
State New, archived
Headers show
Series Define and use reset domain for GPU recovery in amdgpu | expand

Commit Message

Andrey Grodzovsky Dec. 22, 2021, 10:14 p.m. UTC
Since now flr work is serialized against  GPU resets
there is no need for this.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 11 -----------
 drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 11 -----------
 2 files changed, 22 deletions(-)

Comments

Christian König Dec. 23, 2021, 8:42 a.m. UTC | #1
Am 22.12.21 um 23:14 schrieb Andrey Grodzovsky:
> Since now flr work is serialized against  GPU resets
> there is no need for this.
>
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>

Acked-by: Christian König <christian.koenig@amd.com>

> ---
>   drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 11 -----------
>   drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 11 -----------
>   2 files changed, 22 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> index 487cd654b69e..7d59a66e3988 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> @@ -248,15 +248,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
>   	struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
>   	int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>   
> -	/* block amdgpu_gpu_recover till msg FLR COMPLETE received,
> -	 * otherwise the mailbox msg will be ruined/reseted by
> -	 * the VF FLR.
> -	 */
> -	if (!down_write_trylock(&adev->reset_sem))
> -		return;
> -
>   	amdgpu_virt_fini_data_exchange(adev);
> -	atomic_set(&adev->in_gpu_reset, 1);
>   
>   	xgpu_ai_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0, 0, 0);
>   
> @@ -269,9 +261,6 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
>   	} while (timeout > 1);
>   
>   flr_done:
> -	atomic_set(&adev->in_gpu_reset, 0);
> -	up_write(&adev->reset_sem);
> -
>   	/* Trigger recovery for world switch failure if no TDR */
>   	if (amdgpu_device_should_recover_gpu(adev)
>   		&& (!amdgpu_device_has_job_running(adev) ||
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> index e3869067a31d..f82c066c8e8d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> @@ -277,15 +277,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
>   	struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
>   	int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>   
> -	/* block amdgpu_gpu_recover till msg FLR COMPLETE received,
> -	 * otherwise the mailbox msg will be ruined/reseted by
> -	 * the VF FLR.
> -	 */
> -	if (!down_write_trylock(&adev->reset_sem))
> -		return;
> -
>   	amdgpu_virt_fini_data_exchange(adev);
> -	atomic_set(&adev->in_gpu_reset, 1);
>   
>   	xgpu_nv_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0, 0, 0);
>   
> @@ -298,9 +290,6 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
>   	} while (timeout > 1);
>   
>   flr_done:
> -	atomic_set(&adev->in_gpu_reset, 0);
> -	up_write(&adev->reset_sem);
> -
>   	/* Trigger recovery for world switch failure if no TDR */
>   	if (amdgpu_device_should_recover_gpu(adev)
>   		&& (!amdgpu_device_has_job_running(adev) ||
Liu, Monk Dec. 23, 2021, 10:14 a.m. UTC | #2
[AMD Official Use Only]

@Chen, Horace @Chen, JingWen @Deng, Emily

Please take a review on Andrey's patch 

Thanks 
-------------------------------------------------------------------
Monk Liu | Cloud GPU & Virtualization Solution | AMD
-------------------------------------------------------------------
we are hiring software manager for CVS core team
-------------------------------------------------------------------

-----Original Message-----
From: Koenig, Christian <Christian.Koenig@amd.com> 
Sent: Thursday, December 23, 2021 4:42 PM
To: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
Cc: daniel@ffwll.ch; Liu, Monk <Monk.Liu@amd.com>; Chen, Horace <Horace.Chen@amd.com>
Subject: Re: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset protection for SRIOV

Am 22.12.21 um 23:14 schrieb Andrey Grodzovsky:
> Since now flr work is serialized against  GPU resets there is no need 
> for this.
>
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>

Acked-by: Christian König <christian.koenig@amd.com>

> ---
>   drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 11 -----------
>   drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 11 -----------
>   2 files changed, 22 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c 
> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> index 487cd654b69e..7d59a66e3988 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> @@ -248,15 +248,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
>   	struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
>   	int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>   
> -	/* block amdgpu_gpu_recover till msg FLR COMPLETE received,
> -	 * otherwise the mailbox msg will be ruined/reseted by
> -	 * the VF FLR.
> -	 */
> -	if (!down_write_trylock(&adev->reset_sem))
> -		return;
> -
>   	amdgpu_virt_fini_data_exchange(adev);
> -	atomic_set(&adev->in_gpu_reset, 1);
>   
>   	xgpu_ai_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0, 0, 0);
>   
> @@ -269,9 +261,6 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
>   	} while (timeout > 1);
>   
>   flr_done:
> -	atomic_set(&adev->in_gpu_reset, 0);
> -	up_write(&adev->reset_sem);
> -
>   	/* Trigger recovery for world switch failure if no TDR */
>   	if (amdgpu_device_should_recover_gpu(adev)
>   		&& (!amdgpu_device_has_job_running(adev) || diff --git 
> a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c 
> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> index e3869067a31d..f82c066c8e8d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> @@ -277,15 +277,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
>   	struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
>   	int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>   
> -	/* block amdgpu_gpu_recover till msg FLR COMPLETE received,
> -	 * otherwise the mailbox msg will be ruined/reseted by
> -	 * the VF FLR.
> -	 */
> -	if (!down_write_trylock(&adev->reset_sem))
> -		return;
> -
>   	amdgpu_virt_fini_data_exchange(adev);
> -	atomic_set(&adev->in_gpu_reset, 1);
>   
>   	xgpu_nv_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0, 0, 0);
>   
> @@ -298,9 +290,6 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
>   	} while (timeout > 1);
>   
>   flr_done:
> -	atomic_set(&adev->in_gpu_reset, 0);
> -	up_write(&adev->reset_sem);
> -
>   	/* Trigger recovery for world switch failure if no TDR */
>   	if (amdgpu_device_should_recover_gpu(adev)
>   		&& (!amdgpu_device_has_job_running(adev) ||
Liu, Shaoyun Dec. 23, 2021, 6:07 p.m. UTC | #3
[AMD Official Use Only]

I have  a discussion with  Andrey  about this offline.   It seems dangerous  to remove the in_gpu_reset and  reset_semm directly inside the  flr_work.  In the case when the reset is triggered from host side , gpu need to be locked while host perform reset after flr_work reply the host with  READY_TO_RESET. 
The original comments seems need to be updated. 

Regards
Shaoyun.liu
 

-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Andrey Grodzovsky
Sent: Wednesday, December 22, 2021 5:14 PM
To: dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
Cc: Liu, Monk <Monk.Liu@amd.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Chen, Horace <Horace.Chen@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; daniel@ffwll.ch
Subject: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset protection for SRIOV

Since now flr work is serialized against  GPU resets there is no need for this.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 11 -----------  drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 11 -----------
 2 files changed, 22 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index 487cd654b69e..7d59a66e3988 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -248,15 +248,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
 	struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
 	int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
 
-	/* block amdgpu_gpu_recover till msg FLR COMPLETE received,
-	 * otherwise the mailbox msg will be ruined/reseted by
-	 * the VF FLR.
-	 */
-	if (!down_write_trylock(&adev->reset_sem))
-		return;
-
 	amdgpu_virt_fini_data_exchange(adev);
-	atomic_set(&adev->in_gpu_reset, 1);
 
 	xgpu_ai_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0, 0, 0);
 
@@ -269,9 +261,6 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
 	} while (timeout > 1);
 
 flr_done:
-	atomic_set(&adev->in_gpu_reset, 0);
-	up_write(&adev->reset_sem);
-
 	/* Trigger recovery for world switch failure if no TDR */
 	if (amdgpu_device_should_recover_gpu(adev)
 		&& (!amdgpu_device_has_job_running(adev) || diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
index e3869067a31d..f82c066c8e8d 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
@@ -277,15 +277,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
 	struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
 	int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
 
-	/* block amdgpu_gpu_recover till msg FLR COMPLETE received,
-	 * otherwise the mailbox msg will be ruined/reseted by
-	 * the VF FLR.
-	 */
-	if (!down_write_trylock(&adev->reset_sem))
-		return;
-
 	amdgpu_virt_fini_data_exchange(adev);
-	atomic_set(&adev->in_gpu_reset, 1);
 
 	xgpu_nv_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0, 0, 0);
 
@@ -298,9 +290,6 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
 	} while (timeout > 1);
 
 flr_done:
-	atomic_set(&adev->in_gpu_reset, 0);
-	up_write(&adev->reset_sem);
-
 	/* Trigger recovery for world switch failure if no TDR */
 	if (amdgpu_device_should_recover_gpu(adev)
 		&& (!amdgpu_device_has_job_running(adev) ||
--
2.25.1
Emily Deng Dec. 24, 2021, 8:58 a.m. UTC | #4
These patches look good to me. JingWen will pull these patches and do some basic TDR test on sriov environment, and give feedback.

Best wishes
Emily Deng



>-----Original Message-----
>From: Liu, Monk <Monk.Liu@amd.com>
>Sent: Thursday, December 23, 2021 6:14 PM
>To: Koenig, Christian <Christian.Koenig@amd.com>; Grodzovsky, Andrey
><Andrey.Grodzovsky@amd.com>; dri-devel@lists.freedesktop.org; amd-
>gfx@lists.freedesktop.org; Chen, Horace <Horace.Chen@amd.com>; Chen,
>JingWen <JingWen.Chen2@amd.com>; Deng, Emily <Emily.Deng@amd.com>
>Cc: daniel@ffwll.ch
>Subject: RE: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset protection
>for SRIOV
>
>[AMD Official Use Only]
>
>@Chen, Horace @Chen, JingWen @Deng, Emily
>
>Please take a review on Andrey's patch
>
>Thanks
>-------------------------------------------------------------------
>Monk Liu | Cloud GPU & Virtualization Solution | AMD
>-------------------------------------------------------------------
>we are hiring software manager for CVS core team
>-------------------------------------------------------------------
>
>-----Original Message-----
>From: Koenig, Christian <Christian.Koenig@amd.com>
>Sent: Thursday, December 23, 2021 4:42 PM
>To: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; dri-
>devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
>Cc: daniel@ffwll.ch; Liu, Monk <Monk.Liu@amd.com>; Chen, Horace
><Horace.Chen@amd.com>
>Subject: Re: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset protection
>for SRIOV
>
>Am 22.12.21 um 23:14 schrieb Andrey Grodzovsky:
>> Since now flr work is serialized against  GPU resets there is no need
>> for this.
>>
>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>
>Acked-by: Christian König <christian.koenig@amd.com>
>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 11 -----------
>>   drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 11 -----------
>>   2 files changed, 22 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>> index 487cd654b69e..7d59a66e3988 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>> @@ -248,15 +248,7 @@ static void xgpu_ai_mailbox_flr_work(struct
>work_struct *work)
>>   	struct amdgpu_device *adev = container_of(virt, struct
>amdgpu_device, virt);
>>   	int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>
>> -	/* block amdgpu_gpu_recover till msg FLR COMPLETE received,
>> -	 * otherwise the mailbox msg will be ruined/reseted by
>> -	 * the VF FLR.
>> -	 */
>> -	if (!down_write_trylock(&adev->reset_sem))
>> -		return;
>> -
>>   	amdgpu_virt_fini_data_exchange(adev);
>> -	atomic_set(&adev->in_gpu_reset, 1);
>>
>>   	xgpu_ai_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0, 0, 0);
>>
>> @@ -269,9 +261,6 @@ static void xgpu_ai_mailbox_flr_work(struct
>work_struct *work)
>>   	} while (timeout > 1);
>>
>>   flr_done:
>> -	atomic_set(&adev->in_gpu_reset, 0);
>> -	up_write(&adev->reset_sem);
>> -
>>   	/* Trigger recovery for world switch failure if no TDR */
>>   	if (amdgpu_device_should_recover_gpu(adev)
>>   		&& (!amdgpu_device_has_job_running(adev) || diff --git
>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>> index e3869067a31d..f82c066c8e8d 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>> @@ -277,15 +277,7 @@ static void xgpu_nv_mailbox_flr_work(struct
>work_struct *work)
>>   	struct amdgpu_device *adev = container_of(virt, struct
>amdgpu_device, virt);
>>   	int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>
>> -	/* block amdgpu_gpu_recover till msg FLR COMPLETE received,
>> -	 * otherwise the mailbox msg will be ruined/reseted by
>> -	 * the VF FLR.
>> -	 */
>> -	if (!down_write_trylock(&adev->reset_sem))
>> -		return;
>> -
>>   	amdgpu_virt_fini_data_exchange(adev);
>> -	atomic_set(&adev->in_gpu_reset, 1);
>>
>>   	xgpu_nv_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0, 0, 0);
>>
>> @@ -298,9 +290,6 @@ static void xgpu_nv_mailbox_flr_work(struct
>work_struct *work)
>>   	} while (timeout > 1);
>>
>>   flr_done:
>> -	atomic_set(&adev->in_gpu_reset, 0);
>> -	up_write(&adev->reset_sem);
>> -
>>   	/* Trigger recovery for world switch failure if no TDR */
>>   	if (amdgpu_device_should_recover_gpu(adev)
>>   		&& (!amdgpu_device_has_job_running(adev) ||
JingWen Chen Dec. 24, 2021, 9:57 a.m. UTC | #5
I do agree with shaoyun, if the host find the gpu engine hangs first, and do the flr, guest side thread may not know this and still try to access HW(e.g. kfd is using a lot of amdgpu_in_reset and reset_sem to identify the reset status). And this may lead to very bad result.

On 2021/12/24 下午4:58, Deng, Emily wrote:
> These patches look good to me. JingWen will pull these patches and do some basic TDR test on sriov environment, and give feedback.
>
> Best wishes
> Emily Deng
>
>
>
>> -----Original Message-----
>> From: Liu, Monk <Monk.Liu@amd.com>
>> Sent: Thursday, December 23, 2021 6:14 PM
>> To: Koenig, Christian <Christian.Koenig@amd.com>; Grodzovsky, Andrey
>> <Andrey.Grodzovsky@amd.com>; dri-devel@lists.freedesktop.org; amd-
>> gfx@lists.freedesktop.org; Chen, Horace <Horace.Chen@amd.com>; Chen,
>> JingWen <JingWen.Chen2@amd.com>; Deng, Emily <Emily.Deng@amd.com>
>> Cc: daniel@ffwll.ch
>> Subject: RE: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset protection
>> for SRIOV
>>
>> [AMD Official Use Only]
>>
>> @Chen, Horace @Chen, JingWen @Deng, Emily
>>
>> Please take a review on Andrey's patch
>>
>> Thanks
>> -------------------------------------------------------------------
>> Monk Liu | Cloud GPU & Virtualization Solution | AMD
>> -------------------------------------------------------------------
>> we are hiring software manager for CVS core team
>> -------------------------------------------------------------------
>>
>> -----Original Message-----
>> From: Koenig, Christian <Christian.Koenig@amd.com>
>> Sent: Thursday, December 23, 2021 4:42 PM
>> To: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; dri-
>> devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
>> Cc: daniel@ffwll.ch; Liu, Monk <Monk.Liu@amd.com>; Chen, Horace
>> <Horace.Chen@amd.com>
>> Subject: Re: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset protection
>> for SRIOV
>>
>> Am 22.12.21 um 23:14 schrieb Andrey Grodzovsky:
>>> Since now flr work is serialized against  GPU resets there is no need
>>> for this.
>>>
>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>> Acked-by: Christian König <christian.koenig@amd.com>
>>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 11 -----------
>>>   drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 11 -----------
>>>   2 files changed, 22 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>> index 487cd654b69e..7d59a66e3988 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>> @@ -248,15 +248,7 @@ static void xgpu_ai_mailbox_flr_work(struct
>> work_struct *work)
>>>   	struct amdgpu_device *adev = container_of(virt, struct
>> amdgpu_device, virt);
>>>   	int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>>
>>> -	/* block amdgpu_gpu_recover till msg FLR COMPLETE received,
>>> -	 * otherwise the mailbox msg will be ruined/reseted by
>>> -	 * the VF FLR.
>>> -	 */
>>> -	if (!down_write_trylock(&adev->reset_sem))
>>> -		return;
>>> -
>>>   	amdgpu_virt_fini_data_exchange(adev);
>>> -	atomic_set(&adev->in_gpu_reset, 1);
>>>
>>>   	xgpu_ai_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0, 0, 0);
>>>
>>> @@ -269,9 +261,6 @@ static void xgpu_ai_mailbox_flr_work(struct
>> work_struct *work)
>>>   	} while (timeout > 1);
>>>
>>>   flr_done:
>>> -	atomic_set(&adev->in_gpu_reset, 0);
>>> -	up_write(&adev->reset_sem);
>>> -
>>>   	/* Trigger recovery for world switch failure if no TDR */
>>>   	if (amdgpu_device_should_recover_gpu(adev)
>>>   		&& (!amdgpu_device_has_job_running(adev) || diff --git
>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>> index e3869067a31d..f82c066c8e8d 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>> @@ -277,15 +277,7 @@ static void xgpu_nv_mailbox_flr_work(struct
>> work_struct *work)
>>>   	struct amdgpu_device *adev = container_of(virt, struct
>> amdgpu_device, virt);
>>>   	int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>>
>>> -	/* block amdgpu_gpu_recover till msg FLR COMPLETE received,
>>> -	 * otherwise the mailbox msg will be ruined/reseted by
>>> -	 * the VF FLR.
>>> -	 */
>>> -	if (!down_write_trylock(&adev->reset_sem))
>>> -		return;
>>> -
>>>   	amdgpu_virt_fini_data_exchange(adev);
>>> -	atomic_set(&adev->in_gpu_reset, 1);
>>>
>>>   	xgpu_nv_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0, 0, 0);
>>>
>>> @@ -298,9 +290,6 @@ static void xgpu_nv_mailbox_flr_work(struct
>> work_struct *work)
>>>   	} while (timeout > 1);
>>>
>>>   flr_done:
>>> -	atomic_set(&adev->in_gpu_reset, 0);
>>> -	up_write(&adev->reset_sem);
>>> -
>>>   	/* Trigger recovery for world switch failure if no TDR */
>>>   	if (amdgpu_device_should_recover_gpu(adev)
>>>   		&& (!amdgpu_device_has_job_running(adev) ||
Andrey Grodzovsky Dec. 30, 2021, 6:39 p.m. UTC | #6
Thanks a lot, please let me know.

Andrey

On 2021-12-24 3:58 a.m., Deng, Emily wrote:
> These patches look good to me. JingWen will pull these patches and do some basic TDR test on sriov environment, and give feedback.
>
> Best wishes
> Emily Deng
>
>
>
>> -----Original Message-----
>> From: Liu, Monk <Monk.Liu@amd.com>
>> Sent: Thursday, December 23, 2021 6:14 PM
>> To: Koenig, Christian <Christian.Koenig@amd.com>; Grodzovsky, Andrey
>> <Andrey.Grodzovsky@amd.com>; dri-devel@lists.freedesktop.org; amd-
>> gfx@lists.freedesktop.org; Chen, Horace <Horace.Chen@amd.com>; Chen,
>> JingWen <JingWen.Chen2@amd.com>; Deng, Emily <Emily.Deng@amd.com>
>> Cc: daniel@ffwll.ch
>> Subject: RE: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset protection
>> for SRIOV
>>
>> [AMD Official Use Only]
>>
>> @Chen, Horace @Chen, JingWen @Deng, Emily
>>
>> Please take a review on Andrey's patch
>>
>> Thanks
>> -------------------------------------------------------------------
>> Monk Liu | Cloud GPU & Virtualization Solution | AMD
>> -------------------------------------------------------------------
>> we are hiring software manager for CVS core team
>> -------------------------------------------------------------------
>>
>> -----Original Message-----
>> From: Koenig, Christian <Christian.Koenig@amd.com>
>> Sent: Thursday, December 23, 2021 4:42 PM
>> To: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; dri-
>> devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
>> Cc: daniel@ffwll.ch; Liu, Monk <Monk.Liu@amd.com>; Chen, Horace
>> <Horace.Chen@amd.com>
>> Subject: Re: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset protection
>> for SRIOV
>>
>> Am 22.12.21 um 23:14 schrieb Andrey Grodzovsky:
>>> Since now flr work is serialized against  GPU resets there is no need
>>> for this.
>>>
>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>> Acked-by: Christian König <christian.koenig@amd.com>
>>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 11 -----------
>>>    drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 11 -----------
>>>    2 files changed, 22 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>> index 487cd654b69e..7d59a66e3988 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>> @@ -248,15 +248,7 @@ static void xgpu_ai_mailbox_flr_work(struct
>> work_struct *work)
>>>    	struct amdgpu_device *adev = container_of(virt, struct
>> amdgpu_device, virt);
>>>    	int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>>
>>> -	/* block amdgpu_gpu_recover till msg FLR COMPLETE received,
>>> -	 * otherwise the mailbox msg will be ruined/reseted by
>>> -	 * the VF FLR.
>>> -	 */
>>> -	if (!down_write_trylock(&adev->reset_sem))
>>> -		return;
>>> -
>>>    	amdgpu_virt_fini_data_exchange(adev);
>>> -	atomic_set(&adev->in_gpu_reset, 1);
>>>
>>>    	xgpu_ai_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0, 0, 0);
>>>
>>> @@ -269,9 +261,6 @@ static void xgpu_ai_mailbox_flr_work(struct
>> work_struct *work)
>>>    	} while (timeout > 1);
>>>
>>>    flr_done:
>>> -	atomic_set(&adev->in_gpu_reset, 0);
>>> -	up_write(&adev->reset_sem);
>>> -
>>>    	/* Trigger recovery for world switch failure if no TDR */
>>>    	if (amdgpu_device_should_recover_gpu(adev)
>>>    		&& (!amdgpu_device_has_job_running(adev) || diff --git
>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>> index e3869067a31d..f82c066c8e8d 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>> @@ -277,15 +277,7 @@ static void xgpu_nv_mailbox_flr_work(struct
>> work_struct *work)
>>>    	struct amdgpu_device *adev = container_of(virt, struct
>> amdgpu_device, virt);
>>>    	int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>>
>>> -	/* block amdgpu_gpu_recover till msg FLR COMPLETE received,
>>> -	 * otherwise the mailbox msg will be ruined/reseted by
>>> -	 * the VF FLR.
>>> -	 */
>>> -	if (!down_write_trylock(&adev->reset_sem))
>>> -		return;
>>> -
>>>    	amdgpu_virt_fini_data_exchange(adev);
>>> -	atomic_set(&adev->in_gpu_reset, 1);
>>>
>>>    	xgpu_nv_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0, 0, 0);
>>>
>>> @@ -298,9 +290,6 @@ static void xgpu_nv_mailbox_flr_work(struct
>> work_struct *work)
>>>    	} while (timeout > 1);
>>>
>>>    flr_done:
>>> -	atomic_set(&adev->in_gpu_reset, 0);
>>> -	up_write(&adev->reset_sem);
>>> -
>>>    	/* Trigger recovery for world switch failure if no TDR */
>>>    	if (amdgpu_device_should_recover_gpu(adev)
>>>    		&& (!amdgpu_device_has_job_running(adev) ||
Andrey Grodzovsky Dec. 30, 2021, 6:45 p.m. UTC | #7
Sure, I guess i can drop this patch then.

Andrey

On 2021-12-24 4:57 a.m., JingWen Chen wrote:
> I do agree with shaoyun, if the host find the gpu engine hangs first, and do the flr, guest side thread may not know this and still try to access HW(e.g. kfd is using a lot of amdgpu_in_reset and reset_sem to identify the reset status). And this may lead to very bad result.
>
> On 2021/12/24 下午4:58, Deng, Emily wrote:
>> These patches look good to me. JingWen will pull these patches and do some basic TDR test on sriov environment, and give feedback.
>>
>> Best wishes
>> Emily Deng
>>
>>
>>
>>> -----Original Message-----
>>> From: Liu, Monk <Monk.Liu@amd.com>
>>> Sent: Thursday, December 23, 2021 6:14 PM
>>> To: Koenig, Christian <Christian.Koenig@amd.com>; Grodzovsky, Andrey
>>> <Andrey.Grodzovsky@amd.com>; dri-devel@lists.freedesktop.org; amd-
>>> gfx@lists.freedesktop.org; Chen, Horace <Horace.Chen@amd.com>; Chen,
>>> JingWen <JingWen.Chen2@amd.com>; Deng, Emily <Emily.Deng@amd.com>
>>> Cc: daniel@ffwll.ch
>>> Subject: RE: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset protection
>>> for SRIOV
>>>
>>> [AMD Official Use Only]
>>>
>>> @Chen, Horace @Chen, JingWen @Deng, Emily
>>>
>>> Please take a review on Andrey's patch
>>>
>>> Thanks
>>> -------------------------------------------------------------------
>>> Monk Liu | Cloud GPU & Virtualization Solution | AMD
>>> -------------------------------------------------------------------
>>> we are hiring software manager for CVS core team
>>> -------------------------------------------------------------------
>>>
>>> -----Original Message-----
>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>> Sent: Thursday, December 23, 2021 4:42 PM
>>> To: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; dri-
>>> devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
>>> Cc: daniel@ffwll.ch; Liu, Monk <Monk.Liu@amd.com>; Chen, Horace
>>> <Horace.Chen@amd.com>
>>> Subject: Re: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset protection
>>> for SRIOV
>>>
>>> Am 22.12.21 um 23:14 schrieb Andrey Grodzovsky:
>>>> Since now flr work is serialized against  GPU resets there is no need
>>>> for this.
>>>>
>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>> Acked-by: Christian König <christian.koenig@amd.com>
>>>
>>>> ---
>>>>    drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 11 -----------
>>>>    drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 11 -----------
>>>>    2 files changed, 22 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>> index 487cd654b69e..7d59a66e3988 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>> @@ -248,15 +248,7 @@ static void xgpu_ai_mailbox_flr_work(struct
>>> work_struct *work)
>>>>    	struct amdgpu_device *adev = container_of(virt, struct
>>> amdgpu_device, virt);
>>>>    	int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>
>>>> -	/* block amdgpu_gpu_recover till msg FLR COMPLETE received,
>>>> -	 * otherwise the mailbox msg will be ruined/reseted by
>>>> -	 * the VF FLR.
>>>> -	 */
>>>> -	if (!down_write_trylock(&adev->reset_sem))
>>>> -		return;
>>>> -
>>>>    	amdgpu_virt_fini_data_exchange(adev);
>>>> -	atomic_set(&adev->in_gpu_reset, 1);
>>>>
>>>>    	xgpu_ai_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0, 0, 0);
>>>>
>>>> @@ -269,9 +261,6 @@ static void xgpu_ai_mailbox_flr_work(struct
>>> work_struct *work)
>>>>    	} while (timeout > 1);
>>>>
>>>>    flr_done:
>>>> -	atomic_set(&adev->in_gpu_reset, 0);
>>>> -	up_write(&adev->reset_sem);
>>>> -
>>>>    	/* Trigger recovery for world switch failure if no TDR */
>>>>    	if (amdgpu_device_should_recover_gpu(adev)
>>>>    		&& (!amdgpu_device_has_job_running(adev) || diff --git
>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>> index e3869067a31d..f82c066c8e8d 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>> @@ -277,15 +277,7 @@ static void xgpu_nv_mailbox_flr_work(struct
>>> work_struct *work)
>>>>    	struct amdgpu_device *adev = container_of(virt, struct
>>> amdgpu_device, virt);
>>>>    	int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>
>>>> -	/* block amdgpu_gpu_recover till msg FLR COMPLETE received,
>>>> -	 * otherwise the mailbox msg will be ruined/reseted by
>>>> -	 * the VF FLR.
>>>> -	 */
>>>> -	if (!down_write_trylock(&adev->reset_sem))
>>>> -		return;
>>>> -
>>>>    	amdgpu_virt_fini_data_exchange(adev);
>>>> -	atomic_set(&adev->in_gpu_reset, 1);
>>>>
>>>>    	xgpu_nv_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0, 0, 0);
>>>>
>>>> @@ -298,9 +290,6 @@ static void xgpu_nv_mailbox_flr_work(struct
>>> work_struct *work)
>>>>    	} while (timeout > 1);
>>>>
>>>>    flr_done:
>>>> -	atomic_set(&adev->in_gpu_reset, 0);
>>>> -	up_write(&adev->reset_sem);
>>>> -
>>>>    	/* Trigger recovery for world switch failure if no TDR */
>>>>    	if (amdgpu_device_should_recover_gpu(adev)
>>>>    		&& (!amdgpu_device_has_job_running(adev) ||
Christian König Jan. 3, 2022, 10:17 a.m. UTC | #8
Please don't. This patch is vital to the cleanup of the reset procedure.

If SRIOV doesn't work with that we need to change SRIOV and not the driver.

Christian.

Am 30.12.21 um 19:45 schrieb Andrey Grodzovsky:
> Sure, I guess i can drop this patch then.
>
> Andrey
>
> On 2021-12-24 4:57 a.m., JingWen Chen wrote:
>> I do agree with shaoyun, if the host find the gpu engine hangs first, 
>> and do the flr, guest side thread may not know this and still try to 
>> access HW(e.g. kfd is using a lot of amdgpu_in_reset and reset_sem to 
>> identify the reset status). And this may lead to very bad result.
>>
>> On 2021/12/24 下午4:58, Deng, Emily wrote:
>>> These patches look good to me. JingWen will pull these patches and 
>>> do some basic TDR test on sriov environment, and give feedback.
>>>
>>> Best wishes
>>> Emily Deng
>>>
>>>
>>>
>>>> -----Original Message-----
>>>> From: Liu, Monk <Monk.Liu@amd.com>
>>>> Sent: Thursday, December 23, 2021 6:14 PM
>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; Grodzovsky, Andrey
>>>> <Andrey.Grodzovsky@amd.com>; dri-devel@lists.freedesktop.org; amd-
>>>> gfx@lists.freedesktop.org; Chen, Horace <Horace.Chen@amd.com>; Chen,
>>>> JingWen <JingWen.Chen2@amd.com>; Deng, Emily <Emily.Deng@amd.com>
>>>> Cc: daniel@ffwll.ch
>>>> Subject: RE: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset 
>>>> protection
>>>> for SRIOV
>>>>
>>>> [AMD Official Use Only]
>>>>
>>>> @Chen, Horace @Chen, JingWen @Deng, Emily
>>>>
>>>> Please take a review on Andrey's patch
>>>>
>>>> Thanks
>>>> -------------------------------------------------------------------
>>>> Monk Liu | Cloud GPU & Virtualization Solution | AMD
>>>> -------------------------------------------------------------------
>>>> we are hiring software manager for CVS core team
>>>> -------------------------------------------------------------------
>>>>
>>>> -----Original Message-----
>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>> Sent: Thursday, December 23, 2021 4:42 PM
>>>> To: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; dri-
>>>> devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
>>>> Cc: daniel@ffwll.ch; Liu, Monk <Monk.Liu@amd.com>; Chen, Horace
>>>> <Horace.Chen@amd.com>
>>>> Subject: Re: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset 
>>>> protection
>>>> for SRIOV
>>>>
>>>> Am 22.12.21 um 23:14 schrieb Andrey Grodzovsky:
>>>>> Since now flr work is serialized against  GPU resets there is no need
>>>>> for this.
>>>>>
>>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>>> Acked-by: Christian König <christian.koenig@amd.com>
>>>>
>>>>> ---
>>>>>    drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 11 -----------
>>>>>    drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 11 -----------
>>>>>    2 files changed, 22 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>> index 487cd654b69e..7d59a66e3988 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>> @@ -248,15 +248,7 @@ static void xgpu_ai_mailbox_flr_work(struct
>>>> work_struct *work)
>>>>>        struct amdgpu_device *adev = container_of(virt, struct
>>>> amdgpu_device, virt);
>>>>>        int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>
>>>>> -    /* block amdgpu_gpu_recover till msg FLR COMPLETE received,
>>>>> -     * otherwise the mailbox msg will be ruined/reseted by
>>>>> -     * the VF FLR.
>>>>> -     */
>>>>> -    if (!down_write_trylock(&adev->reset_sem))
>>>>> -        return;
>>>>> -
>>>>>        amdgpu_virt_fini_data_exchange(adev);
>>>>> -    atomic_set(&adev->in_gpu_reset, 1);
>>>>>
>>>>>        xgpu_ai_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0, 0, 0);
>>>>>
>>>>> @@ -269,9 +261,6 @@ static void xgpu_ai_mailbox_flr_work(struct
>>>> work_struct *work)
>>>>>        } while (timeout > 1);
>>>>>
>>>>>    flr_done:
>>>>> -    atomic_set(&adev->in_gpu_reset, 0);
>>>>> -    up_write(&adev->reset_sem);
>>>>> -
>>>>>        /* Trigger recovery for world switch failure if no TDR */
>>>>>        if (amdgpu_device_should_recover_gpu(adev)
>>>>>            && (!amdgpu_device_has_job_running(adev) || diff --git
>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>> index e3869067a31d..f82c066c8e8d 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>> @@ -277,15 +277,7 @@ static void xgpu_nv_mailbox_flr_work(struct
>>>> work_struct *work)
>>>>>        struct amdgpu_device *adev = container_of(virt, struct
>>>> amdgpu_device, virt);
>>>>>        int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>
>>>>> -    /* block amdgpu_gpu_recover till msg FLR COMPLETE received,
>>>>> -     * otherwise the mailbox msg will be ruined/reseted by
>>>>> -     * the VF FLR.
>>>>> -     */
>>>>> -    if (!down_write_trylock(&adev->reset_sem))
>>>>> -        return;
>>>>> -
>>>>>        amdgpu_virt_fini_data_exchange(adev);
>>>>> -    atomic_set(&adev->in_gpu_reset, 1);
>>>>>
>>>>>        xgpu_nv_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0, 0, 0);
>>>>>
>>>>> @@ -298,9 +290,6 @@ static void xgpu_nv_mailbox_flr_work(struct
>>>> work_struct *work)
>>>>>        } while (timeout > 1);
>>>>>
>>>>>    flr_done:
>>>>> -    atomic_set(&adev->in_gpu_reset, 0);
>>>>> -    up_write(&adev->reset_sem);
>>>>> -
>>>>>        /* Trigger recovery for world switch failure if no TDR */
>>>>>        if (amdgpu_device_should_recover_gpu(adev)
>>>>>            && (!amdgpu_device_has_job_running(adev) ||
JingWen Chen Jan. 4, 2022, 9:07 a.m. UTC | #9
Hi Christian,
I'm not sure what do you mean by "we need to change SRIOV not the driver".

Do you mean we should change the reset sequence in SRIOV? This will be a huge change for our SRIOV solution.

From my point of view, we can directly use
amdgpu_device_lock_adev and amdgpu_device_unlock_adev in flr_work instead of try_lock since no one will conflict with this thread with reset_domain introduced.
But we do need the reset_sem and adev->in_gpu_reset to keep device untouched via user space.

Best Regards,
Jingwen Chen

On 2022/1/3 下午6:17, Christian König wrote:
> Please don't. This patch is vital to the cleanup of the reset procedure.
>
> If SRIOV doesn't work with that we need to change SRIOV and not the driver.
>
> Christian.
>
> Am 30.12.21 um 19:45 schrieb Andrey Grodzovsky:
>> Sure, I guess i can drop this patch then.
>>
>> Andrey
>>
>> On 2021-12-24 4:57 a.m., JingWen Chen wrote:
>>> I do agree with shaoyun, if the host find the gpu engine hangs first, and do the flr, guest side thread may not know this and still try to access HW(e.g. kfd is using a lot of amdgpu_in_reset and reset_sem to identify the reset status). And this may lead to very bad result.
>>>
>>> On 2021/12/24 下午4:58, Deng, Emily wrote:
>>>> These patches look good to me. JingWen will pull these patches and do some basic TDR test on sriov environment, and give feedback.
>>>>
>>>> Best wishes
>>>> Emily Deng
>>>>
>>>>
>>>>
>>>>> -----Original Message-----
>>>>> From: Liu, Monk <Monk.Liu@amd.com>
>>>>> Sent: Thursday, December 23, 2021 6:14 PM
>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; Grodzovsky, Andrey
>>>>> <Andrey.Grodzovsky@amd.com>; dri-devel@lists.freedesktop.org; amd-
>>>>> gfx@lists.freedesktop.org; Chen, Horace <Horace.Chen@amd.com>; Chen,
>>>>> JingWen <JingWen.Chen2@amd.com>; Deng, Emily <Emily.Deng@amd.com>
>>>>> Cc: daniel@ffwll.ch
>>>>> Subject: RE: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset protection
>>>>> for SRIOV
>>>>>
>>>>> [AMD Official Use Only]
>>>>>
>>>>> @Chen, Horace @Chen, JingWen @Deng, Emily
>>>>>
>>>>> Please take a review on Andrey's patch
>>>>>
>>>>> Thanks
>>>>> -------------------------------------------------------------------
>>>>> Monk Liu | Cloud GPU & Virtualization Solution | AMD
>>>>> -------------------------------------------------------------------
>>>>> we are hiring software manager for CVS core team
>>>>> -------------------------------------------------------------------
>>>>>
>>>>> -----Original Message-----
>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>> Sent: Thursday, December 23, 2021 4:42 PM
>>>>> To: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; dri-
>>>>> devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
>>>>> Cc: daniel@ffwll.ch; Liu, Monk <Monk.Liu@amd.com>; Chen, Horace
>>>>> <Horace.Chen@amd.com>
>>>>> Subject: Re: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset protection
>>>>> for SRIOV
>>>>>
>>>>> Am 22.12.21 um 23:14 schrieb Andrey Grodzovsky:
>>>>>> Since now flr work is serialized against  GPU resets there is no need
>>>>>> for this.
>>>>>>
>>>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>>>> Acked-by: Christian König <christian.koenig@amd.com>
>>>>>
>>>>>> ---
>>>>>>    drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 11 -----------
>>>>>>    drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 11 -----------
>>>>>>    2 files changed, 22 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>> index 487cd654b69e..7d59a66e3988 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>> @@ -248,15 +248,7 @@ static void xgpu_ai_mailbox_flr_work(struct
>>>>> work_struct *work)
>>>>>>        struct amdgpu_device *adev = container_of(virt, struct
>>>>> amdgpu_device, virt);
>>>>>>        int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>
>>>>>> -    /* block amdgpu_gpu_recover till msg FLR COMPLETE received,
>>>>>> -     * otherwise the mailbox msg will be ruined/reseted by
>>>>>> -     * the VF FLR.
>>>>>> -     */
>>>>>> -    if (!down_write_trylock(&adev->reset_sem))
>>>>>> -        return;
>>>>>> -
>>>>>>        amdgpu_virt_fini_data_exchange(adev);
>>>>>> -    atomic_set(&adev->in_gpu_reset, 1);
>>>>>>
>>>>>>        xgpu_ai_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0, 0, 0);
>>>>>>
>>>>>> @@ -269,9 +261,6 @@ static void xgpu_ai_mailbox_flr_work(struct
>>>>> work_struct *work)
>>>>>>        } while (timeout > 1);
>>>>>>
>>>>>>    flr_done:
>>>>>> -    atomic_set(&adev->in_gpu_reset, 0);
>>>>>> -    up_write(&adev->reset_sem);
>>>>>> -
>>>>>>        /* Trigger recovery for world switch failure if no TDR */
>>>>>>        if (amdgpu_device_should_recover_gpu(adev)
>>>>>>            && (!amdgpu_device_has_job_running(adev) || diff --git
>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>> index e3869067a31d..f82c066c8e8d 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>> @@ -277,15 +277,7 @@ static void xgpu_nv_mailbox_flr_work(struct
>>>>> work_struct *work)
>>>>>>        struct amdgpu_device *adev = container_of(virt, struct
>>>>> amdgpu_device, virt);
>>>>>>        int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>
>>>>>> -    /* block amdgpu_gpu_recover till msg FLR COMPLETE received,
>>>>>> -     * otherwise the mailbox msg will be ruined/reseted by
>>>>>> -     * the VF FLR.
>>>>>> -     */
>>>>>> -    if (!down_write_trylock(&adev->reset_sem))
>>>>>> -        return;
>>>>>> -
>>>>>>        amdgpu_virt_fini_data_exchange(adev);
>>>>>> -    atomic_set(&adev->in_gpu_reset, 1);
>>>>>>
>>>>>>        xgpu_nv_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0, 0, 0);
>>>>>>
>>>>>> @@ -298,9 +290,6 @@ static void xgpu_nv_mailbox_flr_work(struct
>>>>> work_struct *work)
>>>>>>        } while (timeout > 1);
>>>>>>
>>>>>>    flr_done:
>>>>>> -    atomic_set(&adev->in_gpu_reset, 0);
>>>>>> -    up_write(&adev->reset_sem);
>>>>>> -
>>>>>>        /* Trigger recovery for world switch failure if no TDR */
>>>>>>        if (amdgpu_device_should_recover_gpu(adev)
>>>>>>            && (!amdgpu_device_has_job_running(adev) ||
>
Christian König Jan. 4, 2022, 10:18 a.m. UTC | #10
Hi Jingwen,

well what I mean is that we need to adjust the implementation in amdgpu 
to actually match the requirements.

Could be that the reset sequence is questionable in general, but I doubt 
so at least for now.

See the FLR request from the hypervisor is just another source of 
signaling the need for a reset, similar to each job timeout on each 
queue. Otherwise you have a race condition between the hypervisor and 
the scheduler.

Properly setting in_gpu_reset is indeed mandatory, but should happen at 
a central place and not in the SRIOV specific code.

In other words I strongly think that the current SRIOV reset 
implementation is severely broken and what Andrey is doing is actually 
fixing it.

Regards,
Christian.

Am 04.01.22 um 10:07 schrieb JingWen Chen:
> Hi Christian,
> I'm not sure what do you mean by "we need to change SRIOV not the driver".
>
> Do you mean we should change the reset sequence in SRIOV? This will be a huge change for our SRIOV solution.
>
>  From my point of view, we can directly use
> amdgpu_device_lock_adev and amdgpu_device_unlock_adev in flr_work instead of try_lock since no one will conflict with this thread with reset_domain introduced.
> But we do need the reset_sem and adev->in_gpu_reset to keep device untouched via user space.
>
> Best Regards,
> Jingwen Chen
>
> On 2022/1/3 下午6:17, Christian König wrote:
>> Please don't. This patch is vital to the cleanup of the reset procedure.
>>
>> If SRIOV doesn't work with that we need to change SRIOV and not the driver.
>>
>> Christian.
>>
>> Am 30.12.21 um 19:45 schrieb Andrey Grodzovsky:
>>> Sure, I guess i can drop this patch then.
>>>
>>> Andrey
>>>
>>> On 2021-12-24 4:57 a.m., JingWen Chen wrote:
>>>> I do agree with shaoyun, if the host find the gpu engine hangs first, and do the flr, guest side thread may not know this and still try to access HW(e.g. kfd is using a lot of amdgpu_in_reset and reset_sem to identify the reset status). And this may lead to very bad result.
>>>>
>>>> On 2021/12/24 下午4:58, Deng, Emily wrote:
>>>>> These patches look good to me. JingWen will pull these patches and do some basic TDR test on sriov environment, and give feedback.
>>>>>
>>>>> Best wishes
>>>>> Emily Deng
>>>>>
>>>>>
>>>>>
>>>>>> -----Original Message-----
>>>>>> From: Liu, Monk <Monk.Liu@amd.com>
>>>>>> Sent: Thursday, December 23, 2021 6:14 PM
>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; Grodzovsky, Andrey
>>>>>> <Andrey.Grodzovsky@amd.com>; dri-devel@lists.freedesktop.org; amd-
>>>>>> gfx@lists.freedesktop.org; Chen, Horace <Horace.Chen@amd.com>; Chen,
>>>>>> JingWen <JingWen.Chen2@amd.com>; Deng, Emily <Emily.Deng@amd.com>
>>>>>> Cc: daniel@ffwll.ch
>>>>>> Subject: RE: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset protection
>>>>>> for SRIOV
>>>>>>
>>>>>> [AMD Official Use Only]
>>>>>>
>>>>>> @Chen, Horace @Chen, JingWen @Deng, Emily
>>>>>>
>>>>>> Please take a review on Andrey's patch
>>>>>>
>>>>>> Thanks
>>>>>> -------------------------------------------------------------------
>>>>>> Monk Liu | Cloud GPU & Virtualization Solution | AMD
>>>>>> -------------------------------------------------------------------
>>>>>> we are hiring software manager for CVS core team
>>>>>> -------------------------------------------------------------------
>>>>>>
>>>>>> -----Original Message-----
>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>> Sent: Thursday, December 23, 2021 4:42 PM
>>>>>> To: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; dri-
>>>>>> devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
>>>>>> Cc: daniel@ffwll.ch; Liu, Monk <Monk.Liu@amd.com>; Chen, Horace
>>>>>> <Horace.Chen@amd.com>
>>>>>> Subject: Re: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset protection
>>>>>> for SRIOV
>>>>>>
>>>>>> Am 22.12.21 um 23:14 schrieb Andrey Grodzovsky:
>>>>>>> Since now flr work is serialized against  GPU resets there is no need
>>>>>>> for this.
>>>>>>>
>>>>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>>>>> Acked-by: Christian König <christian.koenig@amd.com>
>>>>>>
>>>>>>> ---
>>>>>>>     drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 11 -----------
>>>>>>>     drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 11 -----------
>>>>>>>     2 files changed, 22 deletions(-)
>>>>>>>
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>> index 487cd654b69e..7d59a66e3988 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>> @@ -248,15 +248,7 @@ static void xgpu_ai_mailbox_flr_work(struct
>>>>>> work_struct *work)
>>>>>>>         struct amdgpu_device *adev = container_of(virt, struct
>>>>>> amdgpu_device, virt);
>>>>>>>         int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>
>>>>>>> -    /* block amdgpu_gpu_recover till msg FLR COMPLETE received,
>>>>>>> -     * otherwise the mailbox msg will be ruined/reseted by
>>>>>>> -     * the VF FLR.
>>>>>>> -     */
>>>>>>> -    if (!down_write_trylock(&adev->reset_sem))
>>>>>>> -        return;
>>>>>>> -
>>>>>>>         amdgpu_virt_fini_data_exchange(adev);
>>>>>>> -    atomic_set(&adev->in_gpu_reset, 1);
>>>>>>>
>>>>>>>         xgpu_ai_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0, 0, 0);
>>>>>>>
>>>>>>> @@ -269,9 +261,6 @@ static void xgpu_ai_mailbox_flr_work(struct
>>>>>> work_struct *work)
>>>>>>>         } while (timeout > 1);
>>>>>>>
>>>>>>>     flr_done:
>>>>>>> -    atomic_set(&adev->in_gpu_reset, 0);
>>>>>>> -    up_write(&adev->reset_sem);
>>>>>>> -
>>>>>>>         /* Trigger recovery for world switch failure if no TDR */
>>>>>>>         if (amdgpu_device_should_recover_gpu(adev)
>>>>>>>             && (!amdgpu_device_has_job_running(adev) || diff --git
>>>>>>> a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>> index e3869067a31d..f82c066c8e8d 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>> @@ -277,15 +277,7 @@ static void xgpu_nv_mailbox_flr_work(struct
>>>>>> work_struct *work)
>>>>>>>         struct amdgpu_device *adev = container_of(virt, struct
>>>>>> amdgpu_device, virt);
>>>>>>>         int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>
>>>>>>> -    /* block amdgpu_gpu_recover till msg FLR COMPLETE received,
>>>>>>> -     * otherwise the mailbox msg will be ruined/reseted by
>>>>>>> -     * the VF FLR.
>>>>>>> -     */
>>>>>>> -    if (!down_write_trylock(&adev->reset_sem))
>>>>>>> -        return;
>>>>>>> -
>>>>>>>         amdgpu_virt_fini_data_exchange(adev);
>>>>>>> -    atomic_set(&adev->in_gpu_reset, 1);
>>>>>>>
>>>>>>>         xgpu_nv_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0, 0, 0);
>>>>>>>
>>>>>>> @@ -298,9 +290,6 @@ static void xgpu_nv_mailbox_flr_work(struct
>>>>>> work_struct *work)
>>>>>>>         } while (timeout > 1);
>>>>>>>
>>>>>>>     flr_done:
>>>>>>> -    atomic_set(&adev->in_gpu_reset, 0);
>>>>>>> -    up_write(&adev->reset_sem);
>>>>>>> -
>>>>>>>         /* Trigger recovery for world switch failure if no TDR */
>>>>>>>         if (amdgpu_device_should_recover_gpu(adev)
>>>>>>>             && (!amdgpu_device_has_job_running(adev) ||
Liu, Monk Jan. 4, 2022, 10:49 a.m. UTC | #11
[AMD Official Use Only]

>> See the FLR request from the hypervisor is just another source of signaling the need for a reset, similar to each job timeout on each queue. Otherwise you have a race condition between the hypervisor and the scheduler.
No it's not, FLR from hypervisor is just to notify guest the hw VF FLR is about to start or was already executed, but host will do FLR anyway without waiting for guest too long

>> In other words I strongly think that the current SRIOV reset implementation is severely broken and what Andrey is doing is actually fixing it.
It makes the code to crash ... how could it be a fix ?

I'm afraid the patch is NAK from me,  but it is welcome if the cleanup do not ruin the logic, Andry or jingwen can try it if needed.

Thanks 
-------------------------------------------------------------------
Monk Liu | Cloud GPU & Virtualization Solution | AMD
-------------------------------------------------------------------
we are hiring software manager for CVS core team
-------------------------------------------------------------------

-----Original Message-----
From: Koenig, Christian <Christian.Koenig@amd.com> 
Sent: Tuesday, January 4, 2022 6:19 PM
To: Chen, JingWen <JingWen.Chen2@amd.com>; Christian König <ckoenig.leichtzumerken@gmail.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Deng, Emily <Emily.Deng@amd.com>; Liu, Monk <Monk.Liu@amd.com>; dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org; Chen, Horace <Horace.Chen@amd.com>; Chen, JingWen <JingWen.Chen2@amd.com>
Cc: daniel@ffwll.ch
Subject: Re: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset protection for SRIOV

Hi Jingwen,

well what I mean is that we need to adjust the implementation in amdgpu to actually match the requirements.

Could be that the reset sequence is questionable in general, but I doubt so at least for now.

See the FLR request from the hypervisor is just another source of signaling the need for a reset, similar to each job timeout on each queue. Otherwise you have a race condition between the hypervisor and the scheduler.

Properly setting in_gpu_reset is indeed mandatory, but should happen at a central place and not in the SRIOV specific code.

In other words I strongly think that the current SRIOV reset implementation is severely broken and what Andrey is doing is actually fixing it.

Regards,
Christian.

Am 04.01.22 um 10:07 schrieb JingWen Chen:
> Hi Christian,
> I'm not sure what do you mean by "we need to change SRIOV not the driver".
>
> Do you mean we should change the reset sequence in SRIOV? This will be a huge change for our SRIOV solution.
>
>  From my point of view, we can directly use amdgpu_device_lock_adev 
> and amdgpu_device_unlock_adev in flr_work instead of try_lock since no one will conflict with this thread with reset_domain introduced.
> But we do need the reset_sem and adev->in_gpu_reset to keep device untouched via user space.
>
> Best Regards,
> Jingwen Chen
>
> On 2022/1/3 下午6:17, Christian König wrote:
>> Please don't. This patch is vital to the cleanup of the reset procedure.
>>
>> If SRIOV doesn't work with that we need to change SRIOV and not the driver.
>>
>> Christian.
>>
>> Am 30.12.21 um 19:45 schrieb Andrey Grodzovsky:
>>> Sure, I guess i can drop this patch then.
>>>
>>> Andrey
>>>
>>> On 2021-12-24 4:57 a.m., JingWen Chen wrote:
>>>> I do agree with shaoyun, if the host find the gpu engine hangs first, and do the flr, guest side thread may not know this and still try to access HW(e.g. kfd is using a lot of amdgpu_in_reset and reset_sem to identify the reset status). And this may lead to very bad result.
>>>>
>>>> On 2021/12/24 下午4:58, Deng, Emily wrote:
>>>>> These patches look good to me. JingWen will pull these patches and do some basic TDR test on sriov environment, and give feedback.
>>>>>
>>>>> Best wishes
>>>>> Emily Deng
>>>>>
>>>>>
>>>>>
>>>>>> -----Original Message-----
>>>>>> From: Liu, Monk <Monk.Liu@amd.com>
>>>>>> Sent: Thursday, December 23, 2021 6:14 PM
>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; Grodzovsky, 
>>>>>> Andrey <Andrey.Grodzovsky@amd.com>; 
>>>>>> dri-devel@lists.freedesktop.org; amd- gfx@lists.freedesktop.org; 
>>>>>> Chen, Horace <Horace.Chen@amd.com>; Chen, JingWen 
>>>>>> <JingWen.Chen2@amd.com>; Deng, Emily <Emily.Deng@amd.com>
>>>>>> Cc: daniel@ffwll.ch
>>>>>> Subject: RE: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset 
>>>>>> protection for SRIOV
>>>>>>
>>>>>> [AMD Official Use Only]
>>>>>>
>>>>>> @Chen, Horace @Chen, JingWen @Deng, Emily
>>>>>>
>>>>>> Please take a review on Andrey's patch
>>>>>>
>>>>>> Thanks
>>>>>> -----------------------------------------------------------------
>>>>>> -- Monk Liu | Cloud GPU & Virtualization Solution | AMD
>>>>>> -----------------------------------------------------------------
>>>>>> -- we are hiring software manager for CVS core team
>>>>>> -----------------------------------------------------------------
>>>>>> --
>>>>>>
>>>>>> -----Original Message-----
>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>> Sent: Thursday, December 23, 2021 4:42 PM
>>>>>> To: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; dri- 
>>>>>> devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
>>>>>> Cc: daniel@ffwll.ch; Liu, Monk <Monk.Liu@amd.com>; Chen, Horace 
>>>>>> <Horace.Chen@amd.com>
>>>>>> Subject: Re: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset 
>>>>>> protection for SRIOV
>>>>>>
>>>>>> Am 22.12.21 um 23:14 schrieb Andrey Grodzovsky:
>>>>>>> Since now flr work is serialized against  GPU resets there is no 
>>>>>>> need for this.
>>>>>>>
>>>>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>>>>> Acked-by: Christian König <christian.koenig@amd.com>
>>>>>>
>>>>>>> ---
>>>>>>>     drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 11 -----------
>>>>>>>     drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 11 -----------
>>>>>>>     2 files changed, 22 deletions(-)
>>>>>>>
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>> index 487cd654b69e..7d59a66e3988 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>> @@ -248,15 +248,7 @@ static void xgpu_ai_mailbox_flr_work(struct
>>>>>> work_struct *work)
>>>>>>>         struct amdgpu_device *adev = container_of(virt, struct
>>>>>> amdgpu_device, virt);
>>>>>>>         int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>
>>>>>>> -    /* block amdgpu_gpu_recover till msg FLR COMPLETE received,
>>>>>>> -     * otherwise the mailbox msg will be ruined/reseted by
>>>>>>> -     * the VF FLR.
>>>>>>> -     */
>>>>>>> -    if (!down_write_trylock(&adev->reset_sem))
>>>>>>> -        return;
>>>>>>> -
>>>>>>>         amdgpu_virt_fini_data_exchange(adev);
>>>>>>> -    atomic_set(&adev->in_gpu_reset, 1);
>>>>>>>
>>>>>>>         xgpu_ai_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0, 
>>>>>>> 0, 0);
>>>>>>>
>>>>>>> @@ -269,9 +261,6 @@ static void xgpu_ai_mailbox_flr_work(struct
>>>>>> work_struct *work)
>>>>>>>         } while (timeout > 1);
>>>>>>>
>>>>>>>     flr_done:
>>>>>>> -    atomic_set(&adev->in_gpu_reset, 0);
>>>>>>> -    up_write(&adev->reset_sem);
>>>>>>> -
>>>>>>>         /* Trigger recovery for world switch failure if no TDR 
>>>>>>> */
>>>>>>>         if (amdgpu_device_should_recover_gpu(adev)
>>>>>>>             && (!amdgpu_device_has_job_running(adev) || diff 
>>>>>>> --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>> index e3869067a31d..f82c066c8e8d 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>> @@ -277,15 +277,7 @@ static void xgpu_nv_mailbox_flr_work(struct
>>>>>> work_struct *work)
>>>>>>>         struct amdgpu_device *adev = container_of(virt, struct
>>>>>> amdgpu_device, virt);
>>>>>>>         int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>
>>>>>>> -    /* block amdgpu_gpu_recover till msg FLR COMPLETE received,
>>>>>>> -     * otherwise the mailbox msg will be ruined/reseted by
>>>>>>> -     * the VF FLR.
>>>>>>> -     */
>>>>>>> -    if (!down_write_trylock(&adev->reset_sem))
>>>>>>> -        return;
>>>>>>> -
>>>>>>>         amdgpu_virt_fini_data_exchange(adev);
>>>>>>> -    atomic_set(&adev->in_gpu_reset, 1);
>>>>>>>
>>>>>>>         xgpu_nv_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0, 
>>>>>>> 0, 0);
>>>>>>>
>>>>>>> @@ -298,9 +290,6 @@ static void xgpu_nv_mailbox_flr_work(struct
>>>>>> work_struct *work)
>>>>>>>         } while (timeout > 1);
>>>>>>>
>>>>>>>     flr_done:
>>>>>>> -    atomic_set(&adev->in_gpu_reset, 0);
>>>>>>> -    up_write(&adev->reset_sem);
>>>>>>> -
>>>>>>>         /* Trigger recovery for world switch failure if no TDR 
>>>>>>> */
>>>>>>>         if (amdgpu_device_should_recover_gpu(adev)
>>>>>>>             && (!amdgpu_device_has_job_running(adev) ||
Christian König Jan. 4, 2022, 11:36 a.m. UTC | #12
Am 04.01.22 um 11:49 schrieb Liu, Monk:
> [AMD Official Use Only]
>
>>> See the FLR request from the hypervisor is just another source of signaling the need for a reset, similar to each job timeout on each queue. Otherwise you have a race condition between the hypervisor and the scheduler.
> No it's not, FLR from hypervisor is just to notify guest the hw VF FLR is about to start or was already executed, but host will do FLR anyway without waiting for guest too long
>

Then we have a major design issue in the SRIOV protocol and really need 
to question this.

How do you want to prevent a race between the hypervisor resetting the 
hardware and the client trying the same because of a timeout?

As far as I can see the procedure should be:
1. We detect that a reset is necessary, either because of a fault a 
timeout or signal from hypervisor.
2. For each of those potential reset sources a work item is send to the 
single workqueue.
3. One of those work items execute first and prepares the reset.
4. We either do the reset our self or notify the hypervisor that we are 
ready for the reset.
5. Cleanup after the reset, eventually resubmit jobs etc..
6. Cancel work items which might have been scheduled from other reset 
sources.

It does make sense that the hypervisor resets the hardware without 
waiting for the clients for too long, but if we don't follow this 
general steps we will always have a race between the different components.

Regards,
Christian.

Am 04.01.22 um 11:49 schrieb Liu, Monk:
> [AMD Official Use Only]
>
>>> See the FLR request from the hypervisor is just another source of signaling the need for a reset, similar to each job timeout on each queue. Otherwise you have a race condition between the hypervisor and the scheduler.
> No it's not, FLR from hypervisor is just to notify guest the hw VF FLR is about to start or was already executed, but host will do FLR anyway without waiting for guest too long
>
>>> In other words I strongly think that the current SRIOV reset implementation is severely broken and what Andrey is doing is actually fixing it.
> It makes the code to crash ... how could it be a fix ?
>
> I'm afraid the patch is NAK from me,  but it is welcome if the cleanup do not ruin the logic, Andry or jingwen can try it if needed.
>
> Thanks
> -------------------------------------------------------------------
> Monk Liu | Cloud GPU & Virtualization Solution | AMD
> -------------------------------------------------------------------
> we are hiring software manager for CVS core team
> -------------------------------------------------------------------
>
> -----Original Message-----
> From: Koenig, Christian <Christian.Koenig@amd.com>
> Sent: Tuesday, January 4, 2022 6:19 PM
> To: Chen, JingWen <JingWen.Chen2@amd.com>; Christian König <ckoenig.leichtzumerken@gmail.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Deng, Emily <Emily.Deng@amd.com>; Liu, Monk <Monk.Liu@amd.com>; dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org; Chen, Horace <Horace.Chen@amd.com>; Chen, JingWen <JingWen.Chen2@amd.com>
> Cc: daniel@ffwll.ch
> Subject: Re: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset protection for SRIOV
>
> Hi Jingwen,
>
> well what I mean is that we need to adjust the implementation in amdgpu to actually match the requirements.
>
> Could be that the reset sequence is questionable in general, but I doubt so at least for now.
>
> See the FLR request from the hypervisor is just another source of signaling the need for a reset, similar to each job timeout on each queue. Otherwise you have a race condition between the hypervisor and the scheduler.
>
> Properly setting in_gpu_reset is indeed mandatory, but should happen at a central place and not in the SRIOV specific code.
>
> In other words I strongly think that the current SRIOV reset implementation is severely broken and what Andrey is doing is actually fixing it.
>
> Regards,
> Christian.
>
> Am 04.01.22 um 10:07 schrieb JingWen Chen:
>> Hi Christian,
>> I'm not sure what do you mean by "we need to change SRIOV not the driver".
>>
>> Do you mean we should change the reset sequence in SRIOV? This will be a huge change for our SRIOV solution.
>>
>>   From my point of view, we can directly use amdgpu_device_lock_adev
>> and amdgpu_device_unlock_adev in flr_work instead of try_lock since no one will conflict with this thread with reset_domain introduced.
>> But we do need the reset_sem and adev->in_gpu_reset to keep device untouched via user space.
>>
>> Best Regards,
>> Jingwen Chen
>>
>> On 2022/1/3 下午6:17, Christian König wrote:
>>> Please don't. This patch is vital to the cleanup of the reset procedure.
>>>
>>> If SRIOV doesn't work with that we need to change SRIOV and not the driver.
>>>
>>> Christian.
>>>
>>> Am 30.12.21 um 19:45 schrieb Andrey Grodzovsky:
>>>> Sure, I guess i can drop this patch then.
>>>>
>>>> Andrey
>>>>
>>>> On 2021-12-24 4:57 a.m., JingWen Chen wrote:
>>>>> I do agree with shaoyun, if the host find the gpu engine hangs first, and do the flr, guest side thread may not know this and still try to access HW(e.g. kfd is using a lot of amdgpu_in_reset and reset_sem to identify the reset status). And this may lead to very bad result.
>>>>>
>>>>> On 2021/12/24 下午4:58, Deng, Emily wrote:
>>>>>> These patches look good to me. JingWen will pull these patches and do some basic TDR test on sriov environment, and give feedback.
>>>>>>
>>>>>> Best wishes
>>>>>> Emily Deng
>>>>>>
>>>>>>
>>>>>>
>>>>>>> -----Original Message-----
>>>>>>> From: Liu, Monk <Monk.Liu@amd.com>
>>>>>>> Sent: Thursday, December 23, 2021 6:14 PM
>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; Grodzovsky,
>>>>>>> Andrey <Andrey.Grodzovsky@amd.com>;
>>>>>>> dri-devel@lists.freedesktop.org; amd- gfx@lists.freedesktop.org;
>>>>>>> Chen, Horace <Horace.Chen@amd.com>; Chen, JingWen
>>>>>>> <JingWen.Chen2@amd.com>; Deng, Emily <Emily.Deng@amd.com>
>>>>>>> Cc: daniel@ffwll.ch
>>>>>>> Subject: RE: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset
>>>>>>> protection for SRIOV
>>>>>>>
>>>>>>> [AMD Official Use Only]
>>>>>>>
>>>>>>> @Chen, Horace @Chen, JingWen @Deng, Emily
>>>>>>>
>>>>>>> Please take a review on Andrey's patch
>>>>>>>
>>>>>>> Thanks
>>>>>>> -----------------------------------------------------------------
>>>>>>> -- Monk Liu | Cloud GPU & Virtualization Solution | AMD
>>>>>>> -----------------------------------------------------------------
>>>>>>> -- we are hiring software manager for CVS core team
>>>>>>> -----------------------------------------------------------------
>>>>>>> --
>>>>>>>
>>>>>>> -----Original Message-----
>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>> Sent: Thursday, December 23, 2021 4:42 PM
>>>>>>> To: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; dri-
>>>>>>> devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
>>>>>>> Cc: daniel@ffwll.ch; Liu, Monk <Monk.Liu@amd.com>; Chen, Horace
>>>>>>> <Horace.Chen@amd.com>
>>>>>>> Subject: Re: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset
>>>>>>> protection for SRIOV
>>>>>>>
>>>>>>> Am 22.12.21 um 23:14 schrieb Andrey Grodzovsky:
>>>>>>>> Since now flr work is serialized against  GPU resets there is no
>>>>>>>> need for this.
>>>>>>>>
>>>>>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>>>>>> Acked-by: Christian König <christian.koenig@amd.com>
>>>>>>>
>>>>>>>> ---
>>>>>>>>      drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 11 -----------
>>>>>>>>      drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 11 -----------
>>>>>>>>      2 files changed, 22 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>> index 487cd654b69e..7d59a66e3988 100644
>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>> @@ -248,15 +248,7 @@ static void xgpu_ai_mailbox_flr_work(struct
>>>>>>> work_struct *work)
>>>>>>>>          struct amdgpu_device *adev = container_of(virt, struct
>>>>>>> amdgpu_device, virt);
>>>>>>>>          int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>
>>>>>>>> -    /* block amdgpu_gpu_recover till msg FLR COMPLETE received,
>>>>>>>> -     * otherwise the mailbox msg will be ruined/reseted by
>>>>>>>> -     * the VF FLR.
>>>>>>>> -     */
>>>>>>>> -    if (!down_write_trylock(&adev->reset_sem))
>>>>>>>> -        return;
>>>>>>>> -
>>>>>>>>          amdgpu_virt_fini_data_exchange(adev);
>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 1);
>>>>>>>>
>>>>>>>>          xgpu_ai_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0,
>>>>>>>> 0, 0);
>>>>>>>>
>>>>>>>> @@ -269,9 +261,6 @@ static void xgpu_ai_mailbox_flr_work(struct
>>>>>>> work_struct *work)
>>>>>>>>          } while (timeout > 1);
>>>>>>>>
>>>>>>>>      flr_done:
>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 0);
>>>>>>>> -    up_write(&adev->reset_sem);
>>>>>>>> -
>>>>>>>>          /* Trigger recovery for world switch failure if no TDR
>>>>>>>> */
>>>>>>>>          if (amdgpu_device_should_recover_gpu(adev)
>>>>>>>>              && (!amdgpu_device_has_job_running(adev) || diff
>>>>>>>> --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>> index e3869067a31d..f82c066c8e8d 100644
>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>> @@ -277,15 +277,7 @@ static void xgpu_nv_mailbox_flr_work(struct
>>>>>>> work_struct *work)
>>>>>>>>          struct amdgpu_device *adev = container_of(virt, struct
>>>>>>> amdgpu_device, virt);
>>>>>>>>          int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>
>>>>>>>> -    /* block amdgpu_gpu_recover till msg FLR COMPLETE received,
>>>>>>>> -     * otherwise the mailbox msg will be ruined/reseted by
>>>>>>>> -     * the VF FLR.
>>>>>>>> -     */
>>>>>>>> -    if (!down_write_trylock(&adev->reset_sem))
>>>>>>>> -        return;
>>>>>>>> -
>>>>>>>>          amdgpu_virt_fini_data_exchange(adev);
>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 1);
>>>>>>>>
>>>>>>>>          xgpu_nv_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0,
>>>>>>>> 0, 0);
>>>>>>>>
>>>>>>>> @@ -298,9 +290,6 @@ static void xgpu_nv_mailbox_flr_work(struct
>>>>>>> work_struct *work)
>>>>>>>>          } while (timeout > 1);
>>>>>>>>
>>>>>>>>      flr_done:
>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 0);
>>>>>>>> -    up_write(&adev->reset_sem);
>>>>>>>> -
>>>>>>>>          /* Trigger recovery for world switch failure if no TDR
>>>>>>>> */
>>>>>>>>          if (amdgpu_device_should_recover_gpu(adev)
>>>>>>>>              && (!amdgpu_device_has_job_running(adev) ||
Andrey Grodzovsky Jan. 4, 2022, 4:56 p.m. UTC | #13
On 2022-01-04 6:36 a.m., Christian König wrote:
> Am 04.01.22 um 11:49 schrieb Liu, Monk:
>> [AMD Official Use Only]
>>
>>>> See the FLR request from the hypervisor is just another source of 
>>>> signaling the need for a reset, similar to each job timeout on each 
>>>> queue. Otherwise you have a race condition between the hypervisor 
>>>> and the scheduler.
>> No it's not, FLR from hypervisor is just to notify guest the hw VF 
>> FLR is about to start or was already executed, but host will do FLR 
>> anyway without waiting for guest too long
>>
>
> Then we have a major design issue in the SRIOV protocol and really 
> need to question this.
>
> How do you want to prevent a race between the hypervisor resetting the 
> hardware and the client trying the same because of a timeout?
>
> As far as I can see the procedure should be:
> 1. We detect that a reset is necessary, either because of a fault a 
> timeout or signal from hypervisor.
> 2. For each of those potential reset sources a work item is send to 
> the single workqueue.
> 3. One of those work items execute first and prepares the reset.
> 4. We either do the reset our self or notify the hypervisor that we 
> are ready for the reset.
> 5. Cleanup after the reset, eventually resubmit jobs etc..
> 6. Cancel work items which might have been scheduled from other reset 
> sources.
>
> It does make sense that the hypervisor resets the hardware without 
> waiting for the clients for too long, but if we don't follow this 
> general steps we will always have a race between the different 
> components.


Monk, just to add to this - if indeed as you say that 'FLR from 
hypervisor is just to notify guest the hw VF FLR is about to start or 
was already executed, but host will do FLR anyway without waiting for 
guest too long'
and there is no strict waiting from the hypervisor for 
IDH_READY_TO_RESET to be recived from guest before starting the reset 
then setting in_gpu_reset and locking reset_sem from guest side is not 
really full proof
protection from MMIO accesses by the guest - it only truly helps if 
hypervisor waits for that message before initiation of HW reset.

Andrey


>
> Regards,
> Christian.
>
> Am 04.01.22 um 11:49 schrieb Liu, Monk:
>> [AMD Official Use Only]
>>
>>>> See the FLR request from the hypervisor is just another source of 
>>>> signaling the need for a reset, similar to each job timeout on each 
>>>> queue. Otherwise you have a race condition between the hypervisor 
>>>> and the scheduler.
>> No it's not, FLR from hypervisor is just to notify guest the hw VF 
>> FLR is about to start or was already executed, but host will do FLR 
>> anyway without waiting for guest too long
>>
>>>> In other words I strongly think that the current SRIOV reset 
>>>> implementation is severely broken and what Andrey is doing is 
>>>> actually fixing it.
>> It makes the code to crash ... how could it be a fix ?
>>
>> I'm afraid the patch is NAK from me,  but it is welcome if the 
>> cleanup do not ruin the logic, Andry or jingwen can try it if needed.
>>
>> Thanks
>> -------------------------------------------------------------------
>> Monk Liu | Cloud GPU & Virtualization Solution | AMD
>> -------------------------------------------------------------------
>> we are hiring software manager for CVS core team
>> -------------------------------------------------------------------
>>
>> -----Original Message-----
>> From: Koenig, Christian <Christian.Koenig@amd.com>
>> Sent: Tuesday, January 4, 2022 6:19 PM
>> To: Chen, JingWen <JingWen.Chen2@amd.com>; Christian König 
>> <ckoenig.leichtzumerken@gmail.com>; Grodzovsky, Andrey 
>> <Andrey.Grodzovsky@amd.com>; Deng, Emily <Emily.Deng@amd.com>; Liu, 
>> Monk <Monk.Liu@amd.com>; dri-devel@lists.freedesktop.org; 
>> amd-gfx@lists.freedesktop.org; Chen, Horace <Horace.Chen@amd.com>; 
>> Chen, JingWen <JingWen.Chen2@amd.com>
>> Cc: daniel@ffwll.ch
>> Subject: Re: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset 
>> protection for SRIOV
>>
>> Hi Jingwen,
>>
>> well what I mean is that we need to adjust the implementation in 
>> amdgpu to actually match the requirements.
>>
>> Could be that the reset sequence is questionable in general, but I 
>> doubt so at least for now.
>>
>> See the FLR request from the hypervisor is just another source of 
>> signaling the need for a reset, similar to each job timeout on each 
>> queue. Otherwise you have a race condition between the hypervisor and 
>> the scheduler.
>>
>> Properly setting in_gpu_reset is indeed mandatory, but should happen 
>> at a central place and not in the SRIOV specific code.
>>
>> In other words I strongly think that the current SRIOV reset 
>> implementation is severely broken and what Andrey is doing is 
>> actually fixing it.
>>
>> Regards,
>> Christian.
>>
>> Am 04.01.22 um 10:07 schrieb JingWen Chen:
>>> Hi Christian,
>>> I'm not sure what do you mean by "we need to change SRIOV not the 
>>> driver".
>>>
>>> Do you mean we should change the reset sequence in SRIOV? This will 
>>> be a huge change for our SRIOV solution.
>>>
>>>   From my point of view, we can directly use amdgpu_device_lock_adev
>>> and amdgpu_device_unlock_adev in flr_work instead of try_lock since 
>>> no one will conflict with this thread with reset_domain introduced.
>>> But we do need the reset_sem and adev->in_gpu_reset to keep device 
>>> untouched via user space.
>>>
>>> Best Regards,
>>> Jingwen Chen
>>>
>>> On 2022/1/3 下午6:17, Christian König wrote:
>>>> Please don't. This patch is vital to the cleanup of the reset 
>>>> procedure.
>>>>
>>>> If SRIOV doesn't work with that we need to change SRIOV and not the 
>>>> driver.
>>>>
>>>> Christian.
>>>>
>>>> Am 30.12.21 um 19:45 schrieb Andrey Grodzovsky:
>>>>> Sure, I guess i can drop this patch then.
>>>>>
>>>>> Andrey
>>>>>
>>>>> On 2021-12-24 4:57 a.m., JingWen Chen wrote:
>>>>>> I do agree with shaoyun, if the host find the gpu engine hangs 
>>>>>> first, and do the flr, guest side thread may not know this and 
>>>>>> still try to access HW(e.g. kfd is using a lot of amdgpu_in_reset 
>>>>>> and reset_sem to identify the reset status). And this may lead to 
>>>>>> very bad result.
>>>>>>
>>>>>> On 2021/12/24 下午4:58, Deng, Emily wrote:
>>>>>>> These patches look good to me. JingWen will pull these patches 
>>>>>>> and do some basic TDR test on sriov environment, and give feedback.
>>>>>>>
>>>>>>> Best wishes
>>>>>>> Emily Deng
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>> -----Original Message-----
>>>>>>>> From: Liu, Monk <Monk.Liu@amd.com>
>>>>>>>> Sent: Thursday, December 23, 2021 6:14 PM
>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; Grodzovsky,
>>>>>>>> Andrey <Andrey.Grodzovsky@amd.com>;
>>>>>>>> dri-devel@lists.freedesktop.org; amd- gfx@lists.freedesktop.org;
>>>>>>>> Chen, Horace <Horace.Chen@amd.com>; Chen, JingWen
>>>>>>>> <JingWen.Chen2@amd.com>; Deng, Emily <Emily.Deng@amd.com>
>>>>>>>> Cc: daniel@ffwll.ch
>>>>>>>> Subject: RE: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset
>>>>>>>> protection for SRIOV
>>>>>>>>
>>>>>>>> [AMD Official Use Only]
>>>>>>>>
>>>>>>>> @Chen, Horace @Chen, JingWen @Deng, Emily
>>>>>>>>
>>>>>>>> Please take a review on Andrey's patch
>>>>>>>>
>>>>>>>> Thanks
>>>>>>>> -----------------------------------------------------------------
>>>>>>>> -- Monk Liu | Cloud GPU & Virtualization Solution | AMD
>>>>>>>> -----------------------------------------------------------------
>>>>>>>> -- we are hiring software manager for CVS core team
>>>>>>>> -----------------------------------------------------------------
>>>>>>>> -- 
>>>>>>>>
>>>>>>>> -----Original Message-----
>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>> Sent: Thursday, December 23, 2021 4:42 PM
>>>>>>>> To: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; dri-
>>>>>>>> devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
>>>>>>>> Cc: daniel@ffwll.ch; Liu, Monk <Monk.Liu@amd.com>; Chen, Horace
>>>>>>>> <Horace.Chen@amd.com>
>>>>>>>> Subject: Re: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset
>>>>>>>> protection for SRIOV
>>>>>>>>
>>>>>>>> Am 22.12.21 um 23:14 schrieb Andrey Grodzovsky:
>>>>>>>>> Since now flr work is serialized against  GPU resets there is no
>>>>>>>>> need for this.
>>>>>>>>>
>>>>>>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>>>>>>> Acked-by: Christian König <christian.koenig@amd.com>
>>>>>>>>
>>>>>>>>> ---
>>>>>>>>>      drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 11 -----------
>>>>>>>>>      drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 11 -----------
>>>>>>>>>      2 files changed, 22 deletions(-)
>>>>>>>>>
>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>> index 487cd654b69e..7d59a66e3988 100644
>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>> @@ -248,15 +248,7 @@ static void xgpu_ai_mailbox_flr_work(struct
>>>>>>>> work_struct *work)
>>>>>>>>>          struct amdgpu_device *adev = container_of(virt, struct
>>>>>>>> amdgpu_device, virt);
>>>>>>>>>          int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>
>>>>>>>>> -    /* block amdgpu_gpu_recover till msg FLR COMPLETE received,
>>>>>>>>> -     * otherwise the mailbox msg will be ruined/reseted by
>>>>>>>>> -     * the VF FLR.
>>>>>>>>> -     */
>>>>>>>>> -    if (!down_write_trylock(&adev->reset_sem))
>>>>>>>>> -        return;
>>>>>>>>> -
>>>>>>>>>          amdgpu_virt_fini_data_exchange(adev);
>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 1);
>>>>>>>>>
>>>>>>>>>          xgpu_ai_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0,
>>>>>>>>> 0, 0);
>>>>>>>>>
>>>>>>>>> @@ -269,9 +261,6 @@ static void xgpu_ai_mailbox_flr_work(struct
>>>>>>>> work_struct *work)
>>>>>>>>>          } while (timeout > 1);
>>>>>>>>>
>>>>>>>>>      flr_done:
>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 0);
>>>>>>>>> -    up_write(&adev->reset_sem);
>>>>>>>>> -
>>>>>>>>>          /* Trigger recovery for world switch failure if no TDR
>>>>>>>>> */
>>>>>>>>>          if (amdgpu_device_should_recover_gpu(adev)
>>>>>>>>>              && (!amdgpu_device_has_job_running(adev) || diff
>>>>>>>>> --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>> index e3869067a31d..f82c066c8e8d 100644
>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>> @@ -277,15 +277,7 @@ static void xgpu_nv_mailbox_flr_work(struct
>>>>>>>> work_struct *work)
>>>>>>>>>          struct amdgpu_device *adev = container_of(virt, struct
>>>>>>>> amdgpu_device, virt);
>>>>>>>>>          int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>
>>>>>>>>> -    /* block amdgpu_gpu_recover till msg FLR COMPLETE received,
>>>>>>>>> -     * otherwise the mailbox msg will be ruined/reseted by
>>>>>>>>> -     * the VF FLR.
>>>>>>>>> -     */
>>>>>>>>> -    if (!down_write_trylock(&adev->reset_sem))
>>>>>>>>> -        return;
>>>>>>>>> -
>>>>>>>>>          amdgpu_virt_fini_data_exchange(adev);
>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 1);
>>>>>>>>>
>>>>>>>>>          xgpu_nv_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0,
>>>>>>>>> 0, 0);
>>>>>>>>>
>>>>>>>>> @@ -298,9 +290,6 @@ static void xgpu_nv_mailbox_flr_work(struct
>>>>>>>> work_struct *work)
>>>>>>>>>          } while (timeout > 1);
>>>>>>>>>
>>>>>>>>>      flr_done:
>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 0);
>>>>>>>>> -    up_write(&adev->reset_sem);
>>>>>>>>> -
>>>>>>>>>          /* Trigger recovery for world switch failure if no TDR
>>>>>>>>> */
>>>>>>>>>          if (amdgpu_device_should_recover_gpu(adev)
>>>>>>>>>              && (!amdgpu_device_has_job_running(adev) ||
>
Liu, Shaoyun Jan. 4, 2022, 5:13 p.m. UTC | #14
[AMD Official Use Only]

I mostly agree with the sequences Christian  described .  Just one  thing might need to  discuss here.  For FLR notified from host,  in new sequenceas described  , driver need to reply the  READY_TO_RESET in the  workitem  from a reset  work queue which means inside flr_work, driver can not directly reply to host but need to queue another workqueue . For current  code ,  the flr_work for sriov itself is a work queue queued from ISR .  I think we should try to response to the host driver as soon as possible.  Queue another workqueue  inside  the workqueue  doesn't sounds efficient to me.  
Anyway, what we need is a working  solution for our project.  So if we need to change the sequence, we  need to make sure it's been tested first and won't break the functionality before the code is landed in the branch . 

Regards
Shaoyun.liu


-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Christian König
Sent: Tuesday, January 4, 2022 6:36 AM
To: Liu, Monk <Monk.Liu@amd.com>; Chen, JingWen <JingWen.Chen2@amd.com>; Christian König <ckoenig.leichtzumerken@gmail.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Deng, Emily <Emily.Deng@amd.com>; dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org; Chen, Horace <Horace.Chen@amd.com>
Cc: daniel@ffwll.ch
Subject: Re: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset protection for SRIOV

Am 04.01.22 um 11:49 schrieb Liu, Monk:
> [AMD Official Use Only]
>
>>> See the FLR request from the hypervisor is just another source of signaling the need for a reset, similar to each job timeout on each queue. Otherwise you have a race condition between the hypervisor and the scheduler.
> No it's not, FLR from hypervisor is just to notify guest the hw VF FLR 
> is about to start or was already executed, but host will do FLR anyway 
> without waiting for guest too long
>

Then we have a major design issue in the SRIOV protocol and really need to question this.

How do you want to prevent a race between the hypervisor resetting the hardware and the client trying the same because of a timeout?

As far as I can see the procedure should be:
1. We detect that a reset is necessary, either because of a fault a timeout or signal from hypervisor.
2. For each of those potential reset sources a work item is send to the single workqueue.
3. One of those work items execute first and prepares the reset.
4. We either do the reset our self or notify the hypervisor that we are ready for the reset.
5. Cleanup after the reset, eventually resubmit jobs etc..
6. Cancel work items which might have been scheduled from other reset sources.

It does make sense that the hypervisor resets the hardware without waiting for the clients for too long, but if we don't follow this general steps we will always have a race between the different components.

Regards,
Christian.

Am 04.01.22 um 11:49 schrieb Liu, Monk:
> [AMD Official Use Only]
>
>>> See the FLR request from the hypervisor is just another source of signaling the need for a reset, similar to each job timeout on each queue. Otherwise you have a race condition between the hypervisor and the scheduler.
> No it's not, FLR from hypervisor is just to notify guest the hw VF FLR 
> is about to start or was already executed, but host will do FLR anyway 
> without waiting for guest too long
>
>>> In other words I strongly think that the current SRIOV reset implementation is severely broken and what Andrey is doing is actually fixing it.
> It makes the code to crash ... how could it be a fix ?
>
> I'm afraid the patch is NAK from me,  but it is welcome if the cleanup do not ruin the logic, Andry or jingwen can try it if needed.
>
> Thanks
> -------------------------------------------------------------------
> Monk Liu | Cloud GPU & Virtualization Solution | AMD
> -------------------------------------------------------------------
> we are hiring software manager for CVS core team
> -------------------------------------------------------------------
>
> -----Original Message-----
> From: Koenig, Christian <Christian.Koenig@amd.com>
> Sent: Tuesday, January 4, 2022 6:19 PM
> To: Chen, JingWen <JingWen.Chen2@amd.com>; Christian König 
> <ckoenig.leichtzumerken@gmail.com>; Grodzovsky, Andrey 
> <Andrey.Grodzovsky@amd.com>; Deng, Emily <Emily.Deng@amd.com>; Liu, 
> Monk <Monk.Liu@amd.com>; dri-devel@lists.freedesktop.org; 
> amd-gfx@lists.freedesktop.org; Chen, Horace <Horace.Chen@amd.com>; 
> Chen, JingWen <JingWen.Chen2@amd.com>
> Cc: daniel@ffwll.ch
> Subject: Re: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset 
> protection for SRIOV
>
> Hi Jingwen,
>
> well what I mean is that we need to adjust the implementation in amdgpu to actually match the requirements.
>
> Could be that the reset sequence is questionable in general, but I doubt so at least for now.
>
> See the FLR request from the hypervisor is just another source of signaling the need for a reset, similar to each job timeout on each queue. Otherwise you have a race condition between the hypervisor and the scheduler.
>
> Properly setting in_gpu_reset is indeed mandatory, but should happen at a central place and not in the SRIOV specific code.
>
> In other words I strongly think that the current SRIOV reset implementation is severely broken and what Andrey is doing is actually fixing it.
>
> Regards,
> Christian.
>
> Am 04.01.22 um 10:07 schrieb JingWen Chen:
>> Hi Christian,
>> I'm not sure what do you mean by "we need to change SRIOV not the driver".
>>
>> Do you mean we should change the reset sequence in SRIOV? This will be a huge change for our SRIOV solution.
>>
>>   From my point of view, we can directly use amdgpu_device_lock_adev 
>> and amdgpu_device_unlock_adev in flr_work instead of try_lock since no one will conflict with this thread with reset_domain introduced.
>> But we do need the reset_sem and adev->in_gpu_reset to keep device untouched via user space.
>>
>> Best Regards,
>> Jingwen Chen
>>
>> On 2022/1/3 下午6:17, Christian König wrote:
>>> Please don't. This patch is vital to the cleanup of the reset procedure.
>>>
>>> If SRIOV doesn't work with that we need to change SRIOV and not the driver.
>>>
>>> Christian.
>>>
>>> Am 30.12.21 um 19:45 schrieb Andrey Grodzovsky:
>>>> Sure, I guess i can drop this patch then.
>>>>
>>>> Andrey
>>>>
>>>> On 2021-12-24 4:57 a.m., JingWen Chen wrote:
>>>>> I do agree with shaoyun, if the host find the gpu engine hangs first, and do the flr, guest side thread may not know this and still try to access HW(e.g. kfd is using a lot of amdgpu_in_reset and reset_sem to identify the reset status). And this may lead to very bad result.
>>>>>
>>>>> On 2021/12/24 下午4:58, Deng, Emily wrote:
>>>>>> These patches look good to me. JingWen will pull these patches and do some basic TDR test on sriov environment, and give feedback.
>>>>>>
>>>>>> Best wishes
>>>>>> Emily Deng
>>>>>>
>>>>>>
>>>>>>
>>>>>>> -----Original Message-----
>>>>>>> From: Liu, Monk <Monk.Liu@amd.com>
>>>>>>> Sent: Thursday, December 23, 2021 6:14 PM
>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; Grodzovsky, 
>>>>>>> Andrey <Andrey.Grodzovsky@amd.com>; 
>>>>>>> dri-devel@lists.freedesktop.org; amd- gfx@lists.freedesktop.org; 
>>>>>>> Chen, Horace <Horace.Chen@amd.com>; Chen, JingWen 
>>>>>>> <JingWen.Chen2@amd.com>; Deng, Emily <Emily.Deng@amd.com>
>>>>>>> Cc: daniel@ffwll.ch
>>>>>>> Subject: RE: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU 
>>>>>>> reset protection for SRIOV
>>>>>>>
>>>>>>> [AMD Official Use Only]
>>>>>>>
>>>>>>> @Chen, Horace @Chen, JingWen @Deng, Emily
>>>>>>>
>>>>>>> Please take a review on Andrey's patch
>>>>>>>
>>>>>>> Thanks
>>>>>>> ----------------------------------------------------------------
>>>>>>> -
>>>>>>> -- Monk Liu | Cloud GPU & Virtualization Solution | AMD
>>>>>>> ----------------------------------------------------------------
>>>>>>> -
>>>>>>> -- we are hiring software manager for CVS core team
>>>>>>> ----------------------------------------------------------------
>>>>>>> -
>>>>>>> --
>>>>>>>
>>>>>>> -----Original Message-----
>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>> Sent: Thursday, December 23, 2021 4:42 PM
>>>>>>> To: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; dri- 
>>>>>>> devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
>>>>>>> Cc: daniel@ffwll.ch; Liu, Monk <Monk.Liu@amd.com>; Chen, Horace 
>>>>>>> <Horace.Chen@amd.com>
>>>>>>> Subject: Re: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU 
>>>>>>> reset protection for SRIOV
>>>>>>>
>>>>>>> Am 22.12.21 um 23:14 schrieb Andrey Grodzovsky:
>>>>>>>> Since now flr work is serialized against  GPU resets there is 
>>>>>>>> no need for this.
>>>>>>>>
>>>>>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>>>>>> Acked-by: Christian König <christian.koenig@amd.com>
>>>>>>>
>>>>>>>> ---
>>>>>>>>      drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 11 -----------
>>>>>>>>      drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 11 -----------
>>>>>>>>      2 files changed, 22 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>> index 487cd654b69e..7d59a66e3988 100644
>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>> @@ -248,15 +248,7 @@ static void 
>>>>>>>> xgpu_ai_mailbox_flr_work(struct
>>>>>>> work_struct *work)
>>>>>>>>          struct amdgpu_device *adev = container_of(virt, struct
>>>>>>> amdgpu_device, virt);
>>>>>>>>          int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>
>>>>>>>> -    /* block amdgpu_gpu_recover till msg FLR COMPLETE 
>>>>>>>> received,
>>>>>>>> -     * otherwise the mailbox msg will be ruined/reseted by
>>>>>>>> -     * the VF FLR.
>>>>>>>> -     */
>>>>>>>> -    if (!down_write_trylock(&adev->reset_sem))
>>>>>>>> -        return;
>>>>>>>> -
>>>>>>>>          amdgpu_virt_fini_data_exchange(adev);
>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 1);
>>>>>>>>
>>>>>>>>          xgpu_ai_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0, 
>>>>>>>> 0, 0);
>>>>>>>>
>>>>>>>> @@ -269,9 +261,6 @@ static void xgpu_ai_mailbox_flr_work(struct
>>>>>>> work_struct *work)
>>>>>>>>          } while (timeout > 1);
>>>>>>>>
>>>>>>>>      flr_done:
>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 0);
>>>>>>>> -    up_write(&adev->reset_sem);
>>>>>>>> -
>>>>>>>>          /* Trigger recovery for world switch failure if no TDR 
>>>>>>>> */
>>>>>>>>          if (amdgpu_device_should_recover_gpu(adev)
>>>>>>>>              && (!amdgpu_device_has_job_running(adev) || diff 
>>>>>>>> --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>> index e3869067a31d..f82c066c8e8d 100644
>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>> @@ -277,15 +277,7 @@ static void 
>>>>>>>> xgpu_nv_mailbox_flr_work(struct
>>>>>>> work_struct *work)
>>>>>>>>          struct amdgpu_device *adev = container_of(virt, struct
>>>>>>> amdgpu_device, virt);
>>>>>>>>          int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>
>>>>>>>> -    /* block amdgpu_gpu_recover till msg FLR COMPLETE 
>>>>>>>> received,
>>>>>>>> -     * otherwise the mailbox msg will be ruined/reseted by
>>>>>>>> -     * the VF FLR.
>>>>>>>> -     */
>>>>>>>> -    if (!down_write_trylock(&adev->reset_sem))
>>>>>>>> -        return;
>>>>>>>> -
>>>>>>>>          amdgpu_virt_fini_data_exchange(adev);
>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 1);
>>>>>>>>
>>>>>>>>          xgpu_nv_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0, 
>>>>>>>> 0, 0);
>>>>>>>>
>>>>>>>> @@ -298,9 +290,6 @@ static void xgpu_nv_mailbox_flr_work(struct
>>>>>>> work_struct *work)
>>>>>>>>          } while (timeout > 1);
>>>>>>>>
>>>>>>>>      flr_done:
>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 0);
>>>>>>>> -    up_write(&adev->reset_sem);
>>>>>>>> -
>>>>>>>>          /* Trigger recovery for world switch failure if no TDR 
>>>>>>>> */
>>>>>>>>          if (amdgpu_device_should_recover_gpu(adev)
>>>>>>>>              && (!amdgpu_device_has_job_running(adev) ||
Andrey Grodzovsky Jan. 4, 2022, 8:54 p.m. UTC | #15
On 2022-01-04 12:13 p.m., Liu, Shaoyun wrote:

> [AMD Official Use Only]
>
> I mostly agree with the sequences Christian  described .  Just one  thing might need to  discuss here.  For FLR notified from host,  in new sequenceas described  , driver need to reply the  READY_TO_RESET in the  workitem  from a reset  work queue which means inside flr_work, driver can not directly reply to host but need to queue another workqueue .


Can you clarify why 'driver can not directly reply to host but need to 
queue another workqueue' ? To my understating all steps 3-6 in 
Christian's description happen from the same single wq thread serially.


>   For current  code ,  the flr_work for sriov itself is a work queue queued from ISR .  I think we should try to response to the host driver as soon as possible.  Queue another workqueue  inside  the workqueue  doesn't sounds efficient to me.


Check patch 5 please [1] - I just substituted 
schedule_work(&adev->virt.flr_work) for 
queue_work(adev->reset_domain.wq,&adev->virt.flr_work) so no extra 
requeue here, just instead of sending to system_wq it's sent
to dedicated reset wq

[1] - 
https://lore.kernel.org/all/20211222221400.790842-1-andrey.grodzovsky@amd.com/

Andrey


> Anyway, what we need is a working  solution for our project.  So if we need to change the sequence, we  need to make sure it's been tested first and won't break the functionality before the code is landed in the branch .
>
> Regards
> Shaoyun.liu
>
>
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Christian König
> Sent: Tuesday, January 4, 2022 6:36 AM
> To: Liu, Monk <Monk.Liu@amd.com>; Chen, JingWen <JingWen.Chen2@amd.com>; Christian König <ckoenig.leichtzumerken@gmail.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Deng, Emily <Emily.Deng@amd.com>; dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org; Chen, Horace <Horace.Chen@amd.com>
> Cc: daniel@ffwll.ch
> Subject: Re: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset protection for SRIOV
>
> Am 04.01.22 um 11:49 schrieb Liu, Monk:
>> [AMD Official Use Only]
>>
>>>> See the FLR request from the hypervisor is just another source of signaling the need for a reset, similar to each job timeout on each queue. Otherwise you have a race condition between the hypervisor and the scheduler.
>> No it's not, FLR from hypervisor is just to notify guest the hw VF FLR
>> is about to start or was already executed, but host will do FLR anyway
>> without waiting for guest too long
>>
> Then we have a major design issue in the SRIOV protocol and really need to question this.
>
> How do you want to prevent a race between the hypervisor resetting the hardware and the client trying the same because of a timeout?
>
> As far as I can see the procedure should be:
> 1. We detect that a reset is necessary, either because of a fault a timeout or signal from hypervisor.
> 2. For each of those potential reset sources a work item is send to the single workqueue.
> 3. One of those work items execute first and prepares the reset.
> 4. We either do the reset our self or notify the hypervisor that we are ready for the reset.
> 5. Cleanup after the reset, eventually resubmit jobs etc..
> 6. Cancel work items which might have been scheduled from other reset sources.
>
> It does make sense that the hypervisor resets the hardware without waiting for the clients for too long, but if we don't follow this general steps we will always have a race between the different components.
>
> Regards,
> Christian.
>
> Am 04.01.22 um 11:49 schrieb Liu, Monk:
>> [AMD Official Use Only]
>>
>>>> See the FLR request from the hypervisor is just another source of signaling the need for a reset, similar to each job timeout on each queue. Otherwise you have a race condition between the hypervisor and the scheduler.
>> No it's not, FLR from hypervisor is just to notify guest the hw VF FLR
>> is about to start or was already executed, but host will do FLR anyway
>> without waiting for guest too long
>>
>>>> In other words I strongly think that the current SRIOV reset implementation is severely broken and what Andrey is doing is actually fixing it.
>> It makes the code to crash ... how could it be a fix ?
>>
>> I'm afraid the patch is NAK from me,  but it is welcome if the cleanup do not ruin the logic, Andry or jingwen can try it if needed.
>>
>> Thanks
>> -------------------------------------------------------------------
>> Monk Liu | Cloud GPU & Virtualization Solution | AMD
>> -------------------------------------------------------------------
>> we are hiring software manager for CVS core team
>> -------------------------------------------------------------------
>>
>> -----Original Message-----
>> From: Koenig, Christian <Christian.Koenig@amd.com>
>> Sent: Tuesday, January 4, 2022 6:19 PM
>> To: Chen, JingWen <JingWen.Chen2@amd.com>; Christian König
>> <ckoenig.leichtzumerken@gmail.com>; Grodzovsky, Andrey
>> <Andrey.Grodzovsky@amd.com>; Deng, Emily <Emily.Deng@amd.com>; Liu,
>> Monk <Monk.Liu@amd.com>; dri-devel@lists.freedesktop.org;
>> amd-gfx@lists.freedesktop.org; Chen, Horace <Horace.Chen@amd.com>;
>> Chen, JingWen <JingWen.Chen2@amd.com>
>> Cc: daniel@ffwll.ch
>> Subject: Re: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset
>> protection for SRIOV
>>
>> Hi Jingwen,
>>
>> well what I mean is that we need to adjust the implementation in amdgpu to actually match the requirements.
>>
>> Could be that the reset sequence is questionable in general, but I doubt so at least for now.
>>
>> See the FLR request from the hypervisor is just another source of signaling the need for a reset, similar to each job timeout on each queue. Otherwise you have a race condition between the hypervisor and the scheduler.
>>
>> Properly setting in_gpu_reset is indeed mandatory, but should happen at a central place and not in the SRIOV specific code.
>>
>> In other words I strongly think that the current SRIOV reset implementation is severely broken and what Andrey is doing is actually fixing it.
>>
>> Regards,
>> Christian.
>>
>> Am 04.01.22 um 10:07 schrieb JingWen Chen:
>>> Hi Christian,
>>> I'm not sure what do you mean by "we need to change SRIOV not the driver".
>>>
>>> Do you mean we should change the reset sequence in SRIOV? This will be a huge change for our SRIOV solution.
>>>
>>>    From my point of view, we can directly use amdgpu_device_lock_adev
>>> and amdgpu_device_unlock_adev in flr_work instead of try_lock since no one will conflict with this thread with reset_domain introduced.
>>> But we do need the reset_sem and adev->in_gpu_reset to keep device untouched via user space.
>>>
>>> Best Regards,
>>> Jingwen Chen
>>>
>>> On 2022/1/3 下午6:17, Christian König wrote:
>>>> Please don't. This patch is vital to the cleanup of the reset procedure.
>>>>
>>>> If SRIOV doesn't work with that we need to change SRIOV and not the driver.
>>>>
>>>> Christian.
>>>>
>>>> Am 30.12.21 um 19:45 schrieb Andrey Grodzovsky:
>>>>> Sure, I guess i can drop this patch then.
>>>>>
>>>>> Andrey
>>>>>
>>>>> On 2021-12-24 4:57 a.m., JingWen Chen wrote:
>>>>>> I do agree with shaoyun, if the host find the gpu engine hangs first, and do the flr, guest side thread may not know this and still try to access HW(e.g. kfd is using a lot of amdgpu_in_reset and reset_sem to identify the reset status). And this may lead to very bad result.
>>>>>>
>>>>>> On 2021/12/24 下午4:58, Deng, Emily wrote:
>>>>>>> These patches look good to me. JingWen will pull these patches and do some basic TDR test on sriov environment, and give feedback.
>>>>>>>
>>>>>>> Best wishes
>>>>>>> Emily Deng
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>> -----Original Message-----
>>>>>>>> From: Liu, Monk <Monk.Liu@amd.com>
>>>>>>>> Sent: Thursday, December 23, 2021 6:14 PM
>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; Grodzovsky,
>>>>>>>> Andrey <Andrey.Grodzovsky@amd.com>;
>>>>>>>> dri-devel@lists.freedesktop.org; amd- gfx@lists.freedesktop.org;
>>>>>>>> Chen, Horace <Horace.Chen@amd.com>; Chen, JingWen
>>>>>>>> <JingWen.Chen2@amd.com>; Deng, Emily <Emily.Deng@amd.com>
>>>>>>>> Cc: daniel@ffwll.ch
>>>>>>>> Subject: RE: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU
>>>>>>>> reset protection for SRIOV
>>>>>>>>
>>>>>>>> [AMD Official Use Only]
>>>>>>>>
>>>>>>>> @Chen, Horace @Chen, JingWen @Deng, Emily
>>>>>>>>
>>>>>>>> Please take a review on Andrey's patch
>>>>>>>>
>>>>>>>> Thanks
>>>>>>>> ----------------------------------------------------------------
>>>>>>>> -
>>>>>>>> -- Monk Liu | Cloud GPU & Virtualization Solution | AMD
>>>>>>>> ----------------------------------------------------------------
>>>>>>>> -
>>>>>>>> -- we are hiring software manager for CVS core team
>>>>>>>> ----------------------------------------------------------------
>>>>>>>> -
>>>>>>>> --
>>>>>>>>
>>>>>>>> -----Original Message-----
>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>> Sent: Thursday, December 23, 2021 4:42 PM
>>>>>>>> To: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; dri-
>>>>>>>> devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
>>>>>>>> Cc: daniel@ffwll.ch; Liu, Monk <Monk.Liu@amd.com>; Chen, Horace
>>>>>>>> <Horace.Chen@amd.com>
>>>>>>>> Subject: Re: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU
>>>>>>>> reset protection for SRIOV
>>>>>>>>
>>>>>>>> Am 22.12.21 um 23:14 schrieb Andrey Grodzovsky:
>>>>>>>>> Since now flr work is serialized against  GPU resets there is
>>>>>>>>> no need for this.
>>>>>>>>>
>>>>>>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>>>>>>> Acked-by: Christian König <christian.koenig@amd.com>
>>>>>>>>
>>>>>>>>> ---
>>>>>>>>>       drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 11 -----------
>>>>>>>>>       drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 11 -----------
>>>>>>>>>       2 files changed, 22 deletions(-)
>>>>>>>>>
>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>> index 487cd654b69e..7d59a66e3988 100644
>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>> @@ -248,15 +248,7 @@ static void
>>>>>>>>> xgpu_ai_mailbox_flr_work(struct
>>>>>>>> work_struct *work)
>>>>>>>>>           struct amdgpu_device *adev = container_of(virt, struct
>>>>>>>> amdgpu_device, virt);
>>>>>>>>>           int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>
>>>>>>>>> -    /* block amdgpu_gpu_recover till msg FLR COMPLETE
>>>>>>>>> received,
>>>>>>>>> -     * otherwise the mailbox msg will be ruined/reseted by
>>>>>>>>> -     * the VF FLR.
>>>>>>>>> -     */
>>>>>>>>> -    if (!down_write_trylock(&adev->reset_sem))
>>>>>>>>> -        return;
>>>>>>>>> -
>>>>>>>>>           amdgpu_virt_fini_data_exchange(adev);
>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 1);
>>>>>>>>>
>>>>>>>>>           xgpu_ai_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0,
>>>>>>>>> 0, 0);
>>>>>>>>>
>>>>>>>>> @@ -269,9 +261,6 @@ static void xgpu_ai_mailbox_flr_work(struct
>>>>>>>> work_struct *work)
>>>>>>>>>           } while (timeout > 1);
>>>>>>>>>
>>>>>>>>>       flr_done:
>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 0);
>>>>>>>>> -    up_write(&adev->reset_sem);
>>>>>>>>> -
>>>>>>>>>           /* Trigger recovery for world switch failure if no TDR
>>>>>>>>> */
>>>>>>>>>           if (amdgpu_device_should_recover_gpu(adev)
>>>>>>>>>               && (!amdgpu_device_has_job_running(adev) || diff
>>>>>>>>> --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>> index e3869067a31d..f82c066c8e8d 100644
>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>> @@ -277,15 +277,7 @@ static void
>>>>>>>>> xgpu_nv_mailbox_flr_work(struct
>>>>>>>> work_struct *work)
>>>>>>>>>           struct amdgpu_device *adev = container_of(virt, struct
>>>>>>>> amdgpu_device, virt);
>>>>>>>>>           int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>
>>>>>>>>> -    /* block amdgpu_gpu_recover till msg FLR COMPLETE
>>>>>>>>> received,
>>>>>>>>> -     * otherwise the mailbox msg will be ruined/reseted by
>>>>>>>>> -     * the VF FLR.
>>>>>>>>> -     */
>>>>>>>>> -    if (!down_write_trylock(&adev->reset_sem))
>>>>>>>>> -        return;
>>>>>>>>> -
>>>>>>>>>           amdgpu_virt_fini_data_exchange(adev);
>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 1);
>>>>>>>>>
>>>>>>>>>           xgpu_nv_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0,
>>>>>>>>> 0, 0);
>>>>>>>>>
>>>>>>>>> @@ -298,9 +290,6 @@ static void xgpu_nv_mailbox_flr_work(struct
>>>>>>>> work_struct *work)
>>>>>>>>>           } while (timeout > 1);
>>>>>>>>>
>>>>>>>>>       flr_done:
>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 0);
>>>>>>>>> -    up_write(&adev->reset_sem);
>>>>>>>>> -
>>>>>>>>>           /* Trigger recovery for world switch failure if no TDR
>>>>>>>>> */
>>>>>>>>>           if (amdgpu_device_should_recover_gpu(adev)
>>>>>>>>>               && (!amdgpu_device_has_job_running(adev) ||
Liu, Shaoyun Jan. 5, 2022, 12:01 a.m. UTC | #16
[AMD Official Use Only]

I see, I didn't notice you already have  this implemented . so the flr_work routine itself is synced now, in this case , I  agree it should be safe to remove the in_gpu_reset and  reset_semm in the flr_work. 

Regards
Shaoyun.liu

-----Original Message-----
From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com> 
Sent: Tuesday, January 4, 2022 3:55 PM
To: Liu, Shaoyun <Shaoyun.Liu@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Liu, Monk <Monk.Liu@amd.com>; Chen, JingWen <JingWen.Chen2@amd.com>; Christian König <ckoenig.leichtzumerken@gmail.com>; Deng, Emily <Emily.Deng@amd.com>; dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org; Chen, Horace <Horace.Chen@amd.com>
Cc: daniel@ffwll.ch
Subject: Re: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset protection for SRIOV

On 2022-01-04 12:13 p.m., Liu, Shaoyun wrote:

> [AMD Official Use Only]
>
> I mostly agree with the sequences Christian  described .  Just one  thing might need to  discuss here.  For FLR notified from host,  in new sequenceas described  , driver need to reply the  READY_TO_RESET in the  workitem  from a reset  work queue which means inside flr_work, driver can not directly reply to host but need to queue another workqueue .


Can you clarify why 'driver can not directly reply to host but need to queue another workqueue' ? To my understating all steps 3-6 in Christian's description happen from the same single wq thread serially.


>   For current  code ,  the flr_work for sriov itself is a work queue queued from ISR .  I think we should try to response to the host driver as soon as possible.  Queue another workqueue  inside  the workqueue  doesn't sounds efficient to me.


Check patch 5 please [1] - I just substituted
schedule_work(&adev->virt.flr_work) for
queue_work(adev->reset_domain.wq,&adev->virt.flr_work) so no extra requeue here, just instead of sending to system_wq it's sent to dedicated reset wq

[1] -
https://lore.kernel.org/all/20211222221400.790842-1-andrey.grodzovsky@amd.com/

Andrey


> Anyway, what we need is a working  solution for our project.  So if we need to change the sequence, we  need to make sure it's been tested first and won't break the functionality before the code is landed in the branch .
>
> Regards
> Shaoyun.liu
>
>
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of 
> Christian König
> Sent: Tuesday, January 4, 2022 6:36 AM
> To: Liu, Monk <Monk.Liu@amd.com>; Chen, JingWen 
> <JingWen.Chen2@amd.com>; Christian König 
> <ckoenig.leichtzumerken@gmail.com>; Grodzovsky, Andrey 
> <Andrey.Grodzovsky@amd.com>; Deng, Emily <Emily.Deng@amd.com>; 
> dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org; Chen, 
> Horace <Horace.Chen@amd.com>
> Cc: daniel@ffwll.ch
> Subject: Re: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset 
> protection for SRIOV
>
> Am 04.01.22 um 11:49 schrieb Liu, Monk:
>> [AMD Official Use Only]
>>
>>>> See the FLR request from the hypervisor is just another source of signaling the need for a reset, similar to each job timeout on each queue. Otherwise you have a race condition between the hypervisor and the scheduler.
>> No it's not, FLR from hypervisor is just to notify guest the hw VF 
>> FLR is about to start or was already executed, but host will do FLR 
>> anyway without waiting for guest too long
>>
> Then we have a major design issue in the SRIOV protocol and really need to question this.
>
> How do you want to prevent a race between the hypervisor resetting the hardware and the client trying the same because of a timeout?
>
> As far as I can see the procedure should be:
> 1. We detect that a reset is necessary, either because of a fault a timeout or signal from hypervisor.
> 2. For each of those potential reset sources a work item is send to the single workqueue.
> 3. One of those work items execute first and prepares the reset.
> 4. We either do the reset our self or notify the hypervisor that we are ready for the reset.
> 5. Cleanup after the reset, eventually resubmit jobs etc..
> 6. Cancel work items which might have been scheduled from other reset sources.
>
> It does make sense that the hypervisor resets the hardware without waiting for the clients for too long, but if we don't follow this general steps we will always have a race between the different components.
>
> Regards,
> Christian.
>
> Am 04.01.22 um 11:49 schrieb Liu, Monk:
>> [AMD Official Use Only]
>>
>>>> See the FLR request from the hypervisor is just another source of signaling the need for a reset, similar to each job timeout on each queue. Otherwise you have a race condition between the hypervisor and the scheduler.
>> No it's not, FLR from hypervisor is just to notify guest the hw VF 
>> FLR is about to start or was already executed, but host will do FLR 
>> anyway without waiting for guest too long
>>
>>>> In other words I strongly think that the current SRIOV reset implementation is severely broken and what Andrey is doing is actually fixing it.
>> It makes the code to crash ... how could it be a fix ?
>>
>> I'm afraid the patch is NAK from me,  but it is welcome if the cleanup do not ruin the logic, Andry or jingwen can try it if needed.
>>
>> Thanks
>> -------------------------------------------------------------------
>> Monk Liu | Cloud GPU & Virtualization Solution | AMD
>> -------------------------------------------------------------------
>> we are hiring software manager for CVS core team
>> -------------------------------------------------------------------
>>
>> -----Original Message-----
>> From: Koenig, Christian <Christian.Koenig@amd.com>
>> Sent: Tuesday, January 4, 2022 6:19 PM
>> To: Chen, JingWen <JingWen.Chen2@amd.com>; Christian König 
>> <ckoenig.leichtzumerken@gmail.com>; Grodzovsky, Andrey 
>> <Andrey.Grodzovsky@amd.com>; Deng, Emily <Emily.Deng@amd.com>; Liu, 
>> Monk <Monk.Liu@amd.com>; dri-devel@lists.freedesktop.org; 
>> amd-gfx@lists.freedesktop.org; Chen, Horace <Horace.Chen@amd.com>; 
>> Chen, JingWen <JingWen.Chen2@amd.com>
>> Cc: daniel@ffwll.ch
>> Subject: Re: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset 
>> protection for SRIOV
>>
>> Hi Jingwen,
>>
>> well what I mean is that we need to adjust the implementation in amdgpu to actually match the requirements.
>>
>> Could be that the reset sequence is questionable in general, but I doubt so at least for now.
>>
>> See the FLR request from the hypervisor is just another source of signaling the need for a reset, similar to each job timeout on each queue. Otherwise you have a race condition between the hypervisor and the scheduler.
>>
>> Properly setting in_gpu_reset is indeed mandatory, but should happen at a central place and not in the SRIOV specific code.
>>
>> In other words I strongly think that the current SRIOV reset implementation is severely broken and what Andrey is doing is actually fixing it.
>>
>> Regards,
>> Christian.
>>
>> Am 04.01.22 um 10:07 schrieb JingWen Chen:
>>> Hi Christian,
>>> I'm not sure what do you mean by "we need to change SRIOV not the driver".
>>>
>>> Do you mean we should change the reset sequence in SRIOV? This will be a huge change for our SRIOV solution.
>>>
>>>    From my point of view, we can directly use 
>>> amdgpu_device_lock_adev and amdgpu_device_unlock_adev in flr_work instead of try_lock since no one will conflict with this thread with reset_domain introduced.
>>> But we do need the reset_sem and adev->in_gpu_reset to keep device untouched via user space.
>>>
>>> Best Regards,
>>> Jingwen Chen
>>>
>>> On 2022/1/3 下午6:17, Christian König wrote:
>>>> Please don't. This patch is vital to the cleanup of the reset procedure.
>>>>
>>>> If SRIOV doesn't work with that we need to change SRIOV and not the driver.
>>>>
>>>> Christian.
>>>>
>>>> Am 30.12.21 um 19:45 schrieb Andrey Grodzovsky:
>>>>> Sure, I guess i can drop this patch then.
>>>>>
>>>>> Andrey
>>>>>
>>>>> On 2021-12-24 4:57 a.m., JingWen Chen wrote:
>>>>>> I do agree with shaoyun, if the host find the gpu engine hangs first, and do the flr, guest side thread may not know this and still try to access HW(e.g. kfd is using a lot of amdgpu_in_reset and reset_sem to identify the reset status). And this may lead to very bad result.
>>>>>>
>>>>>> On 2021/12/24 下午4:58, Deng, Emily wrote:
>>>>>>> These patches look good to me. JingWen will pull these patches and do some basic TDR test on sriov environment, and give feedback.
>>>>>>>
>>>>>>> Best wishes
>>>>>>> Emily Deng
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>> -----Original Message-----
>>>>>>>> From: Liu, Monk <Monk.Liu@amd.com>
>>>>>>>> Sent: Thursday, December 23, 2021 6:14 PM
>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; Grodzovsky, 
>>>>>>>> Andrey <Andrey.Grodzovsky@amd.com>; 
>>>>>>>> dri-devel@lists.freedesktop.org; amd- 
>>>>>>>> gfx@lists.freedesktop.org; Chen, Horace <Horace.Chen@amd.com>; 
>>>>>>>> Chen, JingWen <JingWen.Chen2@amd.com>; Deng, Emily 
>>>>>>>> <Emily.Deng@amd.com>
>>>>>>>> Cc: daniel@ffwll.ch
>>>>>>>> Subject: RE: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU 
>>>>>>>> reset protection for SRIOV
>>>>>>>>
>>>>>>>> [AMD Official Use Only]
>>>>>>>>
>>>>>>>> @Chen, Horace @Chen, JingWen @Deng, Emily
>>>>>>>>
>>>>>>>> Please take a review on Andrey's patch
>>>>>>>>
>>>>>>>> Thanks
>>>>>>>> ---------------------------------------------------------------
>>>>>>>> -
>>>>>>>> -
>>>>>>>> -- Monk Liu | Cloud GPU & Virtualization Solution | AMD
>>>>>>>> ---------------------------------------------------------------
>>>>>>>> -
>>>>>>>> -
>>>>>>>> -- we are hiring software manager for CVS core team
>>>>>>>> ---------------------------------------------------------------
>>>>>>>> -
>>>>>>>> -
>>>>>>>> --
>>>>>>>>
>>>>>>>> -----Original Message-----
>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>> Sent: Thursday, December 23, 2021 4:42 PM
>>>>>>>> To: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; dri- 
>>>>>>>> devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
>>>>>>>> Cc: daniel@ffwll.ch; Liu, Monk <Monk.Liu@amd.com>; Chen, Horace 
>>>>>>>> <Horace.Chen@amd.com>
>>>>>>>> Subject: Re: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU 
>>>>>>>> reset protection for SRIOV
>>>>>>>>
>>>>>>>> Am 22.12.21 um 23:14 schrieb Andrey Grodzovsky:
>>>>>>>>> Since now flr work is serialized against  GPU resets there is 
>>>>>>>>> no need for this.
>>>>>>>>>
>>>>>>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>>>>>>> Acked-by: Christian König <christian.koenig@amd.com>
>>>>>>>>
>>>>>>>>> ---
>>>>>>>>>       drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 11 -----------
>>>>>>>>>       drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 11 -----------
>>>>>>>>>       2 files changed, 22 deletions(-)
>>>>>>>>>
>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>> index 487cd654b69e..7d59a66e3988 100644
>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>> @@ -248,15 +248,7 @@ static void 
>>>>>>>>> xgpu_ai_mailbox_flr_work(struct
>>>>>>>> work_struct *work)
>>>>>>>>>           struct amdgpu_device *adev = container_of(virt, 
>>>>>>>>> struct
>>>>>>>> amdgpu_device, virt);
>>>>>>>>>           int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>
>>>>>>>>> -    /* block amdgpu_gpu_recover till msg FLR COMPLETE 
>>>>>>>>> received,
>>>>>>>>> -     * otherwise the mailbox msg will be ruined/reseted by
>>>>>>>>> -     * the VF FLR.
>>>>>>>>> -     */
>>>>>>>>> -    if (!down_write_trylock(&adev->reset_sem))
>>>>>>>>> -        return;
>>>>>>>>> -
>>>>>>>>>           amdgpu_virt_fini_data_exchange(adev);
>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 1);
>>>>>>>>>
>>>>>>>>>           xgpu_ai_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 
>>>>>>>>> 0, 0, 0);
>>>>>>>>>
>>>>>>>>> @@ -269,9 +261,6 @@ static void 
>>>>>>>>> xgpu_ai_mailbox_flr_work(struct
>>>>>>>> work_struct *work)
>>>>>>>>>           } while (timeout > 1);
>>>>>>>>>
>>>>>>>>>       flr_done:
>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 0);
>>>>>>>>> -    up_write(&adev->reset_sem);
>>>>>>>>> -
>>>>>>>>>           /* Trigger recovery for world switch failure if no 
>>>>>>>>> TDR */
>>>>>>>>>           if (amdgpu_device_should_recover_gpu(adev)
>>>>>>>>>               && (!amdgpu_device_has_job_running(adev) || diff 
>>>>>>>>> --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>> index e3869067a31d..f82c066c8e8d 100644
>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>> @@ -277,15 +277,7 @@ static void 
>>>>>>>>> xgpu_nv_mailbox_flr_work(struct
>>>>>>>> work_struct *work)
>>>>>>>>>           struct amdgpu_device *adev = container_of(virt, 
>>>>>>>>> struct
>>>>>>>> amdgpu_device, virt);
>>>>>>>>>           int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>
>>>>>>>>> -    /* block amdgpu_gpu_recover till msg FLR COMPLETE 
>>>>>>>>> received,
>>>>>>>>> -     * otherwise the mailbox msg will be ruined/reseted by
>>>>>>>>> -     * the VF FLR.
>>>>>>>>> -     */
>>>>>>>>> -    if (!down_write_trylock(&adev->reset_sem))
>>>>>>>>> -        return;
>>>>>>>>> -
>>>>>>>>>           amdgpu_virt_fini_data_exchange(adev);
>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 1);
>>>>>>>>>
>>>>>>>>>           xgpu_nv_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 
>>>>>>>>> 0, 0, 0);
>>>>>>>>>
>>>>>>>>> @@ -298,9 +290,6 @@ static void 
>>>>>>>>> xgpu_nv_mailbox_flr_work(struct
>>>>>>>> work_struct *work)
>>>>>>>>>           } while (timeout > 1);
>>>>>>>>>
>>>>>>>>>       flr_done:
>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 0);
>>>>>>>>> -    up_write(&adev->reset_sem);
>>>>>>>>> -
>>>>>>>>>           /* Trigger recovery for world switch failure if no 
>>>>>>>>> TDR */
>>>>>>>>>           if (amdgpu_device_should_recover_gpu(adev)
>>>>>>>>>               && (!amdgpu_device_has_job_running(adev) ||
JingWen Chen Jan. 5, 2022, 7:25 a.m. UTC | #17
On 2022/1/4 下午7:36, Christian König wrote:
> Am 04.01.22 um 11:49 schrieb Liu, Monk:
>> [AMD Official Use Only]
>>
>>>> See the FLR request from the hypervisor is just another source of signaling the need for a reset, similar to each job timeout on each queue. Otherwise you have a race condition between the hypervisor and the scheduler.
>> No it's not, FLR from hypervisor is just to notify guest the hw VF FLR is about to start or was already executed, but host will do FLR anyway without waiting for guest too long
>>
>
> Then we have a major design issue in the SRIOV protocol and really need to question this.
>
> How do you want to prevent a race between the hypervisor resetting the hardware and the client trying the same because of a timeout?
>
> As far as I can see the procedure should be:
> 1. We detect that a reset is necessary, either because of a fault a timeout or signal from hypervisor.
> 2. For each of those potential reset sources a work item is send to the single workqueue.

I think Andrey has already used the same ordered work queue to handle the reset from both ring timeout and hypervisor. (Patch 5)

So there should be no race between different reset sources. As ring timeout is much longer than the world switch time slice(6ms), we should see a reset from hypervisor queued into reset domain wq first and after the flr work done, then the ring timeout reset queued into reset domain.

> 3. One of those work items execute first and prepares the reset.
> 4. We either do the reset our self or notify the hypervisor that we are ready for the reset.
> 5. Cleanup after the reset, eventually resubmit jobs etc..
> 6. Cancel work items which might have been scheduled from other reset sources.
>
> It does make sense that the hypervisor resets the hardware without waiting for the clients for too long, but if we don't follow this general steps we will always have a race between the different components.

So the reset_sem and in_gpu_reset is to prevent race between reset_domain(mostly hypervisor source) and other user spaces(e.g. kfd).

>
> Regards,
> Christian.
>
> Am 04.01.22 um 11:49 schrieb Liu, Monk:
>> [AMD Official Use Only]
>>
>>>> See the FLR request from the hypervisor is just another source of signaling the need for a reset, similar to each job timeout on each queue. Otherwise you have a race condition between the hypervisor and the scheduler.
>> No it's not, FLR from hypervisor is just to notify guest the hw VF FLR is about to start or was already executed, but host will do FLR anyway without waiting for guest too long
>>
>>>> In other words I strongly think that the current SRIOV reset implementation is severely broken and what Andrey is doing is actually fixing it.
>> It makes the code to crash ... how could it be a fix ?
>>
>> I'm afraid the patch is NAK from me,  but it is welcome if the cleanup do not ruin the logic, Andry or jingwen can try it if needed.
>>
>> Thanks
>> -------------------------------------------------------------------
>> Monk Liu | Cloud GPU & Virtualization Solution | AMD
>> -------------------------------------------------------------------
>> we are hiring software manager for CVS core team
>> -------------------------------------------------------------------
>>
>> -----Original Message-----
>> From: Koenig, Christian <Christian.Koenig@amd.com>
>> Sent: Tuesday, January 4, 2022 6:19 PM
>> To: Chen, JingWen <JingWen.Chen2@amd.com>; Christian König <ckoenig.leichtzumerken@gmail.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Deng, Emily <Emily.Deng@amd.com>; Liu, Monk <Monk.Liu@amd.com>; dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org; Chen, Horace <Horace.Chen@amd.com>; Chen, JingWen <JingWen.Chen2@amd.com>
>> Cc: daniel@ffwll.ch
>> Subject: Re: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset protection for SRIOV
>>
>> Hi Jingwen,
>>
>> well what I mean is that we need to adjust the implementation in amdgpu to actually match the requirements.
>>
>> Could be that the reset sequence is questionable in general, but I doubt so at least for now.
>>
>> See the FLR request from the hypervisor is just another source of signaling the need for a reset, similar to each job timeout on each queue. Otherwise you have a race condition between the hypervisor and the scheduler.
>>
>> Properly setting in_gpu_reset is indeed mandatory, but should happen at a central place and not in the SRIOV specific code.
>>
>> In other words I strongly think that the current SRIOV reset implementation is severely broken and what Andrey is doing is actually fixing it.
>>
>> Regards,
>> Christian.
>>
>> Am 04.01.22 um 10:07 schrieb JingWen Chen:
>>> Hi Christian,
>>> I'm not sure what do you mean by "we need to change SRIOV not the driver".
>>>
>>> Do you mean we should change the reset sequence in SRIOV? This will be a huge change for our SRIOV solution.
>>>
>>>   From my point of view, we can directly use amdgpu_device_lock_adev
>>> and amdgpu_device_unlock_adev in flr_work instead of try_lock since no one will conflict with this thread with reset_domain introduced.
>>> But we do need the reset_sem and adev->in_gpu_reset to keep device untouched via user space.
>>>
>>> Best Regards,
>>> Jingwen Chen
>>>
>>> On 2022/1/3 下午6:17, Christian König wrote:
>>>> Please don't. This patch is vital to the cleanup of the reset procedure.
>>>>
>>>> If SRIOV doesn't work with that we need to change SRIOV and not the driver.
>>>>
>>>> Christian.
>>>>
>>>> Am 30.12.21 um 19:45 schrieb Andrey Grodzovsky:
>>>>> Sure, I guess i can drop this patch then.
>>>>>
>>>>> Andrey
>>>>>
>>>>> On 2021-12-24 4:57 a.m., JingWen Chen wrote:
>>>>>> I do agree with shaoyun, if the host find the gpu engine hangs first, and do the flr, guest side thread may not know this and still try to access HW(e.g. kfd is using a lot of amdgpu_in_reset and reset_sem to identify the reset status). And this may lead to very bad result.
>>>>>>
>>>>>> On 2021/12/24 下午4:58, Deng, Emily wrote:
>>>>>>> These patches look good to me. JingWen will pull these patches and do some basic TDR test on sriov environment, and give feedback.
>>>>>>>
>>>>>>> Best wishes
>>>>>>> Emily Deng
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>> -----Original Message-----
>>>>>>>> From: Liu, Monk <Monk.Liu@amd.com>
>>>>>>>> Sent: Thursday, December 23, 2021 6:14 PM
>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; Grodzovsky,
>>>>>>>> Andrey <Andrey.Grodzovsky@amd.com>;
>>>>>>>> dri-devel@lists.freedesktop.org; amd- gfx@lists.freedesktop.org;
>>>>>>>> Chen, Horace <Horace.Chen@amd.com>; Chen, JingWen
>>>>>>>> <JingWen.Chen2@amd.com>; Deng, Emily <Emily.Deng@amd.com>
>>>>>>>> Cc: daniel@ffwll.ch
>>>>>>>> Subject: RE: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset
>>>>>>>> protection for SRIOV
>>>>>>>>
>>>>>>>> [AMD Official Use Only]
>>>>>>>>
>>>>>>>> @Chen, Horace @Chen, JingWen @Deng, Emily
>>>>>>>>
>>>>>>>> Please take a review on Andrey's patch
>>>>>>>>
>>>>>>>> Thanks
>>>>>>>> -----------------------------------------------------------------
>>>>>>>> -- Monk Liu | Cloud GPU & Virtualization Solution | AMD
>>>>>>>> -----------------------------------------------------------------
>>>>>>>> -- we are hiring software manager for CVS core team
>>>>>>>> -----------------------------------------------------------------
>>>>>>>> -- 
>>>>>>>>
>>>>>>>> -----Original Message-----
>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>> Sent: Thursday, December 23, 2021 4:42 PM
>>>>>>>> To: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; dri-
>>>>>>>> devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
>>>>>>>> Cc: daniel@ffwll.ch; Liu, Monk <Monk.Liu@amd.com>; Chen, Horace
>>>>>>>> <Horace.Chen@amd.com>
>>>>>>>> Subject: Re: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset
>>>>>>>> protection for SRIOV
>>>>>>>>
>>>>>>>> Am 22.12.21 um 23:14 schrieb Andrey Grodzovsky:
>>>>>>>>> Since now flr work is serialized against  GPU resets there is no
>>>>>>>>> need for this.
>>>>>>>>>
>>>>>>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>>>>>>> Acked-by: Christian König <christian.koenig@amd.com>
>>>>>>>>
>>>>>>>>> ---
>>>>>>>>>      drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 11 -----------
>>>>>>>>>      drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 11 -----------
>>>>>>>>>      2 files changed, 22 deletions(-)
>>>>>>>>>
>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>> index 487cd654b69e..7d59a66e3988 100644
>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>> @@ -248,15 +248,7 @@ static void xgpu_ai_mailbox_flr_work(struct
>>>>>>>> work_struct *work)
>>>>>>>>>          struct amdgpu_device *adev = container_of(virt, struct
>>>>>>>> amdgpu_device, virt);
>>>>>>>>>          int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>
>>>>>>>>> -    /* block amdgpu_gpu_recover till msg FLR COMPLETE received,
>>>>>>>>> -     * otherwise the mailbox msg will be ruined/reseted by
>>>>>>>>> -     * the VF FLR.
>>>>>>>>> -     */
>>>>>>>>> -    if (!down_write_trylock(&adev->reset_sem))
>>>>>>>>> -        return;
>>>>>>>>> -
>>>>>>>>>          amdgpu_virt_fini_data_exchange(adev);
>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 1);
>>>>>>>>>
>>>>>>>>>          xgpu_ai_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0,
>>>>>>>>> 0, 0);
>>>>>>>>>
>>>>>>>>> @@ -269,9 +261,6 @@ static void xgpu_ai_mailbox_flr_work(struct
>>>>>>>> work_struct *work)
>>>>>>>>>          } while (timeout > 1);
>>>>>>>>>
>>>>>>>>>      flr_done:
>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 0);
>>>>>>>>> -    up_write(&adev->reset_sem);
>>>>>>>>> -
>>>>>>>>>          /* Trigger recovery for world switch failure if no TDR
>>>>>>>>> */
>>>>>>>>>          if (amdgpu_device_should_recover_gpu(adev)
>>>>>>>>>              && (!amdgpu_device_has_job_running(adev) || diff
>>>>>>>>> --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>> index e3869067a31d..f82c066c8e8d 100644
>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>> @@ -277,15 +277,7 @@ static void xgpu_nv_mailbox_flr_work(struct
>>>>>>>> work_struct *work)
>>>>>>>>>          struct amdgpu_device *adev = container_of(virt, struct
>>>>>>>> amdgpu_device, virt);
>>>>>>>>>          int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>
>>>>>>>>> -    /* block amdgpu_gpu_recover till msg FLR COMPLETE received,
>>>>>>>>> -     * otherwise the mailbox msg will be ruined/reseted by
>>>>>>>>> -     * the VF FLR.
>>>>>>>>> -     */
>>>>>>>>> -    if (!down_write_trylock(&adev->reset_sem))
>>>>>>>>> -        return;
>>>>>>>>> -
>>>>>>>>>          amdgpu_virt_fini_data_exchange(adev);
>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 1);
>>>>>>>>>
>>>>>>>>>          xgpu_nv_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0,
>>>>>>>>> 0, 0);
>>>>>>>>>
>>>>>>>>> @@ -298,9 +290,6 @@ static void xgpu_nv_mailbox_flr_work(struct
>>>>>>>> work_struct *work)
>>>>>>>>>          } while (timeout > 1);
>>>>>>>>>
>>>>>>>>>      flr_done:
>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 0);
>>>>>>>>> -    up_write(&adev->reset_sem);
>>>>>>>>> -
>>>>>>>>>          /* Trigger recovery for world switch failure if no TDR
>>>>>>>>> */
>>>>>>>>>          if (amdgpu_device_should_recover_gpu(adev)
>>>>>>>>>              && (!amdgpu_device_has_job_running(adev) ||
>
JingWen Chen Jan. 5, 2022, 7:34 a.m. UTC | #18
On 2022/1/5 上午12:56, Andrey Grodzovsky wrote:
>
> On 2022-01-04 6:36 a.m., Christian König wrote:
>> Am 04.01.22 um 11:49 schrieb Liu, Monk:
>>> [AMD Official Use Only]
>>>
>>>>> See the FLR request from the hypervisor is just another source of signaling the need for a reset, similar to each job timeout on each queue. Otherwise you have a race condition between the hypervisor and the scheduler.
>>> No it's not, FLR from hypervisor is just to notify guest the hw VF FLR is about to start or was already executed, but host will do FLR anyway without waiting for guest too long
>>>
>>
>> Then we have a major design issue in the SRIOV protocol and really need to question this.
>>
>> How do you want to prevent a race between the hypervisor resetting the hardware and the client trying the same because of a timeout?
>>
>> As far as I can see the procedure should be:
>> 1. We detect that a reset is necessary, either because of a fault a timeout or signal from hypervisor.
>> 2. For each of those potential reset sources a work item is send to the single workqueue.
>> 3. One of those work items execute first and prepares the reset.
>> 4. We either do the reset our self or notify the hypervisor that we are ready for the reset.
>> 5. Cleanup after the reset, eventually resubmit jobs etc..
>> 6. Cancel work items which might have been scheduled from other reset sources.
>>
>> It does make sense that the hypervisor resets the hardware without waiting for the clients for too long, but if we don't follow this general steps we will always have a race between the different components.
>
>
> Monk, just to add to this - if indeed as you say that 'FLR from hypervisor is just to notify guest the hw VF FLR is about to start or was already executed, but host will do FLR anyway without waiting for guest too long'
> and there is no strict waiting from the hypervisor for IDH_READY_TO_RESET to be recived from guest before starting the reset then setting in_gpu_reset and locking reset_sem from guest side is not really full proof
> protection from MMIO accesses by the guest - it only truly helps if hypervisor waits for that message before initiation of HW reset.
>
Hi Andrey, this cannot be done. If somehow guest kernel hangs and never has the chance to send the response back, then other VFs will have to wait it reset. All the vfs will hang in this case. Or sometimes the mailbox has some delay and other VFs will also wait. The user of other VFs will be affected in this case.
> Andrey
>
>
>>
>> Regards,
>> Christian.
>>
>> Am 04.01.22 um 11:49 schrieb Liu, Monk:
>>> [AMD Official Use Only]
>>>
>>>>> See the FLR request from the hypervisor is just another source of signaling the need for a reset, similar to each job timeout on each queue. Otherwise you have a race condition between the hypervisor and the scheduler.
>>> No it's not, FLR from hypervisor is just to notify guest the hw VF FLR is about to start or was already executed, but host will do FLR anyway without waiting for guest too long
>>>
>>>>> In other words I strongly think that the current SRIOV reset implementation is severely broken and what Andrey is doing is actually fixing it.
>>> It makes the code to crash ... how could it be a fix ?
>>>
>>> I'm afraid the patch is NAK from me,  but it is welcome if the cleanup do not ruin the logic, Andry or jingwen can try it if needed.
>>>
>>> Thanks
>>> -------------------------------------------------------------------
>>> Monk Liu | Cloud GPU & Virtualization Solution | AMD
>>> -------------------------------------------------------------------
>>> we are hiring software manager for CVS core team
>>> -------------------------------------------------------------------
>>>
>>> -----Original Message-----
>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>> Sent: Tuesday, January 4, 2022 6:19 PM
>>> To: Chen, JingWen <JingWen.Chen2@amd.com>; Christian König <ckoenig.leichtzumerken@gmail.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Deng, Emily <Emily.Deng@amd.com>; Liu, Monk <Monk.Liu@amd.com>; dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org; Chen, Horace <Horace.Chen@amd.com>; Chen, JingWen <JingWen.Chen2@amd.com>
>>> Cc: daniel@ffwll.ch
>>> Subject: Re: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset protection for SRIOV
>>>
>>> Hi Jingwen,
>>>
>>> well what I mean is that we need to adjust the implementation in amdgpu to actually match the requirements.
>>>
>>> Could be that the reset sequence is questionable in general, but I doubt so at least for now.
>>>
>>> See the FLR request from the hypervisor is just another source of signaling the need for a reset, similar to each job timeout on each queue. Otherwise you have a race condition between the hypervisor and the scheduler.
>>>
>>> Properly setting in_gpu_reset is indeed mandatory, but should happen at a central place and not in the SRIOV specific code.
>>>
>>> In other words I strongly think that the current SRIOV reset implementation is severely broken and what Andrey is doing is actually fixing it.
>>>
>>> Regards,
>>> Christian.
>>>
>>> Am 04.01.22 um 10:07 schrieb JingWen Chen:
>>>> Hi Christian,
>>>> I'm not sure what do you mean by "we need to change SRIOV not the driver".
>>>>
>>>> Do you mean we should change the reset sequence in SRIOV? This will be a huge change for our SRIOV solution.
>>>>
>>>>   From my point of view, we can directly use amdgpu_device_lock_adev
>>>> and amdgpu_device_unlock_adev in flr_work instead of try_lock since no one will conflict with this thread with reset_domain introduced.
>>>> But we do need the reset_sem and adev->in_gpu_reset to keep device untouched via user space.
>>>>
>>>> Best Regards,
>>>> Jingwen Chen
>>>>
>>>> On 2022/1/3 下午6:17, Christian König wrote:
>>>>> Please don't. This patch is vital to the cleanup of the reset procedure.
>>>>>
>>>>> If SRIOV doesn't work with that we need to change SRIOV and not the driver.
>>>>>
>>>>> Christian.
>>>>>
>>>>> Am 30.12.21 um 19:45 schrieb Andrey Grodzovsky:
>>>>>> Sure, I guess i can drop this patch then.
>>>>>>
>>>>>> Andrey
>>>>>>
>>>>>> On 2021-12-24 4:57 a.m., JingWen Chen wrote:
>>>>>>> I do agree with shaoyun, if the host find the gpu engine hangs first, and do the flr, guest side thread may not know this and still try to access HW(e.g. kfd is using a lot of amdgpu_in_reset and reset_sem to identify the reset status). And this may lead to very bad result.
>>>>>>>
>>>>>>> On 2021/12/24 下午4:58, Deng, Emily wrote:
>>>>>>>> These patches look good to me. JingWen will pull these patches and do some basic TDR test on sriov environment, and give feedback.
>>>>>>>>
>>>>>>>> Best wishes
>>>>>>>> Emily Deng
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>>> -----Original Message-----
>>>>>>>>> From: Liu, Monk <Monk.Liu@amd.com>
>>>>>>>>> Sent: Thursday, December 23, 2021 6:14 PM
>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; Grodzovsky,
>>>>>>>>> Andrey <Andrey.Grodzovsky@amd.com>;
>>>>>>>>> dri-devel@lists.freedesktop.org; amd- gfx@lists.freedesktop.org;
>>>>>>>>> Chen, Horace <Horace.Chen@amd.com>; Chen, JingWen
>>>>>>>>> <JingWen.Chen2@amd.com>; Deng, Emily <Emily.Deng@amd.com>
>>>>>>>>> Cc: daniel@ffwll.ch
>>>>>>>>> Subject: RE: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset
>>>>>>>>> protection for SRIOV
>>>>>>>>>
>>>>>>>>> [AMD Official Use Only]
>>>>>>>>>
>>>>>>>>> @Chen, Horace @Chen, JingWen @Deng, Emily
>>>>>>>>>
>>>>>>>>> Please take a review on Andrey's patch
>>>>>>>>>
>>>>>>>>> Thanks
>>>>>>>>> -----------------------------------------------------------------
>>>>>>>>> -- Monk Liu | Cloud GPU & Virtualization Solution | AMD
>>>>>>>>> -----------------------------------------------------------------
>>>>>>>>> -- we are hiring software manager for CVS core team
>>>>>>>>> -----------------------------------------------------------------
>>>>>>>>> -- 
>>>>>>>>>
>>>>>>>>> -----Original Message-----
>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>> Sent: Thursday, December 23, 2021 4:42 PM
>>>>>>>>> To: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; dri-
>>>>>>>>> devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
>>>>>>>>> Cc: daniel@ffwll.ch; Liu, Monk <Monk.Liu@amd.com>; Chen, Horace
>>>>>>>>> <Horace.Chen@amd.com>
>>>>>>>>> Subject: Re: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset
>>>>>>>>> protection for SRIOV
>>>>>>>>>
>>>>>>>>> Am 22.12.21 um 23:14 schrieb Andrey Grodzovsky:
>>>>>>>>>> Since now flr work is serialized against  GPU resets there is no
>>>>>>>>>> need for this.
>>>>>>>>>>
>>>>>>>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>>>>>>>> Acked-by: Christian König <christian.koenig@amd.com>
>>>>>>>>>
>>>>>>>>>> ---
>>>>>>>>>>      drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 11 -----------
>>>>>>>>>>      drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 11 -----------
>>>>>>>>>>      2 files changed, 22 deletions(-)
>>>>>>>>>>
>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>> index 487cd654b69e..7d59a66e3988 100644
>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>> @@ -248,15 +248,7 @@ static void xgpu_ai_mailbox_flr_work(struct
>>>>>>>>> work_struct *work)
>>>>>>>>>>          struct amdgpu_device *adev = container_of(virt, struct
>>>>>>>>> amdgpu_device, virt);
>>>>>>>>>>          int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>
>>>>>>>>>> -    /* block amdgpu_gpu_recover till msg FLR COMPLETE received,
>>>>>>>>>> -     * otherwise the mailbox msg will be ruined/reseted by
>>>>>>>>>> -     * the VF FLR.
>>>>>>>>>> -     */
>>>>>>>>>> -    if (!down_write_trylock(&adev->reset_sem))
>>>>>>>>>> -        return;
>>>>>>>>>> -
>>>>>>>>>>          amdgpu_virt_fini_data_exchange(adev);
>>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 1);
>>>>>>>>>>
>>>>>>>>>>          xgpu_ai_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0,
>>>>>>>>>> 0, 0);
>>>>>>>>>>
>>>>>>>>>> @@ -269,9 +261,6 @@ static void xgpu_ai_mailbox_flr_work(struct
>>>>>>>>> work_struct *work)
>>>>>>>>>>          } while (timeout > 1);
>>>>>>>>>>
>>>>>>>>>>      flr_done:
>>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 0);
>>>>>>>>>> -    up_write(&adev->reset_sem);
>>>>>>>>>> -
>>>>>>>>>>          /* Trigger recovery for world switch failure if no TDR
>>>>>>>>>> */
>>>>>>>>>>          if (amdgpu_device_should_recover_gpu(adev)
>>>>>>>>>>              && (!amdgpu_device_has_job_running(adev) || diff
>>>>>>>>>> --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>> index e3869067a31d..f82c066c8e8d 100644
>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>> @@ -277,15 +277,7 @@ static void xgpu_nv_mailbox_flr_work(struct
>>>>>>>>> work_struct *work)
>>>>>>>>>>          struct amdgpu_device *adev = container_of(virt, struct
>>>>>>>>> amdgpu_device, virt);
>>>>>>>>>>          int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>
>>>>>>>>>> -    /* block amdgpu_gpu_recover till msg FLR COMPLETE received,
>>>>>>>>>> -     * otherwise the mailbox msg will be ruined/reseted by
>>>>>>>>>> -     * the VF FLR.
>>>>>>>>>> -     */
>>>>>>>>>> -    if (!down_write_trylock(&adev->reset_sem))
>>>>>>>>>> -        return;
>>>>>>>>>> -
>>>>>>>>>>          amdgpu_virt_fini_data_exchange(adev);
>>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 1);
>>>>>>>>>>
>>>>>>>>>>          xgpu_nv_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0,
>>>>>>>>>> 0, 0);
>>>>>>>>>>
>>>>>>>>>> @@ -298,9 +290,6 @@ static void xgpu_nv_mailbox_flr_work(struct
>>>>>>>>> work_struct *work)
>>>>>>>>>>          } while (timeout > 1);
>>>>>>>>>>
>>>>>>>>>>      flr_done:
>>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 0);
>>>>>>>>>> -    up_write(&adev->reset_sem);
>>>>>>>>>> -
>>>>>>>>>>          /* Trigger recovery for world switch failure if no TDR
>>>>>>>>>> */
>>>>>>>>>>          if (amdgpu_device_should_recover_gpu(adev)
>>>>>>>>>>              && (!amdgpu_device_has_job_running(adev) ||
>>
Christian König Jan. 5, 2022, 7:59 a.m. UTC | #19
Am 05.01.22 um 08:34 schrieb JingWen Chen:
> On 2022/1/5 上午12:56, Andrey Grodzovsky wrote:
>> On 2022-01-04 6:36 a.m., Christian König wrote:
>>> Am 04.01.22 um 11:49 schrieb Liu, Monk:
>>>> [AMD Official Use Only]
>>>>
>>>>>> See the FLR request from the hypervisor is just another source of signaling the need for a reset, similar to each job timeout on each queue. Otherwise you have a race condition between the hypervisor and the scheduler.
>>>> No it's not, FLR from hypervisor is just to notify guest the hw VF FLR is about to start or was already executed, but host will do FLR anyway without waiting for guest too long
>>>>
>>> Then we have a major design issue in the SRIOV protocol and really need to question this.
>>>
>>> How do you want to prevent a race between the hypervisor resetting the hardware and the client trying the same because of a timeout?
>>>
>>> As far as I can see the procedure should be:
>>> 1. We detect that a reset is necessary, either because of a fault a timeout or signal from hypervisor.
>>> 2. For each of those potential reset sources a work item is send to the single workqueue.
>>> 3. One of those work items execute first and prepares the reset.
>>> 4. We either do the reset our self or notify the hypervisor that we are ready for the reset.
>>> 5. Cleanup after the reset, eventually resubmit jobs etc..
>>> 6. Cancel work items which might have been scheduled from other reset sources.
>>>
>>> It does make sense that the hypervisor resets the hardware without waiting for the clients for too long, but if we don't follow this general steps we will always have a race between the different components.
>>
>> Monk, just to add to this - if indeed as you say that 'FLR from hypervisor is just to notify guest the hw VF FLR is about to start or was already executed, but host will do FLR anyway without waiting for guest too long'
>> and there is no strict waiting from the hypervisor for IDH_READY_TO_RESET to be recived from guest before starting the reset then setting in_gpu_reset and locking reset_sem from guest side is not really full proof
>> protection from MMIO accesses by the guest - it only truly helps if hypervisor waits for that message before initiation of HW reset.
>>
> Hi Andrey, this cannot be done. If somehow guest kernel hangs and never has the chance to send the response back, then other VFs will have to wait it reset. All the vfs will hang in this case. Or sometimes the mailbox has some delay and other VFs will also wait. The user of other VFs will be affected in this case.

Yeah, agree completely with JingWen. The hypervisor is the one in charge 
here, not the guest.

What the hypervisor should do (and it already seems to be designed that 
way) is to send the guest a message that a reset is about to happen and 
give it some time to response appropriately.

The guest on the other hand then tells the hypervisor that all 
processing has stopped and it is ready to restart. If that doesn't 
happen in time the hypervisor should eliminate the guest probably 
trigger even more severe consequences, e.g. restart the whole VM etc...

Christian.

>> Andrey
>>
>>
>>> Regards,
>>> Christian.
>>>
>>> Am 04.01.22 um 11:49 schrieb Liu, Monk:
>>>> [AMD Official Use Only]
>>>>
>>>>>> See the FLR request from the hypervisor is just another source of signaling the need for a reset, similar to each job timeout on each queue. Otherwise you have a race condition between the hypervisor and the scheduler.
>>>> No it's not, FLR from hypervisor is just to notify guest the hw VF FLR is about to start or was already executed, but host will do FLR anyway without waiting for guest too long
>>>>
>>>>>> In other words I strongly think that the current SRIOV reset implementation is severely broken and what Andrey is doing is actually fixing it.
>>>> It makes the code to crash ... how could it be a fix ?
>>>>
>>>> I'm afraid the patch is NAK from me,  but it is welcome if the cleanup do not ruin the logic, Andry or jingwen can try it if needed.
>>>>
>>>> Thanks
>>>> -------------------------------------------------------------------
>>>> Monk Liu | Cloud GPU & Virtualization Solution | AMD
>>>> -------------------------------------------------------------------
>>>> we are hiring software manager for CVS core team
>>>> -------------------------------------------------------------------
>>>>
>>>> -----Original Message-----
>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>> Sent: Tuesday, January 4, 2022 6:19 PM
>>>> To: Chen, JingWen <JingWen.Chen2@amd.com>; Christian König <ckoenig.leichtzumerken@gmail.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Deng, Emily <Emily.Deng@amd.com>; Liu, Monk <Monk.Liu@amd.com>; dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org; Chen, Horace <Horace.Chen@amd.com>; Chen, JingWen <JingWen.Chen2@amd.com>
>>>> Cc: daniel@ffwll.ch
>>>> Subject: Re: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset protection for SRIOV
>>>>
>>>> Hi Jingwen,
>>>>
>>>> well what I mean is that we need to adjust the implementation in amdgpu to actually match the requirements.
>>>>
>>>> Could be that the reset sequence is questionable in general, but I doubt so at least for now.
>>>>
>>>> See the FLR request from the hypervisor is just another source of signaling the need for a reset, similar to each job timeout on each queue. Otherwise you have a race condition between the hypervisor and the scheduler.
>>>>
>>>> Properly setting in_gpu_reset is indeed mandatory, but should happen at a central place and not in the SRIOV specific code.
>>>>
>>>> In other words I strongly think that the current SRIOV reset implementation is severely broken and what Andrey is doing is actually fixing it.
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>> Am 04.01.22 um 10:07 schrieb JingWen Chen:
>>>>> Hi Christian,
>>>>> I'm not sure what do you mean by "we need to change SRIOV not the driver".
>>>>>
>>>>> Do you mean we should change the reset sequence in SRIOV? This will be a huge change for our SRIOV solution.
>>>>>
>>>>>    From my point of view, we can directly use amdgpu_device_lock_adev
>>>>> and amdgpu_device_unlock_adev in flr_work instead of try_lock since no one will conflict with this thread with reset_domain introduced.
>>>>> But we do need the reset_sem and adev->in_gpu_reset to keep device untouched via user space.
>>>>>
>>>>> Best Regards,
>>>>> Jingwen Chen
>>>>>
>>>>> On 2022/1/3 下午6:17, Christian König wrote:
>>>>>> Please don't. This patch is vital to the cleanup of the reset procedure.
>>>>>>
>>>>>> If SRIOV doesn't work with that we need to change SRIOV and not the driver.
>>>>>>
>>>>>> Christian.
>>>>>>
>>>>>> Am 30.12.21 um 19:45 schrieb Andrey Grodzovsky:
>>>>>>> Sure, I guess i can drop this patch then.
>>>>>>>
>>>>>>> Andrey
>>>>>>>
>>>>>>> On 2021-12-24 4:57 a.m., JingWen Chen wrote:
>>>>>>>> I do agree with shaoyun, if the host find the gpu engine hangs first, and do the flr, guest side thread may not know this and still try to access HW(e.g. kfd is using a lot of amdgpu_in_reset and reset_sem to identify the reset status). And this may lead to very bad result.
>>>>>>>>
>>>>>>>> On 2021/12/24 下午4:58, Deng, Emily wrote:
>>>>>>>>> These patches look good to me. JingWen will pull these patches and do some basic TDR test on sriov environment, and give feedback.
>>>>>>>>>
>>>>>>>>> Best wishes
>>>>>>>>> Emily Deng
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>> -----Original Message-----
>>>>>>>>>> From: Liu, Monk <Monk.Liu@amd.com>
>>>>>>>>>> Sent: Thursday, December 23, 2021 6:14 PM
>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; Grodzovsky,
>>>>>>>>>> Andrey <Andrey.Grodzovsky@amd.com>;
>>>>>>>>>> dri-devel@lists.freedesktop.org; amd- gfx@lists.freedesktop.org;
>>>>>>>>>> Chen, Horace <Horace.Chen@amd.com>; Chen, JingWen
>>>>>>>>>> <JingWen.Chen2@amd.com>; Deng, Emily <Emily.Deng@amd.com>
>>>>>>>>>> Cc: daniel@ffwll.ch
>>>>>>>>>> Subject: RE: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset
>>>>>>>>>> protection for SRIOV
>>>>>>>>>>
>>>>>>>>>> [AMD Official Use Only]
>>>>>>>>>>
>>>>>>>>>> @Chen, Horace @Chen, JingWen @Deng, Emily
>>>>>>>>>>
>>>>>>>>>> Please take a review on Andrey's patch
>>>>>>>>>>
>>>>>>>>>> Thanks
>>>>>>>>>> -----------------------------------------------------------------
>>>>>>>>>> -- Monk Liu | Cloud GPU & Virtualization Solution | AMD
>>>>>>>>>> -----------------------------------------------------------------
>>>>>>>>>> -- we are hiring software manager for CVS core team
>>>>>>>>>> -----------------------------------------------------------------
>>>>>>>>>> -- 
>>>>>>>>>>
>>>>>>>>>> -----Original Message-----
>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>> Sent: Thursday, December 23, 2021 4:42 PM
>>>>>>>>>> To: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; dri-
>>>>>>>>>> devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
>>>>>>>>>> Cc: daniel@ffwll.ch; Liu, Monk <Monk.Liu@amd.com>; Chen, Horace
>>>>>>>>>> <Horace.Chen@amd.com>
>>>>>>>>>> Subject: Re: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset
>>>>>>>>>> protection for SRIOV
>>>>>>>>>>
>>>>>>>>>> Am 22.12.21 um 23:14 schrieb Andrey Grodzovsky:
>>>>>>>>>>> Since now flr work is serialized against  GPU resets there is no
>>>>>>>>>>> need for this.
>>>>>>>>>>>
>>>>>>>>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>>>>>>>>> Acked-by: Christian König <christian.koenig@amd.com>
>>>>>>>>>>
>>>>>>>>>>> ---
>>>>>>>>>>>       drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 11 -----------
>>>>>>>>>>>       drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 11 -----------
>>>>>>>>>>>       2 files changed, 22 deletions(-)
>>>>>>>>>>>
>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>> index 487cd654b69e..7d59a66e3988 100644
>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>> @@ -248,15 +248,7 @@ static void xgpu_ai_mailbox_flr_work(struct
>>>>>>>>>> work_struct *work)
>>>>>>>>>>>           struct amdgpu_device *adev = container_of(virt, struct
>>>>>>>>>> amdgpu_device, virt);
>>>>>>>>>>>           int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>
>>>>>>>>>>> -    /* block amdgpu_gpu_recover till msg FLR COMPLETE received,
>>>>>>>>>>> -     * otherwise the mailbox msg will be ruined/reseted by
>>>>>>>>>>> -     * the VF FLR.
>>>>>>>>>>> -     */
>>>>>>>>>>> -    if (!down_write_trylock(&adev->reset_sem))
>>>>>>>>>>> -        return;
>>>>>>>>>>> -
>>>>>>>>>>>           amdgpu_virt_fini_data_exchange(adev);
>>>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 1);
>>>>>>>>>>>
>>>>>>>>>>>           xgpu_ai_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0,
>>>>>>>>>>> 0, 0);
>>>>>>>>>>>
>>>>>>>>>>> @@ -269,9 +261,6 @@ static void xgpu_ai_mailbox_flr_work(struct
>>>>>>>>>> work_struct *work)
>>>>>>>>>>>           } while (timeout > 1);
>>>>>>>>>>>
>>>>>>>>>>>       flr_done:
>>>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 0);
>>>>>>>>>>> -    up_write(&adev->reset_sem);
>>>>>>>>>>> -
>>>>>>>>>>>           /* Trigger recovery for world switch failure if no TDR
>>>>>>>>>>> */
>>>>>>>>>>>           if (amdgpu_device_should_recover_gpu(adev)
>>>>>>>>>>>               && (!amdgpu_device_has_job_running(adev) || diff
>>>>>>>>>>> --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>> index e3869067a31d..f82c066c8e8d 100644
>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>> @@ -277,15 +277,7 @@ static void xgpu_nv_mailbox_flr_work(struct
>>>>>>>>>> work_struct *work)
>>>>>>>>>>>           struct amdgpu_device *adev = container_of(virt, struct
>>>>>>>>>> amdgpu_device, virt);
>>>>>>>>>>>           int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>
>>>>>>>>>>> -    /* block amdgpu_gpu_recover till msg FLR COMPLETE received,
>>>>>>>>>>> -     * otherwise the mailbox msg will be ruined/reseted by
>>>>>>>>>>> -     * the VF FLR.
>>>>>>>>>>> -     */
>>>>>>>>>>> -    if (!down_write_trylock(&adev->reset_sem))
>>>>>>>>>>> -        return;
>>>>>>>>>>> -
>>>>>>>>>>>           amdgpu_virt_fini_data_exchange(adev);
>>>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 1);
>>>>>>>>>>>
>>>>>>>>>>>           xgpu_nv_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0,
>>>>>>>>>>> 0, 0);
>>>>>>>>>>>
>>>>>>>>>>> @@ -298,9 +290,6 @@ static void xgpu_nv_mailbox_flr_work(struct
>>>>>>>>>> work_struct *work)
>>>>>>>>>>>           } while (timeout > 1);
>>>>>>>>>>>
>>>>>>>>>>>       flr_done:
>>>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 0);
>>>>>>>>>>> -    up_write(&adev->reset_sem);
>>>>>>>>>>> -
>>>>>>>>>>>           /* Trigger recovery for world switch failure if no TDR
>>>>>>>>>>> */
>>>>>>>>>>>           if (amdgpu_device_should_recover_gpu(adev)
>>>>>>>>>>>               && (!amdgpu_device_has_job_running(adev) ||
Andrey Grodzovsky Jan. 5, 2022, 6:24 p.m. UTC | #20
On 2022-01-05 2:59 a.m., Christian König wrote:
> Am 05.01.22 um 08:34 schrieb JingWen Chen:
>> On 2022/1/5 上午12:56, Andrey Grodzovsky wrote:
>>> On 2022-01-04 6:36 a.m., Christian König wrote:
>>>> Am 04.01.22 um 11:49 schrieb Liu, Monk:
>>>>> [AMD Official Use Only]
>>>>>
>>>>>>> See the FLR request from the hypervisor is just another source 
>>>>>>> of signaling the need for a reset, similar to each job timeout 
>>>>>>> on each queue. Otherwise you have a race condition between the 
>>>>>>> hypervisor and the scheduler.
>>>>> No it's not, FLR from hypervisor is just to notify guest the hw VF 
>>>>> FLR is about to start or was already executed, but host will do 
>>>>> FLR anyway without waiting for guest too long
>>>>>
>>>> Then we have a major design issue in the SRIOV protocol and really 
>>>> need to question this.
>>>>
>>>> How do you want to prevent a race between the hypervisor resetting 
>>>> the hardware and the client trying the same because of a timeout?
>>>>
>>>> As far as I can see the procedure should be:
>>>> 1. We detect that a reset is necessary, either because of a fault a 
>>>> timeout or signal from hypervisor.
>>>> 2. For each of those potential reset sources a work item is send to 
>>>> the single workqueue.
>>>> 3. One of those work items execute first and prepares the reset.
>>>> 4. We either do the reset our self or notify the hypervisor that we 
>>>> are ready for the reset.
>>>> 5. Cleanup after the reset, eventually resubmit jobs etc..
>>>> 6. Cancel work items which might have been scheduled from other 
>>>> reset sources.
>>>>
>>>> It does make sense that the hypervisor resets the hardware without 
>>>> waiting for the clients for too long, but if we don't follow this 
>>>> general steps we will always have a race between the different 
>>>> components.
>>>
>>> Monk, just to add to this - if indeed as you say that 'FLR from 
>>> hypervisor is just to notify guest the hw VF FLR is about to start 
>>> or was already executed, but host will do FLR anyway without waiting 
>>> for guest too long'
>>> and there is no strict waiting from the hypervisor for 
>>> IDH_READY_TO_RESET to be recived from guest before starting the 
>>> reset then setting in_gpu_reset and locking reset_sem from guest 
>>> side is not really full proof
>>> protection from MMIO accesses by the guest - it only truly helps if 
>>> hypervisor waits for that message before initiation of HW reset.
>>>
>> Hi Andrey, this cannot be done. If somehow guest kernel hangs and 
>> never has the chance to send the response back, then other VFs will 
>> have to wait it reset. All the vfs will hang in this case. Or 
>> sometimes the mailbox has some delay and other VFs will also wait. 
>> The user of other VFs will be affected in this case.
>
> Yeah, agree completely with JingWen. The hypervisor is the one in 
> charge here, not the guest.
>
> What the hypervisor should do (and it already seems to be designed 
> that way) is to send the guest a message that a reset is about to 
> happen and give it some time to response appropriately.
>
> The guest on the other hand then tells the hypervisor that all 
> processing has stopped and it is ready to restart. If that doesn't 
> happen in time the hypervisor should eliminate the guest probably 
> trigger even more severe consequences, e.g. restart the whole VM etc...
>
> Christian.


So what's the end conclusion here regarding dropping this particular 
patch ? Seems to me we still need to drop it to prevent driver's MMIO access
to the GPU during reset from various places in the code.

Andrey


>
>>> Andrey
>>>
>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>> Am 04.01.22 um 11:49 schrieb Liu, Monk:
>>>>> [AMD Official Use Only]
>>>>>
>>>>>>> See the FLR request from the hypervisor is just another source 
>>>>>>> of signaling the need for a reset, similar to each job timeout 
>>>>>>> on each queue. Otherwise you have a race condition between the 
>>>>>>> hypervisor and the scheduler.
>>>>> No it's not, FLR from hypervisor is just to notify guest the hw VF 
>>>>> FLR is about to start or was already executed, but host will do 
>>>>> FLR anyway without waiting for guest too long
>>>>>
>>>>>>> In other words I strongly think that the current SRIOV reset 
>>>>>>> implementation is severely broken and what Andrey is doing is 
>>>>>>> actually fixing it.
>>>>> It makes the code to crash ... how could it be a fix ?
>>>>>
>>>>> I'm afraid the patch is NAK from me,  but it is welcome if the 
>>>>> cleanup do not ruin the logic, Andry or jingwen can try it if needed.
>>>>>
>>>>> Thanks
>>>>> -------------------------------------------------------------------
>>>>> Monk Liu | Cloud GPU & Virtualization Solution | AMD
>>>>> -------------------------------------------------------------------
>>>>> we are hiring software manager for CVS core team
>>>>> -------------------------------------------------------------------
>>>>>
>>>>> -----Original Message-----
>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>> Sent: Tuesday, January 4, 2022 6:19 PM
>>>>> To: Chen, JingWen <JingWen.Chen2@amd.com>; Christian König 
>>>>> <ckoenig.leichtzumerken@gmail.com>; Grodzovsky, Andrey 
>>>>> <Andrey.Grodzovsky@amd.com>; Deng, Emily <Emily.Deng@amd.com>; 
>>>>> Liu, Monk <Monk.Liu@amd.com>; dri-devel@lists.freedesktop.org; 
>>>>> amd-gfx@lists.freedesktop.org; Chen, Horace <Horace.Chen@amd.com>; 
>>>>> Chen, JingWen <JingWen.Chen2@amd.com>
>>>>> Cc: daniel@ffwll.ch
>>>>> Subject: Re: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset 
>>>>> protection for SRIOV
>>>>>
>>>>> Hi Jingwen,
>>>>>
>>>>> well what I mean is that we need to adjust the implementation in 
>>>>> amdgpu to actually match the requirements.
>>>>>
>>>>> Could be that the reset sequence is questionable in general, but I 
>>>>> doubt so at least for now.
>>>>>
>>>>> See the FLR request from the hypervisor is just another source of 
>>>>> signaling the need for a reset, similar to each job timeout on 
>>>>> each queue. Otherwise you have a race condition between the 
>>>>> hypervisor and the scheduler.
>>>>>
>>>>> Properly setting in_gpu_reset is indeed mandatory, but should 
>>>>> happen at a central place and not in the SRIOV specific code.
>>>>>
>>>>> In other words I strongly think that the current SRIOV reset 
>>>>> implementation is severely broken and what Andrey is doing is 
>>>>> actually fixing it.
>>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>> Am 04.01.22 um 10:07 schrieb JingWen Chen:
>>>>>> Hi Christian,
>>>>>> I'm not sure what do you mean by "we need to change SRIOV not the 
>>>>>> driver".
>>>>>>
>>>>>> Do you mean we should change the reset sequence in SRIOV? This 
>>>>>> will be a huge change for our SRIOV solution.
>>>>>>
>>>>>>    From my point of view, we can directly use 
>>>>>> amdgpu_device_lock_adev
>>>>>> and amdgpu_device_unlock_adev in flr_work instead of try_lock 
>>>>>> since no one will conflict with this thread with reset_domain 
>>>>>> introduced.
>>>>>> But we do need the reset_sem and adev->in_gpu_reset to keep 
>>>>>> device untouched via user space.
>>>>>>
>>>>>> Best Regards,
>>>>>> Jingwen Chen
>>>>>>
>>>>>> On 2022/1/3 下午6:17, Christian König wrote:
>>>>>>> Please don't. This patch is vital to the cleanup of the reset 
>>>>>>> procedure.
>>>>>>>
>>>>>>> If SRIOV doesn't work with that we need to change SRIOV and not 
>>>>>>> the driver.
>>>>>>>
>>>>>>> Christian.
>>>>>>>
>>>>>>> Am 30.12.21 um 19:45 schrieb Andrey Grodzovsky:
>>>>>>>> Sure, I guess i can drop this patch then.
>>>>>>>>
>>>>>>>> Andrey
>>>>>>>>
>>>>>>>> On 2021-12-24 4:57 a.m., JingWen Chen wrote:
>>>>>>>>> I do agree with shaoyun, if the host find the gpu engine hangs 
>>>>>>>>> first, and do the flr, guest side thread may not know this and 
>>>>>>>>> still try to access HW(e.g. kfd is using a lot of 
>>>>>>>>> amdgpu_in_reset and reset_sem to identify the reset status). 
>>>>>>>>> And this may lead to very bad result.
>>>>>>>>>
>>>>>>>>> On 2021/12/24 下午4:58, Deng, Emily wrote:
>>>>>>>>>> These patches look good to me. JingWen will pull these 
>>>>>>>>>> patches and do some basic TDR test on sriov environment, and 
>>>>>>>>>> give feedback.
>>>>>>>>>>
>>>>>>>>>> Best wishes
>>>>>>>>>> Emily Deng
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>> From: Liu, Monk <Monk.Liu@amd.com>
>>>>>>>>>>> Sent: Thursday, December 23, 2021 6:14 PM
>>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; Grodzovsky,
>>>>>>>>>>> Andrey <Andrey.Grodzovsky@amd.com>;
>>>>>>>>>>> dri-devel@lists.freedesktop.org; amd- 
>>>>>>>>>>> gfx@lists.freedesktop.org;
>>>>>>>>>>> Chen, Horace <Horace.Chen@amd.com>; Chen, JingWen
>>>>>>>>>>> <JingWen.Chen2@amd.com>; Deng, Emily <Emily.Deng@amd.com>
>>>>>>>>>>> Cc: daniel@ffwll.ch
>>>>>>>>>>> Subject: RE: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU 
>>>>>>>>>>> reset
>>>>>>>>>>> protection for SRIOV
>>>>>>>>>>>
>>>>>>>>>>> [AMD Official Use Only]
>>>>>>>>>>>
>>>>>>>>>>> @Chen, Horace @Chen, JingWen @Deng, Emily
>>>>>>>>>>>
>>>>>>>>>>> Please take a review on Andrey's patch
>>>>>>>>>>>
>>>>>>>>>>> Thanks
>>>>>>>>>>> ----------------------------------------------------------------- 
>>>>>>>>>>>
>>>>>>>>>>> -- Monk Liu | Cloud GPU & Virtualization Solution | AMD
>>>>>>>>>>> ----------------------------------------------------------------- 
>>>>>>>>>>>
>>>>>>>>>>> -- we are hiring software manager for CVS core team
>>>>>>>>>>> ----------------------------------------------------------------- 
>>>>>>>>>>>
>>>>>>>>>>> -- 
>>>>>>>>>>>
>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>> Sent: Thursday, December 23, 2021 4:42 PM
>>>>>>>>>>> To: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; dri-
>>>>>>>>>>> devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
>>>>>>>>>>> Cc: daniel@ffwll.ch; Liu, Monk <Monk.Liu@amd.com>; Chen, Horace
>>>>>>>>>>> <Horace.Chen@amd.com>
>>>>>>>>>>> Subject: Re: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU 
>>>>>>>>>>> reset
>>>>>>>>>>> protection for SRIOV
>>>>>>>>>>>
>>>>>>>>>>> Am 22.12.21 um 23:14 schrieb Andrey Grodzovsky:
>>>>>>>>>>>> Since now flr work is serialized against  GPU resets there 
>>>>>>>>>>>> is no
>>>>>>>>>>>> need for this.
>>>>>>>>>>>>
>>>>>>>>>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>>>>>>>>>> Acked-by: Christian König <christian.koenig@amd.com>
>>>>>>>>>>>
>>>>>>>>>>>> ---
>>>>>>>>>>>>       drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 11 -----------
>>>>>>>>>>>>       drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 11 -----------
>>>>>>>>>>>>       2 files changed, 22 deletions(-)
>>>>>>>>>>>>
>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>> index 487cd654b69e..7d59a66e3988 100644
>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>> @@ -248,15 +248,7 @@ static void 
>>>>>>>>>>>> xgpu_ai_mailbox_flr_work(struct
>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>           struct amdgpu_device *adev = container_of(virt, 
>>>>>>>>>>>> struct
>>>>>>>>>>> amdgpu_device, virt);
>>>>>>>>>>>>           int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>
>>>>>>>>>>>> -    /* block amdgpu_gpu_recover till msg FLR COMPLETE 
>>>>>>>>>>>> received,
>>>>>>>>>>>> -     * otherwise the mailbox msg will be ruined/reseted by
>>>>>>>>>>>> -     * the VF FLR.
>>>>>>>>>>>> -     */
>>>>>>>>>>>> -    if (!down_write_trylock(&adev->reset_sem))
>>>>>>>>>>>> -        return;
>>>>>>>>>>>> -
>>>>>>>>>>>> amdgpu_virt_fini_data_exchange(adev);
>>>>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 1);
>>>>>>>>>>>>
>>>>>>>>>>>>           xgpu_ai_mailbox_trans_msg(adev, 
>>>>>>>>>>>> IDH_READY_TO_RESET, 0,
>>>>>>>>>>>> 0, 0);
>>>>>>>>>>>>
>>>>>>>>>>>> @@ -269,9 +261,6 @@ static void 
>>>>>>>>>>>> xgpu_ai_mailbox_flr_work(struct
>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>           } while (timeout > 1);
>>>>>>>>>>>>
>>>>>>>>>>>>       flr_done:
>>>>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 0);
>>>>>>>>>>>> -    up_write(&adev->reset_sem);
>>>>>>>>>>>> -
>>>>>>>>>>>>           /* Trigger recovery for world switch failure if 
>>>>>>>>>>>> no TDR
>>>>>>>>>>>> */
>>>>>>>>>>>>           if (amdgpu_device_should_recover_gpu(adev)
>>>>>>>>>>>>               && (!amdgpu_device_has_job_running(adev) || diff
>>>>>>>>>>>> --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>> index e3869067a31d..f82c066c8e8d 100644
>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>> @@ -277,15 +277,7 @@ static void 
>>>>>>>>>>>> xgpu_nv_mailbox_flr_work(struct
>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>           struct amdgpu_device *adev = container_of(virt, 
>>>>>>>>>>>> struct
>>>>>>>>>>> amdgpu_device, virt);
>>>>>>>>>>>>           int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>
>>>>>>>>>>>> -    /* block amdgpu_gpu_recover till msg FLR COMPLETE 
>>>>>>>>>>>> received,
>>>>>>>>>>>> -     * otherwise the mailbox msg will be ruined/reseted by
>>>>>>>>>>>> -     * the VF FLR.
>>>>>>>>>>>> -     */
>>>>>>>>>>>> -    if (!down_write_trylock(&adev->reset_sem))
>>>>>>>>>>>> -        return;
>>>>>>>>>>>> -
>>>>>>>>>>>> amdgpu_virt_fini_data_exchange(adev);
>>>>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 1);
>>>>>>>>>>>>
>>>>>>>>>>>>           xgpu_nv_mailbox_trans_msg(adev, 
>>>>>>>>>>>> IDH_READY_TO_RESET, 0,
>>>>>>>>>>>> 0, 0);
>>>>>>>>>>>>
>>>>>>>>>>>> @@ -298,9 +290,6 @@ static void 
>>>>>>>>>>>> xgpu_nv_mailbox_flr_work(struct
>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>           } while (timeout > 1);
>>>>>>>>>>>>
>>>>>>>>>>>>       flr_done:
>>>>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 0);
>>>>>>>>>>>> -    up_write(&adev->reset_sem);
>>>>>>>>>>>> -
>>>>>>>>>>>>           /* Trigger recovery for world switch failure if 
>>>>>>>>>>>> no TDR
>>>>>>>>>>>> */
>>>>>>>>>>>>           if (amdgpu_device_should_recover_gpu(adev)
>>>>>>>>>>>>               && (!amdgpu_device_has_job_running(adev) ||
>
JingWen Chen Jan. 6, 2022, 4:59 a.m. UTC | #21
On 2022/1/6 上午2:24, Andrey Grodzovsky wrote:
>
> On 2022-01-05 2:59 a.m., Christian König wrote:
>> Am 05.01.22 um 08:34 schrieb JingWen Chen:
>>> On 2022/1/5 上午12:56, Andrey Grodzovsky wrote:
>>>> On 2022-01-04 6:36 a.m., Christian König wrote:
>>>>> Am 04.01.22 um 11:49 schrieb Liu, Monk:
>>>>>> [AMD Official Use Only]
>>>>>>
>>>>>>>> See the FLR request from the hypervisor is just another source of signaling the need for a reset, similar to each job timeout on each queue. Otherwise you have a race condition between the hypervisor and the scheduler.
>>>>>> No it's not, FLR from hypervisor is just to notify guest the hw VF FLR is about to start or was already executed, but host will do FLR anyway without waiting for guest too long
>>>>>>
>>>>> Then we have a major design issue in the SRIOV protocol and really need to question this.
>>>>>
>>>>> How do you want to prevent a race between the hypervisor resetting the hardware and the client trying the same because of a timeout?
>>>>>
>>>>> As far as I can see the procedure should be:
>>>>> 1. We detect that a reset is necessary, either because of a fault a timeout or signal from hypervisor.
>>>>> 2. For each of those potential reset sources a work item is send to the single workqueue.
>>>>> 3. One of those work items execute first and prepares the reset.
>>>>> 4. We either do the reset our self or notify the hypervisor that we are ready for the reset.
>>>>> 5. Cleanup after the reset, eventually resubmit jobs etc..
>>>>> 6. Cancel work items which might have been scheduled from other reset sources.
>>>>>
>>>>> It does make sense that the hypervisor resets the hardware without waiting for the clients for too long, but if we don't follow this general steps we will always have a race between the different components.
>>>>
>>>> Monk, just to add to this - if indeed as you say that 'FLR from hypervisor is just to notify guest the hw VF FLR is about to start or was already executed, but host will do FLR anyway without waiting for guest too long'
>>>> and there is no strict waiting from the hypervisor for IDH_READY_TO_RESET to be recived from guest before starting the reset then setting in_gpu_reset and locking reset_sem from guest side is not really full proof
>>>> protection from MMIO accesses by the guest - it only truly helps if hypervisor waits for that message before initiation of HW reset.
>>>>
>>> Hi Andrey, this cannot be done. If somehow guest kernel hangs and never has the chance to send the response back, then other VFs will have to wait it reset. All the vfs will hang in this case. Or sometimes the mailbox has some delay and other VFs will also wait. The user of other VFs will be affected in this case.
>>
>> Yeah, agree completely with JingWen. The hypervisor is the one in charge here, not the guest.
>>
>> What the hypervisor should do (and it already seems to be designed that way) is to send the guest a message that a reset is about to happen and give it some time to response appropriately.
>>
>> The guest on the other hand then tells the hypervisor that all processing has stopped and it is ready to restart. If that doesn't happen in time the hypervisor should eliminate the guest probably trigger even more severe consequences, e.g. restart the whole VM etc...
>>
>> Christian.
>
>
> So what's the end conclusion here regarding dropping this particular patch ? Seems to me we still need to drop it to prevent driver's MMIO access
> to the GPU during reset from various places in the code.
>
> Andrey
>
Hi Andrey & Christian,

I have ported your patch(drop the reset_sem and in_gpu_reset in flr work) and run some tests. If a engine hang during an OCL benchmark(using kfd), we can see the logs below:

[  397.190727] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
[  397.301496] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
[  397.406601] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
[  397.532343] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
[  397.642251] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
[  397.746634] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
[  397.850761] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
[  397.960544] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
[  398.065218] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
[  398.182173] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
[  398.288264] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
[  398.394712] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
[  428.400582] [drm] clean up the vf2pf work item
[  428.500528] amdgpu 0000:00:07.0: amdgpu: [gfxhub] page fault (src_id:0 ring:153 vmid:8 pasid:32771, for process xgemmStandalone pid 3557 thread xgemmStandalone pid 3557)
[  428.527576] amdgpu 0000:00:07.0: amdgpu:   in page starting at address 0x00007fc991c04000 from client 0x1b (UTCL2)
[  437.531392] amdgpu: qcm fence wait loop timeout expired
[  437.535738] amdgpu: The cp might be in an unrecoverable state due to an unsuccessful queues preemption
[  437.537191] amdgpu 0000:00:07.0: amdgpu: GPU reset begin!
[  438.087443] [drm] RE-INIT-early: nv_common succeeded

As kfd relies on these to check if GPU is in reset, dropping it will hit some page fault and fence error very easily.

>
>>
>>>> Andrey
>>>>
>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>> Am 04.01.22 um 11:49 schrieb Liu, Monk:
>>>>>> [AMD Official Use Only]
>>>>>>
>>>>>>>> See the FLR request from the hypervisor is just another source of signaling the need for a reset, similar to each job timeout on each queue. Otherwise you have a race condition between the hypervisor and the scheduler.
>>>>>> No it's not, FLR from hypervisor is just to notify guest the hw VF FLR is about to start or was already executed, but host will do FLR anyway without waiting for guest too long
>>>>>>
>>>>>>>> In other words I strongly think that the current SRIOV reset implementation is severely broken and what Andrey is doing is actually fixing it.
>>>>>> It makes the code to crash ... how could it be a fix ?
>>>>>>
>>>>>> I'm afraid the patch is NAK from me,  but it is welcome if the cleanup do not ruin the logic, Andry or jingwen can try it if needed.
>>>>>>
>>>>>> Thanks
>>>>>> -------------------------------------------------------------------
>>>>>> Monk Liu | Cloud GPU & Virtualization Solution | AMD
>>>>>> -------------------------------------------------------------------
>>>>>> we are hiring software manager for CVS core team
>>>>>> -------------------------------------------------------------------
>>>>>>
>>>>>> -----Original Message-----
>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>> Sent: Tuesday, January 4, 2022 6:19 PM
>>>>>> To: Chen, JingWen <JingWen.Chen2@amd.com>; Christian König <ckoenig.leichtzumerken@gmail.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Deng, Emily <Emily.Deng@amd.com>; Liu, Monk <Monk.Liu@amd.com>; dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org; Chen, Horace <Horace.Chen@amd.com>; Chen, JingWen <JingWen.Chen2@amd.com>
>>>>>> Cc: daniel@ffwll.ch
>>>>>> Subject: Re: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset protection for SRIOV
>>>>>>
>>>>>> Hi Jingwen,
>>>>>>
>>>>>> well what I mean is that we need to adjust the implementation in amdgpu to actually match the requirements.
>>>>>>
>>>>>> Could be that the reset sequence is questionable in general, but I doubt so at least for now.
>>>>>>
>>>>>> See the FLR request from the hypervisor is just another source of signaling the need for a reset, similar to each job timeout on each queue. Otherwise you have a race condition between the hypervisor and the scheduler.
>>>>>>
>>>>>> Properly setting in_gpu_reset is indeed mandatory, but should happen at a central place and not in the SRIOV specific code.
>>>>>>
>>>>>> In other words I strongly think that the current SRIOV reset implementation is severely broken and what Andrey is doing is actually fixing it.
>>>>>>
>>>>>> Regards,
>>>>>> Christian.
>>>>>>
>>>>>> Am 04.01.22 um 10:07 schrieb JingWen Chen:
>>>>>>> Hi Christian,
>>>>>>> I'm not sure what do you mean by "we need to change SRIOV not the driver".
>>>>>>>
>>>>>>> Do you mean we should change the reset sequence in SRIOV? This will be a huge change for our SRIOV solution.
>>>>>>>
>>>>>>>    From my point of view, we can directly use amdgpu_device_lock_adev
>>>>>>> and amdgpu_device_unlock_adev in flr_work instead of try_lock since no one will conflict with this thread with reset_domain introduced.
>>>>>>> But we do need the reset_sem and adev->in_gpu_reset to keep device untouched via user space.
>>>>>>>
>>>>>>> Best Regards,
>>>>>>> Jingwen Chen
>>>>>>>
>>>>>>> On 2022/1/3 下午6:17, Christian König wrote:
>>>>>>>> Please don't. This patch is vital to the cleanup of the reset procedure.
>>>>>>>>
>>>>>>>> If SRIOV doesn't work with that we need to change SRIOV and not the driver.
>>>>>>>>
>>>>>>>> Christian.
>>>>>>>>
>>>>>>>> Am 30.12.21 um 19:45 schrieb Andrey Grodzovsky:
>>>>>>>>> Sure, I guess i can drop this patch then.
>>>>>>>>>
>>>>>>>>> Andrey
>>>>>>>>>
>>>>>>>>> On 2021-12-24 4:57 a.m., JingWen Chen wrote:
>>>>>>>>>> I do agree with shaoyun, if the host find the gpu engine hangs first, and do the flr, guest side thread may not know this and still try to access HW(e.g. kfd is using a lot of amdgpu_in_reset and reset_sem to identify the reset status). And this may lead to very bad result.
>>>>>>>>>>
>>>>>>>>>> On 2021/12/24 下午4:58, Deng, Emily wrote:
>>>>>>>>>>> These patches look good to me. JingWen will pull these patches and do some basic TDR test on sriov environment, and give feedback.
>>>>>>>>>>>
>>>>>>>>>>> Best wishes
>>>>>>>>>>> Emily Deng
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>> From: Liu, Monk <Monk.Liu@amd.com>
>>>>>>>>>>>> Sent: Thursday, December 23, 2021 6:14 PM
>>>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; Grodzovsky,
>>>>>>>>>>>> Andrey <Andrey.Grodzovsky@amd.com>;
>>>>>>>>>>>> dri-devel@lists.freedesktop.org; amd- gfx@lists.freedesktop.org;
>>>>>>>>>>>> Chen, Horace <Horace.Chen@amd.com>; Chen, JingWen
>>>>>>>>>>>> <JingWen.Chen2@amd.com>; Deng, Emily <Emily.Deng@amd.com>
>>>>>>>>>>>> Cc: daniel@ffwll.ch
>>>>>>>>>>>> Subject: RE: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset
>>>>>>>>>>>> protection for SRIOV
>>>>>>>>>>>>
>>>>>>>>>>>> [AMD Official Use Only]
>>>>>>>>>>>>
>>>>>>>>>>>> @Chen, Horace @Chen, JingWen @Deng, Emily
>>>>>>>>>>>>
>>>>>>>>>>>> Please take a review on Andrey's patch
>>>>>>>>>>>>
>>>>>>>>>>>> Thanks
>>>>>>>>>>>> -----------------------------------------------------------------
>>>>>>>>>>>> -- Monk Liu | Cloud GPU & Virtualization Solution | AMD
>>>>>>>>>>>> -----------------------------------------------------------------
>>>>>>>>>>>> -- we are hiring software manager for CVS core team
>>>>>>>>>>>> -----------------------------------------------------------------
>>>>>>>>>>>> -- 
>>>>>>>>>>>>
>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>> Sent: Thursday, December 23, 2021 4:42 PM
>>>>>>>>>>>> To: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; dri-
>>>>>>>>>>>> devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
>>>>>>>>>>>> Cc: daniel@ffwll.ch; Liu, Monk <Monk.Liu@amd.com>; Chen, Horace
>>>>>>>>>>>> <Horace.Chen@amd.com>
>>>>>>>>>>>> Subject: Re: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset
>>>>>>>>>>>> protection for SRIOV
>>>>>>>>>>>>
>>>>>>>>>>>> Am 22.12.21 um 23:14 schrieb Andrey Grodzovsky:
>>>>>>>>>>>>> Since now flr work is serialized against  GPU resets there is no
>>>>>>>>>>>>> need for this.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>>>>>>>>>>> Acked-by: Christian König <christian.koenig@amd.com>
>>>>>>>>>>>>
>>>>>>>>>>>>> ---
>>>>>>>>>>>>>       drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 11 -----------
>>>>>>>>>>>>>       drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 11 -----------
>>>>>>>>>>>>>       2 files changed, 22 deletions(-)
>>>>>>>>>>>>>
>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>> index 487cd654b69e..7d59a66e3988 100644
>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>> @@ -248,15 +248,7 @@ static void xgpu_ai_mailbox_flr_work(struct
>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>           struct amdgpu_device *adev = container_of(virt, struct
>>>>>>>>>>>> amdgpu_device, virt);
>>>>>>>>>>>>>           int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>
>>>>>>>>>>>>> -    /* block amdgpu_gpu_recover till msg FLR COMPLETE received,
>>>>>>>>>>>>> -     * otherwise the mailbox msg will be ruined/reseted by
>>>>>>>>>>>>> -     * the VF FLR.
>>>>>>>>>>>>> -     */
>>>>>>>>>>>>> -    if (!down_write_trylock(&adev->reset_sem))
>>>>>>>>>>>>> -        return;
>>>>>>>>>>>>> -
>>>>>>>>>>>>> amdgpu_virt_fini_data_exchange(adev);
>>>>>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 1);
>>>>>>>>>>>>>
>>>>>>>>>>>>>           xgpu_ai_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0,
>>>>>>>>>>>>> 0, 0);
>>>>>>>>>>>>>
>>>>>>>>>>>>> @@ -269,9 +261,6 @@ static void xgpu_ai_mailbox_flr_work(struct
>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>           } while (timeout > 1);
>>>>>>>>>>>>>
>>>>>>>>>>>>>       flr_done:
>>>>>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 0);
>>>>>>>>>>>>> -    up_write(&adev->reset_sem);
>>>>>>>>>>>>> -
>>>>>>>>>>>>>           /* Trigger recovery for world switch failure if no TDR
>>>>>>>>>>>>> */
>>>>>>>>>>>>>           if (amdgpu_device_should_recover_gpu(adev)
>>>>>>>>>>>>>               && (!amdgpu_device_has_job_running(adev) || diff
>>>>>>>>>>>>> --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>> index e3869067a31d..f82c066c8e8d 100644
>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>> @@ -277,15 +277,7 @@ static void xgpu_nv_mailbox_flr_work(struct
>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>           struct amdgpu_device *adev = container_of(virt, struct
>>>>>>>>>>>> amdgpu_device, virt);
>>>>>>>>>>>>>           int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>
>>>>>>>>>>>>> -    /* block amdgpu_gpu_recover till msg FLR COMPLETE received,
>>>>>>>>>>>>> -     * otherwise the mailbox msg will be ruined/reseted by
>>>>>>>>>>>>> -     * the VF FLR.
>>>>>>>>>>>>> -     */
>>>>>>>>>>>>> -    if (!down_write_trylock(&adev->reset_sem))
>>>>>>>>>>>>> -        return;
>>>>>>>>>>>>> -
>>>>>>>>>>>>> amdgpu_virt_fini_data_exchange(adev);
>>>>>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 1);
>>>>>>>>>>>>>
>>>>>>>>>>>>>           xgpu_nv_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0,
>>>>>>>>>>>>> 0, 0);
>>>>>>>>>>>>>
>>>>>>>>>>>>> @@ -298,9 +290,6 @@ static void xgpu_nv_mailbox_flr_work(struct
>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>           } while (timeout > 1);
>>>>>>>>>>>>>
>>>>>>>>>>>>>       flr_done:
>>>>>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 0);
>>>>>>>>>>>>> -    up_write(&adev->reset_sem);
>>>>>>>>>>>>> -
>>>>>>>>>>>>>           /* Trigger recovery for world switch failure if no TDR
>>>>>>>>>>>>> */
>>>>>>>>>>>>>           if (amdgpu_device_should_recover_gpu(adev)
>>>>>>>>>>>>>               && (!amdgpu_device_has_job_running(adev) ||
>>
JingWen Chen Jan. 6, 2022, 5:18 a.m. UTC | #22
On 2022/1/6 下午12:59, JingWen Chen wrote:
> On 2022/1/6 上午2:24, Andrey Grodzovsky wrote:
>> On 2022-01-05 2:59 a.m., Christian König wrote:
>>> Am 05.01.22 um 08:34 schrieb JingWen Chen:
>>>> On 2022/1/5 上午12:56, Andrey Grodzovsky wrote:
>>>>> On 2022-01-04 6:36 a.m., Christian König wrote:
>>>>>> Am 04.01.22 um 11:49 schrieb Liu, Monk:
>>>>>>> [AMD Official Use Only]
>>>>>>>
>>>>>>>>> See the FLR request from the hypervisor is just another source of signaling the need for a reset, similar to each job timeout on each queue. Otherwise you have a race condition between the hypervisor and the scheduler.
>>>>>>> No it's not, FLR from hypervisor is just to notify guest the hw VF FLR is about to start or was already executed, but host will do FLR anyway without waiting for guest too long
>>>>>>>
>>>>>> Then we have a major design issue in the SRIOV protocol and really need to question this.
>>>>>>
>>>>>> How do you want to prevent a race between the hypervisor resetting the hardware and the client trying the same because of a timeout?
>>>>>>
>>>>>> As far as I can see the procedure should be:
>>>>>> 1. We detect that a reset is necessary, either because of a fault a timeout or signal from hypervisor.
>>>>>> 2. For each of those potential reset sources a work item is send to the single workqueue.
>>>>>> 3. One of those work items execute first and prepares the reset.
>>>>>> 4. We either do the reset our self or notify the hypervisor that we are ready for the reset.
>>>>>> 5. Cleanup after the reset, eventually resubmit jobs etc..
>>>>>> 6. Cancel work items which might have been scheduled from other reset sources.
>>>>>>
>>>>>> It does make sense that the hypervisor resets the hardware without waiting for the clients for too long, but if we don't follow this general steps we will always have a race between the different components.
>>>>> Monk, just to add to this - if indeed as you say that 'FLR from hypervisor is just to notify guest the hw VF FLR is about to start or was already executed, but host will do FLR anyway without waiting for guest too long'
>>>>> and there is no strict waiting from the hypervisor for IDH_READY_TO_RESET to be recived from guest before starting the reset then setting in_gpu_reset and locking reset_sem from guest side is not really full proof
>>>>> protection from MMIO accesses by the guest - it only truly helps if hypervisor waits for that message before initiation of HW reset.
>>>>>
>>>> Hi Andrey, this cannot be done. If somehow guest kernel hangs and never has the chance to send the response back, then other VFs will have to wait it reset. All the vfs will hang in this case. Or sometimes the mailbox has some delay and other VFs will also wait. The user of other VFs will be affected in this case.
>>> Yeah, agree completely with JingWen. The hypervisor is the one in charge here, not the guest.
>>>
>>> What the hypervisor should do (and it already seems to be designed that way) is to send the guest a message that a reset is about to happen and give it some time to response appropriately.
>>>
>>> The guest on the other hand then tells the hypervisor that all processing has stopped and it is ready to restart. If that doesn't happen in time the hypervisor should eliminate the guest probably trigger even more severe consequences, e.g. restart the whole VM etc...
>>>
>>> Christian.
>>
>> So what's the end conclusion here regarding dropping this particular patch ? Seems to me we still need to drop it to prevent driver's MMIO access
>> to the GPU during reset from various places in the code.
>>
>> Andrey
>>
> Hi Andrey & Christian,
>
> I have ported your patch(drop the reset_sem and in_gpu_reset in flr work) and run some tests. If a engine hang during an OCL benchmark(using kfd), we can see the logs below:
>
> [  397.190727] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
> [  397.301496] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
> [  397.406601] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
> [  397.532343] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
> [  397.642251] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
> [  397.746634] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
> [  397.850761] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
> [  397.960544] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
> [  398.065218] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
> [  398.182173] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
> [  398.288264] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
> [  398.394712] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
> [  428.400582] [drm] clean up the vf2pf work item
> [  428.500528] amdgpu 0000:00:07.0: amdgpu: [gfxhub] page fault (src_id:0 ring:153 vmid:8 pasid:32771, for process xgemmStandalone pid 3557 thread xgemmStandalone pid 3557)
> [  428.527576] amdgpu 0000:00:07.0: amdgpu:   in page starting at address 0x00007fc991c04000 from client 0x1b (UTCL2)
> [  437.531392] amdgpu: qcm fence wait loop timeout expired
> [  437.535738] amdgpu: The cp might be in an unrecoverable state due to an unsuccessful queues preemption
> [  437.537191] amdgpu 0000:00:07.0: amdgpu: GPU reset begin!
> [  438.087443] [drm] RE-INIT-early: nv_common succeeded
>
> As kfd relies on these to check if GPU is in reset, dropping it will hit some page fault and fence error very easily.
To be clear, we can also hit the page fault with the reset_sem and in_gpu_reset, just not as easily as dropping them.
>
>>>>> Andrey
>>>>>
>>>>>
>>>>>> Regards,
>>>>>> Christian.
>>>>>>
>>>>>> Am 04.01.22 um 11:49 schrieb Liu, Monk:
>>>>>>> [AMD Official Use Only]
>>>>>>>
>>>>>>>>> See the FLR request from the hypervisor is just another source of signaling the need for a reset, similar to each job timeout on each queue. Otherwise you have a race condition between the hypervisor and the scheduler.
>>>>>>> No it's not, FLR from hypervisor is just to notify guest the hw VF FLR is about to start or was already executed, but host will do FLR anyway without waiting for guest too long
>>>>>>>
>>>>>>>>> In other words I strongly think that the current SRIOV reset implementation is severely broken and what Andrey is doing is actually fixing it.
>>>>>>> It makes the code to crash ... how could it be a fix ?
>>>>>>>
>>>>>>> I'm afraid the patch is NAK from me,  but it is welcome if the cleanup do not ruin the logic, Andry or jingwen can try it if needed.
>>>>>>>
>>>>>>> Thanks
>>>>>>> -------------------------------------------------------------------
>>>>>>> Monk Liu | Cloud GPU & Virtualization Solution | AMD
>>>>>>> -------------------------------------------------------------------
>>>>>>> we are hiring software manager for CVS core team
>>>>>>> -------------------------------------------------------------------
>>>>>>>
>>>>>>> -----Original Message-----
>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>> Sent: Tuesday, January 4, 2022 6:19 PM
>>>>>>> To: Chen, JingWen <JingWen.Chen2@amd.com>; Christian König <ckoenig.leichtzumerken@gmail.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Deng, Emily <Emily.Deng@amd.com>; Liu, Monk <Monk.Liu@amd.com>; dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org; Chen, Horace <Horace.Chen@amd.com>; Chen, JingWen <JingWen.Chen2@amd.com>
>>>>>>> Cc: daniel@ffwll.ch
>>>>>>> Subject: Re: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset protection for SRIOV
>>>>>>>
>>>>>>> Hi Jingwen,
>>>>>>>
>>>>>>> well what I mean is that we need to adjust the implementation in amdgpu to actually match the requirements.
>>>>>>>
>>>>>>> Could be that the reset sequence is questionable in general, but I doubt so at least for now.
>>>>>>>
>>>>>>> See the FLR request from the hypervisor is just another source of signaling the need for a reset, similar to each job timeout on each queue. Otherwise you have a race condition between the hypervisor and the scheduler.
>>>>>>>
>>>>>>> Properly setting in_gpu_reset is indeed mandatory, but should happen at a central place and not in the SRIOV specific code.
>>>>>>>
>>>>>>> In other words I strongly think that the current SRIOV reset implementation is severely broken and what Andrey is doing is actually fixing it.
>>>>>>>
>>>>>>> Regards,
>>>>>>> Christian.
>>>>>>>
>>>>>>> Am 04.01.22 um 10:07 schrieb JingWen Chen:
>>>>>>>> Hi Christian,
>>>>>>>> I'm not sure what do you mean by "we need to change SRIOV not the driver".
>>>>>>>>
>>>>>>>> Do you mean we should change the reset sequence in SRIOV? This will be a huge change for our SRIOV solution.
>>>>>>>>
>>>>>>>>    From my point of view, we can directly use amdgpu_device_lock_adev
>>>>>>>> and amdgpu_device_unlock_adev in flr_work instead of try_lock since no one will conflict with this thread with reset_domain introduced.
>>>>>>>> But we do need the reset_sem and adev->in_gpu_reset to keep device untouched via user space.
>>>>>>>>
>>>>>>>> Best Regards,
>>>>>>>> Jingwen Chen
>>>>>>>>
>>>>>>>> On 2022/1/3 下午6:17, Christian König wrote:
>>>>>>>>> Please don't. This patch is vital to the cleanup of the reset procedure.
>>>>>>>>>
>>>>>>>>> If SRIOV doesn't work with that we need to change SRIOV and not the driver.
>>>>>>>>>
>>>>>>>>> Christian.
>>>>>>>>>
>>>>>>>>> Am 30.12.21 um 19:45 schrieb Andrey Grodzovsky:
>>>>>>>>>> Sure, I guess i can drop this patch then.
>>>>>>>>>>
>>>>>>>>>> Andrey
>>>>>>>>>>
>>>>>>>>>> On 2021-12-24 4:57 a.m., JingWen Chen wrote:
>>>>>>>>>>> I do agree with shaoyun, if the host find the gpu engine hangs first, and do the flr, guest side thread may not know this and still try to access HW(e.g. kfd is using a lot of amdgpu_in_reset and reset_sem to identify the reset status). And this may lead to very bad result.
>>>>>>>>>>>
>>>>>>>>>>> On 2021/12/24 下午4:58, Deng, Emily wrote:
>>>>>>>>>>>> These patches look good to me. JingWen will pull these patches and do some basic TDR test on sriov environment, and give feedback.
>>>>>>>>>>>>
>>>>>>>>>>>> Best wishes
>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>> From: Liu, Monk <Monk.Liu@amd.com>
>>>>>>>>>>>>> Sent: Thursday, December 23, 2021 6:14 PM
>>>>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; Grodzovsky,
>>>>>>>>>>>>> Andrey <Andrey.Grodzovsky@amd.com>;
>>>>>>>>>>>>> dri-devel@lists.freedesktop.org; amd- gfx@lists.freedesktop.org;
>>>>>>>>>>>>> Chen, Horace <Horace.Chen@amd.com>; Chen, JingWen
>>>>>>>>>>>>> <JingWen.Chen2@amd.com>; Deng, Emily <Emily.Deng@amd.com>
>>>>>>>>>>>>> Cc: daniel@ffwll.ch
>>>>>>>>>>>>> Subject: RE: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset
>>>>>>>>>>>>> protection for SRIOV
>>>>>>>>>>>>>
>>>>>>>>>>>>> [AMD Official Use Only]
>>>>>>>>>>>>>
>>>>>>>>>>>>> @Chen, Horace @Chen, JingWen @Deng, Emily
>>>>>>>>>>>>>
>>>>>>>>>>>>> Please take a review on Andrey's patch
>>>>>>>>>>>>>
>>>>>>>>>>>>> Thanks
>>>>>>>>>>>>> -----------------------------------------------------------------
>>>>>>>>>>>>> -- Monk Liu | Cloud GPU & Virtualization Solution | AMD
>>>>>>>>>>>>> -----------------------------------------------------------------
>>>>>>>>>>>>> -- we are hiring software manager for CVS core team
>>>>>>>>>>>>> -----------------------------------------------------------------
>>>>>>>>>>>>> -- 
>>>>>>>>>>>>>
>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>> Sent: Thursday, December 23, 2021 4:42 PM
>>>>>>>>>>>>> To: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; dri-
>>>>>>>>>>>>> devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>> Cc: daniel@ffwll.ch; Liu, Monk <Monk.Liu@amd.com>; Chen, Horace
>>>>>>>>>>>>> <Horace.Chen@amd.com>
>>>>>>>>>>>>> Subject: Re: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset
>>>>>>>>>>>>> protection for SRIOV
>>>>>>>>>>>>>
>>>>>>>>>>>>> Am 22.12.21 um 23:14 schrieb Andrey Grodzovsky:
>>>>>>>>>>>>>> Since now flr work is serialized against  GPU resets there is no
>>>>>>>>>>>>>> need for this.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>>>>>>>>>>>> Acked-by: Christian König <christian.koenig@amd.com>
>>>>>>>>>>>>>
>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>       drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 11 -----------
>>>>>>>>>>>>>>       drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 11 -----------
>>>>>>>>>>>>>>       2 files changed, 22 deletions(-)
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>> index 487cd654b69e..7d59a66e3988 100644
>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>> @@ -248,15 +248,7 @@ static void xgpu_ai_mailbox_flr_work(struct
>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>           struct amdgpu_device *adev = container_of(virt, struct
>>>>>>>>>>>>> amdgpu_device, virt);
>>>>>>>>>>>>>>           int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> -    /* block amdgpu_gpu_recover till msg FLR COMPLETE received,
>>>>>>>>>>>>>> -     * otherwise the mailbox msg will be ruined/reseted by
>>>>>>>>>>>>>> -     * the VF FLR.
>>>>>>>>>>>>>> -     */
>>>>>>>>>>>>>> -    if (!down_write_trylock(&adev->reset_sem))
>>>>>>>>>>>>>> -        return;
>>>>>>>>>>>>>> -
>>>>>>>>>>>>>> amdgpu_virt_fini_data_exchange(adev);
>>>>>>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 1);
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>           xgpu_ai_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0,
>>>>>>>>>>>>>> 0, 0);
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> @@ -269,9 +261,6 @@ static void xgpu_ai_mailbox_flr_work(struct
>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>           } while (timeout > 1);
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>       flr_done:
>>>>>>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 0);
>>>>>>>>>>>>>> -    up_write(&adev->reset_sem);
>>>>>>>>>>>>>> -
>>>>>>>>>>>>>>           /* Trigger recovery for world switch failure if no TDR
>>>>>>>>>>>>>> */
>>>>>>>>>>>>>>           if (amdgpu_device_should_recover_gpu(adev)
>>>>>>>>>>>>>>               && (!amdgpu_device_has_job_running(adev) || diff
>>>>>>>>>>>>>> --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>> index e3869067a31d..f82c066c8e8d 100644
>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>> @@ -277,15 +277,7 @@ static void xgpu_nv_mailbox_flr_work(struct
>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>           struct amdgpu_device *adev = container_of(virt, struct
>>>>>>>>>>>>> amdgpu_device, virt);
>>>>>>>>>>>>>>           int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> -    /* block amdgpu_gpu_recover till msg FLR COMPLETE received,
>>>>>>>>>>>>>> -     * otherwise the mailbox msg will be ruined/reseted by
>>>>>>>>>>>>>> -     * the VF FLR.
>>>>>>>>>>>>>> -     */
>>>>>>>>>>>>>> -    if (!down_write_trylock(&adev->reset_sem))
>>>>>>>>>>>>>> -        return;
>>>>>>>>>>>>>> -
>>>>>>>>>>>>>> amdgpu_virt_fini_data_exchange(adev);
>>>>>>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 1);
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>           xgpu_nv_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0,
>>>>>>>>>>>>>> 0, 0);
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> @@ -298,9 +290,6 @@ static void xgpu_nv_mailbox_flr_work(struct
>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>           } while (timeout > 1);
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>       flr_done:
>>>>>>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 0);
>>>>>>>>>>>>>> -    up_write(&adev->reset_sem);
>>>>>>>>>>>>>> -
>>>>>>>>>>>>>>           /* Trigger recovery for world switch failure if no TDR
>>>>>>>>>>>>>> */
>>>>>>>>>>>>>>           if (amdgpu_device_should_recover_gpu(adev)
>>>>>>>>>>>>>>               && (!amdgpu_device_has_job_running(adev) ||
Christian König Jan. 6, 2022, 9:13 a.m. UTC | #23
Am 06.01.22 um 06:18 schrieb JingWen Chen:
> On 2022/1/6 下午12:59, JingWen Chen wrote:
>> On 2022/1/6 上午2:24, Andrey Grodzovsky wrote:
>>> On 2022-01-05 2:59 a.m., Christian König wrote:
>>>> Am 05.01.22 um 08:34 schrieb JingWen Chen:
>>>>> On 2022/1/5 上午12:56, Andrey Grodzovsky wrote:
>>>>>> On 2022-01-04 6:36 a.m., Christian König wrote:
>>>>>>> Am 04.01.22 um 11:49 schrieb Liu, Monk:
>>>>>>>> [AMD Official Use Only]
>>>>>>>>
>>>>>>>>>> See the FLR request from the hypervisor is just another source of signaling the need for a reset, similar to each job timeout on each queue. Otherwise you have a race condition between the hypervisor and the scheduler.
>>>>>>>> No it's not, FLR from hypervisor is just to notify guest the hw VF FLR is about to start or was already executed, but host will do FLR anyway without waiting for guest too long
>>>>>>>>
>>>>>>> Then we have a major design issue in the SRIOV protocol and really need to question this.
>>>>>>>
>>>>>>> How do you want to prevent a race between the hypervisor resetting the hardware and the client trying the same because of a timeout?
>>>>>>>
>>>>>>> As far as I can see the procedure should be:
>>>>>>> 1. We detect that a reset is necessary, either because of a fault a timeout or signal from hypervisor.
>>>>>>> 2. For each of those potential reset sources a work item is send to the single workqueue.
>>>>>>> 3. One of those work items execute first and prepares the reset.
>>>>>>> 4. We either do the reset our self or notify the hypervisor that we are ready for the reset.
>>>>>>> 5. Cleanup after the reset, eventually resubmit jobs etc..
>>>>>>> 6. Cancel work items which might have been scheduled from other reset sources.
>>>>>>>
>>>>>>> It does make sense that the hypervisor resets the hardware without waiting for the clients for too long, but if we don't follow this general steps we will always have a race between the different components.
>>>>>> Monk, just to add to this - if indeed as you say that 'FLR from hypervisor is just to notify guest the hw VF FLR is about to start or was already executed, but host will do FLR anyway without waiting for guest too long'
>>>>>> and there is no strict waiting from the hypervisor for IDH_READY_TO_RESET to be recived from guest before starting the reset then setting in_gpu_reset and locking reset_sem from guest side is not really full proof
>>>>>> protection from MMIO accesses by the guest - it only truly helps if hypervisor waits for that message before initiation of HW reset.
>>>>>>
>>>>> Hi Andrey, this cannot be done. If somehow guest kernel hangs and never has the chance to send the response back, then other VFs will have to wait it reset. All the vfs will hang in this case. Or sometimes the mailbox has some delay and other VFs will also wait. The user of other VFs will be affected in this case.
>>>> Yeah, agree completely with JingWen. The hypervisor is the one in charge here, not the guest.
>>>>
>>>> What the hypervisor should do (and it already seems to be designed that way) is to send the guest a message that a reset is about to happen and give it some time to response appropriately.
>>>>
>>>> The guest on the other hand then tells the hypervisor that all processing has stopped and it is ready to restart. If that doesn't happen in time the hypervisor should eliminate the guest probably trigger even more severe consequences, e.g. restart the whole VM etc...
>>>>
>>>> Christian.
>>> So what's the end conclusion here regarding dropping this particular patch ? Seems to me we still need to drop it to prevent driver's MMIO access
>>> to the GPU during reset from various places in the code.
>>>
>>> Andrey
>>>
>> Hi Andrey & Christian,
>>
>> I have ported your patch(drop the reset_sem and in_gpu_reset in flr work) and run some tests. If a engine hang during an OCL benchmark(using kfd), we can see the logs below:
>>
>> [  397.190727] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>> [  397.301496] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>> [  397.406601] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>> [  397.532343] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>> [  397.642251] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>> [  397.746634] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>> [  397.850761] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>> [  397.960544] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>> [  398.065218] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>> [  398.182173] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>> [  398.288264] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>> [  398.394712] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>> [  428.400582] [drm] clean up the vf2pf work item
>> [  428.500528] amdgpu 0000:00:07.0: amdgpu: [gfxhub] page fault (src_id:0 ring:153 vmid:8 pasid:32771, for process xgemmStandalone pid 3557 thread xgemmStandalone pid 3557)
>> [  428.527576] amdgpu 0000:00:07.0: amdgpu:   in page starting at address 0x00007fc991c04000 from client 0x1b (UTCL2)
>> [  437.531392] amdgpu: qcm fence wait loop timeout expired
>> [  437.535738] amdgpu: The cp might be in an unrecoverable state due to an unsuccessful queues preemption
>> [  437.537191] amdgpu 0000:00:07.0: amdgpu: GPU reset begin!
>> [  438.087443] [drm] RE-INIT-early: nv_common succeeded
>>
>> As kfd relies on these to check if GPU is in reset, dropping it will hit some page fault and fence error very easily.
> To be clear, we can also hit the page fault with the reset_sem and in_gpu_reset, just not as easily as dropping them.

Yeah, I was just about to complain that this isn't good engineering but 
just makes things less likely.

Question is what would be the approach to avoid those kind of problems 
from the start?

Regards,
Christian.

>>>>>> Andrey
>>>>>>
>>>>>>
>>>>>>> Regards,
>>>>>>> Christian.
>>>>>>>
>>>>>>> Am 04.01.22 um 11:49 schrieb Liu, Monk:
>>>>>>>> [AMD Official Use Only]
>>>>>>>>
>>>>>>>>>> See the FLR request from the hypervisor is just another source of signaling the need for a reset, similar to each job timeout on each queue. Otherwise you have a race condition between the hypervisor and the scheduler.
>>>>>>>> No it's not, FLR from hypervisor is just to notify guest the hw VF FLR is about to start or was already executed, but host will do FLR anyway without waiting for guest too long
>>>>>>>>
>>>>>>>>>> In other words I strongly think that the current SRIOV reset implementation is severely broken and what Andrey is doing is actually fixing it.
>>>>>>>> It makes the code to crash ... how could it be a fix ?
>>>>>>>>
>>>>>>>> I'm afraid the patch is NAK from me,  but it is welcome if the cleanup do not ruin the logic, Andry or jingwen can try it if needed.
>>>>>>>>
>>>>>>>> Thanks
>>>>>>>> -------------------------------------------------------------------
>>>>>>>> Monk Liu | Cloud GPU & Virtualization Solution | AMD
>>>>>>>> -------------------------------------------------------------------
>>>>>>>> we are hiring software manager for CVS core team
>>>>>>>> -------------------------------------------------------------------
>>>>>>>>
>>>>>>>> -----Original Message-----
>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>> Sent: Tuesday, January 4, 2022 6:19 PM
>>>>>>>> To: Chen, JingWen <JingWen.Chen2@amd.com>; Christian König <ckoenig.leichtzumerken@gmail.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Deng, Emily <Emily.Deng@amd.com>; Liu, Monk <Monk.Liu@amd.com>; dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org; Chen, Horace <Horace.Chen@amd.com>; Chen, JingWen <JingWen.Chen2@amd.com>
>>>>>>>> Cc: daniel@ffwll.ch
>>>>>>>> Subject: Re: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset protection for SRIOV
>>>>>>>>
>>>>>>>> Hi Jingwen,
>>>>>>>>
>>>>>>>> well what I mean is that we need to adjust the implementation in amdgpu to actually match the requirements.
>>>>>>>>
>>>>>>>> Could be that the reset sequence is questionable in general, but I doubt so at least for now.
>>>>>>>>
>>>>>>>> See the FLR request from the hypervisor is just another source of signaling the need for a reset, similar to each job timeout on each queue. Otherwise you have a race condition between the hypervisor and the scheduler.
>>>>>>>>
>>>>>>>> Properly setting in_gpu_reset is indeed mandatory, but should happen at a central place and not in the SRIOV specific code.
>>>>>>>>
>>>>>>>> In other words I strongly think that the current SRIOV reset implementation is severely broken and what Andrey is doing is actually fixing it.
>>>>>>>>
>>>>>>>> Regards,
>>>>>>>> Christian.
>>>>>>>>
>>>>>>>> Am 04.01.22 um 10:07 schrieb JingWen Chen:
>>>>>>>>> Hi Christian,
>>>>>>>>> I'm not sure what do you mean by "we need to change SRIOV not the driver".
>>>>>>>>>
>>>>>>>>> Do you mean we should change the reset sequence in SRIOV? This will be a huge change for our SRIOV solution.
>>>>>>>>>
>>>>>>>>>     From my point of view, we can directly use amdgpu_device_lock_adev
>>>>>>>>> and amdgpu_device_unlock_adev in flr_work instead of try_lock since no one will conflict with this thread with reset_domain introduced.
>>>>>>>>> But we do need the reset_sem and adev->in_gpu_reset to keep device untouched via user space.
>>>>>>>>>
>>>>>>>>> Best Regards,
>>>>>>>>> Jingwen Chen
>>>>>>>>>
>>>>>>>>> On 2022/1/3 下午6:17, Christian König wrote:
>>>>>>>>>> Please don't. This patch is vital to the cleanup of the reset procedure.
>>>>>>>>>>
>>>>>>>>>> If SRIOV doesn't work with that we need to change SRIOV and not the driver.
>>>>>>>>>>
>>>>>>>>>> Christian.
>>>>>>>>>>
>>>>>>>>>> Am 30.12.21 um 19:45 schrieb Andrey Grodzovsky:
>>>>>>>>>>> Sure, I guess i can drop this patch then.
>>>>>>>>>>>
>>>>>>>>>>> Andrey
>>>>>>>>>>>
>>>>>>>>>>> On 2021-12-24 4:57 a.m., JingWen Chen wrote:
>>>>>>>>>>>> I do agree with shaoyun, if the host find the gpu engine hangs first, and do the flr, guest side thread may not know this and still try to access HW(e.g. kfd is using a lot of amdgpu_in_reset and reset_sem to identify the reset status). And this may lead to very bad result.
>>>>>>>>>>>>
>>>>>>>>>>>> On 2021/12/24 下午4:58, Deng, Emily wrote:
>>>>>>>>>>>>> These patches look good to me. JingWen will pull these patches and do some basic TDR test on sriov environment, and give feedback.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>> From: Liu, Monk <Monk.Liu@amd.com>
>>>>>>>>>>>>>> Sent: Thursday, December 23, 2021 6:14 PM
>>>>>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; Grodzovsky,
>>>>>>>>>>>>>> Andrey <Andrey.Grodzovsky@amd.com>;
>>>>>>>>>>>>>> dri-devel@lists.freedesktop.org; amd- gfx@lists.freedesktop.org;
>>>>>>>>>>>>>> Chen, Horace <Horace.Chen@amd.com>; Chen, JingWen
>>>>>>>>>>>>>> <JingWen.Chen2@amd.com>; Deng, Emily <Emily.Deng@amd.com>
>>>>>>>>>>>>>> Cc: daniel@ffwll.ch
>>>>>>>>>>>>>> Subject: RE: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset
>>>>>>>>>>>>>> protection for SRIOV
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> [AMD Official Use Only]
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> @Chen, Horace @Chen, JingWen @Deng, Emily
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Please take a review on Andrey's patch
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Thanks
>>>>>>>>>>>>>> -----------------------------------------------------------------
>>>>>>>>>>>>>> -- Monk Liu | Cloud GPU & Virtualization Solution | AMD
>>>>>>>>>>>>>> -----------------------------------------------------------------
>>>>>>>>>>>>>> -- we are hiring software manager for CVS core team
>>>>>>>>>>>>>> -----------------------------------------------------------------
>>>>>>>>>>>>>> -- 
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>> Sent: Thursday, December 23, 2021 4:42 PM
>>>>>>>>>>>>>> To: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; dri-
>>>>>>>>>>>>>> devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>> Cc: daniel@ffwll.ch; Liu, Monk <Monk.Liu@amd.com>; Chen, Horace
>>>>>>>>>>>>>> <Horace.Chen@amd.com>
>>>>>>>>>>>>>> Subject: Re: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset
>>>>>>>>>>>>>> protection for SRIOV
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Am 22.12.21 um 23:14 schrieb Andrey Grodzovsky:
>>>>>>>>>>>>>>> Since now flr work is serialized against  GPU resets there is no
>>>>>>>>>>>>>>> need for this.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>>>>>>>>>>>>> Acked-by: Christian König <christian.koenig@amd.com>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>>        drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 11 -----------
>>>>>>>>>>>>>>>        drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 11 -----------
>>>>>>>>>>>>>>>        2 files changed, 22 deletions(-)
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>> index 487cd654b69e..7d59a66e3988 100644
>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>> @@ -248,15 +248,7 @@ static void xgpu_ai_mailbox_flr_work(struct
>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>            struct amdgpu_device *adev = container_of(virt, struct
>>>>>>>>>>>>>> amdgpu_device, virt);
>>>>>>>>>>>>>>>            int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> -    /* block amdgpu_gpu_recover till msg FLR COMPLETE received,
>>>>>>>>>>>>>>> -     * otherwise the mailbox msg will be ruined/reseted by
>>>>>>>>>>>>>>> -     * the VF FLR.
>>>>>>>>>>>>>>> -     */
>>>>>>>>>>>>>>> -    if (!down_write_trylock(&adev->reset_sem))
>>>>>>>>>>>>>>> -        return;
>>>>>>>>>>>>>>> -
>>>>>>>>>>>>>>> amdgpu_virt_fini_data_exchange(adev);
>>>>>>>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 1);
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>            xgpu_ai_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0,
>>>>>>>>>>>>>>> 0, 0);
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> @@ -269,9 +261,6 @@ static void xgpu_ai_mailbox_flr_work(struct
>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>            } while (timeout > 1);
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>        flr_done:
>>>>>>>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 0);
>>>>>>>>>>>>>>> -    up_write(&adev->reset_sem);
>>>>>>>>>>>>>>> -
>>>>>>>>>>>>>>>            /* Trigger recovery for world switch failure if no TDR
>>>>>>>>>>>>>>> */
>>>>>>>>>>>>>>>            if (amdgpu_device_should_recover_gpu(adev)
>>>>>>>>>>>>>>>                && (!amdgpu_device_has_job_running(adev) || diff
>>>>>>>>>>>>>>> --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>> index e3869067a31d..f82c066c8e8d 100644
>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>> @@ -277,15 +277,7 @@ static void xgpu_nv_mailbox_flr_work(struct
>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>            struct amdgpu_device *adev = container_of(virt, struct
>>>>>>>>>>>>>> amdgpu_device, virt);
>>>>>>>>>>>>>>>            int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> -    /* block amdgpu_gpu_recover till msg FLR COMPLETE received,
>>>>>>>>>>>>>>> -     * otherwise the mailbox msg will be ruined/reseted by
>>>>>>>>>>>>>>> -     * the VF FLR.
>>>>>>>>>>>>>>> -     */
>>>>>>>>>>>>>>> -    if (!down_write_trylock(&adev->reset_sem))
>>>>>>>>>>>>>>> -        return;
>>>>>>>>>>>>>>> -
>>>>>>>>>>>>>>> amdgpu_virt_fini_data_exchange(adev);
>>>>>>>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 1);
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>            xgpu_nv_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0,
>>>>>>>>>>>>>>> 0, 0);
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> @@ -298,9 +290,6 @@ static void xgpu_nv_mailbox_flr_work(struct
>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>            } while (timeout > 1);
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>        flr_done:
>>>>>>>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 0);
>>>>>>>>>>>>>>> -    up_write(&adev->reset_sem);
>>>>>>>>>>>>>>> -
>>>>>>>>>>>>>>>            /* Trigger recovery for world switch failure if no TDR
>>>>>>>>>>>>>>> */
>>>>>>>>>>>>>>>            if (amdgpu_device_should_recover_gpu(adev)
>>>>>>>>>>>>>>>                && (!amdgpu_device_has_job_running(adev) ||
Andrey Grodzovsky Jan. 6, 2022, 7:13 p.m. UTC | #24
On 2022-01-06 12:18 a.m., JingWen Chen wrote:
> On 2022/1/6 下午12:59, JingWen Chen wrote:
>> On 2022/1/6 上午2:24, Andrey Grodzovsky wrote:
>>> On 2022-01-05 2:59 a.m., Christian König wrote:
>>>> Am 05.01.22 um 08:34 schrieb JingWen Chen:
>>>>> On 2022/1/5 上午12:56, Andrey Grodzovsky wrote:
>>>>>> On 2022-01-04 6:36 a.m., Christian König wrote:
>>>>>>> Am 04.01.22 um 11:49 schrieb Liu, Monk:
>>>>>>>> [AMD Official Use Only]
>>>>>>>>
>>>>>>>>>> See the FLR request from the hypervisor is just another source of signaling the need for a reset, similar to each job timeout on each queue. Otherwise you have a race condition between the hypervisor and the scheduler.
>>>>>>>> No it's not, FLR from hypervisor is just to notify guest the hw VF FLR is about to start or was already executed, but host will do FLR anyway without waiting for guest too long
>>>>>>>>
>>>>>>> Then we have a major design issue in the SRIOV protocol and really need to question this.
>>>>>>>
>>>>>>> How do you want to prevent a race between the hypervisor resetting the hardware and the client trying the same because of a timeout?
>>>>>>>
>>>>>>> As far as I can see the procedure should be:
>>>>>>> 1. We detect that a reset is necessary, either because of a fault a timeout or signal from hypervisor.
>>>>>>> 2. For each of those potential reset sources a work item is send to the single workqueue.
>>>>>>> 3. One of those work items execute first and prepares the reset.
>>>>>>> 4. We either do the reset our self or notify the hypervisor that we are ready for the reset.
>>>>>>> 5. Cleanup after the reset, eventually resubmit jobs etc..
>>>>>>> 6. Cancel work items which might have been scheduled from other reset sources.
>>>>>>>
>>>>>>> It does make sense that the hypervisor resets the hardware without waiting for the clients for too long, but if we don't follow this general steps we will always have a race between the different components.
>>>>>> Monk, just to add to this - if indeed as you say that 'FLR from hypervisor is just to notify guest the hw VF FLR is about to start or was already executed, but host will do FLR anyway without waiting for guest too long'
>>>>>> and there is no strict waiting from the hypervisor for IDH_READY_TO_RESET to be recived from guest before starting the reset then setting in_gpu_reset and locking reset_sem from guest side is not really full proof
>>>>>> protection from MMIO accesses by the guest - it only truly helps if hypervisor waits for that message before initiation of HW reset.
>>>>>>
>>>>> Hi Andrey, this cannot be done. If somehow guest kernel hangs and never has the chance to send the response back, then other VFs will have to wait it reset. All the vfs will hang in this case. Or sometimes the mailbox has some delay and other VFs will also wait. The user of other VFs will be affected in this case.
>>>> Yeah, agree completely with JingWen. The hypervisor is the one in charge here, not the guest.
>>>>
>>>> What the hypervisor should do (and it already seems to be designed that way) is to send the guest a message that a reset is about to happen and give it some time to response appropriately.
>>>>
>>>> The guest on the other hand then tells the hypervisor that all processing has stopped and it is ready to restart. If that doesn't happen in time the hypervisor should eliminate the guest probably trigger even more severe consequences, e.g. restart the whole VM etc...
>>>>
>>>> Christian.
>>> So what's the end conclusion here regarding dropping this particular patch ? Seems to me we still need to drop it to prevent driver's MMIO access
>>> to the GPU during reset from various places in the code.
>>>
>>> Andrey
>>>
>> Hi Andrey & Christian,
>>
>> I have ported your patch(drop the reset_sem and in_gpu_reset in flr work) and run some tests. If a engine hang during an OCL benchmark(using kfd), we can see the logs below:


Did you port the entire patchset or just 'drm/amd/virt: Drop concurrent 
GPU reset protection for SRIOV' ?


>>
>> [  397.190727] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>> [  397.301496] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>> [  397.406601] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>> [  397.532343] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>> [  397.642251] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>> [  397.746634] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>> [  397.850761] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>> [  397.960544] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>> [  398.065218] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>> [  398.182173] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>> [  398.288264] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>> [  398.394712] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>> [  428.400582] [drm] clean up the vf2pf work item
>> [  428.500528] amdgpu 0000:00:07.0: amdgpu: [gfxhub] page fault (src_id:0 ring:153 vmid:8 pasid:32771, for process xgemmStandalone pid 3557 thread xgemmStandalone pid 3557)
>> [  428.527576] amdgpu 0000:00:07.0: amdgpu:   in page starting at address 0x00007fc991c04000 from client 0x1b (UTCL2)
>> [  437.531392] amdgpu: qcm fence wait loop timeout expired
>> [  437.535738] amdgpu: The cp might be in an unrecoverable state due to an unsuccessful queues preemption
>> [  437.537191] amdgpu 0000:00:07.0: amdgpu: GPU reset begin!
>> [  438.087443] [drm] RE-INIT-early: nv_common succeeded
>>
>> As kfd relies on these to check if GPU is in reset, dropping it will hit some page fault and fence error very easily.
> To be clear, we can also hit the page fault with the reset_sem and in_gpu_reset, just not as easily as dropping them.

Are you saying that the entire patch-set with and without patch 
'drm/amd/virt: Drop concurrent GPU reset protection for SRIOV
is casing this GPUVM page fault during testing engine hang while running 
benchmark ?

Do you never observe this page fault when running this test with 
original tree without the new patch-set ?

Andrey


>>>>>> Andrey
>>>>>>
>>>>>>
>>>>>>> Regards,
>>>>>>> Christian.
>>>>>>>
>>>>>>> Am 04.01.22 um 11:49 schrieb Liu, Monk:
>>>>>>>> [AMD Official Use Only]
>>>>>>>>
>>>>>>>>>> See the FLR request from the hypervisor is just another source of signaling the need for a reset, similar to each job timeout on each queue. Otherwise you have a race condition between the hypervisor and the scheduler.
>>>>>>>> No it's not, FLR from hypervisor is just to notify guest the hw VF FLR is about to start or was already executed, but host will do FLR anyway without waiting for guest too long
>>>>>>>>
>>>>>>>>>> In other words I strongly think that the current SRIOV reset implementation is severely broken and what Andrey is doing is actually fixing it.
>>>>>>>> It makes the code to crash ... how could it be a fix ?
>>>>>>>>
>>>>>>>> I'm afraid the patch is NAK from me,  but it is welcome if the cleanup do not ruin the logic, Andry or jingwen can try it if needed.
>>>>>>>>
>>>>>>>> Thanks
>>>>>>>> -------------------------------------------------------------------
>>>>>>>> Monk Liu | Cloud GPU & Virtualization Solution | AMD
>>>>>>>> -------------------------------------------------------------------
>>>>>>>> we are hiring software manager for CVS core team
>>>>>>>> -------------------------------------------------------------------
>>>>>>>>
>>>>>>>> -----Original Message-----
>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>> Sent: Tuesday, January 4, 2022 6:19 PM
>>>>>>>> To: Chen, JingWen <JingWen.Chen2@amd.com>; Christian König <ckoenig.leichtzumerken@gmail.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Deng, Emily <Emily.Deng@amd.com>; Liu, Monk <Monk.Liu@amd.com>; dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org; Chen, Horace <Horace.Chen@amd.com>; Chen, JingWen <JingWen.Chen2@amd.com>
>>>>>>>> Cc: daniel@ffwll.ch
>>>>>>>> Subject: Re: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset protection for SRIOV
>>>>>>>>
>>>>>>>> Hi Jingwen,
>>>>>>>>
>>>>>>>> well what I mean is that we need to adjust the implementation in amdgpu to actually match the requirements.
>>>>>>>>
>>>>>>>> Could be that the reset sequence is questionable in general, but I doubt so at least for now.
>>>>>>>>
>>>>>>>> See the FLR request from the hypervisor is just another source of signaling the need for a reset, similar to each job timeout on each queue. Otherwise you have a race condition between the hypervisor and the scheduler.
>>>>>>>>
>>>>>>>> Properly setting in_gpu_reset is indeed mandatory, but should happen at a central place and not in the SRIOV specific code.
>>>>>>>>
>>>>>>>> In other words I strongly think that the current SRIOV reset implementation is severely broken and what Andrey is doing is actually fixing it.
>>>>>>>>
>>>>>>>> Regards,
>>>>>>>> Christian.
>>>>>>>>
>>>>>>>> Am 04.01.22 um 10:07 schrieb JingWen Chen:
>>>>>>>>> Hi Christian,
>>>>>>>>> I'm not sure what do you mean by "we need to change SRIOV not the driver".
>>>>>>>>>
>>>>>>>>> Do you mean we should change the reset sequence in SRIOV? This will be a huge change for our SRIOV solution.
>>>>>>>>>
>>>>>>>>>     From my point of view, we can directly use amdgpu_device_lock_adev
>>>>>>>>> and amdgpu_device_unlock_adev in flr_work instead of try_lock since no one will conflict with this thread with reset_domain introduced.
>>>>>>>>> But we do need the reset_sem and adev->in_gpu_reset to keep device untouched via user space.
>>>>>>>>>
>>>>>>>>> Best Regards,
>>>>>>>>> Jingwen Chen
>>>>>>>>>
>>>>>>>>> On 2022/1/3 下午6:17, Christian König wrote:
>>>>>>>>>> Please don't. This patch is vital to the cleanup of the reset procedure.
>>>>>>>>>>
>>>>>>>>>> If SRIOV doesn't work with that we need to change SRIOV and not the driver.
>>>>>>>>>>
>>>>>>>>>> Christian.
>>>>>>>>>>
>>>>>>>>>> Am 30.12.21 um 19:45 schrieb Andrey Grodzovsky:
>>>>>>>>>>> Sure, I guess i can drop this patch then.
>>>>>>>>>>>
>>>>>>>>>>> Andrey
>>>>>>>>>>>
>>>>>>>>>>> On 2021-12-24 4:57 a.m., JingWen Chen wrote:
>>>>>>>>>>>> I do agree with shaoyun, if the host find the gpu engine hangs first, and do the flr, guest side thread may not know this and still try to access HW(e.g. kfd is using a lot of amdgpu_in_reset and reset_sem to identify the reset status). And this may lead to very bad result.
>>>>>>>>>>>>
>>>>>>>>>>>> On 2021/12/24 下午4:58, Deng, Emily wrote:
>>>>>>>>>>>>> These patches look good to me. JingWen will pull these patches and do some basic TDR test on sriov environment, and give feedback.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>> From: Liu, Monk <Monk.Liu@amd.com>
>>>>>>>>>>>>>> Sent: Thursday, December 23, 2021 6:14 PM
>>>>>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; Grodzovsky,
>>>>>>>>>>>>>> Andrey <Andrey.Grodzovsky@amd.com>;
>>>>>>>>>>>>>> dri-devel@lists.freedesktop.org; amd- gfx@lists.freedesktop.org;
>>>>>>>>>>>>>> Chen, Horace <Horace.Chen@amd.com>; Chen, JingWen
>>>>>>>>>>>>>> <JingWen.Chen2@amd.com>; Deng, Emily <Emily.Deng@amd.com>
>>>>>>>>>>>>>> Cc: daniel@ffwll.ch
>>>>>>>>>>>>>> Subject: RE: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset
>>>>>>>>>>>>>> protection for SRIOV
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> [AMD Official Use Only]
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> @Chen, Horace @Chen, JingWen @Deng, Emily
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Please take a review on Andrey's patch
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Thanks
>>>>>>>>>>>>>> -----------------------------------------------------------------
>>>>>>>>>>>>>> -- Monk Liu | Cloud GPU & Virtualization Solution | AMD
>>>>>>>>>>>>>> -----------------------------------------------------------------
>>>>>>>>>>>>>> -- we are hiring software manager for CVS core team
>>>>>>>>>>>>>> -----------------------------------------------------------------
>>>>>>>>>>>>>> -- 
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>> Sent: Thursday, December 23, 2021 4:42 PM
>>>>>>>>>>>>>> To: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; dri-
>>>>>>>>>>>>>> devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>> Cc: daniel@ffwll.ch; Liu, Monk <Monk.Liu@amd.com>; Chen, Horace
>>>>>>>>>>>>>> <Horace.Chen@amd.com>
>>>>>>>>>>>>>> Subject: Re: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset
>>>>>>>>>>>>>> protection for SRIOV
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Am 22.12.21 um 23:14 schrieb Andrey Grodzovsky:
>>>>>>>>>>>>>>> Since now flr work is serialized against  GPU resets there is no
>>>>>>>>>>>>>>> need for this.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>>>>>>>>>>>>> Acked-by: Christian König <christian.koenig@amd.com>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>>        drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 11 -----------
>>>>>>>>>>>>>>>        drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 11 -----------
>>>>>>>>>>>>>>>        2 files changed, 22 deletions(-)
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>> index 487cd654b69e..7d59a66e3988 100644
>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>> @@ -248,15 +248,7 @@ static void xgpu_ai_mailbox_flr_work(struct
>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>            struct amdgpu_device *adev = container_of(virt, struct
>>>>>>>>>>>>>> amdgpu_device, virt);
>>>>>>>>>>>>>>>            int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> -    /* block amdgpu_gpu_recover till msg FLR COMPLETE received,
>>>>>>>>>>>>>>> -     * otherwise the mailbox msg will be ruined/reseted by
>>>>>>>>>>>>>>> -     * the VF FLR.
>>>>>>>>>>>>>>> -     */
>>>>>>>>>>>>>>> -    if (!down_write_trylock(&adev->reset_sem))
>>>>>>>>>>>>>>> -        return;
>>>>>>>>>>>>>>> -
>>>>>>>>>>>>>>> amdgpu_virt_fini_data_exchange(adev);
>>>>>>>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 1);
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>            xgpu_ai_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0,
>>>>>>>>>>>>>>> 0, 0);
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> @@ -269,9 +261,6 @@ static void xgpu_ai_mailbox_flr_work(struct
>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>            } while (timeout > 1);
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>        flr_done:
>>>>>>>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 0);
>>>>>>>>>>>>>>> -    up_write(&adev->reset_sem);
>>>>>>>>>>>>>>> -
>>>>>>>>>>>>>>>            /* Trigger recovery for world switch failure if no TDR
>>>>>>>>>>>>>>> */
>>>>>>>>>>>>>>>            if (amdgpu_device_should_recover_gpu(adev)
>>>>>>>>>>>>>>>                && (!amdgpu_device_has_job_running(adev) || diff
>>>>>>>>>>>>>>> --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>> index e3869067a31d..f82c066c8e8d 100644
>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>> @@ -277,15 +277,7 @@ static void xgpu_nv_mailbox_flr_work(struct
>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>            struct amdgpu_device *adev = container_of(virt, struct
>>>>>>>>>>>>>> amdgpu_device, virt);
>>>>>>>>>>>>>>>            int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> -    /* block amdgpu_gpu_recover till msg FLR COMPLETE received,
>>>>>>>>>>>>>>> -     * otherwise the mailbox msg will be ruined/reseted by
>>>>>>>>>>>>>>> -     * the VF FLR.
>>>>>>>>>>>>>>> -     */
>>>>>>>>>>>>>>> -    if (!down_write_trylock(&adev->reset_sem))
>>>>>>>>>>>>>>> -        return;
>>>>>>>>>>>>>>> -
>>>>>>>>>>>>>>> amdgpu_virt_fini_data_exchange(adev);
>>>>>>>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 1);
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>            xgpu_nv_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0,
>>>>>>>>>>>>>>> 0, 0);
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> @@ -298,9 +290,6 @@ static void xgpu_nv_mailbox_flr_work(struct
>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>            } while (timeout > 1);
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>        flr_done:
>>>>>>>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 0);
>>>>>>>>>>>>>>> -    up_write(&adev->reset_sem);
>>>>>>>>>>>>>>> -
>>>>>>>>>>>>>>>            /* Trigger recovery for world switch failure if no TDR
>>>>>>>>>>>>>>> */
>>>>>>>>>>>>>>>            if (amdgpu_device_should_recover_gpu(adev)
>>>>>>>>>>>>>>>                && (!amdgpu_device_has_job_running(adev) ||
JingWen Chen Jan. 7, 2022, 3:57 a.m. UTC | #25
On 2022/1/7 上午3:13, Andrey Grodzovsky wrote:
>
> On 2022-01-06 12:18 a.m., JingWen Chen wrote:
>> On 2022/1/6 下午12:59, JingWen Chen wrote:
>>> On 2022/1/6 上午2:24, Andrey Grodzovsky wrote:
>>>> On 2022-01-05 2:59 a.m., Christian König wrote:
>>>>> Am 05.01.22 um 08:34 schrieb JingWen Chen:
>>>>>> On 2022/1/5 上午12:56, Andrey Grodzovsky wrote:
>>>>>>> On 2022-01-04 6:36 a.m., Christian König wrote:
>>>>>>>> Am 04.01.22 um 11:49 schrieb Liu, Monk:
>>>>>>>>> [AMD Official Use Only]
>>>>>>>>>
>>>>>>>>>>> See the FLR request from the hypervisor is just another source of signaling the need for a reset, similar to each job timeout on each queue. Otherwise you have a race condition between the hypervisor and the scheduler.
>>>>>>>>> No it's not, FLR from hypervisor is just to notify guest the hw VF FLR is about to start or was already executed, but host will do FLR anyway without waiting for guest too long
>>>>>>>>>
>>>>>>>> Then we have a major design issue in the SRIOV protocol and really need to question this.
>>>>>>>>
>>>>>>>> How do you want to prevent a race between the hypervisor resetting the hardware and the client trying the same because of a timeout?
>>>>>>>>
>>>>>>>> As far as I can see the procedure should be:
>>>>>>>> 1. We detect that a reset is necessary, either because of a fault a timeout or signal from hypervisor.
>>>>>>>> 2. For each of those potential reset sources a work item is send to the single workqueue.
>>>>>>>> 3. One of those work items execute first and prepares the reset.
>>>>>>>> 4. We either do the reset our self or notify the hypervisor that we are ready for the reset.
>>>>>>>> 5. Cleanup after the reset, eventually resubmit jobs etc..
>>>>>>>> 6. Cancel work items which might have been scheduled from other reset sources.
>>>>>>>>
>>>>>>>> It does make sense that the hypervisor resets the hardware without waiting for the clients for too long, but if we don't follow this general steps we will always have a race between the different components.
>>>>>>> Monk, just to add to this - if indeed as you say that 'FLR from hypervisor is just to notify guest the hw VF FLR is about to start or was already executed, but host will do FLR anyway without waiting for guest too long'
>>>>>>> and there is no strict waiting from the hypervisor for IDH_READY_TO_RESET to be recived from guest before starting the reset then setting in_gpu_reset and locking reset_sem from guest side is not really full proof
>>>>>>> protection from MMIO accesses by the guest - it only truly helps if hypervisor waits for that message before initiation of HW reset.
>>>>>>>
>>>>>> Hi Andrey, this cannot be done. If somehow guest kernel hangs and never has the chance to send the response back, then other VFs will have to wait it reset. All the vfs will hang in this case. Or sometimes the mailbox has some delay and other VFs will also wait. The user of other VFs will be affected in this case.
>>>>> Yeah, agree completely with JingWen. The hypervisor is the one in charge here, not the guest.
>>>>>
>>>>> What the hypervisor should do (and it already seems to be designed that way) is to send the guest a message that a reset is about to happen and give it some time to response appropriately.
>>>>>
>>>>> The guest on the other hand then tells the hypervisor that all processing has stopped and it is ready to restart. If that doesn't happen in time the hypervisor should eliminate the guest probably trigger even more severe consequences, e.g. restart the whole VM etc...
>>>>>
>>>>> Christian.
>>>> So what's the end conclusion here regarding dropping this particular patch ? Seems to me we still need to drop it to prevent driver's MMIO access
>>>> to the GPU during reset from various places in the code.
>>>>
>>>> Andrey
>>>>
>>> Hi Andrey & Christian,
>>>
>>> I have ported your patch(drop the reset_sem and in_gpu_reset in flr work) and run some tests. If a engine hang during an OCL benchmark(using kfd), we can see the logs below:
>
>
> Did you port the entire patchset or just 'drm/amd/virt: Drop concurrent GPU reset protection for SRIOV' ?
>
>
I ported the entire patchset
>>>
>>> [  397.190727] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>>> [  397.301496] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>>> [  397.406601] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>>> [  397.532343] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>>> [  397.642251] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>>> [  397.746634] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>>> [  397.850761] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>>> [  397.960544] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>>> [  398.065218] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>>> [  398.182173] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>>> [  398.288264] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>>> [  398.394712] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>>> [  428.400582] [drm] clean up the vf2pf work item
>>> [  428.500528] amdgpu 0000:00:07.0: amdgpu: [gfxhub] page fault (src_id:0 ring:153 vmid:8 pasid:32771, for process xgemmStandalone pid 3557 thread xgemmStandalone pid 3557)
>>> [  428.527576] amdgpu 0000:00:07.0: amdgpu:   in page starting at address 0x00007fc991c04000 from client 0x1b (UTCL2)
>>> [  437.531392] amdgpu: qcm fence wait loop timeout expired
>>> [  437.535738] amdgpu: The cp might be in an unrecoverable state due to an unsuccessful queues preemption
>>> [  437.537191] amdgpu 0000:00:07.0: amdgpu: GPU reset begin!
>>> [  438.087443] [drm] RE-INIT-early: nv_common succeeded
>>>
>>> As kfd relies on these to check if GPU is in reset, dropping it will hit some page fault and fence error very easily.
>> To be clear, we can also hit the page fault with the reset_sem and in_gpu_reset, just not as easily as dropping them.
>
> Are you saying that the entire patch-set with and without patch 'drm/amd/virt: Drop concurrent GPU reset protection for SRIOV
> is casing this GPUVM page fault during testing engine hang while running benchmark ?
>
> Do you never observe this page fault when running this test with original tree without the new patch-set ?
>
> Andrey
>
I think this page fault issue can be seen even on the original tree. It's just drop the concurrent GPU reset will hit it more easily.

We may need a new way to protect the reset in SRIOV.

>
>>>>>>> Andrey
>>>>>>>
>>>>>>>
>>>>>>>> Regards,
>>>>>>>> Christian.
>>>>>>>>
>>>>>>>> Am 04.01.22 um 11:49 schrieb Liu, Monk:
>>>>>>>>> [AMD Official Use Only]
>>>>>>>>>
>>>>>>>>>>> See the FLR request from the hypervisor is just another source of signaling the need for a reset, similar to each job timeout on each queue. Otherwise you have a race condition between the hypervisor and the scheduler.
>>>>>>>>> No it's not, FLR from hypervisor is just to notify guest the hw VF FLR is about to start or was already executed, but host will do FLR anyway without waiting for guest too long
>>>>>>>>>
>>>>>>>>>>> In other words I strongly think that the current SRIOV reset implementation is severely broken and what Andrey is doing is actually fixing it.
>>>>>>>>> It makes the code to crash ... how could it be a fix ?
>>>>>>>>>
>>>>>>>>> I'm afraid the patch is NAK from me,  but it is welcome if the cleanup do not ruin the logic, Andry or jingwen can try it if needed.
>>>>>>>>>
>>>>>>>>> Thanks
>>>>>>>>> -------------------------------------------------------------------
>>>>>>>>> Monk Liu | Cloud GPU & Virtualization Solution | AMD
>>>>>>>>> -------------------------------------------------------------------
>>>>>>>>> we are hiring software manager for CVS core team
>>>>>>>>> -------------------------------------------------------------------
>>>>>>>>>
>>>>>>>>> -----Original Message-----
>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>> Sent: Tuesday, January 4, 2022 6:19 PM
>>>>>>>>> To: Chen, JingWen <JingWen.Chen2@amd.com>; Christian König <ckoenig.leichtzumerken@gmail.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Deng, Emily <Emily.Deng@amd.com>; Liu, Monk <Monk.Liu@amd.com>; dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org; Chen, Horace <Horace.Chen@amd.com>; Chen, JingWen <JingWen.Chen2@amd.com>
>>>>>>>>> Cc: daniel@ffwll.ch
>>>>>>>>> Subject: Re: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset protection for SRIOV
>>>>>>>>>
>>>>>>>>> Hi Jingwen,
>>>>>>>>>
>>>>>>>>> well what I mean is that we need to adjust the implementation in amdgpu to actually match the requirements.
>>>>>>>>>
>>>>>>>>> Could be that the reset sequence is questionable in general, but I doubt so at least for now.
>>>>>>>>>
>>>>>>>>> See the FLR request from the hypervisor is just another source of signaling the need for a reset, similar to each job timeout on each queue. Otherwise you have a race condition between the hypervisor and the scheduler.
>>>>>>>>>
>>>>>>>>> Properly setting in_gpu_reset is indeed mandatory, but should happen at a central place and not in the SRIOV specific code.
>>>>>>>>>
>>>>>>>>> In other words I strongly think that the current SRIOV reset implementation is severely broken and what Andrey is doing is actually fixing it.
>>>>>>>>>
>>>>>>>>> Regards,
>>>>>>>>> Christian.
>>>>>>>>>
>>>>>>>>> Am 04.01.22 um 10:07 schrieb JingWen Chen:
>>>>>>>>>> Hi Christian,
>>>>>>>>>> I'm not sure what do you mean by "we need to change SRIOV not the driver".
>>>>>>>>>>
>>>>>>>>>> Do you mean we should change the reset sequence in SRIOV? This will be a huge change for our SRIOV solution.
>>>>>>>>>>
>>>>>>>>>>     From my point of view, we can directly use amdgpu_device_lock_adev
>>>>>>>>>> and amdgpu_device_unlock_adev in flr_work instead of try_lock since no one will conflict with this thread with reset_domain introduced.
>>>>>>>>>> But we do need the reset_sem and adev->in_gpu_reset to keep device untouched via user space.
>>>>>>>>>>
>>>>>>>>>> Best Regards,
>>>>>>>>>> Jingwen Chen
>>>>>>>>>>
>>>>>>>>>> On 2022/1/3 下午6:17, Christian König wrote:
>>>>>>>>>>> Please don't. This patch is vital to the cleanup of the reset procedure.
>>>>>>>>>>>
>>>>>>>>>>> If SRIOV doesn't work with that we need to change SRIOV and not the driver.
>>>>>>>>>>>
>>>>>>>>>>> Christian.
>>>>>>>>>>>
>>>>>>>>>>> Am 30.12.21 um 19:45 schrieb Andrey Grodzovsky:
>>>>>>>>>>>> Sure, I guess i can drop this patch then.
>>>>>>>>>>>>
>>>>>>>>>>>> Andrey
>>>>>>>>>>>>
>>>>>>>>>>>> On 2021-12-24 4:57 a.m., JingWen Chen wrote:
>>>>>>>>>>>>> I do agree with shaoyun, if the host find the gpu engine hangs first, and do the flr, guest side thread may not know this and still try to access HW(e.g. kfd is using a lot of amdgpu_in_reset and reset_sem to identify the reset status). And this may lead to very bad result.
>>>>>>>>>>>>>
>>>>>>>>>>>>> On 2021/12/24 下午4:58, Deng, Emily wrote:
>>>>>>>>>>>>>> These patches look good to me. JingWen will pull these patches and do some basic TDR test on sriov environment, and give feedback.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>> From: Liu, Monk <Monk.Liu@amd.com>
>>>>>>>>>>>>>>> Sent: Thursday, December 23, 2021 6:14 PM
>>>>>>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; Grodzovsky,
>>>>>>>>>>>>>>> Andrey <Andrey.Grodzovsky@amd.com>;
>>>>>>>>>>>>>>> dri-devel@lists.freedesktop.org; amd- gfx@lists.freedesktop.org;
>>>>>>>>>>>>>>> Chen, Horace <Horace.Chen@amd.com>; Chen, JingWen
>>>>>>>>>>>>>>> <JingWen.Chen2@amd.com>; Deng, Emily <Emily.Deng@amd.com>
>>>>>>>>>>>>>>> Cc: daniel@ffwll.ch
>>>>>>>>>>>>>>> Subject: RE: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset
>>>>>>>>>>>>>>> protection for SRIOV
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> [AMD Official Use Only]
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> @Chen, Horace @Chen, JingWen @Deng, Emily
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Please take a review on Andrey's patch
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Thanks
>>>>>>>>>>>>>>> -----------------------------------------------------------------
>>>>>>>>>>>>>>> -- Monk Liu | Cloud GPU & Virtualization Solution | AMD
>>>>>>>>>>>>>>> -----------------------------------------------------------------
>>>>>>>>>>>>>>> -- we are hiring software manager for CVS core team
>>>>>>>>>>>>>>> -----------------------------------------------------------------
>>>>>>>>>>>>>>> -- 
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>>> Sent: Thursday, December 23, 2021 4:42 PM
>>>>>>>>>>>>>>> To: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; dri-
>>>>>>>>>>>>>>> devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>>> Cc: daniel@ffwll.ch; Liu, Monk <Monk.Liu@amd.com>; Chen, Horace
>>>>>>>>>>>>>>> <Horace.Chen@amd.com>
>>>>>>>>>>>>>>> Subject: Re: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset
>>>>>>>>>>>>>>> protection for SRIOV
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Am 22.12.21 um 23:14 schrieb Andrey Grodzovsky:
>>>>>>>>>>>>>>>> Since now flr work is serialized against  GPU resets there is no
>>>>>>>>>>>>>>>> need for this.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>>>>>>>>>>>>>> Acked-by: Christian König <christian.koenig@amd.com>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>>>        drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 11 -----------
>>>>>>>>>>>>>>>>        drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 11 -----------
>>>>>>>>>>>>>>>>        2 files changed, 22 deletions(-)
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>> index 487cd654b69e..7d59a66e3988 100644
>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>> @@ -248,15 +248,7 @@ static void xgpu_ai_mailbox_flr_work(struct
>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>            struct amdgpu_device *adev = container_of(virt, struct
>>>>>>>>>>>>>>> amdgpu_device, virt);
>>>>>>>>>>>>>>>>            int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> -    /* block amdgpu_gpu_recover till msg FLR COMPLETE received,
>>>>>>>>>>>>>>>> -     * otherwise the mailbox msg will be ruined/reseted by
>>>>>>>>>>>>>>>> -     * the VF FLR.
>>>>>>>>>>>>>>>> -     */
>>>>>>>>>>>>>>>> -    if (!down_write_trylock(&adev->reset_sem))
>>>>>>>>>>>>>>>> -        return;
>>>>>>>>>>>>>>>> -
>>>>>>>>>>>>>>>> amdgpu_virt_fini_data_exchange(adev);
>>>>>>>>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 1);
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>            xgpu_ai_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0,
>>>>>>>>>>>>>>>> 0, 0);
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> @@ -269,9 +261,6 @@ static void xgpu_ai_mailbox_flr_work(struct
>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>            } while (timeout > 1);
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>        flr_done:
>>>>>>>>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 0);
>>>>>>>>>>>>>>>> -    up_write(&adev->reset_sem);
>>>>>>>>>>>>>>>> -
>>>>>>>>>>>>>>>>            /* Trigger recovery for world switch failure if no TDR
>>>>>>>>>>>>>>>> */
>>>>>>>>>>>>>>>>            if (amdgpu_device_should_recover_gpu(adev)
>>>>>>>>>>>>>>>>                && (!amdgpu_device_has_job_running(adev) || diff
>>>>>>>>>>>>>>>> --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>> index e3869067a31d..f82c066c8e8d 100644
>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>> @@ -277,15 +277,7 @@ static void xgpu_nv_mailbox_flr_work(struct
>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>            struct amdgpu_device *adev = container_of(virt, struct
>>>>>>>>>>>>>>> amdgpu_device, virt);
>>>>>>>>>>>>>>>>            int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> -    /* block amdgpu_gpu_recover till msg FLR COMPLETE received,
>>>>>>>>>>>>>>>> -     * otherwise the mailbox msg will be ruined/reseted by
>>>>>>>>>>>>>>>> -     * the VF FLR.
>>>>>>>>>>>>>>>> -     */
>>>>>>>>>>>>>>>> -    if (!down_write_trylock(&adev->reset_sem))
>>>>>>>>>>>>>>>> -        return;
>>>>>>>>>>>>>>>> -
>>>>>>>>>>>>>>>> amdgpu_virt_fini_data_exchange(adev);
>>>>>>>>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 1);
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>            xgpu_nv_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0,
>>>>>>>>>>>>>>>> 0, 0);
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> @@ -298,9 +290,6 @@ static void xgpu_nv_mailbox_flr_work(struct
>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>            } while (timeout > 1);
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>        flr_done:
>>>>>>>>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 0);
>>>>>>>>>>>>>>>> -    up_write(&adev->reset_sem);
>>>>>>>>>>>>>>>> -
>>>>>>>>>>>>>>>>            /* Trigger recovery for world switch failure if no TDR
>>>>>>>>>>>>>>>> */
>>>>>>>>>>>>>>>>            if (amdgpu_device_should_recover_gpu(adev)
>>>>>>>>>>>>>>>>                && (!amdgpu_device_has_job_running(adev) ||
JingWen Chen Jan. 7, 2022, 5:46 a.m. UTC | #26
On 2022/1/7 上午11:57, JingWen Chen wrote:
> On 2022/1/7 上午3:13, Andrey Grodzovsky wrote:
>> On 2022-01-06 12:18 a.m., JingWen Chen wrote:
>>> On 2022/1/6 下午12:59, JingWen Chen wrote:
>>>> On 2022/1/6 上午2:24, Andrey Grodzovsky wrote:
>>>>> On 2022-01-05 2:59 a.m., Christian König wrote:
>>>>>> Am 05.01.22 um 08:34 schrieb JingWen Chen:
>>>>>>> On 2022/1/5 上午12:56, Andrey Grodzovsky wrote:
>>>>>>>> On 2022-01-04 6:36 a.m., Christian König wrote:
>>>>>>>>> Am 04.01.22 um 11:49 schrieb Liu, Monk:
>>>>>>>>>> [AMD Official Use Only]
>>>>>>>>>>
>>>>>>>>>>>> See the FLR request from the hypervisor is just another source of signaling the need for a reset, similar to each job timeout on each queue. Otherwise you have a race condition between the hypervisor and the scheduler.
>>>>>>>>>> No it's not, FLR from hypervisor is just to notify guest the hw VF FLR is about to start or was already executed, but host will do FLR anyway without waiting for guest too long
>>>>>>>>>>
>>>>>>>>> Then we have a major design issue in the SRIOV protocol and really need to question this.
>>>>>>>>>
>>>>>>>>> How do you want to prevent a race between the hypervisor resetting the hardware and the client trying the same because of a timeout?
>>>>>>>>>
>>>>>>>>> As far as I can see the procedure should be:
>>>>>>>>> 1. We detect that a reset is necessary, either because of a fault a timeout or signal from hypervisor.
>>>>>>>>> 2. For each of those potential reset sources a work item is send to the single workqueue.
>>>>>>>>> 3. One of those work items execute first and prepares the reset.
>>>>>>>>> 4. We either do the reset our self or notify the hypervisor that we are ready for the reset.
>>>>>>>>> 5. Cleanup after the reset, eventually resubmit jobs etc..
>>>>>>>>> 6. Cancel work items which might have been scheduled from other reset sources.
>>>>>>>>>
>>>>>>>>> It does make sense that the hypervisor resets the hardware without waiting for the clients for too long, but if we don't follow this general steps we will always have a race between the different components.
>>>>>>>> Monk, just to add to this - if indeed as you say that 'FLR from hypervisor is just to notify guest the hw VF FLR is about to start or was already executed, but host will do FLR anyway without waiting for guest too long'
>>>>>>>> and there is no strict waiting from the hypervisor for IDH_READY_TO_RESET to be recived from guest before starting the reset then setting in_gpu_reset and locking reset_sem from guest side is not really full proof
>>>>>>>> protection from MMIO accesses by the guest - it only truly helps if hypervisor waits for that message before initiation of HW reset.
>>>>>>>>
>>>>>>> Hi Andrey, this cannot be done. If somehow guest kernel hangs and never has the chance to send the response back, then other VFs will have to wait it reset. All the vfs will hang in this case. Or sometimes the mailbox has some delay and other VFs will also wait. The user of other VFs will be affected in this case.
>>>>>> Yeah, agree completely with JingWen. The hypervisor is the one in charge here, not the guest.
>>>>>>
>>>>>> What the hypervisor should do (and it already seems to be designed that way) is to send the guest a message that a reset is about to happen and give it some time to response appropriately.
>>>>>>
>>>>>> The guest on the other hand then tells the hypervisor that all processing has stopped and it is ready to restart. If that doesn't happen in time the hypervisor should eliminate the guest probably trigger even more severe consequences, e.g. restart the whole VM etc...
>>>>>>
>>>>>> Christian.
>>>>> So what's the end conclusion here regarding dropping this particular patch ? Seems to me we still need to drop it to prevent driver's MMIO access
>>>>> to the GPU during reset from various places in the code.
>>>>>
>>>>> Andrey
>>>>>
>>>> Hi Andrey & Christian,
>>>>
>>>> I have ported your patch(drop the reset_sem and in_gpu_reset in flr work) and run some tests. If a engine hang during an OCL benchmark(using kfd), we can see the logs below:
>>
>> Did you port the entire patchset or just 'drm/amd/virt: Drop concurrent GPU reset protection for SRIOV' ?
>>
>>
> I ported the entire patchset
>>>> [  397.190727] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>>>> [  397.301496] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>>>> [  397.406601] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>>>> [  397.532343] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>>>> [  397.642251] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>>>> [  397.746634] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>>>> [  397.850761] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>>>> [  397.960544] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>>>> [  398.065218] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>>>> [  398.182173] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>>>> [  398.288264] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>>>> [  398.394712] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>>>> [  428.400582] [drm] clean up the vf2pf work item
>>>> [  428.500528] amdgpu 0000:00:07.0: amdgpu: [gfxhub] page fault (src_id:0 ring:153 vmid:8 pasid:32771, for process xgemmStandalone pid 3557 thread xgemmStandalone pid 3557)
>>>> [  428.527576] amdgpu 0000:00:07.0: amdgpu:   in page starting at address 0x00007fc991c04000 from client 0x1b (UTCL2)
>>>> [  437.531392] amdgpu: qcm fence wait loop timeout expired
>>>> [  437.535738] amdgpu: The cp might be in an unrecoverable state due to an unsuccessful queues preemption
>>>> [  437.537191] amdgpu 0000:00:07.0: amdgpu: GPU reset begin!
>>>> [  438.087443] [drm] RE-INIT-early: nv_common succeeded
>>>>
>>>> As kfd relies on these to check if GPU is in reset, dropping it will hit some page fault and fence error very easily.
>>> To be clear, we can also hit the page fault with the reset_sem and in_gpu_reset, just not as easily as dropping them.
>> Are you saying that the entire patch-set with and without patch 'drm/amd/virt: Drop concurrent GPU reset protection for SRIOV
>> is casing this GPUVM page fault during testing engine hang while running benchmark ?
>>
>> Do you never observe this page fault when running this test with original tree without the new patch-set ?
>>
>> Andrey
>>
> I think this page fault issue can be seen even on the original tree. It's just drop the concurrent GPU reset will hit it more easily.
>
> We may need a new way to protect the reset in SRIOV.
>
Hi Andrey

Actually, I would like to propose a RFC based on your patch, which will move the waiting logic in SRIOV flr work into amdgpu_device_gpu_recover_imp, host will wait a certain time till the pre_reset work done and guest send back response then actually do the vf flr. Hopefully this will help solving the page fault issue.

JingWen

>>>>>>>> Andrey
>>>>>>>>
>>>>>>>>
>>>>>>>>> Regards,
>>>>>>>>> Christian.
>>>>>>>>>
>>>>>>>>> Am 04.01.22 um 11:49 schrieb Liu, Monk:
>>>>>>>>>> [AMD Official Use Only]
>>>>>>>>>>
>>>>>>>>>>>> See the FLR request from the hypervisor is just another source of signaling the need for a reset, similar to each job timeout on each queue. Otherwise you have a race condition between the hypervisor and the scheduler.
>>>>>>>>>> No it's not, FLR from hypervisor is just to notify guest the hw VF FLR is about to start or was already executed, but host will do FLR anyway without waiting for guest too long
>>>>>>>>>>
>>>>>>>>>>>> In other words I strongly think that the current SRIOV reset implementation is severely broken and what Andrey is doing is actually fixing it.
>>>>>>>>>> It makes the code to crash ... how could it be a fix ?
>>>>>>>>>>
>>>>>>>>>> I'm afraid the patch is NAK from me,  but it is welcome if the cleanup do not ruin the logic, Andry or jingwen can try it if needed.
>>>>>>>>>>
>>>>>>>>>> Thanks
>>>>>>>>>> -------------------------------------------------------------------
>>>>>>>>>> Monk Liu | Cloud GPU & Virtualization Solution | AMD
>>>>>>>>>> -------------------------------------------------------------------
>>>>>>>>>> we are hiring software manager for CVS core team
>>>>>>>>>> -------------------------------------------------------------------
>>>>>>>>>>
>>>>>>>>>> -----Original Message-----
>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>> Sent: Tuesday, January 4, 2022 6:19 PM
>>>>>>>>>> To: Chen, JingWen <JingWen.Chen2@amd.com>; Christian König <ckoenig.leichtzumerken@gmail.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Deng, Emily <Emily.Deng@amd.com>; Liu, Monk <Monk.Liu@amd.com>; dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org; Chen, Horace <Horace.Chen@amd.com>; Chen, JingWen <JingWen.Chen2@amd.com>
>>>>>>>>>> Cc: daniel@ffwll.ch
>>>>>>>>>> Subject: Re: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset protection for SRIOV
>>>>>>>>>>
>>>>>>>>>> Hi Jingwen,
>>>>>>>>>>
>>>>>>>>>> well what I mean is that we need to adjust the implementation in amdgpu to actually match the requirements.
>>>>>>>>>>
>>>>>>>>>> Could be that the reset sequence is questionable in general, but I doubt so at least for now.
>>>>>>>>>>
>>>>>>>>>> See the FLR request from the hypervisor is just another source of signaling the need for a reset, similar to each job timeout on each queue. Otherwise you have a race condition between the hypervisor and the scheduler.
>>>>>>>>>>
>>>>>>>>>> Properly setting in_gpu_reset is indeed mandatory, but should happen at a central place and not in the SRIOV specific code.
>>>>>>>>>>
>>>>>>>>>> In other words I strongly think that the current SRIOV reset implementation is severely broken and what Andrey is doing is actually fixing it.
>>>>>>>>>>
>>>>>>>>>> Regards,
>>>>>>>>>> Christian.
>>>>>>>>>>
>>>>>>>>>> Am 04.01.22 um 10:07 schrieb JingWen Chen:
>>>>>>>>>>> Hi Christian,
>>>>>>>>>>> I'm not sure what do you mean by "we need to change SRIOV not the driver".
>>>>>>>>>>>
>>>>>>>>>>> Do you mean we should change the reset sequence in SRIOV? This will be a huge change for our SRIOV solution.
>>>>>>>>>>>
>>>>>>>>>>>     From my point of view, we can directly use amdgpu_device_lock_adev
>>>>>>>>>>> and amdgpu_device_unlock_adev in flr_work instead of try_lock since no one will conflict with this thread with reset_domain introduced.
>>>>>>>>>>> But we do need the reset_sem and adev->in_gpu_reset to keep device untouched via user space.
>>>>>>>>>>>
>>>>>>>>>>> Best Regards,
>>>>>>>>>>> Jingwen Chen
>>>>>>>>>>>
>>>>>>>>>>> On 2022/1/3 下午6:17, Christian König wrote:
>>>>>>>>>>>> Please don't. This patch is vital to the cleanup of the reset procedure.
>>>>>>>>>>>>
>>>>>>>>>>>> If SRIOV doesn't work with that we need to change SRIOV and not the driver.
>>>>>>>>>>>>
>>>>>>>>>>>> Christian.
>>>>>>>>>>>>
>>>>>>>>>>>> Am 30.12.21 um 19:45 schrieb Andrey Grodzovsky:
>>>>>>>>>>>>> Sure, I guess i can drop this patch then.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>
>>>>>>>>>>>>> On 2021-12-24 4:57 a.m., JingWen Chen wrote:
>>>>>>>>>>>>>> I do agree with shaoyun, if the host find the gpu engine hangs first, and do the flr, guest side thread may not know this and still try to access HW(e.g. kfd is using a lot of amdgpu_in_reset and reset_sem to identify the reset status). And this may lead to very bad result.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> On 2021/12/24 下午4:58, Deng, Emily wrote:
>>>>>>>>>>>>>>> These patches look good to me. JingWen will pull these patches and do some basic TDR test on sriov environment, and give feedback.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>> From: Liu, Monk <Monk.Liu@amd.com>
>>>>>>>>>>>>>>>> Sent: Thursday, December 23, 2021 6:14 PM
>>>>>>>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; Grodzovsky,
>>>>>>>>>>>>>>>> Andrey <Andrey.Grodzovsky@amd.com>;
>>>>>>>>>>>>>>>> dri-devel@lists.freedesktop.org; amd- gfx@lists.freedesktop.org;
>>>>>>>>>>>>>>>> Chen, Horace <Horace.Chen@amd.com>; Chen, JingWen
>>>>>>>>>>>>>>>> <JingWen.Chen2@amd.com>; Deng, Emily <Emily.Deng@amd.com>
>>>>>>>>>>>>>>>> Cc: daniel@ffwll.ch
>>>>>>>>>>>>>>>> Subject: RE: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset
>>>>>>>>>>>>>>>> protection for SRIOV
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> [AMD Official Use Only]
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> @Chen, Horace @Chen, JingWen @Deng, Emily
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Please take a review on Andrey's patch
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Thanks
>>>>>>>>>>>>>>>> -----------------------------------------------------------------
>>>>>>>>>>>>>>>> -- Monk Liu | Cloud GPU & Virtualization Solution | AMD
>>>>>>>>>>>>>>>> -----------------------------------------------------------------
>>>>>>>>>>>>>>>> -- we are hiring software manager for CVS core team
>>>>>>>>>>>>>>>> -----------------------------------------------------------------
>>>>>>>>>>>>>>>> -- 
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>>>> Sent: Thursday, December 23, 2021 4:42 PM
>>>>>>>>>>>>>>>> To: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; dri-
>>>>>>>>>>>>>>>> devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>> Cc: daniel@ffwll.ch; Liu, Monk <Monk.Liu@amd.com>; Chen, Horace
>>>>>>>>>>>>>>>> <Horace.Chen@amd.com>
>>>>>>>>>>>>>>>> Subject: Re: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset
>>>>>>>>>>>>>>>> protection for SRIOV
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Am 22.12.21 um 23:14 schrieb Andrey Grodzovsky:
>>>>>>>>>>>>>>>>> Since now flr work is serialized against  GPU resets there is no
>>>>>>>>>>>>>>>>> need for this.
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>>>>>>>>>>>>>>> Acked-by: Christian König <christian.koenig@amd.com>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>>>>        drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 11 -----------
>>>>>>>>>>>>>>>>>        drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 11 -----------
>>>>>>>>>>>>>>>>>        2 files changed, 22 deletions(-)
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>> index 487cd654b69e..7d59a66e3988 100644
>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>> @@ -248,15 +248,7 @@ static void xgpu_ai_mailbox_flr_work(struct
>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>            struct amdgpu_device *adev = container_of(virt, struct
>>>>>>>>>>>>>>>> amdgpu_device, virt);
>>>>>>>>>>>>>>>>>            int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> -    /* block amdgpu_gpu_recover till msg FLR COMPLETE received,
>>>>>>>>>>>>>>>>> -     * otherwise the mailbox msg will be ruined/reseted by
>>>>>>>>>>>>>>>>> -     * the VF FLR.
>>>>>>>>>>>>>>>>> -     */
>>>>>>>>>>>>>>>>> -    if (!down_write_trylock(&adev->reset_sem))
>>>>>>>>>>>>>>>>> -        return;
>>>>>>>>>>>>>>>>> -
>>>>>>>>>>>>>>>>> amdgpu_virt_fini_data_exchange(adev);
>>>>>>>>>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 1);
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>            xgpu_ai_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0,
>>>>>>>>>>>>>>>>> 0, 0);
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> @@ -269,9 +261,6 @@ static void xgpu_ai_mailbox_flr_work(struct
>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>            } while (timeout > 1);
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>        flr_done:
>>>>>>>>>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 0);
>>>>>>>>>>>>>>>>> -    up_write(&adev->reset_sem);
>>>>>>>>>>>>>>>>> -
>>>>>>>>>>>>>>>>>            /* Trigger recovery for world switch failure if no TDR
>>>>>>>>>>>>>>>>> */
>>>>>>>>>>>>>>>>>            if (amdgpu_device_should_recover_gpu(adev)
>>>>>>>>>>>>>>>>>                && (!amdgpu_device_has_job_running(adev) || diff
>>>>>>>>>>>>>>>>> --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>> index e3869067a31d..f82c066c8e8d 100644
>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>> @@ -277,15 +277,7 @@ static void xgpu_nv_mailbox_flr_work(struct
>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>            struct amdgpu_device *adev = container_of(virt, struct
>>>>>>>>>>>>>>>> amdgpu_device, virt);
>>>>>>>>>>>>>>>>>            int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> -    /* block amdgpu_gpu_recover till msg FLR COMPLETE received,
>>>>>>>>>>>>>>>>> -     * otherwise the mailbox msg will be ruined/reseted by
>>>>>>>>>>>>>>>>> -     * the VF FLR.
>>>>>>>>>>>>>>>>> -     */
>>>>>>>>>>>>>>>>> -    if (!down_write_trylock(&adev->reset_sem))
>>>>>>>>>>>>>>>>> -        return;
>>>>>>>>>>>>>>>>> -
>>>>>>>>>>>>>>>>> amdgpu_virt_fini_data_exchange(adev);
>>>>>>>>>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 1);
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>            xgpu_nv_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0,
>>>>>>>>>>>>>>>>> 0, 0);
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> @@ -298,9 +290,6 @@ static void xgpu_nv_mailbox_flr_work(struct
>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>            } while (timeout > 1);
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>        flr_done:
>>>>>>>>>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 0);
>>>>>>>>>>>>>>>>> -    up_write(&adev->reset_sem);
>>>>>>>>>>>>>>>>> -
>>>>>>>>>>>>>>>>>            /* Trigger recovery for world switch failure if no TDR
>>>>>>>>>>>>>>>>> */
>>>>>>>>>>>>>>>>>            if (amdgpu_device_should_recover_gpu(adev)
>>>>>>>>>>>>>>>>>                && (!amdgpu_device_has_job_running(adev) ||
Andrey Grodzovsky Jan. 7, 2022, 4:02 p.m. UTC | #27
On 2022-01-07 12:46 a.m., JingWen Chen wrote:
> On 2022/1/7 上午11:57, JingWen Chen wrote:
>> On 2022/1/7 上午3:13, Andrey Grodzovsky wrote:
>>> On 2022-01-06 12:18 a.m., JingWen Chen wrote:
>>>> On 2022/1/6 下午12:59, JingWen Chen wrote:
>>>>> On 2022/1/6 上午2:24, Andrey Grodzovsky wrote:
>>>>>> On 2022-01-05 2:59 a.m., Christian König wrote:
>>>>>>> Am 05.01.22 um 08:34 schrieb JingWen Chen:
>>>>>>>> On 2022/1/5 上午12:56, Andrey Grodzovsky wrote:
>>>>>>>>> On 2022-01-04 6:36 a.m., Christian König wrote:
>>>>>>>>>> Am 04.01.22 um 11:49 schrieb Liu, Monk:
>>>>>>>>>>> [AMD Official Use Only]
>>>>>>>>>>>
>>>>>>>>>>>>> See the FLR request from the hypervisor is just another source of signaling the need for a reset, similar to each job timeout on each queue. Otherwise you have a race condition between the hypervisor and the scheduler.
>>>>>>>>>>> No it's not, FLR from hypervisor is just to notify guest the hw VF FLR is about to start or was already executed, but host will do FLR anyway without waiting for guest too long
>>>>>>>>>>>
>>>>>>>>>> Then we have a major design issue in the SRIOV protocol and really need to question this.
>>>>>>>>>>
>>>>>>>>>> How do you want to prevent a race between the hypervisor resetting the hardware and the client trying the same because of a timeout?
>>>>>>>>>>
>>>>>>>>>> As far as I can see the procedure should be:
>>>>>>>>>> 1. We detect that a reset is necessary, either because of a fault a timeout or signal from hypervisor.
>>>>>>>>>> 2. For each of those potential reset sources a work item is send to the single workqueue.
>>>>>>>>>> 3. One of those work items execute first and prepares the reset.
>>>>>>>>>> 4. We either do the reset our self or notify the hypervisor that we are ready for the reset.
>>>>>>>>>> 5. Cleanup after the reset, eventually resubmit jobs etc..
>>>>>>>>>> 6. Cancel work items which might have been scheduled from other reset sources.
>>>>>>>>>>
>>>>>>>>>> It does make sense that the hypervisor resets the hardware without waiting for the clients for too long, but if we don't follow this general steps we will always have a race between the different components.
>>>>>>>>> Monk, just to add to this - if indeed as you say that 'FLR from hypervisor is just to notify guest the hw VF FLR is about to start or was already executed, but host will do FLR anyway without waiting for guest too long'
>>>>>>>>> and there is no strict waiting from the hypervisor for IDH_READY_TO_RESET to be recived from guest before starting the reset then setting in_gpu_reset and locking reset_sem from guest side is not really full proof
>>>>>>>>> protection from MMIO accesses by the guest - it only truly helps if hypervisor waits for that message before initiation of HW reset.
>>>>>>>>>
>>>>>>>> Hi Andrey, this cannot be done. If somehow guest kernel hangs and never has the chance to send the response back, then other VFs will have to wait it reset. All the vfs will hang in this case. Or sometimes the mailbox has some delay and other VFs will also wait. The user of other VFs will be affected in this case.
>>>>>>> Yeah, agree completely with JingWen. The hypervisor is the one in charge here, not the guest.
>>>>>>>
>>>>>>> What the hypervisor should do (and it already seems to be designed that way) is to send the guest a message that a reset is about to happen and give it some time to response appropriately.
>>>>>>>
>>>>>>> The guest on the other hand then tells the hypervisor that all processing has stopped and it is ready to restart. If that doesn't happen in time the hypervisor should eliminate the guest probably trigger even more severe consequences, e.g. restart the whole VM etc...
>>>>>>>
>>>>>>> Christian.
>>>>>> So what's the end conclusion here regarding dropping this particular patch ? Seems to me we still need to drop it to prevent driver's MMIO access
>>>>>> to the GPU during reset from various places in the code.
>>>>>>
>>>>>> Andrey
>>>>>>
>>>>> Hi Andrey & Christian,
>>>>>
>>>>> I have ported your patch(drop the reset_sem and in_gpu_reset in flr work) and run some tests. If a engine hang during an OCL benchmark(using kfd), we can see the logs below:
>>> Did you port the entire patchset or just 'drm/amd/virt: Drop concurrent GPU reset protection for SRIOV' ?
>>>
>>>
>> I ported the entire patchset
>>>>> [  397.190727] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>>>>> [  397.301496] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>>>>> [  397.406601] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>>>>> [  397.532343] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>>>>> [  397.642251] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>>>>> [  397.746634] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>>>>> [  397.850761] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>>>>> [  397.960544] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>>>>> [  398.065218] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>>>>> [  398.182173] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>>>>> [  398.288264] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>>>>> [  398.394712] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>>>>> [  428.400582] [drm] clean up the vf2pf work item
>>>>> [  428.500528] amdgpu 0000:00:07.0: amdgpu: [gfxhub] page fault (src_id:0 ring:153 vmid:8 pasid:32771, for process xgemmStandalone pid 3557 thread xgemmStandalone pid 3557)
>>>>> [  428.527576] amdgpu 0000:00:07.0: amdgpu:   in page starting at address 0x00007fc991c04000 from client 0x1b (UTCL2)
>>>>> [  437.531392] amdgpu: qcm fence wait loop timeout expired
>>>>> [  437.535738] amdgpu: The cp might be in an unrecoverable state due to an unsuccessful queues preemption
>>>>> [  437.537191] amdgpu 0000:00:07.0: amdgpu: GPU reset begin!
>>>>> [  438.087443] [drm] RE-INIT-early: nv_common succeeded
>>>>>
>>>>> As kfd relies on these to check if GPU is in reset, dropping it will hit some page fault and fence error very easily.
>>>> To be clear, we can also hit the page fault with the reset_sem and in_gpu_reset, just not as easily as dropping them.
>>> Are you saying that the entire patch-set with and without patch 'drm/amd/virt: Drop concurrent GPU reset protection for SRIOV
>>> is casing this GPUVM page fault during testing engine hang while running benchmark ?
>>>
>>> Do you never observe this page fault when running this test with original tree without the new patch-set ?
>>>
>>> Andrey
>>>
>> I think this page fault issue can be seen even on the original tree. It's just drop the concurrent GPU reset will hit it more easily.
>>
>> We may need a new way to protect the reset in SRIOV.
>>
> Hi Andrey
>
> Actually, I would like to propose a RFC based on your patch, which will move the waiting logic in SRIOV flr work into amdgpu_device_gpu_recover_imp, host will wait a certain time till the pre_reset work done and guest send back response then actually do the vf flr. Hopefully this will help solving the page fault issue.
>
> JingWen


This makes sense to me, you want the guest driver to be as idle as 
possible before host side starts actual reset. Go ahead and try it on 
top of my patch-set and update with results.
I am away all next week but will try to find time and peek at your updates.

Another question - how much the switch to single threaded reset makes 
SRIOV more unstable ? Is it OK to push the patches as is without your 
RFC or we need to wait for your RFC before push ?

Andrey


>
>>>>>>>>> Andrey
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>> Regards,
>>>>>>>>>> Christian.
>>>>>>>>>>
>>>>>>>>>> Am 04.01.22 um 11:49 schrieb Liu, Monk:
>>>>>>>>>>> [AMD Official Use Only]
>>>>>>>>>>>
>>>>>>>>>>>>> See the FLR request from the hypervisor is just another source of signaling the need for a reset, similar to each job timeout on each queue. Otherwise you have a race condition between the hypervisor and the scheduler.
>>>>>>>>>>> No it's not, FLR from hypervisor is just to notify guest the hw VF FLR is about to start or was already executed, but host will do FLR anyway without waiting for guest too long
>>>>>>>>>>>
>>>>>>>>>>>>> In other words I strongly think that the current SRIOV reset implementation is severely broken and what Andrey is doing is actually fixing it.
>>>>>>>>>>> It makes the code to crash ... how could it be a fix ?
>>>>>>>>>>>
>>>>>>>>>>> I'm afraid the patch is NAK from me,  but it is welcome if the cleanup do not ruin the logic, Andry or jingwen can try it if needed.
>>>>>>>>>>>
>>>>>>>>>>> Thanks
>>>>>>>>>>> -------------------------------------------------------------------
>>>>>>>>>>> Monk Liu | Cloud GPU & Virtualization Solution | AMD
>>>>>>>>>>> -------------------------------------------------------------------
>>>>>>>>>>> we are hiring software manager for CVS core team
>>>>>>>>>>> -------------------------------------------------------------------
>>>>>>>>>>>
>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>> Sent: Tuesday, January 4, 2022 6:19 PM
>>>>>>>>>>> To: Chen, JingWen <JingWen.Chen2@amd.com>; Christian König <ckoenig.leichtzumerken@gmail.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Deng, Emily <Emily.Deng@amd.com>; Liu, Monk <Monk.Liu@amd.com>; dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org; Chen, Horace <Horace.Chen@amd.com>; Chen, JingWen <JingWen.Chen2@amd.com>
>>>>>>>>>>> Cc: daniel@ffwll.ch
>>>>>>>>>>> Subject: Re: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset protection for SRIOV
>>>>>>>>>>>
>>>>>>>>>>> Hi Jingwen,
>>>>>>>>>>>
>>>>>>>>>>> well what I mean is that we need to adjust the implementation in amdgpu to actually match the requirements.
>>>>>>>>>>>
>>>>>>>>>>> Could be that the reset sequence is questionable in general, but I doubt so at least for now.
>>>>>>>>>>>
>>>>>>>>>>> See the FLR request from the hypervisor is just another source of signaling the need for a reset, similar to each job timeout on each queue. Otherwise you have a race condition between the hypervisor and the scheduler.
>>>>>>>>>>>
>>>>>>>>>>> Properly setting in_gpu_reset is indeed mandatory, but should happen at a central place and not in the SRIOV specific code.
>>>>>>>>>>>
>>>>>>>>>>> In other words I strongly think that the current SRIOV reset implementation is severely broken and what Andrey is doing is actually fixing it.
>>>>>>>>>>>
>>>>>>>>>>> Regards,
>>>>>>>>>>> Christian.
>>>>>>>>>>>
>>>>>>>>>>> Am 04.01.22 um 10:07 schrieb JingWen Chen:
>>>>>>>>>>>> Hi Christian,
>>>>>>>>>>>> I'm not sure what do you mean by "we need to change SRIOV not the driver".
>>>>>>>>>>>>
>>>>>>>>>>>> Do you mean we should change the reset sequence in SRIOV? This will be a huge change for our SRIOV solution.
>>>>>>>>>>>>
>>>>>>>>>>>>      From my point of view, we can directly use amdgpu_device_lock_adev
>>>>>>>>>>>> and amdgpu_device_unlock_adev in flr_work instead of try_lock since no one will conflict with this thread with reset_domain introduced.
>>>>>>>>>>>> But we do need the reset_sem and adev->in_gpu_reset to keep device untouched via user space.
>>>>>>>>>>>>
>>>>>>>>>>>> Best Regards,
>>>>>>>>>>>> Jingwen Chen
>>>>>>>>>>>>
>>>>>>>>>>>> On 2022/1/3 下午6:17, Christian König wrote:
>>>>>>>>>>>>> Please don't. This patch is vital to the cleanup of the reset procedure.
>>>>>>>>>>>>>
>>>>>>>>>>>>> If SRIOV doesn't work with that we need to change SRIOV and not the driver.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Am 30.12.21 um 19:45 schrieb Andrey Grodzovsky:
>>>>>>>>>>>>>> Sure, I guess i can drop this patch then.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> On 2021-12-24 4:57 a.m., JingWen Chen wrote:
>>>>>>>>>>>>>>> I do agree with shaoyun, if the host find the gpu engine hangs first, and do the flr, guest side thread may not know this and still try to access HW(e.g. kfd is using a lot of amdgpu_in_reset and reset_sem to identify the reset status). And this may lead to very bad result.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> On 2021/12/24 下午4:58, Deng, Emily wrote:
>>>>>>>>>>>>>>>> These patches look good to me. JingWen will pull these patches and do some basic TDR test on sriov environment, and give feedback.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>> From: Liu, Monk <Monk.Liu@amd.com>
>>>>>>>>>>>>>>>>> Sent: Thursday, December 23, 2021 6:14 PM
>>>>>>>>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; Grodzovsky,
>>>>>>>>>>>>>>>>> Andrey <Andrey.Grodzovsky@amd.com>;
>>>>>>>>>>>>>>>>> dri-devel@lists.freedesktop.org; amd- gfx@lists.freedesktop.org;
>>>>>>>>>>>>>>>>> Chen, Horace <Horace.Chen@amd.com>; Chen, JingWen
>>>>>>>>>>>>>>>>> <JingWen.Chen2@amd.com>; Deng, Emily <Emily.Deng@amd.com>
>>>>>>>>>>>>>>>>> Cc: daniel@ffwll.ch
>>>>>>>>>>>>>>>>> Subject: RE: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset
>>>>>>>>>>>>>>>>> protection for SRIOV
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> [AMD Official Use Only]
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> @Chen, Horace @Chen, JingWen @Deng, Emily
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> Please take a review on Andrey's patch
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> Thanks
>>>>>>>>>>>>>>>>> -----------------------------------------------------------------
>>>>>>>>>>>>>>>>> -- Monk Liu | Cloud GPU & Virtualization Solution | AMD
>>>>>>>>>>>>>>>>> -----------------------------------------------------------------
>>>>>>>>>>>>>>>>> -- we are hiring software manager for CVS core team
>>>>>>>>>>>>>>>>> -----------------------------------------------------------------
>>>>>>>>>>>>>>>>> -- 
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>>>>> Sent: Thursday, December 23, 2021 4:42 PM
>>>>>>>>>>>>>>>>> To: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; dri-
>>>>>>>>>>>>>>>>> devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>> Cc: daniel@ffwll.ch; Liu, Monk <Monk.Liu@amd.com>; Chen, Horace
>>>>>>>>>>>>>>>>> <Horace.Chen@amd.com>
>>>>>>>>>>>>>>>>> Subject: Re: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset
>>>>>>>>>>>>>>>>> protection for SRIOV
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> Am 22.12.21 um 23:14 schrieb Andrey Grodzovsky:
>>>>>>>>>>>>>>>>>> Since now flr work is serialized against  GPU resets there is no
>>>>>>>>>>>>>>>>>> need for this.
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>>>>>>>>>>>>>>>> Acked-by: Christian König <christian.koenig@amd.com>
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>>>>>         drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 11 -----------
>>>>>>>>>>>>>>>>>>         drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 11 -----------
>>>>>>>>>>>>>>>>>>         2 files changed, 22 deletions(-)
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>>> index 487cd654b69e..7d59a66e3988 100644
>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>>> @@ -248,15 +248,7 @@ static void xgpu_ai_mailbox_flr_work(struct
>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>             struct amdgpu_device *adev = container_of(virt, struct
>>>>>>>>>>>>>>>>> amdgpu_device, virt);
>>>>>>>>>>>>>>>>>>             int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> -    /* block amdgpu_gpu_recover till msg FLR COMPLETE received,
>>>>>>>>>>>>>>>>>> -     * otherwise the mailbox msg will be ruined/reseted by
>>>>>>>>>>>>>>>>>> -     * the VF FLR.
>>>>>>>>>>>>>>>>>> -     */
>>>>>>>>>>>>>>>>>> -    if (!down_write_trylock(&adev->reset_sem))
>>>>>>>>>>>>>>>>>> -        return;
>>>>>>>>>>>>>>>>>> -
>>>>>>>>>>>>>>>>>> amdgpu_virt_fini_data_exchange(adev);
>>>>>>>>>>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 1);
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>             xgpu_ai_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0,
>>>>>>>>>>>>>>>>>> 0, 0);
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> @@ -269,9 +261,6 @@ static void xgpu_ai_mailbox_flr_work(struct
>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>             } while (timeout > 1);
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>         flr_done:
>>>>>>>>>>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 0);
>>>>>>>>>>>>>>>>>> -    up_write(&adev->reset_sem);
>>>>>>>>>>>>>>>>>> -
>>>>>>>>>>>>>>>>>>             /* Trigger recovery for world switch failure if no TDR
>>>>>>>>>>>>>>>>>> */
>>>>>>>>>>>>>>>>>>             if (amdgpu_device_should_recover_gpu(adev)
>>>>>>>>>>>>>>>>>>                 && (!amdgpu_device_has_job_running(adev) || diff
>>>>>>>>>>>>>>>>>> --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>>> index e3869067a31d..f82c066c8e8d 100644
>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>>> @@ -277,15 +277,7 @@ static void xgpu_nv_mailbox_flr_work(struct
>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>             struct amdgpu_device *adev = container_of(virt, struct
>>>>>>>>>>>>>>>>> amdgpu_device, virt);
>>>>>>>>>>>>>>>>>>             int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> -    /* block amdgpu_gpu_recover till msg FLR COMPLETE received,
>>>>>>>>>>>>>>>>>> -     * otherwise the mailbox msg will be ruined/reseted by
>>>>>>>>>>>>>>>>>> -     * the VF FLR.
>>>>>>>>>>>>>>>>>> -     */
>>>>>>>>>>>>>>>>>> -    if (!down_write_trylock(&adev->reset_sem))
>>>>>>>>>>>>>>>>>> -        return;
>>>>>>>>>>>>>>>>>> -
>>>>>>>>>>>>>>>>>> amdgpu_virt_fini_data_exchange(adev);
>>>>>>>>>>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 1);
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>             xgpu_nv_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0,
>>>>>>>>>>>>>>>>>> 0, 0);
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> @@ -298,9 +290,6 @@ static void xgpu_nv_mailbox_flr_work(struct
>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>             } while (timeout > 1);
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>         flr_done:
>>>>>>>>>>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 0);
>>>>>>>>>>>>>>>>>> -    up_write(&adev->reset_sem);
>>>>>>>>>>>>>>>>>> -
>>>>>>>>>>>>>>>>>>             /* Trigger recovery for world switch failure if no TDR
>>>>>>>>>>>>>>>>>> */
>>>>>>>>>>>>>>>>>>             if (amdgpu_device_should_recover_gpu(adev)
>>>>>>>>>>>>>>>>>>                 && (!amdgpu_device_has_job_running(adev) ||
JingWen Chen Jan. 12, 2022, 6:28 a.m. UTC | #28
Hi Andrey,

Please go ahead and push your change. I will prepare the RFC later.

On 2022/1/8 上午12:02, Andrey Grodzovsky wrote:
>
> On 2022-01-07 12:46 a.m., JingWen Chen wrote:
>> On 2022/1/7 上午11:57, JingWen Chen wrote:
>>> On 2022/1/7 上午3:13, Andrey Grodzovsky wrote:
>>>> On 2022-01-06 12:18 a.m., JingWen Chen wrote:
>>>>> On 2022/1/6 下午12:59, JingWen Chen wrote:
>>>>>> On 2022/1/6 上午2:24, Andrey Grodzovsky wrote:
>>>>>>> On 2022-01-05 2:59 a.m., Christian König wrote:
>>>>>>>> Am 05.01.22 um 08:34 schrieb JingWen Chen:
>>>>>>>>> On 2022/1/5 上午12:56, Andrey Grodzovsky wrote:
>>>>>>>>>> On 2022-01-04 6:36 a.m., Christian König wrote:
>>>>>>>>>>> Am 04.01.22 um 11:49 schrieb Liu, Monk:
>>>>>>>>>>>> [AMD Official Use Only]
>>>>>>>>>>>>
>>>>>>>>>>>>>> See the FLR request from the hypervisor is just another source of signaling the need for a reset, similar to each job timeout on each queue. Otherwise you have a race condition between the hypervisor and the scheduler.
>>>>>>>>>>>> No it's not, FLR from hypervisor is just to notify guest the hw VF FLR is about to start or was already executed, but host will do FLR anyway without waiting for guest too long
>>>>>>>>>>>>
>>>>>>>>>>> Then we have a major design issue in the SRIOV protocol and really need to question this.
>>>>>>>>>>>
>>>>>>>>>>> How do you want to prevent a race between the hypervisor resetting the hardware and the client trying the same because of a timeout?
>>>>>>>>>>>
>>>>>>>>>>> As far as I can see the procedure should be:
>>>>>>>>>>> 1. We detect that a reset is necessary, either because of a fault a timeout or signal from hypervisor.
>>>>>>>>>>> 2. For each of those potential reset sources a work item is send to the single workqueue.
>>>>>>>>>>> 3. One of those work items execute first and prepares the reset.
>>>>>>>>>>> 4. We either do the reset our self or notify the hypervisor that we are ready for the reset.
>>>>>>>>>>> 5. Cleanup after the reset, eventually resubmit jobs etc..
>>>>>>>>>>> 6. Cancel work items which might have been scheduled from other reset sources.
>>>>>>>>>>>
>>>>>>>>>>> It does make sense that the hypervisor resets the hardware without waiting for the clients for too long, but if we don't follow this general steps we will always have a race between the different components.
>>>>>>>>>> Monk, just to add to this - if indeed as you say that 'FLR from hypervisor is just to notify guest the hw VF FLR is about to start or was already executed, but host will do FLR anyway without waiting for guest too long'
>>>>>>>>>> and there is no strict waiting from the hypervisor for IDH_READY_TO_RESET to be recived from guest before starting the reset then setting in_gpu_reset and locking reset_sem from guest side is not really full proof
>>>>>>>>>> protection from MMIO accesses by the guest - it only truly helps if hypervisor waits for that message before initiation of HW reset.
>>>>>>>>>>
>>>>>>>>> Hi Andrey, this cannot be done. If somehow guest kernel hangs and never has the chance to send the response back, then other VFs will have to wait it reset. All the vfs will hang in this case. Or sometimes the mailbox has some delay and other VFs will also wait. The user of other VFs will be affected in this case.
>>>>>>>> Yeah, agree completely with JingWen. The hypervisor is the one in charge here, not the guest.
>>>>>>>>
>>>>>>>> What the hypervisor should do (and it already seems to be designed that way) is to send the guest a message that a reset is about to happen and give it some time to response appropriately.
>>>>>>>>
>>>>>>>> The guest on the other hand then tells the hypervisor that all processing has stopped and it is ready to restart. If that doesn't happen in time the hypervisor should eliminate the guest probably trigger even more severe consequences, e.g. restart the whole VM etc...
>>>>>>>>
>>>>>>>> Christian.
>>>>>>> So what's the end conclusion here regarding dropping this particular patch ? Seems to me we still need to drop it to prevent driver's MMIO access
>>>>>>> to the GPU during reset from various places in the code.
>>>>>>>
>>>>>>> Andrey
>>>>>>>
>>>>>> Hi Andrey & Christian,
>>>>>>
>>>>>> I have ported your patch(drop the reset_sem and in_gpu_reset in flr work) and run some tests. If a engine hang during an OCL benchmark(using kfd), we can see the logs below:
>>>> Did you port the entire patchset or just 'drm/amd/virt: Drop concurrent GPU reset protection for SRIOV' ?
>>>>
>>>>
>>> I ported the entire patchset
>>>>>> [  397.190727] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>>>>>> [  397.301496] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>>>>>> [  397.406601] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>>>>>> [  397.532343] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>>>>>> [  397.642251] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>>>>>> [  397.746634] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>>>>>> [  397.850761] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>>>>>> [  397.960544] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>>>>>> [  398.065218] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>>>>>> [  398.182173] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>>>>>> [  398.288264] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>>>>>> [  398.394712] amdgpu 0000:00:07.0: amdgpu: wait for kiq fence error: 0.
>>>>>> [  428.400582] [drm] clean up the vf2pf work item
>>>>>> [  428.500528] amdgpu 0000:00:07.0: amdgpu: [gfxhub] page fault (src_id:0 ring:153 vmid:8 pasid:32771, for process xgemmStandalone pid 3557 thread xgemmStandalone pid 3557)
>>>>>> [  428.527576] amdgpu 0000:00:07.0: amdgpu:   in page starting at address 0x00007fc991c04000 from client 0x1b (UTCL2)
>>>>>> [  437.531392] amdgpu: qcm fence wait loop timeout expired
>>>>>> [  437.535738] amdgpu: The cp might be in an unrecoverable state due to an unsuccessful queues preemption
>>>>>> [  437.537191] amdgpu 0000:00:07.0: amdgpu: GPU reset begin!
>>>>>> [  438.087443] [drm] RE-INIT-early: nv_common succeeded
>>>>>>
>>>>>> As kfd relies on these to check if GPU is in reset, dropping it will hit some page fault and fence error very easily.
>>>>> To be clear, we can also hit the page fault with the reset_sem and in_gpu_reset, just not as easily as dropping them.
>>>> Are you saying that the entire patch-set with and without patch 'drm/amd/virt: Drop concurrent GPU reset protection for SRIOV
>>>> is casing this GPUVM page fault during testing engine hang while running benchmark ?
>>>>
>>>> Do you never observe this page fault when running this test with original tree without the new patch-set ?
>>>>
>>>> Andrey
>>>>
>>> I think this page fault issue can be seen even on the original tree. It's just drop the concurrent GPU reset will hit it more easily.
>>>
>>> We may need a new way to protect the reset in SRIOV.
>>>
>> Hi Andrey
>>
>> Actually, I would like to propose a RFC based on your patch, which will move the waiting logic in SRIOV flr work into amdgpu_device_gpu_recover_imp, host will wait a certain time till the pre_reset work done and guest send back response then actually do the vf flr. Hopefully this will help solving the page fault issue.
>>
>> JingWen
>
>
> This makes sense to me, you want the guest driver to be as idle as possible before host side starts actual reset. Go ahead and try it on top of my patch-set and update with results.
> I am away all next week but will try to find time and peek at your updates.
>
> Another question - how much the switch to single threaded reset makes SRIOV more unstable ? Is it OK to push the patches as is without your RFC or we need to wait for your RFC before push ?
>
> Andrey
>
>
>>
>>>>>>>>>> Andrey
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>> Regards,
>>>>>>>>>>> Christian.
>>>>>>>>>>>
>>>>>>>>>>> Am 04.01.22 um 11:49 schrieb Liu, Monk:
>>>>>>>>>>>> [AMD Official Use Only]
>>>>>>>>>>>>
>>>>>>>>>>>>>> See the FLR request from the hypervisor is just another source of signaling the need for a reset, similar to each job timeout on each queue. Otherwise you have a race condition between the hypervisor and the scheduler.
>>>>>>>>>>>> No it's not, FLR from hypervisor is just to notify guest the hw VF FLR is about to start or was already executed, but host will do FLR anyway without waiting for guest too long
>>>>>>>>>>>>
>>>>>>>>>>>>>> In other words I strongly think that the current SRIOV reset implementation is severely broken and what Andrey is doing is actually fixing it.
>>>>>>>>>>>> It makes the code to crash ... how could it be a fix ?
>>>>>>>>>>>>
>>>>>>>>>>>> I'm afraid the patch is NAK from me,  but it is welcome if the cleanup do not ruin the logic, Andry or jingwen can try it if needed.
>>>>>>>>>>>>
>>>>>>>>>>>> Thanks
>>>>>>>>>>>> -------------------------------------------------------------------
>>>>>>>>>>>> Monk Liu | Cloud GPU & Virtualization Solution | AMD
>>>>>>>>>>>> -------------------------------------------------------------------
>>>>>>>>>>>> we are hiring software manager for CVS core team
>>>>>>>>>>>> -------------------------------------------------------------------
>>>>>>>>>>>>
>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>> Sent: Tuesday, January 4, 2022 6:19 PM
>>>>>>>>>>>> To: Chen, JingWen <JingWen.Chen2@amd.com>; Christian König <ckoenig.leichtzumerken@gmail.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Deng, Emily <Emily.Deng@amd.com>; Liu, Monk <Monk.Liu@amd.com>; dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org; Chen, Horace <Horace.Chen@amd.com>; Chen, JingWen <JingWen.Chen2@amd.com>
>>>>>>>>>>>> Cc: daniel@ffwll.ch
>>>>>>>>>>>> Subject: Re: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset protection for SRIOV
>>>>>>>>>>>>
>>>>>>>>>>>> Hi Jingwen,
>>>>>>>>>>>>
>>>>>>>>>>>> well what I mean is that we need to adjust the implementation in amdgpu to actually match the requirements.
>>>>>>>>>>>>
>>>>>>>>>>>> Could be that the reset sequence is questionable in general, but I doubt so at least for now.
>>>>>>>>>>>>
>>>>>>>>>>>> See the FLR request from the hypervisor is just another source of signaling the need for a reset, similar to each job timeout on each queue. Otherwise you have a race condition between the hypervisor and the scheduler.
>>>>>>>>>>>>
>>>>>>>>>>>> Properly setting in_gpu_reset is indeed mandatory, but should happen at a central place and not in the SRIOV specific code.
>>>>>>>>>>>>
>>>>>>>>>>>> In other words I strongly think that the current SRIOV reset implementation is severely broken and what Andrey is doing is actually fixing it.
>>>>>>>>>>>>
>>>>>>>>>>>> Regards,
>>>>>>>>>>>> Christian.
>>>>>>>>>>>>
>>>>>>>>>>>> Am 04.01.22 um 10:07 schrieb JingWen Chen:
>>>>>>>>>>>>> Hi Christian,
>>>>>>>>>>>>> I'm not sure what do you mean by "we need to change SRIOV not the driver".
>>>>>>>>>>>>>
>>>>>>>>>>>>> Do you mean we should change the reset sequence in SRIOV? This will be a huge change for our SRIOV solution.
>>>>>>>>>>>>>
>>>>>>>>>>>>>      From my point of view, we can directly use amdgpu_device_lock_adev
>>>>>>>>>>>>> and amdgpu_device_unlock_adev in flr_work instead of try_lock since no one will conflict with this thread with reset_domain introduced.
>>>>>>>>>>>>> But we do need the reset_sem and adev->in_gpu_reset to keep device untouched via user space.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Best Regards,
>>>>>>>>>>>>> Jingwen Chen
>>>>>>>>>>>>>
>>>>>>>>>>>>> On 2022/1/3 下午6:17, Christian König wrote:
>>>>>>>>>>>>>> Please don't. This patch is vital to the cleanup of the reset procedure.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> If SRIOV doesn't work with that we need to change SRIOV and not the driver.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Christian.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Am 30.12.21 um 19:45 schrieb Andrey Grodzovsky:
>>>>>>>>>>>>>>> Sure, I guess i can drop this patch then.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Andrey
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> On 2021-12-24 4:57 a.m., JingWen Chen wrote:
>>>>>>>>>>>>>>>> I do agree with shaoyun, if the host find the gpu engine hangs first, and do the flr, guest side thread may not know this and still try to access HW(e.g. kfd is using a lot of amdgpu_in_reset and reset_sem to identify the reset status). And this may lead to very bad result.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> On 2021/12/24 下午4:58, Deng, Emily wrote:
>>>>>>>>>>>>>>>>> These patches look good to me. JingWen will pull these patches and do some basic TDR test on sriov environment, and give feedback.
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> Best wishes
>>>>>>>>>>>>>>>>> Emily Deng
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>> From: Liu, Monk <Monk.Liu@amd.com>
>>>>>>>>>>>>>>>>>> Sent: Thursday, December 23, 2021 6:14 PM
>>>>>>>>>>>>>>>>>> To: Koenig, Christian <Christian.Koenig@amd.com>; Grodzovsky,
>>>>>>>>>>>>>>>>>> Andrey <Andrey.Grodzovsky@amd.com>;
>>>>>>>>>>>>>>>>>> dri-devel@lists.freedesktop.org; amd- gfx@lists.freedesktop.org;
>>>>>>>>>>>>>>>>>> Chen, Horace <Horace.Chen@amd.com>; Chen, JingWen
>>>>>>>>>>>>>>>>>> <JingWen.Chen2@amd.com>; Deng, Emily <Emily.Deng@amd.com>
>>>>>>>>>>>>>>>>>> Cc: daniel@ffwll.ch
>>>>>>>>>>>>>>>>>> Subject: RE: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset
>>>>>>>>>>>>>>>>>> protection for SRIOV
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> [AMD Official Use Only]
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> @Chen, Horace @Chen, JingWen @Deng, Emily
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> Please take a review on Andrey's patch
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> Thanks
>>>>>>>>>>>>>>>>>> -----------------------------------------------------------------
>>>>>>>>>>>>>>>>>> -- Monk Liu | Cloud GPU & Virtualization Solution | AMD
>>>>>>>>>>>>>>>>>> -----------------------------------------------------------------
>>>>>>>>>>>>>>>>>> -- we are hiring software manager for CVS core team
>>>>>>>>>>>>>>>>>> -----------------------------------------------------------------
>>>>>>>>>>>>>>>>>> -- 
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>>>>>>> From: Koenig, Christian <Christian.Koenig@amd.com>
>>>>>>>>>>>>>>>>>> Sent: Thursday, December 23, 2021 4:42 PM
>>>>>>>>>>>>>>>>>> To: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; dri-
>>>>>>>>>>>>>>>>>> devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
>>>>>>>>>>>>>>>>>> Cc: daniel@ffwll.ch; Liu, Monk <Monk.Liu@amd.com>; Chen, Horace
>>>>>>>>>>>>>>>>>> <Horace.Chen@amd.com>
>>>>>>>>>>>>>>>>>> Subject: Re: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset
>>>>>>>>>>>>>>>>>> protection for SRIOV
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> Am 22.12.21 um 23:14 schrieb Andrey Grodzovsky:
>>>>>>>>>>>>>>>>>>> Since now flr work is serialized against  GPU resets there is no
>>>>>>>>>>>>>>>>>>> need for this.
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>>>>>>>>>>>>>>>>> Acked-by: Christian König <christian.koenig@amd.com>
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>>>>>>         drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 11 -----------
>>>>>>>>>>>>>>>>>>>         drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 11 -----------
>>>>>>>>>>>>>>>>>>>         2 files changed, 22 deletions(-)
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>>>> index 487cd654b69e..7d59a66e3988 100644
>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>>>>>>>>>>>>>>>>>>> @@ -248,15 +248,7 @@ static void xgpu_ai_mailbox_flr_work(struct
>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>             struct amdgpu_device *adev = container_of(virt, struct
>>>>>>>>>>>>>>>>>> amdgpu_device, virt);
>>>>>>>>>>>>>>>>>>>             int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> -    /* block amdgpu_gpu_recover till msg FLR COMPLETE received,
>>>>>>>>>>>>>>>>>>> -     * otherwise the mailbox msg will be ruined/reseted by
>>>>>>>>>>>>>>>>>>> -     * the VF FLR.
>>>>>>>>>>>>>>>>>>> -     */
>>>>>>>>>>>>>>>>>>> -    if (!down_write_trylock(&adev->reset_sem))
>>>>>>>>>>>>>>>>>>> -        return;
>>>>>>>>>>>>>>>>>>> -
>>>>>>>>>>>>>>>>>>> amdgpu_virt_fini_data_exchange(adev);
>>>>>>>>>>>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 1);
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>             xgpu_ai_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0,
>>>>>>>>>>>>>>>>>>> 0, 0);
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> @@ -269,9 +261,6 @@ static void xgpu_ai_mailbox_flr_work(struct
>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>             } while (timeout > 1);
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>         flr_done:
>>>>>>>>>>>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 0);
>>>>>>>>>>>>>>>>>>> -    up_write(&adev->reset_sem);
>>>>>>>>>>>>>>>>>>> -
>>>>>>>>>>>>>>>>>>>             /* Trigger recovery for world switch failure if no TDR
>>>>>>>>>>>>>>>>>>> */
>>>>>>>>>>>>>>>>>>>             if (amdgpu_device_should_recover_gpu(adev)
>>>>>>>>>>>>>>>>>>>                 && (!amdgpu_device_has_job_running(adev) || diff
>>>>>>>>>>>>>>>>>>> --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>>>> index e3869067a31d..f82c066c8e8d 100644
>>>>>>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>>>>>>>>>>>>>>>>>>> @@ -277,15 +277,7 @@ static void xgpu_nv_mailbox_flr_work(struct
>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>             struct amdgpu_device *adev = container_of(virt, struct
>>>>>>>>>>>>>>>>>> amdgpu_device, virt);
>>>>>>>>>>>>>>>>>>>             int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> -    /* block amdgpu_gpu_recover till msg FLR COMPLETE received,
>>>>>>>>>>>>>>>>>>> -     * otherwise the mailbox msg will be ruined/reseted by
>>>>>>>>>>>>>>>>>>> -     * the VF FLR.
>>>>>>>>>>>>>>>>>>> -     */
>>>>>>>>>>>>>>>>>>> -    if (!down_write_trylock(&adev->reset_sem))
>>>>>>>>>>>>>>>>>>> -        return;
>>>>>>>>>>>>>>>>>>> -
>>>>>>>>>>>>>>>>>>> amdgpu_virt_fini_data_exchange(adev);
>>>>>>>>>>>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 1);
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>             xgpu_nv_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0,
>>>>>>>>>>>>>>>>>>> 0, 0);
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> @@ -298,9 +290,6 @@ static void xgpu_nv_mailbox_flr_work(struct
>>>>>>>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>>>>>>>>             } while (timeout > 1);
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>         flr_done:
>>>>>>>>>>>>>>>>>>> -    atomic_set(&adev->in_gpu_reset, 0);
>>>>>>>>>>>>>>>>>>> -    up_write(&adev->reset_sem);
>>>>>>>>>>>>>>>>>>> -
>>>>>>>>>>>>>>>>>>>             /* Trigger recovery for world switch failure if no TDR
>>>>>>>>>>>>>>>>>>> */
>>>>>>>>>>>>>>>>>>>             if (amdgpu_device_should_recover_gpu(adev)
>>>>>>>>>>>>>>>>>>>                 && (!amdgpu_device_has_job_running(adev) ||
diff mbox series

Patch

diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index 487cd654b69e..7d59a66e3988 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -248,15 +248,7 @@  static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
 	struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
 	int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
 
-	/* block amdgpu_gpu_recover till msg FLR COMPLETE received,
-	 * otherwise the mailbox msg will be ruined/reseted by
-	 * the VF FLR.
-	 */
-	if (!down_write_trylock(&adev->reset_sem))
-		return;
-
 	amdgpu_virt_fini_data_exchange(adev);
-	atomic_set(&adev->in_gpu_reset, 1);
 
 	xgpu_ai_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0, 0, 0);
 
@@ -269,9 +261,6 @@  static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
 	} while (timeout > 1);
 
 flr_done:
-	atomic_set(&adev->in_gpu_reset, 0);
-	up_write(&adev->reset_sem);
-
 	/* Trigger recovery for world switch failure if no TDR */
 	if (amdgpu_device_should_recover_gpu(adev)
 		&& (!amdgpu_device_has_job_running(adev) ||
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
index e3869067a31d..f82c066c8e8d 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
@@ -277,15 +277,7 @@  static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
 	struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
 	int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
 
-	/* block amdgpu_gpu_recover till msg FLR COMPLETE received,
-	 * otherwise the mailbox msg will be ruined/reseted by
-	 * the VF FLR.
-	 */
-	if (!down_write_trylock(&adev->reset_sem))
-		return;
-
 	amdgpu_virt_fini_data_exchange(adev);
-	atomic_set(&adev->in_gpu_reset, 1);
 
 	xgpu_nv_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0, 0, 0);
 
@@ -298,9 +290,6 @@  static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
 	} while (timeout > 1);
 
 flr_done:
-	atomic_set(&adev->in_gpu_reset, 0);
-	up_write(&adev->reset_sem);
-
 	/* Trigger recovery for world switch failure if no TDR */
 	if (amdgpu_device_should_recover_gpu(adev)
 		&& (!amdgpu_device_has_job_running(adev) ||