diff mbox series

drm/amd/display: Fix module load hangs when connected to an eDP

Message ID 20201009211759.2106060-1-Rodrigo.Siqueira@amd.com (mailing list archive)
State New, archived
Headers show
Series drm/amd/display: Fix module load hangs when connected to an eDP | expand

Commit Message

Rodrigo Siqueira Jordao Oct. 9, 2020, 9:17 p.m. UTC
It was recently introduced a change that enables driver to disable
streams if pixel clock changes. Consequently, the code path executed in
the disable vbios function expanded to an encoder verification part.
The encoder loop is nested inside the pipe count loop, and both loops
share the 'i' variable in control of their flow. This situation may lead
to an infinite loop because the encoder loop constantly updates the `i`
variable, making the first loop always positive. As a result, we can see
a soft hang during the module load (modprobe amdgpu) and a series of
dmesg log that looks like this:

kernel:[  124.538727] watchdog: BUG: soft lockup - CPU#2 stuck for 22s!
[modprobe:1000]
RSP: 0018:ffffabbf419bf0e8 EFLAGS: 00000282
RAX: ffffffffc0809de0 RBX: ffff93b35ccc0000 RCX: ffff93b366c21800
RDX: 0000000000000000 RSI: 0000000000000141 RDI: ffff93b35ccc0000
RBP: ffffabbf419bf108 R08: ffffabbf419bf164 R09: 0000000000000001
R10: 0000000000000003 R11: 0000000000000003 R12: 0000000008677d40
R13: 0000000000000141 R14: ffff93b35cfc0000 R15: ffff93b35abc0000
FS:  00007f1400717540(0000) GS:ffff93b37f680000(0000)
knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00005649b66b0968 CR3: 00000003e0fec000 CR4: 0000000000350ee0

Call Trace:
 amdgpu_device_rreg+0x17/0x20 [amdgpu]
 amdgpu_cgs_read_register+0x14/0x20 [amdgpu]
 dm_read_reg_func+0x3a/0xb0 [amdgpu]
 get_pixel_clk_frequency_100hz+0x30/0x50 [amdgpu]
 dc_commit_state+0x8f1/0xae0 [amdgpu]
 ? drm_calc_timestamping_constants+0x101/0x160 [drm]
 amdgpu_dm_atomic_commit_tail+0x39d/0x21a0 [amdgpu]
 ? dcn21_validate_bandwidth+0xe5/0x290 [amdgpu]
 ? kfree+0xc3/0x390
 ? dcn21_validate_bandwidth+0xe5/0x290 [amdgpu]
...
RSP: 002b:00007fff26009bd8 EFLAGS: 00000246 ORIG_RAX: 0000000000000139
RAX: ffffffffffffffda RBX: 000055a8025bea50 RCX: 00007f140085c89d
RDX: 0000000000000000 RSI: 000055a8025b8290 RDI: 000000000000000c
RBP: 0000000000040000 R08: 0000000000000000 R09: 0000000000000000
R10: 000000000000000c R11: 0000000000000246 R12: 000055a8025b8290
R13: 0000000000000000 R14: 000055a8025bead0 R15: 000055a8025bea50

This issue was fixed by introducing a second variable for the internal
loop.

Fixes: 32933ac6ec3a ("drm/amd/display: disable stream if pixel clock changed with link active")
Signed-off-by: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com>
---
 drivers/gpu/drm/amd/display/dc/core/dc.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

Comments

Li, Roman Oct. 9, 2020, 9:22 p.m. UTC | #1
[AMD Public Use]

Reviewed-by: Roman Li <Roman.Li@amd.com>

-----Original Message-----
From: Siqueira, Rodrigo <Rodrigo.Siqueira@amd.com> 
Sent: Friday, October 9, 2020 5:18 PM
To: dri-devel@lists.freedesktop.org
Cc: Wentland, Harry <Harry.Wentland@amd.com>; Li, Sun peng (Leo) <Sunpeng.Li@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Li, Roman <Roman.Li@amd.com>; Kazlauskas, Nicholas <Nicholas.Kazlauskas@amd.com>
Subject: [PATCH] drm/amd/display: Fix module load hangs when connected to an eDP

It was recently introduced a change that enables driver to disable streams if pixel clock changes. Consequently, the code path executed in the disable vbios function expanded to an encoder verification part.
The encoder loop is nested inside the pipe count loop, and both loops share the 'i' variable in control of their flow. This situation may lead to an infinite loop because the encoder loop constantly updates the `i` variable, making the first loop always positive. As a result, we can see a soft hang during the module load (modprobe amdgpu) and a series of dmesg log that looks like this:

kernel:[  124.538727] watchdog: BUG: soft lockup - CPU#2 stuck for 22s!
[modprobe:1000]
RSP: 0018:ffffabbf419bf0e8 EFLAGS: 00000282
RAX: ffffffffc0809de0 RBX: ffff93b35ccc0000 RCX: ffff93b366c21800
RDX: 0000000000000000 RSI: 0000000000000141 RDI: ffff93b35ccc0000
RBP: ffffabbf419bf108 R08: ffffabbf419bf164 R09: 0000000000000001
R10: 0000000000000003 R11: 0000000000000003 R12: 0000000008677d40
R13: 0000000000000141 R14: ffff93b35cfc0000 R15: ffff93b35abc0000
FS:  00007f1400717540(0000) GS:ffff93b37f680000(0000)
knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00005649b66b0968 CR3: 00000003e0fec000 CR4: 0000000000350ee0

Call Trace:
 amdgpu_device_rreg+0x17/0x20 [amdgpu]
 amdgpu_cgs_read_register+0x14/0x20 [amdgpu]
 dm_read_reg_func+0x3a/0xb0 [amdgpu]
 get_pixel_clk_frequency_100hz+0x30/0x50 [amdgpu]
 dc_commit_state+0x8f1/0xae0 [amdgpu]
 ? drm_calc_timestamping_constants+0x101/0x160 [drm]
 amdgpu_dm_atomic_commit_tail+0x39d/0x21a0 [amdgpu]  ? dcn21_validate_bandwidth+0xe5/0x290 [amdgpu]  ? kfree+0xc3/0x390  ? dcn21_validate_bandwidth+0xe5/0x290 [amdgpu] ...
RSP: 002b:00007fff26009bd8 EFLAGS: 00000246 ORIG_RAX: 0000000000000139
RAX: ffffffffffffffda RBX: 000055a8025bea50 RCX: 00007f140085c89d
RDX: 0000000000000000 RSI: 000055a8025b8290 RDI: 000000000000000c
RBP: 0000000000040000 R08: 0000000000000000 R09: 0000000000000000
R10: 000000000000000c R11: 0000000000000246 R12: 000055a8025b8290
R13: 0000000000000000 R14: 000055a8025bead0 R15: 000055a8025bea50

This issue was fixed by introducing a second variable for the internal loop.

Fixes: 32933ac6ec3a ("drm/amd/display: disable stream if pixel clock changed with link active")
Signed-off-by: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com>
---
 drivers/gpu/drm/amd/display/dc/core/dc.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c b/drivers/gpu/drm/amd/display/dc/core/dc.c
index 8d96ef157987..3f7d04b8956a 100644
--- a/drivers/gpu/drm/amd/display/dc/core/dc.c
+++ b/drivers/gpu/drm/amd/display/dc/core/dc.c
@@ -848,7 +848,7 @@ static void disable_vbios_mode_if_required(
 		struct dc *dc,
 		struct dc_state *context)
 {
-	unsigned int i;
+	unsigned int i, j;
 
 	/* check if timing_changed, disable stream*/
 	for (i = 0; i < dc->res_pool->pipe_count; i++) { @@ -872,10 +872,10 @@ static void disable_vbios_mode_if_required(
 
 			enc_inst = link->link_enc->funcs->get_dig_frontend(link->link_enc);
 			if (enc_inst != ENGINE_ID_UNKNOWN) {
-				for (i = 0; i < dc->res_pool->stream_enc_count; i++) {
-					if (dc->res_pool->stream_enc[i]->id == enc_inst) {
-						tg_inst = dc->res_pool->stream_enc[i]->funcs->dig_source_otg(
-							dc->res_pool->stream_enc[i]);
+				for (j = 0; j < dc->res_pool->stream_enc_count; j++) {
+					if (dc->res_pool->stream_enc[j]->id == enc_inst) {
+						tg_inst = dc->res_pool->stream_enc[j]->funcs->dig_source_otg(
+							dc->res_pool->stream_enc[j]);
 						break;
 					}
 				}
--
2.28.0
diff mbox series

Patch

diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c b/drivers/gpu/drm/amd/display/dc/core/dc.c
index 8d96ef157987..3f7d04b8956a 100644
--- a/drivers/gpu/drm/amd/display/dc/core/dc.c
+++ b/drivers/gpu/drm/amd/display/dc/core/dc.c
@@ -848,7 +848,7 @@  static void disable_vbios_mode_if_required(
 		struct dc *dc,
 		struct dc_state *context)
 {
-	unsigned int i;
+	unsigned int i, j;
 
 	/* check if timing_changed, disable stream*/
 	for (i = 0; i < dc->res_pool->pipe_count; i++) {
@@ -872,10 +872,10 @@  static void disable_vbios_mode_if_required(
 
 			enc_inst = link->link_enc->funcs->get_dig_frontend(link->link_enc);
 			if (enc_inst != ENGINE_ID_UNKNOWN) {
-				for (i = 0; i < dc->res_pool->stream_enc_count; i++) {
-					if (dc->res_pool->stream_enc[i]->id == enc_inst) {
-						tg_inst = dc->res_pool->stream_enc[i]->funcs->dig_source_otg(
-							dc->res_pool->stream_enc[i]);
+				for (j = 0; j < dc->res_pool->stream_enc_count; j++) {
+					if (dc->res_pool->stream_enc[j]->id == enc_inst) {
+						tg_inst = dc->res_pool->stream_enc[j]->funcs->dig_source_otg(
+							dc->res_pool->stream_enc[j]);
 						break;
 					}
 				}