diff mbox series

[v3,1/2] media: mediatek: vcodec: adding lock to protect decoder context list

Message ID 20240222092609.31382-2-yunfei.dong@mediatek.com (mailing list archive)
State New, archived
Headers show
Series media: adding lock to protect the context list | expand

Commit Message

Yunfei Dong Feb. 22, 2024, 9:26 a.m. UTC
The ctx_list is deleted when the SCP hits unexpected behavior, which leaves
ctx_list->next set to NULL. The kernel driver may then dereference a NULL
pointer in vpu_dec_ipi_handler while walking the contexts, causing a reboot.

Add a lock to protect the ctx_list, so that ctx_list->next is never a NULL
pointer while the list is being walked.

Hardware name: Google juniper sku16 board (DT)
pstate: 20400005 (nzCv daif +PAN -UAO -TCO BTYPE=--)
pc : vpu_dec_ipi_handler+0x58/0x1f8 [mtk_vcodec_dec]
lr : scp_ipi_handler+0xd0/0x194 [mtk_scp]
sp : ffffffc0131dbbd0
x29: ffffffc0131dbbd0 x28: 0000000000000000
x27: ffffff9bb277f348 x26: ffffff9bb242ad00
x25: ffffffd2d440d3b8 x24: ffffffd2a13ff1d4
x23: ffffff9bb7fe85a0 x22: ffffffc0133fbdb0
x21: 0000000000000010 x20: ffffff9b050ea328
x19: ffffffc0131dbc08 x18: 0000000000001000
x17: 0000000000000000 x16: ffffffd2d461c6e0
x15: 0000000000000242 x14: 000000000000018f
x13: 000000000000004d x12: 0000000000000000
x11: 0000000000000001 x10: fffffffffffffff0
x9 : ffffff9bb6e793a8 x8 : 0000000000000000
x7 : 0000000000000000 x6 : 000000000000003f
x5 : 0000000000000040 x4 : fffffffffffffff0
x3 : 0000000000000020 x2 : ffffff9bb6e79080
x1 : 0000000000000010 x0 : ffffffc0131dbc08
Call trace:
vpu_dec_ipi_handler+0x58/0x1f8 [mtk_vcodec_dec (HASH:6c3f 2)]
scp_ipi_handler+0xd0/0x194 [mtk_scp (HASH:7046 3)]
mt8183_scp_irq_handler+0x44/0x88 [mtk_scp (HASH:7046 3)]
scp_irq_handler+0x48/0x90 [mtk_scp (HASH:7046 3)]
irq_thread_fn+0x38/0x94
irq_thread+0x100/0x1c0
kthread+0x140/0x1fc
ret_from_fork+0x10/0x30
Code: 54000088 f94ca50a eb14015f 54000060 (f9400108)
---[ end trace ace43ce36cbd5c93 ]---
Kernel panic - not syncing: Oops: Fatal exception
SMP: stopping secondary CPUs
Kernel Offset: 0x12c4000000 from 0xffffffc010000000
PHYS_OFFSET: 0xffffffe580000000
CPU features: 0x08240002,2188200c
Memory Limit: none

Fixes: 655b86e52eac ("media: mediatek: vcodec: Fix possible invalid memory access for decoder")
Signed-off-by: Yunfei Dong <yunfei.dong@mediatek.com>
---
 .../platform/mediatek/vcodec/common/mtk_vcodec_fw_vpu.c      | 4 ++--
 .../platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.c    | 5 +++++
 .../platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.h    | 2 ++
 drivers/media/platform/mediatek/vcodec/decoder/vdec_vpu_if.c | 2 ++
 4 files changed, 11 insertions(+), 2 deletions(-)

Comments

AngeloGioacchino Del Regno Feb. 23, 2024, 8:28 a.m. UTC | #1
Il 22/02/24 10:26, Yunfei Dong ha scritto:
> The ctx_list will be deleted when scp getting unexpected behavior, then the
> ctx_list->next will be NULL, the kernel driver maybe access NULL pointer in
> function vpu_dec_ipi_handler when going through each context, then reboot.
> 
> Need to add lock to protect the ctx_list to make sure the ctx_list->next isn't
> NULL pointer.
> 
> Hardware name: Google juniper sku16 board (DT)
> pstate: 20400005 (nzCv daif +PAN -UAO -TCO BTYPE=--)
> pc : vpu_dec_ipi_handler+0x58/0x1f8 [mtk_vcodec_dec]
> lr : scp_ipi_handler+0xd0/0x194 [mtk_scp]
> sp : ffffffc0131dbbd0
> x29: ffffffc0131dbbd0 x28: 0000000000000000
> x27: ffffff9bb277f348 x26: ffffff9bb242ad00
> x25: ffffffd2d440d3b8 x24: ffffffd2a13ff1d4
> x23: ffffff9bb7fe85a0 x22: ffffffc0133fbdb0
> x21: 0000000000000010 x20: ffffff9b050ea328
> x19: ffffffc0131dbc08 x18: 0000000000001000
> x17: 0000000000000000 x16: ffffffd2d461c6e0
> x15: 0000000000000242 x14: 000000000000018f
> x13: 000000000000004d x12: 0000000000000000
> x11: 0000000000000001 x10: fffffffffffffff0
> x9 : ffffff9bb6e793a8 x8 : 0000000000000000
> x7 : 0000000000000000 x6 : 000000000000003f
> x5 : 0000000000000040 x4 : fffffffffffffff0
> x3 : 0000000000000020 x2 : ffffff9bb6e79080
> x1 : 0000000000000010 x0 : ffffffc0131dbc08
> Call trace:
> vpu_dec_ipi_handler+0x58/0x1f8 [mtk_vcodec_dec (HASH:6c3f 2)]
> scp_ipi_handler+0xd0/0x194 [mtk_scp (HASH:7046 3)]
> mt8183_scp_irq_handler+0x44/0x88 [mtk_scp (HASH:7046 3)]
> scp_irq_handler+0x48/0x90 [mtk_scp (HASH:7046 3)]
> irq_thread_fn+0x38/0x94
> irq_thread+0x100/0x1c0
> kthread+0x140/0x1fc
> ret_from_fork+0x10/0x30
> Code: 54000088 f94ca50a eb14015f 54000060 (f9400108)
> ---[ end trace ace43ce36cbd5c93 ]---
> Kernel panic - not syncing: Oops: Fatal exception
> SMP: stopping secondary CPUs
> Kernel Offset: 0x12c4000000 from 0xffffffc010000000
> PHYS_OFFSET: 0xffffffe580000000
> CPU features: 0x08240002,2188200c
> Memory Limit: none
> 
> Fixes: 655b86e52eac ("media: mediatek: vcodec: Fix possible invalid memory access for decoder")
> Signed-off-by: Yunfei Dong <yunfei.dong@mediatek.com>

Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Nicolas Dufresne Feb. 23, 2024, 6:11 p.m. UTC | #2
Hi Yunfei,

Le jeudi 22 février 2024 à 17:26 +0800, Yunfei Dong a écrit :
> The ctx_list will be deleted when scp getting unexpected behavior, then the
> ctx_list->next will be NULL, the kernel driver maybe access NULL pointer in
> function vpu_dec_ipi_handler when going through each context, then reboot.
> 
> Need to add lock to protect the ctx_list to make sure the ctx_list->next isn't
> NULL pointer.
> 
> Hardware name: Google juniper sku16 board (DT)
> pstate: 20400005 (nzCv daif +PAN -UAO -TCO BTYPE=--)
> pc : vpu_dec_ipi_handler+0x58/0x1f8 [mtk_vcodec_dec]
> lr : scp_ipi_handler+0xd0/0x194 [mtk_scp]
> sp : ffffffc0131dbbd0
> x29: ffffffc0131dbbd0 x28: 0000000000000000
> x27: ffffff9bb277f348 x26: ffffff9bb242ad00
> x25: ffffffd2d440d3b8 x24: ffffffd2a13ff1d4
> x23: ffffff9bb7fe85a0 x22: ffffffc0133fbdb0
> x21: 0000000000000010 x20: ffffff9b050ea328
> x19: ffffffc0131dbc08 x18: 0000000000001000
> x17: 0000000000000000 x16: ffffffd2d461c6e0
> x15: 0000000000000242 x14: 000000000000018f
> x13: 000000000000004d x12: 0000000000000000
> x11: 0000000000000001 x10: fffffffffffffff0
> x9 : ffffff9bb6e793a8 x8 : 0000000000000000
> x7 : 0000000000000000 x6 : 000000000000003f
> x5 : 0000000000000040 x4 : fffffffffffffff0
> x3 : 0000000000000020 x2 : ffffff9bb6e79080
> x1 : 0000000000000010 x0 : ffffffc0131dbc08
> Call trace:
> vpu_dec_ipi_handler+0x58/0x1f8 [mtk_vcodec_dec (HASH:6c3f 2)]
> scp_ipi_handler+0xd0/0x194 [mtk_scp (HASH:7046 3)]
> mt8183_scp_irq_handler+0x44/0x88 [mtk_scp (HASH:7046 3)]
> scp_irq_handler+0x48/0x90 [mtk_scp (HASH:7046 3)]
> irq_thread_fn+0x38/0x94
> irq_thread+0x100/0x1c0
> kthread+0x140/0x1fc
> ret_from_fork+0x10/0x30
> Code: 54000088 f94ca50a eb14015f 54000060 (f9400108)
> ---[ end trace ace43ce36cbd5c93 ]---
> Kernel panic - not syncing: Oops: Fatal exception
> SMP: stopping secondary CPUs
> Kernel Offset: 0x12c4000000 from 0xffffffc010000000
> PHYS_OFFSET: 0xffffffe580000000
> CPU features: 0x08240002,2188200c
> Memory Limit: none
> 
> Fixes: 655b86e52eac ("media: mediatek: vcodec: Fix possible invalid memory access for decoder")
> Signed-off-by: Yunfei Dong <yunfei.dong@mediatek.com>

I've been experiencing this crasher recently, so nice to see you found the
problem.

Reviewed-by: Nicolas Dufresne <nicolas.dufresne@collabora.com>

> ---
>  .../platform/mediatek/vcodec/common/mtk_vcodec_fw_vpu.c      | 4 ++--
>  .../platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.c    | 5 +++++
>  .../platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.h    | 2 ++
>  drivers/media/platform/mediatek/vcodec/decoder/vdec_vpu_if.c | 2 ++
>  4 files changed, 11 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/media/platform/mediatek/vcodec/common/mtk_vcodec_fw_vpu.c b/drivers/media/platform/mediatek/vcodec/common/mtk_vcodec_fw_vpu.c
> index 9f6e4b59455da..9a11a2c248045 100644
> --- a/drivers/media/platform/mediatek/vcodec/common/mtk_vcodec_fw_vpu.c
> +++ b/drivers/media/platform/mediatek/vcodec/common/mtk_vcodec_fw_vpu.c
> @@ -58,12 +58,12 @@ static void mtk_vcodec_vpu_reset_dec_handler(void *priv)
>  
>  	dev_err(&dev->plat_dev->dev, "Watchdog timeout!!");
>  
> -	mutex_lock(&dev->dev_mutex);
> +	mutex_lock(&dev->dev_ctx_lock);
>  	list_for_each_entry(ctx, &dev->ctx_list, list) {
>  		ctx->state = MTK_STATE_ABORT;
>  		mtk_v4l2_vdec_dbg(0, ctx, "[%d] Change to state MTK_STATE_ABORT", ctx->id);
>  	}
> -	mutex_unlock(&dev->dev_mutex);
> +	mutex_unlock(&dev->dev_ctx_lock);
>  }
>  
>  static void mtk_vcodec_vpu_reset_enc_handler(void *priv)
> diff --git a/drivers/media/platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.c b/drivers/media/platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.c
> index ad9b68380692f..d69c9fe2af6f3 100644
> --- a/drivers/media/platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.c
> +++ b/drivers/media/platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.c
> @@ -267,7 +267,9 @@ static int fops_vcodec_open(struct file *file)
>  
>  	ctx->dev->vdec_pdata->init_vdec_params(ctx);
>  
> +	mutex_lock(&dev->dev_ctx_lock);
>  	list_add(&ctx->list, &dev->ctx_list);
> +	mutex_unlock(&dev->dev_ctx_lock);
>  	mtk_vcodec_dbgfs_create(ctx);
>  
>  	mutex_unlock(&dev->dev_mutex);
> @@ -310,7 +312,9 @@ static int fops_vcodec_release(struct file *file)
>  	v4l2_ctrl_handler_free(&ctx->ctrl_hdl);
>  
>  	mtk_vcodec_dbgfs_remove(dev, ctx->id);
> +	mutex_lock(&dev->dev_ctx_lock);
>  	list_del_init(&ctx->list);
> +	mutex_unlock(&dev->dev_ctx_lock);
>  	kfree(ctx);
>  	mutex_unlock(&dev->dev_mutex);
>  	return 0;
> @@ -403,6 +407,7 @@ static int mtk_vcodec_probe(struct platform_device *pdev)
>  	for (i = 0; i < MTK_VDEC_HW_MAX; i++)
>  		mutex_init(&dev->dec_mutex[i]);
>  	mutex_init(&dev->dev_mutex);
> +	mutex_init(&dev->dev_ctx_lock);
>  	spin_lock_init(&dev->irqlock);
>  
>  	snprintf(dev->v4l2_dev.name, sizeof(dev->v4l2_dev.name), "%s",
> diff --git a/drivers/media/platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.h b/drivers/media/platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.h
> index 849b89dd205c2..85b2c0d3d8bcd 100644
> --- a/drivers/media/platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.h
> +++ b/drivers/media/platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.h
> @@ -241,6 +241,7 @@ struct mtk_vcodec_dec_ctx {
>   *
>   * @dec_mutex: decoder hardware lock
>   * @dev_mutex: video_device lock
> + * @dev_ctx_lock: the lock of context list
>   * @decode_workqueue: decode work queue
>   *
>   * @irqlock: protect data access by irq handler and work thread
> @@ -282,6 +283,7 @@ struct mtk_vcodec_dec_dev {
>  	/* decoder hardware mutex lock */
>  	struct mutex dec_mutex[MTK_VDEC_HW_MAX];
>  	struct mutex dev_mutex;
> +	struct mutex dev_ctx_lock;
>  	struct workqueue_struct *decode_workqueue;
>  
>  	spinlock_t irqlock;
> diff --git a/drivers/media/platform/mediatek/vcodec/decoder/vdec_vpu_if.c b/drivers/media/platform/mediatek/vcodec/decoder/vdec_vpu_if.c
> index 82e57ae983d55..da6be556727bb 100644
> --- a/drivers/media/platform/mediatek/vcodec/decoder/vdec_vpu_if.c
> +++ b/drivers/media/platform/mediatek/vcodec/decoder/vdec_vpu_if.c
> @@ -77,12 +77,14 @@ static bool vpu_dec_check_ap_inst(struct mtk_vcodec_dec_dev *dec_dev, struct vde
>  	struct mtk_vcodec_dec_ctx *ctx;
>  	int ret = false;
>  
> +	mutex_lock(&dec_dev->dev_ctx_lock);
>  	list_for_each_entry(ctx, &dec_dev->ctx_list, list) {
>  		if (!IS_ERR_OR_NULL(ctx) && ctx->vpu_inst == vpu) {
>  			ret = true;
>  			break;
>  		}
>  	}
> +	mutex_unlock(&dec_dev->dev_ctx_lock);
>  
>  	return ret;
>  }
Nicolas Dufresne Feb. 26, 2024, 7:39 p.m. UTC | #3
Hi,

Le jeudi 22 février 2024 à 17:26 +0800, Yunfei Dong a écrit :
> The ctx_list will be deleted when scp getting unexpected behavior, then the
> ctx_list->next will be NULL, the kernel driver maybe access NULL pointer in
> function vpu_dec_ipi_handler when going through each context, then reboot.
> 
> Need to add lock to protect the ctx_list to make sure the ctx_list->next isn't
> NULL pointer.

The cited crash no longer occurs for me, but it still sometimes crashes while
the SCP is being rebooted. I think this patch can still go in, as it improves
the situation overall.

Meanwhile, here's my stress test using GStreamer and a stream downloaded by
fluster. I call this script a few times this way, as it does not always crash.
The test just keeps starting decode sessions and terminates them after 2
seconds. It is highly parallel. Using too low a number does not reproduce the
crash; using too high a number leads to alloc failures, which wasn't the goal
of this test.

./mtk-vcodec-crash.sh 100

Script code:
***
#!/bin/bash
#
# Stress reproducer: starts $1 short-lived GStreamer pipelines in parallel.
# Usage: ./mtk-vcodec-crash.sh <count>
#
# The input file TILES_B_Cisco_1.bin is opened via a relative path, so it must
# be present in the current working directory.

# Run one pipeline in the background, let it run for 2 seconds, then kill it.
# NOTE(review): a function named "test" shadows the shell's `test` builtin for
# the rest of this script; nothing else here invokes `test`.
test() {
	gst-launch-1.0 --no-fault filesrc location=TILES_B_Cisco_1.bin ! h265parse ! v4l2slh265dec ! fakevideosink &
	# $! is the PID of the background job started on the previous line.
	pid=$!

	sleep 2
	# No signal is specified, so kill sends its default signal (SIGTERM).
	kill $pid
}

# Launch $1 instances of test concurrently, each in its own background subshell.
# NOTE(review): $1 is unquoted and unchecked; if it is omitted, `seq 1` yields
# a single iteration.
for i in $(seq 1 $1)
do
	test &
done

# Block until every background instance started above has finished.
wait
***

The kernel Crash:
[   93.261248] Unable to handle kernel NULL pointer dereference at virtual
address 0000000000000008
[   93.270056] Mem abort info:
[   93.272880]   ESR = 0x0000000096000004
[   93.276804]   EC = 0x25: DABT (current EL), IL = 32 bits
[   93.282233]   SET = 0, FnV = 0
[   93.285372]   EA = 0, S1PTW = 0
[   93.288561]   FSC = 0x04: level 0 translation fault
[   93.293493] Data abort info:
[   93.296424]   ISV = 0, ISS = 0x00000004, ISS2 = 0x00000000
[   93.301920]   CM = 0, WnR = 0, TnD = 0, TagAccess = 0
[   93.306977]   GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0
[   93.312321] user pgtable: 4k pages, 48-bit VAs, pgdp=0000000168daf000
[   93.318790] [0000000000000008] pgd=0000000000000000, p4d=0000000000000000
[   93.325588] Internal error: Oops: 0000000096000004 [#1] PREEMPT SMP
[   93.331842] Modules linked in: mt7921e mt7921_common mt792x_lib
mt76_connac_lib mt76 mac80211 btusb btintel mtk_vcodec_dec_hw btmtk btrtl
mtk_vcodec_dec btbcm cfg80211 bluetooth snd_sof_mt8195 mtk_vcodec_enc
mtk_adsp_common uvcvideo v4l2_vp9 snd_sof_xtensa_dsp v4l2_h264 mtk_vcodec_dbgfs
snd_sof_of snd_sof ecdh_generic mtk_vcodec_common ecc uvc elan_i2c
videobuf2_vmalloc crct10dif_ce cros_ec_lid_angle cros_ec_sensors snd_sof_utils
cros_ec_sensors_core cros_usbpd_logger cros_usbpd_charger fuse ip_tables ipv6
[   93.376652] CPU: 5 PID: 3210 Comm: h265parse0:sink Tainted: G        W      
6.8.0-rc4-next-20240212+ #14
[   93.386463] Hardware name: Acer Tomato (rev3 - 4) board (DT)
[   93.392107] pstate: 80400009 (Nzcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
[   93.399054] pc : vcodec_vpu_send_msg+0x4c/0x190 [mtk_vcodec_dec]
[   93.405058] lr : vcodec_send_ap_ipi+0x78/0x170 [mtk_vcodec_dec]
[   93.410968] sp : ffff80008750bc20
[   93.414269] x29: ffff80008750bc20 x28: ffff1299f6d70000 x27: 0000000000000000
[   93.421391] x26: 0000000000000000 x25: 0000000000000000 x24: 0000000000000000
[   93.428512] x23: ffff80008750bc98 x22: 000000000000a003 x21: ffffd45c4cfae000
[   93.435632] x20: 0000000000000010 x19: ffff1299fd668310 x18: 000000000000001a
[   93.442753] x17: 000000040044ffff x16: ffffd45cb15dc648 x15: 0000000000000000
[   93.449874] x14: ffff1299c08da1c0 x13: ffffd45cb1f87a10 x12: ffffd45cb2f5fe80
[   93.456995] x11: 0000000000000001 x10: 0000000000001b30 x9 : ffffd45c4d12b488
[   93.464116] x8 : 1fffe25339380d81 x7 : 0000000000000001 x6 : ffff1299c9c06c00
[   93.471236] x5 : 0000000000000132 x4 : 0000000000000000 x3 : 0000000000000000
[   93.478358] x2 : 0000000000000010 x1 : ffff80008750bc98 x0 : 0000000000000000
[   93.485479] Call trace:
[   93.487914]  vcodec_vpu_send_msg+0x4c/0x190 [mtk_vcodec_dec]
[   93.493563]  vcodec_send_ap_ipi+0x78/0x170 [mtk_vcodec_dec]
[   93.499125]  vpu_dec_deinit+0x1c/0x30 [mtk_vcodec_dec]
[   93.504254]  vdec_hevc_slice_deinit+0x30/0x98 [mtk_vcodec_dec]
[   93.510076]  vdec_if_deinit+0x38/0x68 [mtk_vcodec_dec]
[   93.515205]  mtk_vcodec_dec_release+0x20/0x40 [mtk_vcodec_dec]
[   93.521027]  fops_vcodec_release+0x64/0x118 [mtk_vcodec_dec]
[   93.526677]  v4l2_release+0x7c/0x100
[   93.530245]  __fput+0x80/0x2d8
[   93.533292]  __fput_sync+0x58/0x70
[   93.536681]  __arm64_sys_close+0x40/0x90
[   93.540590]  invoke_syscall+0x50/0x128
[   93.544329]  el0_svc_common.constprop.0+0x48/0xf0
[   93.549020]  do_el0_svc+0x24/0x38
[   93.552323]  el0_svc+0x38/0xd8
[   93.555367]  el0t_64_sync_handler+0xc0/0xc8
[   93.559537]  el0t_64_sync+0x1a8/0x1b0
[   93.563189] Code: d503201f f9401660 b900127f b900227f (f9400400) 
[   93.569268] ---[ end trace 0000000000000000 ]---
Nicolas Dufresne Feb. 26, 2024, 9:32 p.m. UTC | #4
Hi Yunfei,

Le lundi 26 février 2024 à 14:39 -0500, Nicolas Dufresne a écrit :
> Hi,
> 
> Le jeudi 22 février 2024 à 17:26 +0800, Yunfei Dong a écrit :
> > The ctx_list will be deleted when scp getting unexpected behavior, then the
> > ctx_list->next will be NULL, the kernel driver maybe access NULL pointer in
> > function vpu_dec_ipi_handler when going through each context, then reboot.
> > 
> > Need to add lock to protect the ctx_list to make sure the ctx_list->next isn't
> > NULL pointer.
> 
> The cited crash no longer occurs for me, but it still sometimes crashes while
> the SCP being rebooted. I think this patch can still go in, as it overall
> improves the situation.
> 
> Meanwhile, here's my stress test using GStreamer and stream downloaded by
> fluster. I call this script few times this way as it does not always crash. The
> test just keep starting decode sessions and terminate them after 2 seconds. It
> is highly parallel. Using too low number does not reproduce the crash, using too
> high number leads to alloc failure, which wasn't the goal of this test.

I just sent a fix for that crash; it was limited to HEVC.

https://lore.kernel.org/all/20240226211954.400891-1-nicolas.dufresne@collabora.com/

With this applied, the kernel no longer crashes. But the SCP gets reset every
time I run the script below. Will you be able to provide a firmware (or driver,
if that turns out to be the issue) fix for this?

regards,
Nicolas

> 
> ./mtk-vcodec-crash.sh 100

> 
> Script code:
> ***
> #!/bin/bash
> 
> test() {
> 	gst-launch-1.0 --no-fault filesrc location=TILES_B_Cisco_1.bin ! h265parse ! v4l2slh265dec ! fakevideosink &
> 	pid=$!
> 
> 	sleep 2
> 	kill $pid
> }
> 
> for i in $(seq 1 $1)
> do
> 	test &
> done
> 
> wait
> ***
> 
> The kernel Crash:
> [   93.261248] Unable to handle kernel NULL pointer dereference at virtual
> address 0000000000000008
> [   93.270056] Mem abort info:
> [   93.272880]   ESR = 0x0000000096000004
> [   93.276804]   EC = 0x25: DABT (current EL), IL = 32 bits
> [   93.282233]   SET = 0, FnV = 0
> [   93.285372]   EA = 0, S1PTW = 0
> [   93.288561]   FSC = 0x04: level 0 translation fault
> [   93.293493] Data abort info:
> [   93.296424]   ISV = 0, ISS = 0x00000004, ISS2 = 0x00000000
> [   93.301920]   CM = 0, WnR = 0, TnD = 0, TagAccess = 0
> [   93.306977]   GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0
> [   93.312321] user pgtable: 4k pages, 48-bit VAs, pgdp=0000000168daf000
> [   93.318790] [0000000000000008] pgd=0000000000000000, p4d=0000000000000000
> [   93.325588] Internal error: Oops: 0000000096000004 [#1] PREEMPT SMP
> [   93.331842] Modules linked in: mt7921e mt7921_common mt792x_lib
> mt76_connac_lib mt76 mac80211 btusb btintel mtk_vcodec_dec_hw btmtk btrtl
> mtk_vcodec_dec btbcm cfg80211 bluetooth snd_sof_mt8195 mtk_vcodec_enc
> mtk_adsp_common uvcvideo v4l2_vp9 snd_sof_xtensa_dsp v4l2_h264 mtk_vcodec_dbgfs
> snd_sof_of snd_sof ecdh_generic mtk_vcodec_common ecc uvc elan_i2c
> videobuf2_vmalloc crct10dif_ce cros_ec_lid_angle cros_ec_sensors snd_sof_utils
> cros_ec_sensors_core cros_usbpd_logger cros_usbpd_charger fuse ip_tables ipv6
> [   93.376652] CPU: 5 PID: 3210 Comm: h265parse0:sink Tainted: G        W      
> 6.8.0-rc4-next-20240212+ #14
> [   93.386463] Hardware name: Acer Tomato (rev3 - 4) board (DT)
> [   93.392107] pstate: 80400009 (Nzcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
> [   93.399054] pc : vcodec_vpu_send_msg+0x4c/0x190 [mtk_vcodec_dec]
> [   93.405058] lr : vcodec_send_ap_ipi+0x78/0x170 [mtk_vcodec_dec]
> [   93.410968] sp : ffff80008750bc20
> [   93.414269] x29: ffff80008750bc20 x28: ffff1299f6d70000 x27: 0000000000000000
> [   93.421391] x26: 0000000000000000 x25: 0000000000000000 x24: 0000000000000000
> [   93.428512] x23: ffff80008750bc98 x22: 000000000000a003 x21: ffffd45c4cfae000
> [   93.435632] x20: 0000000000000010 x19: ffff1299fd668310 x18: 000000000000001a
> [   93.442753] x17: 000000040044ffff x16: ffffd45cb15dc648 x15: 0000000000000000
> [   93.449874] x14: ffff1299c08da1c0 x13: ffffd45cb1f87a10 x12: ffffd45cb2f5fe80
> [   93.456995] x11: 0000000000000001 x10: 0000000000001b30 x9 : ffffd45c4d12b488
> [   93.464116] x8 : 1fffe25339380d81 x7 : 0000000000000001 x6 : ffff1299c9c06c00
> [   93.471236] x5 : 0000000000000132 x4 : 0000000000000000 x3 : 0000000000000000
> [   93.478358] x2 : 0000000000000010 x1 : ffff80008750bc98 x0 : 0000000000000000
> [   93.485479] Call trace:
> [   93.487914]  vcodec_vpu_send_msg+0x4c/0x190 [mtk_vcodec_dec]
> [   93.493563]  vcodec_send_ap_ipi+0x78/0x170 [mtk_vcodec_dec]
> [   93.499125]  vpu_dec_deinit+0x1c/0x30 [mtk_vcodec_dec]
> [   93.504254]  vdec_hevc_slice_deinit+0x30/0x98 [mtk_vcodec_dec]
> [   93.510076]  vdec_if_deinit+0x38/0x68 [mtk_vcodec_dec]
> [   93.515205]  mtk_vcodec_dec_release+0x20/0x40 [mtk_vcodec_dec]
> [   93.521027]  fops_vcodec_release+0x64/0x118 [mtk_vcodec_dec]
> [   93.526677]  v4l2_release+0x7c/0x100
> [   93.530245]  __fput+0x80/0x2d8
> [   93.533292]  __fput_sync+0x58/0x70
> [   93.536681]  __arm64_sys_close+0x40/0x90
> [   93.540590]  invoke_syscall+0x50/0x128
> [   93.544329]  el0_svc_common.constprop.0+0x48/0xf0
> [   93.549020]  do_el0_svc+0x24/0x38
> [   93.552323]  el0_svc+0x38/0xd8
> [   93.555367]  el0t_64_sync_handler+0xc0/0xc8
> [   93.559537]  el0t_64_sync+0x1a8/0x1b0
> [   93.563189] Code: d503201f f9401660 b900127f b900227f (f9400400) 
> [   93.569268] ---[ end trace 0000000000000000 ]---
>
Sebastian Fricke March 13, 2024, 1:46 p.m. UTC | #5
Hey Yunfei,

On 22.02.2024 17:26, Yunfei Dong wrote:
>The ctx_list will be deleted when scp getting unexpected behavior, then the
>ctx_list->next will be NULL, the kernel driver maybe access NULL pointer in
>function vpu_dec_ipi_handler when going through each context, then reboot.

>
>Need to add lock to protect the ctx_list to make sure the ctx_list->next isn't
>NULL pointer.

Same here as with the encoder changes.

Greetings,
Sebastian

>
>Hardware name: Google juniper sku16 board (DT)
>pstate: 20400005 (nzCv daif +PAN -UAO -TCO BTYPE=--)
>pc : vpu_dec_ipi_handler+0x58/0x1f8 [mtk_vcodec_dec]
>lr : scp_ipi_handler+0xd0/0x194 [mtk_scp]
>sp : ffffffc0131dbbd0
>x29: ffffffc0131dbbd0 x28: 0000000000000000
>x27: ffffff9bb277f348 x26: ffffff9bb242ad00
>x25: ffffffd2d440d3b8 x24: ffffffd2a13ff1d4
>x23: ffffff9bb7fe85a0 x22: ffffffc0133fbdb0
>x21: 0000000000000010 x20: ffffff9b050ea328
>x19: ffffffc0131dbc08 x18: 0000000000001000
>x17: 0000000000000000 x16: ffffffd2d461c6e0
>x15: 0000000000000242 x14: 000000000000018f
>x13: 000000000000004d x12: 0000000000000000
>x11: 0000000000000001 x10: fffffffffffffff0
>x9 : ffffff9bb6e793a8 x8 : 0000000000000000
>x7 : 0000000000000000 x6 : 000000000000003f
>x5 : 0000000000000040 x4 : fffffffffffffff0
>x3 : 0000000000000020 x2 : ffffff9bb6e79080
>x1 : 0000000000000010 x0 : ffffffc0131dbc08
>Call trace:
>vpu_dec_ipi_handler+0x58/0x1f8 [mtk_vcodec_dec (HASH:6c3f 2)]
>scp_ipi_handler+0xd0/0x194 [mtk_scp (HASH:7046 3)]
>mt8183_scp_irq_handler+0x44/0x88 [mtk_scp (HASH:7046 3)]
>scp_irq_handler+0x48/0x90 [mtk_scp (HASH:7046 3)]
>irq_thread_fn+0x38/0x94
>irq_thread+0x100/0x1c0
>kthread+0x140/0x1fc
>ret_from_fork+0x10/0x30
>Code: 54000088 f94ca50a eb14015f 54000060 (f9400108)
>---[ end trace ace43ce36cbd5c93 ]---
>Kernel panic - not syncing: Oops: Fatal exception
>SMP: stopping secondary CPUs
>Kernel Offset: 0x12c4000000 from 0xffffffc010000000
>PHYS_OFFSET: 0xffffffe580000000
>CPU features: 0x08240002,2188200c
>Memory Limit: none
>
>Fixes: 655b86e52eac ("media: mediatek: vcodec: Fix possible invalid memory access for decoder")
>Signed-off-by: Yunfei Dong <yunfei.dong@mediatek.com>
>---
> .../platform/mediatek/vcodec/common/mtk_vcodec_fw_vpu.c      | 4 ++--
> .../platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.c    | 5 +++++
> .../platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.h    | 2 ++
> drivers/media/platform/mediatek/vcodec/decoder/vdec_vpu_if.c | 2 ++
> 4 files changed, 11 insertions(+), 2 deletions(-)
>
>diff --git a/drivers/media/platform/mediatek/vcodec/common/mtk_vcodec_fw_vpu.c b/drivers/media/platform/mediatek/vcodec/common/mtk_vcodec_fw_vpu.c
>index 9f6e4b59455da..9a11a2c248045 100644
>--- a/drivers/media/platform/mediatek/vcodec/common/mtk_vcodec_fw_vpu.c
>+++ b/drivers/media/platform/mediatek/vcodec/common/mtk_vcodec_fw_vpu.c
>@@ -58,12 +58,12 @@ static void mtk_vcodec_vpu_reset_dec_handler(void *priv)
>
> 	dev_err(&dev->plat_dev->dev, "Watchdog timeout!!");
>
>-	mutex_lock(&dev->dev_mutex);
>+	mutex_lock(&dev->dev_ctx_lock);
> 	list_for_each_entry(ctx, &dev->ctx_list, list) {
> 		ctx->state = MTK_STATE_ABORT;
> 		mtk_v4l2_vdec_dbg(0, ctx, "[%d] Change to state MTK_STATE_ABORT", ctx->id);
> 	}
>-	mutex_unlock(&dev->dev_mutex);
>+	mutex_unlock(&dev->dev_ctx_lock);
> }
>
> static void mtk_vcodec_vpu_reset_enc_handler(void *priv)
>diff --git a/drivers/media/platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.c b/drivers/media/platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.c
>index ad9b68380692f..d69c9fe2af6f3 100644
>--- a/drivers/media/platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.c
>+++ b/drivers/media/platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.c
>@@ -267,7 +267,9 @@ static int fops_vcodec_open(struct file *file)
>
> 	ctx->dev->vdec_pdata->init_vdec_params(ctx);
>
>+	mutex_lock(&dev->dev_ctx_lock);
> 	list_add(&ctx->list, &dev->ctx_list);
>+	mutex_unlock(&dev->dev_ctx_lock);
> 	mtk_vcodec_dbgfs_create(ctx);
>
> 	mutex_unlock(&dev->dev_mutex);
>@@ -310,7 +312,9 @@ static int fops_vcodec_release(struct file *file)
> 	v4l2_ctrl_handler_free(&ctx->ctrl_hdl);
>
> 	mtk_vcodec_dbgfs_remove(dev, ctx->id);
>+	mutex_lock(&dev->dev_ctx_lock);
> 	list_del_init(&ctx->list);
>+	mutex_unlock(&dev->dev_ctx_lock);
> 	kfree(ctx);
> 	mutex_unlock(&dev->dev_mutex);
> 	return 0;
>@@ -403,6 +407,7 @@ static int mtk_vcodec_probe(struct platform_device *pdev)
> 	for (i = 0; i < MTK_VDEC_HW_MAX; i++)
> 		mutex_init(&dev->dec_mutex[i]);
> 	mutex_init(&dev->dev_mutex);
>+	mutex_init(&dev->dev_ctx_lock);
> 	spin_lock_init(&dev->irqlock);
>
> 	snprintf(dev->v4l2_dev.name, sizeof(dev->v4l2_dev.name), "%s",
>diff --git a/drivers/media/platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.h b/drivers/media/platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.h
>index 849b89dd205c2..85b2c0d3d8bcd 100644
>--- a/drivers/media/platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.h
>+++ b/drivers/media/platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.h
>@@ -241,6 +241,7 @@ struct mtk_vcodec_dec_ctx {
>  *
>  * @dec_mutex: decoder hardware lock
>  * @dev_mutex: video_device lock
>+ * @dev_ctx_lock: the lock of context list
>  * @decode_workqueue: decode work queue
>  *
>  * @irqlock: protect data access by irq handler and work thread
>@@ -282,6 +283,7 @@ struct mtk_vcodec_dec_dev {
> 	/* decoder hardware mutex lock */
> 	struct mutex dec_mutex[MTK_VDEC_HW_MAX];
> 	struct mutex dev_mutex;
>+	struct mutex dev_ctx_lock;
> 	struct workqueue_struct *decode_workqueue;
>
> 	spinlock_t irqlock;
>diff --git a/drivers/media/platform/mediatek/vcodec/decoder/vdec_vpu_if.c b/drivers/media/platform/mediatek/vcodec/decoder/vdec_vpu_if.c
>index 82e57ae983d55..da6be556727bb 100644
>--- a/drivers/media/platform/mediatek/vcodec/decoder/vdec_vpu_if.c
>+++ b/drivers/media/platform/mediatek/vcodec/decoder/vdec_vpu_if.c
>@@ -77,12 +77,14 @@ static bool vpu_dec_check_ap_inst(struct mtk_vcodec_dec_dev *dec_dev, struct vde
> 	struct mtk_vcodec_dec_ctx *ctx;
> 	int ret = false;
>
>+	mutex_lock(&dec_dev->dev_ctx_lock);
> 	list_for_each_entry(ctx, &dec_dev->ctx_list, list) {
> 		if (!IS_ERR_OR_NULL(ctx) && ctx->vpu_inst == vpu) {
> 			ret = true;
> 			break;
> 		}
> 	}
>+	mutex_unlock(&dec_dev->dev_ctx_lock);
>
> 	return ret;
> }
>-- 
>2.18.0
>
>
diff mbox series

Patch

diff --git a/drivers/media/platform/mediatek/vcodec/common/mtk_vcodec_fw_vpu.c b/drivers/media/platform/mediatek/vcodec/common/mtk_vcodec_fw_vpu.c
index 9f6e4b59455da..9a11a2c248045 100644
--- a/drivers/media/platform/mediatek/vcodec/common/mtk_vcodec_fw_vpu.c
+++ b/drivers/media/platform/mediatek/vcodec/common/mtk_vcodec_fw_vpu.c
@@ -58,12 +58,12 @@  static void mtk_vcodec_vpu_reset_dec_handler(void *priv)
 
 	dev_err(&dev->plat_dev->dev, "Watchdog timeout!!");
 
-	mutex_lock(&dev->dev_mutex);
+	mutex_lock(&dev->dev_ctx_lock);
 	list_for_each_entry(ctx, &dev->ctx_list, list) {
 		ctx->state = MTK_STATE_ABORT;
 		mtk_v4l2_vdec_dbg(0, ctx, "[%d] Change to state MTK_STATE_ABORT", ctx->id);
 	}
-	mutex_unlock(&dev->dev_mutex);
+	mutex_unlock(&dev->dev_ctx_lock);
 }
 
 static void mtk_vcodec_vpu_reset_enc_handler(void *priv)
diff --git a/drivers/media/platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.c b/drivers/media/platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.c
index ad9b68380692f..d69c9fe2af6f3 100644
--- a/drivers/media/platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.c
+++ b/drivers/media/platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.c
@@ -267,7 +267,9 @@  static int fops_vcodec_open(struct file *file)
 
 	ctx->dev->vdec_pdata->init_vdec_params(ctx);
 
+	mutex_lock(&dev->dev_ctx_lock);
 	list_add(&ctx->list, &dev->ctx_list);
+	mutex_unlock(&dev->dev_ctx_lock);
 	mtk_vcodec_dbgfs_create(ctx);
 
 	mutex_unlock(&dev->dev_mutex);
@@ -310,7 +312,9 @@  static int fops_vcodec_release(struct file *file)
 	v4l2_ctrl_handler_free(&ctx->ctrl_hdl);
 
 	mtk_vcodec_dbgfs_remove(dev, ctx->id);
+	mutex_lock(&dev->dev_ctx_lock);
 	list_del_init(&ctx->list);
+	mutex_unlock(&dev->dev_ctx_lock);
 	kfree(ctx);
 	mutex_unlock(&dev->dev_mutex);
 	return 0;
@@ -403,6 +407,7 @@  static int mtk_vcodec_probe(struct platform_device *pdev)
 	for (i = 0; i < MTK_VDEC_HW_MAX; i++)
 		mutex_init(&dev->dec_mutex[i]);
 	mutex_init(&dev->dev_mutex);
+	mutex_init(&dev->dev_ctx_lock);
 	spin_lock_init(&dev->irqlock);
 
 	snprintf(dev->v4l2_dev.name, sizeof(dev->v4l2_dev.name), "%s",
diff --git a/drivers/media/platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.h b/drivers/media/platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.h
index 849b89dd205c2..85b2c0d3d8bcd 100644
--- a/drivers/media/platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.h
+++ b/drivers/media/platform/mediatek/vcodec/decoder/mtk_vcodec_dec_drv.h
@@ -241,6 +241,7 @@  struct mtk_vcodec_dec_ctx {
  *
  * @dec_mutex: decoder hardware lock
  * @dev_mutex: video_device lock
+ * @dev_ctx_lock: the lock of context list
  * @decode_workqueue: decode work queue
  *
  * @irqlock: protect data access by irq handler and work thread
@@ -282,6 +283,7 @@  struct mtk_vcodec_dec_dev {
 	/* decoder hardware mutex lock */
 	struct mutex dec_mutex[MTK_VDEC_HW_MAX];
 	struct mutex dev_mutex;
+	struct mutex dev_ctx_lock;
 	struct workqueue_struct *decode_workqueue;
 
 	spinlock_t irqlock;
diff --git a/drivers/media/platform/mediatek/vcodec/decoder/vdec_vpu_if.c b/drivers/media/platform/mediatek/vcodec/decoder/vdec_vpu_if.c
index 82e57ae983d55..da6be556727bb 100644
--- a/drivers/media/platform/mediatek/vcodec/decoder/vdec_vpu_if.c
+++ b/drivers/media/platform/mediatek/vcodec/decoder/vdec_vpu_if.c
@@ -77,12 +77,14 @@  static bool vpu_dec_check_ap_inst(struct mtk_vcodec_dec_dev *dec_dev, struct vde
 	struct mtk_vcodec_dec_ctx *ctx;
 	int ret = false;
 
+	mutex_lock(&dec_dev->dev_ctx_lock);
 	list_for_each_entry(ctx, &dec_dev->ctx_list, list) {
 		if (!IS_ERR_OR_NULL(ctx) && ctx->vpu_inst == vpu) {
 			ret = true;
 			break;
 		}
 	}
+	mutex_unlock(&dec_dev->dev_ctx_lock);
 
 	return ret;
 }