diff mbox series

[v3,1/3] mlxbf_gige: Fix kernel panic at shutdown

Message ID 20230922173626.23790-2-asmaa@nvidia.com (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Headers show
Series mlxbf_gige: Fix several bugs | expand

Checks

Context Check Description
netdev/tree_selection success Guessing tree name failed - patch did not apply

Commit Message

Asmaa Mnebhi Sept. 22, 2023, 5:36 p.m. UTC
There is a race condition happening during shutdown due to pending napi transactions.
Since mlxbf_gige_poll is still running, it tries to access a NULL pointer and as a
result causes a kernel panic:

[  284.074822] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000070
...
[  284.322326] Call trace:
[  284.324757]  mlxbf_gige_handle_tx_complete+0xc8/0x170 [mlxbf_gige]
[  284.330924]  mlxbf_gige_poll+0x54/0x160 [mlxbf_gige]
[  284.335876]  __napi_poll+0x40/0x1c8
[  284.339353]  net_rx_action+0x314/0x3a0
[  284.343086]  __do_softirq+0x128/0x334
[  284.346734]  run_ksoftirqd+0x54/0x6c
[  284.350294]  smpboot_thread_fn+0x14c/0x190
[  284.354375]  kthread+0x10c/0x110
[  284.357588]  ret_from_fork+0x10/0x20
[  284.361150] Code: 8b070000 f9000ea0 f95056c0 f86178a1 (b9407002)
[  284.367227] ---[ end trace a18340bbb9ea2fa7 ]---

To fix this, invoke mlxbf_gige_remove to disable and dequeue napi during shutdown,
and also return in the case where "priv" is NULL in the poll function.

Fixes: f92e1869d74e ("Add Mellanox BlueField Gigabit Ethernet driver")
Signed-off-by: Asmaa Mnebhi <asmaa@nvidia.com>
Reviewed-by: David Thompson <davthompson@nvidia.com>
---
v2->v3:
- Add the logic to clean the port to the remove() function
v1-v2:
- make mlxbf_gige_shutdown() the same as the mlxbf_gige_remove()

 .../mellanox/mlxbf_gige/mlxbf_gige_main.c     | 21 ++++++++-----------
 .../mellanox/mlxbf_gige/mlxbf_gige_rx.c       |  3 +++
 2 files changed, 12 insertions(+), 12 deletions(-)

Comments

Florian Fainelli Sept. 22, 2023, 5:44 p.m. UTC | #1
On 9/22/2023 10:36 AM, Asmaa Mnebhi wrote:
> There is a race condition happening during shutdown due to pending napi transactions.
> Since mlxbf_gige_poll is still running, it tries to access a NULL pointer and as a
> result causes a kernel panic:
> 
> [  284.074822] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000070
> ...
> [  284.322326] Call trace:
> [  284.324757]  mlxbf_gige_handle_tx_complete+0xc8/0x170 [mlxbf_gige]
> [  284.330924]  mlxbf_gige_poll+0x54/0x160 [mlxbf_gige]
> [  284.335876]  __napi_poll+0x40/0x1c8
> [  284.339353]  net_rx_action+0x314/0x3a0
> [  284.343086]  __do_softirq+0x128/0x334
> [  284.346734]  run_ksoftirqd+0x54/0x6c
> [  284.350294]  smpboot_thread_fn+0x14c/0x190
> [  284.354375]  kthread+0x10c/0x110
> [  284.357588]  ret_from_fork+0x10/0x20
> [  284.361150] Code: 8b070000 f9000ea0 f95056c0 f86178a1 (b9407002)
> [  284.367227] ---[ end trace a18340bbb9ea2fa7 ]---
> 
> To fix this, invoke mlxbf_gige_remove to disable and dequeue napi during shutdown,
> and also return in the case where "priv" is NULL in the poll function.
> 
> Fixes: f92e1869d74e ("Add Mellanox BlueField Gigabit Ethernet driver")
> Signed-off-by: Asmaa Mnebhi <asmaa@nvidia.com>
> Reviewed-by: David Thompson <davthompson@nvidia.com>
> ---
> v2->v3:
> - Add the logic to clean the port to the remove() function
> v1-v2:
> - make mlxbf_gige_shutdown() the same as the mlxbf_gige_remove()
> 
>   .../mellanox/mlxbf_gige/mlxbf_gige_main.c     | 21 ++++++++-----------
>   .../mellanox/mlxbf_gige/mlxbf_gige_rx.c       |  3 +++
>   2 files changed, 12 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c
> index 694de9513b9f..74185b02daa0 100644
> --- a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c
> +++ b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c
> @@ -471,24 +471,21 @@ static int mlxbf_gige_probe(struct platform_device *pdev)
>   	return err;
>   }
>   
> -static int mlxbf_gige_remove(struct platform_device *pdev)
> +static void mlxbf_gige_remove(struct platform_device *pdev)
>   {
>   	struct mlxbf_gige *priv = platform_get_drvdata(pdev);
>   
> +	if (!priv)
> +		return;
> +
> +	writeq(0, priv->base + MLXBF_GIGE_INT_EN);
> +        mlxbf_gige_clean_port(priv);
>   	unregister_netdev(priv->netdev);
>   	phy_disconnect(priv->netdev->phydev);
>   	mlxbf_gige_mdio_remove(priv);
> -	platform_set_drvdata(pdev, NULL);
> -
> -	return 0;
> -}
> -
> -static void mlxbf_gige_shutdown(struct platform_device *pdev)
> -{
> -	struct mlxbf_gige *priv = platform_get_drvdata(pdev);
> -
>   	writeq(0, priv->base + MLXBF_GIGE_INT_EN);
>   	mlxbf_gige_clean_port(priv);
> +	platform_set_drvdata(pdev, NULL);
>   }
>   
>   static const struct acpi_device_id __maybe_unused mlxbf_gige_acpi_match[] = {
> @@ -499,8 +496,8 @@ MODULE_DEVICE_TABLE(acpi, mlxbf_gige_acpi_match);
>   
>   static struct platform_driver mlxbf_gige_driver = {
>   	.probe = mlxbf_gige_probe,
> -	.remove = mlxbf_gige_remove,
> -	.shutdown = mlxbf_gige_shutdown,
> +	.remove_new = mlxbf_gige_remove,
> +	.shutdown = mlxbf_gige_remove,
>   	.driver = {
>   		.name = KBUILD_MODNAME,
>   		.acpi_match_table = ACPI_PTR(mlxbf_gige_acpi_match),
> diff --git a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_rx.c b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_rx.c
> index 0d5a41a2ae01..cfb8fb957f0c 100644
> --- a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_rx.c
> +++ b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_rx.c
> @@ -298,6 +298,9 @@ int mlxbf_gige_poll(struct napi_struct *napi, int budget)
>   
>   	priv = container_of(napi, struct mlxbf_gige, napi);
>   
> +	if (!priv)
> +		return 0;

Do you still need this test even after you unregistered the network 
device in your shutdown routine?
Asmaa Mnebhi Oct. 13, 2023, 2:58 p.m. UTC | #2
> >   	priv = container_of(napi, struct mlxbf_gige, napi);
> >
> > +	if (!priv)
> > +		return 0;
> 
> Do you still need this test even after you unregistered the network device in
> your shutdown routine?

always good to check variables but if want me to remove it I can.
Asmaa Mnebhi Jan. 5, 2024, 4:05 p.m. UTC | #3
> > >   	priv = container_of(napi, struct mlxbf_gige, napi);
> > >
> > > +	if (!priv)
> > > +		return 0;
> >
> > Do you still need this test even after you unregistered the network
> > device in your shutdown routine?
> 
> always good to check variables but if want me to remove it I can.

I will resubmit this patch with addressed comments after our QA has done more thorough testing. 

Thank you.
Asmaa
diff mbox series

Patch

diff --git a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c
index 694de9513b9f..74185b02daa0 100644
--- a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c
+++ b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c
@@ -471,24 +471,21 @@  static int mlxbf_gige_probe(struct platform_device *pdev)
 	return err;
 }
 
-static int mlxbf_gige_remove(struct platform_device *pdev)
+static void mlxbf_gige_remove(struct platform_device *pdev)
 {
 	struct mlxbf_gige *priv = platform_get_drvdata(pdev);
 
+	if (!priv)
+		return;
+
+	writeq(0, priv->base + MLXBF_GIGE_INT_EN);
+        mlxbf_gige_clean_port(priv);
 	unregister_netdev(priv->netdev);
 	phy_disconnect(priv->netdev->phydev);
 	mlxbf_gige_mdio_remove(priv);
-	platform_set_drvdata(pdev, NULL);
-
-	return 0;
-}
-
-static void mlxbf_gige_shutdown(struct platform_device *pdev)
-{
-	struct mlxbf_gige *priv = platform_get_drvdata(pdev);
-
 	writeq(0, priv->base + MLXBF_GIGE_INT_EN);
 	mlxbf_gige_clean_port(priv);
+	platform_set_drvdata(pdev, NULL);
 }
 
 static const struct acpi_device_id __maybe_unused mlxbf_gige_acpi_match[] = {
@@ -499,8 +496,8 @@  MODULE_DEVICE_TABLE(acpi, mlxbf_gige_acpi_match);
 
 static struct platform_driver mlxbf_gige_driver = {
 	.probe = mlxbf_gige_probe,
-	.remove = mlxbf_gige_remove,
-	.shutdown = mlxbf_gige_shutdown,
+	.remove_new = mlxbf_gige_remove,
+	.shutdown = mlxbf_gige_remove,
 	.driver = {
 		.name = KBUILD_MODNAME,
 		.acpi_match_table = ACPI_PTR(mlxbf_gige_acpi_match),
diff --git a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_rx.c b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_rx.c
index 0d5a41a2ae01..cfb8fb957f0c 100644
--- a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_rx.c
+++ b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_rx.c
@@ -298,6 +298,9 @@  int mlxbf_gige_poll(struct napi_struct *napi, int budget)
 
 	priv = container_of(napi, struct mlxbf_gige, napi);
 
+	if (!priv)
+		return 0;
+
 	mlxbf_gige_handle_tx_complete(priv);
 
 	do {