Message ID | 20230922173626.23790-2-asmaa@nvidia.com (mailing list archive) |
---|---|
State | Changes Requested |
Delegated to: | Netdev Maintainers |
Headers | show |
Series | mlxbf_gige: Fix several bugs | expand |
Context | Check | Description |
---|---|---|
netdev/tree_selection | success | Guessing tree name failed - patch did not apply |
On 9/22/2023 10:36 AM, Asmaa Mnebhi wrote: > There is a race condition happening during shutdown due to pending napi transactions. > Since mlxbf_gige_poll is still running, it tries to access a NULL pointer and as a > result causes a kernel panic: > > [ 284.074822] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000070 > ... > [ 284.322326] Call trace: > [ 284.324757] mlxbf_gige_handle_tx_complete+0xc8/0x170 [mlxbf_gige] > [ 284.330924] mlxbf_gige_poll+0x54/0x160 [mlxbf_gige] > [ 284.335876] __napi_poll+0x40/0x1c8 > [ 284.339353] net_rx_action+0x314/0x3a0 > [ 284.343086] __do_softirq+0x128/0x334 > [ 284.346734] run_ksoftirqd+0x54/0x6c > [ 284.350294] smpboot_thread_fn+0x14c/0x190 > [ 284.354375] kthread+0x10c/0x110 > [ 284.357588] ret_from_fork+0x10/0x20 > [ 284.361150] Code: 8b070000 f9000ea0 f95056c0 f86178a1 (b9407002) > [ 284.367227] ---[ end trace a18340bbb9ea2fa7 ]--- > > To fix this, invoke mlxbf_gige_remove to disable and dequeue napi during shutdown, > and also return in the case where "priv" is NULL in the poll function. > > Fixes: f92e1869d74e ("Add Mellanox BlueField Gigabit Ethernet driver") > Signed-off-by: Asmaa Mnebhi <asmaa@nvidia.com> > Reviewed-by: David Thompson <davthompson@nvidia.com> > --- > v2->v3: > - Add the logic to clean the port to the remove() function > v1-v2: > - make mlxbf_gige_shutdown() the same as the mlxbf_gige_remove() > > .../mellanox/mlxbf_gige/mlxbf_gige_main.c | 21 ++++++++----------- > .../mellanox/mlxbf_gige/mlxbf_gige_rx.c | 3 +++ > 2 files changed, 12 insertions(+), 12 deletions(-) > > diff --git a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c > index 694de9513b9f..74185b02daa0 100644 > --- a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c > +++ b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c > @@ -471,24 +471,21 @@ static int mlxbf_gige_probe(struct platform_device *pdev) > return err; > } > > -static int mlxbf_gige_remove(struct platform_device *pdev) > +static void mlxbf_gige_remove(struct platform_device *pdev) > { > struct mlxbf_gige *priv = platform_get_drvdata(pdev); > > + if (!priv) > + return; > + > + writeq(0, priv->base + MLXBF_GIGE_INT_EN); > + mlxbf_gige_clean_port(priv); > unregister_netdev(priv->netdev); > phy_disconnect(priv->netdev->phydev); > mlxbf_gige_mdio_remove(priv); > - platform_set_drvdata(pdev, NULL); > - > - return 0; > -} > - > -static void mlxbf_gige_shutdown(struct platform_device *pdev) > -{ > - struct mlxbf_gige *priv = platform_get_drvdata(pdev); > - > writeq(0, priv->base + MLXBF_GIGE_INT_EN); > mlxbf_gige_clean_port(priv); > + platform_set_drvdata(pdev, NULL); > } > > static const struct acpi_device_id __maybe_unused mlxbf_gige_acpi_match[] = { > @@ -499,8 +496,8 @@ MODULE_DEVICE_TABLE(acpi, mlxbf_gige_acpi_match); > > static struct platform_driver mlxbf_gige_driver = { > .probe = mlxbf_gige_probe, > - .remove = mlxbf_gige_remove, > - .shutdown = mlxbf_gige_shutdown, > + .remove_new = mlxbf_gige_remove, > + .shutdown = mlxbf_gige_remove, > .driver = { > .name = KBUILD_MODNAME, > .acpi_match_table = ACPI_PTR(mlxbf_gige_acpi_match), > diff --git a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_rx.c b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_rx.c > index 0d5a41a2ae01..cfb8fb957f0c 100644 > --- a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_rx.c > +++ b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_rx.c > @@ -298,6 +298,9 @@ int mlxbf_gige_poll(struct napi_struct *napi, int budget) > > priv = container_of(napi, struct mlxbf_gige, napi); > > + if (!priv) > + return 0; Do you still need this test even after you unregistered the network device in your shutdown routine?
> > priv = container_of(napi, struct mlxbf_gige, napi); > > > > + if (!priv) > > + return 0; > > Do you still need this test even after you unregistered the network device in > your shutdown routine? always good to check variables but if want me to remove it I can.
> > > priv = container_of(napi, struct mlxbf_gige, napi); > > > > > > + if (!priv) > > > + return 0; > > > > Do you still need this test even after you unregistered the network > > device in your shutdown routine? > > always good to check variables but if want me to remove it I can. I will resubmit this patch with addressed comments after our QA has done more thorough testing. Thank you. Asmaa
diff --git a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c index 694de9513b9f..74185b02daa0 100644 --- a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c +++ b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c @@ -471,24 +471,21 @@ static int mlxbf_gige_probe(struct platform_device *pdev) return err; } -static int mlxbf_gige_remove(struct platform_device *pdev) +static void mlxbf_gige_remove(struct platform_device *pdev) { struct mlxbf_gige *priv = platform_get_drvdata(pdev); + if (!priv) + return; + + writeq(0, priv->base + MLXBF_GIGE_INT_EN); + mlxbf_gige_clean_port(priv); unregister_netdev(priv->netdev); phy_disconnect(priv->netdev->phydev); mlxbf_gige_mdio_remove(priv); - platform_set_drvdata(pdev, NULL); - - return 0; -} - -static void mlxbf_gige_shutdown(struct platform_device *pdev) -{ - struct mlxbf_gige *priv = platform_get_drvdata(pdev); - writeq(0, priv->base + MLXBF_GIGE_INT_EN); mlxbf_gige_clean_port(priv); + platform_set_drvdata(pdev, NULL); } static const struct acpi_device_id __maybe_unused mlxbf_gige_acpi_match[] = { @@ -499,8 +496,8 @@ MODULE_DEVICE_TABLE(acpi, mlxbf_gige_acpi_match); static struct platform_driver mlxbf_gige_driver = { .probe = mlxbf_gige_probe, - .remove = mlxbf_gige_remove, - .shutdown = mlxbf_gige_shutdown, + .remove_new = mlxbf_gige_remove, + .shutdown = mlxbf_gige_remove, .driver = { .name = KBUILD_MODNAME, .acpi_match_table = ACPI_PTR(mlxbf_gige_acpi_match), diff --git a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_rx.c b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_rx.c index 0d5a41a2ae01..cfb8fb957f0c 100644 --- a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_rx.c +++ b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_rx.c @@ -298,6 +298,9 @@ int mlxbf_gige_poll(struct napi_struct *napi, int budget) priv = container_of(napi, struct mlxbf_gige, napi); + if (!priv) + return 0; + mlxbf_gige_handle_tx_complete(priv); do {