diff mbox series

[net,2/4] net/mlx5e: Fix ESN update kernel panic

Message ID acc45e30ff0cf5220a3fda02411d22880878102f.1685950599.git.leonro@nvidia.com (mailing list archive)
State Awaiting Upstream
Delegated to: Netdev Maintainers
Headers show
Series Fix mixing atomic/non-atomic contexts in mlx5 IPsec code | expand

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for net
netdev/fixes_present success Fixes tag present in non-next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 8 this patch: 8
netdev/cc_maintainers warning 3 maintainers not CCed: borisp@nvidia.com davem@davemloft.net linux-rdma@vger.kernel.org
netdev/build_clang success Errors and warnings before: 8 this patch: 8
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success Fixes tag looks correct
netdev/build_allmodconfig_warn success Errors and warnings before: 8 this patch: 8
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 33 lines checked
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Leon Romanovsky June 5, 2023, 8:09 a.m. UTC
From: Patrisious Haddad <phaddad@nvidia.com>

Previously during mlx5e_ipsec_handle_event the driver tried to execute
an operation that could sleep, while holding a spinlock, which caused
the kernel panic mentioned below.

Move the function call that can sleep outside of the spinlock context.

 Call Trace:
 <TASK>
 dump_stack_lvl+0x49/0x6c
 __schedule_bug.cold+0x42/0x4e
 schedule_debug.constprop.0+0xe0/0x118
 __schedule+0x59/0x58a
 ? __mod_timer+0x2a1/0x3ef
 schedule+0x5e/0xd4
 schedule_timeout+0x99/0x164
 ? __pfx_process_timeout+0x10/0x10
 __wait_for_common+0x90/0x1da
 ? __pfx_schedule_timeout+0x10/0x10
 wait_func+0x34/0x142 [mlx5_core]
 mlx5_cmd_invoke+0x1f3/0x313 [mlx5_core]
 cmd_exec+0x1fe/0x325 [mlx5_core]
 mlx5_cmd_do+0x22/0x50 [mlx5_core]
 mlx5_cmd_exec+0x1c/0x40 [mlx5_core]
 mlx5_modify_ipsec_obj+0xb2/0x17f [mlx5_core]
 mlx5e_ipsec_update_esn_state+0x69/0xf0 [mlx5_core]
 ? wake_affine+0x62/0x1f8
 mlx5e_ipsec_handle_event+0xb1/0xc0 [mlx5_core]
 process_one_work+0x1e2/0x3e6
 ? __pfx_worker_thread+0x10/0x10
 worker_thread+0x54/0x3ad
 ? __pfx_worker_thread+0x10/0x10
 kthread+0xda/0x101
 ? __pfx_kthread+0x10/0x10
 ret_from_fork+0x29/0x37
 </TASK>
 BUG: workqueue leaked lock or atomic: kworker/u256:4/0x7fffffff/189754#012     last function: mlx5e_ipsec_handle_event [mlx5_core]
 CPU: 66 PID: 189754 Comm: kworker/u256:4 Kdump: loaded Tainted: G        W          6.2.0-2596.20230309201517_5.el8uek.rc1.x86_64 #2
 Hardware name: Oracle Corporation ORACLE SERVER X9-2/ASMMBX9-2, BIOS 61070300 08/17/2022
 Workqueue: mlx5e_ipsec: eth%d mlx5e_ipsec_handle_event [mlx5_core]
 Call Trace:
 <TASK>
 dump_stack_lvl+0x49/0x6c
 process_one_work.cold+0x2b/0x3c
 ? __pfx_worker_thread+0x10/0x10
 worker_thread+0x54/0x3ad
 ? __pfx_worker_thread+0x10/0x10
 kthread+0xda/0x101
 ? __pfx_kthread+0x10/0x10
 ret_from_fork+0x29/0x37
 </TASK>
 BUG: scheduling while atomic: kworker/u256:4/189754/0x00000000

Fixes: cee137a63431 ("net/mlx5e: Handle ESN update events")
Signed-off-by: Patrisious Haddad <phaddad@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 .../mellanox/mlx5/core/en_accel/ipsec_offload.c    | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

Comments

Simon Horman June 6, 2023, 9:08 a.m. UTC | #1
On Mon, Jun 05, 2023 at 11:09:50AM +0300, Leon Romanovsky wrote:
> From: Patrisious Haddad <phaddad@nvidia.com>
> 
> Previously during mlx5e_ipsec_handle_event the driver tried to execute
> an operation that could sleep, while holding a spinlock, which caused
> the kernel panic mentioned below.
> 
> Move the function call that can sleep outside of the spinlock context.
> 
>  Call Trace:
>  <TASK>
>  dump_stack_lvl+0x49/0x6c
>  __schedule_bug.cold+0x42/0x4e
>  schedule_debug.constprop.0+0xe0/0x118
>  __schedule+0x59/0x58a
>  ? __mod_timer+0x2a1/0x3ef
>  schedule+0x5e/0xd4
>  schedule_timeout+0x99/0x164
>  ? __pfx_process_timeout+0x10/0x10
>  __wait_for_common+0x90/0x1da
>  ? __pfx_schedule_timeout+0x10/0x10
>  wait_func+0x34/0x142 [mlx5_core]
>  mlx5_cmd_invoke+0x1f3/0x313 [mlx5_core]
>  cmd_exec+0x1fe/0x325 [mlx5_core]
>  mlx5_cmd_do+0x22/0x50 [mlx5_core]
>  mlx5_cmd_exec+0x1c/0x40 [mlx5_core]
>  mlx5_modify_ipsec_obj+0xb2/0x17f [mlx5_core]
>  mlx5e_ipsec_update_esn_state+0x69/0xf0 [mlx5_core]
>  ? wake_affine+0x62/0x1f8
>  mlx5e_ipsec_handle_event+0xb1/0xc0 [mlx5_core]
>  process_one_work+0x1e2/0x3e6
>  ? __pfx_worker_thread+0x10/0x10
>  worker_thread+0x54/0x3ad
>  ? __pfx_worker_thread+0x10/0x10
>  kthread+0xda/0x101
>  ? __pfx_kthread+0x10/0x10
>  ret_from_fork+0x29/0x37
>  </TASK>
>  BUG: workqueue leaked lock or atomic: kworker/u256:4/0x7fffffff/189754#012     last function: mlx5e_ipsec_handle_event [mlx5_core]
>  CPU: 66 PID: 189754 Comm: kworker/u256:4 Kdump: loaded Tainted: G        W          6.2.0-2596.20230309201517_5.el8uek.rc1.x86_64 #2
>  Hardware name: Oracle Corporation ORACLE SERVER X9-2/ASMMBX9-2, BIOS 61070300 08/17/2022
>  Workqueue: mlx5e_ipsec: eth%d mlx5e_ipsec_handle_event [mlx5_core]
>  Call Trace:
>  <TASK>
>  dump_stack_lvl+0x49/0x6c
>  process_one_work.cold+0x2b/0x3c
>  ? __pfx_worker_thread+0x10/0x10
>  worker_thread+0x54/0x3ad
>  ? __pfx_worker_thread+0x10/0x10
>  kthread+0xda/0x101
>  ? __pfx_kthread+0x10/0x10
>  ret_from_fork+0x29/0x37
>  </TASK>
>  BUG: scheduling while atomic: kworker/u256:4/189754/0x00000000
> 
> Fixes: cee137a63431 ("net/mlx5e: Handle ESN update events")
> Signed-off-by: Patrisious Haddad <phaddad@nvidia.com>
> Signed-off-by: Leon Romanovsky <leonro@nvidia.com>

Reviewed-by: Simon Horman <simon.horman@corigine.com>
diff mbox series

Patch

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_offload.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_offload.c
index df90e19066bc..ca16cb9807ea 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_offload.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_offload.c
@@ -305,7 +305,17 @@  static void mlx5e_ipsec_update_esn_state(struct mlx5e_ipsec_sa_entry *sa_entry,
 	}
 
 	mlx5e_ipsec_build_accel_xfrm_attrs(sa_entry, &attrs);
+
+	/* It is safe to execute the modify below unlocked since the only flows
+	 * that could affect this HW object, are create, destroy and this work.
+	 *
+	 * Creation flow can't co-exist with this modify work, the destruction
+	 * flow would cancel this work, and this work is a single entity that
+	 * can't conflict with it self.
+	 */
+	spin_unlock_bh(&sa_entry->x->lock);
 	mlx5_accel_esp_modify_xfrm(sa_entry, &attrs);
+	spin_lock_bh(&sa_entry->x->lock);
 
 	data.data_offset_condition_operand =
 		MLX5_IPSEC_ASO_REMOVE_FLOW_PKT_CNT_OFFSET;
@@ -431,7 +441,7 @@  static void mlx5e_ipsec_handle_event(struct work_struct *_work)
 	aso = sa_entry->ipsec->aso;
 	attrs = &sa_entry->attrs;
 
-	spin_lock(&sa_entry->x->lock);
+	spin_lock_bh(&sa_entry->x->lock);
 	ret = mlx5e_ipsec_aso_query(sa_entry, NULL);
 	if (ret)
 		goto unlock;
@@ -447,7 +457,7 @@  static void mlx5e_ipsec_handle_event(struct work_struct *_work)
 		mlx5e_ipsec_handle_limits(sa_entry);
 
 unlock:
-	spin_unlock(&sa_entry->x->lock);
+	spin_unlock_bh(&sa_entry->x->lock);
 	kfree(work);
 }