diff mbox series

IB/cm: use rwlock for MAD agent lock

Message ID CAHYDg1Sdi0UZXzMzWo=HkWzJx9jp6HF23oGSO_kgobs0=JyEcw@mail.gmail.com (mailing list archive)
State Superseded
Headers show
Series IB/cm: use rwlock for MAD agent lock | expand

Commit Message

Jacob Moroni Feb. 20, 2025, 5:04 p.m. UTC
In workloads where there are many processes establishing
connections using RDMA CM in parallel (large scale MPI),
there can be heavy contention for mad_agent_lock in
cm_alloc_msg.

This contention can occur while inside of a spin_lock_irq
region, leading to interrupts being disabled for extended
durations on many cores. Furthermore, it leads to the
serialization of rdma_create_ah calls, which has negative
performance impacts for NICs which are capable of processing
multiple address handle creations in parallel.

The end result is the machine becoming unresponsive, hung
task warnings, netdev TX timeouts, etc.

Since the lock appears to be only for protection from
cm_remove_one, it can be changed to a rwlock to resolve
these issues.

Reproducer:

Server:
  for i in $(seq 1 512); do
    ucmatose -c 32 -p $((i + 5000)) &
  done

Client:
  for i in $(seq 1 512); do
    ucmatose -c 32 -p $((i + 5000)) -s 10.2.0.52 &
  done

Fixes: 76039ac9095f5ee5 ("IB/cm: Protect cm_dev, cm_ports and
mad_agent with kref and lock")
Signed-off-by: Jacob Moroni <jmoroni@google.com>
---
 drivers/infiniband/core/cm.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

  m = ERR_PTR(-EINVAL);
@@ -311,7 +311,7 @@ static struct ib_mad_send_buf *cm_alloc_msg(struct
cm_id_private *cm_id_priv)
  m->ah = ah;

 out:
- spin_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
+ read_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
  return m;
 }

@@ -1297,10 +1297,10 @@ static __be64 cm_form_tid(struct cm_id_private
*cm_id_priv)
  if (!cm_id_priv->av.port)
  return cpu_to_be64(low_tid);

- spin_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
+ read_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
  if (cm_id_priv->av.port->mad_agent)
  hi_tid = ((u64)cm_id_priv->av.port->mad_agent->hi_tid) << 32;
- spin_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
+ read_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
  return cpu_to_be64(hi_tid | low_tid);
 }

@@ -4378,7 +4378,7 @@ static int cm_add_one(struct ib_device *ib_device)
  return -ENOMEM;

  kref_init(&cm_dev->kref);
- spin_lock_init(&cm_dev->mad_agent_lock);
+ rwlock_init(&cm_dev->mad_agent_lock);
  cm_dev->ib_device = ib_device;
  cm_dev->ack_delay = ib_device->attrs.local_ca_ack_delay;
  cm_dev->going_down = 0;
@@ -4494,9 +4494,9 @@ static void cm_remove_one(struct ib_device
*ib_device, void *client_data)
  * The above ensures no call paths from the work are running,
  * the remaining paths all take the mad_agent_lock.
  */
- spin_lock(&cm_dev->mad_agent_lock);
+ write_lock(&cm_dev->mad_agent_lock);
  port->mad_agent = NULL;
- spin_unlock(&cm_dev->mad_agent_lock);
+ write_unlock(&cm_dev->mad_agent_lock);
  ib_unregister_mad_agent(mad_agent);
  ib_port_unregister_client_groups(ib_device, i,
  cm_counter_groups);

Comments

Eric Dumazet Feb. 20, 2025, 5:37 p.m. UTC | #1
On Thu, Feb 20, 2025 at 6:05 PM Jacob Moroni <jmoroni@google.com> wrote:
>
> In workloads where there are many processes establishing
> connections using RDMA CM in parallel (large scale MPI),
> there can be heavy contention for mad_agent_lock in
> cm_alloc_msg.
>
> This contention can occur while inside of a spin_lock_irq
> region, leading to interrupts being disabled for extended
> durations on many cores. Furthermore, it leads to the
> serialization of rdma_create_ah calls, which has negative
> performance impacts for NICs which are capable of processing
> multiple address handle creations in parallel.
>
> The end result is the machine becoming unresponsive, hung
> task warnings, netdev TX timeouts, etc.
>
> Since the lock appears to be only for protection from
> cm_remove_one, it can be changed to a rwlock to resolve
> these issues.
>
> Reproducer:
>
> Server:
>   for i in $(seq 1 512); do
>     ucmatose -c 32 -p $((i + 5000)) &
>   done
>
> Client:
>   for i in $(seq 1 512); do
>     ucmatose -c 32 -p $((i + 5000)) -s 10.2.0.52 &
>   done
>
> Fixes: 76039ac9095f5ee5 ("IB/cm: Protect cm_dev, cm_ports and
> mad_agent with kref and lock")

Fixes: tag should be on a single line.

> Signed-off-by: Jacob Moroni <jmoroni@google.com>
> ---

It seems your patch is mangled.

Can you use "git send-email" to resend it ?
diff mbox series

Patch

diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index 142170473e75..effa53dd6800 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -167,7 +167,7 @@  struct cm_port {
 struct cm_device {
  struct kref kref;
  struct list_head list;
- spinlock_t mad_agent_lock;
+ rwlock_t mad_agent_lock;
  struct ib_device *ib_device;
  u8 ack_delay;
  int going_down;
@@ -285,7 +285,7 @@  static struct ib_mad_send_buf *cm_alloc_msg(struct
cm_id_private *cm_id_priv)
  if (!cm_id_priv->av.port)
  return ERR_PTR(-EINVAL);

- spin_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
+ read_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
  mad_agent = cm_id_priv->av.port->mad_agent;
  if (!mad_agent) {