
[rdma-next,7/9] IB/mad: Exponential backoff when retrying sends

Message ID af348c70c47485235d7d6811b56ccf23e105bdad.1733405453.git.leon@kernel.org
State New
Series Rework retry algorithm used when sending MADs

Commit Message

Leon Romanovsky Dec. 5, 2024, 1:49 p.m. UTC
From: Vlad Dumitrescu <vdumitrescu@nvidia.com>

When a receiver is overloaded, MAD requests time out and get retried in
a linear fashion.  This could worsen congestion and reduce goodput.  To
help reduce the load over time, use exponential backoff after a preset
number of retries.  Cap delays between retries at 60s, even when in
exponential mode.
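
For illustration only (this sketch is not part of the patch; it mirrors the
logic below with simplified, made-up names), the per-send wait with a 4s base
timeout and the default of 4 linear timeouts evolves as follows:

  #include <stdio.h>

  #define LINEAR_TIMEOUTS	4		/* like IB_MAD_LINEAR_TIMEOUTS_DEFAULT */
  #define MAX_TIMEOUT_MS	(60 * 1000)	/* like IB_MAD_MAX_TIMEOUT_MS */

  int main(void)
  {
  	unsigned int timeout_ms = 4000;	/* example base timeout from the sender */
  	int backoff_retries = 0;
  	int retry;

  	printf("initial send waits %u ms\n", timeout_ms);
  	for (retry = 1; retry <= 10; retry++) {
  		/* the first LINEAR_TIMEOUTS timeouts stay at the base value,
  		 * then each retry doubles the wait, capped at 60s
  		 */
  		if (++backoff_retries >= LINEAR_TIMEOUTS)
  			timeout_ms = timeout_ms * 2 <= MAX_TIMEOUT_MS ?
  				     timeout_ms * 2 : MAX_TIMEOUT_MS;
  		printf("retry %d waits %u ms\n", retry, timeout_ms);
  	}
  	return 0;
  }

This prints 4s for the initial send and the first three retries, then 8s,
16s, 32s, and finally 60s (capped) for every retry after that.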

An MRA message from the recipient could request an even higher timeout, so
continue to respect that for the next retry.  However, reset the backoff
algorithm to the beginning when an MRA is received.
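
For example (with illustrative numbers): if backoff has grown the wait to 16s
and an MRA arrives requesting 8s, the MRA is ignored; if it requests 32s, that
longer wait is honored for the current retry, and subsequent retries restart
from the original timeout before backing off again.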

Exclude RMPP and OPA from exponential backoff.

Signed-off-by: Vlad Dumitrescu <vdumitrescu@nvidia.com>
Reviewed-by: Sean Hefty <shefty@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 drivers/infiniband/core/mad.c      | 53 ++++++++++++++++++++++++++++--
 drivers/infiniband/core/mad_priv.h |  3 ++
 2 files changed, 53 insertions(+), 3 deletions(-)

Patch

diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
index 5c255ee3db38..a3a8cf4bbc20 100644
--- a/drivers/infiniband/core/mad.c
+++ b/drivers/infiniband/core/mad.c
@@ -54,7 +54,9 @@ 
 #define CREATE_TRACE_POINTS
 #include <trace/events/ib_mad.h>
 
-#define IB_MAD_MAX_DEADLINE (jiffies + msecs_to_jiffies(5 * 60 * 1000))
+#define IB_MAD_LINEAR_TIMEOUTS_DEFAULT	4
+#define IB_MAD_MAX_TIMEOUT_MS		(60 * MSEC_PER_SEC)
+#define IB_MAD_MAX_DEADLINE		(jiffies + msecs_to_jiffies(5 * 60 * 1000))
 
 #ifdef CONFIG_TRACEPOINTS
 static void create_mad_addr_info(struct ib_mad_send_wr_private *mad_send_wr,
@@ -1210,10 +1212,12 @@  int ib_post_send_mad(struct ib_mad_send_buf *send_buf,
 		}
 
 		mad_send_wr->tid = ((struct ib_mad_hdr *) send_buf->mad)->tid;
+		mad_send_wr->var_timeout_ms = send_buf->timeout_ms;
 		/* Timeout will be updated after send completes */
 		mad_send_wr->timeout = msecs_to_jiffies(send_buf->timeout_ms);
 		mad_send_wr->max_retries = send_buf->retries;
 		mad_send_wr->retries_left = send_buf->retries;
+		mad_send_wr->backoff_retries = 0;
 		send_buf->retries = 0;
 		mad_send_wr->status = IB_WC_SUCCESS;
 
@@ -2662,18 +2666,34 @@  int ib_modify_mad(struct ib_mad_send_buf *send_buf, u32 timeout_ms)
 		return -EINVAL;
 	}
 
-	if (!timeout_ms)
+	if (!timeout_ms) {
 		mad_send_wr->status = IB_WC_WR_FLUSH_ERR;
+		goto apply;
+	}
+
+	/* CM MRA requesting a lower timeout than ours.  Could be a delayed MRA
+	 * (variable backoff increased in the meantime) or remote using a const.
+	 */
+	if (timeout_ms < mad_send_wr->var_timeout_ms)
+		goto ignore;
+
+	/* Assume remote will no longer be overloaded after MRA Service Timeout
+	 * passes and restart variable backoff algorithm.
+	 */
+	mad_send_wr->var_timeout_ms = mad_send_wr->send_buf.timeout_ms;
+	mad_send_wr->backoff_retries = 0;
 
 	if (mad_send_wr->deadline)
 		mad_send_wr->deadline += msecs_to_jiffies(timeout_ms);
 
+apply:
 	if (mad_send_wr->state == IB_MAD_STATE_SEND_START ||
 	    (mad_send_wr->state == IB_MAD_STATE_QUEUED && timeout_ms))
 		mad_send_wr->timeout = msecs_to_jiffies(timeout_ms);
 	else
 		ib_reset_mad_timeout(mad_send_wr, timeout_ms);
 
+ignore:
 	spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
 	return 0;
 }
@@ -2767,6 +2787,30 @@  static void local_completions(struct work_struct *work)
 	spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
 }
 
+/*
+ * Applies a variable backoff to certain send MADs.
+ *
+ * Exists to scope down the initial variable backoff implementation.
+ */
+static void set_next_timeout(struct ib_mad_send_wr_private *mad_send_wr)
+{
+	const struct ib_mad_agent_private *agent = mad_send_wr->mad_agent_priv;
+	const struct ib_mad_port_private *port = agent->qp_info->port_priv;
+	const struct ib_mad_hdr *hdr = mad_send_wr->send_buf.mad;
+
+	if (ib_mad_kernel_rmpp_agent(&agent->agent))
+		return;
+
+	if (hdr->base_version != IB_MGMT_BASE_VERSION)
+		return;
+
+	if (++mad_send_wr->backoff_retries < READ_ONCE(port->linear_timeouts))
+		return;
+
+	mad_send_wr->var_timeout_ms =
+		min(mad_send_wr->var_timeout_ms << 1, IB_MAD_MAX_TIMEOUT_MS);
+}
+
 static int retry_send(struct ib_mad_send_wr_private *mad_send_wr)
 {
 	int ret;
@@ -2778,7 +2822,8 @@  static int retry_send(struct ib_mad_send_wr_private *mad_send_wr)
 	mad_send_wr->retries_left--;
 	mad_send_wr->send_buf.retries++;
 
-	mad_send_wr->timeout = msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms);
+	set_next_timeout(mad_send_wr);
+	mad_send_wr->timeout = msecs_to_jiffies(mad_send_wr->var_timeout_ms);
 
 	if (ib_mad_kernel_rmpp_agent(&mad_send_wr->mad_agent_priv->agent)) {
 		ret = ib_retry_rmpp(mad_send_wr);
@@ -3195,6 +3240,8 @@  static int ib_mad_port_open(struct ib_device *device,
 		goto error8;
 	}
 
+	port_priv->linear_timeouts = IB_MAD_LINEAR_TIMEOUTS_DEFAULT;
+
 	spin_lock_irqsave(&ib_mad_port_list_lock, flags);
 	list_add_tail(&port_priv->port_list, &ib_mad_port_list);
 	spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h
index 24580ad2d428..076ebcea27b4 100644
--- a/drivers/infiniband/core/mad_priv.h
+++ b/drivers/infiniband/core/mad_priv.h
@@ -139,10 +139,12 @@  struct ib_mad_send_wr_private {
 	struct ib_ud_wr send_wr;
 	struct ib_sge sg_list[IB_MAD_SEND_REQ_MAX_SG];
 	__be64 tid;
+	unsigned int var_timeout_ms;
 	unsigned long timeout;
 	unsigned long deadline;
 	int max_retries;
 	int retries_left;
+	int backoff_retries;
 	int retry;
 	enum ib_wc_status status;
 
@@ -222,6 +224,7 @@  struct ib_mad_port_private {
 	struct ib_mad_mgmt_version_table version[MAX_MGMT_VERSION];
 	struct workqueue_struct *wq;
 	struct ib_mad_qp_info qp_info[IB_MAD_QPS_CORE];
+	u8 linear_timeouts;
 };
 
 int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr);