[rdma-next,2/9] IB/mad: Add deadline for send MADs

Message ID: 3e9add3109a36c3238465b9ce11363084b9ddb14.1733405453.git.leon@kernel.org
Series: Rework retry algorithm used when sending MADs

Commit Message

Leon Romanovsky Dec. 5, 2024, 1:49 p.m. UTC
From: Vlad Dumitrescu <vdumitrescu@nvidia.com>

The MAD layer does not currently provide a way to enforce a deadline.
Callers which need that, like rdma_resolve_route / SA, make assumptions
about the MAD layer's retry algorithm and set the retries and timeout_ms
fields of struct ib_mad_send_buf accordingly.  For example, given today's
retry algorithm - linear, with no significant scheduling or queueing
delays - callers expect the final timeout to trigger roughly after
(retries + 1) * timeout_ms.
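
For illustration (hypothetical values, not tied to any particular
caller), such a caller today does roughly:

	/* expect the final timeout ~(3 + 1) * 2000ms = ~8s after posting */
	msg->retries = 3;
	msg->timeout_ms = 2000;
	ret = ib_post_send_mad(msg, NULL);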

Add a helper to set an internal deadline based on a relative timeout from
the current time.  Callers can configure the deadline at any time, but
should account for any delays they themselves introduce before calling
ib_post_send_mad.  If the deadline has already passed by the time of the
post, the post fails.
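
A minimal sketch of the intended flow (error handling trimmed; the
per-retry timeout and the overall bound are made-up values):

	msg->timeout_ms = 2000;                 /* per-retry timeout, still required */
	ret = ib_set_mad_deadline(msg, 8000);   /* overall bound, relative to now */
	if (!ret)
		ret = ib_post_send_mad(msg, NULL);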

When a deadline is not set, or is set too far in the future, clamp it to
5 minutes after post time.  Accepting arbitrarily long timeouts is
probably not a good idea.

Once callers are converted to use this new parameter (in subsequent
patches), the MAD layer can evolve its retry algorithm (e.g., to prevent
congestion) without affecting those callers.

Note that existing fields still need to be exposed:
  - timeout_ms will be needed to reset the retry algorithm after a
    temporary delay requested by remote via CM MRA [1], and
  - retries is needed to implement CM REQ:Max CM Retries [2].

In the case of CM MRA (ib_modify_mad called with a non-zero timeout),
increase the deadline, as the sender can't plan for MRA-requested delays.
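
For example, on receipt of an MRA requesting an additional delay, CM can
keep using the existing call (sketch; service_timeout_ms is an
illustrative name):

	/* the deadline, if set, is pushed out by the same amount */
	ib_modify_mad(msg, service_timeout_ms);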

Ignore RMPP for now - it uses a different per-window retry algorithm.

[1] IBTA v1.7 - Section 12.6.6 - MRA - Message Receipt Acknowledgment
[2] IBTA v1.7 - Section 12.7.27 - Max CM Retries

Signed-off-by: Vlad Dumitrescu <vdumitrescu@nvidia.com>
Reviewed-by: Sean Hefty <shefty@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 drivers/infiniband/core/mad.c      | 54 +++++++++++++++++++++++++++---
 drivers/infiniband/core/mad_priv.h |  1 +
 include/rdma/ib_mad.h              | 29 ++++++++++++++++
 3 files changed, 80 insertions(+), 4 deletions(-)

diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
index bcfbb2a5c02b..5c255ee3db38 100644
--- a/drivers/infiniband/core/mad.c
+++ b/drivers/infiniband/core/mad.c
@@ -54,6 +54,8 @@ 
 #define CREATE_TRACE_POINTS
 #include <trace/events/ib_mad.h>
 
+#define IB_MAD_MAX_DEADLINE (jiffies + msecs_to_jiffies(5 * 60 * 1000))
+
 #ifdef CONFIG_TRACEPOINTS
 static void create_mad_addr_info(struct ib_mad_send_wr_private *mad_send_wr,
 			  struct ib_mad_qp_info *qp_info,
@@ -855,6 +857,26 @@  int ib_mad_kernel_rmpp_agent(const struct ib_mad_agent *agent)
 }
 EXPORT_SYMBOL(ib_mad_kernel_rmpp_agent);
 
+int ib_set_mad_deadline(struct ib_mad_send_buf *send_buf, u32 total_timeout_ms)
+{
+	struct ib_mad_send_wr_private *mad_send_wr =
+		container_of(send_buf, struct ib_mad_send_wr_private, send_buf);
+
+	if (WARN_ON_ONCE(!total_timeout_ms))
+		return -EINVAL;
+
+	if (WARN_ON_ONCE(mad_send_wr->deadline))
+		return -EINVAL;
+
+	if (WARN_ON_ONCE(ib_mad_kernel_rmpp_agent(
+		    &mad_send_wr->mad_agent_priv->agent)))
+		return -EINVAL;
+
+	mad_send_wr->deadline = jiffies + msecs_to_jiffies(total_timeout_ms);
+	return 0;
+}
+EXPORT_SYMBOL(ib_set_mad_deadline);
+
 struct ib_mad_send_buf *ib_create_send_mad(struct ib_mad_agent *mad_agent,
 					   u32 remote_qpn, u16 pkey_index,
 					   int rmpp_active, int hdr_len,
@@ -1174,6 +1196,19 @@  int ib_post_send_mad(struct ib_mad_send_buf *send_buf,
 				continue;
 		}
 
+		if (!ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent) &&
+		    send_buf->timeout_ms) {
+			if (!mad_send_wr->deadline ||
+			    time_after(mad_send_wr->deadline,
+				       IB_MAD_MAX_DEADLINE)) {
+				mad_send_wr->deadline = IB_MAD_MAX_DEADLINE;
+			} else if (time_after_eq(jiffies,
+						 mad_send_wr->deadline)) {
+				ret = -ETIMEDOUT;
+				goto error;
+			}
+		}
+
 		mad_send_wr->tid = ((struct ib_mad_hdr *) send_buf->mad)->tid;
 		/* Timeout will be updated after send completes */
 		mad_send_wr->timeout = msecs_to_jiffies(send_buf->timeout_ms);
@@ -2293,16 +2328,23 @@  static void adjust_timeout(struct ib_mad_agent_private *mad_agent_priv)
 
 static void wait_for_response(struct ib_mad_send_wr_private *mad_send_wr)
 {
-	struct ib_mad_agent_private *mad_agent_priv;
 	struct ib_mad_send_wr_private *temp_mad_send_wr;
+	struct ib_mad_agent_private *mad_agent_priv;
+	const unsigned long now = jiffies;
 	struct list_head *list_item;
 	unsigned long delay;
 
 	mad_agent_priv = mad_send_wr->mad_agent_priv;
 	list_del_init(&mad_send_wr->agent_list);
 
-	delay = mad_send_wr->timeout;
-	mad_send_wr->timeout += jiffies;
+	/* Caller must ensure mad_send_wr->timeout is relative */
+	if (!mad_send_wr->deadline)
+		delay = mad_send_wr->timeout;
+	else if (time_after_eq(now, mad_send_wr->deadline))
+		delay = 0; /* schedule ASAP */
+	else
+		delay = min(mad_send_wr->deadline - now, mad_send_wr->timeout);
+	mad_send_wr->timeout = now + delay;
 
 	if (delay) {
 		list_for_each_prev(list_item, &mad_agent_priv->wait_list) {
@@ -2623,6 +2665,9 @@  int ib_modify_mad(struct ib_mad_send_buf *send_buf, u32 timeout_ms)
 	if (!timeout_ms)
 		mad_send_wr->status = IB_WC_WR_FLUSH_ERR;
 
+	if (mad_send_wr->deadline)
+		mad_send_wr->deadline += msecs_to_jiffies(timeout_ms);
+
 	if (mad_send_wr->state == IB_MAD_STATE_SEND_START ||
 	    (mad_send_wr->state == IB_MAD_STATE_QUEUED && timeout_ms))
 		mad_send_wr->timeout = msecs_to_jiffies(timeout_ms);
@@ -2726,7 +2771,8 @@  static int retry_send(struct ib_mad_send_wr_private *mad_send_wr)
 {
 	int ret;
 
-	if (!mad_send_wr->retries_left)
+	if (time_after_eq(jiffies, mad_send_wr->deadline) ||
+	    !mad_send_wr->retries_left)
 		return -ETIMEDOUT;
 
 	mad_send_wr->retries_left--;
diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h
index b2a12a82a62d..24580ad2d428 100644
--- a/drivers/infiniband/core/mad_priv.h
+++ b/drivers/infiniband/core/mad_priv.h
@@ -140,6 +140,7 @@  struct ib_mad_send_wr_private {
 	struct ib_sge sg_list[IB_MAD_SEND_REQ_MAX_SG];
 	__be64 tid;
 	unsigned long timeout;
+	unsigned long deadline;
 	int max_retries;
 	int retries_left;
 	int retry;
diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h
index 3f1b58d8b4bf..69708170a0d6 100644
--- a/include/rdma/ib_mad.h
+++ b/include/rdma/ib_mad.h
@@ -727,6 +727,9 @@  void ib_free_recv_mad(struct ib_mad_recv_wc *mad_recv_wc);
  *
  * This call will reset the timeout value for a sent MAD to the specified
  * value.
+ *
+ * If called with a non-zero value and ib_set_mad_deadline was used, the
+ * deadline will be extended by @timeout_ms.
  */
 int ib_modify_mad(struct ib_mad_send_buf *send_buf, u32 timeout_ms);
 
@@ -818,4 +821,30 @@  void ib_free_send_mad(struct ib_mad_send_buf *send_buf);
  */
 int ib_mad_kernel_rmpp_agent(const struct ib_mad_agent *agent);
 
+/**
+ * ib_set_mad_deadline - Sets send MAD's deadline based on current time.
+ * @send_buf: Previously allocated send data buffer.
+ * @total_timeout_ms: Time to wait before stopping retries.
+ *
+ * The deadline will start being enforced once ib_post_send_mad is called.
+ * It is NOT guaranteed that at least one send will be performed.  Only valid
+ * for MADs waiting for a response (ib_mad_send_buf.timeout_ms must also be set).
+ *
+ * This option allows callers to bound the time a MAD is owned by the MAD layer.
+ * This takes precedence over ib_mad_send_buf.{retries, timeout_ms} and is
+ * independent from the MAD layer's internal retry algorithm.
+ *
+ * Once this deadline expires, the MAD data buffer will be returned to the
+ * caller via the send_handler configured at agent registration time.
+ * Invocation of the send_handler might happen slightly later due to scheduling
+ * delays.
+ *
+ * The deadline will be extended if ib_modify_mad is called.
+ *
+ * Can only be called once.
+ *
+ * Might return errors for MADs which do not support deadline.
+ */
+int ib_set_mad_deadline(struct ib_mad_send_buf *send_buf, u32 total_timeout_ms);
+
 #endif /* IB_MAD_H */