diff mbox series

[rdma-next,4/9] RDMA/nldev: Add sa-min-timeout management attribute

Message ID 69722b901037bf9727da09bc2cbea82e81f2f6e7.1733405453.git.leon@kernel.org (mailing list archive)
State New
Headers show
Series Rework retry algorithm used when sending MADs | expand

Commit Message

Leon Romanovsky Dec. 5, 2024, 1:49 p.m. UTC
From: Vlad Dumitrescu <vdumitrescu@nvidia.com>

Add a new namespace for MAD (Management Datagram) protocols, as we expect
more attributes in this area.

Add the first such attribute, to control the minimum initial timeout used by
the SA client implementation.  The SA client relies on the MAD layer to
issue retries, but has to configure an initial timeout value for the
first retry.  While the SA client provides a default, the right value
likely depends on network size, loss levels and capacity of the SA
server.  This attribute enables system admins to tune the trade-off
between speed of recovery under transient loss and load (on the network
and/or the SA server) generated by unnecessary retries.

Enforce a reasonable range of 50ms - 10s.

Changes do not apply to existing SA queries, which were already posted
to the MAD layer.

Example usage:
  # rdma management show ibp1s0f0/1
  0: ibp1s0f0: 1 sa-min-timeout 500 ...
  # rdma management show
  0: ibp1s0f0: 1 sa-min-timeout 500 ...
  1: ibp1s0f1: 1 sa-min-timeout 500 ...
  # rdma management set ibp1s0f1/1 sa-min-timeout 1000 ...
  # rdma management show
  0: ibp1s0f0: 1 sa-min-timeout 500 ...
  1: ibp1s0f1: 1 sa-min-timeout 1000 ...

Signed-off-by: Vlad Dumitrescu <vdumitrescu@nvidia.com>
Reviewed-by: Sean Hefty <shefty@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 drivers/infiniband/core/core_priv.h |   4 +
 drivers/infiniband/core/nldev.c     | 114 ++++++++++++++++++++++++++++
 drivers/infiniband/core/sa_query.c  |  47 ++++++++++++
 include/uapi/rdma/rdma_netlink.h    |   5 ++
 4 files changed, 170 insertions(+)
diff mbox series

Patch

diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
index 05102769a918..7a7326588297 100644
--- a/drivers/infiniband/core/core_priv.h
+++ b/drivers/infiniband/core/core_priv.h
@@ -199,6 +199,10 @@  void ib_mad_cleanup(void);
 int ib_sa_init(void);
 void ib_sa_cleanup(void);
 
+int ib_sa_min_timeout_set(struct ib_device *dev, u32 port_num, u32 val,
+			  struct netlink_ext_ack *extack);
+int ib_sa_min_timeout_get(struct ib_device *dev, u32 port_num, u32 *val);
+
 void rdma_nl_init(void);
 void rdma_nl_exit(void);
 
diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c
index ff121e59b9c0..363742567dd2 100644
--- a/drivers/infiniband/core/nldev.c
+++ b/drivers/infiniband/core/nldev.c
@@ -171,6 +171,7 @@  static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = {
 	[RDMA_NLDEV_ATTR_PARENT_NAME]		= { .type = NLA_NUL_STRING },
 	[RDMA_NLDEV_ATTR_NAME_ASSIGN_TYPE]	= { .type = NLA_U8 },
 	[RDMA_NLDEV_ATTR_EVENT_TYPE]		= { .type = NLA_U8 },
+	[RDMA_NLDEV_MGMT_ATTR_SA_MIN_TIMEOUT]	= { .type = NLA_U32 },
 };
 
 static int put_driver_name_print_type(struct sk_buff *msg, const char *name,
@@ -2621,6 +2622,112 @@  static int nldev_deldev(struct sk_buff *skb, struct nlmsghdr *nlh,
 	return ib_del_sub_device_and_put(device);
 }
 
+/*
+ * Report per-port management parameters (currently only the SA client's
+ * minimum initial MAD timeout) for the device/port named in the request.
+ * Ports without SA capability get a reply that simply omits the attribute.
+ */
+static int nldev_mgmt_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
+			       struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+	struct ib_device *device;
+	struct sk_buff *msg;
+	u32 index;
+	u32 port;
+	u32 sa_min_timeout = 0;
+	int ret;
+
+	ret = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy,
+			    NL_VALIDATE_LIBERAL, extack);
+	if (ret ||
+	    !tb[RDMA_NLDEV_ATTR_DEV_INDEX] ||
+	    !tb[RDMA_NLDEV_ATTR_PORT_INDEX])
+		return -EINVAL;
+
+	index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+	device = ib_device_get_by_index(sock_net(skb->sk), index);
+	if (!device)
+		return -EINVAL;
+
+	port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]);
+	if (!rdma_is_port_valid(device, port)) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	if (rdma_cap_ib_sa(device, port)) {
+		ret = ib_sa_min_timeout_get(device, port, &sa_min_timeout);
+		if (ret)
+			goto err;
+	}
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	nlh = nlmsg_put(
+		msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
+		RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_MGMT_GET), 0, 0);
+	if (!nlh ||
+	    fill_nldev_handle(msg, device) ||
+	    nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port)) {
+		ret = -EMSGSIZE;
+		goto err_msg;
+	}
+
+	if (rdma_cap_ib_sa(device, port)) {
+		ret = nla_put_u32(msg, RDMA_NLDEV_MGMT_ATTR_SA_MIN_TIMEOUT,
+				  sa_min_timeout);
+		if (ret)
+			goto err_msg;
+	}
+
+	nlmsg_end(msg, nlh);
+	/* Drop the reference taken by ib_device_get_by_index() */
+	ib_device_put(device);
+	return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid);
+
+err_msg:
+	nlmsg_free(msg);
+err:
+	ib_device_put(device);
+	return ret;
+}
+
+/*
+ * Update per-port management parameters.  Currently only
+ * RDMA_NLDEV_MGMT_ATTR_SA_MIN_TIMEOUT is supported; a request carrying no
+ * known management attribute fails with -EINVAL.
+ */
+static int nldev_set_mgmt_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
+				   struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+	struct ib_device *device;
+	u32 index;
+	u32 port;
+	u32 sa_min_timeout;
+	int ret;
+
+	ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy,
+			  extack);
+	if (ret ||
+	    !tb[RDMA_NLDEV_ATTR_DEV_INDEX] ||
+	    !tb[RDMA_NLDEV_ATTR_PORT_INDEX])
+		return -EINVAL;
+
+	index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+	device = ib_device_get_by_index(sock_net(skb->sk), index);
+	if (!device)
+		return -EINVAL;
+
+	ret = -EINVAL;
+	port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]);
+	if (!rdma_is_port_valid(device, port))
+		goto out;
+
+	if (tb[RDMA_NLDEV_MGMT_ATTR_SA_MIN_TIMEOUT]) {
+		sa_min_timeout =
+			nla_get_u32(tb[RDMA_NLDEV_MGMT_ATTR_SA_MIN_TIMEOUT]);
+		ret = ib_sa_min_timeout_set(device, port, sa_min_timeout,
+					    extack);
+	}
+
+out:
+	/* Drop the reference taken by ib_device_get_by_index() on all paths */
+	ib_device_put(device);
+	return ret;
+}
+
 static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = {
 	[RDMA_NLDEV_CMD_GET] = {
 		.doit = nldev_get_doit,
@@ -2727,6 +2834,13 @@  static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = {
 		.doit = nldev_deldev,
 		.flags = RDMA_NL_ADMIN_PERM,
 	},
+	[RDMA_NLDEV_CMD_MGMT_GET] = {
+		.doit = nldev_mgmt_get_doit,
+	},
+	[RDMA_NLDEV_CMD_MGMT_SET] = {
+		.doit = nldev_set_mgmt_set_doit,
+		.flags = RDMA_NL_ADMIN_PERM,
+	},
 };
 
 static int fill_mon_netdev_rename(struct sk_buff *msg,
diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
index ac0d53bf91c4..7f63cad3f212 100644
--- a/drivers/infiniband/core/sa_query.c
+++ b/drivers/infiniband/core/sa_query.c
@@ -60,7 +60,10 @@ 
 #define IB_SA_LOCAL_SVC_TIMEOUT_MAX		200000
 #define IB_SA_CPI_MAX_RETRY_CNT			3
 #define IB_SA_CPI_RETRY_WAIT			1000 /*msecs */
+#define IB_SA_MIN_TIMEOUT_MS_MIN		50
 #define IB_SA_MIN_TIMEOUT_MS_DEFAULT		500
+#define IB_SA_MIN_TIMEOUT_MS_MAX		10000
+
 static int sa_local_svc_timeout_ms = IB_SA_LOCAL_SVC_TIMEOUT_DEFAULT;
 
 struct ib_sa_sm_ah {
@@ -1334,6 +1337,50 @@  static void init_mad(struct ib_sa_query *query, struct ib_mad_agent *agent)
 	spin_unlock_irqrestore(&tid_lock, flags);
 }
 
+/**
+ * ib_sa_min_timeout_set() - set the SA client's minimum initial MAD timeout
+ * @dev: IB device
+ * @port_num: port number on @dev
+ * @val: new minimum timeout, in milliseconds
+ * @extack: netlink extended ack used to report range errors
+ *
+ * Accepts values in [IB_SA_MIN_TIMEOUT_MS_MIN, IB_SA_MIN_TIMEOUT_MS_MAX].
+ * Only affects SA queries issued after this call; queries already posted to
+ * the MAD layer keep their original timeout.
+ *
+ * Return: 0 on success, -EOPNOTSUPP if the port has no SA capability,
+ * -ENODEV if no SA client data is registered, -EINVAL if @val is out of
+ * range.
+ */
+int ib_sa_min_timeout_set(struct ib_device *dev, u32 port_num, u32 val,
+			  struct netlink_ext_ack *extack)
+{
+	struct ib_sa_device *sa_dev = ib_get_client_data(dev, &sa_client);
+	struct ib_sa_port *port;
+
+	if (!rdma_cap_ib_sa(dev, port_num))
+		return -EOPNOTSUPP;
+
+	if (!sa_dev)
+		return -ENODEV;
+
+	if (val > IB_SA_MIN_TIMEOUT_MS_MAX || val < IB_SA_MIN_TIMEOUT_MS_MIN) {
+		NL_SET_ERR_MSG_FMT_MOD(extack, "Valid range [%u-%u]ms",
+				       IB_SA_MIN_TIMEOUT_MS_MIN,
+				       IB_SA_MIN_TIMEOUT_MS_MAX);
+		return -EINVAL;
+	}
+
+	/* Compute the port slot only after all validation has passed */
+	port = &sa_dev->port[port_num - sa_dev->start_port];
+
+	/* Paired with READ_ONCE() in ib_sa_min_timeout_get() */
+	WRITE_ONCE(port->min_timeout_ms, val);
+
+	return 0;
+}
+
+/**
+ * ib_sa_min_timeout_get() - read the SA client's minimum initial MAD timeout
+ * @dev: IB device
+ * @port_num: port number on @dev
+ * @val: output, current minimum timeout in milliseconds
+ *
+ * Return: 0 on success, -EOPNOTSUPP if the port has no SA capability,
+ * -ENODEV if no SA client data is registered.
+ */
+int ib_sa_min_timeout_get(struct ib_device *dev, u32 port_num, u32 *val)
+{
+	struct ib_sa_device *sa_dev = ib_get_client_data(dev, &sa_client);
+	struct ib_sa_port *port;
+
+	if (!rdma_cap_ib_sa(dev, port_num))
+		return -EOPNOTSUPP;
+
+	if (!sa_dev)
+		return -ENODEV;
+
+	port = &sa_dev->port[port_num - sa_dev->start_port];
+
+	/* Paired with WRITE_ONCE() in ib_sa_min_timeout_set() */
+	*val = READ_ONCE(port->min_timeout_ms);
+
+	return 0;
+}
+
 static int send_mad(struct ib_sa_query *query, unsigned long timeout_ms,
 		    gfp_t gfp_mask)
 {
diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h
index 9f9cf20c1cd8..2b1c4c55e51f 100644
--- a/include/uapi/rdma/rdma_netlink.h
+++ b/include/uapi/rdma/rdma_netlink.h
@@ -308,6 +308,9 @@  enum rdma_nldev_command {
 
 	RDMA_NLDEV_CMD_MONITOR,
 
+	RDMA_NLDEV_CMD_MGMT_GET,
+	RDMA_NLDEV_CMD_MGMT_SET,
+
 	RDMA_NLDEV_NUM_OPS
 };
 
@@ -580,6 +583,8 @@  enum rdma_nldev_attr {
 	RDMA_NLDEV_ATTR_EVENT_TYPE,		/* u8 */
 
 	RDMA_NLDEV_SYS_ATTR_MONITOR_MODE,	/* u8 */
+
+	RDMA_NLDEV_MGMT_ATTR_SA_MIN_TIMEOUT,	/* u32 */
 	/*
 	 * Always the end
 	 */