@@ -479,6 +479,7 @@ struct lnet_ni *
extern unsigned int lnet_transaction_timeout;
extern unsigned int lnet_numa_range;
+extern unsigned int lnet_health_sensitivity;
extern unsigned int lnet_peer_discovery_disabled;
extern int portal_rotor;
@@ -78,6 +78,23 @@ struct lnet the_lnet = {
MODULE_PARM_DESC(lnet_numa_range,
"NUMA range to consider during Multi-Rail selection");
+/* lnet_health_sensitivity determines by how much we decrement the health
+ * value on a sending error. It defaults to 0, which turns health checking
+ * off. It is unsigned, so it is read back with param_get_uint().
+ */
+unsigned int lnet_health_sensitivity;
+static int sensitivity_set(const char *val, const struct kernel_param *kp);
+static const struct kernel_param_ops param_ops_health_sensitivity = {
+	.set = sensitivity_set,
+	.get = param_get_uint,
+};
+
+#define param_check_health_sensitivity(name, p) \
+	__param_check(name, p, unsigned int)
+module_param(lnet_health_sensitivity, health_sensitivity, 0644);
+MODULE_PARM_DESC(lnet_health_sensitivity,
+		 "Value to decrement the health value by on error");
+
static int lnet_interfaces_max = LNET_INTERFACES_MAX_DEFAULT;
static int intf_max_set(const char *val, const struct kernel_param *kp);
module_param_call(lnet_interfaces_max, intf_max_set, param_get_int,
@@ -115,6 +132,41 @@ static int lnet_discover(struct lnet_process_id id, u32 force,
struct lnet_process_id __user *ids, int n_ids);
+/* Module-parameter setter for lnet_health_sensitivity.
+ *
+ * Parses @val as an unsigned integer (base auto-detected) and stores it
+ * in the variable kp->arg points to, under the_lnet.ln_api_mutex. The
+ * value is stored whatever the LNet state is, so a setting supplied
+ * before LNet is running is not silently dropped.
+ *
+ * Returns 0 on success, or the error from kstrtouint() on bad input.
+ */
static int
+sensitivity_set(const char *val, const struct kernel_param *kp)
+{
+	unsigned int *sensitivity = (unsigned int *)kp->arg;
+	unsigned int value;
+	int rc;
+
+	/* kstrtouint() rejects values that overflow an unsigned int, so
+	 * nothing can be truncated on the store below.
+	 */
+	rc = kstrtouint(val, 0, &value);
+	if (rc) {
+		CERROR("Invalid module parameter value for 'lnet_health_sensitivity'\n");
+		return rc;
+	}
+
+	/* The purpose of locking the api_mutex here is to ensure that
+	 * the correct value ends up stored properly.
+	 */
+	mutex_lock(&the_lnet.ln_api_mutex);
+	*sensitivity = value;
+	mutex_unlock(&the_lnet.ln_api_mutex);
+
+	return 0;
+}
+
+static int
discovery_set(const char *val, const struct kernel_param *kp)
{
int rc;
@@ -1332,6 +1332,16 @@ void lnet_usr_translate_stats(struct lnet_ioctl_element_msg_stats *msg_stats,
*/
if (ni_healthv < best_healthv) {
continue;
+ } else if (ni_healthv > best_healthv) {
+ best_healthv = ni_healthv;
+ /* If we're going to prefer this ni because it's
+ * the healthiest, then we should set the
+ * shortest_distance in the algorithm in case
+ * there are multiple NIs with the same health but
+ * different distances.
+ */
+ if (distance < shortest_distance)
+ shortest_distance = distance;
} else if (distance > shortest_distance) {
continue;
} else if (distance < shortest_distance) {
@@ -1344,7 +1354,6 @@ void lnet_usr_translate_stats(struct lnet_ioctl_element_msg_stats *msg_stats,
}
best_ni = ni;
best_credits = ni_credits;
- best_healthv = ni_healthv;
}
CDEBUG(D_NET, "selected best_ni %s\n",