@@ -79,10 +79,10 @@ struct lnet the_lnet = {
"NUMA range to consider during Multi-Rail selection");
/* lnet_health_sensitivity determines by how much we decrement the health
- * value on sending error. The value defaults to 0, which means health
- * checking is turned off by default.
+ * value on sending error. The value defaults to 100, which means the
+ * interface health is decremented by 100 points on every failure.
*/
-unsigned int lnet_health_sensitivity;
+unsigned int lnet_health_sensitivity = 100;
static int sensitivity_set(const char *val, const struct kernel_param *kp);
static struct kernel_param_ops param_ops_health_sensitivity = {
.set = sensitivity_set,
@@ -140,7 +140,10 @@ static int recovery_interval_set(const char *val,
MODULE_PARM_DESC(lnet_drop_asym_route,
"Set to 1 to drop asymmetrical route messages.");
-unsigned int lnet_transaction_timeout = 50;
+#define LNET_TRANSACTION_TIMEOUT_NO_HEALTH_DEFAULT 50
+#define LNET_TRANSACTION_TIMEOUT_HEALTH_DEFAULT 10
+
+unsigned int lnet_transaction_timeout = LNET_TRANSACTION_TIMEOUT_HEALTH_DEFAULT;
static int transaction_to_set(const char *val, const struct kernel_param *kp);
static struct kernel_param_ops param_ops_transaction_timeout = {
.set = transaction_to_set,
@@ -153,7 +156,8 @@ static int recovery_interval_set(const char *val,
MODULE_PARM_DESC(lnet_transaction_timeout,
"Maximum number of seconds to wait for a peer response.");
-unsigned int lnet_retry_count;
+#define LNET_RETRY_COUNT_HEALTH_DEFAULT 3
+unsigned int lnet_retry_count = LNET_RETRY_COUNT_HEALTH_DEFAULT;
static int retry_count_set(const char *val, const struct kernel_param *kp);
static struct kernel_param_ops param_ops_retry_count = {
.set = retry_count_set,
@@ -201,11 +205,6 @@ static int lnet_discover(struct lnet_process_id id, u32 force,
*/
mutex_lock(&the_lnet.ln_api_mutex);
- if (the_lnet.ln_state != LNET_STATE_RUNNING) {
- mutex_unlock(&the_lnet.ln_api_mutex);
- return 0;
- }
-
if (value > LNET_MAX_HEALTH_VALUE) {
mutex_unlock(&the_lnet.ln_api_mutex);
CERROR("Invalid health value. Maximum: %d value = %lu\n",
@@ -213,6 +212,22 @@ static int lnet_discover(struct lnet_process_id id, u32 force,
return -EINVAL;
}
+ /* if we're turning on health then use the health defaults for
+ * the transaction timeout and the retry count.
+ */
+ if (*sensitivity == 0 && value != 0) {
+ lnet_transaction_timeout =
+ LNET_TRANSACTION_TIMEOUT_HEALTH_DEFAULT;
+ lnet_retry_count = LNET_RETRY_COUNT_HEALTH_DEFAULT;
+ /* if we're turning off health then use the no health timeout
+ * default and reset the retry count to 0.
+ */
+ } else if (*sensitivity != 0 && value == 0) {
+ lnet_transaction_timeout =
+ LNET_TRANSACTION_TIMEOUT_NO_HEALTH_DEFAULT;
+ lnet_retry_count = 0;
+ }
+
*sensitivity = value;
mutex_unlock(&the_lnet.ln_api_mutex);
@@ -243,11 +258,6 @@ static int lnet_discover(struct lnet_process_id id, u32 force,
*/
mutex_lock(&the_lnet.ln_api_mutex);
- if (the_lnet.ln_state != LNET_STATE_RUNNING) {
- mutex_unlock(&the_lnet.ln_api_mutex);
- return 0;
- }
-
*interval = value;
mutex_unlock(&the_lnet.ln_api_mutex);
@@ -353,11 +363,6 @@ static int lnet_discover(struct lnet_process_id id, u32 force,
*/
mutex_lock(&the_lnet.ln_api_mutex);
- if (the_lnet.ln_state != LNET_STATE_RUNNING) {
- mutex_unlock(&the_lnet.ln_api_mutex);
- return 0;
- }
-
if (value < lnet_retry_count || value == 0) {
mutex_unlock(&the_lnet.ln_api_mutex);
CERROR("Invalid value for lnet_transaction_timeout (%lu). Has to be greater than lnet_retry_count (%u)\n",
@@ -399,9 +404,10 @@ static int lnet_discover(struct lnet_process_id id, u32 force,
*/
mutex_lock(&the_lnet.ln_api_mutex);
- if (the_lnet.ln_state != LNET_STATE_RUNNING) {
+ if (lnet_health_sensitivity == 0) {
mutex_unlock(&the_lnet.ln_api_mutex);
- return 0;
+ CERROR("Can not set retry_count when health feature is turned off\n");
+ return -EINVAL;
}
if (value > lnet_transaction_timeout) {
@@ -411,11 +417,6 @@ static int lnet_discover(struct lnet_process_id id, u32 force,
return -EINVAL;
}
- if (value == *retry_count) {
- mutex_unlock(&the_lnet.ln_api_mutex);
- return 0;
- }
-
*retry_count = value;
if (value == 0)