@@ -478,6 +478,7 @@ struct lnet_ni *
struct lnet_net *lnet_get_net_locked(u32 net_id);
extern unsigned int lnet_transaction_timeout;
+extern unsigned int lnet_retry_count; /* max resend attempts per message */
extern unsigned int lnet_numa_range;
extern unsigned int lnet_health_sensitivity;
extern unsigned int lnet_peer_discovery_disabled;
@@ -103,6 +103,8 @@ struct lnet_msg {
enum lnet_msg_hstatus msg_health_status;
/* This is a recovery message */
bool msg_recovery;
+ /* the number of times a transmission has been retried */
+ int msg_retry_count;
/* flag to indicate that we do not want to resend this message */
bool msg_no_resend;
@@ -116,6 +116,11 @@ struct lnet the_lnet = {
MODULE_PARM_DESC(lnet_transaction_timeout,
"Time in seconds to wait for a REPLY or an ACK");
+unsigned int lnet_retry_count; /* 0 (default) rejects every resend */
+module_param(lnet_retry_count, uint, 0444); /* read-only in sysfs */
+MODULE_PARM_DESC(lnet_retry_count,
+ "Maximum number of times to retry transmitting a message");
+
/*
* This sequence number keeps track of how many times DLC was used to
* update the local NIs. It is incremented when a NI is added or
@@ -556,7 +556,8 @@
}
/* Do a health check on the message:
- * return -1 if we're not going to handle the error
+ * return -1 if we're not going to handle the error or
+ * if we've reached the maximum number of retries.
* success case will return -1 as well
* return 0 if it the message is requeued for send
*/
@@ -646,6 +647,11 @@
if (msg->msg_no_resend)
return -1;
+ /* stop resending once the message has reached the retry limit */
+ if (msg->msg_retry_count >= lnet_retry_count)
+ return -1;
+ msg->msg_retry_count++;
+
lnet_net_lock(msg->msg_tx_cpt);
/* remove message from the active list and reset it in preparation