@@ -639,6 +639,8 @@ void lnet_set_reply_msg_len(struct lnet_ni *ni, struct lnet_msg *msg,
void lnet_detach_rsp_tracker(struct lnet_libmd *md, int cpt);
void lnet_finalize(struct lnet_msg *msg, int rc);
+bool lnet_send_error_simulation(struct lnet_msg *msg,
+ enum lnet_msg_hstatus *hstatus);
void lnet_drop_message(struct lnet_ni *ni, int cpt, void *private,
unsigned int nob, u32 msg_type);
@@ -661,7 +663,7 @@ void lnet_drop_message(struct lnet_ni *ni, int cpt, void *private,
int lnet_fault_init(void);
void lnet_fault_fini(void);
-bool lnet_drop_rule_match(struct lnet_hdr *hdr);
+bool lnet_drop_rule_match(struct lnet_hdr *hdr, enum lnet_msg_hstatus *hstatus);
int lnet_delay_rule_add(struct lnet_fault_attr *attr);
int lnet_delay_rule_del(lnet_nid_t src, lnet_nid_t dst, bool shutdown);
@@ -72,7 +72,8 @@ enum lnet_msg_hstatus {
LNET_MSG_STATUS_REMOTE_ERROR,
LNET_MSG_STATUS_REMOTE_DROPPED,
LNET_MSG_STATUS_REMOTE_TIMEOUT,
- LNET_MSG_STATUS_NETWORK_TIMEOUT
+ LNET_MSG_STATUS_NETWORK_TIMEOUT,
+ LNET_MSG_STATUS_END,
};
struct lnet_rsp_tracker {
@@ -41,6 +41,19 @@ enum {
#define LNET_GET_BIT (1 << 2)
#define LNET_REPLY_BIT (1 << 3)
+#define HSTATUS_END 11 /* one above the highest HSTATUS_*_BIT shift below */
+#define HSTATUS_LOCAL_INTERRUPT_BIT (1 << 1) /* bit N is tested as (1 << status value N) */
+#define HSTATUS_LOCAL_DROPPED_BIT (1 << 2)
+#define HSTATUS_LOCAL_ABORTED_BIT (1 << 3)
+#define HSTATUS_LOCAL_NO_ROUTE_BIT (1 << 4)
+#define HSTATUS_LOCAL_ERROR_BIT (1 << 5)
+#define HSTATUS_LOCAL_TIMEOUT_BIT (1 << 6)
+#define HSTATUS_REMOTE_ERROR_BIT (1 << 7)
+#define HSTATUS_REMOTE_DROPPED_BIT (1 << 8)
+#define HSTATUS_REMOTE_TIMEOUT_BIT (1 << 9)
+#define HSTATUS_NETWORK_TIMEOUT_BIT (1 << 10)
+#define HSTATUS_RANDOM 0xffffffff /* mask value: use the randomly drawn status unfiltered */
+
/** ioctl parameter for LNet fault simulation */
struct lnet_fault_attr {
/**
@@ -78,6 +91,10 @@ struct lnet_fault_attr {
* with da_rate
*/
__u32 da_interval;
+ /** error type mask */
+ __u32 da_health_error_mask;
+ /** randomize error generation */
+ bool da_random;
} drop;
/** message latency simulation */
struct {
@@ -912,7 +912,11 @@ static int kiblnd_map_tx(struct lnet_ni *ni, struct kib_tx *tx,
bad->wr_id, bad->opcode, bad->send_flags,
libcfs_nid2str(conn->ibc_peer->ibp_nid));
bad = NULL;
- rc = ib_post_send(conn->ibc_cmid->qp, wrq, &bad);
+ if (lnet_send_error_simulation(tx->tx_lntmsg[0],
+ &tx->tx_hstatus))
+ rc = -EINVAL;
+ else
+ rc = ib_post_send(conn->ibc_cmid->qp, wrq, &bad);
}
conn->ibc_last_send = ktime_get();
@@ -335,7 +335,8 @@ struct ksock_tx *
if (!rc && (tx->tx_resid != 0 || tx->tx_zc_aborted)) {
rc = -EIO;
- hstatus = LNET_MSG_STATUS_LOCAL_ERROR;
+ if (hstatus == LNET_MSG_STATUS_OK)
+ hstatus = LNET_MSG_STATUS_LOCAL_ERROR;
}
if (tx->tx_conn)
@@ -467,6 +468,13 @@ struct ksock_tx *
ksocknal_process_transmit(struct ksock_conn *conn, struct ksock_tx *tx)
{
int rc;
+ bool error_sim = false;
+
+ if (lnet_send_error_simulation(tx->tx_lnetmsg, &tx->tx_hstatus)) {
+ error_sim = true;
+ rc = -EINVAL;
+ goto simulate_error;
+ }
if (tx->tx_zc_capable && !tx->tx_zc_checked)
ksocknal_check_zc_req(tx);
@@ -512,16 +520,19 @@ struct ksock_tx *
return rc;
}
+simulate_error:
/* Actual error */
LASSERT(rc < 0);
- /* set the health status of the message which determines
- * whether we should retry the transmit
- */
- if (rc == -ETIMEDOUT)
- tx->tx_hstatus = LNET_MSG_STATUS_REMOTE_TIMEOUT;
- else
- tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR;
+ if (!error_sim) {
+ /* set the health status of the message which determines
+ * whether we should retry the transmit
+ */
+ if (rc == -ETIMEDOUT)
+ tx->tx_hstatus = LNET_MSG_STATUS_REMOTE_TIMEOUT;
+ else
+ tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR;
+ }
if (!conn->ksnc_closing) {
switch (rc) {
@@ -3875,7 +3875,7 @@ void lnet_monitor_thr_stop(void)
}
if (!list_empty(&the_lnet.ln_drop_rules) &&
- lnet_drop_rule_match(hdr)) {
+ lnet_drop_rule_match(hdr, NULL)) {
CDEBUG(D_NET, "%s, src %s, dst %s: Dropping %s to simulate silent message loss\n",
libcfs_nid2str(from_nid), libcfs_nid2str(src_nid),
libcfs_nid2str(dest_nid), lnet_msgtyp2str(type));
@@ -812,6 +812,30 @@
}
}
+/* Return true, with *@hstatus set by the matching rule, when
+ * lnet_drop_rule_match() selects @msg for a simulated health error.
+ */
+bool
+lnet_send_error_simulation(struct lnet_msg *msg,
+			   enum lnet_msg_hstatus *hstatus)
+{
+	/* a NULL @hstatus would match plain drop rules (see drop_rule_match)
+	 * and be dereferenced below */
+	if (!msg || !hstatus || list_empty(&the_lnet.ln_drop_rules))
+		return false;
+
+	if (!lnet_drop_rule_match(&msg->msg_hdr, hstatus))
+		return false;
+
+	CDEBUG(D_NET, "src %s, dst %s: %s simulate health error: %s\n",
+	       libcfs_nid2str(msg->msg_hdr.src_nid),
+	       libcfs_nid2str(msg->msg_hdr.dest_nid),
+	       lnet_msgtyp2str(msg->msg_type),
+	       lnet_health_error2str(*hstatus));
+	return true;
+}
+EXPORT_SYMBOL(lnet_send_error_simulation);
+
void
lnet_finalize(struct lnet_msg *msg, int status)
{
@@ -292,13 +292,56 @@ struct lnet_drop_rule {
lnet_net_unlock(cpt);
}
+/*
+ * Choose the health status to simulate: draw a random status and keep it
+ * if @mask is HSTATUS_RANDOM or has that status' bit set; otherwise store
+ * in @hstatus the status with a set bit in @mask closest to the draw.
+ */
+static void
+lnet_fault_match_health(enum lnet_msg_hstatus *hstatus, __u32 mask)
+{
+	int choice;
+	int closest;
+	int delta;
+	int best_delta;
+	int i;
+
+	/* assign a random failure; a draw of 0 is bumped to 1 */
+	choice = prandom_u32_max(LNET_MSG_STATUS_END - LNET_MSG_STATUS_OK);
+	if (choice == 0)
+		choice++;
+
+	if (mask == HSTATUS_RANDOM || (mask & (1 << choice))) {
+		*hstatus = choice;
+		return;
+	}
+
+	/* round to the closest ON bit; distances are measured from the
+	 * original draw, and bit HSTATUS_END (not a status) is ignored
+	 */
+	closest = choice;
+	best_delta = HSTATUS_END;
+	for (i = HSTATUS_END - 1; i > 0; i--) {
+		if (!(mask & (1 << i)))
+			continue;
+		delta = choice > i ? choice - i : i - choice;
+		if (delta < best_delta) {
+			best_delta = delta;
+			closest = i;
+		}
+	}
+
+	*hstatus = closest;
+}
+
/**
* check source/destination NID, portal, message type and drop rate,
* decide whether should drop this message or not
*/
static bool
drop_rule_match(struct lnet_drop_rule *rule, lnet_nid_t src,
- lnet_nid_t dst, unsigned int type, unsigned int portal)
+ lnet_nid_t dst, unsigned int type, unsigned int portal,
+ enum lnet_msg_hstatus *hstatus)
{
struct lnet_fault_attr *attr = &rule->dr_attr;
bool drop;
@@ -306,9 +349,23 @@ struct lnet_drop_rule {
if (!lnet_fault_attr_match(attr, src, dst, type, portal))
return false;
+ /* if we're trying to match a health status error but it hasn't
+ * been set in the rule, then don't match
+ */
+ if ((hstatus && !attr->u.drop.da_health_error_mask) ||
+ (!hstatus && attr->u.drop.da_health_error_mask))
+ return false;
+
/* match this rule, check drop rate now */
spin_lock(&rule->dr_lock);
- if (rule->dr_drop_time) { /* time based drop */
+ if (attr->u.drop.da_random) {
+ int value = prandom_u32_max(attr->u.drop.da_interval);
+
+ if (value >= (attr->u.drop.da_interval / 2))
+ drop = true;
+ else
+ drop = false;
+ } else if (rule->dr_drop_time) { /* time based drop */
time64_t now = ktime_get_seconds();
rule->dr_stat.fs_count++;
@@ -340,6 +397,9 @@ struct lnet_drop_rule {
}
if (drop) { /* drop this message, update counters */
+ if (hstatus)
+ lnet_fault_match_health(hstatus,
+ attr->u.drop.da_health_error_mask);
lnet_fault_stat_inc(&rule->dr_stat, type);
rule->dr_stat.u.drop.ds_dropped++;
}
@@ -352,12 +412,12 @@ struct lnet_drop_rule {
* Check if message from @src to @dst can match any existed drop rule
*/
bool
-lnet_drop_rule_match(struct lnet_hdr *hdr)
+lnet_drop_rule_match(struct lnet_hdr *hdr, enum lnet_msg_hstatus *hstatus)
{
- struct lnet_drop_rule *rule;
lnet_nid_t src = le64_to_cpu(hdr->src_nid);
lnet_nid_t dst = le64_to_cpu(hdr->dest_nid);
unsigned int typ = le32_to_cpu(hdr->type);
+ struct lnet_drop_rule *rule;
unsigned int ptl = -1;
bool drop = false;
int cpt;
@@ -373,12 +433,13 @@ struct lnet_drop_rule {
cpt = lnet_net_lock_current();
list_for_each_entry(rule, &the_lnet.ln_drop_rules, dr_link) {
- drop = drop_rule_match(rule, src, dst, typ, ptl);
+ drop = drop_rule_match(rule, src, dst, typ, ptl,
+ hstatus);
if (drop)
break;
}
-
lnet_net_unlock(cpt);
+
return drop;
}