@@ -199,6 +199,7 @@ typedef struct osm_subn_opt {
char *root_guid_file;
char *cn_guid_file;
char *io_guid_file;
+ boolean_t port_shifting;
uint16_t max_reverse_hops;
char *ids_guid_file;
char *guid_routing_order_file;
@@ -418,6 +419,9 @@ typedef struct osm_subn_opt {
* Name of the file that contains list of I/O node guids that
* will be used by fat-tree routing (provided by User)
*
+* port_shifting
+* If TRUE, enables port shifting: routing attempts to shift port routes around to remove alignment problems in routing tables.
+*
* ids_guid_file
* Name of the file that contains list of ids which should be
* used by Up/Down algorithm instead of node GUIDs
@@ -919,7 +919,8 @@ uint8_t osm_switch_recommend_path(IN const osm_switch_t * p_sw,
IN unsigned start_from,
IN boolean_t ignore_existing,
IN boolean_t routing_for_lmc,
- IN boolean_t dor);
+ IN boolean_t dor,
+ IN boolean_t port_shifting);
/*
* PARAMETERS
* p_sw
@@ -955,6 +956,9 @@ uint8_t osm_switch_recommend_path(IN const osm_switch_t * p_sw,
* dor
* [in] If TRUE, Dimension Order Routing will be done.
*
+* port_shifting
+* [in] If TRUE, port shifting is applied when choosing among the candidate ports.
+*
* RETURN VALUE
* Returns the recommended port on which to route this LID.
*
@@ -25,6 +25,7 @@ opensm \- InfiniBand subnet manager and administration (SM/SA)
[\-a | \-\-root_guid_file <path to file>]
[\-u | \-\-cn_guid_file <path to file>]
[\-G | \-\-io_guid_file <path to file>]
+[\-\-port\-shifting]
[\-H | \-\-max_reverse_hops <max reverse hops allowed>]
[\-X | \-\-guid_routing_order_file <path to file>]
[\-m | \-\-ids_guid_file <path to file>]
@@ -208,6 +209,13 @@ to the guids provided in the given file (one to a line).
I/O nodes are non-CN nodes allowed to use up to max_reverse_hops switches
the wrong way around to improve connectivity.
.TP
+\fB\-\-port\-shifting\fR
+This option enables a feature called \fBport shifting\fR. In some
+fabrics, particularly cluster environments, routes commonly align and
+congest with other routes due to algorithmically unchanging traffic
+patterns. This routing option will "shift" routing around in an
+attempt to alleviate this problem.
+.TP
\fB\-H\fR, \fB\-\-max_reverse_hops\fR <file name>
Set the maximum number of reverse hops an I/O node is allowed
to make. A reverse hop is the use of a switch the wrong way around.
@@ -223,6 +223,9 @@ static void show_usage(void)
printf("--io_guid_file, -G <path to file>\n"
" Set the I/O nodes for the Fat-Tree routing algorithm\n"
" to the guids provided in the given file (one to a line)\n\n");
+ printf("--port-shifting\n"
+ " Attempt to shift port routes around to remove alignment problems\n"
+ " in routing tables\n\n");
printf("--max_reverse_hops, -H <hop_count>\n"
" Set the max number of hops the wrong way around\n"
" an I/O node is allowed to do (connectivity for I/O nodes on top swithces)\n\n");
@@ -601,6 +604,7 @@ int main(int argc, char *argv[])
{"root_guid_file", 1, NULL, 'a'},
{"cn_guid_file", 1, NULL, 'u'},
{"io_guid_file", 1, NULL, 'G'},
+ {"port-shifting", 0, NULL, 11},
{"max_reverse_hops", 1, NULL, 'H'},
{"ids_guid_file", 1, NULL, 'm'},
{"guid_routing_order_file", 1, NULL, 'X'},
@@ -943,6 +947,10 @@ int main(int argc, char *argv[])
opt.io_guid_file = optarg;
printf(" I/O Node Guid File: %s\n", opt.io_guid_file);
break;
+ case 11:
+ opt.port_shifting = TRUE;
+ printf(" Port Shifting is on\n");
+ break;
case 'H':
opt.max_reverse_hops = atoi(optarg);
printf(" Max Reverse Hops: %d\n", opt.max_reverse_hops);
@@ -221,7 +221,8 @@ static void dump_ucast_routes(cl_map_item_t * item, FILE * file, void *cxt)
/* No LMC Optimization */
best_port = osm_switch_recommend_path(p_sw, p_port,
lid_ho, 1, TRUE,
- FALSE, dor);
+ FALSE, dor,
+ p_osm->subn.opt.port_shifting);
fprintf(file, "No %u hop path possible via port %u!",
best_hops, best_port);
}
@@ -347,6 +347,7 @@ static const opt_rec_t opt_tbl[] = {
{ "root_guid_file", OPT_OFFSET(root_guid_file), opts_parse_charp, NULL, 0 },
{ "cn_guid_file", OPT_OFFSET(cn_guid_file), opts_parse_charp, NULL, 0 },
{ "io_guid_file", OPT_OFFSET(io_guid_file), opts_parse_charp, NULL, 0 },
+ { "port_shifting", OPT_OFFSET(port_shifting), opts_parse_boolean, NULL, 1 },
{ "max_reverse_hops", OPT_OFFSET(max_reverse_hops), opts_parse_uint16, NULL, 0 },
{ "ids_guid_file", OPT_OFFSET(ids_guid_file), opts_parse_charp, NULL, 0 },
{ "guid_routing_order_file", OPT_OFFSET(guid_routing_order_file), opts_parse_charp, NULL, 0 },
@@ -740,6 +741,7 @@ void osm_subn_set_default_opt(IN osm_subn_opt_t * p_opt)
p_opt->root_guid_file = NULL;
p_opt->cn_guid_file = NULL;
p_opt->io_guid_file = NULL;
+ p_opt->port_shifting = FALSE;
p_opt->max_reverse_hops = 0;
p_opt->ids_guid_file = NULL;
p_opt->guid_routing_order_file = NULL;
@@ -1440,6 +1442,11 @@ int osm_subn_output_conf(FILE *out, IN osm_subn_opt_t * p_opts)
p_opts->lash_start_vl);
fprintf(out,
+ "# Port Shifting (use FALSE if unsure)\n"
+ "port_shifting %s\n\n",
+ p_opts->port_shifting ? "TRUE" : "FALSE");
+
+ fprintf(out,
"# SA database file name\nsa_db_file %s\n\n",
p_opts->sa_db_file ? p_opts->sa_db_file : null_str);
@@ -51,6 +51,14 @@
#include <iba/ib_types.h>
#include <opensm/osm_switch.h>
+struct switch_port_path {	/* one candidate port recorded by osm_switch_recommend_path() for port shifting */
+ uint8_t port_num;	/* port number of the candidate */
+ uint32_t path_count;	/* path count read from the port profile for this port */
+ int found_sys_guid;	/* filled in only when routing for LMC: non-zero if the system GUID lookup found an entry */
+ int found_node_guid;	/* filled in only when routing for LMC: non-zero if the node GUID lookup found an entry */
+ uint32_t forwarded_to;	/* forwarded_to of the matched remote GUID entry when routing for LMC, otherwise 0 */
+};
+
cl_status_t osm_switch_set_hops(IN osm_switch_t * p_sw, IN uint16_t lid_ho,
IN uint8_t port_num, IN uint8_t num_hops)
{
@@ -217,7 +225,8 @@ uint8_t osm_switch_recommend_path(IN const osm_switch_t * p_sw,
IN unsigned start_from,
IN boolean_t ignore_existing,
IN boolean_t routing_for_lmc,
- IN boolean_t dor)
+ IN boolean_t dor,
+ IN boolean_t port_shifting)
{
/*
We support an enhanced LMC aware routing mode:
@@ -259,6 +268,11 @@ uint8_t osm_switch_recommend_path(IN const osm_switch_t * p_sw,
osm_node_t *p_rem_node_first = NULL;
struct osm_remote_node *p_remote_guid = NULL;
struct osm_remote_node null_remote_node = {NULL, 0, 0};
+ struct switch_port_path port_paths[IB_NODE_NUM_PORTS_MAX];
+ unsigned int port_paths_total_paths = 0;
+ unsigned int port_paths_count = 0;
+ int found_sys_guid = 0;	/* start defined: not every LMC branch assigns both flags before they are copied into port_paths[] */
+ int found_node_guid = 0;	/* in particular, the "system GUID not found" branch never assigns this one */
CL_ASSERT(lid_ho > 0);
@@ -369,6 +383,7 @@ uint8_t osm_switch_recommend_path(IN const osm_switch_t * p_sw,
check_count =
osm_port_prof_path_count_get(&p_sw->p_prof[port_num]);
+
if (dor) {
/* Get the Remote Node */
p_rem_physp = osm_physp_get_remote(p_physp);
@@ -412,7 +427,10 @@ uint8_t osm_switch_recommend_path(IN const osm_switch_t * p_sw,
best_port_other_sys = port_num;
least_forwarded_to = 0;
}
+ found_sys_guid = 0;
} else { /* same sys found - try node */
+
+
/* Else is the node guid already used ? */
p_remote_guid = switch_find_node_guid_count(p_sw,
p_port->priv,
@@ -427,9 +445,27 @@ uint8_t osm_switch_recommend_path(IN const osm_switch_t * p_sw,
}
/* else prior sys and node guid already used */
+ if (!p_remote_guid)
+ found_node_guid = 0;
+ else
+ found_node_guid = 1;
+ found_sys_guid = 1;
} /* same sys found */
}
+ port_paths[port_paths_count].port_num = port_num;
+ port_paths[port_paths_count].path_count = check_count;
+ if (routing_for_lmc) {
+ port_paths[port_paths_count].found_sys_guid = found_sys_guid;
+ port_paths[port_paths_count].found_node_guid = found_node_guid;
+ }
+ if (routing_for_lmc && p_remote_guid)
+ port_paths[port_paths_count].forwarded_to = p_remote_guid->forwarded_to;
+ else
+ port_paths[port_paths_count].forwarded_to = 0;
+ port_paths_total_paths += check_count;
+ port_paths_count++;
+
/* routing for LMC mode */
/*
the count is min but also lower then the max subscribed
@@ -454,6 +490,66 @@ uint8_t osm_switch_recommend_path(IN const osm_switch_t * p_sw,
if (port_found == FALSE)
return OSM_NO_PATH;
+ if (port_shifting && port_paths_count) {
+ /* In the port_paths[] array, we now have all the ports that we
+ * can route out of. Using some shifting math below, possibly
+ * select a different one so that lids won't align in LFTs
+ *
+ * If lmc > 0, we need to loop through these ports to find the
+ * least_forwarded_to port, best_port_other_sys, and
+ * best_port_other_node just like before but through the different
+ * ordering.
+ */
+
+ least_paths = 0xFFFFFFFF;
+ least_paths_other_sys = 0xFFFFFFFF;
+ least_paths_other_nodes = 0xFFFFFFFF;
+ least_forwarded_to = 0xFFFFFFFF;
+ best_port = 0;
+ best_port_other_sys = 0;
+ best_port_other_node = 0;
+
+ for (i = 0; i < port_paths_count; i++) {
+ unsigned int idx;
+
+ idx = (port_paths_total_paths/port_paths_count + i) % port_paths_count;
+
+ if (routing_for_lmc) {
+ if (!port_paths[idx].found_sys_guid
+ && port_paths[idx].path_count < least_paths_other_sys) {
+ least_paths_other_sys = port_paths[idx].path_count;
+ best_port_other_sys = port_paths[idx].port_num;
+ least_forwarded_to = 0;
+ }
+ else if (!port_paths[idx].found_node_guid
+ && port_paths[idx].path_count < least_paths_other_nodes) {
+ least_paths_other_nodes = port_paths[idx].path_count;
+ best_port_other_node = port_paths[idx].port_num;
+ least_forwarded_to = 0;
+ }
+ }
+
+ if (port_paths[idx].path_count < least_paths) {
+ best_port = port_paths[idx].port_num;
+ least_paths = port_paths[idx].path_count;
+ if (routing_for_lmc
+ && (port_paths[idx].found_sys_guid
+ || port_paths[idx].found_node_guid)
+ && port_paths[idx].forwarded_to < least_forwarded_to)
+ least_forwarded_to = port_paths[idx].forwarded_to;
+ }
+ else if (routing_for_lmc
+ && (port_paths[idx].found_sys_guid
+ || port_paths[idx].found_node_guid)
+ && port_paths[idx].path_count == least_paths
+ && port_paths[idx].forwarded_to < least_forwarded_to) {
+ least_forwarded_to = port_paths[idx].forwarded_to;
+ best_port = port_paths[idx].port_num;
+ }
+
+ }
+ }
+
/*
if we are in enhanced routing mode and the best port is not
the local port 0
@@ -255,7 +255,8 @@ static void ucast_mgr_process_port(IN osm_ucast_mgr_t * p_mgr,
port = osm_switch_recommend_path(p_sw, p_port, lid_ho, start_from,
p_mgr->p_subn->ignore_existing_lfts,
p_mgr->p_subn->opt.lmc,
- p_mgr->is_dor);
+ p_mgr->is_dor,
+ p_mgr->p_subn->opt.port_shifting);
if (port == OSM_NO_PATH) {
/* do not try to overwrite the ppro of non existing port ... */