xref: /trueos/contrib/ofed/management/opensm/opensm/osm_switch.c (revision 8fe640108653f13042f1b15213769e338aa524f6)
1 /*
2  * Copyright (c) 2004-2008 Voltaire, Inc. All rights reserved.
3  * Copyright (c) 2002-2008 Mellanox Technologies LTD. All rights reserved.
4  * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
5  *
6  * This software is available to you under a choice of one of two
7  * licenses.  You may choose to be licensed under the terms of the GNU
8  * General Public License (GPL) Version 2, available from the file
9  * COPYING in the main directory of this source tree, or the
10  * OpenIB.org BSD license below:
11  *
12  *     Redistribution and use in source and binary forms, with or
13  *     without modification, are permitted provided that the following
14  *     conditions are met:
15  *
16  *      - Redistributions of source code must retain the above
17  *        copyright notice, this list of conditions and the following
18  *        disclaimer.
19  *
20  *      - Redistributions in binary form must reproduce the above
21  *        copyright notice, this list of conditions and the following
22  *        disclaimer in the documentation and/or other materials
23  *        provided with the distribution.
24  *
25  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
26  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
27  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
28  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
29  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
30  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
31  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
32  * SOFTWARE.
33  *
34  */
35 
36 /*
37  * Abstract:
38  *    Implementation of osm_switch_t.
39  * This object represents an Infiniband switch.
40  * This object is part of the opensm family of objects.
41  */
42 
43 #if HAVE_CONFIG_H
44 #  include <config.h>
45 #endif				/* HAVE_CONFIG_H */
46 
47 #include <stdlib.h>
48 #include <string.h>
49 #include <complib/cl_math.h>
50 #include <iba/ib_types.h>
51 #include <opensm/osm_switch.h>
52 
53 /**********************************************************************
54  **********************************************************************/
55 cl_status_t
osm_switch_set_hops(IN osm_switch_t * const p_sw,IN const uint16_t lid_ho,IN const uint8_t port_num,IN const uint8_t num_hops)56 osm_switch_set_hops(IN osm_switch_t * const p_sw,
57 		    IN const uint16_t lid_ho,
58 		    IN const uint8_t port_num, IN const uint8_t num_hops)
59 {
60 	if (lid_ho > p_sw->max_lid_ho)
61 		return -1;
62 	if (!p_sw->hops[lid_ho]) {
63 		p_sw->hops[lid_ho] = malloc(p_sw->num_ports);
64 		if (!p_sw->hops[lid_ho])
65 			return -1;
66 		memset(p_sw->hops[lid_ho], OSM_NO_PATH, p_sw->num_ports);
67 	}
68 
69 	p_sw->hops[lid_ho][port_num] = num_hops;
70 	if (p_sw->hops[lid_ho][0] > num_hops)
71 		p_sw->hops[lid_ho][0] = num_hops;
72 
73 	return 0;
74 }
75 
76 /**********************************************************************
77  **********************************************************************/
78 static ib_api_status_t
osm_switch_init(IN osm_switch_t * const p_sw,IN osm_node_t * const p_node,IN const osm_madw_t * const p_madw)79 osm_switch_init(IN osm_switch_t * const p_sw,
80 		IN osm_node_t * const p_node,
81 		IN const osm_madw_t * const p_madw)
82 {
83 	ib_api_status_t status = IB_SUCCESS;
84 	ib_switch_info_t *p_si;
85 	ib_smp_t *p_smp;
86 	uint8_t num_ports;
87 	uint32_t port_num;
88 
89 	p_smp = osm_madw_get_smp_ptr(p_madw);
90 	p_si = (ib_switch_info_t *) ib_smp_get_payload_ptr(p_smp);
91 	num_ports = osm_node_get_num_physp(p_node);
92 
93 	CL_ASSERT(p_smp->attr_id == IB_MAD_ATTR_SWITCH_INFO);
94 
95 	p_sw->p_node = p_node;
96 	p_sw->switch_info = *p_si;
97 	p_sw->num_ports = num_ports;
98 	p_sw->need_update = 2;
99 
100 	/* Initiate the linear forwarding table */
101 
102 	if (!p_si->lin_cap) {
103 		/* This switch does not support linear forwarding tables */
104 		status = IB_UNSUPPORTED;
105 		goto Exit;
106 	}
107 
108 	p_sw->lft = malloc(IB_LID_UCAST_END_HO + 1);
109 	if (!p_sw->lft) {
110 		status = IB_INSUFFICIENT_MEMORY;
111 		goto Exit;
112 	}
113 
114 	/* Initialize the table to OSM_NO_PATH, which is "invalid port" */
115 	memset(p_sw->lft, OSM_NO_PATH, IB_LID_UCAST_END_HO + 1);
116 
117 	p_sw->p_prof = malloc(sizeof(*p_sw->p_prof) * num_ports);
118 	if (p_sw->p_prof == NULL) {
119 		status = IB_INSUFFICIENT_MEMORY;
120 		goto Exit;
121 	}
122 
123 	memset(p_sw->p_prof, 0, sizeof(*p_sw->p_prof) * num_ports);
124 
125 	status = osm_mcast_tbl_init(&p_sw->mcast_tbl,
126 				    osm_node_get_num_physp(p_node),
127 				    cl_ntoh16(p_si->mcast_cap));
128 	if (status != IB_SUCCESS)
129 		goto Exit;
130 
131 	for (port_num = 0; port_num < num_ports; port_num++)
132 		osm_port_prof_construct(&p_sw->p_prof[port_num]);
133 
134 Exit:
135 	return (status);
136 }
137 
138 /**********************************************************************
139  **********************************************************************/
/*
 * Destroy a switch object and everything it owns, then clear the caller's
 * pointer.
 *
 * Releases the multicast table, the port profile array, both forwarding
 * tables and every row of the hop matrix.  Safe to call with a NULL pp_sw
 * or when *pp_sw is already NULL; in both cases nothing happens.
 */
void osm_switch_delete(IN OUT osm_switch_t ** const pp_sw)
{
	osm_switch_t *p_sw;
	unsigned i;

	if (!pp_sw || !*pp_sw)
		return;
	p_sw = *pp_sw;

	osm_mcast_tbl_destroy(&p_sw->mcast_tbl);

	/* free(NULL) is a no-op, so no per-pointer guards are needed. */
	free(p_sw->p_prof);
	free(p_sw->lft);
	free(p_sw->new_lft);

	if (p_sw->hops) {
		for (i = 0; i < p_sw->num_hops; i++)
			free(p_sw->hops[i]);
		free(p_sw->hops);
	}

	free(p_sw);
	*pp_sw = NULL;
}
160 
161 /**********************************************************************
162  **********************************************************************/
osm_switch_new(IN osm_node_t * const p_node,IN const osm_madw_t * const p_madw)163 osm_switch_t *osm_switch_new(IN osm_node_t * const p_node,
164 			     IN const osm_madw_t * const p_madw)
165 {
166 	ib_api_status_t status;
167 	osm_switch_t *p_sw;
168 
169 	CL_ASSERT(p_madw);
170 	CL_ASSERT(p_node);
171 
172 	p_sw = (osm_switch_t *) malloc(sizeof(*p_sw));
173 	if (p_sw) {
174 		memset(p_sw, 0, sizeof(*p_sw));
175 		status = osm_switch_init(p_sw, p_node, p_madw);
176 		if (status != IB_SUCCESS)
177 			osm_switch_delete(&p_sw);
178 	}
179 
180 	return (p_sw);
181 }
182 
183 /**********************************************************************
184  **********************************************************************/
185 boolean_t
osm_switch_get_lft_block(IN const osm_switch_t * const p_sw,IN const uint16_t block_id,OUT uint8_t * const p_block)186 osm_switch_get_lft_block(IN const osm_switch_t * const p_sw,
187 			 IN const uint16_t block_id,
188 			 OUT uint8_t * const p_block)
189 {
190 	uint16_t base_lid_ho = block_id * IB_SMP_DATA_SIZE;
191 
192 	CL_ASSERT(p_sw);
193 	CL_ASSERT(p_block);
194 
195 	if (base_lid_ho > p_sw->max_lid_ho)
196 		return FALSE;
197 
198 	CL_ASSERT(base_lid_ho + IB_SMP_DATA_SIZE <= IB_LID_UCAST_END_HO);
199 	memcpy(p_block, &(p_sw->lft[base_lid_ho]), IB_SMP_DATA_SIZE);
200 	return TRUE;
201 }
202 
203 /**********************************************************************
204  **********************************************************************/
205 static struct osm_remote_node *
osm_switch_find_guid_common(IN const osm_switch_t * const p_sw,IN struct osm_remote_guids_count * r,IN uint8_t port_num,IN int find_sys_guid,IN int find_node_guid)206 osm_switch_find_guid_common(IN const osm_switch_t * const p_sw,
207 			    IN struct osm_remote_guids_count *r,
208 			    IN uint8_t port_num,
209 			    IN int find_sys_guid,
210 			    IN int find_node_guid)
211 {
212 	struct osm_remote_node *p_remote_guid = NULL;
213 	osm_physp_t *p_physp;
214 	osm_physp_t *p_rem_physp;
215 	osm_node_t *p_rem_node;
216 	uint64_t sys_guid;
217 	uint64_t node_guid;
218 	int i;
219 
220 	CL_ASSERT(p_sw);
221 
222 	p_physp = osm_node_get_physp_ptr(p_sw->p_node, port_num);
223 	p_rem_physp = osm_physp_get_remote(p_physp);
224 	p_rem_node = osm_physp_get_node_ptr(p_rem_physp);
225 	sys_guid = p_rem_node->node_info.sys_guid;
226 	node_guid = p_rem_node->node_info.node_guid;
227 
228 	for (i = 0; i < r->count; i++) {
229 		if ((!find_sys_guid
230 		     || r->guids[i].node->node_info.sys_guid == sys_guid)
231 		    && (!find_node_guid
232 			|| r->guids[i].node->node_info.node_guid == node_guid)) {
233 			p_remote_guid = &r->guids[i];
234 			break;
235 		}
236 	}
237 
238 	return p_remote_guid;
239 }
240 
241 static struct osm_remote_node *
osm_switch_find_sys_guid_count(IN const osm_switch_t * const p_sw,IN struct osm_remote_guids_count * r,IN uint8_t port_num)242 osm_switch_find_sys_guid_count(IN const osm_switch_t * const p_sw,
243 			       IN struct osm_remote_guids_count *r,
244 			       IN uint8_t port_num)
245 {
246 	return osm_switch_find_guid_common(p_sw, r, port_num, 1, 0);
247 }
248 
249 static struct osm_remote_node *
osm_switch_find_node_guid_count(IN const osm_switch_t * const p_sw,IN struct osm_remote_guids_count * r,IN uint8_t port_num)250 osm_switch_find_node_guid_count(IN const osm_switch_t * const p_sw,
251 				IN struct osm_remote_guids_count *r,
252 				IN uint8_t port_num)
253 {
254 	return osm_switch_find_guid_common(p_sw, r, port_num, 0, 1);
255 }
256 
257 /**********************************************************************
258  **********************************************************************/
259 uint8_t
osm_switch_recommend_path(IN const osm_switch_t * const p_sw,IN osm_port_t * p_port,IN const uint16_t lid_ho,IN unsigned start_from,IN const boolean_t ignore_existing,IN const boolean_t dor)260 osm_switch_recommend_path(IN const osm_switch_t * const p_sw,
261 			  IN osm_port_t * p_port,
262 			  IN const uint16_t lid_ho,
263 			  IN unsigned start_from,
264 			  IN const boolean_t ignore_existing,
265 			  IN const boolean_t dor)
266 {
267 	/*
268 	   We support an enhanced LMC aware routing mode:
269 	   In the case of LMC > 0, we can track the remote side
270 	   system and node for all of the lids of the target
271 	   and try and avoid routing again through the same
272 	   system / node.
273 
274 	   If this procedure is provided with the tracking array
275 	   and counter we can conduct this algorithm.
276 	 */
277 	boolean_t routing_for_lmc = (p_port->priv != NULL);
278 	uint16_t base_lid;
279 	uint8_t hops;
280 	uint8_t least_hops;
281 	uint8_t port_num;
282 	uint8_t num_ports;
283 	uint32_t least_paths = 0xFFFFFFFF;
284 	unsigned i;
285 	/*
286 	   The follwing will track the least paths if the
287 	   route should go through a new system/node
288 	 */
289 	uint32_t least_paths_other_sys = 0xFFFFFFFF;
290 	uint32_t least_paths_other_nodes = 0xFFFFFFFF;
291 	uint32_t least_forwarded_to = 0xFFFFFFFF;
292 	uint32_t check_count;
293 	uint8_t best_port = 0;
294 	/*
295 	   These vars track the best port if it connects to
296 	   not used system/node.
297 	 */
298 	uint8_t best_port_other_sys = 0;
299 	uint8_t best_port_other_node = 0;
300 	boolean_t port_found = FALSE;
301 	osm_physp_t *p_physp;
302 	osm_physp_t *p_rem_physp;
303 	osm_node_t *p_rem_node;
304 	osm_node_t *p_rem_node_first = NULL;
305 	struct osm_remote_node *p_remote_guid = NULL;
306 
307 	CL_ASSERT(lid_ho > 0);
308 
309 	if (p_port->p_node->sw) {
310 		if (p_port->p_node->sw == p_sw)
311 			return 0;
312 		base_lid = osm_port_get_base_lid(p_port);
313 	} else {
314 		p_physp = p_port->p_physp;
315 		if (!p_physp || !p_physp->p_remote_physp ||
316 		    !p_physp->p_remote_physp->p_node->sw)
317 			return OSM_NO_PATH;
318 
319 		if (p_physp->p_remote_physp->p_node->sw == p_sw)
320 			return p_physp->p_remote_physp->port_num;
321 		base_lid =
322 		    osm_node_get_base_lid(p_physp->p_remote_physp->p_node, 0);
323 	}
324 	base_lid = cl_ntoh16(base_lid);
325 
326 	num_ports = p_sw->num_ports;
327 
328 	least_hops = osm_switch_get_least_hops(p_sw, base_lid);
329 	if (least_hops == OSM_NO_PATH)
330 		return (OSM_NO_PATH);
331 
332 	/*
333 	   First, inquire with the forwarding table for an existing
334 	   route.  If one is found, honor it unless:
335 	   1. the ignore existing flag is set.
336 	   2. the physical port is not a valid one or not healthy
337 	   3. the physical port has a remote port (the link is up)
338 	   4. the port has min-hops to the target (avoid loops)
339 	 */
340 	if (!ignore_existing) {
341 		port_num = osm_switch_get_port_by_lid(p_sw, lid_ho);
342 
343 		if (port_num != OSM_NO_PATH) {
344 			CL_ASSERT(port_num < num_ports);
345 
346 			p_physp =
347 			    osm_node_get_physp_ptr(p_sw->p_node, port_num);
348 			/*
349 			   Don't be too trusting of the current forwarding table!
350 			   Verify that the port number is legal and that the
351 			   LID is reachable through this port.
352 			 */
353 			if (p_physp && osm_physp_is_healthy(p_physp) &&
354 			    osm_physp_get_remote(p_physp)) {
355 				hops =
356 				    osm_switch_get_hop_count(p_sw, base_lid,
357 							     port_num);
358 				/*
359 				   If we aren't using pre-defined user routes
360 				   function, then we need to make sure that the
361 				   current path is the minimum one. In case of
362 				   having such a user function - this check will
363 				   not be done, and the old routing will be used.
364 				   Note: This means that it is the user's job to
365 				   clean all data in the forwarding tables that
366 				   he wants to be overridden by the minimum
367 				   hop function.
368 				 */
369 				if (hops == least_hops)
370 					return (port_num);
371 			}
372 		}
373 	}
374 
375 	/*
376 	   This algorithm selects a port based on a static load balanced
377 	   selection across equal hop-count ports.
378 	   There is lots of room for improved sophistication here,
379 	   possibly guided by user configuration info.
380 	 */
381 
382 	/*
383 	   OpenSM routing is "local" - not considering a full lid to lid
384 	   path. As such we can not guarantee a path will not loop if we
385 	   do not always follow least hops.
386 	   So we must abort if not least hops.
387 	 */
388 
389 	/* port number starts with one and num_ports is 1 + num phys ports */
390 	for (i = start_from; i < start_from + num_ports; i++) {
391 		port_num = i%num_ports;
392 		if (!port_num ||
393 		    osm_switch_get_hop_count(p_sw, base_lid, port_num) !=
394 		    least_hops)
395 			continue;
396 
397 		/* let us make sure it is not down or unhealthy */
398 		p_physp = osm_node_get_physp_ptr(p_sw->p_node, port_num);
399 		if (!p_physp || !osm_physp_is_healthy(p_physp) ||
400 		    /*
401 		       we require all - non sma ports to be linked
402 		       to be routed through
403 		     */
404 		    !osm_physp_get_remote(p_physp))
405 			continue;
406 
407 		/*
408 		   We located a least-hop port, possibly one of many.
409 		   For this port, check the running total count of
410 		   the number of paths through this port.  Select
411 		   the port routing the least number of paths.
412 		 */
413 		check_count =
414 		    osm_port_prof_path_count_get(&p_sw->p_prof[port_num]);
415 
416 		/*
417 		   Advanced LMC routing requires tracking of the
418 		   best port by the node connected to the other side of
419 		   it.
420 		 */
421 		if (routing_for_lmc) {
422 			/* Is the sys guid already used ? */
423 			p_remote_guid = osm_switch_find_sys_guid_count(p_sw,
424 								       p_port->priv,
425 								       port_num);
426 
427 			/* If not update the least hops for this case */
428 			if (!p_remote_guid) {
429 				if (check_count < least_paths_other_sys) {
430 					least_paths_other_sys = check_count;
431 					best_port_other_sys = port_num;
432 					least_forwarded_to = 0;
433 				}
434 			} else {	/* same sys found - try node */
435 				/* Else is the node guid already used ? */
436 				p_remote_guid = osm_switch_find_node_guid_count(p_sw,
437 										p_port->priv,
438 										port_num);
439 
440 				/* If not update the least hops for this case */
441 				if (!p_remote_guid
442 				    && check_count < least_paths_other_nodes) {
443 					least_paths_other_nodes = check_count;
444 					best_port_other_node = port_num;
445 					least_forwarded_to = 0;
446 				}
447 				/* else prior sys and node guid already used */
448 
449 			}	/* same sys found */
450 		}
451 
452 		/* routing for LMC mode */
453 		/*
454 		   the count is min but also lower then the max subscribed
455 		 */
456 		if (check_count < least_paths) {
457 			if (dor) {
458 				/* Get the Remote Node */
459 				p_rem_physp = osm_physp_get_remote(p_physp);
460 				p_rem_node =
461 				    osm_physp_get_node_ptr(p_rem_physp);
462 				/* use the first dimension, but spread
463 				 * traffic out among the group of ports
464 				 * representing that dimension */
465 				if (port_found) {
466 					if (p_rem_node != p_rem_node_first)
467 						continue;
468 				} else
469 					p_rem_node_first = p_rem_node;
470 			}
471 			port_found = TRUE;
472 			best_port = port_num;
473 			least_paths = check_count;
474 			if (routing_for_lmc
475 			    && p_remote_guid
476 			    && p_remote_guid->forwarded_to < least_forwarded_to)
477 				least_forwarded_to = p_remote_guid->forwarded_to;
478 		} else if (routing_for_lmc
479 			   && p_remote_guid
480 			   && check_count == least_paths
481 			   && p_remote_guid->forwarded_to < least_forwarded_to) {
482 			least_forwarded_to = p_remote_guid->forwarded_to;
483 			best_port = port_num;
484 		}
485 	}
486 
487 	if (port_found == FALSE)
488 		return (OSM_NO_PATH);
489 
490 	/*
491 	   if we are in enhanced routing mode and the best port is not
492 	   the local port 0
493 	 */
494 	if (routing_for_lmc && best_port) {
495 		/* Select the least hop port of the non used sys first */
496 		if (best_port_other_sys)
497 			best_port = best_port_other_sys;
498 		else if (best_port_other_node)
499 			best_port = best_port_other_node;
500 	}
501 
502 	return (best_port);
503 }
504 
505 /**********************************************************************
506  **********************************************************************/
/*
 * Reset every allocated row of the hop matrix to OSM_NO_PATH.
 * Rows that were never allocated are skipped.
 */
void osm_switch_clear_hops(IN osm_switch_t * p_sw)
{
	unsigned lid = 0;

	while (lid < p_sw->num_hops) {
		uint8_t *p_row = p_sw->hops[lid];

		if (p_row)
			memset(p_row, OSM_NO_PATH, p_sw->num_ports);
		lid++;
	}
}
515 
516 /**********************************************************************
517  **********************************************************************/
518 int
osm_switch_prepare_path_rebuild(IN osm_switch_t * p_sw,IN uint16_t max_lids)519 osm_switch_prepare_path_rebuild(IN osm_switch_t * p_sw, IN uint16_t max_lids)
520 {
521 	uint8_t **hops;
522 	unsigned i;
523 
524 	for (i = 0; i < p_sw->num_ports; i++)
525 		osm_port_prof_construct(&p_sw->p_prof[i]);
526 
527 	osm_switch_clear_hops(p_sw);
528 
529 	if (!p_sw->new_lft &&
530 	    !(p_sw->new_lft = malloc(IB_LID_UCAST_END_HO + 1)))
531 		return IB_INSUFFICIENT_MEMORY;
532 
533 	memset(p_sw->new_lft, OSM_NO_PATH, IB_LID_UCAST_END_HO + 1);
534 
535 	if (!p_sw->hops) {
536 		hops = malloc((max_lids + 1) * sizeof(hops[0]));
537 		if (!hops)
538 			return -1;
539 		memset(hops, 0, (max_lids + 1) * sizeof(hops[0]));
540 		p_sw->hops = hops;
541 		p_sw->num_hops = max_lids + 1;
542 	} else if (max_lids + 1 > p_sw->num_hops) {
543 		uint8_t **old_hops;
544 
545 		hops = malloc((max_lids + 1) * sizeof(hops[0]));
546 		if (!hops)
547 			return -1;
548 		memcpy(hops, p_sw->hops, p_sw->num_hops * sizeof(hops[0]));
549 		memset(hops + p_sw->num_hops, 0,
550 		       (max_lids + 1 - p_sw->num_hops) * sizeof(hops[0]));
551 		old_hops = p_sw->hops;
552 		p_sw->hops = hops;
553 		p_sw->num_hops = max_lids + 1;
554 		free(old_hops);
555 	}
556 	p_sw->max_lid_ho = max_lids;
557 
558 	return 0;
559 }
560 
561 /**********************************************************************
562  **********************************************************************/
563 uint8_t
osm_switch_get_port_least_hops(IN const osm_switch_t * const p_sw,IN const osm_port_t * p_port)564 osm_switch_get_port_least_hops(IN const osm_switch_t * const p_sw,
565 			       IN const osm_port_t * p_port)
566 {
567 	uint16_t lid;
568 
569 	if (p_port->p_node->sw) {
570 		if (p_port->p_node->sw == p_sw)
571 			return 0;
572 		lid = osm_node_get_base_lid(p_port->p_node, 0);
573 		return osm_switch_get_least_hops(p_sw, cl_ntoh16(lid));
574 	} else {
575 		osm_physp_t *p = p_port->p_physp;
576 		uint8_t hops;
577 
578 		if (!p || !p->p_remote_physp || !p->p_remote_physp->p_node->sw)
579 			return OSM_NO_PATH;
580 		if (p->p_remote_physp->p_node->sw == p_sw)
581 			return 1;
582 		lid = osm_node_get_base_lid(p->p_remote_physp->p_node, 0);
583 		hops = osm_switch_get_least_hops(p_sw, cl_ntoh16(lid));
584 		return hops != OSM_NO_PATH ? hops + 1 : OSM_NO_PATH;
585 	}
586 }
587 
588 /**********************************************************************
589  **********************************************************************/
590 uint8_t
osm_switch_recommend_mcast_path(IN osm_switch_t * const p_sw,IN osm_port_t * p_port,IN uint16_t const mlid_ho,IN boolean_t const ignore_existing)591 osm_switch_recommend_mcast_path(IN osm_switch_t * const p_sw,
592 				IN osm_port_t * p_port,
593 				IN uint16_t const mlid_ho,
594 				IN boolean_t const ignore_existing)
595 {
596 	uint16_t base_lid;
597 	uint8_t hops;
598 	uint8_t port_num;
599 	uint8_t num_ports;
600 	uint8_t least_hops;
601 
602 	CL_ASSERT(mlid_ho >= IB_LID_MCAST_START_HO);
603 
604 	if (p_port->p_node->sw) {
605 		if (p_port->p_node->sw == p_sw)
606 			return 0;
607 		base_lid = osm_port_get_base_lid(p_port);
608 	} else {
609 		osm_physp_t *p_physp = p_port->p_physp;
610 		if (!p_physp || !p_physp->p_remote_physp ||
611 		    !p_physp->p_remote_physp->p_node->sw)
612 			return OSM_NO_PATH;
613 		if (p_physp->p_remote_physp->p_node->sw == p_sw)
614 			return p_physp->p_remote_physp->port_num;
615 		base_lid =
616 		    osm_node_get_base_lid(p_physp->p_remote_physp->p_node, 0);
617 	}
618 	base_lid = cl_ntoh16(base_lid);
619 	num_ports = p_sw->num_ports;
620 
621 	/*
622 	   If the user wants us to ignore existing multicast routes,
623 	   then simply return the shortest hop count path to the
624 	   target port.
625 
626 	   Otherwise, return the first port that has a path to the target,
627 	   picking from the ports that are already in the multicast group.
628 	 */
629 	if (!ignore_existing) {
630 		for (port_num = 1; port_num < num_ports; port_num++) {
631 			if (!osm_mcast_tbl_is_port
632 			    (&p_sw->mcast_tbl, mlid_ho, port_num))
633 				continue;
634 			/*
635 			   Don't be too trusting of the current forwarding table!
636 			   Verify that the LID is reachable through this port.
637 			 */
638 			hops =
639 			    osm_switch_get_hop_count(p_sw, base_lid, port_num);
640 			if (hops != OSM_NO_PATH)
641 				return (port_num);
642 		}
643 	}
644 
645 	/*
646 	   Either no existing mcast paths reach this port or we are
647 	   ignoring existing paths.
648 
649 	   Determine the best multicast path to the target.  Note that this
650 	   algorithm is slightly different from the one used for unicast route
651 	   recommendation.  In this case (multicast), we must NOT
652 	   perform any sort of load balancing.  We MUST take the FIRST
653 	   port found that has <= the lowest hop count path.  This prevents
654 	   more than one multicast path to the same remote switch which
655 	   prevents a multicast loop.  Multicast loops are bad since the same
656 	   multicast packet will go around and around, inevitably creating
657 	   a black hole that will destroy the Earth in a firey conflagration.
658 	 */
659 	least_hops = osm_switch_get_least_hops(p_sw, base_lid);
660 	for (port_num = 1; port_num < num_ports; port_num++)
661 		if (osm_switch_get_hop_count(p_sw, base_lid, port_num) ==
662 		    least_hops)
663 			break;
664 
665 	CL_ASSERT(port_num < num_ports);
666 	return (port_num);
667 }
668