Home | History | Annotate | Download | only in ip
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
     27 
     28 /*
     29  * This file contains consumer routines of the IPv4 forwarding engine
     30  */
     31 
     32 #include <sys/types.h>
     33 #include <sys/stream.h>
     34 #include <sys/stropts.h>
     35 #include <sys/strlog.h>
     36 #include <sys/dlpi.h>
     37 #include <sys/ddi.h>
     38 #include <sys/cmn_err.h>
     39 #include <sys/policy.h>
     40 
     41 #include <sys/systm.h>
     42 #include <sys/strsun.h>
     43 #include <sys/kmem.h>
     44 #include <sys/param.h>
     45 #include <sys/socket.h>
     46 #include <sys/strsubr.h>
     47 #include <sys/pattr.h>
     48 #include <net/if.h>
     49 #include <net/route.h>
     50 #include <netinet/in.h>
     51 #include <net/if_dl.h>
     52 #include <netinet/ip6.h>
     53 #include <netinet/icmp6.h>
     54 
     55 #include <inet/common.h>
     56 #include <inet/mi.h>
     57 #include <inet/mib2.h>
     58 #include <inet/ip.h>
     59 #include <inet/ip_impl.h>
     60 #include <inet/ip6.h>
     61 #include <inet/ip_ndp.h>
     62 #include <inet/arp.h>
     63 #include <inet/ip_if.h>
     64 #include <inet/ip_ire.h>
     65 #include <inet/ip_ftable.h>
     66 #include <inet/ip_rts.h>
     67 #include <inet/nd.h>
     68 
     69 #include <net/pfkeyv2.h>
     70 #include <inet/ipsec_info.h>
     71 #include <inet/sadb.h>
     72 #include <sys/kmem.h>
     73 #include <inet/tcp.h>
     74 #include <inet/ipclassifier.h>
     75 #include <sys/zone.h>
     76 #include <net/radix.h>
     77 #include <sys/tsol/label.h>
     78 #include <sys/tsol/tnet.h>
     79 
     80 #define	IS_DEFAULT_ROUTE(ire)	\
     81 	(((ire)->ire_type & IRE_DEFAULT) || \
     82 	    (((ire)->ire_type & IRE_INTERFACE) && ((ire)->ire_addr == 0)))
     83 
/*
 * Argument bundle handed by ire_ftable_lookup() to the radix-tree match
 * callback ire_find_best_route().  ire_ftable_lookup() zeroes the structure,
 * copies its own arguments into the ift_* input fields, and reads the
 * selected route back from ift_best_ire once the tree match returns.
 */
typedef struct ire_ftable_args_s {
	ipaddr_t	ift_addr;	/* in: destination address looked up */
	ipaddr_t	ift_mask;	/* in: caller-supplied mask */
	ipaddr_t	ift_gateway;	/* in: caller-supplied gateway */
	int		ift_type;	/* in: caller-supplied IRE type(s) */
	const ipif_t		*ift_ipif;	/* in: caller's ipif (may be NULL) */
	zoneid_t	ift_zoneid;	/* in: caller's zone id */
	uint32_t	ift_ihandle;	/* in: caller's interface handle */
	const ts_label_t	*ift_tsl;	/* in: caller's label (may be NULL) */
	int		ift_flags;	/* in: MATCH_IRE_* flags */
	ire_t		*ift_best_ire;	/* out: route chosen by the callback */
} ire_ftable_args_t;
     99 
    100 static ire_t	*route_to_dst(const struct sockaddr *, zoneid_t, ip_stack_t *);
    101 static ire_t   	*ire_round_robin(irb_t *, zoneid_t, ire_ftable_args_t *,
    102     ip_stack_t *);
    103 static void		ire_del_host_redir(ire_t *, char *);
    104 static boolean_t	ire_find_best_route(struct radix_node *, void *);
    105 static int	ip_send_align_hcksum_flags(mblk_t *, ill_t *);
    106 
    107 /*
    108  * Lookup a route in forwarding table. A specific lookup is indicated by
    109  * passing the required parameters and indicating the match required in the
    110  * flag field.
    111  *
    112  * Looking for default route can be done in three ways
    113  * 1) pass mask as 0 and set MATCH_IRE_MASK in flags field
    114  *    along with other matches.
    115  * 2) pass type as IRE_DEFAULT and set MATCH_IRE_TYPE in flags
    116  *    field along with other matches.
    117  * 3) if the destination and mask are passed as zeros.
    118  *
    119  * A request to return a default route if no route
    120  * is found, can be specified by setting MATCH_IRE_DEFAULT
    121  * in flags.
    122  *
    123  * It does not support recursion more than one level. It
    124  * will do recursive lookup only when the lookup maps to
    125  * a prefix or default route and MATCH_IRE_RECURSIVE flag is passed.
    126  *
    127  * If the routing table is setup to allow more than one level
    128  * of recursion, the cleaning up cache table will not work resulting
    129  * in invalid routing.
    130  *
    131  * Supports IP_BOUND_IF by following the ipif/ill when recursing.
    132  *
    133  * NOTE : When this function returns NULL, pire has already been released.
    134  *	  pire is valid only when this function successfully returns an
    135  *	  ire.
    136  */
    137 ire_t *
    138 ire_ftable_lookup(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway,
    139     int type, const ipif_t *ipif, ire_t **pire, zoneid_t zoneid,
    140     uint32_t ihandle, const ts_label_t *tsl, int flags, ip_stack_t *ipst)
    141 {
    142 	ire_t *ire = NULL;
    143 	ipaddr_t gw_addr;
    144 	struct rt_sockaddr rdst, rmask;
    145 	struct rt_entry *rt;
    146 	ire_ftable_args_t margs;
    147 	boolean_t found_incomplete = B_FALSE;
    148 
    149 	ASSERT(ipif == NULL || !ipif->ipif_isv6);
    150 
    151 	/*
    152 	 * When we return NULL from this function, we should make
    153 	 * sure that *pire is NULL so that the callers will not
    154 	 * wrongly REFRELE the pire.
    155 	 */
    156 	if (pire != NULL)
    157 		*pire = NULL;
    158 	/*
    159 	 * ire_match_args() will dereference ipif MATCH_IRE_SRC or
    160 	 * MATCH_IRE_ILL is set.
    161 	 */
    162 	if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL | MATCH_IRE_ILL_GROUP)) &&
    163 	    (ipif == NULL))
    164 		return (NULL);
    165 
    166 	(void) memset(&rdst, 0, sizeof (rdst));
    167 	rdst.rt_sin_len = sizeof (rdst);
    168 	rdst.rt_sin_family = AF_INET;
    169 	rdst.rt_sin_addr.s_addr = addr;
    170 
    171 	(void) memset(&rmask, 0, sizeof (rmask));
    172 	rmask.rt_sin_len = sizeof (rmask);
    173 	rmask.rt_sin_family = AF_INET;
    174 	rmask.rt_sin_addr.s_addr = mask;
    175 
    176 	(void) memset(&margs, 0, sizeof (margs));
    177 	margs.ift_addr = addr;
    178 	margs.ift_mask = mask;
    179 	margs.ift_gateway = gateway;
    180 	margs.ift_type = type;
    181 	margs.ift_ipif = ipif;
    182 	margs.ift_zoneid = zoneid;
    183 	margs.ift_ihandle = ihandle;
    184 	margs.ift_tsl = tsl;
    185 	margs.ift_flags = flags;
    186 
    187 	/*
    188 	 * The flags argument passed to ire_ftable_lookup may cause the
    189 	 * search to return, not the longest matching prefix, but the
    190 	 * "best matching prefix", i.e., the longest prefix that also
    191 	 * satisfies constraints imposed via the permutation of flags
    192 	 * passed in. To achieve this, we invoke ire_match_args() on
    193 	 * each matching leaf in the  radix tree. ire_match_args is
    194 	 * invoked by the callback function ire_find_best_route()
    195 	 * We hold the global tree lock in read mode when calling
    196 	 * rn_match_args.Before dropping the global tree lock, ensure
    197 	 * that the radix node can't be deleted by incrementing ire_refcnt.
    198 	 */
    199 	RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
    200 	rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst,
    201 	    ipst->ips_ip_ftable, ire_find_best_route, &margs);
    202 	ire = margs.ift_best_ire;
    203 	RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
    204 
    205 	if (rt == NULL) {
    206 		return (NULL);
    207 	} else {
    208 		ASSERT(ire != NULL);
    209 	}
    210 
    211 	DTRACE_PROBE2(ire__found, ire_ftable_args_t *, &margs, ire_t *, ire);
    212 
    213 	if (!IS_DEFAULT_ROUTE(ire))
    214 		goto found_ire_held;
    215 	/*
    216 	 * If default route is found, see if default matching criteria
    217 	 * are satisfied.
    218 	 */
    219 	if (flags & MATCH_IRE_MASK) {
    220 		/*
    221 		 * we were asked to match a 0 mask, and came back with
    222 		 * a default route. Ok to return it.
    223 		 */
    224 		goto found_default_ire;
    225 	}
    226 	if ((flags & MATCH_IRE_TYPE) &&
    227 	    (type & (IRE_DEFAULT | IRE_INTERFACE))) {
    228 		/*
    229 		 * we were asked to match a default ire type. Ok to return it.
    230 		 */
    231 		goto found_default_ire;
    232 	}
    233 	if (flags & MATCH_IRE_DEFAULT) {
    234 		goto found_default_ire;
    235 	}
    236 	/*
    237 	 * we found a default route, but default matching criteria
    238 	 * are not specified and we are not explicitly looking for
    239 	 * default.
    240 	 */
    241 	IRE_REFRELE(ire);
    242 	return (NULL);
    243 found_default_ire:
    244 	/*
    245 	 * round-robin only if we have more than one route in the bucket.
    246 	 */
    247 	if ((ire->ire_bucket->irb_ire_cnt > 1) &&
    248 	    IS_DEFAULT_ROUTE(ire) &&
    249 	    ((flags & (MATCH_IRE_DEFAULT | MATCH_IRE_MASK)) ==
    250 	    MATCH_IRE_DEFAULT)) {
    251 		ire_t *next_ire;
    252 
    253 		next_ire = ire_round_robin(ire->ire_bucket, zoneid, &margs,
    254 		    ipst);
    255 		IRE_REFRELE(ire);
    256 		if (next_ire != NULL) {
    257 			ire = next_ire;
    258 		} else {
    259 			/* no route */
    260 			return (NULL);
    261 		}
    262 	}
    263 found_ire_held:
    264 	if ((flags & MATCH_IRE_RJ_BHOLE) &&
    265 	    (ire->ire_flags & (RTF_BLACKHOLE | RTF_REJECT))) {
    266 		return (ire);
    267 	}
    268 	/*
    269 	 * At this point, IRE that was found must be an IRE_FORWARDTABLE
    270 	 * type.  If this is a recursive lookup and an IRE_INTERFACE type was
    271 	 * found, return that.  If it was some other IRE_FORWARDTABLE type of
    272 	 * IRE (one of the prefix types), then it is necessary to fill in the
    273 	 * parent IRE pointed to by pire, and then lookup the gateway address of
    274 	 * the parent.  For backwards compatiblity, if this lookup returns an
    275 	 * IRE other than a IRE_CACHETABLE or IRE_INTERFACE, then one more level
    276 	 * of lookup is done.
    277 	 */
    278 	if (flags & MATCH_IRE_RECURSIVE) {
    279 		ipif_t	*gw_ipif;
    280 		int match_flags = MATCH_IRE_DSTONLY;
    281 		ire_t *save_ire;
    282 
    283 		if (ire->ire_type & IRE_INTERFACE)
    284 			return (ire);
    285 		if (pire != NULL)
    286 			*pire = ire;
    287 		/*
    288 		 * If we can't find an IRE_INTERFACE or the caller has not
    289 		 * asked for pire, we need to REFRELE the save_ire.
    290 		 */
    291 		save_ire = ire;
    292 
    293 		/*
    294 		 * Currently MATCH_IRE_ILL is never used with
    295 		 * (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT) while
    296 		 * sending out packets as MATCH_IRE_ILL is used only
    297 		 * for communicating with on-link hosts. We can't assert
    298 		 * that here as RTM_GET calls this function with
    299 		 * MATCH_IRE_ILL | MATCH_IRE_DEFAULT | MATCH_IRE_RECURSIVE.
    300 		 * We have already used the MATCH_IRE_ILL in determining
    301 		 * the right prefix route at this point. To match the
    302 		 * behavior of how we locate routes while sending out
    303 		 * packets, we don't want to use MATCH_IRE_ILL below
    304 		 * while locating the interface route.
    305 		 *
    306 		 * ire_ftable_lookup may end up with an incomplete IRE_CACHE
    307 		 * entry for the gateway (i.e., one for which the
    308 		 * ire_nce->nce_state is not yet ND_REACHABLE). If the caller
    309 		 * has specified MATCH_IRE_COMPLETE, such entries will not
    310 		 * be returned; instead, we return the IF_RESOLVER ire.
    311 		 */
    312 		if (ire->ire_ipif != NULL)
    313 			match_flags |= MATCH_IRE_ILL_GROUP;
    314 
    315 		ire = ire_route_lookup(ire->ire_gateway_addr, 0, 0, 0,
    316 		    ire->ire_ipif, NULL, zoneid, tsl, match_flags, ipst);
    317 		DTRACE_PROBE2(ftable__route__lookup1, (ire_t *), ire,
    318 		    (ire_t *), save_ire);
    319 		if (ire == NULL ||
    320 		    ((ire->ire_type & IRE_CACHE) && ire->ire_nce &&
    321 		    ire->ire_nce->nce_state != ND_REACHABLE &&
    322 		    (flags & MATCH_IRE_COMPLETE))) {
    323 			/*
    324 			 * Do not release the parent ire if MATCH_IRE_PARENT
    325 			 * is set. Also return it via ire.
    326 			 */
    327 			if (ire != NULL) {
    328 				ire_refrele(ire);
    329 				ire = NULL;
    330 				found_incomplete = B_TRUE;
    331 			}
    332 			if (flags & MATCH_IRE_PARENT) {
    333 				if (pire != NULL) {
    334 					/*
    335 					 * Need an extra REFHOLD, if the parent
    336 					 * ire is returned via both ire and
    337 					 * pire.
    338 					 */
    339 					IRE_REFHOLD(save_ire);
    340 				}
    341 				ire = save_ire;
    342 			} else {
    343 				ire_refrele(save_ire);
    344 				if (pire != NULL)
    345 					*pire = NULL;
    346 			}
    347 			if (!found_incomplete)
    348 				return (ire);
    349 		}
    350 		if (ire->ire_type & (IRE_CACHETABLE | IRE_INTERFACE)) {
    351 			/*
    352 			 * If the caller did not ask for pire, release
    353 			 * it now.
    354 			 */
    355 			if (pire == NULL) {
    356 				ire_refrele(save_ire);
    357 			}
    358 			return (ire);
    359 		}
    360 		match_flags |= MATCH_IRE_TYPE;
    361 		gw_addr = ire->ire_gateway_addr;
    362 		gw_ipif = ire->ire_ipif;
    363 		ire_refrele(ire);
    364 		ire = ire_route_lookup(gw_addr, 0, 0,
    365 		    (found_incomplete? IRE_INTERFACE :
    366 		    (IRE_CACHETABLE | IRE_INTERFACE)),
    367 		    gw_ipif, NULL, zoneid, tsl, match_flags, ipst);
    368 		DTRACE_PROBE2(ftable__route__lookup2, (ire_t *), ire,
    369 		    (ire_t *), save_ire);
    370 		if (ire == NULL ||
    371 		    ((ire->ire_type & IRE_CACHE) && ire->ire_nce &&
    372 		    ire->ire_nce->nce_state != ND_REACHABLE &&
    373 		    (flags & MATCH_IRE_COMPLETE))) {
    374 			/*
    375 			 * Do not release the parent ire if MATCH_IRE_PARENT
    376 			 * is set. Also return it via ire.
    377 			 */
    378 			if (ire != NULL) {
    379 				ire_refrele(ire);
    380 				ire = NULL;
    381 			}
    382 			if (flags & MATCH_IRE_PARENT) {
    383 				if (pire != NULL) {
    384 					/*
    385 					 * Need an extra REFHOLD, if the
    386 					 * parent ire is returned via both
    387 					 * ire and pire.
    388 					 */
    389 					IRE_REFHOLD(save_ire);
    390 				}
    391 				ire = save_ire;
    392 			} else {
    393 				ire_refrele(save_ire);
    394 				if (pire != NULL)
    395 					*pire = NULL;
    396 			}
    397 			return (ire);
    398 		} else if (pire == NULL) {
    399 			/*
    400 			 * If the caller did not ask for pire, release
    401 			 * it now.
    402 			 */
    403 			ire_refrele(save_ire);
    404 		}
    405 		return (ire);
    406 	}
    407 	ASSERT(pire == NULL || *pire == NULL);
    408 	return (ire);
    409 }
    410 
    411 
    412 /*
    413  * Find an IRE_OFFSUBNET IRE entry for the multicast address 'group'
    414  * that goes through 'ipif'. As a fallback, a route that goes through
    415  * ipif->ipif_ill can be returned.
    416  */
    417 ire_t *
    418 ipif_lookup_multi_ire(ipif_t *ipif, ipaddr_t group)
    419 {
    420 	ire_t	*ire;
    421 	ire_t	*save_ire = NULL;
    422 	ire_t   *gw_ire;
    423 	irb_t   *irb;
    424 	ipaddr_t gw_addr;
    425 	int	match_flags = MATCH_IRE_TYPE | MATCH_IRE_ILL;
    426 	ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
    427 
    428 	ASSERT(CLASSD(group));
    429 
    430 	ire = ire_ftable_lookup(group, 0, 0, 0, NULL, NULL, ALL_ZONES, 0,
    431 	    NULL, MATCH_IRE_DEFAULT, ipst);
    432 
    433 	if (ire == NULL)
    434 		return (NULL);
    435 
    436 	irb = ire->ire_bucket;
    437 	ASSERT(irb);
    438 
    439 	IRB_REFHOLD(irb);
    440 	ire_refrele(ire);
    441 	for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) {
    442 		if (ire->ire_addr != group ||
    443 		    ipif->ipif_zoneid != ire->ire_zoneid &&
    444 		    ire->ire_zoneid != ALL_ZONES) {
    445 			continue;
    446 		}
    447 
    448 		switch (ire->ire_type) {
    449 		case IRE_DEFAULT:
    450 		case IRE_PREFIX:
    451 		case IRE_HOST:
    452 			gw_addr = ire->ire_gateway_addr;
    453 			gw_ire = ire_ftable_lookup(gw_addr, 0, 0, IRE_INTERFACE,
    454 			    ipif, NULL, ALL_ZONES, 0, NULL, match_flags, ipst);
    455 
    456 			if (gw_ire != NULL) {
    457 				if (save_ire != NULL) {
    458 					ire_refrele(save_ire);
    459 				}
    460 				IRE_REFHOLD(ire);
    461 				if (gw_ire->ire_ipif == ipif) {
    462 					ire_refrele(gw_ire);
    463 
    464 					IRB_REFRELE(irb);
    465 					return (ire);
    466 				}
    467 				ire_refrele(gw_ire);
    468 				save_ire = ire;
    469 			}
    470 			break;
    471 		case IRE_IF_NORESOLVER:
    472 		case IRE_IF_RESOLVER:
    473 			if (ire->ire_ipif == ipif) {
    474 				if (save_ire != NULL) {
    475 					ire_refrele(save_ire);
    476 				}
    477 				IRE_REFHOLD(ire);
    478 
    479 				IRB_REFRELE(irb);
    480 				return (ire);
    481 			}
    482 			break;
    483 		}
    484 	}
    485 	IRB_REFRELE(irb);
    486 
    487 	return (save_ire);
    488 }
    489 
    490 /*
    491  * Find an IRE_INTERFACE for the multicast group.
    492  * Allows different routes for multicast addresses
    493  * in the unicast routing table (akin to 224.0.0.0 but could be more specific)
    494  * which point at different interfaces. This is used when IP_MULTICAST_IF
    495  * isn't specified (when sending) and when IP_ADD_MEMBERSHIP doesn't
    496  * specify the interface to join on.
    497  *
    498  * Supports IP_BOUND_IF by following the ipif/ill when recursing.
    499  */
    500 ire_t *
    501 ire_lookup_multi(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst)
    502 {
    503 	ire_t	*ire;
    504 	ipif_t	*ipif = NULL;
    505 	int	match_flags = MATCH_IRE_TYPE;
    506 	ipaddr_t gw_addr;
    507 
    508 	ire = ire_ftable_lookup(group, 0, 0, 0, NULL, NULL, zoneid,
    509 	    0, NULL, MATCH_IRE_DEFAULT, ipst);
    510 
    511 	/* We search a resolvable ire in case of multirouting. */
    512 	if ((ire != NULL) && (ire->ire_flags & RTF_MULTIRT)) {
    513 		ire_t *cire = NULL;
    514 		/*
    515 		 * If the route is not resolvable, the looked up ire
    516 		 * may be changed here. In that case, ire_multirt_lookup()
    517 		 * IRE_REFRELE the original ire and change it.
    518 		 */
    519 		(void) ire_multirt_lookup(&cire, &ire, MULTIRT_CACHEGW,
    520 		    NULL, ipst);
    521 		if (cire != NULL)
    522 			ire_refrele(cire);
    523 	}
    524 	if (ire == NULL)
    525 		return (NULL);
    526 	/*
    527 	 * Make sure we follow ire_ipif.
    528 	 *
    529 	 * We need to determine the interface route through
    530 	 * which the gateway will be reached. We don't really
    531 	 * care which interface is picked if the interface is
    532 	 * part of a group.
    533 	 */
    534 	if (ire->ire_ipif != NULL) {
    535 		ipif = ire->ire_ipif;
    536 		match_flags |= MATCH_IRE_ILL_GROUP;
    537 	}
    538 
    539 	switch (ire->ire_type) {
    540 	case IRE_DEFAULT:
    541 	case IRE_PREFIX:
    542 	case IRE_HOST:
    543 		gw_addr = ire->ire_gateway_addr;
    544 		ire_refrele(ire);
    545 		ire = ire_ftable_lookup(gw_addr, 0, 0,
    546 		    IRE_INTERFACE, ipif, NULL, zoneid, 0,
    547 		    NULL, match_flags, ipst);
    548 		return (ire);
    549 	case IRE_IF_NORESOLVER:
    550 	case IRE_IF_RESOLVER:
    551 		return (ire);
    552 	default:
    553 		ire_refrele(ire);
    554 		return (NULL);
    555 	}
    556 }
    557 
    558 /*
    559  * Delete the passed in ire if the gateway addr matches
    560  */
    561 void
    562 ire_del_host_redir(ire_t *ire, char *gateway)
    563 {
    564 	if ((ire->ire_flags & RTF_DYNAMIC) &&
    565 	    (ire->ire_gateway_addr == *(ipaddr_t *)gateway))
    566 		ire_delete(ire);
    567 }
    568 
    569 /*
    570  * Search for all HOST REDIRECT routes that are
    571  * pointing at the specified gateway and
    572  * delete them. This routine is called only
    573  * when a default gateway is going away.
    574  */
    575 void
    576 ire_delete_host_redirects(ipaddr_t gateway, ip_stack_t *ipst)
    577 {
    578 	struct rtfuncarg rtfarg;
    579 
    580 	(void) memset(&rtfarg, 0, sizeof (rtfarg));
    581 	rtfarg.rt_func = ire_del_host_redir;
    582 	rtfarg.rt_arg = (void *)&gateway;
    583 	(void) ipst->ips_ip_ftable->rnh_walktree_mt(ipst->ips_ip_ftable,
    584 	    rtfunc, &rtfarg, irb_refhold_rn, irb_refrele_rn);
    585 }
    586 
/*
 * Walker argument for ire_ihandle_onlink_match(): the interface handle to
 * search for, and the slot that receives the matching ire.
 */
struct ihandle_arg {
	uint32_t ihandle;	/* in: ire_ihandle value to match */
	ire_t	 *ire;		/* out: matching interface ire (held) */
};
    591 
    592 static int
    593 ire_ihandle_onlink_match(struct radix_node *rn, void *arg)
    594 {
    595 	struct rt_entry *rt;
    596 	irb_t *irb;
    597 	ire_t *ire;
    598 	struct ihandle_arg *ih = arg;
    599 
    600 	rt = (struct rt_entry *)rn;
    601 	ASSERT(rt != NULL);
    602 	irb = &rt->rt_irb;
    603 	for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) {
    604 		if ((ire->ire_type & IRE_INTERFACE) &&
    605 		    (ire->ire_ihandle == ih->ihandle)) {
    606 			ih->ire = ire;
    607 			IRE_REFHOLD(ire);
    608 			return (1);
    609 		}
    610 	}
    611 	return (0);
    612 }
    613 
    614 /*
    615  * Locate the interface ire that is tied to the cache ire 'cire' via
    616  * cire->ire_ihandle.
    617  *
    618  * We are trying to create the cache ire for an onlink destn. or
    619  * gateway in 'cire'. We are called from ire_add_v4() in the IRE_IF_RESOLVER
    620  * case, after the ire has come back from ARP.
    621  */
    622 ire_t *
    623 ire_ihandle_lookup_onlink(ire_t *cire)
    624 {
    625 	ire_t	*ire;
    626 	int	match_flags;
    627 	struct ihandle_arg ih;
    628 	ip_stack_t *ipst;
    629 
    630 	ASSERT(cire != NULL);
    631 	ipst = cire->ire_ipst;
    632 
    633 	/*
    634 	 * We don't need to specify the zoneid to ire_ftable_lookup() below
    635 	 * because the ihandle refers to an ipif which can be in only one zone.
    636 	 */
    637 	match_flags =  MATCH_IRE_TYPE | MATCH_IRE_IHANDLE | MATCH_IRE_MASK;
    638 	/*
    639 	 * We know that the mask of the interface ire equals cire->ire_cmask.
    640 	 * (When ip_newroute() created 'cire' for an on-link destn. it set its
    641 	 * cmask from the interface ire's mask)
    642 	 */
    643 	ire = ire_ftable_lookup(cire->ire_addr, cire->ire_cmask, 0,
    644 	    IRE_INTERFACE, NULL, NULL, ALL_ZONES, cire->ire_ihandle,
    645 	    NULL, match_flags, ipst);
    646 	if (ire != NULL)
    647 		return (ire);
    648 	/*
    649 	 * If we didn't find an interface ire above, we can't declare failure.
    650 	 * For backwards compatibility, we need to support prefix routes
    651 	 * pointing to next hop gateways that are not on-link.
    652 	 *
    653 	 * In the resolver/noresolver case, ip_newroute() thinks it is creating
    654 	 * the cache ire for an onlink destination in 'cire'. But 'cire' is
    655 	 * not actually onlink, because ire_ftable_lookup() cheated it, by
    656 	 * doing ire_route_lookup() twice and returning an interface ire.
    657 	 *
    658 	 * Eg. default	-	gw1			(line 1)
    659 	 *	gw1	-	gw2			(line 2)
    660 	 *	gw2	-	hme0			(line 3)
    661 	 *
    662 	 * In the above example, ip_newroute() tried to create the cache ire
    663 	 * 'cire' for gw1, based on the interface route in line 3. The
    664 	 * ire_ftable_lookup() above fails, because there is no interface route
    665 	 * to reach gw1. (it is gw2). We fall thru below.
    666 	 *
    667 	 * Do a brute force search based on the ihandle in a subset of the
    668 	 * forwarding tables, corresponding to cire->ire_cmask. Otherwise
    669 	 * things become very complex, since we don't have 'pire' in this
    670 	 * case. (Also note that this method is not possible in the offlink
    671 	 * case because we don't know the mask)
    672 	 */
    673 	(void) memset(&ih, 0, sizeof (ih));
    674 	ih.ihandle = cire->ire_ihandle;
    675 	(void) ipst->ips_ip_ftable->rnh_walktree_mt(ipst->ips_ip_ftable,
    676 	    ire_ihandle_onlink_match, &ih, irb_refhold_rn, irb_refrele_rn);
    677 	return (ih.ire);
    678 }
    679 
    680 /*
    681  * IRE iterator used by ire_ftable_lookup[_v6]() to process multiple default
    682  * routes. Given a starting point in the hash list (ire_origin), walk the IREs
    683  * in the bucket skipping default interface routes and deleted entries.
    684  * Returns the next IRE (unheld), or NULL when we're back to the starting point.
    685  * Assumes that the caller holds a reference on the IRE bucket.
    686  */
    687 ire_t *
    688 ire_get_next_default_ire(ire_t *ire, ire_t *ire_origin)
    689 {
    690 	ASSERT(ire_origin->ire_bucket != NULL);
    691 	ASSERT(ire != NULL);
    692 
    693 	do {
    694 		ire = ire->ire_next;
    695 		if (ire == NULL)
    696 			ire = ire_origin->ire_bucket->irb_ire;
    697 		if (ire == ire_origin)
    698 			return (NULL);
    699 	} while ((ire->ire_type & IRE_INTERFACE) ||
    700 	    (ire->ire_marks & IRE_MARK_CONDEMNED));
    701 	ASSERT(ire != NULL);
    702 	return (ire);
    703 }
    704 
    705 static ipif_t *
    706 ire_forward_src_ipif(ipaddr_t dst, ire_t *sire, ire_t *ire, ill_t *dst_ill,
    707     int zoneid, ushort_t *marks)
    708 {
    709 	ipif_t *src_ipif;
    710 	ip_stack_t *ipst = dst_ill->ill_ipst;
    711 
    712 	/*
    713 	 * Pick the best source address from dst_ill.
    714 	 *
    715 	 * 1) If it is part of a multipathing group, we would
    716 	 *    like to spread the inbound packets across different
    717 	 *    interfaces. ipif_select_source picks a random source
    718 	 *    across the different ills in the group.
    719 	 *
    720 	 * 2) If it is not part of a multipathing group, we try
    721 	 *    to pick the source address from the destination
    722 	 *    route. Clustering assumes that when we have multiple
    723 	 *    prefixes hosted on an interface, the prefix of the
    724 	 *    source address matches the prefix of the destination
    725 	 *    route. We do this only if the address is not
    726 	 *    DEPRECATED.
    727 	 *
    728 	 * 3) If the conn is in a different zone than the ire, we
    729 	 *    need to pick a source address from the right zone.
    730 	 *
    731 	 * NOTE : If we hit case (1) above, the prefix of the source
    732 	 *	  address picked may not match the prefix of the
    733 	 *	  destination routes prefix as ipif_select_source
    734 	 *	  does not look at "dst" while picking a source
    735 	 *	  address.
    736 	 *	  If we want the same behavior as (2), we will need
    737 	 *	  to change the behavior of ipif_select_source.
    738 	 */
    739 
    740 	if ((sire != NULL) && (sire->ire_flags & RTF_SETSRC)) {
    741 		/*
    742 		 * The RTF_SETSRC flag is set in the parent ire (sire).
    743 		 * Check that the ipif matching the requested source
    744 		 * address still exists.
    745 		 */
    746 		src_ipif = ipif_lookup_addr(sire->ire_src_addr, NULL,
    747 		    zoneid, NULL, NULL, NULL, NULL, ipst);
    748 		return (src_ipif);
    749 	}
    750 	*marks |= IRE_MARK_USESRC_CHECK;
    751 	if ((dst_ill->ill_group != NULL) ||
    752 	    (ire->ire_ipif->ipif_flags & IPIF_DEPRECATED) ||
    753 	    (dst_ill->ill_usesrc_ifindex != 0)) {
    754 		src_ipif = ipif_select_source(dst_ill, dst, zoneid);
    755 		if (src_ipif == NULL)
    756 			return (NULL);
    757 
    758 	} else {
    759 		src_ipif = ire->ire_ipif;
    760 		ASSERT(src_ipif != NULL);
    761 		/* hold src_ipif for uniformity */
    762 		ipif_refhold(src_ipif);
    763 	}
    764 	return (src_ipif);
    765 }
    766 
    767 /*
    768  * This function is called by ip_rput_noire() and ip_fast_forward()
    769  * to resolve the route of incoming packet that needs to be forwarded.
    770  * If the ire of the nexthop is not already in the cachetable, this
    771  * routine will insert it to the table, but won't trigger ARP resolution yet.
    772  * Thus unlike ip_newroute, this function adds incomplete ires to
    773  * the cachetable. ARP resolution for these ires are  delayed until
    774  * after all of the packet processing is completed and its ready to
    775  * be sent out on the wire, Eventually, the packet transmit routine
    776  * ip_xmit_v4() attempts to send a packet  to the driver. If it finds
    777  * that there is no link layer information, it will do the arp
    778  * resolution and queue the packet in ire->ire_nce->nce_qd_mp and
    779  * then send it out once the arp resolution is over
    780  * (see ip_xmit_v4()->ire_arpresolve()). This scheme is similar to
    781  * the model of BSD/SunOS 4
    782  *
    783  * In future, the insertion of incomplete ires in the cachetable should
    784  * be implemented in hostpath as well, as doing so will greatly reduce
    785  * the existing complexity for code paths that depend on the context of
    786  * the sender (such as IPsec).
    787  *
    788  * Thus this scheme of adding incomplete ires in cachetable in forwarding
    789  * path can be used as a template for simplifying the hostpath.
    790  */
    791 
    792 ire_t *
    793 ire_forward(ipaddr_t dst, enum ire_forward_action *ret_action,
    794     ire_t *supplied_ire, ire_t *supplied_sire, const struct ts_label_s *tsl,
    795     ip_stack_t *ipst)
    796 {
    797 	ipaddr_t gw = 0;
    798 	ire_t	*ire = NULL;
    799 	ire_t   *sire = NULL, *save_ire;
    800 	ill_t *dst_ill = NULL;
    801 	int error;
    802 	zoneid_t zoneid;
    803 	ipif_t *src_ipif = NULL;
    804 	mblk_t *res_mp;
    805 	ushort_t ire_marks = 0;
    806 	tsol_gcgrp_t *gcgrp = NULL;
    807 	tsol_gcgrp_addr_t ga;
    808 
    809 	zoneid = GLOBAL_ZONEID;
    810 
    811 	if (supplied_ire != NULL) {
    812 		/* We have arrived here from ipfil_sendpkt */
    813 		ire = supplied_ire;
    814 		sire = supplied_sire;
    815 		goto create_irecache;
    816 	}
    817 
    818 	ire = ire_ftable_lookup(dst, 0, 0, 0, NULL, &sire, zoneid, 0,
    819 	    tsl, MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT |
    820 	    MATCH_IRE_RJ_BHOLE | MATCH_IRE_PARENT|MATCH_IRE_SECATTR, ipst);
    821 
    822 	if (ire == NULL) {
    823 		ip_rts_change(RTM_MISS, dst, 0, 0, 0, 0, 0, 0, RTA_DST, ipst);
    824 		goto icmp_err_ret;
    825 	}
    826 
    827 	/*
    828 	 * If we encounter CGTP, we should  have the caller use
    829 	 * ip_newroute to resolve multirt instead of this function.
    830 	 * CGTP specs explicitly state that it can't be used with routers.
    831 	 * This essentially prevents insertion of incomplete RTF_MULTIRT
    832 	 * ires in cachetable.
    833 	 */
    834 	if (ipst->ips_ip_cgtp_filter &&
    835 	    ((ire->ire_flags & RTF_MULTIRT) ||
    836 	    ((sire != NULL) && (sire->ire_flags & RTF_MULTIRT)))) {
    837 		ip3dbg(("ire_forward: packet is to be multirouted- "
    838 		    "handing it to ip_newroute\n"));
    839 		if (sire != NULL)
    840 			ire_refrele(sire);
    841 		ire_refrele(ire);
    842 		/*
    843 		 * Inform caller about encountering of multirt so that
    844 		 * ip_newroute() can be called.
    845 		 */
    846 		*ret_action = Forward_check_multirt;
    847 		return (NULL);
    848 	}
    849 
    850 	/*
    851 	 * Verify that the returned IRE does not have either
    852 	 * the RTF_REJECT or RTF_BLACKHOLE flags set and that the IRE is
    853 	 * either an IRE_CACHE, IRE_IF_NORESOLVER or IRE_IF_RESOLVER.
    854 	 */
    855 	if ((ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) ||
    856 	    (ire->ire_type & (IRE_CACHE | IRE_INTERFACE)) == 0) {
    857 		ip3dbg(("ire 0x%p is not cache/resolver/noresolver\n",
    858 		    (void *)ire));
    859 		goto icmp_err_ret;
    860 	}
    861 
    862 	/*
    863 	 * If we already have a fully resolved IRE CACHE of the
    864 	 * nexthop router, just hand over the cache entry
    865 	 * and we are done.
    866 	 */
    867 
    868 	if (ire->ire_type & IRE_CACHE) {
    869 
    870 		/*
    871 		 * If we are using this ire cache entry as a
    872 		 * gateway to forward packets, chances are we
    873 		 * will be using it again. So turn off
    874 		 * the temporary flag, thus reducing its
    875 		 * chances of getting deleted frequently.
    876 		 */
    877 		if (ire->ire_marks & IRE_MARK_TEMPORARY) {
    878 			irb_t *irb = ire->ire_bucket;
    879 			rw_enter(&irb->irb_lock, RW_WRITER);
    880 			/*
    881 			 * Recheck IRE_MARK_TEMPORARY now that irb_lock is
    882 			 * held, so that irb_tmp_ire_cnt is decremented only
    883 			 * if the mark is still set at this point.
    884 			 */
    885 			if (ire->ire_marks & IRE_MARK_TEMPORARY) {
    886 				ire->ire_marks &= ~IRE_MARK_TEMPORARY;
    887 				irb->irb_tmp_ire_cnt--;
    888 			}
    889 			rw_exit(&irb->irb_lock);
    890 		}
    891 
    892 		if (sire != NULL) {
    893 			UPDATE_OB_PKT_COUNT(sire);
    894 			sire->ire_last_used_time = lbolt;
    895 			ire_refrele(sire);
    896 		}
    897 		*ret_action = Forward_ok;
    898 		return (ire);
    899 	}
    900 create_irecache:
    901 	/*
    902 	 * Increment the ire_ob_pkt_count field for ire if it is an
    903 	 * INTERFACE (IF_RESOLVER or IF_NORESOLVER) IRE type, and
    904 	 * increment the same for the parent IRE, sire, if it is some
    905 	 * sort of prefix IRE (which includes DEFAULT, PREFIX, and HOST).
    906 	 */
    907 	if ((ire->ire_type & IRE_INTERFACE) != 0) {
    908 		UPDATE_OB_PKT_COUNT(ire);
    909 		ire->ire_last_used_time = lbolt;
    910 	}
    911 
    912 	/*
    913 	 * sire must be neither an IRE_CACHETABLE nor an IRE_INTERFACE type
    914 	 */
    915 	if (sire != NULL) {
    916 		gw = sire->ire_gateway_addr;
    917 		ASSERT((sire->ire_type &
    918 		    (IRE_CACHETABLE | IRE_INTERFACE)) == 0);
    919 		UPDATE_OB_PKT_COUNT(sire);
    920 		sire->ire_last_used_time = lbolt;
    921 	}
    922 
    923 	/* Obtain dst_ill */
    924 	dst_ill = ip_newroute_get_dst_ill(ire->ire_ipif->ipif_ill);
    925 	if (dst_ill == NULL) {
    926 		ip2dbg(("ire_forward no dst ill; ire 0x%p\n",
    927 		    (void *)ire));
    928 		goto icmp_err_ret;
    929 	}
    930 
    931 	ASSERT(src_ipif == NULL);
    932 	/* Now obtain the src_ipif */
    933 	src_ipif = ire_forward_src_ipif(dst, sire, ire, dst_ill,
    934 	    zoneid, &ire_marks);
    935 	if (src_ipif == NULL)
    936 		goto icmp_err_ret;
    937 
    938 	switch (ire->ire_type) {
    939 	case IRE_IF_NORESOLVER:
    940 		/* create ire_cache for ire_addr endpoint */
    941 		if (dst_ill->ill_phys_addr_length != IP_ADDR_LEN &&
    942 		    dst_ill->ill_resolver_mp == NULL) {
    943 			ip1dbg(("ire_forward: dst_ill %p "
    944 			    "for IRE_IF_NORESOLVER ire %p has "
    945 			    "no ill_resolver_mp\n",
    946 			    (void *)dst_ill, (void *)ire));
    947 			goto icmp_err_ret;
    948 		}
    949 		/* FALLTHRU */
    950 	case IRE_IF_RESOLVER:
    951 		/*
    952 		 * We have the IRE_IF_RESOLVER of the nexthop gateway
    953 		 * and now need to build a IRE_CACHE for it.
    954 		 * In this case, we have the following :
    955 		 *
    956 		 * 1) src_ipif - used for getting a source address.
    957 		 *
    958 		 * 2) dst_ill - from which we derive ire_stq/ire_rfq. This
    959 		 *    means packets using the IRE_CACHE that we will build
    960 		 *    here will go out on dst_ill.
    961 		 *
    962 		 * 3) sire may or may not be NULL. But, the IRE_CACHE that is
    963 		 *    to be created will only be tied to the IRE_INTERFACE
    964 		 *    that was derived from the ire_ihandle field.
    965 		 *
    966 		 *    If sire is non-NULL, it means the destination is
    967 		 *    off-link and we will first create the IRE_CACHE for the
    968 		 *    gateway.
    969 		 */
    970 		res_mp = dst_ill->ill_resolver_mp;
    971 		if (ire->ire_type == IRE_IF_RESOLVER &&
    972 		    (!OK_RESOLVER_MP(res_mp))) {
    973 			goto icmp_err_ret;
    974 		}
    975 		/*
    976 		 * To be at this point in the code with a non-zero gw
    977 		 * means that dst is reachable through a gateway that
    978 		 * we have never resolved.  By changing dst to the gw
    979 		 * addr we resolve the gateway first.
    980 		 */
    981 		if (gw != INADDR_ANY) {
    982 			/*
    983 			 * The source ipif that was determined above was
    984 			 * relative to the destination address, not the
    985 			 * gateway's. If src_ipif was not taken out of
    986 			 * the IRE_IF_RESOLVER entry, we'll need to call
    987 			 * ipif_select_source() again.
    988 			 */
    989 			if (src_ipif != ire->ire_ipif) {
    990 				ipif_refrele(src_ipif);
    991 				src_ipif = ipif_select_source(dst_ill,
    992 				    gw, zoneid);
    993 				if (src_ipif == NULL)
    994 					goto icmp_err_ret;
    995 			}
    996 			dst = gw;
    997 			gw = INADDR_ANY;
    998 		}
    999 		/*
   1000 		 * dst has been set to the address of the nexthop.
   1001 		 *
   1002 		 * TSol note: get security attributes of the nexthop;
   1003 		 * Note that the nexthop may either be a gateway, or the
   1004 		 * packet destination itself; Detailed explanation of
   1005 		 * issues involved is provided in the IRE_IF_NORESOLVER
   1006 		 * logic in ip_newroute().
   1007 		 */
   1008 		ga.ga_af = AF_INET;
   1009 		IN6_IPADDR_TO_V4MAPPED(dst, &ga.ga_addr);
   1010 		gcgrp = gcgrp_lookup(&ga, B_FALSE);
   1011 
   1012 		if (ire->ire_type == IRE_IF_NORESOLVER)
   1013 			dst = ire->ire_addr; /* ire_cache for tunnel endpoint */
   1014 
   1015 		save_ire = ire;
   1016 		/*
   1017 		 * create an incomplete IRE_CACHE.
   1018 		 * An areq_mp will be generated in ire_arpresolve() for
   1019 		 * RESOLVER interfaces.
   1020 		 */
   1021 		ire = ire_create(
   1022 		    (uchar_t *)&dst,		/* dest address */
   1023 		    (uchar_t *)&ip_g_all_ones,	/* mask */
   1024 		    (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */
   1025 		    (uchar_t *)&gw,		/* gateway address */
   1026 		    (save_ire->ire_type == IRE_IF_RESOLVER ?  NULL:
   1027 		    &save_ire->ire_max_frag),
   1028 		    NULL,
   1029 		    dst_ill->ill_rq,		/* recv-from queue */
   1030 		    dst_ill->ill_wq,		/* send-to queue */
   1031 		    IRE_CACHE,			/* IRE type */
   1032 		    src_ipif,
   1033 		    ire->ire_mask,		/* Parent mask */
   1034 		    0,
   1035 		    ire->ire_ihandle,	/* Interface handle */
   1036 		    0,
   1037 		    &(ire->ire_uinfo),
   1038 		    NULL,
   1039 		    gcgrp,
   1040 		    ipst);
   1041 		ip1dbg(("incomplete ire_cache 0x%p\n", (void *)ire));
   1042 		if (ire != NULL) {
   1043 			gcgrp = NULL; /* reference now held by IRE */
   1044 			ire->ire_marks |= ire_marks;
   1045 			/* add the incomplete ire: */
   1046 			error = ire_add(&ire, NULL, NULL, NULL, B_TRUE);
   1047 			if (error == 0 && ire != NULL) {
   1048 				ire->ire_max_frag = save_ire->ire_max_frag;
   1049 				ip1dbg(("setting max_frag to %d in ire 0x%p\n",
   1050 				    ire->ire_max_frag, (void *)ire));
   1051 			} else {
   1052 				ire_refrele(save_ire);
   1053 				goto icmp_err_ret;
   1054 			}
   1055 		} else {
   1056 			if (gcgrp != NULL) {
   1057 				GCGRP_REFRELE(gcgrp);
   1058 				gcgrp = NULL;
   1059 			}
   1060 		}
   1061 
   1062 		ire_refrele(save_ire);
   1063 		break;
   1064 	default:
   1065 		break;
   1066 	}
   1067 
   1068 	*ret_action = Forward_ok;
   1069 	if (sire != NULL)
   1070 		ire_refrele(sire);
   1071 	if (dst_ill != NULL)
   1072 		ill_refrele(dst_ill);
   1073 	if (src_ipif != NULL)
   1074 		ipif_refrele(src_ipif);
   1075 	return (ire);
   1076 icmp_err_ret:
   1077 	*ret_action = Forward_ret_icmp_err;
   1078 	if (sire != NULL)
   1079 		ire_refrele(sire);
   1080 	if (dst_ill != NULL)
   1081 		ill_refrele(dst_ill);
   1082 	if (src_ipif != NULL)
   1083 		ipif_refrele(src_ipif);
   1084 	if (ire != NULL) {
   1085 		if (ire->ire_flags & RTF_BLACKHOLE)
   1086 			*ret_action = Forward_blackhole;
   1087 		ire_refrele(ire);
   1088 	}
   1089 	return (NULL);
   1090 
   1091 }
   1092 
   1093 /*
   1094  * Obtain the rt_entry and rt_irb for the route to be added to
   1095  * the ips_ip_ftable.
   1096  * First attempt to add a node to the radix tree via rn_addroute. If the
   1097  * route already exists, return the bucket for the existing route.
   1098  *
   1099  * Locking notes: Need to hold the global radix tree lock in write mode to
   1100  * add a radix node. To prevent the node from being deleted, ire_get_bucket()
   1101  * returns with a ref'ed irb_t. The ire itself is added in ire_add_v4()
   1102  * while holding the irb_lock, but not the radix tree lock.
   1103  */
   1104 irb_t *
   1105 ire_get_bucket(ire_t *ire)
   1106 {
   1107 	struct radix_node *rn;
   1108 	struct rt_entry *rt;
   1109 	struct rt_sockaddr rmask, rdst;
   1110 	irb_t *irb = NULL;
   1111 	ip_stack_t *ipst = ire->ire_ipst;
   1112 
   1113 	ASSERT(ipst->ips_ip_ftable != NULL);
   1114 
   1115 	/* first try to see if route exists (based on rtalloc1) */
   1116 	(void) memset(&rdst, 0, sizeof (rdst));
   1117 	rdst.rt_sin_len = sizeof (rdst);
   1118 	rdst.rt_sin_family =