Home | History | Annotate | Download | only in ip
      1      0    stevel /*
      2      0    stevel  * CDDL HEADER START
      3      0    stevel  *
      4      0    stevel  * The contents of this file are subject to the terms of the
      5   1676       jpk  * Common Development and Distribution License (the "License").
      6   1676       jpk  * You may not use this file except in compliance with the License.
      7      0    stevel  *
      8      0    stevel  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9      0    stevel  * or http://www.opensolaris.org/os/licensing.
     10      0    stevel  * See the License for the specific language governing permissions
     11      0    stevel  * and limitations under the License.
     12      0    stevel  *
     13      0    stevel  * When distributing Covered Code, include this CDDL HEADER in each
     14      0    stevel  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15      0    stevel  * If applicable, add the following below this CDDL HEADER, with the
     16      0    stevel  * fields enclosed by brackets "[]" replaced with your own identifying
     17      0    stevel  * information: Portions Copyright [yyyy] [name of copyright owner]
     18      0    stevel  *
     19      0    stevel  * CDDL HEADER END
     20      0    stevel  */
     21      0    stevel /*
     22   8485     Peter  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23      0    stevel  * Use is subject to license terms.
     24      0    stevel  */
     25      0    stevel /*
     26      0    stevel  * Copyright (c) 1990 Mentat Inc.
     27      0    stevel  */
     28      0    stevel 
     29      0    stevel /*
     30      0    stevel  * This file contains routines that manipulate Internet Routing Entries (IREs).
     31      0    stevel  */
     32      0    stevel #include <sys/types.h>
     33      0    stevel #include <sys/stream.h>
     34      0    stevel #include <sys/stropts.h>
     35      0    stevel #include <sys/ddi.h>
     36      0    stevel #include <sys/cmn_err.h>
     37      0    stevel 
     38      0    stevel #include <sys/systm.h>
     39      0    stevel #include <sys/param.h>
     40      0    stevel #include <sys/socket.h>
     41      0    stevel #include <net/if.h>
     42      0    stevel #include <net/route.h>
     43      0    stevel #include <netinet/in.h>
     44      0    stevel #include <net/if_dl.h>
     45      0    stevel #include <netinet/ip6.h>
     46      0    stevel #include <netinet/icmp6.h>
     47      0    stevel 
     48      0    stevel #include <inet/common.h>
     49      0    stevel #include <inet/mi.h>
     50      0    stevel #include <inet/ip.h>
     51      0    stevel #include <inet/ip6.h>
     52      0    stevel #include <inet/ip_ndp.h>
     53      0    stevel #include <inet/ip_if.h>
     54      0    stevel #include <inet/ip_ire.h>
     55      0    stevel #include <inet/ipclassifier.h>
     56      0    stevel #include <inet/nd.h>
     57      0    stevel #include <sys/kmem.h>
     58      0    stevel #include <sys/zone.h>
     59   1676       jpk 
     60   1676       jpk #include <sys/tsol/label.h>
     61   1676       jpk #include <sys/tsol/tnet.h>
     62      0    stevel 
     /*
 * Evaluates to non-zero when the IRE behaves as a default route: either
 * its type has IRE_DEFAULT set, or it is an interface route (IRE_INTERFACE)
 * whose destination is the unspecified address.
 * The argument is evaluated more than once.
 */
#define	IS_DEFAULT_ROUTE_V6(ire)					\
	((((ire)->ire_type & IRE_DEFAULT) != 0) ||			\
	    ((((ire)->ire_type & IRE_INTERFACE) != 0) &&		\
	    (IN6_IS_ADDR_UNSPECIFIED(&(ire)->ire_addr_v6))))
     67  11042      Erik 
     68      0    stevel static	ire_t	ire_null;
     69      0    stevel 
     70  11042      Erik static ire_t *
     71  11042      Erik ire_ftable_lookup_impl_v6(const in6_addr_t *addr, const in6_addr_t *mask,
     72  11042      Erik     const in6_addr_t *gateway, int type, const ill_t *ill,
     73  11042      Erik     zoneid_t zoneid, const ts_label_t *tsl, int flags,
     74  11042      Erik     ip_stack_t *ipst);
     75      0    stevel 
     76      0    stevel /*
     77      0    stevel  * Initialize the ire that is specific to IPv6 part and call
     78      0    stevel  * ire_init_common to finish it.
     79  11042      Erik  * Returns zero or errno.
     80      0    stevel  */
     81  11042      Erik int
     82   4714   sowmini ire_init_v6(ire_t *ire, const in6_addr_t *v6addr, const in6_addr_t *v6mask,
     83  11042      Erik     const in6_addr_t *v6gateway, ushort_t type, ill_t *ill,
     84  11042      Erik     zoneid_t zoneid, uint_t flags, tsol_gc_t *gc, ip_stack_t *ipst)
     85      0    stevel {
     86  11042      Erik 	int error;
     87   2535  sangeeta 
     88   1676       jpk 	/*
     89  11042      Erik 	 * Reject IRE security attmakeribute creation/initialization
     90   1676       jpk 	 * if system is not running in Trusted mode.
     91   1676       jpk 	 */
     92  11042      Erik 	if (gc != NULL && !is_system_labeled())
     93  11042      Erik 		return (EINVAL);
     94      0    stevel 
     95   3448  dh155122 	BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_alloced);
     96  11042      Erik 	if (v6addr != NULL)
     97  11042      Erik 		ire->ire_addr_v6 = *v6addr;
     98      0    stevel 	if (v6gateway != NULL)
     99      0    stevel 		ire->ire_gateway_addr_v6 = *v6gateway;
    100      0    stevel 
    101  11042      Erik 	/* Make sure we don't have stray values in some fields */
    102  11042      Erik 	switch (type) {
    103  11042      Erik 	case IRE_LOOPBACK:
    104  11042      Erik 		ire->ire_gateway_addr_v6 = ire->ire_addr_v6;
    105  11042      Erik 		/* FALLTHRU */
    106  11042      Erik 	case IRE_HOST:
    107  11042      Erik 	case IRE_LOCAL:
    108  11042      Erik 	case IRE_IF_CLONE:
    109  11042      Erik 		ire->ire_mask_v6 = ipv6_all_ones;
    110  11042      Erik 		ire->ire_masklen = IPV6_ABITS;
    111  11042      Erik 		break;
    112  11042      Erik 	case IRE_PREFIX:
    113  11042      Erik 	case IRE_DEFAULT:
    114  11042      Erik 	case IRE_IF_RESOLVER:
    115  11042      Erik 	case IRE_IF_NORESOLVER:
    116  11042      Erik 		if (v6mask != NULL) {
    117  11042      Erik 			ire->ire_mask_v6 = *v6mask;
    118  11042      Erik 			ire->ire_masklen =
    119  11042      Erik 			    ip_mask_to_plen_v6(&ire->ire_mask_v6);
    120  11042      Erik 		}
    121  11042      Erik 		break;
    122  11042      Erik 	case IRE_MULTICAST:
    123  11042      Erik 	case IRE_NOROUTE:
    124  11042      Erik 		ASSERT(v6mask == NULL);
    125  11042      Erik 		break;
    126  11042      Erik 	default:
    127  11042      Erik 		ASSERT(0);
    128  11042      Erik 		return (EINVAL);
    129      0    stevel 	}
    130      0    stevel 
    131  11042      Erik 	error = ire_init_common(ire, type, ill, zoneid, flags, IPV6_VERSION,
    132  11042      Erik 	    gc, ipst);
    133  11042      Erik 	if (error != NULL)
    134  11042      Erik 		return (error);
    135      0    stevel 
    136  11042      Erik 	/* Determine which function pointers to use */
    137  11042      Erik 	ire->ire_postfragfn = ip_xmit;		/* Common case */
    138      0    stevel 
    139  11042      Erik 	switch (ire->ire_type) {
    140  11042      Erik 	case IRE_LOCAL:
    141  11042      Erik 		ire->ire_sendfn = ire_send_local_v6;
    142  11042      Erik 		ire->ire_recvfn = ire_recv_local_v6;
    143  11042      Erik 		ASSERT(ire->ire_ill != NULL);
    144  11076     Cathy 		if (ire->ire_ill->ill_flags & ILLF_NOACCEPT)
    145  11042      Erik 			ire->ire_recvfn = ire_recv_noaccept_v6;
    146  11042      Erik 		break;
    147  11042      Erik 	case IRE_LOOPBACK:
    148  11042      Erik 		ire->ire_sendfn = ire_send_local_v6;
    149  11042      Erik 		ire->ire_recvfn = ire_recv_loopback_v6;
    150  11042      Erik 		break;
    151  11042      Erik 	case IRE_MULTICAST:
    152  11042      Erik 		ire->ire_postfragfn = ip_postfrag_loopcheck;
    153  11042      Erik 		ire->ire_sendfn = ire_send_multicast_v6;
    154  11042      Erik 		ire->ire_recvfn = ire_recv_multicast_v6;
    155  11042      Erik 		break;
    156  11042      Erik 	default:
    157  11042      Erik 		/*
    158  11042      Erik 		 * For IRE_IF_ALL and IRE_OFFLINK we forward received
    159  11042      Erik 		 * packets by default.
    160  11042      Erik 		 */
    161  11042      Erik 		ire->ire_sendfn = ire_send_wire_v6;
    162  11042      Erik 		ire->ire_recvfn = ire_recv_forward_v6;
    163  11042      Erik 		break;
    164      0    stevel 	}
    165  11042      Erik 	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
    166  11042      Erik 		ire->ire_sendfn = ire_send_noroute_v6;
    167  11042      Erik 		ire->ire_recvfn = ire_recv_noroute_v6;
    168  11042      Erik 	} else if (ire->ire_flags & RTF_MULTIRT) {
    169  11042      Erik 		ire->ire_postfragfn = ip_postfrag_multirt_v6;
    170  11042      Erik 		ire->ire_sendfn = ire_send_multirt_v6;
    171  11042      Erik 		ire->ire_recvfn = ire_recv_multirt_v6;
    172      0    stevel 	}
    173  11042      Erik 	ire->ire_nce_capable = ire_determine_nce_capable(ire);
    174  11042      Erik 	return (0);
    175      0    stevel }
    176      0    stevel 
    177      0    stevel /*
    178      0    stevel  * ire_create_v6 is called to allocate and initialize a new IRE.
    179      0    stevel  *
    180      0    stevel  * NOTE : This is called as writer sometimes though not required
    181      0    stevel  * by this function.
    182      0    stevel  */
    183   4714   sowmini /* ARGSUSED */
    184      0    stevel ire_t *
    185      0    stevel ire_create_v6(const in6_addr_t *v6addr, const in6_addr_t *v6mask,
    186  11042      Erik     const in6_addr_t *v6gateway, ushort_t type, ill_t *ill, zoneid_t zoneid,
    187  11042      Erik     uint_t flags, tsol_gc_t *gc, ip_stack_t *ipst)
    188      0    stevel {
    189      0    stevel 	ire_t	*ire;
    190  11042      Erik 	int	error;
    191      0    stevel 
    192      0    stevel 	ASSERT(!IN6_IS_ADDR_V4MAPPED(v6addr));
    193      0    stevel 
    194      0    stevel 	ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP);
    195      0    stevel 	if (ire == NULL) {
    196  11042      Erik 		DTRACE_PROBE(kmem__cache__alloc);
    197      0    stevel 		return (NULL);
    198      0    stevel 	}
    199      0    stevel 	*ire = ire_null;
    200      0    stevel 
    201  11042      Erik 	error = ire_init_v6(ire, v6addr, v6mask, v6gateway,
    202  11042      Erik 	    type, ill, zoneid, flags, gc, ipst);
    203      0    stevel 
    204  11042      Erik 	if (error != 0) {
    205  11042      Erik 		DTRACE_PROBE2(ire__init__v6, ire_t *, ire, int, error);
    206      0    stevel 		kmem_cache_free(ire_cache, ire);
    207      0    stevel 		return (NULL);
    208      0    stevel 	}
    209      0    stevel 	return (ire);
    210      0    stevel }
    211      0    stevel 
    212      0    stevel /*
    213  11042      Erik  * Find the ill matching a multicast group.
    214      0    stevel  * Allows different routes for multicast addresses
    215      0    stevel  * in the unicast routing table (akin to FF::0/8 but could be more specific)
    216      0    stevel  * which point at different interfaces. This is used when IPV6_MULTICAST_IF
    217      0    stevel  * isn't specified (when sending) and when IPV6_JOIN_GROUP doesn't
    218      0    stevel  * specify the interface to join on.
    219      0    stevel  *
    220  11042      Erik  * Supports link-local addresses by using ire_route_recursive which follows
    221  11042      Erik  * the ill when recursing.
    222  11042      Erik  *
    223  11042      Erik  * To handle CGTP, since we don't have a separate IRE_MULTICAST for each group
    224  11042      Erik  * and the MULTIRT property can be different for different groups, we
    225  11042      Erik  * extract RTF_MULTIRT from the special unicast route added for a group
    226  11042      Erik  * with CGTP and pass that back in the multirtp argument.
    227  11042      Erik  * This is used in ip_set_destination etc to set ixa_postfragfn for multicast.
    228  11042      Erik  * We have a setsrcp argument for the same reason.
    229      0    stevel  */
    230  11042      Erik ill_t *
    231  11042      Erik ire_lookup_multi_ill_v6(const in6_addr_t *group, zoneid_t zoneid,
    232  11042      Erik     ip_stack_t *ipst, boolean_t *multirtp, in6_addr_t *setsrcp)
    233      0    stevel {
    234      0    stevel 	ire_t	*ire;
    235  11042      Erik 	ill_t	*ill;
    236      0    stevel 
    237  11042      Erik 	ire = ire_route_recursive_v6(group, 0, NULL, zoneid, NULL,
    238  11042      Erik 	    MATCH_IRE_DSTONLY, B_FALSE, 0, ipst, setsrcp, NULL, NULL);
    239  11042      Erik 	ASSERT(ire != NULL);
    240      0    stevel 
    241  11042      Erik 	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
    242      0    stevel 		ire_refrele(ire);
    243      0    stevel 		return (NULL);
    244      0    stevel 	}
    245      0    stevel 
    246  11042      Erik 	if (multirtp != NULL)
    247  11042      Erik 		*multirtp = (ire->ire_flags & RTF_MULTIRT) != 0;
    248      0    stevel 
    249  11042      Erik 	ill = ire_nexthop_ill(ire);
    250  11042      Erik 	ire_refrele(ire);
    251  11042      Erik 	return (ill);
    252      0    stevel }
    253      0    stevel 
    254      0    stevel /*
    255      0    stevel  * This function takes a mask and returns number of bits set in the
    256      0    stevel  * mask (the represented prefix length).  Assumes a contiguous mask.
    257      0    stevel  */
    258      0    stevel int
    259      0    stevel ip_mask_to_plen_v6(const in6_addr_t *v6mask)
    260      0    stevel {
    261      0    stevel 	int		bits;
    262      0    stevel 	int		plen = IPV6_ABITS;
    263      0    stevel 	int		i;
    264      0    stevel 
    265      0    stevel 	for (i = 3; i >= 0; i--) {
    266      0    stevel 		if (v6mask->s6_addr32[i] == 0) {
    267      0    stevel 			plen -= 32;
    268      0    stevel 			continue;
    269      0    stevel 		}
    270      0    stevel 		bits = ffs(ntohl(v6mask->s6_addr32[i])) - 1;
    271      0    stevel 		if (bits == 0)
    272      0    stevel 			break;
    273      0    stevel 		plen -= bits;
    274      0    stevel 	}
    275      0    stevel 
    276      0    stevel 	return (plen);
    277      0    stevel }
    278      0    stevel 
    279      0    stevel /*
    280      0    stevel  * Convert a prefix length to the mask for that prefix.
    281      0    stevel  * Returns the argument bitmask.
    282      0    stevel  */
    283      0    stevel in6_addr_t *
    284      0    stevel ip_plen_to_mask_v6(uint_t plen, in6_addr_t *bitmask)
    285      0    stevel {
    286      0    stevel 	uint32_t *ptr;
    287      0    stevel 
    288      0    stevel 	if (plen < 0 || plen > IPV6_ABITS)
    289      0    stevel 		return (NULL);
    290      0    stevel 	*bitmask = ipv6_all_zeros;
    291  11042      Erik 	if (plen == 0)
    292  11042      Erik 		return (bitmask);
    293      0    stevel 
    294      0    stevel 	ptr = (uint32_t *)bitmask;
    295      0    stevel 	while (plen > 32) {
    296      0    stevel 		*ptr++ = 0xffffffffU;
    297      0    stevel 		plen -= 32;
    298      0    stevel 	}
    299      0    stevel 	*ptr = htonl(0xffffffffU << (32 - plen));
    300      0    stevel 	return (bitmask);
    301      0    stevel }
    302      0    stevel 
    303      0    stevel /*
    304  11042      Erik  * Add a fully initialized IPv6 IRE to the forwarding table.
    305  11042      Erik  * This returns NULL on failure, or a held IRE on success.
    306  11042      Erik  * Normally the returned IRE is the same as the argument. But a different
    307  11042      Erik  * IRE will be returned if the added IRE is deemed identical to an existing
    308  11042      Erik  * one. In that case ire_identical_ref will be increased.
    309  11042      Erik  * The caller always needs to do an ire_refrele() on the returned IRE.
    310      0    stevel  */
    311  11042      Erik ire_t *
    312  11042      Erik ire_add_v6(ire_t *ire)
    313      0    stevel {
    314      0    stevel 	ire_t	*ire1;
    315      0    stevel 	int	mask_table_index;
    316      0    stevel 	irb_t	*irb_ptr;
    317      0    stevel 	ire_t	**irep;
    318  11042      Erik 	int	match_flags;
    319      0    stevel 	int	error;
    320   3448  dh155122 	ip_stack_t	*ipst = ire->ire_ipst;
    321      0    stevel 
    322      0    stevel 	ASSERT(ire->ire_ipversion == IPV6_VERSION);
    323      0    stevel 
    324      0    stevel 	/* Make sure the address is properly masked. */
    325      0    stevel 	V6_MASK_COPY(ire->ire_addr_v6, ire->ire_mask_v6, ire->ire_addr_v6);
    326      0    stevel 
    327  11042      Erik 	mask_table_index = ip_mask_to_plen_v6(&ire->ire_mask_v6);
    328  11042      Erik 	if ((ipst->ips_ip_forwarding_table_v6[mask_table_index]) == NULL) {
    329  11042      Erik 		irb_t *ptr;
    330  11042      Erik 		int i;
    331  11042      Erik 
    332  11042      Erik 		ptr = (irb_t *)mi_zalloc((ipst->ips_ip6_ftable_hash_size *
    333  11042      Erik 		    sizeof (irb_t)));
    334  11042      Erik 		if (ptr == NULL) {
    335  11042      Erik 			ire_delete(ire);
    336  11042      Erik 			return (NULL);
    337  11042      Erik 		}
    338  11042      Erik 		for (i = 0; i < ipst->ips_ip6_ftable_hash_size; i++) {
    339  11042      Erik 			rw_init(&ptr[i].irb_lock, NULL, RW_DEFAULT, NULL);
    340  11042      Erik 		}
    341  11042      Erik 		mutex_enter(&ipst->ips_ire_ft_init_lock);
    342  11042      Erik 		if (ipst->ips_ip_forwarding_table_v6[mask_table_index] ==
    343   3448  dh155122 		    NULL) {
    344  11042      Erik 			ipst->ips_ip_forwarding_table_v6[mask_table_index] =
    345  11042      Erik 			    ptr;
    346  11042      Erik 			mutex_exit(&ipst->ips_ire_ft_init_lock);
    347  11042      Erik 		} else {
    348  11042      Erik 			/*
    349  11042      Erik 			 * Some other thread won the race in
    350  11042      Erik 			 * initializing the forwarding table at the
    351  11042      Erik 			 * same index.
    352  11042      Erik 			 */
    353  11042      Erik 			mutex_exit(&ipst->ips_ire_ft_init_lock);
    354  11042      Erik 			for (i = 0; i < ipst->ips_ip6_ftable_hash_size; i++) {
    355  11042      Erik 				rw_destroy(&ptr[i].irb_lock);
    356      0    stevel 			}
    357  11042      Erik 			mi_free(ptr);
    358      0    stevel 		}
    359      0    stevel 	}
    360  11042      Erik 	irb_ptr = &(ipst->ips_ip_forwarding_table_v6[mask_table_index][
    361  11042      Erik 	    IRE_ADDR_MASK_HASH_V6(ire->ire_addr_v6, ire->ire_mask_v6,
    362  11042      Erik 	    ipst->ips_ip6_ftable_hash_size)]);
    363      0    stevel 
    364  11042      Erik 	match_flags = (MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_GW);
    365  11042      Erik 	if (ire->ire_ill != NULL)
    366  11042      Erik 		match_flags |= MATCH_IRE_ILL;
    367      0    stevel 	/*
    368  11042      Erik 	 * Start the atomic add of the ire. Grab the bucket lock and the
    369  11042      Erik 	 * ill lock. Check for condemned.
    370      0    stevel 	 */
    371  11042      Erik 	error = ire_atomic_start(irb_ptr, ire);
    372  11042      Erik 	if (error != 0) {
    373  11042      Erik 		ire_delete(ire);
    374  11042      Erik 		return (NULL);
    375  11042      Erik 	}
    376   8485     Peter 
    377      0    stevel 	/*
    378   8485     Peter 	 * If we are creating a hidden IRE, make sure we search for
    379   8485     Peter 	 * hidden IREs when searching for duplicates below.
    380   8485     Peter 	 * Otherwise, we might find an IRE on some other interface
    381   8485     Peter 	 * that's not marked hidden.
    382      0    stevel 	 */
    383  11042      Erik 	if (ire->ire_testhidden)
    384  11042      Erik 		match_flags |= MATCH_IRE_TESTHIDDEN;
    385      0    stevel 
    386      0    stevel 	/*
    387      0    stevel 	 * Atomically check for duplicate and insert in the table.
    388      0    stevel 	 */
    389      0    stevel 	for (ire1 = irb_ptr->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) {
    390  11042      Erik 		if (IRE_IS_CONDEMNED(ire1))
    391  11042      Erik 			continue;
    392  11042      Erik 		/*
    393  11042      Erik 		 * Here we need an exact match on zoneid, i.e.,
    394  11042      Erik 		 * ire_match_args doesn't fit.
    395  11042      Erik 		 */
    396  11042      Erik 		if (ire1->ire_zoneid != ire->ire_zoneid)
    397      0    stevel 			continue;
    398      0    stevel 
    399  11042      Erik 		if (ire1->ire_type != ire->ire_type)
    400      0    stevel 			continue;
    401  11042      Erik 
    402  11042      Erik 		/*
    403  11042      Erik 		 * Note: We do not allow multiple routes that differ only
    404  11042      Erik 		 * in the gateway security attributes; such routes are
    405  11042      Erik 		 * considered duplicates.
    406  11042      Erik 		 * To change that we explicitly have to treat them as
    407  11042      Erik 		 * different here.
    408  11042      Erik 		 */
    409      0    stevel 		if (ire_match_args_v6(ire1, &ire->ire_addr_v6,
    410      0    stevel 		    &ire->ire_mask_v6, &ire->ire_gateway_addr_v6,
    411  11042      Erik 		    ire->ire_type, ire->ire_ill, ire->ire_zoneid, NULL,
    412  11042      Erik 		    match_flags)) {
    413      0    stevel 			/*
    414      0    stevel 			 * Return the old ire after doing a REFHOLD.
    415      0    stevel 			 * As most of the callers continue to use the IRE
    416      0    stevel 			 * after adding, we return a held ire. This will
    417      0    stevel 			 * avoid a lookup in the caller again. If the callers
    418      0    stevel 			 * don't want to use it, they need to do a REFRELE.
    419      0    stevel 			 */
    420      0    stevel 			ip1dbg(("found dup ire existing %p new %p",
    421      0    stevel 			    (void *)ire1, (void *)ire));
    422  11042      Erik 			ire_refhold(ire1);
    423  11042      Erik 			atomic_add_32(&ire1->ire_identical_ref, 1);
    424      0    stevel 			ire_atomic_end(irb_ptr, ire);
    425      0    stevel 			ire_delete(ire);
    426  11042      Erik 			return (ire1);
    427      0    stevel 		}
    428      0    stevel 	}
    429      0    stevel 
    430  11042      Erik 	/*
    431  11042      Erik 	 * Normally we do head insertion since most things do not care about
    432  11042      Erik 	 * the order of the IREs in the bucket.
    433  11042      Erik 	 * However, due to shared-IP zones (and restrict_interzone_loopback)
    434  11042      Erik 	 * we can have an IRE_LOCAL as well as IRE_IF_CLONE for the same
    435  11042      Erik 	 * address. For that reason we do tail insertion for IRE_IF_CLONE.
    436  11042      Erik 	 */
    437  11042      Erik 	irep = (ire_t **)irb_ptr;
    438  11042      Erik 	if (ire->ire_type & IRE_IF_CLONE) {
    439  11042      Erik 		while ((ire1 = *irep) != NULL)
    440  11042      Erik 			irep = &ire1->ire_next;
    441      0    stevel 	}
    442      0    stevel 	/* Insert at *irep */
    443      0    stevel 	ire1 = *irep;
    444      0    stevel 	if (ire1 != NULL)
    445      0    stevel 		ire1->ire_ptpn = &ire->ire_next;
    446      0    stevel 	ire->ire_next = ire1;
    447      0    stevel 	/* Link the new one in. */
    448      0    stevel 	ire->ire_ptpn = irep;
    449      0    stevel 	/*
    450      0    stevel 	 * ire_walk routines de-reference ire_next without holding
    451      0    stevel 	 * a lock. Before we point to the new ire, we want to make
    452      0    stevel 	 * sure the store that sets the ire_next of the new ire
    453      0    stevel 	 * reaches global visibility, so that ire_walk routines
    454      0    stevel 	 * don't see a truncated list of ires i.e if the ire_next
    455      0    stevel 	 * of the new ire gets set after we do "*irep = ire" due
    456      0    stevel 	 * to re-ordering, the ire_walk thread will see a NULL
    457      0    stevel 	 * once it accesses the ire_next of the new ire.
    458      0    stevel 	 * membar_producer() makes sure that the following store
    459      0    stevel 	 * happens *after* all of the above stores.
    460      0    stevel 	 */
    461      0    stevel 	membar_producer();
    462      0    stevel 	*irep = ire;
    463      0    stevel 	ire->ire_bucket = irb_ptr;
    464      0    stevel 	/*
    465      0    stevel 	 * We return a bumped up IRE above. Keep it symmetrical
    466      0    stevel 	 * so that the callers will always have to release. This
    467      0    stevel 	 * helps the callers of this function because they continue
    468      0    stevel 	 * to use the IRE after adding and hence they don't have to
    469      0    stevel 	 * lookup again after we return the IRE.
    470      0    stevel 	 *
    471      0    stevel 	 * NOTE : We don't have to use atomics as this is appearing
    472      0    stevel 	 * in the list for the first time and no one else can bump
    473      0    stevel 	 * up the reference count on this yet.
    474      0    stevel 	 */
    475  11042      Erik 	ire_refhold_locked(ire);
    476   3448  dh155122 	BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_inserted);
    477      0    stevel 	irb_ptr->irb_ire_cnt++;
    478      0    stevel 
    479  11042      Erik 	if (ire->ire_ill != NULL) {
    480  11042      Erik 		DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ire->ire_ill,
    481   6255   sowmini 		    (char *), "ire", (void *), ire);
    482  11042      Erik 		ire->ire_ill->ill_ire_cnt++;
    483  11042      Erik 		ASSERT(ire->ire_ill->ill_ire_cnt != 0);	/* Wraparound */
    484      0    stevel 	}
    485      0    stevel 	ire_atomic_end(irb_ptr, ire);
    486      0    stevel 
    487  11042      Erik 	/* Make any caching of the IREs be notified or updated */
    488  11042      Erik 	ire_flush_cache_v6(ire, IRE_FLUSH_ADD);
    489      0    stevel 
    490  11042      Erik 	return (ire);
    491      0    stevel }
    492      0    stevel 
    493      0    stevel /*
    494      0    stevel  * Search for all HOST REDIRECT routes that are
    495      0    stevel  * pointing at the specified gateway and
    496      0    stevel  * delete them. This routine is called only
    497      0    stevel  * when a default gateway is going away.
    498      0    stevel  */
    499      0    stevel static void
    500   3448  dh155122 ire_delete_host_redirects_v6(const in6_addr_t *gateway, ip_stack_t *ipst)
    501      0    stevel {
    502      0    stevel 	irb_t *irb_ptr;
    503      0    stevel 	irb_t *irb;
    504      0    stevel 	ire_t *ire;
    505      0    stevel 	in6_addr_t gw_addr_v6;
    506      0    stevel 	int i;
    507      0    stevel 
    508      0    stevel 	/* get the hash table for HOST routes */
    509   3448  dh155122 	irb_ptr = ipst->ips_ip_forwarding_table_v6[(IP6_MASK_TABLE_SIZE - 1)];
    510      0    stevel 	if (irb_ptr == NULL)
    511      0    stevel 		return;
    512   3448  dh155122 	for (i = 0; (i < ipst->ips_ip6_ftable_hash_size); i++) {
    513      0    stevel 		irb = &irb_ptr[i];
    514  11042      Erik 		irb_refhold(irb);
    515      0    stevel 		for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) {
    516   3004  dd193516 			if (!(ire->ire_flags & RTF_DYNAMIC))
    517      0    stevel 				continue;
    518      0    stevel 			mutex_enter(&ire->ire_lock);
    519      0    stevel 			gw_addr_v6 = ire->ire_gateway_addr_v6;
    520      0    stevel 			mutex_exit(&ire->ire_lock);
    521      0    stevel 			if (IN6_ARE_ADDR_EQUAL(&gw_addr_v6, gateway))
    522      0    stevel 				ire_delete(ire);
    523      0    stevel 		}
    524  11042      Erik 		irb_refrele(irb);
    525      0    stevel 	}
    526      0    stevel }
    527      0    stevel 
    528      0    stevel /*
    529      0    stevel  * Delete the specified IRE.
    530      0    stevel  * All calls should use ire_delete().
    531      0    stevel  * Sometimes called as writer though not required by this function.
    532      0    stevel  *
    533      0    stevel  * NOTE : This function is called only if the ire was added
    534      0    stevel  * in the list.
    535      0    stevel  */
    536      0    stevel void
    537      0    stevel ire_delete_v6(ire_t *ire)
    538      0    stevel {
    539      0    stevel 	in6_addr_t gw_addr_v6;
    540   3448  dh155122 	ip_stack_t	*ipst = ire->ire_ipst;
    541      0    stevel 
    542  11042      Erik 	/*
    543  11042      Erik 	 * Make sure ire_generation increases from ire_flush_cache happen
    544  11042      Erik 	 * after any lookup/reader has read ire_generation.
    545  11042      Erik 	 * Since the rw_enter makes us wait until any lookup/reader has
    546  11042      Erik 	 * completed we can exit the lock immediately.
    547  11042      Erik 	 */
    548  11042      Erik 	rw_enter(&ipst->ips_ip6_ire_head_lock, RW_WRITER);
    549  11042      Erik 	rw_exit(&ipst->ips_ip6_ire_head_lock);
    550  11042      Erik 
    551      0    stevel 	ASSERT(ire->ire_refcnt >= 1);
    552      0    stevel 	ASSERT(ire->ire_ipversion == IPV6_VERSION);
    553      0    stevel 
    554  11042      Erik 	ire_flush_cache_v6(ire, IRE_FLUSH_DELETE);
    555  11042      Erik 
    556      0    stevel 	if (ire->ire_type == IRE_DEFAULT) {
    557      0    stevel 		/*
    558      0    stevel 		 * when a default gateway is going away
    559      0    stevel 		 * delete all the host redirects pointing at that
    560      0    stevel 		 * gateway.
    561      0    stevel 		 */
    562      0    stevel 		mutex_enter(&ire->ire_lock);
    563      0    stevel 		gw_addr_v6 = ire->ire_gateway_addr_v6;
    564      0    stevel 		mutex_exit(&ire->ire_lock);
    565   3448  dh155122 		ire_delete_host_redirects_v6(&gw_addr_v6, ipst);
    566      0    stevel 	}
    567  11042      Erik 
    568  11042      Erik 	/*
    569  11042      Erik 	 * If we are deleting an IRE_INTERFACE then we make sure we also
    570  11042      Erik 	 * delete any IRE_IF_CLONE that has been created from it.
    571  11042      Erik 	 * Those are always in ire_dep_children.
    572  11042      Erik 	 */
    573  11042      Erik 	if ((ire->ire_type & IRE_INTERFACE) && ire->ire_dep_children != 0)
    574  11042      Erik 		ire_dep_delete_if_clone(ire);
    575  11042      Erik 
    576  11042      Erik 	/* Remove from parent dependencies and child */
    577  11042      Erik 	rw_enter(&ipst->ips_ire_dep_lock, RW_WRITER);
    578  11042      Erik 	if (ire->ire_dep_parent != NULL) {
    579  11042      Erik 		ire_dep_remove(ire);
    580  11042      Erik 	}
    581  11042      Erik 	while (ire->ire_dep_children != NULL)
    582  11042      Erik 		ire_dep_remove(ire->ire_dep_children);
    583  11042      Erik 	rw_exit(&ipst->ips_ire_dep_lock);
    584      0    stevel }
    585      0    stevel 
    586      0    stevel /*
    587  11042      Erik  * When an IRE is added or deleted this routine is called to make sure
    588  11042      Erik  * any caching of IRE information is notified or updated.
    589      0    stevel  *
    590  11042      Erik  * The flag argument indicates if the flush request is due to addition
    591  11042      Erik  * of new route (IRE_FLUSH_ADD), deletion of old route (IRE_FLUSH_DELETE),
    592  11042      Erik  * or a change to ire_gateway_addr (IRE_FLUSH_GWCHANGE).
    593      0    stevel  */
    594      0    stevel void
    595      0    stevel ire_flush_cache_v6(ire_t *ire, int flag)
    596      0    stevel {
    597  11042      Erik 	ip_stack_t *ipst = ire->ire_ipst;
    598      0    stevel 
    599  11042      Erik 	/*
    600  11042      Erik 	 * IRE_IF_CLONE ire's don't provide any new information
    601  11042      Erik 	 * than the parent from which they are cloned, so don't
    602  11042      Erik 	 * perturb the generation numbers.
    603  11042      Erik 	 */
    604  11042      Erik 	if (ire->ire_type & IRE_IF_CLONE)
    605   4714   sowmini 		return;
    606      0    stevel 
    607      0    stevel 	/*
    608  11042      Erik 	 * Ensure that an ire_add during a lookup serializes the updates of
    609  11042      Erik 	 * the generation numbers under ire_head_lock so that the lookup gets
    610  11042      Erik 	 * either the old ire and old generation number, or a new ire and new
    611  11042      Erik 	 * generation number.
    612      0    stevel 	 */
    613  11042      Erik 	rw_enter(&ipst->ips_ip6_ire_head_lock, RW_WRITER);
    614  11042      Erik 
    615  11042      Erik 	/*
    616  11042      Erik 	 * If a route was just added, we need to notify everybody that
    617  11042      Erik 	 * has cached an IRE_NOROUTE since there might now be a better
    618  11042      Erik 	 * route for them.
    619  11042      Erik 	 */
    620  11042      Erik 	if (flag == IRE_FLUSH_ADD) {
    621  11042      Erik 		ire_increment_generation(ipst->ips_ire_reject_v6);
    622  11042      Erik 		ire_increment_generation(ipst->ips_ire_blackhole_v6);
    623  11042      Erik 	}
    624  11042      Erik 
    625  11042      Erik 	/* Adding a default can't otherwise provide a better route */
    626  11042      Erik 	if (ire->ire_type == IRE_DEFAULT && flag == IRE_FLUSH_ADD) {
    627  11042      Erik 		rw_exit(&ipst->ips_ip6_ire_head_lock);
    628      0    stevel 		return;
    629  11042      Erik 	}
    630  11042      Erik 
    631  11042      Erik 	switch (flag) {
    632  11042      Erik 	case IRE_FLUSH_DELETE:
    633  11042      Erik 	case IRE_FLUSH_GWCHANGE:
    634      0    stevel 		/*
    635  11042      Erik 		 * Update ire_generation for all ire_dep_children chains
    636  11042      Erik 		 * starting with this IRE
    637      0    stevel 		 */
    638  11042      Erik 		ire_dep_incr_generation(ire);
    639  11042      Erik 		break;
    640  11042      Erik 	case IRE_FLUSH_ADD: {
    641  11042      Erik 		in6_addr_t	addr;
    642  11042      Erik 		in6_addr_t	mask;
    643  11042      Erik 		ip_stack_t	*ipst = ire->ire_ipst;
    644  11042      Erik 		uint_t		masklen;
    645  11042      Erik 
    646  11042      Erik 		/*
    647  11042      Erik 		 * Find an IRE which is a shorter match than the ire to be added
    648  11042      Erik 		 * For any such IRE (which we repeat) we update the
    649  11042      Erik 		 * ire_generation the same way as in the delete case.
    650  11042      Erik 		 */
    651  11042      Erik 		addr = ire->ire_addr_v6;
    652  11042      Erik 		mask = ire->ire_mask_v6;
    653  11042      Erik 		masklen = ip_mask_to_plen_v6(&mask);
    654  11042      Erik 
    655  11042      Erik 		ire = ire_ftable_lookup_impl_v6(&addr, &mask, NULL, 0, NULL,
    656  11042      Erik 		    ALL_ZONES, NULL, MATCH_IRE_SHORTERMASK, ipst);
    657  11042      Erik 		while (ire != NULL) {
    658  11042      Erik 			/* We need to handle all in the same bucket */
    659  11042      Erik 			irb_increment_generation(ire->ire_bucket);
    660  11042      Erik 
    661  11042      Erik 			mask = ire->ire_mask_v6;
    662  11042      Erik 			ASSERT(masklen > ip_mask_to_plen_v6(&mask));
    663  11042      Erik 			masklen = ip_mask_to_plen_v6(&mask);
    664  11042      Erik 			ire_refrele(ire);
    665  11042      Erik 			ire = ire_ftable_lookup_impl_v6(&addr, &mask, NULL, 0,
    666  11042      Erik 			    NULL, ALL_ZONES, NULL, MATCH_IRE_SHORTERMASK, ipst);
    667      0    stevel 		}
    668      0    stevel 		}
    669  11042      Erik 		break;
    670      0    stevel 	}
    671  11042      Erik 	rw_exit(&ipst->ips_ip6_ire_head_lock);
    672      0    stevel }
    673      0    stevel 
    674      0    stevel /*
    675      0    stevel  * Matches the arguments passed with the values in the ire.
    676      0    stevel  *
    677  11042      Erik  * Note: for match types that match using "ill" passed in, ill
    678      0    stevel  * must be checked for non-NULL before calling this routine.
    679      0    stevel  */
    680  11042      Erik boolean_t
    681      0    stevel ire_match_args_v6(ire_t *ire, const in6_addr_t *addr, const in6_addr_t *mask,
    682  11042      Erik     const in6_addr_t *gateway, int type, const ill_t *ill, zoneid_t zoneid,
    683  11042      Erik     const ts_label_t *tsl, int match_flags)
    684      0    stevel {
    685      0    stevel 	in6_addr_t masked_addr;
    686      0    stevel 	in6_addr_t gw_addr_v6;
    687      0    stevel 	ill_t *ire_ill = NULL, *dst_ill;
    688  11042      Erik 	ip_stack_t *ipst = ire->ire_ipst;
    689      0    stevel 
    690      0    stevel 	ASSERT(ire->ire_ipversion == IPV6_VERSION);
    691      0    stevel 	ASSERT(addr != NULL);
    692      0    stevel 	ASSERT(mask != NULL);
    693      0    stevel 	ASSERT((!(match_flags & MATCH_IRE_GW)) || gateway != NULL);
    694   8485     Peter 	ASSERT((!(match_flags & MATCH_IRE_ILL)) ||
    695  11042      Erik 	    (ill != NULL && ill->ill_isv6));
    696      0    stevel 
    697      0    stevel 	/*
    698  11042      Erik 	 * If MATCH_IRE_TESTHIDDEN is set, then only return the IRE if it
    699  11042      Erik 	 * is in fact hidden, to ensure the caller gets the right one.
    700      0    stevel 	 */
    701  11042      Erik 	if (ire->ire_testhidden) {
    702  11042      Erik 		if (!(match_flags & MATCH_IRE_TESTHIDDEN))
    703   8485     Peter 			return (B_FALSE);
    704   8485     Peter 	}
    705      0    stevel 
    706   1676       jpk 	if (zoneid != ALL_ZONES && zoneid != ire->ire_zoneid &&
    707   1676       jpk 	    ire->ire_zoneid != ALL_ZONES) {
    708      0    stevel 		/*
    709  11042      Erik 		 * If MATCH_IRE_ZONEONLY has been set and the supplied zoneid
    710  11042      Erik 		 * does not match that of ire_zoneid, a failure to
    711      0    stevel 		 * match is reported at this point. Otherwise, since some IREs
    712      0    stevel 		 * that are available in the global zone can be used in local
    713      0    stevel 		 * zones, additional checks need to be performed:
    714      0    stevel 		 *
    715  11042      Erik 		 * IRE_LOOPBACK
    716  11042      Erik 		 *	entries should never be matched in this situation.
    717  11042      Erik 		 *	Each zone has its own IRE_LOOPBACK.
    718      0    stevel 		 *
    719  11042      Erik 		 * IRE_LOCAL
    720  11042      Erik 		 *	We allow them for any zoneid. ire_route_recursive
    721  11042      Erik 		 *	does additional checks when
    722  11042      Erik 		 *	ip_restrict_interzone_loopback is set.
    723      0    stevel 		 *
    724  11042      Erik 		 * If ill_usesrc_ifindex is set
    725  11042      Erik 		 *	Then we check if the zone has a valid source address
    726  11042      Erik 		 *	on the usesrc ill.
    727      0    stevel 		 *
    728  11042      Erik 		 * If ire_ill is set, then check that the zone has an ipif
    729  11042      Erik 		 *	on that ill.
    730  11042      Erik 		 *
    731  11042      Erik 		 * Outside of this function (in ire_round_robin) we check
    732  11042      Erik 		 * that any IRE_OFFLINK has a gateway that reachable from the
    733  11042      Erik 		 * zone when we have multiple choices (ECMP).
    734      0    stevel 		 */
    735      0    stevel 		if (match_flags & MATCH_IRE_ZONEONLY)
    736      0    stevel 			return (B_FALSE);
    737  11042      Erik 		if (ire->ire_type & IRE_LOOPBACK)
    738      0    stevel 			return (B_FALSE);
    739  11042      Erik 
    740  11042      Erik 		if (ire->ire_type & IRE_LOCAL)
    741  11042      Erik 			goto matchit;
    742  11042      Erik 
    743      0    stevel 		/*
    744  11042      Erik 		 * The normal case of IRE_ONLINK has a matching zoneid.
    745  11042      Erik 		 * Here we handle the case when shared-IP zones have been
    746  11042      Erik 		 * configured with IP addresses on vniN. In that case it
    747  11042      Erik 		 * is ok for traffic from a zone to use IRE_ONLINK routes
    748  11042      Erik 		 * if the ill has a usesrc pointing at vniN
    749  11042      Erik 		 * Applies to IRE_INTERFACE.
    750      0    stevel 		 */
    751  11042      Erik 		dst_ill = ire->ire_ill;
    752  11042      Erik 		if (ire->ire_type & IRE_ONLINK) {
    753  11042      Erik 			uint_t	ifindex;
    754  11042      Erik 
    755  11042      Erik 			/*
    756  11042      Erik 			 * Note there is no IRE_INTERFACE on vniN thus
    757  11042      Erik 			 * can't do an IRE lookup for a matching route.
    758  11042      Erik 			 */
    759  11042      Erik 			ifindex = dst_ill->ill_usesrc_ifindex;
    760  11042      Erik 			if (ifindex == 0)
    761  11042      Erik 				return (B_FALSE);
    762  11042      Erik 
    763      0    stevel 			/*
    764      0    stevel 			 * If there is a usable source address in the
    765  11042      Erik 			 * zone, then it's ok to return this IRE_INTERFACE
    766      0    stevel 			 */
    767  11042      Erik 			if (!ipif_zone_avail(ifindex, dst_ill->ill_isv6,
    768  11042      Erik 			    zoneid, ipst)) {
    769  11042      Erik 				ip3dbg(("ire_match_args: no usrsrc for zone"
    770      0    stevel 				    " dst_ill %p\n", (void *)dst_ill));
    771      0    stevel 				return (B_FALSE);
    772      0    stevel 			}
    773      0    stevel 		}
    774  11042      Erik 		/*
    775  11042      Erik 		 * For exampe, with
    776  11042      Erik 		 * route add 11.0.0.0 gw1 -ifp bge0
    777  11042      Erik 		 * route add 11.0.0.0 gw2 -ifp bge1
    778  11042      Erik 		 * this code would differentiate based on
    779  11042      Erik 		 * where the sending zone has addresses.
    780  11042      Erik 		 * Only if the zone has an address on bge0 can it use the first
    781  11042      Erik 		 * route. It isn't clear if this behavior is documented
    782  11042      Erik 		 * anywhere.
    783  11042      Erik 		 */
    784  11042      Erik 		if (dst_ill != NULL && (ire->ire_type & IRE_OFFLINK)) {
    785      0    stevel 			ipif_t	*tipif;
    786      0    stevel 
    787  11042      Erik 			mutex_enter(&dst_ill->ill_lock);
    788  11042      Erik 			for (tipif = dst_ill->ill_ipif;
    789      0    stevel 			    tipif != NULL; tipif = tipif->ipif_next) {
    790  11042      Erik 				if (!IPIF_IS_CONDEMNED(tipif) &&
    791      0    stevel 				    (tipif->ipif_flags & IPIF_UP) &&
    792   1676       jpk 				    (tipif->ipif_zoneid == zoneid ||
    793   1676       jpk 				    tipif->ipif_zoneid == ALL_ZONES))
    794      0    stevel 					break;
    795      0    stevel 			}
    796  11042      Erik 			mutex_exit(&dst_ill->ill_lock);
    797      0    stevel 			if (tipif == NULL)
    798      0    stevel 				return (B_FALSE);
    799      0    stevel 		}
    800      0    stevel 	}
    801      0    stevel 
    802  11042      Erik matchit:
    803      0    stevel 	if (match_flags & MATCH_IRE_GW) {
    804      0    stevel 		mutex_enter(&ire->ire_lock);
    805      0    stevel 		gw_addr_v6 = ire->ire_gateway_addr_v6;
    806      0    stevel 		mutex_exit(&ire->ire_lock);
    807      0    stevel 	}
    808  11042      Erik 	if (match_flags & MATCH_IRE_ILL) {
    809  11042      Erik 		ire_ill = ire->ire_ill;
    810   8485     Peter 
    811  11042      Erik 		/*
    812  11042      Erik 		 * If asked to match an ill, we *must* match
    813  11042      Erik 		 * on the ire_ill for ipmp test addresses, or
    814  11042      Erik 		 * any of the ill in the group for data addresses.
    815  11042      Erik 		 * If we don't, we may as well fail.
    816  11042      Erik 		 * However, we need an exception for IRE_LOCALs to ensure
    817  11042      Erik 		 * we loopback packets even sent to test addresses on different
    818  11042      Erik 		 * interfaces in the group.
    819  11042      Erik 		 */
    820  11042      Erik 		if ((match_flags & MATCH_IRE_TESTHIDDEN) &&
    821  11042      Erik 		    !(ire->ire_type & IRE_LOCAL)) {
    822  11042      Erik 			if (ire->ire_ill != ill)
    823  11042      Erik 				return (B_FALSE);
    824  11042      Erik 		} else  {
    825  11042      Erik 			match_flags &= ~MATCH_IRE_TESTHIDDEN;
    826  11042      Erik 			/*
    827  11042      Erik 			 * We know that ill is not NULL, but ire_ill could be
    828  11042      Erik 			 * NULL
    829  11042      Erik 			 */
    830  11042      Erik 			if (ire_ill == NULL || !IS_ON_SAME_LAN(ill, ire_ill))
    831  11042      Erik 				return (B_FALSE);
    832  11042      Erik 		}
    833      0    stevel 	}
    834      0    stevel 	/* No ire_addr_v6 bits set past the mask */
    835      0    stevel 	ASSERT(V6_MASK_EQ(ire->ire_addr_v6, ire->ire_mask_v6,
    836      0    stevel 	    ire->ire_addr_v6));
    837      0    stevel 	V6_MASK_COPY(*addr, *mask, masked_addr);
    838      0    stevel 	if (V6_MASK_EQ(*addr, *mask, ire->ire_addr_v6) &&
    839      0    stevel 	    ((!(match_flags & MATCH_IRE_GW)) ||
    840   4714   sowmini 	    IN6_ARE_ADDR_EQUAL(&gw_addr_v6, gateway)) &&
    841  11042      Erik 	    ((!(match_flags & MATCH_IRE_TYPE)) || (ire->ire_type & type)) &&
    842  11042      Erik 	    ((!(match_flags & MATCH_IRE_TESTHIDDEN)) || ire->ire_testhidden) &&
    843  11042      Erik 	    ((!(match_flags & MATCH_IRE_MASK)) ||
    844  11042      Erik 	    (IN6_ARE_ADDR_EQUAL(&ire->ire_mask_v6, mask))) &&
    845   1676       jpk 	    ((!(match_flags & MATCH_IRE_SECATTR)) ||
    846   4714   sowmini 	    (!is_system_labeled()) ||
    847   4714   sowmini 	    (tsol_ire_match_gwattr(ire, tsl) == 0))) {
    848      0    stevel 		/* We found the matched IRE */
    849      0    stevel 		return (B_TRUE);
    850      0    stevel 	}
    851      0    stevel 	return (B_FALSE);
    852      0    stevel }
    853      0    stevel 
    854      0    stevel /*
    855  11042      Erik  * Check if the zoneid (not ALL_ZONES) has an IRE_INTERFACE for the specified
    856  11042      Erik  * gateway address. If ill is non-NULL we also match on it.
    857  11042      Erik  * The caller must hold a read lock on RADIX_NODE_HEAD if lock_held is set.
    858      0    stevel  */
    859  11042      Erik boolean_t
    860  11042      Erik ire_gateway_ok_zone_v6(const in6_addr_t *gateway, zoneid_t zoneid, ill_t *ill,
    861  11042      Erik     const ts_label_t *tsl, ip_stack_t *ipst, boolean_t lock_held)
    862      0    stevel {
    863  11042      Erik 	ire_t	*ire;
    864  11042      Erik 	uint_t	match_flags;
    865      0    stevel 
    866  11042      Erik 	if (lock_held)
    867  11042      Erik 		ASSERT(RW_READ_HELD(&ipst->ips_ip6_ire_head_lock));
    868  11042      Erik 	else
    869  11042      Erik 		rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER);
    870      0    stevel 
    871  11042      Erik 	match_flags = MATCH_IRE_TYPE | MATCH_IRE_SECATTR;
    872  11042      Erik 	if (ill != NULL)
    873  11042      Erik 		match_flags |= MATCH_IRE_ILL;
    874  11042      Erik 
    875  11042      Erik 	ire = ire_ftable_lookup_impl_v6(gateway, &ipv6_all_zeros,
    876  11042      Erik 	    &ipv6_all_zeros, IRE_INTERFACE, ill, zoneid, tsl, match_flags,
    877  11042      Erik 	    ipst);
    878  11042      Erik 
    879  11042      Erik 	if (!lock_held)
    880  11042      Erik 		rw_exit(&ipst->ips_ip6_ire_head_lock);
    881  11042      Erik 	if (ire != NULL) {
    882  11042      Erik 		ire_refrele(ire);
    883  11042      Erik 		return (B_TRUE);
    884  11042      Erik 	} else {
    885  11042      Erik 		return (B_FALSE);
    886      0    stevel 	}
    887      0    stevel }
    888      0    stevel 
    889      0    stevel /*
    890      0    stevel  * Lookup a route in forwarding table.
    891      0    stevel  * specific lookup is indicated by passing the
    892      0    stevel  * required parameters and indicating the
    893      0    stevel  * match required in flag field.
    894      0    stevel  *
    895      0    stevel  * Supports link-local addresses by following the ipif/ill when recursing.
    896      0    stevel  */
    897      0    stevel ire_t *
    898      0    stevel ire_ftable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask,
    899  11042      Erik     const in6_addr_t *gateway, int type, const ill_t *ill,
    900  11042      Erik     zoneid_t zoneid, const ts_label_t *tsl, int flags,
    901  11042      Erik     uint32_t xmit_hint, ip_stack_t *ipst, uint_t *generationp)
    902      0    stevel {
    903      0    stevel 	ire_t *ire = NULL;
    904      0    stevel 
    905      0    stevel 	ASSERT(addr != NULL);
    906      0    stevel 	ASSERT((!(flags & MATCH_IRE_MASK)) || mask != NULL);
    907      0    stevel 	ASSERT((!(flags & MATCH_IRE_GW)) || gateway != NULL);
    908  11042      Erik 	ASSERT(ill == NULL || ill->ill_isv6);
    909  11042      Erik 
    910  11042      Erik 	ASSERT(!IN6_IS_ADDR_V4MAPPED(addr));
    911      0    stevel 
    912      0    stevel 	/*
    913  11042      Erik 	 * ire_match_args_v6() will dereference ill if MATCH_IRE_ILL
    914  11042      Erik 	 * is set.
    915      0    stevel 	 */
    916  11042      Erik 	if ((flags & (MATCH_IRE_ILL)) && (ill == NULL))
    917  11042      Erik 		return (NULL);
    918  11042      Erik 
    919  11042      Erik 	rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER);
    920  11042      Erik 	ire = ire_ftable_lookup_impl_v6(addr, mask, gateway, type, ill, zoneid,
    921  11042      Erik 	    tsl, flags, ipst);
    922  11042      Erik 	if (ire == NULL) {
    923  11042      Erik 		rw_exit(&ipst->ips_ip6_ire_head_lock);
    924  11042      Erik 		return (NULL);
    925  11042      Erik 	}
    926  11042      Erik 
    927      0    stevel 	/*
    928  11042      Erik 	 * round-robin only if we have more than one route in the bucket.
    929  11042      Erik 	 * ips_ip_ecmp_behavior controls when we do ECMP
    930  11042      Erik 	 *	2:	always
    931  11042      Erik 	 *	1:	for IRE_DEFAULT and /0 IRE_INTERFACE
    932  11042      Erik 	 *	0:	never
    933  11042      Erik 	 *
    934  11042      Erik 	 * Note: if we found an IRE_IF_CLONE we won't look at the bucket with
    935  11042      Erik 	 * other ECMP IRE_INTERFACEs since the IRE_IF_CLONE is a /128 match
    936  11042      Erik 	 * and the IRE_INTERFACESs are likely to be shorter matches.
    937      0    stevel 	 */
    938  11042      Erik 	if (ire->ire_bucket->irb_ire_cnt > 1 && !(flags & MATCH_IRE_GW)) {
    939  11042      Erik 		if (ipst->ips_ip_ecmp_behavior == 2 ||
    940  11042      Erik 		    (ipst->ips_ip_ecmp_behavior == 1 &&
    941  11042      Erik 		    IS_DEFAULT_ROUTE_V6(ire))) {
    942  11042      Erik 			ire_t	*next_ire;
    943  11042      Erik 			ire_ftable_args_t margs;
    944  11042      Erik 
    945  11131      Erik 			bzero(&margs, sizeof (margs));
    946  11042      Erik 			margs.ift_addr_v6 = *addr;
    947  11042      Erik 			if (mask != NULL)
    948  11042      Erik 				margs.ift_mask_v6 = *mask;
    949  11042      Erik 			if (gateway != NULL)
    950  11042      Erik 				margs.ift_gateway_v6 = *gateway;
    951  11042      Erik 			margs.ift_type = type;
    952  11042      Erik 			margs.ift_ill = ill;
    953  11042      Erik 			margs.ift_zoneid = zoneid;
    954  11042      Erik 			margs.ift_tsl = tsl;
    955  11042      Erik 			margs.ift_flags = flags;
    956  11042      Erik 
    957  11042      Erik 			next_ire = ire_round_robin(ire->ire_bucket, &margs,
    958  11042      Erik 			    xmit_hint, ire, ipst);
    959  11042      Erik 			if (next_ire == NULL) {
    960  11042      Erik 				/* keep ire if next_ire is null */
    961  11042      Erik 				goto done;
    962  11042      Erik 			}
    963  11042      Erik 			ire_refrele(ire);
    964  11042      Erik 			ire = next_ire;
    965  11042      Erik 		}
    966  11042      Erik 	}
    967  11042      Erik 
    968  11042      Erik done:
    969  11042      Erik 	/* Return generation before dropping lock */
    970  11042      Erik 	if (generationp != NULL)
    971  11042      Erik 		*generationp = ire->ire_generation;
    972  11042      Erik 
    973  11042      Erik 	rw_exit(&ipst->ips_ip6_ire_head_lock);
    974  11042      Erik 
    975  11042      Erik 	/*
    976  11042      Erik 	 * For shared-IP zones we need additional checks to what was
    977  11042      Erik 	 * done in ire_match_args to make sure IRE_LOCALs are handled.
    978  11042      Erik 	 *
    979  11042      Erik 	 * When ip_restrict_interzone_loopback is set, then
    980  11042      Erik 	 * we ensure that IRE_LOCAL are only used for loopback
    981  11042      Erik 	 * between zones when the logical "Ethernet" would
    982  11042      Erik 	 * have looped them back. That is, if in the absense of
    983  11042      Erik 	 * the IRE_LOCAL we would have sent to packet out the
    984  11042      Erik 	 * same ill.
    985  11042      Erik 	 */
    986  11042      Erik 	if ((ire->ire_type & IRE_LOCAL) && zoneid != ALL_ZONES &&
    987  11042      Erik 	    ire->ire_zoneid != zoneid && ire->ire_zoneid != ALL_ZONES &&
    988  11042      Erik 	    ipst->ips_ip_restrict_interzone_loopback) {
    989  11042      Erik 		ire = ire_alt_local(ire, zoneid, tsl, ill, generationp);
    990  11042      Erik 		ASSERT(ire != NULL);
    991  11042      Erik 	}
    992  11042      Erik 
    993  11042      Erik 	return (ire);
    994  11042      Erik }
    995  11042      Erik 
    996  11042      Erik /*
    997  11042      Erik  * Look up a single ire. The caller holds either the read or write lock.
    998  11042      Erik  */
    999  11042      Erik ire_t *
   1000  11042      Erik ire_ftable_lookup_impl_v6(const in6_addr_t *addr, const in6_addr_t *mask,
   1001  11042      Erik     const in6_addr_t *gateway, int type, const ill_t *ill,
   1002  11042      Erik     zoneid_t zoneid, const ts_label_t *tsl, int flags,
   1003  11042      Erik     ip_stack_t *ipst)
   1004  11042      Erik {
   1005  11042      Erik 	irb_t *irb_ptr;
   1006  11042      Erik 	ire_t *ire = NULL;
   1007  11042      Erik 	int i;
   1008  11042      Erik 
   1009  11042      Erik 	ASSERT(RW_LOCK_HELD(&ipst->ips_ip6_ire_head_lock));
   1010      0    stevel 
   1011      0    stevel 	/*
   1012      0    stevel 	 * If the mask is known, the lookup
   1013      0    stevel 	 * is simple, if the mask is not known
   1014      0    stevel 	 * we need to search.
   1015      0    stevel 	 */
   1016      0    stevel 	if (flags & MATCH_IRE_MASK) {
   1017      0    stevel 		uint_t masklen;
   1018      0    stevel 
   1019      0    stevel 		masklen = ip_mask_to_plen_v6(mask);
   1020  11042      Erik 		if (ipst->ips_ip_forwarding_table_v6[masklen] == NULL) {
   1021      0    stevel 			return (NULL);
   1022  11042      Erik 		}
   1023   3448  dh155122 		irb_ptr = &(ipst->ips_ip_forwarding_table_v6[masklen][
   1024   3448  dh155122 		    IRE_ADDR_MASK_HASH_V6(*addr, *mask,
   1025   4714   sowmini 		    ipst->ips_ip6_ftable_hash_size)]);
   1026      0    stevel 		rw_enter(&irb_ptr->irb_lock, RW_READER);
   1027      0    stevel 		for (ire = irb_ptr->irb_ire; ire != NULL;
   1028      0    stevel 		    ire = ire->ire_next) {
   1029  11042      Erik 			if (IRE_IS_CONDEMNED(ire))
   1030      0    stevel 				continue;
   1031      0    stevel 			if (ire_match_args_v6(ire, addr, mask, gateway, type,
   1032  11042      Erik 			    ill, zoneid, tsl, flags))
   1033      0    stevel 				goto found_ire;
   1034      0    stevel 		}
   1035      0    stevel 		rw_exit(&irb_ptr->irb_lock);
   1036      0    stevel 	} else {
   1037  11042      Erik 		uint_t masklen;
   1038  11042      Erik 
   1039      0    stevel 		/*
   1040      0    stevel 		 * In this case we don't know the mask, we need to
   1041      0    stevel 		 * search the table assuming different mask sizes.
   1042      0    stevel 		 */
   1043  11042      Erik 		if (flags & MATCH_IRE_SHORTERMASK) {
   1044  11042      Erik 			masklen = ip_mask_to_plen_v6(mask);
   1045  11042      Erik 			if (masklen == 0) {
   1046  11042      Erik 				/* Nothing shorter than zero */
   1047  11042      Erik 				return (NULL);
   1048  11042      Erik 			}
   1049  11042      Erik 			masklen--;
   1050  11042      Erik 		} else {
   1051  11042      Erik 			masklen = IP6_MASK_TABLE_SIZE - 1;
   1052  11042      Erik 		}
   1053  11042      Erik 
   1054  11042      Erik 		for (i = masklen; i >= 0; i--) {
   1055      0    stevel 			in6_addr_t tmpmask;
   1056      0    stevel 
   1057   3448  dh155122 			if ((ipst->ips_ip_forwarding_table_v6[i]) == NULL)
   1058      0    stevel 				continue;
   1059      0    stevel 			(void) ip_plen_to_mask_v6(i, &tmpmask);
   1060   3448  dh155122 			irb_ptr = &ipst->ips_ip_forwarding_table_v6[i][
   1061      0    stevel 			    IRE_ADDR_MASK_HASH_V6(*addr, tmpmask,
   1062   3448  dh155122 			    ipst->ips_ip6_ftable_hash_size)];
   1063      0    stevel 			rw_enter(&irb_ptr->irb_lock, RW_READER);
   1064      0    stevel 			for (ire = irb_ptr->irb_ire; ire != NULL;
   1065      0    stevel 			    ire = ire->ire_next) {
   1066  11042      Erik 				if (IRE_IS_CONDEMNED(ire))
   1067      0    stevel 					continue;
   1068      0    stevel 				if (ire_match_args_v6(ire, addr,
   1069  11042      Erik 				    &ire->ire_mask_v6, gateway, type, ill,
   1070  11042      Erik 				    zoneid, tsl, flags))
   1071      0    stevel 					goto found_ire;
   1072      0    stevel 			}
   1073      0    stevel 			rw_exit(&irb_ptr->irb_lock);
   1074      0    stevel 		}
   1075      0    stevel 	}
   1076  11042      Erik 	ASSERT(ire == NULL);
   1077  11042      Erik 	ip1dbg(("ire_ftable_lookup_v6: returning NULL ire"));
   1078  11042      Erik 	return (NULL);
   1079  11042      Erik 
   1080  11042      Erik found_ire:
   1081  11042      Erik 	ire_refhold(ire);
   1082  11042      Erik 	rw_exit(&irb_ptr->irb_lock);
   1083  11042      Erik 	return (ire);
   1084  11042      Erik }
   1085  11042      Erik 
   1086  11042      Erik 
   1087  11042      Erik /*
   1088  11042      Erik  * This function is called by
   1089  11042      Erik  * ip_input/ire_route_recursive when doing a route lookup on only the
   1090  11042      Erik  * destination address.
   1091  11042      Erik  *
   1092  11042      Erik  * The optimizations of this function over ire_ftable_lookup are:
   1093  11042      Erik  *	o removing unnecessary flag matching
   1094  11042      Erik  *	o doing longest prefix match instead of overloading it further
   1095  11042      Erik  *	  with the unnecessary "best_prefix_match"
   1096  11042      Erik  *
   1097  11042      Erik  * If no route is found we return IRE_NOROUTE.
   1098  11042      Erik  */
   1099  11042      Erik ire_t *
   1100  11042      Erik ire_ftable_lookup_simple_v6(const in6_addr_t *addr, uint32_t xmit_hint,
   1101  11042      Erik     ip_stack_t *ipst, uint_t *generationp)
   1102  11042      Erik {
   1103  11042      Erik 	ire_t	*ire;
   1104  11042      Erik 
   1105  11042      Erik 	ire = ire_ftable_lookup_v6(addr, NULL, NULL, 0, NULL, ALL_ZONES, NULL,
   1106  11042      Erik 	    MATCH_IRE_DSTONLY, xmit_hint, ipst, generationp);
   1107  11042      Erik 	if (ire == NULL) {
   1108  11042      Erik 		ire = ire_reject(ipst, B_TRUE);
   1109  11042      Erik 		if (generationp != NULL)
   1110  11042      Erik 			*generationp = IRE_GENERATION_VERIFY;
   1111  11042      Erik 	}
   1112  11042      Erik 	/* ftable_lookup did round robin */
   1113  11042      Erik 	return (ire);
   1114  11042      Erik }
   1115  11042      Erik 
   1116  11042      Erik ire_t *
   1117  11042      Erik ip_select_route_v6(const in6_addr_t *dst, ip_xmit_attr_t *ixa,
   1118  11042      Erik     uint_t *generationp, in6_addr_t *setsrcp, int *errorp, boolean_t *multirtp)
   1119  11042      Erik {
   1120  11042      Erik 	ASSERT(!(ixa->ixa_flags & IXAF_IS_IPV4));
   1121  11042      Erik 
   1122  11042      Erik 	return (ip_select_route(dst, ixa, generationp, setsrcp, errorp,
   1123  11042      Erik 	    multirtp));
   1124  11042      Erik }
   1125  11042      Erik 
   1126  11042      Erik /*
   1127  11042      Erik  * Recursively look for a route to the destination. Can also match on
   1128  11042      Erik  * the zoneid, ill, and label. Used for the data paths. See also
   1129  11042      Erik  * ire_route_recursive_dstonly.
   1130  11042      Erik  *
   1131  11042      Erik  * If ill is set this means we will match it by adding MATCH_IRE_ILL.
   1132  11042      Erik  *
   1133  11042      Erik  * If allocate is not set then we will only inspect the existing IREs; never
   1134  11042      Erik  * create an IRE_IF_CLONE. This is used on the receive side when we are not
   1135  11042      Erik  * forwarding.
   1136  11042      Erik  *
   1137  11042      Erik  * Note that this function never returns NULL. It returns an IRE_NOROUTE
   1138  11042      Erik  * instead.
   1139  11042      Erik  *
   1140  11042      Erik  * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
   1141  11042      Erik  * is an error.
   1142  11042      Erik  * Allow at most one RTF_INDIRECT.
   1143  11042      Erik  */
   1144  11042      Erik ire_t *
   1145  11042      Erik ire_route_recursive_impl_v6(ire_t *ire,
   1146  11042      Erik     const in6_addr_t *nexthop, uint_t ire_type, const ill_t *ill_arg,
   1147  11042      Erik     zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
   1148  11042      Erik     boolean_t allocate, uint32_t xmit_hint, ip_stack_t *ipst,
   1149  11042      Erik     in6_addr_t *setsrcp, tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
   1150  11042      Erik {
   1151  11042      Erik 	int		i, j;
   1152  11042      Erik 	in6_addr_t	v6nexthop = *nexthop;
   1153  11042      Erik 	ire_t		*ires[MAX_IRE_RECURSION];
   1154  11042      Erik 	uint_t		generation;
   1155  11042      Erik 	uint_t		generations[MAX_IRE_RECURSION];
   1156  11042      Erik 	boolean_t	need_refrele = B_FALSE;
   1157  11042      Erik 	boolean_t	invalidate = B_FALSE;
   1158  11042      Erik 	int		prefs[MAX_IRE_RECURSION];
   1159  11042      Erik 	ill_t		*ill = NULL;
   1160  11042      Erik 
   1161  11042      Erik 	if (setsrcp != NULL)
   1162  11042      Erik 		ASSERT(IN6_IS_ADDR_UNSPECIFIED(setsrcp));
   1163  11042      Erik 	if (gwattrp != NULL)
   1164  11042      Erik 		ASSERT(*gwattrp == NULL);
   1165  11042      Erik 
   1166  11042      Erik 	if (ill_arg != NULL)
   1167  11042      Erik 		match_args |= MATCH_IRE_ILL;
   1168      0    stevel 
   1169      0    stevel 	/*
   1170  11042      Erik 	 * We iterate up to three times to resolve a route, even though
   1171  11042      Erik 	 * we have four slots in the array. The extra slot is for an
   1172  11042      Erik 	 * IRE_IF_CLONE we might need to create.
   1173      0    stevel 	 */
   1174  11042      Erik 	i = 0;
   1175  11042      Erik 	while (i < MAX_IRE_RECURSION - 1) {
   1176  11042      Erik 		/* ire_ftable_lookup handles round-robin/ECMP */
   1177  11042      Erik 		if (ire == NULL) {
   1178  11042      Erik 			ire = ire_ftable_lookup_v6(&v6nexthop, 0, 0, ire_type,
   1179  11042      Erik 			    (ill_arg != NULL ? ill_arg : ill), zoneid, tsl,
   1180  11042      Erik 			    match_args, xmit_hint, ipst, &generation);
   1181  11042      Erik 		} else {
   1182  11042      Erik 			/* Caller passed it; extra hold since we will rele */
   1183  11042      Erik 			ire_refhold(ire);
   1184  11042      Erik 			if (generationp != NULL)
   1185  11042      Erik 				generation = *generationp;
   1186  11042      Erik 			else
   1187  11042      Erik 				generation = IRE_GENERATION_VERIFY;
   1188  11042      Erik 		}
   1189      0    stevel 
   1190  11042      Erik 		if (ire == NULL)
   1191  11042      Erik 			ire = ire_reject(ipst, B_TRUE);
   1192      0    stevel 
   1193  11042      Erik 		/* Need to return the ire with RTF_REJECT|BLACKHOLE */
   1194  11042      Erik 		if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))
   1195  11042      Erik 			goto error;
   1196  11042      Erik 
   1197  11042      Erik 		ASSERT(!(ire->ire_type & IRE_MULTICAST)); /* Not in ftable */
   1198  11042      Erik 
   1199  11042      Erik 		if (i != 0) {
   1200  11131      Erik 			prefs[i] = ire_pref(ire);
   1201  11042      Erik 			/*
   1202  11042      Erik 			 * Don't allow anything unusual past the first
   1203  11042      Erik 			 * iteration.
   1204  11042      Erik 			 */
   1205  11042      Erik 			if ((ire->ire_type &
   1206  11042      Erik 			    (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST)) ||
   1207  11042      Erik 			    prefs[i] <= prefs[i-1]) {
   1208  11042      Erik 				ire_refrele(ire);
   1209  11042      Erik 				ire = ire_reject(ipst, B_TRUE);
   1210  11042      Erik 				goto error;
   1211      0    stevel 			}
   1212      0    stevel 		}
   1213  11042      Erik 		/* We have a usable IRE */
   1214  11042      Erik 		ires[i] = ire;
   1215  11042      Erik 		generations[i] = generation;
   1216  11042      Erik 		i++;
   1217      0    stevel 
   1218  11042      Erik 		/* The first RTF_SETSRC address is passed back if setsrcp */
   1219  11042      Erik 		if ((ire->ire_flags & RTF_SETSRC) &&
   1220  11042      Erik 		    setsrcp != NULL && IN6_IS_ADDR_UNSPECIFIED(setsrcp)) {
   1221  11042      Erik 			ASSERT(!IN6_IS_ADDR_UNSPECIFIED(
   1222  11042      Erik 			    &ire->ire_setsrc_addr_v6));
   1223  11042      Erik 			*setsrcp = ire->ire_setsrc_addr_v6;
   1224  11042      Erik 		}
   1225  11042      Erik 
   1226  11042      Erik 		/* The first ire_gw_secattr is passed back if gwattrp */
   1227  11042      Erik 		if (ire->ire_gw_secattr != NULL &&
   1228  11042      Erik 		    gwattrp != NULL && *gwattrp == NULL)
   1229  11042      Erik 			*gwattrp = ire->ire_gw_secattr;
   1230      0    stevel 
   1231      0    stevel 		/*
   1232  11042      Erik 		 * Check if we have a short-cut pointer to an IRE for this
   1233  11042      Erik 		 * destination, and that the cached dependency isn't stale.
   1234  11042      Erik 		 * In that case we've rejoined an existing tree towards a
   1235  11042      Erik 		 * parent, thus we don't need to continue the loop to
   1236  11042      Erik 		 * discover the rest of the tree.
   1237      0    stevel 		 */
   1238  11042      Erik 		mutex_enter(&ire->ire_lock);
   1239  11042      Erik 		if (ire->ire_dep_parent != NULL &&
   1240  11042      Erik 		    ire->ire_dep_parent->ire_generation ==
   1241  11042      Erik 		    ire->ire_dep_parent_generation) {
   1242  11042      Erik 			mutex_exit(&ire->ire_lock);
   1243  11042      Erik 			ire = NULL;
   1244  11042      Erik 			goto done;
   1245  11042      Erik 		}
   1246  11042      Erik 		mutex_exit(&ire->ire_lock);
   1247  11042      Erik 
   1248  11042      Erik 		/*
   1249  11042      Erik 		 * If this type should have an ire_nce_cache (even if it
   1250  11042      Erik 		 * doesn't yet have one) then we are done. Includes
   1251  11042      Erik 		 * IRE_INTERFACE with a full 128 bit mask.
   1252  11042      Erik 		 */
   1253  11042      Erik 		if (ire->ire_nce_capable) {
   1254  11042      Erik 			ire = NULL;
   1255  11042      Erik 			goto done;
   1256  11042      Erik 		}
   1257  11042      Erik 		ASSERT(!(ire->ire_type & IRE_IF_CLONE));
   1258  11042      Erik 		/*
   1259  11042      Erik 		 * For an IRE_INTERFACE we create an IRE_IF_CLONE for this
   1260  11042      Erik 		 * particular destination
   1261  11042      Erik 		 */
   1262  11042      Erik 		if (ire->ire_type & IRE_INTERFACE) {
   1263  11042      Erik 			ire_t		*clone;
   1264  11042      Erik 
   1265  11042      Erik 			ASSERT(ire->ire_masklen != IPV6_ABITS);
   1266  11042      Erik 
   1267  11042      Erik 			/*
   1268  11042      Erik 			 * In the case of ip_input and ILLF_FORWARDING not
   1269  11042      Erik 			 * being set, and in the case of RTM_GET,
   1270  11042      Erik 			 * there is no point in allocating
   1271  11042      Erik 			 * an IRE_IF_CLONE. We return the IRE_INTERFACE.
   1272  11042      Erik 			 * Note that !allocate can result in a ire_dep_parent
   1273  11042      Erik 			 * which is IRE_IF_* without an IRE_IF_CLONE.
   1274  11042      Erik 			 * We recover from that when we need to send packets
   1275  11042      Erik 			 * by ensuring that the generations become
   1276  11042      Erik 			 * IRE_GENERATION_VERIFY in this case.
   1277  11042      Erik 			 */
   1278  11042      Erik 			if (!allocate) {
   1279  11042      Erik 				invalidate = B_TRUE;
   1280  11042      Erik 				ire = NULL;
   1281  11042      Erik 				goto done;
   1282  11042      Erik 			}
   1283  11042      Erik 
   1284  11042      Erik 			clone = ire_create_if_clone(ire, &v6nexthop,
   1285  11042      Erik 			    &generation);
   1286  11042      Erik 			if (clone == NULL) {
   1287  11042      Erik 				/*
   1288  11042      Erik 				 * Temporary failure - no memory.
   1289  11042      Erik 				 * Don't want caller to cache IRE_NOROUTE.
   1290  11042      Erik 				 */
   1291  11042      Erik 				invalidate = B_TRUE;
   1292  11042      Erik 				ire = ire_blackhole(ipst, B_TRUE);
   1293  11042      Erik 				goto error;
   1294  11042      Erik 			}
   1295  11042      Erik 			/*
   1296  11042      Erik 			 * Make clone next to last entry and the
   1297  11042      Erik 			 * IRE_INTERFACE the last in the dependency
   1298  11042      Erik 			 * chain since the clone depends on the
   1299  11042      Erik 			 * IRE_INTERFACE.
   1300  11042      Erik 			 */
   1301  11042      Erik 			ASSERT(i >= 1);
   1302  11042      Erik 			ASSERT(i < MAX_IRE_RECURSION);
   1303  11042      Erik 
   1304  11042      Erik 			ires[i] = ires[i-1];
   1305  11042      Erik 			generations[i] = generations[i-1];
   1306  11042      Erik 			ires[i-1] = clone;
   1307  11042      Erik 			generations[i-1] = generation;
   1308  11042      Erik 			i++;
   1309  11042      Erik 
   1310  11042      Erik 			ire = NULL;
   1311  11042      Erik 			goto done;
   1312      0    stevel 		}
   1313      0    stevel 
   1314      0    stevel 		/*
   1315  11042      Erik 		 * We only match on the type and optionally ILL when
   1316  11042      Erik 		 * recursing. The type match is used by some callers
   1317  11042      Erik 		 * to exclude certain types (such as IRE_IF_CLONE or
   1318  11042      Erik 		 * IRE_LOCAL|IRE_LOOPBACK).
   1319      0    stevel 		 */
   1320  11042      Erik 		match_args &= MATCH_IRE_TYPE;
   1321  11042      Erik 		v6nexthop = ire->ire_gateway_addr_v6;
   1322  11042      Erik 		if (ill == NULL && ire->ire_ill != NULL) {
   1323  11042      Erik 			ill = ire->ire_ill;
   1324  11042      Erik 			need_refrele = B_TRUE;
   1325  11042      Erik 			ill_refhold(ill);
   1326  11042      Erik 			match_args |= MATCH_IRE_ILL;
   1327      0    stevel 		}
   1328  11131      Erik 		/*
   1329  11131      Erik 		 * We set the prefs[i] value above if i > 0. We've already
   1330  11131      Erik 		 * done i++ so i is one in the case of the first time around.
   1331  11131      Erik 		 */
   1332  11131      Erik 		if (i == 1)
   1333  11131      Erik 			prefs[0] = ire_pref(ire);
   1334  11042      Erik 		ire = NULL;
   1335  11042      Erik 	}
   1336  11042      Erik 	ASSERT(ire == NULL);
   1337  11042      Erik 	ire = ire_reject(ipst, B_TRUE);
   1338  11042      Erik 
   1339  11042      Erik error:
   1340  11042      Erik 	ASSERT(ire != NULL);
   1341  11042      Erik 	if (need_refrele)
   1342  11042      Erik 		ill_refrele(ill);
   1343  11042      Erik 
   1344  11042      Erik 	/*
   1345  11042      Erik 	 * In the case of MULTIRT we want to try a different IRE the next
   1346  11042      Erik 	 * time. We let the next packet retry in that case.
   1347  11042      Erik 	 */
   1348  11042      Erik 	if (i > 0 && (ires[0]->ire_flags & RTF_MULTIRT))
   1349  11042      Erik 		(void) ire_no_good(ires[0]);
   1350  11042      Erik 
   1351  11042      Erik cleanup:
   1352  11042      Erik 	/* cleanup ires[i] */
   1353  11042      Erik 	ire_dep_unbuild(ires, i);
   1354  11042      Erik 	for (j = 0; j < i; j++)
   1355  11042      Erik 		ire_refrele(ires[j]);
   1356  11042      Erik 
   1357  11042      Erik 	ASSERT(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE));
   1358  11042      Erik 	/*
   1359  11042      Erik 	 * Use IRE_GENERATION_VERIFY to ensure that ip_output will redo the
   1360  11042      Erik 	 * ip_select_route since the reject or lack of memory might be gone.
   1361  11042      Erik 	 */
   1362  11042      Erik 	if (generationp != NULL)
   1363  11042      Erik 		*generationp = IRE_GENERATION_VERIFY;
   1364  11042      Erik 	return (ire);
   1365  11042      Erik 
   1366  11042      Erik done:
   1367  11042      Erik 	ASSERT(ire == NULL);
   1368  11042      Erik 	if (need_refrele)
   1369  11042      Erik 		ill_refrele(ill);
   1370  11042      Erik 
   1371  11042      Erik 	/* Build dependencies */
   1372  11131      Erik 	if (i > 1 && !ire_dep_build(ires, generations, i)) {
   1373  11042      Erik 		/* Something in chain was condemned; tear it apart */
   1374  11042      Erik 		ire = ire_blackhole(ipst, B_TRUE);
   1375  11042      Erik 		goto cleanup;
   1376  11042      Erik 	}
   1377  11042      Erik 
   1378  11042      Erik 	/*
   1379  11042      Erik 	 * Release all refholds except the one for ires[0] that we
   1380  11042      Erik 	 * will return to the caller.
   1381  11042      Erik 	 */
   1382  11042      Erik 	for (j = 1; j < i; j++)
   1383  11042      Erik 		ire_refrele(ires[j]);
   1384  11042      Erik 
   1385  11042      Erik 	if (invalidate) {
   1386      0    stevel 		/*
   1387  11042      Erik 		 * Since we needed to allocate but couldn't we need to make
   1388  11042      Erik 		 * sure that the dependency chain is rebuilt the next time.
   1389      0    stevel 		 */
   1390  11042      Erik 		ire_dep_invalidate_generations(ires[0]);
   1391  11042      Erik 		generation = IRE_GENERATION_VERIFY;
   1392  11042      Erik 	} else {
   1393  11042      Erik 		/*
   1394  11042      Erik 		 * IREs can have been added or deleted while we did the
   1395  11042      Erik 		 * recursive lookup and we can't catch those until we've built
   1396  11042      Erik 		 * the dependencies. We verify the stored
   1397  11042      Erik 		 * ire_dep_parent_generation to catch any such changes and
   1398  11042      Erik 		 * return IRE_GENERATION_VERIFY (which will cause
   1399  11042      Erik 		 * ip_select_route to be called again so we can redo the
   1400  11042      Erik 		 * recursive lookup next time we send a packet.
   1401  11042      Erik 		 * recursive lookup next time we send a packet).
   1402  11131      Erik 		if (ires[0]->ire_dep_parent == NULL)
   1403  11131      Erik 			generation = ires[0]->ire_generation;
   1404  11131      Erik 		else
   1405  11131      Erik 			generation = ire_dep_validate_generations(ires[0]);
   1406  11042      Erik 		if (generations[0] != ires[0]->ire_generation) {
   1407  11042      Erik 			/* Something changed at the top */
   1408  11042      Erik 			generation = IRE_GENERATION_VERIFY;
   1409      0    stevel 		}
   1410      0    stevel 	}
   1411  11042      Erik 	if (generationp != NULL)
   1412  11042      Erik 		*generationp = generation;
   1413      0    stevel 
   1414  11042      Erik 	return (ires[0]);
   1415  11042      Erik }
   1416      0    stevel 
   1417  11042      Erik ire_t *
   1418  11042      Erik ire_route_recursive_v6(const in6_addr_t *nexthop, uint_t ire_type,
   1419  11042      Erik     const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
   1420  11042      Erik     boolean_t allocate, uint32_t xmit_hint, ip_stack_t *ipst,
   1421  11042      Erik     in6_addr_t *setsrcp, tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
   1422  11042      Erik {
   1423  11042      Erik 	return (ire_route_recursive_impl_v6(NULL, nexthop, ire_type, ill,
   1424  11042      Erik 	    zoneid, tsl, match_args, allocate, xmit_hint, ipst, setsrcp,
   1425  11042      Erik 	    gwattrp, generationp));
   1426      0    stevel }
   1427      0    stevel 
   1428      0    stevel /*
   1429  11042      Erik  * Recursively look for a route to the destination.
   1430  11042      Erik  * We only handle a destination match here, yet we have the same arguments
   1431  11042      Erik  * as the full match to allow function pointers to select between the two.
   1432  11042      Erik  *
   1433  11042      Erik  * Note that this function never returns NULL. It returns an IRE_NOROUTE
   1434  11042      Erik  * instead.
   1435  11042      Erik  *
   1436  11042      Erik  * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
   1437  11042      Erik  * is an error.
   1438  11042      Erik  * Allow at most one RTF_INDIRECT.
   1439      0    stevel  */
   1440      0    stevel ire_t *
   1441  11042      Erik ire_route_recursive_dstonly_v6(const in6_addr_t *nexthop, boolean_t allocate,
   1442  11042      Erik     uint32_t xmit_hint, ip_stack_t *ipst)
   1443      0    stevel {
   1444      0    stevel 	ire_t	*ire;
   1445  11042      Erik 	ire_t	*ire1;
   1446  11042      Erik 	uint_t	generation;
   1447      0    stevel 
   1448  11042      Erik 	/* ire_ftable_lookup handles round-robin/ECMP */
   1449  11042      Erik 	ire = ire_ftable_lookup_simple_v6(nexthop, xmit_hint, ipst,
   1450  11042      Erik 	    &generation);
   1451  11042      Erik 	ASSERT(ire != NULL);
   1452      0    stevel 
   1453      0    stevel 	/*
   1454  11042      Erik 	 * If this type should have an ire_nce_cache (even if it
   1455  11042      Erik 	 * doesn't yet have one) then we are done. Includes
   1456  11042      Erik 	 * IRE_INTERFACE with a full 128 bit mask.
   1457      0    stevel 	 */
   1458  11042      Erik 	if (ire->ire_nce_capable)
   1459      0    stevel 		return (ire);
   1460  11042      Erik 
   1461      0    stevel 	/*
   1462  11042      Erik 	 * If the IRE has a current cached parent we know that the whole
   1463  11042      Erik 	 * parent chain is current, hence we don't need to discover and
   1464  11042      Erik 	 * build any dependencies by doing a recursive lookup.
   1465      0    stevel 	 */
   1466      0    stevel 	mutex_enter(&ire->ire_lock);
   1467  11042      Erik 	if (ire->ire_dep_parent != NULL &&
   1468  11042      Erik 	    ire->ire_dep_parent->ire_generation ==
   1469  11042      Erik 	    ire->ire_dep_parent_generation) {
   1470  11042      Erik 		mutex_exit(&ire->ire_lock);
   1471  11042      Erik 		return (ire);
   1472  11042      Erik 	}
   1473      0    stevel 	mutex_exit(&ire->ire_lock);
   1474   8485     Peter 
   1475   8485     Peter 	/*
   1476  11042      Erik 	 * Fallback to loop in the normal code starting with the ire
   1477  11042      Erik 	 * we found. Normally this would return the same ire.
   1478   8485     Peter 	 */
   1479  11042      Erik 	ire1 = ire_route_recursive_impl_v6(ire, nexthop, 0, NULL, ALL_ZONES,
   1480  11042      Erik 	    NULL, MATCH_IRE_DSTONLY, allocate, xmit_hint, ipst, NULL, NULL,
   1481  11042      Erik 	    &generation);
   1482  11042      Erik 	ire_refrele(ire);
   1483  11042      Erik 	return (ire1);
   1484      0    stevel }
   1485