Home | History | Annotate | Download | only in ip
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 /*
     26  * Copyright (c) 1990 Mentat Inc.
     27  */
     28 
     29 #pragma ident	"%Z%%M%	%I%	%E% SMI"
     30 
     31 /*
     32  * This file contains routines that manipulate Internet Routing Entries (IREs).
     33  */
     34 #include <sys/types.h>
     35 #include <sys/stream.h>
     36 #include <sys/stropts.h>
     37 #include <sys/ddi.h>
     38 #include <sys/cmn_err.h>
     39 
     40 #include <sys/systm.h>
     41 #include <sys/param.h>
     42 #include <sys/socket.h>
     43 #include <net/if.h>
     44 #include <net/route.h>
     45 #include <netinet/in.h>
     46 #include <net/if_dl.h>
     47 #include <netinet/ip6.h>
     48 #include <netinet/icmp6.h>
     49 
     50 #include <inet/common.h>
     51 #include <inet/mi.h>
     52 #include <inet/ip.h>
     53 #include <inet/ip6.h>
     54 #include <inet/ip_ndp.h>
     55 #include <inet/ip_if.h>
     56 #include <inet/ip_ire.h>
     57 #include <inet/ipclassifier.h>
     58 #include <inet/nd.h>
     59 #include <sys/kmem.h>
     60 #include <sys/zone.h>
     61 
     62 #include <sys/tsol/label.h>
     63 #include <sys/tsol/tnet.h>
     64 
         /*
          * Template ire that is never explicitly initialized in this file.
          * ire_create_mp_v6() and ire_create_v6() assign it to a freshly
          * allocated ire_t so that initialization starts from a clean state.
          */
      65 static	ire_t	ire_null;
      66 
         /* Prototypes for routines that are private (static) to this file. */
      67 static ire_t	*ire_ihandle_lookup_onlink_v6(ire_t *cire);
      68 static boolean_t ire_match_args_v6(ire_t *ire, const in6_addr_t *addr,
      69     const in6_addr_t *mask, const in6_addr_t *gateway, int type,
      70     const ipif_t *ipif, zoneid_t zoneid, uint32_t ihandle,
      71     const ts_label_t *tsl, int match_flags);
      72 static	ire_t	*ire_init_v6(ire_t *, const in6_addr_t *, const in6_addr_t *,
      73     const in6_addr_t *, const in6_addr_t *, uint_t *, queue_t *, queue_t *,
      74     ushort_t, ipif_t *, const in6_addr_t *, uint32_t, uint32_t, uint_t,
      75     const iulp_t *, tsol_gc_t *, tsol_gcgrp_t *, ip_stack_t *);
     76 
     77 
     78 /*
     79  * Initialize the ire that is specific to IPv6 part and call
     80  * ire_init_common to finish it.
     81  */
     82 static ire_t *
     83 ire_init_v6(ire_t *ire, const in6_addr_t *v6addr, const in6_addr_t *v6mask,
     84     const in6_addr_t *v6src_addr, const in6_addr_t *v6gateway,
     85     uint_t *max_fragp, queue_t *rfq, queue_t *stq, ushort_t type,
     86     ipif_t *ipif, const in6_addr_t *v6cmask, uint32_t phandle,
     87     uint32_t ihandle, uint_t flags, const iulp_t *ulp_info, tsol_gc_t *gc,
     88     tsol_gcgrp_t *gcgrp, ip_stack_t *ipst)
     89 {
     90 
     91 	/*
     92 	 * Reject IRE security attribute creation/initialization
     93 	 * if system is not running in Trusted mode.
     94 	 */
     95 	if ((gc != NULL || gcgrp != NULL) && !is_system_labeled())
     96 		return (NULL);
     97 
     98 
     99 	BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_alloced);
    100 	ire->ire_addr_v6 = *v6addr;
    101 
    102 	if (v6src_addr != NULL)
    103 		ire->ire_src_addr_v6 = *v6src_addr;
    104 	if (v6mask != NULL) {
    105 		ire->ire_mask_v6 = *v6mask;
    106 		ire->ire_masklen = ip_mask_to_plen_v6(&ire->ire_mask_v6);
    107 	}
    108 	if (v6gateway != NULL)
    109 		ire->ire_gateway_addr_v6 = *v6gateway;
    110 
    111 	if (type == IRE_CACHE && v6cmask != NULL)
    112 		ire->ire_cmask_v6 = *v6cmask;
    113 
    114 	/*
    115 	 * Multirouted packets need to have a fragment header added so that
    116 	 * the receiver is able to discard duplicates according to their
    117 	 * fragment identifier.
    118 	 */
    119 	if (type == IRE_CACHE && (flags & RTF_MULTIRT)) {
    120 		ire->ire_frag_flag = IPH_FRAG_HDR;
    121 	}
    122 
    123 	/* ire_init_common will free the mblks upon encountering any failure */
    124 	if (!ire_init_common(ire, max_fragp, NULL, rfq, stq, type, ipif,
    125 	    phandle, ihandle, flags, IPV6_VERSION, ulp_info, gc, gcgrp, ipst))
    126 		return (NULL);
    127 
    128 	return (ire);
    129 }
    130 
    131 /*
    132  * Similar to ire_create_v6 except that it is called only when
    133  * we want to allocate ire as an mblk e.g. we have a external
    134  * resolver. Do we need this in IPv6 ?
    135  *
    136  * IPv6 initializes the ire_nce in ire_add_v6, which expects to
    137  * find the ire_nce to be null when it is called. So, although
    138  * we have a src_nce parameter (in the interest of matching up with
    139  * the argument list of the v4 version), we ignore the src_nce
    140  * argument here.
    141  */
    142 /* ARGSUSED */
    143 ire_t *
    144 ire_create_mp_v6(const in6_addr_t *v6addr, const in6_addr_t *v6mask,
    145     const in6_addr_t *v6src_addr, const in6_addr_t *v6gateway,
    146     nce_t *src_nce, queue_t *rfq, queue_t *stq, ushort_t type,
    147     ipif_t *ipif, const in6_addr_t *v6cmask,
    148     uint32_t phandle, uint32_t ihandle, uint_t flags, const iulp_t *ulp_info,
    149     tsol_gc_t *gc, tsol_gcgrp_t *gcgrp, ip_stack_t *ipst)
    150 {
    151 	ire_t	*ire;
    152 	ire_t	*ret_ire;
    153 	mblk_t	*mp;
    154 
    155 	ASSERT(!IN6_IS_ADDR_V4MAPPED(v6addr));
    156 
    157 	/* Allocate the new IRE. */
    158 	mp = allocb(sizeof (ire_t), BPRI_MED);
    159 	if (mp == NULL) {
    160 		ip1dbg(("ire_create_mp_v6: alloc failed\n"));
    161 		return (NULL);
    162 	}
    163 
    164 	ire = (ire_t *)mp->b_rptr;
    165 	mp->b_wptr = (uchar_t *)&ire[1];
    166 
    167 	/* Start clean. */
    168 	*ire = ire_null;
    169 	ire->ire_mp = mp;
    170 	mp->b_datap->db_type = IRE_DB_TYPE;
    171 
    172 	ret_ire = ire_init_v6(ire, v6addr, v6mask, v6src_addr, v6gateway,
    173 	    NULL, rfq, stq, type, ipif, v6cmask, phandle,
    174 	    ihandle, flags, ulp_info, gc, gcgrp, ipst);
    175 
    176 	if (ret_ire == NULL) {
    177 		freeb(ire->ire_mp);
    178 		return (NULL);
    179 	}
    180 	return (ire);
    181 }
    182 
    183 /*
    184  * ire_create_v6 is called to allocate and initialize a new IRE.
    185  *
    186  * NOTE : This is called as writer sometimes though not required
    187  * by this function.
    188  *
    189  * See comments above ire_create_mp_v6() for the rationale behind the
    190  * unused src_nce argument.
    191  */
    192 /* ARGSUSED */
    193 ire_t *
    194 ire_create_v6(const in6_addr_t *v6addr, const in6_addr_t *v6mask,
    195     const in6_addr_t *v6src_addr, const in6_addr_t *v6gateway,
    196     uint_t *max_fragp, nce_t *src_nce, queue_t *rfq, queue_t *stq,
    197     ushort_t type, ipif_t *ipif, const in6_addr_t *v6cmask,
    198     uint32_t phandle, uint32_t ihandle, uint_t flags, const iulp_t *ulp_info,
    199     tsol_gc_t *gc, tsol_gcgrp_t *gcgrp, ip_stack_t *ipst)
    200 {
    201 	ire_t	*ire;
    202 	ire_t	*ret_ire;
    203 
    204 	ASSERT(!IN6_IS_ADDR_V4MAPPED(v6addr));
    205 
    206 	ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP);
    207 	if (ire == NULL) {
    208 		ip1dbg(("ire_create_v6: alloc failed\n"));
    209 		return (NULL);
    210 	}
    211 	*ire = ire_null;
    212 
    213 	ret_ire = ire_init_v6(ire, v6addr, v6mask, v6src_addr, v6gateway,
    214 	    max_fragp, rfq, stq, type, ipif, v6cmask, phandle,
    215 	    ihandle, flags, ulp_info, gc, gcgrp, ipst);
    216 
    217 	if (ret_ire == NULL) {
    218 		kmem_cache_free(ire_cache, ire);
    219 		return (NULL);
    220 	}
    221 	ASSERT(ret_ire == ire);
    222 	return (ire);
    223 }
    224 
    225 /*
    226  * Find an IRE_INTERFACE for the multicast group.
    227  * Allows different routes for multicast addresses
    228  * in the unicast routing table (akin to FF::0/8 but could be more specific)
    229  * which point at different interfaces. This is used when IPV6_MULTICAST_IF
    230  * isn't specified (when sending) and when IPV6_JOIN_GROUP doesn't
    231  * specify the interface to join on.
    232  *
    233  * Supports link-local addresses by following the ipif/ill when recursing.
    234  */
    235 ire_t *
    236 ire_lookup_multi_v6(const in6_addr_t *group, zoneid_t zoneid, ip_stack_t *ipst)
    237 {
    238 	ire_t	*ire;
    239 	ipif_t	*ipif = NULL;
    240 	int	match_flags = MATCH_IRE_TYPE;
    241 	in6_addr_t gw_addr_v6;
    242 
    243 	ire = ire_ftable_lookup_v6(group, 0, 0, 0, NULL, NULL,
    244 	    zoneid, 0, NULL, MATCH_IRE_DEFAULT, ipst);
    245 
    246 	/* We search a resolvable ire in case of multirouting. */
    247 	if ((ire != NULL) && (ire->ire_flags & RTF_MULTIRT)) {
    248 		ire_t *cire = NULL;
    249 		/*
    250 		 * If the route is not resolvable, the looked up ire
    251 		 * may be changed here. In that case, ire_multirt_lookup()
    252 		 * IRE_REFRELE the original ire and change it.
    253 		 */
    254 		(void) ire_multirt_lookup_v6(&cire, &ire, MULTIRT_CACHEGW,
    255 		    NULL, ipst);
    256 		if (cire != NULL)
    257 			ire_refrele(cire);
    258 	}
    259 	if (ire == NULL)
    260 		return (NULL);
    261 	/*
    262 	 * Make sure we follow ire_ipif.
    263 	 *
    264 	 * We need to determine the interface route through
    265 	 * which the gateway will be reached. We don't really
    266 	 * care which interface is picked if the interface is
    267 	 * part of a group.
    268 	 */
    269 	if (ire->ire_ipif != NULL) {
    270 		ipif = ire->ire_ipif;
    271 		match_flags |= MATCH_IRE_ILL_GROUP;
    272 	}
    273 
    274 	switch (ire->ire_type) {
    275 	case IRE_DEFAULT:
    276 	case IRE_PREFIX:
    277 	case IRE_HOST:
    278 		mutex_enter(&ire->ire_lock);
    279 		gw_addr_v6 = ire->ire_gateway_addr_v6;
    280 		mutex_exit(&ire->ire_lock);
    281 		ire_refrele(ire);
    282 		ire = ire_ftable_lookup_v6(&gw_addr_v6, 0, 0,
    283 		    IRE_INTERFACE, ipif, NULL, zoneid, 0,
    284 		    NULL, match_flags, ipst);
    285 		return (ire);
    286 	case IRE_IF_NORESOLVER:
    287 	case IRE_IF_RESOLVER:
    288 		return (ire);
    289 	default:
    290 		ire_refrele(ire);
    291 		return (NULL);
    292 	}
    293 }
    294 
    295 /*
    296  * Return any local address.  We use this to target ourselves
    297  * when the src address was specified as 'default'.
    298  * Preference for IRE_LOCAL entries.
    299  */
    300 ire_t *
    301 ire_lookup_local_v6(zoneid_t zoneid, ip_stack_t *ipst)
    302 {
    303 	ire_t	*ire;
    304 	irb_t	*irb;
    305 	ire_t	*maybe = NULL;
    306 	int i;
    307 
    308 	for (i = 0; i < ipst->ips_ip6_cache_table_size;  i++) {
    309 		irb = &ipst->ips_ip_cache_table_v6[i];
    310 		if (irb->irb_ire == NULL)
    311 			continue;
    312 		rw_enter(&irb->irb_lock, RW_READER);
    313 		for (ire = irb->irb_ire; ire; ire = ire->ire_next) {
    314 			if ((ire->ire_marks & IRE_MARK_CONDEMNED) ||
    315 			    ire->ire_zoneid != zoneid &&
    316 			    ire->ire_zoneid != ALL_ZONES)
    317 				continue;
    318 			switch (ire->ire_type) {
    319 			case IRE_LOOPBACK:
    320 				if (maybe == NULL) {
    321 					IRE_REFHOLD(ire);
    322 					maybe = ire;
    323 				}
    324 				break;
    325 			case IRE_LOCAL:
    326 				if (maybe != NULL) {
    327 					ire_refrele(maybe);
    328 				}
    329 				IRE_REFHOLD(ire);
    330 				rw_exit(&irb->irb_lock);
    331 				return (ire);
    332 			}
    333 		}
    334 		rw_exit(&irb->irb_lock);
    335 	}
    336 	return (maybe);
    337 }
    338 
    339 /*
    340  * This function takes a mask and returns number of bits set in the
    341  * mask (the represented prefix length).  Assumes a contiguous mask.
    342  */
    343 int
    344 ip_mask_to_plen_v6(const in6_addr_t *v6mask)
    345 {
    346 	int		bits;
    347 	int		plen = IPV6_ABITS;
    348 	int		i;
    349 
    350 	for (i = 3; i >= 0; i--) {
    351 		if (v6mask->s6_addr32[i] == 0) {
    352 			plen -= 32;
    353 			continue;
    354 		}
    355 		bits = ffs(ntohl(v6mask->s6_addr32[i])) - 1;
    356 		if (bits == 0)
    357 			break;
    358 		plen -= bits;
    359 	}
    360 
    361 	return (plen);
    362 }
    363 
    364 /*
    365  * Convert a prefix length to the mask for that prefix.
    366  * Returns the argument bitmask.
    367  */
    368 in6_addr_t *
    369 ip_plen_to_mask_v6(uint_t plen, in6_addr_t *bitmask)
    370 {
    371 	uint32_t *ptr;
    372 
    373 	if (plen < 0 || plen > IPV6_ABITS)
    374 		return (NULL);
    375 	*bitmask = ipv6_all_zeros;
    376 
    377 	ptr = (uint32_t *)bitmask;
    378 	while (plen > 32) {
    379 		*ptr++ = 0xffffffffU;
    380 		plen -= 32;
    381 	}
    382 	*ptr = htonl(0xffffffffU << (32 - plen));
    383 	return (bitmask);
    384 }
    385 
    386 /*
    387  * Add a fully initialized IRE to an appropriate
    388  * table based on ire_type.
    389  *
    390  * The forward table contains IRE_PREFIX/IRE_HOST/IRE_HOST and
    391  * IRE_IF_RESOLVER/IRE_IF_NORESOLVER and IRE_DEFAULT.
    392  *
    393  * The cache table contains IRE_BROADCAST/IRE_LOCAL/IRE_LOOPBACK
    394  * and IRE_CACHE.
    395  *
    396  * NOTE : This function is called as writer though not required
    397  * by this function.
    398  */
    399 int
    400 ire_add_v6(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func)
    401 {
    402 	ire_t	*ire1;
    403 	int	mask_table_index;
    404 	irb_t	*irb_ptr;
    405 	ire_t	**irep;
    406 	int	flags;
    407 	ire_t	*pire = NULL;
    408 	ill_t	*stq_ill;
    409 	boolean_t	ndp_g_lock_held = B_FALSE;
    410 	ire_t	*ire = *ire_p;
    411 	int	error;
    412 	ip_stack_t	*ipst = ire->ire_ipst;
    413 
    414 	ASSERT(ire->ire_ipversion == IPV6_VERSION);
    415 	ASSERT(ire->ire_mp == NULL); /* Calls should go through ire_add */
    416 	ASSERT(ire->ire_nce == NULL);
    417 
    418 	/* Find the appropriate list head. */
    419 	switch (ire->ire_type) {
    420 	case IRE_HOST:
    421 		ire->ire_mask_v6 = ipv6_all_ones;
    422 		ire->ire_masklen = IPV6_ABITS;
    423 		if ((ire->ire_flags & RTF_SETSRC) == 0)
    424 			ire->ire_src_addr_v6 = ipv6_all_zeros;
    425 		break;
    426 	case IRE_CACHE:
    427 	case IRE_LOCAL:
    428 	case IRE_LOOPBACK:
    429 		ire->ire_mask_v6 = ipv6_all_ones;
    430 		ire->ire_masklen = IPV6_ABITS;
    431 		break;
    432 	case IRE_PREFIX:
    433 		if ((ire->ire_flags & RTF_SETSRC) == 0)
    434 			ire->ire_src_addr_v6 = ipv6_all_zeros;
    435 		break;
    436 	case IRE_DEFAULT:
    437 		if ((ire->ire_flags & RTF_SETSRC) == 0)
    438 			ire->ire_src_addr_v6 = ipv6_all_zeros;
    439 		break;
    440 	case IRE_IF_RESOLVER:
    441 	case IRE_IF_NORESOLVER:
    442 		break;
    443 	default:
    444 		printf("ire_add_v6: ire %p has unrecognized IRE type (%d)\n",
    445 		    (void *)ire, ire->ire_type);
    446 		ire_delete(ire);
    447 		*ire_p = NULL;
    448 		return (EINVAL);
    449 	}
    450 
    451 	/* Make sure the address is properly masked. */
    452 	V6_MASK_COPY(ire->ire_addr_v6, ire->ire_mask_v6, ire->ire_addr_v6);
    453 
    454 	if ((ire->ire_type & IRE_CACHETABLE) == 0) {
    455 		/* IRE goes into Forward Table */
    456 		mask_table_index = ip_mask_to_plen_v6(&ire->ire_mask_v6);
    457 		if ((ipst->ips_ip_forwarding_table_v6[mask_table_index]) ==
    458 		    NULL) {
    459 			irb_t *ptr;
    460 			int i;
    461 
    462 			ptr = (irb_t *)mi_zalloc((
    463 			    ipst->ips_ip6_ftable_hash_size * sizeof (irb_t)));
    464 			if (ptr == NULL) {
    465 				ire_delete(ire);
    466 				*ire_p = NULL;
    467 				return (ENOMEM);
    468 			}
    469 			for (i = 0; i < ipst->ips_ip6_ftable_hash_size; i++) {
    470 				rw_init(&ptr[i].irb_lock, NULL,
    471 				    RW_DEFAULT, NULL);
    472 			}
    473 			mutex_enter(&ipst->ips_ire_ft_init_lock);
    474 			if (ipst->ips_ip_forwarding_table_v6[
    475 			    mask_table_index] == NULL) {
    476 				ipst->ips_ip_forwarding_table_v6[
    477 				    mask_table_index] = ptr;
    478 				mutex_exit(&ipst->ips_ire_ft_init_lock);
    479 			} else {
    480 				/*
    481 				 * Some other thread won the race in
    482 				 * initializing the forwarding table at the
    483 				 * same index.
    484 				 */
    485 				mutex_exit(&ipst->ips_ire_ft_init_lock);
    486 				for (i = 0; i < ipst->ips_ip6_ftable_hash_size;
    487 				    i++) {
    488 					rw_destroy(&ptr[i].irb_lock);
    489 				}
    490 				mi_free(ptr);
    491 			}
    492 		}
    493 		irb_ptr = &(ipst->ips_ip_forwarding_table_v6[mask_table_index][
    494 		    IRE_ADDR_MASK_HASH_V6(ire->ire_addr_v6, ire->ire_mask_v6,
    495 		    ipst->ips_ip6_ftable_hash_size)]);
    496 	} else {
    497 		irb_ptr = &(ipst->ips_ip_cache_table_v6[IRE_ADDR_HASH_V6(
    498 		    ire->ire_addr_v6, ipst->ips_ip6_cache_table_size)]);
    499 	}
    500 	/*
    501 	 * For xresolv interfaces (v6 interfaces with an external
    502 	 * address resolver), ip_newroute_v6/ip_newroute_ipif_v6
    503 	 * are unable to prevent the deletion of the interface route
    504 	 * while adding an IRE_CACHE for an on-link destination
    505 	 * in the IRE_IF_RESOLVER case, since the ire has to go to
    506 	 * the external resolver and return. We can't do a REFHOLD on the
    507 	 * associated interface ire for fear of the message being freed
    508 	 * if the external resolver can't resolve the address.
    509 	 * Here we look up the interface ire in the forwarding table
    510 	 * and make sure that the interface route has not been deleted.
    511 	 */
    512 	if (ire->ire_type == IRE_CACHE &&
    513 	    IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6) &&
    514 	    (((ill_t *)ire->ire_stq->q_ptr)->ill_net_type == IRE_IF_RESOLVER) &&
    515 	    (((ill_t *)ire->ire_stq->q_ptr)->ill_flags & ILLF_XRESOLV)) {
    516 
    517 		pire = ire_ihandle_lookup_onlink_v6(ire);
    518 		if (pire == NULL) {
    519 			ire_delete(ire);
    520 			*ire_p = NULL;
    521 			return (EINVAL);
    522 		}
    523 		/* Prevent pire from getting deleted */
    524 		IRB_REFHOLD(pire->ire_bucket);
    525 		/* Has it been removed already? */
    526 		if (pire->ire_marks & IRE_MARK_CONDEMNED) {
    527 			IRB_REFRELE(pire->ire_bucket);
    528 			ire_refrele(pire);
    529 			ire_delete(ire);
    530 			*ire_p = NULL;
    531 			return (EINVAL);
    532 		}
    533 	}
    534 
    535 	flags = (MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_GW);
    536 	/*
    537 	 * For IRE_CACHES, MATCH_IRE_IPIF is not enough to check
    538 	 * for duplicates because :
    539 	 *
    540 	 * 1) ire_ipif->ipif_ill and ire_stq->q_ptr could be
    541 	 *    pointing at different ills. A real duplicate is
    542 	 *    a match on both ire_ipif and ire_stq.
    543 	 *
    544 	 * 2) We could have multiple packets trying to create
    545 	 *    an IRE_CACHE for the same ill.
    546 	 *
    547 	 * Moreover, IPIF_NOFAILOVER and IPV6_BOUND_PIF endpoints wants
    548 	 * to go out on a particular ill. Rather than looking at the
    549 	 * packet, we depend on the above for MATCH_IRE_ILL here.
    550 	 *
    551 	 * Unlike IPv4, MATCH_IRE_IPIF is needed here as we could have
    552 	 * multiple IRE_CACHES for an ill for the same destination
    553 	 * with various scoped addresses i.e represented by ipifs.
    554 	 *
    555 	 * MATCH_IRE_ILL is done implicitly below for IRE_CACHES.
    556 	 */
    557 	if (ire->ire_ipif != NULL)
    558 		flags |= MATCH_IRE_IPIF;
    559 	/*
    560 	 * If we are creating hidden ires, make sure we search on
    561 	 * this ill (MATCH_IRE_ILL) and a hidden ire, while we are
    562 	 * searching for duplicates below. Otherwise we could
    563 	 * potentially find an IRE on some other interface
    564 	 * and it may not be a IRE marked with IRE_MARK_HIDDEN. We
    565 	 * shouldn't do this as this will lead to an infinite loop as
    566 	 * eventually we need an hidden ire for this packet to go
    567 	 * out. MATCH_IRE_ILL is already marked above.
    568 	 */
    569 	if (ire->ire_marks & IRE_MARK_HIDDEN) {
    570 		ASSERT(ire->ire_type == IRE_CACHE);
    571 		flags |= MATCH_IRE_MARK_HIDDEN;
    572 	}
    573 
    574 	/*
    575 	 * Start the atomic add of the ire. Grab the ill locks,
    576 	 * ill_g_usesrc_lock and the bucket lock. Check for condemned.
    577 	 * To avoid lock order problems, get the ndp6.ndp_g_lock now itself.
    578 	 */
    579 	if (ire->ire_type == IRE_CACHE) {
    580 		mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
    581 		ndp_g_lock_held = B_TRUE;
    582 	}
    583 
    584 	/*
    585 	 * If ipif or ill is changing ire_atomic_start() may queue the
    586 	 * request and return EINPROGRESS.
    587 	 */
    588 
    589 	error = ire_atomic_start(irb_ptr, ire, q, mp, func);
    590 	if (error != 0) {
    591 		if (ndp_g_lock_held)
    592 			mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
    593 		/*
    594 		 * We don't know whether it is a valid ipif or not.
    595 		 * So, set it to NULL. This assumes that the ire has not added
    596 		 * a reference to the ipif.
    597 		 */
    598 		ire->ire_ipif = NULL;
    599 		ire_delete(ire);
    600 		if (pire != NULL) {
    601 			IRB_REFRELE(pire->ire_bucket);
    602 			ire_refrele(pire);
    603 		}
    604 		*ire_p = NULL;
    605 		return (error);
    606 	}
    607 	/*
    608 	 * To avoid creating ires having stale values for the ire_max_frag
    609 	 * we get the latest value atomically here. For more details
    610 	 * see the block comment in ip_sioctl_mtu and in DL_NOTE_SDU_CHANGE
    611 	 * in ip_rput_dlpi_writer
    612 	 */
    613 	if (ire->ire_max_fragp == NULL) {
    614 		if (IN6_IS_ADDR_MULTICAST(&ire->ire_addr_v6))
    615 			ire->ire_max_frag = ire->ire_ipif->ipif_mtu;
    616 		else
    617 			ire->ire_max_frag = pire->ire_max_frag;
    618 	} else {
    619 		uint_t  max_frag;
    620 
    621 		max_frag = *ire->ire_max_fragp;
    622 		ire->ire_max_fragp = NULL;
    623 		ire->ire_max_frag = max_frag;
    624 	}
    625 
    626 	/*
    627 	 * Atomically check for duplicate and insert in the table.
    628 	 */
    629 	for (ire1 = irb_ptr->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) {
    630 		if (ire1->ire_marks & IRE_MARK_CONDEMNED)
    631 			continue;
    632 
    633 		if (ire->ire_type == IRE_CACHE) {
    634 			/*
    635 			 * We do MATCH_IRE_ILL implicitly here for IRE_CACHES.
    636 			 * As ire_ipif and ire_stq could point to two
    637 			 * different ills, we can't pass just ire_ipif to
    638 			 * ire_match_args and get a match on both ills.
    639 			 * This is just needed for duplicate checks here and
    640 			 * so we don't add an extra argument to
    641 			 * ire_match_args for this. Do it locally.
    642 			 *
    643 			 * NOTE : Currently there is no part of the code
    644 			 * that asks for both MATH_IRE_IPIF and MATCH_IRE_ILL
    645 			 * match for IRE_CACHEs. Thus we don't want to
    646 			 * extend the arguments to ire_match_args_v6.
    647 			 */
    648 			if (ire1->ire_stq != ire->ire_stq)
    649 				continue;
    650 			/*
    651 			 * Multiroute IRE_CACHEs for a given destination can
    652 			 * have the same ire_ipif, typically if their source
    653 			 * address is forced using RTF_SETSRC, and the same
    654 			 * send-to queue. We differentiate them using the parent
    655 			 * handle.
    656 			 */
    657 			if ((ire1->ire_flags & RTF_MULTIRT) &&
    658 			    (ire->ire_flags & RTF_MULTIRT) &&
    659 			    (ire1->ire_phandle != ire->ire_phandle))
    660 				continue;
    661 		}
    662 		if (ire1->ire_zoneid != ire->ire_zoneid)
    663 			continue;
    664 		if (ire_match_args_v6(ire1, &ire->ire_addr_v6,
    665 		    &ire->ire_mask_v6, &ire->ire_gateway_addr_v6,
    666 		    ire->ire_type, ire->ire_ipif, ire->ire_zoneid, 0, NULL,
    667 		    flags)) {
    668 			/*
    669 			 * Return the old ire after doing a REFHOLD.
    670 			 * As most of the callers continue to use the IRE
    671 			 * after adding, we return a held ire. This will
    672 			 * avoid a lookup in the caller again. If the callers
    673 			 * don't want to use it, they need to do a REFRELE.
    674 			 */
    675 			ip1dbg(("found dup ire existing %p new %p",
    676 			    (void *)ire1, (void *)ire));
    677 			IRE_REFHOLD(ire1);
    678 			if (ndp_g_lock_held)
    679 				mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
    680 			ire_atomic_end(irb_ptr, ire);
    681 			ire_delete(ire);
    682 			if (pire != NULL) {
    683 				/*
    684 				 * Assert that it is
    685 				 * not yet removed from the list.
    686 				 */
    687 				ASSERT(pire->ire_ptpn != NULL);
    688 				IRB_REFRELE(pire->ire_bucket);
    689 				ire_refrele(pire);
    690 			}
    691 			*ire_p = ire1;
    692 			return (0);
    693 		}
    694 	}
    695 	if (ire->ire_type == IRE_CACHE) {
    696 		in6_addr_t gw_addr_v6;
    697 		ill_t	*ill = ire_to_ill(ire);
    698 		char	buf[INET6_ADDRSTRLEN];
    699 		nce_t	*nce;
    700 
    701 		/*
    702 		 * All IRE_CACHE types must have a nce.  If this is
    703 		 * not the case the entry will not be added. We need
    704 		 * to make sure that if somebody deletes the nce
    705 		 * after we looked up, they will find this ire and
    706 		 * delete the ire. To delete this ire one needs the
    707 		 * bucket lock which we are still holding here. So,
    708 		 * even if the nce gets deleted after we looked up,
    709 		 * this ire  will get deleted.
    710 		 *
    711 		 * NOTE : Don't need the ire_lock for accessing
    712 		 * ire_gateway_addr_v6 as it is appearing first
    713 		 * time on the list and rts_setgwr_v6 could not
    714 		 * be changing this.
    715 		 */
    716 		gw_addr_v6 = ire->ire_gateway_addr_v6;
    717 		if (IN6_IS_ADDR_UNSPECIFIED(&gw_addr_v6)) {
    718 			nce = ndp_lookup_v6(ill, &ire->ire_addr_v6, B_TRUE);
    719 		} else {
    720 			nce = ndp_lookup_v6(ill, &gw_addr_v6, B_TRUE);
    721 		}
    722 		if (nce == NULL)
    723 			goto failed;
    724 
    725 		/* Pair of refhold, refrele just to get the tracing right */
    726 		NCE_REFHOLD_TO_REFHOLD_NOTR(nce);
    727 		/*
    728 		 * Atomically make sure that new IREs don't point
    729 		 * to an NCE that is logically deleted (CONDEMNED).
    730 		 * ndp_delete() first marks the NCE CONDEMNED.
    731 		 * This ensures that the nce_refcnt won't increase
    732 		 * due to new nce_lookups or due to addition of new IREs
    733 		 * pointing to this NCE. Then ndp_delete() cleans up
    734 		 * existing references. If we don't do it atomically here,
    735 		 * ndp_delete() -> nce_ire_delete() will not be able to
    736 		 * clean up the IRE list completely, and the nce_refcnt
    737 		 * won't go down to zero.
    738 		 */
    739 		mutex_enter(&nce->nce_lock);
    740 		if (ill->ill_flags & ILLF_XRESOLV) {
    741 			/*
    742 			 * If we used an external resolver, we may not
    743 			 * have gone through neighbor discovery to get here.
    744 			 * Must update the nce_state before the next check.
    745 			 */
    746 			if (nce->nce_state == ND_INCOMPLETE)
    747 				nce->nce_state = ND_REACHABLE;
    748 		}
    749 		if (nce->nce_state == ND_INCOMPLETE ||
    750 		    (nce->nce_flags & NCE_F_CONDEMNED) ||
    751 		    (nce->nce_state == ND_UNREACHABLE)) {
    752 failed:
    753 			if (ndp_g_lock_held)
    754 				mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
    755 			if (nce != NULL)
    756 				mutex_exit(&nce->nce_lock);
    757 			ire_atomic_end(irb_ptr, ire);
    758 			ip1dbg(("ire_add_v6: No nce for dst %s \n",
    759 			    inet_ntop(AF_INET6, &ire->ire_addr_v6,
    760 			    buf, sizeof (buf))));
    761 			ire_delete(ire);
    762 			if (pire != NULL) {
    763 				/*
    764 				 * Assert that it is
    765 				 * not yet removed from the list.
    766 				 */
    767 				ASSERT(pire->ire_ptpn != NULL);
    768 				IRB_REFRELE(pire->ire_bucket);
    769 				ire_refrele(pire);
    770 			}
    771 			if (nce != NULL)
    772 				NCE_REFRELE_NOTR(nce);
    773 			*ire_p = NULL;
    774 			return (EINVAL);
    775 		} else {
    776 			ire->ire_nce = nce;
    777 		}
    778 		mutex_exit(&nce->nce_lock);
    779 	}
    780 	/*
    781 	 * Find the first entry that matches ire_addr - provides
    782 	 * tail insertion. *irep will be null if no match.
    783 	 */
    784 	irep = (ire_t **)irb_ptr;
    785 	while ((ire1 = *irep) != NULL &&
    786 	    !IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, &ire1->ire_addr_v6))
    787 		irep = &ire1->ire_next;
    788 	ASSERT(!(ire->ire_type & IRE_BROADCAST));
    789 
    790 	if (*irep != NULL) {
    791 		/*
    792 		 * Find the last ire which matches ire_addr_v6.
    793 		 * Needed to do tail insertion among entries with the same
    794 		 * ire_addr_v6.
    795 		 */
    796 		while (IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6,
    797 		    &ire1->ire_addr_v6)) {
    798 			irep = &ire1->ire_next;
    799 			ire1 = *irep;
    800 			if (ire1 == NULL)
    801 				break;
    802 		}
    803 	}
    804 
    805 	if (ire->ire_type == IRE_DEFAULT) {
    806 		/*
    807 		 * We keep a count of default gateways which is used when
    808 		 * assigning them as routes.
    809 		 */
    810 		ipst->ips_ipv6_ire_default_count++;
    811 		ASSERT(ipst->ips_ipv6_ire_default_count != 0); /* Wraparound */
    812 	}
    813 	/* Insert at *irep */
    814 	ire1 = *irep;
    815 	if (ire1 != NULL)
    816 		ire1->ire_ptpn = &ire->ire_next;
    817 	ire->ire_next = ire1;
    818 	/* Link the new one in. */
    819 	ire->ire_ptpn = irep;
    820 	/*
    821 	 * ire_walk routines de-reference ire_next without holding
    822 	 * a lock. Before we point to the new ire, we want to make
    823 	 * sure the store that sets the ire_next of the new ire
    824 	 * reaches global visibility, so that ire_walk routines
    825 	 * don't see a truncated list of ires i.e if the ire_next
    826 	 * of the new ire gets set after we do "*irep = ire" due
    827 	 * to re-ordering, the ire_walk thread will see a NULL
    828 	 * once it accesses the ire_next of the new ire.
    829 	 * membar_producer() makes sure that the following store
    830 	 * happens *after* all of the above stores.
    831 	 */
    832 	membar_producer();
    833 	*irep = ire;
    834 	ire->ire_bucket = irb_ptr;
    835 	/*
    836 	 * We return a bumped up IRE above. Keep it symmetrical
    837 	 * so that the callers will always have to release. This
    838 	 * helps the callers of this function because they continue
    839 	 * to use the IRE after adding and hence they don't have to
    840 	 * lookup again after we return the IRE.
    841 	 *
    842 	 * NOTE : We don't have to use atomics as this is appearing
    843 	 * in the list for the first time and no one else can bump
    844 	 * up the reference count on this yet.
    845 	 */
    846 	IRE_REFHOLD_LOCKED(ire);
    847 	BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_inserted);
    848 	irb_ptr->irb_ire_cnt++;
    849 	if (ire->ire_marks & IRE_MARK_TEMPORARY)
    850 		irb_ptr->irb_tmp_ire_cnt++;
    851 
    852 	if (ire->ire_ipif != NULL) {
    853 		DTRACE_PROBE3(ipif__incr__cnt, (ipif_t *), ire->ire_ipif,
    854 		    (char *), "ire", (void *), ire);
    855 		ire->ire_ipif->ipif_ire_cnt++;
    856 		if (ire->ire_stq != NULL) {
    857 			stq_ill = (ill_t *)ire->ire_stq->q_ptr;
    858 			DTRACE_PROBE3(ill__incr__cnt, (ill_t *), stq_ill,
    859 			    (char *), "ire", (void *), ire);
    860 			stq_ill->ill_ire_cnt++;
    861 		}
    862 	} else {
    863 		ASSERT(ire->ire_stq == NULL);
    864 	}
    865 
    866 	if (ndp_g_lock_held)
    867 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
    868 	ire_atomic_end(irb_ptr, ire);
    869 
    870 	if (pire != NULL) {
    871 		/* Assert that it is not removed from the list yet */
    872 		ASSERT(pire->ire_ptpn != NULL);
    873 		IRB_REFRELE(pire->ire_bucket);
    874 		ire_refrele(pire);
    875 	}
    876 
    877 	if (ire->ire_type != IRE_CACHE) {
    878 		/*
    879 		 * For IREs with a host mask, see if there is an entry
    880 		 * in the cache. If there is one, flush the whole cache as
    881 		 * there might be multiple entries due to RTF_MULTIRT (CGTP).
    882 		 * If no entry is found, then there is no need to flush the
    883 		 * cache.
    884 		 */
    885 
    886 		if (ip_mask_to_plen_v6(&ire->ire_mask_v6) == IPV6_ABITS) {
    887 			ire_t *lire;
    888 			lire = ire_ctable_lookup_v6(&ire->ire_addr_v6, NULL,
    889 			    IRE_CACHE, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE,
    890 			    ipst);
    891 			if (lire != NULL) {
    892 				ire_refrele(lire);
    893 				ire_flush_cache_v6(ire, IRE_FLUSH_ADD);
    894 			}
    895 		} else {
    896 			ire_flush_cache_v6(ire, IRE_FLUSH_ADD);
    897 		}
    898 	}
    899 
    900 	*ire_p = ire;
    901 	return (0);
    902 }
    903 
    904 /*
    905  * Search for all HOST REDIRECT routes that are
    906  * pointing at the specified gateway and
    907  * delete them. This routine is called only
    908  * when a default gateway is going away.
    909  */
    910 static void
    911 ire_delete_host_redirects_v6(const in6_addr_t *gateway, ip_stack_t *ipst)
    912 {
    913 	irb_t *irb_ptr;
    914 	irb_t *irb;
    915 	ire_t *ire;
    916 	in6_addr_t gw_addr_v6;
    917 	int i;
    918 
    919 	/* get the hash table for HOST routes */
    920 	irb_ptr = ipst->ips_ip_forwarding_table_v6[(IP6_MASK_TABLE_SIZE - 1)];
    921 	if (irb_ptr == NULL)
    922 		return;
    923 	for (i = 0; (i < ipst->ips_ip6_ftable_hash_size); i++) {
    924 		irb = &irb_ptr[i];
    925 		IRB_REFHOLD(irb);
    926 		for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) {
    927 			if (!(ire->ire_flags & RTF_DYNAMIC))
    928 				continue;
    929 			mutex_enter(&ire->ire_lock);
    930 			gw_addr_v6 = ire->ire_gateway_addr_v6;
    931 			mutex_exit(&ire->ire_lock);
    932 			if (IN6_ARE_ADDR_EQUAL(&gw_addr_v6, gateway))
    933 				ire_delete(ire);
    934 		}
    935 		IRB_REFRELE(irb);
    936 	}
    937 }
    938 
    939 /*
    940  * Delete all the cache entries with this 'addr'. This is the IPv6 counterpart
    941  * of ip_ire_clookup_and_delete. The difference being this function does not
    942  * return any value. IPv6 processing of a gratuitous ARP, as it stands, is
    943  * different than IPv4 in that, regardless of the presence of a cache entry
    944  * for this address, an ire_walk_v6 is done. Another difference is that unlike
    945  * in the case of IPv4 this does not take an ipif_t argument, since it is only
    946  * called by ip_arp_news and the match is always only on the address.
    947  */
    948 void
    949 ip_ire_clookup_and_delete_v6(const in6_addr_t *addr, ip_stack_t *ipst)
    950 {
    951 	irb_t		*irb;
    952 	ire_t		*cire;
    953 	boolean_t	found = B_FALSE;
    954 
    955 	irb = &ipst->ips_ip_cache_table_v6[IRE_ADDR_HASH_V6(*addr,
    956 	    ipst->ips_ip6_cache_table_size)];
    957 	IRB_REFHOLD(irb);
    958 	for (cire = irb->irb_ire; cire != NULL; cire = cire->ire_next) {
    959 		if (cire->ire_marks & IRE_MARK_CONDEMNED)
    960 			continue;
    961 		if (IN6_ARE_ADDR_EQUAL(&cire->ire_addr_v6, addr)) {
    962 
    963 			/* This signifies start of a match */
    964 			if (!found)
    965 				found = B_TRUE;
    966 			if (cire->ire_type == IRE_CACHE) {
    967 				if (cire->ire_nce != NULL)
    968 					ndp_delete(cire->ire_nce);
    969 				ire_delete_v6(cire);
    970 			}
    971 		/* End of the match */
    972 		} else if (found)
    973 			break;
    974 	}
    975 	IRB_REFRELE(irb);
    976 }
    977 
    978 /*
    979  * Delete the specified IRE.
    980  * All calls should use ire_delete().
    981  * Sometimes called as writer though not required by this function.
    982  *
    983  * NOTE : This function is called only if the ire was added
    984  * in the list.
    985  */
    986 void
    987 ire_delete_v6(ire_t *ire)
    988 {
    989 	in6_addr_t gw_addr_v6;
    990 	ip_stack_t	*ipst = ire->ire_ipst;
    991 
    992 	ASSERT(ire->ire_refcnt >= 1);
    993 	ASSERT(ire->ire_ipversion == IPV6_VERSION);
    994 
    995 	if (ire->ire_type != IRE_CACHE)
    996 		ire_flush_cache_v6(ire, IRE_FLUSH_DELETE);
    997 	if (ire->ire_type == IRE_DEFAULT) {
    998 		/*
    999 		 * when a default gateway is going away
   1000 		 * delete all the host redirects pointing at that
   1001 		 * gateway.
   1002 		 */
   1003 		mutex_enter(&ire->ire_lock);
   1004 		gw_addr_v6 = ire->ire_gateway_addr_v6;
   1005 		mutex_exit(&ire->ire_lock);
   1006 		ire_delete_host_redirects_v6(&gw_addr_v6, ipst);
   1007 	}
   1008 }
   1009 
   1010 /*
   1011  * ire_walk routine to delete all IRE_CACHE and IRE_HOST type redirect
   1012  * entries.
   1013  */
   1014 /*ARGSUSED1*/
   1015 void
   1016 ire_delete_cache_v6(ire_t *ire, char *arg)
   1017 {
   1018 	char    addrstr1[INET6_ADDRSTRLEN];
   1019 	char    addrstr2[INET6_ADDRSTRLEN];
   1020 
   1021 	if ((ire->ire_type & IRE_CACHE) ||
   1022 	    (ire->ire_flags & RTF_DYNAMIC)) {
   1023 		ip1dbg(("ire_delete_cache_v6: deleted %s type %d through %s\n",
   1024 		    inet_ntop(AF_INET6, &ire->ire_addr_v6,
   1025 		    addrstr1, sizeof (addrstr1)),
   1026 		    ire->ire_type,
   1027 		    inet_ntop(AF_INET6, &ire->ire_gateway_addr_v6,
   1028 		    addrstr2, sizeof (addrstr2))));
   1029 		ire_delete(ire);
   1030 	}
   1031 
   1032 }
   1033 
   1034 /*
   1035  * ire_walk routine to delete all IRE_CACHE/IRE_HOST type redirect entries
   1036  * that have a given gateway address.
   1037  */
   1038 void
   1039 ire_delete_cache_gw_v6(ire_t *ire, char *addr)
   1040 {
   1041 	in6_addr_t	*gw_addr = (in6_addr_t *)addr;
   1042 	char		buf1[INET6_ADDRSTRLEN];
   1043 	char		buf2[INET6_ADDRSTRLEN];
   1044 	in6_addr_t	ire_gw_addr_v6;
   1045 
   1046 	if (!(ire->ire_type & IRE_CACHE) &&
   1047 	    !(ire->ire_flags & RTF_DYNAMIC))
   1048 		return;
   1049 
   1050 	mutex_enter(&ire->ire_lock);
   1051 	ire_gw_addr_v6 = ire->ire_gateway_addr_v6;
   1052 	mutex_exit(&ire->ire_lock);
   1053 
   1054 	if (IN6_ARE_ADDR_EQUAL(&ire_gw_addr_v6, gw_addr)) {
   1055 		ip1dbg(("ire_delete_cache_gw_v6: deleted %s type %d to %s\n",
   1056 		    inet_ntop(AF_INET6, &ire->ire_src_addr_v6,
   1057 		    buf1, sizeof (buf1)),
   1058 		    ire->ire_type