Home | History | Annotate | Download | only in ip
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
     27 
     28 #include <sys/types.h>
     29 #include <sys/stream.h>
     30 #include <sys/stropts.h>
     31 #include <sys/strsun.h>
     32 #include <sys/sysmacros.h>
     33 #include <sys/errno.h>
     34 #include <sys/dlpi.h>
     35 #include <sys/socket.h>
     36 #include <sys/ddi.h>
     37 #include <sys/sunddi.h>
     38 #include <sys/cmn_err.h>
     39 #include <sys/debug.h>
     40 #include <sys/vtrace.h>
     41 #include <sys/kmem.h>
     42 #include <sys/zone.h>
     43 #include <sys/ethernet.h>
     44 #include <sys/sdt.h>
     45 
     46 #include <net/if.h>
     47 #include <net/if_types.h>
     48 #include <net/if_dl.h>
     49 #include <net/route.h>
     50 #include <netinet/in.h>
     51 #include <netinet/ip6.h>
     52 #include <netinet/icmp6.h>
     53 
     54 #include <inet/common.h>
     55 #include <inet/mi.h>
     56 #include <inet/mib2.h>
     57 #include <inet/nd.h>
     58 #include <inet/ip.h>
     59 #include <inet/ip_impl.h>
     60 #include <inet/ipclassifier.h>
     61 #include <inet/ip_if.h>
     62 #include <inet/ip_ire.h>
     63 #include <inet/ip_rts.h>
     64 #include <inet/ip6.h>
     65 #include <inet/ip_ndp.h>
     66 #include <inet/ipsec_impl.h>
     67 #include <inet/ipsec_info.h>
     68 #include <inet/sctp_ip.h>
     69 
/*
 * Functions with the nce_ prefix are static to this file, while functions
 * with the ndp_ prefix are used by the rest of IP.
 *
 * Lock ordering:
 *
 *	ndp_g_lock -> ill_lock -> nce_lock
 *
 * The ndp_g_lock protects the NCE hash (nce_hash_tbl, NCE_HASH_PTR) and
 * nce_next.  The nce_lock protects the contents of the NCE (particularly
 * nce_refcnt).
 */
     82 
/*
 * Forward declarations for helpers that are private to this file.
 * Note that ndp_add_v4 is declared static here even though it carries
 * the ndp_ prefix.
 */
static	boolean_t nce_cmp_ll_addr(const nce_t *nce, const uchar_t *new_ll_addr,
    uint32_t ll_addr_len);
static	void	nce_ire_delete(nce_t *nce);
static	void	nce_ire_delete1(ire_t *ire, char *nce_arg);
static	void 	nce_set_ll(nce_t *nce, uchar_t *ll_addr);
static	nce_t	*nce_lookup_addr(ill_t *, const in6_addr_t *, nce_t *);
static	nce_t	*nce_lookup_mapping(ill_t *ill, const in6_addr_t *addr);
static	void	nce_make_mapping(nce_t *nce, uchar_t *addrpos,
    uchar_t *addr);
static	int	nce_set_multicast(ill_t *ill, const in6_addr_t *addr);
static	void	nce_queue_mp(nce_t *nce, mblk_t *mp);
static	mblk_t	*nce_udreq_alloc(ill_t *ill);
static	void	nce_update(nce_t *nce, uint16_t new_state,
    uchar_t *new_ll_addr);
static	uint32_t	nce_solicit(nce_t *nce, mblk_t *mp);
static	boolean_t	nce_xmit(ill_t *ill, uint32_t operation,
    ill_t *hwaddr_ill, boolean_t use_lla_addr, const in6_addr_t *sender,
    const in6_addr_t *target, int flag);
static int	ndp_add_v4(ill_t *, const in_addr_t *, uint16_t,
    nce_t **, nce_t *);

/* Only built on DEBUG kernels; called from ndp_inactive(). */
#ifdef DEBUG
static void	nce_trace_cleanup(const nce_t *);
#endif
    107 
/*
 * Yield the address of the hash bucket (a slot of nce_hash_tbl) that
 * `addr' hashes to in the given IP stack's IPv4 NCE table.
 */
#define	NCE_HASH_PTR_V4(ipst, addr)					\
	(&((ipst)->ips_ndp4->nce_hash_tbl[IRE_ADDR_HASH(addr, NCE_TABLE_SIZE)]))

/*
 * Same as above for the IPv6 NCE table; `addr' is hashed with
 * NCE_ADDR_HASH_V6.
 */
#define	NCE_HASH_PTR_V6(ipst, addr)				 \
	(&((ipst)->ips_ndp6->nce_hash_tbl[NCE_ADDR_HASH_V6(addr, \
		NCE_TABLE_SIZE)]))
    114 
    115 /*
    116  * Compute default flags to use for an advertisement of this nce's address.
    117  */
    118 static int
    119 nce_advert_flags(const nce_t *nce)
    120 {
    121 	int flag = 0;
    122 
    123 	if (nce->nce_flags & NCE_F_ISROUTER)
    124 		flag |= NDP_ISROUTER;
    125 	if (!(nce->nce_flags & NCE_F_ANYCAST))
    126 		flag |= NDP_ORIDE;
    127 
    128 	return (flag);
    129 }
    130 
/*
 * Non-tunable probe interval, based on link capabilities: 150 when the
 * ill has ill_note_link set, 1500 otherwise.  The value is handed to
 * NDP_RESTART_TIMER by the callers in this file (presumably in
 * milliseconds -- confirm against NDP_RESTART_TIMER).
 */
#define	ILL_PROBE_INTERVAL(ill)	((ill)->ill_note_link ? 150 : 1500)
    133 
    134 /*
    135  * NDP Cache Entry creation routine.
    136  * Mapped entries will never do NUD .
    137  * This routine must always be called with ndp6->ndp_g_lock held.
    138  * Prior to return, nce_refcnt is incremented.
    139  */
    140 int
    141 ndp_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr,
    142     const in6_addr_t *mask, const in6_addr_t *extract_mask,
    143     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
    144     nce_t **newnce)
    145 {
    146 	static	nce_t		nce_nil;
    147 	nce_t		*nce;
    148 	mblk_t		*mp;
    149 	mblk_t		*template;
    150 	nce_t		**ncep;
    151 	int		err;
    152 	boolean_t	dropped = B_FALSE;
    153 	ip_stack_t	*ipst = ill->ill_ipst;
    154 
    155 	ASSERT(MUTEX_HELD(&ipst->ips_ndp6->ndp_g_lock));
    156 	ASSERT(ill != NULL && ill->ill_isv6);
    157 	if (IN6_IS_ADDR_UNSPECIFIED(addr)) {
    158 		ip0dbg(("ndp_add_v6: no addr\n"));
    159 		return (EINVAL);
    160 	}
    161 	if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) {
    162 		ip0dbg(("ndp_add_v6: flags = %x\n", (int)flags));
    163 		return (EINVAL);
    164 	}
    165 	if (IN6_IS_ADDR_UNSPECIFIED(extract_mask) &&
    166 	    (flags & NCE_F_MAPPING)) {
    167 		ip0dbg(("ndp_add_v6: extract mask zero for mapping"));
    168 		return (EINVAL);
    169 	}
    170 	/*
    171 	 * Allocate the mblk to hold the nce.
    172 	 *
    173 	 * XXX This can come out of a separate cache - nce_cache.
    174 	 * We don't need the mp anymore as there are no more
    175 	 * "qwriter"s
    176 	 */
    177 	mp = allocb(sizeof (nce_t), BPRI_MED);
    178 	if (mp == NULL)
    179 		return (ENOMEM);
    180 
    181 	nce = (nce_t *)mp->b_rptr;
    182 	mp->b_wptr = (uchar_t *)&nce[1];
    183 	*nce = nce_nil;
    184 
    185 	/*
    186 	 * This one holds link layer address
    187 	 */
    188 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
    189 		template = nce_udreq_alloc(ill);
    190 	} else {
    191 		if (ill->ill_resolver_mp == NULL) {
    192 			freeb(mp);
    193 			return (EINVAL);
    194 		}
    195 		ASSERT((ill->ill_net_type == IRE_IF_NORESOLVER));
    196 		template = copyb(ill->ill_resolver_mp);
    197 	}
    198 	if (template == NULL) {
    199 		freeb(mp);
    200 		return (ENOMEM);
    201 	}
    202 	nce->nce_ill = ill;
    203 	nce->nce_ipversion = IPV6_VERSION;
    204 	nce->nce_flags = flags;
    205 	nce->nce_state = state;
    206 	nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
    207 	nce->nce_rcnt = ill->ill_xmit_count;
    208 	nce->nce_addr = *addr;
    209 	nce->nce_mask = *mask;
    210 	nce->nce_extract_mask = *extract_mask;
    211 	nce->nce_ll_extract_start = hw_extract_start;
    212 	nce->nce_fp_mp = NULL;
    213 	nce->nce_res_mp = template;
    214 	if (state == ND_REACHABLE)
    215 		nce->nce_last = TICK_TO_MSEC(lbolt64);
    216 	else
    217 		nce->nce_last = 0;
    218 	nce->nce_qd_mp = NULL;
    219 	nce->nce_mp = mp;
    220 	if (hw_addr != NULL)
    221 		nce_set_ll(nce, hw_addr);
    222 	/* This one is for nce getting created */
    223 	nce->nce_refcnt = 1;
    224 	mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
    225 	if (nce->nce_flags & NCE_F_MAPPING) {
    226 		ASSERT(IN6_IS_ADDR_MULTICAST(addr));
    227 		ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_mask));
    228 		ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_extract_mask));
    229 		ncep = &ipst->ips_ndp6->nce_mask_entries;
    230 	} else {
    231 		ncep = ((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
    232 	}
    233 
    234 	nce->nce_trace_disable = B_FALSE;
    235 
    236 	/*
    237 	 * Atomically ensure that the ill is not CONDEMNED, before
    238 	 * adding the NCE.
    239 	 */
    240 	mutex_enter(&ill->ill_lock);
    241 	if (ill->ill_state_flags & ILL_CONDEMNED) {
    242 		mutex_exit(&ill->ill_lock);
    243 		freeb(mp);
    244 		freeb(template);
    245 		return (EINVAL);
    246 	}
    247 	if ((nce->nce_next = *ncep) != NULL)
    248 		nce->nce_next->nce_ptpn = &nce->nce_next;
    249 	*ncep = nce;
    250 	nce->nce_ptpn = ncep;
    251 	*newnce = nce;
    252 	/* This one is for nce being used by an active thread */
    253 	NCE_REFHOLD(*newnce);
    254 
    255 	/* Bump up the number of nce's referencing this ill */
    256 	DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
    257 	    (char *), "nce", (void *), nce);
    258 	ill->ill_nce_cnt++;
    259 	mutex_exit(&ill->ill_lock);
    260 
    261 	err = 0;
    262 	if ((flags & NCE_F_PERMANENT) && state == ND_PROBE) {
    263 		mutex_enter(&nce->nce_lock);
    264 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
    265 		nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
    266 		mutex_exit(&nce->nce_lock);
    267 		dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL, B_FALSE,
    268 		    &ipv6_all_zeros, addr, NDP_PROBE);
    269 		if (dropped) {
    270 			mutex_enter(&nce->nce_lock);
    271 			nce->nce_pcnt++;
    272 			mutex_exit(&nce->nce_lock);
    273 		}
    274 		NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(ill));
    275 		mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
    276 		err = EINPROGRESS;
    277 	} else if (flags & NCE_F_UNSOL_ADV) {
    278 		/*
    279 		 * We account for the transmit below by assigning one
    280 		 * less than the ndd variable. Subsequent decrements
    281 		 * are done in ndp_timer.
    282 		 */
    283 		mutex_enter(&nce->nce_lock);
    284 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
    285 		nce->nce_unsolicit_count = ipst->ips_ip_ndp_unsolicit_count - 1;
    286 		mutex_exit(&nce->nce_lock);
    287 		dropped = nce_xmit(ill,
    288 		    ND_NEIGHBOR_ADVERT,
    289 		    ill,	/* ill to be used for extracting ill_nd_lla */
    290 		    B_TRUE,	/* use ill_nd_lla */
    291 		    addr,	/* Source and target of the advertisement pkt */
    292 		    &ipv6_all_hosts_mcast, /* Destination of the packet */
    293 		    nce_advert_flags(nce));
    294 		mutex_enter(&nce->nce_lock);
    295 		if (dropped)
    296 			nce->nce_unsolicit_count++;
    297 		if (nce->nce_unsolicit_count != 0) {
    298 			nce->nce_timeout_id = timeout(ndp_timer, nce,
    299 			    MSEC_TO_TICK(ipst->ips_ip_ndp_unsolicit_interval));
    300 		}
    301 		mutex_exit(&nce->nce_lock);
    302 		mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
    303 	}
    304 	/*
    305 	 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
    306 	 * we call nce_fastpath as soon as the nce is resolved in ndp_process.
    307 	 * We call nce_fastpath from nce_update if the link layer address of
    308 	 * the peer changes from nce_update
    309 	 */
    310 	if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER)
    311 		nce_fastpath(nce);
    312 	return (err);
    313 }
    314 
    315 int
    316 ndp_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr,
    317     const in6_addr_t *mask, const in6_addr_t *extract_mask,
    318     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
    319     nce_t **newnce)
    320 {
    321 	int	err = 0;
    322 	nce_t	*nce;
    323 	ip_stack_t	*ipst = ill->ill_ipst;
    324 
    325 	ASSERT(ill->ill_isv6);
    326 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
    327 
    328 	/* Get head of v6 hash table */
    329 	nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
    330 	nce = nce_lookup_addr(ill, addr, nce);
    331 	if (nce == NULL) {
    332 		err = ndp_add_v6(ill,
    333 		    hw_addr,
    334 		    addr,
    335 		    mask,
    336 		    extract_mask,
    337 		    hw_extract_start,
    338 		    flags,
    339 		    state,
    340 		    newnce);
    341 	} else {
    342 		*newnce = nce;
    343 		err = EEXIST;
    344 	}
    345 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
    346 	return (err);
    347 }
    348 
    349 /*
    350  * Remove all the CONDEMNED nces from the appropriate hash table.
    351  * We create a private list of NCEs, these may have ires pointing
    352  * to them, so the list will be passed through to clean up dependent
    353  * ires and only then we can do NCE_REFRELE which can make NCE inactive.
    354  */
    355 static void
    356 nce_remove(ndp_g_t *ndp, nce_t *nce, nce_t **free_nce_list)
    357 {
    358 	nce_t *nce1;
    359 	nce_t **ptpn;
    360 
    361 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
    362 	ASSERT(ndp->ndp_g_walker == 0);
    363 	for (; nce; nce = nce1) {
    364 		nce1 = nce->nce_next;
    365 		mutex_enter(&nce->nce_lock);
    366 		if (nce->nce_flags & NCE_F_CONDEMNED) {
    367 			ptpn = nce->nce_ptpn;
    368 			nce1 = nce->nce_next;
    369 			if (nce1 != NULL)
    370 				nce1->nce_ptpn = ptpn;
    371 			*ptpn = nce1;
    372 			nce->nce_ptpn = NULL;
    373 			nce->nce_next = NULL;
    374 			nce->nce_next = *free_nce_list;
    375 			*free_nce_list = nce;
    376 		}
    377 		mutex_exit(&nce->nce_lock);
    378 	}
    379 }
    380 
    381 /*
    382  * 1. Mark the nce CONDEMNED. This ensures that no new nce_lookup()
    383  *    will return this NCE. Also no new IREs will be created that
    384  *    point to this NCE (See ire_add_v6).  Also no new timeouts will
    385  *    be started (See NDP_RESTART_TIMER).
    386  * 2. Cancel any currently running timeouts.
    387  * 3. If there is an ndp walker, return. The walker will do the cleanup.
    388  *    This ensures that walkers see a consistent list of NCEs while walking.
    389  * 4. Otherwise remove the NCE from the list of NCEs
    390  * 5. Delete all IREs pointing to this NCE.
    391  */
    392 void
    393 ndp_delete(nce_t *nce)
    394 {
    395 	nce_t	**ptpn;
    396 	nce_t	*nce1;
    397 	int	ipversion = nce->nce_ipversion;
    398 	ndp_g_t *ndp;
    399 	ip_stack_t	*ipst = nce->nce_ill->ill_ipst;
    400 
    401 	if (ipversion == IPV4_VERSION)
    402 		ndp = ipst->ips_ndp4;
    403 	else
    404 		ndp = ipst->ips_ndp6;
    405 
    406 	/* Serialize deletes */
    407 	mutex_enter(&nce->nce_lock);
    408 	if (nce->nce_flags & NCE_F_CONDEMNED) {
    409 		/* Some other thread is doing the delete */
    410 		mutex_exit(&nce->nce_lock);
    411 		return;
    412 	}
    413 	/*
    414 	 * Caller has a refhold. Also 1 ref for being in the list. Thus
    415 	 * refcnt has to be >= 2
    416 	 */
    417 	ASSERT(nce->nce_refcnt >= 2);
    418 	nce->nce_flags |= NCE_F_CONDEMNED;
    419 	mutex_exit(&nce->nce_lock);
    420 
    421 	nce_fastpath_list_delete(nce);
    422 
    423 	/*
    424 	 * Cancel any running timer. Timeout can't be restarted
    425 	 * since CONDEMNED is set. Can't hold nce_lock across untimeout.
    426 	 * Passing invalid timeout id is fine.
    427 	 */
    428 	if (nce->nce_timeout_id != 0) {
    429 		(void) untimeout(nce->nce_timeout_id);
    430 		nce->nce_timeout_id = 0;
    431 	}
    432 
    433 	mutex_enter(&ndp->ndp_g_lock);
    434 	if (nce->nce_ptpn == NULL) {
    435 		/*
    436 		 * The last ndp walker has already removed this nce from
    437 		 * the list after we marked the nce CONDEMNED and before
    438 		 * we grabbed the global lock.
    439 		 */
    440 		mutex_exit(&ndp->ndp_g_lock);
    441 		return;
    442 	}
    443 	if (ndp->ndp_g_walker > 0) {
    444 		/*
    445 		 * Can't unlink. The walker will clean up
    446 		 */
    447 		ndp->ndp_g_walker_cleanup = B_TRUE;
    448 		mutex_exit(&ndp->ndp_g_lock);
    449 		return;
    450 	}
    451 
    452 	/*
    453 	 * Now remove the nce from the list. NDP_RESTART_TIMER won't restart
    454 	 * the timer since it is marked CONDEMNED.
    455 	 */
    456 	ptpn = nce->nce_ptpn;
    457 	nce1 = nce->nce_next;
    458 	if (nce1 != NULL)
    459 		nce1->nce_ptpn = ptpn;
    460 	*ptpn = nce1;
    461 	nce->nce_ptpn = NULL;
    462 	nce->nce_next = NULL;
    463 	mutex_exit(&ndp->ndp_g_lock);
    464 
    465 	nce_ire_delete(nce);
    466 }
    467 
    468 void
    469 ndp_inactive(nce_t *nce)
    470 {
    471 	mblk_t		**mpp;
    472 	ill_t		*ill;
    473 
    474 	ASSERT(nce->nce_refcnt == 0);
    475 	ASSERT(MUTEX_HELD(&nce->nce_lock));
    476 	ASSERT(nce->nce_fastpath == NULL);
    477 
    478 	/* Free all nce allocated messages */
    479 	mpp = &nce->nce_first_mp_to_free;
    480 	do {
    481 		while (*mpp != NULL) {
    482 			mblk_t  *mp;
    483 
    484 			mp = *mpp;
    485 			*mpp = mp->b_next;
    486 
    487 			inet_freemsg(mp);
    488 		}
    489 	} while (mpp++ != &nce->nce_last_mp_to_free);
    490 
    491 #ifdef DEBUG
    492 	nce_trace_cleanup(nce);
    493 #endif
    494 
    495 	ill = nce->nce_ill;
    496 	mutex_enter(&ill->ill_lock);
    497 	DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
    498 	    (char *), "nce", (void *), nce);
    499 	ill->ill_nce_cnt--;
    500 	/*
    501 	 * If the number of nce's associated with this ill have dropped
    502 	 * to zero, check whether we need to restart any operation that
    503 	 * is waiting for this to happen.
    504 	 */
    505 	if (ILL_DOWN_OK(ill)) {
    506 		/* ipif_ill_refrele_tail drops the ill_lock */
    507 		ipif_ill_refrele_tail(ill);
    508 	} else {
    509 		mutex_exit(&ill->ill_lock);
    510 	}
    511 	mutex_destroy(&nce->nce_lock);
    512 	if (nce->nce_mp != NULL)
    513 		inet_freemsg(nce->nce_mp);
    514 }
    515 
    516 /*
    517  * ndp_walk routine.  Delete the nce if it is associated with the ill
    518  * that is going away.  Always called as a writer.
    519  */
    520 void
    521 ndp_delete_per_ill(nce_t *nce, uchar_t *arg)
    522 {
    523 	if ((nce != NULL) && nce->nce_ill == (ill_t *)arg) {
    524 		ndp_delete(nce);
    525 	}
    526 }
    527 
    528 /*
    529  * Walk a list of to be inactive NCEs and blow away all the ires.
    530  */
    531 static void
    532 nce_ire_delete_list(nce_t *nce)
    533 {
    534 	nce_t *nce_next;
    535 
    536 	ASSERT(nce != NULL);
    537 	while (nce != NULL) {
    538 		nce_next = nce->nce_next;
    539 		nce->nce_next = NULL;
    540 
    541 		/*
    542 		 * It is possible for the last ndp walker (this thread)
    543 		 * to come here after ndp_delete has marked the nce CONDEMNED
    544 		 * and before it has removed the nce from the fastpath list
    545 		 * or called untimeout. So we need to do it here. It is safe
    546 		 * for both ndp_delete and this thread to do it twice or
    547 		 * even simultaneously since each of the threads has a
    548 		 * reference on the nce.
    549 		 */
    550 		nce_fastpath_list_delete(nce);
    551 		/*
    552 		 * Cancel any running timer. Timeout can't be restarted
    553 		 * since CONDEMNED is set. Can't hold nce_lock across untimeout.
    554 		 * Passing invalid timeout id is fine.
    555 		 */
    556 		if (nce->nce_timeout_id != 0) {
    557 			(void) untimeout(nce->nce_timeout_id);
    558 			nce->nce_timeout_id = 0;
    559 		}
    560 		/*
    561 		 * We might hit this func thus in the v4 case:
    562 		 * ipif_down->ipif_ndp_down->ndp_walk
    563 		 */
    564 
    565 		if (nce->nce_ipversion == IPV4_VERSION) {
    566 			ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE,
    567 			    IRE_CACHE, nce_ire_delete1,
    568 			    (char *)nce, nce->nce_ill);
    569 		} else {
    570 			ASSERT(nce->nce_ipversion == IPV6_VERSION);
    571 			ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE,
    572 			    IRE_CACHE, nce_ire_delete1,
    573 			    (char *)nce, nce->nce_ill);
    574 		}
    575 		NCE_REFRELE_NOTR(nce);
    576 		nce = nce_next;
    577 	}
    578 }
    579 
    580 /*
    581  * Delete an ire when the nce goes away.
    582  */
    583 /* ARGSUSED */
    584 static void
    585 nce_ire_delete(nce_t *nce)
    586 {
    587 	if (nce->nce_ipversion == IPV6_VERSION) {
    588 		ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
    589 		    nce_ire_delete1, (char *)nce, nce->nce_ill);
    590 		NCE_REFRELE_NOTR(nce);
    591 	} else {
    592 		ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
    593 		    nce_ire_delete1, (char *)nce, nce->nce_ill);
    594 		NCE_REFRELE_NOTR(nce);
    595 	}
    596 }
    597 
    598 /*
    599  * ire_walk routine used to delete every IRE that shares this nce
    600  */
    601 static void
    602 nce_ire_delete1(ire_t *ire, char *nce_arg)
    603 {
    604 	nce_t	*nce = (nce_t *)nce_arg;
    605 
    606 	ASSERT(ire->ire_type == IRE_CACHE);
    607 
    608 	if (ire->ire_nce == nce) {
    609 		ASSERT(ire->ire_ipversion == nce->nce_ipversion);
    610 		ire_delete(ire);
    611 	}
    612 }
    613 
    614 /*
    615  * Restart DAD on given NCE.  Returns B_TRUE if DAD has been restarted.
    616  */
    617 boolean_t
    618 ndp_restart_dad(nce_t *nce)
    619 {
    620 	boolean_t started;
    621 	boolean_t dropped;
    622 
    623 	if (nce == NULL)
    624 		return (B_FALSE);
    625 	mutex_enter(&nce->nce_lock);
    626 	if (nce->nce_state == ND_PROBE) {
    627 		mutex_exit(&nce->nce_lock);
    628 		started = B_TRUE;
    629 	} else if (nce->nce_state == ND_REACHABLE) {
    630 		nce->nce_state = ND_PROBE;
    631 		nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT - 1;
    632 		mutex_exit(&nce->nce_lock);
    633 		dropped = nce_xmit(nce->nce_ill, ND_NEIGHBOR_SOLICIT, NULL,
    634 		    B_FALSE, &ipv6_all_zeros, &nce->nce_addr, NDP_PROBE);
    635 		if (dropped) {
    636 			mutex_enter(&nce->nce_lock);
    637 			nce->nce_pcnt++;
    638 			mutex_exit(&nce->nce_lock);
    639 		}
    640 		NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(nce->nce_ill));
    641 		started = B_TRUE;
    642 	} else {
    643 		mutex_exit(&nce->nce_lock);
    644 		started = B_FALSE;
    645 	}
    646 	return (started);
    647 }
    648 
    649 /*
    650  * IPv6 Cache entry lookup.  Try to find an nce matching the parameters passed.
    651  * If one is found, the refcnt on the nce will be incremented.
    652  */
    653 nce_t *
    654 ndp_lookup_v6(ill_t *ill, const in6_addr_t *addr, boolean_t caller_holds_lock)
    655 {
    656 	nce_t	*nce;
    657 	ip_stack_t	*ipst;
    658 
    659 	ASSERT(ill != NULL);
    660 	ipst = ill->ill_ipst;
    661 
    662 	ASSERT(ill != NULL && ill->ill_isv6);
    663 	if (!caller_holds_lock) {
    664 		mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
    665 	}
    666 
    667 	/* Get head of v6 hash table */
    668 	nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
    669 	nce = nce_lookup_addr(ill, addr, nce);
    670 	if (nce == NULL)
    671 		nce = nce_lookup_mapping(ill, addr);
    672 	if (!caller_holds_lock)
    673 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
    674 	return (nce);
    675 }
    676 /*
    677  * IPv4 Cache entry lookup.  Try to find an nce matching the parameters passed.
    678  * If one is found, the refcnt on the nce will be incremented.
    679  * Since multicast mappings are handled in arp, there are no nce_mcast_entries
    680  * so we skip the nce_lookup_mapping call.
    681  * XXX TODO: if the nce is found to be ND_STALE, ndp_delete it and return NULL
    682  */
    683 nce_t *
    684 ndp_lookup_v4(ill_t *ill, const in_addr_t *addr, boolean_t caller_holds_lock)
    685 {
    686 	nce_t	*nce;
    687 	in6_addr_t addr6;
    688 	ip_stack_t *ipst = ill->ill_ipst;
    689 
    690 	if (!caller_holds_lock) {
    691 		mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
    692 	}
    693 
    694 	/* Get head of v4 hash table */
    695 	nce = *((nce_t **)NCE_HASH_PTR_V4(ipst, *addr));
    696 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
    697 	nce = nce_lookup_addr(ill, &addr6, nce);
    698 	if (!caller_holds_lock)
    699 		mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
    700 	return (nce);
    701 }
    702 
    703 /*
    704  * Cache entry lookup.  Try to find an nce matching the parameters passed.
    705  * Look only for exact entries (no mappings).  If an nce is found, increment
    706  * the hold count on that nce. The caller passes in the start of the
    707  * appropriate hash table, and must be holding the appropriate global
    708  * lock (ndp_g_lock).
    709  */
    710 static nce_t *
    711 nce_lookup_addr(ill_t *ill, const in6_addr_t *addr, nce_t *nce)
    712 {
    713 	ndp_g_t		*ndp;
    714 	ip_stack_t	*ipst = ill->ill_ipst;
    715 
    716 	if (ill->ill_isv6)
    717 		ndp = ipst->ips_ndp6;
    718 	else
    719 		ndp = ipst->ips_ndp4;
    720 
    721 	ASSERT(ill != NULL);
    722 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
    723 	if (IN6_IS_ADDR_UNSPECIFIED(addr))
    724 		return (NULL);
    725 	for (; nce != NULL; nce = nce->nce_next) {
    726 		if (nce->nce_ill == ill) {
    727 			if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr) &&
    728 			    IN6_ARE_ADDR_EQUAL(&nce->nce_mask,
    729 			    &ipv6_all_ones)) {
    730 				mutex_enter(&nce->nce_lock);
    731 				if (!(nce->nce_flags & NCE_F_CONDEMNED)) {
    732 					NCE_REFHOLD_LOCKED(nce);
    733 					mutex_exit(&nce->nce_lock);
    734 					break;
    735 				}
    736 				mutex_exit(&nce->nce_lock);
    737 			}
    738 		}
    739 	}
    740 	return (nce);
    741 }
    742 
    743 /*
    744  * Cache entry lookup.  Try to find an nce matching the parameters passed.
    745  * Look only for mappings.
    746  */
    747 static nce_t *
    748 nce_lookup_mapping(ill_t *ill, const in6_addr_t *addr)
    749 {
    750 	nce_t	*nce;
    751 	ip_stack_t	*ipst = ill->ill_ipst;
    752 
    753 	ASSERT(ill != NULL && ill->ill_isv6);
    754 	ASSERT(MUTEX_HELD(&ipst->ips_ndp6->ndp_g_lock));
    755 	if (!IN6_IS_ADDR_MULTICAST(addr))
    756 		return (NULL);
    757 	nce = ipst->ips_ndp6->nce_mask_entries;
    758 	for (; nce != NULL; nce = nce->nce_next)
    759 		if (nce->nce_ill == ill &&
    760 		    (V6_MASK_EQ(*addr, nce->nce_mask, nce->nce_addr))) {
    761 			mutex_enter(&nce->nce_lock);
    762 			if (!(nce->nce_flags & NCE_F_CONDEMNED)) {
    763 				NCE_REFHOLD_LOCKED(nce);
    764 				mutex_exit(&nce->nce_lock);
    765 				break;
    766 			}
    767 			mutex_exit(&nce->nce_lock);
    768 		}
    769 	return (nce);
    770 }
    771 
    772 /*
    773  * Process passed in parameters either from an incoming packet or via
    774  * user ioctl.
    775  */
    776 void
    777 ndp_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
    778 {
    779 	ill_t	*ill = nce->nce_ill;
    780 	uint32_t hw_addr_len = ill->ill_nd_lla_len;
    781 	mblk_t	*mp;
    782 	boolean_t ll_updated = B_FALSE;
    783 	boolean_t ll_changed;
    784 	ip_stack_t	*ipst = ill->ill_ipst;
    785 
    786 	ASSERT(nce->nce_ipversion == IPV6_VERSION);
    787 	/*
    788 	 * No updates of link layer address or the neighbor state is
    789 	 * allowed, when the cache is in NONUD state.  This still
    790 	 * allows for responding to reachability solicitation.
    791 	 */
    792 	mutex_enter(&nce->nce_lock);
    793 	if (nce->nce_state == ND_INCOMPLETE) {
    794 		if (hw_addr == NULL) {
    795 			mutex_exit(&nce->nce_lock);
    796 			return;
    797 		}
    798 		nce_set_ll(nce, hw_addr);
    799 		/*
    800 		 * Update nce state and send the queued packets
    801 		 * back to ip this time ire will be added.
    802 		 */
    803 		if (flag & ND_NA_FLAG_SOLICITED) {
    804 			nce_update(nce, ND_REACHABLE, NULL);
    805 		} else {
    806 			nce_update(nce, ND_STALE, NULL);
    807 		}
    808 		mutex_exit(&nce->nce_lock);
    809 		nce_fastpath(nce);
    810 		mutex_enter(&nce->nce_lock);
    811 		mp = nce->nce_qd_mp;
    812 		nce->nce_qd_mp = NULL;
    813 		mutex_exit(&nce->nce_lock);
    814 		while (mp != NULL) {
    815 			mblk_t *nxt_mp, *data_mp;
    816 
    817 			nxt_mp = mp->b_next;
    818 			mp->b_next = NULL;
    819 
    820 			if (mp->b_datap->db_type == M_CTL)
    821 				data_mp = mp->b_cont;
    822 			else
    823 				data_mp = mp;
    824 			if (data_mp->b_prev != NULL) {
    825 				ill_t   *inbound_ill;
    826 				queue_t *fwdq = NULL;
    827 				uint_t ifindex;
    828 
    829 				ifindex = (uint_t)(uintptr_t)data_mp->b_prev;
    830 				inbound_ill = ill_lookup_on_ifindex(ifindex,
    831 				    B_TRUE, NULL, NULL, NULL, NULL, ipst);
    832 				if (inbound_ill == NULL) {
    833 					data_mp->b_prev = NULL;
    834 					freemsg(mp);
    835 					return;
    836 				} else {
    837 					fwdq = inbound_ill->ill_rq;
    838 				}
    839 				data_mp->b_prev = NULL;
    840 				/*
    841 				 * Send a forwarded packet back into ip_rput_v6
    842 				 * just as in ire_send_v6().
    843 				 * Extract the queue from b_prev (set in
    844 				 * ip_rput_data_v6).
    845 				 */
    846 				if (fwdq != NULL) {
    847 					/*
    848 					 * Forwarded packets hop count will
    849 					 * get decremented in ip_rput_data_v6
    850 					 */
    851 					if (data_mp != mp)
    852 						freeb(mp);
    853 					put(fwdq, data_mp);
    854 				} else {
    855 					/*
    856 					 * Send locally originated packets back
    857 					 * into * ip_wput_v6.
    858 					 */
    859 					put(ill->ill_wq, mp);
    860 				}
    861 				ill_refrele(inbound_ill);
    862 			} else {
    863 				put(ill->ill_wq, mp);
    864 			}
    865 			mp = nxt_mp;
    866 		}
    867 		return;
    868 	}
    869 	ll_changed = nce_cmp_ll_addr(nce, hw_addr, hw_addr_len);
    870 	if (!is_adv) {
    871 		/* If this is a SOLICITATION request only */
    872 		if (ll_changed)
    873 			nce_update(nce, ND_STALE, hw_addr);
    874 		mutex_exit(&nce->nce_lock);
    875 		return;
    876 	}
    877 	if (!(flag & ND_NA_FLAG_OVERRIDE) && ll_changed) {
    878 		/* If in any other state than REACHABLE, ignore */
    879 		if (nce->nce_state == ND_REACHABLE) {
    880 			nce_update(nce, ND_STALE, NULL);
    881 		}
    882 		mutex_exit(&nce->nce_lock);
    883 		return;
    884 	} else {
    885 		if (ll_changed) {
    886 			nce_update(nce, ND_UNCHANGED, hw_addr);
    887 			ll_updated = B_TRUE;
    888 		}
    889 		if (flag & ND_NA_FLAG_SOLICITED) {
    890 			nce_update(nce, ND_REACHABLE, NULL);
    891 		} else {
    892 			if (ll_updated) {
    893 				nce_update(nce, ND_STALE, NULL);
    894 			}
    895 		}
    896 		mutex_exit(&nce->nce_lock);
    897 		if (!(flag & ND_NA_FLAG_ROUTER) && (nce->nce_flags &
    898 		    NCE_F_ISROUTER)) {
    899 			ire_t *ire;
    900 
    901 			/*
    902 			 * Router turned to host.  We need to remove the
    903 			 * entry as well as any default route that may be
    904 			 * using this as a next hop.  This is required by
    905 			 * section 7.2.5 of RFC 2461.
    906 			 */
    907 			ire = ire_ftable_lookup_v6(&ipv6_all_zeros,
    908 			    &ipv6_all_zeros, &nce->nce_addr, IRE_DEFAULT,
    909 			    nce->nce_ill->ill_ipif, NULL, ALL_ZONES, 0, NULL,
    910 			    MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW |
    911 			    MATCH_IRE_DEFAULT, ipst);
    912 			if (ire != NULL) {
    913 				ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst);
    914 				ire_delete(ire);
    915 				ire_refrele(ire);
    916 			}
    917 			ndp_delete(nce);
    918 		}
    919 	}
    920 }
    921 
    922 /*
    923  * Pass arg1 to the pfi supplied, along with each nce in existence.
    924  * ndp_walk() places a REFHOLD on the nce and drops the lock when
    925  * walking the hash list.
    926  */
    927 void
    928 ndp_walk_common(ndp_g_t *ndp, ill_t *ill, pfi_t pfi, void *arg1,
    929     boolean_t trace)
    930 {
    931 
    932 	nce_t	*nce;
    933 	nce_t	*nce1;
    934 	nce_t	**ncep;
    935 	nce_t	*free_nce_list = NULL;
    936 
    937 	mutex_enter(&ndp->ndp_g_lock);
    938 	/* Prevent ndp_delete from unlink and free of NCE */
    939 	ndp->ndp_g_walker++;
    940 	mutex_exit(&ndp->ndp_g_lock);
    941 	for (ncep = ndp->nce_hash_tbl;
    942 	    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
    943 		for (nce = *ncep; nce != NULL; nce = nce1) {
    944 			nce1 = nce->nce_next;
    945 			if (ill == NULL || nce->nce_ill == ill) {
    946 				if (trace) {
    947 					NCE_REFHOLD(nce);
    948 					(*pfi)(nce, arg1);
    949 					NCE_REFRELE(nce);
    950 				} else {
    951 					NCE_REFHOLD_NOTR(nce);
    952 					(*pfi)(nce, arg1);
    953 					NCE_REFRELE_NOTR(nce);
    954 				}
    955 			}
    956 		}
    957 	}
    958 	for (nce = ndp->nce_mask_entries; nce != NULL; nce = nce1) {
    959 		nce1 = nce->nce_next;
    960 		if (ill == NULL || nce->nce_ill == ill) {
    961 			if (trace) {
    962 				NCE_REFHOLD(nce);
    963 				(*pfi)(nce, arg1);
    964 				NCE_REFRELE(nce);
    965 			} else {
    966 				NCE_REFHOLD_NOTR(nce);
    967 				(*pfi)(nce, arg1);
    968 				NCE_REFRELE_NOTR(nce);
    969 			}
    970 		}
    971 	}
    972 	mutex_enter(&ndp->ndp_g_lock);
    973 	ndp->ndp_g_walker--;
    974 	/*
    975 	 * While NCE's are removed from global list they are placed
    976 	 * in a private list, to be passed to nce_ire_delete_list().
    977 	 * The reason is, there may be ires pointing to this nce
    978 	 * which needs to cleaned up.
    979 	 */
    980 	if (ndp->ndp_g_walker_cleanup && ndp->ndp_g_walker == 0) {
    981 		/* Time to delete condemned entries */
    982 		for (ncep = ndp->nce_hash_tbl;
    983 		    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
    984 			nce = *ncep;
    985 			if (nce != NULL) {
    986 				nce_remove(ndp, nce, &free_nce_list);
    987 			}
    988 		}
    989 		nce = ndp->nce_mask_entries;
    990 		if (nce != NULL) {
    991 			nce_remove(ndp, nce, &free_nce_list);
    992 		}
    993 		ndp->ndp_g_walker_cleanup = B_FALSE;
    994 	}
    995 
    996 	mutex_exit(&ndp->ndp_g_lock);
    997 
    998 	if (free_nce_list != NULL) {
    999 		nce_ire_delete_list(free_nce_list);
   1000 	}
   1001 }
   1002 
   1003 /*
   1004  * Walk everything.
   1005  * Note that ill can be NULL hence can't derive the ipst from it.
   1006  */
   1007 void
   1008 ndp_walk(ill_t *ill, pfi_t pfi, void *arg1, ip_stack_t *ipst)
   1009 {
   1010 	ndp_walk_common(ipst->ips_ndp4, ill, pfi, arg1, B_TRUE);
   1011 	ndp_walk_common(ipst->ips_ndp6, ill, pfi, arg1, B_TRUE);
   1012 }
   1013 
   1014 /*
   1015  * Process resolve requests.  Handles both mapped entries
   1016  * as well as cases that needs to be send out on the wire.
   1017  * Lookup a NCE for a given IRE.  Regardless of whether one exists
   1018  * or one is created, we defer making ire point to nce until the
   1019  * ire is actually added at which point the nce_refcnt on the nce is
   1020  * incremented.  This is done primarily to have symmetry between ire_add()
   1021  * and ire_delete() which decrements the nce_refcnt, when an ire is deleted.
   1022  */
   1023 int
   1024 ndp_resolver(ill_t *ill, const in6_addr_t *dst, mblk_t *mp, zoneid_t zoneid)
   1025 {
   1026 	nce_t		*nce;
   1027 	int		err = 0;
   1028 	uint32_t	ms;
   1029 	mblk_t		*mp_nce = NULL;
   1030 	ip_stack_t	*ipst = ill->ill_ipst;
   1031 
   1032 	ASSERT(ill->ill_isv6);
   1033 	if (IN6_IS_ADDR_MULTICAST(