Home | History | Annotate | Download | only in ip
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*
     27  * Functions to implement IP address -> link layer address (PSARC 2006/482)
     28  */
     29 #include <inet/ip2mac.h>
     30 #include <inet/ip2mac_impl.h>
     31 #include <sys/zone.h>
     32 #include <sys/dlpi.h>
     33 #include <inet/ip_ndp.h>
     34 #include <inet/ip_if.h>
     35 #include <inet/ip6.h>
     36 
     37 /*
     38  * dispatch pending callbacks.
     39  */
     40 void
     41 nce_cb_dispatch(nce_t *nce)
     42 {
     43 	nce_cb_t *nce_cb = list_head(&nce->nce_cb);
     44 	ip2mac_t ip2m;
     45 
     46 	mutex_enter(&nce->nce_lock);
     47 	if (list_is_empty(&nce->nce_cb)) {
     48 		mutex_exit(&nce->nce_lock);
     49 		return;
     50 	}
     51 	nce_ip2mac_response(&ip2m, nce);
     52 	nce_cb_refhold_locked(nce);
     53 	/*
     54 	 * IP does not hold internal locks like nce_lock across calls to
     55 	 * other subsystems for fear of recursive lock entry and lock
     56 	 * hierarchy violation. The caller may be holding locks across
     57 	 * the call to IP. (It would be ideal if no subsystem holds locks
     58 	 * across calls into another subsystem, especially if calls can
     59 	 * happen in either direction).
     60 	 */
     61 	nce_cb = list_head(&nce->nce_cb);
     62 	for (; nce_cb != NULL; nce_cb = list_next(&nce->nce_cb, nce_cb)) {
     63 		if (nce_cb->nce_cb_flags & NCE_CB_DISPATCHED)
     64 			continue;
     65 		nce_cb->nce_cb_flags |= NCE_CB_DISPATCHED;
     66 		mutex_exit(&nce->nce_lock);
     67 		(*nce_cb->nce_cb_func)(&ip2m, nce_cb->nce_cb_arg);
     68 		mutex_enter(&nce->nce_lock);
     69 	}
     70 	nce_cb_refrele(nce);
     71 	mutex_exit(&nce->nce_lock);
     72 }
     73 
     74 /*
     75  * fill up the ip2m response fields with inforamation from the nce.
     76  */
     77 void
     78 nce_ip2mac_response(ip2mac_t *ip2m, nce_t *nce)
     79 {
     80 	boolean_t isv6 = (nce->nce_ipversion == IPV6_VERSION);
     81 	sin6_t	*sin6;
     82 	struct sockaddr_dl *sdl;
     83 	uchar_t *nce_lladdr;
     84 
     85 	ASSERT(MUTEX_HELD(&nce->nce_lock));
     86 	bzero(ip2m, sizeof (*ip2m));
     87 	if (NCE_ISREACHABLE(nce) && (nce->nce_flags & NCE_F_CONDEMNED) == 0)
     88 		ip2m->ip2mac_err = 0;
     89 	else
     90 		ip2m->ip2mac_err = ESRCH;
     91 	if (isv6) {
     92 		sin6 = (sin6_t *)&ip2m->ip2mac_pa;
     93 		sin6->sin6_family = AF_INET6;
     94 		sin6->sin6_addr = nce->nce_addr;
     95 	}
     96 	if (ip2m->ip2mac_err == 0) {
     97 		sdl = &ip2m->ip2mac_ha;
     98 		sdl->sdl_family = AF_LINK;
     99 		sdl->sdl_type = nce->nce_ill->ill_type;
    100 		sdl->sdl_nlen = 0;
    101 		sdl->sdl_alen = nce->nce_ill->ill_phys_addr_length;
    102 		nce_lladdr = nce->nce_res_mp->b_rptr +
    103 		    NCE_LL_ADDR_OFFSET(nce->nce_ill);
    104 		bcopy(nce_lladdr, LLADDR(sdl), sdl->sdl_alen);
    105 	}
    106 }
    107 
    108 void
    109 nce_cb_refhold_locked(nce_t *nce)
    110 {
    111 	ASSERT(MUTEX_HELD(&nce->nce_lock));
    112 	nce->nce_cb_walker_cnt++;
    113 }
    114 
    115 void
    116 nce_cb_refrele(nce_t *nce)
    117 {
    118 	nce_cb_t *nce_cb, *nce_cb_next = NULL;
    119 
    120 	ASSERT(MUTEX_HELD(&nce->nce_lock));
    121 	if (--nce->nce_cb_walker_cnt == 0) {
    122 		for (nce_cb = list_head(&nce->nce_cb); nce_cb != NULL;
    123 		    nce_cb = nce_cb_next) {
    124 
    125 			nce_cb_next = list_next(&nce->nce_cb, nce_cb);
    126 			if ((nce_cb->nce_cb_flags & NCE_CB_DISPATCHED) == 0)
    127 				continue;
    128 			list_remove(&nce->nce_cb, nce_cb);
    129 			kmem_free(nce_cb, sizeof (*nce_cb));
    130 		}
    131 	}
    132 }
    133 
    134 /*
    135  * add a callback to the nce, so that the callback can be invoked
    136  * after address resolution succeeds/fails.
    137  */
    138 static ip2mac_id_t
    139 nce_add_cb(nce_t *nce, ip2mac_callback_t *cb, void *cbarg)
    140 {
    141 	nce_cb_t	*nce_cb;
    142 	ip2mac_id_t	ip2mid = NULL;
    143 
    144 	ASSERT(MUTEX_HELD(&nce->nce_lock));
    145 	if ((nce_cb = kmem_zalloc(sizeof (*nce_cb), KM_NOSLEEP)) == NULL)
    146 		return (ip2mid);
    147 	nce_cb->nce_cb_func = cb;
    148 	nce_cb->nce_cb_arg = cbarg;
    149 	/*
    150 	 * We identify the nce_cb_t during cancellation by the address
    151 	 * of the nce_cb_t itself, and, as a short-cut for eliminating
    152 	 * clear mismatches, only look in the callback list of nce's
    153 	 * whose address is equal to the nce_cb_id.
    154 	 */
    155 	nce_cb->nce_cb_id = nce; /* no refs! just an address */
    156 	list_insert_tail(&nce->nce_cb, nce_cb);
    157 	ip2mid = nce;  /* this is the id to be used in ip2mac_cancel */
    158 
    159 	return (nce_cb);
    160 }
    161 
    162 /*
    163  * Resolve an IP address to a link-layer address using the data-structures
    164  * defined in PSARC 2006/482. If the current link-layer address for the
    165  * IP address is not known, the state-machine for resolving the resolution
    166  * will be triggered, and the callback function (*cb) will be invoked after
    167  * the resolution completes.
    168  */
    169 ip2mac_id_t
    170 ip2mac(uint_t flags, ip2mac_t *ip2m, ip2mac_callback_t *cb, void *cbarg,
    171     zoneid_t zoneid)
    172 {
    173 	nce_t		*nce;
    174 	boolean_t	isv6;
    175 	ill_t		*ill;
    176 	netstack_t	*ns;
    177 	ip_stack_t	*ipst;
    178 	ip2mac_id_t	ip2mid = NULL;
    179 	sin6_t		*sin6;
    180 	int		err;
    181 	uint64_t	delta;
    182 
    183 	isv6 = (ip2m->ip2mac_pa.ss_family == AF_INET6);
    184 
    185 	if (!isv6) {
    186 		/*
    187 		 * IPv4 is not currently supported.
    188 		 */
    189 		ip2m->ip2mac_err = ENOTSUP;
    190 		return (NULL);
    191 	}
    192 
    193 	ns = netstack_find_by_zoneid(zoneid);
    194 	if (ns == NULL) {
    195 		ip2m->ip2mac_err = EINVAL;
    196 		return (NULL);
    197 	}
    198 	/*
    199 	 * For exclusive stacks we reset the zoneid to zero
    200 	 * since IP uses the global zoneid in the exclusive stacks.
    201 	 */
    202 	if (ns->netstack_stackid != GLOBAL_NETSTACKID)
    203 		zoneid = GLOBAL_ZONEID;
    204 	ipst = ns->netstack_ip;
    205 	/*
    206 	 * find the ill from the ip2m->ip2mac_ifindex
    207 	 */
    208 	ill = ill_lookup_on_ifindex(ip2m->ip2mac_ifindex, isv6, NULL,
    209 	    NULL, NULL, NULL, ipst);
    210 	if (ill == NULL) {
    211 		ip2m->ip2mac_err = ENXIO;
    212 		netstack_rele(ns);
    213 		return (NULL);
    214 	}
    215 	if (isv6) {
    216 		sin6 = (sin6_t *)&ip2m->ip2mac_pa;
    217 		if (flags == IP2MAC_LOOKUP) {
    218 			nce = ndp_lookup_v6(ill, B_FALSE, &sin6->sin6_addr,
    219 			    B_FALSE);
    220 		} else {
    221 			err = ndp_lookup_then_add_v6(ill, B_FALSE, NULL,
    222 			    &sin6->sin6_addr, &ipv6_all_ones, &ipv6_all_zeros,
    223 			    0, 0, ND_INCOMPLETE, &nce);
    224 		}
    225 	} else  {
    226 		ip2m->ip2mac_err = ENOTSUP; /* yet. */
    227 		goto done;
    228 	}
    229 	if (flags == IP2MAC_LOOKUP) {
    230 		if (nce == NULL) {
    231 			ip2m->ip2mac_err = ESRCH;
    232 			goto done;
    233 		}
    234 		mutex_enter(&nce->nce_lock);
    235 		if (NCE_ISREACHABLE(nce)) {
    236 			nce_ip2mac_response(ip2m, nce);
    237 			ip2m->ip2mac_err = 0;
    238 		} else {
    239 			ip2m->ip2mac_err = ESRCH;
    240 		}
    241 		mutex_exit(&nce->nce_lock);
    242 		NCE_REFRELE(nce);
    243 		goto done;
    244 	} else {
    245 		if (err != 0 && err != EEXIST) {
    246 			ip2m->ip2mac_err = err;
    247 			goto done;
    248 		}
    249 	}
    250 	delta = TICK_TO_MSEC(lbolt64) - nce->nce_last;
    251 	mutex_enter(&nce->nce_lock);
    252 	if (nce->nce_flags & NCE_F_CONDEMNED) {
    253 		ip2m->ip2mac_err = ESRCH;
    254 	} else if (!NCE_ISREACHABLE(nce) ||
    255 	    delta > (uint64_t)ill->ill_reachable_time) {
    256 		if (NCE_ISREACHABLE(nce)) {
    257 			/*
    258 			 * Since we do not control the packet output
    259 			 * path for ip2mac() callers, we need to verify
    260 			 * if the existing information in the nce is
    261 			 * very old, and retrigger resolution if necessary.
    262 			 * We will not return the existing stale
    263 			 * information until it is verified through a
    264 			 * resolver request/response exchange.
    265 			 *
    266 			 * In the future, we may want to support extensions
    267 			 * that do additional callbacks on link-layer updates,
    268 			 * so that we can return the stale information but
    269 			 * also update the caller if the lladdr changes.
    270 			 */
    271 			nce->nce_rcnt = ill->ill_xmit_count;
    272 			nce->nce_state = ND_PROBE;
    273 			err = 0; /* treat this nce as a new one */
    274 		}
    275 		if (nce->nce_rcnt > 0) {
    276 			/*
    277 			 * Still resolving this nce, so we can
    278 			 * queue the callback information in nce->nce_cb
    279 			 */
    280 			ip2mid = nce_add_cb(nce, cb, cbarg);
    281 			ip2m->ip2mac_err = EINPROGRESS;
    282 		} else {
    283 			/*
    284 			 * Resolution failed.
    285 			 */
    286 			ip2m->ip2mac_err = ESRCH;
    287 		}
    288 	} else {
    289 		nce_ip2mac_response(ip2m, nce);
    290 		ip2m->ip2mac_err = 0;
    291 	}
    292 	if (ip2m->ip2mac_err == EINPROGRESS && err != EEXIST)
    293 		ip_ndp_resolve(nce);
    294 	mutex_exit(&nce->nce_lock);
    295 	NCE_REFRELE(nce);
    296 done:
    297 	netstack_rele(ns);
    298 	ill_refrele(ill);
    299 	return (ip2mid);
    300 }
    301 
/*
 * Data passed to ndp_walk() by ip2mac_cancel() for canceling an
 * outstanding callback.
 */
typedef struct ip2mac_cancel_data_s {
	ip2mac_id_t ip2m_cancel_id;	/* id of the callback to cancel */
	int	ip2m_cancel_err;	/* result of the cancel; 0 or EBUSY */
} ip2mac_cancel_data_t;
    309 
    310 /*
    311  * callback invoked for each active nce. If the ip2mac_id_t corresponds
    312  * to an active nce_cb_t in the nce's callback list, we want to remove
    313  * the callback (if there are no walkers) or return EBUSY to the caller
    314  */
    315 static int
    316 ip2mac_cancel_callback(nce_t *nce, void *arg)
    317 {
    318 	ip2mac_cancel_data_t *ip2m_wdata = arg;
    319 	nce_cb_t *ip2m_nce_cb = ip2m_wdata->ip2m_cancel_id;
    320 	nce_cb_t *nce_cb;
    321 
    322 	if (ip2m_nce_cb->nce_cb_id != nce)
    323 		return (0);
    324 
    325 	mutex_enter(&nce->nce_lock);
    326 	if (list_is_empty(&nce->nce_cb)) {
    327 		mutex_exit(&nce->nce_lock);
    328 		return (0);
    329 	}
    330 	/*
    331 	 * IP does not hold internal locks like nce_lock across calls to
    332 	 * other subsystems for fear of recursive lock entry and lock
    333 	 * hierarchy violation. The caller may be holding locks across
    334 	 * the call to IP. (It would be ideal if no subsystem holds locks
    335 	 * across calls into another subsystem, especially if calls can
    336 	 * happen in either direction).
    337 	 */
    338 	nce_cb = list_head(&nce->nce_cb);
    339 	for (; nce_cb != NULL; nce_cb = list_next(&nce->nce_cb, nce_cb)) {
    340 		if (nce_cb != ip2m_nce_cb)
    341 			continue;
    342 		/*
    343 		 * If there are no walkers we can remove the nce_cb.
    344 		 * Otherwise the exiting walker will clean up.
    345 		 */
    346 		if (nce->nce_cb_walker_cnt == 0) {
    347 			list_remove(&nce->nce_cb, nce_cb);
    348 		} else {
    349 			ip2m_wdata->ip2m_cancel_err = EBUSY;
    350 		}
    351 		break;
    352 	}
    353 	mutex_exit(&nce->nce_lock);
    354 	return (0);
    355 }
    356 
    357 /*
    358  * cancel an outstanding timeout set up via ip2mac
    359  */
    360 int
    361 ip2mac_cancel(ip2mac_id_t ip2mid, zoneid_t zoneid)
    362 {
    363 	netstack_t	*ns;
    364 	ip_stack_t	*ipst;
    365 	ip2mac_cancel_data_t ip2m_wdata;
    366 
    367 	ns = netstack_find_by_zoneid(zoneid);
    368 	if (ns == NULL) {
    369 		ip2m_wdata.ip2m_cancel_err = EINVAL;
    370 		return (ip2m_wdata.ip2m_cancel_err);
    371 	}
    372 	/*
    373 	 * For exclusive stacks we reset the zoneid to zero
    374 	 * since IP uses the global zoneid in the exclusive stacks.
    375 	 */
    376 	if (ns->netstack_stackid != GLOBAL_NETSTACKID)
    377 		zoneid = GLOBAL_ZONEID;
    378 	ipst = ns->netstack_ip;
    379 
    380 	ip2m_wdata.ip2m_cancel_id = ip2mid;
    381 	ip2m_wdata.ip2m_cancel_err = 0;
    382 	ndp_walk(NULL, ip2mac_cancel_callback, &ip2m_wdata, ipst);
    383 	/*
    384 	 * We may return EBUSY if a walk to dispatch callbacks is
    385 	 * in progress, in which case the caller needs to synchronize
    386 	 * with the registered callback function to make sure the
    387 	 * module does not exit when there is a callback pending.
    388 	 */
    389 	netstack_rele(ns);
    390 	return (ip2m_wdata.ip2m_cancel_err);
    391 }
    392