Home | History | Annotate | Download | only in ip
      1 /*
      2  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
      3  * Use is subject to license terms.
      4  */
      5 
      6 /*
      7  * Copyright (c) 1988, 1991, 1993
      8  *	The Regents of the University of California.  All rights reserved.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  * 3. All advertising materials mentioning features or use of this software
     19  *    must display the following acknowledgement:
     20  *	This product includes software developed by the University of
     21  *	California, Berkeley and its contributors.
     22  * 4. Neither the name of the University nor the names of its contributors
     23  *    may be used to endorse or promote products derived from this software
     24  *    without specific prior written permission.
     25  *
     26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     36  * SUCH DAMAGE.
     37  *
     38  *	@(#)rtsock.c	8.6 (Berkeley) 2/11/95
     39  */
     40 
     41 #pragma ident	"%Z%%M%	%I%	%E% SMI"
     42 
     43 /*
     44  * This file contains routines that process routing socket requests.
     45  */
     46 
     47 #include <sys/types.h>
     48 #include <sys/stream.h>
     49 #include <sys/stropts.h>
     50 #include <sys/ddi.h>
     51 #include <sys/cmn_err.h>
     52 #include <sys/debug.h>
     53 #include <sys/policy.h>
     54 #include <sys/zone.h>
     55 
     56 #include <sys/systm.h>
     57 #include <sys/param.h>
     58 #include <sys/socket.h>
     59 #include <sys/strsun.h>
     60 #include <net/if.h>
     61 #include <net/route.h>
     62 #include <netinet/in.h>
     63 #include <net/if_dl.h>
     64 #include <netinet/ip6.h>
     65 
     66 #include <inet/common.h>
     67 #include <inet/ip.h>
     68 #include <inet/ip6.h>
     69 #include <inet/ip_if.h>
     70 #include <inet/ip_ire.h>
     71 #include <inet/ip_ftable.h>
     72 #include <inet/ip_rts.h>
     73 
     74 #include <inet/ipclassifier.h>
     75 
     76 #include <sys/tsol/tndb.h>
     77 #include <sys/tsol/tnet.h>
     78 
     79 #define	RTS_MSG_SIZE(type, rtm_addrs, af, sacnt) \
     80 	(rts_data_msg_size(rtm_addrs, af, sacnt) + rts_header_msg_size(type))
     81 
     82 static size_t	rts_copyfromsockaddr(struct sockaddr *sa, in6_addr_t *addrp);
     83 static void	rts_fill_msg(int type, int rtm_addrs, ipaddr_t dst,
     84     ipaddr_t mask, ipaddr_t gateway, ipaddr_t src_addr, ipaddr_t brd_addr,
     85     ipaddr_t author, const ipif_t *ipif, mblk_t *mp, uint_t, const tsol_gc_t *);
     86 static int	rts_getaddrs(rt_msghdr_t *rtm, in6_addr_t *dst_addrp,
     87     in6_addr_t *gw_addrp, in6_addr_t *net_maskp, in6_addr_t *authorp,
     88     in6_addr_t *if_addrp, in6_addr_t *src_addrp, ushort_t *indexp,
     89     sa_family_t *afp, tsol_rtsecattr_t *rtsecattr, int *error);
     90 static void	rts_getifdata(if_data_t *if_data, const ipif_t *ipif);
     91 static int	rts_getmetrics(ire_t *ire, rt_metrics_t *metrics);
     92 static mblk_t	*rts_rtmget(mblk_t *mp, ire_t *ire, ire_t *sire,
     93     sa_family_t af);
     94 static void	rts_setmetrics(ire_t *ire, uint_t which, rt_metrics_t *metrics);
     95 static void	ip_rts_request_retry(ipsq_t *, queue_t *q, mblk_t *mp, void *);
     96 
     97 /*
     98  * Send the ack to all the routing queues.  In case of the originating queue,
     99  * send it only if the loopback is set.
    100  *
    101  * Messages are sent upstream only on routing sockets that did not specify an
    102  * address family when they were created or when the address family matches the
    103  * one specified by the caller.
    104  *
    105  */
    106 void
    107 rts_queue_input(mblk_t *mp, queue_t *q, sa_family_t af, ip_stack_t *ipst)
    108 {
    109 	mblk_t	*mp1;
    110 	int	checkqfull;
    111 	conn_t 	*connp, *next_connp;
    112 
    113 	mutex_enter(&ipst->ips_rts_clients->connf_lock);
    114 	connp = ipst->ips_rts_clients->connf_head;
    115 
    116 	while (connp != NULL) {
    117 		/*
    118 		 * If there was a family specified when this routing socket was
    119 		 * created and it doesn't match the family of the message to
    120 		 * copy, then continue.
    121 		 */
    122 		if ((connp->conn_proto != AF_UNSPEC) &&
    123 		    (connp->conn_proto != af)) {
    124 			connp = connp->conn_next;
    125 			continue;
    126 		}
    127 		/*
    128 		 * For the originating queue, we only copy the message upstream
    129 		 * if loopback is set.  For others reading on the routing
    130 		 * socket, we check if there is room upstream for a copy of the
    131 		 * message.
    132 		 */
    133 		if ((q != NULL) && (CONNP_TO_RQ(connp) == RD(q))) {
    134 			if (connp->conn_loopback == 0) {
    135 				connp = connp->conn_next;
    136 				continue;
    137 			}
    138 			/*
    139 			 * Just because it is the same queue doesn't mean it
    140 			 * will promptly read its acks. Have to avoid using
    141 			 * all of kernel memory.
    142 			 */
    143 			checkqfull = B_TRUE;
    144 		} else {
    145 			checkqfull = B_TRUE;
    146 		}
    147 		CONN_INC_REF(connp);
    148 		mutex_exit(&ipst->ips_rts_clients->connf_lock);
    149 		/* Pass to rts_input */
    150 		if (!checkqfull || canputnext(CONNP_TO_RQ(connp))) {
    151 			mp1 = dupmsg(mp);
    152 			if (mp1 == NULL)
    153 				mp1 = copymsg(mp);
    154 			if (mp1 != NULL)
    155 				(connp->conn_recv)(connp, mp1, NULL);
    156 		}
    157 
    158 		mutex_enter(&ipst->ips_rts_clients->connf_lock);
    159 		/* Follow the next pointer before releasing the conn. */
    160 		next_connp = connp->conn_next;
    161 		CONN_DEC_REF(connp);
    162 		connp = next_connp;
    163 	}
    164 	mutex_exit(&ipst->ips_rts_clients->connf_lock);
    165 	freemsg(mp);
    166 }
    167 
    168 /*
    169  * Takes an ire and sends an ack to all the routing sockets. This
    170  * routine is used
    171  * - when a route is created/deleted through the ioctl interface.
    172  * - when ire_expire deletes a stale redirect
    173  */
    174 void
    175 ip_rts_rtmsg(int type, ire_t *ire, int error, ip_stack_t *ipst)
    176 {
    177 	mblk_t		*mp;
    178 	rt_msghdr_t	*rtm;
    179 	int		rtm_addrs = (RTA_DST | RTA_NETMASK | RTA_GATEWAY);
    180 	sa_family_t	af;
    181 	in6_addr_t	gw_addr_v6;
    182 
    183 	if (ire == NULL)
    184 		return;
    185 	ASSERT(ire->ire_ipversion == IPV4_VERSION ||
    186 	    ire->ire_ipversion == IPV6_VERSION);
    187 
    188 	if (ire->ire_flags & RTF_SETSRC)
    189 		rtm_addrs |= RTA_SRC;
    190 
    191 	switch (ire->ire_ipversion) {
    192 	case IPV4_VERSION:
    193 		af = AF_INET;
    194 		mp = rts_alloc_msg(type, rtm_addrs, af, 0);
    195 		if (mp == NULL)
    196 			return;
    197 		rts_fill_msg(type, rtm_addrs, ire->ire_addr, ire->ire_mask,
    198 		    ire->ire_gateway_addr, ire->ire_src_addr, 0, 0, NULL, mp,
    199 		    0, NULL);
    200 		break;
    201 	case IPV6_VERSION:
    202 		af = AF_INET6;
    203 		mp = rts_alloc_msg(type, rtm_addrs, af, 0);
    204 		if (mp == NULL)
    205 			return;
    206 		mutex_enter(&ire->ire_lock);
    207 		gw_addr_v6 = ire->ire_gateway_addr_v6;
    208 		mutex_exit(&ire->ire_lock);
    209 		rts_fill_msg_v6(type, rtm_addrs, &ire->ire_addr_v6,
    210 		    &ire->ire_mask_v6, &gw_addr_v6,
    211 		    &ire->ire_src_addr_v6, &ipv6_all_zeros, &ipv6_all_zeros,
    212 		    NULL, mp, 0, NULL);
    213 		break;
    214 	}
    215 	rtm = (rt_msghdr_t *)mp->b_rptr;
    216 	mp->b_wptr = (uchar_t *)&mp->b_rptr[rtm->rtm_msglen];
    217 	rtm->rtm_addrs = rtm_addrs;
    218 	rtm->rtm_flags = ire->ire_flags;
    219 	if (error != 0)
    220 		rtm->rtm_errno = error;
    221 	else
    222 		rtm->rtm_flags |= RTF_DONE;
    223 	rts_queue_input(mp, NULL, af, ipst);
    224 }
    225 
    226 /* ARGSUSED */
    227 static void
    228 ip_rts_request_retry(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp, void *dummy)
    229 {
    230 	(void) ip_rts_request(q, mp, DB_CRED(mp));
    231 }
    232 
    233 /*
    234  * This is a call from the RTS module
    235  * indicating that this is a Routing Socket
    236  * Stream. Insert this conn_t in routing
    237  * socket client list.
    238  */
    239 void
    240 ip_rts_register(conn_t *connp)
    241 {
    242 	ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
    243 
    244 	connp->conn_loopback = 1;
    245 	ipcl_hash_insert_wildcard(ipst->ips_rts_clients, connp);
    246 }
    247 
    248 /*
    249  * This is a call from the RTS module indicating that it is closing.
    250  */
    251 void
    252 ip_rts_unregister(conn_t *connp)
    253 {
    254 	ipcl_hash_remove(connp);
    255 }
    256 
    257 /*
    258  * Processes requests received on a routing socket. It extracts all the
    259  * arguments and calls the appropriate function to process the request.
    260  *
    261  * RTA_SRC bit flag requests are sent by 'route -setsrc'.
    262  *
    263  * In general, this function does not consume the message supplied but rather
    264  * sends the message upstream with an appropriate UNIX errno.
    265  *
    266  * We may need to restart this operation if the ipif cannot be looked up
    267  * due to an exclusive operation that is currently in progress. The restart
    268  * entry point is ip_rts_request_retry. While the request is enqueued in the
    269  * ipsq the ioctl could be aborted and the conn close. To ensure that we don't
    270  * have stale conn pointers, ip_wput_ioctl does a conn refhold. This is
    271  * released at the completion of the rts ioctl at the end of this function
    272  * by calling CONN_OPER_PENDING_DONE or when the ioctl is aborted and
    273  * conn close occurs in conn_ioctl_cleanup.
    274  */
    275 int
    276 ip_rts_request(queue_t *q, mblk_t *mp, cred_t *ioc_cr)
    277 {
    278 	rt_msghdr_t	*rtm = NULL;
    279 	in6_addr_t	dst_addr_v6;
    280 	in6_addr_t	src_addr_v6;
    281 	in6_addr_t	gw_addr_v6;
    282 	in6_addr_t	net_mask_v6;
    283 	in6_addr_t	author_v6;
    284 	in6_addr_t	if_addr_v6;
    285 	mblk_t		*mp1, *ioc_mp = mp;
    286 	ire_t		*ire = NULL;
    287 	ire_t		*sire = NULL;
    288 	int		error = 0;
    289 	int		match_flags = MATCH_IRE_DSTONLY;
    290 	int		match_flags_local = MATCH_IRE_TYPE | MATCH_IRE_GW;
    291 	int		found_addrs;
    292 	sa_family_t	af;
    293 	ipaddr_t	dst_addr;
    294 	ipaddr_t	gw_addr;
    295 	ipaddr_t	src_addr;
    296 	ipaddr_t	net_mask;
    297 	ushort_t	index;
    298 	ipif_t		*ipif = NULL;
    299 	ipif_t		*tmp_ipif = NULL;
    300 	IOCP		iocp = (IOCP)mp->b_rptr;
    301 	conn_t		*connp;
    302 	boolean_t	gcgrp_xtraref = B_FALSE;
    303 	tsol_gcgrp_addr_t ga;
    304 	tsol_rtsecattr_t rtsecattr;
    305 	struct rtsa_s	*rtsap = NULL;
    306 	tsol_gcgrp_t	*gcgrp = NULL;
    307 	tsol_gc_t	*gc = NULL;
    308 	ts_label_t	*tsl = NULL;
    309 	zoneid_t	zoneid;
    310 	ip_stack_t	*ipst;
    311 
    312 	ip1dbg(("ip_rts_request: mp is %x\n", DB_TYPE(mp)));
    313 
    314 	ASSERT(CONN_Q(q));
    315 	connp = Q_TO_CONN(q);
    316 	zoneid = connp->conn_zoneid;
    317 	ipst = connp->conn_netstack->netstack_ip;
    318 
    319 	ASSERT(mp->b_cont != NULL);
    320 	/* ioc_mp holds mp */
    321 	mp = mp->b_cont;
    322 
    323 	/*
    324 	 * The Routing Socket data starts on
    325 	 * next block. If there is no next block
    326 	 * this is an indication from routing module
    327 	 * that it is a routing socket stream queue.
    328 	 * We need to support that for compatibility with SDP since
    329 	 * it has a contract private interface to use IP_IOC_RTS_REQUEST.
    330 	 */
    331 	if (mp->b_cont == NULL) {
    332 		/*
    333 		 * This is a message from SDP
    334 		 * indicating that this is a Routing Socket
    335 		 * Stream. Insert this conn_t in routing
    336 		 * socket client list.
    337 		 */
    338 		connp->conn_loopback = 1;
    339 		ipcl_hash_insert_wildcard(ipst->ips_rts_clients, connp);
    340 		goto done;
    341 	}
    342 	mp1 = dupmsg(mp->b_cont);
    343 	if (mp1 == NULL) {
    344 		error  = ENOBUFS;
    345 		goto done;
    346 	}
    347 	mp = mp1;
    348 
    349 	if (mp->b_cont != NULL && !pullupmsg(mp, -1)) {
    350 		freemsg(mp);
    351 		error =  EINVAL;
    352 		goto done;
    353 	}
    354 	if ((mp->b_wptr - mp->b_rptr) < sizeof (rt_msghdr_t)) {
    355 		freemsg(mp);
    356 		error = EINVAL;
    357 		goto done;
    358 	}
    359 
    360 	/*
    361 	 * Check the routing message for basic consistency including the
    362 	 * version number and that the number of octets written is the same
    363 	 * as specified by the rtm_msglen field.
    364 	 *
    365 	 * At this point, an error can be delivered back via rtm_errno.
    366 	 */
    367 	rtm = (rt_msghdr_t *)mp->b_rptr;
    368 	if ((mp->b_wptr - mp->b_rptr) != rtm->rtm_msglen) {
    369 		error = EINVAL;
    370 		goto done;
    371 	}
    372 	if (rtm->rtm_version != RTM_VERSION) {
    373 		error = EPROTONOSUPPORT;
    374 		goto done;
    375 	}
    376 
    377 	/* Only allow RTM_GET or RTM_RESOLVE for unprivileged process */
    378 	if (rtm->rtm_type != RTM_GET &&
    379 	    rtm->rtm_type != RTM_RESOLVE &&
    380 	    (ioc_cr == NULL ||
    381 	    secpolicy_ip_config(ioc_cr, B_FALSE) != 0)) {
    382 		error = EPERM;
    383 		goto done;
    384 	}
    385 
    386 	found_addrs = rts_getaddrs(rtm, &dst_addr_v6, &gw_addr_v6, &net_mask_v6,
    387 	    &author_v6, &if_addr_v6, &src_addr_v6, &index, &af, &rtsecattr,
    388 	    &error);
    389 
    390 	if (error != 0)
    391 		goto done;
    392 
    393 	if ((found_addrs & RTA_DST) == 0) {
    394 		error = EINVAL;
    395 		goto done;
    396 	}
    397 
    398 	/*
    399 	 * Based on the address family of the destination address, determine
    400 	 * the destination, gateway and netmask and return the appropriate error
    401 	 * if an unknown address family was specified (following the errno
    402 	 * values that 4.4BSD-Lite2 returns.)
    403 	 */
    404 	switch (af) {
    405 	case AF_INET:
    406 		IN6_V4MAPPED_TO_IPADDR(&dst_addr_v6, dst_addr);
    407 		IN6_V4MAPPED_TO_IPADDR(&src_addr_v6, src_addr);
    408 		IN6_V4MAPPED_TO_IPADDR(&gw_addr_v6, gw_addr);
    409 		if (((found_addrs & RTA_NETMASK) == 0) ||
    410 		    (rtm->rtm_flags & RTF_HOST))
    411 			net_mask = IP_HOST_MASK;
    412 		else
    413 			IN6_V4MAPPED_TO_IPADDR(&net_mask_v6, net_mask);
    414 		break;
    415 	case AF_INET6:
    416 		if (((found_addrs & RTA_NETMASK) == 0) ||
    417 		    (rtm->rtm_flags & RTF_HOST))
    418 			net_mask_v6 = ipv6_all_ones;
    419 		break;
    420 	default:
    421 		/*
    422 		 * These errno values are meant to be compatible with
    423 		 * 4.4BSD-Lite2 for the given message types.
    424 		 */
    425 		switch (rtm->rtm_type) {
    426 		case RTM_ADD:
    427 		case RTM_DELETE:
    428 			error = ESRCH;
    429 			goto done;
    430 		case RTM_GET:
    431 		case RTM_CHANGE:
    432 			error = EAFNOSUPPORT;
    433 			goto done;
    434 		default:
    435 			error = EOPNOTSUPP;
    436 			goto done;
    437 		}
    438 	}
    439 
    440 	/*
    441 	 * At this point, the address family must be something known.
    442 	 */
    443 	ASSERT(af == AF_INET || af == AF_INET6);
    444 
    445 	if (index != 0) {
    446 		ill_t   *ill;
    447 
    448 		/*
    449 		 * IPC must be refheld somewhere in ip_wput_nondata or
    450 		 * ip_wput_ioctl etc... and cleaned up if ioctl is killed.
    451 		 * If ILL_CHANGING the request is queued in the ipsq.
    452 		 */
    453 		ill = ill_lookup_on_ifindex(index, af == AF_INET6,
    454 		    CONNP_TO_WQ(connp), ioc_mp, ip_rts_request_retry, &error,
    455 		    ipst);
    456 		if (ill == NULL) {
    457 			if (error != EINPROGRESS)
    458 				error = EINVAL;
    459 			goto done;
    460 		}
    461 
    462 		ipif = ipif_get_next_ipif(NULL, ill);
    463 		ill_refrele(ill);
    464 		/*
    465 		 * If this is replacement ipif, prevent a route from
    466 		 * being added.
    467 		 */
    468 		if (ipif != NULL && ipif->ipif_replace_zero) {
    469 			error = ENETDOWN;
    470 			goto done;
    471 		}
    472 		match_flags |= MATCH_IRE_ILL;
    473 	}
    474 
    475 	/*
    476 	 * If a netmask was supplied in the message, then subsequent route
    477 	 * lookups will attempt to match on the netmask as well.
    478 	 */
    479 	if ((found_addrs & RTA_NETMASK) != 0)
    480 		match_flags |= MATCH_IRE_MASK;
    481 
    482 	/*
    483 	 * We only process any passed-in route security attributes for
    484 	 * either RTM_ADD or RTM_CHANGE message; We overload them
    485 	 * to do an RTM_GET as a different label; ignore otherwise.
    486 	 */
    487 	if (rtm->rtm_type == RTM_ADD || rtm->rtm_type == RTM_CHANGE ||
    488 	    rtm->rtm_type == RTM_GET) {
    489 		ASSERT(rtsecattr.rtsa_cnt <= TSOL_RTSA_REQUEST_MAX);
    490 		if (rtsecattr.rtsa_cnt > 0)
    491 			rtsap = &rtsecattr.rtsa_attr[0];
    492 	}
    493 
    494 	switch (rtm->rtm_type) {
    495 	case RTM_ADD:
    496 		/* if we are adding a route, gateway is a must */
    497 		if ((found_addrs & RTA_GATEWAY) == 0) {
    498 			error = EINVAL;
    499 			goto done;
    500 		}
    501 
    502 		/* Multirouting does not support net routes. */
    503 		if ((rtm->rtm_flags & (RTF_MULTIRT | RTF_HOST)) ==
    504 		    RTF_MULTIRT) {
    505 			error = EADDRNOTAVAIL;
    506 			goto done;
    507 		}
    508 
    509 		/*
    510 		 * Multirouting and user-specified source addresses
    511 		 * do not support interface based routing.
    512 		 * Assigning a source address to an interface based
    513 		 * route is achievable by plumbing a new ipif and
    514 		 * setting up the interface route via this ipif,
    515 		 * though.
    516 		 */
    517 		if (rtm->rtm_flags & (RTF_MULTIRT | RTF_SETSRC)) {
    518 			if ((rtm->rtm_flags & RTF_GATEWAY) == 0) {
    519 				error = EADDRNOTAVAIL;
    520 				goto done;
    521 			}
    522 		}
    523 
    524 		switch (af) {
    525 		case AF_INET:
    526 			if (src_addr != INADDR_ANY) {
    527 				/*
    528 				 * The RTF_SETSRC flag is present, check that
    529 				 * the supplied src address is not the loopback
    530 				 * address. This would produce martian packets.
    531 				 */
    532 				if (src_addr == htonl(INADDR_LOOPBACK)) {
    533 					error = EINVAL;
    534 					goto done;
    535 				}
    536 				/*
    537 				 * Also check that the supplied address is a
    538 				 * valid, local one.
    539 				 */
    540 				tmp_ipif = ipif_lookup_addr(src_addr, NULL,
    541 				    ALL_ZONES, CONNP_TO_WQ(connp), ioc_mp,
    542 				    ip_rts_request_retry, &error, ipst);
    543 				if (tmp_ipif == NULL) {
    544 					if (error != EINPROGRESS)
    545 						error = EADDRNOTAVAIL;
    546 					goto done;
    547 				}
    548 				if (!(tmp_ipif->ipif_flags & IPIF_UP) ||
    549 				    (tmp_ipif->ipif_flags &
    550 				    (IPIF_NOLOCAL | IPIF_ANYCAST))) {
    551 					error = EINVAL;
    552 					goto done;
    553 				}
    554 			} else {
    555 				/*
    556 				 * The RTF_SETSRC modifier must be associated
    557 				 * to a non-null source address.
    558 				 */
    559 				if (rtm->rtm_flags & RTF_SETSRC) {
    560 					error = EINVAL;
    561 					goto done;
    562 				}
    563 			}
    564 
    565 			error = ip_rt_add(dst_addr, net_mask, gw_addr, src_addr,
    566 			    rtm->rtm_flags, ipif, &ire, B_FALSE,
    567 			    CONNP_TO_WQ(connp), ioc_mp, ip_rts_request_retry,
    568 			    rtsap, ipst);
    569 			if (ipif != NULL)
    570 				ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock));
    571 			break;
    572 		case AF_INET6:
    573 			if (!IN6_IS_ADDR_UNSPECIFIED(&src_addr_v6)) {
    574 				/*
    575 				 * The RTF_SETSRC flag is present, check that
    576 				 * the supplied src address is not the loopback
    577 				 * address. This would produce martian packets.
    578 				 */
    579 				if (IN6_IS_ADDR_LOOPBACK(&src_addr_v6)) {
    580 					error = EINVAL;
    581 					goto done;
    582 				}
    583 				/*
    584 				 * Also check that the supplied address is a
    585 				 * valid, local one.
    586 				 */
    587 				tmp_ipif = ipif_lookup_addr_v6(&src_addr_v6,
    588 				    NULL, ALL_ZONES, CONNP_TO_WQ(connp), ioc_mp,
    589 				    ip_rts_request_retry, &error, ipst);
    590 				if (tmp_ipif == NULL) {
    591 					if (error != EINPROGRESS)
    592 						error = EADDRNOTAVAIL;
    593 					goto done;
    594 				}
    595 
    596 				if (!(tmp_ipif->ipif_flags & IPIF_UP) ||
    597 				    (tmp_ipif->ipif_flags &
    598 				    (IPIF_NOLOCAL | IPIF_ANYCAST))) {
    599 					error = EINVAL;
    600 					goto done;
    601 				}
    602 
    603 				error = ip_rt_add_v6(&dst_addr_v6, &net_mask_v6,
    604 				    &gw_addr_v6, &src_addr_v6, rtm->rtm_flags,
    605 				    ipif, &ire, CONNP_TO_WQ(connp), ioc_mp,
    606 				    ip_rts_request_retry, rtsap, ipst);
    607 				break;
    608 			}
    609 			/*
    610 			 * The RTF_SETSRC modifier must be associated
    611 			 * to a non-null source address.
    612 			 */
    613 			if (rtm->rtm_flags & RTF_SETSRC) {
    614 				error = EINVAL;
    615 				goto done;
    616 			}
    617 			error = ip_rt_add_v6(&dst_addr_v6, &net_mask_v6,
    618 			    &gw_addr_v6, NULL, rtm->rtm_flags,
    619 			    ipif, &ire, CONNP_TO_WQ(connp), ioc_mp,
    620 			    ip_rts_request_retry, rtsap, ipst);
    621 			if (ipif != NULL)
    622 				ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock));
    623 			break;
    624 		}
    625 		if (error != 0)
    626 			goto done;
    627 		ASSERT(ire != NULL);
    628 		rts_setmetrics(ire, rtm->rtm_inits, &rtm->rtm_rmx);
    629 		break;
    630 	case RTM_DELETE:
    631 		/* if we are deleting a route, gateway is a must */
    632 		if ((found_addrs & RTA_GATEWAY) == 0) {
    633 			error = EINVAL;
    634 			goto done;
    635 		}
    636 		/*
    637 		 * The RTF_SETSRC modifier does not make sense
    638 		 * when deleting a route.
    639 		 */
    640 		if (rtm->rtm_flags & RTF_SETSRC) {
    641 			error = EINVAL;
    642 			goto done;
    643 		}
    644 
    645 		switch (af) {
    646 		case AF_INET:
    647 			error = ip_rt_delete(dst_addr, net_mask, gw_addr,
    648 			    found_addrs, rtm->rtm_flags, ipif, B_FALSE,
    649 			    CONNP_TO_WQ(connp), ioc_mp, ip_rts_request_retry,
    650 			    ipst);
    651 			break;
    652 		case AF_INET6:
    653 			error = ip_rt_delete_v6(&dst_addr_v6, &net_mask_v6,
    654 			    &gw_addr_v6, found_addrs, rtm->rtm_flags, ipif,
    655 			    CONNP_TO_WQ(connp), ioc_mp, ip_rts_request_retry,
    656 			    ipst);
    657 			break;
    658 		}
    659 		break;
    660 	case RTM_GET:
    661 	case RTM_CHANGE:
    662 		/*
    663 		 * In the case of RTM_GET, the forwarding table should be
    664 		 * searched recursively with default being matched if the
    665 		 * specific route doesn't exist.  Also, if a gateway was
    666 		 * specified then the gateway address must also be matched.
    667 		 *
    668 		 * In the case of RTM_CHANGE, the gateway address (if supplied)
    669 		 * is the new gateway address so matching on the gateway address
    670 		 * is not done.  This can lead to ambiguity when looking up the
    671 		 * route to change as usually only the destination (and netmask,
    672 		 * if supplied) is used for the lookup.  However if a RTA_IFP
    673 		 * sockaddr is also supplied, it can disambiguate which route to
    674 		 * change provided the ambiguous routes are tied to distinct
    675 		 * ill's (or interface indices).  If the routes are not tied to
    676 		 * any particular interfaces (for example, with traditional
    677 		 * gateway routes), then a RTA_IFP sockaddr will be of no use as
    678 		 * it won't match any such routes.
    679 		 * RTA_SRC is not supported for RTM_GET and RTM_CHANGE,
    680 		 * except when RTM_CHANGE is combined to RTF_SETSRC.
    681 		 */
    682 		if (((found_addrs & RTA_SRC) != 0) &&
    683 		    ((rtm->rtm_type == RTM_GET) ||
    684 		    !(rtm->rtm_flags & RTF_SETSRC))) {
    685 			error = EOPNOTSUPP;
    686 			goto done;
    687 		}
    688 
    689 		if (rtm->rtm_type == RTM_GET) {
    690 			match_flags |=
    691 			    (MATCH_IRE_DEFAULT | MATCH_IRE_RECURSIVE |
    692 			    MATCH_IRE_SECATTR);
    693 			match_flags_local |= MATCH_IRE_SECATTR;
    694 			if ((found_addrs & RTA_GATEWAY) != 0)
    695 				match_flags |= MATCH_IRE_GW;
    696 			if (ioc_cr)
    697 				tsl = crgetlabel(ioc_cr);
    698 			if (rtsap != NULL) {
    699 				if (rtsa_validate(rtsap) != 0) {
    700 					error = EINVAL;
    701 					goto done;
    702 				}
    703 				if (tsl != NULL &&
    704 				    crgetzoneid(ioc_cr) != GLOBAL_ZONEID &&
    705 				    (tsl->tsl_doi != rtsap->rtsa_doi ||
    706 				    !bldominates(&tsl->tsl_label,
    707 				    &rtsap->rtsa_slrange.lower_bound))) {
    708 					error = EPERM;
    709 					goto done;
    710 				}
    711 				tsl = labelalloc(
    712 				    &rtsap->rtsa_slrange.lower_bound,
    713 				    rtsap->rtsa_doi, KM_NOSLEEP);
    714 			}
    715 		}
    716 		if (rtm->rtm_type == RTM_CHANGE) {
    717 			if ((found_addrs & RTA_GATEWAY) &&
    718 			    (rtm->rtm_flags & RTF_SETSRC)) {
    719 				/*
    720 				 * Do not want to change the gateway,
    721 				 * but rather the source address.
    722 				 */
    723 				match_flags |= MATCH_IRE_GW;
    724 			}
    725 		}
    726 
    727 		/*
    728 		 * If the netmask is all ones (either as supplied or as derived
    729 		 * above), then first check for an IRE_LOOPBACK or
    730 		 * IRE_LOCAL entry.
    731 		 *
    732 		 * If we didn't check for or find an IRE_LOOPBACK or IRE_LOCAL
    733 		 * entry, then look in the forwarding table.
    734 		 */
    735 		switch (af) {
    736 		case AF_INET:
    737 			if (net_mask == IP_HOST_MASK) {
    738 				ire = ire_ctable_lookup(dst_addr, gw_addr,
    739 				    IRE_LOCAL | IRE_LOOPBACK, NULL, zoneid,
    740 				    tsl, match_flags_local, ipst);
    741 				/*
    742 				 * If we found an IRE_LOCAL, make sure
    743 				 * it is one that would be used by this
    744 				 * zone to send packets.
    745 				 */
    746 				if (ire != NULL &&
    747 				    ire->ire_type == IRE_LOCAL &&
    748 				    ipst->ips_ip_restrict_interzone_loopback &&
    749 				    !ire_local_ok_across_zones(ire,
    750 				    zoneid, &dst_addr, tsl, ipst)) {
    751 					ire_refrele(ire);
    752 					ire = NULL;
    753 				}
    754 			}
    755 			if (ire == NULL) {
    756 				ire = ire_ftable_lookup(dst_addr, net_mask,
    757 				    gw_addr, 0, ipif, &sire, zoneid, 0,
    758 				    tsl, match_flags, ipst);
    759 			}
    760 			break;
    761 		case AF_INET6:
    762 			if (IN6_ARE_ADDR_EQUAL(&net_mask_v6, &ipv6_all_ones)) {
    763 				ire = ire_ctable_lookup_v6(&dst_addr_v6,
    764 				    &gw_addr_v6, IRE_LOCAL | IRE_LOOPBACK, NULL,
    765 				    zoneid, tsl, match_flags_local, ipst);
    766 				/*
    767 				 * If we found an IRE_LOCAL, make sure
    768 				 * it is one that would be used by this
    769 				 * zone to send packets.
    770 				 */
    771 				if (ire != NULL &&
    772 				    ire->ire_type == IRE_LOCAL &&
    773 				    ipst->ips_ip_restrict_interzone_loopback &&
    774 				    !ire_local_ok_across_zones(ire,
    775 				    zoneid, (void *)&dst_addr_v6, tsl, ipst)) {
    776 					ire_refrele(ire);
    777 					ire = NULL;
    778 				}
    779 			}
    780 			if (ire == NULL) {
    781 				ire = ire_ftable_lookup_v6(&dst_addr_v6,
    782 				    &net_mask_v6, &gw_addr_v6, 0, ipif, &sire,
    783 				    zoneid, 0, tsl, match_flags, ipst);
    784 			}
    785 			break;
    786 		}
    787 		if (tsl != NULL && tsl != crgetlabel(ioc_cr))
    788 			label_rele(tsl);
    789 
    790 		if (ire == NULL) {
    791 			error = ESRCH;
    792 			goto done;
    793 		}
    794 		/* we know the IRE before we come here */
    795 		switch (rtm->rtm_type) {
    796 		case RTM_GET:
    797 			mp1 = rts_rtmget(mp, ire, sire, af);
    798 			if (mp1 == NULL) {
    799 				error = ENOBUFS;
    800 				goto done;
    801 			}
    802 			freemsg(mp);
    803 			mp = mp1;
    804 			rtm = (rt_msghdr_t *)mp->b_rptr;
    805 			break;
    806 		case RTM_CHANGE:
    807 			/*
    808 			 * Do not allow the multirouting state of a route
    809 			 * to be changed. This aims to prevent undesirable
    810 			 * stages where both multirt and non-multirt routes
    811 			 * for the same destination are declared.
    812 			 */
    813 			if ((ire->ire_flags & RTF_MULTIRT) !=
    814 			    (rtm->rtm_flags & RTF_MULTIRT)) {
    815 				error = EINVAL;
    816 				goto done;
    817 			}
    818 			/*
    819 			 * Note that we do not need to do
    820 			 * ire_flush_cache_*(IRE_FLUSH_ADD) as a change
    821 			 * in metrics or gateway will not affect existing
    822 			 * routes since it does not create a more specific
    823 			 * route.
    824 			 */
    825 			switch (af) {
    826 			case AF_INET:
    827 				ire_flush_cache_v4(ire, IRE_FLUSH_DELETE);
    828 				if ((found_addrs & RTA_GATEWAY) != 0 &&
    829 				    (ire->ire_gateway_addr != gw_addr)) {
    830 					ire->ire_gateway_addr = gw_addr;
    831 				}
    832 
    833 				if (rtsap != NULL) {
    834 					ga.ga_af = AF_INET;
    835 					IN6_IPADDR_TO_V4MAPPED(
    836 					    ire->ire_gateway_addr, &ga.ga_addr);
    837 
    838 					gcgrp = gcgrp_lookup(&ga, B_TRUE);
    839 					if (gcgrp == NULL) {
    840 						error = ENOMEM;
    841 						goto done;
    842 					}
    843 				}
    844 
    845 				if ((found_addrs & RTA_SRC) != 0 &&
    846 				    (rtm->rtm_flags & RTF_SETSRC) != 0 &&
    847 				    (ire->ire_src_addr != src_addr)) {
    848 
    849 					if (src_addr != INADDR_ANY) {
    850 						/*
    851 						 * The RTF_SETSRC flag is
    852 						 * present, check that the
    853 						 * supplied src address is not
    854 						 * the loopback address. This
    855 						 * would produce martian
    856 						 * packets.
    857 						 */
    858 						if (src_addr ==
    859 						    htonl(INADDR_LOOPBACK)) {
    860 							error = EINVAL;
    861 							goto done;
    862 						}
    863 						/*
    864 						 * Also check that the
    865 						 * supplied addr is a valid
    866 						 * local address.
    867 						 */
    868 						tmp_ipif = ipif_lookup_addr(
    869 						    src_addr, NULL, ALL_ZONES,
    870 						    CONNP_TO_WQ(connp), ioc_mp,
    871 						    ip_rts_request_retry,
    872 						    &error, ipst);
    873 						if (tmp_ipif == NULL) {
    874 							error = (error ==
    875 							    EINPROGRESS) ?
    876 							    error :
    877 							    EADDRNOTAVAIL;
    878 							goto done;
    879 						}
    880 
    881 						if (!(tmp_ipif->ipif_flags &
    882 						    IPIF_UP) ||
    883 						    (tmp_ipif->ipif_flags &
    884 						    (IPIF_NOLOCAL |
    885 						    IPIF_ANYCAST))) {
    886 							error = EINVAL;
    887 							goto done;
    888 						}
    889 						ire->ire_flags |= RTF_SETSRC;
    890 					} else {
    891 						ire->ire_flags &= ~RTF_SETSRC;
    892 					}
    893 					ire->ire_src_addr = src_addr;
    894 				}
    895 				break;
    896 			case AF_INET6:
    897 				ire_flush_cache_v6(ire, IRE_FLUSH_DELETE);
    898 				mutex_enter(&ire->ire_lock);
    899 				if ((found_addrs & RTA_GATEWAY) != 0 &&
    900 				    !IN6_ARE_ADDR_EQUAL(
    901 				    &ire->ire_gateway_addr_v6, &gw_addr_v6)) {
    902 					ire->ire_gateway_addr_v6 = gw_addr_v6;
    903 				}
    904 
    905 				if (rtsap != NULL) {
    906 					ga.ga_af = AF_INET6;
    907 					ga.ga_addr = ire->ire_gateway_addr_v6;
    908 
    909 					gcgrp = gcgrp_lookup(&ga, B_TRUE);
    910 					if (gcgrp == NULL) {
    911 						error = ENOMEM;
    912 						goto done;
    913 					}
    914 				}
    915 
    916 				if ((found_addrs & RTA_SRC) != 0 &&
    917 				    (rtm->rtm_flags & RTF_SETSRC) != 0 &&
    918 				    !IN6_ARE_ADDR_EQUAL(
    919 				    &ire->ire_src_addr_v6, &src_addr_v6)) {
    920 
    921 					if (!IN6_IS_ADDR_UNSPECIFIED(
    922 					    &src_addr_v6)) {
    923 						/*
    924 						 * The RTF_SETSRC flag is
    925 						 * present, check that the
    926 						 * supplied src address is not
    927 						 * the loopback address. This
    928 						 * would produce martian
    929 						 * packets.
    930 						 */
    931 						if (IN6_IS_ADDR_LOOPBACK(
    932 						    &src_addr_v6)) {
    933 							mutex_exit(
    934 							    &ire->ire_lock);
    935 							error = EINVAL;
    936 							goto done;
    937 						}
    938 						/*
    939 						 * Also check that the
    940 						 * supplied addr is a valid
    941 						 * local address.
    942 						 */
    943 						tmp_ipif = ipif_lookup_addr_v6(
    944 						    &src_addr_v6, NULL,
    945 						    ALL_ZONES,
    946 						    CONNP_TO_WQ(connp), ioc_mp,
    947 						    ip_rts_request_retry,
    948 						    &error, ipst);
    949 						if (tmp_ipif == NULL) {
    950 							mutex_exit(
    951 							    &ire->ire_lock);
    952 							error = (error ==
    953 							    EINPROGRESS) ?
    954 							    error :
    955 							    EADDRNOTAVAIL;
    956 							goto done;
    957 						}
    958 						if (!(tmp_ipif->ipif_flags &
    959 						    IPIF_UP) ||
    960 						    (tmp_ipif->ipif_flags &
    961 						    (IPIF_NOLOCAL |
    962 						    IPIF_ANYCAST))) {
    963 							mutex_exit(
    964 							    &ire->ire_lock);
    965 							error = EINVAL;
    966 							goto done;
    967 						}
    968 						ire->ire_flags |= RTF_SETSRC;
    969 					} else {
    970 						ire->ire_flags &= ~RTF_SETSRC;
    971 					}
    972 					ire->ire_src_addr_v6 = src_addr_v6;
    973 				}
    974 				mutex_exit(&ire->ire_lock);
    975 				break;
    976 			}
    977 
    978 			if (rtsap != NULL) {
    979 				in_addr_t ga_addr4;
    980 
    981 				ASSERT(gcgrp != NULL);
    982 
    983 				/*
    984 				 * Create and add the security attribute to
    985 				 * prefix IRE; it will add a reference to the
    986 				 * group upon allocating a new entry.  If it
    987 				 * finds an already-existing entry for the
    988 				 * security attribute, it simply returns it
    989 				 * and no new group reference is made.
    990 				 */
    991 				gc = gc_create(rtsap, gcgrp, &gcgrp_xtraref);
    992 				if (gc == NULL ||
    993 				    (error = tsol_ire_init_gwattr(ire,
    994 				    ire->ire_ipversion, gc, NULL)) != 0) {
    995 					if (gc != NULL) {
    996 						GC_REFRELE(gc);
    997 					} else {
    998 						/* gc_create failed */
    999 						error = ENOMEM;
   1000 					}
   1001 					goto done;
   1002 				}
   1003 
   1004 				/*
   1005 				 * Now delete any existing gateway IRE caches
   1006 				 * as well as all caches using the gateway,
   1007 				 * and allow them to be created on demand
   1008 				 * through ip_newroute{_v6}.
   1009 				 */
   1010 				IN6_V4MAPPED_TO_IPADDR(&ga.ga_addr, ga_addr4);
   1011 				if (af == AF_INET) {
   1012 					ire_clookup_delete_cache_gw(
   1013 					    ga_addr4, ALL_ZONES, ipst);
   1014 				} else {
   1015 					ire_clookup_delete_cache_gw_v6(
   1016 					    &ga.ga_addr, ALL_ZONES, ipst);
   1017 				}
   1018 			}
   1019 			rts_setmetrics(ire, rtm->rtm_inits, &rtm->rtm_rmx);
   1020 			break;
   1021 		}
   1022 		break;
   1023 	default:
   1024 		error = EOPNOTSUPP;
   1025 		break;
   1026 	}
   1027 done:
   1028 	if (ire != NULL)
   1029 		ire_refrele(ire);
   1030 	if (sire != NULL)
   1031 		ire_refrele(sire);
   1032 	if (ipif != NULL)
   1033 		ipif_refrele(ipif);
   1034 	if (tmp_ipif != NULL)
   1035 		ipif_refrele(tmp_ipif);
   1036 
   1037 	if (gcgrp_xtraref)
   1038 		GCGRP_REFRELE(gcgrp);
   1039 
   1040 	if (error == EINPROGRESS) {
   1041 		if (rtm != NULL)
   1042 			freemsg(mp);
   1043 		return (error);
   1044 	}
   1045 	if (rtm != NULL) {
   1046 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
   1047 		if (error != 0) {
   1048 			rtm->rtm_errno = error;
   1049 			/* Send error ACK */
   1050 			ip1dbg(("ip_rts_request: error %d\n", error));
   1051 		} else {
   1052 			rtm->rtm_flags |= RTF_DONE;
   1053 			/* OK ACK already set up by caller except this */
   1054 			ip2dbg(("ip_rts_request: OK ACK\n"));
   1055 		}
   1056 		rts_queue_input(mp, q, af, ipst);
   1057 	}
   1058 	iocp->ioc_error = error;
   1059 	ioc_mp->b_datap->db_type = M_IOCACK;
   1060 	if (iocp->ioc_error != 0)
   1061 		iocp->ioc_count = 0;
   1062 	(connp->conn_recv)(connp, ioc_mp, NULL);
   1063 	/* conn was refheld in ip_wput_ioctl. */
   1064 	CONN_OPER_PENDING_DONE(connp);
   1065 
   1066 	return (error);
   1067 }
   1068 
   1069 /*
   1070  * Build a reply to the RTM_GET request contained in the given message block
   1071  * using the retrieved IRE of the destination address, the parent IRE (if it
   1072  * exists) and the address family.
   1073  *
   1074  * Returns a pointer to a message block containing the reply if successful,
   1075  * otherwise NULL is returned.
   1076  */
   1077 static mblk_t *
   1078 rts_rtmget(mblk_t *mp, ire_t *ire, ire_t *sire, sa_family_t af)
   1079 {
   1080 	rt_msghdr_t	*rtm;
   1081 	rt_msghdr_t	*new_rtm;
   1082 	mblk_t		*new_mp;
   1083 	int		rtm_addrs;
   1084 	int		rtm_flags;
   1085 	in6_addr_t	gw_addr_v6;
   1086 	tsol_ire_gw_secattr_t *attrp = NULL;
   1087 	tsol_gc_t	*