Home | History | Annotate | Download | only in ip
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
     27 
     28 #include <sys/types.h>
     29 #include <sys/stream.h>
     30 #include <sys/strsubr.h>
     31 #include <sys/stropts.h>
     32 #include <sys/strsun.h>
     33 #include <sys/strlog.h>
     34 #define	_SUN_TPI_VERSION 2
     35 #include <sys/tihdr.h>
     36 #include <sys/timod.h>
     37 #include <sys/ddi.h>
     38 #include <sys/sunddi.h>
     39 #include <sys/cmn_err.h>
     40 #include <sys/proc.h>
     41 #include <sys/suntpi.h>
     42 #include <sys/policy.h>
     43 #include <sys/zone.h>
     44 
     45 #include <sys/socket.h>
     46 #include <netinet/in.h>
     47 
     48 #include <inet/common.h>
     49 #include <netinet/ip6.h>
     50 #include <inet/ip.h>
     51 #include <inet/ipclassifier.h>
     52 #include <inet/mi.h>
     53 #include <inet/nd.h>
     54 #include <inet/optcom.h>
     55 #include <netinet/ip_mroute.h>
     56 #include <sys/isa_defs.h>
     57 #include <net/route.h>
     58 
     59 #include <inet/rts_impl.h>
     60 #include <inet/ip_rts.h>
     61 
     62 /*
     63  * This is a transport provider for routing sockets.  Downstream messages are
     64  * wrapped with a IP_IOCTL header, and ip_wput_ioctl calls the appropriate entry
     65  * in the ip_ioctl_ftbl callout table to pass the routing socket data into IP.
     66  * Upstream messages are generated for listeners of the routing socket as well
     67  * as the message sender (unless they have turned off their end using
     68  * SO_USELOOPBACK or shutdown(3n)).  Upstream messages may also be generated
     69  * asynchronously when:
     70  *
     71  *	Interfaces are brought up or down.
     72  *	Addresses are assigned to interfaces.
     73  *	ICMP redirects are processed and a IRE_HOST/RTF_DYNAMIC is installed.
     74  *	No route is found while sending a packet.
     75  *	When TCP requests IP to remove an IRE_CACHE of a troubled destination.
     76  *
     77  * Since all we do is reformat the messages between routing socket and
     78  * ioctl forms, no synchronization is necessary in this module; all
     79  * the dirty work is done down in ip.
     80  */
     81 
     82 /* Default structure copied into T_INFO_ACK messages */
     83 static struct T_info_ack rts_g_t_info_ack = {
     84 	T_INFO_ACK,
     85 	T_INFINITE,	/* TSDU_size. Maximum size messages. */
     86 	T_INVALID,	/* ETSDU_size. No expedited data. */
     87 	T_INVALID,	/* CDATA_size. No connect data. */
     88 	T_INVALID,	/* DDATA_size. No disconnect data. */
     89 	0,		/* ADDR_size. */
     90 	0,		/* OPT_size - not initialized here */
     91 	64 * 1024,	/* TIDU_size. rts allows maximum size messages. */
     92 	T_COTS,		/* SERV_type. rts supports connection oriented. */
     93 	TS_UNBND,	/* CURRENT_state. This is set from rts_state. */
     94 	(XPG4_1)	/* PROVIDER_flag */
     95 };
     96 
     97 /*
     98  * Table of ND variables supported by rts. These are loaded into rts_g_nd
     99  * in rts_open.
    100  * All of these are alterable, within the min/max values given, at run time.
    101  */
    102 static rtsparam_t	lcl_param_arr[] = {
    103 	/* min		max		value		name */
    104 	{ 4096,		65536,		8192,		"rts_xmit_hiwat"},
    105 	{ 0,		65536,		1024,		"rts_xmit_lowat"},
    106 	{ 4096,		65536,		8192,		"rts_recv_hiwat"},
    107 	{ 65536,	1024*1024*1024, 256*1024,	"rts_max_buf"},
    108 };
    109 #define	rtss_xmit_hiwat		rtss_params[0].rts_param_value
    110 #define	rtss_xmit_lowat		rtss_params[1].rts_param_value
    111 #define	rtss_recv_hiwat		rtss_params[2].rts_param_value
    112 #define	rtss_max_buf		rtss_params[3].rts_param_value
    113 
    114 static int	rts_close(queue_t *q);
    115 static void 	rts_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error,
    116     int sys_error);
    117 static void	rts_input(void *, mblk_t *, void *);
    118 static mblk_t	*rts_ioctl_alloc(mblk_t *data, cred_t *cr);
    119 static int	rts_open(queue_t *q, dev_t *devp, int flag, int sflag,
    120     cred_t *credp);
    121 int		rts_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name,
    122     uchar_t *ptr);
    123 int		rts_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name,
    124     uchar_t *ptr);
    125 int		rts_opt_set(queue_t *q, uint_t optset_context, int level,
    126     int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp,
    127     uchar_t *outvalp, void *thisdg_attrs, cred_t *cr, mblk_t *mblk);
    128 static int	rts_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr);
    129 static boolean_t rts_param_register(IDP *ndp, rtsparam_t *rtspa, int cnt);
    130 static int	rts_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
    131     cred_t *cr);
    132 static void	rts_rsrv(queue_t *q);
    133 static void	*rts_stack_init(netstackid_t stackid, netstack_t *ns);
    134 static void	rts_stack_fini(netstackid_t stackid, void *arg);
    135 static void	rts_wput(queue_t *q, mblk_t *mp);
    136 static void	rts_wput_iocdata(queue_t *q, mblk_t *mp);
    137 static void 	rts_wput_other(queue_t *q, mblk_t *mp);
    138 static int	rts_wrw(queue_t *q, struiod_t *dp);
    139 
    140 static struct module_info rts_mod_info = {
    141 	129, "rts", 1, INFPSZ, 512, 128
    142 };
    143 
    144 static struct qinit rtsrinit = {
    145 	NULL, (pfi_t)rts_rsrv, rts_open, rts_close, NULL, &rts_mod_info
    146 };
    147 
    148 static struct qinit rtswinit = {
    149 	(pfi_t)rts_wput, NULL, NULL, NULL, NULL, &rts_mod_info,
    150 	NULL, (pfi_t)rts_wrw, NULL, STRUIOT_STANDARD
    151 };
    152 
    153 struct streamtab rtsinfo = {
    154 	&rtsrinit, &rtswinit
    155 };
    156 
    157 /*
    158  * This routine allocates the necessary
    159  * message blocks for IOCTL wrapping the
    160  * user data.
    161  */
    162 static mblk_t *
    163 rts_ioctl_alloc(mblk_t *data, cred_t *cr)
    164 {
    165 	mblk_t	*mp = NULL;
    166 	mblk_t	*mp1 = NULL;
    167 	ipllc_t	*ipllc;
    168 	struct iocblk	*ioc;
    169 
    170 	mp = allocb_cred(sizeof (ipllc_t), cr);
    171 	if (mp == NULL)
    172 		return (NULL);
    173 	mp1 = allocb_cred(sizeof (struct iocblk), cr);
    174 	if (mp1 == NULL) {
    175 		freeb(mp);
    176 		return (NULL);
    177 	}
    178 
    179 	ipllc = (ipllc_t *)mp->b_rptr;
    180 	ipllc->ipllc_cmd = IP_IOC_RTS_REQUEST;
    181 	ipllc->ipllc_name_offset = 0;
    182 	ipllc->ipllc_name_length = 0;
    183 	mp->b_wptr += sizeof (ipllc_t);
    184 	mp->b_cont = data;
    185 
    186 	ioc = (struct iocblk *)mp1->b_rptr;
    187 	ioc->ioc_cmd = IP_IOCTL;
    188 	ioc->ioc_error = 0;
    189 	ioc->ioc_cr = NULL;
    190 	ioc->ioc_count = msgdsize(mp);
    191 	mp1->b_wptr += sizeof (struct iocblk);
    192 	mp1->b_datap->db_type = M_IOCTL;
    193 	mp1->b_cont = mp;
    194 
    195 	return (mp1);
    196 }
    197 
    198 /*
    199  * This routine closes rts stream, by disabling
    200  * put/srv routines and freeing the this module
    201  * internal datastructure.
    202  */
    203 static int
    204 rts_close(queue_t *q)
    205 {
    206 	conn_t	*connp = Q_TO_CONN(q);
    207 
    208 	ASSERT(connp != NULL && IPCL_IS_RTS(connp));
    209 
    210 	ip_rts_unregister(connp);
    211 
    212 	ip_quiesce_conn(connp);
    213 
    214 	qprocsoff(q);
    215 
    216 	/*
    217 	 * Now we are truly single threaded on this stream, and can
    218 	 * delete the things hanging off the connp, and finally the connp.
    219 	 * We removed this connp from the fanout list, it cannot be
    220 	 * accessed thru the fanouts, and we already waited for the
    221 	 * conn_ref to drop to 0. We are already in close, so
    222 	 * there cannot be any other thread from the top. qprocsoff
    223 	 * has completed, and service has completed or won't run in
    224 	 * future.
    225 	 */
    226 	ASSERT(connp->conn_ref == 1);
    227 
    228 	inet_minor_free(connp->conn_minor_arena, connp->conn_dev);
    229 
    230 	connp->conn_ref--;
    231 	ipcl_conn_destroy(connp);
    232 
    233 	q->q_ptr = WR(q)->q_ptr = NULL;
    234 	return (0);
    235 }
    236 
    237 /*
    238  * This is the open routine for routing socket. It allocates
    239  * rts_t structure for the stream and tells IP that it is a routing socket.
    240  */
    241 /* ARGSUSED */
    242 static int
    243 rts_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
    244 {
    245 	rts_t	*rts;
    246 	conn_t *connp;
    247 	dev_t	conn_dev;
    248 	zoneid_t zoneid;
    249 	netstack_t *ns;
    250 	rts_stack_t *rtss;
    251 
    252 	/* If the stream is already open, return immediately. */
    253 	if (q->q_ptr != NULL)
    254 		return (0);
    255 
    256 	if (sflag == MODOPEN)
    257 		return (EINVAL);
    258 
    259 	ns = netstack_find_by_cred(credp);
    260 	ASSERT(ns != NULL);
    261 	rtss = ns->netstack_rts;
    262 	ASSERT(rtss != NULL);
    263 
    264 	/*
    265 	 * For exclusive stacks we set the zoneid to zero
    266 	 * to make RTS operate as if in the global zone.
    267 	 */
    268 	if (ns->netstack_stackid != GLOBAL_NETSTACKID)
    269 		zoneid = GLOBAL_ZONEID;
    270 	else
    271 		zoneid = crgetzoneid(credp);
    272 
    273 	/*
    274 	 * Since RTS is not used so heavily, allocating from the small
    275 	 * arena should be sufficient.
    276 	 */
    277 	if ((conn_dev = inet_minor_alloc(ip_minor_arena_sa)) == 0) {
    278 		netstack_rele(ns);
    279 		return (EBUSY);
    280 	}
    281 	*devp = makedevice(getemajor(*devp), (minor_t)conn_dev);
    282 
    283 	connp = ipcl_conn_create(IPCL_RTSCONN, KM_SLEEP, ns);
    284 	connp->conn_dev = conn_dev;
    285 	connp->conn_minor_arena = ip_minor_arena_sa;
    286 	rts = connp->conn_rts;
    287 
    288 	/*
    289 	 * ipcl_conn_create did a netstack_hold. Undo the hold that was
    290 	 * done by netstack_find_by_cred()
    291 	 */
    292 	netstack_rele(ns);
    293 
    294 	/*
    295 	 * Initialize the rts_t structure for this stream.
    296 	 */
    297 	q->q_ptr = connp;
    298 	WR(q)->q_ptr = connp;
    299 	connp->conn_rq = q;
    300 	connp->conn_wq = WR(q);
    301 
    302 	rw_enter(&rts->rts_rwlock, RW_WRITER);
    303 	ASSERT(connp->conn_rts == rts);
    304 	ASSERT(rts->rts_connp == connp);
    305 
    306 	/* Set the initial state of the stream and the privilege status. */
    307 	rts->rts_state = TS_UNBND;
    308 	connp->conn_zoneid = zoneid;
    309 
    310 	connp->conn_ulp_labeled = is_system_labeled();
    311 
    312 	rts->rts_rtss = rtss;
    313 
    314 	q->q_hiwat = rtss->rtss_recv_hiwat;
    315 	WR(q)->q_hiwat = rtss->rtss_xmit_hiwat;
    316 	WR(q)->q_lowat = rtss->rtss_xmit_lowat;
    317 
    318 	connp->conn_recv = rts_input;
    319 	crhold(credp);
    320 	connp->conn_cred = credp;
    321 
    322 	mutex_enter(&connp->conn_lock);
    323 	connp->conn_state_flags &= ~CONN_INCIPIENT;
    324 	mutex_exit(&connp->conn_lock);
    325 
    326 	qprocson(q);
    327 	rw_exit(&rts->rts_rwlock);
    328 
    329 	/*
    330 	 * Indicate the down IP module that this is a routing socket
    331 	 * client by sending an RTS IOCTL without any user data. Although
    332 	 * this is just a notification message (without any real routing
    333 	 * request), we pass in any credential for correctness sake.
    334 	 */
    335 	ip_rts_register(connp);
    336 
    337 	return (0);
    338 
    339 }
    340 
    341 /*
    342  * This routine creates a T_ERROR_ACK message and passes it upstream.
    343  */
    344 static void
    345 rts_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error)
    346 {
    347 	if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL)
    348 		qreply(q, mp);
    349 }
    350 
    351 /*
    352  * This routine creates a T_OK_ACK message and passes it upstream.
    353  */
    354 static void
    355 rts_ok_ack(queue_t *q, mblk_t *mp)
    356 {
    357 	if ((mp = mi_tpi_ok_ack_alloc(mp)) != NULL)
    358 		qreply(q, mp);
    359 }
    360 
    361 /*
    362  * This routine is called by rts_wput to handle T_UNBIND_REQ messages.
    363  */
    364 static void
    365 rts_unbind(queue_t *q, mblk_t *mp)
    366 {
    367 	conn_t	*connp = Q_TO_CONN(q);
    368 	rts_t	*rts = connp->conn_rts;
    369 
    370 	/* If a bind has not been done, we can't unbind. */
    371 	if (rts->rts_state != TS_IDLE) {
    372 		rts_err_ack(q, mp, TOUTSTATE, 0);
    373 		return;
    374 	}
    375 	rts->rts_state = TS_UNBND;
    376 	rts_ok_ack(q, mp);
    377 }
    378 
    379 /*
    380  * This routine is called to handle each
    381  * O_T_BIND_REQ/T_BIND_REQ message passed to
    382  * rts_wput. Note: This routine works with both
    383  * O_T_BIND_REQ and T_BIND_REQ semantics.
    384  */
    385 static void
    386 rts_bind(queue_t *q, mblk_t *mp)
    387 {
    388 	conn_t	*connp = Q_TO_CONN(q);
    389 	rts_t	*rts = connp->conn_rts;
    390 	mblk_t	*mp1;
    391 	struct T_bind_req *tbr;
    392 
    393 	if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) {
    394 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
    395 		    "rts_bind: bad data, %d", rts->rts_state);
    396 		rts_err_ack(q, mp, TBADADDR, 0);
    397 		return;
    398 	}
    399 	if (rts->rts_state != TS_UNBND) {
    400 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
    401 		    "rts_bind: bad state, %d", rts->rts_state);
    402 		rts_err_ack(q, mp, TOUTSTATE, 0);
    403 		return;
    404 	}
    405 	/*
    406 	 * Reallocate the message to make sure we have enough room for an
    407 	 * address and the protocol type.
    408 	 */
    409 	mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin_t), 1);
    410 	if (mp1 == NULL) {
    411 		rts_err_ack(q, mp, TSYSERR, ENOMEM);
    412 		return;
    413 	}
    414 	mp = mp1;
    415 	tbr = (struct T_bind_req *)mp->b_rptr;
    416 	if (tbr->ADDR_length != 0) {
    417 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
    418 		    "rts_bind: bad ADDR_length %d", tbr->ADDR_length);
    419 		rts_err_ack(q, mp, TBADADDR, 0);
    420 		return;
    421 	}
    422 	/* Generic request */
    423 	tbr->ADDR_offset = (t_scalar_t)sizeof (struct T_bind_req);
    424 	tbr->ADDR_length = 0;
    425 	tbr->PRIM_type = T_BIND_ACK;
    426 	rts->rts_state = TS_IDLE;
    427 	qreply(q, mp);
    428 }
    429 
    430 static void
    431 rts_copy_info(struct T_info_ack *tap, rts_t *rts)
    432 {
    433 	*tap = rts_g_t_info_ack;
    434 	tap->CURRENT_state = rts->rts_state;
    435 	tap->OPT_size = rts_max_optsize;
    436 }
    437 
    438 /*
    439  * This routine responds to T_CAPABILITY_REQ messages.  It is called by
    440  * rts_wput.  Much of the T_CAPABILITY_ACK information is copied from
    441  * rts_g_t_info_ack.  The current state of the stream is copied from
    442  * rts_state.
    443  */
    444 static void
    445 rts_capability_req(queue_t *q, mblk_t *mp)
    446 {
    447 	conn_t	*connp = Q_TO_CONN(q);
    448 	rts_t	*rts = connp->conn_rts;
    449 	t_uscalar_t		cap_bits1;
    450 	struct T_capability_ack	*tcap;
    451 
    452 	cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1;
    453 
    454 	mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack),
    455 	    mp->b_datap->db_type, T_CAPABILITY_ACK);
    456 	if (mp == NULL)
    457 		return;
    458 
    459 	tcap = (struct T_capability_ack *)mp->b_rptr;
    460 	tcap->CAP_bits1 = 0;
    461 
    462 	if (cap_bits1 & TC1_INFO) {
    463 		rts_copy_info(&tcap->INFO_ack, rts);
    464 		tcap->CAP_bits1 |= TC1_INFO;
    465 	}
    466 
    467 	qreply(q, mp);
    468 }
    469 
    470 /*
    471  * This routine responds to T_INFO_REQ messages.  It is called by rts_wput.
    472  * Most of the T_INFO_ACK information is copied from rts_g_t_info_ack.
    473  * The current state of the stream is copied from rts_state.
    474  */
    475 static void
    476 rts_info_req(queue_t *q, mblk_t *mp)
    477 {
    478 	conn_t	*connp = Q_TO_CONN(q);
    479 	rts_t	*rts = connp->conn_rts;
    480 
    481 	mp = tpi_ack_alloc(mp, sizeof (rts_g_t_info_ack), M_PCPROTO,
    482 	    T_INFO_ACK);
    483 	if (mp == NULL)
    484 		return;
    485 	rts_copy_info((struct T_info_ack *)mp->b_rptr, rts);
    486 	qreply(q, mp);
    487 }
    488 
    489 /*
    490  * This routine gets default values of certain options whose default
    491  * values are maintained by protcol specific code
    492  */
    493 /* ARGSUSED */
    494 int
    495 rts_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr)
    496 {
    497 	/* no default value processed by protocol specific code currently */
    498 	return (-1);
    499 }
    500 
    501 /*
    502  * This routine retrieves the current status of socket options.
    503  * It returns the size of the option retrieved.
    504  */
    505 int
    506 rts_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr)
    507 {
    508 	int	*i1 = (int *)ptr;
    509 	conn_t	*connp = Q_TO_CONN(q);
    510 	rts_t	*rts = connp->conn_rts;
    511 
    512 	switch (level) {
    513 	case SOL_SOCKET:
    514 		switch (name) {
    515 		case SO_DEBUG:
    516 			*i1 = rts->rts_debug;
    517 			break;
    518 		case SO_REUSEADDR:
    519 			*i1 = rts->rts_reuseaddr;
    520 			break;
    521 		case SO_TYPE:
    522 			*i1 = SOCK_RAW;
    523 			break;
    524 
    525 		/*
    526 		 * The following three items are available here,
    527 		 * but are only meaningful to IP.
    528 		 */
    529 		case SO_DONTROUTE:
    530 			*i1 = rts->rts_dontroute;
    531 			break;
    532 		case SO_USELOOPBACK:
    533 			*i1 = rts->rts_useloopback;
    534 			break;
    535 		case SO_BROADCAST:
    536 			*i1 = rts->rts_broadcast;
    537 			break;
    538 		case SO_PROTOTYPE:
    539 			*i1 = rts->rts_proto;
    540 			break;
    541 		/*
    542 		 * The following two items can be manipulated,
    543 		 * but changing them should do nothing.
    544 		 */
    545 		case SO_SNDBUF:
    546 			ASSERT(q->q_hiwat <= INT_MAX);
    547 			*i1 = (int)(q->q_hiwat);
    548 			break;
    549 		case SO_RCVBUF:
    550 			ASSERT(q->q_hiwat <= INT_MAX);
    551 			*i1 = (int)(RD(q)->q_hiwat);
    552 			break;
    553 		case SO_DOMAIN:
    554 			*i1 = PF_ROUTE;
    555 			break;
    556 		default:
    557 			return (-1);
    558 		}
    559 		break;
    560 	default:
    561 		return (-1);
    562 	}
    563 	return ((int)sizeof (int));
    564 }
    565 
    566 
    567 /*
    568  * This routine sets socket options.
    569  */
    570 /*ARGSUSED*/
    571 int
    572 rts_opt_set(queue_t *q, uint_t optset_context, int level,
    573     int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp,
    574     uchar_t *outvalp, void *thisdg_attrs, cred_t *cr, mblk_t *mblk)
    575 {
    576 	int	*i1 = (int *)invalp;
    577 	conn_t	*connp = Q_TO_CONN(q);
    578 	rts_t	*rts = connp->conn_rts;
    579 	boolean_t checkonly;
    580 	rts_stack_t	*rtss = rts->rts_rtss;
    581 
    582 	switch (optset_context) {
    583 	case SETFN_OPTCOM_CHECKONLY:
    584 		checkonly = B_TRUE;
    585 		/*
    586 		 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
    587 		 * inlen != 0 implies value supplied and
    588 		 * 	we have to "pretend" to set it.
    589 		 * inlen == 0 implies that there is no
    590 		 * 	value part in T_CHECK request and just validation
    591 		 * done elsewhere should be enough, we just return here.
    592 		 */
    593 		if (inlen == 0) {
    594 			*outlenp = 0;
    595 			return (0);
    596 		}
    597 		break;
    598 	case SETFN_OPTCOM_NEGOTIATE:
    599 		checkonly = B_FALSE;
    600 		break;
    601 	case SETFN_UD_NEGOTIATE:
    602 	case SETFN_CONN_NEGOTIATE:
    603 		checkonly = B_FALSE;
    604 		/*
    605 		 * Negotiating local and "association-related" options
    606 		 * through T_UNITDATA_REQ or T_CONN_{REQ,CON}
    607 		 * Not allowed in this module.
    608 		 */
    609 		return (EINVAL);
    610 	default:
    611 		/*
    612 		 * We should never get here
    613 		 */
    614 		*outlenp = 0;
    615 		return (EINVAL);
    616 	}
    617 
    618 	ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
    619 	    (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));
    620 
    621 	/*
    622 	 * For rts, we should have no ancillary data sent down
    623 	 * (rts_wput doesn't handle options).
    624 	 */
    625 	ASSERT(thisdg_attrs == NULL);
    626 
    627 	/*
    628 	 * For fixed length options, no sanity check
    629 	 * of passed in length is done. It is assumed *_optcom_req()
    630 	 * routines do the right thing.
    631 	 */
    632 
    633 	switch (level) {
    634 	case SOL_SOCKET:
    635 		switch (name) {
    636 		case SO_REUSEADDR:
    637 			if (!checkonly)
    638 				rts->rts_reuseaddr = *i1;
    639 			break;	/* goto sizeof (int) option return */
    640 		case SO_DEBUG:
    641 			if (!checkonly)
    642 				rts->rts_debug = *i1;
    643 			break;	/* goto sizeof (int) option return */
    644 		/*
    645 		 * The following three items are available here,
    646 		 * but are only meaningful to IP.
    647 		 */
    648 		case SO_DONTROUTE:
    649 			if (!checkonly)
    650 				rts->rts_dontroute = *i1;
    651 			break;	/* goto sizeof (int) option return */
    652 		case SO_USELOOPBACK:
    653 			if (!checkonly)
    654 				rts->rts_useloopback = *i1;
    655 			break;	/* goto sizeof (int) option return */
    656 		case SO_BROADCAST:
    657 			if (!checkonly)
    658 				rts->rts_broadcast = *i1;
    659 			break;	/* goto sizeof (int) option return */
    660 		case SO_PROTOTYPE:
    661 			/*
    662 			 * Routing socket applications that call socket() with
    663 			 * a third argument can filter which messages will be
    664 			 * sent upstream thanks to sockfs.  so_socket() sends
    665 			 * down the SO_PROTOTYPE and rts_queue_input()
    666 			 * implements the filtering.
    667 			 */
    668 			if (*i1 != AF_INET && *i1 != AF_INET6)
    669 				return (EPROTONOSUPPORT);
    670 			if (!checkonly)
    671 				rts->rts_proto = *i1;
    672 			break;	/* goto sizeof (int) option return */
    673 		/*
    674 		 * The following two items can be manipulated,
    675 		 * but changing them should do nothing.
    676 		 */
    677 		case SO_SNDBUF:
    678 			if (*i1 > rtss->rtss_max_buf) {
    679 				*outlenp = 0;
    680 				return (ENOBUFS);
    681 			}
    682 			if (!checkonly) {
    683 				q->q_hiwat = *i1;
    684 			}
    685 			break;	/* goto sizeof (int) option return */
    686 		case SO_RCVBUF:
    687 			if (*i1 > rtss->rtss_max_buf) {
    688 				*outlenp = 0;
    689 				return (ENOBUFS);
    690 			}
    691 			if (!checkonly) {
    692 				RD(q)->q_hiwat = *i1;
    693 				(void) mi_set_sth_hiwat(RD(q), *i1);
    694 			}
    695 			break;	/* goto sizeof (int) option return */
    696 		default:
    697 			*outlenp = 0;
    698 			return (EINVAL);
    699 		}
    700 		break;
    701 	default:
    702 		*outlenp = 0;
    703 		return (EINVAL);
    704 	}
    705 	/*
    706 	 * Common case of return from an option that is sizeof (int)
    707 	 */
    708 	*(int *)outvalp = *i1;
    709 	*outlenp = (t_uscalar_t)sizeof (int);
    710 	return (0);
    711 }
    712 
    713 /*
    714  * This routine retrieves the value of an ND variable in a rtsparam_t
    715  * structure. It is called through nd_getset when a user reads the
    716  * variable.
    717  */
    718 /* ARGSUSED */
    719 static int
    720 rts_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
    721 {
    722 	rtsparam_t	*rtspa = (rtsparam_t *)cp;
    723 
    724 	(void) mi_mpprintf(mp, "%u", rtspa->rts_param_value);
    725 	return (0);
    726 }
    727 
    728 /*
    729  * Walk through the param array specified registering each element with the
    730  * named dispatch (ND) handler.
    731  */
    732 static boolean_t
    733 rts_param_register(IDP *ndp, rtsparam_t *rtspa, int cnt)
    734 {
    735 	for (; cnt-- > 0; rtspa++) {
    736 		if (rtspa->rts_param_name != NULL && rtspa->rts_param_name[0]) {
    737 			if (!nd_load(ndp, rtspa->rts_param_name,
    738 			    rts_param_get, rts_param_set, (caddr_t)rtspa)) {
    739 				nd_free(ndp);
    740 				return (B_FALSE);
    741 			}
    742 		}
    743 	}
    744 	return (B_TRUE);
    745 }
    746 
    747 /* This routine sets an ND variable in a rtsparam_t structure. */
    748 /* ARGSUSED */
    749 static int
    750 rts_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr)
    751 {
    752 	ulong_t	new_value;
    753 	rtsparam_t	*rtspa = (rtsparam_t *)cp;
    754 
    755 	/*
    756 	 * Fail the request if the new value does not lie within the
    757 	 * required bounds.
    758 	 */
    759 	if (ddi_strtoul(value, NULL, 10, &new_value) != 0 ||
    760 	    new_value < rtspa->rts_param_min ||
    761 	    new_value > rtspa->rts_param_max) {
    762 		return (EINVAL);
    763 	}
    764 
    765 	/* Set the new value */
    766 	rtspa->rts_param_value = new_value;
    767 	return (0);
    768 }
    769 
    770 /*
    771  * Empty rsrv routine which is used by rts_input to cause a wakeup
    772  * of a thread in qwait.
    773  */
    774 /*ARGSUSED*/
    775 static void
    776 rts_rsrv(queue_t *q)
    777 {
    778 }
    779 
    780 /*
    781  * This routine handles synchronous messages passed downstream. It either
    782  * consumes the message or passes it downstream; it never queues a
    783  * a message. The data messages that go down are wrapped in an IOCTL
    784  * message.
    785  *
    786  * Since it is synchronous, it waits for the M_IOCACK/M_IOCNAK so that
    787  * it can return an immediate error (such as ENETUNREACH when adding a route).
    788  * It uses the RTS_WRW_PENDING to ensure that each rts instance has only
    789  * one M_IOCTL outstanding at any given time.
    790  */
    791 static int
    792 rts_wrw(queue_t *q, struiod_t *dp)
    793 {
    794 	mblk_t	*mp = dp->d_mp;
    795 	mblk_t	*mp1;
    796 	int	error;
    797 	rt_msghdr_t	*rtm;
    798 	conn_t	*connp = Q_TO_CONN(q);
    799 	rts_t	*rts = connp->conn_rts;
    800 
    801 	while (rts->rts_flag & RTS_WRW_PENDING) {
    802 		if (qwait_rw(q)) {
    803 			rts->rts_error = EINTR;
    804 			goto err_ret;
    805 		}
    806 		}
    807 	rts->rts_flag |= RTS_WRW_PENDING;
    808 
    809 	if (isuioq(q) && (error = struioget(q, mp, dp, 0))) {
    810 		/*
    811 		 * Uio error of some sort, so just return the error.
    812 		 */
    813 		rts->rts_error = error;
    814 		goto err_ret;
    815 	}
    816 	/*
    817 	 * Pass the mblk (chain) onto wput().
    818 	 */
    819 	dp->d_mp = 0;
    820 
    821 	switch (mp->b_datap->db_type) {
    822 	case M_PROTO:
    823 	case M_PCPROTO:
    824 		/* Expedite other than T_DATA_REQ to below the switch */
    825 		if (((mp->b_wptr - mp->b_rptr) !=
    826 		    sizeof (struct T_data_req)) ||
    827 		    (((union T_primitives *)mp->b_rptr)->type != T_DATA_REQ))
    828 			break;
    829 		if ((mp1 = mp->b_cont) == NULL) {
    830 			rts->rts_error = EINVAL;
    831 			goto err_ret;
    832 		}
    833 		freeb(mp);
    834 		mp = mp1;
    835 		/* FALLTHRU */
    836 	case M_DATA:
    837 		/*
    838 		 * The semantics of the routing socket is such that the rtm_pid
    839 		 * field is automatically filled in during requests with the
    840 		 * current process' pid.  We do this here (where we still have
    841 		 * user context) after checking we have at least a message the
    842 		 * size of a routing message header.
    843 		 */
    844 		if ((mp->b_wptr - mp->b_rptr) < sizeof (rt_msghdr_t)) {
    845 			if (!pullupmsg(mp, sizeof (rt_msghdr_t))) {
    846 				rts->rts_error = EINVAL;
    847 				goto err_ret;
    848 			}
    849 		}
    850 		rtm = (rt_msghdr_t *)mp->b_rptr;
    851 		rtm->rtm_pid = curproc->p_pid;
    852 		break;
    853 	default:
    854 		break;
    855 	}
    856 	rts->rts_flag |= RTS_WPUT_PENDING;
    857 	rts_wput(q, mp);
    858 	while (rts->rts_flag & RTS_WPUT_PENDING)
    859 		if (qwait_rw(q)) {
    860 			/* RTS_WPUT_PENDING will be cleared below */
    861 			rts->rts_error = EINTR;
    862 			break;
    863 		}
    864 err_ret:
    865 	rts->rts_flag &= ~(RTS_WPUT_PENDING | RTS_WRW_PENDING);
    866 	return (rts->rts_error);
    867 }
    868 
    869 /*
    870  * This routine handles all messages passed downstream. It either
    871  * consumes the message or passes it downstream; it never queues a
    872  * a message. The data messages that go down are wrapped in an IOCTL
    873  * message.
    874  *
    875  * FIXME? Should we call IP rts_request directly? Could punt on returning
    876  * errno in the case when it defers processing due to
    877  * IPIF_CHANGING/ILL_CHANGING???
    878  */
    879 static void
    880 rts_wput(queue_t *q, mblk_t *mp)
    881 {
    882 	uchar_t	*rptr = mp->b_rptr;
    883 	mblk_t	*mp1;
    884 	conn_t	*connp = Q_TO_CONN(q);
    885 	rts_t	*rts = connp->conn_rts;
    886 
    887 	switch (mp->b_datap->db_type) {
    888 	case M_DATA:
    889 		break;
    890 	case M_PROTO:
    891 	case M_PCPROTO:
    892 		if ((mp->b_wptr - rptr) == sizeof (struct T_data_req)) {
    893 			/* Expedite valid T_DATA_REQ to below the switch */
    894 			if (((union T_primitives *)rptr)->type == T_DATA_REQ) {
    895 				mp1 = mp->b_cont;
    896 				freeb(mp);
    897 				if (mp1 == NULL)
    898 					return;
    899 				mp = mp1;
    900 				break;
    901 			}
    902 		}
    903 		/* FALLTHRU */
    904 	default:
    905 		rts_wput_other(q, mp);
    906 		return;
    907 	}
    908 
    909 
    910 	mp1 = rts_ioctl_alloc(mp, DB_CRED(mp));
    911 	if (mp1 == NULL) {
    912 		ASSERT(rts != NULL);
    913 		freemsg(mp);
    914 		if (rts->rts_flag & RTS_WPUT_PENDING) {
    915 			rts->rts_error = ENOMEM;
    916 			rts->rts_flag &= ~RTS_WPUT_PENDING;
    917 		}
    918 		return;
    919 	}
    920 	ip_output(connp, mp1, q, IP_WPUT);
    921 }
    922 
    923 
    924 /*
    925  * Handles all the control message, if it
    926  * can not understand it, it will
    927  * pass down stream.
    928  */
    929 static void
    930 rts_wput_other(queue_t *q, mblk_t *mp)
    931 {
    932 	conn_t	*connp = Q_TO_CONN(q);
    933 	rts_t	*rts = connp->conn_rts;
    934 	uchar_t	*rptr = mp->b_rptr;
    935 	struct iocblk	*iocp;
    936 	cred_t	*cr;
    937 	rts_stack_t	*rtss;
    938 
    939 	rtss = rts->rts_rtss;
    940 
    941 	cr = DB_CREDDEF(mp, connp->conn_cred);
    942 
    943 	switch (mp->b_datap->db_type) {
    944 	case M_PROTO:
    945 	case M_PCPROTO:
    946 		if ((mp->b_wptr - rptr) < sizeof (t_scalar_t)) {
    947 			/*
    948 			 * If the message does not contain a PRIM_type,
    949 			 * throw it away.
    950 			 */
    951 			freemsg(mp);
    952 			return;
    953 		}
    954 		switch (((union T_primitives *)rptr)->type) {
    955 		case T_BIND_REQ:
    956 		case O_T_BIND_REQ:
    957 			rts_bind(q, mp);
    958 			return;
    959 		case T_UNBIND_REQ:
    960 			rts_unbind(q, mp);
    961 			return;
    962 		case T_CAPABILITY_REQ:
    963 			rts_capability_req(q, mp);
    964 			return;
    965 		case T_INFO_REQ:
    966 			rts_info_req(q, mp);
    967 			return;
    968 		case T_SVR4_OPTMGMT_REQ:
    969 			(void) svr4_optcom_req(q, mp, cr, &rts_opt_obj,
    970 			    B_TRUE);
    971 			return;
    972 		case T_OPTMGMT_REQ:
    973 			(void) tpi_optcom_req(q, mp, cr, &rts_opt_obj, B_TRUE);
    974 			return;
    975 		case O_T_CONN_RES:
    976 		case T_CONN_RES:
    977 		case T_DISCON_REQ:
    978 			/* Not supported by rts. */
    979 			rts_err_ack(q, mp, TNOTSUPPORT, 0);
    980 			return;
    981 		case T_DATA_REQ:
    982 		case T_EXDATA_REQ:
    983 		case T_ORDREL_REQ:
    984 			/* Illegal for rts. */
    985 			freemsg(mp);
    986 			(void) putnextctl1(RD(q), M_ERROR, EPROTO);
    987 			return;
    988 		default:
    989 			break;
    990 		}
    991 		break;
    992 	case M_IOCTL:
    993 		iocp = (struct iocblk *)mp->b_rptr;
    994 		switch (iocp->ioc_cmd) {
    995 		case ND_SET:
    996 		case ND_GET:
    997 			if (nd_getset(q, rtss->rtss_g_nd, mp)) {
    998 				qreply(q, mp);
    999 				return;
   1000 			}
   1001 			break;
   1002 		case TI_GETPEERNAME:
   1003 			mi_copyin(q, mp, NULL,
   1004 			    SIZEOF_STRUCT(strbuf, iocp->ioc_flag));
   1005 			return;
   1006 		default:
   1007 			break;
   1008 		}
   1009 	case M_IOCDATA:
   1010 		rts_wput_iocdata(q, mp);
   1011 		return;
   1012 	default:
   1013 		break;
   1014 	}
   1015 	ip_output(connp, mp, q, IP_WPUT);
   1016 }
   1017 
   1018 /*
   1019  * Called by rts_wput_other to handle all M_IOCDATA messages.
   1020  */
   1021 static void
   1022 rts_wput_iocdata(queue_t *q, mblk_t *mp)
   1023 {
   1024 	conn_t *connp = Q_TO_CONN(q);
   1025 	struct sockaddr	*rtsaddr;
   1026 	mblk_t	*mp1;
   1027 	STRUCT_HANDLE(strbuf, sb);
   1028 	struct iocblk	*iocp	= (struct iocblk *)mp->b_rptr;
   1029 
   1030 	/* Make sure it is one of ours. */
   1031 	switch (iocp->ioc_cmd) {
   1032 	case TI_GETPEERNAME:
   1033 		break;
   1034 	default:
   1035 		ip_output(connp, mp, q, IP_WPUT);
   1036 		return;
   1037 	}
   1038 	switch (mi_copy_state(q, mp, &mp1)) {
   1039 	case -1:
   1040 		return;
   1041 	case MI_COPY_CASE(MI_COPY_IN, 1):
   1042 		break;
   1043 	case MI_COPY_CASE(MI_COPY_OUT, 1):
   1044 		/* Copy out the strbuf. */
   1045 		mi_copyout(q, mp);
   1046 		return;
   1047 	case MI_COPY_CASE(MI_COPY_OUT, 2):
   1048 		/* All done. */
   1049 		mi_copy_done(q, mp, 0);
   1050 		return;
   1051 	default:
   1052 		mi_copy_done(q, mp, EPROTO);
   1053 		return;
   1054 	}
   1055 	STRUCT_SET_HANDLE(sb, iocp->ioc_flag, (void *)mp1->b_rptr);
   1056 	if (STRUCT_FGET(sb, maxlen) < (int)sizeof (sin_t)) {
   1057 		mi_copy_done(q, mp, EINVAL);
   1058 		return;
   1059 	}
   1060 	switch (iocp->ioc_cmd) {
   1061 	case TI_GETPEERNAME:
   1062 		break;
   1063 	default:
   1064 		mi_copy_done(q, mp, EPROTO);
   1065 		return;
   1066 	}
   1067 	mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), sizeof (sin_t),
   1068 	    B_TRUE);
   1069 	if (mp1 == NULL)
   1070 		return;
   1071 	STRUCT_FSET(sb, len, (int)sizeof (sin_t));
   1072 	rtsaddr = (struct sockaddr *)mp1->b_rptr;
   1073 	mp1->b_wptr = (uchar_t *)&rtsaddr[1];
   1074 	bzero(rtsaddr, sizeof (struct sockaddr));
   1075 	rtsaddr->sa_family = AF_ROUTE;
   1076 	/* Copy out the address */
   1077 	mi_copyout(q, mp);
   1078 }
   1079 
   1080 /*ARGSUSED2*/
   1081 static void
   1082 rts_input(void *arg1, mblk_t *mp, void *arg2)
   1083 {
   1084 	conn_t *connp = (conn_t *)arg1;
   1085 	rts_t	*rts = connp->conn_rts;
   1086 	struct iocblk	*iocp;
   1087 	mblk_t *mp1;
   1088 	struct T_data_ind *tdi;
   1089 
   1090 	switch (mp->b_datap->db_type) {
   1091 	case M_IOCACK:
   1092 	case M_IOCNAK:
   1093 		iocp = (struct iocblk *)mp->b_rptr;
   1094 		if (rts->rts_flag & (RTS_WPUT_PENDING)) {
   1095 			rts->rts_flag &= ~RTS_WPUT_PENDING;
   1096 			rts->rts_error = iocp->ioc_error;
   1097 			/*
   1098 			 * Tell rts_wvw/qwait that we are done.
   1099 			 * Note: there is no qwait_wakeup() we can use.
   1100 			 */
   1101 			qenable(connp->conn_rq);
   1102 			freemsg(mp);
   1103 			return;
   1104 		}
   1105 		break;
   1106 	case M_DATA:
   1107 		/*
   1108 		 * Prepend T_DATA_IND to prevent the stream head from
   1109 		 * consolidating multiple messages together.
   1