Home | History | Annotate | Download | only in ip
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 /* Copyright (c) 1990 Mentat Inc. */
     26 
     27 
     28 #pragma ident	"%Z%%M%	%I%	%E% SMI"
     29 
     30 #include <sys/types.h>
     31 #include <sys/stream.h>
     32 #include <sys/stropts.h>
     33 #include <sys/strlog.h>
     34 #include <sys/strsun.h>
     35 #define	_SUN_TPI_VERSION 2
     36 #include <sys/tihdr.h>
     37 #include <sys/timod.h>
     38 #include <sys/ddi.h>
     39 #include <sys/sunddi.h>
     40 #include <sys/strsubr.h>
     41 #include <sys/cmn_err.h>
     42 #include <sys/debug.h>
     43 #include <sys/kmem.h>
     44 #include <sys/policy.h>
     45 #include <sys/priv.h>
     46 #include <sys/zone.h>
     47 #include <sys/time.h>
     48 
     49 #include <sys/socket.h>
     50 #include <sys/isa_defs.h>
     51 #include <sys/suntpi.h>
     52 #include <sys/xti_inet.h>
     53 #include <sys/netstack.h>
     54 
     55 #include <net/route.h>
     56 #include <net/if.h>
     57 
     58 #include <netinet/in.h>
     59 #include <netinet/ip6.h>
     60 #include <netinet/icmp6.h>
     61 #include <inet/common.h>
     62 #include <inet/ip.h>
     63 #include <inet/ip6.h>
     64 #include <inet/mi.h>
     65 #include <inet/nd.h>
     66 #include <inet/optcom.h>
     67 #include <inet/snmpcom.h>
     68 #include <inet/kstatcom.h>
     69 #include <inet/rawip_impl.h>
     70 
     71 #include <netinet/ip_mroute.h>
     72 #include <inet/tcp.h>
     73 #include <net/pfkeyv2.h>
     74 #include <inet/ipsec_info.h>
     75 #include <inet/ipclassifier.h>
     76 
     77 #include <sys/tsol/label.h>
     78 #include <sys/tsol/tnet.h>
     79 
     80 #include <inet/ip_ire.h>
     81 #include <inet/ip_if.h>
     82 
     83 #include <inet/ip_impl.h>
     84 
     85 /*
     86  * Synchronization notes:
     87  *
 * RAWIP is MT and uses the usual kernel synchronization primitives. There is
 * one lock, which is icmp_rwlock. We also use conn_lock when updating things
     90  * which affect the IP classifier lookup.
     91  * The lock order is icmp_rwlock -> conn_lock.
     92  *
     93  * The icmp_rwlock:
     94  * This protects most of the other fields in the icmp_t. The exact list of
     95  * fields which are protected by each of the above locks is documented in
     96  * the icmp_t structure definition.
     97  *
     98  * Plumbing notes:
     99  * ICMP is always a device driver. For compatibility with mibopen() code
    100  * it is possible to I_PUSH "icmp", but that results in pushing a passthrough
    101  * dummy module.
    102  */
    103 
    104 static void	icmp_addr_req(queue_t *q, mblk_t *mp);
    105 static void	icmp_bind(queue_t *q, mblk_t *mp);
    106 static void	icmp_bind_proto(queue_t *q);
    107 static void	icmp_bind_result(conn_t *, mblk_t *);
    108 static void	icmp_bind_ack(conn_t *, mblk_t *mp);
    109 static void	icmp_bind_error(conn_t *, mblk_t *mp);
    110 static int	icmp_build_hdrs(icmp_t *icmp);
    111 static void	icmp_capability_req(queue_t *q, mblk_t *mp);
    112 static int	icmp_close(queue_t *q);
    113 static void	icmp_connect(queue_t *q, mblk_t *mp);
    114 static void	icmp_disconnect(queue_t *q, mblk_t *mp);
    115 static void	icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error,
    116 		    int sys_error);
    117 static void	icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive,
    118 		    t_scalar_t t_error, int sys_error);
    119 static void	icmp_icmp_error(queue_t *q, mblk_t *mp);
    120 static void	icmp_icmp_error_ipv6(queue_t *q, mblk_t *mp);
    121 static void	icmp_info_req(queue_t *q, mblk_t *mp);
    122 static void	icmp_input(void *, mblk_t *, void *);
    123 static mblk_t	*icmp_ip_bind_mp(icmp_t *icmp, t_scalar_t bind_prim,
    124 		    t_scalar_t addr_length, in_port_t);
    125 static int	icmp_open(queue_t *q, dev_t *devp, int flag, int sflag,
    126 		    cred_t *credp, boolean_t isv6);
    127 static int	icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag,
    128 		    cred_t *credp);
    129 static int	icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag,
    130 		    cred_t *credp);
    131 static void	icmp_output(queue_t *q, mblk_t *mp);
    132 static int	icmp_unitdata_opt_process(queue_t *q, mblk_t *mp,
    133 		    int *errorp, void *thisdg_attrs);
    134 static boolean_t icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name);
    135 int		icmp_opt_set(queue_t *q, uint_t optset_context,
    136 		    int level, int name, uint_t inlen,
    137 		    uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
    138 		    void *thisdg_attrs, cred_t *cr, mblk_t *mblk);
    139 int		icmp_opt_get(queue_t *q, int level, int name,
    140 		    uchar_t *ptr);
    141 static int	icmp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr);
    142 static boolean_t icmp_param_register(IDP *ndp, icmpparam_t *icmppa, int cnt);
    143 static int	icmp_param_set(queue_t *q, mblk_t *mp, char *value,
    144 		    caddr_t cp, cred_t *cr);
    145 static int	icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name,
    146 		    uchar_t *ptr, int len);
    147 static int	icmp_status_report(queue_t *q, mblk_t *mp, caddr_t cp,
    148 		    cred_t *cr);
    149 static void	icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err);
    150 static void	icmp_unbind(queue_t *q, mblk_t *mp);
    151 static void	icmp_wput(queue_t *q, mblk_t *mp);
    152 static void	icmp_wput_ipv6(queue_t *q, mblk_t *mp, sin6_t *sin6,
    153 		    t_scalar_t tudr_optlen);
    154 static void	icmp_wput_other(queue_t *q, mblk_t *mp);
    155 static void	icmp_wput_iocdata(queue_t *q, mblk_t *mp);
    156 static void	icmp_wput_restricted(queue_t *q, mblk_t *mp);
    157 
    158 static void	*rawip_stack_init(netstackid_t stackid, netstack_t *ns);
    159 static void	rawip_stack_fini(netstackid_t stackid, void *arg);
    160 
    161 static void	*rawip_kstat_init(netstackid_t stackid);
    162 static void	rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp);
    163 static int	rawip_kstat_update(kstat_t *kp, int rw);
    164 
    165 
    166 static struct module_info icmp_mod_info =  {
    167 	5707, "icmp", 1, INFPSZ, 512, 128
    168 };
    169 
    170 /*
    171  * Entry points for ICMP as a device.
    172  * We have separate open functions for the /dev/icmp and /dev/icmp6 devices.
    173  */
    174 static struct qinit icmprinitv4 = {
    175 	NULL, NULL, icmp_openv4, icmp_close, NULL, &icmp_mod_info
    176 };
    177 
    178 static struct qinit icmprinitv6 = {
    179 	NULL, NULL, icmp_openv6, icmp_close, NULL, &icmp_mod_info
    180 };
    181 
    182 static struct qinit icmpwinit = {
    183 	(pfi_t)icmp_wput, (pfi_t)ip_wsrv, NULL, NULL, NULL, &icmp_mod_info
    184 };
    185 
    186 /* For AF_INET aka /dev/icmp */
    187 struct streamtab icmpinfov4 = {
    188 	&icmprinitv4, &icmpwinit
    189 };
    190 
    191 /* For AF_INET6 aka /dev/icmp6 */
    192 struct streamtab icmpinfov6 = {
    193 	&icmprinitv6, &icmpwinit
    194 };
    195 
    196 static sin_t	sin_null;	/* Zero address for quick clears */
    197 static sin6_t	sin6_null;	/* Zero address for quick clears */
    198 
    199 /* Default structure copied into T_INFO_ACK messages */
    200 static struct T_info_ack icmp_g_t_info_ack = {
    201 	T_INFO_ACK,
    202 	IP_MAXPACKET,	 /* TSDU_size.  icmp allows maximum size messages. */
    203 	T_INVALID,	/* ETSDU_size.  icmp does not support expedited data. */
    204 	T_INVALID,	/* CDATA_size. icmp does not support connect data. */
    205 	T_INVALID,	/* DDATA_size. icmp does not support disconnect data. */
    206 	0,		/* ADDR_size - filled in later. */
    207 	0,		/* OPT_size - not initialized here */
    208 	IP_MAXPACKET,	/* TIDU_size.  icmp allows maximum size messages. */
    209 	T_CLTS,		/* SERV_type.  icmp supports connection-less. */
    210 	TS_UNBND,	/* CURRENT_state.  This is set from icmp_state. */
    211 	(XPG4_1|SENDZERO) /* PROVIDER_flag */
    212 };
    213 
    214 /*
    215  * Table of ND variables supported by icmp.  These are loaded into is_nd
    216  * when the stack instance is created.
    217  * All of these are alterable, within the min/max values given, at run time.
    218  */
    219 static icmpparam_t	icmp_param_arr[] = {
    220 	/* min	max	value	name */
    221 	{ 0,	128,	32,	"icmp_wroff_extra" },
    222 	{ 1,	255,	255,	"icmp_ipv4_ttl" },
    223 	{ 0, IPV6_MAX_HOPS, IPV6_DEFAULT_HOPS,	"icmp_ipv6_hoplimit"},
    224 	{ 0,	1,	1,	"icmp_bsd_compat" },
    225 	{ 4096,	65536,	8192,	"icmp_xmit_hiwat"},
    226 	{ 0,	65536,	1024,	"icmp_xmit_lowat"},
    227 	{ 4096,	65536,	8192,	"icmp_recv_hiwat"},
    228 	{ 65536, 1024*1024*1024, 256*1024,	"icmp_max_buf"},
    229 };
    230 #define	is_wroff_extra			is_param_arr[0].icmp_param_value
    231 #define	is_ipv4_ttl			is_param_arr[1].icmp_param_value
    232 #define	is_ipv6_hoplimit		is_param_arr[2].icmp_param_value
    233 #define	is_bsd_compat			is_param_arr[3].icmp_param_value
    234 #define	is_xmit_hiwat			is_param_arr[4].icmp_param_value
    235 #define	is_xmit_lowat			is_param_arr[5].icmp_param_value
    236 #define	is_recv_hiwat			is_param_arr[6].icmp_param_value
    237 #define	is_max_buf			is_param_arr[7].icmp_param_value
    238 
    239 /*
    240  * This routine is called to handle each O_T_BIND_REQ/T_BIND_REQ message
    241  * passed to icmp_wput.
    242  * The O_T_BIND_REQ/T_BIND_REQ is passed downstream to ip with the ICMP
    243  * protocol type placed in the message following the address. A T_BIND_ACK
    244  * message is returned by ip_bind_v4/v6.
    245  */
    246 static void
    247 icmp_bind(queue_t *q, mblk_t *mp)
    248 {
    249 	sin_t	*sin;
    250 	sin6_t	*sin6;
    251 	mblk_t	*mp1;
    252 	struct T_bind_req	*tbr;
    253 	icmp_t	*icmp;
    254 	conn_t	*connp = Q_TO_CONN(q);
    255 
    256 	icmp = connp->conn_icmp;
    257 	if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) {
    258 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
    259 		    "icmp_bind: bad req, len %u",
    260 		    (uint_t)(mp->b_wptr - mp->b_rptr));
    261 		icmp_err_ack(q, mp, TPROTO, 0);
    262 		return;
    263 	}
    264 	if (icmp->icmp_state != TS_UNBND) {
    265 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
    266 		    "icmp_bind: bad state, %d", icmp->icmp_state);
    267 		icmp_err_ack(q, mp, TOUTSTATE, 0);
    268 		return;
    269 	}
    270 	/*
    271 	 * Reallocate the message to make sure we have enough room for an
    272 	 * address and the protocol type.
    273 	 */
    274 	mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t) + 1, 1);
    275 	if (!mp1) {
    276 		icmp_err_ack(q, mp, TSYSERR, ENOMEM);
    277 		return;
    278 	}
    279 	mp = mp1;
    280 	tbr = (struct T_bind_req *)mp->b_rptr;
    281 	switch (tbr->ADDR_length) {
    282 	case 0:			/* Generic request */
    283 		tbr->ADDR_offset = sizeof (struct T_bind_req);
    284 		if (icmp->icmp_family == AF_INET) {
    285 			tbr->ADDR_length = sizeof (sin_t);
    286 			sin = (sin_t *)&tbr[1];
    287 			*sin = sin_null;
    288 			sin->sin_family = AF_INET;
    289 			mp->b_wptr = (uchar_t *)&sin[1];
    290 		} else {
    291 			ASSERT(icmp->icmp_family == AF_INET6);
    292 			tbr->ADDR_length = sizeof (sin6_t);
    293 			sin6 = (sin6_t *)&tbr[1];
    294 			*sin6 = sin6_null;
    295 			sin6->sin6_family = AF_INET6;
    296 			mp->b_wptr = (uchar_t *)&sin6[1];
    297 		}
    298 		break;
    299 	case sizeof (sin_t):	/* Complete IP address */
    300 		sin = (sin_t *)mi_offset_param(mp, tbr->ADDR_offset,
    301 		    sizeof (sin_t));
    302 		if (sin == NULL || !OK_32PTR((char *)sin)) {
    303 			icmp_err_ack(q, mp, TSYSERR, EINVAL);
    304 			return;
    305 		}
    306 		if (icmp->icmp_family != AF_INET ||
    307 		    sin->sin_family != AF_INET) {
    308 			icmp_err_ack(q, mp, TSYSERR, EAFNOSUPPORT);
    309 			return;
    310 		}
    311 		break;
    312 	case sizeof (sin6_t):	/* Complete IP address */
    313 		sin6 = (sin6_t *)mi_offset_param(mp, tbr->ADDR_offset,
    314 		    sizeof (sin6_t));
    315 		if (sin6 == NULL || !OK_32PTR((char *)sin6)) {
    316 			icmp_err_ack(q, mp, TSYSERR, EINVAL);
    317 			return;
    318 		}
    319 		if (icmp->icmp_family != AF_INET6 ||
    320 		    sin6->sin6_family != AF_INET6) {
    321 			icmp_err_ack(q, mp, TSYSERR, EAFNOSUPPORT);
    322 			return;
    323 		}
    324 		/* No support for mapped addresses on raw sockets */
    325 		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
    326 			icmp_err_ack(q, mp, TSYSERR, EADDRNOTAVAIL);
    327 			return;
    328 		}
    329 		break;
    330 	default:
    331 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
    332 		    "icmp_bind: bad ADDR_length %d", tbr->ADDR_length);
    333 		icmp_err_ack(q, mp, TBADADDR, 0);
    334 		return;
    335 	}
    336 
    337 	/*
    338 	 * The state must be TS_UNBND. TPI mandates that users must send
    339 	 * TPI primitives only 1 at a time and wait for the response before
    340 	 * sending the next primitive.
    341 	 */
    342 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
    343 	if (icmp->icmp_state != TS_UNBND || icmp->icmp_pending_op != -1) {
    344 		rw_exit(&icmp->icmp_rwlock);
    345 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
    346 		    "icmp_bind: bad state, %d", icmp->icmp_state);
    347 		icmp_err_ack(q, mp, TOUTSTATE, 0);
    348 		return;
    349 	}
    350 
    351 	icmp->icmp_pending_op = tbr->PRIM_type;
    352 
    353 	/*
    354 	 * Copy the source address into our icmp structure.  This address
    355 	 * may still be zero; if so, ip will fill in the correct address
    356 	 * each time an outbound packet is passed to it.
    357 	 * If we are binding to a broadcast or multicast address then
    358 	 * icmp_bind_ack will clear the source address when it receives
    359 	 * the T_BIND_ACK.
    360 	 */
    361 	icmp->icmp_state = TS_IDLE;
    362 
    363 	if (icmp->icmp_family == AF_INET) {
    364 		ASSERT(sin != NULL);
    365 		ASSERT(icmp->icmp_ipversion == IPV4_VERSION);
    366 		IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr,
    367 		    &icmp->icmp_v6src);
    368 		icmp->icmp_max_hdr_len = IP_SIMPLE_HDR_LENGTH +
    369 		    icmp->icmp_ip_snd_options_len;
    370 		icmp->icmp_bound_v6src = icmp->icmp_v6src;
    371 	} else {
    372 		int error;
    373 
    374 		ASSERT(sin6 != NULL);
    375 		ASSERT(icmp->icmp_ipversion == IPV6_VERSION);
    376 		icmp->icmp_v6src = sin6->sin6_addr;
    377 		icmp->icmp_max_hdr_len = icmp->icmp_sticky_hdrs_len;
    378 		icmp->icmp_bound_v6src = icmp->icmp_v6src;
    379 
    380 		/* Rebuild the header template */
    381 		error = icmp_build_hdrs(icmp);
    382 		if (error != 0) {
    383 			icmp->icmp_pending_op = -1;
    384 			rw_exit(&icmp->icmp_rwlock);
    385 			icmp_err_ack(q, mp, TSYSERR, error);
    386 			return;
    387 		}
    388 	}
    389 	/*
    390 	 * Place protocol type in the O_T_BIND_REQ/T_BIND_REQ following
    391 	 * the address.
    392 	 */
    393 	*mp->b_wptr++ = icmp->icmp_proto;
    394 	if (!(V6_OR_V4_INADDR_ANY(icmp->icmp_v6src))) {
    395 		/*
    396 		 * Append a request for an IRE if src not 0 (INADDR_ANY)
    397 		 */
    398 		mp->b_cont = allocb(sizeof (ire_t), BPRI_HI);
    399 		if (!mp->b_cont) {
    400 			icmp->icmp_pending_op = -1;
    401 			rw_exit(&icmp->icmp_rwlock);
    402 			icmp_err_ack(q, mp, TSYSERR, ENOMEM);
    403 			return;
    404 		}
    405 		mp->b_cont->b_wptr += sizeof (ire_t);
    406 		mp->b_cont->b_datap->db_type = IRE_DB_REQ_TYPE;
    407 	}
    408 	rw_exit(&icmp->icmp_rwlock);
    409 
    410 	/* Pass the O_T_BIND_REQ/T_BIND_REQ to ip. */
    411 	if (icmp->icmp_family == AF_INET6)
    412 		mp = ip_bind_v6(q, mp, connp, NULL);
    413 	else
    414 		mp = ip_bind_v4(q, mp, connp);
    415 
    416 	/* The above return NULL if the bind needs to be deferred */
    417 	if (mp != NULL)
    418 		icmp_bind_result(connp, mp);
    419 	else
    420 		CONN_INC_REF(connp);
    421 }
    422 
    423 /*
    424  * Send message to IP to just bind to the protocol.
    425  */
    426 static void
    427 icmp_bind_proto(queue_t *q)
    428 {
    429 	mblk_t	*mp;
    430 	struct T_bind_req	*tbr;
    431 	icmp_t	*icmp;
    432 	conn_t	*connp = Q_TO_CONN(q);
    433 
    434 	icmp = connp->conn_icmp;
    435 
    436 	mp = allocb(sizeof (struct T_bind_req) + sizeof (sin6_t) + 1,
    437 	    BPRI_MED);
    438 	if (!mp) {
    439 		return;
    440 	}
    441 	mp->b_datap->db_type = M_PROTO;
    442 	tbr = (struct T_bind_req *)mp->b_rptr;
    443 	tbr->PRIM_type = O_T_BIND_REQ; /* change to T_BIND_REQ ? */
    444 	tbr->ADDR_offset = sizeof (struct T_bind_req);
    445 
    446 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
    447 	if (icmp->icmp_ipversion == IPV4_VERSION) {
    448 		sin_t	*sin;
    449 
    450 		tbr->ADDR_length = sizeof (sin_t);
    451 		sin = (sin_t *)&tbr[1];
    452 		*sin = sin_null;
    453 		sin->sin_family = AF_INET;
    454 		mp->b_wptr = (uchar_t *)&sin[1];
    455 	} else {
    456 		sin6_t	*sin6;
    457 
    458 		ASSERT(icmp->icmp_ipversion == IPV6_VERSION);
    459 		tbr->ADDR_length = sizeof (sin6_t);
    460 		sin6 = (sin6_t *)&tbr[1];
    461 		*sin6 = sin6_null;
    462 		sin6->sin6_family = AF_INET6;
    463 		mp->b_wptr = (uchar_t *)&sin6[1];
    464 	}
    465 
    466 	/* Place protocol type in the O_T_BIND_REQ following the address. */
    467 	*mp->b_wptr++ = icmp->icmp_proto;
    468 	rw_exit(&icmp->icmp_rwlock);
    469 
    470 	/* Pass the O_T_BIND_REQ to ip. */
    471 	if (icmp->icmp_family == AF_INET6)
    472 		mp = ip_bind_v6(q, mp, connp, NULL);
    473 	else
    474 		mp = ip_bind_v4(q, mp, connp);
    475 
    476 	/* The above return NULL if the bind needs to be deferred */
    477 	if (mp != NULL)
    478 		icmp_bind_result(connp, mp);
    479 	else
    480 		CONN_INC_REF(connp);
    481 }
    482 
    483 /*
    484  * This is called from ip_wput_nondata to handle the results of a
    485  * deferred RAWIP bind.  It is called once the bind has been completed.
    486  */
    487 void
    488 rawip_resume_bind(conn_t *connp, mblk_t *mp)
    489 {
    490 	ASSERT(connp != NULL && IPCL_IS_RAWIP(connp));
    491 
    492 	icmp_bind_result(connp, mp);
    493 
    494 	CONN_OPER_PENDING_DONE(connp);
    495 }
    496 
    497 /*
    498  * This routine handles each T_CONN_REQ message passed to icmp.  It
    499  * associates a default destination address with the stream.
    500  *
    501  * This routine sends down a T_BIND_REQ to IP with the following mblks:
    502  *	T_BIND_REQ	- specifying local and remote address.
    503  *	IRE_DB_REQ_TYPE	- to get an IRE back containing ire_type and src
    504  *	T_OK_ACK	- for the T_CONN_REQ
    505  *	T_CONN_CON	- to keep the TPI user happy
    506  *
    507  * The connect completes in icmp_bind_result.
    508  * When a T_BIND_ACK is received information is extracted from the IRE
    509  * and the two appended messages are sent to the TPI user.
    510  * Should icmp_bind_result receive T_ERROR_ACK for the T_BIND_REQ it will
    511  * convert it to an error ack for the appropriate primitive.
    512  */
    513 static void
    514 icmp_connect(queue_t *q, mblk_t *mp)
    515 {
    516 	sin_t	*sin;
    517 	sin6_t	*sin6;
    518 	mblk_t	*mp1, *mp2;
    519 	struct T_conn_req	*tcr;
    520 	icmp_t	*icmp;
    521 	ipaddr_t	v4dst;
    522 	in6_addr_t	v6dst;
    523 	uint32_t	flowinfo;
    524 	conn_t	*connp = Q_TO_CONN(q);
    525 
    526 	icmp = connp->conn_icmp;
    527 	tcr = (struct T_conn_req *)mp->b_rptr;
    528 	/* Sanity checks */
    529 	if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_conn_req)) {
    530 		icmp_err_ack(q, mp, TPROTO, 0);
    531 		return;
    532 	}
    533 
    534 	if (tcr->OPT_length != 0) {
    535 		icmp_err_ack(q, mp, TBADOPT, 0);
    536 		return;
    537 	}
    538 
    539 	switch (tcr->DEST_length) {
    540 	default:
    541 		icmp_err_ack(q, mp, TBADADDR, 0);
    542 		return;
    543 
    544 	case sizeof (sin_t):
    545 		sin = (sin_t *)mi_offset_param(mp, tcr->DEST_offset,
    546 		    sizeof (sin_t));
    547 		if (sin == NULL || !OK_32PTR((char *)sin)) {
    548 			icmp_err_ack(q, mp, TSYSERR, EINVAL);
    549 			return;
    550 		}
    551 		if (icmp->icmp_family != AF_INET ||
    552 		    sin->sin_family != AF_INET) {
    553 			icmp_err_ack(q, mp, TSYSERR, EAFNOSUPPORT);
    554 			return;
    555 		}
    556 		v4dst = sin->sin_addr.s_addr;
    557 		IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst);
    558 		ASSERT(icmp->icmp_ipversion == IPV4_VERSION);
    559 		icmp->icmp_max_hdr_len = IP_SIMPLE_HDR_LENGTH +
    560 		    icmp->icmp_ip_snd_options_len;
    561 		break;
    562 
    563 	case sizeof (sin6_t):
    564 		sin6 = (sin6_t *)mi_offset_param(mp, tcr->DEST_offset,
    565 		    sizeof (sin6_t));
    566 		if (sin6 == NULL || !OK_32PTR((char *)sin6)) {
    567 			icmp_err_ack(q, mp, TSYSERR, EINVAL);
    568 			return;
    569 		}
    570 		if (icmp->icmp_family != AF_INET6 ||
    571 		    sin6->sin6_family != AF_INET6) {
    572 			icmp_err_ack(q, mp, TSYSERR, EAFNOSUPPORT);
    573 			return;
    574 		}
    575 		/* No support for mapped addresses on raw sockets */
    576 		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
    577 			icmp_err_ack(q, mp, TSYSERR, EADDRNOTAVAIL);
    578 			return;
    579 		}
    580 		v6dst = sin6->sin6_addr;
    581 		ASSERT(icmp->icmp_ipversion == IPV6_VERSION);
    582 		icmp->icmp_max_hdr_len = icmp->icmp_sticky_hdrs_len;
    583 		flowinfo = sin6->sin6_flowinfo;
    584 		break;
    585 	}
    586 	if (icmp->icmp_ipversion == IPV4_VERSION) {
    587 		/*
    588 		 * Interpret a zero destination to mean loopback.
    589 		 * Update the T_CONN_REQ (sin/sin6) since it is used to
    590 		 * generate the T_CONN_CON.
    591 		 */
    592 		if (v4dst == INADDR_ANY) {
    593 			v4dst = htonl(INADDR_LOOPBACK);
    594 			IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst);
    595 			if (icmp->icmp_family == AF_INET) {
    596 				sin->sin_addr.s_addr = v4dst;
    597 			} else {
    598 				sin6->sin6_addr = v6dst;
    599 			}
    600 		}
    601 		icmp->icmp_v6dst = v6dst;
    602 		icmp->icmp_flowinfo = 0;
    603 
    604 		/*
    605 		 * If the destination address is multicast and
    606 		 * an outgoing multicast interface has been set,
    607 		 * use the address of that interface as our
    608 		 * source address if no source address has been set.
    609 		 */
    610 		if (V4_PART_OF_V6(icmp->icmp_v6src) == INADDR_ANY &&
    611 		    CLASSD(v4dst) &&
    612 		    icmp->icmp_multicast_if_addr != INADDR_ANY) {
    613 			IN6_IPADDR_TO_V4MAPPED(icmp->icmp_multicast_if_addr,
    614 			    &icmp->icmp_v6src);
    615 		}
    616 	} else {
    617 		ASSERT(icmp->icmp_ipversion == IPV6_VERSION);
    618 		/*
    619 		 * Interpret a zero destination to mean loopback.
    620 		 * Update the T_CONN_REQ (sin/sin6) since it is used to
    621 		 * generate the T_CONN_CON.
    622 		 */
    623 		if (IN6_IS_ADDR_UNSPECIFIED(&v6dst)) {
    624 			v6dst = ipv6_loopback;
    625 			sin6->sin6_addr = v6dst;
    626 		}
    627 		icmp->icmp_v6dst = v6dst;
    628 		icmp->icmp_flowinfo = flowinfo;
    629 		/*
    630 		 * If the destination address is multicast and
    631 		 * an outgoing multicast interface has been set,
    632 		 * then the ip bind logic will pick the correct source
    633 		 * address (i.e. matching the outgoing multicast interface).
    634 		 */
    635 	}
    636 
    637 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
    638 	if (icmp->icmp_state == TS_UNBND || icmp->icmp_pending_op != -1) {
    639 		rw_exit(&icmp->icmp_rwlock);
    640 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
    641 		    "icmp_connect: bad state, %d", icmp->icmp_state);
    642 		icmp_err_ack(q, mp, TOUTSTATE, 0);
    643 		return;
    644 	}
    645 	icmp->icmp_pending_op = T_CONN_REQ;
    646 
    647 	if (icmp->icmp_state == TS_DATA_XFER) {
    648 		/* Already connected - clear out state */
    649 		icmp->icmp_v6src = icmp->icmp_bound_v6src;
    650 		icmp->icmp_state = TS_IDLE;
    651 	}
    652 
    653 	/*
    654 	 * Send down bind to IP to verify that there is a route
    655 	 * and to determine the source address.
    656 	 * This will come back as T_BIND_ACK with an IRE_DB_TYPE in rput.
    657 	 */
    658 	if (icmp->icmp_family == AF_INET) {
    659 		mp1 = icmp_ip_bind_mp(icmp, O_T_BIND_REQ, sizeof (ipa_conn_t),
    660 		    sin->sin_port);
    661 	} else {
    662 		ASSERT(icmp->icmp_family == AF_INET6);
    663 		mp1 = icmp_ip_bind_mp(icmp, O_T_BIND_REQ, sizeof (ipa6_conn_t),
    664 		    sin6->sin6_port);
    665 	}
    666 	if (mp1 == NULL) {
    667 		icmp->icmp_pending_op = -1;
    668 		rw_exit(&icmp->icmp_rwlock);
    669 		icmp_err_ack(q, mp, TSYSERR, ENOMEM);
    670 		return;
    671 	}
    672 
    673 	/*
    674 	 * We also have to send a connection confirmation to
    675 	 * keep TLI happy. Prepare it for icmp_bind_result.
    676 	 */
    677 	if (icmp->icmp_family == AF_INET) {
    678 		mp2 = mi_tpi_conn_con(NULL, (char *)sin, sizeof (*sin), NULL,
    679 		    0);
    680 	} else {
    681 		ASSERT(icmp->icmp_family == AF_INET6);
    682 		mp2 = mi_tpi_conn_con(NULL, (char *)sin6, sizeof (*sin6), NULL,
    683 		    0);
    684 	}
    685 	if (mp2 == NULL) {
    686 		freemsg(mp1);
    687 		icmp->icmp_pending_op = -1;
    688 		rw_exit(&icmp->icmp_rwlock);
    689 		icmp_err_ack(q, mp, TSYSERR, ENOMEM);
    690 		return;
    691 	}
    692 
    693 	mp = mi_tpi_ok_ack_alloc(mp);
    694 	if (mp == NULL) {
    695 		/* Unable to reuse the T_CONN_REQ for the ack. */
    696 		freemsg(mp2);
    697 		icmp->icmp_pending_op = -1;
    698 		rw_exit(&icmp->icmp_rwlock);
    699 		icmp_err_ack_prim(q, mp1, T_CONN_REQ, TSYSERR, ENOMEM);
    700 		return;
    701 	}
    702 
    703 	icmp->icmp_state = TS_DATA_XFER;
    704 	rw_exit(&icmp->icmp_rwlock);
    705 
    706 	/* Hang onto the T_OK_ACK and T_CONN_CON for later. */
    707 	linkb(mp1, mp);
    708 	linkb(mp1, mp2);
    709 
    710 	mblk_setcred(mp1, connp->conn_cred);
    711 	if (icmp->icmp_family == AF_INET)
    712 		mp1 = ip_bind_v4(q, mp1, connp);
    713 	else
    714 		mp1 = ip_bind_v6(q, mp1, connp, NULL);
    715 
    716 	/* The above return NULL if the bind needs to be deferred */
    717 	if (mp1 != NULL)
    718 		icmp_bind_result(connp, mp1);
    719 	else
    720 		CONN_INC_REF(connp);
    721 }
    722 
    723 static void
    724 icmp_close_free(conn_t *connp)
    725 {
    726 	icmp_t *icmp = connp->conn_icmp;
    727 
    728 	/* If there are any options associated with the stream, free them. */
    729 	if (icmp->icmp_ip_snd_options != NULL) {
    730 		mi_free((char *)icmp->icmp_ip_snd_options);
    731 		icmp->icmp_ip_snd_options = NULL;
    732 		icmp->icmp_ip_snd_options_len = 0;
    733 	}
    734 
    735 	if (icmp->icmp_filter != NULL) {
    736 		kmem_free(icmp->icmp_filter, sizeof (icmp6_filter_t));
    737 		icmp->icmp_filter = NULL;
    738 	}
    739 	/* Free memory associated with sticky options */
    740 	if (icmp->icmp_sticky_hdrs_len != 0) {
    741 		kmem_free(icmp->icmp_sticky_hdrs,
    742 		    icmp->icmp_sticky_hdrs_len);
    743 		icmp->icmp_sticky_hdrs = NULL;
    744 		icmp->icmp_sticky_hdrs_len = 0;
    745 	}
    746 	ip6_pkt_free(&icmp->icmp_sticky_ipp);
    747 
    748 	/*
    749 	 * Clear any fields which the kmem_cache constructor clears.
    750 	 * Only icmp_connp needs to be preserved.
    751 	 * TBD: We should make this more efficient to avoid clearing
    752 	 * everything.
    753 	 */
    754 	ASSERT(icmp->icmp_connp == connp);
    755 	bzero(icmp, sizeof (icmp_t));
    756 	icmp->icmp_connp = connp;
    757 }
    758 
    759 static int
    760 icmp_close(queue_t *q)
    761 {
    762 	conn_t	*connp = (conn_t *)q->q_ptr;
    763 
    764 	ASSERT(connp != NULL && IPCL_IS_RAWIP(connp));
    765 
    766 	ip_quiesce_conn(connp);
    767 
    768 	qprocsoff(connp->conn_rq);
    769 
    770 	icmp_close_free(connp);
    771 
    772 	/*
    773 	 * Now we are truly single threaded on this stream, and can
    774 	 * delete the things hanging off the connp, and finally the connp.
    775 	 * We removed this connp from the fanout list, it cannot be
    776 	 * accessed thru the fanouts, and we already waited for the
    777 	 * conn_ref to drop to 0. We are already in close, so
    778 	 * there cannot be any other thread from the top. qprocsoff
    779 	 * has completed, and service has completed or won't run in
    780 	 * future.
    781 	 */
    782 	ASSERT(connp->conn_ref == 1);
    783 
    784 	inet_minor_free(connp->conn_minor_arena, connp->conn_dev);
    785 
    786 	connp->conn_ref--;
    787 	ipcl_conn_destroy(connp);
    788 
    789 	q->q_ptr = WR(q)->q_ptr = NULL;
    790 	return (0);
    791 }
    792 
    793 /*
    794  * This routine handles each T_DISCON_REQ message passed to icmp
    795  * as an indicating that ICMP is no longer connected. This results
    796  * in sending a T_BIND_REQ to IP to restore the binding to just
    797  * the local address.
    798  *
    799  * This routine sends down a T_BIND_REQ to IP with the following mblks:
    800  *	T_BIND_REQ	- specifying just the local address.
    801  *	T_OK_ACK	- for the T_DISCON_REQ
    802  *
    803  * The disconnect completes in icmp_bind_result.
    804  * When a T_BIND_ACK is received the appended T_OK_ACK is sent to the TPI user.
    805  * Should icmp_bind_result receive T_ERROR_ACK for the T_BIND_REQ it will
    806  * convert it to an error ack for the appropriate primitive.
    807  */
    808 static void
    809 icmp_disconnect(queue_t *q, mblk_t *mp)
    810 {
    811 	icmp_t	*icmp;
    812 	mblk_t	*mp1;
    813 	conn_t	*connp = Q_TO_CONN(q);
    814 
    815 	icmp = connp->conn_icmp;
    816 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
    817 	if (icmp->icmp_state != TS_DATA_XFER || icmp->icmp_pending_op != -1) {
    818 		rw_exit(&icmp->icmp_rwlock);
    819 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
    820 		    "icmp_disconnect: bad state, %d", icmp->icmp_state);
    821 		icmp_err_ack(q, mp, TOUTSTATE, 0);
    822 		return;
    823 	}
    824 	icmp->icmp_pending_op = T_DISCON_REQ;
    825 	icmp->icmp_v6src = icmp->icmp_bound_v6src;
    826 	icmp->icmp_state = TS_IDLE;
    827 
    828 	/*
    829 	 * Send down bind to IP to remove the full binding and revert
    830 	 * to the local address binding.
    831 	 */
    832 	if (icmp->icmp_family == AF_INET) {
    833 		mp1 = icmp_ip_bind_mp(icmp, O_T_BIND_REQ, sizeof (sin_t), 0);
    834 	} else {
    835 		ASSERT(icmp->icmp_family == AF_INET6);
    836 		mp1 = icmp_ip_bind_mp(icmp, O_T_BIND_REQ, sizeof (sin6_t), 0);
    837 	}
    838 	if (mp1 == NULL) {
    839 		icmp->icmp_pending_op = -1;
    840 		rw_exit(&icmp->icmp_rwlock);
    841 		icmp_err_ack(q, mp, TSYSERR, ENOMEM);
    842 		return;
    843 	}
    844 	mp = mi_tpi_ok_ack_alloc(mp);
    845 	if (mp == NULL) {
    846 		/* Unable to reuse the T_DISCON_REQ for the ack. */
    847 		icmp->icmp_pending_op = -1;
    848 		rw_exit(&icmp->icmp_rwlock);
    849 		icmp_err_ack_prim(q, mp1, T_DISCON_REQ, TSYSERR, ENOMEM);
    850 		return;
    851 	}
    852 
    853 	if (icmp->icmp_family == AF_INET6) {
    854 		int error;
    855 
    856 		/* Rebuild the header template */
    857 		error = icmp_build_hdrs(icmp);
    858 		if (error != 0) {
    859 			icmp->icmp_pending_op = -1;
    860 			rw_exit(&icmp->icmp_rwlock);
    861 			icmp_err_ack_prim(q, mp, T_DISCON_REQ, TSYSERR, error);
    862 			freemsg(mp1);
    863 			return;
    864 		}
    865 	}
    866 
    867 	rw_exit(&icmp->icmp_rwlock);
    868 	/* Append the T_OK_ACK to the T_BIND_REQ for icmp_bind_result */
    869 	linkb(mp1, mp);
    870 
    871 	if (icmp->icmp_family == AF_INET6)
    872 		mp1 = ip_bind_v6(q, mp1, connp, NULL);
    873 	else
    874 		mp1 = ip_bind_v4(q, mp1, connp);
    875 
    876 	/* The above return NULL if the bind needs to be deferred */
    877 	if (mp1 != NULL)
    878 		icmp_bind_result(connp, mp1);
    879 	else
    880 		CONN_INC_REF(connp);
    881 }
    882 
    883 /* This routine creates a T_ERROR_ACK message and passes it upstream. */
    884 static void
    885 icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error)
    886 {
    887 	if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL)
    888 		qreply(q, mp);
    889 }
    890 
    891 /* Shorthand to generate and send TPI error acks to our client */
    892 static void
    893 icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive,
    894     t_scalar_t t_error, int sys_error)
    895 {
    896 	struct T_error_ack	*teackp;
    897 
    898 	if ((mp = tpi_ack_alloc(mp, sizeof (struct T_error_ack),
    899 	    M_PCPROTO, T_ERROR_ACK)) != NULL) {
    900 		teackp = (struct T_error_ack *)mp->b_rptr;
    901 		teackp->ERROR_prim = primitive;
    902 		teackp->TLI_error = t_error;
    903 		teackp->UNIX_error = sys_error;
    904 		qreply(q, mp);
    905 	}
    906 }
    907 
    908 /*
    909  * icmp_icmp_error is called by icmp_input to process ICMP
    910  * messages passed up by IP.
    911  * Generates the appropriate T_UDERROR_IND for permanent
    912  * (non-transient) errors.
    913  * Assumes that IP has pulled up everything up to and including
    914  * the ICMP header.
    915  */
    916 static void
    917 icmp_icmp_error(queue_t *q, mblk_t *mp)
    918 {
    919 	icmph_t *icmph;
    920 	ipha_t	*ipha;
    921 	int	iph_hdr_length;
    922 	sin_t	sin;
    923 	sin6_t	sin6;
    924 	mblk_t	*mp1;
    925 	int	error = 0;
    926 	icmp_t	*icmp = Q_TO_ICMP(q);
    927 
    928 	ipha = (ipha_t *)mp->b_rptr;
    929 
    930 	ASSERT(OK_32PTR(mp->b_rptr));
    931 
    932 	if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) {
    933 		ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION);
    934 		icmp_icmp_error_ipv6(q, mp);
    935 		return;
    936 	}
    937 	ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION);
    938 
    939 	/* Skip past the outer IP and ICMP headers */
    940 	iph_hdr_length = IPH_HDR_LENGTH(ipha);
    941 	icmph = (icmph_t *)(&mp->b_rptr[iph_hdr_length]);
    942 	ipha = (ipha_t *)&icmph[1];
    943 	iph_hdr_length = IPH_HDR_LENGTH(ipha);
    944 
    945 	switch (icmph->icmph_type) {
    946 	case ICMP_DEST_UNREACHABLE:
    947 		switch (icmph->icmph_code) {
    948 		case ICMP_FRAGMENTATION_NEEDED:
    949 			/*
    950 			 * IP has already adjusted the path MTU.
    951 			 */
    952 			break;
    953 		case ICMP_PORT_UNREACHABLE:
    954 		case ICMP_PROTOCOL_UNREACHABLE:
    955 			error = ECONNREFUSED;
    956 			break;
    957 		default:
    958 			/* Transient errors */
    959 			break;
    960 		}
    961 		break;
    962 	default:
    963 		/* Transient errors */
    964 		break;
    965 	}
    966 	if (error == 0) {
    967 		freemsg(mp);
    968 		return;
    969 	}
    970 
    971 	/*
    972 	 * Deliver T_UDERROR_IND when the application has asked for it.
    973 	 * The socket layer enables this automatically when connected.
    974 	 */
    975 	if (!icmp->icmp_dgram_errind) {
    976 		freemsg(mp);
    977 		return;
    978 	}
    979 
    980 	switch (icmp->icmp_family) {
    981 	case AF_INET:
    982 		sin = sin_null;
    983 		sin.sin_family = AF_INET;
    984 		sin