Home | History | Annotate | Download | only in ip
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*
     27  * Tunnel driver
     28  * This module acts like a driver/DLPI provider as viewed from the top
     29  * and a stream head/TPI user from the bottom
     30  * Implements the logic for IP (IPv4 or IPv6) encapsulation
     31  * within IP (IPv4 or IPv6)
     32  */
     33 
     34 #pragma ident	"%Z%%M%	%I%	%E% SMI"
     35 
     36 #include <sys/types.h>
     37 #include <sys/stream.h>
     38 #include <sys/dlpi.h>
     39 #include <sys/stropts.h>
     40 #include <sys/strlog.h>
     41 #include <sys/tihdr.h>
     42 #include <sys/tiuser.h>
     43 #include <sys/ddi.h>
     44 #include <sys/sunddi.h>
     45 #include <sys/ethernet.h>
     46 #include <sys/cmn_err.h>
     47 #include <sys/debug.h>
     48 #include <sys/kmem.h>
     49 #include <sys/netstack.h>
     50 
     51 #include <sys/systm.h>
     52 #include <sys/param.h>
     53 #include <sys/socket.h>
     54 #include <sys/vtrace.h>
     55 #include <sys/isa_defs.h>
     56 #include <net/if.h>
     57 #include <net/if_arp.h>
     58 #include <net/route.h>
     59 #include <sys/sockio.h>
     60 #include <netinet/in.h>
     61 
     62 #include <inet/common.h>
     63 #include <inet/mi.h>
     64 #include <inet/mib2.h>
     65 #include <inet/nd.h>
     66 #include <inet/arp.h>
     67 #include <inet/snmpcom.h>
     68 
     69 #include <netinet/igmp_var.h>
     70 
     71 #include <netinet/ip6.h>
     72 #include <netinet/icmp6.h>
     73 #include <inet/ip.h>
     74 #include <inet/ip6.h>
     75 #include <net/if_dl.h>
     76 #include <inet/ip_if.h>
     77 #include <sys/strsun.h>
     78 #include <sys/strsubr.h>
     79 #include <inet/ipsec_impl.h>
     80 #include <inet/ipdrop.h>
     81 #include <inet/tun.h>
     82 #include <inet/ipsec_impl.h>
     83 
     84 
     85 #include <sys/conf.h>
     86 #include <sys/errno.h>
     87 #include <sys/modctl.h>
     88 #include <sys/stat.h>
     89 
     90 #include <inet/ip_ire.h>	/* for ire_route_lookup_v6 */
     91 
/*
 * Forward declarations for the file-local (static) routines of the tunnel
 * module, so they can be referenced before their definitions.
 */
static void	tun_cancel_rec_evs(queue_t *, eventid_t *);
static void	tun_bufcall_handler(void *);
static boolean_t tun_icmp_message_v4(queue_t *, ipha_t *, icmph_t *, mblk_t *);
static boolean_t tun_icmp_too_big_v4(queue_t *, ipha_t *, uint16_t, mblk_t *);
static boolean_t tun_icmp_message_v6(queue_t *, ip6_t *, icmp6_t *, uint8_t,
    mblk_t *);
static boolean_t tun_icmp_too_big_v6(queue_t *, ip6_t *, uint32_t, uint8_t,
    mblk_t *);
static void	tun_sendokack(queue_t *, mblk_t *, t_uscalar_t);
static void	tun_sendsdusize(queue_t *);
static void	tun_senderrack(queue_t *, mblk_t *, t_uscalar_t, t_uscalar_t,
    t_uscalar_t);
static int	tun_fastpath(queue_t *, mblk_t *);
static int	tun_ioctl(queue_t *, mblk_t  *);
static void	tun_timeout_handler(void *);
static int	tun_rproc(queue_t *, mblk_t *);
static int	tun_wproc_mdata(queue_t *, mblk_t *);
static int	tun_wproc(queue_t *, mblk_t  *);
static int	tun_rdata(queue_t *, mblk_t *, mblk_t *, tun_t *, uint_t);
static int	tun_rdata_v4(queue_t *, mblk_t *, mblk_t *, tun_t *);
static int	tun_rdata_v6(queue_t *, mblk_t *, mblk_t *, tun_t *);
static int	tun_set_sec_simple(tun_t *, ipsec_req_t *);
static void	tun_send_ire_req(queue_t *);
static uint32_t	tun_update_link_mtu(queue_t *, uint32_t, boolean_t);
static mblk_t	*tun_realloc_mblk(queue_t *, mblk_t *, size_t, mblk_t *,
    boolean_t);
static void	tun_recover(queue_t *, mblk_t *, size_t);
static void	tun_rem_ppa_list(tun_t *);
static void	tun_rem_tun_byaddr_list(tun_t *);
static void	tun_rput_icmp_err_v4(queue_t *, mblk_t *, mblk_t *);
static void	icmp_ricmp_err_v4_v4(queue_t *, mblk_t *, mblk_t *);
static void	icmp_ricmp_err_v6_v4(queue_t *, mblk_t *, mblk_t *);
static void	icmp_ricmp_err_v4_v6(queue_t *, mblk_t *, mblk_t *, icmp6_t *);
static void	icmp_ricmp_err_v6_v6(queue_t *, mblk_t *, mblk_t *, icmp6_t *);
static void	tun_rput_icmp_err_v6(queue_t *, mblk_t *, mblk_t *);
static int	tun_rput_tpi(queue_t *, mblk_t *);
static int	tun_send_bind_req(queue_t *);
static void	tun_statinit(tun_stats_t *, char *, netstackid_t);
static int	tun_stat_kstat_update(kstat_t *, int);
static void	tun_wdata_v4(queue_t *, mblk_t *);
static void	tun_wdata_v6(queue_t *, mblk_t *);
static char	*tun_who(queue_t *, char *);
static int	tun_wput_dlpi(queue_t *, mblk_t *);
static int	tun_wputnext_v6(queue_t *, mblk_t *);
static int	tun_wputnext_v4(queue_t *, mblk_t *);
static boolean_t tun_limit_value_v6(queue_t *, mblk_t *, ip6_t *, int *);
static void	tun_freemsg_chain(mblk_t *, uint64_t *);
static void	*tun_stack_init(netstackid_t, netstack_t *);
static void	tun_stack_fini(netstackid_t, void *);
    141 
/* module's defined constants, globals and data structures */

/* Driver names passed to ddi_name_to_major() in _init(). */
#define	IP	"ip"
#define	IP6	"ip6"
/*
 * Major numbers of the "ip" and "ip6" drivers.  Filled in by _init() and
 * compared against the major of the device in tun_open() to tell whether
 * the lower stream is IPv4 or IPv6.
 */
static major_t	IP_MAJ;
static major_t	IP6_MAJ;
    148 
    149 #define	TUN_DEBUG
    150 #define	TUN_LINK_EXTRA_OFF	32
    151 
    152 #define	IPV6V4_DEF_TTL		60
    153 #define	IPV6V4_DEF_ENCAP	60
    154 
    155 #define	TUN_WHO_BUF		60
    156 
    157 
    158 #ifdef	TUN_DEBUG
    159 /* levels of debugging verbosity */
    160 #define	TUN0DBG		0x00	/* crucial */
    161 #define	TUN1DBG		0x01	/* informational */
    162 #define	TUN2DBG		0x02	/* verbose */
    163 #define	TUN3DBG		0x04	/* very verbose */
    164 
    165 /*
    166  * Global variable storing debugging level for all tunnels.  By default
    167  * all crucial messages will be printed.  Value can be masked to exclusively
    168  * print certain debug levels and not others.
    169  */
    170 int8_t tun_debug = TUN0DBG;
    171 
    172 #define	TUN_LEVEL(dbg, lvl)	((dbg & lvl) == lvl)
    173 
    174 #define	tun0dbg(a)	printf a
    175 #define	tun1dbg(a)	if (TUN_LEVEL(tun_debug, TUN1DBG)) printf a
    176 #define	tun2dbg(a)	if (TUN_LEVEL(tun_debug, TUN2DBG)) printf a
    177 #define	tun3dbg(a)	if (TUN_LEVEL(tun_debug, TUN3DBG)) printf a
    178 #else
    179 #define	tun0dbg(a)	/*  */
    180 #define	tun1dbg(a)	/*  */
    181 #define	tun2dbg(a)	/*  */
    182 #define	tun3dbg(a)	/*  */
    183 #endif /* TUN_DEBUG */
    184 
    185 #define	TUN_RECOVER_WAIT		(1*hz)
    186 
/*
 * canned DL_INFO_ACK  - adjusted based on tunnel type
 *
 * As initialized here it describes an unattached (DL_UNATTACHED),
 * connectionless (DL_CLDLS), style 2 provider of mac type DL_IPV4 with
 * no address, SAP, QOS or broadcast-address information.
 */
dl_info_ack_t infoack = {
	DL_INFO_ACK,	/* dl_primitive */
	4196,		/* dl_max_sdu */
	0,		/* dl_min_sdu */
	0,		/* dl_addr_length */
	DL_IPV4,	/* dl_mac_type */
	0,		/* dl_reserved */
	DL_UNATTACHED,	/* dl_current_state */
	0,		/* dl_sap_length */
	DL_CLDLS,	/* dl_service_mode */
	0,		/* dl_qos_length */
	0,		/* dl_qos_offset */
	0,		/* dl_qos_range_length */
	0,		/* dl_qos_range_offset */
	DL_STYLE2,	/* dl_provider_style */
	0,		/* dl_addr_offset */
	DL_VERSION_2,	/* dl_version */
	0,		/* dl_brdcst_addr_length */
	0,		/* dl_brdcst_addr_offset */
	0		/* dl_grow */
};
    209 
/*
 * canned DL_BIND_ACK - IP doesn't use any of this info.
 * Every field other than the primitive type is therefore left zero.
 */
dl_bind_ack_t bindack = {
	DL_BIND_ACK,	/* dl_primitive */
	0,		/* dl_sap */
	0,		/* dl_addr_length */
	0,		/* dl_addr_offset */
	0,		/* dl_max_conind */
	0		/* dl_xidtest_flg */
};
    221 
    222 
/*
 * Canned IPv6 destination options header containing Tunnel
 * Encapsulation Limit option.
 *
 * Two templates are kept; they differ only in the first member, which is
 * initialized with IPPROTO_ENCAP in the "upper_v4" template and with
 * IPPROTO_IPV6 in the "upper_v6" template.  The remaining initializers are
 * the tunnel-limit option (IP6OPT_TUNNEL_LIMIT, 1, limit value) followed
 * by a PadN option (IP6OPT_PADN, 1, 0).
 * NOTE(review): member names of struct tun_encap_limit are not visible
 * here; confirm the field order against its declaration.
 */
static struct tun_encap_limit tun_limit_init_upper_v4 = {
	{ IPPROTO_ENCAP, 0 },
	IP6OPT_TUNNEL_LIMIT,
	1,
	IPV6_DEFAULT_ENCAPLIMIT, /* filled in with actual value later */
	IP6OPT_PADN,
	1,
	0
};
static struct tun_encap_limit tun_limit_init_upper_v6 = {
	{ IPPROTO_IPV6, 0 },
	IP6OPT_TUNNEL_LIMIT,
	1,
	IPV6_DEFAULT_ENCAPLIMIT, /* filled in with actual value later */
	IP6OPT_PADN,
	1,
	0
};
    245 
/* Further forward declarations of file-local routines. */
static tun_stats_t	*tun_add_stat(queue_t *);

static void tun_add_byaddr(tun_t *);
static ipsec_tun_pol_t *itp_get_byaddr_fn(uint32_t *, uint32_t *, int,
    netstack_t *);

/* Settable in /etc/system */
static boolean_t 	tun_do_fastpath = B_TRUE;
    254 
/*
 * streams linkages
 *
 * The tables below chain together from the bottom up: modlinkage ->
 * modlstrmod -> tun_fmodsw -> tuninfo -> tunrinit/tunwinit -> info.
 * modlinkage is what _init(), _fini() and _info() hand to
 * mod_install(), mod_remove() and mod_info().
 */
static struct module_info info = {
	TUN_MODID,	/* module id number */
	TUN_NAME,	/* module name */
	1,		/* min packet size accepted */
	INFPSZ,		/* max packet size accepted */
	65536,		/* hi-water mark */
	1024		/* lo-water mark */
};

/* Read-side queue procedures; open and close are supplied here only. */
static struct qinit tunrinit = {
	(pfi_t)tun_rput,	/* read side put procedure */
	(pfi_t)tun_rsrv,	/* read side service procedure */
	tun_open,		/* open procedure */
	tun_close,		/* close procedure */
	NULL,			/* for future use */
	&info,			/* module information structure */
	NULL			/* module statistics structure */
};

/* Write-side queue procedures; same slot order as tunrinit above. */
static struct qinit tunwinit = {
	(pfi_t)tun_wput,	/* write side put procedure */
	(pfi_t)tun_wsrv,	/* write side service procedure */
	NULL,			/* open procedure (none) */
	NULL,			/* close procedure (none) */
	NULL,			/* for future use */
	&info,			/* module information structure */
	NULL			/* module statistics structure */
};

struct streamtab tuninfo = {
	&tunrinit,		/* read side queue init */
	&tunwinit,		/* write side queue init */
	NULL,			/* mux read side init */
	NULL			/* mux write side init */
};

/*
 * Module switch entry: name, streamtab and MT flags.
 * NOTE(review): tun_recover() expects that two put procedures may run
 * concurrently, which matches D_MTPUTSHARED; confirm against the STREAMS
 * MT documentation.
 */
static struct fmodsw tun_fmodsw = {
	TUN_NAME,
	&tuninfo,
	(D_MP | D_MTQPAIR | D_MTPUTSHARED)
};

/* Loadable STREAMS-module linkage: ops vector, description, fmodsw. */
static struct modlstrmod modlstrmod = {
	&mod_strmodops,
	"configured tunneling module",
	&tun_fmodsw
};

/* Top-level module linkage; the list of linkage structures ends with NULL. */
static struct modlinkage modlinkage = {
	MODREV_1,
	&modlstrmod,
	NULL
};
    309 
    310 int
    311 _init(void)
    312 {
    313 	int	rc;
    314 
    315 	IP_MAJ = ddi_name_to_major(IP);
    316 	IP6_MAJ = ddi_name_to_major(IP6);
    317 
    318 	/*
    319 	 * We want to be informed each time a stack is created or
    320 	 * destroyed in the kernel, so we can maintain the
    321 	 * set of tun_stack_t's.
    322 	 */
    323 	netstack_register(NS_TUN, tun_stack_init, NULL, tun_stack_fini);
    324 
    325 	rc = mod_install(&modlinkage);
    326 	if (rc != 0)
    327 		netstack_unregister(NS_TUN);
    328 
    329 	return (rc);
    330 }
    331 
    332 int
    333 _fini(void)
    334 {
    335 	int error;
    336 
    337 	error = mod_remove(&modlinkage);
    338 	if (error == 0)
    339 		netstack_unregister(NS_TUN);
    340 
    341 	return (error);
    342 }
    343 
    344 int
    345 _info(struct modinfo *modinfop)
    346 {
    347 	return (mod_info(&modlinkage, modinfop));
    348 }
    349 
    350 /*
    351  * this module is meant to be pushed on an instance of IP and
    352  * have an instance of IP pushed on top of it.
    353  */
    354 
    355 /* ARGSUSED */
    356 int
    357 tun_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
    358 {
    359 	tun_t	*atp;
    360 	mblk_t *hello;
    361 	ipsec_info_t *ii;
    362 	netstack_t *ns;
    363 	zoneid_t zoneid;
    364 
    365 	if (q->q_ptr != NULL) {
    366 		/* re-open of an already open instance */
    367 		return (0);
    368 	}
    369 
    370 	if (sflag != MODOPEN) {
    371 		return (EINVAL);
    372 	}
    373 
    374 	tun1dbg(("tun_open\n"));
    375 
    376 	ns = netstack_find_by_cred(credp);
    377 	ASSERT(ns != NULL);
    378 
    379 	/*
    380 	 * For exclusive stacks we set the zoneid to zero
    381 	 * to make IP operate as if in the global zone.
    382 	 */
    383 	if (ns->netstack_stackid != GLOBAL_NETSTACKID)
    384 		zoneid = GLOBAL_ZONEID;
    385 	else
    386 		zoneid = crgetzoneid(credp);
    387 
    388 	hello = allocb(sizeof (ipsec_info_t), BPRI_HI);
    389 	if (hello == NULL) {
    390 		netstack_rele(ns);
    391 		return (ENOMEM);
    392 	}
    393 
    394 	/* allocate per-instance structure */
    395 	atp = kmem_zalloc(sizeof (tun_t), KM_SLEEP);
    396 
    397 	atp->tun_state = DL_UNATTACHED;
    398 	atp->tun_dev = *devp;
    399 	atp->tun_zoneid = zoneid;
    400 	atp->tun_netstack = ns;
    401 
    402 	/*
    403 	 * Based on the lower version of IP, initialize stuff that
    404 	 * won't change
    405 	 */
    406 	if (getmajor(*devp) == IP_MAJ) {
    407 		ipha_t *ipha;
    408 
    409 		atp->tun_flags = TUN_L_V4 | TUN_HOP_LIM;
    410 		atp->tun_hop_limit = IPV6V4_DEF_TTL;
    411 
    412 		/*
    413 		 * The tunnel MTU is recalculated when we know more
    414 		 * about the tunnel destination.
    415 		 */
    416 		atp->tun_mtu = IP_MAXPACKET - sizeof (ipha_t);
    417 		ipha = &atp->tun_ipha;
    418 		ipha->ipha_version_and_hdr_length = IP_SIMPLE_HDR_VERSION;
    419 		ipha->ipha_type_of_service = 0;
    420 		ipha->ipha_ident = 0;		/* to be filled in by IP */
    421 		ipha->ipha_fragment_offset_and_flags = htons(IPH_DF);
    422 		ipha->ipha_ttl = atp->tun_hop_limit;
    423 		ipha->ipha_hdr_checksum = 0;	/* to be filled in by IP */
    424 	} else if (getmajor(*devp) == IP6_MAJ) {
    425 		atp->tun_flags = TUN_L_V6 | TUN_HOP_LIM | TUN_ENCAP_LIM;
    426 		atp->tun_hop_limit = IPV6_DEFAULT_HOPS;
    427 		atp->tun_encap_lim = IPV6_DEFAULT_ENCAPLIMIT;
    428 		atp->tun_mtu = IP_MAXPACKET - sizeof (ip6_t) -
    429 		    IPV6_TUN_ENCAP_OPT_LEN;
    430 		atp->tun_ip6h.ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
    431 		atp->tun_ip6h.ip6_hops = IPV6_DEFAULT_HOPS;
    432 	} else {
    433 		netstack_rele(ns);
    434 		kmem_free(atp, sizeof (tun_t));
    435 		return (ENXIO);
    436 	}
    437 
    438 	atp->tun_extra_offset = TUN_LINK_EXTRA_OFF;
    439 	mutex_init(&atp->tun_lock, NULL, MUTEX_DEFAULT, NULL);
    440 
    441 	/*
    442 	 * If this is the automatic tunneling module, atun, verify that the
    443 	 * lower protocol is IPv4 and set TUN_AUTOMATIC.  Since we don't do
    444 	 * automatic tunneling over IPv6, trying to run over IPv6 is an error,
    445 	 * so free memory and return an error.
    446 	 */
    447 	if (q->q_qinfo->qi_minfo->mi_idnum == ATUN_MODID) {
    448 		if (atp->tun_flags & TUN_L_V4) {
    449 			atp->tun_flags |= TUN_AUTOMATIC;
    450 			atp->tun_mtu = ATUN_MTU;
    451 		} else {
    452 			/* Error. */
    453 			netstack_rele(ns);
    454 			kmem_free(atp, sizeof (tun_t));
    455 			return (ENXIO);
    456 		}
    457 	} else if (q->q_qinfo->qi_minfo->mi_idnum == TUN6TO4_MODID) {
    458 		/*
    459 		 * Set 6to4 flag if this is the 6to4tun module and make
    460 		 * the same checks mentioned above.
    461 		 */
    462 		if (atp->tun_flags & TUN_L_V4) {
    463 			atp->tun_flags |= TUN_6TO4;
    464 			atp->tun_mtu = ATUN_MTU;
    465 		} else {
    466 			/* Error. */
    467 			netstack_rele(ns);
    468 			kmem_free(atp, sizeof (tun_t));
    469 			return (ENXIO);
    470 		}
    471 	}
    472 
    473 	q->q_ptr = WR(q)->q_ptr = atp;
    474 	atp->tun_wq = WR(q);
    475 	mutex_enter(&ns->netstack_tun->tuns_global_lock);
    476 	tun_add_byaddr(atp);
    477 	mutex_exit(&ns->netstack_tun->tuns_global_lock);
    478 	ii = (ipsec_info_t *)hello->b_rptr;
    479 	hello->b_wptr = hello->b_rptr + sizeof (*ii);
    480 	hello->b_datap->db_type = M_CTL;
    481 	ii->ipsec_info_type = TUN_HELLO;
    482 	ii->ipsec_info_len = sizeof (*ii);
    483 	qprocson(q);
    484 	putnext(WR(q), hello);
    485 	return (0);
    486 }
    487 
    488 /* ARGSUSED */
    489 int
    490 tun_close(queue_t *q, int flag, cred_t *cred_p)
    491 {
    492 	tun_t *atp = (tun_t *)q->q_ptr;
    493 	netstack_t *ns;
    494 	tun_stack_t *tuns;
    495 
    496 	ASSERT(atp != NULL);
    497 
    498 	ns = atp->tun_netstack;
    499 	tuns = ns->netstack_tun;
    500 
    501 	/* Cancel outstanding qtimeouts() or qbufcalls() */
    502 	tun_cancel_rec_evs(q, &atp->tun_events);
    503 
    504 	qprocsoff(q);
    505 
    506 	/* NOTE:  tun_rem_ppa_list() may unlink tun_itp from its AVL tree. */
    507 	if (atp->tun_stats != NULL)
    508 		tun_rem_ppa_list(atp);
    509 
    510 	if (atp->tun_itp != NULL) {
    511 		/* In brackets because of ITP_REFRELE's brackets. */
    512 		ITP_REFRELE(atp->tun_itp, ns);
    513 	}
    514 
    515 	netstack_rele(ns);
    516 
    517 	mutex_destroy(&atp->tun_lock);
    518 
    519 	/* remove tun_t from global list */
    520 	mutex_enter(&tuns->tuns_global_lock);
    521 	tun_rem_tun_byaddr_list(atp);
    522 	mutex_exit(&tuns->tuns_global_lock);
    523 
    524 	/* free per-instance struct  */
    525 	kmem_free(atp, sizeof (tun_t));
    526 
    527 	q->q_ptr = WR(q)->q_ptr = NULL;
    528 
    529 	return (0);
    530 }
    531 
    532 
    533 /*
    534  * Cancel bufcall and timer requests
    535  * Don't need to hold lock. protected by perimeter
    536  */
    537 static void
    538 tun_cancel_rec_evs(queue_t *q, eventid_t *evs)
    539 {
    540 	if (evs->ev_rbufcid != 0) {
    541 		qunbufcall(RD(q), evs->ev_rbufcid);
    542 		evs->ev_rbufcid = 0;
    543 	}
    544 	if (evs->ev_wbufcid != 0) {
    545 		qunbufcall(WR(q), evs->ev_wbufcid);
    546 		evs->ev_wbufcid = 0;
    547 	}
    548 	if (evs->ev_rtimoutid != 0) {
    549 		(void) quntimeout(RD(q), evs->ev_rtimoutid);
    550 		evs->ev_rtimoutid = 0;
    551 	}
    552 	if (evs->ev_wtimoutid != 0) {
    553 		(void) quntimeout(WR(q), evs->ev_wtimoutid);
    554 		evs->ev_wtimoutid = 0;
    555 	}
    556 }
    557 
    558 /*
    559  * Called by bufcall() when memory becomes available
    560  * Don't need to hold lock. protected by perimeter
    561  */
    562 static void
    563 tun_bufcall_handler(void *arg)
    564 {
    565 	queue_t		*q = arg;
    566 	tun_t		*atp = (tun_t *)q->q_ptr;
    567 	eventid_t	*evs;
    568 
    569 	ASSERT(atp);
    570 
    571 	evs = &atp->tun_events;
    572 	if ((q->q_flag & QREADR) != 0) {
    573 		ASSERT(evs->ev_rbufcid);
    574 		evs->ev_rbufcid = 0;
    575 	} else {
    576 		ASSERT(evs->ev_wbufcid);
    577 		evs->ev_wbufcid = 0;
    578 	}
    579 	enableok(q);
    580 	qenable(q);
    581 }
    582 
    583 /*
    584  * Called by timeout (if we couldn't do a bufcall)
    585  * Don't need to hold lock. protected by perimeter
    586  */
    587 static void
    588 tun_timeout_handler(void *arg)
    589 {
    590 	queue_t		*q = arg;
    591 	tun_t		*atp = (tun_t *)q->q_ptr;
    592 	eventid_t	*evs;
    593 
    594 	ASSERT(atp);
    595 	evs = &atp->tun_events;
    596 
    597 	if (q->q_flag & QREADR) {
    598 		ASSERT(evs->ev_rtimoutid);
    599 		evs->ev_rtimoutid = 0;
    600 	} else {
    601 		ASSERT(evs->ev_wtimoutid);
    602 		evs->ev_wtimoutid = 0;
    603 	}
    604 	enableok(q);
    605 	qenable(q);
    606 }
    607 
    608 /*
    609  * This routine is called when a message buffer can not
    610  * be allocated.  M_PCPROT message are converted to M_PROTO, but
    611  * other than that, the mblk passed in must not be a high
    612  * priority message (putting a hight priority message back on
    613  * the queue is a bad idea)
    614  * Side effect: the queue is disabled
    615  * (timeout or bufcall handler will re-enable the queue)
    616  * tun_cancel_rec_evs() must be called in close to cancel all
    617  * outstanding requests.
    618  */
    619 static void
    620 tun_recover(queue_t *q, mblk_t *mp, size_t size)
    621 {
    622 	tun_t	*atp = (tun_t *)q->q_ptr;
    623 	timeout_id_t	tid;
    624 	bufcall_id_t	bid;
    625 	eventid_t	*evs = &atp->tun_events;
    626 
    627 	ASSERT(mp != NULL);
    628 
    629 	/*
    630 	 * To avoid re-enabling the queue, change the high priority
    631 	 * M_PCPROTO message to a M_PROTO before putting it on the queue
    632 	 */
    633 	if (mp->b_datap->db_type == M_PCPROTO)
    634 		mp->b_datap->db_type = M_PROTO;
    635 
    636 	ASSERT(mp->b_datap->db_type < QPCTL);
    637 
    638 	(void) putbq(q, mp);
    639 
    640 	/*
    641 	 * Make sure there is at most one outstanding request per queue.
    642 	 */
    643 	if (q->q_flag & QREADR) {
    644 		if (evs->ev_rtimoutid || evs->ev_rbufcid)
    645 			return;
    646 	} else {
    647 		if (evs->ev_wtimoutid || evs->ev_wbufcid)
    648 			return;
    649 	}
    650 
    651 	noenable(q);
    652 	/*
    653 	 * locking is needed here because this routine may be called
    654 	 * with two puts() running
    655 	 */
    656 	mutex_enter(&atp->tun_lock);
    657 	if (!(bid = qbufcall(q, size, BPRI_MED, tun_bufcall_handler, q))) {
    658 		tid = qtimeout(q, tun_timeout_handler, q, TUN_RECOVER_WAIT);
    659 		if (q->q_flag & QREADR)
    660 			evs->ev_rtimoutid = tid;
    661 		else
    662 			evs->ev_wtimoutid = tid;
    663 	} else	{
    664 		if (q->q_flag & QREADR)
    665 			evs->ev_rbufcid = bid;
    666 		else
    667 			evs->ev_wbufcid = bid;
    668 	}
    669 	mutex_exit(&atp->tun_lock);
    670 }
    671 
    672 /*
    673  * tun_realloc_mblk(q, mp, size, orig_mp, copy)
    674  *
    675  * q - pointer to a queue_t, must not be NULL
    676  * mp - pointer to an mblk to copy, can be NULL
    677  * size - Number of bytes being (re)allocated
    678  * orig_mp - pointer to the original mblk_t which will be passed to
    679  *           tun_recover if the memory (re)allocation fails.  This is done
    680  *           so that the message can be rescheduled on the queue.
    681  *           orig_mp must be NULL if the original mblk_t is a high priority
    682  *           message of type other then M_PCPROTO.
    683  * copy - a boolean to specify wheater the contents of mp should be copied
    684  *        into the new mblk_t returned by this function.
    685  *
    686  * note: this routine will adjust the b_rptr and b_wptr of the
    687  * mblk.  Returns an mblk able to hold the requested size or
    688  * NULL if allocation failed. If copy is true, original
    689  * contents, if any, will be copied to new mblk
    690  */
    691 static mblk_t *
    692 tun_realloc_mblk(queue_t *q, mblk_t *mp, size_t size, mblk_t *orig_mp,
    693     boolean_t copy)
    694 {
    695 	/*
    696 	 * If we are passed in an mblk.. check to make sure that
    697 	 * it is big enough and we are the only users of the mblk
    698 	 * If not, then try and allocate one
    699 	 */
    700 	if (mp == NULL || mp->b_datap->db_lim - mp->b_datap->db_base < size ||
    701 	    mp->b_datap->db_ref > 1) {
    702 		size_t	asize;
    703 		mblk_t *newmp;
    704 
    705 		/* allocate at least as much as we had -- don't shrink */
    706 		if (mp != NULL) {
    707 			asize = MAX(size,
    708 			    mp->b_datap->db_lim - mp->b_datap->db_base);
    709 		} else {
    710 			asize = size;
    711 		}
    712 		newmp = allocb(asize, BPRI_HI);
    713 
    714 		if (newmp == NULL) {
    715 			/*
    716 			 * Reschedule the mblk via bufcall or timeout
    717 			 * if orig_mp is non-NULL
    718 			 */
    719 			if (orig_mp != NULL) {
    720 				tun_recover(q, orig_mp, asize);
    721 			}
    722 			tun1dbg(("tun_realloc_mblk: couldn't allocate" \
    723 			    " dl_ok_ack mblk\n"));
    724 			return (NULL);
    725 		}
    726 		if (mp != NULL) {
    727 			if (copy)
    728 				bcopy(mp->b_rptr, newmp->b_rptr,
    729 				    mp->b_wptr - mp->b_rptr);
    730 			newmp->b_datap->db_type = mp->b_datap->db_type;
    731 			freemsg(mp);
    732 		}
    733 		mp = newmp;
    734 	} else {
    735 		if (mp->b_rptr != mp->b_datap->db_base) {
    736 			if (copy)
    737 				bcopy(mp->b_rptr, mp->b_datap->db_base,
    738 				    mp->b_wptr - mp->b_rptr);
    739 			mp->b_rptr = mp->b_datap->db_base;
    740 		}
    741 	}
    742 	mp->b_wptr = mp->b_rptr + size;
    743 	return (mp);
    744 }
    745 
    746 
    747 /* send a DL_OK_ACK back upstream */
    748 static void
    749 tun_sendokack(queue_t *q, mblk_t *mp, t_uscalar_t prim)
    750 {
    751 	dl_ok_ack_t *dlok;
    752 
    753 	if ((mp = tun_realloc_mblk(q, mp, sizeof (dl_ok_ack_t), mp,
    754 	    B_FALSE)) == NULL) {
    755 		return;
    756 	}
    757 	dlok = (dl_ok_ack_t *)mp->b_rptr;
    758 	dlok->dl_primitive = DL_OK_ACK;
    759 	dlok->dl_correct_primitive = prim;
    760 	mp->b_datap->db_type = M_PCPROTO;
    761 	qreply(q, mp);
    762 }
    763 
    764 /*
    765  * Send a DL_NOTIFY_IND message with DL_NOTE_SDU_SIZE up to notify IP of a
    766  * link MTU change.
    767  */
    768 static void
    769 tun_sendsdusize(queue_t *q)
    770 {
    771 	tun_t		*atp = (tun_t *)q->q_ptr;
    772 	mblk_t		*mp = NULL;
    773 	dl_notify_ind_t	*notify;
    774 
    775 	if (!(atp->tun_notifications & DL_NOTE_SDU_SIZE))
    776 		return;
    777 
    778 	if ((mp = tun_realloc_mblk(q, NULL, DL_NOTIFY_IND_SIZE, NULL,
    779 	    B_FALSE)) == NULL) {
    780 		return;
    781 	}
    782 	mp->b_datap->db_type = M_PROTO;
    783 	notify = (dl_notify_ind_t *)mp->b_rptr;
    784 	notify->dl_primitive = DL_NOTIFY_IND;
    785 	notify->dl_notification = DL_NOTE_SDU_SIZE;
    786 	notify->dl_data = atp->tun_mtu;
    787 	notify->dl_addr_length = 0;
    788 	notify->dl_addr_offset = 0;
    789 
    790 	tun1dbg(("tun_sendsdusize: notifying ip of new mtu: %d", atp->tun_mtu));
    791 
    792 	/*
    793 	 * We send this notification to the upper IP instance who is using
    794 	 * us as a device.
    795 	 */
    796 	putnext(RD(q), mp);
    797 }
    798 
    799 /* send a DL_ERROR_ACK back upstream */
    800 static void
    801 tun_senderrack(queue_t *q, mblk_t *mp, t_uscalar_t prim, t_uscalar_t dl_err,
    802     t_uscalar_t error)
    803 {
    804 	dl_error_ack_t *dl_err_ack;
    805 
    806 	if ((mp = tun_realloc_mblk(q, mp, sizeof (dl_error_ack_t), mp,
    807 	    B_FALSE)) == NULL) {
    808 		return;
    809 	}
    810 
    811 	dl_err_ack = (dl_error_ack_t *)mp->b_rptr;
    812 	dl_err_ack->dl_error_primitive =  prim;
    813 	dl_err_ack->dl_primitive = DL_ERROR_ACK;
    814 	dl_err_ack->dl_errno = dl_err;
    815 	dl_err_ack->dl_unix_errno = error;
    816 	mp->b_datap->db_type = M_PCPROTO;
    817 	qreply(q, mp);
    818 }
    819 
    820 /*
    821  * Free all messages in an mblk chain and optionally collect
    822  * byte-counter stats.  Caller responsible for per-packet stats
    823  */
    824 static void
    825 tun_freemsg_chain(mblk_t *mp, uint64_t *bytecount)
    826 {
    827 	mblk_t *mpnext;
    828 	while (mp != NULL) {
    829 		ASSERT(mp->b_prev == NULL);
    830 		mpnext = mp->b_next;
    831 		mp->b_next = NULL;
    832 		if (bytecount != NULL)
    833 			atomic_add_64(bytecount, (int64_t)msgdsize(mp));
    834 		freemsg(mp);
    835 		mp = mpnext;
    836 	}
    837 }
    838 
    839 /*
    840  * Send all messages in a chain of mblk chains and optionally collect
    841  * byte-counter stats.  Caller responsible for per-packet stats, and insuring
    842  * mp is always non-NULL.
    843  *
    844  * This is a macro so we can save stack.  Assume the caller function
    845  * has local-variable "nmp" as a placeholder.  Define two versions, one with
    846  * byte-counting stats and one without.
    847  */
    848 #define	TUN_PUTMSG_CHAIN_STATS(q, mp, nmp, bytecount) \
    849 	(nmp) = NULL; \
    850 	ASSERT((mp) != NULL); \
    851 	do { \
    852 		if ((nmp) != NULL) \
    853 			putnext(q, (nmp)); \
    854 		ASSERT((mp)->b_prev == NULL); \
    855 		(nmp) = (mp); \
    856 		(mp) = (mp)->b_next; \
    857 		(nmp)->b_next = NULL; \
    858 		atomic_add_64(bytecount, (int64_t)msgdsize(nmp)); \
    859 	} while ((mp) != NULL); \
    860 \
    861 	putnext((q), (nmp))  /* trailing semicolon provided by instantiator. */
    862 
    863 #define	TUN_PUTMSG_CHAIN(q, mp, nmp) \
    864 	(nmp) = NULL; \
    865 	ASSERT((mp) != NULL); \
    866 	do { \
    867 		if ((nmp) != NULL) \
    868 			putnext(q, (nmp)); \
    869 		ASSERT((mp)->b_prev == NULL); \
    870 		(nmp) = (mp); \
    871 		(mp) = (mp)->b_next; \
    872 		(nmp)->b_next = NULL; \
    873 	} while ((mp) != NULL); \
    874 \
    875 	putnext((q), (nmp))  /* trailing semicolon provided by instantiator. */
    876 
    877 /*
    878  * Macro that not only checks tun_itp, but also sees if one got loaded
    879  * via ipsecconf(1m)/PF_POLICY behind our backs.  Note the sleazy update of
    880  * (tun)->tun_itp_gen so we don't lose races with other possible updates via
    881  * PF_POLICY.
    882  */
    883 #define	tun_policy_present(tun, ns, ipss)	\
    884 	(((tun)->tun_itp != NULL) || \
    885 	(((tun)->tun_itp_gen < ipss->ipsec_tunnel_policy_gen) && \
    886 	    ((tun)->tun_itp_gen = ipss->ipsec_tunnel_policy_gen) && \
    887 	    (((tun)->tun_itp = get_tunnel_policy((tun)->tun_lifname, ns)) \
    888 	    != NULL)))
    889 
    890 /*
    891  * Search tuns_byaddr_list for occurrence of tun_t with matching
    892  * inner addresses.  This function does not take into account
    893  * prefixes.  Possibly we could generalize this function in the
    894  * future with V6_MASK_EQ() and pass in an all 1's prefix for IP
    895  * address matches.
    896  * Returns NULL on no match.
    897  * This function is not directly called - it's assigned into itp_get_byaddr().
    898  */
    899 static ipsec_tun_pol_t *
    900 itp_get_byaddr_fn(uint32_t *lin, uint32_t *fin, int af, netstack_t *ns)
    901 {
    902 	tun_t	*tun_list;
    903 	uint_t index;
    904 	in6_addr_t lmapped, fmapped, *laddr, *faddr;
    905 	ipsec_stack_t *ipss = ns->netstack_ipsec;
    906 	tun_stack_t *tuns = ns->netstack_tun;
    907 
    908 	if (af == AF_INET) {
    909 		laddr = &lmapped;
    910 		faddr = &fmapped;
    911 		IN6_INADDR_TO_V4MAPPED((struct in_addr *)lin, laddr);
    912 		IN6_INADDR_TO_V4MAPPED((struct in_addr *)fin, faddr);
    913 	} else {
    914 		laddr = (in6_addr_t *)lin;
    915 		faddr = (in6_addr_t *)fin;
    916 	}
    917 
    918 	index = TUN_BYADDR_LIST_HASH(*faddr);
    919 
    920 	/*
    921 	 * it's ok to grab global lock while holding tun_lock/perimeter
    922 	 */
    923 	mutex_enter(&tuns->tuns_global_lock);
    924 
    925 	/*
    926 	 * walk through list of tun_t looking for a match of
    927 	 * inner addresses.  Addresses are inserted with
    928 	 * IN6_IPADDR_TO_V4MAPPED(), so v6 matching works for
    929 	 * all cases.
    930 	 */
    931 	for (tun_list = tuns->tuns_byaddr_list[index]; tun_list;
    932 	    tun_list = tun_list->tun_next) {
    933 		if (IN6_ARE_ADDR_EQUAL(&tun_list->tun_laddr, laddr) &&
    934 		    IN6_ARE_ADDR_EQUAL(&tun_list->tun_faddr, faddr)) {
    935 			ipsec_tun_pol_t *itp;
    936 
    937 			if (!tun_policy_present(tun_list, ns, ipss)) {
    938 				tun1dbg(("itp_get_byaddr: No IPsec policy on "
    939 				    "matching tun_t instance %p/%s\n",
    940 				    (void *)tun_list, tun_list->tun_lifname));
    941 				continue;
    942 			}
    943 			tun1dbg(("itp_get_byaddr: Found matching tun_t %p with "
    944 			    "IPsec policy\n", (void *)tun_list));
    945 			mutex_enter(&tun_list->tun_itp->itp_lock);
    946 			itp = tun_list->tun_itp;
    947 			mutex_exit(&tuns->tuns_global_lock);
    948 			ITP_REFHOLD(itp);
    949 			mutex_exit(&itp->itp_lock);
    950 			tun1dbg(("itp_get_byaddr: Found itp %p \n",
    951 			    (void *)itp));
    952 			return (itp);
    953 		}
    954 	}
    955 
    956 	/* didn't find one, return zilch */
    957 
    958 	tun1dbg(("itp_get_byaddr: No matching tunnel instances with policy\n"));
    959 	mutex_exit(&tuns->tuns_global_lock);
    960 	return (NULL);
    961 }
    962 
    963 /*
    964  * Search tuns_byaddr_list for occurrence of tun_t, same upper and lower stream,
    965  * and same type (6to4 vs automatic vs configured)
    966  * If none is found, insert this tun entry.
    967  */
    968 static void
    969 tun_add_byaddr(tun_t *atp)
    970 {
    971 	tun_t	*tun_list;
    972 	t_uscalar_t	ppa = atp->tun_ppa;
    973 	uint_t	mask = atp->tun_flags & (TUN_LOWER_MASK | TUN_UPPER_MASK);
    974 	uint_t	tun_type = (atp->tun_flags & (TUN_AUTOMATIC | TUN_6TO4));
    975 	uint_t index = TUN_BYADDR_LIST_HASH(atp->tun_faddr);
    976 	tun_stack_t *tuns = atp->tun_netstack->netstack_tun;
    977 
    978 	tun1dbg(("tun_add_byaddr: index = %d\n", index));
    979 
    980 	ASSERT(MUTEX_HELD(&tuns->tuns_global_lock));
    981 	ASSERT(atp->tun_next == NULL);
    982 
    983 	/*
    984 	 * walk through list of tun_t looking for a match of
    985 	 * ppa, same upper and lower stream and same tunnel type
    986 	 * (automatic or configured).
    987 	 * There shouldn't be all that many tunnels, so a sequential
    988 	 * search of the bucket should be fine.
    989 	 */
    990 	for (tun_list = tuns->tuns_byaddr_list[index]; tun_list;
    991 	    tun_list = tun_list->tun_next) {
    992 		if (tun_list->tun_ppa == ppa &&
    993 		    ((tun_list->tun_flags & (TUN_LOWER_MASK |
    994 		    TUN_UPPER_MASK)) == mask) &&
    995 		    ((tun_list->tun_flags & (TUN_AUTOMATIC | TUN_6TO4)) ==
    996 		    tun_type)) {
    997 			tun1dbg(("tun_add_byaddr: tun 0x%p Found ppa %d " \
    998 			    "tun_stats 0x%p\n", (void *)atp, ppa,
    999 			    (void *)tun_list));
   1000 			tun1dbg(("tun_add_byaddr: Nothing to do."));
   1001 			/* Collision, do nothing. */
   1002 			return;
   1003 		}
   1004 	}
   1005 
   1006 	/* didn't find one, throw it in the global list */
   1007 
   1008 	atp->tun_next = tuns->tuns_byaddr_list[index];
   1009 	atp->tun_ptpn = &(tuns->tuns_byaddr_list[index]);
   1010 	if (tuns->tuns_byaddr_list[index] != NULL)
   1011 		tuns->tuns_byaddr_list[index]->tun_ptpn = &(atp->tun_next);
   1012 	tuns->tuns_byaddr_list[index] = atp;
   1013 }
   1014 
   1015 /*
   1016  * Search tuns_ppa_list for occurrence of tun_ppa, same lower stream,
   1017  * and same type (6to4 vs automatic vs configured)
   1018  * If none is found, insert this tun entry and create a new kstat for
   1019  * the entry.
   1020  * This is needed so that multiple tunnels with the same interface
   1021  * name (e.g. ip.tun0 under IPv4 and ip.tun0 under IPv6) can share the
   1022  * same kstats. (they share the same tun_stat and kstat)
   1023  * Don't need to hold tun_lock if we are coming in as qwriter()
   1024  */
   1025 static tun_stats_t *
   1026 tun_add_stat(queue_t *q)
   1027 {
   1028 	tun_t		*atp = (tun_t *)q->q_ptr;
   1029 	tun_stats_t	*tun_list;
   1030 	tun_stats_t	*tun_stat;
   1031 	t_uscalar_t	ppa = atp->tun_ppa;
   1032 	uint_t	lower = atp->tun_flags & TUN_LOWER_MASK;
   1033 	uint_t	tun_type = (atp->tun_flags & (TUN_AUTOMATIC | TUN_6TO4));
   1034 	uint_t index = TUN_LIST_HASH(ppa);
   1035 	tun_stack_t *tuns = atp->