Home | History | Annotate | Download | only in ip
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*
     27  * IP PACKET CLASSIFIER
     28  *
     29  * The IP packet classifier provides mapping between IP packets and persistent
     30  * connection state for connection-oriented protocols. It also provides
     31  * interface for managing connection states.
     32  *
     33  * The connection state is kept in conn_t data structure and contains, among
     34  * other things:
     35  *
     36  *	o local/remote address and ports
     37  *	o Transport protocol
     38  *	o squeue for the connection (for TCP only)
     39  *	o reference counter
     40  *	o Connection state
     41  *	o hash table linkage
     42  *	o interface/ire information
     43  *	o credentials
     44  *	o ipsec policy
     45  *	o send and receive functions.
     46  *	o mutex lock.
     47  *
     48  * Connections use a reference counting scheme. They are freed when the
     49  * reference counter drops to zero. A reference is incremented when connection
     50  * is placed in a list or table, when incoming packet for the connection arrives
     51  * and when connection is processed via squeue (squeue processing may be
     52  * asynchronous and the reference protects the connection from being destroyed
     53  * before its processing is finished).
     54  *
     55  * send and receive functions are currently used for TCP only. The send function
     56  * determines the IP entry point for the packet once it leaves TCP to be sent to
     57  * the destination address. The receive function is used by IP when the packet
     58  * should be passed for TCP processing. When a new connection is created these
     59  * are set to ip_output() and tcp_input() respectively. During the lifetime of
     60  * the connection the send and receive functions may change depending on the
     61  * changes in the connection state. For example, once the connection is bound to
     62  * an address, the receive function for this connection is set to
     63  * tcp_conn_request().  This allows incoming SYNs to go directly into the
     64  * listener SYN processing function without going to tcp_input() first.
     65  *
     66  * Classifier uses several hash tables:
     67  *
     68  * 	ipcl_conn_fanout:	contains all TCP connections in CONNECTED state
     69  *	ipcl_bind_fanout:	contains all connections in BOUND state
     70  *	ipcl_proto_fanout:	IPv4 protocol fanout
     71  *	ipcl_proto_fanout_v6:	IPv6 protocol fanout
     72  *	ipcl_udp_fanout:	contains all UDP connections
     73  *	ipcl_iptun_fanout:	contains all IP tunnel connections
     74  *	ipcl_globalhash_fanout:	contains all connections
     75  *
     76  * The ipcl_globalhash_fanout is used for any walkers (like snmp and Clustering)
     77  * which need to view all existing connections.
     78  *
     79  * All tables are protected by per-bucket locks. When both per-bucket lock and
     80  * connection lock need to be held, the per-bucket lock should be acquired
     81  * first, followed by the connection lock.
     82  *
     83  * All functions doing search in one of these tables increment a reference
     84  * counter on the connection found (if any). This reference should be dropped
     85  * when the caller has finished processing the connection.
     86  *
     87  *
     88  * INTERFACES:
     89  * ===========
     90  *
     91  * Connection Lookup:
     92  * ------------------
     93  *
     94  * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, zoneid, ip_stack)
     95  * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, zoneid, ip_stack)
     96  *
     97  * Finds connection for an incoming IPv4 or IPv6 packet. Returns NULL if
     98  * it can't find any associated connection. If the connection is found, its
     99  * reference counter is incremented.
    100  *
    101  *	mp:	mblock, containing packet header. The full header should fit
    102  *		into a single mblock. It should also contain at least full IP
    103  *		and TCP or UDP header.
    104  *
    105  *	protocol: Either IPPROTO_TCP or IPPROTO_UDP.
    106  *
    107  *	hdr_len: The size of IP header. It is used to find TCP or UDP header in
    108  *		 the packet.
    109  *
    110  * 	zoneid: The zone in which the returned connection must be; the zoneid
    111  *		corresponding to the ire_zoneid on the IRE located for the
    112  *		packet's destination address.
    113  *
    114  *	For TCP connections, the lookup order is as follows:
    115  *		5-tuple {src, dst, protocol, local port, remote port}
    116  *			lookup in ipcl_conn_fanout table.
    117  *		3-tuple {dst, remote port, protocol} lookup in
    118  *			ipcl_bind_fanout table.
    119  *
    120  *	For UDP connections, a 5-tuple {src, dst, protocol, local port,
    121  *	remote port} lookup is done on ipcl_udp_fanout. Note that,
    122  *	these interfaces do not handle cases where a packet belongs
    123  *	to multiple UDP clients, which is handled in IP itself.
    124  *
    125  * If the destination IRE is ALL_ZONES (indicated by zoneid), then we must
    126  * determine which actual zone gets the segment.  This is used only in a
    127  * labeled environment.  The matching rules are:
    128  *
    129  *	- If it's not a multilevel port, then the label on the packet selects
    130  *	  the zone.  Unlabeled packets are delivered to the global zone.
    131  *
    132  *	- If it's a multilevel port, then only the zone registered to receive
    133  *	  packets on that port matches.
    134  *
    135  * Also, in a labeled environment, packet labels need to be checked.  For fully
    136  * bound TCP connections, we can assume that the packet label was checked
    137  * during connection establishment, and doesn't need to be checked on each
    138  * packet.  For others, though, we need to check for strict equality or, for
    139  * multilevel ports, membership in the range or set.  This part currently does
    140  * a tnrh lookup on each packet, but could be optimized to use cached results
    141  * if that were necessary.  (SCTP doesn't come through here, but if it did,
    142  * we would apply the same rules as TCP.)
    143  *
    144  * An implication of the above is that fully-bound TCP sockets must always use
    145  * distinct 4-tuples; they can't be discriminated by label alone.
    146  *
    147  * Note that we cannot trust labels on packets sent to fully-bound UDP sockets,
    148  * as there's no connection set-up handshake and no shared state.
    149  *
    150  * Labels on looped-back packets within a single zone do not need to be
    151  * checked, as all processes in the same zone have the same label.
    152  *
    153  * Finally, for unlabeled packets received by a labeled system, special rules
    154  * apply.  We consider only the MLP if there is one.  Otherwise, we prefer a
    155  * socket in the zone whose label matches the default label of the sender, if
    156  * any.  In any event, the receiving socket must have SO_MAC_EXEMPT set and the
    157  * receiver's label must dominate the sender's default label.
    158  *
    159  * conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcph_t *, int, ip_stack);
    160  * conn_t *ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t,
    161  *					 ip_stack);
    162  *
    163  *	Lookup routine to find an exact match for {src, dst, local port,
    164  *	remote port} for TCP connections in ipcl_conn_fanout. The address and
    165  *	ports are read from the IP and TCP header respectively.
    166  *
    167  * conn_t	*ipcl_lookup_listener_v4(lport, laddr, protocol,
    168  *					 zoneid, ip_stack);
    169  * conn_t	*ipcl_lookup_listener_v6(lport, laddr, protocol, ifindex,
    170  *					 zoneid, ip_stack);
    171  *
    172  * 	Lookup routine to find a listener with the tuple {lport, laddr,
    173  * 	protocol} in the ipcl_bind_fanout table. For IPv6, an additional
    174  * 	parameter interface index is also compared.
    175  *
    176  * void ipcl_walk(func, arg, ip_stack)
    177  *
    178  * 	Apply 'func' to every connection available. The 'func' is called as
    179  *	(*func)(connp, arg). The walk is non-atomic so connections may be
    180  *	created and destroyed during the walk. The CONN_CONDEMNED and
    181  *	CONN_INCIPIENT flags ensure that connections which are newly created
    182  *	or being destroyed are not selected by the walker.
    183  *
    184  * Table Updates
    185  * -------------
    186  *
    187  * int ipcl_conn_insert(connp, protocol, src, dst, ports)
    188  * int ipcl_conn_insert_v6(connp, protocol, src, dst, ports, ifindex)
    189  *
    190  *	Insert 'connp' in the ipcl_conn_fanout.
    191  *	Arguments :
    192  *		connp		conn_t to be inserted
    193  *		protocol	connection protocol
    194  *		src		source address
    195  *		dst		destination address
    196  *		ports		local and remote port
    197  *		ifindex		interface index for IPv6 connections
    198  *
    199  *	Return value :
    200  *		0		if connp was inserted
    201  *		EADDRINUSE	if the connection with the same tuple
    202  *				already exists.
    203  *
    204  * int ipcl_bind_insert(connp, protocol, src, lport);
    205  * int ipcl_bind_insert_v6(connp, protocol, src, lport);
    206  *
    207  * 	Insert 'connp' in ipcl_bind_fanout.
    208  * 	Arguments :
    209  * 		connp		conn_t to be inserted
    210  * 		protocol	connection protocol
    211  * 		src		source address connection wants
    212  * 				to bind to
    213  * 		lport		local port connection wants to
    214  * 				bind to
    215  *
    216  *
    217  * void ipcl_hash_remove(connp);
    218  *
    219  * 	Removes the 'connp' from the connection fanout table.
    220  *
    221  * Connection Creation/Destruction
    222  * -------------------------------
    223  *
    224  * conn_t *ipcl_conn_create(type, sleep, netstack_t *)
    225  *
    226  * 	Creates a new conn based on the type flag, inserts it into
    227  * 	globalhash table.
    228  *
    229  *	type:	This flag determines the type of conn_t which needs to be
    230  *		created i.e., which kmem_cache it comes from.
    231  *		IPCL_TCPCONN	indicates a TCP connection
    232  *		IPCL_SCTPCONN	indicates a SCTP connection
    233  *		IPCL_UDPCONN	indicates a UDP conn_t.
    234  *		IPCL_RAWIPCONN	indicates a RAWIP/ICMP conn_t.
    235  *		IPCL_RTSCONN	indicates a RTS conn_t.
    236  *		IPCL_IPCCONN	indicates all other connections.
    237  *
    238  * void ipcl_conn_destroy(connp)
    239  *
    240  * 	Destroys the connection state, removes it from the global
    241  * 	connection hash table and frees its memory.
    242  */
    243 
    244 #include <sys/types.h>
    245 #include <sys/stream.h>
    246 #include <sys/stropts.h>
    247 #include <sys/sysmacros.h>
    248 #include <sys/strsubr.h>
    249 #include <sys/strsun.h>
    250 #define	_SUN_TPI_VERSION 2
    251 #include <sys/ddi.h>
    252 #include <sys/cmn_err.h>
    253 #include <sys/debug.h>
    254 
    255 #include <sys/systm.h>
    256 #include <sys/param.h>
    257 #include <sys/kmem.h>
    258 #include <sys/isa_defs.h>
    259 #include <inet/common.h>
    260 #include <netinet/ip6.h>
    261 #include <netinet/icmp6.h>
    262 
    263 #include <inet/ip.h>
    264 #include <inet/ip6.h>
    265 #include <inet/ip_ndp.h>
    266 #include <inet/ip_impl.h>
    267 #include <inet/udp_impl.h>
    268 #include <inet/sctp_ip.h>
    269 #include <inet/sctp/sctp_impl.h>
    270 #include <inet/rawip_impl.h>
    271 #include <inet/rts_impl.h>
    272 #include <inet/iptun/iptun_impl.h>
    273 
    274 #include <sys/cpuvar.h>
    275 
    276 #include <inet/ipclassifier.h>
    277 #include <inet/tcp.h>
    278 #include <inet/ipsec_impl.h>
    279 
    280 #include <sys/tsol/tnet.h>
    281 #include <sys/sockio.h>
    282 
    283 #ifdef DEBUG
    284 #define	IPCL_DEBUG
    285 #else
    286 #undef	IPCL_DEBUG
    287 #endif
    288 
    289 #ifdef	IPCL_DEBUG
    290 int	ipcl_debug_level = 0;
    291 #define	IPCL_DEBUG_LVL(level, args)	\
    292 	if (ipcl_debug_level  & level) { printf args; }
    293 #else
    294 #define	IPCL_DEBUG_LVL(level, args) {; }
    295 #endif
    296 /* Old value for compatibility. Settable in /etc/system */
    297 uint_t tcp_conn_hash_size = 0;
    298 
    299 /* New value. Zero means choose automatically.  Settable in /etc/system */
    300 uint_t ipcl_conn_hash_size = 0;
    301 uint_t ipcl_conn_hash_memfactor = 8192;
    302 uint_t ipcl_conn_hash_maxsize = 82500;
    303 
    304 /* bind/udp fanout table size */
    305 uint_t ipcl_bind_fanout_size = 512;
    306 uint_t ipcl_udp_fanout_size = 16384;
    307 
    308 /* Raw socket fanout size.  Must be a power of 2. */
    309 uint_t ipcl_raw_fanout_size = 256;
    310 
    311 /*
    312  * The IPCL_IPTUN_HASH() function works best with a prime table size.  We
    313  * expect that most large deployments would have hundreds of tunnels, and
    314  * thousands in the extreme case.
    315  */
    316 uint_t ipcl_iptun_fanout_size = 6143;
    317 
    318 /*
    319  * Power of 2^N Primes useful for hashing for N of 0-28,
    320  * these primes are the nearest prime <= 2^N - 2^(N-2).
    321  */
    322 
    323 #define	P2Ps() {0, 0, 0, 5, 11, 23, 47, 89, 191, 383, 761, 1531, 3067,	\
    324 		6143, 12281, 24571, 49139, 98299, 196597, 393209,	\
    325 		786431, 1572853, 3145721, 6291449, 12582893, 25165813,	\
    326 		50331599, 100663291, 201326557, 0}
    327 
/*
 * wrapper structure to ensure that conn and what follows it (tcp_t, etc)
 * are aligned on cache lines.
 *
 * Being a union, itc_t is as large as its biggest member, so sizeof (itc_t)
 * is at least CACHE_ALIGN(conn_s) bytes.  The per-protocol caches are sized
 * as sizeof (itc_t) plus the protocol structure.
 */
typedef union itc_s {
	/* The connection state proper. */
	conn_t	itc_conn;
	/*
	 * Padding only, never accessed by name in this view.
	 * NOTE(review): CACHE_ALIGN() presumably rounds the size of conn_s up
	 * to a whole number of cache lines -- confirm against its definition.
	 */
	char	itcu_filler[CACHE_ALIGN(conn_s)];
} itc_t;
    336 
    337 struct kmem_cache  *tcp_conn_cache;
    338 struct kmem_cache  *ip_conn_cache;
    339 struct kmem_cache  *ip_helper_stream_cache;
    340 extern struct kmem_cache  *sctp_conn_cache;
    341 extern struct kmem_cache  *tcp_sack_info_cache;
    342 extern struct kmem_cache  *tcp_iphc_cache;
    343 struct kmem_cache  *udp_conn_cache;
    344 struct kmem_cache  *rawip_conn_cache;
    345 struct kmem_cache  *rts_conn_cache;
    346 
    347 extern void	tcp_timermp_free(tcp_t *);
    348 extern mblk_t	*tcp_timermp_alloc(int);
    349 
    350 static int	ip_conn_constructor(void *, void *, int);
    351 static void	ip_conn_destructor(void *, void *);
    352 
    353 static int	tcp_conn_constructor(void *, void *, int);
    354 static void	tcp_conn_destructor(void *, void *);
    355 
    356 static int	udp_conn_constructor(void *, void *, int);
    357 static void	udp_conn_destructor(void *, void *);
    358 
    359 static int	rawip_conn_constructor(void *, void *, int);
    360 static void	rawip_conn_destructor(void *, void *);
    361 
    362 static int	rts_conn_constructor(void *, void *, int);
    363 static void	rts_conn_destructor(void *, void *);
    364 
    365 static int	ip_helper_stream_constructor(void *, void *, int);
    366 static void	ip_helper_stream_destructor(void *, void *);
    367 
    368 boolean_t	ip_use_helper_cache = B_TRUE;
    369 
    370 /*
    371  * Hook functions to enable cluster networking
    372  * On non-clustered systems these vectors must always be NULL.
    373  */
    374 extern void	(*cl_inet_listen)(netstackid_t, uint8_t, sa_family_t,
    375 		    uint8_t *, in_port_t, void *);
    376 extern void	(*cl_inet_unlisten)(netstackid_t, uint8_t, sa_family_t,
    377 		    uint8_t *, in_port_t, void *);
    378 
    379 #ifdef	IPCL_DEBUG
    380 #define	INET_NTOA_BUFSIZE	18
    381 
    382 static char *
    383 inet_ntoa_r(uint32_t in, char *b)
    384 {
    385 	unsigned char	*p;
    386 
    387 	p = (unsigned char *)&in;
    388 	(void) sprintf(b, "%d.%d.%d.%d", p[0], p[1], p[2], p[3]);
    389 	return (b);
    390 }
    391 #endif
    392 
    393 /*
    394  * Global (for all stack instances) init routine
    395  */
    396 void
    397 ipcl_g_init(void)
    398 {
    399 	ip_conn_cache = kmem_cache_create("ip_conn_cache",
    400 	    sizeof (conn_t), CACHE_ALIGN_SIZE,
    401 	    ip_conn_constructor, ip_conn_destructor,
    402 	    NULL, NULL, NULL, 0);
    403 
    404 	tcp_conn_cache = kmem_cache_create("tcp_conn_cache",
    405 	    sizeof (itc_t) + sizeof (tcp_t), CACHE_ALIGN_SIZE,
    406 	    tcp_conn_constructor, tcp_conn_destructor,
    407 	    NULL, NULL, NULL, 0);
    408 
    409 	udp_conn_cache = kmem_cache_create("udp_conn_cache",
    410 	    sizeof (itc_t) + sizeof (udp_t), CACHE_ALIGN_SIZE,
    411 	    udp_conn_constructor, udp_conn_destructor,
    412 	    NULL, NULL, NULL, 0);
    413 
    414 	rawip_conn_cache = kmem_cache_create("rawip_conn_cache",
    415 	    sizeof (itc_t) + sizeof (icmp_t), CACHE_ALIGN_SIZE,
    416 	    rawip_conn_constructor, rawip_conn_destructor,
    417 	    NULL, NULL, NULL, 0);
    418 
    419 	rts_conn_cache = kmem_cache_create("rts_conn_cache",
    420 	    sizeof (itc_t) + sizeof (rts_t), CACHE_ALIGN_SIZE,
    421 	    rts_conn_constructor, rts_conn_destructor,
    422 	    NULL, NULL, NULL, 0);
    423 
    424 	if (ip_use_helper_cache) {
    425 		ip_helper_stream_cache = kmem_cache_create
    426 		    ("ip_helper_stream_cache", sizeof (ip_helper_stream_info_t),
    427 		    CACHE_ALIGN_SIZE, ip_helper_stream_constructor,
    428 		    ip_helper_stream_destructor, NULL, NULL, NULL, 0);
    429 	} else {
    430 		ip_helper_stream_cache = NULL;
    431 	}
    432 }
    433 
    434 /*
    435  * ipclassifier intialization routine, sets up hash tables.
    436  */
    437 void
    438 ipcl_init(ip_stack_t *ipst)
    439 {
    440 	int i;
    441 	int sizes[] = P2Ps();
    442 
    443 	/*
    444 	 * Calculate size of conn fanout table from /etc/system settings
    445 	 */
    446 	if (ipcl_conn_hash_size != 0) {
    447 		ipst->ips_ipcl_conn_fanout_size = ipcl_conn_hash_size;
    448 	} else if (tcp_conn_hash_size != 0) {
    449 		ipst->ips_ipcl_conn_fanout_size = tcp_conn_hash_size;
    450 	} else {
    451 		extern pgcnt_t freemem;
    452 
    453 		ipst->ips_ipcl_conn_fanout_size =
    454 		    (freemem * PAGESIZE) / ipcl_conn_hash_memfactor;
    455 
    456 		if (ipst->ips_ipcl_conn_fanout_size > ipcl_conn_hash_maxsize) {
    457 			ipst->ips_ipcl_conn_fanout_size =
    458 			    ipcl_conn_hash_maxsize;
    459 		}
    460 	}
    461 
    462 	for (i = 9; i < sizeof (sizes) / sizeof (*sizes) - 1; i++) {
    463 		if (sizes[i] >= ipst->ips_ipcl_conn_fanout_size) {
    464 			break;
    465 		}
    466 	}
    467 	if ((ipst->ips_ipcl_conn_fanout_size = sizes[i]) == 0) {
    468 		/* Out of range, use the 2^16 value */
    469 		ipst->ips_ipcl_conn_fanout_size = sizes[16];
    470 	}
    471 
    472 	/* Take values from /etc/system */
    473 	ipst->ips_ipcl_bind_fanout_size = ipcl_bind_fanout_size;
    474 	ipst->ips_ipcl_udp_fanout_size = ipcl_udp_fanout_size;
    475 	ipst->ips_ipcl_raw_fanout_size = ipcl_raw_fanout_size;
    476 	ipst->ips_ipcl_iptun_fanout_size = ipcl_iptun_fanout_size;
    477 
    478 	ASSERT(ipst->ips_ipcl_conn_fanout == NULL);
    479 
    480 	ipst->ips_ipcl_conn_fanout = kmem_zalloc(
    481 	    ipst->ips_ipcl_conn_fanout_size * sizeof (connf_t), KM_SLEEP);
    482 
    483 	for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
    484 		mutex_init(&ipst->ips_ipcl_conn_fanout[i].connf_lock, NULL,
    485 		    MUTEX_DEFAULT, NULL);
    486 	}
    487 
    488 	ipst->ips_ipcl_bind_fanout = kmem_zalloc(
    489 	    ipst->ips_ipcl_bind_fanout_size * sizeof (connf_t), KM_SLEEP);
    490 
    491 	for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
    492 		mutex_init(&ipst->ips_ipcl_bind_fanout[i].connf_lock, NULL,
    493 		    MUTEX_DEFAULT, NULL);
    494 	}
    495 
    496 	ipst->ips_ipcl_proto_fanout = kmem_zalloc(IPPROTO_MAX *
    497 	    sizeof (connf_t), KM_SLEEP);
    498 	for (i = 0; i < IPPROTO_MAX; i++) {
    499 		mutex_init(&ipst->ips_ipcl_proto_fanout[i].connf_lock, NULL,
    500 		    MUTEX_DEFAULT, NULL);
    501 	}
    502 
    503 	ipst->ips_ipcl_proto_fanout_v6 = kmem_zalloc(IPPROTO_MAX *
    504 	    sizeof (connf_t), KM_SLEEP);
    505 	for (i = 0; i < IPPROTO_MAX; i++) {
    506 		mutex_init(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock, NULL,
    507 		    MUTEX_DEFAULT, NULL);
    508 	}
    509 
    510 	ipst->ips_rts_clients = kmem_zalloc(sizeof (connf_t), KM_SLEEP);
    511 	mutex_init(&ipst->ips_rts_clients->connf_lock,
    512 	    NULL, MUTEX_DEFAULT, NULL);
    513 
    514 	ipst->ips_ipcl_udp_fanout = kmem_zalloc(
    515 	    ipst->ips_ipcl_udp_fanout_size * sizeof (connf_t), KM_SLEEP);
    516 	for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
    517 		mutex_init(&ipst->ips_ipcl_udp_fanout[i].connf_lock, NULL,
    518 		    MUTEX_DEFAULT, NULL);
    519 	}
    520 
    521 	ipst->ips_ipcl_iptun_fanout = kmem_zalloc(
    522 	    ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t), KM_SLEEP);
    523 	for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
    524 		mutex_init(&ipst->ips_ipcl_iptun_fanout[i].connf_lock, NULL,
    525 		    MUTEX_DEFAULT, NULL);
    526 	}
    527 
    528 	ipst->ips_ipcl_raw_fanout = kmem_zalloc(
    529 	    ipst->ips_ipcl_raw_fanout_size * sizeof (connf_t), KM_SLEEP);
    530 	for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
    531 		mutex_init(&ipst->ips_ipcl_raw_fanout[i].connf_lock, NULL,
    532 		    MUTEX_DEFAULT, NULL);
    533 	}
    534 
    535 	ipst->ips_ipcl_globalhash_fanout = kmem_zalloc(
    536 	    sizeof (connf_t) * CONN_G_HASH_SIZE, KM_SLEEP);
    537 	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
    538 		mutex_init(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock,
    539 		    NULL, MUTEX_DEFAULT, NULL);
    540 	}
    541 }
    542 
    543 void
    544 ipcl_g_destroy(void)
    545 {
    546 	kmem_cache_destroy(ip_conn_cache);
    547 	kmem_cache_destroy(tcp_conn_cache);
    548 	kmem_cache_destroy(udp_conn_cache);
    549 	kmem_cache_destroy(rawip_conn_cache);
    550 	kmem_cache_destroy(rts_conn_cache);
    551 }
    552 
    553 /*
    554  * All user-level and kernel use of the stack must be gone
    555  * by now.
    556  */
    557 void
    558 ipcl_destroy(ip_stack_t *ipst)
    559 {
    560 	int i;
    561 
    562 	for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
    563 		ASSERT(ipst->ips_ipcl_conn_fanout[i].connf_head == NULL);
    564 		mutex_destroy(&ipst->ips_ipcl_conn_fanout[i].connf_lock);
    565 	}
    566 	kmem_free(ipst->ips_ipcl_conn_fanout, ipst->ips_ipcl_conn_fanout_size *
    567 	    sizeof (connf_t));
    568 	ipst->ips_ipcl_conn_fanout = NULL;
    569 
    570 	for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
    571 		ASSERT(ipst->ips_ipcl_bind_fanout[i].connf_head == NULL);
    572 		mutex_destroy(&ipst->ips_ipcl_bind_fanout[i].connf_lock);
    573 	}
    574 	kmem_free(ipst->ips_ipcl_bind_fanout, ipst->ips_ipcl_bind_fanout_size *
    575 	    sizeof (connf_t));
    576 	ipst->ips_ipcl_bind_fanout = NULL;
    577 
    578 	for (i = 0; i < IPPROTO_MAX; i++) {
    579 		ASSERT(ipst->ips_ipcl_proto_fanout[i].connf_head == NULL);
    580 		mutex_destroy(&ipst->ips_ipcl_proto_fanout[i].connf_lock);
    581 	}
    582 	kmem_free(ipst->ips_ipcl_proto_fanout, IPPROTO_MAX * sizeof (connf_t));
    583 	ipst->ips_ipcl_proto_fanout = NULL;
    584 
    585 	for (i = 0; i < IPPROTO_MAX; i++) {
    586 		ASSERT(ipst->ips_ipcl_proto_fanout_v6[i].connf_head == NULL);
    587 		mutex_destroy(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock);
    588 	}
    589 	kmem_free(ipst->ips_ipcl_proto_fanout_v6,
    590 	    IPPROTO_MAX * sizeof (connf_t));
    591 	ipst->ips_ipcl_proto_fanout_v6 = NULL;
    592 
    593 	for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
    594 		ASSERT(ipst->ips_ipcl_udp_fanout[i].connf_head == NULL);
    595 		mutex_destroy(&ipst->ips_ipcl_udp_fanout[i].connf_lock);
    596 	}
    597 	kmem_free(ipst->ips_ipcl_udp_fanout, ipst->ips_ipcl_udp_fanout_size *
    598 	    sizeof (connf_t));
    599 	ipst->ips_ipcl_udp_fanout = NULL;
    600 
    601 	for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
    602 		ASSERT(ipst->ips_ipcl_iptun_fanout[i].connf_head == NULL);
    603 		mutex_destroy(&ipst->ips_ipcl_iptun_fanout[i].connf_lock);
    604 	}
    605 	kmem_free(ipst->ips_ipcl_iptun_fanout,
    606 	    ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t));
    607 	ipst->ips_ipcl_iptun_fanout = NULL;
    608 
    609 	for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
    610 		ASSERT(ipst->ips_ipcl_raw_fanout[i].connf_head == NULL);
    611 		mutex_destroy(&ipst->ips_ipcl_raw_fanout[i].connf_lock);
    612 	}
    613 	kmem_free(ipst->ips_ipcl_raw_fanout, ipst->ips_ipcl_raw_fanout_size *
    614 	    sizeof (connf_t));
    615 	ipst->ips_ipcl_raw_fanout = NULL;
    616 
    617 	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
    618 		ASSERT(ipst->ips_ipcl_globalhash_fanout[i].connf_head == NULL);
    619 		mutex_destroy(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
    620 	}
    621 	kmem_free(ipst->ips_ipcl_globalhash_fanout,
    622 	    sizeof (connf_t) * CONN_G_HASH_SIZE);
    623 	ipst->ips_ipcl_globalhash_fanout = NULL;
    624 
    625 	ASSERT(ipst->ips_rts_clients->connf_head == NULL);
    626 	mutex_destroy(&ipst->ips_rts_clients->connf_lock);
    627 	kmem_free(ipst->ips_rts_clients, sizeof (connf_t));
    628 	ipst->ips_rts_clients = NULL;
    629 }
    630 
    631 /*
    632  * conn creation routine. initialize the conn, sets the reference
    633  * and inserts it in the global hash table.
    634  */
    635 conn_t *
    636 ipcl_conn_create(uint32_t type, int sleep, netstack_t *ns)
    637 {
    638 	conn_t	*connp;
    639 	sctp_stack_t *sctps;
    640 	struct kmem_cache *conn_cache;
    641 
    642 	switch (type) {
    643 	case IPCL_SCTPCONN:
    644 		if ((connp = kmem_cache_alloc(sctp_conn_cache, sleep)) == NULL)
    645 			return (NULL);
    646 		sctp_conn_init(connp);
    647 		sctps = ns->netstack_sctp;
    648 		SCTP_G_Q_REFHOLD(sctps);
    649 		netstack_hold(ns);
    650 		connp->conn_netstack = ns;
    651 		return (connp);
    652 
    653 	case IPCL_TCPCONN:
    654 		conn_cache = tcp_conn_cache;
    655 		break;
    656 
    657 	case IPCL_UDPCONN:
    658 		conn_cache = udp_conn_cache;
    659 		break;
    660 
    661 	case IPCL_RAWIPCONN:
    662 		conn_cache = rawip_conn_cache;
    663 		break;
    664 
    665 	case IPCL_RTSCONN:
    666 		conn_cache = rts_conn_cache;
    667 		break;
    668 
    669 	case IPCL_IPCCONN:
    670 		conn_cache = ip_conn_cache;
    671 		break;
    672 
    673 	default:
    674 		connp = NULL;
    675 		ASSERT(0);
    676 	}
    677 
    678 	if ((connp = kmem_cache_alloc(conn_cache, sleep)) == NULL)
    679 		return (NULL);
    680 
    681 	connp->conn_ref = 1;
    682 	netstack_hold(ns);
    683 	connp->conn_netstack = ns;
    684 	ipcl_globalhash_insert(connp);
    685 	return (connp);
    686 }
    687 
    688 void
    689 ipcl_conn_destroy(conn_t *connp)
    690 {
    691 	mblk_t	*mp;
    692 	netstack_t	*ns = connp->conn_netstack;
    693 
    694 	ASSERT(!MUTEX_HELD(&connp->conn_lock));
    695 	ASSERT(connp->conn_ref == 0);
    696 	ASSERT(connp->conn_ire_cache == NULL);
    697 
    698 	DTRACE_PROBE1(conn__destroy, conn_t *, connp);
    699 
    700 	if (connp->conn_effective_cred != NULL) {
    701 		crfree(connp->conn_effective_cred);
    702 		connp->conn_effective_cred = NULL;
    703 	}
    704 
    705 	if (connp->conn_cred != NULL) {
    706 		crfree(connp->conn_cred);
    707 		connp->conn_cred = NULL;
    708 	}
    709 
    710 	ipcl_globalhash_remove(connp);
    711 
    712 	/* FIXME: add separate tcp_conn_free()? */
    713 	if (connp->conn_flags & IPCL_TCPCONN) {
    714 		tcp_t	*tcp = connp->conn_tcp;
    715 		tcp_stack_t *tcps;
    716 
    717 		ASSERT(tcp != NULL);
    718 		tcps = tcp->tcp_tcps;
    719 		if (tcps != NULL) {
    720 			if (connp->conn_latch != NULL) {
    721 				IPLATCH_REFRELE(connp->conn_latch, ns);
    722 				connp->conn_latch = NULL;
    723 			}
    724 			if (connp->conn_policy != NULL) {
    725 				IPPH_REFRELE(connp->conn_policy, ns);
    726 				connp->conn_policy = NULL;
    727 			}
    728 			tcp->tcp_tcps = NULL;
    729 			TCPS_REFRELE(tcps);
    730 		}
    731 
    732 		tcp_free(tcp);
    733 		mp = tcp->tcp_timercache;
    734 		tcp->tcp_cred = NULL;
    735 
    736 		if (tcp->tcp_sack_info != NULL) {
    737 			bzero(tcp->tcp_sack_info, sizeof (tcp_sack_info_t));
    738 			kmem_cache_free(tcp_sack_info_cache,
    739 			    tcp->tcp_sack_info);
    740 		}
    741 		if (tcp->tcp_iphc != NULL) {
    742 			if (tcp->tcp_hdr_grown) {
    743 				kmem_free(tcp->tcp_iphc, tcp->tcp_iphc_len);
    744 			} else {
    745 				bzero(tcp->tcp_iphc, tcp->tcp_iphc_len);
    746 				kmem_cache_free(tcp_iphc_cache, tcp->tcp_iphc);
    747 			}
    748 			tcp->tcp_iphc_len = 0;
    749 		}
    750 		ASSERT(tcp->tcp_iphc_len == 0);
    751 
    752 		/*
    753 		 * tcp_rsrv_mp can be NULL if tcp_get_conn() fails to allocate
    754 		 * the mblk.
    755 		 */
    756 		if (tcp->tcp_rsrv_mp != NULL) {
    757 			freeb(tcp->tcp_rsrv_mp);
    758 			tcp->tcp_rsrv_mp = NULL;
    759 			mutex_destroy(&tcp->tcp_rsrv_mp_lock);
    760 		}
    761 
    762 		ASSERT(connp->conn_latch == NULL);
    763 		ASSERT(connp->conn_policy == NULL);
    764 
    765 		if (ns != NULL) {
    766 			ASSERT(tcp->tcp_tcps == NULL);
    767 			connp->conn_netstack = NULL;
    768 			netstack_rele(ns);
    769 		}
    770 
    771 		ipcl_conn_cleanup(connp);
    772 		connp->conn_flags = IPCL_TCPCONN;
    773 		bzero(tcp, sizeof (tcp_t));
    774 
    775 		tcp->tcp_timercache = mp;
    776 		tcp->tcp_connp = connp;
    777 		kmem_cache_free(tcp_conn_cache, connp);
    778 		return;
    779 	}
    780 	if (connp->conn_latch != NULL) {
    781 		IPLATCH_REFRELE(connp->conn_latch, connp->conn_netstack);
    782 		connp->conn_latch = NULL;
    783 	}
    784 	if (connp->conn_policy != NULL) {
    785 		IPPH_REFRELE(connp->conn_policy, connp->conn_netstack);
    786 		connp->conn_policy = NULL;
    787 	}
    788 	if (connp->conn_ipsec_opt_mp != NULL) {
    789 		freemsg(connp->conn_ipsec_opt_mp);
    790 		connp->conn_ipsec_opt_mp = NULL;
    791 	}
    792 
    793 	if (connp->conn_flags & IPCL_SCTPCONN) {
    794 		ASSERT(ns != NULL);
    795 		sctp_free(connp);
    796 		return;
    797 	}
    798 
    799 	if (ns != NULL) {
    800 		connp->conn_netstack = NULL;
    801 		netstack_rele(ns);
    802 	}
    803 
    804 	ipcl_conn_cleanup(connp);
    805 
    806 	/* leave conn_priv aka conn_udp, conn_icmp, etc in place. */
    807 	if (connp->conn_flags & IPCL_UDPCONN) {
    808 		connp->conn_flags = IPCL_UDPCONN;
    809 		kmem_cache_free(udp_conn_cache, connp);
    810 	} else if (connp->conn_flags & IPCL_RAWIPCONN) {
    811 
    812 		connp->conn_flags = IPCL_RAWIPCONN;
    813 		connp->conn_ulp = IPPROTO_ICMP;
    814 		kmem_cache_free(rawip_conn_cache, connp);
    815 	} else if (connp->conn_flags & IPCL_RTSCONN) {
    816 		connp->conn_flags = IPCL_RTSCONN;
    817 		kmem_cache_free(rts_conn_cache, connp);
    818 	} else {
    819 		connp->conn_flags = IPCL_IPCCONN;
    820 		ASSERT(connp->conn_flags & IPCL_IPCCONN);
    821 		ASSERT(connp->conn_priv == NULL);
    822 		kmem_cache_free(ip_conn_cache, connp);
    823 	}
    824 }
    825 
    826 /*
    827  * Running in cluster mode - deregister listener information
    828  */
    829 
    830 static void
    831 ipcl_conn_unlisten(conn_t *connp)
    832 {
    833 	ASSERT((connp->conn_flags & IPCL_CL_LISTENER) != 0);
    834 	ASSERT(connp->conn_lport != 0);
    835 
    836 	if (cl_inet_unlisten != NULL) {
    837 		sa_family_t	addr_family;
    838 		uint8_t		*laddrp;
    839 
    840 		if (connp->conn_pkt_isv6) {
    841 			addr_family = AF_INET6;
    842 			laddrp = (uint8_t *)&connp->conn_bound_source_v6;
    843 		} else {
    844 			addr_family = AF_INET;
    845 			laddrp = (uint8_t *)&connp->conn_bound_source;
    846 		}
    847 		(*cl_inet_unlisten)(connp->conn_netstack->netstack_stackid,
    848 		    IPPROTO_TCP, addr_family, laddrp, connp->conn_lport, NULL);
    849 	}
    850 	connp->conn_flags &= ~IPCL_CL_LISTENER;
    851 }
    852 
/*
 * Unlink connp from whatever fanout bucket it is currently on, if any.
 *
 * The bucket is found through conn_fanout; a conn whose conn_fanout is NULL
 * is left untouched.  Otherwise, under the bucket's connf_lock, the conn is
 * spliced out of the doubly linked list (updating connf_head when it was the
 * first entry), its linkage fields are cleared, a cluster listener
 * registration (IPCL_CL_LISTENER) is withdrawn via ipcl_conn_unlisten(), and
 * a reference is released with CONN_DEC_REF(), balancing the CONN_INC_REF()
 * done by the insert macros.
 *
 * The caller must not hold conn_lock (asserted).
 *
 * We set the IPCL_REMOVED flag (instead of clearing the flag indicating
 * which table the conn belonged to), so that when debugging we can still see
 * which hash table this connection was in.
 */
#define	IPCL_HASH_REMOVE(connp)	{					\
	connf_t	*connfp = (connp)->conn_fanout;				\
	ASSERT(!MUTEX_HELD(&((connp)->conn_lock)));			\
	if (connfp != NULL) {						\
		IPCL_DEBUG_LVL(4, ("IPCL_HASH_REMOVE: connp %p",	\
		    (void *)(connp)));					\
		mutex_enter(&connfp->connf_lock);			\
		if ((connp)->conn_next != NULL)				\
			(connp)->conn_next->conn_prev =			\
			    (connp)->conn_prev;				\
		if ((connp)->conn_prev != NULL)				\
			(connp)->conn_prev->conn_next =			\
			    (connp)->conn_next;				\
		else							\
			connfp->connf_head = (connp)->conn_next;	\
		(connp)->conn_fanout = NULL;				\
		(connp)->conn_next = NULL;				\
		(connp)->conn_prev = NULL;				\
		(connp)->conn_flags |= IPCL_REMOVED;			\
		if (((connp)->conn_flags & IPCL_CL_LISTENER) != 0)	\
			ipcl_conn_unlisten((connp));			\
		CONN_DEC_REF((connp));					\
		mutex_exit(&connfp->connf_lock);			\
	}								\
}
    883 
/*
 * Function wrapper around the IPCL_HASH_REMOVE() macro: takes connp off the
 * fanout list it is on, if it is on one.
 */
void
ipcl_hash_remove(conn_t *connp)
{
	IPCL_HASH_REMOVE(connp);
}
    889 
    890 /*
    891  * The whole purpose of this function is allow removal of
    892  * a conn_t from the connected hash for timewait reclaim.
    893  * This is essentially a TW reclaim fastpath where timewait
    894  * collector checks under fanout lock (so no one else can
    895  * get access to the conn_t) that refcnt is 2 i.e. one for
    896  * TCP and one for the classifier hash list. If ref count
    897  * is indeed 2, we can just remove the conn under lock and
    898  * avoid cleaning up the conn under squeue. This gives us
    899  * improved performance.
    900  */
    901 void
    902 ipcl_hash_remove_locked(conn_t *connp, connf_t	*connfp)
    903 {
    904 	ASSERT(MUTEX_HELD(&connfp->connf_lock));
    905 	ASSERT(MUTEX_HELD(&connp->conn_lock));
    906 	ASSERT((connp->conn_flags & IPCL_CL_LISTENER) == 0);
    907 
    908 	if ((connp)->conn_next != NULL) {
    909 		(connp)->conn_next->conn_prev = (connp)->conn_prev;
    910 	}
    911 	if ((connp)->conn_prev != NULL) {
    912 		(connp)->conn_prev->conn_next = (connp)->conn_next;
    913 	} else {
    914 		connfp->connf_head = (connp)->conn_next;
    915 	}
    916 	(connp)->conn_fanout = NULL;
    917 	(connp)->conn_next = NULL;
    918 	(connp)->conn_prev = NULL;
    919 	(connp)->conn_flags |= IPCL_REMOVED;
    920 	ASSERT((connp)->conn_ref == 2);
    921 	(connp)->conn_ref--;
    922 }
    923 
/*
 * Push connp onto the front of connfp's list and mark it IPCL_CONNECTED
 * (clearing IPCL_REMOVED).  A reference is taken on behalf of the list with
 * CONN_INC_REF().  The macro does not take connf_lock itself; the callers in
 * this file invoke it with connfp->connf_lock already held.  connp must not
 * currently be linked on any fanout list (asserted).
 */
#define	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp) {		\
	ASSERT((connp)->conn_fanout == NULL);				\
	ASSERT((connp)->conn_next == NULL);				\
	ASSERT((connp)->conn_prev == NULL);				\
	if ((connfp)->connf_head != NULL) {				\
		(connfp)->connf_head->conn_prev = (connp);		\
		(connp)->conn_next = (connfp)->connf_head;		\
	}								\
	(connp)->conn_fanout = (connfp);				\
	(connfp)->connf_head = (connp);					\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_CONNECTED;						\
	CONN_INC_REF(connp);						\
}
    938 
/*
 * Move connp to the head of connfp as a connected entry: first remove it
 * from any list it is currently on (IPCL_HASH_REMOVE), then insert it while
 * holding connfp->connf_lock.  No lock is held between the removal and the
 * insertion.
 */
#define	IPCL_HASH_INSERT_CONNECTED(connfp, connp) {			\
	IPCL_DEBUG_LVL(8, ("IPCL_HASH_INSERT_CONNECTED: connfp %p "	\
	    "connp %p", (void *)(connfp), (void *)(connp)));		\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);		\
	mutex_exit(&(connfp)->connf_lock);				\
}
    947 
    948 #define	IPCL_HASH_INSERT_BOUND(connfp, connp) {				\
    949 	conn_t *pconnp = NULL, *nconnp;					\
    950 	IPCL_DEBUG_LVL(32, ("IPCL_HASH_INSERT_BOUND: connfp %p "	\
    951 	    "connp %p", (void *)connfp, (void *)(connp)));		\
    952 	IPCL_HASH_REMOVE((connp));					\
    953 	mutex_enter(&(connfp)->connf_lock);				\
    954 	nconnp = (connfp)->connf_head;					\
    955 	while (nconnp != NULL &&					\
    956 	    !_IPCL_V4_MATCH_ANY(nconnp->conn_srcv6)) {			\
    957 		pconnp = nconnp;					\
    958 		nconnp = nconnp->conn_next;				\
    959 	}								\
    960 	if (pconnp != NULL) {						\
    961 		pconnp->conn_next = (connp);				\
    962 		(connp)->conn_prev = pconnp;				\
    963 	} else {							\
    964 		(connfp)->connf_head = (connp);				\
    965 	}								\
    966 	if (nconnp != NULL) {						\
    967 		(connp)->conn_next = nconnp;				\
    968 		nconnp->conn_prev = (connp);				\
    969 	}								\
    970 	(connp)->conn_fanout = (connfp);				\
    971 	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
    972 	    IPCL_BOUND;							\
    973 	CONN_INC_REF(connp);						\
    974 	mutex_exit(&(connfp)->connf_lock);				\
    975 }
    976 
    977 #define	IPCL_HASH_INSERT_WILDCARD(connfp, connp) {			\
    978 	conn_t **list, *prev, *next;					\
    979 	boolean_t isv4mapped =						\
    980 	    IN6_IS_ADDR_V4MAPPED(&(connp)->conn_srcv6);			\
    981 	IPCL_DEBUG_LVL(32, ("IPCL_HASH_INSERT_WILDCARD: connfp %p "	\
    982 	    "connp %p", (void *)(connfp), (void *)(connp)));		\
    983 	IPCL_HASH_REMOVE((connp));					\
    984 	mutex_enter(&(connfp)->connf_lock);				\
    985 	list = &(connfp)->connf_head;					\
    986 	prev = NULL;							\
    987 	while ((next = *list) != NULL) {				\
    988 		if (isv4mapped &&					\
    989 		    IN6_IS_ADDR_UNSPECIFIED(&next->conn_srcv6) &&	\
    990 		    connp->conn_zoneid == next->conn_zoneid) {		\
    991 			(connp)->conn_next = next;			\
    992 			if (prev != NULL)				\
    993 				prev = next->conn_prev;			\
    994 			next->conn_prev = (connp);			\
    995 			break;						\
    996 		}							\
    997 		list = &next->conn_next;				\
    998 		prev = next;						\
    999 	}								\
   1000 	(connp)->conn_prev = prev;					\
   1001 	*list = (connp);						\
   1002 	(connp)->conn_fanout = (connfp);				\
   1003 	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
   1004 	    IPCL_BOUND;							\
   1005 	CONN_INC_REF((connp));						\
   1006 	mutex_exit(&(connfp)->connf_lock);				\
   1007 }
   1008 
/*
 * Function wrapper around the IPCL_HASH_INSERT_WILDCARD() macro: inserts
 * connp into the fanout bucket connfp using the wildcard ordering.
 */
void
ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp)
{
	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
}
   1014 
   1015 void
   1016 ipcl_proto_insert(conn_t *connp, uint8_t protocol)
   1017 {
   1018 	connf_t	*connfp;
   1019 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
   1020 
   1021 	ASSERT(connp != NULL);
   1022 	ASSERT((connp->conn_mac_mode == CONN_MAC_DEFAULT) ||
   1023 	    protocol == IPPROTO_AH || protocol == IPPROTO_ESP);
   1024 
   1025 	connp->conn_ulp = protocol;
   1026 
   1027 	/* Insert it in the protocol hash */
   1028 	connfp = &ipst->ips_ipcl_proto_fanout[protocol];
   1029 	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
   1030 }
   1031 
   1032 void
   1033 ipcl_proto_insert_v6(conn_t *connp, uint8_t protocol)
   1034 {
   1035 	connf_t	*connfp;
   1036 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
   1037 
   1038 	ASSERT(connp != NULL);
   1039 	ASSERT((connp->conn_mac_mode == CONN_MAC_DEFAULT) ||
   1040 	    protocol == IPPROTO_AH || protocol == IPPROTO_ESP);
   1041 
   1042 	connp->conn_ulp = protocol;
   1043 
   1044 	/* Insert it in the Bind Hash */
   1045 	connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
   1046 	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
   1047 }
   1048 
   1049 /*
   1050  * Because the classifier is used to classify inbound packets, the destination
   1051  * address is meant to be our local tunnel address (tunnel source), and the
   1052  * source the remote tunnel address (tunnel destination).
   1053  */
   1054 conn_t *
   1055 ipcl_iptun_classify_v4(ipaddr_t *src, ipaddr_t *dst, ip_stack_t *ipst)
   1056 {
   1057 	connf_t	*connfp;
   1058 	conn_t	*connp;
   1059 
   1060 	/* first look for IPv4 tunnel links */
   1061 	connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst, *src)];
   1062 	mutex_enter(&connfp->connf_lock);
   1063 	for (connp = connfp->connf_head; connp != NULL;
   1064 	    connp = connp->conn_next) {
   1065 		if (IPCL_IPTUN_MATCH(connp, *dst, *src))
   1066 			break;
   1067 	}
   1068 	if (connp != NULL)
   1069 		goto done;
   1070 
   1071 	mutex_exit(&connfp->connf_lock);
   1072 
   1073 	/* We didn't find an IPv4 tunnel, try a 6to4 tunnel */
   1074 	connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst,
   1075 	    INADDR_ANY)];
   1076 	mutex_enter(&connfp->connf_lock);
   1077 	for (connp = connfp->connf_head; connp != NULL;
   1078 	    connp = connp->conn_next) {
   1079 		if (IPCL_IPTUN_MATCH(connp, *dst, INADDR_ANY))
   1080 			break;
   1081 	}
   1082 done:
   1083 	if (connp != NULL)
   1084 		CONN_INC_REF(connp);
   1085 	mutex_exit(&connfp->connf_lock);
   1086 	return (connp);
   1087 }
   1088 
   1089 conn_t *
   1090 ipcl_iptun_classify_v6(in6_addr_t *src, in6_addr_t *dst, ip_stack_t *ipst)
   1091 {
   1092 	connf_t	*connfp;
   1093 	conn_t	*connp;
   1094 
   1095 	/* Look for an IPv6 tunnel link */
   1096 	connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(dst, src)];
   1097 	mutex_enter(&connfp->connf_lock);
   1098 	for (connp = connfp->connf_head; connp != NULL;
   1099 	    connp = connp->conn_next) {
   1100 		if (IPCL_IPTUN_MATCH_V6(connp, dst, src)) {
   1101 			CONN_INC_REF(connp);
   1102 			break;
   1103 		}
   1104 	}
   1105 	mutex_exit(&connfp->connf_lock);
   1106 	return (connp);
   1107 }
   1108 
   1109 /*
   1110  * This function is used only for inserting SCTP raw socket now.
   1111  * This may change later.
   1112  *
   1113  * Note that only one raw socket can be bound to a port.  The param
   1114  * lport is in network byte order.
   1115  */
   1116 static int
   1117 ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport)
   1118 {
   1119 	connf_t	*connfp;
   1120 	conn_t	*oconnp;
   1121 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
   1122 
   1123 	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];
   1124 
   1125 	/* Check for existing raw socket already bound to the port. */
   1126 	mutex_enter(&connfp->connf_lock);
   1127 	for (oconnp = connfp->connf_head; oconnp != NULL;
   1128 	    oconnp = oconnp->conn_next) {
   1129 		if (oconnp->conn_lport == lport &&
   1130 		    oconnp->conn_zoneid == connp->conn_zoneid &&
   1131 		    oconnp->conn_af_isv6 == connp->conn_af_isv6 &&
   1132 		    ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) ||
   1133 		    IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_srcv6) ||
   1134 		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_srcv6) ||
   1135 		    IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_srcv6)) ||
   1136 		    IN6_ARE_ADDR_EQUAL(&oconnp->conn_srcv6,
   1137 		    &connp->conn_srcv6))) {
   1138 			break;
   1139 		}
   1140 	}
   1141 	mutex_exit(&connfp->connf_lock);
   1142 	if (oconnp != NULL)
   1143 		return (EADDRNOTAVAIL);
   1144 
   1145 	if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6) ||
   1146 	    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_remv6)) {
   1147 		if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) ||
   1148 		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_srcv6)) {
   1149 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
   1150 		} else {
   1151 			IPCL_HASH_INSERT_BOUND(connfp, connp);
   1152 		}
   1153 	} else {
   1154 		IPCL_HASH_INSERT_CONNECTED(connfp, connp);
   1155 	}
   1156 	return (0);
   1157 }
   1158 
   1159 static int
   1160 ipcl_iptun_hash_insert(conn_t *connp, ipaddr_t src, ipaddr_t dst,
   1161     ip_stack_t *ipst)
   1162 {
   1163 	connf_t	*connfp;
   1164 	conn_t	*tconnp;
   1165 
   1166 	connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(src, dst)];
   1167 	mutex_enter(&connfp->connf_lock);
   1168 	for (tconnp = connfp->connf_head; tconnp != NULL;
   1169 	    tconnp = tconnp->conn_next) {
   1170 		if (IPCL_IPTUN_MATCH(tconnp, src, dst)) {
   1171 			/* A tunnel is already bound to these addresses. */
   1172 			mutex_exit(&connfp->connf_lock);
   1173 			return (EADDRINUSE);
   1174 		}
   1175 	}
   1176 	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
   1177 	mutex_exit(&connfp->connf_lock);
   1178 	return (0);
   1179 }
   1180 
   1181 static int
   1182 ipcl_iptun_hash_insert_v6(conn_t *connp, const in6_addr_t *src,
   1183     const in6_addr_t *dst, ip_stack_t *ipst)
   1184 {
   1185 	connf_t	*connfp;
   1186 	conn_t	*tconnp;
   1187 
   1188 	connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(src, dst)];
   1189 	mutex_enter(&connfp->connf_lock);
   1190 	for (tconnp = connfp->connf_head; tconnp != NULL;
   1191 	    tconnp = tconnp->conn_next) {
   1192 		if (IPCL_IPTUN_MATCH_V6(tconnp, src, dst)) {
   1193 			/* A tunnel is already bound to these addresses. */
   1194 			mutex_exit(&connfp->connf_lock);
   1195 			return (EADDRINUSE);
   1196 		}
   1197 	}
   1198 	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
   1199 	mutex_exit(&connfp->connf_lock);
   1200 	return (0);
   1201 }
   1202 
   1203 /*
   1204  * Check for a MAC exemption conflict on a labeled system.  Note that for
   1205  * protocols that use port numbers (UDP, TCP, SCTP), we do this check up in the
   1206  * transport layer.  This check is for binding all other protocols.
   1207  *
   1208  * Returns true if there's a conflict.
   1209  */
   1210 static boolean_t
   1211 check_exempt_conflict_v4(conn_t *connp, ip_stack_t *ipst)
   1212 {
   1213 	connf_t	*connfp;
   1214 	conn_t *tconn;
   1215 
   1216 	connfp = &ipst->ips_ipcl_proto_fanout[connp->conn_ulp];
   1217 	mutex_enter(&connfp->connf_lock);
   1218 	for (tconn = connfp->connf_head; tconn != NULL;
   1219 	    tconn = tconn->conn_next) {
   1220 		/* We don't allow v4 fallback for v6 raw socket */
   1221 		if (connp->conn_af_isv6 != tconn->conn_af_isv6)
   1222 			continue;
   1223 		/* If neither is exempt, then there's no conflict */
   1224 		if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) &&
   1225 		    (tconn->conn_mac_mode == CONN_MAC_DEFAULT))
   1226 			continue;
   1227 		/* We are only concerned about sockets for a different zone */
   1228 		if (connp->conn_zoneid == tconn->conn_zoneid)
   1229 			continue;
   1230 		/* If both are bound to different specific addrs, ok */
   1231 		if (connp->conn_src != INADDR_ANY &&
   1232 		    tconn->conn_src != INADDR_ANY &&
   1233 		    connp->conn_src != tconn->conn_src)
   1234 			continue;
   1235 		/* These two conflict; fail */
   1236 		break;
   1237 	}
   1238 	mutex_exit(&connfp->connf_lock);
   1239 	return (tconn != NULL);
   1240 }
   1241 
   1242 static boolean_t
   1243 check_exempt_conflict_v6(conn_t *connp, ip_stack_t *ipst)
   1244 {
   1245 	connf_t	*connfp;
   1246 	conn_t *tconn;
   1247 
   1248 	connfp = &ipst->ips_ipcl_proto_fanout[connp->conn_ulp];
   1249 	mutex_enter(&connfp->connf_lock);
   1250 	for (tconn = connfp->connf_head; tconn != NULL;
   1251 	    tconn = tconn->conn_next) {
   1252 		/* We don't allow v4 fallback for v6 raw socket */
   1253 		if (connp->conn_af_isv6 != tconn->conn_af_isv6)
   1254 			continue;
   1255 		/* If neither is exempt, then there's no conflict */
   1256 		if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) &&
   1257 		    (tconn->conn_mac_mode == CONN_MAC_DEFAULT))
   1258 			continue;
   1259 		/* We are only concerned about sockets for a different zone */
   1260 		if (connp->conn_zoneid == tconn->conn_zoneid)
   1261 			continue;
   1262 		/* If both are bound to different addrs, ok */
   1263 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) &&
   1264 		    !IN6_IS_ADDR_UNSPECIFIED(&tconn->conn_srcv6) &&
   1265 		    !IN6_ARE_ADDR_EQUAL(&connp->conn_srcv6, &tconn->conn_srcv6))
   1266 			continue;
   1267 		/* These two conflict; fail */
   1268 		break;
   1269 	}
   1270 	mutex_exit(&connfp->connf_lock);
   1271 	return (tconn != NULL);
   1272 }
   1273 
   1274 /*
   1275  * (v4, v6) bind hash insertion routines
   1276  */
   1277 int
   1278 ipcl_bind_insert(conn_t *connp, uint8_t protocol, ipaddr_t src, uint16_t lport)
   1279 {
   1280 	connf_t	*connfp;
   1281 #ifdef	IPCL_DEBUG
   1282 	char	buf[INET_NTOA_BUFSIZE];
   1283 #endif
   1284 	int	ret = 0;
   1285 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
   1286 
   1287 	ASSERT(connp);
   1288 
   1289 	IPCL_DEBUG_LVL(64, ("ipcl_bind_insert: connp %p, src = %s, "
   1290 	    "port = %d\n", (void *)connp, inet_ntoa_r(src, buf), lport));
   1291 
   1292 	connp->conn_ulp = protocol;
   1293 	IN6_IPADDR_TO_V4MAPPED(src, &connp->conn_srcv6);
   1294 	connp->conn_lport = lport;
   1295 
   1296 	if (IPCL_IS_IPTUN(connp))
   1297 		return (ipcl_iptun_hash_insert(connp, src, INADDR_ANY, ipst));
   1298 
   1299 	switch (protocol) {
   1300 	default:
   1301 		if (is_system_labeled() &&
   1302 		    check_exempt_conflict_v4(connp, ipst))
   1303 			return (EADDRINUSE);
   1304 		/* FALLTHROUGH */
   1305 	case IPPROTO_UDP:
   1306 		if (protocol == IPPROTO_UDP) {
   1307 			IPCL_DEBUG_LVL(64,
   1308 			    ("ipcl_bind_insert: connp %p - udp\n",
   1309 			    (void *)connp));
   1310 			connfp = &ipst->ips_ipcl_udp_fanout[
   1311 			    IPCL_UDP_HASH(lport, ipst)];
   1312 		} else {
   1313 			IPCL_DEBUG_LVL(64,
   1314 			    ("ipcl_bind_insert: connp %p - protocol\n",
   1315 			    (void *)connp));
   1316 			connfp = &ipst->ips_ipcl_proto_fanout[protocol];
   1317 		}
   1318 
   1319 		if (connp->conn_rem != INADDR_ANY) {
   1320 			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
   1321 		} else if (connp->conn_src != INADDR_ANY) {
   1322 			IPCL_HASH_INSERT_BOUND(connfp, connp);
   1323 		} else {
   1324 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
   1325 		}
   1326 		break;
   1327 
   1328 	case IPPROTO_TCP:
   1329 
   1330 		/* Insert it in the Bind Hash */
   1331 		ASSERT(connp->conn_zoneid != ALL_ZONES);
   1332 		connfp = &ipst->ips_ipcl_bind_fanout[
   1333 		    IPCL_BIND_HASH(lport, ipst)];
   1334 		if (connp->conn_src != INADDR_ANY) {
   1335 			IPCL_HASH_INSERT_BOUND(connfp, connp);
   1336 		} else {
   1337 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
   1338 		}
   1339 		if (cl_inet_listen != NULL) {
   1340 			ASSERT(!connp->conn_pkt_isv6);
   1341 			connp->conn_flags |= IPCL_CL_LISTENER;
   1342 			(*cl_inet_listen)(
   1343 			    connp->conn_netstack->netstack_stackid,
   1344 			    IPPROTO_TCP, AF_INET,
   1345 			    (uint8_t *)&connp->conn_bound_source, lport, NULL);
   1346 		}
   1347 		break;
   1348 
   1349 	case IPPROTO_SCTP:
   1350 		ret = ipcl_sctp_hash_insert(connp, lport);
   1351 		break;
   1352 	}
   1353 
   1354 	return (ret);
   1355 }
   1356 
   1357 int
   1358 ipcl_bind_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src,
   1359     uint16_t lport)
   1360 {
   1361 	connf_t		*connfp;
   1362 	int		ret = 0;
   1363 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
   1364 
   1365 	ASSERT(connp != NULL);	connp->conn_ulp = protocol;
   1366 	connp->conn_srcv6 = *src;
   1367 	connp->conn_lport = lport;
   1368 
   1369 	if (IPCL_IS_IPTUN(connp)) {
   1370 		return (ipcl_iptun_hash_insert_v6(connp, src, &ipv6_all_zeros,
   1371 		    ipst));
   1372 	}
   1373 
   1374 	switch (protocol) {
   1375 	default:
   1376 		if (is_system_labeled() &&
   1377 		    check_exempt_conflict_v6(connp, ipst))
   1378 			return (EADDRINUSE);
   1379 		/* FALLTHROUGH */
   1380 	case IPPROTO_UDP:
   1381 		if (protocol == IPPROTO_UDP) {
   1382 			IPCL_DEBUG_LVL(128,
   1383 			    ("ipcl_bind_insert_v6: connp %p - udp\n",
   1384 			    (void *)connp));
   1385 			connfp = &ipst->ips_ipcl_udp_fanout[
   1386 			    IPCL_UDP_HASH(lport, ipst)];
   1387 		} else {
   1388 			IPCL_DEBUG_LVL(128,
   1389 			    ("ipcl_bind_insert_v6: connp %p - protocol\n",
   1390 			    (void *)connp));
   1391 			connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
   1392 		}
   1393 
   1394 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6)) {
   1395 			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
   1396 		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) {
   1397 			IPCL_HASH_INSERT_BOUND(connfp, connp);
   1398 		} else {
   1399 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
   1400 		}
   1401 		break;
   1402 
   1403 	case IPPROTO_TCP:
   1404 		/* XXX - Need a separate table for IN6_IS_ADDR_UNSPECIFIED? */
   1405 
   1406 		/* Insert it in the Bind Hash */
   1407 		ASSERT(connp->conn_zoneid != ALL_ZONES);
   1408 		connfp = &ipst->ips_ipcl_bind_fanout[
   1409 		    IPCL_BIND_HASH(lport, ipst)];
   1410 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) {
   1411 			IPCL_HASH_INSERT_BOUND(connfp, connp);
   1412 		} else {
   1413 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
   1414 		}
   1415 		if (cl_inet_listen != NULL) {
   1416 			sa_family_t	addr_family;
   1417 			uint8_t		*laddrp;
   1418 
   1419 			if (connp->conn_pkt_isv6) {
   1420 				addr_family = AF_INET6;
   1421 				laddrp =
   1422 				    (uint8_t *)&connp->conn_bound_source_v6;
   1423 			} else {
   1424 				addr_family = AF_INET;
   1425 				laddrp = (uint8_t *)&connp->conn_bound_source;
   1426 			}
   1427 			connp->conn_flags |= IPCL_CL_LISTENER;
   1428 			(*cl_inet_listen)(
   1429 			    connp->conn_netstack->netstack_stackid,
   1430 			    IPPROTO_TCP, addr_family, laddrp, lport, NULL);
   1431 		}
   1432 		break;
   1433 
   1434 	case IPPROTO_SCTP:
   1435 		ret = ipcl_sctp_hash_insert(connp, lport);
   1436 		break;
   1437 	}
   1438 
   1439 	return (ret);
   1440 }
   1441 
   1442 /*
   1443  * ipcl_conn_hash insertion routines.
   1444  */
   1445 int
   1446 ipcl_conn_insert(conn_t *connp, uint8_t protocol, ipaddr_t src,
   1447     ipaddr_t rem, uint32_t ports)
   1448 {
   1449 	connf_t		*connfp;
   1450 	uint16_t	*up;
   1451 	conn_t		*tconnp;
   1452 #ifdef	IPCL_DEBUG
   1453 	char	sbuf[INET_NTOA_BUFSIZE], rbuf[INET_NTOA_BUFSIZE];
   1454 #endif
   1455 	in_port_t	lport;
   1456 	int		ret = 0;
   1457 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
   1458 
   1459 	IPCL_DEBUG_LVL(256, ("ipcl_conn_insert: connp %p, src = %s, "
   1460 	    "dst = %s, ports = %x, protocol = %x", (void *)connp,
   1461 	    inet_ntoa_r(src, sbuf), inet_ntoa_r(rem, rbuf),
   1462 	    ports, protocol));
   1463 
   1464 	if (IPCL_IS_IPTUN(connp))
   1465 		return (ipcl_iptun_hash_insert(connp, src, rem, ipst));
   1466 
   1467 	switch (protocol) {
   1468 	case IPPROTO_TCP:
   1469 		if (!(connp->conn_flags & IPCL_EAGER)) {
   1470 			/*
   1471 			 * for a eager connection, i.e connections which
   1472 			 * have just been created, the initialization is
   1473 			 * already done in ip at conn_creation time, so
   1474 			 * we can skip the checks here.
   1475 			 */
   1476 			IPCL_CONN_INIT(connp, protocol, src, rem, ports);
   1477 		}
   1478 
   1479 		/*
   1480 		 * For tcp, we check whether the connection tuple already
   1481 		 * exists before allowing the connection to proceed.  We
   1482 		 * also allow indexing on the zoneid. This is to allow
   1483 		 * multiple shared stack zones to have the same tcp
   1484 		 * connection tuple. In practice this only happens for
   1485 		 * INADDR_LOOPBACK as it's the only local address which
   1486 		 * doesn't have to be unique.
   1487 		 */
   1488 		connfp = &ipst->ips_ipcl_conn_fanout[
   1489 		    IPCL_CONN_HASH(connp->conn_rem,
   1490 		    connp->conn_ports, ipst)];
   1491 		mutex_enter(&connfp->connf_lock);
   1492 		for (tconnp = connfp->connf_head; tconnp != NULL;
   1493 		    tconnp = tconnp->conn_next) {
   1494 			if ((IPCL_CONN_MATCH(tconnp, connp->conn_ulp,
   1495 			    connp->conn_rem, connp->conn_src,
   1496 			    connp->conn_ports)) &&
   1497 			    (IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid))) {
   1498 
   1499 				/* Already have a conn. bail out */
   1500 				mutex_exit(&connfp->connf_lock);
   1501 				return (EADDRINUSE);
   1502 			}
   1503 		}
   1504 		if (connp->conn_fanout != NULL) {
   1505 			/*
   1506 			 * Probably a XTI/TLI application trying to do a
   1507 			 * rebind. Let it happen.
   1508 			 */
   1509 			mutex_exit(&connfp->connf_lock);
   1510 			IPCL_HASH_REMOVE(connp);
   1511 			mutex_enter(&connfp->connf_lock);
   1512 		}
   1513 
   1514 		ASSERT(connp->conn_recv != NULL);
   1515 
   1516 		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
   1517 		mutex_exit(&connfp->connf_lock);
   1518 		break;
   1519 
   1520 	case IPPROTO_SCTP:
   1521 		/*
   1522 		 * The raw socket may have already been bound, remove it
   1523 		 * from the hash first.
   1524 		 */
   1525 		IPCL_HASH_REMOVE(connp);
   1526 		lport = htons((uint16_t)(ntohl(ports) & 0xFFFF));
   1527 		ret = ipcl_sctp_hash_insert(connp, lport);
   1528 		break;
   1529 
   1530 	default:
   1531 		/*
   1532 		 * Check for conflicts among MAC exempt bindings.  For
   1533 		 * transports with port numbers, this is done by the upper
   1534 		 * level per-transport binding logic.  For all others, it's
   1535 		 * done here.
   1536 		 */
   1537 		if (is_system_labeled() &&
   1538 		    check_exempt_conflict_v4(connp, ipst))
   1539 			return (EADDRINUSE);
   1540 		/* FALLTHROUGH */
   1541 
   1542 	case IPPROTO_UDP:
   1543 		up = (uint16_t *)&ports;
   1544 		IPCL_CONN_INIT(connp, protocol, src, rem, ports);
   1545 		if (protocol == IPPROTO_UDP) {
   1546 			connfp = &ipst->ips_ipcl_udp_fanout[
   1547 			    IPCL_UDP_HASH(up[1], ipst)];
   1548 		} else {
   1549 			connfp = &ipst->ips_ipcl_proto_fanout[protocol];
   1550 		}
   1551 
   1552 		if (connp->conn_rem != INADDR_ANY) {
   1553 			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
   1554 		} else if (connp->conn_src != INADDR_ANY) {
   1555 			IPCL_HASH_INSERT_BOUND(connfp, connp);
   1556 		} else {
   1557 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
   1558 		}
   1559 		break;
   1560 	}
   1561 
   1562 	return (ret);
   1563 }
   1564 
   1565 int
   1566 ipcl_conn_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src,
   1567     const in6_addr_t *rem, uint32_t ports, uint_t ifindex)
   1568 {
   1569 	connf_t		*connfp;
   1570 	uint16_t	*up;
   1571 	conn_t		*tconnp;
   1572 	in_port_t	lport;
   1573 	int		ret = 0;
   1574 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
   1575 
   1576 	if (IPCL_IS_IPTUN(connp))
   1577 		return (ipcl_iptun_hash_insert_v6(connp, src, rem, ipst));
   1578 
   1579 	switch (protocol) {
   1580 	case IPPROTO_TCP:
   1581 		/* Just need to insert a conn struct */
   1582 		if (!(connp->conn_flags & IPCL_EAGER)) {
   1583 			IPCL_CONN_INIT_V6(connp, protocol, *src, *rem, ports);
   1584 		}
   1585 
   1586 		/*
   1587 		 * For tcp, we check whether the connection tuple already
   1588 		 * exists before allowing the connection to proceed.  We
   1589 		 * also allow indexing on the zoneid. This is to allow
   1590 		 * multiple shared stack zones to have the same tcp
   1591 		 * connection tuple. In practice this only happens for
   1592 		 * ipv6_loopback as it's the only local address which
   1593 		 * doesn't have to be unique.
   1594 		 */
   1595 		connfp = &ipst->ips_ipcl_conn_fanout[
   1596 		    IPCL_CONN_HASH_V6(connp->conn_remv6, connp->conn_ports,
   1597 		    ipst)];
   1598 		mutex_enter(&connfp->connf_lock);
   1599 		for (tconnp = connfp->connf_head; tconnp != NULL;
   1600 		    tconnp = tconnp->conn_next) {
   1601 			if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_ulp,
   1602 			    connp->conn_remv6, connp->conn_srcv6,
   1603 			    connp->conn_ports) &&
   1604 			    (tconnp->conn_tcp->tcp_bound_if == 0 ||
   1605 			    tconnp->conn_tcp->tcp_bound_if == ifindex) &&
   1606 			    (IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid))) {
   1607 				/* Already have a conn. bail out */
   1608 				mutex_exit(&connfp->connf_lock);
   1609 				return (EADDRINUSE);
   1610 			}
   1611 		}
   1612 		if (connp->conn_fanout != NULL) {
   1613 			/*
   1614 			 * Probably a XTI/TLI application trying to do a
   1615 			 * rebind. Let it happen.
   1616 			 */
   1617 			mutex_exit(&connfp->connf_lock);
   1618 			IPCL_HASH_REMOVE(connp);
   1619 			mutex_enter(&connfp->connf_lock);
   1620 		}
   1621 		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
   1622 		mutex_exit(&connfp->connf_lock);
   1623 		break;
   1624 
   1625 	case IPPROTO_SCTP:
   1626 		IPCL_HASH_REMOVE(connp);
   1627 		lport = htons((uint16_t)(ntohl(ports) & 0xFFFF));
   1628 		ret = ipcl_sctp_hash_insert(connp, lport);
   1629 		break;
   1630 
   1631 	default:
   1632 		if (is_system_labeled() &&
   1633 		    check_exempt_conflict_v6(connp, ipst))
   1634 			return (EADDRINUSE);
   1635 		/* FALLTHROUGH */
   1636 	case IPPROTO_UDP:
   1637 		up = (uint16_t *)&ports;
   1638 		IPCL_CONN_INIT_V6(connp, protocol, *src, *rem, ports);
   1639 		if (protocol == IPPROTO_UDP) {
   1640 			connfp = &ipst->ips_ipcl_udp_fanout[
   1641 			    IPCL_UDP_HASH(up[1], ipst)];
   1642 		} else {
   1643 			connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
   1644 		}
   1645 
   1646 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6)) {
   1647 			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
   1648 		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) {
   1649 			IPCL_HASH_INSERT_BOUND(connfp, connp);
   1650 		} else {
   1651 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
   1652 		}
   1653 		break;
   1654 	}
   1655 
   1656 	return (ret);
   1657 }
   1658 
   1659 /*
   1660  * v4 packet classifying function. looks up the fanout table to
   1661  * find the conn, the packet belongs to. returns the conn with
   1662  * the reference held, null otherwise.
   1663  *
   1664  * If zoneid is ALL_ZONES, then the search rules described in the "Connection
   1665  * Lookup" comment block are applied.  Labels are also checked as described
   1666  * above.  If the packet is from the inside (looped back), and is from the same
   1667  * zone, then label checks are omitted.
   1668  */
   1669 conn_t *
   1670 ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid,
   1671     ip_stack_t *ipst)
   1672 {
   1673 	ipha_t	*ipha;
   1674 	connf_t	*connfp, *bind_connfp;
   1675 	uint16_t lport;
   1676 	uint16_t fport;
   1677 	uint32_t ports;
   1678 	conn_t	*connp;
   1679 	uint16_t  *up;
   1680 	boolean_t shared_addr;
   1681 	boolean_t unlabeled;
   1682 
   1683 	ipha = (ipha_t *)mp->b_rptr;
   1684 	up = (uint16_t *)((uchar_t *)ipha + hdr_len + TCP_PORTS_OFFSET);
   1685 
   1686 	switch (protocol) {
   1687 	case IPPROTO_TCP:
   1688 		ports = *(uint32_t *)up;
   1689 		connfp =
   1690 		    &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_src,
   1691 		    ports, ipst)];
   1692 		mutex_enter(&connfp->connf_lock);
   1693 		for (connp = connfp->connf_head; connp != NULL;
   1694 		    connp = connp->conn_next) {
   1695 			if ((IPCL_CONN_MATCH(connp, protocol,
   1696 			    ipha->ipha_src, ipha->ipha_dst, ports)) &&
   1697 			    (IPCL_ZONE_MATCH(connp, zoneid))) {
   1698 				break;
   1699 			}
   1700 		}
   1701 
   1702 		if (connp != NULL) {
   1703 			/*
   1704 			 * We have a fully-bound TCP connection.
   1705 			 *
   1706 			 * For labeled systems, there's no need to check the
   1707 			 * label here.  It's known to be good as we checked
   1708 			 * before allowing the connection to become bound.
   1709 			 */
   1710 			CONN_INC_REF(connp);
   1711 			mutex_exit(&connfp->connf_lock);
   1712 			return (connp);
   1713 		}
   1714 
   1715 		mutex_exit(&connfp->connf_lock);
   1716 
   1717 		lport = up[1];
   1718 		unlabeled = B_FALSE;
   1719 		/* Cred cannot be null on IPv4 */
   1720 		if (is_system_labeled()) {
   1721 			cred_t *cr = msg_getcred(mp, NULL);
   1722 			ASSERT(cr != NULL);
   1723 			unlabeled = (crgetlabel(cr)->tsl_flags &
   1724 			    TSLF_UNLABELED) != 0;
   1725 		}
   1726 		shared_addr = (zoneid == ALL_ZONES);
   1727 		if (shared_addr) {
   1728 			/*
   1729 			 * No need to handle exclusive-stack zones since
   1730 			 * ALL_ZONES only applies to the shared stack.
   1731 			 */
   1732 			zoneid = tsol_mlp_findzone(protocol, lport);
   1733 			/*
   1734 			 * If no shared MLP is found, tsol_mlp_findzone returns
   1735 			 * ALL_ZONES.  In that case, we assume it's SLP, and
   1736 			 * search for the zone based on the packet label.
   1737 			 *
   1738 			 * If there is such a zone, we prefer to find a
   1739 			 * connection in it.  Otherwise, we look for a
   1740 			 * MAC-exempt connection in any zone whose label
   1741 			 * dominates the default label on the packet.
   1742 			 */
   1743 			if (zoneid == ALL_ZONES)
   1744 				zoneid = tsol_packet_to_zoneid(mp);
   1745 			else
   1746 				unlabeled = B_FALSE;
   1747 		}
   1748 
   1749 		bind_connfp =
   1750 		    &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
   1751 		mutex_enter(&bind_connfp->connf_lock);
   1752 		for (connp = bind_connfp->connf_head; connp != NULL;
   1753 		    connp = connp->conn_next) {
   1754 			if (IPCL_BIND_MATCH(connp, protocol, ipha->ipha_dst,
   1755 			    lport) && (IPCL_ZONE_MATCH(connp, zoneid) ||
   1756 			    (unlabeled && shared_addr &&
   1757 			    (connp->conn_mac_mode != CONN_MAC_DEFAULT))))
   1758 				break;
   1759 		}
   1760 
   1761 		/*
   1762 		 * If the matching connection is SLP on a private address, then
   1763 		 * the label on the packet must match the local zone's label.
   1764 		 * Otherwise, it must be in the label range defined by tnrh.
   1765 		 * This is ensured by tsol_receive_label.
   1766 		 */
   1767 		if (connp != NULL && is_system_labeled() &&
   1768 		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
   1769 		    shared_addr, connp)) {
   1770 				DTRACE_PROBE3(
   1771 				    tx__ip__log__info__classify__tcp,
   1772 				    char *,
   1773 				    "connp(1) could not receive mp(2)",
   1774 				    conn_t *, connp, mblk_t *, mp);
   1775 			connp = NULL;
   1776 		}
   1777 
   1778 		if (connp != NULL) {
   1779 			/* Have a listener at least */
   1780 			CONN_INC_REF(connp);
   1781 			mutex_exit(&bind_connfp->connf_lock);
   1782 			return (connp);
   1783 		}
   1784 
   1785 		mutex_exit(&bind_connfp->connf_lock);
   1786 
   1787 		IPCL_DEBUG_LVL(512,
   1788 		    ("ipcl_classify: couldn't classify mp = %p\n",
   1789 		    (void *)mp));
   1790 		break;
   1791 
   1792 	case IPPROTO_UDP:
   1793 		lport = up[1];
   1794 		unlabeled = B_FALSE;
   1795 		/* Cred cannot be null on IPv4 */
   1796 		if (is_system_labeled()) {
   1797 			cred_t *cr = msg_getcred(mp, NULL);
   1798 			ASSERT(cr != NULL);
   1799 			unlabeled = (crgetlabel(cr)->tsl_flags &
   1800 			    TSLF_UNLABELED) != 0;
   1801 		}
   1802 		shared_addr = (zoneid == ALL_ZONES);
   1803 		if (shared_addr) {
   1804 			/*
   1805 			 * No need to handle exclusive-stack zones since
   1806 			 * ALL_ZONES only applies to the shared stack.
   1807 			 */
   1808 			zoneid = tsol_mlp_findzone(protocol, lport);
   1809 			/*
   1810 			 * If no shared MLP is found, tsol_mlp_findzone returns
   1811 			 * ALL_ZONES.  In that case, we assume it's SLP, and
   1812 			 * search for the zone based on the packet label.
   1813 			 *
   1814 			 * If there is such a zone, we prefer to find a
   1815 			 * connection in it.  Otherwise, we look for a
   1816 			 * MAC-exempt connection in any zone whose label
   1817 			 * dominates the default label on the packet.
   1818 			 */
   1819 			if (zoneid == ALL_ZONES)
   1820 				zoneid = tsol_packet_to_zoneid(mp);
   1821 			else
   1822 				unlabeled = B_FALSE;
   1823 		}
   1824 		fport = up[0];
   1825 		IPCL_DEBUG_LVL(512, ("ipcl_udp_classify %x %x", lport, fport));
   1826 		connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
   1827 		mutex_enter(&connfp->connf_lock);
   1828 		for (connp = connfp->connf_head; connp != NULL;
   1829 		    connp = connp->conn_next) {
   1830 			if (IPCL_UDP_MATCH(connp, lport, ipha->ipha_dst,
   1831 			    fport, ipha->ipha_src) &&
   1832 			    (IPCL_ZONE_MATCH(connp, zoneid) ||
   1833 			    (unlabeled && shared_addr &&
   1834 			    (connp->conn_mac_mode != CONN_MAC_DEFAULT))))
   1835 				break;
   1836 		}
   1837 
   1838 		if (connp != NULL && is_system_labeled() &&
   1839 		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
   1840 		    shared_addr, connp)) {
   1841 			DTRACE_PROBE3(tx__ip__log__info__classify__udp,
   1842 			    char *, "connp(1) could not receive mp(2)",
   1843 			    conn_t *, connp, mblk_t *, mp);
   1844 			connp = NULL;
   1845 		}
   1846 
   1847 		if (connp != NULL) {
   1848 			CONN_INC_REF(connp);
   1849 			mutex_exit(&connfp->connf_lock);
   1850 			return (connp);
   1851 		}
   1852 
   1853 		/*
   1854 		 * We shouldn't come here for multicast/broadcast packets
   1855 		 */
   1856 		mutex_exit(&connfp->connf_lock);
   1857 		IPCL_DEBUG_LVL(512,
   1858 		    ("ipcl_classify: cant find udp conn_t for ports : %x %x",
   1859 		    lport, fport));
   1860 		break;
   1861 
   1862 	case IPPROTO_ENCAP:
   1863 	case IPPROTO_IPV6:
   1864 		return (ipcl_iptun_classify_v4(&ipha->ipha_src,
   1865 		    &ipha->ipha_dst, ipst));
   1866 	}
   1867 
   1868 	return (NULL);
   1869 }
   1870 
   1871 conn_t *
   1872 ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid,
   1873     ip_stack_t *ipst)
   1874 {
   1875 	ip6_t		*ip6h;
   1876 	connf_t		*connfp, *bind_connfp;
   1877 	uint16_t	lport;
   1878 	uint16_t	fport;
   1879 	tcph_t		*tcph;
   1880 	uint32_t	ports;
   1881 	conn_t		*connp;
   1882 	uint16_t	*up;
   1883 	boolean_t	shared_addr;
   1884 	boolean_t	unlabeled;
   1885 
   1886 	ip6h = (ip6_t *)mp->b_rptr;
   1887 
   1888 	switch (protocol) {
   1889 	case IPPROTO_TCP:
   1890 		tcph = (tcph_t *)&mp->b_rptr[hdr_len];
   1891 		up = (uint16_t *)tcph->th_lport;
   1892 		ports = *(uint32_t *)up;
   1893 
   1894 		connfp =
   1895 		    &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_src,
   1896 		    ports, ipst)];
   1897 		mutex_enter(&connfp->connf_lock);
   1898 		for (connp = connfp->connf_head; connp != NULL;
   1899 		    connp = connp->conn_next) {
   1900 			if ((IPCL_CONN_MATCH_V6(connp, protocol,
   1901 			    ip6h->ip6_src, ip6h->ip6_dst, ports)) &&
   1902 			    (IPCL_ZONE_MATCH(connp, zoneid))) {
   1903 				break;
   1904 			}
   1905 		}
   1906 
   1907 		if (connp != NULL) {
   1908 			/*
   1909 			 * We have a fully-bound TCP connection.
   1910 			 *
   1911 			 * For labeled systems, there's no need to check the
   1912 			 * label here.  It's known to be good as we checked
   1913 			 * before allowing the connection to become bound.
   1914 			 */
   1915 			CONN_INC_REF(connp);
   1916 			mutex_exit(&connfp->connf_lock);
   1917 			return (connp);
   1918 		}
   1919 
   1920 		mutex_exit(&connfp->connf_lock);
   1921 
   1922 		lport = up[1];
   1923 		unlabeled = B_FALSE;
   1924 		/* Cred can be null on IPv6 */
   1925 		if (is_system_labeled()) {
   1926 			cred_t *cr = msg_getcred(mp, NULL);
   1927 
   1928 			unlabeled = (cr != NULL &&
   1929 			    crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0;
   1930 		}
   1931 		shared_addr = (zoneid == ALL_ZONES);
   1932 		if (shared_addr) {
   1933 			/*
   1934 			 * No need to handle exclusive-stack zones since
   1935 			 * ALL_ZONES only applies to the shared stack.
   1936 			 */
   1937 			zoneid = tsol_mlp_findzone(protocol, lport);
   1938 			/*
   1939 			 * If no shared MLP is found, tsol_mlp_findzone returns
   1940 			 * ALL_ZONES.  In that case, we assume it's SLP, and
   1941 			 * search for the zone based on the packet label.
   1942 			 *
   1943 			 * If there is such a zone, we prefer to find a
   1944 			 * connection in it.  Otherwise, we look for a
   1945 			 * MAC-exempt connection in any zone whose label
   1946 			 * dominates the default label on the packet.
   1947 			 */
   1948 			if (zoneid == ALL_ZONES)
   1949 				zoneid = tsol_packet_to_zoneid(mp);
   1950 			else
   1951 				unlabeled = B_FALSE;
   1952 		}
   1953 
   1954 		bind_connfp =
   1955 		    &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
   1956 		mutex_enter(&bind_connfp->connf_lock);
   1957 		for (connp = bind_connfp->connf_head; connp != NULL;
   1958 		    connp = connp->conn_next) {
   1959 			if (IPCL_BIND_MATCH_V6(connp, protocol,
   1960 			    ip6h->ip6_dst, lport) &&
   1961 			    (IPCL_ZONE_MATCH(connp, zoneid) ||
   1962 			    (unlabeled && shared_addr &&
   1963 			    (connp->conn_mac_mode != CONN_MAC_DEFAULT))))
   1964 				break;
   1965 		}
   1966 
   1967 		if (connp != NULL && is_system_labeled() &&
   1968 		    !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
   1969 		    shared_addr, connp)) {
   1970 			DTRACE_PROBE3(tx__ip__log__info__classify__tcp6,
   1971 			    char *, "connp(1) could not receive mp(2)",
   1972 			    conn_t *, connp, mblk_t *, mp);
   1973 			connp = NULL;
   1974 		}
   1975 
   1976 		if (connp != NULL) {
   1977 			/* Have a listner at least */
   1978 			CONN_INC_REF(connp);
   1979 			mutex_exit(&bind_connfp->connf_lock);
   1980 			IPCL_DEBUG_LVL(512,
   1981 			    ("ipcl_classify_v6: found listner "
   1982 			    "connp = %p\n", (void *)connp));
   1983 
   1984 			return (connp);
   1985 		}
   1986 
   1987 		mutex_exit(&bind_connfp->connf_lock);
   1988 
   1989 		IPCL_DEBUG_LVL(512,
   1990 		    ("ipcl_classify_v6: couldn't classify mp = %p\n",
   1991 		    (void *)mp));
   1992 		break;
   1993 
   1994 	case IPPROTO_UDP:
   1995 		up = (uint16_t *)&mp->b_rptr[hdr_len];
   1996 		lport = up[1];
   1997 		unlabeled = B_FALSE;
   1998 		/* Cred can be null on IPv6 */
   1999 		if (is_system_labeled()) {
   2000 			cred_t *cr = msg_getcred(mp, NULL);
   2001 
   2002 			unlabeled = (cr != NULL &&
   2003 			    crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0;
   2004 		}
   2005 		shared_addr = (zoneid == ALL_ZONES);
   2006 		if (shared_addr) {
   2007 			/*
   2008 			 * No need to handle exclusive-stack zones since
   2009 			 * ALL_ZONES only applies to the shared stack.
   2010 			 */
   2011 			zoneid = tsol_mlp_findzone(protocol, lport);
   2012 			/*
   2013 			 * If no shared MLP is found, tsol_mlp_findzone returns
   2014 			 * ALL_ZONES.  In that case, we assume it's SLP, and
   2015 			 * search for the zone based on the packet label.
   2016 			 *
   2017 			 * If there is such a zone, we prefer to find a
   2018 			 * connection in it.  Otherwise, we look for a
   2019 			 * MAC-exempt connection in any zone whose label
   2020 			 * dominates the default label on the packet.
   2021 			 */
   2022 			if (zoneid == ALL_ZONES)
   2023 				zoneid = tsol_packet_to_zoneid(mp);
   2024 			else
   2025 				unlabeled = B_FALSE;
   2026 		}
   2027 
   2028 		fport = up[0];
   2029 		IPCL_DEBUG_LVL(512, ("ipcl_udp_classify_v6 %x %x", lport,
   2030 		    fport));
   2031 		connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
   2032 		mutex_enter(&connfp->connf_lock);
   2033 		for (connp = connfp->connf_head; connp != NULL;
   2034 		    connp = connp->conn_next) {
   2035 			if (IPCL_UDP_MATCH_V6(connp, lport, ip6h->ip6_dst,
   2036 			    fport, ip6h->ip6_src) &&
   2037 			    (IPCL_ZONE_MATCH(connp, zoneid) ||
   2038 			    (unlabeled && shared_addr &&
   2039 			    (connp->conn_mac_mode != CONN_MAC_DEFAULT))))
   2040 				break;
   2041 		}
   2042 
   2043 		if (connp != NULL && is_system_labeled() &&
   2044 		    !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
   2045 		    shared_addr, connp)) {
   2046 			DTRACE_PROBE3(tx__ip__log__info__classify__udp6,
   2047 			    char *, "connp(1) could not receive mp(2)",
   2048 			    conn_t *, connp, mblk_t *, mp);
   2049 			connp = NULL;
   2050 		}
   2051 
   2052 		if (connp != NULL) {
   2053 			CONN_INC_REF(connp);
   2054 			mutex_exit(&connfp->connf_lock);
   2055 			return (connp);
   2056 		}
   2057 
   2058 		/*
   2059 		 * We shouldn't come here for multicast/broadcast packets
   2060 		 */
   2061 		mutex_exit(&connfp->connf_lock);
   2062 		IPCL_DEBUG_LVL(512,
   2063 		    ("ipcl_classify_v6: cant find udp conn_t for ports : %x %x",
   2064 		    lport, fport));
   2065 		break;
   2066 	case IPPROTO_ENCAP:
   2067 	case IPPROTO_IPV6:
   2068 		return (ipcl_iptun_classify_v6(&ip6h->ip6_src,
   2069 		    &ip6h->ip6_dst, ipst));
   2070 	}
   2071 
   2072 	return (NULL);
   2073 }
   2074 
   2075 /*
   2076  * wrapper around ipcl_classify_(v4,v6) routines.
   2077  */
   2078 conn_t *
   2079 ipcl_classify(mblk_t *mp, zoneid_t zoneid, ip_stack_t *ipst)
   2080 {
   2081 	uint16_t	hdr_len;
   2082 	ipha_t		*ipha;
   2083 	uint8_t		*nexthdrp;
   2084 
   2085 	if (MBLKL(mp) < sizeof (ipha_t))
   2086 		return (NULL);
   2087 
   2088 	switch (IPH_HDR_VERSION(mp->b_rptr)) {
   2089 	case IPV4_VERSION:
   2090 		ipha = (ipha_t *)mp->b_rptr;
   2091 		hdr_len = IPH_HDR_LENGTH(ipha);
   2092 		return (ipcl_classify_v4(mp, ipha->ipha_protocol, hdr_len,
   2093 		    zoneid, ipst));
   2094 	case IPV6_VERSION:
   2095 		if (!ip_hdr_length_nexthdr_v6(mp, (ip6_t *)mp->b_rptr,
   2096 		    &hdr_len, &nexthdrp))
   2097 			return (NULL);
   2098 
   2099 		return (ipcl_classify_v6(mp, *nexthdrp, hdr_len, zoneid, ipst));
   2100 	}
   2101 
   2102 	return (NULL);
   2103 }
   2104 
   2105 conn_t *
   2106 ipcl_classify_raw(mblk_t *mp, uint8_t protocol, zoneid_t zoneid,
   2107     uint32_t ports, ipha_t *hdr, ip_stack_t *ipst)
   2108 {
   2109 	connf_t		*connfp;
   2110 	conn_t		*connp;
   2111 	in_port_t	lport;
   2112 	int		af;
   2113 	boolean_t	shared_addr;
   2114 	boolean_t	unlabeled;
   2115 	const void	*dst;
   2116 
   2117 	lport = ((uint16_t *)&ports)[1];
   2118 
   2119 	unlabeled = B_FALSE;
   2120 	/* Cred can be null on IPv6 */
   2121 	if (is_system_labeled()) {
   2122 		cred_t *cr = msg_getcred(mp, NULL);
   2123 
   2124 		unlabeled = (cr != NULL &&
   2125 		    crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0;
   2126 	}
   2127 	shared_addr = (zoneid == ALL_ZONES);
   2128 	if (shared_addr) {
   2129 		/*
   2130 		 * No need to handle exclusive-stack zones since ALL_ZONES
   2131 		 * only applies to the shared stack.
   2132 		 */
   2133 		zoneid = tsol_mlp_findzone(protocol, lport);
   2134 		/*
   2135 		 * If no shared MLP is found, tsol_mlp_findzone returns
   2136 		 * ALL_ZONES.  In that case, we assume it's SLP, and search for
   2137 		 * the zone based on the packet label.
   2138 		 *
   2139 		 * If there is such a zone, we prefer to find a connection in
   2140 		 * it.  Otherwise, we look for a MAC-exempt connection in any
   2141 		 * zone whose label dominates the default label on the packet.
   2142 		 */
   2143 		if (zoneid == ALL_ZONES)
   2144 			zoneid = tsol_packet_to_zoneid(mp);
   2145 		else
   2146 			unlabeled = B_FALSE;
   2147 	}
   2148 
   2149 	af = IPH_HDR_VERSION(hdr);
   2150 	dst = af == IPV4_VERSION ? (const void *)&hdr->ipha_dst :
   2151 	    (const void *)&((ip6_t *)hdr)->ip6_dst;
   2152 	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];
   2153 
   2154 	mutex_enter(&connfp->connf_lock);
   2155 	for (connp = connfp->connf_head; connp != NULL;
   2156 	    connp = connp->conn_next) {
   2157 		/* We don't allow v4 fallback for v6 raw socket. */
   2158 		if (af == (connp->conn_af_isv6 ? IPV4_VERSION :
   2159 		    IPV6_VERSION))
   2160 			continue;
   2161 		if (connp->conn_fully_bound) {
   2162 			if (af == IPV4_VERSION) {
   2163 				if (!IPCL_CONN_MATCH(connp, protocol,
   2164 				    hdr->ipha_src, hdr->ipha_dst, ports))
   2165 					continue;
   2166 			} else {
   2167 				if (!IPCL_CONN_MATCH_V6(connp, protocol,
   2168 				    ((ip6_t *)hdr)->ip6_src,
   2169 				    ((ip6_t *)hdr)->ip6_dst, ports))
   2170 					continue;
   2171 			}
   2172 		} else {
   2173 			if (af == IPV4_VERSION) {
   2174 				if (!IPCL_BIND_MATCH(connp, protocol,
   2175 				    hdr->ipha_dst, lport))
   2176 					continue;
   2177 			} else {
   2178 				if (!IPCL_BIND_MATCH_V6(connp, protocol,
   2179 				    ((ip6_t *)hdr)->ip6_dst, lport))
   2180 					continue;
   2181 			}
   2182 		}
   2183 
   2184 		if (IPCL_ZONE_MATCH(connp, zoneid) ||
   2185 		    (unlabeled &&
   2186 		    (connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
   2187 		    shared_addr))
   2188 			break;
   2189 	}
   2190 	/*
   2191 	 * If the connection is fully-bound and connection-oriented (TCP or
   2192 	 * SCTP), then we've already validated the remote system's label.
   2193 	 * There's no need to do it again for every packet.
   2194 	 */
   2195 	if (connp != NULL && is_system_labeled() && (!connp->conn_fully_bound ||
   2196 	    !(connp->conn_flags & (IPCL_TCP|IPCL_SCTPCONN))) &&
   2197 	    !tsol_receive_local(mp, dst, af, shared_addr, connp)) {
   2198 		DTRACE_PROBE3(tx__ip__log__info__classify__rawip,
   2199 		    char *, "connp(1) could not receive mp(2)",
   2200 		    conn_t *, connp, mblk_t *, mp);
   2201 		connp = NULL;
   2202 	}
   2203 
   2204 	if (connp != NULL)
   2205 		goto found;
   2206 	mutex_exit(&connfp->connf_lock);
   2207 
   2208 	/* Try to look for a wildcard match. */
   2209 	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(0, ipst)];
   2210 	mutex_enter(&connfp->connf_lock);
   2211 	for (connp = connfp->connf_head; connp != NULL;
   2212 	    connp = connp->conn_next) {
   2213 		/* We don't allow v4 fallback for v6 raw socket. */
   2214 		if ((af == (connp->conn_af_isv6 ? IPV4_VERSION :
   2215 		    IPV6_VERSION)) || !IPCL_ZONE_MATCH(connp, zoneid)) {
   2216 			continue;
   2217 		}
   2218 		if (af == IPV4_VERSION) {
   2219 			if (IPCL_RAW_MATCH(connp, protocol, hdr->ipha_dst))
   2220 				break;
   2221 		} else {
   2222 			if (IPCL_RAW_MATCH_V6(connp, protocol,
   2223 			    ((ip6_t *)hdr)->ip6_dst)) {
   2224 				break;
   2225 			}
   2226 		}
   2227 	}
   2228 
   2229 	if (connp != NULL)
   2230 		goto found;
   2231 
   2232 	mutex_exit(&connfp->connf_lock);
   2233 	return (NULL);
   2234 
   2235 found:
   2236 	ASSERT(connp != NULL);
   2237 	CONN_INC_REF(connp);
   2238 	mutex_exit(&connfp->connf_lock);
   2239 	return (connp);
   2240 }
   2241 
   2242 /* ARGSUSED */
   2243 static int
   2244 tcp_conn_constructor(void *buf, void *cdrarg, int kmflags)
   2245 {
   2246 	itc_t	*itc = (itc_t *)buf;
   2247 	conn_t 	*connp = &itc->itc_conn;
   2248 	tcp_t	*tcp = (tcp_t *)&itc[1];
   2249 
   2250 	bzero(connp, sizeof (conn_t));
   2251 	bzero(tcp, sizeof (tcp_t));
   2252 
   2253 	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
   2254 	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
   2255 	cv_init(&connp->conn_sq_cv, NULL, CV_DEFAULT, NULL);
   2256 	tcp->tcp_timercache = tcp_timermp_alloc(KM_NOSLEEP);
   2257 	connp->conn_tcp = tcp;
   2258 	connp->conn_flags = IPCL_TCPCONN;
   2259 	connp->conn_ulp = IPPROTO_TCP;
   2260 	tcp->tcp_connp = connp;
   2261 	return (0);
   2262 }
   2263 
   2264 /* ARGSUSED */
   2265 static void
   2266 tcp_conn_destructor(void *buf, void *cdrarg)
   2267 {
   2268 	itc_t	*itc = (itc_t *)buf;
   2269 	conn_t 	*connp = &itc->itc_conn;
   2270 	tcp_t	*tcp = (tcp_t *)&itc[1];
   2271 
   2272 	ASSERT(connp->conn_flags & IPCL_TCPCONN);
   2273 	ASSERT(tcp->tcp_connp == connp);
   2274 	ASSERT(connp->conn_tcp == tcp);
   2275 	tcp_timermp_free(tcp);
   2276 	mutex_destroy(&connp->conn_lock);
   2277 	cv_destroy(&connp->conn_cv);
   2278 	cv_destroy(&connp->conn_sq_cv);
   2279 }
   2280 
   2281 /* ARGSUSED */
   2282 static int
   2283 ip_conn_constructor(void *buf, void *cdrarg, int kmflags)
   2284 {
   2285 	itc_t	*itc = (itc_t *)buf;
   2286 	conn_t 	*connp = &itc->itc_conn;
   2287 
   2288 	bzero(connp, sizeof (conn_t));
   2289 	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
   2290 	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
   2291 	connp->conn_flags = IPCL_IPCCONN;
   2292 
   2293 	return (0);
   2294 }
   2295 
   2296 /* ARGSUSED */
   2297 static void
   2298 ip_conn_destructor(void *buf, void *cdrarg)
   2299 {
   2300 	itc_t	*itc = (itc_t *)buf;
   2301 	conn_t 	*connp = &itc->itc_conn;
   2302 
   2303 	ASSERT(connp->conn_flags & IPCL_IPCCONN);
   2304 	ASSERT(connp->conn_priv == NULL);
   2305 	mutex_destroy(&connp->conn_lock);
   2306 	cv_destroy(&connp->conn_cv);
   2307 }
   2308 
   2309 /* ARGSUSED */
   2310 static int
   2311 udp_conn_constructor(void *buf, void *cdrarg, int kmflags)
   2312 {
   2313 	itc_t	*itc = (itc_t *)buf;
   2314 	conn_t 	*connp = &itc->itc_conn;
   2315 	udp_t	*udp = (udp_t *)&itc[1];
   2316 
   2317 	bzero(connp, sizeof (conn_t));
   2318 	bzero(udp, sizeof (udp_t));
   2319 
   2320 	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
   2321 	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
   2322 	connp->conn_udp = udp;
   2323 	connp->conn_flags = IPCL_UDPCONN;
   2324 	connp->conn_ulp = IPPROTO_UDP;
   2325 	udp->udp_connp = connp;
   2326 	return (0);
   2327 }
   2328 
   2329 /* ARGSUSED */
   2330 static void
   2331 udp_conn_destructor(void *buf, void *cdrarg)
   2332 {
   2333 	itc_t	*itc = (itc_t *)buf;
   2334 	conn_t 	*connp = &itc->itc_conn;
   2335 	udp_t	*udp = (udp_t *)&itc[1];
   2336 
   2337 	ASSERT(connp->conn_flags & IPCL_UDPCONN);
   2338 	ASSERT(udp->udp_connp == connp);
   2339 	ASSERT(connp->conn_udp == udp);
   2340 	mutex_destroy(&connp->conn_lock);
   2341 	cv_destroy(&connp->conn_cv);
   2342 }
   2343 
   2344 /* ARGSUSED */
   2345 static int
   2346 rawip_conn_constructor(void *buf, void *cdrarg, int kmflags)
   2347 {
   2348 	itc_t	*itc = (itc_t *)buf;
   2349 	conn_t 	*connp = &itc->itc_conn;
   2350 	icmp_t	*icmp = (icmp_t *)&itc[1];
   2351 
   2352 	bzero(connp, sizeof (conn_t));
   2353 	bzero(icmp, sizeof (icmp_t));
   2354 
   2355 	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
   2356 	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
   2357 	connp->conn_icmp = icmp;
   2358 	connp->conn_flags = IPCL_RAWIPCONN;
   2359 	connp->conn_ulp = IPPROTO_ICMP;
   2360 	icmp->icmp_connp = connp;
   2361 	return (0);
   2362 }
   2363 
   2364 /* ARGSUSED */
   2365 static void
   2366 rawip_conn_destructor(void *buf, void *cdrarg)
   2367 {
   2368 	itc_t	*itc = (itc_t *)buf;
   2369 	conn_t 	*connp = &itc->itc_conn;
   2370 	icmp_t	*icmp = (icmp_t *)&itc[1];
   2371 
   2372 	ASSERT(connp->conn_flags & IPCL_RAWIPCONN);
   2373 	ASSERT(icmp->icmp_connp == connp);
   2374 	ASSERT(connp->conn_icmp == icmp);
   2375 	mutex_destroy(&connp->conn_lock);
   2376 	cv_destroy(&connp->conn_cv);
   2377 }
   2378 
   2379 /* ARGSUSED */
   2380 static int
   2381 rts_conn_constructor(void *buf, void *cdrarg, int kmflags)
   2382 {
   2383 	itc_t	*itc = (itc_t *)buf;
   2384 	conn_t 	*connp = &itc->itc_conn;
   2385 	rts_t	*rts = (rts_t *)&itc[1];
   2386 
   2387 	bzero(connp, sizeof (conn_t));
   2388 	bzero(rts, sizeof (rts_t));
   2389 
   2390 	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
   2391 	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
   2392 	connp->conn_rts = rts;
   2393 	connp->conn_flags = IPCL_RTSCONN;
   2394 	rts->rts_connp = connp;
   2395 	return (0);
   2396 }
   2397 
   2398 /* ARGSUSED */
   2399 static void
   2400 rts_conn_destructor(void *buf, void *cdrarg)
   2401 {
   2402 	itc_t	*itc = (itc_t *)buf;
   2403 	conn_t 	*connp = &itc->itc_conn;
   2404 	rts_t	*rts = (rts_t *)&itc[1];
   2405 
   2406 	ASSERT(connp->conn_flags & IPCL_RTSCONN);
   2407 	ASSERT(rts->rts_connp == connp);
   2408 	ASSERT(connp->conn_rts == rts);
   2409 	mutex_destroy(&connp->conn_lock);
   2410 	cv_destroy(&connp->conn_cv);
   2411 }
   2412 
   2413 /* ARGSUSED */
   2414 int
   2415 ip_helper_stream_constructor(void *buf, void *cdrarg, int kmflags)
   2416 {
   2417 	int error;
   2418 	netstack_t	*ns;
   2419 	int		ret;
   2420 	tcp_stack_t	*tcps;
   2421 	ip_helper_stream_info_t	*ip_helper_str;
   2422 	ip_stack_t	*ipst;
   2423 
   2424 	ns = netstack_find_by_cred(kcred);
   2425 	ASSERT(ns != NULL);
   2426 	tcps = ns->netstack_tcp;
   2427 	ipst = ns->netstack_ip;
   2428 	ASSERT(tcps != NULL);
   2429 	ip_helper_str = (ip_helper_stream_info_t *)buf;
   2430 
   2431 	do {
   2432 		error = ldi_open_by_name(DEV_IP, IP_HELPER_STR, kcred,
   2433 		    &ip_helper_str->iphs_handle, ipst->ips_ldi_ident);
   2434 	} while (error == EINTR);
   2435 
   2436 	if (error == 0) {
   2437 		do {
   2438 			error = ldi_ioctl(
   2439 			    ip_helper_str->iphs_handle, SIOCSQPTR,
   2440 			    (intptr_t)buf, FKIOCTL, kcred, &ret);
   2441 		} while (error == EINTR);
   2442 
   2443 		if (error != 0) {
   2444 			(void) ldi_close(
   2445 			    ip_helper_str->iphs_handle, 0, kcred);
   2446 		}
   2447 	}
   2448 
   2449 	netstack_rele(ipst->ips_netstack);
   2450 
   2451 	return (error);
   2452 }
   2453 
   2454 /* ARGSUSED */
   2455 static void
   2456 ip_helper_stream_destructor(void *buf, void *cdrarg)
   2457 {
   2458 	ip_helper_stream_info_t *ip_helper_str = (ip_helper_stream_info_t *)buf;
   2459 
   2460 	ip_helper_str->iphs_rq->q_ptr =
   2461 	    ip_helper_str->iphs_wq->q_ptr =
   2462 	    ip_helper_str->iphs_minfo;
   2463 	(void) ldi_close(ip_helper_str->iphs_handle, 0, kcred);
   2464 }
   2465 
   2466 
   2467 /*
   2468  * Called as part of ipcl_conn_destroy to assert and clear any pointers
   2469  * in the conn_t.
   2470  */
   2471 void
   2472 ipcl_conn_cleanup(conn_t *connp)
   2473 {
   2474 	ASSERT(connp->conn_ire_cache == NULL);
   2475 	ASSERT(connp->conn_latch == NULL);
   2476 #ifdef notdef
   2477 	ASSERT(connp->conn_rq == NULL);
   2478 	ASSERT(connp->conn_wq == NULL);
   2479 #endif
   2480 	ASSERT(connp->conn_cred == NULL);
   2481 	ASSERT(connp->conn_g_fanout == NULL);
   2482 	ASSERT(connp->conn_g_next == NULL);
   2483 	ASSERT(connp->conn_g_prev == NULL);
   2484 	ASSERT(connp->conn_policy == NULL);
   2485 	ASSERT(connp->conn_fanout == NULL);
   2486 	ASSERT(connp->conn_next == NULL);
   2487 	ASSERT(connp->conn_prev == NULL);
   2488 #ifdef notdef
   2489 	/*
   2490 	 * The ill and ipif pointers are not cleared before the conn_t
   2491 	 * goes away since they do not hold a reference on the ill/ipif.
   2492 	 * We should replace these pointers with ifindex/ipaddr_t to
   2493 	 * make the code less complex.
   2494 	 */
   2495 	ASSERT(connp->conn_outgoing_ill == NULL);
   2496 	ASSERT(connp->conn_incoming_ill == NULL);
   2497 	ASSERT(connp->conn_multicast_ipif == NULL);
   2498 	ASSERT(connp->conn_multicast_ill == NULL);
   2499 #endif
   2500 	ASSERT(connp->conn_oper_pending_ill == NULL);
   2501 	ASSERT(connp->conn_ilg == NULL);
   2502 	ASSERT(connp->conn_drain_next == NULL);
   2503 	ASSERT(connp->conn_drain_prev == NULL);
   2504 #ifdef notdef
   2505 	/* conn_idl is not cleared when removed from idl list */
   2506 	ASSERT(connp->conn_idl == NULL);
   2507 #endif
   2508 	ASSERT(connp->conn_ipsec_opt_mp == NULL);
   2509 	ASSERT(connp->conn_effective_cred == NULL);
   2510 	ASSERT(connp->conn_netstack == NULL);
   2511 
   2512 	ASSERT(connp->conn_helper_info == NULL);
   2513 	/* Clear out the conn_t fields that are not preserved */
   2514 	bzero(&connp->conn_start_clr,
   2515 	    sizeof (conn_t) -
   2516 	    ((uchar_t *)&connp->conn_start_clr - (uchar_t *)connp));
   2517 }
   2518 
   2519 /*
   2520  * All conns are inserted in a global multi-list for the benefit of
   2521  * walkers. The walk is guaranteed to walk all open conns at the time
   2522  * of the start of the walk exactly once. This property is needed to
   2523  * achieve some cleanups during unplumb of interfaces. This is achieved
   2524  * as follows.
   2525  *
   2526  * ipcl_conn_create and ipcl_conn_destroy are the only functions that
   2527  * call the insert and delete functions below at creation and deletion
   2528  * time respectively. The conn never moves or changes its position in this
   2529  * multi-list during its lifetime. CONN_CONDEMNED ensures that the refcnt
   2530  * won't increase due to walkers, once the conn deletion has started. Note
   2531  * that we can't remove the conn from the global list and then wait for
   2532  * the refcnt to drop to zero, since walkers would then see a truncated
   2533  * list. CONN_INCIPIENT ensures that walkers don't start looking at
   2534  * conns until ip_open is ready to make them globally visible.
   2535  * The global round robin multi-list locks are held only to get the
   2536  * next member/insertion/deletion and contention should be negligible
   2537  * if the multi-list is much greater than the number of cpus.
   2538  */
   2539 void
   2540 ipcl_globalhash_insert(conn_t *connp)
   2541 {
   2542 	int	index;
   2543 	struct connf_s	*connfp;
   2544 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
   2545 
   2546 	/*
   2547 	 * No need for atomic here. Approximate even distribution
   2548 	 * in the global lists is sufficient.
   2549 	 */
   2550 	ipst->ips_conn_g_index++;
   2551 	index = ipst->ips_conn_g_index & (CONN_G_HASH_SIZE - 1);
   2552 
   2553 	connp->conn_g_prev = NULL;
   2554 	/*
   2555 	 * Mark as INCIPIENT, so that walkers will ignore this
   2556 	 * for now, till ip_open is ready to make it visible globally.
   2557 	 */
   2558 	connp->conn_state_flags |= CONN_INCIPIENT;
   2559 
   2560 	connfp = &ipst->ips_ipcl_globalhash_fanout[index];
   2561 	/* Insert at the head of the list */
   2562 	mutex_enter(&connfp->connf_lock);
   2563 	connp->conn_g_next = connfp->connf_head;
   2564 	if (connp->conn_g_next != NULL)
   2565 		connp->conn_g_next->conn_g_prev = connp;
   2566 	connfp->connf_head = connp;
   2567 
   2568 	/* The fanout bucket this conn points to */
   2569 	connp->conn_g_fanout = connfp;
   2570 
   2571 	mutex_exit(&connfp->connf_lock);
   2572 }
   2573 
   2574 void
   2575 ipcl_globalhash_remove(conn_t *connp)
   2576 {
   2577 	struct connf_s	*connfp;
   2578 
   2579 	/*
   2580 	 * We were never inserted in the global multi list.
   2581 	 * IPCL_NONE variety is never inserted in the global multilist
   2582 	 * since it is presumed to not need any cleanup and is transient.
   2583 	 */
   2584 	if (connp->conn_g_fanout == NULL)
   2585 		return;
   2586 
   2587 	connfp = connp->conn_g_fanout;
   2588 	mutex_enter(&connfp->connf_lock);
   2589 	if (connp->conn_g_prev != NULL)
   2590 		connp->conn_g_prev->conn_g_next = connp->conn_g_next;
   2591 	else
   2592 		connfp->connf_head = connp->conn_g_next;
   2593 	if (connp->conn_g_next != NULL)
   2594 		connp->conn_g_next->conn_g_prev = connp->conn_g_prev;
   2595 	mutex_exit(&connfp->connf_lock);
   2596 
   2597 	/* Better to stumble on a null pointer than to corrupt memory */
   2598 	connp->conn_g_next = NULL;
   2599 	connp->conn_g_prev = NULL;
   2600 	connp->conn_g_fanout = NULL;
   2601 }
   2602 
   2603 /*
   2604  * Walk the list of all conn_t's in the system, calling the function provided
   2605  * with the specified argument for each.
   2606  * Applies to both IPv4 and IPv6.
   2607  *
   2608  * IPCs may hold pointers to ipif/ill. To guard against stale pointers
   2609  * ipcl_walk() is called to cleanup the conn_t's, typically when an interface is
   2610  * unplumbed or removed. New conn_t's that are created while we are walking
   2611  * may be missed by this walk, because they are not necessarily inserted
   2612  * at the tail of the list. They are new conn_t's and thus don't have any
   2613  * stale pointers. The CONN_CLOSING flag ensures that no new reference
   2614  * is created to the struct that is going away.
   2615  */
   2616 void
   2617 ipcl_walk(pfv_t func, void *arg, ip_stack_t *ipst)
   2618 {
   2619 	int	i;
   2620 	conn_t	*connp;
   2621 	conn_t	*prev_connp;
   2622 
   2623 	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
   2624 		mutex_enter(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
   2625 		prev_connp = NULL;
   2626 		connp = ipst->ips_ipcl_globalhash_fanout[i].connf_head;
   2627 		while (connp != NULL) {
   2628 			mutex_enter(&connp->conn_lock);
   2629 			if (connp->conn_state_flags &
   2630 			    (CONN_CONDEMNED | CONN_INCIPIENT)) {
   2631 				mutex_exit(&connp->conn_lock);
   2632 				connp = connp->conn_g_next;
   2633 				continue;
   2634 			}
   2635 			CONN_INC_REF_LOCKED(connp);
   2636 			mutex_exit(&connp->conn_lock);
   2637 			mutex_exit(
   2638 			    &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
   2639 			(*func)(connp, arg);
   2640 			if (prev_connp != NULL)
   2641 				CONN_DEC_REF(prev_connp);
   2642 			mutex_enter(
   2643 			    &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
   2644 			prev_connp = connp;
   2645 			connp = connp->conn_g_next;
   2646 		}
   2647 		mutex_exit(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
   2648 		if (prev_connp != NULL)
   2649 			CONN_DEC_REF(prev_connp);
   2650 	}
   2651 }
   2652 
   2653 /*
   2654  * Search for a peer TCP/IPv4 loopback conn by doing a reverse lookup on
   2655  * the {src, dst, lport, fport} quadruplet.  Returns with conn reference
   2656  * held; caller must call CONN_DEC_REF.  Only checks for connected entries
   2657  * (peer tcp in ESTABLISHED state).
   2658  */
   2659 conn_t *
   2660 ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcph_t *tcph,
   2661     ip_stack_t *ipst)
   2662 {
   2663 	uint32_t ports;
   2664 	uint16_t *pports = (uint16_t *)&ports;
   2665 	connf_t	*connfp;
   2666 	conn_t	*tconnp;
   2667 	boolean_t zone_chk;
   2668 
   2669 	/*
   2670 	 * If either the source of destination address is loopback, then
   2671 	 * both endpoints must be in the same Zone.  Otherwise, both of
   2672 	 * the addresses are system-wide unique (tcp is in ESTABLISHED
   2673 	 * state) and the endpoints may reside in different Zones.
   2674 	 */
   2675 	zone_chk = (ipha->ipha_src == htonl(INADDR_LOOPBACK) ||
   2676 	    ipha->ipha_dst == htonl(INADDR_LOOPBACK));
   2677 
   2678 	bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t));
   2679 	bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t));
   2680 
   2681 	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
   2682 	    ports, ipst)];
   2683 
   2684 	mutex_enter(&connfp->connf_lock);
   2685 	for (tconnp = connfp->connf_head; tconnp != NULL;
   2686 	    tconnp = tconnp->conn_next) {
   2687 
   2688 		if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
   2689 		    ipha->ipha_dst, ipha->ipha_src, ports) &&
   2690 		    tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
   2691 		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {
   2692 
   2693 			ASSERT(tconnp != connp);
   2694 			CONN_INC_REF(tconnp);
   2695 			mutex_exit(&connfp->connf_lock);
   2696 			return (tconnp);
   2697 		}
   2698 	}
   2699 	mutex_exit(&connfp->connf_lock);
   2700 	return (NULL);
   2701 }
   2702 
   2703 /*
   2704  * Search for a peer TCP/IPv6 loopback conn by doing a reverse lookup on
   2705  * the {src, dst, lport, fport} quadruplet.  Returns with conn reference
   2706  * held; caller must call CONN_DEC_REF.  Only checks for connected entries
   2707  * (peer tcp in ESTABLISHED state).
   2708  */
   2709 conn_t *
   2710 ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcph_t *tcph,
   2711     ip_stack_t *ipst)
   2712 {
   2713 	uint32_t ports;
   2714 	uint16_t *pports = (uint16_t *)&ports;
   2715 	connf_t	*connfp;
   2716 	conn_t	*tconnp;
   2717 	boolean_t zone_chk;
   2718 
   2719 	/*
   2720 	 * If either the source of destination address is loopback, then
   2721 	 * both endpoints must be in the same Zone.  Otherwise, both of
   2722 	 * the addresses are system-wide unique (tcp is in ESTABLISHED
   2723 	 * state) and the endpoints may reside in different Zones.  We
   2724 	 * don't do Zone check for link local address(es) because the
   2725 	 * current Zone implementation treats each link local address as
   2726 	 * being unique per system node, i.e. they belong to global Zone.
   2727 	 */
   2728 	zone_chk = (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src) ||
   2729 	    IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst));
   2730 
   2731 	bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t));
   2732 	bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t));
   2733 
   2734 	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
   2735 	    ports, ipst)];
   2736 
   2737 	mutex_enter(&connfp->connf_lock);
   2738 	for (tconnp = connfp->connf_head; tconnp != NULL;
   2739 	    tconnp = tconnp->conn_next) {
   2740 
   2741 		/* We skip tcp_bound_if check here as this is loopback tcp */
   2742 		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
   2743 		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
   2744 		    tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
   2745 		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {
   2746 
   2747 			ASSERT(tconnp != connp);
   2748 			CONN_INC_REF(tconnp);
   2749 			mutex_exit(&connfp->connf_lock);
   2750 			return (tconnp);
   2751 		}
   2752 	}
   2753 	mutex_exit(&connfp->connf_lock);
   2754 	return (NULL);
   2755 }
   2756 
   2757 /*
   2758  * Find an exact {src, dst, lport, fport} match for a bounced datagram.
   2759  * Returns with conn reference held. Caller must call CONN_DEC_REF.
   2760  * Only checks for connected entries i.e. no INADDR_ANY checks.
   2761  */
   2762 conn_t *
   2763 ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcph_t *tcph, int min_state,
   2764     ip_stack_t *ipst)
   2765 {
   2766 	uint32_t ports;
   2767 	uint16_t *pports;
   2768 	connf_t	*connfp;
   2769 	conn_t	*tconnp;
   2770 
   2771 	pports = (uint16_t *)&ports;
   2772 	bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t));
   2773 	bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t));
   2774 
   2775 	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
   2776 	    ports, ipst)];
   2777 
   2778 	mutex_enter(&connfp->connf_lock);
   2779 	for (tconnp = connfp->connf_head; tconnp != NULL;
   2780 	    tconnp = tconnp->conn_next) {
   2781 
   2782 		if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
   2783 		    ipha->ipha_dst, ipha->ipha_src, ports) &&
   2784 		    tconnp->conn_tcp->tcp_state >= min_state) {
   2785 
   2786 			CONN_INC_REF(tconnp);
   2787 			mutex_exit(&connfp->connf_lock);
   2788 			return (tconnp);
   2789 		}
   2790 	}
   2791 	mutex_exit(&connfp->connf_lock);
   2792 	return (NULL);
   2793 }
   2794 
   2795 /*
   2796  * Find an exact {src, dst, lport, fport} match for a bounced datagram.
   2797  * Returns with conn reference held. Caller must call CONN_DEC_REF.
   2798  * Only checks for connected entries i.e. no INADDR_ANY checks.
   2799  * Match on ifindex in addition to addresses.
   2800  */
   2801 conn_t *
   2802 ipcl_tcp_lookup_reversed_ipv6(ip6_t *ip6h, tcpha_t *tcpha, int min_state,
   2803     uint_t ifindex, ip_stack_t *ipst)
   2804 {
   2805 	tcp_t	*tcp;
   2806 	uint32_t ports;
   2807 	uint16_t *pports;
   2808 	connf_t	*connfp;
   2809 	conn_t	*tconnp;
   2810 
   2811 	pports = (uint16_t *)&ports;
   2812 	pports[0] = tcpha->tha_fport;
   2813 	pports[1] = tcpha->tha_lport;
   2814 
   2815 	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
   2816 	    ports, ipst)];
   2817 
   2818 	mutex_enter(&connfp->connf_lock);
   2819 	for (tconnp = connfp->connf_head; tconnp != NULL;
   2820 	    tconnp = tconnp->conn_next) {
   2821 
   2822 		tcp = tconnp->conn_tcp;
   2823 		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
   2824 		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
   2825 		    tcp->tcp_state >= min_state &&
   2826 		    (tcp->tcp_bound_if == 0 ||
   2827 		    tcp->tcp_bound_if == ifindex)) {
   2828 
   2829 			CONN_INC_REF(tconnp);
   2830 			mutex_exit(&connfp->connf_lock);
   2831 			return (tconnp);
   2832 		}
   2833 	}
   2834 	mutex_exit(&connfp->connf_lock);
   2835 	return (NULL);
   2836 }
   2837 
   2838 /*
   2839  * Finds a TCP/IPv4 listening connection; called by tcp_disconnect to locate
   2840  * a listener when changing state.
   2841  */
   2842 conn_t *
   2843 ipcl_lookup_listener_v4(uint16_t lport, ipaddr_t laddr, zoneid_t zoneid,
   2844     ip_stack_t *ipst)
   2845 {
   2846 	connf_t		*bind_connfp;
   2847 	conn_t		*connp;
   2848 	tcp_t		*tcp;
   2849 
   2850 	/*
   2851 	 * Avoid false matches for packets sent to an IP destination of
   2852 	 * all zeros.
   2853 	 */
   2854 	if (laddr == 0)
   2855 		return (NULL);
   2856 
   2857 	ASSERT(zoneid != ALL_ZONES);
   2858 
   2859 	bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
   2860 	mutex_enter(&bind_connfp->connf_lock);
   2861 	for (connp = bind_connfp->connf_head; connp != NULL;
   2862 	    connp = connp->conn_next) {
   2863 		tcp = connp->conn_tcp;
   2864 		if (IPCL_BIND_MATCH(connp, IPPROTO_TCP, laddr, lport) &&
   2865 		    IPCL_ZONE_MATCH(connp, zoneid) &&
   2866 		    (tcp->tcp_listener == NULL)) {
   2867 			CONN_INC_REF(connp);
   2868 			mutex_exit(&bind_connfp->connf_lock);
   2869 			return (connp);
   2870 		}
   2871 	}
   2872 	mutex_exit(&bind_connfp->connf_lock);
   2873 	return (NULL);
   2874 }
   2875 
   2876 /*
   2877  * Finds a TCP/IPv6 listening connection; called by tcp_disconnect to locate
   2878  * a listener when changing state.
   2879  */
   2880 conn_t *
   2881 ipcl_lookup_listener_v6(uint16_t lport, in6_addr_t *laddr, uint_t ifindex,
   2882     zoneid_t zoneid, ip_stack_t *ipst)
   2883 {
   2884 	connf_t		*bind_connfp;
   2885 	conn_t		*connp = NULL;
   2886 	tcp_t		*tcp;
   2887 
   2888 	/*
   2889 	 * Avoid false matches for packets sent to an IP destination of
   2890 	 * all zeros.
   2891 	 */
   2892 	if (IN6_IS_ADDR_UNSPECIFIED(laddr))
   2893 		return (NULL);
   2894 
   2895 	ASSERT(zoneid != ALL_ZONES);
   2896 
   2897 	bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
   2898 	mutex_enter(&bind_connfp->connf_lock);
   2899 	for (connp = bind_connfp->connf_head; connp != NULL;
   2900 	    connp = connp->conn_next) {
   2901 		tcp = connp->conn_tcp;
   2902 		if (IPCL_BIND_MATCH_V6(connp, IPPROTO_TCP, *laddr, lport) &&
   2903 		    IPCL_ZONE_MATCH(connp, zoneid) &&
   2904 		    (tcp->tcp_bound_if == 0 ||
   2905 		    tcp->tcp_bound_if == ifindex) &&
   2906 		    tcp->tcp_listener == NULL) {
   2907 			CONN_INC_REF(connp);
   2908 			mutex_exit(&bind_connfp->connf_lock);
   2909 			return (connp);
   2910 		}
   2911 	}
   2912 	mutex_exit(&bind_connfp->connf_lock);
   2913 	return (NULL);
   2914 }
   2915 
   2916 /*
   2917  * ipcl_get_next_conn
   2918  *	get the next entry in the conn global list
   2919  *	and put a reference on the next_conn.
   2920  *	decrement the reference on the current conn.
   2921  *
   2922  * This is an iterator based walker function that also provides for
   2923  * some selection by the caller. It walks through the conn_hash bucket
   2924  * searching for the next valid connp in the list, and selects connections
   2925  * that are neither closed nor condemned. It also REFHOLDS the conn
   2926  * thus ensuring that the conn exists when the caller uses the conn.
   2927  */
   2928 conn_t *
   2929 ipcl_get_next_conn(connf_t *connfp, conn_t *connp, uint32_t conn_flags)
   2930 {
   2931 	conn_t	*next_connp;
   2932 
   2933 	if (connfp == NULL)
   2934 		return (NULL);
   2935 
   2936 	mutex_enter(&connfp->connf_lock);
   2937 
   2938 	next_connp = (connp == NULL) ?
   2939 	    connfp->connf_head : connp->conn_g_next;
   2940 
   2941 	while (next_connp != NULL) {
   2942 		mutex_enter(&next_connp->conn_lock);
   2943 		if (!(next_connp->conn_flags & conn_flags) ||
   2944 		    (next_connp->conn_state_flags &
   2945 		    (CONN_CONDEMNED | CONN_INCIPIENT))) {
   2946 			/*
   2947 			 * This conn has been condemned or
   2948 			 * is closing, or the flags don't match
   2949 			 */
   2950 			mutex_exit(&next_connp->conn_lock);
   2951 			next_connp = next_connp->conn_g_next;
   2952 			continue;
   2953 		}
   2954 		CONN_INC_REF_LOCKED(next_connp);
   2955 		mutex_exit(&next_connp->conn_lock);
   2956 		break;
   2957 	}
   2958 
   2959 	mutex_exit(&connfp->connf_lock);
   2960 
   2961 	if (connp != NULL)
   2962 		CONN_DEC_REF(connp);
   2963 
   2964 	return (next_connp);
   2965 }
   2966 
   2967 #ifdef CONN_DEBUG
   2968 /*
   2969  * Trace of the last NBUF refhold/refrele
   2970  */
   2971 int
   2972 conn_trace_ref(conn_t *connp)
   2973 {
   2974 	int	last;
   2975 	conn_trace_t	*ctb;
   2976 
   2977 	ASSERT(MUTEX_HELD(&connp->conn_lock));
   2978 	last = connp->conn_trace_last;
   2979 	last++;
   2980 	if (last == CONN_TRACE_MAX)
   2981 		last = 0;
   2982 
   2983 	ctb = &connp->conn_trace_buf[last];
   2984 	ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
   2985 	connp->conn_trace_last = last;
   2986 	return (1);
   2987 }
   2988 
   2989 int
   2990 conn_untrace_ref(conn_t *connp)
   2991 {
   2992 	int	last;
   2993 	conn_trace_t	*ctb;
   2994 
   2995 	ASSERT(MUTEX_HELD(&connp->conn_lock));
   2996 	last = connp->conn_trace_last;
   2997 	last++;
   2998 	if (last == CONN_TRACE_MAX)
   2999 		last = 0;
   3000 
   3001 	ctb = &connp->conn_trace_buf[last];
   3002 	ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
   3003 	connp->conn_trace_last = last;
   3004 	return (1);
   3005 }
   3006 #endif
   3007