Home | History | Annotate | Download | only in ip
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
     27 
     28 const char ipclassifier_version[] = "@(#)ipclassifier.c	%I%	%E% SMI";
     29 
     30 /*
     31  * IP PACKET CLASSIFIER
     32  *
     33  * The IP packet classifier provides mapping between IP packets and persistent
     34  * connection state for connection-oriented protocols. It also provides
     35  * interface for managing connection states.
     36  *
     37  * The connection state is kept in conn_t data structure and contains, among
     38  * other things:
     39  *
     40  *	o local/remote address and ports
     41  *	o Transport protocol
     42  *	o squeue for the connection (for TCP only)
     43  *	o reference counter
     44  *	o Connection state
     45  *	o hash table linkage
     46  *	o interface/ire information
     47  *	o credentials
     48  *	o ipsec policy
     49  *	o send and receive functions.
     50  *	o mutex lock.
     51  *
     52  * Connections use a reference counting scheme. They are freed when the
     53  * reference counter drops to zero. A reference is incremented when connection
     54  * is placed in a list or table, when incoming packet for the connection arrives
     55  * and when connection is processed via squeue (squeue processing may be
     56  * asynchronous and the reference protects the connection from being destroyed
     57  * before its processing is finished).
     58  *
     59  * send and receive functions are currently used for TCP only. The send function
     60  * determines the IP entry point for the packet once it leaves TCP to be sent to
     61  * the destination address. The receive function is used by IP when the packet
     62  * should be passed for TCP processing. When a new connection is created these
     63  * are set to ip_output() and tcp_input() respectively. During the lifetime of
     64  * the connection the send and receive functions may change depending on the
     65  * changes in the connection state. For example, once the connection is bound to
     66  * an address, the receive function for this connection is set to
     67  * tcp_conn_request().  This allows incoming SYNs to go directly into the
     68  * listener SYN processing function without going to tcp_input() first.
     69  *
     70  * Classifier uses several hash tables:
     71  *
     72  * 	ipcl_conn_fanout:	contains all TCP connections in CONNECTED state
     73  *	ipcl_bind_fanout:	contains all connections in BOUND state
     74  *	ipcl_proto_fanout:	IPv4 protocol fanout
     75  *	ipcl_proto_fanout_v6:	IPv6 protocol fanout
     76  *	ipcl_udp_fanout:	contains all UDP connections
     77  *	ipcl_globalhash_fanout:	contains all connections
     78  *
     79  * The ipcl_globalhash_fanout is used for any walkers (like snmp and Clustering)
     80  * which need to view all existing connections.
     81  *
     82  * All tables are protected by per-bucket locks. When both per-bucket lock and
     83  * connection lock need to be held, the per-bucket lock should be acquired
     84  * first, followed by the connection lock.
     85  *
     86  * All functions doing search in one of these tables increment a reference
     87  * counter on the connection found (if any). This reference should be dropped
     88  * when the caller has finished processing the connection.
     89  *
     90  *
     91  * INTERFACES:
     92  * ===========
     93  *
     94  * Connection Lookup:
     95  * ------------------
     96  *
     97  * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, zoneid, ip_stack)
     98  * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, zoneid, ip_stack)
     99  *
    100  * Finds connection for an incoming IPv4 or IPv6 packet. Returns NULL if
    101  * it can't find any associated connection. If the connection is found, its
    102  * reference counter is incremented.
    103  *
    104  *	mp:	mblock, containing packet header. The full header should fit
    105  *		into a single mblock. It should also contain at least full IP
    106  *		and TCP or UDP header.
    107  *
    108  *	protocol: Either IPPROTO_TCP or IPPROTO_UDP.
    109  *
    110  *	hdr_len: The size of IP header. It is used to find TCP or UDP header in
    111  *		 the packet.
    112  *
    113  * 	zoneid: The zone in which the returned connection must be; the zoneid
    114  *		corresponding to the ire_zoneid on the IRE located for the
    115  *		packet's destination address.
    116  *
    117  *	For TCP connections, the lookup order is as follows:
    118  *		5-tuple {src, dst, protocol, local port, remote port}
    119  *			lookup in ipcl_conn_fanout table.
    120  *		3-tuple {dst, remote port, protocol} lookup in
    121  *			ipcl_bind_fanout table.
    122  *
    123  *	For UDP connections, a 5-tuple {src, dst, protocol, local port,
    124  *	remote port} lookup is done on ipcl_udp_fanout. Note that,
    125  *	these interfaces do not handle cases where a packet belongs
    126  *	to multiple UDP clients, which is handled in IP itself.
    127  *
    128  * If the destination IRE is ALL_ZONES (indicated by zoneid), then we must
    129  * determine which actual zone gets the segment.  This is used only in a
    130  * labeled environment.  The matching rules are:
    131  *
    132  *	- If it's not a multilevel port, then the label on the packet selects
    133  *	  the zone.  Unlabeled packets are delivered to the global zone.
    134  *
    135  *	- If it's a multilevel port, then only the zone registered to receive
    136  *	  packets on that port matches.
    137  *
    138  * Also, in a labeled environment, packet labels need to be checked.  For fully
    139  * bound TCP connections, we can assume that the packet label was checked
    140  * during connection establishment, and doesn't need to be checked on each
    141  * packet.  For others, though, we need to check for strict equality or, for
    142  * multilevel ports, membership in the range or set.  This part currently does
    143  * a tnrh lookup on each packet, but could be optimized to use cached results
    144  * if that were necessary.  (SCTP doesn't come through here, but if it did,
    145  * we would apply the same rules as TCP.)
    146  *
    147  * An implication of the above is that fully-bound TCP sockets must always use
    148  * distinct 4-tuples; they can't be discriminated by label alone.
    149  *
    150  * Note that we cannot trust labels on packets sent to fully-bound UDP sockets,
    151  * as there's no connection set-up handshake and no shared state.
    152  *
    153  * Labels on looped-back packets within a single zone do not need to be
    154  * checked, as all processes in the same zone have the same label.
    155  *
    156  * Finally, for unlabeled packets received by a labeled system, special rules
    157  * apply.  We consider only the MLP if there is one.  Otherwise, we prefer a
    158  * socket in the zone whose label matches the default label of the sender, if
    159  * any.  In any event, the receiving socket must have SO_MAC_EXEMPT set and the
    160  * receiver's label must dominate the sender's default label.
    161  *
    162  * conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcph_t *, int, ip_stack);
    163  * conn_t *ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t,
    164  *					 ip_stack);
    165  *
    166  *	Lookup routine to find an exact match for {src, dst, local port,
    167  *	remote port} for TCP connections in ipcl_conn_fanout. The address and
    168  *	ports are read from the IP and TCP header respectively.
    169  *
    170  * conn_t	*ipcl_lookup_listener_v4(lport, laddr, protocol,
    171  *					 zoneid, ip_stack);
    172  * conn_t	*ipcl_lookup_listener_v6(lport, laddr, protocol, ifindex,
    173  *					 zoneid, ip_stack);
    174  *
    175  * 	Lookup routine to find a listener with the tuple {lport, laddr,
    176  * 	protocol} in the ipcl_bind_fanout table. For IPv6, an additional
    177  * 	parameter interface index is also compared.
    178  *
    179  * void ipcl_walk(func, arg, ip_stack)
    180  *
    181  * 	Apply 'func' to every connection available. The 'func' is called as
    182  *	(*func)(connp, arg). The walk is non-atomic so connections may be
    183  *	created and destroyed during the walk. The CONN_CONDEMNED and
    184  *	CONN_INCIPIENT flags ensure that connections which are newly created
    185  *	or being destroyed are not selected by the walker.
    186  *
    187  * Table Updates
    188  * -------------
    189  *
    190  * int ipcl_conn_insert(connp, protocol, src, dst, ports)
    191  * int ipcl_conn_insert_v6(connp, protocol, src, dst, ports, ifindex)
    192  *
    193  *	Insert 'connp' in the ipcl_conn_fanout.
    194  *	Arguments :
    195  *		connp		conn_t to be inserted
    196  *		protocol	connection protocol
    197  *		src		source address
    198  *		dst		destination address
    199  *		ports		local and remote port
    200  *		ifindex		interface index for IPv6 connections
    201  *
    202  *	Return value :
    203  *		0		if connp was inserted
    204  *		EADDRINUSE	if the connection with the same tuple
    205  *				already exists.
    206  *
    207  * int ipcl_bind_insert(connp, protocol, src, lport);
    208  * int ipcl_bind_insert_v6(connp, protocol, src, lport);
    209  *
    210  * 	Insert 'connp' in ipcl_bind_fanout.
    211  * 	Arguments :
    212  * 		connp		conn_t to be inserted
    213  * 		protocol	connection protocol
    214  * 		src		source address connection wants
    215  * 				to bind to
    216  * 		lport		local port connection wants to
    217  * 				bind to
    218  *
    219  *
    220  * void ipcl_hash_remove(connp);
    221  *
    222  * 	Removes the 'connp' from the connection fanout table.
    223  *
    224  * Connection Creation/Destruction
    225  * -------------------------------
    226  *
    227  * conn_t *ipcl_conn_create(type, sleep, netstack_t *)
    228  *
    229  * 	Creates a new conn based on the type flag, inserts it into
    230  * 	globalhash table.
    231  *
    232  *	type:	This flag determines the type of conn_t which needs to be
    233  *		created i.e., which kmem_cache it comes from.
    234  *		IPCL_TCPCONN	indicates a TCP connection
    235  *		IPCL_SCTPCONN	indicates a SCTP connection
    236  *		IPCL_UDPCONN	indicates a UDP conn_t.
    237  *		IPCL_RAWIPCONN	indicates a RAWIP/ICMP conn_t.
    238  *		IPCL_RTSCONN	indicates a RTS conn_t.
    239  *		IPCL_IPCCONN	indicates all other connections.
    240  *
    241  * void ipcl_conn_destroy(connp)
    242  *
    243  * 	Destroys the connection state, removes it from the global
    244  * 	connection hash table and frees its memory.
    245  */
    246 
    247 #include <sys/types.h>
    248 #include <sys/stream.h>
    249 #include <sys/stropts.h>
    250 #include <sys/sysmacros.h>
    251 #include <sys/strsubr.h>
    252 #include <sys/strsun.h>
    253 #define	_SUN_TPI_VERSION 2
    254 #include <sys/ddi.h>
    255 #include <sys/cmn_err.h>
    256 #include <sys/debug.h>
    257 
    258 #include <sys/systm.h>
    259 #include <sys/param.h>
    260 #include <sys/kmem.h>
    261 #include <sys/isa_defs.h>
    262 #include <inet/common.h>
    263 #include <netinet/ip6.h>
    264 #include <netinet/icmp6.h>
    265 
    266 #include <inet/ip.h>
    267 #include <inet/ip6.h>
    268 #include <inet/tcp.h>
    269 #include <inet/ip_ndp.h>
    270 #include <inet/udp_impl.h>
    271 #include <inet/sctp_ip.h>
    272 #include <inet/sctp/sctp_impl.h>
    273 #include <inet/rawip_impl.h>
    274 #include <inet/rts_impl.h>
    275 
    276 #include <sys/cpuvar.h>
    277 
    278 #include <inet/ipclassifier.h>
    279 #include <inet/ipsec_impl.h>
    280 
    281 #include <sys/tsol/tnet.h>
    282 
    283 #ifdef DEBUG
    284 #define	IPCL_DEBUG
    285 #else
    286 #undef	IPCL_DEBUG
    287 #endif
    288 
    289 #ifdef	IPCL_DEBUG
    290 int	ipcl_debug_level = 0;
    291 #define	IPCL_DEBUG_LVL(level, args)	\
    292 	if (ipcl_debug_level  & level) { printf args; }
    293 #else
    294 #define	IPCL_DEBUG_LVL(level, args) {; }
    295 #endif
/*
 * Old value for compatibility.  Settable in /etc/system.  ipcl_init() only
 * consults it when ipcl_conn_hash_size is zero.
 */
uint_t tcp_conn_hash_size = 0;

/*
 * New value.  Zero means choose automatically.  Settable in /etc/system.
 *
 * When both this and tcp_conn_hash_size are zero, ipcl_init() sizes the conn
 * fanout as (freemem * PAGESIZE) / ipcl_conn_hash_memfactor and caps the
 * result at ipcl_conn_hash_maxsize.  Whatever the source of the value,
 * ipcl_init() then replaces it with an entry of the P2Ps() prime table.
 */
uint_t ipcl_conn_hash_size = 0;
uint_t ipcl_conn_hash_memfactor = 8192;
uint_t ipcl_conn_hash_maxsize = 82500;

/*
 * bind/udp fanout table size.  Copied unchanged into the per-stack
 * ip_stack_t by ipcl_init().
 */
uint_t ipcl_bind_fanout_size = 512;
uint_t ipcl_udp_fanout_size = 16384;

/*
 * Raw socket fanout size.  Must be a power of 2.  ipcl_init() copies it
 * as is and does not check that requirement.
 */
uint_t ipcl_raw_fanout_size = 256;
    310 
/*
 * Power of 2^N Primes useful for hashing for N of 0-28,
 * these primes are the nearest prime <= 2^N - 2^(N-2).
 *
 * The initializer is indexed by N.  The entries for N of 0-2 are 0, and a
 * trailing 0 follows the N = 28 entry; ipcl_init() treats a 0 result as
 * "out of range" and falls back to the N = 16 entry.
 */

#define	P2Ps() {0, 0, 0, 5, 11, 23, 47, 89, 191, 383, 761, 1531, 3067,	\
		6143, 12281, 24571, 49139, 98299, 196597, 393209,	\
		786431, 1572853, 3145721, 6291449, 12582893, 25165813,	\
		50331599, 100663291, 201326557, 0}
    320 
/*
 * wrapper structure to ensure that conn and what follows it (tcp_t, etc)
 * are aligned on cache lines.
 *
 * The filler member makes the union at least CACHE_ALIGN(conn_s) bytes.
 * ipcl_g_init() uses sizeof (itc_t) plus the size of the protocol state
 * (tcp_t, udp_t, icmp_t, rts_t) as the object size of the per-protocol
 * conn caches.
 * NOTE(review): CACHE_ALIGN() is defined outside this file; presumably it
 * rounds the size of the named struct up to a cache-line multiple - confirm.
 */
typedef union itc_s {
	conn_t	itc_conn;
	char	itcu_filler[CACHE_ALIGN(conn_s)];
} itc_t;
    329 
/*
 * kmem caches backing conn_t allocation.  The caches defined here are
 * created in ipcl_g_init() and destroyed in ipcl_g_destroy(); the extern
 * ones are defined outside this file.
 */
struct kmem_cache  *tcp_conn_cache;
struct kmem_cache  *ip_conn_cache;
extern struct kmem_cache  *sctp_conn_cache;
extern struct kmem_cache  *tcp_sack_info_cache;
extern struct kmem_cache  *tcp_iphc_cache;
struct kmem_cache  *udp_conn_cache;
struct kmem_cache  *rawip_conn_cache;
struct kmem_cache  *rts_conn_cache;

/* TCP timer mblk helpers, defined outside this file. */
extern void	tcp_timermp_free(tcp_t *);
extern mblk_t	*tcp_timermp_alloc(int);

/*
 * Constructor/destructor pairs handed to kmem_cache_create() in
 * ipcl_g_init(), one pair per cache created there.
 */
static int	ip_conn_constructor(void *, void *, int);
static void	ip_conn_destructor(void *, void *);

static int	tcp_conn_constructor(void *, void *, int);
static void	tcp_conn_destructor(void *, void *);

static int	udp_conn_constructor(void *, void *, int);
static void	udp_conn_destructor(void *, void *);

static int	rawip_conn_constructor(void *, void *, int);
static void	rawip_conn_destructor(void *, void *);

static int	rts_conn_constructor(void *, void *, int);
static void	rts_conn_destructor(void *, void *);
    356 
    357 #ifdef	IPCL_DEBUG
    358 #define	INET_NTOA_BUFSIZE	18
    359 
    360 static char *
    361 inet_ntoa_r(uint32_t in, char *b)
    362 {
    363 	unsigned char	*p;
    364 
    365 	p = (unsigned char *)&in;
    366 	(void) sprintf(b, "%d.%d.%d.%d", p[0], p[1], p[2], p[3]);
    367 	return (b);
    368 }
    369 #endif
    370 
    371 /*
    372  * Global (for all stack instances) init routine
    373  */
    374 void
    375 ipcl_g_init(void)
    376 {
    377 	ip_conn_cache = kmem_cache_create("ip_conn_cache",
    378 	    sizeof (conn_t), CACHE_ALIGN_SIZE,
    379 	    ip_conn_constructor, ip_conn_destructor,
    380 	    NULL, NULL, NULL, 0);
    381 
    382 	tcp_conn_cache = kmem_cache_create("tcp_conn_cache",
    383 	    sizeof (itc_t) + sizeof (tcp_t), CACHE_ALIGN_SIZE,
    384 	    tcp_conn_constructor, tcp_conn_destructor,
    385 	    NULL, NULL, NULL, 0);
    386 
    387 	udp_conn_cache = kmem_cache_create("udp_conn_cache",
    388 	    sizeof (itc_t) + sizeof (udp_t), CACHE_ALIGN_SIZE,
    389 	    udp_conn_constructor, udp_conn_destructor,
    390 	    NULL, NULL, NULL, 0);
    391 
    392 	rawip_conn_cache = kmem_cache_create("rawip_conn_cache",
    393 	    sizeof (itc_t) + sizeof (icmp_t), CACHE_ALIGN_SIZE,
    394 	    rawip_conn_constructor, rawip_conn_destructor,
    395 	    NULL, NULL, NULL, 0);
    396 
    397 	rts_conn_cache = kmem_cache_create("rts_conn_cache",
    398 	    sizeof (itc_t) + sizeof (rts_t), CACHE_ALIGN_SIZE,
    399 	    rts_conn_constructor, rts_conn_destructor,
    400 	    NULL, NULL, NULL, 0);
    401 }
    402 
    403 /*
    404  * ipclassifier intialization routine, sets up hash tables.
    405  */
    406 void
    407 ipcl_init(ip_stack_t *ipst)
    408 {
    409 	int i;
    410 	int sizes[] = P2Ps();
    411 
    412 	/*
    413 	 * Calculate size of conn fanout table from /etc/system settings
    414 	 */
    415 	if (ipcl_conn_hash_size != 0) {
    416 		ipst->ips_ipcl_conn_fanout_size = ipcl_conn_hash_size;
    417 	} else if (tcp_conn_hash_size != 0) {
    418 		ipst->ips_ipcl_conn_fanout_size = tcp_conn_hash_size;
    419 	} else {
    420 		extern pgcnt_t freemem;
    421 
    422 		ipst->ips_ipcl_conn_fanout_size =
    423 		    (freemem * PAGESIZE) / ipcl_conn_hash_memfactor;
    424 
    425 		if (ipst->ips_ipcl_conn_fanout_size > ipcl_conn_hash_maxsize) {
    426 			ipst->ips_ipcl_conn_fanout_size =
    427 			    ipcl_conn_hash_maxsize;
    428 		}
    429 	}
    430 
    431 	for (i = 9; i < sizeof (sizes) / sizeof (*sizes) - 1; i++) {
    432 		if (sizes[i] >= ipst->ips_ipcl_conn_fanout_size) {
    433 			break;
    434 		}
    435 	}
    436 	if ((ipst->ips_ipcl_conn_fanout_size = sizes[i]) == 0) {
    437 		/* Out of range, use the 2^16 value */
    438 		ipst->ips_ipcl_conn_fanout_size = sizes[16];
    439 	}
    440 
    441 	/* Take values from /etc/system */
    442 	ipst->ips_ipcl_bind_fanout_size = ipcl_bind_fanout_size;
    443 	ipst->ips_ipcl_udp_fanout_size = ipcl_udp_fanout_size;
    444 	ipst->ips_ipcl_raw_fanout_size = ipcl_raw_fanout_size;
    445 
    446 	ASSERT(ipst->ips_ipcl_conn_fanout == NULL);
    447 
    448 	ipst->ips_ipcl_conn_fanout = kmem_zalloc(
    449 	    ipst->ips_ipcl_conn_fanout_size * sizeof (connf_t), KM_SLEEP);
    450 
    451 	for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
    452 		mutex_init(&ipst->ips_ipcl_conn_fanout[i].connf_lock, NULL,
    453 		    MUTEX_DEFAULT, NULL);
    454 	}
    455 
    456 	ipst->ips_ipcl_bind_fanout = kmem_zalloc(
    457 	    ipst->ips_ipcl_bind_fanout_size * sizeof (connf_t), KM_SLEEP);
    458 
    459 	for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
    460 		mutex_init(&ipst->ips_ipcl_bind_fanout[i].connf_lock, NULL,
    461 		    MUTEX_DEFAULT, NULL);
    462 	}
    463 
    464 	ipst->ips_ipcl_proto_fanout = kmem_zalloc(IPPROTO_MAX *
    465 	    sizeof (connf_t), KM_SLEEP);
    466 	for (i = 0; i < IPPROTO_MAX; i++) {
    467 		mutex_init(&ipst->ips_ipcl_proto_fanout[i].connf_lock, NULL,
    468 		    MUTEX_DEFAULT, NULL);
    469 	}
    470 
    471 	ipst->ips_ipcl_proto_fanout_v6 = kmem_zalloc(IPPROTO_MAX *
    472 	    sizeof (connf_t), KM_SLEEP);
    473 	for (i = 0; i < IPPROTO_MAX; i++) {
    474 		mutex_init(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock, NULL,
    475 		    MUTEX_DEFAULT, NULL);
    476 	}
    477 
    478 	ipst->ips_rts_clients = kmem_zalloc(sizeof (connf_t), KM_SLEEP);
    479 	mutex_init(&ipst->ips_rts_clients->connf_lock,
    480 	    NULL, MUTEX_DEFAULT, NULL);
    481 
    482 	ipst->ips_ipcl_udp_fanout = kmem_zalloc(
    483 	    ipst->ips_ipcl_udp_fanout_size * sizeof (connf_t), KM_SLEEP);
    484 	for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
    485 		mutex_init(&ipst->ips_ipcl_udp_fanout[i].connf_lock, NULL,
    486 		    MUTEX_DEFAULT, NULL);
    487 	}
    488 
    489 	ipst->ips_ipcl_raw_fanout = kmem_zalloc(
    490 	    ipst->ips_ipcl_raw_fanout_size * sizeof (connf_t), KM_SLEEP);
    491 	for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
    492 		mutex_init(&ipst->ips_ipcl_raw_fanout[i].connf_lock, NULL,
    493 		    MUTEX_DEFAULT, NULL);
    494 	}
    495 
    496 	ipst->ips_ipcl_globalhash_fanout = kmem_zalloc(
    497 	    sizeof (connf_t) * CONN_G_HASH_SIZE, KM_SLEEP);
    498 	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
    499 		mutex_init(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock,
    500 		    NULL, MUTEX_DEFAULT, NULL);
    501 	}
    502 }
    503 
    504 void
    505 ipcl_g_destroy(void)
    506 {
    507 	kmem_cache_destroy(ip_conn_cache);
    508 	kmem_cache_destroy(tcp_conn_cache);
    509 	kmem_cache_destroy(udp_conn_cache);
    510 	kmem_cache_destroy(rawip_conn_cache);
    511 	kmem_cache_destroy(rts_conn_cache);
    512 }
    513 
    514 /*
    515  * All user-level and kernel use of the stack must be gone
    516  * by now.
    517  */
    518 void
    519 ipcl_destroy(ip_stack_t *ipst)
    520 {
    521 	int i;
    522 
    523 	for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
    524 		ASSERT(ipst->ips_ipcl_conn_fanout[i].connf_head == NULL);
    525 		mutex_destroy(&ipst->ips_ipcl_conn_fanout[i].connf_lock);
    526 	}
    527 	kmem_free(ipst->ips_ipcl_conn_fanout, ipst->ips_ipcl_conn_fanout_size *
    528 	    sizeof (connf_t));
    529 	ipst->ips_ipcl_conn_fanout = NULL;
    530 
    531 	for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
    532 		ASSERT(ipst->ips_ipcl_bind_fanout[i].connf_head == NULL);
    533 		mutex_destroy(&ipst->ips_ipcl_bind_fanout[i].connf_lock);
    534 	}
    535 	kmem_free(ipst->ips_ipcl_bind_fanout, ipst->ips_ipcl_bind_fanout_size *
    536 	    sizeof (connf_t));
    537 	ipst->ips_ipcl_bind_fanout = NULL;
    538 
    539 	for (i = 0; i < IPPROTO_MAX; i++) {
    540 		ASSERT(ipst->ips_ipcl_proto_fanout[i].connf_head == NULL);
    541 		mutex_destroy(&ipst->ips_ipcl_proto_fanout[i].connf_lock);
    542 	}
    543 	kmem_free(ipst->ips_ipcl_proto_fanout, IPPROTO_MAX * sizeof (connf_t));
    544 	ipst->ips_ipcl_proto_fanout = NULL;
    545 
    546 	for (i = 0; i < IPPROTO_MAX; i++) {
    547 		ASSERT(ipst->ips_ipcl_proto_fanout_v6[i].connf_head == NULL);
    548 		mutex_destroy(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock);
    549 	}
    550 	kmem_free(ipst->ips_ipcl_proto_fanout_v6,
    551 	    IPPROTO_MAX * sizeof (connf_t));
    552 	ipst->ips_ipcl_proto_fanout_v6 = NULL;
    553 
    554 	for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
    555 		ASSERT(ipst->ips_ipcl_udp_fanout[i].connf_head == NULL);
    556 		mutex_destroy(&ipst->ips_ipcl_udp_fanout[i].connf_lock);
    557 	}
    558 	kmem_free(ipst->ips_ipcl_udp_fanout, ipst->ips_ipcl_udp_fanout_size *
    559 	    sizeof (connf_t));
    560 	ipst->ips_ipcl_udp_fanout = NULL;
    561 
    562 	for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
    563 		ASSERT(ipst->ips_ipcl_raw_fanout[i].connf_head == NULL);
    564 		mutex_destroy(&ipst->ips_ipcl_raw_fanout[i].connf_lock);
    565 	}
    566 	kmem_free(ipst->ips_ipcl_raw_fanout, ipst->ips_ipcl_raw_fanout_size *
    567 	    sizeof (connf_t));
    568 	ipst->ips_ipcl_raw_fanout = NULL;
    569 
    570 	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
    571 		ASSERT(ipst->ips_ipcl_globalhash_fanout[i].connf_head == NULL);
    572 		mutex_destroy(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
    573 	}
    574 	kmem_free(ipst->ips_ipcl_globalhash_fanout,
    575 	    sizeof (connf_t) * CONN_G_HASH_SIZE);
    576 	ipst->ips_ipcl_globalhash_fanout = NULL;
    577 
    578 	ASSERT(ipst->ips_rts_clients->connf_head == NULL);
    579 	mutex_destroy(&ipst->ips_rts_clients->connf_lock);
    580 	kmem_free(ipst->ips_rts_clients, sizeof (connf_t));
    581 	ipst->ips_rts_clients = NULL;
    582 }
    583 
    584 /*
    585  * conn creation routine. initialize the conn, sets the reference
    586  * and inserts it in the global hash table.
    587  */
    588 conn_t *
    589 ipcl_conn_create(uint32_t type, int sleep, netstack_t *ns)
    590 {
    591 	conn_t	*connp;
    592 	sctp_stack_t *sctps;
    593 	struct kmem_cache *conn_cache;
    594 
    595 	switch (type) {
    596 	case IPCL_SCTPCONN:
    597 		if ((connp = kmem_cache_alloc(sctp_conn_cache, sleep)) == NULL)
    598 			return (NULL);
    599 		sctp_conn_init(connp);
    600 		sctps = ns->netstack_sctp;
    601 		SCTP_G_Q_REFHOLD(sctps);
    602 		netstack_hold(ns);
    603 		connp->conn_netstack = ns;
    604 		return (connp);
    605 
    606 	case IPCL_TCPCONN:
    607 		conn_cache = tcp_conn_cache;
    608 		break;
    609 
    610 	case IPCL_UDPCONN:
    611 		conn_cache = udp_conn_cache;
    612 		break;
    613 
    614 	case IPCL_RAWIPCONN:
    615 		conn_cache = rawip_conn_cache;
    616 		break;
    617 
    618 	case IPCL_RTSCONN:
    619 		conn_cache = rts_conn_cache;
    620 		break;
    621 
    622 	case IPCL_IPCCONN:
    623 		conn_cache = ip_conn_cache;
    624 		break;
    625 
    626 	default:
    627 		connp = NULL;
    628 		ASSERT(0);
    629 	}
    630 
    631 	if ((connp = kmem_cache_alloc(conn_cache, sleep)) == NULL)
    632 		return (NULL);
    633 
    634 	connp->conn_ref = 1;
    635 	netstack_hold(ns);
    636 	connp->conn_netstack = ns;
    637 	ipcl_globalhash_insert(connp);
    638 	return (connp);
    639 }
    640 
    641 void
    642 ipcl_conn_destroy(conn_t *connp)
    643 {
    644 	mblk_t	*mp;
    645 	netstack_t	*ns = connp->conn_netstack;
    646 
    647 	ASSERT(!MUTEX_HELD(&connp->conn_lock));
    648 	ASSERT(connp->conn_ref == 0);
    649 	ASSERT(connp->conn_ire_cache == NULL);
    650 
    651 	if (connp->conn_peercred != NULL &&
    652 	    connp->conn_peercred != connp->conn_cred)
    653 		crfree(connp->conn_peercred);
    654 	connp->conn_peercred = NULL;
    655 
    656 	if (connp->conn_cred != NULL) {
    657 		crfree(connp->conn_cred);
    658 		connp->conn_cred = NULL;
    659 	}
    660 
    661 	ipcl_globalhash_remove(connp);
    662 
    663 	/* FIXME: add separate tcp_conn_free()? */
    664 	if (connp->conn_flags & IPCL_TCPCONN) {
    665 		tcp_t	*tcp = connp->conn_tcp;
    666 		tcp_stack_t *tcps;
    667 
    668 		ASSERT(tcp != NULL);
    669 		tcps = tcp->tcp_tcps;
    670 		if (tcps != NULL) {
    671 			if (connp->conn_latch != NULL) {
    672 				IPLATCH_REFRELE(connp->conn_latch, ns);
    673 				connp->conn_latch = NULL;
    674 			}
    675 			if (connp->conn_policy != NULL) {
    676 				IPPH_REFRELE(connp->conn_policy, ns);
    677 				connp->conn_policy = NULL;
    678 			}
    679 			tcp->tcp_tcps = NULL;
    680 			TCPS_REFRELE(tcps);
    681 		}
    682 
    683 		tcp_free(tcp);
    684 		mp = tcp->tcp_timercache;
    685 		tcp->tcp_cred = NULL;
    686 
    687 		if (tcp->tcp_sack_info != NULL) {
    688 			bzero(tcp->tcp_sack_info, sizeof (tcp_sack_info_t));
    689 			kmem_cache_free(tcp_sack_info_cache,
    690 			    tcp->tcp_sack_info);
    691 		}
    692 		if (tcp->tcp_iphc != NULL) {
    693 			if (tcp->tcp_hdr_grown) {
    694 				kmem_free(tcp->tcp_iphc, tcp->tcp_iphc_len);
    695 			} else {
    696 				bzero(tcp->tcp_iphc, tcp->tcp_iphc_len);
    697 				kmem_cache_free(tcp_iphc_cache, tcp->tcp_iphc);
    698 			}
    699 			tcp->tcp_iphc_len = 0;
    700 		}
    701 		ASSERT(tcp->tcp_iphc_len == 0);
    702 
    703 		ASSERT(connp->conn_latch == NULL);
    704 		ASSERT(connp->conn_policy == NULL);
    705 
    706 		if (ns != NULL) {
    707 			ASSERT(tcp->tcp_tcps == NULL);
    708 			connp->conn_netstack = NULL;
    709 			netstack_rele(ns);
    710 		}
    711 
    712 		ipcl_conn_cleanup(connp);
    713 		connp->conn_flags = IPCL_TCPCONN;
    714 		bzero(tcp, sizeof (tcp_t));
    715 
    716 		tcp->tcp_timercache = mp;
    717 		tcp->tcp_connp = connp;
    718 		kmem_cache_free(tcp_conn_cache, connp);
    719 		return;
    720 	}
    721 	if (connp->conn_latch != NULL) {
    722 		IPLATCH_REFRELE(connp->conn_latch, connp->conn_netstack);
    723 		connp->conn_latch = NULL;
    724 	}
    725 	if (connp->conn_policy != NULL) {
    726 		IPPH_REFRELE(connp->conn_policy, connp->conn_netstack);
    727 		connp->conn_policy = NULL;
    728 	}
    729 	if (connp->conn_ipsec_opt_mp != NULL) {
    730 		freemsg(connp->conn_ipsec_opt_mp);
    731 		connp->conn_ipsec_opt_mp = NULL;
    732 	}
    733 
    734 	if (connp->conn_flags & IPCL_SCTPCONN) {
    735 		ASSERT(ns != NULL);
    736 		sctp_free(connp);
    737 		return;
    738 	}
    739 
    740 	if (ns != NULL) {
    741 		connp->conn_netstack = NULL;
    742 		netstack_rele(ns);
    743 	}
    744 	ipcl_conn_cleanup(connp);
    745 
    746 	/* leave conn_priv aka conn_udp, conn_icmp, etc in place. */
    747 	if (connp->conn_flags & IPCL_UDPCONN) {
    748 		connp->conn_flags = IPCL_UDPCONN;
    749 		kmem_cache_free(udp_conn_cache, connp);
    750 	} else if (connp->conn_flags & IPCL_RAWIPCONN) {
    751 		connp->conn_flags = IPCL_RAWIPCONN;
    752 		connp->conn_ulp = IPPROTO_ICMP;
    753 		kmem_cache_free(rawip_conn_cache, connp);
    754 	} else if (connp->conn_flags & IPCL_RTSCONN) {
    755 		connp->conn_flags = IPCL_RTSCONN;
    756 		kmem_cache_free(rts_conn_cache, connp);
    757 	} else {
    758 		connp->conn_flags = IPCL_IPCCONN;
    759 		ASSERT(connp->conn_flags & IPCL_IPCCONN);
    760 		ASSERT(connp->conn_priv == NULL);
    761 		kmem_cache_free(ip_conn_cache, connp);
    762 	}
    763 }
    764 
    765 /*
    766  * Running in cluster mode - deregister listener information
    767  */
    768 
    769 static void
    770 ipcl_conn_unlisten(conn_t *connp)
    771 {
    772 	ASSERT((connp->conn_flags & IPCL_CL_LISTENER) != 0);
    773 	ASSERT(connp->conn_lport != 0);
    774 
    775 	if (cl_inet_unlisten != NULL) {
    776 		sa_family_t	addr_family;
    777 		uint8_t		*laddrp;
    778 
    779 		if (connp->conn_pkt_isv6) {
    780 			addr_family = AF_INET6;
    781 			laddrp = (uint8_t *)&connp->conn_bound_source_v6;
    782 		} else {
    783 			addr_family = AF_INET;
    784 			laddrp = (uint8_t *)&connp->conn_bound_source;
    785 		}
    786 		(*cl_inet_unlisten)(IPPROTO_TCP, addr_family, laddrp,
    787 		    connp->conn_lport);
    788 	}
    789 	connp->conn_flags &= ~IPCL_CL_LISTENER;
    790 }
    791 
    792 /*
    793  * We set the IPCL_REMOVED flag (instead of clearing the flag indicating
    794  * which table the conn belonged to). So for debugging we can see which hash
    795  * table this connection was in.
    796  */
    797 #define	IPCL_HASH_REMOVE(connp)	{					\
    798 	connf_t	*connfp = (connp)->conn_fanout;				\
    799 	ASSERT(!MUTEX_HELD(&((connp)->conn_lock)));			\
    800 	if (connfp != NULL) {						\
    801 		IPCL_DEBUG_LVL(4, ("IPCL_HASH_REMOVE: connp %p",	\
    802 		    (void *)(connp)));					\
    803 		mutex_enter(&connfp->connf_lock);			\
    804 		if ((connp)->conn_next != NULL)				\
    805 			(connp)->conn_next->conn_prev =			\
    806 			    (connp)->conn_prev;				\
    807 		if ((connp)->conn_prev != NULL)				\
    808 			(connp)->conn_prev->conn_next =			\
    809 			    (connp)->conn_next;				\
    810 		else							\
    811 			connfp->connf_head = (connp)->conn_next;	\
    812 		(connp)->conn_fanout = NULL;				\
    813 		(connp)->conn_next = NULL;				\
    814 		(connp)->conn_prev = NULL;				\
    815 		(connp)->conn_flags |= IPCL_REMOVED;			\
    816 		if (((connp)->conn_flags & IPCL_CL_LISTENER) != 0)	\
    817 			ipcl_conn_unlisten((connp));			\
    818 		CONN_DEC_REF((connp));					\
    819 		mutex_exit(&connfp->connf_lock);			\
    820 	}								\
    821 }
    822 
    823 void
    824 ipcl_hash_remove(conn_t *connp)
    825 {
    826 	IPCL_HASH_REMOVE(connp);
    827 }
    828 
    829 /*
    830  * The whole purpose of this function is allow removal of
    831  * a conn_t from the connected hash for timewait reclaim.
    832  * This is essentially a TW reclaim fastpath where timewait
    833  * collector checks under fanout lock (so no one else can
    834  * get access to the conn_t) that refcnt is 2 i.e. one for
    835  * TCP and one for the classifier hash list. If ref count
    836  * is indeed 2, we can just remove the conn under lock and
    837  * avoid cleaning up the conn under squeue. This gives us
    838  * improved performance.
    839  */
    840 void
    841 ipcl_hash_remove_locked(conn_t *connp, connf_t	*connfp)
    842 {
    843 	ASSERT(MUTEX_HELD(&connfp->connf_lock));
    844 	ASSERT(MUTEX_HELD(&connp->conn_lock));
    845 	ASSERT((connp->conn_flags & IPCL_CL_LISTENER) == 0);
    846 
    847 	if ((connp)->conn_next != NULL) {
    848 		(connp)->conn_next->conn_prev = (connp)->conn_prev;
    849 	}
    850 	if ((connp)->conn_prev != NULL) {
    851 		(connp)->conn_prev->conn_next = (connp)->conn_next;
    852 	} else {
    853 		connfp->connf_head = (connp)->conn_next;
    854 	}
    855 	(connp)->conn_fanout = NULL;
    856 	(connp)->conn_next = NULL;
    857 	(connp)->conn_prev = NULL;
    858 	(connp)->conn_flags |= IPCL_REMOVED;
    859 	ASSERT((connp)->conn_ref == 2);
    860 	(connp)->conn_ref--;
    861 }
    862 
    863 #define	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp) {		\
    864 	ASSERT((connp)->conn_fanout == NULL);				\
    865 	ASSERT((connp)->conn_next == NULL);				\
    866 	ASSERT((connp)->conn_prev == NULL);				\
    867 	if ((connfp)->connf_head != NULL) {				\
    868 		(connfp)->connf_head->conn_prev = (connp);		\
    869 		(connp)->conn_next = (connfp)->connf_head;		\
    870 	}								\
    871 	(connp)->conn_fanout = (connfp);				\
    872 	(connfp)->connf_head = (connp);					\
    873 	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
    874 	    IPCL_CONNECTED;						\
    875 	CONN_INC_REF(connp);						\
    876 }
    877 
    878 #define	IPCL_HASH_INSERT_CONNECTED(connfp, connp) {			\
    879 	IPCL_DEBUG_LVL(8, ("IPCL_HASH_INSERT_CONNECTED: connfp %p "	\
    880 	    "connp %p", (void *)(connfp), (void *)(connp)));		\
    881 	IPCL_HASH_REMOVE((connp));					\
    882 	mutex_enter(&(connfp)->connf_lock);				\
    883 	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);		\
    884 	mutex_exit(&(connfp)->connf_lock);				\
    885 }
    886 
    887 #define	IPCL_HASH_INSERT_BOUND(connfp, connp) {				\
    888 	conn_t *pconnp = NULL, *nconnp;					\
    889 	IPCL_DEBUG_LVL(32, ("IPCL_HASH_INSERT_BOUND: connfp %p "	\
    890 	    "connp %p", (void *)connfp, (void *)(connp)));		\
    891 	IPCL_HASH_REMOVE((connp));					\
    892 	mutex_enter(&(connfp)->connf_lock);				\
    893 	nconnp = (connfp)->connf_head;					\
    894 	while (nconnp != NULL &&					\
    895 	    !_IPCL_V4_MATCH_ANY(nconnp->conn_srcv6)) {			\
    896 		pconnp = nconnp;					\
    897 		nconnp = nconnp->conn_next;				\
    898 	}								\
    899 	if (pconnp != NULL) {						\
    900 		pconnp->conn_next = (connp);				\
    901 		(connp)->conn_prev = pconnp;				\
    902 	} else {							\
    903 		(connfp)->connf_head = (connp);				\
    904 	}								\
    905 	if (nconnp != NULL) {						\
    906 		(connp)->conn_next = nconnp;				\
    907 		nconnp->conn_prev = (connp);				\
    908 	}								\
    909 	(connp)->conn_fanout = (connfp);				\
    910 	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
    911 	    IPCL_BOUND;							\
    912 	CONN_INC_REF(connp);						\
    913 	mutex_exit(&(connfp)->connf_lock);				\
    914 }
    915 
    916 #define	IPCL_HASH_INSERT_WILDCARD(connfp, connp) {			\
    917 	conn_t **list, *prev, *next;					\
    918 	boolean_t isv4mapped =						\
    919 	    IN6_IS_ADDR_V4MAPPED(&(connp)->conn_srcv6);			\
    920 	IPCL_DEBUG_LVL(32, ("IPCL_HASH_INSERT_WILDCARD: connfp %p "	\
    921 	    "connp %p", (void *)(connfp), (void *)(connp)));		\
    922 	IPCL_HASH_REMOVE((connp));					\
    923 	mutex_enter(&(connfp)->connf_lock);				\
    924 	list = &(connfp)->connf_head;					\
    925 	prev = NULL;							\
    926 	while ((next = *list) != NULL) {				\
    927 		if (isv4mapped &&					\
    928 		    IN6_IS_ADDR_UNSPECIFIED(&next->conn_srcv6) &&	\
    929 		    connp->conn_zoneid == next->conn_zoneid) {		\
    930 			(connp)->conn_next = next;			\
    931 			if (prev != NULL)				\
    932 				prev = next->conn_prev;			\
    933 			next->conn_prev = (connp);			\
    934 			break;						\
    935 		}							\
    936 		list = &next->conn_next;				\
    937 		prev = next;						\
    938 	}								\
    939 	(connp)->conn_prev = prev;					\
    940 	*list = (connp);						\
    941 	(connp)->conn_fanout = (connfp);				\
    942 	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
    943 	    IPCL_BOUND;							\
    944 	CONN_INC_REF((connp));						\
    945 	mutex_exit(&(connfp)->connf_lock);				\
    946 }
    947 
    948 void
    949 ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp)
    950 {
    951 	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
    952 }
    953 
    954 void
    955 ipcl_proto_insert(conn_t *connp, uint8_t protocol)
    956 {
    957 	connf_t	*connfp;
    958 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
    959 
    960 	ASSERT(connp != NULL);
    961 	ASSERT(!connp->conn_mac_exempt || protocol == IPPROTO_AH ||
    962 	    protocol == IPPROTO_ESP);
    963 
    964 	connp->conn_ulp = protocol;
    965 
    966 	/* Insert it in the protocol hash */
    967 	connfp = &ipst->ips_ipcl_proto_fanout[protocol];
    968 	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
    969 }
    970 
    971 void
    972 ipcl_proto_insert_v6(conn_t *connp, uint8_t protocol)
    973 {
    974 	connf_t	*connfp;
    975 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
    976 
    977 	ASSERT(connp != NULL);
    978 	ASSERT(!connp->conn_mac_exempt || protocol == IPPROTO_AH ||
    979 	    protocol == IPPROTO_ESP);
    980 
    981 	connp->conn_ulp = protocol;
    982 
    983 	/* Insert it in the Bind Hash */
    984 	connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
    985 	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
    986 }
    987 
    988 /*
    989  * This function is used only for inserting SCTP raw socket now.
    990  * This may change later.
    991  *
    992  * Note that only one raw socket can be bound to a port.  The param
    993  * lport is in network byte order.
    994  */
    995 static int
    996 ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport)
    997 {
    998 	connf_t	*connfp;
    999 	conn_t	*oconnp;
   1000 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
   1001 
   1002 	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];
   1003 
   1004 	/* Check for existing raw socket already bound to the port. */
   1005 	mutex_enter(&connfp->connf_lock);
   1006 	for (oconnp = connfp->connf_head; oconnp != NULL;
   1007 	    oconnp = oconnp->conn_next) {
   1008 		if (oconnp->conn_lport == lport &&
   1009 		    oconnp->conn_zoneid == connp->conn_zoneid &&
   1010 		    oconnp->conn_af_isv6 == connp->conn_af_isv6 &&
   1011 		    ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) ||
   1012 		    IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_srcv6) ||
   1013 		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_srcv6) ||
   1014 		    IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_srcv6)) ||
   1015 		    IN6_ARE_ADDR_EQUAL(&oconnp->conn_srcv6,
   1016 		    &connp->conn_srcv6))) {
   1017 			break;
   1018 		}
   1019 	}
   1020 	mutex_exit(&connfp->connf_lock);
   1021 	if (oconnp != NULL)
   1022 		return (EADDRNOTAVAIL);
   1023 
   1024 	if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6) ||
   1025 	    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_remv6)) {
   1026 		if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) ||
   1027 		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_srcv6)) {
   1028 			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
   1029 		} else {
   1030 			IPCL_HASH_INSERT_BOUND(connfp, connp);
   1031 		}
   1032 	} else {
   1033 		IPCL_HASH_INSERT_CONNECTED(connfp, connp);
   1034 	}
   1035 	return (0);
   1036 }
   1037 
   1038 /