1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 const char ipclassifier_version[] = "@(#)ipclassifier.c %I% %E% SMI"; 29 30 /* 31 * IP PACKET CLASSIFIER 32 * 33 * The IP packet classifier provides mapping between IP packets and persistent 34 * connection state for connection-oriented protocols. It also provides 35 * interface for managing connection states. 36 * 37 * The connection state is kept in conn_t data structure and contains, among 38 * other things: 39 * 40 * o local/remote address and ports 41 * o Transport protocol 42 * o squeue for the connection (for TCP only) 43 * o reference counter 44 * o Connection state 45 * o hash table linkage 46 * o interface/ire information 47 * o credentials 48 * o ipsec policy 49 * o send and receive functions. 50 * o mutex lock. 51 * 52 * Connections use a reference counting scheme. They are freed when the 53 * reference counter drops to zero. 
 * A reference is incremented when a connection
 * is placed in a list or table, when an incoming packet for the connection
 * arrives and when the connection is processed via squeue (squeue processing
 * may be asynchronous and the reference protects the connection from being
 * destroyed before its processing is finished).
 *
 * send and receive functions are currently used for TCP only. The send function
 * determines the IP entry point for the packet once it leaves TCP to be sent to
 * the destination address. The receive function is used by IP when the packet
 * should be passed for TCP processing. When a new connection is created these
 * are set to ip_output() and tcp_input() respectively. During the lifetime of
 * the connection the send and receive functions may change depending on the
 * changes in the connection state. For example, once the connection is bound to
 * an address, the receive function for this connection is set to
 * tcp_conn_request().  This allows incoming SYNs to go directly into the
 * listener SYN processing function without going to tcp_input() first.
 *
 * Classifier uses several hash tables:
 *
 *	ipcl_conn_fanout:	contains all TCP connections in CONNECTED state
 *	ipcl_bind_fanout:	contains all connections in BOUND state
 *	ipcl_proto_fanout:	IPv4 protocol fanout
 *	ipcl_proto_fanout_v6:	IPv6 protocol fanout
 *	ipcl_udp_fanout:	contains all UDP connections
 *	ipcl_globalhash_fanout:	contains all connections
 *
 * The ipcl_globalhash_fanout is used for any walkers (like snmp and Clustering)
 * which need to view all existing connections.
 *
 * All tables are protected by per-bucket locks. When both per-bucket lock and
 * connection lock need to be held, the per-bucket lock should be acquired
 * first, followed by the connection lock.
 *
 * All functions doing search in one of these tables increment a reference
 * counter on the connection found (if any).
 * This reference should be dropped
 * when the caller has finished processing the connection.
 *
 *
 * INTERFACES:
 * ===========
 *
 * Connection Lookup:
 * ------------------
 *
 * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, zoneid, ip_stack)
 * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, zoneid, ip_stack)
 *
 * Finds connection for an incoming IPv4 or IPv6 packet. Returns NULL if
 * it can't find any associated connection. If the connection is found, its
 * reference counter is incremented.
 *
 *	mp:	mblock, containing packet header. The full header should fit
 *		into a single mblock. It should also contain at least full IP
 *		and TCP or UDP header.
 *
 *	protocol: Either IPPROTO_TCP or IPPROTO_UDP.
 *
 *	hdr_len: The size of IP header. It is used to find TCP or UDP header in
 *		 the packet.
 *
 *	zoneid: The zone in which the returned connection must be; the zoneid
 *		corresponding to the ire_zoneid on the IRE located for the
 *		packet's destination address.
 *
 *	For TCP connections, the lookup order is as follows:
 *		5-tuple {src, dst, protocol, local port, remote port}
 *			lookup in ipcl_conn_fanout table.
 *		3-tuple {dst, remote port, protocol} lookup in
 *			ipcl_bind_fanout table.
 *
 *	For UDP connections, a 5-tuple {src, dst, protocol, local port,
 *	remote port} lookup is done on ipcl_udp_fanout. Note that
 *	these interfaces do not handle cases where a packet belongs
 *	to multiple UDP clients, which is handled in IP itself.
 *
 * If the destination IRE is ALL_ZONES (indicated by zoneid), then we must
 * determine which actual zone gets the segment.  This is used only in a
 * labeled environment.  The matching rules are:
 *
 *	- If it's not a multilevel port, then the label on the packet selects
 *	  the zone.  Unlabeled packets are delivered to the global zone.
 *
 *	- If it's a multilevel port, then only the zone registered to receive
 *	  packets on that port matches.
 *
 * Also, in a labeled environment, packet labels need to be checked.  For fully
 * bound TCP connections, we can assume that the packet label was checked
 * during connection establishment, and doesn't need to be checked on each
 * packet.  For others, though, we need to check for strict equality or, for
 * multilevel ports, membership in the range or set.  This part currently does
 * a tnrh lookup on each packet, but could be optimized to use cached results
 * if that were necessary.  (SCTP doesn't come through here, but if it did,
 * we would apply the same rules as TCP.)
 *
 * An implication of the above is that fully-bound TCP sockets must always use
 * distinct 4-tuples; they can't be discriminated by label alone.
 *
 * Note that we cannot trust labels on packets sent to fully-bound UDP sockets,
 * as there's no connection set-up handshake and no shared state.
 *
 * Labels on looped-back packets within a single zone do not need to be
 * checked, as all processes in the same zone have the same label.
 *
 * Finally, for unlabeled packets received by a labeled system, special rules
 * apply.  We consider only the MLP if there is one.  Otherwise, we prefer a
 * socket in the zone whose label matches the default label of the sender, if
 * any.  In any event, the receiving socket must have SO_MAC_EXEMPT set and the
 * receiver's label must dominate the sender's default label.
 *
 * conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcph_t *, int, ip_stack);
 * conn_t *ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t,
 *					 ip_stack);
 *
 *	Lookup routine to find an exact match for {src, dst, local port,
 *	remote port} for TCP connections in ipcl_conn_fanout. The address and
 *	ports are read from the IP and TCP header respectively.
 *
 * conn_t	*ipcl_lookup_listener_v4(lport, laddr, protocol,
 *					 zoneid, ip_stack);
 * conn_t	*ipcl_lookup_listener_v6(lport, laddr, protocol, ifindex,
 *					 zoneid, ip_stack);
 *
 *	Lookup routine to find a listener with the tuple {lport, laddr,
 *	protocol} in the ipcl_bind_fanout table. For IPv6, an additional
 *	parameter interface index is also compared.
 *
 * void ipcl_walk(func, arg, ip_stack)
 *
 *	Apply 'func' to every connection available. The 'func' is called as
 *	(*func)(connp, arg). The walk is non-atomic so connections may be
 *	created and destroyed during the walk. The CONN_CONDEMNED and
 *	CONN_INCIPIENT flags ensure that connections which are newly created
 *	or being destroyed are not selected by the walker.
 *
 * Table Updates
 * -------------
 *
 * int ipcl_conn_insert(connp, protocol, src, dst, ports)
 * int ipcl_conn_insert_v6(connp, protocol, src, dst, ports, ifindex)
 *
 *	Insert 'connp' in the ipcl_conn_fanout.
 *	Arguments :
 *		connp		conn_t to be inserted
 *		protocol	connection protocol
 *		src		source address
 *		dst		destination address
 *		ports		local and remote port
 *		ifindex		interface index for IPv6 connections
 *
 *	Return value :
 *		0		if connp was inserted
 *		EADDRINUSE	if the connection with the same tuple
 *				already exists.
 *
 * int ipcl_bind_insert(connp, protocol, src, lport);
 * int ipcl_bind_insert_v6(connp, protocol, src, lport);
 *
 *	Insert 'connp' in ipcl_bind_fanout.
 *	Arguments :
 *		connp		conn_t to be inserted
 *		protocol	connection protocol
 *		src		source address connection wants
 *				to bind to
 *		lport		local port connection wants to
 *				bind to
 *
 *
 * void ipcl_hash_remove(connp);
 *
 *	Removes the 'connp' from the connection fanout table.
 *
 * Connection Creation/Destruction
 * -------------------------------
 *
 * conn_t *ipcl_conn_create(type, sleep, netstack_t *)
 *
 *	Creates a new conn based on the type flag, inserts it into
 *	globalhash table.
 *
 *	type:	This flag determines the type of conn_t which needs to be
 *		created i.e., which kmem_cache it comes from.
 *		IPCL_TCPCONN	indicates a TCP connection
 *		IPCL_SCTPCONN	indicates a SCTP connection
 *		IPCL_UDPCONN	indicates a UDP conn_t.
 *		IPCL_RAWIPCONN	indicates a RAWIP/ICMP conn_t.
 *		IPCL_RTSCONN	indicates a RTS conn_t.
 *		IPCL_IPCCONN	indicates all other connections.
 *
 * void ipcl_conn_destroy(connp)
 *
 *	Destroys the connection state, removes it from the global
 *	connection hash table and frees its memory.
 */

#include <sys/types.h>
#include <sys/stream.h>
#include <sys/stropts.h>
#include <sys/sysmacros.h>
#include <sys/strsubr.h>
#include <sys/strsun.h>
#define	_SUN_TPI_VERSION 2
#include <sys/ddi.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>

#include <sys/systm.h>
#include <sys/param.h>
#include <sys/kmem.h>
#include <sys/isa_defs.h>
#include <inet/common.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>

#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/tcp.h>
#include <inet/ip_ndp.h>
#include <inet/udp_impl.h>
#include <inet/sctp_ip.h>
#include <inet/sctp/sctp_impl.h>
#include <inet/rawip_impl.h>
#include <inet/rts_impl.h>

#include <sys/cpuvar.h>

#include <inet/ipclassifier.h>
#include <inet/ipsec_impl.h>

#include <sys/tsol/tnet.h>

/* IPCL_DEBUG simply tracks DEBUG: defined on DEBUG builds, undefined else. */
#ifdef DEBUG
#define	IPCL_DEBUG
#else
#undef	IPCL_DEBUG
#endif

/*
 * IPCL_DEBUG_LVL(level, args): on IPCL_DEBUG builds, printf 'args' (a
 * parenthesized printf argument list) when 'level' has a bit in common with
 * ipcl_debug_level; otherwise it expands to an empty block.
 *
 * NOTE(review): the debug form expands to a bare `if', so it should not be
 * used as the unbraced body of an if/else.
 */
#ifdef	IPCL_DEBUG
int	ipcl_debug_level = 0;
#define	IPCL_DEBUG_LVL(level, args)	\
	if (ipcl_debug_level  & level) { printf args; }
#else
#define	IPCL_DEBUG_LVL(level, args) {; }
#endif
/* Old value for compatibility. Settable in /etc/system */
uint_t tcp_conn_hash_size = 0;

/* New value. Zero means choose automatically.  Settable in /etc/system */
uint_t ipcl_conn_hash_size = 0;
/* Used by ipcl_init() as the divisor of free memory (in bytes). */
uint_t ipcl_conn_hash_memfactor = 8192;
/* Upper bound ipcl_init() applies to the automatically chosen size. */
uint_t ipcl_conn_hash_maxsize = 82500;

/* bind/udp fanout table size */
uint_t ipcl_bind_fanout_size = 512;
uint_t ipcl_udp_fanout_size = 16384;

/* Raw socket fanout size.  Must be a power of 2. */
uint_t ipcl_raw_fanout_size = 256;

/*
 * Power of 2^N Primes useful for hashing for N of 0-28,
 * these primes are the nearest prime <= 2^N - 2^(N-2).
 *
 * The initializer has 30 entries: indices 0-2 and the final entry are 0.
 * ipcl_init() treats a 0 entry as "out of range".
 */

#define	P2Ps() {0, 0, 0, 5, 11, 23, 47, 89, 191, 383, 761, 1531, 3067,	\
		6143, 12281, 24571, 49139, 98299, 196597, 393209,	\
		786431, 1572853, 3145721, 6291449, 12582893, 25165813,	\
		50331599, 100663291, 201326557, 0}

/*
 * wrapper structure to ensure that conn and what follows it (tcp_t, etc)
 * are aligned on cache lines.
324 */ 325 typedef union itc_s { 326 conn_t itc_conn; 327 char itcu_filler[CACHE_ALIGN(conn_s)]; 328 } itc_t; 329 330 struct kmem_cache *tcp_conn_cache; 331 struct kmem_cache *ip_conn_cache; 332 extern struct kmem_cache *sctp_conn_cache; 333 extern struct kmem_cache *tcp_sack_info_cache; 334 extern struct kmem_cache *tcp_iphc_cache; 335 struct kmem_cache *udp_conn_cache; 336 struct kmem_cache *rawip_conn_cache; 337 struct kmem_cache *rts_conn_cache; 338 339 extern void tcp_timermp_free(tcp_t *); 340 extern mblk_t *tcp_timermp_alloc(int); 341 342 static int ip_conn_constructor(void *, void *, int); 343 static void ip_conn_destructor(void *, void *); 344 345 static int tcp_conn_constructor(void *, void *, int); 346 static void tcp_conn_destructor(void *, void *); 347 348 static int udp_conn_constructor(void *, void *, int); 349 static void udp_conn_destructor(void *, void *); 350 351 static int rawip_conn_constructor(void *, void *, int); 352 static void rawip_conn_destructor(void *, void *); 353 354 static int rts_conn_constructor(void *, void *, int); 355 static void rts_conn_destructor(void *, void *); 356 357 #ifdef IPCL_DEBUG 358 #define INET_NTOA_BUFSIZE 18 359 360 static char * 361 inet_ntoa_r(uint32_t in, char *b) 362 { 363 unsigned char *p; 364 365 p = (unsigned char *)∈ 366 (void) sprintf(b, "%d.%d.%d.%d", p[0], p[1], p[2], p[3]); 367 return (b); 368 } 369 #endif 370 371 /* 372 * Global (for all stack instances) init routine 373 */ 374 void 375 ipcl_g_init(void) 376 { 377 ip_conn_cache = kmem_cache_create("ip_conn_cache", 378 sizeof (conn_t), CACHE_ALIGN_SIZE, 379 ip_conn_constructor, ip_conn_destructor, 380 NULL, NULL, NULL, 0); 381 382 tcp_conn_cache = kmem_cache_create("tcp_conn_cache", 383 sizeof (itc_t) + sizeof (tcp_t), CACHE_ALIGN_SIZE, 384 tcp_conn_constructor, tcp_conn_destructor, 385 NULL, NULL, NULL, 0); 386 387 udp_conn_cache = kmem_cache_create("udp_conn_cache", 388 sizeof (itc_t) + sizeof (udp_t), CACHE_ALIGN_SIZE, 389 udp_conn_constructor, 
udp_conn_destructor, 390 NULL, NULL, NULL, 0); 391 392 rawip_conn_cache = kmem_cache_create("rawip_conn_cache", 393 sizeof (itc_t) + sizeof (icmp_t), CACHE_ALIGN_SIZE, 394 rawip_conn_constructor, rawip_conn_destructor, 395 NULL, NULL, NULL, 0); 396 397 rts_conn_cache = kmem_cache_create("rts_conn_cache", 398 sizeof (itc_t) + sizeof (rts_t), CACHE_ALIGN_SIZE, 399 rts_conn_constructor, rts_conn_destructor, 400 NULL, NULL, NULL, 0); 401 } 402 403 /* 404 * ipclassifier intialization routine, sets up hash tables. 405 */ 406 void 407 ipcl_init(ip_stack_t *ipst) 408 { 409 int i; 410 int sizes[] = P2Ps(); 411 412 /* 413 * Calculate size of conn fanout table from /etc/system settings 414 */ 415 if (ipcl_conn_hash_size != 0) { 416 ipst->ips_ipcl_conn_fanout_size = ipcl_conn_hash_size; 417 } else if (tcp_conn_hash_size != 0) { 418 ipst->ips_ipcl_conn_fanout_size = tcp_conn_hash_size; 419 } else { 420 extern pgcnt_t freemem; 421 422 ipst->ips_ipcl_conn_fanout_size = 423 (freemem * PAGESIZE) / ipcl_conn_hash_memfactor; 424 425 if (ipst->ips_ipcl_conn_fanout_size > ipcl_conn_hash_maxsize) { 426 ipst->ips_ipcl_conn_fanout_size = 427 ipcl_conn_hash_maxsize; 428 } 429 } 430 431 for (i = 9; i < sizeof (sizes) / sizeof (*sizes) - 1; i++) { 432 if (sizes[i] >= ipst->ips_ipcl_conn_fanout_size) { 433 break; 434 } 435 } 436 if ((ipst->ips_ipcl_conn_fanout_size = sizes[i]) == 0) { 437 /* Out of range, use the 2^16 value */ 438 ipst->ips_ipcl_conn_fanout_size = sizes[16]; 439 } 440 441 /* Take values from /etc/system */ 442 ipst->ips_ipcl_bind_fanout_size = ipcl_bind_fanout_size; 443 ipst->ips_ipcl_udp_fanout_size = ipcl_udp_fanout_size; 444 ipst->ips_ipcl_raw_fanout_size = ipcl_raw_fanout_size; 445 446 ASSERT(ipst->ips_ipcl_conn_fanout == NULL); 447 448 ipst->ips_ipcl_conn_fanout = kmem_zalloc( 449 ipst->ips_ipcl_conn_fanout_size * sizeof (connf_t), KM_SLEEP); 450 451 for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) { 452 mutex_init(&ipst->ips_ipcl_conn_fanout[i].connf_lock, NULL, 
453 MUTEX_DEFAULT, NULL); 454 } 455 456 ipst->ips_ipcl_bind_fanout = kmem_zalloc( 457 ipst->ips_ipcl_bind_fanout_size * sizeof (connf_t), KM_SLEEP); 458 459 for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) { 460 mutex_init(&ipst->ips_ipcl_bind_fanout[i].connf_lock, NULL, 461 MUTEX_DEFAULT, NULL); 462 } 463 464 ipst->ips_ipcl_proto_fanout = kmem_zalloc(IPPROTO_MAX * 465 sizeof (connf_t), KM_SLEEP); 466 for (i = 0; i < IPPROTO_MAX; i++) { 467 mutex_init(&ipst->ips_ipcl_proto_fanout[i].connf_lock, NULL, 468 MUTEX_DEFAULT, NULL); 469 } 470 471 ipst->ips_ipcl_proto_fanout_v6 = kmem_zalloc(IPPROTO_MAX * 472 sizeof (connf_t), KM_SLEEP); 473 for (i = 0; i < IPPROTO_MAX; i++) { 474 mutex_init(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock, NULL, 475 MUTEX_DEFAULT, NULL); 476 } 477 478 ipst->ips_rts_clients = kmem_zalloc(sizeof (connf_t), KM_SLEEP); 479 mutex_init(&ipst->ips_rts_clients->connf_lock, 480 NULL, MUTEX_DEFAULT, NULL); 481 482 ipst->ips_ipcl_udp_fanout = kmem_zalloc( 483 ipst->ips_ipcl_udp_fanout_size * sizeof (connf_t), KM_SLEEP); 484 for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) { 485 mutex_init(&ipst->ips_ipcl_udp_fanout[i].connf_lock, NULL, 486 MUTEX_DEFAULT, NULL); 487 } 488 489 ipst->ips_ipcl_raw_fanout = kmem_zalloc( 490 ipst->ips_ipcl_raw_fanout_size * sizeof (connf_t), KM_SLEEP); 491 for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) { 492 mutex_init(&ipst->ips_ipcl_raw_fanout[i].connf_lock, NULL, 493 MUTEX_DEFAULT, NULL); 494 } 495 496 ipst->ips_ipcl_globalhash_fanout = kmem_zalloc( 497 sizeof (connf_t) * CONN_G_HASH_SIZE, KM_SLEEP); 498 for (i = 0; i < CONN_G_HASH_SIZE; i++) { 499 mutex_init(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock, 500 NULL, MUTEX_DEFAULT, NULL); 501 } 502 } 503 504 void 505 ipcl_g_destroy(void) 506 { 507 kmem_cache_destroy(ip_conn_cache); 508 kmem_cache_destroy(tcp_conn_cache); 509 kmem_cache_destroy(udp_conn_cache); 510 kmem_cache_destroy(rawip_conn_cache); 511 kmem_cache_destroy(rts_conn_cache); 512 } 513 514 /* 
515 * All user-level and kernel use of the stack must be gone 516 * by now. 517 */ 518 void 519 ipcl_destroy(ip_stack_t *ipst) 520 { 521 int i; 522 523 for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) { 524 ASSERT(ipst->ips_ipcl_conn_fanout[i].connf_head == NULL); 525 mutex_destroy(&ipst->ips_ipcl_conn_fanout[i].connf_lock); 526 } 527 kmem_free(ipst->ips_ipcl_conn_fanout, ipst->ips_ipcl_conn_fanout_size * 528 sizeof (connf_t)); 529 ipst->ips_ipcl_conn_fanout = NULL; 530 531 for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) { 532 ASSERT(ipst->ips_ipcl_bind_fanout[i].connf_head == NULL); 533 mutex_destroy(&ipst->ips_ipcl_bind_fanout[i].connf_lock); 534 } 535 kmem_free(ipst->ips_ipcl_bind_fanout, ipst->ips_ipcl_bind_fanout_size * 536 sizeof (connf_t)); 537 ipst->ips_ipcl_bind_fanout = NULL; 538 539 for (i = 0; i < IPPROTO_MAX; i++) { 540 ASSERT(ipst->ips_ipcl_proto_fanout[i].connf_head == NULL); 541 mutex_destroy(&ipst->ips_ipcl_proto_fanout[i].connf_lock); 542 } 543 kmem_free(ipst->ips_ipcl_proto_fanout, IPPROTO_MAX * sizeof (connf_t)); 544 ipst->ips_ipcl_proto_fanout = NULL; 545 546 for (i = 0; i < IPPROTO_MAX; i++) { 547 ASSERT(ipst->ips_ipcl_proto_fanout_v6[i].connf_head == NULL); 548 mutex_destroy(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock); 549 } 550 kmem_free(ipst->ips_ipcl_proto_fanout_v6, 551 IPPROTO_MAX * sizeof (connf_t)); 552 ipst->ips_ipcl_proto_fanout_v6 = NULL; 553 554 for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) { 555 ASSERT(ipst->ips_ipcl_udp_fanout[i].connf_head == NULL); 556 mutex_destroy(&ipst->ips_ipcl_udp_fanout[i].connf_lock); 557 } 558 kmem_free(ipst->ips_ipcl_udp_fanout, ipst->ips_ipcl_udp_fanout_size * 559 sizeof (connf_t)); 560 ipst->ips_ipcl_udp_fanout = NULL; 561 562 for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) { 563 ASSERT(ipst->ips_ipcl_raw_fanout[i].connf_head == NULL); 564 mutex_destroy(&ipst->ips_ipcl_raw_fanout[i].connf_lock); 565 } 566 kmem_free(ipst->ips_ipcl_raw_fanout, ipst->ips_ipcl_raw_fanout_size * 
567 sizeof (connf_t)); 568 ipst->ips_ipcl_raw_fanout = NULL; 569 570 for (i = 0; i < CONN_G_HASH_SIZE; i++) { 571 ASSERT(ipst->ips_ipcl_globalhash_fanout[i].connf_head == NULL); 572 mutex_destroy(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock); 573 } 574 kmem_free(ipst->ips_ipcl_globalhash_fanout, 575 sizeof (connf_t) * CONN_G_HASH_SIZE); 576 ipst->ips_ipcl_globalhash_fanout = NULL; 577 578 ASSERT(ipst->ips_rts_clients->connf_head == NULL); 579 mutex_destroy(&ipst->ips_rts_clients->connf_lock); 580 kmem_free(ipst->ips_rts_clients, sizeof (connf_t)); 581 ipst->ips_rts_clients = NULL; 582 } 583 584 /* 585 * conn creation routine. initialize the conn, sets the reference 586 * and inserts it in the global hash table. 587 */ 588 conn_t * 589 ipcl_conn_create(uint32_t type, int sleep, netstack_t *ns) 590 { 591 conn_t *connp; 592 sctp_stack_t *sctps; 593 struct kmem_cache *conn_cache; 594 595 switch (type) { 596 case IPCL_SCTPCONN: 597 if ((connp = kmem_cache_alloc(sctp_conn_cache, sleep)) == NULL) 598 return (NULL); 599 sctp_conn_init(connp); 600 sctps = ns->netstack_sctp; 601 SCTP_G_Q_REFHOLD(sctps); 602 netstack_hold(ns); 603 connp->conn_netstack = ns; 604 return (connp); 605 606 case IPCL_TCPCONN: 607 conn_cache = tcp_conn_cache; 608 break; 609 610 case IPCL_UDPCONN: 611 conn_cache = udp_conn_cache; 612 break; 613 614 case IPCL_RAWIPCONN: 615 conn_cache = rawip_conn_cache; 616 break; 617 618 case IPCL_RTSCONN: 619 conn_cache = rts_conn_cache; 620 break; 621 622 case IPCL_IPCCONN: 623 conn_cache = ip_conn_cache; 624 break; 625 626 default: 627 connp = NULL; 628 ASSERT(0); 629 } 630 631 if ((connp = kmem_cache_alloc(conn_cache, sleep)) == NULL) 632 return (NULL); 633 634 connp->conn_ref = 1; 635 netstack_hold(ns); 636 connp->conn_netstack = ns; 637 ipcl_globalhash_insert(connp); 638 return (connp); 639 } 640 641 void 642 ipcl_conn_destroy(conn_t *connp) 643 { 644 mblk_t *mp; 645 netstack_t *ns = connp->conn_netstack; 646 647 ASSERT(!MUTEX_HELD(&connp->conn_lock)); 648 
ASSERT(connp->conn_ref == 0); 649 ASSERT(connp->conn_ire_cache == NULL); 650 651 if (connp->conn_peercred != NULL && 652 connp->conn_peercred != connp->conn_cred) 653 crfree(connp->conn_peercred); 654 connp->conn_peercred = NULL; 655 656 if (connp->conn_cred != NULL) { 657 crfree(connp->conn_cred); 658 connp->conn_cred = NULL; 659 } 660 661 ipcl_globalhash_remove(connp); 662 663 /* FIXME: add separate tcp_conn_free()? */ 664 if (connp->conn_flags & IPCL_TCPCONN) { 665 tcp_t *tcp = connp->conn_tcp; 666 tcp_stack_t *tcps; 667 668 ASSERT(tcp != NULL); 669 tcps = tcp->tcp_tcps; 670 if (tcps != NULL) { 671 if (connp->conn_latch != NULL) { 672 IPLATCH_REFRELE(connp->conn_latch, ns); 673 connp->conn_latch = NULL; 674 } 675 if (connp->conn_policy != NULL) { 676 IPPH_REFRELE(connp->conn_policy, ns); 677 connp->conn_policy = NULL; 678 } 679 tcp->tcp_tcps = NULL; 680 TCPS_REFRELE(tcps); 681 } 682 683 tcp_free(tcp); 684 mp = tcp->tcp_timercache; 685 tcp->tcp_cred = NULL; 686 687 if (tcp->tcp_sack_info != NULL) { 688 bzero(tcp->tcp_sack_info, sizeof (tcp_sack_info_t)); 689 kmem_cache_free(tcp_sack_info_cache, 690 tcp->tcp_sack_info); 691 } 692 if (tcp->tcp_iphc != NULL) { 693 if (tcp->tcp_hdr_grown) { 694 kmem_free(tcp->tcp_iphc, tcp->tcp_iphc_len); 695 } else { 696 bzero(tcp->tcp_iphc, tcp->tcp_iphc_len); 697 kmem_cache_free(tcp_iphc_cache, tcp->tcp_iphc); 698 } 699 tcp->tcp_iphc_len = 0; 700 } 701 ASSERT(tcp->tcp_iphc_len == 0); 702 703 ASSERT(connp->conn_latch == NULL); 704 ASSERT(connp->conn_policy == NULL); 705 706 if (ns != NULL) { 707 ASSERT(tcp->tcp_tcps == NULL); 708 connp->conn_netstack = NULL; 709 netstack_rele(ns); 710 } 711 712 ipcl_conn_cleanup(connp); 713 connp->conn_flags = IPCL_TCPCONN; 714 bzero(tcp, sizeof (tcp_t)); 715 716 tcp->tcp_timercache = mp; 717 tcp->tcp_connp = connp; 718 kmem_cache_free(tcp_conn_cache, connp); 719 return; 720 } 721 if (connp->conn_latch != NULL) { 722 IPLATCH_REFRELE(connp->conn_latch, connp->conn_netstack); 723 connp->conn_latch = 
NULL; 724 } 725 if (connp->conn_policy != NULL) { 726 IPPH_REFRELE(connp->conn_policy, connp->conn_netstack); 727 connp->conn_policy = NULL; 728 } 729 if (connp->conn_ipsec_opt_mp != NULL) { 730 freemsg(connp->conn_ipsec_opt_mp); 731 connp->conn_ipsec_opt_mp = NULL; 732 } 733 734 if (connp->conn_flags & IPCL_SCTPCONN) { 735 ASSERT(ns != NULL); 736 sctp_free(connp); 737 return; 738 } 739 740 if (ns != NULL) { 741 connp->conn_netstack = NULL; 742 netstack_rele(ns); 743 } 744 ipcl_conn_cleanup(connp); 745 746 /* leave conn_priv aka conn_udp, conn_icmp, etc in place. */ 747 if (connp->conn_flags & IPCL_UDPCONN) { 748 connp->conn_flags = IPCL_UDPCONN; 749 kmem_cache_free(udp_conn_cache, connp); 750 } else if (connp->conn_flags & IPCL_RAWIPCONN) { 751 connp->conn_flags = IPCL_RAWIPCONN; 752 connp->conn_ulp = IPPROTO_ICMP; 753 kmem_cache_free(rawip_conn_cache, connp); 754 } else if (connp->conn_flags & IPCL_RTSCONN) { 755 connp->conn_flags = IPCL_RTSCONN; 756 kmem_cache_free(rts_conn_cache, connp); 757 } else { 758 connp->conn_flags = IPCL_IPCCONN; 759 ASSERT(connp->conn_flags & IPCL_IPCCONN); 760 ASSERT(connp->conn_priv == NULL); 761 kmem_cache_free(ip_conn_cache, connp); 762 } 763 } 764 765 /* 766 * Running in cluster mode - deregister listener information 767 */ 768 769 static void 770 ipcl_conn_unlisten(conn_t *connp) 771 { 772 ASSERT((connp->conn_flags & IPCL_CL_LISTENER) != 0); 773 ASSERT(connp->conn_lport != 0); 774 775 if (cl_inet_unlisten != NULL) { 776 sa_family_t addr_family; 777 uint8_t *laddrp; 778 779 if (connp->conn_pkt_isv6) { 780 addr_family = AF_INET6; 781 laddrp = (uint8_t *)&connp->conn_bound_source_v6; 782 } else { 783 addr_family = AF_INET; 784 laddrp = (uint8_t *)&connp->conn_bound_source; 785 } 786 (*cl_inet_unlisten)(IPPROTO_TCP, addr_family, laddrp, 787 connp->conn_lport); 788 } 789 connp->conn_flags &= ~IPCL_CL_LISTENER; 790 } 791 792 /* 793 * We set the IPCL_REMOVED flag (instead of clearing the flag indicating 794 * which table the conn 
 * belonged to). So for debugging we can see which hash
 * table this connection was in.
 *
 * IPCL_HASH_REMOVE(connp) does nothing when conn_fanout is NULL.  Otherwise,
 * under that bucket's connf_lock, it unlinks connp from the doubly linked
 * bucket list (fixing connf_head when connp was first), clears
 * conn_fanout/conn_next/conn_prev, sets IPCL_REMOVED, calls
 * ipcl_conn_unlisten() if IPCL_CL_LISTENER is set, and invokes
 * CONN_DEC_REF().  The caller must not hold conn_lock (asserted).
 *
 * The macro declares a block-local variable named connfp.
 */
#define	IPCL_HASH_REMOVE(connp)	{					\
	connf_t	*connfp = (connp)->conn_fanout;				\
	ASSERT(!MUTEX_HELD(&((connp)->conn_lock)));			\
	if (connfp != NULL) {						\
		IPCL_DEBUG_LVL(4, ("IPCL_HASH_REMOVE: connp %p",	\
		    (void *)(connp)));					\
		mutex_enter(&connfp->connf_lock);			\
		if ((connp)->conn_next != NULL)				\
			(connp)->conn_next->conn_prev =			\
			    (connp)->conn_prev;				\
		if ((connp)->conn_prev != NULL)				\
			(connp)->conn_prev->conn_next =			\
			    (connp)->conn_next;				\
		else							\
			connfp->connf_head = (connp)->conn_next;	\
		(connp)->conn_fanout = NULL;				\
		(connp)->conn_next = NULL;				\
		(connp)->conn_prev = NULL;				\
		(connp)->conn_flags |= IPCL_REMOVED;			\
		if (((connp)->conn_flags & IPCL_CL_LISTENER) != 0)	\
			ipcl_conn_unlisten((connp));			\
		CONN_DEC_REF((connp));					\
		mutex_exit(&connfp->connf_lock);			\
	}								\
}

/*
 * Function form of IPCL_HASH_REMOVE(): removes 'connp' from whichever
 * fanout bucket it is currently linked on, if any.
 */
void
ipcl_hash_remove(conn_t *connp)
{
	IPCL_HASH_REMOVE(connp);
}

/*
 * The whole purpose of this function is allow removal of
 * a conn_t from the connected hash for timewait reclaim.
 * This is essentially a TW reclaim fastpath where timewait
 * collector checks under fanout lock (so no one else can
 * get access to the conn_t) that refcnt is 2 i.e. one for
 * TCP and one for the classifier hash list. If ref count
 * is indeed 2, we can just remove the conn under lock and
 * avoid cleaning up the conn under squeue. This gives us
 * improved performance.
839 */ 840 void 841 ipcl_hash_remove_locked(conn_t *connp, connf_t *connfp) 842 { 843 ASSERT(MUTEX_HELD(&connfp->connf_lock)); 844 ASSERT(MUTEX_HELD(&connp->conn_lock)); 845 ASSERT((connp->conn_flags & IPCL_CL_LISTENER) == 0); 846 847 if ((connp)->conn_next != NULL) { 848 (connp)->conn_next->conn_prev = (connp)->conn_prev; 849 } 850 if ((connp)->conn_prev != NULL) { 851 (connp)->conn_prev->conn_next = (connp)->conn_next; 852 } else { 853 connfp->connf_head = (connp)->conn_next; 854 } 855 (connp)->conn_fanout = NULL; 856 (connp)->conn_next = NULL; 857 (connp)->conn_prev = NULL; 858 (connp)->conn_flags |= IPCL_REMOVED; 859 ASSERT((connp)->conn_ref == 2); 860 (connp)->conn_ref--; 861 } 862 863 #define IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp) { \ 864 ASSERT((connp)->conn_fanout == NULL); \ 865 ASSERT((connp)->conn_next == NULL); \ 866 ASSERT((connp)->conn_prev == NULL); \ 867 if ((connfp)->connf_head != NULL) { \ 868 (connfp)->connf_head->conn_prev = (connp); \ 869 (connp)->conn_next = (connfp)->connf_head; \ 870 } \ 871 (connp)->conn_fanout = (connfp); \ 872 (connfp)->connf_head = (connp); \ 873 (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \ 874 IPCL_CONNECTED; \ 875 CONN_INC_REF(connp); \ 876 } 877 878 #define IPCL_HASH_INSERT_CONNECTED(connfp, connp) { \ 879 IPCL_DEBUG_LVL(8, ("IPCL_HASH_INSERT_CONNECTED: connfp %p " \ 880 "connp %p", (void *)(connfp), (void *)(connp))); \ 881 IPCL_HASH_REMOVE((connp)); \ 882 mutex_enter(&(connfp)->connf_lock); \ 883 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp); \ 884 mutex_exit(&(connfp)->connf_lock); \ 885 } 886 887 #define IPCL_HASH_INSERT_BOUND(connfp, connp) { \ 888 conn_t *pconnp = NULL, *nconnp; \ 889 IPCL_DEBUG_LVL(32, ("IPCL_HASH_INSERT_BOUND: connfp %p " \ 890 "connp %p", (void *)connfp, (void *)(connp))); \ 891 IPCL_HASH_REMOVE((connp)); \ 892 mutex_enter(&(connfp)->connf_lock); \ 893 nconnp = (connfp)->connf_head; \ 894 while (nconnp != NULL && \ 895 !_IPCL_V4_MATCH_ANY(nconnp->conn_srcv6)) { 
\ 896 pconnp = nconnp; \ 897 nconnp = nconnp->conn_next; \ 898 } \ 899 if (pconnp != NULL) { \ 900 pconnp->conn_next = (connp); \ 901 (connp)->conn_prev = pconnp; \ 902 } else { \ 903 (connfp)->connf_head = (connp); \ 904 } \ 905 if (nconnp != NULL) { \ 906 (connp)->conn_next = nconnp; \ 907 nconnp->conn_prev = (connp); \ 908 } \ 909 (connp)->conn_fanout = (connfp); \ 910 (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \ 911 IPCL_BOUND; \ 912 CONN_INC_REF(connp); \ 913 mutex_exit(&(connfp)->connf_lock); \ 914 } 915 916 #define IPCL_HASH_INSERT_WILDCARD(connfp, connp) { \ 917 conn_t **list, *prev, *next; \ 918 boolean_t isv4mapped = \ 919 IN6_IS_ADDR_V4MAPPED(&(connp)->conn_srcv6); \ 920 IPCL_DEBUG_LVL(32, ("IPCL_HASH_INSERT_WILDCARD: connfp %p " \ 921 "connp %p", (void *)(connfp), (void *)(connp))); \ 922 IPCL_HASH_REMOVE((connp)); \ 923 mutex_enter(&(connfp)->connf_lock); \ 924 list = &(connfp)->connf_head; \ 925 prev = NULL; \ 926 while ((next = *list) != NULL) { \ 927 if (isv4mapped && \ 928 IN6_IS_ADDR_UNSPECIFIED(&next->conn_srcv6) && \ 929 connp->conn_zoneid == next->conn_zoneid) { \ 930 (connp)->conn_next = next; \ 931 if (prev != NULL) \ 932 prev = next->conn_prev; \ 933 next->conn_prev = (connp); \ 934 break; \ 935 } \ 936 list = &next->conn_next; \ 937 prev = next; \ 938 } \ 939 (connp)->conn_prev = prev; \ 940 *list = (connp); \ 941 (connp)->conn_fanout = (connfp); \ 942 (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \ 943 IPCL_BOUND; \ 944 CONN_INC_REF((connp)); \ 945 mutex_exit(&(connfp)->connf_lock); \ 946 } 947 948 void 949 ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp) 950 { 951 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 952 } 953 954 void 955 ipcl_proto_insert(conn_t *connp, uint8_t protocol) 956 { 957 connf_t *connfp; 958 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 959 960 ASSERT(connp != NULL); 961 ASSERT(!connp->conn_mac_exempt || protocol == IPPROTO_AH || 962 protocol == IPPROTO_ESP); 963 964 
connp->conn_ulp = protocol; 965 966 /* Insert it in the protocol hash */ 967 connfp = &ipst->ips_ipcl_proto_fanout[protocol]; 968 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 969 } 970 971 void 972 ipcl_proto_insert_v6(conn_t *connp, uint8_t protocol) 973 { 974 connf_t *connfp; 975 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 976 977 ASSERT(connp != NULL); 978 ASSERT(!connp->conn_mac_exempt || protocol == IPPROTO_AH || 979 protocol == IPPROTO_ESP); 980 981 connp->conn_ulp = protocol; 982 983 /* Insert it in the Bind Hash */ 984 connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol]; 985 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 986 } 987 988 /* 989 * This function is used only for inserting SCTP raw socket now. 990 * This may change later. 991 * 992 * Note that only one raw socket can be bound to a port. The param 993 * lport is in network byte order. 994 */ 995 static int 996 ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport) 997 { 998 connf_t *connfp; 999 conn_t *oconnp; 1000 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1001 1002 connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)]; 1003 1004 /* Check for existing raw socket already bound to the port. 
*/ 1005 mutex_enter(&connfp->connf_lock); 1006 for (oconnp = connfp->connf_head; oconnp != NULL; 1007 oconnp = oconnp->conn_next) { 1008 if (oconnp->conn_lport == lport && 1009 oconnp->conn_zoneid == connp->conn_zoneid && 1010 oconnp->conn_af_isv6 == connp->conn_af_isv6 && 1011 ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) || 1012 IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_srcv6) || 1013 IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_srcv6) || 1014 IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_srcv6)) || 1015 IN6_ARE_ADDR_EQUAL(&oconnp->conn_srcv6, 1016 &connp->conn_srcv6))) { 1017 break; 1018 } 1019 } 1020 mutex_exit(&connfp->connf_lock); 1021 if (oconnp != NULL) 1022 return (EADDRNOTAVAIL); 1023 1024 if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6) || 1025 IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_remv6)) { 1026 if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) || 1027 IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_srcv6)) { 1028 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1029 } else { 1030 IPCL_HASH_INSERT_BOUND(connfp, connp); 1031 } 1032 } else { 1033 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1034 } 1035 return (0); 1036 } 1037 1038 /