1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 /* Copyright (c) 1990 Mentat Inc. 
*/ 27 28 #pragma ident "%Z%%M% %I% %E% SMI" 29 const char tcp_version[] = "%Z%%M% %I% %E% SMI"; 30 31 32 #include <sys/types.h> 33 #include <sys/stream.h> 34 #include <sys/strsun.h> 35 #include <sys/strsubr.h> 36 #include <sys/stropts.h> 37 #include <sys/strlog.h> 38 #include <sys/strsun.h> 39 #define _SUN_TPI_VERSION 2 40 #include <sys/tihdr.h> 41 #include <sys/timod.h> 42 #include <sys/ddi.h> 43 #include <sys/sunddi.h> 44 #include <sys/suntpi.h> 45 #include <sys/xti_inet.h> 46 #include <sys/cmn_err.h> 47 #include <sys/debug.h> 48 #include <sys/sdt.h> 49 #include <sys/vtrace.h> 50 #include <sys/kmem.h> 51 #include <sys/ethernet.h> 52 #include <sys/cpuvar.h> 53 #include <sys/dlpi.h> 54 #include <sys/multidata.h> 55 #include <sys/multidata_impl.h> 56 #include <sys/pattr.h> 57 #include <sys/policy.h> 58 #include <sys/priv.h> 59 #include <sys/zone.h> 60 #include <sys/sunldi.h> 61 62 #include <sys/errno.h> 63 #include <sys/signal.h> 64 #include <sys/socket.h> 65 #include <sys/sockio.h> 66 #include <sys/isa_defs.h> 67 #include <sys/md5.h> 68 #include <sys/random.h> 69 #include <sys/sodirect.h> 70 #include <sys/uio.h> 71 #include <netinet/in.h> 72 #include <netinet/tcp.h> 73 #include <netinet/ip6.h> 74 #include <netinet/icmp6.h> 75 #include <net/if.h> 76 #include <net/route.h> 77 #include <inet/ipsec_impl.h> 78 79 #include <inet/common.h> 80 #include <inet/ip.h> 81 #include <inet/ip_impl.h> 82 #include <inet/ip6.h> 83 #include <inet/ip_ndp.h> 84 #include <inet/mi.h> 85 #include <inet/mib2.h> 86 #include <inet/nd.h> 87 #include <inet/optcom.h> 88 #include <inet/snmpcom.h> 89 #include <inet/kstatcom.h> 90 #include <inet/tcp.h> 91 #include <inet/tcp_impl.h> 92 #include <net/pfkeyv2.h> 93 #include <inet/ipsec_info.h> 94 #include <inet/ipdrop.h> 95 #include <inet/tcp_trace.h> 96 97 #include <inet/ipclassifier.h> 98 #include <inet/ip_ire.h> 99 #include <inet/ip_ftable.h> 100 #include <inet/ip_if.h> 101 #include <inet/ipp_common.h> 102 #include <inet/ip_netinfo.h> 103 #include 
<sys/squeue.h> 104 #include <inet/kssl/ksslapi.h> 105 #include <sys/tsol/label.h> 106 #include <sys/tsol/tnet.h> 107 #include <rpc/pmap_prot.h> 108 109 /* 110 * TCP Notes: aka FireEngine Phase I (PSARC 2002/433) 111 * 112 * (Read the detailed design doc in PSARC case directory) 113 * 114 * The entire tcp state is contained in tcp_t and conn_t structure 115 * which are allocated in tandem using ipcl_conn_create() and passing 116 * IPCL_CONNTCP as a flag. We use 'conn_ref' and 'conn_lock' to protect 117 * the references on the tcp_t. The tcp_t structure is never compressed 118 * and packets always land on the correct TCP perimeter from the time 119 * eager is created till the time tcp_t dies (as such the old mentat 120 * TCP global queue is not used for detached state and no IPSEC checking 121 * is required). The global queue is still allocated to send out resets 122 * for connection which have no listeners and IP directly calls 123 * tcp_xmit_listeners_reset() which does any policy check. 124 * 125 * Protection and Synchronisation mechanism: 126 * 127 * The tcp data structure does not use any kind of lock for protecting 128 * its state but instead uses 'squeues' for mutual exclusion from various 129 * read and write side threads. To access a tcp member, the thread should 130 * always be behind squeue (via squeue_enter, squeue_enter_nodrain, or 131 * squeue_fill). Since the squeues allow a direct function call, caller 132 * can pass any tcp function having prototype of edesc_t as argument 133 * (different from traditional STREAMs model where packets come in only 134 * designated entry points). The list of functions that can be directly 135 * called via squeue are listed before the usual function prototype. 136 * 137 * Referencing: 138 * 139 * TCP is MT-Hot and we use a reference based scheme to make sure that the 140 * tcp structure doesn't disappear when its needed. 
When the application 141 * creates an outgoing connection or accepts an incoming connection, we 142 * start out with 2 references on 'conn_ref'. One for TCP and one for IP. 143 * The IP reference is just a symbolic reference since ip_tcpclose() 144 * looks at tcp structure after tcp_close_output() returns which could 145 * have dropped the last TCP reference. So as long as the connection is 146 * in attached state i.e. !TCP_IS_DETACHED, we have 2 references on the 147 * conn_t. The classifier puts its own reference when the connection is 148 * inserted in listen or connected hash. Anytime a thread needs to enter 149 * the tcp connection perimeter, it retrieves the conn/tcp from q->ptr 150 * on write side or by doing a classify on read side and then puts a 151 * reference on the conn before doing squeue_enter/tryenter/fill. For 152 * read side, the classifier itself puts the reference under fanout lock 153 * to make sure that tcp can't disappear before it gets processed. The 154 * squeue will drop this reference automatically so the called function 155 * doesn't have to do a DEC_REF. 156 * 157 * Opening a new connection: 158 * 159 * The outgoing connection open is pretty simple. tcp_open() does the 160 * work in creating the conn/tcp structure and initializing it. The 161 * squeue assignment is done based on the CPU the application 162 * is running on. So for outbound connections, processing is always done 163 * on application CPU which might be different from the incoming CPU 164 * being interrupted by the NIC. An optimal way would be to figure out 165 * the NIC <-> CPU binding at listen time, and assign the outgoing 166 * connection to the squeue attached to the CPU that will be interrupted 167 * for incoming packets (we know the NIC based on the bind IP address). 
 * This might seem like a problem if more data is going out but the
 * fact is that in most cases the transmit is ACK driven transmit where
 * the outgoing data normally sits on TCP's xmit queue waiting to be
 * transmitted.
 *
 * Accepting a connection:
 *
 * This is a more interesting case because of various races involved in
 * establishing an eager in its own perimeter. Read the meta comment on
 * top of tcp_conn_request(). But briefly, the squeue is picked by
 * ip_tcp_input()/ip_fanout_tcp_v6() based on the interrupted CPU.
 *
 * Closing a connection:
 *
 * The close is fairly straightforward. tcp_close() calls tcp_close_output()
 * via squeue to do the close and mark the tcp as detached if the connection
 * was in state TCPS_ESTABLISHED or greater. In the latter case, TCP keeps its
 * reference but tcp_close() always drops IP's reference. So if tcp was
 * not killed, it is sitting in time_wait list with 2 references - 1 for TCP
 * and 1 because it is in classifier's connected hash. This is the condition
 * we use to determine that it's OK to clean up the tcp outside of squeue
 * when time wait expires (check the ref under fanout and conn_lock and
 * if it is 2, remove it from fanout hash and kill it).
 *
 * Although close just drops the necessary references and marks the
 * tcp_detached state, tcp_close needs to know the tcp_detached has been
 * set (under squeue) before letting the STREAM go away (because an
 * inbound packet might attempt to go up the STREAM while the close
 * has happened and tcp_detached is not set). So a special lock and
 * flag are used along with a condition variable (tcp_closelock, tcp_closed,
 * and tcp_closecv) to signal tcp_close that tcp_close_output() has marked
 * tcp_detached.
 *
 * Special provisions and fast paths:
 *
 * We make special provision for (AF_INET, SOCK_STREAM) sockets which
 * can't have 'ipv6_recvpktinfo' set and for these types of sockets, IP
 * will never send a M_CTL to TCP. As such, ip_tcp_input() which handles
 * all TCP packets from the wire makes an IPCL_IS_TCP4_CONNECTED_NO_POLICY
 * check to send packets directly to tcp_rput_data via squeue. Everyone
 * else comes through tcp_input() on the read side.
 *
 * We also make special provisions for sockfs by marking tcp_issocket
 * whenever we have only sockfs on top of TCP. This allows us to skip
 * putting the tcp in acceptor hash since a sockfs listener can never
 * become acceptor and also avoid allocating a tcp_t for acceptor STREAM
 * since eager has already been allocated and the accept now happens
 * on acceptor STREAM. There is a big blob of comment on top of
 * tcp_conn_request explaining the new accept. When socket is POP'd,
 * sockfs sends us an ioctl to mark the fact and we go back to old
 * behaviour. Once tcp_issocket is unset, it is never set again for the
 * life of that connection.
 *
 * In support of on-board asynchronous DMA hardware (e.g. Intel I/OAT)
 * two consolidation private KAPIs are used to enqueue M_DATA mblk_t's
 * directly to the socket (sodirect) and start an asynchronous copyout
 * to a user-land receive-side buffer (uioa) when a blocking socket read
 * (e.g. read, recv, ...) is pending.
 *
 * This is accomplished when tcp_issocket is set and tcp_sodirect is not
 * NULL (so it points to an sodirect_t); if that sodirect_t is marked
 * enabled then we enqueue all mblk_t's directly to the socket.
 *
 * Further, if the sodirect_t's sod_uioa is also marked enabled (due to a
 * blocking socket read, e.g. user-land read, recv, ...) then an asynchronous
 * copyout will be started directly to the user-land uio buffer.
 * Also, as we have a pending read, TCP's push logic can take into account
 * the number of bytes to be received and only awake the blocked read()er
 * when the uioa_t byte count has been satisfied.
 *
 * IPsec notes :
 *
 * Since a packet is always executed on the correct TCP perimeter
 * all IPsec processing is deferred to IP including checking new
 * connections and setting IPSEC policies for new connection. The
 * only exception is tcp_xmit_listeners_reset() which is called
 * directly from IP and needs to policy check to see if TH_RST
 * can be sent out.
 *
 * PFHooks notes :
 *
 * For mdt case, one meta buffer contains multiple packets. Mblks for every
 * packet are assembled and passed to the hooks. When packets are blocked,
 * or boundary of any packet is changed, the mdt processing is stopped, and
 * packets of the meta buffer are sent to the IP path one by one.
 */

/*
 * Values for squeue switch:
 * 1: squeue_enter_nodrain
 * 2: squeue_enter
 * 3: squeue_fill
 *
 * Both switches default to 2 (squeue_enter).
 */
int tcp_squeue_close = 2;	/* Settable in /etc/system */
int tcp_squeue_wput = 2;

/*
 * Squeue entry functions for the close and wput paths.
 * NOTE(review): presumably selected from the two switches above during
 * initialization; the assignment is not in this part of the file - confirm.
 */
squeue_func_t tcp_squeue_close_proc;
squeue_func_t tcp_squeue_wput_proc;

/*
 * Macros for sodirect:
 *
 * SOD_PTR_ENTER(tcp, sodp) - for the tcp_t pointer "tcp" set the
 * sodirect_t pointer "sodp" to the socket/tcp shared sodirect_t
 * if it exists and is enabled, else to NULL. Note, in the current
 * sodirect implementation the sod_lock must not be held across any
 * STREAMS call (e.g. putnext) else a "recursive mutex_enter" PANIC
 * will result as sod_lock is the streamhead stdata.sd_lock.
 *
 * SOD_NOT_ENABLED(tcp) - return true if not a sodirect tcp_t or the
 * sodirect_t isn't enabled, useful for ASSERT()ing that a receive
 * side tcp code path dealing with a tcp_rcv_list or putnext() isn't
 * being used when sodirect code paths should be.
281 */ 282 283 #define SOD_PTR_ENTER(tcp, sodp) \ 284 (sodp) = (tcp)->tcp_sodirect; \ 285 \ 286 if ((sodp) != NULL) { \ 287 mutex_enter((sodp)->sod_lock); \ 288 if (!((sodp)->sod_state & SOD_ENABLED)) { \ 289 mutex_exit((sodp)->sod_lock); \ 290 (sodp) = NULL; \ 291 } \ 292 } 293 294 #define SOD_NOT_ENABLED(tcp) \ 295 ((tcp)->tcp_sodirect == NULL || \ 296 !((tcp)->tcp_sodirect->sod_state & SOD_ENABLED)) 297 298 /* 299 * This controls how tiny a write must be before we try to copy it 300 * into the the mblk on the tail of the transmit queue. Not much 301 * speedup is observed for values larger than sixteen. Zero will 302 * disable the optimisation. 303 */ 304 int tcp_tx_pull_len = 16; 305 306 /* 307 * TCP Statistics. 308 * 309 * How TCP statistics work. 310 * 311 * There are two types of statistics invoked by two macros. 312 * 313 * TCP_STAT(name) does non-atomic increment of a named stat counter. It is 314 * supposed to be used in non MT-hot paths of the code. 315 * 316 * TCP_DBGSTAT(name) does atomic increment of a named stat counter. It is 317 * supposed to be used for DEBUG purposes and may be used on a hot path. 318 * 319 * Both TCP_STAT and TCP_DBGSTAT counters are available using kstat 320 * (use "kstat tcp" to get them). 321 * 322 * There is also additional debugging facility that marks tcp_clean_death() 323 * instances and saves them in tcp_t structure. It is triggered by 324 * TCP_TAG_CLEAN_DEATH define. Also, there is a global array of counters for 325 * tcp_clean_death() calls that counts the number of times each tag was hit. It 326 * is triggered by TCP_CLD_COUNTERS define. 327 * 328 * How to add new counters. 329 * 330 * 1) Add a field in the tcp_stat structure describing your counter. 331 * 2) Add a line in the template in tcp_kstat2_init() with the name 332 * of the counter. 333 * 334 * IMPORTANT!! - make sure that both are in sync !! 335 * 3) Use either TCP_STAT or TCP_DBGSTAT with the name. 
 *
 * Please avoid using private counters which are not kstat-exported.
 *
 * TCP_TAG_CLEAN_DEATH set to 1 enables tagging of tcp_clean_death() instances
 * in tcp_t structure.
 *
 * TCP_MAX_CLEAN_DEATH_TAG is the maximum number of possible clean death tags.
 */

/*
 * Unless the build pre-defines TCP_DEBUG_COUNTER, the debug counters are
 * compiled in only when DEBUG is defined.
 */
#ifndef TCP_DEBUG_COUNTER
#ifdef DEBUG
#define	TCP_DEBUG_COUNTER 1
#else
#define	TCP_DEBUG_COUNTER 0
#endif
#endif

#define	TCP_CLD_COUNTERS 0

#define	TCP_TAG_CLEAN_DEATH 1
#define	TCP_MAX_CLEAN_DEATH_TAG 32

#ifdef lint
/* Gives the lint-only macro variants below something to reference. */
static int _lint_dummy_;
#endif

/*
 * TCP_CLD_STAT(x) bumps the per-tag clean-death counter (non-atomically)
 * when TCP_CLD_COUNTERS is non-zero; otherwise it expands to nothing
 * (or to a dummy ASSERT under lint).
 */
#if TCP_CLD_COUNTERS
static uint_t tcp_clean_death_stat[TCP_MAX_CLEAN_DEATH_TAG];
#define	TCP_CLD_STAT(x) tcp_clean_death_stat[x]++
#elif defined(lint)
#define	TCP_CLD_STAT(x) ASSERT(_lint_dummy_ == 0);
#else
#define	TCP_CLD_STAT(x)
#endif

/*
 * TCP_DBGSTAT(tcps, x) atomically increments counter x in the given
 * stack's tcps_statistics; TCP_G_DBGSTAT(x) does the same for the global
 * tcp_g_statistics. Both expand to nothing when TCP_DEBUG_COUNTER is 0
 * (or to a dummy ASSERT under lint).
 */
#if TCP_DEBUG_COUNTER
#define	TCP_DBGSTAT(tcps, x)	\
	atomic_add_64(&((tcps)->tcps_statistics.x.value.ui64), 1)
#define	TCP_G_DBGSTAT(x)	\
	atomic_add_64(&(tcp_g_statistics.x.value.ui64), 1)
#elif defined(lint)
#define	TCP_DBGSTAT(tcps, x) ASSERT(_lint_dummy_ == 0);
#define	TCP_G_DBGSTAT(x) ASSERT(_lint_dummy_ == 0);
#else
#define	TCP_DBGSTAT(tcps, x)
#define	TCP_G_DBGSTAT(x)
#endif

/* Non-atomic increment of global counter x; always compiled in. */
#define	TCP_G_STAT(x)	(tcp_g_statistics.x.value.ui64++)

tcp_g_stat_t	tcp_g_statistics;
kstat_t		*tcp_g_kstat;

/*
 * Call either ip_output or ip_output_v6. This replaces putnext() calls on the
 * tcp write side.
392 */ 393 #define CALL_IP_WPUT(connp, q, mp) { \ 394 tcp_stack_t *tcps; \ 395 \ 396 tcps = connp->conn_netstack->netstack_tcp; \ 397 ASSERT(((q)->q_flag & QREADR) == 0); \ 398 TCP_DBGSTAT(tcps, tcp_ip_output); \ 399 connp->conn_send(connp, (mp), (q), IP_WPUT); \ 400 } 401 402 /* Macros for timestamp comparisons */ 403 #define TSTMP_GEQ(a, b) ((int32_t)((a)-(b)) >= 0) 404 #define TSTMP_LT(a, b) ((int32_t)((a)-(b)) < 0) 405 406 /* 407 * Parameters for TCP Initial Send Sequence number (ISS) generation. When 408 * tcp_strong_iss is set to 1, which is the default, the ISS is calculated 409 * by adding three components: a time component which grows by 1 every 4096 410 * nanoseconds (versus every 4 microseconds suggested by RFC 793, page 27); 411 * a per-connection component which grows by 125000 for every new connection; 412 * and an "extra" component that grows by a random amount centered 413 * approximately on 64000. This causes the the ISS generator to cycle every 414 * 4.89 hours if no TCP connections are made, and faster if connections are 415 * made. 416 * 417 * When tcp_strong_iss is set to 0, ISS is calculated by adding two 418 * components: a time component which grows by 250000 every second; and 419 * a per-connection component which grows by 125000 for every new connections. 420 * 421 * A third method, when tcp_strong_iss is set to 2, for generating ISS is 422 * prescribed by Steve Bellovin. This involves adding time, the 125000 per 423 * connection, and a one-way hash (MD5) of the connection ID <sport, dport, 424 * src, dst>, a "truly" random (per RFC 1750) number, and a console-entered 425 * password. 426 */ 427 #define ISS_INCR 250000 428 #define ISS_NSEC_SHT 12 429 430 static sin_t sin_null; /* Zero address for quick clears */ 431 static sin6_t sin6_null; /* Zero address for quick clears */ 432 433 /* 434 * This implementation follows the 4.3BSD interpretation of the urgent 435 * pointer and not RFC 1122. 
 * Switching to RFC 1122 behavior would cause
 * incompatible changes in protocols like telnet and rlogin.
 */
#define	TCP_OLD_URP_INTERPRETATION	1

/* True when tcp is detached and tcp_hard_binding is not set. */
#define	TCP_IS_DETACHED_NONEAGER(tcp)	\
	(TCP_IS_DETACHED(tcp) &&	\
	    (!(tcp)->tcp_hard_binding))

/*
 * TCP reassembly macros. We hide starting and ending sequence numbers in
 * b_next and b_prev of messages on the reassembly queue. The messages are
 * chained using b_cont. These macros are used in tcp_reass() so we don't
 * have to see the ugly casts and assignments.
 *
 * The pointer fields are reused purely as integer storage: values go
 * through uintptr_t on the way in and are truncated to uint32_t on the
 * way out.
 */
#define	TCP_REASS_SEQ(mp)		((uint32_t)(uintptr_t)((mp)->b_next))
#define	TCP_REASS_SET_SEQ(mp, u)	((mp)->b_next = \
					(mblk_t *)(uintptr_t)(u))
#define	TCP_REASS_END(mp)		((uint32_t)(uintptr_t)((mp)->b_prev))
#define	TCP_REASS_SET_END(mp, u)	((mp)->b_prev = \
					(mblk_t *)(uintptr_t)(u))

/*
 * Implementation of TCP Timers.
 * =============================
 *
 * INTERFACE:
 *
 * There are two basic functions and one convenience macro dealing with
 * tcp timers:
 *
 *	timeout_id_t	tcp_timeout(connp, func, time)
 *	clock_t		tcp_timeout_cancel(connp, timeout_id)
 *	TCP_TIMER_RESTART(tcp, intvl)
 *
 * tcp_timeout() starts a timer for the 'tcp' instance arranging to call 'func'
 * after 'time' ticks passed. The function called by timeout() must adhere to
 * the same restrictions as a driver soft interrupt handler - it must not sleep
 * or call other functions that might sleep. The value returned is the opaque
 * non-zero timeout identifier that can be passed to tcp_timeout_cancel() to
 * cancel the request. The call to tcp_timeout() may fail in which case it
 * returns zero. This is different from the timeout(9F) function which never
 * fails.
 *
 * The call-back function 'func' always receives 'connp' as its single
 * argument. It is always executed in the squeue corresponding to the tcp
 * structure.
The tcp structure is guaranteed to be present at the time the 481 * call-back is called. 482 * 483 * NOTE: The call-back function 'func' is never called if tcp is in 484 * the TCPS_CLOSED state. 485 * 486 * tcp_timeout_cancel() attempts to cancel a pending tcp_timeout() 487 * request. locks acquired by the call-back routine should not be held across 488 * the call to tcp_timeout_cancel() or a deadlock may result. 489 * 490 * tcp_timeout_cancel() returns -1 if it can not cancel the timeout request. 491 * Otherwise, it returns an integer value greater than or equal to 0. In 492 * particular, if the call-back function is already placed on the squeue, it can 493 * not be canceled. 494 * 495 * NOTE: both tcp_timeout() and tcp_timeout_cancel() should always be called 496 * within squeue context corresponding to the tcp instance. Since the 497 * call-back is also called via the same squeue, there are no race 498 * conditions described in untimeout(9F) manual page since all calls are 499 * strictly serialized. 500 * 501 * TCP_TIMER_RESTART() is a macro that attempts to cancel a pending timeout 502 * stored in tcp_timer_tid and starts a new one using 503 * MSEC_TO_TICK(intvl). It always uses tcp_timer() function as a call-back 504 * and stores the return value of tcp_timeout() in the tcp->tcp_timer_tid 505 * field. 506 * 507 * NOTE: since the timeout cancellation is not guaranteed, the cancelled 508 * call-back may still be called, so it is possible tcp_timer() will be 509 * called several times. This should not be a problem since tcp_timer() 510 * should always check the tcp instance state. 511 * 512 * 513 * IMPLEMENTATION: 514 * 515 * TCP timers are implemented using three-stage process. The call to 516 * tcp_timeout() uses timeout(9F) function to call tcp_timer_callback() function 517 * when the timer expires. The tcp_timer_callback() arranges the call of the 518 * tcp_timer_handler() function via squeue corresponding to the tcp 519 * instance. 
The tcp_timer_handler() calls actual requested timeout call-back 520 * and passes tcp instance as an argument to it. Information is passed between 521 * stages using the tcp_timer_t structure which contains the connp pointer, the 522 * tcp call-back to call and the timeout id returned by the timeout(9F). 523 * 524 * The tcp_timer_t structure is not used directly, it is embedded in an mblk_t - 525 * like structure that is used to enter an squeue. The mp->b_rptr of this pseudo 526 * mblk points to the beginning of tcp_timer_t structure. The tcp_timeout() 527 * returns the pointer to this mblk. 528 * 529 * The pseudo mblk is allocated from a special tcp_timer_cache kmem cache. It 530 * looks like a normal mblk without actual dblk attached to it. 531 * 532 * To optimize performance each tcp instance holds a small cache of timer 533 * mblocks. In the current implementation it caches up to two timer mblocks per 534 * tcp instance. The cache is preserved over tcp frees and is only freed when 535 * the whole tcp structure is destroyed by its kmem destructor. Since all tcp 536 * timer processing happens on a corresponding squeue, the cache manipulation 537 * does not require any locks. Experiments show that majority of timer mblocks 538 * allocations are satisfied from the tcp cache and do not involve kmem calls. 539 * 540 * The tcp_timeout() places a refhold on the connp instance which guarantees 541 * that it will be present at the time the call-back function fires. The 542 * tcp_timer_handler() drops the reference after calling the call-back, so the 543 * call-back function does not need to manipulate the references explicitly. 
 */

/*
 * Per-timer state carried in the pseudo mblk between the timer stages.
 */
typedef struct tcp_timer_s {
	conn_t	*connp;			/* connection the timer belongs to */
	void	(*tcpt_proc)(void *);	/* tcp call-back to call */
	timeout_id_t	tcpt_tid;	/* id returned by timeout(9F) */
} tcp_timer_t;

static kmem_cache_t *tcp_timercache;
kmem_cache_t	*tcp_sack_info_cache;
kmem_cache_t	*tcp_iphc_cache;

/*
 * For scalability, we must not run a timer for every TCP connection
 * in TIME_WAIT state. To see why, consider (for time wait interval of
 * 4 minutes):
 *	1000 connections/sec * 240 seconds/time wait = 240,000 active conn's
 *
 * This list is ordered by time, so you need only delete from the head
 * until you get to entries which aren't old enough to delete yet.
 * The list consists of only the detached TIME_WAIT connections.
 *
 * Note that the timer (tcp_time_wait_expire) is started when the tcp_t
 * becomes detached TIME_WAIT (either by changing the state and already
 * being detached or the other way around). This means that the TIME_WAIT
 * state can be extended (up to doubled) if the connection doesn't become
 * detached for a long time.
 *
 * The list manipulations (including tcp_time_wait_next/prev)
 * are protected by the tcp_time_wait_lock. The content of the
 * detached TIME_WAIT connections is protected by the normal perimeters.
 *
 * This list is per squeue and squeues are shared across the tcp_stack_t's.
 * Things on tcp_time_wait_head remain associated with the tcp_stack_t
 * and conn_netstack.
 * The tcp_t's that are added to tcp_free_list are disassociated and
 * have NULL tcp_tcps and conn_netstack pointers.
 */
typedef struct tcp_squeue_priv_s {
	kmutex_t	tcp_time_wait_lock;	/* protects the list links */
	timeout_id_t	tcp_time_wait_tid;
	tcp_t		*tcp_time_wait_head;	/* oldest entry */
	tcp_t		*tcp_time_wait_tail;
	tcp_t		*tcp_free_list;		/* disassociated tcp_t's */
	uint_t		tcp_free_list_cnt;
} tcp_squeue_priv_t;

/*
 * TCP_TIME_WAIT_DELAY governs how often the time_wait_collector runs.
 * Running it every 5 seconds seems to give the best results.
594 */ 595 #define TCP_TIME_WAIT_DELAY drv_usectohz(5000000) 596 597 /* 598 * To prevent memory hog, limit the number of entries in tcp_free_list 599 * to 1% of available memory / number of cpus 600 */ 601 uint_t tcp_free_list_max_cnt = 0; 602 603 #define TCP_XMIT_LOWATER 4096 604 #define TCP_XMIT_HIWATER 49152 605 #define TCP_RECV_LOWATER 2048 606 #define TCP_RECV_HIWATER 49152 607 608 /* 609 * PAWS needs a timer for 24 days. This is the number of ticks in 24 days 610 */ 611 #define PAWS_TIMEOUT ((clock_t)(24*24*60*60*hz)) 612 613 #define TIDUSZ 4096 /* transport interface data unit size */ 614 615 /* 616 * Bind hash list size and has function. It has to be a power of 2 for 617 * hashing. 618 */ 619 #define TCP_BIND_FANOUT_SIZE 512 620 #define TCP_BIND_HASH(lport) (ntohs(lport) & (TCP_BIND_FANOUT_SIZE - 1)) 621 /* 622 * Size of listen and acceptor hash list. It has to be a power of 2 for 623 * hashing. 624 */ 625 #define TCP_FANOUT_SIZE 256 626 627 #ifdef _ILP32 628 #define TCP_ACCEPTOR_HASH(accid) \ 629 (((uint_t)(accid) >> 8) & (TCP_FANOUT_SIZE - 1)) 630 #else 631 #define TCP_ACCEPTOR_HASH(accid) \ 632 ((uint_t)(accid) & (TCP_FANOUT_SIZE - 1)) 633 #endif /* _ILP32 */ 634 635 #define IP_ADDR_CACHE_SIZE 2048 636 #define IP_ADDR_CACHE_HASH(faddr) \ 637 (ntohl(faddr) & (IP_ADDR_CACHE_SIZE -1)) 638 639 /* Hash for HSPs uses all 32 bits, since both networks and hosts are in table */ 640 #define TCP_HSP_HASH_SIZE 256 641 642 #define TCP_HSP_HASH(addr) \ 643 (((addr>>24) ^ (addr >>16) ^ \ 644 (addr>>8) ^ (addr)) % TCP_HSP_HASH_SIZE) 645 646 /* 647 * TCP options struct returned from tcp_parse_options. 
 */
typedef struct tcp_opt_s {
	uint32_t	tcp_opt_mss;
	uint32_t	tcp_opt_wscale;
	uint32_t	tcp_opt_ts_val;
	uint32_t	tcp_opt_ts_ecr;
	tcp_t		*tcp;
} tcp_opt_t;

/*
 * RFC1323-recommended phrasing of TSTAMP option, for easier parsing
 *
 * The 32-bit word is the byte sequence NOP, NOP, TSTAMP, 10 (the option
 * length); the two definitions lay those bytes out for each host byte order.
 */

#ifdef _BIG_ENDIAN
#define	TCPOPT_NOP_NOP_TSTAMP ((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | \
	(TCPOPT_TSTAMP << 8) | 10)
#else
#define	TCPOPT_NOP_NOP_TSTAMP ((10 << 24) | (TCPOPT_TSTAMP << 16) | \
	(TCPOPT_NOP << 8) | TCPOPT_NOP)
#endif

/*
 * Flags returned from tcp_parse_options.
 * Each flag is a distinct bit, so several can be reported at once.
 */
#define	TCP_OPT_MSS_PRESENT	1
#define	TCP_OPT_WSCALE_PRESENT	2
#define	TCP_OPT_TSTAMP_PRESENT	4
#define	TCP_OPT_SACK_OK_PRESENT	8
#define	TCP_OPT_SACK_PRESENT	16

/* TCP option length */
#define	TCPOPT_NOP_LEN		1
#define	TCPOPT_MAXSEG_LEN	4
#define	TCPOPT_WS_LEN		3
#define	TCPOPT_REAL_WS_LEN	(TCPOPT_WS_LEN+1)
#define	TCPOPT_TSTAMP_LEN	10
#define	TCPOPT_REAL_TS_LEN	(TCPOPT_TSTAMP_LEN+2)
#define	TCPOPT_SACK_OK_LEN	2
#define	TCPOPT_REAL_SACK_OK_LEN	(TCPOPT_SACK_OK_LEN+2)
#define	TCPOPT_REAL_SACK_LEN	4
#define	TCPOPT_MAX_SACK_LEN	36
#define	TCPOPT_HEADER_LEN	2

/* TCP cwnd burst factor. */
#define	TCP_CWND_INFINITE	65535
#define	TCP_CWND_SS		3
#define	TCP_CWND_NORMAL		5

/* Maximum TCP initial cwin (start/restart). */
#define	TCP_MAX_INIT_CWND	8

/*
 * Initialize cwnd according to RFC 3390. def_max_init_cwnd is
 * either tcp_slow_start_initial or tcp_slow_start_after idle
 * depending on the caller. If the upper layer has not used the
 * TCP_INIT_CWND option to change the initial cwnd, tcp_init_cwnd
 * should be 0 and we use the formula in RFC 3390 to set tcp_cwnd.
 * If the upper layer has set the tcp_init_cwnd, just use
 * it to calculate the tcp_cwnd.
707 */ 708 #define SET_TCP_INIT_CWND(tcp, mss, def_max_init_cwnd) \ 709 { \ 710 if ((tcp)->tcp_init_cwnd == 0) { \ 711 (tcp)->tcp_cwnd = MIN(def_max_init_cwnd * (mss), \ 712 MIN(4 * (mss), MAX(2 * (mss), 4380 / (mss) * (mss)))); \ 713 } else { \ 714 (tcp)->tcp_cwnd = (tcp)->tcp_init_cwnd * (mss); \ 715 } \ 716 tcp->tcp_cwnd_cnt = 0; \ 717 } 718 719 /* TCP Timer control structure */ 720 typedef struct tcpt_s { 721 pfv_t tcpt_pfv; /* The routine we are to call */ 722 tcp_t *tcpt_tcp; /* The parameter we are to pass in */ 723 } tcpt_t; 724 725 /* Host Specific Parameter structure */ 726 typedef struct tcp_hsp { 727 struct tcp_hsp *tcp_hsp_next; 728 in6_addr_t tcp_hsp_addr_v6; 729 in6_addr_t tcp_hsp_subnet_v6; 730 uint_t tcp_hsp_vers; /* IPV4_VERSION | IPV6_VERSION */ 731 int32_t tcp_hsp_sendspace; 732 int32_t tcp_hsp_recvspace; 733 int32_t tcp_hsp_tstamp; 734 } tcp_hsp_t; 735 #define tcp_hsp_addr V4_PART_OF_V6(tcp_hsp_addr_v6) 736 #define tcp_hsp_subnet V4_PART_OF_V6(tcp_hsp_subnet_v6) 737 738 /* 739 * Functions called directly via squeue having a prototype of edesc_t. 
 */
/* Every entry in this group takes (void *arg, mblk_t *mp, void *arg2). */
void		tcp_conn_request(void *arg, mblk_t *mp, void *arg2);
static void	tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2);
void		tcp_accept_finish(void *arg, mblk_t *mp, void *arg2);
static void	tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2);
static void	tcp_wput_proto(void *arg, mblk_t *mp, void *arg2);
void		tcp_input(void *arg, mblk_t *mp, void *arg2);
void		tcp_rput_data(void *arg, mblk_t *mp, void *arg2);
static void	tcp_close_output(void *arg, mblk_t *mp, void *arg2);
void		tcp_output(void *arg, mblk_t *mp, void *arg2);
static void	tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2);
static void	tcp_timer_handler(void *arg, mblk_t *mp, void *arg2);
static void	tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2);


/*
 * Prototypes for the remaining TCP functions. Those declared without
 * "static" have external linkage.
 */
static void	tcp_random_init(void);
int		tcp_random(void);
static void	tcp_accept(tcp_t *tcp, mblk_t *mp);
static void	tcp_accept_swap(tcp_t *listener, tcp_t *acceptor,
		    tcp_t *eager);
static int	tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp);
static in_port_t tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
    int reuseaddr, boolean_t quick_connect, boolean_t bind_to_req_port_only,
    boolean_t user_specified);
static void	tcp_closei_local(tcp_t *tcp);
static void	tcp_close_detached(tcp_t *tcp);
static boolean_t tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph,
		    mblk_t *idmp, mblk_t **defermp);
static void	tcp_connect(tcp_t *tcp, mblk_t *mp);
static void	tcp_connect_ipv4(tcp_t *tcp, mblk_t *mp, ipaddr_t *dstaddrp,
		    in_port_t dstport, uint_t srcid);
static void	tcp_connect_ipv6(tcp_t *tcp, mblk_t *mp, in6_addr_t *dstaddrp,
		    in_port_t dstport, uint32_t flowinfo, uint_t srcid,
		    uint32_t scope_id);
static int	tcp_clean_death(tcp_t *tcp, int err, uint8_t tag);
static void	tcp_def_q_set(tcp_t *tcp, mblk_t *mp);
static void	tcp_disconnect(tcp_t *tcp, mblk_t *mp);
static char	*tcp_display(tcp_t *tcp, char *, char);
static boolean_t tcp_eager_blowoff(tcp_t *listener, t_scalar_t seqnum);
static void	tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only);
static void	tcp_eager_unlink(tcp_t *tcp);
static void	tcp_err_ack(tcp_t *tcp, mblk_t *mp, int tlierr,
		    int unixerr);
static void	tcp_err_ack_prim(tcp_t *tcp, mblk_t *mp, int primitive,
		    int tlierr, int unixerr);
static int	tcp_extra_priv_ports_get(queue_t *q, mblk_t *mp, caddr_t cp,
		    cred_t *cr);
static int	tcp_extra_priv_ports_add(queue_t *q, mblk_t *mp,
		    char *value, caddr_t cp, cred_t *cr);
static int	tcp_extra_priv_ports_del(queue_t *q, mblk_t *mp,
		    char *value, caddr_t cp, cred_t *cr);
static int	tcp_tpistate(tcp_t *tcp);
static void	tcp_bind_hash_insert(tf_t *tf, tcp_t *tcp,
    int caller_holds_lock);
static void	tcp_bind_hash_remove(tcp_t *tcp);
static tcp_t	*tcp_acceptor_hash_lookup(t_uscalar_t id, tcp_stack_t *);
void		tcp_acceptor_hash_insert(t_uscalar_t id, tcp_t *tcp);
static void	tcp_acceptor_hash_remove(tcp_t *tcp);
static void	tcp_capability_req(tcp_t *tcp, mblk_t *mp);
static void	tcp_info_req(tcp_t *tcp, mblk_t *mp);
static void	tcp_addr_req(tcp_t *tcp, mblk_t *mp);
static void	tcp_addr_req_ipv6(tcp_t *tcp, mblk_t *mp);
void		tcp_g_q_setup(tcp_stack_t *);
void		tcp_g_q_create(tcp_stack_t *);
void		tcp_g_q_destroy(tcp_stack_t *);
static int	tcp_header_init_ipv4(tcp_t *tcp);
static int	tcp_header_init_ipv6(tcp_t *tcp);
int		tcp_init(tcp_t *tcp, queue_t *q);
static int	tcp_init_values(tcp_t *tcp);
static mblk_t	*tcp_ip_advise_mblk(void *addr, int addr_len, ipic_t **ipic);
static mblk_t	*tcp_ip_bind_mp(tcp_t *tcp, t_scalar_t bind_prim,
		    t_scalar_t addr_length);
static void	tcp_ip_ire_mark_advice(tcp_t *tcp);
static void	tcp_ip_notify(tcp_t *tcp);
static mblk_t	*tcp_ire_mp(mblk_t *mp);
static void	tcp_iss_init(tcp_t *tcp);
static void	tcp_keepalive_killer(void *arg);
static int	tcp_parse_options(tcph_t *tcph, tcp_opt_t *tcpopt);
static void	tcp_mss_set(tcp_t *tcp, uint32_t size, boolean_t do_ss);
static int	tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp,
		    int *do_disconnectp, int *t_errorp, int *sys_errorp);
static boolean_t tcp_allow_connopt_set(int level, int name);
int		tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr);
int		tcp_opt_get(queue_t *q, int level, int name, uchar_t *ptr);
int		tcp_opt_set(queue_t *q, uint_t optset_context, int level,
		    int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp,
		    uchar_t *outvalp, void *thisdg_attrs, cred_t *cr,
		    mblk_t *mblk);
static void	tcp_opt_reverse(tcp_t *tcp, ipha_t *ipha);
static int	tcp_opt_set_header(tcp_t *tcp, boolean_t checkonly,
		    uchar_t *ptr, uint_t len);
static int	tcp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr);
static boolean_t tcp_param_register(IDP *ndp, tcpparam_t *tcppa, int cnt,
    tcp_stack_t *);
static int	tcp_param_set(queue_t *q, mblk_t *mp, char *value,
		    caddr_t cp, cred_t *cr);
static int	tcp_param_set_aligned(queue_t *q, mblk_t *mp, char *value,
		    caddr_t cp, cred_t *cr);
static void	tcp_iss_key_init(uint8_t *phrase, int len, tcp_stack_t *);
static int	tcp_1948_phrase_set(queue_t *q, mblk_t *mp, char *value,
		    caddr_t cp, cred_t *cr);
static void	tcp_process_shrunk_swnd(tcp_t *tcp, uint32_t shrunk_cnt);
static mblk_t	*tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start);
static void	tcp_reass_elim_overlap(tcp_t *tcp, mblk_t *mp);
static void	tcp_reinit(tcp_t *tcp);
static void	tcp_reinit_values(tcp_t *tcp);
static void	tcp_report_item(mblk_t *mp, tcp_t *tcp, int hashval,
		    tcp_t *thisstream, cred_t *cr);

static uint_t	tcp_rcv_drain(queue_t *q, tcp_t *tcp);
static void	tcp_sack_rxmit(tcp_t *tcp, uint_t *flags);
static boolean_t tcp_send_rst_chk(tcp_stack_t *);
static void tcp_ss_rexmit(tcp_t *tcp); 854 static mblk_t *tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp); 855 static void tcp_process_options(tcp_t *, tcph_t *); 856 static void tcp_rput_common(tcp_t *tcp, mblk_t *mp); 857 static void tcp_rsrv(queue_t *q); 858 static int tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd); 859 static int tcp_snmp_state(tcp_t *tcp); 860 static int tcp_status_report(queue_t *q, mblk_t *mp, caddr_t cp, 861 cred_t *cr); 862 static int tcp_bind_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, 863 cred_t *cr); 864 static int tcp_listen_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, 865 cred_t *cr); 866 static int tcp_conn_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, 867 cred_t *cr); 868 static int tcp_acceptor_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, 869 cred_t *cr); 870 static int tcp_host_param_set(queue_t *q, mblk_t *mp, char *value, 871 caddr_t cp, cred_t *cr); 872 static int tcp_host_param_set_ipv6(queue_t *q, mblk_t *mp, char *value, 873 caddr_t cp, cred_t *cr); 874 static int tcp_host_param_report(queue_t *q, mblk_t *mp, caddr_t cp, 875 cred_t *cr); 876 static void tcp_timer(void *arg); 877 static void tcp_timer_callback(void *); 878 static in_port_t tcp_update_next_port(in_port_t port, const tcp_t *tcp, 879 boolean_t random); 880 static in_port_t tcp_get_next_priv_port(const tcp_t *); 881 static void tcp_wput_sock(queue_t *q, mblk_t *mp); 882 void tcp_wput_accept(queue_t *q, mblk_t *mp); 883 static void tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent); 884 static void tcp_wput_flush(tcp_t *tcp, mblk_t *mp); 885 static void tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp); 886 static int tcp_send(queue_t *q, tcp_t *tcp, const int mss, 887 const int tcp_hdr_len, const int tcp_tcp_hdr_len, 888 const int num_sack_blk, int *usable, uint_t *snxt, 889 int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time, 890 const int mdt_thres); 891 static int tcp_multisend(queue_t *q, tcp_t *tcp, const int mss, 892 const int 
tcp_hdr_len, const int tcp_tcp_hdr_len, 893 const int num_sack_blk, int *usable, uint_t *snxt, 894 int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time, 895 const int mdt_thres); 896 static void tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now, 897 int num_sack_blk); 898 static void tcp_wsrv(queue_t *q); 899 static int tcp_xmit_end(tcp_t *tcp); 900 static void tcp_ack_timer(void *arg); 901 static mblk_t *tcp_ack_mp(tcp_t *tcp); 902 static void tcp_xmit_early_reset(char *str, mblk_t *mp, 903 uint32_t seq, uint32_t ack, int ctl, uint_t ip_hdr_len, 904 zoneid_t zoneid, tcp_stack_t *, conn_t *connp); 905 static void tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq, 906 uint32_t ack, int ctl); 907 static tcp_hsp_t *tcp_hsp_lookup(ipaddr_t addr, tcp_stack_t *); 908 static tcp_hsp_t *tcp_hsp_lookup_ipv6(in6_addr_t *addr, tcp_stack_t *); 909 static int setmaxps(queue_t *q, int maxpsz); 910 static void tcp_set_rto(tcp_t *, time_t); 911 static boolean_t tcp_check_policy(tcp_t *, mblk_t *, ipha_t *, ip6_t *, 912 boolean_t, boolean_t); 913 static void tcp_icmp_error_ipv6(tcp_t *tcp, mblk_t *mp, 914 boolean_t ipsec_mctl); 915 static mblk_t *tcp_setsockopt_mp(int level, int cmd, 916 char *opt, int optlen); 917 static int tcp_build_hdrs(queue_t *, tcp_t *); 918 static void tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, 919 uint32_t seg_seq, uint32_t seg_ack, int seg_len, 920 tcph_t *tcph); 921 boolean_t tcp_paws_check(tcp_t *tcp, tcph_t *tcph, tcp_opt_t *tcpoptp); 922 boolean_t tcp_reserved_port_add(int, in_port_t *, in_port_t *); 923 boolean_t tcp_reserved_port_del(in_port_t, in_port_t); 924 boolean_t tcp_reserved_port_check(in_port_t, tcp_stack_t *); 925 static tcp_t *tcp_alloc_temp_tcp(in_port_t, tcp_stack_t *); 926 static int tcp_reserved_port_list(queue_t *, mblk_t *, caddr_t, cred_t *); 927 static mblk_t *tcp_mdt_info_mp(mblk_t *); 928 static void tcp_mdt_update(tcp_t *, ill_mdt_capab_t *, boolean_t); 929 static int tcp_mdt_add_attrs(multidata_t *, const mblk_t 
*, 930 const boolean_t, const uint32_t, const uint32_t, 931 const uint32_t, const uint32_t, tcp_stack_t *); 932 static void tcp_multisend_data(tcp_t *, ire_t *, const ill_t *, mblk_t *, 933 const uint_t, const uint_t, boolean_t *); 934 static mblk_t *tcp_lso_info_mp<