1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 /* Copyright (c) 1990 Mentat Inc. 
*/ 27 28 #include <sys/types.h> 29 #include <sys/stream.h> 30 #include <sys/strsun.h> 31 #include <sys/strsubr.h> 32 #include <sys/stropts.h> 33 #include <sys/strlog.h> 34 #define _SUN_TPI_VERSION 2 35 #include <sys/tihdr.h> 36 #include <sys/timod.h> 37 #include <sys/ddi.h> 38 #include <sys/sunddi.h> 39 #include <sys/suntpi.h> 40 #include <sys/xti_inet.h> 41 #include <sys/cmn_err.h> 42 #include <sys/debug.h> 43 #include <sys/sdt.h> 44 #include <sys/vtrace.h> 45 #include <sys/kmem.h> 46 #include <sys/ethernet.h> 47 #include <sys/cpuvar.h> 48 #include <sys/dlpi.h> 49 #include <sys/multidata.h> 50 #include <sys/multidata_impl.h> 51 #include <sys/pattr.h> 52 #include <sys/policy.h> 53 #include <sys/priv.h> 54 #include <sys/zone.h> 55 #include <sys/sunldi.h> 56 57 #include <sys/errno.h> 58 #include <sys/signal.h> 59 #include <sys/socket.h> 60 #include <sys/socketvar.h> 61 #include <sys/sockio.h> 62 #include <sys/isa_defs.h> 63 #include <sys/md5.h> 64 #include <sys/random.h> 65 #include <sys/uio.h> 66 #include <sys/systm.h> 67 #include <netinet/in.h> 68 #include <netinet/tcp.h> 69 #include <netinet/ip6.h> 70 #include <netinet/icmp6.h> 71 #include <net/if.h> 72 #include <net/route.h> 73 #include <inet/ipsec_impl.h> 74 75 #include <inet/common.h> 76 #include <inet/ip.h> 77 #include <inet/ip_impl.h> 78 #include <inet/ip6.h> 79 #include <inet/ip_ndp.h> 80 #include <inet/proto_set.h> 81 #include <inet/mib2.h> 82 #include <inet/nd.h> 83 #include <inet/optcom.h> 84 #include <inet/snmpcom.h> 85 #include <inet/kstatcom.h> 86 #include <inet/tcp.h> 87 #include <inet/tcp_impl.h> 88 #include <inet/udp_impl.h> 89 #include <net/pfkeyv2.h> 90 #include <inet/ipsec_info.h> 91 #include <inet/ipdrop.h> 92 93 #include <inet/ipclassifier.h> 94 #include <inet/ip_ire.h> 95 #include <inet/ip_ftable.h> 96 #include <inet/ip_if.h> 97 #include <inet/ipp_common.h> 98 #include <inet/ip_netinfo.h> 99 #include <sys/squeue_impl.h> 100 #include <sys/squeue.h> 101 #include <inet/kssl/ksslapi.h> 102 
#include <sys/tsol/label.h> 103 #include <sys/tsol/tnet.h> 104 #include <rpc/pmap_prot.h> 105 #include <sys/callo.h> 106 107 /* 108 * TCP Notes: aka FireEngine Phase I (PSARC 2002/433) 109 * 110 * (Read the detailed design doc in PSARC case directory) 111 * 112 * The entire tcp state is contained in tcp_t and conn_t structure 113 * which are allocated in tandem using ipcl_conn_create() and passing 114 * IPCL_CONNTCP as a flag. We use 'conn_ref' and 'conn_lock' to protect 115 * the references on the tcp_t. The tcp_t structure is never compressed 116 * and packets always land on the correct TCP perimeter from the time 117 * eager is created till the time tcp_t dies (as such the old mentat 118 * TCP global queue is not used for detached state and no IPSEC checking 119 * is required). The global queue is still allocated to send out resets 120 * for connection which have no listeners and IP directly calls 121 * tcp_xmit_listeners_reset() which does any policy check. 122 * 123 * Protection and Synchronisation mechanism: 124 * 125 * The tcp data structure does not use any kind of lock for protecting 126 * its state but instead uses 'squeues' for mutual exclusion from various 127 * read and write side threads. To access a tcp member, the thread should 128 * always be behind squeue (via squeue_enter with flags as SQ_FILL, SQ_PROCESS, 129 * or SQ_NODRAIN). Since the squeues allow a direct function call, caller 130 * can pass any tcp function having prototype of edesc_t as argument 131 * (different from traditional STREAMs model where packets come in only 132 * designated entry points). The list of functions that can be directly 133 * called via squeue are listed before the usual function prototype. 134 * 135 * Referencing: 136 * 137 * TCP is MT-Hot and we use a reference based scheme to make sure that the 138 * tcp structure doesn't disappear when its needed. 
 * When the application
 * creates an outgoing connection or accepts an incoming connection, we
 * start out with 2 references on 'conn_ref'. One for TCP and one for IP.
 * The IP reference is just a symbolic reference since ip_tcpclose()
 * looks at the tcp structure after tcp_close_output() returns which could
 * have dropped the last TCP reference. So as long as the connection is
 * in attached state i.e. !TCP_IS_DETACHED, we have 2 references on the
 * conn_t. The classifier puts its own reference when the connection is
 * inserted in listen or connected hash. Anytime a thread needs to enter
 * the tcp connection perimeter, it retrieves the conn/tcp from q->ptr
 * on write side or by doing a classify on read side and then puts a
 * reference on the conn before doing squeue_enter/tryenter/fill. For
 * read side, the classifier itself puts the reference under fanout lock
 * to make sure that tcp can't disappear before it gets processed. The
 * squeue will drop this reference automatically so the called function
 * doesn't have to do a DEC_REF.
 *
 * Opening a new connection:
 *
 * The outgoing connection open is pretty simple. tcp_open() does the
 * work in creating the conn/tcp structure and initializing it. The
 * squeue assignment is done based on the CPU the application
 * is running on. So for outbound connections, processing is always done
 * on application CPU which might be different from the incoming CPU
 * being interrupted by the NIC. An optimal way would be to figure out
 * the NIC <-> CPU binding at listen time, and assign the outgoing
 * connection to the squeue attached to the CPU that will be interrupted
 * for incoming packets (we know the NIC based on the bind IP address).
 * This might seem like a problem if more data is going out but the
 * fact is that in most cases the transmit is ACK driven transmit where
 * the outgoing data normally sits on TCP's xmit queue waiting to be
 * transmitted.
 *
 * Accepting a connection:
 *
 * This is a more interesting case because of various races involved in
 * establishing an eager in its own perimeter. Read the meta comment on
 * top of tcp_conn_request(). But briefly, the squeue is picked by
 * ip_tcp_input()/ip_fanout_tcp_v6() based on the interrupted CPU.
 *
 * Closing a connection:
 *
 * The close is fairly straightforward. tcp_close() calls tcp_close_output()
 * via squeue to do the close and mark the tcp as detached if the connection
 * was in state TCPS_ESTABLISHED or greater. In the latter case, TCP keeps its
 * reference but tcp_close() always drops IP's reference. So if tcp was
 * not killed, it is sitting in time_wait list with 2 references - 1 for TCP
 * and 1 because it is in classifier's connected hash. This is the condition
 * we use to determine that it's OK to clean up the tcp outside of squeue
 * when time wait expires (check the ref under fanout and conn_lock and
 * if it is 2, remove it from fanout hash and kill it).
 *
 * Although close just drops the necessary references and marks the
 * tcp_detached state, tcp_close needs to know the tcp_detached has been
 * set (under squeue) before letting the STREAM go away (because an
 * inbound packet might attempt to go up the STREAM while the close
 * has happened and tcp_detached is not set). So a special lock and
 * flag is used along with a condition variable (tcp_closelock, tcp_closed,
 * and tcp_closecv) to signal tcp_close that tcp_close_output() has marked
 * tcp_detached.
 *
 * Special provisions and fast paths:
 *
 * We make special provision for (AF_INET, SOCK_STREAM) sockets which
 * can't have 'ipv6_recvpktinfo' set and for these types of sockets, IP
 * will never send a M_CTL to TCP. As such, ip_tcp_input() which handles
 * all TCP packets from the wire makes an IPCL_IS_TCP4_CONNECTED_NO_POLICY
 * check to send packets directly to tcp_rput_data via squeue. Everyone
 * else comes through tcp_input() on the read side.
 *
 * We also make special provisions for sockfs by marking tcp_issocket
 * whenever we have only sockfs on top of TCP. This allows us to skip
 * putting the tcp in acceptor hash since a sockfs listener can never
 * become acceptor and also avoid allocating a tcp_t for acceptor STREAM
 * since eager has already been allocated and the accept now happens
 * on acceptor STREAM. There is a big blob of comment on top of
 * tcp_conn_request explaining the new accept. When socket is POP'd,
 * sockfs sends us an ioctl to mark the fact and we go back to old
 * behaviour. Once tcp_issocket is unset, it's never set for the
 * life of that connection.
 *
 * IPsec notes :
 *
 * Since a packet is always executed on the correct TCP perimeter
 * all IPsec processing is deferred to IP including checking new
 * connections and setting IPSEC policies for new connection. The
 * only exception is tcp_xmit_listeners_reset() which is called
 * directly from IP and needs to policy check to see if TH_RST
 * can be sent out.
 *
 * PFHooks notes :
 *
 * For mdt case, one meta buffer contains multiple packets. Mblks for every
 * packet are assembled and passed to the hooks. When packets are blocked,
 * or boundary of any packet is changed, the mdt processing is stopped, and
 * packets of the meta buffer are sent to the IP path one by one.
234 */ 235 236 /* 237 * Values for squeue switch: 238 * 1: SQ_NODRAIN 239 * 2: SQ_PROCESS 240 * 3: SQ_FILL 241 */ 242 int tcp_squeue_wput = 2; /* /etc/systems */ 243 int tcp_squeue_flag; 244 245 /* 246 * This controls how tiny a write must be before we try to copy it 247 * into the the mblk on the tail of the transmit queue. Not much 248 * speedup is observed for values larger than sixteen. Zero will 249 * disable the optimisation. 250 */ 251 int tcp_tx_pull_len = 16; 252 253 /* 254 * TCP Statistics. 255 * 256 * How TCP statistics work. 257 * 258 * There are two types of statistics invoked by two macros. 259 * 260 * TCP_STAT(name) does non-atomic increment of a named stat counter. It is 261 * supposed to be used in non MT-hot paths of the code. 262 * 263 * TCP_DBGSTAT(name) does atomic increment of a named stat counter. It is 264 * supposed to be used for DEBUG purposes and may be used on a hot path. 265 * 266 * Both TCP_STAT and TCP_DBGSTAT counters are available using kstat 267 * (use "kstat tcp" to get them). 268 * 269 * There is also additional debugging facility that marks tcp_clean_death() 270 * instances and saves them in tcp_t structure. It is triggered by 271 * TCP_TAG_CLEAN_DEATH define. Also, there is a global array of counters for 272 * tcp_clean_death() calls that counts the number of times each tag was hit. It 273 * is triggered by TCP_CLD_COUNTERS define. 274 * 275 * How to add new counters. 276 * 277 * 1) Add a field in the tcp_stat structure describing your counter. 278 * 2) Add a line in the template in tcp_kstat2_init() with the name 279 * of the counter. 280 * 281 * IMPORTANT!! - make sure that both are in sync !! 282 * 3) Use either TCP_STAT or TCP_DBGSTAT with the name. 283 * 284 * Please avoid using private counters which are not kstat-exported. 285 * 286 * TCP_TAG_CLEAN_DEATH set to 1 enables tagging of tcp_clean_death() instances 287 * in tcp_t structure. 
288 * 289 * TCP_MAX_CLEAN_DEATH_TAG is the maximum number of possible clean death tags. 290 */ 291 292 #ifndef TCP_DEBUG_COUNTER 293 #ifdef DEBUG 294 #define TCP_DEBUG_COUNTER 1 295 #else 296 #define TCP_DEBUG_COUNTER 0 297 #endif 298 #endif 299 300 #define TCP_CLD_COUNTERS 0 301 302 #define TCP_TAG_CLEAN_DEATH 1 303 #define TCP_MAX_CLEAN_DEATH_TAG 32 304 305 #ifdef lint 306 static int _lint_dummy_; 307 #endif 308 309 #if TCP_CLD_COUNTERS 310 static uint_t tcp_clean_death_stat[TCP_MAX_CLEAN_DEATH_TAG]; 311 #define TCP_CLD_STAT(x) tcp_clean_death_stat[x]++ 312 #elif defined(lint) 313 #define TCP_CLD_STAT(x) ASSERT(_lint_dummy_ == 0); 314 #else 315 #define TCP_CLD_STAT(x) 316 #endif 317 318 #if TCP_DEBUG_COUNTER 319 #define TCP_DBGSTAT(tcps, x) \ 320 atomic_add_64(&((tcps)->tcps_statistics.x.value.ui64), 1) 321 #define TCP_G_DBGSTAT(x) \ 322 atomic_add_64(&(tcp_g_statistics.x.value.ui64), 1) 323 #elif defined(lint) 324 #define TCP_DBGSTAT(tcps, x) ASSERT(_lint_dummy_ == 0); 325 #define TCP_G_DBGSTAT(x) ASSERT(_lint_dummy_ == 0); 326 #else 327 #define TCP_DBGSTAT(tcps, x) 328 #define TCP_G_DBGSTAT(x) 329 #endif 330 331 #define TCP_G_STAT(x) (tcp_g_statistics.x.value.ui64++) 332 333 tcp_g_stat_t tcp_g_statistics; 334 kstat_t *tcp_g_kstat; 335 336 /* 337 * Call either ip_output or ip_output_v6. This replaces putnext() calls on the 338 * tcp write side. 339 */ 340 #define CALL_IP_WPUT(connp, q, mp) { \ 341 ASSERT(((q)->q_flag & QREADR) == 0); \ 342 TCP_DBGSTAT(connp->conn_netstack->netstack_tcp, tcp_ip_output); \ 343 connp->conn_send(connp, (mp), (q), IP_WPUT); \ 344 } 345 346 /* Macros for timestamp comparisons */ 347 #define TSTMP_GEQ(a, b) ((int32_t)((a)-(b)) >= 0) 348 #define TSTMP_LT(a, b) ((int32_t)((a)-(b)) < 0) 349 350 /* 351 * Parameters for TCP Initial Send Sequence number (ISS) generation. 
When 352 * tcp_strong_iss is set to 1, which is the default, the ISS is calculated 353 * by adding three components: a time component which grows by 1 every 4096 354 * nanoseconds (versus every 4 microseconds suggested by RFC 793, page 27); 355 * a per-connection component which grows by 125000 for every new connection; 356 * and an "extra" component that grows by a random amount centered 357 * approximately on 64000. This causes the the ISS generator to cycle every 358 * 4.89 hours if no TCP connections are made, and faster if connections are 359 * made. 360 * 361 * When tcp_strong_iss is set to 0, ISS is calculated by adding two 362 * components: a time component which grows by 250000 every second; and 363 * a per-connection component which grows by 125000 for every new connections. 364 * 365 * A third method, when tcp_strong_iss is set to 2, for generating ISS is 366 * prescribed by Steve Bellovin. This involves adding time, the 125000 per 367 * connection, and a one-way hash (MD5) of the connection ID <sport, dport, 368 * src, dst>, a "truly" random (per RFC 1750) number, and a console-entered 369 * password. 370 */ 371 #define ISS_INCR 250000 372 #define ISS_NSEC_SHT 12 373 374 static sin_t sin_null; /* Zero address for quick clears */ 375 static sin6_t sin6_null; /* Zero address for quick clears */ 376 377 /* 378 * This implementation follows the 4.3BSD interpretation of the urgent 379 * pointer and not RFC 1122. Switching to RFC 1122 behavior would cause 380 * incompatible changes in protocols like telnet and rlogin. 381 */ 382 #define TCP_OLD_URP_INTERPRETATION 1 383 384 #define TCP_IS_DETACHED_NONEAGER(tcp) \ 385 (TCP_IS_DETACHED(tcp) && \ 386 (!(tcp)->tcp_hard_binding)) 387 388 /* 389 * TCP reassembly macros. We hide starting and ending sequence numbers in 390 * b_next and b_prev of messages on the reassembly queue. The messages are 391 * chained using b_cont. 
These macros are used in tcp_reass() so we don't 392 * have to see the ugly casts and assignments. 393 */ 394 #define TCP_REASS_SEQ(mp) ((uint32_t)(uintptr_t)((mp)->b_next)) 395 #define TCP_REASS_SET_SEQ(mp, u) ((mp)->b_next = \ 396 (mblk_t *)(uintptr_t)(u)) 397 #define TCP_REASS_END(mp) ((uint32_t)(uintptr_t)((mp)->b_prev)) 398 #define TCP_REASS_SET_END(mp, u) ((mp)->b_prev = \ 399 (mblk_t *)(uintptr_t)(u)) 400 401 /* 402 * Implementation of TCP Timers. 403 * ============================= 404 * 405 * INTERFACE: 406 * 407 * There are two basic functions dealing with tcp timers: 408 * 409 * timeout_id_t tcp_timeout(connp, func, time) 410 * clock_t tcp_timeout_cancel(connp, timeout_id) 411 * TCP_TIMER_RESTART(tcp, intvl) 412 * 413 * tcp_timeout() starts a timer for the 'tcp' instance arranging to call 'func' 414 * after 'time' ticks passed. The function called by timeout() must adhere to 415 * the same restrictions as a driver soft interrupt handler - it must not sleep 416 * or call other functions that might sleep. The value returned is the opaque 417 * non-zero timeout identifier that can be passed to tcp_timeout_cancel() to 418 * cancel the request. The call to tcp_timeout() may fail in which case it 419 * returns zero. This is different from the timeout(9F) function which never 420 * fails. 421 * 422 * The call-back function 'func' always receives 'connp' as its single 423 * argument. It is always executed in the squeue corresponding to the tcp 424 * structure. The tcp structure is guaranteed to be present at the time the 425 * call-back is called. 426 * 427 * NOTE: The call-back function 'func' is never called if tcp is in 428 * the TCPS_CLOSED state. 429 * 430 * tcp_timeout_cancel() attempts to cancel a pending tcp_timeout() 431 * request. locks acquired by the call-back routine should not be held across 432 * the call to tcp_timeout_cancel() or a deadlock may result. 433 * 434 * tcp_timeout_cancel() returns -1 if it can not cancel the timeout request. 
 * Otherwise, it returns an integer value greater than or equal to 0. In
 * particular, if the call-back function is already placed on the squeue, it
 * can not be canceled.
 *
 * NOTE: both tcp_timeout() and tcp_timeout_cancel() should always be called
 *	within squeue context corresponding to the tcp instance. Since the
 *	call-back is also called via the same squeue, there are no race
 *	conditions described in untimeout(9F) manual page since all calls are
 *	strictly serialized.
 *
 * TCP_TIMER_RESTART() is a macro that attempts to cancel a pending timeout
 * stored in tcp_timer_tid and starts a new one using
 * MSEC_TO_TICK(intvl). It always uses tcp_timer() function as a call-back
 * and stores the return value of tcp_timeout() in the tcp->tcp_timer_tid
 * field.
 *
 * NOTE: since the timeout cancellation is not guaranteed, the cancelled
 *	call-back may still be called, so it is possible tcp_timer() will be
 *	called several times. This should not be a problem since tcp_timer()
 *	should always check the tcp instance state.
 *
 *
 * IMPLEMENTATION:
 *
 * TCP timers are implemented using a three-stage process. The call to
 * tcp_timeout() uses timeout(9F) function to call tcp_timer_callback() function
 * when the timer expires. The tcp_timer_callback() arranges the call of the
 * tcp_timer_handler() function via squeue corresponding to the tcp
 * instance. The tcp_timer_handler() calls actual requested timeout call-back
 * and passes tcp instance as an argument to it. Information is passed between
 * stages using the tcp_timer_t structure which contains the connp pointer, the
 * tcp call-back to call and the timeout id returned by the timeout(9F).
 *
 * The tcp_timer_t structure is not used directly, it is embedded in an
 * mblk_t-like structure that is used to enter an squeue.
The mp->b_rptr of this pseudo 470 * mblk points to the beginning of tcp_timer_t structure. The tcp_timeout() 471 * returns the pointer to this mblk. 472 * 473 * The pseudo mblk is allocated from a special tcp_timer_cache kmem cache. It 474 * looks like a normal mblk without actual dblk attached to it. 475 * 476 * To optimize performance each tcp instance holds a small cache of timer 477 * mblocks. In the current implementation it caches up to two timer mblocks per 478 * tcp instance. The cache is preserved over tcp frees and is only freed when 479 * the whole tcp structure is destroyed by its kmem destructor. Since all tcp 480 * timer processing happens on a corresponding squeue, the cache manipulation 481 * does not require any locks. Experiments show that majority of timer mblocks 482 * allocations are satisfied from the tcp cache and do not involve kmem calls. 483 * 484 * The tcp_timeout() places a refhold on the connp instance which guarantees 485 * that it will be present at the time the call-back function fires. The 486 * tcp_timer_handler() drops the reference after calling the call-back, so the 487 * call-back function does not need to manipulate the references explicitly. 488 */ 489 490 typedef struct tcp_timer_s { 491 conn_t *connp; 492 void (*tcpt_proc)(void *); 493 callout_id_t tcpt_tid; 494 } tcp_timer_t; 495 496 static kmem_cache_t *tcp_timercache; 497 kmem_cache_t *tcp_sack_info_cache; 498 kmem_cache_t *tcp_iphc_cache; 499 500 /* 501 * For scalability, we must not run a timer for every TCP connection 502 * in TIME_WAIT state. To see why, consider (for time wait interval of 503 * 4 minutes): 504 * 1000 connections/sec * 240 seconds/time wait = 240,000 active conn's 505 * 506 * This list is ordered by time, so you need only delete from the head 507 * until you get to entries which aren't old enough to delete yet. 508 * The list consists of only the detached TIME_WAIT connections. 
509 * 510 * Note that the timer (tcp_time_wait_expire) is started when the tcp_t 511 * becomes detached TIME_WAIT (either by changing the state and already 512 * being detached or the other way around). This means that the TIME_WAIT 513 * state can be extended (up to doubled) if the connection doesn't become 514 * detached for a long time. 515 * 516 * The list manipulations (including tcp_time_wait_next/prev) 517 * are protected by the tcp_time_wait_lock. The content of the 518 * detached TIME_WAIT connections is protected by the normal perimeters. 519 * 520 * This list is per squeue and squeues are shared across the tcp_stack_t's. 521 * Things on tcp_time_wait_head remain associated with the tcp_stack_t 522 * and conn_netstack. 523 * The tcp_t's that are added to tcp_free_list are disassociated and 524 * have NULL tcp_tcps and conn_netstack pointers. 525 */ 526 typedef struct tcp_squeue_priv_s { 527 kmutex_t tcp_time_wait_lock; 528 callout_id_t tcp_time_wait_tid; 529 tcp_t *tcp_time_wait_head; 530 tcp_t *tcp_time_wait_tail; 531 tcp_t *tcp_free_list; 532 uint_t tcp_free_list_cnt; 533 } tcp_squeue_priv_t; 534 535 /* 536 * TCP_TIME_WAIT_DELAY governs how often the time_wait_collector runs. 537 * Running it every 5 seconds seems to give the best results. 538 */ 539 #define TCP_TIME_WAIT_DELAY drv_usectohz(5000000) 540 541 /* 542 * To prevent memory hog, limit the number of entries in tcp_free_list 543 * to 1% of available memory / number of cpus 544 */ 545 uint_t tcp_free_list_max_cnt = 0; 546 547 #define TCP_XMIT_LOWATER 4096 548 #define TCP_XMIT_HIWATER 49152 549 #define TCP_RECV_LOWATER 2048 550 #define TCP_RECV_HIWATER 49152 551 552 /* 553 * PAWS needs a timer for 24 days. This is the number of ticks in 24 days 554 */ 555 #define PAWS_TIMEOUT ((clock_t)(24*24*60*60*hz)) 556 557 #define TIDUSZ 4096 /* transport interface data unit size */ 558 559 /* 560 * Bind hash list size and has function. It has to be a power of 2 for 561 * hashing. 
562 */ 563 #define TCP_BIND_FANOUT_SIZE 512 564 #define TCP_BIND_HASH(lport) (ntohs(lport) & (TCP_BIND_FANOUT_SIZE - 1)) 565 /* 566 * Size of listen and acceptor hash list. It has to be a power of 2 for 567 * hashing. 568 */ 569 #define TCP_FANOUT_SIZE 256 570 571 #ifdef _ILP32 572 #define TCP_ACCEPTOR_HASH(accid) \ 573 (((uint_t)(accid) >> 8) & (TCP_FANOUT_SIZE - 1)) 574 #else 575 #define TCP_ACCEPTOR_HASH(accid) \ 576 ((uint_t)(accid) & (TCP_FANOUT_SIZE - 1)) 577 #endif /* _ILP32 */ 578 579 #define IP_ADDR_CACHE_SIZE 2048 580 #define IP_ADDR_CACHE_HASH(faddr) \ 581 (ntohl(faddr) & (IP_ADDR_CACHE_SIZE -1)) 582 583 /* 584 * TCP options struct returned from tcp_parse_options. 585 */ 586 typedef struct tcp_opt_s { 587 uint32_t tcp_opt_mss; 588 uint32_t tcp_opt_wscale; 589 uint32_t tcp_opt_ts_val; 590 uint32_t tcp_opt_ts_ecr; 591 tcp_t *tcp; 592 } tcp_opt_t; 593 594 /* 595 * TCP option struct passing information b/w lisenter and eager. 596 */ 597 struct tcp_options { 598 uint_t to_flags; 599 ssize_t to_boundif; /* IPV6_BOUND_IF */ 600 }; 601 602 #define TCPOPT_BOUNDIF 0x00000001 /* set IPV6_BOUND_IF */ 603 #define TCPOPT_RECVPKTINFO 0x00000002 /* set IPV6_RECVPKTINFO */ 604 605 /* 606 * RFC1323-recommended phrasing of TSTAMP option, for easier parsing 607 */ 608 609 #ifdef _BIG_ENDIAN 610 #define TCPOPT_NOP_NOP_TSTAMP ((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | \ 611 (TCPOPT_TSTAMP << 8) | 10) 612 #else 613 #define TCPOPT_NOP_NOP_TSTAMP ((10 << 24) | (TCPOPT_TSTAMP << 16) | \ 614 (TCPOPT_NOP << 8) | TCPOPT_NOP) 615 #endif 616 617 /* 618 * Flags returned from tcp_parse_options. 
619 */ 620 #define TCP_OPT_MSS_PRESENT 1 621 #define TCP_OPT_WSCALE_PRESENT 2 622 #define TCP_OPT_TSTAMP_PRESENT 4 623 #define TCP_OPT_SACK_OK_PRESENT 8 624 #define TCP_OPT_SACK_PRESENT 16 625 626 /* TCP option length */ 627 #define TCPOPT_NOP_LEN 1 628 #define TCPOPT_MAXSEG_LEN 4 629 #define TCPOPT_WS_LEN 3 630 #define TCPOPT_REAL_WS_LEN (TCPOPT_WS_LEN+1) 631 #define TCPOPT_TSTAMP_LEN 10 632 #define TCPOPT_REAL_TS_LEN (TCPOPT_TSTAMP_LEN+2) 633 #define TCPOPT_SACK_OK_LEN 2 634 #define TCPOPT_REAL_SACK_OK_LEN (TCPOPT_SACK_OK_LEN+2) 635 #define TCPOPT_REAL_SACK_LEN 4 636 #define TCPOPT_MAX_SACK_LEN 36 637 #define TCPOPT_HEADER_LEN 2 638 639 /* TCP cwnd burst factor. */ 640 #define TCP_CWND_INFINITE 65535 641 #define TCP_CWND_SS 3 642 #define TCP_CWND_NORMAL 5 643 644 /* Maximum TCP initial cwin (start/restart). */ 645 #define TCP_MAX_INIT_CWND 8 646 647 /* 648 * Initialize cwnd according to RFC 3390. def_max_init_cwnd is 649 * either tcp_slow_start_initial or tcp_slow_start_after idle 650 * depending on the caller. If the upper layer has not used the 651 * TCP_INIT_CWND option to change the initial cwnd, tcp_init_cwnd 652 * should be 0 and we use the formula in RFC 3390 to set tcp_cwnd. 653 * If the upper layer has changed set the tcp_init_cwnd, just use 654 * it to calculate the tcp_cwnd. 655 */ 656 #define SET_TCP_INIT_CWND(tcp, mss, def_max_init_cwnd) \ 657 { \ 658 if ((tcp)->tcp_init_cwnd == 0) { \ 659 (tcp)->tcp_cwnd = MIN(def_max_init_cwnd * (mss), \ 660 MIN(4 * (mss), MAX(2 * (mss), 4380 / (mss) * (mss)))); \ 661 } else { \ 662 (tcp)->tcp_cwnd = (tcp)->tcp_init_cwnd * (mss); \ 663 } \ 664 tcp->tcp_cwnd_cnt = 0; \ 665 } 666 667 /* TCP Timer control structure */ 668 typedef struct tcpt_s { 669 pfv_t tcpt_pfv; /* The routine we are to call */ 670 tcp_t *tcpt_tcp; /* The parameter we are to pass in */ 671 } tcpt_t; 672 673 /* 674 * Functions called directly via squeue having a prototype of edesc_t. 
675 */ 676 void tcp_conn_request(void *arg, mblk_t *mp, void *arg2); 677 static void tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2); 678 void tcp_accept_finish(void *arg, mblk_t *mp, void *arg2); 679 static void tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2); 680 static void tcp_wput_proto(void *arg, mblk_t *mp, void *arg2); 681 void tcp_input(void *arg, mblk_t *mp, void *arg2); 682 void tcp_rput_data(void *arg, mblk_t *mp, void *arg2); 683 static void tcp_close_output(void *arg, mblk_t *mp, void *arg2); 684 void tcp_output(void *arg, mblk_t *mp, void *arg2); 685 void tcp_output_urgent(void *arg, mblk_t *mp, void *arg2); 686 static void tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2); 687 static void tcp_timer_handler(void *arg, mblk_t *mp, void *arg2); 688 static void tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2); 689 690 691 /* Prototype for TCP functions */ 692 static void tcp_random_init(void); 693 int tcp_random(void); 694 static void tcp_tli_accept(tcp_t *tcp, mblk_t *mp); 695 static int tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, 696 tcp_t *eager); 697 static int tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp); 698 static in_port_t tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, 699 int reuseaddr, boolean_t quick_connect, boolean_t bind_to_req_port_only, 700 boolean_t user_specified); 701 static void tcp_closei_local(tcp_t *tcp); 702 static void tcp_close_detached(tcp_t *tcp); 703 static boolean_t tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph, 704 mblk_t *idmp, mblk_t **defermp); 705 static void tcp_tpi_connect(tcp_t *tcp, mblk_t *mp); 706 static int tcp_connect_ipv4(tcp_t *tcp, ipaddr_t *dstaddrp, 707 in_port_t dstport, uint_t srcid, cred_t *cr, pid_t pid); 708 static int tcp_connect_ipv6(tcp_t *tcp, in6_addr_t *dstaddrp, 709 in_port_t dstport, uint32_t flowinfo, uint_t srcid, 710 uint32_t scope_id, cred_t *cr, pid_t pid); 711 static int tcp_clean_death(tcp_t *tcp, int err, uint8_t tag); 712 static void 
tcp_def_q_set(tcp_t *tcp, mblk_t *mp); 713 static void tcp_disconnect(tcp_t *tcp, mblk_t *mp); 714 static char *tcp_display(tcp_t *tcp, char *, char); 715 static boolean_t tcp_eager_blowoff(tcp_t *listener, t_scalar_t seqnum); 716 static void tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only); 717 static void tcp_eager_unlink(tcp_t *tcp); 718 static void tcp_err_ack(tcp_t *tcp, mblk_t *mp, int tlierr, 719 int unixerr); 720 static void tcp_err_ack_prim(tcp_t *tcp, mblk_t *mp, int primitive, 721 int tlierr, int unixerr); 722 static int tcp_extra_priv_ports_get(queue_t *q, mblk_t *mp, caddr_t cp, 723 cred_t *cr); 724 static int tcp_extra_priv_ports_add(queue_t *q, mblk_t *mp, 725 char *value, caddr_t cp, cred_t *cr); 726 static int tcp_extra_priv_ports_del(queue_t *q, mblk_t *mp, 727 char *value, caddr_t cp, cred_t *cr); 728 static int tcp_tpistate(tcp_t *tcp); 729 static void tcp_bind_hash_insert(tf_t *tf, tcp_t *tcp, 730 int caller_holds_lock); 731 static void tcp_bind_hash_remove(tcp_t *tcp); 732 static tcp_t *tcp_acceptor_hash_lookup(t_uscalar_t id, tcp_stack_t *); 733 void tcp_acceptor_hash_insert(t_uscalar_t id, tcp_t *tcp); 734 static void tcp_acceptor_hash_remove(tcp_t *tcp); 735 static void tcp_capability_req(tcp_t *tcp, mblk_t *mp); 736 static void tcp_info_req(tcp_t *tcp, mblk_t *mp); 737 static void tcp_addr_req(tcp_t *tcp, mblk_t *mp); 738 static void tcp_addr_req_ipv6(tcp_t *tcp, mblk_t *mp); 739 void tcp_g_q_setup(tcp_stack_t *); 740 void tcp_g_q_create(tcp_stack_t *); 741 void tcp_g_q_destroy(tcp_stack_t *); 742 static int tcp_header_init_ipv4(tcp_t *tcp); 743 static int tcp_header_init_ipv6(tcp_t *tcp); 744 int tcp_init(tcp_t *tcp, queue_t *q); 745 static int tcp_init_values(tcp_t *tcp); 746 static mblk_t *tcp_ip_advise_mblk(void *addr, int addr_len, ipic_t **ipic); 747 static void tcp_ip_ire_mark_advice(tcp_t *tcp); 748 static void tcp_ip_notify(tcp_t *tcp); 749 static mblk_t *tcp_ire_mp(mblk_t **mpp); 750 static void tcp_iss_init(tcp_t *tcp); 751 
static void tcp_keepalive_killer(void *arg); 752 static int tcp_parse_options(tcph_t *tcph, tcp_opt_t *tcpopt); 753 static void tcp_mss_set(tcp_t *tcp, uint32_t size, boolean_t do_ss); 754 static int tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp, 755 int *do_disconnectp, int *t_errorp, int *sys_errorp); 756 static boolean_t tcp_allow_connopt_set(int level, int name); 757 int tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr); 758 int tcp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr); 759 int tcp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, 760 int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp, 761 uchar_t *outvalp, void *thisdg_attrs, cred_t *cr, 762 mblk_t *mblk); 763 static void tcp_opt_reverse(tcp_t *tcp, ipha_t *ipha); 764 static int tcp_opt_set_header(tcp_t *tcp, boolean_t checkonly, 765 uchar_t *ptr, uint_t len); 766 static int tcp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr); 767 static boolean_t tcp_param_register(IDP *ndp, tcpparam_t *tcppa, int cnt, 768 tcp_stack_t *); 769 static int tcp_param_set(queue_t *q, mblk_t *mp, char *value, 770 caddr_t cp, cred_t *cr); 771 static int tcp_param_set_aligned(queue_t *q, mblk_t *mp, char *value, 772 caddr_t cp, cred_t *cr); 773 static void tcp_iss_key_init(uint8_t *phrase, int len, tcp_stack_t *); 774 static int tcp_1948_phrase_set(queue_t *q, mblk_t *mp, char *value, 775 caddr_t cp, cred_t *cr); 776 static void tcp_process_shrunk_swnd(tcp_t *tcp, uint32_t shrunk_cnt); 777 static void tcp_update_xmit_tail(tcp_t *tcp, uint32_t snxt); 778 static mblk_t *tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start); 779 static void tcp_reass_elim_overlap(tcp_t *tcp, mblk_t *mp); 780 static void tcp_reinit(tcp_t *tcp); 781 static void tcp_reinit_values(tcp_t *tcp); 782 783 static uint_t tcp_rwnd_reopen(tcp_t *tcp); 784 static uint_t tcp_rcv_drain(tcp_t *tcp); 785 static void tcp_sack_rxmit(tcp_t *tcp, uint_t *flags); 786 static boolean_t tcp_send_rst_chk(tcp_stack_t *); 787 
static void tcp_ss_rexmit(tcp_t *tcp); 788 static mblk_t *tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp); 789 static void tcp_process_options(tcp_t *, tcph_t *); 790 static void tcp_rput_common(tcp_t *tcp, mblk_t *mp); 791 static void tcp_rsrv(queue_t *q); 792 static int tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd); 793 static int tcp_snmp_state(tcp_t *tcp); 794 static void tcp_timer(void *arg); 795 static void tcp_timer_callback(void *); 796 static in_port_t tcp_update_next_port(in_port_t port, const tcp_t *tcp, 797 boolean_t random); 798 static in_port_t tcp_get_next_priv_port(const tcp_t *); 799 static void tcp_wput_sock(queue_t *q, mblk_t *mp); 800 static void tcp_wput_fallback(queue_t *q, mblk_t *mp); 801 void tcp_tpi_accept(queue_t *q, mblk_t *mp); 802 static void tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent); 803 static void tcp_wput_flush(tcp_t *tcp, mblk_t *mp); 804 static void tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp); 805 static int tcp_send(queue_t *q, tcp_t *tcp, const int mss, 806 const int tcp_hdr_len, const int tcp_tcp_hdr_len, 807 const int num_sack_blk, int *usable, uint_t *snxt, 808 int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time, 809 const int mdt_thres); 810 static int tcp_multisend(queue_t *q, tcp_t *tcp, const int mss, 811 const int tcp_hdr_len, const int tcp_tcp_hdr_len, 812 const int num_sack_blk, int *usable, uint_t *snxt, 813 int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time, 814 const int mdt_thres); 815 static void tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now, 816 int num_sack_blk); 817 static void tcp_wsrv(queue_t *q); 818 static int tcp_xmit_end(tcp_t *tcp); 819 static void tcp_ack_timer(void *arg); 820 static mblk_t *tcp_ack_mp(tcp_t *tcp); 821 static void tcp_xmit_early_reset(char *str, mblk_t *mp, 822 uint32_t seq, uint32_t ack, int ctl, uint_t ip_hdr_len, 823 zoneid_t zoneid, tcp_stack_t *, conn_t *connp); 824 static void tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq, 825 uint32_t 
ack, int ctl); 826 static int setmaxps(queue_t *q, int maxpsz); 827 static void tcp_set_rto(tcp_t *, time_t); 828 static boolean_t tcp_check_policy(tcp_t *, mblk_t *, ipha_t *, ip6_t *, 829 boolean_t, boolean_t); 830 static void tcp_icmp_error_ipv6(tcp_t *tcp, mblk_t *mp, 831 boolean_t ipsec_mctl); 832 static int tcp_build_hdrs(tcp_t *); 833 static void tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, 834 uint32_t seg_seq, uint32_t seg_ack, int seg_len, 835 tcph_t *tcph); 836 boolean_t tcp_paws_check(tcp_t *tcp, tcph_t *tcph, tcp_opt_t *tcpoptp); 837 static mblk_t *tcp_mdt_info_mp(mblk_t *); 838 static void tcp_mdt_update(tcp_t *, ill_mdt_capab_t *, boolean_t); 839 static int tcp_mdt_add_attrs(multidata_t *, const mblk_t *, 840 const boolean_t, const uint32_t, const uint32_t, 841 const uint32_t, const uint32_t, tcp_stack_t *); 842 static void tcp_multisend_data(tcp_t *, ire_t *, const ill_t *, mblk_t *, 843 const uint_t, const uint_t, boolean_t *); 844 static mblk_t *tcp_lso_info_mp(mblk_t *); 845 static void tcp_lso_update(tcp_t *, ill_lso_capab_t *); 846 static void tcp_send_data(tcp_t *, queue_t *, mblk_t *); 847 extern mblk_t *tcp_timermp_alloc(int); 848 extern void tcp_timermp_free(tcp_t *); 849 static void tcp_timer_free(tcp_t *tcp, mblk_t *mp); 850 static void tcp_stop_lingering(tcp_t *tcp); 851 static void tcp_close_linger_timeout(void *arg); 852 static void *tcp_stack_init(netstackid_t stackid, netstack_t *ns); 853 static void tcp_stack_shutdown(netstackid_t stackid, void *arg); 854 static void tcp_stack_fini(netstackid_t stackid, void *arg); 855 static void *tcp_g_kstat_init(tcp_g_stat_t *); 856 static void tcp_g_kstat_fini(kstat_t *); 857 static void *tcp_kstat_init(netstackid_t, tcp_stack_t *); 858 static void tcp_kstat_fini(netstackid_t, kstat_t *); 859 static void *tcp_kstat2_init(netstackid_t, tcp_stat_t *); 860 static void tcp_kstat2_fini(netstackid_t, kstat_t *); 861 static int tcp_kstat_update(kstat_t *kp, int rw); 862 void tcp_reinput(conn_t 
*connp, mblk_t *mp, squeue_t *sqp); 863 static int tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp, 864 tcph_t *tcph, uint_t ipvers, mblk_t *idmp); 865 static int tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, ipha_t *ipha, 866 tcph_t *tcph, mblk_t *idmp); 867 static int tcp_squeue_switch(int); 868 869 static int tcp_open(queue_t *, dev_t *, int, int, cred_t *, boolean_t); 870 static int tcp_openv4(queue_t *, dev_t *, int, int, cred_t *); 871 static int tcp_openv6(queue_t *, dev_t *, int, int, cred_t *); 872 static int tcp_tpi_close(queue_t *, int); 873 static int tcp_tpi_close_accept(queue_t *); 874 875 static void tcp_squeue_add(squeue_t *); 876 static boolean_t tcp_zcopy_check(tcp_t *); 877 static void tcp_zcopy_notify(tcp_t *); 878 static mblk_t *tcp_zcopy_disable(tcp_t *, mblk_t *); 879 static mblk_t *tcp_zcopy_backoff(tcp_t *, mblk_t *, int); 880 static void tcp_ire_ill_check(tcp_t *, ire_t *, ill_t *, boolean_t); 881 882 extern void tcp_kssl_input(tcp_t *, mblk_t *); 883 884 void tcp_eager_kill(void *arg, mblk_t *mp, void *arg2); 885 void tcp_clean_death_wrapper(void *arg, mblk_t *mp, void *arg2); 886 887 static int tcp_accept(sock_lower_handle_t, sock_lower_handle_t, 888 sock_upper_handle_t, cred_t *); 889 static int tcp_listen(sock_lower_handle_t, int, cred_t *); 890 static int tcp_post_ip_bind(tcp_t *, mblk_t *, int, cred_t *, pid_t); 891 static int tcp_do_listen(conn_t *, struct sockaddr *, socklen_t, int, cred_t *, 892 boolean_t); 893 static int tcp_do_connect(conn_t *, const struct sockaddr *, socklen_t, 894 cred_t *, pid_t); 895 static int tcp_do_bind(conn_t *, struct sockaddr *, socklen_t, cred_t *, 896 boolean_t); 897 static int tcp_do_unbind(conn_t *); 898 static int tcp_bind_check(conn_t *, struct sockaddr *, socklen_t, cred_t *, 899 boolean_t); 900 901 static void tcp_ulp_newconn(conn_t *, conn_t *, mblk_t *); 902 903 /* 904 * Routines related to the TCP_IOC_ABORT_CONN ioctl command. 
905 * 906 * TCP_IOC_ABORT_CONN is a non-transparent ioctl command used for aborting 907 * TCP connections. To invoke this ioctl, a tcp_ioc_abort_conn_t structure 908 * (defined in tcp.h) needs to be filled in and passed into the kernel 909 * via an I_STR ioctl command (see streamio(7I)). The tcp_ioc_abort_conn_t 910 * structure contains the four-tuple of a TCP connection and a range of TCP 911 * states (specified by ac_start and ac_end). The use of wildcard addresses 912 * and ports is allowed. Connections with a matching four tuple and a state 913 * within the specified range will be aborted. The valid states for the 914 * ac_start and ac_end fields are in the range TCPS_SYN_SENT to TCPS_TIME_WAIT, 915 * inclusive. 916 * 917 * An application which has its connection aborted by this ioctl will receive 918 * an error that is dependent on the connection state at the time of the abort. 919 * If the connection state is < TCPS_TIME_WAIT, an application should behave as 920 * though a RST packet has been received. If the connection state is equal to 921 * TCPS_TIME_WAIT, the 2MSL timeout will immediately be canceled by the kernel 922 * and all resources associated with the connection will be freed. 923 */ 924 static mblk_t *tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *, tcp_t *); 925 static void tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *); 926 static void tcp_ioctl_abort_handler(tcp_t *, mblk_t *); 927 static int tcp_ioctl_abort(tcp_ioc_abort_conn_t *, tcp_stack_t *tcps); 928 static void tcp_ioctl_abort_conn(queue_t *, mblk_t *); 929 static int tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *, int, int *, 930 boolean_t, tcp_stack_t *); 931 932 static struct module_info tcp_rinfo = { 933 TCP_MOD_ID, TCP_MOD_NAME, 0, INFPSZ, TCP_RECV_HIWATER, TCP_RECV_LOWATER 934 }; 935 936 static struct module_info tcp_winfo = { 937 TCP_MOD_ID, TCP_MOD_NAME, 0, INFPSZ, 127, 16 938 }; 939 940 /* 941 * Entry points for TCP as a device. 
The normal case which supports 942 * the TCP functionality. 943 * We have separate open functions for the /dev/tcp and /dev/tcp6 devices. 944 */ 945 struct qinit tcp_rinitv4 = { 946 NULL, (pfi_t)tcp_rsrv, tcp_openv4, tcp_tpi_close, NULL, &tcp_rinfo 947 }; 948 949 struct qinit tcp_rinitv6 = { 950 NULL, (pfi_t)tcp_rsrv, tcp_openv6, tcp_tpi_close, NULL, &tcp_rinfo 951 }; 952 953 struct qinit tcp_winit = { 954 (pfi_t)tcp_wput, (pfi_t)tcp_wsrv, NULL, NULL, NULL, &tcp_winfo 955 }; 956 957 /* Initial entry point for TCP in socket mode. */ 958 struct qinit tcp_sock_winit = { 959 (pfi_t)tcp_wput_sock, (pfi_t)tcp_wsrv, NULL, NULL, NULL, &tcp_winfo 960 }; 961 962 /* TCP entry point during fallback */ 963 struct qinit tcp_fallback_sock_winit = { 964 (pfi_t)tcp_wput_fallback, NULL, NULL, NULL, NULL, &tcp_winfo 965 }; 966 967 /* 968 * Entry points for TCP as a acceptor STREAM opened by sockfs when doing 969 * an accept. Avoid allocating data structures since eager has already 970 * been created. 971 */ 972 struct qinit tcp_acceptor_rinit = { 973 NULL, (pfi_t)tcp_rsrv, NULL, tcp_tpi_close_accept, NULL, &tcp_winfo 974 }; 975 976 struct qinit tcp_acceptor_winit = { 977 (pfi_t)tcp_tpi_accept, NULL, NULL, NULL, NULL, &tcp_winfo 978 }; 979 980 /* For AF_INET aka /dev/tcp */ 981 struct streamtab tcpinfov4 = { 982 &tcp_rinitv4, &tcp_winit 983 }; 984 985 /* For AF_INET6 aka /dev/tcp6 */ 986 struct streamtab tcpinfov6 = { 987 &tcp_rinitv6, &tcp_winit 988 }; 989 990 sock_downcalls_t sock_tcp_downcalls; 991 992 /* 993 * Have to ensure that tcp_g_q_close is not done by an 994 * interrupt thread. 995 */ 996 static taskq_t *tcp_taskq; 997 998 /* Setable only in /etc/system. Move to ndd? */ 999 boolean_t tcp_icmp_source_quench = B_FALSE; 1000 1001 /* 1002 * Following assumes TPI alignment requirements stay along 32 bit 1003 * boundaries 1004 */ 1005 #define ROUNDUP32(x) \ 1006 (((x) + (sizeof (int32_t) - 1)) & ~(sizeof (int32_t) - 1)) 1007 1008 /* Template for response to info request. 
*/ 1009 static struct T_info_ack tcp_g_t_info_ack = { 1010 T_INFO_ACK, /* PRIM_type */ 1011 0, /* TSDU_size */ 1012 T_INFINITE, /* ETSDU_size */ 1013 T_INVALID, /* CDATA_size */ 1014 T_INVALID, /* DDATA_size */ 1015 sizeof (sin_t), /* ADDR_size */ 1016 0, /* OPT_size - not initialized here */ 1017 TIDUSZ, /* TIDU_size */ 1018 T_COTS_ORD, /* SERV_type */ 1019 TCPS_IDLE, /* CURRENT_state */ 1020 (XPG4_1|EXPINLINE) /* PROVIDER_flag */ 1021 }; 1022 1023 static struct T_info_ack tcp_g_t_info_ack_v6 = { 1024 T_INFO_ACK, /* PRIM_type */ 1025 0, /* TSDU_size */ 1026 T_INFINITE, /* ETSDU_size */ 1027 T_INVALID, /* CDATA_size */ 1028 T_INVALID, /* DDATA_size */ 1029 sizeof (sin6_t), /* ADDR_size */ 1030 0, /* OPT_size - not initialized here */ 1031 TIDUSZ, /* TIDU_size */ 1032 T_COTS_ORD, /* SERV_type */ 1033 TCPS_IDLE, /* CURRENT_state */ 1034 (XPG4_1|EXPINLINE) /* PROVIDER_flag */ 1035 }; 1036 1037 #define MS 1L 1038 #define SECONDS (1000 * MS) 1039 #define MINUTES (60 * SECONDS) 1040 #define HOURS (60 * MINUTES) 1041 #define DAYS (24 * HOURS) 1042 1043 #define PARAM_MAX (~(uint32_t)0) 1044 1045 /* Max size IP datagram is 64k - 1 */ 1046 #define TCP_MSS_MAX_IPV4 (IP_MAXPACKET - (sizeof (ipha_t) + sizeof (tcph_t))) 1047 #define TCP_MSS_MAX_IPV6 (IP_MAXPACKET - (sizeof (ip6_t) + sizeof (tcph_t))) 1048 /* Max of the above */ 1049 #define TCP_MSS_MAX TCP_MSS_MAX_IPV4 1050 1051 /* Largest TCP port number */ 1052 #define TCP_MAX_PORT (64 * 1024 - 1) 1053 1054 /* 1055 * tcp_wroff_xtra is the extra space in front of TCP/IP header for link 1056 * layer header. It has to be a multiple of 4. 1057 */ 1058 static tcpparam_t lcl_tcp_wroff_xtra_param = { 0, 256, 32, "tcp_wroff_xtra" }; 1059 #define tcps_wroff_xtra tcps_wroff_xtra_param->tcp_param_val 1060 1061 /* 1062 * All of these are alterable, within the min/max values given, at run time. 1063 * Note that the default value of "tcp_time_wait_interval" is four minutes, 1064 * per the TCP spec. 
1065 */ 1066 /* BEGIN CSTYLED */ 1067 static tcpparam_t lcl_tcp_param_arr[] = { 1068 /*min max value name */ 1069 { 1*SECONDS, 10*MINUTES, 1*MINUTES, "tcp_time_wait_interval"}, 1070 { 1, PARAM_MAX, 128, "tcp_conn_req_max_q" }, 1071 { 0, PARAM_MAX, 1024, "tcp_conn_req_max_q0" }, 1072 { 1, 1024, 1, "tcp_conn_req_min" }, 1073 { 0*MS, 20*SECONDS, 0*MS, "tcp_conn_grace_period" }, 1074 { 128, (1<<30), 1024*1024, "tcp_cwnd_max" }, 1075 { 0, 10, 0, "tcp_debug" }, 1076 { 1024, (32*1024), 1024, "tcp_smallest_nonpriv_port"}, 1077 { 1*SECONDS, PARAM_MAX, 3*MINUTES, "tcp_ip_abort_cinterval"}, 1078 { 1*SECONDS, PARAM_MAX, 3*MINUTES, "tcp_ip_abort_linterval"}, 1079 { 500*MS, PARAM_MAX, 8*MINUTES, "tcp_ip_abort_interval"}, 1080 { 1*SECONDS, PARAM_MAX, 10*SECONDS, "tcp_ip_notify_cinterval"}, 1081 { 500*MS, PARAM_MAX, 10*SECONDS, "tcp_ip_notify_interval"}, 1082 { 1, 255, 64, "tcp_ipv4_ttl"}, 1083 { 10*SECONDS, 10*DAYS, 2*HOURS, "tcp_keepalive_interval"}, 1084 { 0, 100, 10, "tcp_maxpsz_multiplier" }, 1085 { 1, TCP_MSS_MAX_IPV4, 536, "tcp_mss_def_ipv4"}, 1086 { 1, TCP_MSS_MAX_IPV4, TCP_MSS_MAX_IPV4, "tcp_mss_max_ipv4"}, 1087 { 1, TCP_MSS_MAX, 108, "tcp_mss_min"}, 1088 { 1, (64*1024)-1, (4*1024)-1, "tcp_naglim_def"}, 1089 { 1*MS, 20*SECONDS, 3*SECONDS, "tcp_rexmit_interval_initial"}, 1090 { 1*MS, 2*HOURS, 60*SECONDS, "tcp_rexmit_interval_max"}, 1091 { 1*MS, 2*HOURS, 400*MS, "tcp_rexmit_interval_min"}, 1092 { 1*MS, 1*MINUTES, 100*MS, "tcp_deferred_ack_interval" }, 1093 { 0, 16, 0, "tcp_snd_lowat_fraction" }, 1094 { 0, 128000, 0, "tcp_sth_rcv_hiwat" }, 1095 { 0, 128000, 0, "tcp_sth_rcv_lowat" }, 1096 { 1, 10000, 3, "tcp_dupack_fast_retransmit" }, 1097 { 0, 1, 0, "tcp_ignore_path_mtu" }, 1098 { 1024, TCP_MAX_PORT, 32*1024, "tcp_smallest_anon_port"}, 1099 { 1024, TCP_MAX_PORT, TCP_MAX_PORT, "tcp_largest_anon_port"}, 1100 { TCP_XMIT_LOWATER, (1<<30), TCP_XMIT_HIWATER,"tcp_xmit_hiwat"}, 1101 { TCP_XMIT_LOWATER, (1<<30), TCP_XMIT_LOWATER,"tcp_xmit_lowat"}, 1102 { TCP_RECV_LOWATER, (1<<30), 
TCP_RECV_HIWATER,"tcp_recv_hiwat"}, 1103 { 1, 65536, 4, "tcp_recv_hiwat_minmss"}, 1104 { 1*SECONDS, PARAM_MAX, 675*SECONDS, "tcp_fin_wait_2_flush_interval"}, 1105 { 8192, (1<<30), 1024*1024, "tcp_max_buf"}, 1106 /* 1107 * Question: What default value should I set for tcp_strong_iss? 1108 */ 1109 { 0, 2, 1, "tcp_strong_iss"}, 1110 { 0, 65536, 20, "tcp_rtt_updates"}, 1111 { 0, 1, 1, "tcp_wscale_always"}, 1112 { 0, 1, 0, "tcp_tstamp_always"}, 1113 { 0, 1, 1, "tcp_tstamp_if_wscale"}, 1114 { 0*MS, 2*HOURS, 0*MS, "tcp_rexmit_interval_extra"}, 1115 { 0, 16, 2, "tcp_deferred_acks_max"}, 1116 { 1, 16384, 4, "tcp_slow_start_after_idle"}, 1117 { 1, 4, 4, "tcp_slow_start_initial"}, 1118 { 0, 2, 2, "tcp_sack_permitted"}, 1119 { 0, 1, 1, "tcp_compression_enabled"}, 1120 { 0, IPV6_MAX_HOPS, IPV6_DEFAULT_HOPS, "tcp_ipv6_hoplimit"}, 1121 { 1, TCP_MSS_MAX_IPV6, 1220, "tcp_mss_def_ipv6"}, 1122 { 1, TCP_MSS_MAX_IPV6, TCP_MSS_MAX_IPV6, "tcp_mss_max_ipv6"}, 1123 { 0, 1, 0, "tcp_rev_src_routes"}, 1124 { 10*MS, 500*MS, 50*MS, "tcp_local_dack_interval"}, 1125 { 0, 16, 8, "tcp_local_dacks_max"}, 1126 { 0, 2, 1, "tcp_ecn_permitted"}, 1127 { 0, 1, 1, "tcp_rst_sent_rate_enabled"}, 1128 { 0, PARAM_MAX, 40, "tcp_rst_sent_rate"}, 1129 { 0, 100*MS, 50*MS, "tcp_push_timer_interval"}, 1130 { 0, 1, 0, "tcp_use_smss_as_mss_opt"}, 1131 { 0, PARAM_MAX, 8*MINUTES, "tcp_keepalive_abort_interval"}, 1132 }; 1133 /* END CSTYLED */ 1134 1135 /* 1136 * tcp_mdt_hdr_{head,tail}_min are the leading and trailing spaces of 1137 * each header fragment in the header buffer. Each parameter value has 1138 * to be a multiple of 4 (32-bit aligned). 
1139 */ 1140 static tcpparam_t lcl_tcp_mdt_head_param = 1141 { 32, 256, 32, "tcp_mdt_hdr_head_min" }; 1142 static tcpparam_t lcl_tcp_mdt_tail_param = 1143 { 0, 256, 32, "tcp_mdt_hdr_tail_min" }; 1144 #define tcps_mdt_hdr_head_min tcps_mdt_head_param->tcp_param_val 1145 #define tcps_mdt_hdr_tail_min tcps_mdt_tail_param->tcp_param_val 1146 1147 /* 1148 * tcp_mdt_max_pbufs is the upper limit value that tcp uses to figure out 1149 * the maximum number of payload buffers associated per Multidata. 1150 */ 1151 static tcpparam_t lcl_tcp_mdt_max_pbufs_param = 1152 { 1, MULTIDATA_MAX_PBUFS, MULTIDATA_MAX_PBUFS, "tcp_mdt_max_pbufs" }; 1153 #define tcps_mdt_max_pbufs tcps_mdt_max_pbufs_param->tcp_param_val 1154 1155 /* Round up the value to the nearest mss. */ 1156 #define MSS_ROUNDUP(value, mss) ((((value) - 1) / (mss) + 1) * (mss)) 1157 1158 /* 1159 * Set ECN capable transport (ECT) code point in IP header. 1160 * 1161 * Note that there are 2 ECT code points '01' and '10', which are called 1162 * ECT(1) and ECT(0) respectively. Here we follow the original ECT code 1163 * point ECT(0) for TCP as described in RFC 2481. 1164 */ 1165 #define SET_ECT(tcp, iph) \ 1166 if ((tcp)->tcp_ipversion == IPV4_VERSION) { \ 1167 /* We need to clear the code point first. */ \ 1168 ((ipha_t *)(iph))->ipha_type_of_service &= 0xFC; \ 1169 ((ipha_t *)(iph))->ipha_type_of_service |= IPH_ECN_ECT0; \ 1170 } else { \ 1171 ((ip6_t *)(iph))->ip6_vcf &= htonl(0xFFCFFFFF); \ 1172 ((ip6_t *)(iph))->ip6_vcf |= htonl(IPH_ECN_ECT0 << 20); \ 1173 } 1174 1175 /* 1176 * The format argument to pass to tcp_display(). 1177 * DISP_PORT_ONLY means that the returned string has only port info. 1178 * DISP_ADDR_AND_PORT means that the returned string also contains the 1179 * remote and local IP address. 
1180 */ 1181 #define DISP_PORT_ONLY 1 1182 #define DISP_ADDR_AND_PORT 2 1183 1184 #define IS_VMLOANED_MBLK(mp) \ 1185 (((mp)->b_datap->db_struioflag & STRUIO_ZC) != 0) 1186 1187 1188 /* Enable or disable b_cont M_MULTIDATA chaining for MDT. */ 1189 boolean_t tcp_mdt_chain = B_TRUE; 1190 1191 /* 1192 * MDT threshold in the form of effective send MSS multiplier; we take 1193 * the MDT path if the amount of unsent data exceeds the threshold value 1194 * (default threshold is 1*SMSS). 1195 */ 1196 uint_t tcp_mdt_smss_threshold = 1; 1197 1198 uint32_t do_tcpzcopy = 1; /* 0: disable, 1: enable, 2: force */ 1199 1200 /* 1201 * Forces all connections to obey the value of the tcps_maxpsz_multiplier 1202 * tunable settable via NDD. Otherwise, the per-connection behavior is 1203 * determined dynamically during tcp_adapt_ire(), which is the default. 1204 */ 1205 boolean_t tcp_static_maxpsz = B_FALSE; 1206 1207 /* Setable in /etc/system */ 1208 /* If set to 0, pick ephemeral port sequentially; otherwise randomly. */ 1209 uint32_t tcp_random_anon_port = 1; 1210 1211 /* 1212 * To reach to an eager in Q0 which can be dropped due to an incoming 1213 * new SYN request when Q0 is full, a new doubly linked list is 1214 * introduced. This list allows to select an eager from Q0 in O(1) time. 1215 * This is needed to avoid spending too much time walking through the 1216 * long list of eagers in Q0 when tcp_drop_q0() is called. Each member of 1217 * this new list has to be a member of Q0. 1218 * This list is headed by listener's tcp_t. When the list is empty, 1219 * both the pointers - tcp_eager_next_drop_q0 and tcp_eager_prev_drop_q0, 1220 * of listener's tcp_t point to listener's tcp_t itself. 1221 * 1222 * Given an eager in Q0 and a listener, MAKE_DROPPABLE() puts the eager 1223 * in the list. MAKE_UNDROPPABLE() takes the eager out of the list. 1224 * These macros do not affect the eager's membership to Q0. 
1225 */ 1226 1227 1228 #define MAKE_DROPPABLE(listener, eager) \ 1229 if ((eager)->tcp_eager_next_drop_q0 == NULL) { \ 1230 (listener)->tcp_eager_next_drop_q0->tcp_eager_prev_drop_q0\ 1231 = (eager); \ 1232 (eager)->tcp_eager_prev_drop_q0 = (listener); \ 1233 (eager)->tcp_eager_next_drop_q0 = \ 1234 (listener)->tcp_eager_next_drop_q0; \ 1235 (listener)->tcp_eager_next_drop_q0 = (eager); \ 1236 } 1237 1238 #define MAKE_UNDROPPABLE(eager) \ 1239 if ((eager)->tcp_eager_next_drop_q0 != NULL) { \ 1240 (eager)->tcp_eager_next_drop_q0->tcp_eager_prev_drop_q0 \ 1241 = (eager)->tcp_eager_prev_drop_q0; \ 1242 (eager)->tcp_eager_prev_drop_q0->tcp_eager_next_drop_q0 \ 1243 = (eager)->tcp_eager_next_drop_q0; \ 1244 (eager)->tcp_eager_prev_drop_q0 = NULL; \ 1245 (eager)->tcp_eager_next_drop_q0 = NULL; \ 1246 } 1247 1248 /* 1249 * If tcp_drop_ack_unsent_cnt is greater than 0, when TCP receives more 1250 * than tcp_drop_ack_unsent_cnt number of ACKs which acknowledge unsent 1251 * data, TCP will not respond with an ACK. RFC 793 requires that 1252 * TCP responds with an ACK for such a bogus ACK. By not following 1253 * the RFC, we prevent TCP from getting into an ACK storm if somehow 1254 * an attacker successfully spoofs an acceptable segment to our 1255 * peer; or when our peer is "confused." 1256 */ 1257 uint32_t tcp_drop_ack_unsent_cnt = 10; 1258 1259 /* 1260 * Hook functions to enable cluster networking 1261 * On non-clustered systems these vectors must always be NULL. 
1262 */ 1263 1264 void (*cl_inet_listen)(netstackid_t stack_id, uint8_t protocol, 1265 sa_family_t addr_family, uint8_t *laddrp, 1266 in_port_t lport, void *args) = NULL; 1267 void (*cl_inet_unlisten)(netstackid_t stack_id, uint8_t protocol, 1268 sa_family_t addr_family, uint8_t *laddrp, 1269 in_port_t lport, void *args) = NULL; 1270 1271 int (*cl_inet_connect2)(netstackid_t stack_id, uint8_t protocol, 1272 boolean_t is_outgoing, 1273 sa_family_t addr_family, 1274 uint8_t *laddrp, in_port_t lport, 1275 uint8_t *faddrp, in_port_t fport, 1276 void *args) = NULL; 1277 1278 void (*cl_inet_disconnect)(netstackid_t stack_id, uint8_t protocol, 1279 sa_family_t addr_family, uint8_t *laddrp, 1280 in_port_t lport, uint8_t *faddrp, 1281 in_port_t fport, void *args) = NULL; 1282 1283 /* 1284 * The following are defined in ip.c 1285 */ 1286 extern int (*cl_inet_isclusterwide)(netstackid_t stack_id, uint8_t protocol, 1287 sa_family_t addr_family, uint8_t *laddrp, 1288 void *args); 1289 extern uint32_t (*cl_inet_ipident)(netstackid_t stack_id, uint8_t protocol, 1290 sa_family_t addr_family, uint8_t *laddrp, 1291 uint8_t *faddrp, void *args); 1292 1293 1294 /* 1295 * int CL_INET_CONNECT(conn_t *cp, tcp_t *tcp, boolean_t is_outgoing, int err) 1296 */ 1297 #define CL_INET_CONNECT(connp, tcp, is_outgoing, err) { \ 1298 (err) = 0; \ 1299 if (cl_inet_connect2 != NULL) { \ 1300 /* \ 1301 * Running in cluster mode - register active connection \ 1302 * information \ 1303 */ \ 1304 if ((tcp)->tcp_ipversion == IPV4_VERSION) { \ 1305 if ((tcp)->tcp_ipha->ipha_src != 0) { \ 1306 (err) = (*cl_inet_connect2)( \ 1307 (connp)->conn_netstack->netstack_stackid,\ 1308 IPPROTO_TCP, is_outgoing, AF_INET, \ 1309 (uint8_t *)(&((tcp)->tcp_ipha->ipha_src)),\ 1310 (in_port_t)(tcp)->tcp_lport, \ 1311 (uint8_t *)(&((tcp)->tcp_ipha->ipha_dst)),\ 1312 (in_port_t)(tcp)->tcp_fport, NULL); \ 1313 } \ 1314 } else { \ 1315 if (!IN6_IS_ADDR_UNSPECIFIED( \ 1316 &(tcp)->tcp_ip6h->ip6_src)) { \ 1317 (err) = 
(*cl_inet_connect2)( \ 1318 (connp)->conn_netstack->netstack_stackid,\ 1319 IPPROTO_TCP, is_outgoing, AF_INET6, \ 1320 (uint8_t *)(&((tcp)->tcp_ip6h->ip6_src)),\ 1321 (in_port_t)(tcp)->tcp_lport, \ 1322 (uint8_t *)(&((tcp)->tcp_ip6h->ip6_dst)),\ 1323 (in_port_t)(tcp)->tcp_fport, NULL); \ 1324 } \ 1325 } \ 1326 } \ 1327 } 1328 1329 #define CL_INET_DISCONNECT(connp, tcp) { \ 1330 if (cl_inet_disconnect != NULL) { \ 1331 /* \ 1332 * Running in cluster mode - deregister active \ 1333 * connection information \ 1334 */ \ 1335 if ((tcp)->tcp_ipversion == IPV4_VERSION) { \ 1336 if ((tcp)->tcp_ip_src != 0) { \ 1337 (*cl_inet_disconnect)( \ 1338 (connp)->conn_netstack->netstack_stackid,\ 1339 IPPROTO_TCP, AF_INET, \ 1340 (uint8_t *)(&((tcp)->tcp_ip_src)), \ 1341 (in_port_t)(tcp)->tcp_lport, \ 1342 (uint8_t *)(&((tcp)->tcp_ipha->ipha_dst)),\ 1343 (in_port_t)(tcp)->tcp_fport, NULL); \ 1344 } \ 1345 } else { \ 1346 if (!IN6_IS_ADDR_UNSPECIFIED( \ 1347 &(tcp)->tcp_ip_src_v6)) { \ 1348 (*cl_inet_disconnect)( \ 1349 (connp)->conn_netstack->netstack_stackid,\ 1350 IPPROTO_TCP, AF_INET6, \ 1351 (uint8_t *)(&((tcp)->tcp_ip_src_v6)),\ 1352 (in_port_t)(tcp)->tcp_lport, \ 1353 (uint8_t *)(&((tcp)->tcp_ip6h->ip6_dst)),\ 1354 (in_port_t)(tcp)->tcp_fport, NULL); \ 1355 } \ 1356 } \ 1357 } \ 1358 } 1359 1360 /* 1361 * Cluster networking hook for traversing current connection list. 1362 * This routine is used to extract the current list of live connections 1363 * which must continue to to be dispatched to this node. 
1364 */ 1365 int cl_tcp_walk_list(netstackid_t stack_id, 1366 int (*callback)(cl_tcp_info_t *, void *), void *arg); 1367 1368 static int cl_tcp_walk_list_stack(int (*callback)(cl_tcp_info_t *, void *), 1369 void *arg, tcp_stack_t *tcps); 1370 1371 #define DTRACE_IP_FASTPATH(mp, iph, ill, ipha, ip6h) \ 1372 DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *, \ 1373 iph, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, \ 1374 ip6_t *, ip6h, int, 0); 1375 1376 /* 1377 * Figure out the value of window scale opton. Note that the rwnd is 1378 * ASSUMED to be rounded up to the nearest MSS before the calculation. 1379 * We cannot find the scale value and then do a round up of tcp_rwnd 1380 * because the scale value may not be correct after that. 1381 * 1382 * Set the compiler flag to make this function inline. 1383 */ 1384 static void 1385 tcp_set_ws_value(tcp_t *tcp) 1386 { 1387 int i; 1388 uint32_t rwnd = tcp->tcp_rwnd; 1389 1390 for (i = 0; rwnd > TCP_MAXWIN && i < TCP_MAX_WINSHIFT; 1391 i++, rwnd >>= 1) 1392 ; 1393 tcp->tcp_rcv_ws = i; 1394 } 1395 1396 /* 1397 * Remove a connection from the list of detached TIME_WAIT connections. 1398 * It returns B_FALSE if it can't remove the connection from the list 1399 * as the connection has already been removed from the list due to an 1400 * earlier call to tcp_time_wait_remove(); otherwise it returns B_TRUE. 
1401 */ 1402 static boolean_t 1403 tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tcp_time_wait) 1404 { 1405 boolean_t locked = B_FALSE; 1406 1407 if (tcp_time_wait == NULL) { 1408 tcp_time_wait = *((tcp_squeue_priv_t **) 1409 squeue_getprivate(tcp->tcp_connp->conn_sqp, SQPRIVATE_TCP)); 1410 mutex_enter(&tcp_time_wait->tcp_time_wait_lock); 1411 locked = B_TRUE; 1412 } else { 1413 ASSERT(MUTEX_HELD(&tcp_time_wait->tcp_time_wait_lock)); 1414 } 1415 1416 if (tcp->tcp_time_wait_expire == 0) { 1417 ASSERT(tcp->tcp_time_wait_next == NULL); 1418 ASSERT(tcp->tcp_time_wait_prev == NULL); 1419 if (locked) 1420 mutex_exit(&tcp_time_wait->tcp_time_wait_lock); 1421 return (B_FALSE); 1422 } 1423 ASSERT(TCP_IS_DETACHED(tcp)); 1424 ASSERT(tcp->tcp_state == TCPS_TIME_WAIT); 1425 1426 if (tcp == tcp_time_wait->tcp_time_wait_head) { 1427 ASSERT(tcp->tcp_time_wait_prev == NULL); 1428 tcp_time_wait->tcp_time_wait_head = tcp->tcp_time_wait_next; 1429 if (tcp_time_wait->tcp_time_wait_head != NULL) { 1430 tcp_time_wait->tcp_time_wait_head->tcp_time_wait_prev = 1431 NULL; 1432 } else { 1433 tcp_time_wait->tcp_time_wait_tail = NULL; 1434 } 1435 } else if (tcp == tcp_time_wait->tcp_time_wait_tail) { 1436 ASSERT(tcp != tcp_time_wait->tcp_time_wait_head); 1437 ASSERT(tcp->tcp_time_wait_next == NULL); 1438 tcp_time_wait->tcp_time_wait_tail = tcp->tcp_time_wait_prev; 1439 ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL); 1440 tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = NULL; 1441 } else { 1442 ASSERT(tcp->tcp_time_wait_prev->tcp_time_wait_next == tcp); 1443 ASSERT(tcp->tcp_time_wait_next->tcp_time_wait_prev == tcp); 1444 tcp->tcp_time_wait_prev->tcp_time_wait_next = 1445 tcp->tcp_time_wait_next; 1446 tcp->tcp_time_wait_next->tcp_time_wait_prev = 1447 tcp->tcp_time_wait_prev; 1448 } 1449 tcp->tcp_time_wait_next = NULL; 1450 tcp->tcp_time_wait_prev = NULL; 1451 tcp->tcp_time_wait_expire = 0; 1452 1453 if (locked) 1454 mutex_exit(&tcp_time_wait->tcp_time_wait_lock); 1455 return 
(B_TRUE); 1456 } 1457 1458 /* 1459 * Add a connection to the list of detached TIME_WAIT connections 1460 * and set its time to expire. 1461 */ 1462 static void 1463 tcp_time_wait_append(tcp_t *tcp) 1464 { 1465 tcp_stack_t *tcps = tcp->tcp_tcps; 1466 tcp_squeue_priv_t *tcp_time_wait = 1467 *((tcp_squeue_priv_t **)squeue_getprivate(tcp->tcp_connp->conn_sqp, 1468 SQPRIVATE_TCP)); 1469 1470 tcp_timers_stop(tcp); 1471 1472 /* Freed above */ 1473 ASSERT(tcp->tcp_timer_tid == 0); 1474 ASSERT(tcp->tcp_ack_tid == 0); 1475 1476 /* must have happened at the time of detaching the tcp */ 1477 ASSERT(tcp->tcp_ptpahn == NULL); 1478 ASSERT(tcp->tcp_flow_stopped == 0); 1479 ASSERT(tcp->tcp_time_wait_next == NULL); 1480 ASSERT(tcp->tcp_time_wait_prev == NULL); 1481 ASSERT(tcp->tcp_time_wait_expire == NULL); 1482 ASSERT(tcp->tcp_listener == NULL); 1483 1484 tcp->tcp_time_wait_expire = ddi_get_lbolt(); 1485 /* 1486 * The value computed below in tcp->tcp_time_wait_expire may 1487 * appear negative or wrap around. That is ok since our 1488 * interest is only in the difference between the current lbolt 1489 * value and tcp->tcp_time_wait_expire. But the value should not 1490 * be zero, since it means the tcp is not in the TIME_WAIT list. 1491 * The corresponding comparison in tcp_time_wait_collector() uses 1492 * modular arithmetic. 
1493 */ 1494 tcp->tcp_time_wait_expire += 1495 drv_usectohz(tcps->tcps_time_wait_interval * 1000); 1496 if (tcp->tcp_time_wait_expire == 0) 1497 tcp->tcp_time_wait_expire = 1; 1498 1499 ASSERT(TCP_IS_DETACHED(tcp)); 1500 ASSERT(tcp->tcp_state == TCPS_TIME_WAIT); 1501 ASSERT(tcp->tcp_time_wait_next == NULL); 1502 ASSERT(tcp->tcp_time_wait_prev == NULL); 1503 TCP_DBGSTAT(tcps, tcp_time_wait); 1504 1505 mutex_enter(&tcp_time_wait->tcp_time_wait_lock); 1506 if (tcp_time_wait->tcp_time_wait_head == NULL) { 1507 ASSERT(tcp_time_wait->tcp_time_wait_tail == NULL); 1508 tcp_time_wait->tcp_time_wait_head = tcp; 1509 } else { 1510 ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL); 1511 ASSERT(tcp_time_wait->tcp_time_wait_tail->tcp_state == 1512 TCPS_TIME_WAIT); 1513 tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = tcp; 1514 tcp->tcp_time_wait_prev = tcp_time_wait->tcp_time_wait_tail; 1515 } 1516 tcp_time_wait->tcp_time_wait_tail = tcp; 1517 mutex_exit(&tcp_time_wait->tcp_time_wait_lock); 1518 } 1519 1520 /* ARGSUSED */ 1521 void 1522 tcp_timewait_output(void *arg, mblk_t *mp, void *arg2) 1523 { 1524 conn_t *connp = (conn_t *)arg; 1525 tcp_t *tcp = connp->conn_tcp; 1526 tcp_stack_t *tcps = tcp->tcp_tcps; 1527 1528 ASSERT(tcp != NULL); 1529 if (tcp->tcp_state == TCPS_CLOSED) { 1530 return; 1531 } 1532 1533 ASSERT((tcp->tcp_family == AF_INET && 1534 tcp->tcp_ipversion == IPV4_VERSION) || 1535 (tcp->tcp_family == AF_INET6 && 1536 (tcp->tcp_ipversion == IPV4_VERSION || 1537 tcp->tcp_ipversion == IPV6_VERSION))); 1538 ASSERT(!tcp->tcp_listener); 1539 1540 TCP_STAT(tcps, tcp_time_wait_reap); 1541 ASSERT(TCP_IS_DETACHED(tcp)); 1542 1543 /* 1544 * Because they have no upstream client to rebind or tcp_close() 1545 * them later, we axe the connection here and now. 1546 */ 1547 tcp_close_detached(tcp); 1548 } 1549 1550 /* 1551 * Remove cached/latched IPsec references. 
1552 */ 1553 void 1554 tcp_ipsec_cleanup(tcp_t *tcp) 1555 { 1556 conn_t *connp = tcp->tcp_connp; 1557 1558 ASSERT(connp->conn_flags & IPCL_TCPCONN); 1559 1560 if (connp->conn_latch != NULL) { 1561 IPLATCH_REFRELE(connp->conn_latch, 1562 connp->conn_netstack); 1563 connp->conn_latch = NULL; 1564 } 1565 if (connp->conn_policy != NULL) { 1566 IPPH_REFRELE(connp->conn_policy, connp->conn_netstack); 1567 connp->conn_policy = NULL; 1568 } 1569 } 1570 1571 /* 1572 * Cleaup before placing on free list. 1573 * Disassociate from the netstack/tcp_stack_t since the freelist 1574 * is per squeue and not per netstack. 1575 */ 1576 void 1577 tcp_cleanup(tcp_t *tcp) 1578 { 1579 mblk_t *mp; 1580 char *tcp_iphc; 1581 int tcp_iphc_len; 1582 int tcp_hdr_grown; 1583 tcp_sack_info_t *tcp_sack_info; 1584 conn_t *connp = tcp->tcp_connp; 1585 tcp_stack_t *tcps = tcp->tcp_tcps; 1586 netstack_t *ns = tcps->tcps_netstack; 1587 mblk_t *tcp_rsrv_mp; 1588 1589 tcp_bind_hash_remove(tcp); 1590 1591 /* Cleanup that which needs the netstack first */ 1592 tcp_ipsec_cleanup(tcp); 1593 1594 tcp_free(tcp); 1595 1596 /* Release any SSL context */ 1597 if (tcp->tcp_kssl_ent != NULL) { 1598 kssl_release_ent(tcp->tcp_kssl_ent, NULL, KSSL_NO_PROXY); 1599 tcp->tcp_kssl_ent = NULL; 1600 } 1601 1602 if (tcp->tcp_kssl_ctx != NULL) { 1603 kssl_release_ctx(tcp->tcp_kssl_ctx); 1604 tcp->tcp_kssl_ctx = NULL; 1605 } 1606 tcp->tcp_kssl_pending = B_FALSE; 1607 1608 conn_delete_ire(connp, NULL); 1609 1610 /* 1611 * Since we will bzero the entire structure, we need to 1612 * remove it and reinsert it in global hash list. We 1613 * know the walkers can't get to this conn because we 1614 * had set CONDEMNED flag earlier and checked reference 1615 * under conn_lock so walker won't pick it and when we 1616 * go the ipcl_globalhash_remove() below, no walker 1617 * can get to it. 1618 */ 1619 ipcl_globalhash_remove(connp); 1620 1621 /* 1622 * Now it is safe to decrement the reference counts. 
1623 * This might be the last reference on the netstack and TCPS 1624 * in which case it will cause the tcp_g_q_close and 1625 * the freeing of the IP Instance. 1626 */ 1627 connp->conn_netstack = NULL; 1628 netstack_rele(ns); 1629 ASSERT(tcps != NULL); 1630 tcp->tcp_tcps = NULL; 1631 TCPS_REFRELE(tcps); 1632 1633 /* Save some state */ 1634 mp = tcp->tcp_timercache; 1635 1636 tcp_sack_info = tcp->tcp_sack_info; 1637 tcp_iphc = tcp->tcp_iphc; 1638 tcp_iphc_len = tcp->tcp_iphc_len; 1639 tcp_hdr_grown = tcp->tcp_hdr_grown; 1640 tcp_rsrv_mp = tcp->tcp_rsrv_mp; 1641 1642 if (connp->conn_cred != NULL) { 1643 crfree(connp->conn_cred); 1644 connp->conn_cred = NULL; 1645 } 1646 if (connp->conn_effective_cred != NULL) { 1647 crfree(connp->conn_effective_cred); 1648 connp->conn_effective_cred = NULL; 1649 } 1650 ipcl_conn_cleanup(connp); 1651 connp->conn_flags = IPCL_TCPCONN; 1652 bzero(tcp, sizeof (tcp_t)); 1653 1654 /* restore the state */ 1655 tcp->tcp_timercache = mp; 1656 1657 tcp->tcp_sack_info = tcp_sack_info; 1658 tcp->tcp_iphc = tcp_iphc; 1659 tcp->tcp_iphc_len = tcp_iphc_len; 1660 tcp->tcp_hdr_grown = tcp_hdr_grown; 1661 tcp->tcp_rsrv_mp = tcp_rsrv_mp; 1662 1663 tcp->tcp_connp = connp; 1664 1665 ASSERT(connp->conn_tcp == tcp); 1666 ASSERT(connp->conn_flags & IPCL_TCPCONN); 1667 connp->conn_state_flags = CONN_INCIPIENT; 1668 ASSERT(connp->conn_ulp == IPPROTO_TCP); 1669 ASSERT(connp->conn_ref == 1); 1670 } 1671 1672 /* 1673 * Blows away all tcps whose TIME_WAIT has expired. List traversal 1674 * is done forwards from the head. 1675 * This walks all stack instances since 1676 * tcp_time_wait remains global across all stacks. 
 *
 * 'arg' is the squeue_t whose private TCP state (tcp_squeue_priv_t)
 * holds the TIME_WAIT list and the free list processed here. The
 * function re-arms itself with timeout_generic() before returning.
 */
/* ARGSUSED */
void
tcp_time_wait_collector(void *arg)
{
	tcp_t *tcp;
	clock_t now;
	mblk_t *mp;
	conn_t *connp;
	kmutex_t *lock;
	boolean_t removed;

	squeue_t *sqp = (squeue_t *)arg;
	tcp_squeue_priv_t *tcp_time_wait =
	    *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP));

	mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
	tcp_time_wait->tcp_time_wait_tid = 0;

	/*
	 * Drain the free list when its head still carries the
	 * tcp_in_free_list mark that the end of this function sets on
	 * every run. NOTE(review): this looks like an ageing scheme for
	 * free-list entries that were not reused since the last run --
	 * confirm against the free-list consumer.
	 */
	if (tcp_time_wait->tcp_free_list != NULL &&
	    tcp_time_wait->tcp_free_list->tcp_in_free_list == B_TRUE) {
		TCP_G_STAT(tcp_freelist_cleanup);
		while ((tcp = tcp_time_wait->tcp_free_list) != NULL) {
			tcp_time_wait->tcp_free_list = tcp->tcp_time_wait_next;
			tcp->tcp_time_wait_next = NULL;
			tcp_time_wait->tcp_free_list_cnt--;
			ASSERT(tcp->tcp_tcps == NULL);
			CONN_DEC_REF(tcp->tcp_connp);
		}
		ASSERT(tcp_time_wait->tcp_free_list_cnt == 0);
	}

	/*
	 * In order to reap time waits reliably, we should use a
	 * source of time that is not adjustable by the user -- hence
	 * the call to ddi_get_lbolt().
	 */
	now = ddi_get_lbolt();
	while ((tcp = tcp_time_wait->tcp_time_wait_head) != NULL) {
		/*
		 * Compare times using modular arithmetic, since
		 * lbolt can wrapover.
		 */
		if ((now - tcp->tcp_time_wait_expire) < 0) {
			break;
		}

		removed = tcp_time_wait_remove(tcp, tcp_time_wait);
		ASSERT(removed);

		connp = tcp->tcp_connp;
		ASSERT(connp->conn_fanout != NULL);
		lock = &connp->conn_fanout->connf_lock;
		/*
		 * This is essentially a TW reclaim fast path optimization for
		 * performance where the timewait collector checks under the
		 * fanout lock (so that no one else can get access to the
		 * conn_t) that the refcnt is 2 i.e. one for TCP and one for
		 * the classifier hash list. If ref count is indeed 2, we can
		 * just remove the conn under the fanout lock and avoid
		 * cleaning up the conn under the squeue, provided that
		 * clustering callbacks are not enabled. If clustering is
		 * enabled, we need to make the clustering callback before
		 * setting the CONDEMNED flag and after dropping all locks and
		 * so we forego this optimization and fall back to the slow
		 * path. Also please see the comments in tcp_closei_local
		 * regarding the refcnt logic.
		 *
		 * Since we are holding the tcp_time_wait_lock, its better
		 * not to block on the fanout_lock because other connections
		 * can't add themselves to time_wait list. So we do a
		 * tryenter instead of mutex_enter.
		 */
		if (mutex_tryenter(lock)) {
			mutex_enter(&connp->conn_lock);
			if ((connp->conn_ref == 2) &&
			    (cl_inet_disconnect == NULL)) {
				ipcl_hash_remove_locked(connp,
				    connp->conn_fanout);
				/*
				 * Set the CONDEMNED flag now itself so that
				 * the refcnt cannot increase due to any
				 * walker. But we have still not cleaned up
				 * conn_ire_cache. This is still ok since
				 * we are going to clean it up in tcp_cleanup
				 * immediately and any interface unplumb
				 * thread will wait till the ire is blown away
				 */
				connp->conn_state_flags |= CONN_CONDEMNED;
				mutex_exit(lock);
				mutex_exit(&connp->conn_lock);
				if (tcp_time_wait->tcp_free_list_cnt <
				    tcp_free_list_max_cnt) {
					/* Add to head of tcp_free_list */
					/*
					 * The time-wait lock is dropped
					 * around tcp_cleanup() and retaken
					 * before the list is touched; the
					 * 'continue' then re-enters the loop
					 * with the lock held.
					 */
					mutex_exit(
					    &tcp_time_wait->tcp_time_wait_lock);
					tcp_cleanup(tcp);
					ASSERT(connp->conn_latch == NULL);
					ASSERT(connp->conn_policy == NULL);
					ASSERT(tcp->tcp_tcps == NULL);
					ASSERT(connp->conn_netstack == NULL);

					mutex_enter(
					    &tcp_time_wait->tcp_time_wait_lock);
					tcp->tcp_time_wait_next =
					    tcp_time_wait->tcp_free_list;
					tcp_time_wait->tcp_free_list = tcp;
					tcp_time_wait->tcp_free_list_cnt++;
					continue;
				} else {
					/* Do not add to tcp_free_list */
					mutex_exit(
					    &tcp_time_wait->tcp_time_wait_lock);
					tcp_bind_hash_remove(tcp);
					conn_delete_ire(tcp->tcp_connp, NULL);
					tcp_ipsec_cleanup(tcp);
					CONN_DEC_REF(tcp->tcp_connp);
				}
			} else {
				/*
				 * Slow path: take a reference and hand the
				 * conn to its squeue for
				 * tcp_timewait_output().
				 */
				CONN_INC_REF_LOCKED(connp);
				mutex_exit(lock);
				mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
				mutex_exit(&connp->conn_lock);
				/*
				 * We can reuse the closemp here since conn has
				 * detached (otherwise we wouldn't even be in
				 * time_wait list). tcp_closemp_used can safely
				 * be changed without taking a lock as no other
				 * thread can concurrently access it at this
				 * point in the connection lifecycle.
				 */

				if (tcp->tcp_closemp.b_prev == NULL)
					tcp->tcp_closemp_used = B_TRUE;
				else
					cmn_err(CE_PANIC,
					    "tcp_timewait_collector: "
					    "concurrent use of tcp_closemp: "
					    "connp %p tcp %p\n", (void *)connp,
					    (void *)tcp);

				TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
				mp = &tcp->tcp_closemp;
				SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
				    tcp_timewait_output, connp,
				    SQ_FILL, SQTAG_TCP_TIMEWAIT);
			}
		} else {
			/*
			 * The fanout lock was busy: same slow path as above,
			 * but without the fanout lock held.
			 */
			mutex_enter(&connp->conn_lock);
			CONN_INC_REF_LOCKED(connp);
			mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
			mutex_exit(&connp->conn_lock);
			/*
			 * We can reuse the closemp here since conn has
			 * detached (otherwise we wouldn't even be in
			 * time_wait list). tcp_closemp_used can safely
			 * be changed without taking a lock as no other
			 * thread can concurrently access it at this
			 * point in the connection lifecycle.
			 */

			if (tcp->tcp_closemp.b_prev == NULL)
				tcp->tcp_closemp_used = B_TRUE;
			else
				cmn_err(CE_PANIC, "tcp_timewait_collector: "
				    "concurrent use of tcp_closemp: "
				    "connp %p tcp %p\n", (void *)connp,
				    (void *)tcp);

			TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
			mp = &tcp->tcp_closemp;
			SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
			    tcp_timewait_output, connp,
			    SQ_FILL, SQTAG_TCP_TIMEWAIT);
		}
		/* Every non-'continue' path above dropped the list lock. */
		mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
	}

	/* Mark the current free-list head for the check at the top. */
	if (tcp_time_wait->tcp_free_list != NULL)
		tcp_time_wait->tcp_free_list->tcp_in_free_list = B_TRUE;

	tcp_time_wait->tcp_time_wait_tid =
	    timeout_generic(CALLOUT_NORMAL, tcp_time_wait_collector, sqp,
	    TICK_TO_NSEC(TCP_TIME_WAIT_DELAY), CALLOUT_TCP_RESOLUTION,
	    CALLOUT_FLAG_ROUNDUP);
	mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
}

/*
 * Reply to a client's T_CONN_RES TPI message. This function
 * is used only for TLI/XTI listener.
Sockfs sends T_CONN_RES 1868 * on the acceptor STREAM and processed in tcp_wput_accept(). 1869 * Read the block comment on top of tcp_conn_request(). 1870 */ 1871 static void 1872 tcp_tli_accept(tcp_t *listener, mblk_t *mp) 1873 { 1874 tcp_t *acceptor; 1875 tcp_t *eager; 1876 tcp_t *tcp; 1877 struct T_conn_res *tcr; 1878 t_uscalar_t acceptor_id; 1879 t_scalar_t seqnum; 1880 mblk_t *opt_mp = NULL; /* T_OPTMGMT_REQ messages */ 1881 struct tcp_options *tcpopt; 1882 mblk_t *ok_mp; 1883 mblk_t *mp1; 1884 tcp_stack_t *tcps = listener->tcp_tcps; 1885 int error; 1886 1887 if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) { 1888 tcp_err_ack(listener, mp, TPROTO, 0); 1889 return; 1890 } 1891 tcr = (struct T_conn_res *)mp->b_rptr; 1892 1893 /* 1894 * Under ILP32 the stream head points tcr->ACCEPTOR_id at the 1895 * read side queue of the streams device underneath us i.e. the 1896 * read side queue of 'ip'. Since we can't deference QUEUE_ptr we 1897 * look it up in the queue_hash. Under LP64 it sends down the 1898 * minor_t of the accepting endpoint. 1899 * 1900 * Once the acceptor/eager are modified (in tcp_accept_swap) the 1901 * fanout hash lock is held. 1902 * This prevents any thread from entering the acceptor queue from 1903 * below (since it has not been hard bound yet i.e. any inbound 1904 * packets will arrive on the listener or default tcp queue and 1905 * go through tcp_lookup). 1906 * The CONN_INC_REF will prevent the acceptor from closing. 1907 * 1908 * XXX It is still possible for a tli application to send down data 1909 * on the accepting stream while another thread calls t_accept. 1910 * This should not be a problem for well-behaved applications since 1911 * the T_OK_ACK is sent after the queue swapping is completed. 1912 * 1913 * If the accepting fd is the same as the listening fd, avoid 1914 * queue hash lookup since that will return an eager listener in a 1915 * already established state. 
1916 */ 1917 acceptor_id = tcr->ACCEPTOR_id; 1918 mutex_enter(&listener->tcp_eager_lock); 1919 if (listener->tcp_acceptor_id == acceptor_id) { 1920 eager = listener->tcp_eager_next_q; 1921 /* only count how many T_CONN_INDs so don't count q0 */ 1922 if ((listener->tcp_conn_req_cnt_q != 1) || 1923 (eager->tcp_conn_req_seqnum != tcr->SEQ_number)) { 1924 mutex_exit(&listener->tcp_eager_lock); 1925 tcp_err_ack(listener, mp, TBADF, 0); 1926 return; 1927 } 1928 if (listener->tcp_conn_req_cnt_q0 != 0) { 1929 /* Throw away all the eagers on q0. */ 1930 tcp_eager_cleanup(listener, 1); 1931 } 1932 if (listener->tcp_syn_defense) { 1933 listener->tcp_syn_defense = B_FALSE; 1934 if (listener->tcp_ip_addr_cache != NULL) { 1935 kmem_free(listener->tcp_ip_addr_cache, 1936 IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t)); 1937 listener->tcp_ip_addr_cache = NULL; 1938 } 1939 } 1940 /* 1941 * Transfer tcp_conn_req_max to the eager so that when 1942 * a disconnect occurs we can revert the endpoint to the 1943 * listen state. 1944 */ 1945 eager->tcp_conn_req_max = listener->tcp_conn_req_max; 1946 ASSERT(listener->tcp_conn_req_cnt_q0 == 0); 1947 /* 1948 * Get a reference on the acceptor just like the 1949 * tcp_acceptor_hash_lookup below. 1950 */ 1951 acceptor = listener; 1952 CONN_INC_REF(acceptor->tcp_connp); 1953 } else { 1954 acceptor = tcp_acceptor_hash_lookup(acceptor_id, tcps); 1955 if (acceptor == NULL) { 1956 if (listener->tcp_debug) { 1957 (void) strlog(TCP_MOD_ID, 0, 1, 1958 SL_ERROR|SL_TRACE, 1959 "tcp_accept: did not find acceptor 0x%x\n", 1960 acceptor_id); 1961 } 1962 mutex_exit(&listener->tcp_eager_lock); 1963 tcp_err_ack(listener, mp, TPROVMISMATCH, 0); 1964 return; 1965 } 1966 /* 1967 * Verify acceptor state. The acceptable states for an acceptor 1968 * include TCPS_IDLE and TCPS_BOUND. 
1969 */ 1970 switch (acceptor->tcp_state) { 1971 case TCPS_IDLE: 1972 /* FALLTHRU */ 1973 case TCPS_BOUND: 1974 break; 1975 default: 1976 CONN_DEC_REF(acceptor->tcp_connp); 1977 mutex_exit(&listener->tcp_eager_lock); 1978 tcp_err_ack(listener, mp, TOUTSTATE, 0); 1979 return; 1980 } 1981 } 1982 1983 /* The listener must be in TCPS_LISTEN */ 1984 if (listener->tcp_state != TCPS_LISTEN) { 1985 CONN_DEC_REF(acceptor->tcp_connp); 1986 mutex_exit(&listener->tcp_eager_lock); 1987 tcp_err_ack(listener, mp, TOUTSTATE, 0); 1988 return; 1989 } 1990 1991 /* 1992 * Rendezvous with an eager connection request packet hanging off 1993 * 'tcp' that has the 'seqnum' tag. We tagged the detached open 1994 * tcp structure when the connection packet arrived in 1995 * tcp_conn_request(). 1996 */ 1997 seqnum = tcr->SEQ_number; 1998 eager = listener; 1999 do { 2000 eager = eager->tcp_eager_next_q; 2001 if (eager == NULL) { 2002 CONN_DEC_REF(acceptor->tcp_connp); 2003 mutex_exit(&listener->tcp_eager_lock); 2004 tcp_err_ack(listener, mp, TBADSEQ, 0); 2005 return; 2006 } 2007 } while (eager->tcp_conn_req_seqnum != seqnum); 2008 mutex_exit(&listener->tcp_eager_lock); 2009 2010 /* 2011 * At this point, both acceptor and listener have 2 ref 2012 * that they begin with. Acceptor has one additional ref 2013 * we placed in lookup while listener has 3 additional 2014 * ref for being behind the squeue (tcp_accept() is 2015 * done on listener's squeue); being in classifier hash; 2016 * and eager's ref on listener. 2017 */ 2018 ASSERT(listener->tcp_connp->conn_ref >= 5); 2019 ASSERT(acceptor->tcp_connp->conn_ref >= 3); 2020 2021 /* 2022 * The eager at this point is set in its own squeue and 2023 * could easily have been killed (tcp_accept_finish will 2024 * deal with that) because of a TH_RST so we can only 2025 * ASSERT for a single ref. 
2026 */ 2027 ASSERT(eager->tcp_connp->conn_ref >= 1); 2028 2029 /* Pre allocate the stroptions mblk also */ 2030 opt_mp = allocb(MAX(sizeof (struct tcp_options), 2031 sizeof (struct T_conn_res)), BPRI_HI); 2032 if (opt_mp == NULL) { 2033 CONN_DEC_REF(acceptor->tcp_connp); 2034 CONN_DEC_REF(eager->tcp_connp); 2035 tcp_err_ack(listener, mp, TSYSERR, ENOMEM); 2036 return; 2037 } 2038 DB_TYPE(opt_mp) = M_SETOPTS; 2039 opt_mp->b_wptr += sizeof (struct tcp_options); 2040 tcpopt = (struct tcp_options *)opt_mp->b_rptr; 2041 tcpopt->to_flags = 0; 2042 2043 /* 2044 * Prepare for inheriting IPV6_BOUND_IF and IPV6_RECVPKTINFO 2045 * from listener to acceptor. 2046 */ 2047 if (listener->tcp_bound_if != 0) { 2048 tcpopt->to_flags |= TCPOPT_BOUNDIF; 2049 tcpopt->to_boundif = listener->tcp_bound_if; 2050 } 2051 if (listener->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO) { 2052 tcpopt->to_flags |= TCPOPT_RECVPKTINFO; 2053 } 2054 2055 /* Re-use mp1 to hold a copy of mp, in case reallocb fails */ 2056 if ((mp1 = copymsg(mp)) == NULL) { 2057 CONN_DEC_REF(acceptor->tcp_connp); 2058 CONN_DEC_REF(eager->tcp_connp); 2059 freemsg(opt_mp); 2060 tcp_err_ack(listener, mp, TSYSERR, ENOMEM); 2061 return; 2062 } 2063 2064 tcr = (struct T_conn_res *)mp1->b_rptr; 2065 2066 /* 2067 * This is an expanded version of mi_tpi_ok_ack_alloc() 2068 * which allocates a larger mblk and appends the new 2069 * local address to the ok_ack. The address is copied by 2070 * soaccept() for getsockname(). 2071 */ 2072 { 2073 int extra; 2074 2075 extra = (eager->tcp_family == AF_INET) ? 2076 sizeof (sin_t) : sizeof (sin6_t); 2077 2078 /* 2079 * Try to re-use mp, if possible. Otherwise, allocate 2080 * an mblk and return it as ok_mp. In any case, mp 2081 * is no longer usable upon return. 
2082 */ 2083 if ((ok_mp = mi_tpi_ok_ack_alloc_extra(mp, extra)) == NULL) { 2084 CONN_DEC_REF(acceptor->tcp_connp); 2085 CONN_DEC_REF(eager->tcp_connp); 2086 freemsg(opt_mp); 2087 /* Original mp has been freed by now, so use mp1 */ 2088 tcp_err_ack(listener, mp1, TSYSERR, ENOMEM); 2089 return; 2090 } 2091 2092 mp = NULL; /* We should never use mp after this point */ 2093 2094 switch (extra) { 2095 case sizeof (sin_t): { 2096 sin_t *sin = (sin_t *)ok_mp->b_wptr; 2097 2098 ok_mp->b_wptr += extra; 2099 sin->sin_family = AF_INET; 2100 sin->sin_port = eager->tcp_lport; 2101 sin->sin_addr.s_addr = 2102 eager->tcp_ipha->ipha_src; 2103 break; 2104 } 2105 case sizeof (sin6_t): { 2106 sin6_t *sin6 = (sin6_t *)ok_mp->b_wptr; 2107 2108 ok_mp->b_wptr += extra; 2109 sin6->sin6_family = AF_INET6; 2110 sin6->sin6_port = eager->tcp_lport; 2111 if (eager->tcp_ipversion == IPV4_VERSION) { 2112 sin6->sin6_flowinfo = 0; 2113 IN6_IPADDR_TO_V4MAPPED( 2114 eager->tcp_ipha->ipha_src, 2115 &sin6->sin6_addr); 2116 } else { 2117 ASSERT(eager->tcp_ip6h != NULL); 2118 sin6->sin6_flowinfo = 2119 eager->tcp_ip6h->ip6_vcf & 2120 ~IPV6_VERS_AND_FLOW_MASK; 2121 sin6->sin6_addr = 2122 eager->tcp_ip6h->ip6_src; 2123 } 2124 sin6->sin6_scope_id = 0; 2125 sin6->__sin6_src_id = 0; 2126 break; 2127 } 2128 default: 2129 break; 2130 } 2131 ASSERT(ok_mp->b_wptr <= ok_mp->b_datap->db_lim); 2132 } 2133 2134 /* 2135 * If there are no options we know that the T_CONN_RES will 2136 * succeed. However, we can't send the T_OK_ACK upstream until 2137 * the tcp_accept_swap is done since it would be dangerous to 2138 * let the application start using the new fd prior to the swap. 
2139 */ 2140 error = tcp_accept_swap(listener, acceptor, eager); 2141 if (error != 0) { 2142 CONN_DEC_REF(acceptor->tcp_connp); 2143 CONN_DEC_REF(eager->tcp_connp); 2144 freemsg(ok_mp); 2145 /* Original mp has been freed by now, so use mp1 */ 2146 tcp_err_ack(listener, mp1, TSYSERR, error); 2147 return; 2148 } 2149 2150 /* 2151 * tcp_accept_swap unlinks eager from listener but does not drop 2152 * the eager's reference on the listener. 2153 */ 2154 ASSERT(eager->tcp_listener == NULL); 2155 ASSERT(listener->tcp_connp->conn_ref >= 5); 2156 2157 /* 2158 * The eager is now associated with its own queue. Insert in 2159 * the hash so that the connection can be reused for a future 2160 * T_CONN_RES. 2161 */ 2162 tcp_acceptor_hash_insert(acceptor_id, eager); 2163 2164 /* 2165 * We now do the processing of options with T_CONN_RES. 2166 * We delay till now since we wanted to have queue to pass to 2167 * option processing routines that points back to the right 2168 * instance structure which does not happen until after 2169 * tcp_accept_swap(). 2170 * 2171 * Note: 2172 * The sanity of the logic here assumes that whatever options 2173 * are appropriate to inherit from listner=>eager are done 2174 * before this point, and whatever were to be overridden (or not) 2175 * in transfer logic from eager=>acceptor in tcp_accept_swap(). 2176 * [ Warning: acceptor endpoint can have T_OPTMGMT_REQ done to it 2177 * before its ACCEPTOR_id comes down in T_CONN_RES ] 2178 * This may not be true at this point in time but can be fixed 2179 * independently. This option processing code starts with 2180 * the instantiated acceptor instance and the final queue at 2181 * this point. 
2182 */ 2183 2184 if (tcr->OPT_length != 0) { 2185 /* Options to process */ 2186 int t_error = 0; 2187 int sys_error = 0; 2188 int do_disconnect = 0; 2189 2190 if (tcp_conprim_opt_process(eager, mp1, 2191 &do_disconnect, &t_error, &sys_error) < 0) { 2192 eager->tcp_accept_error = 1; 2193 if (do_disconnect) { 2194 /* 2195 * An option failed which does not allow 2196 * connection to be accepted. 2197 * 2198 * We allow T_CONN_RES to succeed and 2199 * put a T_DISCON_IND on the eager queue. 2200 */ 2201 ASSERT(t_error == 0 && sys_error == 0); 2202 eager->tcp_send_discon_ind = 1; 2203 } else { 2204 ASSERT(t_error != 0); 2205 freemsg(ok_mp); 2206 /* 2207 * Original mp was either freed or set 2208 * to ok_mp above, so use mp1 instead. 2209 */ 2210 tcp_err_ack(listener, mp1, t_error, sys_error); 2211 goto finish; 2212 } 2213 } 2214 /* 2215 * Most likely success in setting options (except if 2216 * eager->tcp_send_discon_ind set). 2217 * mp1 option buffer represented by OPT_length/offset 2218 * potentially modified and contains results of setting 2219 * options at this point 2220 */ 2221 } 2222 2223 /* We no longer need mp1, since all options processing has passed */ 2224 freemsg(mp1); 2225 2226 putnext(listener->tcp_rq, ok_mp); 2227 2228 mutex_enter(&listener->tcp_eager_lock); 2229 if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) { 2230 tcp_t *tail; 2231 mblk_t *conn_ind; 2232 2233 /* 2234 * This path should not be executed if listener and 2235 * acceptor streams are the same. 2236 */ 2237 ASSERT(listener != acceptor); 2238 2239 tcp = listener->tcp_eager_prev_q0; 2240 /* 2241 * listener->tcp_eager_prev_q0 points to the TAIL of the 2242 * deferred T_conn_ind queue. We need to get to the head of 2243 * the queue in order to send up T_conn_ind the same order as 2244 * how the 3WHS is completed. 
2245 */ 2246 while (tcp != listener) { 2247 if (!tcp->tcp_eager_prev_q0->tcp_conn_def_q0) 2248 break; 2249 else 2250 tcp = tcp->tcp_eager_prev_q0; 2251 } 2252 ASSERT(tcp != listener); 2253 conn_ind = tcp->tcp_conn.tcp_eager_conn_ind; 2254 ASSERT(conn_ind != NULL); 2255 tcp->tcp_conn.tcp_eager_conn_ind = NULL; 2256 2257 /* Move from q0 to q */ 2258 ASSERT(listener->tcp_conn_req_cnt_q0 > 0); 2259 listener->tcp_conn_req_cnt_q0--; 2260 listener->tcp_conn_req_cnt_q++; 2261 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = 2262 tcp->tcp_eager_prev_q0; 2263 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = 2264 tcp->tcp_eager_next_q0; 2265 tcp->tcp_eager_prev_q0 = NULL; 2266 tcp->tcp_eager_next_q0 = NULL; 2267 tcp->tcp_conn_def_q0 = B_FALSE; 2268 2269 /* Make sure the tcp isn't in the list of droppables */ 2270 ASSERT(tcp->tcp_eager_next_drop_q0 == NULL && 2271 tcp->tcp_eager_prev_drop_q0 == NULL); 2272 2273 /* 2274 * Insert at end of the queue because sockfs sends 2275 * down T_CONN_RES in chronological order. Leaving 2276 * the older conn indications at front of the queue 2277 * helps reducing search time. 2278 */ 2279 tail = listener->tcp_eager_last_q; 2280 if (tail != NULL) 2281 tail->tcp_eager_next_q = tcp; 2282 else 2283 listener->tcp_eager_next_q = tcp; 2284 listener->tcp_eager_last_q = tcp; 2285 tcp->tcp_eager_next_q = NULL; 2286 mutex_exit(&listener->tcp_eager_lock); 2287 putnext(tcp->tcp_rq, conn_ind); 2288 } else { 2289 mutex_exit(&listener->tcp_eager_lock); 2290 } 2291 2292 /* 2293 * Done with the acceptor - free it 2294 * 2295 * Note: from this point on, no access to listener should be made 2296 * as listener can be equal to acceptor. 
2297 */ 2298 finish: 2299 ASSERT(acceptor->tcp_detached); 2300 ASSERT(tcps->tcps_g_q != NULL); 2301 ASSERT(!IPCL_IS_NONSTR(acceptor->tcp_connp)); 2302 acceptor->tcp_rq = tcps->tcps_g_q; 2303 acceptor->tcp_wq = WR(tcps->tcps_g_q); 2304 (void) tcp_clean_death(acceptor, 0, 2); 2305 CONN_DEC_REF(acceptor->tcp_connp); 2306 2307 /* 2308 * In case we already received a FIN we have to make tcp_rput send 2309 * the ordrel_ind. This will also send up a window update if the window 2310 * has opened up. 2311 * 2312 * In the normal case of a successful connection acceptance 2313 * we give the O_T_BIND_REQ to the read side put procedure as an 2314 * indication that this was just accepted. This tells tcp_rput to 2315 * pass up any data queued in tcp_rcv_list. 2316 * 2317 * In the fringe case where options sent with T_CONN_RES failed and 2318 * we required, we would be indicating a T_DISCON_IND to blow 2319 * away this connection. 2320 */ 2321 2322 /* 2323 * XXX: we currently have a problem if XTI application closes the 2324 * acceptor stream in between. This problem exists in on10-gate also 2325 * and is well know but nothing can be done short of major rewrite 2326 * to fix it. Now it is possible to take care of it by assigning TLI/XTI 2327 * eager same squeue as listener (we can distinguish non socket 2328 * listeners at the time of handling a SYN in tcp_conn_request) 2329 * and do most of the work that tcp_accept_finish does here itself 2330 * and then get behind the acceptor squeue to access the acceptor 2331 * queue. 2332 */ 2333 /* 2334 * We already have a ref on tcp so no need to do one before squeue_enter 2335 */ 2336 SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, opt_mp, tcp_accept_finish, 2337 eager->tcp_connp, SQ_FILL, SQTAG_TCP_ACCEPT_FINISH); 2338 } 2339 2340 /* 2341 * Swap information between the eager and acceptor for a TLI/XTI client. 
 * The sockfs accept is done on the acceptor stream and control goes
 * through tcp_wput_accept() and tcp_accept()/tcp_accept_swap() is not
 * called. In either case, both the eager and listener are in their own
 * perimeter (squeue) and the code has to deal with potential race.
 *
 * See the block comment on top of tcp_accept() and tcp_wput_accept().
 *
 * Returns 0 on success. Returns ENOMEM, before eager or acceptor have
 * been modified, when the labeled effective credential cannot be
 * allocated. On success the eager has taken over the acceptor's
 * queues, device/minor, credentials and zone settings, the acceptor
 * is marked detached, and one reference on the acceptor's conn has
 * been dropped.
 */
static int
tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, tcp_t *eager)
{
	conn_t	*econnp, *aconnp;
	cred_t	*effective_cred = NULL;

	ASSERT(eager->tcp_rq == listener->tcp_rq);
	ASSERT(eager->tcp_detached && !acceptor->tcp_detached);
	ASSERT(!eager->tcp_hard_bound);
	ASSERT(!TCP_IS_SOCKET(acceptor));
	ASSERT(!TCP_IS_SOCKET(eager));
	ASSERT(!TCP_IS_SOCKET(listener));

	econnp = eager->tcp_connp;
	aconnp = acceptor->tcp_connp;

	/*
	 * Trusted Extensions may need to use a security label that is
	 * different from the acceptor's label on MLP and MAC-Exempt
	 * sockets. If this is the case, the required security label
	 * already exists in econnp->conn_effective_cred. Use this label
	 * to generate a new effective cred for the acceptor.
	 *
	 * We allow for potential application level retry attempts by
	 * checking for transient errors before modifying eager.
	 */
	if (is_system_labeled() &&
	    aconnp->conn_cred != NULL && econnp->conn_effective_cred != NULL) {
		effective_cred = copycred_from_tslabel(aconnp->conn_cred,
		    crgetlabel(econnp->conn_effective_cred), KM_NOSLEEP);
		if (effective_cred == NULL)
			return (ENOMEM);
	}

	acceptor->tcp_detached = B_TRUE;
	/*
	 * To permit stream re-use by TLI/XTI, the eager needs a copy of
	 * the acceptor id.
	 */
	eager->tcp_acceptor_id = acceptor->tcp_acceptor_id;

	/* remove eager from listen list... */
	mutex_enter(&listener->tcp_eager_lock);
	tcp_eager_unlink(eager);
	ASSERT(eager->tcp_eager_next_q == NULL &&
	    eager->tcp_eager_last_q == NULL);
	ASSERT(eager->tcp_eager_next_q0 == NULL &&
	    eager->tcp_eager_prev_q0 == NULL);
	mutex_exit(&listener->tcp_eager_lock);
	/* The eager takes over the acceptor's read/write queues. */
	eager->tcp_rq = acceptor->tcp_rq;
	eager->tcp_wq = acceptor->tcp_wq;

	eager->tcp_rq->q_ptr = econnp;
	eager->tcp_wq->q_ptr = econnp;

	/*
	 * In the TLI/XTI loopback case, we are inside the listener's squeue,
	 * which might be a different squeue from our peer TCP instance.
	 * For TCP Fusion, the peer expects that whenever tcp_detached is
	 * clear, our TCP queues point to the acceptor's queues.  Thus, use
	 * membar_producer() to ensure that the assignments of tcp_rq/tcp_wq
	 * above reach global visibility prior to the clearing of tcp_detached.
	 */
	membar_producer();
	eager->tcp_detached = B_FALSE;

	ASSERT(eager->tcp_ack_tid == 0);

	econnp->conn_dev = aconnp->conn_dev;
	econnp->conn_minor_arena = aconnp->conn_minor_arena;

	ASSERT(econnp->conn_minor_arena != NULL);
	/*
	 * Hand the acceptor's credential over to the eager: the eager's
	 * previous cred is released and the acceptor's pointer is cleared
	 * below, so no extra hold is taken here.
	 */
	if (eager->tcp_cred != NULL)
		crfree(eager->tcp_cred);
	eager->tcp_cred = econnp->conn_cred = aconnp->conn_cred;
	if (econnp->conn_effective_cred != NULL)
		crfree(econnp->conn_effective_cred);
	econnp->conn_effective_cred = effective_cred;
	aconnp->conn_cred = NULL;
	ASSERT(aconnp->conn_effective_cred == NULL);

	ASSERT(econnp->conn_netstack == aconnp->conn_netstack);
	ASSERT(eager->tcp_tcps == acceptor->tcp_tcps);

	econnp->conn_zoneid = aconnp->conn_zoneid;
	econnp->conn_allzones = aconnp->conn_allzones;

	aconnp->conn_mac_exempt = B_FALSE;

	/* Do the IPC initialization */
	CONN_INC_REF(econnp);

	econnp->conn_multicast_loop = aconnp->conn_multicast_loop;
	econnp->conn_af_isv6 = aconnp->conn_af_isv6;
	econnp->conn_pkt_isv6 = aconnp->conn_pkt_isv6;

	/* Done with old IPC. Drop its ref on its connp */
	CONN_DEC_REF(aconnp);
	return (0);
}


/*
 * Adapt to the information, such as rtt and rtt_sd, provided from the
 * ire cached in conn_cache_ire. If no ire cached, do a ire lookup.
 *
 * Checks for multicast and broadcast destination address.
 * Returns zero on failure; non-zero if ok.
 *
 * Note that the MSS calculation here is based on the info given in
 * the IRE.  We do not do any calculation based on TCP options.  They
 * will be handled in tcp_rput_other() and tcp_rput_data() when TCP
 * knows which options to use.
 *
 * Note on how TCP gets its parameters for a connection.
 *
 * When a tcp_t structure is allocated, it gets all the default parameters.
 * In tcp_adapt_ire(), it gets those metric parameters, like rtt, rtt_sd,
 * spipe, rpipe, ... from the route metrics.  Route metric overrides the
 * default.
 *
 * An incoming SYN with a multicast or broadcast destination address, is dropped
 * in 1 of 2 places.
 *
 * 1. If the packet was received over the wire it is dropped in
 * ip_rput_process_broadcast()
 *
 * 2. If the packet was received through internal IP loopback, i.e. the packet
 * was generated and received on the same machine, it is dropped in
 * ip_wput_local()
 *
 * An incoming SYN with a multicast or broadcast source address is always
 * dropped in tcp_adapt_ire. The same logic in tcp_adapt_ire also serves to
 * reject an attempt to connect to a broadcast or multicast (destination)
 * address.
2484 */ 2485 static int 2486 tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp) 2487 { 2488 ire_t *ire; 2489 ire_t *sire = NULL; 2490 iulp_t *ire_uinfo = NULL; 2491 uint32_t mss_max; 2492 uint32_t mss; 2493 boolean_t tcp_detached = TCP_IS_DETACHED(tcp); 2494 conn_t *connp = tcp->tcp_connp; 2495 boolean_t ire_cacheable = B_FALSE; 2496 zoneid_t zoneid = connp->conn_zoneid; 2497 int match_flags = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | 2498 MATCH_IRE_SECATTR; 2499 ts_label_t *tsl = crgetlabel(CONN_CRED(connp)); 2500 ill_t *ill = NULL; 2501 boolean_t incoming = (ire_mp == NULL); 2502 tcp_stack_t *tcps = tcp->tcp_tcps; 2503 ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; 2504 2505 ASSERT(connp->conn_ire_cache == NULL); 2506 2507 if (tcp->tcp_ipversion == IPV4_VERSION) { 2508 2509 if (CLASSD(tcp->tcp_connp->conn_rem)) { 2510 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards); 2511 return (0); 2512 } 2513 /* 2514 * If IP_NEXTHOP is set, then look for an IRE_CACHE 2515 * for the destination with the nexthop as gateway. 2516 * ire_ctable_lookup() is used because this particular 2517 * ire, if it exists, will be marked private. 2518 * If that is not available, use the interface ire 2519 * for the nexthop. 2520 * 2521 * TSol: tcp_update_label will detect label mismatches based 2522 * only on the destination's label, but that would not 2523 * detect label mismatches based on the security attributes 2524 * of routes or next hop gateway. Hence we need to pass the 2525 * label to ire_ftable_lookup below in order to locate the 2526 * right prefix (and/or) ire cache. Similarly we also need 2527 * pass the label to the ire_cache_lookup below to locate 2528 * the right ire that also matches on the label. 
2529 */ 2530 if (tcp->tcp_connp->conn_nexthop_set) { 2531 ire = ire_ctable_lookup(tcp->tcp_connp->conn_rem, 2532 tcp->tcp_connp->conn_nexthop_v4, 0, NULL, zoneid, 2533 tsl, MATCH_IRE_MARK_PRIVATE_ADDR | MATCH_IRE_GW, 2534 ipst); 2535 if (ire == NULL) { 2536 ire = ire_ftable_lookup( 2537 tcp->tcp_connp->conn_nexthop_v4, 2538 0, 0, IRE_INTERFACE, NULL, NULL, zoneid, 0, 2539 tsl, match_flags, ipst); 2540 if (ire == NULL) 2541 return (0); 2542 } else { 2543 ire_uinfo = &ire->ire_uinfo; 2544 } 2545 } else { 2546 ire = ire_cache_lookup(tcp->tcp_connp->conn_rem, 2547 zoneid, tsl, ipst); 2548 if (ire != NULL) { 2549 ire_cacheable = B_TRUE; 2550 ire_uinfo = (ire_mp != NULL) ? 2551 &((ire_t *)ire_mp->b_rptr)->ire_uinfo: 2552 &ire->ire_uinfo; 2553 2554 } else { 2555 if (ire_mp == NULL) { 2556 ire = ire_ftable_lookup( 2557 tcp->tcp_connp->conn_rem, 2558 0, 0, 0, NULL, &sire, zoneid, 0, 2559 tsl, (MATCH_IRE_RECURSIVE | 2560 MATCH_IRE_DEFAULT), ipst); 2561 if (ire == NULL) 2562 return (0); 2563 ire_uinfo = (sire != NULL) ? 2564 &sire->ire_uinfo : 2565 &ire->ire_uinfo; 2566 } else { 2567 ire = (ire_t *)ire_mp->b_rptr; 2568 ire_uinfo = 2569 &((ire_t *) 2570 ire_mp->b_rptr)->ire_uinfo; 2571 } 2572 } 2573 } 2574 ASSERT(ire != NULL); 2575 2576 if ((ire->ire_src_addr == INADDR_ANY) || 2577 (ire->ire_type & IRE_BROADCAST)) { 2578 /* 2579 * ire->ire_mp is non null when ire_mp passed in is used 2580 * ire->ire_mp is set in ip_bind_insert_ire[_v6](). 2581 */ 2582 if (ire->ire_mp == NULL) 2583 ire_refrele(ire); 2584 if (sire != NULL) 2585 ire_refrele(sire); 2586 return (0); 2587 } 2588 2589 if (tcp->tcp_ipha->ipha_src == INADDR_ANY) { 2590 ipaddr_t src_addr; 2591 2592 /* 2593 * ip_bind_connected() has stored the correct source 2594 * address in conn_src. 2595 */ 2596 src_addr = tcp->tcp_connp->conn_src; 2597 tcp->tcp_ipha->ipha_src = src_addr; 2598 /* 2599 * Copy of the src addr. in tcp_t is needed 2600 * for the lookup funcs. 
2601 */ 2602 IN6_IPADDR_TO_V4MAPPED(src_addr, &tcp->tcp_ip_src_v6); 2603 } 2604 /* 2605 * Set the fragment bit so that IP will tell us if the MTU 2606 * should change. IP tells us the latest setting of 2607 * ip_path_mtu_discovery through ire_frag_flag. 2608 */ 2609 if (ipst->ips_ip_path_mtu_discovery) { 2610 tcp->tcp_ipha->ipha_fragment_offset_and_flags = 2611 htons(IPH_DF); 2612 } 2613 /* 2614 * If ire_uinfo is NULL, this is the IRE_INTERFACE case 2615 * for IP_NEXTHOP. No cache ire has been found for the 2616 * destination and we are working with the nexthop's 2617 * interface ire. Since we need to forward all packets 2618 * to the nexthop first, we "blindly" set tcp_localnet 2619 * to false, eventhough the destination may also be 2620 * onlink. 2621 */ 2622 if (ire_uinfo == NULL) 2623 tcp->tcp_localnet = 0; 2624 else 2625 tcp->tcp_localnet = (ire->ire_gateway_addr == 0); 2626 } else { 2627 /* 2628 * For incoming connection ire_mp = NULL 2629 * For outgoing connection ire_mp != NULL 2630 * Technically we should check conn_incoming_ill 2631 * when ire_mp is NULL and conn_outgoing_ill when 2632 * ire_mp is non-NULL. But this is performance 2633 * critical path and for IPV*_BOUND_IF, outgoing 2634 * and incoming ill are always set to the same value. 
2635 */ 2636 ill_t *dst_ill = NULL; 2637 ipif_t *dst_ipif = NULL; 2638 2639 ASSERT(connp->conn_outgoing_ill == connp->conn_incoming_ill); 2640 2641 if (connp->conn_outgoing_ill != NULL) { 2642 /* Outgoing or incoming path */ 2643 int err; 2644 2645 dst_ill = conn_get_held_ill(connp, 2646 &connp->conn_outgoing_ill, &err); 2647 if (err == ILL_LOOKUP_FAILED || dst_ill == NULL) { 2648 ip1dbg(("tcp_adapt_ire: ill_lookup failed\n")); 2649 return (0); 2650 } 2651 match_flags |= MATCH_IRE_ILL; 2652 dst_ipif = dst_ill->ill_ipif; 2653 } 2654 ire = ire_ctable_lookup_v6(&tcp->tcp_connp->conn_remv6, 2655 0, 0, dst_ipif, zoneid, tsl, match_flags, ipst); 2656 2657 if (ire != NULL) { 2658 ire_cacheable = B_TRUE; 2659 ire_uinfo = (ire_mp != NULL) ? 2660 &((ire_t *)ire_mp->b_rptr)->ire_uinfo: 2661 &ire->ire_uinfo; 2662 } else { 2663 if (ire_mp == NULL) { 2664 ire = ire_ftable_lookup_v6( 2665 &tcp->tcp_connp->conn_remv6, 2666 0, 0, 0, dst_ipif, &sire, zoneid, 2667 0, tsl, match_flags, ipst); 2668 if (ire == NULL) { 2669 if (dst_ill != NULL) 2670 ill_refrele(dst_ill); 2671 return (0); 2672 } 2673 ire_uinfo = (sire != NULL) ? &sire->ire_uinfo : 2674 &ire->ire_uinfo; 2675 } else { 2676 ire = (ire_t *)ire_mp->b_rptr; 2677 ire_uinfo = 2678 &((ire_t *)ire_mp->b_rptr)->ire_uinfo; 2679 } 2680 } 2681 if (dst_ill != NULL) 2682 ill_refrele(dst_ill); 2683 2684 ASSERT(ire != NULL); 2685 ASSERT(ire_uinfo != NULL); 2686 2687 if (IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6) || 2688 IN6_IS_ADDR_MULTICAST(&ire->ire_addr_v6)) { 2689 /* 2690 * ire->ire_mp is non null when ire_mp passed in is used 2691 * ire->ire_mp is set in ip_bind_insert_ire[_v6](). 2692 */ 2693 if (ire->ire_mp == NULL) 2694 ire_refrele(ire); 2695 if (sire != NULL) 2696 ire_refrele(sire); 2697 return (0); 2698 } 2699 2700 if (IN6_IS_ADDR_UNSPECIFIED(&tcp->tcp_ip6h->ip6_src)) { 2701 in6_addr_t src_addr; 2702 2703 /* 2704 * ip_bind_connected_v6() has stored the correct source 2705 * address per IPv6 addr. 
selection policy in 2706 * conn_src_v6. 2707 */ 2708 src_addr = tcp->tcp_connp->conn_srcv6; 2709 2710 tcp->tcp_ip6h->ip6_src = src_addr; 2711 /* 2712 * Copy of the src addr. in tcp_t is needed 2713 * for the lookup funcs. 2714 */ 2715 tcp->tcp_ip_src_v6 = src_addr; 2716 ASSERT(IN6_ARE_ADDR_EQUAL(&tcp->tcp_ip6h->ip6_src, 2717 &connp->conn_srcv6)); 2718 } 2719 tcp->tcp_localnet = 2720 IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6); 2721 } 2722 2723 /* 2724 * This allows applications to fail quickly when connections are made 2725 * to dead hosts. Hosts can be labeled dead by adding a reject route 2726 * with both the RTF_REJECT and RTF_PRIVATE flags set. 2727 */ 2728 if ((ire->ire_flags & RTF_REJECT) && 2729 (ire->ire_flags & RTF_PRIVATE)) 2730 goto error; 2731 2732 /* 2733 * Make use of the cached rtt and rtt_sd values to calculate the 2734 * initial RTO. Note that they are already initialized in 2735 * tcp_init_values(). 2736 * If ire_uinfo is NULL, i.e., we do not have a cache ire for 2737 * IP_NEXTHOP, but instead are using the interface ire for the 2738 * nexthop, then we do not use the ire_uinfo from that ire to 2739 * do any initializations. 
2740 */ 2741 if (ire_uinfo != NULL) { 2742 if (ire_uinfo->iulp_rtt != 0) { 2743 clock_t rto; 2744 2745 tcp->tcp_rtt_sa = ire_uinfo->iulp_rtt; 2746 tcp->tcp_rtt_sd = ire_uinfo->iulp_rtt_sd; 2747 rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd + 2748 tcps->tcps_rexmit_interval_extra + 2749 (tcp->tcp_rtt_sa >> 5); 2750 2751 if (rto > tcps->tcps_rexmit_interval_max) { 2752 tcp->tcp_rto = tcps->tcps_rexmit_interval_max; 2753 } else if (rto < tcps->tcps_rexmit_interval_min) { 2754 tcp->tcp_rto = tcps->tcps_rexmit_interval_min; 2755 } else { 2756 tcp->tcp_rto = rto; 2757 } 2758 } 2759 if (ire_uinfo->iulp_ssthresh != 0) 2760 tcp->tcp_cwnd_ssthresh = ire_uinfo->iulp_ssthresh; 2761 else 2762 tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN; 2763 if (ire_uinfo->iulp_spipe > 0) { 2764 tcp->tcp_xmit_hiwater = MIN(ire_uinfo->iulp_spipe, 2765 tcps->tcps_max_buf); 2766 if (tcps->tcps_snd_lowat_fraction != 0) 2767 tcp->tcp_xmit_lowater = tcp->tcp_xmit_hiwater / 2768 tcps->tcps_snd_lowat_fraction; 2769 (void) tcp_maxpsz_set(tcp, B_TRUE); 2770 } 2771 /* 2772 * Note that up till now, acceptor always inherits receive 2773 * window from the listener. But if there is a metrics 2774 * associated with a host, we should use that instead of 2775 * inheriting it from listener. Thus we need to pass this 2776 * info back to the caller. 2777 */ 2778 if (ire_uinfo->iulp_rpipe > 0) { 2779 tcp->tcp_rwnd = MIN(ire_uinfo->iulp_rpipe, 2780 tcps->tcps_max_buf); 2781 } 2782 2783 if (ire_uinfo->iulp_rtomax > 0) { 2784 tcp->tcp_second_timer_threshold = 2785 ire_uinfo->iulp_rtomax; 2786 } 2787 2788 /* 2789 * Use the metric option settings, iulp_tstamp_ok and 2790 * iulp_wscale_ok, only for active open. What this means 2791 * is that if the other side uses timestamp or window 2792 * scale option, TCP will also use those options. That 2793 * is for passive open. If the application sets a 2794 * large window, window scale is enabled regardless of 2795 * the value in iulp_wscale_ok. 
This is the behavior 2796 * since 2.6. So we keep it. 2797 * The only case left in passive open processing is the 2798 * check for SACK. 2799 * For ECN, it should probably be like SACK. But the 2800 * current value is binary, so we treat it like the other 2801 * cases. The metric only controls active open.For passive 2802 * open, the ndd param, tcp_ecn_permitted, controls the 2803 * behavior. 2804 */ 2805 if (!tcp_detached) { 2806 /* 2807 * The if check means that the following can only 2808 * be turned on by the metrics only IRE, but not off. 2809 */ 2810 if (ire_uinfo->iulp_tstamp_ok) 2811 tcp->tcp_snd_ts_ok = B_TRUE; 2812 if (ire_uinfo->iulp_wscale_ok) 2813 tcp->tcp_snd_ws_ok = B_TRUE; 2814 if (ire_uinfo->iulp_sack == 2) 2815 tcp->tcp_snd_sack_ok = B_TRUE; 2816 if (ire_uinfo->iulp_ecn_ok) 2817 tcp->tcp_ecn_ok = B_TRUE; 2818 } else { 2819 /* 2820 * Passive open. 2821 * 2822 * As above, the if check means that SACK can only be 2823 * turned on by the metric only IRE. 2824 */ 2825 if (ire_uinfo->iulp_sack > 0) { 2826 tcp->tcp_snd_sack_ok = B_TRUE; 2827 } 2828 } 2829 } 2830 2831 2832 /* 2833 * XXX: Note that currently, ire_max_frag can be as small as 68 2834 * because of PMTUd. So tcp_mss may go to negative if combined 2835 * length of all those options exceeds 28 bytes. But because 2836 * of the tcp_mss_min check below, we may not have a problem if 2837 * tcp_mss_min is of a reasonable value. The default is 1 so 2838 * the negative problem still exists. And the check defeats PMTUd. 2839 * In fact, if PMTUd finds that the MSS should be smaller than 2840 * tcp_mss_min, TCP should turn off PMUTd and use the tcp_mss_min 2841 * value. 2842 * 2843 * We do not deal with that now. All those problems related to 2844 * PMTUd will be fixed later. 
2845 */ 2846 ASSERT(ire->ire_max_frag != 0); 2847 mss = tcp->tcp_if_mtu = ire->ire_max_frag; 2848 if (tcp->tcp_ipp_fields & IPPF_USE_MIN_MTU) { 2849 if (tcp->tcp_ipp_use_min_mtu == IPV6_USE_MIN_MTU_NEVER) { 2850 mss = MIN(mss, IPV6_MIN_MTU); 2851 } 2852 } 2853 2854 /* Sanity check for MSS value. */ 2855 if (tcp->tcp_ipversion == IPV4_VERSION) 2856 mss_max = tcps->tcps_mss_max_ipv4; 2857 else 2858 mss_max = tcps->tcps_mss_max_ipv6; 2859 2860 if (tcp->tcp_ipversion == IPV6_VERSION && 2861 (ire->ire_frag_flag & IPH_FRAG_HDR)) { 2862 /* 2863 * After receiving an ICMPv6 "packet too big" message with a 2864 * MTU < 1280, and for multirouted IPv6 packets, the IP layer 2865 * will insert a 8-byte fragment header in every packet; we 2866 * reduce the MSS by that amount here. 2867 */ 2868 mss -= sizeof (ip6_frag_t); 2869 } 2870 2871 if (tcp->tcp_ipsec_overhead == 0) 2872 tcp->tcp_ipsec_overhead = conn_ipsec_length(connp); 2873 2874 mss -= tcp->tcp_ipsec_overhead; 2875 2876 if (mss < tcps->tcps_mss_min) 2877 mss = tcps->tcps_mss_min; 2878 if (mss > mss_max) 2879 mss = mss_max; 2880 2881 /* Note that this is the maximum MSS, excluding all options. */ 2882 tcp->tcp_mss = mss; 2883 2884 /* 2885 * Initialize the ISS here now that we have the full connection ID. 2886 * The RFC 1948 method of initial sequence number generation requires 2887 * knowledge of the full connection ID before setting the ISS. 2888 */ 2889 2890 tcp_iss_init(tcp); 2891 2892 if (ire->ire_type & (IRE_LOOPBACK | IRE_LOCAL)) 2893 tcp->tcp_loopback = B_TRUE; 2894 2895 if (sire != NULL) 2896 IRE_REFRELE(sire); 2897 2898 /* 2899 * If we got an IRE_CACHE and an ILL, go through their properties; 2900 * otherwise, this is deferred until later when we have an IRE_CACHE. 2901 */ 2902 if (tcp->tcp_loopback || 2903 (ire_cacheable && (ill = ire_to_ill(ire)) != NULL)) { 2904 /* 2905 * For incoming, see if this tcp may be MDT-capable. For 2906 * outgoing, this process has been taken care of through 2907 * tcp_rput_other. 
2908 */ 2909 tcp_ire_ill_check(tcp, ire, ill, incoming); 2910 tcp->tcp_ire_ill_check_done = B_TRUE; 2911 } 2912 2913 mutex_enter(&connp->conn_lock); 2914 /* 2915 * Make sure that conn is not marked incipient 2916 * for incoming connections. A blind 2917 * removal of incipient flag is cheaper than 2918 * check and removal. 2919 */ 2920 connp->conn_state_flags &= ~CONN_INCIPIENT; 2921 2922 /* 2923 * Must not cache forwarding table routes 2924 * or recache an IRE after the conn_t has 2925 * had conn_ire_cache cleared and is flagged 2926 * unusable, (see the CONN_CACHE_IRE() macro). 2927 */ 2928 if (ire_cacheable && CONN_CACHE_IRE(connp)) { 2929 rw_enter(&ire->ire_bucket->irb_lock, RW_READER); 2930 if (!(ire->ire_marks & IRE_MARK_CONDEMNED)) { 2931 connp->conn_ire_cache = ire; 2932 IRE_UNTRACE_REF(ire); 2933 rw_exit(&ire->ire_bucket->irb_lock); 2934 mutex_exit(&connp->conn_lock); 2935 return (1); 2936 } 2937 rw_exit(&ire->ire_bucket->irb_lock); 2938 } 2939 mutex_exit(&connp->conn_lock); 2940 2941 if (ire->ire_mp == NULL) 2942 ire_refrele(ire); 2943 return (1); 2944 2945 error: 2946 if (ire->ire_mp == NULL) 2947 ire_refrele(ire); 2948 if (sire != NULL) 2949 ire_refrele(sire); 2950 return (0); 2951 } 2952 2953 static void 2954 tcp_tpi_bind(tcp_t *tcp, mblk_t *mp) 2955 { 2956 int error; 2957 conn_t *connp = tcp->tcp_connp; 2958 struct sockaddr *sa; 2959 mblk_t *mp1; 2960 struct T_bind_req *tbr; 2961 int backlog; 2962 socklen_t len; 2963 sin_t *sin; 2964 sin6_t *sin6; 2965 cred_t *cr; 2966 2967 /* 2968 * All Solaris components should pass a db_credp 2969 * for this TPI message, hence we ASSERT. 2970 * But in case there is some other M_PROTO that looks 2971 * like a TPI message sent by some other kernel 2972 * component, we check and return an error. 
2973 */ 2974 cr = msg_getcred(mp, NULL); 2975 ASSERT(cr != NULL); 2976 if (cr == NULL) { 2977 tcp_err_ack(tcp, mp, TSYSERR, EINVAL); 2978 return; 2979 } 2980 2981 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); 2982 if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) { 2983 if (tcp->tcp_debug) { 2984 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, 2985 "tcp_tpi_bind: bad req, len %u", 2986 (uint_t)(mp->b_wptr - mp->b_rptr)); 2987 } 2988 tcp_err_ack(tcp, mp, TPROTO, 0); 2989 return; 2990 } 2991 /* Make sure the largest address fits */ 2992 mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t) + 1, 1); 2993 if (mp1 == NULL) { 2994 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); 2995 return; 2996 } 2997 mp = mp1; 2998 tbr = (struct T_bind_req *)mp->b_rptr; 2999 3000 backlog = tbr->CONIND_number; 3001 len = tbr->ADDR_length; 3002 3003 switch (len) { 3004 case 0: /* request for a generic port */ 3005 tbr->ADDR_offset = sizeof (struct T_bind_req); 3006 if (tcp->tcp_family == AF_INET) { 3007 tbr->ADDR_length = sizeof (sin_t); 3008 sin = (sin_t *)&tbr[1]; 3009 *sin = sin_null; 3010 sin->sin_family = AF_INET; 3011 sa = (struct sockaddr *)sin; 3012 len = sizeof (sin_t); 3013 mp->b_wptr = (uchar_t *)&sin[1]; 3014 } else { 3015 ASSERT(tcp->tcp_family == AF_INET6); 3016 tbr->ADDR_length = sizeof (sin6_t); 3017 sin6 = (sin6_t *)&tbr[1]; 3018 *sin6 = sin6_null; 3019 sin6->sin6_family = AF_INET6; 3020 sa = (struct sockaddr *)sin6; 3021 len = sizeof (sin6_t); 3022 mp->b_wptr = (uchar_t *)&sin6[1]; 3023 } 3024 break; 3025 3026 case sizeof (sin_t): /* Complete IPv4 address */ 3027 sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset, 3028 sizeof (sin_t)); 3029 break; 3030 3031 case sizeof (sin6_t): /* Complete IPv6 address */ 3032 sa = (struct sockaddr *)mi_offset_param(mp, 3033 tbr->ADDR_offset, sizeof (sin6_t)); 3034 break; 3035 3036 default: 3037 if (tcp->tcp_debug) { 3038 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, 3039 "tcp_tpi_bind: bad address 
length, %d", 3040 tbr->ADDR_length); 3041 } 3042 tcp_err_ack(tcp, mp, TBADADDR, 0); 3043 return; 3044 } 3045 3046 if (backlog > 0) { 3047 error = tcp_do_listen(connp, sa, len, backlog, DB_CRED(mp), 3048 tbr->PRIM_type != O_T_BIND_REQ); 3049 } else { 3050 error = tcp_do_bind(connp, sa, len, DB_CRED(mp), 3051 tbr->PRIM_type != O_T_BIND_REQ); 3052 } 3053 done: 3054 if (error > 0) { 3055 tcp_err_ack(tcp, mp, TSYSERR, error); 3056 } else if (error < 0) { 3057 tcp_err_ack(tcp, mp, -error, 0); 3058 } else { 3059 /* 3060 * Update port information as sockfs/tpi needs it for checking 3061 */ 3062 if (tcp->tcp_family == AF_INET) { 3063 sin = (sin_t *)sa; 3064 sin->sin_port = tcp->tcp_lport; 3065 } else { 3066 sin6 = (sin6_t *)sa; 3067 sin6->sin6_port = tcp->tcp_lport; 3068 } 3069 mp->b_datap->db_type = M_PCPROTO; 3070 tbr->PRIM_type = T_BIND_ACK; 3071 putnext(tcp->tcp_rq, mp); 3072 } 3073 } 3074 3075 /* 3076 * If the "bind_to_req_port_only" parameter is set, if the requested port 3077 * number is available, return it, If not return 0 3078 * 3079 * If "bind_to_req_port_only" parameter is not set and 3080 * If the requested port number is available, return it. If not, return 3081 * the first anonymous port we happen across. If no anonymous ports are 3082 * available, return 0. addr is the requested local address, if any. 3083 * 3084 * In either case, when succeeding update the tcp_t to record the port number 3085 * and insert it in the bind hash table. 3086 * 3087 * Note that TCP over IPv4 and IPv6 sockets can use the same port number 3088 * without setting SO_REUSEADDR. This is needed so that they 3089 * can be viewed as two independent transport protocols. 
3090 */ 3091 static in_port_t 3092 tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, 3093 int reuseaddr, boolean_t quick_connect, 3094 boolean_t bind_to_req_port_only, boolean_t user_specified) 3095 { 3096 /* number of times we have run around the loop */ 3097 int count = 0; 3098 /* maximum number of times to run around the loop */ 3099 int loopmax; 3100 conn_t *connp = tcp->tcp_connp; 3101 zoneid_t zoneid = connp->conn_zoneid; 3102 tcp_stack_t *tcps = tcp->tcp_tcps; 3103 3104 /* 3105 * Lookup for free addresses is done in a loop and "loopmax" 3106 * influences how long we spin in the loop 3107 */ 3108 if (bind_to_req_port_only) { 3109 /* 3110 * If the requested port is busy, don't bother to look 3111 * for a new one. Setting loop maximum count to 1 has 3112 * that effect. 3113 */ 3114 loopmax = 1; 3115 } else { 3116 /* 3117 * If the requested port is busy, look for a free one 3118 * in the anonymous port range. 3119 * Set loopmax appropriately so that one does not look 3120 * forever in the case all of the anonymous ports are in use. 3121 */ 3122 if (tcp->tcp_anon_priv_bind) { 3123 /* 3124 * loopmax = 3125 * (IPPORT_RESERVED-1) - tcp_min_anonpriv_port + 1 3126 */ 3127 loopmax = IPPORT_RESERVED - 3128 tcps->tcps_min_anonpriv_port; 3129 } else { 3130 loopmax = (tcps->tcps_largest_anon_port - 3131 tcps->tcps_smallest_anon_port + 1); 3132 } 3133 } 3134 do { 3135 uint16_t lport; 3136 tf_t *tbf; 3137 tcp_t *ltcp; 3138 conn_t *lconnp; 3139 3140 lport = htons(port); 3141 3142 /* 3143 * Ensure that the tcp_t is not currently in the bind hash. 3144 * Hold the lock on the hash bucket to ensure that 3145 * the duplicate check plus the insertion is an atomic 3146 * operation. 3147 * 3148 * This function does an inline lookup on the bind hash list 3149 * Make sure that we access only members of tcp_t 3150 * and that we don't look at tcp_tcp, since we are not 3151 * doing a CONN_INC_REF. 
3152 */ 3153 tcp_bind_hash_remove(tcp); 3154 tbf = &tcps->tcps_bind_fanout[TCP_BIND_HASH(lport)]; 3155 mutex_enter(&tbf->tf_lock); 3156 for (ltcp = tbf->tf_tcp; ltcp != NULL; 3157 ltcp = ltcp->tcp_bind_hash) { 3158 if (lport == ltcp->tcp_lport) 3159 break; 3160 } 3161 3162 for (; ltcp != NULL; ltcp = ltcp->tcp_bind_hash_port) { 3163 boolean_t not_socket; 3164 boolean_t exclbind; 3165 3166 lconnp = ltcp->tcp_connp; 3167 3168 /* 3169 * On a labeled system, we must treat bindings to ports 3170 * on shared IP addresses by sockets with MAC exemption 3171 * privilege as being in all zones, as there's 3172 * otherwise no way to identify the right receiver. 3173 */ 3174 if (!(IPCL_ZONE_MATCH(ltcp->tcp_connp, zoneid) || 3175 IPCL_ZONE_MATCH(connp, 3176 ltcp->tcp_connp->conn_zoneid)) && 3177 !lconnp->conn_mac_exempt && 3178 !connp->conn_mac_exempt) 3179 continue; 3180 3181 /* 3182 * If TCP_EXCLBIND is set for either the bound or 3183 * binding endpoint, the semantics of bind 3184 * is changed according to the following. 3185 * 3186 * spec = specified address (v4 or v6) 3187 * unspec = unspecified address (v4 or v6) 3188 * A = specified addresses are different for endpoints 3189 * 3190 * bound bind to allowed 3191 * ------------------------------------- 3192 * unspec unspec no 3193 * unspec spec no 3194 * spec unspec no 3195 * spec spec yes if A 3196 * 3197 * For labeled systems, SO_MAC_EXEMPT behaves the same 3198 * as TCP_EXCLBIND, except that zoneid is ignored. 3199 * 3200 * Note: 3201 * 3202 * 1. Because of TLI semantics, an endpoint can go 3203 * back from, say TCP_ESTABLISHED to TCPS_LISTEN or 3204 * TCPS_BOUND, depending on whether it is originally 3205 * a listener or not. That is why we need to check 3206 * for states greater than or equal to TCPS_BOUND 3207 * here. 3208 * 3209 * 2. Ideally, we should only check for state equals 3210 * to TCPS_LISTEN. And the following check should be 3211 * added. 
3212 * 3213 * if (ltcp->tcp_state == TCPS_LISTEN || 3214 * !reuseaddr || !ltcp->tcp_reuseaddr) { 3215 * ... 3216 * } 3217 * 3218 * The semantics will be changed to this. If the 3219 * endpoint on the list is in state not equal to 3220 * TCPS_LISTEN and both endpoints have SO_REUSEADDR 3221 * set, let the bind succeed. 3222 * 3223 * Because of (1), we cannot do that for TLI 3224 * endpoints. But we can do that for socket endpoints. 3225 * If in future, we can change this going back 3226 * semantics, we can use the above check for TLI also. 3227 */ 3228 not_socket = !(TCP_IS_SOCKET(ltcp) && 3229 TCP_IS_SOCKET(tcp)); 3230 exclbind = ltcp->tcp_exclbind || tcp->tcp_exclbind; 3231 3232 if (lconnp->conn_mac_exempt || connp->conn_mac_exempt || 3233 (exclbind && (not_socket || 3234 ltcp->tcp_state <= TCPS_ESTABLISHED))) { 3235 if (V6_OR_V4_INADDR_ANY( 3236 ltcp->tcp_bound_source_v6) || 3237 V6_OR_V4_INADDR_ANY(*laddr) || 3238 IN6_ARE_ADDR_EQUAL(laddr, 3239 <cp->tcp_bound_source_v6)) { 3240 break; 3241 } 3242 continue; 3243 } 3244 3245 /* 3246 * Check ipversion to allow IPv4 and IPv6 sockets to 3247 * have disjoint port number spaces, if *_EXCLBIND 3248 * is not set and only if the application binds to a 3249 * specific port. We use the same autoassigned port 3250 * number space for IPv4 and IPv6 sockets. 3251 */ 3252 if (tcp->tcp_ipversion != ltcp->tcp_ipversion && 3253 bind_to_req_port_only) 3254 continue; 3255 3256 /* 3257 * Ideally, we should make sure that the source 3258 * address, remote address, and remote port in the 3259 * four tuple for this tcp-connection is unique. 3260 * However, trying to find out the local source 3261 * address would require too much code duplication 3262 * with IP, since IP needs needs to have that code 3263 * to support userland TCP implementations. 
3264 */ 3265 if (quick_connect && 3266 (ltcp->tcp_state > TCPS_LISTEN) && 3267 ((tcp->tcp_fport != ltcp->tcp_fport) || 3268 !IN6_ARE_ADDR_EQUAL(&tcp->tcp_remote_v6, 3269 <cp->tcp_remote_v6))) 3270 continue; 3271 3272 if (!reuseaddr) { 3273 /* 3274 * No socket option SO_REUSEADDR. 3275 * If existing port is bound to 3276 * a non-wildcard IP address 3277 * and the requesting stream is 3278 * bound to a distinct 3279 * different IP addresses 3280 * (non-wildcard, also), keep 3281 * going. 3282 */ 3283 if (!V6_OR_V4_INADDR_ANY(*laddr) && 3284 !V6_OR_V4_INADDR_ANY( 3285 ltcp->tcp_bound_source_v6) && 3286 !IN6_ARE_ADDR_EQUAL(laddr, 3287 <cp->tcp_bound_source_v6)) 3288 continue; 3289 if (ltcp->tcp_state >= TCPS_BOUND) { 3290 /* 3291 * This port is being used and 3292 * its state is >= TCPS_BOUND, 3293 * so we can't bind to it. 3294 */ 3295 break; 3296 } 3297 } else { 3298 /* 3299 * socket option SO_REUSEADDR is set on the 3300 * binding tcp_t. 3301 * 3302 * If two streams are bound to 3303 * same IP address or both addr 3304 * and bound source are wildcards 3305 * (INADDR_ANY), we want to stop 3306 * searching. 3307 * We have found a match of IP source 3308 * address and source port, which is 3309 * refused regardless of the 3310 * SO_REUSEADDR setting, so we break. 3311 */ 3312 if (IN6_ARE_ADDR_EQUAL(laddr, 3313 <cp->tcp_bound_source_v6) && 3314 (ltcp->tcp_state == TCPS_LISTEN || 3315 ltcp->tcp_state == TCPS_BOUND)) 3316 break; 3317 } 3318 } 3319 if (ltcp != NULL) { 3320 /* The port number is busy */ 3321 mutex_exit(&tbf->tf_lock); 3322 } else { 3323 /* 3324 * This port is ours. Insert in fanout and mark as 3325 * bound to prevent others from getting the port 3326 * number. 
3327 */ 3328 tcp->tcp_state = TCPS_BOUND; 3329 tcp->tcp_lport = htons(port); 3330 *(uint16_t *)tcp->tcp_tcph->th_lport = tcp->tcp_lport; 3331 3332 ASSERT(&tcps->tcps_bind_fanout[TCP_BIND_HASH( 3333 tcp->tcp_lport)] == tbf); 3334 tcp_bind_hash_insert(tbf, tcp, 1); 3335 3336 mutex_exit(&tbf->tf_lock); 3337 3338 /* 3339 * We don't want tcp_next_port_to_try to "inherit" 3340 * a port number supplied by the user in a bind. 3341 */ 3342 if (user_specified) 3343 return (port); 3344 3345 /* 3346 * This is the only place where tcp_next_port_to_try 3347 * is updated. After the update, it may or may not 3348 * be in the valid range. 3349 */ 3350 if (!tcp->tcp_anon_priv_bind) 3351 tcps->tcps_next_port_to_try = port + 1; 3352 return (port); 3353 } 3354 3355 if (tcp->tcp_anon_priv_bind) { 3356 port = tcp_get_next_priv_port(tcp); 3357 } else { 3358 if (count == 0 && user_specified) { 3359 /* 3360 * We may have to return an anonymous port. So 3361 * get one to start with. 3362 */ 3363 port = 3364 tcp_update_next_port( 3365 tcps->tcps_next_port_to_try, 3366 tcp, B_TRUE); 3367 user_specified = B_FALSE; 3368 } else { 3369 port = tcp_update_next_port(port + 1, tcp, 3370 B_FALSE); 3371 } 3372 } 3373 if (port == 0) 3374 break; 3375 3376 /* 3377 * Don't let this loop run forever in the case where 3378 * all of the anonymous ports are in use. 3379 */ 3380 } while (++count < loopmax); 3381 return (0); 3382 } 3383 3384 /* 3385 * tcp_clean_death / tcp_close_detached must not be called more than once 3386 * on a tcp. Thus every function that potentially calls tcp_clean_death 3387 * must check for the tcp state before calling tcp_clean_death. 3388 * Eg. tcp_input, tcp_rput_data, tcp_eager_kill, tcp_clean_death_wrapper, 3389 * tcp_timer_handler, all check for the tcp state. 
3390 */ 3391 /* ARGSUSED */ 3392 void 3393 tcp_clean_death_wrapper(void *arg, mblk_t *mp, void *arg2) 3394 { 3395 tcp_t *tcp = ((conn_t *)arg)->conn_tcp; 3396 3397 freemsg(mp); 3398 if (tcp->tcp_state > TCPS_BOUND) 3399 (void) tcp_clean_death(((conn_t *)arg)->conn_tcp, 3400 ETIMEDOUT, 5); 3401 } 3402 3403 /* 3404 * We are dying for some reason. Try to do it gracefully. (May be called 3405 * as writer.) 3406 * 3407 * Return -1 if the structure was not cleaned up (if the cleanup had to be 3408 * done by a service procedure). 3409 * TBD - Should the return value distinguish between the tcp_t being 3410 * freed and it being reinitialized? 3411 */ 3412 static int 3413 tcp_clean_death(tcp_t *tcp, int err, uint8_t tag) 3414 { 3415 mblk_t *mp; 3416 queue_t *q; 3417 conn_t *connp = tcp->tcp_connp; 3418 tcp_stack_t *tcps = tcp->tcp_tcps; 3419 3420 TCP_CLD_STAT(tag); 3421 3422 #if TCP_TAG_CLEAN_DEATH 3423 tcp->tcp_cleandeathtag = tag; 3424 #endif 3425 3426 if (tcp->tcp_fused) 3427 tcp_unfuse(tcp); 3428 3429 if (tcp->tcp_linger_tid != 0 && 3430 TCP_TIMER_CANCEL(tcp, tcp->tcp_linger_tid) >= 0) { 3431 tcp_stop_lingering(tcp); 3432 } 3433 3434 ASSERT(tcp != NULL); 3435 ASSERT((tcp->tcp_family == AF_INET && 3436 tcp->tcp_ipversion == IPV4_VERSION) || 3437 (tcp->tcp_family == AF_INET6 && 3438 (tcp->tcp_ipversion == IPV4_VERSION || 3439 tcp->tcp_ipversion == IPV6_VERSION))); 3440 3441 if (TCP_IS_DETACHED(tcp)) { 3442 if (tcp->tcp_hard_binding) { 3443 /* 3444 * Its an eager that we are dealing with. We close the 3445 * eager but in case a conn_ind has already gone to the 3446 * listener, let tcp_accept_finish() send a discon_ind 3447 * to the listener and drop the last reference. If the 3448 * listener doesn't even know about the eager i.e. the 3449 * conn_ind hasn't gone up, blow away the eager and drop 3450 * the last reference as well. If the conn_ind has gone 3451 * up, state should be BOUND. 
tcp_accept_finish 3452 * will figure out that the connection has received a 3453 * RST and will send a DISCON_IND to the application. 3454 */ 3455 tcp_closei_local(tcp); 3456 if (!tcp->tcp_tconnind_started) { 3457 CONN_DEC_REF(connp); 3458 } else { 3459 tcp->tcp_state = TCPS_BOUND; 3460 } 3461 } else { 3462 tcp_close_detached(tcp); 3463 } 3464 return (0); 3465 } 3466 3467 TCP_STAT(tcps, tcp_clean_death_nondetached); 3468 3469 q = tcp->tcp_rq; 3470 3471 /* Trash all inbound data */ 3472 if (!IPCL_IS_NONSTR(connp)) { 3473 ASSERT(q != NULL); 3474 flushq(q, FLUSHALL); 3475 } 3476 3477 /* 3478 * If we are at least part way open and there is error 3479 * (err==0 implies no error) 3480 * notify our client by a T_DISCON_IND. 3481 */ 3482 if ((tcp->tcp_state >= TCPS_SYN_SENT) && err) { 3483 if (tcp->tcp_state >= TCPS_ESTABLISHED && 3484 !TCP_IS_SOCKET(tcp)) { 3485 /* 3486 * Send M_FLUSH according to TPI. Because sockets will 3487 * (and must) ignore FLUSHR we do that only for TPI 3488 * endpoints and sockets in STREAMS mode. 
3489 */ 3490 (void) putnextctl1(q, M_FLUSH, FLUSHR); 3491 } 3492 if (tcp->tcp_debug) { 3493 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR, 3494 "tcp_clean_death: discon err %d", err); 3495 } 3496 if (IPCL_IS_NONSTR(connp)) { 3497 /* Direct socket, use upcall */ 3498 (*connp->conn_upcalls->su_disconnected)( 3499 connp->conn_upper_handle, tcp->tcp_connid, err); 3500 } else { 3501 mp = mi_tpi_discon_ind(NULL, err, 0); 3502 if (mp != NULL) { 3503 putnext(q, mp); 3504 } else { 3505 if (tcp->tcp_debug) { 3506 (void) strlog(TCP_MOD_ID, 0, 1, 3507 SL_ERROR|SL_TRACE, 3508 "tcp_clean_death, sending M_ERROR"); 3509 } 3510 (void) putnextctl1(q, M_ERROR, EPROTO); 3511 } 3512 } 3513 if (tcp->tcp_state <= TCPS_SYN_RCVD) { 3514 /* SYN_SENT or SYN_RCVD */ 3515 BUMP_MIB(&tcps->tcps_mib, tcpAttemptFails); 3516 } else if (tcp->tcp_state <= TCPS_CLOSE_WAIT) { 3517 /* ESTABLISHED or CLOSE_WAIT */ 3518 BUMP_MIB(&tcps->tcps_mib, tcpEstabResets); 3519 } 3520 } 3521 3522 tcp_reinit(tcp); 3523 if (IPCL_IS_NONSTR(connp)) 3524 (void) tcp_do_unbind(connp); 3525 3526 return (-1); 3527 } 3528 3529 /* 3530 * In case tcp is in the "lingering state" and waits for the SO_LINGER timeout 3531 * to expire, stop the wait and finish the close. 3532 */ 3533 static void 3534 tcp_stop_lingering(tcp_t *tcp) 3535 { 3536 clock_t delta = 0; 3537 tcp_stack_t *tcps = tcp->tcp_tcps; 3538 3539 tcp->tcp_linger_tid = 0; 3540 if (tcp->tcp_state > TCPS_LISTEN) { 3541 tcp_acceptor_hash_remove(tcp); 3542 mutex_enter(&tcp->tcp_non_sq_lock); 3543 if (tcp->tcp_flow_stopped) { 3544 tcp_clrqfull(tcp); 3545 } 3546 mutex_exit(&tcp->tcp_non_sq_lock); 3547 3548 if (tcp->tcp_timer_tid != 0) { 3549 delta = TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid); 3550 tcp->tcp_timer_tid = 0; 3551 } 3552 /* 3553 * Need to cancel those timers which will not be used when 3554 * TCP is detached. This has to be done before the tcp_wq 3555 * is set to the global queue. 
3556 */ 3557 tcp_timers_stop(tcp); 3558 3559 tcp->tcp_detached = B_TRUE; 3560 ASSERT(tcps->tcps_g_q != NULL); 3561 tcp->tcp_rq = tcps->tcps_g_q; 3562 tcp->tcp_wq = WR(tcps->tcps_g_q); 3563 3564 if (tcp->tcp_state == TCPS_TIME_WAIT) { 3565 tcp_time_wait_append(tcp); 3566 TCP_DBGSTAT(tcps, tcp_detach_time_wait); 3567 goto finish; 3568 } 3569 3570 /* 3571 * If delta is zero the timer event wasn't executed and was 3572 * successfully canceled. In this case we need to restart it 3573 * with the minimal delta possible. 3574 */ 3575 if (delta >= 0) { 3576 tcp->tcp_timer_tid = TCP_TIMER(tcp, tcp_timer, 3577 delta ? delta : 1); 3578 } 3579 } else { 3580 tcp_closei_local(tcp); 3581 CONN_DEC_REF(tcp->tcp_connp); 3582 } 3583 finish: 3584 /* Signal closing thread that it can complete close */ 3585 mutex_enter(&tcp->tcp_closelock); 3586 tcp->tcp_detached = B_TRUE; 3587 ASSERT(tcps->tcps_g_q != NULL); 3588 3589 tcp->tcp_rq = tcps->tcps_g_q; 3590 tcp->tcp_wq = WR(tcps->tcps_g_q); 3591 3592 tcp->tcp_closed = 1; 3593 cv_signal(&tcp->tcp_closecv); 3594 mutex_exit(&tcp->tcp_closelock); 3595 } 3596 3597 /* 3598 * Handle lingering timeouts. This function is called when the SO_LINGER timeout 3599 * expires. 3600 */ 3601 static void 3602 tcp_close_linger_timeout(void *arg) 3603 { 3604 conn_t *connp = (conn_t *)arg; 3605 tcp_t *tcp = connp->conn_tcp; 3606 3607 tcp->tcp_client_errno = ETIMEDOUT; 3608 tcp_stop_lingering(tcp); 3609 } 3610 3611 static void 3612 tcp_close_common(conn_t *connp, int flags) 3613 { 3614 tcp_t *tcp = connp->conn_tcp; 3615 mblk_t *mp = &tcp->tcp_closemp; 3616 boolean_t conn_ioctl_cleanup_reqd = B_FALSE; 3617 mblk_t *bp; 3618 3619 ASSERT(connp->conn_ref >= 2); 3620 3621 /* 3622 * Mark the conn as closing. ill_pending_mp_add will not 3623 * add any mp to the pending mp list, after this conn has 3624 * started closing. 
Same for sq_pending_mp_add 3625 */ 3626 mutex_enter(&connp->conn_lock); 3627 connp->conn_state_flags |= CONN_CLOSING; 3628 if (connp->conn_oper_pending_ill != NULL) 3629 conn_ioctl_cleanup_reqd = B_TRUE; 3630 CONN_INC_REF_LOCKED(connp); 3631 mutex_exit(&connp->conn_lock); 3632 tcp->tcp_closeflags = (uint8_t)flags; 3633 ASSERT(connp->conn_ref >= 3); 3634 3635 /* 3636 * tcp_closemp_used is used below without any protection of a lock 3637 * as we don't expect any one else to use it concurrently at this 3638 * point otherwise it would be a major defect. 3639 */ 3640 3641 if (mp->b_prev == NULL) 3642 tcp->tcp_closemp_used = B_TRUE; 3643 else 3644 cmn_err(CE_PANIC, "tcp_close: concurrent use of tcp_closemp: " 3645 "connp %p tcp %p\n", (void *)connp, (void *)tcp); 3646 3647 TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15); 3648 3649 SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_close_output, connp, 3650 tcp_squeue_flag, SQTAG_IP_TCP_CLOSE); 3651 3652 mutex_enter(&tcp->tcp_closelock); 3653 while (!tcp->tcp_closed) { 3654 if (!cv_wait_sig(&tcp->tcp_closecv, &tcp->tcp_closelock)) { 3655 /* 3656 * The cv_wait_sig() was interrupted. We now do the 3657 * following: 3658 * 3659 * 1) If the endpoint was lingering, we allow this 3660 * to be interrupted by cancelling the linger timeout 3661 * and closing normally. 3662 * 3663 * 2) Revert to calling cv_wait() 3664 * 3665 * We revert to using cv_wait() to avoid an 3666 * infinite loop which can occur if the calling 3667 * thread is higher priority than the squeue worker 3668 * thread and is bound to the same cpu. 3669 */ 3670 if (tcp->tcp_linger && tcp->tcp_lingertime > 0) { 3671 mutex_exit(&tcp->tcp_closelock); 3672 /* Entering squeue, bump ref count. 
*/ 3673 CONN_INC_REF(connp); 3674 bp = allocb_wait(0, BPRI_HI, STR_NOSIG, NULL); 3675 SQUEUE_ENTER_ONE(connp->conn_sqp, bp, 3676 tcp_linger_interrupted, connp, 3677 tcp_squeue_flag, SQTAG_IP_TCP_CLOSE); 3678 mutex_enter(&tcp->tcp_closelock); 3679 } 3680 break; 3681 } 3682 } 3683 while (!tcp->tcp_closed) 3684 cv_wait(&tcp->tcp_closecv, &tcp->tcp_closelock); 3685 mutex_exit(&tcp->tcp_closelock); 3686 3687 /* 3688 * In the case of listener streams that have eagers in the q or q0 3689 * we wait for the eagers to drop their reference to us. tcp_rq and 3690 * tcp_wq of the eagers point to our queues. By waiting for the 3691 * refcnt to drop to 1, we are sure that the eagers have cleaned 3692 * up their queue pointers and also dropped their references to us. 3693 */ 3694 if (tcp->tcp_wait_for_eagers) { 3695 mutex_enter(&connp->conn_lock); 3696 while (connp->conn_ref != 1) { 3697 cv_wait(&connp->conn_cv, &connp->conn_lock); 3698 } 3699 mutex_exit(&connp->conn_lock); 3700 } 3701 /* 3702 * ioctl cleanup. The mp is queued in the 3703 * ill_pending_mp or in the sq_pending_mp. 3704 */ 3705 if (conn_ioctl_cleanup_reqd) 3706 conn_ioctl_cleanup(connp); 3707 3708 tcp->tcp_cpid = -1; 3709 } 3710 3711 static int 3712 tcp_tpi_close(queue_t *q, int flags) 3713 { 3714 conn_t *connp; 3715 3716 ASSERT(WR(q)->q_next == NULL); 3717 3718 if (flags & SO_FALLBACK) { 3719 /* 3720 * stream is being closed while in fallback 3721 * simply free the resources that were allocated 3722 */ 3723 inet_minor_free(WR(q)->q_ptr, (dev_t)(RD(q)->q_ptr)); 3724 qprocsoff(q); 3725 goto done; 3726 } 3727 3728 connp = Q_TO_CONN(q); 3729 /* 3730 * We are being closed as /dev/tcp or /dev/tcp6. 3731 */ 3732 tcp_close_common(connp, flags); 3733 3734 qprocsoff(q); 3735 inet_minor_free(connp->conn_minor_arena, connp->conn_dev); 3736 3737 /* 3738 * Drop IP's reference on the conn. This is the last reference 3739 * on the connp if the state was less than established. 
If the 3740 * connection has gone into timewait state, then we will have 3741 * one ref for the TCP and one more ref (total of two) for the 3742 * classifier connected hash list (a timewait connections stays 3743 * in connected hash till closed). 3744 * 3745 * We can't assert the references because there might be other 3746 * transient reference places because of some walkers or queued 3747 * packets in squeue for the timewait state. 3748 */ 3749 CONN_DEC_REF(connp); 3750 done: 3751 q->q_ptr = WR(q)->q_ptr = NULL; 3752 return (0); 3753 } 3754 3755 static int 3756 tcp_tpi_close_accept(queue_t *q) 3757 { 3758 vmem_t *minor_arena; 3759 dev_t conn_dev; 3760 3761 ASSERT(WR(q)->q_qinfo == &tcp_acceptor_winit); 3762 3763 /* 3764 * We had opened an acceptor STREAM for sockfs which is 3765 * now being closed due to some error. 3766 */ 3767 qprocsoff(q); 3768 3769 minor_arena = (vmem_t *)WR(q)->q_ptr; 3770 conn_dev = (dev_t)RD(q)->q_ptr; 3771 ASSERT(minor_arena != NULL); 3772 ASSERT(conn_dev != 0); 3773 inet_minor_free(minor_arena, conn_dev); 3774 q->q_ptr = WR(q)->q_ptr = NULL; 3775 return (0); 3776 } 3777 3778 /* 3779 * Called by tcp_close() routine via squeue when lingering is 3780 * interrupted by a signal. 3781 */ 3782 3783 /* ARGSUSED */ 3784 static void 3785 tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2) 3786 { 3787 conn_t *connp = (conn_t *)arg; 3788 tcp_t *tcp = connp->conn_tcp; 3789 3790 freeb(mp); 3791 if (tcp->tcp_linger_tid != 0 && 3792 TCP_TIMER_CANCEL(tcp, tcp->tcp_linger_tid) >= 0) { 3793 tcp_stop_lingering(tcp); 3794 tcp->tcp_client_errno = EINTR; 3795 } 3796 } 3797 3798 /* 3799 * Called by streams close routine via squeues when our client blows off her 3800 * descriptor, we take this to mean: "close the stream state NOW, close the tcp 3801 * connection politely" When SO_LINGER is set (with a non-zero linger time and 3802 * it is not a nonblocking socket) then this routine sleeps until the FIN is 3803 * acked. 
3804 * 3805 * NOTE: tcp_close potentially returns error when lingering. 3806 * However, the stream head currently does not pass these errors 3807 * to the application. 4.4BSD only returns EINTR and EWOULDBLOCK 3808 * errors to the application (from tsleep()) and not errors 3809 * like ECONNRESET caused by receiving a reset packet. 3810 */ 3811 3812 /* ARGSUSED */ 3813 static void 3814 tcp_close_output(void *arg, mblk_t *mp, void *arg2) 3815 { 3816 char *msg; 3817 conn_t *connp = (