Home | History | Annotate | Download | only in tcp
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 /* Copyright (c) 1990 Mentat Inc. */
     27 
     28 #include <sys/types.h>
     29 #include <sys/stream.h>
     30 #include <sys/strsun.h>
     31 #include <sys/strsubr.h>
     32 #include <sys/stropts.h>
     33 #include <sys/strlog.h>
     34 #define	_SUN_TPI_VERSION 2
     35 #include <sys/tihdr.h>
     36 #include <sys/timod.h>
     37 #include <sys/ddi.h>
     38 #include <sys/sunddi.h>
     39 #include <sys/suntpi.h>
     40 #include <sys/xti_inet.h>
     41 #include <sys/cmn_err.h>
     42 #include <sys/debug.h>
     43 #include <sys/sdt.h>
     44 #include <sys/vtrace.h>
     45 #include <sys/kmem.h>
     46 #include <sys/ethernet.h>
     47 #include <sys/cpuvar.h>
     48 #include <sys/dlpi.h>
     49 #include <sys/multidata.h>
     50 #include <sys/multidata_impl.h>
     51 #include <sys/pattr.h>
     52 #include <sys/policy.h>
     53 #include <sys/priv.h>
     54 #include <sys/zone.h>
     55 #include <sys/sunldi.h>
     56 
     57 #include <sys/errno.h>
     58 #include <sys/signal.h>
     59 #include <sys/socket.h>
     60 #include <sys/socketvar.h>
     61 #include <sys/sockio.h>
     62 #include <sys/isa_defs.h>
     63 #include <sys/md5.h>
     64 #include <sys/random.h>
     65 #include <sys/uio.h>
     66 #include <sys/systm.h>
     67 #include <netinet/in.h>
     68 #include <netinet/tcp.h>
     69 #include <netinet/ip6.h>
     70 #include <netinet/icmp6.h>
     71 #include <net/if.h>
     72 #include <net/route.h>
     73 #include <inet/ipsec_impl.h>
     74 
     75 #include <inet/common.h>
     76 #include <inet/ip.h>
     77 #include <inet/ip_impl.h>
     78 #include <inet/ip6.h>
     79 #include <inet/ip_ndp.h>
     80 #include <inet/proto_set.h>
     81 #include <inet/mib2.h>
     82 #include <inet/nd.h>
     83 #include <inet/optcom.h>
     84 #include <inet/snmpcom.h>
     85 #include <inet/kstatcom.h>
     86 #include <inet/tcp.h>
     87 #include <inet/tcp_impl.h>
     88 #include <inet/udp_impl.h>
     89 #include <net/pfkeyv2.h>
     90 #include <inet/ipsec_info.h>
     91 #include <inet/ipdrop.h>
     92 
     93 #include <inet/ipclassifier.h>
     94 #include <inet/ip_ire.h>
     95 #include <inet/ip_ftable.h>
     96 #include <inet/ip_if.h>
     97 #include <inet/ipp_common.h>
     98 #include <inet/ip_netinfo.h>
     99 #include <sys/squeue_impl.h>
    100 #include <sys/squeue.h>
    101 #include <inet/kssl/ksslapi.h>
    102 #include <sys/tsol/label.h>
    103 #include <sys/tsol/tnet.h>
    104 #include <rpc/pmap_prot.h>
    105 #include <sys/callo.h>
    106 
    107 /*
    108  * TCP Notes: aka FireEngine Phase I (PSARC 2002/433)
    109  *
    110  * (Read the detailed design doc in PSARC case directory)
    111  *
    112  * The entire tcp state is contained in tcp_t and conn_t structure
    113  * which are allocated in tandem using ipcl_conn_create() and passing
    114  * IPCL_CONNTCP as a flag. We use 'conn_ref' and 'conn_lock' to protect
    115  * the references on the tcp_t. The tcp_t structure is never compressed
    116  * and packets always land on the correct TCP perimeter from the time
    117  * eager is created till the time tcp_t dies (as such the old mentat
    118  * TCP global queue is not used for detached state and no IPSEC checking
    119  * is required). The global queue is still allocated to send out resets
    120  * for connection which have no listeners and IP directly calls
    121  * tcp_xmit_listeners_reset() which does any policy check.
    122  *
    123  * Protection and Synchronisation mechanism:
    124  *
    125  * The tcp data structure does not use any kind of lock for protecting
    126  * its state but instead uses 'squeues' for mutual exclusion from various
    127  * read and write side threads. To access a tcp member, the thread should
    128  * always be behind squeue (via squeue_enter with flags as SQ_FILL, SQ_PROCESS,
    129  * or SQ_NODRAIN). Since the squeues allow a direct function call, caller
    130  * can pass any tcp function having prototype of edesc_t as argument
    131  * (different from traditional STREAMs model where packets come in only
    132  * designated entry points). The list of functions that can be directly
    133  * called via squeue are listed before the usual function prototype.
    134  *
    135  * Referencing:
    136  *
    137  * TCP is MT-Hot and we use a reference based scheme to make sure that the
    138  * tcp structure doesn't disappear when it's needed. When the application
    139  * creates an outgoing connection or accepts an incoming connection, we
    140  * start out with 2 references on 'conn_ref'. One for TCP and one for IP.
    141  * The IP reference is just a symbolic reference since ip_tcpclose()
    142  * looks at tcp structure after tcp_close_output() returns which could
    143  * have dropped the last TCP reference. So as long as the connection is
    144  * in attached state i.e. !TCP_IS_DETACHED, we have 2 references on the
    145  * conn_t. The classifier puts its own reference when the connection is
    146  * inserted in listen or connected hash. Anytime a thread needs to enter
    147  * the tcp connection perimeter, it retrieves the conn/tcp from q->ptr
    148  * on write side or by doing a classify on read side and then puts a
    149  * reference on the conn before doing squeue_enter/tryenter/fill. For
    150  * read side, the classifier itself puts the reference under fanout lock
    151  * to make sure that tcp can't disappear before it gets processed. The
    152  * squeue will drop this reference automatically so the called function
    153  * doesn't have to do a DEC_REF.
    154  *
    155  * Opening a new connection:
    156  *
    157  * The outgoing connection open is pretty simple. tcp_open() does the
    158  * work in creating the conn/tcp structure and initializing it. The
    159  * squeue assignment is done based on the CPU the application
    160  * is running on. So for outbound connections, processing is always done
    161  * on application CPU which might be different from the incoming CPU
    162  * being interrupted by the NIC. An optimal way would be to figure out
    163  * the NIC <-> CPU binding at listen time, and assign the outgoing
    164  * connection to the squeue attached to the CPU that will be interrupted
    165  * for incoming packets (we know the NIC based on the bind IP address).
    166  * This might seem like a problem if more data is going out but the
    167  * fact is that in most cases the transmit is ACK driven transmit where
    168  * the outgoing data normally sits on TCP's xmit queue waiting to be
    169  * transmitted.
    170  *
    171  * Accepting a connection:
    172  *
    173  * This is a more interesting case because of various races involved in
    174  * establishing an eager in its own perimeter. Read the meta comment on
    175  * top of tcp_conn_request(). But briefly, the squeue is picked by
    176  * ip_tcp_input()/ip_fanout_tcp_v6() based on the interrupted CPU.
    177  *
    178  * Closing a connection:
    179  *
    180  * The close is fairly straight forward. tcp_close() calls tcp_close_output()
    181  * via squeue to do the close and mark the tcp as detached if the connection
    182  * was in state TCPS_ESTABLISHED or greater. In the latter case, TCP keeps its
    183  * reference but tcp_close() always drops IP's reference. So if tcp was
    184  * not killed, it is sitting in time_wait list with 2 references - 1 for TCP
    185  * and 1 because it is in classifier's connected hash. This is the condition
    186  * we use to determine that its OK to clean up the tcp outside of squeue
    187  * when time wait expires (check the ref under fanout and conn_lock and
    188  * if it is 2, remove it from fanout hash and kill it).
    189  *
    190  * Although close just drops the necessary references and marks the
    191  * tcp_detached state, tcp_close needs to know the tcp_detached has been
    192  * set (under squeue) before letting the STREAM go away (because an
    193  * inbound packet might attempt to go up the STREAM while the close
    194  * has happened and tcp_detached is not set). So a special lock and
    195  * flag are used along with a condition variable (tcp_closelock, tcp_closed,
    196  * and tcp_closecv) to signal tcp_close that tcp_close_output() has marked
    197  * tcp_detached.
    198  *
    199  * Special provisions and fast paths:
    200  *
    201  * We make special provision for (AF_INET, SOCK_STREAM) sockets which
    202  * can't have 'ipv6_recvpktinfo' set and for these type of sockets, IP
    203  * will never send a M_CTL to TCP. As such, ip_tcp_input() which handles
    204  * all TCP packets from the wire makes a IPCL_IS_TCP4_CONNECTED_NO_POLICY
    205  * check to send packets directly to tcp_rput_data via squeue. Everyone
    206  * else comes through tcp_input() on the read side.
    207  *
    208  * We also make special provisions for sockfs by marking tcp_issocket
    209  * whenever we have only sockfs on top of TCP. This allows us to skip
    210  * putting the tcp in acceptor hash since a sockfs listener can never
    211  * become acceptor and also avoid allocating a tcp_t for acceptor STREAM
    212  * since eager has already been allocated and the accept now happens
    213  * on acceptor STREAM. There is a big blob of comment on top of
    214  * tcp_conn_request explaining the new accept. When socket is POP'd,
    215  * sockfs sends us an ioctl to mark the fact and we go back to old
    216  * behaviour. Once tcp_issocket is unset, it is never set again for the
    217  * life of that connection.
    218  *
    219  * IPsec notes :
    220  *
    221  * Since a packet is always executed on the correct TCP perimeter
    222  * all IPsec processing is deferred to IP including checking new
    223  * connections and setting IPSEC policies for new connection. The
    224  * only exception is tcp_xmit_listeners_reset() which is called
    225  * directly from IP and needs to do a policy check to see if TH_RST
    226  * can be sent out.
    227  *
    228  * PFHooks notes :
    229  *
    230  * For mdt case, one meta buffer contains multiple packets. Mblks for every
    231  * packet are assembled and passed to the hooks. When packets are blocked,
    232  * or boundary of any packet is changed, the mdt processing is stopped, and
    233  * packets of the meta buffer are sent to the IP path one by one.
    234  */
    235 
    236 /*
    237  * Values for squeue switch:
    238  * 1: SQ_NODRAIN
    239  * 2: SQ_PROCESS
    240  * 3: SQ_FILL
    241  */
    242 int tcp_squeue_wput = 2;	/* tunable via /etc/system */
    243 int tcp_squeue_flag;	/* NOTE(review): presumably the SQ_* flag derived from tcp_squeue_wput - confirm */
    244 
    245 /*
    246  * This controls how tiny a write must be before we try to copy it
    247  * into the mblk on the tail of the transmit queue.  Not much
    248  * speedup is observed for values larger than sixteen.  Zero will
    249  * disable the optimisation.
    250  */
    251 int tcp_tx_pull_len = 16;
    252 
    253 /*
    254  * TCP Statistics.
    255  *
    256  * How TCP statistics work.
    257  *
    258  * There are two types of statistics invoked by two macros.
    259  *
    260  * TCP_STAT(name) does non-atomic increment of a named stat counter. It is
    261  * supposed to be used in non MT-hot paths of the code.
    262  *
    263  * TCP_DBGSTAT(name) does atomic increment of a named stat counter. It is
    264  * supposed to be used for DEBUG purposes and may be used on a hot path.
    265  *
    266  * Both TCP_STAT and TCP_DBGSTAT counters are available using kstat
    267  * (use "kstat tcp" to get them).
    268  *
    269  * There is also additional debugging facility that marks tcp_clean_death()
    270  * instances and saves them in tcp_t structure. It is triggered by
    271  * TCP_TAG_CLEAN_DEATH define. Also, there is a global array of counters for
    272  * tcp_clean_death() calls that counts the number of times each tag was hit. It
    273  * is triggered by TCP_CLD_COUNTERS define.
    274  *
    275  * How to add new counters.
    276  *
    277  * 1) Add a field in the tcp_stat structure describing your counter.
    278  * 2) Add a line in the template in tcp_kstat2_init() with the name
    279  *    of the counter.
    280  *
    281  *    IMPORTANT!! - make sure that both are in sync !!
    282  * 3) Use either TCP_STAT or TCP_DBGSTAT with the name.
    283  *
    284  * Please avoid using private counters which are not kstat-exported.
    285  *
    286  * TCP_TAG_CLEAN_DEATH set to 1 enables tagging of tcp_clean_death() instances
    287  * in tcp_t structure.
    288  *
    289  * TCP_MAX_CLEAN_DEATH_TAG is the maximum number of possible clean death tags.
    290  */
    291 
    292 #ifndef TCP_DEBUG_COUNTER
    293 #ifdef DEBUG
    294 #define	TCP_DEBUG_COUNTER 1
    295 #else
    296 #define	TCP_DEBUG_COUNTER 0
    297 #endif
    298 #endif
    299 
    300 #define	TCP_CLD_COUNTERS 0
    301 
    302 #define	TCP_TAG_CLEAN_DEATH 1
    303 #define	TCP_MAX_CLEAN_DEATH_TAG 32
    304 
    305 #ifdef lint
    306 static int _lint_dummy_;
    307 #endif
    308 
    309 #if TCP_CLD_COUNTERS
    310 static uint_t tcp_clean_death_stat[TCP_MAX_CLEAN_DEATH_TAG];
    311 #define	TCP_CLD_STAT(x) tcp_clean_death_stat[x]++
    312 #elif defined(lint)
    313 #define	TCP_CLD_STAT(x) ASSERT(_lint_dummy_ == 0);
    314 #else
    315 #define	TCP_CLD_STAT(x)
    316 #endif
    317 
    318 #if TCP_DEBUG_COUNTER
    319 #define	TCP_DBGSTAT(tcps, x)	\
    320 	atomic_add_64(&((tcps)->tcps_statistics.x.value.ui64), 1)
    321 #define	TCP_G_DBGSTAT(x)	\
    322 	atomic_add_64(&(tcp_g_statistics.x.value.ui64), 1)
    323 #elif defined(lint)
    324 #define	TCP_DBGSTAT(tcps, x) ASSERT(_lint_dummy_ == 0);
    325 #define	TCP_G_DBGSTAT(x) ASSERT(_lint_dummy_ == 0);
    326 #else
    327 #define	TCP_DBGSTAT(tcps, x)
    328 #define	TCP_G_DBGSTAT(x)
    329 #endif
    330 
    331 #define	TCP_G_STAT(x)	(tcp_g_statistics.x.value.ui64++)
    332 
    333 tcp_g_stat_t	tcp_g_statistics;
    334 kstat_t		*tcp_g_kstat;
    335 
    336 /*
    337  * Call either ip_output or ip_output_v6. This replaces putnext() calls on the
    338  * tcp write side.
    339  */
    340 #define	CALL_IP_WPUT(connp, q, mp) {					\
    341 	ASSERT(((q)->q_flag & QREADR) == 0);				\
    342 	TCP_DBGSTAT(connp->conn_netstack->netstack_tcp, tcp_ip_output);	\
    343 	connp->conn_send(connp, (mp), (q), IP_WPUT);			\
    344 }
    345 
    346 /* Macros for timestamp comparisons */
    347 #define	TSTMP_GEQ(a, b)	((int32_t)((a)-(b)) >= 0)
    348 #define	TSTMP_LT(a, b)	((int32_t)((a)-(b)) < 0)
    349 
    350 /*
    351  * Parameters for TCP Initial Send Sequence number (ISS) generation.  When
    352  * tcp_strong_iss is set to 1, which is the default, the ISS is calculated
    353  * by adding three components: a time component which grows by 1 every 4096
    354  * nanoseconds (versus every 4 microseconds suggested by RFC 793, page 27);
    355  * a per-connection component which grows by 125000 for every new connection;
    356  * and an "extra" component that grows by a random amount centered
    357  * approximately on 64000.  This causes the ISS generator to cycle every
    358  * 4.89 hours if no TCP connections are made, and faster if connections are
    359  * made.
    360  *
    361  * When tcp_strong_iss is set to 0, ISS is calculated by adding two
    362  * components: a time component which grows by 250000 every second; and
    363  * a per-connection component which grows by 125000 for every new connection.
    364  *
    365  * A third method, when tcp_strong_iss is set to 2, for generating ISS is
    366  * prescribed by Steve Bellovin.  This involves adding time, the 125000 per
    367  * connection, and a one-way hash (MD5) of the connection ID <sport, dport,
    368  * src, dst>, a "truly" random (per RFC 1750) number, and a console-entered
    369  * password.
    370  */
    371 #define	ISS_INCR	250000
    372 #define	ISS_NSEC_SHT	12
    373 
    374 static sin_t	sin_null;	/* Zero address for quick clears */
    375 static sin6_t	sin6_null;	/* Zero address for quick clears */
    376 
    377 /*
    378  * This implementation follows the 4.3BSD interpretation of the urgent
    379  * pointer and not RFC 1122. Switching to RFC 1122 behavior would cause
    380  * incompatible changes in protocols like telnet and rlogin.
    381  */
    382 #define	TCP_OLD_URP_INTERPRETATION	1
    383 
    384 #define	TCP_IS_DETACHED_NONEAGER(tcp)	\
    385 	(TCP_IS_DETACHED(tcp) && \
    386 	    (!(tcp)->tcp_hard_binding))
    387 
    388 /*
    389  * TCP reassembly macros.  We hide starting and ending sequence numbers in
    390  * b_next and b_prev of messages on the reassembly queue.  The messages are
    391  * chained using b_cont.  These macros are used in tcp_reass() so we don't
    392  * have to see the ugly casts and assignments.
    393  */
    394 #define	TCP_REASS_SEQ(mp)		((uint32_t)(uintptr_t)((mp)->b_next))
    395 #define	TCP_REASS_SET_SEQ(mp, u)	((mp)->b_next = \
    396 					(mblk_t *)(uintptr_t)(u))
    397 #define	TCP_REASS_END(mp)		((uint32_t)(uintptr_t)((mp)->b_prev))
    398 #define	TCP_REASS_SET_END(mp, u)	((mp)->b_prev = \
    399 					(mblk_t *)(uintptr_t)(u))
    400 
    401 /*
    402  * Implementation of TCP Timers.
    403  * =============================
    404  *
    405  * INTERFACE:
    406  *
    407  * There are two basic functions and one macro dealing with tcp timers:
    408  *
    409  *	timeout_id_t	tcp_timeout(connp, func, time)
    410  * 	clock_t		tcp_timeout_cancel(connp, timeout_id)
    411  *	TCP_TIMER_RESTART(tcp, intvl)
    412  *
    413  * tcp_timeout() starts a timer for the 'tcp' instance arranging to call 'func'
    414  * after 'time' ticks passed. The function called by timeout() must adhere to
    415  * the same restrictions as a driver soft interrupt handler - it must not sleep
    416  * or call other functions that might sleep. The value returned is the opaque
    417  * non-zero timeout identifier that can be passed to tcp_timeout_cancel() to
    418  * cancel the request. The call to tcp_timeout() may fail in which case it
    419  * returns zero. This is different from the timeout(9F) function which never
    420  * fails.
    421  *
    422  * The call-back function 'func' always receives 'connp' as its single
    423  * argument. It is always executed in the squeue corresponding to the tcp
    424  * structure. The tcp structure is guaranteed to be present at the time the
    425  * call-back is called.
    426  *
    427  * NOTE: The call-back function 'func' is never called if tcp is in
    428  * 	the TCPS_CLOSED state.
    429  *
    430  * tcp_timeout_cancel() attempts to cancel a pending tcp_timeout()
    431  * request. Locks acquired by the call-back routine should not be held across
    432  * the call to tcp_timeout_cancel() or a deadlock may result.
    433  *
    434  * tcp_timeout_cancel() returns -1 if it can not cancel the timeout request.
    435  * Otherwise, it returns an integer value greater than or equal to 0. In
    436  * particular, if the call-back function is already placed on the squeue, it can
    437  * not be canceled.
    438  *
    439  * NOTE: both tcp_timeout() and tcp_timeout_cancel() should always be called
    440  * 	within squeue context corresponding to the tcp instance. Since the
    441  *	call-back is also called via the same squeue, there are no race
    442  *	conditions described in untimeout(9F) manual page since all calls are
    443  *	strictly serialized.
    444  *
    445  *      TCP_TIMER_RESTART() is a macro that attempts to cancel a pending timeout
    446  *	stored in tcp_timer_tid and starts a new one using
    447  *	MSEC_TO_TICK(intvl). It always uses tcp_timer() function as a call-back
    448  *	and stores the return value of tcp_timeout() in the tcp->tcp_timer_tid
    449  *	field.
    450  *
    451  * NOTE: since the timeout cancellation is not guaranteed, the cancelled
    452  *	call-back may still be called, so it is possible tcp_timer() will be
    453  *	called several times. This should not be a problem since tcp_timer()
    454  *	should always check the tcp instance state.
    455  *
    456  *
    457  * IMPLEMENTATION:
    458  *
    459  * TCP timers are implemented using a three-stage process. The call to
    460  * tcp_timeout() uses timeout(9F) function to call tcp_timer_callback() function
    461  * when the timer expires. The tcp_timer_callback() arranges the call of the
    462  * tcp_timer_handler() function via squeue corresponding to the tcp
    463  * instance. The tcp_timer_handler() calls actual requested timeout call-back
    464  * and passes tcp instance as an argument to it. Information is passed between
    465  * stages using the tcp_timer_t structure which contains the connp pointer, the
    466  * tcp call-back to call and the timeout id returned by the timeout(9F).
    467  *
    468  * The tcp_timer_t structure is not used directly, it is embedded in an mblk_t -
    469  * like structure that is used to enter an squeue. The mp->b_rptr of this pseudo
    470  * mblk points to the beginning of tcp_timer_t structure. The tcp_timeout()
    471  * returns the pointer to this mblk.
    472  *
    473  * The pseudo mblk is allocated from a special tcp_timer_cache kmem cache. It
    474  * looks like a normal mblk without actual dblk attached to it.
    475  *
    476  * To optimize performance each tcp instance holds a small cache of timer
    477  * mblocks. In the current implementation it caches up to two timer mblocks per
    478  * tcp instance. The cache is preserved over tcp frees and is only freed when
    479  * the whole tcp structure is destroyed by its kmem destructor. Since all tcp
    480  * timer processing happens on a corresponding squeue, the cache manipulation
    481  * does not require any locks. Experiments show that majority of timer mblocks
    482  * allocations are satisfied from the tcp cache and do not involve kmem calls.
    483  *
    484  * The tcp_timeout() places a refhold on the connp instance which guarantees
    485  * that it will be present at the time the call-back function fires. The
    486  * tcp_timer_handler() drops the reference after calling the call-back, so the
    487  * call-back function does not need to manipulate the references explicitly.
    488  */
    489 
    490 typedef struct tcp_timer_s {
    491 	conn_t	*connp;		/* conn handed to the call-back as its argument */
    492 	void 	(*tcpt_proc)(void *);	/* call-back requested via tcp_timeout() */
    493 	callout_id_t   tcpt_tid;	/* timeout id returned by timeout(9F) */
    494 } tcp_timer_t;
    495 
    496 static kmem_cache_t *tcp_timercache;
    497 kmem_cache_t	*tcp_sack_info_cache;
    498 kmem_cache_t	*tcp_iphc_cache;
    499 
    500 /*
    501  * For scalability, we must not run a timer for every TCP connection
    502  * in TIME_WAIT state.  To see why, consider (for time wait interval of
    503  * 4 minutes):
    504  *	1000 connections/sec * 240 seconds/time wait = 240,000 active conn's
    505  *
    506  * This list is ordered by time, so you need only delete from the head
    507  * until you get to entries which aren't old enough to delete yet.
    508  * The list consists of only the detached TIME_WAIT connections.
    509  *
    510  * Note that the timer (tcp_time_wait_expire) is started when the tcp_t
    511  * becomes detached TIME_WAIT (either by changing the state and already
    512  * being detached or the other way around). This means that the TIME_WAIT
    513  * state can be extended (up to doubled) if the connection doesn't become
    514  * detached for a long time.
    515  *
    516  * The list manipulations (including tcp_time_wait_next/prev)
    517  * are protected by the tcp_time_wait_lock. The content of the
    518  * detached TIME_WAIT connections is protected by the normal perimeters.
    519  *
    520  * This list is per squeue and squeues are shared across the tcp_stack_t's.
    521  * Things on tcp_time_wait_head remain associated with the tcp_stack_t
    522  * and conn_netstack.
    523  * The tcp_t's that are added to tcp_free_list are disassociated and
    524  * have NULL tcp_tcps and conn_netstack pointers.
    525  */
    526 typedef struct tcp_squeue_priv_s {
    527 	kmutex_t	tcp_time_wait_lock;	/* protects time-wait list manipulation */
    528 	callout_id_t	tcp_time_wait_tid;	/* NOTE(review): presumably the collector's callout id - confirm */
    529 	tcp_t		*tcp_time_wait_head;	/* time-ordered list; expiry deletes from the head */
    530 	tcp_t		*tcp_time_wait_tail;	/* NOTE(review): presumably where new entries are appended - confirm */
    531 	tcp_t		*tcp_free_list;	/* tcp_t's with NULL tcp_tcps and conn_netstack */
    532 	uint_t		tcp_free_list_cnt;	/* NOTE(review): presumably the length of tcp_free_list - confirm */
    533 } tcp_squeue_priv_t;
    534 
    535 /*
    536  * TCP_TIME_WAIT_DELAY governs how often the time_wait_collector runs.
    537  * Running it every 5 seconds seems to give the best results.
    538  */
    539 #define	TCP_TIME_WAIT_DELAY drv_usectohz(5000000)
    540 
    541 /*
    542  * To prevent memory hog, limit the number of entries in tcp_free_list
    543  * to 1% of available memory / number of cpus
    544  */
    545 uint_t tcp_free_list_max_cnt = 0;
    546 
    547 #define	TCP_XMIT_LOWATER	4096
    548 #define	TCP_XMIT_HIWATER	49152
    549 #define	TCP_RECV_LOWATER	2048
    550 #define	TCP_RECV_HIWATER	49152
    551 
    552 /*
    553  *  PAWS needs a timer for 24 days.  This is the number of ticks in 24 days
    554  */
    555 #define	PAWS_TIMEOUT	((clock_t)(24*24*60*60*hz))
    556 
    557 #define	TIDUSZ	4096	/* transport interface data unit size */
    558 
    559 /*
    560  * Bind hash list size and hash function.  It has to be a power of 2 for
    561  * hashing.
    562  */
    563 #define	TCP_BIND_FANOUT_SIZE	512
    564 #define	TCP_BIND_HASH(lport) (ntohs(lport) & (TCP_BIND_FANOUT_SIZE - 1))
    565 /*
    566  * Size of listen and acceptor hash list.  It has to be a power of 2 for
    567  * hashing.
    568  */
    569 #define	TCP_FANOUT_SIZE		256
    570 
    571 #ifdef	_ILP32
    572 #define	TCP_ACCEPTOR_HASH(accid)					\
    573 		(((uint_t)(accid) >> 8) & (TCP_FANOUT_SIZE - 1))
    574 #else
    575 #define	TCP_ACCEPTOR_HASH(accid)					\
    576 		((uint_t)(accid) & (TCP_FANOUT_SIZE - 1))
    577 #endif	/* _ILP32 */
    578 
    579 #define	IP_ADDR_CACHE_SIZE	2048
    580 #define	IP_ADDR_CACHE_HASH(faddr)					\
    581 	(ntohl(faddr) & (IP_ADDR_CACHE_SIZE -1))
    582 
    583 /*
    584  * TCP options struct returned from tcp_parse_options.
    585  */
    586 typedef struct tcp_opt_s {
    587 	uint32_t	tcp_opt_mss;
    588 	uint32_t	tcp_opt_wscale;
    589 	uint32_t	tcp_opt_ts_val;
    590 	uint32_t	tcp_opt_ts_ecr;
    591 	tcp_t		*tcp;
    592 } tcp_opt_t;
    593 
    594 /*
    595  * TCP option struct passing information between listener and eager.
    596  */
    597 struct tcp_options {
    598 	uint_t			to_flags;
    599 	ssize_t			to_boundif;	/* IPV6_BOUND_IF */
    600 };
    601 
    602 #define	TCPOPT_BOUNDIF		0x00000001	/* set IPV6_BOUND_IF */
    603 #define	TCPOPT_RECVPKTINFO	0x00000002	/* set IPV6_RECVPKTINFO */
    604 
    605 /*
    606  * RFC1323-recommended phrasing of TSTAMP option, for easier parsing
    607  */
    608 
    609 #ifdef _BIG_ENDIAN
    610 #define	TCPOPT_NOP_NOP_TSTAMP ((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | \
    611 	(TCPOPT_TSTAMP << 8) | 10)
    612 #else
    613 #define	TCPOPT_NOP_NOP_TSTAMP ((10 << 24) | (TCPOPT_TSTAMP << 16) | \
    614 	(TCPOPT_NOP << 8) | TCPOPT_NOP)
    615 #endif
    616 
    617 /*
    618  * Flags returned from tcp_parse_options.
    619  */
    620 #define	TCP_OPT_MSS_PRESENT	1
    621 #define	TCP_OPT_WSCALE_PRESENT	2
    622 #define	TCP_OPT_TSTAMP_PRESENT	4
    623 #define	TCP_OPT_SACK_OK_PRESENT	8
    624 #define	TCP_OPT_SACK_PRESENT	16
    625 
    626 /* TCP option length */
    627 #define	TCPOPT_NOP_LEN		1
    628 #define	TCPOPT_MAXSEG_LEN	4
    629 #define	TCPOPT_WS_LEN		3
    630 #define	TCPOPT_REAL_WS_LEN	(TCPOPT_WS_LEN+1)
    631 #define	TCPOPT_TSTAMP_LEN	10
    632 #define	TCPOPT_REAL_TS_LEN	(TCPOPT_TSTAMP_LEN+2)
    633 #define	TCPOPT_SACK_OK_LEN	2
    634 #define	TCPOPT_REAL_SACK_OK_LEN	(TCPOPT_SACK_OK_LEN+2)
    635 #define	TCPOPT_REAL_SACK_LEN	4
    636 #define	TCPOPT_MAX_SACK_LEN	36
    637 #define	TCPOPT_HEADER_LEN	2
    638 
    639 /* TCP cwnd burst factor. */
    640 #define	TCP_CWND_INFINITE	65535
    641 #define	TCP_CWND_SS		3
    642 #define	TCP_CWND_NORMAL		5
    643 
    644 /* Maximum TCP initial cwin (start/restart). */
    645 #define	TCP_MAX_INIT_CWND	8
    646 
    647 /*
    648  * Initialize cwnd according to RFC 3390.  def_max_init_cwnd is
    649  * either tcp_slow_start_initial or tcp_slow_start_after idle
    650  * depending on the caller.  If the upper layer has not used the
    651  * TCP_INIT_CWND option to change the initial cwnd, tcp_init_cwnd
    652  * should be 0 and we use the formula in RFC 3390 to set tcp_cwnd.
    653  * If the upper layer has changed set the tcp_init_cwnd, just use
    654  * it to calculate the tcp_cwnd.
    655  */
    656 #define	SET_TCP_INIT_CWND(tcp, mss, def_max_init_cwnd)			\
    657 {									\
    658 	if ((tcp)->tcp_init_cwnd == 0) {				\
    659 		(tcp)->tcp_cwnd = MIN(def_max_init_cwnd * (mss),	\
    660 		    MIN(4 * (mss), MAX(2 * (mss), 4380 / (mss) * (mss)))); \
    661 	} else {							\
    662 		(tcp)->tcp_cwnd = (tcp)->tcp_init_cwnd * (mss);		\
    663 	}								\
    664 	tcp->tcp_cwnd_cnt = 0;						\
    665 }
    666 
    667 /* TCP Timer control structure */
    668 typedef struct tcpt_s {
    669 	pfv_t	tcpt_pfv;	/* The routine we are to call */
    670 	tcp_t	*tcpt_tcp;	/* The parameter we are to pass in */
    671 } tcpt_t;
    672 
    673 /*
    674  * Functions called directly via squeue having a prototype of edesc_t.
    675  */
    676 void		tcp_conn_request(void *arg, mblk_t *mp, void *arg2);
    677 static void	tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2);
    678 void		tcp_accept_finish(void *arg, mblk_t *mp, void *arg2);
    679 static void	tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2);
    680 static void	tcp_wput_proto(void *arg, mblk_t *mp, void *arg2);
    681 void 		tcp_input(void *arg, mblk_t *mp, void *arg2);
    682 void		tcp_rput_data(void *arg, mblk_t *mp, void *arg2);
    683 static void	tcp_close_output(void *arg, mblk_t *mp, void *arg2);
    684 void		tcp_output(void *arg, mblk_t *mp, void *arg2);
    685 void		tcp_output_urgent(void *arg, mblk_t *mp, void *arg2);
    686 static void	tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2);
    687 static void	tcp_timer_handler(void *arg, mblk_t *mp, void *arg2);
    688 static void	tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2);
    689 
    690 
    691 /* Prototype for TCP functions */
    692 static void	tcp_random_init(void);
    693 int		tcp_random(void);
    694 static void	tcp_tli_accept(tcp_t *tcp, mblk_t *mp);
    695 static int	tcp_accept_swap(tcp_t *listener, tcp_t *acceptor,
    696 		    tcp_t *eager);
    697 static int	tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp);
    698 static in_port_t tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
    699     int reuseaddr, boolean_t quick_connect, boolean_t bind_to_req_port_only,
    700     boolean_t user_specified);
    701 static void	tcp_closei_local(tcp_t *tcp);
    702 static void	tcp_close_detached(tcp_t *tcp);
    703 static boolean_t tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph,
    704 			mblk_t *idmp, mblk_t **defermp);
    705 static void	tcp_tpi_connect(tcp_t *tcp, mblk_t *mp);
    706 static int	tcp_connect_ipv4(tcp_t *tcp, ipaddr_t *dstaddrp,
    707 		    in_port_t dstport, uint_t srcid, cred_t *cr, pid_t pid);
    708 static int 	tcp_connect_ipv6(tcp_t *tcp, in6_addr_t *dstaddrp,
    709 		    in_port_t dstport, uint32_t flowinfo, uint_t srcid,
    710 		    uint32_t scope_id, cred_t *cr, pid_t pid);
    711 static int	tcp_clean_death(tcp_t *tcp, int err, uint8_t tag);
    712 static void	tcp_def_q_set(tcp_t *tcp, mblk_t *mp);
    713 static void	tcp_disconnect(tcp_t *tcp, mblk_t *mp);
    714 static char	*tcp_display(tcp_t *tcp, char *, char);
    715 static boolean_t tcp_eager_blowoff(tcp_t *listener, t_scalar_t seqnum);
    716 static void	tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only);
    717 static void	tcp_eager_unlink(tcp_t *tcp);
    718 static void	tcp_err_ack(tcp_t *tcp, mblk_t *mp, int tlierr,
    719 		    int unixerr);
    720 static void	tcp_err_ack_prim(tcp_t *tcp, mblk_t *mp, int primitive,
    721 		    int tlierr, int unixerr);
    722 static int	tcp_extra_priv_ports_get(queue_t *q, mblk_t *mp, caddr_t cp,
    723 		    cred_t *cr);
    724 static int	tcp_extra_priv_ports_add(queue_t *q, mblk_t *mp,
    725 		    char *value, caddr_t cp, cred_t *cr);
    726 static int	tcp_extra_priv_ports_del(queue_t *q, mblk_t *mp,
    727 		    char *value, caddr_t cp, cred_t *cr);
    728 static int	tcp_tpistate(tcp_t *tcp);
    729 static void	tcp_bind_hash_insert(tf_t *tf, tcp_t *tcp,
    730     int caller_holds_lock);
    731 static void	tcp_bind_hash_remove(tcp_t *tcp);
    732 static tcp_t	*tcp_acceptor_hash_lookup(t_uscalar_t id, tcp_stack_t *);
    733 void		tcp_acceptor_hash_insert(t_uscalar_t id, tcp_t *tcp);
    734 static void	tcp_acceptor_hash_remove(tcp_t *tcp);
    735 static void	tcp_capability_req(tcp_t *tcp, mblk_t *mp);
    736 static void	tcp_info_req(tcp_t *tcp, mblk_t *mp);
    737 static void	tcp_addr_req(tcp_t *tcp, mblk_t *mp);
    738 static void	tcp_addr_req_ipv6(tcp_t *tcp, mblk_t *mp);
    739 void		tcp_g_q_setup(tcp_stack_t *);
    740 void		tcp_g_q_create(tcp_stack_t *);
    741 void		tcp_g_q_destroy(tcp_stack_t *);
    742 static int	tcp_header_init_ipv4(tcp_t *tcp);
    743 static int	tcp_header_init_ipv6(tcp_t *tcp);
    744 int		tcp_init(tcp_t *tcp, queue_t *q);
    745 static int	tcp_init_values(tcp_t *tcp);
    746 static mblk_t	*tcp_ip_advise_mblk(void *addr, int addr_len, ipic_t **ipic);
    747 static void	tcp_ip_ire_mark_advice(tcp_t *tcp);
    748 static void	tcp_ip_notify(tcp_t *tcp);
    749 static mblk_t	*tcp_ire_mp(mblk_t **mpp);
    750 static void	tcp_iss_init(tcp_t *tcp);
    751 static void	tcp_keepalive_killer(void *arg);
    752 static int	tcp_parse_options(tcph_t *tcph, tcp_opt_t *tcpopt);
    753 static void	tcp_mss_set(tcp_t *tcp, uint32_t size, boolean_t do_ss);
    754 static int	tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp,
    755 		    int *do_disconnectp, int *t_errorp, int *sys_errorp);
    756 static boolean_t tcp_allow_connopt_set(int level, int name);
    757 int		tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr);
    758 int		tcp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr);
    759 int		tcp_tpi_opt_set(queue_t *q, uint_t optset_context, int level,
    760 		    int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp,
    761 		    uchar_t *outvalp, void *thisdg_attrs, cred_t *cr,
    762 		    mblk_t *mblk);
    763 static void	tcp_opt_reverse(tcp_t *tcp, ipha_t *ipha);
    764 static int	tcp_opt_set_header(tcp_t *tcp, boolean_t checkonly,
    765 		    uchar_t *ptr, uint_t len);
    766 static int	tcp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr);
    767 static boolean_t tcp_param_register(IDP *ndp, tcpparam_t *tcppa, int cnt,
    768     tcp_stack_t *);
    769 static int	tcp_param_set(queue_t *q, mblk_t *mp, char *value,
    770 		    caddr_t cp, cred_t *cr);
    771 static int	tcp_param_set_aligned(queue_t *q, mblk_t *mp, char *value,
    772 		    caddr_t cp, cred_t *cr);
    773 static void	tcp_iss_key_init(uint8_t *phrase, int len, tcp_stack_t *);
    774 static int	tcp_1948_phrase_set(queue_t *q, mblk_t *mp, char *value,
    775 		    caddr_t cp, cred_t *cr);
    776 static void	tcp_process_shrunk_swnd(tcp_t *tcp, uint32_t shrunk_cnt);
    777 static void	tcp_update_xmit_tail(tcp_t *tcp, uint32_t snxt);
    778 static mblk_t	*tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start);
    779 static void	tcp_reass_elim_overlap(tcp_t *tcp, mblk_t *mp);
    780 static void	tcp_reinit(tcp_t *tcp);
    781 static void	tcp_reinit_values(tcp_t *tcp);
    782 
    783 static uint_t	tcp_rwnd_reopen(tcp_t *tcp);
    784 static uint_t	tcp_rcv_drain(tcp_t *tcp);
    785 static void	tcp_sack_rxmit(tcp_t *tcp, uint_t *flags);
    786 static boolean_t tcp_send_rst_chk(tcp_stack_t *);
    787 static void	tcp_ss_rexmit(tcp_t *tcp);
    788 static mblk_t	*tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp);
    789 static void	tcp_process_options(tcp_t *, tcph_t *);
    790 static void	tcp_rput_common(tcp_t *tcp, mblk_t *mp);
    791 static void	tcp_rsrv(queue_t *q);
    792 static int	tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd);
    793 static int	tcp_snmp_state(tcp_t *tcp);
    794 static void	tcp_timer(void *arg);
    795 static void	tcp_timer_callback(void *);
    796 static in_port_t tcp_update_next_port(in_port_t port, const tcp_t *tcp,
    797     boolean_t random);
    798 static in_port_t tcp_get_next_priv_port(const tcp_t *);
    799 static void	tcp_wput_sock(queue_t *q, mblk_t *mp);
    800 static void	tcp_wput_fallback(queue_t *q, mblk_t *mp);
    801 void		tcp_tpi_accept(queue_t *q, mblk_t *mp);
    802 static void	tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent);
    803 static void	tcp_wput_flush(tcp_t *tcp, mblk_t *mp);
    804 static void	tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp);
    805 static int	tcp_send(queue_t *q, tcp_t *tcp, const int mss,
    806 		    const int tcp_hdr_len, const int tcp_tcp_hdr_len,
    807 		    const int num_sack_blk, int *usable, uint_t *snxt,
    808 		    int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time,
    809 		    const int mdt_thres);
    810 static int	tcp_multisend(queue_t *q, tcp_t *tcp, const int mss,
    811 		    const int tcp_hdr_len, const int tcp_tcp_hdr_len,
    812 		    const int num_sack_blk, int *usable, uint_t *snxt,
    813 		    int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time,
    814 		    const int mdt_thres);
    815 static void	tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now,
    816 		    int num_sack_blk);
    817 static void	tcp_wsrv(queue_t *q);
    818 static int	tcp_xmit_end(tcp_t *tcp);
    819 static void	tcp_ack_timer(void *arg);
    820 static mblk_t	*tcp_ack_mp(tcp_t *tcp);
    821 static void	tcp_xmit_early_reset(char *str, mblk_t *mp,
    822 		    uint32_t seq, uint32_t ack, int ctl, uint_t ip_hdr_len,
    823 		    zoneid_t zoneid, tcp_stack_t *, conn_t *connp);
    824 static void	tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq,
    825 		    uint32_t ack, int ctl);
    826 static int	setmaxps(queue_t *q, int maxpsz);
    827 static void	tcp_set_rto(tcp_t *, time_t);
    828 static boolean_t tcp_check_policy(tcp_t *, mblk_t *, ipha_t *, ip6_t *,
    829 		    boolean_t, boolean_t);
    830 static void	tcp_icmp_error_ipv6(tcp_t *tcp, mblk_t *mp,
    831 		    boolean_t ipsec_mctl);
    832 static int	tcp_build_hdrs(tcp_t *);
    833 static void	tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp,
    834 		    uint32_t seg_seq, uint32_t seg_ack, int seg_len,
    835 		    tcph_t *tcph);
    836 boolean_t	tcp_paws_check(tcp_t *tcp, tcph_t *tcph, tcp_opt_t *tcpoptp);
    837 static mblk_t	*tcp_mdt_info_mp(mblk_t *);
    838 static void	tcp_mdt_update(tcp_t *, ill_mdt_capab_t *, boolean_t);
    839 static int	tcp_mdt_add_attrs(multidata_t *, const mblk_t *,
    840 		    const boolean_t, const uint32_t, const uint32_t,
    841 		    const uint32_t, const uint32_t, tcp_stack_t *);
    842 static void	tcp_multisend_data(tcp_t *, ire_t *, const ill_t *, mblk_t *,
    843 		    const uint_t, const uint_t, boolean_t *);
    844 static mblk_t	*tcp_lso_info_mp(mblk_t *);
    845 static void	tcp_lso_update(tcp_t *, ill_lso_capab_t *);
    846 static void	tcp_send_data(tcp_t *, queue_t *, mblk_t *);
    847 extern mblk_t	*tcp_timermp_alloc(int);
    848 extern void	tcp_timermp_free(tcp_t *);
    849 static void	tcp_timer_free(tcp_t *tcp, mblk_t *mp);
    850 static void	tcp_stop_lingering(tcp_t *tcp);
    851 static void	tcp_close_linger_timeout(void *arg);
    852 static void	*tcp_stack_init(netstackid_t stackid, netstack_t *ns);
    853 static void	tcp_stack_shutdown(netstackid_t stackid, void *arg);
    854 static void	tcp_stack_fini(netstackid_t stackid, void *arg);
    855 static void	*tcp_g_kstat_init(tcp_g_stat_t *);
    856 static void	tcp_g_kstat_fini(kstat_t *);
    857 static void	*tcp_kstat_init(netstackid_t, tcp_stack_t *);
    858 static void	tcp_kstat_fini(netstackid_t, kstat_t *);
    859 static void	*tcp_kstat2_init(netstackid_t, tcp_stat_t *);
    860 static void	tcp_kstat2_fini(netstackid_t, kstat_t *);
    861 static int	tcp_kstat_update(kstat_t *kp, int rw);
    862 void		tcp_reinput(conn_t *connp, mblk_t *mp, squeue_t *sqp);
    863 static int	tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp,
    864 			tcph_t *tcph, uint_t ipvers, mblk_t *idmp);
    865 static int	tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, ipha_t *ipha,
    866 			tcph_t *tcph, mblk_t *idmp);
    867 static int	tcp_squeue_switch(int);
    868 
    869 static int	tcp_open(queue_t *, dev_t *, int, int, cred_t *, boolean_t);
    870 static int	tcp_openv4(queue_t *, dev_t *, int, int, cred_t *);
    871 static int	tcp_openv6(queue_t *, dev_t *, int, int, cred_t *);
    872 static int	tcp_tpi_close(queue_t *, int);
    873 static int	tcp_tpi_close_accept(queue_t *);
    874 
    875 static void	tcp_squeue_add(squeue_t *);
    876 static boolean_t tcp_zcopy_check(tcp_t *);
    877 static void	tcp_zcopy_notify(tcp_t *);
    878 static mblk_t	*tcp_zcopy_disable(tcp_t *, mblk_t *);
    879 static mblk_t	*tcp_zcopy_backoff(tcp_t *, mblk_t *, int);
    880 static void	tcp_ire_ill_check(tcp_t *, ire_t *, ill_t *, boolean_t);
    881 
    882 extern void	tcp_kssl_input(tcp_t *, mblk_t *);
    883 
    884 void tcp_eager_kill(void *arg, mblk_t *mp, void *arg2);
    885 void tcp_clean_death_wrapper(void *arg, mblk_t *mp, void *arg2);
    886 
    887 static int tcp_accept(sock_lower_handle_t, sock_lower_handle_t,
    888 	    sock_upper_handle_t, cred_t *);
    889 static int tcp_listen(sock_lower_handle_t, int, cred_t *);
    890 static int tcp_post_ip_bind(tcp_t *, mblk_t *, int, cred_t *, pid_t);
    891 static int tcp_do_listen(conn_t *, struct sockaddr *, socklen_t, int, cred_t *,
    892     boolean_t);
    893 static int tcp_do_connect(conn_t *, const struct sockaddr *, socklen_t,
    894     cred_t *, pid_t);
    895 static int tcp_do_bind(conn_t *, struct sockaddr *, socklen_t, cred_t *,
    896     boolean_t);
    897 static int tcp_do_unbind(conn_t *);
    898 static int tcp_bind_check(conn_t *, struct sockaddr *, socklen_t, cred_t *,
    899     boolean_t);
    900 
    901 static void tcp_ulp_newconn(conn_t *, conn_t *, mblk_t *);
    902 
    903 /*
    904  * Routines related to the TCP_IOC_ABORT_CONN ioctl command.
    905  *
    906  * TCP_IOC_ABORT_CONN is a non-transparent ioctl command used for aborting
    907  * TCP connections. To invoke this ioctl, a tcp_ioc_abort_conn_t structure
    908  * (defined in tcp.h) needs to be filled in and passed into the kernel
    909  * via an I_STR ioctl command (see streamio(7I)). The tcp_ioc_abort_conn_t
    910  * structure contains the four-tuple of a TCP connection and a range of TCP
    911  * states (specified by ac_start and ac_end). The use of wildcard addresses
    912  * and ports is allowed. Connections with a matching four tuple and a state
    913  * within the specified range will be aborted. The valid states for the
    914  * ac_start and ac_end fields are in the range TCPS_SYN_SENT to TCPS_TIME_WAIT,
    915  * inclusive.
    916  *
    917  * An application which has its connection aborted by this ioctl will receive
    918  * an error that is dependent on the connection state at the time of the abort.
    919  * If the connection state is < TCPS_TIME_WAIT, an application should behave as
    920  * though a RST packet has been received.  If the connection state is equal to
    921  * TCPS_TIME_WAIT, the 2MSL timeout will immediately be canceled by the kernel
    922  * and all resources associated with the connection will be freed.
    923  */
    924 static mblk_t	*tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *, tcp_t *);
    925 static void	tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *);
    926 static void	tcp_ioctl_abort_handler(tcp_t *, mblk_t *);
    927 static int	tcp_ioctl_abort(tcp_ioc_abort_conn_t *, tcp_stack_t *tcps);
    928 static void	tcp_ioctl_abort_conn(queue_t *, mblk_t *);
    929 static int	tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *, int, int *,
    930     boolean_t, tcp_stack_t *);
    931 
    932 static struct module_info tcp_rinfo =  {
    933 	TCP_MOD_ID, TCP_MOD_NAME, 0, INFPSZ, TCP_RECV_HIWATER, TCP_RECV_LOWATER
    934 };
    935 
    936 static struct module_info tcp_winfo =  {
    937 	TCP_MOD_ID, TCP_MOD_NAME, 0, INFPSZ, 127, 16
    938 };
    939 
    940 /*
    941  * Entry points for TCP as a device. The normal case which supports
    942  * the TCP functionality.
    943  * We have separate open functions for the /dev/tcp and /dev/tcp6 devices.
    944  */
    945 struct qinit tcp_rinitv4 = {
    946 	NULL, (pfi_t)tcp_rsrv, tcp_openv4, tcp_tpi_close, NULL, &tcp_rinfo
    947 };
    948 
    949 struct qinit tcp_rinitv6 = {
    950 	NULL, (pfi_t)tcp_rsrv, tcp_openv6, tcp_tpi_close, NULL, &tcp_rinfo
    951 };
    952 
    953 struct qinit tcp_winit = {
    954 	(pfi_t)tcp_wput, (pfi_t)tcp_wsrv, NULL, NULL, NULL, &tcp_winfo
    955 };
    956 
    957 /* Initial entry point for TCP in socket mode. */
    958 struct qinit tcp_sock_winit = {
    959 	(pfi_t)tcp_wput_sock, (pfi_t)tcp_wsrv, NULL, NULL, NULL, &tcp_winfo
    960 };
    961 
    962 /* TCP entry point during fallback */
    963 struct qinit tcp_fallback_sock_winit = {
    964 	(pfi_t)tcp_wput_fallback, NULL, NULL, NULL, NULL, &tcp_winfo
    965 };
    966 
    967 /*
    968  * Entry points for TCP as an acceptor STREAM opened by sockfs when doing
    969  * an accept. Avoid allocating data structures since eager has already
    970  * been created.
    971  */
    972 struct qinit tcp_acceptor_rinit = {
    973 	NULL, (pfi_t)tcp_rsrv, NULL, tcp_tpi_close_accept, NULL, &tcp_winfo
    974 };
    975 
    976 struct qinit tcp_acceptor_winit = {
    977 	(pfi_t)tcp_tpi_accept, NULL, NULL, NULL, NULL, &tcp_winfo
    978 };
    979 
    980 /* For AF_INET aka /dev/tcp */
    981 struct streamtab tcpinfov4 = {
    982 	&tcp_rinitv4, &tcp_winit
    983 };
    984 
    985 /* For AF_INET6 aka /dev/tcp6 */
    986 struct streamtab tcpinfov6 = {
    987 	&tcp_rinitv6, &tcp_winit
    988 };
    989 
    990 sock_downcalls_t sock_tcp_downcalls;
    991 
    992 /*
    993  * Have to ensure that tcp_g_q_close is not done by an
    994  * interrupt thread.
    995  */
    996 static taskq_t *tcp_taskq;
    997 
    998 /* Settable only in /etc/system. Move to ndd? */
    999 boolean_t tcp_icmp_source_quench = B_FALSE;
   1000 
   1001 /*
   1002  * Following assumes TPI alignment requirements stay along 32 bit
   1003  * boundaries
   1004  */
   1005 #define	ROUNDUP32(x) \
   1006 	(((x) + (sizeof (int32_t) - 1)) & ~(sizeof (int32_t) - 1))
   1007 
   1008 /* Template for response to info request. */
   1009 static struct T_info_ack tcp_g_t_info_ack = {
   1010 	T_INFO_ACK,		/* PRIM_type */
   1011 	0,			/* TSDU_size */
   1012 	T_INFINITE,		/* ETSDU_size */
   1013 	T_INVALID,		/* CDATA_size */
   1014 	T_INVALID,		/* DDATA_size */
   1015 	sizeof (sin_t),		/* ADDR_size */
   1016 	0,			/* OPT_size - not initialized here */
   1017 	TIDUSZ,			/* TIDU_size */
   1018 	T_COTS_ORD,		/* SERV_type */
   1019 	TCPS_IDLE,		/* CURRENT_state */
   1020 	(XPG4_1|EXPINLINE)	/* PROVIDER_flag */
   1021 };
   1022 
   1023 static struct T_info_ack tcp_g_t_info_ack_v6 = {
   1024 	T_INFO_ACK,		/* PRIM_type */
   1025 	0,			/* TSDU_size */
   1026 	T_INFINITE,		/* ETSDU_size */
   1027 	T_INVALID,		/* CDATA_size */
   1028 	T_INVALID,		/* DDATA_size */
   1029 	sizeof (sin6_t),	/* ADDR_size */
   1030 	0,			/* OPT_size - not initialized here */
   1031 	TIDUSZ,		/* TIDU_size */
   1032 	T_COTS_ORD,		/* SERV_type */
   1033 	TCPS_IDLE,		/* CURRENT_state */
   1034 	(XPG4_1|EXPINLINE)	/* PROVIDER_flag */
   1035 };
   1036 
   1037 #define	MS	1L
   1038 #define	SECONDS	(1000 * MS)
   1039 #define	MINUTES	(60 * SECONDS)
   1040 #define	HOURS	(60 * MINUTES)
   1041 #define	DAYS	(24 * HOURS)
   1042 
   1043 #define	PARAM_MAX (~(uint32_t)0)
   1044 
   1045 /* Max size IP datagram is 64k - 1 */
   1046 #define	TCP_MSS_MAX_IPV4 (IP_MAXPACKET - (sizeof (ipha_t) + sizeof (tcph_t)))
   1047 #define	TCP_MSS_MAX_IPV6 (IP_MAXPACKET - (sizeof (ip6_t) + sizeof (tcph_t)))
   1048 /* Max of the above */
   1049 #define	TCP_MSS_MAX	TCP_MSS_MAX_IPV4
   1050 
   1051 /* Largest TCP port number */
   1052 #define	TCP_MAX_PORT	(64 * 1024 - 1)
   1053 
   1054 /*
   1055  * tcp_wroff_xtra is the extra space in front of TCP/IP header for link
   1056  * layer header.  It has to be a multiple of 4.
   1057  */
   1058 static tcpparam_t lcl_tcp_wroff_xtra_param = { 0, 256, 32, "tcp_wroff_xtra" };
   1059 #define	tcps_wroff_xtra	tcps_wroff_xtra_param->tcp_param_val
   1060 
   1061 /*
   1062  * All of these are alterable, within the min/max values given, at run time.
   1063  * Note that the default value of "tcp_time_wait_interval" is four minutes,
   1064  * per the TCP spec.
   1065  */
   1066 /* BEGIN CSTYLED */
   1067 static tcpparam_t	lcl_tcp_param_arr[] = {
   1068  /*min		max		value		name */
   1069  { 1*SECONDS,	10*MINUTES,	1*MINUTES,	"tcp_time_wait_interval"},
   1070  { 1,		PARAM_MAX,	128,		"tcp_conn_req_max_q" },
   1071  { 0,		PARAM_MAX,	1024,		"tcp_conn_req_max_q0" },
   1072  { 1,		1024,		1,		"tcp_conn_req_min" },
   1073  { 0*MS,	20*SECONDS,	0*MS,		"tcp_conn_grace_period" },
   1074  { 128,		(1<<30),	1024*1024,	"tcp_cwnd_max" },
   1075  { 0,		10,		0,		"tcp_debug" },
   1076  { 1024,	(32*1024),	1024,		"tcp_smallest_nonpriv_port"},
   1077  { 1*SECONDS,	PARAM_MAX,	3*MINUTES,	"tcp_ip_abort_cinterval"},
   1078  { 1*SECONDS,	PARAM_MAX,	3*MINUTES,	"tcp_ip_abort_linterval"},
   1079  { 500*MS,	PARAM_MAX,	8*MINUTES,	"tcp_ip_abort_interval"},
   1080  { 1*SECONDS,	PARAM_MAX,	10*SECONDS,	"tcp_ip_notify_cinterval"},
   1081  { 500*MS,	PARAM_MAX,	10*SECONDS,	"tcp_ip_notify_interval"},
   1082  { 1,		255,		64,		"tcp_ipv4_ttl"},
   1083  { 10*SECONDS,	10*DAYS,	2*HOURS,	"tcp_keepalive_interval"},
   1084  { 0,		100,		10,		"tcp_maxpsz_multiplier" },
   1085  { 1,		TCP_MSS_MAX_IPV4, 536,		"tcp_mss_def_ipv4"},
   1086  { 1,		TCP_MSS_MAX_IPV4, TCP_MSS_MAX_IPV4, "tcp_mss_max_ipv4"},
   1087  { 1,		TCP_MSS_MAX,	108,		"tcp_mss_min"},
   1088  { 1,		(64*1024)-1,	(4*1024)-1,	"tcp_naglim_def"},
   1089  { 1*MS,	20*SECONDS,	3*SECONDS,	"tcp_rexmit_interval_initial"},
   1090  { 1*MS,	2*HOURS,	60*SECONDS,	"tcp_rexmit_interval_max"},
   1091  { 1*MS,	2*HOURS,	400*MS,		"tcp_rexmit_interval_min"},
   1092  { 1*MS,	1*MINUTES,	100*MS,		"tcp_deferred_ack_interval" },
   1093  { 0,		16,		0,		"tcp_snd_lowat_fraction" },
   1094  { 0,		128000,		0,		"tcp_sth_rcv_hiwat" },
   1095  { 0,		128000,		0,		"tcp_sth_rcv_lowat" },
   1096  { 1,		10000,		3,		"tcp_dupack_fast_retransmit" },
   1097  { 0,		1,		0,		"tcp_ignore_path_mtu" },
   1098  { 1024,	TCP_MAX_PORT,	32*1024,	"tcp_smallest_anon_port"},
   1099  { 1024,	TCP_MAX_PORT,	TCP_MAX_PORT,	"tcp_largest_anon_port"},
   1100  { TCP_XMIT_LOWATER, (1<<30), TCP_XMIT_HIWATER,"tcp_xmit_hiwat"},
   1101  { TCP_XMIT_LOWATER, (1<<30), TCP_XMIT_LOWATER,"tcp_xmit_lowat"},
   1102  { TCP_RECV_LOWATER, (1<<30), TCP_RECV_HIWATER,"tcp_recv_hiwat"},
   1103  { 1,		65536,		4,		"tcp_recv_hiwat_minmss"},
   1104  { 1*SECONDS,	PARAM_MAX,	675*SECONDS,	"tcp_fin_wait_2_flush_interval"},
   1105  { 8192,	(1<<30),	1024*1024,	"tcp_max_buf"},
   1106 /*
   1107  * Question:  What default value should I set for tcp_strong_iss?
   1108  */
   1109  { 0,		2,		1,		"tcp_strong_iss"},
   1110  { 0,		65536,		20,		"tcp_rtt_updates"},
   1111  { 0,		1,		1,		"tcp_wscale_always"},
   1112  { 0,		1,		0,		"tcp_tstamp_always"},
   1113  { 0,		1,		1,		"tcp_tstamp_if_wscale"},
   1114  { 0*MS,	2*HOURS,	0*MS,		"tcp_rexmit_interval_extra"},
   1115  { 0,		16,		2,		"tcp_deferred_acks_max"},
   1116  { 1,		16384,		4,		"tcp_slow_start_after_idle"},
   1117  { 1,		4,		4,		"tcp_slow_start_initial"},
   1118  { 0,		2,		2,		"tcp_sack_permitted"},
   1119  { 0,		1,		1,		"tcp_compression_enabled"},
   1120  { 0,		IPV6_MAX_HOPS,	IPV6_DEFAULT_HOPS,	"tcp_ipv6_hoplimit"},
   1121  { 1,		TCP_MSS_MAX_IPV6, 1220,		"tcp_mss_def_ipv6"},
   1122  { 1,		TCP_MSS_MAX_IPV6, TCP_MSS_MAX_IPV6, "tcp_mss_max_ipv6"},
   1123  { 0,		1,		0,		"tcp_rev_src_routes"},
   1124  { 10*MS,	500*MS,		50*MS,		"tcp_local_dack_interval"},
   1125  { 0,		16,		8,		"tcp_local_dacks_max"},
   1126  { 0,		2,		1,		"tcp_ecn_permitted"},
   1127  { 0,		1,		1,		"tcp_rst_sent_rate_enabled"},
   1128  { 0,		PARAM_MAX,	40,		"tcp_rst_sent_rate"},
   1129  { 0,		100*MS,		50*MS,		"tcp_push_timer_interval"},
   1130  { 0,		1,		0,		"tcp_use_smss_as_mss_opt"},
   1131  { 0,		PARAM_MAX,	8*MINUTES,	"tcp_keepalive_abort_interval"},
   1132 };
   1133 /* END CSTYLED */
   1134 
   1135 /*
   1136  * tcp_mdt_hdr_{head,tail}_min are the leading and trailing spaces of
   1137  * each header fragment in the header buffer.  Each parameter value has
   1138  * to be a multiple of 4 (32-bit aligned).
   1139  */
   1140 static tcpparam_t lcl_tcp_mdt_head_param =
   1141 	{ 32, 256, 32, "tcp_mdt_hdr_head_min" };
   1142 static tcpparam_t lcl_tcp_mdt_tail_param =
   1143 	{ 0,  256, 32, "tcp_mdt_hdr_tail_min" };
   1144 #define	tcps_mdt_hdr_head_min	tcps_mdt_head_param->tcp_param_val
   1145 #define	tcps_mdt_hdr_tail_min	tcps_mdt_tail_param->tcp_param_val
   1146 
   1147 /*
   1148  * tcp_mdt_max_pbufs is the upper limit value that tcp uses to figure out
   1149  * the maximum number of payload buffers associated per Multidata.
   1150  */
   1151 static tcpparam_t lcl_tcp_mdt_max_pbufs_param =
   1152 	{ 1, MULTIDATA_MAX_PBUFS, MULTIDATA_MAX_PBUFS, "tcp_mdt_max_pbufs" };
   1153 #define	tcps_mdt_max_pbufs	tcps_mdt_max_pbufs_param->tcp_param_val
   1154 
   1155 /* Round up the value to the nearest mss. */
   1156 #define	MSS_ROUNDUP(value, mss)		((((value) - 1) / (mss) + 1) * (mss))
   1157 
   1158 /*
   1159  * Set ECN capable transport (ECT) code point in IP header.
   1160  *
   1161  * Note that there are 2 ECT code points '01' and '10', which are called
   1162  * ECT(1) and ECT(0) respectively.  Here we follow the original ECT code
   1163  * point ECT(0) for TCP as described in RFC 2481.
   1164  */
   1165 #define	SET_ECT(tcp, iph) \
   1166 	if ((tcp)->tcp_ipversion == IPV4_VERSION) { \
   1167 		/* We need to clear the code point first. */ \
   1168 		((ipha_t *)(iph))->ipha_type_of_service &= 0xFC; \
   1169 		((ipha_t *)(iph))->ipha_type_of_service |= IPH_ECN_ECT0; \
   1170 	} else { \
   1171 		((ip6_t *)(iph))->ip6_vcf &= htonl(0xFFCFFFFF); \
   1172 		((ip6_t *)(iph))->ip6_vcf |= htonl(IPH_ECN_ECT0 << 20); \
   1173 	}
   1174 
   1175 /*
   1176  * The format argument to pass to tcp_display().
   1177  * DISP_PORT_ONLY means that the returned string has only port info.
   1178  * DISP_ADDR_AND_PORT means that the returned string also contains the
   1179  * remote and local IP address.
   1180  */
   1181 #define	DISP_PORT_ONLY		1
   1182 #define	DISP_ADDR_AND_PORT	2
   1183 
   1184 #define	IS_VMLOANED_MBLK(mp) \
   1185 	(((mp)->b_datap->db_struioflag & STRUIO_ZC) != 0)
   1186 
   1187 
   1188 /* Enable or disable b_cont M_MULTIDATA chaining for MDT. */
   1189 boolean_t tcp_mdt_chain = B_TRUE;
   1190 
   1191 /*
   1192  * MDT threshold in the form of effective send MSS multiplier; we take
   1193  * the MDT path if the amount of unsent data exceeds the threshold value
   1194  * (default threshold is 1*SMSS).
   1195  */
   1196 uint_t tcp_mdt_smss_threshold = 1;
   1197 
   1198 uint32_t do_tcpzcopy = 1;		/* 0: disable, 1: enable, 2: force */
   1199 
   1200 /*
   1201  * Forces all connections to obey the value of the tcps_maxpsz_multiplier
   1202  * tunable settable via NDD.  Otherwise, the per-connection behavior is
   1203  * determined dynamically during tcp_adapt_ire(), which is the default.
   1204  */
   1205 boolean_t tcp_static_maxpsz = B_FALSE;
   1206 
   1207 /* Settable in /etc/system */
   1208 /* If set to 0, pick ephemeral port sequentially; otherwise randomly. */
   1209 uint32_t tcp_random_anon_port = 1;
   1210 
   1211 /*
   1212  * To reach an eager in Q0 which can be dropped due to an incoming
   1213  * new SYN request when Q0 is full, a new doubly linked list is
   1214  * introduced. This list allows selecting an eager from Q0 in O(1) time.
   1215  * This is needed to avoid spending too much time walking through the
   1216  * long list of eagers in Q0 when tcp_drop_q0() is called. Each member of
   1217  * this new list has to be a member of Q0.
   1218  * This list is headed by listener's tcp_t. When the list is empty,
   1219  * both the pointers - tcp_eager_next_drop_q0 and tcp_eager_prev_drop_q0,
   1220  * of listener's tcp_t point to listener's tcp_t itself.
   1221  *
   1222  * Given an eager in Q0 and a listener, MAKE_DROPPABLE() puts the eager
   1223  * in the list. MAKE_UNDROPPABLE() takes the eager out of the list.
   1224  * These macros do not affect the eager's membership to Q0.
   1225  */
   1226 
   1227 
   1228 #define	MAKE_DROPPABLE(listener, eager)					\
   1229 	if ((eager)->tcp_eager_next_drop_q0 == NULL) {			\
   1230 		(listener)->tcp_eager_next_drop_q0->tcp_eager_prev_drop_q0\
   1231 		    = (eager);						\
   1232 		(eager)->tcp_eager_prev_drop_q0 = (listener);		\
   1233 		(eager)->tcp_eager_next_drop_q0 =			\
   1234 		    (listener)->tcp_eager_next_drop_q0;			\
   1235 		(listener)->tcp_eager_next_drop_q0 = (eager);		\
   1236 	}
   1237 
   1238 #define	MAKE_UNDROPPABLE(eager)						\
   1239 	if ((eager)->tcp_eager_next_drop_q0 != NULL) {			\
   1240 		(eager)->tcp_eager_next_drop_q0->tcp_eager_prev_drop_q0	\
   1241 		    = (eager)->tcp_eager_prev_drop_q0;			\
   1242 		(eager)->tcp_eager_prev_drop_q0->tcp_eager_next_drop_q0	\
   1243 		    = (eager)->tcp_eager_next_drop_q0;			\
   1244 		(eager)->tcp_eager_prev_drop_q0 = NULL;			\
   1245 		(eager)->tcp_eager_next_drop_q0 = NULL;			\
   1246 	}
   1247 
   1248 /*
   1249  * If tcp_drop_ack_unsent_cnt is greater than 0, when TCP receives more
   1250  * than tcp_drop_ack_unsent_cnt number of ACKs which acknowledge unsent
   1251  * data, TCP will not respond with an ACK.  RFC 793 requires that
   1252  * TCP responds with an ACK for such a bogus ACK.  By not following
   1253  * the RFC, we prevent TCP from getting into an ACK storm if somehow
   1254  * an attacker successfully spoofs an acceptable segment to our
   1255  * peer; or when our peer is "confused."
   1256  */
   1257 uint32_t tcp_drop_ack_unsent_cnt = 10;
   1258 
   1259 /*
   1260  * Hook functions to enable cluster networking
   1261  * On non-clustered systems these vectors must always be NULL.
   1262  */
   1263 
   1264 void (*cl_inet_listen)(netstackid_t stack_id, uint8_t protocol,
   1265 			    sa_family_t addr_family, uint8_t *laddrp,
   1266 			    in_port_t lport, void *args) = NULL;
   1267 void (*cl_inet_unlisten)(netstackid_t stack_id, uint8_t protocol,
   1268 			    sa_family_t addr_family, uint8_t *laddrp,
   1269 			    in_port_t lport, void *args) = NULL;
   1270 
   1271 int (*cl_inet_connect2)(netstackid_t stack_id, uint8_t protocol,
   1272 			    boolean_t is_outgoing,
   1273 			    sa_family_t addr_family,
   1274 			    uint8_t *laddrp, in_port_t lport,
   1275 			    uint8_t *faddrp, in_port_t fport,
   1276 			    void *args) = NULL;
   1277 
   1278 void (*cl_inet_disconnect)(netstackid_t stack_id, uint8_t protocol,
   1279 			    sa_family_t addr_family, uint8_t *laddrp,
   1280 			    in_port_t lport, uint8_t *faddrp,
   1281 			    in_port_t fport, void *args) = NULL;
   1282 
   1283 /*
   1284  * The following are defined in ip.c
   1285  */
   1286 extern int (*cl_inet_isclusterwide)(netstackid_t stack_id, uint8_t protocol,
   1287 			    sa_family_t addr_family, uint8_t *laddrp,
   1288 			    void *args);
   1289 extern uint32_t (*cl_inet_ipident)(netstackid_t stack_id, uint8_t protocol,
   1290 			    sa_family_t addr_family, uint8_t *laddrp,
   1291 			    uint8_t *faddrp, void *args);
   1292 
   1293 
   1294 /*
   1295  * int CL_INET_CONNECT(conn_t *cp, tcp_t *tcp, boolean_t is_outgoing, int err)
   1296  */
   1297 #define	CL_INET_CONNECT(connp, tcp, is_outgoing, err) {		\
   1298 	(err) = 0;						\
   1299 	if (cl_inet_connect2 != NULL) {				\
   1300 		/*						\
   1301 		 * Running in cluster mode - register active connection	\
   1302 		 * information						\
   1303 		 */							\
   1304 		if ((tcp)->tcp_ipversion == IPV4_VERSION) {		\
   1305 			if ((tcp)->tcp_ipha->ipha_src != 0) {		\
   1306 				(err) = (*cl_inet_connect2)(		\
   1307 				    (connp)->conn_netstack->netstack_stackid,\
   1308 				    IPPROTO_TCP, is_outgoing, AF_INET,	\
   1309 				    (uint8_t *)(&((tcp)->tcp_ipha->ipha_src)),\
   1310 				    (in_port_t)(tcp)->tcp_lport,	\
   1311 				    (uint8_t *)(&((tcp)->tcp_ipha->ipha_dst)),\
   1312 				    (in_port_t)(tcp)->tcp_fport, NULL);	\
   1313 			}						\
   1314 		} else {						\
   1315 			if (!IN6_IS_ADDR_UNSPECIFIED(			\
   1316 			    &(tcp)->tcp_ip6h->ip6_src)) {		\
   1317 				(err) = (*cl_inet_connect2)(		\
   1318 				    (connp)->conn_netstack->netstack_stackid,\
   1319 				    IPPROTO_TCP, is_outgoing, AF_INET6,	\
   1320 				    (uint8_t *)(&((tcp)->tcp_ip6h->ip6_src)),\
   1321 				    (in_port_t)(tcp)->tcp_lport,	\
   1322 				    (uint8_t *)(&((tcp)->tcp_ip6h->ip6_dst)),\
   1323 				    (in_port_t)(tcp)->tcp_fport, NULL);	\
   1324 			}						\
   1325 		}							\
   1326 	}								\
   1327 }
   1328 
   1329 #define	CL_INET_DISCONNECT(connp, tcp)	{				\
   1330 	if (cl_inet_disconnect != NULL) {				\
   1331 		/*							\
   1332 		 * Running in cluster mode - deregister active		\
   1333 		 * connection information				\
   1334 		 */							\
   1335 		if ((tcp)->tcp_ipversion == IPV4_VERSION) {		\
   1336 			if ((tcp)->tcp_ip_src != 0) {			\
   1337 				(*cl_inet_disconnect)(			\
   1338 				    (connp)->conn_netstack->netstack_stackid,\
   1339 				    IPPROTO_TCP, AF_INET,		\
   1340 				    (uint8_t *)(&((tcp)->tcp_ip_src)),	\
   1341 				    (in_port_t)(tcp)->tcp_lport,	\
   1342 				    (uint8_t *)(&((tcp)->tcp_ipha->ipha_dst)),\
   1343 				    (in_port_t)(tcp)->tcp_fport, NULL);	\
   1344 			}						\
   1345 		} else {						\
   1346 			if (!IN6_IS_ADDR_UNSPECIFIED(			\
   1347 			    &(tcp)->tcp_ip_src_v6)) {			\
   1348 				(*cl_inet_disconnect)(			\
   1349 				    (connp)->conn_netstack->netstack_stackid,\
   1350 				    IPPROTO_TCP, AF_INET6,		\
   1351 				    (uint8_t *)(&((tcp)->tcp_ip_src_v6)),\
   1352 				    (in_port_t)(tcp)->tcp_lport,	\
   1353 				    (uint8_t *)(&((tcp)->tcp_ip6h->ip6_dst)),\
   1354 				    (in_port_t)(tcp)->tcp_fport, NULL);	\
   1355 			}						\
   1356 		}							\
   1357 	}								\
   1358 }
   1359 
   1360 /*
   1361  * Cluster networking hook for traversing current connection list.
   1362  * This routine is used to extract the current list of live connections
   1363  * which must continue to to be dispatched to this node.
   1364  */
   1365 int cl_tcp_walk_list(netstackid_t stack_id,
   1366     int (*callback)(cl_tcp_info_t *, void *), void *arg);
   1367 
   1368 static int cl_tcp_walk_list_stack(int (*callback)(cl_tcp_info_t *, void *),
   1369     void *arg, tcp_stack_t *tcps);
   1370 
   1371 #define	DTRACE_IP_FASTPATH(mp, iph, ill, ipha, ip6h) 			\
   1372 	DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *,	\
   1373 	    iph, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha,		\
   1374 	    ip6_t *, ip6h, int, 0);
   1375 
   1376 /*
   1377  * Figure out the value of window scale opton.  Note that the rwnd is
   1378  * ASSUMED to be rounded up to the nearest MSS before the calculation.
   1379  * We cannot find the scale value and then do a round up of tcp_rwnd
   1380  * because the scale value may not be correct after that.
   1381  *
   1382  * Set the compiler flag to make this function inline.
   1383  */
   1384 static void
   1385 tcp_set_ws_value(tcp_t *tcp)
   1386 {
   1387 	int i;
   1388 	uint32_t rwnd = tcp->tcp_rwnd;
   1389 
   1390 	for (i = 0; rwnd > TCP_MAXWIN && i < TCP_MAX_WINSHIFT;
   1391 	    i++, rwnd >>= 1)
   1392 		;
   1393 	tcp->tcp_rcv_ws = i;
   1394 }
   1395 
   1396 /*
   1397  * Remove a connection from the list of detached TIME_WAIT connections.
   1398  * It returns B_FALSE if it can't remove the connection from the list
   1399  * as the connection has already been removed from the list due to an
   1400  * earlier call to tcp_time_wait_remove(); otherwise it returns B_TRUE.
   1401  */
   1402 static boolean_t
   1403 tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tcp_time_wait)
   1404 {
   1405 	boolean_t	locked = B_FALSE;
   1406 
   1407 	if (tcp_time_wait == NULL) {
   1408 		tcp_time_wait = *((tcp_squeue_priv_t **)
   1409 		    squeue_getprivate(tcp->tcp_connp->conn_sqp, SQPRIVATE_TCP));
   1410 		mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
   1411 		locked = B_TRUE;
   1412 	} else {
   1413 		ASSERT(MUTEX_HELD(&tcp_time_wait->tcp_time_wait_lock));
   1414 	}
   1415 
   1416 	if (tcp->tcp_time_wait_expire == 0) {
   1417 		ASSERT(tcp->tcp_time_wait_next == NULL);
   1418 		ASSERT(tcp->tcp_time_wait_prev == NULL);
   1419 		if (locked)
   1420 			mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
   1421 		return (B_FALSE);
   1422 	}
   1423 	ASSERT(TCP_IS_DETACHED(tcp));
   1424 	ASSERT(tcp->tcp_state == TCPS_TIME_WAIT);
   1425 
   1426 	if (tcp == tcp_time_wait->tcp_time_wait_head) {
   1427 		ASSERT(tcp->tcp_time_wait_prev == NULL);
   1428 		tcp_time_wait->tcp_time_wait_head = tcp->tcp_time_wait_next;
   1429 		if (tcp_time_wait->tcp_time_wait_head != NULL) {
   1430 			tcp_time_wait->tcp_time_wait_head->tcp_time_wait_prev =
   1431 			    NULL;
   1432 		} else {
   1433 			tcp_time_wait->tcp_time_wait_tail = NULL;
   1434 		}
   1435 	} else if (tcp == tcp_time_wait->tcp_time_wait_tail) {
   1436 		ASSERT(tcp != tcp_time_wait->tcp_time_wait_head);
   1437 		ASSERT(tcp->tcp_time_wait_next == NULL);
   1438 		tcp_time_wait->tcp_time_wait_tail = tcp->tcp_time_wait_prev;
   1439 		ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL);
   1440 		tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = NULL;
   1441 	} else {
   1442 		ASSERT(tcp->tcp_time_wait_prev->tcp_time_wait_next == tcp);
   1443 		ASSERT(tcp->tcp_time_wait_next->tcp_time_wait_prev == tcp);
   1444 		tcp->tcp_time_wait_prev->tcp_time_wait_next =
   1445 		    tcp->tcp_time_wait_next;
   1446 		tcp->tcp_time_wait_next->tcp_time_wait_prev =
   1447 		    tcp->tcp_time_wait_prev;
   1448 	}
   1449 	tcp->tcp_time_wait_next = NULL;
   1450 	tcp->tcp_time_wait_prev = NULL;
   1451 	tcp->tcp_time_wait_expire = 0;
   1452 
   1453 	if (locked)
   1454 		mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
   1455 	return (B_TRUE);
   1456 }
   1457 
   1458 /*
   1459  * Add a connection to the list of detached TIME_WAIT connections
   1460  * and set its time to expire.
   1461  */
   1462 static void
   1463 tcp_time_wait_append(tcp_t *tcp)
   1464 {
   1465 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   1466 	tcp_squeue_priv_t *tcp_time_wait =
   1467 	    *((tcp_squeue_priv_t **)squeue_getprivate(tcp->tcp_connp->conn_sqp,
   1468 	    SQPRIVATE_TCP));
   1469 
   1470 	tcp_timers_stop(tcp);
   1471 
   1472 	/* Freed above */
   1473 	ASSERT(tcp->tcp_timer_tid == 0);
   1474 	ASSERT(tcp->tcp_ack_tid == 0);
   1475 
   1476 	/* must have happened at the time of detaching the tcp */
   1477 	ASSERT(tcp->tcp_ptpahn == NULL);
   1478 	ASSERT(tcp->tcp_flow_stopped == 0);
   1479 	ASSERT(tcp->tcp_time_wait_next == NULL);
   1480 	ASSERT(tcp->tcp_time_wait_prev == NULL);
   1481 	ASSERT(tcp->tcp_time_wait_expire == NULL);
   1482 	ASSERT(tcp->tcp_listener == NULL);
   1483 
   1484 	tcp->tcp_time_wait_expire = ddi_get_lbolt();
   1485 	/*
   1486 	 * The value computed below in tcp->tcp_time_wait_expire may
   1487 	 * appear negative or wrap around. That is ok since our
   1488 	 * interest is only in the difference between the current lbolt
   1489 	 * value and tcp->tcp_time_wait_expire. But the value should not
   1490 	 * be zero, since it means the tcp is not in the TIME_WAIT list.
   1491 	 * The corresponding comparison in tcp_time_wait_collector() uses
   1492 	 * modular arithmetic.
   1493 	 */
   1494 	tcp->tcp_time_wait_expire +=
   1495 	    drv_usectohz(tcps->tcps_time_wait_interval * 1000);
   1496 	if (tcp->tcp_time_wait_expire == 0)
   1497 		tcp->tcp_time_wait_expire = 1;
   1498 
   1499 	ASSERT(TCP_IS_DETACHED(tcp));
   1500 	ASSERT(tcp->tcp_state == TCPS_TIME_WAIT);
   1501 	ASSERT(tcp->tcp_time_wait_next == NULL);
   1502 	ASSERT(tcp->tcp_time_wait_prev == NULL);
   1503 	TCP_DBGSTAT(tcps, tcp_time_wait);
   1504 
   1505 	mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
   1506 	if (tcp_time_wait->tcp_time_wait_head == NULL) {
   1507 		ASSERT(tcp_time_wait->tcp_time_wait_tail == NULL);
   1508 		tcp_time_wait->tcp_time_wait_head = tcp;
   1509 	} else {
   1510 		ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL);
   1511 		ASSERT(tcp_time_wait->tcp_time_wait_tail->tcp_state ==
   1512 		    TCPS_TIME_WAIT);
   1513 		tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = tcp;
   1514 		tcp->tcp_time_wait_prev = tcp_time_wait->tcp_time_wait_tail;
   1515 	}
   1516 	tcp_time_wait->tcp_time_wait_tail = tcp;
   1517 	mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
   1518 }
   1519 
   1520 /* ARGSUSED */
   1521 void
   1522 tcp_timewait_output(void *arg, mblk_t *mp, void *arg2)
   1523 {
   1524 	conn_t	*connp = (conn_t *)arg;
   1525 	tcp_t	*tcp = connp->conn_tcp;
   1526 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   1527 
   1528 	ASSERT(tcp != NULL);
   1529 	if (tcp->tcp_state == TCPS_CLOSED) {
   1530 		return;
   1531 	}
   1532 
   1533 	ASSERT((tcp->tcp_family == AF_INET &&
   1534 	    tcp->tcp_ipversion == IPV4_VERSION) ||
   1535 	    (tcp->tcp_family == AF_INET6 &&
   1536 	    (tcp->tcp_ipversion == IPV4_VERSION ||
   1537 	    tcp->tcp_ipversion == IPV6_VERSION)));
   1538 	ASSERT(!tcp->tcp_listener);
   1539 
   1540 	TCP_STAT(tcps, tcp_time_wait_reap);
   1541 	ASSERT(TCP_IS_DETACHED(tcp));
   1542 
   1543 	/*
   1544 	 * Because they have no upstream client to rebind or tcp_close()
   1545 	 * them later, we axe the connection here and now.
   1546 	 */
   1547 	tcp_close_detached(tcp);
   1548 }
   1549 
   1550 /*
   1551  * Remove cached/latched IPsec references.
   1552  */
   1553 void
   1554 tcp_ipsec_cleanup(tcp_t *tcp)
   1555 {
   1556 	conn_t		*connp = tcp->tcp_connp;
   1557 
   1558 	ASSERT(connp->conn_flags & IPCL_TCPCONN);
   1559 
   1560 	if (connp->conn_latch != NULL) {
   1561 		IPLATCH_REFRELE(connp->conn_latch,
   1562 		    connp->conn_netstack);
   1563 		connp->conn_latch = NULL;
   1564 	}
   1565 	if (connp->conn_policy != NULL) {
   1566 		IPPH_REFRELE(connp->conn_policy, connp->conn_netstack);
   1567 		connp->conn_policy = NULL;
   1568 	}
   1569 }
   1570 
   1571 /*
   1572  * Cleaup before placing on free list.
   1573  * Disassociate from the netstack/tcp_stack_t since the freelist
   1574  * is per squeue and not per netstack.
   1575  */
   1576 void
   1577 tcp_cleanup(tcp_t *tcp)
   1578 {
   1579 	mblk_t		*mp;
   1580 	char		*tcp_iphc;
   1581 	int		tcp_iphc_len;
   1582 	int		tcp_hdr_grown;
   1583 	tcp_sack_info_t	*tcp_sack_info;
   1584 	conn_t		*connp = tcp->tcp_connp;
   1585 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   1586 	netstack_t	*ns = tcps->tcps_netstack;
   1587 	mblk_t		*tcp_rsrv_mp;
   1588 
   1589 	tcp_bind_hash_remove(tcp);
   1590 
   1591 	/* Cleanup that which needs the netstack first */
   1592 	tcp_ipsec_cleanup(tcp);
   1593 
   1594 	tcp_free(tcp);
   1595 
   1596 	/* Release any SSL context */
   1597 	if (tcp->tcp_kssl_ent != NULL) {
   1598 		kssl_release_ent(tcp->tcp_kssl_ent, NULL, KSSL_NO_PROXY);
   1599 		tcp->tcp_kssl_ent = NULL;
   1600 	}
   1601 
   1602 	if (tcp->tcp_kssl_ctx != NULL) {
   1603 		kssl_release_ctx(tcp->tcp_kssl_ctx);
   1604 		tcp->tcp_kssl_ctx = NULL;
   1605 	}
   1606 	tcp->tcp_kssl_pending = B_FALSE;
   1607 
   1608 	conn_delete_ire(connp, NULL);
   1609 
   1610 	/*
   1611 	 * Since we will bzero the entire structure, we need to
   1612 	 * remove it and reinsert it in global hash list. We
   1613 	 * know the walkers can't get to this conn because we
   1614 	 * had set CONDEMNED flag earlier and checked reference
   1615 	 * under conn_lock so walker won't pick it and when we
   1616 	 * go the ipcl_globalhash_remove() below, no walker
   1617 	 * can get to it.
   1618 	 */
   1619 	ipcl_globalhash_remove(connp);
   1620 
   1621 	/*
   1622 	 * Now it is safe to decrement the reference counts.
   1623 	 * This might be the last reference on the netstack and TCPS
   1624 	 * in which case it will cause the tcp_g_q_close and
   1625 	 * the freeing of the IP Instance.
   1626 	 */
   1627 	connp->conn_netstack = NULL;
   1628 	netstack_rele(ns);
   1629 	ASSERT(tcps != NULL);
   1630 	tcp->tcp_tcps = NULL;
   1631 	TCPS_REFRELE(tcps);
   1632 
   1633 	/* Save some state */
   1634 	mp = tcp->tcp_timercache;
   1635 
   1636 	tcp_sack_info = tcp->tcp_sack_info;
   1637 	tcp_iphc = tcp->tcp_iphc;
   1638 	tcp_iphc_len = tcp->tcp_iphc_len;
   1639 	tcp_hdr_grown = tcp->tcp_hdr_grown;
   1640 	tcp_rsrv_mp = tcp->tcp_rsrv_mp;
   1641 
   1642 	if (connp->conn_cred != NULL) {
   1643 		crfree(connp->conn_cred);
   1644 		connp->conn_cred = NULL;
   1645 	}
   1646 	if (connp->conn_effective_cred != NULL) {
   1647 		crfree(connp->conn_effective_cred);
   1648 		connp->conn_effective_cred = NULL;
   1649 	}
   1650 	ipcl_conn_cleanup(connp);
   1651 	connp->conn_flags = IPCL_TCPCONN;
   1652 	bzero(tcp, sizeof (tcp_t));
   1653 
   1654 	/* restore the state */
   1655 	tcp->tcp_timercache = mp;
   1656 
   1657 	tcp->tcp_sack_info = tcp_sack_info;
   1658 	tcp->tcp_iphc = tcp_iphc;
   1659 	tcp->tcp_iphc_len = tcp_iphc_len;
   1660 	tcp->tcp_hdr_grown = tcp_hdr_grown;
   1661 	tcp->tcp_rsrv_mp = tcp_rsrv_mp;
   1662 
   1663 	tcp->tcp_connp = connp;
   1664 
   1665 	ASSERT(connp->conn_tcp == tcp);
   1666 	ASSERT(connp->conn_flags & IPCL_TCPCONN);
   1667 	connp->conn_state_flags = CONN_INCIPIENT;
   1668 	ASSERT(connp->conn_ulp == IPPROTO_TCP);
   1669 	ASSERT(connp->conn_ref == 1);
   1670 }
   1671 
   1672 /*
   1673  * Blows away all tcps whose TIME_WAIT has expired. List traversal
   1674  * is done forwards from the head.
   1675  * This walks all stack instances since
   1676  * tcp_time_wait remains global across all stacks.
   1677  */
   1678 /* ARGSUSED */
   1679 void
   1680 tcp_time_wait_collector(void *arg)
   1681 {
   1682 	tcp_t *tcp;
   1683 	clock_t now;
   1684 	mblk_t *mp;
   1685 	conn_t *connp;
   1686 	kmutex_t *lock;
   1687 	boolean_t removed;
   1688 
   1689 	squeue_t *sqp = (squeue_t *)arg;
   1690 	tcp_squeue_priv_t *tcp_time_wait =
   1691 	    *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP));
   1692 
   1693 	mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
   1694 	tcp_time_wait->tcp_time_wait_tid = 0;
   1695 
   1696 	if (tcp_time_wait->tcp_free_list != NULL &&
   1697 	    tcp_time_wait->tcp_free_list->tcp_in_free_list == B_TRUE) {
   1698 		TCP_G_STAT(tcp_freelist_cleanup);
   1699 		while ((tcp = tcp_time_wait->tcp_free_list) != NULL) {
   1700 			tcp_time_wait->tcp_free_list = tcp->tcp_time_wait_next;
   1701 			tcp->tcp_time_wait_next = NULL;
   1702 			tcp_time_wait->tcp_free_list_cnt--;
   1703 			ASSERT(tcp->tcp_tcps == NULL);
   1704 			CONN_DEC_REF(tcp->tcp_connp);
   1705 		}
   1706 		ASSERT(tcp_time_wait->tcp_free_list_cnt == 0);
   1707 	}
   1708 
   1709 	/*
   1710 	 * In order to reap time waits reliably, we should use a
   1711 	 * source of time that is not adjustable by the user -- hence
   1712 	 * the call to ddi_get_lbolt().
   1713 	 */
   1714 	now = ddi_get_lbolt();
   1715 	while ((tcp = tcp_time_wait->tcp_time_wait_head) != NULL) {
   1716 		/*
   1717 		 * Compare times using modular arithmetic, since
   1718 		 * lbolt can wrapover.
   1719 		 */
   1720 		if ((now - tcp->tcp_time_wait_expire) < 0) {
   1721 			break;
   1722 		}
   1723 
   1724 		removed = tcp_time_wait_remove(tcp, tcp_time_wait);
   1725 		ASSERT(removed);
   1726 
   1727 		connp = tcp->tcp_connp;
   1728 		ASSERT(connp->conn_fanout != NULL);
   1729 		lock = &connp->conn_fanout->connf_lock;
   1730 		/*
   1731 		 * This is essentially a TW reclaim fast path optimization for
   1732 		 * performance where the timewait collector checks under the
   1733 		 * fanout lock (so that no one else can get access to the
   1734 		 * conn_t) that the refcnt is 2 i.e. one for TCP and one for
   1735 		 * the classifier hash list. If ref count is indeed 2, we can
   1736 		 * just remove the conn under the fanout lock and avoid
   1737 		 * cleaning up the conn under the squeue, provided that
   1738 		 * clustering callbacks are not enabled. If clustering is
   1739 		 * enabled, we need to make the clustering callback before
   1740 		 * setting the CONDEMNED flag and after dropping all locks and
   1741 		 * so we forego this optimization and fall back to the slow
   1742 		 * path. Also please see the comments in tcp_closei_local
   1743 		 * regarding the refcnt logic.
   1744 		 *
   1745 		 * Since we are holding the tcp_time_wait_lock, its better
   1746 		 * not to block on the fanout_lock because other connections
   1747 		 * can't add themselves to time_wait list. So we do a
   1748 		 * tryenter instead of mutex_enter.
   1749 		 */
   1750 		if (mutex_tryenter(lock)) {
   1751 			mutex_enter(&connp->conn_lock);
   1752 			if ((connp->conn_ref == 2) &&
   1753 			    (cl_inet_disconnect == NULL)) {
   1754 				ipcl_hash_remove_locked(connp,
   1755 				    connp->conn_fanout);
   1756 				/*
   1757 				 * Set the CONDEMNED flag now itself so that
   1758 				 * the refcnt cannot increase due to any
   1759 				 * walker. But we have still not cleaned up
   1760 				 * conn_ire_cache. This is still ok since
   1761 				 * we are going to clean it up in tcp_cleanup
   1762 				 * immediately and any interface unplumb
   1763 				 * thread will wait till the ire is blown away
   1764 				 */
   1765 				connp->conn_state_flags |= CONN_CONDEMNED;
   1766 				mutex_exit(lock);
   1767 				mutex_exit(&connp->conn_lock);
   1768 				if (tcp_time_wait->tcp_free_list_cnt <
   1769 				    tcp_free_list_max_cnt) {
   1770 					/* Add to head of tcp_free_list */
   1771 					mutex_exit(
   1772 					    &tcp_time_wait->tcp_time_wait_lock);
   1773 					tcp_cleanup(tcp);
   1774 					ASSERT(connp->conn_latch == NULL);
   1775 					ASSERT(connp->conn_policy == NULL);
   1776 					ASSERT(tcp->tcp_tcps == NULL);
   1777 					ASSERT(connp->conn_netstack == NULL);
   1778 
   1779 					mutex_enter(
   1780 					    &tcp_time_wait->tcp_time_wait_lock);
   1781 					tcp->tcp_time_wait_next =
   1782 					    tcp_time_wait->tcp_free_list;
   1783 					tcp_time_wait->tcp_free_list = tcp;
   1784 					tcp_time_wait->tcp_free_list_cnt++;
   1785 					continue;
   1786 				} else {
   1787 					/* Do not add to tcp_free_list */
   1788 					mutex_exit(
   1789 					    &tcp_time_wait->tcp_time_wait_lock);
   1790 					tcp_bind_hash_remove(tcp);
   1791 					conn_delete_ire(tcp->tcp_connp, NULL);
   1792 					tcp_ipsec_cleanup(tcp);
   1793 					CONN_DEC_REF(tcp->tcp_connp);
   1794 				}
   1795 			} else {
   1796 				CONN_INC_REF_LOCKED(connp);
   1797 				mutex_exit(lock);
   1798 				mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
   1799 				mutex_exit(&connp->conn_lock);
   1800 				/*
   1801 				 * We can reuse the closemp here since conn has
   1802 				 * detached (otherwise we wouldn't even be in
   1803 				 * time_wait list). tcp_closemp_used can safely
   1804 				 * be changed without taking a lock as no other
   1805 				 * thread can concurrently access it at this
   1806 				 * point in the connection lifecycle.
   1807 				 */
   1808 
   1809 				if (tcp->tcp_closemp.b_prev == NULL)
   1810 					tcp->tcp_closemp_used = B_TRUE;
   1811 				else
   1812 					cmn_err(CE_PANIC,
   1813 					    "tcp_timewait_collector: "
   1814 					    "concurrent use of tcp_closemp: "
   1815 					    "connp %p tcp %p\n", (void *)connp,
   1816 					    (void *)tcp);
   1817 
   1818 				TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
   1819 				mp = &tcp->tcp_closemp;
   1820 				SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
   1821 				    tcp_timewait_output, connp,
   1822 				    SQ_FILL, SQTAG_TCP_TIMEWAIT);
   1823 			}
   1824 		} else {
   1825 			mutex_enter(&connp->conn_lock);
   1826 			CONN_INC_REF_LOCKED(connp);
   1827 			mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
   1828 			mutex_exit(&connp->conn_lock);
   1829 			/*
   1830 			 * We can reuse the closemp here since conn has
   1831 			 * detached (otherwise we wouldn't even be in
   1832 			 * time_wait list). tcp_closemp_used can safely
   1833 			 * be changed without taking a lock as no other
   1834 			 * thread can concurrently access it at this
   1835 			 * point in the connection lifecycle.
   1836 			 */
   1837 
   1838 			if (tcp->tcp_closemp.b_prev == NULL)
   1839 				tcp->tcp_closemp_used = B_TRUE;
   1840 			else
   1841 				cmn_err(CE_PANIC, "tcp_timewait_collector: "
   1842 				    "concurrent use of tcp_closemp: "
   1843 				    "connp %p tcp %p\n", (void *)connp,
   1844 				    (void *)tcp);
   1845 
   1846 			TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
   1847 			mp = &tcp->tcp_closemp;
   1848 			SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
   1849 			    tcp_timewait_output, connp,
   1850 			    SQ_FILL, SQTAG_TCP_TIMEWAIT);
   1851 		}
   1852 		mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
   1853 	}
   1854 
   1855 	if (tcp_time_wait->tcp_free_list != NULL)
   1856 		tcp_time_wait->tcp_free_list->tcp_in_free_list = B_TRUE;
   1857 
   1858 	tcp_time_wait->tcp_time_wait_tid =
   1859 	    timeout_generic(CALLOUT_NORMAL, tcp_time_wait_collector, sqp,
   1860 	    TICK_TO_NSEC(TCP_TIME_WAIT_DELAY), CALLOUT_TCP_RESOLUTION,
   1861 	    CALLOUT_FLAG_ROUNDUP);
   1862 	mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
   1863 }
   1864 
   1865 /*
   1866  * Reply to a client's T_CONN_RES TPI message. This function
   1867  * is used only for TLI/XTI listener. Sockfs sends T_CONN_RES
   1868  * on the acceptor STREAM and processed in tcp_wput_accept().
   1869  * Read the block comment on top of tcp_conn_request().
   1870  */
   1871 static void
   1872 tcp_tli_accept(tcp_t *listener, mblk_t *mp)
   1873 {
   1874 	tcp_t	*acceptor;
   1875 	tcp_t	*eager;
   1876 	tcp_t   *tcp;
   1877 	struct T_conn_res	*tcr;
   1878 	t_uscalar_t	acceptor_id;
   1879 	t_scalar_t	seqnum;
   1880 	mblk_t	*opt_mp = NULL;	/* T_OPTMGMT_REQ messages */
   1881 	struct tcp_options *tcpopt;
   1882 	mblk_t	*ok_mp;
   1883 	mblk_t	*mp1;
   1884 	tcp_stack_t	*tcps = listener->tcp_tcps;
   1885 	int	error;
   1886 
   1887 	if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) {
   1888 		tcp_err_ack(listener, mp, TPROTO, 0);
   1889 		return;
   1890 	}
   1891 	tcr = (struct T_conn_res *)mp->b_rptr;
   1892 
   1893 	/*
   1894 	 * Under ILP32 the stream head points tcr->ACCEPTOR_id at the
   1895 	 * read side queue of the streams device underneath us i.e. the
   1896 	 * read side queue of 'ip'. Since we can't dereference QUEUE_ptr we
   1897 	 * look it up in the queue_hash.  Under LP64 it sends down the
   1898 	 * minor_t of the accepting endpoint.
   1899 	 *
   1900 	 * Once the acceptor/eager are modified (in tcp_accept_swap) the
   1901 	 * fanout hash lock is held.
   1902 	 * This prevents any thread from entering the acceptor queue from
   1903 	 * below (since it has not been hard bound yet i.e. any inbound
   1904 	 * packets will arrive on the listener or default tcp queue and
   1905 	 * go through tcp_lookup).
   1906 	 * The CONN_INC_REF will prevent the acceptor from closing.
   1907 	 *
   1908 	 * XXX It is still possible for a tli application to send down data
   1909 	 * on the accepting stream while another thread calls t_accept.
   1910 	 * This should not be a problem for well-behaved applications since
   1911 	 * the T_OK_ACK is sent after the queue swapping is completed.
   1912 	 *
   1913 	 * If the accepting fd is the same as the listening fd, avoid
   1914 	 * queue hash lookup since that will return an eager listener in a
   1915 	 * already established state.
   1916 	 */
   1917 	acceptor_id = tcr->ACCEPTOR_id;
   1918 	mutex_enter(&listener->tcp_eager_lock);
   1919 	if (listener->tcp_acceptor_id == acceptor_id) {
   1920 		eager = listener->tcp_eager_next_q;
   1921 		/* only count how many T_CONN_INDs so don't count q0 */
   1922 		if ((listener->tcp_conn_req_cnt_q != 1) ||
   1923 		    (eager->tcp_conn_req_seqnum != tcr->SEQ_number)) {
   1924 			mutex_exit(&listener->tcp_eager_lock);
   1925 			tcp_err_ack(listener, mp, TBADF, 0);
   1926 			return;
   1927 		}
   1928 		if (listener->tcp_conn_req_cnt_q0 != 0) {
   1929 			/* Throw away all the eagers on q0. */
   1930 			tcp_eager_cleanup(listener, 1);
   1931 		}
   1932 		if (listener->tcp_syn_defense) {
   1933 			listener->tcp_syn_defense = B_FALSE;
   1934 			if (listener->tcp_ip_addr_cache != NULL) {
   1935 				kmem_free(listener->tcp_ip_addr_cache,
   1936 				    IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t));
   1937 				listener->tcp_ip_addr_cache = NULL;
   1938 			}
   1939 		}
   1940 		/*
   1941 		 * Transfer tcp_conn_req_max to the eager so that when
   1942 		 * a disconnect occurs we can revert the endpoint to the
   1943 		 * listen state.
   1944 		 */
   1945 		eager->tcp_conn_req_max = listener->tcp_conn_req_max;
   1946 		ASSERT(listener->tcp_conn_req_cnt_q0 == 0);
   1947 		/*
   1948 		 * Get a reference on the acceptor just like the
   1949 		 * tcp_acceptor_hash_lookup below.
   1950 		 */
   1951 		acceptor = listener;
   1952 		CONN_INC_REF(acceptor->tcp_connp);
   1953 	} else {
   1954 		acceptor = tcp_acceptor_hash_lookup(acceptor_id, tcps);
   1955 		if (acceptor == NULL) {
   1956 			if (listener->tcp_debug) {
   1957 				(void) strlog(TCP_MOD_ID, 0, 1,
   1958 				    SL_ERROR|SL_TRACE,
   1959 				    "tcp_accept: did not find acceptor 0x%x\n",
   1960 				    acceptor_id);
   1961 			}
   1962 			mutex_exit(&listener->tcp_eager_lock);
   1963 			tcp_err_ack(listener, mp, TPROVMISMATCH, 0);
   1964 			return;
   1965 		}
   1966 		/*
   1967 		 * Verify acceptor state. The acceptable states for an acceptor
   1968 		 * include TCPS_IDLE and TCPS_BOUND.
   1969 		 */
   1970 		switch (acceptor->tcp_state) {
   1971 		case TCPS_IDLE:
   1972 			/* FALLTHRU */
   1973 		case TCPS_BOUND:
   1974 			break;
   1975 		default:
   1976 			CONN_DEC_REF(acceptor->tcp_connp);
   1977 			mutex_exit(&listener->tcp_eager_lock);
   1978 			tcp_err_ack(listener, mp, TOUTSTATE, 0);
   1979 			return;
   1980 		}
   1981 	}
   1982 
   1983 	/* The listener must be in TCPS_LISTEN */
   1984 	if (listener->tcp_state != TCPS_LISTEN) {
   1985 		CONN_DEC_REF(acceptor->tcp_connp);
   1986 		mutex_exit(&listener->tcp_eager_lock);
   1987 		tcp_err_ack(listener, mp, TOUTSTATE, 0);
   1988 		return;
   1989 	}
   1990 
   1991 	/*
   1992 	 * Rendezvous with an eager connection request packet hanging off
   1993 	 * 'tcp' that has the 'seqnum' tag.  We tagged the detached open
   1994 	 * tcp structure when the connection packet arrived in
   1995 	 * tcp_conn_request().
   1996 	 */
   1997 	seqnum = tcr->SEQ_number;
   1998 	eager = listener;
   1999 	do {
   2000 		eager = eager->tcp_eager_next_q;
   2001 		if (eager == NULL) {
   2002 			CONN_DEC_REF(acceptor->tcp_connp);
   2003 			mutex_exit(&listener->tcp_eager_lock);
   2004 			tcp_err_ack(listener, mp, TBADSEQ, 0);
   2005 			return;
   2006 		}
   2007 	} while (eager->tcp_conn_req_seqnum != seqnum);
   2008 	mutex_exit(&listener->tcp_eager_lock);
   2009 
   2010 	/*
   2011 	 * At this point, both acceptor and listener have 2 ref
   2012 	 * that they begin with. Acceptor has one additional ref
   2013 	 * we placed in lookup while listener has 3 additional
   2014 	 * ref for being behind the squeue (tcp_accept() is
   2015 	 * done on listener's squeue); being in classifier hash;
   2016 	 * and eager's ref on listener.
   2017 	 */
   2018 	ASSERT(listener->tcp_connp->conn_ref >= 5);
   2019 	ASSERT(acceptor->tcp_connp->conn_ref >= 3);
   2020 
   2021 	/*
   2022 	 * The eager at this point is set in its own squeue and
   2023 	 * could easily have been killed (tcp_accept_finish will
   2024 	 * deal with that) because of a TH_RST so we can only
   2025 	 * ASSERT for a single ref.
   2026 	 */
   2027 	ASSERT(eager->tcp_connp->conn_ref >= 1);
   2028 
   2029 	/* Pre allocate the stroptions mblk also */
   2030 	opt_mp = allocb(MAX(sizeof (struct tcp_options),
   2031 	    sizeof (struct T_conn_res)), BPRI_HI);
   2032 	if (opt_mp == NULL) {
   2033 		CONN_DEC_REF(acceptor->tcp_connp);
   2034 		CONN_DEC_REF(eager->tcp_connp);
   2035 		tcp_err_ack(listener, mp, TSYSERR, ENOMEM);
   2036 		return;
   2037 	}
   2038 	DB_TYPE(opt_mp) = M_SETOPTS;
   2039 	opt_mp->b_wptr += sizeof (struct tcp_options);
   2040 	tcpopt = (struct tcp_options *)opt_mp->b_rptr;
   2041 	tcpopt->to_flags = 0;
   2042 
   2043 	/*
   2044 	 * Prepare for inheriting IPV6_BOUND_IF and IPV6_RECVPKTINFO
   2045 	 * from listener to acceptor.
   2046 	 */
   2047 	if (listener->tcp_bound_if != 0) {
   2048 		tcpopt->to_flags |= TCPOPT_BOUNDIF;
   2049 		tcpopt->to_boundif = listener->tcp_bound_if;
   2050 	}
   2051 	if (listener->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO) {
   2052 		tcpopt->to_flags |= TCPOPT_RECVPKTINFO;
   2053 	}
   2054 
   2055 	/* Re-use mp1 to hold a copy of mp, in case reallocb fails */
   2056 	if ((mp1 = copymsg(mp)) == NULL) {
   2057 		CONN_DEC_REF(acceptor->tcp_connp);
   2058 		CONN_DEC_REF(eager->tcp_connp);
   2059 		freemsg(opt_mp);
   2060 		tcp_err_ack(listener, mp, TSYSERR, ENOMEM);
   2061 		return;
   2062 	}
   2063 
   2064 	tcr = (struct T_conn_res *)mp1->b_rptr;
   2065 
   2066 	/*
   2067 	 * This is an expanded version of mi_tpi_ok_ack_alloc()
   2068 	 * which allocates a larger mblk and appends the new
   2069 	 * local address to the ok_ack.  The address is copied by
   2070 	 * soaccept() for getsockname().
   2071 	 */
   2072 	{
   2073 		int extra;
   2074 
   2075 		extra = (eager->tcp_family == AF_INET) ?
   2076 		    sizeof (sin_t) : sizeof (sin6_t);
   2077 
   2078 		/*
   2079 		 * Try to re-use mp, if possible.  Otherwise, allocate
   2080 		 * an mblk and return it as ok_mp.  In any case, mp
   2081 		 * is no longer usable upon return.
   2082 		 */
   2083 		if ((ok_mp = mi_tpi_ok_ack_alloc_extra(mp, extra)) == NULL) {
   2084 			CONN_DEC_REF(acceptor->tcp_connp);
   2085 			CONN_DEC_REF(eager->tcp_connp);
   2086 			freemsg(opt_mp);
   2087 			/* Original mp has been freed by now, so use mp1 */
   2088 			tcp_err_ack(listener, mp1, TSYSERR, ENOMEM);
   2089 			return;
   2090 		}
   2091 
   2092 		mp = NULL;	/* We should never use mp after this point */
   2093 
   2094 		switch (extra) {
   2095 		case sizeof (sin_t): {
   2096 				sin_t *sin = (sin_t *)ok_mp->b_wptr;
   2097 
   2098 				ok_mp->b_wptr += extra;
   2099 				sin->sin_family = AF_INET;
   2100 				sin->sin_port = eager->tcp_lport;
   2101 				sin->sin_addr.s_addr =
   2102 				    eager->tcp_ipha->ipha_src;
   2103 				break;
   2104 			}
   2105 		case sizeof (sin6_t): {
   2106 				sin6_t *sin6 = (sin6_t *)ok_mp->b_wptr;
   2107 
   2108 				ok_mp->b_wptr += extra;
   2109 				sin6->sin6_family = AF_INET6;
   2110 				sin6->sin6_port = eager->tcp_lport;
   2111 				if (eager->tcp_ipversion == IPV4_VERSION) {
   2112 					sin6->sin6_flowinfo = 0;
   2113 					IN6_IPADDR_TO_V4MAPPED(
   2114 					    eager->tcp_ipha->ipha_src,
   2115 					    &sin6->sin6_addr);
   2116 				} else {
   2117 					ASSERT(eager->tcp_ip6h != NULL);
   2118 					sin6->sin6_flowinfo =
   2119 					    eager->tcp_ip6h->ip6_vcf &
   2120 					    ~IPV6_VERS_AND_FLOW_MASK;
   2121 					sin6->sin6_addr =
   2122 					    eager->tcp_ip6h->ip6_src;
   2123 				}
   2124 				sin6->sin6_scope_id = 0;
   2125 				sin6->__sin6_src_id = 0;
   2126 				break;
   2127 			}
   2128 		default:
   2129 			break;
   2130 		}
   2131 		ASSERT(ok_mp->b_wptr <= ok_mp->b_datap->db_lim);
   2132 	}
   2133 
   2134 	/*
   2135 	 * If there are no options we know that the T_CONN_RES will
   2136 	 * succeed. However, we can't send the T_OK_ACK upstream until
   2137 	 * the tcp_accept_swap is done since it would be dangerous to
   2138 	 * let the application start using the new fd prior to the swap.
   2139 	 */
   2140 	error = tcp_accept_swap(listener, acceptor, eager);
   2141 	if (error != 0) {
   2142 		CONN_DEC_REF(acceptor->tcp_connp);
   2143 		CONN_DEC_REF(eager->tcp_connp);
   2144 		freemsg(ok_mp);
   2145 		/* Original mp has been freed by now, so use mp1 */
   2146 		tcp_err_ack(listener, mp1, TSYSERR, error);
   2147 		return;
   2148 	}
   2149 
   2150 	/*
   2151 	 * tcp_accept_swap unlinks eager from listener but does not drop
   2152 	 * the eager's reference on the listener.
   2153 	 */
   2154 	ASSERT(eager->tcp_listener == NULL);
   2155 	ASSERT(listener->tcp_connp->conn_ref >= 5);
   2156 
   2157 	/*
   2158 	 * The eager is now associated with its own queue. Insert in
   2159 	 * the hash so that the connection can be reused for a future
   2160 	 * T_CONN_RES.
   2161 	 */
   2162 	tcp_acceptor_hash_insert(acceptor_id, eager);
   2163 
   2164 	/*
   2165 	 * We now do the processing of options with T_CONN_RES.
   2166 	 * We delay till now since we wanted to have queue to pass to
   2167 	 * option processing routines that points back to the right
   2168 	 * instance structure which does not happen until after
   2169 	 * tcp_accept_swap().
   2170 	 *
   2171 	 * Note:
   2172 	 * The sanity of the logic here assumes that whatever options
   2173 	 * are appropriate to inherit from listener=>eager are done
   2174 	 * before this point, and whatever were to be overridden (or not)
   2175 	 * in transfer logic from eager=>acceptor in tcp_accept_swap().
   2176 	 * [ Warning: acceptor endpoint can have T_OPTMGMT_REQ done to it
   2177 	 *   before its ACCEPTOR_id comes down in T_CONN_RES ]
   2178 	 * This may not be true at this point in time but can be fixed
   2179 	 * independently. This option processing code starts with
   2180 	 * the instantiated acceptor instance and the final queue at
   2181 	 * this point.
   2182 	 */
   2183 
   2184 	if (tcr->OPT_length != 0) {
   2185 		/* Options to process */
   2186 		int t_error = 0;
   2187 		int sys_error = 0;
   2188 		int do_disconnect = 0;
   2189 
   2190 		if (tcp_conprim_opt_process(eager, mp1,
   2191 		    &do_disconnect, &t_error, &sys_error) < 0) {
   2192 			eager->tcp_accept_error = 1;
   2193 			if (do_disconnect) {
   2194 				/*
   2195 				 * An option failed which does not allow
   2196 				 * connection to be accepted.
   2197 				 *
   2198 				 * We allow T_CONN_RES to succeed and
   2199 				 * put a T_DISCON_IND on the eager queue.
   2200 				 */
   2201 				ASSERT(t_error == 0 && sys_error == 0);
   2202 				eager->tcp_send_discon_ind = 1;
   2203 			} else {
   2204 				ASSERT(t_error != 0);
   2205 				freemsg(ok_mp);
   2206 				/*
   2207 				 * Original mp was either freed or set
   2208 				 * to ok_mp above, so use mp1 instead.
   2209 				 */
   2210 				tcp_err_ack(listener, mp1, t_error, sys_error);
   2211 				goto finish;
   2212 			}
   2213 		}
   2214 		/*
   2215 		 * Most likely success in setting options (except if
   2216 		 * eager->tcp_send_discon_ind set).
   2217 		 * mp1 option buffer represented by OPT_length/offset
   2218 		 * potentially modified and contains results of setting
   2219 		 * options at this point
   2220 		 */
   2221 	}
   2222 
   2223 	/* We no longer need mp1, since all options processing has passed */
   2224 	freemsg(mp1);
   2225 
   2226 	putnext(listener->tcp_rq, ok_mp);
   2227 
   2228 	mutex_enter(&listener->tcp_eager_lock);
   2229 	if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) {
   2230 		tcp_t	*tail;
   2231 		mblk_t	*conn_ind;
   2232 
   2233 		/*
   2234 		 * This path should not be executed if listener and
   2235 		 * acceptor streams are the same.
   2236 		 */
   2237 		ASSERT(listener != acceptor);
   2238 
   2239 		tcp = listener->tcp_eager_prev_q0;
   2240 		/*
   2241 		 * listener->tcp_eager_prev_q0 points to the TAIL of the
   2242 		 * deferred T_conn_ind queue. We need to get to the head of
   2243 		 * the queue in order to send up T_conn_ind the same order as
   2244 		 * how the 3WHS is completed.
   2245 		 */
   2246 		while (tcp != listener) {
   2247 			if (!tcp->tcp_eager_prev_q0->tcp_conn_def_q0)
   2248 				break;
   2249 			else
   2250 				tcp = tcp->tcp_eager_prev_q0;
   2251 		}
   2252 		ASSERT(tcp != listener);
   2253 		conn_ind = tcp->tcp_conn.tcp_eager_conn_ind;
   2254 		ASSERT(conn_ind != NULL);
   2255 		tcp->tcp_conn.tcp_eager_conn_ind = NULL;
   2256 
   2257 		/* Move from q0 to q */
   2258 		ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
   2259 		listener->tcp_conn_req_cnt_q0--;
   2260 		listener->tcp_conn_req_cnt_q++;
   2261 		tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
   2262 		    tcp->tcp_eager_prev_q0;
   2263 		tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
   2264 		    tcp->tcp_eager_next_q0;
   2265 		tcp->tcp_eager_prev_q0 = NULL;
   2266 		tcp->tcp_eager_next_q0 = NULL;
   2267 		tcp->tcp_conn_def_q0 = B_FALSE;
   2268 
   2269 		/* Make sure the tcp isn't in the list of droppables */
   2270 		ASSERT(tcp->tcp_eager_next_drop_q0 == NULL &&
   2271 		    tcp->tcp_eager_prev_drop_q0 == NULL);
   2272 
   2273 		/*
   2274 		 * Insert at end of the queue because sockfs sends
   2275 		 * down T_CONN_RES in chronological order. Leaving
   2276 		 * the older conn indications at front of the queue
   2277 		 * helps reducing search time.
   2278 		 */
   2279 		tail = listener->tcp_eager_last_q;
   2280 		if (tail != NULL)
   2281 			tail->tcp_eager_next_q = tcp;
   2282 		else
   2283 			listener->tcp_eager_next_q = tcp;
   2284 		listener->tcp_eager_last_q = tcp;
   2285 		tcp->tcp_eager_next_q = NULL;
   2286 		mutex_exit(&listener->tcp_eager_lock);
   2287 		putnext(tcp->tcp_rq, conn_ind);
   2288 	} else {
   2289 		mutex_exit(&listener->tcp_eager_lock);
   2290 	}
   2291 
   2292 	/*
   2293 	 * Done with the acceptor - free it
   2294 	 *
   2295 	 * Note: from this point on, no access to listener should be made
   2296 	 * as listener can be equal to acceptor.
   2297 	 */
   2298 finish:
   2299 	ASSERT(acceptor->tcp_detached);
   2300 	ASSERT(tcps->tcps_g_q != NULL);
   2301 	ASSERT(!IPCL_IS_NONSTR(acceptor->tcp_connp));
   2302 	acceptor->tcp_rq = tcps->tcps_g_q;
   2303 	acceptor->tcp_wq = WR(tcps->tcps_g_q);
   2304 	(void) tcp_clean_death(acceptor, 0, 2);
   2305 	CONN_DEC_REF(acceptor->tcp_connp);
   2306 
   2307 	/*
   2308 	 * In case we already received a FIN we have to make tcp_rput send
   2309 	 * the ordrel_ind. This will also send up a window update if the window
   2310 	 * has opened up.
   2311 	 *
   2312 	 * In the normal case of a successful connection acceptance
   2313 	 * we give the O_T_BIND_REQ to the read side put procedure as an
   2314 	 * indication that this was just accepted. This tells tcp_rput to
   2315 	 * pass up any data queued in tcp_rcv_list.
   2316 	 *
   2317 	 * In the fringe case where options sent with T_CONN_RES failed and
   2318 	 * we required, we would be indicating a T_DISCON_IND to blow
   2319 	 * away this connection.
   2320 	 */
   2321 
   2322 	/*
   2323 	 * XXX: we currently have a problem if XTI application closes the
   2324 	 * acceptor stream in between. This problem exists in on10-gate also
   2325 	 * and is well known but nothing can be done short of major rewrite
   2326 	 * to fix it. Now it is possible to take care of it by assigning TLI/XTI
   2327 	 * eager same squeue as listener (we can distinguish non socket
   2328 	 * listeners at the time of handling a SYN in tcp_conn_request)
   2329 	 * and do most of the work that tcp_accept_finish does here itself
   2330 	 * and then get behind the acceptor squeue to access the acceptor
   2331 	 * queue.
   2332 	 */
   2333 	/*
   2334 	 * We already have a ref on tcp so no need to do one before squeue_enter
   2335 	 */
   2336 	SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, opt_mp, tcp_accept_finish,
   2337 	    eager->tcp_connp, SQ_FILL, SQTAG_TCP_ACCEPT_FINISH);
   2338 }
   2339 
   2340 /*
   2341  * Swap information between the eager and acceptor for a TLI/XTI client.
   2342  * For sockfs the accept is done on the acceptor stream and control goes
   2343  * through tcp_wput_accept(); tcp_accept()/tcp_accept_swap() are not
   2344  * called. In either case, both the eager and listener are in their own
   2345  * perimeter (squeue) and the code has to deal with potential race.
   2346  *
   2347  * See the block comment on top of tcp_accept() and tcp_wput_accept().
         *
         * Arguments:
         *	listener - endpoint whose eager lists the eager is unlinked from
         *		   (done under listener->tcp_eager_lock).
         *	acceptor - endpoint giving up its queues, acceptor id, device/minor
         *		   arena, cred and zone settings; it is marked detached and
         *		   the reference on its conn_t is dropped before returning.
         *	eager	 - detached connection that takes over the acceptor's
         *		   queues and identity and becomes non-detached.
         *
         * Returns 0 on success.  Returns ENOMEM if, on a labeled system, the
         * new effective cred cannot be allocated (KM_NOSLEEP); that return
         * happens before listener, acceptor or eager have been modified.
   2348  */
   2349 static int
   2350 tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, tcp_t *eager)
   2351 {
   2352 	conn_t	*econnp, *aconnp;
   2353 	cred_t	*effective_cred = NULL;
   2354 
   2355 	ASSERT(eager->tcp_rq == listener->tcp_rq);
   2356 	ASSERT(eager->tcp_detached && !acceptor->tcp_detached);
   2357 	ASSERT(!eager->tcp_hard_bound);
   2358 	ASSERT(!TCP_IS_SOCKET(acceptor));
   2359 	ASSERT(!TCP_IS_SOCKET(eager));
   2360 	ASSERT(!TCP_IS_SOCKET(listener));
   2361 
   2362 	econnp = eager->tcp_connp;
   2363 	aconnp = acceptor->tcp_connp;
   2364 
   2365 	/*
   2366 	 * Trusted Extensions may need to use a security label that is
   2367 	 * different from the acceptor's label on MLP and MAC-Exempt
   2368 	 * sockets. If this is the case, the required security label
   2369 	 * already exists in econnp->conn_effective_cred. Use this label
   2370 	 * to generate a new effective cred for the acceptor.
   2371 	 *
   2372 	 * We allow for potential application level retry attempts by
   2373 	 * checking for transient errors before modifying eager.
   2374 	 */
   2375 	if (is_system_labeled() &&
   2376 	    aconnp->conn_cred != NULL && econnp->conn_effective_cred != NULL) {
   2377 		effective_cred = copycred_from_tslabel(aconnp->conn_cred,
   2378 		    crgetlabel(econnp->conn_effective_cred), KM_NOSLEEP);
   2379 		if (effective_cred == NULL)
   2380 			return (ENOMEM);
   2381 	}
   2382 
        	/* No failure returns past this point; start modifying state. */
   2383 	acceptor->tcp_detached = B_TRUE;
   2384 	/*
   2385 	 * To permit stream re-use by TLI/XTI, the eager needs a copy of
   2386 	 * the acceptor id.
   2387 	 */
   2388 	eager->tcp_acceptor_id = acceptor->tcp_acceptor_id;
   2389 
   2390 	/* remove eager from listen list... */
   2391 	mutex_enter(&listener->tcp_eager_lock);
   2392 	tcp_eager_unlink(eager);
   2393 	ASSERT(eager->tcp_eager_next_q == NULL &&
   2394 	    eager->tcp_eager_last_q == NULL);
   2395 	ASSERT(eager->tcp_eager_next_q0 == NULL &&
   2396 	    eager->tcp_eager_prev_q0 == NULL);
   2397 	mutex_exit(&listener->tcp_eager_lock);
        	/* The eager takes over the acceptor's read and write queues. */
   2398 	eager->tcp_rq = acceptor->tcp_rq;
   2399 	eager->tcp_wq = acceptor->tcp_wq;
   2400 
        	/* Those queues now refer back to the eager's conn_t. */
   2401 	eager->tcp_rq->q_ptr = econnp;
   2402 	eager->tcp_wq->q_ptr = econnp;
   2403 
   2404 	/*
   2405 	 * In the TLI/XTI loopback case, we are inside the listener's squeue,
   2406 	 * which might be a different squeue from our peer TCP instance.
   2407 	 * For TCP Fusion, the peer expects that whenever tcp_detached is
   2408 	 * clear, our TCP queues point to the acceptor's queues.  Thus, use
   2409 	 * membar_producer() to ensure that the assignments of tcp_rq/tcp_wq
   2410 	 * above reach global visibility prior to the clearing of tcp_detached.
   2411 	 */
   2412 	membar_producer();
   2413 	eager->tcp_detached = B_FALSE;
   2414 
   2415 	ASSERT(eager->tcp_ack_tid == 0);
   2416 
   2417 	econnp->conn_dev = aconnp->conn_dev;
   2418 	econnp->conn_minor_arena = aconnp->conn_minor_arena;
   2419 
   2420 	ASSERT(econnp->conn_minor_arena != NULL);
        	/*
        	 * Release the eager's previous cred and move the acceptor's
        	 * conn_cred pointer over to the eager (both tcp_cred and
        	 * conn_cred).  aconnp->conn_cred is cleared below, so afterwards
        	 * only the eager refers to that cred through these fields.  The
        	 * eager's old effective cred is replaced by the one built above,
        	 * which is NULL when no new cred was needed.
        	 */
   2421 	if (eager->tcp_cred != NULL)
   2422 		crfree(eager->tcp_cred);
   2423 	eager->tcp_cred = econnp->conn_cred = aconnp->conn_cred;
   2424 	if (econnp->conn_effective_cred != NULL)
   2425 		crfree(econnp->conn_effective_cred);
   2426 	econnp->conn_effective_cred = effective_cred;
   2427 	aconnp->conn_cred = NULL;
   2428 	ASSERT(aconnp->conn_effective_cred == NULL);
   2429 
   2430 	ASSERT(econnp->conn_netstack == aconnp->conn_netstack);
   2431 	ASSERT(eager->tcp_tcps == acceptor->tcp_tcps);
   2432 
        	/* The eager inherits the acceptor's zone identity. */
   2433 	econnp->conn_zoneid = aconnp->conn_zoneid;
   2434 	econnp->conn_allzones = aconnp->conn_allzones;
   2435 
   2436 	aconnp->conn_mac_exempt = B_FALSE;
   2437 
   2438 	/* Do the IPC initialization */
   2439 	CONN_INC_REF(econnp);
   2440 
   2441 	econnp->conn_multicast_loop = aconnp->conn_multicast_loop;
   2442 	econnp->conn_af_isv6 = aconnp->conn_af_isv6;
   2443 	econnp->conn_pkt_isv6 = aconnp->conn_pkt_isv6;
   2444 
   2445 	/* Done with old IPC. Drop its ref on its connp */
   2446 	CONN_DEC_REF(aconnp);
   2447 	return (0);
   2448 }
   2449 
   2450 
   2451 /*
   2452  * Adapt to the information, such as rtt and rtt_sd, provided from the
   2453  * ire cached in conn_cache_ire. If no ire cached, do an ire lookup.
   2454  *
   2455  * Checks for multicast and broadcast destination address.
   2456  * Returns zero on failure; non-zero if ok.
   2457  *
   2458  * Note that the MSS calculation here is based on the info given in
   2459  * the IRE.  We do not do any calculation based on TCP options.  They
   2460  * will be handled in tcp_rput_other() and tcp_rput_data() when TCP
   2461  * knows which options to use.
   2462  *
   2463  * Note on how TCP gets its parameters for a connection.
   2464  *
   2465  * When a tcp_t structure is allocated, it gets all the default parameters.
   2466  * In tcp_adapt_ire(), it gets those metric parameters, like rtt, rtt_sd,
   2467  * spipe, rpipe, ... from the route metrics.  Route metric overrides the
   2468  * default.
   2469  *
   2470  * An incoming SYN with a multicast or broadcast destination address, is dropped
   2471  * in 1 of 2 places.
   2472  *
   2473  * 1. If the packet was received over the wire it is dropped in
   2474  * ip_rput_process_broadcast()
   2475  *
   2476  * 2. If the packet was received through internal IP loopback, i.e. the packet
   2477  * was generated and received on the same machine, it is dropped in
   2478  * ip_wput_local()
   2479  *
   2480  * An incoming SYN with a multicast or broadcast source address is always
   2481  * dropped in tcp_adapt_ire. The same logic in tcp_adapt_ire also serves to
   2482  * reject an attempt to connect to a broadcast or multicast (destination)
   2483  * address.
         *
         * Arguments:
         *	tcp	- connection being set up; its conn_ire_cache must be NULL
         *		  on entry (asserted).
         *	ire_mp	- NULL for an incoming connection.  For an outgoing
         *		  connection, an mblk whose b_rptr is read as an ire_t; its
         *		  ire_uinfo is preferred over that of a looked-up ire.
         *
         * Side effects on success: sets the tcp_t's RTO/rtt, ssthresh, send and
         * receive buffer sizes and option flags from the route metrics (when
         * available), tcp_if_mtu, tcp_mss, tcp_localnet, tcp_loopback and the
         * ISS, clears CONN_INCIPIENT, and may store the ire in conn_ire_cache.
   2484  */
   2485 static int
   2486 tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp)
   2487 {
   2488 	ire_t		*ire;
   2489 	ire_t		*sire = NULL;
   2490 	iulp_t		*ire_uinfo = NULL;
   2491 	uint32_t	mss_max;
   2492 	uint32_t	mss;
   2493 	boolean_t	tcp_detached = TCP_IS_DETACHED(tcp);
   2494 	conn_t		*connp = tcp->tcp_connp;
   2495 	boolean_t	ire_cacheable = B_FALSE;
   2496 	zoneid_t	zoneid = connp->conn_zoneid;
   2497 	int		match_flags = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT |
   2498 	    MATCH_IRE_SECATTR;
   2499 	ts_label_t	*tsl = crgetlabel(CONN_CRED(connp));
   2500 	ill_t		*ill = NULL;
   2501 	boolean_t	incoming = (ire_mp == NULL);
   2502 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   2503 	ip_stack_t	*ipst = tcps->tcps_netstack->netstack_ip;
   2504 
   2505 	ASSERT(connp->conn_ire_cache == NULL);
   2506 
   2507 	if (tcp->tcp_ipversion == IPV4_VERSION) {
   2508 
        		/* Refuse an IPv4 multicast (class D) remote address. */
   2509 		if (CLASSD(tcp->tcp_connp->conn_rem)) {
   2510 			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards);
   2511 			return (0);
   2512 		}
   2513 		/*
   2514 		 * If IP_NEXTHOP is set, then look for an IRE_CACHE
   2515 		 * for the destination with the nexthop as gateway.
   2516 		 * ire_ctable_lookup() is used because this particular
   2517 		 * ire, if it exists, will be marked private.
   2518 		 * If that is not available, use the interface ire
   2519 		 * for the nexthop.
   2520 		 *
   2521 		 * TSol: tcp_update_label will detect label mismatches based
   2522 		 * only on the destination's label, but that would not
   2523 		 * detect label mismatches based on the security attributes
   2524 		 * of routes or next hop gateway. Hence we need to pass the
   2525 		 * label to ire_ftable_lookup below in order to locate the
   2526 		 * right prefix (and/or) ire cache. Similarly we also need
   2527 		 * pass the label to the ire_cache_lookup below to locate
   2528 		 * the right ire that also matches on the label.
   2529 		 */
   2530 		if (tcp->tcp_connp->conn_nexthop_set) {
   2531 			ire = ire_ctable_lookup(tcp->tcp_connp->conn_rem,
   2532 			    tcp->tcp_connp->conn_nexthop_v4, 0, NULL, zoneid,
   2533 			    tsl, MATCH_IRE_MARK_PRIVATE_ADDR | MATCH_IRE_GW,
   2534 			    ipst);
   2535 			if (ire == NULL) {
        				/*
        				 * Fall back to the nexthop's interface ire;
        				 * ire_uinfo stays NULL in this case.
        				 */
   2536 				ire = ire_ftable_lookup(
   2537 				    tcp->tcp_connp->conn_nexthop_v4,
   2538 				    0, 0, IRE_INTERFACE, NULL, NULL, zoneid, 0,
   2539 				    tsl, match_flags, ipst);
   2540 				if (ire == NULL)
   2541 					return (0);
   2542 			} else {
   2543 				ire_uinfo = &ire->ire_uinfo;
   2544 			}
   2545 		} else {
   2546 			ire = ire_cache_lookup(tcp->tcp_connp->conn_rem,
   2547 			    zoneid, tsl, ipst);
   2548 			if (ire != NULL) {
        				/*
        				 * Only an ire found by this cache lookup is
        				 * a candidate for conn_ire_cache below.
        				 */
   2549 				ire_cacheable = B_TRUE;
   2550 				ire_uinfo = (ire_mp != NULL) ?
   2551 				    &((ire_t *)ire_mp->b_rptr)->ire_uinfo:
   2552 				    &ire->ire_uinfo;
   2553 
   2554 			} else {
   2555 				if (ire_mp == NULL) {
        					/*
        					 * Incoming: forwarding table lookup;
        					 * prefer the metrics of sire when
        					 * one is returned.
        					 */
   2556 					ire = ire_ftable_lookup(
   2557 					    tcp->tcp_connp->conn_rem,
   2558 					    0, 0, 0, NULL, &sire, zoneid, 0,
   2559 					    tsl, (MATCH_IRE_RECURSIVE |
   2560 					    MATCH_IRE_DEFAULT), ipst);
   2561 					if (ire == NULL)
   2562 						return (0);
   2563 					ire_uinfo = (sire != NULL) ?
   2564 					    &sire->ire_uinfo :
   2565 					    &ire->ire_uinfo;
   2566 				} else {
        					/* Outgoing: use the ire in ire_mp. */
   2567 					ire = (ire_t *)ire_mp->b_rptr;
   2568 					ire_uinfo =
   2569 					    &((ire_t *)
   2570 					    ire_mp->b_rptr)->ire_uinfo;
   2571 				}
   2572 			}
   2573 		}
   2574 		ASSERT(ire != NULL);
   2575 
   2576 		if ((ire->ire_src_addr == INADDR_ANY) ||
   2577 		    (ire->ire_type & IRE_BROADCAST)) {
   2578 			/*
   2579 			 * ire->ire_mp is non null when ire_mp passed in is used
   2580 			 * ire->ire_mp is set in ip_bind_insert_ire[_v6]().
   2581 			 */
   2582 			if (ire->ire_mp == NULL)
   2583 				ire_refrele(ire);
   2584 			if (sire != NULL)
   2585 				ire_refrele(sire);
   2586 			return (0);
   2587 		}
   2588 
   2589 		if (tcp->tcp_ipha->ipha_src == INADDR_ANY) {
   2590 			ipaddr_t src_addr;
   2591 
   2592 			/*
   2593 			 * ip_bind_connected() has stored the correct source
   2594 			 * address in conn_src.
   2595 			 */
   2596 			src_addr = tcp->tcp_connp->conn_src;
   2597 			tcp->tcp_ipha->ipha_src = src_addr;
   2598 			/*
   2599 			 * Copy of the src addr. in tcp_t is needed
   2600 			 * for the lookup funcs.
   2601 			 */
   2602 			IN6_IPADDR_TO_V4MAPPED(src_addr, &tcp->tcp_ip_src_v6);
   2603 		}
   2604 		/*
   2605 		 * Set the fragment bit so that IP will tell us if the MTU
   2606 		 * should change. IP tells us the latest setting of
   2607 		 * ip_path_mtu_discovery through ire_frag_flag.
   2608 		 */
   2609 		if (ipst->ips_ip_path_mtu_discovery) {
   2610 			tcp->tcp_ipha->ipha_fragment_offset_and_flags =
   2611 			    htons(IPH_DF);
   2612 		}
   2613 		/*
   2614 		 * If ire_uinfo is NULL, this is the IRE_INTERFACE case
   2615 		 * for IP_NEXTHOP. No cache ire has been found for the
   2616 		 * destination and we are working with the nexthop's
   2617 		 * interface ire. Since we need to forward all packets
   2618 		 * to the nexthop first, we "blindly" set tcp_localnet
   2619 		 * to false, even though the destination may also be
   2620 		 * onlink.
   2621 		 */
   2622 		if (ire_uinfo == NULL)
   2623 			tcp->tcp_localnet = 0;
   2624 		else
   2625 			tcp->tcp_localnet = (ire->ire_gateway_addr == 0);
   2626 	} else {
   2627 		/*
   2628 		 * For incoming connection ire_mp = NULL
   2629 		 * For outgoing connection ire_mp != NULL
   2630 		 * Technically we should check conn_incoming_ill
   2631 		 * when ire_mp is NULL and conn_outgoing_ill when
   2632 		 * ire_mp is non-NULL. But this is performance
   2633 		 * critical path and for IPV*_BOUND_IF, outgoing
   2634 		 * and incoming ill are always set to the same value.
   2635 		 */
   2636 		ill_t	*dst_ill = NULL;
   2637 		ipif_t  *dst_ipif = NULL;
   2638 
   2639 		ASSERT(connp->conn_outgoing_ill == connp->conn_incoming_ill);
   2640 
   2641 		if (connp->conn_outgoing_ill != NULL) {
   2642 			/* Outgoing or incoming path */
   2643 			int   err;
   2644 
   2645 			dst_ill = conn_get_held_ill(connp,
   2646 			    &connp->conn_outgoing_ill, &err);
   2647 			if (err == ILL_LOOKUP_FAILED || dst_ill == NULL) {
   2648 				ip1dbg(("tcp_adapt_ire: ill_lookup failed\n"));
   2649 				return (0);
   2650 			}
        			/* Restrict the ire lookups below to this ill. */
   2651 			match_flags |= MATCH_IRE_ILL;
   2652 			dst_ipif = dst_ill->ill_ipif;
   2653 		}
   2654 		ire = ire_ctable_lookup_v6(&tcp->tcp_connp->conn_remv6,
   2655 		    0, 0, dst_ipif, zoneid, tsl, match_flags, ipst);
   2656 
   2657 		if (ire != NULL) {
   2658 			ire_cacheable = B_TRUE;
   2659 			ire_uinfo = (ire_mp != NULL) ?
   2660 			    &((ire_t *)ire_mp->b_rptr)->ire_uinfo:
   2661 			    &ire->ire_uinfo;
   2662 		} else {
   2663 			if (ire_mp == NULL) {
   2664 				ire = ire_ftable_lookup_v6(
   2665 				    &tcp->tcp_connp->conn_remv6,
   2666 				    0, 0, 0, dst_ipif, &sire, zoneid,
   2667 				    0, tsl, match_flags, ipst);
   2668 				if (ire == NULL) {
   2669 					if (dst_ill != NULL)
   2670 						ill_refrele(dst_ill);
   2671 					return (0);
   2672 				}
   2673 				ire_uinfo = (sire != NULL) ? &sire->ire_uinfo :
   2674 				    &ire->ire_uinfo;
   2675 			} else {
   2676 				ire = (ire_t *)ire_mp->b_rptr;
   2677 				ire_uinfo =
   2678 				    &((ire_t *)ire_mp->b_rptr)->ire_uinfo;
   2679 			}
   2680 		}
        		/* The held ill was only needed for the lookups above. */
   2681 		if (dst_ill != NULL)
   2682 			ill_refrele(dst_ill);
   2683 
   2684 		ASSERT(ire != NULL);
   2685 		ASSERT(ire_uinfo != NULL);
   2686 
   2687 		if (IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6) ||
   2688 		    IN6_IS_ADDR_MULTICAST(&ire->ire_addr_v6)) {
   2689 			/*
   2690 			 * ire->ire_mp is non null when ire_mp passed in is used
   2691 			 * ire->ire_mp is set in ip_bind_insert_ire[_v6]().
   2692 			 */
   2693 			if (ire->ire_mp == NULL)
   2694 				ire_refrele(ire);
   2695 			if (sire != NULL)
   2696 				ire_refrele(sire);
   2697 			return (0);
   2698 		}
   2699 
   2700 		if (IN6_IS_ADDR_UNSPECIFIED(&tcp->tcp_ip6h->ip6_src)) {
   2701 			in6_addr_t	src_addr;
   2702 
   2703 			/*
   2704 			 * ip_bind_connected_v6() has stored the correct source
   2705 			 * address per IPv6 addr. selection policy in
   2706 			 * conn_src_v6.
   2707 			 */
   2708 			src_addr = tcp->tcp_connp->conn_srcv6;
   2709 
   2710 			tcp->tcp_ip6h->ip6_src = src_addr;
   2711 			/*
   2712 			 * Copy of the src addr. in tcp_t is needed
   2713 			 * for the lookup funcs.
   2714 			 */
   2715 			tcp->tcp_ip_src_v6 = src_addr;
   2716 			ASSERT(IN6_ARE_ADDR_EQUAL(&tcp->tcp_ip6h->ip6_src,
   2717 			    &connp->conn_srcv6));
   2718 		}
   2719 		tcp->tcp_localnet =
   2720 		    IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6);
   2721 	}
   2722 
   2723 	/*
   2724 	 * This allows applications to fail quickly when connections are made
   2725 	 * to dead hosts. Hosts can be labeled dead by adding a reject route
   2726 	 * with both the RTF_REJECT and RTF_PRIVATE flags set.
   2727 	 */
   2728 	if ((ire->ire_flags & RTF_REJECT) &&
   2729 	    (ire->ire_flags & RTF_PRIVATE))
   2730 		goto error;
   2731 
   2732 	/*
   2733 	 * Make use of the cached rtt and rtt_sd values to calculate the
   2734 	 * initial RTO.  Note that they are already initialized in
   2735 	 * tcp_init_values().
   2736 	 * If ire_uinfo is NULL, i.e., we do not have a cache ire for
   2737 	 * IP_NEXTHOP, but instead are using the interface ire for the
   2738 	 * nexthop, then we do not use the ire_uinfo from that ire to
   2739 	 * do any initializations.
   2740 	 */
   2741 	if (ire_uinfo != NULL) {
   2742 		if (ire_uinfo->iulp_rtt != 0) {
   2743 			clock_t	rto;
   2744 
   2745 			tcp->tcp_rtt_sa = ire_uinfo->iulp_rtt;
   2746 			tcp->tcp_rtt_sd = ire_uinfo->iulp_rtt_sd;
   2747 			rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
   2748 			    tcps->tcps_rexmit_interval_extra +
   2749 			    (tcp->tcp_rtt_sa >> 5);
   2750 
        			/* Clamp the RTO to the configured [min, max] range. */
   2751 			if (rto > tcps->tcps_rexmit_interval_max) {
   2752 				tcp->tcp_rto = tcps->tcps_rexmit_interval_max;
   2753 			} else if (rto < tcps->tcps_rexmit_interval_min) {
   2754 				tcp->tcp_rto = tcps->tcps_rexmit_interval_min;
   2755 			} else {
   2756 				tcp->tcp_rto = rto;
   2757 			}
   2758 		}
   2759 		if (ire_uinfo->iulp_ssthresh != 0)
   2760 			tcp->tcp_cwnd_ssthresh = ire_uinfo->iulp_ssthresh;
   2761 		else
   2762 			tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN;
   2763 		if (ire_uinfo->iulp_spipe > 0) {
   2764 			tcp->tcp_xmit_hiwater = MIN(ire_uinfo->iulp_spipe,
   2765 			    tcps->tcps_max_buf);
   2766 			if (tcps->tcps_snd_lowat_fraction != 0)
   2767 				tcp->tcp_xmit_lowater = tcp->tcp_xmit_hiwater /
   2768 				    tcps->tcps_snd_lowat_fraction;
   2769 			(void) tcp_maxpsz_set(tcp, B_TRUE);
   2770 		}
   2771 		/*
   2772 		 * Note that up till now, acceptor always inherits receive
   2773 		 * window from the listener.  But if there is a metrics
   2774 		 * associated with a host, we should use that instead of
   2775 		 * inheriting it from listener. Thus we need to pass this
   2776 		 * info back to the caller.
   2777 		 */
   2778 		if (ire_uinfo->iulp_rpipe > 0) {
   2779 			tcp->tcp_rwnd = MIN(ire_uinfo->iulp_rpipe,
   2780 			    tcps->tcps_max_buf);
   2781 		}
   2782 
   2783 		if (ire_uinfo->iulp_rtomax > 0) {
   2784 			tcp->tcp_second_timer_threshold =
   2785 			    ire_uinfo->iulp_rtomax;
   2786 		}
   2787 
   2788 		/*
   2789 		 * Use the metric option settings, iulp_tstamp_ok and
   2790 		 * iulp_wscale_ok, only for active open. What this means
   2791 		 * is that if the other side uses timestamp or window
   2792 		 * scale option, TCP will also use those options. That
   2793 		 * is for passive open.  If the application sets a
   2794 		 * large window, window scale is enabled regardless of
   2795 		 * the value in iulp_wscale_ok.  This is the behavior
   2796 		 * since 2.6.  So we keep it.
   2797 		 * The only case left in passive open processing is the
   2798 		 * check for SACK.
   2799 		 * For ECN, it should probably be like SACK.  But the
   2800 		 * current value is binary, so we treat it like the other
   2801 		 * cases.  The metric only controls active open.  For passive
   2802 		 * open, the ndd param, tcp_ecn_permitted, controls the
   2803 		 * behavior.
   2804 		 */
   2805 		if (!tcp_detached) {
   2806 			/*
   2807 			 * The if check means that the following can only
   2808 			 * be turned on by the metrics only IRE, but not off.
   2809 			 */
   2810 			if (ire_uinfo->iulp_tstamp_ok)
   2811 				tcp->tcp_snd_ts_ok = B_TRUE;
   2812 			if (ire_uinfo->iulp_wscale_ok)
   2813 				tcp->tcp_snd_ws_ok = B_TRUE;
   2814 			if (ire_uinfo->iulp_sack == 2)
   2815 				tcp->tcp_snd_sack_ok = B_TRUE;
   2816 			if (ire_uinfo->iulp_ecn_ok)
   2817 				tcp->tcp_ecn_ok = B_TRUE;
   2818 		} else {
   2819 			/*
   2820 			 * Passive open.
   2821 			 *
   2822 			 * As above, the if check means that SACK can only be
   2823 			 * turned on by the metric only IRE.
   2824 			 */
   2825 			if (ire_uinfo->iulp_sack > 0) {
   2826 				tcp->tcp_snd_sack_ok = B_TRUE;
   2827 			}
   2828 		}
   2829 	}
   2830 
   2831 
   2832 	/*
   2833 	 * XXX: Note that currently, ire_max_frag can be as small as 68
   2834 	 * because of PMTUd.  So tcp_mss may go to negative if combined
   2835 	 * length of all those options exceeds 28 bytes.  But because
   2836 	 * of the tcp_mss_min check below, we may not have a problem if
   2837 	 * tcp_mss_min is of a reasonable value.  The default is 1 so
   2838 	 * the negative problem still exists.  And the check defeats PMTUd.
   2839 	 * In fact, if PMTUd finds that the MSS should be smaller than
   2840 	 * tcp_mss_min, TCP should turn off PMTUd and use the tcp_mss_min
   2841 	 * value.
   2842 	 *
   2843 	 * We do not deal with that now.  All those problems related to
   2844 	 * PMTUd will be fixed later.
   2845 	 */
   2846 	ASSERT(ire->ire_max_frag != 0);
   2847 	mss = tcp->tcp_if_mtu = ire->ire_max_frag;
   2848 	if (tcp->tcp_ipp_fields & IPPF_USE_MIN_MTU) {
   2849 		if (tcp->tcp_ipp_use_min_mtu == IPV6_USE_MIN_MTU_NEVER) {
   2850 			mss = MIN(mss, IPV6_MIN_MTU);
   2851 		}
   2852 	}
   2853 
   2854 	/* Sanity check for MSS value. */
   2855 	if (tcp->tcp_ipversion == IPV4_VERSION)
   2856 		mss_max = tcps->tcps_mss_max_ipv4;
   2857 	else
   2858 		mss_max = tcps->tcps_mss_max_ipv6;
   2859 
   2860 	if (tcp->tcp_ipversion == IPV6_VERSION &&
   2861 	    (ire->ire_frag_flag & IPH_FRAG_HDR)) {
   2862 		/*
   2863 		 * After receiving an ICMPv6 "packet too big" message with a
   2864 		 * MTU < 1280, and for multirouted IPv6 packets, the IP layer
   2865 		 * will insert a 8-byte fragment header in every packet; we
   2866 		 * reduce the MSS by that amount here.
   2867 		 */
   2868 		mss -= sizeof (ip6_frag_t);
   2869 	}
   2870 
        	/* Compute the IPsec overhead once; a non-zero value is reused. */
   2871 	if (tcp->tcp_ipsec_overhead == 0)
   2872 		tcp->tcp_ipsec_overhead = conn_ipsec_length(connp);
   2873 
        	/*
        	 * NOTE(review): mss is a uint32_t, so if the IPsec overhead (plus
        	 * the fragment header above) exceeds the starting value, this
        	 * subtraction wraps to a very large number.  The checks below
        	 * would then clamp it to mss_max rather than tcps_mss_min.
        	 * Confirm whether callers guarantee this cannot happen.
        	 */
   2874 	mss -= tcp->tcp_ipsec_overhead;
   2875 
   2876 	if (mss < tcps->tcps_mss_min)
   2877 		mss = tcps->tcps_mss_min;
   2878 	if (mss > mss_max)
   2879 		mss = mss_max;
   2880 
   2881 	/* Note that this is the maximum MSS, excluding all options. */
   2882 	tcp->tcp_mss = mss;
   2883 
   2884 	/*
   2885 	 * Initialize the ISS here now that we have the full connection ID.
   2886 	 * The RFC 1948 method of initial sequence number generation requires
   2887 	 * knowledge of the full connection ID before setting the ISS.
   2888 	 */
   2889 
   2890 	tcp_iss_init(tcp);
   2891 
   2892 	if (ire->ire_type & (IRE_LOOPBACK | IRE_LOCAL))
   2893 		tcp->tcp_loopback = B_TRUE;
   2894 
        	/* sire was only needed for its metrics (ire_uinfo) above. */
   2895 	if (sire != NULL)
   2896 		IRE_REFRELE(sire);
   2897 
   2898 	/*
   2899 	 * If we got an IRE_CACHE and an ILL, go through their properties;
   2900 	 * otherwise, this is deferred until later when we have an IRE_CACHE.
         	 *
         	 * Note that when tcp_loopback is set the second half of the
         	 * condition is not evaluated, so ill is passed as NULL.
   2901 	 */
   2902 	if (tcp->tcp_loopback ||
   2903 	    (ire_cacheable && (ill = ire_to_ill(ire)) != NULL)) {
   2904 		/*
   2905 		 * For incoming, see if this tcp may be MDT-capable.  For
   2906 		 * outgoing, this process has been taken care of through
   2907 		 * tcp_rput_other.
   2908 		 */
   2909 		tcp_ire_ill_check(tcp, ire, ill, incoming);
   2910 		tcp->tcp_ire_ill_check_done = B_TRUE;
   2911 	}
   2912 
   2913 	mutex_enter(&connp->conn_lock);
   2914 	/*
   2915 	 * Make sure that conn is not marked incipient
   2916 	 * for incoming connections. A blind
   2917 	 * removal of incipient flag is cheaper than
   2918 	 * check and removal.
   2919 	 */
   2920 	connp->conn_state_flags &= ~CONN_INCIPIENT;
   2921 
   2922 	/*
   2923 	 * Must not cache forwarding table routes
   2924 	 * or recache an IRE after the conn_t has
   2925 	 * had conn_ire_cache cleared and is flagged
   2926 	 * unusable, (see the CONN_CACHE_IRE() macro).
   2927 	 */
   2928 	if (ire_cacheable && CONN_CACHE_IRE(connp)) {
   2929 		rw_enter(&ire->ire_bucket->irb_lock, RW_READER);
   2930 		if (!(ire->ire_marks & IRE_MARK_CONDEMNED)) {
        			/*
        			 * The ire is stored in conn_ire_cache and is not
        			 * released on this path; presumably the reference
        			 * from the lookup above now belongs to the conn_t.
        			 */
   2931 			connp->conn_ire_cache = ire;
   2932 			IRE_UNTRACE_REF(ire);
   2933 			rw_exit(&ire->ire_bucket->irb_lock);
   2934 			mutex_exit(&connp->conn_lock);
   2935 			return (1);
   2936 		}
   2937 		rw_exit(&ire->ire_bucket->irb_lock);
   2938 	}
   2939 	mutex_exit(&connp->conn_lock);
   2940 
        	/*
        	 * Not cached: release the ire unless it is the one supplied
        	 * through ire_mp (ire->ire_mp != NULL, see the comments above).
        	 */
   2941 	if (ire->ire_mp == NULL)
   2942 		ire_refrele(ire);
   2943 	return (1);
   2944 
        	/* Failure exit for the reject-route check: release ire and sire. */
   2945 error:
   2946 	if (ire->ire_mp == NULL)
   2947 		ire_refrele(ire);
   2948 	if (sire != NULL)
   2949 		ire_refrele(sire);
   2950 	return (0);
   2951 }
   2952 
   2953 static void
   2954 tcp_tpi_bind(tcp_t *tcp, mblk_t *mp)
   2955 {
   2956 	int	error;
   2957 	conn_t	*connp = tcp->tcp_connp;
   2958 	struct sockaddr	*sa;
   2959 	mblk_t  *mp1;
   2960 	struct T_bind_req *tbr;
   2961 	int	backlog;
   2962 	socklen_t	len;
   2963 	sin_t	*sin;
   2964 	sin6_t	*sin6;
   2965 	cred_t		*cr;
   2966 
   2967 	/*
   2968 	 * All Solaris components should pass a db_credp
   2969 	 * for this TPI message, hence we ASSERT.
   2970 	 * But in case there is some other M_PROTO that looks
   2971 	 * like a TPI message sent by some other kernel
   2972 	 * component, we check and return an error.
   2973 	 */
   2974 	cr = msg_getcred(mp, NULL);
   2975 	ASSERT(cr != NULL);
   2976 	if (cr == NULL) {
   2977 		tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
   2978 		return;
   2979 	}
   2980 
   2981 	ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
   2982 	if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) {
   2983 		if (tcp->tcp_debug) {
   2984 			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
   2985 			    "tcp_tpi_bind: bad req, len %u",
   2986 			    (uint_t)(mp->b_wptr - mp->b_rptr));
   2987 		}
   2988 		tcp_err_ack(tcp, mp, TPROTO, 0);
   2989 		return;
   2990 	}
   2991 	/* Make sure the largest address fits */
   2992 	mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t) + 1, 1);
   2993 	if (mp1 == NULL) {
   2994 		tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
   2995 		return;
   2996 	}
   2997 	mp = mp1;
   2998 	tbr = (struct T_bind_req *)mp->b_rptr;
   2999 
   3000 	backlog = tbr->CONIND_number;
   3001 	len = tbr->ADDR_length;
   3002 
   3003 	switch (len) {
   3004 	case 0:		/* request for a generic port */
   3005 		tbr->ADDR_offset = sizeof (struct T_bind_req);
   3006 		if (tcp->tcp_family == AF_INET) {
   3007 			tbr->ADDR_length = sizeof (sin_t);
   3008 			sin = (sin_t *)&tbr[1];
   3009 			*sin = sin_null;
   3010 			sin->sin_family = AF_INET;
   3011 			sa = (struct sockaddr *)sin;
   3012 			len = sizeof (sin_t);
   3013 			mp->b_wptr = (uchar_t *)&sin[1];
   3014 		} else {
   3015 			ASSERT(tcp->tcp_family == AF_INET6);
   3016 			tbr->ADDR_length = sizeof (sin6_t);
   3017 			sin6 = (sin6_t *)&tbr[1];
   3018 			*sin6 = sin6_null;
   3019 			sin6->sin6_family = AF_INET6;
   3020 			sa = (struct sockaddr *)sin6;
   3021 			len = sizeof (sin6_t);
   3022 			mp->b_wptr = (uchar_t *)&sin6[1];
   3023 		}
   3024 		break;
   3025 
   3026 	case sizeof (sin_t):    /* Complete IPv4 address */
   3027 		sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset,
   3028 		    sizeof (sin_t));
   3029 		break;
   3030 
   3031 	case sizeof (sin6_t): /* Complete IPv6 address */
   3032 		sa = (struct sockaddr *)mi_offset_param(mp,
   3033 		    tbr->ADDR_offset, sizeof (sin6_t));
   3034 		break;
   3035 
   3036 	default:
   3037 		if (tcp->tcp_debug) {
   3038 			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
   3039 			    "tcp_tpi_bind: bad address length, %d",
   3040 			    tbr->ADDR_length);
   3041 		}
   3042 		tcp_err_ack(tcp, mp, TBADADDR, 0);
   3043 		return;
   3044 	}
   3045 
   3046 	if (backlog > 0) {
   3047 		error = tcp_do_listen(connp, sa, len, backlog, DB_CRED(mp),
   3048 		    tbr->PRIM_type != O_T_BIND_REQ);
   3049 	} else {
   3050 		error = tcp_do_bind(connp, sa, len, DB_CRED(mp),
   3051 		    tbr->PRIM_type != O_T_BIND_REQ);
   3052 	}
   3053 done:
   3054 	if (error > 0) {
   3055 		tcp_err_ack(tcp, mp, TSYSERR, error);
   3056 	} else if (error < 0) {
   3057 		tcp_err_ack(tcp, mp, -error, 0);
   3058 	} else {
   3059 		/*
   3060 		 * Update port information as sockfs/tpi needs it for checking
   3061 		 */
   3062 		if (tcp->tcp_family == AF_INET) {
   3063 			sin = (sin_t *)sa;
   3064 			sin->sin_port = tcp->tcp_lport;
   3065 		} else {
   3066 			sin6 = (sin6_t *)sa;
   3067 			sin6->sin6_port = tcp->tcp_lport;
   3068 		}
   3069 		mp->b_datap->db_type = M_PCPROTO;
   3070 		tbr->PRIM_type = T_BIND_ACK;
   3071 		putnext(tcp->tcp_rq, mp);
   3072 	}
   3073 }
   3074 
/*
 * If the "bind_to_req_port_only" parameter is set, if the requested port
 * number is available, return it, If not return 0
 *
 * If "bind_to_req_port_only" parameter is not set and
 * If the requested port number is available, return it.  If not, return
 * the first anonymous port we happen across.  If no anonymous ports are
 * available, return 0. addr is the requested local address, if any.
 *
 * In either case, when succeeding update the tcp_t to record the port number
 * and insert it in the bind hash table.
 *
 * Note that TCP over IPv4 and IPv6 sockets can use the same port number
 * without setting SO_REUSEADDR. This is needed so that they
 * can be viewed as two independent transport protocols.
 *
 * Parameters:
 *	tcp			endpoint being bound
 *	port			first port to try; converted with htons()
 *				before it is compared or stored
 *	laddr			requested local address
 *	reuseaddr		non-zero selects the SO_REUSEADDR conflict
 *				rules for the binding endpoint
 *	quick_connect		when set, entries past TCPS_LISTEN whose
 *				remote port or address differ from tcp's
 *				are not treated as conflicts
 *	bind_to_req_port_only	try only "port"; never search further
 *	user_specified		"port" was supplied by the user; such a
 *				port is not fed into tcps_next_port_to_try
 *
 * Returns the port that was bound, or 0 if none could be obtained.
 */
static in_port_t
tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
    int reuseaddr, boolean_t quick_connect,
    boolean_t bind_to_req_port_only, boolean_t user_specified)
{
	/* number of times we have run around the loop */
	int count = 0;
	/* maximum number of times to run around the loop */
	int loopmax;
	conn_t *connp = tcp->tcp_connp;
	zoneid_t zoneid = connp->conn_zoneid;
	tcp_stack_t	*tcps = tcp->tcp_tcps;

	/*
	 * Lookup for free addresses is done in a loop and "loopmax"
	 * influences how long we spin in the loop
	 */
	if (bind_to_req_port_only) {
		/*
		 * If the requested port is busy, don't bother to look
		 * for a new one. Setting loop maximum count to 1 has
		 * that effect.
		 */
		loopmax = 1;
	} else {
		/*
		 * If the requested port is busy, look for a free one
		 * in the anonymous port range.
		 * Set loopmax appropriately so that one does not look
		 * forever in the case all of the anonymous ports are in use.
		 */
		if (tcp->tcp_anon_priv_bind) {
			/*
			 * loopmax =
			 * 	(IPPORT_RESERVED-1) - tcp_min_anonpriv_port + 1
			 */
			loopmax = IPPORT_RESERVED -
			    tcps->tcps_min_anonpriv_port;
		} else {
			loopmax = (tcps->tcps_largest_anon_port -
			    tcps->tcps_smallest_anon_port + 1);
		}
	}
	do {
		uint16_t	lport;
		tf_t		*tbf;
		tcp_t		*ltcp;
		conn_t		*lconnp;

		lport = htons(port);

		/*
		 * Ensure that the tcp_t is not currently in the bind hash.
		 * Hold the lock on the hash bucket to ensure that
		 * the duplicate check plus the insertion is an atomic
		 * operation.
		 *
		 * This function does an inline lookup on the bind hash list
		 * Make sure that we access only members of tcp_t
		 * and that we don't look at tcp_tcp, since we are not
		 * doing a CONN_INC_REF.
		 */
		tcp_bind_hash_remove(tcp);
		tbf = &tcps->tcps_bind_fanout[TCP_BIND_HASH(lport)];
		mutex_enter(&tbf->tf_lock);
		/* Find the first entry in the bucket that uses this port. */
		for (ltcp = tbf->tf_tcp; ltcp != NULL;
		    ltcp = ltcp->tcp_bind_hash) {
			if (lport == ltcp->tcp_lport)
				break;
		}

		/*
		 * Walk the entries chained for this port.  "continue" means
		 * the entry does not conflict; "break" leaves ltcp non-NULL,
		 * which marks the port as busy below.
		 */
		for (; ltcp != NULL; ltcp = ltcp->tcp_bind_hash_port) {
			boolean_t not_socket;
			boolean_t exclbind;

			lconnp = ltcp->tcp_connp;

			/*
			 * On a labeled system, we must treat bindings to ports
			 * on shared IP addresses by sockets with MAC exemption
			 * privilege as being in all zones, as there's
			 * otherwise no way to identify the right receiver.
			 */
			if (!(IPCL_ZONE_MATCH(ltcp->tcp_connp, zoneid) ||
			    IPCL_ZONE_MATCH(connp,
			    ltcp->tcp_connp->conn_zoneid)) &&
			    !lconnp->conn_mac_exempt &&
			    !connp->conn_mac_exempt)
				continue;

			/*
			 * If TCP_EXCLBIND is set for either the bound or
			 * binding endpoint, the semantics of bind
			 * is changed according to the following.
			 *
			 * spec = specified address (v4 or v6)
			 * unspec = unspecified address (v4 or v6)
			 * A = specified addresses are different for endpoints
			 *
			 * bound	bind to		allowed
			 * -------------------------------------
			 * unspec	unspec		no
			 * unspec	spec		no
			 * spec		unspec		no
			 * spec		spec		yes if A
			 *
			 * For labeled systems, SO_MAC_EXEMPT behaves the same
			 * as TCP_EXCLBIND, except that zoneid is ignored.
			 *
			 * Note:
			 *
			 * 1. Because of TLI semantics, an endpoint can go
			 * back from, say TCP_ESTABLISHED to TCPS_LISTEN or
			 * TCPS_BOUND, depending on whether it is originally
			 * a listener or not.  That is why we need to check
			 * for states greater than or equal to TCPS_BOUND
			 * here.
			 *
			 * 2. Ideally, we should only check for state equals
			 * to TCPS_LISTEN. And the following check should be
			 * added.
			 *
			 * if (ltcp->tcp_state == TCPS_LISTEN ||
			 *	!reuseaddr || !ltcp->tcp_reuseaddr) {
			 *		...
			 * }
			 *
			 * The semantics will be changed to this.  If the
			 * endpoint on the list is in state not equal to
			 * TCPS_LISTEN and both endpoints have SO_REUSEADDR
			 * set, let the bind succeed.
			 *
			 * Because of (1), we cannot do that for TLI
			 * endpoints.  But we can do that for socket endpoints.
			 * If in future, we can change this going back
			 * semantics, we can use the above check for TLI also.
			 */
			not_socket = !(TCP_IS_SOCKET(ltcp) &&
			    TCP_IS_SOCKET(tcp));
			exclbind = ltcp->tcp_exclbind || tcp->tcp_exclbind;

			if (lconnp->conn_mac_exempt || connp->conn_mac_exempt ||
			    (exclbind && (not_socket ||
			    ltcp->tcp_state <= TCPS_ESTABLISHED))) {
				if (V6_OR_V4_INADDR_ANY(
				    ltcp->tcp_bound_source_v6) ||
				    V6_OR_V4_INADDR_ANY(*laddr) ||
				    IN6_ARE_ADDR_EQUAL(laddr,
				    &ltcp->tcp_bound_source_v6)) {
					break;
				}
				continue;
			}

			/*
			 * Check ipversion to allow IPv4 and IPv6 sockets to
			 * have disjoint port number spaces, if *_EXCLBIND
			 * is not set and only if the application binds to a
			 * specific port. We use the same autoassigned port
			 * number space for IPv4 and IPv6 sockets.
			 */
			if (tcp->tcp_ipversion != ltcp->tcp_ipversion &&
			    bind_to_req_port_only)
				continue;

			/*
			 * Ideally, we should make sure that the source
			 * address, remote address, and remote port in the
			 * four tuple for this tcp-connection is unique.
			 * However, trying to find out the local source
			 * address would require too much code duplication
			 * with IP, since IP needs to have that code
			 * to support userland TCP implementations.
			 */
			if (quick_connect &&
			    (ltcp->tcp_state > TCPS_LISTEN) &&
			    ((tcp->tcp_fport != ltcp->tcp_fport) ||
			    !IN6_ARE_ADDR_EQUAL(&tcp->tcp_remote_v6,
			    &ltcp->tcp_remote_v6)))
				continue;

			if (!reuseaddr) {
				/*
				 * No socket option SO_REUSEADDR.
				 * If existing port is bound to
				 * a non-wildcard IP address
				 * and the requesting stream is
				 * bound to a distinct
				 * different IP addresses
				 * (non-wildcard, also), keep
				 * going.
				 */
				if (!V6_OR_V4_INADDR_ANY(*laddr) &&
				    !V6_OR_V4_INADDR_ANY(
				    ltcp->tcp_bound_source_v6) &&
				    !IN6_ARE_ADDR_EQUAL(laddr,
				    &ltcp->tcp_bound_source_v6))
					continue;
				if (ltcp->tcp_state >= TCPS_BOUND) {
					/*
					 * This port is being used and
					 * its state is >= TCPS_BOUND,
					 * so we can't bind to it.
					 */
					break;
				}
			} else {
				/*
				 * socket option SO_REUSEADDR is set on the
				 * binding tcp_t.
				 *
				 * If two streams are bound to
				 * same IP address or both addr
				 * and bound source are wildcards
				 * (INADDR_ANY), we want to stop
				 * searching.
				 * We have found a match of IP source
				 * address and source port, which is
				 * refused regardless of the
				 * SO_REUSEADDR setting, so we break.
				 */
				if (IN6_ARE_ADDR_EQUAL(laddr,
				    &ltcp->tcp_bound_source_v6) &&
				    (ltcp->tcp_state == TCPS_LISTEN ||
				    ltcp->tcp_state == TCPS_BOUND))
					break;
			}
		}
		if (ltcp != NULL) {
			/* The port number is busy */
			mutex_exit(&tbf->tf_lock);
		} else {
			/*
			 * This port is ours. Insert in fanout and mark as
			 * bound to prevent others from getting the port
			 * number.
			 */
			tcp->tcp_state = TCPS_BOUND;
			tcp->tcp_lport = htons(port);
			*(uint16_t *)tcp->tcp_tcph->th_lport = tcp->tcp_lport;

			ASSERT(&tcps->tcps_bind_fanout[TCP_BIND_HASH(
			    tcp->tcp_lport)] == tbf);
			tcp_bind_hash_insert(tbf, tcp, 1);

			mutex_exit(&tbf->tf_lock);

			/*
			 * We don't want tcp_next_port_to_try to "inherit"
			 * a port number supplied by the user in a bind.
			 */
			if (user_specified)
				return (port);

			/*
			 * This is the only place where tcp_next_port_to_try
			 * is updated. After the update, it may or may not
			 * be in the valid range.
			 */
			if (!tcp->tcp_anon_priv_bind)
				tcps->tcps_next_port_to_try = port + 1;
			return (port);
		}

		/* The candidate was busy: pick the next port to try. */
		if (tcp->tcp_anon_priv_bind) {
			port = tcp_get_next_priv_port(tcp);
		} else {
			if (count == 0 && user_specified) {
				/*
				 * We may have to return an anonymous port. So
				 * get one to start with.
				 */
				port =
				    tcp_update_next_port(
				    tcps->tcps_next_port_to_try,
				    tcp, B_TRUE);
				/*
				 * From here on the candidate is no longer
				 * the user's port, so a later success may
				 * update tcps_next_port_to_try.
				 */
				user_specified = B_FALSE;
			} else {
				port = tcp_update_next_port(port + 1, tcp,
				    B_FALSE);
			}
		}
		/* A candidate of 0 ends the search; 0 is returned below. */
		if (port == 0)
			break;

		/*
		 * Don't let this loop run forever in the case where
		 * all of the anonymous ports are in use.
		 */
	} while (++count < loopmax);
	return (0);
}
   3383 
   3384 /*
   3385  * tcp_clean_death / tcp_close_detached must not be called more than once
   3386  * on a tcp. Thus every function that potentially calls tcp_clean_death
   3387  * must check for the tcp state before calling tcp_clean_death.
   3388  * Eg. tcp_input, tcp_rput_data, tcp_eager_kill, tcp_clean_death_wrapper,
   3389  * tcp_timer_handler, all check for the tcp state.
   3390  */
   3391 /* ARGSUSED */
   3392 void
   3393 tcp_clean_death_wrapper(void *arg, mblk_t *mp, void *arg2)
   3394 {
   3395 	tcp_t	*tcp = ((conn_t *)arg)->conn_tcp;
   3396 
   3397 	freemsg(mp);
   3398 	if (tcp->tcp_state > TCPS_BOUND)
   3399 		(void) tcp_clean_death(((conn_t *)arg)->conn_tcp,
   3400 		    ETIMEDOUT, 5);
   3401 }
   3402 
   3403 /*
   3404  * We are dying for some reason.  Try to do it gracefully.  (May be called
   3405  * as writer.)
   3406  *
   3407  * Return -1 if the structure was not cleaned up (if the cleanup had to be
   3408  * done by a service procedure).
   3409  * TBD - Should the return value distinguish between the tcp_t being
   3410  * freed and it being reinitialized?
   3411  */
   3412 static int
   3413 tcp_clean_death(tcp_t *tcp, int err, uint8_t tag)
   3414 {
   3415 	mblk_t	*mp;
   3416 	queue_t	*q;
   3417 	conn_t	*connp = tcp->tcp_connp;
   3418 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   3419 
   3420 	TCP_CLD_STAT(tag);
   3421 
   3422 #if TCP_TAG_CLEAN_DEATH
   3423 	tcp->tcp_cleandeathtag = tag;
   3424 #endif
   3425 
   3426 	if (tcp->tcp_fused)
   3427 		tcp_unfuse(tcp);
   3428 
   3429 	if (tcp->tcp_linger_tid != 0 &&
   3430 	    TCP_TIMER_CANCEL(tcp, tcp->tcp_linger_tid) >= 0) {
   3431 		tcp_stop_lingering(tcp);
   3432 	}
   3433 
   3434 	ASSERT(tcp != NULL);
   3435 	ASSERT((tcp->tcp_family == AF_INET &&
   3436 	    tcp->tcp_ipversion == IPV4_VERSION) ||
   3437 	    (tcp->tcp_family == AF_INET6 &&
   3438 	    (tcp->tcp_ipversion == IPV4_VERSION ||
   3439 	    tcp->tcp_ipversion == IPV6_VERSION)));
   3440 
   3441 	if (TCP_IS_DETACHED(tcp)) {
   3442 		if (tcp->tcp_hard_binding) {
   3443 			/*
   3444 			 * Its an eager that we are dealing with. We close the
   3445 			 * eager but in case a conn_ind has already gone to the
   3446 			 * listener, let tcp_accept_finish() send a discon_ind
   3447 			 * to the listener and drop the last reference. If the
   3448 			 * listener doesn't even know about the eager i.e. the
   3449 			 * conn_ind hasn't gone up, blow away the eager and drop
   3450 			 * the last reference as well. If the conn_ind has gone
   3451 			 * up, state should be BOUND. tcp_accept_finish
   3452 			 * will figure out that the connection has received a
   3453 			 * RST and will send a DISCON_IND to the application.
   3454 			 */
   3455 			tcp_closei_local(tcp);
   3456 			if (!tcp->tcp_tconnind_started) {
   3457 				CONN_DEC_REF(connp);
   3458 			} else {
   3459 				tcp->tcp_state = TCPS_BOUND;
   3460 			}
   3461 		} else {
   3462 			tcp_close_detached(tcp);
   3463 		}
   3464 		return (0);
   3465 	}
   3466 
   3467 	TCP_STAT(tcps, tcp_clean_death_nondetached);
   3468 
   3469 	q = tcp->tcp_rq;
   3470 
   3471 	/* Trash all inbound data */
   3472 	if (!IPCL_IS_NONSTR(connp)) {
   3473 		ASSERT(q != NULL);
   3474 		flushq(q, FLUSHALL);
   3475 	}
   3476 
   3477 	/*
   3478 	 * If we are at least part way open and there is error
   3479 	 * (err==0 implies no error)
   3480 	 * notify our client by a T_DISCON_IND.
   3481 	 */
   3482 	if ((tcp->tcp_state >= TCPS_SYN_SENT) && err) {
   3483 		if (tcp->tcp_state >= TCPS_ESTABLISHED &&
   3484 		    !TCP_IS_SOCKET(tcp)) {
   3485 			/*
   3486 			 * Send M_FLUSH according to TPI. Because sockets will
   3487 			 * (and must) ignore FLUSHR we do that only for TPI
   3488 			 * endpoints and sockets in STREAMS mode.
   3489 			 */
   3490 			(void) putnextctl1(q, M_FLUSH, FLUSHR);
   3491 		}
   3492 		if (tcp->tcp_debug) {
   3493 			(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR,
   3494 			    "tcp_clean_death: discon err %d", err);
   3495 		}
   3496 		if (IPCL_IS_NONSTR(connp)) {
   3497 			/* Direct socket, use upcall */
   3498 			(*connp->conn_upcalls->su_disconnected)(
   3499 			    connp->conn_upper_handle, tcp->tcp_connid, err);
   3500 		} else {
   3501 			mp = mi_tpi_discon_ind(NULL, err, 0);
   3502 			if (mp != NULL) {
   3503 				putnext(q, mp);
   3504 			} else {
   3505 				if (tcp->tcp_debug) {
   3506 					(void) strlog(TCP_MOD_ID, 0, 1,
   3507 					    SL_ERROR|SL_TRACE,
   3508 					    "tcp_clean_death, sending M_ERROR");
   3509 				}
   3510 				(void) putnextctl1(q, M_ERROR, EPROTO);
   3511 			}
   3512 		}
   3513 		if (tcp->tcp_state <= TCPS_SYN_RCVD) {
   3514 			/* SYN_SENT or SYN_RCVD */
   3515 			BUMP_MIB(&tcps->tcps_mib, tcpAttemptFails);
   3516 		} else if (tcp->tcp_state <= TCPS_CLOSE_WAIT) {
   3517 			/* ESTABLISHED or CLOSE_WAIT */
   3518 			BUMP_MIB(&tcps->tcps_mib, tcpEstabResets);
   3519 		}
   3520 	}
   3521 
   3522 	tcp_reinit(tcp);
   3523 	if (IPCL_IS_NONSTR(connp))
   3524 		(void) tcp_do_unbind(connp);
   3525 
   3526 	return (-1);
   3527 }
   3528 
   3529 /*
   3530  * In case tcp is in the "lingering state" and waits for the SO_LINGER timeout
   3531  * to expire, stop the wait and finish the close.
   3532  */
   3533 static void
   3534 tcp_stop_lingering(tcp_t *tcp)
   3535 {
   3536 	clock_t	delta = 0;
   3537 	tcp_stack_t	*tcps = tcp->tcp_tcps;
   3538 
   3539 	tcp->tcp_linger_tid = 0;
   3540 	if (tcp->tcp_state > TCPS_LISTEN) {
   3541 		tcp_acceptor_hash_remove(tcp);
   3542 		mutex_enter(&tcp->tcp_non_sq_lock);
   3543 		if (tcp->tcp_flow_stopped) {
   3544 			tcp_clrqfull(tcp);
   3545 		}
   3546 		mutex_exit(&tcp->tcp_non_sq_lock);
   3547 
   3548 		if (tcp->tcp_timer_tid != 0) {
   3549 			delta = TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid);
   3550 			tcp->tcp_timer_tid = 0;
   3551 		}
   3552 		/*
   3553 		 * Need to cancel those timers which will not be used when
   3554 		 * TCP is detached.  This has to be done before the tcp_wq
   3555 		 * is set to the global queue.
   3556 		 */
   3557 		tcp_timers_stop(tcp);
   3558 
   3559 		tcp->tcp_detached = B_TRUE;
   3560 		ASSERT(tcps->tcps_g_q != NULL);
   3561 		tcp->tcp_rq = tcps->tcps_g_q;
   3562 		tcp->tcp_wq = WR(tcps->tcps_g_q);
   3563 
   3564 		if (tcp->tcp_state == TCPS_TIME_WAIT) {
   3565 			tcp_time_wait_append(tcp);
   3566 			TCP_DBGSTAT(tcps, tcp_detach_time_wait);
   3567 			goto finish;
   3568 		}
   3569 
   3570 		/*
   3571 		 * If delta is zero the timer event wasn't executed and was
   3572 		 * successfully canceled. In this case we need to restart it
   3573 		 * with the minimal delta possible.
   3574 		 */
   3575 		if (delta >= 0) {
   3576 			tcp->tcp_timer_tid = TCP_TIMER(tcp, tcp_timer,
   3577 			    delta ? delta : 1);
   3578 		}
   3579 	} else {
   3580 		tcp_closei_local(tcp);
   3581 		CONN_DEC_REF(tcp->tcp_connp);
   3582 	}
   3583 finish:
   3584 	/* Signal closing thread that it can complete close */
   3585 	mutex_enter(&tcp->tcp_closelock);
   3586 	tcp->tcp_detached = B_TRUE;
   3587 	ASSERT(tcps->tcps_g_q != NULL);
   3588 
   3589 	tcp->tcp_rq = tcps->tcps_g_q;
   3590 	tcp->tcp_wq = WR(tcps->tcps_g_q);
   3591 
   3592 	tcp->tcp_closed = 1;
   3593 	cv_signal(&tcp->tcp_closecv);
   3594 	mutex_exit(&tcp->tcp_closelock);
   3595 }
   3596 
   3597 /*
   3598  * Handle lingering timeouts. This function is called when the SO_LINGER timeout
   3599  * expires.
   3600  */
   3601 static void
   3602 tcp_close_linger_timeout(void *arg)
   3603 {
   3604 	conn_t	*connp = (conn_t *)arg;
   3605 	tcp_t 	*tcp = connp->conn_tcp;
   3606 
   3607 	tcp->tcp_client_errno = ETIMEDOUT;
   3608 	tcp_stop_lingering(tcp);
   3609 }
   3610 
   3611 static void
   3612 tcp_close_common(conn_t *connp, int flags)
   3613 {
   3614 	tcp_t		*tcp = connp->conn_tcp;
   3615 	mblk_t 		*mp = &tcp->tcp_closemp;
   3616 	boolean_t	conn_ioctl_cleanup_reqd = B_FALSE;
   3617 	mblk_t		*bp;
   3618 
   3619 	ASSERT(connp->conn_ref >= 2);
   3620 
   3621 	/*
   3622 	 * Mark the conn as closing. ill_pending_mp_add will not
   3623 	 * add any mp to the pending mp list, after this conn has
   3624 	 * started closing. Same for sq_pending_mp_add
   3625 	 */
   3626 	mutex_enter(&connp->conn_lock);
   3627 	connp->conn_state_flags |= CONN_CLOSING;
   3628 	if (connp->conn_oper_pending_ill != NULL)
   3629 		conn_ioctl_cleanup_reqd = B_TRUE;
   3630 	CONN_INC_REF_LOCKED(connp);
   3631 	mutex_exit(&connp->conn_lock);
   3632 	tcp->tcp_closeflags = (uint8_t)flags;
   3633 	ASSERT(connp->conn_ref >= 3);
   3634 
   3635 	/*
   3636 	 * tcp_closemp_used is used below without any protection of a lock
   3637 	 * as we don't expect any one else to use it concurrently at this
   3638 	 * point otherwise it would be a major defect.
   3639 	 */
   3640 
   3641 	if (mp->b_prev == NULL)
   3642 		tcp->tcp_closemp_used = B_TRUE;
   3643 	else
   3644 		cmn_err(CE_PANIC, "tcp_close: concurrent use of tcp_closemp: "
   3645 		    "connp %p tcp %p\n", (void *)connp, (void *)tcp);
   3646 
   3647 	TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
   3648 
   3649 	SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_close_output, connp,
   3650 	    tcp_squeue_flag, SQTAG_IP_TCP_CLOSE);
   3651 
   3652 	mutex_enter(&tcp->tcp_closelock);
   3653 	while (!tcp->tcp_closed) {
   3654 		if (!cv_wait_sig(&tcp->tcp_closecv, &tcp->tcp_closelock)) {
   3655 			/*
   3656 			 * The cv_wait_sig() was interrupted. We now do the
   3657 			 * following:
   3658 			 *
   3659 			 * 1) If the endpoint was lingering, we allow this
   3660 			 * to be interrupted by cancelling the linger timeout
   3661 			 * and closing normally.
   3662 			 *
   3663 			 * 2) Revert to calling cv_wait()
   3664 			 *
   3665 			 * We revert to using cv_wait() to avoid an
   3666 			 * infinite loop which can occur if the calling
   3667 			 * thread is higher priority than the squeue worker
   3668 			 * thread and is bound to the same cpu.
   3669 			 */
   3670 			if (tcp->tcp_linger && tcp->tcp_lingertime > 0) {
   3671 				mutex_exit(&tcp->tcp_closelock);
   3672 				/* Entering squeue, bump ref count. */
   3673 				CONN_INC_REF(connp);
   3674 				bp = allocb_wait(0, BPRI_HI, STR_NOSIG, NULL);
   3675 				SQUEUE_ENTER_ONE(connp->conn_sqp, bp,
   3676 				    tcp_linger_interrupted, connp,
   3677 				    tcp_squeue_flag, SQTAG_IP_TCP_CLOSE);
   3678 				mutex_enter(&tcp->tcp_closelock);
   3679 			}
   3680 			break;
   3681 		}
   3682 	}
   3683 	while (!tcp->tcp_closed)
   3684 		cv_wait(&tcp->tcp_closecv, &tcp->tcp_closelock);
   3685 	mutex_exit(&tcp->tcp_closelock);
   3686 
   3687 	/*
   3688 	 * In the case of listener streams that have eagers in the q or q0
   3689 	 * we wait for the eagers to drop their reference to us. tcp_rq and
   3690 	 * tcp_wq of the eagers point to our queues. By waiting for the
   3691 	 * refcnt to drop to 1, we are sure that the eagers have cleaned
   3692 	 * up their queue pointers and also dropped their references to us.
   3693 	 */
   3694 	if (tcp->tcp_wait_for_eagers) {
   3695 		mutex_enter(&connp->conn_lock);
   3696 		while (connp->conn_ref != 1) {
   3697 			cv_wait(&connp->conn_cv, &connp->conn_lock);
   3698 		}
   3699 		mutex_exit(&connp->conn_lock);
   3700 	}
   3701 	/*
   3702 	 * ioctl cleanup. The mp is queued in the
   3703 	 * ill_pending_mp or in the sq_pending_mp.
   3704 	 */
   3705 	if (conn_ioctl_cleanup_reqd)
   3706 		conn_ioctl_cleanup(connp);
   3707 
   3708 	tcp->tcp_cpid = -1;
   3709 }
   3710 
   3711 static int
   3712 tcp_tpi_close(queue_t *q, int flags)
   3713 {
   3714 	conn_t		*connp;
   3715 
   3716 	ASSERT(WR(q)->q_next == NULL);
   3717 
   3718 	if (flags & SO_FALLBACK) {
   3719 		/*
   3720 		 * stream is being closed while in fallback
   3721 		 * simply free the resources that were allocated
   3722 		 */
   3723 		inet_minor_free(WR(q)->q_ptr, (dev_t)(RD(q)->q_ptr));
   3724 		qprocsoff(q);
   3725 		goto done;
   3726 	}
   3727 
   3728 	connp = Q_TO_CONN(q);
   3729 	/*
   3730 	 * We are being closed as /dev/tcp or /dev/tcp6.
   3731 	 */
   3732 	tcp_close_common(connp, flags);
   3733 
   3734 	qprocsoff(q);
   3735 	inet_minor_free(connp->conn_minor_arena, connp->conn_dev);
   3736 
   3737 	/*
   3738 	 * Drop IP's reference on the conn. This is the last reference
   3739 	 * on the connp if the state was less than established. If the
   3740 	 * connection has gone into timewait state, then we will have
   3741 	 * one ref for the TCP and one more ref (total of two) for the
   3742 	 * classifier connected hash list (a timewait connections stays
   3743 	 * in connected hash till closed).
   3744 	 *
   3745 	 * We can't assert the references because there might be other
   3746 	 * transient reference places because of some walkers or queued
   3747 	 * packets in squeue for the timewait state.
   3748 	 */
   3749 	CONN_DEC_REF(connp);
   3750 done:
   3751 	q->q_ptr = WR(q)->q_ptr = NULL;
   3752 	return (0);
   3753 }
   3754 
   3755 static int
   3756 tcp_tpi_close_accept(queue_t *q)
   3757 {
   3758 	vmem_t	*minor_arena;
   3759 	dev_t	conn_dev;
   3760 
   3761 	ASSERT(WR(q)->q_qinfo == &tcp_acceptor_winit);
   3762 
   3763 	/*
   3764 	 * We had opened an acceptor STREAM for sockfs which is
   3765 	 * now being closed due to some error.
   3766 	 */
   3767 	qprocsoff(q);
   3768 
   3769 	minor_arena = (vmem_t *)WR(q)->q_ptr;
   3770 	conn_dev = (dev_t)RD(q)->q_ptr;
   3771 	ASSERT(minor_arena != NULL);
   3772 	ASSERT(conn_dev != 0);
   3773 	inet_minor_free(minor_arena, conn_dev);
   3774 	q->q_ptr = WR(q)->q_ptr = NULL;
   3775 	return (0);
   3776 }
   3777 
   3778 /*
   3779  * Called by tcp_close() routine via squeue when lingering is
   3780  * interrupted by a signal.
   3781  */
   3782 
   3783 /* ARGSUSED */
   3784 static void
   3785 tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2)
   3786 {
   3787 	conn_t	*connp = (conn_t *)arg;
   3788 	tcp_t	*tcp = connp->conn_tcp;
   3789 
   3790 	freeb(mp);
   3791 	if (tcp->tcp_linger_tid != 0 &&
   3792 	    TCP_TIMER_CANCEL(tcp, tcp->tcp_linger_tid) >= 0) {
   3793 		tcp_stop_lingering(tcp);
   3794 		tcp->tcp_client_errno = EINTR;
   3795 	}
   3796 }
   3797 
   3798 /*
   3799  * Called by streams close routine via squeues when our client blows off her
   3800  * descriptor, we take this to mean: "close the stream state NOW, close the tcp
   3801  * connection politely" When SO_LINGER is set (with a non-zero linger time and
   3802  * it is not a nonblocking socket) then this routine sleeps until the FIN is
   3803  * acked.
   3804  *
   3805  * NOTE: tcp_close potentially returns error when lingering.
   3806  * However, the stream head currently does not pass these errors
   3807  * to the application. 4.4BSD only returns EINTR and EWOULDBLOCK
   3808  * errors to the application (from tsleep()) and not errors
   3809  * like ECONNRESET caused by receiving a reset packet.
   3810  */
   3811 
   3812 /* ARGSUSED */
   3813 static void
   3814 tcp_close_output(void *arg, mblk_t *mp, void *arg2)
   3815 {
   3816 	char	*msg;
   3817 	conn_t	*connp = (