Home | History | Annotate | Download | only in tcp
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 /* Copyright (c) 1990 Mentat Inc. */
     27 
     28 #pragma ident	"%Z%%M%	%I%	%E% SMI"
     29 const char tcp_version[] = "%Z%%M%	%I%	%E% SMI";
     30 
     31 
     32 #include <sys/types.h>
     33 #include <sys/stream.h>
     34 #include <sys/strsun.h>
     35 #include <sys/strsubr.h>
     36 #include <sys/stropts.h>
     37 #include <sys/strlog.h>
     38 #include <sys/strsun.h>
     39 #define	_SUN_TPI_VERSION 2
     40 #include <sys/tihdr.h>
     41 #include <sys/timod.h>
     42 #include <sys/ddi.h>
     43 #include <sys/sunddi.h>
     44 #include <sys/suntpi.h>
     45 #include <sys/xti_inet.h>
     46 #include <sys/cmn_err.h>
     47 #include <sys/debug.h>
     48 #include <sys/sdt.h>
     49 #include <sys/vtrace.h>
     50 #include <sys/kmem.h>
     51 #include <sys/ethernet.h>
     52 #include <sys/cpuvar.h>
     53 #include <sys/dlpi.h>
     54 #include <sys/multidata.h>
     55 #include <sys/multidata_impl.h>
     56 #include <sys/pattr.h>
     57 #include <sys/policy.h>
     58 #include <sys/priv.h>
     59 #include <sys/zone.h>
     60 #include <sys/sunldi.h>
     61 
     62 #include <sys/errno.h>
     63 #include <sys/signal.h>
     64 #include <sys/socket.h>
     65 #include <sys/sockio.h>
     66 #include <sys/isa_defs.h>
     67 #include <sys/md5.h>
     68 #include <sys/random.h>
     69 #include <sys/sodirect.h>
     70 #include <sys/uio.h>
     71 #include <netinet/in.h>
     72 #include <netinet/tcp.h>
     73 #include <netinet/ip6.h>
     74 #include <netinet/icmp6.h>
     75 #include <net/if.h>
     76 #include <net/route.h>
     77 #include <inet/ipsec_impl.h>
     78 
     79 #include <inet/common.h>
     80 #include <inet/ip.h>
     81 #include <inet/ip_impl.h>
     82 #include <inet/ip6.h>
     83 #include <inet/ip_ndp.h>
     84 #include <inet/mi.h>
     85 #include <inet/mib2.h>
     86 #include <inet/nd.h>
     87 #include <inet/optcom.h>
     88 #include <inet/snmpcom.h>
     89 #include <inet/kstatcom.h>
     90 #include <inet/tcp.h>
     91 #include <inet/tcp_impl.h>
     92 #include <net/pfkeyv2.h>
     93 #include <inet/ipsec_info.h>
     94 #include <inet/ipdrop.h>
     95 #include <inet/tcp_trace.h>
     96 
     97 #include <inet/ipclassifier.h>
     98 #include <inet/ip_ire.h>
     99 #include <inet/ip_ftable.h>
    100 #include <inet/ip_if.h>
    101 #include <inet/ipp_common.h>
    102 #include <inet/ip_netinfo.h>
    103 #include <sys/squeue.h>
    104 #include <inet/kssl/ksslapi.h>
    105 #include <sys/tsol/label.h>
    106 #include <sys/tsol/tnet.h>
    107 #include <rpc/pmap_prot.h>
    108 
    109 /*
    110  * TCP Notes: aka FireEngine Phase I (PSARC 2002/433)
    111  *
    112  * (Read the detailed design doc in PSARC case directory)
    113  *
    114  * The entire tcp state is contained in tcp_t and conn_t structure
    115  * which are allocated in tandem using ipcl_conn_create() and passing
    116  * IPCL_CONNTCP as a flag. We use 'conn_ref' and 'conn_lock' to protect
    117  * the references on the tcp_t. The tcp_t structure is never compressed
    118  * and packets always land on the correct TCP perimeter from the time
    119  * eager is created till the time tcp_t dies (as such the old mentat
    120  * TCP global queue is not used for detached state and no IPSEC checking
    121  * is required). The global queue is still allocated to send out resets
    122  * for connection which have no listeners and IP directly calls
    123  * tcp_xmit_listeners_reset() which does any policy check.
    124  *
    125  * Protection and Synchronisation mechanism:
    126  *
    127  * The tcp data structure does not use any kind of lock for protecting
    128  * its state but instead uses 'squeues' for mutual exclusion from various
    129  * read and write side threads. To access a tcp member, the thread should
    130  * always be behind squeue (via squeue_enter, squeue_enter_nodrain, or
    131  * squeue_fill). Since the squeues allow a direct function call, caller
    132  * can pass any tcp function having prototype of edesc_t as argument
    133  * (different from traditional STREAMs model where packets come in only
    134  * designated entry points). The list of functions that can be directly
    135  * called via squeue are listed before the usual function prototype.
    136  *
    137  * Referencing:
    138  *
    139  * TCP is MT-Hot and we use a reference based scheme to make sure that the
    140  * tcp structure doesn't disappear when it's needed. When the application
    141  * creates an outgoing connection or accepts an incoming connection, we
    142  * start out with 2 references on 'conn_ref'. One for TCP and one for IP.
    143  * The IP reference is just a symbolic reference since ip_tcpclose()
    144  * looks at tcp structure after tcp_close_output() returns which could
    145  * have dropped the last TCP reference. So as long as the connection is
    146  * in attached state i.e. !TCP_IS_DETACHED, we have 2 references on the
    147  * conn_t. The classifier puts its own reference when the connection is
    148  * inserted in listen or connected hash. Anytime a thread needs to enter
    149  * the tcp connection perimeter, it retrieves the conn/tcp from q->ptr
    150  * on write side or by doing a classify on read side and then puts a
    151  * reference on the conn before doing squeue_enter/tryenter/fill. For
    152  * read side, the classifier itself puts the reference under fanout lock
    153  * to make sure that tcp can't disappear before it gets processed. The
    154  * squeue will drop this reference automatically so the called function
    155  * doesn't have to do a DEC_REF.
    156  *
    157  * Opening a new connection:
    158  *
    159  * The outgoing connection open is pretty simple. tcp_open() does the
    160  * work in creating the conn/tcp structure and initializing it. The
    161  * squeue assignment is done based on the CPU the application
    162  * is running on. So for outbound connections, processing is always done
    163  * on application CPU which might be different from the incoming CPU
    164  * being interrupted by the NIC. An optimal way would be to figure out
    165  * the NIC <-> CPU binding at listen time, and assign the outgoing
    166  * connection to the squeue attached to the CPU that will be interrupted
    167  * for incoming packets (we know the NIC based on the bind IP address).
    168  * This might seem like a problem if more data is going out but the
    169  * fact is that in most cases the transmit is ACK driven transmit where
    170  * the outgoing data normally sits on TCP's xmit queue waiting to be
    171  * transmitted.
    172  *
    173  * Accepting a connection:
    174  *
    175  * This is a more interesting case because of various races involved in
    176  * establishing an eager in its own perimeter. Read the meta comment on
    177  * top of tcp_conn_request(). But briefly, the squeue is picked by
    178  * ip_tcp_input()/ip_fanout_tcp_v6() based on the interrupted CPU.
    179  *
    180  * Closing a connection:
    181  *
    182  * The close is fairly straight forward. tcp_close() calls tcp_close_output()
    183  * via squeue to do the close and mark the tcp as detached if the connection
    184  * was in state TCPS_ESTABLISHED or greater. In the latter case, TCP keeps its
    185  * reference but tcp_close() drops IP's reference always. So if tcp was
    186  * not killed, it is sitting in time_wait list with 2 reference - 1 for TCP
    187  * and 1 because it is in classifier's connected hash. This is the condition
    188  * we use to determine that its OK to clean up the tcp outside of squeue
    189  * when time wait expires (check the ref under fanout and conn_lock and
    190  * if it is 2, remove it from fanout hash and kill it).
    191  *
    192  * Although close just drops the necessary references and marks the
    193  * tcp_detached state, tcp_close needs to know the tcp_detached has been
    194  * set (under squeue) before letting the STREAM go away (because an
    195  * inbound packet might attempt to go up the STREAM while the close
    196  * has happened and tcp_detached is not set). So a special lock and
    197  * flag is used along with a condition variable (tcp_closelock, tcp_closed,
    198  * and tcp_closecv) to signal tcp_close that tcp_close_output() has marked
    199  * tcp_detached.
    200  *
    201  * Special provisions and fast paths:
    202  *
    203  * We make special provision for (AF_INET, SOCK_STREAM) sockets which
    204  * can't have 'ipv6_recvpktinfo' set and for these type of sockets, IP
    205  * will never send a M_CTL to TCP. As such, ip_tcp_input() which handles
    206  * all TCP packets from the wire makes an IPCL_IS_TCP4_CONNECTED_NO_POLICY
    207  * check to send packets directly to tcp_rput_data via squeue. Everyone
    208  * else comes through tcp_input() on the read side.
    209  *
    210  * We also make special provisions for sockfs by marking tcp_issocket
    211  * whenever we have only sockfs on top of TCP. This allows us to skip
    212  * putting the tcp in acceptor hash since a sockfs listener can never
    213  * become acceptor and also avoid allocating a tcp_t for acceptor STREAM
    214  * since eager has already been allocated and the accept now happens
    215  * on acceptor STREAM. There is a big blob of comment on top of
    216  * tcp_conn_request explaining the new accept. When socket is POP'd,
    217  * sockfs sends us an ioctl to mark the fact and we go back to old
    218  * behaviour. Once tcp_issocket is unset, its never set for the
    219  * life of that connection.
    220  *
    221  * In support of on-board asynchronous DMA hardware (e.g. Intel I/OAT)
    222  * two consolidation private KAPIs are used to enqueue M_DATA mblk_t's
    223  * directly to the socket (sodirect) and start an asynchronous copyout
    224  * to a user-land receive-side buffer (uioa) when a blocking socket read
    225  * (e.g. read, recv, ...) is pending.
    226  *
    227  * This is accomplished when tcp_issocket is set and tcp_sodirect is not
    228  * NULL so points to an sodirect_t and if marked enabled then we enqueue
    229  * all mblk_t's directly to the socket.
    230  *
    231  * Further, if the sodirect_t sod_uioa and if marked enabled (due to a
    232  * blocking socket read, e.g. user-land read, recv, ...) then an asynchronous
    233  * copyout will be started directly to the user-land uio buffer. Also, as we
    234  * have a pending read, TCP's push logic can take into account the number of
    235  * bytes to be received and only awake the blocked read()er when the uioa_t
    236  * byte count has been satisfied.
    237  *
    238  * IPsec notes :
    239  *
    240  * Since a packet is always executed on the correct TCP perimeter
    241  * all IPsec processing is deferred to IP including checking new
    242  * connections and setting IPSEC policies for new connection. The
    243  * only exception is tcp_xmit_listeners_reset() which is called
    244  * directly from IP and needs to policy check to see if TH_RST
    245  * can be sent out.
    246  *
    247  * PFHooks notes :
    248  *
    249  * For mdt case, one meta buffer contains multiple packets. Mblks for every
    250  * packet are assembled and passed to the hooks. When packets are blocked,
    251  * or boundary of any packet is changed, the mdt processing is stopped, and
    252  * packets of the meta buffer are sent to the IP path one by one.
    253  */
    254 
    255 /*
    256  * Values for squeue switch:
    257  * 1: squeue_enter_nodrain
    258  * 2: squeue_enter
    259  * 3: squeue_fill
    260  */
    261 int tcp_squeue_close = 2;	/* Setable in /etc/system */
    262 int tcp_squeue_wput = 2;
    263 
    264 squeue_func_t tcp_squeue_close_proc;
    265 squeue_func_t tcp_squeue_wput_proc;
    266 
    267 /*
    268  * Macros for sodirect:
    269  *
    270  * SOD_PTR_ENTER(tcp, sodp) - for the tcp_t pointer "tcp" set the
    271  * sodirect_t pointer "sodp" to the socket/tcp shared sodirect_t
    272  * if it exists and is enabled, else to NULL. Note, in the current
    273  * sodirect implementation the sod_lock must not be held across any
    274  * STREAMS call (e.g. putnext) else a "recursive mutex_enter" PANIC
    275  * will result as sod_lock is the streamhead stdata.sd_lock.
    276  *
    277  * SOD_NOT_ENABLED(tcp) - return true if not a sodirect tcp_t or the
    278  * sodirect_t isn't enabled, useful for ASSERT()ing that a receive
    279  * side tcp code path dealing with a tcp_rcv_list or putnext() isn't
    280  * being used when sodirect code paths should be.
    281  */
    282 
    283 #define	SOD_PTR_ENTER(tcp, sodp)					\
    284 	(sodp) = (tcp)->tcp_sodirect;					\
    285 									\
    286 	if ((sodp) != NULL) {						\
    287 		mutex_enter((sodp)->sod_lock);				\
    288 		if (!((sodp)->sod_state & SOD_ENABLED)) {		\
    289 			mutex_exit((sodp)->sod_lock);			\
    290 			(sodp) = NULL;					\
    291 		}							\
    292 	}
    293 
    294 #define	SOD_NOT_ENABLED(tcp)						\
    295 	((tcp)->tcp_sodirect == NULL ||					\
    296 	    !((tcp)->tcp_sodirect->sod_state & SOD_ENABLED))
    297 
    298 /*
    299  * This controls how tiny a write must be before we try to copy it
    300  * into the the mblk on the tail of the transmit queue.  Not much
    301  * speedup is observed for values larger than sixteen.  Zero will
    302  * disable the optimisation.
    303  */
    304 int tcp_tx_pull_len = 16;
    305 
    306 /*
    307  * TCP Statistics.
    308  *
    309  * How TCP statistics work.
    310  *
    311  * There are two types of statistics invoked by two macros.
    312  *
    313  * TCP_STAT(name) does non-atomic increment of a named stat counter. It is
    314  * supposed to be used in non MT-hot paths of the code.
    315  *
    316  * TCP_DBGSTAT(name) does atomic increment of a named stat counter. It is
    317  * supposed to be used for DEBUG purposes and may be used on a hot path.
    318  *
    319  * Both TCP_STAT and TCP_DBGSTAT counters are available using kstat
    320  * (use "kstat tcp" to get them).
    321  *
    322  * There is also an additional debugging facility that marks tcp_clean_death()
    323  * instances and saves them in tcp_t structure. It is triggered by
    324  * TCP_TAG_CLEAN_DEATH define. Also, there is a global array of counters for
    325  * tcp_clean_death() calls that counts the number of times each tag was hit. It
    326  * is triggered by TCP_CLD_COUNTERS define.
    327  *
    328  * How to add new counters.
    329  *
    330  * 1) Add a field in the tcp_stat structure describing your counter.
    331  * 2) Add a line in the template in tcp_kstat2_init() with the name
    332  *    of the counter.
    333  *
    334  *    IMPORTANT!! - make sure that both are in sync !!
    335  * 3) Use either TCP_STAT or TCP_DBGSTAT with the name.
    336  *
    337  * Please avoid using private counters which are not kstat-exported.
    338  *
    339  * TCP_TAG_CLEAN_DEATH set to 1 enables tagging of tcp_clean_death() instances
    340  * in tcp_t structure.
    341  *
    342  * TCP_MAX_CLEAN_DEATH_TAG is the maximum number of possible clean death tags.
    343  */
    344 
    345 #ifndef TCP_DEBUG_COUNTER
    346 #ifdef DEBUG
    347 #define	TCP_DEBUG_COUNTER 1
    348 #else
    349 #define	TCP_DEBUG_COUNTER 0
    350 #endif
    351 #endif
    352 
    353 #define	TCP_CLD_COUNTERS 0
    354 
    355 #define	TCP_TAG_CLEAN_DEATH 1
    356 #define	TCP_MAX_CLEAN_DEATH_TAG 32
    357 
    358 #ifdef lint
    359 static int _lint_dummy_;
    360 #endif
    361 
    362 #if TCP_CLD_COUNTERS
    363 static uint_t tcp_clean_death_stat[TCP_MAX_CLEAN_DEATH_TAG];
    364 #define	TCP_CLD_STAT(x) tcp_clean_death_stat[x]++
    365 #elif defined(lint)
    366 #define	TCP_CLD_STAT(x) ASSERT(_lint_dummy_ == 0);
    367 #else
    368 #define	TCP_CLD_STAT(x)
    369 #endif
    370 
    371 #if TCP_DEBUG_COUNTER
    372 #define	TCP_DBGSTAT(tcps, x)	\
    373 	atomic_add_64(&((tcps)->tcps_statistics.x.value.ui64), 1)
    374 #define	TCP_G_DBGSTAT(x)	\
    375 	atomic_add_64(&(tcp_g_statistics.x.value.ui64), 1)
    376 #elif defined(lint)
    377 #define	TCP_DBGSTAT(tcps, x) ASSERT(_lint_dummy_ == 0);
    378 #define	TCP_G_DBGSTAT(x) ASSERT(_lint_dummy_ == 0);
    379 #else
    380 #define	TCP_DBGSTAT(tcps, x)
    381 #define	TCP_G_DBGSTAT(x)
    382 #endif
    383 
    384 #define	TCP_G_STAT(x)	(tcp_g_statistics.x.value.ui64++)
    385 
    386 tcp_g_stat_t	tcp_g_statistics;
    387 kstat_t		*tcp_g_kstat;
    388 
    389 /*
    390  * Call either ip_output or ip_output_v6. This replaces putnext() calls on the
    391  * tcp write side.
    392  */
    393 #define	CALL_IP_WPUT(connp, q, mp) {					\
    394 	tcp_stack_t	*tcps;						\
    395 									\
    396 	tcps = connp->conn_netstack->netstack_tcp;			\
    397 	ASSERT(((q)->q_flag & QREADR) == 0);				\
    398 	TCP_DBGSTAT(tcps, tcp_ip_output);				\
    399 	connp->conn_send(connp, (mp), (q), IP_WPUT);			\
    400 }
    401 
    402 /* Macros for timestamp comparisons */
    403 #define	TSTMP_GEQ(a, b)	((int32_t)((a)-(b)) >= 0)
    404 #define	TSTMP_LT(a, b)	((int32_t)((a)-(b)) < 0)
    405 
    406 /*
    407  * Parameters for TCP Initial Send Sequence number (ISS) generation.  When
    408  * tcp_strong_iss is set to 1, which is the default, the ISS is calculated
    409  * by adding three components: a time component which grows by 1 every 4096
    410  * nanoseconds (versus every 4 microseconds suggested by RFC 793, page 27);
    411  * a per-connection component which grows by 125000 for every new connection;
    412  * and an "extra" component that grows by a random amount centered
    413  * approximately on 64000.  This causes the the ISS generator to cycle every
    414  * 4.89 hours if no TCP connections are made, and faster if connections are
    415  * made.
    416  *
    417  * When tcp_strong_iss is set to 0, ISS is calculated by adding two
    418  * components: a time component which grows by 250000 every second; and
    419  * a per-connection component which grows by 125000 for every new connections.
    420  *
    421  * A third method, when tcp_strong_iss is set to 2, for generating ISS is
    422  * prescribed by Steve Bellovin.  This involves adding time, the 125000 per
    423  * connection, and a one-way hash (MD5) of the connection ID <sport, dport,
    424  * src, dst>, a "truly" random (per RFC 1750) number, and a console-entered
    425  * password.
    426  */
    427 #define	ISS_INCR	250000
    428 #define	ISS_NSEC_SHT	12
    429 
    430 static sin_t	sin_null;	/* Zero address for quick clears */
    431 static sin6_t	sin6_null;	/* Zero address for quick clears */
    432 
    433 /*
    434  * This implementation follows the 4.3BSD interpretation of the urgent
    435  * pointer and not RFC 1122. Switching to RFC 1122 behavior would cause
    436  * incompatible changes in protocols like telnet and rlogin.
    437  */
    438 #define	TCP_OLD_URP_INTERPRETATION	1
    439 
    440 #define	TCP_IS_DETACHED_NONEAGER(tcp)	\
    441 	(TCP_IS_DETACHED(tcp) && \
    442 	    (!(tcp)->tcp_hard_binding))
    443 
    444 /*
    445  * TCP reassembly macros.  We hide starting and ending sequence numbers in
    446  * b_next and b_prev of messages on the reassembly queue.  The messages are
    447  * chained using b_cont.  These macros are used in tcp_reass() so we don't
    448  * have to see the ugly casts and assignments.
    449  */
    450 #define	TCP_REASS_SEQ(mp)		((uint32_t)(uintptr_t)((mp)->b_next))
    451 #define	TCP_REASS_SET_SEQ(mp, u)	((mp)->b_next = \
    452 					(mblk_t *)(uintptr_t)(u))
    453 #define	TCP_REASS_END(mp)		((uint32_t)(uintptr_t)((mp)->b_prev))
    454 #define	TCP_REASS_SET_END(mp, u)	((mp)->b_prev = \
    455 					(mblk_t *)(uintptr_t)(u))
    456 
    457 /*
    458  * Implementation of TCP Timers.
    459  * =============================
    460  *
    461  * INTERFACE:
    462  *
    463  * There are two basic functions dealing with tcp timers:
    464  *
    465  *	timeout_id_t	tcp_timeout(connp, func, time)
    466  * 	clock_t		tcp_timeout_cancel(connp, timeout_id)
    467  *	TCP_TIMER_RESTART(tcp, intvl)
    468  *
    469  * tcp_timeout() starts a timer for the 'tcp' instance arranging to call 'func'
    470  * after 'time' ticks passed. The function called by timeout() must adhere to
    471  * the same restrictions as a driver soft interrupt handler - it must not sleep
    472  * or call other functions that might sleep. The value returned is the opaque
    473  * non-zero timeout identifier that can be passed to tcp_timeout_cancel() to
    474  * cancel the request. The call to tcp_timeout() may fail in which case it
    475  * returns zero. This is different from the timeout(9F) function which never
    476  * fails.
    477  *
    478  * The call-back function 'func' always receives 'connp' as its single
    479  * argument. It is always executed in the squeue corresponding to the tcp
    480  * structure. The tcp structure is guaranteed to be present at the time the
    481  * call-back is called.
    482  *
    483  * NOTE: The call-back function 'func' is never called if tcp is in
    484  * 	the TCPS_CLOSED state.
    485  *
    486  * tcp_timeout_cancel() attempts to cancel a pending tcp_timeout()
    487  * request. locks acquired by the call-back routine should not be held across
    488  * the call to tcp_timeout_cancel() or a deadlock may result.
    489  *
    490  * tcp_timeout_cancel() returns -1 if it can not cancel the timeout request.
    491  * Otherwise, it returns an integer value greater than or equal to 0. In
    492  * particular, if the call-back function is already placed on the squeue, it can
    493  * not be canceled.
    494  *
    495  * NOTE: both tcp_timeout() and tcp_timeout_cancel() should always be called
    496  * 	within squeue context corresponding to the tcp instance. Since the
    497  *	call-back is also called via the same squeue, there are no race
    498  *	conditions described in untimeout(9F) manual page since all calls are
    499  *	strictly serialized.
    500  *
    501  *      TCP_TIMER_RESTART() is a macro that attempts to cancel a pending timeout
    502  *	stored in tcp_timer_tid and starts a new one using
    503  *	MSEC_TO_TICK(intvl). It always uses tcp_timer() function as a call-back
    504  *	and stores the return value of tcp_timeout() in the tcp->tcp_timer_tid
    505  *	field.
    506  *
    507  * NOTE: since the timeout cancellation is not guaranteed, the cancelled
    508  *	call-back may still be called, so it is possible tcp_timer() will be
    509  *	called several times. This should not be a problem since tcp_timer()
    510  *	should always check the tcp instance state.
    511  *
    512  *
    513  * IMPLEMENTATION:
    514  *
    515  * TCP timers are implemented using a three-stage process. The call to
    516  * tcp_timeout() uses timeout(9F) function to call tcp_timer_callback() function
    517  * when the timer expires. The tcp_timer_callback() arranges the call of the
    518  * tcp_timer_handler() function via squeue corresponding to the tcp
    519  * instance. The tcp_timer_handler() calls actual requested timeout call-back
    520  * and passes tcp instance as an argument to it. Information is passed between
    521  * stages using the tcp_timer_t structure which contains the connp pointer, the
    522  * tcp call-back to call and the timeout id returned by the timeout(9F).
    523  *
    524  * The tcp_timer_t structure is not used directly, it is embedded in an mblk_t -
    525  * like structure that is used to enter an squeue. The mp->b_rptr of this pseudo
    526  * mblk points to the beginning of tcp_timer_t structure. The tcp_timeout()
    527  * returns the pointer to this mblk.
    528  *
    529  * The pseudo mblk is allocated from a special tcp_timer_cache kmem cache. It
    530  * looks like a normal mblk without actual dblk attached to it.
    531  *
    532  * To optimize performance each tcp instance holds a small cache of timer
    533  * mblocks. In the current implementation it caches up to two timer mblocks per
    534  * tcp instance. The cache is preserved over tcp frees and is only freed when
    535  * the whole tcp structure is destroyed by its kmem destructor. Since all tcp
    536  * timer processing happens on a corresponding squeue, the cache manipulation
    537  * does not require any locks. Experiments show that majority of timer mblocks
    538  * allocations are satisfied from the tcp cache and do not involve kmem calls.
    539  *
    540  * The tcp_timeout() places a refhold on the connp instance which guarantees
    541  * that it will be present at the time the call-back function fires. The
    542  * tcp_timer_handler() drops the reference after calling the call-back, so the
    543  * call-back function does not need to manipulate the references explicitly.
    544  */
    545 
    546 typedef struct tcp_timer_s {
    547 	conn_t	*connp;
    548 	void 	(*tcpt_proc)(void *);
    549 	timeout_id_t   tcpt_tid;
    550 } tcp_timer_t;
    551 
    552 static kmem_cache_t *tcp_timercache;
    553 kmem_cache_t	*tcp_sack_info_cache;
    554 kmem_cache_t	*tcp_iphc_cache;
    555 
    556 /*
    557  * For scalability, we must not run a timer for every TCP connection
    558  * in TIME_WAIT state.  To see why, consider (for time wait interval of
    559  * 4 minutes):
    560  *	1000 connections/sec * 240 seconds/time wait = 240,000 active conn's
    561  *
    562  * This list is ordered by time, so you need only delete from the head
    563  * until you get to entries which aren't old enough to delete yet.
    564  * The list consists of only the detached TIME_WAIT connections.
    565  *
    566  * Note that the timer (tcp_time_wait_expire) is started when the tcp_t
    567  * becomes detached TIME_WAIT (either by changing the state and already
    568  * being detached or the other way around). This means that the TIME_WAIT
    569  * state can be extended (up to doubled) if the connection doesn't become
    570  * detached for a long time.
    571  *
    572  * The list manipulations (including tcp_time_wait_next/prev)
    573  * are protected by the tcp_time_wait_lock. The content of the
    574  * detached TIME_WAIT connections is protected by the normal perimeters.
    575  *
    576  * This list is per squeue and squeues are shared across the tcp_stack_t's.
    577  * Things on tcp_time_wait_head remain associated with the tcp_stack_t
    578  * and conn_netstack.
    579  * The tcp_t's that are added to tcp_free_list are disassociated and
    580  * have NULL tcp_tcps and conn_netstack pointers.
    581  */
    582 typedef struct tcp_squeue_priv_s {
    583 	kmutex_t	tcp_time_wait_lock;
    584 	timeout_id_t	tcp_time_wait_tid;
    585 	tcp_t		*tcp_time_wait_head;
    586 	tcp_t		*tcp_time_wait_tail;
    587 	tcp_t		*tcp_free_list;
    588 	uint_t		tcp_free_list_cnt;
    589 } tcp_squeue_priv_t;
    590 
    591 /*
    592  * TCP_TIME_WAIT_DELAY governs how often the time_wait_collector runs.
    593  * Running it every 5 seconds seems to give the best results.
    594  */
    595 #define	TCP_TIME_WAIT_DELAY drv_usectohz(5000000)
    596 
    597 /*
    598  * To prevent memory hog, limit the number of entries in tcp_free_list
    599  * to 1% of available memory / number of cpus
    600  */
    601 uint_t tcp_free_list_max_cnt = 0;
    602 
    603 #define	TCP_XMIT_LOWATER	4096
    604 #define	TCP_XMIT_HIWATER	49152
    605 #define	TCP_RECV_LOWATER	2048
    606 #define	TCP_RECV_HIWATER	49152
    607 
    608 /*
    609  *  PAWS needs a timer for 24 days.  This is the number of ticks in 24 days
    610  */
    611 #define	PAWS_TIMEOUT	((clock_t)(24*24*60*60*hz))
    612 
    613 #define	TIDUSZ	4096	/* transport interface data unit size */
    614 
    615 /*
    616  * Bind hash list size and has function.  It has to be a power of 2 for
    617  * hashing.
    618  */
    619 #define	TCP_BIND_FANOUT_SIZE	512
    620 #define	TCP_BIND_HASH(lport) (ntohs(lport) & (TCP_BIND_FANOUT_SIZE - 1))
    621 /*
    622  * Size of listen and acceptor hash list.  It has to be a power of 2 for
    623  * hashing.
    624  */
    625 #define	TCP_FANOUT_SIZE		256
    626 
    627 #ifdef	_ILP32
    628 #define	TCP_ACCEPTOR_HASH(accid)					\
    629 		(((uint_t)(accid) >> 8) & (TCP_FANOUT_SIZE - 1))
    630 #else
    631 #define	TCP_ACCEPTOR_HASH(accid)					\
    632 		((uint_t)(accid) & (TCP_FANOUT_SIZE - 1))
    633 #endif	/* _ILP32 */
    634 
    635 #define	IP_ADDR_CACHE_SIZE	2048
    636 #define	IP_ADDR_CACHE_HASH(faddr)					\
    637 	(ntohl(faddr) & (IP_ADDR_CACHE_SIZE -1))
    638 
    639 /* Hash for HSPs uses all 32 bits, since both networks and hosts are in table */
    640 #define	TCP_HSP_HASH_SIZE 256
    641 
    642 #define	TCP_HSP_HASH(addr)					\
    643 	(((addr>>24) ^ (addr >>16) ^			\
    644 	    (addr>>8) ^ (addr)) % TCP_HSP_HASH_SIZE)
    645 
    646 /*
    647  * TCP options struct returned from tcp_parse_options.
    648  */
    649 typedef struct tcp_opt_s {
    650 	uint32_t	tcp_opt_mss;
    651 	uint32_t	tcp_opt_wscale;
    652 	uint32_t	tcp_opt_ts_val;
    653 	uint32_t	tcp_opt_ts_ecr;
    654 	tcp_t		*tcp;
    655 } tcp_opt_t;
    656 
    657 /*
    658  * RFC1323-recommended phrasing of TSTAMP option, for easier parsing
    659  */
    660 
    661 #ifdef _BIG_ENDIAN
    662 #define	TCPOPT_NOP_NOP_TSTAMP ((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | \
    663 	(TCPOPT_TSTAMP << 8) | 10)
    664 #else
    665 #define	TCPOPT_NOP_NOP_TSTAMP ((10 << 24) | (TCPOPT_TSTAMP << 16) | \
    666 	(TCPOPT_NOP << 8) | TCPOPT_NOP)
    667 #endif
    668 
    669 /*
    670  * Flags returned from tcp_parse_options.
    671  */
    672 #define	TCP_OPT_MSS_PRESENT	1
    673 #define	TCP_OPT_WSCALE_PRESENT	2
    674 #define	TCP_OPT_TSTAMP_PRESENT	4
    675 #define	TCP_OPT_SACK_OK_PRESENT	8
    676 #define	TCP_OPT_SACK_PRESENT	16
    677 
    678 /* TCP option length */
    679 #define	TCPOPT_NOP_LEN		1
    680 #define	TCPOPT_MAXSEG_LEN	4
    681 #define	TCPOPT_WS_LEN		3
    682 #define	TCPOPT_REAL_WS_LEN	(TCPOPT_WS_LEN+1)
    683 #define	TCPOPT_TSTAMP_LEN	10
    684 #define	TCPOPT_REAL_TS_LEN	(TCPOPT_TSTAMP_LEN+2)
    685 #define	TCPOPT_SACK_OK_LEN	2
    686 #define	TCPOPT_REAL_SACK_OK_LEN	(TCPOPT_SACK_OK_LEN+2)
    687 #define	TCPOPT_REAL_SACK_LEN	4
    688 #define	TCPOPT_MAX_SACK_LEN	36
    689 #define	TCPOPT_HEADER_LEN	2
    690 
    691 /* TCP cwnd burst factor. */
    692 #define	TCP_CWND_INFINITE	65535
    693 #define	TCP_CWND_SS		3
    694 #define	TCP_CWND_NORMAL		5
    695 
    696 /* Maximum TCP initial cwin (start/restart). */
    697 #define	TCP_MAX_INIT_CWND	8
    698 
    699 /*
    700  * Initialize cwnd according to RFC 3390.  def_max_init_cwnd is
    701  * either tcp_slow_start_initial or tcp_slow_start_after idle
    702  * depending on the caller.  If the upper layer has not used the
    703  * TCP_INIT_CWND option to change the initial cwnd, tcp_init_cwnd
    704  * should be 0 and we use the formula in RFC 3390 to set tcp_cwnd.
    705  * If the upper layer has changed set the tcp_init_cwnd, just use
    706  * it to calculate the tcp_cwnd.
    707  */
    708 #define	SET_TCP_INIT_CWND(tcp, mss, def_max_init_cwnd)			\
    709 {									\
    710 	if ((tcp)->tcp_init_cwnd == 0) {				\
    711 		(tcp)->tcp_cwnd = MIN(def_max_init_cwnd * (mss),	\
    712 		    MIN(4 * (mss), MAX(2 * (mss), 4380 / (mss) * (mss)))); \
    713 	} else {							\
    714 		(tcp)->tcp_cwnd = (tcp)->tcp_init_cwnd * (mss);		\
    715 	}								\
    716 	tcp->tcp_cwnd_cnt = 0;						\
    717 }
    718 
    719 /* TCP Timer control structure */
    720 typedef struct tcpt_s {
    721 	pfv_t	tcpt_pfv;	/* The routine we are to call */
    722 	tcp_t	*tcpt_tcp;	/* The parameter we are to pass in */
    723 } tcpt_t;
    724 
    725 /* Host Specific Parameter structure */
    726 typedef struct tcp_hsp {
    727 	struct tcp_hsp	*tcp_hsp_next;
    728 	in6_addr_t	tcp_hsp_addr_v6;
    729 	in6_addr_t	tcp_hsp_subnet_v6;
    730 	uint_t		tcp_hsp_vers;	/* IPV4_VERSION | IPV6_VERSION */
    731 	int32_t		tcp_hsp_sendspace;
    732 	int32_t		tcp_hsp_recvspace;
    733 	int32_t		tcp_hsp_tstamp;
    734 } tcp_hsp_t;
    735 #define	tcp_hsp_addr	V4_PART_OF_V6(tcp_hsp_addr_v6)
    736 #define	tcp_hsp_subnet	V4_PART_OF_V6(tcp_hsp_subnet_v6)
    737 
    738 /*
    739  * Functions called directly via squeue having a prototype of edesc_t.
    740  */
    741 void		tcp_conn_request(void *arg, mblk_t *mp, void *arg2);
    742 static void	tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2);
    743 void		tcp_accept_finish(void *arg, mblk_t *mp, void *arg2);
    744 static void	tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2);
    745 static void	tcp_wput_proto(void *arg, mblk_t *mp, void *arg2);
    746 void 		tcp_input(void *arg, mblk_t *mp, void *arg2);
    747 void		tcp_rput_data(void *arg, mblk_t *mp, void *arg2);
    748 static void	tcp_close_output(void *arg, mblk_t *mp, void *arg2);
    749 void		tcp_output(void *arg, mblk_t *mp, void *arg2);
    750 static void	tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2);
    751 static void	tcp_timer_handler(void *arg, mblk_t *mp, void *arg2);
    752 static void	tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2);
    753 
    754 
    755 /* Prototype for TCP functions */
    756 static void	tcp_random_init(void);
    757 int		tcp_random(void);
    758 static void	tcp_accept(tcp_t *tcp, mblk_t *mp);
    759 static void	tcp_accept_swap(tcp_t *listener, tcp_t *acceptor,
    760 		    tcp_t *eager);
    761 static int	tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp);
    762 static in_port_t tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
    763     int reuseaddr, boolean_t quick_connect, boolean_t bind_to_req_port_only,
    764     boolean_t user_specified);
    765 static void	tcp_closei_local(tcp_t *tcp);
    766 static void	tcp_close_detached(tcp_t *tcp);
    767 static boolean_t tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph,
    768 			mblk_t *idmp, mblk_t **defermp);
    769 static void	tcp_connect(tcp_t *tcp, mblk_t *mp);
    770 static void	tcp_connect_ipv4(tcp_t *tcp, mblk_t *mp, ipaddr_t *dstaddrp,
    771 		    in_port_t dstport, uint_t srcid);
    772 static void	tcp_connect_ipv6(tcp_t *tcp, mblk_t *mp, in6_addr_t *dstaddrp,
    773 		    in_port_t dstport, uint32_t flowinfo, uint_t srcid,
    774 		    uint32_t scope_id);
    775 static int	tcp_clean_death(tcp_t *tcp, int err, uint8_t tag);
    776 static void	tcp_def_q_set(tcp_t *tcp, mblk_t *mp);
    777 static void	tcp_disconnect(tcp_t *tcp, mblk_t *mp);
    778 static char	*tcp_display(tcp_t *tcp, char *, char);
    779 static boolean_t tcp_eager_blowoff(tcp_t *listener, t_scalar_t seqnum);
    780 static void	tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only);
    781 static void	tcp_eager_unlink(tcp_t *tcp);
    782 static void	tcp_err_ack(tcp_t *tcp, mblk_t *mp, int tlierr,
    783 		    int unixerr);
    784 static void	tcp_err_ack_prim(tcp_t *tcp, mblk_t *mp, int primitive,
    785 		    int tlierr, int unixerr);
    786 static int	tcp_extra_priv_ports_get(queue_t *q, mblk_t *mp, caddr_t cp,
    787 		    cred_t *cr);
    788 static int	tcp_extra_priv_ports_add(queue_t *q, mblk_t *mp,
    789 		    char *value, caddr_t cp, cred_t *cr);
    790 static int	tcp_extra_priv_ports_del(queue_t *q, mblk_t *mp,
    791 		    char *value, caddr_t cp, cred_t *cr);
    792 static int	tcp_tpistate(tcp_t *tcp);
    793 static void	tcp_bind_hash_insert(tf_t *tf, tcp_t *tcp,
    794     int caller_holds_lock);
    795 static void	tcp_bind_hash_remove(tcp_t *tcp);
    796 static tcp_t	*tcp_acceptor_hash_lookup(t_uscalar_t id, tcp_stack_t *);
    797 void		tcp_acceptor_hash_insert(t_uscalar_t id, tcp_t *tcp);
    798 static void	tcp_acceptor_hash_remove(tcp_t *tcp);
    799 static void	tcp_capability_req(tcp_t *tcp, mblk_t *mp);
    800 static void	tcp_info_req(tcp_t *tcp, mblk_t *mp);
    801 static void	tcp_addr_req(tcp_t *tcp, mblk_t *mp);
    802 static void	tcp_addr_req_ipv6(tcp_t *tcp, mblk_t *mp);
    803 void		tcp_g_q_setup(tcp_stack_t *);
    804 void		tcp_g_q_create(tcp_stack_t *);
    805 void		tcp_g_q_destroy(tcp_stack_t *);
    806 static int	tcp_header_init_ipv4(tcp_t *tcp);
    807 static int	tcp_header_init_ipv6(tcp_t *tcp);
    808 int		tcp_init(tcp_t *tcp, queue_t *q);
    809 static int	tcp_init_values(tcp_t *tcp);
    810 static mblk_t	*tcp_ip_advise_mblk(void *addr, int addr_len, ipic_t **ipic);
    811 static mblk_t	*tcp_ip_bind_mp(tcp_t *tcp, t_scalar_t bind_prim,
    812 		    t_scalar_t addr_length);
    813 static void	tcp_ip_ire_mark_advice(tcp_t *tcp);
    814 static void	tcp_ip_notify(tcp_t *tcp);
    815 static mblk_t	*tcp_ire_mp(mblk_t *mp);
    816 static void	tcp_iss_init(tcp_t *tcp);
    817 static void	tcp_keepalive_killer(void *arg);
    818 static int	tcp_parse_options(tcph_t *tcph, tcp_opt_t *tcpopt);
    819 static void	tcp_mss_set(tcp_t *tcp, uint32_t size, boolean_t do_ss);
    820 static int	tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp,
    821 		    int *do_disconnectp, int *t_errorp, int *sys_errorp);
    822 static boolean_t tcp_allow_connopt_set(int level, int name);
    823 int		tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr);
    824 int		tcp_opt_get(queue_t *q, int level, int name, uchar_t *ptr);
    825 int		tcp_opt_set(queue_t *q, uint_t optset_context, int level,
    826 		    int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp,
    827 		    uchar_t *outvalp, void *thisdg_attrs, cred_t *cr,
    828 		    mblk_t *mblk);
    829 static void	tcp_opt_reverse(tcp_t *tcp, ipha_t *ipha);
    830 static int	tcp_opt_set_header(tcp_t *tcp, boolean_t checkonly,
    831 		    uchar_t *ptr, uint_t len);
    832 static int	tcp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr);
    833 static boolean_t tcp_param_register(IDP *ndp, tcpparam_t *tcppa, int cnt,
    834     tcp_stack_t *);
    835 static int	tcp_param_set(queue_t *q, mblk_t *mp, char *value,
    836 		    caddr_t cp, cred_t *cr);
    837 static int	tcp_param_set_aligned(queue_t *q, mblk_t *mp, char *value,
    838 		    caddr_t cp, cred_t *cr);
    839 static void	tcp_iss_key_init(uint8_t *phrase, int len, tcp_stack_t *);
    840 static int	tcp_1948_phrase_set(queue_t *q, mblk_t *mp, char *value,
    841 		    caddr_t cp, cred_t *cr);
    842 static void	tcp_process_shrunk_swnd(tcp_t *tcp, uint32_t shrunk_cnt);
    843 static mblk_t	*tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start);
    844 static void	tcp_reass_elim_overlap(tcp_t *tcp, mblk_t *mp);
    845 static void	tcp_reinit(tcp_t *tcp);
    846 static void	tcp_reinit_values(tcp_t *tcp);
    847 static void	tcp_report_item(mblk_t *mp, tcp_t *tcp, int hashval,
    848 		    tcp_t *thisstream, cred_t *cr);
    849 
    850 static uint_t	tcp_rcv_drain(queue_t *q, tcp_t *tcp);
    851 static void	tcp_sack_rxmit(tcp_t *tcp, uint_t *flags);
    852 static boolean_t tcp_send_rst_chk(tcp_stack_t *);
    853 static void	tcp_ss_rexmit(tcp_t *tcp);
    854 static mblk_t	*tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp);
    855 static void	tcp_process_options(tcp_t *, tcph_t *);
    856 static void	tcp_rput_common(tcp_t *tcp, mblk_t *mp);
    857 static void	tcp_rsrv(queue_t *q);
    858 static int	tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd);
    859 static int	tcp_snmp_state(tcp_t *tcp);
    860 static int	tcp_status_report(queue_t *q, mblk_t *mp, caddr_t cp,
    861 		    cred_t *cr);
    862 static int	tcp_bind_hash_report(queue_t *q, mblk_t *mp, caddr_t cp,
    863 		    cred_t *cr);
    864 static int	tcp_listen_hash_report(queue_t *q, mblk_t *mp, caddr_t cp,
    865 		    cred_t *cr);
    866 static int	tcp_conn_hash_report(queue_t *q, mblk_t *mp, caddr_t cp,
    867 		    cred_t *cr);
    868 static int	tcp_acceptor_hash_report(queue_t *q, mblk_t *mp, caddr_t cp,
    869 		    cred_t *cr);
    870 static int	tcp_host_param_set(queue_t *q, mblk_t *mp, char *value,
    871 		    caddr_t cp, cred_t *cr);
    872 static int	tcp_host_param_set_ipv6(queue_t *q, mblk_t *mp, char *value,
    873 		    caddr_t cp, cred_t *cr);
    874 static int	tcp_host_param_report(queue_t *q, mblk_t *mp, caddr_t cp,
    875 		    cred_t *cr);
    876 static void	tcp_timer(void *arg);
    877 static void	tcp_timer_callback(void *);
    878 static in_port_t tcp_update_next_port(in_port_t port, const tcp_t *tcp,
    879     boolean_t random);
    880 static in_port_t tcp_get_next_priv_port(const tcp_t *);
    881 static void	tcp_wput_sock(queue_t *q, mblk_t *mp);
    882 void		tcp_wput_accept(queue_t *q, mblk_t *mp);
    883 static void	tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent);
    884 static void	tcp_wput_flush(tcp_t *tcp, mblk_t *mp);
    885 static void	tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp);
    886 static int	tcp_send(queue_t *q, tcp_t *tcp, const int mss,
    887 		    const int tcp_hdr_len, const int tcp_tcp_hdr_len,
    888 		    const int num_sack_blk, int *usable, uint_t *snxt,
    889 		    int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time,
    890 		    const int mdt_thres);
    891 static int	tcp_multisend(queue_t *q, tcp_t *tcp, const int mss,
    892 		    const int tcp_hdr_len, const int tcp_tcp_hdr_len,
    893 		    const int num_sack_blk, int *usable, uint_t *snxt,
    894 		    int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time,
    895 		    const int mdt_thres);
    896 static void	tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now,
    897 		    int num_sack_blk);
    898 static void	tcp_wsrv(queue_t *q);
    899 static int	tcp_xmit_end(tcp_t *tcp);
    900 static void	tcp_ack_timer(void *arg);
    901 static mblk_t	*tcp_ack_mp(tcp_t *tcp);
    902 static void	tcp_xmit_early_reset(char *str, mblk_t *mp,
    903 		    uint32_t seq, uint32_t ack, int ctl, uint_t ip_hdr_len,
    904 		    zoneid_t zoneid, tcp_stack_t *, conn_t *connp);
    905 static void	tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq,
    906 		    uint32_t ack, int ctl);
    907 static tcp_hsp_t *tcp_hsp_lookup(ipaddr_t addr, tcp_stack_t *);
    908 static tcp_hsp_t *tcp_hsp_lookup_ipv6(in6_addr_t *addr, tcp_stack_t *);
    909 static int	setmaxps(queue_t *q, int maxpsz);
    910 static void	tcp_set_rto(tcp_t *, time_t);
    911 static boolean_t tcp_check_policy(tcp_t *, mblk_t *, ipha_t *, ip6_t *,
    912 		    boolean_t, boolean_t);
    913 static void	tcp_icmp_error_ipv6(tcp_t *tcp, mblk_t *mp,
    914 		    boolean_t ipsec_mctl);
    915 static mblk_t	*tcp_setsockopt_mp(int level, int cmd,
    916 		    char *opt, int optlen);
    917 static int	tcp_build_hdrs(queue_t *, tcp_t *);
    918 static void	tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp,
    919 		    uint32_t seg_seq, uint32_t seg_ack, int seg_len,
    920 		    tcph_t *tcph);
    921 boolean_t	tcp_paws_check(tcp_t *tcp, tcph_t *tcph, tcp_opt_t *tcpoptp);
    922 boolean_t	tcp_reserved_port_add(int, in_port_t *, in_port_t *);
    923 boolean_t	tcp_reserved_port_del(in_port_t, in_port_t);
    924 boolean_t	tcp_reserved_port_check(in_port_t, tcp_stack_t *);
    925 static tcp_t	*tcp_alloc_temp_tcp(in_port_t, tcp_stack_t *);
    926 static int	tcp_reserved_port_list(queue_t *, mblk_t *, caddr_t, cred_t *);
    927 static mblk_t	*tcp_mdt_info_mp(mblk_t *);
    928 static void	tcp_mdt_update(tcp_t *, ill_mdt_capab_t *, boolean_t);
    929 static int	tcp_mdt_add_attrs(multidata_t *, const mblk_t *,
    930 		    const boolean_t, const uint32_t, const uint32_t,
    931 		    const uint32_t, const uint32_t, tcp_stack_t *);
    932 static void	tcp_multisend_data(tcp_t *, ire_t *, const ill_t *, mblk_t *,
    933 		    const uint_t, const uint_t, boolean_t *);
    934 static mblk_t	*tcp_lso_info_mp<