Home | History | Annotate | Download | only in ip
      1      0      stevel /*
      2      0      stevel  * CDDL HEADER START
      3      0      stevel  *
      4      0      stevel  * The contents of this file are subject to the terms of the
      5   1392     ja97890  * Common Development and Distribution License (the "License").
      6   1392     ja97890  * You may not use this file except in compliance with the License.
      7      0      stevel  *
      8      0      stevel  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9      0      stevel  * or http://www.opensolaris.org/os/licensing.
     10      0      stevel  * See the License for the specific language governing permissions
     11      0      stevel  * and limitations under the License.
     12      0      stevel  *
     13      0      stevel  * When distributing Covered Code, include this CDDL HEADER in each
     14      0      stevel  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15      0      stevel  * If applicable, add the following below this CDDL HEADER, with the
     16      0      stevel  * fields enclosed by brackets "[]" replaced with your own identifying
     17      0      stevel  * information: Portions Copyright [yyyy] [name of copyright owner]
     18      0      stevel  *
     19      0      stevel  * CDDL HEADER END
     20      0      stevel  */
     21   3448    dh155122 
     22      0      stevel /*
     23   8485       Peter  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24      0      stevel  * Use is subject to license terms.
     25      0      stevel  */
     26      0      stevel /* Copyright (c) 1990 Mentat Inc. */
     27      0      stevel 
     28      0      stevel #include <sys/types.h>
     29      0      stevel #include <sys/stream.h>
     30      0      stevel #include <sys/dlpi.h>
     31      0      stevel #include <sys/stropts.h>
     32      0      stevel #include <sys/sysmacros.h>
     33      0      stevel #include <sys/strsubr.h>
     34      0      stevel #include <sys/strlog.h>
     35      0      stevel #include <sys/strsun.h>
     36      0      stevel #include <sys/zone.h>
     37      0      stevel #define	_SUN_TPI_VERSION 2
     38      0      stevel #include <sys/tihdr.h>
     39      0      stevel #include <sys/xti_inet.h>
     40      0      stevel #include <sys/ddi.h>
     41  11042        Erik #include <sys/suntpi.h>
     42      0      stevel #include <sys/cmn_err.h>
     43      0      stevel #include <sys/debug.h>
     44      0      stevel #include <sys/kobj.h>
     45      0      stevel #include <sys/modctl.h>
     46      0      stevel #include <sys/atomic.h>
     47      0      stevel #include <sys/policy.h>
     48   1676         jpk #include <sys/priv.h>
     49   8275        Eric #include <sys/taskq.h>
     50      0      stevel 
     51      0      stevel #include <sys/systm.h>
     52      0      stevel #include <sys/param.h>
     53      0      stevel #include <sys/kmem.h>
     54   2958    dr146992 #include <sys/sdt.h>
     55      0      stevel #include <sys/socket.h>
     56      0      stevel #include <sys/vtrace.h>
     57      0      stevel #include <sys/isa_defs.h>
     58   5868    dr146992 #include <sys/mac.h>
     59      0      stevel #include <net/if.h>
     60      0      stevel #include <net/if_arp.h>
     61      0      stevel #include <net/route.h>
     62      0      stevel #include <sys/sockio.h>
     63      0      stevel #include <netinet/in.h>
     64      0      stevel #include <net/if_dl.h>
     65      0      stevel 
     66      0      stevel #include <inet/common.h>
     67      0      stevel #include <inet/mi.h>
     68      0      stevel #include <inet/mib2.h>
     69      0      stevel #include <inet/nd.h>
     70      0      stevel #include <inet/arp.h>
     71      0      stevel #include <inet/snmpcom.h>
     72   5240    nordmark #include <inet/optcom.h>
     73      0      stevel #include <inet/kstatcom.h>
     74      0      stevel 
     75      0      stevel #include <netinet/igmp_var.h>
     76      0      stevel #include <netinet/ip6.h>
     77      0      stevel #include <netinet/icmp6.h>
     78      0      stevel #include <netinet/sctp.h>
     79      0      stevel 
     80      0      stevel #include <inet/ip.h>
     81    741    masputra #include <inet/ip_impl.h>
     82      0      stevel #include <inet/ip6.h>
     83      0      stevel #include <inet/ip6_asp.h>
     84      0      stevel #include <inet/tcp.h>
     85    741    masputra #include <inet/tcp_impl.h>
     86      0      stevel #include <inet/ip_multi.h>
     87      0      stevel #include <inet/ip_if.h>
     88      0      stevel #include <inet/ip_ire.h>
     89   2535    sangeeta #include <inet/ip_ftable.h>
     90      0      stevel #include <inet/ip_rts.h>
     91      0      stevel #include <inet/ip_ndp.h>
     92      0      stevel #include <inet/ip_listutils.h>
     93      0      stevel #include <netinet/igmp.h>
     94      0      stevel #include <netinet/ip_mroute.h>
     95      0      stevel #include <inet/ipp_common.h>
     96      0      stevel 
     97      0      stevel #include <net/pfkeyv2.h>
     98      0      stevel #include <inet/sadb.h>
     99      0      stevel #include <inet/ipsec_impl.h>
    100  10616   Sebastien #include <inet/iptun/iptun_impl.h>
    101      0      stevel #include <inet/ipdrop.h>
    102   2958    dr146992 #include <inet/ip_netinfo.h>
    103  10946    Sangeeta #include <inet/ilb_ip.h>
    104      0      stevel 
    105      0      stevel #include <sys/ethernet.h>
    106      0      stevel #include <net/if_types.h>
    107      0      stevel #include <sys/cpuvar.h>
    108      0      stevel 
    109      0      stevel #include <ipp/ipp.h>
    110      0      stevel #include <ipp/ipp_impl.h>
    111      0      stevel #include <ipp/ipgpc/ipgpc.h>
    112      0      stevel 
    113      0      stevel #include <sys/pattr.h>
    114      0      stevel #include <inet/ipclassifier.h>
    115      0      stevel #include <inet/sctp_ip.h>
    116   2252    priyanka #include <inet/sctp/sctp_impl.h>
    117    741    masputra #include <inet/udp_impl.h>
    118   5240    nordmark #include <inet/rawip_impl.h>
    119   5240    nordmark #include <inet/rts_impl.h>
    120   1676         jpk 
    121   1676         jpk #include <sys/tsol/label.h>
    122   1676         jpk #include <sys/tsol/tnet.h>
    123   1676         jpk 
    124   8275        Eric #include <sys/squeue_impl.h>
    125  11042        Erik #include <inet/ip_arp.h>
    126  11110        Erik 
    127  11110        Erik #include <sys/clock_impl.h>	/* For LBOLT_FASTPATH{,64} */
    128      0      stevel 
    129      0      stevel /*
    130      0      stevel  * Values for the squeue switch (ip_squeue_enter) and the matching SQ_* flag:
    131   8275        Eric  * IP_SQUEUE_ENTER_NODRAIN: SQ_NODRAIN
    132   8275        Eric  * IP_SQUEUE_ENTER: SQ_PROCESS
    133   8275        Eric  * IP_SQUEUE_FILL: SQ_FILL
    134      0      stevel  */
    135  11042        Erik int ip_squeue_enter = IP_SQUEUE_ENTER;	/* Settable in /etc/system */
    136   3448    dh155122 
    137   8275        Eric int ip_squeue_flag;
    138      0      stevel 
    139   3448    dh155122 /*
    140   3448    dh155122  * Settable in /etc/system
    141   3448    dh155122  */
    142      0      stevel int ip_poll_normal_ms = 100;
    143      0      stevel int ip_poll_normal_ticks = 0;
    144   3233    yz147064 int ip_modclose_ackwait_ms = 3000;
    145   5023    carlsonj 
    146   5023    carlsonj /*
    147   5023    carlsonj  * It would be nice to have these present only in DEBUG systems, but the
    148   5023    carlsonj  * current design of the global symbol checking logic requires them to be
    149   5023    carlsonj  * unconditionally present.
    150   5023    carlsonj  */
    151   5023    carlsonj uint_t ip_thread_data;			/* TSD key for debug support */
    152   5023    carlsonj krwlock_t ip_thread_rwlock;	/* presumably guards ip_thread_list; confirm */
    153   5023    carlsonj list_t	ip_thread_list;
    154      0      stevel 
    155      0      stevel /*
    156      0      stevel  * Head and tail of a linked list of mblk_t's. Used by ip_snmp_* functions.
    157      0      stevel  */
    158      0      stevel 
    159      0      stevel struct listptr_s {
    160      0      stevel 	mblk_t	*lp_head;	/* pointer to the head of the list */
    161      0      stevel 	mblk_t	*lp_tail;	/* pointer to the tail of the list */
    162      0      stevel };
    163      0      stevel 
    164      0      stevel typedef struct listptr_s listptr_t;
    165      0      stevel 
    166      0      stevel /*
    167   1676         jpk  * This is used by ip_snmp_get_mib2_ip_route_media and
    168   1676         jpk  * ip_snmp_get_mib2_ip6_route_media to carry the lists of return data.
    169   1676         jpk  */
    170   1676         jpk typedef struct iproutedata_s {
    171   1676         jpk 	uint_t		ird_idx;
    172   8485       Peter 	uint_t		ird_flags;	/* IRD_* flags; see below */
    173   1676         jpk 	listptr_t	ird_route;	/* ipRouteEntryTable */
    174   1676         jpk 	listptr_t	ird_netmedia;	/* ipNetToMediaEntryTable */
    175   1676         jpk 	listptr_t	ird_attrs;	/* ipRouteAttributeTable */
    176   1676         jpk } iproutedata_t;
    177   1676         jpk 
    178  11042        Erik /* ird_flags value: include ire_testhidden and IRE_IF_CLONE routes */
    179  11042        Erik #define	IRD_REPORT_ALL	0x01
    180   8485       Peter 
    181   1676         jpk /*
    182      0      stevel  * Cluster specific hooks. These should be NULL when not booted as a cluster.
    183      0      stevel  */
    184      0      stevel 
    185      0      stevel /*
    186      0      stevel  * Hook functions to enable cluster networking
    187      0      stevel  * On non-clustered systems these vectors must always be NULL.
    188      0      stevel  *
    189      0      stevel  * Hook function to check whether the specified IP address is a shared IP
    190      0      stevel  * address in the cluster.
    191      0      stevel  *
    192      0      stevel  */
    193   8392     Huafeng int (*cl_inet_isclusterwide)(netstackid_t stack_id, uint8_t protocol,
    194   8392     Huafeng     sa_family_t addr_family, uint8_t *laddrp, void *args) = NULL;
    195      0      stevel 
    196      0      stevel /*
    197      0      stevel  * Hook function to generate cluster wide ip fragment identifier
    198      0      stevel  */
    199   8392     Huafeng uint32_t (*cl_inet_ipident)(netstackid_t stack_id, uint8_t protocol,
    200   8392     Huafeng     sa_family_t addr_family, uint8_t *laddrp, uint8_t *faddrp,
    201   8392     Huafeng     void *args) = NULL;
    202   7749  Thejaswini 
    203   7749  Thejaswini /*
    204   7749  Thejaswini  * Hook function to generate cluster wide SPI.
    205   7749  Thejaswini  */
    206   8392     Huafeng void (*cl_inet_getspi)(netstackid_t, uint8_t, uint8_t *, size_t,
    207   8392     Huafeng     void *) = NULL;
    208   7749  Thejaswini 
    209   7749  Thejaswini /*
    210   7749  Thejaswini  * Hook function to verify if the SPI is already utilized.
    211   7749  Thejaswini  */
    212   7749  Thejaswini 
    213   8392     Huafeng int (*cl_inet_checkspi)(netstackid_t, uint8_t, uint32_t, void *) = NULL;
    214   7749  Thejaswini 
    215   7749  Thejaswini /*
    216   7749  Thejaswini  * Hook function to delete the SPI from the cluster wide repository.
    217   7749  Thejaswini  */
    218   7749  Thejaswini 
    219   8392     Huafeng void (*cl_inet_deletespi)(netstackid_t, uint8_t, uint32_t, void *) = NULL;
    220   7749  Thejaswini 
    221   7749  Thejaswini /*
    222   7749  Thejaswini  * Hook function to inform the cluster when a packet is received on an IDLE SA
    223   7749  Thejaswini  */
    224   7749  Thejaswini 
    225   8392     Huafeng void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t,
    226   8392     Huafeng     in6_addr_t, in6_addr_t, void *) = NULL;
    227      0      stevel 
    228      0      stevel /*
    229      0      stevel  * Synchronization notes:
    230      0      stevel  *
    231      0      stevel  * IP is a fully D_MP STREAMS module/driver. Thus it does not depend on any
    232      0      stevel  * MT level protection given by STREAMS. IP uses a combination of its own
    233      0      stevel  * internal serialization mechanism and standard Solaris locking techniques.
    234   8485       Peter  * The internal serialization is per phyint.  This is used to serialize
    235  11042        Erik  * plumbing operations, IPMP operations, most set ioctls, etc.
    236      0      stevel  *
    237      0      stevel  * Plumbing is a long sequence of operations involving message
    238      0      stevel  * exchanges between IP, ARP and device drivers. Many set ioctls are typically
    239      0      stevel  * involved in plumbing operations. A natural model is to serialize these
    240      0      stevel  * ioctls one per ill. For example plumbing of hme0 and qfe0 can go on in
    241      0      stevel  * parallel without any interference. But various set ioctls on hme0 are best
    242  11042        Erik  * serialized, along with IPMP operations and processing of DLPI control
    243  11042        Erik  * messages received from drivers on a per phyint basis. This serialization is
    244  11042        Erik  * provided by the ipsq_t and primitives operating on this. Details can
    245  11042        Erik  * be found in ip_if.c above the core primitives operating on ipsq_t.
    246      0      stevel  *
    247      0      stevel  * Lookups of an ipif or ill by a thread return a refheld ipif / ill.
    248      0      stevel  * Similarly lookup of an ire by a thread also returns a refheld ire.
    249      0      stevel  * In addition ipif's and ill's referenced by the ire are also indirectly
    250  11042        Erik  * refheld. Thus no ipif or ill can vanish as long as an ipif is refheld
    251   8485       Peter  * directly or indirectly. For example an SIOCSLIFADDR ioctl that changes the
    252      0      stevel  * address of an ipif has to go through the ipsq_t. This ensures that only
    253  11042        Erik  * one such exclusive operation proceeds at any time on the ipif. It then
    254  11042        Erik  * waits for all refcnts
    255      0      stevel  * associated with this ipif to come down to zero. The address is changed
    256      0      stevel  * only after the ipif has been quiesced. Then the ipif is brought up again.
    257      0      stevel  * More details are described above the comment in ip_sioctl_flags.
    258      0      stevel  *
    259      0      stevel  * Packet processing is based mostly on IREs and is fully multi-threaded
    260      0      stevel  * using standard Solaris MT techniques.
    261      0      stevel  *
    262      0      stevel  * There are explicit locks in IP to handle:
    263      0      stevel  * - The ip_g_head list maintained by mi_open_link() and friends.
    264      0      stevel  *
    265      0      stevel  * - The reassembly data structures (one lock per hash bucket)
    266      0      stevel  *
    267      0      stevel  * - conn_lock is meant to protect conn_t fields. The fields actually
    268      0      stevel  *   protected by conn_lock are documented in the conn_t definition.
    269      0      stevel  *
    270      0      stevel  * - ire_lock to protect some of the fields of the ire, IRE tables
    271      0      stevel  *   (one lock per hash bucket). Refer to ip_ire.c for details.
    272      0      stevel  *
    273  11042        Erik  * - ndp_g_lock and ncec_lock for protecting NCEs.
    274      0      stevel  *
    275      0      stevel  * - ill_lock protects fields of the ill and ipif. Details in ip.h
    276      0      stevel  *
    277      0      stevel  * - ill_g_lock: This is a global reader/writer lock. Protects the following
    278      0      stevel  *	* The AVL tree based global multi list of all ills.
    279      0      stevel  *	* The linked list of all ipifs of an ill
    280   8485       Peter  *	* The <ipsq-xop> mapping
    281      0      stevel  *	* <ill-phyint> association
    282      0      stevel  *   Insertion/deletion of an ill in the system, insertion/deletion of an ipif
    283   8485       Peter  *   into an ill, changing the <ipsq-xop> mapping of an ill, changing the
    284   8485       Peter  *   <ill-phyint> assoc of an ill will all have to hold the ill_g_lock as
    285   8485       Peter  *   writer for the actual duration of the insertion/deletion/change.
    286      0      stevel  *
    287      0      stevel  * - ill_lock:  This is a per ill mutex.
    288   8485       Peter  *   It protects some members of the ill_t struct; see ip.h for details.
    289      0      stevel  *   It also protects the <ill-phyint> assoc.
    290      0      stevel  *   It also protects the list of ipifs hanging off the ill.
    291      0      stevel  *
    292      0      stevel  * - ipsq_lock: This is a per ipsq_t mutex lock.
    293   8485       Peter  *   This protects some members of the ipsq_t struct; see ip.h for details.
    294   8485       Peter  *   It also protects the <ipsq-xop> mapping
    295   8485       Peter  *
    296   8485       Peter  * - ipx_lock: This is a per ipxop_t mutex lock.
    297   8485       Peter  *   This protects some members of the ipxop_t struct; see ip.h for details.
    298      0      stevel  *
    299      0      stevel  * - phyint_lock: This is a per phyint mutex lock. Protects just the
    300      0      stevel  *   phyint_flags
    301      0      stevel  *
    302      0      stevel  * - ip_g_nd_lock: This is a global reader/writer lock.
    303      0      stevel  *   Any call to nd_load to load a new parameter to the ND table must hold the
    304      0      stevel  *   lock as writer. ND_GET/ND_SET routines that read the ND table hold the lock
    305      0      stevel  *   as reader.
    306      0      stevel  *
    307      0      stevel  * - ip_addr_avail_lock: This is used to ensure the uniqueness of IP addresses.
    308      0      stevel  *   This lock is held in ipif_up_done and the ipif is marked IPIF_UP and the
    309      0      stevel  *   uniqueness check also done atomically.
    310      0      stevel  *
    311      0      stevel  * - ill_g_usesrc_lock: This readers/writer lock protects the usesrc
    312      0      stevel  *   group list linked by ill_usesrc_grp_next. It also protects the
    313      0      stevel  *   ill_usesrc_ifindex field. It is taken as a writer when a member of the
    314      0      stevel  *   group is being added or deleted.  This lock is taken as a reader when
    315      0      stevel  *   walking the list/group(eg: to get the number of members in a usesrc group).
    316      0      stevel  *   Note, it is only necessary to take this lock if the ill_usesrc_grp_next
    317      0      stevel  *   field is changing state i.e from NULL to non-NULL or vice-versa. For
    318      0      stevel  *   example, it is not necessary to take this lock in the initial portion
    319   8485       Peter  *   of ip_sioctl_slifusesrc or at all in ip_sioctl_flags since these
    320   8485       Peter  *   operations are executed exclusively and that ensures that the "usesrc
    321   8485       Peter  *   group state" cannot change. The "usesrc group state" change can happen
    322   8485       Peter  *   only in the latter part of ip_sioctl_slifusesrc and in ill_delete.
    323   8485       Peter  *
    324   8485       Peter  * Changing <ill-phyint>, <ipsq-xop> associations:
    325      0      stevel  *
    326      0      stevel  * To change the <ill-phyint> association, the ill_g_lock must be held
    327      0      stevel  * as writer, and the ill_locks of both the v4 and v6 instance of the ill
    328      0      stevel  * must be held.
    329      0      stevel  *
    330   8485       Peter  * To change the <ipsq-xop> association, the ill_g_lock must be held as
    331   8485       Peter  * writer, the ipsq_lock must be held, and one must be writer on the ipsq.
    332   8485       Peter  * This is only done when ills are added or removed from IPMP groups.
    333      0      stevel  *
    334      0      stevel  * To add or delete an ipif from the list of ipifs hanging off the ill,
    335      0      stevel  * ill_g_lock (writer) and ill_lock must be held and the thread must be
    336   8485       Peter  * a writer on the associated ipsq.
    337      0      stevel  *
    338      0      stevel  * To add or delete an ill to the system, the ill_g_lock must be held as
    339      0      stevel  * writer and the thread must be a writer on the associated ipsq.
    340      0      stevel  *
    341      0      stevel  * To add or delete an ilm to an ill, the ill_lock must be held and the thread
    342      0      stevel  * must be a writer on the associated ipsq.
    343      0      stevel  *
    344      0      stevel  * Lock hierarchy
    345      0      stevel  *
    346      0      stevel  * Some lock hierarchy scenarios are listed below.
    347      0      stevel  *
    348   8485       Peter  * ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock -> ipx_lock
    349      0      stevel  * ill_g_lock -> ill_lock(s) -> phyint_lock
    350  11042        Erik  * ill_g_lock -> ndp_g_lock -> ill_lock -> ncec_lock
    351      0      stevel  * ill_g_lock -> ip_addr_avail_lock
    352      0      stevel  * conn_lock -> irb_lock -> ill_lock -> ire_lock
    353      0      stevel  * ill_g_lock -> ip_g_nd_lock
    354  11042        Erik  * ill_g_lock -> ips_ipmp_lock -> ill_lock -> nce_lock
    355  11042        Erik  * ill_g_lock -> ndp_g_lock -> ill_lock -> ncec_lock -> nce_lock
    356  11042        Erik  * arl_lock -> ill_lock
    357  11042        Erik  * ips_ire_dep_lock -> irb_lock
    358   1676         jpk  *
    359   1676         jpk  * When more than 1 ill lock is needed to be held, all ill lock addresses
    360   1676         jpk  * are sorted on address and locked starting from highest addressed lock
    361   1676         jpk  * downward.
    362   1676         jpk  *
    363  11042        Erik  * Multicast scenarios
    364  11042        Erik  * ips_ill_g_lock -> ill_mcast_lock
    365  11042        Erik  * conn_ilg_lock -> ips_ill_g_lock -> ill_lock
    366  11042        Erik  * ill_mcast_serializer -> ill_mcast_lock -> ips_ipmp_lock -> ill_lock
    367  11042        Erik  * ill_mcast_serializer -> ill_mcast_lock -> connf_lock -> conn_lock
    368  11042        Erik  * ill_mcast_serializer -> ill_mcast_lock -> conn_ilg_lock
    369  11042        Erik  * ill_mcast_serializer -> ill_mcast_lock -> ips_igmp_timer_lock
    370  11042        Erik  *
    371   1676         jpk  * IPsec scenarios
    372   1676         jpk  *
    373   1676         jpk  * ipsa_lock -> ill_g_lock -> ill_lock
    374      0      stevel  * ill_g_usesrc_lock -> ill_g_lock -> ill_lock
    375      0      stevel  *
    376   1676         jpk  * Trusted Solaris scenarios
    377   1676         jpk  *
    378   1676         jpk  * igsa_lock -> gcgrp_rwlock -> gcgrp_lock
    379   1676         jpk  * igsa_lock -> gcdb_lock
    380   1676         jpk  * gcgrp_rwlock -> ire_lock
    381   1676         jpk  * gcgrp_rwlock -> gcdb_lock
    382      0      stevel  *
    383   8275        Eric  * squeue(sq_lock), flow related (ft_lock, fe_lock) locking
    384   8275        Eric  *
    385   8275        Eric  * cpu_lock --> ill_lock --> sqset_lock --> sq_lock
    386   8275        Eric  * sq_lock -> conn_lock -> QLOCK(q)
    387   8275        Eric  * ill_lock -> ft_lock -> fe_lock
    388   2535    sangeeta  *
    389   2535    sangeeta  * Routing/forwarding table locking notes:
    390   2535    sangeeta  *
    391   2535    sangeeta  * Lock acquisition order: Radix tree lock, irb_lock.
    392   2535    sangeeta  * Requirements:
    393   2535    sangeeta  * i.  Walker must not hold any locks during the walker callback.
    394   2535    sangeeta  * ii  Walker must not see a truncated tree during the walk because of any node
    395   2535    sangeeta  *     deletion.
    396   2535    sangeeta  * iii Existing code assumes ire_bucket is valid if it is non-null and is used
    397   2535    sangeeta  *     in many places in the code to walk the irb list. Thus even if all the
    398   2535    sangeeta  *     ires in a bucket have been deleted, we still can't free the radix node
    399   2535    sangeeta  *     until the ires have actually been inactive'd (freed).
    400   2535    sangeeta  *
    401   2535    sangeeta  * Tree traversal - Need to hold the global tree lock in read mode.
    402   2535    sangeeta  * Before dropping the global tree lock, need to increment the ire_refcnt
    403   2535    sangeeta  * to ensure that the radix node can't be deleted.
    404   2535    sangeeta  *
    405   2535    sangeeta  * Tree add - Need to hold the global tree lock in write mode to add a
    406   2535    sangeeta  * radix node. To prevent the node from being deleted, increment the
    407   2535    sangeeta  * irb_refcnt, after the node is added to the tree. The ire itself is
    408   2535    sangeeta  * added later while holding the irb_lock, but not the tree lock.
    409   2535    sangeeta  *
    410   2535    sangeeta  * Tree delete - Need to hold the global tree lock and irb_lock in write mode.
    411   2535    sangeeta  * All associated ires must be inactive (i.e. freed), and irb_refcnt
    412   2535    sangeeta  * must be zero.
    413   2535    sangeeta  *
    414   2535    sangeeta  * Walker - Increment irb_refcnt before calling the walker callback. Hold the
    415   2535    sangeeta  * global tree lock (read mode) for traversal.
    416   2535    sangeeta  *
    417  11042        Erik  * IRE dependencies - In some cases we hold ips_ire_dep_lock across ire_refrele
    418  11042        Erik  * hence we will acquire irb_lock while holding ips_ire_dep_lock.
    419  11042        Erik  *
    420   4987      danmcd  * IPsec notes :
    421   4987      danmcd  *
    422  11042        Erik  * IP interacts with the IPsec code (AH/ESP) by storing IPsec attributes
    423  11042        Erik  * in the ip_xmit_attr_t and ip_recv_attr_t. For outbound datagrams, the
    424  11042        Erik  * ip_xmit_attr_t has the
    425   4987      danmcd  * information used by the IPsec code for applying the right level of
    426  11042        Erik  * protection. The information initialized by IP in the ip_xmit_attr_t
    427      0      stevel  * is determined by the per-socket policy or global policy in the system.
    428  11042        Erik  * For inbound datagrams, the ip_recv_attr_t
    429  11042        Erik  * starts out with nothing in it. It gets filled
    430      0      stevel  * with the right information if it goes through the AH/ESP code, which
    431      0      stevel  * happens if the incoming packet is secure. The information initialized
    432  11042        Erik  * by AH/ESP is later used by IP (during fanouts to ULP) to see whether
    433      0      stevel  * the policy requirements needed by per-socket policy or global policy
    434      0      stevel  * are met or not.
    435      0      stevel  *
    436      0      stevel  * For fully connected sockets i.e dst, src [addr, port] is known,
    437      0      stevel  * conn_policy_cached is set indicating that policy has been cached.
    438      0      stevel  * conn_in_enforce_policy may or may not be set depending on whether
    439      0      stevel  * there is a global policy match or per-socket policy match.
    440  11042        Erik  * Policy inheriting happens in ip_policy_set once the destination is known.
    441      0      stevel  * Once the right policy is set on the conn_t, policy cannot change for
    442      0      stevel  * this socket. This makes life simpler for TCP (UDP ?) where
    443      0      stevel  * re-transmissions go out with the same policy. For symmetry, policy
    444      0      stevel  * is cached for fully connected UDP sockets also. Thus if policy is cached,
    445      0      stevel  * it also implies that policy is latched i.e policy cannot change
    446      0      stevel  * on these sockets. As we have the right policy on the conn, we don't
    447      0      stevel  * have to lookup global policy for every outbound and inbound datagram
    448      0      stevel  * and thus serving as an optimization. Note that a global policy change
    449      0      stevel  * does not affect fully connected sockets if they have policy. If fully
    450      0      stevel  * connected sockets did not have any policy associated with it, global
    451      0      stevel  * policy change may affect them.
    452      0      stevel  *
    453      0      stevel  * IP Flow control notes:
    454   8833        Venu  * ---------------------
    455   8833        Venu  * Non-TCP streams are flow controlled by IP. The way this is accomplished
    456   8833        Venu  * differs when ILL_CAPAB_DLD_DIRECT is enabled for that IP instance. When
    457   8833        Venu  * ILL_DIRECT_CAPABLE(ill) is TRUE, IP can do direct function calls into
    458   8833        Venu  * GLDv3. Otherwise packets are sent down to lower layers using STREAMS
    459   8833        Venu  * functions.
    460   8833        Venu  *
    461   8833        Venu  * Per Tx ring udp flow control:
    462   8833        Venu  * This is applicable only when ILL_CAPAB_DLD_DIRECT capability is set in
    463   8833        Venu  * the ill (i.e. ILL_DIRECT_CAPABLE(ill) is true).
    464   8833        Venu  *
    465   8833        Venu  * The underlying link can expose multiple Tx rings to the GLDv3 mac layer.
    466   8833        Venu  * To achieve best performance, outgoing traffic needs to be fanned out among
    467   8833        Venu  * these Tx rings. mac_tx() is called (via str_mdata_fastpath_put()) to send
    468   8833        Venu  * traffic out of the NIC and it takes a fanout hint. UDP connections pass
    469   8833        Venu  * the address of connp as fanout hint to mac_tx(). Under flow controlled
    470   8833        Venu  * condition, mac_tx() returns a non-NULL cookie (ip_mac_tx_cookie_t). This
    471   8833        Venu  * cookie points to a specific Tx ring that is blocked. The cookie is used to
    472   8833        Venu  * hash into an idl_tx_list[] entry in idl_tx_list[] array. Each idl_tx_list_t
    473   8833        Venu  * points to drain_lists (idl_t's). These drain lists will store the blocked UDP
    474   8833        Venu  * connp's. The drain list is not a single list but a configurable number of
    475   8833        Venu  * lists.
    476   8833        Venu  *
    477   8833        Venu  * The diagram below shows idl_tx_list_t's and their drain_lists. ip_stack_t
    478   8833        Venu  * has an array of idl_tx_list_t. The size of the array is TX_FANOUT_SIZE
    479   8833        Venu  * which is equal to 128. This array in turn contains a pointer to idl_t[],
    480   8833        Venu  * the ip drain list. The idl_t[] array size is MIN(max_ncpus, 8). The drain
    481   8833        Venu  * list will point to the list of connp's that are flow controlled.
    482   8833        Venu  *
    483   8833        Venu  *                      ---------------   -------   -------   -------
    484   8833        Venu  *                   |->|drain_list[0]|-->|connp|-->|connp|-->|connp|-->
    485   8833        Venu  *                   |  ---------------   -------   -------   -------
    486   8833        Venu  *                   |  ---------------   -------   -------   -------
    487   8833        Venu  *                   |->|drain_list[1]|-->|connp|-->|connp|-->|connp|-->
    488   8833        Venu  * ----------------  |  ---------------   -------   -------   -------
    489   8833        Venu  * |idl_tx_list[0]|->|  ---------------   -------   -------   -------
    490   8833        Venu  * ----------------  |->|drain_list[2]|-->|connp|-->|connp|-->|connp|-->
    491   8833        Venu  *                   |  ---------------   -------   -------   -------
    492   8833        Venu  *                   .        .              .         .         .
    493   8833        Venu  *                   |  ---------------   -------   -------   -------
    494   8833        Venu  *                   |->|drain_list[n]|-->|connp|-->|connp|-->|connp|-->
    495   8833        Venu  *                      ---------------   -------   -------   -------
    496   8833        Venu  *                      ---------------   -------   -------   -------
    497   8833        Venu  *                   |->|drain_list[0]|-->|connp|-->|connp|-->|connp|-->
    498   8833        Venu  *                   |  ---------------   -------   -------   -------
    499   8833        Venu  *                   |  ---------------   -------   -------   -------
    500   8833        Venu  * ----------------  |->|drain_list[1]|-->|connp|-->|connp|-->|connp|-->
    501   8833        Venu  * |idl_tx_list[1]|->|  ---------------   -------   -------   -------
    502   8833        Venu  * ----------------  |        .              .         .         .
    503   8833        Venu  *                   |  ---------------   -------   -------   -------
    504   8833        Venu  *                   |->|drain_list[n]|-->|connp|-->|connp|-->|connp|-->
    505   8833        Venu  *                      ---------------   -------   -------   -------
    506   8833        Venu  *     .....
    507   8833        Venu  * ----------------
    508   8833        Venu  * |idl_tx_list[n]|-> ...
    509   8833        Venu  * ----------------
    510   8833        Venu  *
    511   8833        Venu  * When mac_tx() returns a cookie, the cookie is used to hash into an
    512   8833        Venu  * idl_tx_list in the ips_idl_tx_list[] array. Then conn_drain_insert() is
    513   8833        Venu  * called passing idl_tx_list. The connp gets inserted in a drain list
    514   8833        Venu  * pointed to by idl_tx_list. conn_drain_list() asserts flow control for
    515  11042        Erik  * the sockets (non stream based) and sets QFULL condition on the conn_wq
    516  11042        Erik  * of streams sockets, or the su_txqfull for non-streams sockets.
    517   8833        Venu  * connp->conn_direct_blocked will be set to indicate the blocked
    518   8833        Venu  * condition.
    519   8833        Venu  *
    520   8833        Venu  * GLDv3 mac layer calls ill_flow_enable() when flow control is relieved.
    521   8833        Venu  * A cookie is passed in the call to ill_flow_enable() that identifies the
    522   8833        Venu  * blocked Tx ring. This cookie is used to get to the idl_tx_list that
    523   8833        Venu  * contains the blocked connp's. conn_walk_drain() uses the idl_tx_list_t
    524  11042        Erik  * and goes through each conn in the drain list and calls conn_idl_remove
    525  11042        Erik  * for the conn to clear the qfull condition for the conn, as well as to
    526  11042        Erik  * remove the conn from the idl list. In addition, streams based sockets
    527  11042        Erik  * will have the conn_wq enabled, causing ip_wsrv to run for the
    528   8833        Venu  * conn. ip_wsrv drains the queued messages, and removes the conn from the
    529  11042        Erik  * drain list, if all messages were drained. It also notifies the
    530  11042        Erik  * conn_upcalls for the conn to signal that flow-control has opened up.
    531      0      stevel  *
    532      0      stevel  * In reality the drain list is not a single list, but a configurable number
    533  11042        Erik  * of lists. conn_walk_drain() in the IP module, notifies the conn_upcalls for
    534  11042        Erik  * each conn in the list. conn_drain_insert and conn_drain_tail are the only
    535   8833        Venu  * functions that manipulate this drain list. conn_drain_insert is called
    536  11042        Erik  * from the protocol layer when conn_ip_output returns EWOULDBLOCK
    537  11042        Erik  * (as opposed to from ip_wsrv context for the STREAMS
    538   8833        Venu  * case -- see below). The synchronization between drain insertion and flow
    539   8833        Venu  * control wakeup is handled by using idl_txl->txl_lock.
    540   8833        Venu  *
    541   8833        Venu  * Flow control using STREAMS:
    542   8833        Venu  * When ILL_DIRECT_CAPABLE() is not TRUE, STREAMS flow control mechanism
    543   8833        Venu  * is used. On the send side, if the packet cannot be sent down to the
    544  11042        Erik  * driver by IP, because of a canput failure, ip_xmit drops the packet
    545  11042        Erik  * and returns EWOULDBLOCK to the caller, who may then invoke
    546  11042        Erik  * ixa_check_drain_insert to insert the conn on the 0'th drain list.
    547  11042        Erik  * When ip_wsrv runs on the ill_wq because flow control has been relieved, the
    548  11042        Erik  * blocked conns in the 0'th drain list are drained as with the
    549  11042        Erik  * non-STREAMS case.
    550  11042        Erik  *
    551  11042        Erik  * In both the STREAMS and non-STREAMS case, the sockfs upcall to set
    552  11042        Erik  * qfull is done when the conn is inserted into the drain list
    553  11042        Erik  * (conn_drain_insert()) and cleared when the conn is removed from the drain
    554  11042        Erik  * list (conn_idl_remove()).
    555      0      stevel  *
    556      0      stevel  * IPQOS notes:
    557      0      stevel  *
    558      0      stevel  * IPQoS Policies are applied to packets using IPPF (IP Policy framework)
    559      0      stevel  * and IPQoS modules. IPPF includes hooks in IP at different control points
    560      0      stevel  * (callout positions) which direct packets to IPQoS modules for policy
    561      0      stevel  * processing. Policies, if present, are global.
    562      0      stevel  *
    563      0      stevel  * The callout positions are located in the following paths:
    564      0      stevel  *		o local_in (packets destined for this host)
    565      0      stevel  *		o local_out (packets originating from this host)
    566      0      stevel  *		o fwd_in  (packets forwarded by this m/c - inbound)
    567      0      stevel  *		o fwd_out (packets forwarded by this m/c - outbound)
    568      0      stevel  * Hooks at these callout points can be enabled/disabled using the ndd variable
    569      0      stevel  * ip_policy_mask (a bit mask with the 4 LSB indicating the callout positions).
    570      0      stevel  * By default all the callout positions are enabled.
    571      0      stevel  *
    572      0      stevel  * Outbound (local_out)
    573  11042        Erik  * Hooks are placed in ire_send_wire_v4 and ire_send_wire_v6.
    574      0      stevel  *
    575      0      stevel  * Inbound (local_in)
    576  11042        Erik  * Hooks are placed in ip_fanout_v4 and ip_fanout_v6.
    577      0      stevel  *
    578      0      stevel  * Forwarding (in and out)
    579  11042        Erik  * Hooks are placed in ire_recv_forward_v4/v6.
    580      0      stevel  *
    581      0      stevel  * IP Policy Framework processing (IPPF processing)
    582      0      stevel  * Policy processing for a packet is initiated by ip_process, which ascertains
    583      0      stevel  * that the classifier (ipgpc) is loaded and configured, failing which the
    584      0      stevel  * packet resumes normal processing in IP. If the classifier is present, the
    585      0      stevel  * packet is acted upon by one or more IPQoS modules (action instances), per
    586      0      stevel  * filters configured in ipgpc and resumes normal IP processing thereafter.
    587      0      stevel  * An action instance can drop a packet in course of its processing.
    588      0      stevel  *
    589      0      stevel  * Zones notes:
    590      0      stevel  *
    591      0      stevel  * The partitioning rules for networking are as follows:
    592      0      stevel  * 1) Packets coming from a zone must have a source address belonging to that
    593      0      stevel  * zone.
    594      0      stevel  * 2) Packets coming from a zone can only be sent on a physical interface on
    595      0      stevel  * which the zone has an IP address.
    596      0      stevel  * 3) Between two zones on the same machine, packet delivery is only allowed if
    597      0      stevel  * there's a matching route for the destination and zone in the forwarding
    598      0      stevel  * table.
    599      0      stevel  * 4) The TCP and UDP port spaces are per-zone; that is, two processes in
    600      0      stevel  * different zones can bind to the same port with the wildcard address
    601      0      stevel  * (INADDR_ANY).
    602      0      stevel  *
    603      0      stevel  * The granularity of interface partitioning is at the logical interface level.
    604      0      stevel  * Therefore, every zone has its own IP addresses, and incoming packets can be
    605      0      stevel  * attributed to a zone unambiguously. A logical interface is placed into a zone
    606      0      stevel  * using the SIOCSLIFZONE ioctl; this sets the ipif_zoneid field in the ipif_t
    607      0      stevel  * structure. Rule (1) is implemented by modifying the source address selection
    608      0      stevel  * algorithm so that the list of eligible addresses is filtered based on the
    609      0      stevel  * sending process zone.
    610      0      stevel  *
    611      0      stevel  * The Internet Routing Entries (IREs) are either exclusive to a zone or shared
    612      0      stevel  * across all zones, depending on their type. Here is the break-up:
    613      0      stevel  *
    614      0      stevel  * IRE type				Shared/exclusive
    615      0      stevel  * --------				----------------
    616      0      stevel  * IRE_BROADCAST			Exclusive
    617      0      stevel  * IRE_DEFAULT (default routes)		Shared (*)
    618   2733    nordmark  * IRE_LOCAL				Exclusive (x)
    619      0      stevel  * IRE_LOOPBACK				Exclusive
    620      0      stevel  * IRE_PREFIX (net routes)		Shared (*)
    621      0      stevel  * IRE_IF_NORESOLVER (interface routes)	Exclusive
    622      0      stevel  * IRE_IF_RESOLVER (interface routes)	Exclusive
    623  11042        Erik  * IRE_IF_CLONE (interface routes)	Exclusive
    624      0      stevel  * IRE_HOST (host routes)		Shared (*)
    625      0      stevel  *
    626      0      stevel  * (*) A zone can only use a default or off-subnet route if the gateway is
    627      0      stevel  * directly reachable from the zone, that is, if the gateway's address matches
    628      0      stevel  * one of the zone's logical interfaces.
    629   2733    nordmark  *
    630  11042        Erik  * (x) IRE_LOCAL are handled a bit differently.
    631  11042        Erik  * When ip_restrict_interzone_loopback is set (the default),
    632  11042        Erik  * ire_route_recursive restricts loopback using an IRE_LOCAL
    633   2733    nordmark  * between zones to the case when L2 would have conceptually looped the packet
    634   2733    nordmark  * back, i.e. the loopback which is required since neither Ethernet drivers
    635   2733    nordmark  * nor Ethernet hardware loops them back. This is the case when the normal
    636   2733    nordmark  * routes (ignoring IREs with different zoneids) would send out the packet on
    637   8485       Peter  * the same ill as the ill with which the IRE_LOCAL is associated.
    638      0      stevel  *
    639      0      stevel  * Multiple zones can share a common broadcast address; typically all zones
    640      0      stevel  * share the 255.255.255.255 address. Incoming as well as locally originated
    641      0      stevel  * broadcast packets must be dispatched to all the zones on the broadcast
    642      0      stevel  * network. For directed broadcasts (e.g. 10.16.72.255) this is not trivial
    643      0      stevel  * since some zones may not be on the 10.16.72/24 network. To handle this, each
    644      0      stevel  * zone has its own set of IRE_BROADCAST entries; then, broadcast packets are
    645      0      stevel  * sent to every zone that has an IRE_BROADCAST entry for the destination
    646  11042        Erik  * address on the input ill, see ip_input_broadcast().
    647      0      stevel  *
    648      0      stevel  * Applications in different zones can join the same multicast group address.
    649  11042        Erik  * The same logic applies for multicast as for broadcast. ip_input_multicast
    650  11042        Erik  * dispatches packets to all zones that have members on the physical interface.
    651      0      stevel  */
    652      0      stevel 
    653      0      stevel /*
    654      0      stevel  * Squeue fanout setting (exposed as "ip_squeue_fanout" in lcl_ndp_arr):
    655      0      stevel  *	0: No fanout.
    656      0      stevel  *	1: Fanout across all squeues.
    657      0      stevel  */
    658      0      stevel boolean_t	ip_squeue_fanout = 0;
    659      0      stevel 
    660      0      stevel /*
    661      0      stevel  * Maximum dups allowed per packet.
    662      0      stevel  */
    663      0      stevel uint_t ip_max_frag_dups = 10;
    664      0      stevel 
    665   8348        Eric /* RFC 1122 Conformance */
    666      0      stevel #define	IP_FORWARD_DEFAULT	IP_FORWARD_NEVER
    667      0      stevel 
    668      0      stevel #define	ILL_MAX_NAMELEN			LIFNAMSIZ
    669      0      stevel 
    670   5240    nordmark static int	ip_open(queue_t *q, dev_t *devp, int flag, int sflag,
    671   5240    nordmark 		    cred_t *credp, boolean_t isv6);
    672  11042        Erik static mblk_t	*ip_xmit_attach_llhdr(mblk_t *, nce_t *);
    673  11042        Erik 
    674  11042        Erik static boolean_t icmp_inbound_verify_v4(mblk_t *, icmph_t *, ip_recv_attr_t *);
    675  11042        Erik static void	icmp_inbound_too_big_v4(icmph_t *, ip_recv_attr_t *);
    676  11042        Erik static void	icmp_inbound_error_fanout_v4(mblk_t *, icmph_t *,
    677  11042        Erik     ip_recv_attr_t *);
    678      0      stevel static void	icmp_options_update(ipha_t *);
    679  11042        Erik static void	icmp_param_problem(mblk_t *, uint8_t,  ip_recv_attr_t *);
    680  11042        Erik static void	icmp_pkt(mblk_t *, void *, size_t, ip_recv_attr_t *);
    681  11042        Erik static mblk_t	*icmp_pkt_err_ok(mblk_t *, ip_recv_attr_t *);
    682  11042        Erik static void	icmp_redirect_v4(mblk_t *mp, ipha_t *, icmph_t *,
    683  11042        Erik     ip_recv_attr_t *);
    684  11042        Erik static void	icmp_send_redirect(mblk_t *, ipaddr_t, ip_recv_attr_t *);
    685  11042        Erik static void	icmp_send_reply_v4(mblk_t *, ipha_t *, icmph_t *,
    686  11042        Erik     ip_recv_attr_t *);
    687  11042        Erik 
    688      0      stevel mblk_t		*ip_dlpi_alloc(size_t, t_uscalar_t);
    689      0      stevel char		*ip_dot_addr(ipaddr_t, char *);
    690      0      stevel mblk_t		*ip_carve_mp(mblk_t **, ssize_t);
    691      0      stevel int		ip_close(queue_t *, int);
    692      0      stevel static char	*ip_dot_saddr(uchar_t *, char *);
    693      0      stevel static void	ip_lrput(queue_t *, mblk_t *);
    694      0      stevel ipaddr_t	ip_net_mask(ipaddr_t);
    695      0      stevel char		*ip_nv_lookup(nv_t *, int);
    696      0      stevel static int	ip_param_get(queue_t *, mblk_t *, caddr_t, cred_t *);
    697      0      stevel static int	ip_param_generic_get(queue_t *, mblk_t *, caddr_t, cred_t *);
    698   3448    dh155122 static boolean_t	ip_param_register(IDP *ndp, ipparam_t *, size_t,
    699   3448    dh155122     ipndp_t *, size_t);
    700      0      stevel static int	ip_param_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *);
    701      0      stevel void	ip_rput(queue_t *, mblk_t *);
    702      0      stevel static void	ip_rput_dlpi_writer(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp,
    703      0      stevel 		    void *dummy_arg);
    704   5240    nordmark int		ip_snmp_get(queue_t *, mblk_t *, int);
    705   3284    apersson static mblk_t	*ip_snmp_get_mib2_ip(queue_t *, mblk_t *,
    706   3448    dh155122 		    mib2_ipIfStatsEntry_t *, ip_stack_t *);
    707   3448    dh155122 static mblk_t	*ip_snmp_get_mib2_ip_traffic_stats(queue_t *, mblk_t *,
    708   3448    dh155122 		    ip_stack_t *);
    709   3448    dh155122 static mblk_t	*ip_snmp_get_mib2_ip6(queue_t *, mblk_t *, ip_stack_t *);
    710   3448    dh155122 static mblk_t	*ip_snmp_get_mib2_icmp(queue_t *, mblk_t *, ip_stack_t *ipst);
    711   3448    dh155122 static mblk_t	*ip_snmp_get_mib2_icmp6(queue_t *, mblk_t *, ip_stack_t *ipst);
    712   3448    dh155122 static mblk_t	*ip_snmp_get_mib2_igmp(queue_t *, mblk_t *, ip_stack_t *ipst);
    713   3448    dh155122 static mblk_t	*ip_snmp_get_mib2_multi(queue_t *, mblk_t *, ip_stack_t *ipst);
    714   3448    dh155122 static mblk_t	*ip_snmp_get_mib2_ip_addr(queue_t *, mblk_t *,
    715   3448    dh155122 		    ip_stack_t *ipst);
    716   3448    dh155122 static mblk_t	*ip_snmp_get_mib2_ip6_addr(queue_t *, mblk_t *,
    717   3448    dh155122 		    ip_stack_t *ipst);
    718   3448    dh155122 static mblk_t	*ip_snmp_get_mib2_ip_group_src(queue_t *, mblk_t *,
    719   3448    dh155122 		    ip_stack_t *ipst);
    720   3448    dh155122 static mblk_t	*ip_snmp_get_mib2_ip6_group_src(queue_t *, mblk_t *,
    721   3448    dh155122 		    ip_stack_t *ipst);
    722   3448    dh155122 static mblk_t	*ip_snmp_get_mib2_ip_group_mem(queue_t *, mblk_t *,
    723   3448    dh155122 		    ip_stack_t *ipst);
    724   3448    dh155122 static mblk_t	*ip_snmp_get_mib2_ip6_group_mem(queue_t *, mblk_t *,
    725   3448    dh155122 		    ip_stack_t *ipst);
    726   3448    dh155122 static mblk_t	*ip_snmp_get_mib2_virt_multi(queue_t *, mblk_t *,
    727   3448    dh155122 		    ip_stack_t *ipst);
    728   3448    dh155122 static mblk_t	*ip_snmp_get_mib2_multi_rtable(queue_t *, mblk_t *,
    729   3448    dh155122 		    ip_stack_t *ipst);
    730   8485       Peter static mblk_t	*ip_snmp_get_mib2_ip_route_media(queue_t *, mblk_t *, int,
    731   3448    dh155122 		    ip_stack_t *ipst);
    732   8485       Peter static mblk_t	*ip_snmp_get_mib2_ip6_route_media(queue_t *, mblk_t *, int,
    733   3448    dh155122 		    ip_stack_t *ipst);
    734   1676         jpk static void	ip_snmp_get2_v4(ire_t *, iproutedata_t *);
    735   1676         jpk static void	ip_snmp_get2_v6_route(ire_t *, iproutedata_t *);
    736  11042        Erik static int	ip_snmp_get2_v4_media(ncec_t *, iproutedata_t *);
    737  11042        Erik static int	ip_snmp_get2_v6_media(ncec_t *, iproutedata_t *);
    738      0      stevel int		ip_snmp_set(queue_t *, int, int, uchar_t *, int);
    739  11042        Erik 
    740  11042        Erik static mblk_t	*ip_fragment_copyhdr(uchar_t *, int, int, ip_stack_t *,
    741   8778        Erik 		    mblk_t *);
    742   3448    dh155122 
    743   3448    dh155122 static void	conn_drain_init(ip_stack_t *);
    744   3448    dh155122 static void	conn_drain_fini(ip_stack_t *);
    745      0      stevel static void	conn_drain_tail(conn_t *connp, boolean_t closing);
    746      0      stevel 
    747   8833        Venu static void	conn_walk_drain(ip_stack_t *, idl_tx_list_t *);
    748  11042        Erik static void	conn_walk_sctp(pfv_t, void *, zoneid_t, netstack_t *);
    749   3448    dh155122 
    750   3448    dh155122 static void	*ip_stack_init(netstackid_t stackid, netstack_t *ns);
    751   3448    dh155122 static void	ip_stack_shutdown(netstackid_t stackid, void *arg);
    752   3448    dh155122 static void	ip_stack_fini(netstackid_t stackid, void *arg);
    753      0      stevel 
    754      0      stevel static int	ip_forward_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *);
    755      0      stevel 
    756      0      stevel static int	ip_multirt_apply_membership(int (*fn)(conn_t *, boolean_t,
    757  11042        Erik     const in6_addr_t *, ipaddr_t, uint_t, mcast_record_t, const in6_addr_t *),
    758  11042        Erik     ire_t *, conn_t *, boolean_t, const in6_addr_t *,  mcast_record_t,
    759  11042        Erik     const in6_addr_t *);
    760      0      stevel 
    761      0      stevel static int	ip_cgtp_filter_get(queue_t *, mblk_t *, caddr_t, cred_t *);
    762      0      stevel static int	ip_cgtp_filter_set(queue_t *, mblk_t *, char *,
    763      0      stevel     caddr_t, cred_t *);
    764      0      stevel static int	ip_input_proc_set(queue_t *q, mblk_t *mp, char *value,
    765      0      stevel     caddr_t cp, cred_t *cr);
    766   1184      krgopi static int	ip_int_set(queue_t *, mblk_t *, char *, caddr_t,
    767   4041    nordmark     cred_t *);
    768   8275        Eric static int	ip_squeue_switch(int);
    769      0      stevel 
    770   3448    dh155122 static void	*ip_kstat_init(netstackid_t, ip_stack_t *);
    771   3448    dh155122 static void	ip_kstat_fini(netstackid_t, kstat_t *);
    772      0      stevel static int	ip_kstat_update(kstat_t *kp, int rw);
    773   3448    dh155122 static void	*icmp_kstat_init(netstackid_t);
    774   3448    dh155122 static void	icmp_kstat_fini(netstackid_t, kstat_t *);
    775      0      stevel static int	icmp_kstat_update(kstat_t *kp, int rw);
    776   3448    dh155122 static void	*ip_kstat2_init(netstackid_t, ip_stat_t *);
    777   3448    dh155122 static void	ip_kstat2_fini(netstackid_t, kstat_t *);
    778      0      stevel 
    779  11042        Erik static void	ipobs_init(ip_stack_t *);
    780  11042        Erik static void	ipobs_fini(ip_stack_t *);
    781  11042        Erik 
    782      0      stevel ipaddr_t	ip_g_all_ones = IP_HOST_MASK;
    783      0      stevel 
    784      0      stevel /* How long, in seconds, we allow frags to hang around. */
    785   9213      Girish #define	IP_FRAG_TIMEOUT		15
    786   9213      Girish #define	IPV6_FRAG_TIMEOUT	60
    787    741    masputra 
    788      0      stevel static long ip_rput_pullups;
    789      0      stevel int	dohwcksum = 1;	/* use h/w cksum if supported by the hardware */
    790      0      stevel 
    791   5815    gt145670 vmem_t *ip_minor_arena_sa; /* for minor nos. from INET_MIN_DEV+2 thru 2^^18-1 */
    792   5815    gt145670 vmem_t *ip_minor_arena_la; /* for minor nos. from 2^^18 thru 2^^32-1 */
    793      0      stevel 
    794   3448    dh155122 int	ip_debug;
    795      0      stevel 
    796      0      stevel /*
    797      0      stevel  * Multirouting/CGTP stuff
    798      0      stevel  */
    799      0      stevel int	ip_cgtp_filter_rev = CGTP_FILTER_REV;	/* CGTP hooks version */
    800      0      stevel 
    801      0      stevel /*
    802      0      stevel  * Named Dispatch Parameter Table; each entry is { min, max, value, name }.
    803      0      stevel  * All of these are alterable, within the min/max values given, at run time.
    804      0      stevel  */
    805      0      stevel static ipparam_t	lcl_param_arr[] = {
    806      0      stevel 	/* min	max	value	name (value is the initial setting) */
    807      0      stevel 	{  0,	1,	0,	"ip_respond_to_address_mask_broadcast"},
    808      0      stevel 	{  0,	1,	1,	"ip_respond_to_echo_broadcast"},
    809      0      stevel 	{  0,	1,	1,	"ip_respond_to_echo_multicast"},
    810      0      stevel 	{  0,	1,	0,	"ip_respond_to_timestamp"},
    811      0      stevel 	{  0,	1,	0,	"ip_respond_to_timestamp_broadcast"},
    812      0      stevel 	{  0,	1,	1,	"ip_send_redirects"},
    813      0      stevel 	{  0,	1,	0,	"ip_forward_directed_broadcasts"},
    814      0      stevel 	{  0,	10,	0,	"ip_mrtdebug"},
    815  11042        Erik 	{  1,	8,	3,	"ip_ire_reclaim_fraction" },
    816  11042        Erik 	{  1,	8,	3,	"ip_nce_reclaim_fraction" },
    817  11042        Erik 	{  1,	8,	3,	"ip_dce_reclaim_fraction" },
    818      0      stevel 	{  1,	255,	255,	"ip_def_ttl" },
    819      0      stevel 	{  0,	1,	0,	"ip_forward_src_routed"},
    820      0      stevel 	{  0,	256,	32,	"ip_wroff_extra" },
    821  11042        Erik 	{  2, 999999999, 60*20, "ip_pathmtu_interval" },	/* In seconds */
    822      0      stevel 	{  8,	65536,  64,	"ip_icmp_return_data_bytes" },
    823      0      stevel 	{  0,	1,	1,	"ip_path_mtu_discovery" },
    824  11042        Erik 	{ 68,	65535,	576,	"ip_pmtu_min" },
    825      0      stevel 	{  0,	1,	0,	"ip_ignore_redirect" },
    826  11042        Erik 	{  0,	1,	0,	"ip_arp_icmp_error" },
    827      0      stevel 	{  1,	254,	1,	"ip_broadcast_ttl" },
    828      0      stevel 	{  0,	99999,	100,	"ip_icmp_err_interval" },
    829      0      stevel 	{  1,	99999,	10,	"ip_icmp_err_burst" },
    830      0      stevel 	{  0,	999999999,	1000000, "ip_reass_queue_bytes" },
    831      0      stevel 	{  0,	1,	0,	"ip_strict_dst_multihoming" },
    832      0      stevel 	{  1,	MAX_ADDRS_PER_IF,	256,	"ip_addrs_per_if"},
    833      0      stevel 	{  0,	1,	0,	"ipsec_override_persocket_policy" },
    834      0      stevel 	{  0,	1,	1,	"icmp_accept_clear_messages" },
    835      0      stevel 	{  0,	1,	1,	"igmp_accept_clear_messages" },
    836      0      stevel 	{  2,	999999999, ND_DELAY_FIRST_PROBE_TIME,
    837      0      stevel 				"ip_ndp_delay_first_probe_time"},
    838      0      stevel 	{  1,	999999999, ND_MAX_UNICAST_SOLICIT,
    839      0      stevel 				"ip_ndp_max_unicast_solicit"},
    840      0      stevel 	{  1,	255,	IPV6_MAX_HOPS,	"ip6_def_hops" },
    841      0      stevel 	{  8,	IPV6_MIN_MTU,	IPV6_MIN_MTU, "ip6_icmp_return_data_bytes" },
    842      0      stevel 	{  0,	1,	0,	"ip6_forward_src_routed"},
    843      0      stevel 	{  0,	1,	1,	"ip6_respond_to_echo_multicast"},
    844      0      stevel 	{  0,	1,	1,	"ip6_send_redirects"},
    845      0      stevel 	{  0,	1,	0,	"ip6_ignore_redirect" },
    846      0      stevel 	{  0,	1,	0,	"ip6_strict_dst_multihoming" },
    847      0      stevel 
    848  11042        Erik 	{  0,	2,	2,	"ip_src_check" },
    849      0      stevel 
    850      0      stevel 	{  0,	999999,	1000,	"ipsec_policy_log_interval" },
    851      0      stevel 
    852      0      stevel 	{  0,	1,	1,	"pim_accept_clear_messages" },
    853      0      stevel 	{  1000, 20000,	2000,	"ip_ndp_unsolicit_interval" },
    854      0      stevel 	{  1,	20,	3,	"ip_ndp_unsolicit_count" },
    855      0      stevel 	{  0,	1,	1,	"ip6_ignore_home_address_opt" },
    856      0      stevel 	{  0,	15,	0,	"ip_policy_mask" },
    857  11042        Erik 	{  0,	2,	2,	"ip_ecmp_behavior" },
    858      0      stevel 	{  0,	255,	1,	"ip_multirt_ttl" },
    859  11042        Erik 	{  0,	3600,	60,	"ip_ire_badcnt_lifetime" },	/* In seconds */
    860   2546    carlsonj 	{  0,	999999,	60*60*24, "ip_max_temp_idle" },
    861   2546    carlsonj 	{  0,	1000,	1,	"ip_max_temp_defend" },
    862  11042        Erik 	/*
    863  11042        Erik 	 * When a conflict of an active address is detected,
    864  11042        Erik 	 * defend up to ip_max_defend times, within any
    865  11042        Erik 	 * ip_defend_interval span.
    866  11042        Erik 	 */
    867   2546    carlsonj 	{  0,	1000,	3,	"ip_max_defend" },
    868   2546    carlsonj 	{  0,	999999,	30,	"ip_defend_interval" },
    869   2546    carlsonj 	{  0,	3600000, 300000, "ip_dup_recovery" },
    870   2733    nordmark 	{  0,	1,	1,	"ip_restrict_interzone_loopback" },
    871   3115    yl150051 	{  0,	1,	1,	"ip_lso_outbound" },
    872   4783        udpa 	{  IGMP_V1_ROUTER, IGMP_V3_ROUTER, IGMP_V3_ROUTER, "igmp_max_version" },
    873   4783        udpa 	{  MLD_V1_ROUTER, MLD_V2_ROUTER, MLD_V2_ROUTER, "mld_max_version" },
    874      0      stevel #ifdef DEBUG
    875      0      stevel 	{  0,	1,	0,	"ip6_drop_inbound_icmpv6" },
    876   3448    dh155122 #else
    877   3448    dh155122 	{  0,	0,	0,	"" },	/* empty slot so the entry count matches DEBUG */
    878      0      stevel #endif
    879  11042        Erik 	/* delay before sending first probe: */
    880  11042        Erik 	{  0,	20000,	1000,	"arp_probe_delay" },
    881  11042        Erik 	{  0,	20000,	100,	"arp_fastprobe_delay" },
    882  11042        Erik 	/* interval at which DAD probes are sent: */
    883  11042        Erik 	{ 10,	20000,	1500,	"arp_probe_interval" },
    884  11042        Erik 	{ 10,	20000,	150,	"arp_fastprobe_interval" },
    885  11042        Erik 	/* setting probe count to 0 will disable ARP probing for DAD. */
    886  11042        Erik 	{  0,	20,	3,	"arp_probe_count" },
    887  11042        Erik 	{  0,	20,	3,	"arp_fastprobe_count" },
    888  11042        Erik 
    889  11042        Erik 	{  0,	3600000, 15000,	"ipv4_dad_announce_interval"},
    890  11042        Erik 	{  0,	3600000, 15000,	"ipv6_dad_announce_interval"},
    891  11042        Erik 	/*
    892  11042        Erik 	 * Rate limiting parameters for DAD defense used in
    893  11042        Erik 	 * ill_defend_rate_limit():
    894  11042        Erik 	 * defend_rate : pkts/hour permitted
    895  11042        Erik 	 * defend_interval : time that can elapse before we send out a
    896  11042        Erik 	 *			DAD defense.
    897  11042        Erik 	 * defend_period: denominator for defend_rate (in seconds).
    898  11042        Erik 	 */
    899  11042        Erik 	{  0,	3600000, 300000,	"arp_defend_interval"},
    900  11042        Erik 	{  0,	20000, 100,		"arp_defend_rate"},
    901  11042        Erik 	{  0,	3600000, 300000,	"ndp_defend_interval"},
    902  11042        Erik 	{  0,	20000, 100,		"ndp_defend_rate"},
    903  11042        Erik 	{  5,	86400,	3600,		"arp_defend_period"},
    904  11042        Erik 	{  5,	86400,	3600,		"ndp_defend_period"},
    905  11042        Erik 	{  0,	1,	1,		"ipv4_icmp_return_pmtu" },
    906  11042        Erik 	{  0,	1,	1,		"ipv6_icmp_return_pmtu" },
    907  11042        Erik 	/*
    908  11042        Erik 	 * Publish count/interval values used to announce local addresses
    909  11042        Erik 	 * for IPv4, IPv6.
    910  11042        Erik 	 */
    911  11042        Erik 	{  1,	20,	5,	"ip_arp_publish_count" },
    912  11042        Erik 	{  1000, 20000,	2000,	"ip_arp_publish_interval" },
    913      0      stevel };
    914      0      stevel 
    915   3448    dh155122 /*
    916   3448    dh155122  * Extended NDP table: entries that carry their own get/set functions.
    917   3448    dh155122  * The data addresses of the first two entries (NULL in this initializer) are
    918   3448    dh155122  * filled in to be ips_ip_g_forward and ips_ipv6_forward at init time.
    919   3448    dh155122  */
    920      0      stevel static ipndp_t	lcl_ndp_arr[] = {
    921      0      stevel 	/* getf			setf		data			name */
    922   3448    dh155122 #define	IPNDP_IP_FORWARDING_OFFSET	0	/* index of "ip_forwarding" */
    923   3448    dh155122 	{  ip_param_generic_get,	ip_forward_set,	NULL,
    924      0      stevel 	    "ip_forwarding" },
    925   3448    dh155122 #define	IPNDP_IP6_FORWARDING_OFFSET	1	/* index of "ip6_forwarding" */
    926   3448    dh155122 	{  ip_param_generic_get,	ip_forward_set,	NULL,
    927      0      stevel 	    "ip6_forwarding" },
    928      0      stevel 	{ ip_param_generic_get, ip_input_proc_set,
    929      0      stevel 	    (caddr_t)&ip_squeue_enter, "ip_squeue_enter" },
    930   1184      krgopi 	{ ip_param_generic_get, ip_int_set,
    931      0      stevel 	    (caddr_t)&ip_squeue_fanout, "ip_squeue_fanout" },
    932   9089   Vasumathi #define	IPNDP_CGTP_FILTER_OFFSET	4	/* index of "ip_cgtp_filter" */
    933   3448    dh155122 	{  ip_cgtp_filter_get,	ip_cgtp_filter_set, NULL,
    934   1184      krgopi 	    "ip_cgtp_filter" },
    935   5401    nordmark 	{  ip_param_generic_get, ip_int_set, (caddr_t)&ip_debug,
    936   5401    nordmark 	    "ip_debug" },
    937      0      stevel };
    938      0      stevel 
     939      0      stevel /*
     940      0      stevel  * Table of IP ioctls encoding the various properties of the ioctl and
     941      0      stevel  * indexed based on the last byte of the ioctl command. Occasionally there
     942      0      stevel  * is a clash, and there is more than 1 ioctl with the same last byte.
     943      0      stevel  * In such a case 1 ioctl is encoded in the ndx table and the remaining
     944      0      stevel  * ioctls are encoded in the misc table. An entry in the ndx table is
     945      0      stevel  * retrieved by indexing on the last byte of the ioctl command and comparing
     946      0      stevel  * the ioctl command with the value in the ndx table. In the event of a
     947      0      stevel  * mismatch the misc table is then searched sequentially for the desired
     948      0      stevel  * ioctl command.
     949      0      stevel  *
     950      0      stevel  * Entry: <command> <copyin_size> <flags> <cmd_type> <function> <restart_func>
                             *
                             * Notes on the entries below:
                             *  - The three-digit number in the comment that starts each entry is the
                             *    entry's array index. Because the table is indexed by position, an
                             *    entry must never be inserted or removed without keeping every
                             *    later entry at its numbered slot.
                             *  - IPI_DONTCARE entries fill the index slots for which this table
                             *    carries no ioctl (some are annotated with the ioctl that owns the
                             *    number but is handled elsewhere).
                             *  - Entries whose <function> is NULL are, per the comments next to
                             *    them, handled in ip_sioctl_copyin_setup itself.
                             *  - A <copyin_size> of 0 is used where the size cannot be coded in the
                             *    table (see the SIOCGIFCONF comments).
                             *  - A few entries spell an empty <restart_func> as 0 rather than NULL;
                             *    both are null pointer constants.
     951      0      stevel  */
     952      0      stevel ip_ioctl_cmd_t ip_ndx_ioctl_table[] = {
     953      0      stevel 	/* 000 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
     954      0      stevel 	/* 001 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
     955      0      stevel 	/* 002 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
     956      0      stevel 	/* 003 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
     957      0      stevel 	/* 004 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
     958      0      stevel 	/* 005 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
     959      0      stevel 	/* 006 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
     960      0      stevel 	/* 007 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
     961      0      stevel 	/* 008 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
     962      0      stevel 	/* 009 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
     963      0      stevel 
     964      0      stevel 	/* 010 */ { SIOCADDRT,	sizeof (struct rtentry), IPI_PRIV,
     965      0      stevel 			MISC_CMD, ip_siocaddrt, NULL },
     966      0      stevel 	/* 011 */ { SIOCDELRT,	sizeof (struct rtentry), IPI_PRIV,
     967      0      stevel 			MISC_CMD, ip_siocdelrt, NULL },
     968      0      stevel 
     969      0      stevel 	/* 012 */ { SIOCSIFADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
     970      0      stevel 			IF_CMD, ip_sioctl_addr, ip_sioctl_addr_restart },
     971   8485       Peter 	/* 013 */ { SIOCGIFADDR, sizeof (struct ifreq), IPI_GET_CMD,
     972      0      stevel 			IF_CMD, ip_sioctl_get_addr, NULL },
     973      0      stevel 
     974      0      stevel 	/* 014 */ { SIOCSIFDSTADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
     975      0      stevel 			IF_CMD, ip_sioctl_dstaddr, ip_sioctl_dstaddr_restart },
     976      0      stevel 	/* 015 */ { SIOCGIFDSTADDR, sizeof (struct ifreq),
     977   8485       Peter 			IPI_GET_CMD, IF_CMD, ip_sioctl_get_dstaddr, NULL },
     978      0      stevel 
     979      0      stevel 	/* 016 */ { SIOCSIFFLAGS, sizeof (struct ifreq),
     980   8485       Peter 			IPI_PRIV | IPI_WR,
     981      0      stevel 			IF_CMD, ip_sioctl_flags, ip_sioctl_flags_restart },
     982      0      stevel 	/* 017 */ { SIOCGIFFLAGS, sizeof (struct ifreq),
     983   8485       Peter 			IPI_MODOK | IPI_GET_CMD,
     984      0      stevel 			IF_CMD, ip_sioctl_get_flags, NULL },
     985      0      stevel 
     986      0      stevel 	/* 018 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
     987      0      stevel 	/* 019 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
     988      0      stevel 
     989      0      stevel 	/* copyin size cannot be coded for O_SIOCGIFCONF (old SIOCGIFCONF) */
     990   4972        meem 	/* 020 */ { O_SIOCGIFCONF, 0, IPI_GET_CMD,
     991      0      stevel 			MISC_CMD, ip_sioctl_get_ifconf, NULL },
     992      0      stevel 
     993      0      stevel 	/* 021 */ { SIOCSIFMTU,	sizeof (struct ifreq), IPI_PRIV | IPI_WR,
     994      0      stevel 			IF_CMD, ip_sioctl_mtu, NULL },
     995   8485       Peter 	/* 022 */ { SIOCGIFMTU,	sizeof (struct ifreq), IPI_GET_CMD,
     996      0      stevel 			IF_CMD, ip_sioctl_get_mtu, NULL },
     997      0      stevel 	/* 023 */ { SIOCGIFBRDADDR, sizeof (struct ifreq),
     998   8485       Peter 			IPI_GET_CMD, IF_CMD, ip_sioctl_get_brdaddr, NULL },
     999      0      stevel 	/* 024 */ { SIOCSIFBRDADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
    1000      0      stevel 			IF_CMD, ip_sioctl_brdaddr, NULL },
    1001      0      stevel 	/* 025 */ { SIOCGIFNETMASK, sizeof (struct ifreq),
    1002   8485       Peter 			IPI_GET_CMD, IF_CMD, ip_sioctl_get_netmask, NULL },
    1003      0      stevel 	/* 026 */ { SIOCSIFNETMASK, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
    1004      0      stevel 			IF_CMD, ip_sioctl_netmask, ip_sioctl_netmask_restart },
    1005      0      stevel 	/* 027 */ { SIOCGIFMETRIC, sizeof (struct ifreq),
    1006   8485       Peter 			IPI_GET_CMD, IF_CMD, ip_sioctl_get_metric, NULL },
    1007      0      stevel 	/* 028 */ { SIOCSIFMETRIC, sizeof (struct ifreq), IPI_PRIV,
    1008      0      stevel 			IF_CMD, ip_sioctl_metric, NULL },
    1009      0      stevel 	/* 029 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1010      0      stevel 
    1011      0      stevel 	/* See 166-168 below for extended SIOC*XARP ioctls */
    1012   8485       Peter 	/* 030 */ { SIOCSARP, sizeof (struct arpreq), IPI_PRIV | IPI_WR,
    1013   4972        meem 			ARP_CMD, ip_sioctl_arp, NULL },
    1014   8485       Peter 	/* 031 */ { SIOCGARP, sizeof (struct arpreq), IPI_GET_CMD,
    1015   4972        meem 			ARP_CMD, ip_sioctl_arp, NULL },
    1016   8485       Peter 	/* 032 */ { SIOCDARP, sizeof (struct arpreq), IPI_PRIV | IPI_WR,
    1017   4972        meem 			ARP_CMD, ip_sioctl_arp, NULL },
    1018      0      stevel 
    1019      0      stevel 	/* 033 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1020      0      stevel 	/* 034 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1021      0      stevel 	/* 035 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1022      0      stevel 	/* 036 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1023      0      stevel 	/* 037 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1024      0      stevel 	/* 038 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1025      0      stevel 	/* 039 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1026      0      stevel 	/* 040 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1027      0      stevel 	/* 041 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1028      0      stevel 	/* 042 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1029      0      stevel 	/* 043 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1030      0      stevel 	/* 044 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1031      0      stevel 	/* 045 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1032      0      stevel 	/* 046 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1033      0      stevel 	/* 047 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1034      0      stevel 	/* 048 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1035      0      stevel 	/* 049 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1036      0      stevel 	/* 050 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1037      0      stevel 	/* 051 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1038      0      stevel 	/* 052 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1039      0      stevel 	/* 053 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1040      0      stevel 
    1041      0      stevel 	/* 054 */ { IF_UNITSEL,	sizeof (int), IPI_PRIV | IPI_WR | IPI_MODOK,
    1042      0      stevel 			MISC_CMD, if_unitsel, if_unitsel_restart },
    1043      0      stevel 
    1044      0      stevel 	/* 055 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1045      0      stevel 	/* 056 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1046      0      stevel 	/* 057 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1047      0      stevel 	/* 058 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1048      0      stevel 	/* 059 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1049      0      stevel 	/* 060 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1050      0      stevel 	/* 061 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1051      0      stevel 	/* 062 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1052      0      stevel 	/* 063 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1053      0      stevel 	/* 064 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1054      0      stevel 	/* 065 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1055      0      stevel 	/* 066 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1056      0      stevel 	/* 067 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1057      0      stevel 	/* 068 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1058      0      stevel 	/* 069 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1059      0      stevel 	/* 070 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1060      0      stevel 	/* 071 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1061      0      stevel 	/* 072 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1062      0      stevel 
    1063      0      stevel 	/* 073 */ { SIOCSIFNAME, sizeof (struct ifreq),
    1064      0      stevel 			IPI_PRIV | IPI_WR | IPI_MODOK,
    1065      0      stevel 			IF_CMD, ip_sioctl_sifname, NULL },
    1066      0      stevel 
    1067      0      stevel 	/* 074 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1068      0      stevel 	/* 075 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1069      0      stevel 	/* 076 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1070      0      stevel 	/* 077 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1071      0      stevel 	/* 078 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1072      0      stevel 	/* 079 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1073      0      stevel 	/* 080 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1074      0      stevel 	/* 081 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1075      0      stevel 	/* 082 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1076      0      stevel 	/* 083 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1077      0      stevel 	/* 084 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1078      0      stevel 	/* 085 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1079      0      stevel 	/* 086 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1080      0      stevel 
    1081   8485       Peter 	/* 087 */ { SIOCGIFNUM, sizeof (int), IPI_GET_CMD,
    1082      0      stevel 			MISC_CMD, ip_sioctl_get_ifnum, NULL },
    1083   8485       Peter 	/* 088 */ { SIOCGIFMUXID, sizeof (struct ifreq), IPI_GET_CMD,
    1084      0      stevel 			IF_CMD, ip_sioctl_get_muxid, NULL },
    1085      0      stevel 	/* 089 */ { SIOCSIFMUXID, sizeof (struct ifreq),
    1086   8485       Peter 			IPI_PRIV | IPI_WR, IF_CMD, ip_sioctl_muxid, NULL },
    1087      0      stevel 
    1088      0      stevel 	/* Both if and lif variants share same func */
    1089   8485       Peter 	/* 090 */ { SIOCGIFINDEX, sizeof (struct ifreq), IPI_GET_CMD,
    1090      0      stevel 			IF_CMD, ip_sioctl_get_lifindex, NULL },
    1091      0      stevel 	/* Both if and lif variants share same func */
    1092      0      stevel 	/* 091 */ { SIOCSIFINDEX, sizeof (struct ifreq),
    1093   8485       Peter 			IPI_PRIV | IPI_WR, IF_CMD, ip_sioctl_slifindex, NULL },
    1094      0      stevel 
    1095      0      stevel 	/* copyin size cannot be coded for SIOCGIFCONF */
    1096   4972        meem 	/* 092 */ { SIOCGIFCONF, 0, IPI_GET_CMD,
    1097      0      stevel 			MISC_CMD, ip_sioctl_get_ifconf, NULL },
    1098      0      stevel 	/* 093 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1099      0      stevel 	/* 094 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1100      0      stevel 	/* 095 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1101      0      stevel 	/* 096 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1102      0      stevel 	/* 097 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1103      0      stevel 	/* 098 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1104      0      stevel 	/* 099 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1105      0      stevel 	/* 100 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1106      0      stevel 	/* 101 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1107      0      stevel 	/* 102 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1108      0      stevel 	/* 103 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1109      0      stevel 	/* 104 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1110      0      stevel 	/* 105 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1111      0      stevel 	/* 106 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1112      0      stevel 	/* 107 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1113      0      stevel 	/* 108 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1114      0      stevel 	/* 109 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1115      0      stevel 
    1116      0      stevel 	/* 110 */ { SIOCLIFREMOVEIF, sizeof (struct lifreq),
    1117   8485       Peter 			IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_removeif,
    1118      0      stevel 			ip_sioctl_removeif_restart },
    1119      0      stevel 	/* 111 */ { SIOCLIFADDIF, sizeof (struct lifreq),
    1120   8485       Peter 			IPI_GET_CMD | IPI_PRIV | IPI_WR,
    1121      0      stevel 			LIF_CMD, ip_sioctl_addif, NULL },
                            	/* Array index of the SIOCSLIFADDR entry that follows. */
    1122      0      stevel #define	SIOCLIFADDR_NDX 112
    1123      0      stevel 	/* 112 */ { SIOCSLIFADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
    1124      0      stevel 			LIF_CMD, ip_sioctl_addr, ip_sioctl_addr_restart },
    1125      0      stevel 	/* 113 */ { SIOCGLIFADDR, sizeof (struct lifreq),
    1126   8485       Peter 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_addr, NULL },
    1127      0      stevel 	/* 114 */ { SIOCSLIFDSTADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
    1128      0      stevel 			LIF_CMD, ip_sioctl_dstaddr, ip_sioctl_dstaddr_restart },
    1129      0      stevel 	/* 115 */ { SIOCGLIFDSTADDR, sizeof (struct lifreq),
    1130   8485       Peter 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_dstaddr, NULL },
    1131      0      stevel 	/* 116 */ { SIOCSLIFFLAGS, sizeof (struct lifreq),
    1132   8485       Peter 			IPI_PRIV | IPI_WR,
    1133      0      stevel 			LIF_CMD, ip_sioctl_flags, ip_sioctl_flags_restart },
    1134      0      stevel 	/* 117 */ { SIOCGLIFFLAGS, sizeof (struct lifreq),
    1135   8485       Peter 			IPI_GET_CMD | IPI_MODOK,
    1136      0      stevel 			LIF_CMD, ip_sioctl_get_flags, NULL },
    1137      0      stevel 
    1138      0      stevel 	/* 118 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1139      0      stevel 	/* 119 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1140      0      stevel 
    1141   4972        meem 	/* 120 */ { O_SIOCGLIFCONF, 0, IPI_GET_CMD, MISC_CMD,
    1142      0      stevel 			ip_sioctl_get_lifconf, NULL },
    1143      0      stevel 	/* 121 */ { SIOCSLIFMTU, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
    1144      0      stevel 			LIF_CMD, ip_sioctl_mtu, NULL },
    1145   8485       Peter 	/* 122 */ { SIOCGLIFMTU, sizeof (struct lifreq), IPI_GET_CMD,
    1146      0      stevel 			LIF_CMD, ip_sioctl_get_mtu, NULL },
    1147      0      stevel 	/* 123 */ { SIOCGLIFBRDADDR, sizeof (struct lifreq),
    1148   8485       Peter 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_brdaddr, NULL },
    1149      0      stevel 	/* 124 */ { SIOCSLIFBRDADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
    1150      0      stevel 			LIF_CMD, ip_sioctl_brdaddr, NULL },
    1151      0      stevel 	/* 125 */ { SIOCGLIFNETMASK, sizeof (struct lifreq),
    1152   8485       Peter 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_netmask, NULL },
    1153      0      stevel 	/* 126 */ { SIOCSLIFNETMASK, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
    1154      0      stevel 			LIF_CMD, ip_sioctl_netmask, ip_sioctl_netmask_restart },
    1155      0      stevel 	/* 127 */ { SIOCGLIFMETRIC, sizeof (struct lifreq),
    1156   8485       Peter 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_metric, NULL },
    1157      0      stevel 	/* 128 */ { SIOCSLIFMETRIC, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
    1158      0      stevel 			LIF_CMD, ip_sioctl_metric, NULL },
    1159      0      stevel 	/* 129 */ { SIOCSLIFNAME, sizeof (struct lifreq),
    1160   8485       Peter 			IPI_PRIV | IPI_WR | IPI_MODOK,
    1161      0      stevel 			LIF_CMD, ip_sioctl_slifname,
    1162      0      stevel 			ip_sioctl_slifname_restart },
    1163      0      stevel 
    1164   8485       Peter 	/* 130 */ { SIOCGLIFNUM, sizeof (struct lifnum), IPI_GET_CMD,
    1165      0      stevel 			MISC_CMD, ip_sioctl_get_lifnum, NULL },
    1166      0      stevel 	/* 131 */ { SIOCGLIFMUXID, sizeof (struct lifreq),
    1167   8485       Peter 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_muxid, NULL },
    1168      0      stevel 	/* 132 */ { SIOCSLIFMUXID, sizeof (struct lifreq),
    1169   8485       Peter 			IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_muxid, NULL },
    1170      0      stevel 	/* 133 */ { SIOCGLIFINDEX, sizeof (struct lifreq),
    1171   8485       Peter 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_lifindex, 0 },
    1172      0      stevel 	/* 134 */ { SIOCSLIFINDEX, sizeof (struct lifreq),
    1173   8485       Peter 			IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_slifindex, 0 },
    1174      0      stevel 	/* 135 */ { SIOCSLIFTOKEN, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
    1175      0      stevel 			LIF_CMD, ip_sioctl_token, NULL },
    1176      0      stevel 	/* 136 */ { SIOCGLIFTOKEN, sizeof (struct lifreq),
    1177   8485       Peter 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_token, NULL },
    1178      0      stevel 	/* 137 */ { SIOCSLIFSUBNET, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
    1179      0      stevel 			LIF_CMD, ip_sioctl_subnet, ip_sioctl_subnet_restart },
    1180      0      stevel 	/* 138 */ { SIOCGLIFSUBNET, sizeof (struct lifreq),
    1181   8485       Peter 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_subnet, NULL },
    1182      0      stevel 	/* 139 */ { SIOCSLIFLNKINFO, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
    1183      0      stevel 			LIF_CMD, ip_sioctl_lnkinfo, NULL },
    1184      0      stevel 
    1185      0      stevel 	/* 140 */ { SIOCGLIFLNKINFO, sizeof (struct lifreq),
    1186   8485       Peter 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_lnkinfo, NULL },
    1187      0      stevel 	/* 141 */ { SIOCLIFDELND, sizeof (struct lifreq), IPI_PRIV,
    1188      0      stevel 			LIF_CMD, ip_siocdelndp_v6, NULL },
    1189      0      stevel 	/* 142 */ { SIOCLIFGETND, sizeof (struct lifreq), IPI_GET_CMD,
    1190      0      stevel 			LIF_CMD, ip_siocqueryndp_v6, NULL },
    1191      0      stevel 	/* 143 */ { SIOCLIFSETND, sizeof (struct lifreq), IPI_PRIV,
    1192      0      stevel 			LIF_CMD, ip_siocsetndp_v6, NULL },
    1193      0      stevel 	/* 144 */ { SIOCTMYADDR, sizeof (struct sioc_addrreq), IPI_GET_CMD,
    1194      0      stevel 			MISC_CMD, ip_sioctl_tmyaddr, NULL },
    1195      0      stevel 	/* 145 */ { SIOCTONLINK, sizeof (struct sioc_addrreq), IPI_GET_CMD,
    1196      0      stevel 			MISC_CMD, ip_sioctl_tonlink, NULL },
    1197      0      stevel 	/* 146 */ { SIOCTMYSITE, sizeof (struct sioc_addrreq), 0,
    1198      0      stevel 			MISC_CMD, ip_sioctl_tmysite, NULL },
    1199  10616   Sebastien 	/* 147 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1200  10616   Sebastien 	/* 148 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1201      0      stevel 	/* IPsec ioctls are handled in ip_sioctl_copyin_setup itself */
    1202      0      stevel 	/* 149 */ { SIOCFIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL },
    1203      0      stevel 	/* 150 */ { SIOCSIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL },
    1204      0      stevel 	/* 151 */ { SIOCDIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL },
    1205      0      stevel 	/* 152 */ { SIOCLIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL },
    1206      0      stevel 
    1207   8485       Peter 	/* 153 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1208   8485       Peter 
    1209   8700       Peter 	/* 154 */ { SIOCGLIFBINDING, sizeof (struct lifreq), IPI_GET_CMD,
    1210   8700       Peter 			LIF_CMD, ip_sioctl_get_binding, NULL },
                            	/* 155 names ip_sioctl_groupname as both function and restart_func */
    1211      0      stevel 	/* 155 */ { SIOCSLIFGROUPNAME, sizeof (struct lifreq),
    1212   8485       Peter 			IPI_PRIV | IPI_WR,
    1213      0      stevel 			LIF_CMD, ip_sioctl_groupname, ip_sioctl_groupname },
    1214      0      stevel 	/* 156 */ { SIOCGLIFGROUPNAME, sizeof (struct lifreq),
    1215   8485       Peter 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_groupname, NULL },
    1216   8485       Peter 	/* 157 */ { SIOCGLIFGROUPINFO, sizeof (lifgroupinfo_t),
    1217   8485       Peter 			IPI_GET_CMD, MISC_CMD, ip_sioctl_groupinfo, NULL },
    1218      0      stevel 
    1219      0      stevel 	/* Leave 158-160 unused; used to be SIOC*IFARP ioctls */
    1220      0      stevel 	/* 158 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1221      0      stevel 	/* 159 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1222      0      stevel 	/* 160 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1223      0      stevel 
    1224   8485       Peter 	/* 161 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1225      0      stevel 
    1226      0      stevel 	/* These are handled in ip_sioctl_copyin_setup itself */
    1227      0      stevel 	/* 162 */ { SIOCGIP6ADDRPOLICY, 0, IPI_NULL_BCONT,
    1228      0      stevel 			MISC_CMD, NULL, NULL },
    1229      0      stevel 	/* 163 */ { SIOCSIP6ADDRPOLICY, 0, IPI_PRIV | IPI_NULL_BCONT,
    1230      0      stevel 			MISC_CMD, NULL, NULL },
    1231      0      stevel 	/* 164 */ { SIOCGDSTINFO, 0, IPI_GET_CMD, MISC_CMD, NULL, NULL },
    1232      0      stevel 
    1233   4972        meem 	/* 165 */ { SIOCGLIFCONF, 0, IPI_GET_CMD, MISC_CMD,
    1234      0      stevel 			ip_sioctl_get_lifconf, NULL },
    1235      0      stevel 
    1236   8485       Peter 	/* 166 */ { SIOCSXARP, sizeof (struct xarpreq), IPI_PRIV | IPI_WR,
    1237   4972        meem 			XARP_CMD, ip_sioctl_arp, NULL },
    1238   8485       Peter 	/* 167 */ { SIOCGXARP, sizeof (struct xarpreq), IPI_GET_CMD,
    1239   4972        meem 			XARP_CMD, ip_sioctl_arp, NULL },
    1240   8485       Peter 	/* 168 */ { SIOCDXARP, sizeof (struct xarpreq), IPI_PRIV | IPI_WR,
    1241   4972        meem 			XARP_CMD, ip_sioctl_arp, NULL },
    1242      0      stevel 
    1243      0      stevel 	/* SIOCPOPSOCKFS is not handled by IP */
    1244      0      stevel 	/* 169 */ { IPI_DONTCARE /* SIOCPOPSOCKFS */, 0, 0, 0, NULL, NULL },
    1245      0      stevel 
    1246      0      stevel 	/* 170 */ { SIOCGLIFZONE, sizeof (struct lifreq),
    1247   8485       Peter 			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_lifzone, NULL },
    1248      0      stevel 	/* 171 */ { SIOCSLIFZONE, sizeof (struct lifreq),
    1249   8485       Peter 			IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_slifzone,
    1250      0      stevel 			ip_sioctl_slifzone_restart },
    1251      0      stevel 	/* 172-174 are SCTP ioctls and not handled by IP */
    1252      0      stevel 	/* 172 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1253      0      stevel 	/* 173 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1254      0      stevel 	/* 174 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1255      0      stevel 	/* 175 */ { SIOCGLIFUSESRC, sizeof (struct lifreq),
    1256      0      stevel 			IPI_GET_CMD, LIF_CMD,
    1257      0      stevel 			ip_sioctl_get_lifusesrc, 0 },
    1258      0      stevel 	/* 176 */ { SIOCSLIFUSESRC, sizeof (struct lifreq),
    1259      0      stevel 			IPI_PRIV | IPI_WR,
    1260      0      stevel 			LIF_CMD, ip_sioctl_slifusesrc,
    1261      0      stevel 			NULL },
    1262      0      stevel 	/* 177 */ { SIOCGLIFSRCOF, 0, IPI_GET_CMD, MISC_CMD,
    1263      0      stevel 			ip_sioctl_get_lifsrcof, NULL },
    1264      0      stevel 	/* 178 */ { SIOCGMSFILTER, sizeof (struct group_filter), IPI_GET_CMD,
    1265   4972        meem 			MSFILT_CMD, ip_sioctl_msfilter, NULL },
    1266  11042        Erik 	/* 179 */ { SIOCSMSFILTER, sizeof (struct group_filter), 0,
    1267   4972        meem 			MSFILT_CMD, ip_sioctl_msfilter, NULL },
    1268      0      stevel 	/* 180 */ { SIOCGIPMSFILTER, sizeof (struct ip_msfilter), IPI_GET_CMD,
    1269   4972        meem 			MSFILT_CMD, ip_sioctl_msfilter, NULL },
    1270  11042        Erik 	/* 181 */ { SIOCSIPMSFILTER, sizeof (struct ip_msfilter), 0,
    1271   4972        meem 			MSFILT_CMD, ip_sioctl_msfilter, NULL },
    1272   8485       Peter 	/* 182 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    1273   5381        meem 	/* SIOCSENABLESDP is handled by SDP */
    1274   5381        meem 	/* 183 */ { IPI_DONTCARE /* SIOCSENABLESDP */, 0, 0, 0, NULL, NULL },
    1275   8348        Eric 	/* 184 */ { IPI_DONTCARE /* SIOCSQPTR */, 0, 0, 0, NULL, NULL },
    1276  10946    Sangeeta 	/* 185 */ { IPI_DONTCARE /* SIOCGIFHWADDR */, 0, 0, 0, NULL, NULL },
    1277  10946    Sangeeta 	/* 186 */ { IPI_DONTCARE /* SIOCGSTAMP */, 0, 0, 0, NULL, NULL },
    1278  10946    Sangeeta 	/* 187 */ { SIOCILB, 0, IPI_PRIV | IPI_GET_CMD, MISC_CMD,
    1279  10946    Sangeeta 			ip_sioctl_ilb_cmd, NULL },
    1280      0      stevel };
   1281      0      stevel 
    1282      0      stevel int ip_ndx_ioctl_count = sizeof (ip_ndx_ioctl_table) / sizeof (ip_ioctl_cmd_t);
                            /* ^ number of entries in ip_ndx_ioctl_table, computed at compile time */
   1283      0      stevel 
    1284      0      stevel ip_ioctl_cmd_t ip_misc_ioctl_table[] = {
                            	/*
                            	 * Ioctls that are not looked up by index; as described in the
                            	 * comment above ip_ndx_ioctl_table, this table is searched
                            	 * sequentially when the indexed lookup does not match.
                            	 * Entries use the same field order as the ndx table:
                            	 * <command> <copyin_size> <flags> <cmd_type> <function> <restart_func>
                            	 * The three mrt_ioctl entries give only five initializers, so
                            	 * their restart_func is implicitly zero-initialized (NULL).
                            	 */
    1285  11042        Erik 	{ I_LINK,	0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
    1286  11042        Erik 	{ I_UNLINK,	0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
    1287  11042        Erik 	{ I_PLINK,	0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
    1288  11042        Erik 	{ I_PUNLINK,	0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
    1289  11042        Erik 	{ ND_GET,	0, 0, 0, NULL, NULL },
    1290  11042        Erik 	{ ND_SET,	0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
    1291      0      stevel 	{ IP_IOCTL,	0, 0, 0, NULL, NULL },
    1292   8485       Peter 	{ SIOCGETVIFCNT, sizeof (struct sioc_vif_req), IPI_GET_CMD,
    1293      0      stevel 		MISC_CMD, mrt_ioctl},
    1294   8485       Peter 	{ SIOCGETSGCNT,	sizeof (struct sioc_sg_req), IPI_GET_CMD,
    1295      0      stevel 		MISC_CMD, mrt_ioctl},
    1296   8485       Peter 	{ SIOCGETLSGCNT, sizeof (struct sioc_lsg_req), IPI_GET_CMD,
    1297      0      stevel 		MISC_CMD, mrt_ioctl}
    1298      0      stevel };
   1299      0      stevel 
    1300      0      stevel int ip_misc_ioctl_count =
    1301      0      stevel     sizeof (ip_misc_ioctl_table) / sizeof (ip_ioctl_cmd_t);
                            /* ^ number of entries in ip_misc_ioctl_table, computed at compile time */
   1302      0      stevel 
    1303      0      stevel int	conn_drain_nthreads;		/* Number of drainers required. */
    1304      0      stevel 					/* Settable in /etc/system */
                            					/* No initializer: starts at 0 */
    1305      0      stevel /* Defined in ip_ire.c; declared extern here so this file can reference them */
    1306      0      stevel extern uint32_t ip_ire_max_bucket_cnt, ip6_ire_max_bucket_cnt;
    1307      0      stevel extern uint32_t ip_ire_min_bucket_cnt, ip6_ire_min_bucket_cnt;
    1308      0      stevel extern uint32_t ip_ire_mem_ratio, ip_ire_cpu_ratio;
   1309      0      stevel 
    1310      0      stevel static nv_t	ire_nv_arr[] = {
                            	/*
                            	 * Pairs an IRE_* type constant with a short printable name.
                            	 * The final { 0 } entry presumably terminates the table for
                            	 * whoever walks it through ire_nv_tbl - confirm at the users.
                            	 */
    1311      0      stevel 	{ IRE_BROADCAST, "BROADCAST" },
    1312      0      stevel 	{ IRE_LOCAL, "LOCAL" },
    1313      0      stevel 	{ IRE_LOOPBACK, "LOOPBACK" },
    1314      0      stevel 	{ IRE_DEFAULT, "DEFAULT" },
    1315      0      stevel 	{ IRE_PREFIX, "PREFIX" },
    1316      0      stevel 	{ IRE_IF_NORESOLVER, "IF_NORESOL" },
    1317      0      stevel 	{ IRE_IF_RESOLVER, "IF_RESOLV" },
    1318  11042        Erik 	{ IRE_IF_CLONE, "IF_CLONE" },
    1319      0      stevel 	{ IRE_HOST, "HOST" },
    1320  11042        Erik 	{ IRE_MULTICAST, "MULTICAST" },
    1321  11042        Erik 	{ IRE_NOROUTE, "NOROUTE" },
    1322      0      stevel 	{ 0 }
    1323      0      stevel };
   1324      0      stevel 
    1325      0      stevel nv_t	*ire_nv_tbl = ire_nv_arr;
                            /* ^ non-static pointer to the file-local ire_nv_arr table */
   1326      0      stevel 
    1327      0      stevel /*
                             * Simple ICMP IP Header Template
                             *
                             * Only the first initializer (IP_SIMPLE_HDR_VERSION) and the seventh
                             * (IPPROTO_ICMP) are non-zero; every other member, including those after
                             * the seventh, is zero. Presumably these two positions are the
                             * version/header-length and protocol members of ipha_t - confirm against
                             * the ipha_t declaration.
                             */
    1328      0      stevel static ipha_t icmp_ipha = {
    1329      0      stevel 	IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP
    1330      0      stevel };
   1331      0      stevel 
    1332      0      stevel struct module_info ip_mod_info = {
                            	/*
                            	 * Initializers in order: module id, module name, minimum packet
                            	 * size, maximum packet size, high water mark, low water mark
                            	 * (as the IP_MOD_* macro names indicate; see module_info(9S)).
                            	 * Every qinit structure defined below in this file points at
                            	 * this one module_info.
                            	 */
    1333   8348        Eric 	IP_MOD_ID, IP_MOD_NAME, IP_MOD_MINPSZ, IP_MOD_MAXPSZ, IP_MOD_HIWAT,
    1334   8348        Eric 	IP_MOD_LOWAT
    1335      0      stevel };
   1336      0      stevel 
   1337   2546    carlsonj /*
    1338   2546    carlsonj  * Duplicate static symbols within a module confuse mdb, so we avoid the
   1339   2546    carlsonj  * problem by making the symbols here distinct from those in udp.c.
   1340   2546    carlsonj  */
   1341   2546    carlsonj 
    1342   5240    nordmark /*
    1343   5240    nordmark  * Entry points for IP as a device and as a module.
    1344   5240    nordmark  * We have separate open functions for the /dev/ip and /dev/ip6 devices.
                             *
                             * iprinitv4 is the read-side qinit used by ipinfov4 (/dev/ip).
                             * Initializer order follows qinit(9S): put procedure, service procedure,
                             * open, close, admin (unused), module_info. Here: put is ip_rput, there
                             * is no service procedure, open is ip_openv4 and close is ip_close.
    1345   5240    nordmark  */
    1346   5240    nordmark static struct qinit iprinitv4 = {
    1347   5240    nordmark 	(pfi_t)ip_rput, NULL, ip_openv4, ip_close, NULL,
    1348      0      stevel 	&ip_mod_info
    1349      0      stevel };
   1350      0      stevel 
    1351   5240    nordmark struct qinit iprinitv6 = {
                            	/*
                            	 * Read-side qinit used by ipinfov6 (/dev/ip6): put is
                            	 * ip_rput_v6, no service procedure, open is ip_openv6 and close
                            	 * is ip_close. Unlike the other qinit structures in this file
                            	 * it is not declared static.
                            	 */
    1352   5240    nordmark 	(pfi_t)ip_rput_v6, NULL, ip_openv6, ip_close, NULL,
    1353      0      stevel 	&ip_mod_info
    1354      0      stevel };
   1355      0      stevel 
    1356  11042        Erik static struct qinit ipwinit = {
                            	/*
                            	 * Write-side qinit shared by ipinfov4 and ipinfov6: put is
                            	 * ip_wput_nondata and the service procedure is ip_wsrv; no
                            	 * open or close routine is given on the write side.
                            	 */
    1357  11042        Erik 	(pfi_t)ip_wput_nondata, (pfi_t)ip_wsrv, NULL, NULL, NULL,
    1358      0      stevel 	&ip_mod_info
    1359      0      stevel };
   1360      0      stevel 
    1361   5240    nordmark static struct qinit iplrinit = {
                            	/*
                            	 * Third (lower read) slot of both streamtabs below: put is
                            	 * ip_lrput, no service procedure; open is ip_openv4 and close
                            	 * is ip_close, also for the ipinfov6 streamtab.
                            	 */
    1362   5240    nordmark 	(pfi_t)ip_lrput, NULL, ip_openv4, ip_close, NULL,
    1363   5240    nordmark 	&ip_mod_info
    1364   5240    nordmark };
   1365   5240    nordmark 
    1366   5240    nordmark static struct qinit iplwinit = {
                            	/*
                            	 * Fourth (lower write) slot of both streamtabs below: only a
                            	 * put procedure, ip_lwput, is supplied.
                            	 */
    1367   5240    nordmark 	(pfi_t)ip_lwput, NULL, NULL, NULL, NULL,
    1368   5240    nordmark 	&ip_mod_info
    1369   5240    nordmark };
   1370   5240    nordmark 
    1371   5240    nordmark /*
                             * For AF_INET aka /dev/ip
                             *
                             * Initializer order follows streamtab(9S): read init, write init,
                             * lower (mux) read init, lower (mux) write init. Only the read-side
                             * qinit differs from ipinfov6.
                             */
    1372   5240    nordmark struct streamtab ipinfov4 = {
    1373  11042        Erik 	&iprinitv4, &ipwinit, &iplrinit, &iplwinit
    1374   5240    nordmark };
   1375   5240    nordmark 
    1376   5240    nordmark /*
                             * For AF_INET6 aka /dev/ip6
                             *
                             * Initializer order follows streamtab(9S): read init, write init,
                             * lower (mux) read init, lower (mux) write init. Only the read-side
                             * qinit (iprinitv6) differs from ipinfov4.
                             */
    1377   5240    nordmark struct streamtab ipinfov6 = {
    1378  11042        Erik 	&iprinitv6, &ipwinit, &iplrinit, &iplwinit
    1379      0      stevel };
   1380      0      stevel 
    1381      0      stevel #ifdef	DEBUG
                            /*
                             * Only defined in DEBUG builds; starts out B_FALSE.
                             * NOTE(review): the name suggests it turns off SCTP checksum handling
                             * when set - confirm at the places that read it.
                             */
    1382  11042        Erik boolean_t skip_sctp_cksum = B_FALSE;
    1383      0      stevel #endif
   1384   2733    nordmark 
    1385   2733    nordmark /*
    1386  11042        Erik  * Generate an ICMP fragmentation needed message.
    1387  11042        Erik  * When called from ip_output side a minimal ip_recv_attr_t needs to be
    1388  11042        Erik  * constructed by the caller.
                             *
                             * mp:	packet handed first to icmp_pkt_err_ok() and then to icmp_pkt().
                             *	If icmp_pkt_err_ok() returns NULL nothing is sent and no counter
                             *	is bumped.
                             * mtu:	value placed in the icmph_du_mtu field. It is cast to uint16_t
                             *	and converted with htons(), so only the low 16 bits are carried.
                             * ira:	receive attributes. ira->ira_ill->ill_ipst is dereferenced
                             *	unconditionally, so both ira and ira->ira_ill must be valid.
                             *
                             * The message is an ICMP_DEST_UNREACHABLE with code
                             * ICMP_FRAGMENTATION_NEEDED; icmpOutFragNeeded and icmpOutDestUnreachs
                             * are bumped in the stack's ICMP MIB before icmp_pkt() is called.
    1389  11042        Erik  */
    1390  11042        Erik void
    1391  11042        Erik icmp_frag_needed(mblk_t *mp, int mtu, ip_recv_attr_t *ira)
    1392      0      stevel {
    1393      0      stevel 	icmph_t	icmph;
    1394  11042        Erik 	ip_stack_t	*ipst = ira->ira_ill->ill_ipst;
    1395  11042        Erik 
                            	/* From here on use whatever mblk icmp_pkt_err_ok() hands back. */
    1396  11042        Erik 	mp = icmp_pkt_err_ok(mp, ira);
    1397  11042        Erik 	if (mp == NULL)
    1398  11042        Erik 		return;
    1399      0      stevel 
                            	/* Start from an all-zero ICMP header and set only what is needed. */
    1400      0      stevel 	bzero(&icmph, sizeof (icmph_t));
    1401      0      stevel 	icmph.icmph_type = ICMP_DEST_UNREACHABLE;
    1402      0      stevel 	icmph.icmph_code = ICMP_FRAGMENTATION_NEEDED;
    1403      0      stevel 	icmph.icmph_du_mtu = htons((uint16_t)mtu);
    1404   3448    dh155122 	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutFragNeeded);
    1405   3448    dh155122 	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDestUnreachs);
    1406  11042        Erik 
    1407  11042        Erik 	icmp_pkt(mp, &icmph, sizeof (icmph_t), ira);
    1408  11042        Erik }
   1409  11042        Erik 
/*
 * icmp_inbound_v4 deals with ICMP messages that are handled by IP.
 * If the ICMP message is consumed by IP, i.e., it should not be delivered
 * to any IPPROTO_ICMP raw sockets, then it returns NULL.
 * Likewise, if the ICMP error is malformed (too short, etc), then it
 * returns NULL. The caller uses this to determine whether or not to send
 * to raw sockets.
 *
 * Ownership: the mblk passed in is always either freed, handed to another
 * routine, or returned; the caller must only use the returned pointer.
 * A non-NULL return value is either the original message (when IP itself
 * has no use for it) or a copy made for the raw socket clients.
 *
 * All error messages are passed to the matching transport stream.
 *
 * The following cases are handled by icmp_inbound_v4:
 * 1) It needs to send a reply back and possibly delivering it
 *    to the "interested" upper clients.
 * 2) Return the mblk so that the caller can pass it to the RAW socket clients.
 * 3) It needs to change some values in IP only.
 * 4) It needs to change some values in IP and upper layers e.g TCP
 *    by delivering an error to the upper layers.
 *
 * We handle the above four cases in the context of IPsec in the
 * following way :
 *
 * 1) Send the reply back in the same way as the request came in.
 *    If it came in encrypted, it goes out encrypted. If it came in
 *    clear, it goes out in clear. Thus, this will prevent chosen
 *    plain text attack.
 * 2) The client may or may not expect things to come in secure.
 *    If it comes in secure, the policy constraints are checked
 *    before delivering it to the upper layers. If it comes in
 *    clear, ipsec_inbound_accept_clear will decide whether to
 *    accept this in clear or not. In both the cases, if the returned
 *    message (IP header + 8 bytes) that caused the icmp message has
 *    AH/ESP headers, it is sent up to AH/ESP for validation before
 *    sending up. If there are only 8 bytes of returned message, then
 *    upper client will not be notified.
 * 3) Check with global policy to see whether it matches the constraints.
 *    But this will be done only if icmp_accept_messages_in_clear is
 *    zero.
 * 4) If we need to change both in IP and ULP, then the decision taken
 *    while affecting the values in IP and while delivering up to TCP
 *    should be the same.
 *
 *	There are two cases.
 *
 *	a) If we reject data at the IP layer (ipsec_check_global_policy()
 *	   failed), we will not deliver it to the ULP, even though they
 *	   are *willing* to accept in *clear*. This is fine as our global
 *	   disposition to icmp messages asks us reject the datagram.
 *
 *	b) If we accept data at the IP layer (ipsec_check_global_policy()
 *	   succeeded or icmp_accept_messages_in_clear is 1), and not able
 *	   to deliver it to ULP (policy failed), it can lead to
 *	   consistency problems. The cases known at this time are
 *	   ICMP_DESTINATION_UNREACHABLE  messages with following code
 *	   values :
 *
 *	   - ICMP_FRAGMENTATION_NEEDED : IP adapts to the new value
 *	     and Upper layer rejects. Then the communication will
 *	     come to a stop. This is solved by making similar decisions
 *	     at both levels. Currently, when we are unable to deliver
 *	     to the Upper Layer (due to policy failures) while IP has
 *	     adjusted dce_pmtu, the next outbound datagram would
 *	     generate a local ICMP_FRAGMENTATION_NEEDED message - which
 *	     will be with the right level of protection. Thus the right
 *	     value will be communicated even if we are not able to
 *	     communicate when we get from the wire initially. But this
 *	     assumes there would be at least one outbound datagram after
 *	     IP has adjusted its dce_pmtu value. To make things
 *	     simpler, we accept in clear after the validation of
 *	     AH/ESP headers.
 *
 *	   - Other ICMP ERRORS : We may not be able to deliver it to the
 *	     upper layer depending on the level of protection the upper
 *	     layer expects and the disposition in ipsec_inbound_accept_clear().
 *	     ipsec_inbound_accept_clear() decides whether a given ICMP error
 *	     should be accepted in clear when the Upper layer expects secure.
 *	     Thus the communication may get aborted by some bad ICMP
 *	     packets.
 */
mblk_t *
icmp_inbound_v4(mblk_t *mp, ip_recv_attr_t *ira)
{
	icmph_t		*icmph;
	ipha_t		*ipha;		/* Outer header */
	int		ip_hdr_length;	/* Outer header length */
	boolean_t	interested;
	ipif_t		*ipif;
	uint32_t	ts;
	uint32_t	*tsp;
	timestruc_t	now;
	ill_t		*ill = ira->ira_ill;
	ip_stack_t	*ipst = ill->ill_ipst;
	zoneid_t	zoneid = ira->ira_zoneid;
	int		len_needed;
	mblk_t		*mp_ret = NULL;	/* What the caller gets back */

	ipha = (ipha_t *)mp->b_rptr;

	BUMP_MIB(&ipst->ips_icmp_mib, icmpInMsgs);

	/*
	 * Make sure the outer IP header and the fixed part of the ICMP
	 * header are contiguous in the first mblk before touching them.
	 */
	ip_hdr_length = ira->ira_ip_hdr_length;
	if ((mp->b_wptr - mp->b_rptr) < (ip_hdr_length + ICMPH_SIZE)) {
		/* The whole packet is too short; a pullup cannot help. */
		if (ira->ira_pktlen < (ip_hdr_length + ICMPH_SIZE)) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
			ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
			freemsg(mp);
			return (NULL);
		}
		/* Last chance to get real. */
		ipha = ip_pullup(mp, ip_hdr_length + ICMPH_SIZE, ira);
		if (ipha == NULL) {
			BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors);
			freemsg(mp);
			return (NULL);
		}
	}

	/* The IP header will always be a multiple of four bytes */
	icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
	ip2dbg(("icmp_inbound_v4: type %d code %d\n", icmph->icmph_type,
	    icmph->icmph_code));

	/*
	 * We will set "interested" to "true" if we should pass a copy to
	 * the transport or if we handle the packet locally.
	 *
	 * Requests that IP answers itself (echo, timestamp, address mask)
	 * are turned into replies in place and return from inside this
	 * switch; everything else breaks out to the common code below.
	 */
	interested = B_FALSE;
	switch (icmph->icmph_type) {
	case ICMP_ECHO_REPLY:
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInEchoReps);
		break;
	case ICMP_DEST_UNREACHABLE:
		if (icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED)
			BUMP_MIB(&ipst->ips_icmp_mib, icmpInFragNeeded);
		interested = B_TRUE;	/* Pass up to transport */
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInDestUnreachs);
		break;
	case ICMP_SOURCE_QUENCH:
		interested = B_TRUE;	/* Pass up to transport */
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInSrcQuenchs);
		break;
	case ICMP_REDIRECT:
		/* Redirects are only acted upon when not administratively ignored */
		if (!ipst->ips_ip_ignore_redirect)
			interested = B_TRUE;
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInRedirects);
		break;
	case ICMP_ECHO_REQUEST:
		/*
		 * Whether to respond to echo requests that come in as IP
		 * broadcasts or as IP multicast is subject to debate
		 * (what isn't?).  We aim to please, you pick it.
		 * Default is do it.
		 */
		if (ira->ira_flags & IRAF_MULTICAST) {
			/* multicast: respond based on tunable */
			interested = ipst->ips_ip_g_resp_to_echo_mcast;
		} else if (ira->ira_flags & IRAF_BROADCAST) {
			/* broadcast: respond based on tunable */
			interested = ipst->ips_ip_g_resp_to_echo_bcast;
		} else {
			/* unicast: always respond */
			interested = B_TRUE;
		}
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInEchos);
		if (!interested) {
			/* We never pass these to RAW sockets */
			freemsg(mp);
			return (NULL);
		}

		/* Check db_ref to make sure we can modify the packet. */
		if (mp->b_datap->db_ref > 1) {
			mblk_t	*mp1;

			/* Shared data block: work on a private copy instead. */
			mp1 = copymsg(mp);
			freemsg(mp);
			if (!mp1) {
				BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
				return (NULL);
			}
			mp = mp1;
			/* Header pointers must follow the new message. */
			ipha = (ipha_t *)mp->b_rptr;
			icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
		}
		/* Reuse the request as the reply; only the type changes here. */
		icmph->icmph_type = ICMP_ECHO_REPLY;
		BUMP_MIB(&ipst->ips_icmp_mib, icmpOutEchoReps);
		icmp_send_reply_v4(mp, ipha, icmph, ira);
		return (NULL);

	case ICMP_ROUTER_ADVERTISEMENT:
	case ICMP_ROUTER_SOLICITATION:
		/* Not handled by IP; only raw sockets may see these. */
		break;
	case ICMP_TIME_EXCEEDED:
		interested = B_TRUE;	/* Pass up to transport */
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimeExcds);
		break;
	case ICMP_PARAM_PROBLEM:
		interested = B_TRUE;	/* Pass up to transport */
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInParmProbs);
		break;
	case ICMP_TIME_STAMP_REQUEST:
		/* Response to Time Stamp Requests is local policy. */
		if (ipst->ips_ip_g_resp_to_timestamp) {
			if (ira->ira_flags & IRAF_MULTIBROADCAST)
				interested =
				    ipst->ips_ip_g_resp_to_timestamp_bcast;
			else
				interested = B_TRUE;
		}
		if (!interested) {
			/* We never pass these to RAW sockets */
			freemsg(mp);
			return (NULL);
		}

		/*
		 * Make sure we have enough of the packet: the ICMP header
		 * is followed by three 32-bit timestamps (originate,
		 * receive, send).
		 */
		len_needed = ip_hdr_length + ICMPH_SIZE +
		    3 * sizeof (uint32_t);

		if (mp->b_wptr - mp->b_rptr < len_needed) {
			ipha = ip_pullup(mp, len_needed, ira);
			if (ipha == NULL) {
				BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
				ip_drop_input("ipIfStatsInDiscards - ip_pullup",
				    mp, ill);
				freemsg(mp);
				return (NULL);
			}
			/* Refresh following the pullup. */
			icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
		}
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimestamps);
		/* Check db_ref to make sure we can modify the packet. */
		if (mp->b_datap->db_ref > 1) {
			mblk_t	*mp1;

			/* Shared data block: work on a private copy instead. */
			mp1 = copymsg(mp);
			freemsg(mp);
			if (!mp1) {
				BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
				return (NULL);
			}
			mp = mp1;
			/* Header pointers must follow the new message. */
			ipha = (ipha_t *)mp->b_rptr;
			icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
		}
		icmph->icmph_type = ICMP_TIME_STAMP_REPLY;
		/* The timestamps start right after the ICMP header. */
		tsp = (uint32_t *)&icmph[1];
		tsp++;		/* Skip past 'originate time' */
		/* Compute # of milliseconds since midnight */
		gethrestime(&now);
		ts = (now.tv_sec % (24 * 60 * 60)) * 1000 +
		    now.tv_nsec / (NANOSEC / MILLISEC);
		*tsp++ = htonl(ts);	/* Lay in 'receive time' */
		*tsp++ = htonl(ts);	/* Lay in 'send time' */
		BUMP_MIB(&ipst->ips_icmp_mib, icmpOutTimestampReps);
		icmp_send_reply_v4(mp, ipha, icmph, ira);
		return (NULL);

	case ICMP_TIME_STAMP_REPLY:
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimestampReps);
		break;
	case ICMP_INFO_REQUEST:
		/* Per RFC 1122 3.2.2.7, ignore this. */
	case ICMP_INFO_REPLY:
		break;
	case ICMP_ADDRESS_MASK_REQUEST:
		if (ira->ira_flags & IRAF_MULTIBROADCAST) {
			interested =
			    ipst->ips_ip_respond_to_address_mask_broadcast;
		} else {
			interested = B_TRUE;
		}
		if (!interested) {
			/* We never pass these to RAW sockets */
			freemsg(mp);
			return (NULL);
		}
		/* The reply carries one IPv4 address (the mask) after the header. */
		len_needed = ip_hdr_length + ICMPH_SIZE + IP_ADDR_LEN;
		if (mp->b_wptr - mp->b_rptr < len_needed) {
			ipha = ip_pullup(mp, len_needed, ira);
			if (ipha == NULL) {
				BUMP_MIB(ill->ill_ip_mib,
				    ipIfStatsInTruncatedPkts);
				ip_drop_input("ipIfStatsInTruncatedPkts", mp,
				    ill);
				freemsg(mp);
				return (NULL);
			}
			/* Refresh following the pullup. */
			icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
		}
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInAddrMasks);
		/* Check db_ref to make sure we can modify the packet. */
		if (mp->b_datap->db_ref > 1) {
			mblk_t	*mp1;

			/* Shared data block: work on a private copy instead. */
			mp1 = copymsg(mp);
			freemsg(mp);
			if (!mp1) {
				BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
				return (NULL);
			}
			mp = mp1;
			/* Header pointers must follow the new message. */
			ipha = (ipha_t *)mp->b_rptr;
			icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
		}
		/*
		 * Need the ipif with the mask be the same as the source
		 * address of the mask reply. For unicast we have a specific
		 * ipif. For multicast/broadcast we only handle onlink
		 * senders, and use the source address to pick an ipif.
		 */
		ipif = ipif_lookup_addr(ipha->ipha_dst, ill, zoneid, ipst);
		if (ipif == NULL) {
			/* Broadcast or multicast */
			ipif = ipif_lookup_remote(ill, ipha->ipha_src, zoneid);
			if (ipif == NULL) {
				/* No suitable ipif: silently drop the request. */
				freemsg(mp);
				return (NULL);
			}
		}
		icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY;
		bcopy(&ipif->ipif_net_mask, &icmph[1], IP_ADDR_LEN);
		/* Done with the ipif found by the lookup above. */
		ipif_refrele(ipif);
		BUMP_MIB(&ipst->ips_icmp_mib, icmpOutAddrMaskReps);
		icmp_send_reply_v4(mp, ipha, icmph, ira);
		return (NULL);

	case ICMP_ADDRESS_MASK_REPLY:
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInAddrMaskReps);
		break;
	default:
		interested = B_TRUE;	/* Pass up to transport */
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInUnknowns);
		break;
	}
	/*
	 * See if there is an ICMP client to avoid an extra copymsg/freemsg
	 * if there isn't one.
	 */
	if (ipst->ips_ipcl_proto_fanout_v4[IPPROTO_ICMP].connf_head != NULL) {
		/* If there is an ICMP client and we want one too, copy it. */

		if (!interested) {
			/* Caller will deliver to RAW sockets */
			return (mp);
		}
		/*
		 * A failed copy is only counted; IP still processes the
		 * original and the caller gets NULL (nothing for RAW sockets).
		 */
		mp_ret = copymsg(mp);
		if (mp_ret == NULL) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
			ip_drop_input("ipIfStatsInDiscards - copymsg", mp, ill);
		}
	} else if (!interested) {
		/* Neither we nor raw sockets are interested. Drop packet now */
		freemsg(mp);
		return (NULL);
	}

	/*
	 * ICMP error or redirect packet. Make sure we have enough of
	 * the header and that db_ref == 1 since we might end up modifying
	 * the packet.
	 *
	 * From here on every exit returns mp_ret, so the copy made for the
	 * raw sockets (if any) is still delivered even when IP's own
	 * processing of mp fails.
	 */
	if (mp->b_cont != NULL) {
		/* Message spans several mblks: pull all of it up. */
		if (ip_pullup(mp, -1, ira) == NULL) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
			ip_drop_input("ipIfStatsInDiscards - ip_pullup",
			    mp, ill);
			freemsg(mp);
			return (mp_ret);
		}
	}

	if (mp->b_datap->db_ref > 1) {
		mblk_t	*mp1;

		mp1 = copymsg(mp);
		if (mp1 == NULL) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
			ip_drop_input("ipIfStatsInDiscards - copymsg", mp, ill);
			freemsg(mp);
			return (mp_ret);
		}
		freemsg(mp);
		mp = mp1;
	}

	/*
	 * In case mp has changed, verify the message before any further
	 * processes.
	 */
	ipha = (ipha_t *)mp->b_rptr;
	icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
	if (!icmp_inbound_verify_v4(mp, icmph, ira)) {
		freemsg(mp);
		return (mp_ret);
	}

	/* mp is handed off to one of the routines below in every case. */
	switch (icmph->icmph_type) {
	case ICMP_REDIRECT:
		icmp_redirect_v4(mp, ipha, icmph, ira);
		break;
	case ICMP_DEST_UNREACHABLE:
		if (icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED) {
			/* Update DCE and adjust MTU in icmp header if needed */
			icmp_inbound_too_big_v4(icmph, ira);
		}
		/* FALLTHRU */
	default:
		icmp_inbound_error_fanout_v4(mp, icmph, ira);
		break;
	}
	return (mp_ret);
}
   1824  11042        Erik 
   1825  11042        Erik /*
   1826  11042        Erik  * Send an ICMP echo, timestamp or address mask reply.
   1827  11042        Erik  * The caller has already updated the payload part of the packet.
   1828  11042        Erik  * We handle the ICMP checksum, IP source address selection and feed
   1829  11042        Erik  * the packet into ip_output_simple.
   1830  11042        Erik  */
   1831  11042        Erik static void
   1832  11042        Erik icmp_send_reply_v4(mblk_t *mp, ipha_t *ipha, icmph_t *icmph,
   1833  11042        Erik     ip_recv_attr_t *ira)
   1834  11042        Erik {
   1835  11042        Erik 	uint_t		ip_hdr_length = ira->ira_ip_hdr_length;
   1836  11042        Erik 	ill_t		*ill = ira->ira_ill;
   1837  11042        Erik 	ip_stack_t	*ipst = ill->ill_ipst;
   1838  11042        Erik 	ip_xmit_attr_t	ixas;
   1839  11042        Erik 
   1840      0      stevel 	/* Send out an ICMP packet */
   1841      0      stevel 	icmph->icmph_checksum = 0;
   1842  11042        Erik 	icmph->icmph_checksum = IP_CSUM(mp, ip_hdr_length, 0);
   1843      0      stevel 	/* Reset time to live. */
   1844   3448    dh155122 	ipha->ipha_ttl = ipst->ips_ip_def_ttl;
   1845      0      stevel 	{
   1846      0      stevel 		/* Swap source and destination addresses */
   1847      0      stevel 		ipaddr_t tmp;
   1848      0      stevel 
   1849      0      stevel 		tmp = ipha->ipha_src;
   1850      0      stevel 		ipha->ipha_src = ipha->ipha_dst;
   1851      0      stevel 		ipha->ipha_dst = tmp;
   1852      0      stevel 	}
   1853      0      stevel 	ipha->ipha_ident = 0;
   1854      0      stevel 	if (!IS_SIMPLE_IPH(ipha))
   1855      0      stevel 		icmp_options_update(ipha);
   1856      0      stevel 
   1857  11042        Erik 	bzero(&ixas, sizeof (ixas));
   1858  11042        Erik 	ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4;
   1859  11042        Erik 	ixas.ixa_zoneid = ira->ira_zoneid;
   1860  11042        Erik 	ixas.ixa_cred = kcred;
   1861  11042        Erik 	ixas.ixa_cpid = NOPID;
   1862  11042        Erik 	ixas.ixa_tsl = ira->ira_tsl;	/* Behave as a multi-level responder */
   1863  11042        Erik 	ixas.ixa_ifindex = 0;
   1864  11042        Erik 	ixas.ixa_ipst = ipst;
   1865  11042        Erik 	ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
   1866  11042        Erik 
   1867  11042        Erik 	if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) {
   1868      0      stevel 		/*
   1869      0      stevel 		 * This packet should go out the same way as it
   1870  11042        Erik 		 * came in i.e in clear, independent of the IPsec policy
   1871  11042        Erik 		 * for transmitting packets.
   1872  11042        Erik 		 */
   1873  11042        Erik 		ixas.ixa_flags |= IXAF_NO_IPSEC;
   1874  11042        Erik 	} else {
   1875  11042        Erik 		if (!ipsec_in_to_out(ira, &ixas, mp, ipha, NULL)) {
   1876   3284    apersson 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   1877  11042        Erik 			/* Note: mp already consumed and ip_drop_packet done */
   1878  11042        Erik 			return;
   1879  11042        Erik 		}
   1880  11042        Erik 	}
   1881  11042        Erik 	if (ira->ira_flags & IRAF_MULTIBROADCAST) {
   1882  11042        Erik 		/*
   1883  11042        Erik 		 * Not one or our addresses (IRE_LOCALs), thus we let
   1884  11042        Erik 		 * ip_output_simple pick the source.
   1885  11042        Erik 		 */
   1886  11042        Erik 		ipha->ipha_src = INADDR_ANY;
   1887  11042        Erik 		ixas.ixa_flags |= IXAF_SET_SOURCE;
   1888  11042        Erik 	}
   1889  11042        Erik 	/* Should we send with DF and use dce_pmtu? */
   1890  11042        Erik 	if (ipst->ips_ipv4_icmp_return_pmtu) {
   1891  11042        Erik 		ixas.ixa_flags |= IXAF_PMTU_DISCOVERY;
   1892  11042        Erik 		ipha->ipha_fragment_offset_and_flags |= IPH_DF_HTONS;
   1893  11042        Erik 	}
   1894  11042        Erik 
   1895   3448    dh155122 	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutMsgs);
   1896  11042        Erik 
   1897  11042        Erik 	(void) ip_output_simple(mp, &ixas);
   1898  11042        Erik 	ixa_cleanup(&ixas);
   1899  11042        Erik }
   1900  11042        Erik 
   1901  11042        Erik /*
   1902  11042        Erik  * Verify an ICMP message that is either an ICMP error or a redirect packet.
   1903  11042        Erik  * The caller should have fully pulled up the message. If it's a redirect
   1904  11042        Erik  * packet, only basic checks on IP header will be done; otherwise, verify
   1905  11042        Erik  * the packet by looking at the included ULP header.
   1906  11042        Erik  *
   1907  11042        Erik  * Called before icmp_inbound_error_fanout_v4 is called.
   1908  11042        Erik  */
   1909  11042        Erik static boolean_t
   1910  11042        Erik icmp_inbound_verify_v4(mblk_t *mp, icmph_t *icmph, ip_recv_attr_t *ira)
   1911  11042        Erik {
   1912  11042        Erik 	ill_t		*ill = ira->ira_ill;
   1913  11042        Erik 	int		hdr_length;
   1914  11042        Erik 	ip_stack_t	*ipst = ira->ira_ill->ill_ipst;
   1915  11042        Erik 	conn_t		*connp;
   1916  11042        Erik 	ipha_t		*ipha;	/* Inner IP header */
   1917  11042        Erik 
   1918  11042        Erik 	ipha = (ipha_t *)&icmph[1];
   1919  11042        Erik 	if ((uchar_t *)ipha + IP_SIMPLE_HDR_LENGTH > mp->b_wptr)
   1920  11042        Erik 		goto truncated;
   1921  11042        Erik 
   1922  11042        Erik 	hdr_length = IPH_HDR_LENGTH(ipha);
   1923  11042        Erik 
   1924  11042        Erik 	if ((IPH_HDR_VERSION(ipha) != IPV4_VERSION))
   1925  11042        Erik 		goto discard_pkt;
   1926  11042        Erik 
   1927  11042        Erik 	if (hdr_length < sizeof (ipha_t))
   1928  11042        Erik 		goto truncated;
   1929  11042        Erik 
   1930  11042        Erik 	if ((uchar_t *)ipha + hdr_length > mp->b_wptr)
   1931  11042        Erik 		goto truncated;
   1932  11042        Erik 
   1933  11042        Erik 	/*
   1934  11042        Erik 	 * Stop here for ICMP_REDIRECT.
   1935  11042        Erik 	 */
   1936  11042        Erik 	if (icmph->icmph_type == ICMP_REDIRECT)
   1937  11042        Erik 		return (B_TRUE);
   1938  11042        Erik 
   1939  11042        Erik 	/*
   1940  11042        Erik 	 * ICMP errors only.
   1941  11042        Erik 	 */
   1942   2252    priyanka 	switch (ipha->ipha_protocol) {
   1943  11042        Erik 	case IPPROTO_UDP:
   1944  11042        Erik 		/*
   1945  11042        Erik 		 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
   1946  11042        Erik 		 * transport header.
   1947  11042        Erik 		 */
   1948  11042        Erik 		if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
   1949  11042        Erik 		    mp->b_wptr)
   1950  11042        Erik 			goto truncated;
   1951  11042        Erik 		break;
   1952  11042        Erik 	case IPPROTO_TCP: {
   1953  11042        Erik 		tcpha_t		*tcpha;
   1954  11042        Erik 
   1955  11042        Erik 		/*
   1956  11042        Erik 		 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
   1957  11042        Erik 		 * transport header.
   1958  11042        Erik 		 */
   1959  11042        Erik 		if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
   1960  11042        Erik 		    mp->b_wptr)
   1961  11042        Erik 			goto truncated;
   1962  11042        Erik 
   1963  11042        Erik 		tcpha = (tcpha_t *)((uchar_t *)ipha + hdr_length);
   1964  11042        Erik 		connp = ipcl_tcp_lookup_reversed_ipv4(ipha, tcpha, TCPS_LISTEN,
   1965  11042        Erik 		    ipst);
   1966  11042        Erik 		if (connp == NULL)
   1967  11042        Erik 			goto discard_pkt;
   1968  11042        Erik 
   1969  11042        Erik 		if ((connp->conn_verifyicmp != NULL) &&
   1970  11042        Erik 		    !connp->conn_verifyicmp(connp, tcpha, icmph, NULL, ira)) {
   1971  11042        Erik 			CONN_DEC_REF(connp);
   1972  11042        Erik 			goto discard_pkt;
   1973  11042        Erik 		}
   1974   2252    priyanka 		CONN_DEC_REF(connp);
   1975  11042        Erik 		break;
   1976  11042        Erik 	}
   1977  11042        Erik 	case IPPROTO_SCTP:
   1978  11042        Erik 		/*
   1979  11042        Erik 		 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
   1980  11042        Erik 		 * transport header.
   1981  11042        Erik 		 */
   1982  11042        Erik 		if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
   1983  11042        Erik 		    mp->b_wptr)
   1984  11042        Erik 			goto truncated;
   1985  11042        Erik 		break;
   1986  11042        Erik 	case IPPROTO_ESP:
   1987  11042        Erik 	case IPPROTO_AH:
   1988  11042        Erik 		break;
   1989  11042        Erik 	case IPPROTO_ENCAP:
   1990  11042        Erik 		if ((uchar_t *)ipha + hdr_length + sizeof (ipha_t) >
   1991  11042        Erik 		    mp->b_wptr)
   1992  11042        Erik 			goto truncated;
   1993  11042        Erik 		break;
   1994  11042        Erik 	default:
   1995  11042        Erik 		break;
   1996  11042        Erik 	}
   1997  11042        Erik 
   1998  11042        Erik 	return (B_TRUE);
   1999  11042        Erik 
   2000  11042        Erik discard_pkt:
   2001  11042        Erik 	/* Bogus ICMP error. */
   2002  11042        Erik 	BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   2003  11042        Erik 	return (B_FALSE);
   2004  11042        Erik 
   2005  11042        Erik truncated:
   2006  11042        Erik 	/* We pulled up everthing already. Must be truncated */
   2007  11042        Erik 	BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
   2008  11042        Erik 	ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
   2009  11042        Erik 	return (B_FALSE);
   2010   2252    priyanka }
   2011   2252    priyanka 
   2012      0      stevel /* Table from RFC 1191 */
   2013      0      stevel static int icmp_frag_size_table[] =
   2014      0      stevel { 32000, 17914, 8166, 4352, 2002, 1496, 1006, 508, 296, 68 };
   2015      0      stevel 
   2016      0      stevel /*
   2017      0      stevel  * Process received ICMP Packet too big.
   2018  11042        Erik  * Just handles the DCE create/update, including using the above table of
   2019  11042        Erik  * PMTU guesses. The caller is responsible for validating the packet before
   2020  11042        Erik  * passing it in and also to fanout the ICMP error to any matching transport
   2021  11042        Erik  * conns. Assumes the message has been fully pulled up and verified.
   2022  11042        Erik  *
   2023  11042        Erik  * Before getting here, the caller has called icmp_inbound_verify_v4()
   2024  11042        Erik  * that should have verified with ULP to prevent undoing the changes we're
   2025  11042        Erik  * going to make to DCE. For example, TCP might have verified that the packet
   2026  11042        Erik  * which generated error is in the send window.
   2027  11042        Erik  *
   2028  11042        Erik  * In some cases modified this MTU in the ICMP header packet; the caller
   2029  11042        Erik  * should pass to the matching ULP after this returns.
   2030  11042        Erik  */
   2031  11042        Erik static void
   2032  11042        Erik icmp_inbound_too_big_v4(icmph_t *icmph, ip_recv_attr_t *ira)
   2033  11042        Erik {
   2034  11042        Erik 	dce_t		*dce;
   2035  11042        Erik 	int		old_mtu;
   2036  11042        Erik 	int		mtu, orig_mtu;
   2037  11042        Erik 	ipaddr_t	dst;
   2038  11042        Erik 	boolean_t	disable_pmtud;
   2039  11042        Erik 	ill_t		*ill = ira->ira_ill;
   2040  11042        Erik 	ip_stack_t	*ipst = ill->ill_ipst;
   2041  11042        Erik 	uint_t		hdr_length;
   2042  11042        Erik 	ipha_t		*ipha;
   2043  11042        Erik 
   2044  11042        Erik 	/* Caller already pulled up everything. */
   2045  11042        Erik 	ipha = (ipha_t *)&icmph[1];
   2046      0      stevel 	ASSERT(icmph->icmph_type == ICMP_DEST_UNREACHABLE &&
   2047      0      stevel 	    icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED);
   2048   3284    apersson 	ASSERT(ill != NULL);
   2049      0      stevel 
   2050      0      stevel 	hdr_length = IPH_HDR_LENGTH(ipha);
   2051      0      stevel 
   2052  11042        Erik 	/*
   2053  11042        Erik 	 * We handle path MTU for source routed packets since the DCE
   2054  11042        Erik 	 * is looked up using the final destination.
   2055  11042        Erik 	 */
   2056  11042        Erik 	dst = ip_get_dst(ipha);
   2057  11042        Erik 
   2058  11042        Erik 	dce = dce_lookup_and_add_v4(dst, ipst);
   2059  11042        Erik 	if (dce == NULL) {
   2060  11042        Erik 		/* Couldn't add a unique one - ENOMEM */
   2061  11042        Erik 		ip1dbg(("icmp_inbound_too_big_v4: no dce for 0x%x\n",
   2062  11042        Erik 		    ntohl(dst)));
   2063  11042        Erik 		return;
   2064      0      stevel 	}
   2065   8106    Kacheong 
   2066      0      stevel 	/* Check for MTU discovery advice as described in RFC 1191 */
   2067      0      stevel 	mtu = ntohs(icmph->icmph_du_mtu);
   2068   8106    Kacheong 	orig_mtu = mtu;
   2069   8106    Kacheong 	disable_pmtud = B_FALSE;
   2070   8106    Kacheong 
   2071  11042        Erik 	mutex_enter(&dce->dce_lock);
   2072  11042        Erik 	if (dce->dce_flags & DCEF_PMTU)
   2073  11042        Erik 		old_mtu = dce->dce_pmtu;
   2074  11042        Erik 	else
   2075  11042        Erik 		old_mtu = ill->ill_mtu;
   2076  11042        Erik 
   2077  11042        Erik 	if (icmph->icmph_du_zero != 0 || mtu < ipst->ips_ip_pmtu_min) {
   2078  11042        Erik 		uint32_t length;
   2079  11042        Erik 		int	i;
   2080  11042        Erik 
   2081  11042        Erik 		/*
   2082  11042        Erik 		 * Use the table from RFC 1191 to figure out
   2083  11042        Erik 		 * the next "plateau" based on the length in
   2084  11042        Erik 		 * the original IP packet.
   2085  11042        Erik 		 */
   2086  11042        Erik 		length = ntohs(ipha->ipha_length);
   2087  11042        Erik 		DTRACE_PROBE2(ip4__pmtu__guess, dce_t *, dce,
   2088  11042        Erik 		    uint32_t, length);
   2089  11042        Erik 		if (old_mtu <= length &&
   2090  11042        Erik 		    old_mtu >= length - hdr_length) {
   2091  11042        Erik 			/*
   2092  11042        Erik 			 * Handle broken BSD 4.2 systems that
   2093  11042        Erik 			 * return the wrong ipha_length in ICMP
   2094  11042        Erik 			 * errors.
   2095  11042        Erik 			 */
   2096  11042        Erik 			ip1dbg(("Wrong mtu: sent %d, dce %d\n",
   2097  11042        Erik 			    length, old_mtu));
   2098  11042        Erik 			length -= hdr_length;
   2099  11042        Erik 		}
   2100  11042        Erik 		for (i = 0; i < A_CNT(icmp_frag_size_table); i++) {
   2101  11042        Erik 			if (length > icmp_frag_size_table[i])
   2102  11042        Erik 				break;
   2103  11042        Erik 		}
   2104  11042        Erik 		if (i == A_CNT(icmp_frag_size_table)) {
   2105  11042        Erik 			/* Smaller than IP_MIN_MTU! */
   2106  11042        Erik 			ip1dbg(("Too big for packet size %d\n",
   2107  11042        Erik 			    length));
   2108  11042        Erik 			disable_pmtud = B_TRUE;
   2109  11042        Erik 			mtu = ipst->ips_ip_pmtu_min;
   2110  11042        Erik 		} else {
   2111  11042        Erik 			mtu = icmp_frag_size_table[i];
   2112  11042        Erik 			ip1dbg(("Calculated mtu %d, packet size %d, "
   2113  11042        Erik 			    "before %d\n", mtu, length, old_mtu));
   2114  11042        Erik 			if (mtu < ipst->ips_ip_pmtu_min) {
   2115  11042        Erik 				mtu = ipst->ips_ip_pmtu_min;
   2116   8106    Kacheong 				disable_pmtud = B_TRUE;
   2117  11042        Erik 			}
   2118  11042        Erik 		}
   2119  11042        Erik 	}
   2120  11042        Erik 	if (disable_pmtud)
   2121  11042        Erik 		dce->dce_flags |= DCEF_TOO_SMALL_PMTU;
   2122  11042        Erik 	else
   2123  11042        Erik 		dce->dce_flags &= ~DCEF_TOO_SMALL_PMTU;
   2124  11042        Erik 
   2125  11042        Erik 	dce->dce_pmtu = MIN(old_mtu, mtu);
   2126  11042        Erik 	/* Prepare to send the new max frag size for the ULP. */
   2127  11042        Erik 	icmph->icmph_du_zero = 0;
   2128  11042        Erik 	icmph->icmph_du_mtu =  htons((uint16_t)dce->dce_pmtu);
   2129  11042        Erik 	DTRACE_PROBE4(ip4__pmtu__change, icmph_t *, icmph, dce_t *,
   2130  11042        Erik 	    dce, int, orig_mtu, int, mtu);
   2131  11042        Erik 
   2132  11042        Erik 	/* We now have a PMTU for sure */
   2133  11042        Erik 	dce->dce_flags |= DCEF_PMTU;
   2134  11066      rafael 	dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
   2135  11042        Erik 	mutex_exit(&dce->dce_lock);
   2136  11042        Erik 	/*
   2137  11042        Erik 	 * After dropping the lock the new value is visible to everyone.
   2138  11042        Erik 	 * Then we bump the generation number so any cached values reinspect
   2139  11042        Erik 	 * the dce_t.
   2140  11042        Erik 	 */
   2141  11042        Erik 	dce_increment_generation(dce);
   2142  11042        Erik 	dce_refrele(dce);
   2143  11042        Erik }
   2144  11042        Erik 
   2145  11042        Erik /*
   2146  11042        Erik  * If the packet in error is Self-Encapsulated, icmp_inbound_error_fanout_v4
   2147      0      stevel  * calls this function.
   2148      0      stevel  */
   2149      0      stevel static mblk_t *
   2150  11042        Erik icmp_inbound_self_encap_error_v4(mblk_t *mp, ipha_t *ipha, ipha_t *in_ipha)
   2151  11042        Erik {
   2152      0      stevel 	int length;
   2153      0      stevel 
   2154      0      stevel 	ASSERT(mp->b_datap->db_type == M_DATA);
   2155      0      stevel 
   2156  11042        Erik 	/* icmp_inbound_v4 has already pulled up the whole error packet */
   2157  11042        Erik 	ASSERT(mp->b_cont == NULL);
   2158  11042        Erik 
   2159  11042        Erik 	/*
   2160  11042        Erik 	 * The length that we want to overlay is the inner header
   2161  11042        Erik 	 * and what follows it.
   2162  11042        Erik 	 */
   2163  11042        Erik 	length = msgdsize(mp) - ((uchar_t *)in_ipha - mp->b_rptr);
   2164  11042        Erik 
   2165  11042        Erik 	/*
   2166  11042        Erik 	 * Overlay the inner header and whatever follows it over the
   2167      0      stevel 	 * outer header.
   2168      0      stevel 	 */
   2169      0      stevel 	bcopy((uchar_t *)in_ipha, (uchar_t *)ipha, length);
   2170      0      stevel 
   2171  11042        Erik 	/* Adjust for what we removed */
   2172  11042        Erik 	mp->b_wptr -= (uchar_t *)in_ipha - (uchar_t *)ipha;
   2173      0      stevel 	return (mp);
   2174  10616   Sebastien }
   2175  10616   Sebastien 
   2176  10616   Sebastien /*
   2177      0      stevel  * Try to pass the ICMP message upstream in case the ULP cares.
   2178      0      stevel  *
   2179      0      stevel  * If the packet that caused the ICMP error is secure, we send
   2180      0      stevel  * it to AH/ESP to make sure that the attached packet has a
   2181      0      stevel  * valid association. ipha in the code below points to the
   2182      0      stevel  * IP header of the packet that caused the error.
   2183      0      stevel  *
   2184  10616   Sebastien  * For IPsec cases, we let the next-layer-up (which has access to
   2185  10616   Sebastien  * cached policy on the conn_t, or can query the SPD directly)
   2186  10616   Sebastien  * subtract out any IPsec overhead if they must.  We therefore make no
   2187  10616   Sebastien  * adjustments here for IPsec overhead.
   2188      0      stevel  *
   2189      0      stevel  * IFN could have been generated locally or by some router.
   2190      0      stevel  *
   2191  11042        Erik  * LOCAL : ire_send_wire (before calling ipsec_out_process) can call
   2192  11042        Erik  * icmp_frag_needed/icmp_pkt2big_v6 to generated a local IFN.
   2193      0      stevel  *	    This happens because IP adjusted its value of MTU on an
   2194      0      stevel  *	    earlier IFN message and could not tell the upper layer,
   2195      0      stevel  *	    the new adjusted value of MTU e.g. Packet was encrypted
   2196      0      stevel  *	    or there was not enough information to fanout to upper
   2197  11042        Erik  *	    layers. Thus on the next outbound datagram, ire_send_wire
   2198   4987      danmcd  *	    generates the IFN, where IPsec processing has *not* been
   2199      0      stevel  *	    done.
   2200      0      stevel  *
   2201  11042        Erik  *	    Note that we retain ixa_fragsize across IPsec thus once
   2202  11042        Erik  *	    we have picking ixa_fragsize and entered ipsec_out_process we do
   2203  11042        Erik  *	    no change the fragsize even if the path MTU changes before
   2204  11042        Erik  *	    we reach ip_output_post_ipsec.
   2205  11042        Erik  *
   2206  11042        Erik  *	    In the local case, IRAF_LOOPBACK will be set indicating
   2207      0      stevel  *	    that IFN was generated locally.
   2208      0      stevel  *
   2209      0      stevel  * ROUTER : IFN could be secure or non-secure.
   2210      0      stevel  *
   2211      0      stevel  *	    * SECURE : We use the IPSEC_IN to fanout to AH/ESP if the
   2212      0      stevel  *	      packet in error has AH/ESP headers to validate the AH/ESP
   2213      0      stevel  *	      headers. AH/ESP will verify whether there is a valid SA or
   2214      0      stevel  *	      not and send it back. We will fanout again if we have more
   2215      0      stevel  *	      data in the packet.
   2216      0      stevel  *
   2217      0      stevel  *	      If the packet in error does not have AH/ESP, we handle it
   2218      0      stevel  *	      like any other case.
   2219      0      stevel  *
   2220  11042        Erik  *	    * NON_SECURE : If the packet in error has AH/ESP headers, we send it
   2221  11042        Erik  *	      up to AH/ESP for validation. AH/ESP will verify whether there is a
   2222      0      stevel  *	      valid SA or not and send it back. We will fanout again if
   2223      0      stevel  *	      we have more data in the packet.
   2224      0      stevel  *
   2225      0      stevel  *	      If the packet in error does not have AH/ESP, we handle it
   2226      0      stevel  *	      like any other case.
   2227  11042        Erik  *
   2228  11042        Erik  * The caller must have called icmp_inbound_verify_v4.
   2229  11042        Erik  */
   2230  11042        Erik static void
   2231  11042        Erik icmp_inbound_error_fanout_v4(mblk_t *mp, icmph_t *icmph, ip_recv_attr_t *ira)
   2232  11042        Erik {
   2233  11042        Erik 	uint16_t	*up;	/* Pointer to ports in ULP header */
   2234  11042        Erik 	uint32_t	ports;	/* reversed ports for fanout */
   2235  11042        Erik 	ipha_t		ripha;	/* With reversed addresses */
   2236  11042        Erik 	ipha_t		*ipha;  /* Inner IP header */
   2237  11042        Erik 	uint_t		hdr_length; /* Inner IP header length */
   2238  11042        Erik 	tcpha_t		*tcpha;
   2239  11042        Erik 	conn_t		*connp;
   2240  11042        Erik 	ill_t		*ill = ira->ira_ill;
   2241  11042        Erik 	ip_stack_t	*ipst = ill->ill_ipst;
   2242  11042        Erik 	ipsec_stack_t	*ipss = ipst->ips_netstack->netstack_ipsec;
   2243  11042        Erik 	ill_t		*rill = ira->ira_rill;
   2244  11042        Erik 
   2245  11042        Erik 	/* Caller already pulled up everything. */
   2246  11042        Erik 	ipha = (ipha_t *)&icmph[1];
   2247  11042        Erik 	ASSERT((uchar_t *)&ipha[1] <= mp->b_wptr);
   2248  11042        Erik 	ASSERT(mp->b_cont == NULL);
   2249  11042        Erik 
   2250  11042        Erik 	hdr_length = IPH_HDR_LENGTH(ipha);
   2251  11042        Erik 	ira->ira_protocol = ipha->ipha_protocol;
   2252      0      stevel 
   2253  10616   Sebastien 	/*
   2254  10616   Sebastien 	 * We need a separate IP header with the source and destination
   2255  10616   Sebastien 	 * addresses reversed to do fanout/classification because the ipha in
   2256  10616   Sebastien 	 * the ICMP error is in the form we sent it out.
   2257  10616   Sebastien 	 */
   2258  10616   Sebastien 	ripha.ipha_src = ipha->ipha_dst;
   2259  10616   Sebastien 	ripha.ipha_dst = ipha->ipha_src;
   2260  10616   Sebastien 	ripha.ipha_protocol = ipha->ipha_protocol;
   2261  10616   Sebastien 	ripha.ipha_version_and_hdr_length = ipha->ipha_version_and_hdr_length;
   2262  10616   Sebastien 
   2263  11042        Erik 	ip2dbg(("icmp_inbound_error_v4: proto %d %x to %x: %d/%d\n",
   2264  10616   Sebastien 	    ripha.ipha_protocol, ntohl(ipha->ipha_src),
   2265  10616   Sebastien 	    ntohl(ipha->ipha_dst),
   2266  10616   Sebastien 	    icmph->icmph_type, icmph->icmph_code));
   2267  10616   Sebastien 
   2268      0      stevel 	switch (ipha->ipha_protocol) {
   2269      0      stevel 	case IPPROTO_UDP:
   2270      0      stevel 		up = (uint16_t *)((uchar_t *)ipha + hdr_length);
   2271      0      stevel 
   2272  10616   Sebastien 		/* Attempt to find a client stream based on port. */
   2273  11042        Erik 		ip2dbg(("icmp_inbound_error_v4: UDP ports %d to %d\n",
   2274  10616   Sebastien 		    ntohs(up[0]), ntohs(up[1])));
   2275      0      stevel 
   2276  11042        Erik 		/* Note that we send error to all matches. */
   2277  11042        Erik 		ira->ira_flags |= IRAF_ICMP_ERROR;
   2278  11042        Erik 		ip_fanout_udp_multi_v4(mp, &ripha, up[0], up[1], ira);
   2279  11042        Erik 		ira->ira_flags &= ~IRAF_ICMP_ERROR;
   2280      0      stevel 		return;
   2281      0      stevel 
   2282      0      stevel 	case IPPROTO_TCP:
   2283      0      stevel 		/*
   2284      0      stevel 		 * Find a TCP client stream for this packet.
   2285      0      stevel 		 * Note that we do a reverse lookup since the header is
   2286      0      stevel 		 * in the form we sent it out.
   2287      0      stevel 		 */
   2288  11042        Erik 		tcpha = (tcpha_t *)((uchar_t *)ipha + hdr_length);
   2289  11042        Erik 		connp = ipcl_tcp_lookup_reversed_ipv4(ipha, tcpha, TCPS_LISTEN,
   2290   3448    dh155122 		    ipst);
   2291   3284    apersson 		if (connp == NULL)
   2292   3284    apersson 			goto discard_pkt;
   2293      0      stevel 
   2294  11042        Erik 		if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) ||
   2295  11042        Erik 		    (ira->ira_flags & IRAF_IPSEC_SECURE)) {
   2296  11042        Erik 			mp = ipsec_check_inbound_policy(mp, connp,
   2297  11042        Erik 			    ipha, NULL, ira);
   2298  11042        Erik 			if (mp == NULL) {
   2299  11042        Erik 				BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   2300  11042        Erik 				/* Note that mp is NULL */
   2301  11042        Erik 				ip_drop_input("ipIfStatsInDiscards", mp, ill);
   2302  11042        Erik 				CONN_DEC_REF(connp);
   2303  11042        Erik 				return;
   2304  11042        Erik 			}
   2305  11042        Erik 		}
   2306  11042        Erik 
   2307  11042        Erik 		ira->ira_flags |= IRAF_ICMP_ERROR;
   2308  11042        Erik 		ira->ira_ill = ira->ira_rill = NULL;
   2309  11042        Erik 		if (IPCL_IS_TCP(connp)) {
   2310  11042        Erik 			SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
   2311  11042        Erik 			    connp->conn_recvicmp, connp, ira, SQ_FILL,
   2312  11042        Erik 			    SQTAG_TCP_INPUT_ICMP_ERR);
   2313  11042        Erik 		} else {
   2314  11042        Erik 			/* Not TCP; must be SOCK_RAW, IPPROTO_TCP */
   2315  11042        Erik 			(connp->conn_recv)(connp, mp, NULL, ira);
   2316  11042        Erik 			CONN_DEC_REF(connp);
   2317  11042        Erik 		}
   2318  11042        Erik 		ira->ira_ill = ill;
   2319  11042        Erik 		ira->ira_rill = rill;
   2320  11042        Erik 		ira->ira_flags &= ~IRAF_ICMP_ERROR;
   2321      0      stevel 		return;
   2322      0      stevel 
   2323      0      stevel 	case IPPROTO_SCTP:
   2324      0      stevel 		up = (uint16_t *)((uchar_t *)ipha + hdr_length);
   2325  10616   Sebastien 		/* Find a SCTP client stream for this packet. */
   2326      0      stevel 		((uint16_t *)&ports)[0] = up[1];
   2327      0      stevel 		((uint16_t *)&ports)[1] = up[0];
   2328      0      stevel 
   2329  11042        Erik 		ira->ira_flags |= IRAF_ICMP_ERROR;
   2330  11042        Erik 		ip_fanout_sctp(mp, &ripha, NULL, ports, ira);
   2331  11042        Erik 		ira->ira_flags &= ~IRAF_ICMP_ERROR;
   2332      0      stevel 		return;
   2333      0      stevel 
   2334      0      stevel 	case IPPROTO_ESP:
   2335  11042        Erik 	case IPPROTO_AH:
   2336  11042        Erik 		if (!ipsec_loaded(ipss)) {
   2337  11042        Erik 			ip_proto_not_sup(mp, ira);
   2338  11042        Erik 			return;
   2339  11042        Erik 		}
   2340  11042        Erik 
   2341  11042        Erik 		if (ipha->ipha_protocol == IPPROTO_ESP)
   2342  11042        Erik 			mp = ipsecesp_icmp_error(mp, ira);
   2343  11042        Erik 		else
   2344  11042        Erik 			mp = ipsecah_icmp_error(mp, ira);
   2345  11042        Erik 		if (mp == NULL)
   2346  11042        Erik 			return;
   2347  11042        Erik 
   2348  11042        Erik 		/* Just in case ipsec didn't preserve the NULL b_cont */
   2349  11042        Erik 		if (mp->b_cont != NULL) {
   2350  11042        Erik 			if (!pullupmsg(mp, -1))
   2351  11042        Erik 				goto discard_pkt;
   2352  11042        Erik 		}
   2353  11042        Erik 
   2354  11042        Erik 		/*
   2355  11042        Erik 		 * Note that ira_pktlen and ira_ip_hdr_length are no longer
   2356  11042        Erik 		 * correct, but we don't use them any more here.
   2357  11042        Erik 		 *
   2358  11042        Erik 		 * If succesful, the mp has been modified to not include
   2359  11042        Erik 		 * the ESP/AH header so we can fanout to the ULP's icmp
   2360  11042        Erik 		 * error handler.
   2361  11042        Erik 		 */
   2362  11042        Erik 		if (mp->b_wptr - mp->b_rptr < IP_SIMPLE_HDR_LENGTH)
   2363  11042        Erik 			goto truncated;
   2364  11042        Erik 
   2365  11042        Erik 		/* Verify the modified message before any further processes. */
   2366  11042        Erik 		ipha = (ipha_t *)mp->b_rptr;
   2367  11042        Erik 		hdr_length = IPH_HDR_LENGTH(ipha);
   2368  11042        Erik 		icmph = (icmph_t *)&mp->b_rptr[hdr_length];
   2369  11042        Erik 		if (!icmp_inbound_verify_v4(mp, icmph, ira)) {
   2370  11042        Erik 			freemsg(mp);
   2371  11042        Erik 			return;
   2372  11042        Erik 		}
   2373  11042        Erik 
   2374  11042        Erik 		icmp_inbound_error_fanout_v4(mp, icmph, ira);
   2375  11042        Erik 		return;
   2376  11042        Erik 
   2377  11042        Erik 	case IPPROTO_ENCAP: {
   2378  11042        Erik 		/* Look for self-encapsulated packets that caused an error */
   2379  11042        Erik 		ipha_t *in_ipha;
   2380  11042        Erik 
   2381  11042        Erik 		/*
   2382  11042        Erik 		 * Caller has verified that length has to be
   2383  11042        Erik 		 * at least the size of IP header.
   2384  11042        Erik 		 */
   2385  11042        Erik 		ASSERT(hdr_length >= sizeof (ipha_t));
   2386  11042        Erik 		/*
   2387  11042        Erik 		 * Check the sanity of the inner IP header like
   2388  11042        Erik 		 * we did for the outer header.
   2389  11042        Erik 		 */
   2390  11042        Erik 		in_ipha = (ipha_t *)((uchar_t *)ipha + hdr_length);
   2391  11042        Erik 		if ((IPH_HDR_VERSION(in_ipha) != IPV4_VERSION)) {
   2392  11042        Erik 			goto discard_pkt;
   2393  11042        Erik 		}
   2394  11042        Erik 		if (IPH_HDR_LENGTH(in_ipha) < sizeof (ipha_t)) {
   2395  11042        Erik 			goto discard_pkt;
   2396  11042        Erik 		}
   2397  11042        Erik 		/* Check for Self-encapsulated tunnels */
   2398  11042        Erik 		if (in_ipha->ipha_src == ipha->ipha_src &&
   2399  11042        Erik 		    in_ipha->ipha_dst == ipha->ipha_dst) {
   2400  11042        Erik 
   2401  11042        Erik 			mp = icmp_inbound_self_encap_error_v4(mp, ipha,
   2402  11042        Erik 			    in_ipha);
   2403  11042        Erik 			if (mp == NULL)
   2404  11042        Erik 				goto discard_pkt;
   2405  11042        Erik 
   2406  11042        Erik 			/*
   2407  11042        Erik 			 * Just in case self_encap didn't preserve the NULL
   2408  11042        Erik 			 * b_cont
   2409  11042        Erik 			 */
   2410  11042        Erik 			if (mp->b_cont != NULL) {
   2411  11042        Erik 				if (!pullupmsg(mp, -1))
   2412  11042        Erik 					goto discard_pkt;
   2413  11042        Erik 			}
   2414  11042        Erik 			/*
   2415  11042        Erik 			 * Note that ira_pktlen and ira_ip_hdr_length are no
   2416  11042        Erik 			 * longer correct, but we don't use them any more here.
   2417  11042        Erik 			 */
   2418  11042        Erik 			if (mp->b_wptr - mp->b_rptr < IP_SIMPLE_HDR_LENGTH)
   2419  11042        Erik 				goto truncated;
   2420  11042        Erik 
   2421  11042        Erik 			/*
   2422  11042        Erik 			 * Verify the modified message before any further
   2423  11042        Erik 			 * processes.
   2424  11042        Erik 			 */
   2425  11042        Erik 			ipha = (ipha_t *)mp->b_rptr;
   2426  11042        Erik 			hdr_length = IPH_HDR_LENGTH(ipha);
   2427  11042        Erik 			icmph = (icmph_t *)&mp->b_rptr[hdr_length];
   2428  11042        Erik 			if (!icmp_inbound_verify_v4(mp, icmph, ira)) {
   2429      0      stevel 				freemsg(mp);
   2430      0      stevel 				return;
   2431      0      stevel 			}
   2432  11042        Erik 
   2433  11042        Erik 			/*
   2434  11042        Erik 			 * The packet in error is self-encapsualted.
   2435  11042        Erik 			 * And we are finding it further encapsulated
   2436  11042        Erik 			 * which we could not have possibly generated.
   2437  11042        Erik 			 */
   2438  11042        Erik 			if (ipha->ipha_protocol == IPPROTO_ENCAP) {
   2439  11042        Erik 				goto discard_pkt;
   2440  11042        Erik 			}
   2441  11042        Erik 			icmp_inbound_error_fanout_v4(mp, icmph, ira);
   2442  11042        Erik 			return;
   2443  11042        Erik 		}
   2444  11042        Erik 		/* No self-encapsulated */
   2445  11042        Erik 		/* FALLTHRU */
   2446  11042        Erik 	}
   2447  10616   Sebastien 	case IPPROTO_IPV6:
   2448  11042        Erik 		if ((connp = ipcl_iptun_classify_v4(&ripha.ipha_src,
   2449  11042        Erik 		    &ripha.ipha_dst, ipst)) != NULL) {
   2450  11042        Erik 			ira->ira_flags |= IRAF_ICMP_ERROR;
   2451  11042        Erik 			connp->conn_recvicmp(connp, mp, NULL, ira);
   2452  11042        Erik 			CONN_DEC_REF(connp);
   2453  11042        Erik 			ira->ira_flags &= ~IRAF_ICMP_ERROR;
   2454  11042        Erik 			return;
   2455  11042        Erik 		}
   2456  10616   Sebastien 		/*
   2457  10616   Sebastien 		 * No IP tunnel is interested, fallthrough and see
   2458  10616   Sebastien 		 * if a raw socket will want it.
   2459  10616   Sebastien 		 */
   2460  10616   Sebastien 		/* FALLTHRU */
   2461  10616   Sebastien 	default:
   2462  11042        Erik 		ira->ira_flags |= IRAF_ICMP_ERROR;
   2463  11042        Erik 		ip_fanout_proto_v4(mp, &ripha, ira);
   2464  11042        Erik 		ira->ira_flags &= ~IRAF_ICMP_ERROR;
   2465      0      stevel 		return;
   2466      0      stevel 	}
   2467      0      stevel 	/* NOTREACHED */
   2468   3284    apersson discard_pkt:
   2469   3284    apersson 	BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
   2470  11042        Erik 	ip1dbg(("icmp_inbound_error_fanout_v4: drop pkt\n"));
   2471  11042        Erik 	ip_drop_input("ipIfStatsInDiscards", mp, ill);
   2472  11042        Erik 	freemsg(mp);
   2473  11042        Erik 	return;
   2474  11042        Erik 
   2475  11042        Erik truncated:
   2476  11042        Erik 	/* We pulled up everthing already. Must be truncated */
   2477  11042        Erik 	BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
   2478  11042        Erik 	ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
   2479  11042        Erik 	freemsg(mp);
   2480      0      stevel }
   2481      0      stevel 
   2482      0      stevel /*
   2483      0      stevel  * Common IP options parser.
   2484      0      stevel  *
   2485      0      stevel  * Setup routine: fill in *optp with options-parsing state, then
   2486      0      stevel  * tail-call ipoptp_next to return the first option.
   2487      0      stevel  */
   2488      0      stevel uint8_t
   2489      0      stevel ipoptp_first(ipoptp_t *optp, ipha_t *ipha)
   2490      0      stevel {
   2491      0      stevel 	uint32_t totallen; /* total length of all options */
   2492      0      stevel 
   2493      0      stevel 	totallen = ipha->ipha_version_and_hdr_length -
   2494      0      stevel 	    (uint8_t)((IP_VERSION << 4) + IP_SIMPLE_HDR_LENGTH_IN_WORDS);
   2495      0      stevel 	totallen <<= 2;
   2496      0      stevel 	optp->ipoptp_next = (uint8_t *)(&ipha[1]);
   2497  11042        Erik 	optp->ipoptp_end = optp->ipoptp_next + totallen;
   2498  11042        Erik 	optp->ipoptp_flags = 0;
   2499  11042        Erik 	return (ipoptp_next(optp));
   2500  11042        Erik }
   2501  11042        Erik 
   2502  11042        Erik /* Like above but without an ipha_t */
   2503  11042        Erik uint8_t
   2504  11042        Erik ipoptp_first2(ipoptp_t *optp, uint32_t totallen, uint8_t *opt)
   2505  11042        Erik {
   2506  11042        Erik 	optp->ipoptp_next = opt;
   2507      0      stevel 	optp->ipoptp_end = optp->ipoptp_next + totallen;
   2508      0      stevel 	optp->ipoptp_flags = 0;
   2509      0      stevel 	return (ipoptp_next(optp));
   2510      0      stevel }
   2511      0      stevel 
   2512      0      stevel /*
   2513      0      stevel  * Common IP options parser: extract next option.
   2514      0      stevel  */
   2515      0      stevel uint8_t
   2516      0      stevel ipoptp_next(ipoptp_t *optp)
   2517      0      stevel {
   2518      0      stevel 	uint8_t *end = optp->ipoptp_end;
   2519      0      stevel 	uint8_t *cur = optp->ipoptp_next;
   2520      0      stevel 	uint8_t opt, len, pointer;
   2521      0      stevel 
   2522      0      stevel 	/*
   2523      0      stevel 	 * If cur > end already, then the ipoptp_end or ipoptp_next pointer
   2524      0      stevel 	 * has been corrupted.
   2525      0      stevel 	 */
   2526      0      stevel 	ASSERT(cur <= end);
   2527      0      stevel 
   2528      0      stevel 	if (cur == end)
   2529      0      stevel 		return (IPOPT_EOL);
   2530      0      stevel 
   2531      0      stevel 	opt = cur[IPOPT_OPTVAL];
   2532      0      stevel 
   2533      0      stevel 	/*
   2534      0      stevel 	 * Skip any NOP options.
   2535      0      stevel 	 */
   2536      0      stevel 	while (opt == IPOPT_NOP) {
   2537      0      stevel 		cur++;
   2538      0      stevel 		if (cur == end)
   2539      0      stevel 			return (IPOPT_EOL);
   2540      0      stevel 		opt = cur[IPOPT_OPTVAL];
   2541      0      stevel 	}
   2542      0      stevel 
   2543      0      stevel 	if (opt == IPOPT_EOL)
   2544      0      stevel 		return (IPOPT_EOL);
   2545      0      stevel 
   2546      0      stevel 	/*
   2547      0      stevel 	 * Option requiring a length.
   2548      0      stevel 	 */
   2549      0      stevel 	if ((cur + 1) >= end) {
   2550      0      stevel 		optp->ipoptp_flags |= IPOPTP_ERROR;
   2551      0      stevel 		return (IPOPT_EOL);
   2552      0      stevel 	}
   2553      0      stevel 	len = cur[IPOPT_OLEN];
   2554      0      stevel 	if (len < 2) {
   2555      0      stevel 		optp->ipoptp_flags |= IPOPTP_ERROR;
   2556      0      stevel 		return (IPOPT_EOL);
   2557      0      stevel 	}
   2558      0      stevel 	optp->ipoptp_cur = cur;
   2559      0      stevel 	optp->ipoptp_len = len;
   2560      0      stevel 	optp->ipoptp_next = cur + len;
   2561      0      stevel 	if (cur + len > end) {
   2562      0      stevel 		optp->ipoptp_flags |= IPOPTP_ERROR;
   2563      0      stevel 		return (IPOPT_EOL);
   2564      0      stevel 	}
   2565      0      stevel 
   2566      0      stevel 	/*
   2567      0      stevel 	 * For the options which require a pointer field, make sure
   2568      0      stevel 	 * its there, and make sure it points to either something
   2569      0      stevel 	 * inside this option, or the end of the option.
   2570      0      stevel 	 */
   2571      0      stevel 	switch (opt) {
   2572      0      stevel 	case IPOPT_RR:
   2573      0      stevel 	case IPOPT_TS:
   2574      0      stevel 	case IPOPT_LSRR:
   2575      0      stevel 	case IPOPT_SSRR:
   2576      0      stevel 		if (len <= IPOPT_OFFSET) {
   2577      0      stevel 			optp->ipoptp_flags |= IPOPTP_ERROR;
   2578      0      stevel 			return (opt);
   2579      0      stevel 		}
   2580      0      stevel 		pointer = cur[IPOPT_OFFSET];
   2581      0      stevel 		if (pointer - 1 > len) {
   2582      0      stevel 			optp->ipoptp_flags |= IPOPTP_ERROR;
   2583      0      stevel 			return (opt);
   2584      0      stevel 		}
   2585      0      stevel 		break;
   2586      0      stevel 	}
   2587      0      stevel 
   2588      0      stevel 	/*
   2589      0      stevel 	 * Sanity check the pointer field based on the type of the
   2590      0      stevel 	 * option.
   2591      0      stevel 	 */
   2592      0      stevel 	switch (opt) {
   2593      0      stevel 	case IPOPT_RR:
   2594      0      stevel 	case IPOPT_SSRR:
   2595      0      stevel 	case IPOPT_LSRR:
   2596      0      stevel 		if (pointer < IPOPT_MINOFF_SR)
   2597      0      stevel 			optp->ipoptp_flags |= IPOPTP_ERROR;
   2598      0      stevel 		break;
   2599      0      stevel 	case IPOPT_TS:
   2600      0      stevel 		if (pointer < IPOPT_MINOFF_IT)
   2601      0      stevel 			optp->ipoptp_flags |= IPOPTP_ERROR;
   2602      0      stevel 		/*
   2603      0      stevel 		 * Note that the Internet Timestamp option also
   2604      0      stevel 		 * contains two four bit fields (the Overflow field,
   2605      0      stevel 		 * and the Flag field), which follow the pointer
   2606      0      stevel 		 * field.  We don't need to check that these fields
   2607      0      stevel 		 * fall within the length of the option because this
   2608      0      stevel 		 * was implicitely done above.  We've checked that the
   2609      0      stevel 		 * pointer value is at least IPOPT_MINOFF_IT, and that
   2610      0      stevel 		 * it falls within the option.  Since IPOPT_MINOFF_IT >
   2611      0      stevel 		 * IPOPT_POS_OV_FLG, we don't need the explicit check.
   2612      0      stevel 		 */
   2613      0      stevel 		ASSERT(len > IPOPT_POS_OV_FLG);
   2614      0      stevel 		break;
   2615      0      stevel 	}
   2616      0      stevel 
   2617      0      stevel 	return (opt);
   2618   1676         jpk }
   2619   1676         jpk 
   2620   1676         jpk /*
   2621   1676         jpk  * Use the outgoing IP header to create an IP_OPTIONS option the way
   2622   1676         jpk  * it was passed down from the application.
   2623  11042        Erik  *
   2624  11042        Erik  * This is compatible with BSD in that it returns
   2625  11042        Erik  * the reverse source route with the final destination
   2626  11042        Erik  * as the last entry. The first 4 bytes of the option
   2627  11042        Erik  * will contain the final destination.
   2628  11042        Erik  */
   2629  11042        Erik int
   2630  11042        Erik ip_opt_get_user(conn_t *connp, uchar_t *buf)
   2631   1676         jpk {
   2632   1676         jpk 	ipoptp_t	opts;
   2633  11042        Erik 	uchar_t		*opt;
   2634   1676         jpk 	uint8_t		optval;
   2635   1676         jpk 	uint8_t		optlen;
   2636   1676         jpk 	uint32_t	len = 0;
   2637  11042        Erik 	uchar_t		*buf1 = buf;
   2638  11042        Erik 	uint32_t	totallen;
   2639  11042        Erik 	ipaddr_t	dst;
   2640  11042        Erik 	ip_pkt_t	*ipp = &connp->conn_xmit_ipp;
   2641  11042        Erik 
   2642  11042        Erik 	if (!(ipp->ipp_fields & IPPF_IPV4_OPTIONS))
   2643  11042        Erik 		return (0);
   2644  11042        Erik 
   2645  11042        Erik 	totallen = ipp->ipp_ipv4_options_len;
   2646  11042        Erik 	if (totallen & 0x3)
   2647  11042        Erik 		return (0);
   2648   1676         jpk 
   2649   1676         jpk 	buf += IP_ADDR_LEN;	/* Leave room for final destination */
   2650   1676         jpk 	len += IP_ADDR_LEN;
   2651   1676         jpk 	bzero(buf1, IP_ADDR_LEN);
   2652   1676         jpk 
   2653  11042        Erik 	dst = connp->conn_faddr_v4;
   2654  11042        Erik 
   2655  11042        Erik 	for (optval = ipoptp_first2(&opts, totallen, ipp->ipp_ipv4_options);
   2656   1676         jpk 	    optval != IPOPT_EOL;
   2657   1676         jpk 	    optval = ipoptp_next(&opts)) {
   2658   1676         jpk 		int	off;
   2659   1676         jpk 
   2660   1676         jpk 		opt = opts.ipoptp_cur;
   2661  11042        Erik 		if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
   2662  11042        Erik 			break;
   2663  11042        Erik 		}
   2664   1676         jpk 		optlen = opts.ipoptp_len;
   2665  11042        Erik 
   2666   1676         jpk 		switch (optval) {
   2667   1676         jpk 		case IPOPT_SSRR:
   2668   1676         jpk 		case IPOPT_LSRR:
   2669   1676         jpk 
   2670   1676         jpk 			/*
   2671  11042        Erik 			 * Insert destination as the first entry in the source
   2672   1676         jpk 			 * route and move down the entries on step.
   2673   1676         jpk 			 * The last entry gets placed at buf1.
   2674   1676         jpk 			 */
   2675   1676         jpk 			buf[IPOPT_OPTVAL] = optval;
   2676   1676         jpk 			buf[IPOPT_OLEN] = optlen;
   2677   1676         jpk 			buf[IPOPT_OFFSET] = optlen;
   2678   1676         jpk 
   2679   1676         jpk 			off = optlen - IP_ADDR_LEN;
   2680   1676         jpk 			if (off < 0) {
   2681   1676         jpk 				/* No entries in source route */
   2682   1676         jpk 				break;
   2683   1676         jpk 			}
   2684  11042        Erik 			/* Last entry in source route if not already set */
   2685  11042        Erik 			if (dst == INADDR_ANY)
   2686  11042        Erik 				bcopy(opt + off, buf1, IP_ADDR_LEN);
   2687   1676         jpk 			off -= IP_ADDR_LEN;
   2688   1676         jpk 
   2689   1676         jpk 			while (off > 0) {
   2690   1676         jpk 				bcopy(opt + off,
   2691   1676         jpk 				    buf + off + IP_ADDR_LEN,
   2692   1676         jpk 				    IP_ADDR_LEN);
   2693   1676         jpk 				off -= IP_ADDR_LEN;
   2694   1676         jpk 			}
   2695   1676         jpk 			/* ipha_dst into first slot */
   2696  11042        Erik 			bcopy(&dst, buf + off + IP_ADDR_LEN,
   2697   1676         jpk 			    IP_ADDR_LEN);
   2698   1676         jpk 			buf += optlen;
   2699   1676         jpk 			len += optlen;
   2700   1676         jpk 			break;
   2701   1676         jpk 
   2702   1676         jpk 		default:
   2703   1676         jpk 			bcopy(opt, buf, optlen);
   2704   1676         jpk 			buf += optlen;
   2705   1676         jpk 			len += optlen;
   2706   1676         jpk 			break;
   2707   1676         jpk 		}
   2708   1676         jpk 	}
   2709   1676         jpk done:
   2710   1676         jpk 	/* Pad the resulting options */
   2711   1676         jpk 	while (len & 0x3) {
   2712   1676         jpk 		*buf++ = IPOPT_EOL;
   2713   1676         jpk 		len++;
   2714   1676         jpk 	}
   2715   1676         jpk 	return (len);
   2716      0      stevel }
   2717      0      stevel 
   2718      0      stevel /*
   2719      0      stevel  * Update any record route or timestamp options to include this host.
   2720      0      stevel  * Reverse any source route option.
   2721      0      stevel  * This routine assumes that the options are well formed i.e. that they
   2722      0      stevel  * have already been checked.
   2723      0      stevel  */
   2724      0      stevel static void
   2725      0      stevel icmp_options_update(ipha_t *ipha)
   2726      0      stevel {
   2727      0      stevel 	ipoptp_t	opts;
   2728      0      stevel 	uchar_t		*opt;
   2729      0      stevel 	uint8_t		optval;
   2730      0      stevel 	ipaddr_t	src;		/* Our local address */
   2731      0      stevel 	ipaddr_t	dst;
   2732      0      stevel 
   2733      0      stevel 	ip2dbg(("icmp_options_update\n"));
   2734      0      stevel 	src = ipha->ipha_src;
   2735      0      stevel 	dst = ipha->ipha_dst;
   2736      0      stevel 
   2737      0      stevel 	for (optval = ipoptp_first(&opts, ipha);
   2738      0      stevel 	    optval != IPOPT_EOL;
   2739      0      stevel 	    optval = ipoptp_next(&opts)) {
   2740      0      stevel 		ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0);
   2741      0      stevel 		opt = opts.ipoptp_cur;
   2742      0      stevel 		ip2dbg(("icmp_options_update: opt %d, len %d\n",
   2743      0      stevel 		    optval, opts.ipoptp_len));
   2744      0      stevel 		switch (optval) {
   2745      0      stevel 			int off1, off2;
   2746      0      stevel 		case IPOPT_SSRR:
   2747      0      stevel 		case IPOPT_LSRR:
   2748      0      stevel 			/*
   2749      0      stevel 			 * Reverse the source route.  The first entry
   2750      0      stevel 			 * should be the next to last one in the current
   2751      0      stevel 			 * source route (the last entry is our address).
   2752      0      stevel 			 * The last entry should be the final destination.
   2753      0      stevel 			 */
   2754      0      stevel 			off1 = IPOPT_MINOFF_SR - 1;
   2755      0      stevel 			off2 = opt[IPOPT_OFFSET] - IP_ADDR_LEN - 1;
   2756      0      stevel 			if (off2 < 0) {
   2757      0      stevel 				/* No entries in source route */
   2758      0      stevel 				ip1dbg((
   2759      0      stevel 				    "icmp_options_update: bad src route\n"));
   2760      0      stevel 				break;
   2761      0      stevel 			}
   2762      0      stevel 			bcopy((char *)opt + off2, &dst, IP_ADDR_LEN);
   2763      0      stevel 			bcopy(&ipha->ipha_dst, (char *)opt + off2, IP_ADDR_LEN);
   2764      0      stevel 			bcopy(&dst, &ipha->ipha_dst, IP_ADDR_LEN);
   2765      0      stevel 			off2 -= IP_ADDR_LEN;
   2766      0      stevel 
   2767      0      stevel 			while (off1 < off2) {
   2768      0      stevel 				bcopy((char *)opt + off1, &src, IP_ADDR_LEN);
   2769      0      stevel 				bcopy((char *)opt + off2, (char *)opt + off1,
   2770      0      stevel 				    IP_ADDR_LEN);
   2771      0      stevel 				bcopy(&src, (char *)opt + off2, IP_ADDR_LEN);
   2772      0      stevel 				off1 += IP_ADDR_LEN;
   2773      0      stevel 				off2 -= IP_ADDR_LEN;
   2774      0      stevel 			}
   2775      0      stevel 			opt[IPOPT_OFFSET] = IPOPT_MINOFF_SR;
   2776      0      stevel 			break;
   2777      0      stevel 		}
   2778      0      stevel 	}
   2779      0      stevel }
   2780      0      stevel 
   2781      0      stevel /*
   2782      0      stevel  * Process received ICMP Redirect messages.
   2783  11042        Erik  * Assumes the caller has verified that the headers are in the pulled up mblk.
   2784  11042        Erik  * Consumes mp.
   2785  11042        Erik  */
   2786  11042        Erik static void
   2787  11042        Erik icmp_redirect_v4(mblk_t *mp, ipha_t *ipha, icmph_t *icmph, ip_recv_attr_t *ira)
   2788  11042        Erik {
   2789  11042        Erik 	ire_t		*ire, *nire;
   2790  11042        Erik 	ire_t		*prev_ire;
   2791  11042        Erik 	ipaddr_t  	src, dst, gateway;
   2792  11042        Erik 	ip_stack_t	*ipst = ira->ira_ill->ill_ipst;
   2793  11042        Erik 	ipha_t		*inner_ipha;	/* Inner IP header */
   2794  11042        Erik 
   2795  11042        Erik 	/* Caller already pulled up everything. */
   2796  11042        Erik 	inner_ipha = (ipha_t *)&icmph[1];
   2797      0      stevel 	src = ipha->ipha_src;
   2798  11042        Erik 	dst = inner_ipha->ipha_dst;
   2799      0      stevel 	gateway = icmph->icmph_rd_gateway;
   2800      0      stevel 	/* Make sure the new gateway is reachable somehow. */
   2801  11042        Erik 	ire = ire_ftable_lookup_v4(gateway, 0, 0, IRE_ONLINK, NULL,
   2802  11042        Erik 	    ALL_ZONES, NULL, MATCH_IRE_TYPE, 0, ipst, NULL);
   2803      0      stevel 	/*
   2804      0      stevel 	 * Make sure we had a route for the dest in question and that
   2805      0      stevel 	 * that route was pointing to the old gateway (the source of the
   2806      0      stevel 	 * redirect packet.)
   2807  11042        Erik 	 * Note: this merely says that there is some IRE which matches that
   2808  11042        Erik 	 * gateway; not that the longest match matches that gateway.
   2809  11042        Erik 	 */
   2810  11042        Erik 	prev_ire = ire_ftable_lookup_v4(dst, 0, src, 0, NULL, ALL_ZONES,
   2811  11042        Erik 	    NULL, MATCH_IRE_GW, 0, ipst, NULL);
   2812      0      stevel 	/*
   2813      0      stevel 	 * Check that
   2814      0      stevel 	 *	the redirect was not from ourselves
   2815      0      stevel 	 *	the new gateway and the old gateway are directly reachable
   2816      0      stevel 	 */
   2817  11042        Erik 	if (prev_ire == NULL || ire == NULL ||
   2818  11042        Erik 	    (prev_ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK)) ||
   2819  11042        Erik 	    (prev_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
   2820  11042        Erik 	    !(ire->ire_type & IRE_IF_ALL)) {
   2821   3448    dh155122 		BUMP_MIB(&ipst->ips_icmp_mib, icmpInBadRedirects);
   2822  11042        Erik 		ip_drop_input("icmpInBadRedirects - ire", mp, ira->ira_ill);
   2823      0      stevel 		freemsg(mp);
   2824      0      stevel 		if (ire != NULL)
   2825      0      stevel 			ire_refrele(ire);
   2826      0      stevel 		if (prev_ire != NULL)
   2827      0      stevel 			ire_refrele(prev_ire);
   2828      0      stevel 		return;
   2829      0      stevel 	}
   2830      0      stevel 
   2831      0      stevel 	ire_refrele(prev_ire);
   2832  11042        Erik 	ire_refrele(ire);
   2833  11042        Erik 
   2834      0      stevel 	/*
   2835      0      stevel 	 * TODO: more precise handling for cases 0, 2, 3, the latter two
   2836      0      stevel 	 * require TOS routing
   2837      0      stevel 	 */
   2838      0      stevel 	switch (icmph->icmph_code) {
   2839      0      stevel 	case 0:
   2840      0      stevel 	case 1:
   2841      0      stevel 		/* TODO: TOS specificity for cases 2 and 3 */
   2842      0      stevel 	case 2:
   2843      0      stevel 	case 3:
   2844      0      stevel 		break;
   2845      0      stevel 	default:
   2846   3448    dh155122 		BUMP_MIB(&ipst->ips_icmp_mib, icmpInBadRedirects);
   2847  11042        Erik 		ip_drop_input("icmpInBadRedirects - code", mp, ira->ira_ill);
   2848  11042        Erik 		freemsg(mp);
   2849      0      stevel 		return;
   2850      0      stevel 	}
   2851      0      stevel 	/*
   2852      0      stevel 	 * Create a Route Association.  This will allow us to remember that
   2853      0      stevel 	 * someone we believe told us to use the particular gateway.
   2854      0      stevel 	 */
   2855      0      stevel 	ire = ire_create(
   2856   4459      kcpoon 	    (uchar_t *)&dst,			/* dest addr */
   2857   4459      kcpoon 	    (uchar_t *)&ip_g_all_ones,		/* mask */
   2858   4459      kcpoon 	    (uchar_t *)&gateway,		/* gateway addr */
   2859   4459      kcpoon 	    IRE_HOST,
   2860  11042        Erik 	    NULL,				/* ill */
   2861  11042        Erik 	    ALL_ZONES,
   2862   4459      kcpoon 	    (RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST),
   2863   4714     sowmini 	    NULL,				/* tsol_gc_t */
   2864   4459      kcpoon 	    ipst);
   2865      0      stevel 
   2866      0      stevel 	if (ire == NULL) {
   2867      0      stevel 		freemsg(mp);
   2868  11042        Erik 		return;
   2869  11042        Erik 	}
   2870  11042        Erik 	nire = ire_add(ire);
   2871  11042        Erik 	/* Check if it was a duplicate entry */
   2872  11042        Erik 	if (nire != NULL && nire != ire) {
   2873  11042        Erik 		ASSERT(nire->ire_identical_ref > 1);
   2874  11042        Erik 		ire_delete(nire);
   2875  11042        Erik 		ire_refrele(nire);
   2876  11042        Erik 		nire = NULL;
   2877  11042        Erik 	}
   2878  11042        Erik 	ire = nire;
   2879  11042        Erik 	if (ire != NULL) {
   2880  11042        Erik 		ire_refrele(ire);		/* Held in ire_add */
   2881  11042        Erik 
   2882      0      stevel 		/* tell routing sockets that we received a redirect */
   2883      0      stevel 		ip_rts_change(RTM_REDIRECT, dst, gateway, IP_HOST_MASK, 0, src,
   2884      0      stevel 		    (RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST), 0,
   2885   3448    dh155122 		    (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_AUTHOR), ipst);
   2886      0      stevel 	}
   2887      0      stevel 
   2888      0      stevel 	/*
   2889   3004    dd193516 	 * Delete any existing IRE_HOST type redirect ires for this destination.
   2890      0      stevel 	 * This together with the added IRE has the effect of
   2891      0      stevel 	 * modifying an existing redirect.
   2892      0      stevel 	 */
   2893  11042        Erik 	prev_ire = ire_ftable_lookup_v4(dst, 0, src, IRE_HOST, NULL,
   2894  11042        Erik 	    ALL_ZONES, NULL, (MATCH_IRE_GW | MATCH_IRE_TYPE), 0, ipst, NULL);
   2895   3004    dd193516 	if (prev_ire != NULL) {
   2896   3004    dd193516 		if (prev_ire ->ire_flags & RTF_DYNAMIC)
   2897   3004    dd193516 			ire_delete(prev_ire);
   2898      0      stevel 		ire_refrele(prev_ire);
   2899      0      stevel 	}
   2900      0      stevel 
   2901      0      stevel 	freemsg(mp);
   2902      0      stevel }
   2903      0      stevel 
   2904      0      stevel /*
   2905      0      stevel  * Generate an ICMP parameter problem message.
   2906  11042        Erik  * When called from ip_output side a minimal ip_recv_attr_t needs to be
   2907  11042        Erik  * constructed by the caller.
   2908  11042        Erik  */
   2909  11042        Erik static void
   2910  11042        Erik icmp_param_problem(mblk_t *mp, uint8_t ptr, ip_recv_attr_t *ira)
   2911      0      stevel {
   2912      0      stevel 	icmph_t	icmph;
   2913  11042        Erik 	ip_stack_t	*ipst = ira->ira_ill->ill_ipst;
   2914  11042        Erik 
   2915  11042        Erik 	mp = icmp_pkt_err_ok(mp, ira);
   2916  11042        Erik 	if (mp == NULL)
   2917  11042        Erik 		return;
   2918      0      stevel 
   2919      0      stevel 	bzero(&icmph, sizeof (icmph_t));
   2920      0      stevel 	icmph.icmph_type = ICMP_PARAM_PROBLEM;
   2921      0      stevel 	icmph.icmph_pp_ptr = ptr;
   2922   3448    dh155122 	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutParmProbs);
   2923  11042        Erik 	icmp_pkt(mp, &icmph, sizeof (icmph_t), ira);
   2924      0      stevel }
   2925      0      stevel 
   2926      0      stevel /*
   2927      0      stevel  * Build and ship an IPv4 ICMP message using the packet data in mp, and
   2928      0      stevel  * the ICMP header pointed to by "stuff".  (May be called as writer.)
   2929      0      stevel  * Note: assumes that icmp_pkt_err_ok has been called to verify that
   2930      0      stevel  * an icmp error packet can be sent.
   2931      0      stevel  * Assigns an appropriate source address to the packet. If ipha_dst is
   2932  11042        Erik  * one of our addresses use it for source. Otherwise let ip_output_simple
   2933  11042        Erik  * pick the source address.
   2934  11042        Erik  */
   2935  11042        Erik static void
   2936  11042        Erik icmp_pkt(mblk_t *mp, void *stuff, size_t len, ip_recv_attr_t *ira)
   2937      0      stevel {
   2938      0      stevel 	ipaddr_t dst;
   2939      0      stevel 	icmph_t	*icmph;
   2940      0      stevel 	ipha_t	*ipha;
   2941      0      stevel 	uint_t	len_needed;
   2942      0      stevel 	size_t	msg_len;
   2943      0      stevel 	mblk_t	*mp1;
   2944      0      stevel 	ipaddr_t src;
   2945      0      stevel 	ire_t	*ire;
   2946  11042        Erik 	ip_xmit_attr_t ixas;
   2947  11042        Erik 	ip_stack_t *ipst = ira->ira_ill->ill_ipst;
   2948  11042        Erik 
   2949  11042        Erik 	ipha = (ipha_t *)mp->b_rptr;
   2950  11042        Erik 
   2951  11042        Erik 	bzero(&ixas, sizeof (ixas));
   2952  11042        Erik 	ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4;
   2953  11042        Erik 	ixas.ixa_zoneid = ira->ira_zoneid;
   2954  11042        Erik 	ixas.ixa_ifindex = 0;
   2955  11042        Erik 	ixas.ixa_ipst = ipst;
   2956  11042        Erik 	ixas.ixa_cred = kcred;
   2957  11042        Erik 	ixas.ixa_cpid = NOPID;
   2958  11042        Erik 	ixas.ixa_tsl = ira->ira_tsl;	/* Behave as a multi-level responder */
   2959  11042        Erik 	ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
   2960  11042        Erik 
   2961  11042        Erik 	if (ira->ira_flags & IRAF_IPSEC_SECURE) {
   2962  11042        Erik 		/*
   2963  11042        Erik 		 * Apply IPsec based on how IPsec was applied to
   2964  11042        Erik 		 * the packet that had the error.
   2965      0      stevel 		 *
   2966  11042        Erik 		 * If it was an outbound packet that caused the ICMP
   2967  11042        Erik 		 * error, then the caller will have setup the IRA
   2968  11042        Erik 		 * appropriately.
   2969  11042        Erik 		 */
   2970  11042        Erik 		if (!ipsec_in_to_out(ira, &ixas, mp, ipha, NULL)) {
   2971  11042        Erik 			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
   2972  11042        Erik 			/* Note: mp already consumed and ip_drop_packet done */
   2973  11042        Erik 			return;
   2974  11042        Erik 		}
   2975      0      stevel 	} else {
   2976      0      stevel 		/*
   2977      0      stevel 		 * This is in clear. The icmp message we are building
   2978  11042        Erik 		 * here should go out in clear, independent of our policy.
   2979  11042        Erik 		 */
   2980  11042        Erik 		ixas.ixa_flags |= IXAF_NO_IPSEC;
   2981      0      stevel 	}
   2982      0      stevel 
   2983      0      stevel 	/* Remember our eventual destination */
   2984      0      stevel 	dst = ipha->ipha_src;
   2985      0      stevel 
   2986  11042        Erik 	/*
   2987  11042        Erik 	 * If the packet was for one of our unicast addresses, make
   2988  11042        Erik 	 * sure we respond with that as the source. Otherwise
   2989  11042        Erik 	 * have ip_output_simple pick the source address.
   2990  11042        Erik 	 */
   2991  11042        Erik 	ire = ire_ftable_lookup_v4(ipha->ipha_dst, 0, 0,
   2992  11042        Erik 	    (IRE_LOCAL|IRE_LOOPBACK), NULL, ira->ira_zoneid, NULL,
   2993  11042        Erik 	    MATCH_IRE_TYPE|MATCH_IRE_ZONEONLY, 0, ipst, NULL);
   2994  11042        Erik 	if (ire != NULL) {
   2995  11042        Erik 		ire_refrele(ire);
   2996      0      stevel 		src = ipha->ipha_dst;
   2997   4823         seb 	} else {
   2998  11042        Erik 		src = INADDR_ANY;
   2999  11042        Erik 		ixas.ixa_flags |= IXAF_SET_SOURCE;
   3000  11042        Erik 	}
   3001      0      stevel 
   3002      0      stevel 	/*
   3003   4564     wy83408 	 * Check if we can send back more then 8 bytes in addition to
   3004   4564     wy83408 	 * the IP header.  We try to send 64 bytes of data and the internal
   3005   4564     wy83408 	 * header in the special cases of ipv4 encapsulated ipv4 or ipv6.
   3006      0      stevel 	 */
   3007   1676         jpk 	len_needed = IPH_HDR_LENGTH(ipha);
   3008   4564     wy83408 	if (ipha->ipha_protocol == IPPROTO_ENCAP ||
   3009   4564     wy83408 	    ipha->ipha_protocol == IPPROTO_IPV6) {
   3010   4564     wy83408 		if (!pullupmsg(mp, -1)) {
   3011   4564     wy83408 			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
   3012