Home | History | Annotate | Download | only in ip
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 /* Copyright (c) 1990 Mentat Inc. */
     27 
     28 #include <sys/types.h>
     29 #include <sys/stream.h>
     30 #include <sys/dlpi.h>
     31 #include <sys/stropts.h>
     32 #include <sys/sysmacros.h>
     33 #include <sys/strsubr.h>
     34 #include <sys/strlog.h>
     35 #include <sys/strsun.h>
     36 #include <sys/zone.h>
     37 #define	_SUN_TPI_VERSION 2
     38 #include <sys/tihdr.h>
     39 #include <sys/xti_inet.h>
     40 #include <sys/ddi.h>
     41 #include <sys/sunddi.h>
     42 #include <sys/cmn_err.h>
     43 #include <sys/debug.h>
     44 #include <sys/kobj.h>
     45 #include <sys/modctl.h>
     46 #include <sys/atomic.h>
     47 #include <sys/policy.h>
     48 #include <sys/priv.h>
     49 
     50 #include <sys/systm.h>
     51 #include <sys/param.h>
     52 #include <sys/kmem.h>
     53 #include <sys/sdt.h>
     54 #include <sys/socket.h>
     55 #include <sys/vtrace.h>
     56 #include <sys/isa_defs.h>
     57 #include <sys/mac.h>
     58 #include <net/if.h>
     59 #include <net/if_arp.h>
     60 #include <net/route.h>
     61 #include <sys/sockio.h>
     62 #include <netinet/in.h>
     63 #include <net/if_dl.h>
     64 
     65 #include <inet/common.h>
     66 #include <inet/mi.h>
     67 #include <inet/mib2.h>
     68 #include <inet/nd.h>
     69 #include <inet/arp.h>
     70 #include <inet/snmpcom.h>
     71 #include <inet/optcom.h>
     72 #include <inet/kstatcom.h>
     73 
     74 #include <netinet/igmp_var.h>
     75 #include <netinet/ip6.h>
     76 #include <netinet/icmp6.h>
     77 #include <netinet/sctp.h>
     78 
     79 #include <inet/ip.h>
     80 #include <inet/ip_impl.h>
     81 #include <inet/ip6.h>
     82 #include <inet/ip6_asp.h>
     83 #include <inet/tcp.h>
     84 #include <inet/tcp_impl.h>
     85 #include <inet/ip_multi.h>
     86 #include <inet/ip_if.h>
     87 #include <inet/ip_ire.h>
     88 #include <inet/ip_ftable.h>
     89 #include <inet/ip_rts.h>
     90 #include <inet/ip_ndp.h>
     91 #include <inet/ip_listutils.h>
     92 #include <netinet/igmp.h>
     93 #include <netinet/ip_mroute.h>
     94 #include <inet/ipp_common.h>
     95 
     96 #include <net/pfkeyv2.h>
     97 #include <inet/ipsec_info.h>
     98 #include <inet/sadb.h>
     99 #include <inet/ipsec_impl.h>
    100 #include <sys/iphada.h>
    101 #include <inet/tun.h>
    102 #include <inet/ipdrop.h>
    103 #include <inet/ip_netinfo.h>
    104 
    105 #include <sys/ethernet.h>
    106 #include <net/if_types.h>
    107 #include <sys/cpuvar.h>
    108 
    109 #include <ipp/ipp.h>
    110 #include <ipp/ipp_impl.h>
    111 #include <ipp/ipgpc/ipgpc.h>
    112 
    113 #include <sys/multidata.h>
    114 #include <sys/pattr.h>
    115 
    116 #include <inet/ipclassifier.h>
    117 #include <inet/sctp_ip.h>
    118 #include <inet/sctp/sctp_impl.h>
    119 #include <inet/udp_impl.h>
    120 #include <inet/rawip_impl.h>
    121 #include <inet/rts_impl.h>
    122 #include <sys/sunddi.h>
    123 
    124 #include <sys/tsol/label.h>
    125 #include <sys/tsol/tnet.h>
    126 
    127 #include <rpc/pmap_prot.h>
    128 
    129 /*
    130  * Values for squeue switch:
    131  * IP_SQUEUE_ENTER_NODRAIN: squeue_enter_nodrain
    132  * IP_SQUEUE_ENTER: squeue_enter
    133  * IP_SQUEUE_FILL: squeue_fill
    134  */
    135 int ip_squeue_enter = 2;	/* Settable in /etc/system */
    136 
    137 squeue_func_t ip_input_proc;
    138 #define	SET_BPREV_FLAG(x)	((mblk_t *)(uintptr_t)(x))
    139 
    140 /*
    141  * Settable in /etc/system
    142  */
    143 int ip_poll_normal_ms = 100;
    144 int ip_poll_normal_ticks = 0;
    145 int ip_modclose_ackwait_ms = 3000;
    146 
    147 /*
    148  * It would be nice to have these present only in DEBUG systems, but the
    149  * current design of the global symbol checking logic requires them to be
    150  * unconditionally present.
    151  */
    152 uint_t ip_thread_data;			/* TSD key for debug support */
    153 krwlock_t ip_thread_rwlock;		/* guards ip_thread_list? (confirm) */
    154 list_t	ip_thread_list;			/* also for debug support */
    155 
    156 /*
    157  * Head and tail pointers of a linked list of msgblks (mblk_t). Used by
    158  * the ip_snmp_ functions.
    159  */
    160 struct listptr_s {
    161 	mblk_t	*lp_head;	/* pointer to the head of the list */
    162 	mblk_t	*lp_tail;	/* pointer to the tail of the list */
    163 };
    164 
    165 typedef struct listptr_s listptr_t;
    166 
    167 /*
    168  * Carries the lists of return data for ip_snmp_get_mib2_ip_route_media and
    169  * ip_snmp_get_mib2_ip6_route_media; each listptr_t member is one such list.
    170  */
    171 typedef struct iproutedata_s {
    172 	uint_t		ird_idx;	/* NOTE(review): use not shown here */
    173 	listptr_t	ird_route;	/* ipRouteEntryTable */
    174 	listptr_t	ird_netmedia;	/* ipNetToMediaEntryTable */
    175 	listptr_t	ird_attrs;	/* ipRouteAttributeTable */
    176 } iproutedata_t;
    177 
    178 /*
    179  * Cluster specific hooks. These should be NULL on a non-clustered system.
    180  */
    181 
    182 /*
    183  * Hook functions to enable cluster networking.
    184  * On non-clustered systems these vectors must always be NULL.
    185  *
    186  * Hook function to check whether the specified IP address is a shared IP
    187  * address in the cluster.
    188  *
    189  */
    190 int (*cl_inet_isclusterwide)(uint8_t protocol,
    191     sa_family_t addr_family, uint8_t *laddrp) = NULL;
    192 
    193 /*
    194  * Hook function to generate a cluster-wide IP fragment identifier.
    195  */
    196 uint32_t (*cl_inet_ipident)(uint8_t protocol, sa_family_t addr_family,
    197     uint8_t *laddrp, uint8_t *faddrp) = NULL;
    198 
    199 /*
    200  * Hook function to generate a cluster-wide SPI.
    201  */
    202 void (*cl_inet_getspi)(uint8_t, uint8_t *, size_t) = NULL;
    203 
    204 /*
    205  * Hook function to verify whether the SPI is already utilized.
    206  */
    207 
    208 int (*cl_inet_checkspi)(uint8_t, uint32_t) = NULL;
    209 
    210 /*
    211  * Hook function to delete the SPI from the cluster-wide repository.
    212  */
    213 
    214 void (*cl_inet_deletespi)(uint8_t, uint32_t) = NULL;
    215 
    216 /*
    217  * Hook function to inform the cluster when a packet is received on an IDLE SA.
    218  */
    219 
    220 void (*cl_inet_idlesa)(uint8_t, uint32_t, sa_family_t, in6_addr_t,
    221     in6_addr_t) = NULL;
    222 
    223 /*
    224  * Synchronization notes:
    225  *
    226  * IP is a fully D_MP STREAMS module/driver. Thus it does not depend on any
    227  * MT level protection given by STREAMS. IP uses a combination of its own
    228  * internal serialization mechanism and standard Solaris locking techniques.
    229  * The internal serialization is per phyint (no IPMP) or per IPMP group.
    230  * This is used to serialize plumbing operations, IPMP operations, certain
    231  * multicast operations, most set ioctls, igmp/mld timers etc.
    232  *
    233  * Plumbing is a long sequence of operations involving message
    234  * exchanges between IP, ARP and device drivers. Many set ioctls are typically
    235  * involved in plumbing operations. A natural model is to serialize these
    236  * ioctls one per ill. For example plumbing of hme0 and qfe0 can go on in
    237  * parallel without any interference. But various set ioctls on hme0 are best
    238  * serialized. However if the system uses IPMP, the operations are easier if
    239  * they are serialized on a per IPMP group basis since IPMP operations
    240  * happen across ill's of a group. Thus the lowest common denominator is to
    241  * serialize most set ioctls, multicast join/leave operations, IPMP operations
    242  * igmp/mld timer operations, and processing of DLPI control messages received
    243  * from drivers on a per IPMP group basis. If the system does not employ
    244  * IPMP the serialization is on a per phyint basis. This serialization is
    245  * provided by the ipsq_t and primitives operating on this. Details can
    246  * be found in ip_if.c above the core primitives operating on ipsq_t.
    247  *
    248  * Lookups of an ipif or ill by a thread return a refheld ipif / ill.
    249  * Similarly, lookup of an ire by a thread also returns a refheld ire.
    250  * In addition ipif's and ill's referenced by the ire are also indirectly
    251  * refheld. Thus no ipif or ill can vanish nor can critical parameters like
    252  * the ipif's address or netmask change as long as an ipif is refheld
    253  * directly or indirectly. For example an SIOCLIFADDR ioctl that changes the
    254  * address of an ipif has to go through the ipsq_t. This ensures that only
    255  * 1 such exclusive operation proceeds at any time on the ipif. It then
    256  * deletes all ires associated with this ipif, and waits for all refcnts
    257  * associated with this ipif to come down to zero. The address is changed
    258  * only after the ipif has been quiesced. Then the ipif is brought up again.
    259  * More details are described above the comment in ip_sioctl_flags.
    260  *
    261  * Packet processing is based mostly on IREs and is fully multi-threaded
    262  * using standard Solaris MT techniques.
    263  *
    264  * There are explicit locks in IP to handle:
    265  * - The ip_g_head list maintained by mi_open_link() and friends.
    266  *
    267  * - The reassembly data structures (one lock per hash bucket)
    268  *
    269  * - conn_lock is meant to protect conn_t fields. The fields actually
    270  *   protected by conn_lock are documented in the conn_t definition.
    271  *
    272  * - ire_lock to protect some of the fields of the ire, IRE tables
    273  *   (one lock per hash bucket). Refer to ip_ire.c for details.
    274  *
    275  * - ndp_g_lock and nce_lock for protecting NCEs.
    276  *
    277  * - ill_lock protects fields of the ill and ipif. Details in ip.h
    278  *
    279  * - ill_g_lock: This is a global reader/writer lock. Protects the following
    280  *	* The AVL tree based global multi list of all ills.
    281  *	* The linked list of all ipifs of an ill
    282  *	* The <ill-ipsq> mapping
    283  *	* The ipsq->ipsq_phyint_list threaded by phyint_ipsq_next
    284  *	* The illgroup list threaded by ill_group_next.
    285  *	* <ill-phyint> association
    286  *   Insertion/deletion of an ill in the system, insertion/deletion of an ipif
    287  *   into an ill, changing the <ill-ipsq> mapping of an ill, insertion/deletion
    288  *   of an ill into the illgrp list, changing the <ill-phyint> assoc of an ill
    289  *   will all have to hold the ill_g_lock as writer for the actual duration
    290  *   of the insertion/deletion/change. More details about the <ill-ipsq> mapping
    291  *   may be found in the IPMP section.
    292  *
    293  * - ill_lock:  This is a per ill mutex.
    294  *   It protects some members of the ill and is documented below.
    295  *   It also protects the <ill-ipsq> mapping
    296  *   It also protects the illgroup list threaded by ill_group_next.
    297  *   It also protects the <ill-phyint> assoc.
    298  *   It also protects the list of ipifs hanging off the ill.
    299  *
    300  * - ipsq_lock: This is a per ipsq_t mutex lock.
    301  *   This protects all the other members of the ipsq struct except
    302  *   ipsq_refs and ipsq_phyint_list which are protected by ill_g_lock
    303  *
    304  * - illgrp_lock: This is a per ill_group mutex lock.
    305  *   The only thing it protects is the illgrp_ill_schednext member of ill_group
    306  *   which dictates which is the next ill in an ill_group that is to be chosen
    307  *   for sending outgoing packets, through creation of an IRE_CACHE that
    308  *   references this ill.
    309  *
    310  * - phyint_lock: This is a per phyint mutex lock. Protects just the
    311  *   phyint_flags
    312  *
    313  * - ip_g_nd_lock: This is a global reader/writer lock.
    314  *   Any call to nd_load to load a new parameter to the ND table must hold the
    315  *   lock as writer. ND_GET/ND_SET routines that read the ND table hold the lock
    316  *   as reader.
    317  *
    318  * - ip_addr_avail_lock: This is used to ensure the uniqueness of IP addresses.
    319  *   This lock is held in ipif_up_done and the ipif is marked IPIF_UP and the
    320  *   uniqueness check also done atomically.
    321  *
    322  * - ipsec_capab_ills_lock: This readers/writer lock protects the global
    323  *   lists of IPsec capable ills (ipsec_capab_ills_{ah,esp}). It is taken
    324  *   as a writer when adding or deleting elements from these lists, and
    325  *   as a reader when walking these lists to send a SADB update to the
    326  *   IPsec capable ills.
    327  *
    328  * - ill_g_usesrc_lock: This readers/writer lock protects the usesrc
    329  *   group list linked by ill_usesrc_grp_next. It also protects the
    330  *   ill_usesrc_ifindex field. It is taken as a writer when a member of the
    331  *   group is being added or deleted.  This lock is taken as a reader when
    332  *   walking the list/group(eg: to get the number of members in a usesrc group).
    333  *   Note, it is only necessary to take this lock if the ill_usesrc_grp_next
    334  *   field is changing state i.e from NULL to non-NULL or vice-versa. For
    335  *   example, it is not necessary to take this lock in the initial portion
    336  *   of ip_sioctl_slifusesrc or at all in ip_sioctl_groupname and
    337  *   ip_sioctl_flags since these operations are executed exclusively and
    338  *   that ensures that the "usesrc group state" cannot change. The "usesrc
    339  *   group state" change can happen only in the latter part of
    340  *   ip_sioctl_slifusesrc and in ill_delete.
    341  *
    342  * Changing <ill-phyint>, <ill-ipsq>, <ill-illgroup> associations.
    343  *
    344  * To change the <ill-phyint> association, the ill_g_lock must be held
    345  * as writer, and the ill_locks of both the v4 and v6 instance of the ill
    346  * must be held.
    347  *
    348  * To change the <ill-ipsq> association the ill_g_lock must be held as writer
    349  * and the ill_lock of the ill in question must be held.
    350  *
    351  * To change the <ill-illgroup> association the ill_g_lock must be held as
    352  * writer and the ill_lock of the ill in question must be held.
    353  *
    354  * To add or delete an ipif from the list of ipifs hanging off the ill,
    355  * ill_g_lock (writer) and ill_lock must be held and the thread must be
    356  * a writer on the associated ipsq.
    357  *
    358  * To add or delete an ill to the system, the ill_g_lock must be held as
    359  * writer and the thread must be a writer on the associated ipsq.
    360  *
    361  * To add or delete an ilm to an ill, the ill_lock must be held and the thread
    362  * must be a writer on the associated ipsq.
    363  *
    364  * Lock hierarchy
    365  *
    366  * Some lock hierarchy scenarios are listed below.
    367  *
    368  * ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock
    369  * ill_g_lock -> illgrp_lock -> ill_lock
    370  * ill_g_lock -> ill_lock(s) -> phyint_lock
    371  * ill_g_lock -> ndp_g_lock -> ill_lock -> nce_lock
    372  * ill_g_lock -> ip_addr_avail_lock
    373  * conn_lock -> irb_lock -> ill_lock -> ire_lock
    374  * ill_g_lock -> ip_g_nd_lock
    375  *
    376  * When more than 1 ill lock is needed to be held, all ill lock addresses
    377  * are sorted on address and locked starting from highest addressed lock
    378  * downward.
    379  *
    380  * IPsec scenarios
    381  *
    382  * ipsa_lock -> ill_g_lock -> ill_lock
    383  * ipsec_capab_ills_lock -> ill_g_lock -> ill_lock
    384  * ipsec_capab_ills_lock -> ipsa_lock
    385  * ill_g_usesrc_lock -> ill_g_lock -> ill_lock
    386  *
    387  * Trusted Solaris scenarios
    388  *
    389  * igsa_lock -> gcgrp_rwlock -> gcgrp_lock
    390  * igsa_lock -> gcdb_lock
    391  * gcgrp_rwlock -> ire_lock
    392  * gcgrp_rwlock -> gcdb_lock
    393  *
    394  *
    395  * Routing/forwarding table locking notes:
    396  *
    397  * Lock acquisition order: Radix tree lock, irb_lock.
    398  * Requirements:
    399  * i.  Walker must not hold any locks during the walker callback.
    400  * ii  Walker must not see a truncated tree during the walk because of any node
    401  *     deletion.
    402  * iii Existing code assumes ire_bucket is valid if it is non-null and is used
    403  *     in many places in the code to walk the irb list. Thus even if all the
    404  *     ires in a bucket have been deleted, we still can't free the radix node
    405  *     until the ires have actually been inactive'd (freed).
    406  *
    407  * Tree traversal - Need to hold the global tree lock in read mode.
    408  * Before dropping the global tree lock, need to either increment the ire_refcnt
    409  * to ensure that the radix node can't be deleted.
    410  *
    411  * Tree add - Need to hold the global tree lock in write mode to add a
    412  * radix node. To prevent the node from being deleted, increment the
    413  * irb_refcnt, after the node is added to the tree. The ire itself is
    414  * added later while holding the irb_lock, but not the tree lock.
    415  *
    416  * Tree delete - Need to hold the global tree lock and irb_lock in write mode.
    417  * All associated ires must be inactive (i.e. freed), and irb_refcnt
    418  * must be zero.
    419  *
    420  * Walker - Increment irb_refcnt before calling the walker callback. Hold the
    421  * global tree lock (read mode) for traversal.
    422  *
    423  * IPsec notes :
    424  *
    425  * IP interacts with the IPsec code (AH/ESP) by tagging a M_CTL message
    426  * in front of the actual packet. For outbound datagrams, the M_CTL
    427  * contains a ipsec_out_t (defined in ipsec_info.h), which has the
    428  * information used by the IPsec code for applying the right level of
    429  * protection. The information initialized by IP in the ipsec_out_t
    430  * is determined by the per-socket policy or global policy in the system.
    431  * For inbound datagrams, the M_CTL contains a ipsec_in_t (defined in
    432  * ipsec_info.h) which starts out with nothing in it. It gets filled
    433  * with the right information if it goes through the AH/ESP code, which
    434  * happens if the incoming packet is secure. The information initialized
    435  * by AH/ESP is later used by IP (during fanouts to ULP) to see whether
    436  * the policy requirements needed by per-socket policy or global policy
    437  * are met or not.
    438  *
    439  * If there is both per-socket policy (set using setsockopt) and there
    440  * is also global policy match for the 5 tuples of the socket,
    441  * ipsec_override_policy() makes the decision of which one to use.
    442  *
    443  * For fully connected sockets i.e dst, src [addr, port] is known,
    444  * conn_policy_cached is set indicating that policy has been cached.
    445  * conn_in_enforce_policy may or may not be set depending on whether
    446  * there is a global policy match or per-socket policy match.
    447  * Policy inheriting happens in ip_bind during the ipa_conn_t bind.
    448  * Once the right policy is set on the conn_t, policy cannot change for
    449  * this socket. This makes life simpler for TCP (UDP ?) where
    450  * re-transmissions go out with the same policy. For symmetry, policy
    451  * is cached for fully connected UDP sockets also. Thus if policy is cached,
    452  * it also implies that policy is latched i.e policy cannot change
    453  * on these sockets. As we have the right policy on the conn, we don't
    454  * have to lookup global policy for every outbound and inbound datagram
    455  * and thus serving as an optimization. Note that a global policy change
    456  * does not affect fully connected sockets if they have policy. If fully
    457  * connected sockets did not have any policy associated with it, global
    458  * policy change may affect them.
    459  *
    460  * IP Flow control notes:
    461  *
    462  * Non-TCP streams are flow controlled by IP. On the send side, if the packet
    463  * cannot be sent down to the driver by IP, because of a canput failure, IP
    464  * does a putq on the conn_wq. This will cause ip_wsrv to run on the conn_wq.
    465  * ip_wsrv in turn, inserts the conn in a list of conn's that need to be drained
    466  * when the flowcontrol condition subsides. Ultimately STREAMS backenables the
    467  * ip_wsrv on the IP module, which in turn does a qenable of the conn_wq of the
    468  * first conn in the list of conn's to be drained. ip_wsrv on this conn drains
    469  * the queued messages, and removes the conn from the drain list, if all
    470  * messages were drained. It also qenables the next conn in the drain list to
    471  * continue the drain process.
    472  *
    473  * In reality the drain list is not a single list, but a configurable number
    474  * of lists. The ip_wsrv on the IP module, qenables the first conn in each
    475  * list. If the ip_wsrv of the next qenabled conn does not run, because the
    476  * stream closes, ip_close takes responsibility to qenable the next conn in
    477  * the drain list. The directly called ip_wput path always does a putq, if
    478  * it cannot putnext. Thus synchronization problems are handled between
    479  * ip_wsrv and ip_close. conn_drain_insert and conn_drain_tail are the only
    480  * functions that manipulate this drain list. Furthermore conn_drain_insert
    481  * is called only from ip_wsrv, and there can be only 1 instance of ip_wsrv
    482  * running on a queue at any time. conn_drain_tail can be simultaneously called
    483  * from both ip_wsrv and ip_close.
    484  *
    485  * IPQOS notes:
    486  *
    487  * IPQoS Policies are applied to packets using IPPF (IP Policy framework)
    488  * and IPQoS modules. IPPF includes hooks in IP at different control points
    489  * (callout positions) which direct packets to IPQoS modules for policy
    490  * processing. Policies, if present, are global.
    491  *
    492  * The callout positions are located in the following paths:
    493  *		o local_in (packets destined for this host)
    494  *		o local_out (packets originating from this host)
    495  *		o fwd_in  (packets forwarded by this m/c - inbound)
    496  *		o fwd_out (packets forwarded by this m/c - outbound)
    497  * Hooks at these callout points can be enabled/disabled using the ndd variable
    498  * ip_policy_mask (a bit mask with the 4 LSB indicating the callout positions).
    499  * By default all the callout positions are enabled.
    500  *
    501  * Outbound (local_out)
    502  * Hooks are placed in ip_wput_ire and ipsec_out_process.
    503  *
    504  * Inbound (local_in)
    505  * Hooks are placed in ip_proto_input, icmp_inbound, ip_fanout_proto and
    506  * TCP and UDP fanout routines.
    507  *
    508  * Forwarding (in and out)
    509  * Hooks are placed in ip_rput_forward.
    510  *
    511  * IP Policy Framework processing (IPPF processing)
    512  * Policy processing for a packet is initiated by ip_process, which ascertains
    513  * that the classifier (ipgpc) is loaded and configured, failing which the
    514  * packet resumes normal processing in IP. If the classifier is present, the
    515  * packet is acted upon by one or more IPQoS modules (action instances), per
    516  * filters configured in ipgpc and resumes normal IP processing thereafter.
    517  * An action instance can drop a packet in course of its processing.
    518  *
    519  * A boolean variable, ip_policy, is used in all the fanout routines that can
    520  * invoke ip_process for a packet. This variable indicates if the packet should
    521  * be sent for policy processing. The variable is set to B_TRUE by default,
    522  * i.e. when the routines are invoked in the normal ip processing path for a
    523  * packet. The two exceptions being ip_wput_local and icmp_inbound_error_fanout;
    524  * ip_policy is set to B_FALSE for all the routines called in these two
    525  * functions because, in the former case,  we don't process loopback traffic
    526  * currently while in the latter, the packets have already been processed in
    527  * icmp_inbound.
    528  *
    529  * Zones notes:
    530  *
    531  * The partitioning rules for networking are as follows:
    532  * 1) Packets coming from a zone must have a source address belonging to that
    533  * zone.
    534  * 2) Packets coming from a zone can only be sent on a physical interface on
    535  * which the zone has an IP address.
    536  * 3) Between two zones on the same machine, packet delivery is only allowed if
    537  * there's a matching route for the destination and zone in the forwarding
    538  * table.
    539  * 4) The TCP and UDP port spaces are per-zone; that is, two processes in
    540  * different zones can bind to the same port with the wildcard address
    541  * (INADDR_ANY).
    542  *
    543  * The granularity of interface partitioning is at the logical interface level.
    544  * Therefore, every zone has its own IP addresses, and incoming packets can be
    545  * attributed to a zone unambiguously. A logical interface is placed into a zone
    546  * using the SIOCSLIFZONE ioctl; this sets the ipif_zoneid field in the ipif_t
    547  * structure. Rule (1) is implemented by modifying the source address selection
    548  * algorithm so that the list of eligible addresses is filtered based on the
    549  * sending process zone.
    550  *
    551  * The Internet Routing Entries (IREs) are either exclusive to a zone or shared
    552  * across all zones, depending on their type. Here is the break-up:
    553  *
    554  * IRE type				Shared/exclusive
    555  * --------				----------------
    556  * IRE_BROADCAST			Exclusive
    557  * IRE_DEFAULT (default routes)		Shared (*)
    558  * IRE_LOCAL				Exclusive (x)
    559  * IRE_LOOPBACK				Exclusive
    560  * IRE_PREFIX (net routes)		Shared (*)
    561  * IRE_CACHE				Exclusive
    562  * IRE_IF_NORESOLVER (interface routes)	Exclusive
    563  * IRE_IF_RESOLVER (interface routes)	Exclusive
    564  * IRE_HOST (host routes)		Shared (*)
    565  *
    566  * (*) A zone can only use a default or off-subnet route if the gateway is
    567  * directly reachable from the zone, that is, if the gateway's address matches
    568  * one of the zone's logical interfaces.
    569  *
    570  * (x) IRE_LOCAL are handled a bit differently, since for all other entries
    571  * in ire_ctable and IRE_INTERFACE, ire_src_addr is what can be used as source
    572  * when sending packets using the IRE. For IRE_LOCAL ire_src_addr is the IP
    573  * address of the zone itself (the destination). Since IRE_LOCAL is used
    574  * for communication between zones, ip_wput_ire has special logic to set
    575  * the right source address when sending using an IRE_LOCAL.
    576  *
    577  * Furthermore, when ip_restrict_interzone_loopback is set (the default),
    578  * ire_cache_lookup restricts loopback using an IRE_LOCAL
    579  * between zones to the case when L2 would have conceptually looped the packet
    580  * back, i.e. the loopback which is required since neither Ethernet drivers
    581  * nor Ethernet hardware loops them back. This is the case when the normal
    582  * routes (ignoring IREs with different zoneids) would send out the packet on
    583  * the same ill (or ill group) as the ill with which the IRE_LOCAL is
    584  * associated.
    585  *
    586  * Multiple zones can share a common broadcast address; typically all zones
    587  * share the 255.255.255.255 address. Incoming as well as locally originated
    588  * broadcast packets must be dispatched to all the zones on the broadcast
    589  * network. For directed broadcasts (e.g. 10.16.72.255) this is not trivial
    590  * since some zones may not be on the 10.16.72/24 network. To handle this, each
    591  * zone has its own set of IRE_BROADCAST entries; then, broadcast packets are
    592  * sent to every zone that has an IRE_BROADCAST entry for the destination
    593  * address on the input ill, see conn_wantpacket().
    594  *
    595  * Applications in different zones can join the same multicast group address.
    596  * For IPv4, group memberships are per-logical interface, so they're already
    597  * inherently part of a zone. For IPv6, group memberships are per-physical
    598  * interface, so we distinguish IPv6 group memberships based on group address,
    599  * interface and zoneid. In both cases, received multicast packets are sent to
    600  * every zone for which a group membership entry exists. On IPv6 we need to
    601  * check that the target zone still has an address on the receiving physical
    602  * interface; it could have been removed since the application issued the
    603  * IPV6_JOIN_GROUP.
    604  */
    605 
    606 /*
    607  * Squeue Fanout flags:
    608  *	0: No fanout.
    609  *	1: Fanout across all squeues.
    610  */
    611 boolean_t	ip_squeue_fanout = 0;
    612 
    613 /*
    614  * Maximum dups (presumably duplicate fragments) allowed per packet.
    615  */
    616 uint_t ip_max_frag_dups = 10;
    617 
    618 #define	IS_SIMPLE_IPH(ipha)						\
    619 	((ipha)->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION)
    620 
    621 /* RFC1122 Conformance: the forwarding default is IP_FORWARD_NEVER */
    622 #define	IP_FORWARD_DEFAULT	IP_FORWARD_NEVER
    623 
    624 #define	ILL_MAX_NAMELEN			LIFNAMSIZ
    625 
    626 static int	conn_set_held_ipif(conn_t *, ipif_t **, ipif_t *);
    627 
    628 static int	ip_open(queue_t *q, dev_t *devp, int flag, int sflag,
    629 		    cred_t *credp, boolean_t isv6);
    630 static mblk_t	*ip_wput_attach_llhdr(mblk_t *, ire_t *, ip_proc_t, uint32_t,
    631 		    ipha_t **);
    632 
        /* Prototypes for the icmp_* routines; every one is file-local (static). */
    633 static void	icmp_frag_needed(queue_t *, mblk_t *, int, zoneid_t,
    634 		    ip_stack_t *);
    635 static void	icmp_inbound(queue_t *, mblk_t *, boolean_t, ill_t *, int,
    636 		    uint32_t, boolean_t, boolean_t, ill_t *, zoneid_t);
    637 static ipaddr_t	icmp_get_nexthop_addr(ipha_t *, ill_t *, zoneid_t, mblk_t *mp);
    638 static boolean_t icmp_inbound_too_big(icmph_t *, ipha_t *, ill_t *, zoneid_t,
    639 		    mblk_t *, int, ip_stack_t *);
    640 static void	icmp_inbound_error_fanout(queue_t *, ill_t *, mblk_t *,
    641 		    icmph_t *, ipha_t *, int, int, boolean_t, boolean_t,
    642 		    ill_t *, zoneid_t);
    643 static void	icmp_options_update(ipha_t *);
    644 static void	icmp_param_problem(queue_t *, mblk_t *, uint8_t, zoneid_t,
    645 		    ip_stack_t *);
    646 static void	icmp_pkt(queue_t *, mblk_t *, void *, size_t, boolean_t,
    647 		    zoneid_t zoneid, ip_stack_t *);
    648 static mblk_t	*icmp_pkt_err_ok(mblk_t *, ip_stack_t *);
    649 static void	icmp_redirect(ill_t *, mblk_t *);
    650 static void	icmp_send_redirect(queue_t *, mblk_t *, ipaddr_t,
    651 		    ip_stack_t *);
    652 
        /*
         * Prototypes for the ip_* routines.  Most are static; the ones declared
         * here without `static' are ip_dlpi_alloc, ip_dot_addr, ip_carve_mp,
         * ip_close, ip_net_mask, ip_newroute, ip_nv_lookup, ip_rput,
         * ip_rput_forward, ip_snmp_get and ip_snmp_set.
         */
    653 static void	ip_arp_news(queue_t *, mblk_t *);
    654 static boolean_t ip_bind_insert_ire(mblk_t *, ire_t *, iulp_t *,
    655 		    ip_stack_t *);
    656 mblk_t		*ip_dlpi_alloc(size_t, t_uscalar_t);
    657 char		*ip_dot_addr(ipaddr_t, char *);
    658 mblk_t		*ip_carve_mp(mblk_t **, ssize_t);
    659 int		ip_close(queue_t *, int);
    660 static char	*ip_dot_saddr(uchar_t *, char *);
    661 static void	ip_fanout_proto(queue_t *, mblk_t *, ill_t *, ipha_t *, uint_t,
    662 		    boolean_t, boolean_t, ill_t *, zoneid_t);
    663 static void	ip_fanout_tcp(queue_t *, mblk_t *, ill_t *, ipha_t *, uint_t,
    664 		    boolean_t, boolean_t, zoneid_t);
    665 static void	ip_fanout_udp(queue_t *, mblk_t *, ill_t *, ipha_t *, uint32_t,
    666 		    boolean_t, uint_t, boolean_t, boolean_t, ill_t *, zoneid_t);
    667 static void	ip_lrput(queue_t *, mblk_t *);
    668 ipaddr_t	ip_net_mask(ipaddr_t);
    669 void		ip_newroute(queue_t *, mblk_t *, ipaddr_t, conn_t *, zoneid_t,
    670 		    ip_stack_t *);
    671 static void	ip_newroute_ipif(queue_t *, mblk_t *, ipif_t *, ipaddr_t,
    672 		    conn_t *, uint32_t, zoneid_t, ip_opt_info_t *);
    673 char		*ip_nv_lookup(nv_t *, int);
    674 static boolean_t	ip_check_for_ipsec_opt(queue_t *, mblk_t *);
    675 static int	ip_param_get(queue_t *, mblk_t *, caddr_t, cred_t *);
    676 static int	ip_param_generic_get(queue_t *, mblk_t *, caddr_t, cred_t *);
    677 static boolean_t	ip_param_register(IDP *ndp, ipparam_t *, size_t,
    678     ipndp_t *, size_t);
    679 static int	ip_param_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *);
    680 void	ip_rput(queue_t *, mblk_t *);
    681 static void	ip_rput_dlpi_writer(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp,
    682 		    void *dummy_arg);
    683 void	ip_rput_forward(ire_t *, ipha_t *, mblk_t *, ill_t *);
    684 static int	ip_rput_forward_options(mblk_t *, ipha_t *, ire_t *,
    685     ip_stack_t *);
    686 static boolean_t	ip_rput_local_options(queue_t *, mblk_t *, ipha_t *,
    687 			    ire_t *, ip_stack_t *);
    688 static boolean_t	ip_rput_multimblk_ipoptions(queue_t *, ill_t *,
    689 			    mblk_t *, ipha_t **, ipaddr_t *, ip_stack_t *);
    690 static int	ip_rput_options(queue_t *, mblk_t *, ipha_t *, ipaddr_t *,
    691     ip_stack_t *);
    692 static boolean_t ip_rput_fragment(queue_t *, mblk_t **, ipha_t *, uint32_t *,
    693 		    uint16_t *);
    694 int		ip_snmp_get(queue_t *, mblk_t *, int);
    695 static mblk_t	*ip_snmp_get_mib2_ip(queue_t *, mblk_t *,
    696 		    mib2_ipIfStatsEntry_t *, ip_stack_t *);
    697 static mblk_t	*ip_snmp_get_mib2_ip_traffic_stats(queue_t *, mblk_t *,
    698 		    ip_stack_t *);
    699 static mblk_t	*ip_snmp_get_mib2_ip6(queue_t *, mblk_t *, ip_stack_t *);
    700 static mblk_t	*ip_snmp_get_mib2_icmp(queue_t *, mblk_t *, ip_stack_t *ipst);
    701 static mblk_t	*ip_snmp_get_mib2_icmp6(queue_t *, mblk_t *, ip_stack_t *ipst);
    702 static mblk_t	*ip_snmp_get_mib2_igmp(queue_t *, mblk_t *, ip_stack_t *ipst);
    703 static mblk_t	*ip_snmp_get_mib2_multi(queue_t *, mblk_t *, ip_stack_t *ipst);
    704 static mblk_t	*ip_snmp_get_mib2_ip_addr(queue_t *, mblk_t *,
    705 		    ip_stack_t *ipst);
    706 static mblk_t	*ip_snmp_get_mib2_ip6_addr(queue_t *, mblk_t *,
    707 		    ip_stack_t *ipst);
    708 static mblk_t	*ip_snmp_get_mib2_ip_group_src(queue_t *, mblk_t *,
    709 		    ip_stack_t *ipst);
    710 static mblk_t	*ip_snmp_get_mib2_ip6_group_src(queue_t *, mblk_t *,
    711 		    ip_stack_t *ipst);
    712 static mblk_t	*ip_snmp_get_mib2_ip_group_mem(queue_t *, mblk_t *,
    713 		    ip_stack_t *ipst);
    714 static mblk_t	*ip_snmp_get_mib2_ip6_group_mem(queue_t *, mblk_t *,
    715 		    ip_stack_t *ipst);
    716 static mblk_t	*ip_snmp_get_mib2_virt_multi(queue_t *, mblk_t *,
    717 		    ip_stack_t *ipst);
    718 static mblk_t	*ip_snmp_get_mib2_multi_rtable(queue_t *, mblk_t *,
    719 		    ip_stack_t *ipst);
    720 static mblk_t	*ip_snmp_get_mib2_ip_route_media(queue_t *, mblk_t *,
    721 		    ip_stack_t *ipst);
    722 static mblk_t	*ip_snmp_get_mib2_ip6_route_media(queue_t *, mblk_t *,
    723 		    ip_stack_t *ipst);
    724 static void	ip_snmp_get2_v4(ire_t *, iproutedata_t *);
    725 static void	ip_snmp_get2_v6_route(ire_t *, iproutedata_t *);
    726 static int	ip_snmp_get2_v6_media(nce_t *, iproutedata_t *);
    727 int		ip_snmp_set(queue_t *, int, int, uchar_t *, int);
    728 static boolean_t	ip_source_routed(ipha_t *, ip_stack_t *);
    729 static boolean_t	ip_source_route_included(ipha_t *);
    730 static void	ip_trash_ire_reclaim_stack(ip_stack_t *);
    731 
        /*
         * More file-local (static) prototypes: the ip_wput_* helpers, the
         * conn_drain_* and conn_walk_* routines, the ip_stack_init /
         * ip_stack_shutdown / ip_stack_fini trio (each takes a netstackid_t),
         * conn_wantpacket and ip_arp_done.
         */
    732 static void	ip_wput_frag(ire_t *, mblk_t *, ip_pkt_t, uint32_t, uint32_t,
    733 		    zoneid_t, ip_stack_t *);
    734 static mblk_t	*ip_wput_frag_copyhdr(uchar_t *, int, int, ip_stack_t *);
    735 static void	ip_wput_local_options(ipha_t *, ip_stack_t *);
    736 static int	ip_wput_options(queue_t *, mblk_t *, ipha_t *, boolean_t,
    737 		    zoneid_t, ip_stack_t *);
    738 
    739 static void	conn_drain_init(ip_stack_t *);
    740 static void	conn_drain_fini(ip_stack_t *);
    741 static void	conn_drain_tail(conn_t *connp, boolean_t closing);
    742 
    743 static void	conn_walk_drain(ip_stack_t *);
    744 static void	conn_walk_fanout_table(connf_t *, uint_t, pfv_t, void *,
    745     zoneid_t);
    746 
    747 static void	*ip_stack_init(netstackid_t stackid, netstack_t *ns);
    748 static void	ip_stack_shutdown(netstackid_t stackid, void *arg);
    749 static void	ip_stack_fini(netstackid_t stackid, void *arg);
    750 
    751 static boolean_t	conn_wantpacket(conn_t *, ill_t *, ipha_t *, int,
    752     zoneid_t);
    753 static void	ip_arp_done(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp,
    754     void *dummy_arg);
    755 
        /*
         * Get/set handlers that are wired into lcl_ndp_arr (ip_forward_set,
         * ip_cgtp_filter_get/set, ip_squeue_bind_set, ip_squeue_profile_set,
         * ip_input_proc_set, ip_int_set, ipmp_hook_emulation_set), together
         * with the ip_multirt_* helpers and ip_squeue_switch.
         * ip_squeue_bind_set and ip_squeue_profile_set are declared extern;
         * presumably they are defined in another file -- confirm.
         */
    756 static int	ip_forward_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *);
    757 
    758 static int	ip_multirt_apply_membership(int (*fn)(conn_t *, boolean_t,
    759     ipaddr_t, ipaddr_t, uint_t *, mcast_record_t, ipaddr_t, mblk_t *), ire_t *,
    760     conn_t *, boolean_t, ipaddr_t, mcast_record_t, ipaddr_t, mblk_t *);
    761 static void	ip_multirt_bad_mtu(ire_t *, uint32_t);
    762 
    763 static int	ip_cgtp_filter_get(queue_t *, mblk_t *, caddr_t, cred_t *);
    764 static int	ip_cgtp_filter_set(queue_t *, mblk_t *, char *,
    765     caddr_t, cred_t *);
    766 extern int	ip_squeue_bind_set(queue_t *q, mblk_t *mp, char *value,
    767     caddr_t cp, cred_t *cr);
    768 extern int	ip_squeue_profile_set(queue_t *, mblk_t *, char *, caddr_t,
    769     cred_t *);
    770 static int	ip_input_proc_set(queue_t *q, mblk_t *mp, char *value,
    771     caddr_t cp, cred_t *cr);
    772 static int	ip_int_set(queue_t *, mblk_t *, char *, caddr_t,
    773     cred_t *);
    774 static int	ipmp_hook_emulation_set(queue_t *, mblk_t *, char *, caddr_t,
    775     cred_t *);
    776 static squeue_func_t ip_squeue_switch(int);
    777 
        /*
         * kstat init/fini/update prototypes for the ip, icmp and ip "2" kstats
         * (all static), followed by ip_conn_report (the get handler of the
         * "ip_conn_status" entry in lcl_ndp_arr), ip_tcp_input and
         * ip_rput_process_forward.
         */
    778 static void	*ip_kstat_init(netstackid_t, ip_stack_t *);
    779 static void	ip_kstat_fini(netstackid_t, kstat_t *);
    780 static int	ip_kstat_update(kstat_t *kp, int rw);
    781 static void	*icmp_kstat_init(netstackid_t);
    782 static void	icmp_kstat_fini(netstackid_t, kstat_t *);
    783 static int	icmp_kstat_update(kstat_t *kp, int rw);
    784 static void	*ip_kstat2_init(netstackid_t, ip_stat_t *);
    785 static void	ip_kstat2_fini(netstackid_t, kstat_t *);
    786 
    787 static int	ip_conn_report(queue_t *, mblk_t *, caddr_t, cred_t *);
    788 
    789 static mblk_t	*ip_tcp_input(mblk_t *, ipha_t *, ill_t *, boolean_t,
    790     ire_t *, mblk_t *, uint_t, queue_t *, ill_rx_ring_t *);
    791 
    792 static void	ip_rput_process_forward(queue_t *, mblk_t *, ire_t *,
    793     ipha_t *, ill_t *, boolean_t);
        /* Global with external linkage, initialized to IP_HOST_MASK. */
    794 ipaddr_t	ip_g_all_ones = IP_HOST_MASK;
    795 
    796 /* How long, in seconds, we allow frags to hang around. */
    797 #define	IP_FRAG_TIMEOUT	60
    798 
    799 /*
    800  * Threshold which determines whether MDT should be used when
    801  * generating IP fragments; payload size must be greater than
    802  * this threshold for MDT to take place.
    803  */
    804 #define	IP_WPUT_FRAG_MDT_MIN	32768
    805 
    806 /* Settable in /etc/system only; starts out as IP_WPUT_FRAG_MDT_MIN. */
    807 int	ip_wput_frag_mdt_min = IP_WPUT_FRAG_MDT_MIN;
    808 
        /*
         * ip_rput_pullups is listed in lcl_ndp_arr under the name
         * "ip_rput_pullups" with nd_get_long / nd_set_long as its handlers.
         */
    809 static long ip_rput_pullups;
    810 int	dohwcksum = 1;	/* use h/w cksum if supported by the hardware */
    811 
    812 vmem_t *ip_minor_arena_sa; /* for minor nos. from INET_MIN_DEV+2 thru 2^18-1 */
    813 vmem_t *ip_minor_arena_la; /* for minor nos. from 2^18 thru 2^32-1 */
    814 
        /*
         * Listed in lcl_ndp_arr under the name "ip_debug" (get handler
         * ip_param_generic_get, set handler ip_int_set).
         */
    815 int	ip_debug;
    816 
        /* ipsechw_debug exists only when DEBUG is defined. */
    817 #ifdef DEBUG
    818 uint32_t ipsechw_debug = 0;
    819 #endif
    820 
    821 /*
    822  * Multirouting/CGTP stuff
    823  */
    824 int	ip_cgtp_filter_rev = CGTP_FILTER_REV;	/* CGTP hooks version */
    825 
    826 /*
    827  * XXX following really should only be in a header. Would need more
    828  * header and .c clean up first.
    829  */
    830 extern optdb_obj_t	ip_opt_obj;
    831 
    832 ulong_t ip_squeue_enter_unbound = 0;
    833 
    834 /*
    835  * Named Dispatch Parameter Table.
    836  * All of these are alterable, within the min/max values given, at run time.
         *
         * Each entry is { min, max, value, name }: the permitted range, the
         * value the table starts out with, and the parameter's name.
         *
         * The final slot holds "ip6_drop_inbound_icmpv6" only when DEBUG is
         * defined; otherwise an all-zero entry with an empty name takes its
         * place, so the array has the same number of entries either way.
         *
         * NOTE(review): entries are presumably located by position elsewhere
         * (e.g. through accessor macros in a header) -- confirm before
         * inserting, removing or reordering anything.
    837  */
    838 static ipparam_t	lcl_param_arr[] = {
    839 	/* min	max	value	name */
    840 	{  0,	1,	0,	"ip_respond_to_address_mask_broadcast"},
    841 	{  0,	1,	1,	"ip_respond_to_echo_broadcast"},
    842 	{  0,	1,	1,	"ip_respond_to_echo_multicast"},
    843 	{  0,	1,	0,	"ip_respond_to_timestamp"},
    844 	{  0,	1,	0,	"ip_respond_to_timestamp_broadcast"},
    845 	{  0,	1,	1,	"ip_send_redirects"},
    846 	{  0,	1,	0,	"ip_forward_directed_broadcasts"},
    847 	{  0,	10,	0,	"ip_mrtdebug"},
    848 	{  5000, 999999999,	60000, "ip_ire_timer_interval" },
    849 	{  60000, 999999999,	1200000, "ip_ire_arp_interval" },
    850 	{  60000, 999999999,	60000, "ip_ire_redirect_interval" },
    851 	{  1,	255,	255,	"ip_def_ttl" },
    852 	{  0,	1,	0,	"ip_forward_src_routed"},
    853 	{  0,	256,	32,	"ip_wroff_extra" },
    854 	{  5000, 999999999, 600000, "ip_ire_pathmtu_interval" },
    855 	{  8,	65536,  64,	"ip_icmp_return_data_bytes" },
    856 	{  0,	1,	1,	"ip_path_mtu_discovery" },
    857 	{  0,	240,	30,	"ip_ignore_delete_time" },
    858 	{  0,	1,	0,	"ip_ignore_redirect" },
    859 	{  0,	1,	1,	"ip_output_queue" },
    860 	{  1,	254,	1,	"ip_broadcast_ttl" },
    861 	{  0,	99999,	100,	"ip_icmp_err_interval" },
    862 	{  1,	99999,	10,	"ip_icmp_err_burst" },
    863 	{  0,	999999999,	1000000, "ip_reass_queue_bytes" },
    864 	{  0,	1,	0,	"ip_strict_dst_multihoming" },
    865 	{  1,	MAX_ADDRS_PER_IF,	256,	"ip_addrs_per_if"},
    866 	{  0,	1,	0,	"ipsec_override_persocket_policy" },
    867 	{  0,	1,	1,	"icmp_accept_clear_messages" },
    868 	{  0,	1,	1,	"igmp_accept_clear_messages" },
    869 	{  2,	999999999, ND_DELAY_FIRST_PROBE_TIME,
    870 				"ip_ndp_delay_first_probe_time"},
    871 	{  1,	999999999, ND_MAX_UNICAST_SOLICIT,
    872 				"ip_ndp_max_unicast_solicit"},
    873 	{  1,	255,	IPV6_MAX_HOPS,	"ip6_def_hops" },
    874 	{  8,	IPV6_MIN_MTU,	IPV6_MIN_MTU, "ip6_icmp_return_data_bytes" },
    875 	{  0,	1,	0,	"ip6_forward_src_routed"},
    876 	{  0,	1,	1,	"ip6_respond_to_echo_multicast"},
    877 	{  0,	1,	1,	"ip6_send_redirects"},
    878 	{  0,	1,	0,	"ip6_ignore_redirect" },
    879 	{  0,	1,	0,	"ip6_strict_dst_multihoming" },
    880 
    881 	{  1,	8,	3,	"ip_ire_reclaim_fraction" },
    882 
    883 	{  0,	999999,	1000,	"ipsec_policy_log_interval" },
    884 
    885 	{  0,	1,	1,	"pim_accept_clear_messages" },
    886 	{  1000, 20000,	2000,	"ip_ndp_unsolicit_interval" },
    887 	{  1,	20,	3,	"ip_ndp_unsolicit_count" },
    888 	{  0,	1,	1,	"ip6_ignore_home_address_opt" },
    889 	{  0,	15,	0,	"ip_policy_mask" },
    890 	{  1000, 60000, 1000,	"ip_multirt_resolution_interval" },
    891 	{  0,	255,	1,	"ip_multirt_ttl" },
    892 	{  0,	1,	1,	"ip_multidata_outbound" },
    893 	{  0,	3600000, 300000, "ip_ndp_defense_interval" },
    894 	{  0,	999999,	60*60*24, "ip_max_temp_idle" },
    895 	{  0,	1000,	1,	"ip_max_temp_defend" },
    896 	{  0,	1000,	3,	"ip_max_defend" },
    897 	{  0,	999999,	30,	"ip_defend_interval" },
    898 	{  0,	3600000, 300000, "ip_dup_recovery" },
    899 	{  0,	1,	1,	"ip_restrict_interzone_loopback" },
    900 	{  0,	1,	1,	"ip_lso_outbound" },
    901 	{  IGMP_V1_ROUTER, IGMP_V3_ROUTER, IGMP_V3_ROUTER, "igmp_max_version" },
    902 	{  MLD_V1_ROUTER, MLD_V2_ROUTER, MLD_V2_ROUTER, "mld_max_version" },
    903 #ifdef DEBUG
    904 	{  0,	1,	0,	"ip6_drop_inbound_icmpv6" },
    905 #else
    906 	{  0,	0,	0,	"" },
    907 #endif
    908 };
    909 
    910 /*
    911  * Extended NDP table
    912  * The addresses for the first two are filled in to be ips_ip_g_forward
    913  * and ips_ipv6_forward at init time.
         *
         * Each entry is { getf, setf, data, name }: a get handler, a set
         * handler (NULL where none is supplied), the data pointer passed along
         * with the entry (NULL where none is given here), and the name.
         *
         * Each IPNDP_*_OFFSET macro equals the array index of the entry that
         * immediately follows it (0, 1, 11 and 13).  They are presumably used
         * to index this array elsewhere, so keep them in step with the entry
         * positions whenever something is inserted, removed or reordered.
    914  */
    915 static ipndp_t	lcl_ndp_arr[] = {
    916 	/* getf			setf		data			name */
    917 #define	IPNDP_IP_FORWARDING_OFFSET	0
    918 	{  ip_param_generic_get,	ip_forward_set,	NULL,
    919 	    "ip_forwarding" },
    920 #define	IPNDP_IP6_FORWARDING_OFFSET	1
    921 	{  ip_param_generic_get,	ip_forward_set,	NULL,
    922 	    "ip6_forwarding" },
    923 	{  ip_ill_report,	NULL,		NULL,
    924 	    "ip_ill_status" },
    925 	{  ip_ipif_report,	NULL,		NULL,
    926 	    "ip_ipif_status" },
    927 	{  ip_conn_report,	NULL,		NULL,
    928 	    "ip_conn_status" },
    929 	{  nd_get_long,		nd_set_long,	(caddr_t)&ip_rput_pullups,
    930 	    "ip_rput_pullups" },
    931 	{  ip_srcid_report,	NULL,		NULL,
    932 	    "ip_srcid_status" },
    933 	{ ip_param_generic_get, ip_squeue_profile_set,
    934 	    (caddr_t)&ip_squeue_profile, "ip_squeue_profile" },
    935 	{ ip_param_generic_get, ip_squeue_bind_set,
    936 	    (caddr_t)&ip_squeue_bind, "ip_squeue_bind" },
    937 	{ ip_param_generic_get, ip_input_proc_set,
    938 	    (caddr_t)&ip_squeue_enter, "ip_squeue_enter" },
    939 	{ ip_param_generic_get, ip_int_set,
    940 	    (caddr_t)&ip_squeue_fanout, "ip_squeue_fanout" },
    941 #define	IPNDP_CGTP_FILTER_OFFSET	11
    942 	{  ip_cgtp_filter_get,	ip_cgtp_filter_set, NULL,
    943 	    "ip_cgtp_filter" },
    944 	{ ip_param_generic_get, ip_int_set,
    945 	    (caddr_t)&ip_soft_rings_cnt, "ip_soft_rings_cnt" },
    946 #define	IPNDP_IPMP_HOOK_OFFSET	13
    947 	{  ip_param_generic_get, ipmp_hook_emulation_set, NULL,
    948 	    "ipmp_hook_emulation" },
    949 	{  ip_param_generic_get, ip_int_set, (caddr_t)&ip_debug,
    950 	    "ip_debug" },
    951 };
    952 
    953 /*
    954  * Table of IP ioctls encoding the various properties of the ioctl and
    955  * indexed based on the last byte of the ioctl command. Occasionally there
    956  * is a clash, and there is more than 1 ioctl with the same last byte.
    957  * In such a case 1 ioctl is encoded in the ndx table and the remaining
    958  * ioctls are encoded in the misc table. An entry in the ndx table is
    959  * retrieved by indexing on the last byte of the ioctl command and comparing
    960  * the ioctl command with the value in the ndx table. In the event of a
    961  * mismatch the misc table is then searched sequentially for the desired
    962  * ioctl command.
    963  *
    964  * Entry: <command> <copyin_size> <flags> <cmd_type> <function> <restart_func>
    965  */
    966 ip_ioctl_cmd_t ip_ndx_ioctl_table[] = {
    967 	/* 000 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    968 	/* 001 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    969 	/* 002 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    970 	/* 003 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    971 	/* 004 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    972 	/* 005 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    973 	/* 006 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    974 	/* 007 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    975 	/* 008 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    976 	/* 009 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
    977 
    978 	/* 010 */ { SIOCADDRT,	sizeof (struct rtentry), IPI_PRIV,
    979 			MISC_CMD, ip_siocaddrt, NULL },
    980 	/* 011 */ { SIOCDELRT,	sizeof (struct rtentry), IPI_PRIV,
    981 			MISC_CMD, ip_siocdelrt, NULL },
    982 
    983 	/* 012 */ { SIOCSIFADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
    984 			IF_CMD, ip_sioctl_addr, ip_sioctl_addr_restart },
    985 	/* 013 */ { SIOCGIFADDR, sizeof (struct ifreq), IPI_GET_CMD | IPI_REPL,
    986 			IF_CMD, ip_sioctl_get_addr, NULL },
    987 
    988 	/* 014 */ { SIOCSIFDSTADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
    989 			IF_CMD, ip_sioctl_dstaddr, ip_sioctl_dstaddr_restart },
    990 	/* 015 */ { SIOCGIFDSTADDR, sizeof (struct ifreq),
    991 			IPI_GET_CMD | IPI_REPL,
    992 			IF_CMD, ip_sioctl_get_dstaddr, NULL },
    993 
    994 	/* 016 */ { SIOCSIFFLAGS, sizeof (struct ifreq),
    995 			IPI_PRIV | IPI_WR |