/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. */ #include #include #include #include #include #include #include #include #include #define _SUN_TPI_VERSION 2 #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Values for squeue switch: * IP_SQUEUE_ENTER_NODRAIN: squeue_enter_nodrain * IP_SQUEUE_ENTER: squeue_enter * IP_SQUEUE_FILL: squeue_fill */ int ip_squeue_enter = 2; /* Setable in /etc/system */ squeue_func_t ip_input_proc; #define 
SET_BPREV_FLAG(x)	((mblk_t *)(uintptr_t)(x))	/* casts x to an mblk_t pointer via uintptr_t */

/*
 * Settable in /etc/system
 */
int	ip_poll_normal_ms = 100;
int	ip_poll_normal_ticks = 0;
int	ip_modclose_ackwait_ms = 3000;

/*
 * It would be nice to have these present only in DEBUG systems, but the
 * current design of the global symbol checking logic requires them to be
 * unconditionally present.
 */
uint_t ip_thread_data;			/* TSD key for debug support */
krwlock_t ip_thread_rwlock;
list_t	ip_thread_list;

/*
 * Structure to represent a linked list of msgblks. Used by ip_snmp_ functions.
 */
struct listptr_s {
	mblk_t	*lp_head;	/* pointer to the head of the list */
	mblk_t	*lp_tail;	/* pointer to the tail of the list */
};

typedef struct listptr_s listptr_t;

/*
 * This is used by ip_snmp_get_mib2_ip_route_media and
 * ip_snmp_get_mib2_ip6_route_media to carry the lists of return data.
 * Each listptr_t member is the head/tail pair of one mblk list.
 */
typedef struct iproutedata_s {
	uint_t		ird_idx;
	listptr_t	ird_route;	/* ipRouteEntryTable */
	listptr_t	ird_netmedia;	/* ipNetToMediaEntryTable */
	listptr_t	ird_attrs;	/* ipRouteAttributeTable */
} iproutedata_t;

/*
 * Cluster specific hooks. These should be NULL when booted as a non-cluster
 */

/*
 * Hook functions to enable cluster networking
 * On non-clustered systems these vectors must always be NULL.
 *
 * Hook function to check whether the specified IP address is a shared IP
 * address in the cluster.
 */
int (*cl_inet_isclusterwide)(uint8_t protocol,
    sa_family_t addr_family, uint8_t *laddrp) = NULL;

/*
 * Hook function to generate cluster wide ip fragment identifier
 */
uint32_t (*cl_inet_ipident)(uint8_t protocol, sa_family_t addr_family,
    uint8_t *laddrp, uint8_t *faddrp) = NULL;

/*
 * Hook function to generate cluster wide SPI.
 */
void (*cl_inet_getspi)(uint8_t, uint8_t *, size_t) = NULL;

/*
 * Hook function to verify if the SPI is already utilized.
 */
int (*cl_inet_checkspi)(uint8_t, uint32_t) = NULL;

/*
 * Hook function to delete the SPI from the cluster wide repository.
 */
void (*cl_inet_deletespi)(uint8_t, uint32_t) = NULL;

/*
 * Hook function to inform the cluster when packet received on an IDLE SA
 */
void (*cl_inet_idlesa)(uint8_t, uint32_t, sa_family_t,
    in6_addr_t, in6_addr_t) = NULL;

/*
 * Synchronization notes:
 *
 * IP is a fully D_MP STREAMS module/driver. Thus it does not depend on any
 * MT level protection given by STREAMS. IP uses a combination of its own
 * internal serialization mechanism and standard Solaris locking techniques.
 * The internal serialization is per phyint (no IPMP) or per IPMP group.
 * This is used to serialize plumbing operations, IPMP operations, certain
 * multicast operations, most set ioctls, igmp/mld timers etc.
 *
 * Plumbing is a long sequence of operations involving message
 * exchanges between IP, ARP and device drivers. Many set ioctls are typically
 * involved in plumbing operations. A natural model is to serialize these
 * ioctls one per ill. For example plumbing of hme0 and qfe0 can go on in
 * parallel without any interference. But various set ioctls on hme0 are best
 * serialized. However if the system uses IPMP, the operations are easier if
 * they are serialized on a per IPMP group basis since IPMP operations
 * happen across ill's of a group. Thus the lowest common denominator is to
 * serialize most set ioctls, multicast join/leave operations, IPMP operations
 * igmp/mld timer operations, and processing of DLPI control messages received
 * from drivers on a per IPMP group basis. If the system does not employ
 * IPMP the serialization is on a per phyint basis. This serialization is
 * provided by the ipsq_t and primitives operating on this. Details can
 * be found in ip_if.c above the core primitives operating on ipsq_t.
 *
 * Lookups of an ipif or ill by a thread return a refheld ipif / ill.
 * Similarly lookup of an ire by a thread also returns a refheld ire.
 * In addition ipif's and ill's referenced by the ire are also indirectly
 * refheld.
Thus no ipif or ill can vanish nor can critical parameters like * the ipif's address or netmask change as long as an ipif is refheld * directly or indirectly. For example an SIOCLIFADDR ioctl that changes the * address of an ipif has to go through the ipsq_t. This ensures that only * 1 such exclusive operation proceeds at any time on the ipif. It then * deletes all ires associated with this ipif, and waits for all refcnts * associated with this ipif to come down to zero. The address is changed * only after the ipif has been quiesced. Then the ipif is brought up again. * More details are described above the comment in ip_sioctl_flags. * * Packet processing is based mostly on IREs and is fully multi-threaded * using standard Solaris MT techniques. * * There are explicit locks in IP to handle: * - The ip_g_head list maintained by mi_open_link() and friends. * * - The reassembly data structures (one lock per hash bucket) * * - conn_lock is meant to protect conn_t fields. The fields actually * protected by conn_lock are documented in the conn_t definition. * * - ire_lock to protect some of the fields of the ire, IRE tables * (one lock per hash bucket). Refer to ip_ire.c for details. * * - ndp_g_lock and nce_lock for protecting NCEs. * * - ill_lock protects fields of the ill and ipif. Details in ip.h * * - ill_g_lock: This is a global reader/writer lock. Protects the following * * The AVL tree based global multi list of all ills. * * The linked list of all ipifs of an ill * * The ill-ipsq mapping * * The ipsq->ipsq_phyint_list threaded by phyint_ipsq_next * * The illgroup list threaded by ill_group_next. * * The ill-phyint association * Insertion/deletion of an ill in the system, insertion/deletion of an ipif * into an ill, changing the ill-ipsq mapping of an ill, insertion/deletion * of an ill into the illgrp list, changing the ill-phyint assoc of an ill * will all have to hold the ill_g_lock as writer for the actual duration * of the insertion/deletion/change.
More details about the ill-ipsq mapping * may be found in the IPMP section. * * - ill_lock: This is a per ill mutex. * It protects some members of the ill and is documented below. * It also protects the ill-ipsq mapping * It also protects the illgroup list threaded by ill_group_next. * It also protects the ill-phyint assoc. * It also protects the list of ipifs hanging off the ill. * * - ipsq_lock: This is a per ipsq_t mutex lock. * This protects all the other members of the ipsq struct except * ipsq_refs and ipsq_phyint_list which are protected by ill_g_lock * * - illgrp_lock: This is a per ill_group mutex lock. * The only thing it protects is the illgrp_ill_schednext member of ill_group * which dictates which is the next ill in an ill_group that is to be chosen * for sending outgoing packets, through creation of an IRE_CACHE that * references this ill. * * - phyint_lock: This is a per phyint mutex lock. Protects just the * phyint_flags * * - ip_g_nd_lock: This is a global reader/writer lock. * Any call to nd_load to load a new parameter to the ND table must hold the * lock as writer. ND_GET/ND_SET routines that read the ND table hold the lock * as reader. * * - ip_addr_avail_lock: This is used to ensure the uniqueness of IP addresses. * This lock is held in ipif_up_done and the ipif is marked IPIF_UP and the * uniqueness check also done atomically. * * - ipsec_capab_ills_lock: This readers/writer lock protects the global * lists of IPsec capable ills (ipsec_capab_ills_{ah,esp}). It is taken * as a writer when adding or deleting elements from these lists, and * as a reader when walking these lists to send a SADB update to the * IPsec capable ills. * * - ill_g_usesrc_lock: This readers/writer lock protects the usesrc * group list linked by ill_usesrc_grp_next. It also protects the * ill_usesrc_ifindex field. It is taken as a writer when a member of the * group is being added or deleted.
This lock is taken as a reader when * walking the list/group (e.g. to get the number of members in a usesrc group). * Note, it is only necessary to take this lock if the ill_usesrc_grp_next * field is changing state i.e from NULL to non-NULL or vice-versa. For * example, it is not necessary to take this lock in the initial portion * of ip_sioctl_slifusesrc or at all in ip_sioctl_groupname and * ip_sioctl_flags since these operations are executed exclusively and * that ensures that the "usesrc group state" cannot change. The "usesrc * group state" change can happen only in the latter part of * ip_sioctl_slifusesrc and in ill_delete. * * Changing ill-phyint, ill-ipsq, ill-illgroup associations. * * To change the ill-phyint association, the ill_g_lock must be held * as writer, and the ill_locks of both the v4 and v6 instance of the ill * must be held. * * To change the ill-ipsq association the ill_g_lock must be held as writer * and the ill_lock of the ill in question must be held. * * To change the ill-illgroup association the ill_g_lock must be held as * writer and the ill_lock of the ill in question must be held. * * To add or delete an ipif from the list of ipifs hanging off the ill, * ill_g_lock (writer) and ill_lock must be held and the thread must be * a writer on the associated ipsq. * * To add or delete an ill to the system, the ill_g_lock must be held as * writer and the thread must be a writer on the associated ipsq. * * To add or delete an ilm to an ill, the ill_lock must be held and the thread * must be a writer on the associated ipsq. * * Lock hierarchy * * Some lock hierarchy scenarios are listed below.
* * ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock * ill_g_lock -> illgrp_lock -> ill_lock * ill_g_lock -> ill_lock(s) -> phyint_lock * ill_g_lock -> ndp_g_lock -> ill_lock -> nce_lock * ill_g_lock -> ip_addr_avail_lock * conn_lock -> irb_lock -> ill_lock -> ire_lock * ill_g_lock -> ip_g_nd_lock * * When more than 1 ill lock is needed to be held, all ill lock addresses * are sorted on address and locked starting from highest addressed lock * downward. * * IPsec scenarios * * ipsa_lock -> ill_g_lock -> ill_lock * ipsec_capab_ills_lock -> ill_g_lock -> ill_lock * ipsec_capab_ills_lock -> ipsa_lock * ill_g_usesrc_lock -> ill_g_lock -> ill_lock * * Trusted Solaris scenarios * * igsa_lock -> gcgrp_rwlock -> gcgrp_lock * igsa_lock -> gcdb_lock * gcgrp_rwlock -> ire_lock * gcgrp_rwlock -> gcdb_lock * * * Routing/forwarding table locking notes: * * Lock acquisition order: Radix tree lock, irb_lock. * Requirements: * i. Walker must not hold any locks during the walker callback. * ii Walker must not see a truncated tree during the walk because of any node * deletion. * iii Existing code assumes ire_bucket is valid if it is non-null and is used * in many places in the code to walk the irb list. Thus even if all the * ires in a bucket have been deleted, we still can't free the radix node * until the ires have actually been inactive'd (freed). * * Tree traversal - Need to hold the global tree lock in read mode. * Before dropping the global tree lock, need to either increment the ire_refcnt * to ensure that the radix node can't be deleted. * * Tree add - Need to hold the global tree lock in write mode to add a * radix node. To prevent the node from being deleted, increment the * irb_refcnt, after the node is added to the tree. The ire itself is * added later while holding the irb_lock, but not the tree lock. * * Tree delete - Need to hold the global tree lock and irb_lock in write mode. * All associated ires must be inactive (i.e. freed), and irb_refcnt * must be zero. 
* * Walker - Increment irb_refcnt before calling the walker callback. Hold the * global tree lock (read mode) for traversal. * * IPsec notes : * * IP interacts with the IPsec code (AH/ESP) by tagging an M_CTL message * in front of the actual packet. For outbound datagrams, the M_CTL * contains an ipsec_out_t (defined in ipsec_info.h), which has the * information used by the IPsec code for applying the right level of * protection. The information initialized by IP in the ipsec_out_t * is determined by the per-socket policy or global policy in the system. * For inbound datagrams, the M_CTL contains an ipsec_in_t (defined in * ipsec_info.h) which starts out with nothing in it. It gets filled * with the right information if it goes through the AH/ESP code, which * happens if the incoming packet is secure. The information initialized * by AH/ESP, is later used by IP(during fanouts to ULP) to see whether * the policy requirements needed by per-socket policy or global policy * is met or not. * * If there is both per-socket policy (set using setsockopt) and there * is also global policy match for the 5 tuples of the socket, * ipsec_override_policy() makes the decision of which one to use. * * For fully connected sockets i.e dst, src [addr, port] is known, * conn_policy_cached is set indicating that policy has been cached. * conn_in_enforce_policy may or may not be set depending on whether * there is a global policy match or per-socket policy match. * Policy inheriting happens in ip_bind during the ipa_conn_t bind. * Once the right policy is set on the conn_t, policy cannot change for * this socket. This makes life simpler for TCP (UDP ?) where * re-transmissions go out with the same policy. For symmetry, policy * is cached for fully connected UDP sockets also. Thus if policy is cached, * it also implies that policy is latched i.e policy cannot change * on these sockets.
As we have the right policy on the conn, we don't * have to lookup global policy for every outbound and inbound datagram * and thus serving as an optimization. Note that a global policy change * does not affect fully connected sockets if they have policy. If fully * connected sockets did not have any policy associated with it, global * policy change may affect them. * * IP Flow control notes: * * Non-TCP streams are flow controlled by IP. On the send side, if the packet * cannot be sent down to the driver by IP, because of a canput failure, IP * does a putq on the conn_wq. This will cause ip_wsrv to run on the conn_wq. * ip_wsrv in turn, inserts the conn in a list of conn's that need to be drained * when the flowcontrol condition subsides. Ultimately STREAMS backenables the * ip_wsrv on the IP module, which in turn does a qenable of the conn_wq of the * first conn in the list of conn's to be drained. ip_wsrv on this conn drains * the queued messages, and removes the conn from the drain list, if all * messages were drained. It also qenables the next conn in the drain list to * continue the drain process. * * In reality the drain list is not a single list, but a configurable number * of lists. The ip_wsrv on the IP module, qenables the first conn in each * list. If the ip_wsrv of the next qenabled conn does not run, because the * stream closes, ip_close takes responsibility to qenable the next conn in * the drain list. The directly called ip_wput path always does a putq, if * it cannot putnext. Thus synchronization problems are handled between * ip_wsrv and ip_close. conn_drain_insert and conn_drain_tail are the only * functions that manipulate this drain list. Furthermore conn_drain_insert * is called only from ip_wsrv, and there can be only 1 instance of ip_wsrv * running on a queue at any time. conn_drain_tail can be simultaneously called * from both ip_wsrv and ip_close. 
* * IPQOS notes: * * IPQoS Policies are applied to packets using IPPF (IP Policy framework) * and IPQoS modules. IPPF includes hooks in IP at different control points * (callout positions) which direct packets to IPQoS modules for policy * processing. Policies, if present, are global. * * The callout positions are located in the following paths: * o local_in (packets destined for this host) * o local_out (packets originating from this host ) * o fwd_in (packets forwarded by this m/c - inbound) * o fwd_out (packets forwarded by this m/c - outbound) * Hooks at these callout points can be enabled/disabled using the ndd variable * ip_policy_mask (a bit mask with the 4 LSB indicating the callout positions). * By default all the callout positions are enabled. * * Outbound (local_out) * Hooks are placed in ip_wput_ire and ipsec_out_process. * * Inbound (local_in) * Hooks are placed in ip_proto_input, icmp_inbound, ip_fanout_proto and * TCP and UDP fanout routines. * * Forwarding (in and out) * Hooks are placed in ip_rput_forward. * * IP Policy Framework processing (IPPF processing) * Policy processing for a packet is initiated by ip_process, which ascertains * that the classifier (ipgpc) is loaded and configured, failing which the * packet resumes normal processing in IP. If the classifier is present, the * packet is acted upon by one or more IPQoS modules (action instances), per * filters configured in ipgpc and resumes normal IP processing thereafter. * An action instance can drop a packet in course of its processing. * * A boolean variable, ip_policy, is used in all the fanout routines that can * invoke ip_process for a packet. This variable indicates if the packet should * be sent for policy processing. The variable is set to B_TRUE by default, * i.e. when the routines are invoked in the normal ip processing path for a * packet.
The two exceptions being ip_wput_local and icmp_inbound_error_fanout; * ip_policy is set to B_FALSE for all the routines called in these two * functions because, in the former case, we don't process loopback traffic * currently while in the latter, the packets have already been processed in * icmp_inbound. * * Zones notes: * * The partitioning rules for networking are as follows: * 1) Packets coming from a zone must have a source address belonging to that * zone. * 2) Packets coming from a zone can only be sent on a physical interface on * which the zone has an IP address. * 3) Between two zones on the same machine, packet delivery is only allowed if * there's a matching route for the destination and zone in the forwarding * table. * 4) The TCP and UDP port spaces are per-zone; that is, two processes in * different zones can bind to the same port with the wildcard address * (INADDR_ANY). * * The granularity of interface partitioning is at the logical interface level. * Therefore, every zone has its own IP addresses, and incoming packets can be * attributed to a zone unambiguously. A logical interface is placed into a zone * using the SIOCSLIFZONE ioctl; this sets the ipif_zoneid field in the ipif_t * structure. Rule (1) is implemented by modifying the source address selection * algorithm so that the list of eligible addresses is filtered based on the * sending process zone. * * The Internet Routing Entries (IREs) are either exclusive to a zone or shared * across all zones, depending on their type. 
Here is the break-up: * * IRE type Shared/exclusive * -------- ---------------- * IRE_BROADCAST Exclusive * IRE_DEFAULT (default routes) Shared (*) * IRE_LOCAL Exclusive (x) * IRE_LOOPBACK Exclusive * IRE_PREFIX (net routes) Shared (*) * IRE_CACHE Exclusive * IRE_IF_NORESOLVER (interface routes) Exclusive * IRE_IF_RESOLVER (interface routes) Exclusive * IRE_HOST (host routes) Shared (*) * * (*) A zone can only use a default or off-subnet route if the gateway is * directly reachable from the zone, that is, if the gateway's address matches * one of the zone's logical interfaces. * * (x) IRE_LOCAL are handled a bit differently, since for all other entries * in ire_ctable and IRE_INTERFACE, ire_src_addr is what can be used as source * when sending packets using the IRE. For IRE_LOCAL ire_src_addr is the IP * address of the zone itself (the destination). Since IRE_LOCAL is used * for communication between zones, ip_wput_ire has special logic to set * the right source address when sending using an IRE_LOCAL. * * Furthermore, when ip_restrict_interzone_loopback is set (the default), * ire_cache_lookup restricts loopback using an IRE_LOCAL * between zones to the case when L2 would have conceptually looped the packet * back, i.e. the loopback which is required since neither Ethernet drivers * nor Ethernet hardware loops them back. This is the case when the normal * routes (ignoring IREs with different zoneids) would send out the packet on * the same ill (or ill group) as the ill with which the IRE_LOCAL is * associated. * * Multiple zones can share a common broadcast address; typically all zones * share the 255.255.255.255 address. Incoming as well as locally originated * broadcast packets must be dispatched to all the zones on the broadcast * network. For directed broadcasts (e.g. 10.16.72.255) this is not trivial * since some zones may not be on the 10.16.72/24 network.
To handle this, each * zone has its own set of IRE_BROADCAST entries; then, broadcast packets are * sent to every zone that has an IRE_BROADCAST entry for the destination * address on the input ill, see conn_wantpacket(). * * Applications in different zones can join the same multicast group address. * For IPv4, group memberships are per-logical interface, so they're already * inherently part of a zone. For IPv6, group memberships are per-physical * interface, so we distinguish IPv6 group memberships based on group address, * interface and zoneid. In both cases, received multicast packets are sent to * every zone for which a group membership entry exists. On IPv6 we need to * check that the target zone still has an address on the receiving physical * interface; it could have been removed since the application issued the * IPV6_JOIN_GROUP. */ /* * Squeue Fanout flags: * 0: No fanout. * 1: Fanout across all squeues */ boolean_t ip_squeue_fanout = 0; /* * Maximum dups allowed per packet. */ uint_t ip_max_frag_dups = 10; #define IS_SIMPLE_IPH(ipha) \ ((ipha)->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION) /* RFC1122 Conformance */ #define IP_FORWARD_DEFAULT IP_FORWARD_NEVER #define ILL_MAX_NAMELEN LIFNAMSIZ static int conn_set_held_ipif(conn_t *, ipif_t **, ipif_t *); static int ip_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, boolean_t isv6); static mblk_t *ip_wput_attach_llhdr(mblk_t *, ire_t *, ip_proc_t, uint32_t, ipha_t **); static void icmp_frag_needed(queue_t *, mblk_t *, int, zoneid_t, ip_stack_t *); static void icmp_inbound(queue_t *, mblk_t *, boolean_t, ill_t *, int, uint32_t, boolean_t, boolean_t, ill_t *, zoneid_t); static ipaddr_t icmp_get_nexthop_addr(ipha_t *, ill_t *, zoneid_t, mblk_t *mp); static boolean_t icmp_inbound_too_big(icmph_t *, ipha_t *, ill_t *, zoneid_t, mblk_t *, int, ip_stack_t *); static void icmp_inbound_error_fanout(queue_t *, ill_t *, mblk_t *, icmph_t *, ipha_t *, int, int, boolean_t, boolean_t, ill_t *, 
zoneid_t); static void icmp_options_update(ipha_t *); static void icmp_param_problem(queue_t *, mblk_t *, uint8_t, zoneid_t, ip_stack_t *); static void icmp_pkt(queue_t *, mblk_t *, void *, size_t, boolean_t, zoneid_t zoneid, ip_stack_t *); static mblk_t *icmp_pkt_err_ok(mblk_t *, ip_stack_t *); static void icmp_redirect(ill_t *, mblk_t *); static void icmp_send_redirect(queue_t *, mblk_t *, ipaddr_t, ip_stack_t *); static void ip_arp_news(queue_t *, mblk_t *); static boolean_t ip_bind_insert_ire(mblk_t *, ire_t *, iulp_t *, ip_stack_t *); mblk_t *ip_dlpi_alloc(size_t, t_uscalar_t); char *ip_dot_addr(ipaddr_t, char *); mblk_t *ip_carve_mp(mblk_t **, ssize_t); int ip_close(queue_t *, int); static char *ip_dot_saddr(uchar_t *, char *); static void ip_fanout_proto(queue_t *, mblk_t *, ill_t *, ipha_t *, uint_t, boolean_t, boolean_t, ill_t *, zoneid_t); static void ip_fanout_tcp(queue_t *, mblk_t *, ill_t *, ipha_t *, uint_t, boolean_t, boolean_t, zoneid_t); static void ip_fanout_udp(queue_t *, mblk_t *, ill_t *, ipha_t *, uint32_t, boolean_t, uint_t, boolean_t, boolean_t, ill_t *, zoneid_t); static void ip_lrput(queue_t *, mblk_t *); ipaddr_t ip_net_mask(ipaddr_t); void ip_newroute(queue_t *, mblk_t *, ipaddr_t, conn_t *, zoneid_t, ip_stack_t *); static void ip_newroute_ipif(queue_t *, mblk_t *, ipif_t *, ipaddr_t, conn_t *, uint32_t, zoneid_t, ip_opt_info_t *); char *ip_nv_lookup(nv_t *, int); static boolean_t ip_check_for_ipsec_opt(queue_t *, mblk_t *); static int ip_param_get(queue_t *, mblk_t *, caddr_t, cred_t *); static int ip_param_generic_get(queue_t *, mblk_t *, caddr_t, cred_t *); static boolean_t ip_param_register(IDP *ndp, ipparam_t *, size_t, ipndp_t *, size_t); static int ip_param_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *); void ip_rput(queue_t *, mblk_t *); static void ip_rput_dlpi_writer(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp, void *dummy_arg); void ip_rput_forward(ire_t *, ipha_t *, mblk_t *, ill_t *); static int 
ip_rput_forward_options(mblk_t *, ipha_t *, ire_t *, ip_stack_t *); static boolean_t ip_rput_local_options(queue_t *, mblk_t *, ipha_t *, ire_t *, ip_stack_t *); static boolean_t ip_rput_multimblk_ipoptions(queue_t *, ill_t *, mblk_t *, ipha_t **, ipaddr_t *, ip_stack_t *); static int ip_rput_options(queue_t *, mblk_t *, ipha_t *, ipaddr_t *, ip_stack_t *); static boolean_t ip_rput_fragment(queue_t *, mblk_t **, ipha_t *, uint32_t *, uint16_t *); int ip_snmp_get(queue_t *, mblk_t *, int); static mblk_t *ip_snmp_get_mib2_ip(queue_t *, mblk_t *, mib2_ipIfStatsEntry_t *, ip_stack_t *); static mblk_t *ip_snmp_get_mib2_ip_traffic_stats(queue_t *, mblk_t *, ip_stack_t *); static mblk_t *ip_snmp_get_mib2_ip6(queue_t *, mblk_t *, ip_stack_t *); static mblk_t *ip_snmp_get_mib2_icmp(queue_t *, mblk_t *, ip_stack_t *ipst); static mblk_t *ip_snmp_get_mib2_icmp6(queue_t *, mblk_t *, ip_stack_t *ipst); static mblk_t *ip_snmp_get_mib2_igmp(queue_t *, mblk_t *, ip_stack_t *ipst); static mblk_t *ip_snmp_get_mib2_multi(queue_t *, mblk_t *, ip_stack_t *ipst); static mblk_t *ip_snmp_get_mib2_ip_addr(queue_t *, mblk_t *, ip_stack_t *ipst); static mblk_t *ip_snmp_get_mib2_ip6_addr(queue_t *, mblk_t *, ip_stack_t *ipst); static mblk_t *ip_snmp_get_mib2_ip_group_src(queue_t *, mblk_t *, ip_stack_t *ipst); static mblk_t *ip_snmp_get_mib2_ip6_group_src(queue_t *, mblk_t *, ip_stack_t *ipst); static mblk_t *ip_snmp_get_mib2_ip_group_mem(queue_t *, mblk_t *, ip_stack_t *ipst); static mblk_t *ip_snmp_get_mib2_ip6_group_mem(queue_t *, mblk_t *, ip_stack_t *ipst); static mblk_t *ip_snmp_get_mib2_virt_multi(queue_t *, mblk_t *, ip_stack_t *ipst); static mblk_t *ip_snmp_get_mib2_multi_rtable(queue_t *, mblk_t *, ip_stack_t *ipst); static mblk_t *ip_snmp_get_mib2_ip_route_media(queue_t *, mblk_t *, ip_stack_t *ipst); static mblk_t *ip_snmp_get_mib2_ip6_route_media(queue_t *, mblk_t *, ip_stack_t *ipst); static void ip_snmp_get2_v4(ire_t *, iproutedata_t *); static void ip_snmp_get2_v6_route(ire_t 
*, iproutedata_t *); static int ip_snmp_get2_v6_media(nce_t *, iproutedata_t *); int ip_snmp_set(queue_t *, int, int, uchar_t *, int); static boolean_t ip_source_routed(ipha_t *, ip_stack_t *); static boolean_t ip_source_route_included(ipha_t *); static void ip_trash_ire_reclaim_stack(ip_stack_t *); static void ip_wput_frag(ire_t *, mblk_t *, ip_pkt_t, uint32_t, uint32_t, zoneid_t, ip_stack_t *); static mblk_t *ip_wput_frag_copyhdr(uchar_t *, int, int, ip_stack_t *); static void ip_wput_local_options(ipha_t *, ip_stack_t *); static int ip_wput_options(queue_t *, mblk_t *, ipha_t *, boolean_t, zoneid_t, ip_stack_t *); static void conn_drain_init(ip_stack_t *); static void conn_drain_fini(ip_stack_t *); static void conn_drain_tail(conn_t *connp, boolean_t closing); static void conn_walk_drain(ip_stack_t *); static void conn_walk_fanout_table(connf_t *, uint_t, pfv_t, void *, zoneid_t); static void *ip_stack_init(netstackid_t stackid, netstack_t *ns); static void ip_stack_shutdown(netstackid_t stackid, void *arg); static void ip_stack_fini(netstackid_t stackid, void *arg); static boolean_t conn_wantpacket(conn_t *, ill_t *, ipha_t *, int, zoneid_t); static void ip_arp_done(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp, void *dummy_arg); static int ip_forward_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *); static int ip_multirt_apply_membership(int (*fn)(conn_t *, boolean_t, ipaddr_t, ipaddr_t, uint_t *, mcast_record_t, ipaddr_t, mblk_t *), ire_t *, conn_t *, boolean_t, ipaddr_t, mcast_record_t, ipaddr_t, mblk_t *); static void ip_multirt_bad_mtu(ire_t *, uint32_t); static int ip_cgtp_filter_get(queue_t *, mblk_t *, caddr_t, cred_t *); static int ip_cgtp_filter_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *); extern int ip_squeue_bind_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr); extern int ip_squeue_profile_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *); static int ip_input_proc_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t 
*cr); static int ip_int_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *); static int ipmp_hook_emulation_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *); static squeue_func_t ip_squeue_switch(int); static void *ip_kstat_init(netstackid_t, ip_stack_t *); static void ip_kstat_fini(netstackid_t, kstat_t *); static int ip_kstat_update(kstat_t *kp, int rw); static void *icmp_kstat_init(netstackid_t); static void icmp_kstat_fini(netstackid_t, kstat_t *); static int icmp_kstat_update(kstat_t *kp, int rw); static void *ip_kstat2_init(netstackid_t, ip_stat_t *); static void ip_kstat2_fini(netstackid_t, kstat_t *); static int ip_conn_report(queue_t *, mblk_t *, caddr_t, cred_t *); static mblk_t *ip_tcp_input(mblk_t *, ipha_t *, ill_t *, boolean_t, ire_t *, mblk_t *, uint_t, queue_t *, ill_rx_ring_t *); static void ip_rput_process_forward(queue_t *, mblk_t *, ire_t *, ipha_t *, ill_t *, boolean_t); static void ipobs_init(ip_stack_t *); static void ipobs_fini(ip_stack_t *); ipaddr_t ip_g_all_ones = IP_HOST_MASK; /* How long, in seconds, we allow frags to hang around. */ #define IP_FRAG_TIMEOUT 15 /* * Threshold which determines whether MDT should be used when * generating IP fragments; payload size must be greater than * this threshold for MDT to take place. */ #define IP_WPUT_FRAG_MDT_MIN 32768 /* Setable in /etc/system only */ int ip_wput_frag_mdt_min = IP_WPUT_FRAG_MDT_MIN; static long ip_rput_pullups; int dohwcksum = 1; /* use h/w cksum if supported by the hardware */ vmem_t *ip_minor_arena_sa; /* for minor nos. from INET_MIN_DEV+2 thru 2^^18-1 */ vmem_t *ip_minor_arena_la; /* for minor nos. from 2^^18 thru 2^^32-1 */ int ip_debug; #ifdef DEBUG uint32_t ipsechw_debug = 0; #endif /* * Multirouting/CGTP stuff */ int ip_cgtp_filter_rev = CGTP_FILTER_REV; /* CGTP hooks version */ /* * XXX following really should only be in a header. Would need more * header and .c clean up first. 
*/
extern optdb_obj_t ip_opt_obj;

ulong_t ip_squeue_enter_unbound = 0;

/*
 * Named Dispatch Parameter Table.
 * All of these are alterable, within the min/max values given, at run time.
 *
 * Each initializer is { min, max, value, name }, where "value" is the
 * setting the table starts out with.  Entries are positional, so new
 * parameters must not be inserted in the middle without updating whatever
 * indexes this array.
 * NOTE(review): the accessors that index this table are not visible in this
 * part of the file -- confirm before reordering.
 */
static ipparam_t lcl_param_arr[] = {
	/* min max value name */
	{ 0, 1, 0, "ip_respond_to_address_mask_broadcast"},
	{ 0, 1, 1, "ip_respond_to_echo_broadcast"},
	{ 0, 1, 1, "ip_respond_to_echo_multicast"},
	{ 0, 1, 0, "ip_respond_to_timestamp"},
	{ 0, 1, 0, "ip_respond_to_timestamp_broadcast"},
	{ 0, 1, 1, "ip_send_redirects"},
	{ 0, 1, 0, "ip_forward_directed_broadcasts"},
	{ 0, 10, 0, "ip_mrtdebug"},
	{ 5000, 999999999, 60000, "ip_ire_timer_interval" },
	{ 60000, 999999999, 1200000, "ip_ire_arp_interval" },
	{ 60000, 999999999, 60000, "ip_ire_redirect_interval" },
	{ 1, 255, 255, "ip_def_ttl" },
	{ 0, 1, 0, "ip_forward_src_routed"},
	{ 0, 256, 32, "ip_wroff_extra" },
	{ 5000, 999999999, 600000, "ip_ire_pathmtu_interval" },
	{ 8, 65536, 64, "ip_icmp_return_data_bytes" },
	{ 0, 1, 1, "ip_path_mtu_discovery" },
	{ 0, 240, 30, "ip_ignore_delete_time" },
	{ 0, 1, 0, "ip_ignore_redirect" },
	{ 0, 1, 1, "ip_output_queue" },
	{ 1, 254, 1, "ip_broadcast_ttl" },
	{ 0, 99999, 100, "ip_icmp_err_interval" },
	{ 1, 99999, 10, "ip_icmp_err_burst" },
	{ 0, 999999999, 1000000, "ip_reass_queue_bytes" },
	{ 0, 1, 0, "ip_strict_dst_multihoming" },
	{ 1, MAX_ADDRS_PER_IF, 256, "ip_addrs_per_if"},
	{ 0, 1, 0, "ipsec_override_persocket_policy" },
	{ 0, 1, 1, "icmp_accept_clear_messages" },
	{ 0, 1, 1, "igmp_accept_clear_messages" },
	{ 2, 999999999, ND_DELAY_FIRST_PROBE_TIME,
	    "ip_ndp_delay_first_probe_time"},
	{ 1, 999999999, ND_MAX_UNICAST_SOLICIT,
	    "ip_ndp_max_unicast_solicit"},
	{ 1, 255, IPV6_MAX_HOPS, "ip6_def_hops" },
	{ 8, IPV6_MIN_MTU, IPV6_MIN_MTU, "ip6_icmp_return_data_bytes" },
	{ 0, 1, 0, "ip6_forward_src_routed"},
	{ 0, 1, 1, "ip6_respond_to_echo_multicast"},
	{ 0, 1, 1, "ip6_send_redirects"},
	{ 0, 1, 0, "ip6_ignore_redirect" },
	{ 0, 1, 0, "ip6_strict_dst_multihoming" },

	{ 1, 8, 3, "ip_ire_reclaim_fraction" },

	{ 0, 999999, 1000, "ipsec_policy_log_interval" },

	{ 0, 1, 1, "pim_accept_clear_messages" },
	{ 1000, 20000, 2000, "ip_ndp_unsolicit_interval" },
	{ 1, 20, 3, "ip_ndp_unsolicit_count" },
	{ 0, 1, 1, "ip6_ignore_home_address_opt" },
	{ 0, 15, 0, "ip_policy_mask" },
	{ 1000, 60000, 1000, "ip_multirt_resolution_interval" },
	{ 0, 255, 1, "ip_multirt_ttl" },
	{ 0, 1, 1, "ip_multidata_outbound" },
	{ 0, 3600000, 300000, "ip_ndp_defense_interval" },
	{ 0, 999999, 60*60*24, "ip_max_temp_idle" },
	{ 0, 1000, 1, "ip_max_temp_defend" },
	{ 0, 1000, 3, "ip_max_defend" },
	{ 0, 999999, 30, "ip_defend_interval" },
	{ 0, 3600000, 300000, "ip_dup_recovery" },
	{ 0, 1, 1, "ip_restrict_interzone_loopback" },
	{ 0, 1, 1, "ip_lso_outbound" },
	{ IGMP_V1_ROUTER, IGMP_V3_ROUTER, IGMP_V3_ROUTER, "igmp_max_version" },
	{ MLD_V1_ROUTER, MLD_V2_ROUTER, MLD_V2_ROUTER, "mld_max_version" },
	{ 68, 65535, 576, "ip_pmtu_min" },
	/*
	 * The last slot exists in both build flavours: a real tunable under
	 * DEBUG, an unnamed all-zero placeholder otherwise, so the array has
	 * the same number of entries either way.
	 */
#ifdef DEBUG
	{ 0, 1, 0, "ip6_drop_inbound_icmpv6" },
#else
	{ 0, 0, 0, "" },
#endif
};

/*
 * Extended NDP table
 * The addresses for the first two are filled in to be ips_ip_g_forward
 * and ips_ipv6_forward at init time.
*/
/*
 * Each initializer is { get function, set function, data pointer, name }.
 * A NULL set function is used for the report-only entries, and a NULL data
 * pointer for entries whose storage is either filled in later (the two
 * forwarding entries, per the comment above) or not supplied by this table.
 *
 * The IPNDP_*_OFFSET macros defined inline are the array index of the entry
 * that immediately follows each of them (0, 1, 11 and 13).  They are plain
 * constants, so adding or removing an entry above one of them requires
 * renumbering it by hand.
 */
static ipndp_t lcl_ndp_arr[] = {
	/* getf setf data name */
#define IPNDP_IP_FORWARDING_OFFSET 0
	{ ip_param_generic_get, ip_forward_set, NULL,
	    "ip_forwarding" },
#define IPNDP_IP6_FORWARDING_OFFSET 1
	{ ip_param_generic_get, ip_forward_set, NULL,
	    "ip6_forwarding" },
	{ ip_ill_report, NULL, NULL,
	    "ip_ill_status" },
	{ ip_ipif_report, NULL, NULL,
	    "ip_ipif_status" },
	{ ip_conn_report, NULL, NULL,
	    "ip_conn_status" },
	{ nd_get_long, nd_set_long, (caddr_t)&ip_rput_pullups,
	    "ip_rput_pullups" },
	{ ip_srcid_report, NULL, NULL,
	    "ip_srcid_status" },
	{ ip_param_generic_get, ip_squeue_profile_set,
	    (caddr_t)&ip_squeue_profile, "ip_squeue_profile" },
	{ ip_param_generic_get, ip_squeue_bind_set,
	    (caddr_t)&ip_squeue_bind, "ip_squeue_bind" },
	{ ip_param_generic_get, ip_input_proc_set,
	    (caddr_t)&ip_squeue_enter, "ip_squeue_enter" },
	{ ip_param_generic_get, ip_int_set,
	    (caddr_t)&ip_squeue_fanout, "ip_squeue_fanout" },
#define IPNDP_CGTP_FILTER_OFFSET 11
	{ ip_cgtp_filter_get, ip_cgtp_filter_set, NULL,
	    "ip_cgtp_filter" },
	{ ip_param_generic_get, ip_int_set,
	    (caddr_t)&ip_soft_rings_cnt, "ip_soft_rings_cnt" },
#define IPNDP_IPMP_HOOK_OFFSET 13
	{ ip_param_generic_get, ipmp_hook_emulation_set, NULL,
	    "ipmp_hook_emulation" },
	{ ip_param_generic_get, ip_int_set, (caddr_t)&ip_debug,
	    "ip_debug" },
};

/*
 * Table of IP ioctls encoding the various properties of the ioctl and
 * indexed based on the last byte of the ioctl command. Occasionally there
 * is a clash, and there is more than 1 ioctl with the same last byte.
 * In such a case 1 ioctl is encoded in the ndx table and the remaining
 * ioctls are encoded in the misc table. An entry in the ndx table is
 * retrieved by indexing on the last byte of the ioctl command and comparing
 * the ioctl command with the value in the ndx table. In the event of a
 * mismatch the misc table is then searched sequentially for the desired
 * ioctl command.
*
 * Entry: { ioctl command, copyin size, IPI_* flags, command type
 *	(IF_CMD, LIF_CMD, MISC_CMD, ...), handler, restart handler }
 * NOTE(review): the field meanings above are read off the initializers
 * below, not from the ip_ioctl_cmd_t declaration -- confirm against the
 * header.
 *
 * The numeric comment in front of each entry is its array index, which has
 * to equal the last byte of the ioctl command stored in that slot.  Unused
 * slots hold IPI_DONTCARE so that the positions of later entries stay put.
 */
ip_ioctl_cmd_t ip_ndx_ioctl_table[] = {
	/* 000 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 001 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 002 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 003 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 004 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 005 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 006 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 007 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 008 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 009 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },

	/* 010 */ { SIOCADDRT, sizeof (struct rtentry), IPI_PRIV,
			MISC_CMD, ip_siocaddrt, NULL },
	/* 011 */ { SIOCDELRT, sizeof (struct rtentry), IPI_PRIV,
			MISC_CMD, ip_siocdelrt, NULL },

	/* 012 */ { SIOCSIFADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
			IF_CMD, ip_sioctl_addr, ip_sioctl_addr_restart },
	/* 013 */ { SIOCGIFADDR, sizeof (struct ifreq), IPI_GET_CMD | IPI_REPL,
			IF_CMD, ip_sioctl_get_addr, NULL },

	/* 014 */ { SIOCSIFDSTADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
			IF_CMD, ip_sioctl_dstaddr, ip_sioctl_dstaddr_restart },
	/* 015 */ { SIOCGIFDSTADDR, sizeof (struct ifreq),
			IPI_GET_CMD | IPI_REPL,
			IF_CMD, ip_sioctl_get_dstaddr, NULL },

	/* 016 */ { SIOCSIFFLAGS, sizeof (struct ifreq),
			IPI_PRIV | IPI_WR | IPI_REPL,
			IF_CMD, ip_sioctl_flags, ip_sioctl_flags_restart },
	/* 017 */ { SIOCGIFFLAGS, sizeof (struct ifreq),
			IPI_MODOK | IPI_GET_CMD | IPI_REPL,
			IF_CMD, ip_sioctl_get_flags, NULL },

	/* 018 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 019 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },

	/* copyin size cannot be coded for SIOCGIFCONF */
	/* 020 */ { O_SIOCGIFCONF, 0, IPI_GET_CMD,
			MISC_CMD, ip_sioctl_get_ifconf, NULL },

	/* 021 */ { SIOCSIFMTU, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
			IF_CMD, ip_sioctl_mtu, NULL },
	/* 022 */ { SIOCGIFMTU, sizeof (struct ifreq), IPI_GET_CMD | IPI_REPL,
			IF_CMD, ip_sioctl_get_mtu, NULL },
	/* 023 */ { SIOCGIFBRDADDR, sizeof (struct ifreq),
			IPI_GET_CMD | IPI_REPL,
			IF_CMD, ip_sioctl_get_brdaddr, NULL },
	/* 024 */ { SIOCSIFBRDADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
			IF_CMD, ip_sioctl_brdaddr, NULL },
	/* 025 */ { SIOCGIFNETMASK, sizeof (struct ifreq),
			IPI_GET_CMD | IPI_REPL,
			IF_CMD, ip_sioctl_get_netmask, NULL },
	/* 026 */ { SIOCSIFNETMASK, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
			IF_CMD, ip_sioctl_netmask, ip_sioctl_netmask_restart },
	/* 027 */ { SIOCGIFMETRIC, sizeof (struct ifreq),
			IPI_GET_CMD | IPI_REPL,
			IF_CMD, ip_sioctl_get_metric, NULL },
	/* 028 */ { SIOCSIFMETRIC, sizeof (struct ifreq), IPI_PRIV,
			IF_CMD, ip_sioctl_metric, NULL },
	/* 029 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },

	/* See 166-168 below for extended SIOC*XARP ioctls */
	/* 030 */ { SIOCSARP, sizeof (struct arpreq), IPI_PRIV,
			ARP_CMD, ip_sioctl_arp, NULL },
	/* 031 */ { SIOCGARP, sizeof (struct arpreq), IPI_GET_CMD | IPI_REPL,
			ARP_CMD, ip_sioctl_arp, NULL },
	/* 032 */ { SIOCDARP, sizeof (struct arpreq), IPI_PRIV,
			ARP_CMD, ip_sioctl_arp, NULL },

	/* 033 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 034 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 035 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 036 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 037 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 038 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 039 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 040 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 041 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 042 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 043 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 044 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 045 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 046 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 047 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 048 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 049 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 050 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 051 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 052 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 053 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },

	/* 054 */ { IF_UNITSEL, sizeof (int), IPI_PRIV | IPI_WR | IPI_MODOK,
			MISC_CMD, if_unitsel, if_unitsel_restart },

	/* 055 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 056 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 057 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 058 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 059 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 060 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 061 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 062 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 063 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 064 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 065 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 066 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 067 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 068 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 069 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 070 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 071 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 072 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },

	/* 073 */ { SIOCSIFNAME, sizeof (struct ifreq),
			IPI_PRIV | IPI_WR | IPI_MODOK,
			IF_CMD, ip_sioctl_sifname, NULL },

	/* 074 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 075 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 076 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 077 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 078 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 079 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 080 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 081 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 082 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 083 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 084 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 085 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 086 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },

	/* 087 */ { SIOCGIFNUM, sizeof (int), IPI_GET_CMD | IPI_REPL,
			MISC_CMD, ip_sioctl_get_ifnum, NULL },
	/* 088 */ { SIOCGIFMUXID, sizeof (struct ifreq), IPI_GET_CMD | IPI_REPL,
			IF_CMD, ip_sioctl_get_muxid, NULL },
	/* 089 */ { SIOCSIFMUXID, sizeof (struct ifreq),
			IPI_PRIV | IPI_WR | IPI_REPL,
			IF_CMD, ip_sioctl_muxid, NULL },

	/* Both if and lif variants share same func */
	/* 090 */ { SIOCGIFINDEX, sizeof (struct ifreq), IPI_GET_CMD | IPI_REPL,
			IF_CMD, ip_sioctl_get_lifindex, NULL },
	/* Both if and lif variants share same func */
	/* 091 */ { SIOCSIFINDEX, sizeof (struct ifreq),
			IPI_PRIV | IPI_WR | IPI_REPL,
			IF_CMD, ip_sioctl_slifindex, NULL },

	/* copyin size cannot be coded for SIOCGIFCONF */
	/* 092 */ { SIOCGIFCONF, 0, IPI_GET_CMD,
			MISC_CMD, ip_sioctl_get_ifconf, NULL },
	/* 093 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 094 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 095 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 096 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 097 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 098 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 099 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 100 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 101 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 102 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 103 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 104 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 105 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 106 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 107 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 108 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 109 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },

	/* 110 */ { SIOCLIFREMOVEIF, sizeof (struct lifreq),
			IPI_PRIV | IPI_WR | IPI_REPL,
			LIF_CMD, ip_sioctl_removeif,
			ip_sioctl_removeif_restart },
	/* 111 */ { SIOCLIFADDIF, sizeof (struct lifreq),
			IPI_GET_CMD | IPI_PRIV | IPI_WR | IPI_REPL,
			LIF_CMD, ip_sioctl_addif, NULL },
#define SIOCLIFADDR_NDX 112
	/* 112 */ { SIOCSLIFADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_addr, ip_sioctl_addr_restart },
	/* 113 */ { SIOCGLIFADDR, sizeof (struct lifreq),
			IPI_GET_CMD | IPI_REPL,
			LIF_CMD, ip_sioctl_get_addr, NULL },
	/* 114 */ { SIOCSLIFDSTADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_dstaddr, ip_sioctl_dstaddr_restart },
	/* 115 */ { SIOCGLIFDSTADDR, sizeof (struct lifreq),
			IPI_GET_CMD | IPI_REPL,
			LIF_CMD, ip_sioctl_get_dstaddr, NULL },
	/* 116 */ { SIOCSLIFFLAGS, sizeof (struct lifreq),
			IPI_PRIV | IPI_WR | IPI_REPL,
			LIF_CMD, ip_sioctl_flags, ip_sioctl_flags_restart },
	/* 117 */ { SIOCGLIFFLAGS, sizeof (struct lifreq),
			IPI_GET_CMD | IPI_MODOK | IPI_REPL,
			LIF_CMD, ip_sioctl_get_flags, NULL },

	/* 118 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 119 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },

	/* 120 */ { O_SIOCGLIFCONF, 0, IPI_GET_CMD, MISC_CMD,
			ip_sioctl_get_lifconf, NULL },
	/* 121 */ { SIOCSLIFMTU, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_mtu, NULL },
	/* 122 */ { SIOCGLIFMTU, sizeof (struct lifreq), IPI_GET_CMD | IPI_REPL,
			LIF_CMD, ip_sioctl_get_mtu, NULL },
	/* 123 */ { SIOCGLIFBRDADDR, sizeof (struct lifreq),
			IPI_GET_CMD | IPI_REPL,
			LIF_CMD, ip_sioctl_get_brdaddr, NULL },
	/* 124 */ { SIOCSLIFBRDADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_brdaddr, NULL },
	/* 125 */ { SIOCGLIFNETMASK, sizeof (struct lifreq),
			IPI_GET_CMD | IPI_REPL,
			LIF_CMD, ip_sioctl_get_netmask, NULL },
	/* 126 */ { SIOCSLIFNETMASK, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_netmask, ip_sioctl_netmask_restart },
	/* 127 */ { SIOCGLIFMETRIC, sizeof (struct lifreq),
			IPI_GET_CMD | IPI_REPL,
			LIF_CMD, ip_sioctl_get_metric, NULL },
	/* 128 */ { SIOCSLIFMETRIC, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_metric, NULL },
	/* 129 */ { SIOCSLIFNAME, sizeof (struct lifreq),
			IPI_PRIV | IPI_WR | IPI_MODOK | IPI_REPL,
			LIF_CMD, ip_sioctl_slifname,
			ip_sioctl_slifname_restart },

	/* 130 */ { SIOCGLIFNUM, sizeof (struct lifnum), IPI_GET_CMD | IPI_REPL,
			MISC_CMD, ip_sioctl_get_lifnum, NULL },
	/* 131 */ { SIOCGLIFMUXID, sizeof (struct lifreq),
			IPI_GET_CMD | IPI_REPL,
			LIF_CMD, ip_sioctl_get_muxid, NULL },
	/* 132 */ { SIOCSLIFMUXID, sizeof (struct lifreq),
			IPI_PRIV | IPI_WR | IPI_REPL,
			LIF_CMD, ip_sioctl_muxid, NULL },
	/* 133 */ { SIOCGLIFINDEX, sizeof (struct lifreq),
			IPI_GET_CMD | IPI_REPL,
			LIF_CMD, ip_sioctl_get_lifindex, 0 },
	/* 134 */ { SIOCSLIFINDEX, sizeof (struct lifreq),
			IPI_PRIV | IPI_WR | IPI_REPL,
			LIF_CMD, ip_sioctl_slifindex, 0 },
	/* 135 */ { SIOCSLIFTOKEN, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_token, NULL },
	/* 136 */ { SIOCGLIFTOKEN, sizeof (struct lifreq),
			IPI_GET_CMD | IPI_REPL,
			LIF_CMD, ip_sioctl_get_token, NULL },
	/* 137 */ { SIOCSLIFSUBNET, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_subnet, ip_sioctl_subnet_restart },
	/* 138 */ { SIOCGLIFSUBNET, sizeof (struct lifreq),
			IPI_GET_CMD | IPI_REPL,
			LIF_CMD, ip_sioctl_get_subnet, NULL },
	/* 139 */ { SIOCSLIFLNKINFO, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_lnkinfo, NULL },
	/* 140 */ { SIOCGLIFLNKINFO, sizeof (struct lifreq),
			IPI_GET_CMD | IPI_REPL,
			LIF_CMD, ip_sioctl_get_lnkinfo, NULL },
	/* 141 */ { SIOCLIFDELND, sizeof (struct lifreq), IPI_PRIV,
			LIF_CMD, ip_siocdelndp_v6, NULL },
	/* 142 */ { SIOCLIFGETND, sizeof (struct lifreq), IPI_GET_CMD,
			LIF_CMD, ip_siocqueryndp_v6, NULL },
	/* 143 */ { SIOCLIFSETND, sizeof (struct lifreq), IPI_PRIV,
			LIF_CMD, ip_siocsetndp_v6, NULL },
	/* 144 */ { SIOCTMYADDR, sizeof (struct sioc_addrreq), IPI_GET_CMD,
			MISC_CMD, ip_sioctl_tmyaddr, NULL },
	/* 145 */ { SIOCTONLINK, sizeof (struct sioc_addrreq), IPI_GET_CMD,
			MISC_CMD, ip_sioctl_tonlink, NULL },
	/* 146 */ { SIOCTMYSITE, sizeof (struct sioc_addrreq), 0,
			MISC_CMD, ip_sioctl_tmysite, NULL },
	/* 147 */ { SIOCGTUNPARAM, sizeof (struct iftun_req), IPI_REPL,
			TUN_CMD, ip_sioctl_tunparam, NULL },
	/* 148 */ { SIOCSTUNPARAM, sizeof (struct iftun_req),
			IPI_PRIV | IPI_WR,
			TUN_CMD, ip_sioctl_tunparam, NULL },

	/* IPsec ioctls are handled in ip_sioctl_copyin_setup itself */
	/* 149 */ { SIOCFIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL },
	/* 150 */ { SIOCSIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL },
	/* 151 */ { SIOCDIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL },
	/* 152 */ { SIOCLIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL },

	/* 153 */ { SIOCLIFFAILOVER, sizeof (struct lifreq),
			IPI_PRIV | IPI_WR | IPI_REPL,
			LIF_CMD, ip_sioctl_move, ip_sioctl_move },
	/* 154 */ { SIOCLIFFAILBACK, sizeof (struct lifreq),
			IPI_PRIV | IPI_WR | IPI_REPL,
			LIF_CMD, ip_sioctl_move, ip_sioctl_move },
	/* 155 */ { SIOCSLIFGROUPNAME, sizeof (struct lifreq),
			IPI_PRIV | IPI_WR | IPI_REPL,
			LIF_CMD, ip_sioctl_groupname, ip_sioctl_groupname },
	/* 156 */ { SIOCGLIFGROUPNAME, sizeof (struct lifreq),
			IPI_GET_CMD | IPI_REPL,
			LIF_CMD, ip_sioctl_get_groupname, NULL },
	/* 157 */ { SIOCGLIFOINDEX, sizeof (struct lifreq),
			IPI_GET_CMD | IPI_REPL,
			LIF_CMD, ip_sioctl_get_oindex, NULL },

	/* Leave 158-160 unused; used to be SIOC*IFARP ioctls */
	/* 158 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 159 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 160 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },

	/* 161 */ { SIOCSLIFOINDEX, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_slifoindex, NULL },

	/* These are handled in ip_sioctl_copyin_setup itself */
	/* 162 */ { SIOCGIP6ADDRPOLICY, 0, IPI_NULL_BCONT,
			MISC_CMD, NULL, NULL },
	/* 163 */ { SIOCSIP6ADDRPOLICY, 0, IPI_PRIV | IPI_NULL_BCONT,
			MISC_CMD, NULL, NULL },
	/* 164 */ { SIOCGDSTINFO, 0, IPI_GET_CMD, MISC_CMD, NULL, NULL },

	/* 165 */ { SIOCGLIFCONF, 0, IPI_GET_CMD, MISC_CMD,
			ip_sioctl_get_lifconf, NULL },

	/* 166 */ { SIOCSXARP, sizeof (struct xarpreq), IPI_PRIV,
			XARP_CMD, ip_sioctl_arp, NULL },
	/* 167 */ { SIOCGXARP, sizeof (struct xarpreq), IPI_GET_CMD | IPI_REPL,
			XARP_CMD, ip_sioctl_arp, NULL },
	/* 168 */ { SIOCDXARP, sizeof (struct xarpreq), IPI_PRIV,
			XARP_CMD, ip_sioctl_arp, NULL },

	/* SIOCPOPSOCKFS is not handled by IP */
	/* 169 */ { IPI_DONTCARE /* SIOCPOPSOCKFS */, 0, 0, 0, NULL, NULL },

	/* 170 */ { SIOCGLIFZONE, sizeof (struct lifreq),
			IPI_GET_CMD | IPI_REPL,
			LIF_CMD, ip_sioctl_get_lifzone, NULL },
	/* 171 */ { SIOCSLIFZONE, sizeof (struct lifreq),
			IPI_PRIV | IPI_WR | IPI_REPL,
			LIF_CMD, ip_sioctl_slifzone,
			ip_sioctl_slifzone_restart },
	/* 172-174 are SCTP ioctls and not handled by IP */
	/* 172 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 173 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 174 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 175 */ { SIOCGLIFUSESRC, sizeof (struct lifreq),
			IPI_GET_CMD, LIF_CMD,
			ip_sioctl_get_lifusesrc, 0 },
	/* 176 */ { SIOCSLIFUSESRC, sizeof (struct lifreq),
			IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_slifusesrc,
			NULL },
	/* 177 */ { SIOCGLIFSRCOF, 0, IPI_GET_CMD, MISC_CMD,
			ip_sioctl_get_lifsrcof, NULL },
	/* 178 */ { SIOCGMSFILTER, sizeof (struct group_filter), IPI_GET_CMD,
			MSFILT_CMD, ip_sioctl_msfilter, NULL },
	/* 179 */ { SIOCSMSFILTER, sizeof (struct group_filter), IPI_WR,
			MSFILT_CMD, ip_sioctl_msfilter, NULL },
	/* 180 */ { SIOCGIPMSFILTER, sizeof (struct ip_msfilter), IPI_GET_CMD,
			MSFILT_CMD, ip_sioctl_msfilter, NULL },
	/* 181 */ { SIOCSIPMSFILTER, sizeof (struct ip_msfilter), IPI_WR,
			MSFILT_CMD, ip_sioctl_msfilter, NULL },
	/* 182 */ { SIOCSIPMPFAILBACK, sizeof (int), IPI_PRIV, MISC_CMD,
			ip_sioctl_set_ipmpfailback, NULL },
	/* SIOCSENABLESDP is handled by SDP */
	/* 183 */ { IPI_DONTCARE /* SIOCSENABLESDP */, 0, 0, 0, NULL, NULL },
};

/* Number of slots in ip_ndx_ioctl_table. */
int ip_ndx_ioctl_count = sizeof (ip_ndx_ioctl_table) / sizeof (ip_ioctl_cmd_t);

/*
 * Overflow table for ioctls that are not stored in the index table above.
 * Same entry layout; the last three initializers give no restart handler,
 * which leaves that member zero-initialized.
 */
ip_ioctl_cmd_t ip_misc_ioctl_table[] = {
	{ OSIOCGTUNPARAM, sizeof (struct old_iftun_req),
		IPI_GET_CMD | IPI_REPL, TUN_CMD, ip_sioctl_tunparam, NULL },
	{ OSIOCSTUNPARAM, sizeof (struct old_iftun_req), IPI_PRIV | IPI_WR,
		TUN_CMD, ip_sioctl_tunparam, NULL },
	{ I_LINK, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL },
	{ I_UNLINK, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL },
	{ I_PLINK, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL },
	{ I_PUNLINK, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL },
	{ ND_GET, 0, IPI_PASS_DOWN, 0, NULL, NULL },
	{ ND_SET, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL },
	{ IP_IOCTL, 0, 0, 0, NULL, NULL },
	{ SIOCGETVIFCNT, sizeof (struct sioc_vif_req), IPI_REPL | IPI_GET_CMD,
		MISC_CMD, mrt_ioctl},
	{ SIOCGETSGCNT, sizeof (struct sioc_sg_req), IPI_REPL | IPI_GET_CMD,
		MISC_CMD, mrt_ioctl},
	{ SIOCGETLSGCNT, sizeof (struct sioc_lsg_req), IPI_REPL | IPI_GET_CMD,
		MISC_CMD, mrt_ioctl}
};

/* Number of entries in ip_misc_ioctl_table. */
int ip_misc_ioctl_count =
    sizeof (ip_misc_ioctl_table) / sizeof (ip_ioctl_cmd_t);

int conn_drain_nthreads; /* Number of drainers reqd. */
			/* Settable in /etc/system */
/* Defined in ip_ire.c */
extern uint32_t ip_ire_max_bucket_cnt, ip6_ire_max_bucket_cnt;
extern uint32_t ip_ire_min_bucket_cnt, ip6_ire_min_bucket_cnt;
extern uint32_t ip_ire_mem_ratio, ip_ire_cpu_ratio;

/*
 * IRE type constant paired with a short display name.  The list ends with
 * an all-zero sentinel entry.
 */
static nv_t ire_nv_arr[] = {
	{ IRE_BROADCAST, "BROADCAST" },
	{ IRE_LOCAL, "LOCAL" },
	{ IRE_LOOPBACK, "LOOPBACK" },
	{ IRE_CACHE, "CACHE" },
	{ IRE_DEFAULT, "DEFAULT" },
	{ IRE_PREFIX, "PREFIX" },
	{ IRE_IF_NORESOLVER, "IF_NORESOL" },
	{ IRE_IF_RESOLVER, "IF_RESOLV" },
	{ IRE_HOST, "HOST" },
	{ 0 }
};

/* Non-static handle on the table above for use outside this file. */
nv_t *ire_nv_tbl = ire_nv_arr;

/* Simple ICMP IP Header Template */
static ipha_t icmp_ipha = {
	IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP
};

/* STREAMS module_info shared by every qinit structure defined below. */
struct module_info ip_mod_info = {
	IP_MOD_ID, IP_MOD_NAME, 1, INFPSZ, 65536, 1024
};

/*
 * Duplicate static symbols within a module confuses mdb; so we avoid the
 * problem by making the symbols here distinct from those in udp.c.
 */

/*
 * Entry points for IP as a device and as a module.
 * FIXME: down the road we might want a separate module and driver qinit.
 * We have separate open functions for the /dev/ip and /dev/ip6 devices.
*/ static struct qinit iprinitv4 = { (pfi_t)ip_rput, NULL, ip_openv4, ip_close, NULL, &ip_mod_info }; struct qinit iprinitv6 = { (pfi_t)ip_rput_v6, NULL, ip_openv6, ip_close, NULL, &ip_mod_info }; static struct qinit ipwinitv4 = { (pfi_t)ip_wput, (pfi_t)ip_wsrv, NULL, NULL, NULL, &ip_mod_info }; struct qinit ipwinitv6 = { (pfi_t)ip_wput_v6, (pfi_t)ip_wsrv, NULL, NULL, NULL, &ip_mod_info }; static struct qinit iplrinit = { (pfi_t)ip_lrput, NULL, ip_openv4, ip_close, NULL, &ip_mod_info }; static struct qinit iplwinit = { (pfi_t)ip_lwput, NULL, NULL, NULL, NULL, &ip_mod_info }; /* For AF_INET aka /dev/ip */ struct streamtab ipinfov4 = { &iprinitv4, &ipwinitv4, &iplrinit, &iplwinit }; /* For AF_INET6 aka /dev/ip6 */ struct streamtab ipinfov6 = { &iprinitv6, &ipwinitv6, &iplrinit, &iplwinit }; #ifdef DEBUG static boolean_t skip_sctp_cksum = B_FALSE; #endif /* * Prepend the zoneid using an ipsec_out_t for later use by functions like * ip_rput_v6(), ip_output(), etc. If the message * block already has a M_CTL at the front of it, then simply set the zoneid * appropriately. */ mblk_t * ip_prepend_zoneid(mblk_t *mp, zoneid_t zoneid, ip_stack_t *ipst) { mblk_t *first_mp; ipsec_out_t *io; ASSERT(zoneid != ALL_ZONES); if (mp->b_datap->db_type == M_CTL) { io = (ipsec_out_t *)mp->b_rptr; ASSERT(io->ipsec_out_type == IPSEC_OUT); io->ipsec_out_zoneid = zoneid; return (mp); } first_mp = ipsec_alloc_ipsec_out(ipst->ips_netstack); if (first_mp == NULL) return (NULL); io = (ipsec_out_t *)first_mp->b_rptr; /* This is not a secure packet */ io->ipsec_out_secure = B_FALSE; io->ipsec_out_zoneid = zoneid; first_mp->b_cont = mp; return (first_mp); } /* * Copy an M_CTL-tagged message, preserving reference counts appropriately. */ mblk_t * ip_copymsg(mblk_t *mp) { mblk_t *nmp; ipsec_info_t *in; if (mp->b_datap->db_type != M_CTL) return (copymsg(mp)); in = (ipsec_info_t *)mp->b_rptr; /* * Note that M_CTL is also used for delivering ICMP error messages * upstream to transport layers. 
*/ if (in->ipsec_info_type != IPSEC_OUT && in->ipsec_info_type != IPSEC_IN) return (copymsg(mp)); nmp = copymsg(mp->b_cont); if (in->ipsec_info_type == IPSEC_OUT) { return (ipsec_out_tag(mp, nmp, ((ipsec_out_t *)in)->ipsec_out_ns)); } else { return (ipsec_in_tag(mp, nmp, ((ipsec_in_t *)in)->ipsec_in_ns)); } } /* Generate an ICMP fragmentation needed message. */ static void icmp_frag_needed(queue_t *q, mblk_t *mp, int mtu, zoneid_t zoneid, ip_stack_t *ipst) { icmph_t icmph; mblk_t *first_mp; boolean_t mctl_present; EXTRACT_PKT_MP(mp, first_mp, mctl_present); if (!(mp = icmp_pkt_err_ok(mp, ipst))) { if (mctl_present) freeb(first_mp); return; } bzero(&icmph, sizeof (icmph_t)); icmph.icmph_type = ICMP_DEST_UNREACHABLE; icmph.icmph_code = ICMP_FRAGMENTATION_NEEDED; icmph.icmph_du_mtu = htons((uint16_t)mtu); BUMP_MIB(&ipst->ips_icmp_mib, icmpOutFragNeeded); BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDestUnreachs); icmp_pkt(q, first_mp, &icmph, sizeof (icmph_t), mctl_present, zoneid, ipst); } /* * icmp_inbound deals with ICMP messages in the following ways. * * 1) It needs to send a reply back and possibly delivering it * to the "interested" upper clients. * 2) It needs to send it to the upper clients only. * 3) It needs to change some values in IP only. * 4) It needs to change some values in IP and upper layers e.g TCP. * * We need to accomodate icmp messages coming in clear until we get * everything secure from the wire. If icmp_accept_clear_messages * is zero we check with the global policy and act accordingly. If * it is non-zero, we accept the message without any checks. But * *this does not mean* that this will be delivered to the upper * clients. By accepting we might send replies back, change our MTU * value etc. but delivery to the ULP/clients depends on their policy * dispositions. * * We handle the above 4 cases in the context of IPsec in the * following way : * * 1) Send the reply back in the same way as the request came in. 
 *    If it came in encrypted, it goes out encrypted. If it came in
 *    clear, it goes out in clear. Thus, this will prevent a chosen
 *    plaintext attack.
 * 2) The client may or may not expect things to come in secure.
 *    If it comes in secure, the policy constraints are checked
 *    before delivering it to the upper layers. If it comes in
 *    clear, ipsec_inbound_accept_clear will decide whether to
 *    accept this in clear or not. In both cases, if the returned
 *    message (IP header + 8 bytes) that caused the icmp message has
 *    AH/ESP headers, it is sent up to AH/ESP for validation before
 *    sending up. If there are only 8 bytes of returned message, then
 *    the upper client will not be notified.
 * 3) Check with global policy to see whether it matches the constraints.
 *    But this will be done only if icmp_accept_clear_messages is
 *    zero.
 * 4) If we need to change both in IP and ULP, then the decision taken
 *    while affecting the values in IP and while delivering up to TCP
 *    should be the same.
 *
 *	There are two cases.
 *
 *	a) If we reject data at the IP layer (ipsec_check_global_policy()
 *	   failed), we will not deliver it to the ULP, even though they
 *	   are *willing* to accept in *clear*. This is fine as our global
 *	   disposition to icmp messages asks us to reject the datagram.
 *
 *	b) If we accept data at the IP layer (ipsec_check_global_policy()
 *	   succeeded or icmp_accept_clear_messages is 1), and are not able
 *	   to deliver it to the ULP (policy failed), it can lead to
 *	   consistency problems. The cases known at this time are
 *	   ICMP_DEST_UNREACHABLE messages with the following code
 *	   values :
 *
 *	   - ICMP_FRAGMENTATION_NEEDED : IP adapts to the new value
 *	     and Upper layer rejects. Then the communication will
 *	     come to a stop. This is solved by making similar decisions
 *	     at both levels.
Currently, when we are unable to deliver * to the Upper Layer (due to policy failures) while IP has * adjusted ire_max_frag, the next outbound datagram would * generate a local ICMP_FRAGMENTATION_NEEDED message - which * will be with the right level of protection. Thus the right * value will be communicated even if we are not able to * communicate when we get from the wire initially. But this * assumes there would be at least one outbound datagram after * IP has adjusted its ire_max_frag value. To make things * simpler, we accept in clear after the validation of * AH/ESP headers. * * - Other ICMP ERRORS : We may not be able to deliver it to the * upper layer depending on the level of protection the upper * layer expects and the disposition in ipsec_inbound_accept_clear(). * ipsec_inbound_accept_clear() decides whether a given ICMP error * should be accepted in clear when the Upper layer expects secure. * Thus the communication may get aborted by some bad ICMP * packets. * * IPQoS Notes: * The only instance when a packet is sent for processing is when there * isn't an ICMP client and if we are interested in it. * If there is a client, IPPF processing will take place in the * ip_fanout_proto routine. * * Zones notes: * The packet is only processed in the context of the specified zone: typically * only this zone will reply to an echo request, and only interested clients in * this zone will receive a copy of the packet. This means that the caller must * call icmp_inbound() for each relevant zone. 
*/ static void icmp_inbound(queue_t *q, mblk_t *mp, boolean_t broadcast, ill_t *ill, int sum_valid, uint32_t sum, boolean_t mctl_present, boolean_t ip_policy, ill_t *recv_ill, zoneid_t zoneid) { icmph_t *icmph; ipha_t *ipha; int iph_hdr_length; int hdr_length; boolean_t interested; uint32_t ts; uchar_t *wptr; ipif_t *ipif; mblk_t *first_mp; ipsec_in_t *ii; ire_t *src_ire; boolean_t onlink; timestruc_t now; uint32_t ill_index; ip_stack_t *ipst; ASSERT(ill != NULL); ipst = ill->ill_ipst; first_mp = mp; if (mctl_present) { mp = first_mp->b_cont; ASSERT(mp != NULL); } ipha = (ipha_t *)mp->b_rptr; if (ipst->ips_icmp_accept_clear_messages == 0) { first_mp = ipsec_check_global_policy(first_mp, NULL, ipha, NULL, mctl_present, ipst->ips_netstack); if (first_mp == NULL) return; } /* * On a labeled system, we have to check whether the zone itself is * permitted to receive raw traffic. */ if (is_system_labeled()) { if (zoneid == ALL_ZONES) zoneid = tsol_packet_to_zoneid(mp); if (!tsol_can_accept_raw(mp, B_FALSE)) { ip1dbg(("icmp_inbound: zone %d can't receive raw", zoneid)); BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors); freemsg(first_mp); return; } } /* * We have accepted the ICMP message. It means that we will * respond to the packet if needed. It may not be delivered * to the upper client depending on the policy constraints * and the disposition in ipsec_inbound_accept_clear. */ ASSERT(ill != NULL); BUMP_MIB(&ipst->ips_icmp_mib, icmpInMsgs); iph_hdr_length = IPH_HDR_LENGTH(ipha); if ((mp->b_wptr - mp->b_rptr) < (iph_hdr_length + ICMPH_SIZE)) { /* Last chance to get real. */ if (!pullupmsg(mp, iph_hdr_length + ICMPH_SIZE)) { BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors); freemsg(first_mp); return; } /* Refresh iph following the pullup. */ ipha = (ipha_t *)mp->b_rptr; } /* ICMP header checksum, including checksum field, should be zero. */ if (sum_valid ? 
(sum != 0 && sum != 0xFFFF) : IP_CSUM(mp, iph_hdr_length, 0)) { BUMP_MIB(&ipst->ips_icmp_mib, icmpInCksumErrs); freemsg(first_mp); return; } /* The IP header will always be a multiple of four bytes */ icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; ip2dbg(("icmp_inbound: type %d code %d\n", icmph->icmph_type, icmph->icmph_code)); wptr = (uchar_t *)icmph + ICMPH_SIZE; /* We will set "interested" to "true" if we want a copy */ interested = B_FALSE; switch (icmph->icmph_type) { case ICMP_ECHO_REPLY: BUMP_MIB(&ipst->ips_icmp_mib, icmpInEchoReps); break; case ICMP_DEST_UNREACHABLE: if (icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED) BUMP_MIB(&ipst->ips_icmp_mib, icmpInFragNeeded); interested = B_TRUE; /* Pass up to transport */ BUMP_MIB(&ipst->ips_icmp_mib, icmpInDestUnreachs); break; case ICMP_SOURCE_QUENCH: interested = B_TRUE; /* Pass up to transport */ BUMP_MIB(&ipst->ips_icmp_mib, icmpInSrcQuenchs); break; case ICMP_REDIRECT: if (!ipst->ips_ip_ignore_redirect) interested = B_TRUE; BUMP_MIB(&ipst->ips_icmp_mib, icmpInRedirects); break; case ICMP_ECHO_REQUEST: /* * Whether to respond to echo requests that come in as IP * broadcasts or as IP multicast is subject to debate * (what isn't?). We aim to please, you pick it. * Default is do it. 
*/ if (!broadcast && !CLASSD(ipha->ipha_dst)) { /* unicast: always respond */ interested = B_TRUE; } else if (CLASSD(ipha->ipha_dst)) { /* multicast: respond based on tunable */ interested = ipst->ips_ip_g_resp_to_echo_mcast; } else if (broadcast) { /* broadcast: respond based on tunable */ interested = ipst->ips_ip_g_resp_to_echo_bcast; } BUMP_MIB(&ipst->ips_icmp_mib, icmpInEchos); break; case ICMP_ROUTER_ADVERTISEMENT: case ICMP_ROUTER_SOLICITATION: break; case ICMP_TIME_EXCEEDED: interested = B_TRUE; /* Pass up to transport */ BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimeExcds); break; case ICMP_PARAM_PROBLEM: interested = B_TRUE; /* Pass up to transport */ BUMP_MIB(&ipst->ips_icmp_mib, icmpInParmProbs); break; case ICMP_TIME_STAMP_REQUEST: /* Response to Time Stamp Requests is local policy. */ if (ipst->ips_ip_g_resp_to_timestamp && /* So is whether to respond if it was an IP broadcast. */ (!broadcast || ipst->ips_ip_g_resp_to_timestamp_bcast)) { int tstamp_len = 3 * sizeof (uint32_t); if (wptr + tstamp_len > mp->b_wptr) { if (!pullupmsg(mp, wptr + tstamp_len - mp->b_rptr)) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); freemsg(first_mp); return; } /* Refresh ipha following the pullup. */ ipha = (ipha_t *)mp->b_rptr; icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; wptr = (uchar_t *)icmph + ICMPH_SIZE; } interested = B_TRUE; } BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimestamps); break; case ICMP_TIME_STAMP_REPLY: BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimestampReps); break; case ICMP_INFO_REQUEST: /* Per RFC 1122 3.2.2.7, ignore this. */ case ICMP_INFO_REPLY: break; case ICMP_ADDRESS_MASK_REQUEST: if ((ipst->ips_ip_respond_to_address_mask_broadcast || !broadcast) && /* TODO m_pullup of complete header? 
*/ (mp->b_datap->db_lim - wptr) >= IP_ADDR_LEN) { interested = B_TRUE; } BUMP_MIB(&ipst->ips_icmp_mib, icmpInAddrMasks); break; case ICMP_ADDRESS_MASK_REPLY: BUMP_MIB(&ipst->ips_icmp_mib, icmpInAddrMaskReps); break; default: interested = B_TRUE; /* Pass up to transport */ BUMP_MIB(&ipst->ips_icmp_mib, icmpInUnknowns); break; } /* See if there is an ICMP client. */ if (ipst->ips_ipcl_proto_fanout[IPPROTO_ICMP].connf_head != NULL) { /* If there is an ICMP client and we want one too, copy it. */ mblk_t *first_mp1; if (!interested) { ip_fanout_proto(q, first_mp, ill, ipha, 0, mctl_present, ip_policy, recv_ill, zoneid); return; } first_mp1 = ip_copymsg(first_mp); if (first_mp1 != NULL) { ip_fanout_proto(q, first_mp1, ill, ipha, 0, mctl_present, ip_policy, recv_ill, zoneid); } } else if (!interested) { freemsg(first_mp); return; } else { /* * Initiate policy processing for this packet if ip_policy * is true. */ if (IPP_ENABLED(IPP_LOCAL_IN, ipst) && ip_policy) { ill_index = ill->ill_phyint->phyint_ifindex; ip_process(IPP_LOCAL_IN, &mp, ill_index); if (mp == NULL) { if (mctl_present) { freeb(first_mp); } BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors); return; } } } /* We want to do something with it. */ /* Check db_ref to make sure we can modify the packet. 
*/ if (mp->b_datap->db_ref > 1) { mblk_t *first_mp1; first_mp1 = ip_copymsg(first_mp); freemsg(first_mp); if (!first_mp1) { BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops); return; } first_mp = first_mp1; if (mctl_present) { mp = first_mp->b_cont; ASSERT(mp != NULL); } else { mp = first_mp; } ipha = (ipha_t *)mp->b_rptr; icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; wptr = (uchar_t *)icmph + ICMPH_SIZE; } switch (icmph->icmph_type) { case ICMP_ADDRESS_MASK_REQUEST: ipif = ipif_lookup_remote(ill, ipha->ipha_src, zoneid); if (ipif == NULL) { freemsg(first_mp); return; } /* * outging interface must be IPv4 */ ASSERT(ipif != NULL && !ipif->ipif_isv6); icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY; bcopy(&ipif->ipif_net_mask, wptr, IP_ADDR_LEN); ipif_refrele(ipif); BUMP_MIB(&ipst->ips_icmp_mib, icmpOutAddrMaskReps); break; case ICMP_ECHO_REQUEST: icmph->icmph_type = ICMP_ECHO_REPLY; BUMP_MIB(&ipst->ips_icmp_mib, icmpOutEchoReps); break; case ICMP_TIME_STAMP_REQUEST: { uint32_t *tsp; icmph->icmph_type = ICMP_TIME_STAMP_REPLY; tsp = (uint32_t *)wptr; tsp++; /* Skip past 'originate time' */ /* Compute # of milliseconds since midnight */ gethrestime(&now); ts = (now.tv_sec % (24 * 60 * 60)) * 1000 + now.tv_nsec / (NANOSEC / MILLISEC); *tsp++ = htonl(ts); /* Lay in 'receive time' */ *tsp++ = htonl(ts); /* Lay in 'send time' */ BUMP_MIB(&ipst->ips_icmp_mib, icmpOutTimestampReps); break; } default: ipha = (ipha_t *)&icmph[1]; if ((uchar_t *)&ipha[1] > mp->b_wptr) { if (!pullupmsg(mp, (uchar_t *)&ipha[1] - mp->b_rptr)) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); freemsg(first_mp); return; } icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; ipha = (ipha_t *)&icmph[1]; } if ((IPH_HDR_VERSION(ipha) != IPV4_VERSION)) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); freemsg(first_mp); return; } hdr_length = IPH_HDR_LENGTH(ipha); if (hdr_length < sizeof (ipha_t)) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); freemsg(first_mp); return; } if ((uchar_t *)ipha + hdr_length > 
mp->b_wptr) { if (!pullupmsg(mp, (uchar_t *)ipha + hdr_length - mp->b_rptr)) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); freemsg(first_mp); return; } icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; ipha = (ipha_t *)&icmph[1]; } switch (icmph->icmph_type) { case ICMP_REDIRECT: /* * As there is no upper client to deliver, we don't * need the first_mp any more. */ if (mctl_present) { freeb(first_mp); } icmp_redirect(ill, mp); return; case ICMP_DEST_UNREACHABLE: if (icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED) { if (!icmp_inbound_too_big(icmph, ipha, ill, zoneid, mp, iph_hdr_length, ipst)) { freemsg(first_mp); return; } /* * icmp_inbound_too_big() may alter mp. * Resynch ipha and icmph accordingly. */ icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; ipha = (ipha_t *)&icmph[1]; } /* FALLTHRU */ default : /* * IPQoS notes: Since we have already done IPQoS * processing we don't want to do it again in * the fanout routines called by * icmp_inbound_error_fanout, hence the last * argument, ip_policy, is B_FALSE. */ icmp_inbound_error_fanout(q, ill, first_mp, icmph, ipha, iph_hdr_length, hdr_length, mctl_present, B_FALSE, recv_ill, zoneid); } return; } /* Send out an ICMP packet */ icmph->icmph_checksum = 0; icmph->icmph_checksum = IP_CSUM(mp, iph_hdr_length, 0); if (broadcast || CLASSD(ipha->ipha_dst)) { ipif_t *ipif_chosen; /* * Make it look like it was directed to us, so we don't look * like a fool with a broadcast or multicast source address. */ ipif = ipif_lookup_remote(ill, ipha->ipha_src, zoneid); /* * Make sure that we haven't grabbed an interface that's DOWN. 
*/ if (ipif != NULL) { ipif_chosen = ipif_select_source(ipif->ipif_ill, ipha->ipha_src, zoneid); if (ipif_chosen != NULL) { ipif_refrele(ipif); ipif = ipif_chosen; } } if (ipif == NULL) { ip0dbg(("icmp_inbound: " "No source for broadcast/multicast:\n" "\tsrc 0x%x dst 0x%x ill %p " "ipif_lcl_addr 0x%x\n", ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst), (void *)ill, ill->ill_ipif->ipif_lcl_addr)); freemsg(first_mp); return; } ASSERT(ipif != NULL && !ipif->ipif_isv6); ipha->ipha_dst = ipif->ipif_src_addr; ipif_refrele(ipif); } /* Reset time to live. */ ipha->ipha_ttl = ipst->ips_ip_def_ttl; { /* Swap source and destination addresses */ ipaddr_t tmp; tmp = ipha->ipha_src; ipha->ipha_src = ipha->ipha_dst; ipha->ipha_dst = tmp; } ipha->ipha_ident = 0; if (!IS_SIMPLE_IPH(ipha)) icmp_options_update(ipha); /* * ICMP echo replies should go out on the same interface * the request came on as probes used by in.mpathd for detecting * NIC failures are ECHO packets. We turn-off load spreading * by setting ipsec_in_attach_if to B_TRUE, which is copied * to ipsec_out_attach_if by ipsec_in_to_out called later in this * function. This is in turn handled by ip_wput and ip_newroute * to make sure that the packet goes out on the interface it came * in on. If we don't turnoff load spreading, the packets might get * dropped if there are no non-FAILED/INACTIVE interfaces for it * to go out and in.mpathd would wrongly detect a failure or * mis-detect a NIC failure for link failure. As load spreading * can happen only if ill_group is not NULL, we do only for * that case and this does not affect the normal case. * * We turn off load spreading only on echo packets that came from * on-link hosts. If the interface route has been deleted, this will * not be enforced as we can't do much. For off-link hosts, as the * default routes in IPv4 does not typically have an ire_ipif * pointer, we can't force MATCH_IRE_ILL in ip_wput/ip_newroute. 
* Moreover, expecting a default route through this interface may * not be correct. We use ipha_dst because of the swap above. */ onlink = B_FALSE; if (icmph->icmph_type == ICMP_ECHO_REPLY && ill->ill_group != NULL) { /* * First, we need to make sure that it is not one of our * local addresses. If we set onlink when it is one of * our local addresses, we will end up creating IRE_CACHES * for one of our local addresses. Then, we will never * accept packets for them afterwards. */ src_ire = ire_ctable_lookup(ipha->ipha_dst, 0, IRE_LOCAL, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); if (src_ire == NULL) { ipif = ipif_get_next_ipif(NULL, ill); if (ipif == NULL) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); freemsg(mp); return; } src_ire = ire_ftable_lookup(ipha->ipha_dst, 0, 0, IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0, NULL, MATCH_IRE_ILL | MATCH_IRE_TYPE, ipst); ipif_refrele(ipif); if (src_ire != NULL) { onlink = B_TRUE; ire_refrele(src_ire); } } else { ire_refrele(src_ire); } } if (!mctl_present) { /* * This packet should go out the same way as it * came in i.e in clear. To make sure that global * policy will not be applied to this in ip_wput_ire, * we attach a IPSEC_IN mp and clear ipsec_in_secure. 
*/ ASSERT(first_mp == mp); first_mp = ipsec_in_alloc(B_TRUE, ipst->ips_netstack); if (first_mp == NULL) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); freemsg(mp); return; } ii = (ipsec_in_t *)first_mp->b_rptr; /* This is not a secure packet */ ii->ipsec_in_secure = B_FALSE; if (onlink) { ii->ipsec_in_attach_if = B_TRUE; ii->ipsec_in_ill_index = ill->ill_phyint->phyint_ifindex; ii->ipsec_in_rill_index = recv_ill->ill_phyint->phyint_ifindex; } first_mp->b_cont = mp; } else if (onlink) { ii = (ipsec_in_t *)first_mp->b_rptr; ii->ipsec_in_attach_if = B_TRUE; ii->ipsec_in_ill_index = ill->ill_phyint->phyint_ifindex; ii->ipsec_in_rill_index = recv_ill->ill_phyint->phyint_ifindex; ii->ipsec_in_ns = ipst->ips_netstack; /* No netstack_hold */ } else { ii = (ipsec_in_t *)first_mp->b_rptr; ii->ipsec_in_ns = ipst->ips_netstack; /* No netstack_hold */ } ii->ipsec_in_zoneid = zoneid; ASSERT(zoneid != ALL_ZONES); if (!ipsec_in_to_out(first_mp, ipha, NULL)) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); return; } BUMP_MIB(&ipst->ips_icmp_mib, icmpOutMsgs); put(WR(q), first_mp); } static ipaddr_t icmp_get_nexthop_addr(ipha_t *ipha, ill_t *ill, zoneid_t zoneid, mblk_t *mp) { conn_t *connp; connf_t *connfp; ipaddr_t nexthop_addr = INADDR_ANY; int hdr_length = IPH_HDR_LENGTH(ipha); uint16_t *up; uint32_t ports; ip_stack_t *ipst = ill->ill_ipst; up = (uint16_t *)((uchar_t *)ipha + hdr_length); switch (ipha->ipha_protocol) { case IPPROTO_TCP: { tcph_t *tcph; /* do a reverse lookup */ tcph = (tcph_t *)((uchar_t *)ipha + hdr_length); connp = ipcl_tcp_lookup_reversed_ipv4(ipha, tcph, TCPS_LISTEN, ipst); break; } case IPPROTO_UDP: { uint32_t dstport, srcport; ((uint16_t *)&ports)[0] = up[1]; ((uint16_t *)&ports)[1] = up[0]; /* Extract ports in net byte order */ dstport = htons(ntohl(ports) & 0xFFFF); srcport = htons(ntohl(ports) >> 16); connfp = &ipst->ips_ipcl_udp_fanout[ IPCL_UDP_HASH(dstport, ipst)]; mutex_enter(&connfp->connf_lock); connp = connfp->connf_head; /* do a reverse 
lookup */ while ((connp != NULL) && (!IPCL_UDP_MATCH(connp, dstport, ipha->ipha_src, srcport, ipha->ipha_dst) || !IPCL_ZONE_MATCH(connp, zoneid))) { connp = connp->conn_next; } if (connp != NULL) CONN_INC_REF(connp); mutex_exit(&connfp->connf_lock); break; } case IPPROTO_SCTP: { in6_addr_t map_src, map_dst; IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &map_src); IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &map_dst); ((uint16_t *)&ports)[0] = up[1]; ((uint16_t *)&ports)[1] = up[0]; connp = sctp_find_conn(&map_src, &map_dst, ports, zoneid, ipst->ips_netstack->netstack_sctp); if (connp == NULL) { connp = ipcl_classify_raw(mp, IPPROTO_SCTP, zoneid, ports, ipha, ipst); } else { CONN_INC_REF(connp); SCTP_REFRELE(CONN2SCTP(connp)); } break; } default: { ipha_t ripha; ripha.ipha_src = ipha->ipha_dst; ripha.ipha_dst = ipha->ipha_src; ripha.ipha_protocol = ipha->ipha_protocol; connfp = &ipst->ips_ipcl_proto_fanout[ ipha->ipha_protocol]; mutex_enter(&connfp->connf_lock); connp = connfp->connf_head; for (connp = connfp->connf_head; connp != NULL; connp = connp->conn_next) { if (IPCL_PROTO_MATCH(connp, ipha->ipha_protocol, &ripha, ill, 0, zoneid)) { CONN_INC_REF(connp); break; } } mutex_exit(&connfp->connf_lock); } } if (connp != NULL) { if (connp->conn_nexthop_set) nexthop_addr = connp->conn_nexthop_v4; CONN_DEC_REF(connp); } return (nexthop_addr); } /* Table from RFC 1191 */ static int icmp_frag_size_table[] = { 32000, 17914, 8166, 4352, 2002, 1496, 1006, 508, 296, 68 }; /* * Process received ICMP Packet too big. * After updating any IRE it does the fanout to any matching transport streams. * Assumes the message has been pulled up till the IP header that caused * the error. * * Returns B_FALSE on failure and B_TRUE on success. 
*/ static boolean_t icmp_inbound_too_big(icmph_t *icmph, ipha_t *ipha, ill_t *ill, zoneid_t zoneid, mblk_t *mp, int iph_hdr_length, ip_stack_t *ipst) { ire_t *ire, *first_ire; int mtu, orig_mtu; int hdr_length; ipaddr_t nexthop_addr; boolean_t disable_pmtud; ASSERT(icmph->icmph_type == ICMP_DEST_UNREACHABLE && icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED); ASSERT(ill != NULL); hdr_length = IPH_HDR_LENGTH(ipha); /* Drop if the original packet contained a source route */ if (ip_source_route_included(ipha)) { return (B_FALSE); } /* * Verify we have atleast ICMP_MIN_TP_HDR_LENGTH bytes of transport * header. */ if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN > mp->b_wptr) { if (!pullupmsg(mp, (uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN - mp->b_rptr)) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); ip1dbg(("icmp_inbound_too_big: insufficient hdr\n")); return (B_FALSE); } icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; ipha = (ipha_t *)&icmph[1]; } nexthop_addr = icmp_get_nexthop_addr(ipha, ill, zoneid, mp); if (nexthop_addr != INADDR_ANY) { /* nexthop set */ first_ire = ire_ctable_lookup(ipha->ipha_dst, nexthop_addr, 0, NULL, ALL_ZONES, MBLK_GETLABEL(mp), MATCH_IRE_MARK_PRIVATE_ADDR | MATCH_IRE_GW, ipst); } else { /* nexthop not set */ first_ire = ire_ctable_lookup(ipha->ipha_dst, 0, IRE_CACHE, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); } if (!first_ire) { ip1dbg(("icmp_inbound_too_big: no route for 0x%x\n", ntohl(ipha->ipha_dst))); return (B_FALSE); } /* Check for MTU discovery advice as described in RFC 1191 */ mtu = ntohs(icmph->icmph_du_mtu); orig_mtu = mtu; disable_pmtud = B_FALSE; rw_enter(&first_ire->ire_bucket->irb_lock, RW_READER); for (ire = first_ire; ire != NULL && ire->ire_addr == ipha->ipha_dst; ire = ire->ire_next) { /* * Look for the connection to which this ICMP message is * directed. If it has the IP_NEXTHOP option set, then the * search is limited to IREs with the MATCH_IRE_PRIVATE * option. 
Else the search is limited to regular IREs. */ if (((ire->ire_marks & IRE_MARK_PRIVATE_ADDR) && (nexthop_addr != ire->ire_gateway_addr)) || (!(ire->ire_marks & IRE_MARK_PRIVATE_ADDR) && (nexthop_addr != INADDR_ANY))) continue; mutex_enter(&ire->ire_lock); if (icmph->icmph_du_zero != 0 || mtu < ipst->ips_ip_pmtu_min) { uint32_t length; int i; /* * Use the table from RFC 1191 to figure out * the next "plateau" based on the length in * the original IP packet. */ length = ntohs(ipha->ipha_length); DTRACE_PROBE2(ip4__pmtu__guess, ire_t *, ire, uint32_t, length); if (ire->ire_max_frag <= length && ire->ire_max_frag >= length - hdr_length) { /* * Handle broken BSD 4.2 systems that * return the wrong iph_length in ICMP * errors. */ length -= hdr_length; } for (i = 0; i < A_CNT(icmp_frag_size_table); i++) { if (length > icmp_frag_size_table[i]) break; } if (i == A_CNT(icmp_frag_size_table)) { /* Smaller than 68! */ disable_pmtud = B_TRUE; mtu = ipst->ips_ip_pmtu_min; } else { mtu = icmp_frag_size_table[i]; if (mtu < ipst->ips_ip_pmtu_min) { mtu = ipst->ips_ip_pmtu_min; disable_pmtud = B_TRUE; } } /* Fool the ULP into believing our guessed PMTU. */ icmph->icmph_du_zero = 0; icmph->icmph_du_mtu = htons(mtu); } if (disable_pmtud) ire->ire_frag_flag = 0; /* Reduce the IRE max frag value as advised. */ ire->ire_max_frag = MIN(ire->ire_max_frag, mtu); mutex_exit(&ire->ire_lock); DTRACE_PROBE4(ip4__pmtu__change, icmph_t *, icmph, ire_t *, ire, int, orig_mtu, int, mtu); } rw_exit(&first_ire->ire_bucket->irb_lock); ire_refrele(first_ire); return (B_TRUE); } /* * If the packet in error is Self-Encapsulated, icmp_inbound_error_fanout * calls this function. */ static mblk_t * icmp_inbound_self_encap_error(mblk_t *mp, int iph_hdr_length, int hdr_length) { ipha_t *ipha; icmph_t *icmph; ipha_t *in_ipha; int length; ASSERT(mp->b_datap->db_type == M_DATA); /* * For Self-encapsulated packets, we added an extra IP header * without the options. 
Inner IP header is the one from which * the outer IP header was formed. Thus, we need to remove the * outer IP header. To do this, we pullup the whole message * and overlay whatever follows the outer IP header over the * outer IP header. */ if (!pullupmsg(mp, -1)) return (NULL); icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; ipha = (ipha_t *)&icmph[1]; in_ipha = (ipha_t *)((uchar_t *)ipha + hdr_length); /* * The length that we want to overlay is following the inner * IP header. Subtracting the IP header + icmp header + outer * IP header's length should give us the length that we want to * overlay. */ length = msgdsize(mp) - iph_hdr_length - sizeof (icmph_t) - hdr_length; /* * Overlay whatever follows the inner header over the * outer header. */ bcopy((uchar_t *)in_ipha, (uchar_t *)ipha, length); /* Set the wptr to account for the outer header */ mp->b_wptr -= hdr_length; return (mp); } /* * Try to pass the ICMP message upstream in case the ULP cares. * * If the packet that caused the ICMP error is secure, we send * it to AH/ESP to make sure that the attached packet has a * valid association. ipha in the code below points to the * IP header of the packet that caused the error. * * We handle ICMP_FRAGMENTATION_NEEDED(IFN) message differently * in the context of IPsec. Normally we tell the upper layer * whenever we send the ire (including ip_bind), the IPsec header * length in ire_ipsec_overhead. TCP can deduce the MSS as it * has both the MTU (ire_max_frag) and the ire_ipsec_overhead. * Similarly, we pass the new MTU icmph_du_mtu and TCP does the * same thing. As TCP has the IPsec options size that needs to be * adjusted, we just pass the MTU unchanged. * * IFN could have been generated locally or by some router. * * LOCAL : *ip_wput_ire -> icmp_frag_needed could have generated this. * This happens because IP adjusted its value of MTU on an * earlier IFN message and could not tell the upper layer, * the new adjusted value of MTU e.g. 
Packet was encrypted * or there was not enough information to fanout to upper * layers. Thus on the next outbound datagram, ip_wput_ire * generates the IFN, where IPsec processing has *not* been * done. * * *ip_wput_ire_fragmentit -> ip_wput_frag -> icmp_frag_needed * could have generated this. This happens because ire_max_frag * value in IP was set to a new value, while the IPsec processing * was being done and after we made the fragmentation check in * ip_wput_ire. Thus on return from IPsec processing, * ip_wput_ipsec_out finds that the new length is > ire_max_frag * and generates the IFN. As IPsec processing is over, we fanout * to AH/ESP to remove the header. * * In both these cases, ipsec_in_loopback will be set indicating * that IFN was generated locally. * * ROUTER : IFN could be secure or non-secure. * * * SECURE : We use the IPSEC_IN to fanout to AH/ESP if the * packet in error has AH/ESP headers to validate the AH/ESP * headers. AH/ESP will verify whether there is a valid SA or * not and send it back. We will fanout again if we have more * data in the packet. * * If the packet in error does not have AH/ESP, we handle it * like any other case. * * * NON_SECURE : If the packet in error has AH/ESP headers, * we attach a dummy ipsec_in and send it up to AH/ESP * for validation. AH/ESP will verify whether there is a * valid SA or not and send it back. We will fanout again if * we have more data in the packet. * * If the packet in error does not have AH/ESP, we handle it * like any other case. 
*/
static void
icmp_inbound_error_fanout(queue_t *q, ill_t *ill, mblk_t *mp,
    icmph_t *icmph, ipha_t *ipha, int iph_hdr_length, int hdr_length,
    boolean_t mctl_present, boolean_t ip_policy, ill_t *recv_ill,
    zoneid_t zoneid)
{
	uint16_t	*portp;		/* Pointer to ports in ULP header */
	uint32_t	rports;		/* reversed ports for fanout */
	ipha_t		rev_ipha;	/* With reversed addresses */
	uchar_t		*need_end;	/* end of the bytes we must see */
	mblk_t		*first_mp;
	ipsec_in_t	*ii;
	tcph_t		*tcph;
	conn_t		*connp;
	ip_stack_t	*ipst;

	ASSERT(ill != NULL);

	ASSERT(recv_ill != NULL);
	ipst = recv_ill->ill_ipst;

	/*
	 * first_mp is the head of the chain; mp is the data mblk, which is
	 * first_mp's b_cont when an IPSEC_IN control block leads the chain.
	 */
	first_mp = mp;
	if (!mctl_present) {
		ii = NULL;
	} else {
		mp = first_mp->b_cont;
		ASSERT(mp != NULL);

		ii = (ipsec_in_t *)first_mp->b_rptr;
		ASSERT(ii->ipsec_in_type == IPSEC_IN);
	}

	switch (ipha->ipha_protocol) {
	case IPPROTO_UDP:
		/*
		 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
		 * transport header.
		 */
		need_end = (uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN;
		if (need_end > mp->b_wptr) {
			if (!pullupmsg(mp, need_end - mp->b_rptr)) {
				goto discard_pkt;
			}
			icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
			ipha = (ipha_t *)&icmph[1];
		}
		portp = (uint16_t *)((uchar_t *)ipha + hdr_length);

		/*
		 * Attempt to find a client stream based on port.
		 * Note that we do a reverse lookup since the header is
		 * in the form we sent it out.
		 * The reversed header is only used for the IP_UDP_MATCH and
		 * we only set the src and dst addresses and protocol.
		 */
		rev_ipha.ipha_src = ipha->ipha_dst;
		rev_ipha.ipha_dst = ipha->ipha_src;
		rev_ipha.ipha_protocol = ipha->ipha_protocol;
		((uint16_t *)&rports)[0] = portp[1];
		((uint16_t *)&rports)[1] = portp[0];
		ip2dbg(("icmp_inbound_error: UDP %x:%d to %x:%d: %d/%d\n",
		    ntohl(ipha->ipha_src), ntohs(portp[0]),
		    ntohl(ipha->ipha_dst), ntohs(portp[1]),
		    icmph->icmph_type, icmph->icmph_code));

		/* Have to change db_type after any pullupmsg */
		DB_TYPE(mp) = M_CTL;

		ip_fanout_udp(q, first_mp, ill, &rev_ipha, rports, B_FALSE, 0,
		    mctl_present, ip_policy, recv_ill, zoneid);
		return;

	case IPPROTO_TCP:
		/*
		 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
		 * transport header.
		 */
		need_end = (uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN;
		if (need_end > mp->b_wptr) {
			if (!pullupmsg(mp, need_end - mp->b_rptr)) {
				goto discard_pkt;
			}
			icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
			ipha = (ipha_t *)&icmph[1];
		}
		/*
		 * Find a TCP client stream for this packet.
		 * Note that we do a reverse lookup since the header is
		 * in the form we sent it out.
		 */
		tcph = (tcph_t *)((uchar_t *)ipha + hdr_length);
		connp = ipcl_tcp_lookup_reversed_ipv4(ipha, tcph, TCPS_LISTEN,
		    ipst);
		if (connp == NULL)
			goto discard_pkt;

		/* Have to change db_type after any pullupmsg */
		DB_TYPE(mp) = M_CTL;
		squeue_fill(connp->conn_sqp, first_mp, tcp_input,
		    connp, SQTAG_TCP_INPUT_ICMP_ERR);
		return;

	case IPPROTO_SCTP:
		/*
		 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
		 * transport header.
		 */
		need_end = (uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN;
		if (need_end > mp->b_wptr) {
			if (!pullupmsg(mp, need_end - mp->b_rptr)) {
				goto discard_pkt;
			}
			icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
			ipha = (ipha_t *)&icmph[1];
		}
		portp = (uint16_t *)((uchar_t *)ipha + hdr_length);
		/*
		 * Find a SCTP client stream for this packet.
		 * Note that we do a reverse lookup since the header is
		 * in the form we sent it out.
		 * The reversed header is only used for the matching and we
		 * only set the src and dst addresses, protocol, and version.
		 */
		rev_ipha.ipha_src = ipha->ipha_dst;
		rev_ipha.ipha_dst = ipha->ipha_src;
		rev_ipha.ipha_protocol = ipha->ipha_protocol;
		rev_ipha.ipha_version_and_hdr_length =
		    ipha->ipha_version_and_hdr_length;
		((uint16_t *)&rports)[0] = portp[1];
		((uint16_t *)&rports)[1] = portp[0];

		/* Have to change db_type after any pullupmsg */
		DB_TYPE(mp) = M_CTL;
		ip_fanout_sctp(first_mp, recv_ill, &rev_ipha, rports, 0,
		    mctl_present, ip_policy, zoneid);
		return;

	case IPPROTO_ESP:
	case IPPROTO_AH: {
		int ipsec_rc;
		ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec;

		/*
		 * We need a IPSEC_IN in the front to fanout to AH/ESP.
		 * We will re-use the IPSEC_IN if it is already present as
		 * AH/ESP will not affect any fields in the IPSEC_IN for
		 * ICMP errors. If there is no IPSEC_IN, allocate a new
		 * one and attach it in the front.
		 */
		if (ii == NULL) {
			/*
			 * IPSEC_IN is not present. We attach a ipsec_in
			 * message and send up to IPsec for validating
			 * and removing the IPsec headers. Clear
			 * ipsec_in_secure so that when we return
			 * from IPsec, we don't mistakenly think that this
			 * is a secure packet came from the network.
			 *
			 * NOTE : ill_index is used by ip_fanout_proto_again
			 * to locate the ill.
			 */
			ASSERT(first_mp == mp);
			first_mp = ipsec_in_alloc(B_TRUE, ipst->ips_netstack);
			if (first_mp == NULL) {
				freemsg(mp);
				BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
				return;
			}
			ii = (ipsec_in_t *)first_mp->b_rptr;

			/* This is not a secure packet */
			ii->ipsec_in_secure = B_FALSE;
			first_mp->b_cont = mp;
			DB_TYPE(mp) = M_CTL;
			ASSERT(ill != NULL);
			ii->ipsec_in_ill_index =
			    ill->ill_phyint->phyint_ifindex;
			ii->ipsec_in_rill_index =
			    recv_ill->ill_phyint->phyint_ifindex;
		} else {
			/*
			 * ip_fanout_proto_again converts the ICMP errors
			 * that come back from AH/ESP to M_DATA so that
			 * if it is non-AH/ESP and we do a pullupmsg in
			 * this function, it would work. Convert it back
			 * to M_CTL before we send up as this is a ICMP
			 * error. This could have been generated locally or
			 * by some router. Validate the inner IPsec
			 * headers.
			 *
			 * NOTE : ill_index is used by ip_fanout_proto_again
			 * to locate the ill.
			 */
			ASSERT(ill != NULL);
			ii->ipsec_in_ill_index =
			    ill->ill_phyint->phyint_ifindex;
			ii->ipsec_in_rill_index =
			    recv_ill->ill_phyint->phyint_ifindex;
			DB_TYPE(first_mp->b_cont) = M_CTL;
		}
		ip2dbg(("icmp_inbound_error: ipsec\n"));

		if (!ipsec_loaded(ipss)) {
			ip_proto_not_sup(q, first_mp, 0, zoneid, ipst);
			return;
		}

		if (ipha->ipha_protocol == IPPROTO_ESP)
			ipsec_rc = ipsecesp_icmp_error(first_mp);
		else
			ipsec_rc = ipsecah_icmp_error(first_mp);
		if (ipsec_rc == IPSEC_STATUS_FAILED)
			return;

		ip_fanout_proto_again(first_mp, ill, recv_ill, NULL);
		return;
	}
	default:
		/*
		 * The reversed header is only used for the lookup and we
		 * only set the src and dst addresses and protocol.
		 */
		rev_ipha.ipha_src = ipha->ipha_dst;
		rev_ipha.ipha_dst = ipha->ipha_src;
		rev_ipha.ipha_protocol = ipha->ipha_protocol;
		ip2dbg(("icmp_inbound_error: proto %d %x to %x: %d/%d\n",
		    rev_ipha.ipha_protocol, ntohl(ipha->ipha_src),
		    ntohl(ipha->ipha_dst),
		    icmph->icmph_type, icmph->icmph_code));
		if (ipha->ipha_protocol == IPPROTO_ENCAP) {
			ipha_t *in_ipha;

			/* The encapsulated IP header must be visible. */
			need_end = (uchar_t *)ipha + hdr_length +
			    sizeof (ipha_t);
			if (need_end > mp->b_wptr) {
				if (!pullupmsg(mp, need_end - mp->b_rptr)) {
					goto discard_pkt;
				}
				icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
				ipha = (ipha_t *)&icmph[1];
			}
			/*
			 * Caller has verified that length has to be
			 * at least the size of IP header.
			 */
			ASSERT(hdr_length >= sizeof (ipha_t));
			/*
			 * Check the sanity of the inner IP header like
			 * we did for the outer header.
			 */
			in_ipha = (ipha_t *)((uchar_t *)ipha + hdr_length);
			if ((IPH_HDR_VERSION(in_ipha) != IPV4_VERSION)) {
				goto discard_pkt;
			}
			if (IPH_HDR_LENGTH(in_ipha) < sizeof (ipha_t)) {
				goto discard_pkt;
			}
			/* Check for Self-encapsulated tunnels */
			if (in_ipha->ipha_src == ipha->ipha_src &&
			    in_ipha->ipha_dst == ipha->ipha_dst) {

				mp = icmp_inbound_self_encap_error(mp,
				    iph_hdr_length, hdr_length);
				if (mp == NULL)
					goto discard_pkt;
				icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
				ipha = (ipha_t *)&icmph[1];
				hdr_length = IPH_HDR_LENGTH(ipha);
				/*
				 * The packet in error is self-encapsulated.
				 * And we are finding it further encapsulated
				 * which we could not have possibly generated.
				 */
				if (ipha->ipha_protocol == IPPROTO_ENCAP) {
					goto discard_pkt;
				}
				/* Fan out again on the now-exposed header. */
				icmp_inbound_error_fanout(q, ill, first_mp,
				    icmph, ipha, iph_hdr_length, hdr_length,
				    mctl_present, ip_policy, recv_ill, zoneid);
				return;
			}
		}
		if ((ipha->ipha_protocol == IPPROTO_ENCAP ||
		    ipha->ipha_protocol == IPPROTO_IPV6) &&
		    icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED &&
		    ii != NULL &&
		    ii->ipsec_in_loopback &&
		    ii->ipsec_in_secure) {
			/*
			 * For IP tunnels that get a looped-back
			 * ICMP_FRAGMENTATION_NEEDED message, adjust the
			 * reported new MTU to take into account the IPsec
			 * headers protecting this configured tunnel.
			 *
			 * This allows the tunnel module (tun.c) to blindly
			 * accept the MTU reported in an ICMP "too big"
			 * message.
			 *
			 * Non-looped back ICMP messages will just be
			 * handled by the security protocols (if needed),
			 * and the first subsequent packet will hit this
			 * path.
			 */
			icmph->icmph_du_mtu = htons(ntohs(icmph->icmph_du_mtu) -
			    ipsec_in_extra_length(first_mp));
		}
		/* Have to change db_type after any pullupmsg */
		DB_TYPE(mp) = M_CTL;

		ip_fanout_proto(q, first_mp, ill, &rev_ipha, 0, mctl_present,
		    ip_policy, recv_ill, zoneid);
		return;
	}
	/* NOTREACHED */
discard_pkt:
	BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
drop_pkt:;
	ip1dbg(("icmp_inbound_error_fanout: drop pkt\n"));
	freemsg(first_mp);
}

/*
 * Common IP options parser.
 *
 * Setup routine: fill in *optp with options-parsing state, then
 * tail-call ipoptp_next to return the first option.
 */
/*
 * optp: parser state to initialize (ipoptp_next, ipoptp_end and
 *	ipoptp_flags are written here).
 * ipha: IPv4 header whose options, starting right after the fixed
 *	header at &ipha[1], are to be walked.
 *
 * Returns whatever ipoptp_next() returns for the first option.
 */
uint8_t
ipoptp_first(ipoptp_t *optp, ipha_t *ipha)
{
	uint32_t totallen; /* total length of all options */

	/*
	 * Subtracting the version/length byte of an option-less header
	 * leaves the number of extra header words (this relies on the
	 * version nibble being IP_VERSION); "<<= 2" turns words into bytes.
	 */
	totallen = ipha->ipha_version_and_hdr_length -
	    (uint8_t)((IP_VERSION << 4) + IP_SIMPLE_HDR_LENGTH_IN_WORDS);
	totallen <<= 2;
	optp->ipoptp_next = (uint8_t *)(&ipha[1]);
	optp->ipoptp_end = optp->ipoptp_next + totallen;
	optp->ipoptp_flags = 0;
	return (ipoptp_next(optp));
}

/*
 * Common IP options parser: extract next option.
 *
 * Reads the cursor from optp->ipoptp_next and the limit from
 * optp->ipoptp_end.  On return:
 *  - IPOPT_EOL means the end of the options was reached, an explicit
 *    EOL option was found, or the option's length byte was missing or
 *    invalid.  In the latter case IPOPTP_ERROR is also set in
 *    optp->ipoptp_flags, so callers that care must check the flags.
 *  - Any other value is the option type.  ipoptp_cur points at the option,
 *    ipoptp_len holds its length and ipoptp_next points past it.  For
 *    RR/TS/LSRR/SSRR the pointer field is validated as well; a bad
 *    pointer sets IPOPTP_ERROR but the option type is still returned.
 */
uint8_t
ipoptp_next(ipoptp_t *optp)
{
	uint8_t *end = optp->ipoptp_end;
	uint8_t *cur = optp->ipoptp_next;
	uint8_t opt, len, pointer;

	/*
	 * If cur > end already, then the ipoptp_end or ipoptp_next pointer
	 * has been corrupted.
	 */
	ASSERT(cur <= end);

	if (cur == end)
		return (IPOPT_EOL);

	opt = cur[IPOPT_OPTVAL];

	/*
	 * Skip any NOP options.
	 */
	while (opt == IPOPT_NOP) {
		cur++;
		if (cur == end)
			return (IPOPT_EOL);
		opt = cur[IPOPT_OPTVAL];
	}

	if (opt == IPOPT_EOL)
		return (IPOPT_EOL);

	/*
	 * Option requiring a length.
	 */
	if ((cur + 1) >= end) {
		/* The length byte itself would lie beyond the options area. */
		optp->ipoptp_flags |= IPOPTP_ERROR;
		return (IPOPT_EOL);
	}
	len = cur[IPOPT_OLEN];
	if (len < 2) {
		/* The length counts the type and length bytes themselves. */
		optp->ipoptp_flags |= IPOPTP_ERROR;
		return (IPOPT_EOL);
	}
	/*
	 * The parser state is updated before the overrun check below, so
	 * on that error ipoptp_next may point beyond ipoptp_end.
	 */
	optp->ipoptp_cur = cur;
	optp->ipoptp_len = len;
	optp->ipoptp_next = cur + len;
	if (cur + len > end) {
		optp->ipoptp_flags |= IPOPTP_ERROR;
		return (IPOPT_EOL);
	}

	/*
	 * For the options which require a pointer field, make sure
	 * it's there, and make sure it points to either something
	 * inside this option, or the end of the option.
	 */
	switch (opt) {
	case IPOPT_RR:
	case IPOPT_TS:
	case IPOPT_LSRR:
	case IPOPT_SSRR:
		if (len <= IPOPT_OFFSET) {
			optp->ipoptp_flags |= IPOPTP_ERROR;
			return (opt);
		}
		pointer = cur[IPOPT_OFFSET];
		/*
		 * "pointer - 1" is evaluated as int, so a pointer value of
		 * zero passes here and is rejected by the minimum-offset
		 * check in the second switch below.
		 */
		if (pointer - 1 > len) {
			optp->ipoptp_flags |= IPOPTP_ERROR;
			return (opt);
		}
		break;
	}

	/*
	 * Sanity check the pointer field based on the type of the
	 * option.  "pointer" is only read for the option types that
	 * assigned it in the switch above.
	 */
	switch (opt) {
	case IPOPT_RR:
	case IPOPT_SSRR:
	case IPOPT_LSRR:
		if (pointer < IPOPT_MINOFF_SR)
			optp->ipoptp_flags |= IPOPTP_ERROR;
		break;
	case IPOPT_TS:
		if (pointer < IPOPT_MINOFF_IT)
			optp->ipoptp_flags |= IPOPTP_ERROR;
		/*
		 * Note that the Internet Timestamp option also
		 * contains two four bit fields (the Overflow field,
		 * and the Flag field), which follow the pointer
		 * field.  We don't need to check that these fields
		 * fall within the length of the option because this
		 * was implicitly done above.  We've checked that the
		 * pointer value is at least IPOPT_MINOFF_IT, and that
		 * it falls within the option.  Since IPOPT_MINOFF_IT >
		 * IPOPT_POS_OV_FLG, we don't need the explicit check.
		 */
		ASSERT(len > IPOPT_POS_OV_FLG);
		break;
	}

	return (opt);
}

/*
 * Use the outgoing IP header to create an IP_OPTIONS option the way
 * it was passed down from the application.
 *
 * Layout written to buf:
 *   - the first IP_ADDR_LEN bytes hold the final destination: the last
 *     source route entry when a source route with entries is present,
 *     zeros otherwise;
 *   - then the options in header order; source route options are
 *     rewritten so that ipha_dst occupies the first slot;
 *   - then IPOPT_EOL padding up to a multiple of four bytes.
 *
 * Returns the number of bytes written, including the leading address slot
 * and the padding.  The size of buf is not checked here; the caller must
 * supply a buffer large enough for the address plus all options.
 */
int
ip_opt_get_user(const ipha_t *ipha, uchar_t *buf)
{
	ipoptp_t opts;
	const uchar_t *opt;
	uint8_t optval;
	uint8_t optlen;
	uint32_t len = 0;
	uchar_t *buf1 = buf;	/* start of buf: final destination slot */

	buf += IP_ADDR_LEN; /* Leave room for final destination */
	len += IP_ADDR_LEN;
	bzero(buf1, IP_ADDR_LEN);

	/*
	 * OK to cast away const here, as we don't store through the returned
	 * opts.ipoptp_cur pointer.
	 */
	for (optval = ipoptp_first(&opts, (ipha_t *)ipha);
	    optval != IPOPT_EOL;
	    optval = ipoptp_next(&opts)) {
		int off;

		opt = opts.ipoptp_cur;
		optlen = opts.ipoptp_len;
		switch (optval) {
		case IPOPT_SSRR:
		case IPOPT_LSRR:
			/*
			 * Insert ipha_dst as the first entry in the source
			 * route and move down the entries one step.
			 * The last entry gets placed at buf1.
			 */
			buf[IPOPT_OPTVAL] = optval;
			buf[IPOPT_OLEN] = optlen;
			buf[IPOPT_OFFSET] = optlen;

			/*
			 * For an option of length 3 + 4n, "off" starts at
			 * the last address entry.
			 */
			off = optlen - IP_ADDR_LEN;
			if (off < 0) {
				/*
				 * No entries in source route.  buf and len
				 * are not advanced, so the three header
				 * bytes just written are overwritten by
				 * whatever is emitted next.
				 */
				break;
			}
			/* Last entry in source route */
			bcopy(opt + off, buf1, IP_ADDR_LEN);
			off -= IP_ADDR_LEN;

			/* Shift every remaining entry one slot later. */
			while (off > 0) {
				bcopy(opt + off,
				    buf + off + IP_ADDR_LEN,
				    IP_ADDR_LEN);
				off -= IP_ADDR_LEN;
			}
			/* ipha_dst into first slot */
			bcopy(&ipha->ipha_dst,
			    buf + off + IP_ADDR_LEN,
			    IP_ADDR_LEN);
			buf += optlen;
			len += optlen;
			break;

		case IPOPT_COMSEC:
		case IPOPT_SECURITY:
			/* if passing up a label is not ok, then remove */
			if (is_system_labeled())
				break;
			/* FALLTHROUGH */
		default:
			/* Every other option is copied unchanged. */
			bcopy(opt, buf, optlen);
			buf += optlen;
			len += optlen;
			break;
		}
	}
	/* NOTE(review): nothing in this function jumps to "done". */
done:
	/* Pad the resulting options */
	while (len & 0x3) {
		*buf++ = IPOPT_EOL;
		len++;
	}
	return (len);
}

/*
 * Reverse any source route option in the header so that a reply follows
 * the recorded path back: the entry before the current pointer is swapped
 * with ipha_dst, the remaining earlier entries are reversed in place, and
 * the option pointer is reset to IPOPT_MINOFF_SR.
 * NOTE(review): an earlier description of this routine also promised
 * updates to record route and timestamp options; the code below only
 * touches IPOPT_SSRR and IPOPT_LSRR.
 * This routine assumes that the options are well formed i.e. that they
 * have already been checked.
 */
static void
icmp_options_update(ipha_t *ipha)
{
	ipoptp_t opts;
	uchar_t *opt;
	uint8_t optval;
	ipaddr_t src; /* used as swap scratch in the reversal loop */
	ipaddr_t dst;

	ip2dbg(("icmp_options_update\n"));
	src = ipha->ipha_src;
	dst = ipha->ipha_dst;

	for (optval = ipoptp_first(&opts, ipha);
	    optval != IPOPT_EOL;
	    optval = ipoptp_next(&opts)) {
		ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0);
		opt = opts.ipoptp_cur;
		ip2dbg(("icmp_options_update: opt %d, len %d\n",
		    optval, opts.ipoptp_len));
		switch (optval) {
			int off1, off2;
		case IPOPT_SSRR:
		case IPOPT_LSRR:
			/*
			 * Reverse the source route.  The first entry
			 * should be the next to last one in the current
			 * source route (the last entry is our address).
			 * The last entry should be the final destination.
			 */
			off1 = IPOPT_MINOFF_SR - 1;
			off2 = opt[IPOPT_OFFSET] - IP_ADDR_LEN - 1;
			if (off2 < 0) {
				/* No entries in source route */
				ip1dbg((
				    "icmp_options_update: bad src route\n"));
				break;
			}
			/* Swap the entry at off2 with ipha_dst. */
			bcopy((char *)opt + off2, &dst, IP_ADDR_LEN);
			bcopy(&ipha->ipha_dst, (char *)opt + off2, IP_ADDR_LEN);
			bcopy(&dst, &ipha->ipha_dst, IP_ADDR_LEN);
			off2 -= IP_ADDR_LEN;

			/* Reverse the entries between off1 and off2. */
			while (off1 < off2) {
				bcopy((char *)opt + off1, &src, IP_ADDR_LEN);
				bcopy((char *)opt + off2, (char *)opt + off1,
				    IP_ADDR_LEN);
				bcopy(&src, (char *)opt + off2, IP_ADDR_LEN);
				off1 += IP_ADDR_LEN;
				off2 -= IP_ADDR_LEN;
			}
			opt[IPOPT_OFFSET] = IPOPT_MINOFF_SR;
			break;
		}
	}
}

/*
 * Process received ICMP Redirect messages.
 *
 * ill: interface the redirect arrived on; only used to find the IP stack.
 * mp:  the redirect message, starting at its IP header.  It is freed on
 *	every path through this function.
 *
 * A redirect is accepted only if the new gateway is reachable through an
 * interface route and we currently have a route to the destination via the
 * sender of the redirect.  A dynamic host route through the new gateway is
 * then added, and any dynamic host route via the old gateway is deleted.
 */
static void
icmp_redirect(ill_t *ill, mblk_t *mp)
{
	ipha_t *ipha;
	int iph_hdr_length;
	icmph_t *icmph;
	ipha_t *ipha_err;
	ire_t *ire;
	ire_t *prev_ire;
	ire_t *save_ire;
	ipaddr_t src, dst, gateway;
	iulp_t ulp_info = { 0 };
	int error;
	ip_stack_t *ipst;

	ASSERT(ill != NULL);
	ipst = ill->ill_ipst;

	ipha = (ipha_t *)mp->b_rptr;
	iph_hdr_length = IPH_HDR_LENGTH(ipha);
	/*
	 * The first mblk must hold the ICMP header plus at least an
	 * option-less copy of the offending packet's IP header.
	 */
	if (((mp->b_wptr - mp->b_rptr) - iph_hdr_length) <
	    sizeof (icmph_t) + IP_SIMPLE_HDR_LENGTH) {
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors);
		freemsg(mp);
		return;
	}
	icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
	ipha_err = (ipha_t *)&icmph[1];
	src = ipha->ipha_src;		/* sender of the redirect */
	dst = ipha_err->ipha_dst;	/* destination being redirected */
	gateway = icmph->icmph_rd_gateway;
	/* Make sure the new gateway is reachable somehow. */
	ire = ire_route_lookup(gateway, 0, 0, IRE_INTERFACE, NULL, NULL,
	    ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
	/*
	 * Make sure we had a route for the dest in question and that
	 * that route was pointing to the old gateway (the source of the
	 * redirect packet.)
	 */
	prev_ire = ire_route_lookup(dst, 0, src, 0, NULL, NULL, ALL_ZONES,
	    NULL, MATCH_IRE_GW, ipst);
	/*
	 * Check that
	 *	the redirect was not from ourselves
	 *	the new gateway and the old gateway are directly reachable
	 */
	if (!prev_ire ||
	    !ire ||
	    ire->ire_type == IRE_LOCAL) {
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInBadRedirects);
		freemsg(mp);
		if (ire != NULL)
			ire_refrele(ire);
		if (prev_ire != NULL)
			ire_refrele(prev_ire);
		return;
	}

	/*
	 * Should we use the old ULP info to create the new gateway?  From
	 * a user's perspective, we should inherit the info so that it
	 * is a "smooth" transition.  If we do not do that, then new
	 * connections going thru the new gateway will have no route metrics,
	 * which is counter-intuitive to user.  From a network point of
	 * view, this may or may not make sense even though the new gateway
	 * is still directly connected to us so the route metrics should not
	 * change much.
	 *
	 * But if the old ire_uinfo is not initialized, we do another
	 * recursive lookup on the dest using the new gateway.  There may
	 * be a route to that.  If so, use it to initialize the redirect
	 * route.
	 */
	if (prev_ire->ire_uinfo.iulp_set) {
		bcopy(&prev_ire->ire_uinfo, &ulp_info, sizeof (iulp_t));
	} else {
		ire_t *tmp_ire;
		ire_t *sire;

		/*
		 * NOTE(review): sire is not initialized here; the test
		 * below relies on ire_ftable_lookup() always storing
		 * through its pire argument -- confirm.
		 */
		tmp_ire = ire_ftable_lookup(dst, 0, gateway, 0, NULL, &sire,
		    ALL_ZONES, 0, NULL,
		    (MATCH_IRE_RECURSIVE | MATCH_IRE_GW | MATCH_IRE_DEFAULT),
		    ipst);
		if (sire != NULL) {
			bcopy(&sire->ire_uinfo, &ulp_info, sizeof (iulp_t));
			/*
			 * If sire != NULL, ire_ftable_lookup() should not
			 * return a NULL value.
			 */
			ASSERT(tmp_ire != NULL);
			ire_refrele(tmp_ire);
			ire_refrele(sire);
		} else if (tmp_ire != NULL) {
			bcopy(&tmp_ire->ire_uinfo, &ulp_info,
			    sizeof (iulp_t));
			ire_refrele(tmp_ire);
		}
	}
	/* A cached route via the old gateway is removed. */
	if (prev_ire->ire_type == IRE_CACHE)
		ire_delete(prev_ire);
	ire_refrele(prev_ire);
	/*
	 * TODO: more precise handling for cases 0, 2, 3, the latter two
	 * require TOS routing
	 */
	switch (icmph->icmph_code) {
	case 0:
	case 1:
		/* TODO: TOS specificity for cases 2 and 3 */
	case 2:
	case 3:
		break;
	default:
		/* Unknown redirect code: drop, releasing the gateway ire. */
		freemsg(mp);
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInBadRedirects);
		ire_refrele(ire);
		return;
	}
	/*
	 * Create a Route Association.  This will allow us to remember that
	 * someone we believe told us to use the particular gateway.
	 */
	save_ire = ire;		/* interface route to the new gateway */
	ire = ire_create(
	    (uchar_t *)&dst,			/* dest addr */
	    (uchar_t *)&ip_g_all_ones,		/* mask */
	    (uchar_t *)&save_ire->ire_src_addr,	/* source addr */
	    (uchar_t *)&gateway,		/* gateway addr */
	    &save_ire->ire_max_frag,		/* max frag */
	    NULL,				/* no src nce */
	    NULL,				/* no rfq */
	    NULL,				/* no stq */
	    IRE_HOST,
	    NULL,				/* ipif */
	    0,					/* cmask */
	    0,					/* phandle */
	    0,					/* ihandle */
	    (RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST),
	    &ulp_info,
	    NULL,				/* tsol_gc_t */
	    NULL,				/* gcgrp */
	    ipst);

	if (ire == NULL) {
		freemsg(mp);
		ire_refrele(save_ire);
		return;
	}
	error = ire_add(&ire, NULL, NULL, NULL, B_FALSE);
	ire_refrele(save_ire);
	/* The redirect counter is bumped whether or not ire_add succeeded. */
	atomic_inc_32(&ipst->ips_ip_redirect_cnt);

	if (error == 0) {
		ire_refrele(ire);		/* Held in ire_add_v4 */
		/* tell routing sockets that we received a redirect */
		ip_rts_change(RTM_REDIRECT, dst, gateway, IP_HOST_MASK, 0, src,
		    (RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST), 0,
		    (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_AUTHOR), ipst);
	}

	/*
	 * Delete any existing IRE_HOST type redirect ires for this destination.
	 * This together with the added IRE has the effect of
	 * modifying an existing redirect.
	 */
	prev_ire = ire_ftable_lookup(dst, 0, src, IRE_HOST, NULL, NULL,
	    ALL_ZONES, 0, NULL, (MATCH_IRE_GW | MATCH_IRE_TYPE), ipst);
	if (prev_ire != NULL) {
		if (prev_ire ->ire_flags & RTF_DYNAMIC)
			ire_delete(prev_ire);
		ire_refrele(prev_ire);
	}

	freemsg(mp);
}

/*
 * Generate an ICMP parameter problem message.
 *
 * q:      queue the resulting ICMP message is put on (see icmp_pkt).
 * mp:     offending packet, optionally preceded by a control mblk.
 * ptr:    value stored in the ICMP header's icmph_pp_ptr field.
 * zoneid, ipst: passed through to icmp_pkt.
 *
 * If icmp_pkt_err_ok() vetoes the error it has already freed the data
 * message, so only a leading control mblk remains to be freed here.
 */
static void
icmp_param_problem(queue_t *q, mblk_t *mp, uint8_t ptr, zoneid_t zoneid,
    ip_stack_t *ipst)
{
	icmph_t icmph;
	boolean_t mctl_present;
	mblk_t *first_mp;

	EXTRACT_PKT_MP(mp, first_mp, mctl_present);

	if (!(mp = icmp_pkt_err_ok(mp, ipst))) {
		if (mctl_present)
			freeb(first_mp);
		return;
	}

	bzero(&icmph, sizeof (icmph_t));
	icmph.icmph_type = ICMP_PARAM_PROBLEM;
	icmph.icmph_pp_ptr = ptr;
	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutParmProbs);
	icmp_pkt(q, first_mp, &icmph, sizeof (icmph_t), mctl_present, zoneid,
	    ipst);
}

/*
 * Build and ship an IPv4 ICMP message using the packet data in mp, and
 * the ICMP header pointed to by "stuff".  (May be called as writer.)
 * Note: assumes that icmp_pkt_err_ok has been called to verify that
 * an icmp error packet can be sent.
 * Assigns an appropriate source address to the packet. If ipha_dst is
 * one of our addresses use it for source. Otherwise pick a source based
 * on a route lookup back to ipha_src.
 * Note that ipha_src must be set here since the
 * packet is likely to arrive on an ill queue in ip_wput() which will
 * not set a source address.
 *
 * q:            queue the finished message is handed to with put().
 * mp:           offending packet; when mctl_present is set this is the
 *               IPsec control mblk with the packet in b_cont.
 * stuff, len:   ICMP header bytes to copy in, and their length.
 * mctl_present: whether mp carries a leading IPSEC_IN/IPSEC_OUT mblk.
 * zoneid:       zone used for the source address route lookups.
 *
 * The message is consumed on every path: it is sent, or freed here, or
 * (NOTE(review): presumably) freed by ipsec_in_to_out() when that fails,
 * since this function frees nothing on those returns -- confirm.
 */
static void
icmp_pkt(queue_t *q, mblk_t *mp, void *stuff, size_t len,
    boolean_t mctl_present, zoneid_t zoneid, ip_stack_t *ipst)
{
	ipaddr_t dst;
	icmph_t	*icmph;
	ipha_t	*ipha;
	uint_t	len_needed;
	size_t	msg_len;
	mblk_t	*mp1;
	ipaddr_t src;
	ire_t	*ire;
	mblk_t *ipsec_mp;
	ipsec_out_t	*io = NULL;

	if (mctl_present) {
		/*
		 * If it is :
		 *
		 * 1) a IPSEC_OUT, then this is caused by outbound
		 *    datagram originating on this host. IPsec processing
		 *    may or may not have been done. Refer to comments above
		 *    icmp_inbound_error_fanout for details.
		 *
		 * 2) a IPSEC_IN if we are generating a icmp_message
		 *    for an incoming datagram destined for us i.e called
		 *    from ip_fanout_send_icmp.
		 */
		ipsec_info_t *in;
		ipsec_mp = mp;
		mp = ipsec_mp->b_cont;

		in = (ipsec_info_t *)ipsec_mp->b_rptr;
		ipha = (ipha_t *)mp->b_rptr;

		ASSERT(in->ipsec_info_type == IPSEC_OUT ||
		    in->ipsec_info_type == IPSEC_IN);

		if (in->ipsec_info_type == IPSEC_IN) {
			/*
			 * Convert the IPSEC_IN to IPSEC_OUT.
			 */
			if (!ipsec_in_to_out(ipsec_mp, ipha, NULL)) {
				BUMP_MIB(&ipst->ips_ip_mib,
				    ipIfStatsOutDiscards);
				return;
			}
			io = (ipsec_out_t *)ipsec_mp->b_rptr;
		} else {
			ASSERT(in->ipsec_info_type == IPSEC_OUT);
			io = (ipsec_out_t *)in;
			/*
			 * Clear out ipsec_out_proc_begin, so we do a fresh
			 * ire lookup.
			 */
			io->ipsec_out_proc_begin = B_FALSE;
		}
		ASSERT(zoneid == io->ipsec_out_zoneid);
		ASSERT(zoneid != ALL_ZONES);
	} else {
		/*
		 * This is in clear. The icmp message we are building
		 * here should go out in clear.
		 *
		 * Pardon the convolution of it all, but it's easier to
		 * allocate a "use cleartext" IPSEC_IN message and convert
		 * it than it is to allocate a new one.
		 */
		ipsec_in_t *ii;
		ASSERT(DB_TYPE(mp) == M_DATA);
		ipsec_mp = ipsec_in_alloc(B_TRUE, ipst->ips_netstack);
		if (ipsec_mp == NULL) {
			freemsg(mp);
			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
			return;
		}
		ii = (ipsec_in_t *)ipsec_mp->b_rptr;

		/* This is not a secure packet */
		ii->ipsec_in_secure = B_FALSE;
		/*
		 * For trusted extensions using a shared IP address we can
		 * send using any zoneid.
		 */
		if (zoneid == ALL_ZONES)
			ii->ipsec_in_zoneid = GLOBAL_ZONEID;
		else
			ii->ipsec_in_zoneid = zoneid;
		ipsec_mp->b_cont = mp;
		ipha = (ipha_t *)mp->b_rptr;
		/*
		 * Convert the IPSEC_IN to IPSEC_OUT.
		 */
		if (!ipsec_in_to_out(ipsec_mp, ipha, NULL)) {
			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
			return;
		}
		io = (ipsec_out_t *)ipsec_mp->b_rptr;
	}

	/* Remember our eventual destination */
	dst = ipha->ipha_src;

	/*
	 * Source address selection: if the offending packet was addressed
	 * to a local or loopback address visible to this zone, reply from
	 * that address; otherwise use the source address of the route back
	 * to the sender.
	 */
	ire = ire_route_lookup(ipha->ipha_dst, 0, 0, (IRE_LOCAL|IRE_LOOPBACK),
	    NULL, NULL, zoneid, NULL, MATCH_IRE_TYPE, ipst);
	if (ire != NULL &&
	    (ire->ire_zoneid == zoneid || ire->ire_zoneid == ALL_ZONES)) {
		src = ipha->ipha_dst;
	} else {
		if (ire != NULL)
			ire_refrele(ire);
		ire = ire_route_lookup(dst, 0, 0, 0, NULL, NULL, zoneid, NULL,
		    (MATCH_IRE_DEFAULT|MATCH_IRE_RECURSIVE|MATCH_IRE_ZONEONLY),
		    ipst);
		if (ire == NULL) {
			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes);
			freemsg(ipsec_mp);
			return;
		}
		src = ire->ire_src_addr;
	}

	if (ire != NULL)
		ire_refrele(ire);

	/*
	 * Check if we can send back more than 8 bytes in addition to
	 * the IP header.  We try to send 64 bytes of data and the internal
	 * header in the special cases of ipv4 encapsulated ipv4 or ipv6.
	 */
	len_needed = IPH_HDR_LENGTH(ipha);
	if (ipha->ipha_protocol == IPPROTO_ENCAP ||
	    ipha->ipha_protocol == IPPROTO_IPV6) {

		/* Make the whole packet contiguous; ipha may move. */
		if (!pullupmsg(mp, -1)) {
			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
			freemsg(ipsec_mp);
			return;
		}
		ipha = (ipha_t *)mp->b_rptr;

		/*
		 * NOTE(review): the inner header is read at
		 * ipha + len_needed without first checking that the message
		 * extends that far -- confirm callers guarantee it.
		 */
		if (ipha->ipha_protocol == IPPROTO_ENCAP) {
			len_needed += IPH_HDR_LENGTH(((uchar_t *)ipha +
			    len_needed));
		} else {
			ip6_t *ip6h = (ip6_t *)((uchar_t *)ipha + len_needed);

			ASSERT(ipha->ipha_protocol == IPPROTO_IPV6);
			len_needed += ip_hdr_length_v6(mp, ip6h);
		}
	}
	len_needed += ipst->ips_ip_icmp_return;
	msg_len = msgdsize(mp);
	if (msg_len > len_needed) {
		/* A negative adjmsg() length trims from the tail. */
		(void) adjmsg(mp, len_needed - msg_len);
		msg_len = len_needed;
	}
	/* New leading mblk for the outer IP header plus the ICMP header. */
	mp1 = allocb_tmpl(sizeof (icmp_ipha) + len, mp);
	if (mp1 == NULL) {
		BUMP_MIB(&ipst->ips_icmp_mib, icmpOutErrors);
		freemsg(ipsec_mp);
		return;
	}
	mp1->b_cont = mp;
	mp = mp1;
	ASSERT(ipsec_mp->b_datap->db_type == M_CTL &&
	    ipsec_mp->b_rptr == (uint8_t *)io &&
	    io->ipsec_out_type == IPSEC_OUT);
	ipsec_mp->b_cont = mp;

	/*
	 * Set ipsec_out_icmp_loopback so we can let the ICMP messages this
	 * node generates be accepted in peace by all on-host destinations.
	 * If we do NOT assume that all on-host destinations trust
	 * self-generated ICMP messages, then rework here, ip6.c, and spd.c.
	 * (Look for ipsec_out_icmp_loopback).
	 */
	io->ipsec_out_icmp_loopback = B_TRUE;

	ipha = (ipha_t *)mp->b_rptr;
	mp1->b_wptr = (uchar_t *)ipha + (sizeof (icmp_ipha) + len);
	/* Start from the icmp_ipha template header, then fill in. */
	*ipha = icmp_ipha;
	ipha->ipha_src = src;
	ipha->ipha_dst = dst;
	ipha->ipha_ttl = ipst->ips_ip_def_ttl;
	msg_len += sizeof (icmp_ipha) + len;
	if (msg_len > IP_MAXPACKET) {
		(void) adjmsg(mp, IP_MAXPACKET - msg_len);
		msg_len = IP_MAXPACKET;
	}
	ipha->ipha_length = htons((uint16_t)msg_len);
	icmph = (icmph_t *)&ipha[1];
	bcopy(stuff, icmph, len);
	/* The checksum field is zeroed before the checksum is computed. */
	icmph->icmph_checksum = 0;
	icmph->icmph_checksum = IP_CSUM(mp, (int32_t)sizeof (ipha_t), 0);
	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutMsgs);
	put(q, ipsec_mp);
}

/*
 * Determine if an ICMP error packet can be sent given the rate limit.
 * The limit consists of an average frequency (ips_ip_icmp_err_interval,
 * measured in milliseconds) and a burst size (ips_ip_icmp_err_burst).
 * Burst size number of packets can be sent arbitrarily closely spaced.
 * The state is tracked using two variables to implement an approximate
 * token bucket filter:
 *	ips_icmp_pkt_err_last - time in ms when the last burst started
 *	ips_icmp_pkt_err_sent - number of packets sent in current burst
 *
 * Returns B_TRUE if the packet must be dropped (limit exceeded) and
 * B_FALSE if it may be sent.  An interval of zero disables rate limiting.
 * No locking is done here around the per-stack state.
 */
boolean_t
icmp_err_rate_limit(ip_stack_t *ipst)
{
	clock_t now = TICK_TO_MSEC(lbolt);
	uint_t refilled; /* Number of packets refilled in tbf since last */
	/* Guard against changes by loading into local variable */
	uint_t err_interval = ipst->ips_ip_icmp_err_interval;

	if (err_interval == 0)
		return (B_FALSE);

	if (ipst->ips_icmp_pkt_err_last > now) {
		/* 100HZ lbolt in ms for 32bit arch wraps every 49.7 days */
		ipst->ips_icmp_pkt_err_last = 0;
		ipst->ips_icmp_pkt_err_sent = 0;
	}
	/*
	 * If we are in a burst update the token bucket filter.
	 * Update the "last" time to be close to "now" but make sure
	 * we don't lose precision.
	 */
	if (ipst->ips_icmp_pkt_err_sent != 0) {
		/* One token is refilled per elapsed err_interval. */
		refilled = (now - ipst->ips_icmp_pkt_err_last)/err_interval;
		if (refilled > ipst->ips_icmp_pkt_err_sent) {
			ipst->ips_icmp_pkt_err_sent = 0;
		} else {
			ipst->ips_icmp_pkt_err_sent -= refilled;
			ipst->ips_icmp_pkt_err_last += refilled * err_interval;
		}
	}
	if (ipst->ips_icmp_pkt_err_sent == 0) {
		/* Start of new burst */
		ipst->ips_icmp_pkt_err_last = now;
	}
	if (ipst->ips_icmp_pkt_err_sent < ipst->ips_ip_icmp_err_burst) {
		ipst->ips_icmp_pkt_err_sent++;
		ip1dbg(("icmp_err_rate_limit: %d sent in burst\n",
		    ipst->ips_icmp_pkt_err_sent));
		return (B_FALSE);
	}
	ip1dbg(("icmp_err_rate_limit: dropped\n"));
	return (B_TRUE);
}

/*
 * Check if it is ok to send an IPv4 ICMP error packet in
 * response to the IPv4 packet in mp.
 * Free the message and return null if no
 * ICMP error packet should be sent.
 *
 * No error is sent when the packet has a bad header checksum, when
 * either of its addresses is a broadcast or class D address, when it is a
 * fragment with a non-zero offset, when it is itself an ICMP error, when a
 * labeled system may not reply to it, or when the rate limit is exceeded.
 * Otherwise mp is returned; if a pullup was needed its data may have
 * moved.  A NULL mp is accepted and yields NULL.
 */
static mblk_t *
icmp_pkt_err_ok(mblk_t *mp, ip_stack_t *ipst)
{
	icmph_t	*icmph;
	ipha_t	*ipha;
	uint_t	len_needed;
	ire_t	*src_ire;
	ire_t	*dst_ire;

	if (!mp)
		return (NULL);
	ipha = (ipha_t *)mp->b_rptr;
	if (ip_csum_hdr(ipha)) {
		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInCksumErrs);
		freemsg(mp);
		return (NULL);
	}
	/*
	 * Note that src_ire is the lookup on the packet's destination and
	 * dst_ire the lookup on its source; either being a broadcast route
	 * suppresses the error.
	 */
	src_ire = ire_ctable_lookup(ipha->ipha_dst, 0, IRE_BROADCAST,
	    NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
	dst_ire = ire_ctable_lookup(ipha->ipha_src, 0, IRE_BROADCAST,
	    NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
	if (src_ire != NULL || dst_ire != NULL ||
	    CLASSD(ipha->ipha_dst) ||
	    CLASSD(ipha->ipha_src) ||
	    (ntohs(ipha->ipha_fragment_offset_and_flags) & IPH_OFFSET)) {
		/* Note: only errors to the fragment with offset 0 */
		BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
		freemsg(mp);
		if (src_ire != NULL)
			ire_refrele(src_ire);
		if (dst_ire != NULL)
			ire_refrele(dst_ire);
		return (NULL);
	}
	if (ipha->ipha_protocol == IPPROTO_ICMP) {
		/*
		 * Check the ICMP type.  RFC 1122 sez:  don't send ICMP
		 * errors in response to any ICMP errors.
		 */
		len_needed = IPH_HDR_LENGTH(ipha) + ICMPH_SIZE;
		if (mp->b_wptr - mp->b_rptr < len_needed) {
			if (!pullupmsg(mp, len_needed)) {
				BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors);
				freemsg(mp);
				return (NULL);
			}
			/* The pullup may have moved the data. */
			ipha = (ipha_t *)mp->b_rptr;
		}
		icmph = (icmph_t *)
		    (&((char *)ipha)[IPH_HDR_LENGTH(ipha)]);
		switch (icmph->icmph_type) {
		case ICMP_DEST_UNREACHABLE:
		case ICMP_SOURCE_QUENCH:
		case ICMP_TIME_EXCEEDED:
		case ICMP_PARAM_PROBLEM:
		case ICMP_REDIRECT:
			BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
			freemsg(mp);
			return (NULL);
		default:
			break;
		}
	}
	/*
	 * If this is a labeled system, then check to see if we're allowed to
	 * send a response to this particular sender.  If not, then just drop.
	 */
	if (is_system_labeled() && !tsol_can_reply_error(mp)) {
		ip2dbg(("icmp_pkt_err_ok: can't respond to packet\n"));
		BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
		freemsg(mp);
		return (NULL);
	}
	if (icmp_err_rate_limit(ipst)) {
		/*
		 * Only send ICMP error packets every so often.
		 * This should be done on a per port/source basis,
		 * but for now this will suffice.
		 */
		freemsg(mp);
		return (NULL);
	}
	return (mp);
}

/*
 * Generate an ICMP redirect message.
 *
 * q:       queue the redirect is sent on (see icmp_pkt).
 * mp:      packet that triggered the redirect; must be a plain M_DATA
 *          message.  It is consumed either by icmp_pkt_err_ok() (when no
 *          error may be sent) or by icmp_pkt().
 * gateway: address stored in the icmph_rd_gateway field.
 *
 * The redirect is always sent with code 1 and on behalf of the global
 * zone.
 */
static void
icmp_send_redirect(queue_t *q, mblk_t *mp, ipaddr_t gateway, ip_stack_t *ipst)
{
	icmph_t	icmph;

	/*
	 * We are called from ip_rput where we could
	 * not have attached an IPSEC_IN.
	 */
	ASSERT(mp->b_datap->db_type == M_DATA);

	if (!(mp = icmp_pkt_err_ok(mp, ipst))) {
		return;
	}

	bzero(&icmph, sizeof (icmph_t));
	icmph.icmph_type = ICMP_REDIRECT;
	icmph.icmph_code = 1;
	icmph.icmph_rd_gateway = gateway;
	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutRedirects);
	/* Redirects sent by router, and router is global zone */
	icmp_pkt(q, mp, &icmph, sizeof (icmph_t), B_FALSE, GLOBAL_ZONEID, ipst);
}

/*
 * Generate an ICMP time exceeded message.
*/ void icmp_time_exceeded(queue_t *q, mblk_t *mp, uint8_t code, zoneid_t zoneid, ip_stack_t *ipst) { icmph_t icmph; boolean_t mctl_present; mblk_t *first_mp; EXTRACT_PKT_MP(mp, first_mp, mctl_present); if (!(mp = icmp_pkt_err_ok(mp, ipst))) { if (mctl_present) freeb(first_mp); return; } bzero(&icmph, sizeof (icmph_t)); icmph.icmph_type = ICMP_TIME_EXCEEDED; icmph.icmph_code = code; BUMP_MIB(&ipst->ips_icmp_mib, icmpOutTimeExcds); icmp_pkt(q, first_mp, &icmph, sizeof (icmph_t), mctl_present, zoneid, ipst); } /* * Generate an ICMP unreachable message. */ void icmp_unreachable(queue_t *q, mblk_t *mp, uint8_t code, zoneid_t zoneid, ip_stack_t *ipst) { icmph_t icmph; mblk_t *first_mp; boolean_t mctl_present; EXTRACT_PKT_MP(mp, first_mp, mctl_present); if (!(mp = icmp_pkt_err_ok(mp, ipst))) { if (mctl_present) freeb(first_mp); return; } bzero(&icmph, sizeof (icmph_t)); icmph.icmph_type = ICMP_DEST_UNREACHABLE; icmph.icmph_code = code; BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDestUnreachs); ip2dbg(("send icmp destination unreachable code %d\n", code)); icmp_pkt(q, first_mp, (char *)&icmph, sizeof (icmph_t), mctl_present, zoneid, ipst); } /* * Attempt to start recovery of an IPv4 interface that's been shut down as a * duplicate. As long as someone else holds the address, the interface will * stay down. When that conflict goes away, the interface is brought back up. * This is done so that accidental shutdowns of addresses aren't made * permanent. Your server will recover from a failure. * * For DHCP, recovery is not done in the kernel. Instead, it's handled by a * user space process (dhcpagent). * * Recovery completes if ARP reports that the address is now ours (via * AR_CN_READY). In that case, we go to ip_arp_excl to finish the operation. * * This function is entered on a timer expiry; the ID is in ipif_recovery_id. 
*/ static void ipif_dup_recovery(void *arg) { ipif_t *ipif = arg; ill_t *ill = ipif->ipif_ill; mblk_t *arp_add_mp; mblk_t *arp_del_mp; area_t *area; ip_stack_t *ipst = ill->ill_ipst; ipif->ipif_recovery_id = 0; /* * No lock needed for moving or condemned check, as this is just an * optimization. */ if (ill->ill_arp_closing || !(ipif->ipif_flags & IPIF_DUPLICATE) || (ipif->ipif_flags & IPIF_POINTOPOINT) || (ipif->ipif_state_flags & (IPIF_MOVING | IPIF_CONDEMNED))) { /* No reason to try to bring this address back. */ return; } if ((arp_add_mp = ipif_area_alloc(ipif)) == NULL) goto alloc_fail; if (ipif->ipif_arp_del_mp == NULL) { if ((arp_del_mp = ipif_ared_alloc(ipif)) == NULL) goto alloc_fail; ipif->ipif_arp_del_mp = arp_del_mp; } /* Setting the 'unverified' flag restarts DAD */ area = (area_t *)arp_add_mp->b_rptr; area->area_flags = ACE_F_PERMANENT | ACE_F_PUBLISH | ACE_F_MYADDR | ACE_F_UNVERIFIED; putnext(ill->ill_rq, arp_add_mp); return; alloc_fail: /* * On allocation failure, just restart the timer. Note that the ipif * is down here, so no other thread could be trying to start a recovery * timer. The ill_lock protects the condemned flag and the recovery * timer ID. */ freemsg(arp_add_mp); mutex_enter(&ill->ill_lock); if (ipst->ips_ip_dup_recovery > 0 && ipif->ipif_recovery_id == 0 && !(ipif->ipif_state_flags & IPIF_CONDEMNED)) { ipif->ipif_recovery_id = timeout(ipif_dup_recovery, ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery)); } mutex_exit(&ill->ill_lock); } /* * This is for exclusive changes due to ARP. Either tear down an interface due * to AR_CN_FAILED and AR_CN_BOGON, or bring one up for successful recovery. 
*/ /* ARGSUSED */ static void ip_arp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) { ill_t *ill = rq->q_ptr; arh_t *arh; ipaddr_t src; ipif_t *ipif; char ibuf[LIFNAMSIZ + 10]; /* 10 digits for logical i/f number */ char hbuf[MAC_STR_LEN]; char sbuf[INET_ADDRSTRLEN]; const char *failtype; boolean_t bring_up; ip_stack_t *ipst = ill->ill_ipst; switch (((arcn_t *)mp->b_rptr)->arcn_code) { case AR_CN_READY: failtype = NULL; bring_up = B_TRUE; break; case AR_CN_FAILED: failtype = "in use"; bring_up = B_FALSE; break; default: failtype = "claimed"; bring_up = B_FALSE; break; } arh = (arh_t *)mp->b_cont->b_rptr; bcopy((char *)&arh[1] + arh->arh_hlen, &src, IP_ADDR_LEN); (void) mac_colon_addr((uint8_t *)(arh + 1), arh->arh_hlen, hbuf, sizeof (hbuf)); (void) ip_dot_addr(src, sbuf); for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { if ((ipif->ipif_flags & IPIF_POINTOPOINT) || ipif->ipif_lcl_addr != src) { continue; } /* * If we failed on a recovery probe, then restart the timer to * try again later. */ if (!bring_up && (ipif->ipif_flags & IPIF_DUPLICATE) && !(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) && ill->ill_net_type == IRE_IF_RESOLVER && !(ipif->ipif_state_flags & IPIF_CONDEMNED) && ipst->ips_ip_dup_recovery > 0 && ipif->ipif_recovery_id == 0) { ipif->ipif_recovery_id = timeout(ipif_dup_recovery, ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery)); continue; } /* * If what we're trying to do has already been done, then do * nothing. */ if (bring_up == ((ipif->ipif_flags & IPIF_UP) != 0)) continue; ipif_get_name(ipif, ibuf, sizeof (ibuf)); if (failtype == NULL) { cmn_err(CE_NOTE, "recovered address %s on %s", sbuf, ibuf); } else { cmn_err(CE_WARN, "%s has duplicate address %s (%s " "by %s); disabled", ibuf, sbuf, failtype, hbuf); } if (bring_up) { ASSERT(ill->ill_dl_up); /* * Free up the ARP delete message so we can allocate * a fresh one through the normal path. 
*/ freemsg(ipif->ipif_arp_del_mp); ipif->ipif_arp_del_mp = NULL; if (ipif_resolver_up(ipif, Res_act_initial) != EINPROGRESS) { ipif->ipif_addr_ready = 1; (void) ipif_up_done(ipif); } continue; } mutex_enter(&ill->ill_lock); ASSE