Home | History | Annotate | Download | only in ip
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 /* Copyright (c) 1990 Mentat Inc. */
     26 
     27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
     28 
     29 /*
     30  * This file contains the interface control functions for IP.
     31  */
     32 
     33 #include <sys/types.h>
     34 #include <sys/stream.h>
     35 #include <sys/dlpi.h>
     36 #include <sys/stropts.h>
     37 #include <sys/strsun.h>
     38 #include <sys/sysmacros.h>
     39 #include <sys/strlog.h>
     40 #include <sys/ddi.h>
     41 #include <sys/sunddi.h>
     42 #include <sys/cmn_err.h>
     43 #include <sys/kstat.h>
     44 #include <sys/debug.h>
     45 #include <sys/zone.h>
     46 #include <sys/sunldi.h>
     47 #include <sys/file.h>
     48 #include <sys/bitmap.h>
     49 
     50 #include <sys/kmem.h>
     51 #include <sys/systm.h>
     52 #include <sys/param.h>
     53 #include <sys/socket.h>
     54 #include <sys/isa_defs.h>
     55 #include <net/if.h>
     56 #include <net/if_arp.h>
     57 #include <net/if_types.h>
     58 #include <net/if_dl.h>
     59 #include <net/route.h>
     60 #include <sys/sockio.h>
     61 #include <netinet/in.h>
     62 #include <netinet/ip6.h>
     63 #include <netinet/icmp6.h>
     64 #include <netinet/igmp_var.h>
     65 #include <sys/strsun.h>
     66 #include <sys/policy.h>
     67 #include <sys/ethernet.h>
     68 
     69 #include <inet/common.h>   /* for various inet/mi.h and inet/nd.h needs */
     70 #include <inet/mi.h>
     71 #include <inet/nd.h>
     72 #include <inet/arp.h>
     73 #include <inet/mib2.h>
     74 #include <inet/ip.h>
     75 #include <inet/ip6.h>
     76 #include <inet/ip6_asp.h>
     77 #include <inet/tcp.h>
     78 #include <inet/ip_multi.h>
     79 #include <inet/ip_ire.h>
     80 #include <inet/ip_ftable.h>
     81 #include <inet/ip_rts.h>
     82 #include <inet/ip_ndp.h>
     83 #include <inet/ip_if.h>
     84 #include <inet/ip_impl.h>
     85 #include <inet/tun.h>
     86 #include <inet/sctp_ip.h>
     87 #include <inet/ip_netinfo.h>
     88 #include <inet/mib2.h>
     89 
     90 #include <net/pfkeyv2.h>
     91 #include <inet/ipsec_info.h>
     92 #include <inet/sadb.h>
     93 #include <inet/ipsec_impl.h>
     94 #include <sys/iphada.h>
     95 
     96 
     97 #include <netinet/igmp.h>
     98 #include <inet/ip_listutils.h>
     99 #include <inet/ipclassifier.h>
    100 #include <sys/mac.h>
    101 
    102 #include <sys/systeminfo.h>
    103 #include <sys/bootconf.h>
    104 
    105 #include <sys/tsol/tndb.h>
    106 #include <sys/tsol/tnet.h>
    107 
    108 /* The character which tells where the ill_name ends */
    109 #define	IPIF_SEPARATOR_CHAR	':'
    110 
/*
 * IP ioctl function table entry.
 *
 * One row of a command dispatch table (see ip_ioctl_ftbl in this file):
 *	ipft_cmd	ioctl command code this row handles
 *	ipft_pfi	handler function for the command
 *	ipft_min_size	size associated with the command; the table fills it
 *			with sizeof of the request structure, so presumably
 *			the minimum acceptable payload size -- confirm at
 *			the lookup site
 *	ipft_flags	zero or more IPFT_F_* flags below
 */
typedef struct ipft_s {
	int	ipft_cmd;
	pfi_t	ipft_pfi;
	int	ipft_min_size;
	int	ipft_flags;
} ipft_t;
#define	IPFT_F_NO_REPLY		0x1	/* IP ioctl does not expect any reply */
#define	IPFT_F_SELF_REPLY	0x2	/* ioctl callee does the ioctl reply */
    120 
/*
 * Header big enough for any one of the ARP entry commands (add, delete,
 * query) followed by a queue pointer.  The AR_ENTRY_ADD templates in this
 * file use sizeof (ip_sock_ar_t) as the offset of the first address field,
 * so the address data always starts after the largest of the three command
 * layouts.
 */
typedef struct ip_sock_ar_s {
	union {
		area_t	ip_sock_area;
		ared_t	ip_sock_ared;
		areq_t	ip_sock_areq;
	} ip_sock_ar_u;
	queue_t	*ip_sock_ar_q;
} ip_sock_ar_t;
    129 
    130 static int	nd_ill_forward_get(queue_t *, mblk_t *, caddr_t, cred_t *);
    131 static int	nd_ill_forward_set(queue_t *q, mblk_t *mp,
    132 		    char *value, caddr_t cp, cred_t *ioc_cr);
    133 
    134 static boolean_t ill_is_quiescent(ill_t *);
    135 static boolean_t ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask);
    136 static ip_m_t	*ip_m_lookup(t_uscalar_t mac_type);
    137 static int	ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    138     mblk_t *mp, boolean_t need_up);
    139 static int	ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    140     mblk_t *mp, boolean_t need_up);
    141 static int	ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid,
    142     queue_t *q, mblk_t *mp, boolean_t need_up);
    143 static int	ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q,
    144     mblk_t *mp, boolean_t need_up);
    145 static int	ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    146     mblk_t *mp);
    147 static int	ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t, in6_addr_t,
    148     queue_t *q, mblk_t *mp, boolean_t need_up);
    149 static int	ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp,
    150     int ioccmd, struct linkblk *li, boolean_t doconsist);
    151 static ipaddr_t	ip_subnet_mask(ipaddr_t addr, ipif_t **, ip_stack_t *);
    152 static void	ip_wput_ioctl(queue_t *q, mblk_t *mp);
    153 static void	ipsq_flush(ill_t *ill);
    154 
    155 static	int	ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen,
    156     queue_t *q, mblk_t *mp, boolean_t need_up);
    157 static void	ipsq_delete(ipsq_t *);
    158 
    159 static ipif_t	*ipif_allocate(ill_t *ill, int id, uint_t ire_type,
    160 		    boolean_t initialize);
    161 static void	ipif_check_bcast_ires(ipif_t *test_ipif);
    162 static ire_t	**ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep);
    163 static boolean_t ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif,
    164 		    boolean_t isv6);
    165 static void	ipif_down_delete_ire(ire_t *ire, char *ipif);
    166 static void	ipif_delete_cache_ire(ire_t *, char *);
    167 static int	ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp);
    168 static void	ipif_free(ipif_t *ipif);
    169 static void	ipif_free_tail(ipif_t *ipif);
    170 static void	ipif_mtu_change(ire_t *ire, char *ipif_arg);
    171 static void	ipif_multicast_down(ipif_t *ipif);
    172 static void	ipif_recreate_interface_routes(ipif_t *old_ipif, ipif_t *ipif);
    173 static void	ipif_set_default(ipif_t *ipif);
    174 static int	ipif_set_values(queue_t *q, mblk_t *mp,
    175     char *interf_name, uint_t *ppa);
    176 static int	ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp,
    177     queue_t *q);
    178 static ipif_t	*ipif_lookup_on_name(char *name, size_t namelen,
    179     boolean_t do_alloc, boolean_t *exists, boolean_t isv6, zoneid_t zoneid,
    180     queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *);
    181 static int	ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp);
    182 static void	ipif_update_other_ipifs(ipif_t *old_ipif, ill_group_t *illgrp);
    183 
    184 static int	ill_alloc_ppa(ill_if_t *, ill_t *);
    185 static int	ill_arp_off(ill_t *ill);
    186 static int	ill_arp_on(ill_t *ill);
    187 static void	ill_delete_interface_type(ill_if_t *);
    188 static int	ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q);
    189 static void	ill_dl_down(ill_t *ill);
    190 static void	ill_down(ill_t *ill);
    191 static void	ill_downi(ire_t *ire, char *ill_arg);
    192 static void	ill_free_mib(ill_t *ill);
    193 static void	ill_glist_delete(ill_t *);
    194 static boolean_t ill_has_usable_ipif(ill_t *);
    195 static int	ill_lock_ipsq_ills(ipsq_t *sq, ill_t **list, int);
    196 static void	ill_nominate_bcast_rcv(ill_group_t *illgrp);
    197 static void	ill_phyint_free(ill_t *ill);
    198 static void	ill_phyint_reinit(ill_t *ill);
    199 static void	ill_set_nce_router_flags(ill_t *, boolean_t);
    200 static void	ill_set_phys_addr_tail(ipsq_t *, queue_t *, mblk_t *, void *);
    201 static void	ill_signal_ipsq_ills(ipsq_t *, boolean_t);
    202 static boolean_t ill_split_ipsq(ipsq_t *cur_sq);
    203 static void	ill_stq_cache_delete(ire_t *, char *);
    204 
    205 static boolean_t ip_ether_v6intfid(uint_t, uint8_t *, in6_addr_t *);
    206 static boolean_t ip_nodef_v6intfid(uint_t, uint8_t *, in6_addr_t *);
    207 static boolean_t ip_ether_v6mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *,
    208     in6_addr_t *);
    209 static boolean_t ip_ether_v4mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *,
    210     ipaddr_t *);
    211 static boolean_t ip_ib_v6intfid(uint_t, uint8_t *, in6_addr_t *);
    212 static boolean_t ip_ib_v6mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *,
    213     in6_addr_t *);
    214 static boolean_t ip_ib_v4mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *,
    215     ipaddr_t *);
    216 
    217 static void	ipif_save_ire(ipif_t *, ire_t *);
    218 static void	ipif_remove_ire(ipif_t *, ire_t *);
    219 static void 	ip_cgtp_bcast_add(ire_t *, ire_t *, ip_stack_t *);
    220 static void 	ip_cgtp_bcast_delete(ire_t *, ip_stack_t *);
    221 
    222 /*
    223  * Per-ill IPsec capabilities management.
    224  */
    225 static ill_ipsec_capab_t *ill_ipsec_capab_alloc(void);
    226 static void	ill_ipsec_capab_free(ill_ipsec_capab_t *);
    227 static void	ill_ipsec_capab_add(ill_t *, uint_t, boolean_t);
    228 static void	ill_ipsec_capab_delete(ill_t *, uint_t);
    229 static boolean_t ill_ipsec_capab_resize_algparm(ill_ipsec_capab_t *, int);
    230 static void ill_capability_proto(ill_t *, int, mblk_t *);
    231 static void ill_capability_dispatch(ill_t *, mblk_t *, dl_capability_sub_t *,
    232     boolean_t);
    233 static void ill_capability_id_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
    234 static void ill_capability_mdt_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
    235 static void ill_capability_mdt_reset(ill_t *, mblk_t **);
    236 static void ill_capability_ipsec_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
    237 static void ill_capability_ipsec_reset(ill_t *, mblk_t **);
    238 static void ill_capability_hcksum_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
    239 static void ill_capability_hcksum_reset(ill_t *, mblk_t **);
    240 static void ill_capability_zerocopy_ack(ill_t *, mblk_t *,
    241     dl_capability_sub_t *);
    242 static void ill_capability_zerocopy_reset(ill_t *, mblk_t **);
    243 static void ill_capability_lso_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
    244 static void ill_capability_lso_reset(ill_t *, mblk_t **);
    245 static void ill_capability_dls_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
    246 static mac_resource_handle_t ill_ring_add(void *, mac_resource_t *);
    247 static void	ill_capability_dls_reset(ill_t *, mblk_t **);
    248 static void	ill_capability_dls_disable(ill_t *);
    249 
    250 static void	illgrp_cache_delete(ire_t *, char *);
    251 static void	illgrp_delete(ill_t *ill);
    252 static void	illgrp_reset_schednext(ill_t *ill);
    253 
    254 static ill_t	*ill_prev_usesrc(ill_t *);
    255 static int	ill_relink_usesrc_ills(ill_t *, ill_t *, uint_t);
    256 static void	ill_disband_usesrc_group(ill_t *);
    257 
    258 static void	conn_cleanup_stale_ire(conn_t *, caddr_t);
    259 
    260 #ifdef DEBUG
    261 static	void	ill_trace_cleanup(const ill_t *);
    262 static	void	ipif_trace_cleanup(const ipif_t *);
    263 #endif
    264 
/*
 * If we go over the memory footprint limit more than once within this
 * interval (in milliseconds), we'll start pruning aggressively.
 * Initialized to 0 here.
 */
int ip_min_frag_prune_time = 0;
    270 
    271 /*
    272  * max # of IPsec algorithms supported.  Limited to 1 byte by PF_KEY
    273  * and the IPsec DOI
    274  */
    275 #define	MAX_IPSEC_ALGS	256
    276 
    277 #define	BITSPERBYTE	8
    278 #define	BITS(type)	(BITSPERBYTE * (long)sizeof (type))
    279 
    280 #define	IPSEC_ALG_ENABLE(algs, algid) \
    281 		((algs)[(algid) / BITS(ipsec_capab_elem_t)] |= \
    282 		(1 << ((algid) % BITS(ipsec_capab_elem_t))))
    283 
    284 #define	IPSEC_ALG_IS_ENABLED(algid, algs) \
    285 		((algs)[(algid) / BITS(ipsec_capab_elem_t)] & \
    286 		(1 << ((algid) % BITS(ipsec_capab_elem_t))))
    287 
    288 typedef uint8_t ipsec_capab_elem_t;
    289 
/*
 * Per-algorithm parameters.  Note that at present, only encryption
 * algorithms have variable keysize (IKE does not provide a way to negotiate
 * auth algorithm keysize).
 *
 * All sizes here are in bits.
 */
typedef struct
{
	uint16_t	minkeylen;	/* smallest key length, in bits */
	uint16_t	maxkeylen;	/* largest key length, in bits */
} ipsec_capab_algparm_t;
    302 
/*
 * Per-ill capabilities.
 *
 * Holds two arrays of ipsec_capab_elem_t (one for encryption, one for
 * authentication) plus an array of per-algorithm key length parameters
 * for encryption.  The _hw_algs arrays are presumably bitmaps accessed
 * through IPSEC_ALG_ENABLE / IPSEC_ALG_IS_ENABLED -- confirm at the users.
 */
struct ill_ipsec_capab_s {
	ipsec_capab_elem_t *encr_hw_algs;
	ipsec_capab_elem_t *auth_hw_algs;
	uint32_t algs_size;	/* size of _hw_algs in bytes */
	/* algorithm key lengths */
	ipsec_capab_algparm_t *encr_algparm;
	/* NOTE(review): units of the next two fields not visible here */
	uint32_t encr_algparm_size;
	uint32_t encr_algparm_end;
};
    315 
/*
 * IPv4 AR_ENTRY_ADD template, consumed by ill_arp_alloc().
 *
 * The field values are larger than strictly necessary for simple
 * AR_ENTRY_ADDs but the padding lets us accommodate the socket ioctls.
 *
 * ill_arp_alloc() copies only area_name_length (i.e. sizeof (area_t)) bytes
 * of this template, zero-fills the rest of the message up to
 * area_name_offset plus the interface name length, and then overwrites
 * area_name_length in the copy with the real name length.
 */
static area_t	ip_area_template = {
	AR_ENTRY_ADD,			/* area_cmd */
	sizeof (ip_sock_ar_t) + (IP_ADDR_LEN*2) + sizeof (struct sockaddr_dl),
					/* area_name_offset */
	/* area_name_length temporarily holds this structure length */
	sizeof (area_t),			/* area_name_length */
	IP_ARP_PROTO_TYPE,		/* area_proto */
	sizeof (ip_sock_ar_t),		/* area_proto_addr_offset */
	IP_ADDR_LEN,			/* area_proto_addr_length */
	sizeof (ip_sock_ar_t) + IP_ADDR_LEN,
					/* area_proto_mask_offset */
	0,				/* area_flags */
	sizeof (ip_sock_ar_t) + IP_ADDR_LEN + IP_ADDR_LEN,
					/* area_hw_addr_offset */
	/* Zero length hw_addr_length means 'use your idea of the address' */
	0				/* area_hw_addr_length */
};
    337 
/*
 * AR_ENTRY_ADD/DELETE templates have been added for IPv6 external resolver
 * support.
 *
 * This is the IPv6 AR_ENTRY_ADD template.  It has the same shape as
 * ip_area_template, with IPV6_ADDR_LEN-sized address and mask fields and
 * room for a sin6_t ahead of the interface name.
 */
static area_t	ip6_area_template = {
	AR_ENTRY_ADD,			/* area_cmd */
	sizeof (ip_sock_ar_t) + (IPV6_ADDR_LEN*2) + sizeof (sin6_t),
					/* area_name_offset */
	/* area_name_length temporarily holds this structure length */
	sizeof (area_t),			/* area_name_length */
	IP_ARP_PROTO_TYPE,		/* area_proto */
	sizeof (ip_sock_ar_t),		/* area_proto_addr_offset */
	IPV6_ADDR_LEN,			/* area_proto_addr_length */
	sizeof (ip_sock_ar_t) + IPV6_ADDR_LEN,
					/* area_proto_mask_offset */
	0,				/* area_flags */
	sizeof (ip_sock_ar_t) + IPV6_ADDR_LEN + IPV6_ADDR_LEN,
					/* area_hw_addr_offset */
	/* Zero length hw_addr_length means 'use your idea of the address' */
	0				/* area_hw_addr_length */
};
    359 
/*
 * IPv4 AR_ENTRY_DELETE template, passed to ill_arp_alloc() by
 * ipif_ared_alloc() and ill_ared_alloc().  ill_arp_alloc() reads it through
 * the arc_t and area_t layouts, so the leading fields are expected to line
 * up with those structures.  The per-field labels below are presumed by
 * analogy with the area_t templates -- confirm against the ared_t
 * declaration.
 */
static ared_t	ip_ared_template = {
	AR_ENTRY_DELETE,		/* command */
	sizeof (ared_t) + IP_ADDR_LEN,	/* presumably name offset */
	sizeof (ared_t),		/* presumably name length (template length) */
	IP_ARP_PROTO_TYPE,		/* presumably protocol */
	sizeof (ared_t),		/* presumably proto addr offset */
	IP_ADDR_LEN			/* presumably proto addr length */
};
    368 
/*
 * IPv6 AR_ENTRY_DELETE template; same shape as ip_ared_template with an
 * IPV6_ADDR_LEN-sized address.  The per-field labels below are presumed by
 * analogy with the area_t templates -- confirm against the ared_t
 * declaration.
 */
static ared_t	ip6_ared_template = {
	AR_ENTRY_DELETE,		/* command */
	sizeof (ared_t) + IPV6_ADDR_LEN, /* presumably name offset */
	sizeof (ared_t),		/* presumably name length (template length) */
	IP_ARP_PROTO_TYPE,		/* presumably protocol */
	sizeof (ared_t),		/* presumably proto addr offset */
	IPV6_ADDR_LEN			/* presumably proto addr length */
};
    377 
/*
 * IPv4 AR_ENTRY_QUERY template.
 *
 * An IPv6 counterpart of this template has not been created, as the areq
 * doesn't include an IP address in ill_dl_up() (the only place an areq is
 * used).
 */
static areq_t	ip_areq_template = {
	AR_ENTRY_QUERY,			/* cmd */
	sizeof (areq_t)+(2*IP_ADDR_LEN),	/* name offset */
	sizeof (areq_t),	/* name len (filled by ill_arp_alloc) */
	IP_ARP_PROTO_TYPE,		/* protocol, from arps perspective */
	sizeof (areq_t),			/* target addr offset */
	IP_ADDR_LEN,			/* target addr_length */
	0,				/* flags */
	sizeof (areq_t) + IP_ADDR_LEN,	/* sender addr offset */
	IP_ADDR_LEN,			/* sender addr length */
	AR_EQ_DEFAULT_XMIT_COUNT,	/* xmit_count */
	AR_EQ_DEFAULT_XMIT_INTERVAL,	/* (re)xmit_interval in milliseconds */
	AR_EQ_DEFAULT_MAX_BUFFERED	/* max # of requests to buffer */
	/* anything else filled in by the code */
};
    398 
/*
 * Interface control command templates (up, down, on, off).  Each one is
 * just the command code followed by the name offset and name length; the
 * name length slot initially carries the template size and is replaced by
 * ill_arp_alloc() with the real interface name length.
 */

/* AR_INTERFACE_UP */
static arc_t	ip_aru_template = {
	AR_INTERFACE_UP,
	sizeof (arc_t),		/* Name offset */
	sizeof (arc_t)		/* Name length (set by ill_arp_alloc) */
};

/* AR_INTERFACE_DOWN */
static arc_t	ip_ard_template = {
	AR_INTERFACE_DOWN,
	sizeof (arc_t),		/* Name offset */
	sizeof (arc_t)		/* Name length (set by ill_arp_alloc) */
};

/* AR_INTERFACE_ON */
static arc_t	ip_aron_template = {
	AR_INTERFACE_ON,
	sizeof (arc_t),		/* Name offset */
	sizeof (arc_t)		/* Name length (set by ill_arp_alloc) */
};

/* AR_INTERFACE_OFF */
static arc_t	ip_aroff_template = {
	AR_INTERFACE_OFF,
	sizeof (arc_t),		/* Name offset */
	sizeof (arc_t)		/* Name length (set by ill_arp_alloc) */
};
    422 
    423 
/*
 * AR_MAPPING_ADD template flagged ACE_F_PERMANENT | ACE_F_MAPPING.  The
 * variable part following the arma_t is laid out as: protocol address,
 * protocol mask, protocol extract mask (IP_ADDR_LEN bytes each), then a
 * hardware address of IP_MAX_HW_LEN bytes, then the interface name.
 * Judging by the name this is the multicast mapping -- confirm at the user.
 */
static arma_t	ip_arma_multi_template = {
	AR_MAPPING_ADD,
	sizeof (arma_t) + 3*IP_ADDR_LEN + IP_MAX_HW_LEN,
				/* Name offset */
	sizeof (arma_t),	/* Name length (set by ill_arp_alloc) */
	IP_ARP_PROTO_TYPE,
	sizeof (arma_t),			/* proto_addr_offset */
	IP_ADDR_LEN,				/* proto_addr_length */
	sizeof (arma_t) + IP_ADDR_LEN,		/* proto_mask_offset */
	sizeof (arma_t) + 2*IP_ADDR_LEN,	/* proto_extract_mask_offset */
	ACE_F_PERMANENT | ACE_F_MAPPING,	/* flags */
	sizeof (arma_t) + 3*IP_ADDR_LEN,	/* hw_addr_offset */
	IP_MAX_HW_LEN,				/* hw_addr_length */
	0,					/* hw_mapping_start */
};
    439 
/*
 * IP ioctl dispatch table.  Columns are: command, handler, size of the
 * request structure, IPFT_F_* flags.  The all-zero row marks the end of
 * the table.
 */
static ipft_t	ip_ioctl_ftbl[] = {
	{ IP_IOC_IRE_DELETE, ip_ire_delete, sizeof (ipid_t), 0 },
	{ IP_IOC_IRE_DELETE_NO_REPLY, ip_ire_delete, sizeof (ipid_t),
		IPFT_F_NO_REPLY },
	{ IP_IOC_IRE_ADVISE_NO_REPLY, ip_ire_advise, sizeof (ipic_t),
		IPFT_F_NO_REPLY },
	/* The routing socket handler sends its own reply. */
	{ IP_IOC_RTS_REQUEST, ip_rts_request, 0, IPFT_F_SELF_REPLY },
	{ 0 }
};
    449 
/*
 * Simple ICMP IP Header Template.
 *
 * Positional initializer: the first member is IP_SIMPLE_HDR_VERSION and the
 * seventh is IPPROTO_ICMP, with everything in between (and after) zero.
 * NOTE(review): member order comes from the ipha_t declaration, which is
 * not visible here -- presumably version/length first and protocol seventh.
 */
static ipha_t icmp_ipha = {
	IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP
};
    454 
/*
 * Flag descriptors for ip_ipif_report.
 *
 * Maps each flag bit to the name printed for it.  The table mixes flags
 * from three prefixes (IPIF_*, ILLF_* and PHYI_*) in one list.
 */
static nv_t	ipif_nv_tbl[] = {
	{ IPIF_UP,		"UP" },
	{ IPIF_BROADCAST,	"BROADCAST" },
	{ ILLF_DEBUG,		"DEBUG" },
	{ PHYI_LOOPBACK,	"LOOPBACK" },
	{ IPIF_POINTOPOINT,	"POINTOPOINT" },
	{ ILLF_NOTRAILERS,	"NOTRAILERS" },
	{ PHYI_RUNNING,		"RUNNING" },
	{ ILLF_NOARP,		"NOARP" },
	{ PHYI_PROMISC,		"PROMISC" },
	{ PHYI_ALLMULTI,	"ALLMULTI" },
	{ PHYI_INTELLIGENT,	"INTELLIGENT" },
	{ ILLF_MULTICAST,	"MULTICAST" },
	{ PHYI_MULTI_BCAST,	"MULTI_BCAST" },
	{ IPIF_UNNUMBERED,	"UNNUMBERED" },
	{ IPIF_DHCPRUNNING,	"DHCP" },
	{ IPIF_PRIVATE,		"PRIVATE" },
	{ IPIF_NOXMIT,		"NOXMIT" },
	{ IPIF_NOLOCAL,		"NOLOCAL" },
	{ IPIF_DEPRECATED,	"DEPRECATED" },
	{ IPIF_PREFERRED,	"PREFERRED" },
	{ IPIF_TEMPORARY,	"TEMPORARY" },
	{ IPIF_ADDRCONF,	"ADDRCONF" },
	{ PHYI_VIRTUAL,		"VIRTUAL" },
	{ ILLF_ROUTER,		"ROUTER" },
	{ ILLF_NONUD,		"NONUD" },
	{ IPIF_ANYCAST,		"ANYCAST" },
	{ ILLF_NORTEXCH,	"NORTEXCH" },
	{ ILLF_IPV4,		"IPV4" },
	{ ILLF_IPV6,		"IPV6" },
	{ IPIF_NOFAILOVER,	"NOFAILOVER" },
	{ PHYI_FAILED,		"FAILED" },
	{ PHYI_STANDBY,		"STANDBY" },
	{ PHYI_INACTIVE,	"INACTIVE" },
	{ PHYI_OFFLINE,		"OFFLINE" },
};
    492 
/* Six bytes of 0xFF; presumably the Ethernet-style broadcast address. */
static uchar_t	ip_six_byte_all_ones[] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
    494 
/*
 * Per-media-type table.  Each row gives: the DLPI mac type (DL_*), the
 * matching interface type (IFT_*), the IPv4 and IPv6 address mapping
 * routines, and the routine that derives an IPv6 interface id.
 * SUNW_DL_VNI has no routines at all (NULL entries).  DL_OTHER is the last
 * row; presumably it serves as the fallback for ip_m_lookup() -- confirm
 * there.
 */
static ip_m_t	ip_m_tbl[] = {
	{ DL_ETHER, IFT_ETHER, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
	    ip_ether_v6intfid },
	{ DL_CSMACD, IFT_ISO88023, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
	    ip_nodef_v6intfid },
	{ DL_TPB, IFT_ISO88024, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
	    ip_nodef_v6intfid },
	{ DL_TPR, IFT_ISO88025, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
	    ip_nodef_v6intfid },
	{ DL_FDDI, IFT_FDDI, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
	    ip_ether_v6intfid },
	{ DL_IB, IFT_IB, ip_ib_v4mapinfo, ip_ib_v6mapinfo,
	    ip_ib_v6intfid },
	{ SUNW_DL_VNI, IFT_OTHER, NULL, NULL, NULL},
	{ DL_OTHER, IFT_OTHER, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
	    ip_nodef_v6intfid }
};
    512 
static ill_t	ill_null;		/* Empty ILL for init. */
char	ipif_loopback_name[] = "lo0";	/* name of the loopback interface */
/* Suffixes beginning with ':'; presumably appended to an interface name */
static char *ipv4_forward_suffix = ":ip_forwarding";
static char *ipv6_forward_suffix = ":ip6_forwarding";
static	sin6_t	sin6_null;	/* Zero address for quick clears */
static	sin_t	sin_null;	/* Zero address for quick clears */

/*
 * When set search for unused ipif_seqid
 * NOTE(review): the sentence above does not obviously describe ipif_zero,
 * which is an all-zero ipif_t by virtue of static storage; the comment may
 * have been left behind by a removed variable -- confirm.
 */
static ipif_t	ipif_zero;

/*
 * ppa arena is created after this many
 * interfaces have been plumbed.
 */
uint_t	ill_no_arena = 12;	/* Setable in /etc/system */
    528 
/*
 * Enable soft rings if ip_squeue_soft_ring or ip_squeue_fanout
 * is set and ip_soft_rings_cnt > 0. ip_squeue_soft_ring is
 * set through platform specific code (Niagara/Ontario).
 *
 * Evaluates to B_FALSE whenever ip_soft_rings_cnt is zero, regardless of
 * the other two tunables.
 */
#define	SOFT_RINGS_ENABLED()	(ip_soft_rings_cnt ? \
		(ip_squeue_soft_ring || ip_squeue_fanout) : B_FALSE)

/* Both DLS-provided capabilities: soft rings and polling. */
#define	ILL_CAPAB_DLS	(ILL_CAPAB_SOFT_RING | ILL_CAPAB_POLL)
    538 
    539 static uint_t
    540 ipif_rand(ip_stack_t *ipst)
    541 {
    542 	ipst->ips_ipif_src_random = ipst->ips_ipif_src_random * 1103515245 +
    543 	    12345;
    544 	return ((ipst->ips_ipif_src_random >> 16) & 0x7fff);
    545 }
    546 
    547 /*
    548  * Allocate per-interface mibs.
    549  * Returns true if ok. False otherwise.
    550  *  ipsq  may not yet be allocated (loopback case ).
    551  */
    552 static boolean_t
    553 ill_allocate_mibs(ill_t *ill)
    554 {
    555 	/* Already allocated? */
    556 	if (ill->ill_ip_mib != NULL) {
    557 		if (ill->ill_isv6)
    558 			ASSERT(ill->ill_icmp6_mib != NULL);
    559 		return (B_TRUE);
    560 	}
    561 
    562 	ill->ill_ip_mib = kmem_zalloc(sizeof (*ill->ill_ip_mib),
    563 	    KM_NOSLEEP);
    564 	if (ill->ill_ip_mib == NULL) {
    565 		return (B_FALSE);
    566 	}
    567 
    568 	/* Setup static information */
    569 	SET_MIB(ill->ill_ip_mib->ipIfStatsEntrySize,
    570 	    sizeof (mib2_ipIfStatsEntry_t));
    571 	if (ill->ill_isv6) {
    572 		ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv6;
    573 		SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize,
    574 		    sizeof (mib2_ipv6AddrEntry_t));
    575 		SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize,
    576 		    sizeof (mib2_ipv6RouteEntry_t));
    577 		SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize,
    578 		    sizeof (mib2_ipv6NetToMediaEntry_t));
    579 		SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize,
    580 		    sizeof (ipv6_member_t));
    581 		SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize,
    582 		    sizeof (ipv6_grpsrc_t));
    583 	} else {
    584 		ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv4;
    585 		SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize,
    586 		    sizeof (mib2_ipAddrEntry_t));
    587 		SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize,
    588 		    sizeof (mib2_ipRouteEntry_t));
    589 		SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize,
    590 		    sizeof (mib2_ipNetToMediaEntry_t));
    591 		SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize,
    592 		    sizeof (ip_member_t));
    593 		SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize,
    594 		    sizeof (ip_grpsrc_t));
    595 
    596 		/*
    597 		 * For a v4 ill, we are done at this point, because per ill
    598 		 * icmp mibs are only used for v6.
    599 		 */
    600 		return (B_TRUE);
    601 	}
    602 
    603 	ill->ill_icmp6_mib = kmem_zalloc(sizeof (*ill->ill_icmp6_mib),
    604 	    KM_NOSLEEP);
    605 	if (ill->ill_icmp6_mib == NULL) {
    606 		kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib));
    607 		ill->ill_ip_mib = NULL;
    608 		return (B_FALSE);
    609 	}
    610 	/* static icmp info */
    611 	ill->ill_icmp6_mib->ipv6IfIcmpEntrySize =
    612 	    sizeof (mib2_ipv6IfIcmpEntry_t);
    613 	/*
    614 	 * The ipIfStatsIfindex and ipv6IfIcmpIndex will be assigned later
    615 	 * after the phyint merge occurs in ipif_set_values -> ill_glist_insert
    616 	 * -> ill_phyint_reinit
    617 	 */
    618 	return (B_TRUE);
    619 }
    620 
    621 /*
    622  * Common code for preparation of ARP commands.  Two points to remember:
    623  * 	1) The ill_name is tacked on at the end of the allocated space so
    624  *	   the templates name_offset field must contain the total space
    625  *	   to allocate less the name length.
    626  *
    627  *	2) The templates name_length field should contain the *template*
    628  *	   length.  We use it as a parameter to bcopy() and then write
    629  *	   the real ill_name_length into the name_length field of the copy.
    630  * (Always called as writer.)
    631  */
    632 mblk_t *
    633 ill_arp_alloc(ill_t *ill, uchar_t *template, caddr_t addr)
    634 {
    635 	arc_t	*arc = (arc_t *)template;
    636 	char	*cp;
    637 	int	len;
    638 	mblk_t	*mp;
    639 	uint_t	name_length = ill->ill_name_length;
    640 	uint_t	template_len = arc->arc_name_length;
    641 
    642 	len = arc->arc_name_offset + name_length;
    643 	mp = allocb(len, BPRI_HI);
    644 	if (mp == NULL)
    645 		return (NULL);
    646 	cp = (char *)mp->b_rptr;
    647 	mp->b_wptr = (uchar_t *)&cp[len];
    648 	if (template_len)
    649 		bcopy(template, cp, template_len);
    650 	if (len > template_len)
    651 		bzero(&cp[template_len], len - template_len);
    652 	mp->b_datap->db_type = M_PROTO;
    653 
    654 	arc = (arc_t *)cp;
    655 	arc->arc_name_length = name_length;
    656 	cp = (char *)arc + arc->arc_name_offset;
    657 	bcopy(ill->ill_name, cp, name_length);
    658 
    659 	if (addr) {
    660 		area_t	*area = (area_t *)mp->b_rptr;
    661 
    662 		cp = (char *)area + area->area_proto_addr_offset;
    663 		bcopy(addr, cp, area->area_proto_addr_length);
    664 		if (area->area_cmd == AR_ENTRY_ADD) {
    665 			cp = (char *)area;
    666 			len = area->area_proto_addr_length;
    667 			if (area->area_proto_mask_offset)
    668 				cp += area->area_proto_mask_offset;
    669 			else
    670 				cp += area->area_proto_addr_offset + len;
    671 			while (len-- > 0)
    672 				*cp++ = (char)~0;
    673 		}
    674 	}
    675 	return (mp);
    676 }
    677 
    678 mblk_t *
    679 ipif_area_alloc(ipif_t *ipif)
    680 {
    681 	return (ill_arp_alloc(ipif->ipif_ill, (uchar_t *)&ip_area_template,
    682 	    (char *)&ipif->ipif_lcl_addr));
    683 }
    684 
    685 mblk_t *
    686 ipif_ared_alloc(ipif_t *ipif)
    687 {
    688 	return (ill_arp_alloc(ipif->ipif_ill, (uchar_t *)&ip_ared_template,
    689 	    (char *)&ipif->ipif_lcl_addr));
    690 }
    691 
    692 mblk_t *
    693 ill_ared_alloc(ill_t *ill, ipaddr_t addr)
    694 {
    695 	return (ill_arp_alloc(ill, (uchar_t *)&ip_ared_template,
    696 	    (char *)&addr));
    697 }
    698 
/*
 * Completely vaporize a lower level tap and all associated interfaces.
 * ill_delete is called only out of ip_close when the device control
 * stream is being closed.
 *
 * Steps, in order: flush the ipsq, run ipif_free() on every ipif of the
 * ill, release the ill_arp_on_mp message, reset multicast routing and conn
 * references to the ill, take the ill down, notify SCTP, and finally
 * unlink the ill from its usesrc group (under ips_ill_g_usesrc_lock held
 * as writer).  The remaining work is done later by ill_delete_tail().
 */
void
ill_delete(ill_t *ill)
{
	ipif_t	*ipif;
	ill_t	*prev_ill;
	ip_stack_t	*ipst = ill->ill_ipst;

	/*
	 * ill_delete may be forcibly entering the ipsq. The previous
	 * ioctl may not have completed and may need to be aborted.
	 * ipsq_flush takes care of it. If we don't need to enter
	 * the ipsq forcibly, the 2nd invocation of ipsq_flush in
	 * ill_delete_tail is sufficient.
	 */
	ipsq_flush(ill);

	/*
	 * Nuke all interfaces.  ipif_free will take down the interface,
	 * remove it from the list, and free the data structure.
	 * Walk down the ipif list and remove the logical interfaces
	 * first before removing the main ipif. We can't unplumb
	 * zeroth interface first in the case of IPv6 as reset_conn_ill
	 * -> ip_ll_delmulti_v6 de-references ill_ipif for checking
	 * POINTOPOINT.
	 *
	 * If ill_ipif was not properly initialized (i.e low on memory),
	 * then no interfaces to clean up. In this case just clean up the
	 * ill.
	 *
	 * NOTE(review): the loop reads ipif->ipif_next after ipif_free(ipif)
	 * has returned, and ill_delete_tail() later walks ill->ill_ipif
	 * again.  Both only work if ipif_free() leaves the ipif allocated
	 * and linked, so the "remove it from the list, and free the data
	 * structure" wording above is presumably stale (an ipif_free_tail()
	 * also exists) -- confirm against ipif_free().
	 */
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
		ipif_free(ipif);

	/*
	 * Used only by ill_arp_on and ill_arp_off, which are writers.
	 * So nobody can be using this mp now. Free the mp allocated for
	 * honoring ILLF_NOARP
	 */
	freemsg(ill->ill_arp_on_mp);
	ill->ill_arp_on_mp = NULL;

	/* Clean up msgs on pending upcalls for mrouted */
	reset_mrt_ill(ill);

	/*
	 * ipif_free -> reset_conn_ipif will remove all multicast
	 * references for IPv4. For IPv6, we need to do it here as
	 * it points only at ills.
	 */
	reset_conn_ill(ill);

	/*
	 * ill_down will arrange to blow off any IRE's dependent on this
	 * ILL, and shut down fragmentation reassembly.
	 */
	ill_down(ill);

	/* Let SCTP know, so that it can remove this from its list. */
	sctp_update_ill(ill, SCTP_ILL_REMOVE);

	/*
	 * If an address on this ILL is being used as a source address then
	 * clear out the pointers in other ILLs that point to this ILL.
	 */
	rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER);
	if (ill->ill_usesrc_grp_next != NULL) {
		if (ill->ill_usesrc_ifindex == 0) { /* usesrc ILL ? */
			ill_disband_usesrc_group(ill);
		} else {	/* consumer of the usesrc ILL */
			/*
			 * Unlink this ill by pointing its predecessor at
			 * its successor.
			 * NOTE(review): prev_ill is dereferenced without a
			 * NULL check; assumes ill_prev_usesrc() always finds
			 * a predecessor for a group member -- confirm.
			 */
			prev_ill = ill_prev_usesrc(ill);
			prev_ill->ill_usesrc_grp_next =
			    ill->ill_usesrc_grp_next;
		}
	}
	rw_exit(&ipst->ips_ill_g_usesrc_lock);
}
    779 
    780 static void
    781 ipif_non_duplicate(ipif_t *ipif)
    782 {
    783 	ill_t *ill = ipif->ipif_ill;
    784 	mutex_enter(&ill->ill_lock);
    785 	if (ipif->ipif_flags & IPIF_DUPLICATE) {
    786 		ipif->ipif_flags &= ~IPIF_DUPLICATE;
    787 		ASSERT(ill->ill_ipif_dup_count > 0);
    788 		ill->ill_ipif_dup_count--;
    789 	}
    790 	mutex_exit(&ill->ill_lock);
    791 }
    792 
    793 /*
    794  * ill_delete_tail is called from ip_modclose after all references
    795  * to the closing ill are gone. The wait is done in ip_modclose.
    796  */
    797 void
    798 ill_delete_tail(ill_t *ill)
    799 {
    800 	mblk_t	**mpp;
    801 	ipif_t	*ipif;
    802 	ip_stack_t	*ipst = ill->ill_ipst;
    803 
    804 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
    805 		ipif_non_duplicate(ipif);
    806 		ipif_down_tail(ipif);
    807 	}
    808 
    809 	ASSERT(ill->ill_ipif_dup_count == 0 &&
    810 	    ill->ill_arp_down_mp == NULL &&
    811 	    ill->ill_arp_del_mapping_mp == NULL);
    812 
    813 	/*
    814 	 * If polling capability is enabled (which signifies direct
    815 	 * upcall into IP and driver has ill saved as a handle),
    816 	 * we need to make sure that unbind has completed before we
    817 	 * let the ill disappear and driver no longer has any reference
    818 	 * to this ill.
    819 	 */
    820 	mutex_enter(&ill->ill_lock);
    821 	while (ill->ill_state_flags & ILL_DL_UNBIND_IN_PROGRESS)
    822 		cv_wait(&ill->ill_cv, &ill->ill_lock);
    823 	mutex_exit(&ill->ill_lock);
    824 
    825 	/*
    826 	 * Clean up polling and soft ring capabilities
    827 	 */
    828 	if (ill->ill_capabilities & (ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING))
    829 		ill_capability_dls_disable(ill);
    830 
    831 	if (ill->ill_net_type != IRE_LOOPBACK)
    832 		qprocsoff(ill->ill_rq);
    833 
    834 	/*
    835 	 * We do an ipsq_flush once again now. New messages could have
    836 	 * landed up from below (M_ERROR or M_HANGUP). Similarly ioctls
    837 	 * could also have landed up if an ioctl thread had looked up
    838 	 * the ill before we set the ILL_CONDEMNED flag, but not yet
    839 	 * enqueued the ioctl when we did the ipsq_flush last time.
    840 	 */
    841 	ipsq_flush(ill);
    842 
    843 	/*
    844 	 * Free capabilities.
    845 	 */
    846 	if (ill->ill_ipsec_capab_ah != NULL) {
    847 		ill_ipsec_capab_delete(ill, DL_CAPAB_IPSEC_AH);
    848 		ill_ipsec_capab_free(ill->ill_ipsec_capab_ah);
    849 		ill->ill_ipsec_capab_ah = NULL;
    850 	}
    851 
    852 	if (ill->ill_ipsec_capab_esp != NULL) {
    853 		ill_ipsec_capab_delete(ill, DL_CAPAB_IPSEC_ESP);
    854 		ill_ipsec_capab_free(ill->ill_ipsec_capab_esp);
    855 		ill->ill_ipsec_capab_esp = NULL;
    856 	}
    857 
    858 	if (ill->ill_mdt_capab != NULL) {
    859 		kmem_free(ill->ill_mdt_capab, sizeof (ill_mdt_capab_t));
    860 		ill->ill_mdt_capab = NULL;
    861 	}
    862 
    863 	if (ill->ill_hcksum_capab != NULL) {
    864 		kmem_free(ill->ill_hcksum_capab, sizeof (ill_hcksum_capab_t));
    865 		ill->ill_hcksum_capab = NULL;
    866 	}
    867 
    868 	if (ill->ill_zerocopy_capab != NULL) {
    869 		kmem_free(ill->ill_zerocopy_capab,
    870 		    sizeof (ill_zerocopy_capab_t));
    871 		ill->ill_zerocopy_capab = NULL;
    872 	}
    873 
    874 	if (ill->ill_lso_capab != NULL) {
    875 		kmem_free(ill->ill_lso_capab, sizeof (ill_lso_capab_t));
    876 		ill->ill_lso_capab = NULL;
    877 	}
    878 
    879 	if (ill->ill_dls_capab != NULL) {
    880 		CONN_DEC_REF(ill->ill_dls_capab->ill_unbind_conn);
    881 		ill->ill_dls_capab->ill_unbind_conn = NULL;
    882 		kmem_free(ill->ill_dls_capab,
    883 		    sizeof (ill_dls_capab_t) +
    884 		    (sizeof (ill_rx_ring_t) * ILL_MAX_RINGS));
    885 		ill->ill_dls_capab = NULL;
    886 	}
    887 
    888 	ASSERT(!(ill->ill_capabilities & ILL_CAPAB_POLL));
    889 
    890 	while (ill->ill_ipif != NULL)
    891 		ipif_free_tail(ill->ill_ipif);
    892 
    893 	/*
    894 	 * We have removed all references to ilm from conn and the ones joined
    895 	 * within the kernel.
    896 	 *
    897 	 * We don't walk conns, mrts and ires because
    898 	 *
    899 	 * 1) reset_conn_ill and reset_mrt_ill cleans up conns and mrts.
    900 	 * 2) ill_down ->ill_downi walks all the ires and cleans up
    901 	 *    ill references.
    902 	 */
    903 	ASSERT(ilm_walk_ill(ill) == 0);
    904 	/*
    905 	 * Take us out of the list of ILLs. ill_glist_delete -> ill_phyint_free
    906 	 * could free the phyint. No more reference to the phyint after this
    907 	 * point.
    908 	 */
    909 	(void) ill_glist_delete(ill);
    910 
    911 	rw_enter(&ipst->ips_ip_g_nd_lock, RW_WRITER);
    912 	if (ill->ill_ndd_name != NULL)
    913 		nd_unload(&ipst->ips_ip_g_nd, ill->ill_ndd_name);
    914 	rw_exit(&ipst->ips_ip_g_nd_lock);
    915 
    916 
    917 	if (ill->ill_frag_ptr != NULL) {
    918 		uint_t count;
    919 
    920 		for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) {
    921 			mutex_destroy(&ill->ill_frag_hash_tbl[count].ipfb_lock);
    922 		}
    923 		mi_free(ill->ill_frag_ptr);
    924 		ill->ill_frag_ptr = NULL;
    925 		ill->ill_frag_hash_tbl = NULL;
    926 	}
    927 
    928 	freemsg(ill->ill_nd_lla_mp);
    929 	/* Free all retained control messages. */
    930 	mpp = &ill->ill_first_mp_to_free;
    931 	do {
    932 		while (mpp[0]) {
    933 			mblk_t  *mp;
    934 			mblk_t  *mp1;
    935 
    936 			mp = mpp[0];
    937 			mpp[0] = mp->b_next;
    938 			for (mp1 = mp; mp1 != NULL; mp1 = mp1->b_cont) {
    939 				mp1->b_next = NULL;
    940 				mp1->b_prev =