Home | History | Annotate | Download | only in ip
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 /* Copyright (c) 1990 Mentat Inc. */
     26 
     27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
     28 
     29 
     30 /*
     31  * This file contains routines that manipulate Internet Routing Entries (IREs).
     32  */
     33 
     34 #include <sys/types.h>
     35 #include <sys/stream.h>
     36 #include <sys/stropts.h>
     37 #include <sys/ddi.h>
     38 #include <sys/cmn_err.h>
     39 #include <sys/policy.h>
     40 
     41 #include <sys/systm.h>
     42 #include <sys/kmem.h>
     43 #include <sys/param.h>
     44 #include <sys/socket.h>
     45 #include <net/if.h>
     46 #include <net/route.h>
     47 #include <netinet/in.h>
     48 #include <net/if_dl.h>
     49 #include <netinet/ip6.h>
     50 #include <netinet/icmp6.h>
     51 
     52 #include <inet/common.h>
     53 #include <inet/mi.h>
     54 #include <inet/ip.h>
     55 #include <inet/ip6.h>
     56 #include <inet/ip_ndp.h>
     57 #include <inet/arp.h>
     58 #include <inet/ip_if.h>
     59 #include <inet/ip_ire.h>
     60 #include <inet/ip_ftable.h>
     61 #include <inet/ip_rts.h>
     62 #include <inet/nd.h>
     63 
     64 #include <net/pfkeyv2.h>
     65 #include <inet/ipsec_info.h>
     66 #include <inet/sadb.h>
     67 #include <sys/kmem.h>
     68 #include <inet/tcp.h>
     69 #include <inet/ipclassifier.h>
     70 #include <sys/zone.h>
     71 #include <sys/cpuvar.h>
     72 
     73 #include <sys/tsol/label.h>
     74 #include <sys/tsol/tnet.h>
     75 
     76 struct kmem_cache *rt_entry_cache;
     77 
     78 /*
     79  * Synchronization notes:
     80  *
     81  * The fields of the ire_t struct are protected in the following way :
     82  *
     83  * ire_next/ire_ptpn
     84  *
     85  *	- bucket lock of the respective tables (cache or forwarding tables).
     86  *
     87  * ire_mp, ire_rfq, ire_stq, ire_u *except* ire_gateway_addr[v6], ire_mask,
     88  * ire_type, ire_create_time, ire_masklen, ire_ipversion, ire_flags, ire_ipif,
     89  * ire_ihandle, ire_phandle, ire_nce, ire_bucket, ire_in_ill, ire_in_src_addr
     90  *
     91  *	- Set in ire_create_v4/v6 and never changes after that. Thus,
     92  *	  we don't need a lock whenever these fields are accessed.
     93  *
     94  *	- ire_bucket and ire_masklen (also set in ire_create) is set in
     95  *        ire_add_v4/ire_add_v6 before inserting in the bucket and never
     96  *        changes after that. Thus we don't need a lock whenever these
     97  *	  fields are accessed.
     98  *
     99  * ire_gateway_addr_v4[v6]
    100  *
    101  *	- ire_gateway_addr_v4[v6] is set during ire_create and later modified
    102  *	  by rts_setgwr[v6]. As ire_gateway_addr is a uint32_t, updates to
 *	  it are assumed to be atomic and hence the other parts of the code
 *	  do not use any locks. ire_gateway_addr_v6 updates are not atomic
    105  *	  and hence any access to it uses ire_lock to get/set the right value.
    106  *
    107  * ire_ident, ire_refcnt
    108  *
    109  *	- Updated atomically using atomic_add_32
    110  *
    111  * ire_ssthresh, ire_rtt_sd, ire_rtt, ire_ib_pkt_count, ire_ob_pkt_count
    112  *
    113  *	- Assumes that 32 bit writes are atomic. No locks. ire_lock is
    114  *	  used to serialize updates to ire_ssthresh, ire_rtt_sd, ire_rtt.
    115  *
    116  * ire_max_frag, ire_frag_flag
    117  *
    118  *	- ire_lock is used to set/read both of them together.
    119  *
    120  * ire_tire_mark
    121  *
    122  *	- Set in ire_create and updated in ire_expire, which is called
    123  *	  by only one function namely ip_trash_timer_expire. Thus only
    124  *	  one function updates and examines the value.
    125  *
    126  * ire_marks
    127  *	- bucket lock protects this.
    128  *
    129  * ire_ipsec_overhead/ire_ll_hdr_length
    130  *
    131  *	- Place holder for returning the information to the upper layers
    132  *	  when IRE_DB_REQ comes down.
    133  *
    134  *
    135  * ipv6_ire_default_count is protected by the bucket lock of
    136  * ip_forwarding_table_v6[0][0].
    137  *
    138  * ipv6_ire_default_index is not protected as it  is just a hint
    139  * at which default gateway to use. There is nothing
    140  * wrong in using the same gateway for two different connections.
    141  *
    142  * As we always hold the bucket locks in all the places while accessing
    143  * the above values, it is natural to use them for protecting them.
    144  *
    145  * We have a separate cache table and forwarding table for IPv4 and IPv6.
    146  * Cache table (ip_cache_table/ip_cache_table_v6) is a pointer to an
    147  * array of irb_t structures. The IPv6 forwarding table
    148  * (ip_forwarding_table_v6) is an array of pointers to arrays of irb_t
    149  *  structure. ip_forwarding_table_v6 is allocated dynamically in
    150  * ire_add_v6. ire_ft_init_lock is used to serialize multiple threads
    151  * initializing the same bucket. Once a bucket is initialized, it is never
 * de-allocated. This assumption enables us to access
    153  * ip_forwarding_table_v6[i] without any locks.
    154  *
    155  * The forwarding table for IPv4 is a radix tree whose leaves
    156  * are rt_entry structures containing the irb_t for the rt_dst. The irb_t
    157  * for IPv4 is dynamically allocated and freed.
    158  *
    159  * Each irb_t - ire bucket structure has a lock to protect
    160  * a bucket and the ires residing in the bucket have a back pointer to
    161  * the bucket structure. It also has a reference count for the number
    162  * of threads walking the bucket - irb_refcnt which is bumped up
 * using the IRB_REFHOLD macro. The flags irb_flags can be
    164  * set to IRE_MARK_CONDEMNED indicating that there are some ires
    165  * in this bucket that are marked with IRE_MARK_CONDEMNED and the
    166  * last thread to leave the bucket should delete the ires. Usually
    167  * this is done by the IRB_REFRELE macro which is used to decrement
    168  * the reference count on a bucket. See comments above irb_t structure
    169  * definition in ip.h for further details.
    170  *
    171  * IRE_REFHOLD/IRE_REFRELE macros operate on the ire which increments/
    172  * decrements the reference count, ire_refcnt, atomically on the ire.
    173  * ire_refcnt is modified only using this macro. Operations on the IRE
    174  * could be described as follows :
    175  *
    176  * CREATE an ire with reference count initialized to 1.
    177  *
    178  * ADDITION of an ire holds the bucket lock, checks for duplicates
    179  * and then adds the ire. ire_add_v4/ire_add_v6 returns the ire after
    180  * bumping up once more i.e the reference count is 2. This is to avoid
    181  * an extra lookup in the functions calling ire_add which wants to
    182  * work with the ire after adding.
    183  *
    184  * LOOKUP of an ire bumps up the reference count using IRE_REFHOLD
 * macro. It is valid to bump up the reference count of the IRE,
    186  * after the lookup has returned an ire. Following are the lookup
    187  * functions that return an HELD ire :
    188  *
    189  * ire_lookup_local[_v6], ire_ctable_lookup[_v6], ire_ftable_lookup[_v6],
    190  * ire_cache_lookup[_v6], ire_lookup_multi[_v6], ire_route_lookup[_v6],
    191  * ipif_to_ire[_v6].
    192  *
    193  * DELETION of an ire holds the bucket lock, removes it from the list
    194  * and then decrements the reference count for having removed from the list
    195  * by using the IRE_REFRELE macro. If some other thread has looked up
    196  * the ire, the reference count would have been bumped up and hence
    197  * this ire will not be freed once deleted. It will be freed once the
    198  * reference count drops to zero.
    199  *
    200  * Add and Delete acquires the bucket lock as RW_WRITER, while all the
    201  * lookups acquire the bucket lock as RW_READER.
    202  *
 * NOTE : The only functions that do the IRE_REFRELE when an ire is
    204  *	  passed as an argument are :
    205  *
    206  *	  1) ip_wput_ire : This is because it IRE_REFHOLD/RELEs the
    207  *			   broadcast ires it looks up internally within
    208  *			   the function. Currently, for simplicity it does
    209  *			   not differentiate the one that is passed in and
    210  *			   the ones it looks up internally. It always
    211  *			   IRE_REFRELEs.
    212  *	  2) ire_send
    213  *	     ire_send_v6 : As ire_send calls ip_wput_ire and other functions
    214  *			   that take ire as an argument, it has to selectively
    215  *			   IRE_REFRELE the ire. To maintain symmetry,
    216  *			   ire_send_v6 does the same.
    217  *
    218  * Otherwise, the general rule is to do the IRE_REFRELE in the function
    219  * that is passing the ire as an argument.
    220  *
    221  * In trying to locate ires the following points are to be noted.
    222  *
    223  * IRE_MARK_CONDEMNED signifies that the ire has been logically deleted and is
    224  * to be ignored when walking the ires using ire_next.
    225  *
    226  * IRE_MARK_HIDDEN signifies that the ire is a special ire typically for the
    227  * benefit of in.mpathd which needs to probe interfaces for failures. Normal
    228  * applications should not be seeing this ire and hence this ire is ignored
    229  * in most cases in the search using ire_next.
    230  *
    231  * Zones note:
    232  *	Walking IREs within a given zone also walks certain ires in other
    233  *	zones.  This is done intentionally.  IRE walks with a specified
    234  *	zoneid are used only when doing informational reports, and
    235  *	zone users want to see things that they can access. See block
    236  *	comment in ire_walk_ill_match().
    237  */
    238 
    239 /*
 * The minimum size of IRE cache table.  It will be recalculated in
 * ip_ire_init().
 * Settable in /etc/system
    243  */
    244 uint32_t ip_cache_table_size = IP_CACHE_TABLE_SIZE;
    245 uint32_t ip6_cache_table_size = IP6_CACHE_TABLE_SIZE;
    246 
    247 /*
    248  * The size of the forwarding table.  We will make sure that it is a
    249  * power of 2 in ip_ire_init().
 * Settable in /etc/system
    251  */
    252 uint32_t ip6_ftable_hash_size = IP6_FTABLE_HASH_SIZE;
    253 
    254 struct	kmem_cache	*ire_cache;
    255 static ire_t	ire_null;
    256 
    257 /*
    258  * The threshold number of IRE in a bucket when the IREs are
    259  * cleaned up.  This threshold is calculated later in ip_open()
    260  * based on the speed of CPU and available memory.  This default
    261  * value is the maximum.
    262  *
    263  * We have two kinds of cached IRE, temporary and
    264  * non-temporary.  Temporary IREs are marked with
    265  * IRE_MARK_TEMPORARY.  They are IREs created for non
    266  * TCP traffic and for forwarding purposes.  All others
    267  * are non-temporary IREs.  We don't mark IRE created for
    268  * TCP as temporary because TCP is stateful and there are
    269  * info stored in the IRE which can be shared by other TCP
    270  * connections to the same destination.  For connected
    271  * endpoint, we also don't want to mark the IRE used as
    272  * temporary because the same IRE will be used frequently,
    273  * otherwise, the app should not do a connect().  We change
    274  * the marking at ip_bind_connected_*() if necessary.
    275  *
    276  * We want to keep the cache IRE hash bucket length reasonably
    277  * short, otherwise IRE lookup functions will take "forever."
    278  * We use the "crude" function that the IRE bucket
    279  * length should be based on the CPU speed, which is 1 entry
    280  * per x MHz, depending on the shift factor ip_ire_cpu_ratio
    281  * (n).  This means that with a 750MHz CPU, the max bucket
    282  * length can be (750 >> n) entries.
    283  *
    284  * Note that this threshold is separate for temp and non-temp
    285  * IREs.  This means that the actual bucket length can be
    286  * twice as that.  And while we try to keep temporary IRE
    287  * length at most at the threshold value, we do not attempt to
    288  * make the length for non-temporary IREs fixed, for the
    289  * reason stated above.  Instead, we start trying to find
    290  * "unused" non-temporary IREs when the bucket length reaches
    291  * this threshold and clean them up.
    292  *
    293  * We also want to limit the amount of memory used by
    294  * IREs.  So if we are allowed to use ~3% of memory (M)
    295  * for those IREs, each bucket should not have more than
    296  *
    297  * 	M / num of cache bucket / sizeof (ire_t)
    298  *
    299  * Again the above memory uses are separate for temp and
    300  * non-temp cached IREs.
    301  *
    302  * We may also want the limit to be a function of the number
    303  * of interfaces and number of CPUs.  Doing the initialization
    304  * in ip_open() means that every time an interface is plumbed,
    305  * the max is re-calculated.  Right now, we don't do anything
    306  * different.  In future, when we have more experience, we
    307  * may want to change this behavior.
    308  */
    309 uint32_t ip_ire_max_bucket_cnt = 10;	/* Setable in /etc/system */
    310 uint32_t ip6_ire_max_bucket_cnt = 10;
    311 uint32_t ip_ire_cleanup_cnt = 2;
    312 
    313 /*
    314  * The minimum of the temporary IRE bucket count.  We do not want
    315  * the length of each bucket to be too short.  This may hurt
    316  * performance of some apps as the temporary IREs are removed too
    317  * often.
    318  */
    319 uint32_t ip_ire_min_bucket_cnt = 3;	/* /etc/system - not used */
    320 uint32_t ip6_ire_min_bucket_cnt = 3;
    321 
    322 /*
    323  * The ratio of memory consumed by IRE used for temporary to available
    324  * memory.  This is a shift factor, so 6 means the ratio 1 to 64.  This
    325  * value can be changed in /etc/system.  6 is a reasonable number.
    326  */
    327 uint32_t ip_ire_mem_ratio = 6;	/* /etc/system */
    328 /* The shift factor for CPU speed to calculate the max IRE bucket length. */
    329 uint32_t ip_ire_cpu_ratio = 7;	/* /etc/system */
    330 
    331 typedef struct nce_clookup_s {
    332 	ipaddr_t ncecl_addr;
    333 	boolean_t ncecl_found;
    334 } nce_clookup_t;
    335 
    336 /*
    337  * The maximum number of buckets in IRE cache table.  In future, we may
    338  * want to make it a dynamic hash table.  For the moment, we fix the
    339  * size and allocate the table in ip_ire_init() when IP is first loaded.
    340  * We take into account the amount of memory a system has.
    341  */
    342 #define	IP_MAX_CACHE_TABLE_SIZE	4096
    343 
    344 /* Setable in /etc/system */
    345 static uint32_t	ip_max_cache_table_size = IP_MAX_CACHE_TABLE_SIZE;
    346 static uint32_t	ip6_max_cache_table_size = IP_MAX_CACHE_TABLE_SIZE;
    347 
    348 #define	NUM_ILLS	2	/* To build the ILL list to unlock */
    349 
    350 /* Zero iulp_t for initialization. */
    351 const iulp_t	ire_uinfo_null = { 0 };
    352 
    353 static int	ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp,
    354     ipsq_func_t func, boolean_t);
    355 static void	ire_delete_v4(ire_t *ire);
    356 static void	ire_walk_ipvers(pfv_t func, void *arg, uchar_t vers,
    357     zoneid_t zoneid, ip_stack_t *);
    358 static void	ire_walk_ill_ipvers(uint_t match_flags, uint_t ire_type,
    359     pfv_t func, void *arg, uchar_t vers, ill_t *ill);
    360 static void	ire_cache_cleanup(irb_t *irb, uint32_t threshold,
    361     ire_t *ref_ire);
    362 static	void	ip_nce_clookup_and_delete(nce_t *nce, void *arg);
    363 #ifdef DEBUG
    364 static void	ire_trace_cleanup(const ire_t *);
    365 #endif
    366 
    367 /*
    368  * To avoid bloating the code, we call this function instead of
    369  * using the macro IRE_REFRELE. Use macro only in performance
    370  * critical paths.
    371  *
    372  * Must not be called while holding any locks. Otherwise if this is
    373  * the last reference to be released there is a chance of recursive mutex
    374  * panic due to ire_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying
    375  * to restart an ioctl. The one exception is when the caller is sure that
    376  * this is not the last reference to be released. Eg. if the caller is
    377  * sure that the ire has not been deleted and won't be deleted.
    378  */
    379 void
    380 ire_refrele(ire_t *ire)
    381 {
    382 	IRE_REFRELE(ire);
    383 }
    384 
    385 void
    386 ire_refrele_notr(ire_t *ire)
    387 {
    388 	IRE_REFRELE_NOTR(ire);
    389 }
    390 
    391 /*
    392  * kmem_cache_alloc constructor for IRE in kma space.
    393  * Note that when ire_mp is set the IRE is stored in that mblk and
    394  * not in this cache.
    395  */
    396 /* ARGSUSED */
    397 static int
    398 ip_ire_constructor(void *buf, void *cdrarg, int kmflags)
    399 {
    400 	ire_t	*ire = buf;
    401 
    402 	ire->ire_nce = NULL;
    403 
    404 	return (0);
    405 }
    406 
    407 /* ARGSUSED1 */
    408 static void
    409 ip_ire_destructor(void *buf, void *cdrarg)
    410 {
    411 	ire_t	*ire = buf;
    412 
    413 	ASSERT(ire->ire_nce == NULL);
    414 }
    415 
    416 /*
    417  * This function is associated with the IP_IOC_IRE_ADVISE_NO_REPLY
    418  * IOCTL.  It is used by TCP (or other ULPs) to supply revised information
    419  * for an existing CACHED IRE.
    420  */
    421 /* ARGSUSED */
    422 int
    423 ip_ire_advise(queue_t *q, mblk_t *mp, cred_t *ioc_cr)
    424 {
    425 	uchar_t	*addr_ucp;
    426 	ipic_t	*ipic;
    427 	ire_t	*ire;
    428 	ipaddr_t	addr;
    429 	in6_addr_t	v6addr;
    430 	irb_t	*irb;
    431 	zoneid_t	zoneid;
    432 	ip_stack_t	*ipst = CONNQ_TO_IPST(q);
    433 
    434 	ASSERT(q->q_next == NULL);
    435 	zoneid = Q_TO_CONN(q)->conn_zoneid;
    436 
    437 	/*
    438 	 * Check privilege using the ioctl credential; if it is NULL
    439 	 * then this is a kernel message and therefor privileged.
    440 	 */
    441 	if (ioc_cr != NULL && secpolicy_ip_config(ioc_cr, B_FALSE) != 0)
    442 		return (EPERM);
    443 
    444 	ipic = (ipic_t *)mp->b_rptr;
    445 	if (!(addr_ucp = mi_offset_param(mp, ipic->ipic_addr_offset,
    446 	    ipic->ipic_addr_length))) {
    447 		return (EINVAL);
    448 	}
    449 	if (!OK_32PTR(addr_ucp))
    450 		return (EINVAL);
    451 	switch (ipic->ipic_addr_length) {
    452 	case IP_ADDR_LEN: {
    453 		/* Extract the destination address. */
    454 		addr = *(ipaddr_t *)addr_ucp;
    455 		/* Find the corresponding IRE. */
    456 		ire = ire_cache_lookup(addr, zoneid, NULL, ipst);
    457 		break;
    458 	}
    459 	case IPV6_ADDR_LEN: {
    460 		/* Extract the destination address. */
    461 		v6addr = *(in6_addr_t *)addr_ucp;
    462 		/* Find the corresponding IRE. */
    463 		ire = ire_cache_lookup_v6(&v6addr, zoneid, NULL, ipst);
    464 		break;
    465 	}
    466 	default:
    467 		return (EINVAL);
    468 	}
    469 
    470 	if (ire == NULL)
    471 		return (ENOENT);
    472 	/*
    473 	 * Update the round trip time estimate and/or the max frag size
    474 	 * and/or the slow start threshold.
    475 	 *
    476 	 * We serialize multiple advises using ire_lock.
    477 	 */
    478 	mutex_enter(&ire->ire_lock);
    479 	if (ipic->ipic_rtt) {
    480 		/*
    481 		 * If there is no old cached values, initialize them
    482 		 * conservatively.  Set them to be (1.5 * new value).
    483 		 */
    484 		if (ire->ire_uinfo.iulp_rtt != 0) {
    485 			ire->ire_uinfo.iulp_rtt = (ire->ire_uinfo.iulp_rtt +
    486 			    ipic->ipic_rtt) >> 1;
    487 		} else {
    488 			ire->ire_uinfo.iulp_rtt = ipic->ipic_rtt +
    489 			    (ipic->ipic_rtt >> 1);
    490 		}
    491 		if (ire->ire_uinfo.iulp_rtt_sd != 0) {
    492 			ire->ire_uinfo.iulp_rtt_sd =
    493 			    (ire->ire_uinfo.iulp_rtt_sd +
    494 			    ipic->ipic_rtt_sd) >> 1;
    495 		} else {
    496 			ire->ire_uinfo.iulp_rtt_sd = ipic->ipic_rtt_sd +
    497 			    (ipic->ipic_rtt_sd >> 1);
    498 		}
    499 	}
    500 	if (ipic->ipic_max_frag)
    501 		ire->ire_max_frag = MIN(ipic->ipic_max_frag, IP_MAXPACKET);
    502 	if (ipic->ipic_ssthresh != 0) {
    503 		if (ire->ire_uinfo.iulp_ssthresh != 0)
    504 			ire->ire_uinfo.iulp_ssthresh =
    505 			    (ipic->ipic_ssthresh +
    506 			    ire->ire_uinfo.iulp_ssthresh) >> 1;
    507 		else
    508 			ire->ire_uinfo.iulp_ssthresh = ipic->ipic_ssthresh;
    509 	}
    510 	/*
    511 	 * Don't need the ire_lock below this. ire_type does not change
    512 	 * after initialization. ire_marks is protected by irb_lock.
    513 	 */
    514 	mutex_exit(&ire->ire_lock);
    515 
    516 	if (ipic->ipic_ire_marks != 0 && ire->ire_type == IRE_CACHE) {
    517 		/*
    518 		 * Only increment the temporary IRE count if the original
    519 		 * IRE is not already marked temporary.
    520 		 */
    521 		irb = ire->ire_bucket;
    522 		rw_enter(&irb->irb_lock, RW_WRITER);
    523 		if ((ipic->ipic_ire_marks & IRE_MARK_TEMPORARY) &&
    524 		    !(ire->ire_marks & IRE_MARK_TEMPORARY)) {
    525 			irb->irb_tmp_ire_cnt++;
    526 		}
    527 		ire->ire_marks |= ipic->ipic_ire_marks;
    528 		rw_exit(&irb->irb_lock);
    529 	}
    530 
    531 	ire_refrele(ire);
    532 	return (0);
    533 }
    534 
    535 /*
    536  * This function is associated with the IP_IOC_IRE_DELETE[_NO_REPLY]
    537  * IOCTL[s].  The NO_REPLY form is used by TCP to delete a route IRE
    538  * for a host that is not responding.  This will force an attempt to
    539  * establish a new route, if available, and flush out the ARP entry so
    540  * it will re-resolve.  Management processes may want to use the
    541  * version that generates a reply.
    542  *
    543  * This function does not support IPv6 since Neighbor Unreachability Detection
    544  * means that negative advise like this is useless.
    545  */
    546 /* ARGSUSED */
    547 int
    548 ip_ire_delete(queue_t *q, mblk_t *mp, cred_t *ioc_cr)
    549 {
    550 	uchar_t		*addr_ucp;
    551 	ipaddr_t	addr;
    552 	ire_t		*ire;
    553 	ipid_t		*ipid;
    554 	boolean_t	routing_sock_info = B_FALSE;	/* Sent info? */
    555 	zoneid_t	zoneid;
    556 	ire_t		*gire = NULL;
    557 	ill_t		*ill;
    558 	mblk_t		*arp_mp;
    559 	ip_stack_t	*ipst;
    560 
    561 	ASSERT(q->q_next == NULL);
    562 	zoneid = Q_TO_CONN(q)->conn_zoneid;
    563 	ipst = CONNQ_TO_IPST(q);
    564 
    565 	/*
    566 	 * Check privilege using the ioctl credential; if it is NULL
    567 	 * then this is a kernel message and therefor privileged.
    568 	 */
    569 	if (ioc_cr != NULL && secpolicy_ip_config(ioc_cr, B_FALSE) != 0)
    570 		return (EPERM);
    571 
    572 	ipid = (ipid_t *)mp->b_rptr;
    573 
    574 	/* Only actions on IRE_CACHEs are acceptable at present. */
    575 	if (ipid->ipid_ire_type != IRE_CACHE)
    576 		return (EINVAL);
    577 
    578 	addr_ucp = mi_offset_param(mp, ipid->ipid_addr_offset,
    579 	    ipid->ipid_addr_length);
    580 	if (addr_ucp == NULL || !OK_32PTR(addr_ucp))
    581 		return (EINVAL);
    582 	switch (ipid->ipid_addr_length) {
    583 	case IP_ADDR_LEN:
    584 		/* addr_ucp points at IP addr */
    585 		break;
    586 	case sizeof (sin_t): {
    587 		sin_t	*sin;
    588 		/*
    589 		 * got complete (sockaddr) address - increment addr_ucp to point
    590 		 * at the ip_addr field.
    591 		 */
    592 		sin = (sin_t *)addr_ucp;
    593 		addr_ucp = (uchar_t *)&sin->sin_addr.s_addr;
    594 		break;
    595 	}
    596 	default:
    597 		return (EINVAL);
    598 	}
    599 	/* Extract the destination address. */
    600 	bcopy(addr_ucp, &addr, IP_ADDR_LEN);
    601 
    602 	/* Try to find the CACHED IRE. */
    603 	ire = ire_cache_lookup(addr, zoneid, NULL, ipst);
    604 
    605 	/* Nail it. */
    606 	if (ire) {
    607 		/* Allow delete only on CACHE entries */
    608 		if (ire->ire_type != IRE_CACHE) {
    609 			ire_refrele(ire);
    610 			return (EINVAL);
    611 		}
    612 
    613 		/*
    614 		 * Verify that the IRE has been around for a while.
    615 		 * This is to protect against transport protocols
    616 		 * that are too eager in sending delete messages.
    617 		 */
    618 		if (gethrestime_sec() <
    619 		    ire->ire_create_time + ipst->ips_ip_ignore_delete_time) {
    620 			ire_refrele(ire);
    621 			return (EINVAL);
    622 		}
    623 		/*
    624 		 * Now we have a potentially dead cache entry. We need
    625 		 * to remove it.
    626 		 * If this cache entry is generated from a
    627 		 * default route (i.e., ire_cmask == 0),
    628 		 * search the default list and mark it dead and some
    629 		 * background process will try to activate it.
    630 		 */
    631 		if ((ire->ire_gateway_addr != 0) && (ire->ire_cmask == 0)) {
    632 			/*
    633 			 * Make sure that we pick a different
    634 			 * IRE_DEFAULT next time.
    635 			 */
    636 			ire_t *gw_ire;
    637 			irb_t *irb = NULL;
    638 			uint_t match_flags;
    639 
    640 			match_flags = (MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE);
    641 
    642 			gire = ire_ftable_lookup(ire->ire_addr,
    643 			    ire->ire_cmask, 0, 0,
    644 			    ire->ire_ipif, NULL, zoneid, 0, NULL, match_flags,
    645 			    ipst);
    646 
    647 			ip3dbg(("ire_ftable_lookup() returned gire %p\n",
    648 			    (void *)gire));
    649 
    650 			if (gire != NULL) {
    651 				irb = gire->ire_bucket;
    652 
    653 				/*
    654 				 * We grab it as writer just to serialize
    655 				 * multiple threads trying to bump up
    656 				 * irb_rr_origin
    657 				 */
    658 				rw_enter(&irb->irb_lock, RW_WRITER);
    659 				if ((gw_ire = irb->irb_rr_origin) == NULL) {
    660 					rw_exit(&irb->irb_lock);
    661 					goto done;
    662 				}
    663 
    664 				DTRACE_PROBE1(ip__ire__del__origin,
    665 				    (ire_t *), gw_ire);
    666 
    667 				/* Skip past the potentially bad gateway */
    668 				if (ire->ire_gateway_addr ==
    669 				    gw_ire->ire_gateway_addr) {
    670 					ire_t *next = gw_ire->ire_next;
    671 
    672 					DTRACE_PROBE2(ip__ire__del,
    673 					    (ire_t *), gw_ire, (irb_t *), irb);
    674 					IRE_FIND_NEXT_ORIGIN(next);
    675 					irb->irb_rr_origin = next;
    676 				}
    677 				rw_exit(&irb->irb_lock);
    678 			}
    679 		}
    680 done:
    681 		if (gire != NULL)
    682 			IRE_REFRELE(gire);
    683 		/* report the bad route to routing sockets */
    684 		ip_rts_change(RTM_LOSING, ire->ire_addr, ire->ire_gateway_addr,
    685 		    ire->ire_mask, ire->ire_src_addr, 0, 0, 0,
    686 		    (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_IFA), ipst);
    687 		routing_sock_info = B_TRUE;
    688 
    689 		/*
    690 		 * TCP is really telling us to start over completely, and it
    691 		 * expects that we'll resend the ARP query.  Tell ARP to
    692 		 * discard the entry, if this is a local destination.
    693 		 */
    694 		ill = ire->ire_stq->q_ptr;
    695 		if (ire->ire_gateway_addr == 0 &&
    696 		    (arp_mp = ill_ared_alloc(ill, addr)) != NULL) {
    697 			putnext(ill->ill_rq, arp_mp);
    698 		}
    699 
    700 		ire_delete(ire);
    701 		ire_refrele(ire);
    702 	}
    703 	/*
    704 	 * Also look for an IRE_HOST type redirect ire and
    705 	 * remove it if present.
    706 	 */
    707 	ire = ire_route_lookup(addr, 0, 0, IRE_HOST, NULL, NULL,
    708 	    ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
    709 
    710 	/* Nail it. */
    711 	if (ire != NULL) {
    712 		if (ire->ire_flags & RTF_DYNAMIC) {
    713 			if (!routing_sock_info) {
    714 				ip_rts_change(RTM_LOSING, ire->ire_addr,
    715 				    ire->ire_gateway_addr, ire->ire_mask,
    716 				    ire->ire_src_addr, 0, 0, 0,
    717 				    (RTA_DST | RTA_GATEWAY |
    718 				    RTA_NETMASK | RTA_IFA),
    719 				    ipst);
    720 			}
    721 			ire_delete(ire);
    722 		}
    723 		ire_refrele(ire);
    724 	}
    725 	return (0);
    726 }
    727 
    728 
    729 /*
    730  * ip_ire_req is called by ip_wput when an IRE_DB_REQ_TYPE message is handed
    731  * down from the Upper Level Protocol to request a copy of the IRE (to check
    732  * its type or to extract information like round-trip time estimates or the
    733  * MTU.)
    734  * The address is assumed to be in the ire_addr field. If no IRE is found
    735  * an IRE is returned with ire_type being zero.
    736  * Note that the upper lavel protocol has to check for broadcast
    737  * (IRE_BROADCAST) and multicast (CLASSD(addr)).
    738  * If there is a b_cont the resulting IRE_DB_TYPE mblk is placed at the
    739  * end of the returned message.
    740  *
    741  * TCP sends down a message of this type with a connection request packet
    742  * chained on. UDP and ICMP send it down to verify that a route exists for
    743  * the destination address when they get connected.
    744  */
    745 void
    746 ip_ire_req(queue_t *q, mblk_t *mp)
    747 {
    748 	ire_t	*inire;
    749 	ire_t	*ire;
    750 	mblk_t	*mp1;
    751 	ire_t	*sire = NULL;
    752 	zoneid_t zoneid = Q_TO_CONN(q)->conn_zoneid;
    753 	ip_stack_t	*ipst = CONNQ_TO_IPST(q);
    754 
    755 	ASSERT(q->q_next == NULL);
    756 
    757 	if ((mp->b_wptr - mp->b_rptr) < sizeof (ire_t) ||
    758 	    !OK_32PTR(mp->b_rptr)) {
    759 		freemsg(mp);
    760 		return;
    761 	}
    762 	inire = (ire_t *)mp->b_rptr;
    763 	/*
    764 	 * Got it, now take our best shot at an IRE.
    765 	 */
    766 	if (inire->ire_ipversion == IPV6_VERSION) {
    767 		ire = ire_route_lookup_v6(&inire->ire_addr_v6, 0, 0, 0,
    768 		    NULL, &sire, zoneid, NULL,
    769 		    (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT), ipst);
    770 	} else {
    771 		ASSERT(inire->ire_ipversion == IPV4_VERSION);
    772 		ire = ire_route_lookup(inire->ire_addr, 0, 0, 0,
    773 		    NULL, &sire, zoneid, NULL,
    774 		    (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT), ipst);
    775 	}
    776 
    777 	/*
    778 	 * We prevent returning IRES with source address INADDR_ANY
    779 	 * as these were temporarily created for sending packets
    780 	 * from endpoints that have conn_unspec_src set.
    781 	 */
    782 	if (ire == NULL ||
    783 	    (ire->ire_ipversion == IPV4_VERSION &&
    784 	    ire->ire_src_addr == INADDR_ANY) ||
    785 	    (ire->ire_ipversion == IPV6_VERSION &&
    786 	    IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6))) {
    787 		inire->ire_type = 0;
    788 	} else {
    789 		bcopy(ire, inire, sizeof (ire_t));
    790 		/* Copy the route metrics from the parent. */
    791 		if (sire != NULL) {
    792 			bcopy(&(sire->ire_uinfo), &(inire->ire_uinfo),
    793 			    sizeof (iulp_t));
    794 		}
    795 
    796 		/*
    797 		 * As we don't lookup global policy here, we may not
    798 		 * pass the right size if per-socket policy is not
    799 		 * present. For these cases, path mtu discovery will
    800 		 * do the right thing.
    801 		 */
    802 		inire->ire_ipsec_overhead = conn_ipsec_length(Q_TO_CONN(q));
    803 
    804 		/* Pass the latest setting of the ip_path_mtu_discovery */
    805 		inire->ire_frag_flag |=
    806 		    (ipst->ips_ip_path_mtu_discovery) ? IPH_DF : 0;
    807 	}
    808 	if (ire != NULL)
    809 		ire_refrele(ire);
    810 	if (sire != NULL)
    811 		ire_refrele(sire);
    812 	mp->b_wptr = &mp->b_rptr[sizeof (ire_t)];
    813 	mp->b_datap->db_type = IRE_DB_TYPE;
    814 
    815 	/* Put the IRE_DB_TYPE mblk last in the chain */
    816 	mp1 = mp->b_cont;
    817 	if (mp1 != NULL) {
    818 		mp->b_cont = NULL;
    819 		linkb(mp1, mp);
    820 		mp = mp1;
    821 	}
    822 	qreply(q, mp);
    823 }
    824 
    825 /*
    826  * Send a packet using the specified IRE.
    827  * If ire_src_addr_v6 is all zero then discard the IRE after
    828  * the packet has been sent.
    829  */
    830 static void
    831 ire_send(queue_t *q, mblk_t *pkt, ire_t *ire)
    832 {
    833 	mblk_t *ipsec_mp;
    834 	boolean_t is_secure;
    835 	uint_t ifindex;
    836 	ill_t	*ill;
    837 	zoneid_t zoneid = ire->ire_zoneid;
    838 	ip_stack_t	*ipst = ire->ire_ipst;
    839 
    840 	ASSERT(ire->ire_ipversion == IPV4_VERSION);
    841 	ASSERT(!(ire->ire_type & IRE_LOCAL)); /* Has different ire_zoneid */
    842 	ipsec_mp = pkt;
    843 	is_secure = (pkt->b_datap->db_type == M_CTL);
    844 	if (is_secure) {
    845 		ipsec_out_t *io;
    846 
    847 		pkt = pkt->b_cont;
    848 		io = (ipsec_out_t *)ipsec_mp->b_rptr;
    849 		if (io->ipsec_out_type == IPSEC_OUT)
    850 			zoneid = io->ipsec_out_zoneid;
    851 	}
    852 
    853 	/* If the packet originated externally then */
    854 	if (pkt->b_prev) {
    855 		ire_refrele(ire);
    856 		/*
    857 		 * Extract the ifindex from b_prev (set in ip_rput_noire).
    858 		 * Look up interface to see if it still exists (it could have
    859 		 * been unplumbed by the time the reply came back from ARP)
    860 		 */
    861 		ifindex = (uint_t)(uintptr_t)pkt->b_prev;
    862 		ill = ill_lookup_on_ifindex(ifindex, B_FALSE,
    863 		    NULL, NULL, NULL, NULL, ipst);
    864 		if (ill == NULL) {
    865 			pkt->b_prev = NULL;
    866 			pkt->b_next = NULL;
    867 			freemsg(ipsec_mp);
    868 			return;
    869 		}
    870 		q = ill->ill_rq;
    871 		pkt->b_prev = NULL;
    872 		/*
    873 		 * This packet has not gone through IPSEC processing
    874 		 * and hence we should not have any IPSEC message
    875 		 * prepended.
    876 		 */
    877 		ASSERT(ipsec_mp == pkt);
    878 		put(q, pkt);
    879 		ill_refrele(ill);
    880 	} else if (pkt->b_next) {
    881 		/* Packets from multicast router */
    882 		pkt->b_next = NULL;
    883 		/*
    884 		 * We never get the IPSEC_OUT while forwarding the
    885 		 * packet for multicast router.
    886 		 */
    887 		ASSERT(ipsec_mp == pkt);
    888 		ip_rput_forward(ire, (ipha_t *)pkt->b_rptr, ipsec_mp, NULL);
    889 		ire_refrele(ire);
    890 	} else {
    891 		/* Locally originated packets */
    892 		boolean_t is_inaddr_any;
    893 		ipha_t *ipha = (ipha_t *)pkt->b_rptr;
    894 
    895 		/*
    896 		 * We need to do an ire_delete below for which
    897 		 * we need to make sure that the IRE will be
    898 		 * around even after calling ip_wput_ire -
    899 		 * which does ire_refrele. Otherwise somebody
    900 		 * could potentially delete this ire and hence
    901 		 * free this ire and we will be calling ire_delete
    902 		 * on a freed ire below.
    903 		 */
    904 		is_inaddr_any = (ire->ire_src_addr == INADDR_ANY);
    905 		if (is_inaddr_any) {
    906 			IRE_REFHOLD(ire);
    907 		}
    908 		/*
    909 		 * If we were resolving a router we can not use the
    910 		 * routers IRE for sending the packet (since it would
    911 		 * violate the uniqness of the IP idents) thus we
    912 		 * make another pass through ip_wput to create the IRE_CACHE
    913 		 * for the destination.
    914 		 * When IRE_MARK_NOADD is set, ire_add() is not called.
    915 		 * Thus ip_wput() will never find a ire and result in an
    916 		 * infinite loop. Thus we check whether IRE_MARK_NOADD is
    917 		 * is set. This also implies that IRE_MARK_NOADD can only be
    918 		 * used to send packets to directly connected hosts.
    919 		 */
    920 		if (ipha->ipha_dst != ire->ire_addr &&
    921 		    !(ire->ire_marks & IRE_MARK_NOADD)) {
    922 			ire_refrele(ire);	/* Held in ire_add */
    923 			if (CONN_Q(q)) {
    924 				(void) ip_output(Q_TO_CONN(q), ipsec_mp, q,
    925 				    IRE_SEND);
    926 			} else {
    927 				(void) ip_output((void *)(uintptr_t)zoneid,
    928 				    ipsec_mp, q, IRE_SEND);
    929 			}
    930 		} else {
    931 			if (is_secure) {
    932 				ipsec_out_t *oi;
    933 				ipha_t *ipha;
    934 
    935 				oi = (ipsec_out_t *)ipsec_mp->b_rptr;
    936 				ipha = (ipha_t *)ipsec_mp->b_cont->b_rptr;
    937 				if (oi->ipsec_out_proc_begin) {
    938 					/*
    939 					 * This is the case where
    940 					 * ip_wput_ipsec_out could not find
    941 					 * the IRE and recreated a new one.
    942 					 * As ip_wput_ipsec_out does ire
    943 					 * lookups, ire_refrele for the extra
    944 					 * bump in ire_add.
    945 					 */
    946 					ire_refrele(ire);
    947 					ip_wput_ipsec_out(q, ipsec_mp, ipha,
    948 					    NULL, NULL);
    949 				} else {
    950 					/*
    951 					 * IRE_REFRELE will be done in
    952 					 * ip_wput_ire.
    953 					 */
    954 					ip_wput_ire(q, ipsec_mp, ire, NULL,
    955 					    IRE_SEND, zoneid);
    956 				}
    957 			} else {
    958 				/*
    959 				 * IRE_REFRELE will be done in ip_wput_ire.
    960 				 */
    961 				ip_wput_ire(q, ipsec_mp, ire, NULL,
    962 				    IRE_SEND, zoneid);
    963 			}
    964 		}
    965 		/*
    966 		 * Special code to support sending a single packet with
    967 		 * conn_unspec_src using an IRE which has no source address.
    968 		 * The IRE is deleted here after sending the packet to avoid
    969 		 * having other code trip on it. But before we delete the
    970 		 * ire, somebody could have looked up this ire.
    971 		 * We prevent returning/using this IRE by the upper layers
    972 		 * by making checks to NULL source address in other places
    973 		 * like e.g ip_ire_append, ip_ire_req and ip_bind_connected.
    974 		 * Though, this does not completely prevent other threads
    975 		 * from using this ire, this should not cause any problems.
    976 		 *
    977 		 * NOTE : We use is_inaddr_any instead of using ire_src_addr
    978 		 * because for the normal case i.e !is_inaddr_any, ire_refrele
    979 		 * above could have potentially freed the ire.
    980 		 */
    981 		if (is_inaddr_any) {
    982 			/*
    983 			 * If this IRE has been deleted by another thread, then
    984 			 * ire_bucket won't be NULL, but ire_ptpn will be NULL.
    985 			 * Thus, ire_delete will do nothing.  This check
    986 			 * guards against calling ire_delete when the IRE was
    987 			 * never inserted in the table, which is handled by
    988 			 * ire_delete as dropping another reference.
    989 			 */
    990 			if (ire->ire_bucket != NULL) {
    991 				ip1dbg(("ire_send: delete IRE\n"));
    992 				ire_delete(ire);
    993 			}
    994 			ire_refrele(ire);	/* Held above */
    995 		}
    996 	}
    997 }
    998 
    999 /*
   1000  * Send a packet using the specified IRE.
   1001  * If ire_src_addr_v6 is all zero then discard the IRE after
   1002  * the packet has been sent.
   1003  */
   1004 static void
   1005 ire_send_v6(queue_t *q, mblk_t *pkt, ire_t *ire)
   1006 {
   1007 	mblk_t *ipsec_mp;
   1008 	boolean_t secure;
   1009 	uint_t ifindex;
   1010 	zoneid_t zoneid = ire->ire_zoneid;
   1011 	ip_stack_t	*ipst = ire->ire_ipst;
   1012 
   1013 	ASSERT(ire->ire_ipversion == IPV6_VERSION);
   1014 	ASSERT(!(ire->ire_type & IRE_LOCAL)); /* Has different ire_zoneid */
   1015 	if (pkt->b_datap->db_type == M_CTL) {
   1016 		ipsec_out_t *io;
   1017 
   1018 		ipsec_mp = pkt;
   1019 		pkt = pkt->b_cont;
   1020 		secure = B_TRUE;
   1021 		io = (ipsec_out_t *)ipsec_mp->b_rptr;
   1022 		if (io->ipsec_out_type == IPSEC_OUT)
   1023 			zoneid = io->ipsec_out_zoneid;
   1024 	} else {
   1025 		ipsec_mp = pkt;
   1026 		secure = B_FALSE;
   1027 	}
   1028 
   1029 	/* If the packet originated externally then */
   1030 	if (pkt->b_prev) {
   1031 		ill_t	*ill;
   1032 		/*
   1033 		 * Extract the ifindex from b_prev (set in ip_rput_data_v6).
   1034 		 * Look up interface to see if it still exists (it could have
   1035 		 * been unplumbed by the time the reply came back from the
   1036 		 * resolver).
   1037 		 */
   1038 		ifindex = (uint_t)(uintptr_t)pkt->b_prev;
   1039 		ill = ill_lookup_on_ifindex(ifindex, B_TRUE,
   1040 		    NULL, NULL, NULL, NULL, ipst);
   1041 		if (ill == NULL) {
   1042 			pkt->b_prev = NULL;
   1043 			pkt->b_next = NULL;
   1044 			freemsg(ipsec_mp);
   1045 			ire_refrele(ire);	/* Held in ire_add */
   1046 			return;
   1047 		}
   1048 		q = ill->ill_rq;
   1049 		pkt->b_prev = NULL;
   1050 		/*
   1051 		 * This packet has not gone through IPSEC processing
   1052 		 * and hence we should not have any IPSEC message
   1053 		 * prepended.
   1054 		 */
   1055 		ASSERT(ipsec_mp == pkt);
   1056 		put(q, pkt);
   1057 		ill_refrele(ill);
   1058 	} else if (pkt->b_next) {
   1059 		/* Packets from multicast router */
   1060 		pkt->b_next = NULL;
   1061 		/*
   1062 		 * We never get the IPSEC_OUT while forwarding the
   1063 		 * packet for multicast router.
   1064 		 */
   1065 		ASSERT(ipsec_mp == pkt);
   1066 		/*
   1067 		 * XXX TODO IPv6.
   1068 		 */
   1069 		freemsg(pkt);
   1070 #ifdef XXX
   1071 		ip_rput_forward(ire, (ipha_t *)pkt->b_rptr, pkt, NULL);
   1072 #endif
   1073 	} else {
   1074 		if (secure) {
   1075 			ipsec_out_t *oi;
   1076 			ip6_t *ip6h;
   1077 
   1078 			oi = (ipsec_out_t *)ipsec_mp->b_rptr;
   1079 			ip6h = (ip6_t *)ipsec_mp->b_cont->b_rptr;
   1080 			if (oi->ipsec_out_proc_begin) {
   1081 				/*
   1082 				 * This is the case where
   1083 				 * ip_wput_ipsec_out could not find
   1084 				 * the IRE and recreated a new one.
   1085 				 */
   1086 				ip_wput_ipsec_out_v6(q, ipsec_mp, ip6h,
   1087 				    NULL, NULL);
   1088 			} else {
   1089 				if (CONN_Q(q)) {
   1090 					(void) ip_output_v6(Q_TO_CONN(q),
   1091 					    ipsec_mp, q, IRE_SEND);
   1092 				} else {
   1093 					(void) ip_output_v6(
   1094 					    (void *)(uintptr_t)zoneid,
   1095 					    ipsec_mp, q, IRE_SEND);
   1096 				}
   1097 			}
   1098 		} else {
   1099 			/*
   1100 			 * Send packets through ip_output_v6 so that any
   1101 			 * ip6_info header can be processed again.
   1102 			 */
   1103 			if (CONN_Q(q)) {
   1104 				(void) ip_output_v6(Q_TO_CONN(q), ipsec_mp, q,
   1105 				    IRE_SEND);
   1106 			} else {
   1107 				(void) ip_output_v6((void *)(uintptr_t)zoneid,
   1108 				    ipsec_mp, q, IRE_SEND);
   1109 			}
   1110 		}
   1111 		/*
   1112 		 * Special code to support sending a single packet with
   1113 		 * conn_unspec_src using an IRE which has no source address.
   1114 		 * The IRE is deleted here after sending the packet to avoid
   1115 		 * having other code trip on it. But before we delete the
   1116 		 * ire, somebody could have looked up this ire.
   1117 		 * We prevent returning/using this IRE by the upper layers
   1118 		 * by making checks to NULL source address in other places
   1119 		 * like e.g ip_ire_append_v6, ip_ire_req and
   1120 		 * ip_bind_connected_v6. Though, this does not completely
   1121 		 * prevent other threads from using this ire, this should
   1122 		 * not cause any problems.
   1123 		 */
   1124 		if (IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6)) {
   1125 			ip1dbg(("ire_send_v6: delete IRE\n"));
   1126 			ire_delete(ire);
   1127 		}
   1128 	}
   1129 	ire_refrele(ire);	/* Held in ire_add */
   1130 }
   1131 
   1132 /*
   1133  * Make sure that IRE bucket does not get too long.
   1134  * This can cause lock up because ire_cache_lookup()
   1135  * may take "forever" to finish.
   1136  *
   1137  * We only remove a maximum of cnt IREs each time.  This
   1138  * should keep the bucket length approximately constant,
   1139  * depending on cnt.  This should be enough to defend
   1140  * against DoS attack based on creating temporary IREs
   1141  * (for forwarding and non-TCP traffic).
   1142  *
   1143  * We also pass in the address of the newly created IRE
   1144  * as we do not want to remove this straight after adding
   1145  * it. New IREs are normally added at the tail of the
   1146  * bucket.  This means that we are removing the "oldest"
   1147  * temporary IREs added.  Only if there are IREs with
   1148  * the same ire_addr, do we not add it at the tail.  Refer
   1149  * to ire_add_v*().  It should be OK for our purpose.
   1150  *
   1151  * For non-temporary cached IREs, we make sure that they
   1152  * have not been used for some time (defined below), they
   1153  * are non-local destinations, and there is no one using
   1154  * them at the moment (refcnt == 1).
   1155  *
   1156  * The above means that the IRE bucket length may become
   1157  * very long, consisting of mostly non-temporary IREs.
   1158  * This can happen when the hash function does a bad job
   1159  * so that most TCP connections cluster to a specific bucket.
   1160  * This "hopefully" should never happen.  It can also
   1161  * happen if most TCP connections have very long lives.
   1162  * Even with the minimal hash table size of 256, there
   1163  * has to be a lot of such connections to make the bucket
   1164  * length unreasonably long.  This should probably not
   1165  * happen either.  The third case when this can happen is
   1166  * when the machine is under attack, such as SYN flooding.
   1167  * TCP should already have the proper mechanism to protect
   1168  * that.  So we should be safe.
   1169  *
   1170  * This function is