1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 /* Copyright (c) 1990 Mentat Inc. */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 30 /* 31 * This file contains routines that manipulate Internet Routing Entries (IREs). 
32 */ 33 34 #include <sys/types.h> 35 #include <sys/stream.h> 36 #include <sys/stropts.h> 37 #include <sys/ddi.h> 38 #include <sys/cmn_err.h> 39 #include <sys/policy.h> 40 41 #include <sys/systm.h> 42 #include <sys/kmem.h> 43 #include <sys/param.h> 44 #include <sys/socket.h> 45 #include <net/if.h> 46 #include <net/route.h> 47 #include <netinet/in.h> 48 #include <net/if_dl.h> 49 #include <netinet/ip6.h> 50 #include <netinet/icmp6.h> 51 52 #include <inet/common.h> 53 #include <inet/mi.h> 54 #include <inet/ip.h> 55 #include <inet/ip6.h> 56 #include <inet/ip_ndp.h> 57 #include <inet/arp.h> 58 #include <inet/ip_if.h> 59 #include <inet/ip_ire.h> 60 #include <inet/ip_ftable.h> 61 #include <inet/ip_rts.h> 62 #include <inet/nd.h> 63 64 #include <net/pfkeyv2.h> 65 #include <inet/ipsec_info.h> 66 #include <inet/sadb.h> 67 #include <sys/kmem.h> 68 #include <inet/tcp.h> 69 #include <inet/ipclassifier.h> 70 #include <sys/zone.h> 71 #include <sys/cpuvar.h> 72 73 #include <sys/tsol/label.h> 74 #include <sys/tsol/tnet.h> 75 76 struct kmem_cache *rt_entry_cache; 77 78 /* 79 * Synchronization notes: 80 * 81 * The fields of the ire_t struct are protected in the following way : 82 * 83 * ire_next/ire_ptpn 84 * 85 * - bucket lock of the respective tables (cache or forwarding tables). 86 * 87 * ire_mp, ire_rfq, ire_stq, ire_u *except* ire_gateway_addr[v6], ire_mask, 88 * ire_type, ire_create_time, ire_masklen, ire_ipversion, ire_flags, ire_ipif, 89 * ire_ihandle, ire_phandle, ire_nce, ire_bucket, ire_in_ill, ire_in_src_addr 90 * 91 * - Set in ire_create_v4/v6 and never changes after that. Thus, 92 * we don't need a lock whenever these fields are accessed. 93 * 94 * - ire_bucket and ire_masklen (also set in ire_create) is set in 95 * ire_add_v4/ire_add_v6 before inserting in the bucket and never 96 * changes after that. Thus we don't need a lock whenever these 97 * fields are accessed. 
98 * 99 * ire_gateway_addr_v4[v6] 100 * 101 * - ire_gateway_addr_v4[v6] is set during ire_create and later modified 102 * by rts_setgwr[v6]. As ire_gateway_addr is a uint32_t, updates to 103 * it assumed to be atomic and hence the other parts of the code 104 * does not use any locks. ire_gateway_addr_v6 updates are not atomic 105 * and hence any access to it uses ire_lock to get/set the right value. 106 * 107 * ire_ident, ire_refcnt 108 * 109 * - Updated atomically using atomic_add_32 110 * 111 * ire_ssthresh, ire_rtt_sd, ire_rtt, ire_ib_pkt_count, ire_ob_pkt_count 112 * 113 * - Assumes that 32 bit writes are atomic. No locks. ire_lock is 114 * used to serialize updates to ire_ssthresh, ire_rtt_sd, ire_rtt. 115 * 116 * ire_max_frag, ire_frag_flag 117 * 118 * - ire_lock is used to set/read both of them together. 119 * 120 * ire_tire_mark 121 * 122 * - Set in ire_create and updated in ire_expire, which is called 123 * by only one function namely ip_trash_timer_expire. Thus only 124 * one function updates and examines the value. 125 * 126 * ire_marks 127 * - bucket lock protects this. 128 * 129 * ire_ipsec_overhead/ire_ll_hdr_length 130 * 131 * - Place holder for returning the information to the upper layers 132 * when IRE_DB_REQ comes down. 133 * 134 * 135 * ipv6_ire_default_count is protected by the bucket lock of 136 * ip_forwarding_table_v6[0][0]. 137 * 138 * ipv6_ire_default_index is not protected as it is just a hint 139 * at which default gateway to use. There is nothing 140 * wrong in using the same gateway for two different connections. 141 * 142 * As we always hold the bucket locks in all the places while accessing 143 * the above values, it is natural to use them for protecting them. 144 * 145 * We have a separate cache table and forwarding table for IPv4 and IPv6. 146 * Cache table (ip_cache_table/ip_cache_table_v6) is a pointer to an 147 * array of irb_t structures. 
 * The IPv6 forwarding table
 * (ip_forwarding_table_v6) is an array of pointers to arrays of irb_t
 * structures.  ip_forwarding_table_v6 is allocated dynamically in
 * ire_add_v6.  ire_ft_init_lock is used to serialize multiple threads
 * initializing the same bucket.  Once a bucket is initialized, it is never
 * de-allocated.  This assumption enables us to access
 * ip_forwarding_table_v6[i] without any locks.
 *
 * The forwarding table for IPv4 is a radix tree whose leaves
 * are rt_entry structures containing the irb_t for the rt_dst.  The irb_t
 * for IPv4 is dynamically allocated and freed.
 *
 * Each irb_t - ire bucket structure has a lock to protect
 * a bucket and the ires residing in the bucket have a back pointer to
 * the bucket structure.  It also has a reference count for the number
 * of threads walking the bucket - irb_refcnt which is bumped up
 * using the IRB_REFHOLD macro.  The flags irb_flags can be
 * set to IRE_MARK_CONDEMNED indicating that there are some ires
 * in this bucket that are marked with IRE_MARK_CONDEMNED and the
 * last thread to leave the bucket should delete the ires.  Usually
 * this is done by the IRB_REFRELE macro which is used to decrement
 * the reference count on a bucket.  See comments above irb_t structure
 * definition in ip.h for further details.
 *
 * The IRE_REFHOLD/IRE_REFRELE macros operate on the ire and increment/
 * decrement the reference count, ire_refcnt, atomically on the ire.
 * ire_refcnt is modified only using these macros.  Operations on the IRE
 * could be described as follows :
 *
 * CREATE an ire with reference count initialized to 1.
 *
 * ADDITION of an ire holds the bucket lock, checks for duplicates
 * and then adds the ire.  ire_add_v4/ire_add_v6 returns the ire after
 * bumping up once more i.e the reference count is 2.
 * This is to avoid
 * an extra lookup in the functions calling ire_add which want to
 * work with the ire after adding.
 *
 * LOOKUP of an ire bumps up the reference count using the IRE_REFHOLD
 * macro.  It is valid to bump up the reference count of the IRE,
 * after the lookup has returned an ire.  Following are the lookup
 * functions that return a HELD ire :
 *
 * ire_lookup_local[_v6], ire_ctable_lookup[_v6], ire_ftable_lookup[_v6],
 * ire_cache_lookup[_v6], ire_lookup_multi[_v6], ire_route_lookup[_v6],
 * ipif_to_ire[_v6].
 *
 * DELETION of an ire holds the bucket lock, removes it from the list
 * and then decrements the reference count for having removed from the list
 * by using the IRE_REFRELE macro.  If some other thread has looked up
 * the ire, the reference count would have been bumped up and hence
 * this ire will not be freed once deleted.  It will be freed once the
 * reference count drops to zero.
 *
 * Add and Delete acquire the bucket lock as RW_WRITER, while all the
 * lookups acquire the bucket lock as RW_READER.
 *
 * NOTE : The only functions that do the IRE_REFRELE when an ire is
 * passed as an argument are :
 *
 *	1) ip_wput_ire : This is because it IRE_REFHOLD/RELEs the
 *			 broadcast ires it looks up internally within
 *			 the function.  Currently, for simplicity it does
 *			 not differentiate the one that is passed in and
 *			 the ones it looks up internally.  It always
 *			 IRE_REFRELEs.
 *	2) ire_send
 *	   ire_send_v6 : As ire_send calls ip_wput_ire and other functions
 *			 that take ire as an argument, it has to selectively
 *			 IRE_REFRELE the ire.  To maintain symmetry,
 *			 ire_send_v6 does the same.
 *
 * Otherwise, the general rule is to do the IRE_REFRELE in the function
 * that is passing the ire as an argument.
 *
 * In trying to locate ires the following points are to be noted.
 *
 * IRE_MARK_CONDEMNED signifies that the ire has been logically deleted and is
 * to be ignored when walking the ires using ire_next.
 *
 * IRE_MARK_HIDDEN signifies that the ire is a special ire typically for the
 * benefit of in.mpathd which needs to probe interfaces for failures.  Normal
 * applications should not be seeing this ire and hence this ire is ignored
 * in most cases in the search using ire_next.
 *
 * Zones note:
 *	Walking IREs within a given zone also walks certain ires in other
 *	zones.  This is done intentionally.  IRE walks with a specified
 *	zoneid are used only when doing informational reports, and
 *	zone users want to see things that they can access. See block
 *	comment in ire_walk_ill_match().
 */

/*
 * The minimum size of the IRE cache table.  It will be recalculated in
 * ip_ire_init().
 * Settable in /etc/system
 */
uint32_t ip_cache_table_size = IP_CACHE_TABLE_SIZE;
uint32_t ip6_cache_table_size = IP6_CACHE_TABLE_SIZE;

/*
 * The size of the forwarding table.  We will make sure that it is a
 * power of 2 in ip_ire_init().
 * Settable in /etc/system
 */
uint32_t ip6_ftable_hash_size = IP6_FTABLE_HASH_SIZE;

/*
 * NOTE(review): presumably the kmem cache that IREs are allocated from
 * (see ip_ire_constructor/ip_ire_destructor); its creation is not visible
 * in this part of the file - confirm.
 */
struct	kmem_cache	*ire_cache;
/*
 * File-scope ire_t with static storage, hence zero-initialized.
 * NOTE(review): its users are not visible in this part of the file.
 */
static ire_t	ire_null;

/*
 * The threshold number of IRE in a bucket when the IREs are
 * cleaned up.  This threshold is calculated later in ip_open()
 * based on the speed of CPU and available memory.  This default
 * value is the maximum.
 *
 * We have two kinds of cached IRE, temporary and
 * non-temporary.  Temporary IREs are marked with
 * IRE_MARK_TEMPORARY.  They are IREs created for non
 * TCP traffic and for forwarding purposes.  All others
 * are non-temporary IREs.  We don't mark IRE created for
 * TCP as temporary because TCP is stateful and there are
 * info stored in the IRE which can be shared by other TCP
 * connections to the same destination.
For connected 271 * endpoint, we also don't want to mark the IRE used as 272 * temporary because the same IRE will be used frequently, 273 * otherwise, the app should not do a connect(). We change 274 * the marking at ip_bind_connected_*() if necessary. 275 * 276 * We want to keep the cache IRE hash bucket length reasonably 277 * short, otherwise IRE lookup functions will take "forever." 278 * We use the "crude" function that the IRE bucket 279 * length should be based on the CPU speed, which is 1 entry 280 * per x MHz, depending on the shift factor ip_ire_cpu_ratio 281 * (n). This means that with a 750MHz CPU, the max bucket 282 * length can be (750 >> n) entries. 283 * 284 * Note that this threshold is separate for temp and non-temp 285 * IREs. This means that the actual bucket length can be 286 * twice as that. And while we try to keep temporary IRE 287 * length at most at the threshold value, we do not attempt to 288 * make the length for non-temporary IREs fixed, for the 289 * reason stated above. Instead, we start trying to find 290 * "unused" non-temporary IREs when the bucket length reaches 291 * this threshold and clean them up. 292 * 293 * We also want to limit the amount of memory used by 294 * IREs. So if we are allowed to use ~3% of memory (M) 295 * for those IREs, each bucket should not have more than 296 * 297 * M / num of cache bucket / sizeof (ire_t) 298 * 299 * Again the above memory uses are separate for temp and 300 * non-temp cached IREs. 301 * 302 * We may also want the limit to be a function of the number 303 * of interfaces and number of CPUs. Doing the initialization 304 * in ip_open() means that every time an interface is plumbed, 305 * the max is re-calculated. Right now, we don't do anything 306 * different. In future, when we have more experience, we 307 * may want to change this behavior. 
 */
uint32_t ip_ire_max_bucket_cnt = 10;	/* Settable in /etc/system */
uint32_t ip6_ire_max_bucket_cnt = 10;
uint32_t ip_ire_cleanup_cnt = 2;

/*
 * The minimum of the temporary IRE bucket count.  We do not want
 * the length of each bucket to be too short.  This may hurt
 * performance of some apps as the temporary IREs are removed too
 * often.
 */
uint32_t ip_ire_min_bucket_cnt = 3;	/* /etc/system - not used */
uint32_t ip6_ire_min_bucket_cnt = 3;

/*
 * The ratio of memory consumed by IRE used for temporary to available
 * memory.  This is a shift factor, so 6 means the ratio 1 to 64.  This
 * value can be changed in /etc/system.  6 is a reasonable number.
 */
uint32_t ip_ire_mem_ratio = 6;	/* /etc/system */
/* The shift factor for CPU speed to calculate the max IRE bucket length. */
uint32_t ip_ire_cpu_ratio = 7;	/* /etc/system */

/*
 * An IPv4 address paired with a "found" flag.
 * NOTE(review): presumably the argument block handed to
 * ip_nce_clookup_and_delete(); that function's body is not visible
 * here - confirm.
 */
typedef struct nce_clookup_s {
	ipaddr_t ncecl_addr;
	boolean_t ncecl_found;
} nce_clookup_t;

/*
 * The maximum number of buckets in IRE cache table.  In future, we may
 * want to make it a dynamic hash table.  For the moment, we fix the
 * size and allocate the table in ip_ire_init() when IP is first loaded.
 * We take into account the amount of memory a system has.
 */
#define	IP_MAX_CACHE_TABLE_SIZE	4096

/* Settable in /etc/system */
static uint32_t	ip_max_cache_table_size = IP_MAX_CACHE_TABLE_SIZE;
static uint32_t	ip6_max_cache_table_size = IP_MAX_CACHE_TABLE_SIZE;

#define	NUM_ILLS	2	/* To build the ILL list to unlock */

/* Zero iulp_t for initialization.
*/ 351 const iulp_t ire_uinfo_null = { 0 }; 352 353 static int ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, 354 ipsq_func_t func, boolean_t); 355 static void ire_delete_v4(ire_t *ire); 356 static void ire_walk_ipvers(pfv_t func, void *arg, uchar_t vers, 357 zoneid_t zoneid, ip_stack_t *); 358 static void ire_walk_ill_ipvers(uint_t match_flags, uint_t ire_type, 359 pfv_t func, void *arg, uchar_t vers, ill_t *ill); 360 static void ire_cache_cleanup(irb_t *irb, uint32_t threshold, 361 ire_t *ref_ire); 362 static void ip_nce_clookup_and_delete(nce_t *nce, void *arg); 363 #ifdef DEBUG 364 static void ire_trace_cleanup(const ire_t *); 365 #endif 366 367 /* 368 * To avoid bloating the code, we call this function instead of 369 * using the macro IRE_REFRELE. Use macro only in performance 370 * critical paths. 371 * 372 * Must not be called while holding any locks. Otherwise if this is 373 * the last reference to be released there is a chance of recursive mutex 374 * panic due to ire_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying 375 * to restart an ioctl. The one exception is when the caller is sure that 376 * this is not the last reference to be released. Eg. if the caller is 377 * sure that the ire has not been deleted and won't be deleted. 378 */ 379 void 380 ire_refrele(ire_t *ire) 381 { 382 IRE_REFRELE(ire); 383 } 384 385 void 386 ire_refrele_notr(ire_t *ire) 387 { 388 IRE_REFRELE_NOTR(ire); 389 } 390 391 /* 392 * kmem_cache_alloc constructor for IRE in kma space. 393 * Note that when ire_mp is set the IRE is stored in that mblk and 394 * not in this cache. 
395 */ 396 /* ARGSUSED */ 397 static int 398 ip_ire_constructor(void *buf, void *cdrarg, int kmflags) 399 { 400 ire_t *ire = buf; 401 402 ire->ire_nce = NULL; 403 404 return (0); 405 } 406 407 /* ARGSUSED1 */ 408 static void 409 ip_ire_destructor(void *buf, void *cdrarg) 410 { 411 ire_t *ire = buf; 412 413 ASSERT(ire->ire_nce == NULL); 414 } 415 416 /* 417 * This function is associated with the IP_IOC_IRE_ADVISE_NO_REPLY 418 * IOCTL. It is used by TCP (or other ULPs) to supply revised information 419 * for an existing CACHED IRE. 420 */ 421 /* ARGSUSED */ 422 int 423 ip_ire_advise(queue_t *q, mblk_t *mp, cred_t *ioc_cr) 424 { 425 uchar_t *addr_ucp; 426 ipic_t *ipic; 427 ire_t *ire; 428 ipaddr_t addr; 429 in6_addr_t v6addr; 430 irb_t *irb; 431 zoneid_t zoneid; 432 ip_stack_t *ipst = CONNQ_TO_IPST(q); 433 434 ASSERT(q->q_next == NULL); 435 zoneid = Q_TO_CONN(q)->conn_zoneid; 436 437 /* 438 * Check privilege using the ioctl credential; if it is NULL 439 * then this is a kernel message and therefor privileged. 440 */ 441 if (ioc_cr != NULL && secpolicy_ip_config(ioc_cr, B_FALSE) != 0) 442 return (EPERM); 443 444 ipic = (ipic_t *)mp->b_rptr; 445 if (!(addr_ucp = mi_offset_param(mp, ipic->ipic_addr_offset, 446 ipic->ipic_addr_length))) { 447 return (EINVAL); 448 } 449 if (!OK_32PTR(addr_ucp)) 450 return (EINVAL); 451 switch (ipic->ipic_addr_length) { 452 case IP_ADDR_LEN: { 453 /* Extract the destination address. */ 454 addr = *(ipaddr_t *)addr_ucp; 455 /* Find the corresponding IRE. */ 456 ire = ire_cache_lookup(addr, zoneid, NULL, ipst); 457 break; 458 } 459 case IPV6_ADDR_LEN: { 460 /* Extract the destination address. */ 461 v6addr = *(in6_addr_t *)addr_ucp; 462 /* Find the corresponding IRE. */ 463 ire = ire_cache_lookup_v6(&v6addr, zoneid, NULL, ipst); 464 break; 465 } 466 default: 467 return (EINVAL); 468 } 469 470 if (ire == NULL) 471 return (ENOENT); 472 /* 473 * Update the round trip time estimate and/or the max frag size 474 * and/or the slow start threshold. 
475 * 476 * We serialize multiple advises using ire_lock. 477 */ 478 mutex_enter(&ire->ire_lock); 479 if (ipic->ipic_rtt) { 480 /* 481 * If there is no old cached values, initialize them 482 * conservatively. Set them to be (1.5 * new value). 483 */ 484 if (ire->ire_uinfo.iulp_rtt != 0) { 485 ire->ire_uinfo.iulp_rtt = (ire->ire_uinfo.iulp_rtt + 486 ipic->ipic_rtt) >> 1; 487 } else { 488 ire->ire_uinfo.iulp_rtt = ipic->ipic_rtt + 489 (ipic->ipic_rtt >> 1); 490 } 491 if (ire->ire_uinfo.iulp_rtt_sd != 0) { 492 ire->ire_uinfo.iulp_rtt_sd = 493 (ire->ire_uinfo.iulp_rtt_sd + 494 ipic->ipic_rtt_sd) >> 1; 495 } else { 496 ire->ire_uinfo.iulp_rtt_sd = ipic->ipic_rtt_sd + 497 (ipic->ipic_rtt_sd >> 1); 498 } 499 } 500 if (ipic->ipic_max_frag) 501 ire->ire_max_frag = MIN(ipic->ipic_max_frag, IP_MAXPACKET); 502 if (ipic->ipic_ssthresh != 0) { 503 if (ire->ire_uinfo.iulp_ssthresh != 0) 504 ire->ire_uinfo.iulp_ssthresh = 505 (ipic->ipic_ssthresh + 506 ire->ire_uinfo.iulp_ssthresh) >> 1; 507 else 508 ire->ire_uinfo.iulp_ssthresh = ipic->ipic_ssthresh; 509 } 510 /* 511 * Don't need the ire_lock below this. ire_type does not change 512 * after initialization. ire_marks is protected by irb_lock. 513 */ 514 mutex_exit(&ire->ire_lock); 515 516 if (ipic->ipic_ire_marks != 0 && ire->ire_type == IRE_CACHE) { 517 /* 518 * Only increment the temporary IRE count if the original 519 * IRE is not already marked temporary. 520 */ 521 irb = ire->ire_bucket; 522 rw_enter(&irb->irb_lock, RW_WRITER); 523 if ((ipic->ipic_ire_marks & IRE_MARK_TEMPORARY) && 524 !(ire->ire_marks & IRE_MARK_TEMPORARY)) { 525 irb->irb_tmp_ire_cnt++; 526 } 527 ire->ire_marks |= ipic->ipic_ire_marks; 528 rw_exit(&irb->irb_lock); 529 } 530 531 ire_refrele(ire); 532 return (0); 533 } 534 535 /* 536 * This function is associated with the IP_IOC_IRE_DELETE[_NO_REPLY] 537 * IOCTL[s]. The NO_REPLY form is used by TCP to delete a route IRE 538 * for a host that is not responding. 
 * This will force an attempt to
 * establish a new route, if available, and flush out the ARP entry so
 * it will re-resolve.  Management processes may want to use the
 * version that generates a reply.
 *
 * This function does not support IPv6 since Neighbor Unreachability Detection
 * means that negative advice like this is useless.
 *
 * Returns EPERM if the ioctl credential lacks the privilege, EINVAL if the
 * request is malformed, names a non-IRE_CACHE type, or targets an IRE that
 * is younger than ips_ip_ignore_delete_time, and 0 otherwise (including
 * when no matching IRE exists).
 */
/* ARGSUSED */
int
ip_ire_delete(queue_t *q, mblk_t *mp, cred_t *ioc_cr)
{
	uchar_t		*addr_ucp;
	ipaddr_t	addr;
	ire_t		*ire;
	ipid_t		*ipid;
	boolean_t	routing_sock_info = B_FALSE;	/* Sent info? */
	zoneid_t	zoneid;
	ire_t		*gire = NULL;
	ill_t		*ill;
	mblk_t		*arp_mp;
	ip_stack_t	*ipst;

	ASSERT(q->q_next == NULL);
	zoneid = Q_TO_CONN(q)->conn_zoneid;
	ipst = CONNQ_TO_IPST(q);

	/*
	 * Check privilege using the ioctl credential; if it is NULL
	 * then this is a kernel message and therefore privileged.
	 */
	if (ioc_cr != NULL && secpolicy_ip_config(ioc_cr, B_FALSE) != 0)
		return (EPERM);

	ipid = (ipid_t *)mp->b_rptr;

	/* Only actions on IRE_CACHEs are acceptable at present. */
	if (ipid->ipid_ire_type != IRE_CACHE)
		return (EINVAL);

	addr_ucp = mi_offset_param(mp, ipid->ipid_addr_offset,
	    ipid->ipid_addr_length);
	if (addr_ucp == NULL || !OK_32PTR(addr_ucp))
		return (EINVAL);
	/* The address is either a bare IPv4 address or a whole sin_t. */
	switch (ipid->ipid_addr_length) {
	case IP_ADDR_LEN:
		/* addr_ucp points at IP addr */
		break;
	case sizeof (sin_t): {
		sin_t	*sin;
		/*
		 * got complete (sockaddr) address - increment addr_ucp to point
		 * at the ip_addr field.
		 */
		sin = (sin_t *)addr_ucp;
		addr_ucp = (uchar_t *)&sin->sin_addr.s_addr;
		break;
	}
	default:
		return (EINVAL);
	}
	/* Extract the destination address. */
	bcopy(addr_ucp, &addr, IP_ADDR_LEN);

	/* Try to find the CACHED IRE. */
	ire = ire_cache_lookup(addr, zoneid, NULL, ipst);

	/* Nail it. */
	if (ire) {
		/* Allow delete only on CACHE entries */
		if (ire->ire_type != IRE_CACHE) {
			ire_refrele(ire);
			return (EINVAL);
		}

		/*
		 * Verify that the IRE has been around for a while.
		 * This is to protect against transport protocols
		 * that are too eager in sending delete messages.
		 */
		if (gethrestime_sec() <
		    ire->ire_create_time + ipst->ips_ip_ignore_delete_time) {
			ire_refrele(ire);
			return (EINVAL);
		}
		/*
		 * Now we have a potentially dead cache entry. We need
		 * to remove it.
		 * If this cache entry is generated from a
		 * default route (i.e., ire_cmask == 0),
		 * search the default list and mark it dead and some
		 * background process will try to activate it.
		 */
		if ((ire->ire_gateway_addr != 0) && (ire->ire_cmask == 0)) {
			/*
			 * Make sure that we pick a different
			 * IRE_DEFAULT next time.
			 */
			ire_t *gw_ire;
			irb_t *irb = NULL;
			uint_t match_flags;

			match_flags = (MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE);

			gire = ire_ftable_lookup(ire->ire_addr,
			    ire->ire_cmask, 0, 0,
			    ire->ire_ipif, NULL, zoneid, 0, NULL, match_flags,
			    ipst);

			ip3dbg(("ire_ftable_lookup() returned gire %p\n",
			    (void *)gire));

			if (gire != NULL) {
				irb = gire->ire_bucket;

				/*
				 * We grab it as writer just to serialize
				 * multiple threads trying to bump up
				 * irb_rr_origin
				 */
				rw_enter(&irb->irb_lock, RW_WRITER);
				if ((gw_ire = irb->irb_rr_origin) == NULL) {
					/* Nothing to advance; drop the lock */
					rw_exit(&irb->irb_lock);
					goto done;
				}

				DTRACE_PROBE1(ip__ire__del__origin,
				    (ire_t *), gw_ire);

				/* Skip past the potentially bad gateway */
				if (ire->ire_gateway_addr ==
				    gw_ire->ire_gateway_addr) {
					ire_t *next = gw_ire->ire_next;

					DTRACE_PROBE2(ip__ire__del,
					    (ire_t *), gw_ire, (irb_t *), irb);
					IRE_FIND_NEXT_ORIGIN(next);
					irb->irb_rr_origin = next;
				}
				rw_exit(&irb->irb_lock);
			}
		}
done:
		/* Drop the hold on the default route found above, if any. */
		if (gire != NULL)
			IRE_REFRELE(gire);
		/* report the bad route to routing sockets */
		ip_rts_change(RTM_LOSING, ire->ire_addr, ire->ire_gateway_addr,
		    ire->ire_mask, ire->ire_src_addr, 0, 0, 0,
		    (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_IFA), ipst);
		routing_sock_info = B_TRUE;

		/*
		 * TCP is really telling us to start over completely, and it
		 * expects that we'll resend the ARP query.  Tell ARP to
		 * discard the entry, if this is a local destination.
		 */
		ill = ire->ire_stq->q_ptr;
		if (ire->ire_gateway_addr == 0 &&
		    (arp_mp = ill_ared_alloc(ill, addr)) != NULL) {
			putnext(ill->ill_rq, arp_mp);
		}

		ire_delete(ire);
		ire_refrele(ire);	/* Hold taken by ire_cache_lookup */
	}
	/*
	 * Also look for an IRE_HOST type redirect ire and
	 * remove it if present.
	 */
	ire = ire_route_lookup(addr, 0, 0, IRE_HOST, NULL, NULL,
	    ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);

	/* Nail it. */
	if (ire != NULL) {
		/* Only RTF_DYNAMIC host routes are deleted. */
		if (ire->ire_flags & RTF_DYNAMIC) {
			/* Report RTM_LOSING only if not already sent above. */
			if (!routing_sock_info) {
				ip_rts_change(RTM_LOSING, ire->ire_addr,
				    ire->ire_gateway_addr, ire->ire_mask,
				    ire->ire_src_addr, 0, 0, 0,
				    (RTA_DST | RTA_GATEWAY |
				    RTA_NETMASK | RTA_IFA),
				    ipst);
			}
			ire_delete(ire);
		}
		ire_refrele(ire);
	}
	return (0);
}


/*
 * ip_ire_req is called by ip_wput when an IRE_DB_REQ_TYPE message is handed
 * down from the Upper Level Protocol to request a copy of the IRE (to check
 * its type or to extract information like round-trip time estimates or the
 * MTU.)
 * The address is assumed to be in the ire_addr field. If no IRE is found
 * an IRE is returned with ire_type being zero.
 * Note that the upper level protocol has to check for broadcast
 * (IRE_BROADCAST) and multicast (CLASSD(addr)).
 * If there is a b_cont the resulting IRE_DB_TYPE mblk is placed at the
 * end of the returned message.
 *
 * TCP sends down a message of this type with a connection request packet
 * chained on. UDP and ICMP send it down to verify that a route exists for
 * the destination address when they get connected.
 *
 * A message that is shorter than an ire_t or not 32-bit aligned is freed
 * without a reply.
 */
void
ip_ire_req(queue_t *q, mblk_t *mp)
{
	ire_t	*inire;
	ire_t	*ire;
	mblk_t	*mp1;
	ire_t	*sire = NULL;
	zoneid_t zoneid = Q_TO_CONN(q)->conn_zoneid;
	ip_stack_t	*ipst = CONNQ_TO_IPST(q);

	ASSERT(q->q_next == NULL);

	if ((mp->b_wptr - mp->b_rptr) < sizeof (ire_t) ||
	    !OK_32PTR(mp->b_rptr)) {
		freemsg(mp);
		return;
	}
	inire = (ire_t *)mp->b_rptr;
	/*
	 * Got it, now take our best shot at an IRE.
	 */
	if (inire->ire_ipversion == IPV6_VERSION) {
		ire = ire_route_lookup_v6(&inire->ire_addr_v6, 0, 0, 0,
		    NULL, &sire, zoneid, NULL,
		    (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT), ipst);
	} else {
		ASSERT(inire->ire_ipversion == IPV4_VERSION);
		ire = ire_route_lookup(inire->ire_addr, 0, 0, 0,
		    NULL, &sire, zoneid, NULL,
		    (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT), ipst);
	}

	/*
	 * We prevent returning IRES with source address INADDR_ANY
	 * as these were temporarily created for sending packets
	 * from endpoints that have conn_unspec_src set.
	 */
	if (ire == NULL ||
	    (ire->ire_ipversion == IPV4_VERSION &&
	    ire->ire_src_addr == INADDR_ANY) ||
	    (ire->ire_ipversion == IPV6_VERSION &&
	    IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6))) {
		/* ire_type of zero tells the requester nothing was found. */
		inire->ire_type = 0;
	} else {
		/* Overwrite the request in place with a copy of the IRE. */
		bcopy(ire, inire, sizeof (ire_t));
		/* Copy the route metrics from the parent. */
		if (sire != NULL) {
			bcopy(&(sire->ire_uinfo), &(inire->ire_uinfo),
			    sizeof (iulp_t));
		}

		/*
		 * As we don't lookup global policy here, we may not
		 * pass the right size if per-socket policy is not
		 * present. For these cases, path mtu discovery will
		 * do the right thing.
		 */
		inire->ire_ipsec_overhead = conn_ipsec_length(Q_TO_CONN(q));

		/* Pass the latest setting of the ip_path_mtu_discovery */
		inire->ire_frag_flag |=
		    (ipst->ips_ip_path_mtu_discovery) ? IPH_DF : 0;
	}
	/* Drop the holds returned by the lookup. */
	if (ire != NULL)
		ire_refrele(ire);
	if (sire != NULL)
		ire_refrele(sire);
	mp->b_wptr = &mp->b_rptr[sizeof (ire_t)];
	mp->b_datap->db_type = IRE_DB_TYPE;

	/* Put the IRE_DB_TYPE mblk last in the chain */
	mp1 = mp->b_cont;
	if (mp1 != NULL) {
		mp->b_cont = NULL;
		linkb(mp1, mp);
		mp = mp1;
	}
	qreply(q, mp);
}

/*
 * Send an IPv4 packet using the specified IRE.
 * If ire_src_addr is INADDR_ANY then the IRE is deleted after
 * the packet has been sent.
 *
 * pkt may be preceded by an M_CTL mblk, in which case the IP packet is in
 * its b_cont.  The packet is dispatched on b_prev (set: originated
 * externally, re-injected on the receiving ill's read queue), b_next
 * (set: from the multicast router, forwarded) or neither (locally
 * originated).
 *
 * The caller's reference on the ire is given up on every path, either
 * by an ire_refrele here or, as the comments below note, inside
 * ip_wput_ire.
 */
static void
ire_send(queue_t *q, mblk_t *pkt, ire_t *ire)
{
	mblk_t *ipsec_mp;
	boolean_t is_secure;
	uint_t ifindex;
	ill_t *ill;
	zoneid_t zoneid = ire->ire_zoneid;
	ip_stack_t *ipst = ire->ire_ipst;

	ASSERT(ire->ire_ipversion == IPV4_VERSION);
	ASSERT(!(ire->ire_type & IRE_LOCAL)); /* Has different ire_zoneid */
	ipsec_mp = pkt;
	is_secure = (pkt->b_datap->db_type == M_CTL);
	if (is_secure) {
		ipsec_out_t *io;

		/* Step over the M_CTL; an IPSEC_OUT supplies the zoneid. */
		pkt = pkt->b_cont;
		io = (ipsec_out_t *)ipsec_mp->b_rptr;
		if (io->ipsec_out_type == IPSEC_OUT)
			zoneid = io->ipsec_out_zoneid;
	}

	/* If the packet originated externally then */
	if (pkt->b_prev) {
		ire_refrele(ire);
		/*
		 * Extract the ifindex from b_prev (set in ip_rput_noire).
		 * Look up interface to see if it still exists (it could have
		 * been unplumbed by the time the reply came back from ARP)
		 */
		ifindex = (uint_t)(uintptr_t)pkt->b_prev;
		ill = ill_lookup_on_ifindex(ifindex, B_FALSE,
		    NULL, NULL, NULL, NULL, ipst);
		if (ill == NULL) {
			/* Interface is gone; drop the packet. */
			pkt->b_prev = NULL;
			pkt->b_next = NULL;
			freemsg(ipsec_mp);
			return;
		}
		q = ill->ill_rq;
		pkt->b_prev = NULL;
		/*
		 * This packet has not gone through IPSEC processing
		 * and hence we should not have any IPSEC message
		 * prepended.
		 */
		ASSERT(ipsec_mp == pkt);
		put(q, pkt);
		ill_refrele(ill);
	} else if (pkt->b_next) {
		/* Packets from multicast router */
		pkt->b_next = NULL;
		/*
		 * We never get the IPSEC_OUT while forwarding the
		 * packet for multicast router.
		 */
		ASSERT(ipsec_mp == pkt);
		ip_rput_forward(ire, (ipha_t *)pkt->b_rptr, ipsec_mp, NULL);
		ire_refrele(ire);
	} else {
		/* Locally originated packets */
		boolean_t is_inaddr_any;
		ipha_t *ipha = (ipha_t *)pkt->b_rptr;

		/*
		 * We need to do an ire_delete below for which
		 * we need to make sure that the IRE will be
		 * around even after calling ip_wput_ire -
		 * which does ire_refrele. Otherwise somebody
		 * could potentially delete this ire and hence
		 * free this ire and we will be calling ire_delete
		 * on a freed ire below.
		 */
		is_inaddr_any = (ire->ire_src_addr == INADDR_ANY);
		if (is_inaddr_any) {
			IRE_REFHOLD(ire);
		}
		/*
		 * If we were resolving a router we can not use the
		 * routers IRE for sending the packet (since it would
		 * violate the uniqueness of the IP idents) thus we
		 * make another pass through ip_wput to create the IRE_CACHE
		 * for the destination.
		 * When IRE_MARK_NOADD is set, ire_add() is not called.
		 * Thus ip_wput() will never find a ire and result in an
		 * infinite loop. Thus we check whether IRE_MARK_NOADD
		 * is set. This also implies that IRE_MARK_NOADD can only be
		 * used to send packets to directly connected hosts.
		 */
		if (ipha->ipha_dst != ire->ire_addr &&
		    !(ire->ire_marks & IRE_MARK_NOADD)) {
			ire_refrele(ire);	/* Held in ire_add */
			if (CONN_Q(q)) {
				(void) ip_output(Q_TO_CONN(q), ipsec_mp, q,
				    IRE_SEND);
			} else {
				(void) ip_output((void *)(uintptr_t)zoneid,
				    ipsec_mp, q, IRE_SEND);
			}
		} else {
			if (is_secure) {
				ipsec_out_t *oi;
				/*
				 * Shadows the outer ipha; set from
				 * ipsec_mp->b_cont, which is what pkt
				 * was set to above when is_secure.
				 */
				ipha_t *ipha;

				oi = (ipsec_out_t *)ipsec_mp->b_rptr;
				ipha = (ipha_t *)ipsec_mp->b_cont->b_rptr;
				if (oi->ipsec_out_proc_begin) {
					/*
					 * This is the case where
					 * ip_wput_ipsec_out could not find
					 * the IRE and recreated a new one.
					 * As ip_wput_ipsec_out does ire
					 * lookups, ire_refrele for the extra
					 * bump in ire_add.
					 */
					ire_refrele(ire);
					ip_wput_ipsec_out(q, ipsec_mp, ipha,
					    NULL, NULL);
				} else {
					/*
					 * IRE_REFRELE will be done in
					 * ip_wput_ire.
					 */
					ip_wput_ire(q, ipsec_mp, ire, NULL,
					    IRE_SEND, zoneid);
				}
			} else {
				/*
				 * IRE_REFRELE will be done in ip_wput_ire.
				 */
				ip_wput_ire(q, ipsec_mp, ire, NULL,
				    IRE_SEND, zoneid);
			}
		}
		/*
		 * Special code to support sending a single packet with
		 * conn_unspec_src using an IRE which has no source address.
		 * The IRE is deleted here after sending the packet to avoid
		 * having other code trip on it. But before we delete the
		 * ire, somebody could have looked up this ire.
		 * We prevent returning/using this IRE by the upper layers
		 * by making checks to NULL source address in other places
		 * like e.g ip_ire_append, ip_ire_req and ip_bind_connected.
		 * Though, this does not completely prevent other threads
		 * from using this ire, this should not cause any problems.
		 *
		 * NOTE : We use is_inaddr_any instead of using ire_src_addr
		 * because for the normal case i.e !is_inaddr_any, ire_refrele
		 * above could have potentially freed the ire.
		 */
		if (is_inaddr_any) {
			/*
			 * If this IRE has been deleted by another thread, then
			 * ire_bucket won't be NULL, but ire_ptpn will be NULL.
			 * Thus, ire_delete will do nothing.  This check
			 * guards against calling ire_delete when the IRE was
			 * never inserted in the table, which is handled by
			 * ire_delete as dropping another reference.
			 */
			if (ire->ire_bucket != NULL) {
				ip1dbg(("ire_send: delete IRE\n"));
				ire_delete(ire);
			}
			ire_refrele(ire);	/* Held above */
		}
	}
}

/*
 * Send a packet using the specified IRE.
 * If ire_src_addr_v6 is all zero then discard the IRE after
 * the packet has been sent.
 */
static void
ire_send_v6(queue_t *q, mblk_t *pkt, ire_t *ire)
{
	mblk_t *ipsec_mp;
	boolean_t secure;
	uint_t ifindex;
	zoneid_t zoneid = ire->ire_zoneid;
	ip_stack_t *ipst = ire->ire_ipst;

	ASSERT(ire->ire_ipversion == IPV6_VERSION);
	ASSERT(!(ire->ire_type & IRE_LOCAL)); /* Has different ire_zoneid */
	if (pkt->b_datap->db_type == M_CTL) {
		ipsec_out_t *io;

		ipsec_mp = pkt;
		pkt = pkt->b_cont;
		secure = B_TRUE;
		io = (ipsec_out_t *)ipsec_mp->b_rptr;
		if (io->ipsec_out_type == IPSEC_OUT)
			zoneid = io->ipsec_out_zoneid;
	} else {
		ipsec_mp = pkt;
		secure = B_FALSE;
	}

	/* If the packet originated externally then */
	if (pkt->b_prev) {
		ill_t *ill;
		/*
		 * Extract the ifindex from b_prev (set in ip_rput_data_v6).
		 * Look up interface to see if it still exists (it could have
		 * been unplumbed by the time the reply came back from the
		 * resolver).
1037 */ 1038 ifindex = (uint_t)(uintptr_t)pkt->b_prev; 1039 ill = ill_lookup_on_ifindex(ifindex, B_TRUE, 1040 NULL, NULL, NULL, NULL, ipst); 1041 if (ill == NULL) { 1042 pkt->b_prev = NULL; 1043 pkt->b_next = NULL; 1044 freemsg(ipsec_mp); 1045 ire_refrele(ire); /* Held in ire_add */ 1046 return; 1047 } 1048 q = ill->ill_rq; 1049 pkt->b_prev = NULL; 1050 /* 1051 * This packet has not gone through IPSEC processing 1052 * and hence we should not have any IPSEC message 1053 * prepended. 1054 */ 1055 ASSERT(ipsec_mp == pkt); 1056 put(q, pkt); 1057 ill_refrele(ill); 1058 } else if (pkt->b_next) { 1059 /* Packets from multicast router */ 1060 pkt->b_next = NULL; 1061 /* 1062 * We never get the IPSEC_OUT while forwarding the 1063 * packet for multicast router. 1064 */ 1065 ASSERT(ipsec_mp == pkt); 1066 /* 1067 * XXX TODO IPv6. 1068 */ 1069 freemsg(pkt); 1070 #ifdef XXX 1071 ip_rput_forward(ire, (ipha_t *)pkt->b_rptr, pkt, NULL); 1072 #endif 1073 } else { 1074 if (secure) { 1075 ipsec_out_t *oi; 1076 ip6_t *ip6h; 1077 1078 oi = (ipsec_out_t *)ipsec_mp->b_rptr; 1079 ip6h = (ip6_t *)ipsec_mp->b_cont->b_rptr; 1080 if (oi->ipsec_out_proc_begin) { 1081 /* 1082 * This is the case where 1083 * ip_wput_ipsec_out could not find 1084 * the IRE and recreated a new one. 1085 */ 1086 ip_wput_ipsec_out_v6(q, ipsec_mp, ip6h, 1087 NULL, NULL); 1088 } else { 1089 if (CONN_Q(q)) { 1090 (void) ip_output_v6(Q_TO_CONN(q), 1091 ipsec_mp, q, IRE_SEND); 1092 } else { 1093 (void) ip_output_v6( 1094 (void *)(uintptr_t)zoneid, 1095 ipsec_mp, q, IRE_SEND); 1096 } 1097 } 1098 } else { 1099 /* 1100 * Send packets through ip_output_v6 so that any 1101 * ip6_info header can be processed again. 
1102 */ 1103 if (CONN_Q(q)) { 1104 (void) ip_output_v6(Q_TO_CONN(q), ipsec_mp, q, 1105 IRE_SEND); 1106 } else { 1107 (void) ip_output_v6((void *)(uintptr_t)zoneid, 1108 ipsec_mp, q, IRE_SEND); 1109 } 1110 } 1111 /* 1112 * Special code to support sending a single packet with 1113 * conn_unspec_src using an IRE which has no source address. 1114 * The IRE is deleted here after sending the packet to avoid 1115 * having other code trip on it. But before we delete the 1116 * ire, somebody could have looked up this ire. 1117 * We prevent returning/using this IRE by the upper layers 1118 * by making checks to NULL source address in other places 1119 * like e.g ip_ire_append_v6, ip_ire_req and 1120 * ip_bind_connected_v6. Though, this does not completely 1121 * prevent other threads from using this ire, this should 1122 * not cause any problems. 1123 */ 1124 if (IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6)) { 1125 ip1dbg(("ire_send_v6: delete IRE\n")); 1126 ire_delete(ire); 1127 } 1128 } 1129 ire_refrele(ire); /* Held in ire_add */ 1130 } 1131 1132 /* 1133 * Make sure that IRE bucket does not get too long. 1134 * This can cause lock up because ire_cache_lookup() 1135 * may take "forever" to finish. 1136 * 1137 * We only remove a maximum of cnt IREs each time. This 1138 * should keep the bucket length approximately constant, 1139 * depending on cnt. This should be enough to defend 1140 * against DoS attack based on creating temporary IREs 1141 * (for forwarding and non-TCP traffic). 1142 * 1143 * We also pass in the address of the newly created IRE 1144 * as we do not want to remove this straight after adding 1145 * it. New IREs are normally added at the tail of the 1146 * bucket. This means that we are removing the "oldest" 1147 * temporary IREs added. Only if there are IREs with 1148 * the same ire_addr, do we not add it at the tail. Refer 1149 * to ire_add_v*(). It should be OK for our purpose. 
1150 * 1151 * For non-temporary cached IREs, we make sure that they 1152 * have not been used for some time (defined below), they 1153 * are non-local destinations, and there is no one using 1154 * them at the moment (refcnt == 1). 1155 * 1156 * The above means that the IRE bucket length may become 1157 * very long, consisting of mostly non-temporary IREs. 1158 * This can happen when the hash function does a bad job 1159 * so that most TCP connections cluster to a specific bucket. 1160 * This "hopefully" should never happen. It can also 1161 * happen if most TCP connections have very long lives. 1162 * Even with the minimal hash table size of 256, there 1163 * has to be a lot of such connections to make the bucket 1164 * length unreasonably long. This should probably not 1165 * happen either. The third can when this can happen is 1166 * when the machine is under attack, such as SYN flooding. 1167 * TCP should already have the proper mechanism to protect 1168 * that. So we should be safe. 1169 * 1170 * This function is