1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 /* 26 * Copyright (c) 1990 Mentat Inc. 27 */ 28 29 #pragma ident "%Z%%M% %I% %E% SMI" 30 31 /* 32 * This file contains routines that manipulate Internet Routing Entries (IREs). 
33 */ 34 #include <sys/types.h> 35 #include <sys/stream.h> 36 #include <sys/stropts.h> 37 #include <sys/ddi.h> 38 #include <sys/cmn_err.h> 39 40 #include <sys/systm.h> 41 #include <sys/param.h> 42 #include <sys/socket.h> 43 #include <net/if.h> 44 #include <net/route.h> 45 #include <netinet/in.h> 46 #include <net/if_dl.h> 47 #include <netinet/ip6.h> 48 #include <netinet/icmp6.h> 49 50 #include <inet/common.h> 51 #include <inet/mi.h> 52 #include <inet/ip.h> 53 #include <inet/ip6.h> 54 #include <inet/ip_ndp.h> 55 #include <inet/ip_if.h> 56 #include <inet/ip_ire.h> 57 #include <inet/ipclassifier.h> 58 #include <inet/nd.h> 59 #include <sys/kmem.h> 60 #include <sys/zone.h> 61 62 #include <sys/tsol/label.h> 63 #include <sys/tsol/tnet.h> 64 65 static ire_t ire_null; 66 67 static ire_t *ire_ihandle_lookup_onlink_v6(ire_t *cire); 68 static boolean_t ire_match_args_v6(ire_t *ire, const in6_addr_t *addr, 69 const in6_addr_t *mask, const in6_addr_t *gateway, int type, 70 const ipif_t *ipif, zoneid_t zoneid, uint32_t ihandle, 71 const ts_label_t *tsl, int match_flags); 72 static ire_t *ire_init_v6(ire_t *, const in6_addr_t *, const in6_addr_t *, 73 const in6_addr_t *, const in6_addr_t *, uint_t *, queue_t *, queue_t *, 74 ushort_t, ipif_t *, const in6_addr_t *, uint32_t, uint32_t, uint_t, 75 const iulp_t *, tsol_gc_t *, tsol_gcgrp_t *, ip_stack_t *); 76 77 78 /* 79 * Initialize the ire that is specific to IPv6 part and call 80 * ire_init_common to finish it. 81 */ 82 static ire_t * 83 ire_init_v6(ire_t *ire, const in6_addr_t *v6addr, const in6_addr_t *v6mask, 84 const in6_addr_t *v6src_addr, const in6_addr_t *v6gateway, 85 uint_t *max_fragp, queue_t *rfq, queue_t *stq, ushort_t type, 86 ipif_t *ipif, const in6_addr_t *v6cmask, uint32_t phandle, 87 uint32_t ihandle, uint_t flags, const iulp_t *ulp_info, tsol_gc_t *gc, 88 tsol_gcgrp_t *gcgrp, ip_stack_t *ipst) 89 { 90 91 /* 92 * Reject IRE security attribute creation/initialization 93 * if system is not running in Trusted mode. 
94 */ 95 if ((gc != NULL || gcgrp != NULL) && !is_system_labeled()) 96 return (NULL); 97 98 99 BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_alloced); 100 ire->ire_addr_v6 = *v6addr; 101 102 if (v6src_addr != NULL) 103 ire->ire_src_addr_v6 = *v6src_addr; 104 if (v6mask != NULL) { 105 ire->ire_mask_v6 = *v6mask; 106 ire->ire_masklen = ip_mask_to_plen_v6(&ire->ire_mask_v6); 107 } 108 if (v6gateway != NULL) 109 ire->ire_gateway_addr_v6 = *v6gateway; 110 111 if (type == IRE_CACHE && v6cmask != NULL) 112 ire->ire_cmask_v6 = *v6cmask; 113 114 /* 115 * Multirouted packets need to have a fragment header added so that 116 * the receiver is able to discard duplicates according to their 117 * fragment identifier. 118 */ 119 if (type == IRE_CACHE && (flags & RTF_MULTIRT)) { 120 ire->ire_frag_flag = IPH_FRAG_HDR; 121 } 122 123 /* ire_init_common will free the mblks upon encountering any failure */ 124 if (!ire_init_common(ire, max_fragp, NULL, rfq, stq, type, ipif, 125 phandle, ihandle, flags, IPV6_VERSION, ulp_info, gc, gcgrp, ipst)) 126 return (NULL); 127 128 return (ire); 129 } 130 131 /* 132 * Similar to ire_create_v6 except that it is called only when 133 * we want to allocate ire as an mblk e.g. we have a external 134 * resolver. Do we need this in IPv6 ? 135 * 136 * IPv6 initializes the ire_nce in ire_add_v6, which expects to 137 * find the ire_nce to be null when it is called. So, although 138 * we have a src_nce parameter (in the interest of matching up with 139 * the argument list of the v4 version), we ignore the src_nce 140 * argument here. 
141 */ 142 /* ARGSUSED */ 143 ire_t * 144 ire_create_mp_v6(const in6_addr_t *v6addr, const in6_addr_t *v6mask, 145 const in6_addr_t *v6src_addr, const in6_addr_t *v6gateway, 146 nce_t *src_nce, queue_t *rfq, queue_t *stq, ushort_t type, 147 ipif_t *ipif, const in6_addr_t *v6cmask, 148 uint32_t phandle, uint32_t ihandle, uint_t flags, const iulp_t *ulp_info, 149 tsol_gc_t *gc, tsol_gcgrp_t *gcgrp, ip_stack_t *ipst) 150 { 151 ire_t *ire; 152 ire_t *ret_ire; 153 mblk_t *mp; 154 155 ASSERT(!IN6_IS_ADDR_V4MAPPED(v6addr)); 156 157 /* Allocate the new IRE. */ 158 mp = allocb(sizeof (ire_t), BPRI_MED); 159 if (mp == NULL) { 160 ip1dbg(("ire_create_mp_v6: alloc failed\n")); 161 return (NULL); 162 } 163 164 ire = (ire_t *)mp->b_rptr; 165 mp->b_wptr = (uchar_t *)&ire[1]; 166 167 /* Start clean. */ 168 *ire = ire_null; 169 ire->ire_mp = mp; 170 mp->b_datap->db_type = IRE_DB_TYPE; 171 172 ret_ire = ire_init_v6(ire, v6addr, v6mask, v6src_addr, v6gateway, 173 NULL, rfq, stq, type, ipif, v6cmask, phandle, 174 ihandle, flags, ulp_info, gc, gcgrp, ipst); 175 176 if (ret_ire == NULL) { 177 freeb(ire->ire_mp); 178 return (NULL); 179 } 180 return (ire); 181 } 182 183 /* 184 * ire_create_v6 is called to allocate and initialize a new IRE. 185 * 186 * NOTE : This is called as writer sometimes though not required 187 * by this function. 188 * 189 * See comments above ire_create_mp_v6() for the rationale behind the 190 * unused src_nce argument. 
191 */ 192 /* ARGSUSED */ 193 ire_t * 194 ire_create_v6(const in6_addr_t *v6addr, const in6_addr_t *v6mask, 195 const in6_addr_t *v6src_addr, const in6_addr_t *v6gateway, 196 uint_t *max_fragp, nce_t *src_nce, queue_t *rfq, queue_t *stq, 197 ushort_t type, ipif_t *ipif, const in6_addr_t *v6cmask, 198 uint32_t phandle, uint32_t ihandle, uint_t flags, const iulp_t *ulp_info, 199 tsol_gc_t *gc, tsol_gcgrp_t *gcgrp, ip_stack_t *ipst) 200 { 201 ire_t *ire; 202 ire_t *ret_ire; 203 204 ASSERT(!IN6_IS_ADDR_V4MAPPED(v6addr)); 205 206 ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP); 207 if (ire == NULL) { 208 ip1dbg(("ire_create_v6: alloc failed\n")); 209 return (NULL); 210 } 211 *ire = ire_null; 212 213 ret_ire = ire_init_v6(ire, v6addr, v6mask, v6src_addr, v6gateway, 214 max_fragp, rfq, stq, type, ipif, v6cmask, phandle, 215 ihandle, flags, ulp_info, gc, gcgrp, ipst); 216 217 if (ret_ire == NULL) { 218 kmem_cache_free(ire_cache, ire); 219 return (NULL); 220 } 221 ASSERT(ret_ire == ire); 222 return (ire); 223 } 224 225 /* 226 * Find an IRE_INTERFACE for the multicast group. 227 * Allows different routes for multicast addresses 228 * in the unicast routing table (akin to FF::0/8 but could be more specific) 229 * which point at different interfaces. This is used when IPV6_MULTICAST_IF 230 * isn't specified (when sending) and when IPV6_JOIN_GROUP doesn't 231 * specify the interface to join on. 232 * 233 * Supports link-local addresses by following the ipif/ill when recursing. 234 */ 235 ire_t * 236 ire_lookup_multi_v6(const in6_addr_t *group, zoneid_t zoneid, ip_stack_t *ipst) 237 { 238 ire_t *ire; 239 ipif_t *ipif = NULL; 240 int match_flags = MATCH_IRE_TYPE; 241 in6_addr_t gw_addr_v6; 242 243 ire = ire_ftable_lookup_v6(group, 0, 0, 0, NULL, NULL, 244 zoneid, 0, NULL, MATCH_IRE_DEFAULT, ipst); 245 246 /* We search a resolvable ire in case of multirouting. 
*/ 247 if ((ire != NULL) && (ire->ire_flags & RTF_MULTIRT)) { 248 ire_t *cire = NULL; 249 /* 250 * If the route is not resolvable, the looked up ire 251 * may be changed here. In that case, ire_multirt_lookup() 252 * IRE_REFRELE the original ire and change it. 253 */ 254 (void) ire_multirt_lookup_v6(&cire, &ire, MULTIRT_CACHEGW, 255 NULL, ipst); 256 if (cire != NULL) 257 ire_refrele(cire); 258 } 259 if (ire == NULL) 260 return (NULL); 261 /* 262 * Make sure we follow ire_ipif. 263 * 264 * We need to determine the interface route through 265 * which the gateway will be reached. We don't really 266 * care which interface is picked if the interface is 267 * part of a group. 268 */ 269 if (ire->ire_ipif != NULL) { 270 ipif = ire->ire_ipif; 271 match_flags |= MATCH_IRE_ILL_GROUP; 272 } 273 274 switch (ire->ire_type) { 275 case IRE_DEFAULT: 276 case IRE_PREFIX: 277 case IRE_HOST: 278 mutex_enter(&ire->ire_lock); 279 gw_addr_v6 = ire->ire_gateway_addr_v6; 280 mutex_exit(&ire->ire_lock); 281 ire_refrele(ire); 282 ire = ire_ftable_lookup_v6(&gw_addr_v6, 0, 0, 283 IRE_INTERFACE, ipif, NULL, zoneid, 0, 284 NULL, match_flags, ipst); 285 return (ire); 286 case IRE_IF_NORESOLVER: 287 case IRE_IF_RESOLVER: 288 return (ire); 289 default: 290 ire_refrele(ire); 291 return (NULL); 292 } 293 } 294 295 /* 296 * Return any local address. We use this to target ourselves 297 * when the src address was specified as 'default'. 298 * Preference for IRE_LOCAL entries. 
299 */ 300 ire_t * 301 ire_lookup_local_v6(zoneid_t zoneid, ip_stack_t *ipst) 302 { 303 ire_t *ire; 304 irb_t *irb; 305 ire_t *maybe = NULL; 306 int i; 307 308 for (i = 0; i < ipst->ips_ip6_cache_table_size; i++) { 309 irb = &ipst->ips_ip_cache_table_v6[i]; 310 if (irb->irb_ire == NULL) 311 continue; 312 rw_enter(&irb->irb_lock, RW_READER); 313 for (ire = irb->irb_ire; ire; ire = ire->ire_next) { 314 if ((ire->ire_marks & IRE_MARK_CONDEMNED) || 315 ire->ire_zoneid != zoneid && 316 ire->ire_zoneid != ALL_ZONES) 317 continue; 318 switch (ire->ire_type) { 319 case IRE_LOOPBACK: 320 if (maybe == NULL) { 321 IRE_REFHOLD(ire); 322 maybe = ire; 323 } 324 break; 325 case IRE_LOCAL: 326 if (maybe != NULL) { 327 ire_refrele(maybe); 328 } 329 IRE_REFHOLD(ire); 330 rw_exit(&irb->irb_lock); 331 return (ire); 332 } 333 } 334 rw_exit(&irb->irb_lock); 335 } 336 return (maybe); 337 } 338 339 /* 340 * This function takes a mask and returns number of bits set in the 341 * mask (the represented prefix length). Assumes a contiguous mask. 342 */ 343 int 344 ip_mask_to_plen_v6(const in6_addr_t *v6mask) 345 { 346 int bits; 347 int plen = IPV6_ABITS; 348 int i; 349 350 for (i = 3; i >= 0; i--) { 351 if (v6mask->s6_addr32[i] == 0) { 352 plen -= 32; 353 continue; 354 } 355 bits = ffs(ntohl(v6mask->s6_addr32[i])) - 1; 356 if (bits == 0) 357 break; 358 plen -= bits; 359 } 360 361 return (plen); 362 } 363 364 /* 365 * Convert a prefix length to the mask for that prefix. 366 * Returns the argument bitmask. 367 */ 368 in6_addr_t * 369 ip_plen_to_mask_v6(uint_t plen, in6_addr_t *bitmask) 370 { 371 uint32_t *ptr; 372 373 if (plen < 0 || plen > IPV6_ABITS) 374 return (NULL); 375 *bitmask = ipv6_all_zeros; 376 377 ptr = (uint32_t *)bitmask; 378 while (plen > 32) { 379 *ptr++ = 0xffffffffU; 380 plen -= 32; 381 } 382 *ptr = htonl(0xffffffffU << (32 - plen)); 383 return (bitmask); 384 } 385 386 /* 387 * Add a fully initialized IRE to an appropriate 388 * table based on ire_type. 
 *
 * The forward table contains IRE_PREFIX/IRE_HOST and
 * IRE_IF_RESOLVER/IRE_IF_NORESOLVER and IRE_DEFAULT.
 *
 * The cache table contains IRE_BROADCAST/IRE_LOCAL/IRE_LOOPBACK
 * and IRE_CACHE.
 *
 * NOTE : This function is called as writer though not required
 * by this function.
 *
 * ire_p:	in: the ire to add; out: the ire now in the table.
 * q, mp, func:	only passed through to ire_atomic_start().
 *
 * On success 0 is returned and *ire_p points at a held ire: either the
 * one just inserted or, if an equivalent entry was already present,
 * that existing entry (the new ire is then deleted).  The caller is
 * expected to release the reference.
 *
 * On failure the ire is deleted, *ire_p is set to NULL and an error is
 * returned: EINVAL (unrecognized type, interface route gone, or no
 * usable nce), ENOMEM (forwarding table allocation failed), or whatever
 * ire_atomic_start() returned.
 */
int
ire_add_v6(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func)
{
	ire_t	*ire1;
	int	mask_table_index;
	irb_t	*irb_ptr;
	ire_t	**irep;
	int	flags;
	ire_t	*pire = NULL;
	ill_t	*stq_ill;
	boolean_t ndp_g_lock_held = B_FALSE;
	ire_t	*ire = *ire_p;
	int	error;
	ip_stack_t	*ipst = ire->ire_ipst;

	ASSERT(ire->ire_ipversion == IPV6_VERSION);
	ASSERT(ire->ire_mp == NULL); /* Calls should go through ire_add */
	ASSERT(ire->ire_nce == NULL);

	/*
	 * Normalize the mask and source address according to the type.
	 * Host-like entries get a full-length mask; gateway routes
	 * without RTF_SETSRC get their source address cleared.
	 */
	switch (ire->ire_type) {
	case IRE_HOST:
		ire->ire_mask_v6 = ipv6_all_ones;
		ire->ire_masklen = IPV6_ABITS;
		if ((ire->ire_flags & RTF_SETSRC) == 0)
			ire->ire_src_addr_v6 = ipv6_all_zeros;
		break;
	case IRE_CACHE:
	case IRE_LOCAL:
	case IRE_LOOPBACK:
		ire->ire_mask_v6 = ipv6_all_ones;
		ire->ire_masklen = IPV6_ABITS;
		break;
	case IRE_PREFIX:
		if ((ire->ire_flags & RTF_SETSRC) == 0)
			ire->ire_src_addr_v6 = ipv6_all_zeros;
		break;
	case IRE_DEFAULT:
		if ((ire->ire_flags & RTF_SETSRC) == 0)
			ire->ire_src_addr_v6 = ipv6_all_zeros;
		break;
	case IRE_IF_RESOLVER:
	case IRE_IF_NORESOLVER:
		break;
	default:
		printf("ire_add_v6: ire %p has unrecognized IRE type (%d)\n",
		    (void *)ire, ire->ire_type);
		ire_delete(ire);
		*ire_p = NULL;
		return (EINVAL);
	}

	/* Make sure the address is properly masked. */
	V6_MASK_COPY(ire->ire_addr_v6, ire->ire_mask_v6, ire->ire_addr_v6);

	/* Find the appropriate list head (hash bucket). */
	if ((ire->ire_type & IRE_CACHETABLE) == 0) {
		/* IRE goes into Forward Table */
		mask_table_index = ip_mask_to_plen_v6(&ire->ire_mask_v6);
		if ((ipst->ips_ip_forwarding_table_v6[mask_table_index]) ==
		    NULL) {
			irb_t *ptr;
			int i;

			/*
			 * No bucket array for this prefix length yet.
			 * Build one outside the lock, then install it
			 * under ips_ire_ft_init_lock unless another
			 * thread got there first.
			 */
			ptr = (irb_t *)mi_zalloc((
			    ipst->ips_ip6_ftable_hash_size * sizeof (irb_t)));
			if (ptr == NULL) {
				ire_delete(ire);
				*ire_p = NULL;
				return (ENOMEM);
			}
			for (i = 0; i < ipst->ips_ip6_ftable_hash_size; i++) {
				rw_init(&ptr[i].irb_lock, NULL,
				    RW_DEFAULT, NULL);
			}
			mutex_enter(&ipst->ips_ire_ft_init_lock);
			if (ipst->ips_ip_forwarding_table_v6[
			    mask_table_index] == NULL) {
				ipst->ips_ip_forwarding_table_v6[
				    mask_table_index] = ptr;
				mutex_exit(&ipst->ips_ire_ft_init_lock);
			} else {
				/*
				 * Some other thread won the race in
				 * initializing the forwarding table at the
				 * same index.
				 */
				mutex_exit(&ipst->ips_ire_ft_init_lock);
				for (i = 0; i < ipst->ips_ip6_ftable_hash_size;
				    i++) {
					rw_destroy(&ptr[i].irb_lock);
				}
				mi_free(ptr);
			}
		}
		irb_ptr = &(ipst->ips_ip_forwarding_table_v6[mask_table_index][
		    IRE_ADDR_MASK_HASH_V6(ire->ire_addr_v6, ire->ire_mask_v6,
		    ipst->ips_ip6_ftable_hash_size)]);
	} else {
		irb_ptr = &(ipst->ips_ip_cache_table_v6[IRE_ADDR_HASH_V6(
		    ire->ire_addr_v6, ipst->ips_ip6_cache_table_size)]);
	}
	/*
	 * For xresolv interfaces (v6 interfaces with an external
	 * address resolver), ip_newroute_v6/ip_newroute_ipif_v6
	 * are unable to prevent the deletion of the interface route
	 * while adding an IRE_CACHE for an on-link destination
	 * in the IRE_IF_RESOLVER case, since the ire has to go to
	 * the external resolver and return. We can't do a REFHOLD on the
	 * associated interface ire for fear of the message being freed
	 * if the external resolver can't resolve the address.
	 * Here we look up the interface ire in the forwarding table
	 * and make sure that the interface route has not been deleted.
	 */
	if (ire->ire_type == IRE_CACHE &&
	    IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6) &&
	    (((ill_t *)ire->ire_stq->q_ptr)->ill_net_type == IRE_IF_RESOLVER) &&
	    (((ill_t *)ire->ire_stq->q_ptr)->ill_flags & ILLF_XRESOLV)) {

		pire = ire_ihandle_lookup_onlink_v6(ire);
		if (pire == NULL) {
			ire_delete(ire);
			*ire_p = NULL;
			return (EINVAL);
		}
		/* Prevent pire from getting deleted */
		IRB_REFHOLD(pire->ire_bucket);
		/* Has it been removed already? */
		if (pire->ire_marks & IRE_MARK_CONDEMNED) {
			IRB_REFRELE(pire->ire_bucket);
			ire_refrele(pire);
			ire_delete(ire);
			*ire_p = NULL;
			return (EINVAL);
		}
	}

	flags = (MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_GW);
	/*
	 * For IRE_CACHES, MATCH_IRE_IPIF is not enough to check
	 * for duplicates because :
	 *
	 * 1) ire_ipif->ipif_ill and ire_stq->q_ptr could be
	 *    pointing at different ills. A real duplicate is
	 *    a match on both ire_ipif and ire_stq.
	 *
	 * 2) We could have multiple packets trying to create
	 *    an IRE_CACHE for the same ill.
	 *
	 * Moreover, IPIF_NOFAILOVER and IPV6_BOUND_PIF endpoints wants
	 * to go out on a particular ill. Rather than looking at the
	 * packet, we depend on the above for MATCH_IRE_ILL here.
	 *
	 * Unlike IPv4, MATCH_IRE_IPIF is needed here as we could have
	 * multiple IRE_CACHES for an ill for the same destination
	 * with various scoped addresses i.e represented by ipifs.
	 *
	 * MATCH_IRE_ILL is done implicitly below for IRE_CACHES.
	 */
	if (ire->ire_ipif != NULL)
		flags |= MATCH_IRE_IPIF;
	/*
	 * If we are creating hidden ires, make sure we search on
	 * this ill (MATCH_IRE_ILL) and a hidden ire, while we are
	 * searching for duplicates below. Otherwise we could
	 * potentially find an IRE on some other interface
	 * and it may not be a IRE marked with IRE_MARK_HIDDEN. We
	 * shouldn't do this as this will lead to an infinite loop as
	 * eventually we need an hidden ire for this packet to go
	 * out. MATCH_IRE_ILL is already marked above.
	 */
	if (ire->ire_marks & IRE_MARK_HIDDEN) {
		ASSERT(ire->ire_type == IRE_CACHE);
		flags |= MATCH_IRE_MARK_HIDDEN;
	}

	/*
	 * Start the atomic add of the ire. Grab the ill locks,
	 * ill_g_usesrc_lock and the bucket lock. Check for condemned.
	 * To avoid lock order problems, get the ndp6.ndp_g_lock now itself.
	 */
	if (ire->ire_type == IRE_CACHE) {
		mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
		ndp_g_lock_held = B_TRUE;
	}

	/*
	 * If ipif or ill is changing ire_atomic_start() may queue the
	 * request and return EINPROGRESS.
	 */

	error = ire_atomic_start(irb_ptr, ire, q, mp, func);
	if (error != 0) {
		if (ndp_g_lock_held)
			mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
		/*
		 * We don't know whether it is a valid ipif or not.
		 * So, set it to NULL. This assumes that the ire has not added
		 * a reference to the ipif.
		 */
		ire->ire_ipif = NULL;
		ire_delete(ire);
		if (pire != NULL) {
			IRB_REFRELE(pire->ire_bucket);
			ire_refrele(pire);
		}
		*ire_p = NULL;
		return (error);
	}
	/*
	 * To avoid creating ires having stale values for the ire_max_frag
	 * we get the latest value atomically here. For more details
	 * see the block comment in ip_sioctl_mtu and in DL_NOTE_SDU_CHANGE
	 * in ip_rput_dlpi_writer
	 *
	 * NOTE(review): pire is only assigned in the xresolv IRE_CACHE
	 * path above.  If ire_max_fragp is NULL for a non-multicast ire
	 * that did not take that path, the else arm below dereferences
	 * a NULL pire.  Confirm that callers always supply max_fragp in
	 * every other case.
	 */
	if (ire->ire_max_fragp == NULL) {
		if (IN6_IS_ADDR_MULTICAST(&ire->ire_addr_v6))
			ire->ire_max_frag = ire->ire_ipif->ipif_mtu;
		else
			ire->ire_max_frag = pire->ire_max_frag;
	} else {
		uint_t  max_frag;

		/* Snapshot the value and stop pointing at the source. */
		max_frag = *ire->ire_max_fragp;
		ire->ire_max_fragp = NULL;
		ire->ire_max_frag = max_frag;
	}

	/*
	 * Atomically check for duplicate and insert in the table.
	 */
	for (ire1 = irb_ptr->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) {
		if (ire1->ire_marks & IRE_MARK_CONDEMNED)
			continue;

		if (ire->ire_type == IRE_CACHE) {
			/*
			 * We do MATCH_IRE_ILL implicitly here for IRE_CACHES.
			 * As ire_ipif and ire_stq could point to two
			 * different ills, we can't pass just ire_ipif to
			 * ire_match_args and get a match on both ills.
			 * This is just needed for duplicate checks here and
			 * so we don't add an extra argument to
			 * ire_match_args for this. Do it locally.
			 *
			 * NOTE : Currently there is no part of the code
			 * that asks for both MATCH_IRE_IPIF and MATCH_IRE_ILL
			 * match for IRE_CACHEs. Thus we don't want to
			 * extend the arguments to ire_match_args_v6.
			 */
			if (ire1->ire_stq != ire->ire_stq)
				continue;
			/*
			 * Multiroute IRE_CACHEs for a given destination can
			 * have the same ire_ipif, typically if their source
			 * address is forced using RTF_SETSRC, and the same
			 * send-to queue. We differentiate them using the parent
			 * handle.
			 */
			if ((ire1->ire_flags & RTF_MULTIRT) &&
			    (ire->ire_flags & RTF_MULTIRT) &&
			    (ire1->ire_phandle != ire->ire_phandle))
				continue;
		}
		if (ire1->ire_zoneid != ire->ire_zoneid)
			continue;
		if (ire_match_args_v6(ire1, &ire->ire_addr_v6,
		    &ire->ire_mask_v6, &ire->ire_gateway_addr_v6,
		    ire->ire_type, ire->ire_ipif, ire->ire_zoneid, 0, NULL,
		    flags)) {
			/*
			 * Return the old ire after doing a REFHOLD.
			 * As most of the callers continue to use the IRE
			 * after adding, we return a held ire. This will
			 * avoid a lookup in the caller again. If the callers
			 * don't want to use it, they need to do a REFRELE.
			 */
			ip1dbg(("found dup ire existing %p new %p",
			    (void *)ire1, (void *)ire));
			IRE_REFHOLD(ire1);
			if (ndp_g_lock_held)
				mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
			ire_atomic_end(irb_ptr, ire);
			ire_delete(ire);
			if (pire != NULL) {
				/*
				 * Assert that it is
				 * not yet removed from the list.
				 */
				ASSERT(pire->ire_ptpn != NULL);
				IRB_REFRELE(pire->ire_bucket);
				ire_refrele(pire);
			}
			*ire_p = ire1;
			return (0);
		}
	}
	if (ire->ire_type == IRE_CACHE) {
		in6_addr_t gw_addr_v6;
		ill_t	*ill = ire_to_ill(ire);
		char	buf[INET6_ADDRSTRLEN];
		nce_t	*nce;

		/*
		 * All IRE_CACHE types must have a nce.  If this is
		 * not the case the entry will not be added. We need
		 * to make sure that if somebody deletes the nce
		 * after we looked up, they will find this ire and
		 * delete the ire. To delete this ire one needs the
		 * bucket lock which we are still holding here. So,
		 * even if the nce gets deleted after we looked up,
		 * this ire  will get deleted.
		 *
		 * NOTE : Don't need the ire_lock for accessing
		 * ire_gateway_addr_v6 as it is appearing first
		 * time on the list and rts_setgwr_v6 could not
		 * be changing this.
		 */
		gw_addr_v6 = ire->ire_gateway_addr_v6;
		/* On-link destinations use their own address as the key. */
		if (IN6_IS_ADDR_UNSPECIFIED(&gw_addr_v6)) {
			nce = ndp_lookup_v6(ill, &ire->ire_addr_v6, B_TRUE);
		} else {
			nce = ndp_lookup_v6(ill, &gw_addr_v6, B_TRUE);
		}
		/*
		 * This jumps into the failure arm further down with
		 * nce == NULL; the cleanup there tests nce before
		 * touching it.
		 */
		if (nce == NULL)
			goto failed;

		/* Pair of refhold, refrele just to get the tracing right */
		NCE_REFHOLD_TO_REFHOLD_NOTR(nce);
		/*
		 * Atomically make sure that new IREs don't point
		 * to an NCE that is logically deleted (CONDEMNED).
		 * ndp_delete() first marks the NCE CONDEMNED.
		 * This ensures that the nce_refcnt won't increase
		 * due to new nce_lookups or due to addition of new IREs
		 * pointing to this NCE. Then ndp_delete() cleans up
		 * existing references. If we don't do it atomically here,
		 * ndp_delete() -> nce_ire_delete() will not be able to
		 * clean up the IRE list completely, and the nce_refcnt
		 * won't go down to zero.
		 */
		mutex_enter(&nce->nce_lock);
		if (ill->ill_flags & ILLF_XRESOLV) {
			/*
			 * If we used an external resolver, we may not
			 * have gone through neighbor discovery to get here.
			 * Must update the nce_state before the next check.
			 */
			if (nce->nce_state == ND_INCOMPLETE)
				nce->nce_state = ND_REACHABLE;
		}
		if (nce->nce_state == ND_INCOMPLETE ||
		    (nce->nce_flags & NCE_F_CONDEMNED) ||
		    (nce->nce_state == ND_UNREACHABLE)) {
failed:
			/* Undo everything acquired so far and bail out. */
			if (ndp_g_lock_held)
				mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
			if (nce != NULL)
				mutex_exit(&nce->nce_lock);
			ire_atomic_end(irb_ptr, ire);
			ip1dbg(("ire_add_v6: No nce for dst %s \n",
			    inet_ntop(AF_INET6, &ire->ire_addr_v6,
			    buf, sizeof (buf))));
			ire_delete(ire);
			if (pire != NULL) {
				/*
				 * Assert that it is
				 * not yet removed from the list.
				 */
				ASSERT(pire->ire_ptpn != NULL);
				IRB_REFRELE(pire->ire_bucket);
				ire_refrele(pire);
			}
			if (nce != NULL)
				NCE_REFRELE_NOTR(nce);
			*ire_p = NULL;
			return (EINVAL);
		} else {
			ire->ire_nce = nce;
		}
		mutex_exit(&nce->nce_lock);
	}
	/*
	 * Find the first entry that matches ire_addr - provides
	 * tail insertion. *irep will be null if no match.
	 */
	irep = (ire_t **)irb_ptr;
	while ((ire1 = *irep) != NULL &&
	    !IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, &ire1->ire_addr_v6))
		irep = &ire1->ire_next;
	ASSERT(!(ire->ire_type & IRE_BROADCAST));

	if (*irep != NULL) {
		/*
		 * Find the last ire which matches ire_addr_v6.
		 * Needed to do tail insertion among entries with the same
		 * ire_addr_v6.
		 */
		while (IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6,
		    &ire1->ire_addr_v6)) {
			irep = &ire1->ire_next;
			ire1 = *irep;
			if (ire1 == NULL)
				break;
		}
	}

	if (ire->ire_type == IRE_DEFAULT) {
		/*
		 * We keep a count of default gateways which is used when
		 * assigning them as routes.
		 */
		ipst->ips_ipv6_ire_default_count++;
		ASSERT(ipst->ips_ipv6_ire_default_count != 0); /* Wraparound */
	}
	/* Insert at *irep */
	ire1 = *irep;
	if (ire1 != NULL)
		ire1->ire_ptpn = &ire->ire_next;
	ire->ire_next = ire1;
	/* Link the new one in. */
	ire->ire_ptpn = irep;
	/*
	 * ire_walk routines de-reference ire_next without holding
	 * a lock. Before we point to the new ire, we want to make
	 * sure the store that sets the ire_next of the new ire
	 * reaches global visibility, so that ire_walk routines
	 * don't see a truncated list of ires i.e if the ire_next
	 * of the new ire gets set after we do "*irep = ire" due
	 * to re-ordering, the ire_walk thread will see a NULL
	 * once it accesses the ire_next of the new ire.
	 * membar_producer() makes sure that the following store
	 * happens *after* all of the above stores.
	 */
	membar_producer();
	*irep = ire;
	ire->ire_bucket = irb_ptr;
	/*
	 * We return a bumped up IRE above. Keep it symmetrical
	 * so that the callers will always have to release. This
	 * helps the callers of this function because they continue
	 * to use the IRE after adding and hence they don't have to
	 * lookup again after we return the IRE.
	 *
	 * NOTE : We don't have to use atomics as this is appearing
	 * in the list for the first time and no one else can bump
	 * up the reference count on this yet.
	 */
	IRE_REFHOLD_LOCKED(ire);
	BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_inserted);
	irb_ptr->irb_ire_cnt++;
	if (ire->ire_marks & IRE_MARK_TEMPORARY)
		irb_ptr->irb_tmp_ire_cnt++;

	/* Account for this ire on its ipif and on the send-to ill. */
	if (ire->ire_ipif != NULL) {
		DTRACE_PROBE3(ipif__incr__cnt, (ipif_t *), ire->ire_ipif,
		    (char *), "ire", (void *), ire);
		ire->ire_ipif->ipif_ire_cnt++;
		if (ire->ire_stq != NULL) {
			stq_ill = (ill_t *)ire->ire_stq->q_ptr;
			DTRACE_PROBE3(ill__incr__cnt, (ill_t *), stq_ill,
			    (char *), "ire", (void *), ire);
			stq_ill->ill_ire_cnt++;
		}
	} else {
		ASSERT(ire->ire_stq == NULL);
	}

	if (ndp_g_lock_held)
		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
	ire_atomic_end(irb_ptr, ire);

	if (pire != NULL) {
		/* Assert that it is not removed from the list yet */
		ASSERT(pire->ire_ptpn != NULL);
		IRB_REFRELE(pire->ire_bucket);
		ire_refrele(pire);
	}

	if (ire->ire_type != IRE_CACHE) {
		/*
		 * For ire's with host mask see if there is an entry
		 * in the cache. If there is one flush the whole cache as
		 * there might be multiple entries due to RTF_MULTIRT (CGTP).
		 * If no entry is found then there is no need to flush the
		 * cache.
		 */

		if (ip_mask_to_plen_v6(&ire->ire_mask_v6) == IPV6_ABITS) {
			ire_t *lire;
			lire = ire_ctable_lookup_v6(&ire->ire_addr_v6, NULL,
			    IRE_CACHE, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE,
			    ipst);
			if (lire != NULL) {
				ire_refrele(lire);
				ire_flush_cache_v6(ire, IRE_FLUSH_ADD);
			}
		} else {
			ire_flush_cache_v6(ire, IRE_FLUSH_ADD);
		}
	}

	*ire_p = ire;
	return (0);
}

/*
 * Search for all HOST REDIRECT routes that are
 * pointing at the specified gateway and
 * delete them. This routine is called only
 * when a default gateway is going away.
 *
 * Only the forwarding table for the longest prefix length (index
 * IP6_MASK_TABLE_SIZE - 1) is examined, and within it only entries
 * flagged RTF_DYNAMIC whose gateway equals 'gateway'.
 */
static void
ire_delete_host_redirects_v6(const in6_addr_t *gateway, ip_stack_t *ipst)
{
	irb_t *irb_ptr;
	irb_t *irb;
	ire_t *ire;
	in6_addr_t gw_addr_v6;
	int i;

	/* get the hash table for HOST routes */
	irb_ptr = ipst->ips_ip_forwarding_table_v6[(IP6_MASK_TABLE_SIZE - 1)];
	if (irb_ptr == NULL)
		return;
	for (i = 0; (i < ipst->ips_ip6_ftable_hash_size); i++) {
		irb = &irb_ptr[i];
		/*
		 * NOTE(review): presumably the bucket hold is what keeps
		 * ire_next usable after ire_delete() below - confirm.
		 */
		IRB_REFHOLD(irb);
		for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) {
			if (!(ire->ire_flags & RTF_DYNAMIC))
				continue;
			/* Read the gateway under ire_lock, compare outside. */
			mutex_enter(&ire->ire_lock);
			gw_addr_v6 = ire->ire_gateway_addr_v6;
			mutex_exit(&ire->ire_lock);
			if (IN6_ARE_ADDR_EQUAL(&gw_addr_v6, gateway))
				ire_delete(ire);
		}
		IRB_REFRELE(irb);
	}
}

/*
 * Delete all the cache entries with this 'addr'. This is the IPv6 counterpart
 * of ip_ire_clookup_and_delete. The difference being this function does not
 * return any value. IPv6 processing of a gratuitous ARP, as it stands, is
 * different than IPv4 in that, regardless of the presence of a cache entry
 * for this address, an ire_walk_v6 is done. Another difference is that unlike
 * in the case of IPv4 this does not take an ipif_t argument, since it is only
 * called by ip_arp_news and the match is always only on the address.
947 */ 948 void 949 ip_ire_clookup_and_delete_v6(const in6_addr_t *addr, ip_stack_t *ipst) 950 { 951 irb_t *irb; 952 ire_t *cire; 953 boolean_t found = B_FALSE; 954 955 irb = &ipst->ips_ip_cache_table_v6[IRE_ADDR_HASH_V6(*addr, 956 ipst->ips_ip6_cache_table_size)]; 957 IRB_REFHOLD(irb); 958 for (cire = irb->irb_ire; cire != NULL; cire = cire->ire_next) { 959 if (cire->ire_marks & IRE_MARK_CONDEMNED) 960 continue; 961 if (IN6_ARE_ADDR_EQUAL(&cire->ire_addr_v6, addr)) { 962 963 /* This signifies start of a match */ 964 if (!found) 965 found = B_TRUE; 966 if (cire->ire_type == IRE_CACHE) { 967 if (cire->ire_nce != NULL) 968 ndp_delete(cire->ire_nce); 969 ire_delete_v6(cire); 970 } 971 /* End of the match */ 972 } else if (found) 973 break; 974 } 975 IRB_REFRELE(irb); 976 } 977 978 /* 979 * Delete the specified IRE. 980 * All calls should use ire_delete(). 981 * Sometimes called as writer though not required by this function. 982 * 983 * NOTE : This function is called only if the ire was added 984 * in the list. 985 */ 986 void 987 ire_delete_v6(ire_t *ire) 988 { 989 in6_addr_t gw_addr_v6; 990 ip_stack_t *ipst = ire->ire_ipst; 991 992 ASSERT(ire->ire_refcnt >= 1); 993 ASSERT(ire->ire_ipversion == IPV6_VERSION); 994 995 if (ire->ire_type != IRE_CACHE) 996 ire_flush_cache_v6(ire, IRE_FLUSH_DELETE); 997 if (ire->ire_type == IRE_DEFAULT) { 998 /* 999 * when a default gateway is going away 1000 * delete all the host redirects pointing at that 1001 * gateway. 1002 */ 1003 mutex_enter(&ire->ire_lock); 1004 gw_addr_v6 = ire->ire_gateway_addr_v6; 1005 mutex_exit(&ire->ire_lock); 1006 ire_delete_host_redirects_v6(&gw_addr_v6, ipst); 1007 } 1008 } 1009 1010 /* 1011 * ire_walk routine to delete all IRE_CACHE and IRE_HOST type redirect 1012 * entries. 
1013 */ 1014 /*ARGSUSED1*/ 1015 void 1016 ire_delete_cache_v6(ire_t *ire, char *arg) 1017 { 1018 char addrstr1[INET6_ADDRSTRLEN]; 1019 char addrstr2[INET6_ADDRSTRLEN]; 1020 1021 if ((ire->ire_type & IRE_CACHE) || 1022 (ire->ire_flags & RTF_DYNAMIC)) { 1023 ip1dbg(("ire_delete_cache_v6: deleted %s type %d through %s\n", 1024 inet_ntop(AF_INET6, &ire->ire_addr_v6, 1025 addrstr1, sizeof (addrstr1)), 1026 ire->ire_type, 1027 inet_ntop(AF_INET6, &ire->ire_gateway_addr_v6, 1028 addrstr2, sizeof (addrstr2)))); 1029 ire_delete(ire); 1030 } 1031 1032 } 1033 1034 /* 1035 * ire_walk routine to delete all IRE_CACHE/IRE_HOST type redirect entries 1036 * that have a given gateway address. 1037 */ 1038 void 1039 ire_delete_cache_gw_v6(ire_t *ire, char *addr) 1040 { 1041 in6_addr_t *gw_addr = (in6_addr_t *)addr; 1042 char buf1[INET6_ADDRSTRLEN]; 1043 char buf2[INET6_ADDRSTRLEN]; 1044 in6_addr_t ire_gw_addr_v6; 1045 1046 if (!(ire->ire_type & IRE_CACHE) && 1047 !(ire->ire_flags & RTF_DYNAMIC)) 1048 return; 1049 1050 mutex_enter(&ire->ire_lock); 1051 ire_gw_addr_v6 = ire->ire_gateway_addr_v6; 1052 mutex_exit(&ire->ire_lock); 1053 1054 if (IN6_ARE_ADDR_EQUAL(&ire_gw_addr_v6, gw_addr)) { 1055 ip1dbg(("ire_delete_cache_gw_v6: deleted %s type %d to %s\n", 1056 inet_ntop(AF_INET6, &ire->ire_src_addr_v6, 1057 buf1, sizeof (buf1)), 1058 ire->ire_type