1 2535 sangeeta /* 2 2535 sangeeta * CDDL HEADER START 3 2535 sangeeta * 4 2535 sangeeta * The contents of this file are subject to the terms of the 5 2535 sangeeta * Common Development and Distribution License (the "License"). 6 2535 sangeeta * You may not use this file except in compliance with the License. 7 2535 sangeeta * 8 2535 sangeeta * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 2535 sangeeta * or http://www.opensolaris.org/os/licensing. 10 2535 sangeeta * See the License for the specific language governing permissions 11 2535 sangeeta * and limitations under the License. 12 2535 sangeeta * 13 2535 sangeeta * When distributing Covered Code, include this CDDL HEADER in each 14 2535 sangeeta * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 2535 sangeeta * If applicable, add the following below this CDDL HEADER, with the 16 2535 sangeeta * fields enclosed by brackets "[]" replaced with your own identifying 17 2535 sangeeta * information: Portions Copyright [yyyy] [name of copyright owner] 18 2535 sangeeta * 19 2535 sangeeta * CDDL HEADER END 20 2535 sangeeta */ 21 2535 sangeeta /* 22 8485 Peter * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 2535 sangeeta * Use is subject to license terms. 24 2535 sangeeta */ 25 2535 sangeeta 26 2535 sangeeta /* 27 2535 sangeeta * This file contains consumer routines of the IPv4 forwarding engine 28 2535 sangeeta */ 29 2535 sangeeta 30 2535 sangeeta #include <sys/types.h> 31 2535 sangeeta #include <sys/stream.h> 32 2535 sangeeta #include <sys/stropts.h> 33 2535 sangeeta #include <sys/strlog.h> 34 2535 sangeeta #include <sys/dlpi.h> 35 2535 sangeeta #include <sys/ddi.h> 36 2535 sangeeta #include <sys/cmn_err.h> 37 2535 sangeeta #include <sys/policy.h> 38 2535 sangeeta 39 2535 sangeeta #include <sys/systm.h> 40 2535 sangeeta #include <sys/strsun.h> 41 2535 sangeeta #include <sys/kmem.h> 42 2535 sangeeta #include <sys/param.h> 43 2535 sangeeta #include <sys/socket.h> 44 4482 dr146992 #include <sys/strsubr.h> 45 2535 sangeeta #include <net/if.h> 46 2535 sangeeta #include <net/route.h> 47 2535 sangeeta #include <netinet/in.h> 48 2535 sangeeta #include <net/if_dl.h> 49 2535 sangeeta #include <netinet/ip6.h> 50 2535 sangeeta #include <netinet/icmp6.h> 51 2535 sangeeta 52 11042 Erik #include <inet/ipsec_impl.h> 53 2535 sangeeta #include <inet/common.h> 54 2535 sangeeta #include <inet/mi.h> 55 2535 sangeeta #include <inet/mib2.h> 56 2535 sangeeta #include <inet/ip.h> 57 4482 dr146992 #include <inet/ip_impl.h> 58 2535 sangeeta #include <inet/ip6.h> 59 2535 sangeeta #include <inet/ip_ndp.h> 60 2535 sangeeta #include <inet/arp.h> 61 2535 sangeeta #include <inet/ip_if.h> 62 2535 sangeeta #include <inet/ip_ire.h> 63 2535 sangeeta #include <inet/ip_ftable.h> 64 2535 sangeeta #include <inet/ip_rts.h> 65 2535 sangeeta #include <inet/nd.h> 66 2535 sangeeta 67 2535 sangeeta #include <net/pfkeyv2.h> 68 2535 sangeeta #include <inet/sadb.h> 69 2535 sangeeta #include <inet/tcp.h> 70 2535 sangeeta #include <inet/ipclassifier.h> 71 2535 sangeeta #include <sys/zone.h> 72 2535 sangeeta #include <net/radix.h> 73 2535 sangeeta #include <sys/tsol/label.h> 74 2535 sangeeta #include <sys/tsol/tnet.h> 75 2535 sangeeta 76 2535 sangeeta #define IS_DEFAULT_ROUTE(ire) \ 77 2535 sangeeta (((ire)->ire_type & IRE_DEFAULT) || \ 78 2535 sangeeta (((ire)->ire_type & IRE_INTERFACE) && ((ire)->ire_addr == 0))) 79 2535 sangeeta 80 3448 dh155122 static ire_t *route_to_dst(const struct sockaddr *, zoneid_t, ip_stack_t *); 81 11042 Erik static void ire_del_host_redir(ire_t *, char *); 82 11042 Erik static boolean_t ire_find_best_route(struct radix_node *, void *); 83 2535 sangeeta 84 2535 sangeeta /* 85 2535 sangeeta * Lookup a route in forwarding table. A specific lookup is indicated by 86 2535 sangeeta * passing the required parameters and indicating the match required in the 87 2535 sangeeta * flag field. 88 2535 sangeeta * 89 2535 sangeeta * Supports IP_BOUND_IF by following the ipif/ill when recursing. 90 2535 sangeeta */ 91 2535 sangeeta ire_t * 92 11042 Erik ire_ftable_lookup_v4(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway, 93 11042 Erik int type, const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl, 94 11042 Erik int flags, uint32_t xmit_hint, ip_stack_t *ipst, uint_t *generationp) 95 2535 sangeeta { 96 11042 Erik ire_t *ire; 97 2535 sangeeta struct rt_sockaddr rdst, rmask; 98 2535 sangeeta struct rt_entry *rt; 99 2535 sangeeta ire_ftable_args_t margs; 100 2535 sangeeta 101 11042 Erik ASSERT(ill == NULL || !ill->ill_isv6); 102 2535 sangeeta 103 2535 sangeeta /* 104 11042 Erik * ire_match_args() will dereference ill if MATCH_IRE_ILL 105 11042 Erik * is set. 106 2535 sangeeta */ 107 11042 Erik if ((flags & MATCH_IRE_ILL) && (ill == NULL)) 108 2535 sangeeta return (NULL); 109 2535 sangeeta 110 11131 Erik bzero(&rdst, sizeof (rdst)); 111 2535 sangeeta rdst.rt_sin_len = sizeof (rdst); 112 2535 sangeeta rdst.rt_sin_family = AF_INET; 113 2535 sangeeta rdst.rt_sin_addr.s_addr = addr; 114 2535 sangeeta 115 11131 Erik bzero(&rmask, sizeof (rmask)); 116 2535 sangeeta rmask.rt_sin_len = sizeof (rmask); 117 2535 sangeeta rmask.rt_sin_family = AF_INET; 118 2535 sangeeta rmask.rt_sin_addr.s_addr = mask; 119 2535 sangeeta 120 11131 Erik bzero(&margs, sizeof (margs)); 121 2535 sangeeta margs.ift_addr = addr; 122 2535 sangeeta margs.ift_mask = mask; 123 2535 sangeeta margs.ift_gateway = gateway; 124 2535 sangeeta margs.ift_type = type; 125 11042 Erik margs.ift_ill = ill; 126 2535 sangeeta margs.ift_zoneid = zoneid; 127 2535 sangeeta margs.ift_tsl = tsl; 128 2535 sangeeta margs.ift_flags = flags; 129 2535 sangeeta 130 2535 sangeeta /* 131 2535 sangeeta * The flags argument passed to ire_ftable_lookup may cause the 132 2535 sangeeta * search to return, not the longest matching prefix, but the 133 2535 sangeeta * "best matching prefix", i.e., the longest prefix that also 134 2535 sangeeta * satisfies constraints imposed via the permutation of flags 135 2535 sangeeta * passed in. To achieve this, we invoke ire_match_args() on 136 2535 sangeeta * each matching leaf in the radix tree. ire_match_args is 137 2535 sangeeta * invoked by the callback function ire_find_best_route() 138 2535 sangeeta * We hold the global tree lock in read mode when calling 139 11042 Erik * rn_match_args. Before dropping the global tree lock, ensure 140 2535 sangeeta * that the radix node can't be deleted by incrementing ire_refcnt. 141 2535 sangeeta */ 142 3448 dh155122 RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); 143 3448 dh155122 rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst, 144 3448 dh155122 ipst->ips_ip_ftable, ire_find_best_route, &margs); 145 2535 sangeeta ire = margs.ift_best_ire; 146 2535 sangeeta if (rt == NULL) { 147 11042 Erik RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 148 2535 sangeeta return (NULL); 149 2535 sangeeta } 150 11042 Erik ASSERT(ire != NULL); 151 2535 sangeeta 152 2535 sangeeta DTRACE_PROBE2(ire__found, ire_ftable_args_t *, &margs, ire_t *, ire); 153 2535 sangeeta 154 2535 sangeeta /* 155 2535 sangeeta * round-robin only if we have more than one route in the bucket. 156 11042 Erik * ips_ip_ecmp_behavior controls when we do ECMP 157 11042 Erik * 2: always 158 11042 Erik * 1: for IRE_DEFAULT and /0 IRE_INTERFACE 159 11042 Erik * 0: never 160 2535 sangeeta */ 161 11042 Erik if (ire->ire_bucket->irb_ire_cnt > 1 && !(flags & MATCH_IRE_GW)) { 162 11042 Erik if (ipst->ips_ip_ecmp_behavior == 2 || 163 11042 Erik (ipst->ips_ip_ecmp_behavior == 1 && 164 11042 Erik IS_DEFAULT_ROUTE(ire))) { 165 11042 Erik ire_t *next_ire; 166 2535 sangeeta 167 11042 Erik margs.ift_best_ire = NULL; 168 11042 Erik next_ire = ire_round_robin(ire->ire_bucket, &margs, 169 11042 Erik xmit_hint, ire, ipst); 170 11042 Erik if (next_ire == NULL) { 171 11042 Erik /* keep ire if next_ire is null */ 172 11042 Erik goto done; 173 11042 Erik } 174 11042 Erik ire_refrele(ire); 175 2535 sangeeta ire = next_ire; 176 2535 sangeeta } 177 2535 sangeeta } 178 11042 Erik 179 11042 Erik done: 180 11042 Erik /* Return generation before dropping lock */ 181 11042 Erik if (generationp != NULL) 182 11042 Erik *generationp = ire->ire_generation; 183 11042 Erik 184 11042 Erik RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 185 11042 Erik 186 11042 Erik /* 187 11042 Erik * For shared-IP zones we need additional checks to what was 188 11042 Erik * done in ire_match_args to make sure IRE_LOCALs are handled. 189 11042 Erik * 190 11042 Erik * When ip_restrict_interzone_loopback is set, then 191 11042 Erik * we ensure that IRE_LOCAL are only used for loopback 192 11042 Erik * between zones when the logical "Ethernet" would 193 11042 Erik * have looped them back. That is, if in the absense of 194 11042 Erik * the IRE_LOCAL we would have sent to packet out the 195 11042 Erik * same ill. 196 11042 Erik */ 197 11042 Erik if ((ire->ire_type & IRE_LOCAL) && zoneid != ALL_ZONES && 198 11042 Erik ire->ire_zoneid != zoneid && ire->ire_zoneid != ALL_ZONES && 199 11042 Erik ipst->ips_ip_restrict_interzone_loopback) { 200 11042 Erik ire = ire_alt_local(ire, zoneid, tsl, ill, generationp); 201 11042 Erik ASSERT(ire != NULL); 202 2535 sangeeta } 203 2535 sangeeta return (ire); 204 2535 sangeeta } 205 2535 sangeeta 206 8275 Eric /* 207 8275 Eric * This function is called by 208 11042 Erik * ip_input/ire_route_recursive when doing a route lookup on only the 209 11042 Erik * destination address. 210 11042 Erik * 211 8275 Eric * The optimizations of this function over ire_ftable_lookup are: 212 8275 Eric * o removing unnecessary flag matching 213 8275 Eric * o doing longest prefix match instead of overloading it further 214 8275 Eric * with the unnecessary "best_prefix_match" 215 11042 Erik * 216 11042 Erik * If no route is found we return IRE_NOROUTE. 217 8275 Eric */ 218 11042 Erik ire_t * 219 11042 Erik ire_ftable_lookup_simple_v4(ipaddr_t addr, uint32_t xmit_hint, ip_stack_t *ipst, 220 11042 Erik uint_t *generationp) 221 8275 Eric { 222 11042 Erik ire_t *ire; 223 8275 Eric struct rt_sockaddr rdst; 224 8275 Eric struct rt_entry *rt; 225 11042 Erik irb_t *irb; 226 8275 Eric 227 8275 Eric rdst.rt_sin_len = sizeof (rdst); 228 8275 Eric rdst.rt_sin_family = AF_INET; 229 8275 Eric rdst.rt_sin_addr.s_addr = addr; 230 8275 Eric 231 8275 Eric /* 232 8275 Eric * This is basically inlining a simpler version of ire_match_args 233 8275 Eric */ 234 8275 Eric RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); 235 8275 Eric 236 8275 Eric rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst, 237 8275 Eric ipst->ips_ip_ftable, NULL, NULL); 238 8275 Eric 239 11042 Erik if (rt == NULL) 240 11042 Erik goto bad; 241 11042 Erik 242 11042 Erik irb = &rt->rt_irb; 243 11042 Erik if (irb->irb_ire_cnt == 0) 244 11042 Erik goto bad; 245 11042 Erik 246 11042 Erik rw_enter(&irb->irb_lock, RW_READER); 247 11042 Erik ire = irb->irb_ire; 248 11042 Erik if (ire == NULL) { 249 11042 Erik rw_exit(&irb->irb_lock); 250 11042 Erik goto bad; 251 8275 Eric } 252 11042 Erik while (IRE_IS_CONDEMNED(ire)) { 253 11042 Erik ire = ire->ire_next; 254 11042 Erik if (ire == NULL) { 255 11042 Erik rw_exit(&irb->irb_lock); 256 11042 Erik goto bad; 257 11042 Erik } 258 8275 Eric } 259 8275 Eric 260 11042 Erik /* we have a ire that matches */ 261 11042 Erik ire_refhold(ire); 262 11042 Erik rw_exit(&irb->irb_lock); 263 11042 Erik 264 11042 Erik /* 265 11042 Erik * round-robin only if we have more than one route in the bucket. 266 11042 Erik * ips_ip_ecmp_behavior controls when we do ECMP 267 11042 Erik * 2: always 268 11042 Erik * 1: for IRE_DEFAULT and /0 IRE_INTERFACE 269 11042 Erik * 0: never 270 11042 Erik * 271 11042 Erik * Note: if we found an IRE_IF_CLONE we won't look at the bucket with 272 11042 Erik * other ECMP IRE_INTERFACEs since the IRE_IF_CLONE is a /128 match 273 11042 Erik * and the IRE_INTERFACESs are likely to be shorter matches. 274 11042 Erik */ 275 11042 Erik if (ire->ire_bucket->irb_ire_cnt > 1) { 276 11042 Erik if (ipst->ips_ip_ecmp_behavior == 2 || 277 11042 Erik (ipst->ips_ip_ecmp_behavior == 1 && 278 11042 Erik IS_DEFAULT_ROUTE(ire))) { 279 11042 Erik ire_t *next_ire; 280 11042 Erik ire_ftable_args_t margs; 281 11042 Erik 282 11131 Erik bzero(&margs, sizeof (margs)); 283 11042 Erik margs.ift_addr = addr; 284 11042 Erik margs.ift_zoneid = ALL_ZONES; 285 11042 Erik 286 11042 Erik next_ire = ire_round_robin(ire->ire_bucket, &margs, 287 11042 Erik xmit_hint, ire, ipst); 288 11042 Erik if (next_ire == NULL) { 289 11042 Erik /* keep ire if next_ire is null */ 290 11042 Erik if (generationp != NULL) 291 11042 Erik *generationp = ire->ire_generation; 292 11042 Erik RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 293 11042 Erik return (ire); 294 11042 Erik } 295 11042 Erik ire_refrele(ire); 296 11042 Erik ire = next_ire; 297 11042 Erik } 298 8275 Eric } 299 11042 Erik /* Return generation before dropping lock */ 300 11042 Erik if (generationp != NULL) 301 11042 Erik *generationp = ire->ire_generation; 302 8275 Eric 303 8275 Eric RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 304 8275 Eric 305 8275 Eric /* 306 11042 Erik * Since we only did ALL_ZONES matches there is no special handling 307 11042 Erik * of IRE_LOCALs needed here. ire_ftable_lookup_v4 has to handle that. 308 8275 Eric */ 309 11042 Erik return (ire); 310 8275 Eric 311 11042 Erik bad: 312 11042 Erik if (generationp != NULL) 313 11042 Erik *generationp = IRE_GENERATION_VERIFY; 314 8275 Eric 315 11042 Erik RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 316 11042 Erik return (ire_reject(ipst, B_FALSE)); 317 8275 Eric } 318 2535 sangeeta 319 2535 sangeeta /* 320 11042 Erik * Find the ill matching a multicast group. 321 2535 sangeeta * Allows different routes for multicast addresses 322 2535 sangeeta * in the unicast routing table (akin to 224.0.0.0 but could be more specific) 323 2535 sangeeta * which point at different interfaces. This is used when IP_MULTICAST_IF 324 2535 sangeeta * isn't specified (when sending) and when IP_ADD_MEMBERSHIP doesn't 325 2535 sangeeta * specify the interface to join on. 326 2535 sangeeta * 327 11042 Erik * Supports link-local addresses by using ire_route_recursive which follows 328 11042 Erik * the ill when recursing. 329 11042 Erik * 330 11042 Erik * To handle CGTP, since we don't have a separate IRE_MULTICAST for each group 331 11042 Erik * and the MULTIRT property can be different for different groups, we 332 11042 Erik * extract RTF_MULTIRT from the special unicast route added for a group 333 11042 Erik * with CGTP and pass that back in the multirtp argument. 334 11042 Erik * This is used in ip_set_destination etc to set ixa_postfragfn for multicast. 335 11042 Erik * We have a setsrcp argument for the same reason. 336 2535 sangeeta */ 337 11042 Erik ill_t * 338 11042 Erik ire_lookup_multi_ill_v4(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst, 339 11042 Erik boolean_t *multirtp, ipaddr_t *setsrcp) 340 2535 sangeeta { 341 2535 sangeeta ire_t *ire; 342 11042 Erik ill_t *ill; 343 2535 sangeeta 344 11042 Erik ire = ire_route_recursive_v4(group, 0, NULL, zoneid, NULL, 345 11042 Erik MATCH_IRE_DSTONLY, B_FALSE, 0, ipst, setsrcp, NULL, NULL); 346 11042 Erik ASSERT(ire != NULL); 347 11042 Erik if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 348 2535 sangeeta ire_refrele(ire); 349 2535 sangeeta return (NULL); 350 2535 sangeeta } 351 11042 Erik 352 11042 Erik if (multirtp != NULL) 353 11042 Erik *multirtp = (ire->ire_flags & RTF_MULTIRT) != 0; 354 11042 Erik 355 11042 Erik ill = ire_nexthop_ill(ire); 356 11042 Erik ire_refrele(ire); 357 11042 Erik return (ill); 358 2535 sangeeta } 359 2535 sangeeta 360 2535 sangeeta /* 361 2535 sangeeta * Delete the passed in ire if the gateway addr matches 362 2535 sangeeta */ 363 2535 sangeeta void 364 2535 sangeeta ire_del_host_redir(ire_t *ire, char *gateway) 365 2535 sangeeta { 366 3004 dd193516 if ((ire->ire_flags & RTF_DYNAMIC) && 367 2535 sangeeta (ire->ire_gateway_addr == *(ipaddr_t *)gateway)) 368 2535 sangeeta ire_delete(ire); 369 2535 sangeeta } 370 2535 sangeeta 371 2535 sangeeta /* 372 11042 Erik * Search for all IRE_HOST RTF_DYNAMIC (aka redirect) routes that are 373 2535 sangeeta * pointing at the specified gateway and 374 2535 sangeeta * delete them. This routine is called only 375 2535 sangeeta * when a default gateway is going away. 376 2535 sangeeta */ 377 2535 sangeeta void 378 3448 dh155122 ire_delete_host_redirects(ipaddr_t gateway, ip_stack_t *ipst) 379 2535 sangeeta { 380 2535 sangeeta struct rtfuncarg rtfarg; 381 2535 sangeeta 382 11131 Erik bzero(&rtfarg, sizeof (rtfarg)); 383 2535 sangeeta rtfarg.rt_func = ire_del_host_redir; 384 2535 sangeeta rtfarg.rt_arg = (void *)&gateway; 385 11131 Erik rtfarg.rt_zoneid = ALL_ZONES; 386 11131 Erik rtfarg.rt_ipst = ipst; 387 3448 dh155122 (void) ipst->ips_ip_ftable->rnh_walktree_mt(ipst->ips_ip_ftable, 388 3448 dh155122 rtfunc, &rtfarg, irb_refhold_rn, irb_refrele_rn); 389 2535 sangeeta } 390 2535 sangeeta 391 2535 sangeeta /* 392 3448 dh155122 * Obtain the rt_entry and rt_irb for the route to be added to 393 3448 dh155122 * the ips_ip_ftable. 394 2535 sangeeta * First attempt to add a node to the radix tree via rn_addroute. If the 395 2535 sangeeta * route already exists, return the bucket for the existing route. 396 2535 sangeeta * 397 2535 sangeeta * Locking notes: Need to hold the global radix tree lock in write mode to 398 2535 sangeeta * add a radix node. To prevent the node from being deleted, ire_get_bucket() 399 2535 sangeeta * returns with a ref'ed irb_t. The ire itself is added in ire_add_v4() 400 2535 sangeeta * while holding the irb_lock, but not the radix tree lock. 401 2535 sangeeta */ 402 2535 sangeeta irb_t * 403 2535 sangeeta ire_get_bucket(ire_t *ire) 404 2535 sangeeta { 405 2535 sangeeta struct radix_node *rn; 406 2535 sangeeta struct rt_entry *rt; 407 2535 sangeeta struct rt_sockaddr rmask, rdst; 408 2535 sangeeta irb_t *irb = NULL; 409 3448 dh155122 ip_stack_t *ipst = ire->ire_ipst; 410 2535 sangeeta 411 3448 dh155122 ASSERT(ipst->ips_ip_ftable != NULL); 412 2535 sangeeta 413 2535 sangeeta /* first try to see if route exists (based on rtalloc1) */ 414 11131 Erik bzero(&rdst, sizeof (rdst)); 415 2535 sangeeta rdst.rt_sin_len = sizeof (rdst); 416 2535 sangeeta rdst.rt_sin_family = AF_INET; 417 2535 sangeeta rdst.rt_sin_addr.s_addr = ire->ire_addr; 418 2535 sangeeta 419 11131 Erik bzero(&rmask, sizeof (rmask)); 420 2535 sangeeta rmask.rt_sin_len = sizeof (rmask); 421 2535 sangeeta rmask.rt_sin_family = AF_INET; 422 2535 sangeeta rmask.rt_sin_addr.s_addr = ire->ire_mask; 423 2535 sangeeta 424 2535 sangeeta /* 425 2535 sangeeta * add the route. based on BSD's rtrequest1(RTM_ADD) 426 2535 sangeeta */ 427 2535 sangeeta R_Malloc(rt, rt_entry_cache, sizeof (*rt)); 428 5090 sangeeta /* kmem_alloc failed */ 429 5090 sangeeta if (rt == NULL) 430 5090 sangeeta return (NULL); 431 5090 sangeeta 432 11131 Erik bzero(rt, sizeof (*rt)); 433 2535 sangeeta rt->rt_nodes->rn_key = (char *)&rt->rt_dst; 434 2535 sangeeta rt->rt_dst = rdst; 435 2535 sangeeta irb = &rt->rt_irb; 436 11042 Erik irb->irb_marks |= IRB_MARK_DYNAMIC; /* dynamically allocated/freed */ 437 3448 dh155122 irb->irb_ipst = ipst; 438 2535 sangeeta rw_init(&irb->irb_lock, NULL, RW_DEFAULT, NULL); 439 3448 dh155122 RADIX_NODE_HEAD_WLOCK(ipst->ips_ip_ftable); 440 3448 dh155122 rn = ipst->ips_ip_ftable->rnh_addaddr(&rt->rt_dst, &rmask, 441 3448 dh155122 ipst->ips_ip_ftable, (struct radix_node *)rt); 442 2535 sangeeta if (rn == NULL) { 443 3448 dh155122 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 444 2535 sangeeta Free(rt, rt_entry_cache); 445 2535 sangeeta rt = NULL; 446 2535 sangeeta irb = NULL; 447 3448 dh155122 RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); 448 3448 dh155122 rn = ipst->ips_ip_ftable->rnh_lookup(&rdst, &rmask, 449 3448 dh155122 ipst->ips_ip_ftable); 450 3448 dh155122 if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { 451 2535 sangeeta /* found a non-root match */ 452 2535 sangeeta rt = (struct rt_entry *)rn; 453 2535 sangeeta } 454 2535 sangeeta } 455 2535 sangeeta if (rt != NULL) { 456 2535 sangeeta irb = &rt->rt_irb; 457 11042 Erik irb_refhold(irb); 458 2535 sangeeta } 459 3448 dh155122 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 460 2535 sangeeta return (irb); 461 2535 sangeeta } 462 2535 sangeeta 463 2535 sangeeta /* 464 2535 sangeeta * This function is used when the caller wants to know the outbound 465 2535 sangeeta * interface for a packet given only the address. 466 2535 sangeeta * If this is a offlink IP address and there are multiple 467 2535 sangeeta * routes to this destination, this routine will utilise the 468 2535 sangeeta * first route it finds to IP address 469 2535 sangeeta * Return values: 470 2535 sangeeta * 0 - FAILURE 471 2535 sangeeta * nonzero - ifindex 472 2535 sangeeta */ 473 2535 sangeeta uint_t 474 2535 sangeeta ifindex_lookup(const struct sockaddr *ipaddr, zoneid_t zoneid) 475 2535 sangeeta { 476 2535 sangeeta uint_t ifindex = 0; 477 2535 sangeeta ire_t *ire; 478 2535 sangeeta ill_t *ill; 479 3448 dh155122 netstack_t *ns; 480 3448 dh155122 ip_stack_t *ipst; 481 2535 sangeeta 482 3448 dh155122 if (zoneid == ALL_ZONES) 483 3448 dh155122 ns = netstack_find_by_zoneid(GLOBAL_ZONEID); 484 3448 dh155122 else 485 3448 dh155122 ns = netstack_find_by_zoneid(zoneid); 486 3448 dh155122 ASSERT(ns != NULL); 487 3448 dh155122 488 3448 dh155122 /* 489 3448 dh155122 * For exclusive stacks we set the zoneid to zero 490 3448 dh155122 * since IP uses the global zoneid in the exclusive stacks. 491 3448 dh155122 */ 492 3448 dh155122 if (ns->netstack_stackid != GLOBAL_NETSTACKID) 493 3448 dh155122 zoneid = GLOBAL_ZONEID; 494 3448 dh155122 ipst = ns->netstack_ip; 495 2535 sangeeta 496 2535 sangeeta ASSERT(ipaddr->sa_family == AF_INET || ipaddr->sa_family == AF_INET6); 497 2535 sangeeta 498 11042 Erik if ((ire = route_to_dst(ipaddr, zoneid, ipst)) != NULL) { 499 11042 Erik ill = ire_nexthop_ill(ire); 500 11042 Erik if (ill != NULL) { 501 2535 sangeeta ifindex = ill->ill_phyint->phyint_ifindex; 502 11042 Erik ill_refrele(ill); 503 11042 Erik } 504 2535 sangeeta ire_refrele(ire); 505 2535 sangeeta } 506 3448 dh155122 netstack_rele(ns); 507 2535 sangeeta return (ifindex); 508 2535 sangeeta } 509 2535 sangeeta 510 2535 sangeeta /* 511 2535 sangeeta * Routine to find the route to a destination. If a ifindex is supplied 512 11042 Erik * it tries to match the route to the corresponding ipif for the ifindex 513 2535 sangeeta */ 514 2535 sangeeta static ire_t * 515 3448 dh155122 route_to_dst(const struct sockaddr *dst_addr, zoneid_t zoneid, ip_stack_t *ipst) 516 2535 sangeeta { 517 2535 sangeeta ire_t *ire = NULL; 518 2535 sangeeta int match_flags; 519 2535 sangeeta 520 11042 Erik match_flags = MATCH_IRE_DSTONLY; 521 2535 sangeeta 522 2535 sangeeta /* XXX pass NULL tsl for now */ 523 2535 sangeeta 524 2535 sangeeta if (dst_addr->sa_family == AF_INET) { 525 11042 Erik ire = ire_route_recursive_v4( 526 11042 Erik ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr, 0, NULL, 527 11042 Erik zoneid, NULL, match_flags, B_TRUE, 0, ipst, NULL, NULL, 528 11042 Erik NULL); 529 2535 sangeeta } else { 530 11042 Erik ire = ire_route_recursive_v6( 531 11042 Erik &((struct sockaddr_in6 *)dst_addr)->sin6_addr, 0, NULL, 532 11042 Erik zoneid, NULL, match_flags, B_TRUE, 0, ipst, NULL, NULL, 533 11042 Erik NULL); 534 11042 Erik } 535 11042 Erik ASSERT(ire != NULL); 536 11042 Erik if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 537 11042 Erik ire_refrele(ire); 538 11042 Erik return (NULL); 539 2535 sangeeta } 540 2535 sangeeta return (ire); 541 2535 sangeeta } 542 2535 sangeeta 543 2535 sangeeta /* 544 2535 sangeeta * This routine is called by IP Filter to send a packet out on the wire 545 11042 Erik * to a specified dstination (which may be onlink or offlink). The ifindex may 546 11042 Erik * or may not be 0. A non-null ifindex indicates IP Filter has stipulated 547 2535 sangeeta * an outgoing interface and requires the nexthop to be on that interface. 548 4482 dr146992 * IP WILL NOT DO the following to the data packet before sending it out: 549 2535 sangeeta * a. manipulate ttl 550 4482 dr146992 * b. ipsec work 551 4482 dr146992 * c. fragmentation 552 4482 dr146992 * 553 4482 dr146992 * If the packet has been prepared for hardware checksum then it will be 554 4482 dr146992 * passed off to ip_send_align_cksum() to check that the flags set on the 555 4482 dr146992 * packet are in alignment with the capabilities of the new outgoing NIC. 556 2535 sangeeta * 557 2535 sangeeta * Return values: 558 2535 sangeeta * 0: IP was able to send of the data pkt 559 2535 sangeeta * ECOMM: Could not send packet 560 2535 sangeeta * ENONET No route to dst. It is up to the caller 561 2535 sangeeta * to send icmp unreachable error message, 562 2535 sangeeta * EINPROGRESS The macaddr of the onlink dst or that 563 2535 sangeeta * of the offlink dst's nexthop needs to get 564 2535 sangeeta * resolved before packet can be sent to dst. 565 2535 sangeeta * Thus transmission is not guaranteed. 566 11042 Erik * Note: No longer have visibility to the ARP queue 567 11042 Erik * hence no EINPROGRESS. 568 2535 sangeeta */ 569 2535 sangeeta int 570 2535 sangeeta ipfil_sendpkt(const struct sockaddr *dst_addr, mblk_t *mp, uint_t ifindex, 571 2535 sangeeta zoneid_t zoneid) 572 2535 sangeeta { 573 11042 Erik ipaddr_t nexthop; 574 3448 dh155122 netstack_t *ns; 575 3448 dh155122 ip_stack_t *ipst; 576 11042 Erik ip_xmit_attr_t ixas; 577 11042 Erik int error; 578 2535 sangeeta 579 2535 sangeeta ASSERT(mp != NULL); 580 3448 dh155122 581 3448 dh155122 if (zoneid == ALL_ZONES) 582 3448 dh155122 ns = netstack_find_by_zoneid(GLOBAL_ZONEID); 583 3448 dh155122 else 584 3448 dh155122 ns = netstack_find_by_zoneid(zoneid); 585 3448 dh155122 ASSERT(ns != NULL); 586 3448 dh155122 587 3448 dh155122 /* 588 3448 dh155122 * For exclusive stacks we set the zoneid to zero 589 3448 dh155122 * since IP uses the global zoneid in the exclusive stacks. 590 3448 dh155122 */ 591 3448 dh155122 if (ns->netstack_stackid != GLOBAL_NETSTACKID) 592 3448 dh155122 zoneid = GLOBAL_ZONEID; 593 3448 dh155122 ipst = ns->netstack_ip; 594 2535 sangeeta 595 2535 sangeeta ASSERT(dst_addr->sa_family == AF_INET || 596 2535 sangeeta dst_addr->sa_family == AF_INET6); 597 2535 sangeeta 598 11042 Erik bzero(&ixas, sizeof (ixas)); 599 11042 Erik /* 600 11042 Erik * No IPsec, no fragmentation, and don't let any hooks see 601 11042 Erik * the packet. 602 11042 Erik */ 603 11042 Erik ixas.ixa_flags = IXAF_NO_IPSEC | IXAF_DONTFRAG | IXAF_NO_PFHOOK; 604 11042 Erik ixas.ixa_cred = kcred; 605 11042 Erik ixas.ixa_cpid = NOPID; 606 11042 Erik ixas.ixa_tsl = NULL; 607 11042 Erik ixas.ixa_ipst = ipst; 608 11042 Erik ixas.ixa_ifindex = ifindex; 609 11042 Erik 610 2535 sangeeta if (dst_addr->sa_family == AF_INET) { 611 11042 Erik ipha_t *ipha = (ipha_t *)mp->b_rptr; 612 11042 Erik 613 11042 Erik ixas.ixa_flags |= IXAF_IS_IPV4; 614 11042 Erik nexthop = ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr; 615 11042 Erik if (nexthop != ipha->ipha_dst) { 616 11042 Erik ixas.ixa_flags |= IXAF_NEXTHOP_SET; 617 11042 Erik ixas.ixa_nexthop_v4 = nexthop; 618 11042 Erik } 619 11042 Erik ixas.ixa_multicast_ttl = ipha->ipha_ttl; 620 2535 sangeeta } else { 621 11042 Erik ip6_t *ip6h = (ip6_t *)mp->b_rptr; 622 11042 Erik in6_addr_t *nexthop6; 623 11042 Erik 624 11042 Erik nexthop6 = &((struct sockaddr_in6 *)dst_addr)->sin6_addr; 625 11042 Erik if (!IN6_ARE_ADDR_EQUAL(nexthop6, &ip6h->ip6_dst)) { 626 11042 Erik ixas.ixa_flags |= IXAF_NEXTHOP_SET; 627 11042 Erik ixas.ixa_nexthop_v6 = *nexthop6; 628 11042 Erik } 629 11042 Erik ixas.ixa_multicast_ttl = ip6h->ip6_hops; 630 2535 sangeeta } 631 11042 Erik error = ip_output_simple(mp, &ixas); 632 11042 Erik ixa_cleanup(&ixas); 633 2535 sangeeta 634 11042 Erik netstack_rele(ns); 635 11042 Erik switch (error) { 636 11042 Erik case 0: 637 11042 Erik break; 638 2535 sangeeta 639 11042 Erik case EHOSTUNREACH: 640 11042 Erik case ENETUNREACH: 641 11042 Erik error = ENONET; 642 11042 Erik break; 643 2535 sangeeta 644 11042 Erik default: 645 11042 Erik error = ECOMM; 646 2535 sangeeta break; 647 2535 sangeeta } 648 11042 Erik return (error); 649 4482 dr146992 } 650 4482 dr146992 651 2535 sangeeta /* 652 2535 sangeeta * callback function provided by ire_ftable_lookup when calling 653 2535 sangeeta * rn_match_args(). Invoke ire_match_args on each matching leaf node in 654 2535 sangeeta * the radix tree. 655 2535 sangeeta */ 656 2535 sangeeta boolean_t 657 2535 sangeeta ire_find_best_route(struct radix_node *rn, void *arg) 658 2535 sangeeta { 659 2535 sangeeta struct rt_entry *rt = (struct rt_entry *)rn; 660 2535 sangeeta irb_t *irb_ptr; 661 2535 sangeeta ire_t *ire; 662 2535 sangeeta ire_ftable_args_t *margs = arg; 663 2535 sangeeta ipaddr_t match_mask; 664 2535 sangeeta 665 2535 sangeeta ASSERT(rt != NULL); 666 2535 sangeeta 667 2535 sangeeta irb_ptr = &rt->rt_irb; 668 2535 sangeeta 669 2535 sangeeta if (irb_ptr->irb_ire_cnt == 0) 670 2535 sangeeta return (B_FALSE); 671 2535 sangeeta 672 2535 sangeeta rw_enter(&irb_ptr->irb_lock, RW_READER); 673 2535 sangeeta for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) { 674 11042 Erik if (IRE_IS_CONDEMNED(ire)) 675 2535 sangeeta continue; 676 11042 Erik if (margs->ift_flags & (MATCH_IRE_MASK|MATCH_IRE_SHORTERMASK)) 677 2535 sangeeta match_mask = margs->ift_mask; 678 2535 sangeeta else 679 2535 sangeeta match_mask = ire->ire_mask; 680 2535 sangeeta 681 2535 sangeeta if (ire_match_args(ire, margs->ift_addr, match_mask, 682 11042 Erik margs->ift_gateway, margs->ift_type, margs->ift_ill, 683 11042 Erik margs->ift_zoneid, margs->ift_tsl, 684 11042 Erik margs->ift_flags)) { 685 11042 Erik ire_refhold(ire); 686 2535 sangeeta rw_exit(&irb_ptr->irb_lock); 687 2535 sangeeta margs->ift_best_ire = ire; 688 2535 sangeeta return (B_TRUE); 689 2535 sangeeta } 690 2535 sangeeta } 691 2535 sangeeta rw_exit(&irb_ptr->irb_lock); 692 2535 sangeeta return (B_FALSE); 693 2535 sangeeta } 694 2535 sangeeta 695 2535 sangeeta /* 696 2535 sangeeta * ftable irb_t structures are dynamically allocated, and we need to 697 2535 sangeeta * check if the irb_t (and associated ftable tree attachment) needs to 698 2535 sangeeta * be cleaned up when the irb_refcnt goes to 0. The conditions that need 699 2535 sangeeta * be verified are: 700 2535 sangeeta * - no other walkers of the irebucket, i.e., quiescent irb_refcnt, 701 2535 sangeeta * - no other threads holding references to ire's in the bucket, 702 2535 sangeeta * i.e., irb_nire == 0 703 2535 sangeeta * - no active ire's in the bucket, i.e., irb_ire_cnt == 0 704 2535 sangeeta * - need to hold the global tree lock and irb_lock in write mode. 705 2535 sangeeta */ 706 2535 sangeeta void 707 2535 sangeeta irb_refrele_ftable(irb_t *irb) 708 2535 sangeeta { 709 2535 sangeeta for (;;) { 710 2535 sangeeta rw_enter(&irb->irb_lock, RW_WRITER); 711 2535 sangeeta ASSERT(irb->irb_refcnt != 0); 712 2535 sangeeta if (irb->irb_refcnt != 1) { 713 2535 sangeeta /* 714 2535 sangeeta * Someone has a reference to this radix node 715 2535 sangeeta * or there is some bucket walker. 716 2535 sangeeta */ 717 2535 sangeeta irb->irb_refcnt--; 718 2535 sangeeta rw_exit(&irb->irb_lock); 719 2535 sangeeta return; 720 2535 sangeeta } else { 721 2535 sangeeta /* 722 2535 sangeeta * There is no other walker, nor is there any 723 2535 sangeeta * other thread that holds a direct ref to this 724 2535 sangeeta * radix node. Do the clean up if needed. Call 725 2535 sangeeta * to ire_unlink will clear the IRB_MARK_CONDEMNED flag 726 2535 sangeeta */ 727 2535 sangeeta if (irb->irb_marks & IRB_MARK_CONDEMNED) { 728 2535 sangeeta ire_t *ire_list; 729 2535 sangeeta 730 2535 sangeeta ire_list = ire_unlink(irb); 731 2535 sangeeta rw_exit(&irb->irb_lock); 732 2535 sangeeta 733 2535 sangeeta if (ire_list != NULL) 734 2535 sangeeta ire_cleanup(ire_list); 735 2535 sangeeta /* 736 2535 sangeeta * more CONDEMNED entries could have 737 2535 sangeeta * been added while we dropped the lock, 738 2535 sangeeta * so we have to re-check. 739 2535 sangeeta */ 740 2535 sangeeta continue; 741 2535 sangeeta } 742 2535 sangeeta 743 2535 sangeeta /* 744 2535 sangeeta * Now check if there are still any ires 745 2535 sangeeta * associated with this radix node. 746 2535 sangeeta */ 747 2535 sangeeta if (irb->irb_nire != 0) { 748 2535 sangeeta /* 749 2535 sangeeta * someone is still holding on 750 2535 sangeeta * to ires in this bucket 751 2535 sangeeta */ 752 2535 sangeeta irb->irb_refcnt--; 753 2535 sangeeta rw_exit(&irb->irb_lock); 754 2535 sangeeta return; 755 2535 sangeeta } else { 756 2535 sangeeta /* 757 2535 sangeeta * Everything is clear. Zero walkers, 758 2535 sangeeta * Zero threads with a ref to this 759 2535 sangeeta * radix node, Zero ires associated with 760 2535 sangeeta * this radix node. Due to lock order, 761 2535 sangeeta * check the above conditions again 762 2535 sangeeta * after grabbing all locks in the right order 763 2535 sangeeta */ 764 2535 sangeeta rw_exit(&irb->irb_lock); 765 2535 sangeeta if (irb_inactive(irb)) 766 2535 sangeeta return; 767 2535 sangeeta /* 768 2535 sangeeta * irb_inactive could not free the irb. 769 2535 sangeeta * See if there are any walkers, if not 770 2535 sangeeta * try to clean up again. 771 2535 sangeeta */ 772 2535 sangeeta } 773 2535 sangeeta } 774 2535 sangeeta } 775 2535 sangeeta } 776 2535 sangeeta 777 2535 sangeeta /* 778 11042 Erik * IRE iterator used by ire_ftable_lookup to process multiple equal 779 11042 Erik * routes. Given a starting point in the hash list (hash), walk the IREs 780 11042 Erik * in the bucket skipping deleted entries. We treat the bucket as a circular 781 11042 Erik * list for the purposes of walking it. 782 11042 Erik * Returns the IRE (held) that corresponds to the hash value. If that IRE is 783 11042 Erik * not applicable (ire_match_args failed) then it returns a subsequent one. 784 11042 Erik * If we fail to find an IRE we return NULL. 785 2535 sangeeta * 786 11042 Erik * Assumes that the caller holds a reference on the IRE bucket and a read lock 787 11042 Erik * on the radix_node_head (for IPv4) or the ip6_ire_head (for IPv6). 788 11042 Erik * 789 11042 Erik * Applies to IPv4 and IPv6. 790 11042 Erik * 791 11042 Erik * For CGTP, where an IRE_BROADCAST and IRE_HOST can exist for the same 792 11042 Erik * address and bucket, we compare against ire_type for the orig_ire. We also 793 11042 Erik * have IRE_BROADCASTs with and without RTF_MULTIRT, with the former being 794 11131 Erik * first in the bucket. Thus we compare that RTF_MULTIRT match the orig_ire. 795 11042 Erik * 796 11042 Erik * Due to shared-IP zones we check that an IRE_OFFLINK has a gateway that is 797 11042 Erik * reachable from the zone i.e., that the ire_gateway_addr is in a subnet 798 11042 Erik * in which the zone has an IP address. We check this for the global zone 799 11042 Erik * even if no shared-IP zones are configured. 800 2535 sangeeta */ 801 2535 sangeeta ire_t * 802 11042 Erik ire_round_robin(irb_t *irb_ptr, ire_ftable_args_t *margs, uint_t hash, 803 11042 Erik ire_t *orig_ire, ip_stack_t *ipst) 804 2535 sangeeta { 805 11042 Erik ire_t *ire, *maybe_ire = NULL; 806 11042 Erik uint_t maybe_badcnt; 807 11042 Erik uint_t maxwalk; 808 11042 Erik 809 11042 Erik /* Fold in more bits from the hint/hash */ 810 11042 Erik hash = hash ^ (hash >> 8) ^ (hash >> 16); 811 2535 sangeeta 812 2535 sangeeta rw_enter(&irb_ptr->irb_lock, RW_WRITER); 813 11042 Erik maxwalk = irb_ptr->irb_ire_cnt; /* Excludes condemned */ 814 11042 Erik hash %= maxwalk; 815 11042 Erik irb_refhold_locked(irb_ptr); 816 2535 sangeeta rw_exit(&irb_ptr->irb_lock); 817 2535 sangeeta 818 2535 sangeeta /* 819 2535 sangeeta * Round-robin the routers list looking for a route that 820 2535 sangeeta * matches the passed in parameters. 821 11042 Erik * First we skip "hash" number of non-condemned IREs. 822 11042 Erik * Then we match the IRE. 823 11042 Erik * If we find an ire which has a non-zero ire_badcnt then we remember 824 11042 Erik * it and keep on looking for a lower ire_badcnt. 825 11042 Erik * If we come to the end of the list we continue (treat the 826 11042 Erik * bucket list as a circular list) but we match less than "max" 827 11042 Erik * entries. 828 2535 sangeeta */ 829 11042 Erik ire = irb_ptr->irb_ire; 830 11042 Erik while (maxwalk > 0) { 831 11042 Erik if (IRE_IS_CONDEMNED(ire)) 832 11042 Erik goto next_ire_skip; 833 2535 sangeeta 834 11042 Erik /* Skip the first "hash" entries to do ECMP */ 835 11042 Erik if (hash != 0) { 836 11042 Erik hash--; 837 11042 Erik goto next_ire_skip; 838 11042 Erik } 839 11042 Erik 840 11042 Erik /* See CGTP comment above */ 841 11042 Erik if (ire->ire_type != orig_ire->ire_type || 842 11131 Erik ((ire->ire_flags ^ orig_ire->ire_flags) & RTF_MULTIRT) != 0) 843 2535 sangeeta goto next_ire; 844 2535 sangeeta 845 11042 Erik /* 846 11042 Erik * Note: Since IPv6 has hash buckets instead of radix 847 11042 Erik * buckers we need to explicitly compare the addresses. 848 11042 Erik * That makes this less efficient since we will be called 849 11042 Erik * even if there is no alternatives just because the 850 11042 Erik * bucket has multiple IREs for different addresses. 851 11042 Erik */ 852 11042 Erik if (ire->ire_ipversion == IPV6_VERSION) { 853 11042 Erik if (!IN6_ARE_ADDR_EQUAL(&orig_ire->ire_addr_v6, 854 11042 Erik &ire->ire_addr_v6)) 855 11042 Erik goto next_ire; 856 11042 Erik } 857 11042 Erik 858 11042 Erik /* 859 11042 Erik * For some reason find_best_route uses ire_mask. We do 860 11042 Erik * the same. 861 11042 Erik */ 862 11042 Erik if (ire->ire_ipversion == IPV4_VERSION ? 863 11042 Erik !ire_match_args(ire, margs->ift_addr, 864 11042 Erik ire->ire_mask, margs->ift_gateway, 865 11042 Erik margs->ift_type, margs->ift_ill, margs->ift_zoneid, 866 11042 Erik margs->ift_tsl, margs->ift_flags) : 867 11042 Erik !ire_match_args_v6(ire, &margs->ift_addr_v6, 868 11042 Erik &ire->ire_mask_v6, &margs->ift_gateway_v6, 869 11042 Erik margs->ift_type, margs->ift_ill, margs->ift_zoneid, 870 11042 Erik margs->ift_tsl, margs->ift_flags)) 871 2535 sangeeta goto next_ire; 872 2535 sangeeta 873 11042 Erik if (margs->ift_zoneid != ALL_ZONES && 874 11042 Erik (ire->ire_type & IRE_OFFLINK)) { 875 2535 sangeeta /* 876 11042 Erik * When we're in a zone, we're only 877 11042 Erik * interested in routers that are 878 11042 Erik * reachable through ipifs within our zone. 879 2535 sangeeta */ 880 11042 Erik if (ire->ire_ipversion == IPV4_VERSION) { 881 11042 Erik if (!ire_gateway_ok_zone_v4( 882 11042 Erik ire->ire_gateway_addr, margs->ift_zoneid, 883 11042 Erik ire->ire_ill, margs->ift_tsl, ipst, 884 11042 Erik B_TRUE)) 885 11042 Erik goto next_ire; 886 11042 Erik } else { 887 11042 Erik if (!ire_gateway_ok_zone_v6( 888 11042 Erik &ire->ire_gateway_addr_v6, 889 11042 Erik margs->ift_zoneid, ire->ire_ill, 890 11042 Erik margs->ift_tsl, ipst, B_TRUE)) 891 11042 Erik goto next_ire; 892 11042 Erik } 893 2535 sangeeta } 894 11042 Erik mutex_enter(&ire->ire_lock); 895 11042 Erik /* Look for stale ire_badcnt and clear */ 896 11042 Erik if (ire->ire_badcnt != 0 && 897 11066 rafael (TICK_TO_SEC(ddi_get_lbolt64()) - ire->ire_last_badcnt > 898 11042 Erik ipst->ips_ip_ire_badcnt_lifetime)) 899 11042 Erik ire->ire_badcnt = 0; 900 11042 Erik mutex_exit(&ire->ire_lock); 901 2535 sangeeta 902 11042 Erik if (ire->ire_badcnt == 0) { 903 11042 Erik /* We found one with a zero badcnt; done */ 904 11042 Erik ire_refhold(ire); 905 11042 Erik /* 906 11042 Erik * Care needed since irb_refrele grabs WLOCK to free 907 11042 Erik * the irb_t. 908 11042 Erik */ 909 11042 Erik if (ire->ire_ipversion == IPV4_VERSION) { 910 11042 Erik RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 911 11042 Erik irb_refrele(irb_ptr); 912 11042 Erik RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); 913 11042 Erik } else { 914 11042 Erik rw_exit(&ipst->ips_ip6_ire_head_lock); 915 11042 Erik irb_refrele(irb_ptr); 916 11042 Erik rw_enter(&ipst->ips_ip6_ire_head_lock, 917 11042 Erik RW_READER); 918 11042 Erik } 919 2535 sangeeta return (ire); 920 2535 sangeeta } 921 2535 sangeeta /* 922 11042 Erik * keep looking to see if there is a better (lower 923 11042 Erik * badcnt) matching IRE, but save this one as a last resort. 924 11042 Erik * If we find a lower badcnt pick that one as the last* resort. 925 2535 sangeeta */ 926 11042 Erik if (maybe_ire == NULL) { 927 11042 Erik maybe_ire = ire; 928 11042 Erik maybe_badcnt = ire->ire_badcnt; 929 11042 Erik } else if (ire->ire_badcnt < maybe_badcnt) { 930 11042 Erik maybe_ire = ire; 931 11042 Erik maybe_badcnt = ire->ire_badcnt; 932 11042 Erik } 933 8485 Peter 934 2535 sangeeta next_ire: 935 11042 Erik maxwalk--; 936 11042 Erik next_ire_skip: 937 11042 Erik ire = ire->ire_next; 938 11042 Erik if (ire == NULL) 939 11042 Erik ire = irb_ptr->irb_ire; 940 2535 sangeeta } 941 2535 sangeeta if (maybe_ire != NULL) 942 11042 Erik ire_refhold(maybe_ire); 943 11042 Erik 944 11042 Erik /* Care needed since irb_refrele grabs WLOCK to free the irb_t. */ 945 11042 Erik if (ire->ire_ipversion == IPV4_VERSION) { 946 11042 Erik RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 947 11042 Erik irb_refrele(irb_ptr); 948 11042 Erik RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); 949 11042 Erik } else { 950 11042 Erik rw_exit(&ipst->ips_ip6_ire_head_lock); 951 11042 Erik irb_refrele(irb_ptr); 952 11042 Erik rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER); 953 11042 Erik } 954 2535 sangeeta return (maybe_ire); 955 2535 sangeeta } 956 2783 sowmini 957 2783 sowmini void 958 2783 sowmini irb_refhold_rn(struct radix_node *rn) 959 2783 sowmini { 960 2783 sowmini if ((rn->rn_flags & RNF_ROOT) == 0) 961 11042 Erik irb_refhold(&((rt_t *)(rn))->rt_irb); 962 2783 sowmini } 963 2783 sowmini 964 2783 sowmini void 965 2783 sowmini irb_refrele_rn(struct radix_node *rn) 966 2783 sowmini { 967 2783 sowmini if ((rn->rn_flags & RNF_ROOT) == 0) 968 2783 sowmini irb_refrele_ftable(&((rt_t *)(rn))->rt_irb); 969 2783 sowmini } 970 11042 Erik 971 11042 Erik /* 972 11042 Erik * Select a route for IPv4 and IPv6. Except for multicast, loopback and reject 973 11042 Erik * routes this routine sets up a ire_nce_cache as well. The caller needs to 974 11042 Erik * lookup an nce for the multicast case. 975 11042 Erik */ 976 11042 Erik ire_t * 977 11042 Erik ip_select_route(const in6_addr_t *v6dst, ip_xmit_attr_t *ixa, 978 11042 Erik uint_t *generationp, in6_addr_t *setsrcp, int *errorp, boolean_t *multirtp) 979 11042 Erik { 980 11042 Erik uint_t match_args; 981 11042 Erik uint_t ire_type; 982 11042 Erik ill_t *ill; 983 11042 Erik ire_t *ire; 984 11042 Erik ip_stack_t *ipst = ixa->ixa_ipst; 985 11042 Erik ipaddr_t v4dst; 986 11042 Erik in6_addr_t v6nexthop; 987 11042 Erik iaflags_t ixaflags = ixa->ixa_flags; 988 11042 Erik nce_t *nce; 989 11042 Erik 990 11042 Erik match_args = MATCH_IRE_SECATTR; 991 11042 Erik IN6_V4MAPPED_TO_IPADDR(v6dst, v4dst); 992 11042 Erik if (setsrcp != NULL) 993 11042 Erik ASSERT(IN6_IS_ADDR_UNSPECIFIED(setsrcp)); 994 11042 Erik if (errorp != NULL) 995 11042 Erik ASSERT(*errorp == 0); 996 11042 Erik 997 11042 Erik /* 998 11042 Erik * The content of the ixa will be different if IP_NEXTHOP, 999 11042 Erik * SO_DONTROUTE, IP_BOUND_IF, IP_PKTINFO etc are set 1000 11042 Erik */ 1001 11042 Erik 1002 11042 Erik if ((ixaflags & IXAF_IS_IPV4) ? CLASSD(v4dst) : 1003 11042 Erik IN6_IS_ADDR_MULTICAST(v6dst)) { 1004 11042 Erik /* Pick up the IRE_MULTICAST for the ill */ 1005 11042 Erik if (ixa->ixa_multicast_ifindex != 0) { 1006 11042 Erik ill = ill_lookup_on_ifindex(ixa->ixa_multicast_ifindex, 1007 11042 Erik !(ixaflags & IXAF_IS_IPV4), ipst); 1008 11042 Erik } else if (ixaflags & IXAF_SCOPEID_SET) { 1009 11042 Erik /* sin6_scope_id takes precedence over ixa_ifindex */ 1010 11042 Erik ASSERT(ixa->ixa_scopeid != 0); 1011 11042 Erik ill = ill_lookup_on_ifindex(ixa->ixa_scopeid, 1012 11042 Erik !(ixaflags & IXAF_IS_IPV4), ipst); 1013 11042 Erik } else if (ixa->ixa_ifindex != 0) { 1014 11042 Erik /* 1015 11042 Erik * In the ipmp case, the ixa_ifindex is set to 1016 11042 Erik * point at an under_ill and we would return the 1017 11042 Erik * ire_multicast() corresponding to that under_ill. 1018 11042 Erik */ 1019 11042 Erik ill = ill_lookup_on_ifindex(ixa->ixa_ifindex, 1020 11042 Erik !(ixaflags & IXAF_IS_IPV4), ipst); 1021 11042 Erik } else if (ixaflags & IXAF_IS_IPV4) { 1022 11042 Erik ipaddr_t v4setsrc = INADDR_ANY; 1023 11042 Erik 1024 11042 Erik ill = ill_lookup_group_v4(v4dst, ixa->ixa_zoneid, ipst, 1025 11042 Erik multirtp, &v4setsrc); 1026 11042 Erik if (setsrcp != NULL) 1027 11042 Erik IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp); 1028 11042 Erik } else { 1029 11042 Erik ill = ill_lookup_group_v6(v6dst, ixa->ixa_zoneid, ipst, 1030 11042 Erik multirtp, setsrcp); 1031 11042 Erik } 1032 11042 Erik if (ill != NULL && IS_VNI(ill)) { 1033 11042 Erik ill_refrele(ill); 1034 11042 Erik ill = NULL; 1035 11042 Erik } 1036 11042 Erik if (ill == NULL) { 1037 11042 Erik if (errorp != NULL) 1038 11042 Erik *errorp = ENXIO; 1039 11042 Erik /* Get a hold on the IRE_NOROUTE */ 1040 11042 Erik ire = ire_reject(ipst, !(ixaflags & IXAF_IS_IPV4)); 1041 11042 Erik return (ire); 1042 11042 Erik } 1043 11042 Erik if (!(ill->ill_flags & ILLF_MULTICAST)) { 1044 11042 Erik ill_refrele(ill); 1045 11042 Erik if (errorp != NULL) 1046 11042 Erik *errorp = EHOSTUNREACH; 1047 11042 Erik /* Get a hold on the IRE_NOROUTE */ 1048 11042 Erik ire = ire_reject(ipst, !(ixaflags & IXAF_IS_IPV4)); 1049 11042 Erik return (ire); 1050 11042 Erik } 1051 11042 Erik /* Get a refcnt on the single IRE_MULTICAST per ill */ 1052 11042 Erik ire = ire_multicast(ill); 1053 11042 Erik ill_refrele(ill); 1054 11042 Erik if (generationp != NULL) 1055 11042 Erik *generationp = ire->ire_generation; 1056 11042 Erik if (errorp != NULL && 1057 11042 Erik (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) { 1058 11042 Erik *errorp = EHOSTUNREACH; 1059 11042 Erik } 1060 11042 Erik return (ire); 1061 11042 Erik } 1062 11042 Erik 1063 11042 Erik if (ixa->ixa_ifindex != 0 || (ixaflags & IXAF_SCOPEID_SET)) { 1064 11042 Erik if (ixaflags & IXAF_SCOPEID_SET) { 1065 11042 Erik /* sin6_scope_id takes precedence over ixa_ifindex */ 1066 11042 Erik ASSERT(ixa->ixa_scopeid != 0); 1067 11042 Erik ill = ill_lookup_on_ifindex(ixa->ixa_scopeid, 1068 11042 Erik !(ixaflags & IXAF_IS_IPV4), ipst); 1069 11042 Erik } else { 1070 11042 Erik ASSERT(ixa->ixa_ifindex != 0); 1071 11042 Erik ill = ill_lookup_on_ifindex(ixa->ixa_ifindex, 1072 11042 Erik !(ixaflags & IXAF_IS_IPV4), ipst); 1073 11042 Erik } 1074 11042 Erik if (ill != NULL && IS_VNI(ill)) { 1075 11042 Erik ill_refrele(ill); 1076 11042 Erik ill = NULL; 1077 11042 Erik } 1078 11042 Erik if (ill == NULL) { 1079 11042 Erik if (errorp != NULL) 1080 11042 Erik *errorp = ENXIO; 1081 11042 Erik /* Get a hold on the IRE_NOROUTE */ 1082 11042 Erik ire = ire_reject(ipst, !(ixaflags & IXAF_IS_IPV4)); 1083 11042 Erik return (ire); 1084 11042 Erik } 1085 11042 Erik /* 1086 11042 Erik * icmp_send_reply_v6 uses scopeid, and mpathd sets IP*_BOUND_IF 1087 11042 Erik * so for both of them we need to be able look for an under 1088 11042 Erik * interface. 1089 11042 Erik */ 1090 11042 Erik if (IS_UNDER_IPMP(ill)) 1091 11042 Erik match_args |= MATCH_IRE_TESTHIDDEN; 1092 11042 Erik } else { 1093 11042 Erik ill = NULL; 1094 11042 Erik } 1095 11042 Erik 1096 11042 Erik if (ixaflags & IXAF_NEXTHOP_SET) { 1097 11042 Erik /* IP_NEXTHOP was set */ 1098 11042 Erik v6nexthop = ixa->ixa_nexthop_v6; 1099 11042 Erik } else { 1100 11042 Erik v6nexthop = *v6dst; 1101 11042 Erik } 1102 11042 Erik 1103 11042 Erik ire_type = 0; 1104 11042 Erik /* If ill is null then ire_route_recursive will set MATCH_IRE_ILL */ 1105 11042 Erik 1106 11042 Erik /* 1107 11042 Erik * If SO_DONTROUTE is set or if IP_NEXTHOP is set, then 1108 11042 Erik * we only look for an onlink IRE. 1109 11042 Erik */ 1110 11042 Erik if (ixaflags & (IXAF_DONTROUTE|IXAF_NEXTHOP_SET)) { 1111 11042 Erik match_args |= MATCH_IRE_TYPE; 1112 11042 Erik ire_type = IRE_ONLINK; 1113 11042 Erik } 1114 11042 Erik 1115 11042 Erik if (ixaflags & IXAF_IS_IPV4) { 1116 11042 Erik ipaddr_t v4nexthop; 1117 11042 Erik ipaddr_t v4setsrc = INADDR_ANY; 1118 11042 Erik 1119 11042 Erik IN6_V4MAPPED_TO_IPADDR(&v6nexthop, v4nexthop); 1120 11042 Erik ire = ire_route_recursive_v4(v4nexthop, ire_type, ill, 1121 11042 Erik ixa->ixa_zoneid, ixa->ixa_tsl, match_args, B_TRUE, 1122 11042 Erik ixa->ixa_xmit_hint, ipst, &v4setsrc, NULL, generationp); 1123 11042 Erik if (setsrcp != NULL) 1124 11042 Erik IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp); 1125 11042 Erik } else { 1126 11042 Erik ire = ire_route_recursive_v6(&v6nexthop, ire_type, ill, 1127 11042 Erik ixa->ixa_zoneid, ixa->ixa_tsl, match_args, B_TRUE, 1128 11042 Erik ixa->ixa_xmit_hint, ipst, setsrcp, NULL, generationp); 1129 11042 Erik } 1130 11042 Erik 1131 11042 Erik #ifdef DEBUG 1132 11042 Erik if (match_args & MATCH_IRE_TESTHIDDEN) { 1133 11042 Erik ip3dbg(("looking for hidden; dst %x ire %p\n", 1134 11042 Erik v4dst, (void *)ire)); 1135 11042 Erik } 1136 11042 Erik #endif 1137 11042 Erik 1138 11042 Erik if (ill != NULL) 1139 11042 Erik ill_refrele(ill); 1140 11042 Erik 1141 11042 Erik if ((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) || 1142 11042 Erik (ire->ire_type & IRE_MULTICAST)) { 1143 11042 Erik /* No ire_nce_cache */ 1144 11042 Erik return (ire); 1145 11042 Erik } 1146 11042 Erik 1147 11042 Erik /* Setup ire_nce_cache if it doesn't exist or is condemned. */ 1148 11042 Erik mutex_enter(&ire->ire_lock); 1149 11042 Erik nce = ire->ire_nce_cache; 1150 11042 Erik if (nce == NULL || nce->nce_is_condemned) { 1151 11042 Erik mutex_exit(&ire->ire_lock); 1152 11042 Erik (void) ire_revalidate_nce(ire); 1153 11042 Erik } else { 1154 11042 Erik mutex_exit(&ire->ire_lock); 1155 11042 Erik } 1156 11042 Erik return (ire); 1157 11042 Erik } 1158 11042 Erik 1159 11042 Erik /* 1160 11042 Erik * Find a route given some xmit attributes and a packet. 1161 11042 Erik * Generic for IPv4 and IPv6 1162 11042 Erik * 1163 11042 Erik * This never returns NULL. But when it returns the IRE_NOROUTE 1164 11042 Erik * it might set errorp. 1165 11042 Erik */ 1166 11042 Erik ire_t * 1167 11042 Erik ip_select_route_pkt(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp, 1168 11042 Erik int *errorp, boolean_t *multirtp) 1169 11042 Erik { 1170 11042 Erik if (ixa->ixa_flags & IXAF_IS_IPV4) { 1171 11042 Erik ipha_t *ipha = (ipha_t *)mp->b_rptr; 1172 11042 Erik in6_addr_t v6dst; 1173 11042 Erik 1174 11042 Erik IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst); 1175 11042 Erik 1176 11042 Erik return (ip_select_route(&v6dst, ixa, generationp, 1177 11042 Erik NULL, errorp, multirtp)); 1178 11042 Erik } else { 1179 11042 Erik ip6_t *ip6h = (ip6_t *)mp->b_rptr; 1180 11042 Erik 1181 11042 Erik return (ip_select_route(&ip6h->ip6_dst, ixa, generationp, 1182 11042 Erik NULL, errorp, multirtp)); 1183 11042 Erik } 1184 11042 Erik } 1185 11042 Erik 1186 11042 Erik ire_t * 1187 11042 Erik ip_select_route_v4(ipaddr_t dst, ip_xmit_attr_t *ixa, uint_t *generationp, 1188 11042 Erik ipaddr_t *v4setsrcp, int *errorp, boolean_t *multirtp) 1189 11042 Erik { 1190 11042 Erik in6_addr_t v6dst; 1191 11042 Erik ire_t *ire; 1192 11042 Erik in6_addr_t setsrc; 1193 11042 Erik 1194 11042 Erik ASSERT(ixa->ixa_flags & IXAF_IS_IPV4); 1195 11042 Erik 1196 11042 Erik IN6_IPADDR_TO_V4MAPPED(dst, &v6dst); 1197 11042 Erik 1198 11042 Erik setsrc = ipv6_all_zeros; 1199 11042 Erik ire = ip_select_route(&v6dst, ixa, generationp, &setsrc, errorp, 1200 11042 Erik multirtp); 1201 11042 Erik if (v4setsrcp != NULL) 1202 11042 Erik IN6_V4MAPPED_TO_IPADDR(&setsrc, *v4setsrcp); 1203 11042 Erik return (ire); 1204 11042 Erik } 1205 11042 Erik 1206 11042 Erik /* 1207 11042 Erik * Recursively look for a route to the destination. Can also match on 1208 11042 Erik * the zoneid, ill, and label. Used for the data paths. See also 1209 11042 Erik * ire_route_recursive. 1210 11042 Erik * 1211 11042 Erik * If ill is set this means we will match it by adding MATCH_IRE_ILL. 1212 11042 Erik * 1213 11042 Erik * Note that this function never returns NULL. It returns an IRE_NOROUTE 1214 11042 Erik * instead. 1215 11042 Erik * 1216 11042 Erik * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it 1217 11042 Erik * is an error. 1218 11042 Erik * Allow at most one RTF_INDIRECT. 1219 11042 Erik */ 1220 11042 Erik ire_t * 1221 11042 Erik ire_route_recursive_impl_v4(ire_t *ire, 1222 11042 Erik ipaddr_t nexthop, uint_t ire_type, const ill_t *ill_arg, 1223 11042 Erik zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args, 1224 11042 Erik boolean_t allocate, uint32_t xmit_hint, ip_stack_t *ipst, ipaddr_t *setsrcp, 1225 11042 Erik tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp) 1226 11042 Erik { 1227 11042 Erik int i, j; 1228 11042 Erik ire_t *ires[MAX_IRE_RECURSION]; 1229 11042 Erik uint_t generation; 1230 11042 Erik uint_t generations[MAX_IRE_RECURSION]; 1231 11042 Erik boolean_t need_refrele = B_FALSE; 1232 11042 Erik boolean_t invalidate = B_FALSE; 1233 11042 Erik int prefs[MAX_IRE_RECURSION]; 1234 11042 Erik ill_t *ill = NULL; 1235 11042 Erik 1236 11042 Erik if (setsrcp != NULL) 1237 11042 Erik ASSERT(*setsrcp == INADDR_ANY); 1238 11042 Erik if (gwattrp != NULL) 1239 11042 Erik ASSERT(*gwattrp == NULL); 1240 11042 Erik 1241 11042 Erik if (ill_arg != NULL) 1242 11042 Erik match_args |= MATCH_IRE_ILL; 1243 11042 Erik 1244 11042 Erik /* 1245 11042 Erik * We iterate up to three times to resolve a route, even though 1246 11042 Erik * we have four slots in the array. The extra slot is for an 1247 11042 Erik * IRE_IF_CLONE we might need to create. 1248 11042 Erik */ 1249 11042 Erik i = 0; 1250 11042 Erik while (i < MAX_IRE_RECURSION - 1) { 1251 11042 Erik /* ire_ftable_lookup handles round-robin/ECMP */ 1252 11042 Erik if (ire == NULL) { 1253 11042 Erik ire = ire_ftable_lookup_v4(nexthop, 0, 0, ire_type, 1254 11042 Erik (ill_arg != NULL ? ill_arg : ill), zoneid, tsl, 1255 11042 Erik match_args, xmit_hint, ipst, &generation); 1256 11042 Erik } else { 1257 11042 Erik /* Caller passed it; extra hold since we will rele */ 1258 11042 Erik ire_refhold(ire); 1259 11042 Erik if (generationp != NULL) 1260 11042 Erik generation = *generationp; 1261 11042 Erik else 1262 11042 Erik generation = IRE_GENERATION_VERIFY; 1263 11042 Erik } 1264 11042 Erik if (ire == NULL) 1265 11042 Erik ire = ire_reject(ipst, B_FALSE); 1266 11042 Erik 1267 11042 Erik /* Need to return the ire with RTF_REJECT|BLACKHOLE */ 1268 11042 Erik if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) 1269 11042 Erik goto error; 1270 11042 Erik 1271 11042 Erik ASSERT(!(ire->ire_type & IRE_MULTICAST)); /* Not in ftable */ 1272 11042 Erik 1273 11042 Erik if (i != 0) { 1274 11131 Erik prefs[i] = ire_pref(ire); 1275 11042 Erik /* 1276 11042 Erik * Don't allow anything unusual past the first 1277 11042 Erik * iteration. 1278 11042 Erik */ 1279 11042 Erik if ((ire->ire_type & 1280 11042 Erik (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST)) || 1281 11042 Erik prefs[i] <= prefs[i-1]) { 1282 11042 Erik ire_refrele(ire); 1283 11042 Erik ire = ire_reject(ipst, B_FALSE); 1284 11042 Erik goto error; 1285 11042 Erik } 1286 11042 Erik } 1287 11042 Erik /* We have a usable IRE */ 1288 11042 Erik ires[i] = ire; 1289 11042 Erik generations[i] = generation; 1290 11042 Erik i++; 1291 11042 Erik 1292 11042 Erik /* The first RTF_SETSRC address is passed back if setsrcp */ 1293 11042 Erik if ((ire->ire_flags & RTF_SETSRC) && 1294 11042 Erik setsrcp != NULL && *setsrcp == INADDR_ANY) { 1295 11042 Erik ASSERT(ire->ire_setsrc_addr != INADDR_ANY); 1296 11042 Erik *setsrcp = ire->ire_setsrc_addr; 1297 11042 Erik } 1298 11042 Erik 1299 11042 Erik /* The first ire_gw_secattr is passed back if gwattrp */ 1300 11042 Erik if (ire->ire_gw_secattr != NULL && 1301 11042 Erik gwattrp != NULL && *gwattrp == NULL) 1302 11042 Erik *gwattrp = ire->ire_gw_secattr; 1303 11042 Erik 1304 11042 Erik /* 1305 11042 Erik * Check if we have a short-cut pointer to an IRE for this 1306 11042 Erik * destination, and that the cached dependency isn't stale. 1307 11042 Erik * In that case we've rejoined an existing tree towards a 1308 11042 Erik * parent, thus we don't need to continue the loop to 1309 11042 Erik * discover the rest of the tree. 1310 11042 Erik */ 1311 11042 Erik mutex_enter(&ire->ire_lock); 1312 11042 Erik if (ire->ire_dep_parent != NULL && 1313 11042 Erik ire->ire_dep_parent->ire_generation == 1314 11042 Erik ire->ire_dep_parent_generation) { 1315 11042 Erik mutex_exit(&ire->ire_lock); 1316 11042 Erik ire = NULL; 1317 11042 Erik goto done; 1318 11042 Erik } 1319 11042 Erik mutex_exit(&ire->ire_lock); 1320 11042 Erik 1321 11042 Erik /* 1322 11042 Erik * If this type should have an ire_nce_cache (even if it 1323 11042 Erik * doesn't yet have one) then we are done. Includes 1324 11042 Erik * IRE_INTERFACE with a full 32 bit mask. 1325 11042 Erik */ 1326 11042 Erik if (ire->ire_nce_capable) { 1327 11042 Erik ire = NULL; 1328 11042 Erik goto done; 1329 11042 Erik } 1330 11042 Erik ASSERT(!(ire->ire_type & IRE_IF_CLONE)); 1331 11042 Erik /* 1332 11042 Erik * For an IRE_INTERFACE we create an IRE_IF_CLONE for this 1333 11042 Erik * particular destination 1334 11042 Erik */ 1335 11042 Erik if (ire->ire_type & IRE_INTERFACE) { 1336 11042 Erik in6_addr_t v6nexthop; 1337 11042 Erik ire_t *clone; 1338 11042 Erik 1339 11042 Erik ASSERT(ire->ire_masklen != IPV4_ABITS); 1340 11042 Erik 1341 11042 Erik /* 1342 11042 Erik * In the case of ip_input and ILLF_FORWARDING not 1343 11042 Erik * being set, and in the case of RTM_GET, 1344 11042 Erik * there is no point in allocating 1345 11042 Erik * an IRE_IF_CLONE. We return the IRE_INTERFACE. 1346 11042 Erik * Note that !allocate can result in a ire_dep_parent 1347 11042 Erik * which is IRE_IF_* without an IRE_IF_CLONE. 1348 11042 Erik * We recover from that when we need to send packets 1349 11042 Erik * by ensuring that the generations become 1350 11042 Erik * IRE_GENERATION_VERIFY in this case. 1351 11042 Erik */ 1352 11042 Erik if (!allocate) { 1353 11042 Erik invalidate = B_TRUE; 1354 11042 Erik ire = NULL; 1355 11042 Erik goto done; 1356 11042 Erik } 1357 11042 Erik 1358 11042 Erik IN6_IPADDR_TO_V4MAPPED(nexthop, &v6nexthop); 1359 11042 Erik 1360 11042 Erik clone = ire_create_if_clone(ire, &v6nexthop, 1361 11042 Erik &generation); 1362 11042 Erik if (clone == NULL) { 1363 11042 Erik /* 1364 11042 Erik * Temporary failure - no memory. 1365 11042 Erik * Don't want caller to cache IRE_NOROUTE. 1366 11042 Erik */ 1367 11042 Erik invalidate = B_TRUE; 1368 11042 Erik ire = ire_blackhole(ipst, B_FALSE); 1369 11042 Erik goto error; 1370 11042 Erik } 1371 11042 Erik /* 1372 11042 Erik * Make clone next to last entry and the 1373 11042 Erik * IRE_INTERFACE the last in the dependency 1374 11042 Erik * chain since the clone depends on the 1375 11042 Erik * IRE_INTERFACE. 1376 11042 Erik */ 1377 11042 Erik ASSERT(i >= 1); 1378 11042 Erik ASSERT(i < MAX_IRE_RECURSION); 1379 11042 Erik 1380 11042 Erik ires[i] = ires[i-1]; 1381 11042 Erik generations[i] = generations[i-1]; 1382 11042 Erik ires[i-1] = clone; 1383 11042 Erik generations[i-1] = generation; 1384 11042 Erik i++; 1385 11042 Erik 1386 11042 Erik ire = NULL; 1387 11042 Erik goto done; 1388 11042 Erik } 1389 11042 Erik 1390 11042 Erik /* 1391 11042 Erik * We only match on the type and optionally ILL when 1392 11042 Erik * recursing. The type match is used by some callers 1393 11042 Erik * to exclude certain types (such as IRE_IF_CLONE or 1394 11042 Erik * IRE_LOCAL|IRE_LOOPBACK). 1395 11042 Erik */ 1396 11042 Erik match_args &= MATCH_IRE_TYPE; 1397 11042 Erik nexthop = ire->ire_gateway_addr; 1398 11042 Erik if (ill == NULL && ire->ire_ill != NULL) { 1399 11042 Erik ill = ire->ire_ill; 1400 11042 Erik need_refrele = B_TRUE; 1401 11042 Erik ill_refhold(ill); 1402 11042 Erik match_args |= MATCH_IRE_ILL; 1403 11042 Erik } 1404 11131 Erik /* 1405 11131 Erik * We set the prefs[i] value above if i > 0. We've already 1406 11131 Erik * done i++ so i is one in the case of the first time around. 1407 11131 Erik */ 1408 11131 Erik if (i == 1) 1409 11131 Erik prefs[0] = ire_pref(ire); 1410 11042 Erik ire = NULL; 1411 11042 Erik } 1412 11042 Erik ASSERT(ire == NULL); 1413 11042 Erik ire = ire_reject(ipst, B_FALSE); 1414 11042 Erik 1415 11042 Erik error: 1416 11042 Erik ASSERT(ire != NULL); 1417 11042 Erik if (need_refrele) 1418 11042 Erik ill_refrele(ill); 1419 11042 Erik 1420 11042 Erik /* 1421 11042 Erik * In the case of MULTIRT we want to try a different IRE the next 1422 11042 Erik * time. We let the next packet retry in that case. 1423 11042 Erik */ 1424 11042 Erik if (i > 0 && (ires[0]->ire_flags & RTF_MULTIRT)) 1425 11042 Erik (void) ire_no_good(ires[0]); 1426 11042 Erik 1427 11042 Erik cleanup: 1428 11042 Erik /* cleanup ires[i] */ 1429 11042 Erik ire_dep_unbuild(ires, i); 1430 11042 Erik for (j = 0; j < i; j++) 1431 11042 Erik ire_refrele(ires[j]); 1432 11042 Erik 1433 11042 Erik ASSERT(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)); 1434 11042 Erik /* 1435 11042 Erik * Use IRE_GENERATION_VERIFY to ensure that ip_output will redo the 1436 11042 Erik * ip_select_route since the reject or lack of memory might be gone. 1437 11042 Erik */ 1438 11042 Erik if (generationp != NULL) 1439 11042 Erik *generationp = IRE_GENERATION_VERIFY; 1440 11042 Erik return (ire); 1441 11042 Erik 1442 11042 Erik done: 1443 11042 Erik ASSERT(ire == NULL); 1444 11042 Erik if (need_refrele) { 1445 11042 Erik ill_refrele(ill); 1446 11042 Erik ill = NULL; 1447 11042 Erik } 1448 11042 Erik 1449 11042 Erik /* Build dependencies */ 1450 11131 Erik if (i > 1 && !ire_dep_build(ires, generations, i)) { 1451 11042 Erik /* Something in chain was condemned; tear it apart */ 1452 11042 Erik ire = ire_reject(ipst, B_FALSE); 1453 11042 Erik goto cleanup; 1454 11042 Erik } 1455 11042 Erik 1456 11042 Erik /* 1457 11042 Erik * Release all refholds except the one for ires[0] that we 1458 11042 Erik * will return to the caller. 1459 11042 Erik */ 1460 11042 Erik for (j = 1; j < i; j++) 1461 11042 Erik ire_refrele(ires[j]); 1462 11042 Erik 1463 11042 Erik if (invalidate) { 1464 11042 Erik /* 1465 11042 Erik * Since we needed to allocate but couldn't we need to make 1466 11042 Erik * sure that the dependency chain is rebuilt the next time. 1467 11042 Erik */ 1468 11042 Erik ire_dep_invalidate_generations(ires[0]); 1469 11042 Erik generation = IRE_GENERATION_VERIFY; 1470 11042 Erik } else { 1471 11042 Erik /* 1472 11042 Erik * IREs can have been added or deleted while we did the 1473 11042 Erik * recursive lookup and we can't catch those until we've built 1474 11042 Erik * the dependencies. We verify the stored 1475 11042 Erik * ire_dep_parent_generation to catch any such changes and 1476 11042 Erik * return IRE_GENERATION_VERIFY (which will cause 1477 11042 Erik * ip_select_route to be called again so we can redo the 1478 11042 Erik * recursive lookup next time we send a packet. 1479 11042 Erik */ 1480 11131 Erik if (ires[0]->ire_dep_parent == NULL) 1481 11131 Erik generation = ires[0]->ire_generation; 1482 11131 Erik else 1483 11131 Erik generation = ire_dep_validate_generations(ires[0]); 1484 11042 Erik if (generations[0] != ires[0]->ire_generation) { 1485 11042 Erik /* Something changed at the top */ 1486 11042 Erik generation = IRE_GENERATION_VERIFY; 1487 11042 Erik } 1488 11042 Erik } 1489 11042 Erik if (generationp != NULL) 1490 11042 Erik *generationp = generation; 1491 11042 Erik 1492 11042 Erik return (ires[0]); 1493 11042 Erik } 1494 11042 Erik 1495 11042 Erik ire_t * 1496 11042 Erik ire_route_recursive_v4(ipaddr_t nexthop, uint_t ire_type, const ill_t *ill, 1497 11042 Erik zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args, 1498 11042 Erik boolean_t allocate, uint32_t xmit_hint, ip_stack_t *ipst, ipaddr_t *setsrcp, 1499 11042 Erik tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp) 1500 11042 Erik { 1501 11042 Erik return (ire_route_recursive_impl_v4(NULL, nexthop, ire_type, ill, 1502 11042 Erik zoneid, tsl, match_args, allocate, xmit_hint, ipst, setsrcp, 1503 11042 Erik gwattrp, generationp)); 1504 11042 Erik } 1505 11042 Erik 1506 11042 Erik /* 1507 11042 Erik * Recursively look for a route to the destination. 1508 11042 Erik * We only handle a destination match here, yet we have the same arguments 1509 11042 Erik * as the full match to allow function pointers to select between the two. 1510 11042 Erik * 1511 11042 Erik * Note that this function never returns NULL. It returns an IRE_NOROUTE 1512 11042 Erik * instead. 1513 11042 Erik * 1514 11042 Erik * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it 1515 11042 Erik * is an error. 1516 11042 Erik * Allow at most one RTF_INDIRECT. 1517 11042 Erik */ 1518 11042 Erik ire_t * 1519 11042 Erik ire_route_recursive_dstonly_v4(ipaddr_t nexthop, boolean_t allocate, 1520 11042 Erik uint32_t xmit_hint, ip_stack_t *ipst) 1521 11042 Erik { 1522 11042 Erik ire_t *ire; 1523 11042 Erik ire_t *ire1; 1524 11042 Erik uint_t generation; 1525 11042 Erik 1526 11042 Erik /* ire_ftable_lookup handles round-robin/ECMP */ 1527 11042 Erik ire = ire_ftable_lookup_simple_v4(nexthop, xmit_hint, ipst, 1528 11042 Erik &generation); 1529 11042 Erik ASSERT(ire != NULL); 1530 11042 Erik 1531 11042 Erik /* 1532 11042 Erik * If this type should have an ire_nce_cache (even if it 1533 11042 Erik * doesn't yet have one) then we are done. Includes 1534 11042 Erik * IRE_INTERFACE with a full 32 bit mask. 1535 11042 Erik */ 1536 11042 Erik if (ire->ire_nce_capable) 1537 11042 Erik return (ire); 1538 11042 Erik 1539 11042 Erik /* 1540 11042 Erik * If the IRE has a current cached parent we know that the whole 1541 11042 Erik * parent chain is current, hence we don't need to discover and 1542 11042 Erik * build any dependencies by doing a recursive lookup. 1543 11042 Erik */ 1544 11042 Erik mutex_enter(&ire->ire_lock); 1545 11042 Erik if (ire->ire_dep_parent != NULL && 1546 11042 Erik ire->ire_dep_parent->ire_generation == 1547 11042 Erik ire->ire_dep_parent_generation) { 1548 11042 Erik mutex_exit(&ire->ire_lock); 1549 11042 Erik return (ire); 1550 11042 Erik } 1551 11042 Erik mutex_exit(&ire->ire_lock); 1552 11042 Erik 1553 11042 Erik /* 1554 11042 Erik * Fallback to loop in the normal code starting with the ire 1555 11042 Erik * we found. Normally this would return the same ire. 1556 11042 Erik */ 1557 11042 Erik ire1 = ire_route_recursive_impl_v4(ire, nexthop, 0, NULL, ALL_ZONES, 1558 11042 Erik NULL, MATCH_IRE_DSTONLY, allocate, xmit_hint, ipst, NULL, NULL, 1559 11042 Erik &generation); 1560 11042 Erik ire_refrele(ire); 1561 11042 Erik return (ire1); 1562 11042 Erik } 1563