1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. 23 * All rights reserved. Use is subject to license terms. 24 */ 25 /* Copyright (c) 1990 Mentat Inc. */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 /* 30 * Procedures for the kernel part of DVMRP, 31 * a Distance-Vector Multicast Routing Protocol. 32 * (See RFC-1075) 33 * Written by David Waitzman, BBN Labs, August 1988. 34 * Modified by Steve Deering, Stanford, February 1989. 35 * Modified by Mark J. 
Steiglitz, Stanford, May, 1991 36 * Modified by Van Jacobson, LBL, January 1993 37 * Modified by Ajit Thyagarajan, PARC, August 1993 38 * Modified by Bill Fenner, PARC, April 1995 39 * 40 * MROUTING 3.5 41 */ 42 43 /* 44 * TODO 45 * - function pointer field in vif, void *vif_sendit() 46 */ 47 48 #include <sys/types.h> 49 #include <sys/stream.h> 50 #include <sys/stropts.h> 51 #include <sys/strlog.h> 52 #include <sys/systm.h> 53 #include <sys/ddi.h> 54 #include <sys/cmn_err.h> 55 #include <sys/zone.h> 56 57 #include <sys/param.h> 58 #include <sys/socket.h> 59 #include <sys/vtrace.h> 60 #include <sys/debug.h> 61 #include <net/if.h> 62 #include <sys/sockio.h> 63 #include <netinet/in.h> 64 #include <net/if_dl.h> 65 66 #include <inet/common.h> 67 #include <inet/mi.h> 68 #include <inet/nd.h> 69 #include <inet/mib2.h> 70 #include <netinet/ip6.h> 71 #include <inet/ip.h> 72 #include <inet/snmpcom.h> 73 74 #include <netinet/igmp.h> 75 #include <netinet/igmp_var.h> 76 #include <netinet/udp.h> 77 #include <netinet/ip_mroute.h> 78 #include <inet/ip_multi.h> 79 #include <inet/ip_ire.h> 80 #include <inet/ip_if.h> 81 #include <inet/ipclassifier.h> 82 83 #include <netinet/pim.h> 84 85 86 /* 87 * MT Design: 88 * 89 * There are three main data structures viftable, mfctable and tbftable that 90 * need to be protected against MT races. 91 * 92 * viftable is a fixed length array of vif structs. There is no lock to protect 93 * the whole array, instead each struct is protected by its own individual lock. 94 * The value of v_marks in conjunction with the value of v_refcnt determines the 95 * current state of a vif structure. One special state that needs mention 96 * is when the vif is marked VIF_MARK_NOTINUSE but refcnt != 0. This indicates 97 * that vif is being initialized. 98 * Each structure is freed when the refcnt goes down to zero. If a delete comes 99 * in when the refcnt is > 1, the vif structure is marked VIF_MARK_CONDEMNED 100 * which prevents the struct from further use. 
When the refcnt goes to zero 101 * the struct is freed and is marked VIF_MARK_NOTINUSE. 102 * vif struct stores a pointer to the ipif in v_ipif, to prevent ipif/ill 103 * from going away a refhold is put on the ipif before using it. See 104 * lock_good_vif() and unlock_good_vif(). 105 * 106 * VIF_REFHOLD and VIF_REFRELE macros have been provided to manipulate refcnts 107 * of the vif struct. 108 * 109 * tbftable is also a fixed length array of tbf structs and is only accessed 110 * via v_tbf. It is protected by its own lock tbf_lock. 111 * 112 * Lock Ordering is 113 * v_lock --> tbf_lock 114 * v_lock --> ill_lock 115 * 116 * mfctable is a fixed size hash table of mfc bucket structs (struct mfcb). 117 * Each mfc bucket struct (struct mfcb) maintains a refcnt for each walker, 118 * it also maintains a state. These fields are protected by a lock (mfcb_lock). 119 * mfc structs only maintain a state and have no refcnt. mfc_mutex is used to 120 * protect the struct elements. 121 * 122 * mfc structs are dynamically allocated and are singly linked 123 * at the head of the chain. When an mfc structure is to be deleted 124 * it is marked condemned and so is the state in the bucket struct. 125 * When the last walker of the hash bucket exits all the mfc structs 126 * marked condemned are freed. 127 * 128 * Locking Hierarchy: 129 * The bucket lock should be acquired before the mfc struct lock. 130 * MFCB_REFHOLD and MFCB_REFRELE macros are provided for locking 131 * operations on the bucket struct. 132 * 133 * last_encap_lock and numvifs_mutex should be acquired after 134 * acquiring vif or mfc locks. These locks protect some global variables. 135 * 136 * The statistics are not currently protected by a lock 137 * causing the stats to be approximate, not exact. 
138 */ 139 140 #define NO_VIF MAXVIFS /* from mrouted, no route for src */ 141 142 /* 143 * Timeouts: 144 * Upcall timeouts - BSD uses boolean_t mfc->expire and 145 * nexpire[MFCTBLSIZE], the number of times expire has been called. 146 * SunOS 5.x uses mfc->timeout for each mfc. 147 * Some Unixes are limited in the number of simultaneous timeouts 148 * that can be run, SunOS 5.x does not have this restriction. 149 */ 150 151 /* 152 * In BSD, EXPIRE_TIMEOUT is how often expire_upcalls() is called and 153 * UPCALL_EXPIRE is the nmber of timeouts before a particular upcall 154 * expires. Thus the time till expiration is EXPIRE_TIMEOUT * UPCALL_EXPIRE 155 */ 156 #define EXPIRE_TIMEOUT (hz/4) /* 4x / second */ 157 #define UPCALL_EXPIRE 6 /* number of timeouts */ 158 159 /* 160 * Hash function for a source, group entry 161 */ 162 #define MFCHASH(a, g) MFCHASHMOD(((a) >> 20) ^ ((a) >> 10) ^ (a) ^ \ 163 ((g) >> 20) ^ ((g) >> 10) ^ (g)) 164 165 #define TBF_REPROCESS (hz / 100) /* 100x /second */ 166 167 /* Identify PIM packet that came on a Register interface */ 168 #define PIM_REGISTER_MARKER 0xffffffff 169 170 /* Function declarations */ 171 static int add_mfc(struct mfcctl *, ip_stack_t *); 172 static int add_vif(struct vifctl *, conn_t *, mblk_t *, ip_stack_t *); 173 static int del_mfc(struct mfcctl *, ip_stack_t *); 174 static int del_vif(vifi_t *, conn_t *, mblk_t *, ip_stack_t *); 175 static void del_vifp(struct vif *); 176 static void encap_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t); 177 static void expire_upcalls(void *); 178 static void fill_route(struct mfc *, struct mfcctl *, ip_stack_t *); 179 static void free_queue(struct mfc *); 180 static int get_assert(uchar_t *, ip_stack_t *); 181 static int get_lsg_cnt(struct sioc_lsg_req *, ip_stack_t *); 182 static int get_sg_cnt(struct sioc_sg_req *, ip_stack_t *); 183 static int get_version(uchar_t *); 184 static int get_vif_cnt(struct sioc_vif_req *, ip_stack_t *); 185 static int ip_mdq(mblk_t *, ipha_t *, 
ill_t *, 186 ipaddr_t, struct mfc *); 187 static int ip_mrouter_init(conn_t *, uchar_t *, int, ip_stack_t *); 188 static void phyint_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t); 189 static int register_mforward(queue_t *, mblk_t *, ill_t *); 190 static void register_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t); 191 static int set_assert(int *, ip_stack_t *); 192 193 /* 194 * Token Bucket Filter functions 195 */ 196 static int priority(struct vif *, ipha_t *); 197 static void tbf_control(struct vif *, mblk_t *, ipha_t *); 198 static int tbf_dq_sel(struct vif *, ipha_t *); 199 static void tbf_process_q(struct vif *); 200 static void tbf_queue(struct vif *, mblk_t *); 201 static void tbf_reprocess_q(void *); 202 static void tbf_send_packet(struct vif *, mblk_t *); 203 static void tbf_update_tokens(struct vif *); 204 static void release_mfc(struct mfcb *); 205 206 static boolean_t is_mrouter_off(ip_stack_t *); 207 /* 208 * Encapsulation packets 209 */ 210 211 #define ENCAP_TTL 64 212 213 /* prototype IP hdr for encapsulated packets */ 214 static ipha_t multicast_encap_iphdr = { 215 IP_SIMPLE_HDR_VERSION, 216 0, /* tos */ 217 sizeof (ipha_t), /* total length */ 218 0, /* id */ 219 0, /* frag offset */ 220 ENCAP_TTL, IPPROTO_ENCAP, 221 0, /* checksum */ 222 }; 223 224 /* 225 * Rate limit for assert notification messages, in nsec. 
226 */ 227 #define ASSERT_MSG_TIME 3000000000 228 229 230 #define VIF_REFHOLD(vifp) { \ 231 mutex_enter(&(vifp)->v_lock); \ 232 (vifp)->v_refcnt++; \ 233 mutex_exit(&(vifp)->v_lock); \ 234 } 235 236 #define VIF_REFRELE_LOCKED(vifp) { \ 237 (vifp)->v_refcnt--; \ 238 if ((vifp)->v_refcnt == 0 && \ 239 ((vifp)->v_marks & VIF_MARK_CONDEMNED)) { \ 240 del_vifp(vifp); \ 241 } else { \ 242 mutex_exit(&(vifp)->v_lock); \ 243 } \ 244 } 245 246 #define VIF_REFRELE(vifp) { \ 247 mutex_enter(&(vifp)->v_lock); \ 248 (vifp)->v_refcnt--; \ 249 if ((vifp)->v_refcnt == 0 && \ 250 ((vifp)->v_marks & VIF_MARK_CONDEMNED)) { \ 251 del_vifp(vifp); \ 252 } else { \ 253 mutex_exit(&(vifp)->v_lock); \ 254 } \ 255 } 256 257 #define MFCB_REFHOLD(mfcb) { \ 258 mutex_enter(&(mfcb)->mfcb_lock); \ 259 (mfcb)->mfcb_refcnt++; \ 260 ASSERT((mfcb)->mfcb_refcnt != 0); \ 261 mutex_exit(&(mfcb)->mfcb_lock); \ 262 } 263 264 #define MFCB_REFRELE(mfcb) { \ 265 mutex_enter(&(mfcb)->mfcb_lock); \ 266 ASSERT((mfcb)->mfcb_refcnt != 0); \ 267 if (--(mfcb)->mfcb_refcnt == 0 && \ 268 ((mfcb)->mfcb_marks & MFCB_MARK_CONDEMNED)) { \ 269 release_mfc(mfcb); \ 270 } \ 271 mutex_exit(&(mfcb)->mfcb_lock); \ 272 } 273 274 /* 275 * MFCFIND: 276 * Find a route for a given origin IP address and multicast group address. 277 * Skip entries with pending upcalls. 278 * Type of service parameter to be added in the future! 279 */ 280 #define MFCFIND(mfcbp, o, g, rt) { \ 281 struct mfc *_mb_rt = NULL; \ 282 rt = NULL; \ 283 _mb_rt = mfcbp->mfcb_mfc; \ 284 while (_mb_rt) { \ 285 if ((_mb_rt->mfc_origin.s_addr == o) && \ 286 (_mb_rt->mfc_mcastgrp.s_addr == g) && \ 287 (_mb_rt->mfc_rte == NULL) && \ 288 (!(_mb_rt->mfc_marks & MFCB_MARK_CONDEMNED))) { \ 289 rt = _mb_rt; \ 290 break; \ 291 } \ 292 _mb_rt = _mb_rt->mfc_next; \ 293 } \ 294 } 295 296 /* 297 * BSD uses timeval with sec and usec. In SunOS 5.x uniqtime() and gethrtime() 298 * are inefficient. 
We use gethrestime() which returns a timespec_t with 299 * sec and nsec, the resolution is machine dependent. 300 * The following 2 macros have been changed to use nsec instead of usec. 301 */ 302 /* 303 * Macros to compute elapsed time efficiently. 304 * Borrowed from Van Jacobson's scheduling code. 305 * Delta should be a hrtime_t. 306 */ 307 #define TV_DELTA(a, b, delta) { \ 308 int xxs; \ 309 \ 310 delta = (a).tv_nsec - (b).tv_nsec; \ 311 if ((xxs = (a).tv_sec - (b).tv_sec) != 0) { \ 312 switch (xxs) { \ 313 case 2: \ 314 delta += 1000000000; \ 315 /*FALLTHROUGH*/ \ 316 case 1: \ 317 delta += 1000000000; \ 318 break; \ 319 default: \ 320 delta += (1000000000 * xxs); \ 321 } \ 322 } \ 323 } 324 325 #define TV_LT(a, b) (((a).tv_nsec < (b).tv_nsec && \ 326 (a).tv_sec <= (b).tv_sec) || (a).tv_sec < (b).tv_sec) 327 328 /* 329 * Handle MRT setsockopt commands to modify the multicast routing tables. 330 */ 331 int 332 ip_mrouter_set(int cmd, queue_t *q, int checkonly, uchar_t *data, 333 int datalen, mblk_t *first_mp) 334 { 335 conn_t *connp = Q_TO_CONN(q); 336 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 337 338 mutex_enter(&ipst->ips_ip_g_mrouter_mutex); 339 if (cmd != MRT_INIT && connp != ipst->ips_ip_g_mrouter) { 340 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 341 return (EACCES); 342 } 343 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 344 345 if (checkonly) { 346 /* 347 * do not do operation, just pretend to - new T_CHECK 348 * Note: Even routines further on can probably fail but 349 * this T_CHECK stuff is only to please XTI so it not 350 * necessary to be perfect. 351 */ 352 switch (cmd) { 353 case MRT_INIT: 354 case MRT_DONE: 355 case MRT_ADD_VIF: 356 case MRT_DEL_VIF: 357 case MRT_ADD_MFC: 358 case MRT_DEL_MFC: 359 case MRT_ASSERT: 360 return (0); 361 default: 362 return (EOPNOTSUPP); 363 } 364 } 365 366 /* 367 * make sure no command is issued after multicast routing has been 368 * turned off. 
369 */ 370 if (cmd != MRT_INIT && cmd != MRT_DONE) { 371 if (is_mrouter_off(ipst)) 372 return (EINVAL); 373 } 374 375 switch (cmd) { 376 case MRT_INIT: return (ip_mrouter_init(connp, data, datalen, ipst)); 377 case MRT_DONE: return (ip_mrouter_done(first_mp, ipst)); 378 case MRT_ADD_VIF: return (add_vif((struct vifctl *)data, connp, 379 first_mp, ipst)); 380 case MRT_DEL_VIF: return (del_vif((vifi_t *)data, connp, first_mp, 381 ipst)); 382 case MRT_ADD_MFC: return (add_mfc((struct mfcctl *)data, ipst)); 383 case MRT_DEL_MFC: return (del_mfc((struct mfcctl *)data, ipst)); 384 case MRT_ASSERT: return (set_assert((int *)data, ipst)); 385 default: return (EOPNOTSUPP); 386 } 387 } 388 389 /* 390 * Handle MRT getsockopt commands 391 */ 392 int 393 ip_mrouter_get(int cmd, queue_t *q, uchar_t *data) 394 { 395 conn_t *connp = Q_TO_CONN(q); 396 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 397 398 if (connp != ipst->ips_ip_g_mrouter) 399 return (EACCES); 400 401 switch (cmd) { 402 case MRT_VERSION: return (get_version((uchar_t *)data)); 403 case MRT_ASSERT: return (get_assert((uchar_t *)data, ipst)); 404 default: return (EOPNOTSUPP); 405 } 406 } 407 408 /* 409 * Handle ioctl commands to obtain information from the cache. 410 * Called with shared access to IP. These are read_only ioctls. 
411 */ 412 /* ARGSUSED */ 413 int 414 mrt_ioctl(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 415 ip_ioctl_cmd_t *ipip, void *if_req) 416 { 417 mblk_t *mp1; 418 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 419 conn_t *connp = Q_TO_CONN(q); 420 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 421 422 /* Existence verified in ip_wput_nondata */ 423 mp1 = mp->b_cont->b_cont; 424 425 switch (iocp->ioc_cmd) { 426 case (SIOCGETVIFCNT): 427 return (get_vif_cnt((struct sioc_vif_req *)mp1->b_rptr, ipst)); 428 case (SIOCGETSGCNT): 429 return (get_sg_cnt((struct sioc_sg_req *)mp1->b_rptr, ipst)); 430 case (SIOCGETLSGCNT): 431 return (get_lsg_cnt((struct sioc_lsg_req *)mp1->b_rptr, ipst)); 432 default: 433 return (EINVAL); 434 } 435 } 436 437 /* 438 * Returns the packet, byte, rpf-failure count for the source, group provided. 439 */ 440 static int 441 get_sg_cnt(struct sioc_sg_req *req, ip_stack_t *ipst) 442 { 443 struct mfc *rt; 444 struct mfcb *mfcbp; 445 446 mfcbp = &ipst->ips_mfcs[MFCHASH(req->src.s_addr, req->grp.s_addr)]; 447 MFCB_REFHOLD(mfcbp); 448 MFCFIND(mfcbp, req->src.s_addr, req->grp.s_addr, rt); 449 450 if (rt != NULL) { 451 mutex_enter(&rt->mfc_mutex); 452 req->pktcnt = rt->mfc_pkt_cnt; 453 req->bytecnt = rt->mfc_byte_cnt; 454 req->wrong_if = rt->mfc_wrong_if; 455 mutex_exit(&rt->mfc_mutex); 456 } else 457 req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffffU; 458 459 MFCB_REFRELE(mfcbp); 460 return (0); 461 } 462 463 /* 464 * Returns the packet, byte, rpf-failure count for the source, group provided. 465 * Uses larger counters and IPv6 addresses. 466 */ 467 /* ARGSUSED XXX until implemented */ 468 static int 469 get_lsg_cnt(struct sioc_lsg_req *req, ip_stack_t *ipst) 470 { 471 /* XXX TODO SIOCGETLSGCNT */ 472 return (ENXIO); 473 } 474 475 /* 476 * Returns the input and output packet and byte counts on the vif provided. 
477 */ 478 static int 479 get_vif_cnt(struct sioc_vif_req *req, ip_stack_t *ipst) 480 { 481 vifi_t vifi = req->vifi; 482 483 if (vifi >= ipst->ips_numvifs) 484 return (EINVAL); 485 486 /* 487 * No locks here, an approximation is fine. 488 */ 489 req->icount = ipst->ips_vifs[vifi].v_pkt_in; 490 req->ocount = ipst->ips_vifs[vifi].v_pkt_out; 491 req->ibytes = ipst->ips_vifs[vifi].v_bytes_in; 492 req->obytes = ipst->ips_vifs[vifi].v_bytes_out; 493 494 return (0); 495 } 496 497 static int 498 get_version(uchar_t *data) 499 { 500 int *v = (int *)data; 501 502 *v = 0x0305; /* XXX !!!! */ 503 504 return (0); 505 } 506 507 /* 508 * Set PIM assert processing global. 509 */ 510 static int 511 set_assert(int *i, ip_stack_t *ipst) 512 { 513 if ((*i != 1) && (*i != 0)) 514 return (EINVAL); 515 516 ipst->ips_pim_assert = *i; 517 518 return (0); 519 } 520 521 /* 522 * Get PIM assert processing global. 523 */ 524 static int 525 get_assert(uchar_t *data, ip_stack_t *ipst) 526 { 527 int *i = (int *)data; 528 529 *i = ipst->ips_pim_assert; 530 531 return (0); 532 } 533 534 /* 535 * Enable multicast routing. 536 */ 537 static int 538 ip_mrouter_init(conn_t *connp, uchar_t *data, int datalen, ip_stack_t *ipst) 539 { 540 int *v; 541 542 if (data == NULL || (datalen != sizeof (int))) 543 return (ENOPROTOOPT); 544 545 v = (int *)data; 546 if (*v != 1) 547 return (ENOPROTOOPT); 548 549 mutex_enter(&ipst->ips_ip_g_mrouter_mutex); 550 if (ipst->ips_ip_g_mrouter != NULL) { 551 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 552 return (EADDRINUSE); 553 } 554 555 /* 556 * MRT_INIT should only be allowed for RAW sockets, but we double 557 * check. 
558 */ 559 if (!IPCL_IS_RAWIP(connp)) { 560 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 561 return (EINVAL); 562 } 563 564 ipst->ips_ip_g_mrouter = connp; 565 connp->conn_multi_router = 1; 566 /* In order for tunnels to work we have to turn ip_g_forward on */ 567 if (!WE_ARE_FORWARDING(ipst)) { 568 if (ipst->ips_ip_mrtdebug > 1) { 569 (void) mi_strlog(connp->conn_rq, 1, SL_TRACE, 570 "ip_mrouter_init: turning on forwarding"); 571 } 572 ipst->ips_saved_ip_g_forward = ipst->ips_ip_g_forward; 573 ipst->ips_ip_g_forward = IP_FORWARD_ALWAYS; 574 } 575 576 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 577 return (0); 578 } 579 580 void 581 ip_mrouter_stack_init(ip_stack_t *ipst) 582 { 583 mutex_init(&ipst->ips_ip_g_mrouter_mutex, NULL, MUTEX_DEFAULT, NULL); 584 585 ipst->ips_vifs = kmem_zalloc(sizeof (struct vif) * (MAXVIFS+1), 586 KM_SLEEP); 587 ipst->ips_mrtstat = kmem_zalloc(sizeof (struct mrtstat), KM_SLEEP); 588 /* 589 * mfctable: 590 * Includes all mfcs, including waiting upcalls. 591 * Multiple mfcs per bucket. 592 */ 593 ipst->ips_mfcs = kmem_zalloc(sizeof (struct mfcb) * MFCTBLSIZ, 594 KM_SLEEP); 595 /* 596 * Define the token bucket filter structures. 597 * tbftable -> each vif has one of these for storing info. 598 */ 599 ipst->ips_tbfs = kmem_zalloc(sizeof (struct tbf) * MAXVIFS, KM_SLEEP); 600 601 mutex_init(&ipst->ips_last_encap_lock, NULL, MUTEX_DEFAULT, NULL); 602 603 ipst->ips_mrtstat->mrts_vifctlSize = sizeof (struct vifctl); 604 ipst->ips_mrtstat->mrts_mfcctlSize = sizeof (struct mfcctl); 605 } 606 607 /* 608 * Disable multicast routing. 609 * Didn't use global timeout_val (BSD version), instead check the mfctable. 
610 */ 611 int 612 ip_mrouter_done(mblk_t *mp, ip_stack_t *ipst) 613 { 614 conn_t *mrouter; 615 vifi_t vifi; 616 struct mfc *mfc_rt; 617 int i; 618 619 mutex_enter(&ipst->ips_ip_g_mrouter_mutex); 620 if (ipst->ips_ip_g_mrouter == NULL) { 621 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 622 return (EINVAL); 623 } 624 625 mrouter = ipst->ips_ip_g_mrouter; 626 627 if (ipst->ips_saved_ip_g_forward != -1) { 628 if (ipst->ips_ip_mrtdebug > 1) { 629 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 630 "ip_mrouter_done: turning off forwarding"); 631 } 632 ipst->ips_ip_g_forward = ipst->ips_saved_ip_g_forward; 633 ipst->ips_saved_ip_g_forward = -1; 634 } 635 636 /* 637 * Always clear cache when vifs change. 638 * No need to get ipst->ips_last_encap_lock since we are running as 639 * a writer. 640 */ 641 mutex_enter(&ipst->ips_last_encap_lock); 642 ipst->ips_last_encap_src = 0; 643 ipst->ips_last_encap_vif = NULL; 644 mutex_exit(&ipst->ips_last_encap_lock); 645 mrouter->conn_multi_router = 0; 646 647 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 648 649 /* 650 * For each phyint in use, 651 * disable promiscuous reception of all IP multicasts. 652 */ 653 for (vifi = 0; vifi < MAXVIFS; vifi++) { 654 struct vif *vifp = ipst->ips_vifs + vifi; 655 656 mutex_enter(&vifp->v_lock); 657 /* 658 * if the vif is active mark it condemned. 659 */ 660 if (vifp->v_marks & VIF_MARK_GOOD) { 661 ASSERT(vifp->v_ipif != NULL); 662 ipif_refhold(vifp->v_ipif); 663 /* Phyint only */ 664 if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) { 665 ipif_t *ipif = vifp->v_ipif; 666 ipsq_t *ipsq; 667 boolean_t suc; 668 ill_t *ill; 669 670 ill = ipif->ipif_ill; 671 suc = B_FALSE; 672 if (mp == NULL) { 673 /* 674 * being called from ip_close, 675 * lets do it synchronously. 676 * Clear VIF_MARK_GOOD and 677 * set VIF_MARK_CONDEMNED. 
678 */ 679 vifp->v_marks &= ~VIF_MARK_GOOD; 680 vifp->v_marks |= VIF_MARK_CONDEMNED; 681 mutex_exit(&(vifp)->v_lock); 682 suc = ipsq_enter(ill, B_FALSE); 683 ipsq = ill->ill_phyint->phyint_ipsq; 684 } else { 685 ipsq = ipsq_try_enter(ipif, NULL, 686 mrouter->conn_wq, mp, 687 ip_restart_optmgmt, NEW_OP, B_TRUE); 688 if (ipsq == NULL) { 689 mutex_exit(&(vifp)->v_lock); 690 ipif_refrele(ipif); 691 return (EINPROGRESS); 692 } 693 /* 694 * Clear VIF_MARK_GOOD and 695 * set VIF_MARK_CONDEMNED. 696 */ 697 vifp->v_marks &= ~VIF_MARK_GOOD; 698 vifp->v_marks |= VIF_MARK_CONDEMNED; 699 mutex_exit(&(vifp)->v_lock); 700 suc = B_TRUE; 701 } 702 703 if (suc) { 704 (void) ip_delmulti(INADDR_ANY, ipif, 705 B_TRUE, B_TRUE); 706 ipsq_exit(ipsq, B_TRUE, B_TRUE); 707 } 708 mutex_enter(&vifp->v_lock); 709 } 710 /* 711 * decreases the refcnt added in add_vif. 712 * and release v_lock. 713 */ 714 VIF_REFRELE_LOCKED(vifp); 715 } else { 716 mutex_exit(&vifp->v_lock); 717 continue; 718 } 719 } 720 721 mutex_enter(&ipst->ips_numvifs_mutex); 722 ipst->ips_numvifs = 0; 723 ipst->ips_pim_assert = 0; 724 ipst->ips_reg_vif_num = ALL_VIFS; 725 mutex_exit(&ipst->ips_numvifs_mutex); 726 727 /* 728 * Free upcall msgs. 729 * Go through mfctable and stop any outstanding upcall 730 * timeouts remaining on mfcs. 731 */ 732 for (i = 0; i < MFCTBLSIZ; i++) { 733 mutex_enter(&ipst->ips_mfcs[i].mfcb_lock); 734 ipst->ips_mfcs[i].mfcb_refcnt++; 735 ipst->ips_mfcs[i].mfcb_marks |= MFCB_MARK_CONDEMNED; 736 mutex_exit(&ipst->ips_mfcs[i].mfcb_lock); 737 mfc_rt = ipst->ips_mfcs[i].mfcb_mfc; 738 while (mfc_rt) { 739 /* Free upcalls */ 740 mutex_enter(&mfc_rt->mfc_mutex); 741 if (mfc_rt->mfc_rte != NULL) { 742 if (mfc_rt->mfc_timeout_id != 0) { 743 /* 744 * OK to drop the lock as we have 745 * a refcnt on the bucket. timeout 746 * can fire but it will see that 747 * mfc_timeout_id == 0 and not do 748 * anything. see expire_upcalls(). 
749 */ 750 mfc_rt->mfc_timeout_id = 0; 751 mutex_exit(&mfc_rt->mfc_mutex); 752 (void) untimeout( 753 mfc_rt->mfc_timeout_id); 754 mfc_rt->mfc_timeout_id = 0; 755 mutex_enter(&mfc_rt->mfc_mutex); 756 757 /* 758 * all queued upcall packets 759 * and mblk will be freed in 760 * release_mfc(). 761 */ 762 } 763 } 764 765 mfc_rt->mfc_marks |= MFCB_MARK_CONDEMNED; 766 767 mutex_exit(&mfc_rt->mfc_mutex); 768 mfc_rt = mfc_rt->mfc_next; 769 } 770 MFCB_REFRELE(&ipst->ips_mfcs[i]); 771 } 772 773 mutex_enter(&ipst->ips_ip_g_mrouter_mutex); 774 ipst->ips_ip_g_mrouter = NULL; 775 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 776 return (0); 777 } 778 779 void 780 ip_mrouter_stack_destroy(ip_stack_t *ipst) 781 { 782 struct mfcb *mfcbp; 783 struct mfc *rt; 784 int i; 785 786 for (i = 0; i < MFCTBLSIZ; i++) { 787 mfcbp = &ipst->ips_mfcs[i]; 788 789 while ((rt = mfcbp->mfcb_mfc) != NULL) { 790 (void) printf("ip_mrouter_stack_destroy: free for %d\n", 791 i); 792 793 mfcbp->mfcb_mfc = rt->mfc_next; 794 free_queue(rt); 795 mi_free(rt); 796 } 797 } 798 kmem_free(ipst->ips_vifs, sizeof (struct vif) * (MAXVIFS+1)); 799 ipst->ips_vifs = NULL; 800 kmem_free(ipst->ips_mrtstat, sizeof (struct mrtstat)); 801 ipst->ips_mrtstat = NULL; 802 kmem_free(ipst->ips_mfcs, sizeof (struct mfcb) * MFCTBLSIZ); 803 ipst->ips_mfcs = NULL; 804 kmem_free(ipst->ips_tbfs, sizeof (struct tbf) * MAXVIFS); 805 ipst->ips_tbfs = NULL; 806 807 mutex_destroy(&ipst->ips_last_encap_lock); 808 mutex_destroy(&ipst->ips_ip_g_mrouter_mutex); 809 } 810 811 static boolean_t 812 is_mrouter_off(ip_stack_t *ipst) 813 { 814 conn_t *mrouter; 815 816 mutex_enter(&ipst->ips_ip_g_mrouter_mutex); 817 if (ipst->ips_ip_g_mrouter == NULL) { 818 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 819 return (B_TRUE); 820 } 821 822 mrouter = ipst->ips_ip_g_mrouter; 823 if (mrouter->conn_multi_router == 0) { 824 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 825 return (B_TRUE); 826 } 827 mutex_exit(&ipst->ips_ip_g_mrouter_mutex); 828 return (B_FALSE); 829 } 
830 831 /* Undo lock_good_vif(): release the hold on the vif's ipif, then drop the vif reference via VIF_REFRELE(). */ static void 832 unlock_good_vif(struct vif *vifp) 833 { 834 ASSERT(vifp->v_ipif != NULL); 835 ipif_refrele(vifp->v_ipif); 836 VIF_REFRELE(vifp); 837 } 838 839 /* Try to pin a vif for use. Returns B_FALSE, holding nothing, if the vif is not marked VIF_MARK_GOOD or if IPIF_CAN_LOOKUP() fails for its ipif. Otherwise a hold is placed on the ipif, v_refcnt is incremented and B_TRUE is returned; the caller releases both with unlock_good_vif(). ill_lock is taken while v_lock is held (v_lock --> ill_lock) and neither lock is held on return. */ static boolean_t 840 lock_good_vif(struct vif *vifp) 841 { 842 mutex_enter(&vifp->v_lock); 843 if (!(vifp->v_marks & VIF_MARK_GOOD)) { 844 mutex_exit(&vifp->v_lock); 845 return (B_FALSE); 846 } 847 848 ASSERT(vifp->v_ipif != NULL); 849 mutex_enter(&vifp->v_ipif->ipif_ill->ill_lock); 850 if (!IPIF_CAN_LOOKUP(vifp->v_ipif)) { 851 mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock); 852 mutex_exit(&vifp->v_lock); 853 return (B_FALSE); 854 } 855 ipif_refhold_locked(vifp->v_ipif); 856 mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock); 857 vifp->v_refcnt++; 858 mutex_exit(&vifp->v_lock); 859 return (B_TRUE); 860 } 861 862 /* 863 * Add a vif to the vif table. 864 */ 865 static int 866 add_vif(struct vifctl *vifcp, conn_t *connp, mblk_t *first_mp, ip_stack_t *ipst) 867 { 868 struct vif *vifp = ipst->ips_vifs + vifcp->vifc_vifi; 869 ipif_t *ipif; 870 int error; 871 struct tbf *v_tbf = ipst->ips_tbfs + vifcp->vifc_vifi; 872 ipsq_t *ipsq; 873 conn_t *mrouter = ipst->ips_ip_g_mrouter; 874 875 ASSERT(connp != NULL); 876 877 if (vifcp->vifc_vifi >= MAXVIFS) 878 return (EINVAL); 879 880 if (is_mrouter_off(ipst)) 881 return (EINVAL); 882 883 mutex_enter(&vifp->v_lock); 884 /* 885 * Viftable entry should be 0. 886 * if v_marks == 0 but v_refcnt != 0 means struct is being 887 * initialized. 888 * 889 * Also note that it is very unlikely that we will get a MRT_ADD_VIF 890 * request while the delete is in progress, mrouted only sends add 891 * requests when a new interface is added and the new interface cannot 892 * have the same vifi as an existing interface. We make sure that 893 * ill_delete will block till the vif is deleted by adding a refcnt 894 * to ipif in del_vif(). 
895 */ 896 if (vifp->v_lcl_addr.s_addr != 0 || 897 vifp->v_marks != 0 || 898 vifp->v_refcnt != 0) { 899 mutex_exit(&vifp->v_lock); 900 return (EADDRINUSE); 901 } 902 903 /* Incoming vif should not be 0 */ 904 if (vifcp->vifc_lcl_addr.s_addr == 0) { 905 mutex_exit(&vifp->v_lock); 906 return (EINVAL); 907 } 908 909 vifp->v_refcnt++; 910 mutex_exit(&vifp->v_lock); 911 /* Find the interface with the local address */ 912 ipif = ipif_lookup_addr((ipaddr_t)vifcp->vifc_lcl_addr.s_addr, NULL, 913 connp->conn_zoneid, CONNP_TO_WQ(connp), first_mp, 914 ip_restart_optmgmt, &error, ipst); 915 if (ipif == NULL) { 916 VIF_REFRELE(vifp); 917 if (error == EINPROGRESS) 918 return (error); 919 return (EADDRNOTAVAIL); 920 } 921 922 /* 923 * We have to be exclusive as we have to call ip_addmulti() 924 * This is the best position to try to be exclusive in case 925 * we have to wait. 926 */ 927 ipsq = ipsq_try_enter(ipif, NULL, CONNP_TO_WQ(connp), first_mp, 928 ip_restart_optmgmt, NEW_OP, B_TRUE); 929 if ((ipsq) == NULL) { 930 VIF_REFRELE(vifp); 931 ipif_refrele(ipif); 932 return (EINPROGRESS); 933 } 934 935 if (ipst->ips_ip_mrtdebug > 1) { 936 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 937 "add_vif: src 0x%x enter", 938 vifcp->vifc_lcl_addr.s_addr); 939 } 940 941 mutex_enter(&vifp->v_lock); 942 /* 943 * Always clear cache when vifs change. 944 * Needed to ensure that src isn't left over from before vif was added. 945 * No need to get last_encap_lock, since we are running as a writer. 
946 */ 947 948 mutex_enter(&ipst->ips_last_encap_lock); 949 ipst->ips_last_encap_src = 0; 950 ipst->ips_last_encap_vif = NULL; 951 mutex_exit(&ipst->ips_last_encap_lock); 952 953 if (vifcp->vifc_flags & VIFF_TUNNEL) { 954 if ((vifcp->vifc_flags & VIFF_SRCRT) != 0) { 955 cmn_err(CE_WARN, 956 "add_vif: source route tunnels not supported\n"); 957 VIF_REFRELE_LOCKED(vifp); 958 ipif_refrele(ipif); 959 ipsq_exit(ipsq, B_TRUE, B_TRUE); 960 return (EOPNOTSUPP); 961 } 962 vifp->v_rmt_addr = vifcp->vifc_rmt_addr; 963 964 } else { 965 /* Phyint or Register vif */ 966 if (vifcp->vifc_flags & VIFF_REGISTER) { 967 /* 968 * Note: Since all IPPROTO_IP level options (including 969 * MRT_ADD_VIF) are done exclusively via 970 * ip_optmgmt_writer(), a lock is not necessary to 971 * protect reg_vif_num. 972 */ 973 mutex_enter(&ipst->ips_numvifs_mutex); 974 if (ipst->ips_reg_vif_num == ALL_VIFS) { 975 ipst->ips_reg_vif_num = vifcp->vifc_vifi; 976 mutex_exit(&ipst->ips_numvifs_mutex); 977 } else { 978 mutex_exit(&ipst->ips_numvifs_mutex); 979 VIF_REFRELE_LOCKED(vifp); 980 ipif_refrele(ipif); 981 ipsq_exit(ipsq, B_TRUE, B_TRUE); 982 return (EADDRINUSE); 983 } 984 } 985 986 /* Make sure the interface supports multicast */ 987 if ((ipif->ipif_ill->ill_flags & ILLF_MULTICAST) == 0) { 988 VIF_REFRELE_LOCKED(vifp); 989 ipif_refrele(ipif); 990 if (vifcp->vifc_flags & VIFF_REGISTER) { 991 mutex_enter(&ipst->ips_numvifs_mutex); 992 ipst->ips_reg_vif_num = ALL_VIFS; 993 mutex_exit(&ipst->ips_numvifs_mutex); 994 } 995 ipsq_exit(ipsq, B_TRUE, B_TRUE); 996 return (EOPNOTSUPP); 997 } 998 /* Enable promiscuous reception of all IP mcasts from the if */ 999 mutex_exit(&vifp->v_lock); 1000 error = ip_addmulti(INADDR_ANY, ipif, ILGSTAT_NONE, 1001 MODE_IS_EXCLUDE, NULL); 1002 mutex_enter(&vifp->v_lock); 1003 /* 1004 * since we released the lock lets make sure that 1005 * ip_mrouter_done() has not been called. 
1006 */ 1007 if (error != 0 || is_mrouter_off(ipst)) { 1008 if (error == 0) 1009 (void) ip_delmulti(INADDR_ANY, ipif, B_TRUE, 1010 B_TRUE); 1011 if (vifcp->vifc_flags & VIFF_REGISTER) { 1012 mutex_enter(&ipst->ips_numvifs_mutex); 1013 ipst->ips_reg_vif_num = ALL_VIFS; 1014 mutex_exit(&ipst->ips_numvifs_mutex); 1015 } 1016 VIF_REFRELE_LOCKED(vifp); 1017 ipif_refrele(ipif); 1018 ipsq_exit(ipsq, B_TRUE, B_TRUE); 1019 return (error?error:EINVAL); 1020 } 1021 } 1022 /* Define parameters for the tbf structure */ 1023 vifp->v_tbf = v_tbf; 1024 gethrestime(&vifp->v_tbf->tbf_last_pkt_t); 1025 vifp->v_tbf->tbf_n_tok = 0; 1026 vifp->v_tbf->tbf_q_len = 0; 1027 vifp->v_tbf->tbf_max_q_len = MAXQSIZE; 1028 vifp->v_tbf->tbf_q = vifp->v_tbf->tbf_t = NULL; 1029 1030 vifp->v_flags = vifcp->vifc_flags; 1031 vifp->v_threshold = vifcp->vifc_threshold; 1032 vifp->v_lcl_addr = vifcp->vifc_lcl_addr; 1033 vifp->v_ipif = ipif; 1034 ipif_refrele(ipif); 1035 /* Scaling up here, allows division by 1024 in critical code. */ 1036 vifp->v_rate_limit = vifcp->vifc_rate_limit * (1024/1000); 1037 vifp->v_timeout_id = 0; 1038 /* initialize per vif pkt counters */ 1039 vifp->v_pkt_in = 0; 1040 vifp->v_pkt_out = 0; 1041 vifp->v_bytes_in = 0; 1042 vifp->v_bytes_out = 0; 1043 mutex_init(&vifp->v_tbf->tbf_lock, NULL, MUTEX_DEFAULT, NULL); 1044 1045 /* Adjust numvifs up, if the vifi is higher than numvifs */ 1046 mutex_enter(&ipst->ips_numvifs_mutex); 1047 if (ipst->ips_numvifs <= vifcp->vifc_vifi) 1048 ipst->ips_numvifs = vifcp->vifc_vifi + 1; 1049 mutex_exit(&ipst->ips_numvifs_mutex); 1050 1051 if (ipst->ips_ip_mrtdebug > 1) { 1052 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, 1053 "add_vif: #%d, lcladdr %x, %s %x, thresh %x, rate %d", 1054 vifcp->vifc_vifi, 1055 ntohl(vifcp->vifc_lcl_addr.s_addr), 1056 (vifcp->vifc_flags & VIFF_TUNNEL) ? "rmtaddr" : "mask", 1057 ntohl(vifcp->vifc_rmt_addr.s_addr), 1058 vifcp->vifc_threshold, vifcp->vifc_rate_limit); 1059 }