Home | History | Annotate | Download | only in ip
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2007 Sun Microsystems, Inc.
     23  * All rights reserved.  Use is subject to license terms.
     24  */
     25 /* Copyright (c) 1990 Mentat Inc. */
     26 
     27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
     28 
     29 /*
     30  * Procedures for the kernel part of DVMRP,
     31  * a Distance-Vector Multicast Routing Protocol.
     32  * (See RFC-1075)
     33  * Written by David Waitzman, BBN Labs, August 1988.
     34  * Modified by Steve Deering, Stanford, February 1989.
     35  * Modified by Mark J. Steiglitz, Stanford, May, 1991
     36  * Modified by Van Jacobson, LBL, January 1993
     37  * Modified by Ajit Thyagarajan, PARC, August 1993
     38  * Modified by Bill Fenner, PARC, April 1995
     39  *
     40  * MROUTING 3.5
     41  */
     42 
     43 /*
     44  * TODO
     45  * - function pointer field in vif, void *vif_sendit()
     46  */
     47 
     48 #include <sys/types.h>
     49 #include <sys/stream.h>
     50 #include <sys/stropts.h>
     51 #include <sys/strlog.h>
     52 #include <sys/systm.h>
     53 #include <sys/ddi.h>
     54 #include <sys/cmn_err.h>
     55 #include <sys/zone.h>
     56 
     57 #include <sys/param.h>
     58 #include <sys/socket.h>
     59 #include <sys/vtrace.h>
     60 #include <sys/debug.h>
     61 #include <net/if.h>
     62 #include <sys/sockio.h>
     63 #include <netinet/in.h>
     64 #include <net/if_dl.h>
     65 
     66 #include <inet/common.h>
     67 #include <inet/mi.h>
     68 #include <inet/nd.h>
     69 #include <inet/mib2.h>
     70 #include <netinet/ip6.h>
     71 #include <inet/ip.h>
     72 #include <inet/snmpcom.h>
     73 
     74 #include <netinet/igmp.h>
     75 #include <netinet/igmp_var.h>
     76 #include <netinet/udp.h>
     77 #include <netinet/ip_mroute.h>
     78 #include <inet/ip_multi.h>
     79 #include <inet/ip_ire.h>
     80 #include <inet/ip_if.h>
     81 #include <inet/ipclassifier.h>
     82 
     83 #include <netinet/pim.h>
     84 
     85 
     86 /*
     87  * MT Design:
     88  *
     89  * There are three main data structures viftable, mfctable and tbftable that
     90  * need to be protected against MT races.
     91  *
     92  * viftable is a fixed length array of vif structs. There is no lock to protect
     93  * the whole array, instead each struct is protected by its own individual lock.
     94  * The value of v_marks in conjunction with the value of v_refcnt determines the
     95  * current state of a vif structure. One special state that needs mention
     96  * is when the vif is marked VIF_MARK_NOTINUSE but refcnt != 0. This indicates
     97  * that vif is being initialized.
     98  * Each structure is freed when the refcnt goes down to zero. If a delete comes
     99  * in when the refcnt is > 1, the vif structure is marked VIF_MARK_CONDEMNED
    100  * which prevents the struct from further use.  When the refcnt goes to zero
    101  * the struct is freed and is marked VIF_MARK_NOTINUSE.
    102  * vif struct stores a pointer to the ipif in v_ipif, to prevent ipif/ill
    103  * from  going away a refhold is put on the ipif before using it. see
    104  * lock_good_vif() and unlock_good_vif().
    105  *
    106  * VIF_REFHOLD and VIF_REFRELE macros have been provided to manipulate refcnts
    107  * of the vif struct.
    108  *
    109  * tbftable is also a fixed length array of tbf structs and is only accessed
    110  * via v_tbf.  It is protected by its own lock tbf_lock.
    111  *
    112  * Lock Ordering is
    113  * v_lock --> tbf_lock
    114  * v_lock --> ill_lock
    115  *
    116  * mfctable is a fixed size hash table of mfc bucket structs (struct mfcb).
    117  * Each mfc bucket struct (struct mfcb) maintains a refcnt for each walker,
    118  * it also maintains a state. These fields are protected by a lock (mfcb_lock).
    119  * mfc structs only maintain a state and have no refcnt. mfc_mutex is used to
    120  * protect the struct elements.
    121  *
    122  * mfc structs are dynamically allocated and are singly linked
    123  * at the head of the chain. When an mfc structure is to be deleted
    124  * it is marked condemned and so is the state in the bucket struct.
    125  * When the last walker of the hash bucket exits all the mfc structs
    126  * marked condemned are freed.
    127  *
    128  * Locking Hierarchy:
    129  * The bucket lock should be acquired before the mfc struct lock.
    130  * MFCB_REFHOLD and MFCB_REFRELE macros are provided for locking
    131  * operations on the bucket struct.
    132  *
    133  * last_encap_lock and numvifs_mutex should be acquired after
    134  * acquiring vif or mfc locks. These locks protect some global variables.
    135  *
    136  * The statistics are not currently protected by a lock,
    137  * causing the stats to be approximate, not exact.
    138  */
    139 
    140 #define	NO_VIF	MAXVIFS 	/* from mrouted, no route for src */
    141 
    142 /*
    143  * Timeouts:
    144  * 	Upcall timeouts - BSD uses boolean_t mfc->expire and
    145  *	nexpire[MFCTBLSIZE], the number of times expire has been called.
    146  *	SunOS 5.x uses mfc->timeout for each mfc.
    147  *	Some Unixes are limited in the number of simultaneous timeouts
    148  * 	that can be run, SunOS 5.x does not have this restriction.
    149  */
    150 
    151 /*
    152  * In BSD, EXPIRE_TIMEOUT is how often expire_upcalls() is called and
    153  * UPCALL_EXPIRE is the nmber of timeouts before a particular upcall
    154  * expires. Thus the time till expiration is EXPIRE_TIMEOUT * UPCALL_EXPIRE
    155  */
    156 #define		EXPIRE_TIMEOUT	(hz/4)	/* 4x / second	*/
    157 #define		UPCALL_EXPIRE	6	/* number of timeouts	*/
    158 
    159 /*
    160  * Hash function for a source, group entry
    161  */
    162 #define	MFCHASH(a, g) MFCHASHMOD(((a) >> 20) ^ ((a) >> 10) ^ (a) ^ \
    163 	((g) >> 20) ^ ((g) >> 10) ^ (g))
    164 
    165 #define			TBF_REPROCESS	(hz / 100)	/* 100x /second	*/
    166 
    167 /* Identify PIM packet that came on a Register interface */
    168 #define	PIM_REGISTER_MARKER	0xffffffff
    169 
    170 /* Function declarations */
    171 static int	add_mfc(struct mfcctl *, ip_stack_t *);
    172 static int	add_vif(struct vifctl *, conn_t *, mblk_t *, ip_stack_t *);
    173 static int	del_mfc(struct mfcctl *, ip_stack_t *);
    174 static int	del_vif(vifi_t *, conn_t *, mblk_t *, ip_stack_t *);
    175 static void	del_vifp(struct vif *);
    176 static void	encap_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
    177 static void	expire_upcalls(void *);
    178 static void	fill_route(struct mfc *, struct mfcctl *, ip_stack_t *);
    179 static void	free_queue(struct mfc *);
    180 static int	get_assert(uchar_t *, ip_stack_t *);
    181 static int	get_lsg_cnt(struct sioc_lsg_req *, ip_stack_t *);
    182 static int	get_sg_cnt(struct sioc_sg_req *, ip_stack_t *);
    183 static int	get_version(uchar_t *);
    184 static int	get_vif_cnt(struct sioc_vif_req *, ip_stack_t *);
    185 static int	ip_mdq(mblk_t *, ipha_t *, ill_t *,
    186 		    ipaddr_t, struct mfc *);
    187 static int	ip_mrouter_init(conn_t *, uchar_t *, int, ip_stack_t *);
    188 static void	phyint_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
    189 static int	register_mforward(queue_t *, mblk_t *, ill_t *);
    190 static void	register_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
    191 static int	set_assert(int *, ip_stack_t *);
    192 
    193 /*
    194  * Token Bucket Filter functions
    195  */
    196 static int  priority(struct vif *, ipha_t *);
    197 static void tbf_control(struct vif *, mblk_t *, ipha_t *);
    198 static int  tbf_dq_sel(struct vif *, ipha_t *);
    199 static void tbf_process_q(struct vif *);
    200 static void tbf_queue(struct vif *, mblk_t *);
    201 static void tbf_reprocess_q(void *);
    202 static void tbf_send_packet(struct vif *, mblk_t *);
    203 static void tbf_update_tokens(struct vif *);
    204 static void release_mfc(struct mfcb *);
    205 
    206 static boolean_t is_mrouter_off(ip_stack_t *);
    207 /*
    208  * Encapsulation packets
    209  */
    210 
    211 #define	ENCAP_TTL	64
    212 
    213 /* prototype IP hdr for encapsulated packets */
    214 static ipha_t multicast_encap_iphdr = {
    215 	IP_SIMPLE_HDR_VERSION,
    216 	0,				/* tos */
    217 	sizeof (ipha_t),		/* total length */
    218 	0,				/* id */
    219 	0,				/* frag offset */
    220 	ENCAP_TTL, IPPROTO_ENCAP,
    221 	0,				/* checksum */
    222 };
    223 
    224 /*
    225  * Rate limit for assert notification messages, in nsec.
    226  */
    227 #define	ASSERT_MSG_TIME		3000000000
    228 
    229 
    230 #define	VIF_REFHOLD(vifp) {			\
    231 	mutex_enter(&(vifp)->v_lock);		\
    232 	(vifp)->v_refcnt++;			\
    233 	mutex_exit(&(vifp)->v_lock);		\
    234 }
    235 
    236 #define	VIF_REFRELE_LOCKED(vifp) {				\
    237 	(vifp)->v_refcnt--;					\
    238 	if ((vifp)->v_refcnt == 0 &&				\
    239 		((vifp)->v_marks & VIF_MARK_CONDEMNED)) {	\
    240 			del_vifp(vifp);				\
    241 	} else {						\
    242 		mutex_exit(&(vifp)->v_lock);			\
    243 	}							\
    244 }
    245 
    246 #define	VIF_REFRELE(vifp) {					\
    247 	mutex_enter(&(vifp)->v_lock);				\
    248 	(vifp)->v_refcnt--;					\
    249 	if ((vifp)->v_refcnt == 0 &&				\
    250 		((vifp)->v_marks & VIF_MARK_CONDEMNED)) {	\
    251 			del_vifp(vifp);				\
    252 	} else {						\
    253 		mutex_exit(&(vifp)->v_lock);			\
    254 	}							\
    255 }
    256 
    257 #define	MFCB_REFHOLD(mfcb) {				\
    258 	mutex_enter(&(mfcb)->mfcb_lock);		\
    259 	(mfcb)->mfcb_refcnt++;				\
    260 	ASSERT((mfcb)->mfcb_refcnt != 0);		\
    261 	mutex_exit(&(mfcb)->mfcb_lock);			\
    262 }
    263 
    264 #define	MFCB_REFRELE(mfcb) {					\
    265 	mutex_enter(&(mfcb)->mfcb_lock);			\
    266 	ASSERT((mfcb)->mfcb_refcnt != 0);			\
    267 	if (--(mfcb)->mfcb_refcnt == 0 &&			\
    268 		((mfcb)->mfcb_marks & MFCB_MARK_CONDEMNED)) {	\
    269 			release_mfc(mfcb);			\
    270 	}							\
    271 	mutex_exit(&(mfcb)->mfcb_lock);				\
    272 }
    273 
    274 /*
    275  * MFCFIND:
    276  * Find a route for a given origin IP address and multicast group address.
    277  * Skip entries with pending upcalls.
    278  * Type of service parameter to be added in the future!
    279  */
    280 #define	MFCFIND(mfcbp, o, g, rt) { \
    281 	struct mfc *_mb_rt = NULL; \
    282 	rt = NULL; \
    283 	_mb_rt = mfcbp->mfcb_mfc; \
    284 	while (_mb_rt) { \
    285 		if ((_mb_rt->mfc_origin.s_addr == o) && \
    286 		    (_mb_rt->mfc_mcastgrp.s_addr == g) && \
    287 		    (_mb_rt->mfc_rte == NULL) && \
    288 		    (!(_mb_rt->mfc_marks & MFCB_MARK_CONDEMNED))) {        \
    289 		    rt = _mb_rt; \
    290 		    break; \
    291 		} \
    292 	_mb_rt = _mb_rt->mfc_next; \
    293 	} \
    294 }
    295 
    296 /*
    297  * BSD uses timeval with sec and usec. In SunOS 5.x uniqtime() and gethrtime()
    298  * are inefficient. We use gethrestime() which returns a timespec_t with
    299  * sec and nsec, the resolution is machine dependent.
    300  * The following 2 macros have been changed to use nsec instead of usec.
    301  */
    302 /*
    303  * Macros to compute elapsed time efficiently.
    304  * Borrowed from Van Jacobson's scheduling code.
    305  * Delta should be a hrtime_t.
    306  */
    307 #define	TV_DELTA(a, b, delta) { \
    308 	int xxs; \
    309  \
    310 	delta = (a).tv_nsec - (b).tv_nsec; \
    311 	if ((xxs = (a).tv_sec - (b).tv_sec) != 0) { \
    312 		switch (xxs) { \
    313 		case 2: \
    314 		    delta += 1000000000; \
    315 		    /*FALLTHROUGH*/ \
    316 		case 1: \
    317 		    delta += 1000000000; \
    318 		    break; \
    319 		default: \
    320 		    delta += (1000000000 * xxs); \
    321 		} \
    322 	} \
    323 }
    324 
    325 #define	TV_LT(a, b) (((a).tv_nsec < (b).tv_nsec && \
    326 	(a).tv_sec <= (b).tv_sec) || (a).tv_sec < (b).tv_sec)
    327 
    328 /*
    329  * Handle MRT setsockopt commands to modify the multicast routing tables.
    330  */
    331 int
    332 ip_mrouter_set(int cmd, queue_t *q, int checkonly, uchar_t *data,
    333     int datalen, mblk_t *first_mp)
    334 {
    335 	conn_t		*connp = Q_TO_CONN(q);
    336 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
    337 
    338 	mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
    339 	if (cmd != MRT_INIT && connp != ipst->ips_ip_g_mrouter) {
    340 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
    341 		return (EACCES);
    342 	}
    343 	mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
    344 
    345 	if (checkonly) {
    346 		/*
    347 		 * do not do operation, just pretend to - new T_CHECK
    348 		 * Note: Even routines further on can probably fail but
    349 		 * this T_CHECK stuff is only to please XTI so it not
    350 		 * necessary to be perfect.
    351 		 */
    352 		switch (cmd) {
    353 		case MRT_INIT:
    354 		case MRT_DONE:
    355 		case MRT_ADD_VIF:
    356 		case MRT_DEL_VIF:
    357 		case MRT_ADD_MFC:
    358 		case MRT_DEL_MFC:
    359 		case MRT_ASSERT:
    360 			return (0);
    361 		default:
    362 			return (EOPNOTSUPP);
    363 		}
    364 	}
    365 
    366 	/*
    367 	 * make sure no command is issued after multicast routing has been
    368 	 * turned off.
    369 	 */
    370 	if (cmd != MRT_INIT && cmd != MRT_DONE) {
    371 		if (is_mrouter_off(ipst))
    372 			return (EINVAL);
    373 	}
    374 
    375 	switch (cmd) {
    376 	case MRT_INIT:	return (ip_mrouter_init(connp, data, datalen, ipst));
    377 	case MRT_DONE:	return (ip_mrouter_done(first_mp, ipst));
    378 	case MRT_ADD_VIF:  return (add_vif((struct vifctl *)data, connp,
    379 			    first_mp, ipst));
    380 	case MRT_DEL_VIF:  return (del_vif((vifi_t *)data, connp, first_mp,
    381 			    ipst));
    382 	case MRT_ADD_MFC:  return (add_mfc((struct mfcctl *)data, ipst));
    383 	case MRT_DEL_MFC:  return (del_mfc((struct mfcctl *)data, ipst));
    384 	case MRT_ASSERT:   return (set_assert((int *)data, ipst));
    385 	default:	   return (EOPNOTSUPP);
    386 	}
    387 }
    388 
    389 /*
    390  * Handle MRT getsockopt commands
    391  */
    392 int
    393 ip_mrouter_get(int cmd, queue_t *q, uchar_t *data)
    394 {
    395 	conn_t		*connp = Q_TO_CONN(q);
    396 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
    397 
    398 	if (connp != ipst->ips_ip_g_mrouter)
    399 		return (EACCES);
    400 
    401 	switch (cmd) {
    402 	case MRT_VERSION:	return (get_version((uchar_t *)data));
    403 	case MRT_ASSERT:	return (get_assert((uchar_t *)data, ipst));
    404 	default:		return (EOPNOTSUPP);
    405 	}
    406 }
    407 
    408 /*
    409  * Handle ioctl commands to obtain information from the cache.
    410  * Called with shared access to IP. These are read_only ioctls.
    411  */
    412 /* ARGSUSED */
    413 int
    414 mrt_ioctl(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    415     ip_ioctl_cmd_t *ipip, void *if_req)
    416 {
    417 	mblk_t	*mp1;
    418 	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
    419 	conn_t		*connp = Q_TO_CONN(q);
    420 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
    421 
    422 	/* Existence verified in ip_wput_nondata */
    423 	mp1 = mp->b_cont->b_cont;
    424 
    425 	switch (iocp->ioc_cmd) {
    426 	case (SIOCGETVIFCNT):
    427 		return (get_vif_cnt((struct sioc_vif_req *)mp1->b_rptr, ipst));
    428 	case (SIOCGETSGCNT):
    429 		return (get_sg_cnt((struct sioc_sg_req *)mp1->b_rptr, ipst));
    430 	case (SIOCGETLSGCNT):
    431 		return (get_lsg_cnt((struct sioc_lsg_req *)mp1->b_rptr, ipst));
    432 	default:
    433 		return (EINVAL);
    434 	}
    435 }
    436 
    437 /*
    438  * Returns the packet, byte, rpf-failure count for the source, group provided.
    439  */
    440 static int
    441 get_sg_cnt(struct sioc_sg_req *req, ip_stack_t *ipst)
    442 {
    443 	struct mfc *rt;
    444 	struct mfcb *mfcbp;
    445 
    446 	mfcbp = &ipst->ips_mfcs[MFCHASH(req->src.s_addr, req->grp.s_addr)];
    447 	MFCB_REFHOLD(mfcbp);
    448 	MFCFIND(mfcbp, req->src.s_addr, req->grp.s_addr, rt);
    449 
    450 	if (rt != NULL) {
    451 		mutex_enter(&rt->mfc_mutex);
    452 		req->pktcnt   = rt->mfc_pkt_cnt;
    453 		req->bytecnt  = rt->mfc_byte_cnt;
    454 		req->wrong_if = rt->mfc_wrong_if;
    455 		mutex_exit(&rt->mfc_mutex);
    456 	} else
    457 		req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffffU;
    458 
    459 	MFCB_REFRELE(mfcbp);
    460 	return (0);
    461 }
    462 
    463 /*
    464  * Returns the packet, byte, rpf-failure count for the source, group provided.
    465  * Uses larger counters and IPv6 addresses.
    466  */
    467 /* ARGSUSED XXX until implemented */
    468 static int
    469 get_lsg_cnt(struct sioc_lsg_req *req, ip_stack_t *ipst)
    470 {
    471 	/* XXX TODO SIOCGETLSGCNT */
    472 	return (ENXIO);
    473 }
    474 
    475 /*
    476  * Returns the input and output packet and byte counts on the vif provided.
    477  */
    478 static int
    479 get_vif_cnt(struct sioc_vif_req *req, ip_stack_t *ipst)
    480 {
    481 	vifi_t vifi = req->vifi;
    482 
    483 	if (vifi >= ipst->ips_numvifs)
    484 		return (EINVAL);
    485 
    486 	/*
    487 	 * No locks here, an approximation is fine.
    488 	 */
    489 	req->icount = ipst->ips_vifs[vifi].v_pkt_in;
    490 	req->ocount = ipst->ips_vifs[vifi].v_pkt_out;
    491 	req->ibytes = ipst->ips_vifs[vifi].v_bytes_in;
    492 	req->obytes = ipst->ips_vifs[vifi].v_bytes_out;
    493 
    494 	return (0);
    495 }
    496 
    497 static int
    498 get_version(uchar_t *data)
    499 {
    500 	int *v = (int *)data;
    501 
    502 	*v = 0x0305;	/* XXX !!!! */
    503 
    504 	return (0);
    505 }
    506 
    507 /*
    508  * Set PIM assert processing global.
    509  */
    510 static int
    511 set_assert(int *i, ip_stack_t *ipst)
    512 {
    513 	if ((*i != 1) && (*i != 0))
    514 		return (EINVAL);
    515 
    516 	ipst->ips_pim_assert = *i;
    517 
    518 	return (0);
    519 }
    520 
    521 /*
    522  * Get PIM assert processing global.
    523  */
    524 static int
    525 get_assert(uchar_t *data, ip_stack_t *ipst)
    526 {
    527 	int *i = (int *)data;
    528 
    529 	*i = ipst->ips_pim_assert;
    530 
    531 	return (0);
    532 }
    533 
    534 /*
    535  * Enable multicast routing.
    536  */
    537 static int
    538 ip_mrouter_init(conn_t *connp, uchar_t *data, int datalen, ip_stack_t *ipst)
    539 {
    540 	int	*v;
    541 
    542 	if (data == NULL || (datalen != sizeof (int)))
    543 		return (ENOPROTOOPT);
    544 
    545 	v = (int *)data;
    546 	if (*v != 1)
    547 		return (ENOPROTOOPT);
    548 
    549 	mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
    550 	if (ipst->ips_ip_g_mrouter != NULL) {
    551 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
    552 		return (EADDRINUSE);
    553 	}
    554 
    555 	/*
    556 	 * MRT_INIT should only be allowed for RAW sockets, but we double
    557 	 * check.
    558 	 */
    559 	if (!IPCL_IS_RAWIP(connp)) {
    560 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
    561 		return (EINVAL);
    562 	}
    563 
    564 	ipst->ips_ip_g_mrouter = connp;
    565 	connp->conn_multi_router = 1;
    566 	/* In order for tunnels to work we have to turn ip_g_forward on */
    567 	if (!WE_ARE_FORWARDING(ipst)) {
    568 		if (ipst->ips_ip_mrtdebug > 1) {
    569 			(void) mi_strlog(connp->conn_rq, 1, SL_TRACE,
    570 			    "ip_mrouter_init: turning on forwarding");
    571 		}
    572 		ipst->ips_saved_ip_g_forward = ipst->ips_ip_g_forward;
    573 		ipst->ips_ip_g_forward = IP_FORWARD_ALWAYS;
    574 	}
    575 
    576 	mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
    577 	return (0);
    578 }
    579 
    580 void
    581 ip_mrouter_stack_init(ip_stack_t *ipst)
    582 {
    583 	mutex_init(&ipst->ips_ip_g_mrouter_mutex, NULL, MUTEX_DEFAULT, NULL);
    584 
    585 	ipst->ips_vifs = kmem_zalloc(sizeof (struct vif) * (MAXVIFS+1),
    586 	    KM_SLEEP);
    587 	ipst->ips_mrtstat = kmem_zalloc(sizeof (struct mrtstat), KM_SLEEP);
    588 	/*
    589 	 * mfctable:
    590 	 * Includes all mfcs, including waiting upcalls.
    591 	 * Multiple mfcs per bucket.
    592 	 */
    593 	ipst->ips_mfcs = kmem_zalloc(sizeof (struct mfcb) * MFCTBLSIZ,
    594 	    KM_SLEEP);
    595 	/*
    596 	 * Define the token bucket filter structures.
    597 	 * tbftable -> each vif has one of these for storing info.
    598 	 */
    599 	ipst->ips_tbfs = kmem_zalloc(sizeof (struct tbf) * MAXVIFS, KM_SLEEP);
    600 
    601 	mutex_init(&ipst->ips_last_encap_lock, NULL, MUTEX_DEFAULT, NULL);
    602 
    603 	ipst->ips_mrtstat->mrts_vifctlSize = sizeof (struct vifctl);
    604 	ipst->ips_mrtstat->mrts_mfcctlSize = sizeof (struct mfcctl);
    605 }
    606 
    607 /*
    608  * Disable multicast routing.
    609  * Didn't use global timeout_val (BSD version), instead check the mfctable.
    610  */
    611 int
    612 ip_mrouter_done(mblk_t *mp, ip_stack_t *ipst)
    613 {
    614 	conn_t		*mrouter;
    615 	vifi_t 		vifi;
    616 	struct mfc	*mfc_rt;
    617 	int		i;
    618 
    619 	mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
    620 	if (ipst->ips_ip_g_mrouter == NULL) {
    621 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
    622 		return (EINVAL);
    623 	}
    624 
    625 	mrouter = ipst->ips_ip_g_mrouter;
    626 
    627 	if (ipst->ips_saved_ip_g_forward != -1) {
    628 		if (ipst->ips_ip_mrtdebug > 1) {
    629 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
    630 			    "ip_mrouter_done: turning off forwarding");
    631 		}
    632 		ipst->ips_ip_g_forward = ipst->ips_saved_ip_g_forward;
    633 		ipst->ips_saved_ip_g_forward = -1;
    634 	}
    635 
    636 	/*
    637 	 * Always clear cache when vifs change.
    638 	 * No need to get ipst->ips_last_encap_lock since we are running as
    639 	 * a writer.
    640 	 */
    641 	mutex_enter(&ipst->ips_last_encap_lock);
    642 	ipst->ips_last_encap_src = 0;
    643 	ipst->ips_last_encap_vif = NULL;
    644 	mutex_exit(&ipst->ips_last_encap_lock);
    645 	mrouter->conn_multi_router = 0;
    646 
    647 	mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
    648 
    649 	/*
    650 	 * For each phyint in use,
    651 	 * disable promiscuous reception of all IP multicasts.
    652 	 */
    653 	for (vifi = 0; vifi < MAXVIFS; vifi++) {
    654 		struct vif *vifp = ipst->ips_vifs + vifi;
    655 
    656 		mutex_enter(&vifp->v_lock);
    657 		/*
    658 		 * if the vif is active mark it condemned.
    659 		 */
    660 		if (vifp->v_marks & VIF_MARK_GOOD) {
    661 			ASSERT(vifp->v_ipif != NULL);
    662 			ipif_refhold(vifp->v_ipif);
    663 			/* Phyint only */
    664 			if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) {
    665 				ipif_t *ipif = vifp->v_ipif;
    666 				ipsq_t  *ipsq;
    667 				boolean_t suc;
    668 				ill_t *ill;
    669 
    670 				ill = ipif->ipif_ill;
    671 				suc = B_FALSE;
    672 				if (mp == NULL) {
    673 					/*
    674 					 * being called from ip_close,
    675 					 * lets do it synchronously.
    676 					 * Clear VIF_MARK_GOOD and
    677 					 * set VIF_MARK_CONDEMNED.
    678 					 */
    679 					vifp->v_marks &= ~VIF_MARK_GOOD;
    680 					vifp->v_marks |= VIF_MARK_CONDEMNED;
    681 					mutex_exit(&(vifp)->v_lock);
    682 					suc = ipsq_enter(ill, B_FALSE);
    683 					ipsq = ill->ill_phyint->phyint_ipsq;
    684 				} else {
    685 					ipsq = ipsq_try_enter(ipif, NULL,
    686 					    mrouter->conn_wq, mp,
    687 					    ip_restart_optmgmt, NEW_OP, B_TRUE);
    688 					if (ipsq == NULL) {
    689 						mutex_exit(&(vifp)->v_lock);
    690 						ipif_refrele(ipif);
    691 						return (EINPROGRESS);
    692 					}
    693 					/*
    694 					 * Clear VIF_MARK_GOOD and
    695 					 * set VIF_MARK_CONDEMNED.
    696 					 */
    697 					vifp->v_marks &= ~VIF_MARK_GOOD;
    698 					vifp->v_marks |= VIF_MARK_CONDEMNED;
    699 					mutex_exit(&(vifp)->v_lock);
    700 					suc = B_TRUE;
    701 				}
    702 
    703 				if (suc) {
    704 					(void) ip_delmulti(INADDR_ANY, ipif,
    705 					    B_TRUE, B_TRUE);
    706 					ipsq_exit(ipsq, B_TRUE, B_TRUE);
    707 				}
    708 				mutex_enter(&vifp->v_lock);
    709 			}
    710 			/*
    711 			 * decreases the refcnt added in add_vif.
    712 			 * and release v_lock.
    713 			 */
    714 			VIF_REFRELE_LOCKED(vifp);
    715 		} else {
    716 			mutex_exit(&vifp->v_lock);
    717 			continue;
    718 		}
    719 	}
    720 
    721 	mutex_enter(&ipst->ips_numvifs_mutex);
    722 	ipst->ips_numvifs = 0;
    723 	ipst->ips_pim_assert = 0;
    724 	ipst->ips_reg_vif_num = ALL_VIFS;
    725 	mutex_exit(&ipst->ips_numvifs_mutex);
    726 
    727 	/*
    728 	 * Free upcall msgs.
    729 	 * Go through mfctable and stop any outstanding upcall
    730 	 * timeouts remaining on mfcs.
    731 	 */
    732 	for (i = 0; i < MFCTBLSIZ; i++) {
    733 		mutex_enter(&ipst->ips_mfcs[i].mfcb_lock);
    734 		ipst->ips_mfcs[i].mfcb_refcnt++;
    735 		ipst->ips_mfcs[i].mfcb_marks |= MFCB_MARK_CONDEMNED;
    736 		mutex_exit(&ipst->ips_mfcs[i].mfcb_lock);
    737 		mfc_rt = ipst->ips_mfcs[i].mfcb_mfc;
    738 		while (mfc_rt) {
    739 			/* Free upcalls */
    740 			mutex_enter(&mfc_rt->mfc_mutex);
    741 			if (mfc_rt->mfc_rte != NULL) {
    742 				if (mfc_rt->mfc_timeout_id != 0) {
    743 					/*
    744 					 * OK to drop the lock as we have
    745 					 * a refcnt on the bucket. timeout
    746 					 * can fire but it will see that
    747 					 * mfc_timeout_id == 0 and not do
    748 					 * anything. see expire_upcalls().
    749 					 */
    750 					mfc_rt->mfc_timeout_id = 0;
    751 					mutex_exit(&mfc_rt->mfc_mutex);
    752 					(void) untimeout(
    753 					    mfc_rt->mfc_timeout_id);
    754 						mfc_rt->mfc_timeout_id = 0;
    755 					mutex_enter(&mfc_rt->mfc_mutex);
    756 
    757 					/*
    758 					 * all queued upcall packets
    759 					 * and mblk will be freed in
    760 					 * release_mfc().
    761 					 */
    762 				}
    763 			}
    764 
    765 			mfc_rt->mfc_marks |= MFCB_MARK_CONDEMNED;
    766 
    767 			mutex_exit(&mfc_rt->mfc_mutex);
    768 			mfc_rt = mfc_rt->mfc_next;
    769 		}
    770 		MFCB_REFRELE(&ipst->ips_mfcs[i]);
    771 	}
    772 
    773 	mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
    774 	ipst->ips_ip_g_mrouter = NULL;
    775 	mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
    776 	return (0);
    777 }
    778 
    779 void
    780 ip_mrouter_stack_destroy(ip_stack_t *ipst)
    781 {
    782 	struct mfcb *mfcbp;
    783 	struct mfc  *rt;
    784 	int i;
    785 
    786 	for (i = 0; i < MFCTBLSIZ; i++) {
    787 		mfcbp = &ipst->ips_mfcs[i];
    788 
    789 		while ((rt = mfcbp->mfcb_mfc) != NULL) {
    790 			(void) printf("ip_mrouter_stack_destroy: free for %d\n",
    791 			    i);
    792 
    793 			mfcbp->mfcb_mfc = rt->mfc_next;
    794 			free_queue(rt);
    795 			mi_free(rt);
    796 		}
    797 	}
    798 	kmem_free(ipst->ips_vifs, sizeof (struct vif) * (MAXVIFS+1));
    799 	ipst->ips_vifs = NULL;
    800 	kmem_free(ipst->ips_mrtstat, sizeof (struct mrtstat));
    801 	ipst->ips_mrtstat = NULL;
    802 	kmem_free(ipst->ips_mfcs, sizeof (struct mfcb) * MFCTBLSIZ);
    803 	ipst->ips_mfcs = NULL;
    804 	kmem_free(ipst->ips_tbfs, sizeof (struct tbf) * MAXVIFS);
    805 	ipst->ips_tbfs = NULL;
    806 
    807 	mutex_destroy(&ipst->ips_last_encap_lock);
    808 	mutex_destroy(&ipst->ips_ip_g_mrouter_mutex);
    809 }
    810 
    811 static boolean_t
    812 is_mrouter_off(ip_stack_t *ipst)
    813 {
    814 	conn_t	*mrouter;
    815 
    816 	mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
    817 	if (ipst->ips_ip_g_mrouter == NULL) {
    818 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
    819 		return (B_TRUE);
    820 	}
    821 
    822 	mrouter = ipst->ips_ip_g_mrouter;
    823 	if (mrouter->conn_multi_router == 0) {
    824 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
    825 		return (B_TRUE);
    826 	}
    827 	mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
    828 	return (B_FALSE);
    829 }
    830 
    831 static void
    832 unlock_good_vif(struct vif *vifp)
    833 {
    834 	ASSERT(vifp->v_ipif != NULL);
    835 	ipif_refrele(vifp->v_ipif);
    836 	VIF_REFRELE(vifp);
    837 }
    838 
    839 static boolean_t
    840 lock_good_vif(struct vif *vifp)
    841 {
    842 	mutex_enter(&vifp->v_lock);
    843 	if (!(vifp->v_marks & VIF_MARK_GOOD)) {
    844 		mutex_exit(&vifp->v_lock);
    845 		return (B_FALSE);
    846 	}
    847 
    848 	ASSERT(vifp->v_ipif != NULL);
    849 	mutex_enter(&vifp->v_ipif->ipif_ill->ill_lock);
    850 	if (!IPIF_CAN_LOOKUP(vifp->v_ipif)) {
    851 		mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock);
    852 		mutex_exit(&vifp->v_lock);
    853 		return (B_FALSE);
    854 	}
    855 	ipif_refhold_locked(vifp->v_ipif);
    856 	mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock);
    857 	vifp->v_refcnt++;
    858 	mutex_exit(&vifp->v_lock);
    859 	return (B_TRUE);
    860 }
    861 
    862 /*
    863  * Add a vif to the vif table.
    864  */
    865 static int
    866 add_vif(struct vifctl *vifcp, conn_t *connp, mblk_t *first_mp, ip_stack_t *ipst)
    867 {
    868 	struct vif	*vifp = ipst->ips_vifs + vifcp->vifc_vifi;
    869 	ipif_t		*ipif;
    870 	int		error;
    871 	struct tbf	*v_tbf = ipst->ips_tbfs + vifcp->vifc_vifi;
    872 	ipsq_t  	*ipsq;
    873 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
    874 
    875 	ASSERT(connp != NULL);
    876 
    877 	if (vifcp->vifc_vifi >= MAXVIFS)
    878 		return (EINVAL);
    879 
    880 	if (is_mrouter_off(ipst))
    881 		return (EINVAL);
    882 
    883 	mutex_enter(&vifp->v_lock);
    884 	/*
    885 	 * Viftable entry should be 0.
    886 	 * if v_marks == 0 but v_refcnt != 0 means struct is being
    887 	 * initialized.
    888 	 *
    889 	 * Also note that it is very unlikely that we will get a MRT_ADD_VIF
    890 	 * request while the delete is in progress, mrouted only sends add
    891 	 * requests when a new interface is added and the new interface cannot
    892 	 * have the same vifi as an existing interface. We make sure that
    893 	 * ill_delete will block till the vif is deleted by adding a refcnt
    894 	 * to ipif in del_vif().
    895 	 */
    896 	if (vifp->v_lcl_addr.s_addr != 0 ||
    897 	    vifp->v_marks != 0 ||
    898 	    vifp->v_refcnt != 0) {
    899 		mutex_exit(&vifp->v_lock);
    900 		return (EADDRINUSE);
    901 	}
    902 
    903 	/* Incoming vif should not be 0 */
    904 	if (vifcp->vifc_lcl_addr.s_addr == 0) {
    905 		mutex_exit(&vifp->v_lock);
    906 		return (EINVAL);
    907 	}
    908 
    909 	vifp->v_refcnt++;
    910 	mutex_exit(&vifp->v_lock);
    911 	/* Find the interface with the local address */
    912 	ipif = ipif_lookup_addr((ipaddr_t)vifcp->vifc_lcl_addr.s_addr, NULL,
    913 	    connp->conn_zoneid, CONNP_TO_WQ(connp), first_mp,
    914 	    ip_restart_optmgmt, &error, ipst);
    915 	if (ipif == NULL) {
    916 		VIF_REFRELE(vifp);
    917 		if (error == EINPROGRESS)
    918 			return (error);
    919 		return (EADDRNOTAVAIL);
    920 	}
    921 
    922 	/*
    923 	 * We have to be exclusive as we have to call ip_addmulti()
    924 	 * This is the best position to try to be exclusive in case
    925 	 * we have to wait.
    926 	 */
    927 	ipsq = ipsq_try_enter(ipif, NULL, CONNP_TO_WQ(connp), first_mp,
    928 	    ip_restart_optmgmt, NEW_OP, B_TRUE);
    929 	if ((ipsq) == NULL) {
    930 		VIF_REFRELE(vifp);
    931 		ipif_refrele(ipif);
    932 		return (EINPROGRESS);
    933 	}
    934 
    935 	if (ipst->ips_ip_mrtdebug > 1) {
    936 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
    937 		    "add_vif: src 0x%x enter",
    938 		    vifcp->vifc_lcl_addr.s_addr);
    939 	}
    940 
    941 	mutex_enter(&vifp->v_lock);
    942 	/*
    943 	 * Always clear cache when vifs change.
    944 	 * Needed to ensure that src isn't left over from before vif was added.
    945 	 * We run as a writer, but still reset it under last_encap_lock.
    946 	 */
    947 
    948 	mutex_enter(&ipst->ips_last_encap_lock);
    949 	ipst->ips_last_encap_src = 0;
    950 	ipst->ips_last_encap_vif = NULL;
    951 	mutex_exit(&ipst->ips_last_encap_lock);
    952 
    953 	if (vifcp->vifc_flags & VIFF_TUNNEL) {
    954 		if ((vifcp->vifc_flags & VIFF_SRCRT) != 0) {
    955 			cmn_err(CE_WARN,
    956 			    "add_vif: source route tunnels not supported\n");
    957 			VIF_REFRELE_LOCKED(vifp);
    958 			ipif_refrele(ipif);
    959 			ipsq_exit(ipsq, B_TRUE, B_TRUE);
    960 			return (EOPNOTSUPP);
    961 		}
    962 		vifp->v_rmt_addr  = vifcp->vifc_rmt_addr;
    963 
    964 	} else {
    965 		/* Phyint or Register vif */
    966 		if (vifcp->vifc_flags & VIFF_REGISTER) {
    967 			/*
    968 			 * Note: All IPPROTO_IP level options (including
    969 			 * MRT_ADD_VIF) are done exclusively via
    970 			 * ip_optmgmt_writer(); reg_vif_num is nonetheless
    971 			 * checked and set under ips_numvifs_mutex.
    972 			 */
    973 			mutex_enter(&ipst->ips_numvifs_mutex);
    974 			if (ipst->ips_reg_vif_num == ALL_VIFS) {
    975 				ipst->ips_reg_vif_num = vifcp->vifc_vifi;
    976 				mutex_exit(&ipst->ips_numvifs_mutex);
    977 			} else {
    978 				mutex_exit(&ipst->ips_numvifs_mutex);
    979 				VIF_REFRELE_LOCKED(vifp);
    980 				ipif_refrele(ipif);
    981 				ipsq_exit(ipsq, B_TRUE, B_TRUE);
    982 				return (EADDRINUSE);
    983 			}
    984 		}
    985 
    986 		/* Make sure the interface supports multicast */
    987 		if ((ipif->ipif_ill->ill_flags & ILLF_MULTICAST) == 0) {
    988 			VIF_REFRELE_LOCKED(vifp);
    989 			ipif_refrele(ipif);
    990 			if (vifcp->vifc_flags & VIFF_REGISTER) {
    991 				mutex_enter(&ipst->ips_numvifs_mutex);
    992 				ipst->ips_reg_vif_num = ALL_VIFS;
    993 				mutex_exit(&ipst->ips_numvifs_mutex);
    994 			}
    995 			ipsq_exit(ipsq, B_TRUE, B_TRUE);
    996 			return (EOPNOTSUPP);
    997 		}
    998 		/* Enable promiscuous reception of all IP mcasts from the if */
    999 		mutex_exit(&vifp->v_lock);
   1000 		error = ip_addmulti(INADDR_ANY, ipif, ILGSTAT_NONE,
   1001 		    MODE_IS_EXCLUDE, NULL);
   1002 		mutex_enter(&vifp->v_lock);
   1003 		/*
   1004 		 * Since we released the lock, let's make sure that
   1005 		 * ip_mrouter_done() has not been called.
   1006 		 */
   1007 		if (error != 0 || is_mrouter_off(ipst)) {
   1008 			if (error == 0)
   1009 				(void) ip_delmulti(INADDR_ANY, ipif, B_TRUE,
   1010 				    B_TRUE);
   1011 			if (vifcp->vifc_flags & VIFF_REGISTER) {
   1012 				mutex_enter(&ipst->ips_numvifs_mutex);
   1013 				ipst->ips_reg_vif_num = ALL_VIFS;
   1014 				mutex_exit(&ipst->ips_numvifs_mutex);
   1015 			}
   1016 			VIF_REFRELE_LOCKED(vifp);
   1017 			ipif_refrele(ipif);
   1018 			ipsq_exit(ipsq, B_TRUE, B_TRUE);
   1019 			return (error?error:EINVAL);
   1020 		}
   1021 	}
   1022 	/* Define parameters for the tbf structure */
   1023 	vifp->v_tbf = v_tbf;
   1024 	gethrestime(&vifp->v_tbf->tbf_last_pkt_t);
   1025 	vifp->v_tbf->tbf_n_tok = 0;
   1026 	vifp->v_tbf->tbf_q_len = 0;
   1027 	vifp->v_tbf->tbf_max_q_len = MAXQSIZE;
   1028 	vifp->v_tbf->tbf_q = vifp->v_tbf->tbf_t = NULL;
   1029 
   1030 	vifp->v_flags = vifcp->vifc_flags;
   1031 	vifp->v_threshold = vifcp->vifc_threshold;
   1032 	vifp->v_lcl_addr = vifcp->vifc_lcl_addr;
   1033 	vifp->v_ipif = ipif;
   1034 	ipif_refrele(ipif);
   1035 	/* Meant to scale up for division by 1024; (1024/1000) is 1 in C. */
   1036 	vifp->v_rate_limit = vifcp->vifc_rate_limit * (1024/1000);
   1037 	vifp->v_timeout_id = 0;
   1038 	/* initialize per vif pkt counters */
   1039 	vifp->v_pkt_in = 0;
   1040 	vifp->v_pkt_out = 0;
   1041 	vifp->v_bytes_in = 0;
   1042 	vifp->v_bytes_out = 0;
   1043 	mutex_init(&vifp->v_tbf->tbf_lock, NULL, MUTEX_DEFAULT, NULL);
   1044 
   1045 	/* Adjust numvifs up, if the vifi is higher than numvifs */
   1046 	mutex_enter(&ipst->ips_numvifs_mutex);
   1047 	if (ipst->ips_numvifs <= vifcp->vifc_vifi)
   1048 		ipst->ips_numvifs = vifcp->vifc_vifi + 1;
   1049 	mutex_exit(&ipst->ips_numvifs_mutex);
   1050 
   1051 	if (ipst->ips_ip_mrtdebug > 1) {
   1052 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   1053 		    "add_vif: #%d, lcladdr %x, %s %x, thresh %x, rate %d",
   1054 		    vifcp->vifc_vifi,
   1055 		    ntohl(vifcp->vifc_lcl_addr.s_addr),
   1056 		    (vifcp->vifc_flags & VIFF_TUNNEL) ? "rmtaddr" : "mask",
   1057 		    ntohl(vifcp->vifc_rmt_addr.s_addr),
   1058 		    vifcp->vifc_threshold, vifcp->vifc_rate_limit);
   1059 	}