Home | History | Annotate | Download | only in ip
      1 /*
      2  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
      3  * Use is subject to license terms.
      4  */
      5 /*
      6  * CDDL HEADER START
      7  *
      8  * The contents of this file are subject to the terms of the
      9  * Common Development and Distribution License (the "License").
     10  * You may not use this file except in compliance with the License.
     11  *
     12  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
     13  * or http://www.opensolaris.org/os/licensing.
     14  * See the License for the specific language governing permissions
     15  * and limitations under the License.
     16  *
     17  * When distributing Covered Code, include this CDDL HEADER in each
     18  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     19  * If applicable, add the following below this CDDL HEADER, with the
     20  * fields enclosed by brackets "[]" replaced with your own identifying
     21  * information: Portions Copyright [yyyy] [name of copyright owner]
     22  *
     23  * CDDL HEADER END
     24  */
     25 /*
     26  * Copyright 2008 Sun Microsystems, Inc.
     27  * All rights reserved.  Use is subject to license terms.
     28  */
     29 /* Copyright (c) 1990 Mentat Inc. */
     30 
     31 /*
     32  * Procedures for the kernel part of DVMRP,
     33  * a Distance-Vector Multicast Routing Protocol.
     34  * (See RFC-1075)
     35  * Written by David Waitzman, BBN Labs, August 1988.
     36  * Modified by Steve Deering, Stanford, February 1989.
     37  * Modified by Mark J. Steiglitz, Stanford, May, 1991
     38  * Modified by Van Jacobson, LBL, January 1993
     39  * Modified by Ajit Thyagarajan, PARC, August 1993
     40  * Modified by Bill Fenner, PARC, April 1995
     41  *
     42  * MROUTING 3.5
     43  */
     44 
     45 /*
     46  * TODO
     47  * - function pointer field in vif, void *vif_sendit()
     48  */
     49 
     50 #include <sys/types.h>
     51 #include <sys/stream.h>
     52 #include <sys/stropts.h>
     53 #include <sys/strlog.h>
     54 #include <sys/systm.h>
     55 #include <sys/ddi.h>
     56 #include <sys/cmn_err.h>
     57 #include <sys/zone.h>
     58 
     59 #include <sys/param.h>
     60 #include <sys/socket.h>
     61 #include <sys/vtrace.h>
     62 #include <sys/debug.h>
     63 #include <net/if.h>
     64 #include <sys/sockio.h>
     65 #include <netinet/in.h>
     66 #include <net/if_dl.h>
     67 
     68 #include <inet/common.h>
     69 #include <inet/mi.h>
     70 #include <inet/nd.h>
     71 #include <inet/mib2.h>
     72 #include <netinet/ip6.h>
     73 #include <inet/ip.h>
     74 #include <inet/snmpcom.h>
     75 
     76 #include <netinet/igmp.h>
     77 #include <netinet/igmp_var.h>
     78 #include <netinet/udp.h>
     79 #include <netinet/ip_mroute.h>
     80 #include <inet/ip_multi.h>
     81 #include <inet/ip_ire.h>
     82 #include <inet/ip_if.h>
     83 #include <inet/ipclassifier.h>
     84 
     85 #include <netinet/pim.h>
     86 
     87 
     88 /*
     89  * MT Design:
     90  *
     91  * There are three main data structures viftable, mfctable and tbftable that
     92  * need to be protected against MT races.
     93  *
 * viftable is a fixed length array of vif structs. There is no lock to protect
 * the whole array, instead each struct is protected by its own individual lock.
 * The value of v_marks in conjunction with the value of v_refcnt determines the
 * current state of a vif structure. One special state that needs mention
 * is when the vif is marked VIF_MARK_NOTINUSE but refcnt != 0. This indicates
 * that vif is being initialized.
 * Each structure is freed when the refcnt goes down to zero. If a delete comes
 * in when the refcnt is > 1, the vif structure is marked VIF_MARK_CONDEMNED
 * which prevents the struct from further use.  When the refcnt goes to zero
 * the struct is freed and is marked VIF_MARK_NOTINUSE.
    104  * vif struct stores a pointer to the ipif in v_ipif, to prevent ipif/ill
    105  * from  going away a refhold is put on the ipif before using it. see
    106  * lock_good_vif() and unlock_good_vif().
    107  *
    108  * VIF_REFHOLD and VIF_REFRELE macros have been provided to manipulate refcnts
    109  * of the vif struct.
    110  *
    111  * tbftable is also a fixed length array of tbf structs and is only accessed
    112  * via v_tbf.  It is protected by its own lock tbf_lock.
    113  *
    114  * Lock Ordering is
    115  * v_lock --> tbf_lock
 * v_lock --> ill_lock
    117  *
 * mfctable is a fixed size hash table of mfc bucket structs (struct mfcb).
    119  * Each mfc bucket struct (struct mfcb) maintains a refcnt for each walker,
    120  * it also maintains a state. These fields are protected by a lock (mfcb_lock).
    121  * mfc structs only maintain a state and have no refcnt. mfc_mutex is used to
    122  * protect the struct elements.
    123  *
    124  * mfc structs are dynamically allocated and are singly linked
    125  * at the head of the chain. When an mfc structure is to be deleted
    126  * it is marked condemned and so is the state in the bucket struct.
    127  * When the last walker of the hash bucket exits all the mfc structs
 * marked condemned are freed.
    129  *
    130  * Locking Hierarchy:
    131  * The bucket lock should be acquired before the mfc struct lock.
    132  * MFCB_REFHOLD and MFCB_REFRELE macros are provided for locking
    133  * operations on the bucket struct.
    134  *
    135  * last_encap_lock and numvifs_mutex should be acquired after
 * acquiring vif or mfc locks. These locks protect some global variables.
    137  *
    138  * The statistics are not currently protected by a lock
 * causing the stats to be approximate, not exact.
    140  */
    141 
    142 #define	NO_VIF	MAXVIFS 	/* from mrouted, no route for src */
    143 
    144 /*
    145  * Timeouts:
    146  * 	Upcall timeouts - BSD uses boolean_t mfc->expire and
    147  *	nexpire[MFCTBLSIZE], the number of times expire has been called.
    148  *	SunOS 5.x uses mfc->timeout for each mfc.
    149  *	Some Unixes are limited in the number of simultaneous timeouts
    150  * 	that can be run, SunOS 5.x does not have this restriction.
    151  */
    152 
    153 /*
    154  * In BSD, EXPIRE_TIMEOUT is how often expire_upcalls() is called and
    155  * UPCALL_EXPIRE is the nmber of timeouts before a particular upcall
    156  * expires. Thus the time till expiration is EXPIRE_TIMEOUT * UPCALL_EXPIRE
    157  */
    158 #define		EXPIRE_TIMEOUT	(hz/4)	/* 4x / second	*/
    159 #define		UPCALL_EXPIRE	6	/* number of timeouts	*/
    160 
    161 /*
    162  * Hash function for a source, group entry
    163  */
    164 #define	MFCHASH(a, g) MFCHASHMOD(((a) >> 20) ^ ((a) >> 10) ^ (a) ^ \
    165 	((g) >> 20) ^ ((g) >> 10) ^ (g))
    166 
    167 #define			TBF_REPROCESS	(hz / 100)	/* 100x /second	*/
    168 
    169 /* Identify PIM packet that came on a Register interface */
    170 #define	PIM_REGISTER_MARKER	0xffffffff
    171 
    172 /* Function declarations */
    173 static int	add_mfc(struct mfcctl *, ip_stack_t *);
    174 static int	add_vif(struct vifctl *, conn_t *, mblk_t *, ip_stack_t *);
    175 static int	del_mfc(struct mfcctl *, ip_stack_t *);
    176 static int	del_vif(vifi_t *, conn_t *, mblk_t *, ip_stack_t *);
    177 static void	del_vifp(struct vif *);
    178 static void	encap_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
    179 static void	expire_upcalls(void *);
    180 static void	fill_route(struct mfc *, struct mfcctl *, ip_stack_t *);
    181 static void	free_queue(struct mfc *);
    182 static int	get_assert(uchar_t *, ip_stack_t *);
    183 static int	get_lsg_cnt(struct sioc_lsg_req *, ip_stack_t *);
    184 static int	get_sg_cnt(struct sioc_sg_req *, ip_stack_t *);
    185 static int	get_version(uchar_t *);
    186 static int	get_vif_cnt(struct sioc_vif_req *, ip_stack_t *);
    187 static int	ip_mdq(mblk_t *, ipha_t *, ill_t *,
    188 		    ipaddr_t, struct mfc *);
    189 static int	ip_mrouter_init(conn_t *, uchar_t *, int, ip_stack_t *);
    190 static void	phyint_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
    191 static int	register_mforward(queue_t *, mblk_t *, ill_t *);
    192 static void	register_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
    193 static int	set_assert(int *, ip_stack_t *);
    194 
    195 /*
    196  * Token Bucket Filter functions
    197  */
    198 static int  priority(struct vif *, ipha_t *);
    199 static void tbf_control(struct vif *, mblk_t *, ipha_t *);
    200 static int  tbf_dq_sel(struct vif *, ipha_t *);
    201 static void tbf_process_q(struct vif *);
    202 static void tbf_queue(struct vif *, mblk_t *);
    203 static void tbf_reprocess_q(void *);
    204 static void tbf_send_packet(struct vif *, mblk_t *);
    205 static void tbf_update_tokens(struct vif *);
    206 static void release_mfc(struct mfcb *);
    207 
    208 static boolean_t is_mrouter_off(ip_stack_t *);
    209 /*
    210  * Encapsulation packets
    211  */
    212 
    213 #define	ENCAP_TTL	64
    214 
    215 /* prototype IP hdr for encapsulated packets */
    216 static ipha_t multicast_encap_iphdr = {
    217 	IP_SIMPLE_HDR_VERSION,
    218 	0,				/* tos */
    219 	sizeof (ipha_t),		/* total length */
    220 	0,				/* id */
    221 	0,				/* frag offset */
    222 	ENCAP_TTL, IPPROTO_ENCAP,
    223 	0,				/* checksum */
    224 };
    225 
    226 /*
    227  * Rate limit for assert notification messages, in nsec.
    228  */
    229 #define	ASSERT_MSG_TIME		3000000000
    230 
    231 
    232 #define	VIF_REFHOLD(vifp) {			\
    233 	mutex_enter(&(vifp)->v_lock);		\
    234 	(vifp)->v_refcnt++;			\
    235 	mutex_exit(&(vifp)->v_lock);		\
    236 }
    237 
    238 #define	VIF_REFRELE_LOCKED(vifp) {				\
    239 	(vifp)->v_refcnt--;					\
    240 	if ((vifp)->v_refcnt == 0 &&				\
    241 		((vifp)->v_marks & VIF_MARK_CONDEMNED)) {	\
    242 			del_vifp(vifp);				\
    243 	} else {						\
    244 		mutex_exit(&(vifp)->v_lock);			\
    245 	}							\
    246 }
    247 
    248 #define	VIF_REFRELE(vifp) {					\
    249 	mutex_enter(&(vifp)->v_lock);				\
    250 	(vifp)->v_refcnt--;					\
    251 	if ((vifp)->v_refcnt == 0 &&				\
    252 		((vifp)->v_marks & VIF_MARK_CONDEMNED)) {	\
    253 			del_vifp(vifp);				\
    254 	} else {						\
    255 		mutex_exit(&(vifp)->v_lock);			\
    256 	}							\
    257 }
    258 
    259 #define	MFCB_REFHOLD(mfcb) {				\
    260 	mutex_enter(&(mfcb)->mfcb_lock);		\
    261 	(mfcb)->mfcb_refcnt++;				\
    262 	ASSERT((mfcb)->mfcb_refcnt != 0);		\
    263 	mutex_exit(&(mfcb)->mfcb_lock);			\
    264 }
    265 
    266 #define	MFCB_REFRELE(mfcb) {					\
    267 	mutex_enter(&(mfcb)->mfcb_lock);			\
    268 	ASSERT((mfcb)->mfcb_refcnt != 0);			\
    269 	if (--(mfcb)->mfcb_refcnt == 0 &&			\
    270 		((mfcb)->mfcb_marks & MFCB_MARK_CONDEMNED)) {	\
    271 			release_mfc(mfcb);			\
    272 	}							\
    273 	mutex_exit(&(mfcb)->mfcb_lock);				\
    274 }
    275 
    276 /*
    277  * MFCFIND:
    278  * Find a route for a given origin IP address and multicast group address.
    279  * Skip entries with pending upcalls.
    280  * Type of service parameter to be added in the future!
    281  */
    282 #define	MFCFIND(mfcbp, o, g, rt) { \
    283 	struct mfc *_mb_rt = NULL; \
    284 	rt = NULL; \
    285 	_mb_rt = mfcbp->mfcb_mfc; \
    286 	while (_mb_rt) { \
    287 		if ((_mb_rt->mfc_origin.s_addr == o) && \
    288 		    (_mb_rt->mfc_mcastgrp.s_addr == g) && \
    289 		    (_mb_rt->mfc_rte == NULL) && \
    290 		    (!(_mb_rt->mfc_marks & MFCB_MARK_CONDEMNED))) {        \
    291 		    rt = _mb_rt; \
    292 		    break; \
    293 		} \
    294 	_mb_rt = _mb_rt->mfc_next; \
    295 	} \
    296 }
    297 
    298 /*
    299  * BSD uses timeval with sec and usec. In SunOS 5.x uniqtime() and gethrtime()
    300  * are inefficient. We use gethrestime() which returns a timespec_t with
    301  * sec and nsec, the resolution is machine dependent.
    302  * The following 2 macros have been changed to use nsec instead of usec.
    303  */
    304 /*
    305  * Macros to compute elapsed time efficiently.
    306  * Borrowed from Van Jacobson's scheduling code.
    307  * Delta should be a hrtime_t.
    308  */
    309 #define	TV_DELTA(a, b, delta) { \
    310 	int xxs; \
    311  \
    312 	delta = (a).tv_nsec - (b).tv_nsec; \
    313 	if ((xxs = (a).tv_sec - (b).tv_sec) != 0) { \
    314 		switch (xxs) { \
    315 		case 2: \
    316 		    delta += 1000000000; \
    317 		    /*FALLTHROUGH*/ \
    318 		case 1: \
    319 		    delta += 1000000000; \
    320 		    break; \
    321 		default: \
    322 		    delta += (1000000000 * xxs); \
    323 		} \
    324 	} \
    325 }
    326 
    327 #define	TV_LT(a, b) (((a).tv_nsec < (b).tv_nsec && \
    328 	(a).tv_sec <= (b).tv_sec) || (a).tv_sec < (b).tv_sec)
    329 
    330 /*
    331  * Handle MRT setsockopt commands to modify the multicast routing tables.
    332  */
    333 int
    334 ip_mrouter_set(int cmd, queue_t *q, int checkonly, uchar_t *data,
    335     int datalen, mblk_t *first_mp)
    336 {
    337 	conn_t		*connp = Q_TO_CONN(q);
    338 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
    339 
    340 	mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
    341 	if (cmd != MRT_INIT && connp != ipst->ips_ip_g_mrouter) {
    342 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
    343 		return (EACCES);
    344 	}
    345 	mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
    346 
    347 	if (checkonly) {
    348 		/*
    349 		 * do not do operation, just pretend to - new T_CHECK
    350 		 * Note: Even routines further on can probably fail but
    351 		 * this T_CHECK stuff is only to please XTI so it not
    352 		 * necessary to be perfect.
    353 		 */
    354 		switch (cmd) {
    355 		case MRT_INIT:
    356 		case MRT_DONE:
    357 		case MRT_ADD_VIF:
    358 		case MRT_DEL_VIF:
    359 		case MRT_ADD_MFC:
    360 		case MRT_DEL_MFC:
    361 		case MRT_ASSERT:
    362 			return (0);
    363 		default:
    364 			return (EOPNOTSUPP);
    365 		}
    366 	}
    367 
    368 	/*
    369 	 * make sure no command is issued after multicast routing has been
    370 	 * turned off.
    371 	 */
    372 	if (cmd != MRT_INIT && cmd != MRT_DONE) {
    373 		if (is_mrouter_off(ipst))
    374 			return (EINVAL);
    375 	}
    376 
    377 	switch (cmd) {
    378 	case MRT_INIT:	return (ip_mrouter_init(connp, data, datalen, ipst));
    379 	case MRT_DONE:	return (ip_mrouter_done(first_mp, ipst));
    380 	case MRT_ADD_VIF:  return (add_vif((struct vifctl *)data, connp,
    381 			    first_mp, ipst));
    382 	case MRT_DEL_VIF:  return (del_vif((vifi_t *)data, connp, first_mp,
    383 			    ipst));
    384 	case MRT_ADD_MFC:  return (add_mfc((struct mfcctl *)data, ipst));
    385 	case MRT_DEL_MFC:  return (del_mfc((struct mfcctl *)data, ipst));
    386 	case MRT_ASSERT:   return (set_assert((int *)data, ipst));
    387 	default:	   return (EOPNOTSUPP);
    388 	}
    389 }
    390 
    391 /*
    392  * Handle MRT getsockopt commands
    393  */
    394 int
    395 ip_mrouter_get(int cmd, queue_t *q, uchar_t *data)
    396 {
    397 	conn_t		*connp = Q_TO_CONN(q);
    398 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
    399 
    400 	if (connp != ipst->ips_ip_g_mrouter)
    401 		return (EACCES);
    402 
    403 	switch (cmd) {
    404 	case MRT_VERSION:	return (get_version((uchar_t *)data));
    405 	case MRT_ASSERT:	return (get_assert((uchar_t *)data, ipst));
    406 	default:		return (EOPNOTSUPP);
    407 	}
    408 }
    409 
    410 /*
    411  * Handle ioctl commands to obtain information from the cache.
    412  * Called with shared access to IP. These are read_only ioctls.
    413  */
    414 /* ARGSUSED */
    415 int
    416 mrt_ioctl(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    417     ip_ioctl_cmd_t *ipip, void *if_req)
    418 {
    419 	mblk_t	*mp1;
    420 	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
    421 	conn_t		*connp = Q_TO_CONN(q);
    422 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
    423 
    424 	/* Existence verified in ip_wput_nondata */
    425 	mp1 = mp->b_cont->b_cont;
    426 
    427 	switch (iocp->ioc_cmd) {
    428 	case (SIOCGETVIFCNT):
    429 		return (get_vif_cnt((struct sioc_vif_req *)mp1->b_rptr, ipst));
    430 	case (SIOCGETSGCNT):
    431 		return (get_sg_cnt((struct sioc_sg_req *)mp1->b_rptr, ipst));
    432 	case (SIOCGETLSGCNT):
    433 		return (get_lsg_cnt((struct sioc_lsg_req *)mp1->b_rptr, ipst));
    434 	default:
    435 		return (EINVAL);
    436 	}
    437 }
    438 
    439 /*
    440  * Returns the packet, byte, rpf-failure count for the source, group provided.
    441  */
    442 static int
    443 get_sg_cnt(struct sioc_sg_req *req, ip_stack_t *ipst)
    444 {
    445 	struct mfc *rt;
    446 	struct mfcb *mfcbp;
    447 
    448 	mfcbp = &ipst->ips_mfcs[MFCHASH(req->src.s_addr, req->grp.s_addr)];
    449 	MFCB_REFHOLD(mfcbp);
    450 	MFCFIND(mfcbp, req->src.s_addr, req->grp.s_addr, rt);
    451 
    452 	if (rt != NULL) {
    453 		mutex_enter(&rt->mfc_mutex);
    454 		req->pktcnt   = rt->mfc_pkt_cnt;
    455 		req->bytecnt  = rt->mfc_byte_cnt;
    456 		req->wrong_if = rt->mfc_wrong_if;
    457 		mutex_exit(&rt->mfc_mutex);
    458 	} else
    459 		req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffffU;
    460 
    461 	MFCB_REFRELE(mfcbp);
    462 	return (0);
    463 }
    464 
    465 /*
    466  * Returns the packet, byte, rpf-failure count for the source, group provided.
    467  * Uses larger counters and IPv6 addresses.
    468  */
    469 /* ARGSUSED XXX until implemented */
    470 static int
    471 get_lsg_cnt(struct sioc_lsg_req *req, ip_stack_t *ipst)
    472 {
    473 	/* XXX TODO SIOCGETLSGCNT */
    474 	return (ENXIO);
    475 }
    476 
    477 /*
    478  * Returns the input and output packet and byte counts on the vif provided.
    479  */
    480 static int
    481 get_vif_cnt(struct sioc_vif_req *req, ip_stack_t *ipst)
    482 {
    483 	vifi_t vifi = req->vifi;
    484 
    485 	if (vifi >= ipst->ips_numvifs)
    486 		return (EINVAL);
    487 
    488 	/*
    489 	 * No locks here, an approximation is fine.
    490 	 */
    491 	req->icount = ipst->ips_vifs[vifi].v_pkt_in;
    492 	req->ocount = ipst->ips_vifs[vifi].v_pkt_out;
    493 	req->ibytes = ipst->ips_vifs[vifi].v_bytes_in;
    494 	req->obytes = ipst->ips_vifs[vifi].v_bytes_out;
    495 
    496 	return (0);
    497 }
    498 
    499 static int
    500 get_version(uchar_t *data)
    501 {
    502 	int *v = (int *)data;
    503 
    504 	*v = 0x0305;	/* XXX !!!! */
    505 
    506 	return (0);
    507 }
    508 
    509 /*
    510  * Set PIM assert processing global.
    511  */
    512 static int
    513 set_assert(int *i, ip_stack_t *ipst)
    514 {
    515 	if ((*i != 1) && (*i != 0))
    516 		return (EINVAL);
    517 
    518 	ipst->ips_pim_assert = *i;
    519 
    520 	return (0);
    521 }
    522 
    523 /*
    524  * Get PIM assert processing global.
    525  */
    526 static int
    527 get_assert(uchar_t *data, ip_stack_t *ipst)
    528 {
    529 	int *i = (int *)data;
    530 
    531 	*i = ipst->ips_pim_assert;
    532 
    533 	return (0);
    534 }
    535 
    536 /*
    537  * Enable multicast routing.
    538  */
    539 static int
    540 ip_mrouter_init(conn_t *connp, uchar_t *data, int datalen, ip_stack_t *ipst)
    541 {
    542 	int	*v;
    543 
    544 	if (data == NULL || (datalen != sizeof (int)))
    545 		return (ENOPROTOOPT);
    546 
    547 	v = (int *)data;
    548 	if (*v != 1)
    549 		return (ENOPROTOOPT);
    550 
    551 	mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
    552 	if (ipst->ips_ip_g_mrouter != NULL) {
    553 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
    554 		return (EADDRINUSE);
    555 	}
    556 
    557 	/*
    558 	 * MRT_INIT should only be allowed for RAW sockets, but we double
    559 	 * check.
    560 	 */
    561 	if (!IPCL_IS_RAWIP(connp)) {
    562 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
    563 		return (EINVAL);
    564 	}
    565 
    566 	ipst->ips_ip_g_mrouter = connp;
    567 	connp->conn_multi_router = 1;
    568 	/* In order for tunnels to work we have to turn ip_g_forward on */
    569 	if (!WE_ARE_FORWARDING(ipst)) {
    570 		if (ipst->ips_ip_mrtdebug > 1) {
    571 			(void) mi_strlog(connp->conn_rq, 1, SL_TRACE,
    572 			    "ip_mrouter_init: turning on forwarding");
    573 		}
    574 		ipst->ips_saved_ip_g_forward = ipst->ips_ip_g_forward;
    575 		ipst->ips_ip_g_forward = IP_FORWARD_ALWAYS;
    576 	}
    577 
    578 	mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
    579 	return (0);
    580 }
    581 
    582 void
    583 ip_mrouter_stack_init(ip_stack_t *ipst)
    584 {
    585 	mutex_init(&ipst->ips_ip_g_mrouter_mutex, NULL, MUTEX_DEFAULT, NULL);
    586 
    587 	ipst->ips_vifs = kmem_zalloc(sizeof (struct vif) * (MAXVIFS+1),
    588 	    KM_SLEEP);
    589 	ipst->ips_mrtstat = kmem_zalloc(sizeof (struct mrtstat), KM_SLEEP);
    590 	/*
    591 	 * mfctable:
    592 	 * Includes all mfcs, including waiting upcalls.
    593 	 * Multiple mfcs per bucket.
    594 	 */
    595 	ipst->ips_mfcs = kmem_zalloc(sizeof (struct mfcb) * MFCTBLSIZ,
    596 	    KM_SLEEP);
    597 	/*
    598 	 * Define the token bucket filter structures.
    599 	 * tbftable -> each vif has one of these for storing info.
    600 	 */
    601 	ipst->ips_tbfs = kmem_zalloc(sizeof (struct tbf) * MAXVIFS, KM_SLEEP);
    602 
    603 	mutex_init(&ipst->ips_last_encap_lock, NULL, MUTEX_DEFAULT, NULL);
    604 
    605 	ipst->ips_mrtstat->mrts_vifctlSize = sizeof (struct vifctl);
    606 	ipst->ips_mrtstat->mrts_mfcctlSize = sizeof (struct mfcctl);
    607 }
    608 
    609 /*
    610  * Disable multicast routing.
    611  * Didn't use global timeout_val (BSD version), instead check the mfctable.
    612  */
    613 int
    614 ip_mrouter_done(mblk_t *mp, ip_stack_t *ipst)
    615 {
    616 	conn_t		*mrouter;
    617 	vifi_t 		vifi;
    618 	struct mfc	*mfc_rt;
    619 	int		i;
    620 
    621 	mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
    622 	if (ipst->ips_ip_g_mrouter == NULL) {
    623 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
    624 		return (EINVAL);
    625 	}
    626 
    627 	mrouter = ipst->ips_ip_g_mrouter;
    628 
    629 	if (ipst->ips_saved_ip_g_forward != -1) {
    630 		if (ipst->ips_ip_mrtdebug > 1) {
    631 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
    632 			    "ip_mrouter_done: turning off forwarding");
    633 		}
    634 		ipst->ips_ip_g_forward = ipst->ips_saved_ip_g_forward;
    635 		ipst->ips_saved_ip_g_forward = -1;
    636 	}
    637 
    638 	/*
    639 	 * Always clear cache when vifs change.
    640 	 * No need to get ipst->ips_last_encap_lock since we are running as
    641 	 * a writer.
    642 	 */
    643 	mutex_enter(&ipst->ips_last_encap_lock);
    644 	ipst->ips_last_encap_src = 0;
    645 	ipst->ips_last_encap_vif = NULL;
    646 	mutex_exit(&ipst->ips_last_encap_lock);
    647 	mrouter->conn_multi_router = 0;
    648 
    649 	mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
    650 
    651 	/*
    652 	 * For each phyint in use,
    653 	 * disable promiscuous reception of all IP multicasts.
    654 	 */
    655 	for (vifi = 0; vifi < MAXVIFS; vifi++) {
    656 		struct vif *vifp = ipst->ips_vifs + vifi;
    657 
    658 		mutex_enter(&vifp->v_lock);
    659 		/*
    660 		 * if the vif is active mark it condemned.
    661 		 */
    662 		if (vifp->v_marks & VIF_MARK_GOOD) {
    663 			ASSERT(vifp->v_ipif != NULL);
    664 			ipif_refhold(vifp->v_ipif);
    665 			/* Phyint only */
    666 			if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) {
    667 				ipif_t *ipif = vifp->v_ipif;
    668 				ipsq_t  *ipsq;
    669 				boolean_t suc;
    670 				ill_t *ill;
    671 
    672 				ill = ipif->ipif_ill;
    673 				suc = B_FALSE;
    674 				if (mp == NULL) {
    675 					/*
    676 					 * being called from ip_close,
    677 					 * lets do it synchronously.
    678 					 * Clear VIF_MARK_GOOD and
    679 					 * set VIF_MARK_CONDEMNED.
    680 					 */
    681 					vifp->v_marks &= ~VIF_MARK_GOOD;
    682 					vifp->v_marks |= VIF_MARK_CONDEMNED;
    683 					mutex_exit(&(vifp)->v_lock);
    684 					suc = ipsq_enter(ill, B_FALSE, NEW_OP);
    685 					ipsq = ill->ill_phyint->phyint_ipsq;
    686 				} else {
    687 					ipsq = ipsq_try_enter(ipif, NULL,
    688 					    mrouter->conn_wq, mp,
    689 					    ip_restart_optmgmt, NEW_OP, B_TRUE);
    690 					if (ipsq == NULL) {
    691 						mutex_exit(&(vifp)->v_lock);
    692 						ipif_refrele(ipif);
    693 						return (EINPROGRESS);
    694 					}
    695 					/*
    696 					 * Clear VIF_MARK_GOOD and
    697 					 * set VIF_MARK_CONDEMNED.
    698 					 */
    699 					vifp->v_marks &= ~VIF_MARK_GOOD;
    700 					vifp->v_marks |= VIF_MARK_CONDEMNED;
    701 					mutex_exit(&(vifp)->v_lock);
    702 					suc = B_TRUE;
    703 				}
    704 
    705 				if (suc) {
    706 					(void) ip_delmulti(INADDR_ANY, ipif,
    707 					    B_TRUE, B_TRUE);
    708 					ipsq_exit(ipsq);
    709 				}
    710 				mutex_enter(&vifp->v_lock);
    711 			}
    712 			ipif_refrele(vifp->v_ipif);
    713 			/*
    714 			 * decreases the refcnt added in add_vif.
    715 			 * and release v_lock.
    716 			 */
    717 			VIF_REFRELE_LOCKED(vifp);
    718 		} else {
    719 			mutex_exit(&vifp->v_lock);
    720 			continue;
    721 		}
    722 	}
    723 
    724 	mutex_enter(&ipst->ips_numvifs_mutex);
    725 	ipst->ips_numvifs = 0;
    726 	ipst->ips_pim_assert = 0;
    727 	ipst->ips_reg_vif_num = ALL_VIFS;
    728 	mutex_exit(&ipst->ips_numvifs_mutex);
    729 
    730 	/*
    731 	 * Free upcall msgs.
    732 	 * Go through mfctable and stop any outstanding upcall
    733 	 * timeouts remaining on mfcs.
    734 	 */
    735 	for (i = 0; i < MFCTBLSIZ; i++) {
    736 		mutex_enter(&ipst->ips_mfcs[i].mfcb_lock);
    737 		ipst->ips_mfcs[i].mfcb_refcnt++;
    738 		ipst->ips_mfcs[i].mfcb_marks |= MFCB_MARK_CONDEMNED;
    739 		mutex_exit(&ipst->ips_mfcs[i].mfcb_lock);
    740 		mfc_rt = ipst->ips_mfcs[i].mfcb_mfc;
    741 		while (mfc_rt) {
    742 			/* Free upcalls */
    743 			mutex_enter(&mfc_rt->mfc_mutex);
    744 			if (mfc_rt->mfc_rte != NULL) {
    745 				if (mfc_rt->mfc_timeout_id != 0) {
    746 					/*
    747 					 * OK to drop the lock as we have
    748 					 * a refcnt on the bucket. timeout
    749 					 * can fire but it will see that
    750 					 * mfc_timeout_id == 0 and not do
    751 					 * anything. see expire_upcalls().
    752 					 */
    753 					mfc_rt->mfc_timeout_id = 0;
    754 					mutex_exit(&mfc_rt->mfc_mutex);
    755 					(void) untimeout(
    756 					    mfc_rt->mfc_timeout_id);
    757 						mfc_rt->mfc_timeout_id = 0;
    758 					mutex_enter(&mfc_rt->mfc_mutex);
    759 
    760 					/*
    761 					 * all queued upcall packets
    762 					 * and mblk will be freed in
    763 					 * release_mfc().
    764 					 */
    765 				}
    766 			}
    767 
    768 			mfc_rt->mfc_marks |= MFCB_MARK_CONDEMNED;
    769 
    770 			mutex_exit(&mfc_rt->mfc_mutex);
    771 			mfc_rt = mfc_rt->mfc_next;
    772 		}
    773 		MFCB_REFRELE(&ipst->ips_mfcs[i]);
    774 	}
    775 
    776 	mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
    777 	ipst->ips_ip_g_mrouter = NULL;
    778 	mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
    779 	return (0);
    780 }
    781 
    782 void
    783 ip_mrouter_stack_destroy(ip_stack_t *ipst)
    784 {
    785 	struct mfcb *mfcbp;
    786 	struct mfc  *rt;
    787 	int i;
    788 
    789 	for (i = 0; i < MFCTBLSIZ; i++) {
    790 		mfcbp = &ipst->ips_mfcs[i];
    791 
    792 		while ((rt = mfcbp->mfcb_mfc) != NULL) {
    793 			(void) printf("ip_mrouter_stack_destroy: free for %d\n",
    794 			    i);
    795 
    796 			mfcbp->mfcb_mfc = rt->mfc_next;
    797 			free_queue(rt);
    798 			mi_free(rt);
    799 		}
    800 	}
    801 	kmem_free(ipst->ips_vifs, sizeof (struct vif) * (MAXVIFS+1));
    802 	ipst->ips_vifs = NULL;
    803 	kmem_free(ipst->ips_mrtstat, sizeof (struct mrtstat));
    804 	ipst->ips_mrtstat = NULL;
    805 	kmem_free(ipst->ips_mfcs, sizeof (struct mfcb) * MFCTBLSIZ);
    806 	ipst->ips_mfcs = NULL;
    807 	kmem_free(ipst->ips_tbfs, sizeof (struct tbf) * MAXVIFS);
    808 	ipst->ips_tbfs = NULL;
    809 
    810 	mutex_destroy(&ipst->ips_last_encap_lock);
    811 	mutex_destroy(&ipst->ips_ip_g_mrouter_mutex);
    812 }
    813 
    814 static boolean_t
    815 is_mrouter_off(ip_stack_t *ipst)
    816 {
    817 	conn_t	*mrouter;
    818 
    819 	mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
    820 	if (ipst->ips_ip_g_mrouter == NULL) {
    821 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
    822 		return (B_TRUE);
    823 	}
    824 
    825 	mrouter = ipst->ips_ip_g_mrouter;
    826 	if (mrouter->conn_multi_router == 0) {
    827 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
    828 		return (B_TRUE);
    829 	}
    830 	mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
    831 	return (B_FALSE);
    832 }
    833 
    834 static void
    835 unlock_good_vif(struct vif *vifp)
    836 {
    837 	ASSERT(vifp->v_ipif != NULL);
    838 	ipif_refrele(vifp->v_ipif);
    839 	VIF_REFRELE(vifp);
    840 }
    841 
    842 static boolean_t
    843 lock_good_vif(struct vif *vifp)
    844 {
    845 	mutex_enter(&vifp->v_lock);
    846 	if (!(vifp->v_marks & VIF_MARK_GOOD)) {
    847 		mutex_exit(&vifp->v_lock);
    848 		return (B_FALSE);
    849 	}
    850 
    851 	ASSERT(vifp->v_ipif != NULL);
    852 	mutex_enter(&vifp->v_ipif->ipif_ill->ill_lock);
    853 	if (!IPIF_CAN_LOOKUP(vifp->v_ipif)) {
    854 		mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock);
    855 		mutex_exit(&vifp->v_lock);
    856 		return (B_FALSE);
    857 	}
    858 	ipif_refhold_locked(vifp->v_ipif);
    859 	mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock);
    860 	vifp->v_refcnt++;
    861 	mutex_exit(&vifp->v_lock);
    862 	return (B_TRUE);
    863 }
    864 
    865 /*
    866  * Add a vif to the vif table.
    867  */
    868 static int
    869 add_vif(struct vifctl *vifcp, conn_t *connp, mblk_t *first_mp, ip_stack_t *ipst)
    870 {
    871 	struct vif	*vifp = ipst->ips_vifs + vifcp->vifc_vifi;
    872 	ipif_t		*ipif;
    873 	int		error;
    874 	struct tbf	*v_tbf = ipst->ips_tbfs + vifcp->vifc_vifi;
    875 	ipsq_t  	*ipsq;
    876 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
    877 
    878 	ASSERT(connp != NULL);
    879 
    880 	if (vifcp->vifc_vifi >= MAXVIFS)
    881 		return (EINVAL);
    882 
    883 	if (is_mrouter_off(ipst))
    884 		return (EINVAL);
    885 
    886 	mutex_enter(&vifp->v_lock);
    887 	/*
    888 	 * Viftable entry should be 0.
    889 	 * if v_marks == 0 but v_refcnt != 0 means struct is being
    890 	 * initialized.
    891 	 *
    892 	 * Also note that it is very unlikely that we will get a MRT_ADD_VIF
    893 	 * request while the delete is in progress, mrouted only sends add
    894 	 * requests when a new interface is added and the new interface cannot
    895 	 * have the same vifi as an existing interface. We make sure that
    896 	 * ill_delete will block till the vif is deleted by adding a refcnt
    897 	 * to ipif in del_vif().
    898 	 */
    899 	if (vifp->v_lcl_addr.s_addr != 0 ||
    900 	    vifp->v_marks != 0 ||
    901 	    vifp->v_refcnt != 0) {
    902 		mutex_exit(&vifp->v_lock);
    903 		return (EADDRINUSE);
    904 	}
    905 
    906 	/* Incoming vif should not be 0 */
    907 	if (vifcp->vifc_lcl_addr.s_addr == 0) {
    908 		mutex_exit(&vifp->v_lock);
    909 		return (EINVAL);
    910 	}
    911 
    912 	vifp->v_refcnt++;
    913 	mutex_exit(&vifp->v_lock);
    914 	/* Find the interface with the local address */
    915 	ipif = ipif_lookup_addr((ipaddr_t)vifcp->vifc_lcl_addr.s_addr, NULL,
    916 	    connp->conn_zoneid, CONNP_TO_WQ(connp), first_mp,
    917 	    ip_restart_optmgmt, &error, ipst);
    918 	if (ipif == NULL) {
    919 		VIF_REFRELE(vifp);
    920 		if (error == EINPROGRESS)
    921 			return (error);
    922 		return (EADDRNOTAVAIL);
    923 	}
    924 
    925 	/*
    926 	 * We have to be exclusive as we have to call ip_addmulti()
    927 	 * This is the best position to try to be exclusive in case
    928 	 * we have to wait.
    929 	 */
    930 	ipsq = ipsq_try_enter(ipif, NULL, CONNP_TO_WQ(connp), first_mp,
    931 	    ip_restart_optmgmt, NEW_OP, B_TRUE);
    932 	if ((ipsq) == NULL) {
    933 		VIF_REFRELE(vifp);
    934 		ipif_refrele(ipif);
    935 		return (EINPROGRESS);
    936 	}
    937 
    938 	if (ipst->ips_ip_mrtdebug > 1) {
    939 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
    940 		    "add_vif: src 0x%x enter",
    941 		    vifcp->vifc_lcl_addr.s_addr);
    942 	}
    943 
    944 	mutex_enter(&vifp->v_lock);
    945 	/*
    946 	 * Always clear cache when vifs change.
    947 	 * Needed to ensure that src isn't left over from before vif was added.
    948 	 * No need to get last_encap_lock, since we are running as a writer.
    949 	 */
    950 
    951 	mutex_enter(&ipst->ips_last_encap_lock);
    952 	ipst->ips_last_encap_src = 0;
    953 	ipst->ips_last_encap_vif = NULL;
    954 	mutex_exit(&ipst->ips_last_encap_lock);
    955 
    956 	if (vifcp->vifc_flags & VIFF_TUNNEL) {
    957 		if ((vifcp->vifc_flags & VIFF_SRCRT) != 0) {
    958 			cmn_err(CE_WARN,
    959 			    "add_vif: source route tunnels not supported\n");
    960 			VIF_REFRELE_LOCKED(vifp);
    961 			ipif_refrele(ipif);
    962 			ipsq_exit(ipsq);
    963 			return (EOPNOTSUPP);
    964 		}
    965 		vifp->v_rmt_addr  = vifcp->vifc_rmt_addr;
    966 
    967 	} else {
    968 		/* Phyint or Register vif */
    969 		if (vifcp->vifc_flags & VIFF_REGISTER) {
    970 			/*
    971 			 * Note: Since all IPPROTO_IP level options (including
    972 			 * MRT_ADD_VIF) are done exclusively via
    973 			 * ip_optmgmt_writer(), a lock is not necessary to
    974 			 * protect reg_vif_num.
    975 			 */
    976 			mutex_enter(&ipst->ips_numvifs_mutex);
    977 			if (ipst->ips_reg_vif_num == ALL_VIFS) {
    978 				ipst->ips_reg_vif_num = vifcp->vifc_vifi;
    979 				mutex_exit(&ipst->ips_numvifs_mutex);
    980 			} else {
    981 				mutex_exit(&ipst->ips_numvifs_mutex);
    982 				VIF_REFRELE_LOCKED(vifp);
    983 				ipif_refrele(ipif);
    984 				ipsq_exit(ipsq);
    985 				return (EADDRINUSE);
    986 			}
    987 		}
    988 
    989 		/* Make sure the interface supports multicast */
    990 		if ((ipif->ipif_ill->ill_flags & ILLF_MULTICAST) == 0) {
    991 			VIF_REFRELE_LOCKED(vifp);
    992 			ipif_refrele(ipif);
    993 			if (vifcp->vifc_flags & VIFF_REGISTER) {
    994 				mutex_enter(&ipst->ips_numvifs_mutex);
    995 				ipst->ips_reg_vif_num = ALL_VIFS;
    996 				mutex_exit(&ipst->ips_numvifs_mutex);
    997 			}
    998 			ipsq_exit(ipsq);
    999 			return (EOPNOTSUPP);
   1000 		}
   1001 		/* Enable promiscuous reception of all IP mcasts from the if */
   1002 		mutex_exit(&vifp->v_lock);
   1003 		error = ip_addmulti(INADDR_ANY, ipif, ILGSTAT_NONE,
   1004 		    MODE_IS_EXCLUDE, NULL);
   1005 		mutex_enter(&vifp->v_lock);
   1006 		/*
   1007 		 * since we released the lock lets make sure that
   1008 		 * ip_mrouter_done() has not been called.
   1009 		 */
   1010 		if (error != 0 || is_mrouter_off(ipst)) {
   1011 			if (error == 0)
   1012 				(void) ip_delmulti(INADDR_ANY, ipif, B_TRUE,
   1013 				    B_TRUE);
   1014 			if (vifcp->vifc_flags & VIFF_REGISTER) {
   1015 				mutex_enter(&ipst->ips_numvifs_mutex);
   1016 				ipst->ips_reg_vif_num = ALL_VIFS;
   1017 				mutex_exit(&ipst->ips_numvifs_mutex);
   1018 			}
   1019 			VIF_REFRELE_LOCKED(vifp);
   1020 			ipif_refrele(ipif);
   1021 			ipsq_exit(ipsq);
   1022 			return (error?error:EINVAL);
   1023 		}
   1024 	}
   1025 	/* Define parameters for the tbf structure */
   1026 	vifp->v_tbf = v_tbf;
   1027 	gethrestime(&vifp->v_tbf->tbf_last_pkt_t);
   1028 	vifp->v_tbf->tbf_n_tok = 0;
   1029 	vifp->v_tbf->tbf_q_len = 0;
   1030 	vifp->v_tbf->tbf_max_q_len = MAXQSIZE;
   1031 	vifp->v_tbf->tbf_q = vifp->v_tbf->tbf_t = NULL;
   1032 
   1033 	vifp->v_flags = vifcp->vifc_flags;
   1034 	vifp->v_threshold = vifcp->vifc_threshold;
   1035 	vifp->v_lcl_addr = vifcp->vifc_lcl_addr;
   1036 	vifp->v_ipif = ipif;
   1037 	ipif_refrele(ipif);
   1038 	/* Scaling up here, allows division by 1024 in critical code.	*/
   1039 	vifp->v_rate_limit = vifcp->vifc_rate_limit * (1024/1000);
   1040 	vifp->v_timeout_id = 0;
   1041 	/* initialize per vif pkt counters */
   1042 	vifp->v_pkt_in = 0;
   1043 	vifp->v_pkt_out = 0;
   1044 	vifp->v_bytes_in = 0;
   1045 	vifp->v_bytes_out = 0;
   1046 	mutex_init(&vifp->v_tbf->tbf_lock, NULL, MUTEX_DEFAULT, NULL);
   1047 
   1048 	/* Adjust numvifs up, if the vifi is higher than numvifs */
   1049 	mutex_enter(&ipst->ips_numvifs_mutex);
   1050 	if (ipst->ips_numvifs <= vifcp->vifc_vifi)
   1051 		ipst->ips_numvifs = vifcp->vifc_vifi + 1;
   1052 	mutex_exit(&ipst->ips_numvifs_mutex);
   1053 
   1054 	if (ipst->ips_ip_mrtdebug > 1) {
   1055 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   1056 		    "add_vif: #%d, lcladdr %x, %s %x, thresh %x, rate %d",
   1057 		    vifcp->vifc_vifi,
   1058 		    ntohl(vifcp->vifc_lcl_addr.s_addr),
   1059 		    (vifcp->vifc_flags & VIFF_TUNNEL) ? "rmtaddr" : "mask",
   1060 		    ntohl(vifcp->vifc_rmt_addr.s_addr),
   1061 		    vifcp->vifc_threshold, vifcp->vifc_rate_limit);
   1062 	}
   1063 
   1064 	vifp->v_marks = VIF_MARK_GOOD;
   1065 	mutex_exit(&vifp->v_lock);
   1066 	ipsq_exit(ipsq);
   1067 	return (0);
   1068 }
   1069 
   1070 
   1071 /* Delete a vif from the vif table. */
   1072 static void
   1073 del_vifp(struct vif *vifp)
   1074 {
   1075 	struct tbf	*t = vifp->v_tbf;
   1076 	mblk_t  *mp0;
   1077 	vifi_t  vifi;
   1078 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
   1079 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
   1080 
   1081 	ASSERT(vifp->v_marks & VIF_MARK_CONDEMNED);
   1082 	ASSERT(t != NULL);
   1083 
   1084 	if (ipst->ips_ip_mrtdebug > 1) {
   1085 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   1086 		    "del_vif: src 0x%x\n", vifp->v_lcl_addr.s_addr);
   1087 	}
   1088 
   1089 	if (vifp->v_timeout_id != 0) {
   1090 		(void) untimeout(vifp->v_timeout_id);
   1091 		vifp->v_timeout_id = 0;
   1092 	}
   1093 
   1094 	/*
   1095 	 * Free packets queued at the interface.
   1096 	 * Mrouted takes care of cleaning up mfcs - makes calls to del_mfc.
   1097 	 */
   1098 	mutex_enter(&t->tbf_lock);
   1099 	while (t->tbf_q != NULL) {
   1100 		mp0 = t->tbf_q;
   1101 		t->tbf_q = t->tbf_q->b_next;
   1102 		mp0->b_prev = mp0->b_next = NULL;
   1103 		freemsg(mp0);
   1104 	}
   1105 	mutex_exit(&t->tbf_lock);
   1106 
   1107 	/*
   1108 	 * Always clear cache when vifs change.
   1109 	 * No need to get last_encap_lock since we are running as a writer.
   1110 	 */
   1111 	mutex_enter(&ipst->ips_last_encap_lock);
   1112 	if (vifp == ipst->ips_last_encap_vif) {
   1113 		ipst->ips_last_encap_vif = NULL;
   1114 		ipst->ips_last_encap_src = 0;
   1115 	}
   1116 	mutex_exit(&ipst->ips_last_encap_lock);
   1117 
   1118 	mutex_destroy(&t->tbf_lock);
   1119 
   1120 	bzero(vifp->v_tbf, sizeof (*(vifp->v_tbf)));
   1121 
   1122 	/* Adjust numvifs down */
   1123 	mutex_enter(&ipst->ips_numvifs_mutex);
   1124 	for (vifi = ipst->ips_numvifs; vifi != 0; vifi--) /* vifi is unsigned */
   1125 		if (ipst->ips_vifs[vifi - 1].v_lcl_addr.s_addr != 0)
   1126 			break;
   1127 	ipst->ips_numvifs = vifi;
   1128 	mutex_exit(&ipst->ips_numvifs_mutex);
   1129 
   1130 	bzero(vifp, sizeof (*vifp));
   1131 }
   1132 
   1133 static int
   1134 del_vif(vifi_t *vifip, conn_t *connp, mblk_t *first_mp, ip_stack_t *ipst)
   1135 {
   1136 	struct vif	*vifp = ipst->ips_vifs + *vifip;
   1137 	ipsq_t  	*ipsq;
   1138 
   1139 	if (*vifip >= ipst->ips_numvifs)
   1140 		return (EINVAL);
   1141 
   1142 	mutex_enter(&vifp->v_lock);
   1143 	/*
   1144 	 * Not initialized
   1145 	 * Here we are not looking at the vif that is being initialized
   1146 	 * i.e vifp->v_marks == 0 and refcnt > 0.
   1147 	 */
   1148 	if (vifp->v_lcl_addr.s_addr == 0 ||
   1149 	    !(vifp->v_marks & VIF_MARK_GOOD)) {
   1150 		mutex_exit(&vifp->v_lock);
   1151 		return (EADDRNOTAVAIL);
   1152 	}
   1153 
   1154 	/*
   1155 	 * This is an optimization, if first_mp == NULL
   1156 	 * than we are being called from reset_mrt_vif_ipif()
   1157 	 * so we already have exclusive access to the ipsq.
   1158 	 * the ASSERT below is a check for this condition.
   1159 	 */
   1160 	if (first_mp != NULL &&
   1161 	    !(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) {
   1162 		ASSERT(connp != NULL);
   1163 		/*
   1164 		 * We have to be exclusive as we have to call ip_delmulti()
   1165 		 * This is the best position to try to be exclusive in case
   1166 		 * we have to wait.
   1167 		 */
   1168 		ipsq = ipsq_try_enter(vifp->v_ipif, NULL, CONNP_TO_WQ(connp),
   1169 		    first_mp, ip_restart_optmgmt, NEW_OP, B_TRUE);
   1170 		if ((ipsq) == NULL) {
   1171 			mutex_exit(&vifp->v_lock);
   1172 			return (EINPROGRESS);
   1173 		}
   1174 		/* recheck after being exclusive */
   1175 		if (vifp->v_lcl_addr.s_addr == 0 ||
   1176 		    !vifp->v_marks & VIF_MARK_GOOD) {
   1177 			/*
   1178 			 * someone beat us.
   1179 			 */
   1180 			mutex_exit(&vifp->v_lock);
   1181 			ipsq_exit(ipsq);
   1182 			return (EADDRNOTAVAIL);
   1183 		}
   1184 	}
   1185 
   1186 
   1187 	ASSERT(IAM_WRITER_IPIF(vifp->v_ipif));
   1188 
   1189 	/* Clear VIF_MARK_GOOD and set VIF_MARK_CONDEMNED. */
   1190 	vifp->v_marks &= ~VIF_MARK_GOOD;
   1191 	vifp->v_marks |= VIF_MARK_CONDEMNED;
   1192 
   1193 	/* Phyint only */
   1194 	if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) {
   1195 		ipif_t *ipif = vifp->v_ipif;
   1196 		ASSERT(ipif != NULL);
   1197 		/*
   1198 		 * should be OK to drop the lock as we
   1199 		 * have marked this as CONDEMNED.
   1200 		 */
   1201 		mutex_exit(&(vifp)->v_lock);
   1202 		(void) ip_delmulti(INADDR_ANY, ipif, B_TRUE, B_TRUE);
   1203 		if (first_mp != NULL)
   1204 			ipsq_exit(ipsq);
   1205 		mutex_enter(&(vifp)->v_lock);
   1206 	}
   1207 
   1208 	/*
   1209 	 * decreases the refcnt added in add_vif.
   1210 	 */
   1211 	VIF_REFRELE_LOCKED(vifp);
   1212 	return (0);
   1213 }
   1214 
/*
 * Add an mfc entry.
 *
 * mfccp - request: origin, multicast group, parent vif and per-vif ttls.
 * ipst  - IP stack instance that owns the mfc hash table.
 *
 * Three cases are handled, in this order:
 *   1. MFCFIND locates an existing entry: only its parent and ttls are
 *      updated.
 *   2. One or more entries with queued upcall packets (mfc_rte != NULL)
 *      match: each is filled in, its timeout is cancelled and the queued
 *      packets are passed to ip_mdq().
 *   3. Nothing matched: under the bucket lock, a matching non-condemned
 *      entry is refilled, or a new entry is allocated and linked at the
 *      head of the hash chain.
 *
 * Returns 0 on success, EINVAL if the parent vif is out of range, has no
 * ipif, or multicast routing is off, and ENOBUFS if a new entry cannot be
 * allocated.
 */
static int
add_mfc(struct mfcctl *mfccp, ip_stack_t *ipst)
{
	struct mfc *rt;
	struct rtdetq *rte;
	ushort_t nstl;		/* number of matching entries with upcalls */
	int i;
	struct mfcb *mfcbp;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;

	/*
	 * The value of vifi is NO_VIF (==MAXVIFS) if Mrouted
	 * did not have a real route for pkt.
	 * We want this pkt without rt installed in the mfctable to prevent
	 * multiple tries, so go ahead and put it in mfctable, it will
	 * be discarded later in ip_mdq() because the child is NULL.
	 */

	/* Error checking, out of bounds? */
	if (mfccp->mfcc_parent > MAXVIFS) {
		ip0dbg(("ADD_MFC: mfcc_parent out of range %d",
		    (int)mfccp->mfcc_parent));
		return (EINVAL);
	}

	/* A real parent vif must have an ipif attached. */
	if ((mfccp->mfcc_parent != NO_VIF) &&
	    (ipst->ips_vifs[mfccp->mfcc_parent].v_ipif == NULL)) {
		ip0dbg(("ADD_MFC: NULL ipif for parent vif %d\n",
		    (int)mfccp->mfcc_parent));
		return (EINVAL);
	}

	if (is_mrouter_off(ipst)) {
		return (EINVAL);
	}

	/* Hold the hash bucket for the whole operation. */
	mfcbp = &ipst->ips_mfcs[MFCHASH(mfccp->mfcc_origin.s_addr,
	    mfccp->mfcc_mcastgrp.s_addr)];
	MFCB_REFHOLD(mfcbp);
	MFCFIND(mfcbp, mfccp->mfcc_origin.s_addr,
	    mfccp->mfcc_mcastgrp.s_addr, rt);

	/* If an entry already exists, just update the fields */
	if (rt) {
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "add_mfc: update o %x grp %x parent %x",
			    ntohl(mfccp->mfcc_origin.s_addr),
			    ntohl(mfccp->mfcc_mcastgrp.s_addr),
			    mfccp->mfcc_parent);
		}
		mutex_enter(&rt->mfc_mutex);
		rt->mfc_parent = mfccp->mfcc_parent;

		/* Copy ttls for the vifs currently counted by numvifs. */
		mutex_enter(&ipst->ips_numvifs_mutex);
		for (i = 0; i < (int)ipst->ips_numvifs; i++)
			rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
		mutex_exit(&ipst->ips_numvifs_mutex);
		mutex_exit(&rt->mfc_mutex);

		MFCB_REFRELE(mfcbp);
		return (0);
	}

	/*
	 * Find the entry for which the upcall was made and update.
	 */
	for (rt = mfcbp->mfcb_mfc, nstl = 0; rt; rt = rt->mfc_next) {
		mutex_enter(&rt->mfc_mutex);
		if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) &&
		    (rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr) &&
		    (rt->mfc_rte != NULL) &&
		    !(rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
			/* More than one match is unexpected; warn. */
			if (nstl++ != 0)
				cmn_err(CE_WARN,
				    "add_mfc: %s o %x g %x p %x",
				    "multiple kernel entries",
				    ntohl(mfccp->mfcc_origin.s_addr),
				    ntohl(mfccp->mfcc_mcastgrp.s_addr),
				    mfccp->mfcc_parent);

			if (ipst->ips_ip_mrtdebug > 1) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "add_mfc: o %x g %x p %x",
				    ntohl(mfccp->mfcc_origin.s_addr),
				    ntohl(mfccp->mfcc_mcastgrp.s_addr),
				    mfccp->mfcc_parent);
			}
			fill_route(rt, mfccp, ipst);

			/*
			 * Prevent cleanup of cache entry.
			 * Timer starts in ip_mforward.
			 */
			if (rt->mfc_timeout_id != 0) {
				timeout_id_t id;
				/* Save the id before it is cleared below. */
				id = rt->mfc_timeout_id;
				/*
				 * setting id to zero will avoid this
				 * entry from being cleaned up in
				 * expire_up_calls().
				 */
				rt->mfc_timeout_id = 0;
				/*
				 * dropping the lock is fine as we
				 * have a refhold on the bucket.
				 * so mfc cannot be freed.
				 * The timeout can fire but it will see
				 * that mfc_timeout_id == 0 and not cleanup.
				 */
				mutex_exit(&rt->mfc_mutex);
				(void) untimeout(id);
				mutex_enter(&rt->mfc_mutex);
			}

			/*
			 * Send all pkts that are queued waiting for the upcall.
			 * ip_mdq param tun set to 0 -
			 * the return value of ip_mdq() isn't used here,
			 * so value we send doesn't matter.
			 *
			 * Each rte is unlinked under mfc_mutex, then the
			 * mutex is dropped around ip_mdq() and the frees.
			 */
			while (rt->mfc_rte != NULL) {
				rte = rt->mfc_rte;
				rt->mfc_rte = rte->rte_next;
				mutex_exit(&rt->mfc_mutex);
				(void) ip_mdq(rte->mp, (ipha_t *)
				    rte->mp->b_rptr, rte->ill, 0, rt);
				freemsg(rte->mp);
				mi_free((char *)rte);
				mutex_enter(&rt->mfc_mutex);
			}
		}
		mutex_exit(&rt->mfc_mutex);
	}


	/*
	 * It is possible that an entry is being inserted without an upcall
	 */
	if (nstl == 0) {
		mutex_enter(&(mfcbp->mfcb_lock));
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "add_mfc: no upcall o %x g %x p %x",
			    ntohl(mfccp->mfcc_origin.s_addr),
			    ntohl(mfccp->mfcc_mcastgrp.s_addr),
			    mfccp->mfcc_parent);
		}
		/* Recheck now that the bucket lock is held. */
		if (is_mrouter_off(ipst)) {
			mutex_exit(&mfcbp->mfcb_lock);
			MFCB_REFRELE(mfcbp);
			return (EINVAL);
		}

		/* Reuse a matching, non-condemned entry if one exists. */
		for (rt = mfcbp->mfcb_mfc; rt; rt = rt->mfc_next) {

			mutex_enter(&rt->mfc_mutex);
			if ((rt->mfc_origin.s_addr ==
			    mfccp->mfcc_origin.s_addr) &&
			    (rt->mfc_mcastgrp.s_addr ==
			    mfccp->mfcc_mcastgrp.s_addr) &&
			    (!(rt->mfc_marks & MFCB_MARK_CONDEMNED))) {
				fill_route(rt, mfccp, ipst);
				mutex_exit(&rt->mfc_mutex);
				break;
			}
			mutex_exit(&rt->mfc_mutex);
		}

		/* No upcall, so make a new entry into mfctable */
		if (rt == NULL) {
			rt = (struct mfc *)mi_zalloc(sizeof (struct mfc));
			if (rt == NULL) {
				ip1dbg(("add_mfc: out of memory\n"));
				mutex_exit(&mfcbp->mfcb_lock);
				MFCB_REFRELE(mfcbp);
				return (ENOBUFS);
			}

			/* Insert new entry at head of hash chain */
			mutex_enter(&rt->mfc_mutex);
			fill_route(rt, mfccp, ipst);

			/* Link into table */
			rt->mfc_next   = mfcbp->mfcb_mfc;
			mfcbp->mfcb_mfc = rt;
			mutex_exit(&rt->mfc_mutex);
		}
		mutex_exit(&mfcbp->mfcb_lock);
	}

	MFCB_REFRELE(mfcbp);
	return (0);
}
   1413 
   1414 /*
   1415  * Fills in mfc structure from mrouted mfcctl.
   1416  */
   1417 static void
   1418 fill_route(struct mfc *rt, struct mfcctl *mfccp, ip_stack_t *ipst)
   1419 {
   1420 	int i;
   1421 
   1422 	rt->mfc_origin		= mfccp->mfcc_origin;
   1423 	rt->mfc_mcastgrp	= mfccp->mfcc_mcastgrp;
   1424 	rt->mfc_parent		= mfccp->mfcc_parent;
   1425 	mutex_enter(&ipst->ips_numvifs_mutex);
   1426 	for (i = 0; i < (int)ipst->ips_numvifs; i++) {
   1427 		rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
   1428 	}
   1429 	mutex_exit(&ipst->ips_numvifs_mutex);
   1430 	/* Initialize pkt counters per src-grp */
   1431 	rt->mfc_pkt_cnt	= 0;
   1432 	rt->mfc_byte_cnt	= 0;
   1433 	rt->mfc_wrong_if	= 0;
   1434 	rt->mfc_last_assert.tv_sec = rt->mfc_last_assert.tv_nsec = 0;
   1435 
   1436 }
   1437 
   1438 static void
   1439 free_queue(struct mfc *mfcp)
   1440 {
   1441 	struct rtdetq *rte0;
   1442 
   1443 	/*
   1444 	 * Drop all queued upcall packets.
   1445 	 * Free the mbuf with the pkt.
   1446 	 */
   1447 	while ((rte0 = mfcp->mfc_rte) != NULL) {
   1448 		mfcp->mfc_rte = rte0->rte_next;
   1449 		freemsg(rte0->mp);
   1450 		mi_free((char *)rte0);
   1451 	}
   1452 }
   1453 /*
   1454  * go thorugh the hash bucket and free all the entries marked condemned.
   1455  */
   1456 void
   1457 release_mfc(struct mfcb *mfcbp)
   1458 {
   1459 	struct mfc *current_mfcp;
   1460 	struct mfc *prev_mfcp;
   1461 
   1462 	prev_mfcp = current_mfcp = mfcbp->mfcb_mfc;
   1463 
   1464 	while (current_mfcp != NULL) {
   1465 		if (current_mfcp->mfc_marks & MFCB_MARK_CONDEMNED) {
   1466 			if (current_mfcp == mfcbp->mfcb_mfc) {
   1467 				mfcbp->mfcb_mfc = current_mfcp->mfc_next;
   1468 				free_queue(current_mfcp);
   1469 				mi_free(current_mfcp);
   1470 				prev_mfcp = current_mfcp = mfcbp->mfcb_mfc;
   1471 				continue;
   1472 			}
   1473 			ASSERT(prev_mfcp != NULL);
   1474 			prev_mfcp->mfc_next = current_mfcp->mfc_next;
   1475 			free_queue(current_mfcp);
   1476 			mi_free(current_mfcp);
   1477 			current_mfcp = NULL;
   1478 		} else {
   1479 			prev_mfcp = current_mfcp;
   1480 		}
   1481 
   1482 		current_mfcp = prev_mfcp->mfc_next;
   1483 
   1484 	}
   1485 	mfcbp->mfcb_marks &= ~MFCB_MARK_CONDEMNED;
   1486 	ASSERT(mfcbp->mfcb_mfc != NULL || mfcbp->mfcb_marks == 0);
   1487 }
   1488 
   1489 /*
   1490  * Delete an mfc entry.
   1491  */
   1492 static int
   1493 del_mfc(struct mfcctl *mfccp, ip_stack_t *ipst)
   1494 {
   1495 	struct in_addr	origin;
   1496 	struct in_addr	mcastgrp;
   1497 	struct mfc 	*rt;
   1498 	uint_t		hash;
   1499 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
   1500 
   1501 	origin = mfccp->mfcc_origin;
   1502 	mcastgrp = mfccp->mfcc_mcastgrp;
   1503 	hash = MFCHASH(origin.s_addr, mcastgrp.s_addr);
   1504 
   1505 	if (ipst->ips_ip_mrtdebug > 1) {
   1506 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   1507 		    "del_mfc: o %x g %x",
   1508 		    ntohl(origin.s_addr),
   1509 		    ntohl(mcastgrp.s_addr));
   1510 	}
   1511 
   1512 	MFCB_REFHOLD(&ipst->ips_mfcs[hash]);
   1513 
   1514 	/* Find mfc in mfctable, finds only entries without upcalls */
   1515 	for (rt = ipst->ips_mfcs[hash].mfcb_mfc; rt; rt = rt->mfc_next) {
   1516 		mutex_enter(&rt->mfc_mutex);
   1517 		if (origin.s_addr == rt->mfc_origin.s_addr &&
   1518 		    mcastgrp.s_addr == rt->mfc_mcastgrp.s_addr &&
   1519 		    rt->mfc_rte == NULL &&
   1520 		    !(rt->mfc_marks & MFCB_MARK_CONDEMNED))
   1521 			break;
   1522 		mutex_exit(&rt->mfc_mutex);
   1523 	}
   1524 
   1525 	/*
   1526 	 * Return if there was an upcall (mfc_rte != NULL,
   1527 	 * or rt not in mfctable.
   1528 	 */
   1529 	if (rt == NULL) {
   1530 		MFCB_REFRELE(&ipst->ips_mfcs[hash]);
   1531 		return (EADDRNOTAVAIL);
   1532 	}
   1533 
   1534 
   1535 	/*
   1536 	 * no need to hold lock as we have a reference.
   1537 	 */
   1538 	ipst->ips_mfcs[hash].mfcb_marks |= MFCB_MARK_CONDEMNED;
   1539 	/* error checking */
   1540 	if (rt->mfc_timeout_id != 0) {
   1541 		ip0dbg(("del_mfc: TIMEOUT NOT 0, rte not null"));
   1542 		/*
   1543 		 * Its ok to drop the lock,  the struct cannot be freed
   1544 		 * since we have a ref on the hash bucket.
   1545 		 */
   1546 		rt->mfc_timeout_id = 0;
   1547 		mutex_exit(&rt->mfc_mutex);
   1548 		(void) untimeout(rt->mfc_timeout_id);
   1549 		mutex_enter(&rt->mfc_mutex);
   1550 	}
   1551 
   1552 	ASSERT(rt->mfc_rte == NULL);
   1553 
   1554 
   1555 	/*
   1556 	 * Delete the entry from the cache
   1557 	 */
   1558 	rt->mfc_marks |= MFCB_MARK_CONDEMNED;
   1559 	mutex_exit(&rt->mfc_mutex);
   1560 
   1561 	MFCB_REFRELE(&ipst->ips_mfcs[hash]);
   1562 
   1563 	return (0);
   1564 }
   1565 
/*
 * Length in bytes of the IP option used for tunnel encapsulation.
 * ip_mforward() adds it to IP_SIMPLE_HDR_LENGTH and shifts the sum right
 * by 2 before comparing it with the header-length nibble, as part of
 * recognizing source-routed (IPOPT_LSRR) packets.
 */
#define	TUNNEL_LEN  12  /* # bytes of IP option for tunnel encapsulation  */
   1567 
   1568 /*
   1569  * IP multicast forwarding function. This function assumes that the packet
   1570  * pointed to by ipha has arrived on (or is about to be sent to) the interface
   1571  * pointed to by "ill", and the packet is to be relayed to other networks
   1572  * that have members of the packet's destination IP multicast group.
   1573  *
   1574  * The packet is returned unscathed to the caller, unless it is
   1575  * erroneous, in which case a -1 value tells the caller (IP)
   1576  * to discard it.
   1577  *
   1578  * Unlike BSD, SunOS 5.x needs to return to IP info about
   1579  * whether pkt came in thru a tunnel, so it can be discarded, unless
   1580  * it's IGMP. In BSD, the ifp is bogus for tunnels, so pkt won't try
   1581  * to be delivered.
   1582  * Return values are 0 - pkt is okay and phyint
   1583  *		    -1 - pkt is malformed and to be tossed
   1584  *                   1 - pkt came in on tunnel
   1585  */
   1586 int
   1587 ip_mforward(ill_t *ill, ipha_t *ipha, mblk_t *mp)
   1588 {
   1589 	struct mfc 	*rt;
   1590 	ipaddr_t	src, dst, tunnel_src = 0;
   1591 	static int	srctun = 0;
   1592 	vifi_t		vifi;
   1593 	boolean_t	pim_reg_packet = B_FALSE;
   1594 	struct mfcb *mfcbp;
   1595 	ip_stack_t	*ipst = ill->ill_ipst;
   1596 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
   1597 
   1598 	if (ipst->ips_ip_mrtdebug > 1) {
   1599 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   1600 		    "ip_mforward: RECV ipha_src %x, ipha_dst %x, ill %s",
   1601 		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst),
   1602 		    ill->ill_name);
   1603 	}
   1604 
   1605 	dst = ipha->ipha_dst;
   1606 	if ((uint32_t)(uintptr_t)mp->b_prev == PIM_REGISTER_MARKER)
   1607 		pim_reg_packet = B_TRUE;
   1608 	else
   1609 		tunnel_src = (ipaddr_t)(uintptr_t)mp->b_prev;
   1610 
   1611 	/*
   1612 	 * Don't forward a packet with time-to-live of zero or one,
   1613 	 * or a packet destined to a local-only group.
   1614 	 */
   1615 	if (CLASSD(dst) && (ipha->ipha_ttl <= 1 ||
   1616 	    (ipaddr_t)ntohl(dst) <= INADDR_MAX_LOCAL_GROUP)) {
   1617 		if (ipst->ips_ip_mrtdebug > 1) {
   1618 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   1619 			    "ip_mforward: not forwarded ttl %d,"
   1620 			    " dst 0x%x ill %s",
   1621 			    ipha->ipha_ttl, ntohl(dst), ill->ill_name);
   1622 		}
   1623 		mp->b_prev = NULL;
   1624 		if (tunnel_src != 0)
   1625 			return (1);
   1626 		else
   1627 			return (0);
   1628 	}
   1629 
   1630 	if ((tunnel_src != 0) || pim_reg_packet) {
   1631 		/*
   1632 		 * Packet arrived over an encapsulated tunnel or via a PIM
   1633 		 * register message. Both ip_mroute_decap() and pim_input()
   1634 		 * encode information in mp->b_prev.
   1635 		 */
   1636 		mp->b_prev = NULL;
   1637 		if (ipst->ips_ip_mrtdebug > 1) {
   1638 			if (tunnel_src != 0) {
   1639 				(void) mi_strlog(mrouter->conn_rq, 1,
   1640 				    SL_TRACE,
   1641 				    "ip_mforward: ill %s arrived via ENCAP TUN",
   1642 				    ill->ill_name);
   1643 			} else if (pim_reg_packet) {
   1644 				(void) mi_strlog(mrouter->conn_rq, 1,
   1645 				    SL_TRACE,
   1646 				    "ip_mforward: ill %s arrived via"
   1647 				    "  REGISTER VIF",
   1648 				    ill->ill_name);
   1649 			}
   1650 		}
   1651 	} else if ((ipha->ipha_version_and_hdr_length & 0xf) <
   1652 	    (uint_t)(IP_SIMPLE_HDR_LENGTH + TUNNEL_LEN) >> 2 ||
   1653 	    ((uchar_t *)(ipha + 1))[1] != IPOPT_LSRR) {
   1654 		/* Packet arrived via a physical interface. */
   1655 		if (ipst->ips_ip_mrtdebug > 1) {
   1656 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   1657 			    "ip_mforward: ill %s arrived via PHYINT",
   1658 			    ill->ill_name);
   1659 		}
   1660 
   1661 	} else {
   1662 		/*
   1663 		 * Packet arrived through a SRCRT tunnel.
   1664 		 * Source-route tunnels are no longer supported.
   1665 		 * Error message printed every 1000 times.
   1666 		 */
   1667 		if ((srctun++ % 1000) == 0) {
   1668 			cmn_err(CE_WARN,
   1669 			    "ip_mforward: received source-routed pkt from %x",
   1670 			    ntohl(ipha->ipha_src));
   1671 		}
   1672 		return (-1);
   1673 	}
   1674 
   1675 	ipst->ips_mrtstat->mrts_fwd_in++;
   1676 	src = ipha->ipha_src;
   1677 
   1678 	/* Find route in cache, return NULL if not there or upcalls q'ed. */
   1679 
   1680 	/*
   1681 	 * Lock the mfctable against changes made by ip_mforward.
   1682 	 * Note that only add_mfc and del_mfc can remove entries and
   1683 	 * they run with exclusive access to IP. So we do not need to
   1684 	 * guard against the rt being deleted, so release lock after reading.
   1685 	 */
   1686 
   1687 	if (is_mrouter_off(ipst))
   1688 		return (-1);
   1689 
   1690 	mfcbp = &ipst->ips_mfcs[MFCHASH(src, dst)];
   1691 	MFCB_REFHOLD(mfcbp);
   1692 	MFCFIND(mfcbp, src, dst, rt);
   1693 
   1694 	/* Entry exists, so forward if necessary */
   1695 	if (rt != NULL) {
   1696 		int ret = 0;
   1697 		ipst->ips_mrtstat->mrts_mfc_hits++;
   1698 		if (pim_reg_packet) {
   1699 			ASSERT(ipst->ips_reg_vif_num != ALL_VIFS);
   1700 			ret = ip_mdq(mp, ipha,
   1701 			    ipst->ips_vifs[ipst->ips_reg_vif_num].
   1702 			    v_ipif->ipif_ill,
   1703 			    0, rt);
   1704 		} else {
   1705 			ret = ip_mdq(mp, ipha, ill, tunnel_src, rt);
   1706 		}
   1707 
   1708 		MFCB_REFRELE(mfcbp);
   1709 		return (ret);
   1710 
   1711 		/*
   1712 		 * Don't forward if we don't have a cache entry.  Mrouted will
   1713 		 * always provide a cache entry in response to an upcall.
   1714 		 */
   1715 	} else {
   1716 		/*
   1717 		 * If we don't have a route for packet's origin, make a copy
   1718 		 * of the packet and send message to routing daemon.
   1719 		 */
   1720 		struct mfc	*mfc_rt	 = NULL;
   1721 		mblk_t		*mp0	 = NULL;
   1722 		mblk_t		*mp_copy = NULL;
   1723 		struct rtdetq	*rte	 = NULL;
   1724 		struct rtdetq	*rte_m, *rte1, *prev_rte;
   1725 		uint_t		hash;
   1726 		int		npkts;
   1727 		boolean_t	new_mfc = B_FALSE;
   1728 		ipst->ips_mrtstat->mrts_mfc_misses++;
   1729 		/* BSD uses mrts_no_route++ */
   1730 		if (ipst->ips_ip_mrtdebug > 1) {
   1731 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   1732 			    "ip_mforward: no rte ill %s src %x g %x misses %d",
   1733 			    ill->ill_name, ntohl(src), ntohl(dst),
   1734 			    (int)ipst->ips_mrtstat->mrts_mfc_misses);
   1735 		}
   1736 		/*
   1737 		 * The order of the following code differs from the BSD code.
   1738 		 * Pre-mc3.5, the BSD code was incorrect and SunOS 5.x
   1739 		 * code works, so SunOS 5.x wasn't changed to conform to the
   1740 		 * BSD version.
   1741 		 */
   1742 
   1743 		/* Lock mfctable. */
   1744 		hash = MFCHASH(src, dst);
   1745 		mutex_enter(&(ipst->ips_mfcs[hash].mfcb_lock));
   1746 
   1747 		/*
   1748 		 * If we are turning off mrouted return an error
   1749 		 */
   1750 		if (is_mrouter_off(ipst)) {
   1751 			mutex_exit(&mfcbp->mfcb_lock);
   1752 			MFCB_REFRELE(mfcbp);
   1753 			return (-1);
   1754 		}
   1755 
   1756 		/* Is there an upcall waiting for this packet? */
   1757 		for (mfc_rt = ipst->ips_mfcs[hash].mfcb_mfc; mfc_rt;
   1758 		    mfc_rt = mfc_rt->mfc_next) {
   1759 			mutex_enter(&mfc_rt->mfc_mutex);
   1760 			if (ipst->ips_ip_mrtdebug > 1) {
   1761 				(void) mi_strlog(mrouter->conn_rq, 1,
   1762 				    SL_TRACE,
   1763 				    "ip_mforward: MFCTAB hash %d o 0x%x"
   1764 				    " g 0x%x\n",
   1765 				    hash, ntohl(mfc_rt->mfc_origin.s_addr),
   1766 				    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
   1767 			}
   1768 			/* There is an upcall */
   1769 			if ((src == mfc_rt->mfc_origin.s_addr) &&
   1770 			    (dst == mfc_rt->mfc_mcastgrp.s_addr) &&
   1771 			    (mfc_rt->mfc_rte != NULL) &&
   1772 			    !(mfc_rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
   1773 				break;
   1774 			}
   1775 			mutex_exit(&mfc_rt->mfc_mutex);
   1776 		}
   1777 		/* No upcall, so make a new entry into mfctable */
   1778 		if (mfc_rt == NULL) {
   1779 			mfc_rt = (struct mfc *)mi_zalloc(sizeof (struct mfc));
   1780 			if (mfc_rt == NULL) {
   1781 				ipst->ips_mrtstat->mrts_fwd_drop++;
   1782 				ip1dbg(("ip_mforward: out of memory "
   1783 				    "for mfc, mfc_rt\n"));
   1784 				goto error_return;
   1785 			} else
   1786 				new_mfc = B_TRUE;
   1787 			/* Get resources */
   1788 			/* TODO could copy header and dup rest */
   1789 			mp_copy = copymsg(mp);
   1790 			if (mp_copy == NULL) {
   1791 				ipst->ips_mrtstat->mrts_fwd_drop++;
   1792 				ip1dbg(("ip_mforward: out of memory for "
   1793 				    "mblk, mp_copy\n"));
   1794 				goto error_return;
   1795 			}
   1796 			mutex_enter(&mfc_rt->mfc_mutex);
   1797 		}
   1798 		/* Get resources for rte, whether first rte or not first. */
   1799 		/* Add this packet into rtdetq */
   1800 		rte = (struct rtdetq *)mi_zalloc(sizeof (struct rtdetq));
   1801 		if (rte == NULL) {
   1802 			ipst->ips_mrtstat->mrts_fwd_drop++;
   1803 			mutex_exit(&mfc_rt->mfc_mutex);
   1804 			ip1dbg(("ip_mforward: out of memory for"
   1805 			    " rtdetq, rte\n"));
   1806 			goto error_return;
   1807 		}
   1808 
   1809 		mp0 = copymsg(mp);
   1810 		if (mp0 == NULL) {
   1811 			ipst->ips_mrtstat->mrts_fwd_drop++;
   1812 			ip1dbg(("ip_mforward: out of memory for mblk, mp0\n"));
   1813 			mutex_exit(&mfc_rt->mfc_mutex);
   1814 			goto error_return;
   1815 		}
   1816 		rte->mp		= mp0;
   1817 		if (pim_reg_packet) {
   1818 			ASSERT(ipst->ips_reg_vif_num != ALL_VIFS);
   1819 			rte->ill =
   1820 			    ipst->ips_vifs[ipst->ips_reg_vif_num].
   1821 			    v_ipif->ipif_ill;
   1822 		} else {
   1823 			rte->ill = ill;
   1824 		}
   1825 		rte->rte_next	= NULL;
   1826 
   1827 		/*
   1828 		 * Determine if upcall q (rtdetq) has overflowed.
   1829 		 * mfc_rt->mfc_rte is null by mi_zalloc
   1830 		 * if it is the first message.
   1831 		 */
   1832 		for (rte_m = mfc_rt->mfc_rte, npkts = 0; rte_m;
   1833 		    rte_m = rte_m->rte_next)
   1834 			npkts++;
   1835 		if (ipst->ips_ip_mrtdebug > 1) {
   1836 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   1837 			    "ip_mforward: upcalls %d\n", npkts);
   1838 		}
   1839 		if (npkts > MAX_UPQ) {
   1840 			ipst->ips_mrtstat->mrts_upq_ovflw++;
   1841 			mutex_exit(&mfc_rt->mfc_mutex);
   1842 			goto error_return;
   1843 		}
   1844 
   1845 		if (npkts == 0) {	/* first upcall */
   1846 			int i = 0;
   1847 			/*
   1848 			 * Now finish installing the new mfc! Now that we have
   1849 			 * resources!  Insert new entry at head of hash chain.
   1850 			 * Use src and dst which are ipaddr_t's.
   1851 			 */
   1852 			mfc_rt->mfc_origin.s_addr = src;
   1853 			mfc_rt->mfc_mcastgrp.s_addr = dst;
   1854 
   1855 			mutex_enter(&ipst->ips_numvifs_mutex);
   1856 			for (i = 0; i < (int)ipst->ips_numvifs; i++)
   1857 				mfc_rt->mfc_ttls[i] = 0;
   1858 			mutex_exit(&ipst->ips_numvifs_mutex);
   1859 			mfc_rt->mfc_parent = ALL_VIFS;
   1860 
   1861 			/* Link into table */
   1862 			if (ipst->ips_ip_mrtdebug > 1) {
   1863 				(void) mi_strlog(mrouter->conn_rq, 1,
   1864 				    SL_TRACE,
   1865 				    "ip_mforward: NEW MFCTAB hash %d o 0x%x "
   1866 				    "g 0x%x\n", hash,
   1867 				    ntohl(mfc_rt->mfc_origin.s_addr),
   1868 				    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
   1869 			}
   1870 			mfc_rt->mfc_next = ipst->ips_mfcs[hash].mfcb_mfc;
   1871 			ipst->ips_mfcs[hash].mfcb_mfc = mfc_rt;
   1872 			mfc_rt->mfc_rte = NULL;
   1873 		}
   1874 
   1875 		/* Link in the upcall */
   1876 		/* First upcall */
   1877 		if (mfc_rt->mfc_rte == NULL)
   1878 			mfc_rt->mfc_rte = rte;
   1879 		else {
   1880 			/* not the first upcall */
   1881 			prev_rte = mfc_rt->mfc_rte;
   1882 			for (rte1 = mfc_rt->mfc_rte->rte_next; rte1;
   1883 			    prev_rte = rte1, rte1 = rte1->rte_next)
   1884 				;
   1885 			prev_rte->rte_next = rte;
   1886 		}
   1887 
   1888 		/*
   1889 		 * No upcalls waiting, this is first one, so send a message to
   1890 		 * routing daemon to install a route into kernel table.
   1891 		 */
   1892 		if (npkts == 0) {
   1893 			struct igmpmsg	*im;
   1894 			/* ipha_protocol is 0, for upcall */
   1895 			ASSERT(mp_copy != NULL);
   1896 			im = (struct igmpmsg *)mp_copy->b_rptr;
   1897 			im->im_msgtype	= IGMPMSG_NOCACHE;
   1898 			im->im_mbz = 0;
   1899 			mutex_enter(&ipst->ips_numvifs_mutex);
   1900 			if (pim_reg_packet) {
   1901 				im->im_vif = (uchar_t)ipst->ips_reg_vif_num;
   1902 				mutex_exit(&ipst->ips_numvifs_mutex);
   1903 			} else {
   1904 				/*
   1905 				 * XXX do we need to hold locks here ?
   1906 				 */
   1907 				for (vifi = 0;
   1908 				    vifi < ipst->ips_numvifs;
   1909 				    vifi++) {
   1910 					if (ipst->ips_vifs[vifi].v_ipif == NULL)
   1911 						continue;
   1912 					if (ipst->ips_vifs[vifi].
   1913 					    v_ipif->ipif_ill == ill) {
   1914 						im->im_vif = (uchar_t)vifi;
   1915 						break;
   1916 					}
   1917 				}
   1918 				mutex_exit(&ipst->ips_numvifs_mutex);
   1919 				ASSERT(vifi < ipst->ips_numvifs);
   1920 			}
   1921 
   1922 			ipst->ips_mrtstat->mrts_upcalls++;
   1923 			/* Timer to discard upcalls if mrouted is too slow */
   1924 			mfc_rt->mfc_timeout_id = timeout(expire_upcalls,
   1925 			    mfc_rt, EXPIRE_TIMEOUT * UPCALL_EXPIRE);
   1926 			mutex_exit(&mfc_rt->mfc_mutex);
   1927 			mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
   1928 			/* Pass to RAWIP */
   1929 			(mrouter->conn_recv)(mrouter, mp_copy, NULL);
   1930 		} else {
   1931 			mutex_exit(&mfc_rt->mfc_mutex);
   1932 			mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
   1933 			freemsg(mp_copy);
   1934 		}
   1935 
   1936 		MFCB_REFRELE(mfcbp);
   1937 		if (tunnel_src != 0)
   1938 			return (1);
   1939 		else
   1940 			return (0);
   1941 	error_return:
   1942 		mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
   1943 		MFCB_REFRELE(mfcbp);
   1944 		if (mfc_rt != NULL && (new_mfc == B_TRUE))
   1945 			mi_free((char *)mfc_rt);
   1946 		if (rte != NULL)
   1947 			mi_free((char *)rte);
   1948 		if (mp_copy != NULL)
   1949 			freemsg(mp_copy);
   1950 		if (mp0 != NULL)
   1951 			freemsg(mp0);
   1952 		return (-1);
   1953 	}
   1954 }
   1955 
   1956 /*
   1957  * Clean up the mfctable cache entry if upcall is not serviced.
   1958  * SunOS 5.x has timeout per mfc, unlike BSD which has one timer.
   1959  */
   1960 static void
   1961 expire_upcalls(void *arg)
   1962 {
   1963 	struct mfc *mfc_rt = arg;
   1964 	uint_t hash;
   1965 	struct mfc *prev_mfc, *mfc0;
   1966 	ip_stack_t	*ipst;
   1967 	conn_t		*mrouter;
   1968 
   1969 	if (mfc_rt->mfc_rte == NULL || mfc_rt->mfc_rte->ill != NULL) {
   1970 		cmn_err(CE_WARN, "expire_upcalls: no ILL\n");
   1971 		return;
   1972 	}
   1973 	ipst = mfc_rt->mfc_rte->ill->ill_ipst;
   1974 	mrouter = ipst->ips_ip_g_mrouter;
   1975 
   1976 	hash = MFCHASH(mfc_rt->mfc_origin.s_addr, mfc_rt->mfc_mcastgrp.s_addr);
   1977 	if (ipst->ips_ip_mrtdebug > 1) {
   1978 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   1979 		    "expire_upcalls: hash %d s %x g %x",
   1980 		    hash, ntohl(mfc_rt->mfc_origin.s_addr),
   1981 		    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
   1982 	}
   1983 	MFCB_REFHOLD(&ipst->ips_mfcs[hash]);
   1984 	mutex_enter(&mfc_rt->mfc_mutex);
   1985 	/*
   1986 	 * if timeout has been set to zero, than the
   1987 	 * entry has been filled, no need to delete it.
   1988 	 */
   1989 	if (mfc_rt->mfc_timeout_id == 0)
   1990 		goto done;
   1991 	ipst->ips_mrtstat->mrts_cache_cleanups++;
   1992 	mfc_rt->mfc_timeout_id = 0;
   1993 
   1994 	/* Determine entry to be cleaned up in cache table. */
   1995 	for (prev_mfc = mfc0 = ipst->ips_mfcs[hash].mfcb_mfc; mfc0;
   1996 	    prev_mfc = mfc0, mfc0 = mfc0->mfc_next)
   1997 		if (mfc0 == mfc_rt)
   1998 			break;
   1999 
   2000 	/* del_mfc takes care of gone mfcs */
   2001 	ASSERT(prev_mfc != NULL);
   2002 	ASSERT(mfc0 != NULL);
   2003 
   2004 	/*
   2005 	 * Delete the entry from the cache
   2006 	 */
   2007 	ipst->ips_mfcs[hash].mfcb_marks |= MFCB_MARK_CONDEMNED;
   2008 	mfc_rt->mfc_marks |= MFCB_MARK_CONDEMNED;
   2009 
   2010 	/*
   2011 	 * release_mfc will drop all queued upcall packets.
   2012 	 * and will free the mbuf with the pkt, if, timing info.
   2013 	 */
   2014 done:
   2015 	mutex_exit(&mfc_rt->mfc_mutex);
   2016 	MFCB_REFRELE(&ipst->ips_mfcs[hash]);
   2017 }
   2018 
   2019 /*
   2020  * Packet forwarding routine once entry in the cache is made.
   2021  */
   2022 static int
   2023 ip_mdq(mblk_t *mp, ipha_t *ipha, ill_t *ill, ipaddr_t tunnel_src,
   2024     struct mfc *rt)
   2025 {
   2026 	ill_t *vill;
   2027 	vifi_t vifi;
   2028 	struct vif *vifp;
   2029 	ipaddr_t dst = ipha->ipha_dst;
   2030 	size_t  plen = msgdsize(mp);
   2031 	vifi_t num_of_vifs;
   2032 	ip_stack_t	*ipst = ill->ill_ipst;
   2033 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
   2034 
   2035 	if (ipst->ips_ip_mrtdebug > 1) {
   2036 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   2037 		    "ip_mdq: SEND src %x, ipha_dst %x, ill %s",
   2038 		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst),
   2039 		    ill->ill_name);
   2040 	}
   2041 
   2042 	/* Macro to send packet on vif */
   2043 #define	MC_SEND(ipha, mp, vifp, dst) { \
   2044 	if ((vifp)->v_flags & VIFF_TUNNEL) \
   2045 		encap_send((ipha), (mp), (vifp), (dst)); \
   2046 	else if ((vifp)->v_flags & VIFF_REGISTER) \
   2047 		register_send((ipha), (mp), (vifp), (dst)); \
   2048 	else \
   2049 		phyint_send((ipha), (mp), (vifp), (dst)); \
   2050 }
   2051 
   2052 	vifi = rt->mfc_parent;
   2053 
   2054 	/*
   2055 	 * The value of vifi is MAXVIFS if the pkt had no parent, i.e.,
   2056 	 * Mrouted had no route.
   2057 	 * We wanted the route installed in the mfctable to prevent multiple
   2058 	 * tries, so it passed add_mfc(), but is discarded here. The v_ipif is
   2059 	 * NULL so we don't want to check the ill. Still needed as of Mrouted
   2060 	 * 3.6.
   2061 	 */
   2062 	if (vifi == NO_VIF) {
   2063 		ip1dbg(("ip_mdq: no route for origin ill %s, vifi is NO_VIF\n",
   2064 		    ill->ill_name));
   2065 		if (ipst->ips_ip_mrtdebug > 1) {
   2066 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   2067 			    "ip_mdq: vifi is NO_VIF ill = %s", ill->ill_name);
   2068 		}
   2069 		return (-1);	/* drop pkt */
   2070 	}
   2071 
   2072 	if (!lock_good_vif(&ipst->ips_vifs[vifi]))
   2073 		return (-1);
   2074 	/*
   2075 	 * The MFC entries are not cleaned up when an ipif goes
   2076 	 * away thus this code has to guard against an MFC referencing
   2077 	 * an ipif that has been closed. Note: reset_mrt_vif_ipif
   2078 	 * sets the v_ipif to NULL when the ipif disappears.
   2079 	 */
   2080 	ASSERT(ipst->ips_vifs[vifi].v_ipif != NULL);
   2081 
   2082 	if (vifi >= ipst->ips_numvifs) {
   2083 		cmn_err(CE_WARN, "ip_mdq: illegal vifi %d numvifs "
   2084 		    "%d ill %s viftable ill %s\n",
   2085 		    (int)vifi, (int)ipst->ips_numvifs, ill->ill_name,
   2086 		    ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name);
   2087 		unlock_good_vif(&ipst->ips_vifs[vifi]);
   2088 		return (-1);
   2089 	}
   2090 	/*
   2091 	 * Don't forward if it didn't arrive from the parent vif for its
   2092 	 * origin.
   2093 	 */
   2094 	vill = ipst->ips_vifs[vifi].v_ipif->ipif_ill;
   2095 	if ((vill != ill && !IS_IN_SAME_ILLGRP(vill, ill)) ||
   2096 	    (ipst->ips_vifs[vifi].v_rmt_addr.s_addr != tunnel_src)) {
   2097 		/* Came in the wrong interface */
   2098 		ip1dbg(("ip_mdq: arrived wrong if, vifi %d "
   2099 			"numvifs %d ill %s viftable ill %s\n",
   2100 			(int)vifi, (int)ipst->ips_numvifs, ill->ill_name,
   2101 			vill->ill_name));
   2102 		if (ipst->ips_ip_mrtdebug > 1) {
   2103 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   2104 			    "ip_mdq: arrived wrong if, vifi %d ill "
   2105 			    "%s viftable ill %s\n",
   2106 			    (int)vifi, ill->ill_name, vill->ill_name);
   2107 		}
   2108 		ipst->ips_mrtstat->mrts_wrong_if++;
   2109 		rt->mfc_wrong_if++;
   2110 
   2111 		/*
   2112 		 * If we are doing PIM assert processing and we are forwarding
   2113 		 * packets on this interface, and it is a broadcast medium
   2114 		 * interface (and not a tunnel), send a message to the routing.
   2115 		 *
   2116 		 * We use the first ipif on the list, since it's all we have.
   2117 		 * Chances are the ipif_flags are the same for ipifs on the ill.
   2118 		 */
   2119 		if (ipst->ips_pim_assert && rt->mfc_ttls[vifi] > 0 &&
   2120 		    (ill->ill_ipif->ipif_flags & IPIF_BROADCAST) &&
   2121 		    !(ipst->ips_vifs[vifi].v_flags & VIFF_TUNNEL)) {
   2122 			mblk_t		*mp_copy;
   2123 			struct igmpmsg	*im;
   2124 
   2125 			/* TODO could copy header and dup rest */
   2126 			mp_copy = copymsg(mp);
   2127 			if (mp_copy == NULL) {
   2128 				ipst->ips_mrtstat->mrts_fwd_drop++;
   2129 				ip1dbg(("ip_mdq: out of memory "
   2130 				    "for mblk, mp_copy\n"));
   2131 				unlock_good_vif(&ipst->ips_vifs[vifi]);
   2132 				return (-1);
   2133 			}
   2134 
   2135 			im = (struct igmpmsg *)mp_copy->b_rptr;
   2136 			im->im_msgtype = IGMPMSG_WRONGVIF;
   2137 			im->im_mbz = 0;
   2138 			im->im_vif = (ushort_t)vifi;
   2139 			/* Pass to RAWIP */
   2140 			(mrouter->conn_recv)(mrouter, mp_copy, NULL);
   2141 		}
   2142 		unlock_good_vif(&ipst->ips_vifs[vifi]);
   2143 		if (tunnel_src != 0)
   2144 			return (1);
   2145 		else
   2146 			return (0);
   2147 	}
   2148 	/*
   2149 	 * If I sourced this packet, it counts as output, else it was input.
   2150 	 */
   2151 	if (ipha->ipha_src == ipst->ips_vifs[vifi].v_lcl_addr.s_addr) {
   2152 		ipst->ips_vifs[vifi].v_pkt_out++;
   2153 		ipst->ips_vifs[vifi].v_bytes_out += plen;
   2154 	} else {
   2155 		ipst->ips_vifs[vifi].v_pkt_in++;
   2156 		ipst->ips_vifs[vifi].v_bytes_in += plen;
   2157 	}
   2158 	mutex_enter(&rt->mfc_mutex);
   2159 	rt->mfc_pkt_cnt++;
   2160 	rt->mfc_byte_cnt += plen;
   2161 	mutex_exit(&rt->mfc_mutex);
   2162 	unlock_good_vif(&ipst->ips_vifs[vifi]);
   2163 	/*
   2164 	 * For each vif, decide if a copy of the packet should be forwarded.
   2165 	 * Forward if:
   2166 	 *		- the vif threshold ttl is non-zero AND
   2167 	 *		- the pkt ttl exceeds the vif's threshold
   2168 	 * A non-zero mfc_ttl indicates that the vif is part of
   2169 	 * the output set for the mfc entry.
   2170 	 */
   2171 	mutex_enter(&ipst->ips_numvifs_mutex);
   2172 	num_of_vifs = ipst->ips_numvifs;
   2173 	mutex_exit(&ipst->ips_numvifs_mutex);
   2174 	for (vifp = ipst->ips_vifs, vifi = 0;
   2175 	    vifi < num_of_vifs;
   2176 	    vifp++, vifi++) {
   2177 		if (!lock_good_vif(vifp))
   2178 			continue;
   2179 		if ((rt->mfc_ttls[vifi] > 0) &&
   2180 		    (ipha->ipha_ttl > rt->mfc_ttls[vifi])) {
   2181 			/*
   2182 			 * lock_good_vif should not have succedded if
   2183 			 * v_ipif is null.
   2184 			 */
   2185 			ASSERT(vifp->v_ipif != NULL);
   2186 			vifp->v_pkt_out++;
   2187 			vifp->v_bytes_out += plen;
   2188 			MC_SEND(ipha, mp, vifp, dst);
   2189 			ipst->ips_mrtstat->mrts_fwd_out++;
   2190 		}
   2191 		unlock_good_vif(vifp);
   2192 	}
   2193 	if (tunnel_src != 0)
   2194 		return (1);
   2195 	else
   2196 		return (0);
   2197 }
   2198 
   2199 /*
   2200  * Send the packet on physical interface.
   2201  * Caller assumes can continue to use mp on return.
   2202  */
   2203 /* ARGSUSED */
   2204 static void
   2205 phyint_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
   2206 {
   2207 	mblk_t 	*mp_copy;
   2208 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
   2209 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
   2210 
   2211 	/* Make a new reference to the packet */
   2212 	mp_copy = copymsg(mp);	/* TODO could copy header and dup rest */
   2213 	if (mp_copy == NULL) {
   2214 		ipst->ips_mrtstat->mrts_fwd_drop++;
   2215 		ip1dbg(("phyint_send: out of memory for mblk, mp_copy\n"));
   2216 		return;
   2217 	}
   2218 	if (vifp->v_rate_limit <= 0)
   2219 		tbf_send_packet(vifp, mp_copy);
   2220 	else  {
   2221 		if (ipst->ips_ip_mrtdebug > 1) {
   2222 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   2223 			    "phyint_send: tbf_contr rate %d "
   2224 			    "vifp 0x%p mp 0x%p dst 0x%x",
   2225 			    vifp->v_rate_limit, (void *)vifp, (void *)mp, dst);
   2226 		}
   2227 		tbf_control(vifp, mp_copy, (ipha_t *)mp_copy->b_rptr);
   2228 	}
   2229 }
   2230 
   2231 /*
   2232  * Send the whole packet for REGISTER encapsulation to PIM daemon
   2233  * Caller assumes it can continue to use mp on return.
   2234  */
   2235 /* ARGSUSED */
   2236 static void
   2237 register_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
   2238 {
   2239 	struct igmpmsg	*im;
   2240 	mblk_t		*mp_copy;
   2241 	ipha_t		*ipha_copy;
   2242 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
   2243 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
   2244 
   2245 	if (ipst->ips_ip_mrtdebug > 1) {
   2246 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   2247 		    "register_send: src %x, dst %x\n",
   2248 		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst));
   2249 	}
   2250 
   2251 	/*
   2252 	 * Copy the old packet & pullup its IP header into the new mblk_t so we
   2253 	 * can modify it.  Try to fill the new mblk_t since if we don't the
   2254 	 * ethernet driver will.
   2255 	 */
   2256 	mp_copy = allocb(sizeof (struct igmpmsg) + sizeof (ipha_t), BPRI_MED);
   2257 	if (mp_copy == NULL) {
   2258 		++ipst->ips_mrtstat->mrts_pim_nomemory;
   2259 		if (ipst->ips_ip_mrtdebug > 3) {
   2260 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   2261 			    "register_send: allocb failure.");
   2262 		}
   2263 		return;
   2264 	}
   2265 
   2266 	/*
   2267 	 * Bump write pointer to account for igmpmsg being added.
   2268 	 */
   2269 	mp_copy->b_wptr = mp_copy->b_rptr + sizeof (struct igmpmsg);
   2270 
   2271 	/*
   2272 	 * Chain packet to new mblk_t.
   2273 	 */
   2274 	if ((mp_copy->b_cont = copymsg(mp)) == NULL) {
   2275 		++ipst->ips_mrtstat->mrts_pim_nomemory;
   2276 		if (ipst->ips_ip_mrtdebug > 3) {
   2277 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   2278 			    "register_send: copymsg failure.");
   2279 		}
   2280 		freeb(mp_copy);
   2281 		return;
   2282 	}
   2283 
   2284 	/*
   2285 	 * icmp_input() asserts that IP version field is set to an
   2286 	 * appropriate version. Hence, the struct igmpmsg that this really
   2287 	 * becomes, needs to have the correct IP version field.
   2288 	 */
   2289 	ipha_copy = (ipha_t *)mp_copy->b_rptr;
   2290 	*ipha_copy = multicast_encap_iphdr;
   2291 
   2292 	/*
   2293 	 * The kernel uses the struct igmpmsg header to encode the messages to
   2294 	 * the multicast routing daemon. Fill in the fields in the header
   2295 	 * starting with the message type which is IGMPMSG_WHOLEPKT
   2296 	 */
   2297 	im = (struct igmpmsg *)mp_copy->b_rptr;
   2298 	im->im_msgtype = IGMPMSG_WHOLEPKT;
   2299 	im->im_src.s_addr = ipha->ipha_src;
   2300 	im->im_dst.s_addr = ipha->ipha_dst;
   2301 
   2302 	/*
   2303 	 * Must Be Zero. This is because the struct igmpmsg is really an IP
   2304 	 * header with renamed fields and the multicast routing daemon uses
   2305 	 * an ipha_protocol (aka im_mbz) of 0 to distinguish these messages.
   2306 	 */
   2307 	im->im_mbz = 0;
   2308 
   2309 	++ipst->ips_mrtstat->mrts_upcalls;
   2310 	if (!canputnext(mrouter->conn_rq)) {
   2311 		++ipst->ips_mrtstat->mrts_pim_regsend_drops;
   2312 		if (ipst->ips_ip_mrtdebug > 3) {
   2313 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   2314 			    "register_send: register upcall failure.");
   2315 		}
   2316 		freemsg(mp_copy);
   2317 	} else {
   2318 		/* Pass to RAWIP */
   2319 		(mrouter->conn_recv)(mrouter, mp_copy, NULL);
   2320 	}
   2321 }
   2322 
   2323 /*
   2324  * pim_validate_cksum handles verification of the checksum in the
   2325  * pim header.  For PIM Register packets, the checksum is calculated
   2326  * across the PIM header only.  For all other packets, the checksum
   2327  * is for the PIM header and remainder of the packet.
   2328  *
   2329  * returns: B_TRUE, if checksum is okay.
   2330  *          B_FALSE, if checksum is not valid.
   2331  */
   2332 static boolean_t
   2333 pim_validate_cksum(mblk_t *mp, ipha_t *ip, struct pim *pimp)
   2334 {
   2335 	mblk_t *mp_dup;
   2336 
   2337 	if ((mp_dup = dupmsg(mp)) == NULL)
   2338 		return (B_FALSE);
   2339 
   2340 	mp_dup->b_rptr += IPH_HDR_LENGTH(ip);
   2341 	if (pimp->pim_type == PIM_REGISTER)
   2342 		mp_dup->b_wptr = mp_dup->b_rptr + PIM_MINLEN;
   2343 	if (IP_CSUM(mp_dup, 0, 0)) {
   2344 		freemsg(mp_dup);
   2345 		return (B_FALSE);
   2346 	}
   2347 	freemsg(mp_dup);
   2348 	return (B_TRUE);
   2349 }
   2350 
   2351 /*
   2352  * int
   2353  * pim_input(queue_t *, mblk_t *, ill_t *ill) - Process PIM protocol packets.
   2354  *	IP Protocol 103. Register messages are decapsulated and sent
   2355  *	onto multicast forwarding.
   2356  */
   2357 int
   2358 pim_input(queue_t *q, mblk_t *mp, ill_t *ill)
   2359 {
   2360 	ipha_t		*eip, *ip;
   2361 	int		iplen, pimlen, iphlen;
   2362 	struct pim	*pimp;	/* pointer to a pim struct */
   2363 	uint32_t	*reghdr;
   2364 	ip_stack_t	*ipst = ill->ill_ipst;
   2365 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
   2366 
   2367 	/*
   2368 	 * Pullup the msg for PIM protocol processing.
   2369 	 */
   2370 	if (pullupmsg(mp, -1) == 0) {
   2371 		++ipst->ips_mrtstat->mrts_pim_nomemory;
   2372 		freemsg(mp);
   2373 		return (-1);
   2374 	}
   2375 
   2376 	ip = (ipha_t *)mp->b_rptr;
   2377 	iplen = ip->ipha_length;
   2378 	iphlen = IPH_HDR_LENGTH(ip);
   2379 	pimlen = ntohs(iplen) - iphlen;
   2380 
   2381 	/*
   2382 	 * Validate lengths
   2383 	 */
   2384 	if (pimlen < PIM_MINLEN) {
   2385 		++ipst->ips_mrtstat->mrts_pim_malformed;
   2386 		if (ipst->ips_ip_mrtdebug > 1) {
   2387 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   2388 			    "pim_input: length not at least minlen");
   2389 		}
   2390 		freemsg(mp);
   2391 		return (-1);
   2392 	}
   2393 
   2394 	/*
   2395 	 * Point to the PIM header.
   2396 	 */
   2397 	pimp = (struct pim *)((caddr_t)ip + iphlen);
   2398 
   2399 	/*
   2400 	 * Check the version number.
   2401 	 */
   2402 	if (pimp->pim_vers != PIM_VERSION) {
   2403 		++ipst->ips_mrtstat->mrts_pim_badversion;
   2404 		if (ipst->ips_ip_mrtdebug > 1) {
   2405 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   2406 			    "pim_input: unknown version of PIM");
   2407 		}
   2408 		freemsg(mp);
   2409 		return (-1);
   2410 	}
   2411 
   2412 	/*
   2413 	 * Validate the checksum
   2414 	 */
   2415 	if (!pim_validate_cksum(mp, ip, pimp)) {
   2416 		++ipst->ips_mrtstat->mrts_pim_rcv_badcsum;
   2417 		if (ipst->ips_ip_mrtdebug > 1) {
   2418 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   2419 			    "pim_input: invalid checksum");
   2420 		}
   2421 		freemsg(mp);
   2422 		return (-1);
   2423 	}
   2424 
   2425 	if (pimp->pim_type != PIM_REGISTER)
   2426 		return (0);
   2427 
   2428 	reghdr = (uint32_t *)(pimp + 1);
   2429 	eip = (ipha_t *)(reghdr + 1);
   2430 
   2431 	/*
   2432 	 * check if the inner packet is destined to mcast group
   2433 	 */
   2434 	if (!CLASSD(eip->ipha_dst)) {
   2435 		++ipst->ips_mrtstat->mrts_pim_badregisters;
   2436 		if (ipst->ips_ip_mrtdebug > 1) {
   2437 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   2438 			    "pim_input: Inner pkt not mcast .. !");
   2439 		}
   2440 		freemsg(mp);
   2441 		return (-1);
   2442 	}
   2443 	if (ipst->ips_ip_mrtdebug > 1) {
   2444 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   2445 		    "register from %x, to %x, len %d",
   2446 		    ntohl(eip->ipha_src),
   2447 		    ntohl(eip->ipha_dst),
   2448 		    ntohs(eip->ipha_length));
   2449 	}
   2450 	/*
   2451 	 * If the null register bit is not set, decapsulate
   2452 	 * the packet before forwarding it.
   2453 	 */
   2454 	if (!(ntohl(*reghdr) & PIM_NULL_REGISTER)) {
   2455 		mblk_t *mp_copy;
   2456 
   2457 		/* Copy the message */
   2458 		if ((mp_copy = copymsg(mp)) == NULL) {
   2459 			++ipst->ips_mrtstat->mrts_pim_nomemory;
   2460 			freemsg(mp);
   2461 			return (-1);
   2462 		}
   2463 
   2464 		/*
   2465 		 * Decapsulate the packet and give it to
   2466 		 * register_mforward.
   2467 		 */
   2468 		mp_copy->b_rptr += iphlen + sizeof (pim_t) +
   2469 		    sizeof (*reghdr);
   2470 		if (register_mforward(q, mp_copy, ill) != 0) {
   2471 			freemsg(mp);
   2472 			return (-1);
   2473 		}
   2474 	}
   2475 
   2476 	/*
   2477 	 * Pass all valid PIM packets up to any process(es) listening on a raw
   2478 	 * PIM socket. For Solaris it is done right after pim_input() is
   2479 	 * called.
   2480 	 */
   2481 	return (0);
   2482 }
   2483 
   2484 /*
   2485  * PIM sparse mode hook.  Called by pim_input after decapsulating
   2486  * the packet. Loop back the packet, as if we have received it.
   2487  * In pim_input() we have to check if the destination is a multicast address.
   2488  */
   2489 /* ARGSUSED */
   2490 static int
   2491 register_mforward(queue_t *q, mblk_t *mp, ill_t *ill)
   2492 {
   2493 	ip_stack_t	*ipst = ill->ill_ipst;
   2494 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
   2495 
   2496 	ASSERT(ipst->ips_reg_vif_num <= ipst->ips_numvifs);
   2497 
   2498 	if (ipst->ips_ip_mrtdebug > 3) {
   2499 		ipha_t *ipha;
   2500 
   2501 		ipha = (ipha_t *)mp->b_rptr;
   2502 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   2503 		    "register_mforward: src %x, dst %x\n",
   2504 		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst));
   2505 	}
   2506 	/*
   2507 	 * Need to pass in to ip_mforward() the information that the
   2508 	 * packet has arrived on the register_vif. We use the solution that
   2509 	 * ip_mroute_decap() employs: use mp->b_prev to pass some information
   2510 	 * to ip_mforward(). Nonzero value means the packet has arrived on a
   2511 	 * tunnel (ip_mroute_decap() puts the address of the other side of the
   2512 	 * tunnel there.) This is safe since ip_rput() either frees the packet
   2513 	 * or passes it to ip_mforward(). We use
   2514 	 * PIM_REGISTER_MARKER = 0xffffffff to indicate the has arrived on the
   2515 	 * register vif. If in the future we have more than one register vifs,
   2516 	 * then this will need re-examination.
   2517 	 */
   2518 	mp->b_prev = (mblk_t *)PIM_REGISTER_MARKER;
   2519 	++ipst->ips_mrtstat->mrts_pim_regforwards;
   2520 	ip_rput(q, mp);
   2521 	return (0);
   2522 }
   2523 
   2524 /*
   2525  * Send an encapsulated packet.
   2526  * Caller assumes can continue to use mp when routine returns.
   2527  */
   2528 /* ARGSUSED */
   2529 static void
   2530 encap_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
   2531 {
   2532 	mblk_t 	*mp_copy;
   2533 	ipha_t 	*ipha_copy;
   2534 	size_t	len;
   2535 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
   2536 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
   2537 
   2538 	if (ipst->ips_ip_mrtdebug > 1) {
   2539 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   2540 		    "encap_send: vif %ld enter",
   2541 		    (ptrdiff_t)(vifp - ipst->ips_vifs));
   2542 	}
   2543 	len = ntohs(ipha->ipha_length);
   2544 
   2545 	/*
   2546 	 * Copy the old packet & pullup it's IP header into the
   2547 	 * new mbuf so we can modify it.  Try to fill the new
   2548 	 * mbuf since if we don't the ethernet driver will.
   2549 	 */
   2550 	mp_copy = allocb(32 + sizeof (multicast_encap_iphdr), BPRI_MED);
   2551 	if (mp_copy == NULL)
   2552 		return;
   2553 	mp_copy->b_rptr += 32;
   2554 	mp_copy->b_wptr = mp_copy->b_rptr + sizeof (multicast_encap_iphdr);
   2555 	if ((mp_copy->b_cont = copymsg(mp)) == NULL) {
   2556 		freeb(mp_copy);
   2557 		return;
   2558 	}
   2559 
   2560 	/*
   2561 	 * Fill in the encapsulating IP header.
   2562 	 * Remote tunnel dst in rmt_addr, from add_vif().
   2563 	 */
   2564 	ipha_copy = (ipha_t *)mp_copy->b_rptr;
   2565 	*ipha_copy = multicast_encap_iphdr;
   2566 	ASSERT((len + sizeof (ipha_t)) <= IP_MAXPACKET);
   2567 	ipha_copy->ipha_length = htons(len + sizeof (ipha_t));
   2568 	ipha_copy->ipha_src = vifp->v_lcl_addr.s_addr;
   2569 	ipha_copy->ipha_dst = vifp->v_rmt_addr.s_addr;
   2570 	ASSERT(ipha_copy->ipha_ident == 0);
   2571 
   2572 	/* Turn the encapsulated IP header back into a valid one. */
   2573 	ipha = (ipha_t *)mp_copy->b_cont->b_rptr;
   2574 	ipha->ipha_ttl--;
   2575 	ipha->ipha_hdr_checksum = 0;
   2576 	ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
   2577 
   2578 	if (ipst->ips_ip_mrtdebug > 1) {
   2579 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   2580 		    "encap_send: group 0x%x", ntohl(ipha->ipha_dst));
   2581 	}
   2582 	if (vifp->v_rate_limit <= 0)
   2583 		tbf_send_packet(vifp, mp_copy);
   2584 	else
   2585 		/* ipha is from the original header */
   2586 		tbf_control(vifp, mp_copy, ipha);
   2587 }
   2588 
   2589 /*
   2590  * De-encapsulate a packet and feed it back through IP input.
   2591  * This routine is called whenever IP gets a packet with prototype
   2592  * IPPROTO_ENCAP and a local destination address.
   2593  */
   2594 void
   2595 ip_mroute_decap(queue_t *q, mblk_t *mp, ill_t *ill)
   2596 {
   2597 	ipha_t		*ipha = (ipha_t *)mp->b_rptr;
   2598 	ipha_t		*ipha_encap;
   2599 	int		hlen = IPH_HDR_LENGTH(ipha);
   2600 	ipaddr_t	src;
   2601 	struct vif	*vifp;
   2602 	ip_stack_t	*ipst = ill->ill_ipst;
   2603 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
   2604 
   2605 	/*
   2606 	 * Dump the packet if it's not to a multicast destination or if
   2607 	 * we don't have an encapsulating tunnel with the source.
   2608 	 * Note:  This code assumes that the remote site IP address
   2609 	 * uniquely identifies the tunnel (i.e., that this site has
   2610 	 * at most one tunnel with the remote site).
   2611 	 */
   2612 	ipha_encap = (ipha_t *)((char *)ipha + hlen);
   2613 	if (!CLASSD(ipha_encap->ipha_dst)) {
   2614 		ipst->ips_mrtstat->mrts_bad_tunnel++;
   2615 		ip1dbg(("ip_mroute_decap: bad tunnel\n"));
   2616 		freemsg(mp);
   2617 		return;
   2618 	}
   2619 	src = (ipaddr_t)ipha->ipha_src;
   2620 	mutex_enter(&ipst->ips_last_encap_lock);
   2621 	if (src != ipst->ips_last_encap_src) {
   2622 		struct vif *vife;
   2623 
   2624 		vifp = ipst->ips_vifs;
   2625 		vife = vifp + ipst->ips_numvifs;
   2626 		ipst->ips_last_encap_src = src;
   2627 		ipst->ips_last_encap_vif = 0;
   2628 		for (; vifp < vife; ++vifp) {
   2629 			if (!lock_good_vif(vifp))
   2630 				continue;
   2631 			if (vifp->v_rmt_addr.s_addr == src) {
   2632 				if (vifp->v_flags & VIFF_TUNNEL)
   2633 					ipst->ips_last_encap_vif = vifp;
   2634 				if (ipst->ips_ip_mrtdebug > 1) {
   2635 					(void) mi_strlog(mrouter->conn_rq,
   2636 					    1, SL_TRACE,
   2637 					    "ip_mroute_decap: good tun "
   2638 					    "vif %ld with %x",
   2639 					    (ptrdiff_t)(vifp - ipst->ips_vifs),
   2640 					    ntohl(src));
   2641 				}
   2642 				unlock_good_vif(vifp);
   2643 				break;
   2644 			}
   2645 			unlock_good_vif(vifp);
   2646 		}
   2647 	}
   2648 	if ((vifp = ipst->ips_last_encap_vif) == 0) {
   2649 		mutex_exit(&ipst->ips_last_encap_lock);
   2650 		ipst->ips_mrtstat->mrts_bad_tunnel++;
   2651 		freemsg(mp);
   2652 		ip1dbg(("ip_mroute_decap: vif %ld no tunnel with %x\n",
   2653 		    (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(src)));
   2654 		return;
   2655 	}
   2656 	mutex_exit(&ipst->ips_last_encap_lock);
   2657 
   2658 	/*
   2659 	 * Need to pass in the tunnel source to ip_mforward (so that it can
   2660 	 * verify that the packet arrived over the correct vif.)  We use b_prev
   2661 	 * to pass this information. This is safe since the ip_rput either
   2662 	 * frees the packet or passes it to ip_mforward.
   2663 	 */
   2664 	mp->b_prev = (mblk_t *)(uintptr_t)src;
   2665 	mp->b_rptr += hlen;
   2666 	/* Feed back into ip_rput as an M_DATA. */
   2667 	ip_rput(q, mp);
   2668 }
   2669 
   2670 /*
   2671  * Remove all records with v_ipif == ipif.  Called when an interface goes away
   2672  * (stream closed).  Called as writer.
   2673  */
   2674 void
   2675 reset_mrt_vif_ipif(ipif_t *ipif)
   2676 {
   2677 	vifi_t vifi, tmp_vifi;
   2678 	vifi_t num_of_vifs;
   2679 	ip_stack_t	*ipst = ipif->ipif_ill->ill_ipst;
   2680 
   2681 	/* Can't check vifi >= 0 since vifi_t is unsigned! */
   2682 
   2683 	mutex_enter(&ipst->ips_numvifs_mutex);
   2684 	num_of_vifs = ipst->ips_numvifs;
   2685 	mutex_exit(&ipst->ips_numvifs_mutex);
   2686 
   2687 	for (vifi = num_of_vifs; vifi != 0; vifi--) {
   2688 		tmp_vifi = vifi - 1;
   2689 		if (ipst->ips_vifs[tmp_vifi].v_ipif == ipif) {
   2690 			(void) del_vif(&tmp_vifi, NULL, NULL, ipst);
   2691 		}
   2692 	}
   2693 }
   2694 
   2695 /* Remove pending upcall msgs when ill goes away.  Called by ill_delete.  */
   2696 void
   2697 reset_mrt_ill(ill_t *ill)
   2698 {
   2699 	struct mfc		*rt;
   2700 	struct rtdetq	*rte;
   2701 	int			i;
   2702 	ip_stack_t	*ipst = ill->ill_ipst;
   2703 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
   2704 
   2705 	for (i = 0; i < MFCTBLSIZ; i++) {
   2706 		MFCB_REFHOLD(&ipst->ips_mfcs[i]);
   2707 		if ((rt = ipst->ips_mfcs[i].mfcb_mfc) != NULL) {
   2708 			if (ipst->ips_ip_mrtdebug > 1) {
   2709 				(void) mi_strlog(mrouter->conn_rq, 1,
   2710 				    SL_TRACE,
   2711 				    "reset_mrt_ill: mfctable [%d]", i);
   2712 			}
   2713 			while (rt != NULL) {
   2714 				mutex_enter(&rt->mfc_mutex);
   2715 				while ((rte = rt->mfc_rte) != NULL) {
   2716 					if (rte->ill == ill) {
   2717 						if (ipst->ips_ip_mrtdebug > 1) {
   2718 						(void) mi_strlog(
   2719 						    mrouter->conn_rq,
   2720 						    1, SL_TRACE,
   2721 						    "reset_mrt_ill: "
   2722 						    "ill 0x%p", (void *)ill);
   2723 						}
   2724 						rt->mfc_rte = rte->rte_next;
   2725 						freemsg(rte->mp);
   2726 						mi_free((char *)rte);
   2727 					}
   2728 				}
   2729 				mutex_exit(&rt->mfc_mutex);
   2730 				rt = rt->mfc_next;
   2731 			}
   2732 		}
   2733 		MFCB_REFRELE(&ipst->ips_mfcs[i]);
   2734 	}
   2735 }
   2736 
   2737 /*
   2738  * Token bucket filter module.
   2739  * The ipha is for mcastgrp destination for phyint and encap.
   2740  */
   2741 static void
   2742 tbf_control(struct vif *vifp, mblk_t *mp, ipha_t *ipha)
   2743 {
   2744 	size_t 	p_len =  msgdsize(mp);
   2745 	struct tbf	*t    = vifp->v_tbf;
   2746 	timeout_id_t id = 0;
   2747 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
   2748 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
   2749 
   2750 	/* Drop if packet is too large */
   2751 	if (p_len > MAX_BKT_SIZE) {
   2752 		ipst->ips_mrtstat->mrts_pkt2large++;
   2753 		freemsg(mp);
   2754 		return;
   2755 	}
   2756 	if (ipst->ips_ip_mrtdebug > 1) {
   2757 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   2758 		    "tbf_ctrl: SEND vif %ld, qlen %d, ipha_dst 0x%x",
   2759 		    (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_q_len,
   2760 		    ntohl(ipha->ipha_dst));
   2761 	}
   2762 
   2763 	mutex_enter(&t->tbf_lock);
   2764 
   2765 	tbf_update_tokens(vifp);
   2766 
   2767 	/*
   2768 	 * If there are enough tokens,
   2769 	 * and the queue is empty, send this packet out.
   2770 	 */
   2771 	if (ipst->ips_ip_mrtdebug > 1) {
   2772 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   2773 		    "tbf_control: vif %ld, TOKENS  %d, pkt len  %lu, qlen  %d",
   2774 		    (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_n_tok, p_len,
   2775 		    t->tbf_q_len);
   2776 	}
   2777 	/* No packets are queued */
   2778 	if (t->tbf_q_len == 0) {
   2779 		/* queue empty, send packet if enough tokens */
   2780 		if (p_len <= t->tbf_n_tok) {
   2781 			t->tbf_n_tok -= p_len;
   2782 			mutex_exit(&t->tbf_lock);
   2783 			tbf_send_packet(vifp, mp);
   2784 			return;
   2785 		} else {
   2786 			/* Queue packet and timeout till later */
   2787 			tbf_queue(vifp, mp);
   2788 			ASSERT(vifp->v_timeout_id == 0);
   2789 			vifp->v_timeout_id = timeout(tbf_reprocess_q, vifp,
   2790 			    TBF_REPROCESS);
   2791 		}
   2792 	} else if (t->tbf_q_len < t->tbf_max_q_len) {
   2793 		/* Finite queue length, so queue pkts and process queue */
   2794 		tbf_queue(vifp, mp);
   2795 		tbf_process_q(vifp);
   2796 	} else {
   2797 		/* Check that we have UDP header with IP header */
   2798 		size_t hdr_length = IPH_HDR_LENGTH(ipha) +
   2799 		    sizeof (struct udphdr);
   2800 
   2801 		if ((mp->b_wptr - mp->b_rptr) < hdr_length) {
   2802 			if (!pullupmsg(mp, hdr_length)) {
   2803 				freemsg(mp);
   2804 				ip1dbg(("tbf_ctl: couldn't pullup udp hdr, "
   2805 				    "vif %ld src 0x%x dst 0x%x\n",
   2806 				    (ptrdiff_t)(vifp - ipst->ips_vifs),
   2807 				    ntohl(ipha->ipha_src),
   2808 				    ntohl(ipha->ipha_dst)));
   2809 				mutex_exit(&vifp->v_tbf->tbf_lock);
   2810 				return;
   2811 			} else
   2812 				/* Have to reassign ipha after pullupmsg */
   2813 				ipha = (ipha_t *)mp->b_rptr;
   2814 		}
   2815 		/*
   2816 		 * Queue length too much,
   2817 		 * try to selectively dq, or queue and process
   2818 		 */
   2819 		if (!tbf_dq_sel(vifp, ipha)) {
   2820 			ipst->ips_mrtstat->mrts_q_overflow++;
   2821 			freemsg(mp);
   2822 		} else {
   2823 			tbf_queue(vifp, mp);
   2824 			tbf_process_q(vifp);
   2825 		}
   2826 	}
   2827 	if (t->tbf_q_len == 0) {
   2828 		id = vifp->v_timeout_id;
   2829 		vifp->v_timeout_id = 0;
   2830 	}
   2831 	mutex_exit(&vifp->v_tbf->tbf_lock);
   2832 	if (id != 0)
   2833 		(void) untimeout(id);
   2834 }
   2835 
   2836 /*
   2837  * Adds a packet to the tbf queue at the interface.
   2838  * The ipha is for mcastgrp destination for phyint and encap.
   2839  */
   2840 static void
   2841 tbf_queue(struct vif *vifp, mblk_t *mp)
   2842 {
   2843 	struct tbf	*t = vifp->v_tbf;
   2844 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
   2845 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
   2846 
   2847 	if (ipst->ips_ip_mrtdebug > 1) {
   2848 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   2849 		    "tbf_queue: vif %ld", (ptrdiff_t)(vifp - ipst->ips_vifs));
   2850 	}
   2851 	ASSERT(MUTEX_HELD(&t->tbf_lock));
   2852 
   2853 	if (t->tbf_t == NULL) {
   2854 		/* Queue was empty */
   2855 		t->tbf_q = mp;
   2856 	} else {
   2857 		/* Insert at tail */
   2858 		t->tbf_t->b_next = mp;
   2859 	}
   2860 	/* set new tail pointer */
   2861 	t->tbf_t = mp;
   2862 
   2863 	mp->b_next = mp->b_prev = NULL;
   2864 
   2865 	t->tbf_q_len++;
   2866 }
   2867 
   2868 /*
   2869  * Process the queue at the vif interface.
   2870  * Drops the tbf_lock when sending packets.
   2871  *
   2872  * NOTE : The caller should quntimeout if the queue length is 0.
   2873  */
   2874 static void
   2875 tbf_process_q(struct vif *vifp)
   2876 {
   2877 	mblk_t	*mp;
   2878 	struct tbf	*t = vifp->v_tbf;
   2879 	size_t	len;
   2880 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
   2881 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
   2882 
   2883 	if (ipst->ips_ip_mrtdebug > 1) {
   2884 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   2885 		    "tbf_process_q 1: vif %ld qlen = %d",
   2886 		    (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_q_len);
   2887 	}
   2888 
   2889 	/*
   2890 	 * Loop through the queue at the interface and send
   2891 	 * as many packets as possible.
   2892 	 */
   2893 	ASSERT(MUTEX_HELD(&t->tbf_lock));
   2894 
   2895 	while (t->tbf_q_len > 0) {
   2896 		mp = t->tbf_q;
   2897 		len = (size_t)msgdsize(mp); /* length of ip pkt */
   2898 
   2899 		/* Determine if the packet can be sent */
   2900 		if (len <= t->tbf_n_tok) {
   2901 			/*
   2902 			 * If so, reduce no. of tokens, dequeue the packet,
   2903 			 * send the packet.
   2904 			 */
   2905 			t->tbf_n_tok -= len;
   2906 
   2907 			t->tbf_q = mp->b_next;
   2908 			if (--t->tbf_q_len == 0) {
   2909 				t->tbf_t = NULL;
   2910 			}
   2911 			mp->b_next = NULL;
   2912 			/* Exit mutex before sending packet, then re-enter */
   2913 			mutex_exit(&t->tbf_lock);
   2914 			tbf_send_packet(vifp, mp);
   2915 			mutex_enter(&t->tbf_lock);
   2916 		} else
   2917 			break;
   2918 	}
   2919 }
   2920 
   2921 /* Called at tbf timeout to update tokens, process q and reset timer.  */
   2922 static void
   2923 tbf_reprocess_q(void *arg)
   2924 {
   2925 	struct vif *vifp = arg;
   2926 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
   2927 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
   2928 
   2929 	mutex_enter(&vifp->v_tbf->tbf_lock);
   2930 	vifp->v_timeout_id = 0;
   2931 	tbf_update_tokens(vifp);
   2932 
   2933 	tbf_process_q(vifp);
   2934 
   2935 	if (vifp->v_tbf->tbf_q_len > 0) {
   2936 		vifp->v_timeout_id = timeout(tbf_reprocess_q, vifp,
   2937 		    TBF_REPROCESS);
   2938 	}
   2939 	mutex_exit(&vifp->v_tbf->tbf_lock);
   2940 
   2941 	if (ipst->ips_ip_mrtdebug > 1) {
   2942 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   2943 		    "tbf_reprcess_q: vif %ld timeout id = %p",
   2944 		    (ptrdiff_t)(vifp - ipst->ips_vifs), vifp->v_timeout_id);
   2945 	}
   2946 }
   2947 
   2948 /*
   2949  * Function that will selectively discard a member of the tbf queue,
   2950  * based on the precedence value and the priority.
   2951  *
   2952  * NOTE : The caller should quntimeout if the queue length is 0.
   2953  */
   2954 static int
   2955 tbf_dq_sel(struct vif *vifp, ipha_t *ipha)
   2956 {
   2957 	uint_t		p;
   2958 	struct tbf		*t = vifp->v_tbf;
   2959 	mblk_t		**np;
   2960 	mblk_t		*last, *mp;
   2961 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
   2962 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
   2963 
   2964 	if (ipst->ips_ip_mrtdebug > 1) {
   2965 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   2966 		    "dq_sel: vif %ld dst 0x%x",
   2967 		    (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(ipha->ipha_dst));
   2968 	}
   2969 
   2970 	ASSERT(MUTEX_HELD(&t->tbf_lock));
   2971 	p = priority(vifp, ipha);
   2972 
   2973 	np = &t->tbf_q;
   2974 	last = NULL;
   2975 	while ((mp = *np) != NULL) {
   2976 		if (p > (priority(vifp, (ipha_t *)mp->b_rptr))) {
   2977 			*np = mp->b_next;
   2978 			/* If removing the last packet, fix the tail pointer */
   2979 			if (mp == t->tbf_t)
   2980 				t->tbf_t = last;
   2981 			mp->b_prev = mp->b_next = NULL;
   2982 			freemsg(mp);
   2983 			/*
   2984 			 * It's impossible for the queue to be empty, but
   2985 			 * we check anyway.
   2986 			 */
   2987 			if (--t->tbf_q_len == 0) {
   2988 				t->tbf_t = NULL;
   2989 			}
   2990 			ipst->ips_mrtstat->mrts_drop_sel++;
   2991 			return (1);
   2992 		}
   2993 		np = &mp->b_next;
   2994 		last = mp;
   2995 	}
   2996 	return (0);
   2997 }
   2998 
   2999 /* Sends packet, 2 cases - encap tunnel, phyint.  */
   3000 static void
   3001 tbf_send_packet(struct vif *vifp, mblk_t *mp)
   3002 {
   3003 	ipif_t  *ipif;
   3004 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
   3005 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
   3006 
   3007 	/* If encap tunnel options */
   3008 	if (vifp->v_flags & VIFF_TUNNEL)  {
   3009 		if (ipst->ips_ip_mrtdebug > 1) {
   3010 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   3011 			    "tbf_send_pkt: ENCAP tunnel vif %ld",
   3012 			    (ptrdiff_t)(vifp - ipst->ips_vifs));
   3013 		}
   3014 
   3015 		/*
   3016 		 * Feed into ip_wput which will set the ident field and
   3017 		 * checksum the encapsulating header.
   3018 		 * BSD gets the cached route vifp->v_route from ip_output()
   3019 		 * to speed up route table lookups. Not necessary in SunOS 5.x.
   3020 		 */
   3021 		put(vifp->v_ipif->ipif_wq, mp);
   3022 		return;
   3023 
   3024 		/* phyint */
   3025 	} else {
   3026 		/* Need to loop back to members on the outgoing interface. */
   3027 		ipha_t  *ipha;
   3028 		ipaddr_t    dst;
   3029 		ipha  = (ipha_t *)mp->b_rptr;
   3030 		dst  = ipha->ipha_dst;
   3031 		ipif = vifp->v_ipif;
   3032 
   3033 		if (ilm_lookup_ipif(ipif, dst) != NULL) {
   3034 			/*
   3035 			 * The packet is not yet reassembled, thus we need to
   3036 			 * pass it to ip_rput_local for checksum verification
   3037 			 * and reassembly (and fanout the user stream).
   3038 			 */
   3039 			mblk_t 	*mp_loop;
   3040 			ire_t	*ire;
   3041 
   3042 			if (ipst->ips_ip_mrtdebug > 1) {
   3043 				(void) mi_strlog(mrouter->conn_rq, 1,
   3044 				    SL_TRACE,
   3045 				    "tbf_send_pkt: loopback vif %ld",
   3046 				    (ptrdiff_t)(vifp - ipst->ips_vifs));
   3047 			}
   3048 			mp_loop = copymsg(mp);
   3049 			ire = ire_ctable_lookup(~0, 0, IRE_BROADCAST, NULL,
   3050 			    ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
   3051 
   3052 			if (mp_loop != NULL && ire != NULL) {
   3053 				IP_RPUT_LOCAL(ipif->ipif_rq, mp_loop,
   3054 				    ((ipha_t *)mp_loop->b_rptr),
   3055 				    ire, (ill_t *)ipif->ipif_rq->q_ptr);
   3056 			} else {
   3057 				/* Either copymsg failed or no ire */
   3058 				(void) mi_strlog(mrouter->conn_rq, 1,
   3059 				    SL_TRACE,
   3060 				    "tbf_send_pkt: mp_loop 0x%p, ire 0x%p "
   3061 				    "vif %ld\n", (void *)mp_loop, (void *)ire,
   3062 				    (ptrdiff_t)(vifp - ipst->ips_vifs));
   3063 			}
   3064 			if (ire != NULL)
   3065 				ire_refrele(ire);
   3066 		}
   3067 		if (ipst->ips_ip_mrtdebug > 1) {
   3068 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   3069 			    "tbf_send_pkt: phyint forward  vif %ld dst = 0x%x",
   3070 			    (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(dst));
   3071 		}
   3072 		ip_rput_forward_multicast(dst, mp, ipif);
   3073 	}
   3074 }
   3075 
   3076 /*
   3077  * Determine the current time and then the elapsed time (between the last time
   3078  * and time now).  Update the no. of tokens in the bucket.
   3079  */
   3080 static void
   3081 tbf_update_tokens(struct vif *vifp)
   3082 {
   3083 	timespec_t	tp;
   3084 	hrtime_t	tm;
   3085 	struct tbf	*t = vifp->v_tbf;
   3086 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
   3087 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
   3088 
   3089 	ASSERT(MUTEX_HELD(&t->tbf_lock));
   3090 
   3091 	/* Time in secs and nsecs, rate limit in kbits/sec */
   3092 	gethrestime(&tp);
   3093 
   3094 	/*LINTED*/
   3095 	TV_DELTA(tp, t->tbf_last_pkt_t, tm);
   3096 
   3097 	/*
   3098 	 * This formula is actually
   3099 	 * "time in seconds" * "bytes/second".  Scaled for nsec.
   3100 	 * (tm/1000000000) * (v_rate_limit * 1000 * (1000/1024) /8)
   3101 	 *
   3102 	 * The (1000/1024) was introduced in add_vif to optimize
   3103 	 * this divide into a shift.
   3104 	 */
   3105 	t->tbf_n_tok += (tm/1000) * vifp->v_rate_limit / 1024 / 8;
   3106 	t->tbf_last_pkt_t = tp;
   3107 
   3108 	if (t->tbf_n_tok > MAX_BKT_SIZE)
   3109 		t->tbf_n_tok = MAX_BKT_SIZE;
   3110 	if (ipst->ips_ip_mrtdebug > 1) {
   3111 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   3112 		    "tbf_update_tok: tm %lld tok %d vif %ld",
   3113 		    tm, t->tbf_n_tok, (ptrdiff_t)(vifp - ipst->ips_vifs));
   3114 	}
   3115 }
   3116 
   3117 /*
   3118  * Priority currently is based on port nos.
   3119  * Different forwarding mechanisms have different ways
   3120  * of obtaining the port no. Hence, the vif must be
   3121  * given along with the packet itself.
   3122  *
   3123  */
   3124 static int
   3125 priority(struct vif *vifp, ipha_t *ipha)
   3126 {
   3127 	int prio;
   3128 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
   3129 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
   3130 
   3131 	/* Temporary hack; may add general packet classifier some day */
   3132 
   3133 	ASSERT(MUTEX_HELD(&vifp->v_tbf->tbf_lock));
   3134 
   3135 	/*
   3136 	 * The UDP port space is divided up into four priority ranges:
   3137 	 * [0, 16384)	: unclassified - lowest priority
   3138 	 * [16384, 32768)	: audio - highest priority
   3139 	 * [32768, 49152)	: whiteboard - medium priority
   3140 	 * [49152, 65536)	: video - low priority
   3141 	 */
   3142 
   3143 	if (ipha->ipha_protocol == IPPROTO_UDP) {
   3144 		struct udphdr *udp =
   3145 		    (struct udphdr *)((char *)ipha + IPH_HDR_LENGTH(ipha));
   3146 		switch (ntohs(udp->uh_dport) & 0xc000) {
   3147 		case 0x4000:
   3148 			prio = 70;
   3149 			break;
   3150 		case 0x8000:
   3151 			prio = 60;
   3152 			break;
   3153 		case 0xc000:
   3154 			prio = 55;
   3155 			break;
   3156 		default:
   3157 			prio = 50;
   3158 			break;
   3159 		}
   3160 		if (ipst->ips_ip_mrtdebug > 1) {
   3161 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
   3162 			    "priority: port %x prio %d\n",
   3163 			    ntohs(udp->uh_dport), prio);
   3164 		}
   3165 	} else
   3166 		prio = 50;  /* default priority */
   3167 	return (prio);
   3168 }
   3169 
   3170 /*
   3171  * End of token bucket filter modifications
   3172  */
   3173 
   3174 
   3175 
   3176 /*
   3177  * Produces data for netstat -M.
   3178  */
   3179 int
   3180 ip_mroute_stats(mblk_t *mp, ip_stack_t *ipst)
   3181 {
   3182 	ipst->ips_mrtstat->mrts_vifctlSize = sizeof (struct vifctl);
   3183 	ipst->ips_mrtstat->mrts_mfcctlSize = sizeof (struct mfcctl);
   3184 	if (!snmp_append_data(mp, (char *)ipst->ips_mrtstat,
   3185 		sizeof (struct mrtstat))) {
   3186 		ip0dbg(("ip_mroute_stats: failed %ld bytes\n",
   3187 		    (size_t)sizeof (struct mrtstat)));
   3188 		return (0);
   3189 	}
   3190 	return (1);
   3191 }
   3192 
   3193 /*
   3194  * Sends info for SNMP's MIB.
   3195  */
   3196 int
   3197 ip_mroute_vif(mblk_t *mp, ip_stack_t *ipst)
   3198 {
   3199 	struct vifctl 	vi;
   3200 	vifi_t		vifi;
   3201 
   3202 	mutex_enter(&ipst->ips_numvifs_mutex);
   3203 	for (vifi = 0; vifi < ipst->ips_numvifs; vifi++) {
   3204 		if (ipst->ips_vifs[vifi].v_lcl_addr.s_addr == 0)
   3205 			continue;
   3206 		/*
   3207 		 * No locks here, an approximation is fine.
   3208 		 */
   3209 		vi.vifc_vifi = vifi;
   3210 		vi.vifc_flags = ipst->ips_vifs[vifi].v_flags;
   3211 		vi.vifc_threshold = ipst->ips_vifs[vifi].v_threshold;
   3212 		vi.vifc_rate_limit	= ipst->ips_vifs[vifi].v_rate_limit;
   3213 		vi.vifc_lcl_addr	= ipst->ips_vifs[vifi].v_lcl_addr;
   3214 		vi.vifc_rmt_addr	= ipst->ips_vifs[vifi].v_rmt_addr;
   3215 		vi.vifc_pkt_in		= ipst->ips_vifs[vifi].v_pkt_in;
   3216 		vi.vifc_pkt_out		= ipst->ips_vifs[vifi].v_pkt_out;
   3217 
   3218 		if (!snmp_append_data(mp, (char *)&vi, sizeof (vi))) {
   3219 			ip0dbg(("ip_mroute_vif: failed %ld bytes\n",
   3220 			    (size_t)sizeof (vi)));
   3221 			mutex_exit(&ipst->ips_numvifs_mutex);
   3222 			return (0);
   3223 		}
   3224 	}
   3225 	mutex_exit(&ipst->ips_numvifs_mutex);
   3226 	return (1);
   3227 }
   3228 
   3229 /*
   3230  * Called by ip_snmp_get to send up multicast routing table.
   3231  */
   3232 int
   3233 ip_mroute_mrt(mblk_t *mp, ip_stack_t *ipst)
   3234 {
   3235 	int			i, j;
   3236 	struct mfc		*rt;
   3237 	struct mfcctl	mfcc;
   3238 
   3239 	/*
   3240 	 * Make sure multicast has not been turned off.
   3241 	 */
   3242 	if (is_mrouter_off(ipst))
   3243 		return (1);
   3244 
   3245 	/* Loop over all hash buckets and their chains */
   3246 	for (i = 0; i < MFCTBLSIZ; i++) {
   3247 		MFCB_REFHOLD(&ipst->ips_mfcs[i]);
   3248 		for (rt = ipst->ips_mfcs[i].mfcb_mfc; rt; rt = rt->mfc_next) {
   3249 			mutex_enter(&rt->mfc_mutex);
   3250 			if (rt->mfc_rte != NULL ||
   3251 			    (rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
   3252 				mutex_exit(&rt->mfc_mutex);
   3253 				continue;
   3254 			}
   3255 			mfcc.mfcc_origin = rt->mfc_origin;
   3256 			mfcc.mfcc_mcastgrp = rt->mfc_mcastgrp;
   3257 			mfcc.mfcc_parent = rt->mfc_parent;
   3258 			mfcc.mfcc_pkt_cnt = rt->mfc_pkt_cnt;
   3259 			mutex_enter(&ipst->ips_numvifs_mutex);
   3260 			for (j = 0; j < (int)ipst->ips_numvifs; j++)
   3261 				mfcc.mfcc_ttls[j] = rt->mfc_ttls[j];
   3262 			for (j = (int)ipst->ips_numvifs; j < MAXVIFS; j++)
   3263 				mfcc.mfcc_ttls[j] = 0;
   3264 			mutex_exit(&ipst->ips_numvifs_mutex);
   3265 
   3266 			mutex_exit(&rt->mfc_mutex);
   3267 			if (!snmp_append_data(mp, (char *)&mfcc,
   3268 			    sizeof (mfcc))) {
   3269 				MFCB_REFRELE(&ipst->ips_mfcs[i]);
   3270 				ip0dbg(("ip_mroute_mrt: failed %ld bytes\n",
   3271 				    (size_t)sizeof (mfcc)));
   3272 				return (0);
   3273 			}
   3274 		}
   3275 		MFCB_REFRELE(&ipst->ips_mfcs[i]);
   3276 	}
   3277 	return (1);
   3278 }
   3279