Home | History | Annotate | Download | only in ip
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 /* Copyright (c) 1990 Mentat Inc. */
     26 
     27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
     28 
     29 /*
     30  * Internet Group Management Protocol (IGMP) routines.
     31  * Multicast Listener Discovery Protocol (MLD) routines.
     32  *
     33  * Written by Steve Deering, Stanford, May 1988.
     34  * Modified by Rosen Sharma, Stanford, Aug 1994.
     35  * Modified by Bill Fenner, Xerox PARC, Feb. 1995.
     36  *
     37  * MULTICAST 3.5.1.1
     38  */
     39 
     40 #include <sys/types.h>
     41 #include <sys/stream.h>
     42 #include <sys/stropts.h>
     43 #include <sys/strlog.h>
     44 #include <sys/strsun.h>
     45 #include <sys/systm.h>
     46 #include <sys/ddi.h>
     47 #include <sys/sunddi.h>
     48 #include <sys/cmn_err.h>
     49 #include <sys/atomic.h>
     50 #include <sys/zone.h>
     51 
     52 #include <sys/param.h>
     53 #include <sys/socket.h>
     54 #include <inet/ipclassifier.h>
     55 #include <net/if.h>
     56 #include <net/route.h>
     57 #include <netinet/in.h>
     58 #include <netinet/igmp_var.h>
     59 #include <netinet/ip6.h>
     60 #include <netinet/icmp6.h>
     61 
     62 #include <inet/common.h>
     63 #include <inet/mi.h>
     64 #include <inet/nd.h>
     65 #include <inet/ip.h>
     66 #include <inet/ip6.h>
     67 #include <inet/ip_multi.h>
     68 #include <inet/ip_listutils.h>
     69 
     70 #include <netinet/igmp.h>
     71 #include <inet/ip_if.h>
     72 #include <net/pfkeyv2.h>
     73 #include <inet/ipsec_info.h>
     74 
     75 static uint_t	igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill);
     76 static uint_t	igmpv3_query_in(igmp3qa_t *igmp3qa, ill_t *ill, int igmplen);
     77 static uint_t	mld_query_in(mld_hdr_t *mldh, ill_t *ill);
     78 static uint_t	mldv2_query_in(mld2q_t *mld2q, ill_t *ill, int mldlen);
     79 static void	igmp_sendpkt(ilm_t *ilm, uchar_t type, ipaddr_t addr);
     80 static void	mld_sendpkt(ilm_t *ilm, uchar_t type, const in6_addr_t *v6addr);
     81 static void	igmpv3_sendrpt(ipif_t *ipif, mrec_t *reclist);
     82 static void	mldv2_sendrpt(ill_t *ill, mrec_t *reclist);
     83 static mrec_t	*mcast_bldmrec(mcast_record_t type, in6_addr_t *grp,
     84 		    slist_t *srclist, mrec_t *next);
     85 static void	mcast_init_rtx(ill_t *ill, rtx_state_t *rtxp,
     86 		    mcast_record_t rtype, slist_t *flist);
     87 static mrec_t	*mcast_merge_rtx(ilm_t *ilm, mrec_t *rp, slist_t *flist);
     88 
     89 
     90 /*
     91  * Macros used to do timer len conversions.  Timer values are always
     92  * stored and passed to the timer functions as milliseconds; but the
     93  * default values and values from the wire may not be.
     94  *
     95  * And yes, it's obscure, but decisecond is easier to abbreviate than
     96  * "tenths of a second".
     97  */
     98 #define	DSEC_TO_MSEC(dsec)	((dsec) * 100)
     99 #define	SEC_TO_MSEC(sec)	((sec) * 1000)
    100 
    101 /*
    102  * A running timer (scheduled thru timeout) can be cancelled if another
    103  * timer with a shorter timeout value is scheduled before it has timed
    104  * out.  When the shorter timer expires, the original timer is updated
    105  * to account for the time elapsed while the shorter timer ran; but this
    106  * does not take into account the amount of time already spent in timeout
    107  * state before being preempted by the shorter timer, that is the time
    108  * interval between time scheduled to time cancelled.  This can cause
    109  * delays in sending out multicast membership reports.  To resolve this
    110  * problem, wallclock time (absolute time) is used instead of deltas
    111  * (relative time) to track timers.
    112  *
    113  * The MACRO below gets the lbolt value, used for proper timer scheduling
    114  * and firing. Therefore multicast membership reports are sent on time.
    115  * The timer does not exactly fire at the time it was scheduled to fire,
    116  * there is a difference of a few milliseconds observed. An offset is used
    117  * to take care of the difference.
    118  */
    119 
    120 #define	CURRENT_MSTIME	((uint_t)TICK_TO_MSEC(ddi_get_lbolt()))
    121 #define	CURRENT_OFFSET	(999)
    122 
    123 /*
    124  * The first multicast join will trigger the igmp timers / mld timers
    125  * The unit for next is milliseconds.
    126  */
    127 void
    128 igmp_start_timers(unsigned next, ip_stack_t *ipst)
    129 {
    130 	int	time_left;
    131 	int	ret;
    132 
    133 	ASSERT(next != 0 && next != INFINITY);
    134 
    135 	mutex_enter(&ipst->ips_igmp_timer_lock);
    136 
    137 	if (ipst->ips_igmp_timer_setter_active) {
    138 		/*
    139 		 * Serialize timer setters, one at a time. If the
    140 		 * timer is currently being set by someone,
    141 		 * just record the next time when it has to be
    142 		 * invoked and return. The current setter will
    143 		 * take care.
    144 		 */
    145 		ipst->ips_igmp_time_to_next =
    146 		    MIN(ipst->ips_igmp_time_to_next, next);
    147 		mutex_exit(&ipst->ips_igmp_timer_lock);
    148 		return;
    149 	} else {
    150 		ipst->ips_igmp_timer_setter_active = B_TRUE;
    151 	}
    152 	if (ipst->ips_igmp_timeout_id == 0) {
    153 		/*
    154 		 * The timer is inactive. We need to start a timer
    155 		 */
    156 		ipst->ips_igmp_time_to_next = next;
    157 		ipst->ips_igmp_timeout_id = timeout(igmp_timeout_handler,
    158 		    (void *)ipst, MSEC_TO_TICK(ipst->ips_igmp_time_to_next));
    159 		ipst->ips_igmp_timer_scheduled_last = ddi_get_lbolt();
    160 		ipst->ips_igmp_timer_setter_active = B_FALSE;
    161 		mutex_exit(&ipst->ips_igmp_timer_lock);
    162 		return;
    163 	}
    164 
    165 	/*
    166 	 * The timer was scheduled sometime back for firing in
    167 	 * 'igmp_time_to_next' ms and is active. We need to
    168 	 * reschedule the timeout if the new 'next' will happen
    169 	 * earlier than the currently scheduled timeout
    170 	 */
    171 	time_left = ipst->ips_igmp_timer_scheduled_last +
    172 	    MSEC_TO_TICK(ipst->ips_igmp_time_to_next) - ddi_get_lbolt();
    173 	if (time_left < MSEC_TO_TICK(next)) {
    174 		ipst->ips_igmp_timer_setter_active = B_FALSE;
    175 		mutex_exit(&ipst->ips_igmp_timer_lock);
    176 		return;
    177 	}
    178 
    179 	mutex_exit(&ipst->ips_igmp_timer_lock);
    180 	ret = untimeout(ipst->ips_igmp_timeout_id);
    181 	mutex_enter(&ipst->ips_igmp_timer_lock);
    182 	/*
    183 	 * The timeout was cancelled, or the timeout handler
    184 	 * completed, while we were blocked in the untimeout.
    185 	 * No other thread could have set the timer meanwhile
    186 	 * since we serialized all the timer setters. Thus
    187 	 * no timer is currently active nor executing nor will
    188 	 * any timer fire in the future. We start the timer now
    189 	 * if needed.
    190 	 */
    191 	if (ret == -1) {
    192 		ASSERT(ipst->ips_igmp_timeout_id == 0);
    193 	} else {
    194 		ASSERT(ipst->ips_igmp_timeout_id != 0);
    195 		ipst->ips_igmp_timeout_id = 0;
    196 	}
    197 	if (ipst->ips_igmp_time_to_next != 0) {
    198 		ipst->ips_igmp_time_to_next =
    199 		    MIN(ipst->ips_igmp_time_to_next, next);
    200 		ipst->ips_igmp_timeout_id = timeout(igmp_timeout_handler,
    201 		    (void *)ipst, MSEC_TO_TICK(ipst->ips_igmp_time_to_next));
    202 		ipst->ips_igmp_timer_scheduled_last = ddi_get_lbolt();
    203 	}
    204 	ipst->ips_igmp_timer_setter_active = B_FALSE;
    205 	mutex_exit(&ipst->ips_igmp_timer_lock);
    206 }
    207 
    208 /*
    209  * mld_start_timers:
    210  * The unit for next is milliseconds.
    211  */
    212 void
    213 mld_start_timers(unsigned next, ip_stack_t *ipst)
    214 {
    215 	int	time_left;
    216 	int	ret;
    217 
    218 	ASSERT(next != 0 && next != INFINITY);
    219 
    220 	mutex_enter(&ipst->ips_mld_timer_lock);
    221 	if (ipst->ips_mld_timer_setter_active) {
    222 		/*
    223 		 * Serialize timer setters, one at a time. If the
    224 		 * timer is currently being set by someone,
    225 		 * just record the next time when it has to be
    226 		 * invoked and return. The current setter will
    227 		 * take care.
    228 		 */
    229 		ipst->ips_mld_time_to_next =
    230 		    MIN(ipst->ips_mld_time_to_next, next);
    231 		mutex_exit(&ipst->ips_mld_timer_lock);
    232 		return;
    233 	} else {
    234 		ipst->ips_mld_timer_setter_active = B_TRUE;
    235 	}
    236 	if (ipst->ips_mld_timeout_id == 0) {
    237 		/*
    238 		 * The timer is inactive. We need to start a timer
    239 		 */
    240 		ipst->ips_mld_time_to_next = next;
    241 		ipst->ips_mld_timeout_id = timeout(mld_timeout_handler,
    242 		    (void *)ipst, MSEC_TO_TICK(ipst->ips_mld_time_to_next));
    243 		ipst->ips_mld_timer_scheduled_last = ddi_get_lbolt();
    244 		ipst->ips_mld_timer_setter_active = B_FALSE;
    245 		mutex_exit(&ipst->ips_mld_timer_lock);
    246 		return;
    247 	}
    248 
    249 	/*
    250 	 * The timer was scheduled sometime back for firing in
    251 	 * 'igmp_time_to_next' ms and is active. We need to
    252 	 * reschedule the timeout if the new 'next' will happen
    253 	 * earlier than the currently scheduled timeout
    254 	 */
    255 	time_left = ipst->ips_mld_timer_scheduled_last +
    256 	    MSEC_TO_TICK(ipst->ips_mld_time_to_next) - ddi_get_lbolt();
    257 	if (time_left < MSEC_TO_TICK(next)) {
    258 		ipst->ips_mld_timer_setter_active = B_FALSE;
    259 		mutex_exit(&ipst->ips_mld_timer_lock);
    260 		return;
    261 	}
    262 
    263 	mutex_exit(&ipst->ips_mld_timer_lock);
    264 	ret = untimeout(ipst->ips_mld_timeout_id);
    265 	mutex_enter(&ipst->ips_mld_timer_lock);
    266 	/*
    267 	 * The timeout was cancelled, or the timeout handler
    268 	 * completed, while we were blocked in the untimeout.
    269 	 * No other thread could have set the timer meanwhile
    270 	 * since we serialized all the timer setters. Thus
    271 	 * no timer is currently active nor executing nor will
    272 	 * any timer fire in the future. We start the timer now
    273 	 * if needed.
    274 	 */
    275 	if (ret == -1) {
    276 		ASSERT(ipst->ips_mld_timeout_id == 0);
    277 	} else {
    278 		ASSERT(ipst->ips_mld_timeout_id != 0);
    279 		ipst->ips_mld_timeout_id = 0;
    280 	}
    281 	if (ipst->ips_mld_time_to_next != 0) {
    282 		ipst->ips_mld_time_to_next =
    283 		    MIN(ipst->ips_mld_time_to_next, next);
    284 		ipst->ips_mld_timeout_id = timeout(mld_timeout_handler,
    285 		    (void *)ipst, MSEC_TO_TICK(ipst->ips_mld_time_to_next));
    286 		ipst->ips_mld_timer_scheduled_last = ddi_get_lbolt();
    287 	}
    288 	ipst->ips_mld_timer_setter_active = B_FALSE;
    289 	mutex_exit(&ipst->ips_mld_timer_lock);
    290 }
    291 
    292 /*
    293  * igmp_input:
    294  * Return NULL for a bad packet that is discarded here.
    295  * Return mp if the message is OK and should be handed to "raw" receivers.
    296  * Callers of igmp_input() may need to reinitialize variables that were copied
    297  * from the mblk as this calls pullupmsg().
    298  */
    299 /* ARGSUSED */
    300 mblk_t *
    301 igmp_input(queue_t *q, mblk_t *mp, ill_t *ill)
    302 {
    303 	igmpa_t 	*igmpa;
    304 	ipha_t		*ipha = (ipha_t *)(mp->b_rptr);
    305 	int		iphlen, igmplen, mblklen;
    306 	ilm_t 		*ilm;
    307 	uint32_t	src, dst;
    308 	uint32_t 	group;
    309 	uint_t		next;
    310 	ipif_t 		*ipif;
    311 	ip_stack_t	 *ipst;
    312 
    313 	ASSERT(ill != NULL);
    314 	ASSERT(!ill->ill_isv6);
    315 	ipst = ill->ill_ipst;
    316 	++ipst->ips_igmpstat.igps_rcv_total;
    317 
    318 	mblklen = MBLKL(mp);
    319 	if (mblklen < 1 || mblklen < (iphlen = IPH_HDR_LENGTH(ipha))) {
    320 		++ipst->ips_igmpstat.igps_rcv_tooshort;
    321 		goto bad_pkt;
    322 	}
    323 	igmplen = ntohs(ipha->ipha_length) - iphlen;
    324 	/*
    325 	 * Since msg sizes are more variable with v3, just pullup the
    326 	 * whole thing now.
    327 	 */
    328 	if (MBLKL(mp) < (igmplen + iphlen)) {
    329 		mblk_t *mp1;
    330 		if ((mp1 = msgpullup(mp, -1)) == NULL) {
    331 			++ipst->ips_igmpstat.igps_rcv_tooshort;
    332 			goto bad_pkt;
    333 		}
    334 		freemsg(mp);
    335 		mp = mp1;
    336 		ipha = (ipha_t *)(mp->b_rptr);
    337 	}
    338 
    339 	/*
    340 	 * Validate lengths
    341 	 */
    342 	if (igmplen < IGMP_MINLEN) {
    343 		++ipst->ips_igmpstat.igps_rcv_tooshort;
    344 		goto bad_pkt;
    345 	}
    346 	/*
    347 	 * Validate checksum
    348 	 */
    349 	if (IP_CSUM(mp, iphlen, 0)) {
    350 		++ipst->ips_igmpstat.igps_rcv_badsum;
    351 		goto bad_pkt;
    352 	}
    353 
    354 	igmpa = (igmpa_t *)(&mp->b_rptr[iphlen]);
    355 	src = ipha->ipha_src;
    356 	dst = ipha->ipha_dst;
    357 	if (ip_debug > 1)
    358 		(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
    359 		    "igmp_input: src 0x%x, dst 0x%x on %s\n",
    360 		    (int)ntohl(src), (int)ntohl(dst),
    361 		    ill->ill_name);
    362 
    363 	switch (igmpa->igmpa_type) {
    364 	case IGMP_MEMBERSHIP_QUERY:
    365 		/*
    366 		 * packet length differentiates between v1/v2 and v3
    367 		 * v1/v2 should be exactly 8 octets long; v3 is >= 12
    368 		 */
    369 		if ((igmplen == IGMP_MINLEN) ||
    370 		    (ipst->ips_igmp_max_version <= IGMP_V2_ROUTER)) {
    371 			next = igmp_query_in(ipha, igmpa, ill);
    372 		} else if (igmplen >= IGMP_V3_QUERY_MINLEN) {
    373 			next = igmpv3_query_in((igmp3qa_t *)igmpa, ill,
    374 			    igmplen);
    375 		} else {
    376 			++ipst->ips_igmpstat.igps_rcv_tooshort;
    377 			goto bad_pkt;
    378 		}
    379 		if (next == 0)
    380 			goto bad_pkt;
    381 
    382 		if (next != INFINITY)
    383 			igmp_start_timers(next, ipst);
    384 
    385 		break;
    386 
    387 	case IGMP_V1_MEMBERSHIP_REPORT:
    388 	case IGMP_V2_MEMBERSHIP_REPORT:
    389 		/*
    390 		 * For fast leave to work, we have to know that we are the
    391 		 * last person to send a report for this group. Reports
    392 		 * generated by us are looped back since we could potentially
    393 		 * be a multicast router, so discard reports sourced by me.
    394 		 */
    395 		mutex_enter(&ill->ill_lock);
    396 		for (ipif = ill->ill_ipif; ipif != NULL;
    397 		    ipif = ipif->ipif_next) {
    398 			if (ipif->ipif_lcl_addr == src) {
    399 				if (ip_debug > 1) {
    400 					(void) mi_strlog(ill->ill_rq,
    401 					    1,
    402 					    SL_TRACE,
    403 					    "igmp_input: we are only "
    404 					    "member src 0x%x ipif_local 0x%x",
    405 					    (int)ntohl(src),
    406 					    (int)
    407 					    ntohl(ipif->ipif_lcl_addr));
    408 				}
    409 				mutex_exit(&ill->ill_lock);
    410 				return (mp);
    411 			}
    412 		}
    413 		mutex_exit(&ill->ill_lock);
    414 
    415 		++ipst->ips_igmpstat.igps_rcv_reports;
    416 		group = igmpa->igmpa_group;
    417 		if (!CLASSD(group)) {
    418 			++ipst->ips_igmpstat.igps_rcv_badreports;
    419 			goto bad_pkt;
    420 		}
    421 
    422 		/*
    423 		 * KLUDGE: if the IP source address of the report has an
    424 		 * unspecified (i.e., zero) subnet number, as is allowed for
    425 		 * a booting host, replace it with the correct subnet number
    426 		 * so that a process-level multicast routing demon can
    427 		 * determine which subnet it arrived from.  This is necessary
    428 		 * to compensate for the lack of any way for a process to
    429 		 * determine the arrival interface of an incoming packet.
    430 		 *
    431 		 * Requires that a copy of *this* message it passed up
    432 		 * to the raw interface which is done by our caller.
    433 		 */
    434 		if ((src & htonl(0xFF000000U)) == 0) {	/* Minimum net mask */
    435 			/* Pick the first ipif on this ill */
    436 			mutex_enter(&ill->ill_lock);
    437 			src = ill->ill_ipif->ipif_subnet;
    438 			mutex_exit(&ill->ill_lock);
    439 			ip1dbg(("igmp_input: changed src to 0x%x\n",
    440 			    (int)ntohl(src)));
    441 			ipha->ipha_src = src;
    442 		}
    443 
    444 		/*
    445 		 * If we belong to the group being reported, and
    446 		 * we are a 'Delaying member' in the RFC terminology,
    447 		 * stop our timer for that group and 'clear flag' i.e.
    448 		 * mark as IGMP_OTHERMEMBER. Do this for all logical
    449 		 * interfaces on the given physical interface.
    450 		 */
    451 		mutex_enter(&ill->ill_lock);
    452 		for (ipif = ill->ill_ipif; ipif != NULL;
    453 		    ipif = ipif->ipif_next) {
    454 			ilm = ilm_lookup_ipif(ipif, group);
    455 			if (ilm != NULL) {
    456 				++ipst->ips_igmpstat.igps_rcv_ourreports;
    457 				ilm->ilm_timer = INFINITY;
    458 				ilm->ilm_state = IGMP_OTHERMEMBER;
    459 			}
    460 		} /* for */
    461 		mutex_exit(&ill->ill_lock);
    462 		break;
    463 
    464 	case IGMP_V3_MEMBERSHIP_REPORT:
    465 		/*
    466 		 * Currently nothing to do here; IGMP router is not
    467 		 * implemented in ip, and v3 hosts don't pay attention
    468 		 * to membership reports.
    469 		 */
    470 		break;
    471 	}
    472 	/*
    473 	 * Pass all valid IGMP packets up to any process(es) listening
    474 	 * on a raw IGMP socket. Do not free the packet.
    475 	 */
    476 	return (mp);
    477 
    478 bad_pkt:
    479 	freemsg(mp);
    480 	return (NULL);
    481 }
    482 
    483 static uint_t
    484 igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill)
    485 {
    486 	ilm_t	*ilm;
    487 	int	timer;
    488 	uint_t	next, current;
    489 	ip_stack_t	 *ipst;
    490 
    491 	ipst = ill->ill_ipst;
    492 	++ipst->ips_igmpstat.igps_rcv_queries;
    493 
    494 	/*
    495 	 * In the IGMPv2 specification, there are 3 states and a flag.
    496 	 *
    497 	 * In Non-Member state, we simply don't have a membership record.
    498 	 * In Delaying Member state, our timer is running (ilm->ilm_timer
    499 	 * < INFINITY).  In Idle Member state, our timer is not running
    500 	 * (ilm->ilm_timer == INFINITY).
    501 	 *
    502 	 * The flag is ilm->ilm_state, it is set to IGMP_OTHERMEMBER if
    503 	 * we have heard a report from another member, or IGMP_IREPORTEDLAST
    504 	 * if I sent the last report.
    505 	 */
    506 	if ((igmpa->igmpa_code == 0) ||
    507 	    (ipst->ips_igmp_max_version == IGMP_V1_ROUTER)) {
    508 		/*
    509 		 * Query from an old router.
    510 		 * Remember that the querier on this interface is old,
    511 		 * and set the timer to the value in RFC 1112.
    512 		 */
    513 
    514 
    515 		mutex_enter(&ill->ill_lock);
    516 		ill->ill_mcast_v1_time = 0;
    517 		ill->ill_mcast_v1_tset = 1;
    518 		if (ill->ill_mcast_type != IGMP_V1_ROUTER) {
    519 			ip1dbg(("Received IGMPv1 Query on %s, switching mode "
    520 			    "to IGMP_V1_ROUTER\n", ill->ill_name));
    521 			atomic_add_16(&ill->ill_ifptr->illif_mcast_v1, 1);
    522 			ill->ill_mcast_type = IGMP_V1_ROUTER;
    523 		}
    524 		mutex_exit(&ill->ill_lock);
    525 
    526 		timer = SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY);
    527 
    528 		if (ipha->ipha_dst != htonl(INADDR_ALLHOSTS_GROUP) ||
    529 		    igmpa->igmpa_group != 0) {
    530 			++ipst->ips_igmpstat.igps_rcv_badqueries;
    531 			return (0);
    532 		}
    533 
    534 	} else {
    535 		in_addr_t group;
    536 
    537 		/*
    538 		 * Query from a new router
    539 		 * Simply do a validity check
    540 		 */
    541 		group = igmpa->igmpa_group;
    542 		if (group != 0 && (!CLASSD(group))) {
    543 			++ipst->ips_igmpstat.igps_rcv_badqueries;
    544 			return (0);
    545 		}
    546 
    547 		/*
    548 		 * Switch interface state to v2 on receipt of a v2 query
    549 		 * ONLY IF current state is v3.  Let things be if current
    550 		 * state if v1 but do reset the v2-querier-present timer.
    551 		 */
    552 		mutex_enter(&ill->ill_lock);
    553 		if (ill->ill_mcast_type == IGMP_V3_ROUTER) {
    554 			ip1dbg(("Received IGMPv2 Query on %s, switching mode "
    555 			    "to IGMP_V2_ROUTER", ill->ill_name));
    556 			atomic_add_16(&ill->ill_ifptr->illif_mcast_v2, 1);
    557 			ill->ill_mcast_type = IGMP_V2_ROUTER;
    558 		}
    559 		ill->ill_mcast_v2_time = 0;
    560 		ill->ill_mcast_v2_tset = 1;
    561 		mutex_exit(&ill->ill_lock);
    562 
    563 		timer = DSEC_TO_MSEC((int)igmpa->igmpa_code);
    564 	}
    565 
    566 	if (ip_debug > 1) {
    567 		mutex_enter(&ill->ill_lock);
    568 		(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
    569 		    "igmp_input: TIMER = igmp_code %d igmp_type 0x%x",
    570 		    (int)ntohs(igmpa->igmpa_code),
    571 		    (int)ntohs(igmpa->igmpa_type));
    572 		mutex_exit(&ill->ill_lock);
    573 	}
    574 
    575 	/*
    576 	 * -Start the timers in all of our membership records
    577 	 *  for the physical interface on which the query
    578 	 *  arrived, excluding those that belong to the "all
    579 	 *  hosts" group (224.0.0.1).
    580 	 *
    581 	 * -Restart any timer that is already running but has
    582 	 *  a value longer than the requested timeout.
    583 	 *
    584 	 * -Use the value specified in the query message as
    585 	 *  the maximum timeout.
    586 	 */
    587 	next = (unsigned)INFINITY;
    588 	mutex_enter(&ill->ill_lock);
    589 
    590 	current = CURRENT_MSTIME;
    591 	for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
    592 
    593 		/*
    594 		 * A multicast router joins INADDR_ANY address
    595 		 * to enable promiscuous reception of all
    596 		 * mcasts from the interface. This INADDR_ANY
    597 		 * is stored in the ilm_v6addr as V6 unspec addr
    598 		 */
    599 		if (!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr))
    600 			continue;
    601 		if (ilm->ilm_addr == htonl(INADDR_ANY))
    602 			continue;
    603 		if (ilm->ilm_addr != htonl(INADDR_ALLHOSTS_GROUP) &&
    604 		    (igmpa->igmpa_group == 0) ||
    605 		    (igmpa->igmpa_group == ilm->ilm_addr)) {
    606 			if (ilm->ilm_timer > timer) {
    607 				MCAST_RANDOM_DELAY(ilm->ilm_timer, timer);
    608 				if (ilm->ilm_timer < next)
    609 					next = ilm->ilm_timer;
    610 				ilm->ilm_timer += current;
    611 			}
    612 		}
    613 	}
    614 	mutex_exit(&ill->ill_lock);
    615 
    616 	return (next);
    617 }
    618 
    619 static uint_t
    620 igmpv3_query_in(igmp3qa_t *igmp3qa, ill_t *ill, int igmplen)
    621 {
    622 	uint_t		i, next, mrd, qqi, timer, delay, numsrc;
    623 	uint_t		current;
    624 	ilm_t		*ilm;
    625 	ipaddr_t	*src_array;
    626 	uint8_t		qrv;
    627 	ip_stack_t	 *ipst;
    628 
    629 	ipst = ill->ill_ipst;
    630 	/* make sure numsrc matches packet size */
    631 	numsrc = ntohs(igmp3qa->igmp3qa_numsrc);
    632 	if (igmplen < IGMP_V3_QUERY_MINLEN + (numsrc * sizeof (ipaddr_t))) {
    633 		++ipst->ips_igmpstat.igps_rcv_tooshort;
    634 		return (0);
    635 	}
    636 	src_array = (ipaddr_t *)&igmp3qa[1];
    637 
    638 	++ipst->ips_igmpstat.igps_rcv_queries;
    639 
    640 	if ((mrd = (uint_t)igmp3qa->igmp3qa_mxrc) >= IGMP_V3_MAXRT_FPMIN) {
    641 		uint_t hdrval, mant, exp;
    642 		hdrval = (uint_t)igmp3qa->igmp3qa_mxrc;
    643 		mant = hdrval & IGMP_V3_MAXRT_MANT_MASK;
    644 		exp = (hdrval & IGMP_V3_MAXRT_EXP_MASK) >> 4;
    645 		mrd = (mant | 0x10) << (exp + 3);
    646 	}
    647 	if (mrd == 0)
    648 		mrd = MCAST_DEF_QUERY_RESP_INTERVAL;
    649 	timer = DSEC_TO_MSEC(mrd);
    650 	MCAST_RANDOM_DELAY(delay, timer);
    651 	next = (unsigned)INFINITY;
    652 	current = CURRENT_MSTIME;
    653 
    654 	if ((qrv = igmp3qa->igmp3qa_sqrv & IGMP_V3_RV_MASK) == 0)
    655 		ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS;
    656 	else
    657 		ill->ill_mcast_rv = qrv;
    658 
    659 	if ((qqi = (uint_t)igmp3qa->igmp3qa_qqic) >= IGMP_V3_QQI_FPMIN) {
    660 		uint_t hdrval, mant, exp;
    661 		hdrval = (uint_t)igmp3qa->igmp3qa_qqic;
    662 		mant = hdrval & IGMP_V3_QQI_MANT_MASK;
    663 		exp = (hdrval & IGMP_V3_QQI_EXP_MASK) >> 4;
    664 		qqi = (mant | 0x10) << (exp + 3);
    665 	}
    666 	ill->ill_mcast_qi = (qqi == 0) ? MCAST_DEF_QUERY_INTERVAL : qqi;
    667 
    668 	/*
    669 	 * If we have a pending general query response that's scheduled
    670 	 * sooner than the delay we calculated for this response, then
    671 	 * no action is required (RFC3376 section 5.2 rule 1)
    672 	 */
    673 	mutex_enter(&ill->ill_lock);
    674 	if (ill->ill_global_timer < (current + delay)) {
    675 		mutex_exit(&ill->ill_lock);
    676 		return (next);
    677 	}
    678 	mutex_exit(&ill->ill_lock);
    679 
    680 	/*
    681 	 * Now take action depending upon query type:
    682 	 * general, group specific, or group/source specific.
    683 	 */
    684 	if ((numsrc == 0) && (igmp3qa->igmp3qa_group == INADDR_ANY)) {
    685 		/*
    686 		 * general query
    687 		 * We know global timer is either not running or is
    688 		 * greater than our calculated delay, so reset it to
    689 		 * our delay (random value in range [0, response time]).
    690 		 */
    691 		mutex_enter(&ill->ill_lock);
    692 		ill->ill_global_timer =  current + delay;
    693 		mutex_exit(&ill->ill_lock);
    694 		next = delay;
    695 
    696 	} else {
    697 		/* group or group/source specific query */
    698 		mutex_enter(&ill->ill_lock);
    699 		for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
    700 			if (!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr) ||
    701 			    (ilm->ilm_addr == htonl(INADDR_ANY)) ||
    702 			    (ilm->ilm_addr == htonl(INADDR_ALLHOSTS_GROUP)) ||
    703 			    (igmp3qa->igmp3qa_group != ilm->ilm_addr))
    704 				continue;
    705 			/*
    706 			 * If the query is group specific or we have a
    707 			 * pending group specific query, the response is
    708 			 * group specific (pending sources list should be
    709 			 * empty).  Otherwise, need to update the pending
    710 			 * sources list for the group and source specific
    711 			 * response.
    712 			 */
    713 			if (numsrc == 0 || (ilm->ilm_timer < INFINITY &&
    714 			    SLIST_IS_EMPTY(ilm->ilm_pendsrcs))) {
    715 group_query:
    716 				FREE_SLIST(ilm->ilm_pendsrcs);
    717 				ilm->ilm_pendsrcs = NULL;
    718 			} else {
    719 				boolean_t overflow;
    720 				slist_t *pktl;
    721 				if (numsrc > MAX_FILTER_SIZE ||
    722 				    (ilm->ilm_pendsrcs == NULL &&
    723 				    (ilm->ilm_pendsrcs = l_alloc()) == NULL)) {
    724 					/*
    725 					 * We've been sent more sources than
    726 					 * we can deal with; or we can't deal
    727 					 * with a source list at all.  Revert
    728 					 * to a group specific query.
    729 					 */
    730 					goto group_query;
    731 				}
    732 				if ((pktl = l_alloc()) == NULL)
    733 					goto group_query;
    734 				pktl->sl_numsrc = numsrc;
    735 				for (i = 0; i < numsrc; i++)
    736 					IN6_IPADDR_TO_V4MAPPED(src_array[i],
    737 					    &(pktl->sl_addr[i]));
    738 				l_union_in_a(ilm->ilm_pendsrcs, pktl,
    739 				    &overflow);
    740 				l_free(pktl);
    741 				if (overflow)
    742 					goto group_query;
    743 			}
    744 
    745 			ilm->ilm_timer = (ilm->ilm_timer == INFINITY) ?
    746 			    INFINITY : (ilm->ilm_timer - current);
    747 			/* choose soonest timer */
    748 			ilm->ilm_timer = MIN(ilm->ilm_timer, delay);
    749 			if (ilm->ilm_timer < next)
    750 				next = ilm->ilm_timer;
    751 			ilm->ilm_timer += current;
    752 		}
    753 		mutex_exit(&ill->ill_lock);
    754 	}
    755 
    756 	return (next);
    757 }
    758 
    759 void
    760 igmp_joingroup(ilm_t *ilm)
    761 {
    762 	uint_t	timer;
    763 	ill_t	*ill;
    764 	ip_stack_t	*ipst = ilm->ilm_ipst;
    765 
    766 	ill = ilm->ilm_ipif->ipif_ill;
    767 
    768 	ASSERT(IAM_WRITER_ILL(ill));
    769 	ASSERT(ilm->ilm_ill == NULL && !ilm->ilm_ipif->ipif_isv6);
    770 
    771 	mutex_enter(&ill->ill_lock);
    772 	if (ilm->ilm_addr == htonl(INADDR_ALLHOSTS_GROUP)) {
    773 		ilm->ilm_rtx.rtx_timer = INFINITY;
    774 		ilm->ilm_state = IGMP_OTHERMEMBER;
    775 		mutex_exit(&ill->ill_lock);
    776 	} else {
    777 		ip1dbg(("Querier mode %d, sending report, group %x\n",
    778 		    ill->ill_mcast_type, htonl(ilm->ilm_addr)));
    779 		if (ill->ill_mcast_type == IGMP_V1_ROUTER) {
    780 			mutex_exit(&ill->ill_lock);
    781 			igmp_sendpkt(ilm, IGMP_V1_MEMBERSHIP_REPORT, 0);
    782 			mutex_enter(&ill->ill_lock);
    783 		} else if (ill->ill_mcast_type == IGMP_V2_ROUTER) {
    784 			mutex_exit(&ill->ill_lock);
    785 			igmp_sendpkt(ilm, IGMP_V2_MEMBERSHIP_REPORT, 0);
    786 			mutex_enter(&ill->ill_lock);
    787 		} else if (ill->ill_mcast_type == IGMP_V3_ROUTER) {
    788 			mrec_t *rp;
    789 			mcast_record_t rtype;
    790 			/*
    791 			 * The possible state changes we need to handle here:
    792 			 *   Old State	New State	Report
    793 			 *
    794 			 *   INCLUDE(0)	INCLUDE(X)	ALLOW(X),BLOCK(0)
    795 			 *   INCLUDE(0)	EXCLUDE(X)	TO_EX(X)
    796 			 *
    797 			 * No need to send the BLOCK(0) report; ALLOW(X)
    798 			 * is enough.
    799 			 */
    800 			rtype = (ilm->ilm_fmode == MODE_IS_INCLUDE) ?
    801 			    ALLOW_NEW_SOURCES : CHANGE_TO_EXCLUDE;
    802 			rp = mcast_bldmrec(rtype, &ilm->ilm_v6addr,
    803 			    ilm->ilm_filter, NULL);
    804 			mutex_exit(&ill->ill_lock);
    805 			igmpv3_sendrpt(ilm->ilm_ipif, rp);
    806 			mutex_enter(&ill->ill_lock);
    807 			/*
    808 			 * Set up retransmission state.  Timer is set below,
    809 			 * for both v3 and older versions.
    810 			 */
    811 			mcast_init_rtx(ill, &ilm->ilm_rtx, rtype,
    812 			    ilm->ilm_filter);
    813 		}
    814 
    815 		/* Set the ilm timer value */
    816 		MCAST_RANDOM_DELAY(ilm->ilm_rtx.rtx_timer,
    817 		    SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY));
    818 		timer = ilm->ilm_rtx.rtx_timer;
    819 		ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME;
    820 		ilm->ilm_state = IGMP_IREPORTEDLAST;
    821 		mutex_exit(&ill->ill_lock);
    822 
    823 		/*
    824 		 * To avoid deadlock, we don't call igmp_start_timers from
    825 		 * here. igmp_start_timers needs to call untimeout, and we
    826 		 * can't hold the ipsq across untimeout since
    827 		 * igmp_timeout_handler could be blocking trying to
    828 		 * acquire the ipsq. Instead we start the timer after we get
    829 		 * out of the ipsq in ipsq_exit.
    830 		 */
    831 		mutex_enter(&ipst->ips_igmp_timer_lock);
    832 		ipst->ips_igmp_deferred_next = MIN(timer,
    833 		    ipst->ips_igmp_deferred_next);
    834 		mutex_exit(&ipst->ips_igmp_timer_lock);
    835 	}
    836 
    837 	if (ip_debug > 1) {
    838 		(void) mi_strlog(ilm->ilm_ipif->ipif_ill->ill_rq, 1, SL_TRACE,
    839 		    "igmp_joingroup: multicast_type %d timer %d",
    840 		    (ilm->ilm_ipif->ipif_ill->ill_mcast_type),
    841 		    (int)ntohl(timer));
    842 	}
    843 }
    844 
    845 void
    846 mld_joingroup(ilm_t *ilm)
    847 {
    848 	uint_t	timer;
    849 	ill_t	*ill;
    850 	ip_stack_t	*ipst = ilm->ilm_ipst;
    851 
    852 	ill = ilm->ilm_ill;
    853 
    854 	ASSERT(IAM_WRITER_ILL(ill));
    855 	ASSERT(ilm->ilm_ipif == NULL && ill->ill_isv6);
    856 
    857 	mutex_enter(&ill->ill_lock);
    858 	if (IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, &ilm->ilm_v6addr)) {
    859 		ilm->ilm_rtx.rtx_timer = INFINITY;
    860 		ilm->ilm_state = IGMP_OTHERMEMBER;
    861 		mutex_exit(&ill->ill_lock);
    862 	} else {
    863 		if (ill->ill_mcast_type == MLD_V1_ROUTER) {
    864 			mutex_exit(&ill->ill_lock);
    865 			mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL);
    866 			mutex_enter(&ill->ill_lock);
    867 		} else {
    868 			mrec_t *rp;
    869 			mcast_record_t rtype;
    870 			/*
    871 			 * The possible state changes we need to handle here:
    872 			 *	Old State   New State	Report
    873 			 *
    874 			 *	INCLUDE(0)  INCLUDE(X)	ALLOW(X),BLOCK(0)
    875 			 *	INCLUDE(0)  EXCLUDE(X)	TO_EX(X)
    876 			 *
    877 			 * No need to send the BLOCK(0) report; ALLOW(X)
    878 			 * is enough
    879 			 */
    880 			rtype = (ilm->ilm_fmode == MODE_IS_INCLUDE) ?
    881 			    ALLOW_NEW_SOURCES : CHANGE_TO_EXCLUDE;
    882 			rp = mcast_bldmrec(rtype, &ilm->ilm_v6addr,
    883 			    ilm->ilm_filter, NULL);
    884 			mutex_exit(&ill->ill_lock);
    885 			mldv2_sendrpt(ill, rp);
    886 			mutex_enter(&ill->ill_lock);
    887 			/*
    888 			 * Set up retransmission state.  Timer is set below,
    889 			 * for both v2 and v1.
    890 			 */
    891 			mcast_init_rtx(ill, &ilm->ilm_rtx, rtype,
    892 			    ilm->ilm_filter);
    893 		}
    894 
    895 		/* Set the ilm timer value */
    896 		ASSERT(ill->ill_mcast_type != MLD_V2_ROUTER ||
    897 		    ilm->ilm_rtx.rtx_cnt > 0);
    898 		MCAST_RANDOM_DELAY(ilm->ilm_rtx.rtx_timer,
    899 		    SEC_TO_MSEC(ICMP6_MAX_HOST_REPORT_DELAY));
    900 		timer = ilm->ilm_rtx.rtx_timer;
    901 		ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME;
    902 		ilm->ilm_state = IGMP_IREPORTEDLAST;
    903 		mutex_exit(&ill->ill_lock);
    904 
    905 		/*
    906 		 * To avoid deadlock, we don't call mld_start_timers from
    907 		 * here. mld_start_timers needs to call untimeout, and we
    908 		 * can't hold the ipsq (i.e. the lock) across untimeout
    909 		 * since mld_timeout_handler could be blocking trying to
    910 		 * acquire the ipsq. Instead we start the timer after we get
    911 		 * out of the ipsq in ipsq_exit
    912 		 */
    913 		mutex_enter(&ipst->ips_mld_timer_lock);
    914 		ipst->ips_mld_deferred_next = MIN(timer,
    915 		    ipst->ips_mld_deferred_next);
    916 		mutex_exit(&ipst->ips_mld_timer_lock);
    917 	}
    918 
    919 	if (ip_debug > 1) {
    920 		(void) mi_strlog(ilm->ilm_ill->ill_rq, 1, SL_TRACE,
    921 		    "mld_joingroup: multicast_type %d timer %d",
    922 		    (ilm->ilm_ill->ill_mcast_type),
    923 		    (int)ntohl(timer));
    924 	}
    925 }
    926 
    927 void
    928 igmp_leavegroup(ilm_t *ilm)
    929 {
    930 	ill_t *ill = ilm->ilm_ipif->ipif_ill;
    931 
    932 	ASSERT(ilm->ilm_ill == NULL);
    933 	ASSERT(!ill->ill_isv6);
    934 
    935 	mutex_enter(&ill->ill_lock);
    936 	if (ilm->ilm_state == IGMP_IREPORTEDLAST &&
    937 	    ill->ill_mcast_type == IGMP_V2_ROUTER &&
    938 	    (ilm->ilm_addr != htonl(INADDR_ALLHOSTS_GROUP))) {
    939 		mutex_exit(&ill->ill_lock);
    940 		igmp_sendpkt(ilm, IGMP_V2_LEAVE_GROUP,
    941 		    (htonl(INADDR_ALLRTRS_GROUP)));
    942 		return;
    943 	} else if ((ill->ill_mcast_type == IGMP_V3_ROUTER) &&
    944 	    (ilm->ilm_addr != htonl(INADDR_ALLHOSTS_GROUP))) {
    945 		mrec_t *rp;
    946 		/*
    947 		 * The possible state changes we need to handle here:
    948 		 *	Old State	New State	Report
    949 		 *
    950 		 *	INCLUDE(X)	INCLUDE(0)	ALLOW(0),BLOCK(X)
    951 		 *	EXCLUDE(X)	INCLUDE(0)	TO_IN(0)
    952 		 *
    953 		 * No need to send the ALLOW(0) report; BLOCK(X) is enough
    954 		 */
    955 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
    956 			rp = mcast_bldmrec(BLOCK_OLD_SOURCES, &ilm->ilm_v6addr,
    957 			    ilm->ilm_filter, NULL);
    958 		} else {
    959 			rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr,
    960 			    NULL, NULL);
    961 		}
    962 		mutex_exit(&ill->ill_lock);
    963 		igmpv3_sendrpt(ilm->ilm_ipif, rp);
    964 		return;
    965 	}
    966 	mutex_exit(&ill->ill_lock);
    967 }
    968 
    969 void
    970 mld_leavegroup(ilm_t *ilm)
    971 {
    972 	ill_t *ill = ilm->ilm_ill;
    973 
    974 	ASSERT(ilm->ilm_ipif == NULL);
    975 	ASSERT(ill->ill_isv6);
    976 
    977 	mutex_enter(&ill->ill_lock);
    978 	if (ilm->ilm_state == IGMP_IREPORTEDLAST &&
    979 	    ill->ill_mcast_type == MLD_V1_ROUTER &&
    980 	    (!IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, &ilm->ilm_v6addr))) {
    981 		mutex_exit(&ill->ill_lock);
    982 		mld_sendpkt(ilm, MLD_LISTENER_REDUCTION, &ipv6_all_rtrs_mcast);
    983 		return;
    984 	} else if ((ill->ill_mcast_type == MLD_V2_ROUTER) &&
    985 	    (!IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, &ilm->ilm_v6addr))) {
    986 		mrec_t *rp;
    987 		/*
    988 		 * The possible state changes we need to handle here:
    989 		 *	Old State	New State	Report
    990 		 *
    991 		 *	INCLUDE(X)	INCLUDE(0)	ALLOW(0),BLOCK(X)
    992 		 *	EXCLUDE(X)	INCLUDE(0)	TO_IN(0)
    993 		 *
    994 		 * No need to send the ALLOW(0) report; BLOCK(X) is enough
    995 		 */
    996 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
    997 			rp = mcast_bldmrec(BLOCK_OLD_SOURCES, &ilm->ilm_v6addr,
    998 			    ilm->ilm_filter, NULL);
    999 		} else {
   1000 			rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr,
   1001 			    NULL, NULL);
   1002 		}
   1003 		mutex_exit(&ill->ill_lock);
   1004 		mldv2_sendrpt(ill, rp);
   1005 		return;
   1006 	}
   1007 	mutex_exit(&ill->ill_lock);
   1008 }
   1009 
   1010 void
   1011 igmp_statechange(ilm_t *ilm, mcast_record_t fmode, slist_t *flist)
   1012 {
   1013 	ill_t *ill;
   1014 	mrec_t *rp;
   1015 	ip_stack_t	*ipst = ilm->ilm_ipst;
   1016 
   1017 	ASSERT(ilm != NULL);
   1018 
   1019 	/* state change reports should only be sent if the router is v3 */
   1020 	if (ilm->ilm_ipif->ipif_ill->ill_mcast_type != IGMP_V3_ROUTER)
   1021 		return;
   1022