Home | History | Annotate | Download | only in ip
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*
     27  * IP interface to squeues.
     28  *
     29  * IP uses squeues to force serialization of packets, both incoming and
     30  * outgoing. Each squeue is associated with a connection instance (conn_t)
     31  * above, and a soft ring (if enabled) below. Each CPU will have a default
     32  * squeue for outbound connections, and each soft ring of an interface will
     33  * have an squeue to which it sends incoming packets. squeues are never
     34  * destroyed, and if they become unused they are kept around against future
     35  * needs.
     36  *
     37  * IP organizes its squeues using squeue sets (squeue_set_t). For each CPU
     38  * in the system there will be one squeue set, all of whose squeues will be
     39  * bound to that CPU, plus one additional set known as the unbound set. Sets
     40  * associated with CPUs will have one default squeue, for outbound
     41  * connections, and a linked list of squeues used by various NICs for inbound
     42  * packets. The unbound set also has a linked list of squeues, but no default
     43  * squeue.
     44  *
     45  * When a CPU goes offline its squeue set is destroyed, and all its squeues
     46  * are moved to the unbound set. When a CPU comes online, a new squeue set is
     47  * created and the unbound set is searched for a default squeue formerly bound
     48  * to this CPU. If no default squeue is found, a new one is created.
     49  *
     50  * Two fields of the squeue_t, namely sq_next and sq_set, are owned by IP
     51  * and not the squeue code. squeue.c will not touch them, and we can modify
     52  * them without holding the squeue lock because of the guarantee that squeues
     53  * are never destroyed. ip_squeue locks must be held, however.
     54  *
     55  * All the squeue sets are protected by a single lock, the sqset_lock. This
     56  * is also used to protect the sq_next and sq_set fields of an squeue_t.
     57  *
     58  * The lock order is: cpu_lock --> ill_lock --> sqset_lock --> sq_lock
     59  *
     60  * There are two modes of associating connection with squeues. The first mode
     61  * associates each connection with the CPU that creates the connection (either
     62  * during open time or during accept time). The second mode associates each
     63  * connection with a random CPU, effectively distributing load over all CPUs
     64  * and all squeues in the system. The mode is controlled by the
     65  * ip_squeue_fanout variable.
     66  *
     67  * NOTE: The fact that there is an association between each connection and
     68  * squeue and squeue and CPU does not mean that each connection is always
     69  * processed on this CPU and on this CPU only. Any thread calling squeue_enter()
     70  * may process the connection on whatever CPU it is scheduled. The squeue to CPU
     71  * binding is only relevant for the worker thread.
     72  *
     73  * INTERFACE:
     74  *
     75  * squeue_t *ip_squeue_get(ill_rx_ring_t *)
     76  *
     77  * Returns the squeue associated with an ill receive ring. If no ring is
     78  * given, or the ring has no squeue attached, a default squeue obtained
     79  * through IP_SQUEUE_GET() is returned instead.
     80  *
     81  *
     82  * DR Notes
     83  * ========
     84  *
     85  * The ip_squeue_init() registers a call-back function with the CPU DR
     86  * subsystem using register_cpu_setup_func(). The call-back function does two
     87  * things:
     88  *
     89  * o When the CPU is going off-line or unconfigured, the worker thread is
     90  *	unbound from the CPU. This allows the CPU unconfig code to move it to
     91  *	another CPU.
     92  *
     93  * o When the CPU is going online, it creates a new squeue for this CPU if
     94  *	necessary and binds the squeue worker thread to this CPU.
     95  *
     96  * TUNABLES:
     97  *
     98  * ip_squeue_fanout: used when TCP calls IP_SQUEUE_GET(). If 1, then
     99  * pick the default squeue from a random CPU, otherwise use our CPU's default
    100  * squeue.
    101  *
    102  * ip_squeue_fanout can be accessed and changed using ndd on /dev/tcp or
    103  * /dev/ip.
    104  *
    105  * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
    106  * created. This is the time squeue code waits before waking up the worker
    107  * thread after queuing a request.
    108  */
    109 
    110 #include <sys/types.h>
    111 #include <sys/debug.h>
    112 #include <sys/kmem.h>
    113 #include <sys/cpuvar.h>
    114 #include <sys/cmn_err.h>
    115 
    116 #include <inet/common.h>
    117 #include <inet/ip.h>
    118 #include <netinet/ip6.h>
    119 #include <inet/ip_if.h>
    120 #include <inet/ip_ire.h>
    121 #include <inet/nd.h>
    122 #include <inet/ipclassifier.h>
    123 #include <sys/types.h>
    124 #include <sys/conf.h>
    125 #include <sys/sunddi.h>
    126 #include <sys/dlpi.h>
    127 #include <sys/squeue_impl.h>
    128 #include <sys/tihdr.h>
    129 #include <inet/udp_impl.h>
    130 #include <sys/strsubr.h>
    131 #include <sys/zone.h>
    132 #include <sys/dld.h>
    133 #include <sys/atomic.h>
    134 
    135 /*
    136  * List of all created squeue sets. The list and its size are protected by
    137  * sqset_lock.
    138  */
    139 static squeue_set_t	**sqset_global_list; /* list 0 is the unbound list */
    140 static uint_t		sqset_global_size;
    141 kmutex_t		sqset_lock;
    142 
    143 static void (*ip_squeue_create_callback)(squeue_t *) = NULL;
    144 
    145 /*
    146  * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
    147  *	created. This is the time squeue code waits before waking up the worker
    148  *	thread after queuing a request.
    149  */
    150 uint_t ip_squeue_worker_wait = 10;
    151 
    152 static squeue_t *ip_squeue_create(pri_t);
    153 static squeue_set_t *ip_squeue_set_create(processorid_t);
    154 static int ip_squeue_cpu_setup(cpu_setup_t, int, void *);
    155 static void ip_squeue_set_move(squeue_t *, squeue_set_t *);
    156 static void ip_squeue_set_destroy(cpu_t *);
    157 static void ip_squeue_clean(void *, mblk_t *, void *);
    158 
    159 #define	CPU_ISON(c) (c != NULL && CPU_ACTIVE(c) && (c->cpu_flags & CPU_EXISTS))
    160 
    161 static squeue_t *
    162 ip_squeue_create(pri_t pri)
    163 {
    164 	squeue_t *sqp;
    165 
    166 	sqp = squeue_create(ip_squeue_worker_wait, pri);
    167 	ASSERT(sqp != NULL);
    168 	if (ip_squeue_create_callback != NULL)
    169 		ip_squeue_create_callback(sqp);
    170 	return (sqp);
    171 }
    172 
    173 /*
    174  * Create a new squeue_set. If id == -1, then we're creating the unbound set,
    175  * which should only happen once when we are first initialized. Otherwise id
    176  * is the id of the CPU that needs a set, either because we are initializing
    177  * or because the CPU has come online.
    178  *
    179  * If id != -1, then we need at a minimum to provide a default squeue for the
    180  * new set. We search the unbound set for candidates, and if none are found we
    181  * create a new one.
    182  */
    183 static squeue_set_t *
    184 ip_squeue_set_create(processorid_t id)
    185 {
    186 	squeue_set_t	*sqs;
    187 	squeue_set_t	*src = sqset_global_list[0];
    188 	squeue_t	**lastsqp, *sq;
    189 	squeue_t	**defaultq_lastp = NULL;
    190 
    191 	sqs = kmem_zalloc(sizeof (squeue_set_t), KM_SLEEP);
    192 	sqs->sqs_cpuid = id;
    193 
    194 	if (id == -1) {
    195 		ASSERT(sqset_global_size == 0);
    196 		sqset_global_list[0] = sqs;
    197 		sqset_global_size = 1;
    198 		return (sqs);
    199 	}
    200 
    201 	/*
    202 	 * When we create an squeue set id != -1, we need to give it a
    203 	 * default squeue, in order to support fanout of conns across
    204 	 * CPUs. Try to find a former default squeue that matches this
    205 	 * cpu id on the unbound squeue set. If no such squeue is found,
    206 	 * find some non-default TCP squeue that is free. If still no such
    207 	 * candidate is found, create a new squeue.
    208 	 */
    209 
    210 	ASSERT(MUTEX_HELD(&cpu_lock));
    211 	mutex_enter(&sqset_lock);
    212 	lastsqp = &src->sqs_head;
    213 
    214 	while (*lastsqp) {
    215 		if ((*lastsqp)->sq_bind == id &&
    216 		    (*lastsqp)->sq_state & SQS_DEFAULT) {
    217 			/*
    218 			 * Exact match. Former default squeue of cpu 'id'
    219 			 */
    220 			ASSERT(!((*lastsqp)->sq_state & SQS_ILL_BOUND));
    221 			defaultq_lastp = lastsqp;
    222 			break;
    223 		}
    224 		if (defaultq_lastp == NULL &&
    225 		    !((*lastsqp)->sq_state & (SQS_ILL_BOUND | SQS_DEFAULT))) {
    226 			/*
    227 			 * A free non-default TCP squeue
    228 			 */
    229 			defaultq_lastp = lastsqp;
    230 		}
    231 		lastsqp = &(*lastsqp)->sq_next;
    232 	}
    233 
    234 	if (defaultq_lastp != NULL) {
    235 		/* Remove from src set and set SQS_DEFAULT */
    236 		sq = *defaultq_lastp;
    237 		*defaultq_lastp = sq->sq_next;
    238 		sq->sq_next = NULL;
    239 		if (!(sq->sq_state & SQS_DEFAULT)) {
    240 			mutex_enter(&sq->sq_lock);
    241 			sq->sq_state |= SQS_DEFAULT;
    242 			mutex_exit(&sq->sq_lock);
    243 		}
    244 	} else {
    245 		sq = ip_squeue_create(SQUEUE_DEFAULT_PRIORITY);
    246 		sq->sq_state |= SQS_DEFAULT;
    247 	}
    248 
    249 	sq->sq_set = sqs;
    250 	sqs->sqs_default = sq;
    251 	squeue_bind(sq, id); /* this locks squeue mutex */
    252 
    253 	ASSERT(sqset_global_size <= NCPU);
    254 	sqset_global_list[sqset_global_size++] = sqs;
    255 	mutex_exit(&sqset_lock);
    256 	return (sqs);
    257 }
    258 
    259 /*
    260  * Called by ill_ring_add() to find an squeue to associate with a new ring.
    261  */
    262 
    263 squeue_t *
    264 ip_squeue_getfree(pri_t pri)
    265 {
    266 	squeue_set_t	*sqs = sqset_global_list[0];
    267 	squeue_t	*sq;
    268 
    269 	mutex_enter(&sqset_lock);
    270 	for (sq = sqs->sqs_head; sq != NULL; sq = sq->sq_next) {
    271 		/*
    272 		 * Select a non-default TCP squeue that is free i.e. not
    273 		 * bound to any ill.
    274 		 */
    275 		if (!(sq->sq_state & (SQS_DEFAULT | SQS_ILL_BOUND)))
    276 			break;
    277 	}
    278 
    279 	if (sq == NULL) {
    280 		sq = ip_squeue_create(pri);
    281 		sq->sq_set = sqs;
    282 		sq->sq_next = sqs->sqs_head;
    283 		sqs->sqs_head = sq;
    284 	}
    285 
    286 	ASSERT(!(sq->sq_state & (SQS_POLL_THR_CONTROL | SQS_WORKER_THR_CONTROL |
    287 	    SQS_POLL_CLEANUP_DONE | SQS_POLL_QUIESCE_DONE |
    288 	    SQS_POLL_THR_QUIESCED)));
    289 
    290 	mutex_enter(&sq->sq_lock);
    291 	sq->sq_state |= SQS_ILL_BOUND;
    292 	mutex_exit(&sq->sq_lock);
    293 	mutex_exit(&sqset_lock);
    294 
    295 	if (sq->sq_priority != pri) {
    296 		thread_lock(sq->sq_worker);
    297 		(void) thread_change_pri(sq->sq_worker, pri, 0);
    298 		thread_unlock(sq->sq_worker);
    299 
    300 		thread_lock(sq->sq_poll_thr);
    301 		(void) thread_change_pri(sq->sq_poll_thr, pri, 0);
    302 		thread_unlock(sq->sq_poll_thr);
    303 
    304 		sq->sq_priority = pri;
    305 	}
    306 	return (sq);
    307 }
    308 
    309 /*
    310  * Initialize IP squeues.
    311  */
    312 void
    313 ip_squeue_init(void (*callback)(squeue_t *))
    314 {
    315 	int i;
    316 	squeue_set_t	*sqs;
    317 
    318 	ASSERT(sqset_global_list == NULL);
    319 
    320 	ip_squeue_create_callback = callback;
    321 	squeue_init();
    322 	mutex_init(&sqset_lock, NULL, MUTEX_DEFAULT, NULL);
    323 	sqset_global_list =
    324 	    kmem_zalloc(sizeof (squeue_set_t *) * (NCPU+1), KM_SLEEP);
    325 	sqset_global_size = 0;
    326 	/*
    327 	 * We are called at system boot time and we don't
    328 	 * expect memory allocation failure.
    329 	 */
    330 	sqs = ip_squeue_set_create(-1);
    331 	ASSERT(sqs != NULL);
    332 
    333 	mutex_enter(&cpu_lock);
    334 	/* Create squeue for each active CPU available */
    335 	for (i = 0; i < NCPU; i++) {
    336 		cpu_t *cp = cpu_get(i);
    337 		if (CPU_ISON(cp) && cp->cpu_squeue_set == NULL) {
    338 			/*
    339 			 * We are called at system boot time and we don't
    340 			 * expect memory allocation failure then
    341 			 */
    342 			cp->cpu_squeue_set = ip_squeue_set_create(cp->cpu_id);
    343 			ASSERT(cp->cpu_squeue_set != NULL);
    344 		}
    345 	}
    346 
    347 	register_cpu_setup_func(ip_squeue_cpu_setup, NULL);
    348 	mutex_exit(&cpu_lock);
    349 }
    350 
    351 /*
    352  * Get a default squeue, either from the current CPU or a CPU derived by hash
    353  * from the index argument, depending upon the setting of ip_squeue_fanout.
    354  */
    355 squeue_t *
    356 ip_squeue_random(uint_t index)
    357 {
    358 	squeue_set_t *sqs = NULL;
    359 	squeue_t *sq;
    360 
    361 	/*
    362 	 * The minimum value of sqset_global_size is 2, one for the unbound
    363 	 * squeue set and another for the squeue set of the zeroth CPU.
    364 	 * Even though the value could be changing, it can never go below 2,
    365 	 * so the assert does not need the lock protection.
    366 	 */
    367 	ASSERT(sqset_global_size > 1);
    368 
    369 	/* Protect against changes to sqset_global_list */
    370 	mutex_enter(&sqset_lock);
    371 
    372 	if (!ip_squeue_fanout)
    373 		sqs = CPU->cpu_squeue_set;
    374 
    375 	/*
    376 	 * sqset_global_list[0] corresponds to the unbound squeue set.
    377 	 * The computation below picks a set other than the unbound set.
    378 	 */
    379 	if (sqs == NULL)
    380 		sqs = sqset_global_list[(index % (sqset_global_size - 1)) + 1];
    381 	sq = sqs->sqs_default;
    382 
    383 	mutex_exit(&sqset_lock);
    384 	ASSERT(sq);
    385 	return (sq);
    386 }
    387 
    388 /*
    389  * Move squeue from its current set to newset. Not used for default squeues.
    390  * Bind or unbind the worker thread as appropriate.
    391  */
    392 
    393 static void
    394 ip_squeue_set_move(squeue_t *sq, squeue_set_t *newset)
    395 {
    396 	squeue_set_t	*set;
    397 	squeue_t	**lastsqp;
    398 	processorid_t	cpuid = newset->sqs_cpuid;
    399 
    400 	ASSERT(!(sq->sq_state & SQS_DEFAULT));
    401 	ASSERT(!MUTEX_HELD(&sq->sq_lock));
    402 	ASSERT(MUTEX_HELD(&sqset_lock));
    403 
    404 	set = sq->sq_set;
    405 	if (set == newset)
    406 		return;
    407 
    408 	lastsqp = &set->sqs_head;
    409 	while (*lastsqp != sq)
    410 		lastsqp = &(*lastsqp)->sq_next;
    411 
    412 	*lastsqp = sq->sq_next;
    413 	sq->sq_next = newset->sqs_head;
    414 	newset->sqs_head = sq;
    415 	sq->sq_set = newset;
    416 	if (cpuid == -1)
    417 		squeue_unbind(sq);
    418 	else
    419 		squeue_bind(sq, cpuid);
    420 }
    421 
    422 /*
    423  * Move squeue from its current set to cpuid's set and bind to cpuid.
    424  */
    425 
    426 int
    427 ip_squeue_cpu_move(squeue_t *sq, processorid_t cpuid)
    428 {
    429 	cpu_t *cpu;
    430 	squeue_set_t *set;
    431 
    432 	if (sq->sq_state & SQS_DEFAULT)
    433 		return (-1);
    434 
    435 	ASSERT(MUTEX_HELD(&cpu_lock));
    436 
    437 	cpu = cpu_get(cpuid);
    438 	if (!CPU_ISON(cpu))
    439 		return (-1);
    440 
    441 	mutex_enter(&sqset_lock);
    442 	set = cpu->cpu_squeue_set;
    443 	if (set != NULL)
    444 		ip_squeue_set_move(sq, set);
    445 	mutex_exit(&sqset_lock);
    446 	return ((set == NULL) ? -1 : 0);
    447 }
    448 
    449 /*
    450  * The mac layer is calling, asking us to move an squeue to a
    451  * new CPU. This routine is called with cpu_lock held.
    452  */
    453 void
    454 ip_squeue_bind_ring(ill_t *ill, ill_rx_ring_t *rx_ring, processorid_t cpuid)
    455 {
    456 	ASSERT(ILL_MAC_PERIM_HELD(ill));
    457 	ASSERT(rx_ring->rr_ill == ill);
    458 
    459 	mutex_enter(&ill->ill_lock);
    460 	if (rx_ring->rr_ring_state == RR_FREE ||
    461 	    rx_ring->rr_ring_state == RR_FREE_INPROG) {
    462 		mutex_exit(&ill->ill_lock);
    463 		return;
    464 	}
    465 
    466 	if (ip_squeue_cpu_move(rx_ring->rr_sqp, cpuid) != -1)
    467 		rx_ring->rr_ring_state = RR_SQUEUE_BOUND;
    468 
    469 	mutex_exit(&ill->ill_lock);
    470 }
    471 
    472 void *
    473 ip_squeue_add_ring(ill_t *ill, void *mrp)
    474 {
    475 	mac_rx_fifo_t		*mrfp = (mac_rx_fifo_t *)mrp;
    476 	ill_rx_ring_t		*rx_ring, *ring_tbl;
    477 	int			ip_rx_index;
    478 	squeue_t		*sq = NULL;
    479 	pri_t			pri;
    480 
    481 	ASSERT(ILL_MAC_PERIM_HELD(ill));
    482 	ASSERT(mrfp->mrf_type == MAC_RX_FIFO);
    483 	ASSERT(ill->ill_dld_capab != NULL);
    484 
    485 	ring_tbl = ill->ill_dld_capab->idc_poll.idp_ring_tbl;
    486 
    487 	mutex_enter(&ill->ill_lock);
    488 	for (ip_rx_index = 0; ip_rx_index < ILL_MAX_RINGS; ip_rx_index++) {
    489 		rx_ring = &ring_tbl[ip_rx_index];
    490 		if (rx_ring->rr_ring_state == RR_FREE)
    491 			break;
    492 	}
    493 
    494 	if (ip_rx_index == ILL_MAX_RINGS) {
    495 		/*
    496 		 * We ran out of ILL_MAX_RINGS worth rx_ring structures. If
    497 		 * we have devices which can overwhelm this limit,
    498 		 * ILL_MAX_RING should be made configurable. Meanwhile it
    499 		 * cause no panic because driver will pass ip_input a NULL
    500 		 * handle which will make IP allocate the default squeue and
    501 		 * Polling mode will not be used for this ring.
    502 		 */
    503 		cmn_err(CE_NOTE,
    504 		    "Reached maximum number of receiving rings (%d) for %s\n",
    505 		    ILL_MAX_RINGS, ill->ill_name);
    506 		mutex_exit(&ill->ill_lock);
    507 		return (NULL);
    508 	}
    509 
    510 	bzero(rx_ring, sizeof (ill_rx_ring_t));
    511 	rx_ring->rr_rx = (ip_mac_rx_t)mrfp->mrf_receive;
    512 	/* XXX: Hard code it to tcp accept for now */
    513 	rx_ring->rr_ip_accept = (ip_accept_t)ip_accept_tcp;
    514 
    515 	rx_ring->rr_intr_handle = mrfp->mrf_intr_handle;
    516 	rx_ring->rr_intr_enable = (ip_mac_intr_enable_t)mrfp->mrf_intr_enable;
    517 	rx_ring->rr_intr_disable =
    518 	    (ip_mac_intr_disable_t)mrfp->mrf_intr_disable;
    519 	rx_ring->rr_rx_handle = mrfp->mrf_rx_arg;
    520 	rx_ring->rr_ill = ill;
    521 
    522 	pri = mrfp->mrf_flow_priority;
    523 
    524 	sq = ip_squeue_getfree(pri);
    525 
    526 	mutex_enter(&sq->sq_lock);
    527 	sq->sq_rx_ring = rx_ring;
    528 	rx_ring->rr_sqp = sq;
    529 
    530 	sq->sq_state |= SQS_POLL_CAPAB;
    531 
    532 	rx_ring->rr_ring_state = RR_SQUEUE_UNBOUND;
    533 	sq->sq_ill = ill;
    534 	mutex_exit(&sq->sq_lock);
    535 	mutex_exit(&ill->ill_lock);
    536 
    537 	DTRACE_PROBE4(ill__ring__add, char *, ill->ill_name, ill_t *, ill, int,
    538 	    ip_rx_index, void *, mrfp->mrf_rx_arg);
    539 
    540 	/* Assign the squeue to the specified CPU as well */
    541 	mutex_enter(&cpu_lock);
    542 	(void) ip_squeue_bind_ring(ill, rx_ring, mrfp->mrf_cpu_id);
    543 	mutex_exit(&cpu_lock);
    544 
    545 	return (rx_ring);
    546 }
    547 
    548 /*
    549  * sanitize the squeue etc. Some of the processing
    550  * needs to be done from inside the perimeter.
    551  */
    552 void
    553 ip_squeue_clean_ring(ill_t *ill, ill_rx_ring_t *rx_ring)
    554 {
    555 	squeue_t *sqp;
    556 
    557 	ASSERT(ILL_MAC_PERIM_HELD(ill));
    558 	ASSERT(rx_ring != NULL);
    559 
    560 	/* Just clean one squeue */
    561 	mutex_enter(&ill->ill_lock);
    562 	if (rx_ring->rr_ring_state == RR_FREE) {
    563 		mutex_exit(&ill->ill_lock);
    564 		return;
    565 	}
    566 	rx_ring->rr_ring_state = RR_FREE_INPROG;
    567 	sqp = rx_ring->rr_sqp;
    568 
    569 	mutex_enter(&sqp->sq_lock);
    570 	sqp->sq_state |= SQS_POLL_CLEANUP;
    571 	cv_signal(&sqp->sq_worker_cv);
    572 	mutex_exit(&ill->ill_lock);
    573 	while (!(sqp->sq_state & SQS_POLL_CLEANUP_DONE))
    574 		cv_wait(&sqp->sq_ctrlop_done_cv, &sqp->sq_lock);
    575 	sqp->sq_state &= ~SQS_POLL_CLEANUP_DONE;
    576 
    577 	ASSERT(!(sqp->sq_state & (SQS_POLL_THR_CONTROL |
    578 	    SQS_WORKER_THR_CONTROL | SQS_POLL_QUIESCE_DONE |
    579 	    SQS_POLL_THR_QUIESCED)));
    580 
    581 	cv_signal(&sqp->sq_worker_cv);
    582 	mutex_exit(&sqp->sq_lock);
    583 
    584 	/*
    585 	 * Move the squeue to sqset_global_list[0] which holds the set of
    586 	 * squeues not bound to any cpu. Note that the squeue is still
    587 	 * considered bound to an ill as long as SQS_ILL_BOUND is set.
    588 	 */
    589 	mutex_enter(&sqset_lock);
    590 	ip_squeue_set_move(sqp, sqset_global_list[0]);
    591 	mutex_exit(&sqset_lock);
    592 
    593 	/*
    594 	 * CPU going offline can also trigger a move of the squeue to the
    595 	 * unbound set sqset_global_list[0]. However the squeue won't be
    596 	 * recycled for the next use as long as the SQS_ILL_BOUND flag
    597 	 * is set. Hence we clear the SQS_ILL_BOUND flag only towards the
    598 	 * end after the move.
    599 	 */
    600 	mutex_enter(&sqp->sq_lock);
    601 	sqp->sq_state &= ~SQS_ILL_BOUND;
    602 	mutex_exit(&sqp->sq_lock);
    603 
    604 	mutex_enter(&ill->ill_lock);
    605 	rx_ring->rr_ring_state = RR_FREE;
    606 	mutex_exit(&ill->ill_lock);
    607 }
    608 
    609 /*
    610  * Stop the squeue from polling. This needs to be done
    611  * from inside the perimeter.
    612  */
    613 void
    614 ip_squeue_quiesce_ring(ill_t *ill, ill_rx_ring_t *rx_ring)
    615 {
    616 	squeue_t *sqp;
    617 
    618 	ASSERT(ILL_MAC_PERIM_HELD(ill));
    619 	ASSERT(rx_ring != NULL);
    620 
    621 	sqp = rx_ring->rr_sqp;
    622 	mutex_enter(&sqp->sq_lock);
    623 	sqp->sq_state |= SQS_POLL_QUIESCE;
    624 	cv_signal(&sqp->sq_worker_cv);
    625 	while (!(sqp->sq_state & SQS_POLL_QUIESCE_DONE))
    626 		cv_wait(&sqp->sq_ctrlop_done_cv, &sqp->sq_lock);
    627 
    628 	mutex_exit(&sqp->sq_lock);
    629 }
    630 
    631 /*
    632  * Restart polling etc. Needs to be inside the perimeter to
    633  * prevent races.
    634  */
    635 void
    636 ip_squeue_restart_ring(ill_t *ill, ill_rx_ring_t *rx_ring)
    637 {
    638 	squeue_t *sqp;
    639 
    640 	ASSERT(ILL_MAC_PERIM_HELD(ill));
    641 	ASSERT(rx_ring != NULL);
    642 
    643 	sqp = rx_ring->rr_sqp;
    644 	mutex_enter(&sqp->sq_lock);
    645 	/*
    646 	 * Handle change in number of rings between the quiesce and
    647 	 * restart operations by checking for a previous quiesce before
    648 	 * attempting a restart.
    649 	 */
    650 	if (!(sqp->sq_state & SQS_POLL_QUIESCE_DONE)) {
    651 		mutex_exit(&sqp->sq_lock);
    652 		return;
    653 	}
    654 	sqp->sq_state |= SQS_POLL_RESTART;
    655 	cv_signal(&sqp->sq_worker_cv);
    656 	while (!(sqp->sq_state & SQS_POLL_RESTART_DONE))
    657 		cv_wait(&sqp->sq_ctrlop_done_cv, &sqp->sq_lock);
    658 	sqp->sq_state &= ~SQS_POLL_RESTART_DONE;
    659 	mutex_exit(&sqp->sq_lock);
    660 }
    661 
    662 /*
    663  * sanitize all squeues associated with the ill.
    664  */
    665 void
    666 ip_squeue_clean_all(ill_t *ill)
    667 {
    668 	int idx;
    669 	ill_rx_ring_t	*rx_ring;
    670 
    671 	for (idx = 0; idx < ILL_MAX_RINGS; idx++) {
    672 		rx_ring = &ill->ill_dld_capab->idc_poll.idp_ring_tbl[idx];
    673 		ip_squeue_clean_ring(ill, rx_ring);
    674 	}
    675 }
    676 
    677 /*
    678  * Used by IP to get the squeue associated with a ring. If the squeue isn't
    679  * yet bound to a CPU, and we're being called directly from the NIC's
    680  * interrupt, then we know what CPU we want to assign the squeue to, so
    681  * dispatch that task to a taskq.
    682  */
    683 squeue_t *
    684 ip_squeue_get(ill_rx_ring_t *ill_rx_ring)
    685 {
    686 	squeue_t 	*sqp;
    687 
    688 	if ((ill_rx_ring == NULL) || ((sqp = ill_rx_ring->rr_sqp) == NULL))
    689 		return (IP_SQUEUE_GET(CPU_PSEUDO_RANDOM()));
    690 
    691 	return (sqp);
    692 }
    693 
    694 /*
    695  * Called when a CPU goes offline. It's squeue_set_t is destroyed, and all
    696  * squeues are unboudn and moved to the unbound set.
    697  */
    698 static void
    699 ip_squeue_set_destroy(cpu_t *cpu)
    700 {
    701 	int i;
    702 	squeue_t *sqp, *lastsqp = NULL;
    703 	squeue_set_t *sqs, *unbound = sqset_global_list[0];
    704 
    705 	mutex_enter(&sqset_lock);
    706 	if ((sqs = cpu->cpu_squeue_set) == NULL) {
    707 		mutex_exit(&sqset_lock);
    708 		return;
    709 	}
    710 
    711 	/* Move all squeues to unbound set */
    712 
    713 	for (sqp = sqs->sqs_head; sqp; lastsqp = sqp, sqp = sqp->sq_next) {
    714 		squeue_unbind(sqp);
    715 		sqp->sq_set = unbound;
    716 	}
    717 	if (sqs->sqs_head) {
    718 		lastsqp->sq_next = unbound->sqs_head;
    719 		unbound->sqs_head = sqs->sqs_head;
    720 	}
    721 
    722 	/* Also move default squeue to unbound set */
    723 
    724 	sqp = sqs->sqs_default;
    725 	ASSERT(sqp != NULL);
    726 	ASSERT((sqp->sq_state & (SQS_DEFAULT|SQS_ILL_BOUND)) == SQS_DEFAULT);
    727 
    728 	sqp->sq_next = unbound->sqs_head;
    729 	unbound->sqs_head = sqp;
    730 	squeue_unbind(sqp);
    731 	sqp->sq_set = unbound;
    732 
    733 	for (i = 1; i < sqset_global_size; i++)
    734 		if (sqset_global_list[i] == sqs)
    735 			break;
    736 
    737 	ASSERT(i < sqset_global_size);
    738 	sqset_global_list[i] = sqset_global_list[sqset_global_size - 1];
    739 	sqset_global_list[sqset_global_size - 1] = NULL;
    740 	sqset_global_size--;
    741 
    742 	mutex_exit(&sqset_lock);
    743 	kmem_free(sqs, sizeof (*sqs));
    744 }
    745 
    746 /*
    747  * Reconfiguration callback
    748  */
    749 /* ARGSUSED */
    750 static int
    751 ip_squeue_cpu_setup(cpu_setup_t what, int id, void *arg)
    752 {
    753 	cpu_t *cp = cpu_get(id);
    754 
    755 	ASSERT(MUTEX_HELD(&cpu_lock));
    756 	switch (what) {
    757 	case CPU_CONFIG:
    758 	case CPU_ON:
    759 	case CPU_INIT:
    760 	case CPU_CPUPART_IN:
    761 		if (CPU_ISON(cp) && cp->cpu_squeue_set == NULL)
    762 			cp->cpu_squeue_set = ip_squeue_set_create(cp->cpu_id);
    763 		break;
    764 	case CPU_UNCONFIG:
    765 	case CPU_OFF:
    766 	case CPU_CPUPART_OUT:
    767 		if (cp->cpu_squeue_set != NULL) {
    768 			ip_squeue_set_destroy(cp);
    769 			cp->cpu_squeue_set = NULL;
    770 		}
    771 		break;
    772 	default:
    773 		break;
    774 	}
    775 	return (0);
    776 }
    777