Home | History | Annotate | Download | only in os
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #include <sys/callo.h>
     27 #include <sys/param.h>
     28 #include <sys/types.h>
     29 #include <sys/cpuvar.h>
     30 #include <sys/thread.h>
     31 #include <sys/kmem.h>
     32 #include <sys/kmem_impl.h>
     33 #include <sys/cmn_err.h>
     34 #include <sys/callb.h>
     35 #include <sys/debug.h>
     36 #include <sys/vtrace.h>
     37 #include <sys/sysmacros.h>
     38 #include <sys/sdt.h>
     39 
     40 /*
     41  * Callout tables.  See timeout(9F) for details.
     42  */
     43 static int callout_threads;			/* callout normal threads */
     44 static hrtime_t callout_debug_hrtime;		/* debugger entry time */
     45 static int callout_min_reap;			/* callout minimum reap count */
     46 static int callout_tolerance;			/* callout hires tolerance */
     47 static callout_table_t *callout_boot_ct;	/* Boot CPU's callout tables */
     48 static clock_t callout_max_ticks;		/* max interval */
     49 static hrtime_t callout_longterm;		/* longterm nanoseconds */
     50 static ulong_t callout_counter_low;		/* callout ID increment */
     51 static ulong_t callout_table_bits;		/* number of table bits in ID */
     52 static ulong_t callout_table_mask;		/* mask for the table bits */
     53 static callout_cache_t *callout_caches;		/* linked list of caches */
     54 #pragma align 64(callout_table)
     55 static callout_table_t *callout_table;		/* global callout table array */
     56 
     57 /*
     58  * We run 'realtime' callouts at PIL 1 (CY_LOW_LEVEL). For 'normal'
     59  * callouts, from PIL 10 (CY_LOCK_LEVEL) we dispatch the callout,
     60  * via taskq, to a thread that executes at PIL 0 - so we end up running
     61  * 'normal' callouts at PIL 0.
     62  */
     63 static volatile int callout_realtime_level = CY_LOW_LEVEL;
     64 static volatile int callout_normal_level = CY_LOCK_LEVEL;
     65 
     66 static char *callout_kstat_names[] = {
     67 	"callout_timeouts",
     68 	"callout_timeouts_pending",
     69 	"callout_untimeouts_unexpired",
     70 	"callout_untimeouts_executing",
     71 	"callout_untimeouts_expired",
     72 	"callout_expirations",
     73 	"callout_allocations",
     74 	"callout_cleanups",
     75 };
     76 
     77 static hrtime_t	callout_heap_process(callout_table_t *, hrtime_t, int);
     78 
     79 #define	CALLOUT_HASH_INSERT(hash, cp, cnext, cprev)	\
     80 {							\
     81 	callout_hash_t *hashp = &(hash);		\
     82 							\
     83 	cp->cprev = NULL;				\
     84 	cp->cnext = hashp->ch_head;			\
     85 	if (hashp->ch_head == NULL)			\
     86 		hashp->ch_tail = cp;			\
     87 	else						\
     88 		cp->cnext->cprev = cp;			\
     89 	hashp->ch_head = cp;				\
     90 }
     91 
     92 #define	CALLOUT_HASH_APPEND(hash, cp, cnext, cprev)	\
     93 {							\
     94 	callout_hash_t *hashp = &(hash);		\
     95 							\
     96 	cp->cnext = NULL;				\
     97 	cp->cprev = hashp->ch_tail;			\
     98 	if (hashp->ch_tail == NULL)			\
     99 		hashp->ch_head = cp;			\
    100 	else						\
    101 		cp->cprev->cnext = cp;			\
    102 	hashp->ch_tail = cp;				\
    103 }
    104 
    105 #define	CALLOUT_HASH_DELETE(hash, cp, cnext, cprev)	\
    106 {							\
    107 	callout_hash_t *hashp = &(hash);		\
    108 							\
    109 	if (cp->cnext == NULL)				\
    110 		hashp->ch_tail = cp->cprev;		\
    111 	else						\
    112 		cp->cnext->cprev = cp->cprev;		\
    113 	if (cp->cprev == NULL)				\
    114 		hashp->ch_head = cp->cnext;		\
    115 	else						\
    116 		cp->cprev->cnext = cp->cnext;		\
    117 }
    118 
    119 /*
    120  * These definitions help us queue callouts and callout lists. Here is
    121  * the queueing rationale:
    122  *
    123  *	- callouts are queued in a FIFO manner in the ID hash table.
    124  *	  TCP timers are typically cancelled in the same order that they
    125  *	  were issued. The FIFO queueing shortens the search for a callout
    126  *	  during untimeout().
    127  *
    128  *	- callouts are queued in a FIFO manner in their callout lists.
    129  *	  This ensures that the callouts are executed in the same order that
    130  *	  they were queued. This is fair. Plus, it helps to make each
    131  *	  callout expiration timely. It also favors cancellations.
    132  *
    133  *	- callout lists are queued in the following manner in the callout
    134  *	  hash table buckets:
    135  *
    136  *		- appended, if the callout list is a 1-nanosecond resolution
    137  *		  callout list. When a callout is created, we first look for
    138  *		  a callout list that has the same expiration so we can avoid
    139  *		  allocating a callout list and inserting the expiration into
    140  *		  the heap. However, we do not want to look at 1-nanosecond
    141  *		  resolution callout lists as we will seldom find a match in
    142  *		  them. Keeping these callout lists in the rear of the hash
    143  *		  buckets allows us to skip these during the lookup.
    144  *
    145  *		- inserted at the beginning, if the callout list is not a
    146  *		  1-nanosecond resolution callout list. This also has the
    147  *		  side-effect of keeping the long term timers away from the
    148  *		  front of the buckets.
    149  *
    150  *	- callout lists are queued in a FIFO manner in the expired callouts
    151  *	  list. This ensures that callout lists are executed in the order
    152  *	  of expiration.
    153  */
    154 #define	CALLOUT_APPEND(ct, cp)						\
    155 	CALLOUT_HASH_APPEND(ct->ct_idhash[CALLOUT_IDHASH(cp->c_xid)],	\
    156 		cp, c_idnext, c_idprev);				\
    157 	CALLOUT_HASH_APPEND(cp->c_list->cl_callouts, cp, c_clnext, c_clprev)
    158 
    159 #define	CALLOUT_DELETE(ct, cp)						\
    160 	CALLOUT_HASH_DELETE(ct->ct_idhash[CALLOUT_IDHASH(cp->c_xid)],	\
    161 		cp, c_idnext, c_idprev);				\
    162 	CALLOUT_HASH_DELETE(cp->c_list->cl_callouts, cp, c_clnext, c_clprev)
    163 
    164 #define	CALLOUT_LIST_INSERT(hash, cl)				\
    165 	CALLOUT_HASH_INSERT(hash, cl, cl_next, cl_prev)
    166 
    167 #define	CALLOUT_LIST_APPEND(hash, cl)				\
    168 	CALLOUT_HASH_APPEND(hash, cl, cl_next, cl_prev)
    169 
    170 #define	CALLOUT_LIST_DELETE(hash, cl)				\
    171 	CALLOUT_HASH_DELETE(hash, cl, cl_next, cl_prev)
    172 
    173 /*
    174  * For normal callouts, there is a deadlock scenario if two callouts that
    175  * have an inter-dependency end up on the same callout list. To break the
    176  * deadlock, you need two taskq threads running in parallel. We compute
    177  * the number of taskq threads here using a bunch of conditions to make
    178  * it optimal for the common case. This is an ugly hack, but one that is
    179  * necessary (sigh).
    180  */
    181 #define	CALLOUT_THRESHOLD	100000000
    182 #define	CALLOUT_EXEC_COMPUTE(ct, exec)					\
    183 {									\
    184 	callout_list_t *cl;						\
    185 									\
    186 	cl = ct->ct_expired.ch_head;					\
    187 	if (cl == NULL) {						\
    188 		/*							\
    189 		 * If the expired list is NULL, there is nothing to	\
    190 		 * process.						\
    191 		 */							\
    192 		exec = 0;						\
    193 	} else if ((cl->cl_next == NULL) &&				\
    194 	    (cl->cl_callouts.ch_head == cl->cl_callouts.ch_tail)) {	\
    195 		/*							\
    196 		 * If there is only one callout list and it contains	\
    197 		 * only one callout, there is no need for two threads.	\
    198 		 */							\
    199 		exec = 1;						\
    200 	} else if ((ct->ct_heap_num == 0) ||				\
    201 	    (ct->ct_heap[0].ch_expiration > gethrtime() + CALLOUT_THRESHOLD)) {\
    202 		/*							\
    203 		 * If the heap has become empty, we need two threads as	\
    204 		 * there is no one to kick off the second thread in the	\
    205 		 * future. If the heap is not empty and the top of the	\
    206 		 * heap does not expire in the near future, we need two	\
    207 		 * threads.						\
    208 		 */							\
    209 		exec = 2;						\
    210 	} else {							\
    211 		/*							\
    212 		 * We have multiple callouts to process. But the cyclic	\
    213 		 * will fire in the near future. So, we only need one	\
    214 		 * thread for now.					\
    215 		 */							\
    216 		exec = 1;						\
    217 	}								\
    218 }
    219 
    220 /*
    221  * Macro to swap two heap items.
    222  */
    223 #define	CALLOUT_SWAP(h1, h2)		\
    224 {					\
    225 	callout_heap_t tmp;		\
    226 					\
    227 	tmp = *h1;			\
    228 	*h1 = *h2;			\
    229 	*h2 = tmp;			\
    230 }
    231 
    232 /*
    233  * Macro to free a callout list.
    234  */
    235 #define	CALLOUT_LIST_FREE(ct, cl)			\
    236 {							\
    237 	cl->cl_next = ct->ct_lfree;			\
    238 	ct->ct_lfree = cl;				\
    239 	cl->cl_flags |= CALLOUT_LIST_FLAG_FREE;		\
    240 }
    241 
    242 /*
    243  * Allocate a callout structure.  We try quite hard because we
    244  * can't sleep, and if we can't do the allocation, we're toast.
    245  * Failing all, we try a KM_PANIC allocation. Note that we never
    246  * deallocate a callout. See untimeout() for the reasoning.
    247  */
    248 static callout_t *
    249 callout_alloc(callout_table_t *ct)
    250 {
    251 	size_t size;
    252 	callout_t *cp;
    253 
    254 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
    255 	mutex_exit(&ct->ct_mutex);
    256 
    257 	cp = kmem_cache_alloc(ct->ct_cache, KM_NOSLEEP);
    258 	if (cp == NULL) {
    259 		size = sizeof (callout_t);
    260 		cp = kmem_alloc_tryhard(size, &size, KM_NOSLEEP | KM_PANIC);
    261 	}
    262 	cp->c_xid = 0;
    263 	cp->c_executor = NULL;
    264 	cv_init(&cp->c_done, NULL, CV_DEFAULT, NULL);
    265 	cp->c_waiting = 0;
    266 
    267 	mutex_enter(&ct->ct_mutex);
    268 	ct->ct_allocations++;
    269 	return (cp);
    270 }
    271 
    272 /*
    273  * Allocate a callout list structure.  We try quite hard because we
    274  * can't sleep, and if we can't do the allocation, we're toast.
    275  * Failing all, we try a KM_PANIC allocation. Note that we never
    276  * deallocate a callout list.
    277  */
    278 static void
    279 callout_list_alloc(callout_table_t *ct)
    280 {
    281 	size_t size;
    282 	callout_list_t *cl;
    283 
    284 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
    285 	mutex_exit(&ct->ct_mutex);
    286 
    287 	cl = kmem_cache_alloc(ct->ct_lcache, KM_NOSLEEP);
    288 	if (cl == NULL) {
    289 		size = sizeof (callout_list_t);
    290 		cl = kmem_alloc_tryhard(size, &size, KM_NOSLEEP | KM_PANIC);
    291 	}
    292 	bzero(cl, sizeof (callout_list_t));
    293 
    294 	mutex_enter(&ct->ct_mutex);
    295 	CALLOUT_LIST_FREE(ct, cl);
    296 }
    297 
    298 /*
    299  * Find a callout list that corresponds to an expiration and matching flags.
    300  */
    301 static callout_list_t *
    302 callout_list_get(callout_table_t *ct, hrtime_t expiration, int flags, int hash)
    303 {
    304 	callout_list_t *cl;
    305 	int clflags;
    306 
    307 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
    308 
    309 	if (flags & CALLOUT_LIST_FLAG_NANO) {
    310 		/*
    311 		 * This is a 1-nanosecond resolution callout. We will rarely
    312 		 * find a match for this. So, bail out.
    313 		 */
    314 		return (NULL);
    315 	}
    316 
    317 	clflags = (CALLOUT_LIST_FLAG_ABSOLUTE | CALLOUT_LIST_FLAG_HRESTIME);
    318 	for (cl = ct->ct_clhash[hash].ch_head; (cl != NULL); cl = cl->cl_next) {
    319 		/*
    320 		 * If we have reached a 1-nanosecond resolution callout list,
    321 		 * we don't have much hope of finding a match in this hash
    322 		 * bucket. So, just bail out.
    323 		 */
    324 		if (cl->cl_flags & CALLOUT_LIST_FLAG_NANO)
    325 			return (NULL);
    326 
    327 		if ((cl->cl_expiration == expiration) &&
    328 		    ((cl->cl_flags & clflags) == (flags & clflags)))
    329 			return (cl);
    330 	}
    331 
    332 	return (NULL);
    333 }
    334 
    335 /*
    336  * Initialize a callout table's heap, if necessary. Preallocate some free
    337  * entries so we don't have to check for NULL elsewhere.
    338  */
    339 static void
    340 callout_heap_init(callout_table_t *ct)
    341 {
    342 	size_t size;
    343 
    344 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
    345 	ASSERT(ct->ct_heap == NULL);
    346 
    347 	ct->ct_heap_num = 0;
    348 	ct->ct_heap_max = CALLOUT_CHUNK;
    349 	size = sizeof (callout_heap_t) * CALLOUT_CHUNK;
    350 	ct->ct_heap = kmem_alloc(size, KM_SLEEP);
    351 }
    352 
    353 /*
    354  * Reallocate the heap. We try quite hard because we can't sleep, and if
    355  * we can't do the allocation, we're toast. Failing all, we try a KM_PANIC
    356  * allocation. Note that the heap only expands, it never contracts.
    357  */
    358 static void
    359 callout_heap_expand(callout_table_t *ct)
    360 {
    361 	size_t max, size, osize;
    362 	callout_heap_t *heap;
    363 
    364 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
    365 	ASSERT(ct->ct_heap_num <= ct->ct_heap_max);
    366 
    367 	while (ct->ct_heap_num == ct->ct_heap_max) {
    368 		max = ct->ct_heap_max;
    369 		mutex_exit(&ct->ct_mutex);
    370 
    371 		osize = sizeof (callout_heap_t) * max;
    372 		size = sizeof (callout_heap_t) * (max + CALLOUT_CHUNK);
    373 		heap = kmem_alloc_tryhard(size, &size, KM_NOSLEEP | KM_PANIC);
    374 
    375 		mutex_enter(&ct->ct_mutex);
    376 		if (max < ct->ct_heap_max) {
    377 			/*
    378 			 * Someone beat us to the allocation. Free what we
    379 			 * just allocated and proceed.
    380 			 */
    381 			kmem_free(heap, size);
    382 			continue;
    383 		}
    384 
    385 		bcopy(ct->ct_heap, heap, osize);
    386 		kmem_free(ct->ct_heap, osize);
    387 		ct->ct_heap = heap;
    388 		ct->ct_heap_max = size / sizeof (callout_heap_t);
    389 	}
    390 }
    391 
    392 /*
    393  * Move an expiration from the bottom of the heap to its correct place
    394  * in the heap. If we reached the root doing this, return 1. Else,
    395  * return 0.
    396  */
    397 static int
    398 callout_upheap(callout_table_t *ct)
    399 {
    400 	int current, parent;
    401 	callout_heap_t *heap, *hcurrent, *hparent;
    402 
    403 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
    404 	ASSERT(ct->ct_heap_num >= 1);
    405 
    406 	if (ct->ct_heap_num == 1) {
    407 		return (1);
    408 	}
    409 
    410 	heap = ct->ct_heap;
    411 	current = ct->ct_heap_num - 1;
    412 
    413 	for (;;) {
    414 		parent = CALLOUT_HEAP_PARENT(current);
    415 		hparent = &heap[parent];
    416 		hcurrent = &heap[current];
    417 
    418 		/*
    419 		 * We have an expiration later than our parent; we're done.
    420 		 */
    421 		if (hcurrent->ch_expiration >= hparent->ch_expiration) {
    422 			return (0);
    423 		}
    424 
    425 		/*
    426 		 * We need to swap with our parent, and continue up the heap.
    427 		 */
    428 		CALLOUT_SWAP(hparent, hcurrent);
    429 
    430 		/*
    431 		 * If we just reached the root, we're done.
    432 		 */
    433 		if (parent == 0) {
    434 			return (1);
    435 		}
    436 
    437 		current = parent;
    438 	}
    439 	/*NOTREACHED*/
    440 }
    441 
    442 /*
    443  * Insert a new heap item into a callout table's heap.
    444  */
    445 static void
    446 callout_heap_insert(callout_table_t *ct, callout_list_t *cl)
    447 {
    448 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
    449 	ASSERT(ct->ct_heap_num < ct->ct_heap_max);
    450 
    451 	/*
    452 	 * First, copy the expiration and callout list pointer to the bottom
    453 	 * of the heap.
    454 	 */
    455 	ct->ct_heap[ct->ct_heap_num].ch_expiration = cl->cl_expiration;
    456 	ct->ct_heap[ct->ct_heap_num].ch_list = cl;
    457 	ct->ct_heap_num++;
    458 
    459 	/*
    460 	 * Now, perform an upheap operation. If we reached the root, then
    461 	 * the cyclic needs to be reprogrammed as we have an earlier
    462 	 * expiration.
    463 	 *
    464 	 * Also, during the CPR suspend phase, do not reprogram the cyclic.
    465 	 * We don't want any callout activity. When the CPR resume phase is
    466 	 * entered, the cyclic will be programmed for the earliest expiration
    467 	 * in the heap.
    468 	 */
    469 	if (callout_upheap(ct) && (ct->ct_suspend == 0))
    470 		(void) cyclic_reprogram(ct->ct_cyclic, cl->cl_expiration);
    471 }
    472 
    473 /*
    474  * Move an expiration from the top of the heap to its correct place
    475  * in the heap.
    476  */
    477 static void
    478 callout_downheap(callout_table_t *ct)
    479 {
    480 	int current, left, right, nelems;
    481 	callout_heap_t *heap, *hleft, *hright, *hcurrent;
    482 
    483 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
    484 	ASSERT(ct->ct_heap_num >= 1);
    485 
    486 	heap = ct->ct_heap;
    487 	current = 0;
    488 	nelems = ct->ct_heap_num;
    489 
    490 	for (;;) {
    491 		/*
    492 		 * If we don't have a left child (i.e., we're a leaf), we're
    493 		 * done.
    494 		 */
    495 		if ((left = CALLOUT_HEAP_LEFT(current)) >= nelems)
    496 			return;
    497 
    498 		hleft = &heap[left];
    499 		hcurrent = &heap[current];
    500 
    501 		right = CALLOUT_HEAP_RIGHT(current);
    502 
    503 		/*
    504 		 * Even if we don't have a right child, we still need to compare
    505 		 * our expiration against that of our left child.
    506 		 */
    507 		if (right >= nelems)
    508 			goto comp_left;
    509 
    510 		hright = &heap[right];
    511 
    512 		/*
    513 		 * We have both a left and a right child.  We need to compare
    514 		 * the expiration of the children to determine which
    515 		 * expires earlier.
    516 		 */
    517 		if (hright->ch_expiration < hleft->ch_expiration) {
    518 			/*
    519 			 * Our right child is the earlier of our children.
    520 			 * We'll now compare our expiration to its expiration.
    521 			 * If ours is the earlier one, we're done.
    522 			 */
    523 			if (hcurrent->ch_expiration <= hright->ch_expiration)
    524 				return;
    525 
    526 			/*
    527 			 * Our right child expires earlier than we do; swap
    528 			 * with our right child, and descend right.
    529 			 */
    530 			CALLOUT_SWAP(hright, hcurrent);
    531 			current = right;
    532 			continue;
    533 		}
    534 
    535 comp_left:
    536 		/*
    537 		 * Our left child is the earlier of our children (or we have
    538 		 * no right child).  We'll now compare our expiration
    539 		 * to its expiration. If ours is the earlier one, we're done.
    540 		 */
    541 		if (hcurrent->ch_expiration <= hleft->ch_expiration)
    542 			return;
    543 
    544 		/*
    545 		 * Our left child expires earlier than we do; swap with our
    546 		 * left child, and descend left.
    547 		 */
    548 		CALLOUT_SWAP(hleft, hcurrent);
    549 		current = left;
    550 	}
    551 }
    552 
    553 /*
    554  * Delete and handle all past expirations in a callout table's heap.
    555  */
    556 static void
    557 callout_heap_delete(callout_table_t *ct)
    558 {
    559 	hrtime_t now, expiration, next;
    560 	callout_list_t *cl;
    561 	callout_heap_t *heap;
    562 	int hash;
    563 
    564 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
    565 
    566 	if (CALLOUT_CLEANUP(ct)) {
    567 		/*
    568 		 * There are too many heap elements pointing to empty callout
    569 		 * lists. Clean them out.
    570 		 */
    571 		(void) callout_heap_process(ct, 0, 0);
    572 	}
    573 
    574 	now = gethrtime();
    575 	heap = ct->ct_heap;
    576 
    577 	while (ct->ct_heap_num > 0) {
    578 		expiration = heap->ch_expiration;
    579 		hash = CALLOUT_CLHASH(expiration);
    580 		cl = heap->ch_list;
    581 		ASSERT(expiration == cl->cl_expiration);
    582 
    583 		if (cl->cl_callouts.ch_head == NULL) {
    584 			/*
    585 			 * If the callout list is empty, reap it.
    586 			 * Decrement the reap count.
    587 			 */
    588 			CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
    589 			CALLOUT_LIST_FREE(ct, cl);
    590 			ct->ct_nreap--;
    591 		} else {
    592 			/*
    593 			 * If the root of the heap expires in the future,
    594 			 * bail out.
    595 			 */
    596 			if (expiration > now)
    597 				break;
    598 
    599 			/*
    600 			 * Move the callout list for this expiration to the
    601 			 * list of expired callout lists. It will be processed
    602 			 * by the callout executor.
    603 			 */
    604 			CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
    605 			CALLOUT_LIST_APPEND(ct->ct_expired, cl);
    606 		}
    607 
    608 		/*
    609 		 * Now delete the root. This is done by swapping the root with
    610 		 * the last item in the heap and downheaping the item.
    611 		 */
    612 		ct->ct_heap_num--;
    613 		if (ct->ct_heap_num > 0) {
    614 			heap[0] = heap[ct->ct_heap_num];
    615 			callout_downheap(ct);
    616 		}
    617 	}
    618 
    619 	/*
    620 	 * If this callout table is empty or callouts have been suspended,
    621 	 * just return. The cyclic has already been programmed to
    622 	 * infinity by the cyclic subsystem.
    623 	 */
    624 	if ((ct->ct_heap_num == 0) || (ct->ct_suspend > 0))
    625 		return;
    626 
    627 	/*
    628 	 * If the top expirations are within callout_tolerance of each other,
    629 	 * delay the cyclic expire so that they can be processed together.
    630 	 * This is to prevent high resolution timers from swamping the system
    631 	 * with cyclic activity.
    632 	 */
    633 	if (ct->ct_heap_num > 2) {
    634 		next = expiration + callout_tolerance;
    635 		if ((heap[1].ch_expiration < next) ||
    636 		    (heap[2].ch_expiration < next))
    637 			expiration = next;
    638 	}
    639 
    640 	(void) cyclic_reprogram(ct->ct_cyclic, expiration);
    641 }
    642 
    643 /*
    644  * There are some situations when the entire heap is walked and processed.
    645  * This function is called to do the processing. These are the situations:
    646  *
    647  * 1. When the reap count reaches its threshold, the heap has to be cleared
    648  *    of all empty callout lists.
    649  *
    650  * 2. When the system enters and exits KMDB/OBP, all entries in the heap
    651  *    need to be adjusted by the interval spent in KMDB/OBP.
    652  *
    653  * 3. When system time is changed, the heap has to be scanned for
    654  *    absolute hrestime timers. These need to be removed from the heap
    655  *    and expired immediately.
    656  *
    657  * In cases 2 and 3, it is a good idea to do 1 as well since we are
    658  * scanning the heap anyway.
    659  *
    660  * If the root gets changed and/or callout lists are expired, return the
    661  * new expiration to the caller so he can reprogram the cyclic accordingly.
    662  */
    663 static hrtime_t
    664 callout_heap_process(callout_table_t *ct, hrtime_t delta, int timechange)
    665 {
    666 	callout_heap_t *heap;
    667 	callout_list_t *cl, *rootcl;
    668 	hrtime_t expiration, now;
    669 	int i, hash, clflags, expired;
    670 	ulong_t num;
    671 
    672 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
    673 
    674 	if (ct->ct_heap_num == 0)
    675 		return (0);
    676 
    677 	if (ct->ct_nreap > 0)
    678 		ct->ct_cleanups++;
    679 
    680 	heap = ct->ct_heap;
    681 	rootcl = heap->ch_list;
    682 
    683 	/*
    684 	 * We walk the heap from the top to the bottom. If we encounter
    685 	 * a heap item that points to an empty callout list, we clean
    686 	 * it out. If we encounter a hrestime entry that must be removed,
    687 	 * again we clean it out. Otherwise, we apply any adjustments needed
    688 	 * to an element.
    689 	 *
    690 	 * During the walk, we also compact the heap from the bottom and
    691 	 * reconstruct the heap using upheap operations. This is very
    692 	 * efficient if the number of elements to be cleaned is greater than
    693 	 * or equal to half the heap. This is the common case.
    694 	 *
    695 	 * Even in the non-common case, the upheap operations should be short
    696 	 * as the entries below generally tend to be bigger than the entries
    697 	 * above.
    698 	 */
    699 	num = ct->ct_heap_num;
    700 	ct->ct_heap_num = 0;
    701 	clflags = (CALLOUT_LIST_FLAG_HRESTIME | CALLOUT_LIST_FLAG_ABSOLUTE);
    702 	now = gethrtime();
    703 	expired = 0;
    704 	for (i = 0; i < num; i++) {
    705 		cl = heap[i].ch_list;
    706 		/*
    707 		 * If the callout list is empty, delete the heap element and
    708 		 * free the callout list.
    709 		 */
    710 		if (cl->cl_callouts.ch_head == NULL) {
    711 			hash = CALLOUT_CLHASH(cl->cl_expiration);
    712 			CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
    713 			CALLOUT_LIST_FREE(ct, cl);
    714 			continue;
    715 		}
    716 
    717 		/*
    718 		 * Delete the heap element and expire the callout list, if
    719 		 * one of the following is true:
    720 		 *	- the callout list has expired
    721 		 *	- the callout list is an absolute hrestime one and
    722 		 *	  there has been a system time change
    723 		 */
    724 		if ((cl->cl_expiration <= now) ||
    725 		    (timechange && ((cl->cl_flags & clflags) == clflags))) {
    726 			hash = CALLOUT_CLHASH(cl->cl_expiration);
    727 			CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
    728 			CALLOUT_LIST_APPEND(ct->ct_expired, cl);
    729 			expired = 1;
    730 			continue;
    731 		}
    732 
    733 		/*
    734 		 * Apply adjustments, if any. Adjustments are applied after
    735 		 * the system returns from KMDB or OBP. They are only applied
    736 		 * to relative callout lists.
    737 		 */
    738 		if (delta && !(cl->cl_flags & CALLOUT_LIST_FLAG_ABSOLUTE)) {
    739 			hash = CALLOUT_CLHASH(cl->cl_expiration);
    740 			CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
    741 			expiration = cl->cl_expiration + delta;
    742 			if (expiration <= 0)
    743 				expiration = CY_INFINITY;
    744 			heap[i].ch_expiration = expiration;
    745 			cl->cl_expiration = expiration;
    746 			hash = CALLOUT_CLHASH(cl->cl_expiration);
    747 			if (cl->cl_flags & CALLOUT_LIST_FLAG_NANO) {
    748 				CALLOUT_LIST_APPEND(ct->ct_clhash[hash], cl);
    749 			} else {
    750 				CALLOUT_LIST_INSERT(ct->ct_clhash[hash], cl);
    751 			}
    752 		}
    753 
    754 		heap[ct->ct_heap_num] = heap[i];
    755 		ct->ct_heap_num++;
    756 		(void) callout_upheap(ct);
    757 	}
    758 
    759 	ct->ct_nreap = 0;
    760 
    761 	if (expired)
    762 		expiration = gethrtime();
    763 	else if (ct->ct_heap_num == 0)
    764 		expiration = CY_INFINITY;
    765 	else if (rootcl != heap->ch_list)
    766 		expiration = heap->ch_expiration;
    767 	else
    768 		expiration = 0;
    769 
    770 	return (expiration);
    771 }
    772 
    773 /*
    774  * Common function used to create normal and realtime callouts.
    775  *
    776  * Realtime callouts are handled at CY_LOW_PIL by a cyclic handler. So,
    777  * there is one restriction on a realtime callout handler - it should not
    778  * directly or indirectly acquire cpu_lock. CPU offline waits for pending
    779  * cyclic handlers to complete while holding cpu_lock. So, if a realtime
    780  * callout handler were to try to get cpu_lock, there would be a deadlock
    781  * during CPU offline.
    782  */
    783 callout_id_t
    784 timeout_generic(int type, void (*func)(void *), void *arg,
    785 	hrtime_t expiration, hrtime_t resolution, int flags)
    786 {
    787 	callout_table_t *ct;
    788 	callout_t *cp;
    789 	callout_id_t id;
    790 	callout_list_t *cl;
    791 	hrtime_t now, interval, rexpiration;
    792 	int hash, clflags;
    793 
    794 	ASSERT(resolution > 0);
    795 	ASSERT(func != NULL);
    796 
    797 	/*
    798 	 * We get the current hrtime right upfront so that latencies in
    799 	 * this function do not affect the accuracy of the callout.
    800 	 */
    801 	now = gethrtime();
    802 
    803 	/*
    804 	 * We disable kernel preemption so that we remain on the same CPU
    805 	 * throughout. If we needed to reprogram the callout table's cyclic,
    806 	 * we can avoid X-calls if we are on the same CPU.
    807 	 *
    808 	 * Note that callout_alloc() releases and reacquires the callout
    809 	 * table mutex. While reacquiring the mutex, it is possible for us
    810 	 * to go to sleep and later migrate to another CPU. This should be
    811 	 * pretty rare, though.
    812 	 */
    813 	kpreempt_disable();
    814 
    815 	ct = &callout_table[CALLOUT_TABLE(type, CPU->cpu_seqid)];
    816 	mutex_enter(&ct->ct_mutex);
    817 
    818 	if (ct->ct_cyclic == CYCLIC_NONE) {
    819 		mutex_exit(&ct->ct_mutex);
    820 		/*
    821 		 * The callout table has not yet been initialized fully.
    822 		 * So, put this one on the boot callout table which is
    823 		 * always initialized.
    824 		 */
    825 		ct = &callout_boot_ct[type];
    826 		mutex_enter(&ct->ct_mutex);
    827 	}
    828 
    829 	if (CALLOUT_CLEANUP(ct)) {
    830 		/*
    831 		 * There are too many heap elements pointing to empty callout
    832 		 * lists. Clean them out.
    833 		 */
    834 		rexpiration = callout_heap_process(ct, 0, 0);
    835 		if ((rexpiration != 0) && (ct->ct_suspend == 0))
    836 			(void) cyclic_reprogram(ct->ct_cyclic, rexpiration);
    837 	}
    838 
    839 	if ((cp = ct->ct_free) == NULL)
    840 		cp = callout_alloc(ct);
    841 	else
    842 		ct->ct_free = cp->c_idnext;
    843 
    844 	cp->c_func = func;
    845 	cp->c_arg = arg;
    846 
    847 	/*
    848 	 * Compute the expiration hrtime.
    849 	 */
    850 	if (flags & CALLOUT_FLAG_ABSOLUTE) {
    851 		interval = expiration - now;
    852 	} else {
    853 		interval = expiration;
    854 		expiration += now;
    855 	}
    856 
    857 	if (resolution > 1) {
    858 		/*
    859 		 * Align expiration to the specified resolution.
    860 		 */
    861 		if (flags & CALLOUT_FLAG_ROUNDUP)
    862 			expiration += resolution - 1;
    863 		expiration = (expiration / resolution) * resolution;
    864 	}
    865 
    866 	if (expiration <= 0) {
    867 		/*
    868 		 * expiration hrtime overflow has occurred. Just set the
    869 		 * expiration to infinity.
    870 		 */
    871 		expiration = CY_INFINITY;
    872 	}
    873 
    874 	/*
    875 	 * Assign an ID to this callout
    876 	 */
    877 	if (flags & CALLOUT_FLAG_32BIT) {
    878 		if (interval > callout_longterm) {
    879 			id = (ct->ct_long_id - callout_counter_low);
    880 			id |= CALLOUT_COUNTER_HIGH;
    881 			ct->ct_long_id = id;
    882 		} else {
    883 			id = (ct->ct_short_id - callout_counter_low);
    884 			id |= CALLOUT_COUNTER_HIGH;
    885 			ct->ct_short_id = id;
    886 		}
    887 	} else {
    888 		id = (ct->ct_gen_id - callout_counter_low);
    889 		if ((id & CALLOUT_COUNTER_HIGH) == 0) {
    890 			id |= CALLOUT_COUNTER_HIGH;
    891 			id += CALLOUT_GENERATION_LOW;
    892 		}
    893 		ct->ct_gen_id = id;
    894 	}
    895 
    896 	cp->c_xid = id;
    897 
    898 	clflags = 0;
    899 	if (flags & CALLOUT_FLAG_ABSOLUTE)
    900 		clflags |= CALLOUT_LIST_FLAG_ABSOLUTE;
    901 	if (flags & CALLOUT_FLAG_HRESTIME)
    902 		clflags |= CALLOUT_LIST_FLAG_HRESTIME;
    903 	if (resolution == 1)
    904 		clflags |= CALLOUT_LIST_FLAG_NANO;
    905 	hash = CALLOUT_CLHASH(expiration);
    906 
    907 again:
    908 	/*
    909 	 * Try to see if a callout list already exists for this expiration.
    910 	 */
    911 	cl = callout_list_get(ct, expiration, clflags, hash);
    912 	if (cl == NULL) {
    913 		/*
    914 		 * Check if we have enough space in the heap to insert one
    915 		 * expiration. If not, expand the heap.
    916 		 */
    917 		if (ct->ct_heap_num == ct->ct_heap_max) {
    918 			callout_heap_expand(ct);
    919 			/*
    920 			 * In the above call, we drop the lock, allocate and
    921 			 * reacquire the lock. So, we could have been away
    922 			 * for a while. In the meantime, someone could have
    923 			 * inserted a callout list with the same expiration.
    924 			 * So, the best course is to repeat the steps. This
    925 			 * should be an infrequent event.
    926 			 */
    927 			goto again;
    928 		}
    929 
    930 		/*
    931 		 * Check the free list. If we don't find one, we have to
    932 		 * take the slow path and allocate from kmem.
    933 		 */
    934 		if ((cl = ct->ct_lfree) == NULL) {
    935 			callout_list_alloc(ct);
    936 			/*
    937 			 * In the above call, we drop the lock, allocate and
    938 			 * reacquire the lock. So, we could have been away
    939 			 * for a while. In the meantime, someone could have
    940 			 * inserted a callout list with the same expiration.
    941 			 * Plus, the heap could have become full. So, the best
    942 			 * course is to repeat the steps. This should be an
    943 			 * infrequent event.
    944 			 */
    945 			goto again;
    946 		}
    947 		ct->ct_lfree = cl->cl_next;
    948 		cl->cl_expiration = expiration;
    949 		cl->cl_flags = clflags;
    950 
    951 		if (clflags & CALLOUT_LIST_FLAG_NANO) {
    952 			CALLOUT_LIST_APPEND(ct->ct_clhash[hash], cl);
    953 		} else {
    954 			CALLOUT_LIST_INSERT(ct->ct_clhash[hash], cl);
    955 		}
    956 
    957 		/*
    958 		 * This is a new expiration. So, insert it into the heap.
    959 		 * This will also reprogram the cyclic, if the expiration
    960 		 * propagated to the root of the heap.
    961 		 */
    962 		callout_heap_insert(ct, cl);
    963 	} else {
    964 		/*
    965 		 * If the callout list was empty, untimeout_generic() would
    966 		 * have incremented a reap count. Decrement the reap count
    967 		 * as we are going to insert a callout into this list.
    968 		 */
    969 		if (cl->cl_callouts.ch_head == NULL)
    970 			ct->ct_nreap--;
    971 	}
    972 	cp->c_list = cl;
    973 	CALLOUT_APPEND(ct, cp);
    974 
    975 	ct->ct_timeouts++;
    976 	ct->ct_timeouts_pending++;
    977 
    978 	mutex_exit(&ct->ct_mutex);
    979 
    980 	kpreempt_enable();
    981 
    982 	TRACE_4(TR_FAC_CALLOUT, TR_TIMEOUT,
    983 	    "timeout:%K(%p) in %llx expiration, cp %p", func, arg, expiration,
    984 	    cp);
    985 
    986 	return (id);
    987 }
    988 
    989 timeout_id_t
    990 timeout(void (*func)(void *), void *arg, clock_t delta)
    991 {
    992 	ulong_t id;
    993 
    994 	/*
    995 	 * Make sure the callout runs at least 1 tick in the future.
    996 	 */
    997 	if (delta <= 0)
    998 		delta = 1;
    999 	else if (delta > callout_max_ticks)
   1000 		delta = callout_max_ticks;
   1001 
   1002 	id =  (ulong_t)timeout_generic(CALLOUT_NORMAL, func, arg,
   1003 	    TICK_TO_NSEC(delta), nsec_per_tick, CALLOUT_LEGACY);
   1004 
   1005 	return ((timeout_id_t)id);
   1006 }
   1007 
   1008 /*
   1009  * Convenience function that creates a normal callout with default parameters
   1010  * and returns a full ID.
   1011  */
   1012 callout_id_t
   1013 timeout_default(void (*func)(void *), void *arg, clock_t delta)
   1014 {
   1015 	callout_id_t id;
   1016 
   1017 	/*
   1018 	 * Make sure the callout runs at least 1 tick in the future.
   1019 	 */
   1020 	if (delta <= 0)
   1021 		delta = 1;
   1022 	else if (delta > callout_max_ticks)
   1023 		delta = callout_max_ticks;
   1024 
   1025 	id = timeout_generic(CALLOUT_NORMAL, func, arg, TICK_TO_NSEC(delta),
   1026 	    nsec_per_tick, 0);
   1027 
   1028 	return (id);
   1029 }
   1030 
   1031 timeout_id_t
   1032 realtime_timeout(void (*func)(void *), void *arg, clock_t delta)
   1033 {
   1034 	ulong_t id;
   1035 
   1036 	/*
   1037 	 * Make sure the callout runs at least 1 tick in the future.
   1038 	 */
   1039 	if (delta <= 0)
   1040 		delta = 1;
   1041 	else if (delta > callout_max_ticks)
   1042 		delta = callout_max_ticks;
   1043 
   1044 	id =  (ulong_t)timeout_generic(CALLOUT_REALTIME, func, arg,
   1045 	    TICK_TO_NSEC(delta), nsec_per_tick, CALLOUT_LEGACY);
   1046 
   1047 	return ((timeout_id_t)id);
   1048 }
   1049 
   1050 /*
   1051  * Convenience function that creates a realtime callout with default parameters
   1052  * and returns a full ID.
   1053  */
   1054 callout_id_t
   1055 realtime_timeout_default(void (*func)(void *), void *arg, clock_t delta)
   1056 {
   1057 	callout_id_t id;
   1058 
   1059 	/*
   1060 	 * Make sure the callout runs at least 1 tick in the future.
   1061 	 */
   1062 	if (delta <= 0)
   1063 		delta = 1;
   1064 	else if (delta > callout_max_ticks)
   1065 		delta = callout_max_ticks;
   1066 
   1067 	id = timeout_generic(CALLOUT_REALTIME, func, arg, TICK_TO_NSEC(delta),
   1068 	    nsec_per_tick, 0);
   1069 
   1070 	return (id);
   1071 }
   1072 
   1073 hrtime_t
   1074 untimeout_generic(callout_id_t id, int nowait)
   1075 {
   1076 	callout_table_t *ct;
   1077 	callout_t *cp;
   1078 	callout_id_t xid;
   1079 	callout_list_t *cl;
   1080 	int hash;
   1081 	callout_id_t bogus;
   1082 
   1083 	ct = &callout_table[CALLOUT_ID_TO_TABLE(id)];
   1084 	hash = CALLOUT_IDHASH(id);
   1085 
   1086 	mutex_enter(&ct->ct_mutex);
   1087 
   1088 	/*
   1089 	 * Search the ID hash table for the callout.
   1090 	 */
   1091 	for (cp = ct->ct_idhash[hash].ch_head; cp; cp = cp->c_idnext) {
   1092 
   1093 		xid = cp->c_xid;
   1094 
   1095 		/*
   1096 		 * Match the ID and generation number.
   1097 		 */
   1098 		if ((xid & CALLOUT_ID_MASK) != id)
   1099 			continue;
   1100 
   1101 		if ((xid & CALLOUT_EXECUTING) == 0) {
   1102 			hrtime_t expiration;
   1103 
   1104 			/*
   1105 			 * Delete the callout. If the callout list becomes
   1106 			 * NULL, we don't remove it from the table. This is
   1107 			 * so it can be reused. If the empty callout list
   1108 			 * corresponds to the top of the the callout heap, we
   1109 			 * don't reprogram the table cyclic here. This is in
   1110 			 * order to avoid lots of X-calls to the CPU associated
   1111 			 * with the callout table.
   1112 			 */
   1113 			cl = cp->c_list;
   1114 			expiration = cl->cl_expiration;
   1115 			CALLOUT_DELETE(ct, cp);
   1116 			cp->c_idnext = ct->ct_free;
   1117 			ct->ct_free = cp;
   1118 			cp->c_xid |= CALLOUT_FREE;
   1119 			ct->ct_untimeouts_unexpired++;
   1120 			ct->ct_timeouts_pending--;
   1121 
   1122 			/*
   1123 			 * If the callout list has become empty, it needs
   1124 			 * to be cleaned along with its heap entry. Increment
   1125 			 * a reap count.
   1126 			 */
   1127 			if (cl->cl_callouts.ch_head == NULL)
   1128 				ct->ct_nreap++;
   1129 			mutex_exit(&ct->ct_mutex);
   1130 
   1131 			expiration -= gethrtime();
   1132 			TRACE_2(TR_FAC_CALLOUT, TR_UNTIMEOUT,
   1133 			    "untimeout:ID %lx hrtime left %llx", id,
   1134 			    expiration);
   1135 			return (expiration < 0 ? 0 : expiration);
   1136 		}
   1137 
   1138 		ct->ct_untimeouts_executing++;
   1139 		/*
   1140 		 * The callout we want to delete is currently executing.
   1141 		 * The DDI states that we must wait until the callout
   1142 		 * completes before returning, so we block on c_done until the
   1143 		 * callout ID changes (to the old ID if it's on the freelist,
   1144 		 * or to a new callout ID if it's in use).  This implicitly
   1145 		 * assumes that callout structures are persistent (they are).
   1146 		 */
   1147 		if (cp->c_executor == curthread) {
   1148 			/*
   1149 			 * The timeout handler called untimeout() on itself.
   1150 			 * Stupid, but legal.  We can't wait for the timeout
   1151 			 * to complete without deadlocking, so we just return.
   1152 			 */
   1153 			mutex_exit(&ct->ct_mutex);
   1154 			TRACE_1(TR_FAC_CALLOUT, TR_UNTIMEOUT_SELF,
   1155 			    "untimeout_self:ID %x", id);
   1156 			return (-1);
   1157 		}
   1158 		if (nowait == 0) {
   1159 			/*
   1160 			 * We need to wait. Indicate that we are waiting by
   1161 			 * incrementing c_waiting. This prevents the executor
   1162 			 * from doing a wakeup on c_done if there are no
   1163 			 * waiters.
   1164 			 */
   1165 			while (cp->c_xid == xid) {
   1166 				cp->c_waiting = 1;
   1167 				cv_wait(&cp->c_done, &ct->ct_mutex);
   1168 			}
   1169 		}
   1170 		mutex_exit(&ct->ct_mutex);
   1171 		TRACE_1(TR_FAC_CALLOUT, TR_UNTIMEOUT_EXECUTING,
   1172 		    "untimeout_executing:ID %lx", id);
   1173 		return (-1);
   1174 	}
   1175 	ct->ct_untimeouts_expired++;
   1176 
   1177 	mutex_exit(&ct->ct_mutex);
   1178 	TRACE_1(TR_FAC_CALLOUT, TR_UNTIMEOUT_BOGUS_ID,
   1179 	    "untimeout_bogus_id:ID %lx", id);
   1180 
   1181 	/*
   1182 	 * We didn't find the specified callout ID.  This means either
   1183 	 * (1) the callout already fired, or (2) the caller passed us
   1184 	 * a bogus value.  Perform a sanity check to detect case (2).
   1185 	 */
   1186 	bogus = (CALLOUT_ID_FLAGS | CALLOUT_COUNTER_HIGH);
   1187 	if (((id & bogus) != CALLOUT_COUNTER_HIGH) && (id != 0))
   1188 		panic("untimeout: impossible timeout id %llx",
   1189 		    (unsigned long long)id);
   1190 
   1191 	return (-1);
   1192 }
   1193 
   1194 clock_t
   1195 untimeout(timeout_id_t id_arg)
   1196 {
   1197 	hrtime_t hleft;
   1198 	clock_t tleft;
   1199 	callout_id_t id;
   1200 
   1201 	id = (ulong_t)id_arg;
   1202 	hleft = untimeout_generic(id, 0);
   1203 	if (hleft < 0)
   1204 		tleft = -1;
   1205 	else if (hleft == 0)
   1206 		tleft = 0;
   1207 	else
   1208 		tleft = NSEC_TO_TICK(hleft);
   1209 
   1210 	return (tleft);
   1211 }
   1212 
   1213 /*
   1214  * Convenience function to untimeout a timeout with a full ID with default
   1215  * parameters.
   1216  */
   1217 clock_t
   1218 untimeout_default(callout_id_t id, int nowait)
   1219 {
   1220 	hrtime_t hleft;
   1221 	clock_t tleft;
   1222 
   1223 	hleft = untimeout_generic(id, nowait);
   1224 	if (hleft < 0)
   1225 		tleft = -1;
   1226 	else if (hleft == 0)
   1227 		tleft = 0;
   1228 	else
   1229 		tleft = NSEC_TO_TICK(hleft);
   1230 
   1231 	return (tleft);
   1232 }
   1233 
   1234 /*
   1235  * Expire all the callouts queued in the specified callout list.
   1236  */
   1237 static void
   1238 callout_list_expire(callout_table_t *ct, callout_list_t *cl)
   1239 {
   1240 	callout_t *cp, *cnext;
   1241 
   1242 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
   1243 	ASSERT(cl != NULL);
   1244 
   1245 	for (cp = cl->cl_callouts.ch_head; cp != NULL; cp = cnext) {
   1246 		/*
   1247 		 * Multiple executor threads could be running at the same
   1248 		 * time. If this callout is already being executed,
   1249 		 * go on to the next one.
   1250 		 */
   1251 		if (cp->c_xid & CALLOUT_EXECUTING) {
   1252 			cnext = cp->c_clnext;
   1253 			continue;
   1254 		}
   1255 
   1256 		/*
   1257 		 * Indicate to untimeout() that a callout is
   1258 		 * being expired by the executor.
   1259 		 */
   1260 		cp->c_xid |= CALLOUT_EXECUTING;
   1261 		cp->c_executor = curthread;
   1262 		mutex_exit(&ct->ct_mutex);
   1263 
   1264 		DTRACE_PROBE1(callout__start, callout_t *, cp);
   1265 		(*cp->c_func)(cp->c_arg);
   1266 		DTRACE_PROBE1(callout__end, callout_t *, cp);
   1267 
   1268 		mutex_enter(&ct->ct_mutex);
   1269 
   1270 		ct->ct_expirations++;
   1271 		ct->ct_timeouts_pending--;
   1272 		/*
   1273 		 * Indicate completion for c_done.
   1274 		 */
   1275 		cp->c_xid &= ~CALLOUT_EXECUTING;
   1276 		cp->c_executor = NULL;
   1277 		cnext = cp->c_clnext;
   1278 
   1279 		/*
   1280 		 * Delete callout from ID hash table and the callout
   1281 		 * list, return to freelist, and tell any untimeout() that
   1282 		 * cares that we're done.
   1283 		 */
   1284 		CALLOUT_DELETE(ct, cp);
   1285 		cp->c_idnext = ct->ct_free;
   1286 		ct->ct_free = cp;
   1287 		cp->c_xid |= CALLOUT_FREE;
   1288 
   1289 		if (cp->c_waiting) {
   1290 			cp->c_waiting = 0;
   1291 			cv_broadcast(&cp->c_done);
   1292 		}
   1293 	}
   1294 }
   1295 
   1296 /*
   1297  * Execute all expired callout lists for a callout table.
   1298  */
   1299 static void
   1300 callout_expire(callout_table_t *ct)
   1301 {
   1302 	callout_list_t *cl, *clnext;
   1303 
   1304 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
   1305 
   1306 	for (cl = ct->ct_expired.ch_head; (cl != NULL); cl = clnext) {
   1307 		/*
   1308 		 * Expire all the callouts in this callout list.
   1309 		 */
   1310 		callout_list_expire(ct, cl);
   1311 
   1312 		clnext = cl->cl_next;
   1313 		if (cl->cl_callouts.ch_head == NULL) {
   1314 			/*
   1315 			 * Free the callout list.
   1316 			 */
   1317 			CALLOUT_LIST_DELETE(ct->ct_expired, cl);
   1318 			CALLOUT_LIST_FREE(ct, cl);
   1319 		}
   1320 	}
   1321 }
   1322 
   1323 /*
   1324  * The cyclic handlers below process callouts in two steps:
   1325  *
   1326  *	1. Find all expired callout lists and queue them in a separate
   1327  *	   list of expired callouts.
   1328  *	2. Execute the expired callout lists.
   1329  *
   1330  * This is done for two reasons:
   1331  *
   1332  *	1. We want to quickly find the next earliest expiration to program
   1333  *	   the cyclic to and reprogram it. We can do this right at the end
   1334  *	   of step 1.
   1335  *	2. The realtime cyclic handler expires callouts in place. However,
   1336  *	   for normal callouts, callouts are expired by a taskq thread.
   1337  *	   So, it is simpler and more robust to have the taskq thread just
   1338  *	   do step 2.
   1339  */
   1340 
   1341 /*
   1342  * Realtime callout cyclic handler.
   1343  */
   1344 void
   1345 callout_realtime(callout_table_t *ct)
   1346 {
   1347 	mutex_enter(&ct->ct_mutex);
   1348 	callout_heap_delete(ct);
   1349 	callout_expire(ct);
   1350 	mutex_exit(&ct->ct_mutex);
   1351 }
   1352 
   1353 void
   1354 callout_execute(callout_table_t *ct)
   1355 {
   1356 	mutex_enter(&ct->ct_mutex);
   1357 	callout_expire(ct);
   1358 	mutex_exit(&ct->ct_mutex);
   1359 }
   1360 
   1361 /*
   1362  * Normal callout cyclic handler.
   1363  */
   1364 void
   1365 callout_normal(callout_table_t *ct)
   1366 {
   1367 	int i, exec;
   1368 
   1369 	mutex_enter(&ct->ct_mutex);
   1370 	callout_heap_delete(ct);
   1371 	CALLOUT_EXEC_COMPUTE(ct, exec);
   1372 	mutex_exit(&ct->ct_mutex);
   1373 
   1374 	for (i = 0; i < exec; i++) {
   1375 		ASSERT(ct->ct_taskq != NULL);
   1376 		(void) taskq_dispatch(ct->ct_taskq,
   1377 		    (task_func_t *)callout_execute, ct, TQ_NOSLEEP);
   1378 	}
   1379 }
   1380 
   1381 /*
   1382  * Suspend callout processing.
   1383  */
   1384 static void
   1385 callout_suspend(void)
   1386 {
   1387 	int t, f;
   1388 	callout_table_t *ct;
   1389 
   1390 	/*
   1391 	 * Traverse every callout table in the system and suspend callout
   1392 	 * processing.
   1393 	 *
   1394 	 * We need to suspend all the tables (including the inactive ones)
   1395 	 * so that if a table is made active while the suspend is still on,
   1396 	 * the table remains suspended.
   1397 	 */
   1398 	for (f = 0; f < max_ncpus; f++) {
   1399 		for (t = 0; t < CALLOUT_NTYPES; t++) {
   1400 			ct = &callout_table[CALLOUT_TABLE(t, f)];
   1401 
   1402 			mutex_enter(&ct->ct_mutex);
   1403 			ct->ct_suspend++;
   1404 			if (ct->ct_cyclic == CYCLIC_NONE) {
   1405 				mutex_exit(&ct->ct_mutex);
   1406 				continue;
   1407 			}
   1408 			if (ct->ct_suspend == 1)
   1409 				(void) cyclic_reprogram(ct->ct_cyclic,
   1410 				    CY_INFINITY);
   1411 			mutex_exit(&ct->ct_mutex);
   1412 		}
   1413 	}
   1414 }
   1415 
   1416 /*
   1417  * Resume callout processing.
   1418  */
   1419 static void
   1420 callout_resume(hrtime_t delta, int timechange)
   1421 {
   1422 	hrtime_t exp;
   1423 	int t, f;
   1424 	callout_table_t *ct;
   1425 
   1426 	/*
   1427 	 * Traverse every callout table in the system and resume callout
   1428 	 * processing. For active tables, perform any hrtime adjustments
   1429 	 * necessary.
   1430 	 */
   1431 	for (f = 0; f < max_ncpus; f++) {
   1432 		for (t = 0; t < CALLOUT_NTYPES; t++) {
   1433 			ct = &callout_table[CALLOUT_TABLE(t, f)];
   1434 
   1435 			mutex_enter(&ct->ct_mutex);
   1436 			if (ct->ct_cyclic == CYCLIC_NONE) {
   1437 				ct->ct_suspend--;
   1438 				mutex_exit(&ct->ct_mutex);
   1439 				continue;
   1440 			}
   1441 
   1442 			/*
   1443 			 * If a delta is specified, adjust the expirations in
   1444 			 * the heap by delta. Also, if the caller indicates
   1445 			 * a timechange, process that. This step also cleans
   1446 			 * out any empty callout lists that might happen to
   1447 			 * be there.
   1448 			 */
   1449 			(void) callout_heap_process(ct, delta, timechange);
   1450 
   1451 			ct->ct_suspend--;
   1452 			if (ct->ct_suspend == 0) {
   1453 				/*
   1454 				 * If the expired list is non-empty, then have
   1455 				 * the cyclic expire immediately. Else, program
   1456 				 * the cyclic based on the heap.
   1457 				 */
   1458 				if (ct->ct_expired.ch_head != NULL)
   1459 					exp = gethrtime();
   1460 				else if (ct->ct_heap_num > 0)
   1461 					exp = ct->ct_heap[0].ch_expiration;
   1462 				else
   1463 					exp = 0;
   1464 				if (exp != 0)
   1465 					(void) cyclic_reprogram(ct->ct_cyclic,
   1466 					    exp);
   1467 			}
   1468 
   1469 			mutex_exit(&ct->ct_mutex);
   1470 		}
   1471 	}
   1472 }
   1473 
   1474 /*
   1475  * Callback handler used by CPR to stop and resume callouts.
   1476  * The cyclic subsystem saves and restores hrtime during CPR.
   1477  * That is why callout_resume() is called with a 0 delta.
   1478  * Although hrtime is the same, hrestime (system time) has
   1479  * progressed during CPR. So, we have to indicate a time change
   1480  * to expire the absolute hrestime timers.
   1481  */
   1482 /*ARGSUSED*/
   1483 static boolean_t
   1484 callout_cpr_callb(void *arg, int code)
   1485 {
   1486 	if (code == CB_CODE_CPR_CHKPT)
   1487 		callout_suspend();
   1488 	else
   1489 		callout_resume(0, 1);
   1490 
   1491 	return (B_TRUE);
   1492 }
   1493 
   1494 /*
   1495  * Callback handler invoked when the debugger is entered or exited.
   1496  */
   1497 /*ARGSUSED*/
   1498 static boolean_t
   1499 callout_debug_callb(void *arg, int code)
   1500 {
   1501 	hrtime_t delta;
   1502 
   1503 	/*
   1504 	 * When the system enters the debugger. make a note of the hrtime.
   1505 	 * When it is resumed, compute how long the system was in the
   1506 	 * debugger. This interval should not be counted for callouts.
   1507 	 */
   1508 	if (code == 0) {
   1509 		callout_suspend();
   1510 		callout_debug_hrtime = gethrtime();
   1511 	} else {
   1512 		delta = gethrtime() - callout_debug_hrtime;
   1513 		callout_resume(delta, 0);
   1514 	}
   1515 
   1516 	return (B_TRUE);
   1517 }
   1518 
   1519 /*
   1520  * Move the absolute hrestime callouts to the expired list. Then program the
   1521  * table's cyclic to expire immediately so that the callouts can be executed
   1522  * immediately.
   1523  */
   1524 static void
   1525 callout_hrestime_one(callout_table_t *ct)
   1526 {
   1527 	hrtime_t expiration;
   1528 
   1529 	mutex_enter(&ct->ct_mutex);
   1530 	if (ct->ct_heap_num == 0) {
   1531 		mutex_exit(&ct->ct_mutex);
   1532 		return;
   1533 	}
   1534 
   1535 	/*
   1536 	 * Walk the heap and process all the absolute hrestime entries.
   1537 	 */
   1538 	expiration = callout_heap_process(ct, 0, 1);
   1539 
   1540 	if ((expiration != 0) && (ct->ct_suspend == 0))
   1541 		(void) cyclic_reprogram(ct->ct_cyclic, expiration);
   1542 
   1543 	mutex_exit(&ct->ct_mutex);
   1544 }
   1545 
   1546 /*
   1547  * This function is called whenever system time (hrestime) is changed
   1548  * explicitly. All the HRESTIME callouts must be expired at once.
   1549  */
   1550 /*ARGSUSED*/
   1551 void
   1552 callout_hrestime(void)
   1553 {
   1554 	int t, f;
   1555 	callout_table_t *ct;
   1556 
   1557 	/*
   1558 	 * Traverse every callout table in the system and process the hrestime
   1559 	 * callouts therein.
   1560 	 *
   1561 	 * We look at all the tables because we don't know which ones were
   1562 	 * onlined and offlined in the past. The offlined tables may still
   1563 	 * have active cyclics processing timers somewhere.
   1564 	 */
   1565 	for (f = 0; f < max_ncpus; f++) {
   1566 		for (t = 0; t < CALLOUT_NTYPES; t++) {
   1567 			ct = &callout_table[CALLOUT_TABLE(t, f)];
   1568 			callout_hrestime_one(ct);
   1569 		}
   1570 	}
   1571 }
   1572 
   1573 /*
   1574  * Create the hash tables for this callout table.
   1575  */
   1576 static void
   1577 callout_hash_init(callout_table_t *ct)
   1578 {
   1579 	size_t size;
   1580 
   1581 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
   1582 	ASSERT((ct->ct_idhash == NULL) && (ct->ct_clhash == NULL));
   1583 
   1584 	size = sizeof (callout_hash_t) * CALLOUT_BUCKETS;
   1585 	ct->ct_idhash = kmem_zalloc(size, KM_SLEEP);
   1586 	ct->ct_clhash = kmem_zalloc(size, KM_SLEEP);
   1587 }
   1588 
   1589 /*
   1590  * Create per-callout table kstats.
   1591  */
   1592 static void
   1593 callout_kstat_init(callout_table_t *ct)
   1594 {
   1595 	callout_stat_type_t stat;
   1596 	kstat_t *ct_kstats;
   1597 	int ndx;
   1598 
   1599 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
   1600 	ASSERT(ct->ct_kstats == NULL);
   1601 
   1602 	ndx = ct - callout_table;
   1603 	ct_kstats = kstat_create("unix", ndx, "callout",
   1604 	    "misc", KSTAT_TYPE_NAMED, CALLOUT_NUM_STATS, KSTAT_FLAG_VIRTUAL);
   1605 
   1606 	if (ct_kstats == NULL) {
   1607 		cmn_err(CE_WARN, "kstat_create for callout table %p failed",
   1608 		    (void *)ct);
   1609 	} else {
   1610 		ct_kstats->ks_data = ct->ct_kstat_data;
   1611 		for (stat = 0; stat < CALLOUT_NUM_STATS; stat++)
   1612 			kstat_named_init(&ct->ct_kstat_data[stat],
   1613 			    callout_kstat_names[stat], KSTAT_DATA_INT64);
   1614 		ct->ct_kstats = ct_kstats;
   1615 		kstat_install(ct_kstats);
   1616 	}
   1617 }
   1618 
   1619 static void
   1620 callout_cyclic_init(callout_table_t *ct)
   1621 {
   1622 	cyc_handler_t hdlr;
   1623 	cyc_time_t when;
   1624 	processorid_t seqid;
   1625 	int t;
   1626 
   1627 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
   1628 
   1629 	t = CALLOUT_TABLE_TYPE(ct);
   1630 	seqid = CALLOUT_TABLE_SEQID(ct);
   1631 
   1632 	/*
   1633 	 * Create the taskq thread if the table type is normal.
   1634 	 * Realtime tables are handled at PIL1 by a softint
   1635 	 * handler.
   1636 	 */
   1637 	if (t == CALLOUT_NORMAL) {
   1638 		ASSERT(ct->ct_taskq == NULL);
   1639 		/*
   1640 		 * Each callout thread consumes exactly one
   1641 		 * task structure while active.  Therefore,
   1642 		 * prepopulating with 2 * callout_threads tasks
   1643 		 * ensures that there's at least one task per
   1644 		 * thread that's either scheduled or on the
   1645 		 * freelist.  In turn, this guarantees that
   1646 		 * taskq_dispatch() will always either succeed
   1647 		 * (because there's a free task structure) or
   1648 		 * be unnecessary (because "callout_excute(ct)"
   1649 		 * has already scheduled).
   1650 		 */
   1651 		ct->ct_taskq =
   1652 		    taskq_create_instance("callout_taskq", seqid,
   1653 		    callout_threads, maxclsyspri,
   1654 		    2 * callout_threads, 2 * callout_threads,
   1655 		    TASKQ_PREPOPULATE | TASKQ_CPR_SAFE);
   1656 	}
   1657 
   1658 	/*
   1659 	 * callouts can only be created in a table whose
   1660 	 * cyclic has been initialized.
   1661 	 */
   1662 	ASSERT(ct->ct_heap_num == 0);
   1663 
   1664 	/*
   1665 	 * Create the callout table cyclics.
   1666 	 *
   1667 	 * The realtime cyclic handler executes at low PIL. The normal cyclic
   1668 	 * handler executes at lock PIL. This is because there are cases
   1669 	 * where code can block at PIL > 1 waiting for a normal callout handler
   1670 	 * to unblock it directly or indirectly. If the normal cyclic were to
   1671 	 * be executed at low PIL, it could get blocked out by the waiter
   1672 	 * and cause a deadlock.
   1673 	 */
   1674 	ASSERT(ct->ct_cyclic == CYCLIC_NONE);
   1675 
   1676 	hdlr.cyh_func = (cyc_func_t)CALLOUT_CYCLIC_HANDLER(t);
   1677 	if (ct->ct_type == CALLOUT_REALTIME)
   1678 		hdlr.cyh_level = callout_realtime_level;
   1679 	else
   1680 		hdlr.cyh_level = callout_normal_level;
   1681 	hdlr.cyh_arg = ct;
   1682 	when.cyt_when = CY_INFINITY;
   1683 	when.cyt_interval = CY_INFINITY;
   1684 
   1685 	ct->ct_cyclic = cyclic_add(&hdlr, &when);
   1686 }
   1687 
   1688 void
   1689 callout_cpu_online(cpu_t *cp)
   1690 {
   1691 	lgrp_handle_t hand;
   1692 	callout_cache_t *cache;
   1693 	char s[KMEM_CACHE_NAMELEN];
   1694 	callout_table_t *ct;
   1695 	processorid_t seqid;
   1696 	int t;
   1697 
   1698 	ASSERT(MUTEX_HELD(&cpu_lock));
   1699 
   1700 	/*
   1701 	 * Locate the cache corresponding to the onlined CPU's lgroup.
   1702 	 * Note that access to callout_caches is protected by cpu_lock.
   1703 	 */
   1704 	hand = lgrp_plat_cpu_to_hand(cp->cpu_id);
   1705 	for (cache = callout_caches; cache != NULL; cache = cache->cc_next) {
   1706 		if (cache->cc_hand == hand)
   1707 			break;
   1708 	}
   1709 
   1710 	/*
   1711 	 * If not found, create one. The caches are never destroyed.
   1712 	 */
   1713 	if (cache == NULL) {
   1714 		cache = kmem_alloc(sizeof (callout_cache_t), KM_SLEEP);
   1715 		cache->cc_hand = hand;
   1716 		(void) snprintf(s, KMEM_CACHE_NAMELEN, "callout_cache%lx",
   1717 		    (long)hand);
   1718 		cache->cc_cache = kmem_cache_create(s, sizeof (callout_t),
   1719 		    CALLOUT_ALIGN, NULL, NULL, NULL, NULL, NULL, 0);
   1720 		(void) snprintf(s, KMEM_CACHE_NAMELEN, "callout_lcache%lx",
   1721 		    (long)hand);
   1722 		cache->cc_lcache = kmem_cache_create(s, sizeof (callout_list_t),
   1723 		    CALLOUT_ALIGN, NULL, NULL, NULL, NULL, NULL, 0);
   1724 		cache->cc_next = callout_caches;
   1725 		callout_caches = cache;
   1726 	}
   1727 
   1728 	seqid = cp->cpu_seqid;
   1729 
   1730 	for (t = 0; t < CALLOUT_NTYPES; t++) {
   1731 		ct = &callout_table[CALLOUT_TABLE(t, seqid)];
   1732 
   1733 		mutex_enter(&ct->ct_mutex);
   1734 		/*
   1735 		 * Store convinience pointers to the kmem caches
   1736 		 * in the callout table. These assignments should always be
   1737 		 * done as callout tables can map to different physical
   1738 		 * CPUs each time.
   1739 		 */
   1740 		ct->ct_cache = cache->cc_cache;
   1741 		ct->ct_lcache = cache->cc_lcache;
   1742 
   1743 		/*
   1744 		 * We use the heap pointer to check if stuff has been
   1745 		 * initialized for this callout table.
   1746 		 */
   1747 		if (ct->ct_heap == NULL) {
   1748 			callout_heap_init(ct);
   1749 			callout_hash_init(ct);
   1750 			callout_kstat_init(ct);
   1751 			callout_cyclic_init(ct);
   1752 		}
   1753 
   1754 		mutex_exit(&ct->ct_mutex);
   1755 
   1756 		/*
   1757 		 * Move the cyclic to this CPU by doing a bind.
   1758 		 */
   1759 		cyclic_bind(ct->ct_cyclic, cp, NULL);
   1760 	}
   1761 }
   1762 
   1763 void
   1764 callout_cpu_offline(cpu_t *cp)
   1765 {
   1766 	callout_table_t *ct;
   1767 	processorid_t seqid;
   1768 	int t;
   1769 
   1770 	ASSERT(MUTEX_HELD(&cpu_lock));
   1771 
   1772 	seqid = cp->cpu_seqid;
   1773 
   1774 	for (t = 0; t < CALLOUT_NTYPES; t++) {
   1775 		ct = &callout_table[CALLOUT_TABLE(t, seqid)];
   1776 
   1777 		/*
   1778 		 * Unbind the cyclic. This will allow the cyclic subsystem
   1779 		 * to juggle the cyclic during CPU offline.
   1780 		 */
   1781 		cyclic_bind(ct->ct_cyclic, NULL, NULL);
   1782 	}
   1783 }
   1784 
   1785 /*
   1786  * This is called to perform per-CPU initialization for slave CPUs at
   1787  * boot time.
   1788  */
   1789 void
   1790 callout_mp_init(void)
   1791 {
   1792 	cpu_t *cp;
   1793 
   1794 	mutex_enter(&cpu_lock);
   1795 
   1796 	cp = cpu_active;
   1797 	do {
   1798 		callout_cpu_online(cp);
   1799 	} while ((cp = cp->cpu_next_onln) != cpu_active);
   1800 
   1801 	mutex_exit(&cpu_lock);
   1802 }
   1803 
   1804 /*
   1805  * Initialize all callout tables.  Called at boot time just before clkstart().
   1806  */
   1807 void
   1808 callout_init(void)
   1809 {
   1810 	int f, t;
   1811 	size_t size;
   1812 	int table_id;
   1813 	callout_table_t *ct;
   1814 	long bits, fanout;
   1815 	uintptr_t buf;
   1816 
   1817 	/*
   1818 	 * Initialize callout globals.
   1819 	 */
   1820 	bits = 0;
   1821 	for (fanout = 1; (fanout < max_ncpus); fanout <<= 1)
   1822 		bits++;
   1823 	callout_table_bits = CALLOUT_TYPE_BITS + bits;
   1824 	callout_table_mask = (1 << callout_table_bits) - 1;
   1825 	callout_counter_low = 1 << CALLOUT_COUNTER_SHIFT;
   1826 	callout_longterm = TICK_TO_NSEC(CALLOUT_LONGTERM_TICKS);
   1827 	callout_max_ticks = CALLOUT_MAX_TICKS;
   1828 	if (callout_min_reap == 0)
   1829 		callout_min_reap = CALLOUT_MIN_REAP;
   1830 
   1831 	if (callout_tolerance <= 0)
   1832 		callout_tolerance = CALLOUT_TOLERANCE;
   1833 	if (callout_threads <= 0)
   1834 		callout_threads = CALLOUT_THREADS;
   1835 
   1836 	/*
   1837 	 * Allocate all the callout tables based on max_ncpus. We have chosen
   1838 	 * to do boot-time allocation instead of dynamic allocation because:
   1839 	 *
   1840 	 *	- the size of the callout tables is not too large.
   1841 	 *	- there are race conditions involved in making this dynamic.
   1842 	 *	- the hash tables that go with the callout tables consume
   1843 	 *	  most of the memory and they are only allocated in
   1844 	 *	  callout_cpu_online().
   1845 	 *
   1846 	 * Each CPU has two tables that are consecutive in the array. The first
   1847 	 * one is for realtime callouts and the second one is for normal ones.
   1848 	 *
   1849 	 * We do this alignment dance to make sure that callout table
   1850 	 * structures will always be on a cache line boundary.
   1851 	 */
   1852 	size = sizeof (callout_table_t) * CALLOUT_NTYPES * max_ncpus;
   1853 	size += CALLOUT_ALIGN;
   1854 	buf = (uintptr_t)kmem_zalloc(size, KM_SLEEP);
   1855 	callout_table = (callout_table_t *)P2ROUNDUP(buf, CALLOUT_ALIGN);
   1856 
   1857 	size = sizeof (kstat_named_t) * CALLOUT_NUM_STATS;
   1858 	/*
   1859 	 * Now, initialize the tables for all the CPUs.
   1860 	 */
   1861 	for (f = 0; f < max_ncpus; f++) {
   1862 		for (t = 0; t < CALLOUT_NTYPES; t++) {
   1863 			table_id = CALLOUT_TABLE(t, f);
   1864 			ct = &callout_table[table_id];
   1865 			ct->ct_type = t;
   1866 			mutex_init(&ct->ct_mutex, NULL, MUTEX_DEFAULT, NULL);
   1867 			/*
   1868 			 * Precompute the base IDs for long and short-term
   1869 			 * legacy IDs. This makes ID generation during
   1870 			 * timeout() fast.
   1871 			 */
   1872 			ct->ct_short_id = CALLOUT_SHORT_ID(table_id);
   1873 			ct->ct_long_id = CALLOUT_LONG_ID(table_id);
   1874 			/*
   1875 			 * Precompute the base ID for generation-based IDs.
   1876 			 * Note that when the first ID gets allocated, the
   1877 			 * ID will wrap. This will cause the generation
   1878 			 * number to be incremented to 1.
   1879 			 */
   1880 			ct->ct_gen_id = CALLOUT_SHORT_ID(table_id);
   1881 			/*
   1882 			 * Initialize the cyclic as NONE. This will get set
   1883 			 * during CPU online. This is so that partially
   1884 			 * populated systems will only have the required
   1885 			 * number of cyclics, not more.
   1886 			 */
   1887 			ct->ct_cyclic = CYCLIC_NONE;
   1888 			ct->ct_kstat_data = kmem_zalloc(size, KM_SLEEP);
   1889 		}
   1890 	}
   1891 
   1892 	/*
   1893 	 * Add the callback for CPR. This is called during checkpoint
   1894 	 * resume to suspend and resume callouts.
   1895 	 */
   1896 	(void) callb_add(callout_cpr_callb, 0, CB_CL_CPR_CALLOUT,
   1897 	    "callout_cpr");
   1898 	(void) callb_add(callout_debug_callb, 0, CB_CL_ENTER_DEBUGGER,
   1899 	    "callout_debug");
   1900 
   1901 	/*
   1902 	 * Call the per-CPU initialization function for the boot CPU. This
   1903 	 * is done here because the function is not called automatically for
   1904 	 * the boot CPU from the CPU online/offline hooks. Note that the
   1905 	 * CPU lock is taken here because of convention.
   1906 	 */
   1907 	mutex_enter(&cpu_lock);
   1908 	callout_boot_ct = &callout_table[CALLOUT_TABLE(0, CPU->cpu_seqid)];
   1909 	callout_cpu_online(CPU);
   1910 	mutex_exit(&cpu_lock);
   1911 }
   1912