Home | History | Annotate | Download | only in os
      1      0      stevel /*
      2      0      stevel  * CDDL HEADER START
      3      0      stevel  *
      4      0      stevel  * The contents of this file are subject to the terms of the
      5   2036     wentaoy  * Common Development and Distribution License (the "License").
      6   2036     wentaoy  * You may not use this file except in compliance with the License.
      7      0      stevel  *
      8      0      stevel  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9      0      stevel  * or http://www.opensolaris.org/os/licensing.
     10      0      stevel  * See the License for the specific language governing permissions
     11      0      stevel  * and limitations under the License.
     12      0      stevel  *
     13      0      stevel  * When distributing Covered Code, include this CDDL HEADER in each
     14      0      stevel  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15      0      stevel  * If applicable, add the following below this CDDL HEADER, with the
     16      0      stevel  * fields enclosed by brackets "[]" replaced with your own identifying
     17      0      stevel  * information: Portions Copyright [yyyy] [name of copyright owner]
     18      0      stevel  *
     19      0      stevel  * CDDL HEADER END
     20      0      stevel  */
     21      0      stevel /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
     22      0      stevel /*	  All Rights Reserved	*/
     23      0      stevel 
     24      0      stevel /*
     25   8566    Madhavan  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     26      0      stevel  * Use is subject to license terms.
     27      0      stevel  */
     28      0      stevel 
     29      0      stevel #include <sys/param.h>
     30      0      stevel #include <sys/t_lock.h>
     31      0      stevel #include <sys/types.h>
     32      0      stevel #include <sys/tuneable.h>
     33      0      stevel #include <sys/sysmacros.h>
     34      0      stevel #include <sys/systm.h>
     35      0      stevel #include <sys/cpuvar.h>
     36      0      stevel #include <sys/lgrp.h>
     37      0      stevel #include <sys/user.h>
     38      0      stevel #include <sys/proc.h>
     39      0      stevel #include <sys/callo.h>
     40      0      stevel #include <sys/kmem.h>
     41      0      stevel #include <sys/var.h>
     42      0      stevel #include <sys/cmn_err.h>
     43      0      stevel #include <sys/swap.h>
     44      0      stevel #include <sys/vmsystm.h>
     45      0      stevel #include <sys/class.h>
     46      0      stevel #include <sys/time.h>
     47      0      stevel #include <sys/debug.h>
     48      0      stevel #include <sys/vtrace.h>
     49      0      stevel #include <sys/spl.h>
     50      0      stevel #include <sys/atomic.h>
     51      0      stevel #include <sys/dumphdr.h>
     52      0      stevel #include <sys/archsystm.h>
     53      0      stevel #include <sys/fs/swapnode.h>
     54      0      stevel #include <sys/panic.h>
     55      0      stevel #include <sys/disp.h>
     56      0      stevel #include <sys/msacct.h>
     57      0      stevel #include <sys/mem_cage.h>
     58      0      stevel 
     59      0      stevel #include <vm/page.h>
     60      0      stevel #include <vm/anon.h>
     61      0      stevel #include <vm/rm.h>
     62      0      stevel #include <sys/cyclic.h>
     63      0      stevel #include <sys/cpupart.h>
     64      0      stevel #include <sys/rctl.h>
     65      0      stevel #include <sys/task.h>
     66      0      stevel #include <sys/sdt.h>
     67   5107        eota #include <sys/ddi_timer.h>
     68  10696       David #include <sys/random.h>
     69  10696       David #include <sys/modctl.h>
     70      0      stevel 
     71      0      stevel /*
     72      0      stevel  * for NTP support
     73      0      stevel  */
     74      0      stevel #include <sys/timex.h>
     75      0      stevel #include <sys/inttypes.h>
     76  11066      rafael 
     77  11066      rafael #include <sys/sunddi.h>
     78  11066      rafael #include <sys/clock_impl.h>
     79      0      stevel 
     80      0      stevel /*
     81   3792       akolb  * clock() is called straight from the clock cyclic; see clock_init().
     82      0      stevel  *
     83      0      stevel  * Functions:
     84      0      stevel  *	reprime clock
     85      0      stevel  *	maintain date
     86      0      stevel  *	jab the scheduler
     87      0      stevel  */
     88      0      stevel 
     89      0      stevel extern kcondvar_t	fsflush_cv;
     90      0      stevel extern sysinfo_t	sysinfo;
     91      0      stevel extern vminfo_t	vminfo;
     92      0      stevel extern int	idleswtch;	/* flag set while idle in pswtch() */
     93  10696       David extern hrtime_t volatile devinfo_freeze;
     94      0      stevel 
     95      0      stevel /*
     96      0      stevel  * high-precision avenrun values.  These are needed to make the
     97      0      stevel  * regular avenrun values accurate.
     98      0      stevel  */
     99      0      stevel static uint64_t hp_avenrun[3];
    100      0      stevel int	avenrun[3];		/* FSCALED average run queue lengths */
    101      0      stevel time_t	time;	/* time in seconds since 1970 - for compatibility only */
    102      0      stevel 
    103      0      stevel static struct loadavg_s loadavg;
    104      0      stevel /*
    105      0      stevel  * Phase/frequency-lock loop (PLL/FLL) definitions
    106      0      stevel  *
    107      0      stevel  * The following variables are read and set by the ntp_adjtime() system
    108      0      stevel  * call.
    109      0      stevel  *
    110      0      stevel  * time_state shows the state of the system clock, with values defined
    111      0      stevel  * in the timex.h header file.
    112      0      stevel  *
    113      0      stevel  * time_status shows the status of the system clock, with bits defined
    114      0      stevel  * in the timex.h header file.
    115      0      stevel  *
    116      0      stevel  * time_offset is used by the PLL/FLL to adjust the system time in small
    117      0      stevel  * increments.
    118      0      stevel  *
    119      0      stevel  * time_constant determines the bandwidth or "stiffness" of the PLL.
    120      0      stevel  *
    121      0      stevel  * time_tolerance determines maximum frequency error or tolerance of the
    122      0      stevel  * CPU clock oscillator and is a property of the architecture; however,
     123      0      stevel  * in principle it could change as a result of the presence of external
    124      0      stevel  * discipline signals, for instance.
    125      0      stevel  *
    126      0      stevel  * time_precision is usually equal to the kernel tick variable; however,
    127      0      stevel  * in cases where a precision clock counter or external clock is
    128      0      stevel  * available, the resolution can be much less than this and depend on
    129      0      stevel  * whether the external clock is working or not.
    130      0      stevel  *
    131      0      stevel  * time_maxerror is initialized by a ntp_adjtime() call and increased by
    132      0      stevel  * the kernel once each second to reflect the maximum error bound
    133      0      stevel  * growth.
    134      0      stevel  *
    135      0      stevel  * time_esterror is set and read by the ntp_adjtime() call, but
    136      0      stevel  * otherwise not used by the kernel.
    137      0      stevel  */
    138      0      stevel int32_t time_state = TIME_OK;	/* clock state */
    139      0      stevel int32_t time_status = STA_UNSYNC;	/* clock status bits */
    140      0      stevel int32_t time_offset = 0;		/* time offset (us) */
    141      0      stevel int32_t time_constant = 0;		/* pll time constant */
    142      0      stevel int32_t time_tolerance = MAXFREQ;	/* frequency tolerance (scaled ppm) */
    143      0      stevel int32_t time_precision = 1;	/* clock precision (us) */
    144      0      stevel int32_t time_maxerror = MAXPHASE;	/* maximum error (us) */
    145      0      stevel int32_t time_esterror = MAXPHASE;	/* estimated error (us) */
    146      0      stevel 
    147      0      stevel /*
    148      0      stevel  * The following variables establish the state of the PLL/FLL and the
    149      0      stevel  * residual time and frequency offset of the local clock. The scale
    150      0      stevel  * factors are defined in the timex.h header file.
    151      0      stevel  *
    152      0      stevel  * time_phase and time_freq are the phase increment and the frequency
    153      0      stevel  * increment, respectively, of the kernel time variable.
    154      0      stevel  *
    155      0      stevel  * time_freq is set via ntp_adjtime() from a value stored in a file when
    156      0      stevel  * the synchronization daemon is first started. Its value is retrieved
    157      0      stevel  * via ntp_adjtime() and written to the file about once per hour by the
    158      0      stevel  * daemon.
    159      0      stevel  *
    160      0      stevel  * time_adj is the adjustment added to the value of tick at each timer
    161      0      stevel  * interrupt and is recomputed from time_phase and time_freq at each
    162      0      stevel  * seconds rollover.
    163      0      stevel  *
    164      0      stevel  * time_reftime is the second's portion of the system time at the last
    165      0      stevel  * call to ntp_adjtime(). It is used to adjust the time_freq variable
    166      0      stevel  * and to increase the time_maxerror as the time since last update
    167      0      stevel  * increases.
    168      0      stevel  */
    169      0      stevel int32_t time_phase = 0;		/* phase offset (scaled us) */
    170      0      stevel int32_t time_freq = 0;		/* frequency offset (scaled ppm) */
    171      0      stevel int32_t time_adj = 0;		/* tick adjust (scaled 1 / hz) */
    172      0      stevel int32_t time_reftime = 0;		/* time at last adjustment (s) */
    173      0      stevel 
    174      0      stevel /*
    175      0      stevel  * The scale factors of the following variables are defined in the
    176      0      stevel  * timex.h header file.
    177      0      stevel  *
    178      0      stevel  * pps_time contains the time at each calibration interval, as read by
    179      0      stevel  * microtime(). pps_count counts the seconds of the calibration
    180      0      stevel  * interval, the duration of which is nominally pps_shift in powers of
    181      0      stevel  * two.
    182      0      stevel  *
    183      0      stevel  * pps_offset is the time offset produced by the time median filter
    184      0      stevel  * pps_tf[], while pps_jitter is the dispersion (jitter) measured by
    185      0      stevel  * this filter.
    186      0      stevel  *
    187      0      stevel  * pps_freq is the frequency offset produced by the frequency median
    188      0      stevel  * filter pps_ff[], while pps_stabil is the dispersion (wander) measured
    189      0      stevel  * by this filter.
    190      0      stevel  *
    191      0      stevel  * pps_usec is latched from a high resolution counter or external clock
    192      0      stevel  * at pps_time. Here we want the hardware counter contents only, not the
    193      0      stevel  * contents plus the time_tv.usec as usual.
    194      0      stevel  *
    195      0      stevel  * pps_valid counts the number of seconds since the last PPS update. It
    196      0      stevel  * is used as a watchdog timer to disable the PPS discipline should the
    197      0      stevel  * PPS signal be lost.
    198      0      stevel  *
    199      0      stevel  * pps_glitch counts the number of seconds since the beginning of an
    200      0      stevel  * offset burst more than tick/2 from current nominal offset. It is used
    201      0      stevel  * mainly to suppress error bursts due to priority conflicts between the
    202      0      stevel  * PPS interrupt and timer interrupt.
    203      0      stevel  *
    204      0      stevel  * pps_intcnt counts the calibration intervals for use in the interval-
    205      0      stevel  * adaptation algorithm. It's just too complicated for words.
    206      0      stevel  */
    207      0      stevel struct timeval pps_time;	/* kernel time at last interval */
    208      0      stevel int32_t pps_tf[] = {0, 0, 0};	/* pps time offset median filter (us) */
    209      0      stevel int32_t pps_offset = 0;		/* pps time offset (us) */
    210      0      stevel int32_t pps_jitter = MAXTIME;	/* time dispersion (jitter) (us) */
    211      0      stevel int32_t pps_ff[] = {0, 0, 0};	/* pps frequency offset median filter */
    212      0      stevel int32_t pps_freq = 0;		/* frequency offset (scaled ppm) */
    213      0      stevel int32_t pps_stabil = MAXFREQ;	/* frequency dispersion (scaled ppm) */
    214      0      stevel int32_t pps_usec = 0;		/* microsec counter at last interval */
    215      0      stevel int32_t pps_valid = PPS_VALID;	/* pps signal watchdog counter */
    216      0      stevel int32_t pps_glitch = 0;		/* pps signal glitch counter */
    217      0      stevel int32_t pps_count = 0;		/* calibration interval counter (s) */
    218      0      stevel int32_t pps_shift = PPS_SHIFT;	/* interval duration (s) (shift) */
    219      0      stevel int32_t pps_intcnt = 0;		/* intervals at current duration */
    220      0      stevel 
    221      0      stevel /*
    222      0      stevel  * PPS signal quality monitors
    223      0      stevel  *
    224      0      stevel  * pps_jitcnt counts the seconds that have been discarded because the
    225      0      stevel  * jitter measured by the time median filter exceeds the limit MAXTIME
    226      0      stevel  * (100 us).
    227      0      stevel  *
    228      0      stevel  * pps_calcnt counts the frequency calibration intervals, which are
    229      0      stevel  * variable from 4 s to 256 s.
    230      0      stevel  *
    231      0      stevel  * pps_errcnt counts the calibration intervals which have been discarded
    232      0      stevel  * because the wander exceeds the limit MAXFREQ (100 ppm) or where the
    233      0      stevel  * calibration interval jitter exceeds two ticks.
    234      0      stevel  *
    235      0      stevel  * pps_stbcnt counts the calibration intervals that have been discarded
    236      0      stevel  * because the frequency wander exceeds the limit MAXFREQ / 4 (25 us).
    237      0      stevel  */
    238      0      stevel int32_t pps_jitcnt = 0;		/* jitter limit exceeded */
    239      0      stevel int32_t pps_calcnt = 0;		/* calibration intervals */
    240      0      stevel int32_t pps_errcnt = 0;		/* calibration errors */
    241      0      stevel int32_t pps_stbcnt = 0;		/* stability limit exceeded */
    242      0      stevel 
    243  11066      rafael kcondvar_t lbolt_cv;
    244      0      stevel 
    245  11066      rafael /*
    246  11066      rafael  * Hybrid lbolt implementation:
    247  11066      rafael  *
    248  11066      rafael  * The service historically provided by the lbolt and lbolt64 variables has
    249  11066      rafael  * been replaced by the ddi_get_lbolt() and ddi_get_lbolt64() routines, and the
    250  11066      rafael  * original symbols removed from the system. The once clock driven variables are
     251  11066      rafael  * now implemented in an event driven fashion, backed by gethrtime() coarsened to
    252  11066      rafael  * the appropriate clock resolution. The default event driven implementation is
    253  11066      rafael  * complemented by a cyclic driven one, active only during periods of intense
    254  11066      rafael  * activity around the DDI lbolt routines, when a lbolt specific cyclic is
     255  11066      rafael  * reprogrammed to fire at a clock tick interval to serve consumers of lbolt who
    256  11066      rafael  * rely on the original low cost of consulting a memory position.
    257  11066      rafael  *
    258  11066      rafael  * The implementation uses the number of calls to these routines and the
    259  11066      rafael  * frequency of these to determine when to transition from event to cyclic
    260  11066      rafael  * driven and vice-versa. These values are kept on a per CPU basis for
    261  11066      rafael  * scalability reasons and to prevent CPUs from constantly invalidating a single
    262  11066      rafael  * cache line when modifying a global variable. The transition from event to
    263  11066      rafael  * cyclic mode happens once the thresholds are crossed, and activity on any CPU
    264  11066      rafael  * can cause such transition.
    265  11066      rafael  *
    266  11066      rafael  * The lbolt_hybrid function pointer is called by ddi_get_lbolt() and
    267  11066      rafael  * ddi_get_lbolt64(), and will point to lbolt_event_driven() or
    268  11066      rafael  * lbolt_cyclic_driven() according to the current mode. When the thresholds
    269  11066      rafael  * are exceeded, lbolt_event_driven() will reprogram the lbolt cyclic to
    270  11066      rafael  * fire at a nsec_per_tick interval and increment an internal variable at
    271  11066      rafael  * each firing. lbolt_hybrid will then point to lbolt_cyclic_driven(), which
    272  11066      rafael  * will simply return the value of such variable. lbolt_cyclic() will attempt
    273  11066      rafael  * to shut itself off at each threshold interval (sampling period for calls
    274  11066      rafael  * to the DDI lbolt routines), and return to the event driven mode, but will
    275  11066      rafael  * be prevented from doing so if lbolt_cyclic_driven() is being heavily used.
    276  11066      rafael  *
    277  11066      rafael  * lbolt_bootstrap is used during boot to serve lbolt consumers who don't wait
     278  11066      rafael  * for the cyclic subsystem to be initialized.
    279  11066      rafael  *
    280  11066      rafael  */
    281  11066      rafael static int64_t lbolt_bootstrap(void);
    282  11066      rafael int64_t lbolt_event_driven(void);
    283  11066      rafael int64_t lbolt_cyclic_driven(void);
    284  11066      rafael int64_t (*lbolt_hybrid)(void) = lbolt_bootstrap;
    285  11066      rafael uint_t lbolt_ev_to_cyclic(caddr_t, caddr_t);
    286  11066      rafael 
    287  11066      rafael /*
    288  11066      rafael  * lbolt's cyclic, installed by clock_init().
    289  11066      rafael  */
    290  11066      rafael static void lbolt_cyclic(void);
    291  11066      rafael 
    292  11066      rafael /*
    293  11066      rafael  * Tunable to keep lbolt in cyclic driven mode. This will prevent the system
    294  11066      rafael  * from switching back to event driven, once it reaches cyclic mode.
    295  11066      rafael  */
    296  11066      rafael static boolean_t lbolt_cyc_only = B_FALSE;
    297  11066      rafael 
    298  11066      rafael /*
    299  11066      rafael  * Cache aligned, per CPU structure with lbolt usage statistics.
    300  11066      rafael  */
    301  11066      rafael static lbolt_cpu_t *lb_cpu;
    302  11066      rafael 
    303  11066      rafael /*
    304  11066      rafael  * Single, cache aligned, structure with all the information required by
    305  11066      rafael  * the lbolt implementation.
    306  11066      rafael  */
    307  11066      rafael lbolt_info_t *lb_info;
    308  11066      rafael 
    309  11066      rafael 
    310      0      stevel int one_sec = 1; /* turned on once every second */
    311      0      stevel static int fsflushcnt;	/* counter for t_fsflushr */
    312      0      stevel int	dosynctodr = 1;	/* patchable; enable/disable sync to TOD chip */
    313      0      stevel int	tod_needsync = 0;	/* need to sync tod chip with software time */
    314      0      stevel static int tod_broken = 0;	/* clock chip doesn't work */
    315      0      stevel time_t	boot_time = 0;		/* Boot time in seconds since 1970 */
    316      0      stevel cyclic_id_t clock_cyclic;	/* clock()'s cyclic_id */
    317      0      stevel cyclic_id_t deadman_cyclic;	/* deadman()'s cyclic_id */
    318   5265        eota cyclic_id_t ddi_timer_cyclic;	/* cyclic_timer()'s cyclic_id */
    319      0      stevel 
    320   5788    mv143129 extern void	clock_tick_schedule(int);
    321   5788    mv143129 
    322      0      stevel static int lgrp_ticks;		/* counter to schedule lgrp load calcs */
    323      0      stevel 
     324      0      stevel /*
     325      0      stevel  * Thresholds and state used for TOD fault detection.
     326      0      stevel  */
     327      0      stevel #define	TOD_REF_FREQ		((longlong_t)(NANOSEC))	/* NANOSEC as a longlong_t */
     328      0      stevel #define	TOD_STALL_THRESHOLD	(TOD_REF_FREQ * 3 / 2)	/* 3/2 of TOD_REF_FREQ */
     329      0      stevel #define	TOD_JUMP_THRESHOLD	(TOD_REF_FREQ / 2)	/* 1/2 of TOD_REF_FREQ */
     330      0      stevel #define	TOD_FILTER_N		4
     331      0      stevel #define	TOD_FILTER_SETTLE	(4 * TOD_FILTER_N)	/* four times TOD_FILTER_N */
     332      0      stevel static int tod_faulted = TOD_NOFAULT;	/* initially no fault recorded */
     333      0      stevel static int tod_fault_reset_flag = 0;
    334      0      stevel 
    335      0      stevel /* patchable via /etc/system */
    336      0      stevel int tod_validate_enable = 1;
    337  10696       David 
    338  10696       David /* Diagnose/Limit messages about delay(9F) called from interrupt context */
    339  10696       David int			delay_from_interrupt_diagnose = 0;
    340  10696       David volatile uint32_t	delay_from_interrupt_msg = 20;
    341    950       sethg 
    342    950       sethg /*
    343    950       sethg  * On non-SPARC systems, TOD validation must be deferred until gethrtime
    344    950       sethg  * returns non-zero values (after mach_clkinit's execution).
    345    950       sethg  * On SPARC systems, it must be deferred until after hrtime_base
    346    950       sethg  * and hres_last_tick are set (in the first invocation of hres_tick).
    347    950       sethg  * Since in both cases the prerequisites occur before the invocation of
    348    950       sethg  * tod_get() in clock(), the deferment is lifted there.
    349    950       sethg  */
    350    950       sethg static boolean_t tod_validate_deferred = B_TRUE;
    351      0      stevel 
     352      0      stevel /*
     353      0      stevel  * Description string for each TOD fault; the entries must stay in the
     354      0      stevel  * same order as enum tod_fault_type in systm.h.
     355      0      stevel  */
     356      0      stevel static char *tod_fault_table[] = {
     357      0      stevel 	"Reversed",			/* TOD_REVERSED */
     358      0      stevel 	"Stalled",			/* TOD_STALLED */
     359      0      stevel 	"Jumped",			/* TOD_JUMPED */
     360   5084     johnlev 	"Changed in Clock Rate",	/* TOD_RATECHANGED */
     361   5084     johnlev 	"Is Read-Only"			/* TOD_RDONLY */
     362      0      stevel 	/*
     363      0      stevel 	 * TOD_NOFAULT needs no string, so the table ends here.
     364      0      stevel 	 */
     365      0      stevel };
    366      0      stevel 
    367      0      stevel /*
    368      0      stevel  * test hook for tod broken detection in tod_validate
    369      0      stevel  */
    370      0      stevel int tod_unit_test = 0;
    371      0      stevel time_t tod_test_injector;
    372      0      stevel 
     373      0      stevel #define	CLOCK_ADJ_HIST_SIZE	4	/* number of entries in clock_adj_hist[] */
     374      0      stevel 
     375      0      stevel static int	adj_hist_entry;	/* NOTE(review): presumably a slot counter for clock_adj_hist[] - confirm */
     376      0      stevel 
     377      0      stevel int64_t clock_adj_hist[CLOCK_ADJ_HIST_SIZE];
    378      0      stevel 
    379      0      stevel static void calcloadavg(int, uint64_t *);
    380      0      stevel static int genloadavg(struct loadavg_s *);
    381      0      stevel static void loadavg_update();
    382      0      stevel 
    383      0      stevel void (*cmm_clock_callout)() = NULL;
    384   3792       akolb void (*cpucaps_clock_callout)() = NULL;
    385      0      stevel 
    386   5788    mv143129 extern clock_t clock_tick_proc_max;
    387   5788    mv143129 
     388  11066      rafael static int64_t deadman_counter = 0;	/* incremented by clock() whenever one_sec is set */
    389  11066      rafael 
    390      0      stevel static void
    391      0      stevel clock(void)
    392      0      stevel {
    393      0      stevel 	kthread_t	*t;
    394   5788    mv143129 	uint_t	nrunnable;
    395      0      stevel 	uint_t	w_io;
    396      0      stevel 	cpu_t	*cp;
    397      0      stevel 	cpupart_t *cpupart;
    398      0      stevel 	extern void set_anoninfo();
    399      0      stevel 	extern	void	set_freemem();
    400      0      stevel 	void	(*funcp)();
    401      0      stevel 	int32_t ltemp;
    402      0      stevel 	int64_t lltemp;
    403      0      stevel 	int s;
    404      0      stevel 	int do_lgrp_load;
    405      0      stevel 	int i;
    406  11066      rafael 	clock_t now = LBOLT_NO_ACCOUNT;	/* current tick */
    407      0      stevel 
    408      0      stevel 	if (panicstr)
    409      0      stevel 		return;
    410      0      stevel 
    411      0      stevel 	set_anoninfo();
    412      0      stevel 	/*
     413      0      stevel 	 * Make sure that 'freemem' does not drift too far from the truth
    414      0      stevel 	 */
    415      0      stevel 	set_freemem();
    416      0      stevel 
    417      0      stevel 
    418      0      stevel 	/*
    419      0      stevel 	 * Before the section which is repeated is executed, we do
    420      0      stevel 	 * the time delta processing which occurs every clock tick
    421      0      stevel 	 *
    422      0      stevel 	 * There is additional processing which happens every time
    423      0      stevel 	 * the nanosecond counter rolls over which is described
    424      0      stevel 	 * below - see the section which begins with : if (one_sec)
    425      0      stevel 	 *
    426      0      stevel 	 * This section marks the beginning of the precision-kernel
    427      0      stevel 	 * code fragment.
    428      0      stevel 	 *
    429      0      stevel 	 * First, compute the phase adjustment. If the low-order bits
    430      0      stevel 	 * (time_phase) of the update overflow, bump the higher order
    431      0      stevel 	 * bits (time_update).
    432      0      stevel 	 */
    433      0      stevel 	time_phase += time_adj;
    434      0      stevel 	if (time_phase <= -FINEUSEC) {
    435      0      stevel 		ltemp = -time_phase / SCALE_PHASE;
    436      0      stevel 		time_phase += ltemp * SCALE_PHASE;
    437      0      stevel 		s = hr_clock_lock();
    438      0      stevel 		timedelta -= ltemp * (NANOSEC/MICROSEC);
    439      0      stevel 		hr_clock_unlock(s);
    440      0      stevel 	} else if (time_phase >= FINEUSEC) {
    441      0      stevel 		ltemp = time_phase / SCALE_PHASE;
    442      0      stevel 		time_phase -= ltemp * SCALE_PHASE;
    443      0      stevel 		s = hr_clock_lock();
    444      0      stevel 		timedelta += ltemp * (NANOSEC/MICROSEC);
    445      0      stevel 		hr_clock_unlock(s);
    446      0      stevel 	}
    447      0      stevel 
    448      0      stevel 	/*
    449      0      stevel 	 * End of precision-kernel code fragment which is processed
    450      0      stevel 	 * every timer interrupt.
    451      0      stevel 	 *
    452      0      stevel 	 * Continue with the interrupt processing as scheduled.
    453      0      stevel 	 */
    454      0      stevel 	/*
    455      0      stevel 	 * Count the number of runnable threads and the number waiting
    456      0      stevel 	 * for some form of I/O to complete -- gets added to
    457      0      stevel 	 * sysinfo.waiting.  To know the state of the system, must add
    458      0      stevel 	 * wait counts from all CPUs.  Also add up the per-partition
    459      0      stevel 	 * statistics.
    460      0      stevel 	 */
    461      0      stevel 	w_io = 0;
    462      0      stevel 	nrunnable = 0;
    463      0      stevel 
    464      0      stevel 	/*
    465      0      stevel 	 * keep track of when to update lgrp/part loads
    466      0      stevel 	 */
    467      0      stevel 
    468      0      stevel 	do_lgrp_load = 0;
    469      0      stevel 	if (lgrp_ticks++ >= hz / 10) {
    470      0      stevel 		lgrp_ticks = 0;
    471      0      stevel 		do_lgrp_load = 1;
    472      0      stevel 	}
    473      0      stevel 
    474  11066      rafael 	if (one_sec) {
    475      0      stevel 		loadavg_update();
    476  11066      rafael 		deadman_counter++;
    477  11066      rafael 	}
    478      0      stevel 
    479      0      stevel 	/*
    480      0      stevel 	 * First count the threads waiting on kpreempt queues in each
    481      0      stevel 	 * CPU partition.
    482      0      stevel 	 */
    483      0      stevel 
    484      0      stevel 	cpupart = cp_list_head;
    485      0      stevel 	do {
    486      0      stevel 		uint_t cpupart_nrunnable = cpupart->cp_kp_queue.disp_nrunnable;
    487      0      stevel 
    488      0      stevel 		cpupart->cp_updates++;
    489      0      stevel 		nrunnable += cpupart_nrunnable;
    490      0      stevel 		cpupart->cp_nrunnable_cum += cpupart_nrunnable;
    491      0      stevel 		if (one_sec) {
    492      0      stevel 			cpupart->cp_nrunning = 0;
    493      0      stevel 			cpupart->cp_nrunnable = cpupart_nrunnable;
    494      0      stevel 		}
    495      0      stevel 	} while ((cpupart = cpupart->cp_next) != cp_list_head);
    496      0      stevel 
    497      0      stevel 
    498      0      stevel 	/* Now count the per-CPU statistics. */
    499      0      stevel 	cp = cpu_list;
    500      0      stevel 	do {
    501      0      stevel 		uint_t cpu_nrunnable = cp->cpu_disp->disp_nrunnable;
    502      0      stevel 
    503      0      stevel 		nrunnable += cpu_nrunnable;
    504      0      stevel 		cpupart = cp->cpu_part;
    505      0      stevel 		cpupart->cp_nrunnable_cum += cpu_nrunnable;
    506   3446         mrj 		if (one_sec) {
    507      0      stevel 			cpupart->cp_nrunnable += cpu_nrunnable;
    508   5788    mv143129 			/*
    509   5788    mv143129 			 * Update user, system, and idle cpu times.
    510   5788    mv143129 			 */
    511   5788    mv143129 			cpupart->cp_nrunning++;
    512   3446         mrj 			/*
    513   3446         mrj 			 * w_io is used to update sysinfo.waiting during
    514   3446         mrj 			 * one_second processing below.  Only gather w_io
    515   3446         mrj 			 * information when we walk the list of cpus if we're
    516   3446         mrj 			 * going to perform one_second processing.
    517   3446         mrj 			 */
    518   3446         mrj 			w_io += CPU_STATS(cp, sys.iowait);
    519   5076      mishra 		}
    520   3446         mrj 
    521   5076      mishra 		if (one_sec && (cp->cpu_flags & CPU_EXISTS)) {
    522   5076      mishra 			int i, load, change;
    523   5076      mishra 			hrtime_t intracct, intrused;
    524   5076      mishra 			const hrtime_t maxnsec = 1000000000;
    525   5076      mishra 			const int precision = 100;
    526   5076      mishra 
    527   5076      mishra 			/*
    528   5076      mishra 			 * Estimate interrupt load on this cpu each second.
    529   5076      mishra 			 * Computes cpu_intrload as %utilization (0-99).
    530   5076      mishra 			 */
    531   5076      mishra 
    532   5076      mishra 			/* add up interrupt time from all micro states */
    533   5076      mishra 			for (intracct = 0, i = 0; i < NCMSTATES; i++)
    534   5076      mishra 				intracct += cp->cpu_intracct[i];
    535   5076      mishra 			scalehrtime(&intracct);
    536   5076      mishra 
    537   5076      mishra 			/* compute nsec used in the past second */
    538   5076      mishra 			intrused = intracct - cp->cpu_intrlast;
    539   5076      mishra 			cp->cpu_intrlast = intracct;
    540   5076      mishra 
    541   5076      mishra 			/* limit the value for safety (and the first pass) */
    542   5076      mishra 			if (intrused >= maxnsec)
    543   5076      mishra 				intrused = maxnsec - 1;
    544   5076      mishra 
    545   5076      mishra 			/* calculate %time in interrupt */
    546   5076      mishra 			load = (precision * intrused) / maxnsec;
    547   5076      mishra 			ASSERT(load >= 0 && load < precision);
    548   5076      mishra 			change = cp->cpu_intrload - load;
    549   5076      mishra 
    550   5076      mishra 			/* jump to new max, or decay the old max */
    551   5076      mishra 			if (change < 0)
    552   5076      mishra 				cp->cpu_intrload = load;
    553   5076      mishra 			else if (change > 0)
    554   5076      mishra 				cp->cpu_intrload -= (change + 3) / 4;
    555   5076      mishra 
    556   5076      mishra 			DTRACE_PROBE3(cpu_intrload,
    557   5076      mishra 			    cpu_t *, cp,
    558   5076      mishra 			    hrtime_t, intracct,
    559   5076      mishra 			    hrtime_t, intrused);
    560   3446         mrj 		}
    561   5076      mishra 
    562      0      stevel 		if (do_lgrp_load &&
    563      0      stevel 		    (cp->cpu_flags & CPU_EXISTS)) {
    564      0      stevel 			/*
    565      0      stevel 			 * When updating the lgroup's load average,
    566      0      stevel 			 * account for the thread running on the CPU.
    567      0      stevel 			 * If the CPU is the current one, then we need
    568      0      stevel 			 * to account for the underlying thread which
    569      0      stevel 			 * got the clock interrupt not the thread that is
    570      0      stevel 			 * handling the interrupt and caculating the load
    571      0      stevel 			 * average
    572      0      stevel 			 */
    573      0      stevel 			t = cp->cpu_thread;
    574      0      stevel 			if (CPU == cp)
    575      0      stevel 				t = t->t_intr;
    576      0      stevel 
    577      0      stevel 			/*
    578      0      stevel 			 * Account for the load average for this thread if
    579      0      stevel 			 * it isn't the idle thread or it is on the interrupt
    580      0      stevel 			 * stack and not the current CPU handling the clock
    581      0      stevel 			 * interrupt
    582      0      stevel 			 */
    583      0      stevel 			if ((t && t != cp->cpu_idle_thread) || (CPU != cp &&
    584      0      stevel 			    CPU_ON_INTR(cp))) {
    585      0      stevel 				if (t->t_lpl == cp->cpu_lpl) {
    586      0      stevel 					/* local thread */
    587      0      stevel 					cpu_nrunnable++;
    588      0      stevel 				} else {
    589      0      stevel 					/*
    590      0      stevel 					 * This is a remote thread, charge it
    591      0      stevel 					 * against its home lgroup.  Note that
    592      0      stevel 					 * we notice that a thread is remote
    593      0      stevel 					 * only if it's currently executing.
    594      0      stevel 					 * This is a reasonable approximation,
    595      0      stevel 					 * since queued remote threads are rare.
    596      0      stevel 					 * Note also that if we didn't charge
    597      0      stevel 					 * it to its home lgroup, remote
    598      0      stevel 					 * execution would often make a system
    599      0      stevel 					 * appear balanced even though it was
    600      0      stevel 					 * not, and thread placement/migration
    601      0      stevel 					 * would often not be done correctly.
    602      0      stevel 					 */
    603      0      stevel 					lgrp_loadavg(t->t_lpl,
    604      0      stevel 					    LGRP_LOADAVG_IN_THREAD_MAX, 0);
    605      0      stevel 				}
    606      0      stevel 			}
    607      0      stevel 			lgrp_loadavg(cp->cpu_lpl,
    608      0      stevel 			    cpu_nrunnable * LGRP_LOADAVG_IN_THREAD_MAX, 1);
    609      0      stevel 		}
    610      0      stevel 	} while ((cp = cp->cpu_next) != cpu_list);
    611      0      stevel 
    612   5788    mv143129 	clock_tick_schedule(one_sec);
    613      0      stevel 
    614      0      stevel 	/*
    615      0      stevel 	 * Check for a callout that needs be called from the clock
    616      0      stevel 	 * thread to support the membership protocol in a clustered
    617      0      stevel 	 * system.  Copy the function pointer so that we can reset
    618      0      stevel 	 * this to NULL if needed.
    619      0      stevel 	 */
    620      0      stevel 	if ((funcp = cmm_clock_callout) != NULL)
    621   3792       akolb 		(*funcp)();
    622   3792       akolb 
    623   3792       akolb 	if ((funcp = cpucaps_clock_callout) != NULL)
    624      0      stevel 		(*funcp)();
    625      0      stevel 
    626      0      stevel 	/*
    627      0      stevel 	 * Wakeup the cageout thread waiters once per second.
    628      0      stevel 	 */
    629      0      stevel 	if (one_sec)
    630      0      stevel 		kcage_tick();
    631      0      stevel 
    632      0      stevel 	if (one_sec) {
    633      0      stevel 
    634      0      stevel 		int drift, absdrift;
    635      0      stevel 		timestruc_t tod;
    636      0      stevel 		int s;
    637      0      stevel 
    638      0      stevel 		/*
    639      0      stevel 		 * Beginning of precision-kernel code fragment executed
    640      0      stevel 		 * every second.
    641      0      stevel 		 *
    642      0      stevel 		 * On rollover of the second the phase adjustment to be
    643      0      stevel 		 * used for the next second is calculated.  Also, the
    644      0      stevel 		 * maximum error is increased by the tolerance.  If the
    645      0      stevel 		 * PPS frequency discipline code is present, the phase is
    646      0      stevel 		 * increased to compensate for the CPU clock oscillator
    647      0      stevel 		 * frequency error.
    648      0      stevel 		 *
    649      0      stevel 		 * On a 32-bit machine and given parameters in the timex.h
    650      0      stevel 		 * header file, the maximum phase adjustment is +-512 ms
    651      0      stevel 		 * and maximum frequency offset is (a tad less than)
    652      0      stevel 		 * +-512 ppm. On a 64-bit machine, you shouldn't need to ask.
    653      0      stevel 		 */
    654      0      stevel 		time_maxerror += time_tolerance / SCALE_USEC;
    655      0      stevel 
    656      0      stevel 		/*
    657      0      stevel 		 * Leap second processing. If in leap-insert state at
    658      0      stevel 		 * the end of the day, the system clock is set back one
    659      0      stevel 		 * second; if in leap-delete state, the system clock is
    660      0      stevel 		 * set ahead one second. The microtime() routine or
    661      0      stevel 		 * external clock driver will insure that reported time
    662      0      stevel 		 * is always monotonic. The ugly divides should be
    663      0      stevel 		 * replaced.
    664      0      stevel 		 */
    665      0      stevel 		switch (time_state) {
    666      0      stevel 
    667      0      stevel 		case TIME_OK:
    668      0      stevel 			if (time_status & STA_INS)
    669      0      stevel 				time_state = TIME_INS;
    670      0      stevel 			else if (time_status & STA_DEL)
    671      0      stevel 				time_state = TIME_DEL;
    672      0      stevel 			break;
    673      0      stevel 
    674      0      stevel 		case TIME_INS:
    675      0      stevel 			if (hrestime.tv_sec % 86400 == 0) {
    676      0      stevel 				s = hr_clock_lock();
    677      0      stevel 				hrestime.tv_sec--;
    678      0      stevel 				hr_clock_unlock(s);
    679      0      stevel 				time_state = TIME_OOP;
    680      0      stevel 			}
    681      0      stevel 			break;
    682      0      stevel 
    683      0      stevel 		case TIME_DEL:
    684      0      stevel 			if ((hrestime.tv_sec + 1) % 86400 == 0) {
    685      0      stevel 				s = hr_clock_lock();
    686      0      stevel 				hrestime.tv_sec++;
    687      0      stevel 				hr_clock_unlock(s);
    688      0      stevel 				time_state = TIME_WAIT;
    689      0      stevel 			}
    690      0      stevel 			break;
    691      0      stevel 
    692      0      stevel 		case TIME_OOP:
    693      0      stevel 			time_state = TIME_WAIT;
    694      0      stevel 			break;
    695      0      stevel 
    696      0      stevel 		case TIME_WAIT:
    697      0      stevel 			if (!(time_status & (STA_INS | STA_DEL)))
    698      0      stevel 				time_state = TIME_OK;
    699      0      stevel 		default:
    700      0      stevel 			break;
    701      0      stevel 		}
    702      0      stevel 
    703      0      stevel 		/*
    704      0      stevel 		 * Compute the phase adjustment for the next second. In
    705      0      stevel 		 * PLL mode, the offset is reduced by a fixed factor
    706      0      stevel 		 * times the time constant. In FLL mode the offset is
    707      0      stevel 		 * used directly. In either mode, the maximum phase
    708      0      stevel 		 * adjustment for each second is clamped so as to spread
    709      0      stevel 		 * the adjustment over not more than the number of
    710      0      stevel 		 * seconds between updates.
    711      0      stevel 		 */
    712      0      stevel 		if (time_offset == 0)
    713      0      stevel 			time_adj = 0;
    714      0      stevel 		else if (time_offset < 0) {
    715      0      stevel 			lltemp = -time_offset;
    716      0      stevel 			if (!(time_status & STA_FLL)) {
    717      0      stevel 				if ((1 << time_constant) >= SCALE_KG)
    718      0      stevel 					lltemp *= (1 << time_constant) /
    719      0      stevel 					    SCALE_KG;
    720      0      stevel 				else
    721      0      stevel 					lltemp = (lltemp / SCALE_KG) >>
    722      0      stevel 					    time_constant;
    723      0      stevel 			}
    724      0      stevel 			if (lltemp > (MAXPHASE / MINSEC) * SCALE_UPDATE)
    725      0      stevel 				lltemp = (MAXPHASE / MINSEC) * SCALE_UPDATE;
    726      0      stevel 			time_offset += lltemp;
    727      0      stevel 			time_adj = -(lltemp * SCALE_PHASE) / hz / SCALE_UPDATE;
    728      0      stevel 		} else {
    729      0      stevel 			lltemp = time_offset;
    730      0      stevel 			if (!(time_status & STA_FLL)) {
    731      0      stevel 				if ((1 << time_constant) >= SCALE_KG)
    732      0      stevel 					lltemp *= (1 << time_constant) /
    733      0      stevel 					    SCALE_KG;
    734      0      stevel 				else
    735      0      stevel 					lltemp = (lltemp / SCALE_KG) >>
    736      0      stevel 					    time_constant;
    737      0      stevel 			}
    738      0      stevel 			if (lltemp > (MAXPHASE / MINSEC) * SCALE_UPDATE)
    739      0      stevel 				lltemp = (MAXPHASE / MINSEC) * SCALE_UPDATE;
    740      0      stevel 			time_offset -= lltemp;
    741      0      stevel 			time_adj = (lltemp * SCALE_PHASE) / hz / SCALE_UPDATE;
    742      0      stevel 		}
    743      0      stevel 
    744      0      stevel 		/*
    745      0      stevel 		 * Compute the frequency estimate and additional phase
    746      0      stevel 		 * adjustment due to frequency error for the next
    747      0      stevel 		 * second. When the PPS signal is engaged, gnaw on the
    748      0      stevel 		 * watchdog counter and update the frequency computed by
    749      0      stevel 		 * the pll and the PPS signal.
    750      0      stevel 		 */
    751      0      stevel 		pps_valid++;
    752      0      stevel 		if (pps_valid == PPS_VALID) {
    753      0      stevel 			pps_jitter = MAXTIME;
    754      0      stevel 			pps_stabil = MAXFREQ;
    755      0      stevel 			time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
    756      0      stevel 			    STA_PPSWANDER | STA_PPSERROR);
    757      0      stevel 		}
    758      0      stevel 		lltemp = time_freq + pps_freq;
    759      0      stevel 
    760      0      stevel 		if (lltemp)
    761      0      stevel 			time_adj += (lltemp * SCALE_PHASE) / (SCALE_USEC * hz);
    762      0      stevel 
    763      0      stevel 		/*
    764      0      stevel 		 * End of precision kernel-code fragment
    765      0      stevel 		 *
    766      0      stevel 		 * The section below should be modified if we are planning
    767      0      stevel 		 * to use NTP for synchronization.
    768      0      stevel 		 *
    769      0      stevel 		 * Note: the clock synchronization code now assumes
    770      0      stevel 		 * the following:
    771      0      stevel 		 *   - if dosynctodr is 1, then compute the drift between
    772      0      stevel 		 *	the tod chip and software time and adjust one or
    773      0      stevel 		 *	the other depending on the circumstances
    774      0      stevel 		 *
    775      0      stevel 		 *   - if dosynctodr is 0, then the tod chip is independent
    776      0      stevel 		 *	of the software clock and should not be adjusted,
    777      0      stevel 		 *	but allowed to free run.  this allows NTP to sync.
    778      0      stevel 		 *	hrestime without any interference from the tod chip.
    779      0      stevel 		 */
    780      0      stevel 
    781    950       sethg 		tod_validate_deferred = B_FALSE;
    782      0      stevel 		mutex_enter(&tod_lock);
    783      0      stevel 		tod = tod_get();
    784      0      stevel 		drift = tod.tv_sec - hrestime.tv_sec;
    785      0      stevel 		absdrift = (drift >= 0) ? drift : -drift;
    786      0      stevel 		if (tod_needsync || absdrift > 1) {
    787      0      stevel 			int s;
    788      0      stevel 			if (absdrift > 2) {
    789      0      stevel 				if (!tod_broken && tod_faulted == TOD_NOFAULT) {
    790      0      stevel 					s = hr_clock_lock();
    791      0      stevel 					hrestime = tod;
    792      0      stevel 					membar_enter();	/* hrestime visible */
    793      0      stevel 					timedelta = 0;
    794   4123    dm120769 					timechanged++;
    795      0      stevel 					tod_needsync = 0;
    796      0      stevel 					hr_clock_unlock(s);
    797   8048    Madhavan 					callout_hrestime();
    798   8048    Madhavan 
    799      0      stevel 				}
    800      0      stevel 			} else {
    801      0      stevel 				if (tod_needsync || !dosynctodr) {
    802      0      stevel 					gethrestime(&tod);
    803      0      stevel 					tod_set(tod);
    804      0      stevel 					s = hr_clock_lock();
    805      0      stevel 					if (timedelta == 0)
    806      0      stevel 						tod_needsync = 0;
    807      0      stevel 					hr_clock_unlock(s);
    808      0      stevel 				} else {
    809      0      stevel 					/*
    810      0      stevel 					 * If the drift is 2 seconds on the
    811      0      stevel 					 * money, then the TOD is adjusting
    812      0      stevel 					 * the clock;  record that.
    813      0      stevel 					 */
    814      0      stevel 					clock_adj_hist[adj_hist_entry++ %
    815  11066      rafael 					    CLOCK_ADJ_HIST_SIZE] = now;
    816      0      stevel 					s = hr_clock_lock();
    817      0      stevel 					timedelta = (int64_t)drift*NANOSEC;
    818      0      stevel 					hr_clock_unlock(s);
    819      0      stevel 				}
    820      0      stevel 			}
    821      0      stevel 		}
    822      0      stevel 		one_sec = 0;
    823      0      stevel 		time = gethrestime_sec();  /* for crusty old kmem readers */
    824      0      stevel 		mutex_exit(&tod_lock);
    825      0      stevel 
    826      0      stevel 		/*
    827      0      stevel 		 * Some drivers still depend on this... XXX
    828      0      stevel 		 */
    829      0      stevel 		cv_broadcast(&lbolt_cv);
    830      0      stevel 
    831      0      stevel 		sysinfo.updates++;
    832      0      stevel 		vminfo.freemem += freemem;
    833      0      stevel 		{
    834      0      stevel 			pgcnt_t maxswap, resv, free;
    835      0      stevel 			pgcnt_t avail =
    836      0      stevel 			    MAX((spgcnt_t)(availrmem - swapfs_minfree), 0);
    837      0      stevel 
    838   5076      mishra 			maxswap = k_anoninfo.ani_mem_resv +
    839   5076      mishra 			    k_anoninfo.ani_max +avail;
    840      0      stevel 			free = k_anoninfo.ani_free + avail;
    841      0      stevel 			resv = k_anoninfo.ani_phys_resv +
    842      0      stevel 			    k_anoninfo.ani_mem_resv;
    843      0      stevel 
    844      0      stevel 			vminfo.swap_resv += resv;
    845      0      stevel 			/* number of reserved and allocated pages */
    846      0      stevel #ifdef	DEBUG
    847      0      stevel 			if (maxswap < free)
    848      0      stevel 				cmn_err(CE_WARN, "clock: maxswap < free");
    849      0      stevel 			if (maxswap < resv)
    850      0      stevel 				cmn_err(CE_WARN, "clock: maxswap < resv");
    851      0      stevel #endif
    852      0      stevel 			vminfo.swap_alloc += maxswap - free;
    853      0      stevel 			vminfo.swap_avail += maxswap - resv;
    854      0      stevel 			vminfo.swap_free += free;
    855      0      stevel 		}
    856      0      stevel 		if (nrunnable) {
    857      0      stevel 			sysinfo.runque += nrunnable;
    858      0      stevel 			sysinfo.runocc++;
    859      0      stevel 		}
    860      0      stevel 		if (nswapped) {
    861      0      stevel 			sysinfo.swpque += nswapped;
    862      0      stevel 			sysinfo.swpocc++;
    863      0      stevel 		}
    864      0      stevel 		sysinfo.waiting += w_io;
    865      0      stevel 
    866      0      stevel 		/*
    867      0      stevel 		 * Wake up fsflush to write out DELWRI
    868      0      stevel 		 * buffers, dirty pages and other cached
    869      0      stevel 		 * administrative data, e.g. inodes.
    870      0      stevel 		 */
    871      0      stevel 		if (--fsflushcnt <= 0) {
    872      0      stevel 			fsflushcnt = tune.t_fsflushr;
    873      0      stevel 			cv_signal(&fsflush_cv);
    874      0      stevel 		}
    875      0      stevel 
    876      0      stevel 		vmmeter();
    877      0      stevel 		calcloadavg(genloadavg(&loadavg), hp_avenrun);
    878      0      stevel 		for (i = 0; i < 3; i++)
    879      0      stevel 			/*
    880      0      stevel 			 * At the moment avenrun[] can only hold 31
    881      0      stevel 			 * bits of load average as it is a signed
    882      0      stevel 			 * int in the API. We need to ensure that
    883      0      stevel 			 * hp_avenrun[i] >> (16 - FSHIFT) will not be
    884      0      stevel 			 * too large. If it is, we put the largest value
    885      0      stevel 			 * that we can use into avenrun[i]. This is
    886      0      stevel 			 * kludgey, but about all we can do until we
    887      0      stevel 			 * avenrun[] is declared as an array of uint64[]
    888      0      stevel 			 */
    889      0      stevel 			if (hp_avenrun[i] < ((uint64_t)1<<(31+16-FSHIFT)))
    890      0      stevel 				avenrun[i] = (int32_t)(hp_avenrun[i] >>
    891      0      stevel 				    (16 - FSHIFT));
    892      0      stevel 			else
    893      0      stevel 				avenrun[i] = 0x7fffffff;
    894      0      stevel 
    895      0      stevel 		cpupart = cp_list_head;
    896      0      stevel 		do {
    897      0      stevel 			calcloadavg(genloadavg(&cpupart->cp_loadavg),
    898      0      stevel 			    cpupart->cp_hp_avenrun);
    899      0      stevel 		} while ((cpupart = cpupart->cp_next) != cp_list_head);
    900      0      stevel 
    901      0      stevel 		/*
    902      0      stevel 		 * Wake up the swapper thread if necessary.
    903      0      stevel 		 */
    904      0      stevel 		if (runin ||
    905      0      stevel 		    (runout && (avefree < desfree || wake_sched_sec))) {
    906      0      stevel 			t = &t0;
    907      0      stevel 			thread_lock(t);
    908      0      stevel 			if (t->t_state == TS_STOPPED) {
    909      0      stevel 				runin = runout = 0;
    910      0      stevel 				wake_sched_sec = 0;
    911      0      stevel 				t->t_whystop = 0;
    912      0      stevel 				t->t_whatstop = 0;
    913      0      stevel 				t->t_schedflag &= ~TS_ALLSTART;
    914      0      stevel 				THREAD_TRANSITION(t);
    915      0      stevel 				setfrontdq(t);
    916      0      stevel 			}
    917      0      stevel 			thread_unlock(t);
    918      0      stevel 		}
    919      0      stevel 	}
    920      0      stevel 
    921      0      stevel 	/*
    922      0      stevel 	 * Wake up the swapper if any high priority swapped-out threads
    923      0      stevel 	 * became runable during the last tick.
    924      0      stevel 	 */
    925      0      stevel 	if (wake_sched) {
    926      0      stevel 		t = &t0;
    927      0      stevel 		thread_lock(t);
    928      0      stevel 		if (t->t_state == TS_STOPPED) {
    929      0      stevel 			runin = runout = 0;
    930      0      stevel 			wake_sched = 0;
    931      0      stevel 			t->t_whystop = 0;
    932      0      stevel 			t->t_whatstop = 0;
    933      0      stevel 			t->t_schedflag &= ~TS_ALLSTART;
    934      0      stevel 			THREAD_TRANSITION(t);
    935      0      stevel 			setfrontdq(t);
    936      0      stevel 		}
    937      0      stevel 		thread_unlock(t);
    938      0      stevel 	}
    939      0      stevel }
    940      0      stevel 
    941      0      stevel void
    942      0      stevel clock_init(void)
    943      0      stevel {
    944  11066      rafael 	cyc_handler_t clk_hdlr, timer_hdlr, lbolt_hdlr;
    945  11066      rafael 	cyc_time_t clk_when, lbolt_when;
    946  11066      rafael 	int i, sz;
    947  11066      rafael 	intptr_t buf;
    948      0      stevel 
    949  11066      rafael 	/*
    950  11066      rafael 	 * Setup handler and timer for the clock cyclic.
    951  11066      rafael 	 */
    952  11066      rafael 	clk_hdlr.cyh_func = (cyc_func_t)clock;
    953  11066      rafael 	clk_hdlr.cyh_level = CY_LOCK_LEVEL;
    954  11066      rafael 	clk_hdlr.cyh_arg = NULL;
    955      0      stevel 
    956  11066      rafael 	clk_when.cyt_when = 0;
    957  11066      rafael 	clk_when.cyt_interval = nsec_per_tick;
    958   5107        eota 
    959   5107        eota 	/*
    960   5107        eota 	 * cyclic_timer is dedicated to the ddi interface, which
    961   5107        eota 	 * uses the same clock resolution as the system one.
    962   5107        eota 	 */
    963  11066      rafael 	timer_hdlr.cyh_func = (cyc_func_t)cyclic_timer;
    964  11066      rafael 	timer_hdlr.cyh_level = CY_LOCK_LEVEL;
    965  11066      rafael 	timer_hdlr.cyh_arg = NULL;
    966      0      stevel 
    967  11066      rafael 	/*
    968  11066      rafael 	 * Setup the necessary structures for the lbolt cyclic and add the
    969  11066      rafael 	 * soft interrupt which will switch from event to cyclic mode when
    970  11066      rafael 	 * under high pil.
    971  11066      rafael 	 */
    972  11066      rafael 	lbolt_hdlr.cyh_func = (cyc_func_t)lbolt_cyclic;
    973  11066      rafael 	lbolt_hdlr.cyh_level = CY_LOCK_LEVEL;
    974  11066      rafael 	lbolt_hdlr.cyh_arg = NULL;
    975  11066      rafael 
    976  11066      rafael 	lbolt_when.cyt_interval = nsec_per_tick;
    977  11066      rafael 
    978  11066      rafael 	if (lbolt_cyc_only) {
    979  11066      rafael 		lbolt_when.cyt_when = 0;
    980  11066      rafael 		lbolt_hybrid = lbolt_cyclic_driven;
    981  11066      rafael 	} else {
    982  11066      rafael 		lbolt_when.cyt_when = CY_INFINITY;
    983  11066      rafael 		lbolt_hybrid = lbolt_event_driven;
    984  11066      rafael 	}
    985  11066      rafael 
    986  11066      rafael 	/*
    987  11066      rafael 	 * Allocate cache line aligned space for the per CPU lbolt data and
    988  11099      rafael 	 * lbolt info structures, and initialize them with their default
    989  11099      rafael 	 * values. Note that these structures are also cache line sized.
    990  11066      rafael 	 */
    991  11066      rafael 	sz = sizeof (lbolt_info_t) + CPU_CACHE_COHERENCE_SIZE;
    992  11066      rafael 	buf = (intptr_t)kmem_zalloc(sz, KM_SLEEP);
    993  11066      rafael 	lb_info = (lbolt_info_t *)P2ROUNDUP(buf, CPU_CACHE_COHERENCE_SIZE);
    994  11066      rafael 
    995  11066      rafael 	if (hz != HZ_DEFAULT)
    996  11066      rafael 		lb_info->lbi_thresh_interval = LBOLT_THRESH_INTERVAL *
    997  11066      rafael 		    hz/HZ_DEFAULT;
    998  11066      rafael 	else
    999  11066      rafael 		lb_info->lbi_thresh_interval = LBOLT_THRESH_INTERVAL;
   1000  11066      rafael 
   1001  11066      rafael 	lb_info->lbi_thresh_calls = LBOLT_THRESH_CALLS;
   1002  11066      rafael 
   1003  11099      rafael 	sz = (sizeof (lbolt_cpu_t) * max_ncpus) + CPU_CACHE_COHERENCE_SIZE;
   1004  11066      rafael 	buf = (intptr_t)kmem_zalloc(sz, KM_SLEEP);
   1005  11066      rafael 	lb_cpu = (lbolt_cpu_t *)P2ROUNDUP(buf, CPU_CACHE_COHERENCE_SIZE);
   1006  11066      rafael 
   1007  11066      rafael 	for (i = 0; i < max_ncpus; i++)
   1008  11066      rafael 		lb_cpu[i].lbc_counter = lb_info->lbi_thresh_calls;
   1009  11066      rafael 
   1010  11066      rafael 	lbolt_softint_add();
   1011  11066      rafael 
   1012  11066      rafael 	/*
   1013  11066      rafael 	 * Grab cpu_lock and install all three cyclics.
   1014  11066      rafael 	 */
   1015      0      stevel 	mutex_enter(&cpu_lock);
   1016  11066      rafael 
   1017  11066      rafael 	clock_cyclic = cyclic_add(&clk_hdlr, &clk_when);
   1018  11066      rafael 	ddi_timer_cyclic = cyclic_add(&timer_hdlr, &clk_when);
   1019  11151      rafael 	lb_info->id.lbi_cyclic_id = cyclic_add(&lbolt_hdlr, &lbolt_when);
   1020  11066      rafael 
   1021      0      stevel 	mutex_exit(&cpu_lock);
   1022      0      stevel }
   1023      0      stevel 
   1024      0      stevel /*
   1025      0      stevel  * Called before calcloadavg to get 10-sec moving loadavg together
   1026      0      stevel  */
   1027      0      stevel 
   1028      0      stevel static int
   1029      0      stevel genloadavg(struct loadavg_s *avgs)
   1030      0      stevel {
   1031      0      stevel 	int avg;
   1032      0      stevel 	int spos; /* starting position */
   1033      0      stevel 	int cpos; /* moving current position */
   1034      0      stevel 	int i;
   1035      0      stevel 	int slen;
   1036      0      stevel 	hrtime_t hr_avg;
   1037      0      stevel 
   1038      0      stevel 	/* 10-second snapshot, calculate first positon */
   1039      0      stevel 	if (avgs->lg_len == 0) {
   1040      0      stevel 		return (0);
   1041      0      stevel 	}
   1042      0      stevel 	slen = avgs->lg_len < S_MOVAVG_SZ ? avgs->lg_len : S_MOVAVG_SZ;
   1043      0      stevel 
   1044      0      stevel 	spos = (avgs->lg_cur - 1) >= 0 ? avgs->lg_cur - 1 :
   1045      0      stevel 	    S_LOADAVG_SZ + (avgs->lg_cur - 1);
   1046      0      stevel 	for (i = hr_avg = 0; i < slen; i++) {
   1047      0      stevel 		cpos = (spos - i) >= 0 ? spos - i : S_LOADAVG_SZ + (spos - i);
   1048      0      stevel 		hr_avg += avgs->lg_loads[cpos];
   1049      0      stevel 	}
   1050      0      stevel 
   1051      0      stevel 	hr_avg = hr_avg / slen;
   1052      0      stevel 	avg = hr_avg / (NANOSEC / LGRP_LOADAVG_IN_THREAD_MAX);
   1053      0      stevel 
   1054      0      stevel 	return (avg);
   1055      0      stevel }
   1056      0      stevel 
   1057      0      stevel /*
   1058      0      stevel  * Run every second from clock () to update the loadavg count available to the
   1059      0      stevel  * system and cpu-partitions.
   1060      0      stevel  *
   1061      0      stevel  * This works by sampling the previous usr, sys, wait time elapsed,
   1062      0      stevel  * computing a delta, and adding that delta to the elapsed usr, sys,
   1063      0      stevel  * wait increase.
   1064      0      stevel  */
   1065      0      stevel 
   1066      0      stevel static void
   1067      0      stevel loadavg_update()
   1068      0      stevel {
   1069      0      stevel 	cpu_t *cp;
   1070      0      stevel 	cpupart_t *cpupart;
   1071      0      stevel 	hrtime_t cpu_total;
   1072      0      stevel 	int prev;
   1073      0      stevel 
   1074      0      stevel 	cp = cpu_list;
   1075      0      stevel 	loadavg.lg_total = 0;
   1076      0      stevel 
   1077      0      stevel 	/*
   1078      0      stevel 	 * first pass totals up per-cpu statistics for system and cpu
   1079      0      stevel 	 * partitions
   1080      0      stevel 	 */
   1081      0      stevel 
   1082      0      stevel 	do {
   1083      0      stevel 		struct loadavg_s *lavg;
   1084      0      stevel 
   1085      0      stevel 		lavg = &cp->cpu_loadavg;
   1086      0      stevel 
   1087      0      stevel 		cpu_total = cp->cpu_acct[CMS_USER] +
   1088      0      stevel 		    cp->cpu_acct[CMS_SYSTEM] + cp->cpu_waitrq;
   1089      0      stevel 		/* compute delta against last total */
   1090      0      stevel 		scalehrtime(&cpu_total);
   1091      0      stevel 		prev = (lavg->lg_cur - 1) >= 0 ? lavg->lg_cur - 1 :
   1092      0      stevel 		    S_LOADAVG_SZ + (lavg->lg_cur - 1);
   1093      0      stevel 		if (lavg->lg_loads[prev] <= 0) {
   1094      0      stevel 			lavg->lg_loads[lavg->lg_cur] = cpu_total;
   1095      0      stevel 			cpu_total = 0;
   1096      0      stevel 		} else {
   1097      0      stevel 			lavg->lg_loads[lavg->lg_cur] = cpu_total;
   1098      0      stevel 			cpu_total = cpu_total - lavg->lg_loads[prev];
   1099      0      stevel 			if (cpu_total < 0)
   1100      0      stevel 				cpu_total = 0;
   1101      0      stevel 		}
   1102      0      stevel 
   1103      0      stevel 		lavg->lg_cur = (lavg->lg_cur + 1) % S_LOADAVG_SZ;
   1104      0      stevel 		lavg->lg_len = (lavg->lg_len + 1) < S_LOADAVG_SZ ?
   1105      0      stevel 		    lavg->lg_len + 1 : S_LOADAVG_SZ;
   1106      0      stevel 
   1107      0      stevel 		loadavg.lg_total += cpu_total;
   1108      0      stevel 		cp->cpu_part->cp_loadavg.lg_total += cpu_total;
   1109      0      stevel 
   1110      0      stevel 	} while ((cp = cp->cpu_next) != cpu_list);
   1111      0      stevel 
   1112      0      stevel 	loadavg.lg_loads[loadavg.lg_cur] = loadavg.lg_total;
   1113      0      stevel 	loadavg.lg_cur = (loadavg.lg_cur + 1) % S_LOADAVG_SZ;
   1114      0      stevel 	loadavg.lg_len = (loadavg.lg_len + 1) < S_LOADAVG_SZ ?
   1115      0      stevel 	    loadavg.lg_len + 1 : S_LOADAVG_SZ;
   1116      0      stevel 	/*
   1117      0      stevel 	 * Second pass updates counts
   1118      0      stevel 	 */
   1119      0      stevel 	cpupart = cp_list_head;
   1120      0      stevel 
   1121      0      stevel 	do {
   1122      0      stevel 		struct loadavg_s *lavg;
   1123      0      stevel 
   1124      0      stevel 		lavg = &cpupart->cp_loadavg;
   1125      0      stevel 		lavg->lg_loads[lavg->lg_cur] = lavg->lg_total;
   1126      0      stevel 		lavg->lg_total = 0;
   1127      0      stevel 		lavg->lg_cur = (lavg->lg_cur + 1) % S_LOADAVG_SZ;
   1128      0      stevel 		lavg->lg_len = (lavg->lg_len + 1) < S_LOADAVG_SZ ?
   1129      0      stevel 		    lavg->lg_len + 1 : S_LOADAVG_SZ;
   1130      0      stevel 
   1131      0      stevel 	} while ((cpupart = cpupart->cp_next) != cp_list_head);
   1132      0      stevel 
   1133      0      stevel }
   1134      0      stevel 
   1135      0      stevel /*
   1136      0      stevel  * clock_update() - local clock update
   1137      0      stevel  *
   1138      0      stevel  * This routine is called by ntp_adjtime() to update the local clock
   1139      0      stevel  * phase and frequency. The implementation is of an
   1140      0      stevel  * adaptive-parameter, hybrid phase/frequency-lock loop (PLL/FLL). The
   1141      0      stevel  * routine computes new time and frequency offset estimates for each
   1142      0      stevel  * call.  The PPS signal itself determines the new time offset,
   1143      0      stevel  * instead of the calling argument.  Presumably, calls to
   1144      0      stevel  * ntp_adjtime() occur only when the caller believes the local clock
   1145      0      stevel  * is valid within some bound (+-128 ms with NTP). If the caller's
   1146      0      stevel  * time is far different than the PPS time, an argument will ensue,
   1147      0      stevel  * and it's not clear who will lose.
   1148      0      stevel  *
   1149      0      stevel  * For uncompensated quartz crystal oscillators and nominal update
   1150      0      stevel  * intervals less than 1024 s, operation should be in phase-lock mode
   1151      0      stevel  * (STA_FLL = 0), where the loop is disciplined to phase. For update
   1152      0      stevel  * intervals greater than this, operation should be in frequency-lock
   1153      0      stevel  * mode (STA_FLL = 1), where the loop is disciplined to frequency.
   1154      0      stevel  *
   1155      0      stevel  * Note: mutex(&tod_lock) is in effect.
   1156      0      stevel  */
   1157      0      stevel void
   1158      0      stevel clock_update(int offset)
   1159      0      stevel {
   1160      0      stevel 	int ltemp, mtemp, s;
   1161      0      stevel 
   1162      0      stevel 	ASSERT(MUTEX_HELD(&tod_lock));
   1163      0      stevel 
                           	/*
                           	 * Nothing to do unless either PLL updates (STA_PLL) or PPS time
                           	 * discipline (STA_PPSTIME) is enabled in time_status.
                           	 */
   1164      0      stevel 	if (!(time_status & STA_PLL) && !(time_status & STA_PPSTIME))
   1165      0      stevel 		return;
                           	/*
                           	 * Start from the caller's offset; if PPS time discipline is enabled
                           	 * and a PPS signal is present, pps_offset replaces it.
                           	 */
   1166      0      stevel 	ltemp = offset;
   1167      0      stevel 	if ((time_status & STA_PPSTIME) && (time_status & STA_PPSSIGNAL))
   1168      0      stevel 		ltemp = pps_offset;
   1169      0      stevel 
   1170      0      stevel 	/*
   1171      0      stevel 	 * Scale the phase adjustment and clamp to the operating range.
   1172      0      stevel 	 */
   1173      0      stevel 	if (ltemp > MAXPHASE)
   1174      0      stevel 		time_offset = MAXPHASE * SCALE_UPDATE;
   1175      0      stevel 	else if (ltemp < -MAXPHASE)
   1176      0      stevel 		time_offset = -(MAXPHASE * SCALE_UPDATE);
   1177      0      stevel 	else
   1178      0      stevel 		time_offset = ltemp * SCALE_UPDATE;
   1179      0      stevel 
   1180      0      stevel 	/*
   1181      0      stevel 	 * Select whether the frequency is to be controlled and in which
   1182      0      stevel 	 * mode (PLL or FLL). Clamp to the operating range. Ugly
   1183      0      stevel 	 * multiply/divide should be replaced someday.
   1184      0      stevel 	 */
                           	/*
                           	 * With frequency hold set, or on the first update (time_reftime
                           	 * still zero), restart the reference time at "now" so that the
                           	 * elapsed interval mtemp computed below is zero.
                           	 */
   1185      0      stevel 	if (time_status & STA_FREQHOLD || time_reftime == 0)
   1186      0      stevel 		time_reftime = hrestime.tv_sec;
   1187      0      stevel 
                           	/* mtemp: seconds elapsed since the previous update. */
   1188      0      stevel 	mtemp = hrestime.tv_sec - time_reftime;
   1189      0      stevel 	time_reftime = hrestime.tv_sec;
   1190      0      stevel 
   1191      0      stevel 	if (time_status & STA_FLL) {
                           		/*
                           		 * FLL mode: only adjust when at least MINSEC seconds have
                           		 * elapsed; the correction is the scaled offset divided by
                           		 * the interval.
                           		 */
   1192      0      stevel 		if (mtemp >= MINSEC) {
   1193      0      stevel 			ltemp = ((time_offset / mtemp) * (SCALE_USEC /
   1194      0      stevel 			    SCALE_UPDATE));
   1195      0      stevel 			if (ltemp)
   1196      0      stevel 				time_freq += ltemp / SCALE_KH;
   1197      0      stevel 		}
   1198      0      stevel 	} else {
                           		/*
                           		 * PLL mode: only adjust when the interval is shorter than
                           		 * MAXSEC. The product offset * interval is widened to 64
                           		 * bits before scaling, then divided by
                           		 * 2^(2 * time_constant).
                           		 */
   1199      0      stevel 		if (mtemp < MAXSEC) {
   1200      0      stevel 			ltemp *= mtemp;
   1201      0      stevel 			if (ltemp)
   1202      0      stevel 				time_freq += (int)(((int64_t)ltemp *
   1203      0      stevel 				    SCALE_USEC) / SCALE_KF)
   1204      0      stevel 				    / (1 << (time_constant * 2));
   1205      0      stevel 		}
   1206      0      stevel 	}
                           	/* Keep the frequency estimate within +/- time_tolerance. */
   1207      0      stevel 	if (time_freq > time_tolerance)
   1208      0      stevel 		time_freq = time_tolerance;
   1209      0      stevel 	else if (time_freq < -time_tolerance)
   1210      0      stevel 		time_freq = -time_tolerance;
   1211      0      stevel 
                           	/* Set tod_needsync while holding the hr clock lock. */
   1212      0      stevel 	s = hr_clock_lock();
   1213      0      stevel 	tod_needsync = 1;
   1214      0      stevel 	hr_clock_unlock(s);
   1215      0      stevel }
   1216      0      stevel 
   1217      0      stevel /*
   1218      0      stevel  * ddi_hardpps() - discipline CPU clock oscillator to external PPS signal
   1219      0      stevel  *
   1220      0      stevel  * This routine is called at each PPS interrupt in order to discipline
   1221      0      stevel  * the CPU clock oscillator to the PPS signal. It measures the PPS phase
   1222      0      stevel  * and leaves it in a handy spot for the clock() routine. It
   1223      0      stevel  * integrates successive PPS phase differences and calculates the
   1224      0      stevel  * frequency offset. This is used in clock() to discipline the CPU
   1225      0      stevel  * clock oscillator so that intrinsic frequency error is cancelled out.
   1226      0      stevel  * The code requires the caller to capture the time and hardware counter
   1227      0      stevel  * value at the on-time PPS signal transition.
   1228      0      stevel  *
   1229      0      stevel  * Note that, on some Unix systems, this routine runs at an interrupt
   1230      0      stevel  * priority level higher than the timer interrupt routine clock().
   1231      0      stevel  * Therefore, the variables used are distinct from the clock()
   1232      0      stevel  * variables, except for certain exceptions: The PPS frequency pps_freq
   1233      0      stevel  * and phase pps_offset variables are determined by this routine and
   1234      0      stevel  * updated atomically. The time_tolerance variable can be considered a
   1235      0      stevel  * constant, since it is infrequently changed, and then only when the
   1236      0      stevel  * PPS signal is disabled. The watchdog counter pps_valid is updated
   1237      0      stevel  * once per second by clock() and is atomically cleared in this
   1238      0      stevel  * routine.
   1239      0      stevel  *
   1240      0      stevel  * tvp is the time of the last tick; usec is a microsecond count since the
   1241      0      stevel  * last tick.
   1242      0      stevel  *
   1243      0      stevel  * Note: In Solaris systems, the tick value is actually given by
   1244      0      stevel  *       usec_per_tick.  This is called from the serial driver cdintr(),
   1245      0      stevel  *	 or equivalent, at a high PIL.  Because the kernel keeps a
   1246      0      stevel  *	 high-resolution time, the following code can accept either
   1247      0      stevel  *	 the traditional argument pair, or the current highres timestamp
   1248      0      stevel  *       in tvp and zero in usec.
   1249      0      stevel  */
   1250      0      stevel void
   1251      0      stevel ddi_hardpps(struct timeval *tvp, int usec)
   1252      0      stevel {
   1253      0      stevel 	int u_usec, v_usec, bigtick;
   1254      0      stevel 	time_t cal_sec;
   1255      0      stevel 	int cal_usec;
   1256      0      stevel 
   1257      0      stevel 	/*
   1258      0      stevel 	 * An occasional glitch can be produced when the PPS interrupt
   1259      0      stevel 	 * occurs in the clock() routine before the time variable is
   1260      0      stevel 	 * updated. Here the offset is discarded when the difference
   1261      0      stevel 	 * between it and the last one is greater than tick/2, but not
   1262      0      stevel 	 * if the interval since the first discard exceeds 30 s.
   1263      0      stevel 	 */
   1264      0      stevel 	time_status |= STA_PPSSIGNAL;
   1265      0      stevel 	time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR);
                           	/* A pulse arrived: clear the PPS watchdog counter. */
   1266      0      stevel 	pps_valid = 0;
                           	/*
                           	 * The new offset sample is the negated microsecond field, folded
                           	 * by one second when it is below -MICROSEC/2 (assumes tv_usec is
                           	 * in [0, MICROSEC) -- TODO confirm against callers).
                           	 */
   1267      0      stevel 	u_usec = -tvp->tv_usec;
   1268      0      stevel 	if (u_usec < -(MICROSEC/2))
   1269      0      stevel 		u_usec += MICROSEC;
                           	/* v_usec: absolute distance of the sample from the last offset. */
   1270      0      stevel 	v_usec = pps_offset - u_usec;
   1271      0      stevel 	if (v_usec < 0)
   1272      0      stevel 		v_usec = -v_usec;
   1273      0      stevel 	if (v_usec > (usec_per_tick >> 1)) {
                           		/*
                           		 * Sample is more than half a tick away. Once more than
                           		 * MAXGLITCH consecutive samples have been discarded,
                           		 * accept it and preload the filter with it; otherwise
                           		 * count the glitch and reuse the previous offset.
                           		 */
   1274      0      stevel 		if (pps_glitch > MAXGLITCH) {
   1275      0      stevel 			pps_glitch = 0;
   1276      0      stevel 			pps_tf[2] = u_usec;
   1277      0      stevel 			pps_tf[1] = u_usec;
   1278      0      stevel 		} else {
   1279      0      stevel 			pps_glitch++;
   1280      0      stevel 			u_usec = pps_offset;
   1281      0      stevel 		}
   1282      0      stevel 	} else
   1283      0      stevel 		pps_glitch = 0;
   1284      0      stevel 
   1285      0      stevel 	/*
   1286      0      stevel 	 * A three-stage median filter is used to help deglitch the pps
   1287      0      stevel 	 * time. The median sample becomes the time offset estimate; the
   1288      0      stevel 	 * difference between the other two samples becomes the time
   1289      0      stevel 	 * dispersion (jitter) estimate.
   1290      0      stevel 	 */
   1291      0      stevel 	pps_tf[2] = pps_tf[1];
   1292      0      stevel 	pps_tf[1] = pps_tf[0];
   1293      0      stevel 	pps_tf[0] = u_usec;
   1294      0      stevel 	if (pps_tf[0] > pps_tf[1]) {
   1295      0      stevel 		if (pps_tf[1] > pps_tf[2]) {
   1296      0      stevel 			pps_offset = pps_tf[1];		/* 0 1 2 */
   1297      0      stevel 			v_usec = pps_tf[0] - pps_tf[2];
   1298      0      stevel 		} else if (pps_tf[2] > pps_tf[0]) {
   1299      0      stevel 			pps_offset = pps_tf[0];		/* 2 0 1 */
   1300      0      stevel 			v_usec = pps_tf[2] - pps_tf[1];
   1301      0      stevel 		} else {
   1302      0      stevel 			pps_offset = pps_tf[2];		/* 0 2 1 */
   1303      0      stevel 			v_usec = pps_tf[0] - pps_tf[1];
   1304      0      stevel 		}
   1305      0      stevel 	} else {
   1306      0      stevel 		if (pps_tf[1] < pps_tf[2]) {
   1307      0      stevel 			pps_offset = pps_tf[1];		/* 2 1 0 */
   1308      0      stevel 			v_usec = pps_tf[2] - pps_tf[0];
   1309      0      stevel 		} else  if (pps_tf[2] < pps_tf[0]) {
   1310      0      stevel 			pps_offset = pps_tf[0];		/* 1 0 2 */
   1311      0      stevel 			v_usec = pps_tf[1] - pps_tf[2];
   1312      0      stevel 		} else {
   1313      0      stevel 			pps_offset = pps_tf[2];		/* 1 2 0 */
   1314      0      stevel 			v_usec = pps_tf[1] - pps_tf[0];
   1315      0      stevel 		}
   1316      0      stevel 	}
                           	/*
                           	 * Count samples whose dispersion exceeds MAXTIME, then fold the
                           	 * dispersion into the running pps_jitter average with a weight
                           	 * of 1/2^PPS_AVG.
                           	 */
   1317      0      stevel 	if (v_usec > MAXTIME)
   1318      0      stevel 		pps_jitcnt++;
   1319      0      stevel 	v_usec = (v_usec << PPS_AVG) - pps_jitter;
   1320      0      stevel 	pps_jitter += v_usec / (1 << PPS_AVG);
   1321      0      stevel 	if (pps_jitter > (MAXTIME >> 1))
   1322      0      stevel 		time_status |= STA_PPSJITTER;
   1323      0      stevel 
   1324      0      stevel 	/*
   1325      0      stevel 	 * During the calibration interval adjust the starting time when
   1326      0      stevel 	 * the tick overflows. At the end of the interval compute the
   1327      0      stevel 	 * duration of the interval and the difference of the hardware
   1328      0      stevel 	 * counters at the beginning and end of the interval. This code
   1329      0      stevel 	 * is deliciously complicated by the fact valid differences may
   1330      0      stevel 	 * exceed the value of tick when using long calibration
   1331      0      stevel 	 * intervals and small ticks. Note that the counter can be
   1332      0      stevel 	 * greater than tick if caught at just the wrong instant, but
   1333      0      stevel 	 * the values returned and used here are correct.
   1334      0      stevel 	 */
                           	/* bigtick: one tick expressed in SCALE_USEC-scaled microseconds. */
   1335      0      stevel 	bigtick = (int)usec_per_tick * SCALE_USEC;
   1336      0      stevel 	pps_usec -= pps_freq;
   1337      0      stevel 	if (pps_usec >= bigtick)
   1338      0      stevel 		pps_usec -= bigtick;
   1339      0      stevel 	if (pps_usec < 0)
   1340      0      stevel 		pps_usec += bigtick;
                           	/*
                           	 * Advance the expected pulse time by one second. The calibration
                           	 * interval is 2^pps_shift pulses; nothing more is done until that
                           	 * many have been counted.
                           	 */
   1341      0      stevel 	pps_time.tv_sec++;
   1342      0      stevel 	pps_count++;
   1343      0      stevel 	if (pps_count < (1 << pps_shift))
   1344      0      stevel 		return;
   1345      0      stevel 	pps_count = 0;
   1346      0      stevel 	pps_calcnt++;
                           	/*
                           	 * v_usec becomes the counter difference over the interval,
                           	 * wrapped to within half a bigtick of zero and then divided by
                           	 * 2^pps_shift. The shift is applied to the magnitude so that a
                           	 * negative value is never right-shifted.
                           	 */
   1347      0      stevel 	u_usec = usec * SCALE_USEC;
   1348      0      stevel 	v_usec = pps_usec - u_usec;
   1349      0      stevel 	if (v_usec >= bigtick >> 1)
   1350      0      stevel 		v_usec -= bigtick;
   1351      0      stevel 	if (v_usec < -(bigtick >> 1))
   1352      0      stevel 		v_usec += bigtick;
   1353      0      stevel 	if (v_usec < 0)
   1354      0      stevel 		v_usec = -(-v_usec >> pps_shift);
   1355      0      stevel 	else
   1356      0      stevel 		v_usec = v_usec >> pps_shift;
   1357      0      stevel 	pps_usec = u_usec;
                           	/*
                           	 * cal_sec/cal_usec: actual pulse time (*tvp) minus the expected
                           	 * pulse time (pps_time), normalized so cal_usec is non-negative.
                           	 * pps_time is then restarted from *tvp for the next interval.
                           	 */
   1358      0      stevel 	cal_sec = tvp->tv_sec;
   1359      0      stevel 	cal_usec = tvp->tv_usec;
   1360      0      stevel 	cal_sec -= pps_time.tv_sec;
   1361      0      stevel 	cal_usec -= pps_time.tv_usec;
   1362      0      stevel 	if (cal_usec < 0) {
   1363      0      stevel 		cal_usec += MICROSEC;
   1364      0      stevel 		cal_sec--;
   1365      0      stevel 	}
   1366      0      stevel 	pps_time = *tvp;
   1367      0      stevel 
   1368      0      stevel 	/*
   1369      0      stevel 	 * Check for lost interrupts, noise, excessive jitter and
   1370      0      stevel 	 * excessive frequency error. The number of timer ticks during
   1371      0      stevel 	 * the interval may vary +-1 tick. Add to this a margin of one
   1372      0      stevel 	 * tick for the PPS signal jitter and maximum frequency
   1373      0      stevel 	 * deviation. If the limits are exceeded, the calibration
   1374      0      stevel 	 * interval is reset to the minimum and we start over.
   1375      0      stevel 	 */
                           	/*
                           	 * u_usec is reused here as the two-tick acceptance window: the
                           	 * time difference must lie within two ticks either side of zero
                           	 * and |v_usec| must not exceed time_tolerance.
                           	 */
   1376      0      stevel 	u_usec = (int)usec_per_tick << 1;
   1377      0      stevel 	if (!((cal_sec == -1 && cal_usec > (MICROSEC - u_usec)) ||
   1378      0      stevel 	    (cal_sec == 0 && cal_usec < u_usec)) ||
   1379      0      stevel 	    v_usec > time_tolerance || v_usec < -time_tolerance) {
   1380      0      stevel 		pps_errcnt++;
   1381      0      stevel 		pps_shift = PPS_SHIFT;
   1382      0      stevel 		pps_intcnt = 0;
   1383      0      stevel 		time_status |= STA_PPSERROR;
   1384      0      stevel 		return;
   1385      0      stevel 	}
   1386      0      stevel 
   1387      0      stevel 	/*
   1388      0      stevel 	 * A three-stage median filter is used to help deglitch the pps
   1389      0      stevel 	 * frequency. The median sample becomes the frequency offset
   1390      0      stevel 	 * estimate; the difference between the other two samples
   1391      0      stevel 	 * becomes the frequency dispersion (stability) estimate.
   1392      0      stevel 	 */
                           	/* On exit u_usec holds the median and v_usec the dispersion. */
   1393      0      stevel 	pps_ff[2] = pps_ff[1];
   1394      0      stevel 	pps_ff[1] = pps_ff[0];
   1395      0      stevel 	pps_ff[0] = v_usec;
   1396      0      stevel 	if (pps_ff[0] > pps_ff[1]) {
   1397      0      stevel 		if (pps_ff[1] > pps_ff[2]) {
   1398      0      stevel 			u_usec = pps_ff[1];		/* 0 1 2 */
   1399      0      stevel 			v_usec = pps_ff[0] - pps_ff[2];
   1400      0      stevel 		} else if (pps_ff[2] > pps_ff[0]) {
   1401      0      stevel 			u_usec = pps_ff[0];		/* 2 0 1 */
   1402      0      stevel 			v_usec = pps_ff[2] - pps_ff[1];
   1403      0      stevel 		} else {
   1404      0      stevel 			u_usec = pps_ff[2];		/* 0 2 1 */
   1405      0      stevel 			v_usec = pps_ff[0] - pps_ff[1];
   1406      0      stevel 		}
   1407      0      stevel 	} else {
   1408      0      stevel 		if (pps_ff[1] < pps_ff[2]) {
   1409      0      stevel 			u_usec = pps_ff[1];		/* 2 1 0 */
   1410      0      stevel 			v_usec = pps_ff[2] - pps_ff[0];
   1411      0      stevel 		} else  if (pps_ff[2] < pps_ff[0]) {
   1412      0      stevel 			u_usec = pps_ff[0];		/* 1 0 2 */
   1413      0      stevel 			v_usec = pps_ff[1] - pps_ff[2];
   1414      0      stevel 		} else {
   1415      0      stevel 			u_usec = pps_ff[2];		/* 1 2 0 */
   1416      0      stevel 			v_usec = pps_ff[1] - pps_ff[0];
   1417      0      stevel 		}
   1418      0      stevel 	}
   1419      0      stevel 
   1420      0      stevel 	/*
   1421      0      stevel 	 * Here the frequency dispersion (stability) is updated. If it
   1422      0      stevel 	 * is less than one-fourth the maximum (MAXFREQ), the frequency
   1423      0      stevel 	 * offset is updated as well, but clamped to the tolerance. It
   1424      0      stevel 	 * will be processed later by the clock() routine.
   1425      0      stevel 	 */
                           	/*
                           	 * Move pps_stabil toward half the dispersion by 1/2^PPS_AVG of
                           	 * the difference; the shift is applied to the magnitude in both
                           	 * directions.
                           	 */
   1426      0      stevel 	v_usec = (v_usec >> 1) - pps_stabil;
   1427      0      stevel 	if (v_usec < 0)
   1428      0      stevel 		pps_stabil -= -v_usec >> PPS_AVG;
   1429      0      stevel 	else
   1430      0      stevel 		pps_stabil += v_usec >> PPS_AVG;
   1431      0      stevel 	if (pps_stabil > MAXFREQ >> 2) {
   1432      0      stevel 		pps_stbcnt++;
   1433      0      stevel 		time_status |= STA_PPSWANDER;
   1434      0      stevel 		return;
   1435      0      stevel 	}
                           	/*
                           	 * NOTE(review): u_usec is made non-negative only inside this
                           	 * branch. When STA_PPSFREQ is clear, a negative median reaches
                           	 * the "u_usec << pps_shift" test below unchanged -- confirm
                           	 * whether that is intended.
                           	 */
   1436      0      stevel 	if (time_status & STA_PPSFREQ) {
   1437      0      stevel 		if (u_usec < 0) {
   1438      0      stevel 			pps_freq -= -u_usec >> PPS_AVG;
   1439      0      stevel 			if (pps_freq < -time_tolerance)
   1440      0      stevel 				pps_freq = -time_tolerance;
   1441      0      stevel 			u_usec = -u_usec;
   1442      0      stevel 		} else {
   1443      0      stevel 			pps_freq += u_usec >> PPS_AVG;
   1444      0      stevel 			if (pps_freq > time_tolerance)
   1445      0      stevel 				pps_freq = time_tolerance;
   1446      0      stevel 		}
   1447      0      stevel 	}
   1448      0      stevel 
   1449      0      stevel 	/*
   1450      0      stevel 	 * Here the calibration interval is adjusted. If the maximum
   1451      0      stevel 	 * time difference is greater than tick / 4, reduce the interval
   1452      0      stevel 	 * by half. If this is not the case for four consecutive
   1453      0      stevel 	 * intervals, double the interval.
   1454      0      stevel 	 */
                           	/* pps_shift stays within [PPS_SHIFT, PPS_SHIFTMAX]. */
   1455      0      stevel 	if (u_usec << pps_shift > bigtick >> 2) {
   1456      0      stevel 		pps_intcnt = 0;
   1457      0      stevel 		if (pps_shift > PPS_SHIFT)
   1458      0      stevel 			pps_shift--;
   1459      0      stevel 	} else if (pps_intcnt >= 4) {
   1460      0      stevel 		pps_intcnt = 0;
   1461      0      stevel 		if (pps_shift < PPS_SHIFTMAX)
   1462      0      stevel 			pps_shift++;
   1463      0      stevel 	} else
   1464      0      stevel 		pps_intcnt++;
   1465      0      stevel 
   1466      0      stevel 	/*
   1467      0      stevel 	 * If recovering from kmdb, then make sure the tod chip gets resynced.
   1468      0      stevel 	 * If we took an early exit above, then we don't yet have a stable
   1469      0      stevel 	 * calibration signal to lock onto, so don't mark the tod for sync
   1470      0      stevel 	 * until we get all the way here.
   1471      0      stevel 	 */
   1472      0      stevel 	{
   1473      0      stevel 		int s = hr_clock_lock();
   1474      0      stevel 
   1475      0      stevel 		tod_needsync = 1;
   1476      0      stevel 		hr_clock_unlock(s);
   1477      0      stevel 	}
   1478      0      stevel }
   1479      0      stevel 
   1480      0      stevel /*
   1481      0      stevel  * Handle clock tick processing for a thread.
   1482      0      stevel  * Check for timer action, enforce CPU rlimit, do profiling etc.
   1483      0      stevel  */
   1484      0      stevel void
   1485   5788    mv143129 clock_tick(kthread_t *t, int pending)
   1486      0      stevel {
   1487      0      stevel 	struct proc *pp;
   1488      0      stevel 	klwp_id_t    lwp;
   1489      0      stevel 	struct as *as;
   1490   5788    mv143129 	clock_t	ticks;
   1491      0      stevel 	int	poke = 0;		/* notify another CPU */
   1492      0      stevel 	int	user_mode;
   1493      0      stevel 	size_t	 rss;
   1494   5788    mv143129 	int i, total_usec, usec;
   1495   5788    mv143129 	rctl_qty_t secs;
   1496   5788    mv143129 
                           	/*
                           	 * t is the thread being charged; pending is the number of ticks
                           	 * to account for in this call and must be positive. The caller
                           	 * must hold the owning process's p_lock (asserted below).
                           	 */
   1497   5788    mv143129 	ASSERT(pending > 0);
   1498      0      stevel 
   1499      0      stevel 	/* Must be operating on a lwp/thread */
   1500      0      stevel 	if ((lwp = ttolwp(t)) == NULL) {
   1501      0      stevel 		panic("clock_tick: no lwp");
   1502      0      stevel 		/*NOTREACHED*/
   1503      0      stevel 	}
   1504      0      stevel 
                           	/* Run the scheduling-class tick hook once per pending tick. */
   1505   5788    mv143129 	for (i = 0; i < pending; i++) {
   1506   5788    mv143129 		CL_TICK(t);	/* Class specific tick processing */
   1507   5788    mv143129 		DTRACE_SCHED1(tick, kthread_t *, t);
   1508   5788    mv143129 	}
   1509      0      stevel 
   1510      0      stevel 	pp = ttoproc(t);
   1511      0      stevel 
   1512      0      stevel 	/* pp->p_lock makes sure that the thread does not exit */
   1513      0      stevel 	ASSERT(MUTEX_HELD(&pp->p_lock));
   1514      0      stevel 
   1515      0      stevel 	user_mode = (lwp->lwp_state == LWP_USER);
   1516      0      stevel 
                           	/*
                           	 * Ticks accumulated toward the current partial second *before*
                           	 * this call's charge; used further down to detect when adding
                           	 * pending crosses a whole-second boundary.
                           	 */
   1517   5788    mv143129 	ticks = (pp->p_utime + pp->p_stime) % hz;
   1518      0      stevel 	/*
   1519      0      stevel 	 * Update process times. Should use high res clock and state
   1520      0      stevel 	 * changes instead of statistical sampling method. XXX
   1521      0      stevel 	 */
   1522      0      stevel 	if (user_mode) {
   1523   5788    mv143129 		pp->p_utime += pending;
   1524      0      stevel 	} else {
   1525   5788    mv143129 		pp->p_stime += pending;
   1526      0      stevel 	}
   1527   5788    mv143129 
                           	/* p_ttime batches ticks for the task CPU-time update below. */
   1528   5788    mv143129 	pp->p_ttime += pending;
   1529      0      stevel 	as = pp->p_as;
   1530      0      stevel 
   1531      0      stevel 	/*
   1532      0      stevel 	 * Update user profiling statistics. Get the pc from the
   1533      0      stevel 	 * lwp when the AST happens.
   1534      0      stevel 	 */
   1535      0      stevel 	if (pp->p_prof.pr_scale) {
   1536   5788    mv143129 		atomic_add_32(&lwp->lwp_oweupc, (int32_t)pending);
   1537      0      stevel 		if (user_mode) {
   1538      0      stevel 			poke = 1;
   1539      0      stevel 			aston(t);
   1540      0      stevel 		}
   1541      0      stevel 	}
   1542      0      stevel 
   1543   5788    mv143129 	/*
   1544   5788    mv143129 	 * If CPU was in user state, process lwp-virtual time
   1545   5788    mv143129 	 * interval timer. The value passed to itimerdecr() has to be
   1546   5788    mv143129 	 * in microseconds and has to be less than one second. Hence
   1547   5788    mv143129 	 * this loop.
   1548   5788    mv143129 	 */
   1549   5788    mv143129 	total_usec = usec_per_tick * pending;
   1550   5788    mv143129 	while (total_usec > 0) {
   1551   5788    mv143129 		usec = MIN(total_usec, (MICROSEC - 1));
   1552   5788    mv143129 		if (user_mode &&
   1553   5788    mv143129 		    timerisset(&lwp->lwp_timer[ITIMER_VIRTUAL].it_value) &&
   1554   5788    mv143129 		    itimerdecr(&lwp->lwp_timer[ITIMER_VIRTUAL], usec) == 0) {
   1555   5788    mv143129 			poke = 1;
   1556   5788    mv143129 			sigtoproc(pp, t, SIGVTALRM);
   1557   5788    mv143129 		}
   1558   5788    mv143129 		total_usec -= usec;
   1559   5788    mv143129 	}
   1560      0      stevel 
   1561      0      stevel 	/*
   1562   5788    mv143129 	 * Process lwp-profile interval timer. Unlike the virtual
   1563      0      stevel 	 * timer above, this one is not conditional on user_mode.
   1564      0      stevel 	 */
   1565   5788    mv143129 	total_usec = usec_per_tick * pending;
   1566   5788    mv143129 	while (total_usec > 0) {
   1567   5788    mv143129 		usec = MIN(total_usec, (MICROSEC - 1));
   1568   5788    mv143129 		if (timerisset(&lwp->lwp_timer[ITIMER_PROF].it_value) &&
   1569   5788    mv143129 		    itimerdecr(&lwp->lwp_timer[ITIMER_PROF], usec) == 0) {
   1570   5788    mv143129 			poke = 1;
   1571   5788    mv143129 			sigtoproc(pp, t, SIGPROF);
   1572   5788    mv143129 		}
   1573   5788    mv143129 		total_usec -= usec;
   1574      0      stevel 	}
   1575      0      stevel 
   1576      0      stevel 	/*
   1577      0      stevel 	 * Enforce CPU resource controls:
   1578      0      stevel 	 *   (a) process.max-cpu-time resource control
   1579   5788    mv143129 	 *
   1580   5788    mv143129 	 * Perform the check only if we have accumulated more than a second.
   1581      0      stevel 	 */
   1582   5788    mv143129 	if ((ticks + pending) >= hz) {
   1583   5788    mv143129 		(void) rctl_test(rctlproc_legacy[RLIMIT_CPU], pp->p_rctls, pp,
   1584   5788    mv143129 		    (pp->p_utime + pp->p_stime)/hz, RCA_UNSAFE_SIGINFO);
   1585   5788    mv143129 	}
   1586      0      stevel 
   1587      0      stevel 	/*
   1588      0      stevel 	 *   (b) task.max-cpu-time resource control
   1589   5788    mv143129 	 *
   1590   5788    mv143129 	 * If we have accumulated enough ticks, increment the task CPU
   1591   5788    mv143129 	 * time usage and test for the resource limit. This minimizes the
   1592   5788    mv143129 	 * number of calls to rctl_test(). The task CPU time mutex
   1593   5788    mv143129 	 * is highly contentious as many processes can be sharing a task.
   1594      0      stevel 	 */
   1595   5788    mv143129 	if (pp->p_ttime >= clock_tick_proc_max) {
   1596   5788    mv143129 		secs = task_cpu_time_incr(pp->p_task, pp->p_ttime);
   1597   5788    mv143129 		pp->p_ttime = 0;
   1598   5788    mv143129 		if (secs) {
   1599   5788    mv143129 			(void) rctl_test(rc_task_cpu_time, pp->p_task->tk_rctls,
   1600   5788    mv143129 			    pp, secs, RCA_UNSAFE_SIGINFO);
   1601   5788    mv143129 		}
   1602   5788    mv143129 	}
   1603      0      stevel 
   1604      0      stevel 	/*
   1605      0      stevel 	 * Update memory usage for the currently running process.
   1606      0      stevel 	 */
                           	/* u_mem accumulates the sampled rss; u_mem_max keeps the peak. */
   1607      0      stevel 	rss = rm_asrss(as);
   1608      0      stevel 	PTOU(pp)->u_mem += rss;
   1609      0      stevel 	if (rss > PTOU(pp)->u_mem_max)
   1610      0      stevel 		PTOU(pp)->u_mem_max = rss;
   1611      0      stevel 
   1612      0      stevel 	/*
   1613      0      stevel 	 * Notify the CPU the thread is running on.
   1614      0      stevel 	 */
                           	/* Only needed when a signal/AST was posted and t runs elsewhere. */
   1615      0      stevel 	if (poke && t->t_cpu != CPU)
   1616      0      stevel 		poke_cpu(t->t_cpu->cpu_id);
   1617      0      stevel }
   1618      0      stevel 
                           /*
                            * profil_tick() - record owed profiling ticks for the current lwp.
                            *
                            * upc is the user program counter to attribute the ticks to. The
                            * number of ticks owed is taken from (and atomically cleared in)
                            * lwp_oweupc, which clock_tick() increments. Depending on pr_scale the
                            * ticks are either added to a 16-bit counter slot (pr_scale >= 2) or
                            * recorded as raw PC samples (pr_scale == 1). Any failure to update
                            * the profiling buffer turns profiling off by zeroing pr_scale.
                            */
   1619      0      stevel void
   1620      0      stevel profil_tick(uintptr_t upc)
   1621      0      stevel {
   1622      0      stevel 	int ticks;
   1623      0      stevel 	proc_t *p = ttoproc(curthread);
   1624      0      stevel 	klwp_t *lwp = ttolwp(curthread);
   1625      0      stevel 	struct prof *pr = &p->p_prof;
   1626      0      stevel 
                           	/* Atomically fetch the owed tick count and reset it to zero. */
   1627      0      stevel 	do {
   1628      0      stevel 		ticks = lwp->lwp_oweupc;
   1629      0      stevel 	} while (cas32(&lwp->lwp_oweupc, ticks, 0) != ticks);
   1630      0      stevel 
   1631      0      stevel 	mutex_enter(&p->p_pflock);
   1632      0      stevel 	if (pr->pr_scale >= 2 && upc >= pr->pr_off) {
   1633      0      stevel 		/*
   1634      0      stevel 		 * Old-style profiling
   1635      0      stevel 		 */
   1636      0      stevel 		uint16_t *slot = pr->pr_base;
   1637      0      stevel 		uint16_t old, new;
                           		/*
                           		 * A scale of exactly 2 always uses the first slot.
                           		 * Otherwise the byte offset is (delta * pr_scale) >> 16,
                           		 * computed on the high and low 16-bit halves of delta
                           		 * separately (presumably to avoid overflowing the
                           		 * intermediate product). PCs that map beyond pr_size are
                           		 * ignored.
                           		 */
   1638      0      stevel 		if (pr->pr_scale != 2) {
   1639      0      stevel 			uintptr_t delta = upc - pr->pr_off;
   1640      0      stevel 			uintptr_t byteoff = ((delta >> 16) * pr->pr_scale) +
   1641      0      stevel 			    (((delta & 0xffff) * pr->pr_scale) >> 16);
   1642      0      stevel 			if (byteoff >= (uintptr_t)pr->pr_size) {
   1643      0      stevel 				mutex_exit(&p->p_pflock);
   1644      0      stevel 				return;
   1645      0      stevel 			}
   1646      0      stevel 			slot += byteoff / sizeof (uint16_t);
   1647      0      stevel 		}
                           		/*
                           		 * Read-modify-write the counter through fuword16() and
                           		 * suword16(). Profiling is disabled if either access
                           		 * fails or the new count exceeds SHRT_MAX. Note that
                           		 * "new" is a uint16_t, so the sum is truncated to 16 bits
                           		 * before the comparison.
                           		 */
   1648      0      stevel 		if (fuword16(slot, &old) < 0 ||
   1649      0      stevel 		    (new = old + ticks) > SHRT_MAX ||
   1650      0      stevel 		    suword16(slot, new) < 0) {
   1651      0      stevel 			pr->pr_scale = 0;
   1652      0      stevel 		}
   1653      0      stevel 	} else if (pr->pr_scale == 1) {
   1654      0      stevel 		/*
   1655      0      stevel 		 * PC Sampling
   1656      0      stevel 		 */
   1657      0      stevel 		model_t model = lwp_getdatamodel(lwp);
   1658      0      stevel 		int result;
   1659      0      stevel #ifdef __lint
   1660      0      stevel 		model = model;
   1661      0      stevel #endif
                           		/*
                           		 * Store one copy of upc per owed tick, sized according to
                           		 * the lwp's data model, advancing pr_base and pr_samples
                           		 * after each successful store.
                           		 */
   1662      0      stevel 		while (ticks-- > 0) {
   1663      0      stevel 			if (pr->pr_samples == pr->pr_size) {
   1664      0      stevel 				/* buffer full, turn off sampling */
   1665      0      stevel 				pr->pr_scale = 0;
   1666      0      stevel 				break;
   1667      0      stevel 			}
   1668      0      stevel 			switch (SIZEOF_PTR(model)) {
   1669      0      stevel 			case sizeof (uint32_t):
   1670      0      stevel 				result = suword32(pr->pr_base, (uint32_t)upc);
   1671      0      stevel 				break;
   1672      0      stevel #ifdef _LP64
   1673      0      stevel 			case sizeof (uint64_t):
   1674      0      stevel 				result = suword64(pr->pr_base, (uint64_t)upc);
   1675      0      stevel 				break;
   1676      0      stevel #endif
   1677      0      stevel 			default:
   1678      0      stevel 				cmn_err(CE_WARN, "profil_tick: unexpected "
   1679      0      stevel 				    "data model");
   1680      0      stevel 				result = -1;
   1681      0      stevel 				break;
   1682      0      stevel 			}
                           			/* A failed store also turns sampling off. */
   1683      0      stevel 			if (result != 0) {
   1684      0      stevel 				pr->pr_scale = 0;
   1685      0      stevel 				break;
   1686      0      stevel 			}
   1687      0      stevel 			pr->pr_base = (caddr_t)pr->pr_base + SIZEOF_PTR(model);
   1688      0      stevel 			pr->pr_samples++;
   1689      0      stevel 		}
   1690      0      stevel 	}
   1691      0      stevel 	mutex_exit(&p->p_pflock);
   1692      0      stevel }
   1693      0      stevel 
                           /*
                            * delay_wakeup() - wake a thread blocked on its delay condition variable.
                            *
                            * arg is the kthread_t to wake. The signal is issued while holding that
                            * thread's t_delay_lock. (Presumably installed as the timeout callback
                            * by the delay code below -- confirm against delay_common().)
                            */
   1694      0      stevel static void
   1695      0      stevel delay_wakeup(void *arg)
   1696      0      stevel {
   1697  10696       David 	kthread_t	*t = arg;
   1698      0      stevel 
   1699      0      stevel 	mutex_enter(&t->t_delay_lock);
   1700      0      stevel 	cv_signal(&t->t_delay_cv);
   1701      0      stevel 	mutex_exit(&t->t_delay_lock);
   1702      0      stevel }
   1703      0      stevel 
   1704  10696       David /*
   1705  10696       David  * The delay(9F) man page indicates that it can only be called from user or
   1706  10696       David  * kernel context - detect and diagnose bad calls. The following macro will
   1707  10696       David  * produce a limited number of messages identifying bad callers.  This is done
   1708  10696       David  * in a macro so that caller() is meaningful. When a bad caller is identified,
   1709  10696       David  * switching to 'drv_usecwait(TICK_TO_USEC(ticks));' may be appropriate.
                            *
                            * The check is skipped unless delay_from_interrupt_diagnose is set and
                            * servicing_interrupt() is true, and also while panicking or when
                            * devinfo_freeze is set. delay_from_interrupt_msg serves as the count of
                            * messages still allowed: a snapshot m is taken and atomic_cas_32()
                            * attempts to replace m with m-1 (comparing against 1 when m is zero, so
                            * an exhausted count is not decremented). The warning is issued only
                            * when the value returned by atomic_cas_32() is non-zero (assumes it
                            * returns the previous contents -- see atomic_cas documentation).
                            *
                            * The local 'off' only receives the symbol offset from modgetsymname();
                            * it is not otherwise used.
   1710  10696       David  */
   1711  10696       David #define	DELAY_CONTEXT_CHECK()	{					\
   1712  10696       David 	uint32_t	m;						\
   1713  10696       David 	char		*f;						\
   1714  10696       David 	ulong_t		off;						\
   1715  10696       David 									\
   1716  10696       David 	m = delay_from_interrupt_msg;					\
   1717  10696       David 	if (delay_from_interrupt_diagnose && servicing_interrupt() &&	\
   1718  10696       David 	    !panicstr && !devinfo_freeze &&				\
   1719  10696       David 	    atomic_cas_32(&delay_from_interrupt_msg, m ? m : 1, m-1)) {	\
   1720  10696       David 		f = modgetsymname((uintptr_t)caller(), &off);		\
   1721  10696       David 		cmn_err(CE_WARN, "delay(9F) called from "		\
   1722  10696       David 		    "interrupt context: %s`%s",				\
   1723  10696       David 		    mod_containing_pc(caller()), f ? f : "...");	\
   1724  10696       David 	}								\
   1725  10696       David }
   1726  10696       David 
   1727  10696       David /*
   1728  10696       David  * delay_common: common delay code.
   1729  10696       David  */
   1730  10696       David static void
   1731  10696       David delay_common(clock_t ticks)
   1732  10696       David {
   1733  10696       David 	kthread_t	*t = curthread;
   1734  10696       David 	clock_t		deadline;
   1735  10696       David 	clock_t		timeleft;
   1736  10696       David 	callout_id_t	id;
   1737  10696       David 
   1738  10696       David 	/* If timeouts aren't running all we can do is spin. */
   1739  10696       David 	if (panicstr || devinfo_freeze) {
   1740  10696       David 		/* Convert delay(9F) call into drv_usecwait(9F) call. */
   1741  10696       David 		if (ticks > 0)
   1742  10696       David 			drv_usecwait(TICK_TO_USEC(ticks));
   1743  10696       David 		return;
   1744  10696       David 	}
   1745  10696       David 
   1746  11066      rafael 	deadline = ddi_get_lbolt() + ticks;
   1747  11066      rafael 	while ((timeleft = deadline - ddi_get_lbolt()) > 0) {
   1748  10696       David 		mutex_enter(&t->t_delay_lock);
   1749  10696       David 		id = timeout_default(delay_wakeup, t, timeleft);
   1750  10696       David 		cv_wait(&t->t_delay_cv, &t->t_delay_lock);
   1751  10696       David 		mutex_exit(&t->t_delay_lock);
   1752  10696       David 		(void) untimeout_default(id, 0);
   1753  10696       David 	}
   1754  10696       David }
   1755  10696       David 
   1756  10696       David /*
   1757  10696       David  * Delay specified number of clock ticks.
   1758  10696       David  */
   1759      0      stevel void
   1760      0      stevel delay(clock_t ticks)
   1761      0      stevel {
   1762  10696       David 	DELAY_CONTEXT_CHECK();
   1763      0      stevel 
   1764  10696       David 	delay_common(ticks);
   1765  10696       David }
   1766      0      stevel 
   1767  10696       David /*
   1768  10696       David  * Delay a random number of clock ticks between 1 and ticks.
   1769  10696       David  */
   1770  10696       David void
   1771  10696       David delay_random(clock_t ticks)
   1772  10696       David {
   1773  10696       David 	int	r;
   1774  10696       David 
   1775  10696       David 	DELAY_CONTEXT_CHECK();
   1776  10696       David 
   1777  10696       David 	(void) random_get_pseudo_bytes((void *)&r, sizeof (r));
   1778  10696       David 	if (ticks == 0)
   1779  10696       David 		ticks = 1;
   1780  10696       David 	ticks = (r % ticks) + 1;
   1781  10696       David 	delay_common(ticks);
   1782      0      stevel }
   1783      0      stevel 
   1784      0      stevel /*
   1785      0      stevel  * Like delay, but interruptible by a signal.
   1786      0      stevel  */
   1787      0      stevel int
   1788      0      stevel delay_sig(clock_t ticks)
   1789      0      stevel {
   1790  10696       David 	kthread_t	*t = curthread;
   1791  10696       David 	clock_t		deadline;
   1792  10696       David 	clock_t		rc;
   1793      0      stevel 
   1794  10696       David 	/* If timeouts aren't running all we can do is spin. */
   1795  10696       David 	if (panicstr || devinfo_freeze) {
   1796  10696       David 		if (ticks > 0)
   1797  10696       David 			drv_usecwait(TICK_TO_USEC(ticks));
   1798  10696       David 		return (0);
   1799  10696       David 	}
   1800  10696       David 
   1801  11066      rafael 	deadline = ddi_get_lbolt() + ticks;
   1802  10696       David 	mutex_enter(&t->t_delay_lock);
   1803      0      stevel 	do {
   1804  10696       David 		rc = cv_timedwait_sig(&t->t_delay_cv,
   1805  10696       David 		    &t->t_delay_lock, deadline);
   1806  10696       David 		/* loop until past deadline or signaled */
   1807      0      stevel 	} while (rc > 0);
   1808  10696       David 	mutex_exit(&t->t_delay_lock);
   1809      0      stevel 	if (rc == 0)
   1810      0      stevel 		return (EINTR);
   1811      0      stevel 	return (0);
   1812      0      stevel }
   1813  10696       David 
   1814      0      stevel 
   1815      0      stevel #define	SECONDS_PER_DAY 86400
   1816      0      stevel 
   1817      0      stevel /*
   1818      0      stevel  * Initialize the system time based on the TOD chip.  approx is used as
   1819      0      stevel  * an approximation of time (e.g. from the filesystem) in the event that
   1820      0      stevel  * the TOD chip has been cleared or is unresponsive.  An approx of -1
   1821      0      stevel  * means the filesystem doesn't keep time.
   1822      0      stevel  */
   1823      0      stevel void
   1824      0      stevel clkset(time_t approx)
   1825      0      stevel {
   1826      0      stevel 	timestruc_t ts;
   1827      0      stevel 	int spl;
   1828      0      stevel 	int set_clock = 0;
   1829      0      stevel 
   1830      0      stevel 	mutex_enter(&tod_lock);
   1831      0      stevel 	ts = tod_get();
   1832      0      stevel 
   1833      0      stevel 	if (ts.tv_sec > 365 * SECONDS_PER_DAY) {
   1834      0      stevel 		/*
   1835      0      stevel 		 * If the TOD chip is reporting some time after 1971,
   1836      0      stevel 		 * then it probably didn't lose power or become otherwise
   1837      0      stevel 		 * cleared in the recent past;  check to assure that
   1838      0      stevel 		 * the time coming from the filesystem isn't in the future
   1839      0      stevel 		 * according to the TOD chip.
   1840      0      stevel 		 */
   1841      0      stevel 		if (approx != -1 && approx > ts.tv_sec) {
   1842      0      stevel 			cmn_err(CE_WARN, "Last shutdown is later "
   1843      0      stevel 			    "than time on time-of-day chip; check date.");
   1844      0      stevel 		}
   1845      0      stevel 	} else {
   1846      0      stevel 		/*
   1847   9158  Krishnendu 		 * If the TOD chip isn't giving correct time, set it to the
   1848   9158  Krishnendu 		 * greater of i) approx and ii) 1987. That way if approx
   1849   9158  Krishnendu 		 * is negative or is earlier than 1987, we set the clock
   1850   9158  Krishnendu 		 * back to a time when Oliver North, ALF and Dire Straits
   1851   9158  Krishnendu 		 * were all on the collective brain:  1987.
   1852      0      stevel 		 */
   1853      0      stevel 		timestruc_t tmp;
   1854   9158  Krishnendu 		time_t diagnose_date = (1987 - 1970) * 365 * SECONDS_PER_DAY;
   1855   9158  Krishnendu 		ts.tv_sec = (approx > diagnose_date ? approx : diagnose_date);
   1856      0      stevel 		ts.tv_nsec = 0;
   1857      0      stevel 
   1858      0      stevel 		/*
   1859      0      stevel 		 * Attempt to write the new time to the TOD chip.  Set spl high
   1860      0      stevel 		 * to avoid getting preempted between the tod_set and tod_get.
   1861      0      stevel 		 */
   1862      0      stevel 		spl = splhi();
   1863      0      stevel 		tod_set(ts);
   1864      0      stevel 		tmp = tod_get();
   1865      0      stevel 		splx(spl);
   1866      0      stevel 
   1867      0      stevel 		if (tmp.tv_sec != ts.tv_sec && tmp.tv_sec != ts.tv_sec + 1) {
   1868      0      stevel 			tod_broken = 1;
   1869      0      stevel 			dosynctodr = 0;
   1870   9158  Krishnendu 			cmn_err(CE_WARN, "Time-of-day chip unresponsive.");
   1871      0      stevel 		} else {
   1872      0      stevel 			cmn_err(CE_WARN, "Time-of-day chip had "
   1873      0      stevel 			    "incorrect date; check and reset.");
   1874      0      stevel 		}
   1875      0      stevel 		set_clock = 1;
   1876      0      stevel 	}
   1877      0      stevel 
   1878      0      stevel 	if (!boot_time) {
   1879      0      stevel 		boot_time = ts.tv_sec;
   1880      0      stevel 		set_clock = 1;
   1881      0      stevel 	}
   1882      0      stevel 
   1883      0      stevel 	if (set_clock)
   1884      0      stevel 		set_hrestime(&ts);
   1885      0      stevel 
   1886      0      stevel 	mutex_exit(&tod_lock);
   1887      0      stevel }
   1888      0      stevel 
   1889   4123    dm120769 int	timechanged;	/* for testing if the system time has been reset */
   1890      0      stevel 
   1891      0      stevel void
   1892      0      stevel set_hrestime(timestruc_t *ts)
   1893      0      stevel {
   1894      0      stevel 	int spl = hr_clock_lock();
   1895      0      stevel 	hrestime = *ts;
   1896   4123    dm120769 	membar_enter();	/* hrestime must be visible before timechanged++ */
   1897      0      stevel 	timedelta = 0;
   1898   4123    dm120769 	timechanged++;
   1899      0      stevel 	hr_clock_unlock(spl);
   1900   8048    Madhavan 	callout_hrestime();
   1901      0      stevel }
   1902      0      stevel 
   1903      0      stevel static uint_t deadman_seconds;
   1904      0      stevel static uint32_t deadman_panics;
   1905      0      stevel static int deadman_enabled = 0;
   1906      0      stevel static int deadman_panic_timers = 1;
   1907      0      stevel 
   1908      0      stevel static void
   1909      0      stevel deadman(void)
   1910      0      stevel {
   1911      0      stevel 	if (panicstr) {
   1912      0      stevel 		/*
   1913      0      stevel 		 * During panic, other CPUs besides the panic
   1914      0      stevel 		 * master continue to handle cyclics and some other
   1915      0      stevel 		 * interrupts.  The code below is intended to be
   1916      0      stevel 		 * single threaded, so any CPU other than the master
   1917      0      stevel 		 * must keep out.
   1918      0      stevel 		 */
   1919      0      stevel 		if (CPU->cpu_id != panic_cpu.cpu_id)
   1920      0      stevel 			return;
   1921      0      stevel 
   1922      0      stevel 		if (!deadman_panic_timers)
   1923      0      stevel 			return; /* allow all timers to be manually disabled */
   1924      0      stevel 
   1925      0      stevel 		/*
   1926      0      stevel 		 * If we are generating a crash dump or syncing filesystems and
   1927      0      stevel 		 * the corresponding timer is set, decrement it and re-enter
   1928      0      stevel 		 * the panic code to abort it and advance to the next state.
   1929      0      stevel 		 * The panic states and triggers are explained in panic.c.
   1930      0      stevel 		 */
   1931      0      stevel 		if (panic_dump) {
   1932      0      stevel 			if (dump_timeleft && (--dump_timeleft == 0)) {
   1933      0      stevel 				panic("panic dump timeout");
   1934      0      stevel 				/*NOTREACHED*/
   1935      0      stevel 			}
   1936      0      stevel 		} else if (panic_sync) {
   1937      0      stevel 			if (sync_timeleft && (--sync_timeleft == 0)) {
   1938      0      stevel 				panic("panic sync timeout");
   1939      0      stevel 				/*NOTREACHED*/
   1940      0      stevel 			}
   1941      0      stevel 		}
   1942      0      stevel 
   1943      0      stevel 		return;
   1944      0      stevel 	}
   1945      0      stevel 
   1946  11066      rafael 	if (deadman_counter != CPU->cpu_deadman_counter) {
   1947  11066      rafael 		CPU->cpu_deadman_counter = deadman_counter;
   1948      0      stevel 		CPU->cpu_deadman_countdown = deadman_seconds;
   1949      0      stevel 		return;
   1950      0      stevel 	}
   1951      0      stevel 
   1952   6054    vb160487 	if (--CPU->cpu_deadman_countdown > 0)
   1953      0      stevel 		return;
   1954      0      stevel 
   1955      0      stevel 	/*
   1956      0      stevel 	 * Regardless of whether or not we actually bring the system down,
   1957      0      stevel 	 * bump the deadman_panics variable.
   1958      0      stevel 	 *
   1959      0      stevel 	 * N.B. deadman_panics is incremented once for each CPU that
   1960      0      stevel 	 * passes through here.  It's expected that all the CPUs will
   1961      0      stevel 	 * detect this condition within one second of each other, so
   1962      0      stevel 	 * when deadman_enabled is off, deadman_panics will
   1963      0      stevel 	 * typically be a multiple of the total number of CPUs in
   1964      0      stevel 	 * the system.
   1965      0      stevel 	 */
   1966      0      stevel 	atomic_add_32(&deadman_panics, 1);
   1967      0      stevel 
   1968      0      stevel 	if (!deadman_enabled) {
   1969      0      stevel 		CPU->cpu_deadman_countdown = deadman_seconds;
   1970      0      stevel 		return;
   1971      0      stevel 	}
   1972      0      stevel 
   1973      0      stevel 	/*
   1974      0      stevel 	 * If we're here, we want to bring the system down.
   1975      0      stevel 	 */
   1976      0      stevel 	panic("deadman: timed out after %d seconds of clock "
   1977      0      stevel 	    "inactivity", deadman_seconds);
   1978      0      stevel 	/*NOTREACHED*/
   1979      0      stevel }
   1980      0      stevel 
   1981      0      stevel /*ARGSUSED*/
   1982      0      stevel static void
   1983      0      stevel deadman_online(void *arg, cpu_t *cpu, cyc_handler_t *hdlr, cyc_time_t *when)
   1984      0      stevel {
   1985  11066      rafael 	cpu->cpu_deadman_counter = 0;
   1986      0      stevel 	cpu->cpu_deadman_countdown = deadman_seconds;
   1987      0      stevel 
   1988      0      stevel 	hdlr->cyh_func = (cyc_func_t)deadman;
   1989      0      stevel 	hdlr->cyh_level = CY_HIGH_LEVEL;
   1990      0      stevel 	hdlr->cyh_arg = NULL;
   1991      0      stevel 
   1992      0      stevel 	/*
   1993      0      stevel 	 * Stagger the CPUs so that they don't all run deadman() at
   1994      0      stevel 	 * the same time.  Simplest reason to do this is to make it
   1995      0      stevel 	 * more likely that only one CPU will panic in case of a
   1996      0      stevel 	 * timeout.  This is (strictly speaking) an aesthetic, not a
   1997      0      stevel 	 * technical consideration.
   1998      0      stevel 	 */
   1999      0      stevel 	when->cyt_when = cpu->cpu_id * (NANOSEC / NCPU);
   2000      0      stevel 	when->cyt_interval = NANOSEC;
   2001      0      stevel }
   2002      0      stevel 
   2003      0      stevel 
   2004      0      stevel void
   2005      0      stevel deadman_init(void)
   2006      0      stevel {
   2007      0      stevel 	cyc_omni_handler_t hdlr;
   2008      0      stevel 
   2009      0      stevel 	if (deadman_seconds == 0)
   2010      0      stevel 		deadman_seconds = snoop_interval / MICROSEC;
   2011      0      stevel 
   2012      0      stevel 	if (snooping)
   2013      0      stevel 		deadman_enabled = 1;
   2014      0      stevel 
   2015      0      stevel 	hdlr.cyo_online = deadman_online;
   2016      0      stevel 	hdlr.cyo_offline = NULL;
   2017      0      stevel 	hdlr.cyo_arg = NULL;
   2018      0      stevel 
   2019      0      stevel 	mutex_enter(&cpu_lock);
   2020      0      stevel 	deadman_cyclic = cyclic_add_omni(&hdlr);
   2021      0      stevel 	mutex_exit(&cpu_lock);
   2022      0      stevel }
   2023      0      stevel 
   2024      0      stevel /*
   2025      0      stevel  * tod_fault() is for updating tod validate mechanism state:
   2026      0      stevel  * (1) TOD_NOFAULT: for resetting the state to 'normal'.
   2027      0      stevel  *     currently used for debugging only
   2028      0      stevel  * (2) The following four cases detected by tod validate mechanism:
   2029      0      stevel  *       TOD_REVERSED: current tod value is less than previous value.
   2030      0      stevel  *       TOD_STALLED: current tod value hasn't advanced.
   2031      0      stevel  *       TOD_JUMPED: current tod value advanced too far from previous value.
   2032      0      stevel  *       TOD_RATECHANGED: the ratio between average tod delta and
   2033      0      stevel  *       average tick delta has changed.
   2034   5084     johnlev  * (3) TOD_RDONLY: when the TOD clock is not writeable e.g. because it is
   2035   5084     johnlev  *     a virtual TOD provided by a hypervisor.
   2036      0      stevel  */
   2037      0      stevel enum tod_fault_type
   2038      0      stevel tod_fault(enum tod_fault_type ftype, int off)
   2039      0      stevel {
   2040      0      stevel 	ASSERT(MUTEX_HELD(&tod_lock));
   2041      0      stevel 
   2042      0      stevel 	if (tod_faulted != ftype) {
   2043      0      stevel 		switch (ftype) {
   2044      0      stevel 		case TOD_NOFAULT:
   2045     78    ae112802 			plat_tod_fault(TOD_NOFAULT);
   2046      0      stevel 			cmn_err(CE_NOTE, "Restarted tracking "
   2047   5076      mishra 			    "Time of Day clock.");
   2048      0      stevel 			tod_faulted = ftype;
   2049      0      stevel 			break;
   2050      0      stevel 		case TOD_REVERSED:
   2051      0      stevel 		case TOD_JUMPED:
   2052      0      stevel 			if (tod_faulted == TOD_NOFAULT) {
   2053     78    ae112802 				plat_tod_fault(ftype);
   2054      0      stevel 				cmn_err(CE_WARN, "Time of Day clock error: "
   2055      0      stevel 				    "reason [%s by 0x%x]. -- "
   2056      0      stevel 				    " Stopped tracking Time Of Day clock.",
   2057      0      stevel 				    tod_fault_table[ftype], off);
   2058      0      stevel 				tod_faulted = ftype;
   2059      0      stevel 			}
   2060      0      stevel 			break;
   2061      0      stevel 		case TOD_STALLED:
   2062      0      stevel 		case TOD_RATECHANGED:
   2063      0      stevel 			if (tod_faulted == TOD_NOFAULT) {
   2064     78    ae112802 				plat_tod_fault(ftype);
   2065      0      stevel 				cmn_err(CE_WARN, "Time of Day clock error: "
   2066      0      stevel 				    "reason [%s]. -- "
   2067      0      stevel 				    " Stopped tracking Time Of Day clock.",
   2068      0      stevel 				    tod_fault_table[ftype]);
   2069   5084     johnlev 				tod_faulted = ftype;
   2070   5084     johnlev 			}
   2071   5084     johnlev 			break;
   2072   5084     johnlev 		case TOD_RDONLY:
   2073   5084     johnlev 			if (tod_faulted == TOD_NOFAULT) {
   2074   5084     johnlev 				plat_tod_fault(ftype);
   2075   5084     johnlev 				cmn_err(CE_NOTE, "!Time of Day clock is "
   2076   5084     johnlev 				    "Read-Only; set of Date/Time will not "
   2077   5084     johnlev 				    "persist across reboot.");
   2078      0      stevel 				tod_faulted = ftype;
   2079      0      stevel 			}
   2080      0      stevel 			break;
   2081      0      stevel 		default:
   2082      0      stevel 			break;
   2083      0      stevel 		}
   2084      0      stevel 	}
   2085      0      stevel 	return (tod_faulted);
   2086      0      stevel }
   2087      0      stevel 
   2088      0      stevel void
   2089      0      stevel tod_fault_reset()
   2090      0      stevel {
   2091      0      stevel 	tod_fault_reset_flag = 1;
   2092      0      stevel }
   2093      0      stevel 
   2094      0      stevel 
   2095      0      stevel /*
   2096      0      stevel  * tod_validate() is used for checking values returned by tod_get().
   2097      0      stevel  * Four error cases can be detected by this routine:
   2098      0      stevel  *   TOD_REVERSED: current tod value is less than previous.
   2099      0      stevel  *   TOD_STALLED: current tod value hasn't advanced.
   2100      0      stevel  *   TOD_JUMPED: current tod value advanced too far from previous value.
   2101      0      stevel  *   TOD_RATECHANGED: the ratio between average tod delta and
   2102      0      stevel  *   average tick delta has changed.
   2103      0      stevel  */
   2104      0      stevel time_t
   2105      0      stevel tod_validate(time_t tod)
   2106      0      stevel {
   2107      0      stevel 	time_t diff_tod;
   2108      0      stevel 	hrtime_t diff_tick;
   2109      0      stevel 
   2110      0      stevel 	long dtick;
   2111      0      stevel 	int dtick_delta;
   2112      0      stevel 
   2113      0      stevel 	int off = 0;
   2114      0      stevel 	enum tod_fault_type tod_bad = TOD_NOFAULT;
   2115      0      stevel 
   2116      0      stevel 	static int firsttime = 1;
   2117      0      stevel 
   2118      0      stevel 	static time_t prev_tod = 0;
   2119      0      stevel 	static hrtime_t prev_tick = 0;
   2120      0      stevel 	static long dtick_avg = TOD_REF_FREQ;
   2121      0      stevel 
   2122      0      stevel 	hrtime_t tick = gethrtime();
   2123      0      stevel 
   2124      0      stevel 	ASSERT(MUTEX_HELD(&tod_lock));
   2125      0      stevel 
   2126      0      stevel 	/*
   2127      0      stevel 	 * tod_validate_enable is patchable via /etc/system.
   2128    950       sethg 	 * If TOD is already faulted, or if TOD validation is deferred,
   2129    950       sethg 	 * there is nothing to do.
   2130      0      stevel 	 */
   2131    950       sethg 	if ((tod_validate_enable == 0) || (tod_faulted != TOD_NOFAULT) ||
   2132    950       sethg 	    tod_validate_deferred) {
   2133      0      stevel 		return (tod);
   2134      0      stevel 	}
   2135      0      stevel 
   2136      0      stevel 	/*
   2137      0      stevel 	 * Update prev_tod and prev_tick values for first run
   2138      0      stevel 	 */
   2139      0      stevel 	if (firsttime) {
   2140      0      stevel 		firsttime = 0;
   2141      0      stevel 		prev_tod = tod;
   2142      0      stevel 		prev_tick = tick;
   2143      0      stevel 		return (tod);
   2144      0      stevel 	}
   2145      0      stevel 
   2146      0      stevel 	/*
   2147      0      stevel 	 * For either of these conditions, we need to reset ourself
   2148      0      stevel 	 * and start validation from zero since each condition
   2149      0      stevel 	 * indicates that the TOD will be updated with new value
   2150      0      stevel 	 * Also, note that tod_needsync will be reset in clock()
   2151      0      stevel 	 */
   2152      0      stevel 	if (tod_needsync || tod_fault_reset_flag) {
   2153      0      stevel 		firsttime = 1;
   2154      0      stevel 		prev_tod = 0;
   2155      0      stevel 		prev_tick = 0;
   2156      0      stevel 		dtick_avg = TOD_REF_FREQ;
   2157      0      stevel 
   2158      0      stevel 		if (tod_fault_reset_flag)
   2159      0      stevel 			tod_fault_reset_flag = 0;
   2160      0      stevel 
   2161      0      stevel 		return (tod);
   2162      0      stevel 	}
   2163      0      stevel 
   2164      0      stevel 	/* test hook */
   2165      0      stevel 	switch (tod_unit_test) {
   2166      0      stevel 	case 1: /* for testing jumping tod */
   2167      0      stevel 		tod += tod_test_injector;
   2168      0      stevel 		tod_unit_test = 0;
   2169      0      stevel 		break;
   2170      0      stevel 	case 2:	/* for testing stuck tod bit */
   2171      0      stevel 		tod |= 1 << tod_test_injector;
   2172      0      stevel 		tod_unit_test = 0;
   2173      0      stevel 		break;
   2174      0      stevel 	case 3:	/* for testing stalled tod */
   2175      0      stevel 		tod = prev_tod;
   2176      0      stevel 		tod_unit_test = 0;
   2177      0      stevel 		break;
   2178      0      stevel 	case 4:	/* reset tod fault status */
   2179      0      stevel 		(void) tod_fault(TOD_NOFAULT, 0);
   2180      0      stevel 		tod_unit_test = 0;
   2181      0      stevel 		break;
   2182      0      stevel 	default:
   2183      0      stevel 		break;
   2184      0      stevel 	}
   2185      0      stevel 
   2186      0      stevel 	diff_tod = tod - prev_tod;
   2187      0      stevel 	diff_tick = tick - prev_tick;
   2188      0      stevel 
   2189      0      stevel 	ASSERT(diff_tick >= 0);
   2190      0      stevel 
   2191      0      stevel 	if (diff_tod < 0) {
   2192      0      stevel 		/* ERROR - tod reversed */
   2193      0      stevel 		tod_bad = TOD_REVERSED;
   2194      0      stevel 		off = (int)(prev_tod - tod);
   2195      0      stevel 	} else if (diff_tod == 0) {
   2196      0      stevel 		/* tod did not advance */
   2197      0      stevel 		if (diff_tick > TOD_STALL_THRESHOLD) {
   2198      0      stevel 			/* ERROR - tod stalled */
   2199      0      stevel 			tod_bad = TOD_STALLED;
   2200      0      stevel 		} else {
   2201      0      stevel 			/*
   2202      0      stevel 			 * Make sure we don't update prev_tick
   2203      0      stevel 			 * so that diff_tick is calculated since
   2204      0      stevel 			 * the first diff_tod == 0
   2205      0      stevel 			 */
   2206      0      stevel 			return (tod);
   2207      0      stevel 		}
   2208      0      stevel 	} else {
   2209      0      stevel 		/* calculate dtick */
   2210      0      stevel 		dtick = diff_tick / diff_tod;
   2211      0      stevel 
   2212      0      stevel 		/* update dtick averages */
   2213      0      stevel 		dtick_avg += ((dtick - dtick_avg) / TOD_FILTER_N);
   2214      0      stevel 
   2215      0      stevel 		/*
   2216      0      stevel 		 * Calculate dtick_delta as
   2217      0      stevel 		 * variation from reference freq in quartiles
   2218      0      stevel 		 */
   2219      0      stevel 		dtick_delta = (dtick_avg - TOD_REF_FREQ) /
   2220   5076      mishra 		    (TOD_REF_FREQ >> 2);
   2221      0      stevel 
   2222      0      stevel 		/*
   2223      0      stevel 		 * Even with a perfectly functioning TOD device,
   2224      0      stevel 		 * when the number of elapsed seconds is low the
   2225      0      stevel 		 * algorithm can calculate a rate that is beyond
   2226      0      stevel 		 * tolerance, causing an error.  The algorithm is
   2227      0      stevel 		 * inaccurate when elapsed time is low (less than
   2228      0      stevel 		 * 5 seconds).
   2229      0      stevel 		 */
   2230      0      stevel 		if (diff_tod > 4) {
   2231      0      stevel 			if (dtick < TOD_JUMP_THRESHOLD) {
   2232      0      stevel 				/* ERROR - tod jumped */
   2233      0      stevel 				tod_bad = TOD_JUMPED;
   2234      0      stevel 				off = (int)diff_tod;
   2235      0      stevel 			} else if (dtick_delta) {
   2236      0      stevel 				/* ERROR - change in clock rate */
   2237      0      stevel 				tod_bad = TOD_RATECHANGED;
   2238      0      stevel 			}
   2239      0      stevel 		}
   2240      0      stevel 	}
   2241      0      stevel 
   2242      0      stevel 	if (tod_bad != TOD_NOFAULT) {
   2243      0      stevel 		(void) tod_fault(tod_bad, off);
   2244      0      stevel 
   2245      0      stevel 		/*
   2246      0      stevel 		 * Disable dosynctodr since we are going to fault
   2247      0      stevel 		 * the TOD chip anyway here
   2248      0      stevel 		 */
   2249      0      stevel 		dosynctodr = 0;
   2250      0      stevel 
   2251      0      stevel 		/*
   2252      0      stevel 		 * Set tod to the correct value from hrestime
   2253      0      stevel 		 */
   2254      0      stevel 		tod = hrestime.tv_sec;
   2255      0      stevel 	}
   2256      0      stevel 
   2257      0      stevel 	prev_tod = tod;
   2258      0      stevel 	prev_tick = tick;
   2259      0      stevel 	return (tod);
   2260      0      stevel }
   2261      0      stevel 
   2262      0      stevel static void
   2263      0      stevel calcloadavg(int nrun, uint64_t *hp_ave)
   2264      0      stevel {
   2265      0      stevel 	static int64_t f[3] = { 135, 27, 9 };
   2266      0      stevel 	uint_t i;
   2267      0      stevel 	int64_t q, r;
   2268      0      stevel 
   2269      0      stevel 	/*
   2270      0      stevel 	 * Compute load average over the last 1, 5, and 15 minutes
   2271      0      stevel 	 * (60, 300, and 900 seconds).  The constants in f[3] are for
   2272      0      stevel 	 * exponential decay:
   2273      0      stevel 	 * (1 - exp(-1/60)) << 13 = 135,
   2274      0      stevel 	 * (1 - exp(-1/300)) << 13 = 27,
   2275      0      stevel 	 * (1 - exp(-1/900)) << 13 = 9.
   2276      0      stevel 	 */
   2277      0      stevel 
   2278      0      stevel 	/*
   2279      0      stevel 	 * a little hoop-jumping to avoid integer overflow
   2280      0      stevel 	 */
   2281      0      stevel 	for (i = 0; i < 3; i++) {
   2282      0      stevel 		q = (hp_ave[i]  >> 16) << 7;
   2283      0      stevel 		r = (hp_ave[i]  & 0xffff) << 7;
   2284      0      stevel 		hp_ave[i] += ((nrun - q) * f[i] - ((r * f[i]) >> 16)) >> 4;
   2285      0      stevel 	}
   2286      0      stevel }
   2287  11066      rafael 
/*
 * lbolt_hybrid() is used by ddi_get_lbolt() and ddi_get_lbolt64() to
 * calculate the value of lbolt according to the current mode. In the event
 * driven mode (the default), lbolt is calculated by dividing the current hires
 * time by the number of nanoseconds per clock tick. In the cyclic driven mode
 * an internal variable is incremented at each firing of the lbolt cyclic
 * and returned by lbolt_cyclic_driven().
 *
 * The system will transition from event to cyclic driven mode when the number
 * of calls to lbolt_event_driven() exceeds the (per CPU) threshold within a
 * window of time. It does so by reprogramming lbolt_cyclic from CY_INFINITY to
 * nsec_per_tick. The lbolt cyclic will remain ON while at least one CPU is
 * causing enough activity to cross the thresholds.
 */
   2302  11066      rafael static int64_t
   2303  11066      rafael lbolt_bootstrap(void)
   2304  11066      rafael {
   2305  11066      rafael 	return (0);
   2306  11066      rafael }
   2307  11066      rafael 
   2308  11066      rafael /* ARGSUSED */
   2309  11066      rafael uint_t
   2310  11066      rafael lbolt_ev_to_cyclic(caddr_t arg1, caddr_t arg2)
   2311  11066      rafael {
   2312  11066      rafael 	hrtime_t ts, exp;
   2313  11066      rafael 	int ret;
   2314  11066      rafael 
   2315  11066      rafael 	ASSERT(lbolt_hybrid != lbolt_cyclic_driven);
   2316  11066      rafael 
   2317  11066      rafael 	kpreempt_disable();
   2318  11066      rafael 
   2319  11066      rafael 	ts = gethrtime();
   2320  11066      rafael 	lb_info->lbi_internal = (ts/nsec_per_tick);
   2321  11066      rafael 
   2322  11066      rafael 	/*
   2323  11066      rafael 	 * Align the next expiration to a clock tick boundary.
   2324  11066      rafael 	 */
   2325  11066      rafael 	exp = ts + nsec_per_tick - 1;
   2326  11066      rafael 	exp = (exp/nsec_per_tick) * nsec_per_tick;
   2327  11066      rafael 
   2328  11151      rafael 	ret = cyclic_reprogram(lb_info->id.lbi_cyclic_id, exp);
   2329  11066      rafael 	ASSERT(ret);
   2330  11066      rafael 
   2331  11066      rafael 	lbolt_hybrid = lbolt_cyclic_driven;
   2332  11066      rafael 	lb_info->lbi_cyc_deactivate = B_FALSE;
   2333  11066      rafael 	lb_info->lbi_cyc_deac_start = lb_info->lbi_internal;
   2334  11066      rafael 
   2335  11066      rafael 	kpreempt_enable();
   2336  11066      rafael 
   2337  11066      rafael 	ret = atomic_dec_32_nv(&lb_info->lbi_token);
   2338  11066      rafael 	ASSERT(ret == 0);
   2339  11066      rafael 
   2340  11066      rafael 	return (1);
   2341  11066      rafael }
   2342  11066      rafael 
   2343  11066      rafael int64_t
   2344  11066      rafael lbolt_event_driven(void)
   2345  11066      rafael {
   2346  11066      rafael 	hrtime_t ts;
   2347  11066      rafael 	int64_t lb;
   2348  11066      rafael 	int ret, cpu = CPU->cpu_seqid;
   2349  11066      rafael 
   2350  11066      rafael 	ts = gethrtime();
   2351  11066      rafael 	ASSERT(ts > 0);
   2352  11066      rafael 
   2353  11066      rafael 	ASSERT(nsec_per_tick > 0);
   2354  11066      rafael 	lb = (ts/nsec_per_tick);
   2355  11066      rafael 
   2356  11066      rafael 	/*
   2357  11066      rafael 	 * Switch to cyclic mode if the number of calls to this routine
   2358  11066      rafael 	 * has reached the threshold within the interval.
   2359  11066      rafael 	 */
   2360  11066      rafael 	if ((lb - lb_cpu[cpu].lbc_cnt_start) < lb_info->lbi_thresh_interval) {
   2361  11066      rafael 
   2362  11066      rafael 		if (--lb_cpu[cpu].lbc_counter == 0) {
   2363  11066      rafael 			/*
   2364  11066      rafael 			 * Reached the threshold within the interval, reset
   2365  11066      rafael 			 * the usage statistics.
   2366  11066      rafael 			 */
   2367  11066      rafael 			lb_cpu[cpu].lbc_counter = lb_info->lbi_thresh_calls;
   2368  11066      rafael 			lb_cpu[cpu].lbc_cnt_start = lb;
   2369  11066      rafael 
   2370  11066      rafael 			/*
   2371  11066      rafael 			 * Make sure only one thread reprograms the
   2372  11066      rafael 			 * lbolt cyclic and changes the mode.
   2373  11066      rafael 			 */
   2374  11066      rafael 			if (panicstr == NULL &&
   2375  11066      rafael 			    atomic_cas_32(&lb_info->lbi_token, 0, 1) == 0) {
   2376  11066      rafael 
   2377  11066      rafael 				if (lbolt_hybrid == lbolt_cyclic_driven) {
   2378  11066      rafael 					ret = atomic_dec_32_nv(
   2379  11066      rafael 					    &lb_info->lbi_token);
   2380  11066      rafael 					ASSERT(ret == 0);
   2381  11066      rafael 					return (lb);
   2382  11066      rafael 				}
   2383  11066      rafael 
   2384  11066      rafael 				lbolt_softint_post();
   2385  11066      rafael 			}
   2386  11066      rafael 		}
   2387  11066      rafael 	} else {
   2388  11066      rafael 		/*
   2389  11066      rafael 		 * Exceeded the interval, reset the usage statistics.
   2390  11066      rafael 		 */
   2391  11066      rafael 		lb_cpu[cpu].lbc_counter = lb_info->lbi_thresh_calls;
   2392  11066      rafael 		lb_cpu[cpu].lbc_cnt_start = lb;
   2393  11066      rafael 	}
   2394  11066      rafael 
   2395  11066      rafael 	ASSERT(lb >= lb_info->lbi_debug_time);
   2396  11066      rafael 
   2397  11066      rafael 	return (lb - lb_info->lbi_debug_time);
   2398  11066      rafael }
   2399  11066      rafael 
   2400  11066      rafael int64_t
   2401  11066      rafael lbolt_cyclic_driven(void)
   2402  11066      rafael {
   2403  11066      rafael 	int64_t lb = lb_info->lbi_internal;
   2404  11066      rafael 	int cpu = CPU->cpu_seqid;
   2405  11066      rafael 
   2406  11066      rafael 	if ((lb - lb_cpu[cpu].lbc_cnt_start) < lb_info->lbi_thresh_interval) {
   2407  11066      rafael 
   2408  11066      rafael 		if (lb_cpu[cpu].lbc_counter == 0)
   2409  11066      rafael 			/*
   2410  11066      rafael 			 * Reached the threshold within the interval,
   2411  11066      rafael 			 * prevent the lbolt cyclic from turning itself
   2412  11066      rafael 			 * off.
   2413  11066      rafael 			 */
   2414  11066      rafael 			lb_info->lbi_cyc_deactivate = B_FALSE;
   2415  11066      rafael 		else
   2416  11066      rafael 			lb_cpu[cpu].lbc_counter--;
   2417  11066      rafael 	} else {
   2418  11066      rafael 		/*
   2419  11066      rafael 		 * Only reset the usage statistics when the interval has
   2420  11066      rafael 		 * exceeded.
   2421  11066      rafael 		 */
   2422  11066      rafael 		lb_cpu[cpu].lbc_counter = lb_info->lbi_thresh_calls;
   2423  11066      rafael 		lb_cpu[cpu].lbc_cnt_start = lb;
   2424  11066      rafael 	}
   2425  11066      rafael 
   2426  11066      rafael 	ASSERT(lb >= lb_info->lbi_debug_time);
   2427  11066      rafael 
   2428  11066      rafael 	return (lb - lb_info->lbi_debug_time);
   2429  11066      rafael }
   2430  11066      rafael 
   2431  11066      rafael /*
   2432  11066      rafael  * The lbolt_cyclic() routine will fire at a nsec_per_tick rate to satisfy
   2433  11066      rafael  * performance needs of ddi_get_lbolt() and ddi_get_lbolt64() consumers.
   2434  11066      rafael  * It is inactive by default, and will be activated when switching from event
   2435  11066      rafael  * to cyclic driven lbolt. The cyclic will turn itself off unless signaled
   2436  11066      rafael  * by lbolt_cyclic_driven().
   2437  11066      rafael  */
   2438  11066      rafael static void
   2439  11066      rafael lbolt_cyclic(void)
   2440  11066      rafael {
   2441  11066      rafael 	int ret;
   2442  11066      rafael 
   2443  11066      rafael 	lb_info->lbi_internal++;
   2444  11066      rafael 
   2445  11066      rafael 	if (!lbolt_cyc_only) {
   2446  11066      rafael 
   2447  11066      rafael 		if (lb_info->lbi_cyc_deactivate) {
   2448  11066      rafael 			/*
   2449  11066      rafael 			 * Switching from cyclic to event driven mode.
   2450  11066      rafael 			 */
   2451  11066      rafael 			if (atomic_cas_32(&lb_info->lbi_token, 0, 1) == 0) {
   2452  11066      rafael 
   2453  11066      rafael 				if (lbolt_hybrid == lbolt_event_driven) {
   2454  11066      rafael 					ret = atomic_dec_32_nv(
   2455  11066      rafael 					    &lb_info->lbi_token);
   2456  11066      rafael 					ASSERT(ret == 0);
   2457  11066      rafael 					return;
   2458  11066      rafael 				}
   2459  11066      rafael 
   2460  11066      rafael 				kpreempt_disable();
   2461  11066      rafael 
   2462  11066      rafael 				lbolt_hybrid = lbolt_event_driven;
   2463  11151      rafael 				ret = cyclic_reprogram(
   2464  11151      rafael 				    lb_info->id.lbi_cyclic_id,
   2465  11066      rafael 				    CY_INFINITY);
   2466  11066      rafael 				ASSERT(ret);
   2467  11066      rafael 
   2468  11066      rafael 				kpreempt_enable();
   2469  11066      rafael 
   2470  11066      rafael 				ret = atomic_dec_32_nv(&lb_info->lbi_token);
   2471  11066      rafael 				ASSERT(ret == 0);
   2472  11066      rafael 			}
   2473  11066      rafael 		}
   2474  11066      rafael 
   2475  11066      rafael 		/*
   2476  11066      rafael 		 * The lbolt cyclic should not try to deactivate itself before
   2477  11066      rafael 		 * the sampling period has elapsed.
   2478  11066      rafael 		 */
   2479  11066      rafael 		if (lb_info->lbi_internal - lb_info->lbi_cyc_deac_start >=
   2480  11066      rafael 		    lb_info->lbi_thresh_interval) {
   2481  11066      rafael 			lb_info->lbi_cyc_deactivate = B_TRUE;
   2482  11066      rafael 			lb_info->lbi_cyc_deac_start = lb_info->lbi_internal;
   2483  11066      rafael 		}
   2484  11066      rafael 	}
   2485  11066      rafael }
   2486  11066      rafael 
   2487  11066      rafael /*
   2488  11066      rafael  * Since the lbolt service was historically cyclic driven, it must be 'stopped'
   2489  11066      rafael  * when the system drops into the kernel debugger. lbolt_debug_entry() is
   2490  11066      rafael  * called by the KDI system claim callbacks to record a hires timestamp at
   2491  11066      rafael  * debug enter time. lbolt_debug_return() is called by the sistem release
   2492  11066      rafael  * callbacks to account for the time spent in the debugger. The value is then
   2493  11066      rafael  * accumulated in the lb_info structure and used by lbolt_event_driven() and
   2494  11066      rafael  * lbolt_cyclic_driven(), as well as the mdb_get_lbolt() routine.
   2495  11066      rafael  */
   2496  11066      rafael void
   2497  11066      rafael lbolt_debug_entry(void)
   2498  11066      rafael {
   2499  11066      rafael 	lb_info->lbi_debug_ts = gethrtime();
   2500  11066      rafael }
   2501  11066      rafael 
   2502  11151      rafael /*
   2503  11151      rafael  * Calculate the time spent in the debugger and add it to the lbolt info
   2504  11151      rafael  * structure. We also update the internal lbolt value in case we were in
   2505  11151      rafael  * cyclic driven mode going in.
   2506  11151      rafael  */
   2507  11066      rafael void
   2508  11066      rafael lbolt_debug_return(void)
   2509  11066      rafael {
   2510  11151      rafael 	hrtime_t ts;
   2511  11151      rafael 
   2512  11151      rafael 	if (nsec_per_tick > 0) {
   2513  11151      rafael 		ts = gethrtime();
   2514  11151      rafael 
   2515  11151      rafael 		lb_info->lbi_internal = (ts/nsec_per_tick);
   2516  11066      rafael 		lb_info->lbi_debug_time +=
   2517  11151      rafael 		    ((ts - lb_info->lbi_debug_ts)/nsec_per_tick);
   2518  11151      rafael 	}
   2519  11066      rafael 
   2520  11066      rafael 	lb_info->lbi_debug_ts = 0;
   2521  11066      rafael }
   2522