Home | History | Annotate | Download | only in disp
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 #include <sys/types.h>
     28 #include <sys/param.h>
     29 #include <sys/sysmacros.h>
     30 #include <sys/cred.h>
     31 #include <sys/proc.h>
     32 #include <sys/strsubr.h>
     33 #include <sys/priocntl.h>
     34 #include <sys/class.h>
     35 #include <sys/disp.h>
     36 #include <sys/procset.h>
     37 #include <sys/debug.h>
     38 #include <sys/kmem.h>
     39 #include <sys/errno.h>
     40 #include <sys/systm.h>
     41 #include <sys/schedctl.h>
     42 #include <sys/vmsystm.h>
     43 #include <sys/atomic.h>
     44 #include <sys/project.h>
     45 #include <sys/modctl.h>
     46 #include <sys/fss.h>
     47 #include <sys/fsspriocntl.h>
     48 #include <sys/cpupart.h>
     49 #include <sys/zone.h>
     50 #include <vm/rm.h>
     51 #include <vm/seg_kmem.h>
     52 #include <sys/tnf_probe.h>
     53 #include <sys/policy.h>
     54 #include <sys/sdt.h>
     55 #include <sys/cpucaps.h>
     56 
     57 /*
     58  * FSS Data Structures:
     59  *
     60  *                 fsszone
     61  *                  -----           -----
     62  *  -----          |     |         |     |
     63  * |     |-------->|     |<------->|     |<---->...
     64  * |     |          -----           -----
     65  * |     |          ^    ^            ^
     66  * |     |---       |     \            \
     67  *  -----    |      |      \            \
     68  * fsspset   |      |       \            \
     69  *           |      |        \            \
     70  *           |    -----       -----       -----
     71  *            -->|     |<--->|     |<--->|     |
     72  *               |     |     |     |     |     |
     73  *                -----       -----       -----
     74  *               fssproj
     75  *
     76  *
     77  * That is, fsspsets contain a list of fsszone's that are currently active in
     78  * the pset, and a list of fssproj's, corresponding to projects with runnable
     79  * threads on the pset.  fssproj's in turn point to the fsszone which they
     80  * are a member of.
     81  *
     82  * An fssproj_t is removed when there are no threads in it.
     83  *
     84  * An fsszone_t is removed when there are no projects with threads in it.
     85  *
     86  * Projects in a zone compete with each other for cpu time, receiving cpu
     87  * allocation within a zone proportional to fssproj->fssp_shares
     88  * (project.cpu-shares); at a higher level zones compete with each other,
     89  * receiving allocation in a pset proportional to fsszone->fssz_shares
     90  * (zone.cpu-shares).  See fss_decay_usage() for the precise formula.
     91  */
     92 
/* Class initialization entry point; defined later in this file. */
static pri_t fss_init(id_t, int, classfuncs_t **);

/*
 * Scheduling class descriptor: the class name, the class init routine, and
 * a third field that is left as 0 here.
 */
static struct sclass fss = {
	"FSS",
	fss_init,
	0
};

extern struct mod_ops mod_schedops;

/*
 * Module linkage information for the kernel.  modlsched ties the
 * scheduling-class module operations and a description string to the fss
 * class descriptor; modlinkage wraps modlsched and is what _init() and
 * _info() pass to mod_install() and mod_info().
 */
static struct modlsched modlsched = {
	&mod_schedops, "fair share scheduling class", &fss
};

static struct modlinkage modlinkage = {
	MODREV_1, (void *)&modlsched, NULL
};
    113 
/* Initial value of fss_maxupri (the maximum FSS user priority). */
#define	FSS_MAXUPRI	60

/*
 * The fssproc_t structures are kept in an array of circular doubly linked
 * lists.  A hash on the thread pointer is used to determine which list each
 * thread should be placed in.  Each list has a dummy "head" which is never
 * removed, so the list is never empty.  fss_update traverses these lists to
 * update the priorities of threads that have been waiting on the run queue.
 *
 * FSS_LIST_HASH(t) discards the low 9 bits of the pointer and masks the
 * result down to a list index; FSS_LIST_NEXT(i) yields the index after i,
 * wrapping from the last list back to 0.  Both rely on FSS_LISTS being a
 * power of 2.
 */
#define	FSS_LISTS		16 /* number of lists, must be power of 2 */
#define	FSS_LIST_HASH(t)	(((uintptr_t)(t) >> 9) & (FSS_LISTS - 1))
#define	FSS_LIST_NEXT(i)	(((i) + 1) & (FSS_LISTS - 1))
    126 
    127 #define	FSS_LIST_INSERT(fssproc)				\
    128 {								\
    129 	int index = FSS_LIST_HASH(fssproc->fss_tp);		\
    130 	kmutex_t *lockp = &fss_listlock[index];			\
    131 	fssproc_t *headp = &fss_listhead[index];		\
    132 	mutex_enter(lockp);					\
    133 	fssproc->fss_next = headp->fss_next;			\
    134 	fssproc->fss_prev = headp;				\
    135 	headp->fss_next->fss_prev = fssproc;			\
    136 	headp->fss_next = fssproc;				\
    137 	mutex_exit(lockp);					\
    138 }
    139 
    140 #define	FSS_LIST_DELETE(fssproc)				\
    141 {								\
    142 	int index = FSS_LIST_HASH(fssproc->fss_tp);		\
    143 	kmutex_t *lockp = &fss_listlock[index];			\
    144 	mutex_enter(lockp);					\
    145 	fssproc->fss_prev->fss_next = fssproc->fss_next;	\
    146 	fssproc->fss_next->fss_prev = fssproc->fss_prev;	\
    147 	mutex_exit(lockp);					\
    148 }
    149 
#define	FSS_TICK_COST	1000	/* tick cost for threads with nice level = 0 */

/*
 * Decay rate percentages are based on n/128 rather than n/100 so that
 * calculations can avoid having to do an integer divide by 100 (divide
 * by FSS_DECAY_BASE == 128 optimizes to an arithmetic shift).
 *
 * FSS_DECAY_MIN	=  83/128 ~= 65%
 * FSS_DECAY_MAX	= 108/128 ~= 85%
 * FSS_DECAY_USG	=  96/128  = 75%
 */
#define	FSS_DECAY_MIN	83	/* fsspri decay pct for threads w/ nice -20 */
#define	FSS_DECAY_MAX	108	/* fsspri decay pct for threads w/ nice +19 */
#define	FSS_DECAY_USG	96	/* fssusage decay pct for projects */
#define	FSS_DECAY_BASE	128	/* base for decay percentages above */

/*
 * Bounds of the nice-based index used for the fss_nice_tick[] and
 * fss_nice_decay[] tables; FSS_NICE_RANGE is the number of entries in each.
 */
#define	FSS_NICE_MIN	0
#define	FSS_NICE_MAX	(2 * NZERO - 1)
#define	FSS_NICE_RANGE	(FSS_NICE_MAX - FSS_NICE_MIN + 1)
    169 
/* Per-nice-level tick cost and fsspri decay tables; filled in by fss_init(). */
static int	fss_nice_tick[FSS_NICE_RANGE];
static int	fss_nice_decay[FSS_NICE_RANGE];

/* Priority bounds; all but fss_maxupri are assigned in fss_init(). */
static pri_t	fss_maxupri = FSS_MAXUPRI; /* maximum FSS user priority */
static pri_t	fss_maxumdpri; /* maximum user mode fss priority */
static pri_t	fss_maxglobpri;	/* maximum global priority used by fss class */
static pri_t	fss_minglobpri;	/* minimum global priority */

/* Dummy list heads and per-list locks for the fssproc hash lists. */
static fssproc_t fss_listhead[FSS_LISTS];
static kmutex_t	fss_listlock[FSS_LISTS];

/* Array of max_ncpus fsspset_t slots, allocated in fss_init(). */
static fsspset_t *fsspsets;
static kmutex_t fsspsets_lock;	/* protects fsspsets */

/* Class id handed to fss_init(). */
static id_t	fss_cid;

static time_t	fss_minrun = 2;	/* t_pri becomes 59 within 2 secs */
static time_t	fss_minslp = 2;	/* min time on sleep queue for hardswap */
/* NOTE(review): presumably the time quantum in ticks; confirm at use sites. */
static int	fss_quantum = 11;
    189 
/* Internal helpers. */
static void	fss_newpri(fssproc_t *);
static void	fss_update(void *);
static int	fss_update_list(int);
static void	fss_change_priority(kthread_t *, fssproc_t *);

/* Class-level operations; these fill the first part of fss_classfuncs. */
static int	fss_admin(caddr_t, cred_t *);
static int	fss_getclinfo(void *);
static int	fss_parmsin(void *);
static int	fss_parmsout(void *, pc_vaparms_t *);
static int	fss_vaparmsin(void *, pc_vaparms_t *);
static int	fss_vaparmsout(void *, pc_vaparms_t *);
static int	fss_getclpri(pcpri_t *);
static int	fss_alloc(void **, int);
static void	fss_free(void *);

/* Per-thread operations; these fill the rest of fss_classfuncs. */
static int	fss_enterclass(kthread_t *, id_t, void *, cred_t *, void *);
static void	fss_exitclass(void *);
static int	fss_canexit(kthread_t *, cred_t *);
static int	fss_fork(kthread_t *, kthread_t *, void *);
static void	fss_forkret(kthread_t *, kthread_t *);
static void	fss_parmsget(kthread_t *, void *);
static int	fss_parmsset(kthread_t *, void *, id_t, cred_t *);
static void	fss_stop(kthread_t *, int, int);
static void	fss_exit(kthread_t *);
static void	fss_active(kthread_t *);
static void	fss_inactive(kthread_t *);
static pri_t	fss_swapin(kthread_t *, int);
static pri_t	fss_swapout(kthread_t *, int);
static void	fss_trapret(kthread_t *);
static void	fss_preempt(kthread_t *);
static void	fss_setrun(kthread_t *);
static void	fss_sleep(kthread_t *);
static void	fss_tick(kthread_t *);
static void	fss_wakeup(kthread_t *);
static int	fss_donice(kthread_t *, cred_t *, int, int *);
static int	fss_doprio(kthread_t *, cred_t *, int, int *);
static pri_t	fss_globpri(kthread_t *);
static void	fss_yield(kthread_t *);
/* Placeholder used for the set_process_group slot of fss_classfuncs. */
static void	fss_nullsys();
    229 
/*
 * Operations vector for the FSS class; fss_init() returns its address
 * through the clfuncspp argument.  The initializer is positional, so the
 * order of the entries below must match the member order of
 * struct classfuncs.
 */
static struct classfuncs fss_classfuncs = {
	/* class functions */
	fss_admin,
	fss_getclinfo,
	fss_parmsin,
	fss_parmsout,
	fss_vaparmsin,
	fss_vaparmsout,
	fss_getclpri,
	fss_alloc,
	fss_free,

	/* thread functions */
	fss_enterclass,
	fss_exitclass,
	fss_canexit,
	fss_fork,
	fss_forkret,
	fss_parmsget,
	fss_parmsset,
	fss_stop,
	fss_exit,
	fss_active,
	fss_inactive,
	fss_swapin,
	fss_swapout,
	fss_trapret,
	fss_preempt,
	fss_setrun,
	fss_sleep,
	fss_tick,
	fss_wakeup,
	fss_donice,
	fss_globpri,
	fss_nullsys,	/* set_process_group */
	fss_yield,
	fss_doprio,
};
    268 
    269 int
    270 _init()
    271 {
    272 	return (mod_install(&modlinkage));
    273 }
    274 
    275 int
    276 _fini()
    277 {
    278 	return (EBUSY);
    279 }
    280 
    281 int
    282 _info(struct modinfo *modinfop)
    283 {
    284 	return (mod_info(&modlinkage, modinfop));
    285 }
    286 
    287 /*ARGSUSED*/
    288 static int
    289 fss_project_walker(kproject_t *kpj, void *buf)
    290 {
    291 	return (0);
    292 }
    293 
    294 void *
    295 fss_allocbuf(int op, int type)
    296 {
    297 	fssbuf_t *fssbuf;
    298 	void **fsslist;
    299 	int cnt;
    300 	int i;
    301 	size_t size;
    302 
    303 	ASSERT(op == FSS_NPSET_BUF || op == FSS_NPROJ_BUF || op == FSS_ONE_BUF);
    304 	ASSERT(type == FSS_ALLOC_PROJ || type == FSS_ALLOC_ZONE);
    305 	ASSERT(MUTEX_HELD(&cpu_lock));
    306 
    307 	fssbuf = kmem_zalloc(sizeof (fssbuf_t), KM_SLEEP);
    308 	switch (op) {
    309 	case FSS_NPSET_BUF:
    310 		cnt = cpupart_list(NULL, 0, CP_NONEMPTY);
    311 		break;
    312 	case FSS_NPROJ_BUF:
    313 		cnt = project_walk_all(ALL_ZONES, fss_project_walker, NULL);
    314 		break;
    315 	case FSS_ONE_BUF:
    316 		cnt = 1;
    317 		break;
    318 	}
    319 
    320 	switch (type) {
    321 	case FSS_ALLOC_PROJ:
    322 		size = sizeof (fssproj_t);
    323 		break;
    324 	case FSS_ALLOC_ZONE:
    325 		size = sizeof (fsszone_t);
    326 		break;
    327 	}
    328 	fsslist = kmem_zalloc(cnt * sizeof (void *), KM_SLEEP);
    329 	fssbuf->fssb_size = cnt;
    330 	fssbuf->fssb_list = fsslist;
    331 	for (i = 0; i < cnt; i++)
    332 		fsslist[i] = kmem_zalloc(size, KM_SLEEP);
    333 	return (fssbuf);
    334 }
    335 
    336 void
    337 fss_freebuf(fssbuf_t *fssbuf, int type)
    338 {
    339 	void **fsslist;
    340 	int i;
    341 	size_t size;
    342 
    343 	ASSERT(fssbuf != NULL);
    344 	ASSERT(type == FSS_ALLOC_PROJ || type == FSS_ALLOC_ZONE);
    345 	fsslist = fssbuf->fssb_list;
    346 
    347 	switch (type) {
    348 	case FSS_ALLOC_PROJ:
    349 		size = sizeof (fssproj_t);
    350 		break;
    351 	case FSS_ALLOC_ZONE:
    352 		size = sizeof (fsszone_t);
    353 		break;
    354 	}
    355 
    356 	for (i = 0; i < fssbuf->fssb_size; i++) {
    357 		if (fsslist[i] != NULL)
    358 			kmem_free(fsslist[i], size);
    359 	}
    360 	kmem_free(fsslist, sizeof (void *) * fssbuf->fssb_size);
    361 	kmem_free(fssbuf, sizeof (fssbuf_t));
    362 }
    363 
    364 static fsspset_t *
    365 fss_find_fsspset(cpupart_t *cpupart)
    366 {
    367 	int i;
    368 	fsspset_t *fsspset = NULL;
    369 	int found = 0;
    370 
    371 	ASSERT(cpupart != NULL);
    372 	ASSERT(MUTEX_HELD(&fsspsets_lock));
    373 
    374 	/*
    375 	 * Search for the cpupart pointer in the array of fsspsets.
    376 	 */
    377 	for (i = 0; i < max_ncpus; i++) {
    378 		fsspset = &fsspsets[i];
    379 		if (fsspset->fssps_cpupart == cpupart) {
    380 			ASSERT(fsspset->fssps_nproj > 0);
    381 			found = 1;
    382 			break;
    383 		}
    384 	}
    385 	if (found == 0) {
    386 		/*
    387 		 * If we didn't find anything, then use the first
    388 		 * available slot in the fsspsets array.
    389 		 */
    390 		for (i = 0; i < max_ncpus; i++) {
    391 			fsspset = &fsspsets[i];
    392 			if (fsspset->fssps_cpupart == NULL) {
    393 				ASSERT(fsspset->fssps_nproj == 0);
    394 				found = 1;
    395 				break;
    396 			}
    397 		}
    398 		fsspset->fssps_cpupart = cpupart;
    399 	}
    400 	ASSERT(found == 1);
    401 	return (fsspset);
    402 }
    403 
    404 static void
    405 fss_del_fsspset(fsspset_t *fsspset)
    406 {
    407 	ASSERT(MUTEX_HELD(&fsspsets_lock));
    408 	ASSERT(MUTEX_HELD(&fsspset->fssps_lock));
    409 	ASSERT(fsspset->fssps_nproj == 0);
    410 	ASSERT(fsspset->fssps_list == NULL);
    411 	ASSERT(fsspset->fssps_zones == NULL);
    412 	fsspset->fssps_cpupart = NULL;
    413 	fsspset->fssps_maxfsspri = 0;
    414 	fsspset->fssps_shares = 0;
    415 }
    416 
    417 /*
    418  * The following routine returns a pointer to the fsszone structure which
    419  * belongs to zone "zone" and cpu partition fsspset, if such structure exists.
    420  */
    421 static fsszone_t *
    422 fss_find_fsszone(fsspset_t *fsspset, zone_t *zone)
    423 {
    424 	fsszone_t *fsszone;
    425 
    426 	ASSERT(MUTEX_HELD(&fsspset->fssps_lock));
    427 
    428 	if (fsspset->fssps_list != NULL) {
    429 		/*
    430 		 * There are projects/zones active on this cpu partition
    431 		 * already.  Try to find our zone among them.
    432 		 */
    433 		fsszone = fsspset->fssps_zones;
    434 		do {
    435 			if (fsszone->fssz_zone == zone) {
    436 				return (fsszone);
    437 			}
    438 			fsszone = fsszone->fssz_next;
    439 		} while (fsszone != fsspset->fssps_zones);
    440 	}
    441 	return (NULL);
    442 }
    443 
    444 /*
    445  * The following routine links new fsszone structure into doubly linked list of
    446  * zones active on the specified cpu partition.
    447  */
    448 static void
    449 fss_insert_fsszone(fsspset_t *fsspset, zone_t *zone, fsszone_t *fsszone)
    450 {
    451 	ASSERT(MUTEX_HELD(&fsspset->fssps_lock));
    452 
    453 	fsszone->fssz_zone = zone;
    454 	fsszone->fssz_rshares = zone->zone_shares;
    455 
    456 	if (fsspset->fssps_zones == NULL) {
    457 		/*
    458 		 * This will be the first fsszone for this fsspset
    459 		 */
    460 		fsszone->fssz_next = fsszone->fssz_prev = fsszone;
    461 		fsspset->fssps_zones = fsszone;
    462 	} else {
    463 		/*
    464 		 * Insert this fsszone to the doubly linked list.
    465 		 */
    466 		fsszone_t *fssz_head = fsspset->fssps_zones;
    467 
    468 		fsszone->fssz_next = fssz_head;
    469 		fsszone->fssz_prev = fssz_head->fssz_prev;
    470 		fssz_head->fssz_prev->fssz_next = fsszone;
    471 		fssz_head->fssz_prev = fsszone;
    472 		fsspset->fssps_zones = fsszone;
    473 	}
    474 }
    475 
    476 /*
    477  * The following routine removes a single fsszone structure from the doubly
    478  * linked list of zones active on the specified cpu partition.  Note that
    479  * global fsspsets_lock must be held in case this fsszone structure is the last
    480  * on the above mentioned list.  Also note that the fsszone structure is not
    481  * freed here, it is the responsibility of the caller to call kmem_free for it.
    482  */
    483 static void
    484 fss_remove_fsszone(fsspset_t *fsspset, fsszone_t *fsszone)
    485 {
    486 	ASSERT(MUTEX_HELD(&fsspset->fssps_lock));
    487 	ASSERT(fsszone->fssz_nproj == 0);
    488 	ASSERT(fsszone->fssz_shares == 0);
    489 	ASSERT(fsszone->fssz_runnable == 0);
    490 
    491 	if (fsszone->fssz_next != fsszone) {
    492 		/*
    493 		 * This is not the last zone in the list.
    494 		 */
    495 		fsszone->fssz_prev->fssz_next = fsszone->fssz_next;
    496 		fsszone->fssz_next->fssz_prev = fsszone->fssz_prev;
    497 		if (fsspset->fssps_zones == fsszone)
    498 			fsspset->fssps_zones = fsszone->fssz_next;
    499 	} else {
    500 		/*
    501 		 * This was the last zone active in this cpu partition.
    502 		 */
    503 		fsspset->fssps_zones = NULL;
    504 	}
    505 }
    506 
    507 /*
    508  * The following routine returns a pointer to the fssproj structure
    509  * which belongs to project kpj and cpu partition fsspset, if such structure
    510  * exists.
    511  */
    512 static fssproj_t *
    513 fss_find_fssproj(fsspset_t *fsspset, kproject_t *kpj)
    514 {
    515 	fssproj_t *fssproj;
    516 
    517 	ASSERT(MUTEX_HELD(&fsspset->fssps_lock));
    518 
    519 	if (fsspset->fssps_list != NULL) {
    520 		/*
    521 		 * There are projects running on this cpu partition already.
    522 		 * Try to find our project among them.
    523 		 */
    524 		fssproj = fsspset->fssps_list;
    525 		do {
    526 			if (fssproj->fssp_proj == kpj) {
    527 				ASSERT(fssproj->fssp_pset == fsspset);
    528 				return (fssproj);
    529 			}
    530 			fssproj = fssproj->fssp_next;
    531 		} while (fssproj != fsspset->fssps_list);
    532 	}
    533 	return (NULL);
    534 }
    535 
    536 /*
    537  * The following routine links new fssproj structure into doubly linked list
    538  * of projects running on the specified cpu partition.
    539  */
    540 static void
    541 fss_insert_fssproj(fsspset_t *fsspset, kproject_t *kpj, fsszone_t *fsszone,
    542     fssproj_t *fssproj)
    543 {
    544 	ASSERT(MUTEX_HELD(&fsspset->fssps_lock));
    545 
    546 	fssproj->fssp_pset = fsspset;
    547 	fssproj->fssp_proj = kpj;
    548 	fssproj->fssp_shares = kpj->kpj_shares;
    549 
    550 	fsspset->fssps_nproj++;
    551 
    552 	if (fsspset->fssps_list == NULL) {
    553 		/*
    554 		 * This will be the first fssproj for this fsspset
    555 		 */
    556 		fssproj->fssp_next = fssproj->fssp_prev = fssproj;
    557 		fsspset->fssps_list = fssproj;
    558 	} else {
    559 		/*
    560 		 * Insert this fssproj to the doubly linked list.
    561 		 */
    562 		fssproj_t *fssp_head = fsspset->fssps_list;
    563 
    564 		fssproj->fssp_next = fssp_head;
    565 		fssproj->fssp_prev = fssp_head->fssp_prev;
    566 		fssp_head->fssp_prev->fssp_next = fssproj;
    567 		fssp_head->fssp_prev = fssproj;
    568 		fsspset->fssps_list = fssproj;
    569 	}
    570 	fssproj->fssp_fsszone = fsszone;
    571 	fsszone->fssz_nproj++;
    572 	ASSERT(fsszone->fssz_nproj != 0);
    573 }
    574 
    575 /*
    576  * The following routine removes a single fssproj structure from the doubly
    577  * linked list of projects running on the specified cpu partition.  Note that
    578  * global fsspsets_lock must be held in case if this fssproj structure is the
    579  * last on the above mentioned list.  Also note that the fssproj structure is
    580  * not freed here, it is the responsibility of the caller to call kmem_free
    581  * for it.
    582  */
    583 static void
    584 fss_remove_fssproj(fsspset_t *fsspset, fssproj_t *fssproj)
    585 {
    586 	fsszone_t *fsszone;
    587 
    588 	ASSERT(MUTEX_HELD(&fsspsets_lock));
    589 	ASSERT(MUTEX_HELD(&fsspset->fssps_lock));
    590 	ASSERT(fssproj->fssp_runnable == 0);
    591 
    592 	fsspset->fssps_nproj--;
    593 
    594 	fsszone = fssproj->fssp_fsszone;
    595 	fsszone->fssz_nproj--;
    596 
    597 	if (fssproj->fssp_next != fssproj) {
    598 		/*
    599 		 * This is not the last part in the list.
    600 		 */
    601 		fssproj->fssp_prev->fssp_next = fssproj->fssp_next;
    602 		fssproj->fssp_next->fssp_prev = fssproj->fssp_prev;
    603 		if (fsspset->fssps_list == fssproj)
    604 			fsspset->fssps_list = fssproj->fssp_next;
    605 		if (fsszone->fssz_nproj == 0)
    606 			fss_remove_fsszone(fsspset, fsszone);
    607 	} else {
    608 		/*
    609 		 * This was the last project part running
    610 		 * at this cpu partition.
    611 		 */
    612 		fsspset->fssps_list = NULL;
    613 		ASSERT(fsspset->fssps_nproj == 0);
    614 		ASSERT(fsszone->fssz_nproj == 0);
    615 		fss_remove_fsszone(fsspset, fsszone);
    616 		fss_del_fsspset(fsspset);
    617 	}
    618 }
    619 
    620 static void
    621 fss_inactive(kthread_t *t)
    622 {
    623 	fssproc_t *fssproc;
    624 	fssproj_t *fssproj;
    625 	fsspset_t *fsspset;
    626 	fsszone_t *fsszone;
    627 
    628 	ASSERT(THREAD_LOCK_HELD(t));
    629 	fssproc = FSSPROC(t);
    630 	fssproj = FSSPROC2FSSPROJ(fssproc);
    631 	if (fssproj == NULL)	/* if this thread already exited */
    632 		return;
    633 	fsspset = FSSPROJ2FSSPSET(fssproj);
    634 	fsszone = fssproj->fssp_fsszone;
    635 	disp_lock_enter_high(&fsspset->fssps_displock);
    636 	ASSERT(fssproj->fssp_runnable > 0);
    637 	if (--fssproj->fssp_runnable == 0) {
    638 		fsszone->fssz_shares -= fssproj->fssp_shares;
    639 		if (--fsszone->fssz_runnable == 0)
    640 			fsspset->fssps_shares -= fsszone->fssz_rshares;
    641 	}
    642 	ASSERT(fssproc->fss_runnable == 1);
    643 	fssproc->fss_runnable = 0;
    644 	disp_lock_exit_high(&fsspset->fssps_displock);
    645 }
    646 
    647 static void
    648 fss_active(kthread_t *t)
    649 {
    650 	fssproc_t *fssproc;
    651 	fssproj_t *fssproj;
    652 	fsspset_t *fsspset;
    653 	fsszone_t *fsszone;
    654 
    655 	ASSERT(THREAD_LOCK_HELD(t));
    656 	fssproc = FSSPROC(t);
    657 	fssproj = FSSPROC2FSSPROJ(fssproc);
    658 	if (fssproj == NULL)	/* if this thread already exited */
    659 		return;
    660 	fsspset = FSSPROJ2FSSPSET(fssproj);
    661 	fsszone = fssproj->fssp_fsszone;
    662 	disp_lock_enter_high(&fsspset->fssps_displock);
    663 	if (++fssproj->fssp_runnable == 1) {
    664 		fsszone->fssz_shares += fssproj->fssp_shares;
    665 		if (++fsszone->fssz_runnable == 1)
    666 			fsspset->fssps_shares += fsszone->fssz_rshares;
    667 	}
    668 	ASSERT(fssproc->fss_runnable == 0);
    669 	fssproc->fss_runnable = 1;
    670 	disp_lock_exit_high(&fsspset->fssps_displock);
    671 }
    672 
    673 /*
    674  * Fair share scheduler initialization. Called by dispinit() at boot time.
    675  * We can ignore clparmsz argument since we know that the smallest possible
    676  * parameter buffer is big enough for us.
    677  */
    678 /*ARGSUSED*/
    679 static pri_t
    680 fss_init(id_t cid, int clparmsz, classfuncs_t **clfuncspp)
    681 {
    682 	int i;
    683 
    684 	ASSERT(MUTEX_HELD(&cpu_lock));
    685 
    686 	fss_cid = cid;
    687 	fss_maxumdpri = minclsyspri - 1;
    688 	fss_maxglobpri = minclsyspri;
    689 	fss_minglobpri = 0;
    690 	fsspsets = kmem_zalloc(sizeof (fsspset_t) * max_ncpus, KM_SLEEP);
    691 
    692 	/*
    693 	 * Initialize the fssproc hash table.
    694 	 */
    695 	for (i = 0; i < FSS_LISTS; i++)
    696 		fss_listhead[i].fss_next = fss_listhead[i].fss_prev =
    697 		    &fss_listhead[i];
    698 
    699 	*clfuncspp = &fss_classfuncs;
    700 
    701 	/*
    702 	 * Fill in fss_nice_tick and fss_nice_decay arrays:
    703 	 * The cost of a tick is lower at positive nice values (so that it
    704 	 * will not increase its project's usage as much as normal) with 50%
    705 	 * drop at the maximum level and 50% increase at the minimum level.
    706 	 * The fsspri decay is slower at positive nice values.  fsspri values
    707 	 * of processes with negative nice levels must decay faster to receive
    708 	 * time slices more frequently than normal.
    709 	 */
    710 	for (i = 0; i < FSS_NICE_RANGE; i++) {
    711 		fss_nice_tick[i] = (FSS_TICK_COST * (((3 * FSS_NICE_RANGE) / 2)
    712 		    - i)) / FSS_NICE_RANGE;
    713 		fss_nice_decay[i] = FSS_DECAY_MIN +
    714 		    ((FSS_DECAY_MAX - FSS_DECAY_MIN) * i) /
    715 		    (FSS_NICE_RANGE - 1);
    716 	}
    717 
    718 	return (fss_maxglobpri);
    719 }
    720 
    721 /*
    722  * Calculate the new cpupri based on the usage, the number of shares and
    723  * the number of active threads.  Reset the tick counter for this thread.
    724  */
    725 static void
    726 fss_newpri(fssproc_t *fssproc)
    727 {
    728 	kthread_t *tp;
    729 	fssproj_t *fssproj;
    730 	fsspset_t *fsspset;
    731 	fsszone_t *fsszone;
    732 	fsspri_t fsspri, maxfsspri;
    733 	pri_t invpri;
    734 	uint32_t ticks;
    735 
    736 	tp = fssproc->fss_tp;
    737 	ASSERT(tp != NULL);
    738 
    739 	if (tp->t_cid != fss_cid)
    740 		return;
    741 
    742 	ASSERT(THREAD_LOCK_HELD(tp));
    743 
    744 	fssproj = FSSPROC2FSSPROJ(fssproc);
    745 	fsszone = FSSPROJ2FSSZONE(fssproj);
    746 	if (fssproj == NULL)
    747 		/*
    748 		 * No need to change priority of exited threads.
    749 		 */
    750 		return;
    751 
    752 	fsspset = FSSPROJ2FSSPSET(fssproj);
    753 	disp_lock_enter_high(&fsspset->fssps_displock);
    754 
    755 	if (fssproj->fssp_shares == 0 || fsszone->fssz_rshares == 0) {
    756 		/*
    757 		 * Special case: threads with no shares.
    758 		 */
    759 		fssproc->fss_umdpri = fss_minglobpri;
    760 		fssproc->fss_ticks = 0;
    761 		disp_lock_exit_high(&fsspset->fssps_displock);
    762 		return;
    763 	}
    764 
    765 	/*
    766 	 * fsspri += shusage * nrunnable * ticks
    767 	 */
    768 	ticks = fssproc->fss_ticks;
    769 	fssproc->fss_ticks = 0;
    770 	fsspri = fssproc->fss_fsspri;
    771 	fsspri += fssproj->fssp_shusage * fssproj->fssp_runnable * ticks;
    772 	fssproc->fss_fsspri = fsspri;
    773 
    774 	if (fsspri < fss_maxumdpri)
    775 		fsspri = fss_maxumdpri;	/* so that maxfsspri is != 0 */
    776 
    777 	/*
    778 	 * The general priority formula:
    779 	 *
    780 	 *			(fsspri * umdprirange)
    781 	 *   pri = maxumdpri - ------------------------
    782 	 *				maxfsspri
    783 	 *
    784 	 * If this thread's fsspri is greater than the previous largest
    785 	 * fsspri, then record it as the new high and priority for this
    786 	 * thread will be one (the lowest priority assigned to a thread
    787 	 * that has non-zero shares).
    788 	 * Note that this formula cannot produce out of bounds priority
    789 	 * values; if it is changed, additional checks may need  to  be
    790 	 * added.
    791 	 */
    792 	maxfsspri = fsspset->fssps_maxfsspri;
    793 	if (fsspri >= maxfsspri) {
    794 		fsspset->fssps_maxfsspri = fsspri;
    795 		disp_lock_exit_high(&fsspset->fssps_displock);
    796 		fssproc->fss_umdpri = 1;
    797 	} else {
    798 		disp_lock_exit_high(&fsspset->fssps_displock);
    799 		invpri = (fsspri * (fss_maxumdpri - 1)) / maxfsspri;
    800 		fssproc->fss_umdpri = fss_maxumdpri - invpri;
    801 	}
    802 }
    803 
    804 /*
    805  * Decays usages of all running projects and resets their tick counters.
    806  * Called once per second from fss_update() after updating priorities.
    807  */
    808 static void
    809 fss_decay_usage()
    810 {
    811 	uint32_t zone_ext_shares, zone_int_shares;
    812 	uint32_t kpj_shares, pset_shares;
    813 	fsspset_t *fsspset;
    814 	fssproj_t *fssproj;
    815 	fsszone_t *fsszone;
    816 	fsspri_t maxfsspri;
    817 	int psetid;
    818 
    819 	mutex_enter(&fsspsets_lock);
    820 	/*
    821 	 * Go through all active processor sets and decay usages of projects
    822 	 * running on them.
    823 	 */
    824 	for (psetid = 0; psetid < max_ncpus; psetid++) {
    825 		fsspset = &fsspsets[psetid];
    826 		mutex_enter(&fsspset->fssps_lock);
    827 
    828 		if (fsspset->fssps_cpupart == NULL ||
    829 		    (fssproj = fsspset->fssps_list) == NULL) {
    830 			mutex_exit(&fsspset->fssps_lock);
    831 			continue;
    832 		}
    833 
    834 		/*
    835 		 * Decay maxfsspri for this cpu partition with the
    836 		 * fastest possible decay rate.
    837 		 */
    838 		disp_lock_enter(&fsspset->fssps_displock);
    839 
    840 		maxfsspri = (fsspset->fssps_maxfsspri *
    841 		    fss_nice_decay[NZERO]) / FSS_DECAY_BASE;
    842 		if (maxfsspri < fss_maxumdpri)
    843 			maxfsspri = fss_maxumdpri;
    844 		fsspset->fssps_maxfsspri = maxfsspri;
    845 
    846 		do {
    847 			/*
    848 			 * Decay usage for each project running on
    849 			 * this cpu partition.
    850 			 */
    851 			fssproj->fssp_usage =
    852 			    (fssproj->fssp_usage * FSS_DECAY_USG) /
    853 			    FSS_DECAY_BASE + fssproj->fssp_ticks;
    854 			fssproj->fssp_ticks = 0;
    855 
    856 			fsszone = fssproj->fssp_fsszone;
    857 			/*
    858 			 * Readjust the project's number of shares if it has
    859 			 * changed since we checked it last time.
    860 			 */
    861 			kpj_shares = fssproj->fssp_proj->kpj_shares;
    862 			if (fssproj->fssp_shares != kpj_shares) {
    863 				if (fssproj->fssp_runnable != 0) {
    864 					fsszone->fssz_shares -=
    865 					    fssproj->fssp_shares;
    866 					fsszone->fssz_shares += kpj_shares;
    867 				}
    868 				fssproj->fssp_shares = kpj_shares;
    869 			}
    870 
    871 			/*
    872 			 * Readjust the zone's number of shares if it
    873 			 * has changed since we checked it last time.
    874 			 */
    875 			zone_ext_shares = fsszone->fssz_zone->zone_shares;
    876 			if (fsszone->fssz_rshares != zone_ext_shares) {
    877 				if (fsszone->fssz_runnable != 0) {
    878 					fsspset->fssps_shares -=
    879 					    fsszone->fssz_rshares;
    880 					fsspset->fssps_shares +=
    881 					    zone_ext_shares;
    882 				}
    883 				fsszone->fssz_rshares = zone_ext_shares;
    884 			}
    885 			zone_int_shares = fsszone->fssz_shares;
    886 			pset_shares = fsspset->fssps_shares;
    887 			/*
    888 			 * Calculate fssp_shusage value to be used
    889 			 * for fsspri increments for the next second.
    890 			 */
    891 			if (kpj_shares == 0 || zone_ext_shares == 0) {
    892 				fssproj->fssp_shusage = 0;
    893 			} else if (FSSPROJ2KPROJ(fssproj) == proj0p) {
    894 				/*
    895 				 * Project 0 in the global zone has 50%
    896 				 * of its zone.
    897 				 */
    898 				fssproj->fssp_shusage = (fssproj->fssp_usage *
    899 				    zone_int_shares * zone_int_shares) /
    900 				    (zone_ext_shares * zone_ext_shares);
    901 			} else {
    902 				/*
    903 				 * Thread's priority is based on its project's
    904 				 * normalized usage (shusage) value which gets
    905 				 * calculated this way:
    906 				 *
    907 				 *	   pset_shares^2    zone_int_shares^2
    908 				 * usage * ------------- * ------------------
    909 				 *	   kpj_shares^2	    zone_ext_shares^2
    910 				 *
    911 				 * Where zone_int_shares is the sum of shares
    912 				 * of all active projects within the zone (and
    913 				 * the pset), and zone_ext_shares is the number
    914 				 * of zone shares (ie, zone.cpu-shares).
    915 				 *
    916 				 * If there is only one zone active on the pset
    917 				 * the above reduces to:
    918 				 *
    919 				 * 			zone_int_shares^2
    920 				 * shusage = usage * ---------------------
    921 				 * 			kpj_shares^2
    922 				 *
    923 				 * If there's only one project active in the
    924 				 * zone this formula reduces to:
    925 				 *
    926 				 *			pset_shares^2
    927 				 * shusage = usage * ----------------------
    928 				 *			zone_ext_shares^2
    929 				 */
    930 				fssproj->fssp_shusage = fssproj->fssp_usage *
    931 				    pset_shares * zone_int_shares;
    932 				fssproj->fssp_shusage /=
    933 				    kpj_shares * zone_ext_shares;
    934 				fssproj->fssp_shusage *=
    935 				    pset_shares * zone_int_shares;
    936 				fssproj->fssp_shusage /=
    937 				    kpj_shares * zone_ext_shares;
    938 			}
    939 			fssproj = fssproj->fssp_next;
    940 		} while (fssproj != fsspset->fssps_list);
    941 
    942 		disp_lock_exit(&fsspset->fssps_displock);
    943 		mutex_exit(&fsspset->fssps_lock);
    944 	}
    945 	mutex_exit(&fsspsets_lock);
    946 }
    947 
/*
 * Move thread t to the user mode priority currently recorded in
 * fssproc->fss_umdpri.  The caller must hold t's thread lock.
 *
 * In every case t_cpri is refreshed from fss_upri and FSSRESTORE is cleared.
 * A thread that is curthread or in state TS_ONPROC has its priority changed
 * in place; it is then either asked to give up the CPU (with FSSBACKQ set)
 * when DISP_MUST_SURRENDER() says so, or handed a fresh time quantum.  Any
 * other thread goes through thread_change_pri(): a non-zero return earns a
 * fresh quantum, a zero return sets FSSBACKQ.
 */
static void
fss_change_priority(kthread_t *t, fssproc_t *fssproc)
{
	pri_t new_pri;

	ASSERT(THREAD_LOCK_HELD(t));
	new_pri = fssproc->fss_umdpri;
	ASSERT(new_pri >= 0 && new_pri <= fss_maxglobpri);

	t->t_cpri = fssproc->fss_upri;
	fssproc->fss_flags &= ~FSSRESTORE;
	if (t == curthread || t->t_state == TS_ONPROC) {
		/*
		 * curthread is always onproc
		 */
		cpu_t *cp = t->t_disp_queue->disp_cpu;
		THREAD_CHANGE_PRI(t, new_pri);
		/*
		 * Keep the CPU's notion of the dispatched priority in step
		 * when t is the thread that CPU has dispatched.
		 */
		if (t == cp->cpu_dispthread)
			cp->cpu_dispatch_pri = DISP_PRIO(t);
		if (DISP_MUST_SURRENDER(t)) {
			fssproc->fss_flags |= FSSBACKQ;
			cpu_surrender(t);
		} else {
			fssproc->fss_timeleft = fss_quantum;
		}
	} else {
		/*
		 * When the priority of a thread is changed, it may be
		 * necessary to adjust its position on a sleep queue or
		 * dispatch queue.  The function thread_change_pri accomplishes
		 * this.
		 */
		if (thread_change_pri(t, new_pri, 0)) {
			/*
			 * The thread was on a run queue.
			 */
			fssproc->fss_timeleft = fss_quantum;
		} else {
			fssproc->fss_flags |= FSSBACKQ;
		}
	}
}
    990 
    991 /*
    992  * Update priorities of all fair-sharing threads that are currently runnable
    993  * at a user mode priority based on the number of shares and current usage.
    994  * Called once per second via timeout which we reset here.
    995  *
    996  * There are several lists of fair-sharing threads broken up by a hash on the
    997  * thread pointer.  Each list has its own lock.  This avoids blocking all
    998  * fss_enterclass, fss_fork, and fss_exitclass operations while fss_update runs.
    999  * fss_update traverses each list in turn.
   1000  */
   1001 static void
   1002 fss_update(void *arg)
   1003 {
   1004 	int i;
   1005 	int new_marker = -1;
   1006 	static int fss_update_marker;
   1007 
   1008 	/*
   1009 	 * Decay and update usages for all projects.
   1010 	 */
   1011 	fss_decay_usage();
   1012 
   1013 	/*
   1014 	 * Start with the fss_update_marker list, then do the rest.
   1015 	 */
   1016 	i = fss_update_marker;
   1017 
   1018 	/*
   1019 	 * Go around all threads, set new priorities and decay
   1020 	 * per-thread CPU usages.
   1021 	 */
   1022 	do {
   1023 		/*
   1024 		 * If this is the first list after the current marker to have
   1025 		 * threads with priorities updates, advance the marker to this
   1026 		 * list for the next time fss_update runs.
   1027 		 */
   1028 		if (fss_update_list(i) &&
   1029 		    new_marker == -1 && i != fss_update_marker)
   1030 			new_marker = i;
   1031 	} while ((i = FSS_LIST_NEXT(i)) != fss_update_marker);
   1032 
   1033 	/*
   1034 	 * Advance marker for the next fss_update call
   1035 	 */
   1036 	if (new_marker != -1)
   1037 		fss_update_marker = new_marker;
   1038 
   1039 	(void) timeout(fss_update, arg, hz);
   1040 }
   1041 
   1042 /*
   1043  * Updates priority for a list of threads.  Returns 1 if the priority of one
   1044  * of the threads was actually updated, 0 if none were for various reasons
   1045  * (thread is no longer in the FSS class, is not runnable, has the preemption
   1046  * control no-preempt bit set, etc.)
   1047  */
   1048 static int
   1049 fss_update_list(int i)
   1050 {
   1051 	fssproc_t *fssproc;
   1052 	fssproj_t *fssproj;
   1053 	fsspri_t fsspri;
   1054 	kthread_t *t;
   1055 	int updated = 0;
   1056 
   1057 	mutex_enter(&fss_listlock[i]);
   1058 	for (fssproc = fss_listhead[i].fss_next; fssproc != &fss_listhead[i];
   1059 	    fssproc = fssproc->fss_next) {
   1060 		t = fssproc->fss_tp;
   1061 		/*
   1062 		 * Lock the thread and verify the state.
   1063 		 */
   1064 		thread_lock(t);
   1065 		/*
   1066 		 * Skip the thread if it is no longer in the FSS class or
   1067 		 * is running with kernel mode priority.
   1068 		 */
   1069 		if (t->t_cid != fss_cid)
   1070 			goto next;
   1071 		if ((fssproc->fss_flags & FSSKPRI) != 0)
   1072 			goto next;
   1073 
   1074 		fssproj = FSSPROC2FSSPROJ(fssproc);
   1075 		if (fssproj == NULL)
   1076 			goto next;
   1077 		if (fssproj->fssp_shares != 0) {
   1078 			/*
   1079 			 * Decay fsspri value.
   1080 			 */
   1081 			fsspri = fssproc->fss_fsspri;
   1082 			fsspri = (fsspri * fss_nice_decay[fssproc->fss_nice]) /
   1083 			    FSS_DECAY_BASE;
   1084 			fssproc->fss_fsspri = fsspri;
   1085 		}
   1086 
   1087 		if (t->t_schedctl && schedctl_get_nopreempt(t))
   1088 			goto next;
   1089 		if (t->t_state != TS_RUN && t->t_state != TS_WAIT) {
   1090 			/*
   1091 			 * Make next syscall/trap call fss_trapret
   1092 			 */
   1093 			t->t_trapret = 1;
   1094 			aston(t);
   1095 			goto next;
   1096 		}
   1097 		fss_newpri(fssproc);
   1098 		updated = 1;
   1099 
   1100 		/*
   1101 		 * Only dequeue the thread if it needs to be moved; otherwise
   1102 		 * it should just round-robin here.
   1103 		 */
   1104 		if (t->t_pri != fssproc->fss_umdpri)
   1105 			fss_change_priority(t, fssproc);
   1106 next:
   1107 		thread_unlock(t);
   1108 	}
   1109 	mutex_exit(&fss_listlock[i]);
   1110 	return (updated);
   1111 }
   1112 
   1113 /*ARGSUSED*/
   1114 static int
   1115 fss_admin(caddr_t uaddr, cred_t *reqpcredp)
   1116 {
   1117 	fssadmin_t fssadmin;
   1118 
   1119 	if (copyin(uaddr, &fssadmin, sizeof (fssadmin_t)))
   1120 		return (EFAULT);
   1121 
   1122 	switch (fssadmin.fss_cmd) {
   1123 	case FSS_SETADMIN:
   1124 		if (secpolicy_dispadm(reqpcredp) != 0)
   1125 			return (EPERM);
   1126 		if (fssadmin.fss_quantum <= 0 || fssadmin.fss_quantum >= hz)
   1127 			return (EINVAL);
   1128 		fss_quantum = fssadmin.fss_quantum;
   1129 		break;
   1130 	case FSS_GETADMIN:
   1131 		fssadmin.fss_quantum = fss_quantum;
   1132 		if (copyout(&fssadmin, uaddr, sizeof (fssadmin_t)))
   1133 			return (EFAULT);
   1134 		break;
   1135 	default:
   1136 		return (EINVAL);
   1137 	}
   1138 	return (0);
   1139 }
   1140 
   1141 static int
   1142 fss_getclinfo(void *infop)
   1143 {
   1144 	fssinfo_t *fssinfo = (fssinfo_t *)infop;
   1145 	fssinfo->fss_maxupri = fss_maxupri;
   1146 	return (0);
   1147 }
   1148 
   1149 static int
   1150 fss_parmsin(void *parmsp)
   1151 {
   1152 	fssparms_t *fssparmsp = (fssparms_t *)parmsp;
   1153 
   1154 	/*
   1155 	 * Check validity of parameters.
   1156 	 */
   1157 	if ((fssparmsp->fss_uprilim > fss_maxupri ||
   1158 	    fssparmsp->fss_uprilim < -fss_maxupri) &&
   1159 	    fssparmsp->fss_uprilim != FSS_NOCHANGE)
   1160 		return (EINVAL);
   1161 
   1162 	if ((fssparmsp->fss_upri > fss_maxupri ||
   1163 	    fssparmsp->fss_upri < -fss_maxupri) &&
   1164 	    fssparmsp->fss_upri != FSS_NOCHANGE)
   1165 		return (EINVAL);
   1166 
   1167 	return (0);
   1168 }
   1169 
   1170 /*ARGSUSED*/
   1171 static int
   1172 fss_parmsout(void *parmsp, pc_vaparms_t *vaparmsp)
   1173 {
   1174 	return (0);
   1175 }
   1176 
   1177 static int
   1178 fss_vaparmsin(void *parmsp, pc_vaparms_t *vaparmsp)
   1179 {
   1180 	fssparms_t *fssparmsp = (fssparms_t *)parmsp;
   1181 	int priflag = 0;
   1182 	int limflag = 0;
   1183 	uint_t cnt;
   1184 	pc_vaparm_t *vpp = &vaparmsp->pc_parms[0];
   1185 
   1186 	/*
   1187 	 * FSS_NOCHANGE (-32768) is outside of the range of values for
   1188 	 * fss_uprilim and fss_upri.  If the structure fssparms_t is changed,
   1189 	 * FSS_NOCHANGE should be replaced by a flag word.
   1190 	 */
   1191 	fssparmsp->fss_uprilim = FSS_NOCHANGE;
   1192 	fssparmsp->fss_upri = FSS_NOCHANGE;
   1193 
   1194 	/*
   1195 	 * Get the varargs parameter and check validity of parameters.
   1196 	 */
   1197 	if (vaparmsp->pc_vaparmscnt > PC_VAPARMCNT)
   1198 		return (EINVAL);
   1199 
   1200 	for (cnt = 0; cnt < vaparmsp->pc_vaparmscnt; cnt++, vpp++) {
   1201 		switch (vpp->pc_key) {
   1202 		case FSS_KY_UPRILIM:
   1203 			if (limflag++)
   1204 				return (EINVAL);
   1205 			fssparmsp->fss_uprilim = (pri_t)vpp->pc_parm;
   1206 			if (fssparmsp->fss_uprilim > fss_maxupri ||
   1207 			    fssparmsp->fss_uprilim < -fss_maxupri)
   1208 				return (EINVAL);
   1209 			break;
   1210 		case FSS_KY_UPRI:
   1211 			if (priflag++)
   1212 				return (EINVAL);
   1213 			fssparmsp->fss_upri = (pri_t)vpp->pc_parm;
   1214 			if (fssparmsp->fss_upri > fss_maxupri ||
   1215 			    fssparmsp->fss_upri < -fss_maxupri)
   1216 				return (EINVAL);
   1217 			break;
   1218 		default:
   1219 			return (EINVAL);
   1220 		}
   1221 	}
   1222 
   1223 	if (vaparmsp->pc_vaparmscnt == 0) {
   1224 		/*
   1225 		 * Use default parameters.
   1226 		 */
   1227 		fssparmsp->fss_upri = fssparmsp->fss_uprilim = 0;
   1228 	}
   1229 
   1230 	return (0);
   1231 }
   1232 
   1233 /*
   1234  * Copy all selected fair-sharing class parameters to the user.  The parameters
   1235  * are specified by a key.
   1236  */
   1237 static int
   1238 fss_vaparmsout(void *parmsp, pc_vaparms_t *vaparmsp)
   1239 {
   1240 	fssparms_t *fssparmsp = (fssparms_t *)parmsp;
   1241 	int priflag = 0;
   1242 	int limflag = 0;
   1243 	uint_t cnt;
   1244 	pc_vaparm_t *vpp = &vaparmsp->pc_parms[0];
   1245 
   1246 	ASSERT(MUTEX_NOT_HELD(&curproc->p_lock));
   1247 
   1248 	if (vaparmsp->pc_vaparmscnt > PC_VAPARMCNT)
   1249 		return (EINVAL);
   1250 
   1251 	for (cnt = 0; cnt < vaparmsp->pc_vaparmscnt; cnt++, vpp++) {
   1252 		switch (vpp->pc_key) {
   1253 		case FSS_KY_UPRILIM:
   1254 			if (limflag++)
   1255 				return (EINVAL);
   1256 			if (copyout(&fssparmsp->fss_uprilim,
   1257 			    (caddr_t)(uintptr_t)vpp->pc_parm, sizeof (pri_t)))
   1258 				return (EFAULT);
   1259 			break;
   1260 		case FSS_KY_UPRI:
   1261 			if (priflag++)
   1262 				return (EINVAL);
   1263 			if (copyout(&fssparmsp->fss_upri,
   1264 			    (caddr_t)(uintptr_t)vpp->pc_parm, sizeof (pri_t)))
   1265 				return (EFAULT);
   1266 			break;
   1267 		default:
   1268 			return (EINVAL);
   1269 		}
   1270 	}
   1271 
   1272 	return (0);
   1273 }
   1274 
   1275 /*
   1276  * Return the user mode scheduling priority range.
   1277  */
   1278 static int
   1279 fss_getclpri(pcpri_t *pcprip)
   1280 {
   1281 	pcprip->pc_clpmax = fss_maxupri;
   1282 	pcprip->pc_clpmin = -fss_maxupri;
   1283 	return (0);
   1284 }
   1285 
   1286 static int
   1287 fss_alloc(void **p, int flag)
   1288 {
   1289 	void *bufp;
   1290 
   1291 	if ((bufp = kmem_zalloc(sizeof (fssproc_t), flag)) == NULL) {
   1292 		return (ENOMEM);
   1293 	} else {
   1294 		*p = bufp;
   1295 		return (0);
   1296 	}
   1297 }
   1298 
   1299 static void
   1300 fss_free(void *bufp)
   1301 {
   1302 	if (bufp)
   1303 		kmem_free(bufp, sizeof (fssproc_t));
   1304 }
   1305 
   1306 /*
   1307  * Thread functions
   1308  */
/*
 * Move thread t into the fair-share class identified by cid.
 *
 * parmsp, when non-NULL, points to an fssparms_t holding the requested user
 * priority and user priority limit; FSS_NOCHANGE in either field selects the
 * default for that field.  bufp is the fssproc_t that becomes the thread's
 * class-specific data (t_cldata) and must not be NULL.  The caller must hold
 * the p_lock of t's process.
 *
 * Returns 0 on success, EPERM if secpolicy_setpriority() rejects reqpcredp,
 * or ENOMEM if an fsszone_t or fssproj_t is needed and cannot be allocated
 * without sleeping.
 */
static int
fss_enterclass(kthread_t *t, id_t cid, void *parmsp, cred_t *reqpcredp,
    void *bufp)
{
	fssparms_t	*fssparmsp = (fssparms_t *)parmsp;
	fssproc_t	*fssproc;
	pri_t		reqfssuprilim;
	pri_t		reqfssupri;
	static uint32_t fssexists = 0;	/* set once fss_update is scheduled */
	fsspset_t	*fsspset;
	fssproj_t	*fssproj;
	fsszone_t	*fsszone;
	kproject_t	*kpj;
	zone_t		*zone;
	int		fsszone_allocated = 0;	/* fsszone was created here */

	fssproc = (fssproc_t *)bufp;
	ASSERT(fssproc != NULL);

	ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock));

	/*
	 * Only root can move threads to FSS class.
	 */
	if (reqpcredp != NULL && secpolicy_setpriority(reqpcredp) != 0)
		return (EPERM);
	/*
	 * Initialize the fssproc structure.
	 */
	fssproc->fss_umdpri = fss_maxumdpri / 2;

	if (fssparmsp == NULL) {
		/*
		 * Use default values.
		 */
		fssproc->fss_nice = NZERO;
		fssproc->fss_uprilim = fssproc->fss_upri = 0;
	} else {
		/*
		 * Use supplied values.
		 *
		 * NOTE(review): unlike the check at the top of the function,
		 * the secpolicy_setpriority() calls below are not guarded by
		 * a NULL test on reqpcredp -- confirm that callers which
		 * supply parameters always supply a credential as well.
		 */
		if (fssparmsp->fss_uprilim == FSS_NOCHANGE) {
			reqfssuprilim = 0;
		} else {
			if (fssparmsp->fss_uprilim > 0 &&
			    secpolicy_setpriority(reqpcredp) != 0)
				return (EPERM);
			reqfssuprilim = fssparmsp->fss_uprilim;
		}
		if (fssparmsp->fss_upri == FSS_NOCHANGE) {
			reqfssupri = reqfssuprilim;
		} else {
			if (fssparmsp->fss_upri > 0 &&
			    secpolicy_setpriority(reqpcredp) != 0)
				return (EPERM);
			/*
			 * Set the user priority to the requested value or
			 * the upri limit, whichever is lower.
			 */
			reqfssupri = fssparmsp->fss_upri;
			if (reqfssupri > reqfssuprilim)
				reqfssupri = reqfssuprilim;
		}
		fssproc->fss_uprilim = reqfssuprilim;
		fssproc->fss_upri = reqfssupri;
		/*
		 * Derive the nice value from the user priority: a upri of 0
		 * maps to NZERO, and the result is capped at FSS_NICE_MAX.
		 */
		fssproc->fss_nice = NZERO - (NZERO * reqfssupri) / fss_maxupri;
		if (fssproc->fss_nice > FSS_NICE_MAX)
			fssproc->fss_nice = FSS_NICE_MAX;
	}

	fssproc->fss_timeleft = fss_quantum;
	fssproc->fss_tp = t;
	cpucaps_sc_init(&fssproc->fss_caps);

	/*
	 * Put a lock on our fsspset structure.
	 */
	mutex_enter(&fsspsets_lock);
	fsspset = fss_find_fsspset(t->t_cpupart);
	mutex_enter(&fsspset->fssps_lock);
	mutex_exit(&fsspsets_lock);

	/*
	 * Find, or create, the fsszone for the thread's zone on this pset.
	 * The allocation is KM_NOSLEEP; presumably because fssps_lock and
	 * the caller's p_lock are held at this point.
	 */
	zone = ttoproc(t)->p_zone;
	if ((fsszone = fss_find_fsszone(fsspset, zone)) == NULL) {
		if ((fsszone = kmem_zalloc(sizeof (fsszone_t), KM_NOSLEEP))
		    == NULL) {
			mutex_exit(&fsspset->fssps_lock);
			return (ENOMEM);
		} else {
			fsszone_allocated = 1;
			fss_insert_fsszone(fsspset, zone, fsszone);
		}
	}
	/*
	 * Likewise find, or create, the fssproj for the thread's project.
	 * If that fails, undo an fsszone that was created just above.
	 */
	kpj = ttoproj(t);
	if ((fssproj = fss_find_fssproj(fsspset, kpj)) == NULL) {
		if ((fssproj = kmem_zalloc(sizeof (fssproj_t), KM_NOSLEEP))
		    == NULL) {
			if (fsszone_allocated) {
				fss_remove_fsszone(fsspset, fsszone);
				kmem_free(fsszone, sizeof (fsszone_t));
			}
			mutex_exit(&fsspset->fssps_lock);
			return (ENOMEM);
		} else {
			fss_insert_fssproj(fsspset, kpj, fsszone, fssproj);
		}
	}
	fssproj->fssp_threads++;
	fssproc->fss_proj = fssproj;

	/*
	 * Reset priority. Process goes to a "user mode" priority here
	 * regardless of whether or not it has slept since entering the kernel.
	 */
	thread_lock(t);
	t->t_clfuncs = &(sclass[cid].cl_funcs->thread);
	t->t_cid = cid;
	t->t_cldata = (void *)fssproc;
	t->t_schedflag |= TS_RUNQMATCH;
	fss_change_priority(t, fssproc);
	/*
	 * A thread that is on a run queue, on a processor, or in the wait
	 * state is reported to fss_active() right away.
	 */
	if (t->t_state == TS_RUN || t->t_state == TS_ONPROC ||
	    t->t_state == TS_WAIT)
		fss_active(t);
	thread_unlock(t);

	mutex_exit(&fsspset->fssps_lock);

	/*
	 * Link new structure into fssproc list.
	 */
	FSS_LIST_INSERT(fssproc);

	/*
	 * If this is the first fair-sharing thread to occur since boot,
	 * we set up the initial call to fss_update() here. Use an atomic
	 * compare-and-swap since that's easier and faster than a mutex
	 * (but check with an ordinary load first since most of the time
	 * this will already be done).
	 */
	if (fssexists == 0 && cas32(&fssexists, 0, 1) == 0)
		(void) timeout(fss_update, NULL, hz);

	return (0);
}
   1453 
/*
 * Dispose of the fssproc_t that procp points to: if its thread has already
 * moved to another scheduling class, undo the thread's contribution to the
 * project, zone and pset accounting; then, in all cases, remove the
 * structure from its fssproc list and free it.
 */
static void
fss_exitclass(void *procp)
{
	fssproc_t *fssproc = (fssproc_t *)procp;
	fssproj_t *fssproj;
	fsspset_t *fsspset;
	fsszone_t *fsszone;
	kthread_t *t = fssproc->fss_tp;

	/*
	 * We should be either getting this thread off the deathrow or
	 * this thread has already moved to another scheduling class and
	 * we're being called with its old cldata buffer pointer.  In both
	 * cases, the content of this buffer can not be changed while we're
	 * here.
	 */
	mutex_enter(&fsspsets_lock);
	thread_lock(t);
	if (t->t_cid != fss_cid) {
		/*
		 * We're being called as a result of the priocntl() system
		 * call -- someone is trying to move our thread to another
		 * scheduling class. We can't call fss_inactive() here
		 * because our thread's t_cldata pointer already points
		 * to another scheduling class specific data.
		 */
		ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock));

		fssproj = FSSPROC2FSSPROJ(fssproc);
		fsspset = FSSPROJ2FSSPSET(fssproj);
		fsszone = fssproj->fssp_fsszone;

		/*
		 * If the thread was counted as runnable, drop that count.
		 * When the project's runnable count reaches zero its shares
		 * leave the zone's total; when the zone's runnable count
		 * reaches zero in turn, the zone's shares leave the pset's
		 * total.
		 */
		if (fssproc->fss_runnable) {
			disp_lock_enter_high(&fsspset->fssps_displock);
			if (--fssproj->fssp_runnable == 0) {
				fsszone->fssz_shares -= fssproj->fssp_shares;
				if (--fsszone->fssz_runnable == 0)
					fsspset->fssps_shares -=
					    fsszone->fssz_rshares;
			}
			disp_lock_exit_high(&fsspset->fssps_displock);
		}
		thread_unlock(t);

		/*
		 * Drop the thread's reference on the project.  The last
		 * reference removes the fssproj and frees it, along with the
		 * fsszone if that no longer has any projects.
		 */
		mutex_enter(&fsspset->fssps_lock);
		if (--fssproj->fssp_threads == 0) {
			fss_remove_fssproj(fsspset, fssproj);
			if (fsszone->fssz_nproj == 0)
				kmem_free(fsszone, sizeof (fsszone_t));
			kmem_free(fssproj, sizeof (fssproj_t));
		}
		mutex_exit(&fsspset->fssps_lock);

	} else {
		ASSERT(t->t_state == TS_FREE);
		/*
		 * We're being called from thread_free() when our thread
		 * is removed from the deathrow. There is nothing we need
		 * do here since everything should've been done earlier
		 * in fss_exit().
		 */
		thread_unlock(t);
	}
	mutex_exit(&fsspsets_lock);

	FSS_LIST_DELETE(fssproc);
	fss_free(fssproc);
}
   1525 
   1526 /*ARGSUSED*/
   1527 static int
   1528 fss_canexit(kthread_t *t, cred_t *credp)
   1529 {
   1530 	/*
   1531 	 * A thread is allowed to exit FSS only if we have sufficient
   1532 	 * privileges.
   1533 	 */
   1534 	if (credp != NULL && secpolicy_setpriority(credp) != 0)
   1535 		return (EPERM);
   1536 	else
   1537 		return (0);
   1538 }
   1539 
/*
 * Initialize fair-share class specific proc structure for a child.
 *
 * pt is the parent thread and ct the child thread, which must be in state
 * TS_STOPPED.  bufp is the fssproc_t that becomes the child's class-specific
 * data; it is zeroed here and must not be NULL.  The caller must hold the
 * p_lock of the parent's process.  The child joins the parent's project and
 * inherits the parent's priorities, nice value and flags (less FSSKPRI,
 * FSSBACKQ and FSSRESTORE).  Always returns 0.
 */
static int
fss_fork(kthread_t *pt, kthread_t *ct, void *bufp)
{
	fssproc_t *pfssproc;	/* ptr to parent's fssproc structure	*/
	fssproc_t *cfssproc;	/* ptr to child's fssproc structure	*/
	fssproj_t *fssproj;
	fsspset_t *fsspset;

	ASSERT(MUTEX_HELD(&ttoproc(pt)->p_lock));
	ASSERT(ct->t_state == TS_STOPPED);

	cfssproc = (fssproc_t *)bufp;
	ASSERT(cfssproc != NULL);
	bzero(cfssproc, sizeof (fssproc_t));

	/*
	 * Look up the parent's project and pset under the thread lock, then
	 * drop the thread lock again so that fssps_lock can be taken before
	 * the thread lock is re-acquired below.
	 */
	thread_lock(pt);
	pfssproc = FSSPROC(pt);
	fssproj = FSSPROC2FSSPROJ(pfssproc);
	fsspset = FSSPROJ2FSSPSET(fssproj);
	thread_unlock(pt);

	mutex_enter(&fsspset->fssps_lock);
	/*
	 * Initialize child's fssproc structure.
	 */
	thread_lock(pt);
	ASSERT(FSSPROJ(pt) == fssproj);
	cfssproc->fss_proj = fssproj;
	cfssproc->fss_timeleft = fss_quantum;
	cfssproc->fss_umdpri = pfssproc->fss_umdpri;
	cfssproc->fss_fsspri = 0;
	cfssproc->fss_uprilim = pfssproc->fss_uprilim;
	cfssproc->fss_upri = pfssproc->fss_upri;
	cfssproc->fss_tp = ct;
	cfssproc->fss_nice = pfssproc->fss_nice;
	cpucaps_sc_init(&cfssproc->fss_caps);

	cfssproc->fss_flags =
	    pfssproc->fss_flags & ~(FSSKPRI | FSSBACKQ | FSSRESTORE);
	ct->t_cldata = (void *)cfssproc;
	ct->t_schedflag |= TS_RUNQMATCH;
	thread_unlock(pt);

	/*
	 * Account for the child in the project's thread count while
	 * fssps_lock is still held.
	 */
	fssproj->fssp_threads++;
	mutex_exit(&fsspset->fssps_lock);

	/*
	 * Link new structure into fssproc hash table.
	 */
	FSS_LIST_INSERT(cfssproc);
	return (0);
}
   1595 
/*
 * Child is placed at back of dispatcher queue and parent gives up processor
 * so that the child runs first after the fork. This allows the child
 * immediately execing to break the multiple use of copy on write pages with no
 * disk home. The parent will get to steal them back rather than uselessly
 * copying them.
 *
 * t is the parent and must be curthread; ct is the child.  The caller must
 * hold pidlock, which is released here.
 */
static void
fss_forkret(kthread_t *t, kthread_t *ct)
{
	proc_t *pp = ttoproc(t);
	proc_t *cp = ttoproc(ct);
	fssproc_t *fssproc;

	ASSERT(t == curthread);
	ASSERT(MUTEX_HELD(&pidlock));

	/*
	 * Grab the child's p_lock before dropping pidlock to ensure the
	 * process does not disappear before we set it running.
	 */
	mutex_enter(&cp->p_lock);
	mutex_exit(&pidlock);
	continuelwps(cp);
	mutex_exit(&cp->p_lock);

	/*
	 * Now do the same for the parent process.
	 */
	mutex_enter(&pp->p_lock);
	continuelwps(pp);
	mutex_exit(&pp->p_lock);

	thread_lock(t);

	/*
	 * Give the parent a freshly computed user mode priority and a full
	 * quantum, clear FSSKPRI, and take it off the processor.
	 */
	fssproc = FSSPROC(t);
	fss_newpri(fssproc);
	fssproc->fss_timeleft = fss_quantum;
	t->t_pri = fssproc->fss_umdpri;
	ASSERT(t->t_pri >= 0 && t->t_pri <= fss_maxglobpri);
	fssproc->fss_flags &= ~FSSKPRI;
	THREAD_TRANSITION(t);

	/*
	 * We don't want to call fss_setrun(t) here because it may call
	 * fss_active, which we don't need.
	 */
	fssproc->fss_flags &= ~FSSBACKQ;

	/*
	 * Queue at the back unless t_disp_time equals the current lbolt
	 * value, in which case queue at the front.
	 */
	if (t->t_disp_time != ddi_get_lbolt())
		setbackdq(t);
	else
		setfrontdq(t);

	thread_unlock(t);

	swtch();
}
   1651 
   1652 /*
   1653  * Get the fair-sharing parameters of the thread pointed to by fssprocp into
   1654  * the buffer pointed by fssparmsp.
   1655  */
   1656 static void
   1657 fss_parmsget(kthread_t *t, void *parmsp)
   1658 {
   1659 	fssproc_t *fssproc = FSSPROC(t);
   1660 	fssparms_t *fssparmsp = (fssparms_t *)parmsp;
   1661 
   1662 	fssparmsp->fss_uprilim = fssproc->fss_uprilim;
   1663 	fssparmsp->fss_upri = fssproc->fss_upri;
   1664 }
   1665 
   1666 /*ARGSUSED*/
   1667 static int
   1668 fss_parmsset(kthread_t *t, void *parmsp, id_t reqpcid, cred_t *reqpcredp)
   1669 {
   1670 	char		nice;
   1671 	pri_t		reqfssuprilim;
   1672 	pri_t		reqfssupri;
   1673 	fssproc_t	*fssproc = FSSPROC(t);
   1674 	fssparms_t	*fssparmsp = (fssparms_t *)parmsp;
   1675 
   1676 	ASSERT(MUTEX_HELD(&(ttoproc(t))->p_lock));
   1677 
   1678 	if (fssparmsp->fss_uprilim == FSS_NOCHANGE)
   1679 		reqfssuprilim = fssproc->fss_uprilim;
   1680 	else
   1681 		reqfssuprilim = fssparmsp->fss_uprilim;
   1682 
   1683 	if (fssparmsp->fss_upri == FSS_NOCHANGE)
   1684 		reqfssupri = fssproc->fss_upri;
   1685 	else
   1686 		reqfssupri = fssparmsp->fss_upri;
   1687 
   1688 	/*
   1689 	 * Make sure the user priority doesn't exceed the upri limit.
   1690 	 */
   1691 	if (reqfssupri > reqfssuprilim)
   1692 		reqfssupri = reqfssuprilim;
   1693 
   1694 	/*
   1695 	 * Basic permissions enforced by generic kernel code for all classes
   1696 	 * require that a thread attempting to change the scheduling parameters
   1697 	 * of a target thread be privileged or have a real or effective UID
   1698 	 * matching that of the target thread. We are not called unless these
   1699 	 * basic permission checks have already passed. The fair-sharing class
   1700 	 * requires in addition that the calling thread be privileged if it
   1701 	 * is attempting to raise the upri limit above its current value.
   1702 	 * This may have been checked previously but if our caller passed us
   1703 	 * a non-NULL credential pointer we assume it hasn't and we check it
   1704 	 * here.
   1705 	 */
   1706 	if ((reqpcredp != NULL) &&
   1707 	    (reqfssuprilim > fssproc->fss_uprilim) &&
   1708 	    secpolicy_setpriority(reqpcredp) != 0)
   1709 		return (EPERM);
   1710 
   1711 	/*
   1712 	 * Set fss_nice to the nice value corresponding to the user priority we
   1713 	 * are setting.  Note that setting the nice field of the parameter
   1714 	 * struct won't affect upri or nice.
   1715 	 */
   1716 	nice = NZERO - (reqfssupri * NZERO) / fss_maxupri;
   1717 	if (nice > FSS_NICE_MAX)
   1718 		nice = FSS_NICE_MAX;
   1719 
   1720 	thread_lock(t);
   1721 
   1722 	fssproc->fss_uprilim = reqfssuprilim;
   1723 	fssproc->fss_upri = reqfssupri;
   1724 	fssproc->fss_nice = nice;
   1725 	fss_newpri(fssproc);
   1726 
   1727 	if ((fssproc->fss_flags & FSSKPRI) != 0) {
   1728 		thread_unlock(t);
   1729 		return (0);
   1730 	}
   1731 
   1732 	fss_change_priority(t, fssproc);
   1733 	thread_unlock(t);
   1734 	return (0);
   1735 
   1736 }
   1737 
   1738 /*
   1739  * The thread is being stopped.
   1740  */
   1741 /*ARGSUSED*/
   1742 static void
   1743 fss_stop(kthread_t *t, int why, int what)
   1744 {
   1745 	ASSERT(THREAD_LOCK_HELD(t));
   1746 	ASSERT(t == curthread);
   1747 
   1748 	fss_inactive(t);
   1749 }
   1750 
/*
 * The current thread is exiting, do necessary adjustments to its project.
 *
 * The thread's contribution to the runnable and share totals of its project,
 * zone and pset is withdrawn, its reference on the project is dropped (the
 * last reference frees the project and, when it has no projects left, the
 * zone structure), and its fssproc is marked as exited by clearing fss_proj.
 * Finally any outstanding CPU time is charged when CPU caps are on.
 */
static void
fss_exit(kthread_t *t)
{
	fsspset_t *fsspset;
	fssproj_t *fssproj;
	fssproc_t *fssproc;
	fsszone_t *fsszone;
	int free = 0;	/* set when the last thread has left the project */

	/*
	 * Thread t here is either a current thread (in which case we hold
	 * its process' p_lock), or a thread being destroyed by forklwp_fail(),
	 * in which case we hold pidlock and thread is no longer on the
	 * thread list.
	 */
	ASSERT(MUTEX_HELD(&(ttoproc(t))->p_lock) || MUTEX_HELD(&pidlock));

	fssproc = FSSPROC(t);
	fssproj = FSSPROC2FSSPROJ(fssproc);
	fsspset = FSSPROJ2FSSPSET(fssproj);
	fsszone = fssproj->fssp_fsszone;

	mutex_enter(&fsspsets_lock);
	mutex_enter(&fsspset->fssps_lock);

	thread_lock(t);
	disp_lock_enter_high(&fsspset->fssps_displock);
	/*
	 * A thread that is on a processor or on a run queue is still counted
	 * as runnable; take it out of the counts.  The project's shares
	 * leave the zone's total when the project's runnable count reaches
	 * zero, and the zone's shares leave the pset's total when the zone's
	 * runnable count does.
	 */
	if (t->t_state == TS_ONPROC || t->t_state == TS_RUN) {
		if (--fssproj->fssp_runnable == 0) {
			fsszone->fssz_shares -= fssproj->fssp_shares;
			if (--fsszone->fssz_runnable == 0)
				fsspset->fssps_shares -= fsszone->fssz_rshares;
		}
		ASSERT(fssproc->fss_runnable == 1);
		fssproc->fss_runnable = 0;
	}
	if (--fssproj->fssp_threads == 0) {
		fss_remove_fssproj(fsspset, fssproj);
		free = 1;
	}
	disp_lock_exit_high(&fsspset->fssps_displock);
	fssproc->fss_proj = NULL;	/* mark this thread as already exited */
	thread_unlock(t);

	/*
	 * The memory is released only after the dispatcher lock and the
	 * thread lock have been dropped.
	 */
	if (free) {
		if (fsszone->fssz_nproj == 0)
			kmem_free(fsszone, sizeof (fsszone_t));
		kmem_free(fssproj, sizeof (fssproj_t));
	}
	mutex_exit(&fsspset->fssps_lock);
	mutex_exit(&fsspsets_lock);

	/*
	 * A thread could be exiting in between clock ticks, so we need to
	 * calculate how much CPU time it used since it was charged last time.
	 *
	 * CPU caps are not enforced on exiting processes - it is usually
	 * desirable to exit as soon as possible to free resources.
	 */
	if (CPUCAPS_ON()) {
		thread_lock(t);
		fssproc = FSSPROC(t);
		(void) cpucaps_charge(t, &fssproc->fss_caps,
		    CPUCAPS_CHARGE_ONLY);
		thread_unlock(t);
	}
}
   1821 
   1822 static void
   1823 fss_nullsys()
   1824 {
   1825 }
   1826 
   1827 /*
   1828  * fss_swapin() returns -1 if the thread is loaded or is not eligible to be
   1829  * swapped in. Otherwise, it returns the thread's effective priority based
   1830  * on swapout time and size of process (0 <= epri <= 0 SHRT_MAX).
   1831  */
   1832 /*ARGSUSED*/
   1833 static pri_t
   1834 fss_swapin(kthread_t *t, int flags)
   1835 {
   1836 	fssproc_t *fssproc = FSSPROC(t);
   1837 	long epri = -1;
   1838 	proc_t *pp = ttoproc(t);
   1839 
   1840 	ASSERT(THREAD_LOCK_HELD(t));
   1841 
   1842 	if (t->t_state == TS_RUN && (t->t_schedflag & TS_LOAD) == 0) {
   1843 		time_t swapout_time;
   1844 
   1845 		swapout_time = (ddi_get_lbolt() - t->t_stime) / hz;
   1846 		if (INHERITED(t) || (fssproc->fss_flags & FSSKPRI)) {
   1847 			epri = (long)DISP_PRIO(t) + swapout_time;
   1848 		} else {
   1849 			/*
   1850 			 * Threads which have been out for a long time,
   1851 			 * have high user mode priority and are associated
   1852 			 * with a small address space are more deserving.
   1853 			 */
   1854 			epri = fssproc->fss_umdpri;
   1855 			ASSERT(epri >= 0 && epri <= fss_maxumdpri);
   1856 			epri += swapout_time - pp->p_swrss / nz(maxpgio)/2;
   1857 		}
   1858 		/*
   1859 		 * Scale epri so that SHRT_MAX / 2 represents zero priority.
   1860 		 */
   1861 		epri += SHRT_MAX / 2;
   1862 		if (epri < 0)
   1863 			epri = 0;
   1864 		else if (epri > SHRT_MAX)
   1865 			epri = SHRT_MAX;
   1866 	}
   1867 	return ((pri_t)epri);
   1868 }
   1869 
   1870 /*
   1871  * fss_swapout() returns -1 if the thread isn't loaded or is not eligible to
   1872  * be swapped out. Otherwise, it returns the thread's effective priority
   1873  * based on if the swapper is in softswap or hardswap mode.
   1874  */
   1875 static pri_t
   1876 fss_swapout(kthread_t *t, int flags)
   1877 {
   1878 	fssproc_t *fssproc = FSSPROC(t);
   1879 	long epri = -1;
   1880 	proc_t *pp = ttoproc(t);
   1881 	time_t swapin_time;
   1882 
   1883 	ASSERT(THREAD_LOCK_HELD(t));
   1884 
   1885 	if (INHERITED(t) ||
   1886 	    (fssproc->fss_flags & FSSKPRI) ||
   1887 	    (t->t_proc_flag & TP_LWPEXIT) ||
   1888 	    (t->t_state & (TS_ZOMB|TS_FREE|TS_STOPPED|TS_ONPROC|TS_WAIT)) ||
   1889 	    !(t->t_schedflag & TS_LOAD) ||
   1890 	    !(SWAP_OK(t)))
   1891 		return (-1);
   1892 
   1893 	ASSERT(t->t_state & (TS_SLEEP | TS_RUN));
   1894 
   1895 	swapin_time = (ddi_get_lbolt() - t->t_stime) / hz;
   1896 
   1897 	if (flags == SOFTSWAP) {
   1898 		if (t->t_state == TS_SLEEP && swapin_time > maxslp) {
   1899 			epri = 0;
   1900 		} else {
   1901 			return ((pri_t)epri);
   1902 		}
   1903 	} else {
   1904 		pri_t pri;
   1905 
   1906 		if ((t->t_state == TS_SLEEP && swapin_time > fss_minslp) ||
   1907 		    (t->t_state == TS_RUN && swapin_time > fss_minrun)) {
   1908 			pri = fss_maxumdpri;
   1909 			epri = swapin_time -
   1910 			    (rm_asrss(pp->p_as) / nz(maxpgio)/2) - (long)pri;
   1911 		} else {
   1912 			return ((pri_t)epri);
   1913 		}
   1914 	}
   1915 
   1916 	/*
   1917 	 * Scale epri so that SHRT_MAX / 2 represents zero priority.
   1918 	 */
   1919 	epri += SHRT_MAX / 2;
   1920 	if (epri < 0)
   1921 		epri = 0;
   1922 	else if (epri > SHRT_MAX)
   1923 		epri = SHRT_MAX;
   1924 
   1925 	return ((pri_t)epri);
   1926 }
   1927 
   1928 /*
   1929  * If thread is currently at a kernel mode priority (has slept) and is
   1930  * returning to the userland we assign it the appropriate user mode priority
   1931  * and time quantum here.  If we're lowering the thread's priority below that
   1932  * of other runnable threads then we will set runrun via cpu_surrender() to
   1933  * cause preemption.
   1934  */
   1935 static void
   1936 fss_trapret(kthread_t *t)
   1937 {
   1938 	fssproc_t *fssproc = FSSPROC(t);
   1939 	cpu_t *cp = CPU;
   1940 
   1941 	ASSERT(THREAD_LOCK_HELD(t));
   1942 	ASSERT(t == curthread);
   1943 	ASSERT(cp->cpu_dispthread == t);
   1944 	ASSERT(t->t_state == TS_ONPROC);
   1945 
   1946 	t->t_kpri_req = 0;
   1947 	if (fssproc->fss_flags & FSSKPRI) {
   1948 		/*
   1949 		 * If thread has blocked in the kernel
   1950 		 */
   1951 		THREAD_CHANGE_PRI(t, fssproc->fss_umdpri);
   1952 		cp->cpu_dispatch_pri = DISP_PRIO(t);
   1953 		ASSERT(t->t_pri >= 0 && t->t_pri <= fss_maxglobpri);
   1954 		fssproc->fss_flags &= ~FSSKPRI;
   1955 
   1956 		if (DISP_MUST_SURRENDER(t))
   1957 			cpu_surrender(t);
   1958 	}
   1959 
   1960 	/*
   1961 	 * Swapout lwp if the swapper is waiting for this thread to reach
   1962 	 * a safe point.
   1963 	 */
   1964 	if (t->t_schedflag & TS_SWAPENQ) {
   1965 		thread_unlock(t);
   1966 		swapout_lwp(ttolwp(t));
   1967 		thread_lock(t);
   1968 	}
   1969 }
   1970 
   1971 /*
   1972  * Arrange for thread to be placed in appropriate location on dispatcher queue.
   1973  * This is called with the current thread in TS_ONPROC and locked.
   1974  */
   1975 static void
   1976 fss_preempt(kthread_t *t)
   1977 {
   1978 	fssproc_t *fssproc = FSSPROC(t);
   1979 	klwp_t *lwp;
   1980 	uint_t flags;
   1981 
   1982 	ASSERT(t == curthread);
   1983 	ASSERT(THREAD_LOCK_HELD(curthread));
   1984 	ASSERT(t->t_state == TS_ONPROC);
   1985 
   1986 	/*
   1987 	 * If preempted in the kernel, make sure the thread has a kernel
   1988 	 * priority if needed.
   1989 	 */
   1990 	lwp = curthread->t_lwp;
   1991 	if (!(fssproc->fss_flags & FSSKPRI) && lwp != NULL && t->t_kpri_req) {
   1992 		fssproc->fss_flags |= FSSKPRI;
   1993 		THREAD_CHANGE_PRI(t, minclsyspri);
   1994 		ASSERT(t->t_pri >= 0 && t->t_pri <= fss_maxglobpri);
   1995 		t->t_trapret = 1;	/* so that fss_trapret will run */
   1996 		aston(t);
   1997 	}
   1998 
   1999 	/*
   2000 	 * This thread may be placed on wait queue by CPU Caps. In this case we
   2001 	 * do not need to do anything until it is removed from the wait queue.
   2002 	 * Do not enforce CPU caps on threads running at a kernel priority
   2003 	 */
   2004 	if (CPUCAPS_ON()) {
   2005 		(void) cpucaps_charge(t, &fssproc->fss_caps,
   2006 		    CPUCAPS_CHARGE_ENFORCE);
   2007 
   2008 		if (!(fssproc->fss_flags & FSSKPRI) && CPUCAPS_ENFORCE(t))
   2009 			return;
   2010 	}
   2011 
   2012 	/*
   2013 	 * If preempted in user-land mark the thread as swappable because it
   2014 	 * cannot be holding any kernel locks.
   2015 	 */
   2016 	ASSERT(t->t_schedflag & TS_DONT_SWAP);
   2017 	if (lwp != NULL && lwp->lwp_state == LWP_USER)
   2018 		t->t_schedflag &= ~TS_DONT_SWAP;
   2019 
   2020 	/*
   2021 	 * Check to see if we're doing "preemption control" here.  If
   2022 	 * we are, and if the user has requested that this thread not
   2023 	 * be preempted, and if preemptions haven't been put off for
   2024 	 * too long, let the preemption happen here but try to make
   2025 	 * sure the thread is rescheduled as soon as possible.  We do
   2026 	 * this by putting it on the front of the highest priority run
   2027 	 * queue in the FSS class.  If the preemption has been put off
   2028 	 * for too long, clear the "nopreempt" bit and let the thread
   2029 	 * be preempted.
   2030 	 */
   2031 	if (t->t_schedctl && schedctl_get_nopreempt(t)) {
   2032 		if (fssproc->fss_timeleft > -SC_MAX_TICKS) {
   2033 			DTRACE_SCHED1(schedctl__nopreempt, kthread_t *, t);
   2034 			if (!(fssproc->fss_flags & FSSKPRI)) {
   2035 				/*
   2036 				 * If not already remembered, remember current
   2037 				 * priority for restoration in fss_yield().
   2038 				 */
   2039 				if (!(fssproc->fss_flags & FSSRESTORE)) {
   2040 					fssproc->fss_scpri = t->t_pri;
   2041 					fssproc->fss_flags |= FSSRESTORE;
   2042 				}
   2043 				THREAD_CHANGE_PRI(t, fss_maxumdpri);
   2044 				t->t_schedflag |= TS_DONT_SWAP;
   2045 			}
   2046 			schedctl_set_yield(t, 1);
   2047 			setfrontdq(t);
   2048 			return;
   2049 		} else {
   2050 			if (fssproc->fss_flags & FSSRESTORE) {
   2051 				THREAD_CHANGE_PRI(t, fssproc->fss_scpri);
   2052 				fssproc->fss_flags &= ~FSSRESTORE;
   2053 			}
   2054 			schedctl_set_nopreempt(t, 0);
   2055 			DTRACE_SCHED1(schedctl__preempt, kthread_t *, t);
   2056 			/*
   2057 			 * Fall through and be preempted below.
   2058 			 */
   2059 		}
   2060 	}
   2061 
   2062 	flags = fssproc->fss_flags & (FSSBACKQ | FSSKPRI);
   2063 
   2064 	if (flags == FSSBACKQ) {
   2065 		fssproc->fss_timeleft = fss_quantum;
   2066 		fssproc->fss_flags &= ~FSSBACKQ;
   2067 		setbackdq(t);
   2068 	} else if (flags == (FSSBACKQ | FSSKPRI)) {
   2069 		fssproc->fss_flags &= ~FSSBACKQ;
   2070 		setbackdq(t);
   2071 	} else {
   2072 		setfrontdq(t);
   2073 	}
   2074 }
   2075 
   2076 /*
   2077  * Called when a thread is waking up and is to be placed on the run queue.
   2078  */
   2079 static void
   2080 fss_setrun(kthread_t *t)
   2081 {
   2082 	fssproc_t *fssproc = FSSPROC(t);
   2083 
   2084 	ASSERT(THREAD_LOCK_HELD(t));	/* t should be in transition */
   2085 
   2086 	if (t->t_state == TS_SLEEP || t->t_state == TS_STOPPED)
   2087 		fss_active(t);
   2088 
   2089 	fssproc->fss_timeleft = fss_quantum;
   2090 
   2091 	fssproc->fss_flags &= ~FSSBACKQ;
   2092 	/*
   2093 	 * If previously were running at the kernel priority then keep that
   2094 	 * priority and the fss_timeleft doesn't matter.
   2095 	 */
   2096 	if ((fssproc->fss_flags & FSSKPRI) == 0)
   2097 		THREAD_CHANGE_PRI(t, fssproc->fss_umdpri);
   2098 
   2099 	if (t->t_disp_time != ddi_get_lbolt())
   2100 		setbackdq(t);
   2101 	else
   2102 		setfrontdq(t);
   2103 }
   2104 
   2105 /*
   2106  * Prepare thread for sleep. We reset the thread priority so it will run at the
   2107  * kernel priority level when it wakes up.
   2108  */
   2109 static void
   2110 fss_sleep(kthread_t *t)
   2111 {
   2112 	fssproc_t *fssproc = FSSPROC(t);
   2113 
   2114 	ASSERT(t == curthread);
   2115 	ASSERT(THREAD_LOCK_HELD(t));
   2116 
   2117 	ASSERT(t->t_state == TS_ONPROC);
   2118 
   2119 	/*
   2120 	 * Account for time spent on CPU before going to sleep.
   2121 	 */
   2122 	(void) CPUCAPS_CHARGE(t, &fssproc->fss_caps, CPUCAPS_CHARGE_ENFORCE);
   2123 
   2124 	fss_inactive(t);
   2125 
   2126 	/*
   2127 	 * Assign a system priority to the thread and arrange for it to be
   2128 	 * retained when the thread is next placed on the run queue (i.e.,
   2129 	 * when it wakes up) instead of being given a new pri.  Also arrange
   2130 	 * for trapret processing as the thread leaves the system call so it
   2131 	 * will drop back to normal priority range.
   2132 	 */
   2133 	if (t->t_kpri_req) {
   2134 		THREAD_CHANGE_PRI(t, minclsyspri);
   2135 		fssproc->fss_flags |= FSSKPRI;
   2136 		t->t_trapret = 1;	/* so that fss_trapret will run */
   2137 		aston(t);
   2138 	} else if (fssproc->fss_flags & FSSKPRI) {
   2139 		/*
   2140 		 * The thread has done a THREAD_KPRI_REQUEST(), slept, then
   2141 		 * done THREAD_KPRI_RELEASE() (so no t_kpri_req is 0 again),
   2142 		 * then slept again all without finishing the current system
   2143 		 * call so trapret won't have cleared FSSKPRI
   2144 		 */
   2145 		fssproc->fss_flags &= ~FSSKPRI;
   2146 		THREAD_CHANGE_PRI(t, fssproc->fss_umdpri);
   2147 		if (DISP_MUST_SURRENDER(curthread))
   2148 			cpu_surrender(t);
   2149 	}
   2150 	t->t_stime = ddi_get_lbolt();	/* time stamp for the swapper */
   2151 }
   2152 
   2153 /*
   2154  * A tick interrupt has ocurrend on a running thread. Check to see if our
   2155  * time slice has expired.  We must also clear the TS_DONT_SWAP flag in
   2156  * t_schedflag if the thread is eligible to be swapped out.
   2157  */
   2158 static void
   2159 fss_tick(kthread_t *t)
   2160 {
   2161 	fssproc_t *fssproc;
   2162 	fssproj_t *fssproj;
   2163 	klwp_t *lwp;
   2164 	boolean_t call_cpu_surrender = B_FALSE;
   2165 	boolean_t cpucaps_enforce = B_FALSE;
   2166 
   2167 	ASSERT(MUTEX_HELD(&(ttoproc(t))->p_lock));
   2168 
   2169 	/*
   2170 	 * It's safe to access fsspset and fssproj structures because we're
   2171 	 * holding our p_lock here.
   2172 	 */
   2173 	thread_lock(t);
   2174 	fssproc = FSSPROC(t);
   2175 	fssproj = FSSPROC2FSSPROJ(fssproc);
   2176 	if (fssproj != NULL) {
   2177 		fsspset_t *fsspset = FSSPROJ2FSSPSET(fssproj);
   2178 		disp_lock_enter_high(&fsspset->fssps_displock);
   2179 		fssproj->fssp_ticks += fss_nice_tick[fssproc->fss_nice];
   2180 		fssproc->fss_ticks++;
   2181 		disp_lock_exit_high(&fsspset->fssps_displock);
   2182 	}
   2183 
   2184 	/*
   2185 	 * Keep track of thread's project CPU usage.  Note that projects
   2186 	 * get charged even when threads are running in the kernel.
   2187 	 * Do not surrender CPU if running in the SYS class.
   2188 	 */
   2189 	if (CPUCAPS_ON()) {
   2190 		cpucaps_enforce = cpucaps_charge(t,
   2191 		    &fssproc->fss_caps, CPUCAPS_CHARGE_ENFORCE) &&
   2192 		    !(fssproc->fss_flags & FSSKPRI);
   2193 	}
   2194 
   2195 	/*
   2196 	 * A thread's execution time for threads running in the SYS class
   2197 	 * is not tracked.
   2198 	 */
   2199 	if ((fssproc->fss_flags & FSSKPRI) == 0) {
   2200 		/*
   2201 		 * If thread is not in kernel mode, decrement its fss_timeleft
   2202 		 */
   2203 		if (--fssproc->fss_timeleft <= 0) {
   2204 			pri_t new_pri;
   2205 
   2206 			/*
   2207 			 * If we're doing preemption control and trying to
   2208 			 * avoid preempting this thread, just note that the
   2209 			 * thread should yield soon and let it keep running
   2210 			 * (unless it's been a while).
   2211 			 */
   2212 			if (t->t_schedctl && schedctl_get_nopreempt(t)) {
   2213 				if (fssproc->fss_timeleft > -SC_MAX_TICKS) {
   2214 					DTRACE_SCHED1(schedctl__nopreempt,
   2215 					    kthread_t *, t);
   2216 					schedctl_set_yield(t, 1);
   2217 					thread_unlock_nopreempt(t);
   2218 					return;
   2219 				}
   2220 			}
   2221 			fssproc->fss_flags &= ~FSSRESTORE;
   2222 
   2223 			fss_newpri(fssproc);
   2224 			new_pri = fssproc->fss_umdpri;
   2225 			ASSERT(new_pri >= 0 && new_pri <= fss_maxglobpri);
   2226 
   2227 			/*
   2228 			 * When the priority of a thread is changed, it may
   2229 			 * be necessary to adjust its position on a sleep queue
   2230 			 * or dispatch queue. The function thread_change_pri
   2231 			 * accomplishes this.
   2232 			 */
   2233 			if (thread_change_pri(t, new_pri, 0)) {
   2234 				if ((t->t_schedflag & TS_LOAD) &&
   2235 				    (lwp = t->t_lwp) &&
   2236 				    lwp->lwp_state == LWP_USER)
   2237 					t->t_schedflag &= ~TS_DONT_SWAP;
   2238 				fssproc->fss_timeleft = fss_quantum;
   2239 			} else {
   2240 				call_cpu_surrender = B_TRUE;
   2241 			}
   2242 		} else if (t->t_state == TS_ONPROC &&
   2243 		    t->t_pri < t->t_disp_queue->disp_maxrunpri) {
   2244 			/*
   2245 			 * If there is a higher-priority thread which is
   2246 			 * waiting for a processor, then thread surrenders
   2247 			 * the processor.
   2248 			 */
   2249 			call_cpu_surrender = B_TRUE;
   2250 		}
   2251 	}
   2252 
   2253 	if (cpucaps_enforce && 2 * fssproc->fss_timeleft > fss_quantum) {
   2254 		/*
   2255 		 * The thread used more than half of its quantum, so assume that
   2256 		 * it used the whole quantum.
   2257 		 *
   2258 		 * Update thread's priority just before putting it on the wait
   2259 		 * queue so that it gets charged for the CPU time from its
   2260 		 * quantum even before that quantum expires.
   2261 		 */
   2262 		fss_newpri(fssproc);
   2263 		if (t->t_pri != fssproc->fss_umdpri)
   2264 			fss_change_priority(t, fssproc);
   2265 
   2266 		/*
   2267 		 * We need to call cpu_surrender for this thread due to cpucaps
   2268 		 * enforcement, but fss_change_priority may have already done
   2269 		 * so. In this case FSSBACKQ is set and there is no need to call
   2270 		 * cpu-surrender again.
   2271 		 */
   2272 		if (!(fssproc->fss_flags & FSSBACKQ))
   2273 			call_cpu_surrender = B_TRUE;
   2274 	}
   2275 
   2276 	if (call_cpu_surrender) {
   2277 		fssproc->fss_flags |= FSSBACKQ;
   2278 		cpu_surrender(t);
   2279 	}
   2280 
   2281 	thread_unlock_nopreempt(t);	/* clock thread can't be preempted */
   2282 }
   2283 
   2284 /*
   2285  * Processes waking up go to the back of their queue.  We don't need to assign
   2286  * a time quantum here because thread is still at a kernel mode priority and
   2287  * the time slicing is not done for threads running in the kernel after
   2288  * sleeping.  The proper time quantum will be assigned by fss_trapret before the
   2289  * thread returns to user mode.
   2290  */
   2291 static void
   2292 fss_wakeup(kthread_t *t)
   2293 {
   2294 	fssproc_t *fssproc;
   2295 
   2296 	ASSERT(THREAD_LOCK_HELD(t));
   2297 	ASSERT(t->t_state == TS_SLEEP);
   2298 
   2299 	fss_active(t);
   2300 
   2301 	t->t_stime = ddi_get_lbolt();		/* time stamp for the swapper */
   2302 	fssproc = FSSPROC(t);
   2303 	fssproc->fss_flags &= ~FSSBACKQ;
   2304 
   2305 	if (fssproc->fss_flags & FSSKPRI) {
   2306 		/*
   2307 		 * If we already have a kernel priority assigned, then we
   2308 		 * just use it.
   2309 		 */
   2310 		setbackdq(t);
   2311 	} else if (t->t_kpri_req) {
   2312 		/*
   2313 		 * Give thread a priority boost if we were asked.
   2314 		 */
   2315 		fssproc->fss_flags |= FSSKPRI;
   2316 		THREAD_CHANGE_PRI(t, minclsyspri);
   2317 		setbackdq(t);
   2318 		t->t_trapret = 1;	/* so that fss_trapret will run */
   2319 		aston(t);
   2320 	} else {
   2321 		/*
   2322 		 * Otherwise, we recalculate the priority.
   2323 		 */
   2324 		if (t->t_disp_time == ddi_get_lbolt()) {
   2325 			setfrontdq(t);
   2326 		} else {
   2327 			fssproc->fss_timeleft = fss_quantum;
   2328 			THREAD_CHANGE_PRI(t, fssproc->fss_umdpri);
   2329 			setbackdq(t);
   2330 		}
   2331 	}
   2332 }
   2333 
   2334 /*
   2335  * fss_donice() is called when a nice(1) command is issued on the thread to
   2336  * alter the priority. The nice(1) command exists in Solaris for compatibility.
   2337  * Thread priority adjustments should be done via priocntl(1).
   2338  */
   2339 static int
   2340 fss_donice(kthread_t *t, cred_t *cr, int incr, int *retvalp)
   2341 {
   2342 	int newnice;
   2343 	fssproc_t *fssproc = FSSPROC(t);
   2344 	fssparms_t fssparms;
   2345 
   2346 	/*
   2347 	 * If there is no change to priority, just return current setting.
   2348 	 */
   2349 	if (incr == 0) {
   2350 		if (retvalp)
   2351 			*retvalp = fssproc->fss_nice - NZERO;
   2352 		return (0);
   2353 	}
   2354 
   2355 	if ((incr < 0 || incr > 2 * NZERO) && secpolicy_setpriority(cr) != 0)
   2356 		return (EPERM);
   2357 
   2358 	/*
   2359 	 * Specifying a nice increment greater than the upper limit of
   2360 	 * FSS_NICE_MAX (== 2 * NZERO - 1) will result in the thread's nice
   2361 	 * value being set to the upper limit.  We check for this before
   2362 	 * computing the new value because otherwise we could get overflow
   2363 	 * if a privileged user specified some ridiculous increment.
   2364 	 */
   2365 	if (incr > FSS_NICE_MAX)
   2366 		incr = FSS_NICE_MAX;
   2367 
   2368 	newnice = fssproc->fss_nice + incr;
   2369 	if (newnice > FSS_NICE_MAX)
   2370 		newnice = FSS_NICE_MAX;
   2371 	else if (newnice < FSS_NICE_MIN)
   2372 		newnice = FSS_NICE_MIN;
   2373 
   2374 	fssparms.fss_uprilim = fssparms.fss_upri =
   2375 	    -((newnice - NZERO) * fss_maxupri) / NZERO;
   2376 
   2377 	/*
   2378 	 * Reset the uprilim and upri values of the thread.
   2379 	 */
   2380 	(void) fss_parmsset(t, (void *)&fssparms, (id_t)0, (cred_t *)NULL);
   2381 
   2382 	/*
   2383 	 * Although fss_parmsset already reset fss_nice it may not have been
   2384 	 * set to precisely the value calculated above because fss_parmsset
   2385 	 * determines the nice value from the user priority and we may have
   2386 	 * truncated during the integer conversion from nice value to user
   2387 	 * priority and back. We reset fss_nice to the value we calculated
   2388 	 * above.
   2389 	 */
   2390 	fssproc->fss_nice = (char)newnice;
   2391 
   2392 	if (retvalp)
   2393 		*retvalp = newnice - NZERO;
   2394 	return (0);
   2395 }
   2396 
   2397 /*
   2398  * Increment the priority of the specified thread by incr and
   2399  * return the new value in *retvalp.
   2400  */
   2401 static int
   2402 fss_doprio(kthread_t *t, cred_t *cr, int incr, int *retvalp)
   2403 {
   2404 	int newpri;
   2405 	fssproc_t *fssproc = FSSPROC(t);
   2406 	fssparms_t fssparms;
   2407 
   2408 	/*
   2409 	 * If there is no change to priority, just return current setting.
   2410 	 */
   2411 	if (incr == 0) {
   2412 		*retvalp = fssproc->fss_upri;
   2413 		return (0);
   2414 	}
   2415 
   2416 	newpri = fssproc->fss_upri + incr;
   2417 	if (newpri > fss_maxupri || newpri < -fss_maxupri)
   2418 		return (EINVAL);
   2419 
   2420 	*retvalp = newpri;
   2421 	fssparms.fss_uprilim = fssparms.fss_upri = newpri;
   2422 
   2423 	/*
   2424 	 * Reset the uprilim and upri values of the thread.
   2425 	 */
   2426 	return (fss_parmsset(t, &fssparms, (id_t)0, cr));
   2427 }
   2428 
   2429 /*
   2430  * Return the global scheduling priority that would be assigned to a thread
   2431  * entering the fair-sharing class with the fss_upri.
   2432  */
   2433 /*ARGSUSED*/
   2434 static pri_t
   2435 fss_globpri(kthread_t *t)
   2436 {
   2437 	ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock));
   2438 
   2439 	return (fss_maxumdpri / 2);
   2440 }
   2441 
   2442 /*
   2443  * Called from the yield(2) system call when a thread is yielding (surrendering)
   2444  * the processor. The kernel thread is placed at the back of a dispatch queue.
   2445  */
   2446 static void
   2447 fss_yield(kthread_t *t)
   2448 {
   2449 	fssproc_t *fssproc = FSSPROC(t);
   2450 
   2451 	ASSERT(t == curthread);
   2452 	ASSERT(THREAD_LOCK_HELD(t));
   2453 
   2454 	/*
   2455 	 * Collect CPU usage spent before yielding
   2456 	 */
   2457 	(void) CPUCAPS_CHARGE(t, &fssproc->fss_caps, CPUCAPS_CHARGE_ENFORCE);
   2458 
   2459 	/*
   2460 	 * Clear the preemption control "yield" bit since the user is
   2461 	 * doing a yield.
   2462 	 */
   2463 	if (t->t_schedctl)
   2464 		schedctl_set_yield(t, 0);
   2465 	/*
   2466 	 * If fss_preempt() artifically increased the thread's priority
   2467 	 * to avoid preemption, restore the original priority now.
   2468 	 */
   2469 	if (fssproc->fss_flags & FSSRESTORE) {
   2470 		THREAD_CHANGE_PRI(t, fssproc->fss_scpri);
   2471 		fssproc->fss_flags &= ~FSSRESTORE;
   2472 	}
   2473 	if (fssproc->fss_timeleft < 0) {
   2474 		/*
   2475 		 * Time slice was artificially extended to avoid preemption,
   2476 		 * so pretend we're preempting it now.
   2477 		 */
   2478 		DTRACE_SCHED1(schedctl__yield, int, -fssproc->fss_timeleft);
   2479 		fssproc->fss_timeleft = fss_quantum;
   2480 	}
   2481 	fssproc->fss_flags &= ~FSSBACKQ;
   2482 	setbackdq(t);
   2483 }
   2484 
   2485 void
   2486 fss_changeproj(kthread_t *t, void *kp, void *zp, fssbuf_t *projbuf,
   2487     fssbuf_t *zonebuf)
   2488 {
   2489 	kproject_t *kpj_new = kp;
   2490 	zone_t *zone = zp;
   2491 	fssproj_t *fssproj_old, *fssproj_new;
   2492 	fsspset_t *fsspset;
   2493 	kproject_t *kpj_old;
   2494 	fssproc_t *fssproc;
   2495 	fsszone_t *fsszone_old, *fsszone_new;
   2496 	int free = 0;
   2497 	int id;
   2498 
   2499 	ASSERT(MUTEX_HELD(&cpu_lock));
   2500 	ASSERT(MUTEX_HELD(&pidlock));
   2501 	ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock));
   2502 
   2503 	if (t->t_cid != fss_cid)
   2504 		return;
   2505 
   2506 	fssproc = FSSPROC(t);
   2507 	mutex_enter(&fsspsets_lock);
   2508 	fssproj_old = FSSPROC2FSSPROJ(fssproc);
   2509 	if (fssproj_old == NULL) {
   2510 		mutex_exit(&fsspsets_lock);
   2511 		return;
   2512 	}
   2513 
   2514 	fsspset = FSSPROJ2FSSPSET(fssproj_old);
   2515 	mutex_enter(&fsspset->fssps_lock);
   2516 	kpj_old = FSSPROJ2KPROJ(fssproj_old);
   2517 	fsszone_old = fssproj_old->fssp_fsszone;
   2518 
   2519 	ASSERT(t->t_cpupart == fsspset->fssps_cpupart);
   2520 
   2521 	if (kpj_old == kpj_new) {
   2522 		mutex_exit(&fsspset->fssps_lock);
   2523 		mutex_exit(&fsspsets_lock);
   2524 		return;
   2525 	}
   2526 
   2527 	if ((fsszone_new = fss_find_fsszone(fsspset, zone)) == NULL) {
   2528 		/*
   2529 		 * If the zone for the new project is not currently active on
   2530 		 * the cpu partition we're on, get one of the pre-allocated
   2531 		 * buffers and link it in our per-pset zone list.  Such buffers
   2532 		 * should already exist.
   2533 		 */
   2534 		for (id = 0; id < zonebuf->fssb_size; id++) {
   2535 			if ((fsszone_new = zonebuf->fssb_list[id]) != NULL) {
   2536 				fss_insert_fsszone(fsspset, zone, fsszone_new);
   2537 				zonebuf->fssb_list[id] = NULL;
   2538 				break;
   2539 			}
   2540 		}
   2541 	}
   2542 	ASSERT(fsszone_new != NULL);
   2543 	if ((fssproj_new = fss_find_fssproj(fsspset, kpj_new)) == NULL) {
   2544 		/*
   2545 		 * If our new project is not currently running
   2546 		 * on the cpu partition we're on, get one of the
   2547 		 * pre-allocated buffers and link it in our new cpu
   2548 		 * partition doubly linked list. Such buffers should already
   2549 		 * exist.
   2550 		 */
   2551 		for (id = 0; id < projbuf->fssb_size; id++) {
   2552 			if ((fssproj_new = projbuf->fssb_list[id]) != NULL) {
   2553 				fss_insert_fssproj(fsspset, kpj_new,
   2554 				    fsszone_new, fssproj_new);
   2555 				projbuf->fssb_list[id] = NULL;
   2556 				break;
   2557 			}
   2558 		}
   2559 	}
   2560 	ASSERT(fssproj_new != NULL);
   2561 
   2562 	thread_lock(t);
   2563 	if (t->t_state == TS_RUN || t->t_state == TS_ONPROC ||
   2564 	    t->t_state == TS_WAIT)
   2565 		fss_inactive(t);
   2566 	ASSERT(fssproj_old->fssp_threads > 0);
   2567 	if (--fssproj_old->fssp_threads == 0) {
   2568 		fss_remove_fssproj(fsspset, fssproj_old);
   2569 		free = 1;
   2570 	}
   2571 	fssproc->fss_proj = fssproj_new;
   2572 	fssproc->fss_fsspri = 0;
   2573 	fssproj_new->fssp_threads++;
   2574 	if (t->t_state == TS_RUN || t->t_state == TS_ONPROC ||
   2575 	    t->t_state == TS_WAIT)
   2576 		fss_active(t);
   2577 	thread_unlock(t);
   2578 	if (free) {
   2579 		if (fsszone_old->fssz_nproj == 0)
   2580 			kmem_free(fsszone_old, sizeof (fsszone_t));
   2581 		kmem_free(fssproj_old, sizeof (fssproj_t));
   2582 	}
   2583 
   2584 	mutex_exit(&fsspset->fssps_lock);
   2585 	mutex_exit(&fsspsets_lock);
   2586 }
   2587 
   2588 void
   2589 fss_changepset(kthread_t *t, void *newcp, fssbuf_t *projbuf,
   2590     fssbuf_t *zonebuf)
   2591 {
   2592 	fsspset_t *fsspset_old, *fsspset_new;
   2593 	fssproj_t *fssproj_old, *fssproj_new;
   2594 	fsszone_t *fsszone_old, *fsszone_new;
   2595 	fssproc_t *fssproc;
   2596 	kproject_t *kpj;
   2597 	zone_t *zone;
   2598 	int id;
   2599 
   2600 	ASSERT(MUTEX_HELD(&cpu_lock));
   2601 	ASSERT(MUTEX_HELD(&pidlock));
   2602 	ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock));
   2603 
   2604 	if (t->t_cid != fss_cid)
   2605 		return;
   2606 
   2607 	fssproc = FSSPROC(t);
   2608 	zone = ttoproc(t)->p_zone;
   2609 	mutex_enter(&fsspsets_lock);
   2610 	fssproj_old = FSSPROC2FSSPROJ(fssproc);
   2611 	if (fssproj_old == NULL) {
   2612 		mutex_exit(&fsspsets_lock);
   2613 		return;
   2614 	}
   2615 	fsszone_old = fssproj_old->fssp_fsszone;
   2616 	fsspset_old = FSSPROJ2FSSPSET(fssproj_old);
   2617 	kpj = FSSPROJ2KPROJ(fssproj_old);
   2618 
   2619 	if (fsspset_old->fssps_cpupart == newcp) {
   2620 		mutex_exit(&fsspsets_lock);
   2621 		return;
   2622 	}
   2623 
   2624 	ASSERT(ttoproj(t) == kpj);
   2625 
   2626 	fsspset_new = fss_find_fsspset(newcp);
   2627 
   2628 	mutex_enter(&fsspset_new->fssps_lock);
   2629 	if ((fsszone_new = fss_find_fsszone(fsspset_new, zone)) == NULL) {
   2630 		for (id = 0; id < zonebuf->fssb_size; id++) {
   2631 			if ((fsszone_new = zonebuf->fssb_list[id]) != NULL) {
   2632 				fss_insert_fsszone(fsspset_new, zone,
   2633 				    fsszone_new);
   2634 				zonebuf->fssb_list[id] = NULL;
   2635 				break;
   2636 			}
   2637 		}
   2638 	}
   2639 	ASSERT(fsszone_new != NULL);
   2640 	if ((fssproj_new = fss_find_fssproj(fsspset_new, kpj)) == NULL) {
   2641 		for (id = 0; id < projbuf->fssb_size; id++) {
   2642 			if ((fssproj_new = projbuf->fssb_list[id]) != NULL) {
   2643 				fss_insert_fssproj(fsspset_new, kpj,
   2644 				    fsszone_new, fssproj_new);
   2645 				projbuf->fssb_list[id] = NULL;
   2646 				break;
   2647 			}
   2648 		}
   2649 	}
   2650 	ASSERT(fssproj_new != NULL);
   2651 
   2652 	fssproj_new->fssp_threads++;
   2653 	thread_lock(t);
   2654 	if (t->t_state == TS_RUN || t->t_state == TS_ONPROC ||
   2655 	    t->t_state == TS_WAIT)
   2656 		fss_inactive(t);
   2657 	fssproc->fss_proj = fssproj_new;
   2658 	fssproc->fss_fsspri = 0;
   2659 	if (t->t_state == TS_RUN || t->t_state == TS_ONPROC ||
   2660 	    t->t_state == TS_WAIT)
   2661 		fss_active(t);
   2662 	thread_unlock(t);
   2663 	mutex_exit(&fsspset_new->fssps_lock);
   2664 
   2665 	mutex_enter(&fsspset_old->fssps_lock);
   2666 	if (--fssproj_old->fssp_threads == 0) {
   2667 		fss_remove_fssproj(fsspset_old, fssproj_old);
   2668 		if (fsszone_old->fssz_nproj == 0)
   2669 			kmem_free(fsszone_old, sizeof (fsszone_t));
   2670 		kmem_free(fssproj_old, sizeof (fssproj_t));
   2671 	}
   2672 	mutex_exit(&fsspset_old->fssps_lock);
   2673 
   2674 	mutex_exit(&fsspsets_lock);
   2675 }
   2676