Home | History | Annotate | Download | only in syscall
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
     27 /*	  All Rights Reserved  	*/
     28 
     29 
     30 #pragma ident	"@(#)sem.c	1.86	07/12/26 SMI"
     31 
     32 /*
     33  * Inter-Process Communication Semaphore Facility.
     34  *
     35  * See os/ipc.c for a description of common IPC functionality.
     36  *
     37  * Resource controls
     38  * -----------------
     39  *
     40  * Control:      zone.max-sem-ids (rc_zone_semmni)
     41  * Description:  Maximum number of semaphore ids allowed a zone.
     42  *
     43  *   When semget() is used to allocate a semaphore set, one id is
     44  *   allocated.  If the id allocation doesn't succeed, semget() fails
     45  *   and errno is set to ENOSPC.  Upon successful semctl(, IPC_RMID)
     46  *   the id is deallocated.
     47  *
     48  * Control:      project.max-sem-ids (rc_project_semmni)
     49  * Description:  Maximum number of semaphore ids allowed a project.
     50  *
     51  *   When semget() is used to allocate a semaphore set, one id is
     52  *   allocated.  If the id allocation doesn't succeed, semget() fails
     53  *   and errno is set to ENOSPC.  Upon successful semctl(, IPC_RMID)
     54  *   the id is deallocated.
     55  *
     56  * Control:      process.max-sem-nsems (rc_process_semmsl)
     57  * Description:  Maximum number of semaphores allowed per semaphore set.
     58  *
     59  *   When semget() is used to allocate a semaphore set, the size of the
     60  *   set is compared with this limit.  If the number of semaphores
     61  *   exceeds the limit, semget() fails and errno is set to EINVAL.
     62  *
     63  * Control:      process.max-sem-ops (rc_process_semopm)
     64  * Description:  Maximum number of semaphore operations allowed per
     65  *               semop call.
     66  *
     67  *   When semget() successfully allocates a semaphore set, the minimum
     68  *   enforced value of this limit is used to initialize the
     69  *   "system-imposed maximum" number of operations a semop() call for
     70  *   this set can perform.
     71  *
     72  * Undo structures
     73  * ---------------
     74  *
     75  * Removing the undo structure tunables involved a serious redesign of
     76  * how they were implemented.  There is now one undo structure for
     77  * every process/semaphore array combination (lazily allocated, of
     78  * course), and each is equal in size to the semaphore it corresponds
     79  * to.  To avoid scalability and performance problems, the undo
     80  * structures are stored in two places: a per-process AVL tree sorted
     81  * by ksemid pointer (p_semacct, protected by p_lock) and an unsorted
     82  * per-semaphore linked list (sem_undos, protected by the semaphore's
     83  * ID lock).  The former is used by semop, where a lookup is performed
     84  * once and cached if SEM_UNDO is specified for any of the operations,
     85  * and at process exit where the undoable operations are rolled back.
     86  * The latter is used when removing the semaphore, so the undo
     87  * structures can be removed from the appropriate processes' trees.
     88  *
     89  * The undo structure itself contains pointers to the ksemid and proc
     90  * to which it corresponds, a list node, an AVL node, and an array of
     91  * adjust-on-exit (AOE) values.  When an undo structure is allocated it
     92  * is immediately added to both the process's tree and the semaphore's
     93  * list.  Lastly, the reference count on the semaphore is increased.
     94  *
     95  * Avoiding a lock ordering violation between p_lock and the ID lock,
     96  * wont to occur when there is a race between a process exiting and the
     97  * removal of a semaphore, mandates the delicate dance that exists
     98  * between semexit and sem_rmid.
     99  *
    100  * sem_rmid, holding the ID lock, iterates through all undo structures
    101  * and for each takes the appropriate process's p_lock and checks to
    102  * see if p_semacct is NULL.  If it is, it skips that undo structure
    103  * and continues to the next.  Otherwise, it removes the undo structure
    104  * from both the AVL tree and the semaphore's list, and releases the
    105  * hold that the undo structure had on the semaphore.
    106  *
    107  * The important other half of this is semexit, which will immediately
    108  * take p_lock, obtain the AVL pointer, clear p_semacct, and drop
    109  * p_lock.  From this point on it is semexit's responsibility to clean
    110  * up all undo structures found in the tree -- a coexecuting sem_rmid
    111  * will see the NULL p_semacct and skip that undo structure.  It walks
    112  * the AVL tree (using avl_destroy_nodes) and for each undo structure
    113  * takes the appropriate semaphore's ID lock (always legal since the
    114  * undo structure has a hold on the semaphore), updates all semaphores
    115  * with non-zero AOE values, and removes the structure from the
    116  * semaphore's list.  It then drops the structure's reference on the
    117  * semaphore, drops the ID lock, and frees the undo structure.
    118  */
    119 
    120 #include <sys/types.h>
    121 #include <sys/t_lock.h>
    122 #include <sys/param.h>
    123 #include <sys/systm.h>
    124 #include <sys/sysmacros.h>
    125 #include <sys/cred.h>
    126 #include <sys/vmem.h>
    127 #include <sys/kmem.h>
    128 #include <sys/errno.h>
    129 #include <sys/time.h>
    130 #include <sys/ipc.h>
    131 #include <sys/ipc_impl.h>
    132 #include <sys/sem.h>
    133 #include <sys/sem_impl.h>
    134 #include <sys/user.h>
    135 #include <sys/proc.h>
    136 #include <sys/cpuvar.h>
    137 #include <sys/debug.h>
    138 #include <sys/var.h>
    139 #include <sys/cmn_err.h>
    140 #include <sys/modctl.h>
    141 #include <sys/syscall.h>
    142 #include <sys/avl.h>
    143 #include <sys/list.h>
    144 #include <sys/zone.h>
    145 
    146 #include <c2/audit.h>
    147 
    148 extern rctl_hndl_t rc_zone_semmni;
    149 extern rctl_hndl_t rc_project_semmni;
    150 extern rctl_hndl_t rc_process_semmsl;
    151 extern rctl_hndl_t rc_process_semopm;
    152 static ipc_service_t *sem_svc;
    153 static zone_key_t sem_zone_key;
    154 
    155 /*
    156  * The following tunables are obsolete.  Though for compatibility we
    157  * still read and interpret seminfo_semmsl, seminfo_semopm and
    158  * seminfo_semmni (see os/project.c and os/rctl_proc.c), the preferred
    159  * mechanism for administrating the IPC Semaphore facility is through
    160  * the resource controls described at the top of this file.
    161  */
    162 int seminfo_semaem = 16384;	/* (obsolete) */
    163 int seminfo_semmap = 10;	/* (obsolete) */
    164 int seminfo_semmni = 10;	/* (obsolete) */
    165 int seminfo_semmns = 60;	/* (obsolete) */
    166 int seminfo_semmnu = 30;	/* (obsolete) */
    167 int seminfo_semmsl = 25;	/* (obsolete) */
    168 int seminfo_semopm = 10;	/* (obsolete) */
    169 int seminfo_semume = 10;	/* (obsolete) */
    170 int seminfo_semusz = 96;	/* (obsolete) */
    171 int seminfo_semvmx = 32767;	/* (obsolete) */
    172 
    173 #define	SEM_MAXUCOPS	4096	/* max # of unchecked ops per semop call */
    174 #define	SEM_UNDOSZ(n)	(sizeof (struct sem_undo) + (n - 1) * sizeof (int))
    175 
    176 static int semsys(int opcode, uintptr_t a0, uintptr_t a1,
    177     uintptr_t a2, uintptr_t a3);
    178 static void sem_dtor(kipc_perm_t *);
    179 static void sem_rmid(kipc_perm_t *);
    180 static void sem_remove_zone(zoneid_t, void *);
    181 
    182 static struct sysent ipcsem_sysent = {
    183 	5,
    184 	SE_NOUNLOAD | SE_ARGC | SE_32RVAL1,
    185 	semsys
    186 };
    187 
    188 /*
    189  * Module linkage information for the kernel.
    190  */
    191 static struct modlsys modlsys = {
    192 	&mod_syscallops, "System V semaphore facility", &ipcsem_sysent
    193 };
    194 
    195 #ifdef _SYSCALL32_IMPL
    196 static struct modlsys modlsys32 = {
    197 	&mod_syscallops32, "32-bit System V semaphore facility", &ipcsem_sysent
    198 };
    199 #endif
    200 
    201 static struct modlinkage modlinkage = {
    202 	MODREV_1,
    203 	&modlsys,
    204 #ifdef _SYSCALL32_IMPL
    205 	&modlsys32,
    206 #endif
    207 	NULL
    208 };
    209 
    210 
    211 int
    212 _init(void)
    213 {
    214 	int result;
    215 
    216 	sem_svc = ipcs_create("semids", rc_project_semmni, rc_zone_semmni,
    217 	    sizeof (ksemid_t), sem_dtor, sem_rmid, AT_IPC_SEM,
    218 	    offsetof(ipc_rqty_t, ipcq_semmni));
    219 	zone_key_create(&sem_zone_key, NULL, sem_remove_zone, NULL);
    220 
    221 	if ((result = mod_install(&modlinkage)) == 0)
    222 		return (0);
    223 
    224 	(void) zone_key_delete(sem_zone_key);
    225 	ipcs_destroy(sem_svc);
    226 
    227 	return (result);
    228 }
    229 
    230 int
    231 _fini(void)
    232 {
    233 	return (EBUSY);
    234 }
    235 
    236 int
    237 _info(struct modinfo *modinfop)
    238 {
    239 	return (mod_info(&modlinkage, modinfop));
    240 }
    241 
    242 static void
    243 sem_dtor(kipc_perm_t *perm)
    244 {
    245 	ksemid_t *sp = (ksemid_t *)perm;
    246 
    247 	kmem_free(sp->sem_base,
    248 	    P2ROUNDUP(sp->sem_nsems * sizeof (struct sem), 64));
    249 	list_destroy(&sp->sem_undos);
    250 }
    251 
    252 /*
    253  * sem_undo_add - Create or update adjust on exit entry.
    254  */
    255 static int
    256 sem_undo_add(short val, ushort_t num, struct sem_undo *undo)
    257 {
    258 	int newval = undo->un_aoe[num] - val;
    259 
    260 	if (newval > USHRT_MAX || newval < -USHRT_MAX)
    261 		return (ERANGE);
    262 	undo->un_aoe[num] = newval;
    263 
    264 	return (0);
    265 }
    266 
    267 /*
    268  * sem_undo_clear - clears all undo entries for specified semaphores
    269  *
    270  * Used when semaphores are reset by SETVAL or SETALL.
    271  */
    272 static void
    273 sem_undo_clear(ksemid_t *sp, ushort_t low, ushort_t high)
    274 {
    275 	struct sem_undo *undo;
    276 	int i;
    277 
    278 	ASSERT(low <= high);
    279 	ASSERT(high < sp->sem_nsems);
    280 
    281 	for (undo = list_head(&sp->sem_undos); undo;
    282 	    undo = list_next(&sp->sem_undos, undo))
    283 		for (i = low; i <= high; i++)
    284 			undo->un_aoe[i] = 0;
    285 }
    286 
    287 /*
    288  * sem_rollback - roll back work done so far if unable to complete operation
    289  */
    290 static void
    291 sem_rollback(ksemid_t *sp, struct sembuf *op, int n, struct sem_undo *undo)
    292 {
    293 	struct sem *semp;	/* semaphore ptr */
    294 
    295 	for (op += n - 1; n--; op--) {
    296 		if (op->sem_op == 0)
    297 			continue;
    298 		semp = &sp->sem_base[op->sem_num];
    299 		semp->semval -= op->sem_op;
    300 		if (op->sem_flg & SEM_UNDO) {
    301 			ASSERT(undo != NULL);
    302 			(void) sem_undo_add(-op->sem_op, op->sem_num, undo);
    303 		}
    304 	}
    305 }
    306 
    307 static void
    308 sem_rmid(kipc_perm_t *perm)
    309 {
    310 	ksemid_t *sp = (ksemid_t *)perm;
    311 	struct sem *semp;
    312 	struct sem_undo *undo;
    313 	size_t size = SEM_UNDOSZ(sp->sem_nsems);
    314 	int i;
    315 
    316 	/*LINTED*/
    317 	while (undo = list_head(&sp->sem_undos)) {
    318 		list_remove(&sp->sem_undos, undo);
    319 		mutex_enter(&undo->un_proc->p_lock);
    320 		if (undo->un_proc->p_semacct == NULL) {
    321 			mutex_exit(&undo->un_proc->p_lock);
    322 			continue;
    323 		}
    324 		avl_remove(undo->un_proc->p_semacct, undo);
    325 		mutex_exit(&undo->un_proc->p_lock);
    326 		kmem_free(undo, size);
    327 		ipc_rele_locked(sem_svc, (kipc_perm_t *)sp);
    328 	}
    329 
    330 	for (i = 0; i < sp->sem_nsems; i++) {
    331 		semp = &sp->sem_base[i];
    332 		semp->semval = semp->sempid = 0;
    333 		if (semp->semncnt) {
    334 			cv_broadcast(&semp->semncnt_cv);
    335 			semp->semncnt = 0;
    336 		}
    337 		if (semp->semzcnt) {
    338 			cv_broadcast(&semp->semzcnt_cv);
    339 			semp->semzcnt = 0;
    340 		}
    341 	}
    342 }
    343 
    344 /*
    345  * semctl - Semctl system call.
    346  */
    347 static int
    348 semctl(int semid, uint_t semnum, int cmd, uintptr_t arg)
    349 {
    350 	ksemid_t		*sp;	/* ptr to semaphore header */
    351 	struct sem		*p;	/* ptr to semaphore */
    352 	unsigned int		i;	/* loop control */
    353 	ushort_t		*vals, *vp;
    354 	size_t			vsize = 0;
    355 	int			error = 0;
    356 	int			retval = 0;
    357 	struct cred		*cr;
    358 	kmutex_t		*lock;
    359 	model_t			mdl = get_udatamodel();
    360 	STRUCT_DECL(semid_ds, sid);
    361 	struct semid_ds64	ds64;
    362 
    363 	STRUCT_INIT(sid, mdl);
    364 	cr = CRED();
    365 
    366 	/*
    367 	 * Perform pre- or non-lookup actions (e.g. copyins, RMID).
    368 	 */
    369 	switch (cmd) {
    370 	case IPC_SET:
    371 		if (copyin((void *)arg, STRUCT_BUF(sid), STRUCT_SIZE(sid)))
    372 			return (set_errno(EFAULT));
    373 		break;
    374 
    375 	case IPC_SET64:
    376 		if (copyin((void *)arg, &ds64, sizeof (struct semid_ds64)))
    377 			return (set_errno(EFAULT));
    378 		break;
    379 
    380 	case SETALL:
    381 		if ((lock = ipc_lookup(sem_svc, semid,
    382 		    (kipc_perm_t **)&sp)) == NULL)
    383 			return (set_errno(EINVAL));
    384 		vsize = sp->sem_nsems * sizeof (*vals);
    385 		mutex_exit(lock);
    386 
    387 		/* allocate space to hold all semaphore values */
    388 		vals = kmem_alloc(vsize, KM_SLEEP);
    389 
    390 		if (copyin((void *)arg, vals, vsize)) {
    391 			kmem_free(vals, vsize);
    392 			return (set_errno(EFAULT));
    393 		}
    394 		break;
    395 
    396 	case IPC_RMID:
    397 		if (error = ipc_rmid(sem_svc, semid, cr))
    398 			return (set_errno(error));
    399 		return (0);
    400 	}
    401 
    402 	if ((lock = ipc_lookup(sem_svc, semid, (kipc_perm_t **)&sp)) == NULL) {
    403 		if (vsize != 0)
    404 			kmem_free(vals, vsize);
    405 		return (set_errno(EINVAL));
    406 	}
    407 	switch (cmd) {
    408 	/* Set ownership and permissions. */
    409 	case IPC_SET:
    410 
    411 		if (error = ipcperm_set(sem_svc, cr, &sp->sem_perm,
    412 		    &STRUCT_BUF(sid)->sem_perm, mdl)) {
    413 			mutex_exit(lock);
    414 			return (set_errno(error));
    415 		}
    416 		sp->sem_ctime = gethrestime_sec();
    417 		mutex_exit(lock);
    418 		return (0);
    419 
    420 	/* Get semaphore data structure. */
    421 	case IPC_STAT:
    422 
    423 		if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
    424 			mutex_exit(lock);
    425 			return (set_errno(error));
    426 		}
    427 
    428 		ipcperm_stat(&STRUCT_BUF(sid)->sem_perm, &sp->sem_perm, mdl);
    429 		STRUCT_FSETP(sid, sem_base, NULL);	/* kernel addr */
    430 		STRUCT_FSET(sid, sem_nsems, sp->sem_nsems);
    431 		STRUCT_FSET(sid, sem_otime, sp->sem_otime);
    432 		STRUCT_FSET(sid, sem_ctime, sp->sem_ctime);
    433 		STRUCT_FSET(sid, sem_binary, sp->sem_binary);
    434 		mutex_exit(lock);
    435 
    436 		if (copyout(STRUCT_BUF(sid), (void *)arg, STRUCT_SIZE(sid)))
    437 			return (set_errno(EFAULT));
    438 		return (0);
    439 
    440 	case IPC_SET64:
    441 
    442 		if (error = ipcperm_set64(sem_svc, cr, &sp->sem_perm,
    443 		    &ds64.semx_perm)) {
    444 			mutex_exit(lock);
    445 			return (set_errno(error));
    446 		}
    447 		sp->sem_ctime = gethrestime_sec();
    448 		mutex_exit(lock);
    449 		return (0);
    450 
    451 	case IPC_STAT64:
    452 
    453 		ipcperm_stat64(&ds64.semx_perm, &sp->sem_perm);
    454 		ds64.semx_nsems = sp->sem_nsems;
    455 		ds64.semx_otime = sp->sem_otime;
    456 		ds64.semx_ctime = sp->sem_ctime;
    457 
    458 		mutex_exit(lock);
    459 		if (copyout(&ds64, (void *)arg, sizeof (struct semid_ds64)))
    460 			return (set_errno(EFAULT));
    461 
    462 		return (0);
    463 
    464 	/* Get # of processes sleeping for greater semval. */
    465 	case GETNCNT:
    466 		if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
    467 			mutex_exit(lock);
    468 			return (set_errno(error));
    469 		}
    470 		if (semnum >= sp->sem_nsems) {
    471 			mutex_exit(lock);
    472 			return (set_errno(EINVAL));
    473 		}
    474 		retval = sp->sem_base[semnum].semncnt;
    475 		mutex_exit(lock);
    476 		return (retval);
    477 
    478 	/* Get pid of last process to operate on semaphore. */
    479 	case GETPID:
    480 		if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
    481 			mutex_exit(lock);
    482 			return (set_errno(error));
    483 		}
    484 		if (semnum >= sp->sem_nsems) {
    485 			mutex_exit(lock);
    486 			return (set_errno(EINVAL));
    487 		}
    488 		retval = sp->sem_base[semnum].sempid;
    489 		mutex_exit(lock);
    490 		return (retval);
    491 
    492 	/* Get semval of one semaphore. */
    493 	case GETVAL:
    494 		if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
    495 			mutex_exit(lock);
    496 			return (set_errno(error));
    497 		}
    498 		if (semnum >= sp->sem_nsems) {
    499 			mutex_exit(lock);
    500 			return (set_errno(EINVAL));
    501 		}
    502 		retval = sp->sem_base[semnum].semval;
    503 		mutex_exit(lock);
    504 		return (retval);
    505 
    506 	/* Get all semvals in set. */
    507 	case GETALL:
    508 		if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
    509 			mutex_exit(lock);
    510 			return (set_errno(error));
    511 		}
    512 
    513 		/* allocate space to hold all semaphore values */
    514 		vsize = sp->sem_nsems * sizeof (*vals);
    515 		vals = vp = kmem_alloc(vsize, KM_SLEEP);
    516 
    517 		for (i = sp->sem_nsems, p = sp->sem_base; i--; p++, vp++)
    518 			bcopy(&p->semval, vp, sizeof (p->semval));
    519 
    520 		mutex_exit(lock);
    521 
    522 		if (copyout((void *)vals, (void *)arg, vsize)) {
    523 			kmem_free(vals, vsize);
    524 			return (set_errno(EFAULT));
    525 		}
    526 
    527 		kmem_free(vals, vsize);
    528 		return (0);
    529 
    530 	/* Get # of processes sleeping for semval to become zero. */
    531 	case GETZCNT:
    532 		if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
    533 			mutex_exit(lock);
    534 			return (set_errno(error));
    535 		}
    536 		if (semnum >= sp->sem_nsems) {
    537 			mutex_exit(lock);
    538 			return (set_errno(EINVAL));
    539 		}
    540 		retval = sp->sem_base[semnum].semzcnt;
    541 		mutex_exit(lock);
    542 		return (retval);
    543 
    544 	/* Set semval of one semaphore. */
    545 	case SETVAL:
    546 		if (error = ipcperm_access(&sp->sem_perm, SEM_A, cr)) {
    547 			mutex_exit(lock);
    548 			return (set_errno(error));
    549 		}
    550 		if (semnum >= sp->sem_nsems) {
    551 			mutex_exit(lock);
    552 			return (set_errno(EINVAL));
    553 		}
    554 		if ((uint_t)arg > USHRT_MAX) {
    555 			mutex_exit(lock);
    556 			return (set_errno(ERANGE));
    557 		}
    558 		p = &sp->sem_base[semnum];
    559 		if ((p->semval = (ushort_t)arg) != 0) {
    560 			if (p->semncnt) {
    561 				cv_broadcast(&p->semncnt_cv);
    562 			}
    563 		} else if (p->semzcnt) {
    564 			cv_broadcast(&p->semzcnt_cv);
    565 		}
    566 		p->sempid = curproc->p_pid;
    567 		sem_undo_clear(sp, (ushort_t)semnum, (ushort_t)semnum);
    568 		mutex_exit(lock);
    569 		return (0);
    570 
    571 	/* Set semvals of all semaphores in set. */
    572 	case SETALL:
    573 		/* Check if semaphore set has been deleted and reallocated. */
    574 		if (sp->sem_nsems * sizeof (*vals) != vsize) {
    575 			error = set_errno(EINVAL);
    576 			goto seterr;
    577 		}
    578 		if (error = ipcperm_access(&sp->sem_perm, SEM_A, cr)) {
    579 			error = set_errno(error);
    580 			goto seterr;
    581 		}
    582 		sem_undo_clear(sp, 0, sp->sem_nsems - 1);
    583 		for (i = 0, p = sp->sem_base; i < sp->sem_nsems;
    584 		    (p++)->sempid = curproc->p_pid) {
    585 			if ((p->semval = vals[i++]) != 0) {
    586 				if (p->semncnt) {
    587 					cv_broadcast(&p->semncnt_cv);
    588 				}
    589 			} else if (p->semzcnt) {
    590 				cv_broadcast(&p->semzcnt_cv);
    591 			}
    592 		}
    593 seterr:
    594 		mutex_exit(lock);
    595 		kmem_free(vals, vsize);
    596 		return (error);
    597 
    598 	default:
    599 		mutex_exit(lock);
    600 		return (set_errno(EINVAL));
    601 	}
    602 
    603 	/* NOTREACHED */
    604 }
    605 
    606 /*
    607  * semexit - Called by exit() to clean up on process exit.
    608  */
    609 void
    610 semexit(proc_t *pp)
    611 {
    612 	avl_tree_t	*tree;
    613 	struct sem_undo	*undo;
    614 	void		*cookie = NULL;
    615 
    616 	mutex_enter(&pp->p_lock);
    617 	tree = pp->p_semacct;
    618 	pp->p_semacct = NULL;
    619 	mutex_exit(&pp->p_lock);
    620 
    621 	while (undo = avl_destroy_nodes(tree, &cookie)) {
    622 		ksemid_t *sp = undo->un_sp;
    623 		size_t size = SEM_UNDOSZ(sp->sem_nsems);
    624 		int i;
    625 
    626 		(void) ipc_lock(sem_svc, sp->sem_perm.ipc_id);
    627 		if (!IPC_FREE(&sp->sem_perm)) {
    628 			for (i = 0; i < sp->sem_nsems; i++) {
    629 				int adj = undo->un_aoe[i];
    630 				if (adj) {
    631 					struct sem *semp = &sp->sem_base[i];
    632 					int v = (int)semp->semval + adj;
    633 
    634 					if (v < 0 || v > USHRT_MAX)
    635 						continue;
    636 					semp->semval = (ushort_t)v;
    637 					if (v == 0 && semp->semzcnt)
    638 						cv_broadcast(&semp->semzcnt_cv);
    639 					if (adj > 0 && semp->semncnt)
    640 						cv_broadcast(&semp->semncnt_cv);
    641 				}
    642 			}
    643 			list_remove(&sp->sem_undos, undo);
    644 		}
    645 		ipc_rele(sem_svc, (kipc_perm_t *)sp);
    646 		kmem_free(undo, size);
    647 	}
    648 
    649 	avl_destroy(tree);
    650 	kmem_free(tree, sizeof (avl_tree_t));
    651 }
    652 
    653 /*
    654  * Remove all semaphores associated with a given zone.  Called by
    655  * zone_shutdown when the zone is halted.
    656  */
    657 /*ARGSUSED1*/
    658 static void
    659 sem_remove_zone(zoneid_t zoneid, void *arg)
    660 {
    661 	ipc_remove_zone(sem_svc, zoneid);
    662 }
    663 
    664 /*
    665  * semget - Semget system call.
    666  */
    667 static int
    668 semget(key_t key, int nsems, int semflg)
    669 {
    670 	ksemid_t	*sp;
    671 	kmutex_t	*lock;
    672 	int		id, error;
    673 	proc_t		*pp = curproc;
    674 
    675 top:
    676 	if (error = ipc_get(sem_svc, key, semflg, (kipc_perm_t **)&sp, &lock))
    677 		return (set_errno(error));
    678 
    679 	if (!IPC_FREE(&sp->sem_perm)) {
    680 		/*
    681 		 * A semaphore with the requested key exists.
    682 		 */
    683 		if (!((nsems >= 0) && (nsems <= sp->sem_nsems))) {
    684 			mutex_exit(lock);
    685 			return (set_errno(EINVAL));
    686 		}
    687 	} else {
    688 		/*
    689 		 * This is a new semaphore set.  Finish initialization.
    690 		 */
    691 		if (nsems <= 0 || (rctl_test(rc_process_semmsl, pp->p_rctls, pp,
    692 		    nsems, RCA_SAFE) & RCT_DENY)) {
    693 			mutex_exit(lock);
    694 			mutex_exit(&pp->p_lock);
    695 			ipc_cleanup(sem_svc, (kipc_perm_t *)sp);
    696 			return (set_errno(EINVAL));
    697 		}
    698 		mutex_exit(lock);
    699 		mutex_exit(&pp->p_lock);
    700 
    701 		/*
    702 		 * We round the allocation up to coherency granularity
    703 		 * so that multiple semaphore allocations won't result
    704 		 * in the false sharing of their sem structures.
    705 		 */
    706 		sp->sem_base =
    707 		    kmem_zalloc(P2ROUNDUP(nsems * sizeof (struct sem), 64),
    708 		    KM_SLEEP);
    709 		sp->sem_binary = (nsems == 1);
    710 		sp->sem_nsems = (ushort_t)nsems;
    711 		sp->sem_ctime = gethrestime_sec();
    712 		sp->sem_otime = 0;
    713 		list_create(&sp->sem_undos, sizeof (struct sem_undo),
    714 		    offsetof(struct sem_undo, un_list));
    715 
    716 		if (error = ipc_commit_begin(sem_svc, key, semflg,
    717 		    (kipc_perm_t *)sp)) {
    718 			if (error == EAGAIN)
    719 				goto top;
    720 			return (set_errno(error));
    721 		}
    722 		sp->sem_maxops =
    723 		    rctl_enforced_value(rc_process_semopm, pp->p_rctls, pp);
    724 		if (rctl_test(rc_process_semmsl, pp->p_rctls, pp, nsems,
    725 		    RCA_SAFE) & RCT_DENY) {
    726 			ipc_cleanup(sem_svc, (kipc_perm_t *)sp);
    727 			return (set_errno(EINVAL));
    728 		}
    729 		lock = ipc_commit_end(sem_svc, &sp->sem_perm);
    730 	}
    731 	if (audit_active)
    732 		audit_ipcget(AT_IPC_SEM, (void *)sp);
    733 	id = sp->sem_perm.ipc_id;
    734 	mutex_exit(lock);
    735 	return (id);
    736 }
    737 
    738 /*
    739  * semids system call.
    740  */
    741 static int
    742 semids(int *buf, uint_t nids, uint_t *pnids)
    743 {
    744 	int error;
    745 
    746 	if (error = ipc_ids(sem_svc, buf, nids, pnids))
    747 		return (set_errno(error));
    748 
    749 	return (0);
    750 }
    751 
    752 
    753 /*
    754  * Helper function for semop - copies in the provided timespec and
    755  * computes the absolute future time after which we must return.
    756  */
    757 static int
    758 compute_timeout(timespec_t **tsp, timespec_t *ts, timespec_t *now,
    759 	timespec_t *timeout)
    760 {
    761 	model_t datamodel = get_udatamodel();
    762 
    763 	if (datamodel == DATAMODEL_NATIVE) {
    764 		if (copyin(timeout, ts, sizeof (timespec_t)))
    765 			return (EFAULT);
    766 	} else {
    767 		timespec32_t ts32;
    768 
    769 		if (copyin(timeout, &ts32, sizeof (timespec32_t)))
    770 			return (EFAULT);
    771 		TIMESPEC32_TO_TIMESPEC(ts, &ts32)
    772 	}
    773 
    774 	if (itimerspecfix(ts))
    775 		return (EINVAL);
    776 
    777 	/*
    778 	 * Convert the timespec value into absolute time.
    779 	 */
    780 	timespecadd(ts, now);
    781 	*tsp = ts;
    782 
    783 	return (0);
    784 }
    785 
    786 /*
    787  * Undo structure comparator.  We sort based on ksemid_t pointer.
    788  */
    789 static int
    790 sem_undo_compar(const void *x, const void *y)
    791 {
    792 	struct sem_undo *undo1 = (struct sem_undo *)x;
    793 	struct sem_undo *undo2 = (struct sem_undo *)y;
    794 
    795 	if (undo1->un_sp < undo2->un_sp)
    796 		return (-1);
    797 	if (undo1->un_sp > undo2->un_sp)
    798 		return (1);
    799 	return (0);
    800 }
    801 
    802 /*
    803  * Helper function for semop - creates an undo structure and adds it to
    804  * the process's avl tree and the semaphore's list.
    805  */
    806 static int
    807 sem_undo_alloc(proc_t *pp, ksemid_t *sp, kmutex_t **lock,
    808     struct sem_undo *template, struct sem_undo **un)
    809 {
    810 	size_t size;
    811 	struct sem_undo *undo;
    812 	avl_tree_t *tree = NULL;
    813 	avl_index_t where;
    814 
    815 	mutex_exit(*lock);
    816 
    817 	size = SEM_UNDOSZ(sp->sem_nsems);
    818 	undo = kmem_zalloc(size, KM_SLEEP);
    819 	undo->un_proc = pp;
    820 	undo->un_sp = sp;
    821 
    822 	if (pp->p_semacct == NULL)
    823 		tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
    824 
    825 	*lock = ipc_lock(sem_svc, sp->sem_perm.ipc_id);
    826 	if (IPC_FREE(&sp->sem_perm)) {
    827 		kmem_free(undo, size);
    828 		if (tree)
    829 			kmem_free(tree, sizeof (avl_tree_t));
    830 		return (EIDRM);
    831 	}
    832 
    833 	mutex_enter(&pp->p_lock);
    834 	if (tree) {
    835 		if (pp->p_semacct == NULL) {
    836 			avl_create(tree, sem_undo_compar,
    837 			    sizeof (struct sem_undo),
    838 			    offsetof(struct sem_undo, un_avl));
    839 			pp->p_semacct = tree;
    840 		} else {
    841 			kmem_free(tree, sizeof (avl_tree_t));
    842 		}
    843 	}
    844 
    845 	if (*un = avl_find(pp->p_semacct, template, &where)) {
    846 		mutex_exit(&pp->p_lock);
    847 		kmem_free(undo, size);
    848 	} else {
    849 		*un = undo;
    850 		avl_insert(pp->p_semacct, undo, where);
    851 		mutex_exit(&pp->p_lock);
    852 		list_insert_head(&sp->sem_undos, undo);
    853 		ipc_hold(sem_svc, (kipc_perm_t *)sp);
    854 	}
    855 
    856 
    857 	return (0);
    858 }
    859 
    860 /*
    861  * semop - Semop system call.
    862  */
    863 static int
    864 semop(int semid, struct sembuf *sops, size_t nsops, timespec_t *timeout)
    865 {
    866 	ksemid_t	*sp = NULL;
    867 	kmutex_t	*lock;
    868 	struct sembuf	*op;	/* ptr to operation */
    869 	int		i;	/* loop control */
    870 	struct sem	*semp;	/* ptr to semaphore */
    871 	int 		error = 0;
    872 	struct sembuf	*uops;	/* ptr to copy of user ops */
    873 	struct sembuf 	x_sem;	/* avoid kmem_alloc's */
    874 	timespec_t	now, ts, *tsp = NULL;
    875 	int		timecheck = 0;
    876 	int		cvres, needundo, mode;
    877 	struct sem_undo	*undo;
    878 	proc_t		*pp = curproc;
    879 	int		held = 0;
    880 
    881 	CPU_STATS_ADDQ(CPU, sys, sema, 1); /* bump semaphore op count */
    882 
    883 	/*
    884 	 * To avoid the cost of copying in 'timeout' in the common
    885 	 * case, we could only grab the time here and defer the copyin
    886 	 * and associated computations until we are about to block.
    887 	 *
    888 	 * The down side to this is that we would then have to spin
    889 	 * some goto top nonsense to avoid the copyin behind the semid
    890 	 * lock.  As a common use of timed semaphores is as an explicit
    891 	 * blocking mechanism, this could incur a greater penalty.
    892 	 *
    893 	 * If we eventually decide that this would be a wise route to
    894 	 * take, the deferrable functionality is completely contained
    895 	 * in 'compute_timeout', and the interface is defined such that
    896 	 * we can legally not validate 'timeout' if it is unused.
    897 	 */
    898 	if (timeout != NULL) {
    899 		timecheck = timechanged;
    900 		gethrestime(&now);
    901 		if (error = compute_timeout(&tsp, &ts, &now, timeout))
    902 			return (set_errno(error));
    903 	}
    904 
    905 	/*
    906 	 * Allocate space to hold the vector of semaphore ops.  If
    907 	 * there is only 1 operation we use a preallocated buffer on
    908 	 * the stack for speed.
    909 	 *
    910 	 * Since we don't want to allow the user to allocate an
    911 	 * arbitrary amount of kernel memory, we need to check against
    912 	 * the number of operations allowed by the semaphore.  We only
    913 	 * bother doing this if the number of operations is larger than
    914 	 * SEM_MAXUCOPS.
    915 	 */
    916 	if (nsops == 1)
    917 		uops = &x_sem;
    918 	else if (nsops == 0)
    919 		return (0);
    920 	else if (nsops <= SEM_MAXUCOPS)
    921 		uops = kmem_alloc(nsops * sizeof (*uops), KM_SLEEP);
    922 
    923 	if (nsops > SEM_MAXUCOPS) {
    924 		if ((lock = ipc_lookup(sem_svc, semid,
    925 		    (kipc_perm_t **)&sp)) == NULL)
    926 			return (set_errno(EFAULT));
    927 
    928 		if (nsops > sp->sem_maxops) {
    929 			mutex_exit(lock);
    930 			return (set_errno(E2BIG));
    931 		}
    932 		held = 1;
    933 		ipc_hold(sem_svc, (kipc_perm_t *)sp);
    934 		mutex_exit(lock);
    935 
    936 		uops = kmem_alloc(nsops * sizeof (*uops), KM_SLEEP);
    937 		if (copyin(sops, uops, nsops * sizeof (*op))) {
    938 			error = EFAULT;
    939 			(void) ipc_lock(sem_svc, sp->sem_perm.ipc_id);
    940 			goto semoperr;
    941 		}
    942 
    943 		lock = ipc_lock(sem_svc, sp->sem_perm.ipc_id);
    944 		if (IPC_FREE(&sp->sem_perm)) {
    945 			error = EIDRM;
    946 			goto semoperr;
    947 		}
    948 	} else {
    949 		/*
    950 		 * This could be interleaved with the above code, but
    951 		 * keeping them separate improves readability.
    952 		 */
    953 		if (copyin(sops, uops, nsops * sizeof (*op))) {
    954 			error = EFAULT;
    955 			goto semoperr_unlocked;
    956 		}
    957 
    958 		if ((lock = ipc_lookup(sem_svc, semid,
    959 		    (kipc_perm_t **)&sp)) == NULL) {
    960 			error = EINVAL;
    961 			goto semoperr_unlocked;
    962 		}
    963 
    964 		if (nsops > sp->sem_maxops) {
    965 			error = E2BIG;
    966 			goto semoperr;
    967 		}
    968 	}
    969 
    970 	/*
    971 	 * Scan all operations.  Verify that sem #s are in range and
    972 	 * this process is allowed the requested operations.  If any
    973 	 * operations are marked SEM_UNDO, find (or allocate) the undo
    974 	 * structure for this process and semaphore.
    975 	 */
    976 	needundo = 0;
    977 	mode = 0;
    978 	for (i = 0, op = uops; i++ < nsops; op++) {
    979 		mode |= op->sem_op ? SEM_A : SEM_R;
    980 		if (op->sem_num >= sp->sem_nsems) {
    981 			error = EFBIG;
    982 			goto semoperr;
    983 		}
    984 		if ((op->sem_flg & SEM_UNDO) && op->sem_op)
    985 			needundo = 1;
    986 	}
    987 	if (error = ipcperm_access(&sp->sem_perm, mode, CRED()))
    988 		goto semoperr;
    989 
    990 	if (needundo) {
    991 		struct sem_undo template;
    992 
    993 		template.un_sp = sp;
    994 		mutex_enter(&pp->p_lock);
    995 		if (pp->p_semacct)
    996 			undo = avl_find(pp->p_semacct, &template, NULL);
    997 		else
    998 			undo = NULL;
    999 		mutex_exit(&pp->p_lock);
   1000 		if (undo == NULL) {
   1001 			if (error = sem_undo_alloc(pp, sp, &lock, &template,
   1002 			    &undo))
   1003 				goto semoperr;
   1004 
   1005 			/* sem_undo_alloc unlocks the semaphore */
   1006 			if (error = ipcperm_access(&sp->sem_perm, mode, CRED()))
   1007 				goto semoperr;
   1008 		}
   1009 	}
   1010 
   1011 check:
   1012 	/*
   1013 	 * Loop waiting for the operations to be satisfied atomically.
   1014 	 * Actually, do the operations and undo them if a wait is needed
   1015 	 * or an error is detected.
   1016 	 */
   1017 	for (i = 0; i < nsops; i++) {
   1018 		op = &uops[i];
   1019 		semp = &sp->sem_base[op->sem_num];
   1020 
   1021 		/*
   1022 		 * Raise the semaphore (i.e. sema_v)
   1023 		 */
   1024 		if (op->sem_op > 0) {
   1025 			if (op->sem_op + (int)semp->semval > USHRT_MAX ||
   1026 			    ((op->sem_flg & SEM_UNDO) &&
   1027 			    (error = sem_undo_add(op->sem_op, op->sem_num,
   1028 			    undo)))) {
   1029 				if (i)
   1030 					sem_rollback(sp, uops, i, undo);
   1031 				if (error == 0)
   1032 					error = ERANGE;
   1033 				goto semoperr;
   1034 			}
   1035 			semp->semval += op->sem_op;
   1036 			/*
   1037 			 * If we are only incrementing the semaphore value
   1038 			 * by one on a binary semaphore, we can cv_signal.
   1039 			 */
   1040 			if (semp->semncnt) {
   1041 				if (op->sem_op == 1 && sp->sem_binary)
   1042 					cv_signal(&semp->semncnt_cv);
   1043 				else
   1044 					cv_broadcast(&semp->semncnt_cv);
   1045 			}
   1046 			if (semp->semzcnt && !semp->semval)
   1047 				cv_broadcast(&semp->semzcnt_cv);
   1048 			continue;
   1049 		}
   1050 
   1051 		/*
   1052 		 * Lower the semaphore (i.e. sema_p)
   1053 		 */
   1054 		if (op->sem_op < 0) {
   1055 			if (semp->semval >= (unsigned)(-op->sem_op)) {
   1056 				if ((op->sem_flg & SEM_UNDO) &&
   1057 				    (error = sem_undo_add(op->sem_op,
   1058 				    op->sem_num, undo))) {
   1059 					if (i)
   1060 						sem_rollback(sp,