Home | History | Annotate | Download | only in os
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #pragma ident	"@(#)exec.c	1.187	07/12/26 SMI"
     27 
     28 /*	Copyright (c) 1988 AT&T	*/
     29 /*	  All Rights Reserved  	*/
     30 
     31 
     32 #include <sys/types.h>
     33 #include <sys/param.h>
     34 #include <sys/sysmacros.h>
     35 #include <sys/systm.h>
     36 #include <sys/signal.h>
     37 #include <sys/cred_impl.h>
     38 #include <sys/policy.h>
     39 #include <sys/user.h>
     40 #include <sys/errno.h>
     41 #include <sys/file.h>
     42 #include <sys/vfs.h>
     43 #include <sys/vnode.h>
     44 #include <sys/mman.h>
     45 #include <sys/acct.h>
     46 #include <sys/cpuvar.h>
     47 #include <sys/proc.h>
     48 #include <sys/cmn_err.h>
     49 #include <sys/debug.h>
     50 #include <sys/pathname.h>
     51 #include <sys/vm.h>
     52 #include <sys/lgrp.h>
     53 #include <sys/vtrace.h>
     54 #include <sys/exec.h>
     55 #include <sys/exechdr.h>
     56 #include <sys/kmem.h>
     57 #include <sys/prsystm.h>
     58 #include <sys/modctl.h>
     59 #include <sys/vmparam.h>
     60 #include <sys/schedctl.h>
     61 #include <sys/utrap.h>
     62 #include <sys/systeminfo.h>
     63 #include <sys/stack.h>
     64 #include <sys/rctl.h>
     65 #include <sys/dtrace.h>
     66 #include <sys/lwpchan_impl.h>
     67 #include <sys/pool.h>
     68 #include <sys/sdt.h>
     69 #include <sys/brand.h>
     70 
     71 #include <c2/audit.h>
     72 
     73 #include <vm/hat.h>
     74 #include <vm/anon.h>
     75 #include <vm/as.h>
     76 #include <vm/seg.h>
     77 #include <vm/seg_vn.h>
     78 /*
         * Bits of the value returned by execsetid() and examined by gexec()
         * to decide how the credential must change across the exec.
         */
     79 #define	PRIV_RESET		0x01	/* needs to reset privs */
     80 #define	PRIV_SETID		0x02	/* needs to change uids */
     81 #define	PRIV_SETUGID		0x04	/* is setuid/setgid/forced privs */
     82 #define	PRIV_INCREASE		0x08	/* child runs with more privs */
     83 #define	MAC_FLAGS		0x10	/* need to adjust MAC flags */
     84 
     85 static int execsetid(struct vnode *, struct vattr *, uid_t *, uid_t *);
     86 static int hold_execsw(struct execsw *);
     87 
     88 uint_t auxv_hwcap = 0;	/* auxv AT_SUN_HWCAP value; determined on the fly */
     89 #if defined(_SYSCALL32_IMPL)
     90 uint_t auxv_hwcap32 = 0;	/* 32-bit version of auxv_hwcap */
     91 #endif
     92 /* p_flag bits gexec() clears at level 0 and restores if the exec fails */
     93 #define	PSUIDFLAGS		(SNOCD|SUGID)
     94 
     95 /*
     96  * exec() - wrapper around exece providing NULL environment pointer
     97  */
     98 int
     99 exec(const char *fname, const char **argp)
    100 {
    101 	return (exece(fname, argp, NULL));
    102 }
    103 
    104 /*
    105  * exece() - system call wrapper around exec_common()
    106  */
    107 int
    108 exece(const char *fname, const char **argp, const char **envp)
    109 {
    110 	int error;
    111 
    112 	error = exec_common(fname, argp, envp, EBA_NONE);
    113 	return (error ? (set_errno(error)) : 0);
    114 }
    115 /*
         * exec_common() - shared implementation of the exec system calls.
         *
         * Resolves fname, passes the resulting vnode to gexec() and, when
         * that succeeds, resets the per-process and per-lwp state that must
         * not survive into the new image (signal dispositions, saved
         * resource limits, profiling state, close-on-exec files, the lwp
         * directory) before returning.
         *
         * brand_action is EBA_NONE for a plain exec; any other value is
         * rejected with ENOTSUP unless the process is in a branded zone.
         * EBA_NATIVE unbrands a branded process; the remaining values
         * brand an unbranded one.
         *
         * Returns 0 on success, otherwise an errno value.  The value is
         * returned as is; exece() is the caller that applies set_errno().
         */
    116 int
    117 exec_common(const char *fname, const char **argp, const char **envp,
    118     int brand_action)
    119 {
    120 	vnode_t *vp = NULL, *dir = NULL, *tmpvp = NULL;
    121 	proc_t *p = ttoproc(curthread);
    122 	klwp_t *lwp = ttolwp(curthread);
    123 	struct user *up = PTOU(p);
    124 	long execsz;		/* temporary count of exec size */
    125 	int i;
    126 	int error;
    127 	char exec_file[MAXCOMLEN+1];
    128 	struct pathname pn;
    129 	struct pathname resolvepn;
    130 	struct uarg args;
    131 	struct execa ua;
    132 	k_sigset_t savedmask;
    133 	lwpdir_t *lwpdir = NULL;
    134 	lwpdir_t **tidhash;	/* valid only when lwpdir != NULL */
    135 	lwpdir_t *old_lwpdir = NULL;
    136 	uint_t old_lwpdir_sz;	/* valid only when old_lwpdir != NULL */
    137 	lwpdir_t **old_tidhash;	/* valid only when old_lwpdir != NULL */
    138 	uint_t old_tidhash_sz;	/* valid only when old_lwpdir != NULL */
    139 	lwpent_t *lep;		/* valid only when lwpdir != NULL */
    140 	int brandme = 0;
    141 
    142 	/*
    143 	 * exec() is not supported for the /proc agent lwp.
    144 	 */
    145 	if (curthread == p->p_agenttp)
    146 		return (ENOTSUP);
    147 
    148 	if ((error = secpolicy_basic_exec(CRED())) != 0)
    149 		return (error);
    150 
    151 	if (brand_action != EBA_NONE) {
    152 		/*
    153 		 * Brand actions are not supported for processes that are not
    154 		 * running in a branded zone.
    155 		 */
    156 		if (!ZONE_IS_BRANDED(p->p_zone))
    157 			return (ENOTSUP);
    158 
    159 		if (brand_action == EBA_NATIVE) {
    160 			/* Only branded processes can be unbranded */
    161 			if (!PROC_IS_BRANDED(p))
    162 				return (ENOTSUP);
    163 		} else {
    164 			/* Only unbranded processes can be branded */
    165 			if (PROC_IS_BRANDED(p))
    166 				return (ENOTSUP);
    167 			brandme = 1;
    168 		}
    169 	} else {
    170 		/*
    171 		 * If this is a native zone, or if the process is already
    172 		 * branded, then we don't need to do anything.  If this is
    173 		 * a native process in a branded zone, we need to brand the
    174 		 * process as it exec()s the new binary.
    175 		 */
    176 		if (ZONE_IS_BRANDED(p->p_zone) && !PROC_IS_BRANDED(p))
    177 			brandme = 1;
    178 	}
    179 
    180 	/*
    181 	 * Inform /proc that an exec() has started.
    182 	 * Hold signals that are ignored by default so that we will
    183 	 * not be interrupted by a signal that will be ignored after
    184 	 * successful completion of gexec().
        	 * From here on every exit must go through "out" (or "fail") so
        	 * that the saved mask is restored and prexecend() is called.
    185 	 */
    186 	mutex_enter(&p->p_lock);
    187 	prexecstart();
    188 	schedctl_finish_sigblock(curthread);
    189 	savedmask = curthread->t_hold;
    190 	sigorset(&curthread->t_hold, &ignoredefault);
    191 	mutex_exit(&p->p_lock);
    192 
    193 	/*
    194 	 * Look up path name and remember last component for later.
    195 	 * To help coreadm expand its %d token, we attempt to save
    196 	 * the directory containing the executable in p_execdir. The
    197 	 * first call to lookuppn() may fail and return EINVAL because
    198 	 * dirvpp is non-NULL. In that case, we make a second call to
    199 	 * lookuppn() with dirvpp set to NULL; p_execdir will be NULL,
    200 	 * but coreadm is allowed to expand %d to the empty string and
    201 	 * there are other cases in which that failure may occur.
    202 	 */
    203 	if ((error = pn_get((char *)fname, UIO_USERSPACE, &pn)) != 0)
    204 		goto out;
    205 	pn_alloc(&resolvepn);
    206 	if ((error = lookuppn(&pn, &resolvepn, FOLLOW, &dir, &vp)) != 0) {
    207 		pn_free(&resolvepn);
    208 		pn_free(&pn);
    209 		if (error != EINVAL)
    210 			goto out;
    211 
    212 		dir = NULL;
    213 		if ((error = pn_get((char *)fname, UIO_USERSPACE, &pn)) != 0)
    214 			goto out;
    215 		pn_alloc(&resolvepn);
    216 		if ((error = lookuppn(&pn, &resolvepn, FOLLOW, NULLVPP,
    217 		    &vp)) != 0) {
    218 			pn_free(&resolvepn);
    219 			pn_free(&pn);
    220 			goto out;
    221 		}
    222 	}
    223 	if (vp == NULL) {
    224 		if (dir != NULL)
    225 			VN_RELE(dir);
    226 		error = ENOENT;
    227 		pn_free(&resolvepn);
    228 		pn_free(&pn);
    229 		goto out;
    230 	}
    231 
    232 	/*
    233 	 * We do not allow executing files in attribute directories.
    234 	 * We test this by determining whether the resolved path
    235 	 * contains a "/" when we're in an attribute directory;
    236 	 * only if the pathname does not contain a "/" the resolved path
    237 	 * points to a file in the current working (attribute) directory.
    238 	 */
    239 	if ((p->p_user.u_cdir->v_flag & V_XATTRDIR) != 0 &&
    240 	    strchr(resolvepn.pn_path, '/') == NULL) {
    241 		if (dir != NULL)
    242 			VN_RELE(dir);
    243 		error = EACCES;
    244 		pn_free(&resolvepn);
    245 		pn_free(&pn);
    246 		VN_RELE(vp);
    247 		goto out;
    248 	}
    249 
        	/*
        	 * exec_file is zeroed first and at most MAXCOMLEN bytes are
        	 * copied, so it is always NUL-terminated.
        	 */
    250 	bzero(exec_file, MAXCOMLEN+1);
    251 	(void) strncpy(exec_file, pn.pn_path, MAXCOMLEN);
    252 	bzero(&args, sizeof (args));
    253 	args.pathname = resolvepn.pn_path;
    254 	/* don't free resolvepn until we are done with args */
    255 	pn_free(&pn);
    256 
    257 	/*
    258 	 * Specific exec handlers, or policies determined via
    259 	 * /etc/system may override the historical default.
    260 	 */
    261 	args.stk_prot = PROT_ZFOD;
    262 	args.dat_prot = PROT_ZFOD;
    263 
    264 	CPU_STATS_ADD_K(sys, sysexec, 1);
    265 	DTRACE_PROC1(exec, char *, args.pathname);
    266 
    267 	ua.fname = fname;
    268 	ua.argp = argp;
    269 	ua.envp = envp;
    270 
    271 	/* If necessary, brand this process before we start the exec. */
    272 	if (brandme != 0)
    273 		brand_setbrand(p);
    274 
        	/*
        	 * On failure, undo the branding applied just above and drop
        	 * everything acquired during the lookup.
        	 */
    275 	if ((error = gexec(&vp, &ua, &args, NULL, 0, &execsz,
    276 	    exec_file, p->p_cred, brand_action)) != 0) {
    277 		if (brandme != 0)
    278 			BROP(p)->b_proc_exit(p, lwp);
    279 		VN_RELE(vp);
    280 		if (dir != NULL)
    281 			VN_RELE(dir);
    282 		pn_free(&resolvepn);
    283 		goto fail;
    284 	}
    285 
    286 	/*
    287 	 * Free floating point registers (sun4u only)
    288 	 */
    289 	ASSERT(lwp != NULL);
    290 	lwp_freeregs(lwp, 1);
    291 
    292 	/*
    293 	 * Free thread and process context ops.
    294 	 */
    295 	if (curthread->t_ctx)
    296 		freectx(curthread, 1);
    297 	if (p->p_pctx)
    298 		freepctx(p, 1);
    299 
    300 	/*
    301 	 * Remember file name for accounting; clear any cached DTrace predicate.
    302 	 */
    303 	up->u_acflag &= ~AFORK;
    304 	bcopy(exec_file, up->u_comm, MAXCOMLEN+1);
    305 	curthread->t_predcache = NULL;
    306 
    307 	/*
    308 	 * Clear contract template state
    309 	 */
    310 	lwp_ctmpl_clear(lwp);
    311 
    312 	/*
    313 	 * Save the directory in which we found the executable for expanding
    314 	 * the %d token used in core file patterns.
        	 * p_execdir gets its own hold; the previous directory is released
        	 * only after p_lock has been dropped.
    315 	 */
    316 	mutex_enter(&p->p_lock);
    317 	tmpvp = p->p_execdir;
    318 	p->p_execdir = dir;
    319 	if (p->p_execdir != NULL)
    320 		VN_HOLD(p->p_execdir);
    321 	mutex_exit(&p->p_lock);
    322 
    323 	if (tmpvp != NULL)
    324 		VN_RELE(tmpvp);
    325 
    326 	/*
    327 	 * Reset stack state to the user stack, clear set of signals
    328 	 * caught on the signal stack, and reset list of signals that
    329 	 * restart system calls; the new program's environment should
    330 	 * not be affected by detritus from the old program.  Any
    331 	 * pending held signals remain held, so don't clear t_hold.
    332 	 */
    333 	mutex_enter(&p->p_lock);
    334 	lwp->lwp_oldcontext = 0;
    335 	lwp->lwp_ustack = 0;
    336 	lwp->lwp_old_stk_ctl = 0;
    337 	sigemptyset(&up->u_signodefer);
    338 	sigemptyset(&up->u_sigonstack);
    339 	sigemptyset(&up->u_sigresethand);
    340 	lwp->lwp_sigaltstack.ss_sp = 0;
    341 	lwp->lwp_sigaltstack.ss_size = 0;
    342 	lwp->lwp_sigaltstack.ss_flags = SS_DISABLE;
    343 
    344 	/*
    345 	 * Make saved resource limit == current resource limit.
    346 	 */
    347 	for (i = 0; i < RLIM_NLIMITS; i++) {
    348 		/*CONSTCOND*/
    349 		if (RLIM_SAVED(i)) {
    350 			(void) rctl_rlimit_get(rctlproc_legacy[i], p,
    351 			    &up->u_saved_rlimit[i]);
    352 		}
    353 	}
    354 
    355 	/*
    356 	 * If the action was to catch the signal, then the action
    357 	 * must be reset to SIG_DFL.
    358 	 */
    359 	sigdefault(p);
    360 	p->p_flag &= ~(SNOWAIT|SJCTL);
    361 	p->p_flag |= (SEXECED|SMSACCT|SMSFORK);
    362 	up->u_signal[SIGCLD - 1] = SIG_DFL;
    363 
    364 	/*
    365 	 * Delete the dot4 sigqueues/signotifies.
    366 	 */
    367 	sigqfree(p);
    368 
    369 	mutex_exit(&p->p_lock);
    370 
        	/* Discard the old image's profiling state (under p_pflock). */
    371 	mutex_enter(&p->p_pflock);
    372 	p->p_prof.pr_base = NULL;
    373 	p->p_prof.pr_size = 0;
    374 	p->p_prof.pr_off = 0;
    375 	p->p_prof.pr_scale = 0;
    376 	p->p_prof.pr_samples = 0;
    377 	mutex_exit(&p->p_pflock);
    378 
    379 	ASSERT(curthread->t_schedctl == NULL);
    380 
    381 #if defined(__sparc)
    382 	if (p->p_utraps != NULL)
    383 		utrap_free(p);
    384 #endif	/* __sparc */
    385 
    386 	/*
    387 	 * Close all close-on-exec files.
    388 	 */
    389 	close_exec(P_FINFO(p));
    390 	TRACE_2(TR_FAC_PROC, TR_PROC_EXEC, "proc_exec:p %p up %p", p, up);
    391 
    392 	/* Unbrand ourself if requested. */
    393 	if (brand_action == EBA_NATIVE)
    394 		BROP(p)->b_proc_exit(p, lwp);
    395 	ASSERT((brand_action != EBA_NATIVE) || !PROC_IS_BRANDED(p));
    396 
    397 	setregs(&args);
    398 
    399 	/* Mark this as an executable vnode */
    400 	mutex_enter(&vp->v_lock);
    401 	vp->v_flag |= VVMEXEC;
    402 	mutex_exit(&vp->v_lock);
    403 
        	/* The lookup holds and the resolved path are no longer needed. */
    404 	VN_RELE(vp);
    405 	if (dir != NULL)
    406 		VN_RELE(dir);
    407 	pn_free(&resolvepn);
    408 
    409 	/*
    410 	 * Allocate a new lwp directory and lwpid hash table if necessary.
        	 * The sleeping allocations are done here, before p_lock is
        	 * acquired below.  The current lwp's existing directory entry
        	 * is reused when the process already has a directory.
    411 	 */
    412 	if (curthread->t_tid != 1 || p->p_lwpdir_sz != 2) {
    413 		lwpdir = kmem_zalloc(2 * sizeof (lwpdir_t), KM_SLEEP);
    414 		lwpdir->ld_next = lwpdir + 1;
    415 		tidhash = kmem_zalloc(2 * sizeof (lwpdir_t *), KM_SLEEP);
    416 		if (p->p_lwpdir != NULL)
    417 			lep = p->p_lwpdir[curthread->t_dslot].ld_entry;
    418 		else
    419 			lep = kmem_zalloc(sizeof (*lep), KM_SLEEP);
    420 	}
    421 
    422 	if (PROC_IS_BRANDED(p))
    423 		BROP(p)->b_exec();
    424 
    425 	mutex_enter(&p->p_lock);
    426 	prbarrier(p);
    427 
    428 	/*
    429 	 * Reset lwp id to the default value of 1.
    430 	 * This is a single-threaded process now
    431 	 * and lwp #1 is lwp_wait()able by default.
    432 	 * The t_unpark flag should not be inherited.
    433 	 */
    434 	ASSERT(p->p_lwpcnt == 1 && p->p_zombcnt == 0);
    435 	curthread->t_tid = 1;
    436 	kpreempt_disable();
    437 	ASSERT(curthread->t_lpl != NULL);
    438 	p->p_t1_lgrpid = curthread->t_lpl->lpl_lgrpid;
    439 	kpreempt_enable();
    440 	if (p->p_tr_lgrpid != LGRP_NONE && p->p_tr_lgrpid != p->p_t1_lgrpid) {
    441 		lgrp_update_trthr_migrations(1);
    442 	}
    443 	curthread->t_unpark = 0;
    444 	curthread->t_proc_flag |= TP_TWAIT;
    445 	curthread->t_proc_flag &= ~TP_DAEMON;	/* daemons shouldn't exec */
    446 	p->p_lwpdaemon = 0;			/* but oh well ... */
    447 	p->p_lwpid = 1;
    448 
    449 	/*
    450 	 * Install the newly-allocated lwp directory and lwpid hash table
    451 	 * and insert the current thread into the new hash table.
        	 * The old tables are only remembered here; they are freed after
        	 * p_lock has been dropped.
    452 	 */
    453 	if (lwpdir != NULL) {
    454 		old_lwpdir = p->p_lwpdir;
    455 		old_lwpdir_sz = p->p_lwpdir_sz;
    456 		old_tidhash = p->p_tidhash;
    457 		old_tidhash_sz = p->p_tidhash_sz;
    458 		p->p_lwpdir = p->p_lwpfree = lwpdir;
    459 		p->p_lwpdir_sz = 2;
    460 		p->p_tidhash = tidhash;
    461 		p->p_tidhash_sz = 2;
    462 		lep->le_thread = curthread;
    463 		lep->le_lwpid = curthread->t_tid;
    464 		lep->le_start = curthread->t_start;
    465 		lwp_hash_in(p, lep);
    466 	}
    467 
    468 	/*
    469 	 * Restore the saved signal mask and
    470 	 * inform /proc that the exec() has finished.
    471 	 */
    472 	curthread->t_hold = savedmask;
    473 	prexecend();
    474 	mutex_exit(&p->p_lock);
    475 	if (old_lwpdir) {
    476 		kmem_free(old_lwpdir, old_lwpdir_sz * sizeof (lwpdir_t));
    477 		kmem_free(old_tidhash, old_tidhash_sz * sizeof (lwpdir_t *));
    478 	}
    479 
    480 	ASSERT(error == 0);
    481 	DTRACE_PROC(exec__success);
    482 	return (0);
    483 /*
         * "fail" is entered only when gexec() has failed: it fires the
         * exec__failure probe and then falls into "out".  Errors detected
         * before gexec() jump straight to "out".  Both restore the signal
         * mask saved at the top and call prexecend().
         */
    484 fail:
    485 	DTRACE_PROC1(exec__failure, int, error);
    486 out:		/* error return */
    487 	mutex_enter(&p->p_lock);
    488 	curthread->t_hold = savedmask;
    489 	prexecend();
    490 	mutex_exit(&p->p_lock);
    491 	ASSERT(error != 0);
    492 	return (error);
    493 }
    494 
    495 
    496 /*
    497  * Perform generic exec duties and switchout to object-file specific
    498  * handler.
    499  */
    500 int
    501 gexec(
    502 	struct vnode **vpp,
    503 	struct execa *uap,
    504 	struct uarg *args,
    505 	struct intpdata *idatap,
    506 	int level,
    507 	long *execsz,
    508 	caddr_t exec_file,
    509 	struct cred *cred,
    510 	int brand_action)
    511 {
    512 	struct vnode *vp;
    513 	proc_t *pp = ttoproc(curthread);
    514 	struct execsw *eswp;
    515 	int error = 0;
    516 	int suidflags = 0;
    517 	ssize_t resid;
    518 	uid_t uid, gid;
    519 	struct vattr vattr;
    520 	char magbuf[MAGIC_BYTES];
    521 	int setid;
    522 	cred_t *oldcred, *newcred = NULL;
    523 	int privflags = 0;
    524 	int setidfl;
    525 
    526 	/*
    527 	 * If the SNOCD or SUGID flag is set, turn it off and remember the
    528 	 * previous setting so we can restore it if we encounter an error.
    529 	 */
    530 	if (level == 0 && (pp->p_flag & PSUIDFLAGS)) {
    531 		mutex_enter(&pp->p_lock);
    532 		suidflags = pp->p_flag & PSUIDFLAGS;
    533 		pp->p_flag &= ~PSUIDFLAGS;
    534 		mutex_exit(&pp->p_lock);
    535 	}
    536 
    537 	if ((error = execpermissions(*vpp, &vattr, args)) != 0)
    538 		goto bad;
    539 
    540 	/* need to open vnode for stateful file systems like rfs */
    541 	if ((error = VOP_OPEN(vpp, FREAD, CRED(), NULL)) != 0)
    542 		goto bad;
    543 	vp = *vpp;
    544 
    545 	/*
    546 	 * Note: to support binary compatibility with SunOS a.out
    547 	 * executables, we read in the first four bytes, as the
    548 	 * magic number is in bytes 2-3.
    549 	 */
    550 	if (error = vn_rdwr(UIO_READ, vp, magbuf, sizeof (magbuf),
    551 	    (offset_t)0, UIO_SYSSPACE, 0, (rlim64_t)0, CRED(), &resid))
    552 		goto bad;
    553 	if (resid != 0)
    554 		goto bad;
    555 
    556 	if ((eswp = findexec_by_hdr(magbuf)) == NULL)
    557 		goto bad;
    558 
    559 	if (level == 0 &&
    560 	    (privflags = execsetid(vp, &vattr, &uid, &gid)) != 0) {
    561 
    562 		newcred = cred = crdup(cred);
    563 
    564 		/* If we can, drop the PA bit */
    565 		if ((privflags & PRIV_RESET) != 0)
    566 			priv_adjust_PA(cred);
    567 
    568 		if (privflags & PRIV_SETID) {
    569 			cred->cr_uid = uid;
    570 			cred->cr_gid = gid;
    571 			cred->cr_suid = uid;
    572 			cred->cr_sgid = gid;
    573 		}
    574 
    575 		if (privflags & MAC_FLAGS) {
    576 			if (!(CR_FLAGS(cred) & NET_MAC_AWARE_INHERIT))
    577 				CR_FLAGS(cred) &= ~NET_MAC_AWARE;
    578 			CR_FLAGS(cred) &= ~NET_MAC_AWARE_INHERIT;
    579 		}
    580 
    581 		/*
    582 		 * Implement the privilege updates:
    583 		 *
    584 		 * Restrict with L:
    585 		 *
    586 		 *	I' = I & L
    587 		 *
    588 		 *	E' = P' = (I' + F) & A
    589 		 *
    590 		 * But if running under ptrace, we cap I with P.
    591 		 */
    592 		if ((privflags & PRIV_RESET) != 0) {
    593 			if ((privflags & PRIV_INCREASE) != 0 &&
    594 			    (pp->p_proc_flag & P_PR_PTRACE) != 0)
    595 				priv_intersect(&CR_OPPRIV(cred),
    596 				    &CR_IPRIV(cred));
    597 			priv_intersect(&CR_LPRIV(cred), &CR_IPRIV(cred));
    598 			CR_EPRIV(cred) = CR_PPRIV(cred) = CR_IPRIV(cred);
    599 			priv_adjust_PA(cred);
    600 		}
    601 	}
    602 
    603 	/* SunOS 4.x buy-back */
    604 	if ((vp->v_vfsp->vfs_flag & VFS_NOSETUID) &&
    605 	    (vattr.va_mode & (VSUID|VSGID))) {
    606 		cmn_err(CE_NOTE,
    607 		    "!%s, uid %d: setuid execution not allowed, dev=%lx",
    608 		    exec_file, cred->cr_uid, vp->v_vfsp->vfs_dev);
    609 	}
    610 
    611 	/*
    612 	 * execsetid() told us whether or not we had to change the
    613 	 * credentials of the process.  In privflags, it told us
    614 	 * whether we gained any privileges or executed a set-uid executable.
    615 	 */
    616 	setid = (privflags & (PRIV_SETUGID|PRIV_INCREASE));
    617 
    618 	/*
    619 	 * Use /etc/system variable to determine if the stack
    620 	 * should be marked as executable by default.
    621 	 */
    622 	if (noexec_user_stack)
    623 		args->stk_prot &= ~PROT_EXEC;
    624 
    625 	args->execswp = eswp; /* Save execsw pointer in uarg for exec_func */
    626 	args->ex_vp = vp;
    627 
    628 	/*
    629 	 * Traditionally, the setid flags told the sub processes whether
    630 	 * the file just executed was set-uid or set-gid; this caused
    631 	 * some confusion as the 'setid' flag did not match the SUGID
    632 	 * process flag which is only set when the uids/gids do not match.
    633 	 * A script set-gid/set-uid to the real uid/gid would start with
    634 	 * /dev/fd/X but an executable would happily trust LD_LIBRARY_PATH.
    635 	 * Now we flag those cases where the calling process cannot
    636 	 * be trusted to influence the newly exec'ed process, either
    637 	 * because it runs with more privileges or when the uids/gids
    638 	 * do in fact not match.
    639 	 * This also makes the runtime linker agree with the on exec
    640 	 * values of SNOCD and SUGID.
    641 	 */
    642 	setidfl = 0;
    643 	if (cred->cr_uid != cred->cr_ruid || (cred->cr_rgid != cred->cr_gid &&
    644 	    !supgroupmember(cred->cr_gid, cred))) {
    645 		setidfl |= EXECSETID_UGIDS;
    646 	}
    647 	if (setid & PRIV_SETUGID)
    648 		setidfl |= EXECSETID_SETID;
    649 	if (setid & PRIV_INCREASE)
    650 		setidfl |= EXECSETID_PRIVS;
    651 
    652 	error = (*eswp->exec_func)(vp, uap, args, idatap, level, execsz,
    653 	    setidfl, exec_file, cred, brand_action);
    654 	rw_exit(eswp->exec_lock);
    655 	if (error != 0) {
    656 		if (newcred != NULL)
    657 			crfree(newcred);
    658 		goto bad;
    659 	}
    660 
    661 	if (level == 0) {
    662 		mutex_enter(&pp->p_crlock);
    663 		if (newcred != NULL) {
    664 			/*
    665 			 * Free the old credentials, and set the new ones.
    666 			 * Do this for both the process and the (single) thread.
    667 			 */
    668 			crfree(pp->p_cred);
    669 			pp->p_cred = cred;	/* cred already held for proc */
    670 			crhold(cred);		/* hold new cred for thread */
    671 			/*
    672 			 * DTrace accesses t_cred in probe context.  t_cred
    673 			 * must always be either NULL, or point to a valid,
    674 			 * allocated cred structure.
    675 			 */
    676 			oldcred = curthread->t_cred;
    677 			curthread->t_cred = cred;
    678 			crfree(oldcred);
    679 		}
    680 		/*
    681 		 * On emerging from a successful exec(), the saved
    682 		 * uid and gid equal the effective uid and gid.
    683 		 */
    684 		cred->cr_suid = cred->cr_uid;
    685 		cred->cr_sgid = cred->cr_gid;
    686 
    687 		/*
    688 		 * If the real and effective ids do not match, this
    689 		 * is a setuid process that should not dump core.
    690 		 * The group comparison is tricky; we prevent the code
    691 		 * from flagging SNOCD when executing with an effective gid
    692 		 * which is a supplementary group.
    693 		 */
    694 		if (cred->cr_ruid != cred->cr_uid ||
    695 		    (cred->cr_rgid != cred->cr_gid &&
    696 		    !supgroupmember(cred->cr_gid, cred)) ||
    697 		    (privflags & PRIV_INCREASE) != 0)
    698 			suidflags = PSUIDFLAGS;
    699 		else
    700 			suidflags = 0;
    701 
    702 		mutex_exit(&pp->p_crlock);
    703 		if (suidflags) {
    704 			mutex_enter(&pp->p_lock);
    705 			pp->p_flag |= suidflags;
    706 			mutex_exit(&pp->p_lock);
    707 		}
    708 		if (setid && (pp->p_proc_flag & P_PR_PTRACE) == 0) {
    709 			/*
    710 			 * If process is traced via /proc, arrange to
    711 			 * invalidate the associated /proc vnode.
    712 			 */
    713 			if (pp->p_plist || (pp->p_proc_flag & P_PR_TRACE))
    714 				args->traceinval = 1;
    715 		}
    716 		if (pp->p_proc_flag & P_PR_PTRACE)
    717 			psignal(pp, SIGTRAP);
    718 		if (args->traceinval)
    719 			prinvalidate(&pp->p_user);
    720 	}
    721 
    722 	return (0);
    723 bad:
    724 	if (error == 0)
    725 		error = ENOEXEC;
    726 
    727 	if (suidflags) {
    728 		mutex_enter(&pp->p_lock);
    729 		pp->p_flag |= suidflags;
    730 		mutex_exit(&pp->p_lock);
    731 	}
    732 	return (error);
    733 }
    734 
    735 extern char *execswnames[];
    736 
    737 struct execsw *
    738 allocate_execsw(char *name, char *magic, size_t magic_size)
    739 {
    740 	int i, j;
    741 	char *ename;
    742 	char *magicp;
    743 
    744 	mutex_enter(&execsw_lock);
    745 	for (i = 0; i < nexectype; i++) {
    746 		if (execswnames[i] == NULL) {
    747 			ename = kmem_alloc(strlen(name) + 1, KM_SLEEP);
    748 			(void) strcpy(ename, name);
    749 			execswnames[i] = ename;
    750 			/*
    751 			 * Set the magic number last so that we
    752 			 * don't need to hold the execsw_lock in
    753 			 * findexectype().
    754 			 */
    755 			magicp = kmem_alloc(magic_size, KM_SLEEP);
    756 			for (j = 0; j < magic_size; j++)
    757 				magicp[j] = magic[j];
    758 			execsw[i].exec_magic = magicp;
    759 			mutex_exit(&execsw_lock);
    760 			return (&execsw[i]);
    761 		}
    762 	}
    763 	mutex_exit(&execsw_lock);
    764 	return (NULL);
    765 }
    766 
    767 /*
    768  * Find the exec switch table entry with the corresponding magic string.
    769  */
    770 struct execsw *
    771 findexecsw(char *magic)
    772 {
    773 	struct execsw *eswp;
    774 
    775 	for (eswp = execsw; eswp < &execsw[nexectype]; eswp++) {
    776 		ASSERT(eswp->exec_maglen <= MAGIC_BYTES);
    777 		if (magic && eswp->exec_maglen != 0 &&
    778 		    bcmp(magic, eswp->exec_magic, eswp->exec_maglen) == 0)
    779 			return (eswp);
    780 	}
    781 	return (NULL);
    782 }
    783 
    784 /*
    785  * Find the execsw[] index for the given exec header string by looking for the
    786  * magic string at a specified offset and length for each kind of executable
    787  * file format until one matches.  If no execsw[] entry is found, try to
    788  * autoload a module for this magic string.
    789  */
    790 struct execsw *
    791 findexec_by_hdr(char *header)
    792 {
    793 	struct execsw *eswp;
    794 
    795 	for (eswp = execsw; eswp < &execsw[nexectype]; eswp++) {
    796 		ASSERT(eswp->exec_maglen <= MAGIC_BYTES);
    797 		if (header && eswp->exec_maglen != 0 &&
    798 		    bcmp(&header[eswp->exec_magoff], eswp->exec_magic,
    799 		    eswp->exec_maglen) == 0) {
    800 			if (hold_execsw(eswp) != 0)
    801 				return (NULL);
    802 			return (eswp);
    803 		}
    804 	}
    805 	return (NULL);	/* couldn't find the type */
    806 }
    807 
    808 /*
    809  * Find the execsw[] index for the given magic string.  If no execsw[] entry
    810  * is found, try to autoload a module for this magic string.
    811  */
    812 struct execsw *
    813 findexec_by_magic(char *magic)
    814 {
    815 	struct execsw *eswp;
    816 
    817 	for (eswp = execsw; eswp < &execsw[nexectype]; eswp++) {
    818 		ASSERT(eswp->exec_maglen <= MAGIC_BYTES);
    819 		if (magic && eswp->exec_maglen != 0 &&
    820 		    bcmp(magic, eswp->exec_magic, eswp->exec_maglen) == 0) {
    821 			if (hold_execsw(eswp) != 0)
    822 				return (NULL);
    823 			return (eswp);
    824 		}
    825 	}
    826 	return (NULL);	/* couldn't find the type */
    827 }
    828 
    829 static int
    830 hold_execsw(struct execsw *eswp)
    831 {
    832 	char *name;
    833 
    834 	rw_enter(eswp->exec_lock, RW_READER);
    835 	while (!LOADED_EXEC(eswp)) {
    836 		rw_exit(eswp->exec_lock);
    837 		name = execswnames[eswp-execsw];
    838 		ASSERT(name);
    839 		if (modload("exec", name) == -1)
    840 			return (-1);
    841 		rw_enter(eswp->exec_lock, RW_READER);
    842 	}
    843 	return (0);
    844 }
    845 
    846 static int
    847 execsetid(struct vnode *vp, struct vattr *vattrp, uid_t *uidp, uid_t *gidp)
    848 {
    849 	proc_t *pp = ttoproc(curthread);
    850 	uid_t uid, gid;
    851 	cred_t *cr = pp->p_cred;
    852 	int privflags = 0;
    853 
    854 	/*
    855 	 * Remember credentials.
    856 	 */
    857 	uid = cr->cr_uid;
    858 	gid = cr->cr_gid;
    859 
    860 	/* Will try to reset the PRIV_AWARE bit later. */
    861 	if ((CR_FLAGS(cr) & (PRIV_AWARE|PRIV_AWARE_INHERIT)) == PRIV_AWARE)
    862 		privflags |= PRIV_RESET;
    863 
    864 	if ((vp->v_vfsp->vfs_flag & VFS_NOSETUID) == 0) {
    865 		/*
    866 		 * Set-uid root execution only allowed if the limit set
    867 		 * holds all unsafe privileges.
    868 		 */
    869 		if ((vattrp->va_mode & VSUID) && (vattrp->va_uid != 0 ||
    870 		    priv_issubset(&priv_unsafe, &CR_LPRIV(cr)))) {
    871 			uid = vattrp->va_uid;
    872 			privflags |= PRIV_SETUGID;
    873 		}
    874 		if (vattrp->va_mode & VSGID) {
    875 			gid = vattrp->va_gid;
    876 			privflags |= PRIV_SETUGID;
    877 		}
    878 	}
    879 
    880 	/*
    881 	 * Do we need to change our credential anyway?
    882 	 * This is the case when E != I or P != I, as
    883 	 * we need to do the assignments (with F empty and A full)
    884 	 * Or when I is not a subset of L; in that case we need to
    885 	 * enforce L.
    886 	 *
    887 	 *		I' = L & I
    888 	 *
    889 	 *		E' = P' = (I' + F) & A
    890 	 * or
    891 	 *		E' = P' = I'
    892 	 */
    893 	if (!priv_isequalset(&CR_EPRIV(cr), &CR_IPRIV(cr)) ||
    894 	    !priv_issubset(&CR_IPRIV(cr), &CR_LPRIV(cr)) ||
    895 	    !priv_isequalset(&CR_PPRIV(cr), &CR_IPRIV(cr)))
    896 		privflags |= PRIV_RESET;
    897 
    898 	/* If MAC-aware flag(s) are on, need to update cred to remove. */
    899 	if ((CR_FLAGS(cr) & NET_MAC_AWARE) ||
    900 	    (CR_FLAGS(cr) & NET_MAC_AWARE_INHERIT))
    901 		privflags |= MAC_FLAGS;
    902 
    903 	/*
    904 	 * When we introduce the "forced" set then we will need
    905 	 * to set PRIV_INCREASE here if I not a subset of P.
    906 	 * If the "allowed" set is introduced we will need to do
    907 	 * a similar thing; however, it seems more reasonable to
    908 	 * have the allowed set reduce "L": script language interpreters
    909 	 * would typically have an allowed set of "all".
    910 	 */
    911 
    912 	/*
    913 	 * Set setuid/setgid protections if no ptrace() compatibility.
    914 	 * For privileged processes, honor setuid/setgid even in
    915 	 * the presence of ptrace() compatibility.
    916 	 */
    917 	if (((pp->p_proc_flag & P_PR_PTRACE) == 0 ||
    918 	    PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, (uid == 0))) &&
    919 	    (cr->cr_uid != uid ||
    920 	    cr->cr_gid != gid ||
    921 	    cr->cr_suid != uid ||
    922 	    cr->cr_sgid != gid)) {
    923 		*uidp = uid;
    924 		*gidp = gid;
    925 		privflags |= PRIV_SETID;
    926 	}
    927 	return (privflags);
    928 }
    929 
    930 int
    931 execpermissions(struct vnode *vp, struct vattr *vattrp, struct uarg *args)
    932 {
    933 	int error;
    934 	proc_t *p = ttoproc(curthread);
    935 
    936 	vattrp->va_mask = AT_MODE | AT_UID | AT_GID | AT_SIZE;
    937 	if (error = VOP_GETATTR(vp, vattrp, ATTR_EXEC, p->p_cred, NULL))
    938 		return (error);
    939 	/*
    940 	 * Check the access mode.
    941 	 * If VPROC, ask /proc if the file is an object file.
    942 	 */
    943 	if ((error = VOP_ACCESS(vp, VEXEC, 0, p->p_cred, NULL)) != 0 ||
    944 	    !(vp->v_type == VREG || (vp->v_type == VPROC && pr_isobject(vp))) ||
    945 	    (vp->v_vfsp->vfs_flag & VFS_NOEXEC) != 0 ||
    946 	    (vattrp->va_mode & (VEXEC|(VEXEC>>3)|(VEXEC>>6))) == 0) {
    947 		if (error == 0)
    948 			error = EACCES;
    949 		return (error);
    950 	}
    951 
    952 	if ((p->p_plist || (p->p_proc_flag & (P_PR_PTRACE|P_PR_TRACE))) &&
    953 	    (error = VOP_ACCESS(vp, VREAD, 0, p->p_cred, NULL))) {
    954 		/*
    955 		 * If process is under ptrace(2) compatibility,
    956 		 * fail the exec(2).
    957 		 */
    958 		if (p->p_proc_flag & P_PR_PTRACE)
    959 			goto bad;
    960 		/*
    961 		 * Process is traced via /proc.
    962 		 * Arrange to invalidate the /proc vnode.
    963 		 */
    964 		args->traceinval = 1;
    965 	}
    966 	return (0);
    967 bad:
    968 	if (error == 0)
    969 		error = ENOEXEC;
    970 	return (error);
    971 }
    972 
    973 /*
    974  * Map a section of an executable file into the user's
    975  * address space.
    976  */
    977 int
    978 execmap(struct vnode *vp, caddr_t addr, size_t len, size_t zfodlen,
    979     off_t offset, int prot, int page, uint_t szc)
    980 {
    981 	int error = 0;
    982 	off_t oldoffset;
    983 	caddr_t zfodbase, oldaddr;
    984 	size_t end, oldlen;
    985 	size_t zfoddiff;
    986 	label_t ljb;
    987 	proc_t *p = ttoproc(curthread);
    988 
    989 	oldaddr = addr;
    990 	addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
    991 	if (len) {
    992 		oldlen = len;
    993 		len += ((size_t)oldaddr - (size_t)addr);
    994 		oldoffset = offset;
    995 		offset = (off_t)((uintptr_t)offset & PAGEMASK);
    996 		if (page) {
    997 			spgcnt_t  prefltmem, availm, npages;
    998 			int preread;
    999 			uint_t mflag = MAP_PRIVATE | MAP_FIXED;
   1000 
   1001 			if ((prot & (PROT_WRITE | PROT_EXEC)) == PROT_EXEC) {
   1002 				mflag |= MAP_TEXT;
   1003 			} else {
   1004 				mflag |= MAP_INITDATA;
   1005 			}
   1006 
   1007 			if (valid_usr_range(addr, len, prot, p->p_as,
   1008 			    p->p_as->a_userlimit) != RANGE_OKAY) {
   1009 				error = ENOMEM;
   1010 				goto bad;
   1011 			}
   1012 			if (error = VOP_MAP(vp, (offset_t)offset,
   1013 			    p->p_as, &addr, len, prot, PROT_ALL,
   1014 			    mflag, CRED(), NULL))
   1015 				goto bad;
   1016 
   1017 			/*
   1018 			 * If the segment can fit, then we prefault
   1019 			 * the entire segment in.  This is based on the
   1020 			 * model that says the best working set of a
   1021 			 * small program is all of its pages.
   1022 			 */
   1023 			npages = (spgcnt_t)btopr(len);
   1024 			prefltmem = freemem - desfree;
   1025 			preread =
   1026 			    (npages < prefltmem && len < PGTHRESH) ? 1 : 0;
   1027 
   1028 			/*
   1029 			 * If we aren't prefaulting the segment,
   1030 			 * increment "deficit", if necessary to ensure
   1031 			 * that pages will become available when this
   1032 			 * process starts executing.
   1033 			 */
   1034 			availm = freemem - lotsfree;
   1035 			if (preread == 0 && npages > availm &&
   1036 			    deficit < lotsfree) {
   1037 				deficit += MIN((pgcnt_t)(npages - availm),
   1038 				    lotsfree - deficit);
   1039 			}
   1040 
   1041 			if (preread) {
   1042 				TRACE_2(TR_FAC_PROC, TR_EXECMAP_PREREAD,
   1043 				    "execmap preread:freemem %d size %lu",
   1044 				    freemem, len);
   1045 				(void) as_fault(p->p_as->a_hat, p->p_as,
   1046 				    (caddr_t)addr, len, F_INVAL, S_READ);
   1047 			}
   1048 		} else {
   1049 			if (valid_usr_range(addr, len, prot, p->p_as,
   1050 			    p->p_as->a_userlimit) != RANGE_OKAY) {
   1051 				error = ENOMEM;
   1052 				goto bad;
   1053 			}
   1054 
   1055 			if (error = as_map(p->p_as, addr, len,
   1056 			    segvn_create, zfod_argsp))
   1057 				goto bad;
   1058 			/*
   1059 			 * Read in the segment in one big chunk.
   1060 			 */
   1061 			if (