Home | History | Annotate | Download | only in os
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
     27 /*	  All Rights Reserved  	*/
     28 
     29 
     30 #pragma ident	"@(#)grow.c	1.91	07/10/25 SMI"
     31 
     32 #include <sys/types.h>
     33 #include <sys/inttypes.h>
     34 #include <sys/param.h>
     35 #include <sys/sysmacros.h>
     36 #include <sys/systm.h>
     37 #include <sys/signal.h>
     38 #include <sys/user.h>
     39 #include <sys/errno.h>
     40 #include <sys/var.h>
     41 #include <sys/proc.h>
     42 #include <sys/tuneable.h>
     43 #include <sys/debug.h>
     44 #include <sys/cmn_err.h>
     45 #include <sys/cred.h>
     46 #include <sys/vnode.h>
     47 #include <sys/vfs.h>
     48 #include <sys/vm.h>
     49 #include <sys/file.h>
     50 #include <sys/mman.h>
     51 #include <sys/vmparam.h>
     52 #include <sys/fcntl.h>
     53 #include <sys/lwpchan_impl.h>
     54 #include <sys/nbmlock.h>
     55 
     56 #include <vm/hat.h>
     57 #include <vm/as.h>
     58 #include <vm/seg.h>
     59 #include <vm/seg_dev.h>
     60 #include <vm/seg_vn.h>
     61 
/*
 * Global switches for the automatic large-page paths in this file.
 * brk() only calls brk_lpg() when use_brk_lpg is non-zero and the process
 * has SAUTOLPG set; grow() applies the same test with use_stk_lpg before
 * calling grow_lpg().  Both default to enabled.
 */
int use_brk_lpg = 1;
int use_stk_lpg = 1;

/* Large-page variants of the heap and stack growth routines (see below). */
static int brk_lpg(caddr_t nva);
static int grow_lpg(caddr_t sp);
     67 
     68 int
     69 brk(caddr_t nva)
     70 {
     71 	int error;
     72 	proc_t *p = curproc;
     73 
     74 	/*
     75 	 * Serialize brk operations on an address space.
     76 	 * This also serves as the lock protecting p_brksize
     77 	 * and p_brkpageszc.
     78 	 */
     79 	as_rangelock(p->p_as);
     80 	if (use_brk_lpg && (p->p_flag & SAUTOLPG) != 0) {
     81 		error = brk_lpg(nva);
     82 	} else {
     83 		error = brk_internal(nva, p->p_brkpageszc);
     84 	}
     85 	as_rangeunlock(p->p_as);
     86 	return ((error != 0 ? set_errno(error) : 0));
     87 }
     88 
     89 /*
     90  * Algorithm: call arch-specific map_pgsz to get best page size to use,
     91  * then call brk_internal().
     92  * Returns 0 on success.
     93  */
     94 static int
     95 brk_lpg(caddr_t nva)
     96 {
     97 	struct proc *p = curproc;
     98 	size_t pgsz, len;
     99 	caddr_t addr, brkend;
    100 	caddr_t bssbase = p->p_bssbase;
    101 	caddr_t brkbase = p->p_brkbase;
    102 	int oszc, szc;
    103 	int err;
    104 
    105 	oszc = p->p_brkpageszc;
    106 
    107 	/*
    108 	 * If p_brkbase has not yet been set, the first call
    109 	 * to brk_internal() will initialize it.
    110 	 */
    111 	if (brkbase == 0) {
    112 		return (brk_internal(nva, oszc));
    113 	}
    114 
    115 	len = nva - bssbase;
    116 
    117 	pgsz = map_pgsz(MAPPGSZ_HEAP, p, bssbase, len, 0);
    118 	szc = page_szc(pgsz);
    119 
    120 	/*
    121 	 * Covers two cases:
    122 	 * 1. page_szc() returns -1 for invalid page size, so we want to
    123 	 * ignore it in that case.
    124 	 * 2. By design we never decrease page size, as it is more stable.
    125 	 */
    126 	if (szc <= oszc) {
    127 		err = brk_internal(nva, oszc);
    128 		/* If failed, back off to base page size. */
    129 		if (err != 0 && oszc != 0) {
    130 			err = brk_internal(nva, 0);
    131 		}
    132 		return (err);
    133 	}
    134 
    135 	err = brk_internal(nva, szc);
    136 	/* If using szc failed, map with base page size and return. */
    137 	if (err != 0) {
    138 		if (szc != 0) {
    139 			err = brk_internal(nva, 0);
    140 		}
    141 		return (err);
    142 	}
    143 
    144 	/*
    145 	 * Round up brk base to a large page boundary and remap
    146 	 * anything in the segment already faulted in beyond that
    147 	 * point.
    148 	 */
    149 	addr = (caddr_t)P2ROUNDUP((uintptr_t)p->p_bssbase, pgsz);
    150 	brkend = brkbase + p->p_brksize;
    151 	len = brkend - addr;
    152 	/* Check that len is not negative. Update page size code for heap. */
    153 	if (addr >= p->p_bssbase && brkend > addr && IS_P2ALIGNED(len, pgsz)) {
    154 		(void) as_setpagesize(p->p_as, addr, len, szc, B_FALSE);
    155 		p->p_brkpageszc = szc;
    156 	}
    157 
    158 	ASSERT(err == 0);
    159 	return (err);		/* should always be 0 */
    160 }
    161 
/*
 * Set the current process's break to nva, preferring page size code
 * brkszc for alignment of the new heap end.
 *
 * The callers in this file (brk() and brk_lpg()) invoke this with the
 * address-space range lock held, which is what protects p_brksize.
 *
 * Growing the heap adds a zero-fill-on-demand segvn mapping covering
 * [old end, new end); shrinking unmaps [new end, old end).  On success
 * p_brksize is updated to the new (possibly large-page rounded) size.
 *
 * Returns 0 on success, ENOMEM when nva lies below the break base or the
 * heap would grow past the enforced RLIMIT_DATA value, or the error
 * returned by as_map().
 */
int
brk_internal(caddr_t nva, uint_t brkszc)
{
	caddr_t ova;			/* current break address */
	size_t size;
	int	error;
	struct proc *p = curproc;
	struct as *as = p->p_as;
	size_t pgsz;
	uint_t szc;
	rctl_qty_t as_rctl;

	/*
	 * extend heap to brkszc alignment but use current p->p_brkpageszc
	 * for the newly created segment. This allows the new extension
	 * segment to be concatenated successfully with the existing brk
	 * segment.
	 */
	if ((szc = brkszc) != 0) {
		pgsz = page_get_pagesize(szc);
		ASSERT(pgsz > PAGESIZE);
	} else {
		pgsz = PAGESIZE;
	}

	/* Fetch the enforced RLIMIT_DATA value under p_lock. */
	mutex_enter(&p->p_lock);
	as_rctl = rctl_enforced_value(rctlproc_legacy[RLIMIT_DATA],
	    p->p_rctls, p);
	mutex_exit(&p->p_lock);

	/*
	 * If p_brkbase has not yet been set, the first call
	 * to brk() will initialize it.
	 */
	if (p->p_brkbase == 0)
		p->p_brkbase = nva;

	/*
	 * Before multiple page size support existed p_brksize was the value
	 * not rounded to the pagesize (i.e. it stored the exact user request
	 * for heap size). If pgsz is greater than PAGESIZE calculate the
	 * heap size as the real new heap size by rounding it up to pgsz.
	 * This is useful since we may want to know where the heap ends
	 * without knowing heap pagesize (e.g. some old code) and also if
	 * heap pagesize changes we can update p_brkpageszc but delay adding
	 * new mapping yet still know from p_brksize where the heap really
	 * ends. The user requested heap end is stored in libc variable.
	 */
	if (pgsz > PAGESIZE) {
		caddr_t tnva = (caddr_t)P2ROUNDUP((uintptr_t)nva, pgsz);
		size = tnva - p->p_brkbase;
		/*
		 * If the rounded end falls below the break base, or the
		 * rounded size would both grow the heap and exceed the data
		 * limit, drop back to the base page size and use the
		 * unrounded request instead.
		 */
		if (tnva < p->p_brkbase || (size > p->p_brksize &&
		    size > (size_t)as_rctl)) {
			szc = 0;
			pgsz = PAGESIZE;
			size = nva - p->p_brkbase;
		}
	} else {
		size = nva - p->p_brkbase;
	}

	/*
	 * use PAGESIZE to roundup ova because we want to know the real value
	 * of the current heap end in case p_brkpageszc changes since the last
	 * p_brksize was computed.
	 */
	nva = (caddr_t)P2ROUNDUP((uintptr_t)nva, pgsz);
	ova = (caddr_t)P2ROUNDUP((uintptr_t)(p->p_brkbase + p->p_brksize),
		PAGESIZE);

	/*
	 * Reject a break below the base, or growth beyond the data limit.
	 * Shrinking is allowed even when the size is over the limit, since
	 * the limit test only applies when size > p_brksize.
	 */
	if ((nva < p->p_brkbase) || (size > p->p_brksize &&
	    size > as_rctl)) {
		mutex_enter(&p->p_lock);
		(void) rctl_action(rctlproc_legacy[RLIMIT_DATA], p->p_rctls, p,
		    RCA_SAFE);
		mutex_exit(&p->p_lock);
		return (ENOMEM);
	}

	if (nva > ova) {
		struct segvn_crargs crargs =
		    SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);

		/* Heap is only executable if the data segment is. */
		if (!(p->p_datprot & PROT_EXEC)) {
			crargs.prot &= ~PROT_EXEC;
		}

		/*
		 * Add new zfod mapping to extend UNIX data segment
		 * AS_MAP_NO_LPOOB means use 0, and don't reapply OOB policies
		 * via map_pgszcvec(). Use AS_MAP_HEAP to get intermediate
		 * page sizes if ova is not aligned to szc's pgsz.
		 */
		if (szc > 0) {
			caddr_t rbss;

			/* rbss: bss base rounded up to a pgsz boundary. */
			rbss = (caddr_t)P2ROUNDUP((uintptr_t)p->p_bssbase,
			    pgsz);
			if (IS_P2ALIGNED(p->p_bssbase, pgsz) || ova > rbss) {
				crargs.szc = p->p_brkpageszc ? p->p_brkpageszc :
				    AS_MAP_NO_LPOOB;
			} else if (ova == rbss) {
				crargs.szc = szc;
			} else {
				crargs.szc = AS_MAP_HEAP;
			}
		} else {
			crargs.szc = AS_MAP_NO_LPOOB;
		}
		crargs.lgrp_mem_policy_flags = LGRP_MP_FLAG_EXTEND_UP;
		error = as_map(as, ova, (size_t)(nva - ova), segvn_create,
		    &crargs);
		if (error) {
			/* p_brksize is left unchanged on failure. */
			return (error);
		}

	} else if (nva < ova) {
		/*
		 * Release mapping to shrink UNIX data segment.
		 * The as_unmap() result is deliberately ignored.
		 */
		(void) as_unmap(as, nva, (size_t)(ova - nva));
	}
	p->p_brksize = size;
	return (0);
}
    290 
    291 /*
    292  * Grow the stack to include sp.  Return 1 if successful, 0 otherwise.
    293  * This routine assumes that the stack grows downward.
    294  */
    295 int
    296 grow(caddr_t sp)
    297 {
    298 	struct proc *p = curproc;
    299 	struct as *as = p->p_as;
    300 	size_t oldsize = p->p_stksize;
    301 	size_t newsize;
    302 	int err;
    303 
    304 	/*
    305 	 * Serialize grow operations on an address space.
    306 	 * This also serves as the lock protecting p_stksize
    307 	 * and p_stkpageszc.
    308 	 */
    309 	as_rangelock(as);
    310 	if (use_stk_lpg && (p->p_flag & SAUTOLPG) != 0) {
    311 		err = grow_lpg(sp);
    312 	} else {
    313 		err = grow_internal(sp, p->p_stkpageszc);
    314 	}
    315 	as_rangeunlock(as);
    316 
    317 	if (err == 0 && (newsize = p->p_stksize) > oldsize) {
    318 		ASSERT(IS_P2ALIGNED(oldsize, PAGESIZE));
    319 		ASSERT(IS_P2ALIGNED(newsize, PAGESIZE));
    320 		/*
    321 		 * Set up translations so the process doesn't have to fault in
    322 		 * the stack pages we just gave it.
    323 		 */
    324 		(void) as_fault(as->a_hat, as, p->p_usrstack - newsize,
    325 		    newsize - oldsize, F_INVAL, S_WRITE);
    326 	}
    327 	return ((err == 0 ? 1 : 0));
    328 }
    329 
    330 /*
    331  * Algorithm: call arch-specific map_pgsz to get best page size to use,
    332  * then call grow_internal().
    333  * Returns 0 on success.
    334  */
    335 static int
    336 grow_lpg(caddr_t sp)
    337 {
    338 	struct proc *p = curproc;
    339 	size_t pgsz;
    340 	size_t len, newsize;
    341 	caddr_t addr, saddr;
    342 	caddr_t growend;
    343 	int oszc, szc;
    344 	int err;
    345 
    346 	newsize = p->p_usrstack - sp;
    347 
    348 	oszc = p->p_stkpageszc;
    349 	pgsz = map_pgsz(MAPPGSZ_STK, p, sp, newsize, 0);
    350 	szc = page_szc(pgsz);
    351 
    352 	/*
    353 	 * Covers two cases:
    354 	 * 1. page_szc() returns -1 for invalid page size, so we want to
    355 	 * ignore it in that case.
    356 	 * 2. By design we never decrease page size, as it is more stable.
    357 	 * This shouldn't happen as the stack never shrinks.
    358 	 */
    359 	if (szc <= oszc) {
    360 		err = grow_internal(sp, oszc);
    361 		/* failed, fall back to base page size */
    362 		if (err != 0 && oszc != 0) {
    363 			err = grow_internal(sp, 0);
    364 		}
    365 		return (err);
    366 	}
    367 
    368 	/*
    369 	 * We've grown sufficiently to switch to a new page size.
    370 	 * So we are going to remap the whole segment with the new page size.
    371 	 */
    372 	err = grow_internal(sp, szc);
    373 	/* The grow with szc failed, so fall back to base page size. */
    374 	if (err != 0) {
    375 		if (szc != 0) {
    376 			err = grow_internal(sp, 0);
    377 		}
    378 		return (err);
    379 	}
    380 
    381 	/*
    382 	 * Round up stack pointer to a large page boundary and remap
    383 	 * any pgsz pages in the segment already faulted in beyond that
    384 	 * point.
    385 	 */
    386 	saddr = p->p_usrstack - p->p_stksize;
    387 	addr = (caddr_t)P2ROUNDUP((uintptr_t)saddr, pgsz);
    388 	growend = (caddr_t)P2ALIGN((uintptr_t)p->p_usrstack, pgsz);
    389 	len = growend - addr;
    390 	/* Check that len is not negative. Update page size code for stack. */
    391 	if (addr >= saddr && growend > addr && IS_P2ALIGNED(len, pgsz)) {
    392 		(void) as_setpagesize(p->p_as, addr, len, szc, B_FALSE);
    393 		p->p_stkpageszc = szc;
    394 	}
    395 
    396 	ASSERT(err == 0);
    397 	return (err);		/* should always be 0 */
    398 }
    399 
/*
 * Grow the current process's stack down to cover sp, preferring page size
 * code growszc for alignment of the new low end.
 *
 * The callers in this file (grow() and grow_lpg()) invoke this with the
 * address-space range lock held, which is what protects p_stksize.
 *
 * The stack is never shrunk: if the requested size does not exceed the
 * current p_stksize the routine returns success without changing anything.
 * Otherwise a zero-fill-on-demand segvn mapping is added below the current
 * stack and p_stksize is updated.
 *
 * This routine assumes that the stack grows downward.
 * Returns 0 on success, errno on failure (ENOMEM when the new size would
 * exceed p_stk_ctl, or the error returned by as_map()).
 */
int
grow_internal(caddr_t sp, uint_t growszc)
{
	struct proc *p = curproc;
	size_t newsize;
	size_t oldsize;
	int    error;
	size_t pgsz;
	uint_t szc;
	struct segvn_crargs crargs = SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);

	ASSERT(sp < p->p_usrstack);
	/* Work in whole base pages: round sp down to a page boundary. */
	sp = (caddr_t)P2ALIGN((uintptr_t)sp, PAGESIZE);

	/*
	 * grow to growszc alignment but use current p->p_stkpageszc for
	 * the segvn_crargs szc passed to segvn_create. For memcntl to
	 * increase the szc, this allows the new extension segment to be
	 * concatenated successfully with the existing stack segment.
	 */
	if ((szc = growszc) != 0) {
		pgsz = page_get_pagesize(szc);
		ASSERT(pgsz > PAGESIZE);
		newsize = p->p_usrstack - (caddr_t)P2ALIGN((uintptr_t)sp, pgsz);
		/*
		 * If aligning down to a large page would exceed the stack
		 * limit, drop back to the base page size.
		 */
		if (newsize > (size_t)p->p_stk_ctl) {
			szc = 0;
			pgsz = PAGESIZE;
			newsize = p->p_usrstack - sp;
		}
	} else {
		pgsz = PAGESIZE;
		newsize = p->p_usrstack - sp;
	}

	/* Still over the limit even with base pages: fail the request. */
	if (newsize > (size_t)p->p_stk_ctl) {
		(void) rctl_action(rctlproc_legacy[RLIMIT_STACK], p->p_rctls, p,
		    RCA_UNSAFE_ALL);

		return (ENOMEM);
	}

	oldsize = p->p_stksize;
	ASSERT(P2PHASE(oldsize, PAGESIZE) == 0);

	if (newsize <= oldsize) {	/* prevent the stack from shrinking */
		return (0);
	}

	/* Stack is only executable if p_stkprot says so. */
	if (!(p->p_stkprot & PROT_EXEC)) {
		crargs.prot &= ~PROT_EXEC;
	}
	/*
	 * extend stack with the proposed new growszc, which is different
	 * than p_stkpageszc only on a memcntl to increase the stack pagesize.
	 * AS_MAP_NO_LPOOB means use 0, and don't reapply OOB policies via
	 * map_pgszcvec(). Use AS_MAP_STACK to get intermediate page sizes
	 * if not aligned to szc's pgsz.
	 */
	if (szc > 0) {
		/* oldsp: current low end; austk: p_usrstack aligned down. */
		caddr_t oldsp = p->p_usrstack - oldsize;
		caddr_t austk = (caddr_t)P2ALIGN((uintptr_t)p->p_usrstack,
		    pgsz);

		if (IS_P2ALIGNED(p->p_usrstack, pgsz) || oldsp < austk) {
			crargs.szc = p->p_stkpageszc ? p->p_stkpageszc :
			    AS_MAP_NO_LPOOB;
		} else if (oldsp == austk) {
			crargs.szc = szc;
		} else {
			crargs.szc = AS_MAP_STACK;
		}
	} else {
		crargs.szc = AS_MAP_NO_LPOOB;
	}
	crargs.lgrp_mem_policy_flags = LGRP_MP_FLAG_EXTEND_DOWN;

	/* Map only the newly added part: [usrstack - newsize, old low end). */
	if ((error = as_map(p->p_as, p->p_usrstack - newsize, newsize - oldsize,
	    segvn_create, &crargs)) != 0) {
		if (error == EAGAIN) {
			cmn_err(CE_WARN, "Sorry, no swap space to grow stack "
			    "for pid %d (%s)", p->p_pid, PTOU(p)->u_comm);
		}
		return (error);
	}
	p->p_stksize = newsize;
	return (0);
}
    491 
    492 /*
    493  * Used for MAP_ANON - fast way to get anonymous pages
    494  */
    495 static int
    496 zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags,
    497     offset_t pos)
    498 {
    499 	struct segvn_crargs vn_a;
    500 
    501 	if (((PROT_ALL & uprot) != uprot))
    502 		return (EACCES);
    503 
    504 	if ((flags & MAP_FIXED) != 0) {
    505 		caddr_t userlimit;
    506 
    507 		/*
    508 		 * Use the user address.  First verify that
    509 		 * the address to be used is page aligned.
    510 		 * Then make some simple bounds checks.
    511 		 */
    512 		if (((uintptr_t)*addrp & PAGEOFFSET) != 0)
    513 			return (EINVAL);
    514 
    515 		userlimit = flags & _MAP_LOW32 ?
    516 		    (caddr_t)USERLIMIT32 : as->a_userlimit;
    517 		switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) {
    518 		case RANGE_OKAY:
    519 			break;
    520 		case RANGE_BADPROT:
    521 			return (ENOTSUP);
    522 		case RANGE_BADADDR:
    523 		default:
    524 			return (ENOMEM);
    525 		}
    526 		(void) as_unmap(as, *addrp, len);
    527 	} else {
    528 		/*
    529 		 * No need to worry about vac alignment for anonymous
    530 		 * pages since this is a "clone" object that doesn't
    531 		 * yet exist.
    532 		 */
    533 		map_addr(addrp, len, pos, 0, flags);
    534 		if (*addrp == NULL)
    535 			return (ENOMEM);
    536 	}
    537 
    538 	/*
    539 	 * Use the seg_vn segment driver; passing in the NULL amp
    540 	 * gives the desired "cloning" effect.
    541 	 */
    542 	vn_a.vp = NULL;
    543 	vn_a.offset = 0;
    544 	vn_a.type = flags & MAP_TYPE;
    545 	vn_a.prot = uprot;
    546 	vn_a.maxprot = PROT_ALL;
    547 	vn_a.flags = flags & ~MAP_TYPE;
    548 	vn_a.cred = CRED();
    549 	vn_a.amp = NULL;
    550 	vn_a.szc = 0;
    551 	vn_a.lgrp_mem_policy_flags = 0;
    552 
    553 	return (as_map(as, *addrp, len, segvn_create, &vn_a));
    554 }
    555 
/*
 * Common implementation behind the mmap system call entry points in this
 * file (smmap64, smmap32, smmaplf32).
 *
 * addrp: in/out mapping address (also carries the alignment for MAP_ALIGN).
 * len:   length of the mapping; zero is rejected.
 * prot:  requested PROT_* bits; PROT_USER is added internally.
 * flags: MAP_* flags; anything outside the supported set yields EINVAL.
 * fp:    file to map, or NULL for an anonymous (MAP_ANON) mapping.
 * pos:   file offset; must be page aligned.
 *
 * Anonymous requests are handled by zmap() under the address-space range
 * lock; file-backed requests are validated and passed to VOP_MAP().
 *
 * Returns 0 on success or an errno value (not yet passed to set_errno()).
 */
static int
smmap_common(caddr_t *addrp, size_t len,
    int prot, int flags, struct file *fp, offset_t pos)
{
	struct vnode *vp;
	struct as *as = curproc->p_as;
	uint_t uprot, maxprot, type;
	int error;
	int in_crit = 0;	/* set once nbl_start_crit() has been called */

	if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | _MAP_NEW |
	    _MAP_LOW32 | MAP_NORESERVE | MAP_ANON | MAP_ALIGN |
	    MAP_TEXT | MAP_INITDATA)) != 0) {
		/* | MAP_RENAME */	/* not implemented, let user know */
		return (EINVAL);
	}

	/* MAP_TEXT only makes sense for executable mappings. */
	if ((flags & MAP_TEXT) && !(prot & PROT_EXEC)) {
		return (EINVAL);
	}

	/* MAP_TEXT and MAP_INITDATA are mutually exclusive. */
	if ((flags & (MAP_TEXT | MAP_INITDATA)) == (MAP_TEXT | MAP_INITDATA)) {
		return (EINVAL);
	}

#if defined(__sparc)
	/*
	 * See if this is an "old mmap call".  If so, remember this
	 * fact and convert the flags value given to mmap to indicate
	 * the specified address in the system call must be used.
	 * _MAP_NEW is set by all new uses of mmap.
	 */
	if ((flags & _MAP_NEW) == 0)
		flags |= MAP_FIXED;
#endif
	flags &= ~_MAP_NEW;

	/* Exactly one of MAP_PRIVATE / MAP_SHARED must be given. */
	type = flags & MAP_TYPE;
	if (type != MAP_PRIVATE && type != MAP_SHARED)
		return (EINVAL);


	if (flags & MAP_ALIGN) {

		if (flags & MAP_FIXED)
			return (EINVAL);

		/* alignment needs to be a power of 2 >= page size */
		if (((uintptr_t)*addrp < PAGESIZE && (uintptr_t)*addrp != 0) ||
			!ISP2((uintptr_t)*addrp))
			return (EINVAL);
	}
	/*
	 * Check for bad lengths and file position.
	 * We let the VOP_MAP routine check for negative lengths
	 * since on some vnode types this might be appropriate.
	 */
	if (len == 0 || (pos & (u_offset_t)PAGEOFFSET) != 0)
		return (EINVAL);

	maxprot = PROT_ALL;		/* start out allowing all accesses */
	uprot = prot | PROT_USER;

	if (fp == NULL) {
		/* Anonymous mapping: handled entirely by zmap(). */
		ASSERT(flags & MAP_ANON);
		as_rangelock(as);
		error = zmap(as, addrp, len, uprot, flags, pos);
		as_rangeunlock(as);
		return (error);
	} else if ((flags & MAP_ANON) != 0)
		return (EINVAL);	/* MAP_ANON with a file is invalid */

	vp = fp->f_vnode;

	/* Can't execute code from "noexec" mounted filesystem. */
	if ((vp->v_vfsp->vfs_flag & VFS_NOEXEC) != 0)
		maxprot &= ~PROT_EXEC;

	/*
	 * These checks were added as part of large files.
	 *
	 * Return ENXIO if the initial position is negative; return EOVERFLOW
	 * if (offset + len) would overflow the maximum allowed offset for the
	 * type of file descriptor being used.
	 */
	if (vp->v_type == VREG) {
		if (pos < 0)
			return (ENXIO);
		if ((offset_t)len > (OFFSET_MAX(fp) - pos))
			return (EOVERFLOW);
	}

	if (type == MAP_SHARED && (fp->f_flag & FWRITE) == 0) {
		/* no write access allowed */
		maxprot &= ~PROT_WRITE;
	}

	/*
	 * XXX - Do we also adjust maxprot based on protections
	 * of the vnode?  E.g. if no execute permission is given
	 * on the vnode for the current user, maxprot probably
	 * should disallow PROT_EXEC also?  This is different
	 * from the write access as this would be a per vnode
	 * test as opposed to a per fd test for writability.
	 */

	/*
	 * Verify that the specified protections are not greater than
	 * the maximum allowable protections.  Also test to make sure
	 * that the file descriptor does allow for read access since
	 * "write only" mappings are hard to do since normally we do
	 * the read from the file before the page can be written.
	 */
	if (((maxprot & uprot) != uprot) || (fp->f_flag & FREAD) == 0)
		return (EACCES);

	/*
	 * If the user specified an address, do some simple checks here
	 */
	if ((flags & MAP_FIXED) != 0) {
		caddr_t userlimit;

		/*
		 * Use the user address.  First verify that
		 * the address to be used is page aligned.
		 * Then make some simple bounds checks.
		 */
		if (((uintptr_t)*addrp & PAGEOFFSET) != 0)
			return (EINVAL);

		userlimit = flags & _MAP_LOW32 ?
		    (caddr_t)USERLIMIT32 : as->a_userlimit;
		switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) {
		case RANGE_OKAY:
			break;
		case RANGE_BADPROT:
			return (ENOTSUP);
		case RANGE_BADADDR:
		default:
			return (ENOMEM);
		}
	}

	/*
	 * When nbl_need_check() says so, enter the nbl critical region
	 * and refuse the mapping (EACCES) if nbl_conflict() reports a
	 * conflict for the kind of access requested.  From here on every
	 * exit must go through `done' so the critical region is left.
	 */
	if ((prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) &&
	    nbl_need_check(vp)) {
		int svmand;
		nbl_op_t nop;

		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		error = nbl_svmand(vp, fp->f_cred, &svmand);
		if (error != 0)
			goto done;
		/* Only shared writable mappings count as writes. */
		if ((prot & PROT_WRITE) && (type == MAP_SHARED)) {
			if (prot & (PROT_READ | PROT_EXEC)) {
				nop = NBL_READWRITE;
			} else {
				nop = NBL_WRITE;
			}
		} else {
			nop = NBL_READ;
		}
		if (nbl_conflict(vp, nop, 0, LONG_MAX, svmand, NULL)) {
			error = EACCES;
			goto done;
		}
	}

	/*
	 * Ok, now let the vnode map routine do its thing to set things up.
	 */
	error = VOP_MAP(vp, pos, as,
	    addrp, len, uprot, maxprot, flags, fp->f_cred, NULL);

	if (error == 0) {
		if (vp->v_type == VREG &&
		    (flags & (MAP_TEXT | MAP_INITDATA)) != 0) {
			/*
			 * Mark this as an executable vnode
			 */
			mutex_enter(&vp->v_lock);
			vp->v_flag |= VVMEXEC;
			mutex_exit(&vp->v_lock);
		}
	}

done:
	/* Leave the nbl critical region if we entered it above. */
	if (in_crit)
		nbl_end_crit(vp);
	return (error);
}
    747 
    748 #ifdef _LP64
    749 /*
    750  * LP64 mmap(2) system call: 64-bit offset, 64-bit address.
    751  *
    752  * The "large file" mmap routine mmap64(2) is also mapped to this routine
    753  * by the 64-bit version of libc.
    754  *
    755  * Eventually, this should be the only version, and have smmap_common()
    756  * folded back into it again.  Some day.
    757  */
    758 caddr_t
    759 smmap64(caddr_t addr, size_t len, int prot, int flags, int fd, off_t pos)
    760 {
    761 	struct file *fp;
    762 	int error;
    763 
    764 	if (flags & _MAP_LOW32)
    765 		error = EINVAL;
    766 	else if (fd == -1 && (flags & MAP_ANON) != 0)
    767 		error = smmap_common(&addr, len, prot, flags,
    768 		    NULL, (offset_t)pos);
    769 	else if ((fp = getf(fd)) != NULL) {
    770 		error = smmap_common(&addr, len, prot, flags,
    771 		    fp, (offset_t)pos);
    772 		releasef(fd);
    773 	} else
    774 		error = EBADF;
    775 
    776 	return (error ? (caddr_t)(uintptr_t)set_errno(error) : addr);
    777 }
    778 #endif	/* _LP64 */
    779 
    780 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
    781 
    782 /*
    783  * ILP32 mmap(2) system call: 32-bit offset, 32-bit address.
    784  */
    785 caddr_t
    786 smmap32(caddr32_t addr, size32_t len, int prot, int flags, int fd, off32_t pos)
    787 {
    788 	struct file *fp;
    789 	int error;
    790 	caddr_t a = (caddr_t)(uintptr_t)addr;
    791 
    792 	if (flags & _MAP_LOW32)
    793 		error = EINVAL;
    794 	else if (fd == -1 && (flags & MAP_ANON) != 0)
    795 		error = smmap_common(&a, (size_t)len, prot,
    796 		    flags | _MAP_LOW32, NULL, (offset_t)pos);
    797 	else if ((fp = getf(fd)) != NULL) {
    798 		error = smmap_common(&a, (size_t)len, prot,
    799 		    flags | _MAP_LOW32, fp, (offset_t)pos);
    800 		releasef(fd);
    801 	} else
    802 		error = EBADF;
    803 
    804 	ASSERT(error != 0 || (uintptr_t)(a + len) < (uintptr_t)UINT32_MAX);
    805 
    806 	return (error ? (caddr_t)(uintptr_t)set_errno(error) : a);
    807 }
    808 
/*
 * ILP32 mmap64(2) system call: 64-bit offset, 32-bit address.
 *
 * Now things really get ugly because we can't use the C-style
 * calling convention for more than 6 args, and 64-bit parameter
 * passing on 32-bit systems is less than clean.
 *
 * struct mmaplf32a is the argument block handed to smmaplf32().  The
 * 64-bit file offset arrives as two 32-bit halves (offhi, offlo) that
 * smmaplf32() combines according to the build's endianness.
 */

struct mmaplf32a {
	caddr_t addr;		/* requested mapping address */
	size_t len;		/* length of the mapping */
#ifdef _LP64
	/*
	 * 32-bit contents, 64-bit cells
	 */
	uint64_t prot;
	uint64_t flags;
	uint64_t fd;
	uint64_t offhi;
	uint64_t offlo;
#else
	/*
	 * 32-bit contents, 32-bit cells
	 */
	uint32_t prot;
	uint32_t flags;
	uint32_t fd;
	uint32_t offhi;
	uint32_t offlo;
#endif
};
    840 
    841 int
    842 smmaplf32(struct mmaplf32a *uap, rval_t *rvp)
    843 {
    844 	struct file *fp;
    845 	int error;
    846 	caddr_t a = uap->addr;
    847 	int flags = (int)uap->flags;
    848 	int fd = (int)uap->fd;
    849 #ifdef _BIG_ENDIAN
    850 	offset_t off = ((u_offset_t)uap->offhi << 32) | (u_offset_t)uap->offlo;
    851 #else
    852 	offset_t off = ((u_offset_t)uap->offlo << 32) | (u_offset_t)uap->offhi;
    853 #endif
    854 
    855 	if (flags & _MAP_LOW32)
    856 		error = EINVAL;
    857 	else if (fd == -1 && (flags & MAP_ANON) != 0)
    858 		error = smmap_common(&a, uap->len, (int)uap->prot,
    859 		    flags | _MAP_LOW32, NULL, off);
    860 	else if ((fp = getf(fd)) != NULL) {
    861 		error = smmap_common(&a, uap->len, (int)uap->prot,
    862 		    flags | _MAP_LOW32, fp, off);
    863 		releasef(fd);
    864 	} else
    865 		error = EBADF;
    866 
    867 	if (error == 0)
    868 		rvp->r_val1 = (uintptr_t)a;
    869 	return (error);
    870 }
    871 
    872 #endif	/* _SYSCALL32_IMPL || _ILP32 */
    873 
    874 int
    875 munmap(caddr_t addr, size_t len)
    876 {
    877 	struct proc *p = curproc;
    878 	struct as *as = p->p_as;
    879 
    880 	if (((uintptr_t)addr & PAGEOFFSET) != 0 || len == 0)
    881 		return (set_errno(EINVAL));
    882 
    883 	if (valid_usr_range(addr, len, 0, as, as->a_userlimit) != RANGE_OKAY)
    884 		return (set_errno(EINVAL));
    885 
    886 	/*
    887 	 * Discard lwpchan mappings.
    888 	 */
    889 	if (p->p_lcp != NULL)
    890 		lwpchan_delete_mapping(p, addr, addr + len);
    891 	if (as_unmap(as, addr, len) != 0)
    892 		return (set_errno(EINVAL));
    893 
    894 	return (0);
    895 }
    896 
    897 int
    898 mprotect(caddr_t addr, size_t len, int prot)
    899 {
    900 	struct as *as = curproc->p_as;
    901 	uint_t uprot = prot | PROT_USER;
    902 	int error;
    903 
    904 	if (((uintptr_t)addr & PAGEOFFSET) != 0 || len == 0)
    905 		return (set_errno(EINVAL));
    906 
    907 	switch (valid_usr_range(addr, len, prot, as, as->a_userlimit)) {
    908 	case RANGE_OKAY:
    909 		break;
    910 	case RANGE_BADPROT:
    911 		return (set_errno(ENOTSUP));
    912 	case RANGE_BADADDR:
    913 	default:
    914 		return (set_errno(ENOMEM));
    915 	}
    916 
    917 	error = as_setprot(as, addr, len, uprot);
    918 	if (error)
    919 		return (set_errno(error));
    920 	return (0);
    921 }
    922 
    923 #define	MC_CACHE	128			/* internal result buffer */
    924 #define	MC_QUANTUM	(MC_CACHE * PAGESIZE)	/* addresses covered in loop */
    925 
    926 int
    927 mincore(caddr_t addr, size_t len, char *vecp)
    928 {
    929 	struct as *as = curproc->p_as;
    930 	caddr_t ea;			/* end address of loop */
    931 	size_t rl;			/* inner result length */
    932 	char vec[MC_CACHE];		/* local vector cache */
    933 	int error;
    934 	model_t model;
    935 	long	llen;
    936 
    937 	model = get_udatamodel();
    938 	/*
    939 	 * Validate form of address parameters.
    940 	 */
    941 	if (model == DATAMODEL_NATIVE) {
    942 		llen = (long)len;
    943 	} else {
    944 		llen = (int32_t)(size32_t)len;
    945 	}
    946 	if (((uintptr_t)addr & PAGEOFFSET) != 0 || llen <= 0)
    947 		return (set_errno(EINVAL));
    948 
    949 	if (valid_usr_range(addr, len, 0, as, as->a_userlimit) != RANGE_OKAY)
    950 		return (set_errno(ENOMEM));
    951 
    952 	/*
    953 	 * Loop over subranges of interval [addr : addr + len), recovering
    954 	 * results internally and then copying them out to caller.  Subrange
    955 	 * is based on the size of MC_CACHE, defined above.
    956 	 */
    957 	for (ea = addr + len; addr < ea; addr += MC_QUANTUM) {
    958 		error = as_incore(as, addr,
    959 		    (size_t)MIN(MC_QUANTUM, ea - addr), vec, &rl);
    960 		if (rl != 0) {
    961 			rl = (rl + PAGESIZE - 1) / PAGESIZE;
    962 			if (copyout(vec, vecp, rl) != 0)
    963 				return (set_errno(EFAULT));
    964 			vecp += rl;
    965 		}
    966 		if (error != 0)
    967 			return (set_errno(ENOMEM));
    968 	}
    969 	return (0);
    970 }
    971