1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * Portions of this source code were derived from Berkeley 4.3 BSD 31 * under license from the Regents of the University of California. 32 */ 33 34 #pragma ident "@(#)seg_map.c 1.143 07/10/25 SMI" 35 36 /* 37 * VM - generic vnode mapping segment. 38 * 39 * The segmap driver is used only by the kernel to get faster (than seg_vn) 40 * mappings [lower routine overhead; more persistent cache] to random 41 * vnode/offsets. Note that the kernel may (and does) use seg_vn as well. 
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/buf.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/mman.h>
#include <sys/errno.h>
#include <sys/cred.h>
#include <sys/kmem.h>
#include <sys/vtrace.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/thread.h>
#include <sys/dumphdr.h>
#include <sys/bitmap.h>
#include <sys/lgrp.h>

#include <vm/seg_kmem.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_kpm.h>
#include <vm/seg_map.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/rm.h>

/*
 * Private seg op routines.
 *
 * These are the handlers installed in segmap_ops below.  Note that
 * segmap_fault() is the one handler in this list that is declared
 * without "static", so it has external linkage.
 */
static void	segmap_free(struct seg *seg);
faultcode_t segmap_fault(struct hat *hat, struct seg *seg, caddr_t addr,
			size_t len, enum fault_type type, enum seg_rw rw);
static faultcode_t segmap_faulta(struct seg *seg, caddr_t addr);
static int	segmap_checkprot(struct seg *seg, caddr_t addr, size_t len,
			uint_t prot);
static int	segmap_kluster(struct seg *seg, caddr_t addr, ssize_t);
static int	segmap_getprot(struct seg *seg, caddr_t addr, size_t len,
			uint_t *protv);
static u_offset_t	segmap_getoffset(struct seg *seg, caddr_t addr);
static int	segmap_gettype(struct seg *seg, caddr_t addr);
static int	segmap_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp);
static void	segmap_dump(struct seg *seg);
static int	segmap_pagelock(struct seg *seg, caddr_t addr, size_t len,
			struct page ***ppp, enum lock_type type,
			enum seg_rw rw);
static void	segmap_badop(void);
static int	segmap_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp);
static lgrp_mem_policy_info_t	*segmap_getpolicy(struct seg *seg,
    caddr_t addr);
static int	segmap_capable(struct seg *seg, segcapability_t capability);

/* segkpm support */
static caddr_t	segmap_pagecreate_kpm(struct seg *, vnode_t *, u_offset_t,
			struct smap *, enum seg_rw);
struct smap	*get_smap_kpm(caddr_t, page_t **);

/*
 * Placeholder for segment operations that segmap does not implement:
 * segmap_badop() is cast to a function pointer returning type "t" so it
 * can sit in any slot of the ops vector.  segmap_badop() itself (defined
 * later in this file) simply panics.
 */
#define	SEGMAP_BADOP(t)	(t(*)())segmap_badop

/*
 * Segment operations vector installed on the segment by segmap_create().
 * The initializer is positional, so entries must stay in the order of
 * the members of struct seg_ops.
 */
static struct seg_ops segmap_ops = {
	SEGMAP_BADOP(int),	/* dup */
	SEGMAP_BADOP(int),	/* unmap */
	segmap_free,		/* free */
	segmap_fault,		/* fault */
	segmap_faulta,		/* faulta */
	SEGMAP_BADOP(int),	/* setprot */
	segmap_checkprot,	/* checkprot */
	segmap_kluster,		/* kluster */
	SEGMAP_BADOP(size_t),	/* swapout */
	SEGMAP_BADOP(int),	/* sync */
	SEGMAP_BADOP(size_t),	/* incore */
	SEGMAP_BADOP(int),	/* lockop */
	segmap_getprot,		/* getprot */
	segmap_getoffset,	/* getoffset */
	segmap_gettype,		/* gettype */
	segmap_getvp,		/* getvp */
	SEGMAP_BADOP(int),	/* advise */
	segmap_dump,		/* dump */
	segmap_pagelock,	/* pagelock */
	SEGMAP_BADOP(int),	/* setpgsz */
	segmap_getmemid,	/* getmemid */
	segmap_getpolicy,	/* getpolicy */
	segmap_capable,		/* capable */
};

/*
 * Private segmap routines.
 */
static void	segmap_unlock(struct hat *hat, struct seg *seg, caddr_t addr,
			size_t len, enum seg_rw rw, struct smap *smp);
static void	segmap_smapadd(struct smap *smp);
static struct smap *segmap_hashin(struct smap *smp, struct vnode *vp,
			u_offset_t off, int hashid);
static void	segmap_hashout(struct smap *smp);


/*
 * Statistics for segmap operations.
 *
 * No explicit locking to protect these stats.
146 */ 147 struct segmapcnt segmapcnt = { 148 { "fault", KSTAT_DATA_ULONG }, 149 { "faulta", KSTAT_DATA_ULONG }, 150 { "getmap", KSTAT_DATA_ULONG }, 151 { "get_use", KSTAT_DATA_ULONG }, 152 { "get_reclaim", KSTAT_DATA_ULONG }, 153 { "get_reuse", KSTAT_DATA_ULONG }, 154 { "get_unused", KSTAT_DATA_ULONG }, 155 { "get_nofree", KSTAT_DATA_ULONG }, 156 { "rel_async", KSTAT_DATA_ULONG }, 157 { "rel_write", KSTAT_DATA_ULONG }, 158 { "rel_free", KSTAT_DATA_ULONG }, 159 { "rel_abort", KSTAT_DATA_ULONG }, 160 { "rel_dontneed", KSTAT_DATA_ULONG }, 161 { "release", KSTAT_DATA_ULONG }, 162 { "pagecreate", KSTAT_DATA_ULONG }, 163 { "free_notfree", KSTAT_DATA_ULONG }, 164 { "free_dirty", KSTAT_DATA_ULONG }, 165 { "free", KSTAT_DATA_ULONG }, 166 { "stolen", KSTAT_DATA_ULONG }, 167 { "get_nomtx", KSTAT_DATA_ULONG } 168 }; 169 170 kstat_named_t *segmapcnt_ptr = (kstat_named_t *)&segmapcnt; 171 uint_t segmapcnt_ndata = sizeof (segmapcnt) / sizeof (kstat_named_t); 172 173 /* 174 * Return number of map pages in segment. 175 */ 176 #define MAP_PAGES(seg) ((seg)->s_size >> MAXBSHIFT) 177 178 /* 179 * Translate addr into smap number within segment. 180 */ 181 #define MAP_PAGE(seg, addr) (((addr) - (seg)->s_base) >> MAXBSHIFT) 182 183 /* 184 * Translate addr in seg into struct smap pointer. 185 */ 186 #define GET_SMAP(seg, addr) \ 187 &(((struct segmap_data *)((seg)->s_data))->smd_sm[MAP_PAGE(seg, addr)]) 188 189 /* 190 * Bit in map (16 bit bitmap). 
 */
#define	SMAP_BIT_MASK(bitindex)	(1 << ((bitindex) & 0xf))

/*
 * File-wide state, set up once by segmap_create():
 *   smd_ncolor   - number of virtual colors (shmsize >> MAXBSHIFT, or 1)
 *   smd_colormsk - smd_ncolor - 1
 *   smd_nfree    - number of freelists (smd_ncolor * nfreelist)
 *   smd_freemsk  - smd_nfree - 1
 *   smd_smap     - base of the smap array
 *   smd_hash     - base of the hash chain header array
 *   smd_free     - base of the freelist header array
 *   smd_hashmsk  - hash table size - 1
 */
static int smd_colormsk = 0;
static int smd_ncolor = 0;
static int smd_nfree = 0;
static int smd_freemsk = 0;
#ifdef DEBUG
/* smd_nfree counters, allocated in segmap_create() (DEBUG only). */
static int *colors_used;
#endif
static struct smap *smd_smap;
static struct smaphash *smd_hash;
#ifdef SEGMAP_HASHSTATS
/* One counter per hash chain, bumped when an smap is hashed in. */
static unsigned int *smd_hash_len;
#endif
static struct smfree *smd_free;
static ulong_t smd_hashmsk = 0;

#define	SEGMAP_MAXCOLOR		2
#define	SEGMAP_CACHE_PAD	64

/*
 * Per-cpu segmap state; segmap_create() allocates one entry per possible
 * cpu (max_ncpus).  The scpu_pad member forces each entry to be at least
 * SEGMAP_CACHE_PAD bytes (presumably one cache line, so that cpus do not
 * share a line -- confirm for the target platform).
 */
union segmap_cpu {
	struct {
		/* per-color freelist rotor index */
		uint32_t	scpu_free_ndx[SEGMAP_MAXCOLOR];
		/* initialized to the first smap so it is never NULL */
		struct smap	*scpu_last_smap;
		/* cpu-local counters, summed by segmap_kstat_update() */
		ulong_t		scpu_getmap;
		ulong_t		scpu_release;
		ulong_t		scpu_get_reclaim;
		ulong_t		scpu_fault;
		ulong_t		scpu_pagecreate;
		ulong_t		scpu_get_reuse;
	} scpu;
	char	scpu_pad[SEGMAP_CACHE_PAD];
};
static union segmap_cpu *smd_cpu;

/*
 * There are three locks in seg_map:
 *	- per freelist mutexes
 *	- per hashchain mutexes
 *	- per smap mutexes
 *
 * The lock ordering is to get the smap mutex to lock down the slot
 * first then the hash lock (for hash in/out (vp, off) list) or the
 * freelist lock to put the slot back on the free list.
 *
 * The hash search is done by only holding the hashchain lock, when a wanted
 * slot is found, we drop the hashchain lock then lock the slot so there
 * is no overlapping of hashchain and smap locks. After the slot is
 * locked, we verify again if the slot is still what we are looking
 * for.
 *
 * Allocation of a free slot is done by holding the freelist lock,
 * then locking the smap slot at the head of the freelist. This is
 * in reversed lock order so mutex_tryenter() is used.
 *
 * The smap lock protects all fields in smap structure except for
 * the link fields for hash/free lists which are protected by
 * hashchain and freelist locks.
250 */ 251 252 #define SHASHMTX(hashid) (&smd_hash[hashid].sh_mtx) 253 254 #define SMP2SMF(smp) (&smd_free[(smp - smd_smap) & smd_freemsk]) 255 #define SMP2SMF_NDX(smp) (ushort_t)((smp - smd_smap) & smd_freemsk) 256 257 #define SMAPMTX(smp) (&smp->sm_mtx) 258 259 #define SMAP_HASHFUNC(vp, off, hashid) \ 260 { \ 261 hashid = ((((uintptr_t)(vp) >> 6) + ((uintptr_t)(vp) >> 3) + \ 262 ((off) >> MAXBSHIFT)) & smd_hashmsk); \ 263 } 264 265 /* 266 * The most frequently updated kstat counters are kept in the 267 * per cpu array to avoid hot cache blocks. The update function 268 * sums the cpu local counters to update the global counters. 269 */ 270 271 /* ARGSUSED */ 272 int 273 segmap_kstat_update(kstat_t *ksp, int rw) 274 { 275 int i; 276 ulong_t getmap, release, get_reclaim; 277 ulong_t fault, pagecreate, get_reuse; 278 279 if (rw == KSTAT_WRITE) 280 return (EACCES); 281 getmap = release = get_reclaim = (ulong_t)0; 282 fault = pagecreate = get_reuse = (ulong_t)0; 283 for (i = 0; i < max_ncpus; i++) { 284 getmap += smd_cpu[i].scpu.scpu_getmap; 285 release += smd_cpu[i].scpu.scpu_release; 286 get_reclaim += smd_cpu[i].scpu.scpu_get_reclaim; 287 fault += smd_cpu[i].scpu.scpu_fault; 288 pagecreate += smd_cpu[i].scpu.scpu_pagecreate; 289 get_reuse += smd_cpu[i].scpu.scpu_get_reuse; 290 } 291 segmapcnt.smp_getmap.value.ul = getmap; 292 segmapcnt.smp_release.value.ul = release; 293 segmapcnt.smp_get_reclaim.value.ul = get_reclaim; 294 segmapcnt.smp_fault.value.ul = fault; 295 segmapcnt.smp_pagecreate.value.ul = pagecreate; 296 segmapcnt.smp_get_reuse.value.ul = get_reuse; 297 return (0); 298 } 299 300 int 301 segmap_create(struct seg *seg, void *argsp) 302 { 303 struct segmap_data *smd; 304 struct smap *smp; 305 struct smfree *sm; 306 struct segmap_crargs *a = (struct segmap_crargs *)argsp; 307 struct smaphash *shashp; 308 union segmap_cpu *scpu; 309 long i, npages; 310 size_t hashsz; 311 uint_t nfreelist; 312 extern void prefetch_smap_w(void *); 313 extern int max_ncpus; 314 
315 ASSERT(seg->s_as && RW_WRITE_HELD(&seg->s_as->a_lock)); 316 317 if (((uintptr_t)seg->s_base | seg->s_size) & MAXBOFFSET) { 318 panic("segkmap not MAXBSIZE aligned"); 319 /*NOTREACHED*/ 320 } 321 322 smd = kmem_zalloc(sizeof (struct segmap_data), KM_SLEEP); 323 324 seg->s_data = (void *)smd; 325 seg->s_ops = &segmap_ops; 326 smd->smd_prot = a->prot; 327 328 /* 329 * Scale the number of smap freelists to be 330 * proportional to max_ncpus * number of virtual colors. 331 * The caller can over-ride this scaling by providing 332 * a non-zero a->nfreelist argument. 333 */ 334 nfreelist = a->nfreelist; 335 if (nfreelist == 0) 336 nfreelist = max_ncpus; 337 else if (nfreelist < 0 || nfreelist > 4 * max_ncpus) { 338 cmn_err(CE_WARN, "segmap_create: nfreelist out of range " 339 "%d, using %d", nfreelist, max_ncpus); 340 nfreelist = max_ncpus; 341 } 342 if (nfreelist & (nfreelist - 1)) { 343 /* round up nfreelist to the next power of two. */ 344 nfreelist = 1 << (highbit(nfreelist)); 345 } 346 347 /* 348 * Get the number of virtual colors - must be a power of 2. 349 */ 350 if (a->shmsize) 351 smd_ncolor = a->shmsize >> MAXBSHIFT; 352 else 353 smd_ncolor = 1; 354 ASSERT((smd_ncolor & (smd_ncolor - 1)) == 0); 355 ASSERT(smd_ncolor <= SEGMAP_MAXCOLOR); 356 smd_colormsk = smd_ncolor - 1; 357 smd->smd_nfree = smd_nfree = smd_ncolor * nfreelist; 358 smd_freemsk = smd_nfree - 1; 359 360 /* 361 * Allocate and initialize the freelist headers. 362 * Note that sm_freeq[1] starts out as the release queue. This 363 * is known when the smap structures are initialized below. 
364 */ 365 smd_free = smd->smd_free = 366 kmem_zalloc(smd_nfree * sizeof (struct smfree), KM_SLEEP); 367 for (i = 0; i < smd_nfree; i++) { 368 sm = &smd->smd_free[i]; 369 mutex_init(&sm->sm_freeq[0].smq_mtx, NULL, MUTEX_DEFAULT, NULL); 370 mutex_init(&sm->sm_freeq[1].smq_mtx, NULL, MUTEX_DEFAULT, NULL); 371 sm->sm_allocq = &sm->sm_freeq[0]; 372 sm->sm_releq = &sm->sm_freeq[1]; 373 } 374 375 /* 376 * Allocate and initialize the smap hash chain headers. 377 * Compute hash size rounding down to the next power of two. 378 */ 379 npages = MAP_PAGES(seg); 380 smd->smd_npages = npages; 381 hashsz = npages / SMAP_HASHAVELEN; 382 hashsz = 1 << (highbit(hashsz)-1); 383 smd_hashmsk = hashsz - 1; 384 smd_hash = smd->smd_hash = 385 kmem_alloc(hashsz * sizeof (struct smaphash), KM_SLEEP); 386 #ifdef SEGMAP_HASHSTATS 387 smd_hash_len = 388 kmem_zalloc(hashsz * sizeof (unsigned int), KM_SLEEP); 389 #endif 390 for (i = 0, shashp = smd_hash; i < hashsz; i++, shashp++) { 391 shashp->sh_hash_list = NULL; 392 mutex_init(&shashp->sh_mtx, NULL, MUTEX_DEFAULT, NULL); 393 } 394 395 /* 396 * Allocate and initialize the smap structures. 397 * Link all slots onto the appropriate freelist. 398 * The smap array is large enough to affect boot time 399 * on large systems, so use memory prefetching and only 400 * go through the array 1 time. Inline a optimized version 401 * of segmap_smapadd to add structures to freelists with 402 * knowledge that no locks are needed here. 
403 */ 404 smd_smap = smd->smd_sm = 405 kmem_alloc(sizeof (struct smap) * npages, KM_SLEEP); 406 407 for (smp = &smd->smd_sm[MAP_PAGES(seg) - 1]; 408 smp >= smd->smd_sm; smp--) { 409 struct smap *smpfreelist; 410 struct sm_freeq *releq; 411 412 prefetch_smap_w((char *)smp); 413 414 smp->sm_vp = NULL; 415 smp->sm_hash = NULL; 416 smp->sm_off = 0; 417 smp->sm_bitmap = 0; 418 smp->sm_refcnt = 0; 419 mutex_init(&smp->sm_mtx, NULL, MUTEX_DEFAULT, NULL); 420 smp->sm_free_ndx = SMP2SMF_NDX(smp); 421 422 sm = SMP2SMF(smp); 423 releq = sm->sm_releq; 424 425 smpfreelist = releq->smq_free; 426 if (smpfreelist == 0) { 427 releq->smq_free = smp->sm_next = smp->sm_prev = smp; 428 } else { 429 smp->sm_next = smpfreelist; 430 smp->sm_prev = smpfreelist->sm_prev; 431 smpfreelist->sm_prev = smp; 432 smp->sm_prev->sm_next = smp; 433 releq->smq_free = smp->sm_next; 434 } 435 436 /* 437 * sm_flag = 0 (no SM_QNDX_ZERO) implies smap on sm_freeq[1] 438 */ 439 smp->sm_flags = 0; 440 441 #ifdef SEGKPM_SUPPORT 442 /* 443 * Due to the fragile prefetch loop no 444 * separate function is used here. 445 */ 446 smp->sm_kpme_next = NULL; 447 smp->sm_kpme_prev = NULL; 448 smp->sm_kpme_page = NULL; 449 #endif 450 } 451 452 /* 453 * Allocate the per color indices that distribute allocation 454 * requests over the free lists. Each cpu will have a private 455 * rotor index to spread the allocations even across the available 456 * smap freelists. Init the scpu_last_smap field to the first 457 * smap element so there is no need to check for NULL. 458 */ 459 smd_cpu = 460 kmem_zalloc(sizeof (union segmap_cpu) * max_ncpus, KM_SLEEP); 461 for (i = 0, scpu = smd_cpu; i < max_ncpus; i++, scpu++) { 462 int j; 463 for (j = 0; j < smd_ncolor; j++) 464 scpu->scpu.scpu_free_ndx[j] = j; 465 scpu->scpu.scpu_last_smap = smd_smap; 466 } 467 468 if (vpm_enable) { 469 vpm_init(); 470 } 471 472 #ifdef DEBUG 473 /* 474 * Keep track of which colors are used more often. 
475 */ 476 colors_used = kmem_zalloc(smd_nfree * sizeof (int), KM_SLEEP); 477 #endif /* DEBUG */ 478 479 return (0); 480 } 481 482 static void 483 segmap_free(seg) 484 struct seg *seg; 485 { 486 ASSERT(seg->s_as && RW_WRITE_HELD(&seg->s_as->a_lock)); 487 } 488 489 /* 490 * Do a F_SOFTUNLOCK call over the range requested. 491 * The range must have already been F_SOFTLOCK'ed. 492 */ 493 static void 494 segmap_unlock( 495 struct hat *hat, 496 struct seg *seg, 497 caddr_t addr, 498 size_t len, 499 enum seg_rw rw, 500 struct smap *smp) 501 { 502 page_t *pp; 503 caddr_t adr; 504 u_offset_t off; 505 struct vnode *vp; 506 kmutex_t *smtx; 507 508 ASSERT(smp->sm_refcnt > 0); 509 510 #ifdef lint 511 seg = seg; 512 #endif 513 514 if (segmap_kpm && IS_KPM_ADDR(addr)) { 515 516 /* 517 * We're called only from segmap_fault and this was a 518 * NOP in case of a kpm based smap, so dangerous things 519 * must have happened in the meantime. Pages are prefaulted 520 * and locked in segmap_getmapflt and they will not be 521 * unlocked until segmap_release. 522 */ 523 panic("segmap_unlock: called with kpm addr %p", (void *)addr); 524 /*NOTREACHED*/ 525 } 526 527 vp = smp->sm_vp; 528 off = smp->sm_off + (u_offset_t)((uintptr_t)addr & MAXBOFFSET); 529 530 hat_unlock(hat, addr, P2ROUNDUP(len, PAGESIZE)); 531 for (adr = addr; adr < addr + len; adr += PAGESIZE, off += PAGESIZE) { 532 ushort_t bitmask; 533 534 /* 535 * Use page_find() instead of page_lookup() to 536 * find the page since we know that it has 537 * "shared" lock. 538 */ 539 pp = page_find(vp, off); 540 if (pp == NULL) { 541 panic("segmap_unlock: page not found"); 542 /*NOTREACHED*/ 543 } 544 545 if (rw == S_WRITE) { 546 hat_setrefmod(pp); 547 } else if (rw != S_OTHER) { 548 TRACE_3(TR_FAC_VM, TR_SEGMAP_FAULT, 549 "segmap_fault:pp %p vp %p offset %llx", 550 pp, vp, off); 551 hat_setref(pp); 552 } 553 554 /* 555 * Clear bitmap, if the bit corresponding to "off" is set, 556 * since the page and translation are being unlocked. 
557 */ 558 bitmask = SMAP_BIT_MASK((off - smp->sm_off) >> PAGESHIFT); 559 560 /* 561 * Large Files: Following assertion is to verify 562 * the correctness of the cast to (int) above. 563 */ 564 ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX); 565 smtx = SMAPMTX(smp); 566 mutex_enter(smtx); 567 if (smp->sm_bitmap & bitmask) { 568 smp->sm_bitmap &= ~bitmask; 569 } 570 mutex_exit(smtx); 571 572 page_unlock(pp); 573 } 574 } 575 576 #define MAXPPB (MAXBSIZE/4096) /* assumes minimum page size of 4k */ 577 578 /* 579 * This routine is called via a machine specific fault handling 580 * routine. It is also called by software routines wishing to 581 * lock or unlock a range of addresses. 582 * 583 * Note that this routine expects a page-aligned "addr". 584 */ 585 faultcode_t 586 segmap_fault( 587 struct hat *hat, 588 struct seg *seg, 589 caddr_t addr, 590 size_t len, 591 enum fault_type type, 592 enum seg_rw rw) 593 { 594 struct segmap_data *smd = (struct segmap_data *)seg->s_data; 595 struct smap *smp; 596 page_t *pp, **ppp; 597 struct vnode *vp; 598 u_offset_t off; 599 page_t *pl[MAXPPB + 1]; 600 uint_t prot; 601 u_offset_t addroff; 602 caddr_t adr; 603 int err; 604 u_offset_t sm_off; 605 int hat_flag; 606 607 if (segmap_kpm && IS_KPM_ADDR(addr)) { 608 int newpage; 609 kmutex_t *smtx; 610 611 /* 612 * Pages are successfully prefaulted and locked in 613 * segmap_getmapflt and can't be unlocked until 614 * segmap_release. No hat mappings have to be locked 615 * and they also can't be unlocked as long as the 616 * caller owns an active kpm addr. 617 */ 618 #ifndef DEBUG 619 if (type != F_SOFTUNLOCK) 620 return (0); 621 #endif 622 623 if ((smp = get_smap_kpm(addr, NULL)) == NULL) { 624 panic("segmap_fault: smap not found " 625 "for addr %p", (void *)addr); 626 /*NOTREACHED*/ 627 } 628 629 smtx = SMAPMTX(smp); 630 #ifdef DEBUG 631 newpage = smp->sm_flags & SM_KPM_NEWPAGE; 632 if (newpage) { 633 cmn_err(CE_WARN, "segmap_fault: newpage? 
smp %p", 634 (void *)smp); 635 } 636 637 if (type != F_SOFTUNLOCK) { 638 mutex_exit(smtx); 639 return (0); 640 } 641 #endif 642 mutex_exit(smtx); 643 vp = smp->sm_vp; 644 sm_off = smp->sm_off; 645 646 if (vp == NULL) 647 return (FC_MAKE_ERR(EIO)); 648 649 ASSERT(smp->sm_refcnt > 0); 650 651 addroff = (u_offset_t)((uintptr_t)addr & MAXBOFFSET); 652 if (addroff + len > MAXBSIZE) 653 panic("segmap_fault: endaddr %p exceeds MAXBSIZE chunk", 654 (void *)(addr + len)); 655 656 off = sm_off + addroff; 657 658 pp = page_find(vp, off); 659 660 if (pp == NULL) 661 panic("segmap_fault: softunlock page not found"); 662 663 /* 664 * Set ref bit also here in case of S_OTHER to avoid the 665 * overhead of supporting other cases than F_SOFTUNLOCK 666 * with segkpm. We can do this because the underlying 667 * pages are locked anyway. 668 */ 669 if (rw == S_WRITE) { 670 hat_setrefmod(pp); 671 } else { 672 TRACE_3(TR_FAC_VM, TR_SEGMAP_FAULT, 673 "segmap_fault:pp %p vp %p offset %llx", 674 pp, vp, off); 675 hat_setref(pp); 676 } 677 678 return (0); 679 } 680 681 smd_cpu[CPU->cpu_seqid].scpu.scpu_fault++; 682 smp = GET_SMAP(seg, addr); 683 vp = smp->sm_vp; 684 sm_off = smp->sm_off; 685 686 if (vp == NULL) 687 return (FC_MAKE_ERR(EIO)); 688 689 ASSERT(smp->sm_refcnt > 0); 690 691 addroff = (u_offset_t)((uintptr_t)addr & MAXBOFFSET); 692 if (addroff + len > MAXBSIZE) { 693 panic("segmap_fault: endaddr %p " 694 "exceeds MAXBSIZE chunk", (void *)(addr + len)); 695 /*NOTREACHED*/ 696 } 697 off = sm_off + addroff; 698 699 /* 700 * First handle the easy stuff 701 */ 702 if (type == F_SOFTUNLOCK) { 703 segmap_unlock(hat, seg, addr, len, rw, smp); 704 return (0); 705 } 706 707 TRACE_3(TR_FAC_VM, TR_SEGMAP_GETPAGE, 708 "segmap_getpage:seg %p addr %p vp %p", seg, addr, vp); 709 err = VOP_GETPAGE(vp, (offset_t)off, len, &prot, pl, MAXBSIZE, 710 seg, addr, rw, CRED(), NULL); 711 712 if (err) 713 return (FC_MAKE_ERR(err)); 714 715 prot &= smd->smd_prot; 716 717 /* 718 * Handle all pages returned in 
the pl[] array. 719 * This loop is coded on the assumption that if 720 * there was no error from the VOP_GETPAGE routine, 721 * that the page list returned will contain all the 722 * needed pages for the vp from [off..off + len]. 723 */ 724 ppp = pl; 725 while ((pp = *ppp++) != NULL) { 726 u_offset_t poff; 727 ASSERT(pp->p_vnode == vp); 728 hat_flag = HAT_LOAD; 729 730 /* 731 * Verify that the pages returned are within the range 732 * of this segmap region. Note that it is theoretically 733 * possible for pages outside this range to be returned, 734 * but it is not very likely. If we cannot use the 735 * page here, just release it and go on to the next one. 736 */ 737 if (pp->p_offset < sm_off || 738 pp->p_offset >= sm_off + MAXBSIZE) { 739 (void) page_release(pp, 1); 740 continue; 741 } 742 743 ASSERT(hat == kas.a_hat); 744 poff = pp->p_offset; 745 adr = addr + (poff - off); 746 if (adr >= addr && adr < addr + len) { 747 hat_setref(pp); 748 TRACE_3(TR_FAC_VM, TR_SEGMAP_FAULT, 749 "segmap_fault:pp %p vp %p offset %llx", 750 pp, vp, poff); 751 if (type == F_SOFTLOCK) 752 hat_flag = HAT_LOAD_LOCK; 753 } 754 755 /* 756 * Deal with VMODSORT pages here. If we know this is a write 757 * do the setmod now and allow write protection. 758 * As long as it's modified or not S_OTHER, remove write 759 * protection. With S_OTHER it's up to the FS to deal with this. 760 */ 761 if (IS_VMODSORT(vp)) { 762 if (rw == S_WRITE) 763 hat_setmod(pp); 764 else if (rw != S_OTHER && !hat_ismod(pp)) 765 prot &= ~PROT_WRITE; 766 } 767 768 hat_memload(hat, adr, pp, prot, hat_flag); 769 if (hat_flag != HAT_LOAD_LOCK) 770 page_unlock(pp); 771 } 772 return (0); 773 } 774 775 /* 776 * This routine is used to start I/O on pages asynchronously. 
777 */ 778 static faultcode_t 779 segmap_faulta(struct seg *seg, caddr_t addr) 780 { 781 struct smap *smp; 782 struct vnode *vp; 783 u_offset_t off; 784 int err; 785 786 if (segmap_kpm && IS_KPM_ADDR(addr)) { 787 int newpage; 788 kmutex_t *smtx; 789 790 /* 791 * Pages are successfully prefaulted and locked in 792 * segmap_getmapflt and can't be unlocked until 793 * segmap_release. No hat mappings have to be locked 794 * and they also can't be unlocked as long as the 795 * caller owns an active kpm addr. 796 */ 797 #ifdef DEBUG 798 if ((smp = get_smap_kpm(addr, NULL)) == NULL) { 799 panic("segmap_faulta: smap not found " 800 "for addr %p", (void *)addr); 801 /*NOTREACHED*/ 802 } 803 804 smtx = SMAPMTX(smp); 805 newpage = smp->sm_flags & SM_KPM_NEWPAGE; 806 mutex_exit(smtx); 807 if (newpage) 808 cmn_err(CE_WARN, "segmap_faulta: newpage? smp %p", 809 (void *)smp); 810 #endif 811 return (0); 812 } 813 814 segmapcnt.smp_faulta.value.ul++; 815 smp = GET_SMAP(seg, addr); 816 817 ASSERT(smp->sm_refcnt > 0); 818 819 vp = smp->sm_vp; 820 off = smp->sm_off; 821 822 if (vp == NULL) { 823 cmn_err(CE_WARN, "segmap_faulta - no vp"); 824 return (FC_MAKE_ERR(EIO)); 825 } 826 827 TRACE_3(TR_FAC_VM, TR_SEGMAP_GETPAGE, 828 "segmap_getpage:seg %p addr %p vp %p", seg, addr, vp); 829 830 err = VOP_GETPAGE(vp, (offset_t)(off + ((offset_t)((uintptr_t)addr 831 & MAXBOFFSET))), PAGESIZE, (uint_t *)NULL, (page_t **)NULL, 0, 832 seg, addr, S_READ, CRED(), NULL); 833 834 if (err) 835 return (FC_MAKE_ERR(err)); 836 return (0); 837 } 838 839 /*ARGSUSED*/ 840 static int 841 segmap_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) 842 { 843 struct segmap_data *smd = (struct segmap_data *)seg->s_data; 844 845 ASSERT(seg->s_as && RW_LOCK_HELD(&seg->s_as->a_lock)); 846 847 /* 848 * Need not acquire the segment lock since 849 * "smd_prot" is a read-only field. 850 */ 851 return (((smd->smd_prot & prot) != prot) ? 
EACCES : 0); 852 } 853 854 static int 855 segmap_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv) 856 { 857 struct segmap_data *smd = (struct segmap_data *)seg->s_data; 858 size_t pgno = seg_page(seg, addr + len) - seg_page(seg, addr) + 1; 859 860 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 861 862 if (pgno != 0) { 863 do 864 protv[--pgno] = smd->smd_prot; 865 while (pgno != 0); 866 } 867 return (0); 868 } 869 870 static u_offset_t 871 segmap_getoffset(struct seg *seg, caddr_t addr) 872 { 873 struct segmap_data *smd = (struct segmap_data *)seg->s_data; 874 875 ASSERT(seg->s_as && RW_READ_HELD(&seg->s_as->a_lock)); 876 877 return ((u_offset_t)smd->smd_sm->sm_off + (addr - seg->s_base)); 878 } 879 880 /*ARGSUSED*/ 881 static int 882 segmap_gettype(struct seg *seg, caddr_t addr) 883 { 884 ASSERT(seg->s_as && RW_READ_HELD(&seg->s_as->a_lock)); 885 886 return (MAP_SHARED); 887 } 888 889 /*ARGSUSED*/ 890 static int 891 segmap_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp) 892 { 893 struct segmap_data *smd = (struct segmap_data *)seg->s_data; 894 895 ASSERT(seg->s_as && RW_READ_HELD(&seg->s_as->a_lock)); 896 897 /* XXX - This doesn't make any sense */ 898 *vpp = smd->smd_sm->sm_vp; 899 return (0); 900 } 901 902 /* 903 * Check to see if it makes sense to do kluster/read ahead to 904 * addr + delta relative to the mapping at addr. We assume here 905 * that delta is a signed PAGESIZE'd multiple (which can be negative). 906 * 907 * For segmap we always "approve" of this action from our standpoint. 908 */ 909 /*ARGSUSED*/ 910 static int 911 segmap_kluster(struct seg *seg, caddr_t addr, ssize_t delta) 912 { 913 return (0); 914 } 915 916 static void 917 segmap_badop() 918 { 919 panic("segmap_badop"); 920 /*NOTREACHED*/ 921 } 922 923 /* 924 * Special private segmap operations 925 */ 926 927 /* 928 * Add smap to the appropriate free list. 
929 */ 930 static void 931 segmap_smapadd(struct smap *smp) 932 { 933 struct smfree *sm; 934 struct smap *smpfreelist; 935 struct sm_freeq *releq; 936 937 ASSERT(MUTEX_HELD(SMAPMTX(smp))); 938 939 if (smp->sm_refcnt != 0) { 940 panic("segmap_smapadd"); 941 /*NOTREACHED*/ 942 } 943 944 sm = &smd_free[smp->sm_free_ndx]; 945 /* 946 * Add to the tail of the release queue 947 * Note that sm_releq and sm_allocq could toggle 948 * before we get the lock. This does not affect 949 * correctness as the 2 queues are only maintained 950 * to reduce lock pressure. 951 */ 952 releq = sm->sm_releq; 953 if (releq == &sm->sm_freeq[0]) 954 smp->sm_flags |= SM_QNDX_ZERO; 955 else 956 smp->sm_flags &= ~SM_QNDX_ZERO; 957 mutex_enter(&releq->smq_mtx); 958 smpfreelist = releq->smq_free; 959 if (smpfreelist == 0) { 960 int want; 961 962 releq->smq_free = smp->sm_next = smp->sm_prev = smp; 963 /* 964 * Both queue mutexes held to set sm_want; 965 * snapshot the value before dropping releq mutex. 966 * If sm_want appears after the releq mutex is dropped, 967 * then the smap just freed is already gone. 968 */ 969 want = sm->sm_want; 970 mutex_exit(&releq->smq_mtx); 971 /* 972 * See if there was a waiter before dropping the releq mutex 973 * then recheck after obtaining sm_freeq[0] mutex as 974 * the another thread may have already signaled. 
975 */ 976 if (want) { 977 mutex_enter(&sm->sm_freeq[0].smq_mtx); 978 if (sm->sm_want) 979 cv_signal(&sm->sm_free_cv); 980 mutex_exit(&sm->sm_freeq[0].smq_mtx); 981 } 982 } else { 983 smp->sm_next = smpfreelist; 984 smp->sm_prev = smpfreelist->sm_prev; 985 smpfreelist->sm_prev = smp; 986 smp->sm_prev->sm_next = smp; 987 mutex_exit(&releq->smq_mtx); 988 } 989 } 990 991 992 static struct smap * 993 segmap_hashin(struct smap *smp, struct vnode *vp, u_offset_t off, int hashid) 994 { 995 struct smap **hpp; 996 struct smap *tmp; 997 kmutex_t *hmtx; 998 999 ASSERT(MUTEX_HELD(SMAPMTX(smp))); 1000 ASSERT(smp->sm_vp == NULL); 1001 ASSERT(smp->sm_hash == NULL); 1002 ASSERT(smp->sm_prev == NULL); 1003 ASSERT(smp->sm_next == NULL); 1004 ASSERT(hashid >= 0 && hashid <= smd_hashmsk); 1005 1006 hmtx = SHASHMTX(hashid); 1007 1008 mutex_enter(hmtx); 1009 /* 1010 * First we need to verify that no one has created a smp 1011 * with (vp,off) as its tag before we us. 1012 */ 1013 for (tmp = smd_hash[hashid].sh_hash_list; 1014 tmp != NULL; tmp = tmp->sm_hash) 1015 if (tmp->sm_vp == vp && tmp->sm_off == off) 1016 break; 1017 1018 if (tmp == NULL) { 1019 /* 1020 * No one created one yet. 1021 * 1022 * Funniness here - we don't increment the ref count on the 1023 * vnode * even though we have another pointer to it here. 1024 * The reason for this is that we don't want the fact that 1025 * a seg_map entry somewhere refers to a vnode to prevent the 1026 * vnode * itself from going away. This is because this 1027 * reference to the vnode is a "soft one". In the case where 1028 * a mapping is being used by a rdwr [or directory routine?] 1029 * there already has to be a non-zero ref count on the vnode. 1030 * In the case where the vp has been freed and the the smap 1031 * structure is on the free list, there are no pages in memory 1032 * that can refer to the vnode. 
Thus even if we reuse the same 1033 * vnode/smap structure for a vnode which has the same 1034 * address but represents a different object, we are ok. 1035 */ 1036 smp->sm_vp = vp; 1037 smp->sm_off = off; 1038 1039 hpp = &smd_hash[hashid].sh_hash_list; 1040 smp->sm_hash = *hpp; 1041 *hpp = smp; 1042 #ifdef SEGMAP_HASHSTATS 1043 smd_hash_len[hashid]++; 1044 #endif 1045 } 1046 mutex_exit(hmtx); 1047