Home | History | Annotate | Download | only in vm
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License, Version 1.0 only
      6  * (the "License").  You may not use this file except in compliance
      7  * with the License.
      8  *
      9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
     10  * or http://www.opensolaris.org/os/licensing.
     11  * See the License for the specific language governing permissions
     12  * and limitations under the License.
     13  *
     14  * When distributing Covered Code, include this CDDL HEADER in each
     15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     16  * If applicable, add the following below this CDDL HEADER, with the
     17  * fields enclosed by brackets "[]" replaced with your own identifying
     18  * information: Portions Copyright [yyyy] [name of copyright owner]
     19  *
     20  * CDDL HEADER END
     21  */
     22 /*
     23  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 #pragma ident	"@(#)seg_kpm.c	1.7	06/01/25 SMI"
     28 
     29 /*
     30  * Kernel Physical Mapping (kpm) segment driver (segkpm).
     31  *
     32  * This driver delivers along with the hat_kpm* interfaces an alternative
     33  * mechanism for kernel mappings within the 64-bit Solaris operating system,
     34  * which allows the mapping of all physical memory into the kernel address
     35  * space at once. This is feasible in 64 bit kernels, e.g. for Ultrasparc II
     36  * and beyond processors, since the available VA range is much larger than
     37  * possible physical memory. Currently, all physical memory that is
     38  * represented by the list of memory segments (memsegs) is supported.
     39  *
     40  * Segkpm mappings also have very low overhead, and large pages are used
     41  * (when possible) to minimize the TLB and TSB footprint. It is also
     42  * extensible to architectures other than SPARC (e.g. AMD64). The main
     43  * advantage is the avoidance of the TLB-shootdown X-calls, which are
     44  * normally needed when a kernel (global) mapping has to be removed.
     45  *
     46  * The first example of a kernel facility that uses the segkpm mapping
     47  * scheme is seg_map, where it is used as an alternative to hat_memload().
     48  * See also the hat layer for more information about the hat_kpm* routines.
     49  * The kpm facility can be turned off at boot time (e.g. via /etc/system).
     50  */
     51 
     52 #include <sys/types.h>
     53 #include <sys/param.h>
     54 #include <sys/sysmacros.h>
     55 #include <sys/systm.h>
     56 #include <sys/vnode.h>
     57 #include <sys/cmn_err.h>
     58 #include <sys/debug.h>
     59 #include <sys/thread.h>
     60 #include <sys/cpuvar.h>
     61 #include <sys/bitmap.h>
     62 #include <sys/atomic.h>
     63 #include <sys/lgrp.h>
     64 
     65 #include <vm/seg_kmem.h>
     66 #include <vm/seg_kpm.h>
     67 #include <vm/hat.h>
     68 #include <vm/as.h>
     69 #include <vm/seg.h>
     70 #include <vm/page.h>
     71 
     72 /*
     73  * Global kpm controls.
     74  * See also platform and mmu specific controls.
     75  *
     76  * kpm_enable -- global on/off switch for segkpm.
     77  * . Set by default on 64bit platforms that have kpm support.
     78  * . Will be disabled from platform layer if not supported.
     79  * . Can be disabled via /etc/system.
     80  *
     81  * kpm_smallpages -- use only regular/system pagesize for kpm mappings.
     82  * . Can be useful for critical debugging of kpm clients.
     83  * . Set to zero by default for platforms that support kpm large pages.
     84  *   The use of kpm large pages reduces the footprint of kpm meta data
     85  *   and has all the other advantages of using large pages (e.g TLB
     86  *   miss reduction).
     87  * . Set by default for platforms that don't support kpm large pages or
     88  *   where large pages cannot be used for other reasons (e.g. there are
     89  *   only a few fully associative TLB entries available for large pages).
     90  *
     91  * segmap_kpm -- separate on/off switch for segmap using segkpm:
     92  * . Set by default.
     93  * . Will be disabled when kpm_enable is zero.
     94  * . Will be disabled when MAXBSIZE != PAGESIZE.
     95  * . Can be disabled via /etc/system.
     96  *
     97  */
int kpm_enable = 1;		/* global on/off switch for segkpm */
int kpm_smallpages = 0;		/* non-zero: only system pagesize mappings */
int segmap_kpm = 1;		/* on/off switch for segmap using segkpm */
    101 
    102 /*
    103  * Private seg op routines.
    104  */
    105 faultcode_t segkpm_fault(struct hat *hat, struct seg *seg, caddr_t addr,
    106 			size_t len, enum fault_type type, enum seg_rw rw);
    107 static void	segkpm_dump(struct seg *);
    108 static void	segkpm_badop(void);
    109 static int	segkpm_notsup(void);
    110 static int	segkpm_capable(struct seg *, segcapability_t);
    111 
    112 #define	SEGKPM_BADOP(t)	(t(*)())segkpm_badop
    113 #define	SEGKPM_NOTSUP	(int(*)())segkpm_notsup
    114 
/*
 * Segment operations vector installed on the kpm segment by
 * segkpm_create(). Only fault, dump, pagelock and capable have
 * dedicated handlers: pagelock is answered by segkpm_notsup (returns
 * ENOTSUP) and every remaining slot is routed to segkpm_badop, which
 * panics when SEGKPM_SUPPORT is defined and is an empty stub otherwise.
 * The initializer is positional, so the entry order must match the
 * member order of struct seg_ops.
 */
static struct seg_ops segkpm_ops = {
	SEGKPM_BADOP(int),	/* dup */
	SEGKPM_BADOP(int),	/* unmap */
	SEGKPM_BADOP(void),	/* free */
	segkpm_fault,		/* fault */
	SEGKPM_BADOP(int),	/* faulta */
	SEGKPM_BADOP(int),	/* setprot */
	SEGKPM_BADOP(int),	/* checkprot */
	SEGKPM_BADOP(int),	/* kluster */
	SEGKPM_BADOP(size_t),	/* swapout */
	SEGKPM_BADOP(int),	/* sync */
	SEGKPM_BADOP(size_t),	/* incore */
	SEGKPM_BADOP(int),	/* lockop */
	SEGKPM_BADOP(int),	/* getprot */
	SEGKPM_BADOP(u_offset_t), /* getoffset */
	SEGKPM_BADOP(int),	/* gettype */
	SEGKPM_BADOP(int),	/* getvp */
	SEGKPM_BADOP(int),	/* advise */
	segkpm_dump,		/* dump */
	SEGKPM_NOTSUP,		/* pagelock */
	SEGKPM_BADOP(int),	/* setpgsz */
	SEGKPM_BADOP(int),	/* getmemid */
	SEGKPM_BADOP(lgrp_mem_policy_info_t *),	/* getpolicy */
	segkpm_capable,		/* capable */
};
    140 
/*
 * kpm page geometry.
 *
 * kpm_pgsz and kpm_pgshft are set by platform layer.
 * NOTE(review): kpm_pgoff, kpmp2pshft and kpmpnpgs are not assigned
 * anywhere in this file; presumably they are derived by the platform
 * layer as well -- confirm.
 */
size_t		kpm_pgsz;	/* kpm page size */
uint_t		kpm_pgshft;	/* kpm page shift */
u_offset_t	kpm_pgoff;	/* kpm page offset mask */
uint_t		kpmp2pshft;	/* kpm page to page shift */
pgcnt_t		kpmpnpgs;	/* how many pages per kpm page */
    149 
    150 
    151 #ifdef	SEGKPM_SUPPORT
    152 
    153 int
    154 segkpm_create(struct seg *seg, void *argsp)
    155 {
    156 	struct segkpm_data *skd;
    157 	struct segkpm_crargs *b = (struct segkpm_crargs *)argsp;
    158 	ushort_t *p;
    159 	int i, j;
    160 
    161 	ASSERT(seg->s_as && RW_WRITE_HELD(&seg->s_as->a_lock));
    162 	ASSERT(btokpmp(seg->s_size) >= 1 &&
    163 		kpmpageoff((uintptr_t)seg->s_base) == 0 &&
    164 		kpmpageoff((uintptr_t)seg->s_base + seg->s_size) == 0);
    165 
    166 	skd = kmem_zalloc(sizeof (struct segkpm_data), KM_SLEEP);
    167 
    168 	seg->s_data = (void *)skd;
    169 	seg->s_ops = &segkpm_ops;
    170 	skd->skd_prot = b->prot;
    171 
    172 	/*
    173 	 * (1) Segkpm virtual addresses are based on physical adresses.
    174 	 * From this and in opposite to other segment drivers it is
    175 	 * often required to allocate a page first to be able to
    176 	 * calculate the final segkpm virtual address.
    177 	 * (2) Page  allocation is done by calling page_create_va(),
    178 	 * one important input argument is a virtual address (also
    179 	 * expressed by the "va" in the function name). This function
    180 	 * is highly optimized to select the right page for an optimal
    181 	 * processor and platform support (e.g. virtual addressed
    182 	 * caches (VAC), physical addressed caches, NUMA).
    183 	 *
    184 	 * Because of (1) the approach is to generate a faked virtual
    185 	 * address for calling page_create_va(). In order to exploit
    186 	 * the abilities of (2), especially to utilize the cache
    187 	 * hierarchy (3) and to avoid VAC alias conflicts (4) the
    188 	 * selection has to be done carefully. For each virtual color
    189 	 * a separate counter is provided (4). The count values are
    190 	 * used for the utilization of all cache lines (3) and are
    191 	 * corresponding to the cache bins.
    192 	 */
    193 	skd->skd_nvcolors = b->nvcolors;
    194 
    195 	p = skd->skd_va_select =
    196 		kmem_zalloc(NCPU * b->nvcolors * sizeof (ushort_t), KM_SLEEP);
    197 
    198 	for (i = 0; i < NCPU; i++)
    199 		for (j = 0; j < b->nvcolors; j++, p++)
    200 			*p = j;
    201 
    202 	return (0);
    203 }
    204 
    205 /*
    206  * This routine is called via a machine specific fault handling
    207  * routine.
    208  */
    209 /* ARGSUSED */
    210 faultcode_t
    211 segkpm_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len,
    212 	enum fault_type type, enum seg_rw rw)
    213 {
    214 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
    215 
    216 	switch (type) {
    217 	case F_INVAL:
    218 		return (hat_kpm_fault(hat, addr));
    219 	case F_SOFTLOCK:
    220 	case F_SOFTUNLOCK:
    221 		return (0);
    222 	default:
    223 		return (FC_NOSUPPORT);
    224 	}
    225 	/*NOTREACHED*/
    226 }
    227 
    228 #define	addr_to_vcolor(addr, vcolors) \
    229 	((int)(((uintptr_t)(addr) & ((vcolors << PAGESHIFT) - 1)) >> PAGESHIFT))
    230 
    231 /*
    232  * Create a virtual address that can be used for invocations of
    233  * page_create_va. Goal is to utilize the cache hierarchy (round
    234  * robin bins) and to select the right color for virtual indexed
    235  * caches. It isn't exact since we also increment the bin counter
    236  * when the caller uses VOP_GETPAGE and gets a hit in the page
    237  * cache, but we keep the bins turning for cache distribution
    238  * (see also segkpm_create block comment).
    239  */
    240 caddr_t
    241 segkpm_create_va(u_offset_t off)
    242 {
    243 	int vcolor;
    244 	ushort_t *p;
    245 	struct segkpm_data *skd = (struct segkpm_data *)segkpm->s_data;
    246 	int nvcolors = skd->skd_nvcolors;
    247 	caddr_t	va;
    248 
    249 	vcolor = (nvcolors > 1) ? addr_to_vcolor(off, nvcolors) : 0;
    250 	p = &skd->skd_va_select[(CPU->cpu_id * nvcolors) + vcolor];
    251 	va = (caddr_t)ptob(*p);
    252 
    253 	atomic_add_16(p, nvcolors);
    254 
    255 	return (va);
    256 }
    257 
    258 /*
    259  * Unload mapping if the instance has an active kpm mapping.
    260  */
    261 void
    262 segkpm_mapout_validkpme(struct kpme *kpme)
    263 {
    264 	caddr_t vaddr;
    265 	page_t *pp;
    266 
    267 retry:
    268 	if ((pp = kpme->kpe_page) == NULL) {
    269 		return;
    270 	}
    271 
    272 	if (page_lock(pp, SE_SHARED, (kmutex_t *)NULL, P_RECLAIM) == 0)
    273 		goto retry;
    274 
    275 	/*
    276 	 * Check if segkpm mapping is not unloaded in the meantime
    277 	 */
    278 	if (kpme->kpe_page == NULL) {
    279 		page_unlock(pp);
    280 		return;
    281 	}
    282 
    283 	vaddr = hat_kpm_page2va(pp, 1);
    284 	hat_kpm_mapout(pp, kpme, vaddr);
    285 	page_unlock(pp);
    286 }
    287 
    288 static void
    289 segkpm_badop()
    290 {
    291 	panic("segkpm_badop");
    292 }
    293 
    294 #else	/* SEGKPM_SUPPORT */
    295 
    296 /* segkpm stubs */
    297 
    298 /*ARGSUSED*/
    299 int segkpm_create(struct seg *seg, void *argsp) { return (0); }
    300 
/*
 * Stub used when SEGKPM_SUPPORT is not defined: every fault is
 * answered with 0 and no arguments are examined.
 */
/* ARGSUSED */
faultcode_t
segkpm_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len,
	enum fault_type type, enum seg_rw rw)
{
	return ((faultcode_t)0);
}
    308 
    309 /* ARGSUSED */
    310 caddr_t segkpm_create_va(u_offset_t off) { return (NULL); }
    311 
    312 /* ARGSUSED */
    313 void segkpm_mapout_validkpme(struct kpme *kpme) {}
    314 
    315 static void
    316 segkpm_badop() {}
    317 
    318 #endif	/* SEGKPM_SUPPORT */
    319 
    320 static int
    321 segkpm_notsup()
    322 {
    323 	return (ENOTSUP);
    324 }
    325 
    326 /*
    327  * segkpm pages are not dumped, so we just return
    328  */
    329 /*ARGSUSED*/
    330 static void
    331 segkpm_dump(struct seg *seg)
    332 {}
    333 
/*
 * Capability query entry point. We claim to have no special
 * capabilities: 0 is returned whatever capability is asked about,
 * and neither argument is examined.
 */
/*ARGSUSED*/
static int
segkpm_capable(struct seg *seg, segcapability_t capability)
{
	return (0);
}
    343