Home | History | Annotate | Download | only in cpr
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #pragma ident	"@(#)cpr_misc.c	1.129	07/10/25 SMI"
     27 
     28 #include <sys/types.h>
     29 #include <sys/errno.h>
     30 #include <sys/cpuvar.h>
     31 #include <sys/vfs.h>
     32 #include <sys/vnode.h>
     33 #include <sys/pathname.h>
     34 #include <sys/callb.h>
     35 #include <sys/fs/ufs_inode.h>
     36 #include <vm/anon.h>
     37 #include <sys/fs/swapnode.h>	/* for swapfs_minfree */
     38 #include <sys/kmem.h>
     39 #include <sys/cpr.h>
     40 #include <sys/conf.h>
     41 #include <sys/machclock.h>
     42 
     43 /*
     44  * CPR miscellaneous support routines
     45  */
     46 #define	cpr_open(path, mode,  vpp)	(vn_open(path, UIO_SYSSPACE, \
     47 		mode, 0600, vpp, CRCREAT, 0))
     48 #define	cpr_rdwr(rw, vp, basep, cnt)	(vn_rdwr(rw, vp,  (caddr_t)(basep), \
     49 		cnt, 0LL, UIO_SYSSPACE, 0, (rlim64_t)MAXOFF_T, CRED(), \
     50 		(ssize_t *)NULL))
     51 
     52 extern void clkset(time_t);
     53 extern cpu_t *i_cpr_bootcpu(void);
     54 extern caddr_t i_cpr_map_setup(void);
     55 extern void i_cpr_free_memory_resources(void);
     56 
     57 extern kmutex_t cpr_slock;
     58 extern size_t cpr_buf_size;
     59 extern char *cpr_buf;
     60 extern size_t cpr_pagedata_size;
     61 extern char *cpr_pagedata;
     62 extern int cpr_bufs_allocated;
     63 extern int cpr_bitmaps_allocated;
     64 
     65 #if defined(__sparc)
     66 static struct cprconfig cprconfig;
     67 static int cprconfig_loaded = 0;
     68 static int cpr_statefile_ok(vnode_t *, int);
     69 static int cpr_p_online(cpu_t *, int);
     70 static void cpr_save_mp_state(void);
     71 #endif
     72 
     73 int cpr_is_ufs(struct vfs *);
     74 
     75 char cpr_default_path[] = CPR_DEFAULT;
     76 
     77 #define	COMPRESS_PERCENT 40	/* approx compression ratio in percent */
     78 #define	SIZE_RATE	115	/* increase size by 15% */
     79 #define	INTEGRAL	100	/* for integer math */
     80 
     81 
     82 /*
     83  * cmn_err() followed by a 1/4 second delay; this gives the
     84  * logging service a chance to flush messages and helps avoid
     85  * intermixing output from prom_printf().
     86  */
     87 /*PRINTFLIKE2*/
     88 void
     89 cpr_err(int ce, const char *fmt, ...)
     90 {
     91 	va_list adx;
     92 
     93 	va_start(adx, fmt);
     94 	vcmn_err(ce, fmt, adx);
     95 	va_end(adx);
     96 	drv_usecwait(MICROSEC >> 2);
     97 }
     98 
     99 
    100 int
    101 cpr_init(int fcn)
    102 {
    103 	/*
    104 	 * Allow only one suspend/resume process.
    105 	 */
    106 	if (mutex_tryenter(&cpr_slock) == 0)
    107 		return (EBUSY);
    108 
    109 	CPR->c_flags = 0;
    110 	CPR->c_substate = 0;
    111 	CPR->c_cprboot_magic = 0;
    112 	CPR->c_alloc_cnt = 0;
    113 
    114 	CPR->c_fcn = fcn;
    115 	if (fcn == AD_CPR_REUSABLE)
    116 		CPR->c_flags |= C_REUSABLE;
    117 	else
    118 		CPR->c_flags |= C_SUSPENDING;
    119 	if (fcn == AD_SUSPEND_TO_RAM || fcn == DEV_SUSPEND_TO_RAM) {
    120 		return (0);
    121 	}
    122 #if defined(__sparc)
    123 	if (fcn != AD_CPR_NOCOMPRESS && fcn != AD_CPR_TESTNOZ)
    124 		CPR->c_flags |= C_COMPRESSING;
    125 	/*
    126 	 * reserve CPR_MAXCONTIG virtual pages for cpr_dump()
    127 	 */
    128 	CPR->c_mapping_area = i_cpr_map_setup();
    129 	if (CPR->c_mapping_area == 0) {		/* no space in kernelmap */
    130 		cpr_err(CE_CONT, "Unable to alloc from kernelmap.\n");
    131 		mutex_exit(&cpr_slock);
    132 		return (EAGAIN);
    133 	}
    134 	if (cpr_debug & CPR_DEBUG3)
    135 		cpr_err(CE_CONT, "Reserved virtual range from 0x%p for writing "
    136 		    "kas\n", (void *)CPR->c_mapping_area);
    137 #endif
    138 
    139 	return (0);
    140 }
    141 
    142 /*
    143  * This routine releases any resources used during the checkpoint.
    144  */
    145 void
    146 cpr_done(void)
    147 {
    148 	cpr_stat_cleanup();
    149 	i_cpr_bitmap_cleanup();
    150 
    151 	/*
    152 	 * Free pages used by cpr buffers.
    153 	 */
    154 	if (cpr_buf) {
    155 		kmem_free(cpr_buf, cpr_buf_size);
    156 		cpr_buf = NULL;
    157 	}
    158 	if (cpr_pagedata) {
    159 		kmem_free(cpr_pagedata, cpr_pagedata_size);
    160 		cpr_pagedata = NULL;
    161 	}
    162 
    163 	i_cpr_free_memory_resources();
    164 	mutex_exit(&cpr_slock);
    165 	cpr_err(CE_CONT, "System has been resumed.\n");
    166 }
    167 
    168 
    169 #if defined(__sparc)
    170 /*
    171  * reads config data into cprconfig
    172  */
    173 static int
    174 cpr_get_config(void)
    175 {
    176 	static char config_path[] = CPR_CONFIG;
    177 	struct cprconfig *cf = &cprconfig;
    178 	struct vnode *vp;
    179 	char *fmt;
    180 	int err;
    181 
    182 	if (cprconfig_loaded)
    183 		return (0);
    184 
    185 	fmt = "cannot %s config file \"%s\", error %d\n";
    186 	if (err = vn_open(config_path, UIO_SYSSPACE, FREAD, 0, &vp, 0, 0)) {
    187 		cpr_err(CE_CONT, fmt, "open", config_path, err);
    188 		return (err);
    189 	}
    190 
    191 	err = cpr_rdwr(UIO_READ, vp, cf, sizeof (*cf));
    192 	(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED(), NULL);
    193 	VN_RELE(vp);
    194 	if (err) {
    195 		cpr_err(CE_CONT, fmt, "read", config_path, err);
    196 		return (err);
    197 	}
    198 
    199 	if (cf->cf_magic == CPR_CONFIG_MAGIC)
    200 		cprconfig_loaded = 1;
    201 	else {
    202 		cpr_err(CE_CONT, "invalid config file \"%s\", "
    203 		    "rerun pmconfig(1M)\n", config_path);
    204 		err = EINVAL;
    205 	}
    206 
    207 	return (err);
    208 }
    209 
    210 
    211 /*
    212  * concat fs and path fields of the cprconfig structure;
    213  * returns pointer to the base of static data
    214  */
    215 static char *
    216 cpr_cprconfig_to_path(void)
    217 {
    218 	static char full_path[MAXNAMELEN];
    219 	struct cprconfig *cf = &cprconfig;
    220 	char *ptr;
    221 
    222 	/*
    223 	 * build /fs/path without extra '/'
    224 	 */
    225 	(void) strcpy(full_path, cf->cf_fs);
    226 	if (strcmp(cf->cf_fs, "/"))
    227 		(void) strcat(full_path, "/");
    228 	ptr = cf->cf_path;
    229 	if (*ptr == '/')
    230 		ptr++;
    231 	(void) strcat(full_path, ptr);
    232 	return (full_path);
    233 }
    234 
    235 
    236 /*
    237  * Verify that the information in the configuration file regarding the
    238  * location for the statefile is still valid, depending on cf_type.
    239  * for CFT_UFS, cf_fs must still be a mounted filesystem, it must be
    240  *	mounted on the same device as when pmconfig was last run,
    241  *	and the translation of that device to a node in the prom's
    242  *	device tree must be the same as when pmconfig was last run.
    243  * for CFT_SPEC, cf_path must be the path to a block special file,
    244  *	it must have no file system mounted on it,
    245  *	and the translation of that device to a node in the prom's
    246  *	device tree must be the same as when pmconfig was last run.
    247  */
    248 static int
    249 cpr_verify_statefile_path(void)
    250 {
    251 	struct cprconfig *cf = &cprconfig;
    252 	static const char long_name[] = "Statefile pathname is too long.\n";
    253 	static const char lookup_fmt[] = "Lookup failed for "
    254 	    "cpr statefile device %s.\n";
    255 	static const char path_chg_fmt[] = "Device path for statefile "
    256 	    "has changed from %s to %s.\t%s\n";
    257 	static const char rerun[] = "Please rerun pmconfig(1m).";
    258 	struct vfs *vfsp = NULL, *vfsp_save = rootvfs;
    259 	ufsvfs_t *ufsvfsp = (ufsvfs_t *)rootvfs->vfs_data;
    260 	ufsvfs_t *ufsvfsp_save = ufsvfsp;
    261 	int error;
    262 	struct vnode *vp;
    263 	char *slash, *tail, *longest;
    264 	char *errstr;
    265 	int found = 0;
    266 	union {
    267 		char un_devpath[OBP_MAXPATHLEN];
    268 		char un_sfpath[MAXNAMELEN];
    269 	} un;
    270 #define	devpath	un.un_devpath
    271 #define	sfpath	un.un_sfpath
    272 
    273 	ASSERT(cprconfig_loaded);
    274 	/*
    275 	 * We need not worry about locking or the timing of releasing
    276 	 * the vnode, since we are single-threaded now.
    277 	 */
    278 
    279 	switch (cf->cf_type) {
    280 	case CFT_SPEC:
    281 		if (strlen(cf->cf_path) > sizeof (sfpath)) {
    282 			cpr_err(CE_CONT, long_name);
    283 			return (ENAMETOOLONG);
    284 		}
    285 		if ((error = lookupname(cf->cf_devfs,
    286 		    UIO_SYSSPACE, FOLLOW, NULLVPP, &vp)) != 0) {
    287 			cpr_err(CE_CONT, lookup_fmt, cf->cf_devfs);
    288 			return (error);
    289 		}
    290 		if (vp->v_type != VBLK)
    291 			errstr = "statefile must be a block device";
    292 		else if (vfs_devismounted(vp->v_rdev))
    293 			errstr = "statefile device must not "
    294 			    "have a file system mounted on it";
    295 		else if (IS_SWAPVP(vp))
    296 			errstr = "statefile device must not "
    297 			    "be configured as swap file";
    298 		else
    299 			errstr = NULL;
    300 
    301 		VN_RELE(vp);
    302 		if (errstr) {
    303 			cpr_err(CE_CONT, "%s.\n", errstr);
    304 			return (ENOTSUP);
    305 		}
    306 
    307 		error = i_devname_to_promname(cf->cf_devfs, devpath,
    308 		    OBP_MAXPATHLEN);
    309 		if (error || strcmp(devpath, cf->cf_dev_prom)) {
    310 			cpr_err(CE_CONT, path_chg_fmt,
    311 			    cf->cf_dev_prom, devpath, rerun);
    312 		}
    313 		return (error);
    314 	case CFT_UFS:
    315 		break;		/* don't indent all the original code */
    316 	default:
    317 		cpr_err(CE_PANIC, "invalid cf_type");
    318 	}
    319 
    320 	/*
    321 	 * The original code for UFS statefile
    322 	 */
    323 	if (strlen(cf->cf_fs) + strlen(cf->cf_path) + 2 > sizeof (sfpath)) {
    324 		cpr_err(CE_CONT, long_name);
    325 		return (ENAMETOOLONG);
    326 	}
    327 
    328 	bzero(sfpath, sizeof (sfpath));
    329 	(void) strcpy(sfpath, cpr_cprconfig_to_path());
    330 
    331 	if (*sfpath != '/') {
    332 		cpr_err(CE_CONT, "Statefile pathname %s "
    333 		    "must begin with a /\n", sfpath);
    334 		return (EINVAL);
    335 	}
    336 
    337 	/*
    338 	 * Find the longest prefix of the statefile pathname which
    339 	 * is the mountpoint of a filesystem.  This string must
    340 	 * match the cf_fs field we read from the config file.  Other-
    341 	 * wise the user has changed things without running pmconfig.
    342 	 */
    343 	tail = longest = sfpath + 1;	/* pt beyond the leading "/" */
    344 	while ((slash = strchr(tail, '/')) != NULL) {
    345 		*slash = '\0';	  /* temporarily terminate the string */
    346 		if ((error = lookupname(sfpath,
    347 		    UIO_SYSSPACE, FOLLOW, NULLVPP, &vp)) != 0) {
    348 			*slash = '/';
    349 			cpr_err(CE_CONT, "A directory in the "
    350 			    "statefile path %s was not found.\n", sfpath);
    351 			VN_RELE(vp);
    352 
    353 			return (error);
    354 		}
    355 
    356 		vfs_list_read_lock();
    357 		vfsp = rootvfs;
    358 		do {
    359 			ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
    360 			if (ufsvfsp != NULL && ufsvfsp->vfs_root == vp) {
    361 				found = 1;
    362 				break;
    363 			}
    364 			vfsp = vfsp->vfs_next;
    365 		} while (vfsp != rootvfs);
    366 		vfs_list_unlock();
    367 
    368 		/*
    369 		 * If we have found a filesystem mounted on the current
    370 		 * path prefix, remember the end of the string in
    371 		 * "longest".  If it happens to be the the exact fs
    372 		 * saved in the configuration file, save the current
    373 		 * ufsvfsp so we can make additional checks further down.
    374 		 */
    375 		if (found) {
    376 			longest = slash;
    377 			if (strcmp(cf->cf_fs, sfpath) == 0) {
    378 				ufsvfsp_save = ufsvfsp;
    379 				vfsp_save = vfsp;
    380 			}
    381 			found = 0;
    382 		}
    383 
    384 		VN_RELE(vp);
    385 		*slash = '/';
    386 		tail = slash + 1;
    387 	}
    388 	*longest = '\0';
    389 	if (cpr_is_ufs(vfsp_save) == 0 || strcmp(cf->cf_fs, sfpath)) {
    390 		cpr_err(CE_CONT, "Filesystem containing "
    391 		    "the statefile when pmconfig was run (%s) has "
    392 		    "changed to %s. %s\n", cf->cf_fs, sfpath, rerun);
    393 		return (EINVAL);
    394 	}
    395 
    396 	if ((error = lookupname(cf->cf_devfs,
    397 	    UIO_SYSSPACE, FOLLOW, NULLVPP, &vp)) != 0) {
    398 		cpr_err(CE_CONT, lookup_fmt, cf->cf_devfs);
    399 		return (error);
    400 	}
    401 
    402 	if (ufsvfsp_save->vfs_devvp->v_rdev != vp->v_rdev) {
    403 		cpr_err(CE_CONT, "Filesystem containing "
    404 		    "statefile no longer mounted on device %s. "
    405 		    "See power.conf(4).", cf->cf_devfs);
    406 		VN_RELE(vp);
    407 		return (ENXIO);
    408 	}
    409 	VN_RELE(vp);
    410 
    411 	error = i_devname_to_promname(cf->cf_devfs, devpath, OBP_MAXPATHLEN);
    412 	if (error || strcmp(devpath, cf->cf_dev_prom)) {
    413 		cpr_err(CE_CONT, path_chg_fmt,
    414 		    cf->cf_dev_prom, devpath, rerun);
    415 		return (error);
    416 	}
    417 
    418 	return (0);
    419 }
    420 
    421 /*
    422  * Make sure that the statefile can be used as a block special statefile
    423  * (meaning that is exists and has nothing mounted on it)
    424  * Returns errno if not a valid statefile.
    425  */
    426 int
    427 cpr_check_spec_statefile(void)
    428 {
    429 	int err;
    430 
    431 	if (err = cpr_get_config())
    432 		return (err);
    433 	ASSERT(cprconfig.cf_type == CFT_SPEC);
    434 
    435 	if (cprconfig.cf_devfs == NULL)
    436 		return (ENXIO);
    437 
    438 	return (cpr_verify_statefile_path());
    439 
    440 }
    441 
    442 int
    443 cpr_alloc_statefile(int alloc_retry)
    444 {
    445 	register int rc = 0;
    446 	char *str;
    447 
    448 	/*
    449 	 * Statefile size validation. If checkpoint the first time, disk blocks
    450 	 * allocation will be done; otherwise, just do file size check.
    451 	 * if statefile allocation is being retried, C_VP will be inited
    452 	 */
    453 	if (alloc_retry) {
    454 		str = "\n-->Retrying statefile allocation...";
    455 		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG7))
    456 			prom_printf(str);
    457 		if (C_VP->v_type != VBLK)
    458 			(void) VOP_DUMPCTL(C_VP, DUMP_FREE, NULL, NULL);
    459 	} else {
    460 		/*
    461 		 * Open an exiting file for writing, the state file needs to be
    462 		 * pre-allocated since we can't and don't want to do allocation
    463 		 * during checkpoint (too much of the OS is disabled).
    464 		 *    - do a preliminary size checking here, if it is too small,
    465 		 *	allocate more space internally and retry.
    466 		 *    - check the vp to make sure it's the right type.
    467 		 */
    468 		char *path = cpr_build_statefile_path();
    469 
    470 		if (path == NULL)
    471 			return (ENXIO);
    472 		else if (rc = cpr_verify_statefile_path())
    473 			return (rc);
    474 
    475 		if (rc = vn_open(path, UIO_SYSSPACE,
    476 		    FCREAT|FWRITE, 0600, &C_VP, CRCREAT, 0)) {
    477 			cpr_err(CE_WARN, "cannot open statefile %s", path);
    478 			return (rc);
    479 		}
    480 	}
    481 
    482 	/*
    483 	 * Only ufs and block special statefiles supported
    484 	 */
    485 	if (C_VP->v_type != VREG && C_VP->v_type != VBLK) {
    486 		cpr_err(CE_CONT,
    487 		    "Statefile must be regular file or block special file.");
    488 		return (EACCES);
    489 	}
    490 
    491 	if (rc = cpr_statefile_ok(C_VP, alloc_retry))
    492 		return (rc);
    493 
    494 	if (C_VP->v_type != VBLK) {
    495 		/*
    496 		 * sync out the fs change due to the statefile reservation.
    497 		 */
    498 		(void) VFS_SYNC(C_VP->v_vfsp, 0, CRED());
    499 
    500 		/*
    501 		 * Validate disk blocks allocation for the state file.
    502 		 * Ask the file system prepare itself for the dump operation.
    503 		 */
    504 		if (rc = VOP_DUMPCTL(C_VP, DUMP_ALLOC, NULL, NULL)) {
    505 			cpr_err(CE_CONT, "Error allocating "
    506 			    "blocks for cpr statefile.");
    507 			return (rc);
    508 		}
    509 	}
    510 	return (0);
    511 }
    512 
    513 
    514 /*
    515  * Lookup device size and return available space in bytes.
    516  * NOTE: Since prop_op(9E) can't tell the difference between a character
    517  * and a block reference, it is ok to ask for "Size" instead of "Nblocks".
    518  */
    519 size_t
    520 cpr_get_devsize(dev_t dev)
    521 {
    522 	size_t bytes = 0;
    523 
    524 	bytes = cdev_Size(dev);
    525 	if (bytes == 0)
    526 		bytes = cdev_size(dev);
    527 
    528 	if (bytes > CPR_SPEC_OFFSET)
    529 		bytes -= CPR_SPEC_OFFSET;
    530 	else
    531 		bytes = 0;
    532 
    533 	return (bytes);
    534 }
    535 
    536 
    537 /*
    538  * increase statefile size
    539  */
    540 static int
    541 cpr_grow_statefile(vnode_t *vp, u_longlong_t newsize)
    542 {
    543 	extern uchar_t cpr_pagecopy[];
    544 	struct inode *ip = VTOI(vp);
    545 	u_longlong_t offset;
    546 	int error, increase;
    547 	ssize_t resid;
    548 
    549 	rw_enter(&ip->i_contents, RW_READER);
    550 	increase = (ip->i_size < newsize);
    551 	offset = ip->i_size;
    552 	rw_exit(&ip->i_contents);
    553 
    554 	if (increase == 0)
    555 		return (0);
    556 
    557 	/*
    558 	 * write to each logical block to reserve disk space
    559 	 */
    560 	error = 0;
    561 	cpr_pagecopy[0] = '1';
    562 	for (; offset < newsize; offset += ip->i_fs->fs_bsize) {
    563 		if (error = vn_rdwr(UIO_WRITE, vp, (caddr_t)cpr_pagecopy,
    564 		    ip->i_fs->fs_bsize, (offset_t)offset, UIO_SYSSPACE, 0,
    565 		    (rlim64_t)MAXOFF_T, CRED(), &resid)) {
    566 			if (error == ENOSPC) {
    567 				cpr_err(CE_WARN, "error %d while reserving "
    568 				    "disk space for statefile %s\n"
    569 				    "wanted %lld bytes, file is %lld short",
    570 				    error, cpr_cprconfig_to_path(),
    571 				    newsize, newsize - offset);
    572 			}
    573 			break;
    574 		}
    575 	}
    576 	return (error);
    577 }
    578 
    579 
    580 /*
    581  * do a simple estimate of the space needed to hold the statefile
    582  * taking compression into account, but be fairly conservative
    583  * so we have a better chance of completing; when dump fails,
    584  * the retry cost is fairly high.
    585  *
    586  * Do disk blocks allocation for the state file if no space has
    587  * been allocated yet. Since the state file will not be removed,
    588  * allocation should only be done once.
    589  */
    590 static int
    591 cpr_statefile_ok(vnode_t *vp, int alloc_retry)
    592 {
    593 	extern size_t cpr_bitmap_size;
    594 	struct inode *ip = VTOI(vp);
    595 	const int UCOMP_RATE = 20; /* comp. ratio*10 for user pages */
    596 	u_longlong_t size, isize, ksize, raw_data;
    597 	char *str, *est_fmt;
    598 	size_t space;
    599 	int error;
    600 
    601 	/*
    602 	 * number of pages short for swapping.
    603 	 */
    604 	STAT->cs_nosw_pages = k_anoninfo.ani_mem_resv;
    605 	if (STAT->cs_nosw_pages < 0)
    606 		STAT->cs_nosw_pages = 0;
    607 
    608 	str = "cpr_statefile_ok:";
    609 
    610 	CPR_DEBUG(CPR_DEBUG9, "Phys swap: max=%lu resv=%lu\n",
    611 	    k_anoninfo.ani_max, k_anoninfo.ani_phys_resv);
    612 	CPR_DEBUG(CPR_DEBUG9, "Mem swap: max=%ld resv=%lu\n",
    613 	    MAX(availrmem - swapfs_minfree, 0),
    614 	    k_anoninfo.ani_mem_resv);
    615 	CPR_DEBUG(CPR_DEBUG9, "Total available swap: %ld\n",
    616 	    CURRENT_TOTAL_AVAILABLE_SWAP);
    617 
    618 	/*
    619 	 * try increasing filesize by 15%
    620 	 */
    621 	if (alloc_retry) {
    622 		/*
    623 		 * block device doesn't get any bigger
    624 		 */
    625 		if (vp->v_type == VBLK) {
    626 			if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6))
    627 				prom_printf(
    628 				    "Retry statefile on special file\n");
    629 			return (ENOMEM);
    630 		} else {
    631 			rw_enter(&ip->i_contents, RW_READER);
    632 			size = (ip->i_size * SIZE_RATE) / INTEGRAL;
    633 			rw_exit(&ip->i_contents);
    634 		}
    635 		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6))
    636 			prom_printf("Retry statefile size = %lld\n", size);
    637 	} else {
    638 		u_longlong_t cpd_size;
    639 		pgcnt_t npages, nback;
    640 		int ndvram;
    641 
    642 		ndvram = 0;
    643 		(void) callb_execute_class(CB_CL_CPR_FB,
    644 		    (int)(uintptr_t)&ndvram);
    645 		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6))
    646 			prom_printf("ndvram size = %d\n", ndvram);
    647 
    648 		/*
    649 		 * estimate 1 cpd_t for every (CPR_MAXCONTIG / 2) pages
    650 		 */
    651 		npages = cpr_count_kpages(REGULAR_BITMAP, cpr_nobit);
    652 		cpd_size = sizeof (cpd_t) * (npages / (CPR_MAXCONTIG / 2));
    653 		raw_data = cpd_size + cpr_bitmap_size;
    654 		ksize = ndvram + mmu_ptob(npages);
    655 
    656 		est_fmt = "%s estimated size with "
    657 		    "%scompression %lld, ksize %lld\n";
    658 		nback = mmu_ptob(STAT->cs_nosw_pages);
    659 		if (CPR->c_flags & C_COMPRESSING) {
    660 			size = ((ksize * COMPRESS_PERCENT) / INTEGRAL) +
    661 			    raw_data + ((nback * 10) / UCOMP_RATE);
    662 			CPR_DEBUG(CPR_DEBUG1, est_fmt, str, "", size, ksize);
    663 		} else {
    664 			size = ksize + raw_data + nback;
    665 			CPR_DEBUG(CPR_DEBUG1, est_fmt, str, "no ",
    666 			    size, ksize);
    667 		}
    668 	}
    669 
    670 	/*
    671 	 * All this is much simpler for a block device
    672 	 */
    673 	if (vp->v_type == VBLK) {
    674 		space = cpr_get_devsize(vp->v_rdev);
    675 		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6))
    676 			prom_printf("statefile dev size %lu\n", space);
    677 
    678 		/*
    679 		 * Export the estimated filesize info, this value will be
    680 		 * compared before dumping out the statefile in the case of
    681 		 * no compression.
    682 		 */
    683 		STAT->cs_est_statefsz = size;
    684 		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6))
    685 			prom_printf("%s Estimated statefile size %llu, "
    686 			    "space %lu\n", str, size, space);
    687 		if (size > space) {
    688 			cpr_err(CE_CONT, "Statefile partition too small.");
    689 			return (ENOMEM);
    690 		}
    691 		return (0);
    692 	} else {
    693 		if (CPR->c_alloc_cnt++ > C_MAX_ALLOC_RETRY) {
    694 			cpr_err(CE_CONT, "Statefile allocation retry failed\n");
    695 			return (ENOMEM);
    696 		}
    697 
    698 		/*
    699 		 * Estimate space needed for the state file.
    700 		 *
    701 		 * State file size in bytes:
    702 		 * 	kernel size + non-cache pte seg +
    703 		 *	bitmap size + cpr state file headers size
    704 		 * (round up to fs->fs_bsize)
    705 		 */
    706 		size = blkroundup(ip->i_fs, size);
    707 
    708 		/*
    709 		 * Export the estimated filesize info, this value will be
    710 		 * compared before dumping out the statefile in the case of
    711 		 * no compression.
    712 		 */
    713 		STAT->cs_est_statefsz = size;
    714 		error = cpr_grow_statefile(vp, size);
    715 		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6)) {
    716 			rw_enter(&ip->i_contents, RW_READER);
    717 			isize = ip->i_size;
    718 			rw_exit(&ip->i_contents);
    719 			prom_printf("%s Estimated statefile size %lld, "
    720 			    "i_size %lld\n", str, size, isize);
    721 		}
    722 
    723 		return (error);
    724 	}
    725 }
    726 
    727 
    728 void
    729 cpr_statef_close(void)
    730 {
    731 	if (C_VP) {
    732 		if (!cpr_reusable_mode)
    733 			(void) VOP_DUMPCTL(C_VP, DUMP_FREE, NULL, NULL);
    734 		(void) VOP_CLOSE(C_VP, FWRITE, 1, (offset_t)0, CRED(), NULL);
    735 		VN_RELE(C_VP);
    736 		C_VP = 0;
    737 	}
    738 }
    739 
    740 
    741 /*
    742  * open cpr default file and display error
    743  */
    744 int
    745 cpr_open_deffile(int mode, vnode_t **vpp)
    746 {
    747 	int error;
    748 
    749 	if (error = cpr_open(cpr_default_path, mode, vpp))
    750 		cpr_err(CE_CONT, "cannot open \"%s\", error %d\n",
    751 		    cpr_default_path, error);
    752 	return (error);
    753 }
    754 
    755 
    756 /*
    757  * write cdef_t to disk.  This contains the original values of prom
    758  * properties that we modify.  We fill in the magic number of the file
    759  * here as a signal to the booter code that the state file is valid.
    760  * Be sure the file gets synced, since we may be shutting down the OS.
    761  */
    762 int
    763 cpr_write_deffile(cdef_t *cdef)
    764 {
    765 	struct vnode *vp;
    766 	char *str;
    767 	int rc;
    768 
    769 	if (rc = cpr_open_deffile(FCREAT|FWRITE, &vp))
    770 		return (rc);
    771 
    772 	if (rc = cpr_rdwr(UIO_WRITE, vp, cdef, sizeof (*cdef)))
    773 		str = "write";
    774 	else if (rc = VOP_FSYNC(vp, FSYNC, CRED(), NULL))
    775 		str = "fsync";
    776 	(void) VOP_CLOSE(vp, FWRITE, 1, (offset_t)0, CRED(), NULL);
    777 	VN_RELE(vp);
    778 
    779 	if (rc) {
    780 		cpr_err(CE_WARN, "%s error %d, file \"%s\"",
    781 		    str, rc, cpr_default_path);
    782 	}
    783 	return (rc);
    784 }
    785 
    786 /*
    787  * Clear the magic number in the defaults file.  This tells the booter
    788  * program that the state file is not current and thus prevents
    789  * any attempt to restore from an obsolete state file.
    790  */
    791 void
    792 cpr_clear_definfo(void)
    793 {
    794 	struct vnode *vp;
    795 	cmini_t mini;
    796 
    797 	if ((CPR->c_cprboot_magic != CPR_DEFAULT_MAGIC) ||
    798 	    cpr_open_deffile(FCREAT|FWRITE, &vp))
    799 		return;
    800 	mini.magic = mini.reusable = 0;
    801 	(void) cpr_rdwr(UIO_WRITE, vp, &mini, sizeof (mini));
    802 	(void) VOP_CLOSE(vp, FWRITE, 1, (offset_t)0, CRED(), NULL);
    803 	VN_RELE(vp);
    804 }
    805 
    806 /*
    807  * If the cpr default file is invalid, then we must not be in reusable mode
    808  * if it is valid, it tells us our mode
    809  */
    810 int
    811 cpr_get_reusable_mode(void)
    812 {
    813 	struct vnode *vp;
    814 	cmini_t mini;
    815 	int rc;
    816 
    817 	if (cpr_open(cpr_default_path, FREAD, &vp))
    818 		return (0);
    819 
    820 	rc = cpr_rdwr(UIO_READ, vp, &mini, sizeof (mini));
    821 	(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED(), NULL);
    822 	VN_RELE(vp);
    823 	if (rc == 0 && mini.magic == CPR_DEFAULT_MAGIC)
    824 		return (mini.reusable);
    825 
    826 	return (0);
    827 }
    828 #endif
    829 
    830 /*
    831  * clock/time related routines
    832  */
    833 static time_t   cpr_time_stamp;
    834 
    835 
    836 void
    837 cpr_tod_get(cpr_time_t *ctp)
    838 {
    839 	timestruc_t ts;
    840 
    841 	mutex_enter(&tod_lock);
    842 	ts = TODOP_GET(tod_ops);
    843 	mutex_exit(&tod_lock);
    844 	ctp->tv_sec = (time32_t)ts.tv_sec;
    845 	ctp->tv_nsec = (int32_t)ts.tv_nsec;
    846 }
    847 
    848 void
    849 cpr_tod_fault_reset(void)
    850 {
    851 	mutex_enter(&tod_lock);
    852 	tod_fault_reset();
    853 	mutex_exit(&tod_lock);
    854 }
    855 
    856 void
    857 cpr_save_time(void)
    858 {
    859 	cpr_time_stamp = gethrestime_sec();
    860 }
    861 
    862 /*
    863  * correct time based on saved time stamp or hardware clock
    864  */
    865 void
    866 cpr_restore_time(void)
    867 {
    868 	clkset(cpr_time_stamp);
    869 }
    870 
    871 #if defined(__sparc)
    872 /*
    873  * CPU ONLINE/OFFLINE CODE
    874  */
    875 int
    876 cpr_mp_offline(void)
    877 {
    878 	cpu_t *cp, *bootcpu;
    879 	int rc = 0;
    880 	int brought_up_boot = 0;
    881 
    882 	/*
    883 	 * Do nothing for UP.
    884 	 */
    885 	if (ncpus == 1)
    886 		return (0);
    887 
    888 	mutex_enter(&cpu_lock);
    889 
    890 	cpr_save_mp_state();
    891 
    892 	bootcpu = i_cpr_bootcpu();
    893 	if (!CPU_ACTIVE(bootcpu)) {
    894 		if ((rc = cpr_p_online(bootcpu, CPU_CPR_ONLINE))) {
    895 			mutex_exit(&cpu_lock);
    896 			return (rc);
    897 		}
    898 		brought_up_boot = 1;
    899 	}
    900 
    901 	cp = cpu_list;
    902 	do {
    903 		if (cp == bootcpu)
    904 			continue;
    905 		if (cp->cpu_flags & CPU_OFFLINE)
    906 			continue;
    907 		if ((rc = cpr_p_online(cp, CPU_CPR_OFFLINE))) {
    908 			mutex_exit(&cpu_lock);
    909 			return (rc);
    910 		}
    911 	} while ((cp = cp->cpu_next) != cpu_list);
    912 	if (brought_up_boot && (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6)))
    913 		prom_printf("changed cpu %p to state %d\n",
    914 		    bootcpu, CPU_CPR_ONLINE);
    915 	mutex_exit(&cpu_lock);
    916 
    917 	return (rc);
    918 }
    919 
    920 int
    921 cpr_mp_online(void)
    922 {
    923 	cpu_t *cp, *bootcpu = CPU;
    924 	int rc = 0;
    925 
    926 	/*
    927 	 * Do nothing for UP.
    928 	 */
    929 	if (ncpus == 1)
    930 		return (0);
    931 
    932 	/*
    933 	 * cpr_save_mp_state() sets CPU_CPR_ONLINE in cpu_cpr_flags
    934 	 * to indicate a cpu was online at the time of cpr_suspend();
    935 	 * now restart those cpus that were marked as CPU_CPR_ONLINE
    936 	 * and actually are offline.
    937 	 */
    938 	mutex_enter(&cpu_lock);
    939 	for (cp = bootcpu->cpu_next; cp != bootcpu; cp = cp->cpu_next) {
    940 		/*
    941 		 * Clear the CPU_FROZEN flag in all cases.
    942 		 */
    943 		cp->cpu_flags &= ~CPU_FROZEN;
    944 
    945 		if (CPU_CPR_IS_OFFLINE(cp))
    946 			continue;
    947 		if (CPU_ACTIVE(cp))
    948 			continue;
    949 		if ((rc = cpr_p_online(cp, CPU_CPR_ONLINE))) {
    950 			mutex_exit(&cpu_lock);
    951 			return (rc);
    952 		}
    953 	}
    954 
    955 	/*
    956 	 * turn off the boot cpu if it was offlined
    957 	 */
    958 	if (CPU_CPR_IS_OFFLINE(bootcpu)) {
    959 		if ((rc = cpr_p_online(bootcpu, CPU_CPR_OFFLINE))) {
    960 			mutex_exit(&cpu_lock);
    961 			return (rc);
    962 		}
    963 	}
    964 	mutex_exit(&cpu_lock);
    965 	return (0);
    966 }
    967 
    968 static void
    969 cpr_save_mp_state(void)
    970 {
    971 	cpu_t *cp;
    972 
    973 	ASSERT(MUTEX_HELD(&cpu_lock));
    974 
    975 	cp = cpu_list;
    976 	do {
    977 		cp->cpu_cpr_flags &= ~CPU_CPR_ONLINE;
    978 		if (CPU_ACTIVE(cp))
    979 			CPU_SET_CPR_FLAGS(cp, CPU_CPR_ONLINE);
    980 	} while ((cp = cp->cpu_next) != cpu_list);
    981 }
    982 
    983 /*
    984  * change cpu to online/offline
    985  */
    986 static int
    987 cpr_p_online(cpu_t *cp, int state)
    988 {
    989 	int rc;
    990 
    991 	ASSERT(MUTEX_HELD(&cpu_lock));
    992 
    993 	switch (state) {
    994 	case CPU_CPR_ONLINE:
    995 		rc = cpu_online(cp);
    996 		break;
    997 	case CPU_CPR_OFFLINE:
    998 		rc = cpu_offline(cp, CPU_FORCED);
    999 		break;
   1000 	}
   1001 	if (rc) {
   1002 		cpr_err(CE_WARN, "Failed to change processor %d to "
   1003 		    "state %d, (errno %d)", cp->cpu_id, state, rc);
   1004 	}
   1005 	return (rc);
   1006 }
   1007 
   1008 /*
   1009  * Construct the pathname of the state file and return a pointer to
   1010  * caller.  Read the config file to get the mount point of the
   1011  * filesystem and the pathname within fs.
   1012  */
   1013 char *
   1014 cpr_build_statefile_path(void)
   1015 {
   1016 	struct cprconfig *cf = &cprconfig;
   1017 
   1018 	if (cpr_get_config())
   1019 		return (NULL);
   1020 
   1021 	switch (cf->cf_type) {
   1022 	case CFT_UFS:
   1023 		if (strlen(cf->cf_path) + strlen(cf->cf_fs) >= MAXNAMELEN - 1) {
   1024 			cpr_err(CE_CONT, "Statefile path is too long.\n");
   1025 			return (NULL);
   1026 		}
   1027 		return (cpr_cprconfig_to_path());
   1028 	case CFT_SPEC:
   1029 		return (cf->cf_devfs);
   1030 	default:
   1031 		cpr_err(CE_PANIC, "invalid statefile type");
   1032 		/*NOTREACHED*/
   1033 		return (NULL);
   1034 	}
   1035 }
   1036 
   1037 int
   1038 cpr_statefile_is_spec(void)
   1039 {
   1040 	if (cpr_get_config())
   1041 		return (0);
   1042 	return (cprconfig.cf_type == CFT_SPEC);
   1043 }
   1044 
   1045 char *
   1046 cpr_get_statefile_prom_path(void)
   1047 {
   1048 	struct cprconfig *cf = &cprconfig;
   1049 
   1050 	ASSERT(cprconfig_loaded);
   1051 	ASSERT(cf->cf_magic == CPR_CONFIG_MAGIC);
   1052 	ASSERT(cf->cf_type == CFT_SPEC);
   1053 	return (cf->cf_dev_prom);
   1054 }
   1055 
   1056 
   1057 /*
   1058  * XXX The following routines need to be in the vfs source code.
   1059  */
   1060 
   1061 int
   1062 cpr_is_ufs(struct vfs *vfsp)
   1063 {
   1064 	char *fsname;
   1065 
   1066 	fsname = vfssw[vfsp->vfs_fstype].vsw_name;
   1067 	return (strcmp(fsname, "ufs") == 0);
   1068 }
   1069 
   1070 /*
   1071  * This is a list of file systems that are allowed to be writeable when a
   1072  * reusable statefile checkpoint is taken.  They must not have any state that
   1073  * cannot be restored to consistency by simply rebooting using the checkpoint.
   1074  * (In contrast to ufs, cachefs and pcfs which have disk state that could get
   1075  * out of sync with the in-kernel data).
   1076  */
   1077 int
   1078 cpr_reusable_mount_check(void)
   1079 {
   1080 	struct vfs *vfsp;
   1081 	char *fsname;
   1082 	char **cpp;
   1083 	static char *cpr_writeok_fss[] = {
   1084 		"autofs", "devfs", "fd", "lofs", "mntfs", "namefs", "nfs",
   1085 		"proc", "tmpfs", "ctfs", "objfs", "dev", NULL
   1086 	};
   1087 
   1088 	vfs_list_read_lock();
   1089 	vfsp = rootvfs;
   1090 	do {
   1091 		if (vfsp->vfs_flag & VFS_RDONLY) {
   1092 			vfsp = vfsp->vfs_next;
   1093 			continue;
   1094 		}
   1095 		fsname = vfssw[vfsp->vfs_fstype].vsw_name;
   1096 		for (cpp = cpr_writeok_fss; *cpp; cpp++) {
   1097 			if (strcmp(fsname, *cpp) == 0)
   1098 				break;
   1099 		}
   1100 		/*
   1101 		 * if the inner loop reached the NULL terminator,
   1102 		 * the current fs-type does not match any OK-type
   1103 		 */
   1104 		if (*cpp == NULL) {
   1105 			cpr_err(CE_CONT, "a filesystem of type %s is "
   1106 			    "mounted read/write.\nReusable statefile requires "
   1107 			    "no writeable filesystem of this type be mounted\n",
   1108 			    fsname);
   1109 			vfs_list_unlock();
   1110 			return (EINVAL);
   1111 		}
   1112 		vfsp = vfsp->vfs_next;
   1113 	} while (vfsp != rootvfs);
   1114 	vfs_list_unlock();
   1115 	return (0);
   1116 }
   1117 
   1118 /*
   1119  * return statefile offset in DEV_BSIZE units
   1120  */
   1121 int
   1122 cpr_statefile_offset(void)
   1123 {
   1124 	return (cpr_statefile_is_spec() ? btod(CPR_SPEC_OFFSET) : 0);
   1125 }
   1126 
   1127 /*
   1128  * Force a fresh read of the cprinfo per uadmin 3 call
   1129  */
   1130 void
   1131 cpr_forget_cprconfig(void)
   1132 {
   1133 	cprconfig_loaded = 0;
   1134 }
   1135 #endif
   1136