Home | History | Annotate | Download | only in os
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #pragma ident	"@(#)devcache.c	1.14	07/10/25 SMI"
     27 
     28 #include <sys/note.h>
     29 #include <sys/t_lock.h>
     30 #include <sys/cmn_err.h>
     31 #include <sys/instance.h>
     32 #include <sys/conf.h>
     33 #include <sys/stat.h>
     34 #include <sys/ddi.h>
     35 #include <sys/hwconf.h>
     36 #include <sys/sunddi.h>
     37 #include <sys/sunndi.h>
     38 #include <sys/ddi_impldefs.h>
     39 #include <sys/ndi_impldefs.h>
     40 #include <sys/modctl.h>
     41 #include <sys/dacf.h>
     42 #include <sys/promif.h>
     43 #include <sys/cpuvar.h>
     44 #include <sys/pathname.h>
     45 #include <sys/kobj.h>
     46 #include <sys/devcache.h>
     47 #include <sys/devcache_impl.h>
     48 #include <sys/sysmacros.h>
     49 #include <sys/varargs.h>
     50 #include <sys/callb.h>
     51 
     52 /*
     53  * This facility provides interfaces to clients to register,
     54  * read and update cache data in persisted backing store files,
     55  * usually in /etc/devices.  The data persisted through this
     56  * mechanism should be stateless data, functioning in the sense
     57  * of a cache.  Writes are performed by a background daemon
     58  * thread, permitting a client to schedule an update without
     59  * blocking, then continue updating the data state in
     60  * parallel.  The data is only locked by the daemon thread
     61  * to pack the data in preparation for the write.
     62  *
     63  * Data persisted through this mechanism should be capable
     64  * of being regenerated through normal system operation,
     65  * for example attaching all disk devices would cause all
     66  * devids to be registered for those devices.  By caching
     67  * a devid-device tuple, the system can operate in a
     68  * more optimal way, directly attaching the device mapped
     69  * to a devid, rather than burdensomely driving attach of
     70  * the entire device tree to discover a single device.
     71  *
     72  * Note that a client should only need to include
     73  * <sys/devcache.h> for the supported interfaces.
     74  *
     75  * The data per client is entirely within the control of
     76  * the client.  When reading, data unpacked from the backing
     77  * store should be inserted in the list.  The pointer to
     78  * the list can be retrieved via nvf_list().  When writing,
     79  * the data on the list is to be packed and returned to the
     80  * nvpdaemon as an nvlist.
     81  *
     82  * Obvious restrictions are imposed by the limits of the
     83  * nvlist format.  The data cannot be read or written
     84  * piecemeal, and large amounts of data aren't recommended.
     85  * However, nvlists do allow that data be named and typed
     86  * and can be size-of-int invariant, and the cached data
     87  * can be versioned conveniently.
     88  *
     89  * The registration involves two steps: a handle is
     90  * allocated by calling the registration function.
     91  * This sets up the data referenced by the handle and
     92  * initializes the lock.  Following registration, the
     93  * client must initialize the data list.  The list
     94  * interfaces require that the list element with offset
     95  * to the node link be provided.  The format of the
     96  * list element is under the control of the client.
     97  *
     98  * Locking: the address of the data list r/w lock provided
     99  * can be accessed with nvf_lock().  The lock must be held
    100  * as reader when traversing the list or checking state,
    101  * such as nvf_is_dirty().  The lock must be held as
    102  * writer when updating the list or marking it dirty.
    103  * The lock must not be held when waking the daemon.
    104  *
    105  * The data r/w lock is held as writer when the pack,
    106  * unpack and free list handlers are called.  The
    107  * lock should not be dropped and must be still held
    108  * upon return.  The client should also hold the lock
    109  * as reader when checking if the list is dirty, and
    110  * as writer when marking the list dirty or initiating
    111  * a read.
    112  *
    113  * The asynchronous nature of updates allows for the
    114  * possibility that the data may continue to be updated
    115  * once the daemon has been notified that an update is
    116  * desired.  The data only needs to be locked against
    117  * updates when packing the data into the form to be
    118  * written.  When the write of the packed data has
    119  * completed, the daemon will automatically reschedule
    120  * an update if the data was marked dirty after the
    121  * point at which it was packed.  Before beginning an
    122  * update, the daemon attempts to lock the data as
    123  * writer; if the writer lock is already held, it
    124  * backs off and retries later.  The model is to give
    125  * priority to the kernel processes generating the
    126  * data, and that the nature of the data is that
    127  * it does not change often, can be re-generated when
    128  * needed, so updates should not happen often and
    129  * can be delayed until the data stops changing.
    130  * The client may update the list or mark it dirty
    131  * any time it is able to acquire the lock as
    132  * writer first.
    133  *
    134  * A failed write will be retried after some delay,
    135  * in the hope that the cause of the error will be
    136  * transient, for example a filesystem with no space
    137  * available.  An update on a read-only filesystem
    138  * is failed silently and not retried; this would be
    139  * the case when booted off install media.
    140  *
    141  * There is no unregister mechanism as of yet, as it
    142  * hasn't been needed so far.
    143  */
    144 
    145 /*
    146  * Global list of files registered and updated by the nvpflush
    147  * daemon, protected by the nvf_cache_mutex.  While an
    148  * update is taking place, a file is temporarily moved to
    149  * the dirty list to avoid locking the primary list for
    150  * the duration of the update.
    151  */
/* Registered files; nvf_register_file() appends each new nvfd_t here. */
list_t		nvf_cache_files;
/* Second nvfd_t list, created alongside nvf_cache_files at init time. */
list_t		nvf_dirty_files;
/* Held while nvf_cache_files is inserted into or traversed. */
kmutex_t	nvf_cache_mutex;
    155 
    156 
    157 /*
    158  * Allow some delay from an update of the data before flushing
    159  * to permit simultaneous updates of multiple changes.
    160  * Changes in the data are expected to be bursty, ie
    161  * reconfig or hot-plug of a new adapter.
    162  *
    163  * kfio_report_error (default 0)
    164  *	Set to 1 to enable some error messages related to low-level
    165  *	kernel file i/o operations.
    166  *
    167  * nvpflush_delay (default 10)
    168  *	The number of seconds after data is marked dirty before the
    169  *	flush daemon is triggered to flush the data.  A longer period
    170  *	of time permits more data updates per write.  Note that
    171  *	every update resets the timer so no repository write will
    172  *	occur while data is being updated continuously.
    173  *
    174  * nvpdaemon_idle_time (default 60)
    175  *	The number of seconds the daemon will sleep idle before exiting.
    176  *
    177  */
/* Default, in seconds, for the nvpflush_delay tunable. */
#define	NVPFLUSH_DELAY		10
/* Default, in seconds, for the nvpdaemon_idle_time tunable. */
#define	NVPDAEMON_IDLE_TIME	60

/* Number of clock ticks in one second (1000000 microseconds). */
#define	TICKS_PER_SECOND	(drv_usectohz(1000000))
    182 
    183 /*
    184  * Tunables
    185  */
    186 int kfio_report_error = 0;		/* kernel file i/o operations */
    187 int kfio_disable_read = 0;		/* disable all reads */
    188 int kfio_disable_write = 0;		/* disable all writes */
    189 
    190 int nvpflush_delay	= NVPFLUSH_DELAY;
    191 int nvpdaemon_idle_time	= NVPDAEMON_IDLE_TIME;
    192 
    193 static timeout_id_t	nvpflush_id = 0;
    194 static int		nvpflush_timer_busy = 0;
    195 static int		nvpflush_daemon_active = 0;
    196 static kthread_t	*nvpflush_thr_id = 0;
    197 
    198 static int		do_nvpflush = 0;
    199 static int		nvpbusy = 0;
    200 static kmutex_t		nvpflush_lock;
    201 static kcondvar_t	nvpflush_cv;
    202 static kthread_id_t	nvpflush_thread;
    203 static clock_t		nvpticks;
    204 
    205 static void nvpflush_daemon(void);
    206 
    207 #ifdef	DEBUG
    208 int nvpdaemon_debug = 0;
    209 int kfio_debug = 0;
    210 #endif	/* DEBUG */
    211 
    212 extern int modrootloaded;
    213 extern void mdi_read_devices_files(void);
    214 extern void mdi_clean_vhcache(void);
    215 
    216 /*
    217  * Initialize the overall cache file management
    218  */
    219 void
    220 i_ddi_devices_init(void)
    221 {
    222 	list_create(&nvf_cache_files, sizeof (nvfd_t),
    223 	    offsetof(nvfd_t, nvf_link));
    224 	list_create(&nvf_dirty_files, sizeof (nvfd_t),
    225 	    offsetof(nvfd_t, nvf_link));
    226 	mutex_init(&nvf_cache_mutex, NULL, MUTEX_DEFAULT, NULL);
    227 	retire_store_init();
    228 	devid_cache_init();
    229 }
    230 
    231 /*
    232  * Read cache files
    233  * The files read here should be restricted to those
    234  * that may be required to mount root.
    235  */
    236 void
    237 i_ddi_read_devices_files(void)
    238 {
    239 	/*
    240 	 * The retire store should be the first file read as it
    241 	 * may need to offline devices. kfio_disable_read is not
    242 	 * used for retire. For the rationale see the tunable
    243 	 * ddi_retire_store_bypass and comments in:
    244 	 *	uts/common/os/retire_store.c
    245 	 */
    246 
    247 	retire_store_read();
    248 
    249 	if (!kfio_disable_read) {
    250 		mdi_read_devices_files();
    251 		devid_cache_read();
    252 	}
    253 }
    254 
    255 void
    256 i_ddi_start_flush_daemon(void)
    257 {
    258 	nvfd_t	*nvfdp;
    259 
    260 	ASSERT(i_ddi_io_initialized());
    261 
    262 	mutex_init(&nvpflush_lock, NULL, MUTEX_DRIVER, NULL);
    263 	cv_init(&nvpflush_cv, NULL, CV_DRIVER, NULL);
    264 
    265 	mutex_enter(&nvf_cache_mutex);
    266 	for (nvfdp = list_head(&nvf_cache_files); nvfdp;
    267 	    nvfdp = list_next(&nvf_cache_files, nvfdp)) {
    268 		if (NVF_IS_DIRTY(nvfdp)) {
    269 			nvf_wake_daemon();
    270 			break;
    271 		}
    272 	}
    273 	mutex_exit(&nvf_cache_mutex);
    274 }
    275 
    276 void
    277 i_ddi_clean_devices_files(void)
    278 {
    279 	devid_cache_cleanup();
    280 	mdi_clean_vhcache();
    281 }
    282 
    283 /*
    284  * Register a cache file to be managed and updated by the nvpflush daemon.
    285  * All operations are performed through the returned handle.
    286  * There is no unregister mechanism for now.
    287  */
    288 nvf_handle_t
    289 nvf_register_file(nvf_ops_t *ops)
    290 {
    291 	nvfd_t *nvfdp;
    292 
    293 	nvfdp = kmem_zalloc(sizeof (*nvfdp), KM_SLEEP);
    294 
    295 	nvfdp->nvf_ops = ops;
    296 	nvfdp->nvf_flags = 0;
    297 	rw_init(&nvfdp->nvf_lock, NULL, RW_DRIVER, NULL);
    298 
    299 	mutex_enter(&nvf_cache_mutex);
    300 	list_insert_tail(&nvf_cache_files, nvfdp);
    301 	mutex_exit(&nvf_cache_mutex);
    302 
    303 	return ((nvf_handle_t)nvfdp);
    304 }
    305 
    306 /*PRINTFLIKE1*/
    307 void
    308 nvf_error(const char *fmt, ...)
    309 {
    310 	va_list ap;
    311 
    312 	if (kfio_report_error) {
    313 		va_start(ap, fmt);
    314 		vcmn_err(CE_NOTE, fmt, ap);
    315 		va_end(ap);
    316 	}
    317 }
    318 
    319 /*
    320  * Some operations clients may use to manage the data
    321  * to be persisted in a cache file.
    322  */
    323 char *
    324 nvf_cache_name(nvf_handle_t handle)
    325 {
    326 	return (((nvfd_t *)handle)->nvf_cache_path);
    327 }
    328 
    329 krwlock_t *
    330 nvf_lock(nvf_handle_t handle)
    331 {
    332 	return (&(((nvfd_t *)handle)->nvf_lock));
    333 }
    334 
    335 list_t *
    336 nvf_list(nvf_handle_t handle)
    337 {
    338 	return (&(((nvfd_t *)handle)->nvf_data_list));
    339 }
    340 
    341 void
    342 nvf_mark_dirty(nvf_handle_t handle)
    343 {
    344 	ASSERT(RW_WRITE_HELD(&(((nvfd_t *)handle)->nvf_lock)));
    345 	NVF_MARK_DIRTY((nvfd_t *)handle);
    346 }
    347 
    348 int
    349 nvf_is_dirty(nvf_handle_t handle)
    350 {
    351 	ASSERT(RW_LOCK_HELD(&(((nvfd_t *)handle)->nvf_lock)));
    352 	return (NVF_IS_DIRTY((nvfd_t *)handle));
    353 }
    354 
    355 static uint16_t
    356 nvp_cksum(uchar_t *buf, int64_t buflen)
    357 {
    358 	uint16_t cksum = 0;
    359 	uint16_t *p = (uint16_t *)buf;
    360 	int64_t n;
    361 
    362 	if ((buflen & 0x01) != 0) {
    363 		buflen--;
    364 		cksum = buf[buflen];
    365 	}
    366 	n = buflen / 2;
    367 	while (n-- > 0)
    368 		cksum ^= *p++;
    369 	return (cksum);
    370 }
    371 
    372 int
    373 fread_nvlist(char *filename, nvlist_t **ret_nvlist)
    374 {
    375 	struct _buf	*file;
    376 	nvpf_hdr_t	hdr;
    377 	char		*buf;
    378 	nvlist_t	*nvl;
    379 	int		rval;
    380 	uint_t		offset;
    381 	int		n;
    382 	char		c;
    383 	uint16_t	cksum, hdrsum;
    384 
    385 	*ret_nvlist = NULL;
    386 
    387 	file = kobj_open_file(filename);
    388 	if (file == (struct _buf *)-1) {
    389 		KFDEBUG((CE_CONT, "cannot open file: %s\n", filename));
    390 		return (ENOENT);
    391 	}
    392 
    393 	offset = 0;
    394 	n = kobj_read_file(file, (char *)&hdr, sizeof (hdr), offset);
    395 	if (n != sizeof (hdr)) {
    396 		kobj_close_file(file);
    397 		if (n < 0) {
    398 			nvf_error("error reading header: %s\n", filename);
    399 			return (EIO);
    400 		} else if (n == 0) {
    401 			KFDEBUG((CE_CONT, "file empty: %s\n", filename));
    402 		} else {
    403 			nvf_error("header size incorrect: %s\n", filename);
    404 		}
    405 		return (EINVAL);
    406 	}
    407 	offset += n;
    408 
    409 	KFDEBUG2((CE_CONT, "nvpf_magic: 0x%x\n", hdr.nvpf_magic));
    410 	KFDEBUG2((CE_CONT, "nvpf_version: %d\n", hdr.nvpf_version));
    411 	KFDEBUG2((CE_CONT, "nvpf_size: %lld\n",
    412 		(longlong_t)hdr.nvpf_size));
    413 	KFDEBUG2((CE_CONT, "nvpf_hdr_chksum: 0x%x\n",
    414 		hdr.nvpf_hdr_chksum));
    415 	KFDEBUG2((CE_CONT, "nvpf_chksum: 0x%x\n", hdr.nvpf_chksum));
    416 
    417 	cksum = hdr.nvpf_hdr_chksum;
    418 	hdr.nvpf_hdr_chksum = 0;
    419 	hdrsum = nvp_cksum((uchar_t *)&hdr, sizeof (hdr));
    420 
    421 	if (hdr.nvpf_magic != NVPF_HDR_MAGIC ||
    422 	    hdr.nvpf_version != NVPF_HDR_VERSION || hdrsum != cksum) {
    423 		kobj_close_file(file);
    424 		if (hdrsum != cksum) {
    425 			nvf_error("%s: checksum error "
    426 			    "(actual 0x%x, expected 0x%x)\n",
    427 			    filename, hdrsum, cksum);
    428 		}
    429 		nvf_error("%s: header information incorrect", filename);
    430 		return (EINVAL);
    431 	}
    432 
    433 	ASSERT(hdr.nvpf_size >= 0);
    434 
    435 	buf = kmem_alloc(hdr.nvpf_size, KM_SLEEP);
    436 	n = kobj_read_file(file, buf, hdr.nvpf_size, offset);
    437 	if (n != hdr.nvpf_size) {
    438 		kmem_free(buf, hdr.nvpf_size);
    439 		kobj_close_file(file);
    440 		if (n < 0) {
    441 			nvf_error("%s: read error %d", filename, n);
    442 		} else {
    443 			nvf_error("%s: incomplete read %d/%lld",
    444 				filename, n, (longlong_t)hdr.nvpf_size);
    445 		}
    446 		return (EINVAL);
    447 	}
    448 	offset += n;
    449 
    450 	rval = kobj_read_file(file, &c, 1, offset);
    451 	kobj_close_file(file);
    452 	if (rval > 0) {
    453 		nvf_error("%s is larger than %lld\n",
    454 			filename, (longlong_t)hdr.nvpf_size);
    455 		kmem_free(buf, hdr.nvpf_size);
    456 		return (EINVAL);
    457 	}
    458 
    459 	cksum = nvp_cksum((uchar_t *)buf, hdr.nvpf_size);
    460 	if (hdr.nvpf_chksum != cksum) {
    461 		nvf_error("%s: checksum error (actual 0x%x, expected 0x%x)\n",
    462 		    filename, hdr.nvpf_chksum, cksum);
    463 		kmem_free(buf, hdr.nvpf_size);
    464 		return (EINVAL);
    465 	}
    466 
    467 	nvl = NULL;
    468 	rval = nvlist_unpack(buf, hdr.nvpf_size, &nvl, 0);
    469 	if (rval != 0) {
    470 		nvf_error("%s: error %d unpacking nvlist\n",
    471 			filename, rval);
    472 		kmem_free(buf, hdr.nvpf_size);
    473 		return (EINVAL);
    474 	}
    475 
    476 	kmem_free(buf, hdr.nvpf_size);
    477 	*ret_nvlist = nvl;
    478 	return (0);
    479 }
    480 
    481 static int
    482 kfcreate(char *filename, kfile_t **kfilep)
    483 {
    484 	kfile_t	*fp;
    485 	int	rval;
    486 
    487 	ASSERT(modrootloaded);
    488 
    489 	fp = kmem_alloc(sizeof (kfile_t), KM_SLEEP);
    490 
    491 	fp->kf_vnflags = FCREAT | FWRITE | FTRUNC;
    492 	fp->kf_fname = filename;
    493 	fp->kf_fpos = 0;
    494 	fp->kf_state = 0;
    495 
    496 	KFDEBUG((CE_CONT, "create: %s flags 0x%x\n",
    497 		filename, fp->kf_vnflags));
    498 	rval = vn_open(filename, UIO_SYSSPACE, fp->kf_vnflags,
    499 	    0444, &fp->kf_vp, CRCREAT, 0);
    500 	if (rval != 0) {
    501 		kmem_free(fp, sizeof (kfile_t));
    502 		KFDEBUG((CE_CONT, "%s: create error %d\n",
    503 			filename, rval));
    504 		return (rval);
    505 	}
    506 
    507 	*kfilep = fp;
    508 	return (0);
    509 }
    510 
    511 static int
    512 kfremove(char *filename)
    513 {
    514 	int rval;
    515 
    516 	KFDEBUG((CE_CONT, "remove: %s\n", filename));
    517 	rval = vn_remove(filename, UIO_SYSSPACE, RMFILE);
    518 	if (rval != 0) {
    519 		KFDEBUG((CE_CONT, "%s: remove error %d\n",
    520 			filename, rval));
    521 	}
    522 	return (rval);
    523 }
    524 
    525 static int
    526 kfread(kfile_t *fp, char *buf, ssize_t bufsiz, ssize_t *ret_n)
    527 {
    528 	ssize_t		resid;
    529 	int		err;
    530 	ssize_t		n;
    531 
    532 	ASSERT(modrootloaded);
    533 
    534 	if (fp->kf_state != 0)
    535 		return (fp->kf_state);
    536 
    537 	err = vn_rdwr(UIO_READ, fp->kf_vp, buf, bufsiz, fp->kf_fpos,
    538 		UIO_SYSSPACE, 0, (rlim64_t)0, kcred, &resid);
    539 	if (err != 0) {
    540 		KFDEBUG((CE_CONT, "%s: read error %d\n",
    541 			fp->kf_fname, err));
    542 		fp->kf_state = err;
    543 		return (err);
    544 	}
    545 
    546 	ASSERT(resid >= 0 && resid <= bufsiz);
    547 	n = bufsiz - resid;
    548 
    549 	KFDEBUG1((CE_CONT, "%s: read %ld bytes ok %ld bufsiz, %ld resid\n",
    550 		fp->kf_fname, n, bufsiz, resid));
    551 
    552 	fp->kf_fpos += n;
    553 	*ret_n = n;
    554 	return (0);
    555 }
    556 
    557 static int
    558 kfwrite(kfile_t *fp, char *buf, ssize_t bufsiz, ssize_t *ret_n)
    559 {
    560 	rlim64_t	rlimit;
    561 	ssize_t		resid;
    562 	int		err;
    563 	ssize_t		len;
    564 	ssize_t		n = 0;
    565 
    566 	ASSERT(modrootloaded);
    567 
    568 	if (fp->kf_state != 0)
    569 		return (fp->kf_state);
    570 
    571 	len = bufsiz;
    572 	rlimit = bufsiz + 1;
    573 	for (;;) {
    574 		err = vn_rdwr(UIO_WRITE, fp->kf_vp, buf, len, fp->kf_fpos,
    575 			UIO_SYSSPACE, FSYNC, rlimit, kcred, &resid);
    576 		if (err) {
    577 			KFDEBUG((CE_CONT, "%s: write error %d\n",
    578 				fp->kf_fname, err));
    579 			fp->kf_state = err;
    580 			return (err);
    581 		}
    582 
    583 		KFDEBUG1((CE_CONT, "%s: write %ld bytes ok %ld resid\n",
    584 			fp->kf_fname, len-resid, resid));
    585 
    586 		ASSERT(resid >= 0 && resid <= len);
    587 
    588 		n += (len - resid);
    589 		if (resid == 0)
    590 			break;
    591 
    592 		if (resid == len) {
    593 			KFDEBUG((CE_CONT, "%s: filesystem full?\n",
    594 				fp->kf_fname));
    595 			fp->kf_state = ENOSPC;
    596 			return (ENOSPC);
    597 		}
    598 
    599 		len -= resid;
    600 		buf += len;
    601 		fp->kf_fpos += len;
    602 		len = resid;
    603 	}
    604 
    605 	ASSERT(n == bufsiz);
    606 	KFDEBUG1((CE_CONT, "%s: wrote %ld bytes ok\n", fp->kf_fname, n));
    607 
    608 	*ret_n = n;
    609 	return (0);
    610 }
    611 
    612 
    613 static int
    614 kfclose(kfile_t *fp)
    615 {
    616 	int		rval;
    617 
    618 	KFDEBUG((CE_CONT, "close: %s\n", fp->kf_fname));
    619 
    620 	if ((fp->kf_vnflags & FWRITE) && fp->kf_state == 0) {
    621 		rval = VOP_FSYNC(fp->kf_vp, FSYNC, kcred, NULL);
    622 		if (rval != 0) {
    623 			nvf_error("%s: sync error %d\n",
    624 				fp->kf_fname, rval);
    625 		}
    626 		KFDEBUG((CE_CONT, "%s: sync ok\n", fp->kf_fname));
    627 	}
    628 
    629 	rval = VOP_CLOSE(fp->kf_vp, fp->kf_vnflags, 1, (offset_t)0, kcred,
    630 		NULL);
    631 	if (rval != 0) {
    632 		if (fp->kf_state == 0) {
    633 			nvf_error("%s: close error %d\n",
    634 				fp->kf_fname, rval);
    635 		}
    636 	} else {
    637 		if (fp->kf_state == 0)
    638 			KFDEBUG((CE_CONT, "%s: close ok\n", fp->kf_fname));
    639 	}
    640 
    641 	VN_RELE(fp->kf_vp);
    642 	kmem_free(fp, sizeof (kfile_t));
    643 	return (rval);
    644 }
    645 
    646 static int
    647 kfrename(char *oldname, char *newname)
    648 {
    649 	int rval;
    650 
    651 	ASSERT(modrootloaded);
    652 
    653 	KFDEBUG((CE_CONT, "renaming %s to %s\n", oldname, newname));
    654 
    655 	if ((rval = vn_rename(oldname, newname, UIO_SYSSPACE)) != 0) {
    656 		KFDEBUG((CE_CONT, "rename %s to %s: %d\n",
    657 			oldname, newname, rval));
    658 	}
    659 
    660 	return (rval);
    661 }
    662 
    663 int
    664 fwrite_nvlist(char *filename, nvlist_t *nvl)
    665 {
    666 	char	*buf;
    667 	char	*nvbuf;
    668 	kfile_t	*fp;
    669 	char	*newname;
    670 	int	len, err, err1;
    671 	size_t	buflen;
    672 	ssize_t	n;
    673 
    674 	ASSERT(modrootloaded);
    675 
    676 	nvbuf = NULL;
    677 	err = nvlist_pack(nvl, &nvbuf, &buflen, NV_ENCODE_NATIVE, 0);
    678 	if (err != 0) {
    679 		nvf_error("%s: error %d packing nvlist\n",
    680 			filename, err);
    681 		return (err);
    682 	}
    683 
    684 	buf = kmem_alloc(sizeof (nvpf_hdr_t) + buflen, KM_SLEEP);
    685 	bzero(buf, sizeof (nvpf_hdr_t));
    686 
    687 	((nvpf_hdr_t *)buf)->nvpf_magic = NVPF_HDR_MAGIC;
    688 	((nvpf_hdr_t *)buf)->nvpf_version = NVPF_HDR_VERSION;
    689 	((nvpf_hdr_t *)buf)->nvpf_size = buflen;
    690 	((nvpf_hdr_t *)buf)->nvpf_chksum = nvp_cksum((uchar_t *)nvbuf, buflen);
    691 	((nvpf_hdr_t *)buf)->nvpf_hdr_chksum =
    692 		nvp_cksum((uchar_t *)buf, sizeof (nvpf_hdr_t));
    693 
    694 	bcopy(nvbuf, buf + sizeof (nvpf_hdr_t), buflen);
    695 	kmem_free(nvbuf, buflen);
    696 	buflen += sizeof (nvpf_hdr_t);
    697 
    698 	len = strlen(filename) + MAX_SUFFIX_LEN + 2;
    699 	newname = kmem_alloc(len, KM_SLEEP);
    700 
    701 
    702 	(void) sprintf(newname, "%s.%s",
    703 		filename, NEW_FILENAME_SUFFIX);
    704 
    705 	/*
    706 	 * To make it unlikely we suffer data loss, write
    707 	 * data to the new temporary file.  Once successful
    708 	 * complete the transaction by renaming the new file
    709 	 * to replace the previous.
    710 	 */
    711 
    712 	if ((err = kfcreate(newname, &fp)) == 0) {
    713 		err = kfwrite(fp, buf, buflen, &n);
    714 		if (err) {
    715 			nvf_error("%s: write error - %d\n",
    716 				newname, err);
    717 		} else {
    718 			if (n != buflen) {
    719 				nvf_error(
    720 				    "%s: partial write %ld of %ld bytes\n",
    721 				    newname, n, buflen);
    722 				nvf_error("%s: filesystem may be full?\n",
    723 				    newname);
    724 				err = EIO;
    725 			}
    726 		}
    727 		if ((err1 = kfclose(fp)) != 0) {
    728 			nvf_error("%s: close error\n", newname);
    729 			if (err == 0)
    730 				err = err1;
    731 		}
    732 		if (err != 0) {
    733 			if (kfremove(newname) != 0) {
    734 				nvf_error("%s: remove failed\n",
    735 				    newname);
    736 			}
    737 		}
    738 	} else {
    739 		nvf_error("%s: create failed - %d\n", filename, err);
    740 	}
    741 
    742 	if (err == 0) {
    743 		if ((err = kfrename(newname, filename)) != 0) {
    744 			nvf_error("%s: rename from %s failed\n",
    745 				newname, filename);
    746 		}
    747 	}
    748 
    749 	kmem_free(newname, len);
    750 	kmem_free(buf, buflen);
    751 
    752 	return (err);
    753 }
    754 
    755 static int
    756 e_fwrite_nvlist(nvfd_t *nvfd, nvlist_t *nvl)
    757 {
    758 	int err;
    759 
    760 	if ((err = fwrite_nvlist(nvfd->nvf_cache_path, nvl)) == 0)
    761 		return (DDI_SUCCESS);
    762 	else {
    763 		if (err == EROFS)
    764 			NVF_MARK_READONLY(nvfd);
    765 		return (DDI_FAILURE);
    766 	}
    767 }
    768 
    769 static void
    770 nvp_list_free(nvfd_t *nvf)
    771 {
    772 	ASSERT(RW_WRITE_HELD(&nvf->nvf_lock));
    773 	(nvf->nvf_list_free)((nvf_handle_t)nvf);
    774 	ASSERT(RW_WRITE_HELD(&nvf->nvf_lock));
    775 }
    776 
    777 /*
    778  * Read a file in the nvlist format
    779  *	EIO - i/o error during read
    780  *	ENOENT - file not found
    781  *	EINVAL - file contents corrupted
    782  */
    783 static int
    784 fread_nvp_list(nvfd_t *nvfd)
    785 {
    786 	nvlist_t	*nvl;
    787 	nvpair_t	*nvp;
    788 	char		*name;
    789 	nvlist_t	*sublist;
    790 	int		rval;
    791 	int		rv;
    792 
    793 	ASSERT(RW_WRITE_HELD(&(nvfd->nvf_lock)));
    794 
    795 	rval = fread_nvlist(nvfd->nvf_cache_path, &nvl);
    796 	if (rval != 0)
    797 		return (rval);
    798 	ASSERT(nvl != NULL);
    799 
    800 	nvp = NULL;
    801 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
    802 		name = nvpair_name(nvp);
    803 		ASSERT(strlen(name) > 0);
    804 
    805 		switch (nvpair_type(nvp)) {
    806 		case DATA_TYPE_NVLIST:
    807 			rval = nvpair_value_nvlist(nvp, &sublist);
    808 			if (rval != 0) {
    809 				nvf_error(
    810 				    "nvpair_value_nvlist error %s %d\n",
    811 				    name, rval);
    812 				goto error;
    813 			}
    814 
    815 			/*
    816 			 * unpack nvlist for this device and
    817 			 * add elements to data list.
    818 			 */
    819 			ASSERT(RW_WRITE_HELD(&(nvfd->nvf_lock)));
    820 			rv = (nvfd->nvf_unpack_nvlist)
    821 			    ((nvf_handle_t)nvfd, sublist, name);
    822 			ASSERT(RW_WRITE_HELD(&(nvfd->nvf_lock)));
    823 			if (rv != 0) {
    824 				nvf_error(
    825 				    "%s: %s invalid list element\n",
    826 				    nvfd->nvf_cache_path, name);
    827 				rval = EINVAL;
    828 				goto error;
    829 			}
    830 			break;
    831 
    832 		default:
    833 			nvf_error("%s: %s unsupported data type %d\n",
    834 				nvfd->nvf_cache_path, name, nvpair_type(nvp));
    835 			rval = EINVAL;
    836 			goto error;
    837 		}
    838 	}
    839 
    840 	nvlist_free(nvl);
    841 
    842 	return (0);
    843 
    844 error:
    845 	nvlist_free(nvl);
    846 	nvp_list_free(nvfd);
    847 	return (rval);
    848 }
    849 
    850 
    851 int
    852 nvf_read_file(nvf_handle_t nvf_handle)
    853 {
    854 	nvfd_t *nvfd = (nvfd_t *)nvf_handle;
    855 	int rval;
    856 
    857 	ASSERT(RW_WRITE_HELD(&nvfd->nvf_lock));
    858 
    859 	if (kfio_disable_read)
    860 		return (0);
    861 
    862 	KFDEBUG((CE_CONT, "reading %s\n", nvfd->nvf_cache_path));
    863 
    864 	rval = fread_nvp_list(nvfd);
    865 	if (rval) {
    866 		switch (rval) {
    867 		case EIO:
    868 			nvfd->nvf_flags |= NVF_F_REBUILD_MSG;
    869 			cmn_err(CE_WARN, "%s: I/O error",
    870 				nvfd->nvf_cache_path);
    871 			break;
    872 		case ENOENT:
    873 			nvfd->nvf_flags |= NVF_F_CREATE_MSG;
    874 			nvf_error("%s: not found\n",
    875 				nvfd->nvf_cache_path);
    876 			break;
    877 		case EINVAL:
    878 		default:
    879 			nvfd->nvf_flags |= NVF_F_REBUILD_MSG;
    880 			cmn_err(CE_WARN, "%s: data file corrupted",
    881 				nvfd->nvf_cache_path);
    882 			break;
    883 		}
    884 	}
    885 	return (rval);
    886 }
    887 
    888 static void
    889 nvf_write_is_complete(nvfd_t *fd)
    890 {
    891 	if (fd->nvf_write_complete) {
    892 		(fd->nvf_write_complete)((nvf_handle_t)fd);
    893 	}
    894 }
    895 
/*
 * Flush delay timer callback.  If the deadline in nvpticks is still
 * more than 4 ticks away (it is pushed out by each nvf_wake_daemon()
 * call), re-arm the timer for the remaining ticks.  Otherwise request
 * a flush by setting do_nvpflush and signalling nvpflush_cv, and mark
 * the timer idle.  The argument is unused.
 */
/*ARGSUSED*/
static void
nvpflush_timeout(void *arg)
{
	clock_t nticks;

	mutex_enter(&nvpflush_lock);
	/* Ticks remaining until the deadline. */
	nticks = nvpticks - ddi_get_lbolt();
	if (nticks > 4) {
		/* Not due yet: the lock is dropped before re-arming. */
		nvpflush_timer_busy = 1;
		mutex_exit(&nvpflush_lock);
		nvpflush_id = timeout(nvpflush_timeout, NULL, nticks);
	} else {
		/* Due: signal the flush and mark the timer as idle. */
		do_nvpflush = 1;
		NVPDAEMON_DEBUG((CE_CONT, "signal nvpdaemon\n"));
		cv_signal(&nvpflush_cv);
		nvpflush_id = 0;
		nvpflush_timer_busy = 0;
		mutex_exit(&nvpflush_lock);
	}
}
    917 
/*
 * After marking a list as dirty, wake the nvpflush daemon
 * to perform the update.
 *
 * The daemon thread is created on first use.  Each call moves the
 * flush deadline (nvpticks) to nvpflush_delay seconds from now and
 * arms the flush timer unless it is already armed.  Does nothing
 * until i_ddi_io_initialized() reports true.
 */
void
nvf_wake_daemon(void)
{
	clock_t nticks;

	/*
	 * If the system isn't up yet
	 * don't even think about starting a flush.
	 */
	if (!i_ddi_io_initialized())
		return;

	mutex_enter(&nvpflush_lock);

	if (nvpflush_daemon_active == 0) {
		/*
		 * Claim the "active" flag under the lock, then drop the
		 * lock around thread creation.
		 */
		nvpflush_daemon_active = 1;
		mutex_exit(&nvpflush_lock);
		NVPDAEMON_DEBUG((CE_CONT, "starting nvpdaemon thread\n"));
		nvpflush_thr_id = thread_create(NULL, 0,
		    (void (*)())nvpflush_daemon,
		    NULL, 0, &p0, TS_RUN, minclsyspri);
		mutex_enter(&nvpflush_lock);
	}

	/* Push the flush deadline out by the configured delay. */
	nticks = nvpflush_delay * TICKS_PER_SECOND;
	nvpticks = ddi_get_lbolt() + nticks;
	if (nvpflush_timer_busy == 0) {
		/* Arm the timer; timeout() is called with the lock dropped. */
		nvpflush_timer_busy = 1;
		mutex_exit(&nvpflush_lock);
		nvpflush_id = timeout(nvpflush_timeout, NULL, nticks + 4);
	} else
		mutex_exit(&nvpflush_lock);
}
    955 
    956 static int
    957 nvpflush_one(nvfd_t *nvfd)
    958 {
    959 	int rval = DDI_SUCCESS;
    960 	nvlist_t *nvl;
    961 
    962 	rw_enter(&nvfd->nvf_lock, RW_READER);
    963 
    964 	ASSERT((nvfd->nvf_flags & NVF_F_FLUSHING) == 0);
    965 
    966 	if (!NVF_IS_DIRTY(nvfd) ||
    967 	    NVF_IS_READONLY(nvfd) || kfio_disable_write) {
    968 		NVF_CLEAR_DIRTY(nvfd);
    969 		rw_exit(&nvfd->nvf_lock);
    970 		return (DDI_SUCCESS);
    971 	}
    972 
    973 	if (rw_tryupgrade(&nvfd->nvf_lock) == 0) {
    974 		nvf_error("nvpflush: "
    975 		    "%s rw upgrade failed\n", nvfd->nvf_cache_path);
    976 		rw_exit(&nvfd->nvf_lock);
    977 		return (DDI_FAILURE);
    978 	}
    979 	if (((nvfd->nvf_pack_list)
    980 	    ((nvf_handle_t)nvfd, &nvl)) != DDI_SUCCESS) {
    981 		nvf_error("nvpflush: "
    982 		    "%s nvlist construction failed\n", nvfd->nvf_cache_path);
    983 		ASSERT(RW_WRITE_HELD(&nvfd->nvf_lock));
    984 		rw_exit(&nvfd->nvf_lock);
    985 		return (DDI_FAILURE);
    986 	}
    987 	ASSERT(RW_WRITE_HELD(&nvfd->nvf_lock));
    988 
    989 	NVF_CLEAR_DIRTY(nvfd);
    990 	nvfd->nvf_flags |= NVF_F_FLUSHING;
    991 	rw_exit(&nvfd->nvf_lock);
    992 
    993 	rval = e_fwrite_nvlist(nvfd, nvl);
    994 	nvlist_free(nvl);
    995 
    996 	rw_enter(&nvfd->nvf_lock, RW_WRITER);
    997 	nvfd->nvf_flags &= ~NVF_F_FLUSHING;
    998 	if (rval == DDI_FAILURE) {
    999 		if (NVF_IS_READONLY(nvfd)) {
   1000 			rval = DDI_SUCCESS;
   1001 			nvfd->nvf_flags &= ~(NVF_F_ERROR | NVF_F_DIRTY);
   1002 		} else if ((nvfd->nvf_flags & NVF_F_ERROR) == 0) {
   1003 			cmn_err(CE_CONT,
   1004 			    "%s: updated failed\n", nvfd->nvf_cache_path);
   1005 			nvfd->nvf_flags |= NVF_F_ERROR | NVF_F_DIRTY;
   1006 		}
   1007 	} else {
   1008 		if (nvfd->nvf_flags & NVF_F_CREATE_MSG) {
   1009 			cmn_err(CE_CONT,
   1010 			    "!Creating %s\n", nvfd->nvf_cache_path);
   1011 			nvfd->nvf_flags &= ~NVF_F_CREATE_MSG;
   1012 		}
   1013 		if (nvfd->nvf_flags & NVF_F_REBUILD_MSG) {
   1014 			cmn_err(CE_CONT,
   1015 			    "!Rebuilding %s\n", nvfd->nvf_cache_path);
   1016 			nvfd->nvf_flags &= ~NVF_F_REBUILD_MSG;
   1017 		}
   1018 		if (nvfd->nvf_flags & NVF_F_ERROR) {
   1019 			cmn_err(CE_CONT,
   1020 			    "%s: update now ok\n", nvfd->nvf_cache_path);
   1021 			nvfd->nvf_flags &= ~NVF_F_ERROR;
   1022 		}
   1023 		/*
   1024 		 * The file may need to be flushed again if the cached
   1025 		 * data was touched while writing the earlier contents.
   1026 		 */
   1027 		if (NVF_IS_DIRTY(nvfd))
   1028 			rval = DDI_FAILURE;
   1029 	}
   1030 
   1031 	rw_exit(&nvfd->nvf_lock);
   1032 	return (rval);
   1033 }
   1034 
   1035 
   1036 static void
   1037 nvpflush_daemon(void)
   1038 {
   1039 	callb_cpr_t cprinfo;
   1040 	nvfd_t *nvfdp, *nextfdp;
   1041 	clock_t clk;
   1042 	int rval;
   1043 	int want_wakeup;
   1044 	int is_now_clean;
   1045 
   1046 	ASSERT(modrootloaded);
   1047 
   1048 	nvpflush_thread = curthread;
   1049 	NVPDAEMON_DEBUG((CE_CONT, "nvpdaemon: init\n"));
   1050 
   1051 	CALLB_CPR_INIT(&cprinfo, &nvpflush_lock, callb_generic_cpr, "nvp");
   1052 	mutex_enter(&nvpflush_lock);
   1053 	for (;;) {
   1054 
   1055 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
   1056 		while (do_nvpflush == 0) {
   1057 			clk = cv_timedwait(&nvpflush_cv, &nvpflush_lock,
   1058 			    ddi_get_lbolt() +
   1059 				(nvpdaemon_idle_time * TICKS_PER_SECOND));
   1060 			if (clk == -1 &&
   1061 			    do_nvpflush == 0 && nvpflush_timer_busy == 0) {
   1062 				/*
   1063 				 * Note that CALLB_CPR_EXIT calls mutex_exit()
   1064 				 * on the lock passed in to CALLB_CPR_INIT,
   1065 				 * so the lock must be held when invoking it.
   1066 				 */
   1067 				CALLB_CPR_SAFE_END(&cprinfo, &nvpflush_lock);
   1068 				NVPDAEMON_DEBUG((CE_CONT, "nvpdaemon: exit\n"));
   1069 				ASSERT(mutex_owned(&nvpflush_lock));
   1070 				nvpflush_thr_id = NULL;
   1071 				nvpflush_daemon_active = 0;
   1072 				CALLB_CPR_EXIT(&cprinfo);
   1073 				thread_exit();
   1074 			}
   1075 		}
   1076 		CALLB_CPR_SAFE_END(&cprinfo, &nvpflush_lock);
   1077 
   1078 		nvpbusy = 1;
   1079 		want_wakeup = 0;
   1080 		do_nvpflush = 0;
   1081 		mutex_exit(&nvpflush_lock);
   1082 
   1083 		/*
   1084 		 * Try flushing what's dirty, reschedule if there's
   1085 		 * a failure or data gets marked as dirty again.
   1086 		 * First move each file marked dirty to the dirty
   1087 		 * list to avoid locking the list across the write.
   1088 		 */
   1089 		mutex_enter(&nvf_cache_mutex);
   1090 		for (nvfdp = list_head(&nvf_cache_files);
   1091 		    nvfdp; nvfdp = nextfdp) {
   1092 			nextfdp = list_next(&nvf_cache_files, nvfdp);
   1093 			rw_enter(&nvfdp->nvf_lock, RW_READER);
   1094 			if (NVF_IS_DIRTY(nvfdp)) {
   1095 				list_remove(&nvf_cache_files, nvfdp);
   1096 				list_insert_tail(&nvf_dirty_files, nvfdp);
   1097 				rw_exit(&nvfdp->nvf_lock);
   1098 			} else {
   1099 				NVPDAEMON_DEBUG((CE_CONT,
   1100 				    "nvpdaemon: not dirty %s\n",
   1101 				    nvfdp->nvf_cache_path));
   1102 				rw_exit(&nvfdp->nvf_lock);
   1103 			}
   1104 		}
   1105 		mutex_exit(&nvf_cache_mutex);
   1106 
   1107 		/*
   1108 		 * Now go through the dirty list
   1109 		 */
   1110 		for (nvfdp = list_head(&nvf_dirty_files);
   1111 		    nvfdp; nvfdp = nextfdp) {
   1112 			nextfdp = list_next(&nvf_dirty_files, nvfdp);
   1113 
   1114 			is_now_clean = 0;
   1115 			rw_enter(&nvfdp->nvf_lock, RW_READER);
   1116 			if (NVF_IS_DIRTY(nvfdp)) {
   1117 				NVPDAEMON_DEBUG((CE_CONT,
   1118 				    "nvpdaemon: flush %s\n",
   1119 				    nvfdp->nvf_cache_path));
   1120 				rw_exit(&nvfdp->nvf_lock);
   1121 				rval = nvpflush_one(nvfdp);
   1122 				rw_enter(&nvfdp->nvf_lock, RW_READER);
   1123 				if (rval != DDI_SUCCESS ||
   1124