Home | History | Annotate | Download | only in os
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #include <sys/note.h>
     27 #include <sys/t_lock.h>
     28 #include <sys/cmn_err.h>
     29 #include <sys/instance.h>
     30 #include <sys/conf.h>
     31 #include <sys/stat.h>
     32 #include <sys/ddi.h>
     33 #include <sys/hwconf.h>
     34 #include <sys/sunddi.h>
     35 #include <sys/sunndi.h>
     36 #include <sys/ddi_impldefs.h>
     37 #include <sys/ndi_impldefs.h>
     38 #include <sys/modctl.h>
     39 #include <sys/dacf.h>
     40 #include <sys/promif.h>
     41 #include <sys/cpuvar.h>
     42 #include <sys/pathname.h>
     43 #include <sys/kobj.h>
     44 #include <sys/devcache.h>
     45 #include <sys/devcache_impl.h>
     46 #include <sys/sysmacros.h>
     47 #include <sys/varargs.h>
     48 #include <sys/callb.h>
     49 
     50 /*
     51  * This facility provides interfaces to clients to register,
     52  * read and update cache data in persisted backing store files,
     53  * usually in /etc/devices.  The data persisted through this
     54  * mechanism should be stateless data, functioning in the sense
     55  * of a cache.  Writes are performed by a background daemon
     56  * thread, permitting a client to schedule an update without
     57  * blocking, then continue updating the data state in
     58  * parallel.  The data is only locked by the daemon thread
     59  * to pack the data in preparation for the write.
     60  *
     61  * Data persisted through this mechanism should be capable
     62  * of being regenerated through normal system operation,
     63  * for example attaching all disk devices would cause all
     64  * devids to be registered for those devices.  By caching
     65  * a devid-device tuple, the system can operate in a
     66  * more optimal way, directly attaching the device mapped
     67  * to a devid, rather than burdensomely driving attach of
     68  * the entire device tree to discover a single device.
     69  *
     70  * Note that a client should only need to include
     71  * <sys/devcache.h> for the supported interfaces.
     72  *
     73  * The data per client is entirely within the control of
     74  * the client.  When reading, data unpacked from the backing
     75  * store should be inserted in the list.  The pointer to
     76  * the list can be retrieved via nvf_list().  When writing,
     77  * the data on the list is to be packed and returned to the
     78  * nvpdaemon as an nvlist.
     79  *
     80  * Obvious restrictions are imposed by the limits of the
     81  * nvlist format.  The data cannot be read or written
     82  * piecemeal, and large amounts of data aren't recommended.
     83  * However, nvlists do allow that data be named and typed
     84  * and can be size-of-int invariant, and the cached data
     85  * can be versioned conveniently.
     86  *
     87  * The registration involves two steps: a handle is
     88  * allocated by calling the registration function.
     89  * This sets up the data referenced by the handle and
     90  * initializes the lock.  Following registration, the
     91  * client must initialize the data list.  The list
     92  * interfaces require that the list element with offset
     93  * to the node link be provided.  The format of the
     94  * list element is under the control of the client.
     95  *
     96  * Locking: the address of the data list r/w lock provided
     97  * can be accessed with nvf_lock().  The lock must be held
     98  * as reader when traversing the list or checking state,
     99  * such as nvf_is_dirty().  The lock must be held as
    100  * writer when updating the list or marking it dirty.
    101  * The lock must not be held when waking the daemon.
    102  *
    103  * The data r/w lock is held as writer when the pack,
    104  * unpack and free list handlers are called.  The
    105  * lock should not be dropped and must be still held
    106  * upon return.  The client should also hold the lock
    107  * as reader when checking if the list is dirty, and
    108  * as writer when marking the list dirty or initiating
    109  * a read.
    110  *
    111  * The asynchronous nature of updates allows for the
    112  * possibility that the data may continue to be updated
    113  * once the daemon has been notified that an update is
    114  * desired.  The data only needs to be locked against
    115  * updates when packing the data into the form to be
    116  * written.  When the write of the packed data has
    117  * completed, the daemon will automatically reschedule
    118  * an update if the data was marked dirty after the
    119  * point at which it was packed.  Before beginning an
    120  * update, the daemon attempts to lock the data as
    121  * writer; if the writer lock is already held, it
    122  * backs off and retries later.  The model is to give
    123  * priority to the kernel processes generating the
    124  * data, and that the nature of the data is that
    125  * it does not change often, can be re-generated when
    126  * needed, so updates should not happen often and
    127  * can be delayed until the data stops changing.
    128  * The client may update the list or mark it dirty
    129  * any time it is able to acquire the lock as
    130  * writer first.
    131  *
    132  * A failed write will be retried after some delay,
    133  * in the hope that the cause of the error will be
    134  * transient, for example a filesystem with no space
    135  * available.  An update on a read-only filesystem
    136  * is failed silently and not retried; this would be
    137  * the case when booted off install media.
    138  *
    139  * There is no unregister mechanism as of yet, as it
    140  * hasn't been needed so far.
    141  */
    142 
    143 /*
    144  * Global list of files registered and updated by the nvpflush
    145  * daemon, protected by the nvf_cache_mutex.  While an
    146  * update is taking place, a file is temporarily moved to
    147  * the dirty list to avoid locking the primary list for
    148  * the duration of the update.
    149  */
    150 list_t		nvf_cache_files;
    151 list_t		nvf_dirty_files;
    152 kmutex_t	nvf_cache_mutex;
    153 
    154 
    155 /*
    156  * Allow some delay from an update of the data before flushing
    157  * to permit simultaneous updates of multiple changes.
 * Changes in the data are expected to be bursty, e.g. during a
 * reconfig or the hot-plug of a new adapter.
    160  *
    161  * kfio_report_error (default 0)
    162  *	Set to 1 to enable some error messages related to low-level
    163  *	kernel file i/o operations.
    164  *
    165  * nvpflush_delay (default 10)
    166  *	The number of seconds after data is marked dirty before the
    167  *	flush daemon is triggered to flush the data.  A longer period
    168  *	of time permits more data updates per write.  Note that
    169  *	every update resets the timer so no repository write will
    170  *	occur while data is being updated continuously.
    171  *
    172  * nvpdaemon_idle_time (default 60)
    173  *	The number of seconds the daemon will sleep idle before exiting.
    174  *
    175  */
    176 #define	NVPFLUSH_DELAY		10
    177 #define	NVPDAEMON_IDLE_TIME	60
    178 
    179 #define	TICKS_PER_SECOND	(drv_usectohz(1000000))
    180 
    181 /*
    182  * Tunables
    183  */
    184 int kfio_report_error = 0;		/* kernel file i/o operations */
    185 int kfio_disable_read = 0;		/* disable all reads */
    186 int kfio_disable_write = 0;		/* disable all writes */
    187 
    188 int nvpflush_delay	= NVPFLUSH_DELAY;
    189 int nvpdaemon_idle_time	= NVPDAEMON_IDLE_TIME;
    190 
/*
 * State shared between nvf_wake_daemon(), the nvpflush_timeout()
 * callout and the flush daemon.
 */
/* id of the most recently scheduled nvpflush_timeout(); 0 once it fired */
static timeout_id_t	nvpflush_id = 0;
/* non-zero while an nvpflush_timeout() callout is outstanding */
static int		nvpflush_timer_busy = 0;
/* set by nvf_wake_daemon() when it creates the daemon thread */
static int		nvpflush_daemon_active = 0;
/* thread returned by thread_create() for nvpflush_daemon */
static kthread_t	*nvpflush_thr_id = 0;

/* set by nvpflush_timeout() when the flush delay has expired */
static int		do_nvpflush = 0;
/* NOTE(review): not referenced in the portion of the file seen here */
static int		nvpbusy = 0;
/* held while examining or updating the timer/daemon state above */
static kmutex_t		nvpflush_lock;
/* signalled by nvpflush_timeout() together with do_nvpflush */
static kcondvar_t	nvpflush_cv;
/* NOTE(review): presumably the daemon's own thread id - confirm */
static kthread_id_t	nvpflush_thread;
/* lbolt value at which the next flush becomes due */
static clock_t		nvpticks;
    202 
    203 static void nvpflush_daemon(void);
    204 
    205 #ifdef	DEBUG
    206 int nvpdaemon_debug = 0;
    207 int kfio_debug = 0;
    208 #endif	/* DEBUG */
    209 
    210 extern int modrootloaded;
    211 extern void mdi_read_devices_files(void);
    212 extern void mdi_clean_vhcache(void);
    213 extern int sys_shutdown;
    214 
    215 /*
    216  * Initialize the overall cache file management
    217  */
    218 void
    219 i_ddi_devices_init(void)
    220 {
    221 	list_create(&nvf_cache_files, sizeof (nvfd_t),
    222 	    offsetof(nvfd_t, nvf_link));
    223 	list_create(&nvf_dirty_files, sizeof (nvfd_t),
    224 	    offsetof(nvfd_t, nvf_link));
    225 	mutex_init(&nvf_cache_mutex, NULL, MUTEX_DEFAULT, NULL);
    226 	retire_store_init();
    227 	devid_cache_init();
    228 }
    229 
    230 /*
    231  * Read cache files
    232  * The files read here should be restricted to those
    233  * that may be required to mount root.
    234  */
    235 void
    236 i_ddi_read_devices_files(void)
    237 {
    238 	/*
    239 	 * The retire store should be the first file read as it
    240 	 * may need to offline devices. kfio_disable_read is not
    241 	 * used for retire. For the rationale see the tunable
    242 	 * ddi_retire_store_bypass and comments in:
    243 	 *	uts/common/os/retire_store.c
    244 	 */
    245 
    246 	retire_store_read();
    247 
    248 	if (!kfio_disable_read) {
    249 		mdi_read_devices_files();
    250 		devid_cache_read();
    251 	}
    252 }
    253 
    254 void
    255 i_ddi_start_flush_daemon(void)
    256 {
    257 	nvfd_t	*nvfdp;
    258 
    259 	ASSERT(i_ddi_io_initialized());
    260 
    261 	mutex_init(&nvpflush_lock, NULL, MUTEX_DRIVER, NULL);
    262 	cv_init(&nvpflush_cv, NULL, CV_DRIVER, NULL);
    263 
    264 	mutex_enter(&nvf_cache_mutex);
    265 	for (nvfdp = list_head(&nvf_cache_files); nvfdp;
    266 	    nvfdp = list_next(&nvf_cache_files, nvfdp)) {
    267 		if (NVF_IS_DIRTY(nvfdp)) {
    268 			nvf_wake_daemon();
    269 			break;
    270 		}
    271 	}
    272 	mutex_exit(&nvf_cache_mutex);
    273 }
    274 
/*
 * Invoke the cleanup entry points of the devid cache and of the
 * mdi vhci cache, in that order.
 */
void
i_ddi_clean_devices_files(void)
{
	devid_cache_cleanup();
	mdi_clean_vhcache();
}
    281 
    282 /*
    283  * Register a cache file to be managed and updated by the nvpflush daemon.
    284  * All operations are performed through the returned handle.
    285  * There is no unregister mechanism for now.
    286  */
    287 nvf_handle_t
    288 nvf_register_file(nvf_ops_t *ops)
    289 {
    290 	nvfd_t *nvfdp;
    291 
    292 	nvfdp = kmem_zalloc(sizeof (*nvfdp), KM_SLEEP);
    293 
    294 	nvfdp->nvf_ops = ops;
    295 	nvfdp->nvf_flags = 0;
    296 	rw_init(&nvfdp->nvf_lock, NULL, RW_DRIVER, NULL);
    297 
    298 	mutex_enter(&nvf_cache_mutex);
    299 	list_insert_tail(&nvf_cache_files, nvfdp);
    300 	mutex_exit(&nvf_cache_mutex);
    301 
    302 	return ((nvf_handle_t)nvfdp);
    303 }
    304 
    305 /*PRINTFLIKE1*/
    306 void
    307 nvf_error(const char *fmt, ...)
    308 {
    309 	va_list ap;
    310 
    311 	if (kfio_report_error) {
    312 		va_start(ap, fmt);
    313 		vcmn_err(CE_NOTE, fmt, ap);
    314 		va_end(ap);
    315 	}
    316 }
    317 
    318 /*
    319  * Some operations clients may use to manage the data
    320  * to be persisted in a cache file.
    321  */
    322 char *
    323 nvf_cache_name(nvf_handle_t handle)
    324 {
    325 	return (((nvfd_t *)handle)->nvf_cache_path);
    326 }
    327 
    328 krwlock_t *
    329 nvf_lock(nvf_handle_t handle)
    330 {
    331 	return (&(((nvfd_t *)handle)->nvf_lock));
    332 }
    333 
    334 list_t *
    335 nvf_list(nvf_handle_t handle)
    336 {
    337 	return (&(((nvfd_t *)handle)->nvf_data_list));
    338 }
    339 
    340 void
    341 nvf_mark_dirty(nvf_handle_t handle)
    342 {
    343 	ASSERT(RW_WRITE_HELD(&(((nvfd_t *)handle)->nvf_lock)));
    344 	NVF_MARK_DIRTY((nvfd_t *)handle);
    345 }
    346 
    347 int
    348 nvf_is_dirty(nvf_handle_t handle)
    349 {
    350 	ASSERT(RW_LOCK_HELD(&(((nvfd_t *)handle)->nvf_lock)));
    351 	return (NVF_IS_DIRTY((nvfd_t *)handle));
    352 }
    353 
    354 static uint16_t
    355 nvp_cksum(uchar_t *buf, int64_t buflen)
    356 {
    357 	uint16_t cksum = 0;
    358 	uint16_t *p = (uint16_t *)buf;
    359 	int64_t n;
    360 
    361 	if ((buflen & 0x01) != 0) {
    362 		buflen--;
    363 		cksum = buf[buflen];
    364 	}
    365 	n = buflen / 2;
    366 	while (n-- > 0)
    367 		cksum ^= *p++;
    368 	return (cksum);
    369 }
    370 
    371 int
    372 fread_nvlist(char *filename, nvlist_t **ret_nvlist)
    373 {
    374 	struct _buf	*file;
    375 	nvpf_hdr_t	hdr;
    376 	char		*buf;
    377 	nvlist_t	*nvl;
    378 	int		rval;
    379 	uint_t		offset;
    380 	int		n;
    381 	char		c;
    382 	uint16_t	cksum, hdrsum;
    383 
    384 	*ret_nvlist = NULL;
    385 
    386 	file = kobj_open_file(filename);
    387 	if (file == (struct _buf *)-1) {
    388 		KFDEBUG((CE_CONT, "cannot open file: %s\n", filename));
    389 		return (ENOENT);
    390 	}
    391 
    392 	offset = 0;
    393 	n = kobj_read_file(file, (char *)&hdr, sizeof (hdr), offset);
    394 	if (n != sizeof (hdr)) {
    395 		kobj_close_file(file);
    396 		if (n < 0) {
    397 			nvf_error("error reading header: %s\n", filename);
    398 			return (EIO);
    399 		} else if (n == 0) {
    400 			KFDEBUG((CE_CONT, "file empty: %s\n", filename));
    401 		} else {
    402 			nvf_error("header size incorrect: %s\n", filename);
    403 		}
    404 		return (EINVAL);
    405 	}
    406 	offset += n;
    407 
    408 	KFDEBUG2((CE_CONT, "nvpf_magic: 0x%x\n", hdr.nvpf_magic));
    409 	KFDEBUG2((CE_CONT, "nvpf_version: %d\n", hdr.nvpf_version));
    410 	KFDEBUG2((CE_CONT, "nvpf_size: %lld\n",
    411 	    (longlong_t)hdr.nvpf_size));
    412 	KFDEBUG2((CE_CONT, "nvpf_hdr_chksum: 0x%x\n",
    413 	    hdr.nvpf_hdr_chksum));
    414 	KFDEBUG2((CE_CONT, "nvpf_chksum: 0x%x\n", hdr.nvpf_chksum));
    415 
    416 	cksum = hdr.nvpf_hdr_chksum;
    417 	hdr.nvpf_hdr_chksum = 0;
    418 	hdrsum = nvp_cksum((uchar_t *)&hdr, sizeof (hdr));
    419 
    420 	if (hdr.nvpf_magic != NVPF_HDR_MAGIC ||
    421 	    hdr.nvpf_version != NVPF_HDR_VERSION || hdrsum != cksum) {
    422 		kobj_close_file(file);
    423 		if (hdrsum != cksum) {
    424 			nvf_error("%s: checksum error "
    425 			    "(actual 0x%x, expected 0x%x)\n",
    426 			    filename, hdrsum, cksum);
    427 		}
    428 		nvf_error("%s: header information incorrect", filename);
    429 		return (EINVAL);
    430 	}
    431 
    432 	ASSERT(hdr.nvpf_size >= 0);
    433 
    434 	buf = kmem_alloc(hdr.nvpf_size, KM_SLEEP);
    435 	n = kobj_read_file(file, buf, hdr.nvpf_size, offset);
    436 	if (n != hdr.nvpf_size) {
    437 		kmem_free(buf, hdr.nvpf_size);
    438 		kobj_close_file(file);
    439 		if (n < 0) {
    440 			nvf_error("%s: read error %d", filename, n);
    441 		} else {
    442 			nvf_error("%s: incomplete read %d/%lld",
    443 			    filename, n, (longlong_t)hdr.nvpf_size);
    444 		}
    445 		return (EINVAL);
    446 	}
    447 	offset += n;
    448 
    449 	rval = kobj_read_file(file, &c, 1, offset);
    450 	kobj_close_file(file);
    451 	if (rval > 0) {
    452 		nvf_error("%s is larger than %lld\n",
    453 		    filename, (longlong_t)hdr.nvpf_size);
    454 		kmem_free(buf, hdr.nvpf_size);
    455 		return (EINVAL);
    456 	}
    457 
    458 	cksum = nvp_cksum((uchar_t *)buf, hdr.nvpf_size);
    459 	if (hdr.nvpf_chksum != cksum) {
    460 		nvf_error("%s: checksum error (actual 0x%x, expected 0x%x)\n",
    461 		    filename, hdr.nvpf_chksum, cksum);
    462 		kmem_free(buf, hdr.nvpf_size);
    463 		return (EINVAL);
    464 	}
    465 
    466 	nvl = NULL;
    467 	rval = nvlist_unpack(buf, hdr.nvpf_size, &nvl, 0);
    468 	if (rval != 0) {
    469 		nvf_error("%s: error %d unpacking nvlist\n",
    470 		    filename, rval);
    471 		kmem_free(buf, hdr.nvpf_size);
    472 		return (EINVAL);
    473 	}
    474 
    475 	kmem_free(buf, hdr.nvpf_size);
    476 	*ret_nvlist = nvl;
    477 	return (0);
    478 }
    479 
    480 static int
    481 kfcreate(char *filename, kfile_t **kfilep)
    482 {
    483 	kfile_t	*fp;
    484 	int	rval;
    485 
    486 	ASSERT(modrootloaded);
    487 
    488 	fp = kmem_alloc(sizeof (kfile_t), KM_SLEEP);
    489 
    490 	fp->kf_vnflags = FCREAT | FWRITE | FTRUNC;
    491 	fp->kf_fname = filename;
    492 	fp->kf_fpos = 0;
    493 	fp->kf_state = 0;
    494 
    495 	KFDEBUG((CE_CONT, "create: %s flags 0x%x\n",
    496 	    filename, fp->kf_vnflags));
    497 	rval = vn_open(filename, UIO_SYSSPACE, fp->kf_vnflags,
    498 	    0444, &fp->kf_vp, CRCREAT, 0);
    499 	if (rval != 0) {
    500 		kmem_free(fp, sizeof (kfile_t));
    501 		KFDEBUG((CE_CONT, "%s: create error %d\n",
    502 		    filename, rval));
    503 		return (rval);
    504 	}
    505 
    506 	*kfilep = fp;
    507 	return (0);
    508 }
    509 
    510 static int
    511 kfremove(char *filename)
    512 {
    513 	int rval;
    514 
    515 	KFDEBUG((CE_CONT, "remove: %s\n", filename));
    516 	rval = vn_remove(filename, UIO_SYSSPACE, RMFILE);
    517 	if (rval != 0) {
    518 		KFDEBUG((CE_CONT, "%s: remove error %d\n",
    519 		    filename, rval));
    520 	}
    521 	return (rval);
    522 }
    523 
    524 static int
    525 kfread(kfile_t *fp, char *buf, ssize_t bufsiz, ssize_t *ret_n)
    526 {
    527 	ssize_t		resid;
    528 	int		err;
    529 	ssize_t		n;
    530 
    531 	ASSERT(modrootloaded);
    532 
    533 	if (fp->kf_state != 0)
    534 		return (fp->kf_state);
    535 
    536 	err = vn_rdwr(UIO_READ, fp->kf_vp, buf, bufsiz, fp->kf_fpos,
    537 	    UIO_SYSSPACE, 0, (rlim64_t)0, kcred, &resid);
    538 	if (err != 0) {
    539 		KFDEBUG((CE_CONT, "%s: read error %d\n",
    540 		    fp->kf_fname, err));
    541 		fp->kf_state = err;
    542 		return (err);
    543 	}
    544 
    545 	ASSERT(resid >= 0 && resid <= bufsiz);
    546 	n = bufsiz - resid;
    547 
    548 	KFDEBUG1((CE_CONT, "%s: read %ld bytes ok %ld bufsiz, %ld resid\n",
    549 	    fp->kf_fname, n, bufsiz, resid));
    550 
    551 	fp->kf_fpos += n;
    552 	*ret_n = n;
    553 	return (0);
    554 }
    555 
    556 static int
    557 kfwrite(kfile_t *fp, char *buf, ssize_t bufsiz, ssize_t *ret_n)
    558 {
    559 	rlim64_t	rlimit;
    560 	ssize_t		resid;
    561 	int		err;
    562 	ssize_t		len;
    563 	ssize_t		n = 0;
    564 
    565 	ASSERT(modrootloaded);
    566 
    567 	if (fp->kf_state != 0)
    568 		return (fp->kf_state);
    569 
    570 	len = bufsiz;
    571 	rlimit = bufsiz + 1;
    572 	for (;;) {
    573 		err = vn_rdwr(UIO_WRITE, fp->kf_vp, buf, len, fp->kf_fpos,
    574 		    UIO_SYSSPACE, FSYNC, rlimit, kcred, &resid);
    575 		if (err) {
    576 			KFDEBUG((CE_CONT, "%s: write error %d\n",
    577 			    fp->kf_fname, err));
    578 			fp->kf_state = err;
    579 			return (err);
    580 		}
    581 
    582 		KFDEBUG1((CE_CONT, "%s: write %ld bytes ok %ld resid\n",
    583 		    fp->kf_fname, len-resid, resid));
    584 
    585 		ASSERT(resid >= 0 && resid <= len);
    586 
    587 		n += (len - resid);
    588 		if (resid == 0)
    589 			break;
    590 
    591 		if (resid == len) {
    592 			KFDEBUG((CE_CONT, "%s: filesystem full?\n",
    593 			    fp->kf_fname));
    594 			fp->kf_state = ENOSPC;
    595 			return (ENOSPC);
    596 		}
    597 
    598 		len -= resid;
    599 		buf += len;
    600 		fp->kf_fpos += len;
    601 		len = resid;
    602 	}
    603 
    604 	ASSERT(n == bufsiz);
    605 	KFDEBUG1((CE_CONT, "%s: wrote %ld bytes ok\n", fp->kf_fname, n));
    606 
    607 	*ret_n = n;
    608 	return (0);
    609 }
    610 
    611 
    612 static int
    613 kfclose(kfile_t *fp)
    614 {
    615 	int		rval;
    616 
    617 	KFDEBUG((CE_CONT, "close: %s\n", fp->kf_fname));
    618 
    619 	if ((fp->kf_vnflags & FWRITE) && fp->kf_state == 0) {
    620 		rval = VOP_FSYNC(fp->kf_vp, FSYNC, kcred, NULL);
    621 		if (rval != 0) {
    622 			nvf_error("%s: sync error %d\n",
    623 			    fp->kf_fname, rval);
    624 		}
    625 		KFDEBUG((CE_CONT, "%s: sync ok\n", fp->kf_fname));
    626 	}
    627 
    628 	rval = VOP_CLOSE(fp->kf_vp, fp->kf_vnflags, 1,
    629 	    (offset_t)0, kcred, NULL);
    630 	if (rval != 0) {
    631 		if (fp->kf_state == 0) {
    632 			nvf_error("%s: close error %d\n",
    633 			    fp->kf_fname, rval);
    634 		}
    635 	} else {
    636 		if (fp->kf_state == 0)
    637 			KFDEBUG((CE_CONT, "%s: close ok\n", fp->kf_fname));
    638 	}
    639 
    640 	VN_RELE(fp->kf_vp);
    641 	kmem_free(fp, sizeof (kfile_t));
    642 	return (rval);
    643 }
    644 
    645 static int
    646 kfrename(char *oldname, char *newname)
    647 {
    648 	int rval;
    649 
    650 	ASSERT(modrootloaded);
    651 
    652 	KFDEBUG((CE_CONT, "renaming %s to %s\n", oldname, newname));
    653 
    654 	if ((rval = vn_rename(oldname, newname, UIO_SYSSPACE)) != 0) {
    655 		KFDEBUG((CE_CONT, "rename %s to %s: %d\n",
    656 		    oldname, newname, rval));
    657 	}
    658 
    659 	return (rval);
    660 }
    661 
    662 int
    663 fwrite_nvlist(char *filename, nvlist_t *nvl)
    664 {
    665 	char	*buf;
    666 	char	*nvbuf;
    667 	kfile_t	*fp;
    668 	char	*newname;
    669 	int	len, err, err1;
    670 	size_t	buflen;
    671 	ssize_t	n;
    672 
    673 	ASSERT(modrootloaded);
    674 
    675 	nvbuf = NULL;
    676 	err = nvlist_pack(nvl, &nvbuf, &buflen, NV_ENCODE_NATIVE, 0);
    677 	if (err != 0) {
    678 		nvf_error("%s: error %d packing nvlist\n",
    679 		    filename, err);
    680 		return (err);
    681 	}
    682 
    683 	buf = kmem_alloc(sizeof (nvpf_hdr_t) + buflen, KM_SLEEP);
    684 	bzero(buf, sizeof (nvpf_hdr_t));
    685 
    686 	((nvpf_hdr_t *)buf)->nvpf_magic = NVPF_HDR_MAGIC;
    687 	((nvpf_hdr_t *)buf)->nvpf_version = NVPF_HDR_VERSION;
    688 	((nvpf_hdr_t *)buf)->nvpf_size = buflen;
    689 	((nvpf_hdr_t *)buf)->nvpf_chksum = nvp_cksum((uchar_t *)nvbuf, buflen);
    690 	((nvpf_hdr_t *)buf)->nvpf_hdr_chksum =
    691 	    nvp_cksum((uchar_t *)buf, sizeof (nvpf_hdr_t));
    692 
    693 	bcopy(nvbuf, buf + sizeof (nvpf_hdr_t), buflen);
    694 	kmem_free(nvbuf, buflen);
    695 	buflen += sizeof (nvpf_hdr_t);
    696 
    697 	len = strlen(filename) + MAX_SUFFIX_LEN + 2;
    698 	newname = kmem_alloc(len, KM_SLEEP);
    699 
    700 
    701 	(void) sprintf(newname, "%s.%s", filename, NEW_FILENAME_SUFFIX);
    702 
    703 	/*
    704 	 * To make it unlikely we suffer data loss, write
    705 	 * data to the new temporary file.  Once successful
    706 	 * complete the transaction by renaming the new file
    707 	 * to replace the previous.
    708 	 */
    709 
    710 	if ((err = kfcreate(newname, &fp)) == 0) {
    711 		err = kfwrite(fp, buf, buflen, &n);
    712 		if (err) {
    713 			nvf_error("%s: write error - %d\n",
    714 			    newname, err);
    715 		} else {
    716 			if (n != buflen) {
    717 				nvf_error(
    718 				    "%s: partial write %ld of %ld bytes\n",
    719 				    newname, n, buflen);
    720 				nvf_error("%s: filesystem may be full?\n",
    721 				    newname);
    722 				err = EIO;
    723 			}
    724 		}
    725 		if ((err1 = kfclose(fp)) != 0) {
    726 			nvf_error("%s: close error\n", newname);
    727 			if (err == 0)
    728 				err = err1;
    729 		}
    730 		if (err != 0) {
    731 			if (kfremove(newname) != 0) {
    732 				nvf_error("%s: remove failed\n",
    733 				    newname);
    734 			}
    735 		}
    736 	} else {
    737 		nvf_error("%s: create failed - %d\n", filename, err);
    738 	}
    739 
    740 	if (err == 0) {
    741 		if ((err = kfrename(newname, filename)) != 0) {
    742 			nvf_error("%s: rename from %s failed\n",
    743 			    newname, filename);
    744 		}
    745 	}
    746 
    747 	kmem_free(newname, len);
    748 	kmem_free(buf, buflen);
    749 
    750 	return (err);
    751 }
    752 
    753 static int
    754 e_fwrite_nvlist(nvfd_t *nvfd, nvlist_t *nvl)
    755 {
    756 	int err;
    757 
    758 	if ((err = fwrite_nvlist(nvfd->nvf_cache_path, nvl)) == 0)
    759 		return (DDI_SUCCESS);
    760 	else {
    761 		if (err == EROFS)
    762 			NVF_MARK_READONLY(nvfd);
    763 		return (DDI_FAILURE);
    764 	}
    765 }
    766 
    767 static void
    768 nvp_list_free(nvfd_t *nvf)
    769 {
    770 	ASSERT(RW_WRITE_HELD(&nvf->nvf_lock));
    771 	(nvf->nvf_list_free)((nvf_handle_t)nvf);
    772 	ASSERT(RW_WRITE_HELD(&nvf->nvf_lock));
    773 }
    774 
    775 /*
    776  * Read a file in the nvlist format
    777  *	EIO - i/o error during read
    778  *	ENOENT - file not found
    779  *	EINVAL - file contents corrupted
    780  */
    781 static int
    782 fread_nvp_list(nvfd_t *nvfd)
    783 {
    784 	nvlist_t	*nvl;
    785 	nvpair_t	*nvp;
    786 	char		*name;
    787 	nvlist_t	*sublist;
    788 	int		rval;
    789 	int		rv;
    790 
    791 	ASSERT(RW_WRITE_HELD(&(nvfd->nvf_lock)));
    792 
    793 	rval = fread_nvlist(nvfd->nvf_cache_path, &nvl);
    794 	if (rval != 0)
    795 		return (rval);
    796 	ASSERT(nvl != NULL);
    797 
    798 	nvp = NULL;
    799 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
    800 		name = nvpair_name(nvp);
    801 		ASSERT(strlen(name) > 0);
    802 
    803 		switch (nvpair_type(nvp)) {
    804 		case DATA_TYPE_NVLIST:
    805 			rval = nvpair_value_nvlist(nvp, &sublist);
    806 			if (rval != 0) {
    807 				nvf_error(
    808 				    "nvpair_value_nvlist error %s %d\n",
    809 				    name, rval);
    810 				goto error;
    811 			}
    812 
    813 			/*
    814 			 * unpack nvlist for this device and
    815 			 * add elements to data list.
    816 			 */
    817 			ASSERT(RW_WRITE_HELD(&(nvfd->nvf_lock)));
    818 			rv = (nvfd->nvf_unpack_nvlist)
    819 			    ((nvf_handle_t)nvfd, sublist, name);
    820 			ASSERT(RW_WRITE_HELD(&(nvfd->nvf_lock)));
    821 			if (rv != 0) {
    822 				nvf_error(
    823 				    "%s: %s invalid list element\n",
    824 				    nvfd->nvf_cache_path, name);
    825 				rval = EINVAL;
    826 				goto error;
    827 			}
    828 			break;
    829 
    830 		default:
    831 			nvf_error("%s: %s unsupported data type %d\n",
    832 			    nvfd->nvf_cache_path, name, nvpair_type(nvp));
    833 			rval = EINVAL;
    834 			goto error;
    835 		}
    836 	}
    837 
    838 	nvlist_free(nvl);
    839 
    840 	return (0);
    841 
    842 error:
    843 	nvlist_free(nvl);
    844 	nvp_list_free(nvfd);
    845 	return (rval);
    846 }
    847 
    848 
    849 int
    850 nvf_read_file(nvf_handle_t nvf_handle)
    851 {
    852 	nvfd_t *nvfd = (nvfd_t *)nvf_handle;
    853 	int rval;
    854 
    855 	ASSERT(RW_WRITE_HELD(&nvfd->nvf_lock));
    856 
    857 	if (kfio_disable_read)
    858 		return (0);
    859 
    860 	KFDEBUG((CE_CONT, "reading %s\n", nvfd->nvf_cache_path));
    861 
    862 	rval = fread_nvp_list(nvfd);
    863 	if (rval) {
    864 		switch (rval) {
    865 		case EIO:
    866 			nvfd->nvf_flags |= NVF_F_REBUILD_MSG;
    867 			cmn_err(CE_WARN, "%s: I/O error",
    868 			    nvfd->nvf_cache_path);
    869 			break;
    870 		case ENOENT:
    871 			nvfd->nvf_flags |= NVF_F_CREATE_MSG;
    872 			nvf_error("%s: not found\n",
    873 			    nvfd->nvf_cache_path);
    874 			break;
    875 		case EINVAL:
    876 		default:
    877 			nvfd->nvf_flags |= NVF_F_REBUILD_MSG;
    878 			cmn_err(CE_WARN, "%s: data file corrupted",
    879 			    nvfd->nvf_cache_path);
    880 			break;
    881 		}
    882 	}
    883 	return (rval);
    884 }
    885 
    886 static void
    887 nvf_write_is_complete(nvfd_t *fd)
    888 {
    889 	if (fd->nvf_write_complete) {
    890 		(fd->nvf_write_complete)((nvf_handle_t)fd);
    891 	}
    892 }
    893 
/*
 * Callout scheduled by nvf_wake_daemon() and by this function.
 * If the flush deadline (nvpticks) is still more than 4 ticks away,
 * re-arm the callout for the remaining time.  Otherwise set
 * do_nvpflush, signal nvpflush_cv and mark the timer idle.
 */
/*ARGSUSED*/
static void
nvpflush_timeout(void *arg)
{
	clock_t nticks;

	mutex_enter(&nvpflush_lock);
	/* ticks left until the deadline last set by nvf_wake_daemon() */
	nticks = nvpticks - ddi_get_lbolt();
	if (nticks > 4) {
		nvpflush_timer_busy = 1;
		/* the lock is dropped before the callout is re-armed */
		mutex_exit(&nvpflush_lock);
		nvpflush_id = timeout(nvpflush_timeout, NULL, nticks);
	} else {
		do_nvpflush = 1;
		NVPDAEMON_DEBUG((CE_CONT, "signal nvpdaemon\n"));
		cv_signal(&nvpflush_cv);
		/* no callout outstanding any more */
		nvpflush_id = 0;
		nvpflush_timer_busy = 0;
		mutex_exit(&nvpflush_lock);
	}
}
    915 
/*
 * After marking a list as dirty, wake the nvpflush daemon
 * to perform the update.
 *
 * The daemon thread is created on first use.  Every call moves the
 * flush deadline to nvpflush_delay seconds from now; the timeout
 * callout is only scheduled here if none is already outstanding.
 */
void
nvf_wake_daemon(void)
{
	clock_t nticks;

	/*
	 * If the system isn't up yet or is shutting down,
	 * don't even think about starting a flush.
	 */
	if (!i_ddi_io_initialized() || sys_shutdown)
		return;

	mutex_enter(&nvpflush_lock);

	if (nvpflush_daemon_active == 0) {
		/* claim the start-up under the lock, create without it */
		nvpflush_daemon_active = 1;
		mutex_exit(&nvpflush_lock);
		NVPDAEMON_DEBUG((CE_CONT, "starting nvpdaemon thread\n"));
		nvpflush_thr_id = thread_create(NULL, 0,
		    (void (*)())nvpflush_daemon,
		    NULL, 0, &p0, TS_RUN, minclsyspri);
		mutex_enter(&nvpflush_lock);
	}

	/* push the deadline out; nvpflush_timeout() compares against it */
	nticks = nvpflush_delay * TICKS_PER_SECOND;
	nvpticks = ddi_get_lbolt() + nticks;
	if (nvpflush_timer_busy == 0) {
		nvpflush_timer_busy = 1;
		mutex_exit(&nvpflush_lock);
		nvpflush_id = timeout(nvpflush_timeout, NULL, nticks + 4);
	} else
		mutex_exit(&nvpflush_lock);
}
    953 
    954 static int
    955 nvpflush_one(nvfd_t *nvfd)
    956 {
    957 	int rval = DDI_SUCCESS;
    958 	nvlist_t *nvl;
    959 
    960 	rw_enter(&nvfd->nvf_lock, RW_READER);
    961 
    962 	ASSERT((nvfd->nvf_flags & NVF_F_FLUSHING) == 0);
    963 
    964 	if (!NVF_IS_DIRTY(nvfd) ||
    965 	    NVF_IS_READONLY(nvfd) || kfio_disable_write || sys_shutdown) {
    966 		NVF_CLEAR_DIRTY(nvfd);
    967 		rw_exit(&nvfd->nvf_lock);
    968 		return (DDI_SUCCESS);
    969 	}
    970 
    971 	if (rw_tryupgrade(&nvfd->nvf_lock) == 0) {
    972 		nvf_error("nvpflush: "
    973 		    "%s rw upgrade failed\n", nvfd->nvf_cache_path);
    974 		rw_exit(&nvfd->nvf_lock);
    975 		return (DDI_FAILURE);
    976 	}
    977 	if (((nvfd->nvf_pack_list)
    978 	    ((nvf_handle_t)nvfd, &nvl)) != DDI_SUCCESS) {
    979 		nvf_error("nvpflush: "
    980 		    "%s nvlist construction failed\n", nvfd->nvf_cache_path);
    981 		ASSERT(RW_WRITE_HELD(&nvfd->nvf_lock));
    982 		rw_exit(&nvfd->nvf_lock);
    983 		return (DDI_FAILURE);
    984 	}
    985 	ASSERT(RW_WRITE_HELD(&nvfd->nvf_lock));
    986 
    987 	NVF_CLEAR_DIRTY(nvfd);
    988 	nvfd->nvf_flags |= NVF_F_FLUSHING;
    989 	rw_exit(&nvfd->nvf_lock);
    990 
    991 	rval = e_fwrite_nvlist(nvfd, nvl);
    992 	nvlist_free(nvl);
    993 
    994 	rw_enter(&nvfd->nvf_lock, RW_WRITER);
    995 	nvfd->nvf_flags &= ~NVF_F_FLUSHING;
    996 	if (rval == DDI_FAILURE) {
    997 		if (NVF_IS_READONLY(nvfd)) {
    998 			rval = DDI_SUCCESS;
    999 			nvfd->nvf_flags &= ~(NVF_F_ERROR | NVF_F_DIRTY);
   1000 		} else if ((nvfd->nvf_flags & NVF_F_ERROR) == 0) {
   1001 			cmn_err(CE_CONT,
   1002 			    "%s: update failed\n", nvfd->nvf_cache_path);
   1003 			nvfd->nvf_flags |= NVF_F_ERROR | NVF_F_DIRTY;
   1004 		}
   1005 	} else {
   1006 		if (nvfd->nvf_flags & NVF_F_CREATE_MSG) {
   1007 			cmn_err(CE_CONT,
   1008 			    "!Creating %s\n", nvfd->nvf_cache_path);
   1009 			nvfd->nvf_flags &= ~NVF_F_CREATE_MSG;
   1010 		}
   1011 		if (nvfd->nvf_flags & NVF_F_REBUILD_MSG) {
   1012 			cmn_err(CE_CONT,
   1013 			    "!Rebuilding %s\n", nvfd->nvf_cache_path);
   1014 			nvfd->nvf_flags &= ~NVF_F_REBUILD_MSG;
   1015 		}
   1016 		if (nvfd->nvf_flags & NVF_F_ERROR) {
   1017 			cmn_err(CE_CONT,
   1018 			    "%s: update now ok\n", nvfd->nvf_cache_path);
   1019 			nvfd->nvf_flags &= ~NVF_F_ERROR;
   1020 		}
   1021 		/*
   1022 		 * The file may need to be flushed again if the cached
   1023 		 * data was touched while writing the earlier contents.
   1024 		 */
   1025 		if (NVF_IS_DIRTY(nvfd))
   1026 			rval = DDI_FAILURE;
   1027 	}
   1028 
   1029 	rw_exit(&nvfd->nvf_lock);
   1030 	return (rval);
   1031 }
   1032 
   1033 
/*
 * nvpflush_daemon - body of the nvp flush daemon thread.
 *
 * Each pass of the outer loop:
 *   1. Sleeps on nvpflush_cv, inside a CPR-safe section, until
 *      do_nvpflush is set.  The thread exits instead if sys_shutdown is
 *      set, or if a wait returns -1 while no flush has been requested
 *      and nvpflush_timer_busy is clear.
 *   2. Moves every dirty file from nvf_cache_files to nvf_dirty_files
 *      while holding nvf_cache_mutex.
 *   3. Flushes each file on the dirty list with nvpflush_one(), without
 *      holding nvf_cache_mutex across the write.  Files that end up
 *      clean are moved back to nvf_cache_files; files that failed or
 *      were dirtied again stay on the dirty list.
 *   4. Calls nvf_wake_daemon() if any file still needs flushing.
 *
 * This function does not return; the only way out is thread_exit().
 */
static void
nvpflush_daemon(void)
{
	callb_cpr_t cprinfo;
	nvfd_t *nvfdp, *nextfdp;
	clock_t clk;
	int rval;
	int want_wakeup;
	int is_now_clean;

	ASSERT(modrootloaded);

	nvpflush_thread = curthread;
	NVPDAEMON_DEBUG((CE_CONT, "nvpdaemon: init\n"));

	CALLB_CPR_INIT(&cprinfo, &nvpflush_lock, callb_generic_cpr, "nvp");
	mutex_enter(&nvpflush_lock);
	for (;;) {
		/* Idle wait for work; nvpflush_lock is held on entry. */
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		while (do_nvpflush == 0) {
			clk = cv_reltimedwait(&nvpflush_cv, &nvpflush_lock,
			    (nvpdaemon_idle_time * TICKS_PER_SECOND),
			    TR_CLOCK_TICK);
			/*
			 * Exit when the wait returned -1 with nothing
			 * requested and no flush timer outstanding, or
			 * unconditionally when the system is shutting down.
			 */
			if ((clk == -1 && do_nvpflush == 0 &&
			    nvpflush_timer_busy == 0) || sys_shutdown) {
				/*
				 * Note that CALLB_CPR_EXIT calls mutex_exit()
				 * on the lock passed in to CALLB_CPR_INIT,
				 * so the lock must be held when invoking it.
				 */
				CALLB_CPR_SAFE_END(&cprinfo, &nvpflush_lock);
				NVPDAEMON_DEBUG((CE_CONT, "nvpdaemon: exit\n"));
				ASSERT(mutex_owned(&nvpflush_lock));
				/*
				 * Mark the daemon as gone while the lock is
				 * still held.
				 */
				nvpflush_thr_id = NULL;
				nvpflush_daemon_active = 0;
				CALLB_CPR_EXIT(&cprinfo);
				thread_exit();
			}
		}
		CALLB_CPR_SAFE_END(&cprinfo, &nvpflush_lock);

		/*
		 * Consume the flush request before dropping the lock, so a
		 * request posted during the flush below is seen as a new
		 * one on the next pass.
		 */
		nvpbusy = 1;
		want_wakeup = 0;
		do_nvpflush = 0;
		mutex_exit(&nvpflush_lock);

		/*
		 * Try flushing what's dirty, reschedule if there's
		 * a failure or data gets marked as dirty again.
		 * First move each file marked dirty to the dirty
		 * list to avoid locking the list across the write.
		 */
		mutex_enter(&nvf_cache_mutex);
		for (nvfdp = list_head(&nvf_cache_files);
		    nvfdp; nvfdp = nextfdp) {
			/* Fetch the successor first; nvfdp may be unlinked. */
			nextfdp = list_next(&nvf_cache_files, nvfdp);
			rw_enter(&nvfdp->nvf_lock, RW_READER);
			if (NVF_IS_DIRTY(nvfdp)) {
				list_remove(&nvf_cache_files, nvfdp);
				list_insert_tail(&nvf_dirty_files, nvfdp);
				rw_exit(&nvfdp->nvf_lock);
			} else {
				NVPDAEMON_DEBUG((CE_CONT,
				    "nvpdaemon: not dirty %s\n",
				    nvfdp->nvf_cache_path));
				rw_exit(&nvfdp->nvf_lock);
			}
		}
		mutex_exit(&nvf_cache_mutex);

		/*
		 * Now go through the dirty list
		 */
		for (nvfdp = list_head(&nvf_dirty_files);
		    nvfdp; nvfdp = nextfdp) {
			/* Fetch the successor first; nvfdp may be unlinked. */
			nextfdp = list_next(&nvf_dirty_files, nvfdp);

			is_now_clean = 0;
			rw_enter(&nvfdp->nvf_lock, RW_READER);
			if (NVF_IS_DIRTY(nvfdp)) {
				NVPDAEMON_DEBUG((CE_CONT,
				    "nvpdaemon: flush %s\n",
				    nvfdp->nvf_cache_path));
				/*
				 * nvpflush_one() takes nvf_lock itself, so
				 * release it around the call.
				 */
				rw_exit(&nvfdp->nvf_lock);
				rval = nvpflush_one(nvfdp);
				rw_enter(&nvfdp->nvf_lock, RW_READER);
				if (rval != DDI_SUCCESS ||
				    NVF_IS_DIRTY(nvfdp)) {
					/*
					 * Flush failed or the file was
					 * dirtied again: leave it on the
					 * dirty list and ask for another
					 * pass.
					 */
					rw_exit(&nvfdp->nvf_lock);
					NVPDAEMON_DEBUG((CE_CONT,
					    "nvpdaemon: %s dirty again\n",
					    nvfdp->nvf_cache_path));
					want_wakeup = 1;
				} else {
					rw_exit(&nvfdp->nvf_lock);
					nvf_write_is_complete(nvfdp);
					is_now_clean = 1;
				}
			} else {
				NVPDAEMON_DEBUG((CE_CONT,
				    "nvpdaemon: not dirty %s\n",
				    nvfdp->nvf_cache_path));
				rw_exit(&nvfdp->nvf_lock);
				is_now_clean = 1;
			}

			/* Clean files go back on the main cache file list. */
			if (is_now_clean) {
				mutex_enter(&nvf_cache_mutex);
				list_remove(&nvf_dirty_files, nvfdp);
				list_insert_tail(&nvf_cache_files,
				    nvfdp);
				mutex_exit(&nvf_cache_mutex);
			}
		}

		if (want_wakeup)
			nvf_wake_daemon();

		/* Retake the lock for the wait at the top of the loop. */
		mutex_enter(&nvpflush_lock);
		nvpbusy = 0;
	}
}
   1156