Home | History | Annotate | Download | only in zfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 #pragma ident	"@(#)spa.c	1.46	08/01/02 SMI"
     28 
     29 /*
     30  * This file contains all the routines used when modifying on-disk SPA state.
     31  * This includes opening, importing, destroying, exporting a pool, and syncing a
     32  * pool.
     33  */
     34 
     35 #include <sys/zfs_context.h>
     36 #include <sys/fm/fs/zfs.h>
     37 #include <sys/spa_impl.h>
     38 #include <sys/zio.h>
     39 #include <sys/zio_checksum.h>
     40 #include <sys/zio_compress.h>
     41 #include <sys/dmu.h>
     42 #include <sys/dmu_tx.h>
     43 #include <sys/zap.h>
     44 #include <sys/zil.h>
     45 #include <sys/vdev_impl.h>
     46 #include <sys/metaslab.h>
     47 #include <sys/uberblock_impl.h>
     48 #include <sys/txg.h>
     49 #include <sys/avl.h>
     50 #include <sys/dmu_traverse.h>
     51 #include <sys/dmu_objset.h>
     52 #include <sys/unique.h>
     53 #include <sys/dsl_pool.h>
     54 #include <sys/dsl_dataset.h>
     55 #include <sys/dsl_dir.h>
     56 #include <sys/dsl_prop.h>
     57 #include <sys/dsl_synctask.h>
     58 #include <sys/fs/zfs.h>
     59 #include <sys/arc.h>
     60 #include <sys/callb.h>
     61 #include <sys/systeminfo.h>
     62 #include <sys/sunddi.h>
     63 
     64 #include "zfs_prop.h"
     65 
/*
 * Thread count passed to taskq_create() for each per-I/O-type zio issue
 * and interrupt taskq set up in spa_activate().
 */
int zio_taskq_threads = 8;

/*
 * Forward declaration: spa_prop_set() hands this routine to
 * dsl_sync_task_do() as the sync callback (presumably defined later in
 * this file -- not visible from here).
 */
static void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx);
     69 
     70 /*
     71  * ==========================================================================
     72  * SPA properties routines
     73  * ==========================================================================
     74  */
     75 
     76 /*
     77  * Add a (source=src, propname=propval) list to an nvlist.
     78  */
     79 static int
     80 spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
     81     uint64_t intval, zprop_source_t src)
     82 {
     83 	const char *propname = zpool_prop_to_name(prop);
     84 	nvlist_t *propval;
     85 	int err = 0;
     86 
     87 	if (err = nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP))
     88 		return (err);
     89 
     90 	if (err = nvlist_add_uint64(propval, ZPROP_SOURCE, src))
     91 		goto out;
     92 
     93 	if (strval != NULL) {
     94 		if (err = nvlist_add_string(propval, ZPROP_VALUE, strval))
     95 			goto out;
     96 	} else {
     97 		if (err = nvlist_add_uint64(propval, ZPROP_VALUE, intval))
     98 			goto out;
     99 	}
    100 
    101 	err = nvlist_add_nvlist(nvl, propname, propval);
    102 out:
    103 	nvlist_free(propval);
    104 	return (err);
    105 }
    106 
    107 /*
    108  * Get property values from the spa configuration.
    109  */
    110 static int
    111 spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
    112 {
    113 	uint64_t size = spa_get_space(spa);
    114 	uint64_t used = spa_get_alloc(spa);
    115 	uint64_t cap, version;
    116 	zprop_source_t src = ZPROP_SRC_NONE;
    117 	int err;
    118 	char *cachefile;
    119 	size_t len;
    120 
    121 	/*
    122 	 * readonly properties
    123 	 */
    124 	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa->spa_name,
    125 	    0, src))
    126 		return (err);
    127 
    128 	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src))
    129 		return (err);
    130 
    131 	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_USED, NULL, used, src))
    132 		return (err);
    133 
    134 	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_AVAILABLE, NULL,
    135 	    size - used, src))
    136 		return (err);
    137 
    138 	cap = (size == 0) ? 0 : (used * 100 / size);
    139 	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src))
    140 		return (err);
    141 
    142 	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL,
    143 	    spa_guid(spa), src))
    144 		return (err);
    145 
    146 	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
    147 	    spa->spa_root_vdev->vdev_state, src))
    148 		return (err);
    149 
    150 	/*
    151 	 * settable properties that are not stored in the pool property object.
    152 	 */
    153 	version = spa_version(spa);
    154 	if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
    155 		src = ZPROP_SRC_DEFAULT;
    156 	else
    157 		src = ZPROP_SRC_LOCAL;
    158 	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL,
    159 	    version, src))
    160 		return (err);
    161 
    162 	if (spa->spa_root != NULL) {
    163 		src = ZPROP_SRC_LOCAL;
    164 		if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT,
    165 		    spa->spa_root, 0, src))
    166 			return (err);
    167 	}
    168 
    169 	if (spa->spa_config_dir != NULL) {
    170 		if (strcmp(spa->spa_config_dir, "none") == 0) {
    171 			err = spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
    172 			    spa->spa_config_dir, 0, ZPROP_SRC_LOCAL);
    173 		} else {
    174 			len = strlen(spa->spa_config_dir) +
    175 			    strlen(spa->spa_config_file) + 2;
    176 			cachefile = kmem_alloc(len, KM_SLEEP);
    177 			(void) snprintf(cachefile, len, "%s/%s",
    178 			    spa->spa_config_dir, spa->spa_config_file);
    179 			err = spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
    180 			    cachefile, 0, ZPROP_SRC_LOCAL);
    181 			kmem_free(cachefile, len);
    182 		}
    183 
    184 		if (err)
    185 			return (err);
    186 	}
    187 
    188 	return (0);
    189 }
    190 
    191 /*
    192  * Get zpool property values.
    193  */
    194 int
    195 spa_prop_get(spa_t *spa, nvlist_t **nvp)
    196 {
    197 	zap_cursor_t zc;
    198 	zap_attribute_t za;
    199 	objset_t *mos = spa->spa_meta_objset;
    200 	int err;
    201 
    202 	if (err = nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP))
    203 		return (err);
    204 
    205 	/*
    206 	 * Get properties from the spa config.
    207 	 */
    208 	if (err = spa_prop_get_config(spa, nvp))
    209 		goto out;
    210 
    211 	mutex_enter(&spa->spa_props_lock);
    212 	/* If no pool property object, no more prop to get. */
    213 	if (spa->spa_pool_props_object == 0) {
    214 		mutex_exit(&spa->spa_props_lock);
    215 		return (0);
    216 	}
    217 
    218 	/*
    219 	 * Get properties from the MOS pool property object.
    220 	 */
    221 	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
    222 	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
    223 	    zap_cursor_advance(&zc)) {
    224 		uint64_t intval = 0;
    225 		char *strval = NULL;
    226 		zprop_source_t src = ZPROP_SRC_DEFAULT;
    227 		zpool_prop_t prop;
    228 
    229 		if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
    230 			continue;
    231 
    232 		switch (za.za_integer_length) {
    233 		case 8:
    234 			/* integer property */
    235 			if (za.za_first_integer !=
    236 			    zpool_prop_default_numeric(prop))
    237 				src = ZPROP_SRC_LOCAL;
    238 
    239 			if (prop == ZPOOL_PROP_BOOTFS) {
    240 				dsl_pool_t *dp;
    241 				dsl_dataset_t *ds = NULL;
    242 
    243 				dp = spa_get_dsl(spa);
    244 				rw_enter(&dp->dp_config_rwlock, RW_READER);
    245 				if (err = dsl_dataset_open_obj(dp,
    246 				    za.za_first_integer, NULL, DS_MODE_NONE,
    247 				    FTAG, &ds)) {
    248 					rw_exit(&dp->dp_config_rwlock);
    249 					break;
    250 				}
    251 
    252 				strval = kmem_alloc(
    253 				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
    254 				    KM_SLEEP);
    255 				dsl_dataset_name(ds, strval);
    256 				dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
    257 				rw_exit(&dp->dp_config_rwlock);
    258 			} else {
    259 				strval = NULL;
    260 				intval = za.za_first_integer;
    261 			}
    262 
    263 			err = spa_prop_add_list(*nvp, prop, strval,
    264 			    intval, src);
    265 
    266 			if (strval != NULL)
    267 				kmem_free(strval,
    268 				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);
    269 
    270 			break;
    271 
    272 		case 1:
    273 			/* string property */
    274 			strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
    275 			err = zap_lookup(mos, spa->spa_pool_props_object,
    276 			    za.za_name, 1, za.za_num_integers, strval);
    277 			if (err) {
    278 				kmem_free(strval, za.za_num_integers);
    279 				break;
    280 			}
    281 			err = spa_prop_add_list(*nvp, prop, strval, 0, src);
    282 			kmem_free(strval, za.za_num_integers);
    283 			break;
    284 
    285 		default:
    286 			break;
    287 		}
    288 	}
    289 	zap_cursor_fini(&zc);
    290 	mutex_exit(&spa->spa_props_lock);
    291 out:
    292 	if (err && err != ENOENT) {
    293 		nvlist_free(*nvp);
    294 		return (err);
    295 	}
    296 
    297 	return (0);
    298 }
    299 
    300 /*
    301  * Validate the given pool properties nvlist and modify the list
    302  * for the property values to be set.
    303  */
    304 static int
    305 spa_prop_validate(spa_t *spa, nvlist_t *props)
    306 {
    307 	nvpair_t *elem;
    308 	int error = 0, reset_bootfs = 0;
    309 	uint64_t objnum;
    310 
    311 	elem = NULL;
    312 	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
    313 		zpool_prop_t prop;
    314 		char *propname, *strval;
    315 		uint64_t intval;
    316 		vdev_t *rvdev;
    317 		char *vdev_type;
    318 		objset_t *os;
    319 		char *slash;
    320 
    321 		propname = nvpair_name(elem);
    322 
    323 		if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL)
    324 			return (EINVAL);
    325 
    326 		switch (prop) {
    327 		case ZPOOL_PROP_VERSION:
    328 			error = nvpair_value_uint64(elem, &intval);
    329 			if (!error &&
    330 			    (intval < spa_version(spa) || intval > SPA_VERSION))
    331 				error = EINVAL;
    332 			break;
    333 
    334 		case ZPOOL_PROP_DELEGATION:
    335 		case ZPOOL_PROP_AUTOREPLACE:
    336 			error = nvpair_value_uint64(elem, &intval);
    337 			if (!error && intval > 1)
    338 				error = EINVAL;
    339 			break;
    340 
    341 		case ZPOOL_PROP_BOOTFS:
    342 			if (spa_version(spa) < SPA_VERSION_BOOTFS) {
    343 				error = ENOTSUP;
    344 				break;
    345 			}
    346 
    347 			/*
    348 			 * A bootable filesystem can not be on a RAIDZ pool
    349 			 * nor a striped pool with more than 1 device.
    350 			 */
    351 			rvdev = spa->spa_root_vdev;
    352 			vdev_type =
    353 			    rvdev->vdev_child[0]->vdev_ops->vdev_op_type;
    354 			if (rvdev->vdev_children > 1 ||
    355 			    strcmp(vdev_type, VDEV_TYPE_RAIDZ) == 0 ||
    356 			    strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) {
    357 				error = ENOTSUP;
    358 				break;
    359 			}
    360 
    361 			reset_bootfs = 1;
    362 
    363 			error = nvpair_value_string(elem, &strval);
    364 
    365 			if (!error) {
    366 				if (strval == NULL || strval[0] == '\0') {
    367 					objnum = zpool_prop_default_numeric(
    368 					    ZPOOL_PROP_BOOTFS);
    369 					break;
    370 				}
    371 
    372 				if (error = dmu_objset_open(strval, DMU_OST_ZFS,
    373 				    DS_MODE_STANDARD | DS_MODE_READONLY, &os))
    374 					break;
    375 				objnum = dmu_objset_id(os);
    376 				dmu_objset_close(os);
    377 			}
    378 			break;
    379 		case ZPOOL_PROP_FAILUREMODE:
    380 			error = nvpair_value_uint64(elem, &intval);
    381 			if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
    382 			    intval > ZIO_FAILURE_MODE_PANIC))
    383 				error = EINVAL;
    384 
    385 			/*
    386 			 * This is a special case which only occurs when
    387 			 * the pool has completely failed. This allows
    388 			 * the user to change the in-core failmode property
    389 			 * without syncing it out to disk (I/Os might
    390 			 * currently be blocked). We do this by returning
    391 			 * EIO to the caller (spa_prop_set) to trick it
    392 			 * into thinking we encountered a property validation
    393 			 * error.
    394 			 */
    395 			if (!error && spa_state(spa) == POOL_STATE_IO_FAILURE) {
    396 				spa->spa_failmode = intval;
    397 				error = EIO;
    398 			}
    399 			break;
    400 
    401 		case ZPOOL_PROP_CACHEFILE:
    402 			if ((error = nvpair_value_string(elem, &strval)) != 0)
    403 				break;
    404 
    405 			if (strval[0] == '\0')
    406 				break;
    407 
    408 			if (strcmp(strval, "none") == 0)
    409 				break;
    410 
    411 			if (strval[0] != '/') {
    412 				error = EINVAL;
    413 				break;
    414 			}
    415 
    416 			slash = strrchr(strval, '/');
    417 			ASSERT(slash != NULL);
    418 
    419 			if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
    420 			    strcmp(slash, "/..") == 0)
    421 				error = EINVAL;
    422 			break;
    423 		}
    424 
    425 		if (error)
    426 			break;
    427 	}
    428 
    429 	if (!error && reset_bootfs) {
    430 		error = nvlist_remove(props,
    431 		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);
    432 
    433 		if (!error) {
    434 			error = nvlist_add_uint64(props,
    435 			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
    436 		}
    437 	}
    438 
    439 	return (error);
    440 }
    441 
    442 int
    443 spa_prop_set(spa_t *spa, nvlist_t *nvp)
    444 {
    445 	int error;
    446 
    447 	if ((error = spa_prop_validate(spa, nvp)) != 0)
    448 		return (error);
    449 
    450 	return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
    451 	    spa, nvp, 3));
    452 }
    453 
    454 /*
    455  * If the bootfs property value is dsobj, clear it.
    456  */
    457 void
    458 spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
    459 {
    460 	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
    461 		VERIFY(zap_remove(spa->spa_meta_objset,
    462 		    spa->spa_pool_props_object,
    463 		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
    464 		spa->spa_bootfs = 0;
    465 	}
    466 }
    467 
    468 /*
    469  * ==========================================================================
    470  * SPA state manipulation (open/create/destroy/import/export)
    471  * ==========================================================================
    472  */
    473 
    474 static int
    475 spa_error_entry_compare(const void *a, const void *b)
    476 {
    477 	spa_error_entry_t *sa = (spa_error_entry_t *)a;
    478 	spa_error_entry_t *sb = (spa_error_entry_t *)b;
    479 	int ret;
    480 
    481 	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
    482 	    sizeof (zbookmark_t));
    483 
    484 	if (ret < 0)
    485 		return (-1);
    486 	else if (ret > 0)
    487 		return (1);
    488 	else
    489 		return (0);
    490 }
    491 
    492 /*
    493  * Utility function which retrieves copies of the current logs and
    494  * re-initializes them in the process.
    495  */
    496 void
    497 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
    498 {
    499 	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));
    500 
    501 	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
    502 	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));
    503 
    504 	avl_create(&spa->spa_errlist_scrub,
    505 	    spa_error_entry_compare, sizeof (spa_error_entry_t),
    506 	    offsetof(spa_error_entry_t, se_avl));
    507 	avl_create(&spa->spa_errlist_last,
    508 	    spa_error_entry_compare, sizeof (spa_error_entry_t),
    509 	    offsetof(spa_error_entry_t, se_avl));
    510 }
    511 
    512 /*
    513  * Activate an uninitialized pool.
    514  */
    515 static void
    516 spa_activate(spa_t *spa)
    517 {
    518 	int t;
    519 
    520 	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
    521 
    522 	spa->spa_state = POOL_STATE_ACTIVE;
    523 
    524 	spa->spa_normal_class = metaslab_class_create();
    525 	spa->spa_log_class = metaslab_class_create();
    526 
    527 	for (t = 0; t < ZIO_TYPES; t++) {
    528 		spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue",
    529 		    zio_taskq_threads, maxclsyspri, 50, INT_MAX,
    530 		    TASKQ_PREPOPULATE);
    531 		spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr",
    532 		    zio_taskq_threads, maxclsyspri, 50, INT_MAX,
    533 		    TASKQ_PREPOPULATE);
    534 	}
    535 
    536 	list_create(&spa->spa_dirty_list, sizeof (vdev_t),
    537 	    offsetof(vdev_t, vdev_dirty_node));
    538 	list_create(&spa->spa_zio_list, sizeof (zio_t),
    539 	    offsetof(zio_t, zio_link_node));
    540 
    541 	txg_list_create(&spa->spa_vdev_txg_list,
    542 	    offsetof(struct vdev, vdev_txg_node));
    543 
    544 	avl_create(&spa->spa_errlist_scrub,
    545 	    spa_error_entry_compare, sizeof (spa_error_entry_t),
    546 	    offsetof(spa_error_entry_t, se_avl));
    547 	avl_create(&spa->spa_errlist_last,
    548 	    spa_error_entry_compare, sizeof (spa_error_entry_t),
    549 	    offsetof(spa_error_entry_t, se_avl));
    550 }
    551 
    552 /*
    553  * Opposite of spa_activate().
    554  */
    555 static void
    556 spa_deactivate(spa_t *spa)
    557 {
    558 	int t;
    559 
    560 	ASSERT(spa->spa_sync_on == B_FALSE);
    561 	ASSERT(spa->spa_dsl_pool == NULL);
    562 	ASSERT(spa->spa_root_vdev == NULL);
    563 
    564 	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);
    565 
    566 	txg_list_destroy(&spa->spa_vdev_txg_list);
    567 
    568 	list_destroy(&spa->spa_dirty_list);
    569 	list_destroy(&spa->spa_zio_list);
    570 
    571 	for (t = 0; t < ZIO_TYPES; t++) {
    572 		taskq_destroy(spa->spa_zio_issue_taskq[t]);
    573 		taskq_destroy(spa->spa_zio_intr_taskq[t]);
    574 		spa->spa_zio_issue_taskq[t] = NULL;
    575 		spa->spa_zio_intr_taskq[t] = NULL;
    576 	}
    577 
    578 	metaslab_class_destroy(spa->spa_normal_class);
    579 	spa->spa_normal_class = NULL;
    580 
    581 	metaslab_class_destroy(spa->spa_log_class);
    582 	spa->spa_log_class = NULL;
    583 
    584 	/*
    585 	 * If this was part of an import or the open otherwise failed, we may
    586 	 * still have errors left in the queues.  Empty them just in case.
    587 	 */
    588 	spa_errlog_drain(spa);
    589 
    590 	avl_destroy(&spa->spa_errlist_scrub);
    591 	avl_destroy(&spa->spa_errlist_last);
    592 
    593 	spa->spa_state = POOL_STATE_UNINITIALIZED;
    594 }
    595 
    596 /*
    597  * Verify a pool configuration, and construct the vdev tree appropriately.  This
    598  * will create all the necessary vdevs in the appropriate layout, with each vdev
    599  * in the CLOSED state.  This will prep the pool before open/creation/import.
    600  * All vdev validation is done by the vdev_alloc() routine.
    601  */
    602 static int
    603 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
    604     uint_t id, int atype)
    605 {
    606 	nvlist_t **child;
    607 	uint_t c, children;
    608 	int error;
    609 
    610 	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
    611 		return (error);
    612 
    613 	if ((*vdp)->vdev_ops->vdev_op_leaf)
    614 		return (0);
    615 
    616 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
    617 	    &child, &children) != 0) {
    618 		vdev_free(*vdp);
    619 		*vdp = NULL;
    620 		return (EINVAL);
    621 	}
    622 
    623 	for (c = 0; c < children; c++) {
    624 		vdev_t *vd;
    625 		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
    626 		    atype)) != 0) {
    627 			vdev_free(*vdp);
    628 			*vdp = NULL;
    629 			return (error);
    630 		}
    631 	}
    632 
    633 	ASSERT(*vdp != NULL);
    634 
    635 	return (0);
    636 }
    637 
    638 /*
    639  * Opposite of spa_load().
    640  */
    641 static void
    642 spa_unload(spa_t *spa)
    643 {
    644 	int i;
    645 
    646 	/*
    647 	 * Stop async tasks.
    648 	 */
    649 	spa_async_suspend(spa);
    650 
    651 	/*
    652 	 * Stop syncing.
    653 	 */
    654 	if (spa->spa_sync_on) {
    655 		txg_sync_stop(spa->spa_dsl_pool);
    656 		spa->spa_sync_on = B_FALSE;
    657 	}
    658 
    659 	/*
    660 	 * Wait for any outstanding prefetch I/O to complete.
    661 	 */
    662 	spa_config_enter(spa, RW_WRITER, FTAG);
    663 	spa_config_exit(spa, FTAG);
    664 
    665 	/*
    666 	 * Drop and purge level 2 cache
    667 	 */
    668 	spa_l2cache_drop(spa);
    669 
    670 	/*
    671 	 * Close the dsl pool.
    672 	 */
    673 	if (spa->spa_dsl_pool) {
    674 		dsl_pool_close(spa->spa_dsl_pool);
    675 		spa->spa_dsl_pool = NULL;
    676 	}
    677 
    678 	/*
    679 	 * Close all vdevs.
    680 	 */
    681 	if (spa->spa_root_vdev)
    682 		vdev_free(spa->spa_root_vdev);
    683 	ASSERT(spa->spa_root_vdev == NULL);
    684 
    685 	for (i = 0; i < spa->spa_spares.sav_count; i++)
    686 		vdev_free(spa->spa_spares.sav_vdevs[i]);
    687 	if (spa->spa_spares.sav_vdevs) {
    688 		kmem_free(spa->spa_spares.sav_vdevs,
    689 		    spa->spa_spares.sav_count * sizeof (void *));
    690 		spa->spa_spares.sav_vdevs = NULL;
    691 	}
    692 	if (spa->spa_spares.sav_config) {
    693 		nvlist_free(spa->spa_spares.sav_config);
    694 		spa->spa_spares.sav_config = NULL;
    695 	}
    696 
    697 	for (i = 0; i < spa->spa_l2cache.sav_count; i++)
    698 		vdev_free(spa->spa_l2cache.sav_vdevs[i]);
    699 	if (spa->spa_l2cache.sav_vdevs) {
    700 		kmem_free(spa->spa_l2cache.sav_vdevs,
    701 		    spa->spa_l2cache.sav_count * sizeof (void *));
    702 		spa->spa_l2cache.sav_vdevs = NULL;
    703 	}
    704 	if (spa->spa_l2cache.sav_config) {
    705 		nvlist_free(spa->spa_l2cache.sav_config);
    706 		spa->spa_l2cache.sav_config = NULL;
    707 	}
    708 
    709 	spa->spa_async_suspended = 0;
    710 }
    711 
    712 /*
    713  * Load (or re-load) the current list of vdevs describing the active spares for
    714  * this pool.  When this is called, we have some form of basic information in
    715  * 'spa_spares.sav_config'.  We parse this into vdevs, try to open them, and
    716  * then re-generate a more complete list including status information.
    717  */
    718 static void
    719 spa_load_spares(spa_t *spa)
    720 {
    721 	nvlist_t **spares;
    722 	uint_t nspares;
    723 	int i;
    724 	vdev_t *vd, *tvd;
    725 
    726 	/*
    727 	 * First, close and free any existing spare vdevs.
    728 	 */
    729 	for (i = 0; i < spa->spa_spares.sav_count; i++) {
    730 		vd = spa->spa_spares.sav_vdevs[i];
    731 
    732 		/* Undo the call to spa_activate() below */
    733 		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL &&
    734 		    tvd->vdev_isspare)
    735 			spa_spare_remove(tvd);
    736 		vdev_close(vd);
    737 		vdev_free(vd);
    738 	}
    739 
    740 	if (spa->spa_spares.sav_vdevs)
    741 		kmem_free(spa->spa_spares.sav_vdevs,
    742 		    spa->spa_spares.sav_count * sizeof (void *));
    743 
    744 	if (spa->spa_spares.sav_config == NULL)
    745 		nspares = 0;
    746 	else
    747 		VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
    748 		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
    749 
    750 	spa->spa_spares.sav_count = (int)nspares;
    751 	spa->spa_spares.sav_vdevs = NULL;
    752 
    753 	if (nspares == 0)
    754 		return;
    755 
    756 	/*
    757 	 * Construct the array of vdevs, opening them to get status in the
    758 	 * process.   For each spare, there is potentially two different vdev_t
    759 	 * structures associated with it: one in the list of spares (used only
    760 	 * for basic validation purposes) and one in the active vdev
    761 	 * configuration (if it's spared in).  During this phase we open and
    762 	 * validate each vdev on the spare list.  If the vdev also exists in the
    763 	 * active configuration, then we also mark this vdev as an active spare.
    764 	 */
    765 	spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
    766 	    KM_SLEEP);
    767 	for (i = 0; i < spa->spa_spares.sav_count; i++) {
    768 		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
    769 		    VDEV_ALLOC_SPARE) == 0);
    770 		ASSERT(vd != NULL);
    771 
    772 		spa->spa_spares.sav_vdevs[i] = vd;
    773 
    774 		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL) {
    775 			if (!tvd->vdev_isspare)
    776 				spa_spare_add(tvd);
    777 
    778 			/*
    779 			 * We only mark the spare active if we were successfully
    780 			 * able to load the vdev.  Otherwise, importing a pool
    781 			 * with a bad active spare would result in strange
    782 			 * behavior, because multiple pool would think the spare
    783 			 * is actively in use.
    784 			 *
    785 			 * There is a vulnerability here to an equally bizarre
    786 			 * circumstance, where a dead active spare is later
    787 			 * brought back to life (onlined or otherwise).  Given
    788 			 * the rarity of this scenario, and the extra complexity
    789 			 * it adds, we ignore the possibility.
    790 			 */
    791 			if (!vdev_is_dead(tvd))
    792 				spa_spare_activate(tvd);
    793 		}
    794 
    795 		if (vdev_open(vd) != 0)
    796 			continue;
    797 
    798 		vd->vdev_top = vd;
    799 		if (vdev_validate_aux(vd) == 0)
    800 			spa_spare_add(vd);
    801 	}
    802 
    803 	/*
    804 	 * Recompute the stashed list of spares, with status information
    805 	 * this time.
    806 	 */
    807 	VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
    808 	    DATA_TYPE_NVLIST_ARRAY) == 0);
    809 
    810 	spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
    811 	    KM_SLEEP);
    812 	for (i = 0; i < spa->spa_spares.sav_count; i++)
    813 		spares[i] = vdev_config_generate(spa,
    814 		    spa->spa_spares.sav_vdevs[i], B_TRUE, B_TRUE, B_FALSE);
    815 	VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
    816 	    ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
    817 	for (i = 0; i < spa->spa_spares.sav_count; i++)
    818 		nvlist_free(spares[i]);
    819 	kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
    820 }
    821 
    822 /*
    823  * Load (or re-load) the current list of vdevs describing the active l2cache for
    824  * this pool.  When this is called, we have some form of basic information in
    825  * 'spa_l2cache.sav_config'.  We parse this into vdevs, try to open them, and
    826  * then re-generate a more complete list including status information.
    827  * Devices which are already active have their details maintained, and are
    828  * not re-opened.
    829  */
    830 static void
    831 spa_load_l2cache(spa_t *spa)
    832 {
    833 	nvlist_t **l2cache;
    834 	uint_t nl2cache;
    835 	int i, j, oldnvdevs;
    836 	uint64_t guid;
    837 	vdev_t *vd, **oldvdevs, **newvdevs;
    838 	spa_aux_vdev_t *sav = &spa->spa_l2cache;
    839 
    840 	if (sav->sav_config != NULL) {
    841 		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
    842 		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
    843 		newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
    844 	} else {
    845 		nl2cache = 0;
    846 	}
    847 
    848 	oldvdevs = sav->sav_vdevs;
    849 	oldnvdevs = sav->sav_count;
    850 	sav->sav_vdevs = NULL;
    851 	sav->sav_count = 0;
    852 
    853 	/*
    854 	 * Process new nvlist of vdevs.
    855 	 */
    856 	for (i = 0; i < nl2cache; i++) {
    857 		VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
    858 		    &guid) == 0);
    859 
    860 		newvdevs[i] = NULL;
    861 		for (j = 0; j < oldnvdevs; j++) {
    862 			vd = oldvdevs[j];
    863 			if (vd != NULL && guid == vd->vdev_guid) {
    864 				/*
    865 				 * Retain previous vdev for add/remove ops.
    866 				 */
    867 				newvdevs[i] = vd;
    868 				oldvdevs[j] = NULL;
    869 				break;
    870 			}
    871 		}
    872 
    873 		if (newvdevs[i] == NULL) {
    874 			/*
    875 			 * Create new vdev
    876 			 */
    877 			VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
    878 			    VDEV_ALLOC_L2CACHE) == 0);
    879 			ASSERT(vd != NULL);
    880 			newvdevs[i] = vd;
    881 
    882 			/*
    883 			 * Commit this vdev as an l2cache device,
    884 			 * even if it fails to open.
    885 			 */
    886 			spa_l2cache_add(vd);
    887 
    888 			if (vdev_open(vd) != 0)
    889 				continue;
    890 
    891 			vd->vdev_top = vd;
    892 			(void) vdev_validate_aux(vd);
    893 
    894 			if (!vdev_is_dead(vd)) {
    895 				uint64_t size;
    896 				size = vdev_get_rsize(vd);
    897 				ASSERT3U(size, >, 0);
    898 				if (spa_mode & FWRITE) {
    899 					l2arc_add_vdev(spa, vd,
    900 					    VDEV_LABEL_START_SIZE,
    901 					    size - VDEV_LABEL_START_SIZE);
    902 				}
    903 				spa_l2cache_activate(vd);
    904 			}
    905 		}
    906 	}
    907 
    908 	/*
    909 	 * Purge vdevs that were dropped
    910 	 */
    911 	for (i = 0; i < oldnvdevs; i++) {
    912 		uint64_t pool;
    913 
    914 		vd = oldvdevs[i];
    915 		if (vd != NULL) {
    916 			if (spa_mode & FWRITE &&
    917 			    spa_l2cache_exists(vd->vdev_guid, &pool) &&
    918 			    pool != 0ULL) {
    919 				l2arc_remove_vdev(vd);
    920 			}
    921 			(void) vdev_close(vd);
    922 			spa_l2cache_remove(vd);
    923 		}
    924 	}
    925 
    926 	if (oldvdevs)
    927 		kmem_free(oldvdevs, oldnvdevs * sizeof (void *));
    928 
    929 	if (sav->sav_config == NULL)
    930 		goto out;
    931 
    932 	sav->sav_vdevs = newvdevs;
    933 	sav->sav_count = (int)nl2cache;
    934 
    935 	/*
    936 	 * Recompute the stashed list of l2cache devices, with status
    937 	 * information this time.
    938 	 */
    939 	VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
    940 	    DATA_TYPE_NVLIST_ARRAY) == 0);
    941 
    942 	l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
    943 	for (i = 0; i < sav->sav_count; i++)
    944 		l2cache[i] = vdev_config_generate(spa,
    945 		    sav->sav_vdevs[i], B_TRUE, B_FALSE, B_TRUE);
    946 	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
    947 	    ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
    948 out:
    949 	for (i = 0; i < sav->sav_count; i++)
    950 		nvlist_free(l2cache[i]);
    951 	if (sav->sav_count)
    952 		kmem_free(l2cache, sav->sav_count * sizeof (void *));
    953 }
    954 
    955 static int
    956 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
    957 {
    958 	dmu_buf_t *db;
    959 	char *packed = NULL;
    960 	size_t nvsize = 0;
    961 	int error;
    962 	*value = NULL;
    963 
    964 	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
    965 	nvsize = *(uint64_t *)db->db_data;
    966 	dmu_buf_rele(db, FTAG);
    967 
    968 	packed = kmem_alloc(nvsize, KM_SLEEP);
    969 	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed);
    970 	if (error == 0)
    971 		error = nvlist_unpack(packed, nvsize, value, 0);
    972 	kmem_free(packed, nvsize);
    973 
    974 	return (error);
    975 }
    976 
    977 /*
    978  * Checks to see if the given vdev could not be opened, in which case we post a
    979  * sysevent to notify the autoreplace code that the device has been removed.
    980  */
    981 static void
    982 spa_check_removed(vdev_t *vd)
    983 {
    984 	int c;
    985 
    986 	for (c = 0; c < vd->vdev_children; c++)
    987 		spa_check_removed(vd->vdev_child[c]);
    988 
    989 	if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) {
    990 		zfs_post_autoreplace(vd->vdev_spa, vd);
    991 		spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
    992 	}
    993 }
    994 
    995 /*
    996  * Load an existing storage pool, using the pool's builtin spa_config as a
    997  * source of configuration information.
    998  */
    999 static int
   1000 spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
   1001 {
   1002 	int error = 0;
   1003 	nvlist_t *nvroot = NULL;
   1004 	vdev_t *rvd;
   1005 	uberblock_t *ub = &spa->spa_uberblock;
   1006 	uint64_t config_cache_txg = spa->spa_config_txg;
   1007 	uint64_t pool_guid;
   1008 	uint64_t version;
   1009 	zio_t *zio;
   1010 	uint64_t autoreplace = 0;
   1011 
   1012 	spa->spa_load_state = state;
   1013 
   1014 	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
   1015 	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
   1016 		error = EINVAL;
   1017 		goto out;
   1018 	}
   1019 
   1020 	/*
   1021 	 * Versioning wasn't explicitly added to the label until later, so if
   1022 	 * it's not present treat it as the initial version.
   1023 	 */
   1024 	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0)
   1025 		version = SPA_VERSION_INITIAL;
   1026 
   1027 	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
   1028 	    &spa->spa_config_txg);
   1029 
   1030 	if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
   1031 	    spa_guid_exists(pool_guid, 0)) {
   1032 		error = EEXIST;
   1033 		goto out;
   1034 	}
   1035 
   1036 	spa->spa_load_guid = pool_guid;
   1037 
   1038 	/*
   1039 	 * Parse the configuration into a vdev tree.  We explicitly set the
   1040 	 * value that will be returned by spa_version() since parsing the
   1041 	 * configuration requires knowing the version number.
   1042 	 */
   1043 	spa_config_enter(spa, RW_WRITER, FTAG);
   1044 	spa->spa_ubsync.ub_version = version;
   1045 	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
   1046 	spa_config_exit(spa, FTAG);
   1047 
   1048 	if (error != 0)
   1049 		goto out;
   1050 
   1051 	ASSERT(spa->spa_root_vdev == rvd);
   1052 	ASSERT(spa_guid(spa) == pool_guid);
   1053 
   1054 	/*
   1055 	 * Try to open all vdevs, loading each label in the process.
   1056 	 */
   1057 	error = vdev_open(rvd);
   1058 	if (error != 0)
   1059 		goto