Home | History | Annotate | Download | only in zfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 #pragma ident	"@(#)spa.c	1.46	08/01/02 SMI"
     28 
     29 /*
     30  * This file contains all the routines used when modifying on-disk SPA state.
     31  * This includes opening, importing, destroying, exporting a pool, and syncing a
     32  * pool.
     33  */
     34 
     35 #include <sys/zfs_context.h>
     36 #include <sys/fm/fs/zfs.h>
     37 #include <sys/spa_impl.h>
     38 #include <sys/zio.h>
     39 #include <sys/zio_checksum.h>
     40 #include <sys/zio_compress.h>
     41 #include <sys/dmu.h>
     42 #include <sys/dmu_tx.h>
     43 #include <sys/zap.h>
     44 #include <sys/zil.h>
     45 #include <sys/vdev_impl.h>
     46 #include <sys/metaslab.h>
     47 #include <sys/uberblock_impl.h>
     48 #include <sys/txg.h>
     49 #include <sys/avl.h>
     50 #include <sys/dmu_traverse.h>
     51 #include <sys/dmu_objset.h>
     52 #include <sys/unique.h>
     53 #include <sys/dsl_pool.h>
     54 #include <sys/dsl_dataset.h>
     55 #include <sys/dsl_dir.h>
     56 #include <sys/dsl_prop.h>
     57 #include <sys/dsl_synctask.h>
     58 #include <sys/fs/zfs.h>
     59 #include <sys/arc.h>
     60 #include <sys/callb.h>
     61 #include <sys/systeminfo.h>
     62 #include <sys/sunddi.h>
     63 
     64 #include "zfs_prop.h"
     65 
/*
 * Thread count passed to taskq_create() for every per-I/O-type issue and
 * interrupt taskq that spa_activate() creates.
 */
int zio_taskq_threads = 8;

/* Sync-task callback that spa_prop_set() hands to dsl_sync_task_do(). */
static void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx);
     69 
     70 /*
     71  * ==========================================================================
     72  * SPA properties routines
     73  * ==========================================================================
     74  */
     75 
     76 /*
     77  * Add a (source=src, propname=propval) list to an nvlist.
     78  */
     79 static int
     80 spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
     81     uint64_t intval, zprop_source_t src)
     82 {
     83 	const char *propname = zpool_prop_to_name(prop);
     84 	nvlist_t *propval;
     85 	int err = 0;
     86 
     87 	if (err = nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP))
     88 		return (err);
     89 
     90 	if (err = nvlist_add_uint64(propval, ZPROP_SOURCE, src))
     91 		goto out;
     92 
     93 	if (strval != NULL) {
     94 		if (err = nvlist_add_string(propval, ZPROP_VALUE, strval))
     95 			goto out;
     96 	} else {
     97 		if (err = nvlist_add_uint64(propval, ZPROP_VALUE, intval))
     98 			goto out;
     99 	}
    100 
    101 	err = nvlist_add_nvlist(nvl, propname, propval);
    102 out:
    103 	nvlist_free(propval);
    104 	return (err);
    105 }
    106 
    107 /*
    108  * Get property values from the spa configuration.
    109  */
    110 static int
    111 spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
    112 {
    113 	uint64_t size = spa_get_space(spa);
    114 	uint64_t used = spa_get_alloc(spa);
    115 	uint64_t cap, version;
    116 	zprop_source_t src = ZPROP_SRC_NONE;
    117 	int err;
    118 	char *cachefile;
    119 	size_t len;
    120 
    121 	/*
    122 	 * readonly properties
    123 	 */
    124 	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa->spa_name,
    125 	    0, src))
    126 		return (err);
    127 
    128 	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src))
    129 		return (err);
    130 
    131 	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_USED, NULL, used, src))
    132 		return (err);
    133 
    134 	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_AVAILABLE, NULL,
    135 	    size - used, src))
    136 		return (err);
    137 
    138 	cap = (size == 0) ? 0 : (used * 100 / size);
    139 	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src))
    140 		return (err);
    141 
    142 	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL,
    143 	    spa_guid(spa), src))
    144 		return (err);
    145 
    146 	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
    147 	    spa->spa_root_vdev->vdev_state, src))
    148 		return (err);
    149 
    150 	/*
    151 	 * settable properties that are not stored in the pool property object.
    152 	 */
    153 	version = spa_version(spa);
    154 	if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
    155 		src = ZPROP_SRC_DEFAULT;
    156 	else
    157 		src = ZPROP_SRC_LOCAL;
    158 	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL,
    159 	    version, src))
    160 		return (err);
    161 
    162 	if (spa->spa_root != NULL) {
    163 		src = ZPROP_SRC_LOCAL;
    164 		if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT,
    165 		    spa->spa_root, 0, src))
    166 			return (err);
    167 	}
    168 
    169 	if (spa->spa_config_dir != NULL) {
    170 		if (strcmp(spa->spa_config_dir, "none") == 0) {
    171 			err = spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
    172 			    spa->spa_config_dir, 0, ZPROP_SRC_LOCAL);
    173 		} else {
    174 			len = strlen(spa->spa_config_dir) +
    175 			    strlen(spa->spa_config_file) + 2;
    176 			cachefile = kmem_alloc(len, KM_SLEEP);
    177 			(void) snprintf(cachefile, len, "%s/%s",
    178 			    spa->spa_config_dir, spa->spa_config_file);
    179 			err = spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
    180 			    cachefile, 0, ZPROP_SRC_LOCAL);
    181 			kmem_free(cachefile, len);
    182 		}
    183 
    184 		if (err)
    185 			return (err);
    186 	}
    187 
    188 	return (0);
    189 }
    190 
    191 /*
    192  * Get zpool property values.
    193  */
    194 int
    195 spa_prop_get(spa_t *spa, nvlist_t **nvp)
    196 {
    197 	zap_cursor_t zc;
    198 	zap_attribute_t za;
    199 	objset_t *mos = spa->spa_meta_objset;
    200 	int err;
    201 
    202 	if (err = nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP))
    203 		return (err);
    204 
    205 	/*
    206 	 * Get properties from the spa config.
    207 	 */
    208 	if (err = spa_prop_get_config(spa, nvp))
    209 		goto out;
    210 
    211 	mutex_enter(&spa->spa_props_lock);
    212 	/* If no pool property object, no more prop to get. */
    213 	if (spa->spa_pool_props_object == 0) {
    214 		mutex_exit(&spa->spa_props_lock);
    215 		return (0);
    216 	}
    217 
    218 	/*
    219 	 * Get properties from the MOS pool property object.
    220 	 */
    221 	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
    222 	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
    223 	    zap_cursor_advance(&zc)) {
    224 		uint64_t intval = 0;
    225 		char *strval = NULL;
    226 		zprop_source_t src = ZPROP_SRC_DEFAULT;
    227 		zpool_prop_t prop;
    228 
    229 		if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
    230 			continue;
    231 
    232 		switch (za.za_integer_length) {
    233 		case 8:
    234 			/* integer property */
    235 			if (za.za_first_integer !=
    236 			    zpool_prop_default_numeric(prop))
    237 				src = ZPROP_SRC_LOCAL;
    238 
    239 			if (prop == ZPOOL_PROP_BOOTFS) {
    240 				dsl_pool_t *dp;
    241 				dsl_dataset_t *ds = NULL;
    242 
    243 				dp = spa_get_dsl(spa);
    244 				rw_enter(&dp->dp_config_rwlock, RW_READER);
    245 				if (err = dsl_dataset_open_obj(dp,
    246 				    za.za_first_integer, NULL, DS_MODE_NONE,
    247 				    FTAG, &ds)) {
    248 					rw_exit(&dp->dp_config_rwlock);
    249 					break;
    250 				}
    251 
    252 				strval = kmem_alloc(
    253 				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
    254 				    KM_SLEEP);
    255 				dsl_dataset_name(ds, strval);
    256 				dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
    257 				rw_exit(&dp->dp_config_rwlock);
    258 			} else {
    259 				strval = NULL;
    260 				intval = za.za_first_integer;
    261 			}
    262 
    263 			err = spa_prop_add_list(*nvp, prop, strval,
    264 			    intval, src);
    265 
    266 			if (strval != NULL)
    267 				kmem_free(strval,
    268 				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);
    269 
    270 			break;
    271 
    272 		case 1:
    273 			/* string property */
    274 			strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
    275 			err = zap_lookup(mos, spa->spa_pool_props_object,
    276 			    za.za_name, 1, za.za_num_integers, strval);
    277 			if (err) {
    278 				kmem_free(strval, za.za_num_integers);
    279 				break;
    280 			}
    281 			err = spa_prop_add_list(*nvp, prop, strval, 0, src);
    282 			kmem_free(strval, za.za_num_integers);
    283 			break;
    284 
    285 		default:
    286 			break;
    287 		}
    288 	}
    289 	zap_cursor_fini(&zc);
    290 	mutex_exit(&spa->spa_props_lock);
    291 out:
    292 	if (err && err != ENOENT) {
    293 		nvlist_free(*nvp);
    294 		return (err);
    295 	}
    296 
    297 	return (0);
    298 }
    299 
    300 /*
    301  * Validate the given pool properties nvlist and modify the list
    302  * for the property values to be set.
    303  */
    304 static int
    305 spa_prop_validate(spa_t *spa, nvlist_t *props)
    306 {
    307 	nvpair_t *elem;
    308 	int error = 0, reset_bootfs = 0;
    309 	uint64_t objnum;
    310 
    311 	elem = NULL;
    312 	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
    313 		zpool_prop_t prop;
    314 		char *propname, *strval;
    315 		uint64_t intval;
    316 		vdev_t *rvdev;
    317 		char *vdev_type;
    318 		objset_t *os;
    319 		char *slash;
    320 
    321 		propname = nvpair_name(elem);
    322 
    323 		if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL)
    324 			return (EINVAL);
    325 
    326 		switch (prop) {
    327 		case ZPOOL_PROP_VERSION:
    328 			error = nvpair_value_uint64(elem, &intval);
    329 			if (!error &&
    330 			    (intval < spa_version(spa) || intval > SPA_VERSION))
    331 				error = EINVAL;
    332 			break;
    333 
    334 		case ZPOOL_PROP_DELEGATION:
    335 		case ZPOOL_PROP_AUTOREPLACE:
    336 			error = nvpair_value_uint64(elem, &intval);
    337 			if (!error && intval > 1)
    338 				error = EINVAL;
    339 			break;
    340 
    341 		case ZPOOL_PROP_BOOTFS:
    342 			if (spa_version(spa) < SPA_VERSION_BOOTFS) {
    343 				error = ENOTSUP;
    344 				break;
    345 			}
    346 
    347 			/*
    348 			 * A bootable filesystem can not be on a RAIDZ pool
    349 			 * nor a striped pool with more than 1 device.
    350 			 */
    351 			rvdev = spa->spa_root_vdev;
    352 			vdev_type =
    353 			    rvdev->vdev_child[0]->vdev_ops->vdev_op_type;
    354 			if (rvdev->vdev_children > 1 ||
    355 			    strcmp(vdev_type, VDEV_TYPE_RAIDZ) == 0 ||
    356 			    strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) {
    357 				error = ENOTSUP;
    358 				break;
    359 			}
    360 
    361 			reset_bootfs = 1;
    362 
    363 			error = nvpair_value_string(elem, &strval);
    364 
    365 			if (!error) {
    366 				if (strval == NULL || strval[0] == '\0') {
    367 					objnum = zpool_prop_default_numeric(
    368 					    ZPOOL_PROP_BOOTFS);
    369 					break;
    370 				}
    371 
    372 				if (error = dmu_objset_open(strval, DMU_OST_ZFS,
    373 				    DS_MODE_STANDARD | DS_MODE_READONLY, &os))
    374 					break;
    375 				objnum = dmu_objset_id(os);
    376 				dmu_objset_close(os);
    377 			}
    378 			break;
    379 		case ZPOOL_PROP_FAILUREMODE:
    380 			error = nvpair_value_uint64(elem, &intval);
    381 			if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
    382 			    intval > ZIO_FAILURE_MODE_PANIC))
    383 				error = EINVAL;
    384 
    385 			/*
    386 			 * This is a special case which only occurs when
    387 			 * the pool has completely failed. This allows
    388 			 * the user to change the in-core failmode property
    389 			 * without syncing it out to disk (I/Os might
    390 			 * currently be blocked). We do this by returning
    391 			 * EIO to the caller (spa_prop_set) to trick it
    392 			 * into thinking we encountered a property validation
    393 			 * error.
    394 			 */
    395 			if (!error && spa_state(spa) == POOL_STATE_IO_FAILURE) {
    396 				spa->spa_failmode = intval;
    397 				error = EIO;
    398 			}
    399 			break;
    400 
    401 		case ZPOOL_PROP_CACHEFILE:
    402 			if ((error = nvpair_value_string(elem, &strval)) != 0)
    403 				break;
    404 
    405 			if (strval[0] == '\0')
    406 				break;
    407 
    408 			if (strcmp(strval, "none") == 0)
    409 				break;
    410 
    411 			if (strval[0] != '/') {
    412 				error = EINVAL;
    413 				break;
    414 			}
    415 
    416 			slash = strrchr(strval, '/');
    417 			ASSERT(slash != NULL);
    418 
    419 			if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
    420 			    strcmp(slash, "/..") == 0)
    421 				error = EINVAL;
    422 			break;
    423 		}
    424 
    425 		if (error)
    426 			break;
    427 	}
    428 
    429 	if (!error && reset_bootfs) {
    430 		error = nvlist_remove(props,
    431 		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);
    432 
    433 		if (!error) {
    434 			error = nvlist_add_uint64(props,
    435 			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
    436 		}
    437 	}
    438 
    439 	return (error);
    440 }
    441 
    442 int
    443 spa_prop_set(spa_t *spa, nvlist_t *nvp)
    444 {
    445 	int error;
    446 
    447 	if ((error = spa_prop_validate(spa, nvp)) != 0)
    448 		return (error);
    449 
    450 	return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
    451 	    spa, nvp, 3));
    452 }
    453 
    454 /*
    455  * If the bootfs property value is dsobj, clear it.
    456  */
    457 void
    458 spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
    459 {
    460 	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
    461 		VERIFY(zap_remove(spa->spa_meta_objset,
    462 		    spa->spa_pool_props_object,
    463 		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
    464 		spa->spa_bootfs = 0;
    465 	}
    466 }
    467 
    468 /*
    469  * ==========================================================================
    470  * SPA state manipulation (open/create/destroy/import/export)
    471  * ==========================================================================
    472  */
    473 
    474 static int
    475 spa_error_entry_compare(const void *a, const void *b)
    476 {
    477 	spa_error_entry_t *sa = (spa_error_entry_t *)a;
    478 	spa_error_entry_t *sb = (spa_error_entry_t *)b;
    479 	int ret;
    480 
    481 	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
    482 	    sizeof (zbookmark_t));
    483 
    484 	if (ret < 0)
    485 		return (-1);
    486 	else if (ret > 0)
    487 		return (1);
    488 	else
    489 		return (0);
    490 }
    491 
    492 /*
    493  * Utility function which retrieves copies of the current logs and
    494  * re-initializes them in the process.
    495  */
    496 void
    497 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
    498 {
    499 	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));
    500 
    501 	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
    502 	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));
    503 
    504 	avl_create(&spa->spa_errlist_scrub,
    505 	    spa_error_entry_compare, sizeof (spa_error_entry_t),
    506 	    offsetof(spa_error_entry_t, se_avl));
    507 	avl_create(&spa->spa_errlist_last,
    508 	    spa_error_entry_compare, sizeof (spa_error_entry_t),
    509 	    offsetof(spa_error_entry_t, se_avl));
    510 }
    511 
    512 /*
    513  * Activate an uninitialized pool.
    514  */
    515 static void
    516 spa_activate(spa_t *spa)
    517 {
    518 	int t;
    519 
    520 	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
    521 
    522 	spa->spa_state = POOL_STATE_ACTIVE;
    523 
    524 	spa->spa_normal_class = metaslab_class_create();
    525 	spa->spa_log_class = metaslab_class_create();
    526 
    527 	for (t = 0; t < ZIO_TYPES; t++) {
    528 		spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue",
    529 		    zio_taskq_threads, maxclsyspri, 50, INT_MAX,
    530 		    TASKQ_PREPOPULATE);
    531 		spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr",
    532 		    zio_taskq_threads, maxclsyspri, 50, INT_MAX,
    533 		    TASKQ_PREPOPULATE);
    534 	}
    535 
    536 	list_create(&spa->spa_dirty_list, sizeof (vdev_t),
    537 	    offsetof(vdev_t, vdev_dirty_node));
    538 	list_create(&spa->spa_zio_list, sizeof (zio_t),
    539 	    offsetof(zio_t, zio_link_node));
    540 
    541 	txg_list_create(&spa->spa_vdev_txg_list,
    542 	    offsetof(struct vdev, vdev_txg_node));
    543 
    544 	avl_create(&spa->spa_errlist_scrub,
    545 	    spa_error_entry_compare, sizeof (spa_error_entry_t),
    546 	    offsetof(spa_error_entry_t, se_avl));
    547 	avl_create(&spa->spa_errlist_last,
    548 	    spa_error_entry_compare, sizeof (spa_error_entry_t),
    549 	    offsetof(spa_error_entry_t, se_avl));
    550 }
    551 
    552 /*
    553  * Opposite of spa_activate().
    554  */
    555 static void
    556 spa_deactivate(spa_t *spa)
    557 {
    558 	int t;
    559 
    560 	ASSERT(spa->spa_sync_on == B_FALSE);
    561 	ASSERT(spa->spa_dsl_pool == NULL);
    562 	ASSERT(spa->spa_root_vdev == NULL);
    563 
    564 	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);
    565 
    566 	txg_list_destroy(&spa->spa_vdev_txg_list);
    567 
    568 	list_destroy(&spa->spa_dirty_list);
    569 	list_destroy(&spa->spa_zio_list);
    570 
    571 	for (t = 0; t < ZIO_TYPES; t++) {
    572 		taskq_destroy(spa->spa_zio_issue_taskq[t]);
    573 		taskq_destroy(spa->spa_zio_intr_taskq[t]);
    574 		spa->spa_zio_issue_taskq[t] = NULL;
    575 		spa->spa_zio_intr_taskq[t] = NULL;
    576 	}
    577 
    578 	metaslab_class_destroy(spa->spa_normal_class);
    579 	spa->spa_normal_class = NULL;
    580 
    581 	metaslab_class_destroy(spa->spa_log_class);
    582 	spa->spa_log_class = NULL;
    583 
    584 	/*
    585 	 * If this was part of an import or the open otherwise failed, we may
    586 	 * still have errors left in the queues.  Empty them just in case.
    587 	 */
    588 	spa_errlog_drain(spa);
    589 
    590 	avl_destroy(&spa->spa_errlist_scrub);
    591 	avl_destroy(&spa->spa_errlist_last);
    592 
    593 	spa->spa_state = POOL_STATE_UNINITIALIZED;
    594 }
    595 
    596 /*
    597  * Verify a pool configuration, and construct the vdev tree appropriately.  This
    598  * will create all the necessary vdevs in the appropriate layout, with each vdev
    599  * in the CLOSED state.  This will prep the pool before open/creation/import.
    600  * All vdev validation is done by the vdev_alloc() routine.
    601  */
    602 static int
    603 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
    604     uint_t id, int atype)
    605 {
    606 	nvlist_t **child;
    607 	uint_t c, children;
    608 	int error;
    609 
    610 	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
    611 		return (error);
    612 
    613 	if ((*vdp)->vdev_ops->vdev_op_leaf)
    614 		return (0);
    615 
    616 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
    617 	    &child, &children) != 0) {
    618 		vdev_free(*vdp);
    619 		*vdp = NULL;
    620 		return (EINVAL);
    621 	}
    622 
    623 	for (c = 0; c < children; c++) {
    624 		vdev_t *vd;
    625 		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
    626 		    atype)) != 0) {
    627 			vdev_free(*vdp);
    628 			*vdp = NULL;
    629 			return (error);
    630 		}
    631 	}
    632 
    633 	ASSERT(*vdp != NULL);
    634 
    635 	return (0);
    636 }
    637 
    638 /*
    639  * Opposite of spa_load().
    640  */
    641 static void
    642 spa_unload(spa_t *spa)
    643 {
    644 	int i;
    645 
    646 	/*
    647 	 * Stop async tasks.
    648 	 */
    649 	spa_async_suspend(spa);
    650 
    651 	/*
    652 	 * Stop syncing.
    653 	 */
    654 	if (spa->spa_sync_on) {
    655 		txg_sync_stop(spa->spa_dsl_pool);
    656 		spa->spa_sync_on = B_FALSE;
    657 	}
    658 
    659 	/*
    660 	 * Wait for any outstanding prefetch I/O to complete.
    661 	 */
    662 	spa_config_enter(spa, RW_WRITER, FTAG);
    663 	spa_config_exit(spa, FTAG);
    664 
    665 	/*
    666 	 * Drop and purge level 2 cache
    667 	 */
    668 	spa_l2cache_drop(spa);
    669 
    670 	/*
    671 	 * Close the dsl pool.
    672 	 */
    673 	if (spa->spa_dsl_pool) {
    674 		dsl_pool_close(spa->spa_dsl_pool);
    675 		spa->spa_dsl_pool = NULL;
    676 	}
    677 
    678 	/*
    679 	 * Close all vdevs.
    680 	 */
    681 	if (spa->spa_root_vdev)
    682 		vdev_free(spa->spa_root_vdev);
    683 	ASSERT(spa->spa_root_vdev == NULL);
    684 
    685 	for (i = 0; i < spa->spa_spares.sav_count; i++)
    686 		vdev_free(spa->spa_spares.sav_vdevs[i]);
    687 	if (spa->spa_spares.sav_vdevs) {
    688 		kmem_free(spa->spa_spares.sav_vdevs,
    689 		    spa->spa_spares.sav_count * sizeof (void *));
    690 		spa->spa_spares.sav_vdevs = NULL;
    691 	}
    692 	if (spa->spa_spares.sav_config) {
    693 		nvlist_free(spa->spa_spares.sav_config);
    694 		spa->spa_spares.sav_config = NULL;
    695 	}
    696 
    697 	for (i = 0; i < spa->spa_l2cache.sav_count; i++)
    698 		vdev_free(spa->spa_l2cache.sav_vdevs[i]);
    699 	if (spa->spa_l2cache.sav_vdevs) {
    700 		kmem_free(spa->spa_l2cache.sav_vdevs,
    701 		    spa->spa_l2cache.sav_count * sizeof (void *));
    702 		spa->spa_l2cache.sav_vdevs = NULL;
    703 	}
    704 	if (spa->spa_l2cache.sav_config) {
    705 		nvlist_free(spa->spa_l2cache.sav_config);
    706 		spa->spa_l2cache.sav_config = NULL;
    707 	}
    708 
    709 	spa->spa_async_suspended = 0;
    710 }
    711 
    712 /*
    713  * Load (or re-load) the current list of vdevs describing the active spares for
    714  * this pool.  When this is called, we have some form of basic information in
    715  * 'spa_spares.sav_config'.  We parse this into vdevs, try to open them, and
    716  * then re-generate a more complete list including status information.
    717  */
    718 static void
    719 spa_load_spares(spa_t *spa)
    720 {
    721 	nvlist_t **spares;
    722 	uint_t nspares;
    723 	int i;
    724 	vdev_t *vd, *tvd;
    725 
    726 	/*
    727 	 * First, close and free any existing spare vdevs.
    728 	 */
    729 	for (i = 0; i < spa->spa_spares.sav_count; i++) {
    730 		vd = spa->spa_spares.sav_vdevs[i];
    731 
    732 		/* Undo the call to spa_activate() below */
    733 		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL &&
    734 		    tvd->vdev_isspare)
    735 			spa_spare_remove(tvd);
    736 		vdev_close(vd);
    737 		vdev_free(vd);
    738 	}
    739 
    740 	if (spa->spa_spares.sav_vdevs)
    741 		kmem_free(spa->spa_spares.sav_vdevs,
    742 		    spa->spa_spares.sav_count * sizeof (void *));
    743 
    744 	if (spa->spa_spares.sav_config == NULL)
    745 		nspares = 0;
    746 	else
    747 		VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
    748 		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
    749 
    750 	spa->spa_spares.sav_count = (int)nspares;
    751 	spa->spa_spares.sav_vdevs = NULL;
    752 
    753 	if (nspares == 0)
    754 		return;
    755 
    756 	/*
    757 	 * Construct the array of vdevs, opening them to get status in the
    758 	 * process.   For each spare, there is potentially two different vdev_t
    759 	 * structures associated with it: one in the list of spares (used only
    760 	 * for basic validation purposes) and one in the active vdev
    761 	 * configuration (if it's spared in).  During this phase we open and
    762 	 * validate each vdev on the spare list.  If the vdev also exists in the
    763 	 * active configuration, then we also mark this vdev as an active spare.
    764 	 */
    765 	spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
    766 	    KM_SLEEP);
    767 	for (i = 0; i < spa->spa_spares.sav_count; i++) {
    768 		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
    769 		    VDEV_ALLOC_SPARE) == 0);
    770 		ASSERT(vd != NULL);
    771 
    772 		spa->spa_spares.sav_vdevs[i] = vd;
    773 
    774 		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL) {
    775 			if (!tvd->vdev_isspare)
    776 				spa_spare_add(tvd);
    777 
    778 			/*
    779 			 * We only mark the spare active if we were successfully
    780 			 * able to load the vdev.  Otherwise, importing a pool
    781 			 * with a bad active spare would result in strange
    782 			 * behavior, because multiple pool would think the spare
    783 			 * is actively in use.
    784 			 *
    785 			 * There is a vulnerability here to an equally bizarre
    786 			 * circumstance, where a dead active spare is later
    787 			 * brought back to life (onlined or otherwise).  Given
    788 			 * the rarity of this scenario, and the extra complexity
    789 			 * it adds, we ignore the possibility.
    790 			 */
    791 			if (!vdev_is_dead(tvd))
    792 				spa_spare_activate(tvd);
    793 		}
    794 
    795 		if (vdev_open(vd) != 0)
    796 			continue;
    797 
    798 		vd->vdev_top = vd;
    799 		if (vdev_validate_aux(vd) == 0)
    800 			spa_spare_add(vd);
    801 	}
    802 
    803 	/*
    804 	 * Recompute the stashed list of spares, with status information
    805 	 * this time.
    806 	 */
    807 	VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
    808 	    DATA_TYPE_NVLIST_ARRAY) == 0);
    809 
    810 	spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
    811 	    KM_SLEEP);
    812 	for (i = 0; i < spa->spa_spares.sav_count; i++)
    813 		spares[i] = vdev_config_generate(spa,
    814 		    spa->spa_spares.sav_vdevs[i], B_TRUE, B_TRUE, B_FALSE);
    815 	VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
    816 	    ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
    817 	for (i = 0; i < spa->spa_spares.sav_count; i++)
    818 		nvlist_free(spares[i]);
    819 	kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
    820 }
    821 
    822 /*
    823  * Load (or re-load) the current list of vdevs describing the active l2cache for
    824  * this pool.  When this is called, we have some form of basic information in
    825  * 'spa_l2cache.sav_config'.  We parse this into vdevs, try to open them, and
    826  * then re-generate a more complete list including status information.
    827  * Devices which are already active have their details maintained, and are
    828  * not re-opened.
    829  */
    830 static void
    831 spa_load_l2cache(spa_t *spa)
    832 {
    833 	nvlist_t **l2cache;
    834 	uint_t nl2cache;
    835 	int i, j, oldnvdevs;
    836 	uint64_t guid;
    837 	vdev_t *vd, **oldvdevs, **newvdevs;
    838 	spa_aux_vdev_t *sav = &spa->spa_l2cache;
    839 
    840 	if (sav->sav_config != NULL) {
    841 		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
    842 		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
    843 		newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
    844 	} else {
    845 		nl2cache = 0;
    846 	}
    847 
    848 	oldvdevs = sav->sav_vdevs;
    849 	oldnvdevs = sav->sav_count;
    850 	sav->sav_vdevs = NULL;
    851 	sav->sav_count = 0;
    852 
    853 	/*
    854 	 * Process new nvlist of vdevs.
    855 	 */
    856 	for (i = 0; i < nl2cache; i++) {
    857 		VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
    858 		    &guid) == 0);
    859 
    860 		newvdevs[i] = NULL;
    861 		for (j = 0; j < oldnvdevs; j++) {
    862 			vd = oldvdevs[j];
    863 			if (vd != NULL && guid == vd->vdev_guid) {
    864 				/*
    865 				 * Retain previous vdev for add/remove ops.
    866 				 */
    867 				newvdevs[i] = vd;
    868 				oldvdevs[j] = NULL;
    869 				break;
    870 			}
    871 		}
    872 
    873 		if (newvdevs[i] == NULL) {
    874 			/*
    875 			 * Create new vdev
    876 			 */
    877 			VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
    878 			    VDEV_ALLOC_L2CACHE) == 0);
    879 			ASSERT(vd != NULL);
    880 			newvdevs[i] = vd;
    881 
    882 			/*
    883 			 * Commit this vdev as an l2cache device,
    884 			 * even if it fails to open.
    885 			 */
    886 			spa_l2cache_add(vd);
    887 
    888 			if (vdev_open(vd) != 0)
    889 				continue;
    890 
    891 			vd->vdev_top = vd;
    892 			(void) vdev_validate_aux(vd);
    893 
    894 			if (!vdev_is_dead(vd)) {
    895 				uint64_t size;
    896 				size = vdev_get_rsize(vd);
    897 				ASSERT3U(size, >, 0);
    898 				if (spa_mode & FWRITE) {
    899 					l2arc_add_vdev(spa, vd,
    900 					    VDEV_LABEL_START_SIZE,
    901 					    size - VDEV_LABEL_START_SIZE);
    902 				}
    903 				spa_l2cache_activate(vd);
    904 			}
    905 		}
    906 	}
    907 
    908 	/*
    909 	 * Purge vdevs that were dropped
    910 	 */
    911 	for (i = 0; i < oldnvdevs; i++) {
    912 		uint64_t pool;
    913 
    914 		vd = oldvdevs[i];
    915 		if (vd != NULL) {
    916 			if (spa_mode & FWRITE &&
    917 			    spa_l2cache_exists(vd->vdev_guid, &pool) &&
    918 			    pool != 0ULL) {
    919 				l2arc_remove_vdev(vd);
    920 			}
    921 			(void) vdev_close(vd);
    922 			spa_l2cache_remove(vd);
    923 		}
    924 	}
    925 
    926 	if (oldvdevs)
    927 		kmem_free(oldvdevs, oldnvdevs * sizeof (void *));
    928 
    929 	if (sav->sav_config == NULL)
    930 		goto out;
    931 
    932 	sav->sav_vdevs = newvdevs;
    933 	sav->sav_count = (int)nl2cache;
    934 
    935 	/*
    936 	 * Recompute the stashed list of l2cache devices, with status
    937 	 * information this time.
    938 	 */
    939 	VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
    940 	    DATA_TYPE_NVLIST_ARRAY) == 0);
    941 
    942 	l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
    943 	for (i = 0; i < sav->sav_count; i++)
    944 		l2cache[i] = vdev_config_generate(spa,
    945 		    sav->sav_vdevs[i], B_TRUE, B_FALSE, B_TRUE);
    946 	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
    947 	    ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
    948 out:
    949 	for (i = 0; i < sav->sav_count; i++)
    950 		nvlist_free(l2cache[i]);
    951 	if (sav->sav_count)
    952 		kmem_free(l2cache, sav->sav_count * sizeof (void *));
    953 }
    954 
    955 static int
    956 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
    957 {
    958 	dmu_buf_t *db;
    959 	char *packed = NULL;
    960 	size_t nvsize = 0;
    961 	int error;
    962 	*value = NULL;
    963 
    964 	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
    965 	nvsize = *(uint64_t *)db->db_data;
    966 	dmu_buf_rele(db, FTAG);
    967 
    968 	packed = kmem_alloc(nvsize, KM_SLEEP);
    969 	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed);
    970 	if (error == 0)
    971 		error = nvlist_unpack(packed, nvsize, value, 0);
    972 	kmem_free(packed, nvsize);
    973 
    974 	return (error);
    975 }
    976 
    977 /*
    978  * Checks to see if the given vdev could not be opened, in which case we post a
    979  * sysevent to notify the autoreplace code that the device has been removed.
    980  */
    981 static void
    982 spa_check_removed(vdev_t *vd)
    983 {
    984 	int c;
    985 
    986 	for (c = 0; c < vd->vdev_children; c++)
    987 		spa_check_removed(vd->vdev_child[c]);
    988 
    989 	if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) {
    990 		zfs_post_autoreplace(vd->vdev_spa, vd);
    991 		spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
    992 	}
    993 }
    994 
    995 /*
    996  * Load an existing storage pool, using the pool's builtin spa_config as a
    997  * source of configuration information.
    998  */
    999 static int
   1000 spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
   1001 {
   1002 	int error = 0;
   1003 	nvlist_t *nvroot = NULL;
   1004 	vdev_t *rvd;
   1005 	uberblock_t *ub = &spa->spa_uberblock;
   1006 	uint64_t config_cache_txg = spa->spa_config_txg;
   1007 	uint64_t pool_guid;
   1008 	uint64_t version;
   1009 	zio_t *zio;
   1010 	uint64_t autoreplace = 0;
   1011 
   1012 	spa->spa_load_state = state;
   1013 
   1014 	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
   1015 	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
   1016 		error = EINVAL;
   1017 		goto out;
   1018 	}
   1019 
   1020 	/*
   1021 	 * Versioning wasn't explicitly added to the label until later, so if
   1022 	 * it's not present treat it as the initial version.
   1023 	 */
   1024 	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0)
   1025 		version = SPA_VERSION_INITIAL;
   1026 
   1027 	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
   1028 	    &spa->spa_config_txg);
   1029 
   1030 	if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
   1031 	    spa_guid_exists(pool_guid, 0)) {
   1032 		error = EEXIST;
   1033 		goto out;
   1034 	}
   1035 
   1036 	spa->spa_load_guid = pool_guid;
   1037 
   1038 	/*
   1039 	 * Parse the configuration into a vdev tree.  We explicitly set the
   1040 	 * value that will be returned by spa_version() since parsing the
   1041 	 * configuration requires knowing the version number.
   1042 	 */
   1043 	spa_config_enter(spa, RW_WRITER, FTAG);
   1044 	spa->spa_ubsync.ub_version = version;
   1045 	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
   1046 	spa_config_exit(spa, FTAG);
   1047 
   1048 	if (error != 0)
   1049 		goto out;
   1050 
   1051 	ASSERT(spa->spa_root_vdev == rvd);
   1052 	ASSERT(spa_guid(spa) == pool_guid);
   1053 
   1054 	/*
   1055 	 * Try to open all vdevs, loading each label in the process.
   1056 	 */
   1057 	error = vdev_open(rvd);
   1058 	if (error != 0)
   1059 		goto out;
   1060 
   1061 	/*
   1062 	 * Validate the labels for all leaf vdevs.  We need to grab the config
   1063 	 * lock because all label I/O is done with the ZIO_FLAG_CONFIG_HELD
   1064 	 * flag.
   1065 	 */
   1066 	spa_config_enter(spa, RW_READER, FTAG);
   1067 	error = vdev_validate(rvd);
   1068 	spa_config_exit(spa, FTAG);
   1069 
   1070 	if (error != 0)
   1071 		goto out;
   1072 
   1073 	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
   1074 		error = ENXIO;
   1075 		goto out;
   1076 	}
   1077 
   1078 	/*
   1079 	 * Find the best uberblock.
   1080 	 */
   1081 	bzero(ub, sizeof (uberblock_t));
   1082 
   1083 	zio = zio_root(spa, NULL, NULL,
   1084 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
   1085 	vdev_uberblock_load(zio, rvd, ub);
   1086 	error = zio_wait(zio);
   1087 
   1088 	/*
   1089 	 * If we weren't able to find a single valid uberblock, return failure.
   1090 	 */
   1091 	if (ub->ub_txg == 0) {
   1092 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
   1093 		    VDEV_AUX_CORRUPT_DATA);
   1094 		error = ENXIO;
   1095 		goto out;
   1096 	}
   1097 
   1098 	/*
   1099 	 * If the pool is newer than the code, we can't open it.
   1100 	 */
   1101 	if (ub->ub_version > SPA_VERSION) {
   1102 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
   1103 		    VDEV_AUX_VERSION_NEWER);
   1104 		error = ENOTSUP;
   1105 		goto out;
   1106 	}
   1107 
   1108 	/*
   1109 	 * If the vdev guid sum doesn't match the uberblock, we have an
   1110 	 * incomplete configuration.
   1111 	 */
   1112 	if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
   1113 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
   1114 		    VDEV_AUX_BAD_GUID_SUM);
   1115 		error = ENXIO;
   1116 		goto out;
   1117 	}
   1118 
   1119 	/*
   1120 	 * Initialize internal SPA structures.
   1121 	 */
   1122 	spa->spa_state = POOL_STATE_ACTIVE;
   1123 	spa->spa_ubsync = spa->spa_uberblock;
   1124 	spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
   1125 	error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
   1126 	if (error) {
   1127 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
   1128 		    VDEV_AUX_CORRUPT_DATA);
   1129 		goto out;
   1130 	}
   1131 	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
   1132 
   1133 	if (zap_lookup(spa->spa_meta_objset,
   1134 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
   1135 	    sizeof (uint64_t), 1, &spa->spa_config_object) != 0) {
   1136 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
   1137 		    VDEV_AUX_CORRUPT_DATA);
   1138 		error = EIO;
   1139 		goto out;
   1140 	}
   1141 
   1142 	if (!mosconfig) {
   1143 		nvlist_t *newconfig;
   1144 		uint64_t hostid;
   1145 
   1146 		if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) {
   1147 			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
   1148 			    VDEV_AUX_CORRUPT_DATA);
   1149 			error = EIO;
   1150 			goto out;
   1151 		}
   1152 
   1153 		if (nvlist_lookup_uint64(newconfig, ZPOOL_CONFIG_HOSTID,
   1154 		    &hostid) == 0) {
   1155 			char *hostname;
   1156 			unsigned long myhostid = 0;
   1157 
   1158 			VERIFY(nvlist_lookup_string(newconfig,
   1159 			    ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);
   1160 
   1161 			(void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
   1162 			if (hostid != 0 && myhostid != 0 &&
   1163 			    (unsigned long)hostid != myhostid) {
   1164 				cmn_err(CE_WARN, "pool '%s' could not be "
   1165 				    "loaded as it was last accessed by "
   1166 				    "another system (host: %s hostid: 0x%lx).  "
   1167 				    "See: http://www.sun.com/msg/ZFS-8000-EY",
   1168 				    spa->spa_name, hostname,
   1169 				    (unsigned long)hostid);
   1170 				error = EBADF;
   1171 				goto out;
   1172 			}
   1173 		}
   1174 
   1175 		spa_config_set(spa, newconfig);
   1176 		spa_unload(spa);
   1177 		spa_deactivate(spa);
   1178 		spa_activate(spa);
   1179 
   1180 		return (spa_load(spa, newconfig, state, B_TRUE));
   1181 	}
   1182 
   1183 	if (zap_lookup(spa->spa_meta_objset,
   1184 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
   1185 	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) {
   1186 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
   1187 		    VDEV_AUX_CORRUPT_DATA);
   1188 		error = EIO;
   1189 		goto out;
   1190 	}
   1191 
   1192 	/*
   1193 	 * Load the bit that tells us to use the new accounting function
   1194 	 * (raid-z deflation).  If we have an older pool, this will not
   1195 	 * be present.
   1196 	 */
   1197 	error = zap_lookup(spa->spa_meta_objset,
   1198 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
   1199 	    sizeof (uint64_t), 1, &spa->spa_deflate);
   1200 	if (error != 0 && error != ENOENT) {
   1201 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
   1202 		    VDEV_AUX_CORRUPT_DATA);
   1203 		error = EIO;
   1204 		goto out;
   1205 	}
   1206 
   1207 	/*
   1208 	 * Load the persistent error log.  If we have an older pool, this will
   1209 	 * not be present.
   1210 	 */
   1211 	error = zap_lookup(spa->spa_meta_objset,
   1212 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST,
   1213 	    sizeof (uint64_t), 1, &spa->spa_errlog_last);
   1214 	if (error != 0 && error != ENOENT) {
   1215 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
   1216 		    VDEV_AUX_CORRUPT_DATA);
   1217 		error = EIO;
   1218 		goto out;
   1219 	}
   1220 
   1221 	error = zap_lookup(spa->spa_meta_objset,
   1222 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB,
   1223 	    sizeof (uint64_t), 1, &spa->spa_errlog_scrub);
   1224 	if (error != 0 && error != ENOENT) {
   1225 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
   1226 		    VDEV_AUX_CORRUPT_DATA);
   1227 		error = EIO;
   1228 		goto out;
   1229 	}
   1230 
   1231 	/*
   1232 	 * Load the history object.  If we have an older pool, this
   1233 	 * will not be present.
   1234 	 */
   1235 	error = zap_lookup(spa->spa_meta_objset,
   1236 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY,
   1237 	    sizeof (uint64_t), 1, &spa->spa_history);
   1238 	if (error != 0 && error != ENOENT) {
   1239 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
   1240 		    VDEV_AUX_CORRUPT_DATA);
   1241 		error = EIO;
   1242 		goto out;
   1243 	}
   1244 
   1245 	/*
   1246 	 * Load any hot spares for this pool.
   1247 	 */
   1248 	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
   1249 	    DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares.sav_object);
   1250 	if (error != 0 && error != ENOENT) {
   1251 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
   1252 		    VDEV_AUX_CORRUPT_DATA);
   1253 		error = EIO;
   1254 		goto out;
   1255 	}
   1256 	if (error == 0) {
   1257 		ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
   1258 		if (load_nvlist(spa, spa->spa_spares.sav_object,
   1259 		    &spa->spa_spares.sav_config) != 0) {
   1260 			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
   1261 			    VDEV_AUX_CORRUPT_DATA);
   1262 			error = EIO;
   1263 			goto out;
   1264 		}
   1265 
   1266 		spa_config_enter(spa, RW_WRITER, FTAG);
   1267 		spa_load_spares(spa);
   1268 		spa_config_exit(spa, FTAG);
   1269 	}
   1270 
   1271 	/*
   1272 	 * Load any level 2 ARC devices for this pool.
   1273 	 */
   1274 	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
   1275 	    DMU_POOL_L2CACHE, sizeof (uint64_t), 1,
   1276 	    &spa->spa_l2cache.sav_object);
   1277 	if (error != 0 && error != ENOENT) {
   1278 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
   1279 		    VDEV_AUX_CORRUPT_DATA);
   1280 		error = EIO;
   1281 		goto out;
   1282 	}
   1283 	if (error == 0) {
   1284 		ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
   1285 		if (load_nvlist(spa, spa->spa_l2cache.sav_object,
   1286 		    &spa->spa_l2cache.sav_config) != 0) {
   1287 			vdev_set_state(rvd, B_TRUE,
   1288 			    VDEV_STATE_CANT_OPEN,
   1289 			    VDEV_AUX_CORRUPT_DATA);
   1290 			error = EIO;
   1291 			goto out;
   1292 		}
   1293 
   1294 		spa_config_enter(spa, RW_WRITER, FTAG);
   1295 		spa_load_l2cache(spa);
   1296 		spa_config_exit(spa, FTAG);
   1297 	}
   1298 
   1299 	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
   1300 
   1301 	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
   1302 	    DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object);
   1303 
   1304 	if (error && error != ENOENT) {
   1305 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
   1306 		    VDEV_AUX_CORRUPT_DATA);
   1307 		error = EIO;
   1308 		goto out;
   1309 	}
   1310 
   1311 	if (error == 0) {
   1312 		(void) zap_lookup(spa->spa_meta_objset,
   1313 		    spa->spa_pool_props_object,
   1314 		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS),
   1315 		    sizeof (uint64_t), 1, &spa->spa_bootfs);
   1316 		(void) zap_lookup(spa->spa_meta_objset,
   1317 		    spa->spa_pool_props_object,
   1318 		    zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE),
   1319 		    sizeof (uint64_t), 1, &autoreplace);
   1320 		(void) zap_lookup(spa->spa_meta_objset,
   1321 		    spa->spa_pool_props_object,
   1322 		    zpool_prop_to_name(ZPOOL_PROP_DELEGATION),
   1323 		    sizeof (uint64_t), 1, &spa->spa_delegation);
   1324 		(void) zap_lookup(spa->spa_meta_objset,
   1325 		    spa->spa_pool_props_object,
   1326 		    zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE),
   1327 		    sizeof (uint64_t), 1, &spa->spa_failmode);
   1328 	}
   1329 
   1330 	/*
   1331 	 * If the 'autoreplace' property is set, then post a resource notifying
   1332 	 * the ZFS DE that it should not issue any faults for unopenable
   1333 	 * devices.  We also iterate over the vdevs, and post a sysevent for any
   1334 	 * unopenable vdevs so that the normal autoreplace handler can take
   1335 	 * over.
   1336 	 */
   1337 	if (autoreplace && state != SPA_LOAD_TRYIMPORT)
   1338 		spa_check_removed(spa->spa_root_vdev);
   1339 
   1340 	/*
   1341 	 * Load the vdev state for all toplevel vdevs.
   1342 	 */
   1343 	vdev_load(rvd);
   1344 
   1345 	/*
   1346 	 * Propagate the leaf DTLs we just loaded all the way up the tree.
   1347 	 */
   1348 	spa_config_enter(spa, RW_WRITER, FTAG);
   1349 	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
   1350 	spa_config_exit(spa, FTAG);
   1351 
   1352 	/*
   1353 	 * Check the state of the root vdev.  If it can't be opened, it
   1354 	 * indicates one or more toplevel vdevs are faulted.
   1355 	 */
   1356 	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
   1357 		error = ENXIO;
   1358 		goto out;
   1359 	}
   1360 
   1361 	if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) {
   1362 		dmu_tx_t *tx;
   1363 		int need_update = B_FALSE;
   1364 		int c;
   1365 
   1366 		/*
   1367 		 * Claim log blocks that haven't been committed yet.
   1368 		 * This must all happen in a single txg.
   1369 		 */
   1370 		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
   1371 		    spa_first_txg(spa));
   1372 		(void) dmu_objset_find(spa->spa_name,
   1373 		    zil_claim, tx, DS_FIND_CHILDREN);
   1374 		dmu_tx_commit(tx);
   1375 
   1376 		spa->spa_sync_on = B_TRUE;
   1377 		txg_sync_start(spa->spa_dsl_pool);
   1378 
   1379 		/*
   1380 		 * Wait for all claims to sync.
   1381 		 */
   1382 		txg_wait_synced(spa->spa_dsl_pool, 0);
   1383 
   1384 		/*
   1385 		 * If the config cache is stale, or we have uninitialized
   1386 		 * metaslabs (see spa_vdev_add()), then update the config.
   1387 		 */
   1388 		if (config_cache_txg != spa->spa_config_txg ||
   1389 		    state == SPA_LOAD_IMPORT)
   1390 			need_update = B_TRUE;
   1391 
   1392 		for (c = 0; c < rvd->vdev_children; c++)
   1393 			if (rvd->vdev_child[c]->vdev_ms_array == 0)
   1394 				need_update = B_TRUE;
   1395 
   1396 		/*
   1397 		 * Update the config cache asychronously in case we're the
   1398 		 * root pool, in which case the config cache isn't writable yet.
   1399 		 */
   1400 		if (need_update)
   1401 			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
   1402 	}
   1403 
   1404 	error = 0;
   1405 out:
   1406 	if (error && error != EBADF)
   1407 		zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0);
   1408 	spa->spa_load_state = SPA_LOAD_NONE;
   1409 	spa->spa_ena = 0;
   1410 
   1411 	return (error);
   1412 }
   1413 
   1414 /*
   1415  * Pool Open/Import
   1416  *
   1417  * The import case is identical to an open except that the configuration is sent
   1418  * down from userland, instead of grabbed from the configuration cache.  For the
   1419  * case of an open, the pool configuration will exist in the
   1420  * POOL_STATE_UNINITIALIZED state.
   1421  *
   1422  * The stats information (gen/count/ustats) is used to gather vdev statistics at
   1423  * the same time open the pool, without having to keep around the spa_t in some
   1424  * ambiguous state.
   1425  */
   1426 static int
   1427 spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
   1428 {
   1429 	spa_t *spa;
   1430 	int error;
   1431 	int loaded = B_FALSE;
   1432 	int locked = B_FALSE;
   1433 
   1434 	*spapp = NULL;
   1435 
   1436 	/*
   1437 	 * As disgusting as this is, we need to support recursive calls to this
   1438 	 * function because dsl_dir_open() is called during spa_load(), and ends
   1439 	 * up calling spa_open() again.  The real fix is to figure out how to
   1440 	 * avoid dsl_dir_open() calling this in the first place.
   1441 	 */
   1442 	if (mutex_owner(&spa_namespace_lock) != curthread) {
   1443 		mutex_enter(&spa_namespace_lock);
   1444 		locked = B_TRUE;
   1445 	}
   1446 
   1447 	if ((spa = spa_lookup(pool)) == NULL) {
   1448 		if (locked)
   1449 			mutex_exit(&spa_namespace_lock);
   1450 		return (ENOENT);
   1451 	}
   1452 	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
   1453 
   1454 		spa_activate(spa);
   1455 
   1456 		error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);
   1457 
   1458 		if (error == EBADF) {
   1459 			/*
   1460 			 * If vdev_validate() returns failure (indicated by
   1461 			 * EBADF), it indicates that one of the vdevs indicates
   1462 			 * that the pool has been exported or destroyed.  If
   1463 			 * this is the case, the config cache is out of sync and
   1464 			 * we should remove the pool from the namespace.
   1465 			 */
   1466 			zfs_post_ok(spa, NULL);
   1467 			spa_unload(spa);
   1468 			spa_deactivate(spa);
   1469 			spa_remove(spa);
   1470 			spa_config_sync();
   1471 			if (locked)
   1472 				mutex_exit(&spa_namespace_lock);
   1473 			return (ENOENT);
   1474 		}
   1475 
   1476 		if (error) {
   1477 			/*
   1478 			 * We can't open the pool, but we still have useful
   1479 			 * information: the state of each vdev after the
   1480 			 * attempted vdev_open().  Return this to the user.
   1481 			 */
   1482 			if (config != NULL && spa->spa_root_vdev != NULL) {
   1483 				spa_config_enter(spa, RW_READER, FTAG);
   1484 				*config = spa_config_generate(spa, NULL, -1ULL,
   1485 				    B_TRUE);
   1486 				spa_config_exit(spa, FTAG);
   1487 			}
   1488 			spa_unload(spa);
   1489 			spa_deactivate(spa);
   1490 			spa->spa_last_open_failed = B_TRUE;
   1491 			if (locked)
   1492 				mutex_exit(&spa_namespace_lock);
   1493 			*spapp = NULL;
   1494 			return (error);
   1495 		} else {
   1496 			zfs_post_ok(spa, NULL);
   1497 			spa->spa_last_open_failed = B_FALSE;
   1498 		}
   1499 
   1500 		loaded = B_TRUE;
   1501 	}
   1502 
   1503 	spa_open_ref(spa, tag);
   1504 
   1505 	/*
   1506 	 * If we just loaded the pool, resilver anything that's out of date.
   1507 	 */
   1508 	if (loaded && (spa_mode & FWRITE))
   1509 		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
   1510 
   1511 	if (locked)
   1512 		mutex_exit(&spa_namespace_lock);
   1513 
   1514 	*spapp = spa;
   1515 
   1516 	if (config != NULL) {
   1517 		spa_config_enter(spa, RW_READER, FTAG);
   1518 		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
   1519 		spa_config_exit(spa, FTAG);
   1520 	}
   1521 
   1522 	return (0);
   1523 }
   1524 
   1525 int
   1526 spa_open(const char *name, spa_t **spapp, void *tag)
   1527 {
   1528 	return (spa_open_common(name, spapp, tag, NULL));
   1529 }
   1530 
   1531 /*
   1532  * Lookup the given spa_t, incrementing the inject count in the process,
   1533  * preventing it from being exported or destroyed.
   1534  */
   1535 spa_t *
   1536 spa_inject_addref(char *name)
   1537 {
   1538 	spa_t *spa;
   1539 
   1540 	mutex_enter(&spa_namespace_lock);
   1541 	if ((spa = spa_lookup(name)) == NULL) {
   1542 		mutex_exit(&spa_namespace_lock);
   1543 		return (NULL);
   1544 	}
   1545 	spa->spa_inject_ref++;
   1546 	mutex_exit(&spa_namespace_lock);
   1547 
   1548 	return (spa);
   1549 }
   1550 
   1551 void
   1552 spa_inject_delref(spa_t *spa)
   1553 {
   1554 	mutex_enter(&spa_namespace_lock);
   1555 	spa->spa_inject_ref--;
   1556 	mutex_exit(&spa_namespace_lock);
   1557 }
   1558 
   1559 /*
   1560  * Add spares device information to the nvlist.
   1561  */
   1562 static void
   1563 spa_add_spares(spa_t *spa, nvlist_t *config)
   1564 {
   1565 	nvlist_t **spares;
   1566 	uint_t i, nspares;
   1567 	nvlist_t *nvroot;
   1568 	uint64_t guid;
   1569 	vdev_stat_t *vs;
   1570 	uint_t vsc;
   1571 	uint64_t pool;
   1572 
   1573 	if (spa->spa_spares.sav_count == 0)
   1574 		return;
   1575 
   1576 	VERIFY(nvlist_lookup_nvlist(config,
   1577 	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
   1578 	VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
   1579 	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
   1580 	if (nspares != 0) {
   1581 		VERIFY(nvlist_add_nvlist_array(nvroot,
   1582 		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
   1583 		VERIFY(nvlist_lookup_nvlist_array(nvroot,
   1584 		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
   1585 
   1586 		/*
   1587 		 * Go through and find any spares which have since been
   1588 		 * repurposed as an active spare.  If this is the case, update
   1589 		 * their status appropriately.
   1590 		 */
   1591 		for (i = 0; i < nspares; i++) {
   1592 			VERIFY(nvlist_lookup_uint64(spares[i],
   1593 			    ZPOOL_CONFIG_GUID, &guid) == 0);
   1594 			if (spa_spare_exists(guid, &pool) && pool != 0ULL) {
   1595 				VERIFY(nvlist_lookup_uint64_array(
   1596 				    spares[i], ZPOOL_CONFIG_STATS,
   1597 				    (uint64_t **)&vs, &vsc) == 0);
   1598 				vs->vs_state = VDEV_STATE_CANT_OPEN;
   1599 				vs->vs_aux = VDEV_AUX_SPARED;
   1600 			}
   1601 		}
   1602 	}
   1603 }
   1604 
   1605 /*
   1606  * Add l2cache device information to the nvlist, including vdev stats.
   1607  */
   1608 static void
   1609 spa_add_l2cache(spa_t *spa, nvlist_t *config)
   1610 {
   1611 	nvlist_t **l2cache;
   1612 	uint_t i, j, nl2cache;
   1613 	nvlist_t *nvroot;
   1614 	uint64_t guid;
   1615 	vdev_t *vd;
   1616 	vdev_stat_t *vs;
   1617 	uint_t vsc;
   1618 
   1619 	if (spa->spa_l2cache.sav_count == 0)
   1620 		return;
   1621 
   1622 	spa_config_enter(spa, RW_READER, FTAG);
   1623 
   1624 	VERIFY(nvlist_lookup_nvlist(config,
   1625 	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
   1626 	VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
   1627 	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
   1628 	if (nl2cache != 0) {
   1629 		VERIFY(nvlist_add_nvlist_array(nvroot,
   1630 		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
   1631 		VERIFY(nvlist_lookup_nvlist_array(nvroot,
   1632 		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
   1633 
   1634 		/*
   1635 		 * Update level 2 cache device stats.
   1636 		 */
   1637 
   1638 		for (i = 0; i < nl2cache; i++) {
   1639 			VERIFY(nvlist_lookup_uint64(l2cache[i],
   1640 			    ZPOOL_CONFIG_GUID, &guid) == 0);
   1641 
   1642 			vd = NULL;
   1643 			for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
   1644 				if (guid ==
   1645 				    spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
   1646 					vd = spa->spa_l2cache.sav_vdevs[j];
   1647 					break;
   1648 				}
   1649 			}
   1650 			ASSERT(vd != NULL);
   1651 
   1652 			VERIFY(nvlist_lookup_uint64_array(l2cache[i],
   1653 			    ZPOOL_CONFIG_STATS, (uint64_t **)&vs, &vsc) == 0);
   1654 			vdev_get_stats(vd, vs);
   1655 		}
   1656 	}
   1657 
   1658 	spa_config_exit(spa, FTAG);
   1659 }
   1660 
   1661 int
   1662 spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
   1663 {
   1664 	int error;
   1665 	spa_t *spa;
   1666 
   1667 	*config = NULL;
   1668 	error = spa_open_common(name, &spa, FTAG, config);
   1669 
   1670 	if (spa && *config != NULL) {
   1671 		VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT,
   1672 		    spa_get_errlog_size(spa)) == 0);
   1673 
   1674 		spa_add_spares(spa, *config);
   1675 		spa_add_l2cache(spa, *config);
   1676 	}
   1677 
   1678 	/*
   1679 	 * We want to get the alternate root even for faulted pools, so we cheat
   1680 	 * and call spa_lookup() directly.
   1681 	 */
   1682 	if (altroot) {
   1683 		if (spa == NULL) {
   1684 			mutex_enter(&spa_namespace_lock);
   1685 			spa = spa_lookup(name);
   1686 			if (spa)
   1687 				spa_altroot(spa, altroot, buflen);
   1688 			else
   1689 				altroot[0] = '\0';
   1690 			spa = NULL;
   1691 			mutex_exit(&spa_namespace_lock);
   1692 		} else {
   1693 			spa_altroot(spa, altroot, buflen);
   1694 		}
   1695 	}
   1696 
   1697 	if (spa != NULL)
   1698 		spa_close(spa, FTAG);
   1699 
   1700 	return (error);
   1701 }
   1702 
   1703 /*
   1704  * Validate that the auxiliary device array is well formed.  We must have an
   1705  * array of nvlists, each which describes a valid leaf vdev.  If this is an
   1706  * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be
   1707  * specified, as long as they are well-formed.
   1708  */
   1709 static int
   1710 spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
   1711     spa_aux_vdev_t *sav, const char *config, uint64_t version,
   1712     vdev_labeltype_t label)
   1713 {
   1714 	nvlist_t **dev;
   1715 	uint_t i, ndev;
   1716 	vdev_t *vd;
   1717 	int error;
   1718 
   1719 	/*
   1720 	 * It's acceptable to have no devs specified.
   1721 	 */
   1722 	if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
   1723 		return (0);
   1724 
   1725 	if (ndev == 0)
   1726 		return (EINVAL);
   1727 
   1728 	/*
   1729 	 * Make sure the pool is formatted with a version that supports this
   1730 	 * device type.
   1731 	 */
   1732 	if (spa_version(spa) < version)
   1733 		return (ENOTSUP);
   1734 
   1735 	/*
   1736 	 * Set the pending device list so we correctly handle device in-use
   1737 	 * checking.
   1738 	 */
   1739 	sav->sav_pending = dev;
   1740 	sav->sav_npending = ndev;
   1741 
   1742 	for (i = 0; i < ndev; i++) {
   1743 		if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
   1744 		    mode)) != 0)
   1745 			goto out;
   1746 
   1747 		if (!vd->vdev_ops->vdev_op_leaf) {
   1748 			vdev_free(vd);
   1749 			error = EINVAL;
   1750 			goto out;
   1751 		}
   1752 
   1753 		/*
   1754 		 * The L2ARC currently only supports disk devices.
   1755 		 */
   1756 		if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) &&
   1757 		    strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) {
   1758 			error = ENOTBLK;
   1759 			goto out;
   1760 		}
   1761 
   1762 		vd->vdev_top = vd;
   1763 
   1764 		if ((error = vdev_open(vd)) == 0 &&
   1765 		    (error = vdev_label_init(vd, crtxg, label)) == 0) {
   1766 			VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
   1767 			    vd->vdev_guid) == 0);
   1768 		}
   1769 
   1770 		vdev_free(vd);
   1771 
   1772 		if (error &&
   1773 		    (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
   1774 			goto out;
   1775 		else
   1776 			error = 0;
   1777 	}
   1778 
   1779 out:
   1780 	sav->sav_pending = NULL;
   1781 	sav->sav_npending = 0;
   1782 	return (error);
   1783 }
   1784 
   1785 static int
   1786 spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
   1787 {
   1788 	int error;
   1789 
   1790 	if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
   1791 	    &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
   1792 	    VDEV_LABEL_SPARE)) != 0) {
   1793 		return (error);
   1794 	}
   1795 
   1796 	return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
   1797 	    &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
   1798 	    VDEV_LABEL_L2CACHE));
   1799 }
   1800 
   1801 static void
   1802 spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
   1803     const char *config)
   1804 {
   1805 	int i;
   1806 
   1807 	if (sav->sav_config != NULL) {
   1808 		nvlist_t **olddevs;
   1809 		uint_t oldndevs;
   1810 		nvlist_t **newdevs;
   1811 
   1812 		/*
   1813 		 * Generate new dev list by concatentating with the
   1814 		 * current dev list.
   1815 		 */
   1816 		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
   1817 		    &olddevs, &oldndevs) == 0);
   1818 
   1819 		newdevs = kmem_alloc(sizeof (void *) *
   1820 		    (ndevs + oldndevs), KM_SLEEP);
   1821 		for (i = 0; i < oldndevs; i++)
   1822 			VERIFY(nvlist_dup(olddevs[i], &newdevs[i],
   1823 			    KM_SLEEP) == 0);
   1824 		for (i = 0; i < ndevs; i++)
   1825 			VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs],
   1826 			    KM_SLEEP) == 0);
   1827 
   1828 		VERIFY(nvlist_remove(sav->sav_config, config,
   1829 		    DATA_TYPE_NVLIST_ARRAY) == 0);
   1830 
   1831 		VERIFY(nvlist_add_nvlist_array(sav->sav_config,
   1832 		    config, newdevs, ndevs + oldndevs) == 0);
   1833 		for (i = 0; i < oldndevs + ndevs; i++)
   1834 			nvlist_free(newdevs[i]);
   1835 		kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
   1836 	} else {
   1837 		/*
   1838 		 * Generate a new dev list.
   1839 		 */
   1840 		VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME,
   1841 		    KM_SLEEP) == 0);
   1842 		VERIFY(nvlist_add_nvlist_array(sav->sav_config, config,
   1843 		    devs, ndevs) == 0);
   1844 	}
   1845 }
   1846 
   1847 /*
   1848  * Stop and drop level 2 ARC devices
   1849  */
   1850 void
   1851 spa_l2cache_drop(spa_t *spa)
   1852 {
   1853 	vdev_t *vd;
   1854 	int i;
   1855 	spa_aux_vdev_t *sav = &spa->spa_l2cache;
   1856 
   1857 	for (i = 0; i < sav->sav_count; i++) {
   1858 		uint64_t pool;
   1859 
   1860 		vd = sav->sav_vdevs[i];
   1861 		ASSERT(vd != NULL);
   1862 
   1863 		if (spa_mode & FWRITE &&
   1864 		    spa_l2cache_exists(vd->vdev_guid, &pool) && pool != 0ULL) {
   1865 			l2arc_remove_vdev(vd);
   1866 		}
   1867 		if (vd->vdev_isl2cache)
   1868 			spa_l2cache_remove(vd);
   1869 		vdev_clear_stats(vd);
   1870 		(void) vdev_close(vd);
   1871 	}
   1872 }
   1873 
   1874 /*
   1875  * Pool Creation
   1876  */
   1877 int
   1878 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
   1879     const char *history_str)
   1880 {
   1881 	spa_t *spa;
   1882 	char *altroot = NULL;
   1883 	vdev_t *rvd;
   1884 	dsl_pool_t *dp;
   1885 	dmu_tx_t *tx;
   1886 	int c, error = 0;
   1887 	uint64_t txg = TXG_INITIAL;
   1888 	nvlist_t **spares, **l2cache;
   1889 	uint_t nspares, nl2cache;
   1890 	uint64_t version;
   1891 
   1892 	/*
   1893 	 * If this pool already exists, return failure.
   1894 	 */
   1895 	mutex_enter(&spa_namespace_lock);
   1896 	if (spa_lookup(pool) != NULL) {
   1897 		mutex_exit(&spa_namespace_lock);
   1898 		return (EEXIST);
   1899 	}
   1900 
   1901 	/*
   1902 	 * Allocate a new spa_t structure.
   1903 	 */
   1904 	(void) nvlist_lookup_string(props,
   1905 	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
   1906 	spa = spa_add(pool, altroot);
   1907 	spa_activate(spa);
   1908 
   1909 	spa->spa_uberblock.ub_txg = txg - 1;
   1910 
   1911 	if (props && (error = spa_prop_validate(spa, props))) {
   1912 		spa_unload(spa);
   1913 		spa_deactivate(spa);
   1914 		spa_remove(spa);
   1915 		return (error);
   1916 	}
   1917 
   1918 	if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION),
   1919 	    &version) != 0)
   1920 		version = SPA_VERSION;
   1921 	ASSERT(version <= SPA_VERSION);
   1922 	spa->spa_uberblock.ub_version = version;
   1923 	spa->spa_ubsync = spa->spa_uberblock;
   1924 
   1925 	/*
   1926 	 * Create the root vdev.
   1927 	 */
   1928 	spa_config_enter(spa, RW_WRITER, FTAG);
   1929 
   1930 	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);
   1931 
   1932 	ASSERT(error != 0 || rvd != NULL);
   1933 	ASSERT(error != 0 || spa->spa_root_vdev == rvd);
   1934 
   1935 	if (error == 0 && rvd->vdev_children == 0)
   1936 		error = EINVAL;
   1937 
   1938 	if (error == 0 &&
   1939 	    (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
   1940 	    (error = spa_validate_aux(spa, nvroot, txg,
   1941 	    VDEV_ALLOC_ADD)) == 0) {
   1942 		for (c = 0; c < rvd->vdev_children; c++)
   1943 			vdev_init(rvd->vdev_child[c], txg);
   1944 		vdev_config_dirty(rvd);
   1945 	}
   1946 
   1947 	spa_config_exit(spa, FTAG);
   1948 
   1949 	if (error != 0) {
   1950 		spa_unload(spa);
   1951 		spa_deactivate(spa);
   1952 		spa_remove(spa);
   1953 		mutex_exit(&spa_namespace_lock);
   1954 		return (error);
   1955 	}
   1956 
   1957 	/*
   1958 	 * Get the list of spares, if specified.
   1959 	 */
   1960 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
   1961 	    &spares, &nspares) == 0) {
   1962 		VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME,
   1963 		    KM_SLEEP) == 0);
   1964 		VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
   1965 		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
   1966 		spa_config_enter(spa, RW_WRITER, FTAG);
   1967 		spa_load_spares(spa);
   1968 		spa_config_exit(spa, FTAG);
   1969 		spa->spa_spares.sav_sync = B_TRUE;
   1970 	}
   1971 
   1972 	/*
   1973 	 * Get the list of level 2 cache devices, if specified.
   1974 	 */
   1975 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
   1976 	    &l2cache, &nl2cache) == 0) {
   1977 		VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
   1978 		    NV_UNIQUE_NAME, KM_SLEEP) == 0);
   1979 		VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
   1980 		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
   1981 		spa_config_enter(spa, RW_WRITER, FTAG);
   1982 		spa_load_l2cache(spa);
   1983 		spa_config_exit(spa, FTAG);
   1984 		spa->spa_l2cache.sav_sync = B_TRUE;
   1985 	}
   1986 
   1987 	spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg);
   1988 	spa->spa_meta_objset = dp->dp_meta_objset;
   1989 
   1990 	tx = dmu_tx_create_assigned(dp, txg);
   1991 
   1992 	/*
   1993 	 * Create the pool config object.
   1994 	 */
   1995 	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
   1996 	    DMU_OT_PACKED_NVLIST, 1 << 14,
   1997 	    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
   1998 
   1999 	if (zap_add(spa->spa_meta_objset,
   2000 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
   2001 	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
   2002 		cmn_err(CE_PANIC, "failed to add pool config");
   2003 	}
   2004 
   2005 	/* Newly created pools with the right version are always deflated. */
   2006 	if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
   2007 		spa->spa_deflate = TRUE;
   2008 		if (zap_add(spa->spa_meta_objset,
   2009 		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
   2010 		    sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
   2011 			cmn_err(CE_PANIC, "failed to add deflate");
   2012 		}
   2013 	}
   2014 
   2015 	/*
   2016 	 * Create the deferred-free bplist object.  Turn off compression
   2017 	 * because sync-to-convergence takes longer if the blocksize
   2018 	 * keeps changing.
   2019 	 */
   2020 	spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset,
   2021 	    1 << 14, tx);
   2022 	dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj,
   2023 	    ZIO_COMPRESS_OFF, tx);
   2024 
   2025 	if (zap_add(spa->spa_meta_objset,
   2026 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
   2027 	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) {
   2028 		cmn_err(CE_PANIC, "failed to add bplist");
   2029 	}
   2030 
   2031 	/*
   2032 	 * Create the pool's history object.
   2033 	 */
   2034 	if (version >= SPA_VERSION_ZPOOL_HISTORY)
   2035 		spa_history_create_obj(spa, tx);
   2036 
   2037 	/*
   2038 	 * Set pool properties.
   2039 	 */
   2040 	spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
   2041 	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
   2042 	spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
   2043 	if (props)
   2044 		spa_sync_props(spa, props, CRED(), tx);
   2045 
   2046 	dmu_tx_commit(tx);
   2047 
   2048 	spa->spa_sync_on = B_TRUE;
   2049 	txg_sync_start(spa->spa_dsl_pool);
   2050 
   2051 	/*
   2052 	 * We explicitly wait for the first transaction to complete so that our
   2053 	 * bean counters are appropriately updated.
   2054 	 */
   2055 	txg_wait_synced(spa->spa_dsl_pool, txg);
   2056 
   2057 	spa_config_sync();
   2058 
   2059 	if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL)
   2060 		(void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE);
   2061 
   2062 	mutex_exit(&spa_namespace_lock);
   2063 
   2064 	return (0);
   2065 }
   2066 
   2067 /*
   2068  * Import the given pool into the system.  We set up the necessary spa_t and
   2069  * then call spa_load() to do the dirty work.
   2070  */
   2071 int
   2072 spa_import(const char *pool, nvlist_t *config, nvlist_t *props)
   2073 {
   2074 	spa_t *spa;
   2075 	char *altroot = NULL;
   2076 	int error;
   2077 	nvlist_t *nvroot;
   2078 	nvlist_t **spares, **l2cache;
   2079 	uint_t nspares, nl2cache;
   2080 
   2081 	/*
   2082 	 * If a pool with this name exists, return failure.
   2083 	 */
   2084 	mutex_enter(&spa_namespace_lock);
   2085 	if (spa_lookup(pool) != NULL) {
   2086 		mutex_exit(&spa_namespace_lock);
   2087 		return (EEXIST);
   2088 	}
   2089 
   2090 	/*
   2091 	 * Create and initialize the spa structure.
   2092 	 */
   2093 	(void) nvlist_lookup_string(props,
   2094 	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
   2095 	spa = spa_add(pool, altroot);
   2096 	spa_activate(spa);
   2097 
   2098 	/*
   2099 	 * Pass off the heavy lifting to spa_load().
   2100 	 * Pass TRUE for mosconfig because the user-supplied config
   2101 	 * is actually the one to trust when doing an import.
   2102 	 */
   2103 	error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE);
   2104 
   2105 	spa_config_enter(spa, RW_WRITER, FTAG);
   2106 	/*
   2107 	 * Toss any existing sparelist, as it doesn't have any validity anymore,
   2108 	 * and conflicts with spa_has_spare().
   2109 	 */
   2110 	if (spa->spa_spares.sav_config) {
   2111 		nvlist_free(spa->spa_spares.sav_config);
   2112 		spa->spa_spares.sav_config = NULL;
   2113 		spa_load_spares(spa);
   2114 	}
   2115 	if (spa->spa_l2cache.sav_config) {
   2116 		nvlist_free(spa->spa_l2cache.sav_config);
   2117 		spa->spa_l2cache.sav_config = NULL;
   2118 		spa_load_l2cache(spa);
   2119 	}
   2120 
   2121 	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
   2122 	    &nvroot) == 0);
   2123 	if (error == 0)
   2124 		error = spa_validate_aux(spa, nvroot, -1ULL, VDEV_ALLOC_SPARE);
   2125 	if (error == 0)
   2126 		error = spa_validate_aux(spa, nvroot, -1ULL,
   2127 		    VDEV_ALLOC_L2CACHE);
   2128 	spa_config_exit(spa, FTAG);
   2129 
   2130 	if (error != 0 || (props && (error = spa_prop_set(spa, props)))) {
   2131 		spa_unload(spa);
   2132 		spa_deactivate(spa);
   2133 		spa_remove(spa);
   2134 		mutex_exit(&spa_namespace_lock);
   2135 		return (error);
   2136 	}
   2137 
   2138 	/*
   2139 	 * Override any spares and level 2 cache devices as specified by
   2140 	 * the user, as these may have correct device names/devids, etc.
   2141 	 */
   2142 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
   2143 	    &spares, &nspares) == 0) {
   2144 		if (spa->spa_spares.sav_config)
   2145 			VERIFY(nvlist_remove(spa->spa_spares.sav_config,
   2146 			    ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
   2147 		else
   2148 			VERIFY(nvlist_alloc(&spa->spa_spares.sav_config,
   2149 			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
   2150 		VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
   2151 		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
   2152 		spa_config_enter(spa, RW_WRITER, FTAG);
   2153 		spa_load_spares(spa);
   2154 		spa_config_exit(spa, FTAG);
   2155 		spa->spa_spares.sav_sync = B_TRUE;
   2156 	}
   2157 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
   2158 	    &l2cache, &nl2cache) == 0) {
   2159 		if (spa->spa_l2cache.sav_config)
   2160 			VERIFY(nvlist_remove(spa->spa_l2cache.sav_config,
   2161 			    ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0);
   2162 		else
   2163 			VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
   2164 			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
   2165 		VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
   2166 		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
   2167 		spa_config_enter(spa, RW_WRITER, FTAG);
   2168 		spa_load_l2cache(spa);
   2169 		spa_config_exit(spa, FTAG);
   2170 		spa->spa_l2cache.sav_sync = B_TRUE;
   2171 	}
   2172 
   2173 	/*
   2174 	 * Update the config cache to include the newly-imported pool.
   2175 	 */
   2176 	if (spa_mode & FWRITE)
   2177 		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
   2178 
   2179 	/*
   2180 	 * Resilver anything that's out of date.
   2181 	 */
   2182 	if (spa_mode & FWRITE)
   2183 		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
   2184 
   2185 	mutex_exit(&spa_namespace_lock);
   2186 
   2187 	return (0);
   2188 }
   2189 
   2190 /*
   2191  * This (illegal) pool name is used when temporarily importing a spa_t in order
   2192  * to get the vdev stats associated with the imported devices.
   2193  */
   2194 #define	TRYIMPORT_NAME	"$import"
   2195 
   2196 nvlist_t *
   2197 spa_tryimport(nvlist_t *tryconfig)
   2198 {
   2199 	nvlist_t *config = NULL;
   2200 	char *poolname;
   2201 	spa_t *spa;
   2202 	uint64_t state;
   2203 
   2204 	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
   2205 		return (NULL);
   2206 
   2207 	if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
   2208 		return (NULL);
   2209 
   2210 	/*
   2211 	 * Create and initialize the spa structure.
   2212 	 */
   2213 	mutex_enter(&spa_namespace_lock);
   2214 	spa = spa_add(TRYIMPORT_NAME, NULL);
   2215 	spa_activate(spa);
   2216 
   2217 	/*
   2218 	 * Pass off the heavy lifting to spa_load().
   2219 	 * Pass TRUE for mosconfig because the user-supplied config
   2220 	 * is actually the one to trust when doing an import.
   2221 	 */
   2222 	(void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE);
   2223 
   2224 	/*
   2225 	 * If 'tryconfig' was at least parsable, return the current config.
   2226 	 */
   2227 	if (spa->spa_root_vdev != NULL) {
   2228 		spa_config_enter(spa, RW_READER, FTAG);
   2229 		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
   2230 		spa_config_exit(spa, FTAG);
   2231 		VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
   2232 		    poolname) == 0);
   2233 		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
   2234 		    state) == 0);
   2235 		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
   2236 		    spa->spa_uberblock.ub_timestamp) == 0);
   2237 
   2238 		/*
   2239 		 * Add the list of hot spares and level 2 cache devices.
   2240 		 */
   2241 		spa_add_spares(spa, config);
   2242 		spa_add_l2cache(spa, config);
   2243 	}
   2244 
   2245 	spa_unload(spa);
   2246 	spa_deactivate(spa);
   2247 	spa_remove(spa);
   2248 	mutex_exit(&spa_namespace_lock);
   2249 
   2250 	return (config);
   2251 }
   2252 
   2253 /*
   2254  * Pool export/destroy
   2255  *
   2256  * The act of destroying or exporting a pool is very simple.  We make sure there
   2257  * is no more pending I/O and any references to the pool are gone.  Then, we
   2258  * update the pool state and sync all the labels to disk, removing the
   2259  * configuration from the cache afterwards.
   2260  */
   2261 static int
   2262 spa_export_common(char *pool, int new_state, nvlist_t **oldconfig)
   2263 {
   2264 	spa_t *spa;
   2265 
   2266 	if (oldconfig)
   2267 		*oldconfig = NULL;
   2268 
   2269 	if (!(spa_mode & FWRITE))
   2270 		return (EROFS);
   2271 
   2272 	mutex_enter(&spa_namespace_lock);
   2273 	if ((spa = spa_lookup(pool)) == NULL) {
   2274 		mutex_exit(&spa_namespace_lock);
   2275 		return (ENOENT);
   2276 	}
   2277 
   2278 	/*
   2279 	 * Put a hold on the pool, drop the namespace lock, stop async tasks,
   2280 	 * reacquire the namespace lock, and see if we can export.
   2281 	 */
   2282 	spa_open_ref(spa, FTAG);
   2283 	mutex_exit(&spa_namespace_lock);
   2284 	spa_async_suspend(spa);
   2285 	mutex_enter(&spa_namespace_lock);
   2286 	spa_close(spa, FTAG);
   2287 
   2288 	/*
   2289 	 * The pool will be in core if it's openable,
   2290 	 * in which case we can modify its state.
   2291 	 */
   2292 	if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
   2293 		/*
   2294 		 * Objsets may be open only because they're dirty, so we
   2295 		 * have to force it to sync before checking spa_refcnt.
   2296 		 */
   2297 		spa_scrub_suspend(spa);
   2298 		txg_wait_synced(spa->spa_dsl_pool, 0);
   2299 
   2300 		/*
   2301 		 * A pool cannot be exported or destroyed if there are active
   2302 		 * references.  If we are resetting a pool, allow references by
   2303 		 * fault injection handlers.
   2304 		 */
   2305 		if (!spa_refcount_zero(spa) ||
   2306 		    (spa->spa_inject_ref != 0 &&
   2307 		    new_state != POOL_STATE_UNINITIALIZED)) {
   2308 			spa_scrub_resume(spa);
   2309 			spa_async_resume(spa);
   2310 			mutex_exit(&spa_namespace_lock);
   2311 			return (EBUSY);
   2312 		}
   2313 
   2314 		spa_scrub_resume(spa);
   2315 		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
   2316 
   2317 		/*
   2318 		 * We want this to be reflected on every label,
   2319 		 * so mark them all dirty.  spa_unload() will do the
   2320 		 * final sync that pushes these changes out.
   2321 		 */
   2322 		if (new_state != POOL_STATE_UNINITIALIZED) {
   2323 			spa_config_enter(spa, RW_WRITER, FTAG);
   2324 			spa->spa_state = new_state;
   2325 			spa->spa_final_txg = spa_last_synced_txg(spa) + 1;
   2326 			vdev_config_dirty(spa->spa_root_vdev);
   2327 			spa_config_exit(spa, FTAG);
   2328 		}
   2329 	}
   2330 
   2331 	spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY);
   2332 
   2333 	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
   2334 		spa_unload(spa);
   2335 		spa_deactivate(spa);
   2336 	}
   2337 
   2338 	if (oldconfig && spa->spa_config)
   2339 		VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);
   2340 
   2341 	if (new_state != POOL_STATE_UNINITIALIZED) {
   2342 		spa_config_check(spa->spa_config_dir,
   2343 		    spa->spa_config_file);
   2344 		spa_remove(spa);
   2345 		spa_config_sync();
   2346 	}
   2347 	mutex_exit(&spa_namespace_lock);
   2348 
   2349 	return (0);
   2350 }
   2351 
   2352 /*
   2353  * Destroy a storage pool.
   2354  */
   2355 int
   2356 spa_destroy(char *pool)
   2357 {
   2358 	return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL));
   2359 }
   2360 
   2361 /*
   2362  * Export a storage pool.
   2363  */
   2364 int
   2365 spa_export(char *pool, nvlist_t **oldconfig)
   2366 {
   2367 	return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig));
   2368 }
   2369 
   2370 /*
   2371  * Similar to spa_export(), this unloads the spa_t without actually removing it
   2372  * from the namespace in any way.
   2373  */
   2374 int
   2375 spa_reset(char *pool)
   2376 {
   2377 	return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL));
   2378 }
   2379 
   2380 
   2381 /*
   2382  * ==========================================================================
   2383  * Device manipulation
   2384  * ==========================================================================
   2385  */
   2386 
   2387 /*
   2388  * Add a device to a storage pool.
   2389  */
   2390 int
   2391 spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
   2392 {
   2393 	uint64_t txg;
   2394 	int c, error;
   2395 	vdev_t *rvd = spa->spa_root_vdev;
   2396 	vdev_t *vd, *tvd;
   2397 	nvlist_t **spares, **l2cache;
   2398 	uint_t nspares, nl2cache;
   2399 
   2400 	txg = spa_vdev_enter(spa);
   2401 
   2402 	if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
   2403 	    VDEV_ALLOC_ADD)) != 0)
   2404 		return (spa_vdev_exit(spa, NULL, txg, error));
   2405 
   2406 	spa->spa_pending_vdev = vd;
   2407 
   2408 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
   2409 	    &nspares) != 0)
   2410 		nspares = 0;
   2411 
   2412 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
   2413 	    &nl2cache) != 0)
   2414 		nl2cache = 0;
   2415 
   2416 	if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) {
   2417 		spa->spa_pending_vdev = NULL;
   2418 		return (spa_vdev_exit(spa, vd, txg, EINVAL));
   2419 	}
   2420 
   2421 	if (vd->vdev_children != 0) {
   2422 		if ((error = vdev_create(vd, txg, B_FALSE)) != 0) {
   2423 			spa->spa_pending_vdev = NULL;
   2424 			return (spa_vdev_exit(spa, vd, txg, error));
   2425 		}
   2426 	}
   2427 
   2428 	/*
   2429 	 * We must validate the spares and l2cache devices after checking the
   2430 	 * children.  Otherwise, vdev_inuse() will blindly overwrite the spare.
   2431 	 */
   2432 	if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) {
   2433 		spa->spa_pending_vdev = NULL;
   2434 		return (spa_vdev_exit(spa, vd, txg, error));
   2435 	}
   2436 
   2437 	spa->spa_pending_vdev = NULL;
   2438 
   2439 	/*
   2440 	 * Transfer each new top-level vdev from vd to rvd.
   2441 	 */
   2442 	for (c = 0; c < vd->vdev_children; c++) {
   2443 		tvd = vd->vdev_child[c];
   2444 		vdev_remove_child(vd, tvd);
   2445 		tvd->vdev_id = rvd->vdev_children;
   2446 		vdev_add_child(rvd, tvd);
   2447 		vdev_config_dirty(tvd);
   2448 	}
   2449 
   2450 	if (nspares != 0) {
   2451 		spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
   2452 		    ZPOOL_CONFIG_SPARES);
   2453 		spa_load_spares(spa);
   2454 		spa->spa_spares.sav_sync = B_TRUE;
   2455 	}
   2456 
   2457 	if (nl2cache != 0) {
   2458 		spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
   2459 		    ZPOOL_CONFIG_L2CACHE);
   2460 		spa_load_l2cache(spa);
   2461 		spa->spa_l2cache.sav_sync = B_TRUE;
   2462 	}
   2463 
   2464 	/*
   2465 	 * We have to be careful when adding new vdevs to an existing pool.
   2466 	 * If other threads start allocating from these vdevs before we
   2467 	 * sync the config cache, and we lose power, then upon reboot we may
   2468 	 * fail to open the pool because there are DVAs that the config cache
   2469 	 * can't translate.  Therefore, we first add the vdevs without
   2470 	 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
   2471 	 * and then let spa_config_update() initialize the new metaslabs.
   2472 	 *
   2473 	 * spa_load() checks for added-but-not-initialized vdevs, so that
   2474 	 * if we lose power at any point in this sequence, the remaining
   2475 	 * steps will be completed the next time we load the pool.
   2476 	 */
   2477 	(void) spa_vdev_exit(spa, vd, txg, 0);
   2478 
   2479 	mutex_enter(&spa_namespace_lock);
   2480 	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
   2481 	mutex_exit(&spa_namespace_lock);
   2482 
   2483 	return (0);
   2484 }
   2485 
   2486 /*
   2487  * Attach a device to a mirror.  The arguments are the path to any device
   2488  * in the mirror, and the nvroot for the new device.  If the path specifies
   2489  * a device that is not mirrored, we automatically insert the mirror vdev.
   2490  *
   2491  * If 'replacing' is specified, the new device is intended to replace the
   2492  * existing device; in this case the two devices are made into their own
   2493  * mirror using the 'replacing' vdev, which is functionally identical to
   2494  * the mirror vdev (it actually reuses all the same ops) but has a few
   2495  * extra rules: you can't attach to it after it's been created, and upon
   2496  * completion of resilvering, the first disk (the one being replaced)
   2497  * is automatically detached.
   2498  */
   2499 int
   2500 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
   2501 {
   2502 	uint64_t txg, open_txg;
   2503 	int error;
   2504 	vdev_t *rvd = spa->spa_root_vdev;
   2505 	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
   2506 	vdev_ops_t *pvops;
   2507 	int is_log;
   2508 
   2509 	txg = spa_vdev_enter(spa);
   2510 
   2511 	oldvd = vdev_lookup_by_guid(rvd, guid);
   2512 
   2513 	if (oldvd == NULL)
   2514 		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
   2515 
   2516 	if (!oldvd->vdev_ops->vdev_op_leaf)
   2517 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
   2518 
   2519 	pvd = oldvd->vdev_parent;
   2520 
   2521 	if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
   2522 	    VDEV_ALLOC_ADD)) != 0)
   2523 		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
   2524 
   2525 	if (newrootvd->vdev_children != 1)
   2526 		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
   2527 
   2528 	newvd = newrootvd->vdev_child[0];
   2529 
   2530 	if (!newvd->vdev_ops->vdev_op_leaf)
   2531 		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
   2532 
   2533 	if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
   2534 		return (spa_vdev_exit(spa, newrootvd, txg, error));
   2535 
   2536 	/*
   2537 	 * Spares can't replace logs
   2538 	 */
   2539 	is_log = oldvd->vdev_islog;
   2540 	if (is_log && newvd->vdev_isspare)
   2541 		return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
   2542 
   2543 	if (!replacing) {
   2544 		/*
   2545 		 * For attach, the only allowable parent is a mirror or the root
   2546 		 * vdev.
   2547 		 */
   2548 		if (pvd->vdev_ops != &vdev_mirror_ops &&
   2549 		    pvd->vdev_ops != &vdev_root_ops)
   2550 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
   2551 
   2552 		pvops = &vdev_mirror_ops;
   2553 	} else {
   2554 		/*
   2555 		 * Active hot spares can only be replaced by inactive hot
   2556 		 * spares.
   2557 		 */
   2558 		if (pvd->vdev_ops == &vdev_spare_ops &&
   2559 		    pvd->vdev_child[1] == oldvd &&
   2560 		    !spa_has_spare(spa, newvd->vdev_guid))
   2561 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
   2562 
   2563 		/*
   2564 		 * If the source is a hot spare, and the parent isn't already a
   2565 		 * spare, then we want to create a new hot spare.  Otherwise, we
   2566 		 * want to create a replacing vdev.  The user is not allowed to
   2567 		 * attach to a spared vdev child unless the 'isspare' state is
   2568 		 * the same (spare replaces spare, non-spare replaces
   2569 		 * non-spare).
   2570 		 */
   2571 		if (pvd->vdev_ops == &vdev_replacing_ops)
   2572 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
   2573 		else if (pvd->vdev_ops == &vdev_spare_ops &&
   2574 		    newvd->vdev_isspare != oldvd->vdev_isspare)
   2575 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
   2576 		else if (pvd->vdev_ops != &vdev_spare_ops &&
   2577 		    newvd->vdev_isspare)
   2578 			pvops = &vdev_spare_ops;
   2579 		else
   2580 			pvops = &vdev_replacing_ops;
   2581 	}
   2582 
   2583 	/*
   2584 	 * Compare the new device size with the replaceable/attachable
   2585 	 * device size.
   2586 	 */
   2587 	if (newvd->vdev_psize < vdev_get_rsize(oldvd))
   2588 		return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
   2589 
   2590 	/*
   2591 	 * The new device cannot have a higher alignment requirement
   2592 	 * than the top-level vdev.
   2593 	 */
   2594 	if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
   2595 		return (spa_vdev_exit(spa, newrootvd, txg, EDOM));
   2596 
   2597 	/*
   2598 	 * If this is an in-place replacement, update oldvd's path and devid
   2599 	 * to make it distinguishable from newvd, and unopenable from now on.
   2600 	 */
   2601 	if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
   2602 		spa_strfree(oldvd->vdev_path);
   2603 		oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
   2604 		    KM_SLEEP);
   2605 		(void) sprintf(oldvd->vdev_path, "%s/%s",
   2606 		    newvd->vdev_path, "old");
   2607 		if (oldvd->vdev_devid != NULL) {
   2608 			spa_strfree(oldvd->vdev_devid);
   2609 			oldvd->vdev_devid = NULL;
   2610 		}
   2611 	}
   2612 
   2613 	/*
   2614 	 * If the parent is not a mirror, or if we're replacing, insert the new
   2615 	 * mirror/replacing/spare vdev above oldvd.
   2616 	 */
   2617 	if (pvd->vdev_ops != pvops)
   2618 		pvd = vdev_add_parent(oldvd, pvops);
   2619 
   2620 	ASSERT(pvd->vdev_top->vdev_parent == rvd);
   2621 	ASSERT(pvd->vdev_ops == pvops);
   2622 	ASSERT(oldvd->vdev_parent == pvd);
   2623 
   2624 	/*
   2625 	 * Extract the new device from its root and add it to pvd.
   2626 	 */
   2627 	vdev_remove_child(newrootvd, newvd);
   2628 	newvd->vdev_id = pvd->vdev_children;
   2629 	vdev_add_child(pvd, newvd);
   2630 
   2631 	/*
   2632 	 * If newvd is smaller than oldvd, but larger than its rsize,
   2633 	 * the addition of newvd may have decreased our parent's asize.
   2634 	 */
   2635 	pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize);
   2636 
   2637 	tvd = newvd->vdev_top;
   2638 	ASSERT(pvd->vdev_top == tvd);
   2639 	ASSERT(tvd->vdev_parent == rvd);
   2640 
   2641 	vdev_config_dirty(tvd);
   2642 
   2643 	/*
   2644 	 * Set newvd's DTL to [TXG_INITIAL, open_txg].  It will propagate
   2645 	 * upward when spa_vdev_exit() calls vdev_dtl_reassess().
   2646 	 */
   2647 	open_txg = txg + TXG_CONCURRENT_STATES - 1;
   2648 
   2649 	mutex_enter(&newvd->vdev_dtl_lock);
   2650 	space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL,
   2651 	    open_txg - TXG_INITIAL + 1);
   2652 	mutex_exit(&newvd->vdev_dtl_lock);
   2653 
   2654 	if (newvd->vdev_isspare)
   2655 		spa_spare_activate(newvd);
   2656 
   2657 	/*
   2658 	 * Mark newvd's DTL dirty in this txg.
   2659 	 */
   2660 	vdev_dirty(tvd, VDD_DTL, newvd, txg);
   2661 
   2662 	(void) spa_vdev_exit(spa, newrootvd, open_txg, 0);
   2663 
   2664 	/*
   2665 	 * Kick off a resilver to update newvd.  We need to grab the namespace
   2666 	 * lock because spa_scrub() needs to post a sysevent with the pool name.
   2667 	 */
   2668 	mutex_enter(&spa_namespace_lock);
   2669 	VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
   2670 	mutex_exit(&spa_namespace_lock);
   2671 
   2672 	return (0);
   2673 }
   2674 
   2675 /*
   2676  * Detach a device from a mirror or replacing vdev.
   2677  * If 'replace_done' is specified, only detach if the parent
   2678  * is a replacing vdev.
   2679  */
   2680 int
   2681 spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
   2682 {
   2683 	uint64_t txg;
   2684 	int c, t, error;
   2685 	vdev_t *rvd = spa->spa_root_vdev;
   2686 	vdev_t *vd, *pvd, *cvd, *tvd;
   2687 	boolean_t unspare = B_FALSE;
   2688 	uint64_t unspare_guid;
   2689 
   2690 	txg = spa_vdev_enter(spa);
   2691 
   2692 	vd = vdev_lookup_by_guid(rvd, guid);
   2693 
   2694 	if (vd == NULL)
   2695 		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
   2696 
   2697 	if (!vd->vdev_ops->vdev_op_leaf)
   2698 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
   2699 
   2700 	pvd = vd->vdev_parent;
   2701 
   2702 	/*
   2703 	 * If replace_done is specified, only remove this device if it's
   2704 	 * the first child of a replacing vdev.  For the 'spare' vdev, either
   2705 	 * disk can be removed.
   2706 	 */
   2707 	if (replace_done) {
   2708 		if (pvd->vdev_ops == &vdev_replacing_ops) {
   2709 			if (vd->vdev_id != 0)
   2710 				return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
   2711 		} else if (pvd->vdev_ops != &vdev_spare_ops) {
   2712 			return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
   2713 		}
   2714 	}
   2715 
   2716 	ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
   2717 	    spa_version(spa) >= SPA_VERSION_SPARES);
   2718 
   2719 	/*
   2720 	 * Only mirror, replacing, and spare vdevs support detach.
   2721 	 */
   2722 	if (pvd->vdev_ops != &vdev_replacing_ops &&
   2723 	    pvd->vdev_ops != &vdev_mirror_ops &&
   2724 	    pvd->vdev_ops != &vdev_spare_ops)
   2725 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
   2726 
   2727 	/*
   2728 	 * If there's only one replica, you can't detach it.
   2729 	 */
   2730 	if (pvd->vdev_children <= 1)
   2731 		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
   2732 
   2733 	/*
   2734 	 * If all siblings have non-empty DTLs, this device may have the only
   2735 	 * valid copy of the data, which means we cannot safely detach it.
   2736 	 *
   2737 	 * XXX -- as in the vdev_offline() case, we really want a more
   2738 	 * precise DTL check.
   2739 	 */
   2740 	for (c = 0; c < pvd->vdev_children; c++) {
   2741 		uint64_t dirty;
   2742 
   2743 		cvd = pvd->vdev_child[c];
   2744 		if (cvd == vd)
   2745 			continue;
   2746 		if (vdev_is_dead(cvd))
   2747 			continue;
   2748 		mutex_enter(&cvd->vdev_dtl_lock);
   2749 		dirty = cvd->vdev_dtl_map.sm_space |
   2750 		    cvd->vdev_dtl_scrub.sm_space;
   2751 		mutex_exit(&cvd->vdev_dtl_lock);
   2752 		if (!dirty)
   2753 			break;
   2754 	}
   2755 
   2756 	/*
   2757 	 * If we are a replacing or spare vdev, then we can always detach the
   2758 	 * latter child, as that is how one cancels the operation.
   2759 	 */
   2760 	if ((pvd->vdev_ops == &vdev_mirror_ops || vd->vdev_id != 1) &&
   2761 	    c == pvd->vdev_children)
   2762 		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
   2763 
   2764 	/*
   2765 	 * If we are detaching the original disk from a spare, then it implies
   2766 	 * that the spare should become a real disk, and be removed from the
   2767 	 * active spare list for the pool.
   2768 	 */
   2769 	if (pvd->vdev_ops == &vdev_spare_ops &&
   2770 	    vd->vdev_id == 0)
   2771 		unspare = B_TRUE;
   2772 
   2773 	/*
   2774 	 * Erase the disk labels so the disk can be used for other things.
   2775 	 * This must be done after all other error cases are handled,
   2776 	 * but before we disembowel vd (so we can still do I/O to it).
   2777 	 * But if we can't do it, don't treat the error as fatal --
   2778 	 * it may be that the unwritability of the disk is the reason
   2779 	 * it's being detached!
   2780 	 */
   2781 	error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
   2782 
   2783 	/*
   2784 	 * Remove vd from its parent and compact the parent's children.
   2785 	 */
   2786 	vdev_remove_child(pvd, vd);
   2787 	vdev_compact_children(pvd);
   2788 
   2789 	/*
   2790 	 * Remember one of the remaining children so we can get tvd below.
   2791 	 */
   2792 	cvd = pvd->vdev_child[0];
   2793 
   2794 	/*
   2795 	 * If we need to remove the remaining child from the list of hot spares,
   2796 	 * do it now, marking the vdev as no longer a spare in the process.  We
   2797 	 * must do this before vdev_remove_parent(), because that can change the
   2798 	 * GUID if it creates a new toplevel GUID.
   2799 	 */
   2800 	if (unspare) {
   2801 		ASSERT(cvd->vdev_isspare);
   2802 		spa_spare_remove(cvd);
   2803 		unspare_guid = cvd->vdev_guid;
   2804 	}
   2805 
   2806 	/*
   2807 	 * If the parent mirror/replacing vdev only has one child,
   2808 	 * the parent is no longer needed.  Remove it from the tree.
   2809 	 */
   2810 	if (pvd->vdev_children == 1)
   2811 		vdev_remove_parent(cvd);
   2812 
   2813 	/*
   2814 	 * We don't set tvd until now because the parent we just removed
   2815 	 * may have been the previous top-level vdev.
   2816 	 */
   2817 	tvd = cvd->vdev_top;
   2818 	ASSERT(tvd->vdev_parent == rvd);
   2819 
   2820 	/*
   2821 	 * Reevaluate the parent vdev state.
   2822 	 */
   2823 	vdev_propagate_state(cvd);
   2824 
   2825 	/*
   2826 	 * If the device we just detached was smaller than the others, it may be
   2827 	 * possible to add metaslabs (i.e. grow the pool).  vdev_metaslab_init()
   2828 	 * can't fail because the existing metaslabs are already in core, so
   2829 	 * there's nothing to read from disk.
   2830 	 */
   2831 	VERIFY(vdev_metaslab_init(tvd, txg) == 0);
   2832 
   2833 	vdev_config_dirty(tvd);
   2834 
   2835 	/*
   2836 	 * Mark vd's DTL as dirty in this txg.  vdev_dtl_sync() will see that
   2837 	 * vd->vdev_detached is set and free vd's DTL object in syncing context.
   2838 	 * But first make sure we're not on any *other* txg's DTL list, to
   2839 	 * prevent vd from being accessed after it's freed.
   2840 	 */
   2841 	for (t = 0; t < TXG_SIZE; t++)
   2842 		(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
   2843 	vd->vdev_detached = B_TRUE;
   2844 	vdev_dirty(tvd, VDD_DTL, vd, txg);
   2845 
   2846 	spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
   2847 
   2848 	error = spa_vdev_exit(spa, vd, txg, 0);
   2849 
   2850 	/*
   2851 	 * If this was the removal of the original device in a hot spare vdev,
   2852 	 * then we want to go through and remove the device from the hot spare
   2853 	 * list of every other pool.
   2854 	 */
   2855 	if (unspare) {
   2856 		spa = NULL;
   2857 		mutex_enter(&spa_namespace_lock);
   2858 		while ((spa = spa_next(spa)) != NULL) {
   2859 			if (spa->spa_state != POOL_STATE_ACTIVE)
   2860 				continue;
   2861 
   2862 			(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
   2863 		}
   2864 		mutex_exit(&spa_namespace_lock);
   2865 	}
   2866 
   2867 	return (error);
   2868 }
   2869 
   2870 /*
   2871  * Remove a spares vdev from the nvlist config.
   2872  */
   2873 static int
   2874 spa_remove_spares(spa_aux_vdev_t *sav, uint64_t guid, boolean_t unspare,
   2875     nvlist_t **spares, int nspares, vdev_t *vd)
   2876 {
   2877 	nvlist_t *nv, **newspares;
   2878 	int i, j;
   2879 
   2880 	nv = NULL;
   2881 	for (i = 0; i < nspares; i++) {
   2882 		uint64_t theguid;
   2883 
   2884 		VERIFY(nvlist_lookup_uint64(spares[i],
   2885 		    ZPOOL_CONFIG_GUID, &theguid) == 0);
   2886 		if (theguid == guid) {
   2887 			nv = spares[i];
   2888 			break;
   2889 		}
   2890 	}
   2891 
   2892 	/*
   2893 	 * Only remove the hot spare if it's not currently in use in this pool.
   2894 	 */
   2895 	if (nv == NULL && vd == NULL)
   2896 		return (ENOENT);
   2897 
   2898 	if (nv == NULL && vd != NULL)
   2899 		return (ENOTSUP);
   2900 
   2901 	if (!unspare && nv != NULL && vd != NULL)
   2902 		return (EBUSY);
   2903 
   2904 	if (nspares == 1) {
   2905 		newspares = NULL;
   2906 	} else {
   2907 		newspares = kmem_alloc((nspares - 1) * sizeof (void *),
   2908 		    KM_SLEEP);
   2909 		for (i = 0, j = 0; i < nspares; i++) {
   2910 			if (spares[i] != nv)
   2911 				VERIFY(nvlist_dup(spares[i],
   2912 				    &newspares[j++], KM_SLEEP) == 0);
   2913 		}
   2914 	}
   2915 
   2916 	VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_SPARES,
   2917 	    DATA_TYPE_NVLIST_ARRAY) == 0);
   2918 	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
   2919 	    ZPOOL_CONFIG_SPARES, newspares, nspares - 1) == 0);
   2920 	for (i = 0; i < nspares - 1; i++)
   2921 		nvlist_free(newspares[i]);
   2922 	kmem_free(newspares, (nspares - 1) * sizeof (void *));
   2923 
   2924 	return (0);
   2925 }
   2926 
   2927 /*
   2928  * Remove an l2cache vdev from the nvlist config.
   2929  */
   2930 static int
   2931 spa_remove_l2cache(spa_aux_vdev_t *sav, uint64_t guid, nvlist_t **l2cache,
   2932     int nl2cache, vdev_t *vd)
   2933 {
   2934 	nvlist_t *nv, **newl2cache;
   2935 	int i, j;
   2936 
   2937 	nv = NULL;
   2938 	for (i = 0; i < nl2cache; i++) {
   2939 		uint64_t theguid;
   2940 
   2941 		VERIFY(nvlist_lookup_uint64(l2cache[i],
   2942 		    ZPOOL_CONFIG_GUID, &theguid) == 0);
   2943 		if (theguid == guid) {
   2944 			nv = l2cache[i];
   2945 			break;
   2946 		}
   2947 	}
   2948 
   2949 	if (vd == NULL) {
   2950 		for (i = 0; i < nl2cache; i++) {
   2951 			if (sav->sav_vdevs[i]->vdev_guid == guid) {
   2952 				vd = sav->sav_vdevs[i];
   2953 				break;
   2954 			}
   2955 		}
   2956 	}
   2957 
   2958 	if (nv == NULL && vd == NULL)
   2959 		return (ENOENT);
   2960 
   2961 	if (nv == NULL && vd != NULL)
   2962 		return (ENOTSUP);
   2963 
   2964 	if (nl2cache == 1) {
   2965 		newl2cache = NULL;
   2966 	} else {
   2967 		newl2cache = kmem_alloc((nl2cache - 1) * sizeof (void *),
   2968 		    KM_SLEEP);
   2969 		for (i = 0, j = 0; i < nl2cache; i++) {
   2970 			if (l2cache[i] != nv)
   2971 				VERIFY(nvlist_dup(l2cache[i],
   2972 				    &newl2cache[j++], KM_SLEEP) == 0);
   2973 		}
   2974 	}
   2975 
   2976 	VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
   2977 	    DATA_TYPE_NVLIST_ARRAY) == 0);
   2978 	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
   2979 	    ZPOOL_CONFIG_L2CACHE, newl2cache, nl2cache - 1) == 0);
   2980 	for (i = 0; i < nl2cache - 1; i++)
   2981 		nvlist_free(newl2cache[i]);
   2982 	kmem_free(newl2cache, (nl2cache - 1) * sizeof (void *));
   2983 
   2984 	return (0);
   2985 }
   2986 
   2987 /*
   2988  * Remove a device from the pool.  Currently, this supports removing only hot
   2989  * spares and level 2 ARC devices.
   2990  */
   2991 int
   2992 spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
   2993 {
   2994 	vdev_t *vd;
   2995 	nvlist_t **spares, **l2cache;
   2996 	uint_t nspares, nl2cache;
   2997 	int error = 0;
   2998 
   2999 	spa_config_enter(spa, RW_WRITER, FTAG);
   3000 
   3001 	vd = spa_lookup_by_guid(spa, guid);
   3002 
   3003 	if (spa->spa_spares.sav_vdevs != NULL &&
   3004 	    spa_spare_exists(guid, NULL) &&
   3005 	    nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
   3006 	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) {
   3007 		if ((error = spa_remove_spares(&spa->spa_spares, guid, unspare,
   3008 		    spares, nspares, vd)) != 0)
   3009 			goto out;
   3010 		spa_load_spares(spa);
   3011 		spa->spa_spares.sav_sync = B_TRUE;
   3012 		goto out;
   3013 	}
   3014 
   3015 	if (spa->spa_l2cache.sav_vdevs != NULL &&
   3016 	    spa_l2cache_exists(guid, NULL) &&
   3017 	    nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
   3018 	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) {
   3019 		if ((error = spa_remove_l2cache(&spa->spa_l2cache, guid,
   3020 		    l2cache, nl2cache, vd)) != 0)
   3021 			goto out;
   3022 		spa_load_l2cache(spa);
   3023 		spa->spa_l2cache.sav_sync = B_TRUE;
   3024 	}
   3025 
   3026 out:
   3027 	spa_config_exit(spa, FTAG);
   3028 	return (error);
   3029 }
   3030 
   3031 /*
   3032  * Find any device that's done replacing, or a vdev marked 'unspare' that's
   3033  * current spared, so we can detach it.
   3034  */
   3035 static vdev_t *
   3036 spa_vdev_resilver_done_hunt(vdev_t *vd)
   3037 {
   3038 	vdev_t *newvd, *oldvd;
   3039 	int c;
   3040 
   3041 	for (c = 0; c < vd->vdev_children; c++) {
   3042 		oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
   3043 		if (oldvd != NULL)
   3044 			return (oldvd);
   3045 	}
   3046 
   3047 	/*
   3048 	 * Check for a completed replacement.
   3049 	 */
   3050 	if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
   3051 		oldvd = vd->vdev_child[0];
   3052 		newvd = vd->vdev_child[1];
   3053 
   3054 		mutex_enter(&newvd->vdev_dtl_lock);
   3055 		if (newvd->vdev_dtl_map.sm_space == 0 &&
   3056 		    newvd->vdev_dtl_scrub.sm_space == 0) {
   3057 			mutex_exit(&newvd->vdev_dtl_lock);
   3058 			return (oldvd);
   3059 		}
   3060 		mutex_exit(&newvd->vdev_dtl_lock);
   3061 	}
   3062 
   3063 	/*
   3064 	 * Check for a completed resilver with the 'unspare' flag set.
   3065 	 */
   3066 	if (vd->vdev_ops == &vdev_spare_ops && vd->vdev_children == 2) {
   3067 		newvd = vd->vdev_child[0];
   3068 		oldvd = vd->vdev_child[1];
   3069 
   3070 		mutex_enter(&newvd->vdev_dtl_lock);
   3071 		if (newvd->vdev_unspare &&
   3072 		    newvd->vdev_dtl_map.sm_space == 0 &&
   3073 		    newvd->vdev_dtl_scrub.sm_space == 0) {
   3074 			newvd->vdev_unspare = 0;
   3075 			mutex_exit(&newvd->vdev_dtl_lock);
   3076 			return (oldvd);
   3077 		}
   3078 		mutex_exit(&newvd->vdev_dtl_lock);
   3079 	}
   3080 
   3081 	return (NULL);
   3082 }
   3083 
   3084 static void
   3085 spa_vdev_resilver_done(spa_t *spa)
   3086 {
   3087 	vdev_t *vd;
   3088 	vdev_t *pvd;
   3089 	uint64_t guid;
   3090 	uint64_t pguid = 0;
   3091 
   3092 	spa_config_enter(spa, RW_READER, FTAG);
   3093 
   3094 	while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
   3095 		guid = vd->vdev_guid;
   3096 		/*
   3097 		 * If we have just finished replacing a hot spared device, then
   3098 		 * we need to detach the parent's first child (the original hot
   3099 		 * spare) as well.
   3100 		 */
   3101 		pvd = vd->vdev_parent;
   3102 		if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
   3103 		    pvd->vdev_id == 0) {
   3104 			ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
   3105 			ASSERT(pvd->vdev_parent->vdev_children == 2);
   3106 			pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid;
   3107 		}
   3108 		spa_config_exit(spa, FTAG);
   3109 		if (spa_vdev_detach(spa, guid, B_TRUE) != 0)
   3110 			return;
   3111 		if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0)
   3112 			return;
   3113 		spa_config_enter(spa, RW_READER, FTAG);
   3114 	}
   3115 
   3116 	spa_config_exit(spa, FTAG);
   3117 }
   3118 
   3119 /*
   3120  * Update the stored path for this vdev.  Dirty the vdev configuration, relying
   3121  * on spa_vdev_enter/exit() to synchronize the labels and cache.
   3122  */
   3123 int
   3124 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
   3125 {
   3126 	vdev_t *rvd, *vd;
   3127 	uint64_t txg;
   3128 
   3129 	rvd = spa->spa_root_vdev;
   3130 
   3131 	txg = spa_vdev_enter(spa);
   3132 
   3133 	if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
   3134 		/*
   3135 		 * Determine if this is a reference to a hot spare or l2cache
   3136 		 * device.  If it is, update the path as stored in their
   3137 		 * device list.
   3138 		 */
   3139 		nvlist_t **spares, **l2cache;
   3140 		uint_t i, nspares, nl2cache;
   3141 
   3142 		if (spa->spa_spares.sav_config != NULL) {
   3143 			VERIFY(nvlist_lookup_nvlist_array(
   3144 			    spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
   3145 			    &spares, &nspares) == 0);
   3146 			for (i = 0; i < nspares; i++) {
   3147 				uint64_t theguid;
   3148 				VERIFY(nvlist_lookup_uint64(spares[i],
   3149 				    ZPOOL_CONFIG_GUID, &theguid) == 0);
   3150 				if (theguid == guid) {
   3151 					VERIFY(nvlist_add_string(spares[i],
   3152 					    ZPOOL_CONFIG_PATH, newpath) == 0);
   3153 					spa_load_spares(spa);
   3154 					spa->spa_spares.sav_sync = B_TRUE;
   3155 					return (spa_vdev_exit(spa, NULL, txg,
   3156 					    0));
   3157 				}
   3158 			}
   3159 		}
   3160 
   3161 		if (spa->spa_l2cache.sav_config != NULL) {
   3162 			VERIFY(nvlist_lookup_nvlist_array(
   3163 			    spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE,
   3164 			    &l2cache, &nl2cache) == 0);
   3165 			for (i = 0; i < nl2cache; i++) {
   3166 				uint64_t theguid;
   3167 				VERIFY(nvlist_lookup_uint64(l2cache[i],
   3168 				    ZPOOL_CONFIG_GUID, &theguid) == 0);
   3169 				if (theguid == guid) {
   3170 					VERIFY(nvlist_add_string(l2cache[i],
   3171 					    ZPOOL_CONFIG_PATH, newpath) == 0);
   3172 					spa_load_l2cache(spa);
   3173 					spa->spa_l2cache.sav_sync = B_TRUE;
   3174 					return (spa_vdev_exit(spa, NULL, txg,
   3175 					    0));
   3176 				}
   3177 			}
   3178 		}
   3179 
   3180 		return (spa_vdev_exit(spa, NULL, txg, ENOENT));
   3181 	}
   3182 
   3183 	if (!vd->vdev_ops->vdev_op_leaf)
   3184 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
   3185 
   3186 	spa_strfree(vd->vdev_path);
   3187 	vd->vdev_path = spa_strdup(newpath);
   3188 
   3189 	vdev_config_dirty(vd->vdev_top);
   3190 
   3191 	return (spa_vdev_exit(spa, NULL, txg, 0));
   3192 }
   3193 
   3194 /*
   3195  * ==========================================================================
   3196  * SPA Scrubbing
   3197  * ==========================================================================
   3198  */
   3199 
   3200 static void
   3201 spa_scrub_io_done(zio_t *zio)
   3202 {
   3203 	spa_t *spa = zio->io_spa;
   3204 
   3205 	arc_data_buf_free(zio->io_data, zio->io_size);
   3206 
   3207 	mutex_enter(&spa->spa_scrub_lock);
   3208 	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
   3209 		vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev;
   3210 		spa->spa_scrub_errors++;
   3211 		mutex_enter(&vd->vdev_stat_lock);
   3212 		vd->vdev_stat.vs_scrub_errors++;
   3213 		mutex_exit(&vd->vdev_stat_lock);
   3214 	}
   3215 
   3216 	if (--spa->spa_scrub_inflight < spa->spa_scrub_maxinflight)
   3217 		cv_broadcast(&spa->spa_scrub_io_cv);
   3218 
   3219 	ASSERT(spa->spa_scrub_inflight >= 0);
   3220 
   3221 	mutex_exit(&spa->spa_scrub_lock);
   3222 }
   3223 
   3224 static void
   3225 spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags,
   3226     zbookmark_t *zb)
   3227 {
   3228 	size_t size = BP_GET_LSIZE(bp);
   3229 	void *data;
   3230 
   3231 	mutex_enter(&spa->spa_scrub_lock);
   3232 	/*
   3233 	 * Do not give too much work to vdev(s).
   3234 	 */
   3235 	while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) {
   3236 		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
   3237 	}
   3238 	spa->spa_scrub_inflight++;
   3239 	mutex_exit(&spa->spa_scrub_lock);
   3240 
   3241 	data = arc_data_buf_alloc(size);
   3242 
   3243 	if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)
   3244 		flags |= ZIO_FLAG_SPECULATIVE;	/* intent log block */
   3245 
   3246 	flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL;
   3247 
   3248 	zio_nowait(zio_read(NULL, spa, bp, data, size,
   3249 	    spa_scrub_io_done, NULL, priority, flags, zb));
   3250 }
   3251 
   3252 /* ARGSUSED */
   3253 static int
   3254 spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a)
   3255 {
   3256 	blkptr_t *bp = &bc->bc_blkptr;
   3257 	vdev_t *vd = spa->spa_root_vdev;
   3258 	dva_t *dva = bp->blk_dva;
   3259 	int needs_resilver = B_FALSE;
   3260 	int d;
   3261 
   3262 	if (bc->bc_errno) {
   3263 		/*
   3264 		 * We can't scrub this block, but we can continue to scrub
   3265 		 * the rest of the pool.  Note the error and move along.
   3266 		 */
   3267 		mutex_enter(&spa->spa_scrub_lock);
   3268 		spa->spa_scrub_errors++;
   3269 		mutex_exit(&spa->spa_scrub_lock);
   3270 
   3271 		mutex_enter(&vd->vdev_stat_lock);
   3272 		vd->vdev_stat.vs_scrub_errors++;
   3273 		mutex_exit(&vd->vdev_stat_lock);
   3274 
   3275 		return (ERESTART);
   3276 	}
   3277 
   3278 	ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg);
   3279 
   3280 	for (d = 0; d < BP_GET_NDVAS(bp); d++) {
   3281 		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]));
   3282 
   3283 		ASSERT(vd != NULL);
   3284 
   3285 		/*
   3286 		 * Keep track of how much data we've examined so that
   3287 		 * zpool(1M) status can make useful progress reports.
   3288 		 */
   3289 		mutex_enter(&vd->vdev_stat_lock);
   3290 		vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]);
   3291 		mutex_exit(&vd->vdev_stat_lock);
   3292 
   3293 		if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) {
   3294 			if (DVA_GET_GANG(&dva[d])) {
   3295 				/*
   3296 				 * Gang members may be spread across multiple
   3297 				 * vdevs, so the best we can do is look at the
   3298 				 * pool-wide DTL.
   3299 				 * XXX -- it would be better to change our
   3300 				 * allocation policy to ensure that this can't
   3301 				 * happen.
   3302 				 */
   3303 				vd = spa->spa_root_vdev;
   3304 			}
   3305 			if (vdev_dtl_contains(&vd->vdev_dtl_map,
   3306 			    bp->blk_birth, 1))
   3307 				needs_resilver = B_TRUE;
   3308 		}
   3309 	}
   3310 
   3311 	if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING)
   3312 		spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB,
   3313 		    ZIO_FLAG_SCRUB, &bc->bc_bookmark);
   3314 	else if (needs_resilver)
   3315 		spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER,
   3316 		    ZIO_FLAG_RESILVER, &bc->bc_bookmark);
   3317 
   3318 	return (0);
   3319 }
   3320 
   3321 static void
   3322 spa_scrub_thread(spa_t *spa)
   3323 {
   3324 	callb_cpr_t cprinfo;
   3325 	traverse_handle_t *th = spa->spa_scrub_th;
   3326 	vdev_t *rvd = spa->spa_root_vdev;
   3327 	pool_scrub_type_t scrub_type = spa->spa_scrub_type;
   3328 	int error = 0;
   3329 	boolean_t complete;
   3330 
   3331 	CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG);
   3332 
   3333 	/*
   3334 	 * If we're restarting due to a snapshot create/delete,
   3335 	 * wait for that to complete.
   3336 	 */
   3337 	txg_wait_synced(spa_get_dsl(spa), 0);
   3338 
   3339 	dprintf("start %s mintxg=%llu maxtxg=%llu\n",
   3340 	    scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub",
   3341 	    spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg);
   3342 
   3343 	spa_config_enter(spa, RW_WRITER, FTAG);
   3344 	vdev_reopen(rvd);		/* purge all vdev caches */
   3345 	vdev_config_dirty(rvd);		/* rewrite all disk labels */
   3346 	vdev_scrub_stat_update(rvd, scrub_type, B_FALSE);
   3347 	spa_config_exit(spa, FTAG);
   3348 
   3349 	mutex_enter(&spa->spa_scrub_lock);
   3350 	spa->spa_scrub_errors = 0;
   3351 	spa->spa_scrub_active = 1;
   3352 	ASSERT(spa->spa_scrub_inflight == 0);
   3353 
   3354 	while (!spa->spa_scrub_stop) {
   3355 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
   3356 		while (spa->spa_scrub_suspended) {
   3357 			spa->spa_scrub_active = 0;
   3358 			cv_broadcast(&spa->spa_scrub_cv);
   3359 			cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
   3360 			spa->spa_scrub_active = 1;
   3361 		}
   3362 		CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock);
   3363 
   3364 		if (spa->spa_scrub_restart_txg != 0)
   3365 			break;
   3366 
   3367 		mutex_exit(&spa->spa_scrub_lock);
   3368 		error = traverse_more(th);
   3369 		mutex_enter(&spa->spa_scrub_lock);
   3370 		if (error != EAGAIN)
   3371 			break;
   3372 	}
   3373 
   3374 	while (spa->spa_scrub_inflight)
   3375 		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
   3376 
   3377 	spa->spa_scrub_active = 0;
   3378 	cv_broadcast(&spa->spa_scrub_cv);
   3379 
   3380 	mutex_exit(&spa->spa_scrub_lock);
   3381 
   3382 	spa_config_enter(spa, RW_WRITER, FTAG);
   3383 
   3384 	mutex_enter(&spa->spa_scrub_lock);
   3385 
   3386 	/*
   3387 	 * Note: we check spa_scrub_restart_txg under both spa_scrub_lock
   3388 	 * AND the spa config lock to synchronize with any config changes
   3389 	 * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit().
   3390 	 */
   3391 	if (spa->spa_scrub_restart_txg != 0)
   3392 		error = ERESTART;
   3393 
   3394 	if (spa->spa_scrub_stop)
   3395 		error = EINTR;
   3396 
   3397 	/*
   3398 	 * Even if there were uncorrectable errors, we consider the scrub
   3399 	 * completed.  The downside is that if there is a transient error during
   3400 	 * a resilver, we won't resilver the data properly to the target.  But
   3401 	 * if the damage is permanent (more likely) we will resilver forever,
   3402 	 * which isn't really acceptable.  Since there is enough information for
   3403 	 * the user to know what has failed and why, this seems like a more
   3404 	 * tractable approach.
   3405 	 */
   3406 	complete = (error == 0);
   3407 
   3408 	dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n",
   3409 	    scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub",
   3410 	    spa->spa_scrub_maxtxg, complete ? "done" : "FAILED",
   3411 	    error, spa->spa_scrub_errors, spa->spa_scrub_stop);
   3412 
   3413 	mutex_exit(&spa->spa_scrub_lock);
   3414 
   3415 	/*
   3416 	 * If the scrub/resilver completed, update all DTLs to reflect this.
   3417 	 * Whether it succeeded or not, vacate all temporary scrub DTLs.
   3418 	 */
   3419 	vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1,
   3420 	    complete ? spa->spa_scrub_maxtxg : 0, B_TRUE);
   3421 	vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete);
   3422 	spa_errlog_rotate(spa);
   3423 
   3424 	if (scrub_type == POOL_SCRUB_RESILVER && complete)
   3425 		spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_FINISH);
   3426 
   3427 	spa_config_exit(spa, FTAG);
   3428 
   3429 	mutex_enter(&spa->spa_scrub_lock);
   3430 
   3431 	/*
   3432 	 * We may have finished replacing a device.
   3433 	 * Let the async thread assess this and handle the detach.
   3434 	 */
   3435 	spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
   3436 
   3437 	/*
   3438 	 * If we were told to restart, our final act is to start a new scrub.
   3439 	 */
   3440 	if (error == ERESTART)
   3441 		spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ?
   3442 		    SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB);
   3443 
   3444 	spa->spa_scrub_type = POOL_SCRUB_NONE;
   3445 	spa->spa_scrub_active = 0;
   3446 	spa->spa_scrub_thread = NULL;
   3447 	cv_broadcast(&spa->spa_scrub_cv);
   3448 	CALLB_CPR_EXIT(&cprinfo);	/* drops &spa->spa_scrub_lock */
   3449 	thread_exit();
   3450 }
   3451 
   3452 void
   3453 spa_scrub_suspend(spa_t *spa)
   3454 {
   3455 	mutex_enter(&spa->spa_scrub_lock);
   3456 	spa->spa_scrub_suspended++;
   3457 	while (spa->spa_scrub_active) {
   3458 		cv_broadcast(&spa->spa_scrub_cv);
   3459 		cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
   3460 	}
   3461 	while (spa->spa_scrub_inflight)
   3462 		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
   3463 	mutex_exit(&spa->spa_scrub_lock);
   3464 }
   3465 
   3466 void
   3467 spa_scrub_resume(spa_t *spa)
   3468 {
   3469 	mutex_enter(&spa->spa_scrub_lock);
   3470 	ASSERT(spa->spa_scrub_suspended != 0);
   3471 	if (--spa->spa_scrub_suspended == 0)
   3472 		cv_broadcast(&spa->spa_scrub_cv);
   3473 	mutex_exit(&spa->spa_scrub_lock);
   3474 }
   3475 
   3476 void
   3477 spa_scrub_restart(spa_t *spa, uint64_t txg)
   3478 {
   3479 	/*
   3480 	 * Something happened (e.g. snapshot create/delete) that means
   3481 	 * we must restart any in-progress scrubs.  The itinerary will
   3482 	 * fix this properly.
   3483 	 */
   3484 	mutex_enter(&spa->spa_scrub_lock);
   3485 	spa->spa_scrub_restart_txg = txg;
   3486 	mutex_exit(&spa->spa_scrub_lock);
   3487 }
   3488 
   3489 int
   3490 spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force)
   3491 {
   3492 	space_seg_t *ss;
   3493 	uint64_t mintxg, maxtxg;
   3494 	vdev_t *rvd = spa->spa_root_vdev;
   3495 
   3496 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
   3497 	ASSERT(!spa_config_held(spa, RW_WRITER));
   3498 
   3499 	if ((uint_t)type >= POOL_SCRUB_TYPES)
   3500 		return (ENOTSUP);
   3501 
   3502 	mutex_enter(&spa->spa_scrub_lock);
   3503 
   3504 	/*
   3505 	 * If there's a scrub or resilver already in progress, stop it.
   3506 	 */
   3507 	while (spa->spa_scrub_thread != NULL) {
   3508 		/*
   3509 		 * Don't stop a resilver unless forced.
   3510 		 */
   3511 		if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) {
   3512 			mutex_exit(&spa->spa_scrub_lock);
   3513 			return (EBUSY);
   3514 		}
   3515 		spa->spa_scrub_stop = 1;
   3516 		cv_broadcast(&spa->spa_scrub_cv);
   3517 		cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
   3518 	}
   3519 
   3520 	/*
   3521 	 * Terminate the previous traverse.
   3522 	 */
   3523 	if (spa->spa_scrub_th != NULL) {
   3524 		traverse_fini(spa->spa_scrub_th);
   3525 		spa->spa_scrub_th = NULL;
   3526 	}
   3527 
   3528 	if (rvd == NULL) {
   3529 		ASSERT(spa->spa_scrub_stop == 0);
   3530 		ASSERT(spa->spa_scrub_type == type);
   3531 		ASSERT(spa->spa_scrub_restart_txg == 0);
   3532 		mutex_exit(&spa->spa_scrub_lock);
   3533 		return (0);
   3534 	}
   3535 
   3536 	mintxg = TXG_INITIAL - 1;
   3537 	maxtxg = spa_last_synced_txg(spa) + 1;
   3538 
   3539 	mutex_enter(&rvd->vdev_dtl_lock);
   3540 
   3541 	if (rvd->vdev_dtl_map.sm_space == 0) {
   3542 		/*
   3543 		 * The pool-wide DTL is empty.
   3544 		 * If this is a resilver, there's nothing to do except
   3545 		 * check whether any in-progress replacements have completed.
   3546 		 */
   3547 		if (type == POOL_SCRUB_RESILVER) {
   3548 			type = POOL_SCRUB_NONE;
   3549 			spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
   3550 		}
   3551 	} else {
   3552 		/*
   3553 		 * The pool-wide DTL is non-empty.
   3554 		 * If this is a normal scrub, upgrade to a resilver instead.
   3555 		 */
   3556 		if (type == POOL_SCRUB_EVERYTHING)
   3557 			type = POOL_SCRUB_RESILVER;
   3558 	}
   3559 
   3560 	if (type == POOL_SCRUB_RESILVER) {
   3561 		/*
   3562 		 * Determine the resilvering boundaries.
   3563 		 *
   3564 		 * Note: (mintxg, maxtxg) is an open interval,
   3565 		 * i.e. mintxg and maxtxg themselves are not included.
   3566 		 *
   3567 		 * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1
   3568 		 * so we don't claim to resilver a txg that's still changing.
   3569 		 */
   3570 		ss = avl_first(&rvd->vdev_dtl_map.sm_root);
   3571 		mintxg = ss->ss_start - 1;
   3572 		ss = avl_last(&rvd->vdev_dtl_map.sm_root);
   3573 		maxtxg = MIN(ss->ss_end, maxtxg);
   3574 
   3575 		spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_START);
   3576 	}
   3577 
   3578 	mutex_exit(&rvd->vdev_dtl_lock);
   3579 
   3580 	spa->spa_scrub_stop = 0;
   3581 	spa->spa_scrub_type = type;
   3582 	spa->spa_scrub_restart_txg = 0;
   3583 
   3584 	if (type != POOL_SCRUB_NONE) {
   3585 		spa->spa_scrub_mintxg = mintxg;
   3586 		spa->spa_scrub_maxtxg = maxtxg;
   3587 		spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL,
   3588 		    ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL,
   3589 		    ZIO_FLAG_CANFAIL);
   3590 		traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg);
   3591 		spa->spa_scrub_thread = thread_create(NULL, 0,
   3592 		    spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri);
   3593 	}
   3594 
   3595 	mutex_exit(&spa->spa_scrub_lock);
   3596 
   3597 	return (0);
   3598 }
   3599 
   3600 /*
   3601  * ==========================================================================
   3602  * SPA async task processing
   3603  * ==========================================================================
   3604  */
   3605 
   3606 static void
   3607 spa_async_remove(spa_t *spa, vdev_t *vd)
   3608 {
   3609 	vdev_t *tvd;
   3610 	int c;
   3611 
   3612 	for (c = 0; c < vd->vdev_children; c++) {
   3613 		tvd = vd->vdev_child[c];
   3614 		if (tvd->vdev_remove_wanted) {
   3615 			tvd->vdev_remove_wanted = 0;
   3616 			vdev_set_state(tvd, B_FALSE, VDEV_STATE_REMOVED,
   3617 			    VDEV_AUX_NONE);
   3618 			vdev_clear(spa, tvd, B_TRUE);
   3619 			vdev_config_dirty(tvd->vdev_top);
   3620 		}
   3621 		spa_async_remove(spa, tvd);
   3622 	}
   3623 }
   3624 
   3625 static void
   3626 spa_async_thread(spa_t *spa)
   3627 {
   3628 	int tasks;
   3629 	uint64_t txg;
   3630 
   3631 	ASSERT(spa->spa_sync_on);
   3632 
   3633 	mutex_enter(&spa->spa_async_lock);
   3634 	tasks = spa->spa_async_tasks;
   3635 	spa->spa_async_tasks = 0;
   3636 	mutex_exit(&spa->spa_async_lock);
   3637 
   3638 	/*
   3639 	 * See if the config needs to be updated.
   3640 	 */
   3641 	if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
   3642 		mutex_enter(&spa_namespace_lock);
   3643 		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
   3644 		mutex_exit(&spa_namespace_lock);
   3645 	}
   3646 
   3647 	/*
   3648 	 * See if any devices need to be marked REMOVED.
   3649 	 *
   3650 	 * XXX - We avoid doing this when we are in
   3651 	 * I/O failure state since spa_vdev_enter() grabs
   3652 	 * the namespace lock and would not be able to obtain
   3653 	 * the writer config lock.
   3654 	 */
   3655 	if (tasks & SPA_ASYNC_REMOVE &&
   3656 	    spa_state(spa) != POOL_STATE_IO_FAILURE) {
   3657 		txg = spa_vdev_enter(spa);
   3658 		spa_async_remove(spa, spa->spa_root_vdev);
   3659 		(void) spa_vdev_exit(spa, NULL, txg, 0);
   3660 	}
   3661 
   3662 	/*
   3663 	 * If any devices are done replacing, detach them.
   3664 	 */
   3665 	if (tasks & SPA_ASYNC_RESILVER_DONE)
   3666 		spa_vdev_resilver_done(spa);
   3667 
   3668 	/*
   3669 	 * Kick off a scrub.  When starting a RESILVER scrub (or an EVERYTHING
   3670 	 * scrub which can become a resilver), we need to hold
   3671 	 * spa_namespace_lock() because the sysevent we post via
   3672 	 * spa_event_notify() needs to get the name of the pool.
   3673 	 */
   3674 	if (tasks & SPA_ASYNC_SCRUB) {
   3675 		mutex_enter(&spa_namespace_lock);
   3676 		VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0);
   3677 		mutex_exit(&spa_namespace_lock);
   3678 	}
   3679 
   3680 	/*
   3681 	 * Kick off a resilver.
   3682 	 */
   3683 	if (tasks & SPA_ASYNC_RESILVER) {
   3684 		mutex_enter(&spa_namespace_lock);
   3685 		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
   3686 		mutex_exit(&spa_namespace_lock);
   3687 	}
   3688 
   3689 	/*
   3690 	 * Let the world know that we're done.
   3691 	 */
   3692 	mutex_enter(&spa->spa_async_lock);
   3693 	spa->spa_async_thread = NULL;
   3694 	cv_broadcast(&spa->spa_async_cv);
   3695 	mutex_exit(&spa->spa_async_lock);
   3696 	thread_exit();
   3697 }
   3698 
   3699 void
   3700 spa_async_suspend(spa_t *spa)
   3701 {
   3702 	mutex_enter(&spa->spa_async_lock);
   3703 	spa->spa_async_suspended++;
   3704 	while (spa->spa_async_thread != NULL)
   3705 		cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
   3706 	mutex_exit(&spa->spa_async_lock);
   3707 }
   3708 
   3709 void
   3710 spa_async_resume(spa_t *spa)
   3711 {
   3712 	mutex_enter(&spa->spa_async_lock);
   3713 	ASSERT(spa->spa_async_suspended != 0);
   3714 	spa->spa_async_suspended--;
   3715 	mutex_exit(&spa->spa_async_lock);
   3716 }
   3717 
   3718 static void
   3719 spa_async_dispatch(spa_t *spa)
   3720 {
   3721 	mutex_enter(&spa->spa_async_lock);
   3722 	if (spa->spa_async_tasks && !spa->spa_async_suspended &&
   3723 	    spa->spa_async_thread == NULL &&
   3724 	    rootdir != NULL && !vn_is_readonly(rootdir))
   3725 		spa->spa_async_thread = thread_create(NULL, 0,
   3726 		    spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
   3727 	mutex_exit(&spa->spa_async_lock);
   3728 }
   3729 
   3730 void
   3731 spa_async_request(spa_t *spa, int task)
   3732 {
   3733 	mutex_enter(&spa->spa_async_lock);
   3734 	spa->spa_async_tasks |= task;
   3735 	mutex_exit(&spa->spa_async_lock);
   3736 }
   3737 
   3738 /*
   3739  * ==========================================================================
   3740  * SPA syncing routines
   3741  * ==========================================================================
   3742  */
   3743 
   3744 static void
   3745 spa_sync_deferred_frees(spa_t *spa, uint64_t txg)
   3746 {
   3747 	bplist_t *bpl = &spa->spa_sync_bplist;
   3748 	dmu_tx_t *tx;
   3749 	blkptr_t blk;
   3750 	uint64_t itor = 0;
   3751 	zio_t *zio;
   3752 	int error;
   3753 	uint8_t c = 1;
   3754 
   3755 	zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD);
   3756 
   3757 	while (bplist_iterate(bpl, &itor, &blk) == 0)
   3758 		zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL));
   3759 
   3760 	error = zio_wait(zio);
   3761 	ASSERT3U(error, ==, 0);
   3762 
   3763 	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
   3764 	bplist_vacate(bpl, tx);
   3765 
   3766 	/*
   3767 	 * Pre-dirty the first block so we sync to convergence faster.
   3768 	 * (Usually only the first block is needed.)
   3769 	 */
   3770 	dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx);
   3771 	dmu_tx_commit(tx);
   3772 }
   3773 
   3774 static void
   3775 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
   3776 {
   3777 	char *packed = NULL;
   3778 	size_t nvsize = 0;
   3779 	dmu_buf_t *db;
   3780 
   3781 	VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);
   3782 
   3783 	packed = kmem_alloc(nvsize, KM_SLEEP);
   3784 
   3785 	VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
   3786 	    KM_SLEEP) == 0);
   3787 
   3788 	dmu_write(spa->spa_meta_objset, obj, 0, nvsize, packed, tx);
   3789 
   3790 	kmem_free(packed, nvsize);
   3791 
   3792 	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
   3793 	dmu_buf_will_dirty(db, tx);
   3794 	*(uint64_t *)db->db_data = nvsize;
   3795 	dmu_buf_rele(db, FTAG);
   3796 }
   3797 
   3798 static void
   3799 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
   3800     const char *config, const char *entry)
   3801 {
   3802 	nvlist_t *nvroot;
   3803 	nvlist_t **list;
   3804 	int i;
   3805 
   3806 	if (!sav->sav_sync)
   3807 		return;
   3808 
   3809 	/*
   3810 	 * Update the MOS nvlist describing the list of available devices.
   3811 	 * spa_validate_aux() will have already made sure this nvlist is
   3812 	 * valid and the vdevs are labeled appropriately.
   3813 	 */
   3814 	if (sav->sav_object == 0) {
   3815 		sav->sav_object = dmu_object_alloc(spa->spa_meta_objset,
   3816 		    DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE,
   3817 		    sizeof (uint64_t), tx);
   3818 		VERIFY(zap_update(spa->spa_meta_objset,
   3819 		    DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1,
   3820 		    &sav->sav_object, tx) == 0);
   3821 	}
   3822 
   3823 	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
   3824 	if (sav->sav_count == 0) {
   3825 		VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0);
   3826 	} else {
   3827 		list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
   3828 		for (i = 0; i < sav->sav_count; i++)
   3829 			list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
   3830 			    B_FALSE, B_FALSE, B_TRUE);
   3831 		VERIFY(nvlist_add_nvlist_array(nvroot, config, list,
   3832 		    sav->sav_count) == 0);
   3833 		for (i = 0; i < sav->sav_count; i++)
   3834 			nvlist_free(list[i]);
   3835 		kmem_free(list, sav->sav_count * sizeof (void *));
   3836 	}
   3837 
   3838 	spa_sync_nvlist(spa, sav->sav_object, nvroot, tx);
   3839 	nvlist_free(nvroot);
   3840 
   3841 	sav->sav_sync = B_FALSE;
   3842 }
   3843 
   3844 static void
   3845 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
   3846 {
   3847 	nvlist_t *config;
   3848 
   3849 	if (list_is_empty(&spa->spa_dirty_list))
   3850 		return;
   3851 
   3852 	config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE);
   3853 
   3854 	if (spa->spa_config_syncing)
   3855 		nvlist_free(spa->spa_config_syncing);
   3856 	spa->spa_config_syncing = config;
   3857 
   3858 	spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
   3859 }
   3860 
   3861 /*
   3862  * Set zpool properties.
   3863  */
   3864 static void
   3865 spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
   3866 {
   3867 	spa_t *spa = arg1;
   3868 	objset_t *mos = spa->spa_meta_objset;
   3869 	nvlist_t *nvp = arg2;
   3870 	nvpair_t *elem;
   3871 	uint64_t intval;
   3872 	char *strval, *slash;
   3873 	zpool_prop_t prop;
   3874 	const char *propname;
   3875 	zprop_type_t proptype;
   3876 
   3877 	elem = NULL;
   3878 	while ((elem = nvlist_next_nvpair(nvp, elem))) {
   3879 		switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
   3880 		case ZPOOL_PROP_VERSION:
   3881 			/*
   3882 			 * Only set version for non-zpool-creation cases
   3883 			 * (set/import). spa_create() needs special care
   3884 			 * for version setting.
   3885 			 */
   3886 			if (tx->tx_txg != TXG_INITIAL) {
   3887 				VERIFY(nvpair_value_uint64(elem,
   3888 				    &intval) == 0);
   3889 				ASSERT(intval <= SPA_VERSION);
   3890 				ASSERT(intval >= spa_version(spa));
   3891 				spa->spa_uberblock.ub_version = intval;
   3892 				vdev_config_dirty(spa->spa_root_vdev);
   3893 			}
   3894 			break;
   3895 
   3896 		case ZPOOL_PROP_ALTROOT:
   3897 			/*
   3898 			 * 'altroot' is a non-persistent property. It should
   3899 			 * have been set temporarily at creation or import time.
   3900 			 */
   3901 			ASSERT(spa->spa_root != NULL);
   3902 			break;
   3903 
   3904 		case ZPOOL_PROP_CACHEFILE:
   3905 			/*
   3906 			 * 'cachefile' is a non-persistent property, but note
   3907 			 * an async request that the config cache needs to be
   3908 			 * udpated.
   3909 			 */
   3910 			VERIFY(nvpair_value_string(elem, &strval) == 0);
   3911 			if (spa->spa_config_dir)
   3912 				spa_strfree(spa->spa_config_dir);
   3913 			if (spa->spa_config_file)
   3914 				spa_strfree(spa->spa_config_file);
   3915 
   3916 			if (strval[0] == '\0') {
   3917 				spa->spa_config_dir = NULL;
   3918 				spa->spa_config_file = NULL;
   3919 			} else if (strcmp(strval, "none") == 0) {
   3920 				spa->spa_config_dir = spa_strdup(strval);
   3921 				spa->spa_config_file = NULL;
   3922 			} else {
   3923 				/*
   3924 				 * If the cachefile is in the root directory,
   3925 				 * we will end up with an empty string for
   3926 				 * spa_config_dir.  This value is only ever
   3927 				 * used when concatenated with '/', so an empty
   3928 				 * string still behaves correctly and keeps the
   3929 				 * rest of the code simple.
   3930 				 */
   3931 				slash = strrchr(strval, '/');
   3932 				ASSERT(slash != NULL);
   3933 				*slash = '\0';
   3934 				if (strcmp(strval, spa_config_dir) == 0 &&
   3935 				    strcmp(slash + 1, ZPOOL_CACHE_FILE) == 0) {
   3936 					spa->spa_config_dir = NULL;
   3937 					spa->spa_config_file = NULL;
   3938 				} else {
   3939 					spa->spa_config_dir =
   3940 					    spa_strdup(strval);
   3941 					spa->spa_config_file =
   3942 					    spa_strdup(slash + 1);
   3943 				}
   3944 			}
   3945 			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
   3946 			break;
   3947 		default:
   3948 			/*
   3949 			 * Set pool property values in the poolprops mos object.
   3950 			 */
   3951 			mutex_enter(&spa->spa_props_lock);
   3952 			if (spa->spa_pool_props_object == 0) {
   3953 				objset_t *mos = spa->spa_meta_objset;
   3954 
   3955 				VERIFY((spa->spa_pool_props_object =
   3956 				    zap_create(mos, DMU_OT_POOL_PROPS,
   3957 				    DMU_OT_NONE, 0, tx)) > 0);
   3958 
   3959 				VERIFY(zap_update(mos,
   3960 				    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
   3961 				    8, 1, &spa->spa_pool_props_object, tx)
   3962 				    == 0);
   3963 			}
   3964 			mutex_exit(&spa->spa_props_lock);
   3965 
   3966 			/* normalize the property name */
   3967 			propname = zpool_prop_to_name(prop);
   3968 			proptype = zpool_prop_get_type(prop);
   3969 
   3970 			if (nvpair_type(elem) == DATA_TYPE_STRING) {
   3971 				ASSERT(proptype == PROP_TYPE_STRING);
   3972 				VERIFY(nvpair_value_string(elem, &strval) == 0);
   3973 				VERIFY(zap_update(mos,
   3974 				    spa->spa_pool_props_object, propname,
   3975 				    1, strlen(strval) + 1, strval, tx) == 0);
   3976 
   3977 			} else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
   3978 				VERIFY(nvpair_value_uint64(elem, &intval) == 0);
   3979 
   3980 				if (proptype == PROP_TYPE_INDEX) {
   3981 					const char *unused;
   3982 					VERIFY(zpool_prop_index_to_string(
   3983 					    prop, intval, &unused) == 0);
   3984 				}
   3985 				VERIFY(zap_update(mos,
   3986 				    spa->spa_pool_props_object, propname,
   3987 				    8, 1, &intval, tx) == 0);
   3988 			} else {
   3989 				ASSERT(0); /* not allowed */
   3990 			}
   3991 
   3992 			switch (prop) {
   3993 			case ZPOOL_PROP_DELEGATION:
   3994 				spa->spa_delegation = intval;
   3995 				break;
   3996 			case ZPOOL_PROP_BOOTFS:
   3997 				spa->spa_bootfs = intval;
   3998 				break;
   3999 			case ZPOOL_PROP_FAILUREMODE:
   4000 				spa->spa_failmode = intval;
   4001 				break;
   4002 			default:
   4003 				break;
   4004 			}
   4005 		}
   4006 
   4007 		/* log internal history if this is not a zpool create */
   4008 		if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY &&
   4009 		    tx->tx_txg != TXG_INITIAL) {
   4010 			spa_history_internal_log(LOG_POOL_PROPSET,
   4011 			    spa, tx, cr, "%s %lld %s",
   4012 			    nvpair_name(elem), intval, spa->spa_name);
   4013 		}
   4014 	}
   4015 }
   4016 
   4017 /*
   4018  * Sync the specified transaction group.  New blocks may be dirtied as
   4019  * part of the process, so we iterate until it converges.
   4020  */
   4021 void
   4022 spa_sync(spa_t *spa, uint64_t txg)
   4023 {
   4024 	dsl_pool_t *dp = spa->spa_dsl_pool;
   4025 	objset_t *mos = spa->spa_meta_objset;
   4026 	bplist_t *bpl = &spa->spa_sync_bplist;
   4027 	vdev_t *rvd = spa->spa_root_vdev;
   4028 	vdev_t *vd;
   4029 	vdev_t *svd[SPA_DVAS_PER_BP];
   4030 	int svdcount = 0;
   4031 	dmu_tx_t *tx;
   4032 	int dirty_vdevs;
   4033 
   4034 	/*
   4035 	 * Lock out configuration changes.
   4036 	 */
   4037 	spa_config_enter(spa, RW_READER, FTAG);
   4038 
   4039 	spa->spa_syncing_txg = txg;
   4040 	spa->spa_sync_pass = 0;
   4041 
   4042 	VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj));
   4043 
   4044 	tx = dmu_tx_create_assigned(dp, txg);
   4045 
   4046 	/*
   4047 	 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
   4048 	 * set spa_deflate if we have no raid-z vdevs.
   4049 	 */
   4050 	if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
   4051 	    spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
   4052 		int i;
   4053 
   4054 		for (i = 0; i < rvd->vdev_children; i++) {
   4055 			vd = rvd->vdev_child[i];
   4056 			if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
   4057 				break;
   4058 		}
   4059 		if (i == rvd->vdev_children) {
   4060 			spa->spa_deflate = TRUE;
   4061 			VERIFY(0 == zap_add(spa->spa_meta_objset,
   4062 			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
   4063 			    sizeof (uint64_t), 1, &spa->spa_deflate, tx));
   4064 		}
   4065 	}
   4066 
   4067 	/*
   4068 	 * If anything has changed in this txg, push the deferred frees
   4069 	 * from the previous txg.  If not, leave them alone so that we
   4070 	 * don't generate work on an otherwise idle system.
   4071 	 */
   4072 	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
   4073 	    !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
   4074 	    !txg_list_empty(&dp->dp_sync_tasks, txg))
   4075 		spa_sync_deferred_frees(spa, txg);
   4076 
   4077 	/*
   4078 	 * Iterate to convergence.
   4079 	 */
   4080 	do {
   4081 		spa->spa_sync_pass++;
   4082 
   4083 		spa_sync_config_object(spa, tx);
   4084 		spa_sync_aux_dev(spa, &spa->spa_spares, tx,
   4085 		    ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
   4086 		spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
   4087 		    ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
   4088 		spa_errlog_sync(spa, txg);
   4089 		dsl_pool_sync(dp, txg);
   4090 
   4091 		dirty_vdevs = 0;
   4092 		while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) {
   4093 			vdev_sync(vd, txg);
   4094 			dirty_vdevs++;
   4095 		}
   4096 
   4097 		bplist_sync(bpl, tx);
   4098 	} while (dirty_vdevs);
   4099 
   4100 	bplist_close(bpl);
   4101 
   4102 	dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);
   4103 
   4104 	/*
   4105 	 * Rewrite the vdev configuration (which includes the uberblock)
   4106 	 * to commit the transaction group.
   4107 	 *
   4108 	 * If there are no dirty vdevs, we sync the uberblock to a few
   4109 	 * random top-level vdevs that are known to be visible in the
   4110 	 * config cache (see spa_vdev_add() for details).  If there *are*
   4111 	 * dirty vdevs -- or if the sync to our random subset fails --
   4112 	 * then sync the uberblock to all vdevs.
   4113 	 */
   4114 	if (list_is_empty(&spa->spa_dirty_list)) {
   4115 		int children = rvd->vdev_children;
   4116 		int c0 = spa_get_random(children);
   4117 		int c;
   4118 
   4119 		for (c = 0; c < children; c++) {
   4120 			vd = rvd->vdev_child[(c0 + c) % children];
   4121 			if (vd->vdev_ms_array == 0 || vd->vdev_islog)
   4122 				continue;
   4123 			svd[svdcount++] = vd;
   4124 			if (svdcount == SPA_DVAS_PER_BP)
   4125 				break;
   4126 		}
   4127 	}
   4128 	if (svdcount == 0 || vdev_config_sync(svd, svdcount, txg) != 0)
   4129 		VERIFY3U(vdev_config_sync(rvd->vdev_child,
   4130 		    rvd->vdev_children, txg), ==, 0);
   4131 
   4132 	dmu_tx_commit(tx);
   4133 
   4134 	/*
   4135 	 * Clear the dirty config list.
   4136 	 */
   4137 	while ((vd = list_head(&spa->spa_dirty_list)) != NULL)
   4138 		vdev_config_clean(vd);
   4139 
   4140 	/*
   4141 	 * Now that the new config has synced transactionally,
   4142 	 * let it become visible to the config cache.
   4143 	 */
   4144 	if (spa->spa_config_syncing != NULL) {
   4145 		spa_config_set(spa, spa->spa_config_syncing);
   4146 		spa->spa_config_txg = txg;
   4147 		spa->spa_config_syncing = NULL;
   4148 	}
   4149 
   4150 	/*
   4151 	 * Make a stable copy of the fully synced uberblock.
   4152 	 * We use this as the root for pool traversals.
   4153 	 */
   4154 	spa->spa_traverse_wanted = 1;	/* tells traverse_more() to stop */
   4155 
   4156 	spa_scrub_suspend(spa);		/* stop scrubbing and finish I/Os */
   4157 
   4158 	rw_enter(&spa->spa_traverse_lock, RW_WRITER);
   4159 	spa->spa_traverse_wanted = 0;
   4160 	spa->spa_ubsync = spa->spa_uberblock;
   4161 	rw_exit(&spa->spa_traverse_lock);
   4162 
   4163 	spa_scrub_resume(spa);		/* resume scrub with new ubsync */
   4164 
   4165 	/*
   4166 	 * Clean up the ZIL records for the synced txg.
   4167 	 */
   4168 	dsl_pool_zil_clean(dp);
   4169 
   4170 	/*
   4171 	 * Update usable space statistics.
   4172 	 */
   4173 	while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
   4174 		vdev_sync_done(vd, txg);
   4175 
   4176 	/*
   4177 	 * It had better be the case that we didn't dirty anything
   4178 	 * since vdev_config_sync().
   4179 	 */
   4180 	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
   4181 	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
   4182 	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
   4183 	ASSERT(bpl->bpl_queue == NULL);
   4184 
   4185 	spa_config_exit(spa, FTAG);
   4186 
   4187 	/*
   4188 	 * If any async tasks have been requested, kick them off.
   4189 	 */
   4190 	spa_async_dispatch(spa);
   4191 }
   4192 
   4193 /*
   4194  * Sync all pools.  We don't want to hold the namespace lock across these
   4195  * operations, so we take a reference on the spa_t and drop the lock during the
   4196  * sync.
   4197  */
   4198 void
   4199 spa_sync_allpools(void)
   4200 {
   4201 	spa_t *spa = NULL;
   4202 	mutex_enter(&spa_namespace_lock);
   4203 	while ((spa = spa_next(spa)) != NULL) {
   4204 		if (spa_state(spa) != POOL_STATE_ACTIVE)
   4205 			continue;
   4206 		spa_open_ref(spa, FTAG);
   4207 		mutex_exit(&spa_namespace_lock);
   4208 		txg_wait_synced(spa_get_dsl(spa), 0);
   4209 		mutex_enter(&spa_namespace_lock);
   4210 		spa_close(spa, FTAG);
   4211 	}
   4212 	mutex_exit(&spa_namespace_lock);
   4213 }
   4214 
   4215 /*
   4216  * ==========================================================================
   4217  * Miscellaneous routines
   4218  * ==========================================================================
   4219  */
   4220 
   4221 /*
   4222  * Remove all pools in the system.
   4223  */
   4224 void
   4225 spa_evict_all(void)
   4226 {
   4227 	spa_t *spa;
   4228 
   4229 	/*
   4230 	 * Remove all cached state.  All pools should be closed now,
   4231 	 * so every spa in the AVL tree should be unreferenced.
   4232 	 */
   4233 	mutex_enter(&spa_namespace_lock);
   4234 	while ((spa = spa_next(NULL)) != NULL) {
   4235 		/*
   4236 		 * Stop async tasks.  The async thread may need to detach
   4237 		 * a device that's been replaced, which requires grabbing
   4238 		 * spa_namespace_lock, so we must drop it here.
   4239 		 */
   4240 		spa_open_ref(spa, FTAG);
   4241 		mutex_exit(&spa_namespace_lock);
   4242 		spa_async_suspend(spa);
   4243 		mutex_enter(&spa_namespace_lock);
   4244 		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
   4245 		spa_close(spa, FTAG);
   4246 
   4247 		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
   4248 			spa_unload(spa);
   4249 			spa_deactivate(spa);
   4250 		}
   4251 		spa_remove(spa);
   4252 	}
   4253 	mutex_exit(&spa_namespace_lock);
   4254 }
   4255 
   4256 vdev_t *
   4257 spa_lookup_by_guid(spa_t *spa, uint64_t guid)
   4258 {
   4259 	return (vdev_lookup_by_guid(spa->spa_root_vdev, guid));
   4260 }
   4261 
   4262 void
   4263 spa_upgrade(spa_t *spa, uint64_t version)
   4264 {
   4265 	spa_config_enter(spa, RW_WRITER, FTAG);
   4266 
   4267 	/*
   4268 	 * This should only be called for a non-faulted pool, and since a
   4269 	 * future version would result in an unopenable pool, this shouldn't be
   4270 	 * possible.
   4271 	 */
   4272 	ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION);
   4273 	ASSERT(version >= spa->spa_uberblock.ub_version);
   4274 
   4275 	spa->spa_uberblock.ub_version = version;
   4276 	vdev_config_dirty(spa->spa_root_vdev);
   4277 
   4278 	spa_config_exit(spa, FTAG);
   4279 
   4280 	txg_wait_synced(spa_get_dsl(spa), 0);
   4281 }
   4282 
   4283 boolean_t
   4284 spa_has_spare(spa_t *spa, uint64_t guid)
   4285 {
   4286 	int i;
   4287 	uint64_t spareguid;
   4288 	spa_aux_vdev_t *sav = &spa->spa_spares;
   4289 
   4290 	for (i = 0; i < sav->sav_count; i++)
   4291 		if (sav->sav_vdevs[i]->vdev_guid == guid)
   4292 			return (B_TRUE);
   4293 
   4294 	for (i = 0; i < sav->sav_npending; i++) {
   4295 		if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
   4296 		    &spareguid) == 0 && spareguid == guid)
   4297 			return (B_TRUE);
   4298 	}
   4299 
   4300 	return (B_FALSE);
   4301 }
   4302 
   4303 /*
   4304  * Post a sysevent corresponding to the given event.  The 'name' must be one of
   4305  * the event definitions in sys/sysevent/eventdefs.h.  The payload will be
   4306  * filled in from the spa and (optionally) the vdev.  This doesn't do anything
   4307  * in the userland libzpool, as we don't want consumers to misinterpret ztest
   4308  * or zdb as real changes.
   4309  */
   4310 void
   4311 spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
   4312 {
   4313 #ifdef _KERNEL
   4314 	sysevent_t		*ev;
   4315 	sysevent_attr_list_t	*attr = NULL;
   4316 	sysevent_value_t	value;
   4317 	sysevent_id_t		eid;
   4318 
   4319 	ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
   4320 	    SE_SLEEP);
   4321 
   4322 	value.value_type = SE_DATA_TYPE_STRING;
   4323 	value.value.sv_string = spa_name(spa);
   4324 	if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
   4325 		goto done;
   4326 
   4327 	value.value_type = SE_DATA_TYPE_UINT64;
   4328 	value.value.sv_uint64 = spa_guid(spa);
   4329 	if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
   4330 		goto done;
   4331 
   4332 	if (vd) {
   4333 		value.value_type = SE_DATA_TYPE_UINT64;
   4334 		value.value.sv_uint64 = vd->vdev_guid;
   4335 		if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
   4336 		    SE_SLEEP) != 0)
   4337 			goto done;
   4338 
   4339 		if (vd->vdev_path) {
   4340 			value.value_type = SE_DATA_TYPE_STRING;
   4341 			value.value.sv_string = vd->vdev_path;
   4342 			if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
   4343 			    &value, SE_SLEEP) != 0)
   4344 				goto done;
   4345 		}
   4346 	}
   4347 
   4348 	if (sysevent_attach_attributes(ev, attr) != 0)
   4349 		goto done;
   4350 	attr = NULL;
   4351 
   4352 	(void) log_sysevent(ev, SE_SLEEP, &eid);
   4353 
   4354 done:
   4355 	if (attr)
   4356 		sysevent_free_attr(attr);
   4357 	sysevent_free(ev);
   4358 #endif
   4359 }
   4360