Home | History | Annotate | Download | only in zfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
     27 
     28 #include <sys/zfs_context.h>
     29 #include <sys/fm/fs/zfs.h>
     30 #include <sys/spa.h>
     31 #include <sys/txg.h>
     32 #include <sys/spa_impl.h>
     33 #include <sys/vdev_impl.h>
     34 #include <sys/zio_impl.h>
     35 #include <sys/zio_compress.h>
     36 #include <sys/zio_checksum.h>
     37 
     38 /*
     39  * ==========================================================================
     40  * I/O priority table
     41  * ==========================================================================
     42  */
/*
 * One entry per I/O priority class, listed in the order named by the
 * trailing comments.
 * NOTE(review): how consumers interpret these values is not visible in
 * this file -- confirm against the users of the table before tuning.
 */
uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
	0,	/* ZIO_PRIORITY_NOW		*/
	0,	/* ZIO_PRIORITY_SYNC_READ	*/
	0,	/* ZIO_PRIORITY_SYNC_WRITE	*/
	6,	/* ZIO_PRIORITY_ASYNC_READ	*/
	4,	/* ZIO_PRIORITY_ASYNC_WRITE	*/
	4,	/* ZIO_PRIORITY_FREE		*/
	0,	/* ZIO_PRIORITY_CACHE_FILL	*/
	0,	/* ZIO_PRIORITY_LOG_WRITE	*/
	10,	/* ZIO_PRIORITY_RESILVER	*/
	20,	/* ZIO_PRIORITY_SCRUB		*/
};
     55 
     56 /*
     57  * ==========================================================================
     58  * I/O type descriptions
     59  * ==========================================================================
     60  */
/*
 * Printable name for each I/O type; the array is sized by ZIO_TYPES.
 * NOTE(review): presumably indexed by zio_type_t -- confirm that the order
 * here matches the enum.
 */
char *zio_type_name[ZIO_TYPES] = {
	"null", "read", "write", "free", "claim", "ioctl" };
     63 
/* Force an allocation failure when non-zero */
uint16_t zio_zil_fail_shift = 0;
uint16_t zio_io_fail_shift = 0;

/* Enable/disable the write-retry logic */
int zio_write_retry = 1;

/*
 * Taskq to handle reissuing of I/Os.  zio_init() creates it, passing
 * zio_resume_threads to taskq_create(), and zio_fini() destroys it.
 */
taskq_t *zio_taskq;
int zio_resume_threads = 4;
     74 
/*
 * Sync-pass thresholds.  Each field names the pass after which the
 * corresponding behavior changes; for example, zio_free() defers a free in
 * the syncing txg once spa_sync_pass is greater than zp_defer_free.
 */
typedef struct zio_sync_pass {
	int	zp_defer_free;		/* defer frees after this pass */
	int	zp_dontcompress;	/* don't compress after this pass */
	int	zp_rewrite;		/* rewrite new bps after this pass */
} zio_sync_pass_t;

/* Default thresholds, in field order. */
zio_sync_pass_t zio_sync_pass = {
	1,	/* zp_defer_free */
	4,	/* zp_dontcompress */
	1,	/* zp_rewrite */
};
     86 
     87 static boolean_t zio_io_should_fail(uint16_t);
     88 
     89 /*
     90  * ==========================================================================
     91  * I/O kmem caches
     92  * ==========================================================================
     93  */
/* Cache backing zio_t allocations (see zio_create() and zio_destroy()). */
kmem_cache_t *zio_cache;
/*
 * Buffer caches, indexed by (size - 1) >> SPA_MINBLOCKSHIFT.  zio_init()
 * fills every slot; slots whose size has no cache of its own share the
 * cache of the next larger size.
 */
kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
     97 
     98 #ifdef _KERNEL
     99 extern vmem_t *zio_alloc_arena;
    100 #endif
    101 
    102 /*
    103  * Determine if we are allowed to issue the IO based on the
    104  * pool state. If we must wait then block until we are told
    105  * that we may continue.
    106  */
    107 #define	ZIO_ENTER(spa) {						\
    108 	if (spa->spa_state == POOL_STATE_IO_FAILURE) {			\
    109 		mutex_enter(&spa->spa_zio_lock);			\
    110 		while (spa->spa_state == POOL_STATE_IO_FAILURE)		\
    111 			cv_wait(&spa->spa_zio_cv, &spa->spa_zio_lock);	\
    112 		mutex_exit(&spa->spa_zio_lock);				\
    113 	}								\
    114 }
    115 
/*
 * An allocation zio is one that either currently has the DVA allocate
 * stage set or will have it later in its lifetime.  The test is made
 * against io_orig_pipeline, the pipeline saved when the zio was created.
 */
#define	IO_IS_ALLOCATING(zio) \
	((zio)->io_orig_pipeline & (1U << ZIO_STAGE_DVA_ALLOCATE))
    122 
    123 void
    124 zio_init(void)
    125 {
    126 	size_t c;
    127 	vmem_t *data_alloc_arena = NULL;
    128 
    129 #ifdef _KERNEL
    130 	data_alloc_arena = zio_alloc_arena;
    131 #endif
    132 
    133 	zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0,
    134 	    NULL, NULL, NULL, NULL, NULL, 0);
    135 
    136 	/*
    137 	 * For small buffers, we want a cache for each multiple of
    138 	 * SPA_MINBLOCKSIZE.  For medium-size buffers, we want a cache
    139 	 * for each quarter-power of 2.  For large buffers, we want
    140 	 * a cache for each multiple of PAGESIZE.
    141 	 */
    142 	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
    143 		size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
    144 		size_t p2 = size;
    145 		size_t align = 0;
    146 
    147 		while (p2 & (p2 - 1))
    148 			p2 &= p2 - 1;
    149 
    150 		if (size <= 4 * SPA_MINBLOCKSIZE) {
    151 			align = SPA_MINBLOCKSIZE;
    152 		} else if (P2PHASE(size, PAGESIZE) == 0) {
    153 			align = PAGESIZE;
    154 		} else if (P2PHASE(size, p2 >> 2) == 0) {
    155 			align = p2 >> 2;
    156 		}
    157 
    158 		if (align != 0) {
    159 			char name[36];
    160 			(void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
    161 			zio_buf_cache[c] = kmem_cache_create(name, size,
    162 			    align, NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG);
    163 
    164 			(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
    165 			zio_data_buf_cache[c] = kmem_cache_create(name, size,
    166 			    align, NULL, NULL, NULL, NULL, data_alloc_arena,
    167 			    KMC_NODEBUG);
    168 
    169 		}
    170 	}
    171 
    172 	while (--c != 0) {
    173 		ASSERT(zio_buf_cache[c] != NULL);
    174 		if (zio_buf_cache[c - 1] == NULL)
    175 			zio_buf_cache[c - 1] = zio_buf_cache[c];
    176 
    177 		ASSERT(zio_data_buf_cache[c] != NULL);
    178 		if (zio_data_buf_cache[c - 1] == NULL)
    179 			zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
    180 	}
    181 
    182 	zio_taskq = taskq_create("zio_taskq", zio_resume_threads,
    183 	    maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE);
    184 
    185 	zio_inject_init();
    186 }
    187 
    188 void
    189 zio_fini(void)
    190 {
    191 	size_t c;
    192 	kmem_cache_t *last_cache = NULL;
    193 	kmem_cache_t *last_data_cache = NULL;
    194 
    195 	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
    196 		if (zio_buf_cache[c] != last_cache) {
    197 			last_cache = zio_buf_cache[c];
    198 			kmem_cache_destroy(zio_buf_cache[c]);
    199 		}
    200 		zio_buf_cache[c] = NULL;
    201 
    202 		if (zio_data_buf_cache[c] != last_data_cache) {
    203 			last_data_cache = zio_data_buf_cache[c];
    204 			kmem_cache_destroy(zio_data_buf_cache[c]);
    205 		}
    206 		zio_data_buf_cache[c] = NULL;
    207 	}
    208 
    209 	taskq_destroy(zio_taskq);
    210 
    211 	kmem_cache_destroy(zio_cache);
    212 
    213 	zio_inject_fini();
    214 }
    215 
    216 /*
    217  * ==========================================================================
    218  * Allocate and free I/O buffers
    219  * ==========================================================================
    220  */
    221 
    222 /*
    223  * Use zio_buf_alloc to allocate ZFS metadata.  This data will appear in a
    224  * crashdump if the kernel panics, so use it judiciously.  Obviously, it's
    225  * useful to inspect ZFS metadata, but if possible, we should avoid keeping
    226  * excess / transient data in-core during a crashdump.
    227  */
    228 void *
    229 zio_buf_alloc(size_t size)
    230 {
    231 	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
    232 
    233 	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
    234 
    235 	return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
    236 }
    237 
    238 /*
    239  * Use zio_data_buf_alloc to allocate data.  The data will not appear in a
    240  * crashdump if the kernel panics.  This exists so that we will limit the amount
    241  * of ZFS data that shows up in a kernel crashdump.  (Thus reducing the amount
    242  * of kernel heap dumped to disk when the kernel panics)
    243  */
    244 void *
    245 zio_data_buf_alloc(size_t size)
    246 {
    247 	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
    248 
    249 	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
    250 
    251 	return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
    252 }
    253 
    254 void
    255 zio_buf_free(void *buf, size_t size)
    256 {
    257 	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
    258 
    259 	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
    260 
    261 	kmem_cache_free(zio_buf_cache[c], buf);
    262 }
    263 
    264 void
    265 zio_data_buf_free(void *buf, size_t size)
    266 {
    267 	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
    268 
    269 	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
    270 
    271 	kmem_cache_free(zio_data_buf_cache[c], buf);
    272 }
    273 
    274 /*
    275  * ==========================================================================
    276  * Push and pop I/O transform buffers
    277  * ==========================================================================
    278  */
    279 static void
    280 zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize)
    281 {
    282 	zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
    283 
    284 	zt->zt_data = data;
    285 	zt->zt_size = size;
    286 	zt->zt_bufsize = bufsize;
    287 
    288 	zt->zt_next = zio->io_transform_stack;
    289 	zio->io_transform_stack = zt;
    290 
    291 	zio->io_data = data;
    292 	zio->io_size = size;
    293 }
    294 
    295 static void
    296 zio_pop_transform(zio_t *zio, void **data, uint64_t *size, uint64_t *bufsize)
    297 {
    298 	zio_transform_t *zt = zio->io_transform_stack;
    299 
    300 	*data = zt->zt_data;
    301 	*size = zt->zt_size;
    302 	*bufsize = zt->zt_bufsize;
    303 
    304 	zio->io_transform_stack = zt->zt_next;
    305 	kmem_free(zt, sizeof (zio_transform_t));
    306 
    307 	if ((zt = zio->io_transform_stack) != NULL) {
    308 		zio->io_data = zt->zt_data;
    309 		zio->io_size = zt->zt_size;
    310 	}
    311 }
    312 
    313 static void
    314 zio_clear_transform_stack(zio_t *zio)
    315 {
    316 	void *data;
    317 	uint64_t size, bufsize;
    318 
    319 	ASSERT(zio->io_transform_stack != NULL);
    320 
    321 	zio_pop_transform(zio, &data, &size, &bufsize);
    322 	while (zio->io_transform_stack != NULL) {
    323 		zio_buf_free(data, bufsize);
    324 		zio_pop_transform(zio, &data, &size, &bufsize);
    325 	}
    326 }
    327 
    328 /*
    329  * ==========================================================================
    330  * Create the various types of I/O (read, write, free)
    331  * ==========================================================================
    332  */
/*
 * Common constructor used by the zio_*() creation routines in this file.
 *
 * Allocates a zeroed zio from zio_cache, fills in the caller-supplied
 * fields, pushes (data, size) as the first transform-stack entry, links
 * the zio under pio when a parent is given, takes the spa config lock as
 * reader where the rules in the comment below require it, and records the
 * original stage/pipeline/flags for a later zio_reset().
 *
 * size must be a multiple of SPA_MINBLOCKSIZE no larger than
 * SPA_MAXBLOCKSIZE, and flags must not contain ZIO_FLAG_CONFIG_GRABBED.
 * Returns the new zio.
 */
static zio_t *
zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    zio_type_t type, int priority, int flags, uint8_t stage, uint32_t pipeline)
{
	zio_t *zio;

	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);

	/* Only we should set CONFIG_GRABBED */
	ASSERT(!(flags & ZIO_FLAG_CONFIG_GRABBED));

	zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
	bzero(zio, sizeof (zio_t));
	zio->io_parent = pio;
	zio->io_spa = spa;
	zio->io_txg = txg;
	zio->io_flags = flags;
	if (bp != NULL) {
		/*
		 * io_bp keeps pointing at the caller's bp; io_bp_copy and
		 * io_bp_orig are snapshots of its contents taken now.
		 */
		zio->io_bp = bp;
		zio->io_bp_copy = *bp;
		zio->io_bp_orig = *bp;
	}
	zio->io_done = done;
	zio->io_private = private;
	zio->io_type = type;
	zio->io_priority = priority;
	zio->io_stage = stage;
	zio->io_pipeline = pipeline;
	zio->io_timestamp = lbolt64;
	mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
	/* The caller's buffer is the bottom entry of the transform stack. */
	zio_push_transform(zio, data, size, size);

	/*
	 * Note on config lock:
	 *
	 * If CONFIG_HELD is set, then the caller already has the config
	 * lock, so we don't need it for this io.
	 *
	 * We set CONFIG_GRABBED to indicate that we have grabbed the
	 * config lock on behalf of this io, so it should be released
	 * in zio_done.
	 *
	 * Unless CONFIG_HELD is set, we will grab the config lock for
	 * any top-level (parent-less) io, *except* NULL top-level ios.
	 * The NULL top-level ios rarely have any children, so we delay
	 * grabbing the lock until the first child is added (but it is
	 * still grabbed on behalf of the top-level i/o, so additional
	 * children don't need to also grab it).  This greatly reduces
	 * contention on the config lock.
	 */
	if (pio == NULL) {
		if (type != ZIO_TYPE_NULL &&
		    !(flags & ZIO_FLAG_CONFIG_HELD)) {
			spa_config_enter(spa, RW_READER, zio);
			zio->io_flags |= ZIO_FLAG_CONFIG_GRABBED;
		}
		/* A parent-less zio is its own root. */
		zio->io_root = zio;
	} else {
		zio->io_root = pio->io_root;
		if (!(flags & ZIO_FLAG_NOBOOKMARK))
			zio->io_logical = pio->io_logical;
		mutex_enter(&pio->io_lock);
		/*
		 * First child of a NULL root that deferred the config lock:
		 * take it now, on the parent's behalf.
		 */
		if (pio->io_parent == NULL &&
		    pio->io_type == ZIO_TYPE_NULL &&
		    !(pio->io_flags & ZIO_FLAG_CONFIG_GRABBED) &&
		    !(pio->io_flags & ZIO_FLAG_CONFIG_HELD)) {
			pio->io_flags |= ZIO_FLAG_CONFIG_GRABBED;
			spa_config_enter(spa, RW_READER, pio);
		}
		/*
		 * Account for this child in the parent's counters; a child
		 * created at or beyond ZIO_STAGE_READY is not counted as
		 * not-ready.
		 */
		if (stage < ZIO_STAGE_READY)
			pio->io_children_notready++;
		pio->io_children_notdone++;
		/* Insert at the head of the parent's child list. */
		zio->io_sibling_next = pio->io_child;
		zio->io_sibling_prev = NULL;
		if (pio->io_child != NULL)
			pio->io_child->io_sibling_prev = zio;
		pio->io_child = zio;
		/* Start with the parent's io_ndvas; callers may override. */
		zio->io_ndvas = pio->io_ndvas;
		mutex_exit(&pio->io_lock);
	}

	/*
	 * Save off the original state in case we need to retry later.
	 */
	zio->io_orig_stage = zio->io_stage;
	zio->io_orig_pipeline = zio->io_pipeline;
	zio->io_orig_flags = zio->io_flags;

	/*
	 * If this is not a null zio, and config is not already held,
	 * then the root zio should have grabbed the config lock.
	 * If this is not a root zio, it should not have grabbed the
	 * config lock.
	 */
	ASSERT((zio->io_root->io_flags & ZIO_FLAG_CONFIG_HELD) ||
	    zio->io_type == ZIO_TYPE_NULL ||
	    (zio->io_root->io_flags & ZIO_FLAG_CONFIG_GRABBED));
	ASSERT(zio->io_root == zio ||
	    !(zio->io_flags & ZIO_FLAG_CONFIG_GRABBED));

	return (zio);
}
    438 
    439 static void
    440 zio_reset(zio_t *zio)
    441 {
    442 	zio_clear_transform_stack(zio);
    443 
    444 	zio->io_flags = zio->io_orig_flags;
    445 	zio->io_stage = zio->io_orig_stage;
    446 	zio->io_pipeline = zio->io_orig_pipeline;
    447 	zio_push_transform(zio, zio->io_data, zio->io_size, zio->io_size);
    448 }
    449 
    450 zio_t *
    451 zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private,
    452 	int flags)
    453 {
    454 	zio_t *zio;
    455 
    456 	zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
    457 	    ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, ZIO_STAGE_OPEN,
    458 	    ZIO_WAIT_FOR_CHILDREN_PIPELINE);
    459 
    460 	return (zio);
    461 }
    462 
    463 zio_t *
    464 zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags)
    465 {
    466 	return (zio_null(NULL, spa, done, private, flags));
    467 }
    468 
    469 zio_t *
    470 zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, void *data,
    471     uint64_t size, zio_done_func_t *done, void *private,
    472     int priority, int flags, const zbookmark_t *zb)
    473 {
    474 	zio_t *zio;
    475 
    476 	ASSERT3U(size, ==, BP_GET_LSIZE(bp));
    477 
    478 	/*
    479 	 * If the user has specified that we allow I/Os to continue
    480 	 * then attempt to satisfy the read.
    481 	 */
    482 	if (spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)
    483 		ZIO_ENTER(spa);
    484 
    485 	zio = zio_create(pio, spa, bp->blk_birth, (blkptr_t *)bp,
    486 	    data, size, done, private,
    487 	    ZIO_TYPE_READ, priority, flags | ZIO_FLAG_USER,
    488 	    ZIO_STAGE_OPEN, ZIO_READ_PIPELINE);
    489 	zio->io_bookmark = *zb;
    490 
    491 	zio->io_logical = zio;
    492 
    493 	/*
    494 	 * Work off our copy of the bp so the caller can free it.
    495 	 */
    496 	zio->io_bp = &zio->io_bp_copy;
    497 
    498 	return (zio);
    499 }
    500 
    501 zio_t *
    502 zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies,
    503     uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
    504     zio_done_func_t *ready, zio_done_func_t *done, void *private, int priority,
    505     int flags, const zbookmark_t *zb)
    506 {
    507 	zio_t *zio;
    508 
    509 	ASSERT(checksum >= ZIO_CHECKSUM_OFF &&
    510 	    checksum < ZIO_CHECKSUM_FUNCTIONS);
    511 
    512 	ASSERT(compress >= ZIO_COMPRESS_OFF &&
    513 	    compress < ZIO_COMPRESS_FUNCTIONS);
    514 
    515 	ZIO_ENTER(spa);
    516 
    517 	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
    518 	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER,
    519 	    ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE);
    520 
    521 	zio->io_ready = ready;
    522 
    523 	zio->io_bookmark = *zb;
    524 
    525 	zio->io_logical = zio;
    526 
    527 	zio->io_checksum = checksum;
    528 	zio->io_compress = compress;
    529 	zio->io_ndvas = ncopies;
    530 
    531 	if (bp->blk_birth != txg) {
    532 		/* XXX the bp usually (always?) gets re-zeroed later */
    533 		BP_ZERO(bp);
    534 		BP_SET_LSIZE(bp, size);
    535 		BP_SET_PSIZE(bp, size);
    536 	} else {
    537 		/* Make sure someone doesn't change their mind on overwrites */
    538 		ASSERT(MIN(zio->io_ndvas + BP_IS_GANG(bp),
    539 		    spa_max_replication(spa)) == BP_GET_NDVAS(bp));
    540 	}
    541 
    542 	return (zio);
    543 }
    544 
    545 zio_t *
    546 zio_rewrite(zio_t *pio, spa_t *spa, int checksum, uint64_t txg,
    547     blkptr_t *bp, void *data, uint64_t size, zio_done_func_t *done,
    548     void *private, int priority, int flags, zbookmark_t *zb)
    549 {
    550 	zio_t *zio;
    551 
    552 	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
    553 	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER,
    554 	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE(bp));
    555 
    556 	zio->io_bookmark = *zb;
    557 	zio->io_checksum = checksum;
    558 	zio->io_compress = ZIO_COMPRESS_OFF;
    559 
    560 	if (pio != NULL)
    561 		ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp));
    562 
    563 	return (zio);
    564 }
    565 
    566 static void
    567 zio_write_allocate_ready(zio_t *zio)
    568 {
    569 	/* Free up the previous block */
    570 	if (!BP_IS_HOLE(&zio->io_bp_orig)) {
    571 		zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg,
    572 		    &zio->io_bp_orig, NULL, NULL));
    573 	}
    574 }
    575 
    576 static zio_t *
    577 zio_write_allocate(zio_t *pio, spa_t *spa, int checksum,
    578     uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
    579     zio_done_func_t *done, void *private, int priority, int flags)
    580 {
    581 	zio_t *zio;
    582 
    583 	BP_ZERO(bp);
    584 	BP_SET_LSIZE(bp, size);
    585 	BP_SET_PSIZE(bp, size);
    586 	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
    587 
    588 	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
    589 	    ZIO_TYPE_WRITE, priority, flags,
    590 	    ZIO_STAGE_OPEN, ZIO_WRITE_ALLOCATE_PIPELINE);
    591 
    592 	zio->io_checksum = checksum;
    593 	zio->io_compress = ZIO_COMPRESS_OFF;
    594 	zio->io_ready = zio_write_allocate_ready;
    595 
    596 	return (zio);
    597 }
    598 
    599 zio_t *
    600 zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    601     zio_done_func_t *done, void *private)
    602 {
    603 	zio_t *zio;
    604 
    605 	ASSERT(!BP_IS_HOLE(bp));
    606 
    607 	if (txg == spa->spa_syncing_txg &&
    608 	    spa->spa_sync_pass > zio_sync_pass.zp_defer_free) {
    609 		bplist_enqueue_deferred(&spa->spa_sync_bplist, bp);
    610 		return (zio_null(pio, spa, NULL, NULL, 0));
    611 	}
    612 
    613 	zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
    614 	    ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, ZIO_FLAG_USER,
    615 	    ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE(bp));
    616 
    617 	zio->io_bp = &zio->io_bp_copy;
    618 
    619 	return (zio);
    620 }
    621 
    622 zio_t *
    623 zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    624     zio_done_func_t *done, void *private)
    625 {
    626 	zio_t *zio;
    627 
    628 	/*
    629 	 * A claim is an allocation of a specific block.  Claims are needed
    630 	 * to support immediate writes in the intent log.  The issue is that
    631 	 * immediate writes contain committed data, but in a txg that was
    632 	 * *not* committed.  Upon opening the pool after an unclean shutdown,
    633 	 * the intent log claims all blocks that contain immediate write data
    634 	 * so that the SPA knows they're in use.
    635 	 *
    636 	 * All claims *must* be resolved in the first txg -- before the SPA
    637 	 * starts allocating blocks -- so that nothing is allocated twice.
    638 	 */
    639 	ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
    640 	ASSERT3U(spa_first_txg(spa), <=, txg);
    641 
    642 	zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
    643 	    ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, 0,
    644 	    ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE(bp));
    645 
    646 	zio->io_bp = &zio->io_bp_copy;
    647 
    648 	return (zio);
    649 }
    650 
    651 zio_t *
    652 zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
    653     zio_done_func_t *done, void *private, int priority, int flags)
    654 {
    655 	zio_t *zio;
    656 	int c;
    657 
    658 	if (vd->vdev_children == 0) {
    659 		zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
    660 		    ZIO_TYPE_IOCTL, priority, flags,
    661 		    ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
    662 
    663 		zio->io_vd = vd;
    664 		zio->io_cmd = cmd;
    665 	} else {
    666 		zio = zio_null(pio, spa, NULL, NULL, flags);
    667 
    668 		for (c = 0; c < vd->vdev_children; c++)
    669 			zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
    670 			    done, private, priority, flags));
    671 	}
    672 
    673 	return (zio);
    674 }
    675 
    676 static void
    677 zio_phys_bp_init(vdev_t *vd, blkptr_t *bp, uint64_t offset, uint64_t size,
    678     int checksum, boolean_t labels)
    679 {
    680 	ASSERT(vd->vdev_children == 0);
    681 
    682 	ASSERT(size <= SPA_MAXBLOCKSIZE);
    683 	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
    684 	ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
    685 
    686 #ifdef ZFS_DEBUG
    687 	if (labels) {
    688 		ASSERT(offset + size <= VDEV_LABEL_START_SIZE ||
    689 		    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
    690 	}
    691 #endif
    692 	ASSERT3U(offset + size, <=, vd->vdev_psize);
    693 
    694 	BP_ZERO(bp);
    695 
    696 	BP_SET_LSIZE(bp, size);
    697 	BP_SET_PSIZE(bp, size);
    698 
    699 	BP_SET_CHECKSUM(bp, checksum);
    700 	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
    701 	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
    702 
    703 	if (checksum != ZIO_CHECKSUM_OFF)
    704 		ZIO_SET_CHECKSUM(&bp->blk_cksum, offset, 0, 0, 0);
    705 }
    706 
    707 zio_t *
    708 zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    709     void *data, int checksum, zio_done_func_t *done, void *private,
    710     int priority, int flags, boolean_t labels)
    711 {
    712 	zio_t *zio;
    713 	blkptr_t blk;
    714 
    715 	ZIO_ENTER(vd->vdev_spa);
    716 
    717 	zio_phys_bp_init(vd, &blk, offset, size, checksum, labels);
    718 
    719 	zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private,
    720 	    ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL,
    721 	    ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
    722 
    723 	zio->io_vd = vd;
    724 	zio->io_offset = offset;
    725 
    726 	/*
    727 	 * Work off our copy of the bp so the caller can free it.
    728 	 */
    729 	zio->io_bp = &zio->io_bp_copy;
    730 
    731 	return (zio);
    732 }
    733 
    734 zio_t *
    735 zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    736     void *data, int checksum, zio_done_func_t *done, void *private,
    737     int priority, int flags, boolean_t labels)
    738 {
    739 	zio_block_tail_t *zbt;
    740 	void *wbuf;
    741 	zio_t *zio;
    742 	blkptr_t blk;
    743 
    744 	ZIO_ENTER(vd->vdev_spa);
    745 
    746 	zio_phys_bp_init(vd, &blk, offset, size, checksum, labels);
    747 
    748 	zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private,
    749 	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL,
    750 	    ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
    751 
    752 	zio->io_vd = vd;
    753 	zio->io_offset = offset;
    754 
    755 	zio->io_bp = &zio->io_bp_copy;
    756 	zio->io_checksum = checksum;
    757 
    758 	if (zio_checksum_table[checksum].ci_zbt) {
    759 		/*
    760 		 * zbt checksums are necessarily destructive -- they modify
    761 		 * one word of the write buffer to hold the verifier/checksum.
    762 		 * Therefore, we must make a local copy in case the data is
    763 		 * being written to multiple places.
    764 		 */
    765 		wbuf = zio_buf_alloc(size);
    766 		bcopy(data, wbuf, size);
    767 		zio_push_transform(zio, wbuf, size, size);
    768 
    769 		zbt = (zio_block_tail_t *)((char *)wbuf + size) - 1;
    770 		zbt->zbt_cksum = blk.blk_cksum;
    771 	}
    772 
    773 	return (zio);
    774 }
    775 
    776 /*
    777  * Create a child I/O to do some work for us.  It has no associated bp.
    778  */
    779 zio_t *
    780 zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
    781 	void *data, uint64_t size, int type, int priority, int flags,
    782 	zio_done_func_t *done, void *private)
    783 {
    784 	uint32_t pipeline = ZIO_VDEV_CHILD_PIPELINE;
    785 	zio_t *cio;
    786 
    787 	if (type == ZIO_TYPE_READ && bp != NULL) {
    788 		/*
    789 		 * If we have the bp, then the child should perform the
    790 		 * checksum and the parent need not.  This pushes error
    791 		 * detection as close to the leaves as possible and
    792 		 * eliminates redundant checksums in the interior nodes.
    793 		 */
    794 		pipeline |= 1U << ZIO_STAGE_CHECKSUM_VERIFY;
    795 		zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
    796 	}
    797 
    798 	cio = zio_create(zio, zio->io_spa, zio->io_txg, bp, data, size,
    799 	    done, private, type, priority,
    800 	    (zio->io_flags & ZIO_FLAG_VDEV_INHERIT) | ZIO_FLAG_CANFAIL | flags,
    801 	    ZIO_STAGE_VDEV_IO_START - 1, pipeline);
    802 
    803 	cio->io_vd = vd;
    804 	cio->io_offset = offset;
    805 
    806 	return (cio);
    807 }
    808 
    809 /*
    810  * ==========================================================================
    811  * Initiate I/O, either sync or async
    812  * ==========================================================================
    813  */
    814 static void
    815 zio_destroy(zio_t *zio)
    816 {
    817 	mutex_destroy(&zio->io_lock);
    818 	cv_destroy(&zio->io_cv);
    819 	if (zio->io_failed_vds != NULL) {
    820 		kmem_free(zio->io_failed_vds,
    821 		    zio->io_failed_vds_count * sizeof (vdev_t *));
    822 		zio->io_failed_vds = NULL;
    823 		zio->io_failed_vds_count = 0;
    824 	}
    825 	kmem_cache_free(zio_cache, zio);
    826 }
    827 
    828 int
    829 zio_wait(zio_t *zio)
    830 {
    831 	int error;
    832 
    833 	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
    834 
    835 	zio->io_waiter = curthread;
    836 
    837 	zio_execute(zio);
    838 
    839 	mutex_enter(&zio->io_lock);
    840 	while (zio->io_stalled != ZIO_STAGE_DONE)
    841 		cv_wait(&zio->io_cv, &zio->io_lock);
    842 	mutex_exit(&zio->io_lock);
    843 
    844 	error = zio->io_error;
    845 	zio_destroy(zio);
    846 
    847 	return (error);
    848 }
    849 
/*
 * Start executing a zio without waiting for it: unlike zio_wait(), no
 * waiter is recorded and the zio is not destroyed here.
 */
void
zio_nowait(zio_t *zio)
{
	zio_execute(zio);
}
    855 
    856 void
    857 zio_interrupt(zio_t *zio)
    858 {
    859 	(void) taskq_dispatch(zio->io_spa->spa_zio_intr_taskq[zio->io_type],
    860 	    (task_func_t *)zio_execute, zio, TQ_SLEEP);
    861 }
    862 
    863 static int
    864 zio_issue_async(zio_t *zio)
    865 {
    866 	(void) taskq_dispatch(zio->io_spa->spa_zio_issue_taskq[zio->io_type],
    867 	    (task_func_t *)zio_execute, zio, TQ_SLEEP);
    868 
    869 	return (ZIO_PIPELINE_STOP);
    870 }
    871 
    872 /*
    873  * ==========================================================================
    874  * I/O pipeline interlocks: parent/child dependency scoreboarding
    875  * ==========================================================================
    876  */
    877 static int
    878 zio_wait_for_children(zio_t *zio, uint32_t stage, uint64_t *countp)
    879 {
    880 	int rv = ZIO_PIPELINE_CONTINUE;
    881 
    882 	mutex_enter(&zio->io_lock);
    883 	ASSERT(zio->io_stalled == 0);
    884 	if (*countp != 0) {
    885 		zio->io_stalled = stage;
    886 		rv = ZIO_PIPELINE_STOP;
    887 	}
    888 	mutex_exit(&zio->io_lock);
    889 
    890 	return (rv);
    891 }
    892 
    893 static void
    894 zio_add_failed_vdev(zio_t *pio, zio_t *zio)
    895 {
    896 	uint64_t oldcount = pio->io_failed_vds_count;
    897 	vdev_t **new_vds;
    898 	int i;
    899 
    900 	ASSERT(MUTEX_HELD(&pio->io_lock));
    901 
    902 	if (zio->io_vd == NULL)
    903 		return;
    904 
    905 	for (i = 0; i < oldcount; i++) {
    906 		if (pio->io_failed_vds[i] == zio->io_vd)
    907 			return;
    908 	}
    909 
    910 	new_vds = kmem_zalloc((oldcount + 1) * sizeof (vdev_t *), KM_SLEEP);
    911 	if (pio->io_failed_vds != NULL) {
    912 		bcopy(pio->io_failed_vds, new_vds,
    913 		    oldcount * sizeof (vdev_t *));
    914 		kmem_free(pio->io_failed_vds, oldcount * sizeof (vdev_t *));
    915 	}
    916 	pio->io_failed_vds = new_vds;
    917 	pio->io_failed_vds[oldcount] = zio->io_vd;
    918 	pio->io_failed_vds_count++;
    919 }
    920 
    921 static void
    922 zio_notify_parent(zio_t *zio, uint32_t stage, uint64_t *countp)
    923 {
    924 	zio_t *pio = zio->io_parent;
    925 
    926 	mutex_enter(&pio->io_lock);
    927 	if (pio->io_error == 0 && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) {
    928 		pio->io_error = zio->io_error;
    929 		if (zio->io_error && zio->io_error != ENOTSUP)
    930 			zio_add_failed_vdev(pio, zio);
    931 	}
    932 	ASSERT3U(*countp, >, 0);
    933 	if (--*countp == 0 && pio->io_stalled == stage) {
    934 		pio->io_stalled = 0;
    935 		mutex_exit(&pio->io_lock);
    936 		zio_execute(pio);
    937 	} else {
    938 		mutex_exit(&pio->io_lock);
    939 	}
    940 }
    941 
    942 int
    943 zio_wait_for_children_ready(zio_t *zio)
    944 {
    945 	return (zio_wait_for_children(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_READY,
    946 	    &zio->io_children_notready));
    947 }
    948 
    949 int
    950 zio_wait_for_children_done(zio_t *zio)
    951 {
    952 	return (zio_wait_for_children(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_DONE,
    953 	    &zio->io_children_notdone));
    954 }
    955 
    956 static int
    957 zio_read_init(zio_t *zio)
    958 {
    959 	blkptr_t *bp = zio->io_bp;
    960 
    961 	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
    962 		uint64_t csize = BP_GET_PSIZE(bp);
    963 		void *cbuf = zio_buf_alloc(csize);
    964 
    965 		zio_push_transform(zio, cbuf, csize, csize);
    966 		zio->io_pipeline |= 1U << ZIO_STAGE_READ_DECOMPRESS;
    967 	}
    968 
    969 	if (BP_IS_GANG(bp)) {
    970 		uint64_t gsize = SPA_GANGBLOCKSIZE;
    971 		void *gbuf = zio_buf_alloc(gsize);
    972 
    973 		zio_push_transform(zio, gbuf, gsize, gsize);
    974 		zio->io_pipeline |= 1U << ZIO_STAGE_READ_GANG_MEMBERS;
    975 	}
    976 
    977 	if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0)
    978 		zio->io_flags |= ZIO_FLAG_DONT_CACHE;
    979 
    980 	return (ZIO_PIPELINE_CONTINUE);
    981 }
    982 
    983 static int
    984 zio_ready(zio_t *zio)
    985 {
    986 	zio_t *pio = zio->io_parent;
    987 
    988 	if (zio->io_ready)
    989