Home | History | Annotate | Download | only in zfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #pragma ident	"@(#)zio.c	1.31	07/12/12 SMI"
     27 
     28 #include <sys/zfs_context.h>
     29 #include <sys/fm/fs/zfs.h>
     30 #include <sys/spa.h>
     31 #include <sys/txg.h>
     32 #include <sys/spa_impl.h>
     33 #include <sys/vdev_impl.h>
     34 #include <sys/zio_impl.h>
     35 #include <sys/zio_compress.h>
     36 #include <sys/zio_checksum.h>
     37 
     38 /*
     39  * ==========================================================================
     40  * I/O priority table
     41  * ==========================================================================
     42  */
     43 uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
     44 	0,	/* ZIO_PRIORITY_NOW		*/
     45 	0,	/* ZIO_PRIORITY_SYNC_READ	*/
     46 	0,	/* ZIO_PRIORITY_SYNC_WRITE	*/
     47 	6,	/* ZIO_PRIORITY_ASYNC_READ	*/
     48 	4,	/* ZIO_PRIORITY_ASYNC_WRITE	*/
     49 	4,	/* ZIO_PRIORITY_FREE		*/
     50 	0,	/* ZIO_PRIORITY_CACHE_FILL	*/
     51 	0,	/* ZIO_PRIORITY_LOG_WRITE	*/
     52 	10,	/* ZIO_PRIORITY_RESILVER	*/
     53 	20,	/* ZIO_PRIORITY_SCRUB		*/
     54 };
     55 
     56 /*
     57  * ==========================================================================
     58  * I/O type descriptions
     59  * ==========================================================================
     60  */
     61 char *zio_type_name[ZIO_TYPES] = {
     62 	"null", "read", "write", "free", "claim", "ioctl" };
     63 
     64 /* Force an allocation failure when non-zero */
     65 uint16_t zio_zil_fail_shift = 0;
     66 uint16_t zio_io_fail_shift = 0;
     67 
     68 /* Enable/disable the write-retry logic */
     69 int zio_write_retry = 1;
     70 
     71 /* Taskq to handle reissuing of I/Os */
     72 taskq_t *zio_taskq;
     73 int zio_resume_threads = 4;
     74 
     75 typedef struct zio_sync_pass {
     76 	int	zp_defer_free;		/* defer frees after this pass */
     77 	int	zp_dontcompress;	/* don't compress after this pass */
     78 	int	zp_rewrite;		/* rewrite new bps after this pass */
     79 } zio_sync_pass_t;
     80 
     81 zio_sync_pass_t zio_sync_pass = {
     82 	1,	/* zp_defer_free */
     83 	4,	/* zp_dontcompress */
     84 	1,	/* zp_rewrite */
     85 };
     86 
     87 static boolean_t zio_io_should_fail(uint16_t);
     88 
     89 /*
     90  * ==========================================================================
     91  * I/O kmem caches
     92  * ==========================================================================
     93  */
     94 kmem_cache_t *zio_cache;
     95 kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
     96 kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
     97 
     98 #ifdef _KERNEL
     99 extern vmem_t *zio_alloc_arena;
    100 #endif
    101 
    102 /*
    103  * Determine if we are allowed to issue the IO based on the
    104  * pool state. If we must wait then block until we are told
    105  * that we may continue.
    106  */
    107 #define	ZIO_ENTER(spa) {						\
    108 	if (spa->spa_state == POOL_STATE_IO_FAILURE) {			\
    109 		mutex_enter(&spa->spa_zio_lock);			\
    110 		while (spa->spa_state == POOL_STATE_IO_FAILURE)		\
    111 			cv_wait(&spa->spa_zio_cv, &spa->spa_zio_lock);	\
    112 		mutex_exit(&spa->spa_zio_lock);				\
    113 	}								\
    114 }
    115 
    116 /*
    117  * An allocation zio is one that either currently has the DVA allocate
    118  * stage set or will have it later in it's lifetime.
    119  */
    120 #define	IO_IS_ALLOCATING(zio) \
    121 	((zio)->io_orig_pipeline & (1U << ZIO_STAGE_DVA_ALLOCATE))
    122 
    123 void
    124 zio_init(void)
    125 {
    126 	size_t c;
    127 	vmem_t *data_alloc_arena = NULL;
    128 
    129 #ifdef _KERNEL
    130 	data_alloc_arena = zio_alloc_arena;
    131 #endif
    132 
    133 	zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0,
    134 	    NULL, NULL, NULL, NULL, NULL, 0);
    135 
    136 	/*
    137 	 * For small buffers, we want a cache for each multiple of
    138 	 * SPA_MINBLOCKSIZE.  For medium-size buffers, we want a cache
    139 	 * for each quarter-power of 2.  For large buffers, we want
    140 	 * a cache for each multiple of PAGESIZE.
    141 	 */
    142 	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
    143 		size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
    144 		size_t p2 = size;
    145 		size_t align = 0;
    146 
    147 		while (p2 & (p2 - 1))
    148 			p2 &= p2 - 1;
    149 
    150 		if (size <= 4 * SPA_MINBLOCKSIZE) {
    151 			align = SPA_MINBLOCKSIZE;
    152 		} else if (P2PHASE(size, PAGESIZE) == 0) {
    153 			align = PAGESIZE;
    154 		} else if (P2PHASE(size, p2 >> 2) == 0) {
    155 			align = p2 >> 2;
    156 		}
    157 
    158 		if (align != 0) {
    159 			char name[36];
    160 			(void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
    161 			zio_buf_cache[c] = kmem_cache_create(name, size,
    162 			    align, NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG);
    163 
    164 			(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
    165 			zio_data_buf_cache[c] = kmem_cache_create(name, size,
    166 			    align, NULL, NULL, NULL, NULL, data_alloc_arena,
    167 			    KMC_NODEBUG);
    168 
    169 		}
    170 	}
    171 
    172 	while (--c != 0) {
    173 		ASSERT(zio_buf_cache[c] != NULL);
    174 		if (zio_buf_cache[c - 1] == NULL)
    175 			zio_buf_cache[c - 1] = zio_buf_cache[c];
    176 
    177 		ASSERT(zio_data_buf_cache[c] != NULL);
    178 		if (zio_data_buf_cache[c - 1] == NULL)
    179 			zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
    180 	}
    181 
    182 	zio_taskq = taskq_create("zio_taskq", zio_resume_threads,
    183 	    maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE);
    184 
    185 	zio_inject_init();
    186 }
    187 
    188 void
    189 zio_fini(void)
    190 {
    191 	size_t c;
    192 	kmem_cache_t *last_cache = NULL;
    193 	kmem_cache_t *last_data_cache = NULL;
    194 
    195 	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
    196 		if (zio_buf_cache[c] != last_cache) {
    197 			last_cache = zio_buf_cache[c];
    198 			kmem_cache_destroy(zio_buf_cache[c]);
    199 		}
    200 		zio_buf_cache[c] = NULL;
    201 
    202 		if (zio_data_buf_cache[c] != last_data_cache) {
    203 			last_data_cache = zio_data_buf_cache[c];
    204 			kmem_cache_destroy(zio_data_buf_cache[c]);
    205 		}
    206 		zio_data_buf_cache[c] = NULL;
    207 	}
    208 
    209 	taskq_destroy(zio_taskq);
    210 
    211 	kmem_cache_destroy(zio_cache);
    212 
    213 	zio_inject_fini();
    214 }
    215 
    216 /*
    217  * ==========================================================================
    218  * Allocate and free I/O buffers
    219  * ==========================================================================
    220  */
    221 
    222 /*
    223  * Use zio_buf_alloc to allocate ZFS metadata.  This data will appear in a
    224  * crashdump if the kernel panics, so use it judiciously.  Obviously, it's
    225  * useful to inspect ZFS metadata, but if possible, we should avoid keeping
    226  * excess / transient data in-core during a crashdump.
    227  */
    228 void *
    229 zio_buf_alloc(size_t size)
    230 {
    231 	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
    232 
    233 	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
    234 
    235 	return (kmem_cache_alloc(zio_buf_cache[c], KM_SLEEP));
    236 }
    237 
    238 /*
    239  * Use zio_data_buf_alloc to allocate data.  The data will not appear in a
    240  * crashdump if the kernel panics.  This exists so that we will limit the amount
    241  * of ZFS data that shows up in a kernel crashdump.  (Thus reducing the amount
    242  * of kernel heap dumped to disk when the kernel panics)
    243  */
    244 void *
    245 zio_data_buf_alloc(size_t size)
    246 {
    247 	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
    248 
    249 	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
    250 
    251 	return (kmem_cache_alloc(zio_data_buf_cache[c], KM_SLEEP));
    252 }
    253 
    254 void
    255 zio_buf_free(void *buf, size_t size)
    256 {
    257 	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
    258 
    259 	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
    260 
    261 	kmem_cache_free(zio_buf_cache[c], buf);
    262 }
    263 
    264 void
    265 zio_data_buf_free(void *buf, size_t size)
    266 {
    267 	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
    268 
    269 	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
    270 
    271 	kmem_cache_free(zio_data_buf_cache[c], buf);
    272 }
    273 
    274 /*
    275  * ==========================================================================
    276  * Push and pop I/O transform buffers
    277  * ==========================================================================
    278  */
    279 static void
    280 zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize)
    281 {
    282 	zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
    283 
    284 	zt->zt_data = data;
    285 	zt->zt_size = size;
    286 	zt->zt_bufsize = bufsize;
    287 
    288 	zt->zt_next = zio->io_transform_stack;
    289 	zio->io_transform_stack = zt;
    290 
    291 	zio->io_data = data;
    292 	zio->io_size = size;
    293 }
    294 
    295 static void
    296 zio_pop_transform(zio_t *zio, void **data, uint64_t *size, uint64_t *bufsize)
    297 {
    298 	zio_transform_t *zt = zio->io_transform_stack;
    299 
    300 	*data = zt->zt_data;
    301 	*size = zt->zt_size;
    302 	*bufsize = zt->zt_bufsize;
    303 
    304 	zio->io_transform_stack = zt->zt_next;
    305 	kmem_free(zt, sizeof (zio_transform_t));
    306 
    307 	if ((zt = zio->io_transform_stack) != NULL) {
    308 		zio->io_data = zt->zt_data;
    309 		zio->io_size = zt->zt_size;
    310 	}
    311 }
    312 
    313 static void
    314 zio_clear_transform_stack(zio_t *zio)
    315 {
    316 	void *data;
    317 	uint64_t size, bufsize;
    318 
    319 	ASSERT(zio->io_transform_stack != NULL);
    320 
    321 	zio_pop_transform(zio, &data, &size, &bufsize);
    322 	while (zio->io_transform_stack != NULL) {
    323 		zio_buf_free(data, bufsize);
    324 		zio_pop_transform(zio, &data, &size, &bufsize);
    325 	}
    326 }
    327 
    328 /*
    329  * ==========================================================================
    330  * Create the various types of I/O (read, write, free)
    331  * ==========================================================================
    332  */
    333 static zio_t *
    334 zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    335     void *data, uint64_t size, zio_done_func_t *done, void *private,
    336     zio_type_t type, int priority, int flags, uint8_t stage, uint32_t pipeline)
    337 {
    338 	zio_t *zio;
    339 
    340 	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
    341 	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
    342 
    343 	zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
    344 	bzero(zio, sizeof (zio_t));
    345 	zio->io_parent = pio;
    346 	zio->io_spa = spa;
    347 	zio->io_txg = txg;
    348 	zio->io_flags = flags;
    349 	if (bp != NULL) {
    350 		zio->io_bp = bp;
    351 		zio->io_bp_copy = *bp;
    352 		zio->io_bp_orig = *bp;
    353 	}
    354 	zio->io_done = done;
    355 	zio->io_private = private;
    356 	zio->io_type = type;
    357 	zio->io_priority = priority;
    358 	zio->io_stage = stage;
    359 	zio->io_pipeline = pipeline;
    360 	zio->io_timestamp = lbolt64;
    361 	mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
    362 	cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
    363 	zio_push_transform(zio, data, size, size);
    364 
    365 	/*
    366 	 * Note on config lock:
    367 	 *
    368 	 * If CONFIG_HELD is set, then the caller already has the config
    369 	 * lock, so we don't need it for this io.
    370 	 *
    371 	 * We set CONFIG_GRABBED to indicate that we have grabbed the
    372 	 * config lock on behalf of this io, so it should be released
    373 	 * in zio_done.
    374 	 *
    375 	 * Unless CONFIG_HELD is set, we will grab the config lock for
    376 	 * any top-level (parent-less) io, *except* NULL top-level ios.
    377 	 * The NULL top-level ios rarely have any children, so we delay
    378 	 * grabbing the lock until the first child is added (but it is
    379 	 * still grabbed on behalf of the top-level i/o, so additional
    380 	 * children don't need to also grab it).  This greatly reduces
    381 	 * contention on the config lock.
    382 	 */
    383 	if (pio == NULL) {
    384 		if (type != ZIO_TYPE_NULL &&
    385 		    !(flags & ZIO_FLAG_CONFIG_HELD)) {
    386 			spa_config_enter(spa, RW_READER, zio);
    387 			zio->io_flags |= ZIO_FLAG_CONFIG_GRABBED;
    388 		}
    389 		zio->io_root = zio;
    390 	} else {
    391 		zio->io_root = pio->io_root;
    392 		if (!(flags & ZIO_FLAG_NOBOOKMARK))
    393 			zio->io_logical = pio->io_logical;
    394 		mutex_enter(&pio->io_lock);
    395 		if (pio->io_parent == NULL &&
    396 		    pio->io_type == ZIO_TYPE_NULL &&
    397 		    !(pio->io_flags & ZIO_FLAG_CONFIG_GRABBED) &&
    398 		    !(pio->io_flags & ZIO_FLAG_CONFIG_HELD)) {
    399 			pio->io_flags |= ZIO_FLAG_CONFIG_GRABBED;
    400 			spa_config_enter(spa, RW_READER, pio);
    401 		}
    402 		if (stage < ZIO_STAGE_READY)
    403 			pio->io_children_notready++;
    404 		pio->io_children_notdone++;
    405 		zio->io_sibling_next = pio->io_child;
    406 		zio->io_sibling_prev = NULL;
    407 		if (pio->io_child != NULL)
    408 			pio->io_child->io_sibling_prev = zio;
    409 		pio->io_child = zio;
    410 		zio->io_ndvas = pio->io_ndvas;
    411 		mutex_exit(&pio->io_lock);
    412 	}
    413 
    414 	/*
    415 	 * Save off the original state incase we need to retry later.
    416 	 */
    417 	zio->io_orig_stage = zio->io_stage;
    418 	zio->io_orig_pipeline = zio->io_pipeline;
    419 	zio->io_orig_flags = zio->io_flags;
    420 
    421 	return (zio);
    422 }
    423 
    424 static void
    425 zio_reset(zio_t *zio)
    426 {
    427 	zio_clear_transform_stack(zio);
    428 
    429 	zio->io_flags = zio->io_orig_flags;
    430 	zio->io_stage = zio->io_orig_stage;
    431 	zio->io_pipeline = zio->io_orig_pipeline;
    432 	zio_push_transform(zio, zio->io_data, zio->io_size, zio->io_size);
    433 }
    434 
    435 zio_t *
    436 zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private,
    437 	int flags)
    438 {
    439 	zio_t *zio;
    440 
    441 	zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
    442 	    ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, ZIO_STAGE_OPEN,
    443 	    ZIO_WAIT_FOR_CHILDREN_PIPELINE);
    444 
    445 	return (zio);
    446 }
    447 
    448 zio_t *
    449 zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags)
    450 {
    451 	return (zio_null(NULL, spa, done, private, flags));
    452 }
    453 
    454 zio_t *
    455 zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data,
    456     uint64_t size, zio_done_func_t *done, void *private,
    457     int priority, int flags, zbookmark_t *zb)
    458 {
    459 	zio_t *zio;
    460 
    461 	ASSERT3U(size, ==, BP_GET_LSIZE(bp));
    462 
    463 	/*
    464 	 * If the user has specified that we allow I/Os to continue
    465 	 * then attempt to satisfy the read.
    466 	 */
    467 	if (spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)
    468 		ZIO_ENTER(spa);
    469 
    470 	zio = zio_create(pio, spa, bp->blk_birth, bp, data, size, done, private,
    471 	    ZIO_TYPE_READ, priority, flags | ZIO_FLAG_USER,
    472 	    ZIO_STAGE_OPEN, ZIO_READ_PIPELINE);
    473 	zio->io_bookmark = *zb;
    474 
    475 	zio->io_logical = zio;
    476 
    477 	/*
    478 	 * Work off our copy of the bp so the caller can free it.
    479 	 */
    480 	zio->io_bp = &zio->io_bp_copy;
    481 
    482 	return (zio);
    483 }
    484 
    485 zio_t *
    486 zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies,
    487     uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
    488     zio_done_func_t *ready, zio_done_func_t *done, void *private, int priority,
    489     int flags, zbookmark_t *zb)
    490 {
    491 	zio_t *zio;
    492 
    493 	ASSERT(checksum >= ZIO_CHECKSUM_OFF &&
    494 	    checksum < ZIO_CHECKSUM_FUNCTIONS);
    495 
    496 	ASSERT(compress >= ZIO_COMPRESS_OFF &&
    497 	    compress < ZIO_COMPRESS_FUNCTIONS);
    498 
    499 	ZIO_ENTER(spa);
    500 
    501 	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
    502 	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER,
    503 	    ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE);
    504 
    505 	zio->io_ready = ready;
    506 
    507 	zio->io_bookmark = *zb;
    508 
    509 	zio->io_logical = zio;
    510 
    511 	zio->io_checksum = checksum;
    512 	zio->io_compress = compress;
    513 	zio->io_ndvas = ncopies;
    514 
    515 	if (bp->blk_birth != txg) {
    516 		/* XXX the bp usually (always?) gets re-zeroed later */
    517 		BP_ZERO(bp);
    518 		BP_SET_LSIZE(bp, size);
    519 		BP_SET_PSIZE(bp, size);
    520 	} else {
    521 		/* Make sure someone doesn't change their mind on overwrites */
    522 		ASSERT(MIN(zio->io_ndvas + BP_IS_GANG(bp),
    523 		    spa_max_replication(spa)) == BP_GET_NDVAS(bp));
    524 	}
    525 
    526 	return (zio);
    527 }
    528 
    529 zio_t *
    530 zio_rewrite(zio_t *pio, spa_t *spa, int checksum,
    531     uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
    532     zio_done_func_t *done, void *private, int priority, int flags,
    533     zbookmark_t *zb)
    534 {
    535 	zio_t *zio;
    536 
    537 	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
    538 	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER,
    539 	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE(bp));
    540 
    541 	zio->io_bookmark = *zb;
    542 	zio->io_checksum = checksum;
    543 	zio->io_compress = ZIO_COMPRESS_OFF;
    544 
    545 	if (pio != NULL)
    546 		ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp));
    547 
    548 	return (zio);
    549 }
    550 
    551 static void
    552 zio_write_allocate_ready(zio_t *zio)
    553 {
    554 	/* Free up the previous block */
    555 	if (!BP_IS_HOLE(&zio->io_bp_orig)) {
    556 		zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg,
    557 		    &zio->io_bp_orig, NULL, NULL));
    558 	}
    559 }
    560 
    561 static zio_t *
    562 zio_write_allocate(zio_t *pio, spa_t *spa, int checksum,
    563     uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
    564     zio_done_func_t *done, void *private, int priority, int flags)
    565 {
    566 	zio_t *zio;
    567 
    568 	BP_ZERO(bp);
    569 	BP_SET_LSIZE(bp, size);
    570 	BP_SET_PSIZE(bp, size);
    571 	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
    572 
    573 	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
    574 	    ZIO_TYPE_WRITE, priority, flags,
    575 	    ZIO_STAGE_OPEN, ZIO_WRITE_ALLOCATE_PIPELINE);
    576 
    577 	zio->io_checksum = checksum;
    578 	zio->io_compress = ZIO_COMPRESS_OFF;
    579 	zio->io_ready = zio_write_allocate_ready;
    580 
    581 	return (zio);
    582 }
    583 
    584 zio_t *
    585 zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    586     zio_done_func_t *done, void *private)
    587 {
    588 	zio_t *zio;
    589 
    590 	ASSERT(!BP_IS_HOLE(bp));
    591 
    592 	if (txg == spa->spa_syncing_txg &&
    593 	    spa->spa_sync_pass > zio_sync_pass.zp_defer_free) {
    594 		bplist_enqueue_deferred(&spa->spa_sync_bplist, bp);
    595 		return (zio_null(pio, spa, NULL, NULL, 0));
    596 	}
    597 
    598 	zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
    599 	    ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, ZIO_FLAG_USER,
    600 	    ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE(bp));
    601 
    602 	zio->io_bp = &zio->io_bp_copy;
    603 
    604 	return (zio);
    605 }
    606 
    607 zio_t *
    608 zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    609     zio_done_func_t *done, void *private)
    610 {
    611 	zio_t *zio;
    612 
    613 	/*
    614 	 * A claim is an allocation of a specific block.  Claims are needed
    615 	 * to support immediate writes in the intent log.  The issue is that
    616 	 * immediate writes contain committed data, but in a txg that was
    617 	 * *not* committed.  Upon opening the pool after an unclean shutdown,
    618 	 * the intent log claims all blocks that contain immediate write data
    619 	 * so that the SPA knows they're in use.
    620 	 *
    621 	 * All claims *must* be resolved in the first txg -- before the SPA
    622 	 * starts allocating blocks -- so that nothing is allocated twice.
    623 	 */
    624 	ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
    625 	ASSERT3U(spa_first_txg(spa), <=, txg);
    626 
    627 	zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
    628 	    ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, 0,
    629 	    ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE(bp));
    630 
    631 	zio->io_bp = &zio->io_bp_copy;
    632 
    633 	return (zio);
    634 }
    635 
    636 zio_t *
    637 zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
    638     zio_done_func_t *done, void *private, int priority, int flags)
    639 {
    640 	zio_t *zio;
    641 	int c;
    642 
    643 	if (vd->vdev_children == 0) {
    644 		zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
    645 		    ZIO_TYPE_IOCTL, priority, flags,
    646 		    ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
    647 
    648 		zio->io_vd = vd;
    649 		zio->io_cmd = cmd;
    650 	} else {
    651 		zio = zio_null(pio, spa, NULL, NULL, flags);
    652 
    653 		for (c = 0; c < vd->vdev_children; c++)
    654 			zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
    655 			    done, private, priority, flags));
    656 	}
    657 
    658 	return (zio);
    659 }
    660 
    661 static void
    662 zio_phys_bp_init(vdev_t *vd, blkptr_t *bp, uint64_t offset, uint64_t size,
    663     int checksum, boolean_t labels)
    664 {
    665 	ASSERT(vd->vdev_children == 0);
    666 
    667 	ASSERT(size <= SPA_MAXBLOCKSIZE);
    668 	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
    669 	ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
    670 
    671 #ifdef ZFS_DEBUG
    672 	if (labels) {
    673 		ASSERT(offset + size <= VDEV_LABEL_START_SIZE ||
    674 		    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
    675 	}
    676 #endif
    677 	ASSERT3U(offset + size, <=, vd->vdev_psize);
    678 
    679 	BP_ZERO(bp);
    680 
    681 	BP_SET_LSIZE(bp, size);
    682 	BP_SET_PSIZE(bp, size);
    683 
    684 	BP_SET_CHECKSUM(bp, checksum);
    685 	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
    686 	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
    687 
    688 	if (checksum != ZIO_CHECKSUM_OFF)
    689 		ZIO_SET_CHECKSUM(&bp->blk_cksum, offset, 0, 0, 0);
    690 }
    691 
    692 zio_t *
    693 zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    694     void *data, int checksum, zio_done_func_t *done, void *private,
    695     int priority, int flags, boolean_t labels)
    696 {
    697 	zio_t *zio;
    698 	blkptr_t blk;
    699 
    700 	ZIO_ENTER(vd->vdev_spa);
    701 
    702 	zio_phys_bp_init(vd, &blk, offset, size, checksum, labels);
    703 
    704 	zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private,
    705 	    ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL,
    706 	    ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
    707 
    708 	zio->io_vd = vd;
    709 	zio->io_offset = offset;
    710 
    711 	/*
    712 	 * Work off our copy of the bp so the caller can free it.
    713 	 */
    714 	zio->io_bp = &zio->io_bp_copy;
    715 
    716 	return (zio);
    717 }
    718 
    719 zio_t *
    720 zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    721     void *data, int checksum, zio_done_func_t *done, void *private,
    722     int priority, int flags, boolean_t labels)
    723 {
    724 	zio_block_tail_t *zbt;
    725 	void *wbuf;
    726 	zio_t *zio;
    727 	blkptr_t blk;
    728 
    729 	ZIO_ENTER(vd->vdev_spa);
    730 
    731 	zio_phys_bp_init(vd, &blk, offset, size, checksum, labels);
    732 
    733 	zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private,
    734 	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL,
    735 	    ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
    736 
    737 	zio->io_vd = vd;
    738 	zio->io_offset = offset;
    739 
    740 	zio->io_bp = &zio->io_bp_copy;
    741 	zio->io_checksum = checksum;
    742 
    743 	if (zio_checksum_table[checksum].ci_zbt) {
    744 		/*
    745 		 * zbt checksums are necessarily destructive -- they modify
    746 		 * one word of the write buffer to hold the verifier/checksum.
    747 		 * Therefore, we must make a local copy in case the data is
    748 		 * being written to multiple places.
    749 		 */
    750 		wbuf = zio_buf_alloc(size);
    751 		bcopy(data, wbuf, size);
    752 		zio_push_transform(zio, wbuf, size, size);
    753 
    754 		zbt = (zio_block_tail_t *)((char *)wbuf + size) - 1;
    755 		zbt->zbt_cksum = blk.blk_cksum;
    756 	}
    757 
    758 	return (zio);
    759 }
    760 
    761 /*
    762  * Create a child I/O to do some work for us.  It has no associated bp.
    763  */
    764 zio_t *
    765 zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
    766 	void *data, uint64_t size, int type, int priority, int flags,
    767 	zio_done_func_t *done, void *private)
    768 {
    769 	uint32_t pipeline = ZIO_VDEV_CHILD_PIPELINE;
    770 	zio_t *cio;
    771 
    772 	if (type == ZIO_TYPE_READ && bp != NULL) {
    773 		/*
    774 		 * If we have the bp, then the child should perform the
    775 		 * checksum and the parent need not.  This pushes error
    776 		 * detection as close to the leaves as possible and
    777 		 * eliminates redundant checksums in the interior nodes.
    778 		 */
    779 		pipeline |= 1U << ZIO_STAGE_CHECKSUM_VERIFY;
    780 		zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
    781 	}
    782 
    783 	cio = zio_create(zio, zio->io_spa, zio->io_txg, bp, data, size,
    784 	    done, private, type, priority,
    785 	    (zio->io_flags & ZIO_FLAG_VDEV_INHERIT) | ZIO_FLAG_CANFAIL | flags,
    786 	    ZIO_STAGE_VDEV_IO_START - 1, pipeline);
    787 
    788 	cio->io_vd = vd;
    789 	cio->io_offset = offset;
    790 
    791 	return (cio);
    792 }
    793 
    794 /*
    795  * ==========================================================================
    796  * Initiate I/O, either sync or async
    797  * ==========================================================================
    798  */
    799 int
    800 zio_wait(zio_t *zio)
    801 {
    802 	int error;
    803 
    804 	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
    805 
    806 	zio->io_waiter = curthread;
    807 
    808 	zio_execute(zio);
    809 
    810 	mutex_enter(&zio->io_lock);
    811 	while (zio->io_stalled != ZIO_STAGE_DONE)
    812 		cv_wait(&zio->io_cv, &zio->io_lock);
    813 	mutex_exit(&zio->io_lock);
    814 
    815 	error = zio->io_error;
    816 	mutex_destroy(&zio->io_lock);
    817 	cv_destroy(&zio->io_cv);
    818 	kmem_cache_free(zio_cache, zio);
    819 
    820 	return (error);
    821 }
    822 
    823 void
    824 zio_nowait(zio_t *zio)
    825 {
    826 	zio_execute(zio);
    827 }
    828 
    829 void
    830 zio_interrupt(zio_t *zio)
    831 {
    832 	(void) taskq_dispatch(zio->io_spa->spa_zio_intr_taskq[zio->io_type],
    833 	    (task_func_t *)zio_execute, zio, TQ_SLEEP);
    834 }
    835 
    836 static int
    837 zio_issue_async(zio_t *zio)
    838 {
    839 	(void) taskq_dispatch(zio->io_spa->spa_zio_issue_taskq[zio->io_type],
    840 	    (task_func_t *)zio_execute, zio, TQ_SLEEP);
    841 
    842 	return (ZIO_PIPELINE_STOP);
    843 }
    844 
    845 /*
    846  * ==========================================================================
    847  * I/O pipeline interlocks: parent/child dependency scoreboarding
    848  * ==========================================================================
    849  */
    850 static int
    851 zio_wait_for_children(zio_t *zio, uint32_t stage, uint64_t *countp)
    852 {
    853 	int rv = ZIO_PIPELINE_CONTINUE;
    854 
    855 	mutex_enter(&zio->io_lock);
    856 	ASSERT(zio->io_stalled == 0);
    857 	if (*countp != 0) {
    858 		zio->io_stalled = stage;
    859 		rv = ZIO_PIPELINE_STOP;
    860 	}
    861 	mutex_exit(&zio->io_lock);
    862 
    863 	return (rv);
    864 }
    865 
    866 static void
    867 zio_notify_parent(zio_t *zio, uint32_t stage, uint64_t *countp)
    868 {
    869 	zio_t *pio = zio->io_parent;
    870 
    871 	mutex_enter(&pio->io_lock);
    872 	if (pio->io_error == 0 && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
    873 		pio->io_error = zio->io_error;
    874 	ASSERT3U(*countp, >, 0);
    875 	if (--*countp == 0 && pio->io_stalled == stage) {
    876 		pio->io_stalled = 0;
    877 		mutex_exit(&pio->io_lock);
    878 		zio_execute(pio);
    879 	} else {
    880 		mutex_exit(&pio->io_lock);
    881 	}
    882 }
    883 
    884 int
    885 zio_wait_for_children_ready(zio_t *zio)
    886 {
    887 	return (zio_wait_for_children(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_READY,
    888 	    &zio->io_children_notready));
    889 }
    890 
    891 int
    892 zio_wait_for_children_done(zio_t *zio)
    893 {
    894 	return (zio_wait_for_children(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_DONE,
    895 	    &zio->io_children_notdone));
    896 }
    897 
    898 static int
    899 zio_read_init(zio_t *zio)
    900 {
    901 	blkptr_t *bp = zio->io_bp;
    902 
    903 	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
    904 		uint64_t csize = BP_GET_PSIZE(bp);
    905 		void *cbuf = zio_buf_alloc(csize);
    906 
    907 		zio_push_transform(zio, cbuf, csize, csize);
    908 		zio->io_pipeline |= 1U << ZIO_STAGE_READ_DECOMPRESS;
    909 	}
    910 
    911 	if (BP_IS_GANG(bp)) {
    912 		uint64_t gsize = SPA_GANGBLOCKSIZE;
    913 		void *gbuf = zio_buf_alloc(gsize);
    914 
    915 		zio_push_transform(zio, gbuf, gsize, gsize);
    916 		zio->io_pipeline |= 1U << ZIO_STAGE_READ_GANG_MEMBERS;
    917 	}
    918 
    919 	if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0)
    920 		zio->io_flags |= ZIO_FLAG_DONT_CACHE;
    921 
    922 	return (ZIO_PIPELINE_CONTINUE);
    923 }
    924 
        /*
         * Pipeline stage: the I/O is "ready".
         *
         * Invokes the io_ready callback if one is set, notifies the parent (if
         * any) for the WAIT_FOR_CHILDREN_READY stage, and snapshots the block
         * pointer into io_bp_copy.  Always returns ZIO_PIPELINE_CONTINUE.
         */
    925 static int
    926 zio_ready(zio_t *zio)
    927 {
    928 	zio_t *pio = zio->io_parent;
    929 
    930 	if (zio->io_ready)
    931 		zio->io_ready(zio);
    932 
    933 	if (pio != NULL)
    934 		zio_notify_parent(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_READY,
    935 		    &pio->io_children_notready);
    936 
        	/* zio_assess() later asserts that *io_bp still matches this copy. */
    937 	if (zio->io_bp)
    938 		zio->io_bp_copy = *zio->io_bp;
    939 
    940 	return (ZIO_PIPELINE_CONTINUE);
    941 }
    942 
        /*
         * Prepare a failed allocating write to be reissued.
         *
         * Saves the failed bp in io_bp_orig, zeroes the bp's DVAs, calls
         * zio_reset(), re-registers this zio as a not-ready child of its parent
         * (if any), and clears this zio's retry flag and error.
         * Always returns ZIO_PIPELINE_CONTINUE.
         */
    943 static int
    944 zio_vdev_retry_io(zio_t *zio)
    945 {
    946 	zio_t *pio = zio->io_parent;
    947 
    948 	/*
    949 	 * Preserve the failed bp so that the io_ready() callback can
    950 	 * update the accounting accordingly. The callback will also be
    951 	 * responsible for freeing the previously allocated block, if one
    952 	 * exists.
    953 	 */
    954 	zio->io_bp_orig = *zio->io_bp;
    955 
    956 	/*
    957 	 * We must zero out the old DVA and blk_birth before reallocating
    958 	 * the bp.
    959 	 */
    960 	BP_ZERO_DVAS(zio->io_bp);
    961 	zio_reset(zio);
    962 
    963 	if (pio) {
    964 		/*
    965 		 * Let the parent know that we will
    966 		 * re-alloc the write (=> new bp info).
    967 		 */
    968 		mutex_enter(&pio->io_lock);
    969 		pio->io_children_notready++;
    970 
    971 		/*
    972 		 * If the parent I/O is still in the open stage, then
    973 		 * don't bother telling it to retry since it hasn't
    974 		 * progressed far enough for it to care.
    975 		 */
    976 		if (pio->io_stage > ZIO_STAGE_OPEN && IO_IS_ALLOCATING(pio))
    977 			pio->io_flags |= ZIO_FLAG_WRITE_RETRY;
    978 
    979 		ASSERT(pio->io_stage <= ZIO_STAGE_WAIT_FOR_CHILDREN_DONE);
    980 		mutex_exit(&pio->io_lock);
    981 	}
    982 
    983 	/*
    984 	 * We are getting ready to process the retry request so clear
    985 	 * the flag and the zio's current error status.
    986 	 */
    987 	zio->io_flags &= ~ZIO_FLAG_WRITE_RETRY;
    988 	zio->io_error = 0;
    989 
    990 	return (ZIO_PIPELINE_CONTINUE);
    991 }
    992 
        /*
         * Try to resume the I/Os queued on spa->spa_zio_list.
         *
         * Returns the error from vdev_probe() if any probed vdev fails, EIO if
         * the pool is in POOL_STATE_IO_FAILURE after the resumed I/Os have run,
         * and 0 otherwise.  spa_zio_lock is held while probing and while the
         * list is drained; it is dropped before waiting on the taskq.
         */
    993 int
    994 zio_vdev_resume_io(spa_t *spa)
    995 {
    996 	zio_t *zio;
    997 
    998 	mutex_enter(&spa->spa_zio_lock);
    999 
   1000 	/*
   1001 	 * Probe all of the vdevs that have experienced an I/O error.
   1002 	 * If we are still unable to verify the integrity of the vdev
   1003 	 * then we prevent the resume from proceeding.
   1004 	 */
   1005 	for (zio = list_head(&spa->spa_zio_list); zio != NULL;
   1006 	    zio = list_next(&spa->spa_zio_list, zio)) {
   1007 		int error = 0;
   1008 
   1009 		/* We only care about I/Os that must succeed */
   1010 		if (zio->io_vd == NULL || zio->io_flags & ZIO_FLAG_CANFAIL)
   1011 			continue;
   1012 		error = vdev_probe(zio->io_vd);
   1013 		if (error) {
        			/* The queued I/Os are left on the list. */
   1014 			mutex_exit(&spa->spa_zio_lock);
   1015 			return (error);
   1016 		}
   1017 	}
   1018 
   1019 	/*
   1020 	 * Clear the vdev stats so that I/O can flow.
   1021 	 */
   1022 	vdev_clear(spa, NULL, B_FALSE);
   1023 
   1024 	spa->spa_state = POOL_STATE_ACTIVE;
        	/* Drain the list, dispatching each I/O to zio_taskq. */
   1025 	while ((zio = list_head(&spa->spa_zio_list)) != NULL) {
   1026 		list_remove(&spa->spa_zio_list, zio);
   1027 		zio->io_error = 0;
   1028 
   1029 		/*
   1030 		 * If we are resuming an allocating I/O then we force it
   1031 		 * to retry and let it resume operation where it left off.
   1032 		 * Otherwise, go back to the ready stage and pick up from
   1033 		 * there.
   1034 		 */
   1035 		if (zio_write_retry && IO_IS_ALLOCATING(zio)) {
   1036 			zio->io_flags |= ZIO_FLAG_WRITE_RETRY;
   1037 			zio->io_stage--;
   1038 		} else {
   1039 			zio->io_stage = ZIO_STAGE_READY;
   1040 		}
   1041 
   1042 		(void) taskq_dispatch(zio_taskq, (task_func_t *)zio_execute,
   1043 		    zio, TQ_SLEEP);
   1044 	}
   1045 	mutex_exit(&spa->spa_zio_lock);
   1046 
   1047 	/*
   1048 	 * Wait for the taskqs to finish and recheck the pool state since
   1049 	 * it's possible that a resumed I/O has failed again.
   1050 	 */
   1051 	taskq_wait(zio_taskq);
   1052 	if (spa_state(spa) == POOL_STATE_IO_FAILURE)
   1053 		return (EIO);
   1054 
        	/* Wake anything waiting on spa_zio_cv. */
   1055 	mutex_enter(&spa->spa_zio_lock);
   1056 	cv_broadcast(&spa->spa_zio_cv);
   1057 	mutex_exit(&spa->spa_zio_lock);
   1058 
   1059 	return (0);
   1060 }
   1061 
        /*
         * Mark the pool as POOL_STATE_IO_FAILURE and park 'zio' at the tail of
         * spa_zio_list (under spa_zio_lock).  Returns ZIO_PIPELINE_STOP.
         * NOTE(review): spa_state is written before spa_zio_lock is taken.
         */
   1062 static int
   1063 zio_vdev_suspend_io(zio_t *zio)
   1064 {
   1065 	spa_t *spa = zio->io_spa;
   1066 
   1067 	/*
   1068 	 * We've experienced an unrecoverable failure so
   1069 	 * set the pool state accordingly and queue all
   1070 	 * failed IOs.
   1071 	 */
   1072 	spa->spa_state = POOL_STATE_IO_FAILURE;
   1073 
   1074 	mutex_enter(&spa->spa_zio_lock);
   1075 	list_insert_tail(&spa->spa_zio_list, zio);
   1076 
   1077 #ifndef _KERNEL
   1078 	/* Used to notify ztest that the pool has suspended */
   1079 	cv_broadcast(&spa->spa_zio_cv);
   1080 #endif
   1081 	mutex_exit(&spa->spa_zio_lock);
   1082 
   1083 	return (ZIO_PIPELINE_STOP);
   1084 }
   1085 
        /*
         * Pipeline stage: evaluate the outcome of the I/O.
         *
         * Sanity-checks the bp, converts a pending write-retry request into
         * ERESTART, updates vdev stats, and on error: posts ereports, logs
         * logical-I/O errors, retries allocating writes via zio_vdev_retry_io(),
         * or, for I/Os without ZIO_FLAG_CANFAIL, panics or suspends the I/O
         * depending on the pool's failmode.
         *
         * Returns ZIO_PIPELINE_CONTINUE, or the return value of
         * zio_vdev_retry_io() / zio_vdev_suspend_io().
         */
   1086 static int
   1087 zio_assess(zio_t *zio)
   1088 {
   1089 	spa_t *spa = zio->io_spa;
   1090 	blkptr_t *bp = zio->io_bp;
   1091 	vdev_t *vd = zio->io_vd;
   1092 
   1093 	ASSERT(zio->io_children_notready == 0);
   1094 	ASSERT(zio->io_children_notdone == 0);
   1095 
   1096 	if (bp != NULL) {
   1097 		ASSERT(bp->blk_pad[0] == 0);
   1098 		ASSERT(bp->blk_pad[1] == 0);
   1099 		ASSERT(bp->blk_pad[2] == 0);
        		/* The bp must not have changed since zio_ready() copied it. */
   1100 		ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0);
   1101 		if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
   1102 		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
   1103 			ASSERT(!BP_SHOULD_BYTESWAP(bp));
   1104 			if (zio->io_ndvas != 0)
   1105 				ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp));
   1106 			ASSERT(BP_COUNT_GANG(bp) == 0 ||
   1107 			    (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
   1108 		}
   1109 	}
   1110 
   1111 	/*
   1112 	 * Some child I/O has indicated that a retry is necessary, so
   1113 	 * we set an error on the I/O and let the logic below do the
   1114 	 * rest.
   1115 	 */
   1116 	if (zio->io_flags & ZIO_FLAG_WRITE_RETRY)
   1117 		zio->io_error = ERESTART;
   1118 
   1119 	if (vd != NULL)
   1120 		vdev_stat_update(zio);
   1121 
   1122 	if (zio->io_error) {
   1123 		/*
   1124 		 * If this I/O is attached to a particular vdev,
   1125 		 * generate an error message describing the I/O failure
   1126 		 * at the block level.  We ignore these errors if the
   1127 		 * device is currently unavailable.
   1128 		 */
   1129 		if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd))
   1130 			zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0);
   1131 
        		/*
        		 * Only logical I/Os (io_logical == zio) are logged here;
        		 * speculative ones are logged only when the error is EIO.
        		 */
   1132 		if ((zio->io_error == EIO ||
   1133 		    !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) &&
   1134 		    zio->io_logical == zio) {
   1135 			/*
   1136 			 * For root I/O requests, tell the SPA to log the error
   1137 			 * appropriately.  Also, generate a logical data
   1138 			 * ereport.
   1139 			 */
   1140 			spa_log_error(spa, zio);
   1141 
   1142 			zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio,
   1143 			    0, 0);
   1144 		}
   1145 
   1146 		/*
   1147 		 * If we are an allocating I/O then we attempt to reissue
   1148 		 * the I/O on another vdev unless the pool is out of space.
   1149 		 * We handle this condition based on the spa's failmode
   1150 		 * property.
   1151 		 */
   1152 		if (zio_write_retry && zio->io_error != ENOSPC &&
   1153 		    IO_IS_ALLOCATING(zio))
   1154 			return (zio_vdev_retry_io(zio));
   1155 
   1156 		ASSERT(!(zio->io_flags & ZIO_FLAG_WRITE_RETRY));
   1157 
   1158 		/*
   1159 		 * For I/O requests that cannot fail, we carry out
   1160 		 * the requested behavior based on the failmode pool
   1161 		 * property.
   1162 		 *
   1163 		 * XXX - Need to differentiate between an ENOSPC as
   1164 		 * a result of vdev failures vs. a full pool.
   1165 		 */
   1166 		if (!(zio->io_flags & ZIO_FLAG_CANFAIL)) {
   1167 			char *blkbuf;
   1168 
        			/*
        			 * NOTE(review): blkbuf is only assigned and used under
        			 * ZFS_DEBUG, and no matching kmem_free() appears in this
        			 * function -- confirm whether the buffer is leaked.
        			 */
   1169 #ifdef ZFS_DEBUG
   1170 			blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_NOSLEEP);
   1171 			if (blkbuf) {
   1172 				sprintf_blkptr(blkbuf, BP_SPRINTF_LEN,
   1173 				    bp ? bp : &zio->io_bp_copy);
   1174 			}
   1175 			cmn_err(CE_WARN, "ZFS: %s (%s on %s off %llx: zio %p "
   1176 			    "%s): error %d", zio->io_error == ECKSUM ?
   1177 			    "bad checksum" : "I/O failure",
   1178 			    zio_type_name[zio->io_type],
   1179 			    vdev_description(vd),
   1180 			    (u_longlong_t)zio->io_offset,
   1181 			    (void *)zio, blkbuf ? blkbuf : "", zio->io_error);
   1182 #endif
   1183 
   1184 			if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) {
   1185 				fm_panic("Pool '%s' has encountered an "
   1186 				    "uncorrectable I/O failure and the "
   1187 				    "failure mode property for this pool "
   1188 				    "is set to panic.", spa_name(spa));
   1189 			}
   1190 			cmn_err(CE_WARN, "Pool '%s' has encountered "
   1191 			    "an uncorrectable I/O error. "
   1192 			    "Manual intervention is required.", spa_name(spa));
   1193 			return (zio_vdev_suspend_io(zio));
   1194 		}
   1195 	}
   1196 	ASSERT(!(zio->io_flags & ZIO_FLAG_WRITE_RETRY));
   1197 	ASSERT(zio->io_children_notready == 0);
   1198 
   1199 	return (ZIO_PIPELINE_CONTINUE);
   1200 }
   1201 
        /*
         * Final pipeline stage: tear the I/O down.
         *
         * Clears the transform stack, runs the io_done callback, unlinks the zio
         * from its parent's child list and notifies the parent, releases the
         * config hold if one was grabbed, then either wakes the waiter or frees
         * the zio.  Returns ZIO_PIPELINE_STOP.
         */
   1202 static int
   1203 zio_done(zio_t *zio)
   1204 {
   1205 	zio_t *pio = zio->io_parent;
   1206 	spa_t *spa = zio->io_spa;
   1207 
   1208 	ASSERT(zio->io_children_notready == 0);
   1209 	ASSERT(zio->io_children_notdone == 0);
   1210 
   1211 	zio_clear_transform_stack(zio);
   1212 
   1213 	if (zio->io_done)
   1214 		zio->io_done(zio);
   1215 
   1216 	ASSERT(zio->io_delegate_list == NULL);
   1217 	ASSERT(zio->io_delegate_next == NULL);
   1218 
   1219 	if (pio != NULL) {
   1220 		zio_t *next, *prev;
   1221 
        		/* Unlink from the parent's sibling list under its lock. */
   1222 		mutex_enter(&pio->io_lock);
   1223 		next = zio->io_sibling_next;
   1224 		prev = zio->io_sibling_prev;
   1225 		if (next != NULL)
   1226 			next->io_sibling_prev = prev;
   1227 		if (prev != NULL)
   1228 			prev->io_sibling_next = next;
   1229 		if (pio->io_child == zio)
   1230 			pio->io_child = next;
   1231 		mutex_exit(&pio->io_lock);
   1232 
        		/* zio_notify_parent() takes pio->io_lock itself. */
   1233 		zio_notify_parent(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_DONE,
   1234 		    &pio->io_children_notdone);
   1235 	}
   1236 
   1237 	/*
   1238 	 * Note: this I/O is now done, and will shortly be freed, so there is no
   1239 	 * need to clear this (or any other) flag.
   1240 	 */
   1241 	if (zio->io_flags & ZIO_FLAG_CONFIG_GRABBED)
   1242 		spa_config_exit(spa, zio);
   1243 
        	/*
        	 * With a waiter: set io_stalled to the DONE stage and broadcast
        	 * io_cv; the zio is not freed here.  Without one: destroy the
        	 * lock and cv and return the zio to zio_cache.
        	 */
   1244 	if (zio->io_waiter != NULL) {
   1245 		mutex_enter(&zio->io_lock);
   1246 		ASSERT(zio->io_stage == ZIO_STAGE_DONE);
   1247 		zio->io_stalled = zio->io_stage;
   1248 		cv_broadcast(&zio->io_cv);
   1249 		mutex_exit(&zio->io_lock);
   1250 	} else {
   1251 		mutex_destroy(&zio->io_lock);
   1252 		cv_destroy(&zio->io_cv);
   1253 		kmem_cache_free(zio_cache, zio);
   1254 	}
   1255 
   1256 	return (ZIO_PIPELINE_STOP);
   1257 }
   1258 
   1259 /*
   1260  * ==========================================================================
   1261  * Compression support
   1262  * ==========================================================================
   1263  */
        /*
         * Pipeline stage: optionally compress the data of a write and decide
         * whether the block is rewritten in place or (re)allocated.
         *
         * May push the compressed buffer as a transform, updates the bp's
         * LSIZE/PSIZE/COMPRESS fields, and may replace io_pipeline.
         * Always returns ZIO_PIPELINE_CONTINUE.
         */
   1264 static int
   1265 zio_write_compress(zio_t *zio)
   1266 {
   1267 	int compress = zio->io_compress;
   1268 	blkptr_t *bp = zio->io_bp;
   1269 	void *cbuf;
   1270 	uint64_t lsize = zio->io_size;
   1271 	uint64_t csize = lsize;
   1272 	uint64_t cbufsize = 0;
   1273 	int pass;
   1274 
   1275 	if (bp->blk_birth == zio->io_txg) {
   1276 		/*
   1277 		 * We're rewriting an existing block, which means we're
   1278 		 * working on behalf of spa_sync().  For spa_sync() to
   1279 		 * converge, it must eventually be the case that we don't
   1280 		 * have to allocate new blocks.  But compression changes
   1281 		 * the blocksize, which forces a reallocate, and makes
   1282 		 * convergence take longer.  Therefore, after the first
   1283 		 * few passes, stop compressing to ensure convergence.
   1284 		 */
   1285 		pass = spa_sync_pass(zio->io_spa);
   1286 		if (pass > zio_sync_pass.zp_dontcompress)
   1287 			compress = ZIO_COMPRESS_OFF;
   1288 	} else {
   1289 		ASSERT(BP_IS_HOLE(bp));
   1290 		pass = 1;
   1291 	}
   1292 
        	/* A zero return from zio_compress_data() turns compression off. */
   1293 	if (compress != ZIO_COMPRESS_OFF)
   1294 		if (!zio_compress_data(compress, zio->io_data, zio->io_size,
   1295 		    &cbuf, &csize, &cbufsize))
   1296 			compress = ZIO_COMPRESS_OFF;
   1297 
        	/* csize == 0 is handled below by zeroing the bp; nothing is pushed. */
   1298 	if (compress != ZIO_COMPRESS_OFF && csize != 0)
   1299 		zio_push_transform(zio, cbuf, csize, cbufsize);
   1300 
   1301 	/*
   1302 	 * The final pass of spa_sync() must be all rewrites, but the first
   1303 	 * few passes offer a trade-off: allocating blocks defers convergence,
   1304 	 * but newly allocated blocks are sequential, so they can be written
   1305 	 * to disk faster.  Therefore, we allow the first few passes of
   1306 	 * spa_sync() to reallocate new blocks, but force rewrites after that.
   1307 	 * There should only be a handful of blocks after pass 1 in any case.
   1308 	 */
   1309 	if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize &&
   1310 	    pass > zio_sync_pass.zp_rewrite) {
        		/* Same physical size on a late pass: rewrite in place. */
   1311 		ASSERT(csize != 0);
   1312 		BP_SET_LSIZE(bp, lsize);
   1313 		BP_SET_COMPRESS(bp, compress);
   1314 		zio->io_pipeline = ZIO_REWRITE_PIPELINE(bp);
   1315 	} else {
   1316 		if (bp->blk_birth == zio->io_txg)
   1317 			BP_ZERO(bp);
   1318 		if (csize == 0) {
        			/* Zero-size result: leave a zeroed bp, no write issued. */
   1319 			BP_ZERO(bp);
   1320 			zio->io_pipeline = ZIO_WAIT_FOR_CHILDREN_PIPELINE;
   1321 		} else {
   1322 			ASSERT3U(BP_GET_NDVAS(bp), ==, 0);
   1323 			BP_SET_LSIZE(bp, lsize);
   1324 			BP_SET_PSIZE(bp, csize);
   1325 			BP_SET_COMPRESS(bp, compress);
   1326 		}
   1327 	}
   1328 
   1329 	return (ZIO_PIPELINE_CONTINUE);
   1330 }
   1331 
        /*
         * Pipeline stage: pop the buffer from the transform stack, decompress
         * it into io_data (io_size bytes), and free the popped buffer.
         * Sets io_error to EIO if zio_decompress_data() returns non-zero.
         * Always returns ZIO_PIPELINE_CONTINUE.
         */
   1332 static int
   1333 zio_read_decompress(zio_t *zio)
   1334 {
   1335 	blkptr_t *bp = zio->io_bp;
   1336 	void *data;
   1337 	uint64_t size;
   1338 	uint64_t bufsize;
   1339 	int compress = BP_GET_COMPRESS(bp);
   1340 
   1341 	ASSERT(compress != ZIO_COMPRESS_OFF);
   1342 
   1343 	zio_pop_transform(zio, &data, &size, &bufsize);
   1344 
   1345 	if (zio_decompress_data(compress, data, size,
   1346 	    zio->io_data, zio->io_size))
   1347 		zio->io_error = EIO;
   1348 
   1349 	zio_buf_free(data, bufsize);
   1350 
   1351 	return (ZIO_PIPELINE_CONTINUE);
   1352 }
   1353 
   1354 /*
   1355  * ==========================================================================
   1356  * Gang block support
   1357  * ==========================================================================
   1358  */
        /*
         * Byte-swap the gang header currently in io_data, as an array of
         * 64-bit words, when the bp indicates that a byteswap is needed.
         * Expects io_size to be SPA_GANGBLOCKSIZE.
         */
   1359 static void
   1360 zio_gang_byteswap(zio_t *zio)
   1361 {
   1362 	ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
   1363 
   1364 	if (BP_SHOULD_BYTESWAP(zio->io_bp))
   1365 		byteswap_uint64_array(zio->io_data, zio->io_size);
   1366 }
   1367 
        /*
         * Pipeline stage: read the gang header for a gang bp.
         *
         * Allocates a SPA_GANGBLOCKSIZE buffer, pushes it as a transform, and
         * issues a child read of the bp into that buffer using
         * ZIO_READ_GANG_PIPELINE.  Returns the result of
         * zio_wait_for_children_done().
         */
   1368 static int
   1369 zio_get_gang_header(zio_t *zio)
   1370 {
   1371 	blkptr_t *bp = zio->io_bp;
   1372 	uint64_t gsize = SPA_GANGBLOCKSIZE;
   1373 	void *gbuf = zio_buf_alloc(gsize);
   1374 
   1375 	ASSERT(BP_IS_GANG(bp));
   1376 
   1377 	zio_push_transform(zio, gbuf, gsize, gsize);
   1378 
        	/* The child inherits only the flags in ZIO_FLAG_GANG_INHERIT. */
   1379 	zio_nowait(zio_create(zio, zio->io_spa, bp->blk_birth, bp, gbuf, gsize,
   1380 	    NULL, NULL, ZIO_TYPE_READ, zio->io_priority,
   1381 	    zio->io_flags & ZIO_FLAG_GANG_INHERIT,
   1382 	    ZIO_STAGE_OPEN, ZIO_READ_GANG_PIPELINE));
   1383 
   1384 	return (zio_wait_for_children_done(zio));
   1385 }
   1386 
        /*
         * Pipeline stage: issue child reads for each member of a gang block.
         *
         * Byte-swaps the header if needed, pops it off the transform stack, and
         * walks its block pointers, reading each member into consecutive
         * regions of io_data until io_size bytes are covered.  The header
         * buffer is freed before returning the result of
         * zio_wait_for_children_done().
         */
   1387 static int
   1388 zio_read_gang_members(zio_t *zio)
   1389 {
   1390 	zio_gbh_phys_t *gbh;
   1391 	uint64_t gsize, gbufsize, loff, lsize;
   1392 	int i;
   1393 
   1394 	ASSERT(BP_IS_GANG(zio->io_bp));
   1395 
        	/* Swap while the header is still io_data, then pop it. */
   1396 	zio_gang_byteswap(zio);
   1397 	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
   1398 
        	/* loff advances by each member's physical size. */
   1399 	for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) {
   1400 		blkptr_t *gbp = &gbh->zg_blkptr[i];
   1401 		lsize = BP_GET_PSIZE(gbp);
   1402 
   1403 		ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF);
   1404 		ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp));
   1405 		ASSERT3U(loff + lsize, <=, zio->io_size);
   1406 		ASSERT(i < SPA_GBH_NBLKPTRS);
   1407 		ASSERT(!BP_IS_HOLE(gbp));
   1408 
   1409 		zio_nowait(zio_read(zio, zio->io_spa, gbp,
   1410 		    (char *)zio->io_data + loff, lsize,
   1411 		    NULL, NULL, zio->io_priority,
   1412 		    zio->io_flags & ZIO_FLAG_GANG_INHERIT, &zio->io_bookmark));
   1413 	}
   1414 
   1415 	zio_buf_free(gbh, gbufsize);
   1416 
   1417 	return (zio_wait_for_children_done(zio));
   1418 }
   1419 
        /*
         * Pipeline stage: issue child rewrites for each member of a gang block.
         *
         * Byte-swaps and pops the gang header, issues a zio_rewrite() for every
         * member against consecutive regions of the data exposed after the pop,
         * then pushes the header back onto the transform stack.  Returns the
         * result of zio_wait_for_children_ready().
         */
   1420 static int
   1421 zio_rewrite_gang_members(zio_t *zio)
   1422 {
   1423 	zio_gbh_phys_t *gbh;
   1424 	uint64_t gsize, gbufsize, loff, lsize;
   1425 	int i;
   1426 
   1427 	ASSERT(BP_IS_GANG(zio->io_bp));
   1428 	ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);
   1429 
   1430 	zio_gang_byteswap(zio);
   1431 	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
   1432 
   1433 	ASSERT(gsize == gbufsize);
   1434 
   1435 	for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) {
   1436 		blkptr_t *gbp = &gbh->zg_blkptr[i];
   1437 		lsize = BP_GET_PSIZE(gbp);
   1438 
   1439 		ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF);
   1440 		ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp));
   1441 		ASSERT3U(loff + lsize, <=, zio->io_size);
   1442 		ASSERT(i < SPA_GBH_NBLKPTRS);
   1443 		ASSERT(!BP_IS_HOLE(gbp));
   1444 
   1445 		zio_nowait(zio_rewrite(zio, zio->io_spa, zio->io_checksum,
   1446 		    zio->io_txg, gbp, (char *)zio->io_data + loff, lsize,
   1447 		    NULL, NULL, zio->io_priority,
   1448 		    zio->io_flags & ZIO_FLAG_GANG_INHERIT, &zio->io_bookmark));
   1449 	}
   1450 
        	/* Unlike the read path, the header is kept, not freed. */
   1451 	zio_push_transform(zio, gbh, gsize, gbufsize);
   1452 
   1453 	return (zio_wait_for_children_ready(zio));
   1454 }
   1455 
        /*
         * Pipeline stage: issue a child zio_free() for every non-hole block
         * pointer in the gang header, then free the header buffer.
         * Always returns ZIO_PIPELINE_CONTINUE.
         */
   1456 static int
   1457 zio_free_gang_members(zio_t *zio)
   1458 {
   1459 	zio_gbh_phys_t *gbh;
   1460 	uint64_t gsize, gbufsize;
   1461 	int i;
   1462 
   1463 	ASSERT(BP_IS_GANG(zio->io_bp));
   1464 
   1465 	zio_gang_byteswap(zio);
   1466 	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
   1467 
        	/* All SPA_GBH_NBLKPTRS slots are scanned; holes are skipped. */
   1468 	for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
   1469 		blkptr_t *gbp = &gbh->zg_blkptr[i];
   1470 
   1471 		if (BP_IS_HOLE(gbp))
   1472 			continue;
   1473 		zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg,
   1474 		    gbp, NULL, NULL));
   1475 	}
   1476 
   1477 	zio_buf_free(gbh, gbufsize);
   1478 
   1479 	return (ZIO_PIPELINE_CONTINUE);
   1480 }
   1481 
        /*
         * Pipeline stage: issue a child zio_claim() for every non-hole block
         * pointer in the gang header, then free the header buffer.
         * Always returns ZIO_PIPELINE_CONTINUE.
         */
   1482 static int
   1483 zio_claim_gang_members(zio_t *zio)
   1484 {
   1485 	zio_gbh_phys_t *gbh;
   1486 	uint64_t gsize, gbufsize;
   1487 	int i;
   1488 
   1489 	ASSERT(BP_IS_GANG(zio->io_bp));
   1490 
   1491 	zio_gang_byteswap(zio);
   1492 	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
   1493 
        	/* All SPA_GBH_NBLKPTRS slots are scanned; holes are skipped. */
   1494 	for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
   1495 		blkptr_t *gbp = &gbh->zg_blkptr[i];
   1496 		if (BP_IS_HOLE(gbp))
   1497 			continue;
   1498 		zio_nowait(zio_claim(zio, zio->io_spa, zio->io_txg,
   1499 		    gbp, NULL, NULL));
   1500 	}
   1501 
   1502 	zio_buf_free(gbh, gbufsize);
   1503 
   1504 	return (ZIO_PIPELINE_CONTINUE);
   1505 }
   1506 
        /*
         * Callback handed to the gang-member child writes issued by
         * zio_write_allocate_gang_members().
         *
         * Adds the child's per-DVA allocated size (ASIZE) to the corresponding
         * DVA of the parent's bp, holding the parent's io_lock.  The parent
         * pointer is dereferenced unconditionally.
         */
   1507 static void
   1508 zio_write_allocate_gang_member_done(zio_t *zio)
   1509 {
   1510 	zio_t *pio = zio->io_parent;
   1511 	dva_t *cdva = zio->io_bp->blk_dva;
   1512 	dva_t *pdva = pio->io_bp->blk_dva;
   1513 	uint64_t asize;
   1514 	int d;
   1515 
   1516 	ASSERT3U(pio->io_ndvas, ==, zio->io_ndvas);
   1517 	ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
   1518 	ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(zio->io_bp));
   1519 	ASSERT3U(pio->io_ndvas, <=, BP_GET_NDVAS(pio->io_bp));
   1520 
   1521 	mutex_enter(&pio->io_lock);
        	/* Iterates over the parent's DVA count, indexing both DVA arrays. */
   1522 	for (d = 0; d < BP_GET_NDVAS(pio->io_bp); d++) {
   1523 		ASSERT(DVA_GET_GANG(&pdva[d]));
   1524 		asize = DVA_GET_ASIZE(&pdva[d]);
   1525 		asize += DVA_GET_ASIZE(&cdva[d]);
   1526 		DVA_SET_ASIZE(&pdva[d], asize);
   1527 	}
   1528 	mutex_exit(&pio->io_lock);
   1529 }
   1530 
        /*
         * Write 'zio' as a gang block: allocate a gang header, split the data
         * into member blocks, and issue a child write for each member.
         *
         * zio - the write being split; io_size bytes of io_data are covered.
         * mc  - metaslab class passed to metaslab_alloc().
         *
         * On allocation failure io_error is set and ZIO_PIPELINE_CONTINUE is
         * returned.  Otherwise the GANG_CHECKSUM_GENERATE stage is enabled, the
         * header is pushed as a transform, and the result of
         * zio_wait_for_children_done() is returned.
         */
   1531 static int
   1532 zio_write_allocate_gang_members(zio_t *zio, metaslab_class_t *mc)
   1533 {
   1534 	blkptr_t *bp = zio->io_bp;
   1535 	dva_t *dva = bp->blk_dva;
   1536 	spa_t *spa = zio->io_spa;
   1537 	zio_gbh_phys_t *gbh;
   1538 	uint64_t txg = zio->io_txg;
   1539 	uint64_t resid = zio->io_size;
   1540 	uint64_t maxalloc = P2ROUNDUP(zio->io_size >> 1, SPA_MINBLOCKSIZE);
   1541 	uint64_t gsize, loff, lsize;
   1542 	uint32_t gbps_left;
   1543 	int ndvas = zio->io_ndvas;
        	/* The header gets one more DVA than the data, capped by the pool. */
   1544 	int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa));
   1545 	int error;
   1546 	int i, d;
   1547 
   1548 	gsize = SPA_GANGBLOCKSIZE;
   1549 	gbps_left = SPA_GBH_NBLKPTRS;
   1550 
        	/* Allocate space for the gang header itself. */
   1551 	error = metaslab_alloc(spa, mc, gsize, bp, gbh_ndvas, txg, NULL,
   1552 	    B_FALSE);
   1553 	if (error) {
   1554 		zio->io_error = error;
   1555 		return (ZIO_PIPELINE_CONTINUE);
   1556 	}
   1557 
   1558 	for (d = 0; d < gbh_ndvas; d++)
   1559 		DVA_SET_GANG(&dva[d], 1);
   1560 
   1561 	bp->blk_birth = txg;
   1562 
   1563 	gbh = zio_buf_alloc(gsize);
   1564 	bzero(gbh, gsize);
   1565 
        	/*
        	 * One iteration per gang member; resid is the number of bytes not
        	 * yet assigned and gbps_left the number of unused header slots.
        	 */
   1566 	for (loff = 0, i = 0; loff != zio->io_size;
   1567 	    loff += lsize, resid -= lsize, gbps_left--, i++) {
   1568 		blkptr_t *gbp = &gbh->zg_blkptr[i];
   1569 		dva = gbp->blk_dva;
   1570 
   1571 		ASSERT(gbps_left != 0);
   1572 		maxalloc = MIN(maxalloc, resid);
   1573 
        		/*
        		 * Try to allocate maxalloc bytes, halving (rounded up to
        		 * SPA_MINBLOCKSIZE) on ENOSPC, for as long as pieces of that
        		 * size would still cover resid in the remaining slots.
        		 */
   1574 		while (resid <= maxalloc * gbps_left) {
   1575 			error = metaslab_alloc(spa, mc, maxalloc, gbp, ndvas,
   1576 			    txg, bp, B_FALSE);
   1577 			if (error == 0)
   1578 				break;
   1579 			ASSERT3U(error, ==, ENOSPC);
   1580 			/* XXX - free up previous allocations? */
        			/*
        			 * NOTE(review): gbh is neither freed nor pushed as a
        			 * transform on this return path -- confirm whether
        			 * the buffer is leaked.
        			 */
   1581 			if (maxalloc == SPA_MINBLOCKSIZE) {
   1582 				zio->io_error = error;
   1583 				return (ZIO_PIPELINE_CONTINUE);
   1584 			}
   1585 			maxalloc = P2ROUNDUP(maxalloc >> 1, SPA_MINBLOCKSIZE);
   1586 		}
   1587 
        		/*
        		 * If the condition still holds, the loop above exited via a
        		 * successful allocation: write this member with
        		 * zio_rewrite().  Otherwise hand an even share of the
        		 * remainder to zio_write_allocate().
        		 */
   1588 		if (resid <= maxalloc * gbps_left) {
   1589 			lsize = maxalloc;
   1590 			BP_SET_LSIZE(gbp, lsize);
   1591 			BP_SET_PSIZE(gbp, lsize);
   1592 			BP_SET_COMPRESS(gbp, ZIO_COMPRESS_OFF);
   1593 			gbp->blk_birth = txg;
   1594 			zio_nowait(zio_rewrite(zio, spa,
   1595 			    zio->io_checksum, txg, gbp,
   1596 			    (char *)zio->io_data + loff, lsize,
   1597 			    zio_write_allocate_gang_member_done, NULL,
   1598 			    zio->io_priority,
   1599 			    zio->io_flags & ZIO_FLAG_GANG_INHERIT,
   1600 			    &zio->io_bookmark));
   1601 		} else {
   1602 			lsize = P2ROUNDUP(resid / gbps_left, SPA_MINBLOCKSIZE);
   1603 			ASSERT(lsize != SPA_MINBLOCKSIZE);
   1604 			zio_nowait(zio_write_allocate(zio, spa,
   1605 			    zio->io_checksum, txg, gbp,
   1606 			    (char *)zio->io_data + loff, lsize,
   1607 			    zio_write_allocate_gang_member_done, NULL,
   1608 			    zio->io_priority,
   1609 			    zio->io_flags & ZIO_FLAG_GANG_INHERIT));
   1610 		}
   1611 	}
   1612 
   1613 	ASSERT(resid == 0 && loff == zio->io_size);
   1614 
   1615 	zio->io_pipeline |= 1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE;
   1616 
   1617 	zio_push_transform(zio, gbh, gsize, gsize);
   1618 
   1619 	/*
   1620 	 * As much as we'd like this to be 'ready' instead of 'done',
   1621 	 * updating our ASIZE doesn't happen until the io_done callback,
   1622 	 * so we have to wait for that to finish in order for our BP
   1623 	 * to be stable.
   1624 	 */
   1625 	return (zio_wait_for_children_done(zio));
   1626 }
   1627 
   1628 /*
   1629  * ==========================================================================
   1630  * Allocate and free blocks
   1631  * ==========================================================================
   1632  */
        /*
         * Pipeline stage: allocate space for a write from the pool's normal
         * metaslab class.
         *
         * On success blk_birth is set to io_txg.  On ENOSPC for a block larger
         * than SPA_MINBLOCKSIZE the write is handed to
         * zio_write_allocate_gang_members() and its result returned.  Any other
         * failure is stored in io_error.  Otherwise returns
         * ZIO_PIPELINE_CONTINUE.
         */
   1633 static int
   1634 zio_dva_allocate(zio_t *zio)
   1635 {
   1636 	spa_t *spa = zio->io_spa;
   1637 	metaslab_class_t *mc = spa->spa_normal_class;
   1638 	blkptr_t *bp = zio->io_bp;
   1639 	int error;
   1640 
   1641 	ASSERT(BP_IS_HOLE(bp));
   1642 	ASSERT3U(BP_GET_NDVAS(bp), ==, 0);
   1643 	ASSERT3U(zio->io_ndvas, >, 0);
   1644 	ASSERT3U(zio->io_ndvas, <=, spa_max_replication(spa));
   1645 
   1646 	/*
   1647 	 * For testing purposes, we force I/Os to retry. We don't allow
   1648 	 * retries beyond the first pass since those I/Os are non-allocating
   1649 	 * writes.
   1650 	 */
   1651 	if (zio_io_fail_shift &&
   1652 	    spa_sync_pass(zio->io_spa) <= zio_sync_pass.zp_rewrite &&
   1653 	    zio_io_should_fail(zio_io_fail_shift))
   1654 		zio->io_flags |= ZIO_FLAG_WRITE_RETRY;
   1655 
   1656 	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
   1657 
   1658 	error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_ndvas,
   1659 	    zio->io_txg, NULL, B_FALSE);
   1660 
   1661 	if (error == 0) {
   1662 		bp->blk_birth = zio->io_txg;
   1663 	} else if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) {
        		/* No single extent available: fall back to a gang block. */
   1664 		return (zio_write_allocate_gang_members(zio, mc));
   1665 	} else {
   1666 		zio->io_error = error;
   1667 	}
   1668 
   1669 	return (ZIO_PIPELINE_CONTINUE);
   1670 }
   1671 
        /*
         * Pipeline stage: pass the bp to metaslab_free() for io_txg, then zero
         * the bp.  Always returns ZIO_PIPELINE_CONTINUE.
         */
   1672 static int
   1673 zio_dva_free(zio_t *zio)
   1674 {
   1675 	blkptr_t *bp = zio->io_bp;
   1676 
   1677 	metaslab_free(zio->io_spa, bp, zio->io_txg, B_FALSE);
   1678 
   1679 	BP_ZERO(bp);
   1680 
   1681 	return (ZIO_PIPELINE_CONTINUE);
   1682 }
   1683 
        /*
         * Pipeline stage: call metaslab_claim() on the bp for io_txg and store
         * its return value in io_error.  Always returns ZIO_PIPELINE_CONTINUE.
         */
   1684 static int
   1685 zio_dva_claim(zio_t *zio)
   1686 {
   1687 	zio->io_error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
   1688 
   1689 	return (ZIO_PIPELINE_CONTINUE);
   1690 }
   1691 
   1692 /*
   1693  * ==========================================================================
   1694  * Read and write to physical devices
   1695  * ==========================================================================
   1696  */
   1697 
        /*
         * Pipeline stage: hand the I/O to the vdev layer.
         *
         * Writes are suspended while the pool is in POOL_STATE_IO_FAILURE.
         * I/Os with no vdev go to the mirror ops.  Otherwise the I/O is
         * flagged, offset-adjusted and padded as needed before the vdev's
         * vdev_op_io_start is called.  Returns whatever the called routine
         * returns.
         */
   1698 static int
   1699 zio_vdev_io_start(zio_t *zio)
   1700 {
   1701 	vdev_t *vd = zio->io_vd;
   1702 	vdev_t *tvd = vd ? vd->vdev_top : NULL;
   1703 	blkptr_t *bp = zio->io_bp;
   1704 	uint64_t align;
   1705 	spa_t *spa = zio->io_spa;
   1706 
   1707 	/*
   1708 	 * If the pool is already in a failure state then just suspend
   1709 	 * this IO until the problem is resolved. We will reissue them
   1710 	 * at that time.
   1711 	 */
   1712 	if (spa_state(spa) == POOL_STATE_IO_FAILURE &&
   1713 	    zio->io_type == ZIO_TYPE_WRITE)
   1714 		return (zio_vdev_suspend_io(zio));
   1715 
   1716 	/*
   1717 	 * The mirror_ops handle multiple DVAs in a single BP
   1718 	 */
   1719 	if (vd == NULL)
   1720 		return (vdev_mirror_ops.vdev_op_io_start(zio));
   1721 
        	/* Alignment comes from the top-level vdev's ashift. */
   1722 	align = 1ULL << tvd->vdev_ashift;
   1723 
        	/* First attempt on a top-level vdev: request fail-fast. */
   1724 	if (zio->io_retries == 0 && vd == tvd)
   1725 		zio->io_flags |= ZIO_FLAG_FAILFAST;
   1726 
        	/*
        	 * On a vdev with no children, shift the offset past
        	 * VDEV_LABEL_START_SIZE once; ZIO_FLAG_PHYSICAL records that the
        	 * adjustment has been made.
        	 */
   1727 	if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && vd->vdev_children == 0) {
   1728 		zio->io_flags |= ZIO_FLAG_PHYSICAL;
   1729 		zio->io_offset += VDEV_LABEL_START_SIZE;
   1730 	}
   1731 
        	/*
        	 * Size not a multiple of the alignment: push a rounded-up bounce
        	 * buffer.  For writes the data is copied in and the tail zeroed.
        	 * zio_vdev_io_assess() undoes this when ZIO_FLAG_SUBBLOCK is set.
        	 */
   1732 	if (P2PHASE(zio->io_size, align) != 0) {
   1733 		uint64_t asize = P2ROUNDUP(zio->io_size, align);
   1734 		char *abuf = zio_buf_alloc(asize);
   1735 		ASSERT(vd == tvd);
   1736 		if (zio->io_type == ZIO_TYPE_WRITE) {
   1737 			bcopy(zio->io_data, abuf, zio->io_size);
   1738 			bzero(abuf + zio->io_size, asize - zio->io_size);
   1739 		}
   1740 		zio_push_transform(zio, abuf, asize, asize);
   1741 		ASSERT(!(zio->io_flags & ZIO_FLAG_SUBBLOCK));
   1742 		zio->io_flags |= ZIO_FLAG_SUBBLOCK;
   1743 	}
   1744 
   1745 	ASSERT(P2PHASE(zio->io_offset, align) == 0);
   1746 	ASSERT(P2PHASE(zio->io_size, align) == 0);
   1747 	ASSERT(bp == NULL ||
   1748 	    P2ROUNDUP(ZIO_GET_IOSIZE(zio), align) == zio->io_size);
   1749 	ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE));
   1750 
   1751 	return (vd->vdev_ops->vdev_op_io_start(zio));
   1752 }
   1753 
        /*
         * Pipeline stage: dispatch to the vdev_op_io_done routine of the zio's
         * vdev, or of the mirror ops when the zio has no vdev, and return its
         * result.
         */
   1754 static int
   1755 zio_vdev_io_done(zio_t *zio)
   1756 {
   1757 	if (zio->io_vd == NULL)
   1758 		return (vdev_mirror_ops.vdev_op_io_done(zio));
   1759 
   1760 	return (zio->io_vd->vdev_ops->vdev_op_io_done(zio));
   1761 }
   1762 
        /*
         * Decide whether a vdev I/O should be retried.
         *
         * Returns B_TRUE only when the I/O has an error, has no delegate list,
         * targets either no vdev or a top-level vdev, does not carry
         * ZIO_FLAG_DONT_RETRY, and has not been retried before.
         */
   1763 /* XXPOLICY */
   1764 boolean_t
   1765 zio_should_retry(zio_t *zio)
   1766 {
   1767 	vdev_t *vd = zio->io_vd;
   1768 
   1769 	if (zio->io_error == 0)
   1770 		return (B_FALSE);
   1771 	if (zio->io_delegate_list != NULL)
   1772 		return (B_FALSE);
        	/* I/Os aimed at a non-top-level vdev are not retried. */
   1773 	if (vd && vd != vd->vdev_top)
   1774 		return (B_FALSE);
   1775 	if (zio->io_flags & ZIO_FLAG_DONT_RETRY)
   1776 		return (B_FALSE);
        	/* At most one retry per I/O. */
   1777 	if (zio->io_retries > 0)
   1778 		return (B_FALSE);
   1779 
   1780 	return (B_TRUE);
   1781 }
   1782 
        /*
         * Pipeline stage: post-process a completed vdev I/O.
         *
         * Removes the sub-block bounce buffer pushed by zio_vdev_io_start()
         * (copying data back for reads), applies fault injection when enabled,
         * and rewinds the stage for one retry when zio_should_retry() says so.
         * Returns ZIO_PIPELINE_CONTINUE on every path.
         */
   1783 static int
   1784 zio_vdev_io_assess(zio_t *zio)
   1785 {
   1786 	vdev_t *vd = zio->io_vd;
   1787 	vdev_t *tvd = vd ? vd->vdev_top : NULL;
   1788 
   1789 	ASSERT(zio->io_vsd == NULL);
   1790 
   1791 	if (zio->io_flags & ZIO_FLAG_SUBBLOCK) {
   1792 		void *abuf;
   1793 		uint64_t asize;
   1794 		ASSERT(vd == tvd);
        		/*
        		 * asize is passed for both size outputs; the buffer was
        		 * pushed with identical size and bufsize.
        		 */
   1795 		zio_pop_transform(zio, &abuf, &asize, &asize);
   1796 		if (zio->io_type == ZIO_TYPE_READ)
   1797 			bcopy(abuf, zio->io_data, zio->io_size);
   1798 		zio_buf_free(abuf, asize);
   1799 		zio->io_flags &= ~ZIO_FLAG_SUBBLOCK;
   1800 	}
   1801 
        	/* Injection only applies to I/Os that have not already failed. */
   1802 	if (zio_injection_enabled && !zio->io_error)
   1803 		zio->io_error = zio_handle_fault_injection(zio, EIO);
   1804 
   1805 	/*
   1806 	 * If the I/O failed, determine whether we should attempt to retry it.
   1807 	 */
   1808 	/* XXPOLICY */
   1809 	if (zio_should_retry(zio)) {
   1810 		ASSERT(tvd == vd);
   1811 
   1812 		zio->io_retries++;
   1813 		zio->io_error = 0;
        		/* Keep only the flags in ZIO_FLAG_RETRY_INHERIT. */
   1814 		zio->io_flags &= ZIO_FLAG_RETRY_INHERIT;
   1815 		/* XXPOLICY */
   1816 		zio->io_flags &= ~ZIO_FLAG_FAILFAST;
   1817 		zio->io_flags |= ZIO_FLAG_DONT_CACHE;
        		/*
        		 * Presumably the pipeline driver advances io_stage before
        		 * dispatching, so this re-runs VDEV_IO_START -- confirm
        		 * against zio_execute().
        		 */
   1818 		zio->io_stage = ZIO_STAGE_VDEV_IO_START - 1;
   1819 
   1820 		return (ZIO_PIPELINE_CONTINUE);
   1821 	}
   1822 
   1823 	return (ZIO_PIPELINE_CONTINUE);
   1824 }
   1825 
        /*
         * Step io_stage back by one from ZIO_STAGE_VDEV_IO_START.  The zio must
         * be in that stage with no error.  Presumably this makes the pipeline
         * run the VDEV_IO_START stage again -- confirm against the pipeline
         * driver.
         */
   1826 void
   1827 zio_vdev_io_reissue(zio_t *zio)
   1828 {
   1829 	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
   1830 	ASSERT(zio->io_error == 0);
   1831 
   1832 	zio->io_stage--;
   1833 }
   1834 
        /*
         * Step io_stage back by one from ZIO_STAGE_VDEV_IO_DONE.  The zio must
         * be in that stage.  Presumably this makes the pipeline run the
         * VDEV_IO_DONE stage again -- confirm against the pipeline driver.
         */
   1835 void
   1836 zio_vdev_io_redone(zio_t *zio)
   1837 {
   1838 	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);
   1839 
   1840 	zio->io_stage--;
   1841 }
   1842 
        /*
         * Mark the I/O with ZIO_FLAG_IO_BYPASS and set io_stage to just before
         * ZIO_STAGE_VDEV_IO_ASSESS.  The zio must be in VDEV_IO_START with no
         * error.  Presumably the next stage executed is VDEV_IO_ASSESS --
         * confirm against the pipeline driver.
         */
   1843 void
   1844 zio_vdev_io_bypass(zio_t *zio)
   1845 {
   1846 	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
   1847 	ASSERT(zio->io_error == 0);
   1848 
   1849 	zio->io_flags |= ZIO_FLAG_IO_BYPASS;
   1850 	zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS - 1;
   1851 }
   1852 
   1853 /*
   1854  * ==========================================================================
   1855  * Generate and verify checksums
   1856  * ==========================================================================
   1857  */
        /*
         * Pipeline stage: record the checksum type and host byte order in the
         * bp, then checksum io_data (io_size bytes) into bp->blk_cksum.
         * Always returns ZIO_PIPELINE_CONTINUE.
         */
   1858 static int
   1859 zio_checksum_generate(zio_t *zio)
   1860 {
   1861 	int checksum = zio->io_checksum;
   1862 	blkptr_t *bp = zio->io_bp;
   1863 
   1864 	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
   1865 
   1866 	BP_SET_CHECKSUM(bp, checksum);
   1867 	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
   1868 
   1869 	zio_checksum(checksum, &bp->blk_cksum, zio->io_data, zio->io_size);
   1870 
   1871 	return (ZIO_PIPELINE_CONTINUE);
   1872 }
   1873 
        /*
         * Pipeline stage: checksum a gang header.
         *
         * Writes the gang verifier into the header's tail checksum field and
         * then runs the ZIO_CHECKSUM_GANG_HEADER checksum over io_data.
         * Always returns ZIO_PIPELINE_CONTINUE.
         */
   1874 static int
   1875 zio_gang_checksum_generate(zio_t *zio)
   1876 {
   1877 	zio_cksum_t zc;
   1878 	zio_gbh_phys_t *gbh = zio->io_data;
   1879 
   1880 	ASSERT(BP_IS_GANG(zio->io_bp));
   1881 	ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);
   1882 
   1883 	zio_set_gang_verifier(zio, &gbh->zg_tail.zbt_cksum);
   1884 
        	/*
        	 * The local 'zc' is not read again in this function.  Presumably
        	 * the gang-header checksum is stored in the block's own tail --
        	 * confirm against zio_checksum().
        	 */
   1885 	zio_checksum(ZIO_CHECKSUM_GANG_HEADER, &zc, zio->io_data, zio->io_size);
   1886 
   1887 	return (ZIO_PIPELINE_CONTINUE);
   1888 }
   1889 
        /*
         * Pipeline stage: for I/Os with a bp, store the result of
         * zio_checksum_error() in io_error and post a checksum ereport on
         * failure unless the I/O is speculative.
         * Always returns ZIO_PIPELINE_CONTINUE.
         */
   1890 static int
   1891 zio_checksum_verify(zio_t *zio)
   1892 {
   1893 	if (zio->io_bp != NULL) {
        		/* This overwrites any previous io_error value. */
   1894 		zio->io_error = zio_checksum_error(zio);
   1895 		if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE))
   1896 			zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
   1897 			    zio->io_spa, zio->io_vd, zio, 0, 0);
   1898 	}
   1899 
   1900 	return (ZIO_PIPELINE_CONTINUE);
   1901 }
   1902 
   1903 /*
   1904  * Called by RAID-Z to ensure we don't compute the checksum twice.
   1905  */
   1906 void
   1907 zio_checksum_verified(zio_t *zio)
   1908 {
   1909 	zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
   1910 }
   1911 
   1912 /*
   1913  * Set the external verifier for a gang block based on stuff in the bp
   1914  */
   1915 void
   1916 zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp)
   1917 {
   1918 	blkptr_t *bp = zio->io_bp;
   1919 
   1920 	zcp->zc_word[0] = DVA_GET_VDEV(BP_IDENTITY(bp));
   1921 	zcp->zc_word[1] = DVA_GET_OFFSET(BP_IDENTITY(bp));
   1922 	zcp->zc_word[2] = bp->blk_birth;
   1923 	zcp->zc_word[3] = 0;
   1924 }
   1925 
   1926 /*
   1927  * ==========================================================================
   1928  * Define the pipeline
   1929  * ==========================================================================
   1930  */
/*
 * A pipeline stage takes the zio being processed and returns an int;
 * zio_execute() expects either ZIO_PIPELINE_CONTINUE or ZIO_PIPELINE_STOP.
 */
typedef int zio_pipe_stage_t(zio_t *zio);

/*
 * Stage dispatch table.  zio_execute() indexes it directly with
 * zio->io_stage, so the order of the entries must match the numeric order
 * of the stage constants.
 *
 * NOTE(review): the ZIO_STAGE_* values are defined outside this file; the
 * table holds 25 entries, which fills ZIO_STAGE_DONE + 2 slots exactly only
 * if zio_done sits at index ZIO_STAGE_DONE -- confirm against the stage
 * definitions whenever a stage is added or removed.
 */
zio_pipe_stage_t *zio_pipeline[ZIO_STAGE_DONE + 2] = {
	NULL,	/* slot 0: zio_execute() pre-increments, never dispatched */
	zio_wait_for_children_ready,
	zio_read_init,
	zio_issue_async,
	zio_write_compress,
	zio_checksum_generate,
	zio_get_gang_header,
	zio_rewrite_gang_members,
	zio_free_gang_members,
	zio_claim_gang_members,
	zio_dva_allocate,
	zio_dva_free,
	zio_dva_claim,
	zio_gang_checksum_generate,
	zio_ready,
	zio_vdev_io_start,
	zio_vdev_io_done,
	zio_vdev_io_assess,
	zio_wait_for_children_done,
	zio_checksum_verify,
	zio_read_gang_members,
	zio_read_decompress,
	zio_assess,
	zio_done,
	NULL	/* final slot: no stage function */
};
   1960 
   1961 /*
   1962  * Execute the I/O pipeline until one of the following occurs:
   1963  * (1) the I/O completes; (2) the pipeline stalls waiting for
   1964  * dependent child I/Os; (3) the I/O issues, so we're waiting
   1965  * for an I/O completion interrupt; (4) the I/O is delegated by
   1966  * vdev-level caching or aggregation; (5) the I/O is deferred
   1967  * due to vdev-level queueing; (6) the I/O is handed off to
   1968  * another thread.  In all cases, the pipeline stops whenever
   1969  * there's no CPU work; it never burns a thread in cv_wait().
   1970  *
   1971  * There's no locking on io_stage because there's no legitimate way
   1972  * for multiple threads to be attempting to process the same I/O.
   1973  */
   1974 void
   1975 zio_execute(zio_t *zio)
   1976 {
   1977 	while (zio->io_stage < ZIO_STAGE_DONE) {
   1978 		uint32_t pipeline = zio->io_pipeline;
   1979 		int rv;
   1980 
   1981 		ASSERT(!MUTEX_HELD(&zio->io_lock));
   1982 
   1983 		/*
   1984 		 * If an error occurred outside the vdev stack,
   1985 		 * just execute the interlock stages to clean up.
   1986 		 */
   1987 		if (zio->io_error &&
   1988 		    ((1U << zio->io_stage) & ZIO_VDEV_IO_STAGES) == 0)
   1989 			pipeline &= ZIO_ERROR_PIPELINE_MASK;
   1990 
   1991 		while (((1U << ++zio->io_stage) & pipeline) == 0)
   1992 			continue;
   1993 
   1994 		ASSERT(zio->io_stage <= ZIO_STAGE_DONE);
   1995 		ASSERT(zio->io_stalled == 0);
   1996 
   1997 		rv = zio_pipeline[zio->io_stage](zio);
   1998 
   1999 		if (rv == ZIO_PIPELINE_STOP)
   2000 			return;
   2001 
   2002 		ASSERT(rv == ZIO_PIPELINE_CONTINUE);
   2003 	}
   2004 }
   2005 
   2006 static boolean_t
   2007 zio_io_should_fail(uint16_t range)
   2008 {
   2009 	static uint16_t	allocs = 0;
   2010 
   2011 	return (P2PHASE(allocs++, 1U<<range) == 0);
   2012 }
   2013 
   2014 /*
   2015  * Try to allocate an intent log block.  Return 0 on success, errno on failure.
   2016  */
   2017 int
   2018 zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, blkptr_t *old_bp,
   2019     uint64_t txg)
   2020 {
   2021 	int error;
   2022 
   2023 	spa_config_enter(spa, RW_READER, FTAG);
   2024 
   2025 	if (zio_zil_fail_shift && zio_io_should_fail(zio_zil_fail_shift)) {
   2026 		spa_config_exit(spa, FTAG);
   2027 		return (ENOSPC);
   2028 	}
   2029 
   2030 	/*
   2031 	 * We were passed the previous log block's DVA in bp->blk_dva[0].
   2032 	 * We use that as a hint for which vdev to allocate from next.
   2033 	 */
   2034 	error = metaslab_alloc(spa, spa->spa_log_class, size,
   2035 	    new_bp, 1, txg, old_bp, B_TRUE);
   2036 
   2037 	if (error)
   2038 		error = metaslab_alloc(spa, spa->spa_normal_class, size,
   2039 		    new_bp, 1, txg, old_bp, B_TRUE);
   2040 
   2041 	if (error == 0) {
   2042 		BP_SET_LSIZE(new_bp, size);
   2043 		BP_SET_PSIZE(new_bp, size);
   2044 		BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
   2045 		BP_SET_CHECKSUM(new_bp, ZIO_CHECKSUM_ZILOG);
   2046 		BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
   2047 		BP_SET_LEVEL(new_bp, 0);
   2048 		BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
   2049 		new_bp->blk_birth = txg;
   2050 	}
   2051 
   2052 	spa_config_exit(spa, FTAG);
   2053 
   2054 	return (error);
   2055 }
   2056 
   2057 /*
   2058  * Free an intent log block.  We know it can't be a gang block, so there's
   2059  * nothing to do except metaslab_free() it.
   2060  */
   2061 void
   2062 zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg)
   2063 {
   2064 	ASSERT(!BP_IS_GANG(bp));
   2065 
   2066 	spa_config_enter(spa, RW_READER, FTAG);
   2067 
   2068 	metaslab_free(spa, bp, txg, B_FALSE);
   2069 
   2070 	spa_config_exit(spa, FTAG);
   2071 }
   2072 
   2073 /*
   2074  * start an async flush of the write cache for this vdev
   2075  */
   2076 void
   2077 zio_flush(zio_t *zio, vdev_t *vd)
   2078 {
   2079 	zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
   2080 	    NULL, NULL, ZIO_PRIORITY_NOW,
   2081 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY));
   2082 }
   2083