Home | History | Annotate | Download | only in zfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #pragma ident	"@(#)zio.c	1.31	07/12/12 SMI"
     27 
     28 #include <sys/zfs_context.h>
     29 #include <sys/fm/fs/zfs.h>
     30 #include <sys/spa.h>
     31 #include <sys/txg.h>
     32 #include <sys/spa_impl.h>
     33 #include <sys/vdev_impl.h>
     34 #include <sys/zio_impl.h>
     35 #include <sys/zio_compress.h>
     36 #include <sys/zio_checksum.h>
     37 
     38 /*
     39  * ==========================================================================
     40  * I/O priority table
     41  * ==========================================================================
     42  */
     43 uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
     44 	0,	/* ZIO_PRIORITY_NOW		*/
     45 	0,	/* ZIO_PRIORITY_SYNC_READ	*/
     46 	0,	/* ZIO_PRIORITY_SYNC_WRITE	*/
     47 	6,	/* ZIO_PRIORITY_ASYNC_READ	*/
     48 	4,	/* ZIO_PRIORITY_ASYNC_WRITE	*/
     49 	4,	/* ZIO_PRIORITY_FREE		*/
     50 	0,	/* ZIO_PRIORITY_CACHE_FILL	*/
     51 	0,	/* ZIO_PRIORITY_LOG_WRITE	*/
     52 	10,	/* ZIO_PRIORITY_RESILVER	*/
     53 	20,	/* ZIO_PRIORITY_SCRUB		*/
     54 };
     55 
     56 /*
     57  * ==========================================================================
     58  * I/O type descriptions
     59  * ==========================================================================
     60  */
     61 char *zio_type_name[ZIO_TYPES] = {
     62 	"null", "read", "write", "free", "claim", "ioctl" };
     63 
     64 /* Force an allocation failure when non-zero */
     65 uint16_t zio_zil_fail_shift = 0;
     66 uint16_t zio_io_fail_shift = 0;
     67 
     68 /* Enable/disable the write-retry logic */
     69 int zio_write_retry = 1;
     70 
     71 /* Taskq to handle reissuing of I/Os */
     72 taskq_t *zio_taskq;
     73 int zio_resume_threads = 4;
     74 
     75 typedef struct zio_sync_pass {
     76 	int	zp_defer_free;		/* defer frees after this pass */
     77 	int	zp_dontcompress;	/* don't compress after this pass */
     78 	int	zp_rewrite;		/* rewrite new bps after this pass */
     79 } zio_sync_pass_t;
     80 
     81 zio_sync_pass_t zio_sync_pass = {
     82 	1,	/* zp_defer_free */
     83 	4,	/* zp_dontcompress */
     84 	1,	/* zp_rewrite */
     85 };
     86 
     87 static boolean_t zio_io_should_fail(uint16_t);
     88 
     89 /*
     90  * ==========================================================================
     91  * I/O kmem caches
     92  * ==========================================================================
     93  */
     94 kmem_cache_t *zio_cache;
     95 kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
     96 kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
     97 
     98 #ifdef _KERNEL
     99 extern vmem_t *zio_alloc_arena;
    100 #endif
    101 
    102 /*
    103  * Determine if we are allowed to issue the IO based on the
    104  * pool state. If we must wait then block until we are told
    105  * that we may continue.
    106  */
    107 #define	ZIO_ENTER(spa) {						\
    108 	if (spa->spa_state == POOL_STATE_IO_FAILURE) {			\
    109 		mutex_enter(&spa->spa_zio_lock);			\
    110 		while (spa->spa_state == POOL_STATE_IO_FAILURE)		\
    111 			cv_wait(&spa->spa_zio_cv, &spa->spa_zio_lock);	\
    112 		mutex_exit(&spa->spa_zio_lock);				\
    113 	}								\
    114 }
    115 
    116 /*
    117  * An allocation zio is one that either currently has the DVA allocate
    118  * stage set or will have it later in it's lifetime.
    119  */
    120 #define	IO_IS_ALLOCATING(zio) \
    121 	((zio)->io_orig_pipeline & (1U << ZIO_STAGE_DVA_ALLOCATE))
    122 
    123 void
    124 zio_init(void)
    125 {
    126 	size_t c;
    127 	vmem_t *data_alloc_arena = NULL;
    128 
    129 #ifdef _KERNEL
    130 	data_alloc_arena = zio_alloc_arena;
    131 #endif
    132 
    133 	zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0,
    134 	    NULL, NULL, NULL, NULL, NULL, 0);
    135 
    136 	/*
    137 	 * For small buffers, we want a cache for each multiple of
    138 	 * SPA_MINBLOCKSIZE.  For medium-size buffers, we want a cache
    139 	 * for each quarter-power of 2.  For large buffers, we want
    140 	 * a cache for each multiple of PAGESIZE.
    141 	 */
    142 	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
    143 		size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
    144 		size_t p2 = size;
    145 		size_t align = 0;
    146 
    147 		while (p2 & (p2 - 1))
    148 			p2 &= p2 - 1;
    149 
    150 		if (size <= 4 * SPA_MINBLOCKSIZE) {
    151 			align = SPA_MINBLOCKSIZE;
    152 		} else if (P2PHASE(size, PAGESIZE) == 0) {
    153 			align = PAGESIZE;
    154 		} else if (P2PHASE(size, p2 >> 2) == 0) {
    155 			align = p2 >> 2;
    156 		}
    157 
    158 		if (align != 0) {
    159 			char name[36];
    160 			(void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
    161 			zio_buf_cache[c] = kmem_cache_create(name, size,
    162 			    align, NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG);
    163 
    164 			(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
    165 			zio_data_buf_cache[c] = kmem_cache_create(name, size,
    166 			    align, NULL, NULL, NULL, NULL, data_alloc_arena,
    167 			    KMC_NODEBUG);
    168 
    169 		}
    170 	}
    171 
    172 	while (--c != 0) {
    173 		ASSERT(zio_buf_cache[c] != NULL);
    174 		if (zio_buf_cache[c - 1] == NULL)
    175 			zio_buf_cache[c - 1] = zio_buf_cache[c];
    176 
    177 		ASSERT(zio_data_buf_cache[c] != NULL);
    178 		if (zio_data_buf_cache[c - 1] == NULL)
    179 			zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
    180 	}
    181 
    182 	zio_taskq = taskq_create("zio_taskq", zio_resume_threads,
    183 	    maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE);
    184 
    185 	zio_inject_init();
    186 }
    187 
    188 void
    189 zio_fini(void)
    190 {
    191 	size_t c;
    192 	kmem_cache_t *last_cache = NULL;
    193 	kmem_cache_t *last_data_cache = NULL;
    194 
    195 	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
    196 		if (zio_buf_cache[c] != last_cache) {
    197 			last_cache = zio_buf_cache[c];
    198 			kmem_cache_destroy(zio_buf_cache[c]);
    199 		}
    200 		zio_buf_cache[c] = NULL;
    201 
    202 		if (zio_data_buf_cache[c] != last_data_cache) {
    203 			last_data_cache = zio_data_buf_cache[c];
    204 			kmem_cache_destroy(zio_data_buf_cache[c]);
    205 		}
    206 		zio_data_buf_cache[c] = NULL;
    207 	}
    208 
    209 	taskq_destroy(zio_taskq);
    210 
    211 	kmem_cache_destroy(zio_cache);
    212 
    213 	zio_inject_fini();
    214 }
    215 
    216 /*
    217  * ==========================================================================
    218  * Allocate and free I/O buffers
    219  * ==========================================================================
    220  */
    221 
    222 /*
    223  * Use zio_buf_alloc to allocate ZFS metadata.  This data will appear in a
    224  * crashdump if the kernel panics, so use it judiciously.  Obviously, it's
    225  * useful to inspect ZFS metadata, but if possible, we should avoid keeping
    226  * excess / transient data in-core during a crashdump.
    227  */
    228 void *
    229 zio_buf_alloc(size_t size)
    230 {
    231 	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
    232 
    233 	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
    234 
    235 	return (kmem_cache_alloc(zio_buf_cache[c], KM_SLEEP));
    236 }
    237 
    238 /*
    239  * Use zio_data_buf_alloc to allocate data.  The data will not appear in a
    240  * crashdump if the kernel panics.  This exists so that we will limit the amount
    241  * of ZFS data that shows up in a kernel crashdump.  (Thus reducing the amount
    242  * of kernel heap dumped to disk when the kernel panics)
    243  */
    244 void *
    245 zio_data_buf_alloc(size_t size)
    246 {
    247 	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
    248 
    249 	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
    250 
    251 	return (kmem_cache_alloc(zio_data_buf_cache[c], KM_SLEEP));
    252 }
    253 
    254 void
    255 zio_buf_free(void *buf, size_t size)
    256 {
    257 	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
    258 
    259 	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
    260 
    261 	kmem_cache_free(zio_buf_cache[c], buf);
    262 }
    263 
    264 void
    265 zio_data_buf_free(void *buf, size_t size)
    266 {
    267 	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
    268 
    269 	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
    270 
    271 	kmem_cache_free(zio_data_buf_cache[c], buf);
    272 }
    273 
    274 /*
    275  * ==========================================================================
    276  * Push and pop I/O transform buffers
    277  * ==========================================================================
    278  */
    279 static void
    280 zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize)
    281 {
    282 	zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
    283 
    284 	zt->zt_data = data;
    285 	zt->zt_size = size;
    286 	zt->zt_bufsize = bufsize;
    287 
    288 	zt->zt_next = zio->io_transform_stack;
    289 	zio->io_transform_stack = zt;
    290 
    291 	zio->io_data = data;
    292 	zio->io_size = size;
    293 }
    294 
    295 static void
    296 zio_pop_transform(zio_t *zio, void **data, uint64_t *size, uint64_t *bufsize)
    297 {
    298 	zio_transform_t *zt = zio->io_transform_stack;
    299 
    300 	*data = zt->zt_data;
    301 	*size = zt->zt_size;
    302 	*bufsize = zt->zt_bufsize;
    303 
    304 	zio->io_transform_stack = zt->zt_next;
    305 	kmem_free(zt, sizeof (zio_transform_t));
    306 
    307 	if ((zt = zio->io_transform_stack) != NULL) {
    308 		zio->io_data = zt->zt_data;
    309 		zio->io_size = zt->zt_size;
    310 	}
    311 }
    312 
    313 static void
    314 zio_clear_transform_stack(zio_t *zio)
    315 {
    316 	void *data;
    317 	uint64_t size, bufsize;
    318 
    319 	ASSERT(zio->io_transform_stack != NULL);
    320 
    321 	zio_pop_transform(zio, &data, &size, &bufsize);
    322 	while (zio->io_transform_stack != NULL) {
    323 		zio_buf_free(data, bufsize);
    324 		zio_pop_transform(zio, &data, &size, &bufsize);
    325 	}
    326 }
    327 
    328 /*
    329  * ==========================================================================
    330  * Create the various types of I/O (read, write, free)
    331  * ==========================================================================
    332  */
    333 static zio_t *
    334 zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    335     void *data, uint64_t size, zio_done_func_t *done, void *private,
    336     zio_type_t type, int priority, int flags, uint8_t stage, uint32_t pipeline)
    337 {
    338 	zio_t *zio;
    339 
    340 	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
    341 	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
    342 
    343 	zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
    344 	bzero(zio, sizeof (zio_t));
    345 	zio->io_parent = pio;
    346 	zio->io_spa = spa;
    347 	zio->io_txg = txg;
    348 	zio->io_flags = flags;
    349 	if (bp != NULL) {
    350 		zio->io_bp = bp;
    351 		zio->io_bp_copy = *bp;
    352 		zio->io_bp_orig = *bp;
    353 	}
    354 	zio->io_done = done;
    355 	zio->io_private = private;
    356 	zio->io_type = type;
    357 	zio->io_priority = priority;
    358 	zio->io_stage = stage;
    359 	zio->io_pipeline = pipeline;
    360 	zio->io_timestamp = lbolt64;
    361 	mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
    362 	cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
    363 	zio_push_transform(zio, data, size, size);
    364 
    365 	/*
    366 	 * Note on config lock:
    367 	 *
    368 	 * If CONFIG_HELD is set, then the caller already has the config
    369 	 * lock, so we don't need it for this io.
    370 	 *
    371 	 * We set CONFIG_GRABBED to indicate that we have grabbed the
    372 	 * config lock on behalf of this io, so it should be released
    373 	 * in zio_done.
    374 	 *
    375 	 * Unless CONFIG_HELD is set, we will grab the config lock for
    376 	 * any top-level (parent-less) io, *except* NULL top-level ios.
    377 	 * The NULL top-level ios rarely have any children, so we delay
    378 	 * grabbing the lock until the first child is added (but it is
    379 	 * still grabbed on behalf of the top-level i/o, so additional
    380 	 * children don't need to also grab it).  This greatly reduces
    381 	 * contention on the config lock.
    382 	 */
    383 	if (pio == NULL) {
    384 		if (type != ZIO_TYPE_NULL &&
    385 		    !(flags & ZIO_FLAG_CONFIG_HELD)) {
    386 			spa_config_enter(spa, RW_READER, zio);
    387 			zio->io_flags |= ZIO_FLAG_CONFIG_GRABBED;
    388 		}
    389 		zio->io_root = zio;
    390 	} else {
    391 		zio->io_root = pio->io_root;
    392 		if (!(flags & ZIO_FLAG_NOBOOKMARK))
    393 			zio->io_logical = pio->io_logical;
    394 		mutex_enter(&pio->io_lock);
    395 		if (pio->io_parent == NULL &&
    396 		    pio->io_type == ZIO_TYPE_NULL &&
    397 		    !(pio->io_flags & ZIO_FLAG_CONFIG_GRABBED) &&
    398 		    !(pio->io_flags & ZIO_FLAG_CONFIG_HELD)) {
    399 			pio->io_flags |= ZIO_FLAG_CONFIG_GRABBED;
    400 			spa_config_enter(spa, RW_READER, pio);
    401 		}
    402 		if (stage < ZIO_STAGE_READY)
    403 			pio->io_children_notready++;
    404 		pio->io_children_notdone++;
    405 		zio->io_sibling_next = pio->io_child;
    406 		zio->io_sibling_prev = NULL;
    407 		if (pio->io_child != NULL)
    408 			pio->io_child->io_sibling_prev = zio;
    409 		pio->io_child = zio;
    410 		zio->io_ndvas = pio->io_ndvas;
    411 		mutex_exit(&pio->io_lock);
    412 	}
    413 
    414 	/*
    415 	 * Save off the original state incase we need to retry later.
    416 	 */
    417 	zio->io_orig_stage = zio->io_stage;
    418 	zio->io_orig_pipeline = zio->io_pipeline;
    419 	zio->io_orig_flags = zio->io_flags;
    420 
    421 	return (zio);
    422 }
    423 
    424 static void
    425 zio_reset(zio_t *zio)
    426 {
    427 	zio_clear_transform_stack(zio);
    428 
    429 	zio->io_flags = zio->io_orig_flags;
    430 	zio->io_stage = zio->io_orig_stage;
    431 	zio->io_pipeline = zio->io_orig_pipeline;
    432 	zio_push_transform(zio, zio->io_data, zio->io_size, zio->io_size);
    433 }
    434 
    435 zio_t *
    436 zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private,
    437 	int flags)
    438 {
    439 	zio_t *zio;
    440 
    441 	zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
    442 	    ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, ZIO_STAGE_OPEN,
    443 	    ZIO_WAIT_FOR_CHILDREN_PIPELINE);
    444 
    445 	return (zio);
    446 }
    447 
    448 zio_t *
    449 zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags)
    450 {
    451 	return (zio_null(NULL, spa, done, private, flags));
    452 }
    453 
    454 zio_t *
    455 zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data,
    456     uint64_t size, zio_done_func_t *done, void *private,
    457     int priority, int flags, zbookmark_t *zb)
    458 {
    459 	zio_t *zio;
    460 
    461 	ASSERT3U(size, ==, BP_GET_LSIZE(bp));
    462 
    463 	/*
    464 	 * If the user has specified that we allow I/Os to continue
    465 	 * then attempt to satisfy the read.
    466 	 */
    467 	if (spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)
    468 		ZIO_ENTER(spa);
    469 
    470 	zio = zio_create(pio, spa, bp->blk_birth, bp, data, size, done, private,
    471 	    ZIO_TYPE_READ, priority, flags | ZIO_FLAG_USER,
    472 	    ZIO_STAGE_OPEN, ZIO_READ_PIPELINE);
    473 	zio->io_bookmark = *zb;
    474 
    475 	zio->io_logical = zio;
    476 
    477 	/*
    478 	 * Work off our copy of the bp so the caller can free it.
    479 	 */
    480 	zio->io_bp = &zio->io_bp_copy;
    481 
    482 	return (zio);
    483 }
    484 
    485 zio_t *
    486 zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies,
    487     uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
    488     zio_done_func_t *ready, zio_done_func_t *done, void *private, int priority,
    489     int flags, zbookmark_t *zb)
    490 {
    491 	zio_t *zio;
    492 
    493 	ASSERT(checksum >= ZIO_CHECKSUM_OFF &&
    494 	    checksum < ZIO_CHECKSUM_FUNCTIONS);
    495 
    496 	ASSERT(compress >= ZIO_COMPRESS_OFF &&
    497 	    compress < ZIO_COMPRESS_FUNCTIONS);
    498 
    499 	ZIO_ENTER(spa);
    500 
    501 	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
    502 	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER,
    503 	    ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE);
    504 
    505 	zio->io_ready = ready;
    506 
    507 	zio->io_bookmark = *zb;
    508 
    509 	zio->io_logical = zio;
    510 
    511 	zio->io_checksum = checksum;
    512 	zio->io_compress = compress;
    513 	zio->io_ndvas = ncopies;
    514 
    515 	if (bp->blk_birth != txg) {
    516 		/* XXX the bp usually (always?) gets re-zeroed later */
    517 		BP_ZERO(bp);
    518 		BP_SET_LSIZE(bp, size);
    519 		BP_SET_PSIZE(bp, size);
    520 	} else {
    521 		/* Make sure someone doesn't change their mind on overwrites */
    522 		ASSERT(MIN(zio->io_ndvas + BP_IS_GANG(bp),
    523 		    spa_max_replication(spa)) == BP_GET_NDVAS(bp));
    524 	}
    525 
    526 	return (zio);
    527 }
    528 
    529 zio_t *
    530 zio_rewrite(zio_t *pio, spa_t *spa, int checksum,
    531     uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
    532     zio_done_func_t *done, void *private, int priority, int flags,
    533     zbookmark_t *zb)
    534 {
    535 	zio_t *zio;
    536 
    537 	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
    538 	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER,
    539 	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE(bp));
    540 
    541 	zio->io_bookmark = *zb;
    542 	zio->io_checksum = checksum;
    543 	zio->io_compress = ZIO_COMPRESS_OFF;
    544 
    545 	if (pio != NULL)
    546 		ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp));
    547 
    548 	return (zio);
    549 }
    550 
    551 static void
    552 zio_write_allocate_ready(zio_t *zio)
    553 {
    554 	/* Free up the previous block */
    555 	if (!BP_IS_HOLE(&zio->io_bp_orig)) {
    556 		zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg,
    557 		    &zio->io_bp_orig, NULL, NULL));
    558 	}
    559 }
    560 
    561 static zio_t *
    562 zio_write_allocate(zio_t *pio, spa_t *spa, int checksum,
    563     uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
    564     zio_done_func_t *done, void *private, int priority, int flags)
    565 {
    566 	zio_t *zio;
    567 
    568 	BP_ZERO(bp);
    569 	BP_SET_LSIZE(bp, size);
    570 	BP_SET_PSIZE(bp, size);
    571 	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
    572 
    573 	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
    574 	    ZIO_TYPE_WRITE, priority, flags,
    575 	    ZIO_STAGE_OPEN, ZIO_WRITE_ALLOCATE_PIPELINE);
    576 
    577 	zio->io_checksum = checksum;
    578 	zio->io_compress = ZIO_COMPRESS_OFF;
    579 	zio->io_ready = zio_write_allocate_ready;
    580 
    581 	return (zio);
    582 }
    583 
    584 zio_t *
    585 zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    586     zio_done_func_t *done, void *private)
    587 {
    588 	zio_t *zio;
    589 
    590 	ASSERT(!BP_IS_HOLE(bp));
    591 
    592 	if (txg == spa->spa_syncing_txg &&
    593 	    spa->spa_sync_pass > zio_sync_pass.zp_defer_free) {
    594 		bplist_enqueue_deferred(&spa->spa_sync_bplist, bp);
    595 		return (zio_null(pio, spa, NULL, NULL, 0));
    596 	}
    597 
    598 	zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
    599 	    ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, ZIO_FLAG_USER,
    600 	    ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE(bp));
    601 
    602 	zio->io_bp = &zio->io_bp_copy;
    603 
    604 	return (zio);
    605 }
    606 
    607 zio_t *
    608 zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    609     zio_done_func_t *done, void *private)
    610 {
    611 	zio_t *zio;
    612 
    613 	/*
    614 	 * A claim is an allocation of a specific block.  Claims are needed
    615 	 * to support immediate writes in the intent log.  The issue is that
    616 	 * immediate writes contain committed data, but in a txg that was
    617 	 * *not* committed.  Upon opening the pool after an unclean shutdown,
    618 	 * the intent log claims all blocks that contain immediate write data
    619 	 * so that the SPA knows they're in use.
    620 	 *
    621 	 * All claims *must* be resolved in the first txg -- before the SPA
    622 	 * starts allocating blocks -- so that nothing is allocated twice.
    623 	 */
    624 	ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
    625 	ASSERT3U(spa_first_txg(spa), <=, txg);
    626 
    627 	zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
    628 	    ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, 0,
    629 	    ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE(bp));
    630 
    631 	zio->io_bp = &zio->io_bp_copy;
    632 
    633 	return (zio);
    634 }
    635 
    636 zio_t *
    637 zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
    638     zio_done_func_t *done, void *private, int priority, int flags)
    639 {
    640 	zio_t *zio;
    641 	int c;
    642 
    643 	if (vd->vdev_children == 0) {
    644 		zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
    645 		    ZIO_TYPE_IOCTL, priority, flags,
    646 		    ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
    647 
    648 		zio->io_vd = vd;
    649 		zio->io_cmd = cmd;
    650 	} else {
    651 		zio = zio_null(pio, spa, NULL, NULL, flags);
    652 
    653 		for (c = 0; c < vd->vdev_children; c++)
    654 			zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
    655 			    done, private, priority, flags));
    656 	}
    657 
    658 	return (zio);
    659 }
    660 
    661 static void
    662 zio_phys_bp_init(vdev_t *vd, blkptr_t *bp, uint64_t offset, uint64_t size,
    663     int checksum, boolean_t labels)
    664 {
    665 	ASSERT(vd->vdev_children == 0);
    666 
    667 	ASSERT(size <= SPA_MAXBLOCKSIZE);
    668 	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
    669 	ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
    670 
    671 #ifdef ZFS_DEBUG
    672 	if (labels) {
    673 		ASSERT(offset + size <= VDEV_LABEL_START_SIZE ||
    674 		    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
    675 	}
    676 #endif
    677 	ASSERT3U(offset + size, <=, vd->vdev_psize);
    678 
    679 	BP_ZERO(bp);
    680 
    681 	BP_SET_LSIZE(bp, size);
    682 	BP_SET_PSIZE(bp, size);
    683 
    684 	BP_SET_CHECKSUM(bp, checksum);
    685 	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
    686 	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
    687 
    688 	if (checksum != ZIO_CHECKSUM_OFF)
    689 		ZIO_SET_CHECKSUM(&bp->blk_cksum, offset, 0, 0, 0);
    690 }
    691 
    692 zio_t *
    693 zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    694     void *data, int checksum, zio_done_func_t *done, void *private,
    695     int priority, int flags, boolean_t labels)
    696 {
    697 	zio_t *zio;
    698 	blkptr_t blk;
    699 
    700 	ZIO_ENTER(vd->vdev_spa);
    701 
    702 	zio_phys_bp_init(vd, &blk, offset, size, checksum, labels);
    703 
    704 	zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private,
    705 	    ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL,
    706 	    ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
    707 
    708 	zio->io_vd = vd;
    709 	zio->io_offset = offset;
    710 
    711 	/*
    712 	 * Work off our copy of the bp so the caller can free it.
    713 	 */
    714 	zio->io_bp = &zio->io_bp_copy;
    715 
    716 	return (zio);
    717 }
    718 
    719 zio_t *
    720 zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    721     void *data, int checksum, zio_done_func_t *done, void *private,
    722     int priority, int flags, boolean_t labels)
    723 {
    724 	zio_block_tail_t *zbt;
    725 	void *wbuf;
    726 	zio_t *zio;
    727 	blkptr_t blk;
    728 
    729 	ZIO_ENTER(vd->vdev_spa);
    730 
    731 	zio_phys_bp_init(vd, &blk, offset, size, checksum, labels);
    732 
    733 	zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private,
    734 	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL,
    735 	    ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
    736 
    737 	zio->io_vd = vd;
    738 	zio->io_offset = offset;
    739 
    740 	zio->io_bp = &zio->io_bp_copy;
    741 	zio->io_checksum = checksum;
    742 
    743 	if (zio_checksum_table[checksum].ci_zbt) {
    744 		/*
    745 		 * zbt checksums are necessarily destructive -- they modify
    746 		 * one word of the write buffer to hold the verifier/checksum.
    747 		 * Therefore, we must make a local copy in case the data is
    748 		 * being written to multiple places.
    749 		 */
    750 		wbuf = zio_buf_alloc(size);
    751 		bcopy(data, wbuf, size);
    752 		zio_push_transform(zio, wbuf, size, size);
    753 
    754 		zbt = (zio_block_tail_t *)((char *)wbuf + size) - 1;
    755 		zbt->zbt_cksum = blk.blk_cksum;
    756 	}
    757 
    758 	return (zio);
    759 }
    760 
    761 /*
    762  * Create a child I/O to do some work for us.  It has no associated bp.
    763  */
    764 zio_t *
    765 zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
    766 	void *data, uint64_t size, int type, int priority, int flags,
    767 	zio_done_func_t *done, void *private)
    768 {
    769 	uint32_t pipeline = ZIO_VDEV_CHILD_PIPELINE;
    770 	zio_t *cio;
    771 
    772 	if (type == ZIO_TYPE_READ && bp != NULL) {
    773 		/*
    774 		 * If we have the bp, then the child should perform the
    775 		 * checksum and the parent need not.  This pushes error
    776 		 * detection as close to the leaves as possible and
    777 		 * eliminates redundant checksums in the interior nodes.
    778 		 */
    779 		pipeline |= 1U << ZIO_STAGE_CHECKSUM_VERIFY;
    780 		zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
    781 	}
    782 
    783 	cio = zio_create(zio, zio->io_spa, zio->io_txg, bp, data, size,
    784 	    done, private, type, priority,
    785 	    (zio->io_flags & ZIO_FLAG_VDEV_INHERIT) | ZIO_FLAG_CANFAIL | flags,
    786 	    ZIO_STAGE_VDEV_IO_START - 1, pipeline);
    787 
    788 	cio->io_vd = vd;
    789 	cio->io_offset = offset;
    790 
    791 	return (cio);
    792 }
    793 
    794 /*
    795  * ==========================================================================
    796  * Initiate I/O, either sync or async
    797  * ==========================================================================
    798  */
    799 int
    800 zio_wait(zio_t *zio)
    801 {
    802 	int error;
    803 
    804 	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
    805 
    806 	zio->io_waiter = curthread;
    807 
    808 	zio_execute(zio);
    809 
    810 	mutex_enter(&zio->io_lock);
    811 	while (zio->io_stalled != ZIO_STAGE_DONE)
    812 		cv_wait(&zio->io_cv, &zio->io_lock);
    813 	mutex_exit(&zio->io_lock);
    814 
    815 	error = zio->io_error;
    816 	mutex_destroy(&zio->io_lock);
    817 	cv_destroy(&zio->io_cv);
    818 	kmem_cache_free(zio_cache, zio);
    819 
    820 	return (error);
    821 }
    822 
    823 void
    824 zio_nowait(zio_t *zio)
    825 {
    826 	zio_execute(zio);
    827 }
    828 
    829 void
    830 zio_interrupt(zio_t *zio)
    831 {
    832 	(void) taskq_dispatch(zio->io_spa->spa_zio_intr_taskq[zio->io_type],
    833 	    (task_func_t *)zio_execute, zio, TQ_SLEEP);
    834 }
    835 
    836 static int
    837 zio_issue_async(zio_t *zio)
    838 {
    839 	(void) taskq_dispatch(zio->io_spa->spa_zio_issue_taskq[zio->io_type],
    840 	    (task_func_t *)zio_execute, zio, TQ_SLEEP);
    841 
    842 	return (ZIO_PIPELINE_STOP);
    843 }
    844 
    845 /*
    846  * ==========================================================================
    847  * I/O pipeline interlocks: parent/child dependency scoreboarding
    848  * ==========================================================================
    849  */
    850 static int
    851 zio_wait_for_children(zio_t *zio, uint32_t stage, uint64_t *countp)
    852 {
    853 	int rv = ZIO_PIPELINE_CONTINUE;
    854 
    855 	mutex_enter(&zio->io_lock);
    856 	ASSERT(zio->io_stalled == 0);
    857 	if (*countp != 0) {
    858 		zio->io_stalled = stage;
    859 		rv = ZIO_PIPELINE_STOP;
    860 	}
    861 	mutex_exit(&zio->io_lock);
    862 
    863 	return (rv);
    864 }
    865 
    866 static void
    867 zio_notify_parent(zio_t *zio, uint32_t stage, uint64_t *countp)
    868 {
    869 	zio_t *pio = zio->io_parent;
    870 
    871 	mutex_enter(&pio->io_lock);
    872 	if (pio->io_error == 0 && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
    873 		pio->io_error = zio->io_error;
    874 	ASSERT3U(*countp, >, 0);
    875 	if (--*countp == 0 && pio->io_stalled == stage) {
    876 		pio->io_stalled = 0;
    877 		mutex_exit(&pio->io_lock);
    878 		zio_execute(pio);
    879 	} else {
    880 		mutex_exit(&pio->io_lock);
    881 	}
    882 }
    883 
    884 int
    885 zio_wait_for_children_ready(zio_t *zio)
    886 {
    887 	return (zio_wait_for_children(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_READY,
    888 	    &zio->io_children_notready));
    889 }
    890 
    891 int
    892 zio_wait_for_children_done(zio_t *zio)
    893 {
    894 	return (zio_wait_for_children(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_DONE,
    895 	    &zio->io_children_notdone));
    896 }
    897 
    898 static int
    899 zio_read_init(zio_t *zio)
    900 {
    901 	blkptr_t *bp = zio->io_bp;
    902 
    903 	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
    904 		uint64_t csize = BP_GET_PSIZE(bp);
    905 		void *cbuf = zio_buf_alloc(csize);
    906 
    907 		zio_push_transform(zio, cbuf, csize, csize);
    908 		zio->io_pipeline |= 1U << ZIO_STAGE_READ_DECOMPRESS;
    909 	}
    910 
    911 	if (BP_IS_GANG(bp)) {
    912 		uint64_t gsize = SPA_GANGBLOCKSIZE;
    913 		void *gbuf = zio_buf_alloc(gsize);
    914 
    915 		zio_push_transform(zio, gbuf, gsize, gsize);
    916 		zio->io_pipeline |= 1U << ZIO_STAGE_READ_GANG_MEMBERS;
    917 	}
    918 
    919 	if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0)
    920 		zio->io_flags |= ZIO_FLAG_DONT_CACHE;
    921 
    922 	return (ZIO_PIPELINE_CONTINUE);
    923 }
    924 
    925 static int
    926 zio_ready(zio_t *zio)
    927 {
    928 	zio_t *pio = zio->io_parent;
    929 
    930 	if (zio->io_ready)
    931 		zio->io_ready(zio);
    932 
    933 	if (pio != NULL)
    934 		zio_notify_parent(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_READY,
    935 		    &pio->io_children_notready);
    936 
    937 	if (zio->io_bp)
    938 		zio->io_bp_copy = *zio->io_bp;
    939 
    940 	return (ZIO_PIPELINE_CONTINUE);
    941 }
    942 
    943 static int
    944 zio_vdev_retry_io(zio_t *zio)
    945 {
    946 	zio_t *pio = zio->io_parent;
    947 
    948 	/*
    949 	 * Preserve the failed bp so that the io_ready() callback can
    950 	 * update the accounting accordingly. The callback will also be
    951 	 * responsible for freeing the previously allocated block, if one
    952 	 * exists.
    953 	 */
    954 	zio->io_bp_orig = *zio->io_bp;
    955 
    956 	/*
    957 	 * We must zero out the old DVA and blk_birth before reallocating
    958 	 * the bp.
    959 	 */
    960 	BP_ZERO_DVAS(zio->io_bp);
    961 	zio_reset(zio);
    962 
    963 	if (pio) {
    964 		/*
    965 		 * Let the parent know that we will
    966 		 * re-alloc the write (=> new bp info).
    967 		 */
    968 		mutex_enter(&pio->io_lock);
    969 		pio->io_children_notready++;
    970 
    971 		/*
    972 		 * If the parent I/O is still in the open stage, then
    973 		 * don't bother telling it to retry since it hasn't
    974 		 * progressed far enough for it to care.
    975 		 */
    976 		if (pio->io_stage > ZIO_STAGE_OPEN && IO_IS_ALLOCATING(pio))
    977 			pio->io_flags |= ZIO_FLAG_WRITE_RETRY;
    978 
    979 		ASSERT(pio->io_stage <= ZIO_STAGE_WAIT_FOR_CHILDREN_DONE);
    980 		mutex_exit(&pio->io_lock);
    981 	}
    982 
    983 	/*
    984 	 * We are getting ready to process the retry request so clear
    985 	 * the flag and the zio's current error status.
    986 	 */
    987 	zio->io_flags &= ~ZIO_FLAG_WRITE_RETRY;
    988 	zio->io_error = 0;
    989 
    990 	return (ZIO_PIPELINE_CONTINUE);
    991 }
    992 
    993 int
    994 zio_vdev_resume_io(spa_t *spa)
    995 {
    996 	zio_t *zio;
    997 
    998 	mutex_enter(&spa->spa_zio_lock);
    999 
   1000 	/*
   1001 	 * Probe all of vdevs that have experienced an I/O error.
   1002 	 * If we are still unable to verify the integrity of the vdev
   1003 	 * then we prevent the resume from proceeeding.
   1004 	 */
   1005 	for (zio = list_head(&spa->spa_zio_list); zio != NULL;
   1006 	    zio = list_next(&spa->spa_zio_list, zio)) {
   1007 		int error = 0;
   1008