1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 #include <sys/zfs_context.h> 29 #include <sys/fm/fs/zfs.h> 30 #include <sys/spa.h> 31 #include <sys/txg.h> 32 #include <sys/spa_impl.h> 33 #include <sys/vdev_impl.h> 34 #include <sys/zio_impl.h> 35 #include <sys/zio_compress.h> 36 #include <sys/zio_checksum.h> 37 38 /* 39 * ========================================================================== 40 * I/O priority table 41 * ========================================================================== 42 */ 43 uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = { 44 0, /* ZIO_PRIORITY_NOW */ 45 0, /* ZIO_PRIORITY_SYNC_READ */ 46 0, /* ZIO_PRIORITY_SYNC_WRITE */ 47 6, /* ZIO_PRIORITY_ASYNC_READ */ 48 4, /* ZIO_PRIORITY_ASYNC_WRITE */ 49 4, /* ZIO_PRIORITY_FREE */ 50 0, /* ZIO_PRIORITY_CACHE_FILL */ 51 0, /* ZIO_PRIORITY_LOG_WRITE */ 52 10, /* ZIO_PRIORITY_RESILVER */ 53 20, /* ZIO_PRIORITY_SCRUB */ 54 }; 55 56 /* 57 * ========================================================================== 58 * I/O type descriptions 59 * ========================================================================== 60 */ 61 char *zio_type_name[ZIO_TYPES] = { 62 "null", "read", "write", "free", "claim", "ioctl" }; 63 64 /* Force an allocation failure when non-zero */ 65 uint16_t zio_zil_fail_shift = 0; 66 uint16_t zio_io_fail_shift = 0; 67 68 /* Enable/disable the write-retry logic */ 69 int zio_write_retry = 1; 70 71 /* Taskq to handle reissuing of I/Os */ 72 taskq_t *zio_taskq; 73 int zio_resume_threads = 4; 74 75 typedef struct zio_sync_pass { 76 int zp_defer_free; /* defer frees after this pass */ 77 int zp_dontcompress; /* don't compress after this pass */ 78 int zp_rewrite; /* rewrite new bps after this pass */ 79 } zio_sync_pass_t; 80 81 zio_sync_pass_t zio_sync_pass = { 82 1, /* zp_defer_free */ 83 4, /* zp_dontcompress */ 84 1, /* zp_rewrite */ 85 }; 86 87 static boolean_t zio_io_should_fail(uint16_t); 88 89 /* 90 * ========================================================================== 91 * I/O kmem caches 92 * ========================================================================== 93 */ 94 kmem_cache_t *zio_cache; 95 kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 96 kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 97 98 #ifdef _KERNEL 99 extern vmem_t *zio_alloc_arena; 100 #endif 101 102 /* 103 * Determine if we are allowed to issue the IO based on the 104 * pool state. If we must wait then block until we are told 105 * that we may continue. 106 */ 107 #define ZIO_ENTER(spa) { \ 108 if (spa->spa_state == POOL_STATE_IO_FAILURE) { \ 109 mutex_enter(&spa->spa_zio_lock); \ 110 while (spa->spa_state == POOL_STATE_IO_FAILURE) \ 111 cv_wait(&spa->spa_zio_cv, &spa->spa_zio_lock); \ 112 mutex_exit(&spa->spa_zio_lock); \ 113 } \ 114 } 115 116 /* 117 * An allocation zio is one that either currently has the DVA allocate 118 * stage set or will have it later in it's lifetime. 119 */ 120 #define IO_IS_ALLOCATING(zio) \ 121 ((zio)->io_orig_pipeline & (1U << ZIO_STAGE_DVA_ALLOCATE)) 122 123 void 124 zio_init(void) 125 { 126 size_t c; 127 vmem_t *data_alloc_arena = NULL; 128 129 #ifdef _KERNEL 130 data_alloc_arena = zio_alloc_arena; 131 #endif 132 133 zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0, 134 NULL, NULL, NULL, NULL, NULL, 0); 135 136 /* 137 * For small buffers, we want a cache for each multiple of 138 * SPA_MINBLOCKSIZE. For medium-size buffers, we want a cache 139 * for each quarter-power of 2. For large buffers, we want 140 * a cache for each multiple of PAGESIZE. 141 */ 142 for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 143 size_t size = (c + 1) << SPA_MINBLOCKSHIFT; 144 size_t p2 = size; 145 size_t align = 0; 146 147 while (p2 & (p2 - 1)) 148 p2 &= p2 - 1; 149 150 if (size <= 4 * SPA_MINBLOCKSIZE) { 151 align = SPA_MINBLOCKSIZE; 152 } else if (P2PHASE(size, PAGESIZE) == 0) { 153 align = PAGESIZE; 154 } else if (P2PHASE(size, p2 >> 2) == 0) { 155 align = p2 >> 2; 156 } 157 158 if (align != 0) { 159 char name[36]; 160 (void) sprintf(name, "zio_buf_%lu", (ulong_t)size); 161 zio_buf_cache[c] = kmem_cache_create(name, size, 162 align, NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG); 163 164 (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size); 165 zio_data_buf_cache[c] = kmem_cache_create(name, size, 166 align, NULL, NULL, NULL, NULL, data_alloc_arena, 167 KMC_NODEBUG); 168 169 } 170 } 171 172 while (--c != 0) { 173 ASSERT(zio_buf_cache[c] != NULL); 174 if (zio_buf_cache[c - 1] == NULL) 175 zio_buf_cache[c - 1] = zio_buf_cache[c]; 176 177 ASSERT(zio_data_buf_cache[c] != NULL); 178 if (zio_data_buf_cache[c - 1] == NULL) 179 zio_data_buf_cache[c - 1] = zio_data_buf_cache[c]; 180 } 181 182 zio_taskq = taskq_create("zio_taskq", zio_resume_threads, 183 maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE); 184 185 zio_inject_init(); 186 } 187 188 void 189 zio_fini(void) 190 { 191 size_t c; 192 kmem_cache_t *last_cache = NULL; 193 kmem_cache_t *last_data_cache = NULL; 194 195 for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 196 if (zio_buf_cache[c] != last_cache) { 197 last_cache = zio_buf_cache[c]; 198 kmem_cache_destroy(zio_buf_cache[c]); 199 } 200 zio_buf_cache[c] = NULL; 201 202 if (zio_data_buf_cache[c] != last_data_cache) { 203 last_data_cache = zio_data_buf_cache[c]; 204 kmem_cache_destroy(zio_data_buf_cache[c]); 205 } 206 zio_data_buf_cache[c] = NULL; 207 } 208 209 taskq_destroy(zio_taskq); 210 211 kmem_cache_destroy(zio_cache); 212 213 zio_inject_fini(); 214 } 215 216 /* 217 * ========================================================================== 218 * Allocate and free I/O buffers 219 * ========================================================================== 220 */ 221 222 /* 223 * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a 224 * crashdump if the kernel panics, so use it judiciously. Obviously, it's 225 * useful to inspect ZFS metadata, but if possible, we should avoid keeping 226 * excess / transient data in-core during a crashdump. 227 */ 228 void * 229 zio_buf_alloc(size_t size) 230 { 231 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 232 233 ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 234 235 return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE)); 236 } 237 238 /* 239 * Use zio_data_buf_alloc to allocate data. The data will not appear in a 240 * crashdump if the kernel panics. This exists so that we will limit the amount 241 * of ZFS data that shows up in a kernel crashdump. (Thus reducing the amount 242 * of kernel heap dumped to disk when the kernel panics) 243 */ 244 void * 245 zio_data_buf_alloc(size_t size) 246 { 247 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 248 249 ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 250 251 return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE)); 252 } 253 254 void 255 zio_buf_free(void *buf, size_t size) 256 { 257 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 258 259 ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 260 261 kmem_cache_free(zio_buf_cache[c], buf); 262 } 263 264 void 265 zio_data_buf_free(void *buf, size_t size) 266 { 267 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 268 269 ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 270 271 kmem_cache_free(zio_data_buf_cache[c], buf); 272 } 273 274 /* 275 * ========================================================================== 276 * Push and pop I/O transform buffers 277 * ========================================================================== 278 */ 279 static void 280 zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize) 281 { 282 zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); 283 284 zt->zt_data = data; 285 zt->zt_size = size; 286 zt->zt_bufsize = bufsize; 287 288 zt->zt_next = zio->io_transform_stack; 289 zio->io_transform_stack = zt; 290 291 zio->io_data = data; 292 zio->io_size = size; 293 } 294 295 static void 296 zio_pop_transform(zio_t *zio, void **data, uint64_t *size, uint64_t *bufsize) 297 { 298 zio_transform_t *zt = zio->io_transform_stack; 299 300 *data = zt->zt_data; 301 *size = zt->zt_size; 302 *bufsize = zt->zt_bufsize; 303 304 zio->io_transform_stack = zt->zt_next; 305 kmem_free(zt, sizeof (zio_transform_t)); 306 307 if ((zt = zio->io_transform_stack) != NULL) { 308 zio->io_data = zt->zt_data; 309 zio->io_size = zt->zt_size; 310 } 311 } 312 313 static void 314 zio_clear_transform_stack(zio_t *zio) 315 { 316 void *data; 317 uint64_t size, bufsize; 318 319 ASSERT(zio->io_transform_stack != NULL); 320 321 zio_pop_transform(zio, &data, &size, &bufsize); 322 while (zio->io_transform_stack != NULL) { 323 zio_buf_free(data, bufsize); 324 zio_pop_transform(zio, &data, &size, &bufsize); 325 } 326 } 327 328 /* 329 * ========================================================================== 330 * Create the various types of I/O (read, write, free) 331 * ========================================================================== 332 */ 333 static zio_t * 334 zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 335 void *data, uint64_t size, zio_done_func_t *done, void *private, 336 zio_type_t type, int priority, int flags, uint8_t stage, uint32_t pipeline) 337 { 338 zio_t *zio; 339 340 ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); 341 ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); 342 343 /* Only we should set CONFIG_GRABBED */ 344 ASSERT(!(flags & ZIO_FLAG_CONFIG_GRABBED)); 345 346 zio = kmem_cache_alloc(zio_cache, KM_SLEEP); 347 bzero(zio, sizeof (zio_t)); 348 zio->io_parent = pio; 349 zio->io_spa = spa; 350 zio->io_txg = txg; 351 zio->io_flags = flags; 352 if (bp != NULL) { 353 zio->io_bp = bp; 354 zio->io_bp_copy = *bp; 355 zio->io_bp_orig = *bp; 356 } 357 zio->io_done = done; 358 zio->io_private = private; 359 zio->io_type = type; 360 zio->io_priority = priority; 361 zio->io_stage = stage; 362 zio->io_pipeline = pipeline; 363 zio->io_timestamp = lbolt64; 364 mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); 365 cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); 366 zio_push_transform(zio, data, size, size); 367 368 /* 369 * Note on config lock: 370 * 371 * If CONFIG_HELD is set, then the caller already has the config 372 * lock, so we don't need it for this io. 373 * 374 * We set CONFIG_GRABBED to indicate that we have grabbed the 375 * config lock on behalf of this io, so it should be released 376 * in zio_done. 377 * 378 * Unless CONFIG_HELD is set, we will grab the config lock for 379 * any top-level (parent-less) io, *except* NULL top-level ios. 380 * The NULL top-level ios rarely have any children, so we delay 381 * grabbing the lock until the first child is added (but it is 382 * still grabbed on behalf of the top-level i/o, so additional 383 * children don't need to also grab it). This greatly reduces 384 * contention on the config lock. 385 */ 386 if (pio == NULL) { 387 if (type != ZIO_TYPE_NULL && 388 !(flags & ZIO_FLAG_CONFIG_HELD)) { 389 spa_config_enter(spa, RW_READER, zio); 390 zio->io_flags |= ZIO_FLAG_CONFIG_GRABBED; 391 } 392 zio->io_root = zio; 393 } else { 394 zio->io_root = pio->io_root; 395 if (!(flags & ZIO_FLAG_NOBOOKMARK)) 396 zio->io_logical = pio->io_logical; 397 mutex_enter(&pio->io_lock); 398 if (pio->io_parent == NULL && 399 pio->io_type == ZIO_TYPE_NULL && 400 !(pio->io_flags & ZIO_FLAG_CONFIG_GRABBED) && 401 !(pio->io_flags & ZIO_FLAG_CONFIG_HELD)) { 402 pio->io_flags |= ZIO_FLAG_CONFIG_GRABBED; 403 spa_config_enter(spa, RW_READER, pio); 404 } 405 if (stage < ZIO_STAGE_READY) 406 pio->io_children_notready++; 407 pio->io_children_notdone++; 408 zio->io_sibling_next = pio->io_child; 409 zio->io_sibling_prev = NULL; 410 if (pio->io_child != NULL) 411 pio->io_child->io_sibling_prev = zio; 412 pio->io_child = zio; 413 zio->io_ndvas = pio->io_ndvas; 414 mutex_exit(&pio->io_lock); 415 } 416 417 /* 418 * Save off the original state incase we need to retry later. 419 */ 420 zio->io_orig_stage = zio->io_stage; 421 zio->io_orig_pipeline = zio->io_pipeline; 422 zio->io_orig_flags = zio->io_flags; 423 424 /* 425 * If this is not a null zio, and config is not already held, 426 * then the root zio should have grabbed the config lock. 427 * If this is not a root zio, it should not have grabbed the 428 * config lock. 429 */ 430 ASSERT((zio->io_root->io_flags & ZIO_FLAG_CONFIG_HELD) || 431 zio->io_type == ZIO_TYPE_NULL || 432 (zio->io_root->io_flags & ZIO_FLAG_CONFIG_GRABBED)); 433 ASSERT(zio->io_root == zio || 434 !(zio->io_flags & ZIO_FLAG_CONFIG_GRABBED)); 435 436 return (zio); 437 } 438 439 static void 440 zio_reset(zio_t *zio) 441 { 442 zio_clear_transform_stack(zio); 443 444 zio->io_flags = zio->io_orig_flags; 445 zio->io_stage = zio->io_orig_stage; 446 zio->io_pipeline = zio->io_orig_pipeline; 447 zio_push_transform(zio, zio->io_data, zio->io_size, zio->io_size); 448 } 449 450 zio_t * 451 zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private, 452 int flags) 453 { 454 zio_t *zio; 455 456 zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, 457 ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, ZIO_STAGE_OPEN, 458 ZIO_WAIT_FOR_CHILDREN_PIPELINE); 459 460 return (zio); 461 } 462 463 zio_t * 464 zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags) 465 { 466 return (zio_null(NULL, spa, done, private, flags)); 467 } 468 469 zio_t * 470 zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, void *data, 471 uint64_t size, zio_done_func_t *done, void *private, 472 int priority, int flags, const zbookmark_t *zb) 473 { 474 zio_t *zio; 475 476 ASSERT3U(size, ==, BP_GET_LSIZE(bp)); 477 478 /* 479 * If the user has specified that we allow I/Os to continue 480 * then attempt to satisfy the read. 481 */ 482 if (spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE) 483 ZIO_ENTER(spa); 484 485 zio = zio_create(pio, spa, bp->blk_birth, (blkptr_t *)bp, 486 data, size, done, private, 487 ZIO_TYPE_READ, priority, flags | ZIO_FLAG_USER, 488 ZIO_STAGE_OPEN, ZIO_READ_PIPELINE); 489 zio->io_bookmark = *zb; 490 491 zio->io_logical = zio; 492 493 /* 494 * Work off our copy of the bp so the caller can free it. 495 */ 496 zio->io_bp = &zio->io_bp_copy; 497 498 return (zio); 499 } 500 501 zio_t * 502 zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies, 503 uint64_t txg, blkptr_t *bp, void *data, uint64_t size, 504 zio_done_func_t *ready, zio_done_func_t *done, void *private, int priority, 505 int flags, const zbookmark_t *zb) 506 { 507 zio_t *zio; 508 509 ASSERT(checksum >= ZIO_CHECKSUM_OFF && 510 checksum < ZIO_CHECKSUM_FUNCTIONS); 511 512 ASSERT(compress >= ZIO_COMPRESS_OFF && 513 compress < ZIO_COMPRESS_FUNCTIONS); 514 515 ZIO_ENTER(spa); 516 517 zio = zio_create(pio, spa, txg, bp, data, size, done, private, 518 ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER, 519 ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE); 520 521 zio->io_ready = ready; 522 523 zio->io_bookmark = *zb; 524 525 zio->io_logical = zio; 526 527 zio->io_checksum = checksum; 528 zio->io_compress = compress; 529 zio->io_ndvas = ncopies; 530 531 if (bp->blk_birth != txg) { 532 /* XXX the bp usually (always?) gets re-zeroed later */ 533 BP_ZERO(bp); 534 BP_SET_LSIZE(bp, size); 535 BP_SET_PSIZE(bp, size); 536 } else { 537 /* Make sure someone doesn't change their mind on overwrites */ 538 ASSERT(MIN(zio->io_ndvas + BP_IS_GANG(bp), 539 spa_max_replication(spa)) == BP_GET_NDVAS(bp)); 540 } 541 542 return (zio); 543 } 544 545 zio_t * 546 zio_rewrite(zio_t *pio, spa_t *spa, int checksum, uint64_t txg, 547 blkptr_t *bp, void *data, uint64_t size, zio_done_func_t *done, 548 void *private, int priority, int flags, zbookmark_t *zb) 549 { 550 zio_t *zio; 551 552 zio = zio_create(pio, spa, txg, bp, data, size, done, private, 553 ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER, 554 ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE(bp)); 555 556 zio->io_bookmark = *zb; 557 zio->io_checksum = checksum; 558 zio->io_compress = ZIO_COMPRESS_OFF; 559 560 if (pio != NULL) 561 ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp)); 562 563 return (zio); 564 } 565 566 static void 567 zio_write_allocate_ready(zio_t *zio) 568 { 569 /* Free up the previous block */ 570 if (!BP_IS_HOLE(&zio->io_bp_orig)) { 571 zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg, 572 &zio->io_bp_orig, NULL, NULL)); 573 } 574 } 575 576 static zio_t * 577 zio_write_allocate(zio_t *pio, spa_t *spa, int checksum, 578 uint64_t txg, blkptr_t *bp, void *data, uint64_t size, 579 zio_done_func_t *done, void *private, int priority, int flags) 580 { 581 zio_t *zio; 582 583 BP_ZERO(bp); 584 BP_SET_LSIZE(bp, size); 585 BP_SET_PSIZE(bp, size); 586 BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); 587 588 zio = zio_create(pio, spa, txg, bp, data, size, done, private, 589 ZIO_TYPE_WRITE, priority, flags, 590 ZIO_STAGE_OPEN, ZIO_WRITE_ALLOCATE_PIPELINE); 591 592 zio->io_checksum = checksum; 593 zio->io_compress = ZIO_COMPRESS_OFF; 594 zio->io_ready = zio_write_allocate_ready; 595 596 return (zio); 597 } 598 599 zio_t * 600 zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 601 zio_done_func_t *done, void *private) 602 { 603 zio_t *zio; 604 605 ASSERT(!BP_IS_HOLE(bp)); 606 607 if (txg == spa->spa_syncing_txg && 608 spa->spa_sync_pass > zio_sync_pass.zp_defer_free) { 609 bplist_enqueue_deferred(&spa->spa_sync_bplist, bp); 610 return (zio_null(pio, spa, NULL, NULL, 0)); 611 } 612 613 zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private, 614 ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, ZIO_FLAG_USER, 615 ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE(bp)); 616 617 zio->io_bp = &zio->io_bp_copy; 618 619 return (zio); 620 } 621 622 zio_t * 623 zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 624 zio_done_func_t *done, void *private) 625 { 626 zio_t *zio; 627 628 /* 629 * A claim is an allocation of a specific block. Claims are needed 630 * to support immediate writes in the intent log. The issue is that 631 * immediate writes contain committed data, but in a txg that was 632 * *not* committed. Upon opening the pool after an unclean shutdown, 633 * the intent log claims all blocks that contain immediate write data 634 * so that the SPA knows they're in use. 635 * 636 * All claims *must* be resolved in the first txg -- before the SPA 637 * starts allocating blocks -- so that nothing is allocated twice. 638 */ 639 ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa)); 640 ASSERT3U(spa_first_txg(spa), <=, txg); 641 642 zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private, 643 ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, 0, 644 ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE(bp)); 645 646 zio->io_bp = &zio->io_bp_copy; 647 648 return (zio); 649 } 650 651 zio_t * 652 zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, 653 zio_done_func_t *done, void *private, int priority, int flags) 654 { 655 zio_t *zio; 656 int c; 657 658 if (vd->vdev_children == 0) { 659 zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, 660 ZIO_TYPE_IOCTL, priority, flags, 661 ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); 662 663 zio->io_vd = vd; 664 zio->io_cmd = cmd; 665 } else { 666 zio = zio_null(pio, spa, NULL, NULL, flags); 667 668 for (c = 0; c < vd->vdev_children; c++) 669 zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, 670 done, private, priority, flags)); 671 } 672 673 return (zio); 674 } 675 676 static void 677 zio_phys_bp_init(vdev_t *vd, blkptr_t *bp, uint64_t offset, uint64_t size, 678 int checksum, boolean_t labels) 679 { 680 ASSERT(vd->vdev_children == 0); 681 682 ASSERT(size <= SPA_MAXBLOCKSIZE); 683 ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); 684 ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); 685 686 #ifdef ZFS_DEBUG 687 if (labels) { 688 ASSERT(offset + size <= VDEV_LABEL_START_SIZE || 689 offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); 690 } 691 #endif 692 ASSERT3U(offset + size, <=, vd->vdev_psize); 693 694 BP_ZERO(bp); 695 696 BP_SET_LSIZE(bp, size); 697 BP_SET_PSIZE(bp, size); 698 699 BP_SET_CHECKSUM(bp, checksum); 700 BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); 701 BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 702 703 if (checksum != ZIO_CHECKSUM_OFF) 704 ZIO_SET_CHECKSUM(&bp->blk_cksum, offset, 0, 0, 0); 705 } 706 707 zio_t * 708 zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 709 void *data, int checksum, zio_done_func_t *done, void *private, 710 int priority, int flags, boolean_t labels) 711 { 712 zio_t *zio; 713 blkptr_t blk; 714 715 ZIO_ENTER(vd->vdev_spa); 716 717 zio_phys_bp_init(vd, &blk, offset, size, checksum, labels); 718 719 zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private, 720 ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, 721 ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); 722 723 zio->io_vd = vd; 724 zio->io_offset = offset; 725 726 /* 727 * Work off our copy of the bp so the caller can free it. 728 */ 729 zio->io_bp = &zio->io_bp_copy; 730 731 return (zio); 732 } 733 734 zio_t * 735 zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 736 void *data, int checksum, zio_done_func_t *done, void *private, 737 int priority, int flags, boolean_t labels) 738 { 739 zio_block_tail_t *zbt; 740 void *wbuf; 741 zio_t *zio; 742 blkptr_t blk; 743 744 ZIO_ENTER(vd->vdev_spa); 745 746 zio_phys_bp_init(vd, &blk, offset, size, checksum, labels); 747 748 zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private, 749 ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, 750 ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); 751 752 zio->io_vd = vd; 753 zio->io_offset = offset; 754 755 zio->io_bp = &zio->io_bp_copy; 756 zio->io_checksum = checksum; 757 758 if (zio_checksum_table[checksum].ci_zbt) { 759 /* 760 * zbt checksums are necessarily destructive -- they modify 761 * one word of the write buffer to hold the verifier/checksum. 762 * Therefore, we must make a local copy in case the data is 763 * being written to multiple places. 764 */ 765 wbuf = zio_buf_alloc(size); 766 bcopy(data, wbuf, size); 767 zio_push_transform(zio, wbuf, size, size); 768 769 zbt = (zio_block_tail_t *)((char *)wbuf + size) - 1; 770 zbt->zbt_cksum = blk.blk_cksum; 771 } 772 773 return (zio); 774 } 775 776 /* 777 * Create a child I/O to do some work for us. It has no associated bp. 778 */ 779 zio_t * 780 zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset, 781 void *data, uint64_t size, int type, int priority, int flags, 782 zio_done_func_t *done, void *private) 783 { 784 uint32_t pipeline = ZIO_VDEV_CHILD_PIPELINE; 785 zio_t *cio; 786 787 if (type == ZIO_TYPE_READ && bp != NULL) { 788 /* 789 * If we have the bp, then the child should perform the 790 * checksum and the parent need not. This pushes error 791 * detection as close to the leaves as possible and 792 * eliminates redundant checksums in the interior nodes. 793 */ 794 pipeline |= 1U << ZIO_STAGE_CHECKSUM_VERIFY; 795 zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY); 796 } 797 798 cio = zio_create(zio, zio->io_spa, zio->io_txg, bp, data, size, 799 done, private, type, priority, 800 (zio->io_flags & ZIO_FLAG_VDEV_INHERIT) | ZIO_FLAG_CANFAIL | flags, 801 ZIO_STAGE_VDEV_IO_START - 1, pipeline); 802 803 cio->io_vd = vd; 804 cio->io_offset = offset; 805 806 return (cio); 807 } 808 809 /* 810 * ========================================================================== 811 * Initiate I/O, either sync or async 812 * ========================================================================== 813 */ 814 static void 815 zio_destroy(zio_t *zio) 816 { 817 mutex_destroy(&zio->io_lock); 818 cv_destroy(&zio->io_cv); 819 if (zio->io_failed_vds != NULL) { 820 kmem_free(zio->io_failed_vds, 821 zio->io_failed_vds_count * sizeof (vdev_t *)); 822 zio->io_failed_vds = NULL; 823 zio->io_failed_vds_count = 0; 824 } 825 kmem_cache_free(zio_cache, zio); 826 } 827 828 int 829 zio_wait(zio_t *zio) 830 { 831 int error; 832 833 ASSERT(zio->io_stage == ZIO_STAGE_OPEN); 834 835 zio->io_waiter = curthread; 836 837 zio_execute(zio); 838 839 mutex_enter(&zio->io_lock); 840 while (zio->io_stalled != ZIO_STAGE_DONE) 841 cv_wait(&zio->io_cv, &zio->io_lock); 842 mutex_exit(&zio->io_lock); 843 844 error = zio->io_error; 845 zio_destroy(zio); 846 847 return (error); 848 } 849 850 void 851 zio_nowait(zio_t *zio) 852 { 853 zio_execute(zio); 854 } 855 856 void 857 zio_interrupt(zio_t *zio) 858 { 859 (void) taskq_dispatch(zio->io_spa->spa_zio_intr_taskq[zio->io_type], 860 (task_func_t *)zio_execute, zio, TQ_SLEEP); 861 } 862 863 static int 864 zio_issue_async(zio_t *zio) 865 { 866 (void) taskq_dispatch(zio->io_spa->spa_zio_issue_taskq[zio->io_type], 867 (task_func_t *)zio_execute, zio, TQ_SLEEP); 868 869 return (ZIO_PIPELINE_STOP); 870 } 871 872 /* 873 * ========================================================================== 874 * I/O pipeline interlocks: parent/child dependency scoreboarding 875 * ========================================================================== 876 */ 877 static int 878 zio_wait_for_children(zio_t *zio, uint32_t stage, uint64_t *countp) 879 { 880 int rv = ZIO_PIPELINE_CONTINUE; 881 882 mutex_enter(&zio->io_lock); 883 ASSERT(zio->io_stalled == 0); 884 if (*countp != 0) { 885 zio->io_stalled = stage; 886 rv = ZIO_PIPELINE_STOP; 887 } 888 mutex_exit(&zio->io_lock); 889 890 return (rv); 891 } 892 893 static void 894 zio_add_failed_vdev(zio_t *pio, zio_t *zio) 895 { 896 uint64_t oldcount = pio->io_failed_vds_count; 897 vdev_t **new_vds; 898 int i; 899 900 ASSERT(MUTEX_HELD(&pio->io_lock)); 901 902 if (zio->io_vd == NULL) 903 return; 904 905 for (i = 0; i < oldcount; i++) { 906 if (pio->io_failed_vds[i] == zio->io_vd) 907 return; 908 } 909 910 new_vds = kmem_zalloc((oldcount + 1) * sizeof (vdev_t *), KM_SLEEP); 911 if (pio->io_failed_vds != NULL) { 912 bcopy(pio->io_failed_vds, new_vds, 913 oldcount * sizeof (vdev_t *)); 914 kmem_free(pio->io_failed_vds, oldcount * sizeof (vdev_t *)); 915 } 916 pio->io_failed_vds = new_vds; 917 pio->io_failed_vds[oldcount] = zio->io_vd; 918 pio->io_failed_vds_count++; 919 } 920 921 static void 922 zio_notify_parent(zio_t *zio, uint32_t stage, uint64_t *countp) 923 { 924 zio_t *pio = zio->io_parent; 925 926 mutex_enter(&pio->io_lock); 927 if (pio->io_error == 0 && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) { 928 pio->io_error = zio->io_error; 929 if (zio->io_error && zio->io_error != ENOTSUP) 930 zio_add_failed_vdev(pio, zio); 931 } 932 ASSERT3U(*countp, >, 0); 933 if (--*countp == 0 && pio->io_stalled == stage) { 934 pio->io_stalled = 0; 935 mutex_exit(&pio->io_lock); 936 zio_execute(pio); 937 } else { 938 mutex_exit(&pio->io_lock); 939 } 940 } 941 942 int 943 zio_wait_for_children_ready(zio_t *zio) 944 { 945 return (zio_wait_for_children(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_READY, 946 &zio->io_children_notready)); 947 } 948 949 int 950 zio_wait_for_children_done(zio_t *zio) 951 { 952 return (zio_wait_for_children(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_DONE, 953 &zio->io_children_notdone)); 954 } 955 956 static int 957 zio_read_init(zio_t *zio) 958 { 959 blkptr_t *bp = zio->io_bp; 960 961 if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) { 962 uint64_t csize = BP_GET_PSIZE(bp); 963 void *cbuf = zio_buf_alloc(csize); 964 965 zio_push_transform(zio, cbuf, csize, csize); 966 zio->io_pipeline |= 1U << ZIO_STAGE_READ_DECOMPRESS; 967 } 968 969 if (BP_IS_GANG(bp)) { 970 uint64_t gsize = SPA_GANGBLOCKSIZE; 971 void *gbuf = zio_buf_alloc(gsize); 972 973 zio_push_transform(zio, gbuf, gsize, gsize); 974 zio->io_pipeline |= 1U << ZIO_STAGE_READ_GANG_MEMBERS; 975 } 976 977 if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0) 978 zio->io_flags |= ZIO_FLAG_DONT_CACHE; 979 980 return (ZIO_PIPELINE_CONTINUE); 981 } 982 983 static int 984 zio_ready(zio_t *zio) 985 { 986 zio_t *pio = zio->io_parent; 987 988 if (zio->io_ready) 989