1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "@(#)zio.c 1.31 07/12/12 SMI" 27 28 #include <sys/zfs_context.h> 29 #include <sys/fm/fs/zfs.h> 30 #include <sys/spa.h> 31 #include <sys/txg.h> 32 #include <sys/spa_impl.h> 33 #include <sys/vdev_impl.h> 34 #include <sys/zio_impl.h> 35 #include <sys/zio_compress.h> 36 #include <sys/zio_checksum.h> 37 38 /* 39 * ========================================================================== 40 * I/O priority table 41 * ========================================================================== 42 */ 43 uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = { 44 0, /* ZIO_PRIORITY_NOW */ 45 0, /* ZIO_PRIORITY_SYNC_READ */ 46 0, /* ZIO_PRIORITY_SYNC_WRITE */ 47 6, /* ZIO_PRIORITY_ASYNC_READ */ 48 4, /* ZIO_PRIORITY_ASYNC_WRITE */ 49 4, /* ZIO_PRIORITY_FREE */ 50 0, /* ZIO_PRIORITY_CACHE_FILL */ 51 0, /* ZIO_PRIORITY_LOG_WRITE */ 52 10, /* ZIO_PRIORITY_RESILVER */ 53 20, /* ZIO_PRIORITY_SCRUB */ 54 }; 55 56 /* 57 * ========================================================================== 58 * I/O type descriptions 59 * ========================================================================== 60 */ 61 char *zio_type_name[ZIO_TYPES] = { 62 "null", "read", "write", "free", "claim", "ioctl" }; 63 64 /* Force an allocation failure when non-zero */ 65 uint16_t zio_zil_fail_shift = 0; 66 uint16_t zio_io_fail_shift = 0; 67 68 /* Enable/disable the write-retry logic */ 69 int zio_write_retry = 1; 70 71 /* Taskq to handle reissuing of I/Os */ 72 taskq_t *zio_taskq; 73 int zio_resume_threads = 4; 74 75 typedef struct zio_sync_pass { 76 int zp_defer_free; /* defer frees after this pass */ 77 int zp_dontcompress; /* don't compress after this pass */ 78 int zp_rewrite; /* rewrite new bps after this pass */ 79 } zio_sync_pass_t; 80 81 zio_sync_pass_t zio_sync_pass = { 82 1, /* zp_defer_free */ 83 4, /* zp_dontcompress */ 84 1, /* zp_rewrite */ 85 }; 86 87 static boolean_t zio_io_should_fail(uint16_t); 88 89 /* 90 * ========================================================================== 91 * I/O kmem caches 92 * ========================================================================== 93 */ 94 kmem_cache_t *zio_cache; 95 kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 96 kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 97 98 #ifdef _KERNEL 99 extern vmem_t *zio_alloc_arena; 100 #endif 101 102 /* 103 * Determine if we are allowed to issue the IO based on the 104 * pool state. If we must wait then block until we are told 105 * that we may continue. 106 */ 107 #define ZIO_ENTER(spa) { \ 108 if (spa->spa_state == POOL_STATE_IO_FAILURE) { \ 109 mutex_enter(&spa->spa_zio_lock); \ 110 while (spa->spa_state == POOL_STATE_IO_FAILURE) \ 111 cv_wait(&spa->spa_zio_cv, &spa->spa_zio_lock); \ 112 mutex_exit(&spa->spa_zio_lock); \ 113 } \ 114 } 115 116 /* 117 * An allocation zio is one that either currently has the DVA allocate 118 * stage set or will have it later in it's lifetime. 119 */ 120 #define IO_IS_ALLOCATING(zio) \ 121 ((zio)->io_orig_pipeline & (1U << ZIO_STAGE_DVA_ALLOCATE)) 122 123 void 124 zio_init(void) 125 { 126 size_t c; 127 vmem_t *data_alloc_arena = NULL; 128 129 #ifdef _KERNEL 130 data_alloc_arena = zio_alloc_arena; 131 #endif 132 133 zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0, 134 NULL, NULL, NULL, NULL, NULL, 0); 135 136 /* 137 * For small buffers, we want a cache for each multiple of 138 * SPA_MINBLOCKSIZE. For medium-size buffers, we want a cache 139 * for each quarter-power of 2. For large buffers, we want 140 * a cache for each multiple of PAGESIZE. 141 */ 142 for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 143 size_t size = (c + 1) << SPA_MINBLOCKSHIFT; 144 size_t p2 = size; 145 size_t align = 0; 146 147 while (p2 & (p2 - 1)) 148 p2 &= p2 - 1; 149 150 if (size <= 4 * SPA_MINBLOCKSIZE) { 151 align = SPA_MINBLOCKSIZE; 152 } else if (P2PHASE(size, PAGESIZE) == 0) { 153 align = PAGESIZE; 154 } else if (P2PHASE(size, p2 >> 2) == 0) { 155 align = p2 >> 2; 156 } 157 158 if (align != 0) { 159 char name[36]; 160 (void) sprintf(name, "zio_buf_%lu", (ulong_t)size); 161 zio_buf_cache[c] = kmem_cache_create(name, size, 162 align, NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG); 163 164 (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size); 165 zio_data_buf_cache[c] = kmem_cache_create(name, size, 166 align, NULL, NULL, NULL, NULL, data_alloc_arena, 167 KMC_NODEBUG); 168 169 } 170 } 171 172 while (--c != 0) { 173 ASSERT(zio_buf_cache[c] != NULL); 174 if (zio_buf_cache[c - 1] == NULL) 175 zio_buf_cache[c - 1] = zio_buf_cache[c]; 176 177 ASSERT(zio_data_buf_cache[c] != NULL); 178 if (zio_data_buf_cache[c - 1] == NULL) 179 zio_data_buf_cache[c - 1] = zio_data_buf_cache[c]; 180 } 181 182 zio_taskq = taskq_create("zio_taskq", zio_resume_threads, 183 maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE); 184 185 zio_inject_init(); 186 } 187 188 void 189 zio_fini(void) 190 { 191 size_t c; 192 kmem_cache_t *last_cache = NULL; 193 kmem_cache_t *last_data_cache = NULL; 194 195 for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 196 if (zio_buf_cache[c] != last_cache) { 197 last_cache = zio_buf_cache[c]; 198 kmem_cache_destroy(zio_buf_cache[c]); 199 } 200 zio_buf_cache[c] = NULL; 201 202 if (zio_data_buf_cache[c] != last_data_cache) { 203 last_data_cache = zio_data_buf_cache[c]; 204 kmem_cache_destroy(zio_data_buf_cache[c]); 205 } 206 zio_data_buf_cache[c] = NULL; 207 } 208 209 taskq_destroy(zio_taskq); 210 211 kmem_cache_destroy(zio_cache); 212 213 zio_inject_fini(); 214 } 215 216 /* 217 * ========================================================================== 218 * Allocate and free I/O buffers 219 * ========================================================================== 220 */ 221 222 /* 223 * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a 224 * crashdump if the kernel panics, so use it judiciously. Obviously, it's 225 * useful to inspect ZFS metadata, but if possible, we should avoid keeping 226 * excess / transient data in-core during a crashdump. 227 */ 228 void * 229 zio_buf_alloc(size_t size) 230 { 231 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 232 233 ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 234 235 return (kmem_cache_alloc(zio_buf_cache[c], KM_SLEEP)); 236 } 237 238 /* 239 * Use zio_data_buf_alloc to allocate data. The data will not appear in a 240 * crashdump if the kernel panics. This exists so that we will limit the amount 241 * of ZFS data that shows up in a kernel crashdump. (Thus reducing the amount 242 * of kernel heap dumped to disk when the kernel panics) 243 */ 244 void * 245 zio_data_buf_alloc(size_t size) 246 { 247 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 248 249 ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 250 251 return (kmem_cache_alloc(zio_data_buf_cache[c], KM_SLEEP)); 252 } 253 254 void 255 zio_buf_free(void *buf, size_t size) 256 { 257 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 258 259 ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 260 261 kmem_cache_free(zio_buf_cache[c], buf); 262 } 263 264 void 265 zio_data_buf_free(void *buf, size_t size) 266 { 267 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 268 269 ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 270 271 kmem_cache_free(zio_data_buf_cache[c], buf); 272 } 273 274 /* 275 * ========================================================================== 276 * Push and pop I/O transform buffers 277 * ========================================================================== 278 */ 279 static void 280 zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize) 281 { 282 zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); 283 284 zt->zt_data = data; 285 zt->zt_size = size; 286 zt->zt_bufsize = bufsize; 287 288 zt->zt_next = zio->io_transform_stack; 289 zio->io_transform_stack = zt; 290 291 zio->io_data = data; 292 zio->io_size = size; 293 } 294 295 static void 296 zio_pop_transform(zio_t *zio, void **data, uint64_t *size, uint64_t *bufsize) 297 { 298 zio_transform_t *zt = zio->io_transform_stack; 299 300 *data = zt->zt_data; 301 *size = zt->zt_size; 302 *bufsize = zt->zt_bufsize; 303 304 zio->io_transform_stack = zt->zt_next; 305 kmem_free(zt, sizeof (zio_transform_t)); 306 307 if ((zt = zio->io_transform_stack) != NULL) { 308 zio->io_data = zt->zt_data; 309 zio->io_size = zt->zt_size; 310 } 311 } 312 313 static void 314 zio_clear_transform_stack(zio_t *zio) 315 { 316 void *data; 317 uint64_t size, bufsize; 318 319 ASSERT(zio->io_transform_stack != NULL); 320 321 zio_pop_transform(zio, &data, &size, &bufsize); 322 while (zio->io_transform_stack != NULL) { 323 zio_buf_free(data, bufsize); 324 zio_pop_transform(zio, &data, &size, &bufsize); 325 } 326 } 327 328 /* 329 * ========================================================================== 330 * Create the various types of I/O (read, write, free) 331 * ========================================================================== 332 */ 333 static zio_t * 334 zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 335 void *data, uint64_t size, zio_done_func_t *done, void *private, 336 zio_type_t type, int priority, int flags, uint8_t stage, uint32_t pipeline) 337 { 338 zio_t *zio; 339 340 ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); 341 ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); 342 343 zio = kmem_cache_alloc(zio_cache, KM_SLEEP); 344 bzero(zio, sizeof (zio_t)); 345 zio->io_parent = pio; 346 zio->io_spa = spa; 347 zio->io_txg = txg; 348 zio->io_flags = flags; 349 if (bp != NULL) { 350 zio->io_bp = bp; 351 zio->io_bp_copy = *bp; 352 zio->io_bp_orig = *bp; 353 } 354 zio->io_done = done; 355 zio->io_private = private; 356 zio->io_type = type; 357 zio->io_priority = priority; 358 zio->io_stage = stage; 359 zio->io_pipeline = pipeline; 360 zio->io_timestamp = lbolt64; 361 mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); 362 cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); 363 zio_push_transform(zio, data, size, size); 364 365 /* 366 * Note on config lock: 367 * 368 * If CONFIG_HELD is set, then the caller already has the config 369 * lock, so we don't need it for this io. 370 * 371 * We set CONFIG_GRABBED to indicate that we have grabbed the 372 * config lock on behalf of this io, so it should be released 373 * in zio_done. 374 * 375 * Unless CONFIG_HELD is set, we will grab the config lock for 376 * any top-level (parent-less) io, *except* NULL top-level ios. 377 * The NULL top-level ios rarely have any children, so we delay 378 * grabbing the lock until the first child is added (but it is 379 * still grabbed on behalf of the top-level i/o, so additional 380 * children don't need to also grab it). This greatly reduces 381 * contention on the config lock. 382 */ 383 if (pio == NULL) { 384 if (type != ZIO_TYPE_NULL && 385 !(flags & ZIO_FLAG_CONFIG_HELD)) { 386 spa_config_enter(spa, RW_READER, zio); 387 zio->io_flags |= ZIO_FLAG_CONFIG_GRABBED; 388 } 389 zio->io_root = zio; 390 } else { 391 zio->io_root = pio->io_root; 392 if (!(flags & ZIO_FLAG_NOBOOKMARK)) 393 zio->io_logical = pio->io_logical; 394 mutex_enter(&pio->io_lock); 395 if (pio->io_parent == NULL && 396 pio->io_type == ZIO_TYPE_NULL && 397 !(pio->io_flags & ZIO_FLAG_CONFIG_GRABBED) && 398 !(pio->io_flags & ZIO_FLAG_CONFIG_HELD)) { 399 pio->io_flags |= ZIO_FLAG_CONFIG_GRABBED; 400 spa_config_enter(spa, RW_READER, pio); 401 } 402 if (stage < ZIO_STAGE_READY) 403 pio->io_children_notready++; 404 pio->io_children_notdone++; 405 zio->io_sibling_next = pio->io_child; 406 zio->io_sibling_prev = NULL; 407 if (pio->io_child != NULL) 408 pio->io_child->io_sibling_prev = zio; 409 pio->io_child = zio; 410 zio->io_ndvas = pio->io_ndvas; 411 mutex_exit(&pio->io_lock); 412 } 413 414 /* 415 * Save off the original state incase we need to retry later. 416 */ 417 zio->io_orig_stage = zio->io_stage; 418 zio->io_orig_pipeline = zio->io_pipeline; 419 zio->io_orig_flags = zio->io_flags; 420 421 return (zio); 422 } 423 424 static void 425 zio_reset(zio_t *zio) 426 { 427 zio_clear_transform_stack(zio); 428 429 zio->io_flags = zio->io_orig_flags; 430 zio->io_stage = zio->io_orig_stage; 431 zio->io_pipeline = zio->io_orig_pipeline; 432 zio_push_transform(zio, zio->io_data, zio->io_size, zio->io_size); 433 } 434 435 zio_t * 436 zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private, 437 int flags) 438 { 439 zio_t *zio; 440 441 zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, 442 ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, ZIO_STAGE_OPEN, 443 ZIO_WAIT_FOR_CHILDREN_PIPELINE); 444 445 return (zio); 446 } 447 448 zio_t * 449 zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags) 450 { 451 return (zio_null(NULL, spa, done, private, flags)); 452 } 453 454 zio_t * 455 zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data, 456 uint64_t size, zio_done_func_t *done, void *private, 457 int priority, int flags, zbookmark_t *zb) 458 { 459 zio_t *zio; 460 461 ASSERT3U(size, ==, BP_GET_LSIZE(bp)); 462 463 /* 464 * If the user has specified that we allow I/Os to continue 465 * then attempt to satisfy the read. 466 */ 467 if (spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE) 468 ZIO_ENTER(spa); 469 470 zio = zio_create(pio, spa, bp->blk_birth, bp, data, size, done, private, 471 ZIO_TYPE_READ, priority, flags | ZIO_FLAG_USER, 472 ZIO_STAGE_OPEN, ZIO_READ_PIPELINE); 473 zio->io_bookmark = *zb; 474 475 zio->io_logical = zio; 476 477 /* 478 * Work off our copy of the bp so the caller can free it. 479 */ 480 zio->io_bp = &zio->io_bp_copy; 481 482 return (zio); 483 } 484 485 zio_t * 486 zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies, 487 uint64_t txg, blkptr_t *bp, void *data, uint64_t size, 488 zio_done_func_t *ready, zio_done_func_t *done, void *private, int priority, 489 int flags, zbookmark_t *zb) 490 { 491 zio_t *zio; 492 493 ASSERT(checksum >= ZIO_CHECKSUM_OFF && 494 checksum < ZIO_CHECKSUM_FUNCTIONS); 495 496 ASSERT(compress >= ZIO_COMPRESS_OFF && 497 compress < ZIO_COMPRESS_FUNCTIONS); 498 499 ZIO_ENTER(spa); 500 501 zio = zio_create(pio, spa, txg, bp, data, size, done, private, 502 ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER, 503 ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE); 504 505 zio->io_ready = ready; 506 507 zio->io_bookmark = *zb; 508 509 zio->io_logical = zio; 510 511 zio->io_checksum = checksum; 512 zio->io_compress = compress; 513 zio->io_ndvas = ncopies; 514 515 if (bp->blk_birth != txg) { 516 /* XXX the bp usually (always?) gets re-zeroed later */ 517 BP_ZERO(bp); 518 BP_SET_LSIZE(bp, size); 519 BP_SET_PSIZE(bp, size); 520 } else { 521 /* Make sure someone doesn't change their mind on overwrites */ 522 ASSERT(MIN(zio->io_ndvas + BP_IS_GANG(bp), 523 spa_max_replication(spa)) == BP_GET_NDVAS(bp)); 524 } 525 526 return (zio); 527 } 528 529 zio_t * 530 zio_rewrite(zio_t *pio, spa_t *spa, int checksum, 531 uint64_t txg, blkptr_t *bp, void *data, uint64_t size, 532 zio_done_func_t *done, void *private, int priority, int flags, 533 zbookmark_t *zb) 534 { 535 zio_t *zio; 536 537 zio = zio_create(pio, spa, txg, bp, data, size, done, private, 538 ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER, 539 ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE(bp)); 540 541 zio->io_bookmark = *zb; 542 zio->io_checksum = checksum; 543 zio->io_compress = ZIO_COMPRESS_OFF; 544 545 if (pio != NULL) 546 ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp)); 547 548 return (zio); 549 } 550 551 static void 552 zio_write_allocate_ready(zio_t *zio) 553 { 554 /* Free up the previous block */ 555 if (!BP_IS_HOLE(&zio->io_bp_orig)) { 556 zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg, 557 &zio->io_bp_orig, NULL, NULL)); 558 } 559 } 560 561 static zio_t * 562 zio_write_allocate(zio_t *pio, spa_t *spa, int checksum, 563 uint64_t txg, blkptr_t *bp, void *data, uint64_t size, 564 zio_done_func_t *done, void *private, int priority, int flags) 565 { 566 zio_t *zio; 567 568 BP_ZERO(bp); 569 BP_SET_LSIZE(bp, size); 570 BP_SET_PSIZE(bp, size); 571 BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); 572 573 zio = zio_create(pio, spa, txg, bp, data, size, done, private, 574 ZIO_TYPE_WRITE, priority, flags, 575 ZIO_STAGE_OPEN, ZIO_WRITE_ALLOCATE_PIPELINE); 576 577 zio->io_checksum = checksum; 578 zio->io_compress = ZIO_COMPRESS_OFF; 579 zio->io_ready = zio_write_allocate_ready; 580 581 return (zio); 582 } 583 584 zio_t * 585 zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 586 zio_done_func_t *done, void *private) 587 { 588 zio_t *zio; 589 590 ASSERT(!BP_IS_HOLE(bp)); 591 592 if (txg == spa->spa_syncing_txg && 593 spa->spa_sync_pass > zio_sync_pass.zp_defer_free) { 594 bplist_enqueue_deferred(&spa->spa_sync_bplist, bp); 595 return (zio_null(pio, spa, NULL, NULL, 0)); 596 } 597 598 zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private, 599 ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, ZIO_FLAG_USER, 600 ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE(bp)); 601 602 zio->io_bp = &zio->io_bp_copy; 603 604 return (zio); 605 } 606 607 zio_t * 608 zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 609 zio_done_func_t *done, void *private) 610 { 611 zio_t *zio; 612 613 /* 614 * A claim is an allocation of a specific block. Claims are needed 615 * to support immediate writes in the intent log. The issue is that 616 * immediate writes contain committed data, but in a txg that was 617 * *not* committed. Upon opening the pool after an unclean shutdown, 618 * the intent log claims all blocks that contain immediate write data 619 * so that the SPA knows they're in use. 620 * 621 * All claims *must* be resolved in the first txg -- before the SPA 622 * starts allocating blocks -- so that nothing is allocated twice. 623 */ 624 ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa)); 625 ASSERT3U(spa_first_txg(spa), <=, txg); 626 627 zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private, 628 ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, 0, 629 ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE(bp)); 630 631 zio->io_bp = &zio->io_bp_copy; 632 633 return (zio); 634 } 635 636 zio_t * 637 zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, 638 zio_done_func_t *done, void *private, int priority, int flags) 639 { 640 zio_t *zio; 641 int c; 642 643 if (vd->vdev_children == 0) { 644 zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, 645 ZIO_TYPE_IOCTL, priority, flags, 646 ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); 647 648 zio->io_vd = vd; 649 zio->io_cmd = cmd; 650 } else { 651 zio = zio_null(pio, spa, NULL, NULL, flags); 652 653 for (c = 0; c < vd->vdev_children; c++) 654 zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, 655 done, private, priority, flags)); 656 } 657 658 return (zio); 659 } 660 661 static void 662 zio_phys_bp_init(vdev_t *vd, blkptr_t *bp, uint64_t offset, uint64_t size, 663 int checksum, boolean_t labels) 664 { 665 ASSERT(vd->vdev_children == 0); 666 667 ASSERT(size <= SPA_MAXBLOCKSIZE); 668 ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); 669 ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); 670 671 #ifdef ZFS_DEBUG 672 if (labels) { 673 ASSERT(offset + size <= VDEV_LABEL_START_SIZE || 674 offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); 675 } 676 #endif 677 ASSERT3U(offset + size, <=, vd->vdev_psize); 678 679 BP_ZERO(bp); 680 681 BP_SET_LSIZE(bp, size); 682 BP_SET_PSIZE(bp, size); 683 684 BP_SET_CHECKSUM(bp, checksum); 685 BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); 686 BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 687 688 if (checksum != ZIO_CHECKSUM_OFF) 689 ZIO_SET_CHECKSUM(&bp->blk_cksum, offset, 0, 0, 0); 690 } 691 692 zio_t * 693 zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 694 void *data, int checksum, zio_done_func_t *done, void *private, 695 int priority, int flags, boolean_t labels) 696 { 697 zio_t *zio; 698 blkptr_t blk; 699 700 ZIO_ENTER(vd->vdev_spa); 701 702 zio_phys_bp_init(vd, &blk, offset, size, checksum, labels); 703 704 zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private, 705 ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, 706 ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); 707 708 zio->io_vd = vd; 709 zio->io_offset = offset; 710 711 /* 712 * Work off our copy of the bp so the caller can free it. 713 */ 714 zio->io_bp = &zio->io_bp_copy; 715 716 return (zio); 717 } 718 719 zio_t * 720 zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 721 void *data, int checksum, zio_done_func_t *done, void *private, 722 int priority, int flags, boolean_t labels) 723 { 724 zio_block_tail_t *zbt; 725 void *wbuf; 726 zio_t *zio; 727 blkptr_t blk; 728 729 ZIO_ENTER(vd->vdev_spa); 730 731 zio_phys_bp_init(vd, &blk, offset, size, checksum, labels); 732 733 zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private, 734 ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, 735 ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); 736 737 zio->io_vd = vd; 738 zio->io_offset = offset; 739 740 zio->io_bp = &zio->io_bp_copy; 741 zio->io_checksum = checksum; 742 743 if (zio_checksum_table[checksum].ci_zbt) { 744 /* 745 * zbt checksums are necessarily destructive -- they modify 746 * one word of the write buffer to hold the verifier/checksum. 747 * Therefore, we must make a local copy in case the data is 748 * being written to multiple places. 749 */ 750 wbuf = zio_buf_alloc(size); 751 bcopy(data, wbuf, size); 752 zio_push_transform(zio, wbuf, size, size); 753 754 zbt = (zio_block_tail_t *)((char *)wbuf + size) - 1; 755 zbt->zbt_cksum = blk.blk_cksum; 756 } 757 758 return (zio); 759 } 760 761 /* 762 * Create a child I/O to do some work for us. It has no associated bp. 763 */ 764 zio_t * 765 zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset, 766 void *data, uint64_t size, int type, int priority, int flags, 767 zio_done_func_t *done, void *private) 768 { 769 uint32_t pipeline = ZIO_VDEV_CHILD_PIPELINE; 770 zio_t *cio; 771 772 if (type == ZIO_TYPE_READ && bp != NULL) { 773 /* 774 * If we have the bp, then the child should perform the 775 * checksum and the parent need not. This pushes error 776 * detection as close to the leaves as possible and 777 * eliminates redundant checksums in the interior nodes. 778 */ 779 pipeline |= 1U << ZIO_STAGE_CHECKSUM_VERIFY; 780 zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY); 781 } 782 783 cio = zio_create(zio, zio->io_spa, zio->io_txg, bp, data, size, 784 done, private, type, priority, 785 (zio->io_flags & ZIO_FLAG_VDEV_INHERIT) | ZIO_FLAG_CANFAIL | flags, 786 ZIO_STAGE_VDEV_IO_START - 1, pipeline); 787 788 cio->io_vd = vd; 789 cio->io_offset = offset; 790 791 return (cio); 792 } 793 794 /* 795 * ========================================================================== 796 * Initiate I/O, either sync or async 797 * ========================================================================== 798 */ 799 int 800 zio_wait(zio_t *zio) 801 { 802 int error; 803 804 ASSERT(zio->io_stage == ZIO_STAGE_OPEN); 805 806 zio->io_waiter = curthread; 807 808 zio_execute(zio); 809 810 mutex_enter(&zio->io_lock); 811 while (zio->io_stalled != ZIO_STAGE_DONE) 812 cv_wait(&zio->io_cv, &zio->io_lock); 813 mutex_exit(&zio->io_lock); 814 815 error = zio->io_error; 816 mutex_destroy(&zio->io_lock); 817 cv_destroy(&zio->io_cv); 818 kmem_cache_free(zio_cache, zio); 819 820 return (error); 821 } 822 823 void 824 zio_nowait(zio_t *zio) 825 { 826 zio_execute(zio); 827 } 828 829 void 830 zio_interrupt(zio_t *zio) 831 { 832 (void) taskq_dispatch(zio->io_spa->spa_zio_intr_taskq[zio->io_type], 833 (task_func_t *)zio_execute, zio, TQ_SLEEP); 834 } 835 836 static int 837 zio_issue_async(zio_t *zio) 838 { 839 (void) taskq_dispatch(zio->io_spa->spa_zio_issue_taskq[zio->io_type], 840 (task_func_t *)zio_execute, zio, TQ_SLEEP); 841 842 return (ZIO_PIPELINE_STOP); 843 } 844 845 /* 846 * ========================================================================== 847 * I/O pipeline interlocks: parent/child dependency scoreboarding 848 * ========================================================================== 849 */ 850 static int 851 zio_wait_for_children(zio_t *zio, uint32_t stage, uint64_t *countp) 852 { 853 int rv = ZIO_PIPELINE_CONTINUE; 854 855 mutex_enter(&zio->io_lock); 856 ASSERT(zio->io_stalled == 0); 857 if (*countp != 0) { 858 zio->io_stalled = stage; 859 rv = ZIO_PIPELINE_STOP; 860 } 861 mutex_exit(&zio->io_lock); 862 863 return (rv); 864 } 865 866 static void 867 zio_notify_parent(zio_t *zio, uint32_t stage, uint64_t *countp) 868 { 869 zio_t *pio = zio->io_parent; 870 871 mutex_enter(&pio->io_lock); 872 if (pio->io_error == 0 && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) 873 pio->io_error = zio->io_error; 874 ASSERT3U(*countp, >, 0); 875 if (--*countp == 0 && pio->io_stalled == stage) { 876 pio->io_stalled = 0; 877 mutex_exit(&pio->io_lock); 878 zio_execute(pio); 879 } else { 880 mutex_exit(&pio->io_lock); 881 } 882 } 883 884 int 885 zio_wait_for_children_ready(zio_t *zio) 886 { 887 return (zio_wait_for_children(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_READY, 888 &zio->io_children_notready)); 889 } 890 891 int 892 zio_wait_for_children_done(zio_t *zio) 893 { 894 return (zio_wait_for_children(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_DONE, 895 &zio->io_children_notdone)); 896 } 897 898 static int 899 zio_read_init(zio_t *zio) 900 { 901 blkptr_t *bp = zio->io_bp; 902 903 if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) { 904 uint64_t csize = BP_GET_PSIZE(bp); 905 void *cbuf = zio_buf_alloc(csize); 906 907 zio_push_transform(zio, cbuf, csize, csize); 908 zio->io_pipeline |= 1U << ZIO_STAGE_READ_DECOMPRESS; 909 } 910 911 if (BP_IS_GANG(bp)) { 912 uint64_t gsize = SPA_GANGBLOCKSIZE; 913 void *gbuf = zio_buf_alloc(gsize); 914 915 zio_push_transform(zio, gbuf, gsize, gsize); 916 zio->io_pipeline |= 1U << ZIO_STAGE_READ_GANG_MEMBERS; 917 } 918 919 if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0) 920 zio->io_flags |= ZIO_FLAG_DONT_CACHE; 921 922 return (ZIO_PIPELINE_CONTINUE); 923 } 924 925 static int 926 zio_ready(zio_t *zio) 927 { 928 zio_t *pio = zio->io_parent; 929 930 if (zio->io_ready) 931 zio->io_ready(zio); 932 933 if (pio != NULL) 934 zio_notify_parent(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_READY, 935 &pio->io_children_notready); 936 937 if (zio->io_bp) 938 zio->io_bp_copy = *zio->io_bp; 939 940 return (ZIO_PIPELINE_CONTINUE); 941 } 942 943 static int 944 zio_vdev_retry_io(zio_t *zio) 945 { 946 zio_t *pio = zio->io_parent; 947 948 /* 949 * Preserve the failed bp so that the io_ready() callback can 950 * update the accounting accordingly. The callback will also be 951 * responsible for freeing the previously allocated block, if one 952 * exists. 953 */ 954 zio->io_bp_orig = *zio->io_bp; 955 956 /* 957 * We must zero out the old DVA and blk_birth before reallocating 958 * the bp. 959 */ 960 BP_ZERO_DVAS(zio->io_bp); 961 zio_reset(zio); 962 963 if (pio) { 964 /* 965 * Let the parent know that we will 966 * re-alloc the write (=> new bp info). 967 */ 968 mutex_enter(&pio->io_lock); 969 pio->io_children_notready++; 970 971 /* 972 * If the parent I/O is still in the open stage, then 973 * don't bother telling it to retry since it hasn't 974 * progressed far enough for it to care. 975 */ 976 if (pio->io_stage > ZIO_STAGE_OPEN && IO_IS_ALLOCATING(pio)) 977 pio->io_flags |= ZIO_FLAG_WRITE_RETRY; 978 979 ASSERT(pio->io_stage <= ZIO_STAGE_WAIT_FOR_CHILDREN_DONE); 980 mutex_exit(&pio->io_lock); 981 } 982 983 /* 984 * We are getting ready to process the retry request so clear 985 * the flag and the zio's current error status. 986 */ 987 zio->io_flags &= ~ZIO_FLAG_WRITE_RETRY; 988 zio->io_error = 0; 989 990 return (ZIO_PIPELINE_CONTINUE); 991 } 992 993 int 994 zio_vdev_resume_io(spa_t *spa) 995 { 996 zio_t *zio; 997 998 mutex_enter(&spa->spa_zio_lock); 999 1000 /* 1001 * Probe all of vdevs that have experienced an I/O error. 1002 * If we are still unable to verify the integrity of the vdev 1003 * then we prevent the resume from proceeeding. 1004 */ 1005 for (zio = list_head(&spa->spa_zio_list); zio != NULL; 1006 zio = list_next(&spa->spa_zio_list, zio)) { 1007 int error = 0; 1008