1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "@(#)dbuf.c 1.31 07/12/12 SMI" 27 28 #include <sys/zfs_context.h> 29 #include <sys/dmu.h> 30 #include <sys/dmu_impl.h> 31 #include <sys/dbuf.h> 32 #include <sys/dmu_objset.h> 33 #include <sys/dsl_dataset.h> 34 #include <sys/dsl_dir.h> 35 #include <sys/dmu_tx.h> 36 #include <sys/spa.h> 37 #include <sys/zio.h> 38 #include <sys/dmu_zfetch.h> 39 40 static void dbuf_destroy(dmu_buf_impl_t *db); 41 static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); 42 static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum, 43 int compress, dmu_tx_t *tx); 44 static arc_done_func_t dbuf_write_ready; 45 static arc_done_func_t dbuf_write_done; 46 47 int zfs_mdcomp_disable = 0; 48 49 /* 50 * Global data structures and functions for the dbuf cache. 51 */ 52 static kmem_cache_t *dbuf_cache; 53 54 /* ARGSUSED */ 55 static int 56 dbuf_cons(void *vdb, void *unused, int kmflag) 57 { 58 dmu_buf_impl_t *db = vdb; 59 bzero(db, sizeof (dmu_buf_impl_t)); 60 61 mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); 62 cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); 63 refcount_create(&db->db_holds); 64 return (0); 65 } 66 67 /* ARGSUSED */ 68 static void 69 dbuf_dest(void *vdb, void *unused) 70 { 71 dmu_buf_impl_t *db = vdb; 72 mutex_destroy(&db->db_mtx); 73 cv_destroy(&db->db_changed); 74 refcount_destroy(&db->db_holds); 75 } 76 77 /* 78 * dbuf hash table routines 79 */ 80 static dbuf_hash_table_t dbuf_hash_table; 81 82 static uint64_t dbuf_hash_count; 83 84 static uint64_t 85 dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid) 86 { 87 uintptr_t osv = (uintptr_t)os; 88 uint64_t crc = -1ULL; 89 90 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 91 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF]; 92 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF]; 93 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF]; 94 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF]; 95 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF]; 96 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF]; 97 98 crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16); 99 100 return (crc); 101 } 102 103 #define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid); 104 105 #define DBUF_EQUAL(dbuf, os, obj, level, blkid) \ 106 ((dbuf)->db.db_object == (obj) && \ 107 (dbuf)->db_objset == (os) && \ 108 (dbuf)->db_level == (level) && \ 109 (dbuf)->db_blkid == (blkid)) 110 111 dmu_buf_impl_t * 112 dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid) 113 { 114 dbuf_hash_table_t *h = &dbuf_hash_table; 115 objset_impl_t *os = dn->dn_objset; 116 uint64_t obj = dn->dn_object; 117 uint64_t hv = DBUF_HASH(os, obj, level, blkid); 118 uint64_t idx = hv & h->hash_table_mask; 119 dmu_buf_impl_t *db; 120 121 mutex_enter(DBUF_HASH_MUTEX(h, idx)); 122 for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) { 123 if (DBUF_EQUAL(db, os, obj, level, blkid)) { 124 mutex_enter(&db->db_mtx); 125 if (db->db_state != DB_EVICTING) { 126 mutex_exit(DBUF_HASH_MUTEX(h, idx)); 127 return (db); 128 } 129 mutex_exit(&db->db_mtx); 130 } 131 } 132 mutex_exit(DBUF_HASH_MUTEX(h, idx)); 133 return (NULL); 134 } 135 136 /* 137 * Insert an entry into the hash table. If there is already an element 138 * equal to elem in the hash table, then the already existing element 139 * will be returned and the new element will not be inserted. 140 * Otherwise returns NULL. 141 */ 142 static dmu_buf_impl_t * 143 dbuf_hash_insert(dmu_buf_impl_t *db) 144 { 145 dbuf_hash_table_t *h = &dbuf_hash_table; 146 objset_impl_t *os = db->db_objset; 147 uint64_t obj = db->db.db_object; 148 int level = db->db_level; 149 uint64_t blkid = db->db_blkid; 150 uint64_t hv = DBUF_HASH(os, obj, level, blkid); 151 uint64_t idx = hv & h->hash_table_mask; 152 dmu_buf_impl_t *dbf; 153 154 mutex_enter(DBUF_HASH_MUTEX(h, idx)); 155 for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) { 156 if (DBUF_EQUAL(dbf, os, obj, level, blkid)) { 157 mutex_enter(&dbf->db_mtx); 158 if (dbf->db_state != DB_EVICTING) { 159 mutex_exit(DBUF_HASH_MUTEX(h, idx)); 160 return (dbf); 161 } 162 mutex_exit(&dbf->db_mtx); 163 } 164 } 165 166 mutex_enter(&db->db_mtx); 167 db->db_hash_next = h->hash_table[idx]; 168 h->hash_table[idx] = db; 169 mutex_exit(DBUF_HASH_MUTEX(h, idx)); 170 atomic_add_64(&dbuf_hash_count, 1); 171 172 return (NULL); 173 } 174 175 /* 176 * Remove an entry from the hash table. This operation will 177 * fail if there are any existing holds on the db. 178 */ 179 static void 180 dbuf_hash_remove(dmu_buf_impl_t *db) 181 { 182 dbuf_hash_table_t *h = &dbuf_hash_table; 183 uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object, 184 db->db_level, db->db_blkid); 185 uint64_t idx = hv & h->hash_table_mask; 186 dmu_buf_impl_t *dbf, **dbp; 187 188 /* 189 * We musn't hold db_mtx to maintin lock ordering: 190 * DBUF_HASH_MUTEX > db_mtx. 191 */ 192 ASSERT(refcount_is_zero(&db->db_holds)); 193 ASSERT(db->db_state == DB_EVICTING); 194 ASSERT(!MUTEX_HELD(&db->db_mtx)); 195 196 mutex_enter(DBUF_HASH_MUTEX(h, idx)); 197 dbp = &h->hash_table[idx]; 198 while ((dbf = *dbp) != db) { 199 dbp = &dbf->db_hash_next; 200 ASSERT(dbf != NULL); 201 } 202 *dbp = db->db_hash_next; 203 db->db_hash_next = NULL; 204 mutex_exit(DBUF_HASH_MUTEX(h, idx)); 205 atomic_add_64(&dbuf_hash_count, -1); 206 } 207 208 static arc_evict_func_t dbuf_do_evict; 209 210 static void 211 dbuf_evict_user(dmu_buf_impl_t *db) 212 { 213 ASSERT(MUTEX_HELD(&db->db_mtx)); 214 215 if (db->db_level != 0 || db->db_evict_func == NULL) 216 return; 217 218 if (db->db_user_data_ptr_ptr) 219 *db->db_user_data_ptr_ptr = db->db.db_data; 220 db->db_evict_func(&db->db, db->db_user_ptr); 221 db->db_user_ptr = NULL; 222 db->db_user_data_ptr_ptr = NULL; 223 db->db_evict_func = NULL; 224 } 225 226 void 227 dbuf_evict(dmu_buf_impl_t *db) 228 { 229 ASSERT(MUTEX_HELD(&db->db_mtx)); 230 ASSERT(db->db_buf == NULL); 231 ASSERT(db->db_data_pending == NULL); 232 233 dbuf_clear(db); 234 dbuf_destroy(db); 235 } 236 237 void 238 dbuf_init(void) 239 { 240 uint64_t hsize = 1ULL << 16; 241 dbuf_hash_table_t *h = &dbuf_hash_table; 242 int i; 243 244 /* 245 * The hash table is big enough to fill all of physical memory 246 * with an average 4K block size. The table will take up 247 * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers). 248 */ 249 while (hsize * 4096 < physmem * PAGESIZE) 250 hsize <<= 1; 251 252 retry: 253 h->hash_table_mask = hsize - 1; 254 h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP); 255 if (h->hash_table == NULL) { 256 /* XXX - we should really return an error instead of assert */ 257 ASSERT(hsize > (1ULL << 10)); 258 hsize >>= 1; 259 goto retry; 260 } 261 262 dbuf_cache = kmem_cache_create("dmu_buf_impl_t", 263 sizeof (dmu_buf_impl_t), 264 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); 265 266 for (i = 0; i < DBUF_MUTEXES; i++) 267 mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); 268 } 269 270 void 271 dbuf_fini(void) 272 { 273 dbuf_hash_table_t *h = &dbuf_hash_table; 274 int i; 275 276 for (i = 0; i < DBUF_MUTEXES; i++) 277 mutex_destroy(&h->hash_mutexes[i]); 278 kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); 279 kmem_cache_destroy(dbuf_cache); 280 } 281 282 /* 283 * Other stuff. 284 */ 285 286 #ifdef ZFS_DEBUG 287 static void 288 dbuf_verify(dmu_buf_impl_t *db) 289 { 290 dnode_t *dn = db->db_dnode; 291 292 ASSERT(MUTEX_HELD(&db->db_mtx)); 293 294 if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY)) 295 return; 296 297 ASSERT(db->db_objset != NULL); 298 if (dn == NULL) { 299 ASSERT(db->db_parent == NULL); 300 ASSERT(db->db_blkptr == NULL); 301 } else { 302 ASSERT3U(db->db.db_object, ==, dn->dn_object); 303 ASSERT3P(db->db_objset, ==, dn->dn_objset); 304 ASSERT3U(db->db_level, <, dn->dn_nlevels); 305 ASSERT(db->db_blkid == DB_BONUS_BLKID || 306 list_head(&dn->dn_dbufs)); 307 } 308 if (db->db_blkid == DB_BONUS_BLKID) { 309 ASSERT(dn != NULL); 310 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); 311 ASSERT3U(db->db.db_offset, ==, DB_BONUS_BLKID); 312 } else { 313 ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); 314 } 315 316 if (db->db_level == 0) { 317 /* we can be momentarily larger in dnode_set_blksz() */ 318 if (db->db_blkid != DB_BONUS_BLKID && dn) { 319 ASSERT3U(db->db.db_size, >=, dn->dn_datablksz); 320 } 321 if (db->db.db_object == DMU_META_DNODE_OBJECT) { 322 dbuf_dirty_record_t *dr = db->db_data_pending; 323 /* 324 * it should only be modified in syncing 325 * context, so make sure we only have 326 * one copy of the data. 327 */ 328 ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf); 329 } 330 } 331 332 /* verify db->db_blkptr */ 333 if (db->db_blkptr) { 334 if (db->db_parent == dn->dn_dbuf) { 335 /* db is pointed to by the dnode */ 336 /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */ 337 if (db->db.db_object == DMU_META_DNODE_OBJECT) 338 ASSERT(db->db_parent == NULL); 339 else 340 ASSERT(db->db_parent != NULL); 341 ASSERT3P(db->db_blkptr, ==, 342 &dn->dn_phys->dn_blkptr[db->db_blkid]); 343 } else { 344 /* db is pointed to by an indirect block */ 345 int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT; 346 ASSERT3U(db->db_parent->db_level, ==, db->db_level+1); 347 ASSERT3U(db->db_parent->db.db_object, ==, 348 db->db.db_object); 349 /* 350 * dnode_grow_indblksz() can make this fail if we don't 351 * have the struct_rwlock. XXX indblksz no longer 352 * grows. safe to do this now? 353 */ 354 if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) { 355 ASSERT3P(db->db_blkptr, ==, 356 ((blkptr_t *)db->db_parent->db.db_data + 357 db->db_blkid % epb)); 358 } 359 } 360 } 361 if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && 362 db->db.db_data && db->db_blkid != DB_BONUS_BLKID && 363 db->db_state != DB_FILL && !dn->dn_free_txg) { 364 /* 365 * If the blkptr isn't set but they have nonzero data, 366 * it had better be dirty, otherwise we'll lose that 367 * data when we evict this buffer. 368 */ 369 if (db->db_dirtycnt == 0) { 370 uint64_t *buf = db->db.db_data; 371 int i; 372 373 for (i = 0; i < db->db.db_size >> 3; i++) { 374 ASSERT(buf[i] == 0); 375 } 376 } 377 } 378 } 379 #endif 380 381 static void 382 dbuf_update_data(dmu_buf_impl_t *db) 383 { 384 ASSERT(MUTEX_HELD(&db->db_mtx)); 385 if (db->db_level == 0 && db->db_user_data_ptr_ptr) { 386 ASSERT(!refcount_is_zero(&db->db_holds)); 387 *db->db_user_data_ptr_ptr = db->db.db_data; 388 } 389 } 390 391 static void 392 dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) 393 { 394 ASSERT(MUTEX_HELD(&db->db_mtx)); 395 ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf)); 396 db->db_buf = buf; 397 if (buf != NULL) { 398 ASSERT(buf->b_data != NULL); 399 db->db.db_data = buf->b_data; 400 if (!arc_released(buf)) 401 arc_set_callback(buf, dbuf_do_evict, db); 402 dbuf_update_data(db); 403 } else { 404 dbuf_evict_user(db); 405 db->db.db_data = NULL; 406 db->db_state = DB_UNCACHED; 407 } 408 } 409 410 uint64_t 411 dbuf_whichblock(dnode_t *dn, uint64_t offset) 412 { 413 if (dn->dn_datablkshift) { 414 return (offset >> dn->dn_datablkshift); 415 } else { 416 ASSERT3U(offset, <, dn->dn_datablksz); 417 return (0); 418 } 419 } 420 421 static void 422 dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) 423 { 424 dmu_buf_impl_t *db = vdb; 425 426 mutex_enter(&db->db_mtx); 427 ASSERT3U(db->db_state, ==, DB_READ); 428 /* 429 * All reads are synchronous, so we must have a hold on the dbuf 430 */ 431 ASSERT(refcount_count(&db->db_holds) > 0); 432 ASSERT(db->db_buf == NULL); 433 ASSERT(db->db.db_data == NULL); 434 if (db->db_level == 0 && db->db_freed_in_flight) { 435 /* we were freed in flight; disregard any error */ 436 arc_release(buf, db); 437 bzero(buf->b_data, db->db.db_size); 438 arc_buf_freeze(buf); 439 db->db_freed_in_flight = FALSE; 440 dbuf_set_data(db, buf); 441 db->db_state = DB_CACHED; 442 } else if (zio == NULL || zio->io_error == 0) { 443 dbuf_set_data(db, buf); 444 db->db_state = DB_CACHED; 445 } else { 446 ASSERT(db->db_blkid != DB_BONUS_BLKID); 447 ASSERT3P(db->db_buf, ==, NULL); 448 VERIFY(arc_buf_remove_ref(buf, db) == 1); 449 db->db_state = DB_UNCACHED; 450 } 451 cv_broadcast(&db->db_changed); 452 mutex_exit(&db->db_mtx); 453 dbuf_rele(db, NULL); 454 } 455 456 static void 457 dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) 458 { 459 blkptr_t *bp; 460 zbookmark_t zb; 461 uint32_t aflags = ARC_NOWAIT; 462 463 ASSERT(!refcount_is_zero(&db->db_holds)); 464 /* We need the struct_rwlock to prevent db_blkptr from changing. */ 465 ASSERT(RW_LOCK_HELD(&db->db_dnode->dn_struct_rwlock)); 466 ASSERT(MUTEX_HELD(&db->db_mtx)); 467 ASSERT(db->db_state == DB_UNCACHED); 468 ASSERT(db->db_buf == NULL); 469 470 if (db->db_blkid == DB_BONUS_BLKID) { 471 int bonuslen = db->db_dnode->dn_bonuslen; 472 473 ASSERT3U(bonuslen, <=, db->db.db_size); 474 db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN); 475 arc_space_consume(DN_MAX_BONUSLEN); 476 if (bonuslen < DN_MAX_BONUSLEN) 477 bzero(db->db.db_data, DN_MAX_BONUSLEN); 478 bcopy(DN_BONUS(db->db_dnode->dn_phys), db->db.db_data, 479 bonuslen); 480 dbuf_update_data(db); 481 db->db_state = DB_CACHED; 482 mutex_exit(&db->db_mtx); 483 return; 484 } 485 486 if (db->db_level == 0 && dnode_block_freed(db->db_dnode, db->db_blkid)) 487 bp = NULL; 488 else 489 bp = db->db_blkptr; 490 491 if (bp == NULL) 492 dprintf_dbuf(db, "blkptr: %s\n", "NULL"); 493 else 494 dprintf_dbuf_bp(db, bp, "%s", "blkptr:"); 495 496 if (bp == NULL || BP_IS_HOLE(bp)) { 497 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 498 499 ASSERT(bp == NULL || BP_IS_HOLE(bp)); 500 dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa, 501 db->db.db_size, db, type)); 502 bzero(db->db.db_data, db->db.db_size); 503 db->db_state = DB_CACHED; 504 *flags |= DB_RF_CACHED; 505 mutex_exit(&db->db_mtx); 506 return; 507 } 508 509 db->db_state = DB_READ; 510 mutex_exit(&db->db_mtx); 511 512 zb.zb_objset = db->db_objset->os_dsl_dataset ? 513 db->db_objset->os_dsl_dataset->ds_object : 0; 514 zb.zb_object = db->db.db_object; 515 zb.zb_level = db->db_level; 516 zb.zb_blkid = db->db_blkid; 517 518 dbuf_add_ref(db, NULL); 519 /* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */ 520 ASSERT3U(db->db_dnode->dn_type, <, DMU_OT_NUMTYPES); 521 (void) arc_read(zio, db->db_dnode->dn_objset->os_spa, bp, 522 db->db_level > 0 ? byteswap_uint64_array : 523 dmu_ot[db->db_dnode->dn_type].ot_byteswap, 524 dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, 525 (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, 526 &aflags, &zb); 527 if (aflags & ARC_CACHED) 528 *flags |= DB_RF_CACHED; 529 } 530 531 int 532 dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) 533 { 534 int err = 0; 535 int havepzio = (zio != NULL); 536 int prefetch; 537 538 /* 539 * We don't have to hold the mutex to check db_state because it 540 * can't be freed while we have a hold on the buffer. 541 */ 542 ASSERT(!refcount_is_zero(&db->db_holds)); 543 544 if ((flags & DB_RF_HAVESTRUCT) == 0) 545 rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER); 546 547 prefetch = db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID && 548 (flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL; 549 550 mutex_enter(&db->db_mtx); 551 if (db->db_state == DB_CACHED) { 552 mutex_exit(&db->db_mtx); 553 if (prefetch) 554 dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset, 555 db->db.db_size, TRUE); 556 if ((flags & DB_RF_HAVESTRUCT) == 0) 557 rw_exit(&db->db_dnode->dn_struct_rwlock); 558 } else if (db->db_state == DB_UNCACHED) { 559 if (zio == NULL) { 560 zio = zio_root(db->db_dnode->dn_objset->os_spa, 561 NULL, NULL, ZIO_FLAG_CANFAIL); 562 } 563 dbuf_read_impl(db, zio, &flags); 564 565 /* dbuf_read_impl has dropped db_mtx for us */ 566 567 if (prefetch) 568 dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset, 569 db->db.db_size, flags & DB_RF_CACHED); 570 571 if ((flags & DB_RF_HAVESTRUCT) == 0) 572 rw_exit(&db->db_dnode->dn_struct_rwlock); 573 574 if (!havepzio) 575 err = zio_wait(zio); 576 } else { 577 mutex_exit(&db->db_mtx); 578 if (prefetch) 579 dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset, 580 db->db.db_size, TRUE); 581 if ((flags & DB_RF_HAVESTRUCT) == 0) 582 rw_exit(&db->db_dnode->dn_struct_rwlock); 583 584 mutex_enter(&db->db_mtx); 585 if ((flags & DB_RF_NEVERWAIT) == 0) { 586 while (db->db_state == DB_READ || 587 db->db_state == DB_FILL) { 588 ASSERT(db->db_state == DB_READ || 589 (flags & DB_RF_HAVESTRUCT) == 0); 590 cv_wait(&db->db_changed, &db->db_mtx); 591 } 592 if (db->db_state == DB_UNCACHED) 593 err = EIO; 594 } 595 mutex_exit(&db->db_mtx); 596 } 597 598 ASSERT(err || havepzio || db->db_state == DB_CACHED); 599 return (err); 600 } 601 602 static void 603 dbuf_noread(dmu_buf_impl_t *db) 604 { 605 ASSERT(!refcount_is_zero(&db->db_holds)); 606 ASSERT(db->db_blkid != DB_BONUS_BLKID); 607 mutex_enter(&db->db_mtx); 608 while (db->db_state == DB_READ || db->db_state == DB_FILL) 609 cv_wait(&db->db_changed, &db->db_mtx); 610 if (db->db_state == DB_UNCACHED) { 611 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 612 613 ASSERT(db->db_buf == NULL); 614 ASSERT(db->db.db_data == NULL); 615 dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa, 616 db->db.db_size, db, type)); 617 db->db_state = DB_FILL; 618 } else { 619 ASSERT3U(db->db_state, ==, DB_CACHED); 620 } 621 mutex_exit(&db->db_mtx); 622 } 623 624 /* 625 * This is our just-in-time copy function. It makes a copy of 626 * buffers, that have been modified in a previous transaction 627 * group, before we modify them in the current active group. 628 * 629 * This function is used in two places: when we are dirtying a 630 * buffer for the first time in a txg, and when we are freeing 631 * a range in a dnode that includes this buffer. 632 * 633 * Note that when we are called from dbuf_free_range() we do 634 * not put a hold on the buffer, we just traverse the active 635 * dbuf list for the dnode. 636 */ 637 static void 638 dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) 639 { 640 dbuf_dirty_record_t *dr = db->db_last_dirty; 641 642 ASSERT(MUTEX_HELD(&db->db_mtx)); 643 ASSERT(db->db.db_data != NULL); 644 ASSERT(db->db_level == 0); 645 ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT); 646 647 if (dr == NULL || 648 (dr->dt.dl.dr_data != 649 ((db->db_blkid == DB_BONUS_BLKID) ? db->db.db_data : db->db_buf))) 650 return; 651 652 /* 653 * If the last dirty record for this dbuf has not yet synced 654 * and its referencing the dbuf data, either: 655 * reset the reference to point to a new copy, 656 * or (if there a no active holders) 657 * just null out the current db_data pointer. 658 */ 659 ASSERT(dr->dr_txg >= txg - 2); 660 if (db->db_blkid == DB_BONUS_BLKID) { 661 /* Note that the data bufs here are zio_bufs */ 662 dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN); 663 arc_space_consume(DN_MAX_BONUSLEN); 664 bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN); 665 } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) { 666 int size = db->db.db_size; 667 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 668 dr->dt.dl.dr_data = arc_buf_alloc( 669 db->db_dnode->dn_objset->os_spa, size, db, type); 670 bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size); 671 } else { 672 dbuf_set_data(db, NULL); 673 } 674 } 675 676 void 677 dbuf_unoverride(dbuf_dirty_record_t *dr) 678 { 679 dmu_buf_impl_t *db = dr->dr_dbuf; 680 uint64_t txg = dr->dr_txg; 681 682 ASSERT(MUTEX_HELD(&db->db_mtx)); 683 ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC); 684 ASSERT(db->db_level == 0); 685 686 if (db->db_blkid == DB_BONUS_BLKID || 687 dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN) 688 return; 689 690 /* free this block */ 691 if (!BP_IS_HOLE(&dr->dt.dl.dr_overridden_by)) { 692 /* XXX can get silent EIO here */ 693 (void) arc_free(NULL, db->db_dnode->dn_objset->os_spa, 694 txg, &dr->dt.dl.dr_overridden_by, NULL, NULL, ARC_WAIT); 695 } 696 dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; 697 /* 698 * Release the already-written buffer, so we leave it in 699 * a consistent dirty state. Note that all callers are 700 * modifying the buffer, so they will immediately do 701 * another (redundant) arc_release(). Therefore, leave 702 * the buf thawed to save the effort of freezing & 703 * immediately re-thawing it. 704 */ 705 arc_release(dr->dt.dl.dr_data, db); 706 } 707 708 void 709 dbuf_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) 710 { 711 dmu_buf_impl_t *db, *db_next; 712 uint64_t txg = tx->tx_txg; 713 714 dprintf_dnode(dn, "blkid=%llu nblks=%llu\n", blkid, nblks); 715 mutex_enter(&dn->dn_dbufs_mtx); 716 for (db = list_head(&dn->dn_dbufs); db; db = db_next) { 717 db_next = list_next(&dn->dn_dbufs, db); 718 ASSERT(db->db_blkid != DB_BONUS_BLKID); 719 if (db->db_level != 0) 720 continue; 721 dprintf_dbuf(db, "found buf %s\n", ""); 722 if (db->db_blkid < blkid || 723 db->db_blkid >= blkid+nblks) 724 continue; 725 726 /* found a level 0 buffer in the range */ 727 if (dbuf_undirty(db, tx)) 728 continue; 729 730 mutex_enter(&db->db_mtx); 731 if (db->db_state == DB_UNCACHED || 732 db->db_state == DB_EVICTING) { 733 ASSERT(db->db.db_data == NULL); 734 mutex_exit(&db->db_mtx); 735 continue; 736 } 737 if (db->db_state == DB_READ || db->db_state == DB_FILL) { 738 /* will be handled in dbuf_read_done or dbuf_rele */ 739 db->db_freed_in_flight = TRUE; 740 mutex_exit(&db->db_mtx); 741 continue; 742 } 743 if (refcount_count(&db->db_holds) == 0) { 744 ASSERT(db->db_buf); 745 dbuf_clear(db); 746 continue; 747 } 748 /* The dbuf is referenced */ 749 750 if (db->db_last_dirty != NULL) { 751 dbuf_dirty_record_t *dr = db->db_last_dirty; 752 753 if (dr->dr_txg == txg) { 754 /* 755 * This buffer is "in-use", re-adjust the file 756 * size to reflect that this buffer may 757 * contain new data when we sync. 758 */ 759 if (db->db_blkid > dn->dn_maxblkid) 760 dn->dn_maxblkid = db->db_blkid; 761 dbuf_unoverride(dr); 762 } else { 763 /* 764 * This dbuf is not dirty in the open context. 765 * Either uncache it (if its not referenced in 766 * the open context) or reset its contents to 767 * empty. 768 */ 769 dbuf_fix_old_data(db, txg); 770 } 771 } 772 /* clear the contents if its cached */ 773 if (db->db_state == DB_CACHED) { 774 ASSERT(db->db.db_data != NULL); 775 arc_release(db->db_buf, db); 776 bzero(db->db.db_data, db->db.db_size); 777 arc_buf_freeze(db->db_buf); 778 } 779 780 mutex_exit(&db->db_mtx); 781 } 782 mutex_exit(&dn->dn_dbufs_mtx); 783 } 784 785 static int 786 dbuf_block_freeable(dmu_buf_impl_t *db) 787 { 788 dsl_dataset_t *ds = db->db_objset->os_dsl_dataset; 789 uint64_t birth_txg = 0; 790 791 /* 792 * We don't need any locking to protect db_blkptr: 793 * If it's syncing, then db_last_dirty will be set 794 * so we'll ignore db_blkptr. 795 */ 796 ASSERT(MUTEX_HELD(&db->db_mtx)); 797 if (db->db_last_dirty) 798 birth_txg = db->db_last_dirty->dr_txg; 799 else if (db->db_blkptr) 800 birth_txg = db->db_blkptr->blk_birth; 801 802 /* If we don't exist or are in a snapshot, we can't be freed */ 803 if (birth_txg) 804 return (ds == NULL || 805 dsl_dataset_block_freeable(ds, birth_txg)); 806 else 807 return (FALSE); 808 } 809 810 void 811 dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) 812 { 813 arc_buf_t *buf, *obuf; 814 int osize = db->db.db_size; 815 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 816 817 ASSERT(db->db_blkid != DB_BONUS_BLKID); 818 819 /* XXX does *this* func really need the lock? */ 820 ASSERT(RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)); 821 822 /* 823 * This call to dbuf_will_dirty() with the dn_struct_rwlock held 824 * is OK, because there can be no other references to the db 825 * when we are changing its size, so no concurrent DB_FILL can 826 * be happening. 827 */ 828 /* 829 * XXX we should be doing a dbuf_read, checking the return 830 * value and returning that up to our callers 831 */ 832 dbuf_will_dirty(db, tx); 833 834 /* create the data buffer for the new block */ 835 buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa, size, db, type); 836 837 /* copy old block data to the new block */ 838 obuf = db->db_buf; 839 bcopy(obuf->b_data, buf->b_data, MIN(osize, size)); 840 /* zero the remainder */ 841 if (size > osize) 842 bzero((uint8_t *)buf->b_data + osize, size - osize); 843 844 mutex_enter(&db->db_mtx); 845 dbuf_set_data(db, buf); 846 VERIFY(arc_buf_remove_ref(obuf, db) == 1); 847 db->db.db_size = size; 848 849 if (db->db_level == 0) { 850 ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); 851 db->db_last_dirty->dt.dl.dr_data = buf; 852 } 853 mutex_exit(&db->db_mtx); 854 855 dnode_willuse_space(db->db_dnode, size-osize, tx); 856 } 857 858 dbuf_dirty_record_t * 859 dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) 860 { 861 dnode_t *dn = db->db_dnode; 862 objset_impl_t *os = dn->dn_objset; 863 dbuf_dirty_record_t **drp, *dr; 864 int drop_struct_lock = FALSE; 865 int txgoff = tx->tx_txg & TXG_MASK; 866 867 ASSERT(tx->tx_txg != 0); 868 ASSERT(!refcount_is_zero(&db->db_holds)); 869 DMU_TX_DIRTY_BUF(tx, db); 870 871 /* 872 * Shouldn't dirty a regular buffer in syncing context. Private 873 * objects may be dirtied in syncing context, but only if they 874 * were already pre-dirtied in open context. 875 * XXX We may want to prohibit dirtying in syncing context even 876 * if they did pre-dirty. 877 */ 878 ASSERT(!dmu_tx_is_syncing(tx) || 879 BP_IS_HOLE(dn->dn_objset->os_rootbp) || 880 dn->dn_object == DMU_META_DNODE_OBJECT || 881 dn->dn_objset->os_dsl_dataset == NULL || 882 dsl_dir_is_private(dn->dn_objset->os_dsl_dataset->ds_dir)); 883 884 /* 885 * We make this assert for private objects as well, but after we 886 * check if we're already dirty. They are allowed to re-dirty 887 * in syncing context. 888 */ 889 ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || 890 dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == 891 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); 892 893 mutex_enter(&db->db_mtx); 894 /* 895 * XXX make this true for indirects too? The problem is that 896 * transactions created with dmu_tx_create_assigned() from 897 * syncing context don't bother holding ahead. 898 */ 899 ASSERT(db->db_level != 0 || 900 db->db_state == DB_CACHED || db->