1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include <sys/zfs_context.h> 27 #include <sys/dmu.h> 28 #include <sys/dmu_impl.h> 29 #include <sys/dbuf.h> 30 #include <sys/dmu_objset.h> 31 #include <sys/dsl_dataset.h> 32 #include <sys/dsl_dir.h> 33 #include <sys/dmu_tx.h> 34 #include <sys/spa.h> 35 #include <sys/zio.h> 36 #include <sys/dmu_zfetch.h> 37 38 static void dbuf_destroy(dmu_buf_impl_t *db); 39 static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); 40 static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); 41 static arc_done_func_t dbuf_write_ready; 42 static arc_done_func_t dbuf_write_done; 43 44 /* 45 * Global data structures and functions for the dbuf cache. 
46 */ 47 static kmem_cache_t *dbuf_cache; 48 49 /* ARGSUSED */ 50 static int 51 dbuf_cons(void *vdb, void *unused, int kmflag) 52 { 53 dmu_buf_impl_t *db = vdb; 54 bzero(db, sizeof (dmu_buf_impl_t)); 55 56 mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); 57 cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); 58 refcount_create(&db->db_holds); 59 return (0); 60 } 61 62 /* ARGSUSED */ 63 static void 64 dbuf_dest(void *vdb, void *unused) 65 { 66 dmu_buf_impl_t *db = vdb; 67 mutex_destroy(&db->db_mtx); 68 cv_destroy(&db->db_changed); 69 refcount_destroy(&db->db_holds); 70 } 71 72 /* 73 * dbuf hash table routines 74 */ 75 static dbuf_hash_table_t dbuf_hash_table; 76 77 static uint64_t dbuf_hash_count; 78 79 static uint64_t 80 dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid) 81 { 82 uintptr_t osv = (uintptr_t)os; 83 uint64_t crc = -1ULL; 84 85 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 86 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF]; 87 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF]; 88 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF]; 89 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF]; 90 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF]; 91 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF]; 92 93 crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16); 94 95 return (crc); 96 } 97 98 #define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid); 99 100 #define DBUF_EQUAL(dbuf, os, obj, level, blkid) \ 101 ((dbuf)->db.db_object == (obj) && \ 102 (dbuf)->db_objset == (os) && \ 103 (dbuf)->db_level == (level) && \ 104 (dbuf)->db_blkid == (blkid)) 105 106 dmu_buf_impl_t * 107 dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid) 108 { 109 dbuf_hash_table_t *h = &dbuf_hash_table; 110 objset_impl_t *os = dn->dn_objset; 111 uint64_t obj = dn->dn_object; 112 uint64_t hv = DBUF_HASH(os, obj, level, blkid); 113 uint64_t idx = hv & h->hash_table_mask; 114 dmu_buf_impl_t *db; 115 
116 mutex_enter(DBUF_HASH_MUTEX(h, idx)); 117 for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) { 118 if (DBUF_EQUAL(db, os, obj, level, blkid)) { 119 mutex_enter(&db->db_mtx); 120 if (db->db_state != DB_EVICTING) { 121 mutex_exit(DBUF_HASH_MUTEX(h, idx)); 122 return (db); 123 } 124 mutex_exit(&db->db_mtx); 125 } 126 } 127 mutex_exit(DBUF_HASH_MUTEX(h, idx)); 128 return (NULL); 129 } 130 131 /* 132 * Insert an entry into the hash table. If there is already an element 133 * equal to elem in the hash table, then the already existing element 134 * will be returned and the new element will not be inserted. 135 * Otherwise returns NULL. 136 */ 137 static dmu_buf_impl_t * 138 dbuf_hash_insert(dmu_buf_impl_t *db) 139 { 140 dbuf_hash_table_t *h = &dbuf_hash_table; 141 objset_impl_t *os = db->db_objset; 142 uint64_t obj = db->db.db_object; 143 int level = db->db_level; 144 uint64_t blkid = db->db_blkid; 145 uint64_t hv = DBUF_HASH(os, obj, level, blkid); 146 uint64_t idx = hv & h->hash_table_mask; 147 dmu_buf_impl_t *dbf; 148 149 mutex_enter(DBUF_HASH_MUTEX(h, idx)); 150 for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) { 151 if (DBUF_EQUAL(dbf, os, obj, level, blkid)) { 152 mutex_enter(&dbf->db_mtx); 153 if (dbf->db_state != DB_EVICTING) { 154 mutex_exit(DBUF_HASH_MUTEX(h, idx)); 155 return (dbf); 156 } 157 mutex_exit(&dbf->db_mtx); 158 } 159 } 160 161 mutex_enter(&db->db_mtx); 162 db->db_hash_next = h->hash_table[idx]; 163 h->hash_table[idx] = db; 164 mutex_exit(DBUF_HASH_MUTEX(h, idx)); 165 atomic_add_64(&dbuf_hash_count, 1); 166 167 return (NULL); 168 } 169 170 /* 171 * Remove an entry from the hash table. This operation will 172 * fail if there are any existing holds on the db. 
173 */ 174 static void 175 dbuf_hash_remove(dmu_buf_impl_t *db) 176 { 177 dbuf_hash_table_t *h = &dbuf_hash_table; 178 uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object, 179 db->db_level, db->db_blkid); 180 uint64_t idx = hv & h->hash_table_mask; 181 dmu_buf_impl_t *dbf, **dbp; 182 183 /* 184 * We musn't hold db_mtx to maintin lock ordering: 185 * DBUF_HASH_MUTEX > db_mtx. 186 */ 187 ASSERT(refcount_is_zero(&db->db_holds)); 188 ASSERT(db->db_state == DB_EVICTING); 189 ASSERT(!MUTEX_HELD(&db->db_mtx)); 190 191 mutex_enter(DBUF_HASH_MUTEX(h, idx)); 192 dbp = &h->hash_table[idx]; 193 while ((dbf = *dbp) != db) { 194 dbp = &dbf->db_hash_next; 195 ASSERT(dbf != NULL); 196 } 197 *dbp = db->db_hash_next; 198 db->db_hash_next = NULL; 199 mutex_exit(DBUF_HASH_MUTEX(h, idx)); 200 atomic_add_64(&dbuf_hash_count, -1); 201 } 202 203 static arc_evict_func_t dbuf_do_evict; 204 205 static void 206 dbuf_evict_user(dmu_buf_impl_t *db) 207 { 208 ASSERT(MUTEX_HELD(&db->db_mtx)); 209 210 if (db->db_level != 0 || db->db_evict_func == NULL) 211 return; 212 213 if (db->db_user_data_ptr_ptr) 214 *db->db_user_data_ptr_ptr = db->db.db_data; 215 db->db_evict_func(&db->db, db->db_user_ptr); 216 db->db_user_ptr = NULL; 217 db->db_user_data_ptr_ptr = NULL; 218 db->db_evict_func = NULL; 219 } 220 221 void 222 dbuf_evict(dmu_buf_impl_t *db) 223 { 224 ASSERT(MUTEX_HELD(&db->db_mtx)); 225 ASSERT(db->db_buf == NULL); 226 ASSERT(db->db_data_pending == NULL); 227 228 dbuf_clear(db); 229 dbuf_destroy(db); 230 } 231 232 void 233 dbuf_init(void) 234 { 235 uint64_t hsize = 1ULL << 16; 236 dbuf_hash_table_t *h = &dbuf_hash_table; 237 int i; 238 239 /* 240 * The hash table is big enough to fill all of physical memory 241 * with an average 4K block size. The table will take up 242 * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers). 
243 */ 244 while (hsize * 4096 < physmem * PAGESIZE) 245 hsize <<= 1; 246 247 retry: 248 h->hash_table_mask = hsize - 1; 249 h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP); 250 if (h->hash_table == NULL) { 251 /* XXX - we should really return an error instead of assert */ 252 ASSERT(hsize > (1ULL << 10)); 253 hsize >>= 1; 254 goto retry; 255 } 256 257 dbuf_cache = kmem_cache_create("dmu_buf_impl_t", 258 sizeof (dmu_buf_impl_t), 259 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); 260 261 for (i = 0; i < DBUF_MUTEXES; i++) 262 mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); 263 } 264 265 void 266 dbuf_fini(void) 267 { 268 dbuf_hash_table_t *h = &dbuf_hash_table; 269 int i; 270 271 for (i = 0; i < DBUF_MUTEXES; i++) 272 mutex_destroy(&h->hash_mutexes[i]); 273 kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); 274 kmem_cache_destroy(dbuf_cache); 275 } 276 277 /* 278 * Other stuff. 279 */ 280 281 #ifdef ZFS_DEBUG 282 static void 283 dbuf_verify(dmu_buf_impl_t *db) 284 { 285 dnode_t *dn = db->db_dnode; 286 287 ASSERT(MUTEX_HELD(&db->db_mtx)); 288 289 if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY)) 290 return; 291 292 ASSERT(db->db_objset != NULL); 293 if (dn == NULL) { 294 ASSERT(db->db_parent == NULL); 295 ASSERT(db->db_blkptr == NULL); 296 } else { 297 ASSERT3U(db->db.db_object, ==, dn->dn_object); 298 ASSERT3P(db->db_objset, ==, dn->dn_objset); 299 ASSERT3U(db->db_level, <, dn->dn_nlevels); 300 ASSERT(db->db_blkid == DB_BONUS_BLKID || 301 list_head(&dn->dn_dbufs)); 302 } 303 if (db->db_blkid == DB_BONUS_BLKID) { 304 ASSERT(dn != NULL); 305 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); 306 ASSERT3U(db->db.db_offset, ==, DB_BONUS_BLKID); 307 } else { 308 ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); 309 } 310 311 if (db->db_level == 0) { 312 /* we can be momentarily larger in dnode_set_blksz() */ 313 if (db->db_blkid != DB_BONUS_BLKID && dn) { 314 ASSERT3U(db->db.db_size, >=, dn->dn_datablksz); 315 } 316 if 
(db->db.db_object == DMU_META_DNODE_OBJECT) { 317 dbuf_dirty_record_t *dr = db->db_data_pending; 318 /* 319 * it should only be modified in syncing 320 * context, so make sure we only have 321 * one copy of the data. 322 */ 323 ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf); 324 } 325 } 326 327 /* verify db->db_blkptr */ 328 if (db->db_blkptr) { 329 if (db->db_parent == dn->dn_dbuf) { 330 /* db is pointed to by the dnode */ 331 /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */ 332 if (db->db.db_object == DMU_META_DNODE_OBJECT) 333 ASSERT(db->db_parent == NULL); 334 else 335 ASSERT(db->db_parent != NULL); 336 ASSERT3P(db->db_blkptr, ==, 337 &dn->dn_phys->dn_blkptr[db->db_blkid]); 338 } else { 339 /* db is pointed to by an indirect block */ 340 int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT; 341 ASSERT3U(db->db_parent->db_level, ==, db->db_level+1); 342 ASSERT3U(db->db_parent->db.db_object, ==, 343 db->db.db_object); 344 /* 345 * dnode_grow_indblksz() can make this fail if we don't 346 * have the struct_rwlock. XXX indblksz no longer 347 * grows. safe to do this now? 348 */ 349 if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) { 350 ASSERT3P(db->db_blkptr, ==, 351 ((blkptr_t *)db->db_parent->db.db_data + 352 db->db_blkid % epb)); 353 } 354 } 355 } 356 if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && 357 db->db.db_data && db->db_blkid != DB_BONUS_BLKID && 358 db->db_state != DB_FILL && !dn->dn_free_txg) { 359 /* 360 * If the blkptr isn't set but they have nonzero data, 361 * it had better be dirty, otherwise we'll lose that 362 * data when we evict this buffer. 
363 */ 364 if (db->db_dirtycnt == 0) { 365 uint64_t *buf = db->db.db_data; 366 int i; 367 368 for (i = 0; i < db->db.db_size >> 3; i++) { 369 ASSERT(buf[i] == 0); 370 } 371 } 372 } 373 } 374 #endif 375 376 static void 377 dbuf_update_data(dmu_buf_impl_t *db) 378 { 379 ASSERT(MUTEX_HELD(&db->db_mtx)); 380 if (db->db_level == 0 && db->db_user_data_ptr_ptr) { 381 ASSERT(!refcount_is_zero(&db->db_holds)); 382 *db->db_user_data_ptr_ptr = db->db.db_data; 383 } 384 } 385 386 static void 387 dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) 388 { 389 ASSERT(MUTEX_HELD(&db->db_mtx)); 390 ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf)); 391 db->db_buf = buf; 392 if (buf != NULL) { 393 ASSERT(buf->b_data != NULL); 394 db->db.db_data = buf->b_data; 395 if (!arc_released(buf)) 396 arc_set_callback(buf, dbuf_do_evict, db); 397 dbuf_update_data(db); 398 } else { 399 dbuf_evict_user(db); 400 db->db.db_data = NULL; 401 db->db_state = DB_UNCACHED; 402 } 403 } 404 405 uint64_t 406 dbuf_whichblock(dnode_t *dn, uint64_t offset) 407 { 408 if (dn->dn_datablkshift) { 409 return (offset >> dn->dn_datablkshift); 410 } else { 411 ASSERT3U(offset, <, dn->dn_datablksz); 412 return (0); 413 } 414 } 415 416 static void 417 dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) 418 { 419 dmu_buf_impl_t *db = vdb; 420 421 mutex_enter(&db->db_mtx); 422 ASSERT3U(db->db_state, ==, DB_READ); 423 /* 424 * All reads are synchronous, so we must have a hold on the dbuf 425 */ 426 ASSERT(refcount_count(&db->db_holds) > 0); 427 ASSERT(db->db_buf == NULL); 428 ASSERT(db->db.db_data == NULL); 429 if (db->db_level == 0 && db->db_freed_in_flight) { 430 /* we were freed in flight; disregard any error */ 431 arc_release(buf, db); 432 bzero(buf->b_data, db->db.db_size); 433 arc_buf_freeze(buf); 434 db->db_freed_in_flight = FALSE; 435 dbuf_set_data(db, buf); 436 db->db_state = DB_CACHED; 437 } else if (zio == NULL || zio->io_error == 0) { 438 dbuf_set_data(db, buf); 439 db->db_state = DB_CACHED; 440 } 
else { 441 ASSERT(db->db_blkid != DB_BONUS_BLKID); 442 ASSERT3P(db->db_buf, ==, NULL); 443 VERIFY(arc_buf_remove_ref(buf, db) == 1); 444 db->db_state = DB_UNCACHED; 445 } 446 cv_broadcast(&db->db_changed); 447 mutex_exit(&db->db_mtx); 448 dbuf_rele(db, NULL); 449 } 450 451 static void 452 dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) 453 { 454 dnode_t *dn = db->db_dnode; 455 zbookmark_t zb; 456 uint32_t aflags = ARC_NOWAIT; 457 arc_buf_t *pbuf; 458 459 ASSERT(!refcount_is_zero(&db->db_holds)); 460 /* We need the struct_rwlock to prevent db_blkptr from changing. */ 461 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 462 ASSERT(MUTEX_HELD(&db->db_mtx)); 463 ASSERT(db->db_state == DB_UNCACHED); 464 ASSERT(db->db_buf == NULL); 465 466 if (db->db_blkid == DB_BONUS_BLKID) { 467 int bonuslen = dn->dn_bonuslen; 468 469 ASSERT3U(bonuslen, <=, db->db.db_size); 470 db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN); 471 arc_space_consume(DN_MAX_BONUSLEN); 472 if (bonuslen < DN_MAX_BONUSLEN) 473 bzero(db->db.db_data, DN_MAX_BONUSLEN); 474 bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, 475 bonuslen); 476 dbuf_update_data(db); 477 db->db_state = DB_CACHED; 478 mutex_exit(&db->db_mtx); 479 return; 480 } 481 482 /* 483 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync() 484 * processes the delete record and clears the bp while we are waiting 485 * for the dn_mtx (resulting in a "no" from block_freed). 
486 */ 487 if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) || 488 (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) || 489 BP_IS_HOLE(db->db_blkptr)))) { 490 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 491 492 dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa, 493 db->db.db_size, db, type)); 494 bzero(db->db.db_data, db->db.db_size); 495 db->db_state = DB_CACHED; 496 *flags |= DB_RF_CACHED; 497 mutex_exit(&db->db_mtx); 498 return; 499 } 500 501 db->db_state = DB_READ; 502 mutex_exit(&db->db_mtx); 503 504 if (DBUF_IS_L2CACHEABLE(db)) 505 aflags |= ARC_L2CACHE; 506 507 zb.zb_objset = db->db_objset->os_dsl_dataset ? 508 db->db_objset->os_dsl_dataset->ds_object : 0; 509 zb.zb_object = db->db.db_object; 510 zb.zb_level = db->db_level; 511 zb.zb_blkid = db->db_blkid; 512 513 dbuf_add_ref(db, NULL); 514 /* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */ 515 516 if (db->db_parent) 517 pbuf = db->db_parent->db_buf; 518 else 519 pbuf = db->db_objset->os_phys_buf; 520 521 (void) arc_read(zio, dn->dn_objset->os_spa, db->db_blkptr, pbuf, 522 dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, 523 (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, 524 &aflags, &zb); 525 if (aflags & ARC_CACHED) 526 *flags |= DB_RF_CACHED; 527 } 528 529 int 530 dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) 531 { 532 int err = 0; 533 int havepzio = (zio != NULL); 534 int prefetch; 535 536 /* 537 * We don't have to hold the mutex to check db_state because it 538 * can't be freed while we have a hold on the buffer. 
539 */ 540 ASSERT(!refcount_is_zero(&db->db_holds)); 541 542 if ((flags & DB_RF_HAVESTRUCT) == 0) 543 rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER); 544 545 prefetch = db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID && 546 (flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL && 547 DBUF_IS_CACHEABLE(db); 548 549 mutex_enter(&db->db_mtx); 550 if (db->db_state == DB_CACHED) { 551 mutex_exit(&db->db_mtx); 552 if (prefetch) 553 dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset, 554 db->db.db_size, TRUE); 555 if ((flags & DB_RF_HAVESTRUCT) == 0) 556 rw_exit(&db->db_dnode->dn_struct_rwlock); 557 } else if (db->db_state == DB_UNCACHED) { 558 if (zio == NULL) { 559 zio = zio_root(db->db_dnode->dn_objset->os_spa, 560 NULL, NULL, ZIO_FLAG_CANFAIL); 561 } 562 dbuf_read_impl(db, zio, &flags); 563 564 /* dbuf_read_impl has dropped db_mtx for us */ 565 566 if (prefetch) 567 dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset, 568 db->db.db_size, flags & DB_RF_CACHED); 569 570 if ((flags & DB_RF_HAVESTRUCT) == 0) 571 rw_exit(&db->db_dnode->dn_struct_rwlock); 572 573 if (!havepzio) 574 err = zio_wait(zio); 575 } else { 576 mutex_exit(&db->db_mtx); 577 if (prefetch) 578 dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset, 579 db->db.db_size, TRUE); 580 if ((flags & DB_RF_HAVESTRUCT) == 0) 581 rw_exit(&db->db_dnode->dn_struct_rwlock); 582 583 mutex_enter(&db->db_mtx); 584 if ((flags & DB_RF_NEVERWAIT) == 0) { 585 while (db->db_state == DB_READ || 586 db->db_state == DB_FILL) { 587 ASSERT(db->db_state == DB_READ || 588 (flags & DB_RF_HAVESTRUCT) == 0); 589 cv_wait(&db->db_changed, &db->db_mtx); 590 } 591 if (db->db_state == DB_UNCACHED) 592 err = EIO; 593 } 594 mutex_exit(&db->db_mtx); 595 } 596 597 ASSERT(err || havepzio || db->db_state == DB_CACHED); 598 return (err); 599 } 600 601 static void 602 dbuf_noread(dmu_buf_impl_t *db) 603 { 604 ASSERT(!refcount_is_zero(&db->db_holds)); 605 ASSERT(db->db_blkid != DB_BONUS_BLKID); 606 mutex_enter(&db->db_mtx); 607 
while (db->db_state == DB_READ || db->db_state == DB_FILL) 608 cv_wait(&db->db_changed, &db->db_mtx); 609 if (db->db_state == DB_UNCACHED) { 610 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 611 612 ASSERT(db->db_buf == NULL); 613 ASSERT(db->db.db_data == NULL); 614 dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa, 615 db->db.db_size, db, type)); 616 db->db_state = DB_FILL; 617 } else { 618 ASSERT3U(db->db_state, ==, DB_CACHED); 619 } 620 mutex_exit(&db->db_mtx); 621 } 622 623 /* 624 * This is our just-in-time copy function. It makes a copy of 625 * buffers, that have been modified in a previous transaction 626 * group, before we modify them in the current active group. 627 * 628 * This function is used in two places: when we are dirtying a 629 * buffer for the first time in a txg, and when we are freeing 630 * a range in a dnode that includes this buffer. 631 * 632 * Note that when we are called from dbuf_free_range() we do 633 * not put a hold on the buffer, we just traverse the active 634 * dbuf list for the dnode. 635 */ 636 static void 637 dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) 638 { 639 dbuf_dirty_record_t *dr = db->db_last_dirty; 640 641 ASSERT(MUTEX_HELD(&db->db_mtx)); 642 ASSERT(db->db.db_data != NULL); 643 ASSERT(db->db_level == 0); 644 ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT); 645 646 if (dr == NULL || 647 (dr->dt.dl.dr_data != 648 ((db->db_blkid == DB_BONUS_BLKID) ? db->db.db_data : db->db_buf))) 649 return; 650 651 /* 652 * If the last dirty record for this dbuf has not yet synced 653 * and its referencing the dbuf data, either: 654 * reset the reference to point to a new copy, 655 * or (if there a no active holders) 656 * just null out the current db_data pointer. 
657 */ 658 ASSERT(dr->dr_txg >= txg - 2); 659 if (db->db_blkid == DB_BONUS_BLKID) { 660 /* Note that the data bufs here are zio_bufs */ 661 dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN); 662 arc_space_consume(DN_MAX_BONUSLEN); 663 bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN); 664 } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) { 665 int size = db->db.db_size; 666 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 667 dr->dt.dl.dr_data = arc_buf_alloc( 668 db->db_dnode->dn_objset->os_spa, size, db, type); 669 bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size); 670 } else { 671 dbuf_set_data(db, NULL); 672 } 673 } 674 675 void 676 dbuf_unoverride(dbuf_dirty_record_t *dr) 677 { 678 dmu_buf_impl_t *db = dr->dr_dbuf; 679 uint64_t txg = dr->dr_txg; 680 681 ASSERT(MUTEX_HELD(&db->db_mtx)); 682 ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC); 683 ASSERT(db->db_level == 0); 684 685 if (db->db_blkid == DB_BONUS_BLKID || 686 dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN) 687 return; 688 689 /* free this block */ 690 if (!BP_IS_HOLE(&dr->dt.dl.dr_overridden_by)) { 691 /* XXX can get silent EIO here */ 692 (void) dsl_free(NULL, 693 spa_get_dsl(db->db_dnode->dn_objset->os_spa), 694 txg, &dr->dt.dl.dr_overridden_by, NULL, NULL, ARC_WAIT); 695 } 696 dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; 697 /* 698 * Release the already-written buffer, so we leave it in 699 * a consistent dirty state. Note that all callers are 700 * modifying the buffer, so they will immediately do 701 * another (redundant) arc_release(). Therefore, leave 702 * the buf thawed to save the effort of freezing & 703 * immediately re-thawing it. 704 */ 705 arc_release(dr->dt.dl.dr_data, db); 706 } 707 708 /* 709 * Evict (if its unreferenced) or clear (if its referenced) any level-0 710 * data blocks in the free range, so that any future readers will find 711 * empty blocks. 
Also, if we happen across any level-1 dbufs in the
 * range that have not already been marked dirty, mark them dirty so
 * they stay in memory.
 *
 * start and end are level-0 block ids and the range is inclusive; end
 * is clamped to dn_maxblkid.  The dnode's dn_dbufs list is walked with
 * dn_dbufs_mtx held for the whole traversal.
 */
void
dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db, *db_next;
	uint64_t txg = tx->tx_txg;
	/* log2 of the number of block pointers per indirect block */
	int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
	uint64_t first_l1 = start >> epbs;
	uint64_t last_l1 = end >> epbs;

	if (end > dn->dn_maxblkid) {
		end = dn->dn_maxblkid;
		last_l1 = end >> epbs;
	}
	dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
	mutex_enter(&dn->dn_dbufs_mtx);
	for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
		/* Fetch the successor first; db may be cleared below. */
		db_next = list_next(&dn->dn_dbufs, db);
		ASSERT(db->db_blkid != DB_BONUS_BLKID);

		if (db->db_level == 1 &&
		    db->db_blkid >= first_l1 && db->db_blkid <= last_l1) {
			mutex_enter(&db->db_mtx);
			if (db->db_last_dirty &&
			    db->db_last_dirty->dr_txg < txg) {
				/* Hold the dbuf across the db_mtx drop. */
				dbuf_add_ref(db, FTAG);
				mutex_exit(&db->db_mtx);
				dbuf_will_dirty(db, tx);
				dbuf_rele(db, FTAG);
			} else {
				mutex_exit(&db->db_mtx);
			}
		}

		if (db->db_level != 0)
			continue;
		dprintf_dbuf(db, "found buf %s\n", "");
		if (db->db_blkid < start || db->db_blkid > end)
			continue;

		/* found a level 0 buffer in the range */
		if (dbuf_undirty(db, tx))
			continue;

		mutex_enter(&db->db_mtx);
		if (db->db_state == DB_UNCACHED ||
		    db->db_state == DB_EVICTING) {
			ASSERT(db->db.db_data == NULL);
			mutex_exit(&db->db_mtx);
			continue;
		}
		if (db->db_state == DB_READ || db->db_state == DB_FILL) {
			/* will be handled in dbuf_read_done or dbuf_rele */
			db->db_freed_in_flight = TRUE;
			mutex_exit(&db->db_mtx);
			continue;
		}
		if (refcount_count(&db->db_holds) == 0) {
			ASSERT(db->db_buf);
			/*
			 * NOTE(review): there is no mutex_exit() on this
			 * path; presumably dbuf_clear() drops db_mtx.
			 * Confirm against its definition.
			 */
			dbuf_clear(db);
			continue;
		}
		/* The dbuf is referenced */

		if (db->db_last_dirty != NULL) {
			dbuf_dirty_record_t *dr = db->db_last_dirty;

			if (dr->dr_txg == txg) {
				/*
				 * This buffer is "in-use", re-adjust the file
				 * size to reflect that this buffer may
				 * contain new data when we sync.
				 */
				if (db->db_blkid > dn->dn_maxblkid)
					dn->dn_maxblkid = db->db_blkid;
				dbuf_unoverride(dr);
			} else {
				/*
				 * This dbuf is not dirty in the open context.
				 * Either uncache it (if its not referenced in
				 * the open context) or reset its contents to
				 * empty.
				 */
				dbuf_fix_old_data(db, txg);
			}
		}
		/* clear the contents if its cached */
		if (db->db_state == DB_CACHED) {
			ASSERT(db->db.db_data != NULL);
			arc_release(db->db_buf, db);
			bzero(db->db.db_data, db->db.db_size);
			arc_buf_freeze(db->db_buf);
		}

		mutex_exit(&db->db_mtx);
	}
	mutex_exit(&dn->dn_dbufs_mtx);
}

/*
 * Decide whether the block backing this dbuf could be freed.
 *
 * The birth txg is taken from the last dirty record if there is one,
 * otherwise from the block pointer.  With no birth txg the answer is
 * FALSE.  Otherwise the answer is TRUE when the objset has no dataset,
 * or else whatever dsl_dataset_block_freeable() reports for that txg.
 * The caller must hold db_mtx.
 */
static int
dbuf_block_freeable(dmu_buf_impl_t *db)
{
	dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
	uint64_t birth_txg = 0;

	/*
	 * We don't need any locking to protect db_blkptr:
	 * If it's syncing, then db_last_dirty will be set
	 * so we'll ignore db_blkptr.
	 */
	ASSERT(MUTEX_HELD(&db->db_mtx));
	if (db->db_last_dirty)
		birth_txg = db->db_last_dirty->dr_txg;
	else if (db->db_blkptr)
		birth_txg = db->db_blkptr->blk_birth;

	/* If we don't exist or are in a snapshot, we can't be freed */
	if (birth_txg)
		return (ds == NULL ||
		    dsl_dataset_block_freeable(ds, birth_txg));
	else
		return (FALSE);
}

/*
 * Change the size of a non-bonus dbuf to size bytes within tx.
 *
 * The dbuf is dirtied, a new ARC buffer of the new size is allocated,
 * the first MIN(old size, new size) bytes are copied across and any
 * remainder is zeroed.  The new buffer then replaces the old one, and
 * for level-0 dbufs it also becomes the data of the last dirty record.
 * The size difference is reported through dnode_willuse_space().
 *
 * The caller must hold the dnode's struct_rwlock as writer.
 */
void
dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
{
	arc_buf_t *buf, *obuf;
	int osize = db->db.db_size;
	arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

	ASSERT(db->db_blkid != DB_BONUS_BLKID);

	/* XXX does *this* func really need the lock? */
	ASSERT(RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock));

	/*
	 * This call to dbuf_will_dirty() with the dn_struct_rwlock held
	 * is OK, because there can be no other references to the db
	 * when we are changing its size, so no concurrent DB_FILL can
	 * be happening.
	 */
	/*
	 * XXX we should be doing a dbuf_read, checking the return
	 * value and returning that up to our callers
	 */
	dbuf_will_dirty(db, tx);

	/* create the data buffer for the new block */
	buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa, size, db, type);

	/* copy old block data to the new block */
	obuf = db->db_buf;
	bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
	/* zero the remainder */
	if (size > osize)
		bzero((uint8_t *)buf->b_data + osize, size - osize);

	mutex_enter(&db->db_mtx);
	dbuf_set_data(db, buf);
	VERIFY(arc_buf_remove_ref(obuf, db) == 1);
	db->db.db_size = size;

	if (db->db_level == 0) {
		ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
		db->db_last_dirty->dt.dl.dr_data = buf;
	}
	mutex_exit(&db->db_mtx);

	dnode_willuse_space(db->db_dnode, size-osize, tx);
}

dbuf_dirty_record_t *
dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	dnode_t *dn = db->db_dnode;
	objset_impl_t *os = dn->dn_objset;
	dbuf_dirty_record_t **drp, *dr;
	int drop_struct_lock = FALSE;
	boolean_t do_free_accounting = B_FALSE;
	int txgoff = tx->tx_txg & TXG_MASK;

	ASSERT(tx->tx_txg != 0);
	ASSERT(!refcount_is_zero(&db->db_holds));
	DMU_TX_DIRTY_BUF(tx, db);

	/*
	 * Shouldn't dirty a regular buffer in syncing context.  Private