1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * DVA-based Adjustable Replacement Cache 28 * 29 * While much of the theory of operation used here is 30 * based on the self-tuning, low overhead replacement cache 31 * presented by Megiddo and Modha at FAST 2003, there are some 32 * significant differences: 33 * 34 * 1. The Megiddo and Modha model assumes any page is evictable. 35 * Pages in its cache cannot be "locked" into memory. This makes 36 * the eviction algorithm simple: evict the last page in the list. 37 * This also makes the performance characteristics easy to reason 38 * about. Our cache is not so simple. At any given moment, some 39 * subset of the blocks in the cache are un-evictable because we 40 * have handed out a reference to them. Blocks are only evictable 41 * when there are no external references active. This makes 42 * eviction far more problematic: we choose to evict the evictable 43 * blocks that are the "lowest" in the list. 44 * 45 * There are times when it is not possible to evict the requested 46 * space. 
In these circumstances we are unable to adjust the cache 47 * size. To prevent the cache growing unbounded at these times we 48 * implement a "cache throttle" that slows the flow of new data 49 * into the cache until we can make space available. 50 * 51 * 2. The Megiddo and Modha model assumes a fixed cache size. 52 * Pages are evicted when the cache is full and there is a cache 53 * miss. Our model has a variable sized cache. It grows with 54 * high use, but also tries to react to memory pressure from the 55 * operating system: decreasing its size when system memory is 56 * tight. 57 * 58 * 3. The Megiddo and Modha model assumes a fixed page size. All 59 * elements of the cache are therefore exactly the same size. So 60 * when adjusting the cache size following a cache miss, it's simply 61 * a matter of choosing a single page to evict. In our model, we 62 * have variable sized cache blocks (ranging from 512 bytes to 63 * 128K bytes). We therefore choose a set of blocks to evict to make 64 * space for a cache miss that approximates as closely as possible 65 * the space used by the new block. 66 * 67 * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache" 68 * by N. Megiddo & D. Modha, FAST 2003 69 */ 70 71 /* 72 * The locking model: 73 * 74 * A new reference to a cache buffer can be obtained in two 75 * ways: 1) via a hash table lookup using the DVA as a key, 76 * or 2) via one of the ARC lists. The arc_read() interface 77 * uses method 1, while the internal arc algorithms for 78 * adjusting the cache use method 2. We therefore provide two 79 * types of locks: 1) the hash table lock array, and 2) the 80 * arc list locks. 81 * 82 * Buffers do not have their own mutexes, rather they rely on the 83 * hash table mutexes for the bulk of their protection (i.e. most 84 * fields in the arc_buf_hdr_t are protected by these mutexes). 85 * 86 * buf_hash_find() returns the appropriate mutex (held) when it 87 * locates the requested buffer in the hash table. 
It returns 88 * NULL for the mutex if the buffer was not in the table. 89 * 90 * buf_hash_remove() expects the appropriate hash mutex to be 91 * already held before it is invoked. 92 * 93 * Each arc state also has a mutex which is used to protect the 94 * buffer list associated with the state. When attempting to 95 * obtain a hash table lock while holding an arc list lock you 96 * must use: mutex_tryenter() to avoid deadlock. Also note that 97 * the active state mutex must be held before the ghost state mutex. 98 * 99 * Arc buffers may have an associated eviction callback function. 100 * This function will be invoked prior to removing the buffer (e.g. 101 * in arc_do_user_evicts()). Note however that the data associated 102 * with the buffer may be evicted prior to the callback. The callback 103 * must be made with *no locks held* (to prevent deadlock). Additionally, 104 * the users of callbacks must ensure that their private data is 105 * protected from simultaneous callbacks from arc_buf_evict() 106 * and arc_do_user_evicts(). 107 * 108 * Note that the majority of the performance stats are manipulated 109 * with atomic operations. 
110 * 111 * The L2ARC uses the l2arc_buflist_mtx global mutex for the following: 112 * 113 * - L2ARC buflist creation 114 * - L2ARC buflist eviction 115 * - L2ARC write completion, which walks L2ARC buflists 116 * - ARC header destruction, as it removes from L2ARC buflists 117 * - ARC header release, as it removes from L2ARC buflists 118 */ 119 120 #include <sys/spa.h> 121 #include <sys/zio.h> 122 #include <sys/zio_checksum.h> 123 #include <sys/zfs_context.h> 124 #include <sys/arc.h> 125 #include <sys/refcount.h> 126 #include <sys/vdev.h> 127 #ifdef _KERNEL 128 #include <sys/vmsystm.h> 129 #include <vm/anon.h> 130 #include <sys/fs/swapnode.h> 131 #include <sys/dnlc.h> 132 #endif 133 #include <sys/callb.h> 134 #include <sys/kstat.h> 135 136 static kmutex_t arc_reclaim_thr_lock; 137 static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */ 138 static uint8_t arc_thread_exit; 139 140 extern int zfs_write_limit_shift; 141 extern uint64_t zfs_write_limit_max; 142 extern kmutex_t zfs_write_limit_lock; 143 144 #define ARC_REDUCE_DNLC_PERCENT 3 145 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT; 146 147 typedef enum arc_reclaim_strategy { 148 ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */ 149 ARC_RECLAIM_CONS /* Conservative reclaim strategy */ 150 } arc_reclaim_strategy_t; 151 152 /* number of seconds before growing cache again */ 153 static int arc_grow_retry = 60; 154 155 /* 156 * minimum lifespan of a prefetch block in clock ticks 157 * (initialized in arc_init()) 158 */ 159 static int arc_min_prefetch_lifespan; 160 161 static int arc_dead; 162 163 /* 164 * The arc has filled available memory and has now warmed up. 165 */ 166 static boolean_t arc_warm; 167 168 /* 169 * These tunables are for performance analysis. 
170 */ 171 uint64_t zfs_arc_max; 172 uint64_t zfs_arc_min; 173 uint64_t zfs_arc_meta_limit = 0; 174 int zfs_mdcomp_disable = 0; 175 176 /* 177 * Note that buffers can be in one of 6 states: 178 * ARC_anon - anonymous (discussed below) 179 * ARC_mru - recently used, currently cached 180 * ARC_mru_ghost - recentely used, no longer in cache 181 * ARC_mfu - frequently used, currently cached 182 * ARC_mfu_ghost - frequently used, no longer in cache 183 * ARC_l2c_only - exists in L2ARC but not other states 184 * When there are no active references to the buffer, they are 185 * are linked onto a list in one of these arc states. These are 186 * the only buffers that can be evicted or deleted. Within each 187 * state there are multiple lists, one for meta-data and one for 188 * non-meta-data. Meta-data (indirect blocks, blocks of dnodes, 189 * etc.) is tracked separately so that it can be managed more 190 * explicitly: favored over data, limited explicitly. 191 * 192 * Anonymous buffers are buffers that are not associated with 193 * a DVA. These are buffers that hold dirty block copies 194 * before they are written to stable storage. By definition, 195 * they are "ref'd" and are considered part of arc_mru 196 * that cannot be freed. Generally, they will aquire a DVA 197 * as they are written and migrate onto the arc_mru list. 198 * 199 * The ARC_l2c_only state is for buffers that are in the second 200 * level ARC but no longer in any of the ARC_m* lists. The second 201 * level ARC itself may also contain buffers that are in any of 202 * the ARC_m* states - meaning that a buffer can exist in two 203 * places. The reason for the ARC_l2c_only state is to keep the 204 * buffer header in the hash table, so that reads that hit the 205 * second level ARC benefit from these fast lookups. 
206 */ 207 208 typedef struct arc_state { 209 list_t arcs_list[ARC_BUFC_NUMTYPES]; /* list of evictable buffers */ 210 uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */ 211 uint64_t arcs_size; /* total amount of data in this state */ 212 kmutex_t arcs_mtx; 213 } arc_state_t; 214 215 /* The 6 states: */ 216 static arc_state_t ARC_anon; 217 static arc_state_t ARC_mru; 218 static arc_state_t ARC_mru_ghost; 219 static arc_state_t ARC_mfu; 220 static arc_state_t ARC_mfu_ghost; 221 static arc_state_t ARC_l2c_only; 222 223 typedef struct arc_stats { 224 kstat_named_t arcstat_hits; 225 kstat_named_t arcstat_misses; 226 kstat_named_t arcstat_demand_data_hits; 227 kstat_named_t arcstat_demand_data_misses; 228 kstat_named_t arcstat_demand_metadata_hits; 229 kstat_named_t arcstat_demand_metadata_misses; 230 kstat_named_t arcstat_prefetch_data_hits; 231 kstat_named_t arcstat_prefetch_data_misses; 232 kstat_named_t arcstat_prefetch_metadata_hits; 233 kstat_named_t arcstat_prefetch_metadata_misses; 234 kstat_named_t arcstat_mru_hits; 235 kstat_named_t arcstat_mru_ghost_hits; 236 kstat_named_t arcstat_mfu_hits; 237 kstat_named_t arcstat_mfu_ghost_hits; 238 kstat_named_t arcstat_deleted; 239 kstat_named_t arcstat_recycle_miss; 240 kstat_named_t arcstat_mutex_miss; 241 kstat_named_t arcstat_evict_skip; 242 kstat_named_t arcstat_hash_elements; 243 kstat_named_t arcstat_hash_elements_max; 244 kstat_named_t arcstat_hash_collisions; 245 kstat_named_t arcstat_hash_chains; 246 kstat_named_t arcstat_hash_chain_max; 247 kstat_named_t arcstat_p; 248 kstat_named_t arcstat_c; 249 kstat_named_t arcstat_c_min; 250 kstat_named_t arcstat_c_max; 251 kstat_named_t arcstat_size; 252 kstat_named_t arcstat_hdr_size; 253 kstat_named_t arcstat_l2_hits; 254 kstat_named_t arcstat_l2_misses; 255 kstat_named_t arcstat_l2_feeds; 256 kstat_named_t arcstat_l2_rw_clash; 257 kstat_named_t arcstat_l2_writes_sent; 258 kstat_named_t arcstat_l2_writes_done; 259 kstat_named_t 
arcstat_l2_writes_error; 260 kstat_named_t arcstat_l2_writes_hdr_miss; 261 kstat_named_t arcstat_l2_evict_lock_retry; 262 kstat_named_t arcstat_l2_evict_reading; 263 kstat_named_t arcstat_l2_free_on_write; 264 kstat_named_t arcstat_l2_abort_lowmem; 265 kstat_named_t arcstat_l2_cksum_bad; 266 kstat_named_t arcstat_l2_io_error; 267 kstat_named_t arcstat_l2_size; 268 kstat_named_t arcstat_l2_hdr_size; 269 kstat_named_t arcstat_memory_throttle_count; 270 } arc_stats_t; 271 272 static arc_stats_t arc_stats = { 273 { "hits", KSTAT_DATA_UINT64 }, 274 { "misses", KSTAT_DATA_UINT64 }, 275 { "demand_data_hits", KSTAT_DATA_UINT64 }, 276 { "demand_data_misses", KSTAT_DATA_UINT64 }, 277 { "demand_metadata_hits", KSTAT_DATA_UINT64 }, 278 { "demand_metadata_misses", KSTAT_DATA_UINT64 }, 279 { "prefetch_data_hits", KSTAT_DATA_UINT64 }, 280 { "prefetch_data_misses", KSTAT_DATA_UINT64 }, 281 { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, 282 { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, 283 { "mru_hits", KSTAT_DATA_UINT64 }, 284 { "mru_ghost_hits", KSTAT_DATA_UINT64 }, 285 { "mfu_hits", KSTAT_DATA_UINT64 }, 286 { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, 287 { "deleted", KSTAT_DATA_UINT64 }, 288 { "recycle_miss", KSTAT_DATA_UINT64 }, 289 { "mutex_miss", KSTAT_DATA_UINT64 }, 290 { "evict_skip", KSTAT_DATA_UINT64 }, 291 { "hash_elements", KSTAT_DATA_UINT64 }, 292 { "hash_elements_max", KSTAT_DATA_UINT64 }, 293 { "hash_collisions", KSTAT_DATA_UINT64 }, 294 { "hash_chains", KSTAT_DATA_UINT64 }, 295 { "hash_chain_max", KSTAT_DATA_UINT64 }, 296 { "p", KSTAT_DATA_UINT64 }, 297 { "c", KSTAT_DATA_UINT64 }, 298 { "c_min", KSTAT_DATA_UINT64 }, 299 { "c_max", KSTAT_DATA_UINT64 }, 300 { "size", KSTAT_DATA_UINT64 }, 301 { "hdr_size", KSTAT_DATA_UINT64 }, 302 { "l2_hits", KSTAT_DATA_UINT64 }, 303 { "l2_misses", KSTAT_DATA_UINT64 }, 304 { "l2_feeds", KSTAT_DATA_UINT64 }, 305 { "l2_rw_clash", KSTAT_DATA_UINT64 }, 306 { "l2_writes_sent", KSTAT_DATA_UINT64 }, 307 { "l2_writes_done", 
KSTAT_DATA_UINT64 }, 308 { "l2_writes_error", KSTAT_DATA_UINT64 }, 309 { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 }, 310 { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, 311 { "l2_evict_reading", KSTAT_DATA_UINT64 }, 312 { "l2_free_on_write", KSTAT_DATA_UINT64 }, 313 { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, 314 { "l2_cksum_bad", KSTAT_DATA_UINT64 }, 315 { "l2_io_error", KSTAT_DATA_UINT64 }, 316 { "l2_size", KSTAT_DATA_UINT64 }, 317 { "l2_hdr_size", KSTAT_DATA_UINT64 }, 318 { "memory_throttle_count", KSTAT_DATA_UINT64 } 319 }; 320 321 #define ARCSTAT(stat) (arc_stats.stat.value.ui64) 322 323 #define ARCSTAT_INCR(stat, val) \ 324 atomic_add_64(&arc_stats.stat.value.ui64, (val)); 325 326 #define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) 327 #define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) 328 329 #define ARCSTAT_MAX(stat, val) { \ 330 uint64_t m; \ 331 while ((val) > (m = arc_stats.stat.value.ui64) && \ 332 (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \ 333 continue; \ 334 } 335 336 #define ARCSTAT_MAXSTAT(stat) \ 337 ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64) 338 339 /* 340 * We define a macro to allow ARC hits/misses to be easily broken down by 341 * two separate conditions, giving a total of four different subtypes for 342 * each of hits and misses (so eight statistics total). 
343 */ 344 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \ 345 if (cond1) { \ 346 if (cond2) { \ 347 ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \ 348 } else { \ 349 ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \ 350 } \ 351 } else { \ 352 if (cond2) { \ 353 ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \ 354 } else { \ 355 ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\ 356 } \ 357 } 358 359 kstat_t *arc_ksp; 360 static arc_state_t *arc_anon; 361 static arc_state_t *arc_mru; 362 static arc_state_t *arc_mru_ghost; 363 static arc_state_t *arc_mfu; 364 static arc_state_t *arc_mfu_ghost; 365 static arc_state_t *arc_l2c_only; 366 367 /* 368 * There are several ARC variables that are critical to export as kstats -- 369 * but we don't want to have to grovel around in the kstat whenever we wish to 370 * manipulate them. For these variables, we therefore define them to be in 371 * terms of the statistic variable. This assures that we are not introducing 372 * the possibility of inconsistency by having shadow copies of the variables, 373 * while still allowing the code to be readable. 
374 */ 375 #define arc_size ARCSTAT(arcstat_size) /* actual total arc size */ 376 #define arc_p ARCSTAT(arcstat_p) /* target size of MRU */ 377 #define arc_c ARCSTAT(arcstat_c) /* target size of cache */ 378 #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ 379 #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ 380 381 static int arc_no_grow; /* Don't try to grow cache size */ 382 static uint64_t arc_tempreserve; 383 static uint64_t arc_meta_used; 384 static uint64_t arc_meta_limit; 385 static uint64_t arc_meta_max = 0; 386 387 typedef struct l2arc_buf_hdr l2arc_buf_hdr_t; 388 389 typedef struct arc_callback arc_callback_t; 390 391 struct arc_callback { 392 void *acb_private; 393 arc_done_func_t *acb_done; 394 arc_buf_t *acb_buf; 395 zio_t *acb_zio_dummy; 396 arc_callback_t *acb_next; 397 }; 398 399 typedef struct arc_write_callback arc_write_callback_t; 400 401 struct arc_write_callback { 402 void *awcb_private; 403 arc_done_func_t *awcb_ready; 404 arc_done_func_t *awcb_done; 405 arc_buf_t *awcb_buf; 406 }; 407 408 struct arc_buf_hdr { 409 /* protected by hash lock */ 410 dva_t b_dva; 411 uint64_t b_birth; 412 uint64_t b_cksum0; 413 414 kmutex_t b_freeze_lock; 415 zio_cksum_t *b_freeze_cksum; 416 417 arc_buf_hdr_t *b_hash_next; 418 arc_buf_t *b_buf; 419 uint32_t b_flags; 420 uint32_t b_datacnt; 421 422 arc_callback_t *b_acb; 423 kcondvar_t b_cv; 424 425 /* immutable */ 426 arc_buf_contents_t b_type; 427 uint64_t b_size; 428 spa_t *b_spa; 429 430 /* protected by arc state mutex */ 431 arc_state_t *b_state; 432 list_node_t b_arc_node; 433 434 /* updated atomically */ 435 clock_t b_arc_access; 436 437 /* self protecting */ 438 refcount_t b_refcnt; 439 440 l2arc_buf_hdr_t *b_l2hdr; 441 list_node_t b_l2node; 442 }; 443 444 static arc_buf_t *arc_eviction_list; 445 static kmutex_t arc_eviction_mtx; 446 static arc_buf_hdr_t arc_eviction_hdr; 447 static void arc_get_data_buf(arc_buf_t *buf); 448 static void arc_access(arc_buf_hdr_t 
*buf, kmutex_t *hash_lock); 449 static int arc_evict_needed(arc_buf_contents_t type); 450 static void arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes); 451 452 #define GHOST_STATE(state) \ 453 ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ 454 (state) == arc_l2c_only) 455 456 /* 457 * Private ARC flags. These flags are private ARC only flags that will show up 458 * in b_flags in the arc_hdr_buf_t. Some flags are publicly declared, and can 459 * be passed in as arc_flags in things like arc_read. However, these flags 460 * should never be passed and should only be set by ARC code. When adding new 461 * public flags, make sure not to smash the private ones. 462 */ 463 464 #define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed */ 465 #define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */ 466 #define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */ 467 #define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */ 468 #define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */ 469 #define ARC_INDIRECT (1 << 14) /* this is an indirect block */ 470 #define ARC_FREE_IN_PROGRESS (1 << 15) /* hdr about to be freed */ 471 #define ARC_L2_WRITING (1 << 16) /* L2ARC write in progress */ 472 #define ARC_L2_EVICTED (1 << 17) /* evicted during I/O */ 473 #define ARC_L2_WRITE_HEAD (1 << 18) /* head of write list */ 474 #define ARC_STORED (1 << 19) /* has been store()d to */ 475 476 #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE) 477 #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS) 478 #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR) 479 #define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ) 480 #define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE) 481 #define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS) 482 #define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_L2CACHE) 483 #define HDR_L2_READING(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS && \ 
484 (hdr)->b_l2hdr != NULL) 485 #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_L2_WRITING) 486 #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_L2_EVICTED) 487 #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_L2_WRITE_HEAD) 488 489 /* 490 * Other sizes 491 */ 492 493 #define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) 494 #define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t)) 495 496 /* 497 * Hash table routines 498 */ 499 500 #define HT_LOCK_PAD 64 501 502 struct ht_lock { 503 kmutex_t ht_lock; 504 #ifdef _KERNEL 505 unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; 506 #endif 507 }; 508 509 #define BUF_LOCKS 256 510 typedef struct buf_hash_table { 511 uint64_t ht_mask; 512 arc_buf_hdr_t **ht_table; 513 struct ht_lock ht_locks[BUF_LOCKS]; 514 } buf_hash_table_t; 515 516 static buf_hash_table_t buf_hash_table; 517 518 #define BUF_HASH_INDEX(spa, dva, birth) \ 519 (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) 520 #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) 521 #define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) 522 #define HDR_LOCK(buf) \ 523 (BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth))) 524 525 uint64_t zfs_crc64_table[256]; 526 527 /* 528 * Level 2 ARC 529 */ 530 531 #define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ 532 #define L2ARC_HEADROOM 4 /* num of writes */ 533 #define L2ARC_FEED_SECS 1 /* caching interval */ 534 535 #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) 536 #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) 537 538 /* 539 * L2ARC Performance Tunables 540 */ 541 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */ 542 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */ 543 uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */ 544 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ 545 boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs 
*/ 546 547 /* 548 * L2ARC Internals 549 */ 550 typedef struct l2arc_dev { 551 vdev_t *l2ad_vdev; /* vdev */ 552 spa_t *l2ad_spa; /* spa */ 553 uint64_t l2ad_hand; /* next write location */ 554 uint64_t l2ad_write; /* desired write size, bytes */ 555 uint64_t l2ad_boost; /* warmup write boost, bytes */ 556 uint64_t l2ad_start; /* first addr on device */ 557 uint64_t l2ad_end; /* last addr on device */ 558 uint64_t l2ad_evict; /* last addr eviction reached */ 559 boolean_t l2ad_first; /* first sweep through */ 560 list_t *l2ad_buflist; /* buffer list */ 561 list_node_t l2ad_node; /* device list node */ 562 } l2arc_dev_t; 563 564 static list_t L2ARC_dev_list; /* device list */ 565 static list_t *l2arc_dev_list; /* device list pointer */ 566 static kmutex_t l2arc_dev_mtx; /* device list mutex */ 567 static l2arc_dev_t *l2arc_dev_last; /* last device used */ 568 static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */ 569 static list_t L2ARC_free_on_write; /* free after write buf list */ 570 static list_t *l2arc_free_on_write; /* free after write list ptr */ 571 static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ 572 static uint64_t l2arc_ndev; /* number of devices */ 573 574 typedef struct l2arc_read_callback { 575 arc_buf_t *l2rcb_buf; /* read buffer */ 576 spa_t *l2rcb_spa; /* spa */ 577 blkptr_t l2rcb_bp; /* original blkptr */ 578 zbookmark_t l2rcb_zb; /* original bookmark */ 579 int l2rcb_flags; /* original flags */ 580 } l2arc_read_callback_t; 581 582 typedef struct l2arc_write_callback { 583 l2arc_dev_t *l2wcb_dev; /* device info */ 584 arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ 585 } l2arc_write_callback_t; 586 587 struct l2arc_buf_hdr { 588 /* protected by arc_buf_hdr mutex */ 589 l2arc_dev_t *b_dev; /* L2ARC device */ 590 daddr_t b_daddr; /* disk address, offset byte */ 591 }; 592 593 typedef struct l2arc_data_free { 594 /* protected by l2arc_free_on_write_mtx */ 595 void *l2df_data; 596 size_t l2df_size; 597 void (*l2df_func)(void 
*, size_t); 598 list_node_t l2df_list_node; 599 } l2arc_data_free_t; 600 601 static kmutex_t l2arc_feed_thr_lock; 602 static kcondvar_t l2arc_feed_thr_cv; 603 static uint8_t l2arc_thread_exit; 604 605 static void l2arc_read_done(zio_t *zio); 606 static void l2arc_hdr_stat_add(void); 607 static void l2arc_hdr_stat_remove(void); 608 609 static uint64_t 610 buf_hash(spa_t *spa, const dva_t *dva, uint64_t birth) 611 { 612 uintptr_t spav = (uintptr_t)spa; 613 uint8_t *vdva = (uint8_t *)dva; 614 uint64_t crc = -1ULL; 615 int i; 616 617 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 618 619 for (i = 0; i < sizeof (dva_t); i++) 620 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF]; 621 622 crc ^= (spav>>8) ^ birth; 623 624 return (crc); 625 } 626 627 #define BUF_EMPTY(buf) \ 628 ((buf)->b_dva.dva_word[0] == 0 && \ 629 (buf)->b_dva.dva_word[1] == 0 && \ 630 (buf)->b_birth == 0) 631 632 #define BUF_EQUAL(spa, dva, birth, buf) \ 633 ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ 634 ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ 635 ((buf)->b_birth == birth) && ((buf)->b_spa == spa) 636 637 static arc_buf_hdr_t * 638 buf_hash_find(spa_t *spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp) 639 { 640 uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); 641 kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 642 arc_buf_hdr_t *buf; 643 644 mutex_enter(hash_lock); 645 for (buf = buf_hash_table.ht_table[idx]; buf != NULL; 646 buf = buf->b_hash_next) { 647 if (BUF_EQUAL(spa, dva, birth, buf)) { 648 *lockp = hash_lock; 649 return (buf); 650 } 651 } 652 mutex_exit(hash_lock); 653 *lockp = NULL; 654 return (NULL); 655 } 656 657 /* 658 * Insert an entry into the hash table. If there is already an element 659 * equal to elem in the hash table, then the already existing element 660 * will be returned and the new element will not be inserted. 661 * Otherwise returns NULL. 
662 */ 663 static arc_buf_hdr_t * 664 buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp) 665 { 666 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); 667 kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 668 arc_buf_hdr_t *fbuf; 669 uint32_t i; 670 671 ASSERT(!HDR_IN_HASH_TABLE(buf)); 672 *lockp = hash_lock; 673 mutex_enter(hash_lock); 674 for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL; 675 fbuf = fbuf->b_hash_next, i++) { 676 if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf)) 677 return (fbuf); 678 } 679 680 buf->b_hash_next = buf_hash_table.ht_table[idx]; 681 buf_hash_table.ht_table[idx] = buf; 682 buf->b_flags |= ARC_IN_HASH_TABLE; 683 684 /* collect some hash table performance data */ 685 if (i > 0) { 686 ARCSTAT_BUMP(arcstat_hash_collisions); 687 if (i == 1) 688 ARCSTAT_BUMP(arcstat_hash_chains); 689 690 ARCSTAT_MAX(arcstat_hash_chain_max, i); 691 } 692 693 ARCSTAT_BUMP(arcstat_hash_elements); 694 ARCSTAT_MAXSTAT(arcstat_hash_elements); 695 696 return (NULL); 697 } 698 699 static void 700 buf_hash_remove(arc_buf_hdr_t *buf) 701 { 702 arc_buf_hdr_t *fbuf, **bufp; 703 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); 704 705 ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); 706 ASSERT(HDR_IN_HASH_TABLE(buf)); 707 708 bufp = &buf_hash_table.ht_table[idx]; 709 while ((fbuf = *bufp) != buf) { 710 ASSERT(fbuf != NULL); 711 bufp = &fbuf->b_hash_next; 712 } 713 *bufp = buf->b_hash_next; 714 buf->b_hash_next = NULL; 715 buf->b_flags &= ~ARC_IN_HASH_TABLE; 716 717 /* collect some hash table performance data */ 718 ARCSTAT_BUMPDOWN(arcstat_hash_elements); 719 720 if (buf_hash_table.ht_table[idx] && 721 buf_hash_table.ht_table[idx]->b_hash_next == NULL) 722 ARCSTAT_BUMPDOWN(arcstat_hash_chains); 723 } 724 725 /* 726 * Global data structures and functions for the buf kmem cache. 
727 */ 728 static kmem_cache_t *hdr_cache; 729 static kmem_cache_t *buf_cache; 730 731 static void 732 buf_fini(void) 733 { 734 int i; 735 736 kmem_free(buf_hash_table.ht_table, 737 (buf_hash_table.ht_mask + 1) * sizeof (void *)); 738 for (i = 0; i < BUF_LOCKS; i++) 739 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); 740 kmem_cache_destroy(hdr_cache); 741 kmem_cache_destroy(buf_cache); 742 } 743 744 /* 745 * Constructor callback - called when the cache is empty 746 * and a new buf is requested. 747 */ 748 /* ARGSUSED */ 749 static int 750 hdr_cons(void *vbuf, void *unused, int kmflag) 751 { 752 arc_buf_hdr_t *buf = vbuf; 753 754 bzero(buf, sizeof (arc_buf_hdr_t)); 755 refcount_create(&buf->b_refcnt); 756 cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL); 757 mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); 758 759 ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE); 760 return (0); 761 } 762 763 /* ARGSUSED */ 764 static int 765 buf_cons(void *vbuf, void *unused, int kmflag) 766 { 767 arc_buf_t *buf = vbuf; 768 769 bzero(buf, sizeof (arc_buf_t)); 770 rw_init(&buf->b_lock, NULL, RW_DEFAULT, NULL); 771 return (0); 772 } 773 774 /* 775 * Destructor callback - called when a cached buf is 776 * no longer required. 777 */ 778 /* ARGSUSED */ 779 static void 780 hdr_dest(void *vbuf, void *unused) 781 { 782 arc_buf_hdr_t *buf = vbuf; 783 784 refcount_destroy(&buf->b_refcnt); 785 cv_destroy(&buf->b_cv); 786 mutex_destroy(&buf->b_freeze_lock); 787 788 ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE); 789 } 790 791 /* ARGSUSED */ 792 static void 793 buf_dest(void *vbuf, void *unused) 794 { 795 arc_buf_t *buf = vbuf; 796 797 rw_destroy(&buf->b_lock); 798 } 799 800 /* 801 * Reclaim callback -- invoked when memory is low. 802 */ 803 /* ARGSUSED */ 804 static void 805 hdr_recl(void *unused) 806 { 807 dprintf("hdr_recl called\n"); 808 /* 809 * umem calls the reclaim func when we destroy the buf cache, 810 * which is after we do arc_fini(). 
811 */ 812 if (!arc_dead) 813 cv_signal(&arc_reclaim_thr_cv); 814 } 815 816 static void 817 buf_init(void) 818 { 819 uint64_t *ct; 820 uint64_t hsize = 1ULL << 12; 821 int i, j; 822 823 /* 824 * The hash table is big enough to fill all of physical memory 825 * with an average 64K block size. The table will take up 826 * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers). 827 */ 828 while (hsize * 65536 < physmem * PAGESIZE) 829 hsize <<= 1; 830 retry: 831 buf_hash_table.ht_mask = hsize - 1; 832 buf_hash_table.ht_table = 833 kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); 834 if (buf_hash_table.ht_table == NULL) { 835 ASSERT(hsize > (1ULL << 8)); 836 hsize >>= 1; 837 goto retry; 838 } 839 840 hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t), 841 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0); 842 buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 843 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); 844 845 for (i = 0; i < 256; i++) 846 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) 847 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); 848 849 for (i = 0; i < BUF_LOCKS; i++) { 850 mutex_init(&buf_hash_table.ht_locks[i].ht_lock, 851 NULL, MUTEX_DEFAULT, NULL); 852 } 853 } 854 855 #define ARC_MINTIME (hz>>4) /* 62 ms */ 856 857 static void 858 arc_cksum_verify(arc_buf_t *buf) 859 { 860 zio_cksum_t zc; 861 862 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 863 return; 864 865 mutex_enter(&buf->b_hdr->b_freeze_lock); 866 if (buf->b_hdr->b_freeze_cksum == NULL || 867 (buf->b_hdr->b_flags & ARC_IO_ERROR)) { 868 mutex_exit(&buf->b_hdr->b_freeze_lock); 869 return; 870 } 871 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); 872 if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc)) 873 panic("buffer modified while frozen!"); 874 mutex_exit(&buf->b_hdr->b_freeze_lock); 875 } 876 877 static int 878 arc_cksum_equal(arc_buf_t *buf) 879 { 880 zio_cksum_t zc; 881 int equal; 882 883 mutex_enter(&buf->b_hdr->b_freeze_lock); 884 
fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); 885 equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc); 886 mutex_exit(&buf->b_hdr->b_freeze_lock); 887 888 return (equal); 889 } 890 891 static void 892 arc_cksum_compute(arc_buf_t *buf, boolean_t force) 893 { 894 if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY)) 895 return; 896 897 mutex_enter(&buf->b_hdr->b_freeze_lock); 898 if (buf->b_hdr->b_freeze_cksum != NULL) { 899 mutex_exit(&buf->b_hdr->b_freeze_lock); 900 return; 901 } 902 buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); 903 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, 904 buf->b_hdr->b_freeze_cksum); 905 mutex_exit(&buf->b_hdr->b_freeze_lock); 906 } 907 908 void 909 arc_buf_thaw(arc_buf_t *buf) 910 { 911 if (zfs_flags & ZFS_DEBUG_MODIFY) { 912 if (buf->b_hdr->b_state != arc_anon) 913 panic("modifying non-anon buffer!"); 914 if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS) 915 panic("modifying buffer while i/o in progress!"); 916 arc_cksum_verify(buf); 917 } 918 919 mutex_enter(&buf->b_hdr->b_freeze_lock); 920 if (buf->b_hdr->b_freeze_cksum != NULL) { 921