Home | History | Annotate | Download | only in zfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*
     27  * DVA-based Adjustable Replacement Cache
     28  *
     29  * While much of the theory of operation used here is
     30  * based on the self-tuning, low overhead replacement cache
     31  * presented by Megiddo and Modha at FAST 2003, there are some
     32  * significant differences:
     33  *
     34  * 1. The Megiddo and Modha model assumes any page is evictable.
     35  * Pages in its cache cannot be "locked" into memory.  This makes
     36  * the eviction algorithm simple: evict the last page in the list.
     37  * This also makes the performance characteristics easy to reason
     38  * about.  Our cache is not so simple.  At any given moment, some
     39  * subset of the blocks in the cache are un-evictable because we
     40  * have handed out a reference to them.  Blocks are only evictable
     41  * when there are no external references active.  This makes
     42  * eviction far more problematic:  we choose to evict the evictable
     43  * blocks that are the "lowest" in the list.
     44  *
     45  * There are times when it is not possible to evict the requested
     46  * space.  In these circumstances we are unable to adjust the cache
     47  * size.  To prevent the cache growing unbounded at these times we
     48  * implement a "cache throttle" that slows the flow of new data
     49  * into the cache until we can make space available.
     50  *
     51  * 2. The Megiddo and Modha model assumes a fixed cache size.
     52  * Pages are evicted when the cache is full and there is a cache
     53  * miss.  Our model has a variable sized cache.  It grows with
     54  * high use, but also tries to react to memory pressure from the
     55  * operating system: decreasing its size when system memory is
     56  * tight.
     57  *
     58  * 3. The Megiddo and Modha model assumes a fixed page size. All
     59  * elements of the cache are therefore exactly the same size.  So
     60  * when adjusting the cache size following a cache miss, it's simply
     61  * a matter of choosing a single page to evict.  In our model, we
     62  * have variable sized cache blocks (ranging from 512 bytes to
     63  * 128K bytes).  We therefore choose a set of blocks to evict to make
     64  * space for a cache miss that approximates as closely as possible
     65  * the space used by the new block.
     66  *
     67  * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
     68  * by N. Megiddo & D. Modha, FAST 2003
     69  */
     70 
     71 /*
     72  * The locking model:
     73  *
     74  * A new reference to a cache buffer can be obtained in two
     75  * ways: 1) via a hash table lookup using the DVA as a key,
     76  * or 2) via one of the ARC lists.  The arc_read() interface
     77  * uses method 1, while the internal arc algorithms for
     78  * adjusting the cache use method 2.  We therefore provide two
     79  * types of locks: 1) the hash table lock array, and 2) the
     80  * arc list locks.
     81  *
     82  * Buffers do not have their own mutexes, rather they rely on the
     83  * hash table mutexes for the bulk of their protection (i.e. most
     84  * fields in the arc_buf_hdr_t are protected by these mutexes).
     85  *
     86  * buf_hash_find() returns the appropriate mutex (held) when it
     87  * locates the requested buffer in the hash table.  It returns
     88  * NULL for the mutex if the buffer was not in the table.
     89  *
     90  * buf_hash_remove() expects the appropriate hash mutex to be
     91  * already held before it is invoked.
     92  *
     93  * Each arc state also has a mutex which is used to protect the
     94  * buffer list associated with the state.  When attempting to
     95  * obtain a hash table lock while holding an arc list lock you
     96  * must use: mutex_tryenter() to avoid deadlock.  Also note that
     97  * the active state mutex must be held before the ghost state mutex.
     98  *
     99  * Arc buffers may have an associated eviction callback function.
    100  * This function will be invoked prior to removing the buffer (e.g.
    101  * in arc_do_user_evicts()).  Note however that the data associated
    102  * with the buffer may be evicted prior to the callback.  The callback
    103  * must be made with *no locks held* (to prevent deadlock).  Additionally,
    104  * the users of callbacks must ensure that their private data is
    105  * protected from simultaneous callbacks from arc_buf_evict()
    106  * and arc_do_user_evicts().
    107  *
    108  * Note that the majority of the performance stats are manipulated
    109  * with atomic operations.
    110  *
    111  * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
    112  *
    113  *	- L2ARC buflist creation
    114  *	- L2ARC buflist eviction
    115  *	- L2ARC write completion, which walks L2ARC buflists
    116  *	- ARC header destruction, as it removes from L2ARC buflists
    117  *	- ARC header release, as it removes from L2ARC buflists
    118  */
    119 
    120 #include <sys/spa.h>
    121 #include <sys/zio.h>
    122 #include <sys/zio_checksum.h>
    123 #include <sys/zfs_context.h>
    124 #include <sys/arc.h>
    125 #include <sys/refcount.h>
    126 #include <sys/vdev.h>
    127 #ifdef _KERNEL
    128 #include <sys/vmsystm.h>
    129 #include <vm/anon.h>
    130 #include <sys/fs/swapnode.h>
    131 #include <sys/dnlc.h>
    132 #endif
    133 #include <sys/callb.h>
    134 #include <sys/kstat.h>
    135 
        /*
         * Reclaim thread state.  arc_reclaim_thr_cv is signalled by hdr_recl()
         * when the kmem layer reports that memory is low.
         * NOTE(review): arc_reclaim_thr_lock is presumably the mutex paired
         * with that cv and arc_thread_exit the thread's shutdown flag; their
         * users are outside this excerpt -- confirm.
         */
    136 static kmutex_t		arc_reclaim_thr_lock;
    137 static kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
    138 static uint8_t		arc_thread_exit;
    139 
        /* Write-limit tunables and their lock; defined outside this file. */
    140 extern int zfs_write_limit_shift;
    141 extern uint64_t zfs_write_limit_max;
    142 extern kmutex_t zfs_write_limit_lock;
    143 
        /*
         * Default value of the arc_reduce_dnlc_percent tunable.
         * NOTE(review): presumably the share of the DNLC purged per reclaim
         * pass -- confirm against the reclaim code.
         */
    144 #define	ARC_REDUCE_DNLC_PERCENT	3
    145 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
    146 
        /*
         * Selects how hard a reclaim pass should work.
         * NOTE(review): the code that picks between the two values is outside
         * this excerpt.
         */
    147 typedef enum arc_reclaim_strategy {
    148 	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
    149 	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
    150 } arc_reclaim_strategy_t;
    151 
    152 /* number of seconds before growing cache again */
    153 static int		arc_grow_retry = 60;
    154 
    155 /*
    156  * minimum lifespan of a prefetch block in clock ticks
    157  * (initialized in arc_init())
    158  */
    159 static int		arc_min_prefetch_lifespan;
    160 
        /*
         * When non-zero, hdr_recl() no longer signals the reclaim thread.
         * NOTE(review): presumably set by arc_fini() -- confirm.
         */
    161 static int arc_dead;
    162 
    163 /*
    164  * The arc has filled available memory and has now warmed up.
    165  */
    166 static boolean_t arc_warm;
    167 
    168 /*
    169  * These tunables are for performance analysis.
        * All of them are global (non-static) so they can be set from outside
        * this file.
        * NOTE(review): how each one is consumed is outside this excerpt.
    170  */
    171 uint64_t zfs_arc_max;
    172 uint64_t zfs_arc_min;
    173 uint64_t zfs_arc_meta_limit = 0;
    174 int zfs_mdcomp_disable = 0;
    175 
    176 /*
    177  * Note that buffers can be in one of 6 states:
    178  *	ARC_anon	- anonymous (discussed below)
    179  *	ARC_mru		- recently used, currently cached
    180  *	ARC_mru_ghost	- recently used, no longer in cache
    181  *	ARC_mfu		- frequently used, currently cached
    182  *	ARC_mfu_ghost	- frequently used, no longer in cache
    183  *	ARC_l2c_only	- exists in L2ARC but not other states
    184  * When there are no active references to a buffer, it is
    185  * linked onto a list in one of these arc states.  These are
    186  * the only buffers that can be evicted or deleted.  Within each
    187  * state there are multiple lists, one for meta-data and one for
    188  * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
    189  * etc.) is tracked separately so that it can be managed more
    190  * explicitly: favored over data, limited explicitly.
    191  *
    192  * Anonymous buffers are buffers that are not associated with
    193  * a DVA.  These are buffers that hold dirty block copies
    194  * before they are written to stable storage.  By definition,
    195  * they are "ref'd" and are considered part of arc_mru
    196  * that cannot be freed.  Generally, they will acquire a DVA
    197  * as they are written and migrate onto the arc_mru list.
    198  *
    199  * The ARC_l2c_only state is for buffers that are in the second
    200  * level ARC but no longer in any of the ARC_m* lists.  The second
    201  * level ARC itself may also contain buffers that are in any of
    202  * the ARC_m* states - meaning that a buffer can exist in two
    203  * places.  The reason for the ARC_l2c_only state is to keep the
    204  * buffer header in the hash table, so that reads that hit the
    205  * second level ARC benefit from these fast lookups.
    206  */
    207 
        /*
         * Bookkeeping for one ARC state.  The list and evictable-size arrays
         * are indexed by buffer contents type (ARC_BUFC_NUMTYPES entries).
         */
    208 typedef struct arc_state {
    209 	list_t	arcs_list[ARC_BUFC_NUMTYPES];	/* list of evictable buffers */
    210 	uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];	/* amount of evictable data */
    211 	uint64_t arcs_size;	/* total amount of data in this state */
    212 	kmutex_t arcs_mtx;	/* protects this state's buffer lists */
    213 } arc_state_t;
    214 
    215 /* The 6 states: */
        /*
         * Storage for the state objects themselves.
         * NOTE(review): the lower-case arc_* pointers declared later in this
         * file presumably point at these -- confirm in arc_init().
         */
    216 static arc_state_t ARC_anon;
    217 static arc_state_t ARC_mru;
    218 static arc_state_t ARC_mru_ghost;
    219 static arc_state_t ARC_mfu;
    220 static arc_state_t ARC_mfu_ghost;
    221 static arc_state_t ARC_l2c_only;
    222 
        /*
         * Named kstat counters for the ARC.  The arc_stats object is
         * initialised positionally, so the member order here must stay in
         * step with the order of the entries in that initializer.
         */
    223 typedef struct arc_stats {
    224 	kstat_named_t arcstat_hits;
    225 	kstat_named_t arcstat_misses;
    226 	kstat_named_t arcstat_demand_data_hits;
    227 	kstat_named_t arcstat_demand_data_misses;
    228 	kstat_named_t arcstat_demand_metadata_hits;
    229 	kstat_named_t arcstat_demand_metadata_misses;
    230 	kstat_named_t arcstat_prefetch_data_hits;
    231 	kstat_named_t arcstat_prefetch_data_misses;
    232 	kstat_named_t arcstat_prefetch_metadata_hits;
    233 	kstat_named_t arcstat_prefetch_metadata_misses;
    234 	kstat_named_t arcstat_mru_hits;
    235 	kstat_named_t arcstat_mru_ghost_hits;
    236 	kstat_named_t arcstat_mfu_hits;
    237 	kstat_named_t arcstat_mfu_ghost_hits;
    238 	kstat_named_t arcstat_deleted;
    239 	kstat_named_t arcstat_recycle_miss;
    240 	kstat_named_t arcstat_mutex_miss;
    241 	kstat_named_t arcstat_evict_skip;
    242 	kstat_named_t arcstat_hash_elements;
    243 	kstat_named_t arcstat_hash_elements_max;
    244 	kstat_named_t arcstat_hash_collisions;
    245 	kstat_named_t arcstat_hash_chains;
    246 	kstat_named_t arcstat_hash_chain_max;
    247 	kstat_named_t arcstat_p;
    248 	kstat_named_t arcstat_c;
    249 	kstat_named_t arcstat_c_min;
    250 	kstat_named_t arcstat_c_max;
    251 	kstat_named_t arcstat_size;
    252 	kstat_named_t arcstat_hdr_size;
    253 	kstat_named_t arcstat_l2_hits;
    254 	kstat_named_t arcstat_l2_misses;
    255 	kstat_named_t arcstat_l2_feeds;
    256 	kstat_named_t arcstat_l2_rw_clash;
    257 	kstat_named_t arcstat_l2_writes_sent;
    258 	kstat_named_t arcstat_l2_writes_done;
    259 	kstat_named_t arcstat_l2_writes_error;
    260 	kstat_named_t arcstat_l2_writes_hdr_miss;
    261 	kstat_named_t arcstat_l2_evict_lock_retry;
    262 	kstat_named_t arcstat_l2_evict_reading;
    263 	kstat_named_t arcstat_l2_free_on_write;
    264 	kstat_named_t arcstat_l2_abort_lowmem;
    265 	kstat_named_t arcstat_l2_cksum_bad;
    266 	kstat_named_t arcstat_l2_io_error;
    267 	kstat_named_t arcstat_l2_size;
    268 	kstat_named_t arcstat_l2_hdr_size;
    269 	kstat_named_t arcstat_memory_throttle_count;
    270 } arc_stats_t;
    271 
        /*
         * The statistics object.  Each entry supplies the exported name and
         * data type of the arc_stats_t member in the same position, so a new
         * statistic must be added at the same place in both.
         */
    272 static arc_stats_t arc_stats = {
    273 	{ "hits",			KSTAT_DATA_UINT64 },
    274 	{ "misses",			KSTAT_DATA_UINT64 },
    275 	{ "demand_data_hits",		KSTAT_DATA_UINT64 },
    276 	{ "demand_data_misses",		KSTAT_DATA_UINT64 },
    277 	{ "demand_metadata_hits",	KSTAT_DATA_UINT64 },
    278 	{ "demand_metadata_misses",	KSTAT_DATA_UINT64 },
    279 	{ "prefetch_data_hits",		KSTAT_DATA_UINT64 },
    280 	{ "prefetch_data_misses",	KSTAT_DATA_UINT64 },
    281 	{ "prefetch_metadata_hits",	KSTAT_DATA_UINT64 },
    282 	{ "prefetch_metadata_misses",	KSTAT_DATA_UINT64 },
    283 	{ "mru_hits",			KSTAT_DATA_UINT64 },
    284 	{ "mru_ghost_hits",		KSTAT_DATA_UINT64 },
    285 	{ "mfu_hits",			KSTAT_DATA_UINT64 },
    286 	{ "mfu_ghost_hits",		KSTAT_DATA_UINT64 },
    287 	{ "deleted",			KSTAT_DATA_UINT64 },
    288 	{ "recycle_miss",		KSTAT_DATA_UINT64 },
    289 	{ "mutex_miss",			KSTAT_DATA_UINT64 },
    290 	{ "evict_skip",			KSTAT_DATA_UINT64 },
    291 	{ "hash_elements",		KSTAT_DATA_UINT64 },
    292 	{ "hash_elements_max",		KSTAT_DATA_UINT64 },
    293 	{ "hash_collisions",		KSTAT_DATA_UINT64 },
    294 	{ "hash_chains",		KSTAT_DATA_UINT64 },
    295 	{ "hash_chain_max",		KSTAT_DATA_UINT64 },
    296 	{ "p",				KSTAT_DATA_UINT64 },
    297 	{ "c",				KSTAT_DATA_UINT64 },
    298 	{ "c_min",			KSTAT_DATA_UINT64 },
    299 	{ "c_max",			KSTAT_DATA_UINT64 },
    300 	{ "size",			KSTAT_DATA_UINT64 },
    301 	{ "hdr_size",			KSTAT_DATA_UINT64 },
    302 	{ "l2_hits",			KSTAT_DATA_UINT64 },
    303 	{ "l2_misses",			KSTAT_DATA_UINT64 },
    304 	{ "l2_feeds",			KSTAT_DATA_UINT64 },
    305 	{ "l2_rw_clash",		KSTAT_DATA_UINT64 },
    306 	{ "l2_writes_sent",		KSTAT_DATA_UINT64 },
    307 	{ "l2_writes_done",		KSTAT_DATA_UINT64 },
    308 	{ "l2_writes_error",		KSTAT_DATA_UINT64 },
    309 	{ "l2_writes_hdr_miss",		KSTAT_DATA_UINT64 },
    310 	{ "l2_evict_lock_retry",	KSTAT_DATA_UINT64 },
    311 	{ "l2_evict_reading",		KSTAT_DATA_UINT64 },
    312 	{ "l2_free_on_write",		KSTAT_DATA_UINT64 },
    313 	{ "l2_abort_lowmem",		KSTAT_DATA_UINT64 },
    314 	{ "l2_cksum_bad",		KSTAT_DATA_UINT64 },
    315 	{ "l2_io_error",		KSTAT_DATA_UINT64 },
    316 	{ "l2_size",			KSTAT_DATA_UINT64 },
    317 	{ "l2_hdr_size",		KSTAT_DATA_UINT64 },
    318 	{ "memory_throttle_count",	KSTAT_DATA_UINT64 }
    319 };
    320 
        /*
         * Accessors for the 64-bit value of a named ARC statistic.
         *
         *	ARCSTAT(stat)		the value itself (usable as an lvalue)
         *	ARCSTAT_INCR(stat, val)	atomically add val, which may be negative
         *	ARCSTAT_BUMP(stat)	atomically add one
         *	ARCSTAT_BUMPDOWN(stat)	atomically subtract one
         *
         * ARCSTAT_INCR() and the two BUMP macros expand to a single expression
         * without a trailing semicolon; the caller supplies it.  That keeps
         * them safe as the unbraced body of an if/else.
         */
    321 #define	ARCSTAT(stat)	(arc_stats.stat.value.ui64)
    322 
    323 #define	ARCSTAT_INCR(stat, val) \
    324 	atomic_add_64(&arc_stats.stat.value.ui64, (val))
    325 
    326 #define	ARCSTAT_BUMP(stat) 	ARCSTAT_INCR(stat, 1)
    327 #define	ARCSTAT_BUMPDOWN(stat)	ARCSTAT_INCR(stat, -1)
    328 
        /*
         * Raise stat to val if val is larger.  The compare-and-swap is retried
         * until it succeeds or stat is already >= val.  val is evaluated more
         * than once, so it must not have side effects.  The expansion is a
         * brace-enclosed block, not an expression.
         */
    329 #define	ARCSTAT_MAX(stat, val) {					\
    330 	uint64_t m;							\
    331 	while ((val) > (m = arc_stats.stat.value.ui64) &&		\
    332 	    (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))	\
    333 		continue;						\
    334 }
    335 
        /* Fold the current value of stat into its companion stat##_max. */
    336 #define	ARCSTAT_MAXSTAT(stat) \
    337 	ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
    338 
    339 /*
    340  * We define a macro to allow ARC hits/misses to be easily broken down by
    341  * two separate conditions, giving a total of four different subtypes for
    342  * each of hits and misses (so eight statistics total).
        *
        * The statistic bumped is arcstat_<a>_<b>_<stat>, where <a> is stat1 if
        * cond1 is true and notstat1 otherwise, and <b> is stat2 if cond2 is
        * true and notstat2 otherwise.
        *
        * The expansion is a bare if/else statement, not a do { } while (0)
        * block.  Do not use it as the unbraced body of another if that has
        * its own else.
    343  */
    344 #define	ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
    345 	if (cond1) {							\
    346 		if (cond2) {						\
    347 			ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
    348 		} else {						\
    349 			ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
    350 		}							\
    351 	} else {							\
    352 		if (cond2) {						\
    353 			ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
    354 		} else {						\
    355 			ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
    356 		}							\
    357 	}
    358 
        /*
         * kstat handle for the ARC and pointers to the six states.
         * GHOST_STATE() compares a header's state against these pointers.
         * NOTE(review): presumably assigned in arc_init() to the ARC_* objects
         * -- confirm.
         */
    359 kstat_t			*arc_ksp;
    360 static arc_state_t 	*arc_anon;
    361 static arc_state_t	*arc_mru;
    362 static arc_state_t	*arc_mru_ghost;
    363 static arc_state_t	*arc_mfu;
    364 static arc_state_t	*arc_mfu_ghost;
    365 static arc_state_t	*arc_l2c_only;
    366 
    367 /*
    368  * There are several ARC variables that are critical to export as kstats --
    369  * but we don't want to have to grovel around in the kstat whenever we wish to
    370  * manipulate them.  For these variables, we therefore define them to be in
    371  * terms of the statistic variable.  This assures that we are not introducing
    372  * the possibility of inconsistency by having shadow copies of the variables,
    373  * while still allowing the code to be readable.
        *
        * Each name expands through ARCSTAT() to the ui64 field of the matching
        * kstat, so it can be read and assigned like an ordinary uint64_t.
    374  */
    375 #define	arc_size	ARCSTAT(arcstat_size)	/* actual total arc size */
    376 #define	arc_p		ARCSTAT(arcstat_p)	/* target size of MRU */
    377 #define	arc_c		ARCSTAT(arcstat_c)	/* target size of cache */
    378 #define	arc_c_min	ARCSTAT(arcstat_c_min)	/* min target cache size */
    379 #define	arc_c_max	ARCSTAT(arcstat_c_max)	/* max target cache size */
    380 
        /*
         * ARC-wide control and accounting variables.
         * NOTE(review): the users of arc_tempreserve and the arc_meta_*
         * counters are outside this excerpt; by name they track temporarily
         * reserved space and metadata usage/limit/high-water mark -- confirm.
         */
    381 static int		arc_no_grow;	/* Don't try to grow cache size */
    382 static uint64_t		arc_tempreserve;
    383 static uint64_t		arc_meta_used;
    384 static uint64_t		arc_meta_limit;
    385 static uint64_t		arc_meta_max = 0;
    386 
        /* Forward declaration; struct l2arc_buf_hdr is defined further down. */
    387 typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
    388 
    389 typedef struct arc_callback arc_callback_t;
    390 
        /*
         * One registered callback.  Entries are chained through acb_next and
         * arc_buf_hdr_t keeps the head of the chain in b_acb.
         * NOTE(review): presumably one entry per caller waiting on a read of
         * the header's block -- confirm against arc_read().
         */
    391 struct arc_callback {
    392 	void			*acb_private;
    393 	arc_done_func_t		*acb_done;
    394 	arc_buf_t		*acb_buf;
    395 	zio_t			*acb_zio_dummy;
    396 	arc_callback_t		*acb_next;
    397 };
    398 
    399 typedef struct arc_write_callback arc_write_callback_t;
    400 
        /*
         * Callback record carrying a "ready" and a "done" function plus the
         * buffer and private argument they apply to.
         * NOTE(review): presumably attached to a write zio by arc_write() --
         * confirm; the users are outside this excerpt.
         */
    401 struct arc_write_callback {
    402 	void		*awcb_private;
    403 	arc_done_func_t	*awcb_ready;
    404 	arc_done_func_t	*awcb_done;
    405 	arc_buf_t	*awcb_buf;
    406 };
    407 
        /*
         * Header for a cached block.  The hash table routines in this file
         * identify a header by (b_spa, b_dva, b_birth) and chain colliding
         * headers through b_hash_next.  The section comments inside the struct
         * say what protects each group of fields.
         */
    408 struct arc_buf_hdr {
    409 	/* protected by hash lock */
    410 	dva_t			b_dva;
    411 	uint64_t		b_birth;
    412 	uint64_t		b_cksum0;
    413 
    414 	kmutex_t		b_freeze_lock;
    415 	zio_cksum_t		*b_freeze_cksum;
    416 
    417 	arc_buf_hdr_t		*b_hash_next;	/* next hdr in hash chain */
    418 	arc_buf_t		*b_buf;
    419 	uint32_t		b_flags;	/* ARC_* flag bits */
    420 	uint32_t		b_datacnt;
    421 
    422 	arc_callback_t		*b_acb;		/* head of callback chain */
    423 	kcondvar_t		b_cv;
    424 
    425 	/* immutable */
    426 	arc_buf_contents_t	b_type;
    427 	uint64_t		b_size;
    428 	spa_t			*b_spa;
    429 
    430 	/* protected by arc state mutex */
    431 	arc_state_t		*b_state;
    432 	list_node_t		b_arc_node;
    433 
    434 	/* updated atomically */
    435 	clock_t			b_arc_access;
    436 
    437 	/* self protecting */
    438 	refcount_t		b_refcnt;
    439 
        	/* L2ARC linkage; HDR_L2_READING() treats a NULL b_l2hdr as "none" */
    440 	l2arc_buf_hdr_t		*b_l2hdr;
    441 	list_node_t		b_l2node;
    442 };
    443 
        /*
         * Eviction list state and forward declarations for routines defined
         * later in the file.
         * NOTE(review): arc_eviction_mtx presumably protects arc_eviction_list,
         * and arc_eviction_hdr is presumably a placeholder header for buffers
         * on that list -- confirm; their users are outside this excerpt.
         */
    444 static arc_buf_t *arc_eviction_list;
    445 static kmutex_t arc_eviction_mtx;
    446 static arc_buf_hdr_t arc_eviction_hdr;
    447 static void arc_get_data_buf(arc_buf_t *buf);
    448 static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
    449 static int arc_evict_needed(arc_buf_contents_t type);
    450 static void arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes);
    451 
        /*
         * True when state is one of the states whose data is no longer in the
         * cache: arc_mru_ghost, arc_mfu_ghost or arc_l2c_only.  The argument
         * is evaluated up to three times.
         */
    452 #define	GHOST_STATE(state)	\
    453 	((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||	\
    454 	(state) == arc_l2c_only)
    455 
    456 /*
    457  * Private ARC flags.  These flags are private ARC only flags that will show up
    458  * in b_flags in the arc_buf_hdr_t.  Some flags are publicly declared, and can
    459  * be passed in as arc_flags in things like arc_read.  However, these flags
    460  * should never be passed and should only be set by ARC code.  When adding new
    461  * public flags, make sure not to smash the private ones.
    462  */
    463 
    464 #define	ARC_IN_HASH_TABLE	(1 << 9)	/* this buffer is hashed */
    465 #define	ARC_IO_IN_PROGRESS	(1 << 10)	/* I/O in progress for buf */
    466 #define	ARC_IO_ERROR		(1 << 11)	/* I/O failed for buf */
    467 #define	ARC_FREED_IN_READ	(1 << 12)	/* buf freed while in read */
    468 #define	ARC_BUF_AVAILABLE	(1 << 13)	/* block not in active use */
    469 #define	ARC_INDIRECT		(1 << 14)	/* this is an indirect block */
    470 #define	ARC_FREE_IN_PROGRESS	(1 << 15)	/* hdr about to be freed */
    471 #define	ARC_L2_WRITING		(1 << 16)	/* L2ARC write in progress */
    472 #define	ARC_L2_EVICTED		(1 << 17)	/* evicted during I/O */
    473 #define	ARC_L2_WRITE_HEAD	(1 << 18)	/* head of write list */
    474 #define	ARC_STORED		(1 << 19)	/* has been store()d to */
    475 
        /*
         * Tests on a header's b_flags.  Except for HDR_L2_READING(), each one
         * yields the masked flag bit (non-zero when set), not a normalised
         * 0/1 value.
         *
         * HDR_L2CACHE() tests ARC_L2CACHE, which is not one of the private
         * flags above.
         * NOTE(review): presumably a public flag from <sys/arc.h> -- confirm.
         *
         * HDR_L2_READING() is true when I/O is in progress on a header that
         * has an L2ARC header attached (b_l2hdr != NULL).
         */
    476 #define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_IN_HASH_TABLE)
    477 #define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS)
    478 #define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_IO_ERROR)
    479 #define	HDR_FREED_IN_READ(hdr)	((hdr)->b_flags & ARC_FREED_IN_READ)
    480 #define	HDR_BUF_AVAILABLE(hdr)	((hdr)->b_flags & ARC_BUF_AVAILABLE)
    481 #define	HDR_FREE_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
    482 #define	HDR_L2CACHE(hdr)	((hdr)->b_flags & ARC_L2CACHE)
    483 #define	HDR_L2_READING(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS &&	\
    484 				    (hdr)->b_l2hdr != NULL)
    485 #define	HDR_L2_WRITING(hdr)	((hdr)->b_flags & ARC_L2_WRITING)
    486 #define	HDR_L2_EVICTED(hdr)	((hdr)->b_flags & ARC_L2_EVICTED)
    487 #define	HDR_L2_WRITE_HEAD(hdr)	((hdr)->b_flags & ARC_L2_WRITE_HEAD)
    488 
    489 /*
    490  * Other sizes
        *
        * Both are cast to a signed 64-bit type so that they can be negated when
        * handed to ARCSTAT_INCR(), as hdr_dest() does with -HDR_SIZE.
    491  */
    492 
    493 #define	HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
    494 #define	L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
    495 
    496 /*
    497  * Hash table routines
    498  */
    499 
        /*
         * In kernel builds each hash lock is padded out to HT_LOCK_PAD bytes.
         * NOTE(review): presumably so neighbouring locks do not share a cache
         * line -- confirm.
         */
    500 #define	HT_LOCK_PAD	64
    501 
    502 struct ht_lock {
    503 	kmutex_t	ht_lock;
    504 #ifdef _KERNEL
    505 	unsigned char	pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
    506 #endif
    507 };
    508 
        /*
         * Number of hash locks.  BUF_HASH_LOCK_NTRY() picks a lock with
         * idx & (BUF_LOCKS-1), so this must remain a power of two.
         */
    509 #define	BUF_LOCKS 256
    510 typedef struct buf_hash_table {
    511 	uint64_t ht_mask;		/* bucket count - 1 (see buf_init()) */
    512 	arc_buf_hdr_t **ht_table;	/* array of hash chain heads */
    513 	struct ht_lock ht_locks[BUF_LOCKS];
    514 } buf_hash_table_t;
    515 
    516 static buf_hash_table_t buf_hash_table;
    517 
        /*
         * Hash index and lock selection.
         *
         *	BUF_HASH_INDEX()	bucket index for (spa, dva, birth)
         *	BUF_HASH_LOCK_NTRY()	the ht_lock entry covering bucket idx
         *	BUF_HASH_LOCK()		pointer to that entry's mutex
         *	HDR_LOCK()		mutex covering the bucket a header hashes to
         *
         * Macro arguments are parenthesized so that any expression may be
         * passed.  HDR_LOCK() evaluates buf three times, so the argument must
         * not have side effects.
         */
    518 #define	BUF_HASH_INDEX(spa, dva, birth) \
    519 	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
    520 #define	BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[(idx) & (BUF_LOCKS-1)])
    521 #define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
    522 #define	HDR_LOCK(buf) \
    523 	(BUF_HASH_LOCK(BUF_HASH_INDEX((buf)->b_spa, &(buf)->b_dva, \
        	    (buf)->b_birth)))
    524 
        /*
         * Byte-indexed CRC-64 lookup table used by buf_hash(), which asserts
         * that entry 128 equals ZFS_CRC64_POLY.
         * NOTE(review): the table is filled in outside this excerpt.
         */
    525 uint64_t zfs_crc64_table[256];
    526 
    527 /*
    528  * Level 2 ARC
    529  */
    530 
    531 #define	L2ARC_WRITE_SIZE	(8 * 1024 * 1024)	/* initial write max */
    532 #define	L2ARC_HEADROOM		4		/* num of writes */
    533 #define	L2ARC_FEED_SECS		1		/* caching interval */
    534 
        /* The L2ARC write counters, aliased to their kstats like arc_size etc. */
    535 #define	l2arc_writes_sent	ARCSTAT(arcstat_l2_writes_sent)
    536 #define	l2arc_writes_done	ARCSTAT(arcstat_l2_writes_done)
    537 
    538 /*
    539  * L2ARC Performance Tunables
        *
        * Each starts at the matching L2ARC_* default above; l2arc_write_max
        * and l2arc_write_boost both start at L2ARC_WRITE_SIZE.
    540  */
    541 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;	/* default max write size */
    542 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;	/* extra write during warmup */
    543 uint64_t l2arc_headroom = L2ARC_HEADROOM;	/* number of dev writes */
    544 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;	/* interval seconds */
    545 boolean_t l2arc_noprefetch = B_TRUE;		/* don't cache prefetch bufs */
    545 boolean_t l2arc_noprefetch = B_TRUE;		/* don't cache prefetch bufs */
    546 
    547 /*
    548  * L2ARC Internals
    549  */
    550 typedef struct l2arc_dev {
    551 	vdev_t			*l2ad_vdev;	/* vdev */
    552 	spa_t			*l2ad_spa;	/* spa */
    553 	uint64_t		l2ad_hand;	/* next write location */
    554 	uint64_t		l2ad_write;	/* desired write size, bytes */
    555 	uint64_t		l2ad_boost;	/* warmup write boost, bytes */
    556 	uint64_t		l2ad_start;	/* first addr on device */
    557 	uint64_t		l2ad_end;	/* last addr on device */
    558 	uint64_t		l2ad_evict;	/* last addr eviction reached */
    559 	boolean_t		l2ad_first;	/* first sweep through */
    560 	list_t			*l2ad_buflist;	/* buffer list */
    561 	list_node_t		l2ad_node;	/* device list node */
    562 } l2arc_dev_t;
    563 
        /*
         * Global L2ARC state.  l2arc_buflist_mtx is the single mutex that
         * covers every device's buflist (see the locking model at the top of
         * this file).
         * NOTE(review): the *_list pointers presumably point at the list_t
         * objects declared next to them -- confirm in l2arc_init().
         */
    564 static list_t L2ARC_dev_list;			/* device list */
    565 static list_t *l2arc_dev_list;			/* device list pointer */
    566 static kmutex_t l2arc_dev_mtx;			/* device list mutex */
    567 static l2arc_dev_t *l2arc_dev_last;		/* last device used */
    568 static kmutex_t l2arc_buflist_mtx;		/* mutex for all buflists */
    569 static list_t L2ARC_free_on_write;		/* free after write buf list */
    570 static list_t *l2arc_free_on_write;		/* free after write list ptr */
    571 static kmutex_t l2arc_free_on_write_mtx;	/* mutex for list */
    572 static uint64_t l2arc_ndev;			/* number of devices */
    573 
        /*
         * Context saved for an L2ARC read: the buffer being filled plus the
         * original spa, block pointer, bookmark and flags.
         * NOTE(review): presumably consumed by l2arc_read_done() -- confirm.
         */
    574 typedef struct l2arc_read_callback {
    575 	arc_buf_t	*l2rcb_buf;		/* read buffer */
    576 	spa_t		*l2rcb_spa;		/* spa */
    577 	blkptr_t	l2rcb_bp;		/* original blkptr */
    578 	zbookmark_t	l2rcb_zb;		/* original bookmark */
    579 	int		l2rcb_flags;		/* original flags */
    580 } l2arc_read_callback_t;
    581 
        /* Context saved for an L2ARC write: the device and the write list head. */
    582 typedef struct l2arc_write_callback {
    583 	l2arc_dev_t	*l2wcb_dev;		/* device info */
    584 	arc_buf_hdr_t	*l2wcb_head;		/* head of write buflist */
    585 } l2arc_write_callback_t;
    586 
        /*
         * L2ARC part of a buffer header, reached through
         * arc_buf_hdr_t.b_l2hdr: which device holds the block and where.
         */
    587 struct l2arc_buf_hdr {
    588 	/* protected by arc_buf_hdr  mutex */
    589 	l2arc_dev_t	*b_dev;			/* L2ARC device */
    590 	daddr_t		b_daddr;		/* disk address, offset byte */
    591 };
    592 
        /*
         * One deferred free: a data pointer, its size, and the function that
         * frees it.  Entries are linked through l2df_list_node.
         * NOTE(review): presumably queued on l2arc_free_on_write until the
         * in-flight L2ARC write finishes -- confirm.
         */
    593 typedef struct l2arc_data_free {
    594 	/* protected by l2arc_free_on_write_mtx */
    595 	void		*l2df_data;
    596 	size_t		l2df_size;
    597 	void		(*l2df_func)(void *, size_t);
    598 	list_node_t	l2df_list_node;
    599 } l2arc_data_free_t;
    600 
        /*
         * Feed thread synchronisation and forward declarations for L2ARC
         * routines defined later in the file.
         * NOTE(review): by analogy with the reclaim thread variables, these
         * are presumably the feed thread's lock, cv and exit flag -- confirm.
         */
    601 static kmutex_t l2arc_feed_thr_lock;
    602 static kcondvar_t l2arc_feed_thr_cv;
    603 static uint8_t l2arc_thread_exit;
    604 
    605 static void l2arc_read_done(zio_t *zio);
    606 static void l2arc_hdr_stat_add(void);
    607 static void l2arc_hdr_stat_remove(void);
    608 
        /*
         * Compute the 64-bit hash of (spa, dva, birth).
         *
         * A table-driven CRC-64 is run over the raw bytes of the DVA, starting
         * from all ones.  The result is then XORed with the spa pointer
         * (shifted right by 8) and with the birth value.  Callers reduce the
         * result to a bucket index with BUF_HASH_INDEX().
         *
         * The DVA is only read, so it is accessed through a const byte
         * pointer, and the loop index is unsigned to match sizeof.
         */
    609 static uint64_t
    610 buf_hash(spa_t *spa, const dva_t *dva, uint64_t birth)
    611 {
    612 	uintptr_t spav = (uintptr_t)spa;
    613 	const uint8_t *vdva = (const uint8_t *)dva;
    614 	uint64_t crc = -1ULL;
    615 	size_t i;
    616 
        	/* sanity check that the CRC table has been initialised */
    617 	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
    618 
    619 	for (i = 0; i < sizeof (dva_t); i++)
    620 		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
    621 
    622 	crc ^= (spav>>8) ^ birth;
    623 
    624 	return (crc);
    625 }
    626 
        /*
         * BUF_EMPTY() is true when a header carries no identity: both DVA
         * words and the birth value are zero.
         *
         * BUF_EQUAL() is true when a header's identity matches
         * (spa, dva, birth).  The whole expansion and every argument are
         * parenthesized, so the macro can be negated or combined with other
         * operators and can take arbitrary argument expressions.  buf is
         * evaluated several times.
         */
    627 #define	BUF_EMPTY(buf)						\
    628 	((buf)->b_dva.dva_word[0] == 0 &&			\
    629 	(buf)->b_dva.dva_word[1] == 0 &&			\
    630 	(buf)->b_birth == 0)
    631 
    632 #define	BUF_EQUAL(spa, dva, birth, buf)				\
    633 	(((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
    634 	((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
    635 	((buf)->b_birth == (birth)) && ((buf)->b_spa == (spa)))
    636 
        /*
         * Look up the header matching (spa, dva, birth).
         *
         * On a hit the header is returned with its hash mutex still held and
         * *lockp set to that mutex; releasing it is left to the caller.  On a
         * miss the mutex is dropped, *lockp is set to NULL and NULL is
         * returned.
         */
    637 static arc_buf_hdr_t *
    638 buf_hash_find(spa_t *spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
    639 {
    640 	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
    641 	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
    642 	arc_buf_hdr_t *buf;
    643 
    644 	mutex_enter(hash_lock);
        	/* walk the chain hanging off this bucket */
    645 	for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
    646 	    buf = buf->b_hash_next) {
    647 		if (BUF_EQUAL(spa, dva, birth, buf)) {
        			/* found: return with hash_lock held */
    648 			*lockp = hash_lock;
    649 			return (buf);
    650 		}
    651 	}
    652 	mutex_exit(hash_lock);
    653 	*lockp = NULL;
    654 	return (NULL);
    655 }
    656 
    657 /*
    658  * Insert an entry into the hash table.  If there is already an element
    659  * equal to elem in the hash table, then the already existing element
    660  * will be returned and the new element will not be inserted.
    661  * Otherwise returns NULL.
        *
        * The hash mutex for buf's bucket is acquired here and stored in *lockp.
        * It is still held on return in both cases; this function never
        * releases it.
    662  */
    663 static arc_buf_hdr_t *
    664 buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
    665 {
    666 	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
    667 	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
    668 	arc_buf_hdr_t *fbuf;
    669 	uint32_t i;
    670 
    671 	ASSERT(!HDR_IN_HASH_TABLE(buf));
    672 	*lockp = hash_lock;
    673 	mutex_enter(hash_lock);
        	/* i counts the headers already on this chain */
    674 	for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
    675 	    fbuf = fbuf->b_hash_next, i++) {
    676 		if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
    677 			return (fbuf);
    678 	}
    679 
        	/* no match: push buf onto the front of the chain */
    680 	buf->b_hash_next = buf_hash_table.ht_table[idx];
    681 	buf_hash_table.ht_table[idx] = buf;
    682 	buf->b_flags |= ARC_IN_HASH_TABLE;
    683 
    684 	/* collect some hash table performance data */
    685 	if (i > 0) {
    686 		ARCSTAT_BUMP(arcstat_hash_collisions);
        		/* the bucket just went from one header to two */
    687 		if (i == 1)
    688 			ARCSTAT_BUMP(arcstat_hash_chains);
    689 
    690 		ARCSTAT_MAX(arcstat_hash_chain_max, i);
    691 	}
    692 
    693 	ARCSTAT_BUMP(arcstat_hash_elements);
    694 	ARCSTAT_MAXSTAT(arcstat_hash_elements);
    695 
    696 	return (NULL);
    697 }
    698 
        /*
         * Unlink buf from its hash chain.
         *
         * The caller must already hold the hash mutex for buf's bucket, and
         * buf must be in the table; both are asserted.  Clears
         * ARC_IN_HASH_TABLE and updates the hash_elements and hash_chains
         * statistics.  The mutex is not released here.
         */
    699 static void
    700 buf_hash_remove(arc_buf_hdr_t *buf)
    701 {
    702 	arc_buf_hdr_t *fbuf, **bufp;
    703 	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
    704 
    705 	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
    706 	ASSERT(HDR_IN_HASH_TABLE(buf));
    707 
        	/* find the link (bucket head or a b_hash_next) that points at buf */
    708 	bufp = &buf_hash_table.ht_table[idx];
    709 	while ((fbuf = *bufp) != buf) {
    710 		ASSERT(fbuf != NULL);
    711 		bufp = &fbuf->b_hash_next;
    712 	}
    713 	*bufp = buf->b_hash_next;
    714 	buf->b_hash_next = NULL;
    715 	buf->b_flags &= ~ARC_IN_HASH_TABLE;
    716 
    717 	/* collect some hash table performance data */
    718 	ARCSTAT_BUMPDOWN(arcstat_hash_elements);
    719 
        	/* exactly one header left in the bucket: it is no longer a chain */
    720 	if (buf_hash_table.ht_table[idx] &&
    721 	    buf_hash_table.ht_table[idx]->b_hash_next == NULL)
    722 		ARCSTAT_BUMPDOWN(arcstat_hash_chains);
    723 }
    724 
    725 /*
    726  * Global data structures and functions for the buf kmem cache.
        *
        * hdr_cache holds arc_buf_hdr_t objects (created in buf_init()).
        * NOTE(review): buf_cache presumably holds arc_buf_t objects, matching
        * buf_cons()/buf_dest() -- its creation is outside this excerpt.
    727  */
    728 static kmem_cache_t *hdr_cache;
    729 static kmem_cache_t *buf_cache;
    730 
        /*
         * Tear down the hash table and the kmem caches: free the bucket array
         * (ht_mask + 1 pointers), destroy every hash mutex, then destroy
         * hdr_cache and buf_cache.
         */
    731 static void
    732 buf_fini(void)
    733 {
    734 	int i;
    735 
    736 	kmem_free(buf_hash_table.ht_table,
    737 	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
    738 	for (i = 0; i < BUF_LOCKS; i++)
    739 		mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
    740 	kmem_cache_destroy(hdr_cache);
    741 	kmem_cache_destroy(buf_cache);
    742 }
    743 
    744 /*
    745  * Constructor callback - called when the cache is empty
    746  * and a new buf is requested.
        *
        * Zeroes the header, initialises its refcount, condition variable and
        * freeze lock, and adds HDR_SIZE to the hdr_size statistic.  The unused
        * and kmflag arguments are ignored.  Always returns 0.
    747  */
    748 /* ARGSUSED */
    749 static int
    750 hdr_cons(void *vbuf, void *unused, int kmflag)
    751 {
    752 	arc_buf_hdr_t *buf = vbuf;
    753 
    754 	bzero(buf, sizeof (arc_buf_hdr_t));
    755 	refcount_create(&buf->b_refcnt);
    756 	cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
    757 	mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
    758 
    759 	ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
    760 	return (0);
    761 }
    762 
        /*
         * Constructor for arc_buf_t objects: zeroes the buffer and initialises
         * its rwlock.  The unused and kmflag arguments are ignored.  Always
         * returns 0.
         */
    763 /* ARGSUSED */
    764 static int
    765 buf_cons(void *vbuf, void *unused, int kmflag)
    766 {
    767 	arc_buf_t *buf = vbuf;
    768 
    769 	bzero(buf, sizeof (arc_buf_t));
    770 	rw_init(&buf->b_lock, NULL, RW_DEFAULT, NULL);
    771 	return (0);
    772 }
    773 
    774 /*
    775  * Destructor callback - called when a cached buf is
    776  * no longer required.
    777  */
    778 /* ARGSUSED */
    779 static void
    780 hdr_dest(void *vbuf, void *unused)
    781 {
    782 	arc_buf_hdr_t *buf = vbuf;
    783 
    784 	refcount_destroy(&buf->b_refcnt);
    785 	cv_destroy(&buf->b_cv);
    786 	mutex_destroy(&buf->b_freeze_lock);
    787 
    788 	ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
    789 }
    790 
    791 /* ARGSUSED */
    792 static void
    793 buf_dest(void *vbuf, void *unused)
    794 {
    795 	arc_buf_t *buf = vbuf;
    796 
    797 	rw_destroy(&buf->b_lock);
    798 }
    799 
    800 /*
    801  * Reclaim callback -- invoked when memory is low.
    802  */
    803 /* ARGSUSED */
    804 static void
    805 hdr_recl(void *unused)
    806 {
    807 	dprintf("hdr_recl called\n");
    808 	/*
    809 	 * umem calls the reclaim func when we destroy the buf cache,
    810 	 * which is after we do arc_fini().
    811 	 */
    812 	if (!arc_dead)
    813 		cv_signal(&arc_reclaim_thr_cv);
    814 }
    815 
    816 static void
    817 buf_init(void)
    818 {
    819 	uint64_t *ct;
    820 	uint64_t hsize = 1ULL << 12;
    821 	int i, j;
    822 
    823 	/*
    824 	 * The hash table is big enough to fill all of physical memory
    825 	 * with an average 64K block size.  The table will take up
    826 	 * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers).
    827 	 */
    828 	while (hsize * 65536 < physmem * PAGESIZE)
    829 		hsize <<= 1;
    830 retry:
    831 	buf_hash_table.ht_mask = hsize - 1;
    832 	buf_hash_table.ht_table =
    833 	    kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
    834 	if (buf_hash_table.ht_table == NULL) {
    835 		ASSERT(hsize > (1ULL << 8));
    836 		hsize >>= 1;
    837 		goto retry;
    838 	}
    839 
    840 	hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
    841 	    0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
    842 	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
    843 	    0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
    844 
    845 	for (i = 0; i < 256; i++)
    846 		for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
    847 			*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
    848 
    849 	for (i = 0; i < BUF_LOCKS; i++) {
    850 		mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
    851 		    NULL, MUTEX_DEFAULT, NULL);
    852 	}
    853 }
    854 
/*
 * hz divided by 16 (hz >> 4).  Assuming hz is the number of clock
 * ticks per second, this is 1/16 of a second expressed in ticks.
 */
#define	ARC_MINTIME	(hz>>4) /* 62 ms */
    856 
    857 static void
    858 arc_cksum_verify(arc_buf_t *buf)
    859 {
    860 	zio_cksum_t zc;
    861 
    862 	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
    863 		return;
    864 
    865 	mutex_enter(&buf->b_hdr->b_freeze_lock);
    866 	if (buf->b_hdr->b_freeze_cksum == NULL ||
    867 	    (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
    868 		mutex_exit(&buf->b_hdr->b_freeze_lock);
    869 		return;
    870 	}
    871 	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
    872 	if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
    873 		panic("buffer modified while frozen!");
    874 	mutex_exit(&buf->b_hdr->b_freeze_lock);
    875 }
    876 
    877 static int
    878 arc_cksum_equal(arc_buf_t *buf)
    879 {
    880 	zio_cksum_t zc;
    881 	int equal;
    882 
    883 	mutex_enter(&buf->b_hdr->b_freeze_lock);
    884 	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
    885 	equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
    886 	mutex_exit(&buf->b_hdr->b_freeze_lock);
    887 
    888 	return (equal);
    889 }
    890 
    891 static void
    892 arc_cksum_compute(arc_buf_t *buf, boolean_t force)
    893 {
    894 	if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
    895 		return;
    896 
    897 	mutex_enter(&buf->b_hdr->b_freeze_lock);
    898 	if (buf->b_hdr->b_freeze_cksum != NULL) {
    899 		mutex_exit(&buf->b_hdr->b_freeze_lock);
    900 		return;
    901 	}
    902 	buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
    903 	fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
    904 	    buf->b_hdr->b_freeze_cksum);
    905 	mutex_exit(&buf->b_hdr->b_freeze_lock);
    906 }
    907 
    908 void
    909 arc_buf_thaw(arc_buf_t *buf)
    910 {
    911 	if (zfs_flags & ZFS_DEBUG_MODIFY) {
    912 		if (buf->b_hdr->b_state != arc_anon)
    913 			panic("modifying non-anon buffer!");
    914 		if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
    915 			panic("modifying buffer while i/o in progress!");
    916 		arc_cksum_verify(buf);
    917 	}
    918 
    919 	mutex_enter(&buf->b_hdr->b_freeze_lock);
    920 	if (buf->b_hdr->b_freeze_cksum != NULL) {
    921