Home | History | Annotate | Download | only in zfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #pragma ident	"@(#)vdev_cache.c	1.6	07/11/27 SMI"
     27 
     28 #include <sys/zfs_context.h>
     29 #include <sys/spa.h>
     30 #include <sys/vdev_impl.h>
     31 #include <sys/zio.h>
     32 
     33 /*
     34  * Virtual device read-ahead caching.
     35  *
     36  * This file implements a simple LRU read-ahead cache.  When the DMU reads
     37  * a given block, it will often want other, nearby blocks soon thereafter.
     38  * We take advantage of this by reading a larger disk region and caching
     39  * the result.  In the best case, this can turn 256 back-to-back 512-byte
     40  * reads into a single 128k read followed by 255 cache hits; this reduces
     41  * latency dramatically.  In the worst case, it can turn an isolated 512-byte
     42  * read into a 128k read, which doesn't affect latency all that much but is
     43  * terribly wasteful of bandwidth.  A more intelligent version of the cache
     44  * could keep track of access patterns and not do read-ahead unless it sees
     45  * at least two temporally close I/Os to the same region.  Currently, only
     46  * metadata I/O is inflated.  A futher enhancement could take advantage of
     47  * more semantic information about the I/O.  And it could use something
     48  * faster than an AVL tree; that was chosen solely for convenience.
     49  *
     50  * There are five cache operations: allocate, fill, read, write, evict.
     51  *
     52  * (1) Allocate.  This reserves a cache entry for the specified region.
     53  *     We separate the allocate and fill operations so that multiple threads
     54  *     don't generate I/O for the same cache miss.
     55  *
     56  * (2) Fill.  When the I/O for a cache miss completes, the fill routine
     57  *     places the data in the previously allocated cache entry.
     58  *
     59  * (3) Read.  Read data from the cache.
     60  *
     61  * (4) Write.  Update cache contents after write completion.
     62  *
     63  * (5) Evict.  When allocating a new entry, we evict the oldest (LRU) entry
     64  *     if the total cache size exceeds zfs_vdev_cache_size.
     65  */
     66 
     67 /*
     68  * These tunables are for performance analysis.
     69  */
     70 /*
     71  * All i/os smaller than zfs_vdev_cache_max will be turned into
     72  * 1<<zfs_vdev_cache_bshift byte reads by the vdev_cache (aka software
     73  * track buffer.  At most zfs_vdev_cache_size bytes will be kept in each
     74  * vdev's vdev_cache.
     75  */
     76 int zfs_vdev_cache_max = 1<<14;
     77 int zfs_vdev_cache_size = 10ULL << 20;
     78 int zfs_vdev_cache_bshift = 16;
     79 
     80 #define	VCBS (1 << zfs_vdev_cache_bshift)
     81 
     82 static int
     83 vdev_cache_offset_compare(const void *a1, const void *a2)
     84 {
     85 	const vdev_cache_entry_t *ve1 = a1;
     86 	const vdev_cache_entry_t *ve2 = a2;
     87 
     88 	if (ve1->ve_offset < ve2->ve_offset)
     89 		return (-1);
     90 	if (ve1->ve_offset > ve2->ve_offset)
     91 		return (1);
     92 	return (0);
     93 }
     94 
     95 static int
     96 vdev_cache_lastused_compare(const void *a1, const void *a2)
     97 {
     98 	const vdev_cache_entry_t *ve1 = a1;
     99 	const vdev_cache_entry_t *ve2 = a2;
    100 
    101 	if (ve1->ve_lastused < ve2->ve_lastused)
    102 		return (-1);
    103 	if (ve1->ve_lastused > ve2->ve_lastused)
    104 		return (1);
    105 
    106 	/*
    107 	 * Among equally old entries, sort by offset to ensure uniqueness.
    108 	 */
    109 	return (vdev_cache_offset_compare(a1, a2));
    110 }
    111 
    112 /*
    113  * Evict the specified entry from the cache.
    114  */
    115 static void
    116 vdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve)
    117 {
    118 	ASSERT(MUTEX_HELD(&vc->vc_lock));
    119 	ASSERT(ve->ve_fill_io == NULL);
    120 	ASSERT(ve->ve_data != NULL);
    121 
    122 	dprintf("evicting %p, off %llx, LRU %llu, age %lu, hits %u, stale %u\n",
    123 	    vc, ve->ve_offset, ve->ve_lastused, lbolt - ve->ve_lastused,
    124 	    ve->ve_hits, ve->ve_missed_update);
    125 
    126 	avl_remove(&vc->vc_lastused_tree, ve);
    127 	avl_remove(&vc->vc_offset_tree, ve);
    128 	zio_buf_free(ve->ve_data, VCBS);
    129 	kmem_free(ve, sizeof (vdev_cache_entry_t));
    130 }
    131 
    132 /*
    133  * Allocate an entry in the cache.  At the point we don't have the data,
    134  * we're just creating a placeholder so that multiple threads don't all
    135  * go off and read the same blocks.
    136  */
    137 static vdev_cache_entry_t *
    138 vdev_cache_allocate(zio_t *zio)
    139 {
    140 	vdev_cache_t *vc = &zio->io_vd->vdev_cache;
    141 	uint64_t offset = P2ALIGN(zio->io_offset, VCBS);
    142 	vdev_cache_entry_t *ve;
    143 
    144 	ASSERT(MUTEX_HELD(&vc->vc_lock));
    145 
    146 	if (zfs_vdev_cache_size == 0)
    147 		return (NULL);
    148 
    149 	/*
    150 	 * If adding a new entry would exceed the cache size,
    151 	 * evict the oldest entry (LRU).
    152 	 */
    153 	if ((avl_numnodes(&vc->vc_lastused_tree) << zfs_vdev_cache_bshift) >
    154 	    zfs_vdev_cache_size) {
    155 		ve = avl_first(&vc->vc_lastused_tree);
    156 		if (ve->ve_fill_io != NULL) {
    157 			dprintf("can't evict in %p, still filling\n", vc);
    158 			return (NULL);
    159 		}
    160 		ASSERT(ve->ve_hits != 0);
    161 		vdev_cache_evict(vc, ve);
    162 	}
    163 
    164 	ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP);
    165 	ve->ve_offset = offset;
    166 	ve->ve_lastused = lbolt;
    167 	ve->ve_data = zio_buf_alloc(VCBS);
    168 
    169 	avl_add(&vc->vc_offset_tree, ve);
    170 	avl_add(&vc->vc_lastused_tree, ve);
    171 
    172 	return (ve);
    173 }
    174 
    175 static void
    176 vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio)
    177 {
    178 	uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS);
    179 
    180 	ASSERT(MUTEX_HELD(&vc->vc_lock));
    181 	ASSERT(ve->ve_fill_io == NULL);
    182 
    183 	if (ve->ve_lastused != lbolt) {
    184 		avl_remove(&vc->vc_lastused_tree, ve);
    185 		ve->ve_lastused = lbolt;
    186 		avl_add(&vc->vc_lastused_tree, ve);
    187 	}
    188 
    189 	ve->ve_hits++;
    190 	bcopy(ve->ve_data + cache_phase, zio->io_data, zio->io_size);
    191 }
    192 
    193 /*
    194  * Fill a previously allocated cache entry with data.
    195  */
    196 static void
    197 vdev_cache_fill(zio_t *zio)
    198 {
    199 	vdev_t *vd = zio->io_vd;
    200 	vdev_cache_t *vc = &vd->vdev_cache;
    201 	vdev_cache_entry_t *ve = zio->io_private;
    202 	zio_t *dio;
    203 
    204 	ASSERT(zio->io_size == VCBS);
    205 
    206 	/*
    207 	 * Add data to the cache.
    208 	 */
    209 	mutex_enter(&vc->vc_lock);
    210 
    211 	ASSERT(ve->ve_fill_io == zio);
    212 	ASSERT(ve->ve_offset == zio->io_offset);
    213 	ASSERT(ve->ve_data == zio->io_data);
    214 
    215 	ve->ve_fill_io = NULL;
    216 
    217 	/*
    218 	 * Even if this cache line was invalidated by a missed write update,
    219 	 * any reads that were queued up before the missed update are still
    220 	 * valid, so we can satisfy them from this line before we evict it.
    221 	 */
    222 	for (dio = zio->io_delegate_list; dio; dio = dio->io_delegate_next)
    223 		vdev_cache_hit(vc, ve, dio);
    224 
    225 	if (zio->io_error || ve->ve_missed_update)
    226 		vdev_cache_evict(vc, ve);
    227 
    228 	mutex_exit(&vc->vc_lock);
    229 
    230 	while ((dio = zio->io_delegate_list) != NULL) {
    231 		zio->io_delegate_list = dio->io_delegate_next;
    232 		dio->io_delegate_next = NULL;
    233 		dio->io_error = zio->io_error;
    234 		zio_execute(dio);
    235 	}
    236 }
    237 
    238 /*
    239  * Read data from the cache.  Returns 0 on cache hit, errno on a miss.
    240  */
    241 int
    242 vdev_cache_read(zio_t *zio)
    243 {
    244 	vdev_cache_t *vc = &zio->io_vd->vdev_cache;
    245 	vdev_cache_entry_t *ve, ve_search;
    246 	uint64_t cache_offset = P2ALIGN(zio->io_offset, VCBS);
    247 	uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS);
    248 	zio_t *fio;
    249 
    250 	ASSERT(zio->io_type == ZIO_TYPE_READ);
    251 
    252 	if (zio->io_flags & ZIO_FLAG_DONT_CACHE)
    253 		return (EINVAL);
    254 
    255 	if (zio->io_size > zfs_vdev_cache_max)
    256 		return (EOVERFLOW);
    257 
    258 	/*
    259 	 * If the I/O straddles two or more cache blocks, don't cache it.
    260 	 */
    261 	if (P2CROSS(zio->io_offset, zio->io_offset + zio->io_size - 1, VCBS))
    262 		return (EXDEV);
    263 
    264 	ASSERT(cache_phase + zio->io_size <= VCBS);
    265 
    266 	mutex_enter(&vc->vc_lock);
    267 
    268 	ve_search.ve_offset = cache_offset;
    269 	ve = avl_find(&vc->vc_offset_tree, &ve_search, NULL);
    270 
    271 	if (ve != NULL) {
    272 		if (ve->ve_missed_update) {
    273 			mutex_exit(&vc->vc_lock);
    274 			return (ESTALE);
    275 		}
    276 
    277 		if ((fio = ve->ve_fill_io) != NULL) {
    278 			zio->io_delegate_next = fio->io_delegate_list;
    279 			fio->io_delegate_list = zio;
    280 			zio_vdev_io_bypass(zio);
    281 			mutex_exit(&vc->vc_lock);
    282 			return (0);
    283 		}
    284 
    285 		vdev_cache_hit(vc, ve, zio);
    286 		zio_vdev_io_bypass(zio);
    287 
    288 		mutex_exit(&vc->vc_lock);
    289 		zio_execute(zio);
    290 		return (0);
    291 	}
    292 
    293 	ve = vdev_cache_allocate(zio);
    294 
    295 	if (ve == NULL) {
    296 		mutex_exit(&vc->vc_lock);
    297 		return (ENOMEM);
    298 	}
    299 
    300 	fio = zio_vdev_child_io(zio, NULL, zio->io_vd, cache_offset,
    301 	    ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_CACHE_FILL,
    302 	    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE |
    303 	    ZIO_FLAG_DONT_RETRY | ZIO_FLAG_NOBOOKMARK,
    304 	    vdev_cache_fill, ve);
    305 
    306 	ve->ve_fill_io = fio;
    307 	fio->io_delegate_list = zio;
    308 	zio_vdev_io_bypass(zio);
    309 
    310 	mutex_exit(&vc->vc_lock);
    311 	zio_nowait(fio);
    312 
    313 	return (0);
    314 }
    315 
    316 /*
    317  * Update cache contents upon write completion.
    318  */
    319 void
    320 vdev_cache_write(zio_t *zio)
    321 {
    322 	vdev_cache_t *vc = &zio->io_vd->vdev_cache;
    323 	vdev_cache_entry_t *ve, ve_search;
    324 	uint64_t io_start = zio->io_offset;
    325 	uint64_t io_end = io_start + zio->io_size;
    326 	uint64_t min_offset = P2ALIGN(io_start, VCBS);
    327 	uint64_t max_offset = P2ROUNDUP(io_end, VCBS);
    328 	avl_index_t where;
    329 
    330 	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
    331 
    332 	mutex_enter(&vc->vc_lock);
    333 
    334 	ve_search.ve_offset = min_offset;
    335 	ve = avl_find(&vc->vc_offset_tree, &ve_search, &where);
    336 
    337 	if (ve == NULL)
    338 		ve = avl_nearest(&vc->vc_offset_tree, where, AVL_AFTER);
    339 
    340 	while (ve != NULL && ve->ve_offset < max_offset) {
    341 		uint64_t start = MAX(ve->ve_offset, io_start);
    342 		uint64_t end = MIN(ve->ve_offset + VCBS, io_end);
    343 
    344 		if (ve->ve_fill_io != NULL) {
    345 			ve->ve_missed_update = 1;
    346 		} else {
    347 			bcopy((char *)zio->io_data + start - io_start,
    348 			    ve->ve_data + start - ve->ve_offset, end - start);
    349 		}
    350 		ve = AVL_NEXT(&vc->vc_offset_tree, ve);
    351 	}
    352 	mutex_exit(&vc->vc_lock);
    353 }
    354 
    355 void
    356 vdev_cache_purge(vdev_t *vd)
    357 {
    358 	vdev_cache_t *vc = &vd->vdev_cache;
    359 	vdev_cache_entry_t *ve;
    360 
    361 	mutex_enter(&vc->vc_lock);
    362 	while ((ve = avl_first(&vc->vc_offset_tree)) != NULL)
    363 		vdev_cache_evict(vc, ve);
    364 	mutex_exit(&vc->vc_lock);
    365 }
    366 
    367 void
    368 vdev_cache_init(vdev_t *vd)
    369 {
    370 	vdev_cache_t *vc = &vd->vdev_cache;
    371 
    372 	mutex_init(&vc->vc_lock, NULL, MUTEX_DEFAULT, NULL);
    373 
    374 	avl_create(&vc->vc_offset_tree, vdev_cache_offset_compare,
    375 	    sizeof (vdev_cache_entry_t),
    376 	    offsetof(struct vdev_cache_entry, ve_offset_node));
    377 
    378 	avl_create(&vc->vc_lastused_tree, vdev_cache_lastused_compare,
    379 	    sizeof (vdev_cache_entry_t),
    380 	    offsetof(struct vdev_cache_entry, ve_lastused_node));
    381 }
    382 
    383 void
    384 vdev_cache_fini(vdev_t *vd)
    385 {
    386 	vdev_cache_t *vc = &vd->vdev_cache;
    387 
    388 	vdev_cache_purge(vd);
    389 
    390 	avl_destroy(&vc->vc_offset_tree);
    391 	avl_destroy(&vc->vc_lastused_tree);
    392 
    393 	mutex_destroy(&vc->vc_lock);
    394 }
    395