Home | History | Annotate | Download | only in zfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #pragma ident	"@(#)vdev_queue.c	1.6	07/11/27 SMI"
     27 
     28 #include <sys/zfs_context.h>
     29 #include <sys/spa.h>
     30 #include <sys/vdev_impl.h>
     31 #include <sys/zio.h>
     32 #include <sys/avl.h>
     33 
/*
 * These tunables are for performance analysis.
 */
/*
 * zfs_vdev_max_pending is the maximum number of i/os concurrently
 * pending to each device.  zfs_vdev_min_pending is the initial number
 * of i/os pending to each device (before it starts ramping up to
 * max_pending).
 *
 * vdev_queue_io() passes zfs_vdev_min_pending as the pending limit when
 * a new i/o is queued; vdev_queue_io_done() passes zfs_vdev_max_pending
 * when a completed i/o makes room for more.
 */
int zfs_vdev_max_pending = 35;
int zfs_vdev_min_pending = 4;

/*
 * deadline = pri + (io_timestamp >> time_shift)
 *
 * Computed in vdev_queue_io(); a larger shift makes the timestamp count
 * for less relative to the i/o priority.
 */
int zfs_vdev_time_shift = 6;

/*
 * exponential I/O issue ramp-up rate: vdev_queue_io_done() tries to issue
 * up to this many queued i/os for every i/o that completes.
 */
int zfs_vdev_ramp_rate = 2;

/*
 * i/os will be aggregated into a single large i/o up to
 * zfs_vdev_aggregation_limit bytes long.
 */
int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE;
     57 
     58 /*
     59  * Virtual device vector for disk I/O scheduling.
     60  */
     61 int
     62 vdev_queue_deadline_compare(const void *x1, const void *x2)
     63 {
     64 	const zio_t *z1 = x1;
     65 	const zio_t *z2 = x2;
     66 
     67 	if (z1->io_deadline < z2->io_deadline)
     68 		return (-1);
     69 	if (z1->io_deadline > z2->io_deadline)
     70 		return (1);
     71 
     72 	if (z1->io_offset < z2->io_offset)
     73 		return (-1);
     74 	if (z1->io_offset > z2->io_offset)
     75 		return (1);
     76 
     77 	if (z1 < z2)
     78 		return (-1);
     79 	if (z1 > z2)
     80 		return (1);
     81 
     82 	return (0);
     83 }
     84 
     85 int
     86 vdev_queue_offset_compare(const void *x1, const void *x2)
     87 {
     88 	const zio_t *z1 = x1;
     89 	const zio_t *z2 = x2;
     90 
     91 	if (z1->io_offset < z2->io_offset)
     92 		return (-1);
     93 	if (z1->io_offset > z2->io_offset)
     94 		return (1);
     95 
     96 	if (z1 < z2)
     97 		return (-1);
     98 	if (z1 > z2)
     99 		return (1);
    100 
    101 	return (0);
    102 }
    103 
    104 void
    105 vdev_queue_init(vdev_t *vd)
    106 {
    107 	vdev_queue_t *vq = &vd->vdev_queue;
    108 
    109 	mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
    110 
    111 	avl_create(&vq->vq_deadline_tree, vdev_queue_deadline_compare,
    112 	    sizeof (zio_t), offsetof(struct zio, io_deadline_node));
    113 
    114 	avl_create(&vq->vq_read_tree, vdev_queue_offset_compare,
    115 	    sizeof (zio_t), offsetof(struct zio, io_offset_node));
    116 
    117 	avl_create(&vq->vq_write_tree, vdev_queue_offset_compare,
    118 	    sizeof (zio_t), offsetof(struct zio, io_offset_node));
    119 
    120 	avl_create(&vq->vq_pending_tree, vdev_queue_offset_compare,
    121 	    sizeof (zio_t), offsetof(struct zio, io_offset_node));
    122 }
    123 
    124 void
    125 vdev_queue_fini(vdev_t *vd)
    126 {
    127 	vdev_queue_t *vq = &vd->vdev_queue;
    128 
    129 	avl_destroy(&vq->vq_deadline_tree);
    130 	avl_destroy(&vq->vq_read_tree);
    131 	avl_destroy(&vq->vq_write_tree);
    132 	avl_destroy(&vq->vq_pending_tree);
    133 
    134 	mutex_destroy(&vq->vq_lock);
    135 }
    136 
    137 static void
    138 vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
    139 {
    140 	avl_add(&vq->vq_deadline_tree, zio);
    141 	avl_add(zio->io_vdev_tree, zio);
    142 }
    143 
    144 static void
    145 vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
    146 {
    147 	avl_remove(&vq->vq_deadline_tree, zio);
    148 	avl_remove(zio->io_vdev_tree, zio);
    149 }
    150 
    151 static void
    152 vdev_queue_agg_io_done(zio_t *aio)
    153 {
    154 	zio_t *dio;
    155 	uint64_t offset = 0;
    156 
    157 	while ((dio = aio->io_delegate_list) != NULL) {
    158 		if (aio->io_type == ZIO_TYPE_READ)
    159 			bcopy((char *)aio->io_data + offset, dio->io_data,
    160 			    dio->io_size);
    161 		offset += dio->io_size;
    162 		aio->io_delegate_list = dio->io_delegate_next;
    163 		dio->io_delegate_next = NULL;
    164 		dio->io_error = aio->io_error;
    165 		zio_execute(dio);
    166 	}
    167 	ASSERT3U(offset, ==, aio->io_size);
    168 
    169 	zio_buf_free(aio->io_data, aio->io_size);
    170 }
    171 
    172 #define	IS_ADJACENT(io, nio) \
    173 	((io)->io_offset + (io)->io_size == (nio)->io_offset)
    174 
    175 static zio_t *
    176 vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
    177 {
    178 	zio_t *fio, *lio, *aio, *dio;
    179 	avl_tree_t *tree;
    180 	uint64_t size;
    181 
    182 	ASSERT(MUTEX_HELD(&vq->vq_lock));
    183 
    184 	if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit ||
    185 	    avl_numnodes(&vq->vq_deadline_tree) == 0)
    186 		return (NULL);
    187 
    188 	fio = lio = avl_first(&vq->vq_deadline_tree);
    189 
    190 	tree = fio->io_vdev_tree;
    191 	size = fio->io_size;
    192 
    193 	while ((dio = AVL_PREV(tree, fio)) != NULL && IS_ADJACENT(dio, fio) &&
    194 	    size + dio->io_size <= zfs_vdev_aggregation_limit) {
    195 		dio->io_delegate_next = fio;
    196 		fio = dio;
    197 		size += dio->io_size;
    198 	}
    199 
    200 	while ((dio = AVL_NEXT(tree, lio)) != NULL && IS_ADJACENT(lio, dio) &&
    201 	    size + dio->io_size <= zfs_vdev_aggregation_limit) {
    202 		lio->io_delegate_next = dio;
    203 		lio = dio;
    204 		size += dio->io_size;
    205 	}
    206 
    207 	if (fio != lio) {
    208 		char *buf = zio_buf_alloc(size);
    209 		uint64_t offset = 0;
    210 		int nagg = 0;
    211 
    212 		ASSERT(size <= zfs_vdev_aggregation_limit);
    213 
    214 		aio = zio_vdev_child_io(fio, NULL, fio->io_vd,
    215 		    fio->io_offset, buf, size, fio->io_type,
    216 		    ZIO_PRIORITY_NOW, ZIO_FLAG_DONT_QUEUE |
    217 		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE |
    218 		    ZIO_FLAG_NOBOOKMARK,
    219 		    vdev_queue_agg_io_done, NULL);
    220 
    221 		aio->io_delegate_list = fio;
    222 
    223 		for (dio = fio; dio != NULL; dio = dio->io_delegate_next) {
    224 			ASSERT(dio->io_type == aio->io_type);
    225 			ASSERT(dio->io_vdev_tree == tree);
    226 			if (dio->io_type == ZIO_TYPE_WRITE)
    227 				bcopy(dio->io_data, buf + offset, dio->io_size);
    228 			offset += dio->io_size;
    229 			vdev_queue_io_remove(vq, dio);
    230 			zio_vdev_io_bypass(dio);
    231 			nagg++;
    232 		}
    233 
    234 		ASSERT(offset == size);
    235 
    236 		dprintf("%5s  T=%llu  off=%8llx  agg=%3d  "
    237 		    "old=%5llx  new=%5llx\n",
    238 		    zio_type_name[fio->io_type],
    239 		    fio->io_deadline, fio->io_offset, nagg, fio->io_size, size);
    240 
    241 		avl_add(&vq->vq_pending_tree, aio);
    242 
    243 		return (aio);
    244 	}
    245 
    246 	ASSERT(fio->io_vdev_tree == tree);
    247 	vdev_queue_io_remove(vq, fio);
    248 
    249 	avl_add(&vq->vq_pending_tree, fio);
    250 
    251 	return (fio);
    252 }
    253 
    254 zio_t *
    255 vdev_queue_io(zio_t *zio)
    256 {
    257 	vdev_queue_t *vq = &zio->io_vd->vdev_queue;
    258 	zio_t *nio;
    259 
    260 	ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
    261 
    262 	if (zio->io_flags & ZIO_FLAG_DONT_QUEUE)
    263 		return (zio);
    264 
    265 	zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;
    266 
    267 	if (zio->io_type == ZIO_TYPE_READ)
    268 		zio->io_vdev_tree = &vq->vq_read_tree;
    269 	else
    270 		zio->io_vdev_tree = &vq->vq_write_tree;
    271 
    272 	mutex_enter(&vq->vq_lock);
    273 
    274 	zio->io_deadline = (zio->io_timestamp >> zfs_vdev_time_shift) +
    275 	    zio->io_priority;
    276 
    277 	vdev_queue_io_add(vq, zio);
    278 
    279 	nio = vdev_queue_io_to_issue(vq, zfs_vdev_min_pending);
    280 
    281 	mutex_exit(&vq->vq_lock);
    282 
    283 	if (nio == NULL)
    284 		return (NULL);
    285 
    286 	if (nio->io_done == vdev_queue_agg_io_done) {
    287 		zio_nowait(nio);
    288 		return (NULL);
    289 	}
    290 
    291 	return (nio);
    292 }
    293 
    294 void
    295 vdev_queue_io_done(zio_t *zio)
    296 {
    297 	vdev_queue_t *vq = &zio->io_vd->vdev_queue;
    298 	zio_t *nio;
    299 	int i;
    300 
    301 	mutex_enter(&vq->vq_lock);
    302 
    303 	avl_remove(&vq->vq_pending_tree, zio);
    304 
    305 	for (i = 0; i < zfs_vdev_ramp_rate; i++) {
    306 		nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending);
    307 		if (nio == NULL)
    308 			break;
    309 		mutex_exit(&vq->vq_lock);
    310 		if (nio->io_done == vdev_queue_agg_io_done) {
    311 			zio_nowait(nio);
    312 		} else {
    313 			zio_vdev_io_reissue(nio);
    314 			zio_execute(nio);
    315 		}
    316 		mutex_enter(&vq->vq_lock);
    317 	}
    318 
    319 	mutex_exit(&vq->vq_lock);
    320 }
    321