Home | History | Annotate | Download | only in zfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #include <sys/zfs_context.h>
     27 #include <sys/spa_impl.h>
     28 #include <sys/dmu.h>
     29 #include <sys/dmu_tx.h>
     30 #include <sys/space_map.h>
     31 #include <sys/metaslab_impl.h>
     32 #include <sys/vdev_impl.h>
     33 #include <sys/zio.h>
     34 
     35 uint64_t metaslab_aliquot = 512ULL << 10;
     36 uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */
     37 
     38 /*
     39  * Minimum size which forces the dynamic allocator to change
     40  * it's allocation strategy. Once the space map cannot satisfy
     41  * an allocation of this size then it switches to using more
     42  * aggressive strategy (i.e search by size rather than offset).
     43  */
     44 uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE;
     45 
     46 /*
     47  * The minimum free space, in percent, which must be available
     48  * in a space map to continue allocations in a first-fit fashion.
     49  * Once the space_map's free space drops below this level we dynamically
     50  * switch to using best-fit allocations.
     51  */
     52 int metaslab_df_free_pct = 30;
     53 
     54 /*
     55  * ==========================================================================
     56  * Metaslab classes
     57  * ==========================================================================
     58  */
     59 metaslab_class_t *
     60 metaslab_class_create(space_map_ops_t *ops)
     61 {
     62 	metaslab_class_t *mc;
     63 
     64 	mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
     65 
     66 	mc->mc_rotor = NULL;
     67 	mc->mc_ops = ops;
     68 
     69 	return (mc);
     70 }
     71 
     72 void
     73 metaslab_class_destroy(metaslab_class_t *mc)
     74 {
     75 	metaslab_group_t *mg;
     76 
     77 	while ((mg = mc->mc_rotor) != NULL) {
     78 		metaslab_class_remove(mc, mg);
     79 		metaslab_group_destroy(mg);
     80 	}
     81 
     82 	kmem_free(mc, sizeof (metaslab_class_t));
     83 }
     84 
     85 void
     86 metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg)
     87 {
     88 	metaslab_group_t *mgprev, *mgnext;
     89 
     90 	ASSERT(mg->mg_class == NULL);
     91 
     92 	if ((mgprev = mc->mc_rotor) == NULL) {
     93 		mg->mg_prev = mg;
     94 		mg->mg_next = mg;
     95 	} else {
     96 		mgnext = mgprev->mg_next;
     97 		mg->mg_prev = mgprev;
     98 		mg->mg_next = mgnext;
     99 		mgprev->mg_next = mg;
    100 		mgnext->mg_prev = mg;
    101 	}
    102 	mc->mc_rotor = mg;
    103 	mg->mg_class = mc;
    104 }
    105 
    106 void
    107 metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg)
    108 {
    109 	metaslab_group_t *mgprev, *mgnext;
    110 
    111 	ASSERT(mg->mg_class == mc);
    112 
    113 	mgprev = mg->mg_prev;
    114 	mgnext = mg->mg_next;
    115 
    116 	if (mg == mgnext) {
    117 		mc->mc_rotor = NULL;
    118 	} else {
    119 		mc->mc_rotor = mgnext;
    120 		mgprev->mg_next = mgnext;
    121 		mgnext->mg_prev = mgprev;
    122 	}
    123 
    124 	mg->mg_prev = NULL;
    125 	mg->mg_next = NULL;
    126 	mg->mg_class = NULL;
    127 }
    128 
    129 /*
    130  * ==========================================================================
    131  * Metaslab groups
    132  * ==========================================================================
    133  */
    134 static int
    135 metaslab_compare(const void *x1, const void *x2)
    136 {
    137 	const metaslab_t *m1 = x1;
    138 	const metaslab_t *m2 = x2;
    139 
    140 	if (m1->ms_weight < m2->ms_weight)
    141 		return (1);
    142 	if (m1->ms_weight > m2->ms_weight)
    143 		return (-1);
    144 
    145 	/*
    146 	 * If the weights are identical, use the offset to force uniqueness.
    147 	 */
    148 	if (m1->ms_map.sm_start < m2->ms_map.sm_start)
    149 		return (-1);
    150 	if (m1->ms_map.sm_start > m2->ms_map.sm_start)
    151 		return (1);
    152 
    153 	ASSERT3P(m1, ==, m2);
    154 
    155 	return (0);
    156 }
    157 
    158 metaslab_group_t *
    159 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
    160 {
    161 	metaslab_group_t *mg;
    162 
    163 	mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
    164 	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
    165 	avl_create(&mg->mg_metaslab_tree, metaslab_compare,
    166 	    sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
    167 	mg->mg_aliquot = metaslab_aliquot * MAX(1, vd->vdev_children);
    168 	mg->mg_vd = vd;
    169 	metaslab_class_add(mc, mg);
    170 
    171 	return (mg);
    172 }
    173 
    174 void
    175 metaslab_group_destroy(metaslab_group_t *mg)
    176 {
    177 	avl_destroy(&mg->mg_metaslab_tree);
    178 	mutex_destroy(&mg->mg_lock);
    179 	kmem_free(mg, sizeof (metaslab_group_t));
    180 }
    181 
    182 static void
    183 metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
    184 {
    185 	mutex_enter(&mg->mg_lock);
    186 	ASSERT(msp->ms_group == NULL);
    187 	msp->ms_group = mg;
    188 	msp->ms_weight = 0;
    189 	avl_add(&mg->mg_metaslab_tree, msp);
    190 	mutex_exit(&mg->mg_lock);
    191 }
    192 
    193 static void
    194 metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
    195 {
    196 	mutex_enter(&mg->mg_lock);
    197 	ASSERT(msp->ms_group == mg);
    198 	avl_remove(&mg->mg_metaslab_tree, msp);
    199 	msp->ms_group = NULL;
    200 	mutex_exit(&mg->mg_lock);
    201 }
    202 
    203 static void
    204 metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
    205 {
    206 	/*
    207 	 * Although in principle the weight can be any value, in
    208 	 * practice we do not use values in the range [1, 510].
    209 	 */
    210 	ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0);
    211 	ASSERT(MUTEX_HELD(&msp->ms_lock));
    212 
    213 	mutex_enter(&mg->mg_lock);
    214 	ASSERT(msp->ms_group == mg);
    215 	avl_remove(&mg->mg_metaslab_tree, msp);
    216 	msp->ms_weight = weight;
    217 	avl_add(&mg->mg_metaslab_tree, msp);
    218 	mutex_exit(&mg->mg_lock);
    219 }
    220 
    221 /*
    222  * This is a helper function that can be used by the allocator to find
    223  * a suitable block to allocate. This will search the specified AVL
    224  * tree looking for a block that matches the specified criteria.
    225  */
    226 static uint64_t
    227 metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
    228     uint64_t align)
    229 {
    230 	space_seg_t *ss, ssearch;
    231 	avl_index_t where;
    232 
    233 	ssearch.ss_start = *cursor;
    234 	ssearch.ss_end = *cursor + size;
    235 
    236 	ss = avl_find(t, &ssearch, &where);
    237 	if (ss == NULL)
    238 		ss = avl_nearest(t, where, AVL_AFTER);
    239 
    240 	while (ss != NULL) {
    241 		uint64_t offset = P2ROUNDUP(ss->ss_start, align);
    242 
    243 		if (offset + size <= ss->ss_end) {
    244 			*cursor = offset + size;
    245 			return (offset);
    246 		}
    247 		ss = AVL_NEXT(t, ss);
    248 	}
    249 
    250 	/*
    251 	 * If we know we've searched the whole map (*cursor == 0), give up.
    252 	 * Otherwise, reset the cursor to the beginning and try again.
    253 	 */
    254 	if (*cursor == 0)
    255 		return (-1ULL);
    256 
    257 	*cursor = 0;
    258 	return (metaslab_block_picker(t, cursor, size, align));
    259 }
    260 
    261 /*
    262  * ==========================================================================
    263  * The first-fit block allocator
    264  * ==========================================================================
    265  */
    266 static void
    267 metaslab_ff_load(space_map_t *sm)
    268 {
    269 	ASSERT(sm->sm_ppd == NULL);
    270 	sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
    271 	sm->sm_pp_root = NULL;
    272 }
    273 
    274 static void
    275 metaslab_ff_unload(space_map_t *sm)
    276 {
    277 	kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
    278 	sm->sm_ppd = NULL;
    279 }
    280 
    281 static uint64_t
    282 metaslab_ff_alloc(space_map_t *sm, uint64_t size)
    283 {
    284 	avl_tree_t *t = &sm->sm_root;
    285 	uint64_t align = size & -size;
    286 	uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
    287 
    288 	return (metaslab_block_picker(t, cursor, size, align));
    289 }
    290 
    291 /* ARGSUSED */
    292 static void
    293 metaslab_ff_claim(space_map_t *sm, uint64_t start, uint64_t size)
    294 {
    295 	/* No need to update cursor */
    296 }
    297 
    298 /* ARGSUSED */
    299 static void
    300 metaslab_ff_free(space_map_t *sm, uint64_t start, uint64_t size)
    301 {
    302 	/* No need to update cursor */
    303 }
    304 
    305 static space_map_ops_t metaslab_ff_ops = {
    306 	metaslab_ff_load,
    307 	metaslab_ff_unload,
    308 	metaslab_ff_alloc,
    309 	metaslab_ff_claim,
    310 	metaslab_ff_free,
    311 	NULL	/* maxsize */
    312 };
    313 
    314 /*
    315  * Dynamic block allocator -
    316  * Uses the first fit allocation scheme until space get low and then
    317  * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
    318  * and metaslab_df_free_pct to determine when to switch the allocation scheme.
    319  */
    320 
    321 uint64_t
    322 metaslab_df_maxsize(space_map_t *sm)
    323 {
    324 	avl_tree_t *t = sm->sm_pp_root;
    325 	space_seg_t *ss;
    326 
    327 	if (t == NULL || (ss = avl_last(t)) == NULL)
    328 		return (0ULL);
    329 
    330 	return (ss->ss_end - ss->ss_start);
    331 }
    332 
    333 static int
    334 metaslab_df_seg_compare(const void *x1, const void *x2)
    335 {
    336 	const space_seg_t *s1 = x1;
    337 	const space_seg_t *s2 = x2;
    338 	uint64_t ss_size1 = s1->ss_end - s1->ss_start;
    339 	uint64_t ss_size2 = s2->ss_end - s2->ss_start;
    340 
    341 	if (ss_size1 < ss_size2)
    342 		return (-1);
    343 	if (ss_size1 > ss_size2)
    344 		return (1);
    345 
    346 	if (s1->ss_start < s2->ss_start)
    347 		return (-1);
    348 	if (s1->ss_start > s2->ss_start)
    349 		return (1);
    350 
    351 	return (0);
    352 }
    353 
    354 static void
    355 metaslab_df_load(space_map_t *sm)
    356 {
    357 	space_seg_t *ss;
    358 
    359 	ASSERT(sm->sm_ppd == NULL);
    360 	sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
    361 
    362 	sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
    363 	avl_create(sm->sm_pp_root, metaslab_df_seg_compare,
    364 	    sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node));
    365 
    366 	for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
    367 		avl_add(sm->sm_pp_root, ss);
    368 }
    369 
    370 static void
    371 metaslab_df_unload(space_map_t *sm)
    372 {
    373 	void *cookie = NULL;
    374 
    375 	kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
    376 	sm->sm_ppd = NULL;
    377 
    378 	while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) {
    379 		/* tear down the tree */
    380 	}
    381 
    382 	avl_destroy(sm->sm_pp_root);
    383 	kmem_free(sm->sm_pp_root, sizeof (avl_tree_t));
    384 	sm->sm_pp_root = NULL;
    385 }
    386 
    387 static uint64_t
    388 metaslab_df_alloc(space_map_t *sm, uint64_t size)
    389 {
    390 	avl_tree_t *t = &sm->sm_root;
    391 	uint64_t align = size & -size;
    392 	uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
    393 	uint64_t max_size = metaslab_df_maxsize(sm);
    394 	int free_pct = sm->sm_space * 100 / sm->sm_size;
    395 
    396 	ASSERT(MUTEX_HELD(sm->sm_lock));
    397 	ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
    398 
    399 	if (max_size < size)
    400 		return (-1ULL);
    401 
    402 	/*
    403 	 * If we're running low on space switch to using the size
    404 	 * sorted AVL tree (best-fit).
    405 	 */
    406 	if (max_size < metaslab_df_alloc_threshold ||
    407 	    free_pct < metaslab_df_free_pct) {
    408 		t = sm->sm_pp_root;
    409 		*cursor = 0;
    410 	}
    411 
    412 	return (metaslab_block_picker(t, cursor, size, 1ULL));
    413 }
    414 
    415 /* ARGSUSED */
    416 static void
    417 metaslab_df_claim(space_map_t *sm, uint64_t start, uint64_t size)
    418 {
    419 	/* No need to update cursor */
    420 }
    421 
    422 /* ARGSUSED */
    423 static void
    424 metaslab_df_free(space_map_t *sm, uint64_t start, uint64_t size)
    425 {
    426 	/* No need to update cursor */
    427 }
    428 
    429 static space_map_ops_t metaslab_df_ops = {
    430 	metaslab_df_load,
    431 	metaslab_df_unload,
    432 	metaslab_df_alloc,
    433 	metaslab_df_claim,
    434 	metaslab_df_free,
    435 	metaslab_df_maxsize
    436 };
    437 
    438 space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
    439 
    440 /*
    441  * ==========================================================================
    442  * Metaslabs
    443  * ==========================================================================
    444  */
    445 metaslab_t *
    446 metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
    447 	uint64_t start, uint64_t size, uint64_t txg)
    448 {
    449 	vdev_t *vd = mg->mg_vd;
    450 	metaslab_t *msp;
    451 
    452 	msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
    453 	mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL);
    454 
    455 	msp->ms_smo_syncing = *smo;
    456 
    457 	/*
    458 	 * We create the main space map here, but we don't create the
    459 	 * allocmaps and freemaps until metaslab_sync_done().  This serves
    460 	 * two purposes: it allows metaslab_sync_done() to detect the
    461 	 * addition of new space; and for debugging, it ensures that we'd
    462 	 * data fault on any attempt to use this metaslab before it's ready.
    463 	 */
    464 	space_map_create(&msp->ms_map, start, size,
    465 	    vd->vdev_ashift, &msp->ms_lock);
    466 
    467 	metaslab_group_add(mg, msp);
    468 
    469 	/*
    470 	 * If we're opening an existing pool (txg == 0) or creating
    471 	 * a new one (txg == TXG_INITIAL), all space is available now.
    472 	 * If we're adding space to an existing pool, the new space
    473 	 * does not become available until after this txg has synced.
    474 	 */
    475 	if (txg <= TXG_INITIAL)
    476 		metaslab_sync_done(msp, 0);
    477 
    478 	if (txg != 0) {
    479 		/*
    480 		 * The vdev is dirty, but the metaslab isn't -- it just needs
    481 		 * to have metaslab_sync_done() invoked from vdev_sync_done().
    482 		 * [We could just dirty the metaslab, but that would cause us
    483 		 * to allocate a space map object for it, which is wasteful
    484 		 * and would mess up the locality logic in metaslab_weight().]
    485 		 */
    486 		ASSERT(TXG_CLEAN(txg) == spa_last_synced_txg(vd->vdev_spa));
    487 		vdev_dirty(vd, 0, NULL, txg);
    488 		vdev_dirty(vd, VDD_METASLAB, msp, TXG_CLEAN(txg));
    489 	}
    490 
    491 	return (msp);
    492 }
    493 
    494 void
    495 metaslab_fini(metaslab_t *msp)
    496 {
    497 	metaslab_group_t *mg = msp->ms_group;
    498 	int t;
    499 
    500 	vdev_space_update(mg->mg_vd, -msp->ms_map.sm_size,
    501 	    -msp->ms_smo.smo_alloc, B_TRUE);
    502 
    503 	metaslab_group_remove(mg, msp);
    504 
    505 	mutex_enter(&msp->ms_lock);
    506 
    507 	space_map_unload(&msp->ms_map);
    508 	space_map_destroy(&msp->ms_map);
    509 
    510 	for (t = 0; t < TXG_SIZE; t++) {
    511 		space_map_destroy(&msp->ms_allocmap[t]);
    512 		space_map_destroy(&msp->ms_freemap[t]);
    513 	}
    514 
    515 	mutex_exit(&msp->ms_lock);
    516 	mutex_destroy(&msp->ms_lock);
    517 
    518 	kmem_free(msp, sizeof (metaslab_t));
    519 }
    520 
    521 #define	METASLAB_WEIGHT_PRIMARY		(1ULL << 63)
    522 #define	METASLAB_WEIGHT_SECONDARY	(1ULL << 62)
    523 #define	METASLAB_ACTIVE_MASK		\
    524 	(METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
    525 #define	METASLAB_SMO_BONUS_MULTIPLIER	2
    526 
    527 static uint64_t
    528 metaslab_weight(metaslab_t *msp)
    529 {
    530 	metaslab_group_t *mg = msp->ms_group;
    531 	space_map_t *sm = &msp->ms_map;
    532 	space_map_obj_t *smo = &msp->ms_smo;
    533 	vdev_t *vd = mg->mg_vd;
    534 	uint64_t weight, space;
    535 
    536 	ASSERT(MUTEX_HELD(&msp->ms_lock));
    537 
    538 	/*
    539 	 * The baseline weight is the metaslab's free space.
    540 	 */
    541 	space = sm->sm_size - smo->smo_alloc;
    542 	weight = space;
    543 
    544 	/*
    545 	 * Modern disks have uniform bit density and constant angular velocity.
    546 	 * Therefore, the outer recording zones are faster (higher bandwidth)
    547 	 * than the inner zones by the ratio of outer to inner track diameter,
    548 	 * which is typically around 2:1.  We account for this by assigning
    549 	 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
    550 	 * In effect, this means that we'll select the metaslab with the most
    551 	 * free bandwidth rather than simply the one with the most free space.
    552 	 */
    553 	weight = 2 * weight -
    554 	    ((sm->sm_start >> vd->vdev_ms_shift) * weight) / vd->vdev_ms_count;
    555 	ASSERT(weight >= space && weight <= 2 * space);
    556 
    557 	/*
    558 	 * For locality, assign higher weight to metaslabs we've used before.
    559 	 */
    560 	if (smo->smo_object != 0)
    561 		weight *= METASLAB_SMO_BONUS_MULTIPLIER;
    562 	ASSERT(weight >= space &&
    563 	    weight <= 2 * METASLAB_SMO_BONUS_MULTIPLIER * space);
    564 
    565 	/*
    566 	 * If this metaslab is one we're actively using, adjust its weight to
    567 	 * make it preferable to any inactive metaslab so we'll polish it off.
    568 	 */
    569 	weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
    570 
    571 	return (weight);
    572 }
    573 
    574 static int
    575 metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size)
    576 {
    577 	space_map_t *sm = &msp->ms_map;
    578 	space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops;
    579 
    580 	ASSERT(MUTEX_HELD(&msp->ms_lock));
    581 
    582 	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
    583 		int error = space_map_load(sm, sm_ops, SM_FREE, &msp->ms_smo,
    584 		    msp->ms_group->mg_vd->vdev_spa->spa_meta_objset);
    585 		if (error) {
    586 			metaslab_group_sort(msp->ms_group, msp, 0);
    587 			return (error);
    588 		}
    589 
    590 		/*
    591 		 * If we were able to load the map then make sure
    592 		 * that this map is still able to satisfy our request.
    593 		 */
    594 		if (msp->ms_weight < size)
    595 			return (ENOSPC);
    596 
    597 		metaslab_group_sort(msp->ms_group, msp,
    598 		    msp->ms_weight | activation_weight);
    599 	}
    600 	ASSERT(sm->sm_loaded);
    601 	ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
    602 
    603 	return (0);
    604 }
    605 
    606 static void
    607 metaslab_passivate(metaslab_t *msp, uint64_t size)
    608 {
    609 	/*
    610 	 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
    611 	 * this metaslab again.  In that case, it had better be empty,
    612 	 * or we would be leaving space on the table.
    613 	 */
    614 	ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map.sm_space == 0);
    615 	metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size));
    616 	ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
    617 }
    618 
    619 /*
    620  * Write a metaslab to disk in the context of the specified transaction group.
    621  */
    622 void
    623 metaslab_sync(metaslab_t *msp, uint64_t txg)
    624 {
    625 	vdev_t *vd = msp->ms_group->mg_vd;
    626 	spa_t *spa = vd->vdev_spa;
    627 	objset_t *mos = spa->spa_meta_objset;
    628 	space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK];
    629 	space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK];
    630 	space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
    631 	space_map_t *sm = &msp->ms_map;
    632 	space_map_obj_t *smo = &msp->ms_smo_syncing;
    633 	dmu_buf_t *db;
    634 	dmu_tx_t *tx;
    635 	int t;
    636 
    637 	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
    638 
    639 	/*
    640 	 * The only state that can actually be changing concurrently with
    641 	 * metaslab_sync() is the metaslab's ms_map.  No other thread can
    642 	 * be modifying this txg's allocmap, freemap, freed_map, or smo.
    643 	 * Therefore, we only hold ms_lock to satify space_map ASSERTs.
    644 	 * We drop it whenever we call into the DMU, because the DMU
    645 	 * can call down to us (e.g. via zio_free()) at any time.
    646 	 */
    647 	mutex_enter(&msp->ms_lock);
    648 
    649 	if (smo->smo_object == 0) {
    650 		ASSERT(smo->smo_objsize == 0);
    651 		ASSERT(smo->smo_alloc == 0);
    652 		mutex_exit(&msp->ms_lock);
    653 		smo->smo_object = dmu_object_alloc(mos,
    654 		    DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
    655 		    DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
    656 		ASSERT(smo->smo_object != 0);
    657 		dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
    658 		    (sm->sm_start >> vd->vdev_ms_shift),
    659 		    sizeof (uint64_t), &smo->smo_object, tx);
    660 		mutex_enter(&msp->ms_lock);
    661 	}
    662 
    663 	space_map_walk(freemap, space_map_add, freed_map);
    664 
    665 	if (sm->sm_loaded && spa_sync_pass(spa) == 1 && smo->smo_objsize >=
    666 	    2 * sizeof (uint64_t) * avl_numnodes(&sm->sm_root)) {
    667 		/*
    668 		 * The in-core space map representation is twice as compact
    669 		 * as the on-disk one, so it's time to condense the latter
    670 		 * by generating a pure allocmap from first principles.
    671 		 *
    672 		 * This metaslab is 100% allocated,
    673 		 * minus the content of the in-core map (sm),
    674 		 * minus what's been freed this txg (freed_map),
    675 		 * minus allocations from txgs in the future
    676 		 * (because they haven't been committed yet).
    677 		 */
    678 		space_map_vacate(allocmap, NULL, NULL);
    679 		space_map_vacate(freemap, NULL, NULL);
    680 
    681 		space_map_add(allocmap, allocmap->sm_start, allocmap->sm_size);
    682 
    683 		space_map_walk(sm, space_map_remove, allocmap);
    684 		space_map_walk(freed_map, space_map_remove, allocmap);
    685 
    686 		for (t = 1; t < TXG_CONCURRENT_STATES; t++)
    687 			space_map_walk(&msp->ms_allocmap[(txg + t) & TXG_MASK],
    688 			    space_map_remove, allocmap);
    689 
    690 		mutex_exit(&msp->ms_lock);
    691 		space_map_truncate(smo, mos, tx);
    692 		mutex_enter(&msp->ms_lock);
    693 	}
    694 
    695 	space_map_sync(allocmap, SM_ALLOC, smo, mos, tx);
    696 	space_map_sync(freemap, SM_FREE, smo, mos, tx);
    697 
    698 	mutex_exit(&msp->ms_lock);
    699 
    700 	VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
    701 	dmu_buf_will_dirty(db, tx);
    702 	ASSERT3U(db->db_size, >=, sizeof (*smo));
    703 	bcopy(smo, db->db_data, sizeof (*smo));
    704 	dmu_buf_rele(db, FTAG);
    705 
    706 	dmu_tx_commit(tx);
    707 }
    708 
    709 /*
    710  * Called after a transaction group has completely synced to mark
    711  * all of the metaslab's free space as usable.
    712  */
    713 void
    714 metaslab_sync_done(metaslab_t *msp, uint64_t txg)
    715 {
    716 	space_map_obj_t *smo = &msp->ms_smo;
    717 	space_map_obj_t *smosync = &msp->ms_smo_syncing;
    718 	space_map_t *sm = &msp->ms_map;
    719 	space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
    720 	metaslab_group_t *mg = msp->ms_group;
    721 	vdev_t *vd = mg->mg_vd;
    722 	int t;
    723 
    724 	mutex_enter(&msp->ms_lock);
    725 
    726 	/*
    727 	 * If this metaslab is just becoming available, initialize its
    728 	 * allocmaps and freemaps and add its capacity to the vdev.
    729 	 */
    730 	if (freed_map->sm_size == 0) {
    731 		for (t = 0; t < TXG_SIZE; t++) {
    732 			space_map_create(&msp->ms_allocmap[t], sm->sm_start,
    733 			    sm->sm_size, sm->sm_shift, sm->sm_lock);
    734 			space_map_create(&msp->ms_freemap[t], sm->sm_start,
    735 			    sm->sm_size, sm->sm_shift, sm->sm_lock);
    736 		}
    737 		vdev_space_update(vd, sm->sm_size, 0, B_TRUE);
    738 	}
    739 
    740 	vdev_space_update(vd, 0, smosync->smo_alloc - smo->smo_alloc, B_TRUE);
    741 
    742 	ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0);
    743 	ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0);
    744 
    745 	/*
    746 	 * If there's a space_map_load() in progress, wait for it to complete
    747 	 * so that we have a consistent view of the in-core space map.
    748 	 * Then, add everything we freed in this txg to the map.
    749 	 */
    750 	space_map_load_wait(sm);
    751 	space_map_vacate(freed_map, sm->sm_loaded ? space_map_free : NULL, sm);
    752 
    753 	*smo = *smosync;
    754 
    755 	/*
    756 	 * If the map is loaded but no longer active, evict it as soon as all
    757 	 * future allocations have synced.  (If we unloaded it now and then
    758 	 * loaded a moment later, the map wouldn't reflect those allocations.)
    759 	 */
    760 	if (sm->sm_loaded && (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
    761 		int evictable = 1;
    762 
    763 		for (t = 1; t < TXG_CONCURRENT_STATES; t++)
    764 			if (msp->ms_allocmap[(txg + t) & TXG_MASK].sm_space)
    765 				evictable = 0;
    766 
    767 		if (evictable)
    768 			space_map_unload(sm);
    769 	}
    770 
    771 	metaslab_group_sort(mg, msp, metaslab_weight(msp));
    772 
    773 	mutex_exit(&msp->ms_lock);
    774 }
    775 
    776 static uint64_t
    777 metaslab_distance(metaslab_t *msp, dva_t *dva)
    778 {
    779 	uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
    780 	uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
    781 	uint64_t start = msp->ms_map.sm_start >> ms_shift;
    782 
    783 	if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
    784 		return (1ULL << 63);
    785 
    786 	if (offset < start)
    787 		return ((start - offset) << ms_shift);
    788 	if (offset > start)
    789 		return ((offset - start) << ms_shift);
    790 	return (0);
    791 }
    792 
    793 static uint64_t
    794 metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
    795     uint64_t min_distance, dva_t *dva, int d)
    796 {
    797 	metaslab_t *msp = NULL;
    798 	uint64_t offset = -1ULL;
    799 	avl_tree_t *t = &mg->mg_metaslab_tree;
    800 	uint64_t activation_weight;
    801 	uint64_t target_distance;
    802 	int i;
    803 
    804 	activation_weight = METASLAB_WEIGHT_PRIMARY;
    805 	for (i = 0; i < d; i++) {
    806 		if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
    807 			activation_weight = METASLAB_WEIGHT_SECONDARY;
    808 			break;
    809 		}
    810 	}
    811 
    812 	for (;;) {
    813 		boolean_t was_active;
    814 
    815 		mutex_enter(&mg->mg_lock);
    816 		for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
    817 			if (msp->ms_weight < size) {
    818 				mutex_exit(&mg->mg_lock);
    819 				return (-1ULL);
    820 			}
    821 
    822 			was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
    823 			if (activation_weight == METASLAB_WEIGHT_PRIMARY)
    824 				break;
    825 
    826 			target_distance = min_distance +
    827 			    (msp->ms_smo.smo_alloc ? 0 : min_distance >> 1);
    828 
    829 			for (i = 0; i < d; i++)
    830 				if (metaslab_distance(msp, &dva[i]) <
    831 				    target_distance)
    832 					break;
    833 			if (i == d)
    834 				break;
    835 		}
    836 		mutex_exit(&mg->mg_lock);
    837 		if (msp == NULL)
    838 			return (-1ULL);
    839 
    840 		mutex_enter(&msp->ms_lock);
    841 
    842 		/*
    843 		 * Ensure that the metaslab we have selected is still
    844 		 * capable of handling our request. It's possible that
    845 		 * another thread may have changed the weight while we
    846 		 * were blocked on the metaslab lock.
    847 		 */
    848 		if (msp->ms_weight < size || (was_active &&
    849 		    !(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
    850 		    activation_weight == METASLAB_WEIGHT_PRIMARY)) {
    851 			mutex_exit(&msp->ms_lock);
    852 			continue;
    853 		}
    854 
    855 		if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
    856 		    activation_weight == METASLAB_WEIGHT_PRIMARY) {
    857 			metaslab_passivate(msp,
    858 			    msp->ms_weight & ~METASLAB_ACTIVE_MASK);
    859 			mutex_exit(&msp->ms_lock);
    860 			continue;
    861 		}
    862 
    863 		if (metaslab_activate(msp, activation_weight, size) != 0) {
    864 			mutex_exit(&msp->ms_lock);
    865 			continue;
    866 		}
    867 
    868 		if ((offset = space_map_alloc(&msp->ms_map, size)) != -1ULL)
    869 			break;
    870 
    871 		metaslab_passivate(msp, size - 1);
    872 
    873 		mutex_exit(&msp->ms_lock);
    874 	}
    875 
    876 	if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
    877 		vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
    878 
    879 	space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
    880 
    881 	mutex_exit(&msp->ms_lock);
    882 
    883 	return (offset);
    884 }
    885 
    886 /*
    887  * Allocate a block for the specified i/o.
    888  */
    889 static int
    890 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
    891     dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags)
    892 {
    893 	metaslab_group_t *mg, *rotor;
    894 	vdev_t *vd;
    895 	int dshift = 3;
    896 	int all_zero;
    897 	int zio_lock = B_FALSE;
    898 	boolean_t allocatable;
    899 	uint64_t offset = -1ULL;
    900 	uint64_t asize;
    901 	uint64_t distance;
    902 
    903 	ASSERT(!DVA_IS_VALID(&dva[d]));
    904 
    905 	/*
    906 	 * For testing, make some blocks above a certain size be gang blocks.
    907 	 */
    908 	if (psize >= metaslab_gang_bang && (lbolt & 3) == 0)
    909 		return (ENOSPC);
    910 
    911 	/*
    912 	 * Start at the rotor and loop through all mgs until we find something.
    913 	 * Note that there's no locking on mc_rotor or mc_allocated because
    914 	 * nothing actually breaks if we miss a few updates -- we just won't
    915 	 * allocate quite as evenly.  It all balances out over time.
    916 	 *
    917 	 * If we are doing ditto or log blocks, try to spread them across
    918 	 * consecutive vdevs.  If we're forced to reuse a vdev before we've
    919 	 * allocated all of our ditto blocks, then try and spread them out on
    920 	 * that vdev as much as possible.  If it turns out to not be possible,
    921 	 * gradually lower our standards until anything becomes acceptable.
    922 	 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
    923 	 * gives us hope of containing our fault domains to something we're
    924 	 * able to reason about.  Otherwise, any two top-level vdev failures
    925 	 * will guarantee the loss of data.  With consecutive allocation,
    926 	 * only two adjacent top-level vdev failures will result in data loss.
    927 	 *
    928 	 * If we are doing gang blocks (hintdva is non-NULL), try to keep
    929 	 * ourselves on the same vdev as our gang block header.  That
    930 	 * way, we can hope for locality in vdev_cache, plus it makes our
    931 	 * fault domains something tractable.
    932 	 */
    933 	if (hintdva) {
    934 		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
    935 		if (flags & METASLAB_HINTBP_AVOID)
    936 			mg = vd->vdev_mg->mg_next;
    937 		else
    938 			mg = vd->vdev_mg;
    939 	} else if (d != 0) {
    940 		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
    941 		mg = vd->vdev_mg->mg_next;
    942 	} else {
    943 		mg = mc->mc_rotor;
    944 	}
    945 
    946 	/*
    947 	 * If the hint put us into the wrong class, just follow the rotor.
    948 	 */
    949 	if (mg->mg_class != mc)
    950 		mg = mc->mc_rotor;
    951 
    952 	rotor = mg;
    953 top:
    954 	all_zero = B_TRUE;
    955 	do {
    956 		vd = mg->mg_vd;
    957 
    958 		/*
    959 		 * Don't allocate from faulted devices.
    960 		 */
    961 		if (zio_lock) {
    962 			spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
    963 			allocatable = vdev_allocatable(vd);
    964 			spa_config_exit(spa, SCL_ZIO, FTAG);
    965 		} else {
    966 			allocatable = vdev_allocatable(vd);
    967 		}
    968 		if (!allocatable)
    969 			goto next;
    970 
    971 		/*
    972 		 * Avoid writing single-copy data to a failing vdev
    973 		 */
    974 		if ((vd->vdev_stat.vs_write_errors > 0 ||
    975 		    vd->vdev_state < VDEV_STATE_HEALTHY) &&
    976 		    d == 0 && dshift == 3) {
    977 			all_zero = B_FALSE;
    978 			goto next;
    979 		}
    980 
    981 		ASSERT(mg->mg_class == mc);
    982 
    983 		distance = vd->vdev_asize >> dshift;
    984 		if (distance <= (1ULL << vd->vdev_ms_shift))
    985 			distance = 0;
    986 		else
    987 			all_zero = B_FALSE;
    988 
    989 		asize = vdev_psize_to_asize(vd, psize);
    990 		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
    991 
    992 		offset = metaslab_group_alloc(mg, asize, txg, distance, dva, d);
    993 		if (offset != -1ULL) {
    994 			/*
    995 			 * If we've just selected this metaslab group,
    996 			 * figure out whether the corresponding vdev is
    997 			 * over- or under-used relative to the pool,
    998 			 * and set an allocation bias to even it out.
    999 			 */
   1000 			if (mc->mc_allocated == 0) {
   1001 				vdev_stat_t *vs = &vd->vdev_stat;
   1002 				uint64_t alloc, space;
   1003 				int64_t vu, su;
   1004 
   1005 				alloc = spa_get_alloc(spa);
   1006 				space = spa_get_space(spa);
   1007 
   1008 				/*
   1009 				 * Determine percent used in units of 0..1024.
   1010 				 * (This is just to avoid floating point.)
   1011 				 */
   1012 				vu = (vs->vs_alloc << 10) / (vs->vs_space + 1);
   1013 				su = (alloc << 10) / (space + 1);
   1014 
   1015 				/*
   1016 				 * Bias by at most +/- 25% of the aliquot.
   1017 				 */
   1018 				mg->mg_bias = ((su - vu) *
   1019 				    (int64_t)mg->mg_aliquot) / (1024 * 4);
   1020 			}
   1021 
   1022 			if (atomic_add_64_nv(&mc->mc_allocated, asize) >=
   1023 			    mg->mg_aliquot + mg->mg_bias) {
   1024 				mc->mc_rotor = mg->mg_next;
   1025 				mc->mc_allocated = 0;
   1026 			}
   1027 
   1028 			DVA_SET_VDEV(&dva[d], vd->vdev_id);
   1029 			DVA_SET_OFFSET(&dva[d], offset);
   1030 			DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
   1031 			DVA_SET_ASIZE(&dva[d], asize);
   1032 
   1033 			return (0);
   1034 		}
   1035 next:
   1036 		mc->mc_rotor = mg->mg_next;
   1037 		mc->mc_allocated = 0;
   1038 	} while ((mg = mg->mg_next) != rotor);
   1039 
   1040 	if (!all_zero) {
   1041 		dshift++;
   1042 		ASSERT(dshift < 64);
   1043 		goto top;
   1044 	}
   1045 
   1046 	if (!allocatable && !zio_lock) {
   1047 		dshift = 3;
   1048 		zio_lock = B_TRUE;
   1049 		goto top;
   1050 	}
   1051 
   1052 	bzero(&dva[d], sizeof (dva_t));
   1053 
   1054 	return (ENOSPC);
   1055 }
   1056 
   1057 /*
   1058  * Free the block represented by DVA in the context of the specified
   1059  * transaction group.
   1060  */
   1061 static void
   1062 metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)
   1063 {
   1064 	uint64_t vdev = DVA_GET_VDEV(dva);
   1065 	uint64_t offset = DVA_GET_OFFSET(dva);
   1066 	uint64_t size = DVA_GET_ASIZE(dva);
   1067 	vdev_t *vd;
   1068 	metaslab_t *msp;
   1069 
   1070 	ASSERT(DVA_IS_VALID(dva));
   1071 
   1072 	if (txg > spa_freeze_txg(spa))
   1073 		return;
   1074 
   1075 	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
   1076 	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
   1077 		cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
   1078 		    (u_longlong_t)vdev, (u_longlong_t)offset);
   1079 		ASSERT(0);
   1080 		return;
   1081 	}
   1082 
   1083 	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
   1084 
   1085 	if (DVA_GET_GANG(dva))
   1086 		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
   1087 
   1088 	mutex_enter(&msp->ms_lock);
   1089 
   1090 	if (now) {
   1091 		space_map_remove(&msp->ms_allocmap[txg & TXG_MASK],
   1092 		    offset, size);
   1093 		space_map_free(&msp->ms_map, offset, size);
   1094 	} else {
   1095 		if (msp->ms_freemap[txg & TXG_MASK].sm_space == 0)
   1096 			vdev_dirty(vd, VDD_METASLAB, msp, txg);
   1097 		space_map_add(&msp->ms_freemap[txg & TXG_MASK], offset, size);
   1098 	}
   1099 
   1100 	mutex_exit(&msp->ms_lock);
   1101 }
   1102 
   1103 /*
   1104  * Intent log support: upon opening the pool after a crash, notify the SPA
   1105  * of blocks that the intent log has allocated for immediate write, but
   1106  * which are still considered free by the SPA because the last transaction
   1107  * group didn't commit yet.
   1108  */
   1109 static int
   1110 metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
   1111 {
   1112 	uint64_t vdev = DVA_GET_VDEV(dva);
   1113 	uint64_t offset = DVA_GET_OFFSET(dva);
   1114 	uint64_t size = DVA_GET_ASIZE(dva);
   1115 	vdev_t *vd;
   1116 	metaslab_t *msp;
   1117 	int error;
   1118 
   1119 	ASSERT(DVA_IS_VALID(dva));
   1120 
   1121 	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
   1122 	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
   1123 		return (ENXIO);
   1124 
   1125 	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
   1126 
   1127 	if (DVA_GET_GANG(dva))
   1128 		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
   1129 
   1130 	mutex_enter(&msp->ms_lock);
   1131 
   1132 	error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY, 0);
   1133 	if (error || txg == 0) {	/* txg == 0 indicates dry run */
   1134 		mutex_exit(&msp->ms_lock);
   1135 		return (error);
   1136 	}
   1137 
   1138 	space_map_claim(&msp->ms_map, offset, size);
   1139 
   1140 	if (spa_writeable(spa)) {	/* don't dirty if we're zdb(1M) */
   1141 		if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
   1142 			vdev_dirty(vd, VDD_METASLAB, msp, txg);
   1143 		space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
   1144 	}
   1145 
   1146 	mutex_exit(&msp->ms_lock);
   1147 
   1148 	return (0);
   1149 }
   1150 
   1151 int
   1152 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
   1153     int ndvas, uint64_t txg, blkptr_t *hintbp, int flags)
   1154 {
   1155 	dva_t *dva = bp->blk_dva;
   1156 	dva_t *hintdva = hintbp->blk_dva;
   1157 	int error = 0;
   1158 
   1159 	ASSERT(bp->blk_birth == 0);
   1160 
   1161 	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
   1162 
   1163 	if (mc->mc_rotor == NULL) {	/* no vdevs in this class */
   1164 		spa_config_exit(spa, SCL_ALLOC, FTAG);
   1165 		return (ENOSPC);
   1166 	}
   1167 
   1168 	ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
   1169 	ASSERT(BP_GET_NDVAS(bp) == 0);
   1170 	ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
   1171 
   1172 	for (int d = 0; d < ndvas; d++) {
   1173 		error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
   1174 		    txg, flags);
   1175 		if (error) {
   1176 			for (d--; d >= 0; d--) {
   1177 				metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
   1178 				bzero(&dva[d], sizeof (dva_t));
   1179 			}
   1180 			spa_config_exit(spa, SCL_ALLOC, FTAG);
   1181 			return (error);
   1182 		}
   1183 	}
   1184 	ASSERT(error == 0);
   1185 	ASSERT(BP_GET_NDVAS(bp) == ndvas);
   1186 
   1187 	spa_config_exit(spa, SCL_ALLOC, FTAG);
   1188 
   1189 	bp->blk_birth = txg;
   1190 
   1191 	return (0);
   1192 }
   1193 
   1194 void
   1195 metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
   1196 {
   1197 	const dva_t *dva = bp->blk_dva;
   1198 	int ndvas = BP_GET_NDVAS(bp);
   1199 
   1200 	ASSERT(!BP_IS_HOLE(bp));
   1201 	ASSERT(!now || bp->blk_birth >= spa->spa_syncing_txg);
   1202 
   1203 	spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
   1204 
   1205 	for (int d = 0; d < ndvas; d++)
   1206 		metaslab_free_dva(spa, &dva[d], txg, now);
   1207 
   1208 	spa_config_exit(spa, SCL_FREE, FTAG);
   1209 }
   1210 
   1211 int
   1212 metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
   1213 {
   1214 	const dva_t *dva = bp->blk_dva;
   1215 	int ndvas = BP_GET_NDVAS(bp);
   1216 	int error = 0;
   1217 
   1218 	ASSERT(!BP_IS_HOLE(bp));
   1219 
   1220 	if (txg != 0) {
   1221 		/*
   1222 		 * First do a dry run to make sure all DVAs are claimable,
   1223 		 * so we don't have to unwind from partial failures below.
   1224 		 */
   1225 		if ((error = metaslab_claim(spa, bp, 0)) != 0)
   1226 			return (error);
   1227 	}
   1228 
   1229 	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
   1230 
   1231 	for (int d = 0; d < ndvas; d++)
   1232 		if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
   1233 			break;
   1234 
   1235 	spa_config_exit(spa, SCL_ALLOC, FTAG);
   1236 
   1237 	ASSERT(error == 0 || txg == 0);
   1238 
   1239 	return (error);
   1240 }
   1241