Home | History | Annotate | Download | only in zfs
      1    789    ahrens /*
      2    789    ahrens  * CDDL HEADER START
      3    789    ahrens  *
      4    789    ahrens  * The contents of this file are subject to the terms of the
      5   1544  eschrock  * Common Development and Distribution License (the "License").
      6   1544  eschrock  * You may not use this file except in compliance with the License.
      7    789    ahrens  *
      8    789    ahrens  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9    789    ahrens  * or http://www.opensolaris.org/os/licensing.
     10    789    ahrens  * See the License for the specific language governing permissions
     11    789    ahrens  * and limitations under the License.
     12    789    ahrens  *
     13    789    ahrens  * When distributing Covered Code, include this CDDL HEADER in each
     14    789    ahrens  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15    789    ahrens  * If applicable, add the following below this CDDL HEADER, with the
     16    789    ahrens  * fields enclosed by brackets "[]" replaced with your own identifying
     17    789    ahrens  * information: Portions Copyright [yyyy] [name of copyright owner]
     18    789    ahrens  *
     19    789    ahrens  * CDDL HEADER END
     20    789    ahrens  */
     21    789    ahrens /*
     22   9480    George  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23    789    ahrens  * Use is subject to license terms.
     24    789    ahrens  */
     25    789    ahrens 
     26    789    ahrens #include <sys/zfs_context.h>
     27    789    ahrens #include <sys/dmu.h>
     28    789    ahrens #include <sys/dmu_tx.h>
     29    789    ahrens #include <sys/space_map.h>
     30    789    ahrens #include <sys/metaslab_impl.h>
     31    789    ahrens #include <sys/vdev_impl.h>
     32    789    ahrens #include <sys/zio.h>
     33   2391    maybee 
     34   2391    maybee uint64_t metaslab_aliquot = 512ULL << 10;
     35   5530   bonwick uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */
     36  10922      Jeff 
     37  10922      Jeff /*
     38  10922      Jeff  * Metaslab debugging: when set, keeps all space maps in core to verify frees.
     39  10922      Jeff  */
     40  10922      Jeff static int metaslab_debug = 0;
     41    789    ahrens 
     42    789    ahrens /*
     43   9480    George  * Minimum size which forces the dynamic allocator to change
     44  11146    George  * it's allocation strategy.  Once the space map cannot satisfy
     45   9480    George  * an allocation of this size then it switches to using more
     46   9480    George  * aggressive strategy (i.e search by size rather than offset).
     47   9480    George  */
     48   9480    George uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE;
     49   9480    George 
     50   9480    George /*
     51   9480    George  * The minimum free space, in percent, which must be available
     52   9480    George  * in a space map to continue allocations in a first-fit fashion.
     53   9480    George  * Once the space_map's free space drops below this level we dynamically
     54   9480    George  * switch to using best-fit allocations.
     55   9480    George  */
     56  11146    George int metaslab_df_free_pct = 4;
     57  11146    George 
     58  11146    George /*
     59  11146    George  * A metaslab is considered "free" if it contains a contiguous
     60  11146    George  * segment which is greater than metaslab_min_alloc_size.
     61  11146    George  */
     62  11146    George uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
     63  11146    George 
     64  11146    George /*
     65  11146    George  * Max number of space_maps to prefetch.
     66  11146    George  */
     67  11146    George int metaslab_prefetch_limit = SPA_DVAS_PER_BP;
     68  11146    George 
     69  11146    George /*
     70  11146    George  * Percentage bonus multiplier for metaslabs that are in the bonus area.
     71  11146    George  */
     72  11146    George int metaslab_smo_bonus_pct = 150;
     73   9480    George 
     74   9480    George /*
     75    789    ahrens  * ==========================================================================
     76    789    ahrens  * Metaslab classes
     77    789    ahrens  * ==========================================================================
     78    789    ahrens  */
     79    789    ahrens metaslab_class_t *
     80  10594    George metaslab_class_create(spa_t *spa, space_map_ops_t *ops)
     81    789    ahrens {
     82    789    ahrens 	metaslab_class_t *mc;
     83    789    ahrens 
     84    789    ahrens 	mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
     85    789    ahrens 
     86  10594    George 	mc->mc_spa = spa;
     87    789    ahrens 	mc->mc_rotor = NULL;
     88   9480    George 	mc->mc_ops = ops;
     89    789    ahrens 
     90    789    ahrens 	return (mc);
     91    789    ahrens }
     92    789    ahrens 
     93    789    ahrens void
     94    789    ahrens metaslab_class_destroy(metaslab_class_t *mc)
     95    789    ahrens {
     96  10974      Jeff 	ASSERT(mc->mc_rotor == NULL);
     97  10974      Jeff 	ASSERT(mc->mc_alloc == 0);
     98  10974      Jeff 	ASSERT(mc->mc_deferred == 0);
     99  10974      Jeff 	ASSERT(mc->mc_space == 0);
    100  10974      Jeff 	ASSERT(mc->mc_dspace == 0);
    101    789    ahrens 
    102    789    ahrens 	kmem_free(mc, sizeof (metaslab_class_t));
    103  10594    George }
    104  10594    George 
    105  10594    George int
    106  10594    George metaslab_class_validate(metaslab_class_t *mc)
    107  10594    George {
    108  10594    George 	metaslab_group_t *mg;
    109  10594    George 	vdev_t *vd;
    110  10594    George 
    111  10594    George 	/*
    112  10594    George 	 * Must hold one of the spa_config locks.
    113  10594    George 	 */
    114  10594    George 	ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
    115  10594    George 	    spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
    116  10594    George 
    117  10594    George 	if ((mg = mc->mc_rotor) == NULL)
    118  10594    George 		return (0);
    119  10594    George 
    120  10594    George 	do {
    121  10594    George 		vd = mg->mg_vd;
    122  10594    George 		ASSERT(vd->vdev_mg != NULL);
    123  10594    George 		ASSERT3P(vd->vdev_top, ==, vd);
    124  10594    George 		ASSERT3P(mg->mg_class, ==, mc);
    125  10594    George 		ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
    126  10594    George 	} while ((mg = mg->mg_next) != mc->mc_rotor);
    127  10594    George 
    128  10594    George 	return (0);
    129  10922      Jeff }
    130  10922      Jeff 
    131  10922      Jeff void
    132  10922      Jeff metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
    133  10922      Jeff     int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
    134  10922      Jeff {
    135  10922      Jeff 	atomic_add_64(&mc->mc_alloc, alloc_delta);
    136  10922      Jeff 	atomic_add_64(&mc->mc_deferred, defer_delta);
    137  10922      Jeff 	atomic_add_64(&mc->mc_space, space_delta);
    138  10922      Jeff 	atomic_add_64(&mc->mc_dspace, dspace_delta);
    139  10922      Jeff }
    140  10922      Jeff 
    141  10922      Jeff uint64_t
    142  10922      Jeff metaslab_class_get_alloc(metaslab_class_t *mc)
    143  10922      Jeff {
    144  10922      Jeff 	return (mc->mc_alloc);
    145  10922      Jeff }
    146  10922      Jeff 
    147  10922      Jeff uint64_t
    148  10922      Jeff metaslab_class_get_deferred(metaslab_class_t *mc)
    149  10922      Jeff {
    150  10922      Jeff 	return (mc->mc_deferred);
    151  10922      Jeff }
    152  10922      Jeff 
    153  10922      Jeff uint64_t
    154  10922      Jeff metaslab_class_get_space(metaslab_class_t *mc)
    155  10922      Jeff {
    156  10922      Jeff 	return (mc->mc_space);
    157  10922      Jeff }
    158  10922      Jeff 
    159  10922      Jeff uint64_t
    160  10922      Jeff metaslab_class_get_dspace(metaslab_class_t *mc)
    161  10922      Jeff {
    162  10922      Jeff 	return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
    163    789    ahrens }
    164    789    ahrens 
    165    789    ahrens /*
    166    789    ahrens  * ==========================================================================
    167    789    ahrens  * Metaslab groups
    168    789    ahrens  * ==========================================================================
    169    789    ahrens  */
    170    789    ahrens static int
    171    789    ahrens metaslab_compare(const void *x1, const void *x2)
    172    789    ahrens {
    173    789    ahrens 	const metaslab_t *m1 = x1;
    174    789    ahrens 	const metaslab_t *m2 = x2;
    175    789    ahrens 
    176    789    ahrens 	if (m1->ms_weight < m2->ms_weight)
    177    789    ahrens 		return (1);
    178    789    ahrens 	if (m1->ms_weight > m2->ms_weight)
    179    789    ahrens 		return (-1);
    180    789    ahrens 
    181    789    ahrens 	/*
    182    789    ahrens 	 * If the weights are identical, use the offset to force uniqueness.
    183    789    ahrens 	 */
    184    789    ahrens 	if (m1->ms_map.sm_start < m2->ms_map.sm_start)
    185    789    ahrens 		return (-1);
    186    789    ahrens 	if (m1->ms_map.sm_start > m2->ms_map.sm_start)
    187    789    ahrens 		return (1);
    188    789    ahrens 
    189    789    ahrens 	ASSERT3P(m1, ==, m2);
    190    789    ahrens 
    191    789    ahrens 	return (0);
    192    789    ahrens }
    193    789    ahrens 
    194    789    ahrens metaslab_group_t *
    195    789    ahrens metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
    196    789    ahrens {
    197    789    ahrens 	metaslab_group_t *mg;
    198    789    ahrens 
    199    789    ahrens 	mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
    200    789    ahrens 	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
    201    789    ahrens 	avl_create(&mg->mg_metaslab_tree, metaslab_compare,
    202    789    ahrens 	    sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
    203    789    ahrens 	mg->mg_vd = vd;
    204  10974      Jeff 	mg->mg_class = mc;
    205  10974      Jeff 	mg->mg_activation_count = 0;
    206    789    ahrens 
    207    789    ahrens 	return (mg);
    208    789    ahrens }
    209    789    ahrens 
    210    789    ahrens void
    211    789    ahrens metaslab_group_destroy(metaslab_group_t *mg)
    212    789    ahrens {
    213  10974      Jeff 	ASSERT(mg->mg_prev == NULL);
    214  10974      Jeff 	ASSERT(mg->mg_next == NULL);
    215  11026       Tim 	/*
    216  11026       Tim 	 * We may have gone below zero with the activation count
    217  11026       Tim 	 * either because we never activated in the first place or
    218  11026       Tim 	 * because we're done, and possibly removing the vdev.
    219  11026       Tim 	 */
    220  11026       Tim 	ASSERT(mg->mg_activation_count <= 0);
    221  10974      Jeff 
    222    789    ahrens 	avl_destroy(&mg->mg_metaslab_tree);
    223    789    ahrens 	mutex_destroy(&mg->mg_lock);
    224    789    ahrens 	kmem_free(mg, sizeof (metaslab_group_t));
    225  10974      Jeff }
    226  10974      Jeff 
    227  10974      Jeff void
    228  10974      Jeff metaslab_group_activate(metaslab_group_t *mg)
    229  10974      Jeff {
    230  10974      Jeff 	metaslab_class_t *mc = mg->mg_class;
    231  10974      Jeff 	metaslab_group_t *mgprev, *mgnext;
    232  10974      Jeff 
    233  10974      Jeff 	ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
    234  10974      Jeff 
    235  10974      Jeff 	ASSERT(mc->mc_rotor != mg);
    236  10974      Jeff 	ASSERT(mg->mg_prev == NULL);
    237  10974      Jeff 	ASSERT(mg->mg_next == NULL);
    238  10974      Jeff 	ASSERT(mg->mg_activation_count <= 0);
    239  10974      Jeff 
    240  10974      Jeff 	if (++mg->mg_activation_count <= 0)
    241  10974      Jeff 		return;
    242  10974      Jeff 
    243  10974      Jeff 	mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
    244  10974      Jeff 
    245  10974      Jeff 	if ((mgprev = mc->mc_rotor) == NULL) {
    246  10974      Jeff 		mg->mg_prev = mg;
    247  10974      Jeff 		mg->mg_next = mg;
    248  10974      Jeff 	} else {
    249  10974      Jeff 		mgnext = mgprev->mg_next;
    250  10974      Jeff 		mg->mg_prev = mgprev;
    251  10974      Jeff 		mg->mg_next = mgnext;
    252  10974      Jeff 		mgprev->mg_next = mg;
    253  10974      Jeff 		mgnext->mg_prev = mg;
    254  10974      Jeff 	}
    255  10974      Jeff 	mc->mc_rotor = mg;
    256  10974      Jeff }
    257  10974      Jeff 
    258  10974      Jeff void
    259  10974      Jeff metaslab_group_passivate(metaslab_group_t *mg)
    260  10974      Jeff {
    261  10974      Jeff 	metaslab_class_t *mc = mg->mg_class;
    262  10974      Jeff 	metaslab_group_t *mgprev, *mgnext;
    263  10974      Jeff 
    264  10974      Jeff 	ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
    265  10974      Jeff 
    266  10974      Jeff 	if (--mg->mg_activation_count != 0) {
    267  10974      Jeff 		ASSERT(mc->mc_rotor != mg);
    268  10974      Jeff 		ASSERT(mg->mg_prev == NULL);
    269  10974      Jeff 		ASSERT(mg->mg_next == NULL);
    270  10974      Jeff 		ASSERT(mg->mg_activation_count < 0);
    271  10974      Jeff 		return;
    272  10974      Jeff 	}
    273  10974      Jeff 
    274  10974      Jeff 	mgprev = mg->mg_prev;
    275  10974      Jeff 	mgnext = mg->mg_next;
    276  10974      Jeff 
    277  10974      Jeff 	if (mg == mgnext) {
    278  10974      Jeff 		mc->mc_rotor = NULL;
    279  10974      Jeff 	} else {
    280  10974      Jeff 		mc->mc_rotor = mgnext;
    281  10974      Jeff 		mgprev->mg_next = mgnext;
    282  10974      Jeff 		mgnext->mg_prev = mgprev;
    283  10974      Jeff 	}
    284  10974      Jeff 
    285  10974      Jeff 	mg->mg_prev = NULL;
    286  10974      Jeff 	mg->mg_next = NULL;
    287    789    ahrens }
    288    789    ahrens 
    289   1732   bonwick static void
    290   1732   bonwick metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
    291    789    ahrens {
    292    789    ahrens 	mutex_enter(&mg->mg_lock);
    293    789    ahrens 	ASSERT(msp->ms_group == NULL);
    294    789    ahrens 	msp->ms_group = mg;
    295   1732   bonwick 	msp->ms_weight = 0;
    296    789    ahrens 	avl_add(&mg->mg_metaslab_tree, msp);
    297    789    ahrens 	mutex_exit(&mg->mg_lock);
    298    789    ahrens }
    299    789    ahrens 
    300   1732   bonwick static void
    301    789    ahrens metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
    302    789    ahrens {
    303    789    ahrens 	mutex_enter(&mg->mg_lock);
    304    789    ahrens 	ASSERT(msp->ms_group == mg);
    305    789    ahrens 	avl_remove(&mg->mg_metaslab_tree, msp);
    306    789    ahrens 	msp->ms_group = NULL;
    307    789    ahrens 	mutex_exit(&mg->mg_lock);
    308    789    ahrens }
    309    789    ahrens 
    310   1732   bonwick static void
    311    789    ahrens metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
    312    789    ahrens {
    313   2459    ahrens 	/*
    314   2459    ahrens 	 * Although in principle the weight can be any value, in
    315   2459    ahrens 	 * practice we do not use values in the range [1, 510].
    316   2459    ahrens 	 */
    317   2459    ahrens 	ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0);
    318   1732   bonwick 	ASSERT(MUTEX_HELD(&msp->ms_lock));
    319   1732   bonwick 
    320    789    ahrens 	mutex_enter(&mg->mg_lock);
    321    789    ahrens 	ASSERT(msp->ms_group == mg);
    322    789    ahrens 	avl_remove(&mg->mg_metaslab_tree, msp);
    323    789    ahrens 	msp->ms_weight = weight;
    324    789    ahrens 	avl_add(&mg->mg_metaslab_tree, msp);
    325    789    ahrens 	mutex_exit(&mg->mg_lock);
    326    789    ahrens }
    327    789    ahrens 
    328    789    ahrens /*
    329  11146    George  * ==========================================================================
    330  11146    George  * Common allocator routines
    331  11146    George  * ==========================================================================
    332  11146    George  */
    333  11146    George static int
    334  11146    George metaslab_segsize_compare(const void *x1, const void *x2)
    335  11146    George {
    336  11146    George 	const space_seg_t *s1 = x1;
    337  11146    George 	const space_seg_t *s2 = x2;
    338  11146    George 	uint64_t ss_size1 = s1->ss_end - s1->ss_start;
    339  11146    George 	uint64_t ss_size2 = s2->ss_end - s2->ss_start;
    340  11146    George 
    341  11146    George 	if (ss_size1 < ss_size2)
    342  11146    George 		return (-1);
    343  11146    George 	if (ss_size1 > ss_size2)
    344  11146    George 		return (1);
    345  11146    George 
    346  11146    George 	if (s1->ss_start < s2->ss_start)
    347  11146    George 		return (-1);
    348  11146    George 	if (s1->ss_start > s2->ss_start)
    349  11146    George 		return (1);
    350  11146    George 
    351  11146    George 	return (0);
    352  11146    George }
    353  11146    George 
    354  11146    George /*
    355   9480    George  * This is a helper function that can be used by the allocator to find
    356   9480    George  * a suitable block to allocate. This will search the specified AVL
    357   9480    George  * tree looking for a block that matches the specified criteria.
    358    789    ahrens  */
    359   9480    George static uint64_t
    360   9480    George metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
    361   9480    George     uint64_t align)
    362    789    ahrens {
    363    789    ahrens 	space_seg_t *ss, ssearch;
    364    789    ahrens 	avl_index_t where;
    365    789    ahrens 
    366    789    ahrens 	ssearch.ss_start = *cursor;
    367    789    ahrens 	ssearch.ss_end = *cursor + size;
    368    789    ahrens 
    369    789    ahrens 	ss = avl_find(t, &ssearch, &where);
    370    789    ahrens 	if (ss == NULL)
    371    789    ahrens 		ss = avl_nearest(t, where, AVL_AFTER);
    372    789    ahrens 
    373    789    ahrens 	while (ss != NULL) {
    374    789    ahrens 		uint64_t offset = P2ROUNDUP(ss->ss_start, align);
    375    789    ahrens 
    376    789    ahrens 		if (offset + size <= ss->ss_end) {
    377    789    ahrens 			*cursor = offset + size;
    378    789    ahrens 			return (offset);
    379    789    ahrens 		}
    380    789    ahrens 		ss = AVL_NEXT(t, ss);
    381    789    ahrens 	}
    382    789    ahrens 
    383   1732   bonwick 	/*
    384   1732   bonwick 	 * If we know we've searched the whole map (*cursor == 0), give up.
    385   1732   bonwick 	 * Otherwise, reset the cursor to the beginning and try again.
    386   1732   bonwick 	 */
    387   1732   bonwick 	if (*cursor == 0)
    388   1732   bonwick 		return (-1ULL);
    389   1732   bonwick 
    390   1732   bonwick 	*cursor = 0;
    391   9480    George 	return (metaslab_block_picker(t, cursor, size, align));
    392   9480    George }
    393   9480    George 
    394   9480    George static void
    395  11146    George metaslab_pp_load(space_map_t *sm)
    396   9480    George {
    397   9480    George 	space_seg_t *ss;
    398   9480    George 
    399   9480    George 	ASSERT(sm->sm_ppd == NULL);
    400   9480    George 	sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
    401   9480    George 
    402   9480    George 	sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
    403  11146    George 	avl_create(sm->sm_pp_root, metaslab_segsize_compare,
    404   9480    George 	    sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node));
    405   9480    George 
    406   9480    George 	for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
    407   9480    George 		avl_add(sm->sm_pp_root, ss);
    408   9480    George }
    409   9480    George 
    410   9480    George static void
    411  11146    George metaslab_pp_unload(space_map_t *sm)
    412   9480    George {
    413   9480    George 	void *cookie = NULL;
    414   9480    George 
    415   9480    George 	kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
    416   9480    George 	sm->sm_ppd = NULL;
    417   9480    George 
    418   9480    George 	while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) {
    419   9480    George 		/* tear down the tree */
    420   9480    George 	}
    421   9480    George 
    422   9480    George 	avl_destroy(sm->sm_pp_root);
    423   9480    George 	kmem_free(sm->sm_pp_root, sizeof (avl_tree_t));
    424   9480    George 	sm->sm_pp_root = NULL;
    425   9480    George }
    426   9480    George 
    427  11146    George /* ARGSUSED */
    428  11146    George static void
    429  11146    George metaslab_pp_claim(space_map_t *sm, uint64_t start, uint64_t size)
    430  11146    George {
    431  11146    George 	/* No need to update cursor */
    432  11146    George }
    433  11146    George 
    434  11146    George /* ARGSUSED */
    435  11146    George static void
    436  11146    George metaslab_pp_free(space_map_t *sm, uint64_t start, uint64_t size)
    437  11146    George {
    438  11146    George 	/* No need to update cursor */
    439  11146    George }
    440  11146    George 
    441  11146    George /*
    442  11146    George  * Return the maximum contiguous segment within the metaslab.
    443  11146    George  */
    444  11146    George uint64_t
    445  11146    George metaslab_pp_maxsize(space_map_t *sm)
    446  11146    George {
    447  11146    George 	avl_tree_t *t = sm->sm_pp_root;
    448  11146    George 	space_seg_t *ss;
    449  11146    George 
    450  11146    George 	if (t == NULL || (ss = avl_last(t)) == NULL)
    451  11146    George 		return (0ULL);
    452  11146    George 
    453  11146    George 	return (ss->ss_end - ss->ss_start);
    454  11146    George }
    455  11146    George 
    456  11146    George /*
    457  11146    George  * ==========================================================================
    458  11146    George  * The first-fit block allocator
    459  11146    George  * ==========================================================================
    460  11146    George  */
    461  11146    George static uint64_t
    462  11146    George metaslab_ff_alloc(space_map_t *sm, uint64_t size)
    463  11146    George {
    464  11146    George 	avl_tree_t *t = &sm->sm_root;
    465  11146    George 	uint64_t align = size & -size;
    466  11146    George 	uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
    467  11146    George 
    468  11146    George 	return (metaslab_block_picker(t, cursor, size, align));
    469  11146    George }
    470  11146    George 
    471  11146    George /* ARGSUSED */
    472  11146    George boolean_t
    473  11146    George metaslab_ff_fragmented(space_map_t *sm)
    474  11146    George {
    475  11146    George 	return (B_TRUE);
    476  11146    George }
    477  11146    George 
    478  11146    George static space_map_ops_t metaslab_ff_ops = {
    479  11146    George 	metaslab_pp_load,
    480  11146    George 	metaslab_pp_unload,
    481  11146    George 	metaslab_ff_alloc,
    482  11146    George 	metaslab_pp_claim,
    483  11146    George 	metaslab_pp_free,
    484  11146    George 	metaslab_pp_maxsize,
    485  11146    George 	metaslab_ff_fragmented
    486  11146    George };
    487  11146    George 
    488  11146    George /*
    489  11146    George  * ==========================================================================
    490  11146    George  * Dynamic block allocator -
    491  11146    George  * Uses the first fit allocation scheme until space gets low and then
    492  11146    George  * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
    493  11146    George  * and metaslab_df_free_pct to determine when to switch the allocation scheme.
    494  11146    George  * ==========================================================================
    495  11146    George  */
    496   9480    George static uint64_t
    497   9480    George metaslab_df_alloc(space_map_t *sm, uint64_t size)
    498   9480    George {
    499   9480    George 	avl_tree_t *t = &sm->sm_root;
    500   9480    George 	uint64_t align = size & -size;
    501   9480    George 	uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
    502  11146    George 	uint64_t max_size = metaslab_pp_maxsize(sm);
    503   9480    George 	int free_pct = sm->sm_space * 100 / sm->sm_size;
    504   9480    George 
    505   9480    George 	ASSERT(MUTEX_HELD(sm->sm_lock));
    506   9480    George 	ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
    507   9480    George 
    508   9480    George 	if (max_size < size)
    509   9480    George 		return (-1ULL);
    510   9480    George 
    511   9480    George 	/*
    512   9480    George 	 * If we're running low on space switch to using the size
    513   9480    George 	 * sorted AVL tree (best-fit).
    514   9480    George 	 */
    515   9480    George 	if (max_size < metaslab_df_alloc_threshold ||
    516   9480    George 	    free_pct < metaslab_df_free_pct) {
    517   9480    George 		t = sm->sm_pp_root;
    518   9480    George 		*cursor = 0;
    519   9480    George 	}
    520   9480    George 
    521   9480    George 	return (metaslab_block_picker(t, cursor, size, 1ULL));
    522   9480    George }
    523   9480    George 
    524  11146    George static boolean_t
    525  11146    George metaslab_df_fragmented(space_map_t *sm)
    526   9480    George {
    527  11146    George 	uint64_t max_size = metaslab_pp_maxsize(sm);
    528  11146    George 	int free_pct = sm->sm_space * 100 / sm->sm_size;
    529   9480    George 
    530  11146    George 	if (max_size >= metaslab_df_alloc_threshold &&
    531  11146    George 	    free_pct >= metaslab_df_free_pct)
    532  11146    George 		return (B_FALSE);
    533  11146    George 
    534  11146    George 	return (B_TRUE);
    535   9480    George }
    536   9480    George 
    537   9480    George static space_map_ops_t metaslab_df_ops = {
    538  11146    George 	metaslab_pp_load,
    539  11146    George 	metaslab_pp_unload,
    540   9480    George 	metaslab_df_alloc,
    541  11146    George 	metaslab_pp_claim,
    542  11146    George 	metaslab_pp_free,
    543  11146    George 	metaslab_pp_maxsize,
    544  11146    George 	metaslab_df_fragmented
    545  11146    George };
    546  11146    George 
    547  11146    George /*
    548  11146    George  * ==========================================================================
    549  11146    George  * Other experimental allocators
    550  11146    George  * ==========================================================================
    551  11146    George  */
    552  11146    George static uint64_t
    553  11146    George metaslab_cdf_alloc(space_map_t *sm, uint64_t size)
    554  11146    George {
    555  11146    George 	avl_tree_t *t = &sm->sm_root;
    556  11146    George 	uint64_t *cursor = (uint64_t *)sm->sm_ppd;
    557  11146    George 	uint64_t *extent_end = (uint64_t *)sm->sm_ppd + 1;
    558  11146    George 	uint64_t max_size = metaslab_pp_maxsize(sm);
    559  11146    George 	uint64_t rsize = size;
    560  11146    George 	uint64_t offset = 0;
    561  11146    George 
    562  11146    George 	ASSERT(MUTEX_HELD(sm->sm_lock));
    563  11146    George 	ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
    564  11146    George 
    565  11146    George 	if (max_size < size)
    566  11146    George 		return (-1ULL);
    567  11146    George 
    568  11146    George 	ASSERT3U(*extent_end, >=, *cursor);
    569  11146    George 
    570  11146    George 	/*
    571  11146    George 	 * If we're running low on space switch to using the size
    572  11146    George 	 * sorted AVL tree (best-fit).
    573  11146    George 	 */
    574  11146    George 	if ((*cursor + size) > *extent_end) {
    575  11146    George 
    576  11146    George 		t = sm->sm_pp_root;
    577  11146    George 		*cursor = *extent_end = 0;
    578  11146    George 
    579  11146    George 		if (max_size > 2 * SPA_MAXBLOCKSIZE)
    580  11146    George 			rsize = MIN(metaslab_min_alloc_size, max_size);
    581  11146    George 		offset = metaslab_block_picker(t, extent_end, rsize, 1ULL);
    582  11146    George 		if (offset != -1)
    583  11146    George 			*cursor = offset + size;
    584  11146    George 	} else {
    585  11146    George 		offset = metaslab_block_picker(t, cursor, rsize, 1ULL);
    586  11146    George 	}
    587  11146    George 	ASSERT3U(*cursor, <=, *extent_end);
    588  11146    George 	return (offset);
    589  11146    George }
    590  11146    George 
    591  11146    George static boolean_t
    592  11146    George metaslab_cdf_fragmented(space_map_t *sm)
    593  11146    George {
    594  11146    George 	uint64_t max_size = metaslab_pp_maxsize(sm);
    595  11146    George 
    596  11146    George 	if (max_size > (metaslab_min_alloc_size * 10))
    597  11146    George 		return (B_FALSE);
    598  11146    George 	return (B_TRUE);
    599  11146    George }
    600  11146    George 
    601  11146    George static space_map_ops_t metaslab_cdf_ops = {
    602  11146    George 	metaslab_pp_load,
    603  11146    George 	metaslab_pp_unload,
    604  11146    George 	metaslab_cdf_alloc,
    605  11146    George 	metaslab_pp_claim,
    606  11146    George 	metaslab_pp_free,
    607  11146    George 	metaslab_pp_maxsize,
    608  11146    George 	metaslab_cdf_fragmented
    609  11146    George };
    610  11146    George 
    611  11146    George static uint64_t
    612  11146    George metaslab_ndf_alloc(space_map_t *sm, uint64_t size)
    613  11146    George {
    614  11146    George 	avl_tree_t *t = &sm->sm_root;
    615  11146    George 	avl_index_t where;
    616  11146    George 	space_seg_t *ss, ssearch;
    617  11146    George 	uint64_t *cursor = (uint64_t *)sm->sm_ppd;
    618  11146    George 	uint64_t max_size = metaslab_pp_maxsize(sm);
    619  11146    George 
    620  11146    George 	ASSERT(MUTEX_HELD(sm->sm_lock));
    621  11146    George 	ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
    622  11146    George 
    623  11146    George 	if (max_size < size)
    624  11146    George 		return (-1ULL);
    625  11146    George 
    626  11146    George 	ssearch.ss_start = *cursor;
    627  11146    George 	ssearch.ss_end = *cursor + size;
    628  11146    George 
    629  11146    George 	ss = avl_find(t, &ssearch, &where);
    630  11146    George 	if (ss == NULL || (ss->ss_start + size > ss->ss_end)) {
    631  11146    George 		t = sm->sm_pp_root;
    632  11146    George 
    633  11146    George 		if (max_size > 2 * SPA_MAXBLOCKSIZE)
    634  11146    George 			size = MIN(metaslab_min_alloc_size, max_size);
    635  11146    George 
    636  11146    George 		ssearch.ss_start = 0;
    637  11146    George 		ssearch.ss_end = size;
    638  11146    George 		ss = avl_find(t, &ssearch, &where);
    639  11146    George 		if (ss == NULL)
    640  11146    George 			ss = avl_nearest(t, where, AVL_AFTER);
    641  11146    George 		ASSERT(ss != NULL);
    642  11146    George 	}
    643  11146    George 
    644  11146    George 	if (ss != NULL) {
    645  11146    George 		if (ss->ss_start + size <= ss->ss_end) {
    646  11146    George 			*cursor = ss->ss_start + size;
    647  11146    George 			return (ss->ss_start);
    648  11146    George 		}
    649  11146    George 	}
    650  11146    George 	return (-1ULL);
    651  11146    George }
    652  11146    George 
    653  11146    George static boolean_t
    654  11146    George metaslab_ndf_fragmented(space_map_t *sm)
    655  11146    George {
    656  11146    George 	uint64_t max_size = metaslab_pp_maxsize(sm);
    657  11146    George 
    658  11146    George 	if (max_size > (metaslab_min_alloc_size * 10))
    659  11146    George 		return (B_FALSE);
    660  11146    George 	return (B_TRUE);
    661  11146    George }
    662  11146    George 
    663  11146    George 
/*
 * Operations vector for the "ndf" allocator.  It shares the metaslab_pp_*
 * callbacks for everything except allocation and the fragmentation test.
 * The initializer is positional, so entries must stay in the order of the
 * space_map_ops_t members (the last one is smop_fragmented; the others are
 * presumably load, unload, alloc, claim, free and maxsize, in that order --
 * confirm against the space_map_ops_t declaration).
 */
static space_map_ops_t metaslab_ndf_ops = {
	metaslab_pp_load,
	metaslab_pp_unload,
	metaslab_ndf_alloc,
	metaslab_pp_claim,
	metaslab_pp_free,
	metaslab_pp_maxsize,
	metaslab_ndf_fragmented
};
    673   9480    George 
/*
 * Global, non-static pointer to an allocator ops vector; initialized to
 * the "df" ops.  Presumably this is the default handed to new metaslab
 * classes -- confirm against the users of zfs_metaslab_ops.
 */
space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
    675   1732   bonwick 
    676   1732   bonwick /*
    677   1732   bonwick  * ==========================================================================
    678   1732   bonwick  * Metaslabs
    679   1732   bonwick  * ==========================================================================
    680   1732   bonwick  */
    681   1732   bonwick metaslab_t *
    682   1732   bonwick metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
    683   1732   bonwick 	uint64_t start, uint64_t size, uint64_t txg)
    684   1732   bonwick {
    685   1732   bonwick 	vdev_t *vd = mg->mg_vd;
    686   1732   bonwick 	metaslab_t *msp;
    687   1732   bonwick 
    688   1732   bonwick 	msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
    689   2856  nd150628 	mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL);
    690   1732   bonwick 
    691   1732   bonwick 	msp->ms_smo_syncing = *smo;
    692   1732   bonwick 
    693   1732   bonwick 	/*
    694   1732   bonwick 	 * We create the main space map here, but we don't create the
    695   1732   bonwick 	 * allocmaps and freemaps until metaslab_sync_done().  This serves
    696   1732   bonwick 	 * two purposes: it allows metaslab_sync_done() to detect the
    697   1732   bonwick 	 * addition of new space; and for debugging, it ensures that we'd
    698   1732   bonwick 	 * data fault on any attempt to use this metaslab before it's ready.
    699   1732   bonwick 	 */
    700   1732   bonwick 	space_map_create(&msp->ms_map, start, size,
    701   1732   bonwick 	    vd->vdev_ashift, &msp->ms_lock);
    702   1732   bonwick 
    703   1732   bonwick 	metaslab_group_add(mg, msp);
    704   1732   bonwick 
    705  10922      Jeff 	if (metaslab_debug && smo->smo_object != 0) {
    706  10922      Jeff 		mutex_enter(&msp->ms_lock);
    707  10922      Jeff 		VERIFY(space_map_load(&msp->ms_map, mg->mg_class->mc_ops,
    708  10922      Jeff 		    SM_FREE, smo, spa_meta_objset(vd->vdev_spa)) == 0);
    709  10922      Jeff 		mutex_exit(&msp->ms_lock);
    710  10922      Jeff 	}
    711  10922      Jeff 
    712   1732   bonwick 	/*
    713   1732   bonwick 	 * If we're opening an existing pool (txg == 0) or creating
    714   1732   bonwick 	 * a new one (txg == TXG_INITIAL), all space is available now.
    715   1732   bonwick 	 * If we're adding space to an existing pool, the new space
    716   1732   bonwick 	 * does not become available until after this txg has synced.
    717   1732   bonwick 	 */
    718   1732   bonwick 	if (txg <= TXG_INITIAL)
    719   1732   bonwick 		metaslab_sync_done(msp, 0);
    720   1732   bonwick 
    721   1732   bonwick 	if (txg != 0) {
    722   1732   bonwick 		vdev_dirty(vd, 0, NULL, txg);
    723  10921       Tim 		vdev_dirty(vd, VDD_METASLAB, msp, txg);
    724    789    ahrens 	}
    725    789    ahrens 
    726   1732   bonwick 	return (msp);
    727    789    ahrens }
    728    789    ahrens 
    729   1732   bonwick void
    730   1732   bonwick metaslab_fini(metaslab_t *msp)
    731   1732   bonwick {
    732   1732   bonwick 	metaslab_group_t *mg = msp->ms_group;
    733   1732   bonwick 
    734  10922      Jeff 	vdev_space_update(mg->mg_vd,
    735  10922      Jeff 	    -msp->ms_smo.smo_alloc, 0, -msp->ms_map.sm_size);
    736   1732   bonwick 
    737   1732   bonwick 	metaslab_group_remove(mg, msp);
    738   1732   bonwick 
    739   1732   bonwick 	mutex_enter(&msp->ms_lock);
    740   1732   bonwick 
    741   1732   bonwick 	space_map_unload(&msp->ms_map);
    742   1732   bonwick 	space_map_destroy(&msp->ms_map);
    743   1732   bonwick 
    744  10921       Tim 	for (int t = 0; t < TXG_SIZE; t++) {
    745   1732   bonwick 		space_map_destroy(&msp->ms_allocmap[t]);
    746   1732   bonwick 		space_map_destroy(&msp->ms_freemap[t]);
    747   1732   bonwick 	}
    748  10921       Tim 
    749  10921       Tim 	for (int t = 0; t < TXG_DEFER_SIZE; t++)
    750  10921       Tim 		space_map_destroy(&msp->ms_defermap[t]);
    751  10921       Tim 
    752  10921       Tim 	ASSERT3S(msp->ms_deferspace, ==, 0);
    753   1732   bonwick 
    754   1732   bonwick 	mutex_exit(&msp->ms_lock);
    755   2856  nd150628 	mutex_destroy(&msp->ms_lock);
    756   1732   bonwick 
    757   1732   bonwick 	kmem_free(msp, sizeof (metaslab_t));
    758   1732   bonwick }
    759   1732   bonwick 
/*
 * The two most significant bits of a metaslab's 64-bit weight are flags
 * marking it as activated, either as a primary or as a secondary metaslab.
 * METASLAB_ACTIVE_MASK selects both flags.
 */
#define	METASLAB_WEIGHT_PRIMARY		(1ULL << 63)
#define	METASLAB_WEIGHT_SECONDARY	(1ULL << 62)
#define	METASLAB_ACTIVE_MASK		\
	(METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
    764   1732   bonwick 
    765    789    ahrens static uint64_t
    766   1732   bonwick metaslab_weight(metaslab_t *msp)
    767    789    ahrens {
    768   1775     billm 	metaslab_group_t *mg = msp->ms_group;
    769    789    ahrens 	space_map_t *sm = &msp->ms_map;
    770   1732   bonwick 	space_map_obj_t *smo = &msp->ms_smo;
    771   1775     billm 	vdev_t *vd = mg->mg_vd;
    772   1732   bonwick 	uint64_t weight, space;
    773    789    ahrens 
    774    789    ahrens 	ASSERT(MUTEX_HELD(&msp->ms_lock));
    775    789    ahrens 
    776   1732   bonwick 	/*
    777   1732   bonwick 	 * The baseline weight is the metaslab's free space.
    778   1732   bonwick 	 */
    779   1732   bonwick 	space = sm->sm_size - smo->smo_alloc;
    780   1732   bonwick 	weight = space;
    781   1732   bonwick 
    782   1732   bonwick 	/*
    783   1732   bonwick 	 * Modern disks have uniform bit density and constant angular velocity.
    784   1732   bonwick 	 * Therefore, the outer recording zones are faster (higher bandwidth)
    785   1732   bonwick 	 * than the inner zones by the ratio of outer to inner track diameter,
    786   1732   bonwick 	 * which is typically around 2:1.  We account for this by assigning
    787   1732   bonwick 	 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
    788   1732   bonwick 	 * In effect, this means that we'll select the metaslab with the most
    789   1732   bonwick 	 * free bandwidth rather than simply the one with the most free space.
    790   1732   bonwick 	 */
    791   1732   bonwick 	weight = 2 * weight -
    792   1732   bonwick 	    ((sm->sm_start >> vd->vdev_ms_shift) * weight) / vd->vdev_ms_count;
    793   1732   bonwick 	ASSERT(weight >= space && weight <= 2 * space);
    794   1732   bonwick 
    795   1732   bonwick 	/*
    796  11146    George 	 * For locality, assign higher weight to metaslabs which have
    797  11146    George 	 * a lower offset than what we've already activated.
    798   1732   bonwick 	 */
    799  11146    George 	if (sm->sm_start <= mg->mg_bonus_area)
    800  11146    George 		weight *= (metaslab_smo_bonus_pct / 100);
    801   1775     billm 	ASSERT(weight >= space &&
    802  11146    George 	    weight <= 2 * (metaslab_smo_bonus_pct / 100) * space);
    803  11146    George 
    804  11146    George 	if (sm->sm_loaded && !sm->sm_ops->smop_fragmented(sm)) {
    805  11146    George 		/*
    806  11146    George 		 * If this metaslab is one we're actively using, adjust its
    807  11146    George 		 * weight to make it preferable to any inactive metaslab so
    808  11146    George 		 * we'll polish it off.
    809  11146    George 		 */
    810  11146    George 		weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
    811  11146    George 	}
    812  11146    George 	return (weight);
    813  11146    George }
    814  11146    George 
    815  11146    George static void
    816  11146    George metaslab_prefetch(metaslab_group_t *mg)
    817  11146    George {
    818  11146    George 	spa_t *spa = mg->mg_vd->vdev_spa;
    819  11146    George 	metaslab_t *msp;
    820  11146    George 	avl_tree_t *t = &mg->mg_metaslab_tree;
    821  11146    George 	int m;
    822  11146    George 
    823  11146    George 	mutex_enter(&mg->mg_lock);
    824   1732   bonwick 
    825   1732   bonwick 	/*
    826  11146    George 	 * Prefetch the next potential metaslabs
    827   1732   bonwick 	 */
    828  11146    George 	for (msp = avl_first(t), m = 0; msp; msp = AVL_NEXT(t, msp), m++) {
    829  11146    George 		space_map_t *sm = &msp->ms_map;
    830  11146    George 		space_map_obj_t *smo = &msp->ms_smo;
    831   1732   bonwick 
    832  11146    George 		/* If we have reached our prefetch limit then we're done */
    833  11146    George 		if (m >= metaslab_prefetch_limit)
    834  11146    George 			break;
    835  11146    George 
    836  11146    George 		if (!sm->sm_loaded && smo->smo_object != 0) {
    837  11146    George 			mutex_exit(&mg->mg_lock);
    838  11146    George 			dmu_prefetch(spa_meta_objset(spa), smo->smo_object,
    839  11146    George 			    0ULL, smo->smo_objsize);
    840  11146    George 			mutex_enter(&mg->mg_lock);
    841  11146    George 		}
    842  11146    George 	}
    843  11146    George 	mutex_exit(&mg->mg_lock);
    844   1732   bonwick }
    845   1732   bonwick 
    846   1732   bonwick static int
    847   9480    George metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size)
    848   1732   bonwick {
    849  11146    George 	metaslab_group_t *mg = msp->ms_group;
    850   1732   bonwick 	space_map_t *sm = &msp->ms_map;
    851   9480    George 	space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops;
    852   1732   bonwick 
    853   1732   bonwick 	ASSERT(MUTEX_HELD(&msp->ms_lock));
    854   1732   bonwick 
    855   1775     billm 	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
    856  10921       Tim 		space_map_load_wait(sm);
    857  10921       Tim 		if (!sm->sm_loaded) {
    858  10921       Tim 			int error = space_map_load(sm, sm_ops, SM_FREE,
    859  10921       Tim 			    &msp->ms_smo,
    860  10922      Jeff 			    spa_meta_objset(msp->ms_group->mg_vd->vdev_spa));
    861  11146    George 			if (error)  {
    862  10921       Tim 				metaslab_group_sort(msp->ms_group, msp, 0);
    863  10921       Tim 				return (error);
    864  10921       Tim 			}
    865  10921       Tim 			for (int t = 0; t < TXG_DEFER_SIZE; t++)
    866  10921       Tim 				space_map_walk(&msp->ms_defermap[t],
    867  10921       Tim 				    space_map_claim, sm);
    868  11146    George 
    869  11146    George 		}
    870  11146    George 
    871  11146    George 		/*
    872  11146    George 		 * Track the bonus area as we activate new metaslabs.
    873  11146    George 		 */
    874  11146    George 		if (sm->sm_start > mg->mg_bonus_area) {
    875  11146    George 			mutex_enter(&mg->mg_lock);
    876  11146    George 			mg->mg_bonus_area = sm->sm_start;
    877  11146    George 			mutex_exit(&mg->mg_lock);
    878   1732   bonwick 		}
    879   9480    George 
    880   9480    George 		/*
    881   9480    George 		 * If we were able to load the map then make sure
    882   9480    George 		 * that this map is still able to satisfy our request.
    883   9480    George 		 */
    884   9480    George 		if (msp->ms_weight < size)
    885   9480    George 			return (ENOSPC);
    886   9480    George 
    887   1732   bonwick 		metaslab_group_sort(msp->ms_group, msp,
    888   1775     billm 		    msp->ms_weight | activation_weight);
    889    789    ahrens 	}
    890   1732   bonwick 	ASSERT(sm->sm_loaded);
    891   1775     billm 	ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
    892   1732   bonwick 
    893   1732   bonwick 	return (0);
    894   1732   bonwick }
    895   1732   bonwick 
    896   1732   bonwick static void
    897   1732   bonwick metaslab_passivate(metaslab_t *msp, uint64_t size)
    898   1732   bonwick {
    899   2459    ahrens 	/*
    900   2459    ahrens 	 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
    901   2459    ahrens 	 * this metaslab again.  In that case, it had better be empty,
    902   2459    ahrens 	 * or we would be leaving space on the table.
    903   2459    ahrens 	 */
    904   2459    ahrens 	ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map.sm_space == 0);
    905   1775     billm 	metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size));
    906   1775     billm 	ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
    907   1732   bonwick }
    908   1732   bonwick 
/*
 * Write a metaslab to disk in the context of the specified transaction group.
 *
 *	msp	metaslab to sync
 *	txg	transaction group being synced; selects which allocmap and
 *		freemap are written
 *
 * Returns without doing anything if neither map for this txg holds any
 * space.  Otherwise the on-disk space map object is created if needed,
 * optionally condensed, both maps are synced to it, and the updated
 * ms_smo_syncing header is copied into the object's bonus buffer.
 */
void
metaslab_sync(metaslab_t *msp, uint64_t txg)
{
	vdev_t *vd = msp->ms_group->mg_vd;
	spa_t *spa = vd->vdev_spa;
	objset_t *mos = spa_meta_objset(spa);
	space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK];
	space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK];
	space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
	space_map_t *sm = &msp->ms_map;
	space_map_obj_t *smo = &msp->ms_smo_syncing;
	dmu_buf_t *db;
	dmu_tx_t *tx;

	ASSERT(!vd->vdev_ishole);

	/* Nothing was allocated or freed in this txg: nothing to write. */
	if (allocmap->sm_space == 0 && freemap->sm_space == 0)
		return;

	/*
	 * The only state that can actually be changing concurrently with
	 * metaslab_sync() is the metaslab's ms_map.  No other thread can
	 * be modifying this txg's allocmap, freemap, freed_map, or smo.
	 * Therefore, we only hold ms_lock to satisfy space_map ASSERTs.
	 * We drop it whenever we call into the DMU, because the DMU
	 * can call down to us (e.g. via zio_free()) at any time.
	 */

	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);

	/*
	 * No on-disk object yet: allocate one and store its object number
	 * in the vdev's metaslab array, at this metaslab's index.
	 */
	if (smo->smo_object == 0) {
		ASSERT(smo->smo_objsize == 0);
		ASSERT(smo->smo_alloc == 0);
		smo->smo_object = dmu_object_alloc(mos,
		    DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
		    DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
		ASSERT(smo->smo_object != 0);
		dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
		    (sm->sm_start >> vd->vdev_ms_shift),
		    sizeof (uint64_t), &smo->smo_object, tx);
	}

	mutex_enter(&msp->ms_lock);

	/* Add this txg's frees to freed_map. */
	space_map_walk(freemap, space_map_add, freed_map);

	if (sm->sm_loaded && spa_sync_pass(spa) == 1 && smo->smo_objsize >=
	    2 * sizeof (uint64_t) * avl_numnodes(&sm->sm_root)) {
		/*
		 * The in-core space map representation is twice as compact
		 * as the on-disk one, so it's time to condense the latter
		 * by generating a pure allocmap from first principles.
		 *
		 * This metaslab is 100% allocated,
		 * minus the content of the in-core map (sm),
		 * minus what's been freed this txg (freed_map),
		 * minus deferred frees (ms_defermap[]),
		 * minus allocations from txgs in the future
		 * (because they haven't been committed yet).
		 */
		space_map_vacate(allocmap, NULL, NULL);
		space_map_vacate(freemap, NULL, NULL);

		space_map_add(allocmap, allocmap->sm_start, allocmap->sm_size);

		space_map_walk(sm, space_map_remove, allocmap);
		space_map_walk(freed_map, space_map_remove, allocmap);

		for (int t = 0; t < TXG_DEFER_SIZE; t++)
			space_map_walk(&msp->ms_defermap[t],
			    space_map_remove, allocmap);

		for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
			space_map_walk(&msp->ms_allocmap[(txg + t) & TXG_MASK],
			    space_map_remove, allocmap);

		/* ms_lock is dropped around the call into the DMU. */
		mutex_exit(&msp->ms_lock);
		space_map_truncate(smo, mos, tx);
		mutex_enter(&msp->ms_lock);
	}

	space_map_sync(allocmap, SM_ALLOC, smo, mos, tx);
	space_map_sync(freemap, SM_FREE, smo, mos, tx);

	mutex_exit(&msp->ms_lock);

	/* Copy the updated header into the object's bonus buffer. */
	VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
	dmu_buf_will_dirty(db, tx);
	ASSERT3U(db->db_size, >=, sizeof (*smo));
	bcopy(smo, db->db_data, sizeof (*smo));
	dmu_buf_rele(db, FTAG);

	dmu_tx_commit(tx);
}
   1006   1732   bonwick 
/*
 * Called after a transaction group has completely synced to mark
 * all of the metaslab's free space as usable.
 *
 *	msp	metaslab whose txg has finished syncing
 *	txg	the synced transaction group (metaslab_init() passes 0)
 *
 * On first use this also creates the metaslab's allocmaps, freemaps and
 * defermaps and adds its size to the vdev.  The synced space map header
 * (ms_smo_syncing) becomes the current one (ms_smo), and the metaslab is
 * re-sorted in its group by its recomputed weight.
 */
void
metaslab_sync_done(metaslab_t *msp, uint64_t txg)
{
	space_map_obj_t *smo = &msp->ms_smo;
	space_map_obj_t *smosync = &msp->ms_smo_syncing;
	space_map_t *sm = &msp->ms_map;
	space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
	space_map_t *defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE];
	metaslab_group_t *mg = msp->ms_group;
	vdev_t *vd = mg->mg_vd;
	int64_t alloc_delta, defer_delta;

	ASSERT(!vd->vdev_ishole);

	mutex_enter(&msp->ms_lock);

	/*
	 * If this metaslab is just becoming available, initialize its
	 * allocmaps and freemaps and add its capacity to the vdev.
	 */
	if (freed_map->sm_size == 0) {
		for (int t = 0; t < TXG_SIZE; t++) {
			space_map_create(&msp->ms_allocmap[t], sm->sm_start,
			    sm->sm_size, sm->sm_shift, sm->sm_lock);
			space_map_create(&msp->ms_freemap[t], sm->sm_start,
			    sm->sm_size, sm->sm_shift, sm->sm_lock);
		}

		for (int t = 0; t < TXG_DEFER_SIZE; t++)
			space_map_create(&msp->ms_defermap[t], sm->sm_start,
			    sm->sm_size, sm->sm_shift, sm->sm_lock);

		vdev_space_update(vd, 0, 0, sm->sm_size);
	}

	/*
	 * alloc_delta: change in allocated space between the current and
	 * the just-synced header.  defer_delta: space entering the defer
	 * map (this txg's frees) minus the space about to leave it.
	 */
	alloc_delta = smosync->smo_alloc - smo->smo_alloc;
	defer_delta = freed_map->sm_space - defer_map->sm_space;

	vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);

	ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0);
	ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0);

	/*
	 * If there's a space_map_load() in progress, wait for it to complete
	 * so that we have a consistent view of the in-core space map.
	 * Then, add defer_map (oldest deferred frees) to this map and
	 * transfer freed_map (this txg's frees) to defer_map.
	 */
	space_map_load_wait(sm);
	space_map_vacate(defer_map, sm->sm_loaded ? space_map_free : NULL, sm);
	space_map_vacate(freed_map, space_map_add, defer_map);

	/* The synced header is now the current one. */
	*smo = *smosync;

	msp->ms_deferspace += defer_delta;
	ASSERT3S(msp->ms_deferspace, >=, 0);
	ASSERT3S(msp->ms_deferspace, <=, sm->sm_size);
	if (msp->ms_deferspace != 0) {
		/*
		 * Keep syncing this metaslab until all deferred frees
		 * are back in circulation.
		 */
		vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
	}

	/*
	 * If the map is loaded but no longer active, evict it as soon as all
	 * future allocations have synced.  (If we unloaded it now and then
	 * loaded a moment later, the map wouldn't reflect those allocations.)
	 */
	if (sm->sm_loaded && (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
		int evictable = 1;

		for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
			if (msp->ms_allocmap[(txg + t) & TXG_MASK].sm_space)
				evictable = 0;

		/* With metaslab_debug set, maps are never unloaded here. */
		if (evictable && !metaslab_debug)
			space_map_unload(sm);
	}

	metaslab_group_sort(mg, msp, metaslab_weight(msp));

	mutex_exit(&msp->ms_lock);
}
   1097    789    ahrens 
   1098  11146    George void
   1099  11146    George metaslab_sync_reassess(metaslab_group_t *mg)
   1100  11146    George {
   1101  11146    George 	vdev_t *vd = mg->mg_vd;
   1102  11146    George 
   1103  11146    George 	/*
   1104  11146    George 	 * Re-evaluate all metaslabs which have lower offsets than the
   1105  11146    George 	 * bonus area.
   1106  11146    George 	 */
   1107  11146    George 	for (int m = 0; m < vd->vdev_ms_count; m++) {
   1108  11146    George 		metaslab_t *msp = vd->vdev_ms[m];
   1109  11146    George 
   1110  11146    George 		if (msp->ms_map.sm_start > mg->mg_bonus_area)
   1111  11146    George 			break;
   1112  11146    George 
   1113  11146    George 		mutex_enter(&msp->ms_lock);
   1114  11146    George 		metaslab_group_sort(mg, msp, metaslab_weight(msp));
   1115  11146    George 		mutex_exit(&msp->ms_lock);
   1116  11146    George 	}
   1117  11146    George 
   1118  11146    George 	/*
   1119  11146    George 	 * Prefetch the next potential metaslabs
   1120  11146    George 	 */
   1121  11146    George 	metaslab_prefetch(mg);
   1122  11146    George }
   1123  11146    George 
   1124   1775     billm static uint64_t
   1125   1775     billm metaslab_distance(metaslab_t *msp, dva_t *dva)
   1126   1775     billm {
   1127   1775     billm 	uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
   1128   1775     billm 	uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
   1129   1775     billm 	uint64_t start = msp->ms_map.sm_start >> ms_shift;
   1130   1775     billm 
   1131   1775     billm 	if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
   1132   1775     billm 		return (1ULL << 63);
   1133   1775     billm 
   1134   1775     billm 	if (offset < start)
   1135   1775     billm 		return ((start - offset) << ms_shift);
   1136   1775     billm 	if (offset > start)
   1137   1775     billm 		return ((offset - start) << ms_shift);
   1138   1775     billm 	return (0);
   1139   1775     billm }
   1140   1775     billm 
   1141   1775     billm static uint64_t
   1142   1775     billm metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
   1143   1775     billm     uint64_t min_distance, dva_t *dva, int d)
   1144    789    ahrens {
   1145   1732   bonwick 	metaslab_t *msp = NULL;
   1146   1732   bonwick 	uint64_t offset = -1ULL;
   1147   1775     billm 	avl_tree_t *t = &mg->mg_metaslab_tree;
   1148   1775     billm 	uint64_t activation_weight;
   1149   1775     billm 	uint64_t target_distance;
   1150   1775     billm 	int i;
   1151   1775     billm 
   1152   1775     billm 	activation_weight = METASLAB_WEIGHT_PRIMARY;
   1153   9480    George 	for (i = 0; i < d; i++) {
   1154   9480    George 		if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
   1155   1775     billm 			activation_weight = METASLAB_WEIGHT_SECONDARY;
   1156   9480    George 			break;
   1157   9480    George 		}
   1158   9480    George 	}
   1159    789    ahrens 
   1160   1732   bonwick 	for (;;) {
   1161   9480    George 		boolean_t was_active;
   1162   9480    George 
   1163   1732   bonwick 		mutex_enter(&mg->mg_lock);
   1164   1775     billm 		for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
   1165   1775     billm 			if (msp->ms_weight < size) {
   1166   1775     billm 				mutex_exit(&mg->mg_lock);
   1167   1775     billm 				return (-1ULL);
   1168   1775     billm 			}
   1169   1775     billm 
   1170   9480    George 			was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
   1171   1775     billm 			if (activation_weight == METASLAB_WEIGHT_PRIMARY)
   1172   1775     billm 				break;
   1173   1775     billm 
   1174   1775     billm 			target_distance = min_distance +
   1175   1775     billm 			    (msp->ms_smo.smo_alloc ? 0 : min_distance >> 1);
   1176   1775     billm 
   1177   1775     billm 			for (i = 0; i < d; i++)
   1178   1775     billm 				if (metaslab_distance(msp, &dva[i]) <
   1179   1775     billm 				    target_distance)
   1180   1775     billm 					break;
   1181   1775     billm 			if (i == d)
   1182   1775     billm 				break;
   1183   1732   bonwick 		}
   1184   1732   bonwick 		mutex_exit(&mg->mg_lock);
   1185   1775     billm 		if (msp == NULL)
   1186   1775     billm 			return (-1ULL);
   1187    789    ahrens 
   1188   1732   bonwick 		mutex_enter(&msp->ms_lock);
   1189    789    ahrens 
   1190   3848   gw25295 		/*
   1191   3848   gw25295 		 * Ensure that the metaslab we have selected is still
   1192   3848   gw25295 		 * capable of handling our request. It's possible that
   1193   3848   gw25295 		 * another thread may have changed the weight while we
   1194   3848   gw25295 		 * were blocked on the metaslab lock.
   1195   3848   gw25295 		 */
   1196   9480    George 		if (msp->ms_weight < size || (was_active &&
   1197   9480    George 		    !(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
   1198   9480    George 		    activation_weight == METASLAB_WEIGHT_PRIMARY)) {
   1199   3848   gw25295 			mutex_exit(&msp->ms_lock);
   1200   3848   gw25295 			continue;
   1201   3848   gw25295 		}
   1202   3848   gw25295 
   1203   1775     billm 		if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
   1204   1775     billm 		    activation_weight == METASLAB_WEIGHT_PRIMARY) {
   1205   1775     billm 			metaslab_passivate(msp,
   1206   2459    ahrens 			    msp->ms_weight & ~METASLAB_ACTIVE_MASK);
   1207   1775     billm 			mutex_exit(&msp->ms_lock);
   1208   1775     billm 			continue;
   1209   1775     billm 		}
   1210   1775     billm 
   1211   9480    George 		if (metaslab_activate(msp, activation_weight, size) != 0) {
   1212    789    ahrens 			mutex_exit(&msp->ms_lock);
   1213    789    ahrens 			continue;
   1214    789    ahrens 		}
   1215   1732   bonwick 
   1216   1732   bonwick 		if ((offset = space_map_alloc(&msp->ms_map, size)) != -1ULL)
   1217   1732   bonwick 			break;
   1218   1732   bonwick 
   1219  11146    George 		metaslab_passivate(msp, space_map_maxsize(&msp->ms_map));
   1220   1732   bonwick 
   1221    789    ahrens 		mutex_exit(&msp->ms_lock);
   1222    789    ahrens 	}
   1223    789    ahrens 
   1224   1732   bonwick 	if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
   1225   1732   bonwick 		vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
   1226   1732   bonwick 
   1227   1732   bonwick 	space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
   1228   1732   bonwick 
   1229   1732   bonwick 	mutex_exit(&msp->ms_lock);
   1230   1732   bonwick 
   1231   1775     billm 	return (offset);
   1232    789    ahrens }
   1233    789    ahrens 
   1234    789    ahrens /*
   1235    789    ahrens  * Allocate a block for the specified i/o.
   1236    789    ahrens  */
   1237   1775     billm static int
   1238   4527    perrin metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
   1239   7754      Jeff     dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags)
   1240    789    ahrens {
   1241    789    ahrens 	metaslab_group_t *mg, *rotor;
   1242    789    ahrens 	vdev_t *vd;
   1243   1775     billm 	int dshift = 3;
   1244   1775     billm 	int all_zero;
   1245   8241      Jeff 	int zio_lock = B_FALSE;
   1246   8241      Jeff 	boolean_t allocatable;
   1247    789    ahrens 	uint64_t offset = -1ULL;
   1248    789    ahrens 	uint64_t asize;
   1249   1775     billm 	uint64_t distance;
   1250   1807   bonwick 
   1251   1807   bonwick 	ASSERT(!DVA_IS_VALID(&dva[d]));
   1252    789    ahrens 
   1253    789    ahrens 	/*
   1254   5530   bonwick 	 * For testing, make some blocks above a certain size be gang blocks.
   1255   5530   bonwick 	 */
   1256  11066    rafael 	if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0)
   1257   5530   bonwick 		return (ENOSPC);
   1258   5530   bonwick 
   1259   5530   bonwick 	/*
   1260    789    ahrens 	 * Start at the rotor and loop through all mgs until we find something.
   1261  10922      Jeff 	 * Note that there's no locking on mc_rotor or mc_aliquot because
   1262    789    ahrens 	 * nothing actually breaks if we miss a few updates -- we just won't
   1263    789    ahrens 	 * allocate quite as evenly.  It all balances out over time.
   1264   1775     billm 	 *
   1265   3063    perrin 	 * If we are doing ditto or log blocks, try to spread them across
   1266   3063    perrin 	 * consecutive vdevs.  If we're forced to reuse a vdev before we've
   1267   3063    perrin 	 * allocated all of our ditto blocks, then try and spread them out on
   1268   3063    perrin 	 * that vdev as much as possible.  If it turns out to not be possible,
   1269   1775     billm 	 * gradually lower our standards until anything becomes acceptable.
   1270   1775     billm 	 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
   1271   1775     billm 	 * gives us hope of containing our fault domains to something we're
   1272   1775     billm 	 * able to reason about.  Otherwise, any two top-level vdev failures
   1273   1775     billm 	 * will guarantee the loss of data.  With consecutive allocation,
   1274   1775     billm 	 * only two adjacent top-level vdev failures will result in data loss.
   1275   1775     billm 	 *
   1276   1775     billm 	 * If we are doing gang blocks (hintdva is non-NULL), try to keep
   1277   1775     billm 	 * ourselves on the same vdev as our gang block header.  That
   1278   1775     billm 	 * way, we can hope for locality in vdev_cache, plus it makes our
   1279   1775     billm 	 * fault domains something tractable.
   1280    789    ahrens 	 */
   1281   1775     billm 	if (hintdva) {
   1282   1775     billm 		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
   1283  10594    George 
   1284  10594    George 		/*
   1285  10594    George 		 * It's possible the vdev we're using as the hint no
   1286  10594    George 		 * longer exists (i.e. removed). Consult the rotor when
   1287  10594    George 		 * all else fails.
   1288  10594    George 		 */
   1289  10974      Jeff 		if (vd != NULL) {
   1290   3063    perrin 			mg = vd->vdev_mg;
   1291  10594    George 
   1292  10594    George 			if (flags & METASLAB_HINTBP_AVOID &&
   1293  10594    George 			    mg->mg_next != NULL)
   1294  10594    George 				mg = mg->mg_next;
   1295  10594    George 		} else {
   1296  10594    George 			mg = mc->mc_rotor;
   1297  10594    George 		}
   1298   1775     billm 	} else if (d != 0) {
   1299   1775     billm 		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
   1300   1775     billm 		mg = vd->vdev_mg->mg_next;
   1301   1775     billm 	} else {
   1302   1775     billm 		mg = mc->mc_rotor;
   1303   1775     billm 	}
   1304   4527    perrin 
   1305   4527    perrin 	/*
   1306  10974      Jeff 	 * If the hint put us into the wrong metaslab class, or into a
   1307  10974      Jeff 	 * metaslab group that has been passivated, just follow the rotor.
   1308   4527    perrin 	 */
   1309  10974      Jeff 	if (mg->mg_class != mc || mg->mg_activation_count <= 0)
   1310   4527    perrin 		mg = mc->mc_rotor;
   1311   4527    perrin 
   1312   1775     billm 	rotor = mg;
   1313   1775     billm top:
   1314   1775     billm 	all_zero = B_TRUE;
   1315    789    ahrens 	do {
   1316  10974      Jeff 		ASSERT(mg->mg_activation_count == 1);
   1317  10974      Jeff 
   1318    789    ahrens 		vd = mg->mg_vd;
   1319   8241      Jeff 
   1320   5329   gw25295 		/*
   1321   7754      Jeff 		 * Don't allocate from faulted devices.
   1322   5329   gw25295 		 */
   1323   8241      Jeff 		if (zio_lock) {
   1324   8241      Jeff 			spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
   1325   8241      Jeff 			allocatable = vdev_allocatable(vd);
   1326   8241      Jeff 			spa_config_exit(spa, SCL_ZIO, FTAG);
   1327   8241      Jeff 		} else {
   1328   8241      Jeff 			allocatable = vdev_allocatable(vd);
   1329   8241      Jeff 		}
   1330   8241      Jeff 		if (!allocatable)
   1331   5329   gw25295 			goto next;
   1332   8241      Jeff 
   1333   5329   gw25295 		/*
   1334   5329   gw25295 		 * Avoid writing single-copy data to a failing vdev
   1335   5329   gw25295 		 */
   1336   5329   gw25295 		if ((vd->vdev_stat.vs_write_errors > 0 ||
   1337   5329   gw25295 		    vd->vdev_state < VDEV_STATE_HEALTHY) &&
   1338   5329   gw25295 		    d == 0 && dshift == 3) {
   1339   5329   gw25295 			all_zero = B_FALSE;
   1340   5329   gw25295 			goto next;
   1341   5329   gw25295 		}
   1342   4527    perrin 
   1343   4527    perrin 		ASSERT(mg->mg_class == mc);
   1344   1775     billm 
   1345   1775     billm 		distance = vd->vdev_asize >> dshift;
   1346   1775     billm 		if (distance <= (1ULL << vd->vdev_ms_shift))
   1347   1775     billm 			distance = 0;
   1348   1775     billm 		else
   1349   1775     billm 			all_zero = B_FALSE;
   1350   1775     billm 
   1351    789    ahrens 		asize = vdev_psize_to_asize(vd, psize);
   1352    789    ahrens 		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
   1353    789    ahrens 
   1354   1775     billm 		offset = metaslab_group_alloc(mg, asize, txg, distance, dva, d);
   1355   1775     billm 		if (offset != -1ULL) {
   1356    789    ahrens 			/*
   1357    789    ahrens 			 * If we've just selected this metaslab group,
   1358    789    ahrens 			 * figure out whether the corresponding vdev is
   1359    789    ahrens 			 * over- or under-used relative to the pool,
   1360    789    ahrens 			 * and set an allocation bias to even it out.
   1361    789    ahrens 			 */
   1362  10922      Jeff 			if (mc->mc_aliquot == 0) {
   1363    789    ahrens 				vdev_stat_t *vs = &vd->vdev_stat;
   1364  10922      Jeff 				int64_t vu, cu;
   1365    789    ahrens 
   1366    789    ahrens 				/*
   1367    789    ahrens 				 * Determine percent used in units of 0..1024.
   1368    789    ahrens 				 * (This is just to avoid floating point.)
   1369    789    ahrens 				 */
   1370    789    ahrens 				vu = (vs->vs_alloc << 10) / (vs->vs_space + 1);
   1371  10922      Jeff 				cu = (mc->mc_alloc << 10) / (mc->mc_space + 1);
   1372    789    ahrens 
   1373    789    ahrens 				/*
   1374    789    ahrens 				 * Bias by at most +/- 25% of the aliquot.
   1375    789    ahrens 				 */
   1376  10922      Jeff 				mg->mg_bias = ((cu - vu) *
   1377    789    ahrens 				    (int64_t)mg->mg_aliquot) / (1024 * 4);
   1378    789    ahrens 			}
   1379    789    ahrens 
   1380  10922      Jeff 			if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
   1381    789    ahrens 			    mg->mg_aliquot + mg->mg_bias) {
   1382    789    ahrens 				mc->mc_rotor = mg->mg_next;
   1383  10922      Jeff 				mc->mc_aliquot = 0;
   1384    789    ahrens 			}
   1385    789    ahrens 
   1386   1775     billm 			DVA_SET_VDEV(&dva[d], vd->vdev_id);
   1387   1775     billm 			DVA_SET_OFFSET(&dva[d], offset);
   1388   7754      Jeff 			DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
   1389   1775     billm 			DVA_SET_ASIZE(&dva[d], asize);
   1390    789    ahrens 
   1391    789    ahrens 			return (0);
   1392    789    ahrens 		}
   1393   5329   gw25295 next:
   1394    789    ahrens 		mc->mc_rotor = mg->mg_next;
   1395  10922      Jeff 		mc->mc_aliquot = 0;
   1396    789    ahrens 	} while ((mg = mg->mg_next) != rotor);
   1397    789    ahrens 
   1398   1775     billm 	if (!all_zero) {
   1399   1775     billm 		dshift++;
   1400   1775     billm 		ASSERT(dshift < 64);
   1401   8241      Jeff 		goto top;
   1402   8241      Jeff 	}
   1403   8241      Jeff 
   1404   9480    George 	if (!allocatable && !zio_lock) {
   1405   8241      Jeff 		dshift = 3;
   1406   8241      Jeff 		zio_lock = B_TRUE;
   1407   1775     billm 		goto top;
   1408   1775     billm 	}
   1409   1775     billm 
   1410   1775     billm 	bzero(&dva[d], sizeof (dva_t));
   1411    789    ahrens 
   1412    789    ahrens 	return (ENOSPC);
   1413   1775     billm }
   1414   1775     billm 
   1415    789    ahrens /*
   1416    789    ahrens  * Free the block represented by DVA in the context of the specified
   1417    789    ahrens  * transaction group.
   1418    789    ahrens  */
   1419   1807   bonwick static void
   1420   1807   bonwick metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)
   1421    789    ahrens {
   1422    789    ahrens 	uint64_t vdev = DVA_GET_VDEV(dva);
   1423    789    ahrens 	uint64_t offset = DVA_GET_OFFSET(dva);
   1424    789    ahrens 	uint64_t size = DVA_GET_ASIZE(dva);
   1425    789    ahrens 	vdev_t *vd;
   1426    789    ahrens 	metaslab_t *msp;
   1427    789    ahrens 
   1428   1807   bonwick 	ASSERT(DVA_IS_VALID(dva));
   1429   1807   bonwick 
   1430    789    ahrens 	if (txg > spa_freeze_txg(spa))
   1431    789    ahrens 		return;
   1432    789    ahrens 
   1433   1807   bonwick 	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
   1434   1807   bonwick 	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
   1435   1807   bonwick 		cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
   1436   1807   bonwick 		    (u_longlong_t)vdev, (u_longlong_t)offset);
   1437    789    ahrens 		ASSERT(0);
   1438    789    ahrens 		return;
   1439    789    ahrens 	}
   1440    789    ahrens 
   1441    789    ahrens 	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
   1442    789    ahrens 
   1443    789    ahrens 	if (DVA_GET_GANG(dva))
   1444    789    ahrens 		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
   1445    789    ahrens 
   1446    789    ahrens 	mutex_enter(&msp->ms_lock);
   1447    789    ahrens 
   1448   1732   bonwick 	if (now) {
   1449   1732   bonwick 		space_map_remove(&msp->ms_allocmap[txg & TXG_MASK],
   1450   1732   bonwick 		    offset, size);
   1451   1732   bonwick 		space_map_free(&msp->ms_map, offset, size);
   1452   1732   bonwick 	} else {
   1453   1732   bonwick 		if (msp->ms_freemap[txg & TXG_MASK].sm_space == 0)
   1454   1732   bonwick 			vdev_dirty(vd, VDD_METASLAB, msp, txg);
   1455   1732   bonwick 		space_map_add(&msp->ms_freemap[txg & TXG_MASK], offset, size);
   1456    789    ahrens 	}
   1457    789    ahrens 
   1458    789    ahrens 	mutex_exit(&msp->ms_lock);
   1459    789    ahrens }
   1460   1807   bonwick 
   1461   1807   bonwick /*
   1462   1807   bonwick  * Intent log support: upon opening the pool after a crash, notify the SPA
   1463   1807   bonwick  * of blocks that the intent log has allocated for immediate write, but
   1464   1807   bonwick  * which are still considered free by the SPA because the last transaction
   1465   1807   bonwick  * group didn't commit yet.
   1466   1807   bonwick  */
   1467   1807   bonwick static int
   1468   1807   bonwick metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
   1469   1807   bonwick {
   1470   1807   bonwick 	uint64_t vdev = DVA_GET_VDEV(dva);
   1471   1807   bonwick 	uint64_t offset = DVA_GET_OFFSET(dva);
   1472   1807   bonwick 	uint64_t size = DVA_GET_ASIZE(dva);
   1473   1807   bonwick 	vdev_t *vd;
   1474   1807   bonwick 	metaslab_t *msp;
   1475  10922      Jeff 	int error = 0;
   1476   1807   bonwick 
   1477   1807   bonwick 	ASSERT(DVA_IS_VALID(dva));
   1478   1807   bonwick 
   1479   1807   bonwick 	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
   1480   1807   bonwick 	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
   1481   1807   bonwick 		return (ENXIO);
   1482   1807   bonwick 
   1483   1807   bonwick 	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
   1484   1807   bonwick 
   1485   1807   bonwick 	if (DVA_GET_GANG(dva))
   1486   1807   bonwick 		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
   1487   1807   bonwick 
   1488   1807   bonwick 	mutex_enter(&msp->ms_lock);
   1489   1807   bonwick 
   1490  10922      Jeff 	if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map.sm_loaded)
   1491  10922      Jeff 		error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY, 0);
   1492  10922      Jeff 
   1493  10922      Jeff 	if (error == 0 && !space_map_contains(&msp->ms_map, offset, size))
   1494  10922      Jeff 		error = ENOENT;
   1495  10922      Jeff 
   1496   7754      Jeff 	if (error || txg == 0) {	/* txg == 0 indicates dry run */
   1497   1807   bonwick 		mutex_exit(&msp->ms_lock);
   1498   1807   bonwick 		return (error);
   1499   1807   bonwick 	}
   1500   1807   bonwick 
   1501   7754      Jeff 	space_map_claim(&msp->ms_map, offset, size);
   1502   1807   bonwick 
   1503   8241      Jeff 	if (spa_writeable(spa)) {	/* don't dirty if we're zdb(1M) */
   1504   7754      Jeff 		if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
   1505   7754      Jeff 			vdev_dirty(vd, VDD_METASLAB, msp, txg);
   1506   7754      Jeff 		space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
   1507   7754      Jeff 	}
   1508   1807   bonwick 
   1509   1807   bonwick 	mutex_exit(&msp->ms_lock);
   1510   1807   bonwick 
   1511   1807   bonwick 	return (0);
   1512   1807   bonwick }
   1513   1807   bonwick 
   1514   1807   bonwick int
   1515   4527    perrin metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
   1516   7754      Jeff     int ndvas, uint64_t txg, blkptr_t *hintbp, int flags)
   1517   1807   bonwick {
   1518   1807   bonwick 	dva_t *dva = bp->blk_dva;
   1519   1807   bonwick 	dva_t *hintdva = hintbp->blk_dva;
   1520   1807   bonwick 	int error = 0;
   1521   1807   bonwick 
   1522   7754      Jeff 	ASSERT(bp->blk_birth == 0);
   1523  10922      Jeff 	ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
   1524   7754      Jeff 
   1525   7754      Jeff 	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
   1526   7754      Jeff 
   1527   7754      Jeff 	if (mc->mc_rotor == NULL) {	/* no vdevs in this class */
   1528   7754      Jeff 		spa_config_exit(spa, SCL_ALLOC, FTAG);
   1529   4527    perrin 		return (ENOSPC);
   1530   7754      Jeff 	}
   1531   4527    perrin 
   1532   1807   bonwick 	ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
   1533   1807   bonwick 	ASSERT(BP_GET_NDVAS(bp) == 0);
   1534   1807   bonwick 	ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
   1535   1807   bonwick 
   1536   7754      Jeff 	for (int d = 0; d < ndvas; d++) {
   1537   4527    perrin 		error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
   1538   7754      Jeff 		    txg, flags);
   1539   1807   bonwick 		if (error) {
   1540   1807   bonwick 			for (d--; d >= 0; d--) {
   1541   1807   bonwick 				metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
   1542   1807   bonwick 				bzero(&dva[d], sizeof (dva_t));
   1543   1807   bonwick 			}
   1544   7754      Jeff 			spa_config_exit(spa, SCL_ALLOC, FTAG);
   1545   1807   bonwick 			return (error);
   1546   1807   bonwick 		}
   1547   1807   bonwick 	}
   1548   1807   bonwick 	ASSERT(error == 0);
   1549   1807   bonwick 	ASSERT(BP_GET_NDVAS(bp) == ndvas);
   1550   7754      Jeff 
   1551   7754      Jeff 	spa_config_exit(spa, SCL_ALLOC, FTAG);
   1552   7754      Jeff 
   1553  10922      Jeff 	BP_SET_BIRTH(bp, txg, txg);
   1554   1807   bonwick 
   1555   1807   bonwick 	return (0);
   1556   1807   bonwick }
   1557   1807   bonwick 
   1558   1807   bonwick void
   1559   1807   bonwick metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
   1560   1807   bonwick {
   1561   1807   bonwick 	const dva_t *dva = bp->blk_dva;
   1562   1807   bonwick 	int ndvas = BP_GET_NDVAS(bp);
   1563   1807   bonwick 
   1564   1807   bonwick 	ASSERT(!BP_IS_HOLE(bp));
   1565  10922      Jeff 	ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
   1566   1807   bonwick 
   1567   7754      Jeff 	spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
   1568   7754      Jeff 
   1569   7754      Jeff 	for (int d = 0; d < ndvas; d++)
   1570   1807   bonwick 		metaslab_free_dva(spa, &dva[d], txg, now);
   1571   7754      Jeff 
   1572   7754      Jeff 	spa_config_exit(spa, SCL_FREE, FTAG);
   1573   1807   bonwick }
   1574   1807   bonwick 
   1575   1807   bonwick int
   1576   1807   bonwick metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
   1577   1807   bonwick {
   1578   1807   bonwick 	const dva_t *dva = bp->blk_dva;
   1579   1807   bonwick 	int ndvas = BP_GET_NDVAS(bp);
   1580   7754      Jeff 	int error = 0;
   1581   1807   bonwick 
   1582   1807   bonwick 	ASSERT(!BP_IS_HOLE(bp));
   1583   1807   bonwick 
   1584   7754      Jeff 	if (txg != 0) {
   1585   7754      Jeff 		/*
   1586   7754      Jeff 		 * First do a dry run to make sure all DVAs are claimable,
   1587   7754      Jeff 		 * so we don't have to unwind from partial failures below.
   1588   7754      Jeff 		 */
   1589   7754      Jeff 		if ((error = metaslab_claim(spa, bp, 0)) != 0)
   1590   7754      Jeff 			return (error);
   1591   7754      Jeff 	}
   1592   7754      Jeff 
   1593   7754      Jeff 	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
   1594   7754      Jeff 
   1595   7754      Jeff 	for (int d = 0; d < ndvas; d++)
   1596   1807   bonwick 		if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
   1597   7754      Jeff 			break;
   1598   1807   bonwick 
   1599   7754      Jeff 	spa_config_exit(spa, SCL_ALLOC, FTAG);
   1600   7754      Jeff 
   1601   7754      Jeff 	ASSERT(error == 0 || txg == 0);
   1602   7754      Jeff 
   1603   7754      Jeff 	return (error);
   1604   1807   bonwick }
   1605