Home | History | Annotate | Download | only in zfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #pragma ident	"@(#)dsl_dataset.c	1.38	07/12/18 SMI"
     27 
     28 #include <sys/dmu_objset.h>
     29 #include <sys/dsl_dataset.h>
     30 #include <sys/dsl_dir.h>
     31 #include <sys/dsl_prop.h>
     32 #include <sys/dsl_synctask.h>
     33 #include <sys/dmu_traverse.h>
     34 #include <sys/dmu_tx.h>
     35 #include <sys/arc.h>
     36 #include <sys/zio.h>
     37 #include <sys/zap.h>
     38 #include <sys/unique.h>
     39 #include <sys/zfs_context.h>
     40 #include <sys/zfs_ioctl.h>
     41 #include <sys/spa.h>
     42 #include <sys/sunddi.h>
     43 
/*
 * Forward declarations of the sync-task check and sync callbacks used by
 * this file.
 */
static dsl_checkfunc_t dsl_dataset_destroy_begin_check;
static dsl_syncfunc_t dsl_dataset_destroy_begin_sync;
static dsl_checkfunc_t dsl_dataset_rollback_check;
static dsl_syncfunc_t dsl_dataset_rollback_sync;
static dsl_syncfunc_t dsl_dataset_set_reservation_sync;

/* Largest total open refcount a dataset may carry (see below). */
#define	DS_REF_MAX	(1ULL << 62)

/* Block size handed to bplist_create() when a deadlist is created. */
#define	DSL_DEADLIST_BLOCKSIZE	SPA_MAXBLOCKSIZE

/*
 * We use weighted reference counts to express the various forms of exclusion
 * between different open modes.  A STANDARD open is 1 point, an EXCLUSIVE open
 * is DS_REF_MAX, and a PRIMARY open is a little more than half of an
 * EXCLUSIVE.  This makes the exclusion logic simple: the total refcnt for all
 * opens cannot exceed DS_REF_MAX.  For example, EXCLUSIVE opens are exclusive
 * because their weight (DS_REF_MAX) consumes the entire refcnt space.  PRIMARY
 * opens consume just over half of the refcnt space, so there can't be more
 * than one, but it can peacefully coexist with any number of STANDARD opens.
 *
 * The table is indexed by DS_MODE_LEVEL(mode).
 */
static uint64_t ds_refcnt_weight[DS_MODE_LEVELS] = {
	0,			/* DS_MODE_NONE - invalid		*/
	1,			/* DS_MODE_STANDARD - unlimited number	*/
	(DS_REF_MAX >> 1) + 1,	/* DS_MODE_PRIMARY - only one of these	*/
	DS_REF_MAX		/* DS_MODE_EXCLUSIVE - no other opens	*/
};
     70 
     71 /*
     72  * Figure out how much of this delta should be propogated to the dsl_dir
     73  * layer.  If there's a refreservation, that space has already been
     74  * partially accounted for in our ancestors.
     75  */
     76 static int64_t
     77 parent_delta(dsl_dataset_t *ds, int64_t delta)
     78 {
     79 	uint64_t old_bytes, new_bytes;
     80 
     81 	if (ds->ds_reserved == 0)
     82 		return (delta);
     83 
     84 	old_bytes = MAX(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
     85 	new_bytes = MAX(ds->ds_phys->ds_unique_bytes + delta, ds->ds_reserved);
     86 
     87 	ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta));
     88 	return (new_bytes - old_bytes);
     89 }
     90 
     91 void
     92 dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
     93 {
     94 	int used = bp_get_dasize(tx->tx_pool->dp_spa, bp);
     95 	int compressed = BP_GET_PSIZE(bp);
     96 	int uncompressed = BP_GET_UCSIZE(bp);
     97 	int64_t delta;
     98 
     99 	dprintf_bp(bp, "born, ds=%p\n", ds);
    100 
    101 	ASSERT(dmu_tx_is_syncing(tx));
    102 	/* It could have been compressed away to nothing */
    103 	if (BP_IS_HOLE(bp))
    104 		return;
    105 	ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
    106 	ASSERT3U(BP_GET_TYPE(bp), <, DMU_OT_NUMTYPES);
    107 	if (ds == NULL) {
    108 		/*
    109 		 * Account for the meta-objset space in its placeholder
    110 		 * dsl_dir.
    111 		 */
    112 		ASSERT3U(compressed, ==, uncompressed); /* it's all metadata */
    113 		dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir,
    114 		    used, compressed, uncompressed, tx);
    115 		dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
    116 		return;
    117 	}
    118 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
    119 	mutex_enter(&ds->ds_lock);
    120 	delta = parent_delta(ds, used);
    121 	ds->ds_phys->ds_used_bytes += used;
    122 	ds->ds_phys->ds_compressed_bytes += compressed;
    123 	ds->ds_phys->ds_uncompressed_bytes += uncompressed;
    124 	ds->ds_phys->ds_unique_bytes += used;
    125 	mutex_exit(&ds->ds_lock);
    126 	dsl_dir_diduse_space(ds->ds_dir, delta, compressed, uncompressed, tx);
    127 }
    128 
/*
 * Account for a block that is no longer referenced by 'ds', and free it if
 * possible.
 *
 * ds	dataset giving up the block, or NULL for the meta-objset
 * bp	block pointer being released; holes are ignored
 * pio	parent zio for the free; if NULL the free is issued with ARC_WAIT,
 *	otherwise with ARC_NOWAIT
 * tx	transaction, which must be in syncing context
 *
 * A block born after ds_prev_snap_txg is freed immediately and removed from
 * the dataset's unique bytes.  Any other block is appended to the dataset's
 * deadlist instead.  In both cases the dataset's used, compressed and
 * uncompressed byte counts are reduced.
 */
void
dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio,
    dmu_tx_t *tx)
{
	int used = bp_get_dasize(tx->tx_pool->dp_spa, bp);
	int compressed = BP_GET_PSIZE(bp);
	int uncompressed = BP_GET_UCSIZE(bp);

	ASSERT(dmu_tx_is_syncing(tx));
	/* No block pointer => nothing to free */
	if (BP_IS_HOLE(bp))
		return;

	ASSERT(used > 0);
	if (ds == NULL) {
		int err;
		/*
		 * Account for the meta-objset space in its placeholder
		 * dsl_dir.
		 */
		err = arc_free(pio, tx->tx_pool->dp_spa,
		    tx->tx_txg, bp, NULL, NULL, pio ? ARC_NOWAIT: ARC_WAIT);
		ASSERT(err == 0);

		dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir,
		    -used, -compressed, -uncompressed, tx);
		dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
		return;
	}
	ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);

	dmu_buf_will_dirty(ds->ds_dbuf, tx);

	if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) {
		int err;
		int64_t delta;

		/* Born after the previous snapshot txg: free it right away. */
		dprintf_bp(bp, "freeing: %s", "");
		err = arc_free(pio, tx->tx_pool->dp_spa,
		    tx->tx_txg, bp, NULL, NULL, pio ? ARC_NOWAIT: ARC_WAIT);
		ASSERT(err == 0);

		mutex_enter(&ds->ds_lock);
		ASSERT(ds->ds_phys->ds_unique_bytes >= used ||
		    !DS_UNIQUE_IS_ACCURATE(ds));
		/* parent_delta() must run before ds_unique_bytes changes. */
		delta = parent_delta(ds, -used);
		ds->ds_phys->ds_unique_bytes -= used;
		mutex_exit(&ds->ds_lock);
		dsl_dir_diduse_space(ds->ds_dir,
		    delta, -compressed, -uncompressed, tx);
	} else {
		/* Not newer than the previous snapshot txg: defer the free. */
		dprintf_bp(bp, "putting on dead list: %s", "");
		VERIFY(0 == bplist_enqueue(&ds->ds_deadlist, bp, tx));
		ASSERT3U(ds->ds_prev->ds_object, ==,
		    ds->ds_phys->ds_prev_snap_obj);
		ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0);
		/* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
		if (ds->ds_prev->ds_phys->ds_next_snap_obj ==
		    ds->ds_object && bp->blk_birth >
		    ds->ds_prev->ds_phys->ds_prev_snap_txg) {
			dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
			mutex_enter(&ds->ds_prev->ds_lock);
			ds->ds_prev->ds_phys->ds_unique_bytes += used;
			mutex_exit(&ds->ds_prev->ds_lock);
		}
	}
	/* Either way, this dataset no longer references the block. */
	mutex_enter(&ds->ds_lock);
	ASSERT3U(ds->ds_phys->ds_used_bytes, >=, used);
	ds->ds_phys->ds_used_bytes -= used;
	ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed);
	ds->ds_phys->ds_compressed_bytes -= compressed;
	ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed);
	ds->ds_phys->ds_uncompressed_bytes -= uncompressed;
	mutex_exit(&ds->ds_lock);
}
    204 
    205 uint64_t
    206 dsl_dataset_prev_snap_txg(dsl_dataset_t *ds)
    207 {
    208 	uint64_t trysnap = 0;
    209 
    210 	if (ds == NULL)
    211 		return (0);
    212 	/*
    213 	 * The snapshot creation could fail, but that would cause an
    214 	 * incorrect FALSE return, which would only result in an
    215 	 * overestimation of the amount of space that an operation would
    216 	 * consume, which is OK.
    217 	 *
    218 	 * There's also a small window where we could miss a pending
    219 	 * snapshot, because we could set the sync task in the quiescing
    220 	 * phase.  So this should only be used as a guess.
    221 	 */
    222 	if (ds->ds_trysnap_txg >
    223 	    spa_last_synced_txg(ds->ds_dir->dd_pool->dp_spa))
    224 		trysnap = ds->ds_trysnap_txg;
    225 	return (MAX(ds->ds_phys->ds_prev_snap_txg, trysnap));
    226 }
    227 
    228 int
    229 dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth)
    230 {
    231 	return (blk_birth > dsl_dataset_prev_snap_txg(ds));
    232 }
    233 
/*
 * Eviction callback for the in-core dsl_dataset_t; it is the function that
 * dsl_dataset_open_obj() passes to dmu_buf_set_user_ie().  Releases
 * everything the structure holds and frees it.
 *
 * db	the dbuf being evicted (unused)
 * dsv	the dsl_dataset_t that was attached to the dbuf
 */
/* ARGSUSED */
static void
dsl_dataset_evict(dmu_buf_t *db, void *dsv)
{
	dsl_dataset_t *ds = dsv;

	/* open_refcount == DS_REF_MAX when deleting */
	ASSERT(ds->ds_open_refcount == 0 ||
	    ds->ds_open_refcount == DS_REF_MAX);

	dprintf_ds(ds, "evicting %s\n", "");

	unique_remove(ds->ds_fsid_guid);

	/* Give the user-pointer owner a chance to clean up its state. */
	if (ds->ds_user_ptr != NULL)
		ds->ds_user_evict_func(ds, ds->ds_user_ptr);

	/* Drop the hold on the previous snapshot, tagged with ds itself. */
	if (ds->ds_prev) {
		dsl_dataset_close(ds->ds_prev, DS_MODE_NONE, ds);
		ds->ds_prev = NULL;
	}

	bplist_close(&ds->ds_deadlist);
	dsl_dir_close(ds->ds_dir, ds);

	ASSERT(!list_link_active(&ds->ds_synced_link));

	mutex_destroy(&ds->ds_lock);
	mutex_destroy(&ds->ds_opening_lock);
	mutex_destroy(&ds->ds_deadlist.bpl_lock);

	kmem_free(ds, sizeof (dsl_dataset_t));
}
    267 
    268 static int
    269 dsl_dataset_get_snapname(dsl_dataset_t *ds)
    270 {
    271 	dsl_dataset_phys_t *headphys;
    272 	int err;
    273 	dmu_buf_t *headdbuf;
    274 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
    275 	objset_t *mos = dp->dp_meta_objset;
    276 
    277 	if (ds->ds_snapname[0])
    278 		return (0);
    279 	if (ds->ds_phys->ds_next_snap_obj == 0)
    280 		return (0);
    281 
    282 	err = dmu_bonus_hold(mos, ds->ds_dir->dd_phys->dd_head_dataset_obj,
    283 	    FTAG, &headdbuf);
    284 	if (err)
    285 		return (err);
    286 	headphys = headdbuf->db_data;
    287 	err = zap_value_search(dp->dp_meta_objset,
    288 	    headphys->ds_snapnames_zapobj, ds->ds_object, 0, ds->ds_snapname);
    289 	dmu_buf_rele(headdbuf, FTAG);
    290 	return (err);
    291 }
    292 
    293 int
    294 dsl_dataset_open_obj(dsl_pool_t *dp, uint64_t dsobj, const char *snapname,
    295     int mode, void *tag, dsl_dataset_t **dsp)
    296 {
    297 	uint64_t weight = ds_refcnt_weight[DS_MODE_LEVEL(mode)];
    298 	objset_t *mos = dp->dp_meta_objset;
    299 	dmu_buf_t *dbuf;
    300 	dsl_dataset_t *ds;
    301 	int err;
    302 
    303 	ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
    304 	    dsl_pool_sync_context(dp));
    305 
    306 	err = dmu_bonus_hold(mos, dsobj, tag, &dbuf);
    307 	if (err)
    308 		return (err);
    309 	ds = dmu_buf_get_user(dbuf);
    310 	if (ds == NULL) {
    311 		dsl_dataset_t *winner;
    312 
    313 		ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP);
    314 		ds->ds_dbuf = dbuf;
    315 		ds->ds_object = dsobj;
    316 		ds->ds_phys = dbuf->db_data;
    317 
    318 		mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
    319 		mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL);
    320 		mutex_init(&ds->ds_deadlist.bpl_lock, NULL, MUTEX_DEFAULT,
    321 		    NULL);
    322 
    323 		err = bplist_open(&ds->ds_deadlist,
    324 		    mos, ds->ds_phys->ds_deadlist_obj);
    325 		if (err == 0) {
    326 			err = dsl_dir_open_obj(dp,
    327 			    ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir);
    328 		}
    329 		if (err) {
    330 			/*
    331 			 * we don't really need to close the blist if we
    332 			 * just opened it.
    333 			 */
    334 			mutex_destroy(&ds->ds_lock);
    335 			mutex_destroy(&ds->ds_opening_lock);
    336 			mutex_destroy(&ds->ds_deadlist.bpl_lock);
    337 			kmem_free(ds, sizeof (dsl_dataset_t));
    338 			dmu_buf_rele(dbuf, tag);
    339 			return (err);
    340 		}
    341 
    342 		if (ds->ds_dir->dd_phys->dd_head_dataset_obj == dsobj) {
    343 			ds->ds_snapname[0] = '\0';
    344 			if (ds->ds_phys->ds_prev_snap_obj) {
    345 				err = dsl_dataset_open_obj(dp,
    346 				    ds->ds_phys->ds_prev_snap_obj, NULL,
    347 				    DS_MODE_NONE, ds, &ds->ds_prev);
    348 			}
    349 		} else {
    350 			if (snapname) {
    351 #ifdef ZFS_DEBUG
    352 				dsl_dataset_phys_t *headphys;
    353 				dmu_buf_t *headdbuf;
    354 				err = dmu_bonus_hold(mos,
    355 				    ds->ds_dir->dd_phys->dd_head_dataset_obj,
    356 				    FTAG, &headdbuf);
    357 				if (err == 0) {
    358 					headphys = headdbuf->db_data;
    359 					uint64_t foundobj;
    360 					err = zap_lookup(dp->dp_meta_objset,
    361 					    headphys->ds_snapnames_zapobj,
    362 					    snapname, sizeof (foundobj), 1,
    363 					    &foundobj);
    364 					ASSERT3U(foundobj, ==, dsobj);
    365 					dmu_buf_rele(headdbuf, FTAG);
    366 				}
    367 #endif
    368 				(void) strcat(ds->ds_snapname, snapname);
    369 			} else if (zfs_flags & ZFS_DEBUG_SNAPNAMES) {
    370 				err = dsl_dataset_get_snapname(ds);
    371 			}
    372 		}
    373 
    374 		if (!dsl_dataset_is_snapshot(ds)) {
    375 			/*
    376 			 * In sync context, we're called with either no lock
    377 			 * or with the write lock.  If we're not syncing,
    378 			 * we're always called with the read lock held.
    379 			 */
    380 			boolean_t need_lock =
    381 			    !RW_WRITE_HELD(&dp->dp_config_rwlock) &&
    382 			    dsl_pool_sync_context(dp);
    383 
    384 			if (need_lock)
    385 				rw_enter(&dp->dp_config_rwlock, RW_READER);
    386 
    387 			err = dsl_prop_get_ds_locked(ds->ds_dir,
    388 			    "refreservation", sizeof (uint64_t), 1,
    389 			    &ds->ds_reserved, NULL);
    390 			if (err == 0) {
    391 				err = dsl_prop_get_ds_locked(ds->ds_dir,
    392 				    "refquota", sizeof (uint64_t), 1,
    393 				    &ds->ds_quota, NULL);
    394 			}
    395 
    396 			if (need_lock)
    397 				rw_exit(&dp->dp_config_rwlock);
    398 		} else {
    399 			ds->ds_reserved = ds->ds_quota = 0;
    400 		}
    401 
    402 		if (err == 0) {
    403 			winner = dmu_buf_set_user_ie(dbuf, ds, &ds->ds_phys,
    404 			    dsl_dataset_evict);
    405 		}
    406 		if (err || winner) {
    407 			bplist_close(&ds->ds_deadlist);
    408 			if (ds->ds_prev) {
    409 				dsl_dataset_close(ds->ds_prev,
    410 				    DS_MODE_NONE, ds);
    411 			}
    412 			dsl_dir_close(ds->ds_dir, ds);
    413 			mutex_destroy(&ds->ds_lock);
    414 			mutex_destroy(&ds->ds_opening_lock);
    415 			mutex_destroy(&ds->ds_deadlist.bpl_lock);
    416 			kmem_free(ds, sizeof (dsl_dataset_t));
    417 			if (err) {
    418 				dmu_buf_rele(dbuf, tag);
    419 				return (err);
    420 			}
    421 			ds = winner;
    422 		} else {
    423 			ds->ds_fsid_guid =
    424 			    unique_insert(ds->ds_phys->ds_fsid_guid);
    425 		}
    426 	}
    427 	ASSERT3P(ds->ds_dbuf, ==, dbuf);
    428 	ASSERT3P(ds->ds_phys, ==, dbuf->db_data);
    429 
    430 	mutex_enter(&ds->ds_lock);
    431 	if ((DS_MODE_LEVEL(mode) == DS_MODE_PRIMARY &&
    432 	    (ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) &&
    433 	    !DS_MODE_IS_INCONSISTENT(mode)) ||
    434 	    (ds->ds_open_refcount + weight > DS_REF_MAX)) {
    435 		mutex_exit(&ds->ds_lock);
    436 		dsl_dataset_close(ds, DS_MODE_NONE, tag);
    437 		return (EBUSY);
    438 	}
    439 	ds->ds_open_refcount += weight;
    440 	mutex_exit(&ds->ds_lock);
    441 
    442 	*dsp = ds;
    443 	return (0);
    444 }
    445 
/*
 * Open a dataset by name.
 *
 * spa	passed through to dsl_dir_open_spa(); dsl_dataset_open() passes NULL
 * name	dataset name, optionally followed by "@snapshot"
 * mode	DS_MODE_* level plus flags
 * tag	hold tag; pass the same tag to dsl_dataset_close()
 * dsp	receives the opened dataset, or NULL on failure
 *
 * Returns 0 on success.  Returns ENOENT if the directory has no head
 * dataset or the leftover part of the name does not start with '@'.
 * Returns EROFS if a snapshot is named but DS_MODE_IS_READONLY(mode) is
 * false.  Otherwise returns the error of the failing lookup or open.
 */
int
dsl_dataset_open_spa(spa_t *spa, const char *name, int mode,
    void *tag, dsl_dataset_t **dsp)
{
	dsl_dir_t *dd;
	dsl_pool_t *dp;
	const char *tail;
	uint64_t obj;
	dsl_dataset_t *ds = NULL;
	int err = 0;

	/* 'tail' is set to whatever part of 'name' was not consumed. */
	err = dsl_dir_open_spa(spa, name, FTAG, &dd, &tail);
	if (err)
		return (err);

	dp = dd->dd_pool;
	obj = dd->dd_phys->dd_head_dataset_obj;
	rw_enter(&dp->dp_config_rwlock, RW_READER);
	if (obj == 0) {
		/* A dataset with no associated objset */
		err = ENOENT;
		goto out;
	}

	if (tail != NULL) {
		objset_t *mos = dp->dp_meta_objset;

		/*
		 * Briefly open the head dataset just to read the object
		 * number of its snapshot-name ZAP.
		 */
		err = dsl_dataset_open_obj(dp, obj, NULL,
		    DS_MODE_NONE, tag, &ds);
		if (err)
			goto out;
		obj = ds->ds_phys->ds_snapnames_zapobj;
		dsl_dataset_close(ds, DS_MODE_NONE, tag);
		ds = NULL;

		if (tail[0] != '@') {
			err = ENOENT;
			goto out;
		}
		tail++;

		/* Look for a snapshot */
		if (!DS_MODE_IS_READONLY(mode)) {
			err = EROFS;
			goto out;
		}
		dprintf("looking for snapshot '%s'\n", tail);
		/* 'obj' goes in as the ZAP object, comes out as the snapshot. */
		err = zap_lookup(mos, obj, tail, 8, 1, &obj);
		if (err)
			goto out;
	}
	err = dsl_dataset_open_obj(dp, obj, tail, mode, tag, &ds);

out:
	rw_exit(&dp->dp_config_rwlock);
	dsl_dir_close(dd, FTAG);

	ASSERT3U((err == 0), ==, (ds != NULL));
	/* ASSERT(ds == NULL || strcmp(name, ds->ds_name) == 0); */

	*dsp = ds;
	return (err);
}
    509 
    510 int
    511 dsl_dataset_open(const char *name, int mode, void *tag, dsl_dataset_t **dsp)
    512 {
    513 	return (dsl_dataset_open_spa(NULL, name, mode, tag, dsp));
    514 }
    515 
    516 void
    517 dsl_dataset_name(dsl_dataset_t *ds, char *name)
    518 {
    519 	if (ds == NULL) {
    520 		(void) strcpy(name, "mos");
    521 	} else {
    522 		dsl_dir_name(ds->ds_dir, name);
    523 		VERIFY(0 == dsl_dataset_get_snapname(ds));
    524 		if (ds->ds_snapname[0]) {
    525 			(void) strcat(name, "@");
    526 			if (!MUTEX_HELD(&ds->ds_lock)) {
    527 				/*
    528 				 * We use a "recursive" mutex so that we
    529 				 * can call dprintf_ds() with ds_lock held.
    530 				 */
    531 				mutex_enter(&ds->ds_lock);
    532 				(void) strcat(name, ds->ds_snapname);
    533 				mutex_exit(&ds->ds_lock);
    534 			} else {
    535 				(void) strcat(name, ds->ds_snapname);
    536 			}
    537 		}
    538 	}
    539 }
    540 
    541 static int
    542 dsl_dataset_namelen(dsl_dataset_t *ds)
    543 {
    544 	int result;
    545 
    546 	if (ds == NULL) {
    547 		result = 3;	/* "mos" */
    548 	} else {
    549 		result = dsl_dir_namelen(ds->ds_dir);
    550 		VERIFY(0 == dsl_dataset_get_snapname(ds));
    551 		if (ds->ds_snapname[0]) {
    552 			++result;	/* adding one for the @-sign */
    553 			if (!MUTEX_HELD(&ds->ds_lock)) {
    554 				/* see dsl_datset_name */
    555 				mutex_enter(&ds->ds_lock);
    556 				result += strlen(ds->ds_snapname);
    557 				mutex_exit(&ds->ds_lock);
    558 			} else {
    559 				result += strlen(ds->ds_snapname);
    560 			}
    561 		}
    562 	}
    563 
    564 	return (result);
    565 }
    566 
    567 void
    568 dsl_dataset_close(dsl_dataset_t *ds, int mode, void *tag)
    569 {
    570 	uint64_t weight = ds_refcnt_weight[DS_MODE_LEVEL(mode)];
    571 	mutex_enter(&ds->ds_lock);
    572 	ASSERT3U(ds->ds_open_refcount, >=, weight);
    573 	ds->ds_open_refcount -= weight;
    574 	dprintf_ds(ds, "closing mode %u refcount now 0x%llx\n",
    575 	    mode, ds->ds_open_refcount);
    576 	mutex_exit(&ds->ds_lock);
    577 
    578 	dmu_buf_rele(ds->ds_dbuf, tag);
    579 }
    580 
    581 void
    582 dsl_dataset_downgrade(dsl_dataset_t *ds, int oldmode, int newmode)
    583 {
    584 	uint64_t oldweight = ds_refcnt_weight[DS_MODE_LEVEL(oldmode)];
    585 	uint64_t newweight = ds_refcnt_weight[DS_MODE_LEVEL(newmode)];
    586 	mutex_enter(&ds->ds_lock);
    587 	ASSERT3U(ds->ds_open_refcount, >=, oldweight);
    588 	ASSERT3U(oldweight, >=, newweight);
    589 	ds->ds_open_refcount -= oldweight;
    590 	ds->ds_open_refcount += newweight;
    591 	mutex_exit(&ds->ds_lock);
    592 }
    593 
    594 boolean_t
    595 dsl_dataset_tryupgrade(dsl_dataset_t *ds, int oldmode, int newmode)
    596 {
    597 	boolean_t rv;
    598 	uint64_t oldweight = ds_refcnt_weight[DS_MODE_LEVEL(oldmode)];
    599 	uint64_t newweight = ds_refcnt_weight[DS_MODE_LEVEL(newmode)];
    600 	mutex_enter(&ds->ds_lock);
    601 	ASSERT3U(ds->ds_open_refcount, >=, oldweight);
    602 	ASSERT3U(newweight, >=, oldweight);
    603 	if (ds->ds_open_refcount - oldweight + newweight > DS_REF_MAX) {
    604 		rv = B_FALSE;
    605 	} else {
    606 		ds->ds_open_refcount -= oldweight;
    607 		ds->ds_open_refcount += newweight;
    608 		rv = B_TRUE;
    609 	}
    610 	mutex_exit(&ds->ds_lock);
    611 	return (rv);
    612 }
    613 
/*
 * Create the root dsl_dir and its head dataset.
 *
 * dp		pool being populated
 * ddobjp	receives the object number of the new root dsl_dir
 *		(filled in by dsl_dir_create_root())
 * tx		transaction in which the objects are created
 *
 * Every step is wrapped in VERIFY, so a failure panics rather than
 * returning an error.
 */
void
dsl_dataset_create_root(dsl_pool_t *dp, uint64_t *ddobjp, dmu_tx_t *tx)
{
	objset_t *mos = dp->dp_meta_objset;
	dmu_buf_t *dbuf;
	dsl_dataset_phys_t *dsphys;
	dsl_dataset_t *ds;
	uint64_t dsobj;
	dsl_dir_t *dd;

	dsl_dir_create_root(mos, ddobjp, tx);
	VERIFY(0 == dsl_dir_open_obj(dp, *ddobjp, NULL, FTAG, &dd));

	/* Allocate the dataset object and initialize its bonus buffer. */
	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
	    DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
	VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
	dmu_buf_will_dirty(dbuf, tx);
	dsphys = dbuf->db_data;
	dsphys->ds_dir_obj = dd->dd_object;
	dsphys->ds_fsid_guid = unique_create();
	(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
	    sizeof (dsphys->ds_guid));
	dsphys->ds_snapnames_zapobj =
	    zap_create(mos, DMU_OT_DSL_DS_SNAP_MAP, DMU_OT_NONE, 0, tx);
	dsphys->ds_creation_time = gethrestime_sec();
	dsphys->ds_creation_txg = tx->tx_txg;
	dsphys->ds_deadlist_obj =
	    bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
	if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
		dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
	dmu_buf_rele(dbuf, FTAG);

	/* Point the directory at its new head dataset. */
	dmu_buf_will_dirty(dd->dd_dbuf, tx);
	dd->dd_phys->dd_head_dataset_obj = dsobj;
	dsl_dir_close(dd, FTAG);

	/* Open the new dataset just long enough to create its objset. */
	VERIFY(0 ==
	    dsl_dataset_open_obj(dp, dsobj, NULL, DS_MODE_NONE, FTAG, &ds));
	(void) dmu_objset_create_impl(dp->dp_spa, ds,
	    &ds->ds_phys->ds_bp, DMU_OST_ZFS, tx);
	dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
}
    656 
/*
 * Create the head dataset object for 'dd' in syncing context.
 *
 * dd		directory that will own the dataset; it must not have a
 *		head dataset yet
 * origin	snapshot to clone from, or NULL for an empty dataset
 * tx		syncing transaction
 *
 * When 'origin' is given, the new dataset starts with the origin's block
 * pointer and space counts, the origin's ds_num_children is incremented,
 * and dd_origin_obj is set.
 *
 * Returns the object number of the new dataset.
 */
uint64_t
dsl_dataset_create_sync_impl(dsl_dir_t *dd, dsl_dataset_t *origin, dmu_tx_t *tx)
{
	dsl_pool_t *dp = dd->dd_pool;
	dmu_buf_t *dbuf;
	dsl_dataset_phys_t *dsphys;
	uint64_t dsobj;
	objset_t *mos = dp->dp_meta_objset;

	ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp);
	ASSERT(origin == NULL || origin->ds_phys->ds_num_children > 0);
	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(dd->dd_phys->dd_head_dataset_obj == 0);

	/* Allocate the dataset object and initialize its bonus buffer. */
	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
	    DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
	VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
	dmu_buf_will_dirty(dbuf, tx);
	dsphys = dbuf->db_data;
	dsphys->ds_dir_obj = dd->dd_object;
	dsphys->ds_fsid_guid = unique_create();
	(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
	    sizeof (dsphys->ds_guid));
	dsphys->ds_snapnames_zapobj =
	    zap_create(mos, DMU_OT_DSL_DS_SNAP_MAP, DMU_OT_NONE, 0, tx);
	dsphys->ds_creation_time = gethrestime_sec();
	dsphys->ds_creation_txg = tx->tx_txg;
	dsphys->ds_deadlist_obj =
	    bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
	if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
		dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;

	if (origin) {
		/* Clone: inherit the origin's contents and link to it. */
		dsphys->ds_prev_snap_obj = origin->ds_object;
		dsphys->ds_prev_snap_txg =
		    origin->ds_phys->ds_creation_txg;
		dsphys->ds_used_bytes =
		    origin->ds_phys->ds_used_bytes;
		dsphys->ds_compressed_bytes =
		    origin->ds_phys->ds_compressed_bytes;
		dsphys->ds_uncompressed_bytes =
		    origin->ds_phys->ds_uncompressed_bytes;
		dsphys->ds_bp = origin->ds_phys->ds_bp;

		dmu_buf_will_dirty(origin->ds_dbuf, tx);
		origin->ds_phys->ds_num_children++;

		dmu_buf_will_dirty(dd->dd_dbuf, tx);
		dd->dd_phys->dd_origin_obj = origin->ds_object;
	}
	dmu_buf_rele(dbuf, FTAG);

	/* Point the directory at its new head dataset. */
	dmu_buf_will_dirty(dd->dd_dbuf, tx);
	dd->dd_phys->dd_head_dataset_obj = dsobj;

	return (dsobj);
}
    714 
    715 uint64_t
    716 dsl_dataset_create_sync(dsl_dir_t *pdd,
    717     const char *lastname, dsl_dataset_t *origin, cred_t *cr, dmu_tx_t *tx)
    718 {
    719 	dsl_pool_t *dp = pdd->dd_pool;
    720 	uint64_t dsobj, ddobj;
    721 	dsl_dir_t *dd;
    722 
    723 	ASSERT(lastname[0] != '@');
    724 
    725 	ddobj = dsl_dir_create_sync(pdd, lastname, tx);
    726 	VERIFY(0 == dsl_dir_open_obj(dp, ddobj, lastname, FTAG, &dd));
    727 
    728 	dsobj = dsl_dataset_create_sync_impl(dd, origin, tx);
    729 
    730 	dsl_deleg_set_create_perms(dd, tx, cr);
    731 
    732 	dsl_dir_close(dd, FTAG);
    733 
    734 	return (dsobj);
    735 }
    736 
/*
 * Argument bundle passed from dsl_snapshots_destroy() to
 * dsl_snapshot_destroy_one() through dmu_objset_find().
 */
struct destroyarg {
	dsl_sync_task_group_t *dstg;	/* group collecting destroy tasks */
	char *snapname;			/* snapshot name, without the '@' */
	char *failed;			/* receives the fs name on open failure */
};
    742 
    743 static int
    744 dsl_snapshot_destroy_one(char *name, void *arg)
    745 {
    746 	struct destroyarg *da = arg;
    747 	dsl_dataset_t *ds;
    748 	char *cp;
    749 	int err;
    750 
    751 	(void) strcat(name, "@");
    752 	(void) strcat(name, da->snapname);
    753 	err = dsl_dataset_open(name,
    754 	    DS_MODE_EXCLUSIVE | DS_MODE_READONLY | DS_MODE_INCONSISTENT,
    755 	    da->dstg, &ds);
    756 	cp = strchr(name, '@');
    757 	*cp = '\0';
    758 	if (err == ENOENT)
    759 		return (0);
    760 	if (err) {
    761 		(void) strcpy(da->failed, name);
    762 		return (err);
    763 	}
    764 
    765 	dsl_sync_task_create(da->dstg, dsl_dataset_destroy_check,
    766 	    dsl_dataset_destroy_sync, ds, da->dstg, 0);
    767 	return (0);
    768 }
    769 
/*
 * Destroy 'snapname' in all descendants of 'fsname'.
 *
 * fsname	name of the file system to start from; the buffer must be
 *		writable, because on failure it is overwritten with the
 *		name of a file system whose snapshot could not be destroyed
 * snapname	snapshot name, without the '@'
 *
 * Returns 0 on success, or the error from spa_open(), dmu_objset_find() or
 * dsl_sync_task_group_wait().
 */
#pragma weak dmu_snapshots_destroy = dsl_snapshots_destroy
int
dsl_snapshots_destroy(char *fsname, char *snapname)
{
	int err;
	struct destroyarg da;
	dsl_sync_task_t *dst;
	spa_t *spa;

	err = spa_open(fsname, &spa, FTAG);
	if (err)
		return (err);
	da.dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
	da.snapname = snapname;
	da.failed = fsname;

	/* Queue one destroy task per matching snapshot. */
	err = dmu_objset_find(fsname,
	    dsl_snapshot_destroy_one, &da, DS_FIND_CHILDREN);

	if (err == 0)
		err = dsl_sync_task_group_wait(da.dstg);

	for (dst = list_head(&da.dstg->dstg_tasks); dst;
	    dst = list_next(&da.dstg->dstg_tasks, dst)) {
		dsl_dataset_t *ds = dst->dst_arg1;
		/* Report the file system part of a failed task's name. */
		if (dst->dst_err) {
			dsl_dataset_name(ds, fsname);
			*strchr(fsname, '@') = '\0';
		}
		/*
		 * If it was successful, destroy_sync would have
		 * closed the ds
		 */
		if (err)
			dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, da.dstg);
	}

	dsl_sync_task_group_destroy(da.dstg);
	spa_close(spa, FTAG);
	return (err);
}
    814 
/*
 * Destroy a dataset.
 *
 * ds must be opened EXCLUSIVE or PRIMARY.  on return (whether
 * successful or not), ds will be closed and caller can no longer
 * dereference it.
 *
 * ds	dataset to destroy
 * tag	tag the dataset was opened with
 *
 * Returns 0 on success.  Returns EBUSY if a PRIMARY open cannot be
 * upgraded to EXCLUSIVE.  Otherwise returns the error of the failing step.
 */
int
dsl_dataset_destroy(dsl_dataset_t *ds, void *tag)
{
	int err;
	dsl_sync_task_group_t *dstg;
	objset_t *os;
	dsl_dir_t *dd;
	uint64_t obj;

	/* Not already exclusive: the caller holds PRIMARY, so upgrade it. */
	if (ds->ds_open_refcount != DS_REF_MAX) {
		if (dsl_dataset_tryupgrade(ds, DS_MODE_PRIMARY,
		    DS_MODE_EXCLUSIVE) == 0) {
			dsl_dataset_close(ds, DS_MODE_PRIMARY, tag);
			return (EBUSY);
		}
	}

	if (dsl_dataset_is_snapshot(ds)) {
		/* Destroying a snapshot is simpler */
		err = dsl_sync_task_do(ds->ds_dir->dd_pool,
		    dsl_dataset_destroy_check, dsl_dataset_destroy_sync,
		    ds, tag, 0);
		goto out;
	}

	dd = ds->ds_dir;

	/*
	 * Check for errors and mark this ds as inconsistent, in
	 * case we crash while freeing the objects.
	 */
	err = dsl_sync_task_do(dd->dd_pool, dsl_dataset_destroy_begin_check,
	    dsl_dataset_destroy_begin_sync, ds, NULL, 0);
	if (err)
		goto out;

	err = dmu_objset_open_ds(ds, DMU_OST_ANY, &os);
	if (err)
		goto out;

	/*
	 * remove the objects in open context, so that we won't
	 * have too much to do in syncing context.
	 *
	 * The loop runs until dmu_object_next() returns nonzero.  A
	 * 'continue' after a failed dmu_tx_assign() goes to the loop's
	 * increment expression, which overwrites err, so that failure only
	 * skips the current object.
	 */
	for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE,
	    ds->ds_phys->ds_prev_snap_txg)) {
		dmu_tx_t *tx = dmu_tx_create(os);
		dmu_tx_hold_free(tx, obj, 0, DMU_OBJECT_END);
		dmu_tx_hold_bonus(tx, obj);
		err = dmu_tx_assign(tx, TXG_WAIT);
		if (err) {
			/*
			 * Perhaps there is not enough disk
			 * space.  Just deal with it from
			 * dsl_dataset_destroy_sync().
			 */
			dmu_tx_abort(tx);
			continue;
		}
		VERIFY(0 == dmu_object_free(os, obj, tx));
		dmu_tx_commit(tx);
	}
	/* Make sure it's not dirty before we finish destroying it. */
	txg_wait_synced(dd->dd_pool, 0);

	dmu_objset_close(os);
	/* ESRCH is the only loop exit that is not treated as a failure. */
	if (err != ESRCH)
		goto out;

	if (ds->ds_user_ptr) {
		ds->ds_user_evict_func(ds, ds->ds_user_ptr);
		ds->ds_user_ptr = NULL;
	}

	/* Take our own hold on the dsl_dir for the destroy task below. */
	rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
	err = dsl_dir_open_obj(dd->dd_pool, dd->dd_object, NULL, FTAG, &dd);
	rw_exit(&dd->dd_pool->dp_config_rwlock);

	if (err)
		goto out;

	/*
	 * Blow away the dsl_dir + head dataset.
	 */
	dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool);
	dsl_sync_task_create(dstg, dsl_dataset_destroy_check,
	    dsl_dataset_destroy_sync, ds, tag, 0);
	dsl_sync_task_create(dstg, dsl_dir_destroy_check,
	    dsl_dir_destroy_sync, dd, FTAG, 0);
	err = dsl_sync_task_group_wait(dstg);
	dsl_sync_task_group_destroy(dstg);
	/* if it is successful, *destroy_sync will close the ds+dd */
	if (err)
		dsl_dir_close(dd, FTAG);
out:
	/* On failure nothing else closed ds, so close it here. */
	if (err)
		dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, tag);
	return (err);
}
    919 
    920 int
    921 dsl_dataset_rollback(dsl_dataset_t *ds, dmu_objset_type_t ost)
    922 {
    923 	ASSERT3U(ds->ds_open_refcount, ==, DS_REF_MAX);
    924 
    925 	return (dsl_sync_task_do(ds->ds_dir->dd_pool,
    926 	    dsl_dataset_rollback_check, dsl_dataset_rollback_sync,
    927 	    ds, &ost, 0));
    928 }
    929 
    930 void *
    931 dsl_dataset_set_user_ptr(dsl_dataset_t *ds,
    932     void *p, dsl_dataset_evict_func_t func)
    933 {
    934 	void *old;
    935 
    936 	mutex_enter(&ds->ds_lock);
    937 	old = ds->ds_user_ptr;
    938 	if (old == NULL) {
    939 		ds->ds_user_ptr = p;
    940 		ds->ds_user_evict_func = func;
    941 	}
    942 	mutex_exit(&ds->ds_lock);
    943 	return (old);
    944 }
    945 
    946 void *
    947 dsl_dataset_get_user_ptr(dsl_dataset_t *ds)
    948 {
    949 	return (ds->ds_user_ptr);
    950 }
    951 
    952 
    953 blkptr_t *
    954 dsl_dataset_get_blkptr(dsl_dataset_t *ds)
    955 {
    956 	return (&ds->ds_phys->ds_bp);
    957 }
    958 
    959 void
    960 dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
    961 {
    962 	ASSERT(dmu_tx_is_syncing(tx));
    963 	/* If it's the meta-objset, set dp_meta_rootbp */
    964 	if (ds == NULL) {
    965 		tx->tx_pool->dp_meta_rootbp = *bp;
    966 	} else {
    967 		dmu_buf_will_dirty(ds->ds_dbuf, tx);
    968 		ds->ds_phys->ds_bp = *bp;
    969 	}
    970 }
    971 
    972 spa_t *
    973 dsl_dataset_get_spa(dsl_dataset_t *ds)
    974 {
    975 	return (ds->ds_dir->dd_pool->dp_spa);
    976 }
    977 
    978 void
    979 dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
    980 {
    981 	dsl_pool_t *dp;
    982 
    983 	if (ds == NULL) /* this is the meta-objset */
    984 		return;
    985 
    986 	ASSERT(ds->ds_user_ptr != NULL);
    987 
    988 	if (ds->ds_phys->ds_next_snap_obj != 0)
    989 		panic("dirtying snapshot!");
    990 
    991 	dp = ds->ds_dir->dd_pool;
    992 
    993 	if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg) == 0) {
    994 		/* up the hold count until we can be written out */
    995 		dmu_buf_add_ref(ds->ds_dbuf, ds);
    996 	}
    997 }
    998 
    999 /*
   1000  * The unique space in the head dataset can be calculated by subtracting
   1001  * the space used in the most recent snapshot, that is still being used
   1002  * in this file system, from the space currently in use.  To figure out
   1003  * the space in the most recent snapshot still in use, we need to take
   1004  * the total space used in the snapshot and subtract out the space that
   1005  * has been freed up since the snapshot was taken.
   1006  */
   1007 static void
   1008 dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
   1009 {
   1010 	uint64_t mrs_used;
   1011 	uint64_t dlused, dlcomp, dluncomp;
   1012 
   1013 	ASSERT(ds->ds_object == ds->ds_dir->dd_phys->dd_head_dataset_obj);
   1014 
   1015 	if (ds->ds_phys->ds_prev_snap_obj != 0)
   1016 		mrs_used = ds->ds_prev->ds_phys->ds_used_bytes;
   1017 	else
   1018 		mrs_used = 0;
   1019 
   1020 	VERIFY(0 == bplist_space(&ds->ds_deadlist, &dlused, &dlcomp,
   1021 	    &dluncomp));
   1022 
   1023 	ASSERT3U(dlused, <=, mrs_used);
   1024 	ds->ds_phys->ds_unique_bytes =
   1025 	    ds->ds_phys->ds_used_bytes - (mrs_used - dlused);
   1026 
   1027 	if (!DS_UNIQUE_IS_ACCURATE(ds) &&
   1028 	    spa_version(ds->ds_dir->dd_pool->dp_spa) >=
   1029 	    SPA_VERSION_UNIQUE_ACCURATE)
   1030 		ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
   1031 }
   1032 
   1033 static uint64_t
   1034 dsl_dataset_unique(dsl_dataset_t *ds)
   1035 {
   1036 	if (!DS_UNIQUE_IS_ACCURATE(ds) && !dsl_dataset_is_snapshot(ds))
   1037 		dsl_dataset_recalc_head_uniq(ds);
   1038 
   1039 	return (ds->ds_phys->ds_unique_bytes);
   1040 }
   1041 
   1042 struct killarg {
   1043 	int64_t *usedp;
   1044 	int64_t *compressedp;
   1045 	int64_t *uncompressedp;
   1046 	zio_t *zio;
   1047 	dmu_tx_t *tx;
   1048 };
   1049 
   1050 static int
   1051 kill_blkptr(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
   1052 {
   1053 	struct killarg *ka = arg;
   1054 	blkptr_t *bp = &bc->bc_blkptr;
   1055 
   1056 	ASSERT3U(bc->bc_errno, ==, 0);
   1057 
   1058 	/*
   1059 	 * Since this callback is not called concurrently, no lock is
   1060 	 * needed on the accounting values.
   1061 	 */
   1062 	*ka->usedp += bp_get_dasize(spa, bp);
   1063 	*ka->compressedp += BP_GET_PSIZE(bp);
   1064 	*ka->uncompressedp += BP_GET_UCSIZE(bp);
   1065 	/* XXX check for EIO? */
   1066 	(void) arc_free(ka->zio, spa, ka->tx->tx_txg, bp, NULL, NULL,
   1067 	    ARC_NOWAIT);
   1068 	return (0);
   1069 }
   1070 
   1071 /* ARGSUSED */
   1072 static int
   1073 dsl_dataset_rollback_check(void *arg1, void *arg2, dmu_tx_t *tx)
   1074 {
   1075 	dsl_dataset_t *ds = arg1;
   1076 	dmu_objset_type_t *ost = arg2;
   1077 
   1078 	/*
   1079 	 * We can only roll back to emptyness if it is a ZPL objset.
   1080 	 */
   1081 	if (*ost != DMU_OST_ZFS && ds->ds_phys->ds_prev_snap_txg == 0)
   1082 		return (EINVAL);
   1083 
   1084 	/*
   1085 	 * This must not be a snapshot.
   1086 	 */
   1087 	if (ds->ds_phys->ds_next_snap_obj != 0)
   1088 		return (EINVAL);
   1089 
   1090 	/*
   1091 	 * If we made changes this txg, traverse_dsl_dataset won't find
   1092 	 * them.  Try again.
   1093 	 */
   1094 	if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg)
   1095 		return (EAGAIN);
   1096 
   1097 	return (0);
   1098 }
   1099 
   1100 /* ARGSUSED */
   1101 static void
   1102 dsl_dataset_rollback_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
   1103 {
   1104 	dsl_dataset_t *ds = arg1;
   1105 	dmu_objset_type_t *ost = arg2;
   1106 	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
   1107 
   1108 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
   1109 
   1110 	/*
   1111 	 * Before the roll back destroy the zil.
   1112 	 */
   1113 	if (ds->ds_user_ptr != NULL) {
   1114 		zil_rollback_destroy(
   1115 		    ((objset_impl_t *)ds->ds_user_ptr)->os_zil, tx);
   1116 
   1117 		/*
   1118 		 * We need to make sure that the objset_impl_t is reopened after
   1119 		 * we do the rollback, otherwise it will have the wrong
   1120 		 * objset_phys_t.  Normally this would happen when this
   1121 		 * DS_MODE_EXCLUSIVE dataset-open is closed, thus causing the
   1122 		 * dataset to be immediately evicted.  But when doing "zfs recv
   1123 		 * -F", we reopen the objset before that, so that there is no
   1124 		 * window where the dataset is closed and inconsistent.
   1125 		 */
   1126 		ds->ds_user_evict_func(ds, ds->ds_user_ptr);
   1127 		ds->ds_user_ptr = NULL;
   1128 	}
   1129 
   1130 	/* Zero out the deadlist. */
   1131 	bplist_close(&ds->ds_deadlist);
   1132 	bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx);
   1133 	ds->ds_phys->ds_deadlist_obj =
   1134 	    bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
   1135 	VERIFY(0 == bplist_open(&ds->ds_deadlist, mos,
   1136 	    ds->ds_phys->ds_deadlist_obj));
   1137 
   1138 	{
   1139 		/* Free blkptrs that we gave birth to */
   1140 		zio_t *zio;
   1141 		int64_t used = 0, compressed = 0, uncompressed = 0;
   1142 		struct killarg ka;
   1143 		int64_t delta;
   1144 
   1145 		zio = zio_root(tx->tx_pool->dp_spa, NULL, NULL,
   1146 		    ZIO_FLAG_MUSTSUCCEED);
   1147 		ka.usedp = &used;
   1148 		ka.compressedp = &compressed;
   1149 		ka.uncompressedp = &uncompressed;
   1150 		ka.zio = zio;
   1151 		ka.tx = tx;
   1152 		(void) traverse_dsl_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
   1153 		    ADVANCE_POST, kill_blkptr, &ka);
   1154 		(void) zio_wait(zio);
   1155 
   1156 		/* only deduct space beyond any refreservation */
   1157 		delta = parent_delta(ds, -used);
   1158 		dsl_dir_diduse_space(ds->ds_dir,
   1159 		    delta, -compressed, -uncompressed, tx);
   1160 	}
   1161 
   1162 	if (ds->ds_prev) {
   1163 		/* Change our contents to that of the prev snapshot */
   1164 		ASSERT3U(ds->ds_prev->ds_object, ==,
   1165 		    ds->ds_phys->ds_prev_snap_obj);
   1166 		ds->ds_phys->ds_bp = ds->ds_prev->ds_phys->ds_bp;
   1167 		ds->ds_phys->ds_used_bytes =
   1168 		    ds->ds_prev->ds_phys->ds_used_bytes;
   1169 		ds->ds_phys->ds_compressed_bytes =
   1170 		    ds->ds_prev->ds_phys->ds_compressed_bytes;
   1171 		ds->ds_phys->ds_uncompressed_bytes =
   1172 		    ds->ds_prev->ds_phys->ds_uncompressed_bytes;
   1173 		ds->ds_phys->ds_flags = ds->ds_prev->ds_phys->ds_flags;
   1174 		ds->ds_phys->ds_unique_bytes = 0;
   1175 
   1176 		if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) {
   1177 			dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
   1178 			ds->ds_prev->ds_phys->ds_unique_bytes = 0;
   1179 		}
   1180 	} else {
   1181 		/* Zero out our contents, recreate objset */
   1182 		bzero(&ds->ds_phys->ds_bp, sizeof (blkptr_t));
   1183 		ds->ds_phys->ds_used_bytes = 0;
   1184 		ds->ds_phys->ds_compressed_bytes = 0;
   1185 		ds->ds_phys->ds_uncompressed_bytes = 0;
   1186 		ds->ds_phys->ds_flags = 0;
   1187 		ds->ds_phys->ds_unique_bytes = 0;
   1188 		(void) dmu_objset_create_impl(ds->ds_dir->dd_pool->dp_spa, ds,
   1189 		    &ds->ds_phys->ds_bp, *ost, tx);
   1190 	}
   1191 
   1192 	spa_history_internal_log(LOG_DS_ROLLBACK, ds->ds_dir->dd_pool->dp_spa,
   1193 	    tx, cr, "dataset = %llu", ds->ds_object);
   1194 }
   1195 
   1196 /* ARGSUSED */
   1197 static int
   1198 dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx)
   1199 {
   1200 	dsl_dataset_t *ds = arg1;
   1201 	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
   1202 	uint64_t count;
   1203 	int err;
   1204 
   1205 	/*
   1206 	 * Can't delete a head dataset if there are snapshots of it.
   1207 	 * (Except if the only snapshots are from the branch we cloned
   1208 	 * from.)
   1209 	 */
   1210 	if (ds->ds_prev != NULL &&
   1211 	    ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
   1212 		return (EINVAL);
   1213 
   1214 	/*
   1215 	 * This is really a dsl_dir thing, but check it here so that
   1216 	 * we'll be less likely to leave this dataset inconsistent &
   1217 	 * nearly destroyed.
   1218 	 */
   1219 	err = zap_count(mos, ds->ds_dir->dd_phys->dd_child_dir_zapobj, &count);
   1220 	if (err)
   1221 		return (err);
   1222 	if (count != 0)
   1223 		return (EEXIST);
   1224 
   1225 	return (0);
   1226 }
   1227 
   1228 /* ARGSUSED */
   1229 static void
   1230 dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
   1231 {
   1232 	dsl_dataset_t *ds = arg1;
   1233 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
   1234 
   1235 	/* Mark it as inconsistent on-disk, in case we crash */
   1236 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
   1237 	ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
   1238 
   1239 	spa_history_internal_log(LOG_DS_DESTROY_BEGIN, dp->dp_spa, tx,
   1240 	    cr, "dataset = %llu", ds->ds_object);
   1241 }
   1242 
   1243 /* ARGSUSED */
   1244 int
   1245 dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
   1246 {
   1247 	dsl_dataset_t *ds = arg1;
   1248 
   1249 	/* Can't delete a branch point. */
   1250 	if (ds->ds_phys->ds_num_children > 1)
   1251 		return (EEXIST);
   1252 
   1253 	/*
   1254 	 * Can't delete a head dataset if there are snapshots of it.
   1255 	 * (Except if the only snapshots are from the branch we cloned
   1256 	 * from.)
   1257 	 */
   1258 	if (ds->ds_prev != NULL &&
   1259 	    ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
   1260 		return (EINVAL);
   1261 
   1262 	/*
   1263 	 * If we made changes this txg, traverse_dsl_dataset won't find
   1264 	 * them.  Try again.
   1265 	 */
   1266 	if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg)
   1267 		return (EAGAIN);
   1268 
   1269 	/* XXX we should do some i/o error checking... */
   1270 	return (0);
   1271 }
   1272 
   1273 void
   1274 dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
   1275 {
   1276 	dsl_dataset_t *ds = arg1;
   1277 	int64_t used = 0, compressed = 0, uncompressed = 0;
   1278 	zio_t *zio;
   1279 	int err;
   1280 	int after_branch_point = FALSE;
   1281 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
   1282 	objset_t *mos = dp->dp_meta_objset;
   1283 	dsl_dataset_t *ds_prev = NULL;
   1284 	uint64_t obj;
   1285 
   1286 	ASSERT3U(ds->ds_open_refcount, ==, DS_REF_MAX);
   1287 	ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
   1288 	ASSERT(ds->ds_prev == NULL ||
   1289 	    ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object);
   1290 	ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg);
   1291 
   1292 	/* Remove our reservation */
   1293 	if (ds->ds_reserved != 0) {
   1294 		uint64_t val = 0;
   1295 		dsl_dataset_set_reservation_sync(ds, &val, cr, tx);
   1296 		ASSERT3U(ds->ds_reserved, ==, 0);
   1297 	}
   1298 
   1299 	ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
   1300 
   1301 	obj = ds->ds_object;
   1302 
   1303 	if (ds->ds_phys->ds_prev_snap_obj != 0) {
   1304 		if (ds->ds_prev) {
   1305 			ds_prev = ds->ds_prev;
   1306 		} else {
   1307 			VERIFY(0 == dsl_dataset_open_obj(dp,
   1308 			    ds->ds_phys->ds_prev_snap_obj, NULL,
   1309 			    DS_MODE_NONE, FTAG, &ds_prev));
   1310 		}
   1311 		after_branch_point =
   1312 		    (ds_prev->ds_phys->ds_next_snap_obj != obj);
   1313 
   1314 		dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
   1315 		if (after_branch_point &&
   1316 		    ds->ds_phys->ds_next_snap_obj == 0) {
   1317 			/* This clone is toast. */
   1318 			ASSERT(ds_prev->ds_phys->ds_num_children > 1);
   1319 			ds_prev->ds_phys->ds_num_children--;
   1320 		} else if (!after_branch_point) {
   1321 			ds_prev->ds_phys->ds_next_snap_obj =
   1322 			    ds->ds_phys->ds_next_snap_obj;
   1323 		}
   1324 	}
   1325 
   1326 	zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
   1327 
   1328 	if (ds->ds_phys->ds_next_snap_obj != 0) {
   1329 		blkptr_t bp;
   1330 		dsl_dataset_t *ds_next;
   1331 		uint64_t itor = 0;
   1332 		uint64_t old_unique;
   1333 
   1334 		spa_scrub_restart(dp->dp_spa, tx->tx_txg);
   1335 
   1336 		VERIFY(0 == dsl_dataset_open_obj(dp,
   1337 		    ds->ds_phys->ds_next_snap_obj, NULL,
   1338 		    DS_MODE_NONE, FTAG, &ds_next));
   1339 		ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj);
   1340 
   1341 		old_unique = dsl_dataset_unique(ds_next);
   1342 
   1343 		dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
   1344 		ds_next->ds_phys->ds_prev_snap_obj =
   1345 		    ds->ds_phys->ds_prev_snap_obj;
   1346 		ds_next->ds_phys->ds_prev_snap_txg =
   1347 		    ds->ds_phys->ds_prev_snap_txg;
   1348 		ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
   1349 		    ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0);
   1350 
   1351 		/*
   1352 		 * Transfer to our deadlist (which will become next's
   1353 		 * new deadlist) any entries from next's current
   1354 		 * deadlist which were born before prev, and free the
   1355 		 * other entries.
   1356 		 *
   1357 		 * XXX we're doing this long task with the config lock held
   1358 		 */
   1359 		while (bplist_iterate(&ds_next->ds_deadlist, &itor,
   1360 		    &bp) == 0) {
   1361 			if (bp.blk_birth <= ds->ds_phys->ds_prev_snap_txg) {
   1362 				VERIFY(0 == bplist_enqueue(&ds->ds_deadlist,
   1363 				    &bp, tx));
   1364 				if (ds_prev && !after_branch_point &&
   1365 				    bp.blk_birth >
   1366 				    ds_prev->ds_phys->ds_prev_snap_txg) {
   1367 					ds_prev->ds_phys->ds_unique_bytes +=
   1368 					    bp_get_dasize(dp->dp_spa, &bp);
   1369 				}
   1370 			} else {
   1371 				used += bp_get_dasize(dp->dp_spa, &bp);
   1372 				compressed += BP_GET_PSIZE(&bp);
   1373 				uncompressed += BP_GET_UCSIZE(&bp);
   1374 				/* XXX check return value? */
   1375 				(void) arc_free(zio, dp->dp_spa, tx->tx_txg,
   1376 				    &bp, NULL, NULL, ARC_NOWAIT);
   1377 			}
   1378 		}
   1379 
   1380 		/* free next's deadlist */
   1381 		bplist_close(&ds_next->ds_deadlist);
   1382 		bplist_destroy(mos, ds_next->ds_phys->ds_deadlist_obj, tx);
   1383 
   1384 		/* set next's deadlist to our deadlist */
   1385 		ds_next->ds_phys->ds_deadlist_obj =
   1386 		    ds->ds_phys->ds_deadlist_obj;
   1387 		VERIFY(0 == bplist_open(&ds_next->ds_deadlist, mos,
   1388 		    ds_next->ds_phys->ds_deadlist_obj));
   1389 		ds->ds_phys->ds_deadlist_obj = 0;
   1390 
   1391 		if (ds_next->ds_phys->ds_next_snap_obj != 0) {
   1392 			/*
   1393 			 * Update next's unique to include blocks which
   1394 			 * were previously shared by only this snapshot
   1395 			 * and it.  Those blocks will be born after the
   1396 			 * prev snap and before this snap, and will have
   1397 			 * died after the next snap and before the one
   1398 			 * after that (ie. be on the snap after next's
   1399 			 * deadlist).
   1400 			 *
   1401 			 * XXX we're doing this long task with the
   1402 			 * config lock held
   1403 			 */
   1404 			dsl_dataset_t *ds_after_next;
   1405 
   1406 			VERIFY(0 == dsl_dataset_open_obj(dp,
   1407 			    ds_next->ds_phys->ds_next_snap_obj, NULL,
   1408 			    DS_MODE_NONE, FTAG, &ds_after_next));
   1409 			itor = 0;
   1410 			while (bplist_iterate(&ds_after_next->ds_deadlist,
   1411 			    &itor, &bp) == 0) {
   1412 				if (bp.blk_birth >
   1413 				    ds->ds_phys->ds_prev_snap_txg &&
   1414 				    bp.blk_birth <=
   1415 				    ds->ds_phys->ds_creation_txg) {
   1416 					ds_next->ds_phys->ds_unique_bytes +=
   1417 					    bp_get_dasize(dp->dp_spa, &bp);
   1418 				}
   1419 			}
   1420 
   1421 			dsl_dataset_close(ds_after_next, DS_MODE_NONE, FTAG);
   1422 			ASSERT3P(ds_next->ds_prev, ==, NULL);
   1423 		} else {
   1424 			ASSERT3P(ds_next->ds_prev, ==, ds);
   1425 			dsl_dataset_close(ds_next->ds_prev, DS_MODE_NONE,
   1426 			    ds_next);
   1427 			if (ds_prev) {
   1428 				VERIFY(0 == dsl_dataset_open_obj(dp,
   1429 				    ds->ds_phys->ds_prev_snap_obj, NULL,
   1430 				    DS_MODE_NONE, ds_next, &ds_next->ds_prev));
   1431 			} else {
   1432 				ds_next->ds_prev = NULL;
   1433 			}
   1434 
   1435 			dsl_dataset_recalc_head_uniq(ds_next);
   1436 
   1437 			/*
   1438 			 * Reduce the amount of our unconsmed refreservation
   1439 			 * being charged to our parent by the amount of
   1440 			 * new unique data we have gained.
   1441 			 */
   1442 			if (old_unique < ds_next->ds_reserved) {
   1443 				int64_t mrsdelta;
   1444 				uint64_t new_unique =
   1445 				    ds_next->ds_phys->ds_unique_bytes;
   1446 
   1447 				ASSERT(old_unique <= new_unique);
   1448 				mrsdelta = MIN(new_unique - old_unique,
   1449 				    ds_next->ds_reserved - old_unique);
   1450 				dsl_dir_diduse_space(ds->ds_dir, -mrsdelta,
   1451 				    0, 0, tx);
   1452 			}
   1453 		}
   1454 		dsl_dataset_close(ds_next, DS_MODE_NONE, FTAG);
   1455 
   1456 		/*
   1457 		 * NB: unique_bytes might not be accurate for the head objset.
   1458 		 * Before SPA_VERSION 9, we didn't update its value when we
   1459 		 * deleted the most recent snapshot.
   1460 		 */
   1461 		ASSERT3U(used, ==, ds->ds_phys->ds_unique_bytes);
   1462 	} else {
   1463 		/*
   1464 		 * There's no next snapshot, so this is a head dataset.
   1465 		 * Destroy the deadlist.  Unless it's a clone, the
   1466 		 * deadlist should be empty.  (If it's a clone, it's
   1467 		 * safe to ignore the deadlist contents.)
   1468 		 */
   1469 		struct killarg ka;
   1470 
   1471 		ASSERT(after_branch_point || bplist_empty(&ds->ds_deadlist));
   1472 		bplist_close(&ds->ds_deadlist);
   1473 		bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx);
   1474 		ds->ds_phys->ds_deadlist_obj = 0;
   1475 
   1476 		/*
   1477 		 * Free everything that we point to (that's born after
   1478 		 * the previous snapshot, if we are a clone)
   1479 		 *
   1480 		 * XXX we're doing this long task with the config lock held
   1481 		 */
   1482 		ka.usedp = &used;
   1483 		ka.compressedp = &compressed;
   1484 		ka.uncompressedp = &uncompressed;
   1485 		ka.zio = zio;
   1486 		ka.tx = tx;
   1487 		err = traverse_dsl_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
   1488 		    ADVANCE_POST, kill_blkptr, &ka);
   1489 		ASSERT3U(err, ==, 0);
   1490 		ASSERT(spa_version(dp->dp_spa) <
   1491 		    SPA_VERSION_UNIQUE_ACCURATE ||
   1492 		    used == ds->ds_phys->ds_unique_bytes);
   1493 	}
   1494 
   1495 	err = zio_wait(zio);
   1496 	ASSERT3U(err, ==, 0);
   1497 
   1498 	dsl_dir_diduse_space(ds->ds_dir, -used, -compressed, -uncompressed, tx);
   1499 
   1500 	if (ds->ds_phys->ds_snapnames_zapobj) {
   1501 		err = zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx);
   1502 		ASSERT(err == 0);
   1503 	}
   1504 
   1505 	if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) {
   1506 		/* Erase the link in the dataset */
   1507 		dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
   1508 		ds->ds_dir->dd_phys->dd_head_dataset_obj = 0;
   1509 		/*
   1510 		 * dsl_dir_sync_destroy() called us, they'll destroy
   1511 		 * the dataset.
   1512 		 */
   1513 	} else {
   1514 		/* remove from snapshot namespace */
   1515 		dsl_dataset_t *ds_head;
   1516 		VERIFY(0 == dsl_dataset_open_obj(dp,
   1517 		    ds->ds_dir->dd_phys->dd_head_dataset_obj, NULL,
   1518 		    DS_MODE_NONE, FTAG, &ds_head));
   1519 		VERIFY(0 == dsl_dataset_get_snapname(ds));
   1520 #ifdef ZFS_DEBUG
   1521 		{
   1522 			uint64_t val;
   1523 			err = zap_lookup(mos,
   1524 			    ds_head->ds_phys->ds_snapnames_zapobj,
   1525 			    ds->ds_snapname, 8, 1, &val);
   1526 			ASSERT3U(err, ==, 0);
   1527 			ASSERT3U(val, ==, obj);
   1528 		}
   1529 #endif
   1530 		err = zap_remove(mos, ds_head->ds_phys->ds_snapnames_zapobj,
   1531 		    ds->ds_snapname, tx);
   1532 		ASSERT(err == 0);
   1533 		dsl_dataset_close(ds_head, DS_MODE_NONE, FTAG);
   1534 	}
   1535 
   1536 	if (ds_prev && ds->ds_prev != ds_prev)
   1537 		dsl_dataset_close(ds_prev, DS_MODE_NONE, FTAG);
   1538 
   1539 	spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
   1540 	spa_history_internal_log(LOG_DS_DESTROY, dp->dp_spa, tx,
   1541 	    cr, "dataset = %llu", ds->ds_object);
   1542 
   1543 	dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, tag);
   1544 	VERIFY(0 == dmu_object_free(mos, obj, tx));
   1545 
   1546 }
   1547 
   1548 static int
   1549 dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx)
   1550 {
   1551 	uint64_t asize;
   1552 
   1553 	if (!dmu_tx_is_syncing(tx))
   1554 		return (0);
   1555 
   1556 	/*
   1557 	 * If there's an fs-only reservation, any blocks that might become
   1558 	 * owned by the snapshot dataset must be accommodated by space
   1559 	 * outside of the reservation.
   1560 	 */
   1561 	asize = MIN(dsl_dataset_unique(ds), ds->ds_reserved);
   1562 	if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, FALSE))
   1563 		return (ENOSPC);
   1564 
   1565 	/*
   1566 	 * Propogate any reserved space for this snapshot to other
   1567 	 * snapshot checks in this sync group.
   1568 	 */
   1569 	if (asize > 0)
   1570 		dsl_dir_willuse_space(ds->ds_dir, asize, tx);
   1571 
   1572 	return (0);
   1573 }
   1574 
   1575 /* ARGSUSED */
   1576 int
   1577 dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx)
   1578 {
   1579 	dsl_dataset_t *ds = arg1;
   1580 	const char *snapname = arg2;
   1581 	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
   1582 	int err;
   1583 	uint64_t value;
   1584 
   1585 	/*
   1586 	 * We don't allow multiple snapshots of the same txg.  If there
   1587 	 * is already one, try again.
   1588 	 */
   1589 	if (ds->ds_phys->ds_prev_snap_txg >= tx->tx_txg)
   1590 		return (EAGAIN);
   1591 
   1592 	/*
   1593 	 * Check for conflicting name snapshot name.
   1594 	 */
   1595 	err = zap_lookup(mos, ds->ds_phys->ds_snapnames_zapobj,
   1596 	    snapname, 8, 1, &value);
   1597 	if (err == 0)
   1598 		return (EEXIST);
   1599 	if (err != ENOENT)
   1600 		return (err);
   1601 
   1602 	/*
   1603 	 * Check that the dataset's name is not too long.  Name consists
   1604 	 * of the dataset's length + 1 for the @-sign + snapshot name's length
   1605 	 */
   1606 	if (dsl_dataset_namelen(ds) + 1 + strlen(snapname) >= MAXNAMELEN)
   1607 		return (ENAMETOOLONG);
   1608 
   1609 	err = dsl_dataset_snapshot_reserve_space(ds, tx);
   1610 	if (err)
   1611 		return (err);
   1612 
   1613 	ds->ds_trysnap_txg = tx->tx_txg;
   1614 	return (0);
   1615 }
   1616 
   1617 void
   1618 dsl_dataset_snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
   1619 {
   1620 	dsl_dataset_t *ds = arg1;
   1621 	const char *snapname = arg2;
   1622 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
   1623 	dmu_buf_t *dbuf;
   1624 	dsl_dataset_phys_t *dsphys;
   1625 	uint64_t dsobj;
   1626 	objset_t *mos = dp->dp_meta_objset;
   1627 	int err;
   1628 
   1629 	spa_scrub_restart(dp->dp_spa, tx->tx_txg);
   1630 	ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
   1631 
   1632 	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
   1633 	    DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
   1634 	VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
   1635 	dmu_buf_will_dirty(dbuf, tx);
   1636 	dsphys = dbuf->db_data;
   1637 	dsphys->ds_dir_obj = ds->ds_dir->dd_object;
   1638 	dsphys->ds_fsid_guid = unique_create();
   1639 	(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
   1640 	    sizeof (dsphys->ds_guid));
   1641 	dsphys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj;
   1642 	dsphys->ds_prev_snap_txg = ds->ds_phys->ds_prev_snap_txg;
   1643 	dsphys->ds_next_snap_obj = ds->ds_object;
   1644 	dsphys->ds_num_children = 1;
   1645 	dsphys->ds_creation_time = gethrestime_sec();
   1646 	dsphys->ds_creation_txg = tx->tx_txg;
   1647 	dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj;
   1648 	dsphys->ds_used_bytes = ds->ds_phys->ds_used_bytes;
   1649 	dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes;
   1650 	dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes;
   1651 	dsphys->ds_flags = ds->ds_phys->ds_flags;
   1652 	dsphys->ds_bp = ds->ds_phys->ds_bp;
   1653 	dmu_buf_rele(dbuf, FTAG);
   1654 
   1655 	ASSERT3U(ds->ds_prev != 0, ==, ds->ds_phys->ds_prev_snap_obj != 0);
   1656 	if (ds->ds_prev) {
   1657 		ASSERT(ds->ds_prev->ds_phys->ds_next_snap_obj ==
   1658 		    ds->ds_object ||
   1659 		    ds->ds_prev->ds_phys->ds_num_children > 1);
   1660 		if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) {
   1661 			dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
   1662 			ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
   1663 			    ds->ds_prev->ds_phys->ds_creation_txg);
   1664 			ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj;
   1665 		}
   1666 	}
   1667 
   1668 	/*
   1669 	 * If we have a reference-reservation on this dataset, we will
   1670 	 * need to increase the amount of refreservation being charged
   1671 	 * since our unique space is going to zero.
   1672 	 */
   1673 	if (ds->ds_reserved) {
   1674 		int64_t add = MIN(dsl_dataset_unique(ds), ds->ds_reserved);
   1675 		dsl_dir_diduse_space(ds->ds_dir, add, 0, 0, tx);
   1676 	}
   1677 
   1678 	bplist_close(&ds->ds_deadlist);
   1679 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
   1680 	ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, tx->tx_txg);
   1681 	ds->ds_phys->ds_prev_snap_obj = dsobj;
   1682 	ds->ds_phys->ds_prev_snap_txg = tx->tx_txg;
   1683 	ds->ds_phys->ds_unique_bytes = 0;
   1684 	if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
   1685 		ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
   1686 	ds->ds_phys->ds_deadlist_obj =
   1687 	    bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
   1688 	VERIFY(0 == bplist_open(&ds->ds_deadlist, mos,
   1689 	    ds->ds_phys->ds_deadlist_obj));
   1690 
   1691 	dprintf("snap '%s' -> obj %llu\n", snapname, dsobj);
   1692 	err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj,
   1693 	    snapname, 8, 1, &dsobj, tx);
   1694 	ASSERT(err == 0);
   1695 
   1696 	if (ds->ds_prev)
   1697 		dsl_dataset_close(ds->ds_prev, DS_MODE_NONE, ds);
   1698 	VERIFY(0 == dsl_dataset_open_obj(dp,
   1699 	    ds->ds_phys->ds_prev_snap_obj, snapname,
   1700 	    DS_MODE_NONE, ds, &ds->ds_prev));
   1701 
   1702 	spa_history_internal_log(LOG_DS_SNAPSHOT, dp->dp_spa, tx, cr,
   1703 	    "dataset = %llu", dsobj);
   1704 }
   1705 
   1706 void
   1707 dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
   1708 {
   1709 	ASSERT(dmu_tx_is_syncing(tx));
   1710 	ASSERT(ds->ds_user_ptr != NULL);
   1711 	ASSERT(ds->ds_phys->ds_next_snap_obj == 0);
   1712 
   1713 	/*
   1714 	 * in case we had to change ds_fsid_guid when we opened it,
   1715 	 * sync it out now.
   1716 	 */
   1717 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
   1718 	ds->ds_phys->ds_fsid_guid = ds->ds_fsid_guid;
   1719 
   1720 	dsl_dir_dirty(ds->ds_dir, tx);
   1721 	dmu_objset_sync(ds->ds_user_ptr, zio, tx);
   1722 }
   1723 
   1724 void
   1725 dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
   1726 {
   1727 	uint64_t refd, avail, uobjs, aobjs;
   1728 
   1729 	dsl_dir_stats(ds->ds_dir, nv);
   1730 
   1731 	dsl_dataset_space(ds, &refd, &avail, &uobjs, &aobjs);
   1732 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, avail);
   1733 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, refd);
   1734 
   1735 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION,
   1736 	    ds->ds_phys->ds_creation_time);
   1737 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG,
   1738 	    ds->ds_phys->ds_creation_txg);
   1739 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA,
   1740 	    ds->ds_quota);
   1741 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION,
   1742 	    ds->ds_reserved);
   1743 
   1744 	if (ds->ds_phys->ds_next_snap_obj) {
   1745 		/*
   1746 		 * This is a snapshot; override the dd's space used with
   1747 		 * our unique space and compression ratio.
   1748 		 */
   1749 		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
   1750 		    ds->ds_phys->ds_unique_bytes);
   1751 		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO,
   1752 		    ds->ds_phys->ds_compressed_bytes == 0 ? 100 :
   1753 		    (ds->ds_phys->ds_uncompressed_bytes * 100 /
   1754 		    ds->ds_phys->ds_compressed_bytes));
   1755 	}
   1756 }
   1757 
   1758 void
   1759 dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat)
   1760 {
   1761 	stat->dds_creation_txg = ds->ds_phys->ds_creation_txg;
   1762 	stat->dds_inconsistent = ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT;
   1763 	stat->dds_guid = ds->ds_phys->ds_guid;
   1764 	if (ds->ds_phys->ds_next_snap_obj) {
   1765 		stat->dds_is_snapshot = B_TRUE;
   1766 		stat->dds_num_clones = ds->ds_phys->ds_num_children - 1;
   1767 	}
   1768 
   1769 	/* clone origin is really a dsl_dir thing... */
   1770 	rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER);
   1771 	if (ds->ds_dir->dd_phys->dd_origin_obj) {
   1772 		dsl_dataset_t *ods;
   1773 
   1774 		VERIFY(0 == dsl_dataset_open_obj(ds->ds_dir->dd_pool,
   1775 		    ds->ds_dir->dd_phys->dd_origin_obj,
   1776 		    NULL, DS_MODE_NONE, FTAG, &ods));
   1777 		dsl_dataset_name(ods, stat->dds_origin);
   1778 		dsl_dataset_close(ods, DS_MODE_NONE, FTAG);
   1779 	}
   1780 	rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
   1781 }
   1782 
   1783 uint64_t
   1784 dsl_dataset_fsid_guid(dsl_dataset_t *ds)
   1785 {
   1786 	return (ds->ds_fsid_guid);
   1787 }
   1788 
   1789 void
   1790 dsl_dataset_space(dsl_dataset_t *ds,
   1791     uint64_t *refdbytesp, uint64_t *availbytesp,
   1792     uint64_t *usedobjsp, uint64_t *availobjsp)
   1793 {
   1794 	*refdbytesp = ds->ds_phys->ds_used_bytes;
   1795 	*availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE);
   1796 	if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes)
   1797 		*availbytesp += ds->ds_reserved - ds->ds_phys->ds_unique_bytes;
   1798 	if (ds->ds_quota != 0) {
   1799 		/*
   1800 		 * Adjust available bytes according to refquota
   1801 		 */
   1802 		if (*refdbytesp < ds->ds_quota)
   1803 			*availbytesp = MIN(*availbytesp,
   1804 			    ds->ds_quota - *refdbytesp);
   1805 		else
   1806 			*availbytesp = 0;
   1807 	}
   1808 	*usedobjsp = ds->ds_phys->ds_bp.blk_fill;
   1809 	*availobjsp = DN_MAX_OBJECT - *usedobjsp;
   1810 }
   1811 
   1812 boolean_t
   1813 dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds)
   1814 {
   1815 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
   1816 
   1817 	ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
   1818 	    dsl_pool_sync_context(dp));
   1819 	if (ds->ds_prev == NULL)
   1820 		return (B_FALSE);
   1821 	if (ds->ds_phys->ds_bp.blk_birth >
   1822 	    ds->ds_prev->ds_phys->ds_creation_txg)
   1823 		return (B_TRUE);
   1824 	return (B_FALSE);
   1825 }
   1826 
   1827 /* ARGSUSED */
   1828 static int
   1829 dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
   1830 {
   1831 	dsl_dataset_t *ds = arg1;
   1832 	char *newsnapname = arg2;
   1833 	dsl_dir_t *dd = ds->ds_dir;
   1834 	objset_t *mos = dd->dd_pool->dp_meta_objset;
   1835 	dsl_dataset_t *hds;
   1836 	uint64_t val;
   1837 	int err;
   1838 
   1839 	err = dsl_dataset_open_obj(dd->dd_pool,
   1840 	    dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG, &hds);
   1841 	if (err)
   1842 		return (err);
   1843 
   1844 	/* new name better not be in use */
   1845 	err = zap_lookup(mos, hds->ds_phys->ds_snapnames_zapobj,
   1846 	    newsnapname, 8, 1, &val);
   1847 	dsl_dataset_close(hds, DS_MODE_NONE, FTAG);
   1848 
   1849 	if (err == 0)
   1850 		err = EEXIST;
   1851 	else if (err == ENOENT)
   1852 		err = 0;
   1853 
   1854 	/* dataset name + 1 for the "@" + the new snapshot name must fit */
   1855 	if (dsl_dir_namelen(ds->ds_dir) + 1 + strlen(newsnapname) >= MAXNAMELEN)
   1856 		err = ENAMETOOLONG;
   1857 
   1858 	return (err);
   1859 }
   1860 
   1861 static void
   1862 dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2,
   1863     cred_t *cr, dmu_tx_t *tx)
   1864 {
   1865 	dsl_dataset_t *ds = arg1;
   1866 	const char *newsnapname = arg2;
   1867 	dsl_dir_t *dd = ds->ds_dir;
   1868 	objset_t *mos = dd->dd_pool->dp_meta_objset;
   1869 	dsl_dataset_t *hds;
   1870 	int err;
   1871 
   1872 	ASSERT(ds->ds_phys->ds_next_snap_obj != 0);
   1873 
   1874 	VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool,
   1875 	    dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG, &hds));
   1876 
   1877 	VERIFY(0 == dsl_dataset_get_snapname(ds));
   1878 	err = zap_remove(mos, hds->ds_phys->ds_snapnames_zapobj,
   1879 	    ds->ds_snapname, tx);
   1880 	ASSERT3U(err, ==, 0);
   1881 	mutex_enter(&ds->ds_lock);
   1882 	(void) strcpy(ds->ds_snapname, newsnapname);
   1883 	mutex_exit(&ds->ds_lock);
   1884 	err = zap_add(mos, hds->ds_phys->ds_snapnames_zapobj,
   1885 	    ds->ds_snapname, 8, 1, &ds->ds_object, tx);
   1886 	ASSERT3U(err, ==, 0);
   1887 
   1888 	spa_history_internal_log(LOG_DS_RENAME, dd->dd_pool->dp_spa, tx,
   1889 	    cr, "dataset = %llu", ds->ds_object);
   1890 	dsl_dataset_close(hds, DS_MODE_NONE, FTAG);
   1891 }
   1892 
/*
 * State shared between dsl_recursive_rename() and its per-filesystem
 * callback dsl_snapshot_rename_one().
 */
struct renamesnaparg {
	/* group that collects one rename sync task per snapshot found */
	dsl_sync_task_group_t *dstg;
	/* name of a dataset that failed; empty string if none has */
	char failed[MAXPATHLEN];
	/* snapshot name (text after the '@') being renamed from */
	char *oldsnap;
	/* snapshot name (text after the '@') being renamed to */
	char *newsnap;
};
   1899 
   1900 static int
   1901 dsl_snapshot_rename_one(char *name, void *arg)
   1902 {
   1903 	struct renamesnaparg *ra = arg;
   1904 	dsl_dataset_t *ds = NULL;
   1905 	char *cp;
   1906 	int err;
   1907 
   1908 	cp = name + strlen(name);
   1909 	*cp = '@';
   1910 	(void) strcpy(cp + 1, ra->oldsnap);
   1911 
   1912 	/*
   1913 	 * For recursive snapshot renames the parent won't be changing
   1914 	 * so we just pass name for both the to/from argument.
   1915 	 */
   1916 	if (err = zfs_secpolicy_rename_perms(name, name, CRED())) {
   1917 		(void) strcpy(ra->failed, name);
   1918 		return (err);
   1919 	}
   1920 
   1921 	err = dsl_dataset_open(name, DS_MODE_READONLY | DS_MODE_STANDARD,
   1922 	    ra->dstg, &ds);
   1923 	if (err == ENOENT) {
   1924 		*cp = '\0';
   1925 		return (0);
   1926 	}
   1927 	if (err) {
   1928 		(void) strcpy(ra->failed, name);
   1929 		*cp = '\0';
   1930 		dsl_dataset_close(ds, DS_MODE_STANDARD, ra->dstg);
   1931 		return (err);
   1932 	}
   1933 
   1934 #ifdef _KERNEL
   1935 	/* for all filesystems undergoing rename, we'll need to unmount it */
   1936 	(void) zfs_unmount_snap(name, NULL);
   1937 #endif
   1938 
   1939 	*cp = '\0';
   1940 
   1941 	dsl_sync_task_create(ra->dstg, dsl_dataset_snapshot_rename_check,
   1942 	    dsl_dataset_snapshot_rename_sync, ds, ra->newsnap, 0);
   1943 
   1944 	return (0);
   1945 }
   1946 
   1947 static int
   1948 dsl_recursive_rename(char *oldname, const char *newname)
   1949 {
   1950 	int err;
   1951 	struct renamesnaparg *ra;
   1952 	dsl_sync_task_t *dst;
   1953 	spa_t *spa;
   1954 	char *cp, *fsname = spa_strdup(oldname);
   1955 	int len = strlen(oldname);
   1956 
   1957 	/* truncate the snapshot name to get the fsname */
   1958 	cp = strchr(fsname, '@');
   1959 	*cp = '\0';
   1960 
   1961 	err = spa_open(fsname, &spa, FTAG);
   1962 	if (err) {
   1963 		kmem_free(fsname, len + 1);
   1964 		return (err);
   1965 	}
   1966 	ra = kmem_alloc(sizeof (struct renamesnaparg), KM_SLEEP);
   1967 	ra->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
   1968 
   1969 	ra->oldsnap = strchr(oldname, '@') + 1;
   1970 	ra->newsnap = strchr(newname, '@') + 1;
   1971 	*ra->failed = '\0';
   1972 
   1973 	err = dmu_objset_find(fsname, dsl_snapshot_rename_one, ra,
   1974 	    DS_FIND_CHILDREN);
   1975 	kmem_free(fsname, len + 1);
   1976 
   1977 	if (err == 0) {
   1978 		err = dsl_sync_task_group_wait(ra->dstg);
   1979 	}
   1980 
   1981 	for (dst = list_head(&ra->dstg->dstg_tasks); dst;
   1982 	    dst = list_next(&ra->dstg->dstg_tasks, dst)) {
   1983 		dsl_dataset_t *ds = dst->dst_arg1;
   1984 		if (dst->dst_err) {
   1985 			dsl_dir_name(ds->ds_dir, ra->failed);
   1986 			(void) strcat(ra->failed, "@");
   1987 			(void) strcat(ra->failed, ra->newsnap);
   1988 		}
   1989 		dsl_dataset_close(ds, DS_MODE_STANDARD, ra->dstg);
   1990 	}
   1991 
   1992 	if (err)
   1993 		(void) strcpy(oldname, ra->failed);
   1994 
   1995 	dsl_sync_task_group_destroy(ra->dstg);
   1996 	kmem_free(ra, sizeof (struct renamesnaparg));
   1997 	spa_close(spa, FTAG);
   1998 	return (err);
   1999 }
   2000 
   2001 static int
   2002 dsl_valid_rename(char *oldname, void *arg)
   2003 {
   2004 	int delta = *(int *)arg;
   2005 
   2006 	if (strlen(oldname) + delta >= MAXNAMELEN)
   2007 		return (ENAMETOOLONG);
   2008 
   2009 	return (0);
   2010 }
   2011 
   2012 #pragma weak dmu_objset_rename = dsl_dataset_rename
   2013 int
   2014 dsl_dataset_rename(char *oldname, const char *newname,
   2015     boolean_t recursive)
   2016 {
   2017 	dsl_dir_t *dd;
   2018 	dsl_dataset_t *ds;
   2019 	const char *tail;
   2020 	int err;
   2021 
   2022 	err = dsl_dir_open(oldname, FTAG, &dd, &tail);
   2023 	if (err)
   2024 		return (err);
   2025 	if (tail == NULL) {
   2026 		int delta = strlen(newname) - strlen(oldname);
   2027 
   2028 		/* if we're growing, validate child size lengths */
   2029 		if (delta > 0)
   2030 			err = dmu_objset_find(oldname, dsl_valid_rename,
   2031 			    &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
   2032 
   2033 		if (!err)
   2034 			err = dsl_dir_rename(dd, newname);
   2035 		dsl_dir_close(dd, FTAG);
   2036 		return (err);
   2037 	}
   2038 	if (tail[0] != '@') {
   2039 		/* the name ended in a nonexistant component */
   2040 		dsl_dir_close(dd, FTAG);
   2041 		return (ENOENT);
   2042 	}
   2043 
   2044 	dsl_dir_close(dd, FTAG);
   2045 
   2046 	/* new name must be snapshot in same filesystem */
   2047 	tail = strchr(newname, '@');
   2048 	if (tail == NULL)
   2049 		return (EINVAL);
   2050 	tail++;
   2051 	if (strncmp(oldname, newname, tail - newname) != 0)
   2052 		return (EXDEV);
   2053 
   2054 	if (recursive) {
   2055 		err = dsl_recursive_rename(oldname, newname);
   2056 	} else {
   2057 		err = dsl_dataset_open(oldname,
   2058 		    DS_MODE_READONLY | DS_MODE_STANDARD, FTAG, &ds);
   2059 		if (err)
   2060 			return (err);
   2061 
   2062 		err = dsl_sync_task_do(ds->ds_dir->dd_pool,
   2063 		    dsl_dataset_snapshot_rename_check,
   2064 		    dsl_dataset_snapshot_rename_sync, ds, (char *)tail, 1);
   2065 
   2066 		dsl_dataset_close(ds, DS_MODE_STANDARD, FTAG);
   2067 	}
   2068 
   2069 	return (err);
   2070 }
   2071 
   2072 struct promotearg {
   2073 	uint64_t used, comp, uncomp, unique;
   2074 	uint64_t newnext_obj, snapnames_obj;
   2075 };
   2076 
/*
 * Sync-task check callback for promoting a clone.
 *
 * arg1 is the head dataset of the clone being promoted; arg2 is a
 * struct promotearg that is zeroed and then filled in for
 * dsl_dataset_promote_sync().
 *
 * Returns EINVAL if the dataset's dsl_dir has no origin, and 0 without
 * doing any further work when not called in syncing context.  In
 * syncing context it returns EXDEV if the head carries
 * DS_FLAG_NOPROMOTE, EEXIST if a snapshot that would be moved has the
 * same name as an existing snapshot of the clone, any error from the
 * opens / deadlist walks, or the result of dsl_dir_transfer_possible().
 *
 * Every dataset opened here is closed again before returning; the
 * 'out' label is the common cleanup path.
 */
/* ARGSUSED */
static int
dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *hds = arg1;
	struct promotearg *pa = arg2;
	dsl_dir_t *dd = hds->ds_dir;
	dsl_pool_t *dp = hds->ds_dir->dd_pool;
	dsl_dir_t *odd = NULL;
	dsl_dataset_t *ds = NULL;
	dsl_dataset_t *origin_ds = NULL;
	dsl_dataset_t *newnext_ds = NULL;
	int err;
	char *name = NULL;
	uint64_t itor = 0;
	blkptr_t bp;

	bzero(pa, sizeof (*pa));

	/* Check that it is a clone */
	if (dd->dd_phys->dd_origin_obj == 0)
		return (EINVAL);

	/* Since this is so expensive, don't do the preliminary check */
	if (!dmu_tx_is_syncing(tx))
		return (0);

	if (err = dsl_dataset_open_obj(dp, dd->dd_phys->dd_origin_obj,
	    NULL, DS_MODE_EXCLUSIVE, FTAG, &origin_ds))
		goto out;
	odd = origin_ds->ds_dir;

	/* remember the snapnames ZAP of the origin dir's head dataset */
	{
		dsl_dataset_t *phds;
		if (err = dsl_dataset_open_obj(dd->dd_pool,
		    odd->dd_phys->dd_head_dataset_obj,
		    NULL, DS_MODE_NONE, FTAG, &phds))
			goto out;
		pa->snapnames_obj = phds->ds_phys->ds_snapnames_zapobj;
		dsl_dataset_close(phds, DS_MODE_NONE, FTAG);
	}

	if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE) {
		err = EXDEV;
		goto out;
	}

	/*
	 * find origin's new next ds: start at the clone's head and follow
	 * ds_prev_snap_obj links until reaching the dataset whose previous
	 * snapshot is the origin.
	 */
	VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, hds->ds_object,
	    NULL, DS_MODE_NONE, FTAG, &newnext_ds));
	while (newnext_ds->ds_phys->ds_prev_snap_obj != origin_ds->ds_object) {
		dsl_dataset_t *prev;

		if (err = dsl_dataset_open_obj(dd->dd_pool,
		    newnext_ds->ds_phys->ds_prev_snap_obj,
		    NULL, DS_MODE_NONE, FTAG, &prev))
			goto out;
		dsl_dataset_close(newnext_ds, DS_MODE_NONE, FTAG);
		newnext_ds = prev;
	}
	pa->newnext_obj = newnext_ds->ds_object;

	/*
	 * compute origin's new unique space: sum the blocks on the new
	 * next dataset's deadlist that were born after the origin's
	 * previous snapshot txg.
	 */
	while ((err = bplist_iterate(&newnext_ds->ds_deadlist,
	    &itor, &bp)) == 0) {
		if (bp.blk_birth > origin_ds->ds_phys->ds_prev_snap_txg)
			pa->unique += bp_get_dasize(dd->dd_pool->dp_spa, &bp);
	}
	/* the walk is expected to terminate with ENOENT */
	if (err != ENOENT)
		goto out;

	/* Walk the snapshots that we are moving */
	name = kmem_alloc(MAXPATHLEN, KM_SLEEP);
	ds = origin_ds;
	/* CONSTCOND */
	while (TRUE) {
		uint64_t val, dlused, dlcomp, dluncomp;
		dsl_dataset_t *prev;

		/*
		 * Check that the snapshot name does not conflict.
		 * NOTE(review): 'name' is not read afterwards; the call
		 * presumably exists to populate ds->ds_snapname -- confirm
		 * against dsl_dataset_name().
		 */
		dsl_dataset_name(ds, name);
		err = zap_lookup(dd->dd_pool->dp_meta_objset,
		    hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname,
		    8, 1, &val);
		if (err != ENOENT) {
			if (err == 0)
				err = EEXIST;
			goto out;
		}

		/*
		 * compute space to transfer.  Each snapshot gave birth to:
		 * (my used) - (prev's used) + (deadlist's used)
		 */
		pa->used += ds->ds_phys->ds_used_bytes;
		pa->comp += ds->ds_phys->ds_compressed_bytes;
		pa->uncomp += ds->ds_phys->ds_uncompressed_bytes;

		/* If we reach the first snapshot, we're done. */
		if (ds->ds_phys->ds_prev_snap_obj == 0)
			break;

		if (err = bplist_space(&ds->ds_deadlist,
		    &dlused, &dlcomp, &dluncomp))
			goto out;
		if (err = dsl_dataset_open_obj(dd->dd_pool,
		    ds->ds_phys->ds_prev_snap_obj, NULL, DS_MODE_EXCLUSIVE,
		    FTAG, &prev))
			goto out;
		pa->used += dlused - prev->ds_phys->ds_used_bytes;
		pa->comp += dlcomp - prev->ds_phys->ds_compressed_bytes;
		pa->uncomp += dluncomp - prev->ds_phys->ds_uncompressed_bytes;

		/*
		 * We could be a clone of a clone.  If we reach our
		 * parent's branch point, we're done.
		 */
		if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) {
			dsl_dataset_close(prev, DS_MODE_EXCLUSIVE, FTAG);
			break;
		}
		/* origin_ds is closed at 'out'; close only intermediates */
		if (ds != origin_ds)
			dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
		ds = prev;
	}

	/* Check that there is enough space here */
	err = dsl_dir_transfer_possible(odd, dd, pa->used);

out:
	/* release whatever is still open; pointers left NULL were never opened */
	if (ds && ds != origin_ds)
		dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
	if (origin_ds)
		dsl_dataset_close(origin_ds, DS_MODE_EXCLUSIVE, FTAG);
	if (newnext_ds)
		dsl_dataset_close(newnext_ds, DS_MODE_NONE, FTAG);
	if (name)
		kmem_free(name, MAXPATHLEN);
	return (err);
}
   2217 
   2218 static void
   2219 dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
   2220 {
   2221 	dsl_dataset_t *hds = arg1;
   2222 	struct promotearg *pa = arg2;
   2223 	dsl_dir_t *dd = hds->ds_dir;
   2224 	dsl_pool_t *dp = hds->ds_dir->dd_pool;
   2225 	dsl_dir_t *odd = NULL;
   2226 	dsl_dataset_t *ds, *origin_ds;
   2227 	char *name;
   2228 
   2229 	ASSERT(dd->dd_phys->dd_origin_obj != 0);
   2230 	ASSERT(0 == (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE));
   2231 
   2232 	VERIFY(0 == dsl_dataset_open_obj(dp, dd->dd_phys->dd_origin_obj,
   2233 	    NULL, DS_MODE_EXCLUSIVE, FTAG, &origin_ds));
   2234 	/*
   2235 	 * We need to explicitly open odd, since origin_ds's dd will be
   2236 	 * changing.
   2237 	 */
   2238 	VERIFY(0 == dsl_dir_open_obj(dp, origin_ds->ds_dir->dd_object,
   2239 	    NULL, FTAG, &odd));
   2240 
   2241 	/* move snapshots to this dir */
   2242 	name = kmem_alloc(MAXPATHLEN, KM_SLEEP);
   2243 	ds = origin_ds;
   2244 	/* CONSTCOND */
   2245 	while (TRUE) {
   2246 		dsl_dataset_t *prev;
   2247 
   2248 		/* move snap name entry */
   2249 		dsl_dataset_name(ds, name);
   2250 		VERIFY(0 == zap_remove(dp->dp_meta_objset,
   2251 		    pa->snapnames_obj, ds->ds_snapname, tx));
   2252 		VERIFY(0 == zap_add(dp->dp_meta_objset,
   2253 		    hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname,
   2254 		    8, 1, &ds->ds_object, tx));
   2255 
   2256 		/* change containing dsl_dir */
   2257 		dmu_buf_will_dirty(ds->ds_dbuf, tx);
   2258 		ASSERT3U(ds->ds_phys->ds_dir_obj, ==, odd->dd_object);
   2259 		ds->ds_phys->ds_dir_obj = dd->dd_object;
   2260 		ASSERT3P(ds->ds_dir, ==, odd);
   2261 		dsl_dir_close(ds->ds_dir, ds);
   2262 		VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object,
   2263 		    NULL, ds, &ds->ds_dir));
   2264 
   2265 		ASSERT3U(dsl_prop_numcb(ds), ==, 0);
   2266 
   2267 		if (ds->ds_phys->ds_prev_snap_obj == 0)
   2268 			break;
   2269 
   2270 		VERIFY(0 == dsl_dataset_open_obj(dp,
   2271 		    ds->ds_phys->ds_prev_snap_obj, NULL, DS_MODE_EXCLUSIVE,
   2272 		    FTAG, &prev));
   2273 
   2274 		if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) {
   2275 			dsl_dataset_close(prev, DS_MODE_EXCLUSIVE, FTAG);
   2276 			break;
   2277 		}
   2278 		if (ds != origin_ds)
   2279 			dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
   2280 		ds = prev;
   2281 	}
   2282 	if (ds != origin_ds)
   2283 		dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
   2284 
   2285 	/* change origin's next snap */
   2286 	dmu_buf_will_dirty(origin_ds->ds_dbuf, tx);
   2287 	origin_ds->ds_phys->ds_next_snap_obj = pa->newnext_obj;
   2288 
   2289 	/* change origin */
   2290 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
   2291 	ASSERT3U(dd->dd_phys->dd_origin_obj, ==, origin_ds->ds_object);
   2292 	dd->dd_phys->dd_origin_obj = odd->dd_phys->dd_origin_obj;
   2293 	dmu_buf_will_dirty(odd->dd_dbuf, tx);
   2294 	odd->dd_phys->dd_origin_obj = origin_ds->ds_object;
   2295 
   2296 	/* change space accounting */
   2297 	dsl_dir_diduse_space(odd, -pa->used, -pa->comp, -pa->uncomp, tx);
   2298 	dsl_dir_diduse_space(dd, pa->used, pa->comp, pa->uncomp, tx);
   2299 	origin_ds->ds_phys->ds_unique_bytes = pa->unique;
   2300 
   2301 	/* log history record */
   2302 	spa_history_internal_log(LOG_DS_PROMOTE, dd->dd_pool->dp_spa, tx,
   2303 	    cr, "dataset = %llu", ds->ds_object);
   2304 
   2305 	dsl_dir_close(odd, FTAG);
   2306 	dsl_dataset_close(origin_ds, DS_MODE_EXCLUSIVE, FTAG);
   2307 	kmem_free(name, MAXPATHLEN);
   2308 }
   2309 
   2310 int
   2311 dsl_dataset_promote(const char *name)
   2312 {
   2313 	dsl_dataset_t *ds;
   2314 	int err;
   2315 	dmu_object_info_t doi;
   2316 	struct promotearg pa;
   2317 
   2318 	err = dsl_dataset_open(name, DS_MODE_NONE, FTAG, &ds);
   2319 	if (err)
   2320 		return (err);
   2321 
   2322 	err = dmu_object_info(ds->ds_dir->dd_pool->dp_meta_objset,
   2323 	    ds->ds_phys->ds_snapnames_zapobj, &doi);
   2324 	if (err) {
   2325 		dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
   2326 		return (err);
   2327 	}
   2328 
   2329 	/*
   2330 	 * Add in 128x the snapnames zapobj size, since we will be moving
   2331 	 * a bunch of snapnames to the promoted ds, and dirtying their
   2332 	 * bonus buffers.
   2333 	 */
   2334 	err = dsl_sync_task_do(ds->ds_dir->dd_pool,
   2335 	    dsl_dataset_promote_check,
   2336 	    dsl_dataset_promote_sync, ds, &pa, 2 + 2 * doi.doi_physical_blks);
   2337 	dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
   2338 	return (err);
   2339 }
   2340 
/*
 * Arguments for the clone-swap sync task.  dsl_dataset_clone_swap()
 * fills in cds, ohds and force; dsl_dataset_clone_swap_check() computes
 * unused_refres_delta, which dsl_dataset_clone_swap_sync() then applies.
 */
struct cloneswaparg {
	dsl_dataset_t *cds; /* clone dataset */
	dsl_dataset_t *ohds; /* origin's head dataset */
	boolean_t force; /* swap even if ohds changed since its last snap */
	int64_t unused_refres_delta; /* change in unconsumed refreservation */
};
   2347 
   2348 /* ARGSUSED */
   2349 static int
   2350 dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx)
   2351 {
   2352 	struct cloneswaparg *csa = arg1;
   2353 
   2354 	/* they should both be heads */
   2355 	if (dsl_dataset_is_snapshot(csa->cds) ||
   2356 	    dsl_dataset_is_snapshot(csa->ohds))
   2357 		return (EINVAL);
   2358 
   2359 	/* the branch point should be just before them */
   2360 	if (csa->cds->ds_prev != csa->ohds->ds_prev)
   2361 		return (EINVAL);
   2362 
   2363 	/* cds should be the clone */
   2364 	if (csa->cds->ds_prev->ds_phys->ds_next_snap_obj !=
   2365 	    csa->ohds->ds_object)
   2366 		return (EINVAL);
   2367 
   2368 	/* the clone should be a child of the origin */
   2369 	if (csa->cds->ds_dir->dd_parent != csa->ohds->ds_dir)
   2370 		return (EINVAL);
   2371 
   2372 	/* ohds shouldn't be modified unless 'force' */
   2373 	if (!csa->force && dsl_dataset_modified_since_lastsnap(csa->ohds))
   2374 		return (ETXTBSY);
   2375 
   2376 	/* adjust amount of any unconsumed refreservation */
   2377 	csa->unused_refres_delta =
   2378 	    (int64_t)MIN(csa->ohds->ds_reserved,
   2379 	    csa->ohds->ds_phys->ds_unique_bytes) -
   2380 	    (int64_t)MIN(csa->ohds->ds_reserved,
   2381 	    csa->cds->ds_phys->ds_unique_bytes);
   2382 
   2383 	if (csa->unused_refres_delta > 0 &&
   2384 	    csa->unused_refres_delta >
   2385 	    dsl_dir_space_available(csa->ohds->ds_dir, NULL, 0, TRUE))
   2386 		return (ENOSPC);
   2387 
   2388 	return (0);
   2389 }
   2390 
/*
 * Sync-task callback that swaps the contents of a clone (csa->cds)
 * with its origin head (csa->ohds).  arg1 is the struct cloneswaparg
 * validated by dsl_dataset_clone_swap_check().
 *
 * In order: any user data attached to either dataset is evicted, the
 * shared previous snapshot's unique bytes are recomputed from the
 * clone's deadlist, the block pointers are swapped, the dsl_dir space
 * accounting of both dirs is adjusted by the difference, the
 * per-dataset byte counters are swapped, the change in unconsumed
 * refreservation is applied to the origin head's dir, and finally the
 * deadlist objects are swapped and reopened.
 *
 * The statement order matters: the dsl_dir deltas are computed from the
 * byte counters and deadlists before those are swapped.
 */
/* ARGSUSED */
static void
dsl_dataset_clone_swap_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
	struct cloneswaparg *csa = arg1;
	dsl_pool_t *dp = csa->cds->ds_dir->dd_pool;
	uint64_t itor = 0;
	blkptr_t bp;
	uint64_t unique = 0;
	int err;

	ASSERT(csa->cds->ds_reserved == 0);
	ASSERT(csa->cds->ds_quota == csa->ohds->ds_quota);

	/* all three datasets have their on-disk state modified below */
	dmu_buf_will_dirty(csa->cds->ds_dbuf, tx);
	dmu_buf_will_dirty(csa->ohds->ds_dbuf, tx);
	dmu_buf_will_dirty(csa->cds->ds_prev->ds_dbuf, tx);

	/* run the eviction callback for any attached user data */
	if (csa->cds->ds_user_ptr != NULL) {
		csa->cds->ds_user_evict_func(csa->cds, csa->cds->ds_user_ptr);
		csa->cds->ds_user_ptr = NULL;
	}

	if (csa->ohds->ds_user_ptr != NULL) {
		csa->ohds->ds_user_evict_func(csa->ohds,
		    csa->ohds->ds_user_ptr);
		csa->ohds->ds_user_ptr = NULL;
	}

	/*
	 * compute unique space: sum the blocks on the clone's deadlist
	 * born after the txg of the snapshot preceding the branch point.
	 */
	while ((err = bplist_iterate(&csa->cds->ds_deadlist,
	    &itor, &bp)) == 0) {
		if (bp.blk_birth > csa->cds->ds_prev->ds_phys->ds_prev_snap_txg)
			unique += bp_get_dasize(dp->dp_spa, &bp);
	}
	/* the walk must terminate with ENOENT */
	VERIFY(err == ENOENT);

	/* reset origin's unique bytes */
	csa->cds->ds_prev->ds_phys->ds_unique_bytes = unique;

	/* swap blkptrs */
	{
		blkptr_t tmp;
		tmp = csa->ohds->ds_phys->ds_bp;
		csa->ohds->ds_phys->ds_bp = csa->cds->ds_phys->ds_bp;
		csa->cds->ds_phys->ds_bp = tmp;
	}

	/*
	 * set dd_*_bytes: the delta is (clone's bytes + clone's deadlist)
	 * minus (origin head's bytes + origin head's deadlist); it is
	 * added to the origin head's dir and subtracted from the clone's.
	 */
	{
		int64_t dused, dcomp, duncomp;
		uint64_t cdl_used, cdl_comp, cdl_uncomp;
		uint64_t odl_used, odl_comp, odl_uncomp;

		VERIFY(0 == bplist_space(&csa->cds->ds_deadlist, &cdl_used,
		    &cdl_comp, &cdl_uncomp));
		VERIFY(0 == bplist_space(&csa->ohds->ds_deadlist, &odl_used,
		    &odl_comp, &odl_uncomp));
		dused = csa->cds->ds_phys->ds_used_bytes + cdl_used -
		    (csa->ohds->ds_phys->ds_used_bytes + odl_used);
		dcomp = csa->cds->ds_phys->ds_compressed_bytes + cdl_comp -
		    (csa->ohds->ds_phys->ds_compressed_bytes + odl_comp);
		duncomp = csa->cds->ds_phys->ds_uncompressed_bytes +
		    cdl_uncomp -
		    (csa->ohds->ds_phys->ds_uncompressed_bytes + odl_uncomp);

		dsl_dir_diduse_space(csa->ohds->ds_dir,
		    dused, dcomp, duncomp, tx);
		dsl_dir_diduse_space(csa->cds->ds_dir,
		    -dused, -dcomp, -duncomp, tx);
	}

/*
 * Exchange two uint64_t lvalues.  Note that the macro expands to a bare
 * brace block and stays defined for the rest of the file.
 */
#define	SWITCH64(x, y) \
	{ \
		uint64_t __tmp = (x); \
		(x) = (y); \
		(y) = __tmp; \
	}

	/* swap ds_*_bytes */
	SWITCH64(csa->ohds->ds_phys->ds_used_bytes,
	    csa->cds->ds_phys->ds_used_bytes);
	SWITCH64(csa->ohds->ds_phys->ds_compressed_bytes,
	    csa->cds->ds_phys->ds_compressed_bytes);
	SWITCH64(csa->ohds->ds_phys->ds_uncompressed_bytes,
	    csa->cds->ds_phys->ds_uncompressed_bytes);
	SWITCH64(csa->ohds->ds_phys->ds_unique_bytes,
	    csa->cds->ds_phys->ds_unique_bytes);

	/* apply any parent delta for change in unconsumed refreservation */
	dsl_dir_diduse_space(csa->ohds->ds_dir, csa->unused_refres_delta,
	    0, 0, tx);

	/*
	 * swap deadlists: close both in-core lists, exchange the on-disk
	 * object numbers, then reopen each list on its new object.
	 */
	bplist_close(&csa->cds->ds_deadlist);
	bplist_close(&csa->ohds->ds_deadlist);
	SWITCH64(csa->ohds->ds_phys->ds_deadlist_obj,
	    csa->cds->ds_phys->ds_deadlist_obj);
	VERIFY(0 == bplist_open(&csa->cds->ds_deadlist, dp->dp_meta_objset,
	    csa->cds->ds_phys->ds_deadlist_obj));
	VERIFY(0 == bplist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset,
	    csa->ohds->ds_phys->ds_deadlist_obj));
}
   2494 
   2495 /*
   2496  * Swap 'clone' with its origin head file system.
   2497  */
   2498 int
   2499 dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head,
   2500     boolean_t force)
   2501 {
   2502 	struct cloneswaparg csa;
   2503 
   2504 	ASSERT(clone->ds_open_refcount == DS_REF_MAX);
   2505 	ASSERT(origin_head->ds_open_refcount == DS_REF_MAX);
   2506 
   2507 	csa.cds = clone;
   2508 	csa.ohds = origin_head;
   2509 	csa.force = force;
   2510 	return (dsl_sync_task_do(clone->ds_dir->dd_pool,
   2511 	    dsl_dataset_clone_swap_check,
   2512 	    dsl_dataset_clone_swap_sync, &csa, NULL, 9));
   2513 }
   2514 
   2515 /*
   2516  * Given a pool name and a dataset object number in that pool,
   2517  * return the name of that dataset.
   2518  */
   2519 int
   2520 dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf)
   2521 {
   2522 	spa_t *spa;
   2523 	dsl_pool_t *dp;
   2524 	dsl_dataset_t *ds = NULL;
   2525 	int error;
   2526 
   2527 	if ((error = spa_open(pname, &spa, FTAG)) != 0)
   2528 		return (error);
   2529 	dp = spa_get_dsl(spa);
   2530 	rw_enter(&dp->dp_config_rwlock, RW_READER);
   2531 	if ((error = dsl_dataset_open_obj(dp, obj,
   2532 	    NULL, DS_MODE_NONE, FTAG, &ds)) != 0) {
   2533 		rw_exit(&dp->dp_config_rwlock);
   2534 		spa_close(spa, FTAG);
   2535 		return (error);
   2536 	}
   2537 	dsl_dataset_name(ds, buf);
   2538 	dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
   2539 	rw_exit(&dp->dp_config_rwlock);
   2540 	spa_close(spa, FTAG);
   2541 
   2542 	return (0);
   2543 }
   2544 
   2545 int
   2546 dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
   2547     uint64_t asize, uint64_t inflight, uint64_t *used)
   2548 {
   2549 	int error = 0;
   2550 
   2551 	ASSERT3S(asize, >, 0);
   2552 
   2553 	mutex_enter(&ds->ds_lock);
   2554 	/*
   2555 	 * Make a space adjustment for reserved bytes.
   2556 	 */
   2557 	if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) {
   2558 		ASSERT3U(*used, >=,
   2559 		    ds->ds_reserved - ds->ds_phys->ds_unique_bytes);
   2560 		*used -= (ds->ds_reserved - ds->ds_phys->ds_unique_bytes);
   2561 	}
   2562 
   2563 	if (!check_quota || ds->ds_quota == 0) {
   2564 		mutex_exit(&ds->ds_lock);
   2565 		return (0);
   2566 	}
   2567 	/*
   2568 	 * If they are requesting more space, and our current estimate
   2569 	 * is over quota, they get to try again unless the actual
   2570 	 * on-disk is over quota and there are no pending changes (which
   2571 	 * may free up space for us).
   2572 	 */
   2573 	if (ds->ds_phys->ds_used_bytes + inflight >= ds->ds_quota) {
   2574 		if (inflight > 0 || ds->ds_phys->ds_used_bytes < ds->ds_quota)
   2575 			error = ERESTART;
   2576 		else
   2577 			error = EDQUOT;
   2578 	}
   2579 	mutex_exit(&ds->ds_lock);
   2580 
   2581 	return (error);
   2582 }
   2583 
   2584 /* ARGSUSED */
   2585 static int
   2586 dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx)
   2587 {
   2588 	dsl_dataset_t *ds = arg1;
   2589 	uint64_t *quotap = arg2;
   2590 	uint64_t new_quota = *quotap;
   2591 
   2592 	if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_REFQUOTA)
   2593 		return (ENOTSUP);
   2594 
   2595 	if (new_quota == 0)
   2596 		return (0);
   2597 
   2598 	if (new_quota < ds->ds_phys->ds_used_bytes ||
   2599 	    new_quota < ds->ds_reserved)
   2600 		return (ENOSPC);
   2601 
   2602 	return (0);
   2603 }
   2604 
   2605 /* ARGSUSED */
   2606 void
   2607 dsl_dataset_set_quota_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
   2608 {
   2609 	dsl_dataset_t *ds = arg1;
   2610 	uint64_t *quotap = arg2;
   2611 	uint64_t new_quota = *quotap;
   2612 
   2613 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
   2614 
   2615 	mutex_enter(&ds->ds_lock);
   2616 	ds->ds_quota = new_quota;
   2617 	mutex_exit(&ds->ds_lock);
   2618 
   2619 	dsl_prop_set_uint64_sync(ds->ds_dir, "refquota", new_quota, cr, tx);
   2620 
   2621 	spa_history_internal_log(LOG_DS_REFQUOTA, ds->ds_dir->dd_pool->dp_spa,
   2622 	    tx, cr, "%lld dataset = %llu ",
   2623 	    (longlong_t)new_quota, ds->ds_dir->dd_phys->dd_head_dataset_obj);
   2624 }
   2625 
   2626 int
   2627 dsl_dataset_set_quota(const char *dsname, uint64_t quota)
   2628 {
   2629 	dsl_dataset_t *ds;
   2630 	int err;
   2631 
   2632 	err = dsl_dataset_open(dsname, DS_MODE_STANDARD, FTAG, &ds);
   2633 	if (err)
   2634 		return (err);
   2635 
   2636 	if (quota != ds->ds_quota) {
   2637 		/*
   2638 		 * If someone removes a file, then tries to set the quota, we
   2639 		 * want to make sure the file freeing takes effect.
   2640 		 */
   2641 		txg_wait_open(ds->ds_dir->dd_pool, 0);
   2642 
   2643 		err = dsl_sync_task_do(ds->ds_dir->dd_pool,
   2644 		    dsl_dataset_set_quota_check, dsl_dataset_set_quota_sync,
   2645 		    ds, &quota, 0);
   2646 	}
   2647 	dsl_dataset_close(ds, DS_MODE_STANDARD, FTAG);
   2648 	return (err);
   2649 }
   2650 
   2651 static int
   2652 dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
   2653 {
   2654 	dsl_dataset_t *ds = arg1;
   2655 	uint64_t *reservationp = arg2;
   2656 	uint64_t new_reservation = *reservationp;
   2657 	int64_t delta;
   2658 	uint64_t unique;
   2659 
   2660 	if (new_reservation > INT64_MAX)
   2661 		return (EOVERFLOW);
   2662 
   2663 	if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
   2664 	    SPA_VERSION_REFRESERVATION)
   2665 		return (ENOTSUP);
   2666 
   2667 	if (dsl_dataset_is_snapshot(ds))
   2668 		return (EINVAL);
   2669 
   2670 	/*
   2671 	 * If we are doing the preliminary check in open context, the
   2672 	 * space estimates may be inaccurate.
   2673 	 */
   2674 	if (!dmu_tx_is_syncing(tx))
   2675 		return (0);
   2676 
   2677 	mutex_enter(&ds->ds_lock);
   2678 	unique = dsl_dataset_unique(ds);
   2679 	delta = MAX(unique, new_reservation) - MAX(unique, ds->ds_reserved);
   2680 	mutex_exit(&ds->ds_lock);
   2681 
   2682 	if (delta > 0 &&
   2683 	    delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
   2684 		return (ENOSPC);
   2685 	if (delta > 0 && ds->ds_quota > 0 &&
   2686 	    new_reservation > ds->ds_quota)
   2687 		return (ENOSPC);
   2688 
   2689 	return (0);
   2690 }
   2691 
   2692 /* ARGSUSED */
   2693 static void
   2694 dsl_dataset_set_reservation_sync(void *arg1, void *arg2, cred_t *cr,
   2695     dmu_tx_t *tx)
   2696 {
   2697 	dsl_dataset_t *ds = arg1;
   2698 	uint64_t *reservationp = arg2;
   2699 	uint64_t new_reservation = *reservationp;
   2700 	uint64_t unique;
   2701 	int64_t delta;
   2702 
   2703 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
   2704 
   2705 	mutex_enter(&ds->ds_lock);
   2706 	unique = dsl_dataset_unique(ds);
   2707 	delta = MAX(0, (int64_t)(new_reservation - unique)) -
   2708 	    MAX(0, (int64_t)(ds->ds_reserved - unique));
   2709 	ds->ds_reserved = new_reservation;
   2710 	mutex_exit(&ds->ds_lock);
   2711 
   2712 	dsl_prop_set_uint64_sync(ds->ds_dir, "refreservation",
   2713 	    new_reservation, cr, tx);
   2714 
   2715 	dsl_dir_diduse_space(ds->ds_dir, delta, 0, 0, tx);
   2716 
   2717 	spa_history_internal_log(LOG_DS_REFRESERV,
   2718 	    ds->ds_dir->dd_pool->dp_spa, tx, cr, "%lld dataset = %llu",
   2719 	    (longlong_t)new_reservation,
   2720 	    ds->ds_dir->dd_phys->dd_head_dataset_obj);
   2721 }
   2722 
   2723 int
   2724 dsl_dataset_set_reservation(const char *dsname, uint64_t reservation)
   2725 {
   2726 	dsl_dataset_t *ds;
   2727 	int err;
   2728 
   2729 	err = dsl_dataset_open(dsname, DS_MODE_STANDARD, FTAG, &ds);
   2730 	if (err)
   2731 		return (err);
   2732 
   2733 	err = dsl_sync_task_do(ds->ds_dir->dd_pool,
   2734 	    dsl_dataset_set_reservation_check,
   2735 	    dsl_dataset_set_reservation_sync, ds, &reservation, 0);
   2736 	dsl_dataset_close(ds, DS_MODE_STANDARD, FTAG);
   2737 	return (err);
   2738 }
   2739