Home | History | Annotate | Download | only in zfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #pragma ident	"@(#)dmu_traverse.c	1.6	06/04/13 SMI"
     27 
     28 #include <sys/zfs_context.h>
     29 #include <sys/dmu_objset.h>
     30 #include <sys/dmu_traverse.h>
     31 #include <sys/dsl_dataset.h>
     32 #include <sys/dsl_dir.h>
     33 #include <sys/dsl_pool.h>
     34 #include <sys/dnode.h>
     35 #include <sys/spa.h>
     36 #include <sys/zio.h>
     37 #include <sys/dmu_impl.h>
     38 
     39 #define	BP_SPAN_SHIFT(level, width)	((level) * (width))
     40 
     41 #define	BP_EQUAL(b1, b2)				\
     42 	(DVA_EQUAL(BP_IDENTITY(b1), BP_IDENTITY(b2)) &&	\
     43 	(b1)->blk_birth == (b2)->blk_birth)
     44 
     45 /*
     46  * Compare two bookmarks.
     47  *
     48  * For ADVANCE_PRE, the visitation order is:
     49  *
     50  *	objset 0, 1, 2, ..., ZB_MAXOBJSET.
     51  *	object 0, 1, 2, ..., ZB_MAXOBJECT.
     52  *	blkoff 0, 1, 2, ...
     53  *	level ZB_MAXLEVEL, ..., 2, 1, 0.
     54  *
     55  * where blkoff = blkid << BP_SPAN_SHIFT(level, width), and thus a valid
     56  * ordering vector is:
     57  *
     58  *	< objset, object, blkoff, -level >
     59  *
     60  * For ADVANCE_POST, the starting offsets aren't sequential but ending
     61  * offsets [blkoff = (blkid + 1) << BP_SPAN_SHIFT(level, width)] are.
     62  * The visitation order is:
     63  *
     64  *	objset 1, 2, ..., ZB_MAXOBJSET, 0.
     65  *	object 1, 2, ..., ZB_MAXOBJECT, 0.
     66  *	blkoff 1, 2, ...
     67  *	level 0, 1, 2, ..., ZB_MAXLEVEL.
     68  *
     69  * and thus a valid ordering vector is:
     70  *
     71  *	< objset - 1, object - 1, blkoff, level >
     72  *
     73  * Both orderings can be expressed as:
     74  *
     75  *	< objset + bias, object + bias, blkoff, level ^ bias >
     76  *
     77  * where 'bias' is either 0 or -1 (for ADVANCE_PRE or ADVANCE_POST)
     78  * and 'blkoff' is (blkid - bias) << BP_SPAN_SHIFT(level, wshift).
     79  *
     80  * Special case: an objset's osphys is represented as level -1 of object 0.
     81  * It is always either the very first or very last block we visit in an objset.
     82  * Therefore, if either bookmark's level is -1, level alone determines order.
     83  */
     84 static int
     85 compare_bookmark(zbookmark_t *szb, zbookmark_t *ezb, dnode_phys_t *dnp,
     86     int advance)
     87 {
     88 	int bias = (advance & ADVANCE_PRE) ? 0 : -1;
     89 	uint64_t sblkoff, eblkoff;
     90 	int slevel, elevel, wshift;
     91 
     92 	if (szb->zb_objset + bias < ezb->zb_objset + bias)
     93 		return (-1);
     94 
     95 	if (szb->zb_objset + bias > ezb->zb_objset + bias)
     96 		return (1);
     97 
     98 	slevel = szb->zb_level;
     99 	elevel = ezb->zb_level;
    100 
    101 	if ((slevel | elevel) < 0)
    102 		return ((slevel ^ bias) - (elevel ^ bias));
    103 
    104 	if (szb->zb_object + bias < ezb->zb_object + bias)
    105 		return (-1);
    106 
    107 	if (szb->zb_object + bias > ezb->zb_object + bias)
    108 		return (1);
    109 
    110 	if (dnp == NULL)
    111 		return (0);
    112 
    113 	wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;
    114 
    115 	sblkoff = (szb->zb_blkid - bias) << BP_SPAN_SHIFT(slevel, wshift);
    116 	eblkoff = (ezb->zb_blkid - bias) << BP_SPAN_SHIFT(elevel, wshift);
    117 
    118 	if (sblkoff < eblkoff)
    119 		return (-1);
    120 
    121 	if (sblkoff > eblkoff)
    122 		return (1);
    123 
    124 	return ((elevel ^ bias) - (slevel ^ bias));
    125 }
    126 
    127 #define	SET_BOOKMARK(zb, objset, object, level, blkid)	\
    128 {							\
    129 	(zb)->zb_objset = objset;			\
    130 	(zb)->zb_object = object;			\
    131 	(zb)->zb_level = level;				\
    132 	(zb)->zb_blkid = blkid;				\
    133 }
    134 
    135 #define	SET_BOOKMARK_LB(zb, level, blkid)		\
    136 {							\
    137 	(zb)->zb_level = level;				\
    138 	(zb)->zb_blkid = blkid;				\
    139 }
    140 
    141 static int
    142 advance_objset(zseg_t *zseg, uint64_t objset, int advance)
    143 {
    144 	zbookmark_t *zb = &zseg->seg_start;
    145 
    146 	if (advance & ADVANCE_PRE) {
    147 		if (objset >= ZB_MAXOBJSET)
    148 			return (ERANGE);
    149 		SET_BOOKMARK(zb, objset, 0, -1, 0);
    150 	} else {
    151 		if (objset >= ZB_MAXOBJSET)
    152 			objset = 0;
    153 		SET_BOOKMARK(zb, objset, 1, 0, 0);
    154 	}
    155 
    156 	if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0)
    157 		return (ERANGE);
    158 
    159 	return (EAGAIN);
    160 }
    161 
    162 static int
    163 advance_object(zseg_t *zseg, uint64_t object, int advance)
    164 {
    165 	zbookmark_t *zb = &zseg->seg_start;
    166 
    167 	if (advance & ADVANCE_PRE) {
    168 		if (object >= ZB_MAXOBJECT) {
    169 			SET_BOOKMARK(zb, zb->zb_objset + 1, 0, -1, 0);
    170 		} else {
    171 			SET_BOOKMARK(zb, zb->zb_objset, object, ZB_MAXLEVEL, 0);
    172 		}
    173 	} else {
    174 		if (zb->zb_object == 0) {
    175 			SET_BOOKMARK(zb, zb->zb_objset, 0, -1, 0);
    176 		} else {
    177 			if (object >= ZB_MAXOBJECT)
    178 				object = 0;
    179 			SET_BOOKMARK(zb, zb->zb_objset, object, 0, 0);
    180 		}
    181 	}
    182 
    183 	if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0)
    184 		return (ERANGE);
    185 
    186 	return (EAGAIN);
    187 }
    188 
    189 static int
    190 advance_from_osphys(zseg_t *zseg, int advance)
    191 {
    192 	zbookmark_t *zb = &zseg->seg_start;
    193 
    194 	ASSERT(zb->zb_object == 0);
    195 	ASSERT(zb->zb_level == -1);
    196 	ASSERT(zb->zb_blkid == 0);
    197 
    198 	if (advance & ADVANCE_PRE) {
    199 		SET_BOOKMARK_LB(zb, ZB_MAXLEVEL, 0);
    200 	} else {
    201 		if (zb->zb_objset == 0)
    202 			return (ERANGE);
    203 		SET_BOOKMARK(zb, zb->zb_objset + 1, 1, 0, 0);
    204 	}
    205 
    206 	if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0)
    207 		return (ERANGE);
    208 
    209 	return (EAGAIN);
    210 }
    211 
    212 static int
    213 advance_block(zseg_t *zseg, dnode_phys_t *dnp, int rc, int advance)
    214 {
    215 	zbookmark_t *zb = &zseg->seg_start;
    216 	int wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;
    217 	int maxlevel = dnp->dn_nlevels - 1;
    218 	int level = zb->zb_level;
    219 	uint64_t blkid = zb->zb_blkid;
    220 
    221 	if (advance & ADVANCE_PRE) {
    222 		if (level > 0 && rc == 0) {
    223 			level--;
    224 			blkid <<= wshift;
    225 		} else {
    226 			blkid++;
    227 
    228 			if ((blkid << BP_SPAN_SHIFT(level, wshift)) >
    229 			    dnp->dn_maxblkid)
    230 				return (ERANGE);
    231 
    232 			while (level < maxlevel) {
    233 				if (P2PHASE(blkid, 1ULL << wshift))
    234 					break;
    235 				blkid >>= wshift;
    236 				level++;
    237 			}
    238 		}
    239 	} else {
    240 		if (level >= maxlevel || P2PHASE(blkid + 1, 1ULL << wshift)) {
    241 			blkid = (blkid + 1) << BP_SPAN_SHIFT(level, wshift);
    242 			level = 0;
    243 		} else {
    244 			blkid >>= wshift;
    245 			level++;
    246 		}
    247 
    248 		while ((blkid << BP_SPAN_SHIFT(level, wshift)) >
    249 		    dnp->dn_maxblkid) {
    250 			if (level == maxlevel)
    251 				return (ERANGE);
    252 			blkid >>= wshift;
    253 			level++;
    254 		}
    255 	}
    256 	SET_BOOKMARK_LB(zb, level, blkid);
    257 
    258 	if (compare_bookmark(zb, &zseg->seg_end, dnp, advance) > 0)
    259 		return (ERANGE);
    260 
    261 	return (EAGAIN);
    262 }
    263 
    264 static int
    265 traverse_callback(traverse_handle_t *th, zseg_t *zseg, traverse_blk_cache_t *bc)
    266 {
    267 	/*
    268 	 * Before we issue the callback, prune against maxtxg.
    269 	 *
    270 	 * We prune against mintxg before we get here because it's a big win.
    271 	 * If a given block was born in txg 37, then we know that the entire
    272 	 * subtree below that block must have been born in txg 37 or earlier.
    273 	 * We can therefore lop off huge branches of the tree as we go.
    274 	 *
    275 	 * There's no corresponding optimization for maxtxg because knowing
    276 	 * that bp->blk_birth >= maxtxg doesn't imply anything about the bp's
    277 	 * children.  In fact, the copy-on-write design of ZFS ensures that
    278 	 * top-level blocks will pretty much always be new.
    279 	 *
    280 	 * Therefore, in the name of simplicity we don't prune against
    281 	 * maxtxg until the last possible moment -- that being right now.
    282 	 */
    283 	if (bc->bc_errno == 0 && bc->bc_blkptr.blk_birth >= zseg->seg_maxtxg)
    284 		return (0);
    285 
    286 	/*
    287 	 * Debugging: verify that the order we visit things agrees with the
    288 	 * order defined by compare_bookmark().  We don't check this for
    289 	 * log blocks because there's no defined ordering for them; they're
    290 	 * always visited (or not) as part of visiting the objset_phys_t.
    291 	 */
    292 	if (bc->bc_errno == 0 && bc != &th->th_zil_cache) {
    293 		zbookmark_t *zb = &bc->bc_bookmark;
    294 		zbookmark_t *szb = &zseg->seg_start;
    295 		zbookmark_t *ezb = &zseg->seg_end;
    296 		zbookmark_t *lzb = &th->th_lastcb;
    297 		dnode_phys_t *dnp = bc->bc_dnode;
    298 
    299 		ASSERT(compare_bookmark(zb, ezb, dnp, th->th_advance) <= 0);
    300 		ASSERT(compare_bookmark(zb, szb, dnp, th->th_advance) == 0);
    301 		ASSERT(compare_bookmark(lzb, zb, dnp, th->th_advance) < 0 ||
    302 		    lzb->zb_level == ZB_NO_LEVEL);
    303 		*lzb = *zb;
    304 	}
    305 
    306 	th->th_callbacks++;
    307 	return (th->th_func(bc, th->th_spa, th->th_arg));
    308 }
    309 
    310 static int
    311 traverse_read(traverse_handle_t *th, traverse_blk_cache_t *bc, blkptr_t *bp,
    312 	dnode_phys_t *dnp)
    313 {
    314 	zbookmark_t *zb = &bc->bc_bookmark;
    315 	int error;
    316 
    317 	th->th_hits++;
    318 
    319 	bc->bc_dnode = dnp;
    320 	bc->bc_errno = 0;
    321 
    322 	if (BP_EQUAL(&bc->bc_blkptr, bp))
    323 		return (0);
    324 
    325 	bc->bc_blkptr = *bp;
    326 
    327 	if (bc->bc_data == NULL)
    328 		return (0);
    329 
    330 	if (BP_IS_HOLE(bp)) {
    331 		ASSERT(th->th_advance & ADVANCE_HOLES);
    332 		return (0);
    333 	}
    334 
    335 	if (compare_bookmark(zb, &th->th_noread, dnp, 0) == 0) {
    336 		error = EIO;
    337 	} else if (arc_tryread(th->th_spa, bp, bc->bc_data) == 0) {
    338 		error = 0;
    339 		th->th_arc_hits++;
    340 	} else {
    341 		error = zio_wait(zio_read(NULL, th->th_spa, bp, bc->bc_data,
    342 		    BP_GET_LSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_READ,
    343 		    th->th_zio_flags | ZIO_FLAG_DONT_CACHE, zb));
    344 
    345 		if (BP_SHOULD_BYTESWAP(bp) && error == 0)
    346 			(zb->zb_level > 0 ? byteswap_uint64_array :
    347 			    dmu_ot[BP_GET_TYPE(bp)].ot_byteswap)(bc->bc_data,
    348 			    BP_GET_LSIZE(bp));
    349 		th->th_reads++;
    350 	}
    351 
    352 	if (error) {
    353 		bc->bc_errno = error;
    354 		error = traverse_callback(th, NULL, bc);
    355 		ASSERT(error == EAGAIN || error == EINTR || error == ERESTART);
    356 		bc->bc_blkptr.blk_birth = -1ULL;
    357 	}
    358 
    359 	dprintf("cache %02x error %d <%llu, %llu, %d, %llx>\n",
    360 	    bc - &th->th_cache[0][0], error,
    361 	    zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid);
    362 
    363 	return (error);
    364 }
    365 
    366 static int
    367 find_block(traverse_handle_t *th, zseg_t *zseg, dnode_phys_t *dnp, int depth)
    368 {
    369 	zbookmark_t *zb = &zseg->seg_start;
    370 	traverse_blk_cache_t *bc;
    371 	blkptr_t *bp = dnp->dn_blkptr;
    372 	int i, first, level;
    373 	int nbp = dnp->dn_nblkptr;
    374 	int minlevel = zb->zb_level;
    375 	int maxlevel = dnp->dn_nlevels - 1;
    376 	int wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;
    377 	int bp_shift = BP_SPAN_SHIFT(maxlevel - minlevel, wshift);
    378 	uint64_t blkid = zb->zb_blkid >> bp_shift;
    379 	int do_holes = (th->th_advance & ADVANCE_HOLES) && depth == ZB_DN_CACHE;
    380 	int rc;
    381 
    382 	if (minlevel > maxlevel || blkid >= nbp)
    383 		return (ERANGE);
    384 
    385 	for (level = maxlevel; level >= minlevel; level--) {
    386 		first = P2PHASE(blkid, 1ULL << wshift);
    387 
    388 		for (i = first; i < nbp; i++)
    389 			if (bp[i].blk_birth > zseg->seg_mintxg ||
    390 			    BP_IS_HOLE(&bp[i]) && do_holes)
    391 				break;
    392 
    393 		if (i != first) {
    394 			i--;
    395 			SET_BOOKMARK_LB(zb, level, blkid + (i - first));
    396 			return (ENOTBLK);
    397 		}
    398 
    399 		bc = &th->th_cache[depth][level];
    400 
    401 		SET_BOOKMARK(&bc->bc_bookmark, zb->zb_objset, zb->zb_object,
    402 		    level, blkid);
    403 
    404 		if (rc = traverse_read(th, bc, bp + i, dnp)) {
    405 			if (rc != EAGAIN) {
    406 				SET_BOOKMARK_LB(zb, level, blkid);
    407 			}
    408 			return (rc);
    409 		}
    410 
    411 		if (BP_IS_HOLE(&bp[i])) {
    412 			SET_BOOKMARK_LB(zb, level, blkid);
    413 			th->th_lastcb.zb_level = ZB_NO_LEVEL;
    414 			return (0);
    415 		}
    416 
    417 		nbp = 1 << wshift;
    418 		bp = bc->bc_data;
    419 		bp_shift -= wshift;
    420 		blkid = zb->zb_blkid >> bp_shift;
    421 	}
    422 
    423 	return (0);
    424 }
    425 
    426 static int
    427 get_dnode(traverse_handle_t *th, uint64_t objset, dnode_phys_t *mdn,
    428     uint64_t *objectp, dnode_phys_t **dnpp, uint64_t txg, int type, int depth)
    429 {
    430 	zseg_t zseg;
    431 	zbookmark_t *zb = &zseg.seg_start;
    432 	uint64_t object = *objectp;
    433 	int i, rc;
    434 
    435 	SET_BOOKMARK(zb, objset, 0, 0, object / DNODES_PER_BLOCK);
    436 	SET_BOOKMARK(&zseg.seg_end, objset, 0, 0, ZB_MAXBLKID);
    437 
    438 	zseg.seg_mintxg = txg;
    439 	zseg.seg_maxtxg = -1ULL;
    440 
    441 	for (;;) {
    442 		rc = find_block(th, &zseg, mdn, depth);
    443 
    444 		if (rc == EAGAIN || rc == EINTR || rc == ERANGE)
    445 			break;
    446 
    447 		if (rc == 0 && zb->zb_level == 0) {
    448 			dnode_phys_t *dnp = th->th_cache[depth][0].bc_data;
    449 			for (i = 0; i < DNODES_PER_BLOCK; i++) {
    450 				object = (zb->zb_blkid * DNODES_PER_BLOCK) + i;
    451 				if (object >= *objectp &&
    452 				    dnp[i].dn_type != DMU_OT_NONE &&
    453 				    (type == -1 || dnp[i].dn_type == type)) {
    454 					*objectp = object;
    455 					*dnpp = &dnp[i];
    456 					return (0);
    457 				}
    458 			}
    459 		}
    460 
    461 		rc = advance_block(&zseg, mdn, rc, ADVANCE_PRE);
    462 
    463 		if (rc == ERANGE)
    464 			break;
    465 	}
    466 
    467 	if (rc == ERANGE)
    468 		*objectp = ZB_MAXOBJECT;
    469 
    470 	return (rc);
    471 }
    472 
    473 /* ARGSUSED */
    474 static void
    475 traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
    476 {
    477 	traverse_handle_t *th = arg;
    478 	traverse_blk_cache_t *bc = &th->th_zil_cache;
    479 	zbookmark_t *zb = &bc->bc_bookmark;
    480 	zseg_t *zseg = list_head(&th->th_seglist);
    481 
    482 	if (bp->blk_birth <= zseg->seg_mintxg)
    483 		return;
    484 
    485 	if (claim_txg != 0 || bp->blk_birth < spa_first_txg(th->th_spa)) {
    486 		zb->zb_object = 0;
    487 		zb->zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];
    488 		bc->bc_blkptr = *bp;
    489 		(void) traverse_callback(th, zseg, bc);
    490 	}
    491 }
    492 
    493 /* ARGSUSED */
    494 static void
    495 traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
    496 {
    497 	traverse_handle_t *th = arg;
    498 	traverse_blk_cache_t *bc = &th->th_zil_cache;
    499 	zbookmark_t *zb = &bc->bc_bookmark;
    500 	zseg_t *zseg = list_head(&th->th_seglist);
    501 
    502 	if (lrc->lrc_txtype == TX_WRITE) {
    503 		lr_write_t *lr = (lr_write_t *)lrc;
    504 		blkptr_t *bp = &lr->lr_blkptr;
    505 
    506 		if (bp->blk_birth <= zseg->seg_mintxg)
    507 			return;
    508 
    509 		if (claim_txg != 0 && bp->blk_birth >= claim_txg) {
    510 			zb->zb_object = lr->lr_foid;
    511 			zb->zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp);
    512 			bc->bc_blkptr = *bp;
    513 			(void) traverse_callback(th, zseg, bc);
    514 		}
    515 	}
    516 }
    517 
    518 static void
    519 traverse_zil(traverse_handle_t *th, traverse_blk_cache_t *bc)
    520 {
    521 	spa_t *spa = th->th_spa;
    522 	dsl_pool_t *dp = spa_get_dsl(spa);
    523 	objset_phys_t *osphys = bc->bc_data;
    524 	zil_header_t *zh = &osphys->os_zil_header;
    525 	uint64_t claim_txg = zh->zh_claim_txg;
    526 	zilog_t *zilog;
    527 
    528 	ASSERT(bc == &th->th_cache[ZB_MDN_CACHE][ZB_MAXLEVEL - 1]);
    529 	ASSERT(bc->bc_bookmark.zb_level == -1);
    530 
    531 	/*
    532 	 * We only want to visit blocks that have been claimed but not yet
    533 	 * replayed (or, in read-only mode, blocks that *would* be claimed).
    534 	 */
    535 	if (claim_txg == 0 && (spa_mode & FWRITE))
    536 		return;
    537 
    538 	th->th_zil_cache.bc_bookmark = bc->bc_bookmark;
    539 
    540 	zilog = zil_alloc(dp->dp_meta_objset, zh);
    541 
    542 	(void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, th,
    543 	    claim_txg);
    544 
    545 	zil_free(zilog);
    546 }
    547 
    548 static int
    549 traverse_segment(traverse_handle_t *th, zseg_t *zseg, blkptr_t *mosbp)
    550 {
    551 	zbookmark_t *zb = &zseg->seg_start;
    552 	traverse_blk_cache_t *bc;
    553 	dnode_phys_t *dn, *dn_tmp;
    554 	int worklimit = 100;
    555 	int rc;
    556 
    557 	dprintf("<%llu, %llu, %d, %llx>\n",
    558 	    zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid);
    559 
    560 	bc = &th->th_cache[ZB_MOS_CACHE][ZB_MAXLEVEL - 1];
    561 	dn = &((objset_phys_t *)bc->bc_data)->os_meta_dnode;
    562 
    563 	SET_BOOKMARK(&bc->bc_bookmark, 0, 0, -1, 0);
    564 
    565 	rc = traverse_read(th, bc, mosbp, dn);
    566 
    567 	if (rc)		/* If we get ERESTART, we've got nowhere left to go */
    568 		return (rc == ERESTART ? EINTR : rc);
    569 
    570 	ASSERT(dn->dn_nlevels < ZB_MAXLEVEL);
    571 
    572 	if (zb->zb_objset != 0) {
    573 		uint64_t objset = zb->zb_objset;
    574 		dsl_dataset_phys_t *dsp;
    575 
    576 		rc = get_dnode(th, 0, dn, &objset, &dn_tmp, 0,
    577 		    DMU_OT_DSL_DATASET, ZB_MOS_CACHE);
    578 
    579 		if (objset != zb->zb_objset)
    580 			rc = advance_objset(zseg, objset, th->th_advance);
    581 
    582 		if (rc != 0)
    583 			return (rc);
    584 
    585 		dsp = DN_BONUS(dn_tmp);
    586 
    587 		bc = &th->th_cache[ZB_MDN_CACHE][ZB_MAXLEVEL - 1];
    588 		dn = &((objset_phys_t *)bc->bc_data)->os_meta_dnode;
    589 
    590 		SET_BOOKMARK(&bc->bc_bookmark, objset, 0, -1, 0);
    591 
    592 		/*
    593 		 * If we're traversing an open snapshot, we know that it
    594 		 * can't be deleted (because it's open) and it can't change
    595 		 * (because it's a snapshot).  Therefore, once we've gotten
    596 		 * from the uberblock down to the snapshot's objset_phys_t,
    597 		 * we no longer need to synchronize with spa_sync(); we're
    598 		 * traversing a completely static block tree from here on.
    599 		 */
    600 		if (th->th_advance & ADVANCE_NOLOCK) {
    601 			ASSERT(th->th_locked);
    602 			rw_exit(spa_traverse_rwlock(th->th_spa));
    603 			th->th_locked = 0;
    604 		}
    605 
    606 		rc = traverse_read(th, bc, &dsp->ds_bp, dn);
    607 
    608 		if (rc != 0) {
    609 			if (rc == ERESTART)
    610 				rc = advance_objset(zseg, zb->zb_objset + 1,
    611 				    th->th_advance);
    612 			return (rc);
    613 		}
    614 
    615 		if (th->th_advance & ADVANCE_PRUNE)
    616 			zseg->seg_mintxg =
    617 			    MAX(zseg->seg_mintxg, dsp->ds_prev_snap_txg);
    618 	}
    619 
    620 	if (zb->zb_level == -1) {
    621 		ASSERT(zb->zb_object == 0);
    622 		ASSERT(zb->zb_blkid == 0);
    623 		ASSERT(BP_GET_TYPE(&bc->bc_blkptr) == DMU_OT_OBJSET);
    624 
    625 		if (bc->bc_blkptr.blk_birth > zseg->seg_mintxg) {
    626 			rc = traverse_callback(th, zseg, bc);
    627 			if (rc) {
    628 				ASSERT(rc == EINTR);
    629 				return (rc);
    630 			}
    631 			if ((th->th_advance & ADVANCE_ZIL) &&
    632 			    zb->zb_objset != 0)
    633 				traverse_zil(th, bc);
    634 		}
    635 
    636 		return (advance_from_osphys(zseg, th->th_advance));
    637 	}
    638 
    639 	if (zb->zb_object != 0) {
    640 		uint64_t object = zb->zb_object;
    641 
    642 		rc = get_dnode(th, zb->zb_objset, dn, &object, &dn_tmp,
    643 		    zseg->seg_mintxg, -1, ZB_MDN_CACHE);
    644 
    645 		if (object != zb->zb_object)
    646 			rc = advance_object(zseg, object, th->th_advance);
    647 
    648 		if (rc != 0)
    649 			return (rc);
    650 
    651 		dn = dn_tmp;
    652 	}
    653 
    654 	if (zb->zb_level == ZB_MAXLEVEL)
    655 		zb->zb_level = dn->dn_nlevels - 1;
    656 
    657 	for (;;) {
    658 		rc = find_block(th, zseg, dn, ZB_DN_CACHE);
    659 
    660 		if (rc == EAGAIN || rc == EINTR || rc == ERANGE)
    661 			break;
    662 
    663 		if (rc == 0) {
    664 			bc = &th->th_cache[ZB_DN_CACHE][zb->zb_level];
    665 			ASSERT(bc->bc_dnode == dn);
    666 			ASSERT(bc->bc_blkptr.blk_birth <= mosbp->blk_birth);
    667 			rc = traverse_callback(th, zseg, bc);
    668 			if (rc) {
    669 				ASSERT(rc == EINTR);
    670 				return (rc);
    671 			}
    672 			if (BP_IS_HOLE(&bc->bc_blkptr)) {
    673 				ASSERT(th->th_advance & ADVANCE_HOLES);
    674 				rc = ENOTBLK;
    675 			}
    676 		}
    677 
    678 		rc = advance_block(zseg, dn, rc, th->th_advance);
    679 
    680 		if (rc == ERANGE)
    681 			break;
    682 
    683 		/*
    684 		 * Give spa_sync() a chance to run.
    685 		 */
    686 		if (th->th_locked && spa_traverse_wanted(th->th_spa)) {
    687 			th->th_syncs++;
    688 			return (EAGAIN);
    689 		}
    690 
    691 		if (--worklimit == 0)
    692 			return (EAGAIN);
    693 	}
    694 
    695 	if (rc == ERANGE)
    696 		rc = advance_object(zseg, zb->zb_object + 1, th->th_advance);
    697 
    698 	return (rc);
    699 }
    700 
    701 /*
    702  * It is the caller's responsibility to ensure that the dsl_dataset_t
    703  * doesn't go away during traversal.
    704  */
    705 int
    706 traverse_dsl_dataset(dsl_dataset_t *ds, uint64_t txg_start, int advance,
    707     blkptr_cb_t func, void *arg)
    708 {
    709 	spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
    710 	traverse_handle_t *th;
    711 	int err;
    712 
    713 	th = traverse_init(spa, func, arg, advance, ZIO_FLAG_MUSTSUCCEED);
    714 
    715 	traverse_add_objset(th, txg_start, -1ULL, ds->ds_object);
    716 
    717 	while ((err = traverse_more(th)) == EAGAIN)
    718 		continue;
    719 
    720 	traverse_fini(th);
    721 	return (err);
    722 }
    723 
    724 int
    725 traverse_more(traverse_handle_t *th)
    726 {
    727 	zseg_t *zseg = list_head(&th->th_seglist);
    728 	uint64_t save_txg;	/* XXX won't be necessary with real itinerary */
    729 	krwlock_t *rw = spa_traverse_rwlock(th->th_spa);
    730 	blkptr_t *mosbp = spa_get_rootblkptr(th->th_spa);
    731 	int rc;
    732 
    733 	if (zseg == NULL)
    734 		return (0);
    735 
    736 	th->th_restarts++;
    737 
    738 	save_txg = zseg->seg_mintxg;
    739 
    740 	rw_enter(rw, RW_READER);
    741 	th->th_locked = 1;
    742 
    743 	rc = traverse_segment(th, zseg, mosbp);
    744 	ASSERT(rc == ERANGE || rc == EAGAIN || rc == EINTR);
    745 
    746 	if (th->th_locked)
    747 		rw_exit(rw);
    748 	th->th_locked = 0;
    749 
    750 	zseg->seg_mintxg = save_txg;
    751 
    752 	if (rc == ERANGE) {
    753 		list_remove(&th->th_seglist, zseg);
    754 		kmem_free(zseg, sizeof (*zseg));
    755 		return (EAGAIN);
    756 	}
    757 
    758 	return (rc);
    759 }
    760 
    761 /*
    762  * Note: (mintxg, maxtxg) is an open interval; mintxg and maxtxg themselves
    763  * are not included.  The blocks covered by this segment will all have
    764  * mintxg < birth < maxtxg.
    765  */
    766 static void
    767 traverse_add_segment(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
    768     uint64_t sobjset, uint64_t sobject, int slevel, uint64_t sblkid,
    769     uint64_t eobjset, uint64_t eobject, int elevel, uint64_t eblkid)
    770 {
    771 	zseg_t *zseg;
    772 
    773 	zseg = kmem_alloc(sizeof (zseg_t), KM_SLEEP);
    774 
    775 	zseg->seg_mintxg = mintxg;
    776 	zseg->seg_maxtxg = maxtxg;
    777 
    778 	zseg->seg_start.zb_objset = sobjset;
    779 	zseg->seg_start.zb_object = sobject;
    780 	zseg->seg_start.zb_level = slevel;
    781 	zseg->seg_start.zb_blkid = sblkid;
    782 
    783 	zseg->seg_end.zb_objset = eobjset;
    784 	zseg->seg_end.zb_object = eobject;
    785 	zseg->seg_end.zb_level = elevel;
    786 	zseg->seg_end.zb_blkid = eblkid;
    787 
    788 	list_insert_tail(&th->th_seglist, zseg);
    789 }
    790 
    791 void
    792 traverse_add_dnode(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
    793     uint64_t objset, uint64_t object)
    794 {
    795 	if (th->th_advance & ADVANCE_PRE)
    796 		traverse_add_segment(th, mintxg, maxtxg,
    797 		    objset, object, ZB_MAXLEVEL, 0,
    798 		    objset, object, 0, ZB_MAXBLKID);
    799 	else
    800 		traverse_add_segment(th, mintxg, maxtxg,
    801 		    objset, object, 0, 0,
    802 		    objset, object, 0, ZB_MAXBLKID);
    803 }
    804 
    805 void
    806 traverse_add_objset(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
    807     uint64_t objset)
    808 {
    809 	if (th->th_advance & ADVANCE_PRE)
    810 		traverse_add_segment(th, mintxg, maxtxg,
    811 		    objset, 0, -1, 0,
    812 		    objset, ZB_MAXOBJECT, 0, ZB_MAXBLKID);
    813 	else
    814 		traverse_add_segment(th, mintxg, maxtxg,
    815 		    objset, 1, 0, 0,
    816 		    objset, 0, -1, 0);
    817 }
    818 
    819 void
    820 traverse_add_pool(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg)
    821 {
    822 	if (th->th_advance & ADVANCE_PRE)
    823 		traverse_add_segment(th, mintxg, maxtxg,
    824 		    0, 0, -1, 0,
    825 		    ZB_MAXOBJSET, ZB_MAXOBJECT, 0, ZB_MAXBLKID);
    826 	else
    827 		traverse_add_segment(th, mintxg, maxtxg,
    828 		    1, 1, 0, 0,
    829 		    0, 0, -1, 0);
    830 }
    831 
    832 traverse_handle_t *
    833 traverse_init(spa_t *spa, blkptr_cb_t func, void *arg, int advance,
    834     int zio_flags)
    835 {
    836 	traverse_handle_t *th;
    837 	int d, l;
    838 
    839 	th = kmem_zalloc(sizeof (*th), KM_SLEEP);
    840 
    841 	th->th_spa = spa;
    842 	th->th_func = func;
    843 	th->th_arg = arg;
    844 	th->th_advance = advance;
    845 	th->th_lastcb.zb_level = ZB_NO_LEVEL;
    846 	th->th_noread.zb_level = ZB_NO_LEVEL;
    847 	th->th_zio_flags = zio_flags;
    848 
    849 	list_create(&th->th_seglist, sizeof (zseg_t),
    850 	    offsetof(zseg_t, seg_node));
    851 
    852 	for (d = 0; d < ZB_DEPTH; d++) {
    853 		for (l = 0; l < ZB_MAXLEVEL; l++) {
    854 			if ((advance & ADVANCE_DATA) ||
    855 			    l != 0 || d != ZB_DN_CACHE)
    856 				th->th_cache[d][l].bc_data =
    857 				    zio_buf_alloc(SPA_MAXBLOCKSIZE);
    858 		}
    859 	}
    860 
    861 	return (th);
    862 }
    863 
    864 void
    865 traverse_fini(traverse_handle_t *th)
    866 {
    867 	int d, l;
    868 	zseg_t *zseg;
    869 
    870 	for (d = 0; d < ZB_DEPTH; d++)
    871 		for (l = 0; l < ZB_MAXLEVEL; l++)
    872 			if (th->th_cache[d][l].bc_data != NULL)
    873 				zio_buf_free(th->th_cache[d][l].bc_data,
    874 				    SPA_MAXBLOCKSIZE);
    875 
    876 	while ((zseg = list_head(&th->th_seglist)) != NULL) {
    877 		list_remove(&th->th_seglist, zseg);
    878 		kmem_free(zseg, sizeof (*zseg));
    879 	}
    880 
    881 	list_destroy(&th->th_seglist);
    882 
    883 	dprintf("%llu hit, %llu ARC, %llu IO, %llu cb, %llu sync, %llu again\n",
    884 	    th->th_hits, th->th_arc_hits, th->th_reads, th->th_callbacks,
    885 	    th->th_syncs, th->th_restarts);
    886 
    887 	kmem_free(th, sizeof (*th));
    888 }
    889