Home | History | Annotate | Download | only in zfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #pragma ident	"@(#)dmu_send.c	1.11	07/10/29 SMI"
     27 
     28 #include <sys/dmu.h>
     29 #include <sys/dmu_impl.h>
     30 #include <sys/dmu_tx.h>
     31 #include <sys/dbuf.h>
     32 #include <sys/dnode.h>
     33 #include <sys/zfs_context.h>
     34 #include <sys/dmu_objset.h>
     35 #include <sys/dmu_traverse.h>
     36 #include <sys/dsl_dataset.h>
     37 #include <sys/dsl_dir.h>
     38 #include <sys/dsl_pool.h>
     39 #include <sys/dsl_synctask.h>
     40 #include <sys/zfs_ioctl.h>
     41 #include <sys/zap.h>
     42 #include <sys/zio_checksum.h>
     43 
/* Hold tag used in this file when opening and closing datasets for a receive. */
static char *dmu_recv_tag = "dmu_recv_tag";
     45 
/*
 * State shared by dmu_sendbackup(), backup_cb() and the dump_*() helpers
 * while a send stream is being written.
 */
struct backuparg {
	dmu_replay_record_t *drr;	/* scratch record, reused per write */
	vnode_t *vp;			/* vnode the stream is written to */
	offset_t *off;			/* bumped by every dump_bytes() len */
	objset_t *os;			/* objset being sent */
	zio_cksum_t zc;			/* running fletcher-4 of the stream */
	int err;			/* result of the last vn_rdwr() */
};
     54 
     55 static int
     56 dump_bytes(struct backuparg *ba, void *buf, int len)
     57 {
     58 	ssize_t resid; /* have to get resid to get detailed errno */
     59 	ASSERT3U(len % 8, ==, 0);
     60 
     61 	fletcher_4_incremental_native(buf, len, &ba->zc);
     62 	ba->err = vn_rdwr(UIO_WRITE, ba->vp,
     63 	    (caddr_t)buf, len,
     64 	    0, UIO_SYSSPACE, FAPPEND, RLIM64_INFINITY, CRED(), &resid);
     65 	*ba->off += len;
     66 	return (ba->err);
     67 }
     68 
     69 static int
     70 dump_free(struct backuparg *ba, uint64_t object, uint64_t offset,
     71     uint64_t length)
     72 {
     73 	/* write a FREE record */
     74 	bzero(ba->drr, sizeof (dmu_replay_record_t));
     75 	ba->drr->drr_type = DRR_FREE;
     76 	ba->drr->drr_u.drr_free.drr_object = object;
     77 	ba->drr->drr_u.drr_free.drr_offset = offset;
     78 	ba->drr->drr_u.drr_free.drr_length = length;
     79 
     80 	if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
     81 		return (EINTR);
     82 	return (0);
     83 }
     84 
     85 static int
     86 dump_data(struct backuparg *ba, dmu_object_type_t type,
     87     uint64_t object, uint64_t offset, int blksz, void *data)
     88 {
     89 	/* write a DATA record */
     90 	bzero(ba->drr, sizeof (dmu_replay_record_t));
     91 	ba->drr->drr_type = DRR_WRITE;
     92 	ba->drr->drr_u.drr_write.drr_object = object;
     93 	ba->drr->drr_u.drr_write.drr_type = type;
     94 	ba->drr->drr_u.drr_write.drr_offset = offset;
     95 	ba->drr->drr_u.drr_write.drr_length = blksz;
     96 
     97 	if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
     98 		return (EINTR);
     99 	if (dump_bytes(ba, data, blksz))
    100 		return (EINTR);
    101 	return (0);
    102 }
    103 
    104 static int
    105 dump_freeobjects(struct backuparg *ba, uint64_t firstobj, uint64_t numobjs)
    106 {
    107 	/* write a FREEOBJECTS record */
    108 	bzero(ba->drr, sizeof (dmu_replay_record_t));
    109 	ba->drr->drr_type = DRR_FREEOBJECTS;
    110 	ba->drr->drr_u.drr_freeobjects.drr_firstobj = firstobj;
    111 	ba->drr->drr_u.drr_freeobjects.drr_numobjs = numobjs;
    112 
    113 	if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
    114 		return (EINTR);
    115 	return (0);
    116 }
    117 
    118 static int
    119 dump_dnode(struct backuparg *ba, uint64_t object, dnode_phys_t *dnp)
    120 {
    121 	if (dnp == NULL || dnp->dn_type == DMU_OT_NONE)
    122 		return (dump_freeobjects(ba, object, 1));
    123 
    124 	/* write an OBJECT record */
    125 	bzero(ba->drr, sizeof (dmu_replay_record_t));
    126 	ba->drr->drr_type = DRR_OBJECT;
    127 	ba->drr->drr_u.drr_object.drr_object = object;
    128 	ba->drr->drr_u.drr_object.drr_type = dnp->dn_type;
    129 	ba->drr->drr_u.drr_object.drr_bonustype = dnp->dn_bonustype;
    130 	ba->drr->drr_u.drr_object.drr_blksz =
    131 	    dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
    132 	ba->drr->drr_u.drr_object.drr_bonuslen = dnp->dn_bonuslen;
    133 	ba->drr->drr_u.drr_object.drr_checksum = dnp->dn_checksum;
    134 	ba->drr->drr_u.drr_object.drr_compress = dnp->dn_compress;
    135 
    136 	if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
    137 		return (EINTR);
    138 
    139 	if (dump_bytes(ba, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)))
    140 		return (EINTR);
    141 
    142 	/* free anything past the end of the file */
    143 	if (dump_free(ba, object, (dnp->dn_maxblkid + 1) *
    144 	    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL))
    145 		return (EINTR);
    146 	if (ba->err)
    147 		return (EINTR);
    148 	return (0);
    149 }
    150 
    151 #define	BP_SPAN(dnp, level) \
    152 	(((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
    153 	(level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)))
    154 
    155 static int
    156 backup_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
    157 {
    158 	struct backuparg *ba = arg;
    159 	uint64_t object = bc->bc_bookmark.zb_object;
    160 	int level = bc->bc_bookmark.zb_level;
    161 	uint64_t blkid = bc->bc_bookmark.zb_blkid;
    162 	blkptr_t *bp = bc->bc_blkptr.blk_birth ? &bc->bc_blkptr : NULL;
    163 	dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
    164 	void *data = bc->bc_data;
    165 	int err = 0;
    166 
    167 	if (issig(JUSTLOOKING) && issig(FORREAL))
    168 		return (EINTR);
    169 
    170 	ASSERT(data || bp == NULL);
    171 
    172 	if (bp == NULL && object == 0) {
    173 		uint64_t span = BP_SPAN(bc->bc_dnode, level);
    174 		uint64_t dnobj = (blkid * span) >> DNODE_SHIFT;
    175 		err = dump_freeobjects(ba, dnobj, span >> DNODE_SHIFT);
    176 	} else if (bp == NULL) {
    177 		uint64_t span = BP_SPAN(bc->bc_dnode, level);
    178 		err = dump_free(ba, object, blkid * span, span);
    179 	} else if (data && level == 0 && type == DMU_OT_DNODE) {
    180 		dnode_phys_t *blk = data;
    181 		int i;
    182 		int blksz = BP_GET_LSIZE(bp);
    183 
    184 		for (i = 0; i < blksz >> DNODE_SHIFT; i++) {
    185 			uint64_t dnobj =
    186 			    (blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i;
    187 			err = dump_dnode(ba, dnobj, blk+i);
    188 			if (err)
    189 				break;
    190 		}
    191 	} else if (level == 0 &&
    192 	    type != DMU_OT_DNODE && type != DMU_OT_OBJSET) {
    193 		int blksz = BP_GET_LSIZE(bp);
    194 		if (data == NULL) {
    195 			uint32_t aflags = ARC_WAIT;
    196 			arc_buf_t *abuf;
    197 			zbookmark_t zb;
    198 
    199 			zb.zb_objset = ba->os->os->os_dsl_dataset->ds_object;
    200 			zb.zb_object = object;
    201 			zb.zb_level = level;
    202 			zb.zb_blkid = blkid;
    203 			(void) arc_read(NULL, spa, bp,
    204 			    dmu_ot[type].ot_byteswap, arc_getbuf_func, &abuf,
    205 			    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_MUSTSUCCEED,
    206 			    &aflags, &zb);
    207 
    208 			if (abuf) {
    209 				err = dump_data(ba, type, object, blkid * blksz,
    210 				    blksz, abuf->b_data);
    211 				(void) arc_buf_remove_ref(abuf, &abuf);
    212 			}
    213 		} else {
    214 			err = dump_data(ba, type, object, blkid * blksz,
    215 			    blksz, data);
    216 		}
    217 	}
    218 
    219 	ASSERT(err == 0 || err == EINTR);
    220 	return (err);
    221 }
    222 
    223 int
    224 dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin,
    225     vnode_t *vp, offset_t *off)
    226 {
    227 	dsl_dataset_t *ds = tosnap->os->os_dsl_dataset;
    228 	dsl_dataset_t *fromds = fromsnap ? fromsnap->os->os_dsl_dataset : NULL;
    229 	dmu_replay_record_t *drr;
    230 	struct backuparg ba;
    231 	int err;
    232 	uint64_t fromtxg = 0;
    233 
    234 	/* tosnap must be a snapshot */
    235 	if (ds->ds_phys->ds_next_snap_obj == 0)
    236 		return (EINVAL);
    237 
    238 	/* fromsnap must be an earlier snapshot from the same fs as tosnap */
    239 	if (fromds && (ds->ds_dir != fromds->ds_dir ||
    240 	    fromds->ds_phys->ds_creation_txg >= ds->ds_phys->ds_creation_txg))
    241 		return (EXDEV);
    242 
    243 	if (fromorigin) {
    244 		if (fromsnap)
    245 			return (EINVAL);
    246 
    247 		if (ds->ds_dir->dd_phys->dd_origin_obj != NULL) {
    248 			dsl_pool_t *dp = ds->ds_dir->dd_pool;
    249 			rw_enter(&dp->dp_config_rwlock, RW_READER);
    250 			err = dsl_dataset_open_obj(dp,
    251 			    ds->ds_dir->dd_phys->dd_origin_obj, NULL,
    252 			    DS_MODE_NONE, FTAG, &fromds);
    253 			rw_exit(&dp->dp_config_rwlock);
    254 			if (err)
    255 				return (err);
    256 		} else {
    257 			fromorigin = B_FALSE;
    258 		}
    259 	}
    260 
    261 
    262 	drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
    263 	drr->drr_type = DRR_BEGIN;
    264 	drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
    265 	drr->drr_u.drr_begin.drr_version = DMU_BACKUP_STREAM_VERSION;
    266 	drr->drr_u.drr_begin.drr_creation_time =
    267 	    ds->ds_phys->ds_creation_time;
    268 	drr->drr_u.drr_begin.drr_type = tosnap->os->os_phys->os_type;
    269 	if (fromorigin)
    270 		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE;
    271 	drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid;
    272 	if (fromds)
    273 		drr->drr_u.drr_begin.drr_fromguid = fromds->ds_phys->ds_guid;
    274 	dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname);
    275 
    276 	if (fromds)
    277 		fromtxg = fromds->ds_phys->ds_creation_txg;
    278 	if (fromorigin)
    279 		dsl_dataset_close(fromds, DS_MODE_NONE, FTAG);
    280 
    281 	ba.drr = drr;
    282 	ba.vp = vp;
    283 	ba.os = tosnap;
    284 	ba.off = off;
    285 	ZIO_SET_CHECKSUM(&ba.zc, 0, 0, 0, 0);
    286 
    287 	if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) {
    288 		kmem_free(drr, sizeof (dmu_replay_record_t));
    289 		return (ba.err);
    290 	}
    291 
    292 	err = traverse_dsl_dataset(ds, fromtxg,
    293 	    ADVANCE_PRE | ADVANCE_HOLES | ADVANCE_DATA | ADVANCE_NOLOCK,
    294 	    backup_cb, &ba);
    295 
    296 	if (err) {
    297 		if (err == EINTR && ba.err)
    298 			err = ba.err;
    299 		kmem_free(drr, sizeof (dmu_replay_record_t));
    300 		return (err);
    301 	}
    302 
    303 	bzero(drr, sizeof (dmu_replay_record_t));
    304 	drr->drr_type = DRR_END;
    305 	drr->drr_u.drr_end.drr_checksum = ba.zc;
    306 
    307 	if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) {
    308 		kmem_free(drr, sizeof (dmu_replay_record_t));
    309 		return (ba.err);
    310 	}
    311 
    312 	kmem_free(drr, sizeof (dmu_replay_record_t));
    313 
    314 	return (0);
    315 }
    316 
/*
 * Argument block handed by dmu_recv_begin() to the recv_*_check() and
 * recv_*_sync() sync-task callbacks.
 */
struct recvbeginsyncarg {
	const char *tofs;	/* name of the filesystem being received */
	const char *tosnap;	/* name of the snapshot being received */
	dsl_dataset_t *origin;	/* clone origin, or NULL */
	uint64_t fromguid;	/* drr_fromguid from the BEGIN record */
	dmu_objset_type_t type;	/* drr_type from the BEGIN record */
	void *tag;		/* tag passed to dsl_dataset_destroy_*() */
	boolean_t force;	/* skip the modified-since-snapshot check */
	char clonelastname[MAXNAMELEN];	/* temp clone name (online recv) */
	dsl_dataset_t *ds; /* the ds to recv into; returned from the syncfunc */
};
    328 
    329 static dsl_dataset_t *
    330 recv_full_sync_impl(dsl_pool_t *dp, uint64_t dsobj, dmu_objset_type_t type,
    331     cred_t *cr, dmu_tx_t *tx)
    332 {
    333 	dsl_dataset_t *ds;
    334 
    335 	VERIFY(0 == dsl_dataset_open_obj(dp, dsobj, NULL,
    336 	    DS_MODE_EXCLUSIVE, dmu_recv_tag, &ds));
    337 
    338 	if (type != DMU_OST_NONE) {
    339 		(void) dmu_objset_create_impl(dp->dp_spa,
    340 		    ds, &ds->ds_phys->ds_bp, type, tx);
    341 	}
    342 
    343 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
    344 	ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
    345 
    346 	spa_history_internal_log(LOG_DS_REPLAY_FULL_SYNC,
    347 	    ds->ds_dir->dd_pool->dp_spa, tx, cr, "dataset = %lld",
    348 	    ds->ds_phys->ds_dir_obj);
    349 
    350 	return (ds);
    351 }
    352 
    353 /* ARGSUSED */
    354 static int
    355 recv_full_check(void *arg1, void *arg2, dmu_tx_t *tx)
    356 {
    357 	dsl_dir_t *dd = arg1;
    358 	struct recvbeginsyncarg *rbsa = arg2;
    359 	objset_t *mos = dd->dd_pool->dp_meta_objset;
    360 	uint64_t val;
    361 	int err;
    362 
    363 	err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj,
    364 	    strrchr(rbsa->tofs, '/') + 1, sizeof (uint64_t), 1, &val);
    365 
    366 	if (err != ENOENT)
    367 		return (err ? err : EEXIST);
    368 
    369 	if (rbsa->origin) {
    370 		/* make sure it's a snap in the same pool */
    371 		if (rbsa->origin->ds_dir->dd_pool != dd->dd_pool)
    372 			return (EXDEV);
    373 		if (rbsa->origin->ds_phys->ds_num_children == 0)
    374 			return (EINVAL);
    375 		if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid)
    376 			return (ENODEV);
    377 	}
    378 
    379 	return (0);
    380 }
    381 
    382 static void
    383 recv_full_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
    384 {
    385 	dsl_dir_t *dd = arg1;
    386 	struct recvbeginsyncarg *rbsa = arg2;
    387 	uint64_t dsobj;
    388 
    389 	dsobj = dsl_dataset_create_sync(dd, strrchr(rbsa->tofs, '/') + 1,
    390 	    rbsa->origin, cr, tx);
    391 
    392 	rbsa->ds = recv_full_sync_impl(dd->dd_pool, dsobj,
    393 	    rbsa->origin ? DMU_OST_NONE : rbsa->type, cr, tx);
    394 }
    395 
    396 static int
    397 recv_full_existing_check(void *arg1, void *arg2, dmu_tx_t *tx)
    398 {
    399 	dsl_dataset_t *ds = arg1;
    400 	struct recvbeginsyncarg *rbsa = arg2;
    401 	int err;
    402 
    403 	/* must be a head ds */
    404 	if (ds->ds_phys->ds_next_snap_obj != 0)
    405 		return (EINVAL);
    406 
    407 	/* must not be a clone ds */
    408 	if (ds->ds_prev != NULL)
    409 		return (EINVAL);
    410 
    411 	err = dsl_dataset_destroy_check(ds, rbsa->tag, tx);
    412 	if (err)
    413 		return (err);
    414 
    415 	if (rbsa->origin) {
    416 		/* make sure it's a snap in the same pool */
    417 		if (rbsa->origin->ds_dir->dd_pool != ds->ds_dir->dd_pool)
    418 			return (EXDEV);
    419 		if (rbsa->origin->ds_phys->ds_num_children == 0)
    420 			return (EINVAL);
    421 		if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid)
    422 			return (ENODEV);
    423 	}
    424 
    425 	return (0);
    426 }
    427 
    428 static void
    429 recv_full_existing_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
    430 {
    431 	dsl_dataset_t *ds = arg1;
    432 	struct recvbeginsyncarg *rbsa = arg2;
    433 	dsl_dir_t *dd = ds->ds_dir;
    434 	uint64_t dsobj;
    435 
    436 	/*
    437 	 * NB: caller must provide an extra hold on the dsl_dir_t, so it
    438 	 * won't go away when dsl_dataset_destroy_sync() closes the
    439 	 * dataset.
    440 	 */
    441 	dsl_dataset_destroy_sync(ds, rbsa->tag, cr, tx);
    442 
    443 	dsobj = dsl_dataset_create_sync_impl(dd, rbsa->origin, tx);
    444 
    445 	rbsa->ds = recv_full_sync_impl(dd->dd_pool, dsobj,
    446 	    rbsa->origin ? DMU_OST_NONE : rbsa->type, cr, tx);
    447 }
    448 
    449 /* ARGSUSED */
    450 static int
    451 recv_incremental_check(void *arg1, void *arg2, dmu_tx_t *tx)
    452 {
    453 	dsl_dataset_t *ds = arg1;
    454 	struct recvbeginsyncarg *rbsa = arg2;
    455 	int err;
    456 	uint64_t val;
    457 
    458 	/* must not have any changes since most recent snapshot */
    459 	if (!rbsa->force && dsl_dataset_modified_since_lastsnap(ds))
    460 		return (ETXTBSY);
    461 
    462 	/* must already be a snapshot of this fs */
    463 	if (ds->ds_phys->ds_prev_snap_obj == 0)
    464 		return (ENODEV);
    465 
    466 	/* most recent snapshot must match fromguid */
    467 	if (ds->ds_prev->ds_phys->ds_guid != rbsa->fromguid)
    468 		return (ENODEV);
    469 
    470 	/* new snapshot name must not exist */
    471 	err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
    472 	    ds->ds_phys->ds_snapnames_zapobj, rbsa->tosnap, 8, 1, &val);
    473 	if (err == 0)
    474 		return (EEXIST);
    475 	if (err != ENOENT)
    476 		return (err);
    477 	return (0);
    478 }
    479 
    480 /* ARGSUSED */
    481 static void
    482 recv_online_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
    483 {
    484 	dsl_dataset_t *ohds = arg1;
    485 	struct recvbeginsyncarg *rbsa = arg2;
    486 	dsl_pool_t *dp = ohds->ds_dir->dd_pool;
    487 	dsl_dataset_t *ods, *cds;
    488 	uint64_t dsobj;
    489 
    490 	/* create the temporary clone */
    491 	VERIFY(0 == dsl_dataset_open_obj(dp, ohds->ds_phys->ds_prev_snap_obj,
    492 	    NULL, DS_MODE_STANDARD, FTAG, &ods));
    493 	dsobj = dsl_dataset_create_sync(ohds->ds_dir,
    494 	    rbsa->clonelastname, ods, cr, tx);
    495 	dsl_dataset_close(ods, DS_MODE_STANDARD, FTAG);
    496 
    497 	/* open the temporary clone */
    498 	VERIFY(0 == dsl_dataset_open_obj(dp, dsobj, NULL,
    499 	    DS_MODE_EXCLUSIVE, dmu_recv_tag, &cds));
    500 
    501 	/* copy the refquota from the target fs to the clone */
    502 	if (ohds->ds_quota > 0)
    503 		dsl_dataset_set_quota_sync(cds, &ohds->ds_quota, cr, tx);
    504 
    505 	dmu_buf_will_dirty(cds->ds_dbuf, tx);
    506 	cds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
    507 
    508 	rbsa->ds = cds;
    509 
    510 	spa_history_internal_log(LOG_DS_REPLAY_INC_SYNC,
    511 	    dp->dp_spa, tx, cr, "dataset = %lld",
    512 	    cds->ds_phys->ds_dir_obj);
    513 }
    514 
    515 /* ARGSUSED */
    516 static void
    517 recv_offline_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
    518 {
    519 	dsl_dataset_t *ds = arg1;
    520 
    521 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
    522 	ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
    523 
    524 	spa_history_internal_log(LOG_DS_REPLAY_INC_SYNC,
    525 	    ds->ds_dir->dd_pool->dp_spa, tx, cr, "dataset = %lld",
    526 	    ds->ds_phys->ds_dir_obj);
    527 }
    528 
    529 /*
    530  * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin()
    531  * succeeds; otherwise we will leak the holds on the datasets.
    532  */
    533 int
    534 dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb,
    535     boolean_t force, objset_t *origin, boolean_t online, dmu_recv_cookie_t *drc)
    536 {
    537 	int err = 0;
    538 	boolean_t byteswap;
    539 	struct recvbeginsyncarg rbsa;
    540 	uint64_t version;
    541 	int flags;
    542 	dsl_dataset_t *ds;
    543 
    544 	if (drrb->drr_magic == DMU_BACKUP_MAGIC)
    545 		byteswap = FALSE;
    546 	else if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
    547 		byteswap = TRUE;
    548 	else
    549 		return (EINVAL);
    550 
    551 	rbsa.tofs = tofs;
    552 	rbsa.tosnap = tosnap;
    553 	rbsa.origin = origin ? origin->os->os_dsl_dataset : NULL;
    554 	rbsa.fromguid = drrb->drr_fromguid;
    555 	rbsa.type = drrb->drr_type;
    556 	rbsa.tag = FTAG;
    557 	version = drrb->drr_version;
    558 	flags = drrb->drr_flags;
    559 
    560 	if (byteswap) {
    561 		rbsa.type = BSWAP_32(rbsa.type);
    562 		rbsa.fromguid = BSWAP_64(rbsa.fromguid);
    563 		version = BSWAP_64(version);
    564 		flags = BSWAP_32(flags);
    565 	}
    566 
    567 	if (version != DMU_BACKUP_STREAM_VERSION ||
    568 	    rbsa.type >= DMU_OST_NUMTYPES ||
    569 	    ((flags & DRR_FLAG_CLONE) && origin == NULL))
    570 		return (EINVAL);
    571 
    572 	bzero(drc, sizeof (dmu_recv_cookie_t));
    573 	drc->drc_drrb = drrb;
    574 	drc->drc_tosnap = tosnap;
    575 	drc->drc_force = force;
    576 
    577 	/*
    578 	 * Process the begin in syncing context.
    579 	 */
    580 	if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE) && !online) {
    581 		/* offline incremental receive */
    582 		err = dsl_dataset_open(tofs,
    583 		    DS_MODE_EXCLUSIVE, dmu_recv_tag, &ds);
    584 		if (err)
    585 			return (err);
    586 
    587 		/*
    588 		 * Only do the rollback if the most recent snapshot
    589 		 * matches the incremental source
    590 		 */
    591 		if (force) {
    592 			if (ds->ds_prev == NULL ||
    593 			    ds->ds_prev->ds_phys->ds_guid !=
    594 			    rbsa.fromguid) {
    595 				dsl_dataset_close(ds, DS_MODE_EXCLUSIVE,
    596 				    dmu_recv_tag);
    597 				return (ENODEV);
    598 			}
    599 			(void) dsl_dataset_rollback(ds, DMU_OST_NONE);
    600 		}
    601 		rbsa.force = B_FALSE;
    602 		err = dsl_sync_task_do(ds->ds_dir->dd_pool,
    603 		    recv_incremental_check,
    604 		    recv_offline_incremental_sync,
    605 		    ds, &rbsa, 1);
    606 		if (err) {
    607 			dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, dmu_recv_tag);
    608 			return (err);
    609 		}
    610 		drc->drc_logical_ds = drc->drc_real_ds = ds;
    611 	} else if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE)) {
    612 		/* online incremental receive */
    613 
    614 		/* tmp clone name is: tofs/%tosnap" */
    615 		(void) snprintf(rbsa.clonelastname, sizeof (rbsa.clonelastname),
    616 		    "%%%s", tosnap);
    617 
    618 		/* open the dataset we are logically receiving into */
    619 		err = dsl_dataset_open(tofs,
    620 		    DS_MODE_STANDARD, dmu_recv_tag, &ds);
    621 		if (err)
    622 			return (err);
    623 
    624 		rbsa.force = force;
    625 		err = dsl_sync_task_do(ds->ds_dir->dd_pool,
    626 		    recv_incremental_check,
    627 		    recv_online_incremental_sync, ds, &rbsa, 5);
    628 		if (err) {
    629 			dsl_dataset_close(ds, DS_MODE_STANDARD, dmu_recv_tag);
    630 			return (err);
    631 		}
    632 		drc->drc_logical_ds = ds;
    633 		drc->drc_real_ds = rbsa.ds;
    634 	} else {
    635 		/* create new fs -- full backup or clone */
    636 		dsl_dir_t *dd = NULL;
    637 		const char *tail;
    638 
    639 		err = dsl_dir_open(tofs, FTAG, &dd, &tail);
    640 		if (err)
    641 			return (err);
    642 		if (tail == NULL) {
    643 			if (!force) {
    644 				dsl_dir_close(dd, FTAG);
    645 				return (EEXIST);
    646 			}
    647 
    648 			rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
    649 			err = dsl_dataset_open_obj(dd->dd_pool,
    650 			    dd->dd_phys->dd_head_dataset_obj, NULL,
    651 			    DS_MODE_EXCLUSIVE | DS_MODE_INCONSISTENT,
    652 			    FTAG, &ds);
    653 			rw_exit(&dd->dd_pool->dp_config_rwlock);
    654 			if (err) {
    655 				dsl_dir_close(dd, FTAG);
    656 				return (err);
    657 			}
    658 
    659 			err = dsl_sync_task_do(dd->dd_pool,
    660 			    recv_full_existing_check,
    661 			    recv_full_existing_sync, ds, &rbsa, 5);
    662 			/* if successful, sync task closes the ds for us */
    663 			if (err)
    664 				dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
    665 		} else {
    666 			err = dsl_sync_task_do(dd->dd_pool, recv_full_check,
    667 			    recv_full_sync, dd, &rbsa, 5);
    668 			if (err)
    669 				return (err);
    670 		}
    671 		dsl_dir_close(dd, FTAG);
    672 		if (err)
    673 			return (err);
    674 		drc->drc_logical_ds = drc->drc_real_ds = rbsa.ds;
    675 		drc->drc_newfs = B_TRUE;
    676 	}
    677 
    678 	/* downgrade our hold on the ds from EXCLUSIVE to PRIMARY */
    679 	dsl_dataset_downgrade(drc->drc_real_ds,
    680 	    DS_MODE_EXCLUSIVE, DS_MODE_PRIMARY);
    681 
    682 	return (0);
    683 }
    684 
/* State used while reading and applying a receive stream. */
struct restorearg {
	int err;	/* last error recorded by restore_read() */
	int byteswap;	/* nonzero: checksum with the byteswap variant */
	vnode_t *vp;	/* vnode the stream is read from */
	char *buf;	/* buffer that restore_read() fills and returns */
	uint64_t voff;	/* read offset, advanced by the bytes read */
	int bufsize; /* amount of memory allocated for buf */
	zio_cksum_t cksum;	/* running fletcher-4 of the data read */
};
    694 
    695 static void *
    696 restore_read(struct restorearg *ra, int len)
    697 {
    698 	void *rv;
    699 	int done = 0;
    700 
    701 	/* some things will require 8-byte alignment, so everything must */
    702 	ASSERT3U(len % 8, ==, 0);
    703 
    704 	while (done < len) {
    705 		ssize_t resid;
    706 
    707 		ra->err = vn_rdwr(UIO_READ, ra->vp,
    708 		    (caddr_t)ra->buf + done, len - done,
    709 		    ra->voff, UIO_SYSSPACE, FAPPEND,
    710 		    RLIM64_INFINITY, CRED(), &resid);
    711 
    712 		if (resid == len - done)
    713 			ra->err = EINVAL;
    714 		ra->voff += len - done - resid;
    715 		done = len - resid;
    716 		if (ra->err)
    717 			return (NULL);
    718 	}
    719 
    720 	ASSERT3U(done, ==, len);
    721 	rv = ra->buf;
    722 	if (ra->byteswap)
    723 		fletcher_4_incremental_byteswap(rv, len, &ra->cksum);
    724 	else
    725 		fletcher_4_incremental_native(rv, len, &ra->cksum);
    726 	return (rv);
    727 }
    728 
    729 static void
    730 backup_byteswap(dmu_replay_record_t *drr)
    731 {
    732 #define	DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X))
    733 #define	DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X))
    734 	drr->drr_type = BSWAP_32(drr->drr_type);
    735 	drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen);
    736 	switch (drr->drr_type) {
    737 	case DRR_BEGIN:
    738 		DO64(drr_begin.drr_magic);
    739 		DO64(drr_begin.drr_version);
    740 		DO64(drr_begin.drr_creation_time);
    741 		DO32(drr_begin.drr_type);
    742 		DO32(drr_begin.drr_flags);
    743 		DO64(drr_begin.drr_toguid);
    744 		DO64(drr_begin.drr_fromguid);
    745 		break;
    746 	case DRR_OBJECT:
    747 		DO64(drr_object.drr_object);
    748 		/* DO64(drr_object.drr_allocation_txg); */
    749 		DO32(drr_object.drr_type);
    750 		DO32(drr_object.drr_bonustype);
    751 		DO32(drr_object.drr_blksz);
    752 		DO32(drr_object.drr_bonuslen);
    753 		break;
    754 	case DRR_FREEOBJECTS:
    755 		DO64(drr_freeobjects.drr_firstobj);
    756 		DO64(drr_freeobjects.drr_numobjs);
    757 		break;
    758 	case DRR_WRITE:
    759 		DO64(drr_write.drr_object);
    760 		DO32(drr_write.drr_type);
    761 		DO64(drr_write.drr_offset);
    762 		DO64(drr_write.drr_length);
    763 		break;
    764 	case DRR_FREE:
    765 		DO64(drr_free.drr_object);
    766 		DO64(drr_free.drr_offset);
    767 		DO64(drr_free.drr_length);
    768 		break;
    769 	case DRR_END:
    770 		DO64(drr_end.drr_checksum.zc_word[0]);
    771 		DO64(drr_end.drr_checksum.zc_word[1]);
    772 		DO64(drr_end.drr_checksum.zc_word[2]);
    773 		DO64(drr_end.drr_checksum.zc_word[3]);
    774 		break;
    775 	}
    776 #undef DO64
    777 #undef DO32
    778 }
    779 
    780 static int
    781 restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
    782 {
    783 	int err;
    784 	dmu_tx_t *tx;
    785 
    786 	err = dmu_object_info(os, drro->drr_object, NULL);
    787 
    788 	if (err != 0 && err != ENOENT)
    789 		return (EINVAL);
    790 
    791 	if (drro->drr_type == DMU_OT_NONE ||
    792 	    drro->drr_type >= DMU_OT_NUMTYPES ||
    793 	    drro->drr_bonustype >= DMU_OT_NUMTYPES ||
    794 	    drro->drr_checksum >= ZIO_CHECKSUM_FUNCTIONS ||
    795 	    drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
    796 	    P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
    797 	    drro->drr_blksz < SPA_MINBLOCKSIZE ||
    798 	    drro->drr_blksz > SPA_MAXBLOCKSIZE ||
    799 	    drro->drr_bonuslen > DN_MAX_BONUSLEN) {
    800 		return (EINVAL);
    801 	}
    802 
    803 	tx = dmu_tx_create(os);
    804 
    805 	if (err == ENOENT) {
    806 		/* currently free, want to be allocated */
    807 		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
    808 		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 1);
    809 		err = dmu_tx_assign(tx, TXG_WAIT);
    810 		if (err) {
    811 			dmu_tx_abort(tx);
    812 			return (err);
    813 		}
    814 		err = dmu_object_claim(os, drro->drr_object,
    815 		    drro->drr_type, drro->drr_blksz,
    816 		    drro->drr_bonustype, drro->drr_bonuslen, tx);
    817 	} else {
    818 		/* currently allocated, want to be allocated */
    819 		dmu_tx_hold_bonus(tx, drro->drr_object);
    820 		/*
    821 		 * We may change blocksize, so need to
    822 		 * hold_write
    823 		 */
    824 		dmu_tx_hold_write(tx, drro->drr_object, 0, 1);
    825 		err = dmu_tx_assign(tx, TXG_WAIT);
    826 		if (err) {
    827 			dmu_tx_abort(tx);
    828 			return (err);
    829 		}
    830 
    831 		err = dmu_object_reclaim(os, drro->drr_object,
    832 		    drro->drr_type, drro->drr_blksz,
    833 		    drro->drr_bonustype, drro->drr_bonuslen, tx);
    834 	}
    835 	if (err) {
    836 		dmu_tx_commit(tx);
    837 		return (EINVAL);
    838 	}
    839 
    840 	dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksum, tx);
    841 	dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx);
    842 
    843 	if (drro->drr_bonuslen) {
    844 		dmu_buf_t *db;
    845 		void *data;
    846 		VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db));
    847 		dmu_buf_will_dirty(db, tx);
    848 
    849 		ASSERT3U(db->db_size, >=, drro->drr_bonuslen);
    850 		data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8));
    851 		if (data == NULL) {
    852 			dmu_tx_commit(tx);
    853 			return (ra->err);
    854 		}
    855 		bcopy(data, db->db_data, drro->drr_bonuslen);
    856 		if (ra->byteswap) {
    857 			dmu_ot[drro->drr_bonustype].ot_byteswap(db->db_data,
    858 			    drro->drr_bonuslen);
    859 		}
    860 		dmu_buf_rele(db, FTAG);
    861 	}
    862 	dmu_tx_commit(tx);
    863 	return (0);
    864 }
    865 
    866 /* ARGSUSED */
    867 static int
    868 restore_freeobjects(struct restorearg *ra, objset_t *os,
    869     struct drr_freeobjects *drrfo)
    870 {
    871 	uint64_t obj;
    872 
    873 	if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj)
    874 		return (EINVAL);
    875 
    876 	for (obj = drrfo->drr_firstobj;
    877 	    obj < drrfo->drr_firstobj + drrfo->drr_numobjs;
    878 	    (void) dmu_object_next(os, &