Home | History | Annotate | Download | only in zfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*
     27  * ZFS volume emulation driver.
     28  *
     29  * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
     30  * Volumes are accessed through the symbolic links named:
     31  *
     32  * /dev/zvol/dsk/<pool_name>/<dataset_name>
     33  * /dev/zvol/rdsk/<pool_name>/<dataset_name>
     34  *
     35  * These links are created by the ZFS-specific devfsadm link generator.
     36  * Volumes are persistent through reboot.  No user command needs to be
     37  * run before opening and using a device.
     38  */
     39 
     40 #include <sys/types.h>
     41 #include <sys/param.h>
     42 #include <sys/errno.h>
     43 #include <sys/uio.h>
     44 #include <sys/buf.h>
     45 #include <sys/modctl.h>
     46 #include <sys/open.h>
     47 #include <sys/kmem.h>
     48 #include <sys/conf.h>
     49 #include <sys/cmn_err.h>
     50 #include <sys/stat.h>
     51 #include <sys/zap.h>
     52 #include <sys/spa.h>
     53 #include <sys/zio.h>
     54 #include <sys/dmu_traverse.h>
     55 #include <sys/dnode.h>
     56 #include <sys/dsl_dataset.h>
     57 #include <sys/dsl_prop.h>
     58 #include <sys/dkio.h>
     59 #include <sys/efi_partition.h>
     60 #include <sys/byteorder.h>
     61 #include <sys/pathname.h>
     62 #include <sys/ddi.h>
     63 #include <sys/sunddi.h>
     64 #include <sys/crc32.h>
     65 #include <sys/dirent.h>
     66 #include <sys/policy.h>
     67 #include <sys/fs/zfs.h>
     68 #include <sys/zfs_ioctl.h>
     69 #include <sys/mkdev.h>
     70 #include <sys/zil.h>
     71 #include <sys/refcount.h>
     72 #include <sys/zfs_znode.h>
     73 #include <sys/zfs_rlock.h>
     74 #include <sys/vdev_disk.h>
     75 #include <sys/vdev_impl.h>
     76 #include <sys/zvol.h>
     77 #include <sys/dumphdr.h>
     78 
     79 #include "zfs_namecheck.h"
     80 
/*
 * Soft-state anchor.  Each minor's zvol_state_t is allocated, looked up and
 * freed through this handle with ddi_soft_state_zalloc(),
 * ddi_get_soft_state() and ddi_soft_state_free().
 */
static void *zvol_state;

/* Name string "dumpsize"; its consumer is not in this part of the file. */
#define	ZVOL_DUMPSIZE		"dumpsize"

/*
 * This lock protects the zvol_state structure from being modified
 * while it's being used, e.g. an open that comes in before a create
 * finishes.  It also protects temporary opens of the dataset so that,
 * e.g., an open doesn't get a spurious EBUSY.
 */
static kmutex_t zvol_state_lock;
/*
 * Count of minors that currently exist: incremented at the end of
 * zvol_create_minor() and decremented in zvol_remove_minor().
 */
static uint32_t zvol_minors;
     93 
/*
 * Number of zvol_extent_t entries held by one zvol_ext_list_t node: as many
 * as fit in SPA_MAXBLOCKSIZE bytes.
 */
#define	NUM_EXTENTS	((SPA_MAXBLOCKSIZE) / sizeof (zvol_extent_t))

/*
 * A run of volume blocks on one vdev.  The i-th block of the run is found at
 * the offset of ze_dva plus i * ze_stride (see zvol_get_dva()).  An entry
 * whose ze_size is 0 has not been used yet (see zvol_map_block()).
 */
typedef struct zvol_extent {
	dva_t		ze_dva;		/* dva associated with this extent */
	uint64_t	ze_stride;	/* extent stride */
	uint64_t	ze_size;	/* number of blocks in extent */
} zvol_extent_t;

/*
 * The list of extents associated with the dump device.
 * Each node carries a fixed array of NUM_EXTENTS entries; a further node is
 * chained through zl_next once the array of the current node is full.
 */
typedef struct zvol_ext_list {
	zvol_extent_t		zl_extents[NUM_EXTENTS];
	struct zvol_ext_list	*zl_next;
} zvol_ext_list_t;
    109 
/*
 * The in-core state of each volume.
 *
 * One instance exists per minor, stored in the zvol_state soft-state array
 * and filled in by zvol_create_minor().  zvol_set_volsize() may also use a
 * zero-filled instance on its stack (zv_minor == 0) when no minor exists.
 */
typedef struct zvol_state {
	char		zv_name[MAXPATHLEN]; /* pool/dd name */
	uint64_t	zv_volsize;	/* amount of space we advertise */
	uint64_t	zv_volblocksize; /* volume block size */
	minor_t		zv_minor;	/* minor number */
	uint8_t		zv_min_bs;	/* minimum addressable block shift */
	uint8_t		zv_flags;	/* readonly; dumpified */
	objset_t	*zv_objset;	/* objset handle */
	uint32_t	zv_mode;	/* DS_MODE_* flags at open time */
	uint32_t	zv_open_count[OTYPCNT];	/* open counts */
	uint32_t	zv_total_opens;	/* total open count */
	zilog_t		*zv_zilog;	/* ZIL handle */
	zvol_ext_list_t	*zv_list;	/* List of extents for dump */
	uint64_t	zv_txg_assign;	/* txg to assign during ZIL replay */
	znode_t		zv_znode;	/* for range locking */
} zvol_state_t;
    129 
/*
 * zvol specific flags (bits of zvol_state_t.zv_flags)
 *
 * ZVOL_RDONLY		tracks the "readonly" property; maintained by
 *			zvol_readonly_changed_cb().
 * ZVOL_DUMPIFIED	tested by zvol_set_volsize() to decide whether the
 *			dump area must be re-initialized after a resize.
 * ZVOL_EXCL		set by an FEXCL open in zvol_open(); while set,
 *			further opens fail with EBUSY.
 */
#define	ZVOL_RDONLY	0x1
#define	ZVOL_DUMPIFIED	0x2
#define	ZVOL_EXCL	0x4

/*
 * zvol maximum transfer in one DMU tx.
 */
int zvol_maxphys = DMU_MAX_ACCESS/2;

/* Defined outside this file. */
extern int zfs_set_prop_nvlist(const char *, nvlist_t *);
/* Forward declarations of routines defined later in this file. */
static int zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio);
static int zvol_dumpify(zvol_state_t *zv);
static int zvol_dump_fini(zvol_state_t *zv);
static int zvol_dump_init(zvol_state_t *zv, boolean_t resize);
    147 
    148 static void
    149 zvol_size_changed(zvol_state_t *zv, major_t maj)
    150 {
    151 	dev_t dev = makedevice(maj, zv->zv_minor);
    152 
    153 	VERIFY(ddi_prop_update_int64(dev, zfs_dip,
    154 	    "Size", zv->zv_volsize) == DDI_SUCCESS);
    155 	VERIFY(ddi_prop_update_int64(dev, zfs_dip,
    156 	    "Nblocks", lbtodb(zv->zv_volsize)) == DDI_SUCCESS);
    157 
    158 	/* Notify specfs to invalidate the cached size */
    159 	spec_size_invalidate(dev, VBLK);
    160 	spec_size_invalidate(dev, VCHR);
    161 }
    162 
    163 int
    164 zvol_check_volsize(uint64_t volsize, uint64_t blocksize)
    165 {
    166 	if (volsize == 0)
    167 		return (EINVAL);
    168 
    169 	if (volsize % blocksize != 0)
    170 		return (EINVAL);
    171 
    172 #ifdef _ILP32
    173 	if (volsize - 1 > SPEC_MAXOFFSET_T)
    174 		return (EOVERFLOW);
    175 #endif
    176 	return (0);
    177 }
    178 
    179 int
    180 zvol_check_volblocksize(uint64_t volblocksize)
    181 {
    182 	if (volblocksize < SPA_MINBLOCKSIZE ||
    183 	    volblocksize > SPA_MAXBLOCKSIZE ||
    184 	    !ISP2(volblocksize))
    185 		return (EDOM);
    186 
    187 	return (0);
    188 }
    189 
    190 static void
    191 zvol_readonly_changed_cb(void *arg, uint64_t newval)
    192 {
    193 	zvol_state_t *zv = arg;
    194 
    195 	if (newval)
    196 		zv->zv_flags |= ZVOL_RDONLY;
    197 	else
    198 		zv->zv_flags &= ~ZVOL_RDONLY;
    199 }
    200 
    201 int
    202 zvol_get_stats(objset_t *os, nvlist_t *nv)
    203 {
    204 	int error;
    205 	dmu_object_info_t doi;
    206 	uint64_t val;
    207 
    208 
    209 	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val);
    210 	if (error)
    211 		return (error);
    212 
    213 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val);
    214 
    215 	error = dmu_object_info(os, ZVOL_OBJ, &doi);
    216 
    217 	if (error == 0) {
    218 		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE,
    219 		    doi.doi_data_block_size);
    220 	}
    221 
    222 	return (error);
    223 }
    224 
    225 /*
    226  * Find a free minor number.
    227  */
    228 static minor_t
    229 zvol_minor_alloc(void)
    230 {
    231 	minor_t minor;
    232 
    233 	ASSERT(MUTEX_HELD(&zvol_state_lock));
    234 
    235 	for (minor = 1; minor <= ZVOL_MAX_MINOR; minor++)
    236 		if (ddi_get_soft_state(zvol_state, minor) == NULL)
    237 			return (minor);
    238 
    239 	return (0);
    240 }
    241 
    242 static zvol_state_t *
    243 zvol_minor_lookup(const char *name)
    244 {
    245 	minor_t minor;
    246 	zvol_state_t *zv;
    247 
    248 	ASSERT(MUTEX_HELD(&zvol_state_lock));
    249 
    250 	for (minor = 1; minor <= ZVOL_MAX_MINOR; minor++) {
    251 		zv = ddi_get_soft_state(zvol_state, minor);
    252 		if (zv == NULL)
    253 			continue;
    254 		if (strcmp(zv->zv_name, name) == 0)
    255 			break;
    256 	}
    257 
    258 	return (zv);
    259 }
    260 
    261 void
    262 zvol_init_extent(zvol_extent_t *ze, blkptr_t *bp)
    263 {
    264 	ze->ze_dva = bp->blk_dva[0];	/* structure assignment */
    265 	ze->ze_stride = 0;
    266 	ze->ze_size = 1;
    267 }
    268 
/*
 * extent mapping arg
 *
 * Cursor handed to zvol_map_block() by zvol_get_lbas() while the extent
 * list is being built.
 */
struct maparg {
	zvol_ext_list_t	*ma_list;	/* list node currently being filled */
	zvol_extent_t	*ma_extent;	/* extent currently being grown */
	int		ma_gang;	/* bumped when a gang block is seen */
};
    275 
/*
 * Traversal callback (passed to traverse_zvol() by zvol_get_lbas()) that
 * folds each level-0 data block into the extent list tracked by arg, a
 * struct maparg.
 *
 * A block extends the current extent when it lives on the same vdev and is
 * either the second block of the extent (which fixes the stride) or lies
 * exactly one stride past the previous block.  Otherwise a new extent is
 * started, allocating another list node when the current array is full.
 *
 * Returns ERESTART if the traversal reported an error for this block,
 * EINTR (after bumping ma_gang) if the block is a gang block, 0 otherwise.
 */
/*ARGSUSED*/
static int
zvol_map_block(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
{
	zbookmark_t *zb = &bc->bc_bookmark;
	blkptr_t *bp = &bc->bc_blkptr;
	void *data = bc->bc_data;
	dnode_phys_t *dnp = bc->bc_dnode;
	struct maparg *ma = (struct maparg *)arg;
	uint64_t stride;

	/* If there is an error, then keep trying to make progress */
	if (bc->bc_errno)
		return (ERESTART);

#ifdef ZFS_DEBUG
	/* Debug-only consistency checks of the block pointer and fill counts */
	if (zb->zb_level == -1) {
		ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET);
		ASSERT3U(BP_GET_LEVEL(bp), ==, 0);
	} else {
		ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type);
		ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level);
	}

	if (zb->zb_level > 0) {
		uint64_t fill = 0;
		blkptr_t *bpx, *bpend;

		for (bpx = data, bpend = bpx + BP_GET_LSIZE(bp) / sizeof (*bpx);
		    bpx < bpend; bpx++) {
			if (bpx->blk_birth != 0) {
				fill += bpx->blk_fill;
			} else {
				ASSERT(bpx->blk_fill == 0);
			}
		}
		ASSERT3U(fill, ==, bp->blk_fill);
	}

	if (zb->zb_level == 0 && dnp->dn_type == DMU_OT_DNODE) {
		uint64_t fill = 0;
		dnode_phys_t *dnx, *dnend;

		for (dnx = data, dnend = dnx + (BP_GET_LSIZE(bp)>>DNODE_SHIFT);
		    dnx < dnend; dnx++) {
			if (dnx->dn_type != DMU_OT_NONE)
				fill++;
		}
		ASSERT3U(fill, ==, bp->blk_fill);
	}
#endif

	/* Only level-0 blocks of non-dnode objects are mapped */
	if (zb->zb_level || dnp->dn_type == DMU_OT_DNODE)
		return (0);

	/* Abort immediately if we have encountered gang blocks */
	if (BP_IS_GANG(bp)) {
		ma->ma_gang++;
		return (EINTR);
	}

	/* first time? */
	if (ma->ma_extent->ze_size == 0) {
		zvol_init_extent(ma->ma_extent, bp);
		return (0);
	}

	/*
	 * Distance from the last block recorded in the current extent to
	 * this block.
	 */
	stride = (DVA_GET_OFFSET(&bp->blk_dva[0])) -
	    ((DVA_GET_OFFSET(&ma->ma_extent->ze_dva)) +
	    (ma->ma_extent->ze_size - 1) * (ma->ma_extent->ze_stride));
	if (DVA_GET_VDEV(BP_IDENTITY(bp)) ==
	    DVA_GET_VDEV(&ma->ma_extent->ze_dva)) {
		if (ma->ma_extent->ze_stride == 0) {
			/* second block in this extent */
			ma->ma_extent->ze_stride = stride;
			ma->ma_extent->ze_size++;
			return (0);
		} else if (ma->ma_extent->ze_stride == stride) {
			/*
			 * the block we allocated has the same
			 * stride
			 */
			ma->ma_extent->ze_size++;
			return (0);
		}
	}

	/*
	 * dtrace -n 'zfs-dprintf
	 * /stringof(arg0) == "zvol.c"/
	 * {
	 *	printf("%s: %s", stringof(arg1), stringof(arg3))
	 * } '
	 */
	dprintf("ma_extent 0x%lx mrstride 0x%lx stride %lx\n",
	    ma->ma_extent->ze_size, ma->ma_extent->ze_stride, stride);
	dprintf_bp(bp, "%s", "next blkptr:");
	/* start a new extent */
	if (ma->ma_extent == &ma->ma_list->zl_extents[NUM_EXTENTS - 1]) {
		/* current node is full: chain a zero-filled node behind it */
		ma->ma_list->zl_next = kmem_zalloc(sizeof (zvol_ext_list_t),
		    KM_SLEEP);
		ma->ma_list = ma->ma_list->zl_next;
		ma->ma_extent = &ma->ma_list->zl_extents[0];
	} else {
		ma->ma_extent++;
	}
	zvol_init_extent(ma->ma_extent, bp);
	return (0);
}
    385 
    386 /* ARGSUSED */
    387 void
    388 zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
    389 {
    390 	zfs_creat_t *zct = arg;
    391 	nvlist_t *nvprops = zct->zct_props;
    392 	int error;
    393 	uint64_t volblocksize, volsize;
    394 
    395 	VERIFY(nvlist_lookup_uint64(nvprops,
    396 	    zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0);
    397 	if (nvlist_lookup_uint64(nvprops,
    398 	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0)
    399 		volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE);
    400 
    401 	/*
    402 	 * These properties must be removed from the list so the generic
    403 	 * property setting step won't apply to them.
    404 	 */
    405 	VERIFY(nvlist_remove_all(nvprops,
    406 	    zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0);
    407 	(void) nvlist_remove_all(nvprops,
    408 	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE));
    409 
    410 	error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize,
    411 	    DMU_OT_NONE, 0, tx);
    412 	ASSERT(error == 0);
    413 
    414 	error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP,
    415 	    DMU_OT_NONE, 0, tx);
    416 	ASSERT(error == 0);
    417 
    418 	error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx);
    419 	ASSERT(error == 0);
    420 }
    421 
    422 /*
    423  * Replay a TX_WRITE ZIL transaction that didn't get committed
    424  * after a system failure
    425  */
    426 static int
    427 zvol_replay_write(zvol_state_t *zv, lr_write_t *lr, boolean_t byteswap)
    428 {
    429 	objset_t *os = zv->zv_objset;
    430 	char *data = (char *)(lr + 1);	/* data follows lr_write_t */
    431 	uint64_t off = lr->lr_offset;
    432 	uint64_t len = lr->lr_length;
    433 	dmu_tx_t *tx;
    434 	int error;
    435 
    436 	if (byteswap)
    437 		byteswap_uint64_array(lr, sizeof (*lr));
    438 
    439 	tx = dmu_tx_create(os);
    440 	dmu_tx_hold_write(tx, ZVOL_OBJ, off, len);
    441 	error = dmu_tx_assign(tx, zv->zv_txg_assign);
    442 	if (error) {
    443 		dmu_tx_abort(tx);
    444 	} else {
    445 		dmu_write(os, ZVOL_OBJ, off, len, data, tx);
    446 		dmu_tx_commit(tx);
    447 	}
    448 
    449 	return (error);
    450 }
    451 
/*
 * Replay handler for every record type a zvol does not support: ignores
 * its arguments and returns ENOTSUP.
 */
/* ARGSUSED */
static int
zvol_replay_err(zvol_state_t *zv, lr_t *lr, boolean_t byteswap)
{
	return (ENOTSUP);
}
    458 
/*
 * Callback vectors for replaying records.
 * Only TX_WRITE is needed for zvol.
 *
 * The table is indexed by transaction type, so the order of the entries
 * must follow the TX_* numbering noted beside each one.  It is handed to
 * zil_replay() in zvol_create_minor().
 */
zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = {
	zvol_replay_err,	/* 0 no such transaction type */
	zvol_replay_err,	/* TX_CREATE */
	zvol_replay_err,	/* TX_MKDIR */
	zvol_replay_err,	/* TX_MKXATTR */
	zvol_replay_err,	/* TX_SYMLINK */
	zvol_replay_err,	/* TX_REMOVE */
	zvol_replay_err,	/* TX_RMDIR */
	zvol_replay_err,	/* TX_LINK */
	zvol_replay_err,	/* TX_RENAME */
	zvol_replay_write,	/* TX_WRITE */
	zvol_replay_err,	/* TX_TRUNCATE */
	zvol_replay_err,	/* TX_SETATTR */
	zvol_replay_err,	/* TX_ACL */
};
    478 
    479 /*
    480  * reconstruct dva that gets us to the desired offset (offset
    481  * is in bytes)
    482  */
    483 int
    484 zvol_get_dva(zvol_state_t *zv, uint64_t offset, dva_t *dva)
    485 {
    486 	zvol_ext_list_t	*zl;
    487 	zvol_extent_t	*ze;
    488 	int		idx;
    489 	uint64_t	tmp;
    490 
    491 	if ((zl = zv->zv_list) == NULL)
    492 		return (EIO);
    493 	idx = 0;
    494 	ze =  &zl->zl_extents[0];
    495 	while (offset >= ze->ze_size * zv->zv_volblocksize) {
    496 		offset -= ze->ze_size * zv->zv_volblocksize;
    497 
    498 		if (idx == NUM_EXTENTS - 1) {
    499 			/* we've reached the end of this array */
    500 			ASSERT(zl->zl_next != NULL);
    501 			if (zl->zl_next == NULL)
    502 				return (-1);
    503 			zl = zl->zl_next;
    504 			ze = &zl->zl_extents[0];
    505 			idx = 0;
    506 		} else {
    507 			ze++;
    508 			idx++;
    509 		}
    510 	}
    511 	DVA_SET_VDEV(dva, DVA_GET_VDEV(&ze->ze_dva));
    512 	tmp = DVA_GET_OFFSET((&ze->ze_dva));
    513 	tmp += (ze->ze_stride * (offset / zv->zv_volblocksize));
    514 	DVA_SET_OFFSET(dva, tmp);
    515 	return (0);
    516 }
    517 
    518 static void
    519 zvol_free_extents(zvol_state_t *zv)
    520 {
    521 	zvol_ext_list_t *zl;
    522 	zvol_ext_list_t *tmp;
    523 
    524 	if (zv->zv_list != NULL) {
    525 		zl = zv->zv_list;
    526 		while (zl != NULL) {
    527 			tmp = zl->zl_next;
    528 			kmem_free(zl, sizeof (zvol_ext_list_t));
    529 			zl = tmp;
    530 		}
    531 		zv->zv_list = NULL;
    532 	}
    533 }
    534 
    535 int
    536 zvol_get_lbas(zvol_state_t *zv)
    537 {
    538 	struct maparg	ma;
    539 	zvol_ext_list_t	*zl;
    540 	zvol_extent_t	*ze;
    541 	uint64_t	blocks = 0;
    542 	int		err;
    543 
    544 	ma.ma_list = zl = kmem_zalloc(sizeof (zvol_ext_list_t), KM_SLEEP);
    545 	ma.ma_extent = &ma.ma_list->zl_extents[0];
    546 	ma.ma_gang = 0;
    547 	zv->zv_list = ma.ma_list;
    548 
    549 	err = traverse_zvol(zv->zv_objset, ADVANCE_PRE, zvol_map_block, &ma);
    550 	if (err == EINTR && ma.ma_gang) {
    551 		/*
    552 		 * We currently don't support dump devices when the pool
    553 		 * is so fragmented that our allocation has resulted in
    554 		 * gang blocks.
    555 		 */
    556 		zvol_free_extents(zv);
    557 		return (EFRAGS);
    558 	}
    559 	ASSERT3U(err, ==, 0);
    560 
    561 	ze = &zl->zl_extents[0];
    562 	while (ze) {
    563 		blocks += ze->ze_size;
    564 		if (ze == &zl->zl_extents[NUM_EXTENTS - 1]) {
    565 			zl = zl->zl_next;
    566 			ze = &zl->zl_extents[0];
    567 		} else {
    568 			ze++;
    569 		}
    570 	}
    571 	if (blocks != (zv->zv_volsize / zv->zv_volblocksize)) {
    572 		zvol_free_extents(zv);
    573 		return (EIO);
    574 	}
    575 
    576 	return (0);
    577 }
    578 
/*
 * Create a minor node (plus a whole lot more) for the specified volume.
 *
 * With zvol_state_lock held throughout, this:
 *   - fails with EEXIST if the volume already has in-core state;
 *   - opens the objset (read-only as well when name contains '@') and
 *     reads its "size" from the ZVOL_ZAP_OBJ zap;
 *   - picks a minor number, preferring the one encoded in an existing
 *     /dev/zvol symlink if that minor is still free;
 *   - allocates the soft state and creates the character ("<minor>c,raw")
 *     and block ("<minor>c") minor nodes;
 *   - fills in the zvol_state_t, opens and replays the ZIL, publishes the
 *     size and registers the "readonly" property callback.
 *
 * Returns 0 on success.  On failure returns EEXIST, the error from opening
 * the objset or looking up its size, ENXIO when no minor number is free, or
 * EAGAIN when the soft state or a minor node cannot be created; everything
 * acquired up to that point is released first.
 */
int
zvol_create_minor(const char *name, major_t maj)
{
	zvol_state_t *zv;
	objset_t *os;
	dmu_object_info_t doi;
	uint64_t volsize;
	minor_t minor = 0;
	struct pathname linkpath;
	int ds_mode = DS_MODE_OWNER;
	vnode_t *vp = NULL;
	char *devpath;
	size_t devpathlen = strlen(ZVOL_FULL_DEV_DIR) + strlen(name) + 1;
	char chrbuf[30], blkbuf[30];
	int error;

	mutex_enter(&zvol_state_lock);

	if ((zv = zvol_minor_lookup(name)) != NULL) {
		mutex_exit(&zvol_state_lock);
		return (EEXIST);
	}

	/* a name containing '@' is opened read-only */
	if (strchr(name, '@') != 0)
		ds_mode |= DS_MODE_READONLY;

	error = dmu_objset_open(name, DMU_OST_ZVOL, ds_mode, &os);

	if (error) {
		mutex_exit(&zvol_state_lock);
		return (error);
	}

	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);

	if (error) {
		dmu_objset_close(os);
		mutex_exit(&zvol_state_lock);
		return (error);
	}

	/*
	 * If there's an existing /dev/zvol symlink, try to use the
	 * same minor number we used last time.
	 */
	devpath = kmem_alloc(devpathlen, KM_SLEEP);

	(void) sprintf(devpath, "%s%s", ZVOL_FULL_DEV_DIR, name);

	error = lookupname(devpath, UIO_SYSSPACE, NO_FOLLOW, NULL, &vp);

	kmem_free(devpath, devpathlen);

	if (error == 0 && vp->v_type != VLNK)
		error = EINVAL;

	if (error == 0) {
		pn_alloc(&linkpath);
		error = pn_getsymlink(vp, &linkpath, kcred);
		if (error == 0) {
			/* the minor number follows ZVOL_PSEUDO_DEV in the link */
			char *ms = strstr(linkpath.pn_path, ZVOL_PSEUDO_DEV);
			if (ms != NULL) {
				ms += strlen(ZVOL_PSEUDO_DEV);
				minor = stoi(&ms);
			}
		}
		pn_free(&linkpath);
	}

	if (vp != NULL)
		VN_RELE(vp);

	/*
	 * If we found a minor but it's already in use, we must pick a new one.
	 */
	if (minor != 0 && ddi_get_soft_state(zvol_state, minor) != NULL)
		minor = 0;

	if (minor == 0)
		minor = zvol_minor_alloc();

	if (minor == 0) {
		dmu_objset_close(os);
		mutex_exit(&zvol_state_lock);
		return (ENXIO);
	}

	if (ddi_soft_state_zalloc(zvol_state, minor) != DDI_SUCCESS) {
		dmu_objset_close(os);
		mutex_exit(&zvol_state_lock);
		return (EAGAIN);
	}

	/*
	 * NOTE(review): the first argument here is the bare minor, whereas
	 * zvol_size_changed() builds a dev_t with makedevice(maj, minor) for
	 * its property updates -- confirm this is intended.
	 */
	(void) ddi_prop_update_string(minor, zfs_dip, ZVOL_PROP_NAME,
	    (char *)name);

	(void) sprintf(chrbuf, "%uc,raw", minor);

	if (ddi_create_minor_node(zfs_dip, chrbuf, S_IFCHR,
	    minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
		ddi_soft_state_free(zvol_state, minor);
		dmu_objset_close(os);
		mutex_exit(&zvol_state_lock);
		return (EAGAIN);
	}

	(void) sprintf(blkbuf, "%uc", minor);

	if (ddi_create_minor_node(zfs_dip, blkbuf, S_IFBLK,
	    minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
		ddi_remove_minor_node(zfs_dip, chrbuf);
		ddi_soft_state_free(zvol_state, minor);
		dmu_objset_close(os);
		mutex_exit(&zvol_state_lock);
		return (EAGAIN);
	}

	zv = ddi_get_soft_state(zvol_state, minor);

	/*
	 * NOTE(review): unbounded copy into zv_name[MAXPATHLEN]; presumably
	 * callers guarantee name is shorter than that -- confirm.
	 */
	(void) strcpy(zv->zv_name, name);
	zv->zv_min_bs = DEV_BSHIFT;
	zv->zv_minor = minor;
	zv->zv_volsize = volsize;
	zv->zv_objset = os;
	zv->zv_mode = ds_mode;
	zv->zv_zilog = zil_open(os, zvol_get_data);
	mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL);
	avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare,
	    sizeof (rl_t), offsetof(rl_t, r_node));
	/* get and cache the blocksize */
	error = dmu_object_info(os, ZVOL_OBJ, &doi);
	ASSERT(error == 0);
	zv->zv_volblocksize = doi.doi_data_block_size;

	zil_replay(os, zv, &zv->zv_txg_assign, zvol_replay_vector, NULL);
	zvol_size_changed(zv, maj);

	/* XXX this should handle the possible i/o error */
	VERIFY(dsl_prop_register(dmu_objset_ds(zv->zv_objset),
	    "readonly", zvol_readonly_changed_cb, zv) == 0);

	zvol_minors++;

	mutex_exit(&zvol_state_lock);

	return (0);
}
    729 
    730 /*
    731  * Remove minor node for the specified volume.
    732  */
    733 int
    734 zvol_remove_minor(const char *name)
    735 {
    736 	zvol_state_t *zv;
    737 	char namebuf[30];
    738 
    739 	mutex_enter(&zvol_state_lock);
    740 
    741 	if ((zv = zvol_minor_lookup(name)) == NULL) {
    742 		mutex_exit(&zvol_state_lock);
    743 		return (ENXIO);
    744 	}
    745 
    746 	if (zv->zv_total_opens != 0) {
    747 		mutex_exit(&zvol_state_lock);
    748 		return (EBUSY);
    749 	}
    750 
    751 	(void) sprintf(namebuf, "%uc,raw", zv->zv_minor);
    752 	ddi_remove_minor_node(zfs_dip, namebuf);
    753 
    754 	(void) sprintf(namebuf, "%uc", zv->zv_minor);
    755 	ddi_remove_minor_node(zfs_dip, namebuf);
    756 
    757 	VERIFY(dsl_prop_unregister(dmu_objset_ds(zv->zv_objset),
    758 	    "readonly", zvol_readonly_changed_cb, zv) == 0);
    759 
    760 	zil_close(zv->zv_zilog);
    761 	zv->zv_zilog = NULL;
    762 	dmu_objset_close(zv->zv_objset);
    763 	zv->zv_objset = NULL;
    764 	avl_destroy(&zv->zv_znode.z_range_avl);
    765 	mutex_destroy(&zv->zv_znode.z_range_lock);
    766 
    767 	ddi_soft_state_free(zvol_state, zv->zv_minor);
    768 
    769 	zvol_minors--;
    770 
    771 	mutex_exit(&zvol_state_lock);
    772 
    773 	return (0);
    774 }
    775 
    776 int
    777 zvol_prealloc(zvol_state_t *zv)
    778 {
    779 	objset_t *os = zv->zv_objset;
    780 	dmu_tx_t *tx;
    781 	void *data;
    782 	uint64_t refd, avail, usedobjs, availobjs;
    783 	uint64_t resid = zv->zv_volsize;
    784 	uint64_t off = 0;
    785 
    786 	/* Check the space usage before attempting to allocate the space */
    787 	dmu_objset_space(os, &refd, &avail, &usedobjs, &availobjs);
    788 	if (avail < zv->zv_volsize)
    789 		return (ENOSPC);
    790 
    791 	/* Free old extents if they exist */
    792 	zvol_free_extents(zv);
    793 
    794 	/* allocate the blocks by writing each one */
    795 	data = kmem_zalloc(SPA_MAXBLOCKSIZE, KM_SLEEP);
    796 
    797 	while (resid != 0) {
    798 		int error;
    799 		uint64_t bytes = MIN(resid, SPA_MAXBLOCKSIZE);
    800 
    801 		tx = dmu_tx_create(os);
    802 		dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
    803 		error = dmu_tx_assign(tx, TXG_WAIT);
    804 		if (error) {
    805 			dmu_tx_abort(tx);
    806 			kmem_free(data, SPA_MAXBLOCKSIZE);
    807 			(void) dmu_free_long_range(os, ZVOL_OBJ, 0, off);
    808 			return (error);
    809 		}
    810 		dmu_write(os, ZVOL_OBJ, off, bytes, data, tx);
    811 		dmu_tx_commit(tx);
    812 		off += bytes;
    813 		resid -= bytes;
    814 	}
    815 	kmem_free(data, SPA_MAXBLOCKSIZE);
    816 	txg_wait_synced(dmu_objset_pool(os), 0);
    817 
    818 	return (0);
    819 }
    820 
    821 int
    822 zvol_update_volsize(zvol_state_t *zv, major_t maj, uint64_t volsize)
    823 {
    824 	dmu_tx_t *tx;
    825 	int error;
    826 
    827 	ASSERT(MUTEX_HELD(&zvol_state_lock));
    828 
    829 	tx = dmu_tx_create(zv->zv_objset);
    830 	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
    831 	error = dmu_tx_assign(tx, TXG_WAIT);
    832 	if (error) {
    833 		dmu_tx_abort(tx);
    834 		return (error);
    835 	}
    836 
    837 	error = zap_update(zv->zv_objset, ZVOL_ZAP_OBJ, "size", 8, 1,
    838 	    &volsize, tx);
    839 	dmu_tx_commit(tx);
    840 
    841 	if (error == 0)
    842 		error = dmu_free_long_range(zv->zv_objset,
    843 		    ZVOL_OBJ, volsize, DMU_OBJECT_END);
    844 
    845 	/*
    846 	 * If we are using a faked-up state (zv_minor == 0) then don't
    847 	 * try to update the in-core zvol state.
    848 	 */
    849 	if (error == 0 && zv->zv_minor) {
    850 		zv->zv_volsize = volsize;
    851 		zvol_size_changed(zv, maj);
    852 	}
    853 	return (error);
    854 }
    855 
    856 int
    857 zvol_set_volsize(const char *name, major_t maj, uint64_t volsize)
    858 {
    859 	zvol_state_t *zv;
    860 	int error;
    861 	dmu_object_info_t doi;
    862 	uint64_t old_volsize = 0ULL;
    863 	zvol_state_t state = { 0 };
    864 
    865 	mutex_enter(&zvol_state_lock);
    866 
    867 	if ((zv = zvol_minor_lookup(name)) == NULL) {
    868 		/*
    869 		 * If we are doing a "zfs clone -o volsize=", then the
    870 		 * minor node won't exist yet.
    871 		 */
    872 		error = dmu_objset_open(name, DMU_OST_ZVOL, DS_MODE_OWNER,
    873 		    &state.zv_objset);
    874 		if (error != 0)
    875 			goto out;
    876 		zv = &state;
    877 	}
    878 	old_volsize = zv->zv_volsize;
    879 
    880 	if ((error = dmu_object_info(zv->zv_objset, ZVOL_OBJ, &doi)) != 0 ||
    881 	    (error = zvol_check_volsize(volsize,
    882 	    doi.doi_data_block_size)) != 0)
    883 		goto out;
    884 
    885 	if (zv->zv_flags & ZVOL_RDONLY || (zv->zv_mode & DS_MODE_READONLY)) {
    886 		error = EROFS;
    887 		goto out;
    888 	}
    889 
    890 	error = zvol_update_volsize(zv, maj, volsize);
    891 
    892 	/*
    893 	 * Reinitialize the dump area to the new size. If we
    894 	 * failed to resize the dump area then restore the it back to
    895 	 * it's original size.
    896 	 */
    897 	if (error == 0 && zv->zv_flags & ZVOL_DUMPIFIED) {
    898 		if ((error = zvol_dumpify(zv)) != 0 ||
    899 		    (error = dumpvp_resize()) != 0) {
    900 			(void) zvol_update_volsize(zv, maj, old_volsize);
    901 			error = zvol_dumpify(zv);
    902 		}
    903 	}
    904 
    905 out:
    906 	if (state.zv_objset)
    907 		dmu_objset_close(state.zv_objset);
    908 
    909 	mutex_exit(&zvol_state_lock);
    910 
    911 	return (error);
    912 }
    913 
    914 int
    915 zvol_set_volblocksize(const char *name, uint64_t volblocksize)
    916 {
    917 	zvol_state_t *zv;
    918 	dmu_tx_t *tx;
    919 	int error;
    920 
    921 	mutex_enter(&zvol_state_lock);
    922 
    923 	if ((zv = zvol_minor_lookup(name)) == NULL) {
    924 		mutex_exit(&zvol_state_lock);
    925 		return (ENXIO);
    926 	}
    927 	if (zv->zv_flags & ZVOL_RDONLY || (zv->zv_mode & DS_MODE_READONLY)) {
    928 		mutex_exit(&zvol_state_lock);
    929 		return (EROFS);
    930 	}
    931 
    932 	tx = dmu_tx_create(zv->zv_objset);
    933 	dmu_tx_hold_bonus(tx, ZVOL_OBJ);
    934 	error = dmu_tx_assign(tx, TXG_WAIT);
    935 	if (error) {
    936 		dmu_tx_abort(tx);
    937 	} else {
    938 		error = dmu_object_set_blocksize(zv->zv_objset, ZVOL_OBJ,
    939 		    volblocksize, 0, tx);
    940 		if (error == ENOTSUP)
    941 			error = EBUSY;
    942 		dmu_tx_commit(tx);
    943 	}
    944 
    945 	mutex_exit(&zvol_state_lock);
    946 
    947 	return (error);
    948 }
    949 
    950 /*ARGSUSED*/
    951 int
    952 zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr)
    953 {
    954 	minor_t minor = getminor(*devp);
    955 	zvol_state_t *zv;
    956 
    957 	if (minor == 0)			/* This is the control device */
    958 		return (0);
    959 
    960 	mutex_enter(&zvol_state_lock);
    961 
    962 	zv = ddi_get_soft_state(zvol_state, minor);
    963 	if (zv == NULL) {
    964 		mutex_exit(&zvol_state_lock);
    965 		return (ENXIO);
    966 	}
    967 
    968 	ASSERT(zv->zv_objset != NULL);
    969 
    970 	if ((flag & FWRITE) &&
    971 	    (zv->zv_flags & ZVOL_RDONLY || (zv->zv_mode & DS_MODE_READONLY))) {
    972 		mutex_exit(&zvol_state_lock);
    973 		return (EROFS);
    974 	}
    975 	if (zv->zv_flags & ZVOL_EXCL) {
    976 		mutex_exit(&zvol_state_lock);
    977 		return (EBUSY);
    978 	}
    979 	if (flag & FEXCL) {
    980 		if (zv->zv_total_opens != 0) {
    981 			mutex_exit(&zvol_state_lock);
    982 			return (EBUSY);
    983 		}
    984 		zv->zv_flags |= ZVOL_EXCL;
    985 	}
    986 
    987 	if (zv->zv_open_count[otyp] == 0 || otyp == OTYP_LYR) {
    988 		zv->zv_open_count[otyp]++;
    989 		zv->zv_total_opens++;
    990 	}
    991 
    992 	mutex_exit(&zvol_state_lock);
    993 
    994 	return (0);
    995 }
    996 
    997 /*ARGSUSED*/
    998 int
    999 zvol_close(dev_t dev, int flag, int otyp, cred_t *cr)
   1000 {
   1001 	minor_t minor = getminor(dev);
   1002 	zvol_state_t *zv;
   1003 
   1004 	if (minor == 0)		/* This is the control device */
   1005 		return (0);
   1006 
   1007 	mutex_enter(&zvol_state_lock);
   1008 
   1009 	zv = ddi_get_soft_state(zvol_state, minor);
   1010 	if (zv == NULL) {
   1011 		mutex_exit(&zvol_state_lock);
   1012 		return (ENXIO);
   1013 	}
   1014 
   1015 	if (zv->zv_flags & ZVOL_EXCL) {
   1016 		ASSERT(zv->zv_total_opens == 1);
   1017 		zv->zv_flags &= ~ZVOL_EXCL;
   1018 	}
   1019 
   1020 	/*
   1021 	 * If the open count is zero, this is a spurious close.
   1022 	 * That indicates a bug in the kernel / DDI framework.
   1023 	 */
   1024 	ASSERT(zv->