Home | History | Annotate | Download | only in io
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 
     27 #include <sys/debug.h>
     28 #include <sys/types.h>
     29 #include <sys/file.h>
     30 #include <sys/errno.h>
     31 #include <sys/uio.h>
     32 #include <sys/open.h>
     33 #include <sys/cred.h>
     34 #include <sys/kmem.h>
     35 #include <sys/conf.h>
     36 #include <sys/cmn_err.h>
     37 #include <sys/modctl.h>
     38 #include <sys/disp.h>
     39 #include <sys/atomic.h>
     40 #include <sys/filio.h>
     41 #include <sys/stat.h> /* needed for S_IFBLK and S_IFCHR */
     42 #include <sys/kstat.h>
     43 
     44 #include <sys/ddi.h>
     45 #include <sys/devops.h>
     46 #include <sys/sunddi.h>
     47 #include <sys/esunddi.h>
     48 #include <sys/priv_names.h>
     49 
     50 #include <sys/fssnap.h>
     51 #include <sys/fssnap_if.h>
     52 
     53 /*
     54  * This module implements the file system snapshot code, which provides a
     55  * point-in-time image of a file system for the purposes of online backup.
     56  * There are essentially two parts to this project: the driver half and the
     57  * file system half.  The driver half is a pseudo device driver called
     58  * "fssnap" that represents the snapshot.  Each snapshot is assigned a
     59  * number that corresponds to the minor number of the device, and a control
     60  * device with a high minor number is used to initiate snapshot creation and
     61  * deletion.  For all practical purposes the driver half acts like a
     62  * read-only disk device whose contents are exactly the same as the master
     63  * file system at the time the snapshot was created.
     64  *
     65  * The file system half provides interfaces necessary for performing the
     66  * file system dependent operations required to create and delete snapshots
     67  * and a special driver strategy routine that must always be used by the file
     68  * system for snapshots to work correctly.
     69  *
     70  * When a snapshot is to be created, the user utility will send an ioctl to
     71  * the control device of the driver half specifying the file system to be
     72  * snapshotted, the file descriptor of a backing-store file which is used to
     73  * hold old data before it is overwritten, and other snapshot parameters.
     74  * This ioctl is passed on to the file system specified in the original
     75  * ioctl request.  The file system is expected to be able to flush
     76  * everything out to make the file system consistent and lock it to ensure
     77  * no changes occur while the snapshot is being created.  It then calls
     78  * fssnap_create() to create state for a new snapshot, from which an opaque
     79  * handle is returned with the snapshot locked.  Next, the file system must
     80  * populate the "candidate bitmap", which tells the snapshot code which
     81  * "chunks" should be considered for copy-on-write (a chunk is the unit of
     82  * granularity used for copy-on-write, which is independent of the device
     83  * and file system block sizes).  This is typically done by scanning the
     84  * file system allocation bitmaps to determine which chunks contain
     85  * allocated blocks in the file system at the time the snapshot was created.
     86  * If a chunk has no allocated blocks, it does not need to be copied before
     87  * being written to.  Once the candidate bitmap is populated with
     88  * fssnap_set_candidate(), the file system calls fssnap_create_done() to
     89  * complete the snapshot creation and unlock the snapshot.  The file system
     90  * may now be unlocked and modifications to it resumed.
     91  *
     92  * Once a snapshot is created, the file system must perform all writes
     93  * through a special strategy routine, fssnap_strategy().  This strategy
     94  * routine determines whether the chunks contained by the write must be
     95  * copied before being overwritten by consulting the candidate bitmap
     96  * described above, and the "hastrans bitmap" which tells it whether the chunk
     97  * has been copied already or not.  If the chunk is a candidate but has not
     98  * been copied, it reads the old data in and adds it to a queue.  The
 * old data can then be overwritten with the new data.  For each old chunk
 * read in, a task is dispatched to an asynchronous task queue; the task
 * writes the old data to the backing file specified at snapshot creation
 * time.  The
    102  * backing file is a sparse file the same size as the file system that
    103  * contains the old data at the offset that data originally had in the
    104  * file system.  If the queue containing in-memory chunks gets too large,
    105  * writes to the file system may be throttled by a semaphore until the
    106  * task queues have a chance to push some of the chunks to the backing file.
    107  *
    108  * With the candidate bitmap, the hastrans bitmap, the data on the master
    109  * file system, and the old data in memory and in the backing file, the
    110  * snapshot pseudo-driver can piece together the original file system
    111  * information to satisfy read requests.  If the requested chunk is not a
    112  * candidate, it returns a zeroed buffer.  If the chunk is a candidate but
    113  * has not been copied it reads it from the master file system.  If it is a
    114  * candidate and has been copied, it either copies the data from the
    115  * in-memory queue or it reads it in from the backing file.  The result is
    116  * a replication of the original file system that can be backed up, mounted,
    117  * or manipulated by other file system utilities that work on a read-only
    118  * device.
    119  *
    120  * This module is divided into three roughly logical sections:
    121  *
    122  *     - The snapshot driver, which is a character/block driver
    123  *       representing the snapshot itself.  These routines are
    124  *       prefixed with "snap_".
    125  *
    126  *     - The library routines that are defined in fssnap_if.h that
    127  *       are used by file systems that use this snapshot implementation.
    128  *       These functions are prefixed with "fssnap_" and are called through
    129  *       a function vector from the file system.
    130  *
    131  *     - The helper routines used by the snapshot driver and the fssnap
    132  *       library routines for managing the translation table and other
    133  *       useful functions.  These routines are all static and are
    134  *       prefixed with either "fssnap_" or "transtbl_" if they
    135  *       are specifically used for translation table activities.
    136  */
    137 
/*
 * Driver-wide state.
 *
 * fssnap_dip      - the driver's dev_info pointer; set by snap_attach() and
 *                   cleared by snap_detach().
 * snapshot        - head of the list of snapshot ids, linked through
 *                   sid_next; the list is torn down in snap_detach().
 * snap_ctl        - the "anonymous" snapshot id that tracks opens and
 *                   closes of the control device; it is not on the list.
 * num_snapshots   - snap_detach() refuses to detach while this is nonzero;
 *                   snap_close() decrements it when it frees a snapshot's
 *                   soft state.
 * snapshot_mutex  - held by snap_detach() and snap_close() while they
 *                   examine or change the state above.
 * snapname        - module name passed to kstat_create() in _init().
 */
static dev_info_t		*fssnap_dip = NULL;
static struct snapshot_id	*snapshot = NULL;
static struct snapshot_id	snap_ctl;
static int			num_snapshots = 0;
static kmutex_t			snapshot_mutex;
static char			snapname[] = SNAP_NAME;

/*
 * "tunable" parameters.  Judging by their names these bound the write
 * task queue (thread and task counts) and the number of chunks held in
 * memory; their consumers are not in this part of the file -- confirm
 * before changing them.
 */
static int		fssnap_taskq_nthreads = FSSNAP_TASKQ_THREADS;
static uint_t		fssnap_max_mem_chunks = FSSNAP_MAX_MEM_CHUNKS;
static int		fssnap_taskq_maxtasks = FSSNAP_TASKQ_MAXTASKS;
    149 
/* static function prototypes */

/*
 * snapshot driver: the character/block entry points wired into
 * snap_cb_ops and snap_ops below, plus the snap_getchunk() helper used
 * by snap_strategy().
 */
static int snap_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
static int snap_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
static int snap_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
static int snap_open(dev_t *devp, int flag, int otyp, cred_t *cred);
static int snap_close(dev_t dev, int flag, int otyp, cred_t *cred);
static int snap_strategy(struct buf *bp);
static int snap_read(dev_t dev, struct uio *uiop, cred_t *credp);
static int snap_print(dev_t dev, char *str);
static int snap_ioctl(dev_t dev, int cmd, intptr_t arg, int mode,
    cred_t *credp, int *rvalp);
static int snap_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op,
    int flags, char *name, caddr_t valuep, int *lengthp);
static int snap_getchunk(struct snapshot_id *sidp, chunknumber_t chunk,
    int offset, int len, char *buffer);


/*
 * fssnap interface implementations (see fssnap_if.h).  _init() stores
 * these in the snapops vector so file systems can call them.
 */
static void fssnap_strategy_impl(void *, struct buf *);
static void *fssnap_create_impl(chunknumber_t, uint_t, u_offset_t,
    struct vnode *, int, struct vnode **, char *, u_offset_t);
static void fssnap_set_candidate_impl(void *, chunknumber_t);
static int fssnap_is_candidate_impl(void *, u_offset_t);
static int fssnap_create_done_impl(void *);
static int fssnap_delete_impl(void *);

/* fssnap interface support routines */
static int  fssnap_translate(struct snapshot_id **, struct buf *);
static void fssnap_write_taskq(void *);
static void fssnap_create_kstats(snapshot_id_t *, int, const char *,
    const char *);
static int  fssnap_update_kstat_num(kstat_t *, int);
static void fssnap_delete_kstats(struct cow_info *);

/* translation table prototypes */
static cow_map_node_t *transtbl_add(cow_map_t *, chunknumber_t, caddr_t);
static cow_map_node_t *transtbl_get(cow_map_t *, chunknumber_t);
static void transtbl_delete(cow_map_t *, cow_map_node_t *);
static void transtbl_free(cow_map_t *);

/*
 * Highwater kstat created in _init() and deleted in _fini().  It is NULL
 * if kstat_create() failed.
 */
static kstat_t *fssnap_highwater_kstat;
    193 
    194 /* ************************************************************************ */
    195 
    196 /* Device and Module Structures */
    197 
/*
 * Character/block entry points for the snapshot device.  Only open,
 * close, strategy, print, read, ioctl and prop_op are implemented; the
 * device is read-only, so there is no write entry point, and the dump,
 * mapping and async I/O slots are filled with nodev.
 */
static struct cb_ops snap_cb_ops = {
	snap_open,
	snap_close,
	snap_strategy,
	snap_print,
	nodev,		/* no snap_dump */
	snap_read,
	nodev,		/* no snap_write */
	snap_ioctl,
	nodev,		/* no snap_devmap */
	nodev,		/* no snap_mmap   */
	nodev,		/* no snap_segmap */
	nochpoll,	/* polling is not supported */
	snap_prop_op,
	NULL,		/* streamtab */
	D_64BIT | D_NEW | D_MP, /* driver compatibility */
	CB_REV,
	nodev,		/* async I/O read entry point */
	nodev		/* async I/O write entry point */
};
    218 
/*
 * Device operations for the snapshot pseudo driver.  getinfo, attach and
 * detach are implemented in this file; the remaining slots use the
 * nulldev/nodev stubs, and no quiesce handling is needed.
 */
static struct dev_ops snap_ops = {
	DEVO_REV,
	0,			/* ref count */
	snap_getinfo,
	nulldev,		/* snap_identify obsolete */
	nulldev,		/* no snap_probe */
	snap_attach,
	snap_detach,
	nodev,			/* no snap_reset */
	&snap_cb_ops,
	(struct bus_ops *)NULL,
	nulldev,		/* no snap_power() */
	ddi_quiesce_not_needed,		/* quiesce */
};
    233 
extern struct mod_ops mod_driverops;

/* Driver linkage: identifies this module as a driver and names its ops. */
static struct modldrv md = {
	&mod_driverops, /* Type of module. This is a driver */
	"snapshot driver", 	/* Name of the module */
	&snap_ops,
};
    241 
/*
 * Module linkage handed to mod_install(), mod_info() and mod_remove() by
 * _init(), _info() and _fini().
 */
static struct modlinkage ml = {
	MODREV_1,
	&md,
	NULL
};
    247 
/*
 * Soft state handle.  Each element is a single pointer to a snapshot id
 * (see ddi_soft_state_init() in _init()) and is looked up by minor number.
 */
static void *statep;
    249 
    250 int
    251 _init(void)
    252 {
    253 	int	error;
    254 	kstat_t	*ksp;
    255 	kstat_named_t	*ksdata;
    256 
    257 	error = ddi_soft_state_init(&statep, sizeof (struct snapshot_id *), 1);
    258 	if (error) {
    259 		cmn_err(CE_WARN, "_init: failed to init ddi_soft_state.");
    260 		return (error);
    261 	}
    262 
    263 	error = mod_install(&ml);
    264 
    265 	if (error) {
    266 		cmn_err(CE_WARN, "_init: failed to mod_install.");
    267 		ddi_soft_state_fini(&statep);
    268 		return (error);
    269 	}
    270 
    271 	/*
    272 	 * Fill in the snapshot operations vector for file systems
    273 	 * (defined in fssnap_if.c)
    274 	 */
    275 
    276 	snapops.fssnap_create = fssnap_create_impl;
    277 	snapops.fssnap_set_candidate = fssnap_set_candidate_impl;
    278 	snapops.fssnap_is_candidate = fssnap_is_candidate_impl;
    279 	snapops.fssnap_create_done = fssnap_create_done_impl;
    280 	snapops.fssnap_delete = fssnap_delete_impl;
    281 	snapops.fssnap_strategy = fssnap_strategy_impl;
    282 
    283 	mutex_init(&snapshot_mutex, NULL, MUTEX_DEFAULT, NULL);
    284 
    285 	/*
    286 	 * Initialize the fssnap highwater kstat
    287 	 */
    288 	ksp = kstat_create(snapname, 0, FSSNAP_KSTAT_HIGHWATER, "misc",
    289 	    KSTAT_TYPE_NAMED, 1, 0);
    290 	if (ksp != NULL) {
    291 		ksdata = (kstat_named_t *)ksp->ks_data;
    292 		kstat_named_init(ksdata, FSSNAP_KSTAT_HIGHWATER,
    293 		    KSTAT_DATA_UINT32);
    294 		ksdata->value.ui32 = 0;
    295 		kstat_install(ksp);
    296 	} else {
    297 		cmn_err(CE_WARN, "_init: failed to create highwater kstat.");
    298 	}
    299 	fssnap_highwater_kstat = ksp;
    300 
    301 	return (0);
    302 }
    303 
    304 int
    305 _info(struct modinfo *modinfop)
    306 {
    307 	return (mod_info(&ml, modinfop));
    308 }
    309 
    310 int
    311 _fini(void)
    312 {
    313 	int	error;
    314 
    315 	error = mod_remove(&ml);
    316 	if (error)
    317 		return (error);
    318 	ddi_soft_state_fini(&statep);
    319 
    320 	/*
    321 	 * delete the fssnap highwater kstat
    322 	 */
    323 	kstat_delete(fssnap_highwater_kstat);
    324 
    325 	mutex_destroy(&snapshot_mutex);
    326 
    327 	/* Clear out the file system operations vector */
    328 	snapops.fssnap_create = NULL;
    329 	snapops.fssnap_set_candidate = NULL;
    330 	snapops.fssnap_create_done = NULL;
    331 	snapops.fssnap_delete = NULL;
    332 	snapops.fssnap_strategy = NULL;
    333 
    334 	return (0);
    335 }
    336 
    337 /* ************************************************************************ */
    338 
    339 /*
    340  * Snapshot Driver Routines
    341  *
    342  * This section implements the snapshot character and block drivers.  The
    343  * device will appear to be a consistent read-only file system to
    344  * applications that wish to back it up or mount it.  The snapshot driver
    345  * communicates with the file system through the translation table, which
    346  * tells the snapshot driver where to find the data necessary to piece
    347  * together the frozen file system.  The data may either be on the master
    348  * device (no translation exists), in memory (a translation exists but has
    349  * not been flushed to the backing store), or in the backing store file.
    350  * The read request may require the snapshot driver to retrieve data from
    351  * several different places and piece it together to look like a single
    352  * contiguous read.
    353  *
    354  * The device minor number corresponds to the snapshot number in the list of
    355  * snapshot identifiers.  The soft state for each minor number is simply a
    356  * pointer to the snapshot id, which holds all of the snapshot state.  One
    357  * minor number is designated as the control device.  All snapshot create
    358  * and delete requests go through the control device to ensure this module
    359  * is properly loaded and attached before the file system starts calling
    360  * routines defined here.
    361  */
    362 
    363 
    364 /*
    365  * snap_getinfo() - snapshot driver getinfo(9E) routine
    366  *
    367  */
    368 /*ARGSUSED*/
    369 static int
    370 snap_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
    371 {
    372 	switch (infocmd) {
    373 	case DDI_INFO_DEVT2DEVINFO:
    374 		*result = fssnap_dip;
    375 		return (DDI_SUCCESS);
    376 	case DDI_INFO_DEVT2INSTANCE:
    377 		*result = 0;	/* we only have one instance */
    378 		return (DDI_SUCCESS);
    379 	}
    380 	return (DDI_FAILURE);
    381 }
    382 
    383 /*
    384  * snap_attach() - snapshot driver attach(9E) routine
    385  *
    386  *    sets up snapshot control device and control state.  The control state
    387  *    is a pointer to an "anonymous" snapshot_id for tracking opens and closes
    388  */
    389 static int
    390 snap_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
    391 {
    392 	int			error;
    393 
    394 	switch (cmd) {
    395 	case DDI_ATTACH:
    396 		/* create the control device */
    397 		error = ddi_create_priv_minor_node(dip, SNAP_CTL_NODE, S_IFCHR,
    398 		    SNAP_CTL_MINOR, DDI_PSEUDO, PRIVONLY_DEV,
    399 		    PRIV_SYS_CONFIG, PRIV_SYS_CONFIG, 0666);
    400 		if (error == DDI_FAILURE) {
    401 			return (DDI_FAILURE);
    402 		}
    403 
    404 		rw_init(&snap_ctl.sid_rwlock, NULL, RW_DEFAULT, NULL);
    405 		rw_enter(&snap_ctl.sid_rwlock, RW_WRITER);
    406 		fssnap_dip = dip;
    407 		snap_ctl.sid_snapnumber = SNAP_CTL_MINOR;
    408 		/* the control sid is not linked into the snapshot list */
    409 		snap_ctl.sid_next = NULL;
    410 		snap_ctl.sid_cowinfo = NULL;
    411 		snap_ctl.sid_flags = 0;
    412 		rw_exit(&snap_ctl.sid_rwlock);
    413 		ddi_report_dev(dip);
    414 
    415 		return (DDI_SUCCESS);
    416 	case DDI_PM_RESUME:
    417 		return (DDI_SUCCESS);
    418 
    419 	case DDI_RESUME:
    420 		return (DDI_SUCCESS);
    421 
    422 	default:
    423 		return (DDI_FAILURE);
    424 	}
    425 }
    426 
    427 /*
    428  * snap_detach() - snapshot driver detach(9E) routine
    429  *
    430  *    destroys snapshot control device and control state.  If any snapshots
    431  *    are active (ie. num_snapshots != 0), the device will refuse to detach.
    432  */
    433 static int
    434 snap_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
    435 {
    436 	struct snapshot_id *sidp, *sidnextp;
    437 
    438 	switch (cmd) {
    439 	case DDI_DETACH:
    440 		/* do not detach if the device is active */
    441 		mutex_enter(&snapshot_mutex);
    442 		if ((num_snapshots != 0) ||
    443 		    ((snap_ctl.sid_flags & SID_CHAR_BUSY) != 0)) {
    444 			mutex_exit(&snapshot_mutex);
    445 			return (DDI_FAILURE);
    446 		}
    447 
    448 		/* free up the snapshot list */
    449 		for (sidp = snapshot; sidp != NULL; sidp = sidnextp) {
    450 			ASSERT(SID_AVAILABLE(sidp) &&
    451 			    !RW_LOCK_HELD(&sidp->sid_rwlock));
    452 			sidnextp = sidp->sid_next;
    453 			rw_destroy(&sidp->sid_rwlock);
    454 			kmem_free(sidp, sizeof (struct snapshot_id));
    455 		}
    456 		snapshot = NULL;
    457 
    458 		/* delete the control device */
    459 		ddi_remove_minor_node(dip, SNAP_CTL_NODE);
    460 		fssnap_dip = NULL;
    461 
    462 		ASSERT((snap_ctl.sid_flags & SID_CHAR_BUSY) == 0);
    463 		rw_destroy(&snap_ctl.sid_rwlock);
    464 		mutex_exit(&snapshot_mutex);
    465 
    466 		return (DDI_SUCCESS);
    467 
    468 	default:
    469 		return (DDI_FAILURE);
    470 	}
    471 }
    472 
    473 /*
    474  * snap_open() - snapshot driver open(9E) routine
    475  *
    476  *     marks the snapshot id as busy so it will not be recycled when deleted
    477  *     until the snapshot is closed.
    478  */
    479 /* ARGSUSED */
    480 static int
    481 snap_open(dev_t *devp, int flag, int otyp, cred_t *cred)
    482 {
    483 	minor_t	minor;
    484 	struct snapshot_id **sidpp, *sidp;
    485 
    486 	/* snapshots are read-only */
    487 	if (flag & FWRITE)
    488 		return (EROFS);
    489 
    490 	minor = getminor(*devp);
    491 
    492 	if (minor == SNAP_CTL_MINOR) {
    493 		/* control device must be opened exclusively */
    494 		if (((flag & FEXCL) != FEXCL) || (otyp != OTYP_CHR))
    495 			return (EINVAL);
    496 
    497 		rw_enter(&snap_ctl.sid_rwlock, RW_WRITER);
    498 		if ((snap_ctl.sid_flags & SID_CHAR_BUSY) != 0) {
    499 			rw_exit(&snap_ctl.sid_rwlock);
    500 			return (EBUSY);
    501 		}
    502 
    503 		snap_ctl.sid_flags |= SID_CHAR_BUSY;
    504 		rw_exit(&snap_ctl.sid_rwlock);
    505 
    506 		return (0);
    507 	}
    508 
    509 	sidpp = ddi_get_soft_state(statep, minor);
    510 	if (sidpp == NULL || *sidpp == NULL)
    511 		return (ENXIO);
    512 	sidp = *sidpp;
    513 	rw_enter(&sidp->sid_rwlock, RW_WRITER);
    514 
    515 	if ((flag & FEXCL) && SID_BUSY(sidp)) {
    516 		rw_exit(&sidp->sid_rwlock);
    517 		return (EAGAIN);
    518 	}
    519 
    520 	ASSERT(sidpp != NULL && sidp != NULL);
    521 	/* check to see if this snapshot has been killed on us */
    522 	if (SID_INACTIVE(sidp)) {
    523 		cmn_err(CE_WARN, "snap_open: snapshot %d does not exist.",
    524 		    minor);
    525 		rw_exit(&sidp->sid_rwlock);
    526 		return (ENXIO);
    527 	}
    528 
    529 	switch (otyp) {
    530 	case OTYP_CHR:
    531 		sidp->sid_flags |= SID_CHAR_BUSY;
    532 		break;
    533 	case OTYP_BLK:
    534 		sidp->sid_flags |= SID_BLOCK_BUSY;
    535 		break;
    536 	default:
    537 		rw_exit(&sidp->sid_rwlock);
    538 		return (EINVAL);
    539 	}
    540 
    541 	rw_exit(&sidp->sid_rwlock);
    542 
    543 	/*
    544 	 * at this point if a valid snapshot was found then it has
    545 	 * been marked busy and we can use it.
    546 	 */
    547 	return (0);
    548 }
    549 
    550 /*
    551  * snap_close() - snapshot driver close(9E) routine
    552  *
    553  *    unsets the busy bits in the snapshot id.  If the snapshot has been
    554  *    deleted while the snapshot device was open, the close call will clean
    555  *    up the remaining state information.
    556  */
    557 /* ARGSUSED */
    558 static int
    559 snap_close(dev_t dev, int flag, int otyp, cred_t *cred)
    560 {
    561 	struct snapshot_id	**sidpp, *sidp;
    562 	minor_t			minor;
    563 	char			name[20];
    564 
    565 	minor = getminor(dev);
    566 
    567 	/* if this is the control device, close it and return */
    568 	if (minor == SNAP_CTL_MINOR) {
    569 		rw_enter(&snap_ctl.sid_rwlock, RW_WRITER);
    570 		snap_ctl.sid_flags &= ~(SID_CHAR_BUSY);
    571 		rw_exit(&snap_ctl.sid_rwlock);
    572 		return (0);
    573 	}
    574 
    575 	sidpp = ddi_get_soft_state(statep, minor);
    576 	if (sidpp == NULL || *sidpp == NULL) {
    577 		cmn_err(CE_WARN, "snap_close: could not find state for "
    578 		    "snapshot %d.", minor);
    579 		return (ENXIO);
    580 	}
    581 	sidp = *sidpp;
    582 	mutex_enter(&snapshot_mutex);
    583 	rw_enter(&sidp->sid_rwlock, RW_WRITER);
    584 
    585 	/* Mark the snapshot as not being busy anymore */
    586 	switch (otyp) {
    587 	case OTYP_CHR:
    588 		sidp->sid_flags &= ~(SID_CHAR_BUSY);
    589 		break;
    590 	case OTYP_BLK:
    591 		sidp->sid_flags &= ~(SID_BLOCK_BUSY);
    592 		break;
    593 	default:
    594 		mutex_exit(&snapshot_mutex);
    595 		rw_exit(&sidp->sid_rwlock);
    596 		return (EINVAL);
    597 	}
    598 
    599 	if (SID_AVAILABLE(sidp)) {
    600 		/*
    601 		 * if this is the last close on a snapshot that has been
    602 		 * deleted, then free up the soft state.  The snapdelete
    603 		 * ioctl does not free this when the device is in use so
    604 		 * we do it here after the last reference goes away.
    605 		 */
    606 
    607 		/* remove the device nodes */
    608 		ASSERT(fssnap_dip != NULL);
    609 		(void) snprintf(name, sizeof (name), "%d",
    610 		    sidp->sid_snapnumber);
    611 		ddi_remove_minor_node(fssnap_dip, name);
    612 		(void) snprintf(name, sizeof (name), "%d,raw",
    613 		    sidp->sid_snapnumber);
    614 		ddi_remove_minor_node(fssnap_dip, name);
    615 
    616 		/* delete the state structure */
    617 		ddi_soft_state_free(statep, sidp->sid_snapnumber);
    618 		num_snapshots--;
    619 	}
    620 
    621 	mutex_exit(&snapshot_mutex);
    622 	rw_exit(&sidp->sid_rwlock);
    623 
    624 	return (0);
    625 }
    626 
    627 /*
    628  * snap_read() - snapshot driver read(9E) routine
    629  *
    630  *    reads data from the snapshot by calling snap_strategy() through physio()
    631  */
    632 /* ARGSUSED */
    633 static int
    634 snap_read(dev_t dev, struct uio *uiop, cred_t *credp)
    635 {
    636 	minor_t		minor;
    637 	struct snapshot_id **sidpp;
    638 
    639 	minor = getminor(dev);
    640 	sidpp = ddi_get_soft_state(statep, minor);
    641 	if (sidpp == NULL || *sidpp == NULL) {
    642 		cmn_err(CE_WARN,
    643 		    "snap_read: could not find state for snapshot %d.", minor);
    644 		return (ENXIO);
    645 	}
    646 	return (physio(snap_strategy, NULL, dev, B_READ, minphys, uiop));
    647 }
    648 
    649 /*
    650  * snap_strategy() - snapshot driver strategy(9E) routine
    651  *
    652  *    cycles through each chunk in the requested buffer and calls
    653  *    snap_getchunk() on each chunk to retrieve it from the appropriate
    654  *    place.  Once all of the parts are put together the requested buffer
    655  *    is returned.  The snapshot driver is read-only, so a write is invalid.
    656  */
    657 static int
    658 snap_strategy(struct buf *bp)
    659 {
    660 	struct snapshot_id **sidpp, *sidp;
    661 	minor_t		minor;
    662 	chunknumber_t	chunk;
    663 	int		off, len;
    664 	u_longlong_t	reqptr;
    665 	int		error = 0;
    666 	size_t		chunksz;
    667 	caddr_t		buf;
    668 
    669 	/* snapshot device is read-only */
    670 	if (bp->b_flags & B_WRITE) {
    671 		bioerror(bp, EROFS);
    672 		bp->b_resid = bp->b_bcount;
    673 		biodone(bp);
    674 		return (0);
    675 	}
    676 
    677 	minor = getminor(bp->b_edev);
    678 	sidpp = ddi_get_soft_state(statep, minor);
    679 	if (sidpp == NULL || *sidpp == NULL) {
    680 		cmn_err(CE_WARN,
    681 		    "snap_strategy: could not find state for snapshot %d.",
    682 		    minor);
    683 		bioerror(bp, ENXIO);
    684 		bp->b_resid = bp->b_bcount;
    685 		biodone(bp);
    686 		return (0);
    687 	}
    688 	sidp = *sidpp;
    689 	ASSERT(sidp);
    690 	rw_enter(&sidp->sid_rwlock, RW_READER);
    691 
    692 	if (SID_INACTIVE(sidp)) {
    693 		bioerror(bp, ENXIO);
    694 		bp->b_resid = bp->b_bcount;
    695 		biodone(bp);
    696 		rw_exit(&sidp->sid_rwlock);
    697 		return (0);
    698 	}
    699 
    700 	if (bp->b_flags & (B_PAGEIO|B_PHYS))
    701 		bp_mapin(bp);
    702 
    703 	bp->b_resid = bp->b_bcount;
    704 	ASSERT(bp->b_un.b_addr);
    705 	buf = bp->b_un.b_addr;
    706 
    707 	chunksz = sidp->sid_cowinfo->cow_map.cmap_chunksz;
    708 
    709 	/* reqptr is the current DEV_BSIZE offset into the device */
    710 	/* chunk is the chunk containing reqptr */
    711 	/* len is the length of the request (in the current chunk) in bytes */
    712 	/* off is the byte offset into the current chunk */
    713 	reqptr = bp->b_lblkno;
    714 	while (bp->b_resid > 0) {
    715 		chunk = dbtocowchunk(&sidp->sid_cowinfo->cow_map, reqptr);
    716 		off = (reqptr % (chunksz >> DEV_BSHIFT)) << DEV_BSHIFT;
    717 		len = min(chunksz - off, bp->b_resid);
    718 		ASSERT((off + len) <= chunksz);
    719 
    720 		if ((error = snap_getchunk(sidp, chunk, off, len, buf)) != 0) {
    721 			/*
    722 			 * EINVAL means the user tried to go out of range.
    723 			 * Anything else means it's likely that we're
    724 			 * confused.
    725 			 */
    726 			if (error != EINVAL) {
    727 				cmn_err(CE_WARN, "snap_strategy: error "
    728 				    "calling snap_getchunk, chunk = %llu, "
    729 				    "offset = %d, len = %d, resid = %lu, "
    730 				    "error = %d.",
    731 				    chunk, off, len, bp->b_resid, error);
    732 			}
    733 			bioerror(bp, error);
    734 			biodone(bp);
    735 			rw_exit(&sidp->sid_rwlock);
    736 			return (0);
    737 		}
    738 		bp->b_resid -= len;
    739 		reqptr += (len >> DEV_BSHIFT);
    740 		buf += len;
    741 	}
    742 
    743 	ASSERT(bp->b_resid == 0);
    744 	biodone(bp);
    745 
    746 	rw_exit(&sidp->sid_rwlock);
    747 	return (0);
    748 }
    749 
    750 /*
    751  * snap_getchunk() - helper function for snap_strategy()
    752  *
    753  *    gets the requested data from the appropriate place and fills in the
    754  *    buffer.  chunk is the chunk number of the request, offset is the
    755  *    offset into that chunk and must be less than the chunk size.  len is
    756  *    the length of the request starting at offset, and must not exceed a
    757  *    chunk boundary.  buffer is the address to copy the data to.  len
    758  *    bytes are copied into the buffer starting at the location specified.
    759  *
 *    A chunk is located according to the following algorithm:
 *        - If the chunk is not a candidate for translation, a zeroed
 *          buffer is returned.
 *        - If the chunk is a candidate but does not have a translation,
 *          it is read straight from the master device.
 *        - If the chunk does have a translation, then it is either on
 *          disk or in memory:
 *            o If it is in memory the requested data is simply copied out
 *              of the in-memory buffer.
 *            o If it is in the backing store, it is read from there.
    768  *
    769  *    This function does the real work of the snapshot driver.
    770  */
    771 static int
    772 snap_getchunk(struct snapshot_id *sidp, chunknumber_t chunk, int offset,
    773     int len, char *buffer)
    774 {
    775 	cow_map_t	*cmap = &sidp->sid_cowinfo->cow_map;
    776 	cow_map_node_t	*cmn;
    777 	struct buf	*snapbuf;
    778 	int		error = 0;
    779 	char		*newbuffer;
    780 	int		newlen = 0;
    781 	int		partial = 0;
    782 
    783 	ASSERT(RW_READ_HELD(&sidp->sid_rwlock));
    784 	ASSERT(offset + len <= cmap->cmap_chunksz);
    785 
    786 	/*
    787 	 * Check if the chunk number is out of range and if so bail out
    788 	 */
    789 	if (chunk >= (cmap->cmap_bmsize * NBBY)) {
    790 		return (EINVAL);
    791 	}
    792 
    793 	/*
    794 	 * If the chunk is not a candidate for translation, then the chunk
    795 	 * was not allocated when the snapshot was taken.  Since it does
    796 	 * not contain data associated with this snapshot, just return a
    797 	 * zero buffer instead.
    798 	 */
    799 	if (isclr(cmap->cmap_candidate, chunk)) {
    800 		bzero(buffer, len);
    801 		return (0);
    802 	}
    803 
    804 	/*
    805 	 * if the chunk is a candidate for translation but a
    806 	 * translation does not exist, then read through to the
    807 	 * original file system.  The rwlock is held until the read
    808 	 * completes if it hasn't been translated to make sure the
    809 	 * file system does not translate the block before we
    810 	 * access it. If it has already been translated we don't
    811 	 * need the lock, because the translation will never go away.
    812 	 */
    813 	rw_enter(&cmap->cmap_rwlock, RW_READER);
    814 	if (isclr(cmap->cmap_hastrans, chunk)) {
    815 		snapbuf = getrbuf(KM_SLEEP);
    816 		/*
    817 		 * Reading into the buffer saves having to do a copy,
    818 		 * but gets tricky if the request size is not a
    819 		 * multiple of DEV_BSIZE.  However, we are filling the
    820 		 * buffer left to right, so future reads will write
    821 		 * over any extra data we might have read.
    822 		 */
    823 
    824 		partial = len % DEV_BSIZE;
    825 
    826 		snapbuf->b_bcount = len;
    827 		snapbuf->b_lblkno = lbtodb(chunk * cmap->cmap_chunksz + offset);
    828 		snapbuf->b_un.b_addr = buffer;
    829 
    830 		snapbuf->b_iodone = NULL;
    831 		snapbuf->b_proc = NULL;		/* i.e. the kernel */
    832 		snapbuf->b_flags = B_READ | B_BUSY;
    833 		snapbuf->b_edev = sidp->sid_fvp->v_vfsp->vfs_dev;
    834 
    835 		if (partial) {
    836 			/*
    837 			 * Partial block read in progress.
    838 			 * This is bad as modules further down the line
    839 			 * assume buf's are exact multiples of DEV_BSIZE
    840 			 * and we end up with fewer, or zero, bytes read.
    841 			 * To get round this we need to round up to the
    842 			 * nearest full block read and then return only
    843 			 * len bytes.
    844 			 */
    845 			newlen = (len - partial) + DEV_BSIZE;
    846 			newbuffer = kmem_alloc(newlen, KM_SLEEP);
    847 
    848 			snapbuf->b_bcount = newlen;
    849 			snapbuf->b_un.b_addr = newbuffer;
    850 		}
    851 
    852 		(void) bdev_strategy(snapbuf);
    853 		(void) biowait(snapbuf);
    854 
    855 		error = geterror(snapbuf);
    856 
    857 		if (partial) {
    858 			/*
    859 			 * Partial block read. Now we need to bcopy the
    860 			 * correct number of bytes back into the
    861 			 * supplied buffer, and tidy up our temp
    862 			 * buffer.
    863 			 */
    864 			bcopy(newbuffer, buffer, len);
    865 			kmem_free(newbuffer, newlen);
    866 		}
    867 
    868 		freerbuf(snapbuf);
    869 		rw_exit(&cmap->cmap_rwlock);
    870 
    871 		return (error);
    872 	}
    873 
    874 	/*
    875 	 * finally, if the chunk is a candidate for translation and it
    876 	 * has been translated, then we clone the chunk of the buffer
    877 	 * that was copied aside by the file system.
    878 	 * The cmap_rwlock does not need to be held after we know the
    879 	 * data has already been copied. Once a chunk has been copied
    880 	 * to the backing file, it is stable read only data.
    881 	 */
    882 	cmn = transtbl_get(cmap, chunk);
    883 
    884 	/* check whether the data is in memory or in the backing file */
    885 	if (cmn != NULL) {
    886 		ASSERT(cmn->cmn_buf);
    887 		/* already in memory */
    888 		bcopy(cmn->cmn_buf + offset, buffer, len);
    889 		rw_exit(&cmap->cmap_rwlock);
    890 	} else {
    891 		ssize_t resid = len;
    892 		int	bf_index;
    893 		/*
    894 		 * can cause deadlock with writer if we don't drop the
    895 		 * cmap_rwlock before trying to get the backing store file
    896 		 * vnode rwlock.
    897 		 */
    898 		rw_exit(&cmap->cmap_rwlock);
    899 
    900 		bf_index = chunk / cmap->cmap_chunksperbf;
    901 
    902 		/* read buffer from backing file */
    903 		error = vn_rdwr(UIO_READ,
    904 		    (sidp->sid_cowinfo->cow_backfile_array)[bf_index],
    905 		    buffer, len, ((chunk % cmap->cmap_chunksperbf) *
    906 		    cmap->cmap_chunksz) + offset, UIO_SYSSPACE, 0,
    907 		    RLIM64_INFINITY, kcred, &resid);
    908 	}
    909 
    910 	return (error);
    911 }
    912 
    913 /*
    914  * snap_print() - snapshot driver print(9E) routine
    915  *
    916  *    prints the device identification string.
    917  */
    918 static int
    919 snap_print(dev_t dev, char *str)
    920 {
    921 	struct snapshot_id **sidpp;
    922 	minor_t		minor;
    923 
    924 	minor = getminor(dev);
    925 	sidpp = ddi_get_soft_state(statep, minor);
    926 	if (sidpp == NULL || *sidpp == NULL) {
    927 		cmn_err(CE_WARN,
    928 		    "snap_print: could not find state for snapshot %d.", minor);
    929 		return (ENXIO);
    930 	}
    931 
    932 	cmn_err(CE_NOTE, "snap_print: snapshot %d: %s",  minor, str);
    933 
    934 	return (0);
    935 }
    936 
    937 /*
    938  * snap_prop_op() - snapshot driver prop_op(9E) routine
    939  *
    940  *    get 32-bit and 64-bit values for size (character driver) and nblocks
    941  *    (block driver).
    942  */
    943 static int
    944 snap_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op,
    945     int flags, char *name, caddr_t valuep, int *lengthp)
    946 {
    947 	int		minor;
    948 	struct snapshot_id **sidpp;
    949 	dev_t		mdev;
    950 	dev_info_t	*mdip;
    951 	int		error;
    952 
    953 	minor = getminor(dev);
    954 
    955 	/*
    956 	 * If this is the control device just check for .conf properties,
    957 	 * if the wildcard DDI_DEV_T_ANY was passed in via the dev_t
    958 	 * just fall back to the defaults.
    959 	 */
    960 	if ((minor == SNAP_CTL_MINOR) || (dev == DDI_DEV_T_ANY))
    961 		return (ddi_prop_op(dev, dip, prop_op, flags, name,
    962 		    valuep, lengthp));
    963 
    964 	/* check to see if there is a master device plumbed */
    965 	sidpp = ddi_get_soft_state(statep, minor);
    966 	if (sidpp == NULL || *sidpp == NULL) {
    967 		cmn_err(CE_WARN,
    968 		    "snap_prop_op: could not find state for "
    969 		    "snapshot %d.", minor);
    970 		return (DDI_PROP_NOT_FOUND);
    971 	}
    972 
    973 	if (((*sidpp)->sid_fvp == NULL) || ((*sidpp)->sid_fvp->v_vfsp == NULL))
    974 		return (ddi_prop_op(dev, dip, prop_op, flags, name,
    975 		    valuep, lengthp));
    976 
    977 	/* hold master device and pass operation down */
    978 	mdev = (*sidpp)->sid_fvp->v_vfsp->vfs_dev;
    979 	if (mdip = e_ddi_hold_devi_by_dev(mdev, 0)) {
    980 
    981 		/* get size information from the master device. */
    982 		error = cdev_prop_op(mdev, mdip,
    983 		    prop_op, flags, name, valuep, lengthp);
    984 		ddi_release_devi(mdip);
    985 		if (error == DDI_PROP_SUCCESS)
    986 			return (error);
    987 	}
    988 
    989 	/* master device did not service the request, try framework */
    990 	return (ddi_prop_op(dev, dip, prop_op, flags, name, valuep, lengthp));
    991 
    992 }
    993 
    994 /*
    995  * snap_ioctl() - snapshot driver ioctl(9E) routine
    996  *
    997  *    only applies to the control device.  The control device accepts two
    998  *    ioctl requests: create a snapshot or delete a snapshot.  In either
    999  *    case, the vnode for the requested file system is extracted, and the
   1000  *    request is passed on to the file system via the same ioctl.  The file
   1001  *    system is responsible for doing the things necessary for creating or
   1002  *    destroying a snapshot, including any file system specific operations
   1003  *    that must be performed as well as setting up and deleting the snapshot
   1004  *    state through the fssnap interfaces.
   1005  */
   1006 static int
   1007 snap_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
   1008 int *rvalp)
   1009 {
   1010 	minor_t	minor;
   1011 	int error = 0;
   1012 
   1013 	minor = getminor(dev);
   1014 
   1015 	if (minor != SNAP_CTL_MINOR) {
   1016 		return (EINVAL);
   1017 	}
   1018 
   1019 	switch (cmd) {
   1020 	case _FIOSNAPSHOTCREATE:
   1021 	{
   1022 		struct fiosnapcreate	fc;
   1023 		struct file		*fp;
   1024 		struct vnode		*vp;
   1025 
   1026 		if (ddi_copyin((void *)arg, &fc, sizeof (fc), mode))
   1027 			return (EFAULT);
   1028 
   1029 		/* get vnode for file system mount point */
   1030 		if ((fp = getf(fc.rootfiledesc)) == NULL)
   1031 			return (EBADF);
   1032 
   1033 		ASSERT(fp->f_vnode);
   1034 		vp = fp->f_vnode;
   1035 		VN_HOLD(vp);
   1036 		releasef(fc.rootfiledesc);
   1037 
   1038 		/* pass ioctl request to file system */
   1039 		error = VOP_IOCTL(vp, cmd, arg, 0, credp, rvalp, NULL);
   1040 		VN_RELE(vp);
   1041 		break;
   1042 	}
   1043 	case _FIOSNAPSHOTCREATE_MULTI:
   1044 	{
   1045 		struct fiosnapcreate_multi	fc;
   1046 		struct file		*fp;
   1047 		struct vnode		*vp;
   1048 
   1049 		if (ddi_copyin((void *)arg, &fc, sizeof (fc), mode))
   1050 			return (EFAULT);
   1051 
   1052 		/* get vnode for file system mount point */
   1053 		if ((fp = getf(fc.rootfiledesc)) == NULL)
   1054 			return (EBADF);
   1055 
   1056 		ASSERT(fp->f_vnode);
   1057 		vp = fp->f_vnode;
   1058 		VN_HOLD(vp);
   1059 		releasef(fc.rootfiledesc);
   1060 
   1061 		/* pass ioctl request to file system */
   1062 		error = VOP_IOCTL(vp, cmd, arg, 0, credp, rvalp, NULL);
   1063 		VN_RELE(vp);
   1064 		break;
   1065 	}
   1066 	case _FIOSNAPSHOTDELETE:
   1067 	{
   1068 		major_t			major;
   1069 		struct fiosnapdelete	fc;
   1070 		snapshot_id_t		*sidp = NULL;
   1071 		snapshot_id_t		*sidnextp = NULL;
   1072 		struct file		*fp = NULL;
   1073 		struct vnode		*vp = NULL;
   1074 		struct vfs 		*vfsp = NULL;
   1075 		vfsops_t		*vfsops = EIO_vfsops;
   1076 
   1077 		if (ddi_copyin((void *)arg, &fc, sizeof (fc), mode))
   1078 			return (EFAULT);
   1079 
   1080 		/* get vnode for file system mount point */
   1081 		if ((fp = getf(fc.rootfiledesc)) == NULL)
   1082 			return (EBADF);
   1083 
   1084 		ASSERT(fp->f_vnode);
   1085 		vp = fp->f_vnode;
   1086 		VN_HOLD(vp);
   1087 		releasef(fc.rootfiledesc);
   1088 		/*
   1089 		 * Test for two formats of delete and set correct minor/vp:
   1090 		 * pseudo device:
   1091 		 * fssnap -d [/dev/fssnap/x]
   1092 		 * or
   1093 		 * mount point:
   1094 		 * fssnap -d [/mntpt]
   1095 		 * Note that minor is verified to be equal to SNAP_CTL_MINOR
   1096 		 * at this point which is an invalid minor number.
   1097 		 */
   1098 		ASSERT(fssnap_dip != NULL);
   1099 		major = ddi_driver_major(fssnap_dip);
   1100 		mutex_enter(&snapshot_mutex);
   1101 		for (sidp = snapshot; sidp != NULL; sidp = sidnextp) {
   1102 			rw_enter(&sidp->sid_rwlock, RW_READER);
   1103 			sidnextp = sidp->sid_next;
   1104 			/* pseudo device: */
   1105 			if (major == getmajor(vp->v_rdev)) {
   1106 				minor = getminor(vp->v_rdev);
   1107 				if (sidp->sid_snapnumber == (uint_t)minor &&
   1108 				    sidp->sid_fvp) {
   1109 					VN_RELE(vp);
   1110 					vp = sidp->sid_fvp;
   1111 					VN_HOLD(vp);
   1112 					rw_exit(&sidp->sid_rwlock);
   1113 					break;
   1114 				}
   1115 			/* Mount point: */
   1116 			} else {
   1117 				if (sidp->sid_fvp == vp) {
   1118 					minor = sidp->sid_snapnumber;
   1119 					rw_exit(&sidp->sid_rwlock);
   1120 					break;
   1121 				}
   1122 			}
   1123 			rw_exit(&sidp->sid_rwlock);
   1124 		}
   1125 		mutex_exit(&snapshot_mutex);
   1126 		/* Verify minor got set correctly above */
   1127 		if (minor == SNAP_CTL_MINOR) {
   1128 			VN_RELE(vp);
   1129 			return (EINVAL);
   1130 		}
   1131 		dev = makedevice(major, minor);
   1132 		/*
   1133 		 * Create dummy vfs entry
   1134 		 * to use as a locking semaphore across the IOCTL
   1135 		 * for mount in progress cases...
   1136 		 */
   1137 		vfsp = vfs_alloc(KM_SLEEP);
   1138 		VFS_INIT(vfsp, vfsops, NULL);
   1139 		VFS_HOLD(vfsp);
   1140 		vfs_addmip(dev, vfsp);
   1141 		if ((vfs_devmounting(dev, vfsp)) ||
   1142 		    (vfs_devismounted(dev))) {
   1143 			vfs_delmip(vfsp);
   1144 			VFS_RELE(vfsp);
   1145 			VN_RELE(vp);
   1146 			return (EBUSY);
   1147 		}
   1148 		/*
   1149 		 * Nobody mounted but do not release mount in progress lock
   1150 		 * until IOCTL complete to prohibit a mount sneaking
   1151 		 * in
   1152 		 */
   1153 		error = VOP_IOCTL(vp, cmd, arg, 0, credp, rvalp, NULL);
   1154 		vfs_delmip(vfsp);
   1155 		VFS_RELE(vfsp);
   1156 		VN_RELE(vp);
   1157 		break;
   1158 	}
   1159 	default:
   1160 		cmn_err(CE_WARN, "snap_ioctl: Invalid ioctl cmd %d, minor %d.",
   1161 		    cmd, minor);
   1162 		return (EINVAL);
   1163 	}
   1164 
   1165 	return (error);
   1166 }
   1167 
   1168 
   1169 /* ************************************************************************ */
   1170 
   1171 /*
   1172  * Translation Table Routines
   1173  *
   1174  *    These support routines implement a simple doubly linked list
   1175  *    to keep track of chunks that are currently in memory.  The maximum
   1176  *    size of the list is determined by the fssnap_max_mem_chunks variable.
   1177  *    The cmap_rwlock is used to protect the linkage of the list.
   1178  */
   1179 
   1180 /*
   1181  * transtbl_add() - add a node to the translation table
   1182  *
   1183  *    allocates a new node and points it at the buffer passed in.  The node
   1184  *    is added to the beginning of the doubly linked list and the head of
   1185  *    the list is moved.  The cmap_rwlock must be held as a writer through
   1186  *    this operation.
   1187  */
   1188 static cow_map_node_t *
   1189 transtbl_add(cow_map_t *cmap, chunknumber_t chunk, caddr_t buf)
   1190 {
   1191 	cow_map_node_t	*cmnode;
   1192 
   1193 	ASSERT(RW_WRITE_HELD(&cmap->cmap_rwlock));
   1194 
   1195 	cmnode = kmem_alloc(sizeof (cow_map_node_t), KM_SLEEP);
   1196 
   1197 	/*
   1198 	 * insert new translations at the beginning so cmn_table is always
   1199 	 * the first node.
   1200 	 */
   1201 	cmnode->cmn_chunk = chunk;
   1202 	cmnode->cmn_buf = buf;
   1203 	cmnode->cmn_prev = NULL;
   1204 	cmnode->cmn_next = cmap->cmap_table;
   1205 	if (cmnode->cmn_next)
   1206 		cmnode->cmn_next->cmn_prev = cmnode;
   1207 	cmap->cmap_table = cmnode;
   1208 
   1209 	return (cmnode);
   1210 }
   1211 
   1212 /*
   1213  * transtbl_get() - look up a node in the translation table
   1214  *
   1215  *    called by the snapshot driver to find data that has been translated.
   1216  *    The lookup is done by the chunk number, and the node is returned.
   1217  *    If the node was not found, NULL is returned.
   1218  */
   1219 static cow_map_node_t *
   1220 transtbl_get(cow_map_t *cmap, chunknumber_t chunk)
   1221 {
   1222 	cow_map_node_t *cmn;
   1223 
   1224 	ASSERT(RW_READ_HELD(&cmap->cmap_rwlock));
   1225 	ASSERT(cmap);
   1226 
   1227 	/* search the translation table */
   1228 	for (cmn = cmap->cmap_table; cmn != NULL; cmn = cmn->cmn_next) {
   1229 		if (cmn->cmn_chunk == chunk)
   1230 			return (cmn);
   1231 	}
   1232 
   1233 	/* not found */
   1234 	return (NULL);
   1235 }
   1236 
   1237 /*
   1238  * transtbl_delete() - delete a node from the translation table
   1239  *
   1240  *    called when a node's data has been written out to disk.  The
   1241  *    cmap_rwlock must be held as a writer for this operation.  If the node
   1242  *    being deleted is the head of the list, then the head is moved to the
   1243  *    next node.  Both the node's data and the node itself are freed.
   1244  */
   1245 static void
   1246 transtbl_delete(cow_map_t *cmap, cow_map_node_t *cmn)
   1247 {
   1248 	ASSERT(RW_WRITE_HELD(&cmap->cmap_rwlock));
   1249 	ASSERT(cmn);
   1250 	ASSERT(cmap->cmap_table);
   1251 
   1252 	/* if the head of the list is being deleted, then move the head up */
   1253 	if (cmap->cmap_table == cmn) {
   1254 		ASSERT(cmn->cmn_prev == NULL);
   1255 		cmap->cmap_table = cmn->cmn_next;
   1256 	}
   1257 
   1258 
   1259 	/* make previous node's next pointer skip over current node */
   1260 	if (cmn->cmn_prev != NULL) {
   1261 		ASSERT(cmn->cmn_prev->cmn_next == cmn);
   1262 		cmn->cmn_prev->cmn_next = cmn->cmn_next;
   1263 	}
   1264 
   1265 	/* make next node's previous pointer skip over current node */
   1266 	if (cmn->cmn_next != NULL) {
   1267 		ASSERT(cmn->cmn_next->cmn_prev == cmn);
   1268 		cmn->cmn_next->cmn_prev = cmn->cmn_prev;
   1269 	}
   1270 
   1271 	/* free the data and the node */
   1272 	ASSERT(cmn->cmn_buf);
   1273 	kmem_free(cmn->cmn_buf, cmap->cmap_chunksz);
   1274 	kmem_free(cmn, sizeof (cow_map_node_t));
   1275 }
   1276 
   1277 /*
   1278  * transtbl_free() - free the entire translation table
   1279  *
   1280  *    called when the snapshot is deleted.  This frees all of the nodes in
   1281  *    the translation table (but not the bitmaps).
   1282  */
   1283 static void
   1284 transtbl_free(cow_map_t *cmap)
   1285 {
   1286 	cow_map_node_t	*curnode;
   1287 	cow_map_node_t	*tempnode;
   1288 
   1289 	for (curnode = cmap->cmap_table; curnode != NULL; curnode = tempnode) {
   1290 		tempnode = curnode->cmn_next;
   1291 
   1292 		kmem_free(curnode->cmn_buf, cmap->cmap_chunksz);
   1293 		kmem_free(curnode, sizeof (cow_map_node_t));
   1294 	}
   1295 }
   1296 
   1297 
   1298 /* ************************************************************************ */
   1299 
   1300 /*
   1301  * Interface Implementation Routines
   1302  *
   1303  * The following functions implement snapshot interface routines that are
   1304  * called by the file system to create, delete, and use a snapshot.  The
   1305  * interfaces are defined in fssnap_if.c and are filled in by this driver
   1306  * when it is loaded.  This technique allows the file system to depend on
   1307  * the interface module without having to load the full implementation and
   1308  * snapshot device drivers.
   1309  */
   1310 
   1311 /*
   1312  * fssnap_strategy_impl() - strategy routine called by the file system
   1313  *
   1314  *    called by the file system to handle copy-on-write when necessary.  All
   1315  *    reads and writes that the file system performs should go through this
   1316  *    function.  If the file system calls the underlying device's strategy
   1317  *    routine without going through fssnap_strategy() (eg. by calling
   1318  *    bdev_strategy()), the snapshot may not be consistent.
   1319  *
   1320  *    This function starts by doing significant sanity checking to insure
   1321  *    the snapshot was not deleted out from under it or deleted and then
   1322  *    recreated.  To do this, it checks the actual pointer passed into it
   1323  *    (ie. the handle held by the file system).  NOTE that the parameter is
   1324  *    a POINTER TO A POINTER to the snapshot id.  Once the snapshot id is
   1325  *    locked, it knows things are ok and that this snapshot is really for
   1326  *    this file system.
   1327  *
   1328  *    If the request is a write, fssnap_translate() is called to determine
   1329  *    whether a copy-on-write is required.  If it is a read, the read is
   1330  *    simply passed on to the underlying device.
   1331  */
   1332 static void
   1333 fssnap_strategy_impl(void *snapshot_id, buf_t *bp)
   1334 {
   1335 	struct snapshot_id **sidpp;
   1336 	struct snapshot_id *sidp;
   1337 	int error;
   1338 
   1339 	/* read requests are always passed through */
   1340 	if (bp->b_flags & B_READ) {
   1341 		(void) bdev_strategy(bp);
   1342 		return;
   1343 	}
   1344 
   1345 	/*
   1346 	 * Because we were not able to take the snapshot read lock BEFORE
   1347 	 * checking for a snapshot back in the file system, things may have
   1348 	 * drastically changed out from under us.  For instance, the snapshot
   1349 	 * may have been deleted, deleted and recreated, or worse yet, deleted
   1350 	 * for this file system but now the snapshot number is in use by another
   1351 	 * file system.
   1352 	 *
   1353 	 * Having a pointer to the file system's snapshot id pointer allows us
   1354 	 * to sanity check most of this, though it assumes the file system is
   1355 	 * keeping track of a pointer to the snapshot_id somewhere.
   1356 	 */
   1357 	sidpp = (struct snapshot_id **)snapshot_id;
   1358 	sidp = *sidpp;
   1359 
   1360 	/*
   1361 	 * if this file system's snapshot was disabled, just pass the
   1362 	 * request through.
   1363 	 */
   1364 	if (sidp == NULL) {
   1365 		(void) bdev_strategy(bp);
   1366 		return;
   1367 	}
   1368 
   1369 	/*
   1370 	 * Once we have the reader lock the snapshot will not magically go
   1371 	 * away.  But things may have changed on us before this so double check.
   1372 	 */
   1373 	rw_enter(&sidp->sid_rwlock, RW_READER);
   1374 
   1375 	/*
   1376 	 * if an error was founds somewhere the DELETE flag will be
   1377 	 * set to indicate the snapshot should be deleted and no new
   1378 	 * translations should occur.
   1379 	 */
   1380 	if (sidp->sid_flags & SID_DELETE) {
   1381 		rw_exit(&sidp->sid_rwlock);
   1382 		(void) fssnap_delete_impl(sidpp);
   1383 		(void) bdev_strategy(bp);
   1384 		return;
   1385 	}
   1386 
   1387 	/*
   1388 	 * If the file system is no longer pointing to the snapshot we were
   1389 	 * called with, then it should not attempt to translate this buffer as
   1390 	 * it may be going to a snapshot for a different file system.
   1391 	 * Even if the file system snapshot pointer is still the same, the
   1392 	 * snapshot may have been disabled before we got the reader lock.
   1393 	 */
   1394 	if (sidp != *sidpp || SID_INACTIVE(sidp)) {
   1395 		rw_exit(&sidp->sid_rwlock);
   1396 		(void) bdev_strategy(bp);
   1397 		return;
   1398 	}
   1399 
   1400 	/*
   1401 	 * At this point we're sure the snapshot will not go away while the
   1402 	 * reader lock is held, and we are reasonably certain that we are
   1403 	 * writing to the correct snapshot.
   1404 	 */
   1405 	if ((error = fssnap_translate(sidpp, bp)) != 0) {
   1406 		/*
   1407 		 * fssnap_translate can release the reader lock if it
   1408 		 * has to wait for a semaphore.  In this case it is possible
   1409 		 * for the snapshot to be deleted in this time frame.  If this
   1410 		 * happens just sent the buf thru to the filesystems device.
   1411 		 */
   1412 		if (sidp != *sidpp || SID_INACTIVE(sidp)) {
   1413 			rw_exit(&sidp->sid_rwlock);
   1414 			(void) bdev_strategy(bp);
   1415 			return;
   1416 		}
   1417 		bioerror(bp, error);
   1418 		biodone(bp);
   1419 	}
   1420 	rw_exit(&sidp->sid_rwlock);
   1421 }
   1422 
/*
 * fssnap_translate() - helper function for fssnap_strategy()
 *
 *    performs the actual copy-on-write for write requests, if required.
 *    This function does the real work of the file system side of things.
 *
 *    It first checks the candidate bitmap to quickly determine whether any
 *    action is necessary.  If the candidate bitmap indicates the chunk was
 *    allocated when the snapshot was created, then it checks to see whether
 *    a translation already exists.  If a translation already exists then no
 *    action is required.  If the chunk is a candidate for copy-on-write,
 *    and a translation does not already exist, then the chunk is read in
 *    and a node is added to the translation table.
 *
 *    Once all of the chunks in the request range have been copied (if they
 *    needed to be), then the original request can be satisfied and the old
 *    data can be overwritten.
 *
 *    sidpp - the file system's pointer to its snapshot id pointer; it is
 *	      re-read after sleeping to detect a deleted snapshot
 *    wbp   - the write request being issued by the file system
 *
 *    The caller must hold sid_rwlock as a reader.  The lock may be dropped
 *    and re-taken while waiting for the throttle semaphore, but it is held
 *    again on every return path.
 *
 *    Returns 0 once wbp has been handed to bdev_strategy().  On failure
 *    wbp has NOT been issued and the return value is either ENXIO (the
 *    snapshot changed or became inactive while the lock was dropped) or the
 *    error from reading the old data.
 */
static int
fssnap_translate(struct snapshot_id **sidpp, struct buf *wbp)
{
	snapshot_id_t	*sidp = *sidpp;
	struct buf	*oldbp;	/* buffer to store old data in */
	struct cow_info	*cowp = sidp->sid_cowinfo;
	cow_map_t	*cmap = &cowp->cow_map;
	cow_map_node_t	*cmn;
	chunknumber_t	cowchunk, startchunk, endchunk;
	int		error;
	int	throttle_write = 0;

	/* the caller must be holding the snapshot lock as a reader */
	ASSERT(RW_READ_HELD(&sidp->sid_rwlock));

	/* first and last chunk touched by the write (inclusive range) */
	startchunk = dbtocowchunk(cmap, wbp->b_lblkno);
	endchunk   = dbtocowchunk(cmap, wbp->b_lblkno +
	    ((wbp->b_bcount-1) >> DEV_BSHIFT));

	/*
	 * Do not throttle the writes of the fssnap taskq thread and
	 * the log roll (trans_roll) thread. Furthermore the writes to
	 * the on-disk log are also not subject to throttling.
	 * The fssnap_write_taskq thread's write can block on the throttling
	 * semaphore which leads to self-deadlock as this same thread
	 * releases the throttling semaphore after completing the IO.
	 * If the trans_roll thread's write is throttled then we can deadlock
	 * because the fssnap_taskq_thread which releases the throttling
	 * semaphore can block waiting for log space which can only be
	 * released by the trans_roll thread.
	 */

	throttle_write = !(taskq_member(cowp->cow_taskq, curthread) ||
	    tsd_get(bypass_snapshot_throttle_key));

	/*
	 * Iterate through all chunks covered by this write and perform the
	 * copy-aside if necessary.  Once all chunks have been safely
	 * stowed away, the new data may be written in a single sweep.
	 *
	 * For each chunk in the range, the following sequence is performed:
	 *	- Is the chunk a candidate for translation?
	 *		o If not, then no translation is necessary, continue
	 *	- If it is a candidate, then does it already have a translation?
	 *		o If so, then no translation is necessary, continue
	 *	- If it is a candidate, but does not yet have a translation,
	 *	  then read the old data and schedule an asynchronous taskq
	 *	  to write the old data to the backing file.
	 *
	 * Once this has been performed over the entire range of chunks, then
	 * it is safe to overwrite the data that is there.
	 *
	 * Note that no lock is required to check the candidate bitmap because
	 * it never changes once the snapshot is created.  The hastrans
	 * bitmap is first checked without cmap_rwlock as a quick filter.  If
	 * it turns out a copy may be required, then cmap_rwlock is taken as
	 * a writer, and the bitmap is re-checked as it may have changed in
	 * the meantime.  Finally, the write lock is held while
	 * reading the old data to make sure it is not translated out from
	 * under us.
	 *
	 * This locking mechanism should be sufficient to handle multiple
	 * threads writing to overlapping chunks simultaneously.
	 */
	for (cowchunk = startchunk; cowchunk <= endchunk; cowchunk++) {
		/*
		 * If the cowchunk is outside of the range of our
		 * candidate maps, then simply break out of the
		 * loop and pass the I/O through to bdev_strategy.
		 * This would occur if the file system has grown
		 * larger since the snapshot was taken.
		 */
		if (cowchunk >= (cmap->cmap_bmsize * NBBY))
			break;

		/*
		 * If no disk blocks were allocated in this chunk when the
		 * snapshot was created then no copy-on-write will be
		 * required.  Since this bitmap is read-only no locks are
		 * necessary.
		 */
		if (isclr(cmap->cmap_candidate, cowchunk)) {
			continue;
		}

		/*
		 * If a translation already exists, the data can be written
		 * through since the old data has already been saved off.
		 * This check is made without cmap_rwlock; it is repeated
		 * under the writer lock below before any copy is made.
		 */
		if (isset(cmap->cmap_hastrans, cowchunk)) {
			continue;
		}


		/*
		 * Throttle translations if there are too many outstanding
		 * chunks in memory.  The semaphore is sema_v'd by the taskq.
		 *
		 * You can't keep the sid_rwlock if you would go to sleep.
		 * This will result in deadlock when someone tries to delete
		 * the snapshot (wants the sid_rwlock as a writer, but can't
		 * get it).
		 */
		if (throttle_write) {
			if (sema_tryp(&cmap->cmap_throttle_sem) == 0) {
				rw_exit(&sidp->sid_rwlock);
				atomic_add_32(&cmap->cmap_waiters, 1);
				sema_p(&cmap->cmap_throttle_sem);
				atomic_add_32(&cmap->cmap_waiters, -1);
				rw_enter(&sidp->sid_rwlock, RW_READER);

				/*
				 * Now since we released the sid_rwlock the
				 * state may have transitioned underneath us,
				 * so check that again.  The semaphore just
				 * obtained is given back before bailing out.
				 */
				if (sidp != *sidpp || SID_INACTIVE(sidp)) {
					sema_v(&cmap->cmap_throttle_sem);
					return (ENXIO);
				}
			}
		}

		/*
		 * Acquire the lock as a writer and check to see if a
		 * translation has been added in the meantime.  If so, the
		 * throttle slot taken above is not needed and is returned.
		 */
		rw_enter(&cmap->cmap_rwlock, RW_WRITER);
		if (isset(cmap->cmap_hastrans, cowchunk)) {
			if (throttle_write)
				sema_v(&cmap->cmap_throttle_sem);
			rw_exit(&cmap->cmap_rwlock);
			continue; /* go to the next chunk */
		}

		/*
		 * read a full chunk of data from the requested offset rounded
		 * down to the nearest chunk size.  The data buffer allocated
		 * here is handed to the translation table on success.
		 */
		oldbp = getrbuf(KM_SLEEP);
		oldbp->b_lblkno = cowchunktodb(cmap, cowchunk);
		oldbp->b_edev = wbp->b_edev;
		oldbp->b_bcount = cmap->cmap_chunksz;
		oldbp->b_bufsize = cmap->cmap_chunksz;
		oldbp->b_iodone = NULL;
		oldbp->b_proc = NULL;
		oldbp->b_flags = B_READ;
		oldbp->b_un.b_addr = kmem_alloc(cmap->cmap_chunksz, KM_SLEEP);

		(void) bdev_strategy(oldbp);
		(void) biowait(oldbp);

		/*
		 * It's ok to bail in the middle of translating the range
		 * because the extra copy-asides will not hurt anything
		 * (except by using extra space in the backing store).
		 * Everything acquired for this chunk (data buffer, buf
		 * header, writer lock, throttle slot) is released first.
		 */
		if ((error = geterror(oldbp)) != 0) {
			cmn_err(CE_WARN, "fssnap_translate: error reading "
			    "old data for snapshot %d, chunk %llu, disk block "
			    "%lld, size %lu, error %d.", sidp->sid_snapnumber,
			    cowchunk, oldbp->b_lblkno, oldbp->b_bcount, error);
			kmem_free(oldbp->b_un.b_addr, cmap->cmap_chunksz);
			freerbuf(oldbp);
			rw_exit(&cmap->cmap_rwlock);
			if (throttle_write)
				sema_v(&cmap->cmap_throttle_sem);
			return (error);
		}

		/*
		 * add the node to the translation table and save a reference
		 * to pass to the taskq for writing out to the backing file.
		 * Only the buf header is freed; the data buffer now belongs
		 * to the translation node.
		 */
		cmn = transtbl_add(cmap, cowchunk, oldbp->b_un.b_addr);
		freerbuf(oldbp);

		/*
		 * Add a reference to the snapshot id so the lower level
		 * processing (ie. the taskq) can get back to the state
		 * information.  release_sem records whether this translation
		 * took a throttle slot that must later be given back.
		 */
		cmn->cmn_sid = sidp;
		cmn->release_sem = throttle_write;
		setbit(cmap->cmap_hastrans, cowchunk);

		rw_exit(&cmap->cmap_rwlock);

		/*
		 * schedule the asynchronous write to the backing file
		 *
		 * NOTE(review): when cow_backfile_array is NULL no task is
		 * dispatched, so nothing visible here gives back the
		 * throttle slot taken for this chunk - confirm that this
		 * case is handled elsewhere.
		 */
		if (cowp->cow_backfile_array != NULL)
			(void) taskq_dispatch(cowp->cow_taskq,
			    fssnap_write_taskq, cmn, TQ_SLEEP);
	}

	/*
	 * Write new data in place of the old data.  At this point all of the
	 * chunks touched by this write have been copied aside and so the new
	 * data can be written out all at once.
	 */
	(void) bdev_strategy(wbp);

	return (0);
}
   1645 
   1646 /*
   1647  * fssnap_write_taskq() - write in-memory translations to the backing file
   1648  *
   1649  *    writes in-memory translations to the backing file asynchronously.  A
   1650  *    task is dispatched each time a new translation is created.  The task
   1651  *    writes the data to the backing file and removes it from the memory
   1652  *    list. The throttling semaphore is released only if the particular
   1653  *    translation was throttled in fssnap_translate.
   1654  */
   1655 static void
   1656 fssnap_write_taskq(void *arg)
   1657 {
   1658 	cow_map_node_t	*cmn = (cow_map_node_t *)arg;
   1659 	snapshot_id_t	*sidp = cmn->cmn_sid;
   1660 	cow_info_t	*cowp = sidp->sid_cowinfo;
   1661 	cow_map_t	*cmap = &cowp->cow_map;
   1662 	int		error;
   1663 	int		bf_index;
   1664 	int		release_sem = cmn->release_sem;
   1665 
   1666 	/*
   1667 	 * The sid_rwlock does not need to be held here because the taskqs
   1668 	 * are destroyed explicitly by fssnap_delete (with the sid_rwlock
   1669 	 * held as a writer).  taskq_destroy() will flush all of the tasks
   1670 	 * out before fssnap_delete frees up all of the structures.
   1671 	 */
   1672 
   1673 	/* if the snapshot was disabled from under us, drop the request. */
   1674 	rw_enter(&sidp->sid_rwlock, RW_READER);
   1675 	if (SID_INACTIVE(sidp)) {
   1676 		rw_exit(&sidp->sid_rwlock);
   1677 		if (release_sem)
   1678 			sema_v(&cmap->cmap_throttle_sem);
   1679 		return;
   1680 	}
   1681 	rw_exit(&sidp->sid_rwlock);
   1682 
   1683 	atomic_add_64((uint64_t *)&cmap->cmap_nchunks, 1);
   1684 
   1685 	if ((cmap->cmap_maxsize != 0) &&
   1686 	    ((cmap->cmap_nchunks * cmap->cmap_chunksz) > cmap->cmap_maxsize)) {
   1687 		cmn_err(CE_WARN, "fssnap_write_taskq: snapshot %d (%s) has "
   1688 		    "reached the maximum backing file size specified (%llu "
   1689 		    "bytes) and will be deleted.", sidp->sid_snapnumber,
   1690 		    (char *)cowp->cow_kstat_mntpt->ks_data,
   1691 		    cmap->cmap_maxsize);
   1692 		if (release_sem)
   1693 			sema_v(&cmap->cmap_throttle_sem);
   1694 		atomic_or_uint(&sidp->sid_flags, SID_DELETE);
   1695 		return;
   1696 	}
   1697 
   1698 	/* perform the write */
   1699 	bf_index = cmn->cmn_chunk / cmap->cmap_chunksperbf;
   1700 
   1701 	if (error = vn_rdwr(UIO_WRITE, (cowp->cow_backfile_array)[bf_index],
   1702 	    cmn->cmn_buf, cmap->cmap_chunksz,
   1703 	    (cmn->cmn_chunk % cmap->cmap_chunksperbf) * cmap->cmap_chunksz,
   1704 	    UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, (ssize_t *)NULL)) {
   1705 		cmn_err(CE_WARN, "fssnap_write_taskq: error writing to "
   1706 		    "backing file.  DELETING SNAPSHOT %d, backing file path "
   1707 		    "%s, offset %llu bytes, error %d.", sidp->sid_snapnumber,
   1708 		    (char *)cowp->cow_kstat_bfname->ks_data,
   1709 		    cmn->cmn_chunk * cmap->cmap_chunksz, error);
   1710 		if (release_sem)
   1711 			sema_v(&cmap->cmap_throttle_sem);
   1712 		atomic_or_uint(&sidp->sid_flags, SID_DELETE);
   1713 		return;
   1714 	}
   1715 
   1716 	/*
   1717 	 * now remove the node and buffer from memory
   1718 	 */
   1719 	rw_enter(&cmap->cmap_rwlock, RW_WRITER);
   1720 	transtbl_delete(cmap, cmn);
   1721 	rw_exit(&cmap->cmap_rwlock);
   1722 
   1723 	/* Allow more translations */
   1724 	if (release_sem)
   1725 		sema_v(&cmap->cmap_throttle_sem);
   1726 
   1727 }
   1728 
   1729 /*
   1730  * fssnap_create_impl() - called from the file system to create a new snapshot
   1731  *
   1732  *    allocates and initializes the structures needed for a new snapshot.
   1733  *    This is called by the file system when it receives an ioctl request to
   1734  *    create a new snapshot.  An unused snapshot identifier is either found
   1735  *    or created, and eventually returned as the opaque handle the file
   1736  *    system will use to identify this snapshot.  The snapshot number
   1737  *    associated with the snapshot identifier is the same as the minor
   1738  *    number for the snapshot device that is used to access that snapshot.
   1739  *
   1740  *    The snapshot can not be used until the candidate bitmap is populated
   1741  *    by the file system (see fssnap_set_candidate_impl()), and the file
   1742  *    system finishes the setup process by calling fssnap_create_done().
   1743  *    Nearly all of the snapshot locks are held for the duration of the
   1744  *    create, and are not released until fssnap_create_done is called().
   1745  */
   1746 static void *
   1747 fssnap_create_impl(chunknumber_t nchunks, uint_t chunksz, u_offset_t maxsize,
   1748     struct vnode *fsvp, int backfilecount, struct vnode **bfvpp, char *backpath,
   1749     u_offset_t max_backfile_size)
   1750 {
   1751 	refstr_t *mountpoint;
   1752 	char taskqname[50];
   1753 	struct cow_info *cowp;
   1754 	struct cow_map	*cmap;
   1755 	struct snapshot_id *sidp;
   1756 	int lastsnap;
   1757 
   1758 	/*
   1759 	 * Sanity check the parameters we care about
   1760 	 * (we don't care about the informational parameters)
   1761 	 */
   1762 	if ((nchunks == 0) ||
   1763 	    ((chunksz % DEV_BSIZE) != 0) ||
   1764 	    (bfvpp == NULL)) {
   1765 		return (NULL);
   1766 	}
   1767 
   1768 	/*
   1769 	 * Look for unused snapshot identifiers.  Snapshot ids are never
   1770 	 * freed, but deleted snapshot ids will be recycled as needed.
   1771 	 */
   1772 	mutex_enter(&snapshot_mutex);
   1773 
   1774 findagain:
   1775 	lastsnap = 0;
   1776 	for (sidp = snapshot; sidp != NULL; sidp = sidp->sid_next) {
   1777 		if (sidp->sid_snapnumber > lastsnap)
   1778 			lastsnap = sidp->sid_snapnumber;
   1779 
   1780 		/*
   1781 		 * The sid_rwlock is taken as a reader initially so that
   1782 		 * activity on each snapshot is not stalled while searching
   1783 		 * for a free snapshot id.
   1784 		 */
   1785 		rw_enter(&sidp->sid_rwlock, RW_READER);
   1786 
   1787 		/*
   1788 		 * If the snapshot has been deleted and nobody is using the
   1789 		 * snapshot device than we can reuse this snapshot_id.  If
   1790 		 * the snapshot is marked to be deleted (SID_DELETE), then
   1791 		 * it hasn't been deleted yet so don't reuse it.
   1792 		 */
   1793 		if (SID_AVAILABLE(sidp))
   1794 			break; /* This spot is unused, so take it */
   1795 		rw_exit(&sidp->sid_rwlock);
   1796 	}
   1797 
   1798 	/*
   1799 	 * add a new snapshot identifier if there are no deleted
   1800 	 * entries.  Since it doesn't matter what order the entries
   1801 	 * are in we can just add it to the beginning of the list.
   1802 	 */
   1803 	if (sidp) {
   1804 		if (rw_tryupgrade(&sidp->sid_rwlock) == 0) {
   1805 			/* someone else grabbed it as a writer, try again */
   1806 			rw_exit(&sidp->sid_rwlock);
   1807 			goto findagain;
   1808 		}
   1809 	} else {
   1810 		/* Create a new node if we didn't find an unused one */
   1811 		sidp = kmem_alloc(sizeof (struct snapshot_id), KM_SLEEP);
   1812 		rw_init(&sidp->sid_rwlock, NULL, RW_DEFAULT, NULL);
   1813 		rw_enter(&sidp->sid_rwlock, RW_WRITER);
   1814 		sidp->sid_snapnumber = (snapshot == NULL) ? 0 : lastsnap + 1;
   1815 		sidp->sid_cowinfo = NULL;
   1816 		sidp->sid_flags = 0;
   1817 		sidp->sid_next = snapshot;
   1818 		snapshot = sidp;
   1819 	}
   1820 
   1821 	ASSERT(RW_WRITE_HELD(&sidp->sid_rwlock));
   1822 	ASSERT(sidp->sid_cowinfo == NULL);
   1823 	ASSERT(sidp->sid_snapnumber <= (lastsnap + 1));
   1824 
   1825 	sidp->sid_flags |= SID_CREATING;
   1826 	/* The root vnode is held until snap_delete_impl() is called */
   1827 	VN_HOLD(fsvp);
   1828 	sidp->sid_fvp = fsvp;
   1829 	num_snapshots++;
   1830 
   1831 	/* allocate and initialize structures */
   1832 
   1833 	cowp = kmem_zalloc(sizeof (struct cow_info), KM_SLEEP);
   1834 
   1835 	cowp->cow_backfile_array = bfvpp;
   1836 	cowp->cow_backcount = backfilecount;
   1837 	cowp->cow_backfile_sz = max_backfile_size;
   1838 
   1839 	/*
   1840 	 * Initialize task queues for this snapshot.  Only a small number
   1841 	 * of threads are required because they will be serialized on the
   1842 	 * backing file's reader/writer lock anyway.
   1843 	 */
   1844 	(void) snprintf(taskqname, sizeof (taskqname), "%s_taskq_%d", snapname,
   1845 	    sidp->sid_snapnumber);
   1846 	cowp->cow_taskq = taskq_create(taskqname, fssnap_taskq_nthreads,
   1847 	    minclsyspri, 1,  fssnap_taskq_maxtasks, 0);
   1848 
   1849 	/* don't allow tasks to start until after everything is ready */
   1850 	taskq_suspend(cowp->cow_taskq);
   1851 
   1852 	/* initialize translation table */
   1853 	cmap = &cowp->cow_map;
   1854 	rw_init(&cmap->cmap_rwlock, NULL, RW_DEFAULT, NULL);
   1855 	rw_enter(&cmap->cmap_rwlock, RW_WRITER);
   1856 
   1857 	sema_init(&cmap->cmap_throttle_sem, fssnap_max_mem_chunks, NULL,
   1858 	    SEMA_DEFAULT, NULL);
   1859 
   1860 	cmap->cmap_chunksz = chunksz;
   1861 	cmap->cmap_maxsize = maxsize;
   1862 	cmap->cmap_chunksperbf = max_backfile_size / chunksz;
   1863 
   1864 	/*
   1865 	 * allocate one bit per chunk for the bitmaps, round up
   1866 	 */
   1867 	cmap->cmap_bmsize = (nchunks + (NBBY - 1)) / NBBY;
   1868 	cmap->cmap_hastrans  = kmem_zalloc(cmap->cmap_bmsize, KM_SLEEP);
   1869 	cmap->cmap_candidate = kmem_zalloc(cmap->cmap_bmsize, KM_SLEEP);
   1870 
   1871 	sidp->sid_cowinfo = cowp;
   1872 
   1873 	/* initialize kstats for this snapshot */
   1874 	mountpoint = vfs_getmntpoint(fsvp->v_vfsp);
   1875 	fssnap_create_kstats(sidp, sidp->sid_snapnumber,
   1876 	    refstr_value(mountpoint), backpath);
   1877 	refstr_rele(mountpoint);
   1878 
   1879 	mutex_exit(&snapshot_mutex);
   1880 
   1881 	/*
   1882 	 * return with snapshot id rwlock held as a writer until
   1883 	 * fssnap_create_done is called
   1884 	 */
   1885 	return (sidp);
   1886 }
   1887 
   1888 /*
   1889  * fssnap_set_candidate_impl() - mark a chunk as a candidate for copy-on-write
   1890  *
   1891  *    sets a bit in the candidate bitmap that indicates that a chunk is a
   1892  *    candidate for copy-on-write.  Typically, chunks that are allocated on
   1893  *    the file system at the time the snapshot is taken are candidates,
   1894  *    while chunks that have no allocated data do not need to be copied.
   1895  *    Chunks containing metadata must be marked as candidates as well.
   1896  */
   1897 static void
   1898 fssnap_set_candidate_impl(void *snapshot_id, chunknumber_t chunknumber)
   1899 {
   1900 	struct snapshot_id	*sid = snapshot_id;
   1901 	struct cow_info *cowp = sid->sid_cowinfo;
   1902 	struct cow_map	*cmap = &cowp->cow_map;
   1903 
   1904 	/* simple bitmap operation for now */
   1905 	ASSERT(chunknumber < (cmap->cmap_bmsize * NBBY));
   1906 	setbit(cmap->cmap_candidate, chunknumber);
   1907 }
   1908 
   1909 /*
   1910  * fssnap_is_candidate_impl() - check whether a chunk is a candidate
   1911  *
   1912  *    returns 0 if the chunk is not a candidate and 1 if the chunk is a
   1913  *    candidate.  This can be used by the file system to change behavior for
   1914  *    chunks that might induce a copy-on-write.  The offset is specified in
   1915  *    bytes since the chunk size may not be known by the file system.
   1916  */
   1917 static int
   1918 fssnap_is_candidate_impl(void *snapshot_id, u_offset_t off)
   1919 {
   1920 	struct snapshot_id	*sid = snapshot_id;
   1921 	struct cow_info *cowp = sid->sid_cowinfo;
   1922 	struct cow_map	*cmap = &cowp->cow_map;
   1923 	ulong_t chunknumber = off / cmap->cmap_chunksz;
   1924 
   1925 	/* simple bitmap operation for now */
   1926 	ASSERT(chunknumber < (cmap->cmap_bmsize * NBBY));
   1927 	return (isset(cmap->cmap_candidate, chunknumber));
   1928 }
   1929 
   1930 /*
   1931  * fssnap_create_done_impl() - complete the snapshot setup process
   1932  *
   1933  *    called when the file system is done populating the candidate bitmap
   1934  *    and it is ready to start using the snapshot.  This routine releases
   1935  *    the snapshot locks, allows taskq tasks to start processing, and
   1936  *    creates the device minor nodes associated with the snapshot.
   1937  */
   1938 static int
   1939 fssnap_create_done_impl(void *snapshot_id)
   1940 {
   1941 	struct snapshot_id	**sidpp, *sidp = snapshot_id;
   1942 	struct cow_info		*cowp;
   1943 	struct cow_map		*cmap;
   1944 	int			snapnumber = -1;
   1945 	char			name[20];
   1946 
   1947 	/* sid rwlock and cmap rwlock should be taken from fssnap_create */
   1948 	ASSERT(sidp);
   1949 	ASSERT(RW_WRITE_HELD(&sidp->sid_rwlock));
   1950 	ASSERT(sidp->sid_cowinfo);
   1951 
   1952 	cowp = sidp->sid_cowinfo;
   1953 	cmap = &cowp->cow_map;
   1954 
   1955 	ASSERT(RW_WRITE_HELD(&cmap->cmap_rwlock));
   1956 
   1957 	sidp->sid_flags &= ~(SID_CREATING | SID_DISABLED);
   1958 	snapnumber = sidp->sid_snapnumber;
   1959 
   1960 	/* allocate state structure and find new snapshot id */
   1961 	if (ddi_soft_state_zalloc(statep, snapnumber) != DDI_SUCCESS) {
   1962 		cmn_err(CE_WARN,
   1963 		    "snap_ioctl: create: could not allocate "
   1964 		    "state for snapshot %d.", snapnumber);
   1965 		snapnumber = -1;
   1966 		goto out;
   1967 	}
   1968 
   1969 	sidpp = ddi_get_soft_state(statep, snapnumber);
   1970 	*sidpp = sidp;
   1971 
   1972 	/* create minor node based on snapshot number */
   1973 	ASSERT(fssnap_dip != NULL);
   1974 	(void) snprintf(name, sizeof (name), "%d", snapnumber);
   1975 	if (ddi_create_minor_node(fssnap_dip, name, S_IFBLK,
   1976 	    snapnumber, DDI_PSEUDO, 0) != DDI_SUCCESS) {
   1977 		cmn_err(CE_WARN, "snap_ioctl: could not create "
   1978 		    "block minor node for snapshot %d.", snapnumber);
   1979 		snapnumber = -1;
   1980 		goto out;
   1981 	}
   1982 
   1983 	(void) snprintf(name, sizeof (name), "%d,raw", snapnumber);
   1984 	if (ddi_create_minor_node(fssnap_dip, name, S_IFCHR,
   1985 	    snapnumber, DDI_PSEUDO, 0) != DDI_SUCCESS) {
   1986 		cmn_err(CE_WARN, "snap_ioctl: could not create "
   1987 		    "character minor node for snapshot %d.", snapnumber);
   1988 		snapnumber = -1;
   1989 	}
   1990 
   1991 out:
   1992 	rw_exit(&sidp->sid_rwlock);
   1993 	rw_exit(&cmap->cmap_rwlock);
   1994 
   1995 	/* let the taskq threads start processing */
   1996 	taskq_resume(cowp->cow_taskq);
   1997 
   1998 	return (snapnumber);
   1999 }
   2000 
   2001 /*
   2002  * fssnap_delete_impl() - delete a snapshot
   2003  *
   2004  *    used when a snapshot is no longer needed.  This is called by the file
   2005  *    system when it receives an ioctl request to delete a snapshot.  It is
   2006  *    also called internally when error conditions such as disk full, errors
   2007  *    writing to the backing file, or backing file maxsize exceeded occur.
   2008  *    If the snapshot device is busy when the delete request is received,
   2009  *    all state will be deleted except for the soft state and device files
   2010  *    associated with the snapshot; they will be deleted when the snapshot
   2011  *    device is closed.
   2012  *
   2013  *    NOTE this function takes a POINTER TO A POINTER to the snapshot id,
   2014  *    and expects to be able to set the handle held by the file system to
   2015  *    NULL.  This depends on the file system checking that variable for NULL
   2016  *    before calling fssnap_strategy().
   2017  */
   2018 static int
   2019 fssnap_delete_impl(void *snapshot_id)
   2020 {
   2021 	struct snapshot_id	**sidpp = (struct snapshot_id **)snapshot_id;
   2022 	struct snapshot_id	*sidp;
   2023 	struct snapshot_id	**statesidpp;
   2024 	struct cow_info		*cowp;
   2025 	struct cow_map		*cmap;
   2026 	char			name[20];
   2027 	int			snapnumber = -1;
   2028 	vnode_t			**vpp;
   2029 
   2030 	/*
   2031 	 * sidp is guaranteed to be valid if sidpp is valid because
   2032 	 * the snapshot list is append-only.
   2033 	 */
   2034 	if (sidpp == NULL) {
   2035 		return (-1);
   2036 	}
   2037 
   2038 	sidp = *sidpp;
   2039 	rw_enter(&sidp->sid_rwlock, RW_WRITER);
   2040 
   2041 	ASSERT(RW_WRITE_HELD(&sidp->sid_rwlock));
   2042 
   2043 	/*
   2044 	 * double check that the snapshot is still valid for THIS file system
   2045 	 */
   2046 	if (*sidpp == NULL) {
   2047 		rw_exit(&sidp->sid_rwlock);
   2048 		return (-1);
   2049 	}
   2050 
   2051 	/*
   2052 	 * Now we know the snapshot is still valid and will not go away
   2053 	 * because we have the write lock.  Once the state is transitioned
   2054 	 * to "disabling", the sid_rwlock can be released.  Any pending I/O
   2055 	 * waiting for the lock as a reader will check for this state and
   2056 	 * abort without touching data that may be getting freed.
   2057 	 */
   2058 	sidp->sid_flags |= SID_DISABLING;
   2059 	if (sidp->sid_flags & SID_DELETE) {
   2060 		cmn_err(CE_WARN, "Snapshot %d automatically deleted.",
   2061 		    sidp->sid_snapnumber);
   2062 		sidp->sid_flags &= ~(SID_DELETE);
   2063 	}
   2064 
   2065 
   2066 	/*
   2067 	 * This is pointing into file system specific data!  The assumption is
   2068 	 * that fssnap_strategy() gets called from the file system based on
   2069 	 * whether this reference to the snapshot_id is NULL or not.  So
   2070 	 * setting this to NULL should disable snapshots for the file system.
   2071 	 */
   2072 	*sidpp = NULL;
   2073 
   2074 	/* remove cowinfo */
   2075 	cowp = sidp->sid_cowinfo;
   2076 	if (cowp == NULL) {
   2077 		rw_exit(&sidp->sid_rwlock);
   2078 		return (-1);
   2079 	}
   2080 	rw_exit(&sidp->sid_rwlock);
   2081 
   2082 	/* destroy task queues first so they don't reference freed data. */
   2083 	if (cowp->cow_taskq) {
   2084 		taskq_destroy(cowp->cow_taskq);
   2085 		cowp->cow_taskq = NULL;
   2086 	}
   2087 
   2088 	if (cowp->cow_backfile_array != NULL) {
   2089 		for (vpp = cowp->cow_backfile_array; *vpp; vpp++)
   2090 			VN_RELE(*vpp);
   2091 		kmem_free(cowp->cow_backfile_array,
   2092 		    (cowp->cow_backcount + 1) * sizeof (vnode_t *));
   2093 		cowp->cow_backfile_array = NULL;
   2094 	}
   2095 
   2096 	sidp->sid_cowinfo = NULL;
   2097 
   2098 	/* remove cmap */
   2099 	cmap = &cowp->cow_map;
   2100 	ASSERT(cmap);
   2101 
   2102 	if (cmap->cmap_candidate)
   2103 		kmem_free(cmap->cmap_candidate, cmap->cmap_bmsize);
   2104 
   2105 	if (cmap->cmap_hastrans)
   2106 		kmem_free(cmap->cmap_hastrans, cmap->cmap_bmsize);
   2107 
   2108 	if (cmap->cmap_table)
   2109 		transtbl_free(&cowp->cow_map);
   2110 
   2111 	rw_destroy(&cmap->cmap_rwlock);
   2112 
   2113 	while (cmap->cmap_waiters) {
   2114 		sema_p(&cmap->cmap_throttle_sem);
   2115 		sema_v(&cmap->cmap_throttle_sem);
   2116 	}
   2117 	sema_destroy(&cmap->cmap_throttle_sem);
   2118 
   2119 	/* remove kstats */
   2120 	fssnap_delete_kstats(cowp);
   2121 
   2122 	kmem_free(cowp, sizeof (struct cow_info));
   2123 
   2124 	statesidpp = ddi_get_soft_state(statep, sidp->sid_snapnumber);
   2125 	if (statesidpp == NULL || *statesidpp == NULL) {
   2126 		cmn_err(CE_WARN,
   2127 		    "fssnap_delete_impl: could not find state for snapshot %d.",
   2128 		    sidp->sid_snapnumber);
   2129 	}
   2130 	ASSERT(*statesidpp == sidp);
   2131 
   2132 	/*
   2133 	 * Leave the node in the list marked DISABLED so it can be reused
   2134 	 * and avoid many race conditions.  Return the snapshot number
   2135 	 * that was deleted.
   2136 	 */
   2137 	mutex_enter(&snapshot_mutex);
   2138 	rw_enter(&sidp->sid_rwlock, RW_WRITER);
   2139 	sidp->sid_flags &= ~(SID_DISABLING);
   2140 	sidp->sid_flags |= SID_DISABLED;
   2141 	VN_RELE(sidp->sid_fvp);
   2142 	sidp->sid_fvp = NULL;
   2143 	snapnumber = sidp->sid_snapnumber;
   2144 
   2145 	/*
   2146 	 * If the snapshot is not busy, free the device info now.  Otherwise
   2147 	 * the device nodes are freed in snap_close() when the device is
   2148 	 * closed.  The sid will not be reused until the device is not busy.
   2149 	 */
   2150 	if (SID_AVAILABLE(sidp)) {
   2151 		/* remove the device nodes */
   2152 		ASSERT(fssnap_dip != NULL);
   2153 		(void) snprintf(name, sizeof (name), "%d",
   2154 		    sidp->sid_snapnumber);
   2155 		ddi_remove_minor_node(fssnap_dip, name);
   2156 		(void) snprintf(name, sizeof (name), "%d,raw",
   2157 		    sidp->sid_snapnumber);
   2158 		ddi_remove_minor_node(fssnap_dip, name);
   2159 
   2160 		/* delete the state structure */
   2161 		ddi_soft_state_free(statep, sidp->sid_snapnumber);
   2162 		num_snapshots--;
   2163 	}
   2164 
   2165 	mutex_exit(&snapshot_mutex);
   2166 	rw_exit(&sidp->sid_rwlock);
   2167 
   2168 	return (snapnumber);
   2169 }
   2170 
   2171 /*
   2172  * fssnap_create_kstats() - allocate and initialize snapshot kstats
   2173  *
   2174  */
   2175 static void
   2176 fssnap_create_kstats(snapshot_id_t *sidp, int snapnum,
   2177     const char *mountpoint, const char *backfilename)
   2178 {
   2179 	kstat_t *num, *mntpoint, *bfname;
   2180 	kstat_named_t *hw;
   2181 	struct cow_info *cowp = sidp->sid_cowinfo;
   2182 	struct cow_kstat_num *stats;
   2183 
   2184 	/* update the high water mark */
   2185 	if (fssnap_highwater_kstat == NULL) {
   2186 		cmn_err(CE_WARN, "fssnap_create_kstats: failed to lookup "
   2187 		    "high water mark kstat.");
   2188 		return;
   2189 	}
   2190 
   2191 	hw = (kstat_named_t *)fssnap_highwater_kstat->ks_data;
   2192 	if (hw->value.ui32 < snapnum)
   2193 		hw->value.ui32 = snapnum;
   2194 
   2195 	/* initialize the mount point kstat */
   2196 	kstat_delete_byname(snapname, snapnum, FSSNAP_KSTAT_MNTPT);
   2197 
   2198 	if (mountpoint != NULL) {
   2199 		mntpoint = kstat_create(snapname, snapnum, FSSNAP_KSTAT_MNTPT,
   2200 		    "misc", KSTAT_TYPE_RAW, strlen(mountpoint) + 1, 0);
   2201 		if (mntpoint == NULL) {
   2202 			cowp->cow_kstat_mntpt = NULL;
   2203 			cmn_err(CE_WARN, "fssnap_create_kstats: failed to "
   2204 			    "create mount point kstat");
   2205 		} else {
   2206 			(void) strncpy(mntpoint->ks_data, mountpoint,
   2207 			    strlen(mountpoint));
   2208 			cowp->cow_kstat_mntpt = mntpoint;
   2209 			kstat_install(mntpoint);
   2210 		}
   2211 	} else {
   2212 		cowp->cow_kstat_mntpt = NULL;
   2213 		cmn_err(CE_WARN, "fssnap_create_kstats: mount point not "
   2214 		    "specified.");
   2215 	}
   2216 
   2217 	/* initialize the backing file kstat */
   2218 	kstat_delete_byname(snapname, snapnum, FSSNAP_KSTAT_BFNAME);
   2219 
   2220 	if (backfilename == NULL) {
   2221 		cowp->cow_kstat_bfname = NULL;
   2222 	} else {
   2223 		bfname = kstat_create(snapname, snapnum, FSSNAP_KSTAT_BFNAME,
   2224 		    "misc", KSTAT_TYPE_RAW, strlen(backfilename) + 1, 0);
   2225 		if (bfname != NULL) {
   2226 			(void) strncpy(bfname->ks_data, backfilename,
   2227 			    strlen(backfilename));
   2228 			cowp->cow_kstat_bfname = bfname;
   2229 			kstat_install(bfname);
   2230 		} else {
   2231 			cowp->cow_kstat_bfname = NULL;
   2232 			cmn_err(CE_WARN, "fssnap_create_kstats: failed to "
   2233 			    "create backing file name kstat");
   2234 		}
   2235 	}
   2236 
   2237 	/* initialize numeric kstats */
   2238 	kstat_delete_byname(snapname, snapnum, FSSNAP_KSTAT_NUM);
   2239 
   2240 	num = kstat_create(snapname, snapnum, FSSNAP_KSTAT_NUM,
   2241 	    "misc", KSTAT_TYPE_NAMED,
   2242 	    sizeof (struct cow_kstat_num) / sizeof (kstat_named_t),
   2243 	    0);
   2244 	if (num == NULL) {
   2245 		cmn_err(CE_WARN, "fssnap_create_kstats: failed to create "
   2246 		    "numeric kstats");
   2247 		cowp->cow_kstat_num = NULL;
   2248 		return;
   2249 	}
   2250 
   2251 	cowp->cow_kstat_num = num;
   2252 	stats = num->ks_data;
   2253 	num->ks_update = fssnap_update_kstat_num;
   2254 	num->ks_private = sidp;
   2255 
   2256 	kstat_named_init(&stats->ckn_state, FSSNAP_KSTAT_NUM_STATE,
   2257 	    KSTAT_DATA_INT32);
   2258 	kstat_named_init(&stats->ckn_bfsize, FSSNAP_KSTAT_NUM_BFSIZE,
   2259 	    KSTAT_DATA_UINT64);
   2260 	kstat_named_init(&stats->ckn_maxsize, FSSNAP_KSTAT_NUM_MAXSIZE,
   2261 	    KSTAT_DATA_UINT64);
   2262 	kstat_named_init(&stats->ckn_createtime, FSSNAP_KSTAT_NUM_CREATETIME,
   2263 	    KSTAT_DATA_LONG);
   2264 	kstat_named_init(&stats->ckn_chunksize, FSSNAP_KSTAT_NUM_CHUNKSIZE,
   2265 	    KSTAT_DATA_UINT32);
   2266 
   2267 	/* initialize the static kstats */
   2268 	stats->ckn_chunksize.value.ui32 = cowp->cow_map.cmap_chunksz;
   2269 	stats->ckn_maxsize.value.ui64 = cowp->cow_map.cmap_maxsize;
   2270 	stats->ckn_createtime.value.l = gethrestime_sec();
   2271 
   2272 	kstat_install(num);
   2273 }
   2274 
   2275 /*
   2276  * fssnap_update_kstat_num() - update a numerical snapshot kstat value
   2277  *
   2278  */
   2279 int
   2280 fssnap_update_kstat_num(kstat_t *ksp, int rw)
   2281 {
   2282 	snapshot_id_t *sidp = (snapshot_id_t *)ksp->ks_private;
   2283 	struct cow_info *cowp = sidp->sid_cowinfo;
   2284 	struct cow_kstat_num *stats = ksp->ks_data;
   2285 
   2286 	if (rw == KSTAT_WRITE)
   2287 		return (EACCES);
   2288 
   2289 	/* state */
   2290 	if (sidp->sid_flags & SID_CREATING)
   2291 		stats->ckn_state.value.i32 = COWSTATE_CREATING;
   2292 	else if (SID_INACTIVE(sidp))
   2293 		stats->ckn_state.value.i32 = COWSTATE_DISABLED;
   2294 	else if (SID_BUSY(sidp))
   2295 		stats->ckn_state.value.i32 = COWSTATE_ACTIVE;
   2296 	else
   2297 		stats->ckn_state.value.i32 = COWSTATE_IDLE;
   2298 
   2299 	/* bfsize */
   2300 	stats->ckn_bfsize.value.ui64 = cowp->cow_map.cmap_nchunks *
   2301 	    cowp->cow_map.cmap_chunksz;
   2302 
   2303 	return (0);
   2304 }
   2305 
   2306 /*
   2307  * fssnap_delete_kstats() - deallocate snapshot kstats
   2308  *
   2309  */
   2310 void
   2311 fssnap_delete_kstats(struct cow_info *cowp)
   2312 {
   2313 	if (cowp->cow_kstat_num != NULL) {
   2314 		kstat_delete(cowp->cow_kstat_num);
   2315 		cowp->cow_kstat_num = NULL;
   2316 	}
   2317 	if (cowp->cow_kstat_mntpt != NULL) {
   2318 		kstat_delete(cowp->cow_kstat_mntpt);
   2319 		cowp->cow_kstat_mntpt = NULL;
   2320 	}
   2321 	if (cowp->cow_kstat_bfname != NULL) {
   2322 		kstat_delete(cowp->cow_kstat_bfname);
   2323 		cowp->cow_kstat_bfname = NULL;
   2324 	}
   2325 }
   2326