Home | History | Annotate | Download | only in ufs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #include <sys/systm.h>
     27 #include <sys/types.h>
     28 #include <sys/vnode.h>
     29 #include <sys/buf.h>
     30 #include <sys/errno.h>
     31 #include <sys/fssnap_if.h>
     32 #include <sys/fs/ufs_inode.h>
     33 #include <sys/fs/ufs_filio.h>
     34 #include <sys/sysmacros.h>
     35 #include <sys/modctl.h>
     36 #include <sys/fs/ufs_log.h>
     37 #include <sys/fs/ufs_bio.h>
     38 #include <sys/fs/ufs_fsdir.h>
     39 #include <sys/debug.h>
     40 #include <sys/atomic.h>
     41 #include <sys/kmem.h>
     42 #include <sys/inttypes.h>
     43 #include <sys/vfs.h>
     44 #include <sys/mntent.h>
     45 #include <sys/conf.h>
     46 #include <sys/param.h>
     47 #include <sys/kstat.h>
     48 #include <sys/cmn_err.h>
     49 #include <sys/sdt.h>
     50 
/*
 * Log header identity sequence.
 *
 * LUFS_NEXT_ID(id) multiplies id by LUFS_GENID_BASE, reduces the product
 * modulo LUFS_GENID_PRIME and truncates the result to 32 bits.  Both
 * constants are 64-bit (UINT64_C), so the multiplication is carried out in
 * 64-bit arithmetic before the reduction.
 */
#define	LUFS_GENID_PRIME	UINT64_C(4294967291)
#define	LUFS_GENID_BASE		UINT64_C(311)
#define	LUFS_NEXT_ID(id)	((uint32_t)(((id) * LUFS_GENID_BASE) % \
				    LUFS_GENID_PRIME))
     55 
/*
 * Defined outside this file.  lufs_snarf() and lufs_disable() hold it while
 * linking/de-linking ufsvfsp->vfs_log.
 */
extern	kmutex_t	ufs_scan_lock;

static kmutex_t	log_mutex;	/* general purpose log layer lock */
kmutex_t	ml_scan;	/* Scan thread synchronization */
kcondvar_t	ml_scan_cv;	/* Scan thread synchronization */

/* kmem caches; not created or used in the part of the file shown here */
struct kmem_cache	*lufs_sv;
struct kmem_cache	*lufs_bp;

/* Tunables */
uint_t		ldl_maxlogsize	= LDL_MAXLOGSIZE;
uint_t		ldl_minlogsize	= LDL_MINLOGSIZE;
uint32_t	ldl_divisor	= LDL_DIVISOR;
uint32_t	ldl_mintransfer	= LDL_MINTRANSFER;
uint32_t	ldl_maxtransfer	= LDL_MAXTRANSFER;
uint32_t	ldl_minbufsize	= LDL_MINBUFSIZE;

/*
 * Generation of header ids
 *	last_loghead_ident holds the most recently issued identity; it is
 *	seeded by lufs_genid_init() and advanced by lufs_hd_genid() under
 *	genid_mutex.
 */
static kmutex_t	genid_mutex;
static uint32_t	last_loghead_ident = UINT32_C(0);
     76 
/*
 * Logging delta and roll statistics
 *
 * One named 64-bit kstat per delta type, first for deltas and then for
 * rolls.  The dkstats initializer below is positional, so its entries must
 * stay in the same order as the members of struct delta_kstats.
 */
struct delta_kstats {
	kstat_named_t ds_superblock_deltas;
	kstat_named_t ds_bitmap_deltas;
	kstat_named_t ds_suminfo_deltas;
	kstat_named_t ds_allocblk_deltas;
	kstat_named_t ds_ab0_deltas;
	kstat_named_t ds_dir_deltas;
	kstat_named_t ds_inode_deltas;
	kstat_named_t ds_fbiwrite_deltas;
	kstat_named_t ds_quota_deltas;
	kstat_named_t ds_shadow_deltas;

	kstat_named_t ds_superblock_rolled;
	kstat_named_t ds_bitmap_rolled;
	kstat_named_t ds_suminfo_rolled;
	kstat_named_t ds_allocblk_rolled;
	kstat_named_t ds_ab0_rolled;
	kstat_named_t ds_dir_rolled;
	kstat_named_t ds_inode_rolled;
	kstat_named_t ds_fbiwrite_rolled;
	kstat_named_t ds_quota_rolled;
	kstat_named_t ds_shadow_rolled;
} dkstats = {
	/* *_deltas counters */
	{ "superblock_deltas",	KSTAT_DATA_UINT64 },
	{ "bitmap_deltas",	KSTAT_DATA_UINT64 },
	{ "suminfo_deltas",	KSTAT_DATA_UINT64 },
	{ "allocblk_deltas",	KSTAT_DATA_UINT64 },
	{ "ab0_deltas",		KSTAT_DATA_UINT64 },
	{ "dir_deltas",		KSTAT_DATA_UINT64 },
	{ "inode_deltas",	KSTAT_DATA_UINT64 },
	{ "fbiwrite_deltas",	KSTAT_DATA_UINT64 },
	{ "quota_deltas",	KSTAT_DATA_UINT64 },
	{ "shadow_deltas",	KSTAT_DATA_UINT64 },

	/* *_rolled counters */
	{ "superblock_rolled",	KSTAT_DATA_UINT64 },
	{ "bitmap_rolled",	KSTAT_DATA_UINT64 },
	{ "suminfo_rolled",	KSTAT_DATA_UINT64 },
	{ "allocblk_rolled",	KSTAT_DATA_UINT64 },
	{ "ab0_rolled",		KSTAT_DATA_UINT64 },
	{ "dir_rolled",		KSTAT_DATA_UINT64 },
	{ "inode_rolled",	KSTAT_DATA_UINT64 },
	{ "fbiwrite_rolled",	KSTAT_DATA_UINT64 },
	{ "quota_rolled",	KSTAT_DATA_UINT64 },
	{ "shadow_rolled",	KSTAT_DATA_UINT64 }
};
    125 
/*
 * Raw 64-bit counters, DT_MAX slots each.
 * NOTE(review): presumably indexed by delta type and exported through
 * dkstats -- the code doing so is not in this part of the file; confirm.
 */
uint64_t delta_stats[DT_MAX];
uint64_t roll_stats[DT_MAX];
    128 
/*
 * General logging kstats
 *
 * Positional initializer: the entries must stay in the same order as the
 * members of struct logstats (declared outside this file).
 */
struct logstats logstats = {
	{ "master_reads",		KSTAT_DATA_UINT64 },
	{ "master_writes",		KSTAT_DATA_UINT64 },
	{ "log_reads_inmem",		KSTAT_DATA_UINT64 },
	{ "log_reads",			KSTAT_DATA_UINT64 },
	{ "log_writes",			KSTAT_DATA_UINT64 },
	{ "log_master_reads",		KSTAT_DATA_UINT64 },
	{ "log_roll_reads",		KSTAT_DATA_UINT64 },
	{ "log_roll_writes",		KSTAT_DATA_UINT64 }
};
    142 
/*
 * Post (sema_v) the b_io semaphore of cb.  trans_not_wait() is the matching
 * waiter: it blocks in sema_p() on the same semaphore.
 *
 * Always returns 0.
 */
int
trans_not_done(struct buf *cb)
{
	sema_v(&cb->b_io);
	return (0);
}
    149 
    150 static void
    151 trans_wait_panic(struct buf *cb)
    152 {
    153 	while ((cb->b_flags & B_DONE) == 0)
    154 		drv_usecwait(10);
    155 }
    156 
    157 int
    158 trans_not_wait(struct buf *cb)
    159 {
    160 	/*
    161 	 * In case of panic, busy wait for completion
    162 	 */
    163 	if (panicstr)
    164 		trans_wait_panic(cb);
    165 	else
    166 		sema_p(&cb->b_io);
    167 
    168 	return (geterror(cb));
    169 }
    170 
    171 int
    172 trans_wait(struct buf *cb)
    173 {
    174 	/*
    175 	 * In case of panic, busy wait for completion and run md daemon queues
    176 	 */
    177 	if (panicstr)
    178 		trans_wait_panic(cb);
    179 	return (biowait(cb));
    180 }
    181 
    182 static void
    183 setsum(int32_t *sp, int32_t *lp, int nb)
    184 {
    185 	int32_t csum = 0;
    186 
    187 	*sp = 0;
    188 	nb /= sizeof (int32_t);
    189 	while (nb--)
    190 		csum += *lp++;
    191 	*sp = csum;
    192 }
    193 
    194 static int
    195 checksum(int32_t *sp, int32_t *lp, int nb)
    196 {
    197 	int32_t ssum = *sp;
    198 
    199 	setsum(sp, lp, nb);
    200 	if (ssum != *sp) {
    201 		*sp = ssum;
    202 		return (0);
    203 	}
    204 	return (1);
    205 }
    206 
    207 void
    208 lufs_unsnarf(ufsvfs_t *ufsvfsp)
    209 {
    210 	ml_unit_t *ul;
    211 	mt_map_t *mtm;
    212 
    213 	ul = ufsvfsp->vfs_log;
    214 	if (ul == NULL)
    215 		return;
    216 
    217 	mtm = ul->un_logmap;
    218 
    219 	/*
    220 	 * Wait for a pending top_issue_sync which is
    221 	 * dispatched (via taskq_dispatch()) but hasnt completed yet.
    222 	 */
    223 
    224 	mutex_enter(&mtm->mtm_lock);
    225 
    226 	while (mtm->mtm_taskq_sync_count != 0) {
    227 		cv_wait(&mtm->mtm_cv, &mtm->mtm_lock);
    228 	}
    229 
    230 	mutex_exit(&mtm->mtm_lock);
    231 
    232 	/* Roll committed transactions */
    233 	logmap_roll_dev(ul);
    234 
    235 	/* Kill the roll thread */
    236 	logmap_kill_roll(ul);
    237 
    238 	/* release saved alloction info */
    239 	if (ul->un_ebp)
    240 		kmem_free(ul->un_ebp, ul->un_nbeb);
    241 
    242 	/* release circular bufs */
    243 	free_cirbuf(&ul->un_rdbuf);
    244 	free_cirbuf(&ul->un_wrbuf);
    245 
    246 	/* release maps */
    247 	if (ul->un_logmap)
    248 		ul->un_logmap = map_put(ul->un_logmap);
    249 	if (ul->un_deltamap)
    250 		ul->un_deltamap = map_put(ul->un_deltamap);
    251 	if (ul->un_matamap)
    252 		ul->un_matamap = map_put(ul->un_matamap);
    253 
    254 	mutex_destroy(&ul->un_log_mutex);
    255 	mutex_destroy(&ul->un_state_mutex);
    256 
    257 	/* release state buffer MUST BE LAST!! (contains our ondisk data) */
    258 	if (ul->un_bp)
    259 		brelse(ul->un_bp);
    260 	kmem_free(ul, sizeof (*ul));
    261 
    262 	ufsvfsp->vfs_log = NULL;
    263 }
    264 
    265 int
    266 lufs_snarf(ufsvfs_t *ufsvfsp, struct fs *fs, int ronly)
    267 {
    268 	buf_t		*bp, *tbp;
    269 	ml_unit_t	*ul;
    270 	extent_block_t	*ebp;
    271 	ic_extent_block_t  *nebp;
    272 	size_t		nb;
    273 	daddr_t		bno;	/* in disk blocks */
    274 	int		i;
    275 
    276 	/* LINTED: warning: logical expression always true: op "||" */
    277 	ASSERT(sizeof (ml_odunit_t) < DEV_BSIZE);
    278 
    279 	/*
    280 	 * Get the allocation table
    281 	 *	During a remount the superblock pointed to by the ufsvfsp
    282 	 *	is out of date.  Hence the need for the ``new'' superblock
    283 	 *	pointer, fs, passed in as a parameter.
    284 	 */
    285 	bp = UFS_BREAD(ufsvfsp, ufsvfsp->vfs_dev, logbtodb(fs, fs->fs_logbno),
    286 	    fs->fs_bsize);
    287 	if (bp->b_flags & B_ERROR) {
    288 		brelse(bp);
    289 		return (EIO);
    290 	}
    291 	ebp = (void *)bp->b_un.b_addr;
    292 	if (!checksum(&ebp->chksum, (int32_t *)bp->b_un.b_addr,
    293 	    fs->fs_bsize)) {
    294 		brelse(bp);
    295 		return (ENODEV);
    296 	}
    297 
    298 	/*
    299 	 * It is possible to get log blocks with all zeros.
    300 	 * We should also check for nextents to be zero in such case.
    301 	 */
    302 	if (ebp->type != LUFS_EXTENTS || ebp->nextents == 0) {
    303 		brelse(bp);
    304 		return (EDOM);
    305 	}
    306 	/*
    307 	 * Put allocation into memory.  This requires conversion between
    308 	 * on the ondisk format of the extent (type extent_t) and the
    309 	 * in-core format of the extent (type ic_extent_t).  The
    310 	 * difference is the in-core form of the extent block stores
    311 	 * the physical offset of the extent in disk blocks, which
    312 	 * can require more than a 32-bit field.
    313 	 */
    314 	nb = (size_t)(sizeof (ic_extent_block_t) +
    315 	    ((ebp->nextents - 1) * sizeof (ic_extent_t)));
    316 	nebp = kmem_alloc(nb, KM_SLEEP);
    317 	nebp->ic_nextents = ebp->nextents;
    318 	nebp->ic_nbytes = ebp->nbytes;
    319 	nebp->ic_nextbno = ebp->nextbno;
    320 	for (i = 0; i < ebp->nextents; i++) {
    321 		nebp->ic_extents[i].ic_lbno = ebp->extents[i].lbno;
    322 		nebp->ic_extents[i].ic_nbno = ebp->extents[i].nbno;
    323 		nebp->ic_extents[i].ic_pbno =
    324 		    logbtodb(fs, ebp->extents[i].pbno);
    325 	}
    326 	brelse(bp);
    327 
    328 	/*
    329 	 * Get the log state
    330 	 */
    331 	bno = nebp->ic_extents[0].ic_pbno;
    332 	bp = UFS_BREAD(ufsvfsp, ufsvfsp->vfs_dev, bno, DEV_BSIZE);
    333 	if (bp->b_flags & B_ERROR) {
    334 		brelse(bp);
    335 		bp = UFS_BREAD(ufsvfsp, ufsvfsp->vfs_dev, bno + 1, DEV_BSIZE);
    336 		if (bp->b_flags & B_ERROR) {
    337 			brelse(bp);
    338 			kmem_free(nebp, nb);
    339 			return (EIO);
    340 		}
    341 	}
    342 
    343 	/*
    344 	 * Put ondisk struct into an anonymous buffer
    345 	 *	This buffer will contain the memory for the ml_odunit struct
    346 	 */
    347 	tbp = ngeteblk(dbtob(LS_SECTORS));
    348 	tbp->b_edev = bp->b_edev;
    349 	tbp->b_dev = bp->b_dev;
    350 	tbp->b_blkno = bno;
    351 	bcopy(bp->b_un.b_addr, tbp->b_un.b_addr, DEV_BSIZE);
    352 	bcopy(bp->b_un.b_addr, tbp->b_un.b_addr + DEV_BSIZE, DEV_BSIZE);
    353 	bp->b_flags |= (B_STALE | B_AGE);
    354 	brelse(bp);
    355 	bp = tbp;
    356 
    357 	/*
    358 	 * Verify the log state
    359 	 *
    360 	 * read/only mounts w/bad logs are allowed.  umount will
    361 	 * eventually roll the bad log until the first IO error.
    362 	 * fsck will then repair the file system.
    363 	 *
    364 	 * read/write mounts with bad logs are not allowed.
    365 	 *
    366 	 */
    367 	ul = (ml_unit_t *)kmem_zalloc(sizeof (*ul), KM_SLEEP);
    368 	bcopy(bp->b_un.b_addr, &ul->un_ondisk, sizeof (ml_odunit_t));
    369 	if ((ul->un_chksum != ul->un_head_ident + ul->un_tail_ident) ||
    370 	    (ul->un_version != LUFS_VERSION_LATEST) ||
    371 	    (!ronly && ul->un_badlog)) {
    372 		kmem_free(ul, sizeof (*ul));
    373 		brelse(bp);
    374 		kmem_free(nebp, nb);
    375 		return (EIO);
    376 	}
    377 	/*
    378 	 * Initialize the incore-only fields
    379 	 */
    380 	if (ronly)
    381 		ul->un_flags |= LDL_NOROLL;
    382 	ul->un_bp = bp;
    383 	ul->un_ufsvfs = ufsvfsp;
    384 	ul->un_dev = ufsvfsp->vfs_dev;
    385 	ul->un_ebp = nebp;
    386 	ul->un_nbeb = nb;
    387 	ul->un_maxresv = btodb(ul->un_logsize) * LDL_USABLE_BSIZE;
    388 	ul->un_deltamap = map_get(ul, deltamaptype, DELTAMAP_NHASH);
    389 	ul->un_logmap = map_get(ul, logmaptype, LOGMAP_NHASH);
    390 	if (ul->un_debug & MT_MATAMAP)
    391 		ul->un_matamap = map_get(ul, matamaptype, DELTAMAP_NHASH);
    392 	mutex_init(&ul->un_log_mutex, NULL, MUTEX_DEFAULT, NULL);
    393 	mutex_init(&ul->un_state_mutex, NULL, MUTEX_DEFAULT, NULL);
    394 
    395 	/*
    396 	 * Aquire the ufs_scan_lock before linking the mtm data
    397 	 * structure so that we keep ufs_sync() and ufs_update() away
    398 	 * when they execute the ufs_scan_inodes() run while we're in
    399 	 * progress of enabling/disabling logging.
    400 	 */
    401 	mutex_enter(&ufs_scan_lock);
    402 	ufsvfsp->vfs_log = ul;
    403 
    404 	/* remember the state of the log before the log scan */
    405 	logmap_logscan(ul);
    406 	mutex_exit(&ufs_scan_lock);
    407 
    408 	/*
    409 	 * Error during scan
    410 	 *
    411 	 * If this is a read/only mount; ignore the error.
    412 	 * At a later time umount/fsck will repair the fs.
    413 	 *
    414 	 */
    415 	if (ul->un_flags & LDL_ERROR) {
    416 		if (!ronly) {
    417 			/*
    418 			 * Aquire the ufs_scan_lock before de-linking
    419 			 * the mtm data structure so that we keep ufs_sync()
    420 			 * and ufs_update() away when they execute the
    421 			 * ufs_scan_inodes() run while we're in progress of
    422 			 * enabling/disabling logging.
    423 			 */
    424 			mutex_enter(&ufs_scan_lock);
    425 			lufs_unsnarf(ufsvfsp);
    426 			mutex_exit(&ufs_scan_lock);
    427 			return (EIO);
    428 		}
    429 		ul->un_flags &= ~LDL_ERROR;
    430 	}
    431 	if (!ronly)
    432 		logmap_start_roll(ul);
    433 	return (0);
    434 }
    435 
    436 uint32_t
    437 lufs_hd_genid(const ml_unit_t *up)
    438 {
    439 	uint32_t id;
    440 
    441 	mutex_enter(&genid_mutex);
    442 
    443 	/*
    444 	 * The formula below implements an exponential, modular sequence.
    445 	 *
    446 	 * ID(N) = (SEED * (BASE^N)) % PRIME
    447 	 *
    448 	 * The numbers will be pseudo random.  They depend on SEED, BASE, PRIME,
    449 	 * but will sweep through almost all of the range 1....PRIME-1.
    450 	 * Most  importantly  they  will  not  repeat  for PRIME-2 (4294967289)
    451 	 * repetitions.  If they would repeat that  could possibly cause  hangs,
    452 	 * panics at mount/umount and failed mount operations.
    453 	 */
    454 	id = LUFS_NEXT_ID(last_loghead_ident);
    455 
    456 	/* Checking if new identity used already */
    457 	if (up != NULL && up->un_head_ident == id) {
    458 		DTRACE_PROBE1(head_ident_collision, uint32_t, id);
    459 
    460 		/*
    461 		 * The  following  preserves  the  algorithm  for  the fix  for
    462 		 * "panic: free: freeing free frag, dev:0x2000000018, blk:34605,
    463 		 * cg:26, ino:148071,".
    464 		 * If  the header identities  un_head_ident  are  equal  to the
    465 		 * present element  in the sequence,  the next element  of  the
    466 		 * sequence is returned instead.
    467 		 */
    468 		id = LUFS_NEXT_ID(id);
    469 	}
    470 
    471 	last_loghead_ident = id;
    472 
    473 	mutex_exit(&genid_mutex);
    474 
    475 	return (id);
    476 }
    477 
    478 static void
    479 lufs_genid_init(void)
    480 {
    481 	uint64_t seed;
    482 
    483 	/* Initialization */
    484 	mutex_init(&genid_mutex, NULL, MUTEX_DEFAULT, NULL);
    485 
    486 	/* Seed the algorithm */
    487 	do {
    488 		timestruc_t tv;
    489 
    490 		gethrestime(&tv);
    491 
    492 		seed = (tv.tv_nsec << 3);
    493 		seed ^= tv.tv_sec;
    494 
    495 		last_loghead_ident = (uint32_t)(seed % LUFS_GENID_PRIME);
    496 	} while (last_loghead_ident == UINT32_C(0));
    497 }
    498 
    499 static int
    500 lufs_initialize(
    501 	ufsvfs_t *ufsvfsp,
    502 	daddr_t bno,
    503 	size_t nb,
    504 	struct fiolog *flp)
    505 {
    506 	ml_odunit_t	*ud, *ud2;
    507 	buf_t		*bp;
    508 
    509 	/* LINTED: warning: logical expression always true: op "||" */
    510 	ASSERT(sizeof (ml_odunit_t) < DEV_BSIZE);
    511 	ASSERT(nb >= ldl_minlogsize);
    512 
    513 	bp = UFS_GETBLK(ufsvfsp, ufsvfsp->vfs_dev, bno, dbtob(LS_SECTORS));
    514 	bzero(bp->b_un.b_addr, bp->b_bcount);
    515 
    516 	ud = (void *)bp->b_un.b_addr;
    517 	ud->od_version = LUFS_VERSION_LATEST;
    518 	ud->od_maxtransfer = MIN(ufsvfsp->vfs_iotransz, ldl_maxtransfer);
    519 	if (ud->od_maxtransfer < ldl_mintransfer)
    520 		ud->od_maxtransfer = ldl_mintransfer;
    521 	ud->od_devbsize = DEV_BSIZE;
    522 
    523 	ud->od_requestsize = flp->nbytes_actual;
    524 	ud->od_statesize = dbtob(LS_SECTORS);
    525 	ud->od_logsize = nb - ud->od_statesize;
    526 
    527 	ud->od_statebno = INT32_C(0);
    528 
    529 	ud->od_head_ident = lufs_hd_genid(NULL);
    530 	ud->od_tail_ident = ud->od_head_ident;
    531 	ud->od_chksum = ud->od_head_ident + ud->od_tail_ident;
    532 
    533 	ud->od_bol_lof = dbtob(ud->od_statebno) + ud->od_statesize;
    534 	ud->od_eol_lof = ud->od_bol_lof + ud->od_logsize;
    535 	ud->od_head_lof = ud->od_bol_lof;
    536 	ud->od_tail_lof = ud->od_bol_lof;
    537 
    538 	ASSERT(lufs_initialize_debug(ud));
    539 
    540 	ud2 = (void *)(bp->b_un.b_addr + DEV_BSIZE);
    541 	bcopy(ud, ud2, sizeof (*ud));
    542 
    543 	UFS_BWRITE2(ufsvfsp, bp);
    544 	if (bp->b_flags & B_ERROR) {
    545 		brelse(bp);
    546 		return (EIO);
    547 	}
    548 	brelse(bp);
    549 
    550 	return (0);
    551 }
    552 
    553 /*
    554  * Free log space
    555  *	Assumes the file system is write locked and is not logging
    556  */
    557 static int
    558 lufs_free(struct ufsvfs *ufsvfsp)
    559 {
    560 	int		error = 0, i, j;
    561 	buf_t		*bp = NULL;
    562 	extent_t	*ep;
    563 	extent_block_t	*ebp;
    564 	struct fs	*fs = ufsvfsp->vfs_fs;
    565 	daddr_t		fno;
    566 	int32_t		logbno;
    567 	long		nfno;
    568 	inode_t		*ip = NULL;
    569 	char		clean;
    570 
    571 	/*
    572 	 * Nothing to free
    573 	 */
    574 	if (fs->fs_logbno == 0)
    575 		return (0);
    576 
    577 	/*
    578 	 * Mark the file system as FSACTIVE and no log but honor the
    579 	 * current value of fs_reclaim.  The reclaim thread could have
    580 	 * been active when lufs_disable() was called and if fs_reclaim
    581 	 * is reset to zero here it could lead to lost inodes.
    582 	 */
    583 	ufsvfsp->vfs_ulockfs.ul_sbowner = curthread;
    584 	mutex_enter(&ufsvfsp->vfs_lock);
    585 	clean = fs->fs_clean;
    586 	logbno = fs->fs_logbno;
    587 	fs->fs_clean = FSACTIVE;
    588 	fs->fs_logbno = INT32_C(0);
    589 	ufs_sbwrite(ufsvfsp);
    590 	mutex_exit(&ufsvfsp->vfs_lock);
    591 	ufsvfsp->vfs_ulockfs.ul_sbowner = (kthread_id_t)-1;
    592 	if (ufsvfsp->vfs_bufp->b_flags & B_ERROR) {
    593 		error = EIO;
    594 		fs->fs_clean = clean;
    595 		fs->fs_logbno = logbno;
    596 		goto errout;
    597 	}
    598 
    599 	/*
    600 	 * fetch the allocation block
    601 	 *	superblock -> one block of extents -> log data
    602 	 */
    603 	bp = UFS_BREAD(ufsvfsp, ufsvfsp->vfs_dev, logbtodb(fs, logbno),
    604 	    fs->fs_bsize);
    605 	if (bp->b_flags & B_ERROR) {
    606 		error = EIO;
    607 		goto errout;
    608 	}
    609 
    610 	/*
    611 	 * Free up the allocated space (dummy inode needed for free())
    612 	 */
    613 	ip = ufs_alloc_inode(ufsvfsp, UFSROOTINO);
    614 	ebp = (void *)bp->b_un.b_addr;
    615 	for (i = 0, ep = &ebp->extents[0]; i < ebp->nextents; ++i, ++ep) {
    616 		fno = logbtofrag(fs, ep->pbno);
    617 		nfno = dbtofsb(fs, ep->nbno);
    618 		for (j = 0; j < nfno; j += fs->fs_frag, fno += fs->fs_frag)
    619 			free(ip, fno, fs->fs_bsize, 0);
    620 	}
    621 	free(ip, logbtofrag(fs, logbno), fs->fs_bsize, 0);
    622 	brelse(bp);
    623 	bp = NULL;
    624 
    625 	/*
    626 	 * Push the metadata dirtied during the allocations
    627 	 */
    628 	ufsvfsp->vfs_ulockfs.ul_sbowner = curthread;
    629 	sbupdate(ufsvfsp->vfs_vfs);
    630 	ufsvfsp->vfs_ulockfs.ul_sbowner = (kthread_id_t)-1;
    631 	bflush(ufsvfsp->vfs_dev);
    632 	error = bfinval(ufsvfsp->vfs_dev, 0);
    633 	if (error)
    634 		goto errout;
    635 
    636 	/*
    637 	 * Free the dummy inode
    638 	 */
    639 	ufs_free_inode(ip);
    640 
    641 	return (0);
    642 
    643 errout:
    644 	/*
    645 	 * Free up all resources
    646 	 */
    647 	if (bp)
    648 		brelse(bp);
    649 	if (ip)
    650 		ufs_free_inode(ip);
    651 	return (error);
    652 }
    653 
    654 /*
    655  * Allocate log space
    656  *	Assumes the file system is write locked and is not logging
    657  */
    658 static int
    659 lufs_alloc(struct ufsvfs *ufsvfsp, struct fiolog *flp, cred_t *cr)
    660 {
    661 	int		error = 0;
    662 	buf_t		*bp = NULL;
    663 	extent_t	*ep, *nep;
    664 	extent_block_t	*ebp;
    665 	struct fs	*fs = ufsvfsp->vfs_fs;
    666 	daddr_t		fno;	/* in frags */
    667 	daddr_t		bno;	/* in disk blocks */
    668 	int32_t		logbno = INT32_C(0);	/* will be fs_logbno */
    669 	struct inode	*ip = NULL;
    670 	size_t		nb = flp->nbytes_actual;
    671 	size_t		tb = 0;
    672 
    673 	/*
    674 	 * Mark the file system as FSACTIVE
    675 	 */
    676 	ufsvfsp->vfs_ulockfs.ul_sbowner = curthread;
    677 	mutex_enter(&ufsvfsp->vfs_lock);
    678 	fs->fs_clean = FSACTIVE;
    679 	ufs_sbwrite(ufsvfsp);
    680 	mutex_exit(&ufsvfsp->vfs_lock);
    681 	ufsvfsp->vfs_ulockfs.ul_sbowner = (kthread_id_t)-1;
    682 
    683 	/*
    684 	 * Allocate the allocation block (need dummy shadow inode;
    685 	 * we use a shadow inode so the quota sub-system ignores
    686 	 * the block allocations.)
    687 	 *	superblock -> one block of extents -> log data
    688 	 */
    689 	ip = ufs_alloc_inode(ufsvfsp, UFSROOTINO);
    690 	ip->i_mode = IFSHAD;		/* make the dummy a shadow inode */
    691 	rw_enter(&ip->i_contents, RW_WRITER);
    692 	fno = contigpref(ufsvfsp, nb + fs->fs_bsize);
    693 	error = alloc(ip, fno, fs->fs_bsize, &fno, cr);
    694 	if (error)
    695 		goto errout;
    696 	bno = fsbtodb(fs, fno);
    697 
    698 	bp = UFS_BREAD(ufsvfsp, ufsvfsp->vfs_dev, bno, fs->fs_bsize);
    699 	if (bp->b_flags & B_ERROR) {
    700 		error = EIO;
    701 		goto errout;
    702 	}
    703 
    704 	ebp = (void *)bp->b_un.b_addr;
    705 	ebp->type = LUFS_EXTENTS;
    706 	ebp->nextbno = UINT32_C(0);
    707 	ebp->nextents = UINT32_C(0);
    708 	ebp->chksum = INT32_C(0);
    709 	if (fs->fs_magic == FS_MAGIC)
    710 		logbno = bno;
    711 	else
    712 		logbno = dbtofsb(fs, bno);
    713 
    714 	/*
    715 	 * Initialize the first extent
    716 	 */
    717 	ep = &ebp->extents[0];
    718 	error = alloc(ip, fno + fs->fs_frag, fs->fs_bsize, &fno, cr);
    719 	if (error)
    720 		goto errout;
    721 	bno = fsbtodb(fs, fno);
    722 
    723 	ep->lbno = UINT32_C(0);
    724 	if (fs->fs_magic == FS_MAGIC)
    725 		ep->pbno = (uint32_t)bno;
    726 	else
    727 		ep->pbno = (uint32_t)fno;
    728 	ep->nbno = (uint32_t)fsbtodb(fs, fs->fs_frag);
    729 	ebp->nextents = UINT32_C(1);
    730 	tb = fs->fs_bsize;
    731 	nb -= fs->fs_bsize;
    732 
    733 	while (nb) {
    734 		error = alloc(ip, fno + fs->fs_frag, fs->fs_bsize, &fno, cr);
    735 		if (error) {
    736 			if (tb < ldl_minlogsize)
    737 				goto errout;
    738 			error = 0;
    739 			break;
    740 		}
    741 		bno = fsbtodb(fs, fno);
    742 		if ((daddr_t)((logbtodb(fs, ep->pbno) + ep->nbno) == bno))
    743 			ep->nbno += (uint32_t)(fsbtodb(fs, fs->fs_frag));
    744 		else {
    745 			nep = ep + 1;
    746 			if ((caddr_t)(nep + 1) >
    747 			    (bp->b_un.b_addr + fs->fs_bsize)) {
    748 				free(ip, fno, fs->fs_bsize, 0);
    749 				break;
    750 			}
    751 			nep->lbno = ep->lbno + ep->nbno;
    752 			if (fs->fs_magic == FS_MAGIC)
    753 				nep->pbno = (uint32_t)bno;
    754 			else
    755 				nep->pbno = (uint32_t)fno;
    756 			nep->nbno = (uint32_t)(fsbtodb(fs, fs->fs_frag));
    757 			ebp->nextents++;
    758 			ep = nep;
    759 		}
    760 		tb += fs->fs_bsize;
    761 		nb -= fs->fs_bsize;
    762 	}
    763 	ebp->nbytes = (uint32_t)tb;
    764 	setsum(&ebp->chksum, (int32_t *)bp->b_un.b_addr, fs->fs_bsize);
    765 	UFS_BWRITE2(ufsvfsp, bp);
    766 	if (bp->b_flags & B_ERROR) {
    767 		error = EIO;
    768 		goto errout;
    769 	}
    770 	/*
    771 	 * Initialize the first two sectors of the log
    772 	 */
    773 	error = lufs_initialize(ufsvfsp, logbtodb(fs, ebp->extents[0].pbno),
    774 	    tb, flp);
    775 	if (error)
    776 		goto errout;
    777 
    778 	/*
    779 	 * We are done initializing the allocation block and the log
    780 	 */
    781 	brelse(bp);
    782 	bp = NULL;
    783 
    784 	/*
    785 	 * Update the superblock and push the dirty metadata
    786 	 */
    787 	ufsvfsp->vfs_ulockfs.ul_sbowner = curthread;
    788 	sbupdate(ufsvfsp->vfs_vfs);
    789 	ufsvfsp->vfs_ulockfs.ul_sbowner = (kthread_id_t)-1;
    790 	bflush(ufsvfsp->vfs_dev);
    791 	error = bfinval(ufsvfsp->vfs_dev, 1);
    792 	if (error)
    793 		goto errout;
    794 	if (ufsvfsp->vfs_bufp->b_flags & B_ERROR) {
    795 		error = EIO;
    796 		goto errout;
    797 	}
    798 
    799 	/*
    800 	 * Everything is safely on disk; update log space pointer in sb
    801 	 */
    802 	ufsvfsp->vfs_ulockfs.ul_sbowner = curthread;
    803 	mutex_enter(&ufsvfsp->vfs_lock);
    804 	fs->fs_logbno = (uint32_t)logbno;
    805 	ufs_sbwrite(ufsvfsp);
    806 	mutex_exit(&ufsvfsp->vfs_lock);
    807 	ufsvfsp->vfs_ulockfs.ul_sbowner = (kthread_id_t)-1;
    808 
    809 	/*
    810 	 * Free the dummy inode
    811 	 */
    812 	rw_exit(&ip->i_contents);
    813 	ufs_free_inode(ip);
    814 
    815 	/* inform user of real log size */
    816 	flp->nbytes_actual = tb;
    817 	return (0);
    818 
    819 errout:
    820 	/*
    821 	 * Free all resources
    822 	 */
    823 	if (bp)
    824 		brelse(bp);
    825 	if (logbno) {
    826 		fs->fs_logbno = logbno;
    827 		(void) lufs_free(ufsvfsp);
    828 	}
    829 	if (ip) {
    830 		rw_exit(&ip->i_contents);
    831 		ufs_free_inode(ip);
    832 	}
    833 	return (error);
    834 }
    835 
/*
 * Disable logging
 *
 *	vp	vnode on the file system whose logging is to be disabled
 *	flp	request/result structure; flp->error reports the outcome
 *
 * Most failures are reported through flp->error (FIOLOG_EROFS,
 * FIOLOG_EULOCK, FIOLOG_EWLOCK, FIOLOG_ENOULOCK) with a return value of 0.
 * The only non-zero return is an error from ufs_fiolfss().  If logging is
 * already disabled, nothing is done and flp->error is FIOLOG_ENONE.
 *
 * The file system must be unlocked on entry; it is write locked for the
 * duration of the operation and unlocked again before returning.
 */
int
lufs_disable(vnode_t *vp, struct fiolog *flp)
{
	int		error = 0;
	inode_t		*ip = VTOI(vp);
	ufsvfs_t	*ufsvfsp = ip->i_ufsvfs;
	struct fs	*fs = ufsvfsp->vfs_fs;
	struct lockfs	lf;
	struct ulockfs	*ulp;

	flp->error = FIOLOG_ENONE;

	/*
	 * Logging is already disabled; done
	 */
	if (fs->fs_logbno == 0 || ufsvfsp->vfs_log == NULL)
		return (0);

	/*
	 * Readonly file system
	 */
	if (fs->fs_ronly) {
		flp->error = FIOLOG_EROFS;
		return (0);
	}

	/*
	 * File system must be write locked to disable logging
	 */
	error = ufs_fiolfss(vp, &lf);
	if (error) {
		return (error);
	}
	if (!LOCKFS_IS_ULOCK(&lf)) {
		flp->error = FIOLOG_EULOCK;
		return (0);
	}
	lf.lf_lock = LOCKFS_WLOCK;
	lf.lf_flags = 0;
	lf.lf_comment = NULL;
	error = ufs_fiolfs(vp, &lf, 1);
	if (error) {
		flp->error = FIOLOG_EWLOCK;
		return (0);
	}

	/*
	 * Recheck now that the write lock is held; the state tested at the
	 * top was read without it.  error is 0 here, so the errout path
	 * unlocks and returns 0.
	 */
	if (ufsvfsp->vfs_log == NULL || fs->fs_logbno == 0)
		goto errout;

	/*
	 * WE ARE COMMITTED TO DISABLING LOGGING PAST THIS POINT
	 */

	/*
	 * Disable logging:
	 * Suspend the reclaim thread and force the delete thread to exit.
	 *	When a nologging mount has completed there may still be
	 *	work for reclaim to do so just suspend this thread until
	 *	it's [deadlock-] safe for it to continue.  The delete
	 *	thread won't be needed as ufs_iinactive() calls
	 *	ufs_delete() when logging is disabled.
	 * Freeze and drain reader ops.
	 *	Commit any outstanding reader transactions (ufs_flush).
	 *	Set the ``unmounted'' bit in the ufstrans struct.
	 *	If debug, remove metadata from matamap.
	 *	Disable matamap processing.
	 *	NULL the trans ops table.
	 *	Free all of the incore structs related to logging.
	 * Allow reader ops.
	 */
	ufs_thread_suspend(&ufsvfsp->vfs_reclaim);
	ufs_thread_exit(&ufsvfsp->vfs_delete);

	vfs_lock_wait(ufsvfsp->vfs_vfs);
	ulp = &ufsvfsp->vfs_ulockfs;
	mutex_enter(&ulp->ul_lock);
	atomic_add_long(&ufs_quiesce_pend, 1);
	(void) ufs_quiesce(ulp);

	(void) ufs_flush(ufsvfsp->vfs_vfs);

	TRANS_MATA_UMOUNT(ufsvfsp);
	ufsvfsp->vfs_domatamap = 0;

	/*
	 * Free all of the incore structs
	 * Acquire the ufs_scan_lock before de-linking the mtm data
	 * structure so that we keep ufs_sync() and ufs_update() away
	 * when they execute the ufs_scan_inodes() run while we're in
	 * progress of enabling/disabling logging.
	 */
	mutex_enter(&ufs_scan_lock);
	(void) lufs_unsnarf(ufsvfsp);
	mutex_exit(&ufs_scan_lock);

	atomic_add_long(&ufs_quiesce_pend, -1);
	mutex_exit(&ulp->ul_lock);
	vfs_setmntopt(ufsvfsp->vfs_vfs, MNTOPT_NOLOGGING, NULL, 0);
	vfs_unlock(ufsvfsp->vfs_vfs);

	fs->fs_rolled = FS_ALL_ROLLED;
	ufsvfsp->vfs_nolog_si = 0;

	/*
	 * Free the log space and mark the superblock as FSACTIVE
	 *	The result is deliberately ignored; we are already committed.
	 */
	(void) lufs_free(ufsvfsp);

	/*
	 * Allow the reclaim thread to continue.
	 */
	ufs_thread_continue(&ufsvfsp->vfs_reclaim);

	/*
	 * Unlock the file system
	 *	A failure to unlock is reported through flp->error only.
	 */
	lf.lf_lock = LOCKFS_ULOCK;
	lf.lf_flags = 0;
	error = ufs_fiolfs(vp, &lf, 1);
	if (error)
		flp->error = FIOLOG_ENOULOCK;

	return (0);

errout:
	/* Drop the write lock taken above and report the saved status */
	lf.lf_lock = LOCKFS_ULOCK;
	lf.lf_flags = 0;
	(void) ufs_fiolfs(vp, &lf, 1);
	return (error);
}
    969 
    970 /*
    971  * Enable logging
    972  */
    973 int
    974 lufs_enable(struct vnode *vp, struct fiolog *flp, cred_t *cr)
    975 {
    976 	int		error;
    977 	int		reclaim;
    978 	inode_t		*ip = VTOI(vp);
    979 	ufsvfs_t	*ufsvfsp = ip->i_ufsvfs;
    980 	struct fs	*fs;
    981 	ml_unit_t	*ul;
    982 	struct lockfs	lf;
    983 	struct ulockfs	*ulp;
    984 	vfs_t		*vfsp = ufsvfsp->vfs_vfs;
    985 	uint64_t	tmp_nbytes_actual;
    986 
    987 	/*
    988 	 * Check if logging is already enabled
    989 	 */
    990 	if (ufsvfsp->vfs_log) {
    991 		flp->error = FIOLOG_ETRANS;
    992 		/* for root ensure logging option is set */
    993 		vfs_setmntopt(vfsp, MNTOPT_LOGGING, NULL, 0);
    994 		return (0);
    995 	}
    996 	fs = ufsvfsp->vfs_fs;
    997 
    998 	/*
    999 	 * Come back here to recheck if we had to disable the log.
   1000 	 */
   1001 recheck:
   1002 	error = 0;
   1003 	reclaim = 0;
   1004 	flp->error = FIOLOG_ENONE;
   1005 
   1006 	/*
   1007 	 * Adjust requested log size
   1008 	 */
   1009 	flp->nbytes_actual = flp->nbytes_requested;
   1010 	if (flp->nbytes_actual == 0) {
   1011 		tmp_nbytes_actual =
   1012 		    (((uint64_t)fs->fs_size) / ldl_divisor) << fs->fs_fshift;
   1013 		flp->nbytes_actual = (uint_t)MIN(tmp_nbytes_actual, INT_MAX);
   1014 	}
   1015 	flp->nbytes_actual = MAX(flp->nbytes_actual, ldl_minlogsize);
   1016 	flp->nbytes_actual = MIN(flp->nbytes_actual, ldl_maxlogsize);
   1017 	flp->nbytes_actual = blkroundup(fs, flp->nbytes_actual);
   1018 
   1019 	/*
   1020 	 * logging is enabled and the log is the right size; done
   1021 	 */
   1022 	ul = ufsvfsp->vfs_log;
   1023 	if (ul && fs->fs_logbno && (flp->nbytes_actual == ul->un_requestsize))
   1024 			return (0);
   1025 
   1026 	/*
   1027 	 * Readonly file system
   1028 	 */
   1029 	if (fs->fs_ronly) {
   1030