Home | History | Annotate | Download | only in zfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /* Portions Copyright 2007 Jeremy Teo */
     27 
     28 #ifdef _KERNEL
     29 #include <sys/types.h>
     30 #include <sys/param.h>
     31 #include <sys/time.h>
     32 #include <sys/systm.h>
     33 #include <sys/sysmacros.h>
     34 #include <sys/resource.h>
     35 #include <sys/mntent.h>
     36 #include <sys/mkdev.h>
     37 #include <sys/u8_textprep.h>
     38 #include <sys/dsl_dataset.h>
     39 #include <sys/vfs.h>
     40 #include <sys/vfs_opreg.h>
     41 #include <sys/vnode.h>
     42 #include <sys/file.h>
     43 #include <sys/kmem.h>
     44 #include <sys/errno.h>
     45 #include <sys/unistd.h>
     46 #include <sys/mode.h>
     47 #include <sys/atomic.h>
     48 #include <vm/pvn.h>
     49 #include "fs/fs_subr.h"
     50 #include <sys/zfs_dir.h>
     51 #include <sys/zfs_acl.h>
     52 #include <sys/zfs_ioctl.h>
     53 #include <sys/zfs_rlock.h>
     54 #include <sys/zfs_fuid.h>
     55 #include <sys/fs/zfs.h>
     56 #include <sys/kidmap.h>
     57 #endif /* _KERNEL */
     58 
     59 #include <sys/dmu.h>
     60 #include <sys/refcount.h>
     61 #include <sys/stat.h>
     62 #include <sys/zap.h>
     63 #include <sys/zfs_znode.h>
     64 
     65 #include "zfs_prop.h"
     66 
/*
 * Define ZNODE_STATS to turn on statistic gathering. By default, it is only
 * turned on when DEBUG is also defined.
 */
#ifdef	DEBUG
#define	ZNODE_STATS
#endif	/* DEBUG */

/*
 * ZNODE_STAT_ADD(stat) bumps the given counter by one when ZNODE_STATS is
 * defined and expands to nothing otherwise.  The increment is a plain
 * post-increment; the macro itself performs no locking.
 */
#ifdef	ZNODE_STATS
#define	ZNODE_STAT_ADD(stat)			((stat)++)
#else
#define	ZNODE_STAT_ADD(stat)			/* nothing */
#endif	/* ZNODE_STATS */
     80 
/*
 * A pointer counts as valid only when both of its two low bits are clear.
 * POINTER_INVALIDATE() sets the low bit of the pointer stored at *pp, so a
 * later POINTER_IS_VALID() test on that pointer fails.
 */
#define	POINTER_IS_VALID(p)	(!((uintptr_t)(p) & 0x3))
#define	POINTER_INVALIDATE(pp)	(*(pp) = (void *)((uintptr_t)(*(pp)) | 0x1))
     83 
/*
 * Functions needed for userland (i.e., libzpool) are not put under
 * #ifdef _KERNEL; the rest of the functions have dependencies
 * (such as VFS logic) that will not compile easily in userland.
 */
     89 #ifdef _KERNEL
/*
 * kmem cache that znodes are allocated from.  Created by zfs_znode_init()
 * and destroyed (and reset to NULL) by zfs_znode_fini().
 */
static kmem_cache_t *znode_cache = NULL;
     91 
/*
 * Eviction callback handed to the DMU for a znode's dbuf (see the
 * dmu_buf_set_user_ie() and dmu_buf_update_user() calls in this file).
 * It is not expected to run; if it does, it panics and reports the user
 * pointer it was given.
 */
/*ARGSUSED*/
static void
znode_evict_error(dmu_buf_t *dbuf, void *user_ptr)
{
	/*
	 * We should never drop all dbuf refs without first clearing
	 * the eviction callback.
	 */
	panic("evicting znode %p\n", user_ptr);
}
    102 
    103 /*ARGSUSED*/
    104 static int
    105 zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
    106 {
    107 	znode_t *zp = buf;
    108 
    109 	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
    110 
    111 	zp->z_vnode = vn_alloc(kmflags);
    112 	if (zp->z_vnode == NULL) {
    113 		return (-1);
    114 	}
    115 	ZTOV(zp)->v_data = zp;
    116 
    117 	list_link_init(&zp->z_link_node);
    118 
    119 	mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
    120 	rw_init(&zp->z_map_lock, NULL, RW_DEFAULT, NULL);
    121 	rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
    122 	rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL);
    123 	mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
    124 
    125 	mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL);
    126 	avl_create(&zp->z_range_avl, zfs_range_compare,
    127 	    sizeof (rl_t), offsetof(rl_t, r_node));
    128 
    129 	zp->z_dbuf = NULL;
    130 	zp->z_dirlocks = NULL;
    131 	return (0);
    132 }
    133 
    134 /*ARGSUSED*/
    135 static void
    136 zfs_znode_cache_destructor(void *buf, void *arg)
    137 {
    138 	znode_t *zp = buf;
    139 
    140 	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
    141 	ASSERT(ZTOV(zp)->v_data == zp);
    142 	vn_free(ZTOV(zp));
    143 	ASSERT(!list_link_active(&zp->z_link_node));
    144 	mutex_destroy(&zp->z_lock);
    145 	rw_destroy(&zp->z_map_lock);
    146 	rw_destroy(&zp->z_parent_lock);
    147 	rw_destroy(&zp->z_name_lock);
    148 	mutex_destroy(&zp->z_acl_lock);
    149 	avl_destroy(&zp->z_range_avl);
    150 	mutex_destroy(&zp->z_range_lock);
    151 
    152 	ASSERT(zp->z_dbuf == NULL);
    153 	ASSERT(zp->z_dirlocks == NULL);
    154 }
    155 
/*
 * Counters recording why zfs_znode_move() declined to move a znode.  Each
 * one is bumped via ZNODE_STAT_ADD() on the corresponding early-return path.
 */
#ifdef	ZNODE_STATS
static struct {
	/* vfs pointer failed POINTER_IS_VALID() on entry */
	uint64_t zms_zfsvfs_invalid;
	/* zfs_enter() returned non-zero */
	uint64_t zms_zfsvfs_unmounted;
	/* vfs pointer changed before z_znodes_lock was acquired */
	uint64_t zms_zfsvfs_recheck_invalid;
	/* ZFS_OBJ_HOLD_TRYENTER() failed for the znode's object */
	uint64_t zms_obj_held;
	/* mutex_tryenter() on the vnode's v_lock failed */
	uint64_t zms_vnode_locked;
	/* v_count != 1 or the vnode is not in the DNLC */
	uint64_t zms_not_only_dnlc;
} znode_move_stats;
#endif	/* ZNODE_STATS */
    166 
    167 static void
    168 zfs_znode_move_impl(znode_t *ozp, znode_t *nzp)
    169 {
    170 	vnode_t *vp;
    171 
    172 	/* Copy fields. */
    173 	nzp->z_zfsvfs = ozp->z_zfsvfs;
    174 
    175 	/* Swap vnodes. */
    176 	vp = nzp->z_vnode;
    177 	nzp->z_vnode = ozp->z_vnode;
    178 	ozp->z_vnode = vp; /* let destructor free the overwritten vnode */
    179 	ZTOV(ozp)->v_data = ozp;
    180 	ZTOV(nzp)->v_data = nzp;
    181 
    182 	nzp->z_id = ozp->z_id;
    183 	ASSERT(ozp->z_dirlocks == NULL); /* znode not in use */
    184 	ASSERT(avl_numnodes(&ozp->z_range_avl) == 0);
    185 	nzp->z_unlinked = ozp->z_unlinked;
    186 	nzp->z_atime_dirty = ozp->z_atime_dirty;
    187 	nzp->z_zn_prefetch = ozp->z_zn_prefetch;
    188 	nzp->z_blksz = ozp->z_blksz;
    189 	nzp->z_seq = ozp->z_seq;
    190 	nzp->z_mapcnt = ozp->z_mapcnt;
    191 	nzp->z_last_itx = ozp->z_last_itx;
    192 	nzp->z_gen = ozp->z_gen;
    193 	nzp->z_sync_cnt = ozp->z_sync_cnt;
    194 	nzp->z_phys = ozp->z_phys;
    195 	nzp->z_dbuf = ozp->z_dbuf;
    196 
    197 	/* Update back pointers. */
    198 	(void) dmu_buf_update_user(nzp->z_dbuf, ozp, nzp, &nzp->z_phys,
    199 	    znode_evict_error);
    200 
    201 	/*
    202 	 * Invalidate the original znode by clearing fields that provide a
    203 	 * pointer back to the znode. Set the low bit of the vfs pointer to
    204 	 * ensure that zfs_znode_move() recognizes the znode as invalid in any
    205 	 * subsequent callback.
    206 	 */
    207 	ozp->z_dbuf = NULL;
    208 	POINTER_INVALIDATE(&ozp->z_zfsvfs);
    209 }
    210 
/*
 * Wrapper function for ZFS_ENTER that returns 0 if successful and otherwise
 * returns a non-zero error code.
 *
 * NOTE(review): the non-zero return presumably comes from a return
 * statement inside the ZFS_ENTER() macro itself, which is why wrapping it
 * in a function turns it into a testable result - confirm against the
 * macro definition.
 */
static int
zfs_enter(zfsvfs_t *zfsvfs)
{
	ZFS_ENTER(zfsvfs);
	return (0);
}
    221 
/*
 * Move callback for the znode cache (registered in zfs_znode_init()).
 * Tries to relocate the znode at 'buf' into the znode at 'newbuf'.
 *
 * Returns:
 *	KMEM_CBRC_DONT_KNOW	- the znode's vfs pointer is not valid,
 *				  zfs_enter() failed, or the vfs pointer
 *				  changed before z_znodes_lock was taken
 *	KMEM_CBRC_LATER		- the object hold mutex or the vnode lock
 *				  could not be taken without blocking, or
 *				  the vnode is referenced by something
 *				  other than only the DNLC
 *	KMEM_CBRC_YES		- the znode was moved
 *
 * Locks are taken in the order: ZFS_ENTER, z_znodes_lock, object hold
 * mutex (try), vnode v_lock (try); every exit path releases what it took.
 */
/*ARGSUSED*/
static kmem_cbrc_t
zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg)
{
	znode_t *ozp = buf, *nzp = newbuf;
	zfsvfs_t *zfsvfs;
	vnode_t *vp;

	/*
	 * The znode is on the file system's list of known znodes if the vfs
	 * pointer is valid. We set the low bit of the vfs pointer when freeing
	 * the znode to invalidate it, and the memory patterns written by kmem
	 * (baddcafe and deadbeef) set at least one of the two low bits. A newly
	 * created znode sets the vfs pointer last of all to indicate that the
	 * znode is known and in a valid state to be moved by this function.
	 */
	zfsvfs = ozp->z_zfsvfs;
	if (!POINTER_IS_VALID(zfsvfs)) {
		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_invalid);
		return (KMEM_CBRC_DONT_KNOW);
	}

	/*
	 * Ensure that the filesystem is not unmounted during the move.
	 */
	if (zfs_enter(zfsvfs) != 0) {		/* ZFS_ENTER */
		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_unmounted);
		return (KMEM_CBRC_DONT_KNOW);
	}

	mutex_enter(&zfsvfs->z_znodes_lock);
	/*
	 * Recheck the vfs pointer in case the znode was removed just before
	 * acquiring the lock.
	 */
	if (zfsvfs != ozp->z_zfsvfs) {
		mutex_exit(&zfsvfs->z_znodes_lock);
		ZFS_EXIT(zfsvfs);
		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck_invalid);
		return (KMEM_CBRC_DONT_KNOW);
	}

	/*
	 * At this point we know that as long as we hold z_znodes_lock, the
	 * znode cannot be freed and fields within the znode can be safely
	 * accessed. Now, prevent a race with zfs_zget().
	 */
	if (ZFS_OBJ_HOLD_TRYENTER(zfsvfs, ozp->z_id) == 0) {
		mutex_exit(&zfsvfs->z_znodes_lock);
		ZFS_EXIT(zfsvfs);
		ZNODE_STAT_ADD(znode_move_stats.zms_obj_held);
		return (KMEM_CBRC_LATER);
	}

	/*
	 * Remember the vnode now: zfs_znode_move_impl() swaps the vnodes of
	 * the two znodes, and the lock taken here must be the one released.
	 */
	vp = ZTOV(ozp);
	if (mutex_tryenter(&vp->v_lock) == 0) {
		ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
		mutex_exit(&zfsvfs->z_znodes_lock);
		ZFS_EXIT(zfsvfs);
		ZNODE_STAT_ADD(znode_move_stats.zms_vnode_locked);
		return (KMEM_CBRC_LATER);
	}

	/* Only move znodes that are referenced _only_ by the DNLC. */
	if (vp->v_count != 1 || !vn_in_dnlc(vp)) {
		mutex_exit(&vp->v_lock);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
		mutex_exit(&zfsvfs->z_znodes_lock);
		ZFS_EXIT(zfsvfs);
		ZNODE_STAT_ADD(znode_move_stats.zms_not_only_dnlc);
		return (KMEM_CBRC_LATER);
	}

	/*
	 * The znode is known and in a valid state to move. We're holding the
	 * locks needed to execute the critical section.
	 */
	zfs_znode_move_impl(ozp, nzp);
	mutex_exit(&vp->v_lock);
	ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);

	/* Put the new znode in the old one's place on the znode list. */
	list_link_replace(&ozp->z_link_node, &nzp->z_link_node);
	mutex_exit(&zfsvfs->z_znodes_lock);
	ZFS_EXIT(zfsvfs);

	return (KMEM_CBRC_YES);
}
    309 
/*
 * Create the znode kmem cache, wiring in the constructor and destructor
 * defined above, and register zfs_znode_move() as its move callback.
 * The cache must not already exist.
 */
void
zfs_znode_init(void)
{
	/*
	 * Initialize zcache
	 */
	ASSERT(znode_cache == NULL);
	znode_cache = kmem_cache_create("zfs_znode_cache",
	    sizeof (znode_t), 0, zfs_znode_cache_constructor,
	    zfs_znode_cache_destructor, NULL, NULL, NULL, 0);
	kmem_cache_set_move(znode_cache, zfs_znode_move);
}
    322 
    323 void
    324 zfs_znode_fini(void)
    325 {
    326 	/*
    327 	 * Cleanup vfs & vnode ops
    328 	 */
    329 	zfs_remove_op_tables();
    330 
    331 	/*
    332 	 * Cleanup zcache
    333 	 */
    334 	if (znode_cache)
    335 		kmem_cache_destroy(znode_cache);
    336 	znode_cache = NULL;
    337 }
    338 
/*
 * Vnode operation vectors, built by zfs_create_op_tables() and released by
 * zfs_remove_op_tables().  zfs_znode_alloc() picks one per vnode type.
 */
struct vnodeops *zfs_dvnodeops;		/* directories */
struct vnodeops *zfs_fvnodeops;		/* regular files, devices, etc. */
struct vnodeops *zfs_symvnodeops;	/* symbolic links */
struct vnodeops *zfs_xdvnodeops;	/* directories flagged ZFS_XATTR */
struct vnodeops *zfs_evnodeops;		/* any other vnode type */
    344 
    345 void
    346 zfs_remove_op_tables()
    347 {
    348 	/*
    349 	 * Remove vfs ops
    350 	 */
    351 	ASSERT(zfsfstype);
    352 	(void) vfs_freevfsops_by_type(zfsfstype);
    353 	zfsfstype = 0;
    354 
    355 	/*
    356 	 * Remove vnode ops
    357 	 */
    358 	if (zfs_dvnodeops)
    359 		vn_freevnodeops(zfs_dvnodeops);
    360 	if (zfs_fvnodeops)
    361 		vn_freevnodeops(zfs_fvnodeops);
    362 	if (zfs_symvnodeops)
    363 		vn_freevnodeops(zfs_symvnodeops);
    364 	if (zfs_xdvnodeops)
    365 		vn_freevnodeops(zfs_xdvnodeops);
    366 	if (zfs_evnodeops)
    367 		vn_freevnodeops(zfs_evnodeops);
    368 
    369 	zfs_dvnodeops = NULL;
    370 	zfs_fvnodeops = NULL;
    371 	zfs_symvnodeops = NULL;
    372 	zfs_xdvnodeops = NULL;
    373 	zfs_evnodeops = NULL;
    374 }
    375 
    376 extern const fs_operation_def_t zfs_dvnodeops_template[];
    377 extern const fs_operation_def_t zfs_fvnodeops_template[];
    378 extern const fs_operation_def_t zfs_xdvnodeops_template[];
    379 extern const fs_operation_def_t zfs_symvnodeops_template[];
    380 extern const fs_operation_def_t zfs_evnodeops_template[];
    381 
    382 int
    383 zfs_create_op_tables()
    384 {
    385 	int error;
    386 
    387 	/*
    388 	 * zfs_dvnodeops can be set if mod_remove() calls mod_installfs()
    389 	 * due to a failure to remove the the 2nd modlinkage (zfs_modldrv).
    390 	 * In this case we just return as the ops vectors are already set up.
    391 	 */
    392 	if (zfs_dvnodeops)
    393 		return (0);
    394 
    395 	error = vn_make_ops(MNTTYPE_ZFS, zfs_dvnodeops_template,
    396 	    &zfs_dvnodeops);
    397 	if (error)
    398 		return (error);
    399 
    400 	error = vn_make_ops(MNTTYPE_ZFS, zfs_fvnodeops_template,
    401 	    &zfs_fvnodeops);
    402 	if (error)
    403 		return (error);
    404 
    405 	error = vn_make_ops(MNTTYPE_ZFS, zfs_symvnodeops_template,
    406 	    &zfs_symvnodeops);
    407 	if (error)
    408 		return (error);
    409 
    410 	error = vn_make_ops(MNTTYPE_ZFS, zfs_xdvnodeops_template,
    411 	    &zfs_xdvnodeops);
    412 	if (error)
    413 		return (error);
    414 
    415 	error = vn_make_ops(MNTTYPE_ZFS, zfs_evnodeops_template,
    416 	    &zfs_evnodeops);
    417 
    418 	return (error);
    419 }
    420 
    421 /*
    422  * zfs_init_fs - Initialize the zfsvfs struct and the file system
    423  *	incore "master" object.  Verify version compatibility.
    424  */
    425 int
    426 zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp)
    427 {
    428 	extern int zfsfstype;
    429 
    430 	objset_t	*os = zfsvfs->z_os;
    431 	int		i, error;
    432 	uint64_t fsid_guid;
    433 	uint64_t zval;
    434 
    435 	*zpp = NULL;
    436 
    437 	error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
    438 	if (error) {
    439 		return (error);
    440 	} else if (zfsvfs->z_version > ZPL_VERSION) {
    441 		(void) printf("Mismatched versions:  File system "
    442 		    "is version %llu on-disk format, which is "
    443 		    "incompatible with this software version %lld!",
    444 		    (u_longlong_t)zfsvfs->z_version, ZPL_VERSION);
    445 		return (ENOTSUP);
    446 	}
    447 
    448 	if ((error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &zval)) != 0)
    449 		return (error);
    450 	zfsvfs->z_norm = (int)zval;
    451 	if ((error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &zval)) != 0)
    452 		return (error);
    453 	zfsvfs->z_utf8 = (zval != 0);
    454 	if ((error = zfs_get_zplprop(os, ZFS_PROP_CASE, &zval)) != 0)
    455 		return (error);
    456 	zfsvfs->z_case = (uint_t)zval;
    457 	/*
    458 	 * Fold case on file systems that are always or sometimes case
    459 	 * insensitive.
    460 	 */
    461 	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
    462 	    zfsvfs->z_case == ZFS_CASE_MIXED)
    463 		zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
    464 
    465 	/*
    466 	 * The fsid is 64 bits, composed of an 8-bit fs type, which
    467 	 * separates our fsid from any other filesystem types, and a
    468 	 * 56-bit objset unique ID.  The objset unique ID is unique to
    469 	 * all objsets open on this system, provided by unique_create().
    470 	 * The 8-bit fs type must be put in the low bits of fsid[1]
    471 	 * because that's where other Solaris filesystems put it.
    472 	 */
    473 	fsid_guid = dmu_objset_fsid_guid(os);
    474 	ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0);
    475 	zfsvfs->z_vfs->vfs_fsid.val[0] = fsid_guid;
    476 	zfsvfs->z_vfs->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) |
    477 	    zfsfstype & 0xFF;
    478 
    479 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
    480 	    &zfsvfs->z_root);
    481 	if (error)
    482 		return (error);
    483 	ASSERT(zfsvfs->z_root != 0);
    484 
    485 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
    486 	    &zfsvfs->z_unlinkedobj);
    487 	if (error)
    488 		return (error);
    489 
    490 	/*
    491 	 * Initialize zget mutex's
    492 	 */
    493 	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
    494 		mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
    495 
    496 	error = zfs_zget(zfsvfs, zfsvfs->z_root, zpp);
    497 	if (error) {
    498 		/*
    499 		 * On error, we destroy the mutexes here since it's not
    500 		 * possible for the caller to determine if the mutexes were
    501 		 * initialized properly.
    502 		 */
    503 		for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
    504 			mutex_destroy(&zfsvfs->z_hold_mtx[i]);
    505 		return (error);
    506 	}
    507 	ASSERT3U((*zpp)->z_id, ==, zfsvfs->z_root);
    508 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
    509 	    &zfsvfs->z_fuid_obj);
    510 	if (error == ENOENT)
    511 		error = 0;
    512 
    513 	return (0);
    514 }
    515 
/*
 * define a couple of values we need available
 * for both 64 and 32 bit environments.
 *
 * Each definition below is only a fallback: it takes effect solely when
 * the name has not already been defined by an earlier header.
 */
#ifndef NBITSMINOR64
#define	NBITSMINOR64	32
#endif
#ifndef MAXMAJ64
#define	MAXMAJ64	0xffffffffUL
#endif
#ifndef	MAXMIN64
#define	MAXMIN64	0xffffffffUL
#endif
    529 
    530 /*
    531  * Create special expldev for ZFS private use.
    532  * Can't use standard expldev since it doesn't do
    533  * what we want.  The standard expldev() takes a
    534  * dev32_t in LP64 and expands it to a long dev_t.
    535  * We need an interface that takes a dev32_t in ILP32
    536  * and expands it to a long dev_t.
    537  */
    538 static uint64_t
    539 zfs_expldev(dev_t dev)
    540 {
    541 #ifndef _LP64
    542 	major_t major = (major_t)dev >> NBITSMINOR32 & MAXMAJ32;
    543 	return (((uint64_t)major << NBITSMINOR64) |
    544 	    ((minor_t)dev & MAXMIN32));
    545 #else
    546 	return (dev);
    547 #endif
    548 }
    549 
    550 /*
    551  * Special cmpldev for ZFS private use.
    552  * Can't use standard cmpldev since it takes
    553  * a long dev_t and compresses it to dev32_t in
    554  * LP64.  We need to do a compaction of a long dev_t
    555  * to a dev32_t in ILP32.
    556  */
    557 dev_t
    558 zfs_cmpldev(uint64_t dev)
    559 {
    560 #ifndef _LP64
    561 	minor_t minor = (minor_t)dev & MAXMIN64;
    562 	major_t major = (major_t)(dev >> NBITSMINOR64) & MAXMAJ64;
    563 
    564 	if (major > MAXMAJ32 || minor > MAXMIN32)
    565 		return (NODEV32);
    566 
    567 	return (((dev32_t)major << NBITSMINOR32) | minor);
    568 #else
    569 	return (dev);
    570 #endif
    571 }
    572 
    573 static void
    574 zfs_znode_dmu_init(zfsvfs_t *zfsvfs, znode_t *zp, dmu_buf_t *db)
    575 {
    576 	znode_t		*nzp;
    577 
    578 	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs));
    579 	ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)));
    580 
    581 	mutex_enter(&zp->z_lock);
    582 
    583 	ASSERT(zp->z_dbuf == NULL);
    584 	zp->z_dbuf = db;
    585 	nzp = dmu_buf_set_user_ie(db, zp, &zp->z_phys, znode_evict_error);
    586 
    587 	/*
    588 	 * there should be no
    589 	 * concurrent zgets on this object.
    590 	 */
    591 	if (nzp != NULL)
    592 		panic("existing znode %p for dbuf %p", (void *)nzp, (void *)db);
    593 
    594 	/*
    595 	 * Slap on VROOT if we are the root znode
    596 	 */
    597 	if (zp->z_id == zfsvfs->z_root)
    598 		ZTOV(zp)->v_flag |= VROOT;
    599 
    600 	mutex_exit(&zp->z_lock);
    601 	vn_exists(ZTOV(zp));
    602 }
    603 
    604 void
    605 zfs_znode_dmu_fini(znode_t *zp)
    606 {
    607 	dmu_buf_t *db = zp->z_dbuf;
    608 	ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) ||
    609 	    zp->z_unlinked ||
    610 	    RW_WRITE_HELD(&zp->z_zfsvfs->z_teardown_inactive_lock));
    611 	ASSERT(zp->z_dbuf != NULL);
    612 	zp->z_dbuf = NULL;
    613 	VERIFY(zp == dmu_buf_update_user(db, zp, NULL, NULL, NULL));
    614 	dmu_buf_rele(db, NULL);
    615 }
    616 
    617 /*
    618  * Construct a new znode/vnode and intialize.
    619  *
    620  * This does not do a call to dmu_set_user() that is
    621  * up to the caller to do, in case you don't want to
    622  * return the znode
    623  */
    624 static znode_t *
    625 zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz)
    626 {
    627 	znode_t	*zp;
    628 	vnode_t *vp;
    629 
    630 	zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
    631 
    632 	ASSERT(zp->z_dirlocks == NULL);
    633 	ASSERT(zp->z_dbuf == NULL);
    634 	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
    635 
    636 	/*
    637 	 * Defer setting z_zfsvfs until the znode is ready to be a candidate for
    638 	 * the zfs_znode_move() callback.
    639 	 */
    640 	zp->z_phys = NULL;
    641 	zp->z_unlinked = 0;
    642 	zp->z_atime_dirty = 0;
    643 	zp->z_mapcnt = 0;
    644 	zp->z_last_itx = 0;
    645 	zp->z_id = db->db_object;
    646 	zp->z_blksz = blksz;
    647 	zp->z_seq = 0x7A4653;
    648 	zp->z_sync_cnt = 0;
    649 
    650 	vp = ZTOV(zp);
    651 	vn_reinit(vp);
    652 
    653 	zfs_znode_dmu_init(zfsvfs, zp, db);
    654 
    655 	zp->z_gen = zp->z_phys->zp_gen;
    656 
    657 	vp->v_vfsp = zfsvfs->z_parent->z_vfs;
    658 	vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode);
    659 
    660 	switch (vp->v_type) {
    661 	case VDIR:
    662 		if (zp->z_phys->zp_flags & ZFS_XATTR) {
    663 			vn_setops(vp, zfs_xdvnodeops);
    664 			vp->v_flag |= V_XATTRDIR;
    665 		} else {
    666 			vn_setops(vp, zfs_dvnodeops);
    667 		}
    668 		zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */
    669 		break;
    670 	case VBLK:
    671 	case VCHR:
    672 		vp->v_rdev = zfs_cmpldev(zp->z_phys->zp_rdev);
    673 		/*FALLTHROUGH*/
    674 	case VFIFO:
    675 	case VSOCK:
    676 	case VDOOR:
    677 		vn_setops(vp, zfs_fvnodeops);
    678 		break;
    679 	case VREG:
    680 		vp->v_flag |= VMODSORT;
    681 		vn_setops(vp, zfs_fvnodeops);
    682 		break;
    683 	case VLNK:
    684 		vn_setops(vp, zfs_symvnodeops);
    685 		break;
    686 	default:
    687 		vn_setops(vp, zfs_evnodeops);
    688 		break;
    689 	}
    690 
    691 	mutex_enter(&zfsvfs->z_znodes_lock);
    692 	list_insert_tail(&zfsvfs->z_all_znodes, zp);
    693 	membar_producer();
    694 	/*
    695 	 * Everything else must be valid before assigning z_zfsvfs makes the
    696 	 * znode eligible for zfs_znode_move().
    697 	 */
    698 	zp->z_zfsvfs = zfsvfs;
    699 	mutex_exit(&zfsvfs->z_znodes_lock);
    700 
    701 	VFS_HOLD(zfsvfs->z_vfs);
    702 	return (zp);
    703 }
    704 
    705 /*
    706  * Create a new DMU object to hold a zfs znode.
    707  *
    708  *	IN:	dzp	- parent directory for new znode
    709  *		vap	- file attributes for new znode
    710  *		tx	- dmu transaction id for zap operations
    711  *		cr	- credentials of caller
    712  *		flag	- flags:
    713  *			  IS_ROOT_NODE	- new object will be root
    714  *			  IS_XATTR	- new object is an attribute
    715  *			  IS_REPLAY	- intent log replay
    716  *		bonuslen - length of bonus buffer
    717  *		setaclp  - File/Dir initial ACL
    718  *		fuidp	 - Tracks fuid allocation.
    719  *
    720  *	OUT:	zpp	- allocated znode
    721  *
    722  */
    723 void
    724 zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
    725     uint_t flag, znode_t **zpp, int bonuslen, zfs_acl_t *setaclp,
    726     zfs_fuid_info_t **fuidp)
    727 {
    728 	dmu_buf_t	*db;
    729 	znode_phys_t	*pzp;
    730 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
    731 	timestruc_t	now;
    732 	uint64_t	gen, obj;
    733 	int		err;
    734 
    735 	ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
    736 
    737 	if (zfsvfs->z_assign >= TXG_INITIAL) {		/* ZIL replay */
    738 		obj = vap->va_nodeid;
    739 		flag |= IS_REPLAY;
    740 		now = vap->va_ctime;		/* see zfs_replay_create() */
    741 		gen = vap->va_nblocks;		/* ditto */
    742 	} else {
    743 		obj = 0;
    744 		gethrestime(&now);
    745 		gen = dmu_tx_get_txg(tx);
    746 	}
    747 
    748 	/*
    749 	 * Create a new DMU object.
    750 	 */
    751 	/*
    752 	 * There's currently no mechanism for pre-reading the blocks that will
    753 	 * be to needed allocate a new object, so we accept the small chance
    754 	 * that there will be an i/o error and we will fail one of the
    755 	 * assertions below.
    756 	 */
    757 	if (vap->va_type == VDIR) {
    758 		if (flag & IS_REPLAY) {
    759 			err = zap_create_claim_norm(zfsvfs->z_os, obj,
    760 			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
    761 			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
    762 			ASSERT3U(err, ==, 0);
    763 		} else {
    764 			obj = zap_create_norm(zfsvfs->z_os,
    765 			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
    766 			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
    767 		}
    768 	} else {
    769 		if (flag & IS_REPLAY) {
    770 			err = dmu_object_claim(zfsvfs->z_os, obj,
    771 			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
    772 			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
    773 			ASSERT3U(err, ==, 0);
    774 		} else {
    775 			obj = dmu_object_alloc(zfsvfs->z_os,
    776 			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
    777 			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
    778 		}
    779 	}
    780 	VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, obj, NULL, &db));
    781 	dmu_buf_will_dirty(db, tx);
    782 
    783 	/*
    784 	 * Initialize the znode physical data to zero.
    785 	 */
    786 	ASSERT(db->db_size >= sizeof (znode_phys_t));
    787 	bzero(db->db_data, db->db_size);
    788 	pzp = db->db_data;
    789 
    790 	/*
    791 	 * If this is the root, fix up the half-initialized parent pointer
    792 	 * to reference the just-allocated physical data area.
    793 	 */
    794 	if (flag & IS_ROOT_NODE) {
    795 		dzp->z_dbuf = db;
    796 		dzp->z_phys = pzp;
    797 		dzp->z_id = obj;
    798 	}
    799 
    800 	/*
    801 	 * If parent is an xattr, so am I.
    802 	 */
    803 	if (dzp->z_phys->zp_flags & ZFS_XATTR)
    804 		flag |= IS_XATTR;
    805 
    806 	if (vap->va_type == VBLK || vap->va_type == VCHR) {
    807 		pzp->zp_rdev = zfs_expldev(vap->va_rdev);
    808 	}
    809 
    810 	if (zfsvfs->z_use_fuids)
    811 		pzp->zp_flags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;
    812 
    813 	if (vap->va_type == VDIR) {
    814 		pzp->zp_size = 2;		/* contents ("." and "..") */
    815 		pzp->zp_links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1;
    816 	}
    817 
    818 	pzp->zp_parent = dzp->z_id;
    819 	if (flag & IS_XATTR)
    820 		pzp->zp_flags |= ZFS_XATTR;
    821 
    822 	pzp->zp_gen = gen;
    823 
    824 	ZFS_TIME_ENCODE(&now, pzp->zp_crtime);
    825 	ZFS_TIME_ENCODE(&now, pzp->zp_ctime);
    826 
    827 	if (vap->va_mask & AT_ATIME) {
    828 		ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
    829 	} else {
    830 		ZFS_TIME_ENCODE(&now, pzp->zp_atime);
    831 	}
    832 
    833 	if (vap->va_mask & AT_MTIME) {
    834 		ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
    835 	} else {
    836 		ZFS_TIME_ENCODE(&now, pzp->zp_mtime);
    837 	}
    838 
    839 	pzp->zp_mode = MAKEIMODE(vap->va_type, vap->va_mode);
    840 	if (!(flag & IS_ROOT_NODE)) {
    841 		ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
    842 		*zpp = zfs_znode_alloc(zfsvfs, db, 0);
    843 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
    844 	} else {
    845 		/*
    846 		 * If we are creating the root node, the "parent" we
    847 		 * passed in is the znode for the root.
    848 		 */
    849 		*zpp = dzp;
    850 	}
    851 	zfs_perm_init(*zpp, dzp, flag, vap, tx, cr, setaclp, fuidp);
    852 }
    853 
    854 void
    855 zfs_xvattr_set(znode_t *zp, xvattr_t *xvap)
    856 {
    857 	xoptattr_t *xoap;
    858 
    859 	xoap = xva_getxoptattr(xvap);
    860 	ASSERT(xoap);
    861 
    862 	if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
    863 		ZFS_TIME_ENCODE(&xoap->xoa_createtime, zp->z_phys->zp_crtime);
    864 		XVA_SET_RTN(xvap, XAT_CREATETIME);
    865 	}
    866 	if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
    867 		ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly);
    868 		XVA_SET_RTN(xvap, XAT_READONLY);
    869 	}
    870 	if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
    871 		ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden);
    872 		XVA_SET_RTN(xvap, XAT_HIDDEN);
    873 	}
    874 	if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
    875 		ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system);
    876 		XVA_SET_RTN(xvap, XAT_SYSTEM);
    877 	}
    878 	if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
    879 		ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive);
    880 		XVA_SET_RTN(xvap, XAT_ARCHIVE);
    881 	}
    882 	if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
    883 		ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable);
    884 		XVA_SET_RTN(xvap, XAT_IMMUTABLE);
    885 	}
    886 	if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
    887 		ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink);
    888 		XVA_SET_RTN(xvap, XAT_NOUNLINK);
    889 	}
    890 	if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
    891 		ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly);
    892 		XVA_SET_RTN(xvap, XAT_APPENDONLY);
    893 	}
    894 	if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
    895 		ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump);
    896 		XVA_SET_RTN(xvap, XAT_NODUMP);
    897 	}
    898 	if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
    899 		ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque);
    900 		XVA_SET_RTN(xvap, XAT_OPAQUE);
    901 	}
    902 	if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
    903 		ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED,
    904 		    xoap->xoa_av_quarantined);
    905 		XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
    906 	}
    907 	if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
    908 		ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified);
    909 		XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
    910 	}
    911 	if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
    912 		(void) memcpy(zp->z_phys + 1, xoap->xoa_av_scanstamp,
    913 		    sizeof (xoap->xoa_av_scanstamp));
    914 		zp->z_phys->zp_flags |= ZFS_BONUS_SCANSTAMP;
    915 		XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
    916 	}
    917 }
    918 
    919 int
    920 zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
    921 {
    922 	dmu_object_info_t doi;
    923 	dmu_buf_t	*db;
    924 	znode_t		*zp;
    925 	int err;
    926 
    927 	*zpp = NULL;
    928 
    929 	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
    930 
    931 	err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db);
    932 	if (err) {
    933 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
    934 		return (err);
    935 	}
    936 
    937 	dmu_object_info_from_db(db, &doi);
    938 	if (doi.doi_bonus_type != DMU_OT_ZNODE ||
    939 	    doi.doi_bonus_size < sizeof (znode_phys_t)) {
    940 		dmu_buf_rele(db, NULL);
    941 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
    942 		return (EINVAL);
    943 	}
    944 
    945 	zp = dmu_buf_get_user(db);
    946 	if (zp != NULL) {
    947 		mutex_enter(&zp->z_lock);
    948 
    949 		/*
    950 		 * Since we do immediate eviction of the z_dbuf, we
    951 		 * should never find a dbuf with a znode that doesn't
    952 		 * know about the dbuf.
    953 		 */
    954 		ASSERT3P(zp->z_dbuf, ==, db);
    955 		ASSERT3U(zp->z_id, ==, obj_num);
    956 		if (zp->z_unlinked) {
    957 			err = ENOENT;
    958 		} else {
    959 			VN_HOLD(ZTOV(zp));
    960 			*zpp = zp;
    961 			err = 0;
    962 		}
    963 		dmu_buf_rele(db, NULL);
    964 		mutex_exit(&zp->z_lock);
    965 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
    966 		return (err);
    967 	}
    968 
    969 	/*
    970 	 * Not found create new znode/vnode
    971 	 */
    972 	zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size);
    973 	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
    974 	*zpp = zp;
    975 	return (0);
    976 }
    977 
    978 int
    979 zfs_rezget(znode_t *zp)
    980 {
    981 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
    982 	dmu_object_info_t doi;
    983 	dmu_buf_t *db;
    984 	uint64_t obj_num = zp->z_id;
    985 	int err;
    986 
    987 	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
    988 
    989 	err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db);
    990 	if (err) {
    991 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
    992 		return (err);
    993 	}
    994 
    995 	dmu_object_info_from_db(db, &doi);
    996 	if (doi.doi_bonus_type != DMU_OT_ZNODE ||
    997 	    doi.doi_bonus_size < sizeof (znode_phys_t)) {
    998 		dmu_buf_rele(db, NULL);
    999 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
   1000 		return (EINVAL);
   1001 	}
   1002 
   1003 	if (((znode_phys_t *)db->db_data)->zp_gen != zp->z_gen) {
   1004 		dmu_buf_rele(db, NULL);
   1005 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
   1006 		return (EIO);
   1007 	}
   1008 
   1009 	zfs_znode_dmu_init(zfsvfs, zp, db);
   1010 	zp->z_unlinked = (zp->z_phys->zp_links == 0);
   1011 	zp->z_blksz = doi.doi_data_block_size;
   1012 
   1013 	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
   1014 
   1015 	return (0);
   1016 }
   1017 
   1018 void
   1019 zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
   1020 {
   1021 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
   1022 	objset_t *os = zfsvfs->z