Home | History | Annotate | Download | only in fs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
     27 /*	  All Rights Reserved  	*/
     28 
     29 /*
     30  * University Copyright- Copyright (c) 1982, 1986, 1988
     31  * The Regents of the University of California
     32  * All Rights Reserved
     33  *
     34  * University Acknowledgment- Portions of this document are derived from
     35  * software developed by the University of California, Berkeley, and its
     36  * contributors.
     37  */
     38 
     39 #pragma ident	"@(#)dnlc.c	1.60	06/03/22 SMI"
     40 
     41 #include <sys/types.h>
     42 #include <sys/systm.h>
     43 #include <sys/param.h>
     44 #include <sys/t_lock.h>
     45 #include <sys/systm.h>
     46 #include <sys/vfs.h>
     47 #include <sys/vnode.h>
     48 #include <sys/dnlc.h>
     49 #include <sys/kmem.h>
     50 #include <sys/cmn_err.h>
     51 #include <sys/vtrace.h>
     52 #include <sys/bitmap.h>
     53 #include <sys/var.h>
     54 #include <sys/sysmacros.h>
     55 #include <sys/kstat.h>
     56 #include <sys/atomic.h>
     57 #include <sys/taskq.h>
     58 
     59 /*
     60  * Directory name lookup cache.
     61  * Based on code originally done by Robert Elz at Melbourne.
     62  *
     63  * Names found by directory scans are retained in a cache
     64  * for future reference.  Each hash chain is ordered by LRU
     65  * Cache is indexed by hash value obtained from (vp, name)
     66  * where the vp refers to the directory containing the name.
     67  */
     68 
     69 /*
     70  * Tunable nc_hashavelen is the average length desired for this chain, from
     71  * which the size of the nc_hash table is derived at create time.
     72  */
     73 #define	NC_HASHAVELEN_DEFAULT	4
     74 int nc_hashavelen = NC_HASHAVELEN_DEFAULT;
     75 
     76 /*
     77  * NC_MOVETOFRONT is the move-to-front threshold: if the hash lookup
     78  * depth exceeds this value, we move the looked-up entry to the front of
     79  * its hash chain.  The idea is to make sure that the most frequently
     80  * accessed entries are found most quickly (by keeping them near the
     81  * front of their hash chains).
     82  */
     83 #define	NC_MOVETOFRONT	2
     84 
     85 /*
     86  *
     87  * DNLC_MAX_RELE is used to size an array on the stack when releasing
     88  * vnodes. This array is used rather than calling VN_RELE() inline because
     89  * all dnlc locks must be dropped by that time in order to avoid a
     90  * possible deadlock. This deadlock occurs when the dnlc holds the last
     91  * reference to the vnode and so the VOP_INACTIVE vector is called which
     92  * can in turn call back into the dnlc. A global array was used but had
     93  * many problems:
     94  *	1) Actually doesn't have an upper bound on the array size as
     95  *	   entries can be added after starting the purge.
     96  *	2) The locking scheme causes a hang.
     97  *	3) Caused serialisation on the global lock.
     98  *	4) The array was often unnecessarily huge.
     99  *
    100  * Note the current value 8 allows up to 4 cache entries (to be purged
    101  * from each hash chain), before having to cycle around and retry.
    102  * This ought to be ample given that nc_hashavelen is typically very small.
    103  */
    104 #define	DNLC_MAX_RELE	8 /* must be even */
    105 
    106 /*
    107  * Hash table of name cache entries for fast lookup, dynamically
    108  * allocated at startup.
    109  */
    110 nc_hash_t *nc_hash;
    111 
    112 /*
    113  * Rotors. Used to select entries on a round-robin basis.
    114  */
    115 static nc_hash_t *dnlc_purge_fs1_rotor;
    116 static nc_hash_t *dnlc_free_rotor;
    117 
    118 /*
    119  * # of dnlc entries (uninitialized)
    120  *
    121  * the initial value was chosen as being
    122  * a random string of bits, probably not
    123  * normally chosen by a systems administrator
    124  */
    125 int ncsize = -1;
    126 volatile uint32_t dnlc_nentries = 0;	/* current num of name cache entries */
    127 static int nc_hashsz;			/* size of hash table */
    128 static int nc_hashmask;			/* size of hash table minus 1 */
    129 
    130 /*
    131  * The dnlc_reduce_cache() taskq queue is activated when there are
    132  * ncsize name cache entries and if no parameter is provided, it reduces
    133  * the size down to dnlc_nentries_low_water, which is by default one
    134  * hundreth less (or 99%) of ncsize.
    135  *
    136  * If a parameter is provided to dnlc_reduce_cache(), then we reduce
    137  * the size down based on ncsize_onepercent - where ncsize_onepercent
    138  * is 1% of ncsize; however, we never let dnlc_reduce_cache() reduce
    139  * the size below 3% of ncsize (ncsize_min_percent).
    140  */
    141 #define	DNLC_LOW_WATER_DIVISOR_DEFAULT 100
    142 uint_t dnlc_low_water_divisor = DNLC_LOW_WATER_DIVISOR_DEFAULT;
    143 uint_t dnlc_nentries_low_water;
    144 int dnlc_reduce_idle = 1; /* no locking needed */
    145 uint_t ncsize_onepercent;
    146 uint_t ncsize_min_percent;
    147 
    148 /*
    149  * If dnlc_nentries hits dnlc_max_nentries (twice ncsize)
    150  * then this means the dnlc_reduce_cache() taskq is failing to
    151  * keep up. In this case we refuse to add new entries to the dnlc
    152  * until the taskq catches up.
    153  */
    154 uint_t dnlc_max_nentries; /* twice ncsize */
    155 uint64_t dnlc_max_nentries_cnt = 0; /* statistic on times we failed */
    156 
    157 /*
    158  * Tunable to define when we should just remove items from
    159  * the end of the chain.
    160  */
    161 #define	DNLC_LONG_CHAIN 8
    162 uint_t dnlc_long_chain = DNLC_LONG_CHAIN;
    163 
    164 /*
    165  * ncstats has been deprecated, due to the integer size of the counters
    166  * which can easily overflow in the dnlc.
    167  * It is maintained (at some expense) for compatability.
    168  * The preferred interface is the kstat accessible nc_stats below.
    169  */
    170 struct ncstats ncstats;
    171 
    172 struct nc_stats ncs = {
    173 	{ "hits",			KSTAT_DATA_UINT64 },
    174 	{ "misses",			KSTAT_DATA_UINT64 },
    175 	{ "negative_cache_hits",	KSTAT_DATA_UINT64 },
    176 	{ "enters",			KSTAT_DATA_UINT64 },
    177 	{ "double_enters",		KSTAT_DATA_UINT64 },
    178 	{ "purge_total_entries",	KSTAT_DATA_UINT64 },
    179 	{ "purge_all",			KSTAT_DATA_UINT64 },
    180 	{ "purge_vp",			KSTAT_DATA_UINT64 },
    181 	{ "purge_vfs",			KSTAT_DATA_UINT64 },
    182 	{ "purge_fs1",			KSTAT_DATA_UINT64 },
    183 	{ "pick_free",			KSTAT_DATA_UINT64 },
    184 	{ "pick_heuristic",		KSTAT_DATA_UINT64 },
    185 	{ "pick_last",			KSTAT_DATA_UINT64 },
    186 
    187 	/* directory caching stats */
    188 
    189 	{ "dir_hits",			KSTAT_DATA_UINT64 },
    190 	{ "dir_misses",			KSTAT_DATA_UINT64 },
    191 	{ "dir_cached_current",		KSTAT_DATA_UINT64 },
    192 	{ "dir_entries_cached_current",	KSTAT_DATA_UINT64 },
    193 	{ "dir_cached_total",		KSTAT_DATA_UINT64 },
    194 	{ "dir_start_no_memory",	KSTAT_DATA_UINT64 },
    195 	{ "dir_add_no_memory",		KSTAT_DATA_UINT64 },
    196 	{ "dir_add_abort",		KSTAT_DATA_UINT64 },
    197 	{ "dir_add_max",		KSTAT_DATA_UINT64 },
    198 	{ "dir_remove_entry_fail",	KSTAT_DATA_UINT64 },
    199 	{ "dir_remove_space_fail",	KSTAT_DATA_UINT64 },
    200 	{ "dir_update_fail",		KSTAT_DATA_UINT64 },
    201 	{ "dir_fini_purge",		KSTAT_DATA_UINT64 },
    202 	{ "dir_reclaim_last",		KSTAT_DATA_UINT64 },
    203 	{ "dir_reclaim_any",		KSTAT_DATA_UINT64 },
    204 };
    205 
    206 static int doingcache = 1;
    207 
    208 vnode_t negative_cache_vnode;
    209 
    210 /*
    211  * Insert entry at the front of the queue
    212  */
    213 #define	nc_inshash(ncp, hp) \
    214 { \
    215 	(ncp)->hash_next = (hp)->hash_next; \
    216 	(ncp)->hash_prev = (ncache_t *)(hp); \
    217 	(hp)->hash_next->hash_prev = (ncp); \
    218 	(hp)->hash_next = (ncp); \
    219 }
    220 
    221 /*
    222  * Remove entry from hash queue
    223  */
    224 #define	nc_rmhash(ncp) \
    225 { \
    226 	(ncp)->hash_prev->hash_next = (ncp)->hash_next; \
    227 	(ncp)->hash_next->hash_prev = (ncp)->hash_prev; \
    228 	(ncp)->hash_prev = NULL; \
    229 	(ncp)->hash_next = NULL; \
    230 }
    231 
    232 /*
    233  * Free an entry.
    234  */
    235 #define	dnlc_free(ncp) \
    236 { \
    237 	kmem_free((ncp), sizeof (ncache_t) + (ncp)->namlen); \
    238 	atomic_add_32(&dnlc_nentries, -1); \
    239 }
    240 
    241 
    242 /*
    243  * Cached directory info.
    244  * ======================
    245  */
    246 
    247 /*
    248  * Cached directory free space hash function.
    249  * Needs the free space handle and the dcp to get the hash table size
    250  * Returns the hash index.
    251  */
    252 #define	DDFHASH(handle, dcp) ((handle >> 2) & (dcp)->dc_fhash_mask)
    253 
    254 /*
    255  * Cached directory name entry hash function.
    256  * Uses the name and returns in the input arguments the hash and the name
    257  * length.
    258  */
    259 #define	DNLC_DIR_HASH(name, hash, namelen)			\
    260 	{							\
    261 		char Xc, *Xcp;					\
    262 		hash = *name;					\
    263 		for (Xcp = (name + 1); (Xc = *Xcp) != 0; Xcp++)	\
    264 			hash = (hash << 4) + hash + Xc;		\
    265 		ASSERT((Xcp - (name)) <= ((1 << NBBY) - 1));	\
    266 		namelen = Xcp - (name);				\
    267 	}
    268 
    269 /* special dircache_t pointer to indicate error should be returned */
    270 /*
    271  * The anchor directory cache pointer can contain 3 types of values,
    272  * 1) NULL: No directory cache
    273  * 2) DC_RET_LOW_MEM (-1): There was a directory cache that found to be
    274  *    too big or a memory shortage occurred. This value remains in the
    275  *    pointer until a dnlc_dir_start() which returns the a DNOMEM error.
    276  *    This is kludgy but efficient and only visible in this source file.
    277  * 3) A valid cache pointer.
    278  */
    279 #define	DC_RET_LOW_MEM (dircache_t *)1
    280 #define	VALID_DIR_CACHE(dcp) ((dircache_t *)(dcp) > DC_RET_LOW_MEM)
    281 
    282 /* Tunables */
    283 uint_t dnlc_dir_enable = 1; /* disable caching directories by setting to 0 */
    284 uint_t dnlc_dir_min_size = 40; /* min no of directory entries before caching */
    285 uint_t dnlc_dir_max_size = UINT_MAX; /* ditto maximum */
    286 uint_t dnlc_dir_hash_size_shift = 3; /* 8 entries per hash bucket */
    287 uint_t dnlc_dir_min_reclaim =  350000; /* approx 1MB of dcentrys */
    288 /*
    289  * dnlc_dir_hash_resize_shift determines when the hash tables
    290  * get re-adjusted due to growth or shrinkage
    291  * - currently 2 indicating that there can be at most 4
    292  * times or at least one quarter the number of entries
    293  * before hash table readjustment. Note that with
    294  * dnlc_dir_hash_size_shift above set at 3 this would
    295  * mean readjustment would occur if the average number
    296  * of entries went above 32 or below 2
    297  */
    298 uint_t dnlc_dir_hash_resize_shift = 2; /* readjust rate */
    299 
    300 static kmem_cache_t *dnlc_dir_space_cache; /* free space entry cache */
    301 static dchead_t dc_head; /* anchor of cached directories */
    302 
    303 /* Prototypes */
    304 static ncache_t *dnlc_get(uchar_t namlen);
    305 static ncache_t *dnlc_search(vnode_t *dp, char *name, uchar_t namlen, int hash);
    306 static void dnlc_dir_reclaim(void *unused);
    307 static void dnlc_dir_abort(dircache_t *dcp);
    308 static void dnlc_dir_adjust_fhash(dircache_t *dcp);
    309 static void dnlc_dir_adjust_nhash(dircache_t *dcp);
    310 static void do_dnlc_reduce_cache(void *);
    311 
    312 
    313 /*
    314  * Initialize the directory cache.
    315  */
    316 void
    317 dnlc_init()
    318 {
    319 	nc_hash_t *hp;
    320 	kstat_t *ksp;
    321 	int i;
    322 
    323 	/*
    324 	 * Set up the size of the dnlc (ncsize) and its low water mark.
    325 	 */
    326 	if (ncsize == -1) {
    327 		/* calculate a reasonable size for the low water */
    328 		dnlc_nentries_low_water = 4 * (v.v_proc + maxusers) + 320;
    329 		ncsize = dnlc_nentries_low_water +
    330 		    (dnlc_nentries_low_water / dnlc_low_water_divisor);
    331 	} else {
    332 		/* don't change the user specified ncsize */
    333 		dnlc_nentries_low_water =
    334 		    ncsize - (ncsize / dnlc_low_water_divisor);
    335 	}
    336 	if (ncsize <= 0) {
    337 		doingcache = 0;
    338 		dnlc_dir_enable = 0; /* also disable directory caching */
    339 		ncsize = 0;
    340 		cmn_err(CE_NOTE, "name cache (dnlc) disabled");
    341 		return;
    342 	}
    343 	dnlc_max_nentries = ncsize * 2;
    344 	ncsize_onepercent = ncsize / 100;
    345 	ncsize_min_percent = ncsize_onepercent * 3;
    346 
    347 	/*
    348 	 * Initialise the hash table.
    349 	 * Compute hash size rounding to the next power of two.
    350 	 */
    351 	nc_hashsz = ncsize / nc_hashavelen;
    352 	nc_hashsz = 1 << highbit(nc_hashsz);
    353 	nc_hashmask = nc_hashsz - 1;
    354 	nc_hash = kmem_zalloc(nc_hashsz * sizeof (*nc_hash), KM_SLEEP);
    355 	for (i = 0; i < nc_hashsz; i++) {
    356 		hp = (nc_hash_t *)&nc_hash[i];
    357 		mutex_init(&hp->hash_lock, NULL, MUTEX_DEFAULT, NULL);
    358 		hp->hash_next = (ncache_t *)hp;
    359 		hp->hash_prev = (ncache_t *)hp;
    360 	}
    361 
    362 	/*
    363 	 * Initialize rotors
    364 	 */
    365 	dnlc_free_rotor = dnlc_purge_fs1_rotor = &nc_hash[0];
    366 
    367 	/*
    368 	 * Set up the directory caching to use kmem_cache_alloc
    369 	 * for its free space entries so that we can get a callback
    370 	 * when the system is short on memory, to allow us to free
    371 	 * up some memory. we don't use the constructor/deconstructor
    372 	 * functions.
    373 	 */
    374 	dnlc_dir_space_cache = kmem_cache_create("dnlc_space_cache",
    375 	    sizeof (dcfree_t), 0, NULL, NULL, dnlc_dir_reclaim, NULL,
    376 	    NULL, 0);
    377 
    378 	/*
    379 	 * Initialise the head of the cached directory structures
    380 	 */
    381 	mutex_init(&dc_head.dch_lock, NULL, MUTEX_DEFAULT, NULL);
    382 	dc_head.dch_next = (dircache_t *)&dc_head;
    383 	dc_head.dch_prev = (dircache_t *)&dc_head;
    384 
    385 	/*
    386 	 * Initialise the reference count of the negative cache vnode to 1
    387 	 * so that it never goes away (VOP_INACTIVE isn't called on it).
    388 	 */
    389 	negative_cache_vnode.v_count = 1;
    390 
    391 	/*
    392 	 * Initialise kstats - both the old compatability raw kind and
    393 	 * the more extensive named stats.
    394 	 */
    395 	ksp = kstat_create("unix", 0, "ncstats", "misc", KSTAT_TYPE_RAW,
    396 		sizeof (struct ncstats), KSTAT_FLAG_VIRTUAL);
    397 	if (ksp) {
    398 		ksp->ks_data = (void *) &ncstats;
    399 		kstat_install(ksp);
    400 	}
    401 	ksp = kstat_create("unix", 0, "dnlcstats", "misc", KSTAT_TYPE_NAMED,
    402 	    sizeof (ncs) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
    403 	if (ksp) {
    404 		ksp->ks_data = (void *) &ncs;
    405 		kstat_install(ksp);
    406 	}
    407 }
    408 
    409 /*
    410  * Add a name to the directory cache.
    411  */
    412 void
    413 dnlc_enter(vnode_t *dp, char *name, vnode_t *vp)
    414 {
    415 	ncache_t *ncp;
    416 	nc_hash_t *hp;
    417 	uchar_t namlen;
    418 	int hash;
    419 
    420 	TRACE_0(TR_FAC_NFS, TR_DNLC_ENTER_START, "dnlc_enter_start:");
    421 
    422 	if (!doingcache) {
    423 		TRACE_2(TR_FAC_NFS, TR_DNLC_ENTER_END,
    424 		    "dnlc_enter_end:(%S) %d", "not caching", 0);
    425 		return;
    426 	}
    427 
    428 	/*
    429 	 * Get a new dnlc entry. Assume the entry won't be in the cache
    430 	 * and initialize it now
    431 	 */
    432 	DNLCHASH(name, dp, hash, namlen);
    433 	if ((ncp = dnlc_get(namlen)) == NULL)
    434 		return;
    435 	ncp->dp = dp;
    436 	VN_HOLD(dp);
    437 	ncp->vp = vp;
    438 	VN_HOLD(vp);
    439 	bcopy(name, ncp->name, namlen + 1); /* name and null */
    440 	ncp->hash = hash;
    441 	hp = &nc_hash[hash & nc_hashmask];
    442 
    443 	mutex_enter(&hp->hash_lock);
    444 	if (dnlc_search(dp, name, namlen, hash) != NULL) {
    445 		mutex_exit(&hp->hash_lock);
    446 		ncstats.dbl_enters++;
    447 		ncs.ncs_dbl_enters.value.ui64++;
    448 		VN_RELE(dp);
    449 		VN_RELE(vp);
    450 		dnlc_free(ncp);		/* crfree done here */
    451 		TRACE_2(TR_FAC_NFS, TR_DNLC_ENTER_END,
    452 			"dnlc_enter_end:(%S) %d",
    453 			"dbl enter", ncstats.dbl_enters);
    454 		return;
    455 	}
    456 	/*
    457 	 * Insert back into the hash chain.
    458 	 */
    459 	nc_inshash(ncp, hp);
    460 	mutex_exit(&hp->hash_lock);
    461 	ncstats.enters++;
    462 	ncs.ncs_enters.value.ui64++;
    463 	TRACE_2(TR_FAC_NFS, TR_DNLC_ENTER_END,
    464 	    "dnlc_enter_end:(%S) %d", "done", ncstats.enters);
    465 }
    466 
    467 /*
    468  * Add a name to the directory cache.
    469  *
    470  * This function is basically identical with
    471  * dnlc_enter().  The difference is that when the
    472  * desired dnlc entry is found, the vnode in the
    473  * ncache is compared with the vnode passed in.
    474  *
    475  * If they are not equal then the ncache is
    476  * updated with the passed in vnode.  Otherwise
    477  * it just frees up the newly allocated dnlc entry.
    478  */
    479 void
    480 dnlc_update(vnode_t *dp, char *name, vnode_t *vp)
    481 {
    482 	ncache_t *ncp;
    483 	ncache_t *tcp;
    484 	vnode_t *tvp;
    485 	nc_hash_t *hp;
    486 	int hash;
    487 	uchar_t namlen;
    488 
    489 	TRACE_0(TR_FAC_NFS, TR_DNLC_ENTER_START, "dnlc_update_start:");
    490 
    491 	if (!doingcache) {
    492 		TRACE_2(TR_FAC_NFS, TR_DNLC_ENTER_END,
    493 		    "dnlc_update_end:(%S) %d", "not caching", 0);
    494 		return;
    495 	}
    496 
    497 	/*
    498 	 * Get a new dnlc entry and initialize it now.
    499 	 * If we fail to get a new entry, call dnlc_remove() to purge
    500 	 * any existing dnlc entry including negative cache (DNLC_NO_VNODE)
    501 	 * entry.
    502 	 * Failure to clear an existing entry could result in false dnlc
    503 	 * lookup (negative/stale entry).
    504 	 */
    505 	DNLCHASH(name, dp, hash, namlen);
    506 	if ((ncp = dnlc_get(namlen)) == NULL) {
    507 		dnlc_remove(dp, name);
    508 		return;
    509 	}
    510 	ncp->dp = dp;
    511 	VN_HOLD(dp);
    512 	ncp->vp = vp;
    513 	VN_HOLD(vp);
    514 	bcopy(name, ncp->name, namlen + 1); /* name and null */
    515 	ncp->hash = hash;
    516 	hp = &nc_hash[hash & nc_hashmask];
    517 
    518 	mutex_enter(&hp->hash_lock);
    519 	if ((tcp = dnlc_search(dp, name, namlen, hash)) != NULL) {
    520 		if (tcp->vp != vp) {
    521 			tvp = tcp->vp;
    522 			tcp->vp = vp;
    523 			mutex_exit(&hp->hash_lock);
    524 			VN_RELE(tvp);
    525 			ncstats.enters++;
    526 			ncs.ncs_enters.value.ui64++;
    527 			TRACE_2(TR_FAC_NFS, TR_DNLC_ENTER_END,
    528 			    "dnlc_update_end:(%S) %d", "done", ncstats.enters);
    529 		} else {
    530 			mutex_exit(&hp->hash_lock);
    531 			VN_RELE(vp);
    532 			ncstats.dbl_enters++;
    533 			ncs.ncs_dbl_enters.value.ui64++;
    534 			TRACE_2(TR_FAC_NFS, TR_DNLC_ENTER_END,
    535 			    "dnlc_update_end:(%S) %d",
    536 			    "dbl enter", ncstats.dbl_enters);
    537 		}
    538 		VN_RELE(dp);
    539 		dnlc_free(ncp);		/* crfree done here */
    540 		return;
    541 	}
    542 	/*
    543 	 * insert the new entry, since it is not in dnlc yet
    544 	 */
    545 	nc_inshash(ncp, hp);
    546 	mutex_exit(&hp->hash_lock);
    547 	ncstats.enters++;
    548 	ncs.ncs_enters.value.ui64++;
    549 	TRACE_2(TR_FAC_NFS, TR_DNLC_ENTER_END,
    550 	    "dnlc_update_end:(%S) %d", "done", ncstats.enters);
    551 }
    552 
    553 /*
    554  * Look up a name in the directory name cache.
    555  *
    556  * Return a doubly-held vnode if found: one hold so that it may
    557  * remain in the cache for other users, the other hold so that
    558  * the cache is not re-cycled and the identity of the vnode is
    559  * lost before the caller can use the vnode.
    560  */
    561 vnode_t *
    562 dnlc_lookup(vnode_t *dp, char *name)
    563 {
    564 	ncache_t *ncp;
    565 	nc_hash_t *hp;
    566 	vnode_t *vp;
    567 	int hash, depth;
    568 	uchar_t namlen;
    569 
    570 	TRACE_2(TR_FAC_NFS, TR_DNLC_LOOKUP_START,
    571 	    "dnlc_lookup_start:dp %x name %s", dp, name);
    572 
    573 	if (!doingcache) {
    574 		TRACE_4(TR_FAC_NFS, TR_DNLC_LOOKUP_END,
    575 		    "dnlc_lookup_end:%S %d vp %x name %s",
    576 		    "not_caching", 0, NULL, name);
    577 		return (NULL);
    578 	}
    579 
    580 	DNLCHASH(name, dp, hash, namlen);
    581 	depth = 1;
    582 	hp = &nc_hash[hash & nc_hashmask];
    583 	mutex_enter(&hp->hash_lock);
    584 
    585 	for (ncp = hp->hash_next; ncp != (ncache_t *)hp;
    586 	    ncp = ncp->hash_next) {
    587 		if (ncp->hash == hash &&	/* fast signature check */
    588 		    ncp->dp == dp &&
    589 		    ncp->namlen == namlen &&
    590 		    bcmp(ncp->name, name, namlen) == 0) {
    591 			/*
    592 			 * Move this entry to the head of its hash chain
    593 			 * if it's not already close.
    594 			 */
    595 			if (depth > NC_MOVETOFRONT) {
    596 				ncache_t *next = ncp->hash_next;
    597 				ncache_t *prev = ncp->hash_prev;
    598 
    599 				prev->hash_next = next;
    600 				next->hash_prev = prev;
    601 				ncp->hash_next = next = hp->hash_next;
    602 				ncp->hash_prev = (ncache_t *)hp;
    603 				next->hash_prev = ncp;
    604 				hp->hash_next = ncp;
    605 
    606 				ncstats.move_to_front++;
    607 			}
    608 
    609 			/*
    610 			 * Put a hold on the vnode now so its identity
    611 			 * can't change before the caller has a chance to
    612 			 * put a hold on it.
    613 			 */
    614 			vp = ncp->vp;
    615 			VN_HOLD(vp);
    616 			mutex_exit(&hp->hash_lock);
    617 			ncstats.hits++;
    618 			ncs.ncs_hits.value.ui64++;
    619 			if (vp == DNLC_NO_VNODE) {
    620 				ncs.ncs_neg_hits.value.ui64++;
    621 			}
    622 			TRACE_4(TR_FAC_NFS, TR_DNLC_LOOKUP_END,
    623 				"dnlc_lookup_end:%S %d vp %x name %s",
    624 				"hit", ncstats.hits, vp, name);
    625 			return (vp);
    626 		}
    627 		depth++;
    628 	}
    629 
    630 	mutex_exit(&hp->hash_lock);
    631 	ncstats.misses++;
    632 	ncs.ncs_misses.value.ui64++;
    633 	TRACE_4(TR_FAC_NFS, TR_DNLC_LOOKUP_END,
    634 		"dnlc_lookup_end:%S %d vp %x name %s", "miss", ncstats.misses,
    635 	    NULL, name);
    636 	return (NULL);
    637 }
    638 
    639 /*
    640  * Remove an entry in the directory name cache.
    641  */
    642 void
    643 dnlc_remove(vnode_t *dp, char *name)
    644 {
    645 	ncache_t *ncp;
    646 	nc_hash_t *hp;
    647 	uchar_t namlen;
    648 	int hash;
    649 
    650 	if (!doingcache)
    651 		return;
    652 	DNLCHASH(name, dp, hash, namlen);
    653 	hp = &nc_hash[hash & nc_hashmask];
    654 
    655 	mutex_enter(&hp->hash_lock);
    656 	if (ncp = dnlc_search(dp, name, namlen, hash)) {
    657 		/*
    658 		 * Free up the entry
    659 		 */
    660 		nc_rmhash(ncp);
    661 		mutex_exit(&hp->hash_lock);
    662 		VN_RELE(ncp->vp);
    663 		VN_RELE(ncp->dp);
    664 		dnlc_free(ncp);
    665 		return;
    666 	}
    667 	mutex_exit(&hp->hash_lock);
    668 }
    669 
    670 /*
    671  * Purge the entire cache.
    672  */
    673 void
    674 dnlc_purge()
    675 {
    676 	nc_hash_t *nch;
    677 	ncache_t *ncp;
    678 	int index;
    679 	int i;
    680 	vnode_t *nc_rele[DNLC_MAX_RELE];
    681 
    682 	if (!doingcache)
    683 		return;
    684 
    685 	ncstats.purges++;
    686 	ncs.ncs_purge_all.value.ui64++;
    687 
    688 	for (nch = nc_hash; nch < &nc_hash[nc_hashsz]; nch++) {
    689 		index = 0;
    690 		mutex_enter(&nch->hash_lock);
    691 		ncp = nch->hash_next;
    692 		while (ncp != (ncache_t *)nch) {
    693 			ncache_t *np;
    694 
    695 			np = ncp->hash_next;
    696 			nc_rele[index++] = ncp->vp;
    697 			nc_rele[index++] = ncp->dp;
    698 
    699 			nc_rmhash(ncp);
    700 			dnlc_free(ncp);
    701 			ncp = np;
    702 			ncs.ncs_purge_total.value.ui64++;
    703 			if (index == DNLC_MAX_RELE)
    704 				break;
    705 		}
    706 		mutex_exit(&nch->hash_lock);
    707 
    708 		/* Release holds on all the vnodes now that we have no locks */
    709 		for (i = 0; i < index; i++) {
    710 			VN_RELE(nc_rele[i]);
    711 		}
    712 		if (ncp != (ncache_t *)nch) {
    713 			nch--; /* Do current hash chain again */
    714 		}
    715 	}
    716 }
    717 
    718 /*
    719  * Purge any cache entries referencing a vnode.
    720  * Exit as soon as the vnode reference count goes to 1, as the caller
    721  * must hold a reference, and the dnlc can therefore have no more.
    722  */
    723 void
    724 dnlc_purge_vp(vnode_t *vp)
    725 {
    726 	nc_hash_t *nch;
    727 	ncache_t *ncp;
    728 	int index;
    729 	vnode_t *nc_rele[DNLC_MAX_RELE];
    730 
    731 	ASSERT(vp->v_count > 0);
    732 	if (vp->v_count == 1) {
    733 		return;
    734 	}
    735 
    736 	if (!doingcache)
    737 		return;
    738 
    739 	ncstats.purges++;
    740 	ncs.ncs_purge_vp.value.ui64++;
    741 
    742 	for (nch = nc_hash; nch < &nc_hash[nc_hashsz]; nch++) {
    743 		index = 0;
    744 		mutex_enter(&nch->hash_lock);
    745 		ncp = nch->hash_next;
    746 		while (ncp != (ncache_t *)nch) {
    747 			ncache_t *np;
    748 
    749 			np = ncp->hash_next;
    750 			if (ncp->dp == vp || ncp->vp == vp) {
    751 				nc_rele[index++] = ncp->vp;
    752 				nc_rele[index++] = ncp->dp;
    753 				nc_rmhash(ncp);
    754 				dnlc_free(ncp);
    755 				ncs.ncs_purge_total.value.ui64++;
    756 				if (index == DNLC_MAX_RELE) {
    757 					ncp = np;
    758 					break;
    759 				}
    760 			}
    761 			ncp = np;
    762 		}
    763 		mutex_exit(&nch->hash_lock);
    764 
    765 		/* Release holds on all the vnodes now that we have no locks */
    766 		while (index) {
    767 			VN_RELE(nc_rele[--index]);
    768 		}
    769 
    770 		if (vp->v_count == 1) {
    771 			return; /* no more dnlc references */
    772 		}
    773 
    774 		if (ncp != (ncache_t *)nch) {
    775 			nch--; /* Do current hash chain again */
    776 		}
    777 	}
    778 }
    779 
    780 /*
    781  * Purge cache entries referencing a vfsp.  Caller supplies a count
    782  * of entries to purge; up to that many will be freed.  A count of
    783  * zero indicates that all such entries should be purged.  Returns
    784  * the number of entries that were purged.
    785  */
    786 int
    787 dnlc_purge_vfsp(vfs_t *vfsp, int count)
    788 {
    789 	nc_hash_t *nch;
    790 	ncache_t *ncp;
    791 	int n = 0;
    792 	int index;
    793 	int i;
    794 	vnode_t *nc_rele[DNLC_MAX_RELE];
    795 
    796 	if (!doingcache)
    797 		return (0);
    798 
    799 	ncstats.purges++;
    800 	ncs.ncs_purge_vfs.value.ui64++;
    801 
    802 	for (nch = nc_hash; nch < &nc_hash[nc_hashsz]; nch++) {
    803 		index = 0;
    804 		mutex_enter(&nch->hash_lock);
    805 		ncp = nch->hash_next;
    806 		while (ncp != (ncache_t *)nch) {
    807 			ncache_t *np;
    808 
    809 			np = ncp->hash_next;
    810 			ASSERT(ncp->dp != NULL);
    811 			ASSERT(ncp->vp != NULL);
    812 			if ((ncp->dp->v_vfsp == vfsp) ||
    813 			    (ncp->vp->v_vfsp == vfsp)) {
    814 				n++;
    815 				nc_rele[index++] = ncp->vp;
    816 				nc_rele[index++] = ncp->dp;
    817 				nc_rmhash(ncp);
    818 				dnlc_free(ncp);
    819 				ncs.ncs_purge_total.value.ui64++;
    820 				if (index == DNLC_MAX_RELE) {
    821 					ncp = np;
    822 					break;
    823 				}
    824 				if (count != 0 && n >= count) {
    825 					break;
    826 				}
    827 			}
    828 			ncp = np;
    829 		}
    830 		mutex_exit(&nch->hash_lock);
    831 		/* Release holds on all the vnodes now that we have no locks */
    832 		for (i = 0; i < index; i++) {
    833 			VN_RELE(nc_rele[i]);
    834 		}
    835 		if (count != 0 && n >= count) {
    836 			return (n);
    837 		}
    838 		if (ncp != (ncache_t *)nch) {
    839 			nch--; /* Do current hash chain again */
    840 		}
    841 	}
    842 	return (n);
    843 }
    844 
    845 /*
    846  * Purge 1 entry from the dnlc that is part of the filesystem(s)
    847  * represented by 'vop'. The purpose of this routine is to allow
    848  * users of the dnlc to free a vnode that is being held by the dnlc.
    849  *
    850  * If we find a vnode that we release which will result in
    851  * freeing the underlying vnode (count was 1), return 1, 0
    852  * if no appropriate vnodes found.
    853  *
    854  * Note, vop is not the 'right' identifier for a filesystem.
    855  */
    856 int
    857 dnlc_fs_purge1(vnodeops_t *vop)
    858 {
    859 	nc_hash_t *end;
    860 	nc_hash_t *hp;
    861 	ncache_t *ncp;
    862 	vnode_t *vp;
    863 
    864 	if (!doingcache)
    865 		return (0);
    866 
    867 	ncs.ncs_purge_fs1.value.ui64++;
    868 
    869 	/*
    870 	 * Scan the dnlc entries looking for a likely candidate.
    871 	 */
    872 	hp = end = dnlc_purge_fs1_rotor;
    873 
    874 	do {
    875 		if (++hp == &nc_hash[nc_hashsz])
    876 			hp = nc_hash;
    877 		dnlc_purge_fs1_rotor = hp;
    878 		if (hp->hash_next == (ncache_t *)hp)
    879 			continue;
    880 		mutex_enter(&hp->hash_lock);
    881 		for (ncp = hp->hash_prev;
    882 		    ncp != (ncache_t *)hp;
    883 		    ncp = ncp->hash_prev) {
    884 			vp = ncp->vp;
    885 			if (!vn_has_cached_data(vp) && (vp->v_count == 1) &&
    886 			    vn_matchops(vp, vop))
    887 				break;
    888 		}
    889 		if (ncp != (ncache_t *)hp) {
    890 			nc_rmhash(ncp);
    891 			mutex_exit(&hp->hash_lock);
    892 			VN_RELE(ncp->dp);
    893 			VN_RELE(vp)
    894 			dnlc_free(ncp);
    895 			ncs.ncs_purge_total.value.ui64++;
    896 			return (1);
    897 		}
    898 		mutex_exit(&hp->hash_lock);
    899 	} while (hp != end);
    900 	return (0);
    901 }
    902 
    903 /*
    904  * Perform a reverse lookup in the DNLC.  This will find the first occurrence of
    905  * the vnode.  If successful, it will return the vnode of the parent, and the
    906  * name of the entry in the given buffer.  If it cannot be found, or the buffer
    907  * is too small, then it will return NULL.  Note that this is a highly
    908  * inefficient function, since the DNLC is constructed solely for forward
    909  * lookups.
    910  */
    911 vnode_t *
    912 dnlc_reverse_lookup(vnode_t *vp, char *buf, size_t buflen)
    913 {
    914 	nc_hash_t *nch;
    915 	ncache_t *ncp;
    916 	vnode_t *pvp;
    917 
    918 	if (!doingcache)
    919 		return (NULL);
    920 
    921 	for (nch = nc_hash; nch < &nc_hash[nc_hashsz]; nch++) {
    922 		mutex_enter(&nch->hash_lock);
    923 		ncp = nch->hash_next;
    924 		while (ncp != (ncache_t *)nch) {
    925 			/*
    926 			 * We ignore '..' entries since it can create
    927 			 * confusion and infinite loops.
    928 			 */
    929 			if (ncp->vp == vp && !(ncp->namlen == 2 &&
    930 			    0 == bcmp(ncp->name, "..", 2)) &&
    931 			    ncp->namlen < buflen) {
    932 				bcopy(ncp->name, buf, ncp->namlen);
    933 				buf[ncp->namlen] = '\0';
    934 				pvp = ncp->dp;
    935 				VN_HOLD(pvp);
    936 				mutex_exit(&nch->hash_lock);
    937 				return (pvp);
    938 			}
    939 			ncp = ncp->hash_next;
    940 		}
    941 		mutex_exit(&nch->hash_lock);
    942 	}
    943 
    944 	return (NULL);
    945 }
    946 /*
    947  * Utility routine to search for a cache entry. Return the
    948  * ncache entry if found, NULL otherwise.
    949  */
    950 static ncache_t *
    951 dnlc_search(vnode_t *dp, char *name, uchar_t namlen, int hash)
    952 {
    953 	nc_hash_t *hp;
    954 	ncache_t *ncp;
    955 
    956 	hp = &nc_hash[hash & nc_hashmask];
    957 
    958 	for (ncp = hp->hash_next; ncp != (ncache_t *)hp; ncp = ncp->hash_next) {
    959 		if (ncp->hash == hash &&
    960 		    ncp->dp == dp &&
    961 		    ncp->namlen == namlen &&
    962 		    bcmp(ncp->name, name, namlen) == 0)
    963 			return (ncp);
    964 	}
    965 	return (NULL);
    966 }
    967 
    968 #if ((1 << NBBY) - 1) < (MAXNAMELEN - 1)
    969 #error ncache_t name length representation is too small
    970 #endif
    971 
    972 void
    973 dnlc_reduce_cache(void *reduce_percent)
    974 {
    975 	if (dnlc_reduce_idle && (dnlc_nentries >= ncsize || reduce_percent)) {
    976 		dnlc_reduce_idle = 0;
    977 		if ((taskq_dispatch(system_taskq, do_dnlc_reduce_cache,
    978 		    reduce_percent, TQ_NOSLEEP)) == NULL)
    979 			dnlc_reduce_idle = 1;
    980 	}
    981 }
    982 
    983 /*
    984  * Get a new name cache entry.
    985  * If the dnlc_reduce_cache() taskq isn't keeping up with demand, or memory
    986  * is short then just return NULL. If we're over ncsize then kick off a
    987  * thread to free some in use entries down to dnlc_nentries_low_water.
    988  * Caller must initialise all fields except namlen.
    989  * Component names are defined to be less than MAXNAMELEN
    990  * which includes a null.
    991  */
    992 static ncache_t *
    993 dnlc_get(uchar_t namlen)
    994 {
    995 	ncache_t *ncp;
    996 
    997 	if (dnlc_nentries > dnlc_max_nentries) {
    998 		dnlc_max_nentries_cnt++; /* keep a statistic */
    999 		return (NULL);
   1000 	}
   1001 	ncp = kmem_alloc(sizeof (ncache_t) + namlen, KM_NOSLEEP);
   1002 	if (ncp == NULL) {
   1003 		return (NULL);
   1004 	}
   1005 	ncp->namlen = namlen;
   1006 	atomic_add_32(&dnlc_nentries, 1);
   1007 	dnlc_reduce_cache(NULL);
   1008 	return (ncp);
   1009 }
   1010 
   1011 /*
   1012  * Taskq routine to free up name cache entries to reduce the
   1013  * cache size to the low water mark if "reduce_percent" is not provided.
   1014  * If "reduce_percent" is provided, reduce cache size by
   1015  * (ncsize_onepercent * reduce_percent).
   1016  */
   1017 /*ARGSUSED*/
   1018 static void
   1019 do_dnlc_reduce_cache(void *reduce_percent)
   1020 {
   1021 	nc_hash_t *hp = dnlc_free_rotor, *start_hp = hp;
   1022 	vnode_t *vp;
   1023 	ncache_t *ncp;
   1024 	int cnt;
   1025 	uint_t low_water = dnlc_nentries_low_water;
   1026 
   1027 	if (reduce_percent) {
   1028 		uint_t reduce_cnt;
   1029 
   1030 		/*
   1031 		 * Never try to reduce the current number
   1032 		 * of cache entries below 3% of ncsize.
   1033 		 */
   1034 		if (dnlc_nentries <= ncsize_min_percent) {
   1035 			dnlc_reduce_idle = 1;
   1036 			return;
   1037 		}
   1038 		reduce_cnt = ncsize_onepercent *
   1039 		    (uint_t)(uintptr_t)reduce_percent;
   1040 
   1041 		if (reduce_cnt > dnlc_nentries ||
   1042 		    dnlc_nentries - reduce_cnt < ncsize_min_percent)
   1043 			low_water = ncsize_min_percent;
   1044 		else
   1045 			low_water = dnlc_nentries - reduce_cnt;
   1046 	}
   1047 
   1048 	do {
   1049 		/*
   1050 		 * Find the first non empty hash queue without locking.
   1051 		 * Only look at each hash queue once to avoid an infinite loop.
   1052 		 */
   1053 		do {
   1054 			if (++hp == &nc_hash[nc_hashsz])
   1055 				hp = nc_hash;
   1056 		} while (hp->hash_next == (ncache_t *)hp && hp != start_hp);
   1057 
   1058 		/* return if all hash queues are empty. */
   1059 		if (hp->hash_next == (ncache_t *)hp) {
   1060 			dnlc_reduce_idle = 1;
   1061 			return;
   1062 		}
   1063 
   1064 		mutex_enter(&hp->hash_lock);
   1065 		for (cnt = 0, ncp = hp->hash_prev; ncp != (ncache_t *)hp;
   1066 		    ncp = ncp->hash_prev, cnt++) {
   1067 			vp = ncp->vp;
   1068 			/*
   1069 			 * A name cache entry with a reference count
   1070 			 * of one is only referenced by the dnlc.
   1071 			 * Also negative cache entries are purged first.
   1072 			 */
   1073 			if (!vn_has_cached_data(vp) &&
   1074 			    ((vp->v_count == 1) || (vp == DNLC_NO_VNODE))) {
   1075 				ncs.ncs_pick_heur.value.ui64++;
   1076 				goto found;
   1077 			}
   1078 			/*
   1079 			 * Remove from the end of the chain if the
   1080 			 * chain is too long
   1081 			 */
   1082 			if (cnt > dnlc_long_chain) {
   1083 				ncp = hp->hash_prev;
   1084 				ncs.ncs_pick_last.value.ui64++;
   1085 				vp = ncp->vp;
   1086 				goto found;
   1087 			}
   1088 		}
   1089 		/* check for race and continue */
   1090 		if (hp->hash_next == (ncache_t *)hp) {
   1091 			mutex_exit(&hp->hash_lock);
   1092 			continue;
   1093 		}
   1094 
   1095 		ncp = hp->hash_prev; /* pick the last one in the hash queue */
   1096 		ncs.ncs_pick_last.value.ui64++;
   1097 		vp = ncp->vp;
   1098 found:
   1099 		/*
   1100 		 * Remove from hash chain.
   1101 		 */
   1102 		nc_rmhash(ncp);
   1103 		mutex_exit(&hp->hash_lock);
   1104 		VN_RELE(vp);
   1105 		VN_RELE(ncp->dp);
   1106 		dnlc_free(ncp);
   1107 	} while (dnlc_nentries > low_water);
   1108 
   1109 	dnlc_free_rotor = hp;
   1110 	dnlc_reduce_idle = 1;
   1111 }
   1112 
   1113 /*
   1114  * Directory caching routines
   1115  * ==========================
   1116  *
   1117  * See dnlc.h for details of the interfaces below.
   1118  */
   1119 
   1120 /*
   1121  * Lookup up an entry in a complete or partial directory cache.
   1122  */
   1123 dcret_t
   1124 dnlc_dir_lookup(dcanchor_t *dcap, char *name, uint64_t *handle)
   1125 {
   1126 	dircache_t *dcp;
   1127 	dcentry_t *dep;
   1128 	int hash;
   1129 	int ret;
   1130 	uchar_t namlen;
   1131 
   1132 	/*
   1133 	 * can test without lock as we are only a cache
   1134 	 */
   1135 	if (!VALID_DIR_CACHE(dcap->dca_dircache)) {
   1136 		ncs.ncs_dir_misses.value.ui64++;
   1137 		return (DNOCACHE);
   1138 	}
   1139 
   1140 	if (!dnlc_dir_enable) {
   1141 		return (DNOCACHE);
   1142 	}
   1143 
   1144 	mutex_enter(&dcap->dca_lock);
   1145 	dcp = (dircache_t *)dcap->dca_dircache;
   1146 	if (VALID_DIR_CACHE(dcp)) {
   1147 		dcp->dc_actime = lbolt64;
   1148 		DNLC_DIR_HASH(name, hash, namlen);
   1149 		dep = dcp->dc_namehash[hash & dcp->dc_nhash_mask];
   1150 		while (dep != NULL) {
   1151 			if ((dep->de_hash == hash) &&
   1152 			    (namlen == dep->de_namelen) &&
   1153 			    bcmp(dep->de_name, name, namlen) == 0) {
   1154 				*handle = dep->de_handle;
   1155 				mutex_exit(&dcap->dca_lock);
   1156 				ncs.ncs_dir_hits.value.ui64++;
   1157 				return (DFOUND);
   1158 			}
   1159 			dep = dep->de_next;
   1160 		}
   1161 		if (dcp->dc_complete) {
   1162 			ret = DNOENT;
   1163 		} else {
   1164 			ret = DNOCACHE;
   1165 		}
   1166 		mutex_exit(&dcap->dca_lock);
   1167 		return (ret);
   1168 	} else {
   1169 		mutex_exit(&dcap->dca_lock);
   1170 		ncs.ncs_dir_misses.value.ui64++;
   1171 		return (DNOCACHE);
   1172 	}
   1173 }
   1174 
   1175 /*
   1176  * Start a new directory cache. An estimate of the number of
   1177  * entries is provided to as a quick check to ensure the directory
   1178  * is cacheable.
   1179  */
   1180 dcret_t
   1181 dnlc_dir_start(dcanchor_t *dcap, uint_t num_entries)
   1182 {
   1183 	dircache_t *dcp;
   1184 
   1185 	if (!dnlc_dir_enable ||
   1186 	    (num_entries < dnlc_dir_min_size)) {
   1187 		return (DNOCACHE);
   1188 	}
   1189 
   1190 	if (num_entries > dnlc_dir_max_size) {
   1191 		return (DTOOBIG);
   1192 	}
   1193 
   1194 	mutex_enter(&dc_head.dch_lock);
   1195 	mutex_enter(&dcap->dca_lock);
   1196 
   1197 	if (dcap->dca_dircache == DC_RET_LOW_MEM) {
   1198 		dcap->dca_dircache = NULL;
   1199 		mutex_exit(&dcap->dca_lock);
   1200 		mutex_exit(&dc_head.dch_lock);
   1201 		return (DNOMEM);
   1202 	}
   1203 
   1204 	/*
   1205 	 * Check if there's currently a cache.
   1206 	 * This probably only occurs on a race.
   1207 	 */
   1208 	if (dcap->dca_dircache != NULL) {
   1209 		mutex_exit(&dcap->dca_lock);
   1210 		mutex_exit(&dc_head.dch_lock);
   1211 		return (DNOCACHE);
   1212 	}
   1213 
   1214 	/*
   1215 	 * Allocate the dircache struct, entry and free space hash tables.
   1216 	 * These tables are initially just one entry but dynamically resize
   1217 	 * when entries and free space are added or removed.
   1218 	 */
   1219 	if ((dcp = kmem_zalloc(sizeof (dircache_t), KM_NOSLEEP)) == NULL) {
   1220 		goto error;
   1221 	}
   1222 	if ((dcp->dc_namehash = kmem_zalloc(sizeof (dcentry_t *),
   1223 	    KM_NOSLEEP)) == NULL) {
   1224 		goto error;
   1225 	}
   1226 	if ((dcp->dc_freehash = kmem_zalloc(sizeof (dcfree_t *),
   1227 	    KM_NOSLEEP)) == NULL) {
   1228 		goto error;
   1229 	}
   1230 
   1231 	dcp->dc_anchor = dcap; /* set back pointer to anchor */
   1232 	dcap->dca_dircache = dcp;
   1233 
   1234 	/* add into head of global chain */
   1235 	dcp->dc_next = dc_head.dch_next;
   1236 	dcp->dc_prev = (dircache_t *)&dc_head;
   1237 	dcp->dc_next->dc_prev = dcp;
   1238 	dc_head.dch_next = dcp;
   1239 
   1240 	mutex_exit(&dcap->dca_lock);
   1241 	mutex_exit(&dc_head.dch_lock);
   1242 	ncs.ncs_cur_dirs.value.ui64++;
   1243 	ncs.ncs_dirs_cached.value.ui64++;
   1244 	return (DOK);
   1245 error:
   1246 	if (dcp != NULL) {
   1247 		if (dcp->dc_namehash) {
   1248 			kmem_free(dcp->dc_namehash, sizeof (dcentry_t *));
   1249 		}
   1250 		kmem_free(dcp, sizeof (dircache_t));
   1251 	}
   1252 	/*
   1253 	 * Must also kmem_free dcp->dc_freehash if more error cases are added
   1254 	 */
   1255 	mutex_exit(&dcap->dca_lock);
   1256 	mutex_exit(&dc_head.dch_lock);
   1257 	ncs.ncs_dir_start_nm.value.ui64++;
   1258 	return (DNOCACHE);
   1259 }
   1260 
   1261 /*
   1262  * Add a directopry entry to a partial or complete directory cache.
   1263  */
   1264 dcret_t
   1265 dnlc_dir_add_entry(dcanchor_t *dcap, char *name, uint64_t handle)
   1266 {
   1267 	dircache_t *dcp;
   1268 	dcentry_t **hp, *dep;
   1269 	int hash;
   1270 	uint_t capacity;
   1271 	uchar_t namlen;
   1272 
   1273 	/*
   1274 	 * Allocate the dcentry struct, including the variable
   1275 	 * size name. Note, the null terminator is not copied.
   1276 	 *
   1277 	 * We do this outside the lock to avoid possible deadlock if
   1278 	 * dnlc_dir_reclaim() is called as a result of memory shortage.
   1279 	 */
   1280 	DNLC_DIR_HASH(name, hash, namlen);
   1281 	dep = kmem_alloc(sizeof (dcentry_t) - 1 + namlen, KM_NOSLEEP);
   1282 	if (dep == NULL) {
   1283 #ifdef DEBUG
   1284 		/*
   1285 		 * The kmem allocator generates random failures for
   1286 		 * KM_NOSLEEP calls (see KMEM_RANDOM_ALLOCATION_FAILURE)
   1287 		 * So try again before we blow away a perfectly good cache.
   1288 		 * This is done not to cover an error but purely for
   1289 		 * performance running a debug kernel.
   1290 		 * This random error only occurs in debug mode.
   1291 		 */
   1292 		dep = kmem_alloc(sizeof (dcentry_t) - 1 + namlen, KM_NOSLEEP);
   1293 		if (dep != NULL)
   1294 			goto ok;
   1295 #endif
   1296 		ncs.ncs_dir_add_nm.value.ui64++;
   1297 		/*
   1298 		 * Free a directory cache. This may be the one we are
   1299 		 * called with.
   1300 		 */
   1301 		dnlc_dir_reclaim(NULL);
   1302 		dep = kmem_alloc(sizeof (dcentry_t) - 1 + namlen, KM_NOSLEEP);
   1303 		if (dep == NULL) {
   1304 			/*
   1305 			 * still no memory, better delete this cache
   1306 			 */
   1307 			mutex_enter(&dcap->dca_lock);
   1308 			dcp = (dircache_t *)dcap->dca_dircache;
   1309 			if (VALID_DIR_CACHE(dcp)) {
   1310 				dnlc_dir_abort(dcp);
   1311 				dcap->dca_dircache = DC_RET_LOW_MEM;
   1312 			}
   1313 			mutex_exit(&dcap->dca_lock);
   1314 			ncs.ncs_dir_addabort.value.ui64++;
   1315 			return (DNOCACHE);
   1316 		}
   1317 		/*
   1318 		 * fall through as if the 1st kmem_alloc had worked
   1319 		 */
   1320 	}
   1321 #ifdef DEBUG
   1322 ok:
   1323 #endif
   1324 	mutex_enter(&dcap->dca_lock);
   1325 	dcp = (dircache_t *)dcap->dca_dircache;
   1326 	if (VALID_DIR_CACHE(dcp)) {
   1327 		/*
   1328 		 * If the total number of entries goes above the max
   1329 		 * then free this cache
   1330 		 */
   1331 		if ((dcp->dc_num_entries + dcp->dc_num_free) >
   1332 			dnlc_dir_max_size) {
   1333 			mutex_exit(&dcap->dca_lock);
   1334 			dnlc_dir_purge(dcap);
   1335 			kmem_free(dep, sizeof (dcentry_t) - 1 + namlen);
   1336 			ncs.ncs_dir_add_max.value.ui64++;
   1337 			return (DTOOBIG);
   1338 		}
   1339 		dcp->dc_num_entries++;
   1340 		capacity = (dcp->dc_nhash_mask + 1) << dnlc_dir_hash_size_shift;
   1341 		if (dcp->dc_num_entries >=
   1342 		    (capacity << dnlc_dir_hash_resize_shift)) {
   1343 			dnlc_dir_adjust_nhash(dcp);
   1344 		}
   1345 		hp = &dcp->dc_namehash[hash & dcp->dc_nhash_mask];
   1346 
   1347 		/*
   1348 		 * Initialise and chain in new entry
   1349 		 */
   1350 		dep->de_handle = handle;
   1351 		dep->de_hash = hash;
   1352 		/*
   1353 		 * Note de_namelen is a uchar_t to conserve space
   1354 		 * and alignment padding. The max length of any
   1355 		 * pathname component is defined as MAXNAMELEN
   1356 		 * which is 256 (including the terminating null).
   1357 		 * So provided this doesn't change, we don't include the null,
   1358 		 * we always use bcmp to compare strings, and we don't
   1359 		 * start storing full names, then we are ok.
   1360 		 * The space savings is worth it.
   1361 		 */
   1362 		dep->de_namelen = namlen;
   1363 		bcopy(name, dep->de_name, namlen);
   1364 		dep->de_next = *hp;
   1365 		*hp = dep;
   1366 		dcp->dc_actime = lbolt64;
   1367 		mutex_exit(&dcap->dca_lock);
   1368 		ncs.ncs_dir_num_ents.value.ui64++;
   1369 		return (DOK);
   1370 	} else {
   1371 		mutex_exit(&dcap->dca_lock);
   1372 		kmem_free(dep, sizeof (dcentry_t) - 1 + namlen);
   1373 		return (DNOCACHE);
   1374 	}
   1375 }
   1376 
   1377 /*
   1378  * Add free space to a partial or complete directory cache.
   1379  */
   1380 dcret_t
   1381 dnlc_dir_add_space(dcanchor_t *dcap, uint_t len, uint64_t handle)
   1382 {
   1383 	dircache_t *dcp;
   1384 	dcfree_t *dfp, **hp;
   1385 	uint_t capacity;
   1386 
   1387 	/*
   1388 	 * We kmem_alloc outside the lock to avoid possible deadlock if
   1389 	 * dnlc_dir_reclaim() is called as a result of memory shortage.
   1390 	 */
   1391 	dfp = kmem_cache_alloc(dnlc_dir_space_cache, KM_NOSLEEP);
   1392 	if (dfp == NULL) {
   1393 #ifdef DEBUG
   1394 		/*
   1395 		 * The kmem allocator generates random failures for
   1396 		 * KM_NOSLEEP calls (see KMEM_RANDOM_ALLOCATION_FAILURE)
   1397 		 * So try again before we blow away a perfectly good cache.
   1398 		 * This random error only occurs in debug mode
   1399 		 */
   1400 		dfp = kmem_cache_alloc(dnlc_dir_space_cache, KM_NOSLEEP);
   1401 		if (dfp != NULL)
   1402 			goto ok;
   1403 #endif
   1404 		ncs.ncs_dir_add_nm.value.ui64++;
   1405 		/*
   1406 		 * Free a directory cache. This may be the one we are
   1407 		 * called with.
   1408 		 */
   1409 		dnlc_dir_reclaim(NULL);
   1410 		dfp = kmem_cache_alloc(dnlc_dir_space_cache, KM_NOSLEEP);
   1411 		if (dfp == NULL) {
   1412 			/*
   1413 			 * still no memory, better delete this cache
   1414 			 */
   1415 			mutex_enter(&dcap->dca_lock);
   1416 			dcp = (dircache_t *)dcap->dca_dircache;
   1417 			if (VALID_DIR_CACHE(dcp)) {
   1418 				dnlc_dir_abort(dcp);
   1419 				dcap->dca_dircache = DC_RET_LOW_MEM;
   1420 			}
   1421 			mutex_exit(&dcap->dca_lock);
   1422 			ncs.ncs_dir_addabort.value.ui64++;
   1423 			return (DNOCACHE);
   1424 		}
   1425 		/*
   1426 		 * fall through as if the 1st kmem_alloc had worked
   1427 		 */
   1428 	}
   1429 
   1430 #ifdef DEBUG
   1431 ok:
   1432 #endif
   1433 	mutex_enter(&dcap->dca_lock);
   1434 	dcp = (dircache_t *)dcap->dca_dircache;
   1435 	if (VALID_DIR_CACHE(dcp)) {
   1436 		if ((dcp->dc_num_entries + dcp->dc_num_free) >
   1437 			dnlc_dir_max_size) {
   1438 			mutex_exit(&dcap->dca_lock);
   1439 			dnlc_dir_purge(dcap);
   1440 			kmem_cache_free(dnlc_dir_space_cache, dfp);
   1441 			ncs.ncs_dir_add_max.value.ui64++;
   1442 			return (DTOOBIG);
   1443 		}
   1444 		dcp->dc_num_free++;
   1445 		capacity = (dcp->dc_fhash_mask + 1) << dnlc_dir_hash_size_shift;
   1446 		if (dcp->dc_num_free >=
   1447 		    (capacity << dnlc_dir_hash_resize_shift)) {
   1448 			dnlc_dir_adjust_fhash(dcp);
   1449 		}
   1450 		/*
   1451 		 * Initialise and chain a new entry
   1452 		 */
   1453 		dfp->df_handle = handle;
   1454 		dfp->df_len = len;
   1455 		dcp->dc_actime = lbolt64;
   1456 		hp = &(dcp->dc_freehash[DDFHASH(handle, dcp)]);
   1457 		dfp->df_next = *hp;
   1458 		*hp = dfp;
   1459 		mutex_exit(&dcap->dca_lock);
   1460 		ncs.ncs_dir_num_ents.value.ui64++;
   1461 		return (DOK);
   1462 	} else {
   1463 		mutex_exit(&dcap->dca_lock);
   1464 		kmem_cache_free(dnlc_dir_space_cache, dfp);
   1465 		return (DNOCACHE);
   1466 	}
   1467 }
   1468 
   1469 /*
   1470  * Mark a directory cache as complete.
   1471  */
   1472 void
   1473 dnlc_dir_complete(dcanchor_t *dcap)
   1474 {
   1475 	dircache_t *dcp;
   1476 
   1477 	mutex_enter(&dcap->dca_lock);
   1478 	dcp = (dircache_t *)dcap->dca_dircache;
   1479 	if (VALID_DIR_CACHE(dcp)) {
   1480 		dcp->dc_complete = B_TRUE;
   1481 	}
   1482 	mutex_exit(&dcap->dca_lock);
   1483 }
   1484 
   1485 /*
   1486  * Internal routine to delete a partial or full directory cache.
   1487  * No additional locking needed.
   1488  */
   1489 static void
   1490 dnlc_dir_abort(dircache_t *dcp)
   1491 {
   1492 	dcentry_t *dep, *nhp;
   1493 	dcfree_t *fep, *fhp;
   1494 	uint_t nhtsize = dcp->dc_nhash_mask + 1; /* name hash table size */
   1495 	uint_t fhtsize = dcp->dc_fhash_mask + 1; /* free hash table size */
   1496 	uint_t i;
   1497 
   1498 	/*
   1499 	 * Free up the cached name entries and hash table
   1500 	 */
   1501 	for (i = 0; i < nhtsize; i++) { /* for each hash bucket */
   1502 		nhp = dcp->dc_namehash[i];
   1503 		while (nhp != NULL) { /* for each chained entry */
   1504 			dep = nhp->de_next;
   1505 			kmem_free(nhp, sizeof (dcentry_t) - 1 +
   1506 			    nhp->de_namelen);
   1507 			nhp = dep;
   1508 		}
   1509 	}
   1510 	kmem_free(dcp->dc_namehash, sizeof (dcentry_t *) * nhtsize);
   1511 
   1512 	/*
   1513 	 * Free up the free space entries and hash table
   1514 	 */
   1515 	for (i = 0; i < fhtsize; i++) { /* for each hash bucket */
   1516 		fhp = dcp->dc_freehash[i];
   1517 		while (fhp != NULL) { /* for each chained entry */
   1518 			fep = fhp->df_next;
   1519 			kmem_cache_free(dnlc_dir_space_cache, fhp);
   1520 			fhp = fep;
   1521 		}
   1522 	}
   1523 	kmem_free(dcp->dc_freehash, sizeof (dcfree_t *) * fhtsize);
   1524 
   1525 	/*
   1526 	 * Finally free the directory cache structure itself
   1527 	 */
   1528 	ncs.ncs_dir_num_ents.value.ui64 -= (dcp->dc_num_entries +
   1529 	    dcp->dc_num_free);
   1530 	kmem_free(dcp, sizeof (dircache_t));
   1531 	ncs.ncs_cur_dirs.value.ui64--;
   1532 }
   1533 
   1534 /*
   1535  * Remove a partial or complete directory cache
   1536  */
   1537 void
   1538 dnlc_dir_purge(dcanchor_t *dcap)
   1539 {
   1540 	dircache_t *dcp;
   1541 
   1542 	mutex_enter(&dc_head.dch_lock);
   1543 	mutex_enter(&dcap->dca_lock);
   1544 	dcp = (dircache_t *)dcap->dca_dircache;
   1545 	if (!VALID_DIR_CACHE(dcp)) {
   1546 		mutex_exit(&dcap->dca_lock);
   1547 		mutex_exit(&dc_head.dch_lock);
   1548 		return;
   1549 	}
   1550 	dcap->dca_dircache = NULL;
   1551 	/*
   1552 	 * Unchain from global list
   1553 	 */
   1554 	dcp->dc_prev->dc_next = dcp->dc_next;
   1555 	dcp->dc_next->dc_prev = dcp->dc_prev;
   1556 	mutex_exit(&dcap->dca_lock);
   1557 	mutex_exit(&dc_head.dch_lock);
   1558 	dnlc_dir_abort(dcp);
   1559 }
   1560 
   1561 /*
   1562  * Remove an entry from a complete or partial directory cache.
   1563  * Return the handle if it's non null.
   1564  */
   1565 dcret_t
   1566 dnlc_dir_rem_entry(dcanchor_t *dcap, char *name, uint64_t *handlep)
   1567 {
   1568 	dircache_t *dcp;
   1569 	dcentry_t **prevpp, *te;
   1570 	uint_t capacity;
   1571 	int hash;
   1572 	int ret;
   1573 	uchar_t namlen;
   1574 
   1575 	if (!dnlc_dir_enable) {
   1576 		return (DNOCACHE);
   1577 	}
   1578 
   1579 	mutex_enter(&dcap->dca_lock);
   1580 	dcp = (dircache_t *)dcap->dca_dircache;
   1581 	if (VALID_DIR_CACHE(dcp)) {
   1582 		dcp->dc_actime = lbolt64;
   1583 		if (dcp->dc_nhash_mask > 0) { /* ie not minimum */
   1584 			capacity = (dcp->dc_nhash_mask + 1) <<
   1585 			    dnlc_dir_hash_size_shift;
   1586 			if (dcp->dc_num_entries <=
   1587 			    (capacity >> dnlc_dir_hash_resize_shift)) {
   1588 				dnlc_dir_adjust_nhash(dcp);
   1589 			}
   1590 		}
   1591 		DNLC_DIR_HASH(name, hash, namlen);
   1592 		prevpp = &dcp->dc_namehash[hash & dcp->dc_nhash_mask];
   1593 		while (*prevpp != NULL) {
   1594 			if (((*prevpp)->de_hash == hash) &&
   1595 			    (namlen == (*prevpp)->de_namelen) &&
   1596 			    bcmp((*prevpp)->de_name, name, namlen) == 0) {
   1597 				if (handlep != NULL) {
   1598 					*handlep = (*prevpp)->de_handle;
   1599 				}
   1600 				te = *prevpp;
   1601 				*prevpp = (*prevpp)->de_next;
   1602 				kmem_free(te, sizeof (dcentry_t) - 1 +
   1603 				    te->de_namelen);
   1604 
   1605 				/*
   1606 				 * If the total number of entries
   1607 				 * falls below half the minimum number
   1608 				 * of entries then free this cache.
   1609 				 */
   1610 				if (--dcp->dc_num_entries <
   1611 				    (dnlc_dir_min_size >> 1)) {
   1612 					mutex_exit(&dcap->dca_lock);
   1613 					dnlc_dir_purge(dcap);
   1614 				} else {
   1615 					mutex_exit(&dcap->dca_lock);
   1616 				}
   1617 				ncs.ncs_dir_num_ents.value.ui64--;
   1618 				return (DFOUND);
   1619 			}
   1620 			prevpp = &((*prevpp)->de_next);
   1621 		}
   1622 		if (dcp->dc_complete) {
   1623 			ncs.ncs_dir_reme_fai.value.ui64++;
   1624 			ret = DNOENT;
   1625 		} else {
   1626 			ret = DNOCACHE;
   1627 		}
   1628 		mutex_exit(&dcap->dca_lock);
   1629 		return (ret);
   1630 	} else {
   1631 		mutex_exit(&dcap->dca_lock);
   1632 		return (DNOCACHE);
   1633 	}
   1634 }
   1635 
   1636 
   1637 /*
   1638  * Remove free space of at least the given length from a complete
   1639  * or partial directory cache.
   1640  */
   1641 dcret_t
   1642 dnlc_dir_rem_space_by_len(dcanchor_t *dcap, uint_t len, uint64_t *handlep)
   1643 {
   1644 	dircache_t *dcp;
   1645 	dcfree_t **prevpp, *tfp;
   1646 	uint_t fhtsize; /* free hash table size */
   1647 	uint_t i;
   1648 	uint_t capacity;
   1649 	int ret;
   1650 
   1651 	if (!dnlc_dir_enable) {
   1652 		return (DNOCACHE);
   1653 	}
   1654 
   1655 	mutex_enter(&dcap->dca_lock);
   1656 	dcp = (dircache_t *)dcap->dca_dircache;
   1657 	if (VALID_DIR_CACHE(dcp)) {
   1658 		dcp->dc_actime = lbolt64;
   1659 		if (dcp->dc_fhash_mask > 0) { /* ie not minimum */
   1660 			capacity = (dcp->dc_fhash_mask + 1) <<
   1661 			    dnlc_dir_hash_size_shift;
   1662 			if (dcp->dc_num_free <=
   1663 			    (capacity >> dnlc_dir_hash_resize_shift)) {
   1664 				dnlc_dir_adjust_fhash(dcp);
   1665 			}
   1666 		}
   1667 		/*
   1668 		 * Search for an entry of the appropriate size
   1669 		 * on a first fit basis.
   1670 		 */
   1671 		fhtsize = dcp->dc_fhash_mask + 1;
   1672 		for (i = 0; i < fhtsize; i++) { /* for each hash bucket */
   1673 			prevpp = &(dcp->dc_freehash[i]);
   1674 			while (*prevpp != NULL) {
   1675 				if ((*prevpp)->df_len >= len) {
   1676 					*handlep = (*prevpp)->df_handle;
   1677 					tfp = *prevpp;
   1678 					*prevpp = (*prevpp)->df_next;
   1679 					dcp->dc_num_free--;
   1680 					mutex_exit(&dcap->dca_lock);
   1681 					kmem_cache_free(dnlc_dir_space_cache,
   1682 					    tfp);
   1683 					ncs.ncs_dir_num_ents.value.ui64--;
   1684 					return (DFOUND);
   1685 				}
   1686 				prevpp = &((*prevpp)->df_next);
   1687 			}
   1688 		}
   1689 		if (dcp->dc_complete) {
   1690 			ret = DNOENT;
   1691 		} else {
   1692 			ret = DNOCACHE;
   1693 		}
   1694 		mutex_exit(&dcap->dca_lock);
   1695 		return (ret);
   1696 	} else {
   1697 		mutex_exit(&dcap->dca_lock);
   1698 		return (DNOCACHE);
   1699 	}
   1700 }
   1701 
   1702 /*
   1703  * Remove free space with the given handle from a complete or partial
   1704  * directory cache.
   1705  */
   1706 dcret_t
   1707 dnlc_dir_rem_space_by_handle(dcanchor_t *dcap, uint64_t handle)
   1708 {
   1709 	dircache_t *dcp;
   1710 	dcfree_t **prevpp, *tfp;
   1711 	uint_t capacity;
   1712 	int ret;
   1713 
   1714 	if (!dnlc_dir_enable) {
   1715 		return (DNOCACHE);
   1716 	}
   1717 
   1718 	mutex_enter(&dcap->dca_lock);
   1719 	dcp = (dircache_t *)dcap->dca_dircache;
   1720 	if (VALID_DIR_CACHE(dcp)) {
   1721 		dcp->dc_actime = lbolt64;
   1722 		if (dcp->dc_fhash_mask > 0) { /* ie not minimum */
   1723 			capacity = (dcp->dc_fhash_mask + 1) <<
   1724 			    dnlc_dir_hash_size_shift;
   1725 			if (dcp->dc_num_free <=
   1726 			    (capacity >> dnlc_dir_hash_resize_shift)) {
   1727 				dnlc_dir_adjust_fhash(dcp);
   1728 			}
   1729 		}
   1730 
   1731 		/*
   1732 		 * search for the exact entry
   1733 		 */
   1734 		prevpp = &(dcp->dc_freehash[DDFHASH(handle, dcp)]);
   1735 		while (*prevpp != NULL) {
   1736 			if ((*prevpp)->df_handle == handle) {
   1737 				tfp = *prevpp;
   1738 				*prevpp = (*prevpp)->df_next;
   1739 				dcp->dc_num_free--;
   1740 				mutex_exit(&dcap->dca_lock);
   1741 				kmem_cache_free(dnlc_dir_space_cache, tfp);
   1742 				ncs.ncs_dir_num_ents.value.ui64--;
   1743 				return (DFOUND);
   1744 			}
   1745 			prevpp = &((*prevpp)->df_next);
   1746 		}
   1747 		if (dcp->dc_complete) {
   1748 			ncs.ncs_dir_rems_fai.value.ui64++;
   1749 			ret = DNOENT;
   1750 		} else {
   1751 			ret = DNOCACHE;
   1752 		}
   1753 		mutex_exit(&dcap->dca_lock);
   1754 		return (ret);
   1755 	} else {
   1756 		mutex_exit(&dcap->dca_lock);
   1757 		return (DNOCACHE);
   1758 	}
   1759 }
   1760 
   1761 /*
   1762  * Update the handle of an directory cache entry.
   1763  */
   1764 dcret_t
   1765 dnlc_dir_update(dcanchor_t *dcap, char *name, uint64_t handle)
   1766 {
   1767 	dircache_t *dcp;
   1768 	dcentry_t *dep;
   1769 	int hash;
   1770 	int ret;
   1771 	uchar_t namlen;
   1772 
   1773 	if (!dnlc_dir_enable) {
   1774 		return (DNOCACHE);
   1775 	}
   1776 
   1777 	mutex_enter(&dcap->dca_lock);
   1778 	dcp = (dircache_t *)dcap->dca_dircache;
   1779 	if (VALID_DIR_CACHE(dcp)) {
   1780 		dcp->dc_actime = lbolt64;
   1781 		DNLC_DIR_HASH(name, hash, namlen);
   1782 		dep = dcp->dc_namehash[hash & dcp->dc_nhash_mask];
   1783 		while (dep != NULL) {
   1784 			if ((dep->de_hash == hash) &&
   1785 			    (namlen == dep->de_namelen) &&
   1786 			    bcmp(dep->de_name, name, namlen) == 0) {
   1787 				dep->de_handle = handle;
   1788 				mutex_exit(&dcap->dca_lock);
   1789 				return (DFOUND);
   1790 			}
   1791 			dep = dep->de_next;
   1792 		}
   1793 		if (dcp->dc_complete) {
   1794 			ncs.ncs_dir_upd_fail.value.ui64++;
   1795 			ret = DNOENT;
   1796 		} else {
   1797 			ret = DNOCACHE;
   1798 		}
   1799 		mutex_exit(&dcap->dca_lock);
   1800 		return (ret);
   1801 	} else {
   1802 		mutex_exit(&dcap->dca_lock);
   1803 		return (DNOCACHE);
   1804 	}
   1805 }
   1806 
   1807 void
   1808 dnlc_dir_fini(dcanchor_t *dcap)
   1809 {
   1810 	dircache_t *dcp;
   1811 
   1812 	mutex_enter(&dc_head.dch_lock);
   1813 	mutex_enter(&dcap->dca_lock);
   1814 	dcp = (dircache_t *)dcap->dca_dircache;
   1815 	if (VALID_DIR_CACHE(dcp)) {
   1816 		/*
   1817 		 * Unchain from global list
   1818 		 */
   1819 		ncs.ncs_dir_finipurg.value.ui64++;
   1820 		dcp->dc_prev->dc_next = dcp->dc_next;
   1821 		dcp->dc_next->dc_prev = dcp->dc_prev;
   1822 	} else {
   1823 		dcp = NULL;
   1824 	}
   1825 	dcap->dca_dircache = NULL;
   1826 	mutex_exit(&dcap->dca_lock);
   1827 	mutex_exit(&dc_head.dch_lock);
   1828 	mutex_destroy(&dcap->dca_lock);
   1829 	if (dcp) {
   1830 		dnlc_dir_abort(dcp);
   1831 	}
   1832 }
   1833 
   1834 /*
   1835  * Reclaim callback for dnlc directory caching.
   1836  * Invoked by the kernel memory allocator when memory gets tight.
   1837  * This is a pretty serious condition and can lead easily lead to system
   1838  * hangs if not enough space is returned.
   1839  *
   1840  * Deciding which directory (or directories) to purge is tricky.
   1841  * Purging everything is an overkill, but purging just the oldest used
   1842  * was found to lead to hangs. The largest cached directories use the
   1843  * most memory, but take the most effort to rebuild, whereas the smaller
   1844  * ones have little value and give back little space. So what to do?
   1845  *
   1846  * The current policy is to continue purging the oldest used directories
   1847  * until at least dnlc_dir_min_reclaim directory entries have been purged.
   1848  */
   1849 /*ARGSUSED*/
   1850 static void
   1851 dnlc_dir_reclaim(void *unused)
   1852 {
   1853 	dircache_t *dcp, *oldest;
   1854 	uint_t dirent_cnt = 0;
   1855 
   1856 	mutex_enter(&dc_head.dch_lock);
   1857 	while (dirent_cnt < dnlc_dir_min_reclaim) {
   1858 		dcp = dc_head.dch_next;
   1859 		oldest = NULL;
   1860 		while (dcp != (dircache_t *)&dc_head) {
   1861 			if (oldest == NULL) {
   1862 				oldest = dcp;
   1863 			} else {
   1864 				if (dcp->dc_actime < oldest->dc_actime) {
   1865 					oldest = dcp;
   1866 				}
   1867 			}
   1868 			dcp = dcp->dc_next;
   1869 		}
   1870 		if (oldest == NULL) {
   1871 			/* nothing to delete */
   1872 			mutex_exit(&dc_head.dch_lock);
   1873 			return;
   1874 		}
   1875 		/*
   1876 		 * remove from directory chain and purge
   1877 		 */
   1878 		oldest->dc_prev->dc_next = oldest->dc_next;
   1879 		oldest->dc_next->dc_prev = oldest->dc_prev;
   1880 		mutex_enter(&oldest->dc_anchor->dca_lock);
   1881 		/*
   1882 		 * If this was the last entry then it must be too large.
   1883 		 * Mark it as such by saving a special dircache_t
   1884 		 * pointer (DC_RET_LOW_MEM) in the anchor. The error DNOMEM
   1885 		 * will be presented to the caller of dnlc_dir_start()
   1886 		 */
   1887 		if (oldest->dc_next == oldest->dc_prev) {
   1888 			oldest->dc_anchor->dca_dircache = DC_RET_LOW_MEM;
   1889 			ncs.ncs_dir_rec_last.value.ui64++;
   1890 		} else {
   1891 			oldest->dc_anchor->dca_dircache = NULL;
   1892 			ncs.ncs_dir_recl_any.value.ui64++;
   1893 		}
   1894 		mutex_exit(&oldest->dc_anchor->dca_lock);
   1895 		dirent_cnt += oldest->dc_num_entries;
   1896 		dnlc_dir_abort(oldest);
   1897 	}
   1898 	mutex_exit(&dc_head.dch_lock);
   1899 }
   1900 
   1901 /*
   1902  * Dynamically grow or shrink the size of the name hash table
   1903  */
   1904 static void
   1905 dnlc_dir_adjust_nhash(dircache_t *dcp)
   1906 {
   1907 	dcentry_t **newhash, *dep, **nhp, *tep;
   1908 	uint_t newsize;
   1909 	uint_t oldsize;
   1910 	uint_t newsizemask;
   1911 	int i;
   1912 
   1913 	/*
   1914 	 * Allocate new hash table
   1915 	 */
   1916 	newsize = dcp->dc_num_entries >> dnlc_dir_hash_size_shift;
   1917 	newhash = kmem_zalloc(sizeof (dcentry_t *) * newsize, KM_NOSLEEP);
   1918 	if (newhash == NULL) {
   1919 		/*
   1920 		 * System is short on memory just return
   1921 		 * Note, the old hash table is still usable.
   1922 		 * This return is unlikely to repeatedy occur, because
   1923 		 * either some other directory caches will be reclaimed
   1924 		 * due to memory shortage, thus freeing memory, or this
   1925 		 * directory cahe will be reclaimed.
   1926 		 */
   1927 		return;
   1928 	}
   1929 	oldsize = dcp->dc_nhash_mask + 1;
   1930 	dcp->dc_nhash_mask = newsizemask = newsize - 1;
   1931 
   1932 	/*
   1933 	 * Move entries from the old table to the new
   1934 	 */
   1935 	for (i = 0; i < oldsize; i++) { /* for each hash bucket */
   1936 		dep = dcp->dc_namehash[i];
   1937 		while (dep != NULL) { /* for each chained entry */
   1938 			tep = dep;
   1939 			dep = dep->de_next;
   1940 			nhp = &newhash[tep->de_hash & newsizemask];
   1941 			tep->de_next = *nhp;
   1942 			*nhp = tep;
   1943 		}
   1944 	}
   1945 
   1946 	/*
   1947 	 * delete old hash table and set new one in place
   1948 	 */
   1949 	kmem_free(dcp->dc_namehash, sizeof (dcentry_t *) * oldsize);
   1950 	dcp->dc_namehash = newhash;
   1951 }
   1952 
   1953 /*
   1954  * Dynamically grow or shrink the size of the free space hash table
   1955  */
   1956 static void
   1957 dnlc_dir_adjust_fhash(dircache_t *dcp)
   1958 {
   1959 	dcfree_t **newhash, *dfp, **nhp, *tfp;
   1960 	uint_t newsize;
   1961 	uint_t oldsize;
   1962 	int i;
   1963 
   1964 	/*
   1965 	 * Allocate new hash table
   1966 	 */
   1967 	newsize = dcp->dc_num_free >> dnlc_dir_hash_size_shift;
   1968 	newhash = kmem_zalloc(sizeof (dcfree_t *) * newsize, KM_NOSLEEP);
   1969 	if (newhash == NULL) {
   1970 		/*
   1971 		 * System is short on memory just return
   1972 		 * Note, the old hash table is still usable.
   1973 		 * This return is unlikely to repeatedy occur, because
   1974 		 * either some other directory caches will be reclaimed
   1975 		 * due to memory shortage, thus freeing memory, or this
   1976 		 * directory cahe will be reclaimed.
   1977 		 */
   1978 		return;
   1979 	}
   1980 	oldsize = dcp->dc_fhash_mask + 1;
   1981 	dcp->dc_fhash_mask = newsize - 1;
   1982 
   1983 	/*
   1984 	 * Move entries from the old table to the new
   1985 	 */
   1986 	for (i = 0; i < oldsize; i++) { /* for each hash bucket */
   1987 		dfp = dcp->dc_freehash[i];
   1988 		while (dfp != NULL) { /* for each chained entry */
   1989 			tfp = dfp;
   1990 			dfp = dfp->df_next;
   1991 			nhp = &newhash[DDFHASH(tfp->df_handle, dcp)];
   1992 			tfp->df_next = *nhp;
   1993 			*nhp = tfp;
   1994 		}
   1995 	}
   1996 
   1997 	/*
   1998 	 * delete old hash table and set new one in place
   1999 	 */
   2000 	kmem_free(dcp->dc_freehash, sizeof (dcfree_t *) * oldsize);
   2001 	dcp->dc_freehash = newhash;
   2002 }
   2003