Home | History | Annotate | Download | only in zfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #pragma ident	"@(#)zap_micro.c	1.11	07/11/19 SMI"
     27 
     28 #include <sys/spa.h>
     29 #include <sys/dmu.h>
     30 #include <sys/zfs_context.h>
     31 #include <sys/zap.h>
     32 #include <sys/refcount.h>
     33 #include <sys/zap_impl.h>
     34 #include <sys/zap_leaf.h>
     35 #include <sys/avl.h>
     36 
     37 #ifdef _KERNEL
     38 #include <sys/sunddi.h>
     39 #endif
     40 
/*
 * Forward declaration: mzap_upgrade() is defined further down in this
 * file but is called from zap_lockdir(), which appears before it.
 */
static int mzap_upgrade(zap_t **zapp, dmu_tx_t *tx);
     42 
     43 
     44 static uint64_t
     45 zap_hash(zap_t *zap, const char *normname)
     46 {
     47 	const uint8_t *cp;
     48 	uint8_t c;
     49 	uint64_t crc = zap->zap_salt;
     50 
     51 	/* NB: name must already be normalized, if necessary */
     52 
     53 	ASSERT(crc != 0);
     54 	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
     55 	for (cp = (const uint8_t *)normname; (c = *cp) != '\0'; cp++) {
     56 		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];
     57 	}
     58 
     59 	/*
     60 	 * Only use 28 bits, since we need 4 bits in the cookie for the
     61 	 * collision differentiator.  We MUST use the high bits, since
     62 	 * those are the ones that we first pay attention to when
     63 	 * chosing the bucket.
     64 	 */
     65 	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
     66 
     67 	return (crc);
     68 }
     69 
     70 static int
     71 zap_normalize(zap_t *zap, const char *name, char *namenorm)
     72 {
     73 	size_t inlen, outlen;
     74 	int err;
     75 
     76 	inlen = strlen(name) + 1;
     77 	outlen = ZAP_MAXNAMELEN;
     78 
     79 	err = 0;
     80 	(void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen,
     81 	    zap->zap_normflags | U8_TEXTPREP_IGNORE_NULL, U8_UNICODE_LATEST,
     82 	    &err);
     83 
     84 	return (err);
     85 }
     86 
     87 boolean_t
     88 zap_match(zap_name_t *zn, const char *matchname)
     89 {
     90 	if (zn->zn_matchtype == MT_FIRST) {
     91 		char norm[ZAP_MAXNAMELEN];
     92 
     93 		if (zap_normalize(zn->zn_zap, matchname, norm) != 0)
     94 			return (B_FALSE);
     95 
     96 		return (strcmp(zn->zn_name_norm, norm) == 0);
     97 	} else {
     98 		/* MT_BEST or MT_EXACT */
     99 		return (strcmp(zn->zn_name_orij, matchname) == 0);
    100 	}
    101 }
    102 
    103 void
    104 zap_name_free(zap_name_t *zn)
    105 {
    106 	kmem_free(zn, sizeof (zap_name_t));
    107 }
    108 
    109 /* XXX combine this with zap_lockdir()? */
    110 zap_name_t *
    111 zap_name_alloc(zap_t *zap, const char *name, matchtype_t mt)
    112 {
    113 	zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP);
    114 
    115 	zn->zn_zap = zap;
    116 	zn->zn_name_orij = name;
    117 	zn->zn_matchtype = mt;
    118 	if (zap->zap_normflags) {
    119 		if (zap_normalize(zap, name, zn->zn_normbuf) != 0) {
    120 			zap_name_free(zn);
    121 			return (NULL);
    122 		}
    123 		zn->zn_name_norm = zn->zn_normbuf;
    124 	} else {
    125 		if (mt != MT_EXACT) {
    126 			zap_name_free(zn);
    127 			return (NULL);
    128 		}
    129 		zn->zn_name_norm = zn->zn_name_orij;
    130 	}
    131 
    132 	zn->zn_hash = zap_hash(zap, zn->zn_name_norm);
    133 	return (zn);
    134 }
    135 
    136 static void
    137 mzap_byteswap(mzap_phys_t *buf, size_t size)
    138 {
    139 	int i, max;
    140 	buf->mz_block_type = BSWAP_64(buf->mz_block_type);
    141 	buf->mz_salt = BSWAP_64(buf->mz_salt);
    142 	max = (size / MZAP_ENT_LEN) - 1;
    143 	for (i = 0; i < max; i++) {
    144 		buf->mz_chunk[i].mze_value =
    145 		    BSWAP_64(buf->mz_chunk[i].mze_value);
    146 		buf->mz_chunk[i].mze_cd =
    147 		    BSWAP_32(buf->mz_chunk[i].mze_cd);
    148 	}
    149 }
    150 
    151 void
    152 zap_byteswap(void *buf, size_t size)
    153 {
    154 	uint64_t block_type;
    155 
    156 	block_type = *(uint64_t *)buf;
    157 
    158 	if (block_type == ZBT_MICRO || block_type == BSWAP_64(ZBT_MICRO)) {
    159 		/* ASSERT(magic == ZAP_LEAF_MAGIC); */
    160 		mzap_byteswap(buf, size);
    161 	} else {
    162 		fzap_byteswap(buf, size);
    163 	}
    164 }
    165 
    166 static int
    167 mze_compare(const void *arg1, const void *arg2)
    168 {
    169 	const mzap_ent_t *mze1 = arg1;
    170 	const mzap_ent_t *mze2 = arg2;
    171 
    172 	if (mze1->mze_hash > mze2->mze_hash)
    173 		return (+1);
    174 	if (mze1->mze_hash < mze2->mze_hash)
    175 		return (-1);
    176 	if (mze1->mze_phys.mze_cd > mze2->mze_phys.mze_cd)
    177 		return (+1);
    178 	if (mze1->mze_phys.mze_cd < mze2->mze_phys.mze_cd)
    179 		return (-1);
    180 	return (0);
    181 }
    182 
    183 static void
    184 mze_insert(zap_t *zap, int chunkid, uint64_t hash, mzap_ent_phys_t *mzep)
    185 {
    186 	mzap_ent_t *mze;
    187 
    188 	ASSERT(zap->zap_ismicro);
    189 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
    190 	ASSERT(mzep->mze_cd < ZAP_MAXCD);
    191 
    192 	mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP);
    193 	mze->mze_chunkid = chunkid;
    194 	mze->mze_hash = hash;
    195 	mze->mze_phys = *mzep;
    196 	avl_add(&zap->zap_m.zap_avl, mze);
    197 }
    198 
    199 static mzap_ent_t *
    200 mze_find(zap_name_t *zn)
    201 {
    202 	mzap_ent_t mze_tofind;
    203 	mzap_ent_t *mze;
    204 	avl_index_t idx;
    205 	avl_tree_t *avl = &zn->zn_zap->zap_m.zap_avl;
    206 
    207 	ASSERT(zn->zn_zap->zap_ismicro);
    208 	ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock));
    209 
    210 	if (strlen(zn->zn_name_norm) >= sizeof (mze_tofind.mze_phys.mze_name))
    211 		return (NULL);
    212 
    213 	mze_tofind.mze_hash = zn->zn_hash;
    214 	mze_tofind.mze_phys.mze_cd = 0;
    215 
    216 again:
    217 	mze = avl_find(avl, &mze_tofind, &idx);
    218 	if (mze == NULL)
    219 		mze = avl_nearest(avl, idx, AVL_AFTER);
    220 	for (; mze && mze->mze_hash == zn->zn_hash; mze = AVL_NEXT(avl, mze)) {
    221 		if (zap_match(zn, mze->mze_phys.mze_name))
    222 			return (mze);
    223 	}
    224 	if (zn->zn_matchtype == MT_BEST) {
    225 		zn->zn_matchtype = MT_FIRST;
    226 		goto again;
    227 	}
    228 	return (NULL);
    229 }
    230 
    231 static uint32_t
    232 mze_find_unused_cd(zap_t *zap, uint64_t hash)
    233 {
    234 	mzap_ent_t mze_tofind;
    235 	mzap_ent_t *mze;
    236 	avl_index_t idx;
    237 	avl_tree_t *avl = &zap->zap_m.zap_avl;
    238 	uint32_t cd;
    239 
    240 	ASSERT(zap->zap_ismicro);
    241 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
    242 
    243 	mze_tofind.mze_hash = hash;
    244 	mze_tofind.mze_phys.mze_cd = 0;
    245 
    246 	cd = 0;
    247 	for (mze = avl_find(avl, &mze_tofind, &idx);
    248 	    mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) {
    249 		if (mze->mze_phys.mze_cd != cd)
    250 			break;
    251 		cd++;
    252 	}
    253 
    254 	return (cd);
    255 }
    256 
    257 static void
    258 mze_remove(zap_t *zap, mzap_ent_t *mze)
    259 {
    260 	ASSERT(zap->zap_ismicro);
    261 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
    262 
    263 	avl_remove(&zap->zap_m.zap_avl, mze);
    264 	kmem_free(mze, sizeof (mzap_ent_t));
    265 }
    266 
    267 static void
    268 mze_destroy(zap_t *zap)
    269 {
    270 	mzap_ent_t *mze;
    271 	void *avlcookie = NULL;
    272 
    273 	while (mze = avl_destroy_nodes(&zap->zap_m.zap_avl, &avlcookie))
    274 		kmem_free(mze, sizeof (mzap_ent_t));
    275 	avl_destroy(&zap->zap_m.zap_avl);
    276 }
    277 
/*
 * Build the in-core zap_t for the zap object whose first block is held
 * in `db', and attach it to the dbuf as its user data.
 *
 * The new zap_t is created with its rwlock held as writer while it is
 * being set up.  If dmu_buf_set_user() hands back an existing user
 * (presumably because another thread attached one first), the freshly
 * built zap_t is torn down and the existing one is returned instead.
 *
 * For a microzap (first word of the block is ZBT_MICRO) every in-use
 * chunk is hashed and inserted into the in-core AVL tree; for a fat zap
 * only the salt and normalization flags are cached and the layout is
 * sanity-checked.
 *
 * Returns the zap_t associated with the dbuf, with its rwlock NOT held.
 */
static zap_t *
mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db)
{
	zap_t *winner;
	zap_t *zap;
	int i;

	ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t));

	/* zero-filled, so zap_ismicro and the entry counts start at 0 */
	zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP);
	rw_init(&zap->zap_rwlock, 0, 0, 0);
	rw_enter(&zap->zap_rwlock, RW_WRITER);
	zap->zap_objset = os;
	zap->zap_object = obj;
	zap->zap_dbuf = db;

	/* the first 64-bit word of the block tells micro from fat */
	if (*(uint64_t *)db->db_data != ZBT_MICRO) {
		mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0);
		zap->zap_f.zap_block_shift = highbit(db->db_size) - 1;
	} else {
		zap->zap_ismicro = TRUE;
	}

	/*
	 * Make sure that zap_ismicro is set before we let others see
	 * it, because zap_lockdir() checks zap_ismicro without the lock
	 * held.
	 */
	winner = dmu_buf_set_user(db, zap, &zap->zap_m.zap_phys, zap_evict);

	if (winner != NULL) {
		/* the dbuf already has a user: discard ours, use theirs */
		rw_exit(&zap->zap_rwlock);
		rw_destroy(&zap->zap_rwlock);
		if (!zap->zap_ismicro)
			mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
		kmem_free(zap, sizeof (zap_t));
		return (winner);
	}

	if (zap->zap_ismicro) {
		zap->zap_salt = zap->zap_m.zap_phys->mz_salt;
		zap->zap_normflags = zap->zap_m.zap_phys->mz_normflags;
		/* one MZAP_ENT_LEN-sized slot is not counted as a chunk */
		zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1;
		avl_create(&zap->zap_m.zap_avl, mze_compare,
		    sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node));

		for (i = 0; i < zap->zap_m.zap_num_chunks; i++) {
			mzap_ent_phys_t *mze =
			    &zap->zap_m.zap_phys->mz_chunk[i];
			/* a chunk whose name is empty is unused */
			if (mze->mze_name[0]) {
				zap_name_t *zn;

				zap->zap_m.zap_num_entries++;
				/*
				 * The zap_name_t is only needed to compute
				 * the hash.
				 * NOTE(review): zap_name_alloc() can return
				 * NULL if normalization fails; that is not
				 * checked here.
				 */
				zn = zap_name_alloc(zap, mze->mze_name,
				    MT_EXACT);
				mze_insert(zap, i, zn->zn_hash, mze);
				zap_name_free(zn);
			}
		}
	} else {
		zap->zap_salt = zap->zap_f.zap_phys->zap_salt;
		zap->zap_normflags = zap->zap_f.zap_phys->zap_normflags;

		ASSERT3U(sizeof (struct zap_leaf_header), ==,
		    2*ZAP_LEAF_CHUNKSIZE);

		/*
		 * The embedded pointer table should not overlap the
		 * other members.
		 */
		ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >,
		    &zap->zap_f.zap_phys->zap_salt);

		/*
		 * The embedded pointer table should end at the end of
		 * the block
		 */
		ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap,
		    1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)) -
		    (uintptr_t)zap->zap_f.zap_phys, ==,
		    zap->zap_dbuf->db_size);
	}
	rw_exit(&zap->zap_rwlock);
	return (zap);
}
    363 
    364 int
    365 zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
    366     krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp)
    367 {
    368 	zap_t *zap;
    369 	dmu_buf_t *db;
    370 	krw_t lt;
    371 	int err;
    372 
    373 	*zapp = NULL;
    374 
    375 	err = dmu_buf_hold(os, obj, 0, NULL, &db);
    376 	if (err)
    377 		return (err);
    378 
    379 #ifdef ZFS_DEBUG
    380 	{
    381 		dmu_object_info_t doi;
    382 		dmu_object_info_from_db(db, &doi);
    383 		ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap);
    384 	}
    385 #endif
    386 
    387 	zap = dmu_buf_get_user(db);
    388 	if (zap == NULL)
    389 		zap = mzap_open(os, obj, db);
    390 
    391 	/*
    392 	 * We're checking zap_ismicro without the lock held, in order to
    393 	 * tell what type of lock we want.  Once we have some sort of
    394 	 * lock, see if it really is the right type.  In practice this
    395 	 * can only be different if it was upgraded from micro to fat,
    396 	 * and micro wanted WRITER but fat only needs READER.
    397 	 */
    398 	lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti;
    399 	rw_enter(&zap->zap_rwlock, lt);
    400 	if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) {
    401 		/* it was upgraded, now we only need reader */
    402 		ASSERT(lt == RW_WRITER);
    403 		ASSERT(RW_READER ==
    404 		    (!zap->zap_ismicro && fatreader) ? RW_READER : lti);
    405 		rw_downgrade(&zap->zap_rwlock);
    406 		lt = RW_READER;
    407 	}
    408 
    409 	zap->zap_objset = os;
    410 
    411 	if (lt == RW_WRITER)
    412 		dmu_buf_will_dirty(db, tx);
    413 
    414 	ASSERT3P(zap->zap_dbuf, ==, db);
    415 
    416 	ASSERT(!zap->zap_ismicro ||
    417 	    zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks);
    418 	if (zap->zap_ismicro && tx && adding &&
    419 	    zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) {
    420 		uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE;
    421 		if (newsz > MZAP_MAX_BLKSZ) {
    422 			dprintf("upgrading obj %llu: num_entries=%u\n",
    423 			    obj, zap->zap_m.zap_num_entries);
    424 			*zapp = zap;
    425 			return (mzap_upgrade(zapp, tx));
    426 		}
    427 		err = dmu_object_set_blocksize(os, obj, newsz, 0, tx);
    428 		ASSERT3U(err, ==, 0);
    429 		zap->zap_m.zap_num_chunks =
    430 		    db->db_size / MZAP_ENT_LEN - 1;
    431 	}
    432 
    433 	*zapp = zap;
    434 	return (0);
    435 }
    436 
    437 void
    438 zap_unlockdir(zap_t *zap)
    439 {
    440 	rw_exit(&zap->zap_rwlock);
    441 	dmu_buf_rele(zap->zap_dbuf, NULL);
    442 }
    443 
    444 static int
    445 mzap_upgrade(zap_t **zapp, dmu_tx_t *tx)
    446 {
    447 	mzap_phys_t *mzp;
    448 	int i, sz, nchunks, err;
    449 	zap_t *zap = *zapp;
    450 
    451 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
    452 
    453 	sz = zap->zap_dbuf->db_size;
    454 	mzp = kmem_alloc(sz, KM_SLEEP);
    455 	bcopy(zap->zap_dbuf->db_data, mzp, sz);
    456 	nchunks = zap->zap_m.zap_num_chunks;
    457 
    458 	err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object,
    459 	    1ULL << fzap_default_block_shift, 0, tx);
    460 	if (err) {
    461 		kmem_free(mzp, sz);
    462 		return (err);
    463 	}
    464 
    465 	dprintf("upgrading obj=%llu with %u chunks\n",
    466 	    zap->zap_object, nchunks);
    467 	/* XXX destroy the avl later, so we can use the stored hash value */
    468 	mze_destroy(zap);
    469 
    470 	fzap_upgrade(zap, tx);
    471 
    472 	for (i = 0; i < nchunks; i++) {
    473 		int err;
    474 		mzap_ent_phys_t *mze = &mzp->mz_chunk[i];
    475 		zap_name_t *zn;
    476 		if (mze->mze_name[0] == 0)
    477 			continue;
    478 		dprintf("adding %s=%llu\n",
    479 		    mze->mze_name, mze->mze_value);
    480 		zn = zap_name_alloc(zap, mze->mze_name, MT_EXACT);
    481 		err = fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd, tx);
    482 		zap = zn->zn_zap;	/* fzap_add_cd() may change zap */
    483 		zap_name_free(zn);
    484 		if (err)
    485 			break;
    486 	}
    487 	kmem_free(mzp, sz);
    488 	*zapp = zap;
    489 	return (err);
    490 }
    491 
    492 static void
    493 mzap_create_impl(objset_t *os, uint64_t obj, int normflags, dmu_tx_t *tx)
    494 {
    495 	dmu_buf_t *db;
    496 	mzap_phys_t *zp;
    497 
    498 	VERIFY(0 == dmu_buf_hold(os, obj, 0, FTAG, &db));
    499 
    500 #ifdef ZFS_DEBUG
    501 	{
    502 		dmu_object_info_t doi;
    503 		dmu_object_info_from_db(db, &doi);
    504 		ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap);
    505 	}
    506 #endif
    507 
    508 	dmu_buf_will_dirty(db, tx);
    509 	zp = db->db_data;
    510 	zp->mz_block_type = ZBT_MICRO;
    511 	zp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (obj << 1)) | 1ULL;
    512 	zp->mz_normflags = normflags;
    513 	dmu_buf_rele(db, FTAG);
    514 }
    515 
    516 int
    517 zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot,
    518     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
    519 {
    520 	return (zap_create_claim_norm(os, obj,
    521 	    0, ot, bonustype, bonuslen, tx));
    522 }
    523 
    524 int
    525 zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags,
    526     dmu_object_type_t ot,
    527     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
    528 {
    529 	int err;
    530 
    531 	err = dmu_object_claim(os, obj, ot, 0, bonustype, bonuslen, tx);
    532 	if (err != 0)
    533 		return (err);
    534 	mzap_create_impl(os, obj, normflags, tx);
    535 	return (0);
    536 }
    537 
    538 uint64_t
    539 zap_create(objset_t *os, dmu_object_type_t ot,
    540     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
    541 {
    542 	return (zap_create_norm(os, 0, ot, bonustype, bonuslen, tx));
    543 }
    544 
    545 uint64_t
    546 zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot,
    547     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
    548 {
    549 	uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx);
    550 
    551 	mzap_create_impl(os, obj, normflags, tx);
    552 	return (obj);
    553 }
    554 
    555 int
    556 zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx)
    557 {
    558 	/*
    559 	 * dmu_object_free will free the object number and free the
    560 	 * data.  Freeing the data will cause our pageout function to be
    561 	 * called, which will destroy our data (zap_leaf_t's and zap_t).
    562 	 */
    563 
    564 	return (dmu_object_free(os, zapobj, tx));
    565 }
    566 
    567 _NOTE(ARGSUSED(0))
    568 void
    569 zap_evict(dmu_buf_t *db, void *vzap)
    570 {
    571 	zap_t *zap = vzap;
    572 
    573 	rw_destroy(&zap->zap_rwlock);
    574 
    575 	if (zap->zap_ismicro)
    576 		mze_destroy(zap);
    577 	else
    578 		mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
    579 
    580 	kmem_free(zap, sizeof (zap_t));
    581 }
    582 
    583 int
    584 zap_count(objset_t *os, uint64_t zapobj, uint64_t *count)
    585 {
    586 	zap_t *zap;
    587 	int err;
    588 
    589 	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
    590 	if (err)
    591 		return (err);
    592 	if (!zap->zap_ismicro) {
    593 		err = fzap_count(zap, count);
    594 	} else {
    595 		*count = zap->zap_m.zap_num_entries;
    596 	}
    597 	zap_unlockdir(zap);
    598 	return (err);
    599 }
    600 
    601 /*
    602  * zn may be NULL; if not specified, it will be computed if needed.
    603  * See also the comment above zap_entry_normalization_conflict().
    604  */
    605 static boolean_t
    606 mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze)
    607 {
    608 	mzap_ent_t *other;
    609 	int direction = AVL_BEFORE;
    610 	boolean_t allocdzn = B_FALSE;
    611 
    612 	if (zap->zap_normflags == 0)
    613 		return (B_FALSE);
    614 
    615 again:
    616 	for (other = avl_walk(&zap->zap_m.zap_avl, mze, direction);
    617 	    other && other->mze_hash == mze->mze_hash;
    618 	    other = avl_walk(&zap->zap_m.zap_avl, other, direction)) {
    619 
    620 		if (zn == NULL) {
    621 			zn = zap_name_alloc(zap, mze->mze_phys.mze_name,
    622 			    MT_FIRST);
    623 			allocdzn = B_TRUE;
    624 		}
    625 		if (zap_match(zn, other->mze_phys.mze_name)) {
    626 			if (allocdzn)
    627 				zap_name_free(zn);
    628 			return (B_TRUE);
    629 		}
    630 	}
    631 
    632 	if (direction == AVL_BEFORE) {
    633 		direction = AVL_AFTER;
    634 		goto again;
    635 	}
    636 
    637 	if (allocdzn)
    638 		zap_name_free(zn);
    639 	return (B_FALSE);
    640 }
    641 
    642 /*
    643  * Routines for manipulating attributes.
    644  */
    645 
    646 int
    647 zap_lookup(objset_t *os, uint64_t zapobj, const char *name,
    648     uint64_t integer_size, uint64_t num_integers, void *buf)
    649 {
    650 	return (zap_lookup_norm(os, zapobj, name, integer_size,
    651 	    num_integers, buf, MT_EXACT, NULL, 0, NULL));
    652 }
    653 
    654 int
    655 zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name,
    656     uint64_t integer_size, uint64_t num_integers, void *buf,
    657     matchtype_t mt, char *realname, int rn_len,
    658     boolean_t *ncp)
    659 {
    660 	zap_t *zap;
    661 	int err;
    662 	mzap_ent_t *mze;
    663 	zap_name_t *zn;
    664 
    665 	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
    666 	if (err)
    667 		return (err);
    668 	zn = zap_name_alloc(zap, name, mt);
    669 	if (zn == NULL) {
    670 		zap_unlockdir(zap);
    671 		return (ENOTSUP);
    672 	}
    673 
    674 	if (!zap->zap_ismicro) {
    675 		err = fzap_lookup(zn, integer_size, num_integers, buf,
    676 		    realname, rn_len, ncp);
    677 	} else {
    678 		mze = mze_find(zn);
    679 		if (mze == NULL) {
    680 			err = ENOENT;
    681 		} else {
    682 			if (num_integers < 1) {
    683 				err = EOVERFLOW;
    684 			} else if (integer_size != 8) {
    685 				err = EINVAL;
    686 			} else {
    687 				*(uint64_t *)buf = mze->mze_phys.mze_value;
    688 				(void) strlcpy(realname,
    689 				    mze->mze_phys.mze_name, rn_len);
    690 				if (ncp) {
    691 					*ncp = mzap_normalization_conflict(zap,
    692 					    zn, mze);
    693 				}
    694 			}
    695 		}
    696 	}
    697 	zap_name_free(zn);
    698 	zap_unlockdir(zap);
    699 	return (err);
    700 }
    701 
    702 int
    703 zap_length(objset_t *os, uint64_t zapobj, const char *name,
    704     uint64_t *integer_size, uint64_t *num_integers)
    705 {
    706 	zap_t *zap;
    707 	int err;
    708 	mzap_ent_t *mze;
    709 	zap_name_t *zn;
    710 
    711 	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
    712 	if (err)
    713 		return (err);
    714 	zn = zap_name_alloc(zap, name, MT_EXACT);
    715 	if (zn == NULL) {
    716 		zap_unlockdir(zap);
    717 		return (ENOTSUP);
    718 	}
    719 	if (!zap->zap_ismicro) {
    720 		err = fzap_length(zn, integer_size, num_integers);
    721 	} else {
    722 		mze = mze_find(zn);
    723 		if (mze == NULL) {
    724 			err = ENOENT;
    725 		} else {
    726 			if (integer_size)
    727 				*integer_size = 8;
    728 			if (num_integers)
    729 				*num_integers = 1;
    730 		}
    731 	}
    732 	zap_name_free(zn);
    733 	zap_unlockdir(zap);
    734 	return (err);
    735 }
    736 
    737 static void
    738 mzap_addent(zap_name_t *zn, uint64_t value)
    739 {
    740 	int i;
    741 	zap_t *zap = zn->zn_zap;
    742 	int start = zap->zap_m.zap_alloc_next;
    743 	uint32_t cd;
    744 
    745 	dprintf("obj=%llu %s=%llu\n", zap->zap_object,
    746 	    zn->zn_name_orij, value);
    747 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
    748 
    749 #ifdef ZFS_DEBUG
    750 	for (i = 0; i < zap->zap_m.zap_num_chunks; i++) {
    751 		mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i];
    752 		ASSERT(strcmp(zn->zn_name_orij, mze->mze_name) != 0);
    753 	}
    754 #endif
    755 
    756 	cd = mze_find_unused_cd(zap, zn->zn_hash);
    757 	/* given the limited size of the microzap, this can't happen */
    758 	ASSERT(cd != ZAP_MAXCD);
    759 
    760 again:
    761 	for (i = start; i < zap->zap_m.zap_num_chunks; i++) {
    762 		mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i];
    763 		if (mze->mze_name[0] == 0) {
    764 			mze->mze_value = value;
    765 			mze->mze_cd = cd;
    766 			(void) strcpy(mze->mze_name, zn->zn_name_orij);
    767 			zap->zap_m.zap_num_entries++;
    768 			zap->zap_m.zap_alloc_next = i+1;
    769 			if (zap->zap_m.zap_alloc_next ==
    770 			    zap->zap_m.zap_num_chunks)
    771 				zap->zap_m.zap_alloc_next = 0;
    772 			mze_insert(zap, i, zn->zn_hash, mze);
    773 			return;
    774 		}
    775 	}
    776 	if (start != 0) {
    777 		start = 0;
    778 		goto again;
    779 	}
    780 	ASSERT(!"out of entries!");
    781 }
    782 
    783 int
    784 zap_add(objset_t *os, uint64_t zapobj, const char *name,
    785     int integer_size, uint64_t num_integers,
    786     const void *val, dmu_tx_t *tx)
    787 {
    788 	zap_t *zap;
    789 	int err;
    790 	mzap_ent_t *mze;
    791 	const uint64_t *intval = val;
    792 	zap_name_t *zn;
    793 
    794 	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
    795 	if (err)
    796 		return (err);
    797 	zn = zap_name_alloc(zap, name, MT_EXACT);
    798 	if (zn == NULL) {
    799 		zap_unlockdir(zap);
    800 		return (ENOTSUP);
    801 	}
    802 	if (!zap->zap_ismicro) {
    803 		err = fzap_add(zn, integer_size, num_integers, val, tx);
    804 		zap = zn->zn_zap;	/* fzap_add() may change zap */
    805 	} else if (integer_size != 8 || num_integers != 1 ||
    806 	    strlen(name) >= MZAP_NAME_LEN) {
    807 		dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
    808 		    zapobj, integer_size, num_integers, name);
    809 		err = mzap_upgrade(&zn->zn_zap, tx);
    810 		if (err == 0)
    811 			err = fzap_add(zn, integer_size, num_integers, val, tx);
    812 		zap = zn->zn_zap;	/* fzap_add() may change zap */
    813 	} else {
    814 		mze = mze_find(zn);
    815 		if (mze != NULL) {
    816 			err = EEXIST;
    817 		} else {
    818 			mzap_addent(zn, *intval);
    819 		}
    820 	}
    821 	ASSERT(zap == zn->zn_zap);
    822 	zap_name_free(zn);
    823 	if (zap != NULL)	/* may be NULL if fzap_add() failed */
    824 		zap_unlockdir(zap);
    825 	return (err);
    826 }
    827 
    828 int
    829 zap_update(objset_t *os, uint64_t zapobj, const char *name,
    830     int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
    831 {
    832 	zap_t *zap;
    833 	mzap_ent_t *mze;
    834 	const uint64_t *intval = val;
    835 	zap_name_t *zn;
    836 	int err;
    837 
    838 	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
    839 	if (err)
    840 		return (err);
    841 	zn = zap_name_alloc(zap, name, MT_EXACT);
    842 	if (zn == NULL) {
    843 		zap_unlockdir(zap);
    844 		return (ENOTSUP);
    845 	}
    846 	if (!zap->zap_ismicro) {
    847 		err = fzap_update(zn, integer_size, num_integers, val, tx);
    848 		zap = zn->zn_zap;	/* fzap_update() may change zap */
    849 	} else if (integer_size != 8 || num_integers != 1 ||
    850 	    strlen(name) >= MZAP_NAME_LEN) {
    851 		dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
    852 		    zapobj, integer_size, num_integers, name);
    853 		err = mzap_upgrade(&zn->zn_zap, tx);
    854 		if (err == 0)
    855 			err = fzap_update(zn, integer_size, num_integers,
    856 			    val, tx);
    857 		zap = zn->zn_zap;	/* fzap_update() may change zap */
    858 	} else {
    859 		mze = mze_find(zn);
    860 		if (mze != NULL) {
    861 			mze->mze_phys.mze_value = *intval;
    862 			zap->zap_m.zap_phys->mz_chunk
    863 			    [mze->mze_chunkid].mze_value = *intval;
    864 		} else {
    865 			mzap_addent(zn, *intval);
    866 		}
    867 	}
    868 	ASSERT(zap == zn->zn_zap);
    869 	zap_name_free(zn);
    870 	if (zap != NULL)	/* may be NULL if fzap_upgrade() failed */
    871 		zap_unlockdir(zap);
    872 	return (err);
    873 }
    874 
    875 int
    876 zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx)
    877 {
    878 	return (zap_remove_norm(os, zapobj, name, MT_EXACT, tx));
    879 }
    880 
    881 int
    882 zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name,
    883     matchtype_t mt, dmu_tx_t *tx)
    884 {
    885 	zap_t *zap;
    886 	int err;
    887 	mzap_ent_t *mze;
    888 	zap_name_t *zn;
    889 
    890 	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, &zap);
    891 	if (err)
    892 		return (err);
    893 	zn = zap_name_alloc(zap, name, mt);
    894 	if (zn == NULL) {
    895 		zap_unlockdir(zap);
    896 		return (ENOTSUP);
    897 	}
    898 	if (!zap->zap_ismicro) {
    899 		err = fzap_remove(zn, tx);
    900 	} else {
    901 		mze = mze_find(zn);
    902 		if (mze == NULL) {
    903 			err = ENOENT;
    904 		} else {
    905 			zap->zap_m.zap_num_entries--;
    906 			bzero(&zap->zap_m.zap_phys->mz_chunk[mze->mze_chunkid],
    907 			    sizeof (mzap_ent_phys_t));
    908 			mze_remove(zap, mze);
    909 		}
    910 	}
    911 	zap_name_free(zn);
    912 	zap_unlockdir(zap);
    913 	return (err);
    914 }
    915 
    916 /*
    917  * Routines for iterating over the attributes.
    918  */
    919 
    920 /*
    921  * We want to keep the high 32 bits of the cursor zero if we can, so
    922  * that 32-bit programs can access this.  So use a small hash value so
    923  * we can fit 4 bits of cd into the 32-bit cursor.
    924  *
    925  * [ 4 zero bits | 32-bit collision differentiator | 28-bit hash value ]
    926  */
    927 void
    928 zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
    929     uint64_t serialized)
    930 {
    931 	zc->zc_objset = os;
    932 	zc->zc_zap = NULL;
    933 	zc->zc_leaf = NULL;
    934 	zc->zc_zapobj = zapobj;
    935 	if (serialized == -1ULL) {
    936 		zc->zc_hash = -1ULL;
    937 		zc->zc_cd = 0;
    938 	} else {
    939 		zc->zc_hash = serialized << (64-ZAP_HASHBITS);
    940 		zc->zc_cd = serialized >> ZAP_HASHBITS;
    941 		if (zc->zc_cd >= ZAP_MAXCD) /* corrupt serialized */
    942 			zc->zc_cd = 0;
    943 	}
    944 }
    945 
    946 void
    947 zap_cursor_init(zap_cursor_t *zc, objset_t *os,