Home | History | Annotate | Download | only in common
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 #pragma ident	"@(#)fmd_case.c	1.11	07/10/12 SMI"
     28 
     29 /*
     30  * FMD Case Subsystem
     31  *
     32  * Diagnosis engines are expected to group telemetry events related to the
     33  * diagnosis of a particular problem on the system into a set of cases.  The
     34  * diagnosis engine may have any number of cases open at a given point in time.
     35  * Some cases may eventually be *solved* by associating a suspect list of one
     36  * or more problems with the case, at which point fmd publishes a list.suspect
     37  * event for the case and it becomes visible to administrators and agents.
     38  *
     39  * Every case is named using a UUID, and is globally visible in the case hash.
     40  * Cases are reference-counted, except for the reference from the case hash
     41  * itself.  Consumers of case references include modules, which store active
     42  * cases on the mod_cases list, ASRUs in the resource cache, and the RPC code.
     43  *
     44  * Cases obey the following state machine.  In states UNSOLVED, SOLVED, and
     45  * CLOSE_WAIT, a case's module refers to the owning module (a diagnosis engine
     46  * or transport) and the case is referenced by the mod_cases list.  Once the
     47  * case reaches the CLOSED or REPAIRED states, a case's module changes to refer
     48  * to the root module (fmd.d_rmod) and is deleted from the owner's mod_cases.
     49  *
     50  *			+------------+
     51  *	     +----------|  UNSOLVED  |
     52  *	     |		+------------+
     53  *	   1 |	             4 |
     54  *           |                 |
     55  *	+----v---+ /-2->+------v-----+	  3	+--------+
     56  *      | SOLVED |<     | CLOSE_WAIT |--------->| CLOSED |
     57  *	+--------+ \-5->+------------+		+--------+
     58  *	                       |                    |
     59  *                           6 |                    | 7
     60  *      		+------v-----+              |
     61  *	                |  REPAIRED  |<-------------+
     62  *			+------------+
     63  *
     64  * The state machine changes are triggered by calls to fmd_case_transition()
     65  * from various locations inside of fmd, as described below:
     66  *
     67  * [1] Called by: fmd_case_solve()
     68  *       Actions: FMD_CF_SOLVED flag is set in ci_flags
     69  *                conviction policy is applied to suspect list
     70  *                suspects convicted are marked faulty (F) in R$
     71  *                list.suspect event logged and dispatched
     72  *
     73  * [2] Called by: fmd_case_close(), fmd_case_uuclose(), fmd_xprt_event_uuclose()
     74  *       Actions: FMD_CF_ISOLATED flag is set in ci_flags
     75  *                suspects convicted (F) are marked unusable (U) in R$
     76  *                diagnosis engine fmdo_close() entry point scheduled
     77  *                case transitions to CLOSED [3] upon exit from CLOSE_WAIT
     78  *
     79  * [3] Called by: fmd_case_delete() (after fmdo_close() entry point returns)
     80  *       Actions: list.isolated event dispatched
     81  *                case deleted from module's list of open cases
     82  *
     83  * [4] Called by: fmd_case_close(), fmd_case_uuclose()
     84  *       Actions: diagnosis engine fmdo_close() entry point scheduled
     85  *                case is subsequently discarded by fmd_case_delete()
     86  *
     87  * [5] Called by: fmd_case_repair(), fmd_case_update()
     88  *       Actions: FMD_CF_REPAIRED flag is set in ci_flags
     89  *                diagnosis engine fmdo_close() entry point scheduled
     90  *                case transitions to REPAIRED [6] upon exit from CLOSE_WAIT
     91  *
     92  * [6] Called by: fmd_case_repair(), fmd_case_update()
     93  *       Actions: FMD_CF_REPAIRED flag is set in ci_flags
     94  *                suspects convicted are marked non faulty (!F) in R$
     95  *                list.repaired event dispatched
     96  *
     97  * [7] Called by: fmd_case_repair(), fmd_case_update()
     98  *       Actions: FMD_CF_REPAIRED flag is set in ci_flags
     99  *                suspects convicted are marked non faulty (!F) in R$
    100  *                list.repaired event dispatched
    101  */
    102 
    103 #include <sys/fm/protocol.h>
    104 #include <uuid/uuid.h>
    105 #include <alloca.h>
    106 
    107 #include <fmd_alloc.h>
    108 #include <fmd_module.h>
    109 #include <fmd_error.h>
    110 #include <fmd_conf.h>
    111 #include <fmd_case.h>
    112 #include <fmd_string.h>
    113 #include <fmd_subr.h>
    114 #include <fmd_protocol.h>
    115 #include <fmd_event.h>
    116 #include <fmd_eventq.h>
    117 #include <fmd_dispq.h>
    118 #include <fmd_buf.h>
    119 #include <fmd_log.h>
    120 #include <fmd_asru.h>
    121 #include <fmd_fmri.h>
    122 #include <fmd_xprt.h>
    123 
    124 #include <fmd.h>
    125 
    126 static const char *const _fmd_case_snames[] = {
    127 	"UNSOLVED",	/* FMD_CASE_UNSOLVED */
    128 	"SOLVED",	/* FMD_CASE_SOLVED */
    129 	"CLOSE_WAIT",	/* FMD_CASE_CLOSE_WAIT */
    130 	"CLOSED",	/* FMD_CASE_CLOSED */
    131 	"REPAIRED"	/* FMD_CASE_REPAIRED */
    132 };
    133 
    134 extern volatile uint32_t fmd_asru_fake_not_present;
    135 
    136 fmd_case_hash_t *
    137 fmd_case_hash_create(void)
    138 {
    139 	fmd_case_hash_t *chp = fmd_alloc(sizeof (fmd_case_hash_t), FMD_SLEEP);
    140 
    141 	(void) pthread_rwlock_init(&chp->ch_lock, NULL);
    142 	chp->ch_hashlen = fmd.d_str_buckets;
    143 	chp->ch_hash = fmd_zalloc(sizeof (void *) * chp->ch_hashlen, FMD_SLEEP);
    144 	chp->ch_count = 0;
    145 
    146 	return (chp);
    147 }
    148 
    149 /*
    150  * Destroy the case hash.  Unlike most of our hash tables, no active references
    151  * are kept by the case hash itself; all references come from other subsystems.
    152  * The hash must be destroyed after all modules are unloaded; if anything was
    153  * present in the hash it would be by definition a reference count leak.
    154  */
    155 void
    156 fmd_case_hash_destroy(fmd_case_hash_t *chp)
    157 {
    158 	fmd_free(chp->ch_hash, sizeof (void *) * chp->ch_hashlen);
    159 	fmd_free(chp, sizeof (fmd_case_hash_t));
    160 }
    161 
    162 /*
    163  * Take a snapshot of the case hash by placing an additional hold on each
    164  * member in an auxiliary array, and then call 'func' for each case.
    165  */
    166 void
    167 fmd_case_hash_apply(fmd_case_hash_t *chp,
    168     void (*func)(fmd_case_t *, void *), void *arg)
    169 {
    170 	fmd_case_impl_t *cp, **cps, **cpp;
    171 	uint_t cpc, i;
    172 
    173 	(void) pthread_rwlock_rdlock(&chp->ch_lock);
    174 
    175 	cps = cpp = fmd_alloc(chp->ch_count * sizeof (fmd_case_t *), FMD_SLEEP);
    176 	cpc = chp->ch_count;
    177 
    178 	for (i = 0; i < chp->ch_hashlen; i++) {
    179 		for (cp = chp->ch_hash[i]; cp != NULL; cp = cp->ci_next) {
    180 			fmd_case_hold((fmd_case_t *)cp);
    181 			*cpp++ = cp;
    182 		}
    183 	}
    184 
    185 	ASSERT(cpp == cps + cpc);
    186 	(void) pthread_rwlock_unlock(&chp->ch_lock);
    187 
    188 	for (i = 0; i < cpc; i++) {
    189 		func((fmd_case_t *)cps[i], arg);
    190 		fmd_case_rele((fmd_case_t *)cps[i]);
    191 	}
    192 
    193 	fmd_free(cps, cpc * sizeof (fmd_case_t *));
    194 }
    195 
    196 /*
    197  * Look up the diagcode for this case and cache it in ci_code.  If no suspects
    198  * were defined for this case or if the lookup fails, the event dictionary or
    199  * module code is broken, and we set the event code to a precomputed default.
    200  */
    201 static const char *
    202 fmd_case_mkcode(fmd_case_t *cp)
    203 {
    204 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
    205 	fmd_case_susp_t *cis;
    206 
    207 	char **keys, **keyp;
    208 	const char *s;
    209 
    210 	ASSERT(MUTEX_HELD(&cip->ci_lock));
    211 	ASSERT(cip->ci_state >= FMD_CASE_SOLVED);
    212 
    213 	fmd_free(cip->ci_code, cip->ci_codelen);
    214 	cip->ci_codelen = cip->ci_mod->mod_codelen;
    215 	cip->ci_code = fmd_zalloc(cip->ci_codelen, FMD_SLEEP);
    216 	keys = keyp = alloca(sizeof (char *) * (cip->ci_nsuspects + 1));
    217 
    218 	for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) {
    219 		if (nvlist_lookup_string(cis->cis_nvl, FM_CLASS, keyp) == 0)
    220 			keyp++;
    221 	}
    222 
    223 	*keyp = NULL; /* mark end of keys[] array for libdiagcode */
    224 
    225 	if (cip->ci_nsuspects == 0 || fmd_module_dc_key2code(
    226 	    cip->ci_mod, keys, cip->ci_code, cip->ci_codelen) != 0) {
    227 		(void) fmd_conf_getprop(fmd.d_conf, "nodiagcode", &s);
    228 		fmd_free(cip->ci_code, cip->ci_codelen);
    229 		cip->ci_codelen = strlen(s) + 1;
    230 		cip->ci_code = fmd_zalloc(cip->ci_codelen, FMD_SLEEP);
    231 		(void) strcpy(cip->ci_code, s);
    232 	}
    233 
    234 	return (cip->ci_code);
    235 }
    236 
    237 nvlist_t *
    238 fmd_case_mkevent(fmd_case_t *cp, const char *class)
    239 {
    240 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
    241 	fmd_case_susp_t *cis;
    242 
    243 	fmd_asru_hash_t *ahp = fmd.d_asrus;
    244 	fmd_asru_t *asru;
    245 
    246 	nvlist_t **nva, **nvp, *nvl, *fmri;
    247 	uint8_t *ba, *bp;
    248 
    249 	int msg = B_TRUE;
    250 	boolean_t b;
    251 
    252 	(void) pthread_mutex_lock(&cip->ci_lock);
    253 	ASSERT(cip->ci_state >= FMD_CASE_SOLVED);
    254 
    255 	nva = nvp = alloca(sizeof (nvlist_t *) * cip->ci_nsuspects);
    256 	ba = bp = alloca(sizeof (uint8_t) * cip->ci_nsuspects);
    257 
    258 	/*
    259 	 * For each suspect associated with the case, store its fault event
    260 	 * nvlist in 'nva'.  We also look to see if any of the suspect faults
    261 	 * have asked not to be messaged.  If any of them have made such a
    262 	 * request, propagate that attribute to the composite list.* event.
    263 	 * Finally, store each suspect's faulty status into the bitmap 'ba'.
    264 	 */
    265 	for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) {
    266 		if (nvlist_lookup_boolean_value(cis->cis_nvl,
    267 		    FM_SUSPECT_MESSAGE, &b) == 0 && b == B_FALSE)
    268 			msg = B_FALSE;
    269 
    270 		if (nvlist_lookup_nvlist(cis->cis_nvl,
    271 		    FM_FAULT_ASRU, &fmri) == 0 && (asru =
    272 		    fmd_asru_hash_lookup_nvl(ahp, fmri, FMD_B_FALSE)) != NULL) {
    273 			*bp = 0;
    274 			if (fmd_asru_fake_not_present ||
    275 			    !fmd_fmri_present(asru->asru_fmri))
    276 				*bp |= FM_SUSPECT_NOT_PRESENT;
    277 			if (fmd_asru_fake_not_present ||
    278 			    fmd_fmri_unusable(asru->asru_fmri))
    279 				*bp |= FM_SUSPECT_UNUSABLE;
    280 			if (asru->asru_flags & FMD_ASRU_FAULTY)
    281 				*bp |= FM_SUSPECT_FAULTY;
    282 			bp++;
    283 			fmd_asru_hash_release(ahp, asru);
    284 		} else
    285 			*bp++ = 0;
    286 
    287 		*nvp++ = cis->cis_nvl;
    288 	}
    289 
    290 	if (cip->ci_code == NULL)
    291 		(void) fmd_case_mkcode(cp);
    292 
    293 	if (msg == B_FALSE)
    294 		cip->ci_flags |= FMD_CF_INVISIBLE;
    295 
    296 	nvl = fmd_protocol_list(class, cip->ci_mod->mod_fmri, cip->ci_uuid,
    297 	    cip->ci_code, cip->ci_nsuspects, nva, ba, msg, &cip->ci_tv);
    298 
    299 	(void) pthread_mutex_unlock(&cip->ci_lock);
    300 	return (nvl);
    301 }
    302 
    303 /*
    304  * Convict suspects in a case by applying a conviction policy and updating the
    305  * resource cache prior to emitting the list.suspect event for the given case.
    306  * At present, our policy is very simple: convict every suspect in the case.
    307  * In the future, this policy can be extended and made configurable to permit:
    308  *
    309  * - convicting the suspect with the highest FIT rate
    310  * - convicting the suspect with the cheapest FRU
    311  * - convicting the suspect with the FRU that is in a depot's inventory
    312  * - convicting the suspect with the longest lifetime
    313  *
    314  * and so forth.  A word to the wise: this problem is significantly harder than
    315  * it seems at first glance.  Future work should heed the following advice:
    316  *
    317  * Hacking the policy into C code here is a very bad idea.  The policy needs to
    318  * be decided upon very carefully and fundamentally encodes knowledge of what
    319  * suspect list combinations can be emitted by what diagnosis engines.  As such
    320  * fmd's code is the wrong location, because that would require fmd itself to
    321  * be updated for every diagnosis engine change, defeating the entire design.
    322  * The FMA Event Registry knows the suspect list combinations: policy inputs
    323  * can be derived from it and used to produce per-module policy configuration.
    324  *
    325  * If the policy needs to be dynamic and not statically fixed at either fmd
    326  * startup or module load time, any implementation of dynamic policy retrieval
    327  * must employ some kind of caching mechanism or be part of a built-in module.
    328  * The fmd_case_convict() function is called with locks held inside of fmd and
    329  * is not a place where unbounded blocking on some inter-process or inter-
    330  * system communication to another service (e.g. another daemon) can occur.
    331  */
    332 static void
    333 fmd_case_convict(fmd_case_t *cp)
    334 {
    335 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
    336 	fmd_asru_hash_t *ahp = fmd.d_asrus;
    337 
    338 	fmd_case_susp_t *cis;
    339 	fmd_asru_t *asru;
    340 	nvlist_t *fmri;
    341 
    342 	(void) pthread_mutex_lock(&cip->ci_lock);
    343 	(void) fmd_case_mkcode(cp);
    344 
    345 	for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) {
    346 		if (nvlist_lookup_nvlist(cis->cis_nvl, FM_FAULT_ASRU, &fmri))
    347 			continue; /* no ASRU provided by diagnosis engine */
    348 
    349 		if ((asru = fmd_asru_hash_lookup_nvl(ahp,
    350 		    fmri, FMD_B_TRUE)) == NULL) {
    351 			fmd_error(EFMD_CASE_EVENT, "cannot convict suspect in "
    352 			    "%s: %s\n", cip->ci_uuid, fmd_strerror(errno));
    353 			continue;
    354 		}
    355 
    356 		(void) fmd_asru_clrflags(asru,
    357 		    FMD_ASRU_UNUSABLE, cp, cis->cis_nvl);
    358 		(void) fmd_asru_setflags(asru,
    359 		    FMD_ASRU_FAULTY, cp, cis->cis_nvl);
    360 
    361 		fmd_asru_hash_release(ahp, asru);
    362 	}
    363 
    364 	(void) pthread_mutex_unlock(&cip->ci_lock);
    365 }
    366 
    367 void
    368 fmd_case_publish(fmd_case_t *cp, uint_t state)
    369 {
    370 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
    371 	fmd_event_t *e;
    372 	nvlist_t *nvl;
    373 	char *class;
    374 
    375 	if (state == FMD_CASE_CURRENT)
    376 		state = cip->ci_state; /* use current state */
    377 
    378 	switch (state) {
    379 	case FMD_CASE_SOLVED:
    380 		(void) pthread_mutex_lock(&cip->ci_lock);
    381 		if (cip->ci_tv_valid == 0) {
    382 			fmd_time_gettimeofday(&cip->ci_tv);
    383 			cip->ci_tv_valid = 1;
    384 		}
    385 		(void) pthread_mutex_unlock(&cip->ci_lock);
    386 		fmd_case_convict(cp);
    387 		nvl = fmd_case_mkevent(cp, FM_LIST_SUSPECT_CLASS);
    388 		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
    389 
    390 		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
    391 		(void) pthread_rwlock_rdlock(&fmd.d_log_lock);
    392 		fmd_log_append(fmd.d_fltlog, e, cp);
    393 		(void) pthread_rwlock_unlock(&fmd.d_log_lock);
    394 		fmd_dispq_dispatch(fmd.d_disp, e, class);
    395 
    396 		(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
    397 		cip->ci_mod->mod_stats->ms_casesolved.fmds_value.ui64++;
    398 		(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
    399 
    400 		break;
    401 
    402 	case FMD_CASE_CLOSE_WAIT:
    403 		fmd_case_hold(cp);
    404 		e = fmd_event_create(FMD_EVT_CLOSE, FMD_HRT_NOW, NULL, cp);
    405 		fmd_eventq_insert_at_head(cip->ci_mod->mod_queue, e);
    406 
    407 		(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
    408 		cip->ci_mod->mod_stats->ms_caseclosed.fmds_value.ui64++;
    409 		(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
    410 
    411 		break;
    412 
    413 	case FMD_CASE_CLOSED:
    414 		nvl = fmd_case_mkevent(cp, FM_LIST_ISOLATED_CLASS);
    415 		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
    416 		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
    417 		fmd_dispq_dispatch(fmd.d_disp, e, class);
    418 		break;
    419 
    420 	case FMD_CASE_REPAIRED:
    421 		nvl = fmd_case_mkevent(cp, FM_LIST_REPAIRED_CLASS);
    422 		(void) nvlist_lookup_string(nvl, FM_CLASS, &class);
    423 		e = fmd_event_create(FMD_EVT_PROTOCOL, FMD_HRT_NOW, nvl, class);
    424 		fmd_dispq_dispatch(fmd.d_disp, e, class);
    425 		break;
    426 	}
    427 }
    428 
    429 fmd_case_t *
    430 fmd_case_hash_lookup(fmd_case_hash_t *chp, const char *uuid)
    431 {
    432 	fmd_case_impl_t *cip;
    433 	uint_t h;
    434 
    435 	(void) pthread_rwlock_rdlock(&chp->ch_lock);
    436 	h = fmd_strhash(uuid) % chp->ch_hashlen;
    437 
    438 	for (cip = chp->ch_hash[h]; cip != NULL; cip = cip->ci_next) {
    439 		if (strcmp(cip->ci_uuid, uuid) == 0)
    440 			break;
    441 	}
    442 
    443 	if (cip != NULL)
    444 		fmd_case_hold((fmd_case_t *)cip);
    445 	else
    446 		(void) fmd_set_errno(EFMD_CASE_INVAL);
    447 
    448 	(void) pthread_rwlock_unlock(&chp->ch_lock);
    449 	return ((fmd_case_t *)cip);
    450 }
    451 
    452 static fmd_case_impl_t *
    453 fmd_case_hash_insert(fmd_case_hash_t *chp, fmd_case_impl_t *cip)
    454 {
    455 	fmd_case_impl_t *eip;
    456 	uint_t h;
    457 
    458 	(void) pthread_rwlock_wrlock(&chp->ch_lock);
    459 	h = fmd_strhash(cip->ci_uuid) % chp->ch_hashlen;
    460 
    461 	for (eip = chp->ch_hash[h]; eip != NULL; eip = eip->ci_next) {
    462 		if (strcmp(cip->ci_uuid, eip->ci_uuid) == 0) {
    463 			fmd_case_hold((fmd_case_t *)eip);
    464 			(void) pthread_rwlock_unlock(&chp->ch_lock);
    465 			return (eip); /* uuid already present */
    466 		}
    467 	}
    468 
    469 	cip->ci_next = chp->ch_hash[h];
    470 	chp->ch_hash[h] = cip;
    471 
    472 	chp->ch_count++;
    473 	ASSERT(chp->ch_count != 0);
    474 
    475 	(void) pthread_rwlock_unlock(&chp->ch_lock);
    476 	return (cip);
    477 }
    478 
    479 static void
    480 fmd_case_hash_delete(fmd_case_hash_t *chp, fmd_case_impl_t *cip)
    481 {
    482 	fmd_case_impl_t *cp, **pp;
    483 	uint_t h;
    484 
    485 	(void) pthread_rwlock_wrlock(&chp->ch_lock);
    486 
    487 	h = fmd_strhash(cip->ci_uuid) % chp->ch_hashlen;
    488 	pp = &chp->ch_hash[h];
    489 
    490 	for (cp = *pp; cp != NULL; cp = cp->ci_next) {
    491 		if (cp != cip)
    492 			pp = &cp->ci_next;
    493 		else
    494 			break;
    495 	}
    496 
    497 	if (cp == NULL) {
    498 		fmd_panic("case %p (%s) not found on hash chain %u\n",
    499 		    (void *)cip, cip->ci_uuid, h);
    500 	}
    501 
    502 	*pp = cp->ci_next;
    503 	cp->ci_next = NULL;
    504 
    505 	ASSERT(chp->ch_count != 0);
    506 	chp->ch_count--;
    507 
    508 	(void) pthread_rwlock_unlock(&chp->ch_lock);
    509 }
    510 
    511 fmd_case_t *
    512 fmd_case_create(fmd_module_t *mp, void *data)
    513 {
    514 	fmd_case_impl_t *cip = fmd_zalloc(sizeof (fmd_case_impl_t), FMD_SLEEP);
    515 	fmd_case_impl_t *eip = NULL;
    516 	uuid_t uuid;
    517 
    518 	(void) pthread_mutex_init(&cip->ci_lock, NULL);
    519 	fmd_buf_hash_create(&cip->ci_bufs);
    520 
    521 	fmd_module_hold(mp);
    522 	cip->ci_mod = mp;
    523 	cip->ci_refs = 1;
    524 	cip->ci_state = FMD_CASE_UNSOLVED;
    525 	cip->ci_flags = FMD_CF_DIRTY;
    526 	cip->ci_data = data;
    527 
    528 	/*
    529 	 * Calling libuuid: get a clue.  The library interfaces cleverly do not
    530 	 * define any constant for the length of an unparse string, and do not
    531 	 * permit the caller to specify a buffer length for safety.  The spec
    532 	 * says it will be 36 bytes, but we make it tunable just in case.
    533 	 */
    534 	(void) fmd_conf_getprop(fmd.d_conf, "uuidlen", &cip->ci_uuidlen);
    535 	cip->ci_uuid = fmd_zalloc(cip->ci_uuidlen + 1, FMD_SLEEP);
    536 
    537 	/*
    538 	 * We expect this loop to execute only once, but code it defensively
    539 	 * against the possibility of libuuid bugs.  Keep generating uuids and
    540 	 * attempting to do a hash insert until we get a unique one.
    541 	 */
    542 	do {
    543 		if (eip != NULL)
    544 			fmd_case_rele((fmd_case_t *)eip);
    545 		uuid_generate(uuid);
    546 		uuid_unparse(uuid, cip->ci_uuid);
    547 	} while ((eip = fmd_case_hash_insert(fmd.d_cases, cip)) != cip);
    548 
    549 	ASSERT(fmd_module_locked(mp));
    550 	fmd_list_append(&mp->mod_cases, cip);
    551 	fmd_module_setcdirty(mp);
    552 
    553 	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
    554 	cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64++;
    555 	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
    556 
    557 	return ((fmd_case_t *)cip);
    558 }
    559 
    560 static void
    561 fmd_case_destroy_suspects(fmd_case_impl_t *cip)
    562 {
    563 	fmd_case_susp_t *cis, *ncis;
    564 
    565 	ASSERT(MUTEX_HELD(&cip->ci_lock));
    566 
    567 	for (cis = cip->ci_suspects; cis != NULL; cis = ncis) {
    568 		ncis = cis->cis_next;
    569 		nvlist_free(cis->cis_nvl);
    570 		fmd_free(cis, sizeof (fmd_case_susp_t));
    571 	}
    572 
    573 	cip->ci_suspects = NULL;
    574 	cip->ci_nsuspects = 0;
    575 }
    576 
    577 fmd_case_t *
    578 fmd_case_recreate(fmd_module_t *mp, fmd_xprt_t *xp,
    579     uint_t state, const char *uuid, const char *code)
    580 {
    581 	fmd_case_impl_t *cip = fmd_zalloc(sizeof (fmd_case_impl_t), FMD_SLEEP);
    582 	fmd_case_impl_t *eip;
    583 
    584 	ASSERT(state < FMD_CASE_REPAIRED);
    585 
    586 	(void) pthread_mutex_init(&cip->ci_lock, NULL);
    587 	fmd_buf_hash_create(&cip->ci_bufs);
    588 
    589 	fmd_module_hold(mp);
    590 	cip->ci_mod = mp;
    591 	cip->ci_xprt = xp;
    592 	cip->ci_refs = 1;
    593 	cip->ci_state = state;
    594 	cip->ci_uuid = fmd_strdup(uuid, FMD_SLEEP);
    595 	cip->ci_uuidlen = strlen(cip->ci_uuid);
    596 	cip->ci_code = fmd_strdup(code, FMD_SLEEP);
    597 	cip->ci_codelen = cip->ci_code ? strlen(cip->ci_code) + 1 : 0;
    598 
    599 	if (state > FMD_CASE_CLOSE_WAIT)
    600 		cip->ci_flags |= FMD_CF_SOLVED;
    601 
    602 	/*
    603 	 * Insert the case into the global case hash.  If the specified UUID is
    604 	 * already present, check to see if it is an orphan: if so, reclaim it;
    605 	 * otherwise if it is owned by a different module then return NULL.
    606 	 */
    607 	if ((eip = fmd_case_hash_insert(fmd.d_cases, cip)) != cip) {
    608 		(void) pthread_mutex_lock(&cip->ci_lock);
    609 		cip->ci_refs--; /* decrement to zero */
    610 		fmd_case_destroy((fmd_case_t *)cip, B_FALSE);
    611 
    612 		cip = eip; /* switch 'cip' to the existing case */
    613 		(void) pthread_mutex_lock(&cip->ci_lock);
    614 
    615 		/*
    616 		 * If the ASRU cache is trying to recreate an orphan, then just
    617 		 * return the existing case that we found without changing it.
    618 		 */
    619 		if (mp == fmd.d_rmod) {
    620 			(void) pthread_mutex_unlock(&cip->ci_lock);
    621 			fmd_case_rele((fmd_case_t *)cip);
    622 			return ((fmd_case_t *)cip);
    623 		}
    624 
    625 		/*
    626 		 * If the existing case isn't an orphan or is being proxied,
    627 		 * then we have a UUID conflict: return failure to the caller.
    628 		 */
    629 		if (cip->ci_mod != fmd.d_rmod || xp != NULL) {
    630 			(void) pthread_mutex_unlock(&cip->ci_lock);
    631 			fmd_case_rele((fmd_case_t *)cip);
    632 			return (NULL);
    633 		}
    634 
    635 		/*
    636 		 * If the new module is reclaiming an orphaned case, remove
    637 		 * the case from the root module, switch ci_mod, and then fall
    638 		 * through to adding the case to the new owner module 'mp'.
    639 		 */
    640 		fmd_module_lock(cip->ci_mod);
    641 		fmd_list_delete(&cip->ci_mod->mod_cases, cip);
    642 		fmd_module_unlock(cip->ci_mod);
    643 
    644 		fmd_module_rele(cip->ci_mod);
    645 		cip->ci_mod = mp;
    646 		fmd_module_hold(mp);
    647 
    648 		fmd_case_destroy_suspects(cip);
    649 		cip->ci_state = state;
    650 
    651 		(void) pthread_mutex_unlock(&cip->ci_lock);
    652 		fmd_case_rele((fmd_case_t *)cip);
    653 	}
    654 
    655 	ASSERT(fmd_module_locked(mp));
    656 	fmd_list_append(&mp->mod_cases, cip);
    657 
    658 	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
    659 	cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64++;
    660 	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
    661 
    662 	return ((fmd_case_t *)cip);
    663 }
    664 
    665 void
    666 fmd_case_destroy(fmd_case_t *cp, int visible)
    667 {
    668 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
    669 	fmd_case_item_t *cit, *ncit;
    670 
    671 	ASSERT(MUTEX_HELD(&cip->ci_lock));
    672 	ASSERT(cip->ci_refs == 0);
    673 
    674 	if (visible) {
    675 		TRACE((FMD_DBG_CASE, "deleting case %s", cip->ci_uuid));
    676 		fmd_case_hash_delete(fmd.d_cases, cip);
    677 	}
    678 
    679 	for (cit = cip->ci_items; cit != NULL; cit = ncit) {
    680 		ncit = cit->cit_next;
    681 		fmd_event_rele(cit->cit_event);
    682 		fmd_free(cit, sizeof (fmd_case_item_t));
    683 	}
    684 
    685 	fmd_case_destroy_suspects(cip);
    686 
    687 	if (cip->ci_principal != NULL)
    688 		fmd_event_rele(cip->ci_principal);
    689 
    690 	fmd_free(cip->ci_uuid, cip->ci_uuidlen + 1);
    691 	fmd_free(cip->ci_code, cip->ci_codelen);
    692 	(void) fmd_buf_hash_destroy(&cip->ci_bufs);
    693 
    694 	fmd_module_rele(cip->ci_mod);
    695 	fmd_free(cip, sizeof (fmd_case_impl_t));
    696 }
    697 
    698 void
    699 fmd_case_hold(fmd_case_t *cp)
    700 {
    701 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
    702 
    703 	(void) pthread_mutex_lock(&cip->ci_lock);
    704 	cip->ci_refs++;
    705 	ASSERT(cip->ci_refs != 0);
    706 	(void) pthread_mutex_unlock(&cip->ci_lock);
    707 }
    708 
    709 void
    710 fmd_case_hold_locked(fmd_case_t *cp)
    711 {
    712 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
    713 
    714 	ASSERT(MUTEX_HELD(&cip->ci_lock));
    715 	cip->ci_refs++;
    716 	ASSERT(cip->ci_refs != 0);
    717 }
    718 
    719 void
    720 fmd_case_rele(fmd_case_t *cp)
    721 {
    722 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
    723 
    724 	(void) pthread_mutex_lock(&cip->ci_lock);
    725 	ASSERT(cip->ci_refs != 0);
    726 
    727 	if (--cip->ci_refs == 0)
    728 		fmd_case_destroy((fmd_case_t *)cip, B_TRUE);
    729 	else
    730 		(void) pthread_mutex_unlock(&cip->ci_lock);
    731 }
    732 
    733 int
    734 fmd_case_insert_principal(fmd_case_t *cp, fmd_event_t *ep)
    735 {
    736 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
    737 	fmd_case_item_t *cit;
    738 	fmd_event_t *oep;
    739 	uint_t state;
    740 	int new;
    741 
    742 	fmd_event_hold(ep);
    743 	(void) pthread_mutex_lock(&cip->ci_lock);
    744 
    745 	if (cip->ci_flags & FMD_CF_SOLVED)
    746 		state = FMD_EVS_DIAGNOSED;
    747 	else
    748 		state = FMD_EVS_ACCEPTED;
    749 
    750 	oep = cip->ci_principal;
    751 	cip->ci_principal = ep;
    752 
    753 	for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) {
    754 		if (cit->cit_event == ep)
    755 			break;
    756 	}
    757 
    758 	cip->ci_flags |= FMD_CF_DIRTY;
    759 	new = cit == NULL && ep != oep;
    760 
    761 	(void) pthread_mutex_unlock(&cip->ci_lock);
    762 
    763 	fmd_module_setcdirty(cip->ci_mod);
    764 	fmd_event_transition(ep, state);
    765 
    766 	if (oep != NULL)
    767 		fmd_event_rele(oep);
    768 
    769 	return (new);
    770 }
    771 
    772 int
    773 fmd_case_insert_event(fmd_case_t *cp, fmd_event_t *ep)
    774 {
    775 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
    776 	fmd_case_item_t *cit;
    777 	uint_t state;
    778 	int new;
    779 
    780 	(void) pthread_mutex_lock(&cip->ci_lock);
    781 
    782 	if (cip->ci_flags & FMD_CF_SOLVED)
    783 		state = FMD_EVS_DIAGNOSED;
    784 	else
    785 		state = FMD_EVS_ACCEPTED;
    786 
    787 	for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) {
    788 		if (cit->cit_event == ep)
    789 			break;
    790 	}
    791 
    792 	new = cit == NULL && ep != cip->ci_principal;
    793 
    794 	/*
    795 	 * If the event is already in the case or the case is already solved,
    796 	 * there is no reason to save it: just transition it appropriately.
    797 	 */
    798 	if (cit != NULL || (cip->ci_flags & FMD_CF_SOLVED)) {
    799 		(void) pthread_mutex_unlock(&cip->ci_lock);
    800 		fmd_event_transition(ep, state);
    801 		return (new);
    802 	}
    803 
    804 	cit = fmd_alloc(sizeof (fmd_case_item_t), FMD_SLEEP);
    805 	fmd_event_hold(ep);
    806 
    807 	cit->cit_next = cip->ci_items;
    808 	cit->cit_event = ep;
    809 
    810 	cip->ci_items = cit;
    811 	cip->ci_nitems++;
    812 
    813 	cip->ci_flags |= FMD_CF_DIRTY;
    814 	(void) pthread_mutex_unlock(&cip->ci_lock);
    815 
    816 	fmd_module_setcdirty(cip->ci_mod);
    817 	fmd_event_transition(ep, state);
    818 
    819 	return (new);
    820 }
    821 
    822 void
    823 fmd_case_insert_suspect(fmd_case_t *cp, nvlist_t *nvl)
    824 {
    825 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
    826 	fmd_case_susp_t *cis = fmd_alloc(sizeof (fmd_case_susp_t), FMD_SLEEP);
    827 
    828 	(void) pthread_mutex_lock(&cip->ci_lock);
    829 	ASSERT(cip->ci_state < FMD_CASE_SOLVED);
    830 	cip->ci_flags |= FMD_CF_DIRTY;
    831 
    832 	cis->cis_next = cip->ci_suspects;
    833 	cis->cis_nvl = nvl;
    834 
    835 	cip->ci_suspects = cis;
    836 	cip->ci_nsuspects++;
    837 
    838 	(void) pthread_mutex_unlock(&cip->ci_lock);
    839 	fmd_module_setcdirty(cip->ci_mod);
    840 }
    841 
    842 void
    843 fmd_case_recreate_suspect(fmd_case_t *cp, nvlist_t *nvl)
    844 {
    845 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
    846 	fmd_case_susp_t *cis = fmd_alloc(sizeof (fmd_case_susp_t), FMD_SLEEP);
    847 	boolean_t b;
    848 
    849 	(void) pthread_mutex_lock(&cip->ci_lock);
    850 	ASSERT(cip->ci_state == FMD_CASE_CLOSED);
    851 	ASSERT(cip->ci_mod == fmd.d_rmod);
    852 
    853 	cis->cis_next = cip->ci_suspects;
    854 	cis->cis_nvl = nvl;
    855 
    856 	if (nvlist_lookup_boolean_value(nvl,
    857 	    FM_SUSPECT_MESSAGE, &b) == 0 && b == B_FALSE)
    858 		cip->ci_flags |= FMD_CF_INVISIBLE;
    859 
    860 	cip->ci_suspects = cis;
    861 	cip->ci_nsuspects++;
    862 
    863 	(void) pthread_mutex_unlock(&cip->ci_lock);
    864 }
    865 
    866 void
    867 fmd_case_reset_suspects(fmd_case_t *cp)
    868 {
    869 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
    870 
    871 	(void) pthread_mutex_lock(&cip->ci_lock);
    872 	ASSERT(cip->ci_state < FMD_CASE_SOLVED);
    873 
    874 	fmd_case_destroy_suspects(cip);
    875 	cip->ci_flags |= FMD_CF_DIRTY;
    876 
    877 	(void) pthread_mutex_unlock(&cip->ci_lock);
    878 	fmd_module_setcdirty(cip->ci_mod);
    879 }
    880 
    881 /*
    882  * Grab ci_lock and update the case state and set the dirty bit.  Then perform
    883  * whatever actions and emit whatever events are appropriate for the state.
    884  * Refer to the topmost block comment explaining the state machine for details.
    885  */
    886 void
    887 fmd_case_transition(fmd_case_t *cp, uint_t state, uint_t flags)
    888 {
    889 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
    890 
    891 	fmd_case_susp_t *cis;
    892 	fmd_case_item_t *cit;
    893 	fmd_asru_t *asru;
    894 	fmd_event_t *e;
    895 	nvlist_t *nvl;
    896 
    897 	ASSERT(state <= FMD_CASE_REPAIRED);
    898 	(void) pthread_mutex_lock(&cip->ci_lock);
    899 
    900 	if (!(cip->ci_flags & FMD_CF_SOLVED) && !(flags & FMD_CF_SOLVED))
    901 		flags &= ~(FMD_CF_ISOLATED | FMD_CF_REPAIRED);
    902 
    903 	cip->ci_flags |= flags;
    904 
    905 	if (cip->ci_state >= state) {
    906 		(void) pthread_mutex_unlock(&cip->ci_lock);
    907 		return; /* already in specified state */
    908 	}
    909 
    910 	TRACE((FMD_DBG_CASE, "case %s %s->%s", cip->ci_uuid,
    911 	    _fmd_case_snames[cip->ci_state], _fmd_case_snames[state]));
    912 
    913 	cip->ci_state = state;
    914 	cip->ci_flags |= FMD_CF_DIRTY;
    915 
    916 	if (cip->ci_xprt == NULL && cip->ci_mod != fmd.d_rmod)
    917 		fmd_module_setcdirty(cip->ci_mod);
    918 
    919 	switch (state) {
    920 	case FMD_CASE_SOLVED:
    921 		for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next)
    922 			fmd_event_transition(cit->cit_event, FMD_EVS_DIAGNOSED);
    923 
    924 		if (cip->ci_principal != NULL) {
    925 			fmd_event_transition(cip->ci_principal,
    926 			    FMD_EVS_DIAGNOSED);
    927 		}
    928 		break;
    929 
    930 	case FMD_CASE_CLOSE_WAIT:
    931 		/*
    932 		 * If the case was never solved, do not change ASRUs.
    933 		 * If the case was never fmd_case_closed, do not change ASRUs.
    934 		 * If the case was repaired, do not change ASRUs.
    935 		 */
    936 		if ((cip->ci_flags & (FMD_CF_SOLVED | FMD_CF_ISOLATED |
    937 		    FMD_CF_REPAIRED)) != (FMD_CF_SOLVED | FMD_CF_ISOLATED))
    938 			goto close_wait_finish;
    939 
    940 		/*
    941 		 * For each fault event in the suspect list, attempt to look up
    942 		 * the corresponding ASRU in the ASRU dictionary.  If the ASRU
    943 		 * is found there and is marked faulty, we now mark it unusable
    944 		 * and record the case meta-data and fault event with the ASRU.
    945 		 */
    946 		for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) {
    947 			if (nvlist_lookup_nvlist(cis->cis_nvl, FM_FAULT_ASRU,
    948 			    &nvl) == 0 && (asru = fmd_asru_hash_lookup_nvl(
    949 			    fmd.d_asrus, nvl, FMD_B_FALSE)) != NULL) {
    950 				(void) fmd_asru_setflags(asru,
    951 				    FMD_ASRU_UNUSABLE, cp, cis->cis_nvl);
    952 				fmd_asru_hash_release(fmd.d_asrus, asru);
    953 			}
    954 		}
    955 
    956 	close_wait_finish:
    957 		/*
    958 		 * If an orphaned case transitions to CLOSE_WAIT, the owning
    959 		 * module is no longer loaded: continue on to CASE_CLOSED.
    960 		 */
    961 		if (fmd_case_orphaned(cp))
    962 			state = cip->ci_state = FMD_CASE_CLOSED;
    963 		break;
    964 
    965 	case FMD_CASE_REPAIRED:
    966 		ASSERT(fmd_case_orphaned(cp));
    967 		fmd_module_lock(cip->ci_mod);
    968 		fmd_list_delete(&cip->ci_mod->mod_cases, cip);
    969 		fmd_module_unlock(cip->ci_mod);
    970 		break;
    971 	}
    972 
    973 	(void) pthread_mutex_unlock(&cip->ci_lock);
    974 
    975 	/*
    976 	 * If the module has initialized, then publish the appropriate event
    977 	 * for the new case state.  If not, we are being called from the
    978 	 * checkpoint code during module load, in which case the module's
    979 	 * _fmd_init() routine hasn't finished yet, and our event dictionaries
    980 	 * may not be open yet, which will prevent us from computing the event
    981 	 * code.  Defer the call to fmd_case_publish() by enqueuing a PUBLISH
    982 	 * event in our queue: this won't be processed until _fmd_init is done.
    983 	 */
    984 	if (cip->ci_mod->mod_flags & FMD_MOD_INIT)
    985 		fmd_case_publish(cp, state);
    986 	else {
    987 		fmd_case_hold(cp);
    988 		e = fmd_event_create(FMD_EVT_PUBLISH, FMD_HRT_NOW, NULL, cp);
    989 		fmd_eventq_insert_at_head(cip->ci_mod->mod_queue, e);
    990 	}
    991 
    992 	/*
    993 	 * If we transitioned to REPAIRED, adjust the reference count to
    994 	 * reflect our removal from fmd.d_rmod->mod_cases.  If the caller has
    995 	 * not placed an additional hold on the case, it will now be freed.
    996 	 */
    997 	if (state == FMD_CASE_REPAIRED)
    998 		fmd_case_rele(cp);
    999 }
   1000 
   1001 /*
   1002  * Transition the specified case to *at least* the specified state by first
   1003  * re-validating the suspect list using the resource cache.  This function is
   1004  * employed by the checkpoint code when restoring a saved, solved case to see
   1005  * if the state of the case has effectively changed while fmd was not running
   1006  * or the module was not loaded.  If none of the suspects are present anymore,
   1007  * advance the state to REPAIRED.  If none are usable, advance to CLOSE_WAIT.
   1008  */
   1009 void
   1010 fmd_case_transition_update(fmd_case_t *cp, uint_t state, uint_t flags)
   1011 {
   1012 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
   1013 	fmd_case_susp_t *cis;
   1014 	fmd_asru_t *asru;
   1015 	nvlist_t *nvl;
   1016 
   1017 	int faulty = 0;		/* are any suspects faulty? */
   1018 	int usable = 0;		/* are any suspects usable? */
   1019 
   1020 	ASSERT(state >= FMD_CASE_SOLVED);
   1021 	(void) pthread_mutex_lock(&cip->ci_lock);
   1022 
   1023 	for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) {
   1024 		if (nvlist_lookup_nvlist(cis->cis_nvl, FM_FAULT_ASRU,
   1025 		    &nvl) == 0 && (asru = fmd_asru_hash_lookup_nvl(
   1026 		    fmd.d_asrus, nvl, FMD_B_TRUE)) != NULL) {
   1027 
   1028 			if (asru->asru_flags & FMD_ASRU_FAULTY)
   1029 				faulty++;
   1030 
   1031 			if (fmd_asru_fake_not_present == 0 &&
   1032 			    fmd_fmri_unusable(asru->asru_fmri) <= 0)
   1033 				usable++;
   1034 
   1035 			fmd_asru_hash_release(fmd.d_asrus, asru);
   1036 		}
   1037 	}
   1038 
   1039 	(void) pthread_mutex_unlock(&cip->ci_lock);
   1040 
   1041 	/*
   1042 	 * If none of the suspects were faulty, it implies they were either
   1043 	 * repaired already or not present and the rsrc.age time has expired.
   1044 	 * We can move the state on to repaired.
   1045 	 */
   1046 	if (!faulty) {
   1047 		state = MAX(state, FMD_CASE_CLOSE_WAIT);
   1048 		flags |= FMD_CF_REPAIRED;
   1049 	} else if (!usable) {
   1050 		state = MAX(state, FMD_CASE_CLOSE_WAIT);
   1051 		flags |= FMD_CF_ISOLATED;
   1052 	}
   1053 
   1054 	fmd_case_transition(cp, state, flags);
   1055 }
   1056 
   1057 void
   1058 fmd_case_setdirty(fmd_case_t *cp)
   1059 {
   1060 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
   1061 
   1062 	(void) pthread_mutex_lock(&cip->ci_lock);
   1063 	cip->ci_flags |= FMD_CF_DIRTY;
   1064 	(void) pthread_mutex_unlock(&cip->ci_lock);
   1065 
   1066 	fmd_module_setcdirty(cip->ci_mod);
   1067 }
   1068 
   1069 void
   1070 fmd_case_clrdirty(fmd_case_t *cp)
   1071 {
   1072 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
   1073 
   1074 	(void) pthread_mutex_lock(&cip->ci_lock);
   1075 	cip->ci_flags &= ~FMD_CF_DIRTY;
   1076 	(void) pthread_mutex_unlock(&cip->ci_lock);
   1077 }
   1078 
   1079 void
   1080 fmd_case_commit(fmd_case_t *cp)
   1081 {
   1082 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
   1083 	fmd_case_item_t *cit;
   1084 
   1085 	(void) pthread_mutex_lock(&cip->ci_lock);
   1086 
   1087 	if (cip->ci_flags & FMD_CF_DIRTY) {
   1088 		for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next)
   1089 			fmd_event_commit(cit->cit_event);
   1090 
   1091 		if (cip->ci_principal != NULL)
   1092 			fmd_event_commit(cip->ci_principal);
   1093 
   1094 		fmd_buf_hash_commit(&cip->ci_bufs);
   1095 		cip->ci_flags &= ~FMD_CF_DIRTY;
   1096 	}
   1097 
   1098 	(void) pthread_mutex_unlock(&cip->ci_lock);
   1099 }
   1100 
   1101 /*
   1102  * Indicate that the case may need to change state because one or more of the
   1103  * ASRUs named as a suspect has changed state.  We examine all the suspects
   1104  * and if none are still faulty, we initiate a case close transition.
   1105  */
   1106 void
   1107 fmd_case_update(fmd_case_t *cp)
   1108 {
   1109 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
   1110 	fmd_case_susp_t *cis;
   1111 	fmd_asru_t *asru;
   1112 	nvlist_t *nvl;
   1113 
   1114 	int astate = 0;
   1115 	uint_t cstate;
   1116 
   1117 	(void) pthread_mutex_lock(&cip->ci_lock);
   1118 	cstate = cip->ci_state;
   1119 
   1120 	if ((cip->ci_flags & FMD_CF_REPAIRING) ||
   1121 	    cip->ci_xprt != NULL || cip->ci_state < FMD_CASE_SOLVED) {
   1122 		(void) pthread_mutex_unlock(&cip->ci_lock);
   1123 		return; /* update is not appropriate */
   1124 	}
   1125 
   1126 	for (cis = cip->ci_suspects; cis != NULL; cis = cis->cis_next) {
   1127 		if (nvlist_lookup_nvlist(cis->cis_nvl, FM_FAULT_ASRU,
   1128 		    &nvl) == 0 && (asru = fmd_asru_hash_lookup_nvl(
   1129 		    fmd.d_asrus, nvl, FMD_B_FALSE)) != NULL) {
   1130 			astate |= (asru->asru_flags & FMD_ASRU_STATE);
   1131 			fmd_asru_hash_release(fmd.d_asrus, asru);
   1132 		}
   1133 	}
   1134 
   1135 	(void) pthread_mutex_unlock(&cip->ci_lock);
   1136 
   1137 	if (astate & FMD_ASRU_FAULTY)
   1138 		return; /* one or more suspects are still marked faulty */
   1139 
   1140 	if (cstate == FMD_CASE_CLOSED)
   1141 		fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED);
   1142 	else
   1143 		fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED);
   1144 }
   1145 
   1146 /*
   1147  * Delete a closed case from the module's case list once the fmdo_close() entry
   1148  * point has run to completion.  If the case is owned by a transport module,
   1149  * tell the transport to proxy a case close on the other end of the transport.
   1150  * If not, transition to the appropriate next state based on ci_flags.  This
   1151  * function represents the end of CLOSE_WAIT and transitions the case to either
   1152  * CLOSED or REPAIRED or discards it entirely because it was never solved;
   1153  * refer to the topmost block comment explaining the state machine for details.
   1154  */
   1155 void
   1156 fmd_case_delete(fmd_case_t *cp)
   1157 {
   1158 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
   1159 	fmd_modstat_t *msp;
   1160 	size_t buftotal;
   1161 
   1162 	ASSERT(fmd_module_locked(cip->ci_mod));
   1163 	fmd_list_delete(&cip->ci_mod->mod_cases, cip);
   1164 	buftotal = fmd_buf_hash_destroy(&cip->ci_bufs);
   1165 
   1166 	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
   1167 	msp = cip->ci_mod->mod_stats;
   1168 
   1169 	ASSERT(msp->ms_caseopen.fmds_value.ui64 != 0);
   1170 	msp->ms_caseopen.fmds_value.ui64--;
   1171 
   1172 	ASSERT(msp->ms_buftotal.fmds_value.ui64 >= buftotal);
   1173 	msp->ms_buftotal.fmds_value.ui64 -= buftotal;
   1174 
   1175 	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
   1176 
   1177 	if (cip->ci_xprt == NULL)
   1178 		fmd_module_setcdirty(cip->ci_mod);
   1179 
   1180 	fmd_module_rele(cip->ci_mod);
   1181 	cip->ci_mod = fmd.d_rmod;
   1182 	fmd_module_hold(cip->ci_mod);
   1183 
   1184 	/*
   1185 	 * If the case is not proxied and it has been solved, then retain it
   1186 	 * on the root module's case list at least until we're transitioned.
   1187 	 * Otherwise free the case with our final fmd_case_rele() below.
   1188 	 */
   1189 	if (cip->ci_xprt == NULL && (cip->ci_flags & FMD_CF_SOLVED)) {
   1190 		fmd_module_lock(cip->ci_mod);
   1191 		fmd_list_append(&cip->ci_mod->mod_cases, cip);
   1192 		fmd_module_unlock(cip->ci_mod);
   1193 		fmd_case_hold(cp);
   1194 	}
   1195 
   1196 	/*
   1197 	 * If a proxied case finishes CLOSE_WAIT, then it can be discarded
   1198 	 * rather than orphaned because by definition it can have no entries
   1199 	 * in the resource cache of the current fault manager.
   1200 	 */
   1201 	if (cip->ci_xprt != NULL)
   1202 		fmd_xprt_uuclose(cip->ci_xprt, cip->ci_uuid);
   1203 	else if (cip->ci_flags & FMD_CF_REPAIRED)
   1204 		fmd_case_transition(cp, FMD_CASE_REPAIRED, 0);
   1205 	else if (cip->ci_flags & FMD_CF_ISOLATED)
   1206 		fmd_case_transition(cp, FMD_CASE_CLOSED, 0);
   1207 
   1208 	fmd_case_rele(cp);
   1209 }
   1210 
   1211 void
   1212 fmd_case_discard(fmd_case_t *cp)
   1213 {
   1214 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
   1215 
   1216 	(void) pthread_mutex_lock(&cip->ci_mod->mod_stats_lock);
   1217 	cip->ci_mod->mod_stats->ms_caseopen.fmds_value.ui64--;
   1218 	(void) pthread_mutex_unlock(&cip->ci_mod->mod_stats_lock);
   1219 
   1220 	ASSERT(fmd_module_locked(cip->ci_mod));
   1221 	fmd_list_delete(&cip->ci_mod->mod_cases, cip);
   1222 	fmd_case_rele(cp);
   1223 }
   1224 
   1225 static void
   1226 fmd_case_repair_containee(fmd_asru_t *ee, void *er)
   1227 {
   1228 	if ((ee->asru_flags & FMD_ASRU_FAULTY) &&
   1229 	    fmd_fmri_contains(er, ee->asru_fmri) > 0)
   1230 		(void) fmd_asru_clrflags(ee, FMD_ASRU_FAULTY, NULL, NULL);
   1231 }
   1232 
   1233 /*
   1234  * Indicate that the problem corresponding to a case has been repaired by
   1235  * clearing the faulty bit on each ASRU named as a suspect.  If the case hasn't
   1236  * already been closed, this function initiates the transition to CLOSE_WAIT.
   1237  * The caller must have the case held from fmd_case_hash_lookup(), so we can
   1238  * grab and drop ci_lock without the case being able to be freed in between.
   1239  */
   1240 int
   1241 fmd_case_repair(fmd_case_t *cp)
   1242 {
   1243 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
   1244 	fmd_case_susp_t *cis;
   1245 	nvlist_t *nvl;
   1246 	uint_t cstate;
   1247 
   1248 	fmd_asru_hash_t *ahp = fmd.d_asrus;
   1249 	fmd_asru_t **aa;
   1250 	uint_t i, an;
   1251 
   1252 	(void) pthread_mutex_lock(&cip->ci_lock);
   1253 	cstate = cip->ci_state;
   1254 
   1255 	if (cip->ci_xprt != NULL) {
   1256 		(void) pthread_mutex_unlock(&cip->ci_lock);
   1257 		return (fmd_set_errno(EFMD_CASE_OWNER));
   1258 	}
   1259 
   1260 	if (cstate < FMD_CASE_SOLVED || (cip->ci_flags & FMD_CF_REPAIRING)) {
   1261 		(void) pthread_mutex_unlock(&cip->ci_lock);
   1262 		return (fmd_set_errno(EFMD_CASE_STATE));
   1263 	}
   1264 
   1265 	/*
   1266 	 * Take a snapshot of any ASRUs referenced by the case that are present
   1267 	 * in the resource cache.  Then drop ci_lock and clear the faulty bit
   1268 	 * on each ASRU (we can't call fmd_asru_clrflags() with ci_lock held).
   1269 	 */
   1270 	an = cip->ci_nsuspects;
   1271 	aa = alloca(sizeof (fmd_asru_t *) * an);
   1272 	bzero(aa, sizeof (fmd_asru_t *) * an);
   1273 
   1274 	for (i = 0, cis = cip->ci_suspects;
   1275 	    cis != NULL; cis = cis->cis_next, i++) {
   1276 		if (nvlist_lookup_nvlist(cis->cis_nvl,
   1277 		    FM_FAULT_ASRU, &nvl) == 0)
   1278 			aa[i] = fmd_asru_hash_lookup_nvl(ahp, nvl, FMD_B_FALSE);
   1279 	}
   1280 
   1281 	cip->ci_flags |= FMD_CF_REPAIRING;
   1282 	(void) pthread_mutex_unlock(&cip->ci_lock);
   1283 
   1284 	/*
   1285 	 * For each suspect ASRU, if the case associated with this ASRU matches
   1286 	 * case 'cp', close all ASRUs contained by 'ap' and clear FAULTY.  Note
   1287 	 * that at present, we're assuming that when a given resource FMRI R1
   1288 	 * contains another R2, that any faults are related by a common
   1289 	 * diagnosis engine.  This is true in our current architecture, but may
   1290 	 * not always be true, at which point we'll need more cleverness here.
   1291 	 */
   1292 	for (i = 0; i < an; i++) {
   1293 		if (aa[i] == NULL)
   1294 			continue; /* no asru was found */
   1295 
   1296 		if (aa[i]->asru_case == cp) {
   1297 			fmd_asru_hash_apply(fmd.d_asrus,
   1298 			    fmd_case_repair_containee, aa[i]->asru_fmri);
   1299 			(void) fmd_asru_clrflags(aa[i],
   1300 			    FMD_ASRU_FAULTY, NULL, NULL);
   1301 		}
   1302 
   1303 		fmd_asru_hash_release(ahp, aa[i]);
   1304 	}
   1305 
   1306 	(void) pthread_mutex_lock(&cip->ci_lock);
   1307 	cip->ci_flags &= ~FMD_CF_REPAIRING;
   1308 	(void) pthread_mutex_unlock(&cip->ci_lock);
   1309 
   1310 	if (cstate == FMD_CASE_CLOSED)
   1311 		fmd_case_transition(cp, FMD_CASE_REPAIRED, FMD_CF_REPAIRED);
   1312 	else
   1313 		fmd_case_transition(cp, FMD_CASE_CLOSE_WAIT, FMD_CF_REPAIRED);
   1314 
   1315 	return (0);
   1316 }
   1317 
   1318 int
   1319 fmd_case_contains(fmd_case_t *cp, fmd_event_t *ep)
   1320 {
   1321 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
   1322 	fmd_case_item_t *cit;
   1323 	uint_t state;
   1324 	int rv = 0;
   1325 
   1326 	(void) pthread_mutex_lock(&cip->ci_lock);
   1327 
   1328 	if (cip->ci_state >= FMD_CASE_SOLVED)
   1329 		state = FMD_EVS_DIAGNOSED;
   1330 	else
   1331 		state = FMD_EVS_ACCEPTED;
   1332 
   1333 	for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) {
   1334 		if ((rv = fmd_event_equal(ep, cit->cit_event)) != 0)
   1335 			break;
   1336 	}
   1337 
   1338 	if (rv == 0 && cip->ci_principal != NULL)
   1339 		rv = fmd_event_equal(ep, cip->ci_principal);
   1340 
   1341 	(void) pthread_mutex_unlock(&cip->ci_lock);
   1342 
   1343 	if (rv != 0)
   1344 		fmd_event_transition(ep, state);
   1345 
   1346 	return (rv);
   1347 }
   1348 
   1349 int
   1350 fmd_case_orphaned(fmd_case_t *cp)
   1351 {
   1352 	return (((fmd_case_impl_t *)cp)->ci_mod == fmd.d_rmod);
   1353 }
   1354 
   1355 void
   1356 fmd_case_settime(fmd_case_t *cp, time_t tv_sec, suseconds_t tv_usec)
   1357 {
   1358 	((fmd_case_impl_t *)cp)->ci_tv.tv_sec = tv_sec;
   1359 	((fmd_case_impl_t *)cp)->ci_tv.tv_usec = tv_usec;
   1360 	((fmd_case_impl_t *)cp)->ci_tv_valid = 1;
   1361 }
   1362