Home | History | Annotate | Download | only in os
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #pragma ident	"@(#)errorq.c	1.9	07/10/05 SMI"
     27 
     28 /*
     29  * Kernel Error Queues
     30  *
     31  * A common problem when handling hardware error traps and interrupts is that
     32  * these errors frequently must be handled at high interrupt level, where
     33  * reliably producing error messages and safely examining and manipulating
     34  * other kernel state may not be possible.  The kernel error queue primitive is
     35  * a common set of routines that allow a subsystem to maintain a queue of
     36  * errors that can be processed by an explicit call from a safe context or by a
     37  * soft interrupt that fires at a specific lower interrupt level.  The queue
     38  * management code also ensures that if the system panics, all in-transit
     39  * errors are logged prior to reset.  Each queue has an associated kstat for
     40  * observing the number of errors dispatched and logged, and mdb(1) debugging
     41  * support is provided for live and post-mortem observability.
     42  *
     43  * Memory Allocation
     44  *
     45  * 	All of the queue data structures are allocated in advance as part of
     46  * 	the errorq_create() call.  No additional memory allocations are
     47  * 	performed as part of errorq_dispatch(), errorq_reserve(),
     48  *	errorq_commit() or errorq_drain().  This design
     49  * 	facilitates reliable error queue processing even when the system is low
     50  * 	on memory, and ensures that errorq_dispatch() can be called from any
     51  * 	context.  When the queue is created, the maximum queue length is
 * 	specified as a parameter to errorq_create() or errorq_nvcreate().  This
     53  *	length should represent a reasonable upper bound on the number of
     54  *	simultaneous errors.  If errorq_dispatch() or errorq_reserve() is
     55  *	invoked and no free queue elements are available, the error is
     56  *	dropped and will not be logged.  Typically, the queue will only be
     57  *	exhausted by an error storm, and in this case
     58  * 	the earlier errors provide the most important data for analysis.
     59  * 	When a new error is dispatched, the error data is copied into the
     60  * 	preallocated queue element so that the caller's buffer can be reused.
     61  *
     62  *	When a new error is reserved, an element is moved from the free list
     63  *	and returned to the caller.  The element buffer data, eqe_data, may be
     64  *	managed by the caller and dispatched to the errorq by calling
     65  *	errorq_commit().  This is useful for additions to errorq's
     66  *	created with errorq_nvcreate() to handle name-value pair (nvpair) data.
     67  *	See below for a discussion on nvlist errorq's.
     68  *
     69  * Queue Drain Callback
     70  *
     71  *      When the error queue is drained, the caller's queue drain callback is
     72  *      invoked with a pointer to the saved error data.  This function may be
     73  *      called from passive kernel context or soft interrupt context at or
     74  *      below LOCK_LEVEL, or as part of panic().  As such, the callback should
     75  *      basically only be calling cmn_err (but NOT with the CE_PANIC flag).
     76  *      The callback must not call panic(), attempt to allocate memory, or wait
     77  *      on a condition variable.  The callback may not call errorq_destroy()
     78  *      or errorq_drain() on the same error queue that called it.
     79  *
     80  *      The queue drain callback will always be called for each pending error
     81  *      in the order in which errors were enqueued (oldest to newest).  The
     82  *      queue drain callback is guaranteed to provide at *least* once semantics
     83  *      for all errors that are successfully dispatched (i.e. for which
     84  *      errorq_dispatch() has successfully completed).  If an unrelated panic
     85  *      occurs while the queue drain callback is running on a vital queue, the
     86  *      panic subsystem will continue the queue drain and the callback may be
     87  *      invoked again for the same error.  Therefore, the callback should
     88  *      restrict itself to logging messages and taking other actions that are
     89  *      not destructive if repeated.
     90  *
     91  * Name-Value Pair Error Queues
     92  *
     93  *	During error handling, it may be more convenient to store error
     94  *	queue element data as a fixed buffer of name-value pairs.  The
     95  *	nvpair library allows construction and destruction of nvlists in
 *	pre-allocated memory buffers.
     97  *
     98  *	Error queues created via errorq_nvcreate() store queue element
     99  *	data as fixed buffer nvlists (ereports).  errorq_reserve()
    100  *	allocates an errorq element from eqp->eq_free and returns a valid
 *	pointer to an errorq_elem_t (queue element) and a pre-allocated
    102  *	fixed buffer nvlist.  errorq_elem_nvl() is used to gain access
    103  *	to the nvlist to add name-value ereport members prior to
    104  *	dispatching the error queue element in errorq_commit().
    105  *
    106  *	Once dispatched, the drain function will return the element to
    107  *	eqp->eq_free and reset the associated nv_alloc structure.
 *	errorq_cancel() may be called to cancel an element reservation
 *	that was never dispatched (committed).  This is useful in
    110  *	cases where a programming error prevents a queue element from being
    111  *	dispatched.
    112  *
    113  * Queue Management
    114  *
    115  *      The queue element structures and error data buffers are allocated in
    116  *      two contiguous chunks as part of errorq_create() or errorq_nvcreate().
    117  *	Each queue element structure contains a next pointer,
    118  *	a previous pointer, and a pointer to the corresponding error data
    119  *	buffer.  The data buffer for a nvlist errorq is a shared buffer
    120  *	for the allocation of name-value pair lists. The elements are kept on
    121  *      one of three lists:
    122  *
    123  *      Unused elements are kept on the free list, a singly-linked list pointed
    124  *      to by eqp->eq_free, and linked together using eqe_prev.  The eqe_next
    125  *      pointer is not used by the free list and will be set to NULL.
    126  *
    127  *      Pending errors are kept on the pending list, a singly-linked list
    128  *      pointed to by eqp->eq_pend, and linked together using eqe_prev.  This
    129  *      list is maintained in order from newest error to oldest.  The eqe_next
    130  *      pointer is not used by the pending list and will be set to NULL.
    131  *
    132  *      The processing list is a doubly-linked list pointed to by eqp->eq_phead
    133  *      (the oldest element) and eqp->eq_ptail (the newest element).  The
    134  *      eqe_next pointer is used to traverse from eq_phead to eq_ptail, and the
    135  *      eqe_prev pointer is used to traverse from eq_ptail to eq_phead.  Once a
    136  *      queue drain operation begins, the current pending list is moved to the
    137  *      processing list in a two-phase commit fashion, allowing the panic code
    138  *      to always locate and process all pending errors in the event that a
    139  *      panic occurs in the middle of queue processing.
    140  *
    141  *	A fourth list is maintained for nvlist errorqs.  The dump list,
    142  *	eq_dump is used to link all errorq elements that should be stored
    143  *	in a crash dump file in the event of a system panic.  During
    144  *	errorq_panic(), the list is created and subsequently traversed
    145  *	in errorq_dump() during the final phases of a crash dump.
    146  *
    147  * Platform Considerations
    148  *
    149  *      In order to simplify their implementation, error queues make use of the
    150  *      C wrappers for compare-and-swap.  If the platform itself does not
    151  *      support compare-and-swap in hardware and the kernel emulation routines
    152  *      are used instead, then the context in which errorq_dispatch() can be
    153  *      safely invoked is further constrained by the implementation of the
    154  *      compare-and-swap emulation.  Specifically, if errorq_dispatch() is
    155  *      called from a code path that can be executed above ATOMIC_LEVEL on such
    156  *      a platform, the dispatch code could potentially deadlock unless the
    157  *      corresponding error interrupt is blocked or disabled prior to calling
    158  *      errorq_dispatch().  Error queues should therefore be deployed with
    159  *      caution on these platforms.
    160  *
    161  * Interfaces
    162  *
    163  * errorq_t *errorq_create(name, func, private, qlen, eltsize, ipl, flags);
    164  * errorq_t *errorq_nvcreate(name, func, private, qlen, eltsize, ipl, flags);
    165  *
    166  *      Create a new error queue with the specified name, callback, and
    167  *      properties.  A pointer to the new error queue is returned upon success,
    168  *      or NULL is returned to indicate that the queue could not be created.
    169  *      This function must be called from passive kernel context with no locks
    170  *      held that can prevent a sleeping memory allocation from occurring.
    171  *      errorq_create() will return failure if the queue kstats cannot be
    172  *      created, or if a soft interrupt handler cannot be registered.
    173  *
    174  *      The queue 'name' is a string that is recorded for live and post-mortem
    175  *      examination by a debugger.  The queue callback 'func' will be invoked
    176  *      for each error drained from the queue, and will receive the 'private'
    177  *      pointer as its first argument.  The callback must obey the rules for
    178  *      callbacks described above.  The queue will have maximum length 'qlen'
    179  *      and each element will be able to record up to 'eltsize' bytes of data.
    180  *      The queue's soft interrupt (see errorq_dispatch(), below) will fire
    181  *      at 'ipl', which should not exceed LOCK_LEVEL.  The queue 'flags' may
    182  *      include the following flag:
    183  *
    184  *      ERRORQ_VITAL    - This queue contains information that is considered
    185  *         vital to problem diagnosis.  Error queues that are marked vital will
    186  *         be automatically drained by the panic subsystem prior to printing
    187  *         the panic messages to the console.
    188  *
    189  * void errorq_destroy(errorq);
    190  *
    191  *      Destroy the specified error queue.  The queue is drained of any
    192  *      pending elements and these are logged before errorq_destroy returns.
    193  *      Once errorq_destroy() begins draining the queue, any simultaneous
    194  *      calls to dispatch errors will result in the errors being dropped.
    195  *      The caller must invoke a higher-level abstraction (e.g. disabling
    196  *      an error interrupt) to ensure that error handling code does not
    197  *      attempt to dispatch errors to the queue while it is being freed.
    198  *
    199  * void errorq_dispatch(errorq, data, len, flag);
    200  *
    201  *      Attempt to enqueue the specified error data.  If a free queue element
    202  *      is available, the data is copied into a free element and placed on a
    203  *      pending list.  If no free queue element is available, the error is
    204  *      dropped.  The data length (len) is specified in bytes and should not
    205  *      exceed the queue's maximum element size.  If the data length is less
    206  *      than the maximum element size, the remainder of the queue element is
    207  *      filled with zeroes.  The flag parameter should be one of:
    208  *
    209  *      ERRORQ_ASYNC    - Schedule a soft interrupt at the previously specified
    210  *         IPL to asynchronously drain the queue on behalf of the caller.
    211  *
    212  *      ERRORQ_SYNC     - Do not schedule a soft interrupt to drain the queue.
    213  *         The caller is presumed to be calling errorq_drain() or panic() in
    214  *         the near future in order to drain the queue and log the error.
    215  *
    216  *      The errorq_dispatch() function may be called from any context, subject
    217  *      to the Platform Considerations described above.
    218  *
    219  * void errorq_drain(errorq);
    220  *
    221  *      Drain the error queue of all pending errors.  The queue's callback
    222  *      function is invoked for each error in order from oldest to newest.
    223  *      This function may be used at or below LOCK_LEVEL or from panic context.
    224  *
    225  * errorq_elem_t *errorq_reserve(errorq);
    226  *
    227  *	Reserve an error queue element for later processing and dispatching.
    228  *	The element is returned to the caller who may add error-specific data
 *	to the element.  The element is returned to the free list when either
    230  *	errorq_commit() is called and the element asynchronously processed
    231  *	or immediately when errorq_cancel() is called.
    232  *
    233  * void errorq_commit(errorq, errorq_elem, flag);
    234  *
    235  *	Commit an errorq element (eqep) for dispatching, see
    236  *	errorq_dispatch().
    237  *
    238  * void errorq_cancel(errorq, errorq_elem);
    239  *
    240  *	Cancel a pending errorq element reservation.  The errorq element is
    241  *	returned to the free list upon cancelation.
    242  */
    243 
    244 #include <sys/errorq_impl.h>
    245 #include <sys/sysmacros.h>
    246 #include <sys/machlock.h>
    247 #include <sys/cmn_err.h>
    248 #include <sys/atomic.h>
    249 #include <sys/systm.h>
    250 #include <sys/kmem.h>
    251 #include <sys/conf.h>
    252 #include <sys/ddi.h>
    253 #include <sys/sunddi.h>
    254 #include <sys/bootconf.h>
    255 #include <sys/spl.h>
    256 #include <sys/dumphdr.h>
    257 #include <sys/compress.h>
    258 #include <sys/time.h>
    259 #include <sys/panic.h>
    260 #include <sys/fm/protocol.h>
    261 #include <sys/fm/util.h>
    262 
/*
 * Template for the named kstats exported by every error queue.
 * errorq_create() copies this structure into the new queue's eq_kstat and
 * publishes that copy as the data of a "unix:0:<name>" kstat of class
 * "errorq".  Each entry is a 64-bit unsigned counter.
 */
static struct errorq_kstat errorq_kstat_template = {
	{ "dispatched", KSTAT_DATA_UINT64 },
	{ "dropped", KSTAT_DATA_UINT64 },
	{ "logged", KSTAT_DATA_UINT64 },
	{ "reserved", KSTAT_DATA_UINT64 },
	{ "reserve_fail", KSTAT_DATA_UINT64 },
	{ "committed", KSTAT_DATA_UINT64 },
	{ "commit_fail", KSTAT_DATA_UINT64 },
	{ "cancelled", KSTAT_DATA_UINT64 }
};

/*
 * errorq_lost counts errors handed to errorq_dispatch() for a queue that
 * was NULL or not ERRORQ_ACTIVE; it is updated atomically.
 */
static uint64_t errorq_lost = 0;
/* Head of the global list of queues, linked through eq_next. */
static errorq_t *errorq_list = NULL;
/* Serializes insertion, removal, and non-panic traversal of errorq_list. */
static kmutex_t errorq_lock;
/*
 * NOTE(review): not referenced in this part of the file; presumably a
 * threshold applied to vital queues during panic -- confirm at its use.
 */
static uint64_t errorq_vitalmin = 5;
    278 
    279 static uint_t
    280 errorq_intr(caddr_t eqp)
    281 {
    282 	errorq_drain((errorq_t *)eqp);
    283 	return (DDI_INTR_CLAIMED);
    284 }
    285 
    286 /*
    287  * Create a new error queue with the specified properties and add a software
    288  * interrupt handler and kstat for it.  This function must be called from
    289  * passive kernel context with no locks held that can prevent a sleeping
    290  * memory allocation from occurring.  This function will return NULL if the
    291  * softint or kstat for this queue cannot be created.
    292  */
    293 errorq_t *
    294 errorq_create(const char *name, errorq_func_t func, void *private,
    295     ulong_t qlen, size_t size, uint_t ipl, uint_t flags)
    296 {
    297 	errorq_t *eqp = kmem_alloc(sizeof (errorq_t), KM_SLEEP);
    298 	ddi_iblock_cookie_t ibc = (ddi_iblock_cookie_t)(uintptr_t)ipltospl(ipl);
    299 	dev_info_t *dip = ddi_root_node();
    300 
    301 	errorq_elem_t *eep;
    302 	ddi_softintr_t id = NULL;
    303 	caddr_t data;
    304 
    305 	ASSERT(qlen != 0 && size != 0);
    306 	ASSERT(ipl > 0 && ipl <= LOCK_LEVEL);
    307 
    308 	/*
    309 	 * If a queue is created very early in boot before device tree services
    310 	 * are available, the queue softint handler cannot be created.  We
    311 	 * manually drain these queues and create their softint handlers when
    312 	 * it is safe to do so as part of errorq_init(), below.
    313 	 */
    314 	if (modrootloaded && ddi_add_softintr(dip, DDI_SOFTINT_FIXED, &id,
    315 	    &ibc, NULL, errorq_intr, (caddr_t)eqp) != DDI_SUCCESS) {
    316 		cmn_err(CE_WARN, "errorq_create: failed to register "
    317 		    "IPL %u softint for queue %s", ipl, name);
    318 		kmem_free(eqp, sizeof (errorq_t));
    319 		return (NULL);
    320 	}
    321 
    322 	if ((eqp->eq_ksp = kstat_create("unix", 0, name, "errorq",
    323 	    KSTAT_TYPE_NAMED, sizeof (struct errorq_kstat) /
    324 	    sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL)) == NULL) {
    325 		cmn_err(CE_WARN, "errorq_create: failed to create kstat "
    326 		    "for queue %s", name);
    327 		if (id != NULL)
    328 			ddi_remove_softintr(id);
    329 		kmem_free(eqp, sizeof (errorq_t));
    330 		return (NULL);
    331 	}
    332 
    333 	bcopy(&errorq_kstat_template, &eqp->eq_kstat,
    334 	    sizeof (struct errorq_kstat));
    335 	eqp->eq_ksp->ks_data = &eqp->eq_kstat;
    336 	eqp->eq_ksp->ks_private = eqp;
    337 	kstat_install(eqp->eq_ksp);
    338 
    339 	(void) strncpy(eqp->eq_name, name, ERRORQ_NAMELEN);
    340 	eqp->eq_name[ERRORQ_NAMELEN] = '\0';
    341 	eqp->eq_func = func;
    342 	eqp->eq_private = private;
    343 	eqp->eq_data = kmem_alloc(qlen * size, KM_SLEEP);
    344 	eqp->eq_qlen = qlen;
    345 	eqp->eq_size = size;
    346 	eqp->eq_ipl = ipl;
    347 	eqp->eq_flags = flags | ERRORQ_ACTIVE;
    348 	eqp->eq_id = id;
    349 	mutex_init(&eqp->eq_lock, NULL, MUTEX_DEFAULT, NULL);
    350 	eqp->eq_elems = kmem_alloc(qlen * sizeof (errorq_elem_t), KM_SLEEP);
    351 	eqp->eq_phead = NULL;
    352 	eqp->eq_ptail = NULL;
    353 	eqp->eq_pend = NULL;
    354 	eqp->eq_dump = NULL;
    355 	eqp->eq_free = eqp->eq_elems;
    356 
    357 	/*
    358 	 * Iterate over the array of errorq_elem_t structures and place each
    359 	 * one on the free list and set its data pointer.
    360 	 */
    361 	for (eep = eqp->eq_free, data = eqp->eq_data; qlen > 1; qlen--) {
    362 		eep->eqe_next = NULL;
    363 		eep->eqe_dump = NULL;
    364 		eep->eqe_prev = eep + 1;
    365 		eep->eqe_data = data;
    366 		data += size;
    367 		eep++;
    368 	}
    369 
    370 	eep->eqe_next = NULL;
    371 	eep->eqe_prev = NULL;
    372 	eep->eqe_data = data;
    373 	eep->eqe_dump = NULL;
    374 
    375 	/*
    376 	 * Once the errorq is initialized, add it to the global list of queues,
    377 	 * and then return a pointer to the new queue to the caller.
    378 	 */
    379 	mutex_enter(&errorq_lock);
    380 	eqp->eq_next = errorq_list;
    381 	errorq_list = eqp;
    382 	mutex_exit(&errorq_lock);
    383 
    384 	return (eqp);
    385 }
    386 
    387 /*
    388  * Create a new errorq as if by errorq_create(), but set the ERRORQ_NVLIST
    389  * flag and initialize each element to have the start of its data region used
    390  * as an errorq_nvelem_t with a nvlist allocator that consumes the data region.
    391  */
    392 errorq_t *
    393 errorq_nvcreate(const char *name, errorq_func_t func, void *private,
    394     ulong_t qlen, size_t size, uint_t ipl, uint_t flags)
    395 {
    396 	errorq_t *eqp;
    397 	errorq_elem_t *eep;
    398 
    399 	eqp = errorq_create(name, func, private, qlen,
    400 	    size + sizeof (errorq_nvelem_t), ipl, flags | ERRORQ_NVLIST);
    401 
    402 	if (eqp == NULL)
    403 		return (NULL);
    404 
    405 	mutex_enter(&eqp->eq_lock);
    406 
    407 	for (eep = eqp->eq_elems; qlen != 0; eep++, qlen--) {
    408 		errorq_nvelem_t *eqnp = eep->eqe_data;
    409 		eqnp->eqn_buf = (char *)eqnp + sizeof (errorq_nvelem_t);
    410 		eqnp->eqn_nva = fm_nva_xcreate(eqnp->eqn_buf, size);
    411 	}
    412 
    413 	mutex_exit(&eqp->eq_lock);
    414 	return (eqp);
    415 }
    416 
    417 /*
    418  * To destroy an error queue, we mark it as disabled and then explicitly drain
    419  * all pending errors.  Once the drain is complete, we can remove the queue
    420  * from the global list of queues examined by errorq_panic(), and then free
    421  * the various queue data structures.  The caller must use some higher-level
    422  * abstraction (e.g. disabling an error interrupt) to ensure that no one will
    423  * attempt to enqueue new errors while we are freeing this queue.
    424  */
    425 void
    426 errorq_destroy(errorq_t *eqp)
    427 {
    428 	errorq_t *p, **pp;
    429 	errorq_elem_t *eep;
    430 	ulong_t i;
    431 
    432 	ASSERT(eqp != NULL);
    433 	eqp->eq_flags &= ~ERRORQ_ACTIVE;
    434 	errorq_drain(eqp);
    435 
    436 	mutex_enter(&errorq_lock);
    437 	pp = &errorq_list;
    438 
    439 	for (p = errorq_list; p != NULL; p = p->eq_next) {
    440 		if (p == eqp) {
    441 			*pp = p->eq_next;
    442 			break;
    443 		}
    444 		pp = &p->eq_next;
    445 	}
    446 
    447 	mutex_exit(&errorq_lock);
    448 	ASSERT(p != NULL);
    449 
    450 	if (eqp->eq_flags & ERRORQ_NVLIST) {
    451 		for (eep = eqp->eq_elems, i = 0; i < eqp->eq_qlen; i++, eep++) {
    452 			errorq_nvelem_t *eqnp = eep->eqe_data;
    453 			fm_nva_xdestroy(eqnp->eqn_nva);
    454 		}
    455 	}
    456 
    457 	mutex_destroy(&eqp->eq_lock);
    458 	kstat_delete(eqp->eq_ksp);
    459 
    460 	if (eqp->eq_id != NULL)
    461 		ddi_remove_softintr(eqp->eq_id);
    462 
    463 	kmem_free(eqp->eq_elems, eqp->eq_qlen * sizeof (errorq_elem_t));
    464 	kmem_free(eqp->eq_data, eqp->eq_qlen * eqp->eq_size);
    465 
    466 	kmem_free(eqp, sizeof (errorq_t));
    467 }
    468 
    469 /*
    470  * Dispatch a new error into the queue for later processing.  The specified
    471  * data buffer is copied into a preallocated queue element.  If 'len' is
    472  * smaller than the queue element size, the remainder of the queue element is
    473  * filled with zeroes.  This function may be called from any context subject
    474  * to the Platform Considerations described above.
    475  */
    476 void
    477 errorq_dispatch(errorq_t *eqp, const void *data, size_t len, uint_t flag)
    478 {
    479 	errorq_elem_t *eep, *old;
    480 
    481 	if (eqp == NULL || !(eqp->eq_flags & ERRORQ_ACTIVE)) {
    482 		atomic_add_64(&errorq_lost, 1);
    483 		return; /* drop error if queue is uninitialized or disabled */
    484 	}
    485 
    486 	while ((eep = eqp->eq_free) != NULL) {
    487 		if (casptr(&eqp->eq_free, eep, eep->eqe_prev) == eep)
    488 			break;
    489 	}
    490 
    491 	if (eep == NULL) {
    492 		atomic_add_64(&eqp->eq_kstat.eqk_dropped.value.ui64, 1);
    493 		return;
    494 	}
    495 
    496 	ASSERT(len <= eqp->eq_size);
    497 	bcopy(data, eep->eqe_data, MIN(eqp->eq_size, len));
    498 
    499 	if (len < eqp->eq_size)
    500 		bzero((caddr_t)eep->eqe_data + len, eqp->eq_size - len);
    501 
    502 	for (;;) {
    503 		old = eqp->eq_pend;
    504 		eep->eqe_prev = old;
    505 		membar_producer();
    506 
    507 		if (casptr(&eqp->eq_pend, old, eep) == old)
    508 			break;
    509 	}
    510 
    511 	atomic_add_64(&eqp->eq_kstat.eqk_dispatched.value.ui64, 1);
    512 
    513 	if (flag == ERRORQ_ASYNC && eqp->eq_id != NULL)
    514 		ddi_trigger_softintr(eqp->eq_id);
    515 }
    516 
    517 /*
    518  * Drain the specified error queue by calling eq_func() for each pending error.
    519  * This function must be called at or below LOCK_LEVEL or from panic context.
    520  * In order to synchronize with other attempts to drain the queue, we acquire
    521  * the adaptive eq_lock, blocking other consumers.  Once this lock is held,
    522  * we must use compare-and-swap to move the pending list to the processing
    523  * list and to return elements to the free list in order to synchronize
    524  * with producers, who do not acquire any locks and only use compare-and-swap.
    525  *
    526  * An additional constraint on this function is that if the system panics
    527  * while this function is running, the panic code must be able to detect and
    528  * handle all intermediate states and correctly dequeue all errors.  The
    529  * errorq_panic() function below will be used for detecting and handling
    530  * these intermediate states.  The comments in errorq_drain() below explain
    531  * how we make sure each intermediate state is distinct and consistent.
    532  */
    533 void
    534 errorq_drain(errorq_t *eqp)
    535 {
    536 	errorq_elem_t *eep, *fep, *dep;
    537 
    538 	ASSERT(eqp != NULL);
    539 	mutex_enter(&eqp->eq_lock);
    540 
    541 	/*
    542 	 * If there are one or more pending errors, set eq_ptail to point to
    543 	 * the first element on the pending list and then attempt to compare-
    544 	 * and-swap NULL to the pending list.  We use membar_producer() to
    545 	 * make sure that eq_ptail will be visible to errorq_panic() below
    546 	 * before the pending list is NULLed out.  This section is labeled
    547 	 * case (1) for errorq_panic, below.  If eq_ptail is not yet set (1A)
    548 	 * eq_pend has all the pending errors.  If casptr fails or has not
    549 	 * been called yet (1B), eq_pend still has all the pending errors.
    550 	 * If casptr succeeds (1C), eq_ptail has all the pending errors.
    551 	 */
    552 	while ((eep = eqp->eq_pend) != NULL) {
    553 		eqp->eq_ptail = eep;
    554 		membar_producer();
    555 
    556 		if (casptr(&eqp->eq_pend, eep, NULL) == eep)
    557 			break;
    558 	}
    559 
    560 	/*
    561 	 * If no errors were pending, assert that eq_ptail is set to NULL,
    562 	 * drop the consumer lock, and return without doing anything.
    563 	 */
    564 	if (eep == NULL) {
    565 		ASSERT(eqp->eq_ptail == NULL);
    566 		mutex_exit(&eqp->eq_lock);
    567 		return;
    568 	}
    569 
    570 	/*
    571 	 * Now iterate from eq_ptail (a.k.a. eep, the newest error) to the
    572 	 * oldest error, setting the eqe_next pointer so that we can iterate
    573 	 * over the errors from oldest to newest.  We use membar_producer()
    574 	 * to make sure that these stores are visible before we set eq_phead.
    575 	 * If we panic before, during, or just after this loop (case 2),
    576 	 * errorq_panic() will simply redo this work, as described below.
    577 	 */
    578 	for (eep->eqe_next = NULL; eep->eqe_prev != NULL; eep = eep->eqe_prev)
    579 		eep->eqe_prev->eqe_next = eep;
    580 	membar_producer();
    581 
    582 	/*
    583 	 * Now set eq_phead to the head of the processing list (the oldest
    584 	 * error) and issue another membar_producer() to make sure that
    585 	 * eq_phead is seen as non-NULL before we clear eq_ptail.  If we panic
    586 	 * after eq_phead is set (case 3), we will detect and log these errors
    587 	 * in errorq_panic(), as described below.
    588 	 */
    589 	eqp->eq_phead = eep;
    590 	membar_producer();
    591 
    592 	eqp->eq_ptail = NULL;
    593 	membar_producer();
    594 
    595 	/*
    596 	 * If we enter from errorq_panic_drain(), we may already have
    597 	 * errorq elements on the dump list.  Find the tail of
    598 	 * the list ready for append.
    599 	 */
    600 	if (panicstr && (dep = eqp->eq_dump) != NULL) {
    601 		while (dep->eqe_dump != NULL)
    602 			dep = dep->eqe_dump;
    603 	}
    604 
    605 	/*
    606 	 * Now iterate over the processing list from oldest (eq_phead) to
    607 	 * newest and log each error.  Once an error is logged, we use
    608 	 * compare-and-swap to return it to the free list.  If we panic before,
    609 	 * during, or after calling eq_func() (case 4), the error will still be
    610 	 * found on eq_phead and will be logged in errorq_panic below.
    611 	 */
    612 
    613 	while ((eep = eqp->eq_phead) != NULL) {
    614 		eqp->eq_func(eqp->eq_private, eep->eqe_data, eep);
    615 		eqp->eq_kstat.eqk_logged.value.ui64++;
    616 
    617 		eqp->eq_phead = eep->eqe_next;
    618 		membar_producer();
    619 
    620 		eep->eqe_next = NULL;
    621 
    622 		/*
    623 		 * On panic, we add the element to the dump list for each
    624 		 * nvlist errorq.  Elements are stored oldest to newest.
    625 		 * Then continue, so we don't free and subsequently overwrite
    626 		 * any elements which we've put on the dump queue.
    627 		 */
    628 		if (panicstr && (eqp->eq_flags & ERRORQ_NVLIST)) {
    629 			if (eqp->eq_dump == NULL)
    630 				dep = eqp->eq_dump = eep;
    631 			else
    632 				dep = dep->eqe_dump = eep;
    633 			membar_producer();
    634 			continue;
    635 		}
    636 
    637 		for (;;) {
    638 			fep = eqp->eq_free;
    639 			eep->eqe_prev = fep;
    640 			membar_producer();
    641 
    642 			if (casptr(&eqp->eq_free, fep, eep) == fep)
    643 				break;
    644 		}
    645 	}
    646 
    647 	mutex_exit(&eqp->eq_lock);
    648 }
    649 
    650 /*
    651  * Now that device tree services are available, set up the soft interrupt
    652  * handlers for any queues that were created early in boot.  We then
    653  * manually drain these queues to report any pending early errors.
    654  */
    655 void
    656 errorq_init(void)
    657 {
    658 	dev_info_t *dip = ddi_root_node();
    659 	ddi_softintr_t id;
    660 	errorq_t *eqp;
    661 
    662 	ASSERT(modrootloaded != 0);
    663 	ASSERT(dip != NULL);
    664 
    665 	mutex_enter(&errorq_lock);
    666 
    667 	for (eqp = errorq_list; eqp != NULL; eqp = eqp->eq_next) {
    668 		ddi_iblock_cookie_t ibc =
    669 		    (ddi_iblock_cookie_t)(uintptr_t)ipltospl(eqp->eq_ipl);
    670 
    671 		if (eqp->eq_id != NULL)
    672 			continue; /* softint already initialized */
    673 
    674 		if (ddi_add_softintr(dip, DDI_SOFTINT_FIXED, &id, &ibc, NULL,
    675 		    errorq_intr, (caddr_t)eqp) != DDI_SUCCESS) {
    676 			panic("errorq_init: failed to register IPL %u softint "
    677 			    "for queue %s", eqp->eq_ipl, eqp->eq_name);
    678 		}
    679 
    680 		eqp->eq_id = id;
    681 		errorq_drain(eqp);
    682 	}
    683 
    684 	mutex_exit(&errorq_lock);
    685 }
    686 
    687 /*
    688  * This function is designed to be called from panic context only, and
    689  * therefore does not need to acquire errorq_lock when iterating over
    690  * errorq_list.  This function must be called no more than once for each
 * 'what' value (if you change this then review the manipulation of 'dep').
    692  */
    693 static uint64_t
    694 errorq_panic_drain(uint_t what)
    695 {
    696 	errorq_elem_t *eep, *nep, *fep, *dep;
    697 	errorq_t *eqp;
    698 	uint64_t loggedtmp;
    699 	uint64_t logged = 0;
    700 
    701 	for (eqp = errorq_list; eqp != NULL; eqp = eqp->eq_next) {
    702 		if ((eqp->eq_flags & (ERRORQ_VITAL | ERRORQ_NVLIST)) != what)
    703 			continue; /* do not drain this queue on this pass */
    704 
    705 		loggedtmp = eqp->eq_kstat.eqk_logged.value.ui64;
    706 
    707 		/*
    708 		 * In case (1B) above, eq_ptail may be set but the casptr may
    709 		 * not have been executed yet or may have failed.  Either way,
    710 		 * we must log errors in chronological order.  So we search
    711 		 * the pending list for the error pointed to by eq_ptail.  If
    712 		 * it is found, we know that all subsequent errors are also
    713 		 * still on the pending list, so just NULL out eq_ptail and let
    714 		 * errorq_drain(), below, take care of the logging.
    715 		 */
    716 		for (eep = eqp->eq_pend; eep != NULL; eep = eep->eqe_prev) {
    717 			if (eep == eqp->eq_ptail) {
    718 				ASSERT(eqp->eq_phead == NULL);
    719 				eqp->eq_ptail = NULL;
    720 				break;
    721 			}
    722 		}
    723 
    724 		/*
    725 		 * In cases (1C) and (2) above, eq_ptail will be set to the
    726 		 * newest error on the processing list but eq_phead will still
    727 		 * be NULL.  We set the eqe_next pointers so we can iterate
    728 		 * over the processing list in order from oldest error to the
    729 		 * newest error.  We then set eq_phead to point to the oldest
    730 		 * error and fall into the for-loop below.
    731 		 */
    732 		if (eqp->eq_phead == NULL && (eep = eqp->eq_ptail) != NULL) {
    733 			for (eep->eqe_next = NULL; eep->eqe_prev != NULL;
    734 			    eep = eep->eqe_prev)
    735 				eep->eqe_prev->eqe_next = eep;
    736 
    737 			eqp->eq_phead = eep;
    738 			eqp->eq_ptail = NULL;
    739 		}
    740 
    741 		/*
    742 		 * In cases (3) and (4) above (or after case (1C/2) handling),
    743 		 * eq_phead will be set to the oldest error on the processing
    744 		 * list.  We log each error and return it to the free list.
    745 		 *
    746 		 * Unlike errorq_drain(), we don't need to worry about updating
    747 		 * eq_phead because errorq_panic() will be called at most once.
    748 		 * However, we must use casptr to update the freelist in case
    749 		 * errors are still being enqueued during panic.
    750 		 */
    751 		for (eep = eqp->eq_phead; eep != NULL; eep = nep) {
    752 			eqp->eq_func(eqp->eq_private, eep->eqe_data, eep);
    753 			eqp->eq_kstat.eqk_logged.value.ui64++;
    754 
    755 			nep = eep->eqe_next;
    756 			eep->eqe_next = NULL;
    757 
    758 			/*
    759 			 * On panic, we add the element to the dump list for
    760 			 * each nvlist errorq, stored oldest to newest. Then
    761 			 * continue, so we don't free and subsequently overwrite
    762 			 * any elements which we've put on the dump queue.
    763 			 */
    764 			if (eqp->eq_flags & ERRORQ_NVLIST) {
    765 				if (eqp->eq_dump == NULL)
    766 					dep = eqp->eq_dump = eep;
    767 				else
    768 					dep = dep->eqe_dump = eep;
    769 				membar_producer();
    770 				continue;
    771 			}
    772 
    773 			for (;;) {
    774 				fep = eqp->eq_free;
    775 				eep->eqe_prev = fep;
    776 				membar_producer();
    777 
    778 				if (casptr(&eqp->eq_free, fep, eep) == fep)
    779 					break;
    780 			}
    781 		}
    782 
    783 		/*
    784 		 * Now go ahead and drain any other errors on the pending list.
    785 		 * This call transparently handles case (1A) above, as well as
    786 		 * any other errors that were dispatched after errorq_drain()
    787 		 * completed its first compare-and-swap.
    788 		 */
    789 		errorq_drain(eqp);
    790 
    791 		logged += eqp->eq_kstat.eqk_logged.value.ui64 - loggedtmp;
    792 	}
    793 	return (logged);
    794 }
    795 
    796 /*
    797  * Drain all error queues - called only from panic context.  Some drain
    798  * functions may enqueue errors to ERRORQ_NVLIST error queues so that
    799  * they may be written out in the panic dump - so ERRORQ_NVLIST queues
    800  * must be drained last.  Drain ERRORQ_VITAL queues before nonvital queues
    801  * so that vital errors get to fill the ERRORQ_NVLIST queues first, and
    802  * do not drain the nonvital queues if there are many vital errors.
    803  */
    804 void
    805 errorq_panic(void)
    806 {
    807 	ASSERT(panicstr != NULL);
    808 
    809 	if (errorq_panic_drain(ERRORQ_VITAL) <= errorq_vitalmin)
    810 		(void) errorq_panic_drain(0);
    811 	(void) errorq_panic_drain(ERRORQ_VITAL | ERRORQ_NVLIST);
    812 	(void) errorq_panic_drain(ERRORQ_NVLIST);
    813 }
    814 
    815 /*
    816  * Reserve an error queue element for later processing and dispatching.  The
    817  * element is returned to the caller who may add error-specific data to
    818  * element.  The element is retured to the free list when either
    819  * errorq_commit() is called and the element asynchronously processed
    820  * or immediately when errorq_cancel() is called.
    821  */
    822 errorq_elem_t *
    823 errorq_reserve(errorq_t *eqp)
    824 {
    825 	errorq_elem_t *eqep;
    826 
    827 	if (eqp == NULL || !(eqp->eq_flags & ERRORQ_ACTIVE)) {
    828 		atomic_add_64(&errorq_lost, 1);
    829 		return (NULL);
    830 	}
    831 
    832 	while ((eqep = eqp->eq_free) != NULL) {
    833 		if (casptr(&eqp->eq_free, eqep, eqep->eqe_prev) == eqep)
    834 			break;
    835 	}
    836 
    837 	if (eqep == NULL) {
    838 		atomic_add_64(&eqp->eq_kstat.eqk_dropped.value.ui64, 1);
    839 		return (NULL);
    840 	}
    841 
    842 	if (eqp->eq_flags & ERRORQ_NVLIST) {
    843 		errorq_nvelem_t *eqnp = eqep->eqe_data;
    844 		nv_alloc_reset(eqnp->eqn_nva);
    845 		eqnp->eqn_nvl = fm_nvlist_create(eqnp->eqn_nva);
    846 	}
    847 
    848 	atomic_add_64(&eqp->eq_kstat.eqk_reserved.value.ui64, 1);
    849 	return (eqep);
    850 }
    851 
    852 /*
    853  * Commit an errorq element (eqep) for dispatching.
    854  * This function may be called from any context subject
    855  * to the Platform Considerations described above.
    856  */
    857 void
    858 errorq_commit(errorq_t *eqp, errorq_elem_t *eqep, uint_t flag)
    859 {
    860 	errorq_elem_t *old;
    861 
    862 	if (eqep == NULL || !(eqp->eq_flags & ERRORQ_ACTIVE)) {
    863 		atomic_add_64(&eqp->eq_kstat.eqk_commit_fail.value.ui64, 1);
    864 		return;
    865 	}
    866 
    867 	for (;;) {
    868 		old = eqp->eq_pend;
    869 		eqep->eqe_prev = old;
    870 		membar_producer();
    871 
    872 		if (casptr(&eqp->eq_pend, old, eqep) == old)
    873 			break;
    874 	}
    875 
    876 	atomic_add_64(&eqp->eq_kstat.eqk_committed.value.ui64, 1);
    877 
    878 	if (flag == ERRORQ_ASYNC && eqp->eq_id != NULL)
    879 		ddi_trigger_softintr(eqp->eq_id);
    880 }
    881 
    882 /*
    883  * Cancel an errorq element reservation by returning the specified element
    884  * to the free list.  Duplicate or invalid frees are not supported.
    885  */
    886 void
    887 errorq_cancel(errorq_t *eqp, errorq_elem_t *eqep)
    888 {
    889 	errorq_elem_t *fep;
    890 
    891 	if (eqep == NULL || !(eqp->eq_flags & ERRORQ_ACTIVE))
    892 		return;
    893 
    894 	for (;;) {
    895 		fep = eqp->eq_free;
    896 		eqep->eqe_prev = fep;
    897 		membar_producer();
    898 
    899 		if (casptr(&eqp->eq_free, fep, eqep) == fep)
    900 			break;
    901 	}
    902 
    903 	atomic_add_64(&eqp->eq_kstat.eqk_cancelled.value.ui64, 1);
    904 }
    905 
    906 /*
    907  * Write elements on the dump list of each nvlist errorq to the dump device.
    908  * Upon reboot, fmd(1M) will extract and replay them for diagnosis.
    909  */
    910 void
    911 errorq_dump(void)
    912 {
    913 	errorq_elem_t *eep;
    914 	errorq_t *eqp;
    915 
    916 	if (ereport_dumpbuf == NULL)
    917 		return; /* reboot or panic before errorq is even set up */
    918 
    919 	for (eqp = errorq_list; eqp != NULL; eqp = eqp->eq_next) {
    920 		if (!(eqp->eq_flags & ERRORQ_NVLIST) ||
    921 		    !(eqp->eq_flags & ERRORQ_ACTIVE))
    922 			continue; /* do not dump this queue on panic */
    923 
    924 		for (eep = eqp->eq_dump; eep != NULL; eep = eep->eqe_dump) {
    925 			errorq_nvelem_t *eqnp = eep->eqe_data;
    926 			size_t len = 0;
    927 			erpt_dump_t ed;
    928 			int err;
    929 
    930 			(void) nvlist_size(eqnp->eqn_nvl,
    931 			    &len, NV_ENCODE_NATIVE);
    932 
    933 			if (len > ereport_dumplen || len == 0) {
    934 				cmn_err(CE_WARN, "%s: unable to save error "
    935 				    "report %p due to size %lu\n",
    936 				    eqp->eq_name, (void *)eep, len);
    937 				continue;
    938 			}
    939 
    940 			if ((err = nvlist_pack(eqnp->eqn_nvl,
    941 			    (char **)&ereport_dumpbuf, &ereport_dumplen,
    942 			    NV_ENCODE_NATIVE, KM_NOSLEEP)) != 0) {
    943 				cmn_err(CE_WARN, "%s: unable to save error "
    944 				    "report %p due to pack error %d\n",
    945 				    eqp->eq_name, (void *)eep, err);
    946 				continue;
    947 			}
    948 
    949 			ed.ed_magic = ERPT_MAGIC;
    950 			ed.ed_chksum = checksum32(ereport_dumpbuf, len);
    951 			ed.ed_size = (uint32_t)len;
    952 			ed.ed_pad = 0;
    953 			ed.ed_hrt_nsec = 0;
    954 			ed.ed_hrt_base = panic_hrtime;
    955 			ed.ed_tod_base.sec = panic_hrestime.tv_sec;
    956 			ed.ed_tod_base.nsec = panic_hrestime.tv_nsec;
    957 
    958 			dumpvp_write(&ed, sizeof (ed));
    959 			dumpvp_write(ereport_dumpbuf, len);
    960 		}
    961 	}
    962 }
    963 
    964 nvlist_t *
    965 errorq_elem_nvl(errorq_t *eqp, const errorq_elem_t *eqep)
    966 {
    967 	errorq_nvelem_t *eqnp = eqep->eqe_data;
    968 
    969 	ASSERT(eqp->eq_flags & ERRORQ_ACTIVE && eqp->eq_flags & ERRORQ_NVLIST);
    970 
    971 	return (eqnp->eqn_nvl);
    972 }
    973 
    974 nv_alloc_t *
    975 errorq_elem_nva(errorq_t *eqp, const errorq_elem_t *eqep)
    976 {
    977 	errorq_nvelem_t *eqnp = eqep->eqe_data;
    978 
    979 	ASSERT(eqp->eq_flags & ERRORQ_ACTIVE && eqp->eq_flags & ERRORQ_NVLIST);
    980 
    981 	return (eqnp->eqn_nva);
    982 }
    983 
    984 /*
    985  * Reserve a new element and duplicate the data of the original into it.
    986  */
    987 void *
    988 errorq_elem_dup(errorq_t *eqp, const errorq_elem_t *eqep, errorq_elem_t **neqep)
    989 {
    990 	ASSERT(eqp->eq_flags & ERRORQ_ACTIVE);
    991 	ASSERT(!(eqp->eq_flags & ERRORQ_NVLIST));
    992 
    993 	if ((*neqep = errorq_reserve(eqp)) == NULL)
    994 		return (NULL);
    995 
    996 	bcopy(eqep->eqe_data, (*neqep)->eqe_data, eqp->eq_size);
    997 	return ((*neqep)->eqe_data);
    998 }
    999